Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c                        2
-rw-r--r--  kernel/cpu.c                           2
-rw-r--r--  kernel/debug/kdb/kdb_main.c           12
-rw-r--r--  kernel/exec_domain.c                  18
-rw-r--r--  kernel/futex.c                        17
-rw-r--r--  kernel/irq/manage.c                    3
-rw-r--r--  kernel/kexec.c                         7
-rw-r--r--  kernel/module.c                      348
-rw-r--r--  kernel/perf_event.c                  356
-rw-r--r--  kernel/power/Kconfig                   9
-rw-r--r--  kernel/power/Makefile                  2
-rw-r--r--  kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c)  24
-rw-r--r--  kernel/power/suspend.c                 6
-rw-r--r--  kernel/sched.c                       161
-rw-r--r--  kernel/sched_fair.c                   24
-rw-r--r--  kernel/softirq.c                       2
-rw-r--r--  kernel/stop_machine.c                  2
-rw-r--r--  kernel/sysctl.c                        8
-rw-r--r--  kernel/time/tick-sched.c              21
-rw-r--r--  kernel/timer.c                         2
-rw-r--r--  kernel/trace/blktrace.c                2
-rw-r--r--  kernel/trace/trace_event_perf.c       19
-rw-r--r--  kernel/trace/trace_kprobe.c            4
-rw-r--r--  kernel/trace/trace_syscalls.c          4
24 files changed, 633 insertions, 422 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 422cb19f156e..3ac6f5b0a64b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4598,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4598 parent_css = parent->subsys[subsys_id]; 4598 parent_css = parent->subsys[subsys_id];
4599 child_css = child->subsys[subsys_id]; 4599 child_css = child->subsys[subsys_id];
4600 parent_id = parent_css->id; 4600 parent_id = parent_css->id;
4601 depth = parent_id->depth; 4601 depth = parent_id->depth + 1;
4602 4602
4603 child_id = get_new_cssid(ss, depth); 4603 child_id = get_new_cssid(ss, depth);
4604 if (IS_ERR(child_id)) 4604 if (IS_ERR(child_id))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8b92539b4754..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,7 +34,7 @@ void cpu_maps_update_done(void)
34 mutex_unlock(&cpu_add_remove_lock); 34 mutex_unlock(&cpu_add_remove_lock);
35} 35}
36 36
37static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 37static RAW_NOTIFIER_HEAD(cpu_chain);
38 38
39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
40 * Should always be manipulated under cpu_add_remove_lock 40 * Should always be manipulated under cpu_add_remove_lock
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index b724c791b6d4..184cd8209c36 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv)
1857} 1857}
1858 1858
1859#if defined(CONFIG_MODULES) 1859#if defined(CONFIG_MODULES)
1860/* modules using other modules */
1861struct module_use {
1862 struct list_head list;
1863 struct module *module_which_uses;
1864};
1865
1866/* 1860/*
1867 * kdb_lsmod - This function implements the 'lsmod' command. Lists 1861 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1868 * currently loaded kernel modules. 1862 * currently loaded kernel modules.
@@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv)
1894 { 1888 {
1895 struct module_use *use; 1889 struct module_use *use;
1896 kdb_printf(" [ "); 1890 kdb_printf(" [ ");
1897 list_for_each_entry(use, &mod->modules_which_use_me, 1891 list_for_each_entry(use, &mod->source_list,
1898 list) 1892 source_list)
1899 kdb_printf("%s ", use->module_which_uses->name); 1893 kdb_printf("%s ", use->target->name);
1900 kdb_printf("]\n"); 1894 kdb_printf("]\n");
1901 } 1895 }
1902#endif 1896#endif
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock); 27static DEFINE_RWLOCK(exec_domains_lock);
28 28
29 29
30static u_long ident_map[32] = { 30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7, 31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15, 32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23, 33 16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
56} 56}
57 57
58static struct exec_domain * 58static struct exec_domain *
59lookup_exec_domain(u_long personality) 59lookup_exec_domain(unsigned int personality)
60{ 60{
61 struct exec_domain * ep; 61 unsigned int pers = personality(personality);
62 u_long pers = personality(personality); 62 struct exec_domain *ep;
63 63
64 read_lock(&exec_domains_lock); 64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) { 65 for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
70 70
71#ifdef CONFIG_MODULES 71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock); 72 read_unlock(&exec_domains_lock);
73 request_module("personality-%ld", pers); 73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock); 74 read_lock(&exec_domains_lock);
75 75
76 for (ep = exec_domains; ep; ep = ep->next) { 76 for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
135} 135}
136 136
137int 137int
138__set_personality(u_long personality) 138__set_personality(unsigned int personality)
139{ 139{
140 struct exec_domain *ep, *oep; 140 struct exec_domain *ep, *oep;
141 141
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
188module_init(proc_execdomains_init); 188module_init(proc_execdomains_init);
189#endif 189#endif
190 190
191SYSCALL_DEFINE1(personality, u_long, personality) 191SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 192{
193 u_long old = current->personality; 193 unsigned int old = current->personality;
194 194
195 if (personality != 0xffffffff) { 195 if (personality != 0xffffffff) {
196 set_personality(personality); 196 set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
198 return -EINVAL; 198 return -EINVAL;
199 } 199 }
200 200
201 return (long)old; 201 return old;
202} 202}
203 203
204 204
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e7..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
429static struct task_struct * futex_find_get_task(pid_t pid) 429static struct task_struct * futex_find_get_task(pid_t pid)
430{ 430{
431 struct task_struct *p; 431 struct task_struct *p;
432 const struct cred *cred = current_cred(), *pcred;
433 432
434 rcu_read_lock(); 433 rcu_read_lock();
435 p = find_task_by_vpid(pid); 434 p = find_task_by_vpid(pid);
436 if (!p) { 435 if (p)
437 p = ERR_PTR(-ESRCH); 436 get_task_struct(p);
438 } else {
439 pcred = __task_cred(p);
440 if (cred->euid != pcred->euid &&
441 cred->euid != pcred->uid)
442 p = ERR_PTR(-ESRCH);
443 else
444 get_task_struct(p);
445 }
446 437
447 rcu_read_unlock(); 438 rcu_read_unlock();
448 439
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
564 if (!pid) 555 if (!pid)
565 return -ESRCH; 556 return -ESRCH;
566 p = futex_find_get_task(pid); 557 p = futex_find_get_task(pid);
567 if (IS_ERR(p)) 558 if (!p)
568 return PTR_ERR(p); 559 return -ESRCH;
569 560
570 /* 561 /*
571 * We need to look at the task state flags to figure out, 562 * We need to look at the task state flags to figure out,
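
The futex_find_get_task() rework above drops the euid/uid permission check and the ERR_PTR() convention: the helper now simply returns the task with a reference held, or NULL. Roughly, the resulting function reads as follows (assembled from this hunk; the in-tree version may differ in detail):

static struct task_struct *futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);	/* pin the task before leaving the RCU section */
	rcu_read_unlock();

	return p;	/* NULL when no such pid; lookup_pi_state() maps that to -ESRCH */
}
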
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3164ba7ce151..e1497481fe8a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
459 } 462 }
460 463
461 return ret; 464 return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 474a84715eac..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
1089 1089
1090size_t crash_get_memory_size(void) 1090size_t crash_get_memory_size(void)
1091{ 1091{
1092 size_t size; 1092 size_t size = 0;
1093 mutex_lock(&kexec_mutex); 1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1; 1094 if (crashk_res.end != crashk_res.start)
1095 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex); 1096 mutex_unlock(&kexec_mutex);
1096 return size; 1097 return size;
1097} 1098}
@@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size)
1134 1135
1135 free_reserved_phys_range(end, crashk_res.end); 1136 free_reserved_phys_range(end, crashk_res.end);
1136 1137
1137 if (start == end) 1138 if ((start == end) && (crashk_res.parent != NULL))
1138 release_resource(&crashk_res); 1139 release_resource(&crashk_res);
1139 crashk_res.end = end - 1; 1140 crashk_res.end = end - 1;
1140 1141
diff --git a/kernel/module.c b/kernel/module.c
index 333fbcc96978..5d2d28197c82 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -72,7 +72,11 @@
72/* If this is set, the section belongs in the init part of the module */ 72/* If this is set, the section belongs in the init part of the module */
73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
74 74
75/* List of modules, protected by module_mutex or preempt_disable 75/*
76 * Mutex protects:
77 * 1) List of modules (also safely readable with preempt_disable),
78 * 2) module_use links,
79 * 3) module_addr_min/module_addr_max.
76 * (delete uses stop_machine/add uses RCU list operations). */ 80 * (delete uses stop_machine/add uses RCU list operations). */
77DEFINE_MUTEX(module_mutex); 81DEFINE_MUTEX(module_mutex);
78EXPORT_SYMBOL_GPL(module_mutex); 82EXPORT_SYMBOL_GPL(module_mutex);
@@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
90 94
91static BLOCKING_NOTIFIER_HEAD(module_notify_list); 95static BLOCKING_NOTIFIER_HEAD(module_notify_list);
92 96
93/* Bounds of module allocation, for speeding __module_address */ 97/* Bounds of module allocation, for speeding __module_address.
98 * Protected by module_mutex. */
94static unsigned long module_addr_min = -1UL, module_addr_max = 0; 99static unsigned long module_addr_min = -1UL, module_addr_max = 0;
95 100
96int register_module_notifier(struct notifier_block * nb) 101int register_module_notifier(struct notifier_block * nb)
@@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
329} 334}
330 335
331/* Find a symbol and return it, along with, (optional) crc and 336/* Find a symbol and return it, along with, (optional) crc and
332 * (optional) module which owns it */ 337 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
333const struct kernel_symbol *find_symbol(const char *name, 338const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 339 struct module **owner,
335 const unsigned long **crc, 340 const unsigned long **crc,
@@ -403,7 +408,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
403 Elf_Shdr *sechdrs, 408 Elf_Shdr *sechdrs,
404 const char *secstrings) 409 const char *secstrings)
405{ 410{
406 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
407} 412}
408 413
409static void percpu_modcopy(struct module *mod, 414static void percpu_modcopy(struct module *mod,
@@ -523,7 +528,8 @@ static void module_unload_init(struct module *mod)
523{ 528{
524 int cpu; 529 int cpu;
525 530
526 INIT_LIST_HEAD(&mod->modules_which_use_me); 531 INIT_LIST_HEAD(&mod->source_list);
532 INIT_LIST_HEAD(&mod->target_list);
527 for_each_possible_cpu(cpu) { 533 for_each_possible_cpu(cpu) {
528 per_cpu_ptr(mod->refptr, cpu)->incs = 0; 534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
529 per_cpu_ptr(mod->refptr, cpu)->decs = 0; 535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
@@ -535,20 +541,13 @@ static void module_unload_init(struct module *mod)
535 mod->waiter = current; 541 mod->waiter = current;
536} 542}
537 543
538/* modules using other modules */
539struct module_use
540{
541 struct list_head list;
542 struct module *module_which_uses;
543};
544
545/* Does a already use b? */ 544/* Does a already use b? */
546static int already_uses(struct module *a, struct module *b) 545static int already_uses(struct module *a, struct module *b)
547{ 546{
548 struct module_use *use; 547 struct module_use *use;
549 548
550 list_for_each_entry(use, &b->modules_which_use_me, list) { 549 list_for_each_entry(use, &b->source_list, source_list) {
551 if (use->module_which_uses == a) { 550 if (use->source == a) {
552 DEBUGP("%s uses %s!\n", a->name, b->name); 551 DEBUGP("%s uses %s!\n", a->name, b->name);
553 return 1; 552 return 1;
554 } 553 }
@@ -557,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
557 return 0; 556 return 0;
558} 557}
559 558
560/* Module a uses b */ 559/*
561int use_module(struct module *a, struct module *b) 560 * Module a uses b
561 * - we add 'a' as a "source", 'b' as a "target" of module use
562 * - the module_use is added to the list of 'b' sources (so
563 * 'b' can walk the list to see who sourced them), and of 'a'
564 * targets (so 'a' can see what modules it targets).
565 */
566static int add_module_usage(struct module *a, struct module *b)
562{ 567{
563 struct module_use *use; 568 struct module_use *use;
564 int no_warn, err;
565 569
566 if (b == NULL || already_uses(a, b)) return 1; 570 DEBUGP("Allocating new usage for %s.\n", a->name);
571 use = kmalloc(sizeof(*use), GFP_ATOMIC);
572 if (!use) {
573 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
574 return -ENOMEM;
575 }
576
577 use->source = a;
578 use->target = b;
579 list_add(&use->source_list, &b->source_list);
580 list_add(&use->target_list, &a->target_list);
581 return 0;
582}
567 583
568 /* If we're interrupted or time out, we fail. */ 584/* Module a uses b: caller needs module_mutex() */
569 if (wait_event_interruptible_timeout( 585int ref_module(struct module *a, struct module *b)
570 module_wq, (err = strong_try_module_get(b)) != -EBUSY, 586{
571 30 * HZ) <= 0) { 587 int err;
572 printk("%s: gave up waiting for init of module %s.\n", 588
573 a->name, b->name); 589 if (b == NULL || already_uses(a, b))
574 return 0; 590 return 0;
575 }
576 591
577 /* If strong_try_module_get() returned a different error, we fail. */ 592 /* If module isn't available, we fail. */
593 err = strong_try_module_get(b);
578 if (err) 594 if (err)
579 return 0; 595 return err;
580 596
581 DEBUGP("Allocating new usage for %s.\n", a->name); 597 err = add_module_usage(a, b);
582 use = kmalloc(sizeof(*use), GFP_ATOMIC); 598 if (err) {
583 if (!use) {
584 printk("%s: out of memory loading\n", a->name);
585 module_put(b); 599 module_put(b);
586 return 0; 600 return err;
587 } 601 }
588 602 return 0;
589 use->module_which_uses = a;
590 list_add(&use->list, &b->modules_which_use_me);
591 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
592 return 1;
593} 603}
594EXPORT_SYMBOL_GPL(use_module); 604EXPORT_SYMBOL_GPL(ref_module);
595 605
596/* Clear the unload stuff of the module. */ 606/* Clear the unload stuff of the module. */
597static void module_unload_free(struct module *mod) 607static void module_unload_free(struct module *mod)
598{ 608{
599 struct module *i; 609 struct module_use *use, *tmp;
600
601 list_for_each_entry(i, &modules, list) {
602 struct module_use *use;
603 610
604 list_for_each_entry(use, &i->modules_which_use_me, list) { 611 mutex_lock(&module_mutex);
605 if (use->module_which_uses == mod) { 612 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
606 DEBUGP("%s unusing %s\n", mod->name, i->name); 613 struct module *i = use->target;
607 module_put(i); 614 DEBUGP("%s unusing %s\n", mod->name, i->name);
608 list_del(&use->list); 615 module_put(i);
609 kfree(use); 616 list_del(&use->source_list);
610 sysfs_remove_link(i->holders_dir, mod->name); 617 list_del(&use->target_list);
611 /* There can be at most one match. */ 618 kfree(use);
612 break;
613 }
614 }
615 } 619 }
620 mutex_unlock(&module_mutex);
616} 621}
617 622
618#ifdef CONFIG_MODULE_FORCE_UNLOAD 623#ifdef CONFIG_MODULE_FORCE_UNLOAD
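
The bookkeeping above assumes struct module_use has grown companion list heads plus explicit source/target pointers; the structure itself moves out of module.c in this patch, so the layout below is inferred from its uses and may not match the header exactly:

struct module_use {
	struct list_head source_list;	/* hangs off target->source_list: "who uses me" */
	struct list_head target_list;	/* hangs off source->target_list: "whom do I use" */
	struct module *source;		/* the module doing the using */
	struct module *target;		/* the module being used */
};

Linking each module_use on both lists is what lets module_unload_free() walk mod->target_list directly instead of scanning the global module list, and lets kdb's lsmod and the /proc output report users via mod->source_list.
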
@@ -735,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
735 goto out; 740 goto out;
736 } 741 }
737 742
738 if (!list_empty(&mod->modules_which_use_me)) { 743 if (!list_empty(&mod->source_list)) {
739 /* Other modules depend on us: get rid of them first. */ 744 /* Other modules depend on us: get rid of them first. */
740 ret = -EWOULDBLOCK; 745 ret = -EWOULDBLOCK;
741 goto out; 746 goto out;
@@ -779,13 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
779 blocking_notifier_call_chain(&module_notify_list, 784 blocking_notifier_call_chain(&module_notify_list,
780 MODULE_STATE_GOING, mod); 785 MODULE_STATE_GOING, mod);
781 async_synchronize_full(); 786 async_synchronize_full();
782 mutex_lock(&module_mutex); 787
783 /* Store the name of the last unloaded module for diagnostic purposes */ 788 /* Store the name of the last unloaded module for diagnostic purposes */
784 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 789 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
785 ddebug_remove_module(mod->name); 790 ddebug_remove_module(mod->name);
786 free_module(mod);
787 791
788 out: 792 free_module(mod);
793 return 0;
794out:
789 mutex_unlock(&module_mutex); 795 mutex_unlock(&module_mutex);
790 return ret; 796 return ret;
791} 797}
@@ -799,9 +805,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
799 805
800 /* Always include a trailing , so userspace can differentiate 806 /* Always include a trailing , so userspace can differentiate
801 between this and the old multi-field proc format. */ 807 between this and the old multi-field proc format. */
802 list_for_each_entry(use, &mod->modules_which_use_me, list) { 808 list_for_each_entry(use, &mod->source_list, source_list) {
803 printed_something = 1; 809 printed_something = 1;
804 seq_printf(m, "%s,", use->module_which_uses->name); 810 seq_printf(m, "%s,", use->source->name);
805 } 811 }
806 812
807 if (mod->init != NULL && mod->exit == NULL) { 813 if (mod->init != NULL && mod->exit == NULL) {
@@ -880,11 +886,11 @@ static inline void module_unload_free(struct module *mod)
880{ 886{
881} 887}
882 888
883int use_module(struct module *a, struct module *b) 889int ref_module(struct module *a, struct module *b)
884{ 890{
885 return strong_try_module_get(b) == 0; 891 return strong_try_module_get(b);
886} 892}
887EXPORT_SYMBOL_GPL(use_module); 893EXPORT_SYMBOL_GPL(ref_module);
888 894
889static inline void module_unload_init(struct module *mod) 895static inline void module_unload_init(struct module *mod)
890{ 896{
@@ -1001,6 +1007,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1001{ 1007{
1002 const unsigned long *crc; 1008 const unsigned long *crc;
1003 1009
1010 /* Since this should be found in kernel (which can't be removed),
1011 * no locking is necessary. */
1004 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1012 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1005 &crc, true, false)) 1013 &crc, true, false))
1006 BUG(); 1014 BUG();
@@ -1043,29 +1051,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1043} 1051}
1044#endif /* CONFIG_MODVERSIONS */ 1052#endif /* CONFIG_MODVERSIONS */
1045 1053
1046/* Resolve a symbol for this module. I.e. if we find one, record usage. 1054/* Resolve a symbol for this module. I.e. if we find one, record usage. */
1047 Must be holding module_mutex. */
1048static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1055static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1049 unsigned int versindex, 1056 unsigned int versindex,
1050 const char *name, 1057 const char *name,
1051 struct module *mod) 1058 struct module *mod,
1059 char ownername[])
1052{ 1060{
1053 struct module *owner; 1061 struct module *owner;
1054 const struct kernel_symbol *sym; 1062 const struct kernel_symbol *sym;
1055 const unsigned long *crc; 1063 const unsigned long *crc;
1064 int err;
1056 1065
1066 mutex_lock(&module_mutex);
1057 sym = find_symbol(name, &owner, &crc, 1067 sym = find_symbol(name, &owner, &crc,
1058 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1068 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1059 /* use_module can fail due to OOM, 1069 if (!sym)
1060 or module initialization or unloading */ 1070 goto unlock;
1061 if (sym) { 1071
1062 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1072 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
1063 || !use_module(mod, owner)) 1073 sym = ERR_PTR(-EINVAL);
1064 sym = NULL; 1074 goto getname;
1065 } 1075 }
1076
1077 err = ref_module(mod, owner);
1078 if (err) {
1079 sym = ERR_PTR(err);
1080 goto getname;
1081 }
1082
1083getname:
1084 /* We must make copy under the lock if we failed to get ref. */
1085 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1086unlock:
1087 mutex_unlock(&module_mutex);
1066 return sym; 1088 return sym;
1067} 1089}
1068 1090
1091static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1092 unsigned int versindex,
1093 const char *name,
1094 struct module *mod)
1095{
1096 const struct kernel_symbol *ksym;
1097 char ownername[MODULE_NAME_LEN];
1098
1099 if (wait_event_interruptible_timeout(module_wq,
1100 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
1101 mod, ownername)) ||
1102 PTR_ERR(ksym) != -EBUSY,
1103 30 * HZ) <= 0) {
1104 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1105 mod->name, ownername);
1106 }
1107 return ksym;
1108}
1109
1069/* 1110/*
1070 * /sys/module/foo/sections stuff 1111 * /sys/module/foo/sections stuff
1071 * J. Corbet <corbet@lwn.net> 1112 * J. Corbet <corbet@lwn.net>
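
resolve_symbol() now distinguishes three outcomes: NULL for a symbol that simply isn't exported, ERR_PTR(-EBUSY) while the owning module is still initializing (which resolve_symbol_wait() retries for up to 30 seconds), and any other ERR_PTR() for a hard failure. Callers discriminate the cases the way the SHN_UNDEF branch of simplify_symbols() does later in this patch; a condensed sketch of that logic:

	ksym = resolve_symbol_wait(sechdrs, versindex, name, mod);
	if (ksym && !IS_ERR(ksym)) {
		sym[i].st_value = ksym->value;		/* resolved */
	} else if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) {
		/* undefined weak symbol: leave it unresolved */
	} else {
		ret = PTR_ERR(ksym) ?: -ENOENT;		/* propagate the real error */
	}
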
@@ -1295,7 +1336,34 @@ static inline void remove_notes_attrs(struct module *mod)
1295#endif 1336#endif
1296 1337
1297#ifdef CONFIG_SYSFS 1338#ifdef CONFIG_SYSFS
1298int module_add_modinfo_attrs(struct module *mod) 1339static void add_usage_links(struct module *mod)
1340{
1341#ifdef CONFIG_MODULE_UNLOAD
1342 struct module_use *use;
1343 int nowarn;
1344
1345 mutex_lock(&module_mutex);
1346 list_for_each_entry(use, &mod->target_list, target_list) {
1347 nowarn = sysfs_create_link(use->target->holders_dir,
1348 &mod->mkobj.kobj, mod->name);
1349 }
1350 mutex_unlock(&module_mutex);
1351#endif
1352}
1353
1354static void del_usage_links(struct module *mod)
1355{
1356#ifdef CONFIG_MODULE_UNLOAD
1357 struct module_use *use;
1358
1359 mutex_lock(&module_mutex);
1360 list_for_each_entry(use, &mod->target_list, target_list)
1361 sysfs_remove_link(use->target->holders_dir, mod->name);
1362 mutex_unlock(&module_mutex);
1363#endif
1364}
1365
1366static int module_add_modinfo_attrs(struct module *mod)
1299{ 1367{
1300 struct module_attribute *attr; 1368 struct module_attribute *attr;
1301 struct module_attribute *temp_attr; 1369 struct module_attribute *temp_attr;
@@ -1321,7 +1389,7 @@ int module_add_modinfo_attrs(struct module *mod)
1321 return error; 1389 return error;
1322} 1390}
1323 1391
1324void module_remove_modinfo_attrs(struct module *mod) 1392static void module_remove_modinfo_attrs(struct module *mod)
1325{ 1393{
1326 struct module_attribute *attr; 1394 struct module_attribute *attr;
1327 int i; 1395 int i;
@@ -1337,7 +1405,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1337 kfree(mod->modinfo_attrs); 1405 kfree(mod->modinfo_attrs);
1338} 1406}
1339 1407
1340int mod_sysfs_init(struct module *mod) 1408static int mod_sysfs_init(struct module *mod)
1341{ 1409{
1342 int err; 1410 int err;
1343 struct kobject *kobj; 1411 struct kobject *kobj;
@@ -1371,12 +1439,16 @@ out:
1371 return err; 1439 return err;
1372} 1440}
1373 1441
1374int mod_sysfs_setup(struct module *mod, 1442static int mod_sysfs_setup(struct module *mod,
1375 struct kernel_param *kparam, 1443 struct kernel_param *kparam,
1376 unsigned int num_params) 1444 unsigned int num_params)
1377{ 1445{
1378 int err; 1446 int err;
1379 1447
1448 err = mod_sysfs_init(mod);
1449 if (err)
1450 goto out;
1451
1380 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1452 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1381 if (!mod->holders_dir) { 1453 if (!mod->holders_dir) {
1382 err = -ENOMEM; 1454 err = -ENOMEM;
@@ -1391,6 +1463,8 @@ int mod_sysfs_setup(struct module *mod,
1391 if (err) 1463 if (err)
1392 goto out_unreg_param; 1464 goto out_unreg_param;
1393 1465
1466 add_usage_links(mod);
1467
1394 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1468 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1395 return 0; 1469 return 0;
1396 1470
@@ -1400,6 +1474,7 @@ out_unreg_holders:
1400 kobject_put(mod->holders_dir); 1474 kobject_put(mod->holders_dir);
1401out_unreg: 1475out_unreg:
1402 kobject_put(&mod->mkobj.kobj); 1476 kobject_put(&mod->mkobj.kobj);
1477out:
1403 return err; 1478 return err;
1404} 1479}
1405 1480
@@ -1410,14 +1485,40 @@ static void mod_sysfs_fini(struct module *mod)
1410 1485
1411#else /* CONFIG_SYSFS */ 1486#else /* CONFIG_SYSFS */
1412 1487
1488static inline int mod_sysfs_init(struct module *mod)
1489{
1490 return 0;
1491}
1492
1493static inline int mod_sysfs_setup(struct module *mod,
1494 struct kernel_param *kparam,
1495 unsigned int num_params)
1496{
1497 return 0;
1498}
1499
1500static inline int module_add_modinfo_attrs(struct module *mod)
1501{
1502 return 0;
1503}
1504
1505static inline void module_remove_modinfo_attrs(struct module *mod)
1506{
1507}
1508
1413static void mod_sysfs_fini(struct module *mod) 1509static void mod_sysfs_fini(struct module *mod)
1414{ 1510{
1415} 1511}
1416 1512
1513static void del_usage_links(struct module *mod)
1514{
1515}
1516
1417#endif /* CONFIG_SYSFS */ 1517#endif /* CONFIG_SYSFS */
1418 1518
1419static void mod_kobject_remove(struct module *mod) 1519static void mod_kobject_remove(struct module *mod)
1420{ 1520{
1521 del_usage_links(mod);
1421 module_remove_modinfo_attrs(mod); 1522 module_remove_modinfo_attrs(mod);
1422 module_param_sysfs_remove(mod); 1523 module_param_sysfs_remove(mod);
1423 kobject_put(mod->mkobj.drivers_dir); 1524 kobject_put(mod->mkobj.drivers_dir);
@@ -1436,13 +1537,15 @@ static int __unlink_module(void *_mod)
1436 return 0; 1537 return 0;
1437} 1538}
1438 1539
1439/* Free a module, remove from lists, etc (must hold module_mutex). */ 1540/* Free a module, remove from lists, etc. */
1440static void free_module(struct module *mod) 1541static void free_module(struct module *mod)
1441{ 1542{
1442 trace_module_free(mod); 1543 trace_module_free(mod);
1443 1544
1444 /* Delete from various lists */ 1545 /* Delete from various lists */
1546 mutex_lock(&module_mutex);
1445 stop_machine(__unlink_module, mod, NULL); 1547 stop_machine(__unlink_module, mod, NULL);
1548 mutex_unlock(&module_mutex);
1446 remove_notes_attrs(mod); 1549 remove_notes_attrs(mod);
1447 remove_sect_attrs(mod); 1550 remove_sect_attrs(mod);
1448 mod_kobject_remove(mod); 1551 mod_kobject_remove(mod);
@@ -1493,6 +1596,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1493/* 1596/*
1494 * Ensure that an exported symbol [global namespace] does not already exist 1597 * Ensure that an exported symbol [global namespace] does not already exist
1495 * in the kernel or in some other module's exported symbol table. 1598 * in the kernel or in some other module's exported symbol table.
1599 *
1600 * You must hold the module_mutex.
1496 */ 1601 */
1497static int verify_export_symbols(struct module *mod) 1602static int verify_export_symbols(struct module *mod)
1498{ 1603{
@@ -1558,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1558 break; 1663 break;
1559 1664
1560 case SHN_UNDEF: 1665 case SHN_UNDEF:
1561 ksym = resolve_symbol(sechdrs, versindex, 1666 ksym = resolve_symbol_wait(sechdrs, versindex,
1562 strtab + sym[i].st_name, mod); 1667 strtab + sym[i].st_name,
1668 mod);
1563 /* Ok if resolved. */ 1669 /* Ok if resolved. */
1564 if (ksym) { 1670 if (ksym && !IS_ERR(ksym)) {
1565 sym[i].st_value = ksym->value; 1671 sym[i].st_value = ksym->value;
1566 break; 1672 break;
1567 } 1673 }
1568 1674
1569 /* Ok if weak. */ 1675 /* Ok if weak. */
1570 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1676 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1571 break; 1677 break;
1572 1678
1573 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1679 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1574 mod->name, strtab + sym[i].st_name); 1680 mod->name, strtab + sym[i].st_name,
1575 ret = -ENOENT; 1681 PTR_ERR(ksym));
1682 ret = PTR_ERR(ksym) ?: -ENOENT;
1576 break; 1683 break;
1577 1684
1578 default: 1685 default:
@@ -1955,16 +2062,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1955#endif 2062#endif
1956} 2063}
1957 2064
2065static void dynamic_debug_remove(struct _ddebug *debug)
2066{
2067 if (debug)
2068 ddebug_remove_module(debug->modname);
2069}
2070
1958static void *module_alloc_update_bounds(unsigned long size) 2071static void *module_alloc_update_bounds(unsigned long size)
1959{ 2072{
1960 void *ret = module_alloc(size); 2073 void *ret = module_alloc(size);
1961 2074
1962 if (ret) { 2075 if (ret) {
2076 mutex_lock(&module_mutex);
1963 /* Update module bounds. */ 2077 /* Update module bounds. */
1964 if ((unsigned long)ret < module_addr_min) 2078 if ((unsigned long)ret < module_addr_min)
1965 module_addr_min = (unsigned long)ret; 2079 module_addr_min = (unsigned long)ret;
1966 if ((unsigned long)ret + size > module_addr_max) 2080 if ((unsigned long)ret + size > module_addr_max)
1967 module_addr_max = (unsigned long)ret + size; 2081 module_addr_max = (unsigned long)ret + size;
2082 mutex_unlock(&module_mutex);
1968 } 2083 }
1969 return ret; 2084 return ret;
1970} 2085}
@@ -2014,6 +2129,9 @@ static noinline struct module *load_module(void __user *umod,
2014 long err = 0; 2129 long err = 0;
2015 void *ptr = NULL; /* Stops spurious gcc warning */ 2130 void *ptr = NULL; /* Stops spurious gcc warning */
2016 unsigned long symoffs, stroffs, *strmap; 2131 unsigned long symoffs, stroffs, *strmap;
2132 void __percpu *percpu;
2133 struct _ddebug *debug = NULL;
2134 unsigned int num_debug = 0;
2017 2135
2018 mm_segment_t old_fs; 2136 mm_segment_t old_fs;
2019 2137
@@ -2138,11 +2256,6 @@ static noinline struct module *load_module(void __user *umod,
2138 goto free_mod; 2256 goto free_mod;
2139 } 2257 }
2140 2258
2141 if (find_module(mod->name)) {
2142 err = -EEXIST;
2143 goto free_mod;
2144 }
2145
2146 mod->state = MODULE_STATE_COMING; 2259 mod->state = MODULE_STATE_COMING;
2147 2260
2148 /* Allow arches to frob section contents and sizes. */ 2261 /* Allow arches to frob section contents and sizes. */
@@ -2158,6 +2271,8 @@ static noinline struct module *load_module(void __user *umod,
2158 goto free_mod; 2271 goto free_mod;
2159 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2272 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2160 } 2273 }
2274 /* Keep this around for failure path. */
2275 percpu = mod_percpu(mod);
2161 2276
2162 /* Determine total sizes, and put offsets in sh_entsize. For now 2277 /* Determine total sizes, and put offsets in sh_entsize. For now
2163 this is done generically; there doesn't appear to be any 2278 this is done generically; there doesn't appear to be any
@@ -2231,11 +2346,6 @@ static noinline struct module *load_module(void __user *umod,
2231 /* Now we've moved module, initialize linked lists, etc. */ 2346 /* Now we've moved module, initialize linked lists, etc. */
2232 module_unload_init(mod); 2347 module_unload_init(mod);
2233 2348
2234 /* add kobject, so we can reference it. */
2235 err = mod_sysfs_init(mod);
2236 if (err)
2237 goto free_unload;
2238
2239 /* Set up license info based on the info section */ 2349 /* Set up license info based on the info section */
2240 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2350 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2241 2351
@@ -2360,11 +2470,6 @@ static noinline struct module *load_module(void __user *umod,
2360 goto cleanup; 2470 goto cleanup;
2361 } 2471 }
2362 2472
2363 /* Find duplicate symbols */
2364 err = verify_export_symbols(mod);
2365 if (err < 0)
2366 goto cleanup;
2367
2368 /* Set up and sort exception table */ 2473 /* Set up and sort exception table */
2369 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", 2474 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2370 sizeof(*mod->extable), &mod->num_exentries); 2475 sizeof(*mod->extable), &mod->num_exentries);
@@ -2379,15 +2484,9 @@ static noinline struct module *load_module(void __user *umod,
2379 kfree(strmap); 2484 kfree(strmap);
2380 strmap = NULL; 2485 strmap = NULL;
2381 2486
2382 if (!mod->taints) { 2487 if (!mod->taints)
2383 struct _ddebug *debug;
2384 unsigned int num_debug;
2385
2386 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2488 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2387 sizeof(*debug), &num_debug); 2489 sizeof(*debug), &num_debug);
2388 if (debug)
2389 dynamic_debug_setup(debug, num_debug);
2390 }
2391 2490
2392 err = module_finalize(hdr, sechdrs, mod); 2491 err = module_finalize(hdr, sechdrs, mod);
2393 if (err < 0) 2492 if (err < 0)
@@ -2423,7 +2522,22 @@ static noinline struct module *load_module(void __user *umod,
2423 * function to insert in a way safe to concurrent readers. 2522 * function to insert in a way safe to concurrent readers.
2424 * The mutex protects against concurrent writers. 2523 * The mutex protects against concurrent writers.
2425 */ 2524 */
2525 mutex_lock(&module_mutex);
2526 if (find_module(mod->name)) {
2527 err = -EEXIST;
2528 goto unlock;
2529 }
2530
2531 if (debug)
2532 dynamic_debug_setup(debug, num_debug);
2533
2534 /* Find duplicate symbols */
2535 err = verify_export_symbols(mod);
2536 if (err < 0)
2537 goto ddebug;
2538
2426 list_add_rcu(&mod->list, &modules); 2539 list_add_rcu(&mod->list, &modules);
2540 mutex_unlock(&module_mutex);
2427 2541
2428 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2542 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2429 if (err < 0) 2543 if (err < 0)
@@ -2432,6 +2546,7 @@ static noinline struct module *load_module(void __user *umod,
2432 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2546 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2433 if (err < 0) 2547 if (err < 0)
2434 goto unlink; 2548 goto unlink;
2549
2435 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2550 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2436 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2551 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2437 2552
@@ -2444,15 +2559,17 @@ static noinline struct module *load_module(void __user *umod,
2444 return mod; 2559 return mod;
2445 2560
2446 unlink: 2561 unlink:
2562 mutex_lock(&module_mutex);
2447 /* Unlink carefully: kallsyms could be walking list. */ 2563 /* Unlink carefully: kallsyms could be walking list. */
2448 list_del_rcu(&mod->list); 2564 list_del_rcu(&mod->list);
2565 ddebug:
2566 dynamic_debug_remove(debug);
2567 unlock:
2568 mutex_unlock(&module_mutex);
2449 synchronize_sched(); 2569 synchronize_sched();
2450 module_arch_cleanup(mod); 2570 module_arch_cleanup(mod);
2451 cleanup: 2571 cleanup:
2452 free_modinfo(mod); 2572 free_modinfo(mod);
2453 kobject_del(&mod->mkobj.kobj);
2454 kobject_put(&mod->mkobj.kobj);
2455 free_unload:
2456 module_unload_free(mod); 2573 module_unload_free(mod);
2457#if defined(CONFIG_MODULE_UNLOAD) 2574#if defined(CONFIG_MODULE_UNLOAD)
2458 free_percpu(mod->refptr); 2575 free_percpu(mod->refptr);
@@ -2463,7 +2580,7 @@ static noinline struct module *load_module(void __user *umod,
2463 module_free(mod, mod->module_core); 2580 module_free(mod, mod->module_core);
2464 /* mod will be freed with core. Don't access it beyond this line! */ 2581 /* mod will be freed with core. Don't access it beyond this line! */
2465 free_percpu: 2582 free_percpu:
2466 percpu_modfree(mod); 2583 free_percpu(percpu);
2467 free_mod: 2584 free_mod:
2468 kfree(args); 2585 kfree(args);
2469 kfree(strmap); 2586 kfree(strmap);
@@ -2499,19 +2616,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2499 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2616 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2500 return -EPERM; 2617 return -EPERM;
2501 2618
2502 /* Only one module load at a time, please */
2503 if (mutex_lock_interruptible(&module_mutex) != 0)
2504 return -EINTR;
2505
2506 /* Do all the hard work */ 2619 /* Do all the hard work */
2507 mod = load_module(umod, len, uargs); 2620 mod = load_module(umod, len, uargs);
2508 if (IS_ERR(mod)) { 2621 if (IS_ERR(mod))
2509 mutex_unlock(&module_mutex);
2510 return PTR_ERR(mod); 2622 return PTR_ERR(mod);
2511 }
2512
2513 /* Drop lock so they can recurse */
2514 mutex_unlock(&module_mutex);
2515 2623
2516 blocking_notifier_call_chain(&module_notify_list, 2624 blocking_notifier_call_chain(&module_notify_list,
2517 MODULE_STATE_COMING, mod); 2625 MODULE_STATE_COMING, mod);
@@ -2528,9 +2636,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2528 module_put(mod); 2636 module_put(mod);
2529 blocking_notifier_call_chain(&module_notify_list, 2637 blocking_notifier_call_chain(&module_notify_list,
2530 MODULE_STATE_GOING, mod); 2638 MODULE_STATE_GOING, mod);
2531 mutex_lock(&module_mutex);
2532 free_module(mod); 2639 free_module(mod);
2533 mutex_unlock(&module_mutex);
2534 wake_up(&module_wq); 2640 wake_up(&module_wq);
2535 return ret; 2641 return ret;
2536 } 2642 }
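
Net effect of the module.c locking changes above: sys_init_module() no longer serializes all of load_module() behind module_mutex. The expensive work (copying, allocation, relocation) runs unlocked, and the mutex is taken only around the state it actually protects; in particular the duplicate-name check and the list insertion now form a single critical section. Condensed from the hunks above:

	mutex_lock(&module_mutex);
	if (find_module(mod->name)) {		/* duplicate check ...            */
		err = -EEXIST;
		goto unlock;
	}
	if (debug)
		dynamic_debug_setup(debug, num_debug);
	err = verify_export_symbols(mod);	/* needs module_mutex             */
	if (err < 0)
		goto ddebug;
	list_add_rcu(&mod->list, &modules);	/* ... and publication are atomic */
	mutex_unlock(&module_mutex);
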
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bd7ce8ca5bb9..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
283static void 283static void
284list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
285{ 285{
286 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
287 288
288 /* 289 /*
289 * Depending on whether it is a standalone or sibling event, 290 * If we're a stand alone event or group leader, we go to the context
290 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
291 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
292 */ 293 */
293 if (group_leader == event) { 294 if (event->group_leader == event) {
294 struct list_head *list; 295 struct list_head *list;
295 296
296 if (is_software_event(event)) 297 if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
298 299
299 list = ctx_group_list(event, ctx); 300 list = ctx_group_list(event, ctx);
300 list_add_tail(&event->group_entry, list); 301 list_add_tail(&event->group_entry, list);
301 } else {
302 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
303 !is_software_event(event))
304 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
305
306 list_add_tail(&event->group_entry, &group_leader->sibling_list);
307 group_leader->nr_siblings++;
308 } 302 }
309 303
310 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
313 ctx->nr_stat++; 307 ctx->nr_stat++;
314} 308}
315 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
316/* 328/*
317 * Remove a event from the lists for its context. 329 * Remove a event from the lists for its context.
318 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
320static void 332static void
321list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
322{ 334{
323 if (list_empty(&event->group_entry)) 335 /*
336 * We can have double detach due to exit/hot-unplug + close.
337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
324 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
325 ctx->nr_events--; 343 ctx->nr_events--;
326 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
327 ctx->nr_stat--; 345 ctx->nr_stat--;
328 346
329 list_del_init(&event->group_entry);
330 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
331 348
332 if (event->group_leader != event) 349 if (event->group_leader == event)
333 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
334 351
335 update_group_times(event); 352 update_group_times(event);
336 353
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
345 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
346} 363}
347 364
348static void 365static void perf_group_detach(struct perf_event *event)
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{ 366{
351 struct perf_event *sibling, *tmp; 367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
352 389
353 /* 390 /*
354 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
355 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
356 * to the context list directly: 393 * to whatever list we are on.
357 */ 394 */
358 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
359 struct list_head *list; 396 if (list)
360 397 list_move_tail(&sibling->group_entry, list);
361 list = ctx_group_list(event, ctx);
362 list_move_tail(&sibling->group_entry, list);
363 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
364 399
365 /* Inherit group flags from the previous leader */ 400 /* Inherit group flags from the previous leader */
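
The new attach_state bits make attach/detach idempotent: list_add_event()/perf_group_attach() set PERF_ATTACH_CONTEXT and PERF_ATTACH_GROUP, and the matching detach paths bail out early when the bit is already clear, which is exactly the exit/hot-unplug + close double-detach case the comments mention. The guard reduces to this idiom, shown here as a pair of hypothetical helpers (the bit values are illustrative; only the flag names appear in the hunks above):

#define PERF_ATTACH_CONTEXT	0x01	/* illustrative values */
#define PERF_ATTACH_GROUP	0x02

static void mark_attached(struct perf_event *event, unsigned int bit)
{
	WARN_ON_ONCE(event->attach_state & bit);	/* attaching twice is a bug */
	event->attach_state |= bit;
}

static bool mark_detached(struct perf_event *event, unsigned int bit)
{
	if (!(event->attach_state & bit))	/* second detach: silently ignore */
		return false;
	event->attach_state &= ~bit;
	return true;
}
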
@@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
652 if (txn) 687 if (txn)
653 pmu->start_txn(pmu); 688 pmu->start_txn(pmu);
654 689
655 if (event_sched_in(group_event, cpuctx, ctx)) 690 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn)
692 pmu->cancel_txn(pmu);
656 return -EAGAIN; 693 return -EAGAIN;
694 }
657 695
658 /* 696 /*
659 * Schedule in siblings as one group (if any): 697 * Schedule in siblings as one group (if any):
@@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
675 } 713 }
676 714
677group_error: 715group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
681 /* 716 /*
682 * Groups can be scheduled in as one unit only, so undo any 717 * Groups can be scheduled in as one unit only, so undo any
683 * partial group before returning: 718 * partial group before returning:
@@ -689,6 +724,9 @@ group_error:
689 } 724 }
690 event_sched_out(group_event, cpuctx, ctx); 725 event_sched_out(group_event, cpuctx, ctx);
691 726
727 if (txn)
728 pmu->cancel_txn(pmu);
729
692 return -EAGAIN; 730 return -EAGAIN;
693} 731}
694 732
@@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
727 struct perf_event_context *ctx) 765 struct perf_event_context *ctx)
728{ 766{
729 list_add_event(event, ctx); 767 list_add_event(event, ctx);
768 perf_group_attach(event);
730 event->tstamp_enabled = ctx->time; 769 event->tstamp_enabled = ctx->time;
731 event->tstamp_running = ctx->time; 770 event->tstamp_running = ctx->time;
732 event->tstamp_stopped = ctx->time; 771 event->tstamp_stopped = ctx->time;
@@ -1468,6 +1507,9 @@ do { \
1468 divisor = nsec * frequency; 1507 divisor = nsec * frequency;
1469 } 1508 }
1470 1509
1510 if (!divisor)
1511 return dividend;
1512
1471 return div64_u64(dividend, divisor); 1513 return div64_u64(dividend, divisor);
1472} 1514}
1473 1515
@@ -1490,7 +1532,7 @@ static int perf_event_start(struct perf_event *event)
1490static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1491{ 1533{
1492 struct hw_perf_event *hwc = &event->hw; 1534 struct hw_perf_event *hwc = &event->hw;
1493 u64 period, sample_period; 1535 s64 period, sample_period;
1494 s64 delta; 1536 s64 delta;
1495 1537
1496 period = perf_calculate_period(event, nsec, count); 1538 period = perf_calculate_period(event, nsec, count);
@@ -1841,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
1841} 1883}
1842 1884
1843static void perf_pending_sync(struct perf_event *event); 1885static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1844 1887
1845static void free_event(struct perf_event *event) 1888static void free_event(struct perf_event *event)
1846{ 1889{
@@ -1856,9 +1899,9 @@ static void free_event(struct perf_event *event)
1856 atomic_dec(&nr_task_events); 1899 atomic_dec(&nr_task_events);
1857 } 1900 }
1858 1901
1859 if (event->output) { 1902 if (event->data) {
1860 fput(event->output->filp); 1903 perf_mmap_data_put(event->data);
1861 event->output = NULL; 1904 event->data = NULL;
1862 } 1905 }
1863 1906
1864 if (event->destroy) 1907 if (event->destroy)
@@ -1893,8 +1936,8 @@ int perf_event_release_kernel(struct perf_event *event)
1893 */ 1936 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 1937 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock); 1938 raw_spin_lock_irq(&ctx->lock);
1939 perf_group_detach(event);
1896 list_del_event(event, ctx); 1940 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock); 1941 raw_spin_unlock_irq(&ctx->lock);
1899 mutex_unlock(&ctx->mutex); 1942 mutex_unlock(&ctx->mutex);
1900 1943
@@ -2175,7 +2218,27 @@ unlock:
2175 return ret; 2218 return ret;
2176} 2219}
2177 2220
2178static int perf_event_set_output(struct perf_event *event, int output_fd); 2221static const struct file_operations perf_fops;
2222
2223static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2224{
2225 struct file *file;
2226
2227 file = fget_light(fd, fput_needed);
2228 if (!file)
2229 return ERR_PTR(-EBADF);
2230
2231 if (file->f_op != &perf_fops) {
2232 fput_light(file, *fput_needed);
2233 *fput_needed = 0;
2234 return ERR_PTR(-EBADF);
2235 }
2236
2237 return file->private_data;
2238}
2239
2240static int perf_event_set_output(struct perf_event *event,
2241 struct perf_event *output_event);
2179static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2242static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2180 2243
2181static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2244static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2202 return perf_event_period(event, (u64 __user *)arg); 2265 return perf_event_period(event, (u64 __user *)arg);
2203 2266
2204 case PERF_EVENT_IOC_SET_OUTPUT: 2267 case PERF_EVENT_IOC_SET_OUTPUT:
2205 return perf_event_set_output(event, arg); 2268 {
2269 struct perf_event *output_event = NULL;
2270 int fput_needed = 0;
2271 int ret;
2272
2273 if (arg != -1) {
2274 output_event = perf_fget_light(arg, &fput_needed);
2275 if (IS_ERR(output_event))
2276 return PTR_ERR(output_event);
2277 }
2278
2279 ret = perf_event_set_output(event, output_event);
2280 if (output_event)
2281 fput_light(output_event->filp, fput_needed);
2282
2283 return ret;
2284 }
2206 2285
2207 case PERF_EVENT_IOC_SET_FILTER: 2286 case PERF_EVENT_IOC_SET_FILTER:
2208 return perf_event_set_filter(event, (void __user *)arg); 2287 return perf_event_set_filter(event, (void __user *)arg);
@@ -2335,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2335 unsigned long size; 2414 unsigned long size;
2336 int i; 2415 int i;
2337 2416
2338 WARN_ON(atomic_read(&event->mmap_count));
2339
2340 size = sizeof(struct perf_mmap_data); 2417 size = sizeof(struct perf_mmap_data);
2341 size += nr_pages * sizeof(void *); 2418 size += nr_pages * sizeof(void *);
2342 2419
@@ -2452,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2452 unsigned long size; 2529 unsigned long size;
2453 void *all_buf; 2530 void *all_buf;
2454 2531
2455 WARN_ON(atomic_read(&event->mmap_count));
2456
2457 size = sizeof(struct perf_mmap_data); 2532 size = sizeof(struct perf_mmap_data);
2458 size += sizeof(void *); 2533 size += sizeof(void *);
2459 2534
@@ -2536,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2536 if (!data->watermark) 2611 if (!data->watermark)
2537 data->watermark = max_size / 2; 2612 data->watermark = max_size / 2;
2538 2613
2539 2614 atomic_set(&data->refcount, 1);
2540 rcu_assign_pointer(event->data, data); 2615 rcu_assign_pointer(event->data, data);
2541} 2616}
2542 2617
@@ -2548,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2548 perf_mmap_data_free(data); 2623 perf_mmap_data_free(data);
2549} 2624}
2550 2625
2551static void perf_mmap_data_release(struct perf_event *event) 2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2552{ 2627{
2553 struct perf_mmap_data *data = event->data; 2628 struct perf_mmap_data *data;
2629
2630 rcu_read_lock();
2631 data = rcu_dereference(event->data);
2632 if (data) {
2633 if (!atomic_inc_not_zero(&data->refcount))
2634 data = NULL;
2635 }
2636 rcu_read_unlock();
2637
2638 return data;
2639}
2554 2640
2555 WARN_ON(atomic_read(&event->mmap_count)); 2641static void perf_mmap_data_put(struct perf_mmap_data *data)
2642{
2643 if (!atomic_dec_and_test(&data->refcount))
2644 return;
2556 2645
2557 rcu_assign_pointer(event->data, NULL);
2558 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2559} 2647}
2560 2648
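
perf_mmap_data_get()/perf_mmap_data_put() above turn the mmap buffer into a properly refcounted object, so it can be shared between the mmap()ed event and events redirected to it via PERF_EVENT_IOC_SET_OUTPUT. The lookup side is the usual RCU plus atomic_inc_not_zero() pattern: a reference is taken only while the count is still nonzero, so a reader can never resurrect a buffer whose final put has already queued it for call_rcu() freeing. Condensed from the two functions above:

	/* get: only succeed while at least one reference is still live */
	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (data && !atomic_inc_not_zero(&data->refcount))
		data = NULL;		/* lost the race with the final put */
	rcu_read_unlock();

	/* put: the last reference frees, but only after RCU readers drain */
	if (atomic_dec_and_test(&data->refcount))
		call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
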
@@ -2569,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2569{ 2657{
2570 struct perf_event *event = vma->vm_file->private_data; 2658 struct perf_event *event = vma->vm_file->private_data;
2571 2659
2572 WARN_ON_ONCE(event->ctx->parent_ctx);
2573 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2574 unsigned long size = perf_data_size(event->data); 2661 unsigned long size = perf_data_size(event->data);
2575 struct user_struct *user = current_user(); 2662 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data;
2576 2664
2577 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2578 vma->vm_mm->locked_vm -= event->data->nr_locked; 2666 vma->vm_mm->locked_vm -= event->mmap_locked;
2579 perf_mmap_data_release(event); 2667 rcu_assign_pointer(event->data, NULL);
2580 mutex_unlock(&event->mmap_mutex); 2668 mutex_unlock(&event->mmap_mutex);
2669
2670 perf_mmap_data_put(data);
2671 free_uid(user);
2581 } 2672 }
2582} 2673}
2583 2674
@@ -2629,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2629 2720
2630 WARN_ON_ONCE(event->ctx->parent_ctx); 2721 WARN_ON_ONCE(event->ctx->parent_ctx);
2631 mutex_lock(&event->mmap_mutex); 2722 mutex_lock(&event->mmap_mutex);
2632 if (event->output) { 2723 if (event->data) {
2633 ret = -EINVAL; 2724 if (event->data->nr_pages == nr_pages)
2634 goto unlock; 2725 atomic_inc(&event->data->refcount);
2635 } 2726 else
2636
2637 if (atomic_inc_not_zero(&event->mmap_count)) {
2638 if (nr_pages != event->data->nr_pages)
2639 ret = -EINVAL; 2727 ret = -EINVAL;
2640 goto unlock; 2728 goto unlock;
2641 } 2729 }
@@ -2667,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2667 WARN_ON(event->data); 2755 WARN_ON(event->data);
2668 2756
2669 data = perf_mmap_data_alloc(event, nr_pages); 2757 data = perf_mmap_data_alloc(event, nr_pages);
2670 ret = -ENOMEM; 2758 if (!data) {
2671 if (!data) 2759 ret = -ENOMEM;
2672 goto unlock; 2760 goto unlock;
2761 }
2673 2762
2674 ret = 0;
2675 perf_mmap_data_init(event, data); 2763 perf_mmap_data_init(event, data);
2676
2677 atomic_set(&event->mmap_count, 1);
2678 atomic_long_add(user_extra, &user->locked_vm);
2679 vma->vm_mm->locked_vm += extra;
2680 event->data->nr_locked = extra;
2681 if (vma->vm_flags & VM_WRITE) 2764 if (vma->vm_flags & VM_WRITE)
2682 event->data->writable = 1; 2765 event->data->writable = 1;
2683 2766
2767 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra;
2769 event->mmap_user = get_current_user();
2770 vma->vm_mm->locked_vm += event->mmap_locked;
2771
2684unlock: 2772unlock:
2773 if (!ret)
2774 atomic_inc(&event->mmap_count);
2685 mutex_unlock(&event->mmap_mutex); 2775 mutex_unlock(&event->mmap_mutex);
2686 2776
2687 vma->vm_flags |= VM_RESERVED; 2777 vma->vm_flags |= VM_RESERVED;
@@ -2977,6 +3067,7 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
2977 3067
2978 len -= size; 3068 len -= size;
2979 handle->addr += size; 3069 handle->addr += size;
3070 buf += size;
2980 handle->size -= size; 3071 handle->size -= size;
2981 if (!handle->size) { 3072 if (!handle->size) {
2982 struct perf_mmap_data *data = handle->data; 3073 struct perf_mmap_data *data = handle->data;
@@ -2993,7 +3084,6 @@ int perf_output_begin(struct perf_output_handle *handle,
2993 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
2994 int nmi, int sample) 3085 int nmi, int sample)
2995{ 3086{
2996 struct perf_event *output_event;
2997 struct perf_mmap_data *data; 3087 struct perf_mmap_data *data;
2998 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
2999 int have_lost; 3089 int have_lost;
@@ -3010,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
3010 if (event->parent) 3100 if (event->parent)
3011 event = event->parent; 3101 event = event->parent;
3012 3102
3013 output_event = rcu_dereference(event->output);
3014 if (output_event)
3015 event = output_event;
3016
3017 data = rcu_dereference(event->data); 3103 data = rcu_dereference(event->data);
3018 if (!data) 3104 if (!data)
3019 goto out; 3105 goto out;
@@ -3972,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3972 } 4058 }
3973} 4059}
3974 4060
3975static void perf_swevent_unthrottle(struct perf_event *event)
3976{
3977 /*
3978 * Nothing to do, we already reset hwc->interrupts.
3979 */
3980}
3981
3982static void perf_swevent_add(struct perf_event *event, u64 nr, 4061static void perf_swevent_add(struct perf_event *event, u64 nr,
3983 int nmi, struct perf_sample_data *data, 4062 int nmi, struct perf_sample_data *data,
3984 struct pt_regs *regs) 4063 struct pt_regs *regs)
@@ -4193,11 +4272,22 @@ static void perf_swevent_disable(struct perf_event *event)
4193 hlist_del_rcu(&event->hlist_entry); 4272 hlist_del_rcu(&event->hlist_entry);
4194} 4273}
4195 4274
4275static void perf_swevent_void(struct perf_event *event)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4282}
4283
4196static const struct pmu perf_ops_generic = { 4284static const struct pmu perf_ops_generic = {
4197 .enable = perf_swevent_enable, 4285 .enable = perf_swevent_enable,
4198 .disable = perf_swevent_disable, 4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4199 .read = perf_swevent_read, 4289 .read = perf_swevent_read,
4200 .unthrottle = perf_swevent_unthrottle, 4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4201}; 4291};
4202 4292
4203/* 4293/*
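The hunk above (and the perf_ops_tracepoint hunk below) replaces the dedicated perf_swevent_unthrottle() stub with shared no-op callbacks, perf_swevent_int() and perf_swevent_void(), that several pmu slots reuse. A minimal userspace C sketch of that pattern follows; counter_ops, noop_int and noop_void are invented names, not kernel APIs. Optional hooks in a table of function pointers default to shared stubs, so callers can invoke every slot unconditionally.

#include <stdio.h>

struct counter_ops {
    int  (*start)(int id);   /* optional hook: may point at a shared stub */
    void (*stop)(int id);    /* optional hook: may point at a shared stub */
    void (*read)(int id);
};

/* Shared no-op stubs, in the spirit of perf_swevent_int()/perf_swevent_void(). */
static int  noop_int(int id)  { (void)id; return 0; }
static void noop_void(int id) { (void)id; }

static void real_read(int id) { printf("read counter %d\n", id); }

static const struct counter_ops generic_ops = {
    .start = noop_int,   /* nothing to do, but callers need not check for NULL */
    .stop  = noop_void,
    .read  = real_read,
};

int main(void)
{
    /* every slot can be called unconditionally */
    if (generic_ops.start(0) == 0)
        generic_ops.read(0);
    generic_ops.stop(0);
    return 0;
}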
@@ -4478,8 +4568,10 @@ static int swevent_hlist_get(struct perf_event *event)
4478static const struct pmu perf_ops_tracepoint = { 4568static const struct pmu perf_ops_tracepoint = {
4479 .enable = perf_trace_enable, 4569 .enable = perf_trace_enable,
4480 .disable = perf_trace_disable, 4570 .disable = perf_trace_disable,
4571 .start = perf_swevent_int,
4572 .stop = perf_swevent_void,
4481 .read = perf_swevent_read, 4573 .read = perf_swevent_read,
4482 .unthrottle = perf_swevent_unthrottle, 4574 .unthrottle = perf_swevent_void,
4483}; 4575};
4484 4576
4485static int perf_tp_filter_match(struct perf_event *event, 4577static int perf_tp_filter_match(struct perf_event *event,
@@ -4912,39 +5004,17 @@ err_size:
4912 goto out; 5004 goto out;
4913} 5005}
4914 5006
4915static int perf_event_set_output(struct perf_event *event, int output_fd) 5007static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4916{ 5009{
4917 struct perf_event *output_event = NULL; 5010 struct perf_mmap_data *data = NULL, *old_data = NULL;
4918 struct file *output_file = NULL;
4919 struct perf_event *old_output;
4920 int fput_needed = 0;
4921 int ret = -EINVAL; 5011 int ret = -EINVAL;
4922 5012
4923 /* 5013 if (!output_event)
4924 * Don't allow output of inherited per-task events. This would
4925 * create performance issues due to cross cpu access.
4926 */
4927 if (event->cpu == -1 && event->attr.inherit)
4928 return -EINVAL;
4929
4930 if (!output_fd)
4931 goto set; 5014 goto set;
4932 5015
4933 output_file = fget_light(output_fd, &fput_needed); 5016 /* don't allow circular references */
4934 if (!output_file) 5017 if (event == output_event)
4935 return -EBADF;
4936
4937 if (output_file->f_op != &perf_fops)
4938 goto out;
4939
4940 output_event = output_file->private_data;
4941
4942 /* Don't chain output fds */
4943 if (output_event->output)
4944 goto out;
4945
4946 /* Don't set an output fd when we already have an output channel */
4947 if (event->data)
4948 goto out; 5018 goto out;
4949 5019
4950 /* 5020 /*
@@ -4959,26 +5029,28 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
4959 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5029 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4960 goto out; 5030 goto out;
4961 5031
4962 atomic_long_inc(&output_file->f_count);
4963
4964set: 5032set:
4965 mutex_lock(&event->mmap_mutex); 5033 mutex_lock(&event->mmap_mutex);
4966 old_output = event->output; 5034 /* Can't redirect output if we've got an active mmap() */
4967 rcu_assign_pointer(event->output, output_event); 5035 if (atomic_read(&event->mmap_count))
4968 mutex_unlock(&event->mmap_mutex); 5036 goto unlock;
4969 5037
4970 if (old_output) { 5038 if (output_event) {
4971 /* 5039 /* get the buffer we want to redirect to */
4972 * we need to make sure no existing perf_output_*() 5040 data = perf_mmap_data_get(output_event);
4973 * is still referencing this event. 5041 if (!data)
4974 */ 5042 goto unlock;
4975 synchronize_rcu();
4976 fput(old_output->filp);
4977 } 5043 }
4978 5044
5045 old_data = event->data;
5046 rcu_assign_pointer(event->data, data);
4979 ret = 0; 5047 ret = 0;
5048unlock:
5049 mutex_unlock(&event->mmap_mutex);
5050
5051 if (old_data)
5052 perf_mmap_data_put(old_data);
4980out: 5053out:
4981 fput_light(output_file, fput_needed);
4982 return ret; 5054 return ret;
4983} 5055}
4984 5056
@@ -4994,7 +5066,7 @@ SYSCALL_DEFINE5(perf_event_open,
4994 struct perf_event_attr __user *, attr_uptr, 5066 struct perf_event_attr __user *, attr_uptr,
4995 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4996{ 5068{
4997 struct perf_event *event, *group_leader; 5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4998 struct perf_event_attr attr; 5070 struct perf_event_attr attr;
4999 struct perf_event_context *ctx; 5071 struct perf_event_context *ctx;
5000 struct file *event_file = NULL; 5072 struct file *event_file = NULL;
@@ -5034,19 +5106,25 @@ SYSCALL_DEFINE5(perf_event_open,
5034 goto err_fd; 5106 goto err_fd;
5035 } 5107 }
5036 5108
5109 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader);
5113 goto err_put_context;
5114 }
5115 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT)
5117 output_event = group_leader;
5118 if (flags & PERF_FLAG_FD_NO_GROUP)
5119 group_leader = NULL;
5120 }
5121
5037 /* 5122 /*
5038 * Look up the group leader (we will attach this event to it): 5123 * Look up the group leader (we will attach this event to it):
5039 */ 5124 */
5040 group_leader = NULL; 5125 if (group_leader) {
5041 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
5042 err = -EINVAL; 5126 err = -EINVAL;
5043 group_file = fget_light(group_fd, &fput_needed);
5044 if (!group_file)
5045 goto err_put_context;
5046 if (group_file->f_op != &perf_fops)
5047 goto err_put_context;
5048 5127
5049 group_leader = group_file->private_data;
5050 /* 5128 /*
5051 * Do not allow a recursive hierarchy (this new sibling 5129 * Do not allow a recursive hierarchy (this new sibling
5052 * becoming part of another group-sibling): 5130 * becoming part of another group-sibling):
@@ -5068,9 +5146,16 @@ SYSCALL_DEFINE5(perf_event_open,
5068 5146
5069 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5070 NULL, NULL, GFP_KERNEL); 5148 NULL, NULL, GFP_KERNEL);
5071 err = PTR_ERR(event); 5149 if (IS_ERR(event)) {
5072 if (IS_ERR(event)) 5150 err = PTR_ERR(event);
5073 goto err_put_context; 5151 goto err_put_context;
5152 }
5153
5154 if (output_event) {
5155 err = perf_event_set_output(event, output_event);
5156 if (err)
5157 goto err_free_put_context;
5158 }
5074 5159
5075 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5076 if (IS_ERR(event_file)) { 5161 if (IS_ERR(event_file)) {
@@ -5078,12 +5163,6 @@ SYSCALL_DEFINE5(perf_event_open,
5078 goto err_free_put_context; 5163 goto err_free_put_context;
5079 } 5164 }
5080 5165
5081 if (flags & PERF_FLAG_FD_OUTPUT) {
5082 err = perf_event_set_output(event, group_fd);
5083 if (err)
5084 goto err_fput_free_put_context;
5085 }
5086
5087 event->filp = event_file; 5166 event->filp = event_file;
5088 WARN_ON_ONCE(ctx->parent_ctx); 5167 WARN_ON_ONCE(ctx->parent_ctx);
5089 mutex_lock(&ctx->mutex); 5168 mutex_lock(&ctx->mutex);
@@ -5097,12 +5176,16 @@ SYSCALL_DEFINE5(perf_event_open,
5097 list_add_tail(&event->owner_entry, &current->perf_event_list); 5176 list_add_tail(&event->owner_entry, &current->perf_event_list);
5098 mutex_unlock(&current->perf_event_mutex); 5177 mutex_unlock(&current->perf_event_mutex);
5099 5178
5179 /*
5180 * Drop the reference on the group_event after placing the
5181 * new event on the sibling_list. This ensures destruction
5182 * of the group leader will find the pointer to itself in
5183 * perf_group_detach().
5184 */
5100 fput_light(group_file, fput_needed); 5185 fput_light(group_file, fput_needed);
5101 fd_install(event_fd, event_file); 5186 fd_install(event_fd, event_file);
5102 return event_fd; 5187 return event_fd;
5103 5188
5104err_fput_free_put_context:
5105 fput(event_file);
5106err_free_put_context: 5189err_free_put_context:
5107 free_event(event); 5190 free_event(event);
5108err_put_context: 5191err_put_context:
@@ -5420,6 +5503,7 @@ static void perf_free_event(struct perf_event *event,
5420 5503
5421 fput(parent->filp); 5504 fput(parent->filp);
5422 5505
5506 perf_group_detach(event);
5423 list_del_event(event, ctx); 5507 list_del_event(event, ctx);
5424 free_event(event); 5508 free_event(event);
5425} 5509}
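The perf_event.c changes above drop the old event->output redirection in favour of sharing the mmap buffer itself, guarded by a reference count (perf_mmap_data_get()/perf_mmap_data_put()) and by checks made under mmap_mutex. A rough userspace sketch of such a get/put-style shared buffer is below; shared_buf, buf_get and buf_put are invented names, and a plain mutex stands in for the kernel's atomics.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_buf {
    int   refcount;
    char *pages;
};

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

static struct shared_buf *buf_get(struct shared_buf *b)
{
    pthread_mutex_lock(&buf_lock);
    if (b)
        b->refcount++;
    pthread_mutex_unlock(&buf_lock);
    return b;
}

static void buf_put(struct shared_buf *b)
{
    int last = 0;

    pthread_mutex_lock(&buf_lock);
    if (b && --b->refcount == 0)
        last = 1;
    pthread_mutex_unlock(&buf_lock);

    if (last) {                   /* last user frees the pages */
        free(b->pages);
        free(b);
    }
}

int main(void)
{
    struct shared_buf *b = calloc(1, sizeof(*b));

    if (!b)
        return 1;
    b->pages = malloc(4096);
    b->refcount = 1;                          /* creator's reference */

    struct shared_buf *shared = buf_get(b);   /* a second user shares the buffer */
    printf("refcount after get: %d\n", b->refcount);

    buf_put(shared);                          /* redirection torn down */
    buf_put(b);                               /* creator drops its ref: pages freed */
    return 0;
}

The property the sketch tries to show is that the final buf_put() is the only place the pages are freed, mirroring how the patch frees the mmap data only when the last reference goes away.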
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
99 depends on PM_ADVANCED_DEBUG 99 depends on PM_ADVANCED_DEBUG
100 default n 100 default n
101 101
102config SUSPEND_NVS
103 bool
104
102config SUSPEND 105config SUSPEND
103 bool "Suspend to RAM and standby" 106 bool "Suspend to RAM and standby"
104 depends on PM && ARCH_SUSPEND_POSSIBLE 107 depends on PM && ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
105 default y 109 default y
106 ---help--- 110 ---help---
107 Allow the system to enter sleep states in which main memory is 111 Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
130 134
131 Turning OFF this setting is NOT recommended! If in doubt, say Y. 135 Turning OFF this setting is NOT recommended! If in doubt, say Y.
132 136
133config HIBERNATION_NVS
134 bool
135
136config HIBERNATION 137config HIBERNATION
137 bool "Hibernation (aka 'suspend to disk')" 138 bool "Hibernation (aka 'suspend to disk')"
138 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
139 select HIBERNATION_NVS if HAS_IOMEM 140 select SUSPEND_NVS if HAS_IOMEM
140 ---help--- 141 ---help---
141 Enable the suspend to disk (STD) functionality, which is usually 142 Enable the suspend to disk (STD) functionality, which is usually
142 called "hibernation" in user interfaces. STD checkpoints the 143 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 524e058dcf06..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 12 block_io.o
13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 14
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index fdcad9ed5a7b..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -15,7 +15,7 @@
15 15
16/* 16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during 17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * hibernation and to restore the contents of this memory during the subsequent 18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that. 19 * resume. The code below implements a mechanism allowing us to do that.
20 */ 20 */
21 21
@@ -30,7 +30,7 @@ struct nvs_page {
30static LIST_HEAD(nvs_list); 30static LIST_HEAD(nvs_list);
31 31
32/** 32/**
33 * hibernate_nvs_register - register platform NVS memory region to save 33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region 34 * @start - physical address of the region
35 * @size - size of the region 35 * @size - size of the region
36 * 36 *
@@ -38,7 +38,7 @@ static LIST_HEAD(nvs_list);
38 * things so that the data from page-aligned addresses in this region will 38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages. 39 * be copied into separate RAM pages.
40 */ 40 */
41int hibernate_nvs_register(unsigned long start, unsigned long size) 41int suspend_nvs_register(unsigned long start, unsigned long size)
42{ 42{
43 struct nvs_page *entry, *next; 43 struct nvs_page *entry, *next;
44 44
@@ -68,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
68} 68}
69 69
70/** 70/**
71 * hibernate_nvs_free - free data pages allocated for saving NVS regions 71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */ 72 */
73void hibernate_nvs_free(void) 73void suspend_nvs_free(void)
74{ 74{
75 struct nvs_page *entry; 75 struct nvs_page *entry;
76 76
@@ -86,16 +86,16 @@ void hibernate_nvs_free(void)
86} 86}
87 87
88/** 88/**
89 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions 89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */ 90 */
91int hibernate_nvs_alloc(void) 91int suspend_nvs_alloc(void)
92{ 92{
93 struct nvs_page *entry; 93 struct nvs_page *entry;
94 94
95 list_for_each_entry(entry, &nvs_list, node) { 95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL); 96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) { 97 if (!entry->data) {
98 hibernate_nvs_free(); 98 suspend_nvs_free();
99 return -ENOMEM; 99 return -ENOMEM;
100 } 100 }
101 } 101 }
@@ -103,9 +103,9 @@ int hibernate_nvs_alloc(void)
103} 103}
104 104
105/** 105/**
106 * hibernate_nvs_save - save NVS memory regions 106 * suspend_nvs_save - save NVS memory regions
107 */ 107 */
108void hibernate_nvs_save(void) 108void suspend_nvs_save(void)
109{ 109{
110 struct nvs_page *entry; 110 struct nvs_page *entry;
111 111
@@ -119,12 +119,12 @@ void hibernate_nvs_save(void)
119} 119}
120 120
121/** 121/**
122 * hibernate_nvs_restore - restore NVS memory regions 122 * suspend_nvs_restore - restore NVS memory regions
123 * 123 *
124 * This function is going to be called with interrupts disabled, so it 124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region. 125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */ 126 */
127void hibernate_nvs_restore(void) 127void suspend_nvs_restore(void)
128{ 128{
129 struct nvs_page *entry; 129 struct nvs_page *entry;
130 130
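The rename above turns the hibernate-only NVS helpers into generic suspend_nvs_*() ones; the mechanism itself is unchanged: page-sized chunks of a platform region are copied into separately allocated pages before sleep and copied back on resume. A small userspace sketch of that save/restore idea, with invented names (nvs_chunk, nvs_save, nvs_restore):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

struct nvs_chunk {
    unsigned char *region;            /* live memory to preserve */
    unsigned char  backup[PAGE_SIZE]; /* copy taken at "suspend" time */
};

static void nvs_save(struct nvs_chunk *c, size_t n)
{
    for (size_t i = 0; i < n; i++)
        memcpy(c[i].backup, c[i].region, PAGE_SIZE);
}

static void nvs_restore(struct nvs_chunk *c, size_t n)
{
    for (size_t i = 0; i < n; i++)
        memcpy(c[i].region, c[i].backup, PAGE_SIZE);
}

int main(void)
{
    unsigned char *region = malloc(PAGE_SIZE);
    struct nvs_chunk chunk = { .region = region };

    if (!region)
        return 1;

    memset(region, 0xAA, PAGE_SIZE);
    nvs_save(&chunk, 1);

    memset(region, 0x00, PAGE_SIZE);  /* "firmware" clobbers the region */
    nvs_restore(&chunk, 1);

    printf("first byte after restore: 0x%02X\n", region[0]); /* prints 0xAA */
    free(region);
    return 0;
}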
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 56e7dbb8b996..f37cb7dd4402 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -16,6 +16,12 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h> 18#include <linux/gfp.h>
19#include <linux/io.h>
20#include <linux/kernel.h>
21#include <linux/list.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/suspend.h>
19 25
20#include "power.h" 26#include "power.h"
21 27
diff --git a/kernel/sched.c b/kernel/sched.c
index d48408142503..f52a8801b7a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
306 */ 306 */
307struct task_group init_task_group; 307struct task_group init_task_group;
308 308
309/* return group to which a task belongs */
310static inline struct task_group *task_group(struct task_struct *p)
311{
312 struct task_group *tg;
313
314#ifdef CONFIG_CGROUP_SCHED
315 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
316 struct task_group, css);
317#else
318 tg = &init_task_group;
319#endif
320 return tg;
321}
322
323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
335#ifdef CONFIG_FAIR_GROUP_SCHED
336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
337 p->se.parent = task_group(p)->se[cpu];
338#endif
339
340#ifdef CONFIG_RT_GROUP_SCHED
341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
342 p->rt.parent = task_group(p)->rt_se[cpu];
343#endif
344 rcu_read_unlock();
345}
346
347#else
348
349static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
350static inline struct task_group *task_group(struct task_struct *p)
351{
352 return NULL;
353}
354
355#endif /* CONFIG_CGROUP_SCHED */ 309#endif /* CONFIG_CGROUP_SCHED */
356 310
357/* CFS-related fields in a runqueue */ 311/* CFS-related fields in a runqueue */
@@ -544,6 +498,8 @@ struct rq {
544 struct root_domain *rd; 498 struct root_domain *rd;
545 struct sched_domain *sd; 499 struct sched_domain *sd;
546 500
501 unsigned long cpu_power;
502
547 unsigned char idle_at_tick; 503 unsigned char idle_at_tick;
548 /* For active balancing */ 504 /* For active balancing */
549 int post_schedule; 505 int post_schedule;
@@ -642,6 +598,49 @@ static inline int cpu_of(struct rq *rq)
642#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 598#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
643#define raw_rq() (&__raw_get_cpu_var(runqueues)) 599#define raw_rq() (&__raw_get_cpu_var(runqueues))
644 600
601#ifdef CONFIG_CGROUP_SCHED
602
603/*
604 * Return the group to which this task belongs.
605 *
606 * We use task_subsys_state_check() and extend the RCU verification
607 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
608 * holds that lock for each task it moves into the cgroup. Therefore
609 * by holding that lock, we pin the task to the current cgroup.
610 */
611static inline struct task_group *task_group(struct task_struct *p)
612{
613 struct cgroup_subsys_state *css;
614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&task_rq(p)->lock));
617 return container_of(css, struct task_group, css);
618}
619
620/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
621static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
622{
623#ifdef CONFIG_FAIR_GROUP_SCHED
624 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
625 p->se.parent = task_group(p)->se[cpu];
626#endif
627
628#ifdef CONFIG_RT_GROUP_SCHED
629 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
630 p->rt.parent = task_group(p)->rt_se[cpu];
631#endif
632}
633
634#else /* CONFIG_CGROUP_SCHED */
635
636static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
637static inline struct task_group *task_group(struct task_struct *p)
638{
639 return NULL;
640}
641
642#endif /* CONFIG_CGROUP_SCHED */
643
645inline void update_rq_clock(struct rq *rq) 644inline void update_rq_clock(struct rq *rq)
646{ 645{
647 if (!rq->skip_clock_update) 646 if (!rq->skip_clock_update)
@@ -1255,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
1255 s64 period = sched_avg_period(); 1254 s64 period = sched_avg_period();
1256 1255
1257 while ((s64)(rq->clock - rq->age_stamp) > period) { 1256 while ((s64)(rq->clock - rq->age_stamp) > period) {
1257 /*
1258 * Inline assembly required to prevent the compiler
1259 * optimising this loop into a divmod call.
1260 * See __iter_div_u64_rem() for another example of this.
1261 */
1262 asm("" : "+rm" (rq->age_stamp));
1258 rq->age_stamp += period; 1263 rq->age_stamp += period;
1259 rq->rt_avg /= 2; 1264 rq->rt_avg /= 2;
1260 } 1265 }
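The comment added in the hunk above explains that the empty asm statement keeps the compiler from collapsing the subtract-and-halve loop into a single divmod. A standalone sketch of the same trick, assuming GCC-style inline asm; decay() and its arguments are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* The empty asm with a "+rm" constraint tells the compiler the value may
 * change, so the loop cannot be rewritten as one division/modulo. */
static void decay(uint64_t *stamp, uint64_t *avg, uint64_t now, uint64_t period)
{
    while (now - *stamp > period) {
        asm("" : "+rm" (*stamp));  /* optimisation barrier, emits no code */
        *stamp += period;
        *avg /= 2;
    }
}

int main(void)
{
    uint64_t stamp = 0, avg = 1024;

    decay(&stamp, &avg, 10 * 1000, 1000);
    printf("stamp=%llu avg=%llu\n",
           (unsigned long long)stamp, (unsigned long long)avg);
    return 0;
}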
@@ -1499,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
1499 return max(rq->cpu_load[type-1], total); 1504 return max(rq->cpu_load[type-1], total);
1500} 1505}
1501 1506
1502static struct sched_group *group_of(int cpu)
1503{
1504 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1505
1506 if (!sd)
1507 return NULL;
1508
1509 return sd->groups;
1510}
1511
1512static unsigned long power_of(int cpu) 1507static unsigned long power_of(int cpu)
1513{ 1508{
1514 struct sched_group *group = group_of(cpu); 1509 return cpu_rq(cpu)->cpu_power;
1515
1516 if (!group)
1517 return SCHED_LOAD_SCALE;
1518
1519 return group->cpu_power;
1520} 1510}
1521 1511
1522static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1512static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1673,9 +1663,6 @@ static void update_shares(struct sched_domain *sd)
1673 1663
1674static void update_h_load(long cpu) 1664static void update_h_load(long cpu)
1675{ 1665{
1676 if (root_task_group_empty())
1677 return;
1678
1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1666 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1680} 1667}
1681 1668
@@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
1854static void set_load_weight(struct task_struct *p) 1841static void set_load_weight(struct task_struct *p)
1855{ 1842{
1856 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1857 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = 0;
1858 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = WMULT_CONST;
1859 return; 1846 return;
1860 } 1847 }
1861 1848
@@ -2507,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2507 if (p->sched_class->task_fork) 2494 if (p->sched_class->task_fork)
2508 p->sched_class->task_fork(p); 2495 p->sched_class->task_fork(p);
2509 2496
2497 /*
2498 * The child is not yet in the pid-hash so no cgroup attach races,
2499 * and the cgroup is pinned to this child because cgroup_fork()
2500 * is run before sched_fork().
2501 *
2502 * Silence PROVE_RCU.
2503 */
2504 rcu_read_lock();
2510 set_task_cpu(p, cpu); 2505 set_task_cpu(p, cpu);
2506 rcu_read_unlock();
2511 2507
2512#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2508#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2513 if (likely(sched_info_on())) 2509 if (likely(sched_info_on()))
@@ -2877,9 +2873,9 @@ unsigned long nr_iowait(void)
2877 return sum; 2873 return sum;
2878} 2874}
2879 2875
2880unsigned long nr_iowait_cpu(void) 2876unsigned long nr_iowait_cpu(int cpu)
2881{ 2877{
2882 struct rq *this = this_rq(); 2878 struct rq *this = cpu_rq(cpu);
2883 return atomic_read(&this->nr_iowait); 2879 return atomic_read(&this->nr_iowait);
2884} 2880}
2885 2881
@@ -4478,16 +4474,6 @@ recheck:
4478 } 4474 }
4479 4475
4480 if (user) { 4476 if (user) {
4481#ifdef CONFIG_RT_GROUP_SCHED
4482 /*
4483 * Do not allow realtime tasks into groups that have no runtime
4484 * assigned.
4485 */
4486 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4487 task_group(p)->rt_bandwidth.rt_runtime == 0)
4488 return -EPERM;
4489#endif
4490
4491 retval = security_task_setscheduler(p, policy, param); 4477 retval = security_task_setscheduler(p, policy, param);
4492 if (retval) 4478 if (retval)
4493 return retval; 4479 return retval;
@@ -4503,6 +4489,22 @@ recheck:
4503 * runqueue lock must be held. 4489 * runqueue lock must be held.
4504 */ 4490 */
4505 rq = __task_rq_lock(p); 4491 rq = __task_rq_lock(p);
4492
4493#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) {
4495 /*
4496 * Do not allow realtime tasks into groups that have no runtime
4497 * assigned.
4498 */
4499 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4500 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4501 __task_rq_unlock(rq);
4502 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4503 return -EPERM;
4504 }
4505 }
4506#endif
4507
4506 /* recheck policy now with rq lock held */ 4508 /* recheck policy now with rq lock held */
4507 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4509 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4508 policy = oldpolicy = -1; 4510 policy = oldpolicy = -1;
@@ -7605,6 +7607,7 @@ void __init sched_init(void)
7605#ifdef CONFIG_SMP 7607#ifdef CONFIG_SMP
7606 rq->sd = NULL; 7608 rq->sd = NULL;
7607 rq->rd = NULL; 7609 rq->rd = NULL;
7610 rq->cpu_power = SCHED_LOAD_SCALE;
7608 rq->post_schedule = 0; 7611 rq->post_schedule = 0;
7609 rq->active_balance = 0; 7612 rq->active_balance = 0;
7610 rq->next_balance = jiffies; 7613 rq->next_balance = jiffies;
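Among the sched.c changes above, the RT-group runtime check moves from before security_task_setscheduler() to after __task_rq_lock(), so the task's group can no longer change between the check and the policy update. A minimal pthread sketch of that check-under-the-same-lock pattern, with invented names (task_cfg, set_rt_policy):

#include <pthread.h>
#include <stdio.h>

struct task_cfg {
    pthread_mutex_t lock;
    int             group_runtime;  /* 0 means "group has no runtime" */
    int             policy;
};

static int set_rt_policy(struct task_cfg *t, int new_policy)
{
    pthread_mutex_lock(&t->lock);

    if (t->group_runtime == 0) {      /* checked under the lock */
        pthread_mutex_unlock(&t->lock);
        return -1;                    /* -EPERM in the kernel code */
    }

    t->policy = new_policy;           /* safe: the state cannot change here */
    pthread_mutex_unlock(&t->lock);
    return 0;
}

int main(void)
{
    struct task_cfg t = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

    printf("set_rt_policy: %d\n", set_rt_policy(&t, 1)); /* -1: no runtime */
    t.group_runtime = 1000;
    printf("set_rt_policy: %d\n", set_rt_policy(&t, 1)); /* 0: allowed */
    return 0;
}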
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 217e4a9393e4..a878b5332daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1225 unsigned long this_load, load; 1225 unsigned long this_load, load;
1226 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1228 unsigned int imbalance;
1229 struct task_group *tg; 1228 struct task_group *tg;
1230 unsigned long weight; 1229 unsigned long weight;
1231 int balanced; 1230 int balanced;
@@ -1241,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1241 * effect of the currently running task from the load 1240 * effect of the currently running task from the load
1242 * of the current CPU: 1241 * of the current CPU:
1243 */ 1242 */
1243 rcu_read_lock();
1244 if (sync) { 1244 if (sync) {
1245 tg = task_group(current); 1245 tg = task_group(current);
1246 weight = current->se.load.weight; 1246 weight = current->se.load.weight;
@@ -1252,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1252 tg = task_group(p); 1252 tg = task_group(p);
1253 weight = p->se.load.weight; 1253 weight = p->se.load.weight;
1254 1254
1255 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1256
1257 /* 1255 /*
1258 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1256 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1259 * due to the sync cause above having dropped this_load to 0, we'll 1257 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1263,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1263 * Otherwise check if either cpus are near enough in load to allow this 1261 * Otherwise check if either cpus are near enough in load to allow this
1264 * task to be woken on this_cpu. 1262 * task to be woken on this_cpu.
1265 */ 1263 */
1266 balanced = !this_load || 1264 if (this_load) {
1267 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1265 unsigned long this_eff_load, prev_eff_load;
1268 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1266
1267 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu);
1269 this_eff_load *= this_load +
1270 effective_load(tg, this_cpu, weight, weight);
1271
1272 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1273 prev_eff_load *= power_of(this_cpu);
1274 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1275
1276 balanced = this_eff_load <= prev_eff_load;
1277 } else
1278 balanced = true;
1279 rcu_read_unlock();
1269 1280
1270 /* 1281 /*
1271 * If the currently running task will sleep within 1282 * If the currently running task will sleep within
@@ -2298,6 +2309,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2298 if (!power) 2309 if (!power)
2299 power = 1; 2310 power = 1;
2300 2311
2312 cpu_rq(cpu)->cpu_power = power;
2301 sdg->cpu_power = power; 2313 sdg->cpu_power = power;
2302} 2314}
2303 2315
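The wake_affine() rewrite above replaces the single percentage-based imbalance test with a cross-multiplied comparison: each side's load is scaled by the other CPU's power, which both avoids a division and accounts for asymmetric CPU capacity. A rough standalone sketch of that comparison; balanced() and its sample values are invented.

#include <stdio.h>

static int balanced(unsigned long this_load, unsigned long prev_load,
                    unsigned long this_power, unsigned long prev_power,
                    unsigned int imbalance_pct)
{
    unsigned long this_eff, prev_eff;

    if (!this_load)
        return 1;                        /* idle target is always fine */

    /* scale each side by the *other* CPU's power and compare products */
    this_eff = 100UL * prev_power * this_load;
    prev_eff = (100UL + (imbalance_pct - 100) / 2) * this_power * prev_load;

    return this_eff <= prev_eff;
}

int main(void)
{
    /* the previous CPU is twice as powerful in this made-up example */
    printf("balanced: %d\n", balanced(800, 1000, 1024, 2048, 125));
    return 0;
}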
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 825e1126008f..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
850 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
852 852
853 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
855 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
856 return 0; 856 return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4e7431e7c78..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
321 321
322#ifdef CONFIG_HOTPLUG_CPU 322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED: 323 case CPU_UP_CANCELED:
324 case CPU_DEAD: 324 case CPU_POST_DEAD:
325 { 325 {
326 struct cpu_stop_work *work; 326 struct cpu_stop_work *work;
327 327
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 997080f00e0b..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = {
1471 }, 1471 },
1472#endif 1472#endif
1473 { 1473 {
1474 .procname = "pipe-max-pages", 1474 .procname = "pipe-max-size",
1475 .data = &pipe_max_pages, 1475 .data = &pipe_max_size,
1476 .maxlen = sizeof(int), 1476 .maxlen = sizeof(int),
1477 .mode = 0644, 1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_minmax, 1478 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &two, 1479 .extra1 = &pipe_min_size,
1480 }, 1480 },
1481/* 1481/*
1482 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c034..813993b5fb61 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now)
154 * Updates the per cpu time idle statistics counters 154 * Updates the per cpu time idle statistics counters
155 */ 155 */
156static void 156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) 157update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{ 158{
159 ktime_t delta; 159 ktime_t delta;
160 160
161 if (ts->idle_active) { 161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime); 162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0) 164 if (nr_iowait_cpu(cpu) > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now; 166 ts->idle_entrytime = now;
167 } 167 }
@@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
175{ 175{
176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
177 177
178 update_ts_time_stats(ts, now, NULL); 178 update_ts_time_stats(cpu, ts, now, NULL);
179 ts->idle_active = 0; 179 ts->idle_active = 0;
180 180
181 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
182} 182}
183 183
184static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
185{ 185{
186 ktime_t now; 186 ktime_t now;
187 187
188 now = ktime_get(); 188 now = ktime_get();
189 189
190 update_ts_time_stats(ts, now, NULL); 190 update_ts_time_stats(cpu, ts, now, NULL);
191 191
192 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
193 ts->idle_active = 1; 193 ts->idle_active = 1;
@@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
216 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
217 return -1; 217 return -1;
218 218
219 update_ts_time_stats(ts, ktime_get(), last_update_time); 219 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
220 220
221 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
222} 222}
@@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
242 if (!tick_nohz_enabled) 242 if (!tick_nohz_enabled)
243 return -1; 243 return -1;
244 244
245 update_ts_time_stats(ts, ktime_get(), last_update_time); 245 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
246 246
247 return ktime_to_us(ts->iowait_sleeptime); 247 return ktime_to_us(ts->iowait_sleeptime);
248} 248}
@@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
284 */ 284 */
285 ts->inidle = 1; 285 ts->inidle = 1;
286 286
287 now = tick_nohz_start_idle(ts); 287 now = tick_nohz_start_idle(cpu, ts);
288 288
289 /* 289 /*
290 * If this cpu is offline and it is the one which updates 290 * If this cpu is offline and it is the one which updates
@@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle)
315 goto end; 315 goto end;
316 } 316 }
317 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
321 ts->idle_calls++; 318 ts->idle_calls++;
322 /* Read jiffies and the time when jiffies were updated last */ 319 /* Read jiffies and the time when jiffies were updated last */
323 do { 320 do {
@@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
328 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
329 326
330 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
331 arch_needs_cpu(cpu)) { 328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
332 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
333 delta_jiffies = 1; 330 delta_jiffies = 1;
334 } else { 331 } else {
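The tick-sched changes above thread the cpu number through update_ts_time_stats() so that nr_iowait_cpu(cpu) reads the runqueue actually being accounted, rather than whichever CPU the caller happens to be running on. A toy sketch of statistics keyed by an explicit index (all names and numbers invented):

#include <stdio.h>

#define NR_CPUS 4

static int iowait_count[NR_CPUS];

static int nr_iowait_cpu(int cpu)
{
    return iowait_count[cpu];
}

/* The helper takes cpu explicitly, so callers can account any cpu's stats. */
static void update_idle_stats(int cpu, long long *idle_us, long long *iowait_us,
                              long long delta_us)
{
    *idle_us += delta_us;
    if (nr_iowait_cpu(cpu) > 0)     /* charge the sleep to iowait as well */
        *iowait_us += delta_us;
}

int main(void)
{
    long long idle[NR_CPUS] = {0}, iowait[NR_CPUS] = {0};

    iowait_count[2] = 1;                       /* cpu 2 has a task in iowait */
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        update_idle_stats(cpu, &idle[cpu], &iowait[cpu], 500);

    printf("cpu2 idle=%lld iowait=%lld\n", idle[2], iowait[2]); /* 500 500 */
    printf("cpu0 idle=%lld iowait=%lld\n", idle[0], iowait[0]); /* 500 0 */
    return 0;
}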
diff --git a/kernel/timer.c b/kernel/timer.c
index 2454172a80d3..ee305c8d4e18 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1717,7 +1717,7 @@ void __init init_timers(void)
1717 1717
1718 init_timer_stats(); 1718 init_timer_stats();
1719 1719
1720 BUG_ON(err == NOTIFY_BAD); 1720 BUG_ON(err != NOTIFY_OK);
1721 register_cpu_notifier(&timers_nb); 1721 register_cpu_notifier(&timers_nb);
1722 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1722 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1723} 1723}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 36ea2b65dcdc..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -842,6 +842,7 @@ static void blk_add_trace_split(void *ignore,
842 842
843/** 843/**
844 * blk_add_trace_remap - Add a trace for a remap operation 844 * blk_add_trace_remap - Add a trace for a remap operation
845 * @ignore: trace callback data parameter (not used)
845 * @q: queue the io is for 846 * @q: queue the io is for
846 * @bio: the source bio 847 * @bio: the source bio
847 * @dev: target device 848 * @dev: target device
@@ -873,6 +874,7 @@ static void blk_add_trace_remap(void *ignore,
873 874
874/** 875/**
875 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 876 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
877 * @ignore: trace callback data parameter (not used)
876 * @q: queue the io is for 878 * @q: queue the io is for
877 * @rq: the source request 879 * @rq: the source request
878 * @dev: target device 880 * @dev: target device
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cb6f365016e4..8a2b73f7c068 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -96,7 +96,9 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 96 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 97 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 98 if (tp_event->event.type == event_id &&
99 tp_event->class && tp_event->class->perf_probe && 99 tp_event->class &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
100 try_module_get(tp_event->mod)) { 102 try_module_get(tp_event->mod)) {
101 ret = perf_trace_event_init(tp_event, p_event); 103 ret = perf_trace_event_init(tp_event, p_event);
102 break; 104 break;
@@ -116,7 +118,7 @@ int perf_trace_enable(struct perf_event *p_event)
116 if (WARN_ON_ONCE(!list)) 118 if (WARN_ON_ONCE(!list))
117 return -EINVAL; 119 return -EINVAL;
118 120
119 list = per_cpu_ptr(list, smp_processor_id()); 121 list = this_cpu_ptr(list);
120 hlist_add_head_rcu(&p_event->hlist_entry, list); 122 hlist_add_head_rcu(&p_event->hlist_entry, list);
121 123
122 return 0; 124 return 0;
@@ -132,8 +134,9 @@ void perf_trace_destroy(struct perf_event *p_event)
132 struct ftrace_event_call *tp_event = p_event->tp_event; 134 struct ftrace_event_call *tp_event = p_event->tp_event;
133 int i; 135 int i;
134 136
137 mutex_lock(&event_mutex);
135 if (--tp_event->perf_refcount > 0) 138 if (--tp_event->perf_refcount > 0)
136 return; 139 goto out;
137 140
138 if (tp_event->class->reg) 141 if (tp_event->class->reg)
139 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); 142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
@@ -142,6 +145,12 @@ void perf_trace_destroy(struct perf_event *p_event)
142 tp_event->class->perf_probe, 145 tp_event->class->perf_probe,
143 tp_event); 146 tp_event);
144 147
148 /*
149 * Ensure our callback won't be called anymore. See
150 * tracepoint_probe_unregister() and __DO_TRACE().
151 */
152 synchronize_sched();
153
145 free_percpu(tp_event->perf_events); 154 free_percpu(tp_event->perf_events);
146 tp_event->perf_events = NULL; 155 tp_event->perf_events = NULL;
147 156
@@ -151,6 +160,8 @@ void perf_trace_destroy(struct perf_event *p_event)
151 perf_trace_buf[i] = NULL; 160 perf_trace_buf[i] = NULL;
152 } 161 }
153 } 162 }
163out:
164 mutex_unlock(&event_mutex);
154} 165}
155 166
156__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 167__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -169,7 +180,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
169 if (*rctxp < 0) 180 if (*rctxp < 0)
170 return NULL; 181 return NULL;
171 182
172 raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id()); 183 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
173 184
174 /* zero the dead bytes from align to not leak stack to user */ 185 /* zero the dead bytes from align to not leak stack to user */
175 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); 186 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
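perf_trace_destroy() above now takes event_mutex around both the refcount decrement and the teardown, with the early-return path funnelled through the same unlock, and waits in synchronize_sched() before freeing the buffers. A minimal userspace sketch of the mutex-plus-goto-out teardown shape; trace_buffers and trace_destroy are invented, and the RCU wait has no userspace equivalent here.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct trace_buffers {
    pthread_mutex_t lock;
    int             refcount;
    void           *buf;
};

static void trace_destroy(struct trace_buffers *t)
{
    pthread_mutex_lock(&t->lock);
    if (--t->refcount > 0)
        goto out;                 /* other users remain: just drop our ref */

    free(t->buf);                 /* last user tears the buffer down */
    t->buf = NULL;
out:
    pthread_mutex_unlock(&t->lock);
}

int main(void)
{
    struct trace_buffers t = { PTHREAD_MUTEX_INITIALIZER, 2, malloc(64) };

    trace_destroy(&t);            /* refcount 2 -> 1, buffer kept */
    printf("buf %s\n", t.buf ? "kept" : "freed");
    trace_destroy(&t);            /* refcount 1 -> 0, buffer freed */
    printf("buf %s\n", t.buf ? "kept" : "freed");
    return 0;
}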
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index faf7cefd15da..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1359,7 +1359,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1359 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1361
1362 head = per_cpu_ptr(call->perf_events, smp_processor_id()); 1362 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1364} 1364}
1365 1365
@@ -1392,7 +1392,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1392 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1394
1395 head = per_cpu_ptr(call->perf_events, smp_processor_id()); 1395 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1397} 1397}
1398 1398
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d2c859cec9ea..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -519,7 +519,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
519 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
520 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
521 521
522 head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id()); 522 head = this_cpu_ptr(sys_data->enter_event->perf_events);
523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
524} 524}
525 525
@@ -595,7 +595,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
595 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
596 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
597 597
598 head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id()); 598 head = this_cpu_ptr(sys_data->exit_event->perf_events);
599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
600} 600}
601 601