diff options
author | Ingo Molnar <mingo@elte.hu> | 2010-03-09 11:11:53 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-03-09 11:11:53 -0500 |
commit | 548b84166917d6f5e2296123b85ad24aecd3801d (patch) | |
tree | 0ab0300e23a02df0fe3c0579627e4998bb122c00 /kernel | |
parent | cfb581bcd4f8c158c6f2b48bf5e232bb9e6855c0 (diff) | |
parent | 57d54889cd00db2752994b389ba714138652e60c (diff) |
Merge commit 'v2.6.34-rc1' into perf/urgent
Conflicts:
tools/perf/util/probe-event.c
Merge reason: Pick up -rc1 and resolve the conflict as well.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
79 files changed, 5379 insertions, 3457 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..a987aa1676b5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
13 | async.o | 13 | async.o range.o |
14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o | ||
14 | obj-y += groups.o | 15 | obj-y += groups.o |
15 | 16 | ||
16 | ifdef CONFIG_FUNCTION_TRACER | 17 | ifdef CONFIG_FUNCTION_TRACER |
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | |||
90 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 91 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
91 | obj-$(CONFIG_TRACEPOINTS) += tracepoint.o | 92 | obj-$(CONFIG_TRACEPOINTS) += tracepoint.o |
92 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | 93 | obj-$(CONFIG_LATENCYTOP) += latencytop.o |
94 | obj-$(CONFIG_BINFMT_ELF) += elfcore.o | ||
95 | obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o | ||
96 | obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o | ||
93 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ | 97 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ |
94 | obj-$(CONFIG_TRACING) += trace/ | 98 | obj-$(CONFIG_TRACING) += trace/ |
95 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o | |||
100 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 104 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
101 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 105 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
102 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 106 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
107 | obj-$(CONFIG_PADATA) += padata.o | ||
103 | 108 | ||
104 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 109 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
105 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 110 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479db..028e85663f27 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) | |||
548 | return 0; | 548 | return 0; |
549 | } | 549 | } |
550 | 550 | ||
551 | static int compare_root(struct vfsmount *mnt, void *arg) | ||
552 | { | ||
553 | return mnt->mnt_root->d_inode == arg; | ||
554 | } | ||
555 | |||
551 | void audit_trim_trees(void) | 556 | void audit_trim_trees(void) |
552 | { | 557 | { |
553 | struct list_head cursor; | 558 | struct list_head cursor; |
@@ -559,7 +564,6 @@ void audit_trim_trees(void) | |||
559 | struct path path; | 564 | struct path path; |
560 | struct vfsmount *root_mnt; | 565 | struct vfsmount *root_mnt; |
561 | struct node *node; | 566 | struct node *node; |
562 | struct list_head list; | ||
563 | int err; | 567 | int err; |
564 | 568 | ||
565 | tree = container_of(cursor.next, struct audit_tree, list); | 569 | tree = container_of(cursor.next, struct audit_tree, list); |
@@ -577,24 +581,16 @@ void audit_trim_trees(void) | |||
577 | if (!root_mnt) | 581 | if (!root_mnt) |
578 | goto skip_it; | 582 | goto skip_it; |
579 | 583 | ||
580 | list_add_tail(&list, &root_mnt->mnt_list); | ||
581 | spin_lock(&hash_lock); | 584 | spin_lock(&hash_lock); |
582 | list_for_each_entry(node, &tree->chunks, list) { | 585 | list_for_each_entry(node, &tree->chunks, list) { |
583 | struct audit_chunk *chunk = find_chunk(node); | 586 | struct inode *inode = find_chunk(node)->watch.inode; |
584 | struct inode *inode = chunk->watch.inode; | ||
585 | struct vfsmount *mnt; | ||
586 | node->index |= 1U<<31; | 587 | node->index |= 1U<<31; |
587 | list_for_each_entry(mnt, &list, mnt_list) { | 588 | if (iterate_mounts(compare_root, inode, root_mnt)) |
588 | if (mnt->mnt_root->d_inode == inode) { | 589 | node->index &= ~(1U<<31); |
589 | node->index &= ~(1U<<31); | ||
590 | break; | ||
591 | } | ||
592 | } | ||
593 | } | 590 | } |
594 | spin_unlock(&hash_lock); | 591 | spin_unlock(&hash_lock); |
595 | trim_marked(tree); | 592 | trim_marked(tree); |
596 | put_tree(tree); | 593 | put_tree(tree); |
597 | list_del_init(&list); | ||
598 | drop_collected_mounts(root_mnt); | 594 | drop_collected_mounts(root_mnt); |
599 | skip_it: | 595 | skip_it: |
600 | mutex_lock(&audit_filter_mutex); | 596 | mutex_lock(&audit_filter_mutex); |
@@ -603,22 +599,6 @@ skip_it: | |||
603 | mutex_unlock(&audit_filter_mutex); | 599 | mutex_unlock(&audit_filter_mutex); |
604 | } | 600 | } |
605 | 601 | ||
606 | static int is_under(struct vfsmount *mnt, struct dentry *dentry, | ||
607 | struct path *path) | ||
608 | { | ||
609 | if (mnt != path->mnt) { | ||
610 | for (;;) { | ||
611 | if (mnt->mnt_parent == mnt) | ||
612 | return 0; | ||
613 | if (mnt->mnt_parent == path->mnt) | ||
614 | break; | ||
615 | mnt = mnt->mnt_parent; | ||
616 | } | ||
617 | dentry = mnt->mnt_mountpoint; | ||
618 | } | ||
619 | return is_subdir(dentry, path->dentry); | ||
620 | } | ||
621 | |||
622 | int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) | 602 | int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) |
623 | { | 603 | { |
624 | 604 | ||
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree) | |||
638 | put_tree(tree); | 618 | put_tree(tree); |
639 | } | 619 | } |
640 | 620 | ||
621 | static int tag_mount(struct vfsmount *mnt, void *arg) | ||
622 | { | ||
623 | return tag_chunk(mnt->mnt_root->d_inode, arg); | ||
624 | } | ||
625 | |||
641 | /* called with audit_filter_mutex */ | 626 | /* called with audit_filter_mutex */ |
642 | int audit_add_tree_rule(struct audit_krule *rule) | 627 | int audit_add_tree_rule(struct audit_krule *rule) |
643 | { | 628 | { |
644 | struct audit_tree *seed = rule->tree, *tree; | 629 | struct audit_tree *seed = rule->tree, *tree; |
645 | struct path path; | 630 | struct path path; |
646 | struct vfsmount *mnt, *p; | 631 | struct vfsmount *mnt; |
647 | struct list_head list; | ||
648 | int err; | 632 | int err; |
649 | 633 | ||
650 | list_for_each_entry(tree, &tree_list, list) { | 634 | list_for_each_entry(tree, &tree_list, list) { |
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule) | |||
670 | err = -ENOMEM; | 654 | err = -ENOMEM; |
671 | goto Err; | 655 | goto Err; |
672 | } | 656 | } |
673 | list_add_tail(&list, &mnt->mnt_list); | ||
674 | 657 | ||
675 | get_tree(tree); | 658 | get_tree(tree); |
676 | list_for_each_entry(p, &list, mnt_list) { | 659 | err = iterate_mounts(tag_mount, tree, mnt); |
677 | err = tag_chunk(p->mnt_root->d_inode, tree); | ||
678 | if (err) | ||
679 | break; | ||
680 | } | ||
681 | |||
682 | list_del(&list); | ||
683 | drop_collected_mounts(mnt); | 660 | drop_collected_mounts(mnt); |
684 | 661 | ||
685 | if (!err) { | 662 | if (!err) { |
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new) | |||
714 | { | 691 | { |
715 | struct list_head cursor, barrier; | 692 | struct list_head cursor, barrier; |
716 | int failed = 0; | 693 | int failed = 0; |
717 | struct path path; | 694 | struct path path1, path2; |
718 | struct vfsmount *tagged; | 695 | struct vfsmount *tagged; |
719 | struct list_head list; | ||
720 | struct vfsmount *mnt; | ||
721 | struct dentry *dentry; | ||
722 | int err; | 696 | int err; |
723 | 697 | ||
724 | err = kern_path(new, 0, &path); | 698 | err = kern_path(new, 0, &path2); |
725 | if (err) | 699 | if (err) |
726 | return err; | 700 | return err; |
727 | tagged = collect_mounts(&path); | 701 | tagged = collect_mounts(&path2); |
728 | path_put(&path); | 702 | path_put(&path2); |
729 | if (!tagged) | 703 | if (!tagged) |
730 | return -ENOMEM; | 704 | return -ENOMEM; |
731 | 705 | ||
732 | err = kern_path(old, 0, &path); | 706 | err = kern_path(old, 0, &path1); |
733 | if (err) { | 707 | if (err) { |
734 | drop_collected_mounts(tagged); | 708 | drop_collected_mounts(tagged); |
735 | return err; | 709 | return err; |
736 | } | 710 | } |
737 | mnt = mntget(path.mnt); | ||
738 | dentry = dget(path.dentry); | ||
739 | path_put(&path); | ||
740 | |||
741 | list_add_tail(&list, &tagged->mnt_list); | ||
742 | 711 | ||
743 | mutex_lock(&audit_filter_mutex); | 712 | mutex_lock(&audit_filter_mutex); |
744 | list_add(&barrier, &tree_list); | 713 | list_add(&barrier, &tree_list); |
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new) | |||
746 | 715 | ||
747 | while (cursor.next != &tree_list) { | 716 | while (cursor.next != &tree_list) { |
748 | struct audit_tree *tree; | 717 | struct audit_tree *tree; |
749 | struct vfsmount *p; | 718 | int good_one = 0; |
750 | 719 | ||
751 | tree = container_of(cursor.next, struct audit_tree, list); | 720 | tree = container_of(cursor.next, struct audit_tree, list); |
752 | get_tree(tree); | 721 | get_tree(tree); |
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new) | |||
754 | list_add(&cursor, &tree->list); | 723 | list_add(&cursor, &tree->list); |
755 | mutex_unlock(&audit_filter_mutex); | 724 | mutex_unlock(&audit_filter_mutex); |
756 | 725 | ||
757 | err = kern_path(tree->pathname, 0, &path); | 726 | err = kern_path(tree->pathname, 0, &path2); |
758 | if (err) { | 727 | if (!err) { |
759 | put_tree(tree); | 728 | good_one = path_is_under(&path1, &path2); |
760 | mutex_lock(&audit_filter_mutex); | 729 | path_put(&path2); |
761 | continue; | ||
762 | } | 730 | } |
763 | 731 | ||
764 | spin_lock(&vfsmount_lock); | 732 | if (!good_one) { |
765 | if (!is_under(mnt, dentry, &path)) { | ||
766 | spin_unlock(&vfsmount_lock); | ||
767 | path_put(&path); | ||
768 | put_tree(tree); | 733 | put_tree(tree); |
769 | mutex_lock(&audit_filter_mutex); | 734 | mutex_lock(&audit_filter_mutex); |
770 | continue; | 735 | continue; |
771 | } | 736 | } |
772 | spin_unlock(&vfsmount_lock); | ||
773 | path_put(&path); | ||
774 | |||
775 | list_for_each_entry(p, &list, mnt_list) { | ||
776 | failed = tag_chunk(p->mnt_root->d_inode, tree); | ||
777 | if (failed) | ||
778 | break; | ||
779 | } | ||
780 | 737 | ||
738 | failed = iterate_mounts(tag_mount, tree, tagged); | ||
781 | if (failed) { | 739 | if (failed) { |
782 | put_tree(tree); | 740 | put_tree(tree); |
783 | mutex_lock(&audit_filter_mutex); | 741 | mutex_lock(&audit_filter_mutex); |
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new) | |||
818 | } | 776 | } |
819 | list_del(&barrier); | 777 | list_del(&barrier); |
820 | list_del(&cursor); | 778 | list_del(&cursor); |
821 | list_del(&list); | ||
822 | mutex_unlock(&audit_filter_mutex); | 779 | mutex_unlock(&audit_filter_mutex); |
823 | dput(dentry); | 780 | path_put(&path1); |
824 | mntput(mnt); | ||
825 | drop_collected_mounts(tagged); | 781 | drop_collected_mounts(tagged); |
826 | return failed; | 782 | return failed; |
827 | } | 783 | } |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e7..f3a461c0970a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
1988 | 1988 | ||
1989 | /** | 1989 | /** |
1990 | * audit_inode_child - collect inode info for created/removed objects | 1990 | * audit_inode_child - collect inode info for created/removed objects |
1991 | * @dname: inode's dentry name | ||
1992 | * @dentry: dentry being audited | 1991 | * @dentry: dentry being audited |
1993 | * @parent: inode of dentry parent | 1992 | * @parent: inode of dentry parent |
1994 | * | 1993 | * |
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
2000 | * must be hooked prior, in order to capture the target inode during | 1999 | * must be hooked prior, in order to capture the target inode during |
2001 | * unsuccessful attempts. | 2000 | * unsuccessful attempts. |
2002 | */ | 2001 | */ |
2003 | void __audit_inode_child(const char *dname, const struct dentry *dentry, | 2002 | void __audit_inode_child(const struct dentry *dentry, |
2004 | const struct inode *parent) | 2003 | const struct inode *parent) |
2005 | { | 2004 | { |
2006 | int idx; | 2005 | int idx; |
2007 | struct audit_context *context = current->audit_context; | 2006 | struct audit_context *context = current->audit_context; |
2008 | const char *found_parent = NULL, *found_child = NULL; | 2007 | const char *found_parent = NULL, *found_child = NULL; |
2009 | const struct inode *inode = dentry->d_inode; | 2008 | const struct inode *inode = dentry->d_inode; |
2009 | const char *dname = dentry->d_name.name; | ||
2010 | int dirlen = 0; | 2010 | int dirlen = 0; |
2011 | 2011 | ||
2012 | if (!context->in_syscall) | 2012 | if (!context->in_syscall) |
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, | |||
2014 | 2014 | ||
2015 | if (inode) | 2015 | if (inode) |
2016 | handle_one(inode); | 2016 | handle_one(inode); |
2017 | /* determine matching parent */ | ||
2018 | if (!dname) | ||
2019 | goto add_names; | ||
2020 | 2017 | ||
2021 | /* parent is more likely, look for it first */ | 2018 | /* parent is more likely, look for it first */ |
2022 | for (idx = 0; idx < context->name_count; idx++) { | 2019 | for (idx = 0; idx < context->name_count; idx++) { |
diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e60521f..9e4697e9b276 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, | |||
135 | if (pid && (pid != task_pid_vnr(current))) { | 135 | if (pid && (pid != task_pid_vnr(current))) { |
136 | struct task_struct *target; | 136 | struct task_struct *target; |
137 | 137 | ||
138 | read_lock(&tasklist_lock); | 138 | rcu_read_lock(); |
139 | 139 | ||
140 | target = find_task_by_vpid(pid); | 140 | target = find_task_by_vpid(pid); |
141 | if (!target) | 141 | if (!target) |
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, | |||
143 | else | 143 | else |
144 | ret = security_capget(target, pEp, pIp, pPp); | 144 | ret = security_capget(target, pEp, pIp, pPp); |
145 | 145 | ||
146 | read_unlock(&tasklist_lock); | 146 | rcu_read_unlock(); |
147 | } else | 147 | } else |
148 | ret = security_capget(current, pEp, pIp, pPp); | 148 | ret = security_capget(current, pEp, pIp, pPp); |
149 | 149 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa3bee566446..4fd90e129772 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -23,6 +23,7 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/cgroup.h> | 25 | #include <linux/cgroup.h> |
26 | #include <linux/module.h> | ||
26 | #include <linux/ctype.h> | 27 | #include <linux/ctype.h> |
27 | #include <linux/errno.h> | 28 | #include <linux/errno.h> |
28 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
@@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
166 | */ | 167 | */ |
167 | static int need_forkexit_callback __read_mostly; | 168 | static int need_forkexit_callback __read_mostly; |
168 | 169 | ||
170 | #ifdef CONFIG_PROVE_LOCKING | ||
171 | int cgroup_lock_is_held(void) | ||
172 | { | ||
173 | return lockdep_is_held(&cgroup_mutex); | ||
174 | } | ||
175 | #else /* #ifdef CONFIG_PROVE_LOCKING */ | ||
176 | int cgroup_lock_is_held(void) | ||
177 | { | ||
178 | return mutex_is_locked(&cgroup_mutex); | ||
179 | } | ||
180 | #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ | ||
181 | |||
182 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | ||
183 | |||
169 | /* convenient tests for these bits */ | 184 | /* convenient tests for these bits */ |
170 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 185 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
171 | { | 186 | { |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 677f25376a38..f8cced2692b3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
338 | if (!cpu_possible(cpu)) { | 338 | if (!cpu_possible(cpu)) { |
339 | printk(KERN_ERR "can't online cpu %d because it is not " | 339 | printk(KERN_ERR "can't online cpu %d because it is not " |
340 | "configured as may-hotadd at boot time\n", cpu); | 340 | "configured as may-hotadd at boot time\n", cpu); |
341 | #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) | 341 | #if defined(CONFIG_IA64) |
342 | printk(KERN_ERR "please check additional_cpus= boot " | 342 | printk(KERN_ERR "please check additional_cpus= boot " |
343 | "parameter\n"); | 343 | "parameter\n"); |
344 | #endif | 344 | #endif |
diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 000000000000..3cb2c661bb78 --- /dev/null +++ b/kernel/early_res.c | |||
@@ -0,0 +1,578 @@ | |||
1 | /* | ||
2 | * early_res, could be used to replace bootmem | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/bootmem.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/early_res.h> | ||
10 | |||
11 | /* | ||
12 | * Early reserved memory areas. | ||
13 | */ | ||
14 | /* | ||
15 | * need to make sure this one is big enough before | ||
16 | * find_fw_memmap_area could be used | ||
17 | */ | ||
18 | #define MAX_EARLY_RES_X 32 | ||
19 | |||
20 | struct early_res { | ||
21 | u64 start, end; | ||
22 | char name[15]; | ||
23 | char overlap_ok; | ||
24 | }; | ||
25 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
26 | |||
27 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
28 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
29 | static int early_res_count __initdata; | ||
30 | |||
31 | static int __init find_overlapped_early(u64 start, u64 end) | ||
32 | { | ||
33 | int i; | ||
34 | struct early_res *r; | ||
35 | |||
36 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
37 | r = &early_res[i]; | ||
38 | if (end > r->start && start < r->end) | ||
39 | break; | ||
40 | } | ||
41 | |||
42 | return i; | ||
43 | } | ||
44 | |||
45 | /* | ||
46 | * Drop the i-th range from the early reservation map, | ||
47 | * by copying any higher ranges down one over it, and | ||
48 | * clearing what had been the last slot. | ||
49 | */ | ||
50 | static void __init drop_range(int i) | ||
51 | { | ||
52 | int j; | ||
53 | |||
54 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
55 | ; | ||
56 | |||
57 | memmove(&early_res[i], &early_res[i + 1], | ||
58 | (j - 1 - i) * sizeof(struct early_res)); | ||
59 | |||
60 | early_res[j - 1].end = 0; | ||
61 | early_res_count--; | ||
62 | } | ||
63 | |||
64 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
65 | { | ||
66 | u64 common_start, common_end; | ||
67 | u64 old_start, old_end; | ||
68 | |||
69 | old_start = early_res[i].start; | ||
70 | old_end = early_res[i].end; | ||
71 | common_start = max(old_start, start); | ||
72 | common_end = min(old_end, end); | ||
73 | |||
74 | /* no overlap ? */ | ||
75 | if (common_start >= common_end) | ||
76 | return; | ||
77 | |||
78 | if (old_start < common_start) { | ||
79 | /* make head segment */ | ||
80 | early_res[i].end = common_start; | ||
81 | if (old_end > common_end) { | ||
82 | char name[15]; | ||
83 | |||
84 | /* | ||
85 | * Save a local copy of the name, since the | ||
86 | * early_res array could get resized inside | ||
87 | * reserve_early_without_check() -> | ||
88 | * __check_and_double_early_res(), which would | ||
89 | * make the current name pointer invalid. | ||
90 | */ | ||
91 | strncpy(name, early_res[i].name, | ||
92 | sizeof(early_res[i].name) - 1); | ||
93 | /* add another for left over on tail */ | ||
94 | reserve_early_without_check(common_end, old_end, name); | ||
95 | } | ||
96 | return; | ||
97 | } else { | ||
98 | if (old_end > common_end) { | ||
99 | /* reuse the entry for tail left */ | ||
100 | early_res[i].start = common_end; | ||
101 | return; | ||
102 | } | ||
103 | /* all covered */ | ||
104 | drop_range(i); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Split any existing ranges that: | ||
110 | * 1) are marked 'overlap_ok', and | ||
111 | * 2) overlap with the stated range [start, end) | ||
112 | * into whatever portion (if any) of the existing range is entirely | ||
113 | * below or entirely above the stated range. Drop the portion | ||
114 | * of the existing range that overlaps with the stated range, | ||
115 | * which will allow the caller of this routine to then add that | ||
116 | * stated range without conflicting with any existing range. | ||
117 | */ | ||
118 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
119 | { | ||
120 | int i; | ||
121 | struct early_res *r; | ||
122 | u64 lower_start, lower_end; | ||
123 | u64 upper_start, upper_end; | ||
124 | char name[15]; | ||
125 | |||
126 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
127 | r = &early_res[i]; | ||
128 | |||
129 | /* Continue past non-overlapping ranges */ | ||
130 | if (end <= r->start || start >= r->end) | ||
131 | continue; | ||
132 | |||
133 | /* | ||
134 | * Leave non-ok overlaps as is; let caller | ||
135 | * panic "Overlapping early reservations" | ||
136 | * when it hits this overlap. | ||
137 | */ | ||
138 | if (!r->overlap_ok) | ||
139 | return; | ||
140 | |||
141 | /* | ||
142 | * We have an ok overlap. We will drop it from the early | ||
143 | * reservation map, and add back in any non-overlapping | ||
144 | * portions (lower or upper) as separate, overlap_ok, | ||
145 | * non-overlapping ranges. | ||
146 | */ | ||
147 | |||
148 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
149 | strncpy(name, r->name, sizeof(name) - 1); | ||
150 | |||
151 | lower_start = lower_end = 0; | ||
152 | upper_start = upper_end = 0; | ||
153 | if (r->start < start) { | ||
154 | lower_start = r->start; | ||
155 | lower_end = start; | ||
156 | } | ||
157 | if (r->end > end) { | ||
158 | upper_start = end; | ||
159 | upper_end = r->end; | ||
160 | } | ||
161 | |||
162 | /* 2. Drop the original ok overlapping range */ | ||
163 | drop_range(i); | ||
164 | |||
165 | i--; /* resume for-loop on copied down entry */ | ||
166 | |||
167 | /* 3. Add back in any non-overlapping ranges. */ | ||
168 | if (lower_end) | ||
169 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
170 | if (upper_end) | ||
171 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
172 | } | ||
173 | } | ||
174 | |||
175 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
176 | int overlap_ok) | ||
177 | { | ||
178 | int i; | ||
179 | struct early_res *r; | ||
180 | |||
181 | i = find_overlapped_early(start, end); | ||
182 | if (i >= max_early_res) | ||
183 | panic("Too many early reservations"); | ||
184 | r = &early_res[i]; | ||
185 | if (r->end) | ||
186 | panic("Overlapping early reservations " | ||
187 | "%llx-%llx %s to %llx-%llx %s\n", | ||
188 | start, end - 1, name ? name : "", r->start, | ||
189 | r->end - 1, r->name); | ||
190 | r->start = start; | ||
191 | r->end = end; | ||
192 | r->overlap_ok = overlap_ok; | ||
193 | if (name) | ||
194 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
195 | early_res_count++; | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * A few early reservations come here. | ||
200 | * | ||
201 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
202 | * is ok for these reservations to overlap an earlier reservation. | ||
203 | * Rather it means that it is ok for subsequent reservations to | ||
204 | * overlap this one. | ||
205 | * | ||
206 | * Use this entry point to reserve early ranges when you are doing | ||
207 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
208 | * just in case, and don't mind a subsequent overlapping reservation | ||
209 | * that is known to be needed. | ||
210 | * | ||
211 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
212 | * It would be needed if we had two colliding 'overlap_ok' | ||
213 | * reservations, so that the second such would not panic on the | ||
214 | * overlap with the first. We don't have any such as of this | ||
215 | * writing, but might as well tolerate such if it happens in | ||
216 | * the future. | ||
217 | */ | ||
218 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
219 | { | ||
220 | drop_overlaps_that_are_ok(start, end); | ||
221 | __reserve_early(start, end, name, 1); | ||
222 | } | ||
223 | |||
224 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
225 | { | ||
226 | u64 start, end, size, mem; | ||
227 | struct early_res *new; | ||
228 | |||
229 | /* do we have enough slots left ? */ | ||
230 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
231 | return; | ||
232 | |||
233 | /* double it */ | ||
234 | mem = -1ULL; | ||
235 | size = sizeof(struct early_res) * max_early_res * 2; | ||
236 | if (early_res == early_res_x) | ||
237 | start = 0; | ||
238 | else | ||
239 | start = early_res[0].end; | ||
240 | end = ex_start; | ||
241 | if (start + size < end) | ||
242 | mem = find_fw_memmap_area(start, end, size, | ||
243 | sizeof(struct early_res)); | ||
244 | if (mem == -1ULL) { | ||
245 | start = ex_end; | ||
246 | end = get_max_mapped(); | ||
247 | if (start + size < end) | ||
248 | mem = find_fw_memmap_area(start, end, size, | ||
249 | sizeof(struct early_res)); | ||
250 | } | ||
251 | if (mem == -1ULL) | ||
252 | panic("can not find more space for early_res array"); | ||
253 | |||
254 | new = __va(mem); | ||
255 | /* save the first one for own */ | ||
256 | new[0].start = mem; | ||
257 | new[0].end = mem + size; | ||
258 | new[0].overlap_ok = 0; | ||
259 | /* copy old to new */ | ||
260 | if (early_res == early_res_x) { | ||
261 | memcpy(&new[1], &early_res[0], | ||
262 | sizeof(struct early_res) * max_early_res); | ||
263 | memset(&new[max_early_res+1], 0, | ||
264 | sizeof(struct early_res) * (max_early_res - 1)); | ||
265 | early_res_count++; | ||
266 | } else { | ||
267 | memcpy(&new[1], &early_res[1], | ||
268 | sizeof(struct early_res) * (max_early_res - 1)); | ||
269 | memset(&new[max_early_res], 0, | ||
270 | sizeof(struct early_res) * max_early_res); | ||
271 | } | ||
272 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
273 | early_res = new; | ||
274 | max_early_res *= 2; | ||
275 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
276 | max_early_res, mem, mem + size - 1); | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * Most early reservations come here. | ||
281 | * | ||
282 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
283 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
284 | * range without risk of panic'ing on an overlapping overlap_ok | ||
285 | * early reservation. | ||
286 | */ | ||
287 | void __init reserve_early(u64 start, u64 end, char *name) | ||
288 | { | ||
289 | if (start >= end) | ||
290 | return; | ||
291 | |||
292 | __check_and_double_early_res(start, end); | ||
293 | |||
294 | drop_overlaps_that_are_ok(start, end); | ||
295 | __reserve_early(start, end, name, 0); | ||
296 | } | ||
297 | |||
298 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
299 | { | ||
300 | struct early_res *r; | ||
301 | |||
302 | if (start >= end) | ||
303 | return; | ||
304 | |||
305 | __check_and_double_early_res(start, end); | ||
306 | |||
307 | r = &early_res[early_res_count]; | ||
308 | |||
309 | r->start = start; | ||
310 | r->end = end; | ||
311 | r->overlap_ok = 0; | ||
312 | if (name) | ||
313 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
314 | early_res_count++; | ||
315 | } | ||
316 | |||
317 | void __init free_early(u64 start, u64 end) | ||
318 | { | ||
319 | struct early_res *r; | ||
320 | int i; | ||
321 | |||
322 | i = find_overlapped_early(start, end); | ||
323 | r = &early_res[i]; | ||
324 | if (i >= max_early_res || r->end != end || r->start != start) | ||
325 | panic("free_early on not reserved area: %llx-%llx!", | ||
326 | start, end - 1); | ||
327 | |||
328 | drop_range(i); | ||
329 | } | ||
330 | |||
331 | void __init free_early_partial(u64 start, u64 end) | ||
332 | { | ||
333 | struct early_res *r; | ||
334 | int i; | ||
335 | |||
336 | try_next: | ||
337 | i = find_overlapped_early(start, end); | ||
338 | if (i >= max_early_res) | ||
339 | return; | ||
340 | |||
341 | r = &early_res[i]; | ||
342 | /* hole ? */ | ||
343 | if (r->end >= end && r->start <= start) { | ||
344 | drop_range_partial(i, start, end); | ||
345 | return; | ||
346 | } | ||
347 | |||
348 | drop_range_partial(i, start, end); | ||
349 | goto try_next; | ||
350 | } | ||
351 | |||
352 | #ifdef CONFIG_NO_BOOTMEM | ||
353 | static void __init subtract_early_res(struct range *range, int az) | ||
354 | { | ||
355 | int i, count; | ||
356 | u64 final_start, final_end; | ||
357 | int idx = 0; | ||
358 | |||
359 | count = 0; | ||
360 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
361 | count++; | ||
362 | |||
363 | /* need to skip first one ?*/ | ||
364 | if (early_res != early_res_x) | ||
365 | idx = 1; | ||
366 | |||
367 | #define DEBUG_PRINT_EARLY_RES 1 | ||
368 | |||
369 | #if DEBUG_PRINT_EARLY_RES | ||
370 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
371 | #endif | ||
372 | for (i = idx; i < count; i++) { | ||
373 | struct early_res *r = &early_res[i]; | ||
374 | #if DEBUG_PRINT_EARLY_RES | ||
375 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
376 | r->start, r->end, r->name); | ||
377 | #endif | ||
378 | final_start = PFN_DOWN(r->start); | ||
379 | final_end = PFN_UP(r->end); | ||
380 | if (final_start >= final_end) | ||
381 | continue; | ||
382 | subtract_range(range, az, final_start, final_end); | ||
383 | } | ||
384 | |||
385 | } | ||
386 | |||
387 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
388 | { | ||
389 | int i, count; | ||
390 | u64 start = 0, end; | ||
391 | u64 size; | ||
392 | u64 mem; | ||
393 | struct range *range; | ||
394 | int nr_range; | ||
395 | |||
396 | count = 0; | ||
397 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
398 | count++; | ||
399 | |||
400 | count *= 2; | ||
401 | |||
402 | size = sizeof(struct range) * count; | ||
403 | end = get_max_mapped(); | ||
404 | #ifdef MAX_DMA32_PFN | ||
405 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
406 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
407 | #endif | ||
408 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
409 | if (mem == -1ULL) | ||
410 | panic("can not find more space for range free"); | ||
411 | |||
412 | range = __va(mem); | ||
413 | /* use early_node_map[] and early_res to get range array at first */ | ||
414 | memset(range, 0, size); | ||
415 | nr_range = 0; | ||
416 | |||
417 | /* need to go over early_node_map to find out good range for node */ | ||
418 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
419 | #ifdef CONFIG_X86_32 | ||
420 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
421 | #endif | ||
422 | subtract_early_res(range, count); | ||
423 | nr_range = clean_sort_range(range, count); | ||
424 | |||
425 | /* need to clear it ? */ | ||
426 | if (nodeid == MAX_NUMNODES) { | ||
427 | memset(&early_res[0], 0, | ||
428 | sizeof(struct early_res) * max_early_res); | ||
429 | early_res = NULL; | ||
430 | max_early_res = 0; | ||
431 | } | ||
432 | |||
433 | *rangep = range; | ||
434 | return nr_range; | ||
435 | } | ||
436 | #else | ||
437 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
438 | { | ||
439 | int i, count; | ||
440 | u64 final_start, final_end; | ||
441 | int idx = 0; | ||
442 | |||
443 | count = 0; | ||
444 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
445 | count++; | ||
446 | |||
447 | /* need to skip first one ?*/ | ||
448 | if (early_res != early_res_x) | ||
449 | idx = 1; | ||
450 | |||
451 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
452 | count - idx, max_early_res, start, end); | ||
453 | for (i = idx; i < count; i++) { | ||
454 | struct early_res *r = &early_res[i]; | ||
455 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
456 | r->start, r->end, r->name); | ||
457 | final_start = max(start, r->start); | ||
458 | final_end = min(end, r->end); | ||
459 | if (final_start >= final_end) { | ||
460 | printk(KERN_CONT "\n"); | ||
461 | continue; | ||
462 | } | ||
463 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
464 | final_start, final_end); | ||
465 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
466 | BOOTMEM_DEFAULT); | ||
467 | } | ||
468 | /* clear them */ | ||
469 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
470 | early_res = NULL; | ||
471 | max_early_res = 0; | ||
472 | early_res_count = 0; | ||
473 | } | ||
474 | #endif | ||
475 | |||
476 | /* Check for already reserved areas */ | ||
477 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
478 | { | ||
479 | int i; | ||
480 | u64 addr = *addrp; | ||
481 | int changed = 0; | ||
482 | struct early_res *r; | ||
483 | again: | ||
484 | i = find_overlapped_early(addr, addr + size); | ||
485 | r = &early_res[i]; | ||
486 | if (i < max_early_res && r->end) { | ||
487 | *addrp = addr = round_up(r->end, align); | ||
488 | changed = 1; | ||
489 | goto again; | ||
490 | } | ||
491 | return changed; | ||
492 | } | ||
493 | |||
494 | /* Check for already reserved areas */ | ||
495 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
496 | { | ||
497 | int i; | ||
498 | u64 addr = *addrp, last; | ||
499 | u64 size = *sizep; | ||
500 | int changed = 0; | ||
501 | again: | ||
502 | last = addr + size; | ||
503 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
504 | struct early_res *r = &early_res[i]; | ||
505 | if (last > r->start && addr < r->start) { | ||
506 | size = r->start - addr; | ||
507 | changed = 1; | ||
508 | goto again; | ||
509 | } | ||
510 | if (last > r->end && addr < r->end) { | ||
511 | addr = round_up(r->end, align); | ||
512 | size = last - addr; | ||
513 | changed = 1; | ||
514 | goto again; | ||
515 | } | ||
516 | if (last <= r->end && addr >= r->start) { | ||
517 | (*sizep)++; | ||
518 | return 0; | ||
519 | } | ||
520 | } | ||
521 | if (changed) { | ||
522 | *addrp = addr; | ||
523 | *sizep = size; | ||
524 | } | ||
525 | return changed; | ||
526 | } | ||
527 | |||
528 | /* | ||
529 | * Find a free area with specified alignment in a specific range. | ||
530 | * only with the area.between start to end is active range from early_node_map | ||
531 | * so they are good as RAM | ||
532 | */ | ||
533 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
534 | u64 size, u64 align) | ||
535 | { | ||
536 | u64 addr, last; | ||
537 | |||
538 | addr = round_up(ei_start, align); | ||
539 | if (addr < start) | ||
540 | addr = round_up(start, align); | ||
541 | if (addr >= ei_last) | ||
542 | goto out; | ||
543 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
544 | ; | ||
545 | last = addr + size; | ||
546 | if (last > ei_last) | ||
547 | goto out; | ||
548 | if (last > end) | ||
549 | goto out; | ||
550 | |||
551 | return addr; | ||
552 | |||
553 | out: | ||
554 | return -1ULL; | ||
555 | } | ||
556 | |||
557 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
558 | u64 *sizep, u64 align) | ||
559 | { | ||
560 | u64 addr, last; | ||
561 | |||
562 | addr = round_up(ei_start, align); | ||
563 | if (addr < start) | ||
564 | addr = round_up(start, align); | ||
565 | if (addr >= ei_last) | ||
566 | goto out; | ||
567 | *sizep = ei_last - addr; | ||
568 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
569 | ; | ||
570 | last = addr + *sizep; | ||
571 | if (last > ei_last) | ||
572 | goto out; | ||
573 | |||
574 | return addr; | ||
575 | |||
576 | out: | ||
577 | return -1ULL; | ||
578 | } | ||
diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 000000000000..ff915efef66d --- /dev/null +++ b/kernel/elfcore.c | |||
@@ -0,0 +1,28 @@ | |||
1 | #include <linux/elf.h> | ||
2 | #include <linux/fs.h> | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | #include <asm/elf.h> | ||
6 | |||
7 | |||
8 | Elf_Half __weak elf_core_extra_phdrs(void) | ||
9 | { | ||
10 | return 0; | ||
11 | } | ||
12 | |||
13 | int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, | ||
14 | unsigned long limit) | ||
15 | { | ||
16 | return 1; | ||
17 | } | ||
18 | |||
19 | int __weak elf_core_write_extra_data(struct file *file, size_t *size, | ||
20 | unsigned long limit) | ||
21 | { | ||
22 | return 1; | ||
23 | } | ||
24 | |||
25 | size_t __weak elf_core_extra_data_size(void) | ||
26 | { | ||
27 | return 0; | ||
28 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 546774a31a66..ce1e48c2d93d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
85 | BUG_ON(!sig); | 85 | BUG_ON(!sig); |
86 | BUG_ON(!atomic_read(&sig->count)); | 86 | BUG_ON(!atomic_read(&sig->count)); |
87 | 87 | ||
88 | sighand = rcu_dereference(tsk->sighand); | 88 | sighand = rcu_dereference_check(tsk->sighand, |
89 | rcu_read_lock_held() || | ||
90 | lockdep_is_held(&tasklist_lock)); | ||
89 | spin_lock(&sighand->siglock); | 91 | spin_lock(&sighand->siglock); |
90 | 92 | ||
91 | posix_cpu_timers_exit(tsk); | 93 | posix_cpu_timers_exit(tsk); |
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p) | |||
170 | repeat: | 172 | repeat: |
171 | tracehook_prepare_release_task(p); | 173 | tracehook_prepare_release_task(p); |
172 | /* don't need to get the RCU readlock here - the process is dead and | 174 | /* don't need to get the RCU readlock here - the process is dead and |
173 | * can't be modifying its own credentials */ | 175 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
176 | rcu_read_lock(); | ||
174 | atomic_dec(&__task_cred(p)->user->processes); | 177 | atomic_dec(&__task_cred(p)->user->processes); |
178 | rcu_read_unlock(); | ||
175 | 179 | ||
176 | proc_flush_task(p); | 180 | proc_flush_task(p); |
177 | 181 | ||
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files) | |||
473 | /* | 477 | /* |
474 | * It is safe to dereference the fd table without RCU or | 478 | * It is safe to dereference the fd table without RCU or |
475 | * ->file_lock because this is the last reference to the | 479 | * ->file_lock because this is the last reference to the |
476 | * files structure. | 480 | * files structure. But use RCU to shut RCU-lockdep up. |
477 | */ | 481 | */ |
482 | rcu_read_lock(); | ||
478 | fdt = files_fdtable(files); | 483 | fdt = files_fdtable(files); |
484 | rcu_read_unlock(); | ||
479 | for (;;) { | 485 | for (;;) { |
480 | unsigned long set; | 486 | unsigned long set; |
481 | i = j * __NFDBITS; | 487 | i = j * __NFDBITS; |
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files) | |||
521 | * at the end of the RCU grace period. Otherwise, | 527 | * at the end of the RCU grace period. Otherwise, |
522 | * you can free files immediately. | 528 | * you can free files immediately. |
523 | */ | 529 | */ |
530 | rcu_read_lock(); | ||
524 | fdt = files_fdtable(files); | 531 | fdt = files_fdtable(files); |
525 | if (fdt != &files->fdtab) | 532 | if (fdt != &files->fdtab) |
526 | kmem_cache_free(files_cachep, files); | 533 | kmem_cache_free(files_cachep, files); |
527 | free_fdtable(fdt); | 534 | free_fdtable(fdt); |
535 | rcu_read_unlock(); | ||
528 | } | 536 | } |
529 | } | 537 | } |
530 | 538 | ||
@@ -944,7 +952,8 @@ NORET_TYPE void do_exit(long code) | |||
944 | preempt_count()); | 952 | preempt_count()); |
945 | 953 | ||
946 | acct_update_integrals(tsk); | 954 | acct_update_integrals(tsk); |
947 | 955 | /* sync mm's RSS info before statistics gathering */ | |
956 | sync_mm_rss(tsk, tsk->mm); | ||
948 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 957 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
949 | if (group_dead) { | 958 | if (group_dead) { |
950 | hrtimer_cancel(&tsk->signal->real_timer); | 959 | hrtimer_cancel(&tsk->signal->real_timer); |
@@ -1180,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1180 | 1189 | ||
1181 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1190 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
1182 | int exit_code = p->exit_code; | 1191 | int exit_code = p->exit_code; |
1183 | int why, status; | 1192 | int why; |
1184 | 1193 | ||
1185 | get_task_struct(p); | 1194 | get_task_struct(p); |
1186 | read_unlock(&tasklist_lock); | 1195 | read_unlock(&tasklist_lock); |
diff --git a/kernel/fork.c b/kernel/fork.c index f88bd984df35..b0ec34abc0bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */ | |||
86 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | 86 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
87 | 87 | ||
88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | 88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
89 | EXPORT_SYMBOL_GPL(tasklist_lock); | ||
89 | 90 | ||
90 | int nr_processes(void) | 91 | int nr_processes(void) |
91 | { | 92 | { |
@@ -328,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
328 | if (!tmp) | 329 | if (!tmp) |
329 | goto fail_nomem; | 330 | goto fail_nomem; |
330 | *tmp = *mpnt; | 331 | *tmp = *mpnt; |
332 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
331 | pol = mpol_dup(vma_policy(mpnt)); | 333 | pol = mpol_dup(vma_policy(mpnt)); |
332 | retval = PTR_ERR(pol); | 334 | retval = PTR_ERR(pol); |
333 | if (IS_ERR(pol)) | 335 | if (IS_ERR(pol)) |
334 | goto fail_nomem_policy; | 336 | goto fail_nomem_policy; |
335 | vma_set_policy(tmp, pol); | 337 | vma_set_policy(tmp, pol); |
338 | if (anon_vma_fork(tmp, mpnt)) | ||
339 | goto fail_nomem_anon_vma_fork; | ||
336 | tmp->vm_flags &= ~VM_LOCKED; | 340 | tmp->vm_flags &= ~VM_LOCKED; |
337 | tmp->vm_mm = mm; | 341 | tmp->vm_mm = mm; |
338 | tmp->vm_next = NULL; | 342 | tmp->vm_next = NULL; |
339 | anon_vma_link(tmp); | ||
340 | file = tmp->vm_file; | 343 | file = tmp->vm_file; |
341 | if (file) { | 344 | if (file) { |
342 | struct inode *inode = file->f_path.dentry->d_inode; | 345 | struct inode *inode = file->f_path.dentry->d_inode; |
@@ -391,6 +394,8 @@ out: | |||
391 | flush_tlb_mm(oldmm); | 394 | flush_tlb_mm(oldmm); |
392 | up_write(&oldmm->mmap_sem); | 395 | up_write(&oldmm->mmap_sem); |
393 | return retval; | 396 | return retval; |
397 | fail_nomem_anon_vma_fork: | ||
398 | mpol_put(pol); | ||
394 | fail_nomem_policy: | 399 | fail_nomem_policy: |
395 | kmem_cache_free(vm_area_cachep, tmp); | 400 | kmem_cache_free(vm_area_cachep, tmp); |
396 | fail_nomem: | 401 | fail_nomem: |
@@ -454,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
454 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; | 459 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; |
455 | mm->core_state = NULL; | 460 | mm->core_state = NULL; |
456 | mm->nr_ptes = 0; | 461 | mm->nr_ptes = 0; |
457 | set_mm_counter(mm, file_rss, 0); | 462 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
458 | set_mm_counter(mm, anon_rss, 0); | ||
459 | spin_lock_init(&mm->page_table_lock); | 463 | spin_lock_init(&mm->page_table_lock); |
460 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 464 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
461 | mm->cached_hole_size = ~0UL; | 465 | mm->cached_hole_size = ~0UL; |
@@ -824,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
824 | */ | 828 | */ |
825 | static void posix_cpu_timers_init_group(struct signal_struct *sig) | 829 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
826 | { | 830 | { |
831 | unsigned long cpu_limit; | ||
832 | |||
827 | /* Thread group counters. */ | 833 | /* Thread group counters. */ |
828 | thread_group_cputime_init(sig); | 834 | thread_group_cputime_init(sig); |
829 | 835 | ||
@@ -838,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) | |||
838 | sig->cputime_expires.virt_exp = cputime_zero; | 844 | sig->cputime_expires.virt_exp = cputime_zero; |
839 | sig->cputime_expires.sched_exp = 0; | 845 | sig->cputime_expires.sched_exp = 0; |
840 | 846 | ||
841 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | 847 | cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
842 | sig->cputime_expires.prof_exp = | 848 | if (cpu_limit != RLIM_INFINITY) { |
843 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 849 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); |
844 | sig->cputimer.running = 1; | 850 | sig->cputimer.running = 1; |
845 | } | 851 | } |
846 | 852 | ||
@@ -1033,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1033 | #endif | 1039 | #endif |
1034 | retval = -EAGAIN; | 1040 | retval = -EAGAIN; |
1035 | if (atomic_read(&p->real_cred->user->processes) >= | 1041 | if (atomic_read(&p->real_cred->user->processes) >= |
1036 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 1042 | task_rlimit(p, RLIMIT_NPROC)) { |
1037 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | 1043 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && |
1038 | p->real_cred->user != INIT_USER) | 1044 | p->real_cred->user != INIT_USER) |
1039 | goto bad_fork_free; | 1045 | goto bad_fork_free; |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 235716556bf1..d49afb2395e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
146 | struct task_struct *p; | 146 | struct task_struct *p; |
147 | 147 | ||
148 | ret = -ESRCH; | 148 | ret = -ESRCH; |
149 | read_lock(&tasklist_lock); | 149 | rcu_read_lock(); |
150 | p = find_task_by_vpid(pid); | 150 | p = find_task_by_vpid(pid); |
151 | if (!p) | 151 | if (!p) |
152 | goto err_unlock; | 152 | goto err_unlock; |
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
157 | !capable(CAP_SYS_PTRACE)) | 157 | !capable(CAP_SYS_PTRACE)) |
158 | goto err_unlock; | 158 | goto err_unlock; |
159 | head = p->compat_robust_list; | 159 | head = p->compat_robust_list; |
160 | read_unlock(&tasklist_lock); | 160 | rcu_read_unlock(); |
161 | } | 161 | } |
162 | 162 | ||
163 | if (put_user(sizeof(*head), len_ptr)) | 163 | if (put_user(sizeof(*head), len_ptr)) |
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
165 | return put_user(ptr_to_compat(head), head_ptr); | 165 | return put_user(ptr_to_compat(head), head_ptr); |
166 | 166 | ||
167 | err_unlock: | 167 | err_unlock: |
168 | read_unlock(&tasklist_lock); | 168 | rcu_read_unlock(); |
169 | 169 | ||
170 | return ret; | 170 | return ret; |
171 | } | 171 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..d70394f12ee9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -18,11 +18,7 @@ | |||
18 | 18 | ||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | /** | 21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) |
22 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
23 | * @irq: irq number to initialize | ||
24 | */ | ||
25 | void dynamic_irq_init(unsigned int irq) | ||
26 | { | 22 | { |
27 | struct irq_desc *desc; | 23 | struct irq_desc *desc; |
28 | unsigned long flags; | 24 | unsigned long flags; |
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) | |||
41 | desc->depth = 1; | 37 | desc->depth = 1; |
42 | desc->msi_desc = NULL; | 38 | desc->msi_desc = NULL; |
43 | desc->handler_data = NULL; | 39 | desc->handler_data = NULL; |
44 | desc->chip_data = NULL; | 40 | if (!keep_chip_data) |
41 | desc->chip_data = NULL; | ||
45 | desc->action = NULL; | 42 | desc->action = NULL; |
46 | desc->irq_count = 0; | 43 | desc->irq_count = 0; |
47 | desc->irqs_unhandled = 0; | 44 | desc->irqs_unhandled = 0; |
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) | |||
55 | } | 52 | } |
56 | 53 | ||
57 | /** | 54 | /** |
58 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 55 | * dynamic_irq_init - initialize a dynamically allocated irq |
59 | * @irq: irq number to initialize | 56 | * @irq: irq number to initialize |
60 | */ | 57 | */ |
61 | void dynamic_irq_cleanup(unsigned int irq) | 58 | void dynamic_irq_init(unsigned int irq) |
59 | { | ||
60 | dynamic_irq_init_x(irq, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
65 | * @irq: irq number to initialize | ||
66 | * | ||
67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
68 | */ | ||
69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
70 | { | ||
71 | dynamic_irq_init_x(irq, true); | ||
72 | } | ||
73 | |||
74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
62 | { | 75 | { |
63 | struct irq_desc *desc = irq_to_desc(irq); | 76 | struct irq_desc *desc = irq_to_desc(irq); |
64 | unsigned long flags; | 77 | unsigned long flags; |
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
77 | } | 90 | } |
78 | desc->msi_desc = NULL; | 91 | desc->msi_desc = NULL; |
79 | desc->handler_data = NULL; | 92 | desc->handler_data = NULL; |
80 | desc->chip_data = NULL; | 93 | if (!keep_chip_data) |
94 | desc->chip_data = NULL; | ||
81 | desc->handle_irq = handle_bad_irq; | 95 | desc->handle_irq = handle_bad_irq; |
82 | desc->chip = &no_irq_chip; | 96 | desc->chip = &no_irq_chip; |
83 | desc->name = NULL; | 97 | desc->name = NULL; |
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
85 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 99 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
86 | } | 100 | } |
87 | 101 | ||
102 | /** | ||
103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
104 | * @irq: irq number to initialize | ||
105 | */ | ||
106 | void dynamic_irq_cleanup(unsigned int irq) | ||
107 | { | ||
108 | dynamic_irq_cleanup_x(irq, false); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
113 | * @irq: irq number to initialize | ||
114 | * | ||
115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
116 | */ | ||
117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
118 | { | ||
119 | dynamic_irq_cleanup_x(irq, true); | ||
120 | } | ||
121 | |||
88 | 122 | ||
89 | /** | 123 | /** |
90 | * set_irq_chip - set the irq chip for an irq | 124 | * set_irq_chip - set the irq chip for an irq |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
20 | #include <linux/rculist.h> | 20 | #include <linux/rculist.h> |
21 | #include <linux/hash.h> | 21 | #include <linux/hash.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/radix-tree.h> |
23 | #include <trace/events/irq.h> | 23 | #include <trace/events/irq.h> |
24 | 24 | ||
25 | #include "internals.h" | 25 | #include "internals.h" |
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | |||
87 | { | 87 | { |
88 | void *ptr; | 88 | void *ptr; |
89 | 89 | ||
90 | if (slab_is_available()) | 90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), |
91 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | 91 | GFP_ATOMIC, node); |
92 | GFP_ATOMIC, node); | ||
93 | else | ||
94 | ptr = alloc_bootmem_node(NODE_DATA(node), | ||
95 | nr * sizeof(*desc->kstat_irqs)); | ||
96 | 92 | ||
97 | /* | 93 | /* |
98 | * don't overwite if can not get new one | 94 | * don't overwite if can not get new one |
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | |||
132 | */ | 128 | */ |
133 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | 129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); |
134 | 130 | ||
135 | struct irq_desc **irq_desc_ptrs __read_mostly; | 131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); |
132 | |||
133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
134 | { | ||
135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
136 | } | ||
137 | |||
138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
139 | { | ||
140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
141 | } | ||
142 | |||
143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
144 | { | ||
145 | void **ptr; | ||
146 | |||
147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
148 | if (ptr) | ||
149 | radix_tree_replace_slot(ptr, desc); | ||
150 | } | ||
136 | 151 | ||
137 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | 152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { |
138 | [0 ... NR_IRQS_LEGACY-1] = { | 153 | [0 ... NR_IRQS_LEGACY-1] = { |
@@ -164,9 +179,6 @@ int __init early_irq_init(void) | |||
164 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | 179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); |
165 | node = first_online_node; | 180 | node = first_online_node; |
166 | 181 | ||
167 | /* allocate irq_desc_ptrs array based on nr_irqs */ | ||
168 | irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); | ||
169 | |||
170 | /* allocate based on nr_cpu_ids */ | 182 | /* allocate based on nr_cpu_ids */ |
171 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | 183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * |
172 | sizeof(int), GFP_NOWAIT, node); | 184 | sizeof(int), GFP_NOWAIT, node); |
@@ -180,23 +192,12 @@ int __init early_irq_init(void) | |||
180 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
181 | alloc_desc_masks(&desc[i], node, true); | 193 | alloc_desc_masks(&desc[i], node, true); |
182 | init_desc_masks(&desc[i]); | 194 | init_desc_masks(&desc[i]); |
183 | irq_desc_ptrs[i] = desc + i; | 195 | set_irq_desc(i, &desc[i]); |
184 | } | 196 | } |
185 | 197 | ||
186 | for (i = legacy_count; i < nr_irqs; i++) | ||
187 | irq_desc_ptrs[i] = NULL; | ||
188 | |||
189 | return arch_early_irq_init(); | 198 | return arch_early_irq_init(); |
190 | } | 199 | } |
191 | 200 | ||
192 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
193 | { | ||
194 | if (irq_desc_ptrs && irq < nr_irqs) | ||
195 | return irq_desc_ptrs[irq]; | ||
196 | |||
197 | return NULL; | ||
198 | } | ||
199 | |||
200 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | 201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) |
201 | { | 202 | { |
202 | struct irq_desc *desc; | 203 | struct irq_desc *desc; |
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
208 | return NULL; | 209 | return NULL; |
209 | } | 210 | } |
210 | 211 | ||
211 | desc = irq_desc_ptrs[irq]; | 212 | desc = irq_to_desc(irq); |
212 | if (desc) | 213 | if (desc) |
213 | return desc; | 214 | return desc; |
214 | 215 | ||
215 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
216 | 217 | ||
217 | /* We have to check it to avoid races with another CPU */ | 218 | /* We have to check it to avoid races with another CPU */ |
218 | desc = irq_desc_ptrs[irq]; | 219 | desc = irq_to_desc(irq); |
219 | if (desc) | 220 | if (desc) |
220 | goto out_unlock; | 221 | goto out_unlock; |
221 | 222 | ||
222 | if (slab_is_available()) | 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
224 | else | ||
225 | desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); | ||
226 | 224 | ||
227 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | 225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); |
228 | if (!desc) { | 226 | if (!desc) { |
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
231 | } | 229 | } |
232 | init_one_irq_desc(irq, desc, node); | 230 | init_one_irq_desc(irq, desc, node); |
233 | 231 | ||
234 | irq_desc_ptrs[irq] = desc; | 232 | set_irq_desc(irq, desc); |
235 | 233 | ||
236 | out_unlock: | 234 | out_unlock: |
237 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); | |||
21 | extern raw_spinlock_t sparse_irq_lock; | 21 | extern raw_spinlock_t sparse_irq_lock; |
22 | 22 | ||
23 | #ifdef CONFIG_SPARSE_IRQ | 23 | #ifdef CONFIG_SPARSE_IRQ |
24 | /* irq_desc_ptrs allocated at boot time */ | 24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); |
25 | extern struct irq_desc **irq_desc_ptrs; | ||
26 | #else | ||
27 | /* irq_desc_ptrs is a fixed size array */ | ||
28 | extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; | ||
29 | #endif | 25 | #endif |
30 | 26 | ||
31 | #ifdef CONFIG_PROC_FS | 27 | #ifdef CONFIG_PROC_FS |
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..963559dbd858 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c | |||
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
71 | 71 | ||
72 | /* We have to check it to avoid races with another CPU */ | 72 | /* We have to check it to avoid races with another CPU */ |
73 | desc = irq_desc_ptrs[irq]; | 73 | desc = irq_to_desc(irq); |
74 | 74 | ||
75 | if (desc && old_desc != desc) | 75 | if (desc && old_desc != desc) |
76 | goto out_unlock; | 76 | goto out_unlock; |
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
90 | goto out_unlock; | 90 | goto out_unlock; |
91 | } | 91 | } |
92 | 92 | ||
93 | irq_desc_ptrs[irq] = desc; | 93 | replace_irq_desc(irq, desc); |
94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
95 | 95 | ||
96 | /* free the old one */ | 96 | /* free the old one */ |
diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb73155..87ebe8adc474 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -41,7 +41,7 @@ | |||
41 | #include <asm/sections.h> | 41 | #include <asm/sections.h> |
42 | 42 | ||
43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 43 | /* Per cpu memory for storing cpu states in case of system crash. */ |
44 | note_buf_t* crash_notes; | 44 | note_buf_t __percpu *crash_notes; |
45 | 45 | ||
46 | /* vmcoreinfo stuff */ | 46 | /* vmcoreinfo stuff */ |
47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716d..fa034d29cf73 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -42,9 +42,11 @@ | |||
42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/debugfs.h> | 44 | #include <linux/debugfs.h> |
45 | #include <linux/sysctl.h> | ||
45 | #include <linux/kdebug.h> | 46 | #include <linux/kdebug.h> |
46 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
47 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
49 | #include <linux/cpu.h> | ||
48 | 50 | ||
49 | #include <asm-generic/sections.h> | 51 | #include <asm-generic/sections.h> |
50 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
105 | * stepping on the instruction on a vmalloced/kmalloced/data page | 107 | * stepping on the instruction on a vmalloced/kmalloced/data page |
106 | * is a recipe for disaster | 108 | * is a recipe for disaster |
107 | */ | 109 | */ |
108 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | ||
109 | |||
110 | struct kprobe_insn_page { | 110 | struct kprobe_insn_page { |
111 | struct list_head list; | 111 | struct list_head list; |
112 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 112 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
113 | char slot_used[INSNS_PER_PAGE]; | ||
114 | int nused; | 113 | int nused; |
115 | int ngarbage; | 114 | int ngarbage; |
115 | char slot_used[]; | ||
116 | }; | ||
117 | |||
118 | #define KPROBE_INSN_PAGE_SIZE(slots) \ | ||
119 | (offsetof(struct kprobe_insn_page, slot_used) + \ | ||
120 | (sizeof(char) * (slots))) | ||
121 | |||
122 | struct kprobe_insn_cache { | ||
123 | struct list_head pages; /* list of kprobe_insn_page */ | ||
124 | size_t insn_size; /* size of instruction slot */ | ||
125 | int nr_garbage; | ||
116 | }; | 126 | }; |
117 | 127 | ||
128 | static int slots_per_page(struct kprobe_insn_cache *c) | ||
129 | { | ||
130 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | ||
131 | } | ||
132 | |||
118 | enum kprobe_slot_state { | 133 | enum kprobe_slot_state { |
119 | SLOT_CLEAN = 0, | 134 | SLOT_CLEAN = 0, |
120 | SLOT_DIRTY = 1, | 135 | SLOT_DIRTY = 1, |
121 | SLOT_USED = 2, | 136 | SLOT_USED = 2, |
122 | }; | 137 | }; |
123 | 138 | ||
124 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ | 139 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ |
125 | static LIST_HEAD(kprobe_insn_pages); | 140 | static struct kprobe_insn_cache kprobe_insn_slots = { |
126 | static int kprobe_garbage_slots; | 141 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
127 | static int collect_garbage_slots(void); | 142 | .insn_size = MAX_INSN_SIZE, |
143 | .nr_garbage = 0, | ||
144 | }; | ||
145 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | ||
128 | 146 | ||
129 | /** | 147 | /** |
130 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 148 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
131 | * We allocate an executable page if there's no room on existing ones. | 149 | * We allocate an executable page if there's no room on existing ones. |
132 | */ | 150 | */ |
133 | static kprobe_opcode_t __kprobes *__get_insn_slot(void) | 151 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
134 | { | 152 | { |
135 | struct kprobe_insn_page *kip; | 153 | struct kprobe_insn_page *kip; |
136 | 154 | ||
137 | retry: | 155 | retry: |
138 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 156 | list_for_each_entry(kip, &c->pages, list) { |
139 | if (kip->nused < INSNS_PER_PAGE) { | 157 | if (kip->nused < slots_per_page(c)) { |
140 | int i; | 158 | int i; |
141 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 159 | for (i = 0; i < slots_per_page(c); i++) { |
142 | if (kip->slot_used[i] == SLOT_CLEAN) { | 160 | if (kip->slot_used[i] == SLOT_CLEAN) { |
143 | kip->slot_used[i] = SLOT_USED; | 161 | kip->slot_used[i] = SLOT_USED; |
144 | kip->nused++; | 162 | kip->nused++; |
145 | return kip->insns + (i * MAX_INSN_SIZE); | 163 | return kip->insns + (i * c->insn_size); |
146 | } | 164 | } |
147 | } | 165 | } |
148 | /* Surprise! No unused slots. Fix kip->nused. */ | 166 | /* kip->nused is broken. Fix it. */ |
149 | kip->nused = INSNS_PER_PAGE; | 167 | kip->nused = slots_per_page(c); |
168 | WARN_ON(1); | ||
150 | } | 169 | } |
151 | } | 170 | } |
152 | 171 | ||
153 | /* If there are any garbage slots, collect it and try again. */ | 172 | /* If there are any garbage slots, collect it and try again. */ |
154 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | 173 | if (c->nr_garbage && collect_garbage_slots(c) == 0) |
155 | goto retry; | 174 | goto retry; |
156 | } | 175 | |
157 | /* All out of space. Need to allocate a new page. Use slot 0. */ | 176 | /* All out of space. Need to allocate a new page. */ |
158 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 177 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
159 | if (!kip) | 178 | if (!kip) |
160 | return NULL; | 179 | return NULL; |
161 | 180 | ||
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) | |||
170 | return NULL; | 189 | return NULL; |
171 | } | 190 | } |
172 | INIT_LIST_HEAD(&kip->list); | 191 | INIT_LIST_HEAD(&kip->list); |
173 | list_add(&kip->list, &kprobe_insn_pages); | 192 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
174 | memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); | ||
175 | kip->slot_used[0] = SLOT_USED; | 193 | kip->slot_used[0] = SLOT_USED; |
176 | kip->nused = 1; | 194 | kip->nused = 1; |
177 | kip->ngarbage = 0; | 195 | kip->ngarbage = 0; |
196 | list_add(&kip->list, &c->pages); | ||
178 | return kip->insns; | 197 | return kip->insns; |
179 | } | 198 | } |
180 | 199 | ||
200 | |||
181 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 201 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
182 | { | 202 | { |
183 | kprobe_opcode_t *ret; | 203 | kprobe_opcode_t *ret = NULL; |
204 | |||
184 | mutex_lock(&kprobe_insn_mutex); | 205 | mutex_lock(&kprobe_insn_mutex); |
185 | ret = __get_insn_slot(); | 206 | ret = __get_insn_slot(&kprobe_insn_slots); |
186 | mutex_unlock(&kprobe_insn_mutex); | 207 | mutex_unlock(&kprobe_insn_mutex); |
208 | |||
187 | return ret; | 209 | return ret; |
188 | } | 210 | } |
189 | 211 | ||
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
199 | * so as not to have to set it up again the | 221 | * so as not to have to set it up again the |
200 | * next time somebody inserts a probe. | 222 | * next time somebody inserts a probe. |
201 | */ | 223 | */ |
202 | if (!list_is_singular(&kprobe_insn_pages)) { | 224 | if (!list_is_singular(&kip->list)) { |
203 | list_del(&kip->list); | 225 | list_del(&kip->list); |
204 | module_free(NULL, kip->insns); | 226 | module_free(NULL, kip->insns); |
205 | kfree(kip); | 227 | kfree(kip); |
@@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
209 | return 0; | 231 | return 0; |
210 | } | 232 | } |
211 | 233 | ||
212 | static int __kprobes collect_garbage_slots(void) | 234 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) |
213 | { | 235 | { |
214 | struct kprobe_insn_page *kip, *next; | 236 | struct kprobe_insn_page *kip, *next; |
215 | 237 | ||
216 | /* Ensure no-one is interrupted on the garbages */ | 238 | /* Ensure no-one is interrupted on the garbages */ |
217 | synchronize_sched(); | 239 | synchronize_sched(); |
218 | 240 | ||
219 | list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { | 241 | list_for_each_entry_safe(kip, next, &c->pages, list) { |
220 | int i; | 242 | int i; |
221 | if (kip->ngarbage == 0) | 243 | if (kip->ngarbage == 0) |
222 | continue; | 244 | continue; |
223 | kip->ngarbage = 0; /* we will collect all garbages */ | 245 | kip->ngarbage = 0; /* we will collect all garbages */ |
224 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 246 | for (i = 0; i < slots_per_page(c); i++) { |
225 | if (kip->slot_used[i] == SLOT_DIRTY && | 247 | if (kip->slot_used[i] == SLOT_DIRTY && |
226 | collect_one_slot(kip, i)) | 248 | collect_one_slot(kip, i)) |
227 | break; | 249 | break; |
228 | } | 250 | } |
229 | } | 251 | } |
230 | kprobe_garbage_slots = 0; | 252 | c->nr_garbage = 0; |
231 | return 0; | 253 | return 0; |
232 | } | 254 | } |
233 | 255 | ||
234 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | 256 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
257 | kprobe_opcode_t *slot, int dirty) | ||
235 | { | 258 | { |
236 | struct kprobe_insn_page *kip; | 259 | struct kprobe_insn_page *kip; |
237 | 260 | ||
238 | mutex_lock(&kprobe_insn_mutex); | 261 | list_for_each_entry(kip, &c->pages, list) { |
239 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 262 | long idx = ((long)slot - (long)kip->insns) / c->insn_size; |
240 | if (kip->insns <= slot && | 263 | if (idx >= 0 && idx < slots_per_page(c)) { |
241 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 264 | WARN_ON(kip->slot_used[idx] != SLOT_USED); |
242 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
243 | if (dirty) { | 265 | if (dirty) { |
244 | kip->slot_used[i] = SLOT_DIRTY; | 266 | kip->slot_used[idx] = SLOT_DIRTY; |
245 | kip->ngarbage++; | 267 | kip->ngarbage++; |
268 | if (++c->nr_garbage > slots_per_page(c)) | ||
269 | collect_garbage_slots(c); | ||
246 | } else | 270 | } else |
247 | collect_one_slot(kip, i); | 271 | collect_one_slot(kip, idx); |
248 | break; | 272 | return; |
249 | } | 273 | } |
250 | } | 274 | } |
275 | /* Could not free this slot. */ | ||
276 | WARN_ON(1); | ||
277 | } | ||
251 | 278 | ||
252 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | 279 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) |
253 | collect_garbage_slots(); | 280 | { |
254 | 281 | mutex_lock(&kprobe_insn_mutex); | |
282 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
255 | mutex_unlock(&kprobe_insn_mutex); | 283 | mutex_unlock(&kprobe_insn_mutex); |
256 | } | 284 | } |
285 | #ifdef CONFIG_OPTPROBES | ||
286 | /* For optimized_kprobe buffer */ | ||
287 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | ||
288 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | ||
289 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | ||
290 | /* .insn_size is initialized later */ | ||
291 | .nr_garbage = 0, | ||
292 | }; | ||
293 | /* Get a slot for optimized_kprobe buffer */ | ||
294 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
295 | { | ||
296 | kprobe_opcode_t *ret = NULL; | ||
297 | |||
298 | mutex_lock(&kprobe_optinsn_mutex); | ||
299 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
300 | mutex_unlock(&kprobe_optinsn_mutex); | ||
301 | |||
302 | return ret; | ||
303 | } | ||
304 | |||
305 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
306 | { | ||
307 | mutex_lock(&kprobe_optinsn_mutex); | ||
308 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
309 | mutex_unlock(&kprobe_optinsn_mutex); | ||
310 | } | ||
311 | #endif | ||
257 | #endif | 312 | #endif |
258 | 313 | ||
259 | /* We have preemption disabled.. so it is safe to use __ versions */ | 314 | /* We have preemption disabled.. so it is safe to use __ versions */ |
@@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) | |||
284 | if (p->addr == addr) | 339 | if (p->addr == addr) |
285 | return p; | 340 | return p; |
286 | } | 341 | } |
342 | |||
343 | return NULL; | ||
344 | } | ||
345 | |||
346 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); | ||
347 | |||
348 | /* Return true if the kprobe is an aggregator */ | ||
349 | static inline int kprobe_aggrprobe(struct kprobe *p) | ||
350 | { | ||
351 | return p->pre_handler == aggr_pre_handler; | ||
352 | } | ||
353 | |||
354 | /* | ||
355 | * Keep all fields in the kprobe consistent | ||
356 | */ | ||
357 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
358 | { | ||
359 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
360 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
361 | } | ||
362 | |||
363 | #ifdef CONFIG_OPTPROBES | ||
364 | /* NOTE: change this value only with kprobe_mutex held */ | ||
365 | static bool kprobes_allow_optimization; | ||
366 | |||
367 | /* | ||
368 | * Call all pre_handler on the list, but ignores its return value. | ||
369 | * This must be called from arch-dep optimized caller. | ||
370 | */ | ||
371 | void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
372 | { | ||
373 | struct kprobe *kp; | ||
374 | |||
375 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
376 | if (kp->pre_handler && likely(!kprobe_disabled(kp))) { | ||
377 | set_kprobe_instance(kp); | ||
378 | kp->pre_handler(kp, regs); | ||
379 | } | ||
380 | reset_kprobe_instance(); | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* Return true(!0) if the kprobe is ready for optimization. */ | ||
385 | static inline int kprobe_optready(struct kprobe *p) | ||
386 | { | ||
387 | struct optimized_kprobe *op; | ||
388 | |||
389 | if (kprobe_aggrprobe(p)) { | ||
390 | op = container_of(p, struct optimized_kprobe, kp); | ||
391 | return arch_prepared_optinsn(&op->optinsn); | ||
392 | } | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Return an optimized kprobe whose optimizing code replaces | ||
399 | * instructions including addr (exclude breakpoint). | ||
400 | */ | ||
401 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | ||
402 | { | ||
403 | int i; | ||
404 | struct kprobe *p = NULL; | ||
405 | struct optimized_kprobe *op; | ||
406 | |||
407 | /* Don't check i == 0, since that is a breakpoint case. */ | ||
408 | for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) | ||
409 | p = get_kprobe((void *)(addr - i)); | ||
410 | |||
411 | if (p && kprobe_optready(p)) { | ||
412 | op = container_of(p, struct optimized_kprobe, kp); | ||
413 | if (arch_within_optimized_kprobe(op, addr)) | ||
414 | return p; | ||
415 | } | ||
416 | |||
287 | return NULL; | 417 | return NULL; |
288 | } | 418 | } |
289 | 419 | ||
420 | /* Optimization staging list, protected by kprobe_mutex */ | ||
421 | static LIST_HEAD(optimizing_list); | ||
422 | |||
423 | static void kprobe_optimizer(struct work_struct *work); | ||
424 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | ||
425 | #define OPTIMIZE_DELAY 5 | ||
426 | |||
427 | /* Kprobe jump optimizer */ | ||
428 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
429 | { | ||
430 | struct optimized_kprobe *op, *tmp; | ||
431 | |||
432 | /* Lock modules while optimizing kprobes */ | ||
433 | mutex_lock(&module_mutex); | ||
434 | mutex_lock(&kprobe_mutex); | ||
435 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
436 | goto end; | ||
437 | |||
438 | /* | ||
439 | * Wait for quiesence period to ensure all running interrupts | ||
440 | * are done. Because optprobe may modify multiple instructions | ||
441 | * there is a chance that Nth instruction is interrupted. In that | ||
442 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
443 | * instruction. This wait is for avoiding it. | ||
444 | */ | ||
445 | synchronize_sched(); | ||
446 | |||
447 | /* | ||
448 | * The optimization/unoptimization refers online_cpus via | ||
449 | * stop_machine() and cpu-hotplug modifies online_cpus. | ||
450 | * And same time, text_mutex will be held in cpu-hotplug and here. | ||
451 | * This combination can cause a deadlock (cpu-hotplug try to lock | ||
452 | * text_mutex but stop_machine can not be done because online_cpus | ||
453 | * has been changed) | ||
454 | * To avoid this deadlock, we need to call get_online_cpus() | ||
455 | * for preventing cpu-hotplug outside of text_mutex locking. | ||
456 | */ | ||
457 | get_online_cpus(); | ||
458 | mutex_lock(&text_mutex); | ||
459 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | ||
460 | WARN_ON(kprobe_disabled(&op->kp)); | ||
461 | if (arch_optimize_kprobe(op) < 0) | ||
462 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
463 | list_del_init(&op->list); | ||
464 | } | ||
465 | mutex_unlock(&text_mutex); | ||
466 | put_online_cpus(); | ||
467 | end: | ||
468 | mutex_unlock(&kprobe_mutex); | ||
469 | mutex_unlock(&module_mutex); | ||
470 | } | ||
471 | |||
472 | /* Optimize kprobe if p is ready to be optimized */ | ||
473 | static __kprobes void optimize_kprobe(struct kprobe *p) | ||
474 | { | ||
475 | struct optimized_kprobe *op; | ||
476 | |||
477 | /* Check if the kprobe is disabled or not ready for optimization. */ | ||
478 | if (!kprobe_optready(p) || !kprobes_allow_optimization || | ||
479 | (kprobe_disabled(p) || kprobes_all_disarmed)) | ||
480 | return; | ||
481 | |||
482 | /* Both of break_handler and post_handler are not supported. */ | ||
483 | if (p->break_handler || p->post_handler) | ||
484 | return; | ||
485 | |||
486 | op = container_of(p, struct optimized_kprobe, kp); | ||
487 | |||
488 | /* Check there is no other kprobes at the optimized instructions */ | ||
489 | if (arch_check_optimized_kprobe(op) < 0) | ||
490 | return; | ||
491 | |||
492 | /* Check if it is already optimized. */ | ||
493 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | ||
494 | return; | ||
495 | |||
496 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | ||
497 | list_add(&op->list, &optimizing_list); | ||
498 | if (!delayed_work_pending(&optimizing_work)) | ||
499 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
500 | } | ||
501 | |||
502 | /* Unoptimize a kprobe if p is optimized */ | ||
503 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | ||
504 | { | ||
505 | struct optimized_kprobe *op; | ||
506 | |||
507 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | ||
508 | op = container_of(p, struct optimized_kprobe, kp); | ||
509 | if (!list_empty(&op->list)) | ||
510 | /* Dequeue from the optimization queue */ | ||
511 | list_del_init(&op->list); | ||
512 | else | ||
513 | /* Replace jump with break */ | ||
514 | arch_unoptimize_kprobe(op); | ||
515 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
516 | } | ||
517 | } | ||
518 | |||
519 | /* Remove optimized instructions */ | ||
520 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | ||
521 | { | ||
522 | struct optimized_kprobe *op; | ||
523 | |||
524 | op = container_of(p, struct optimized_kprobe, kp); | ||
525 | if (!list_empty(&op->list)) { | ||
526 | /* Dequeue from the optimization queue */ | ||
527 | list_del_init(&op->list); | ||
528 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
529 | } | ||
530 | /* Don't unoptimize, because the target code will be freed. */ | ||
531 | arch_remove_optimized_kprobe(op); | ||
532 | } | ||
533 | |||
534 | /* Try to prepare optimized instructions */ | ||
535 | static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | ||
536 | { | ||
537 | struct optimized_kprobe *op; | ||
538 | |||
539 | op = container_of(p, struct optimized_kprobe, kp); | ||
540 | arch_prepare_optimized_kprobe(op); | ||
541 | } | ||
542 | |||
543 | /* Free optimized instructions and optimized_kprobe */ | ||
544 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
545 | { | ||
546 | struct optimized_kprobe *op; | ||
547 | |||
548 | op = container_of(p, struct optimized_kprobe, kp); | ||
549 | arch_remove_optimized_kprobe(op); | ||
550 | kfree(op); | ||
551 | } | ||
552 | |||
553 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | ||
554 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
555 | { | ||
556 | struct optimized_kprobe *op; | ||
557 | |||
558 | op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); | ||
559 | if (!op) | ||
560 | return NULL; | ||
561 | |||
562 | INIT_LIST_HEAD(&op->list); | ||
563 | op->kp.addr = p->addr; | ||
564 | arch_prepare_optimized_kprobe(op); | ||
565 | |||
566 | return &op->kp; | ||
567 | } | ||
568 | |||
569 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); | ||
570 | |||
571 | /* | ||
572 | * Prepare an optimized_kprobe and optimize it | ||
573 | * NOTE: p must be a normal registered kprobe | ||
574 | */ | ||
575 | static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | ||
576 | { | ||
577 | struct kprobe *ap; | ||
578 | struct optimized_kprobe *op; | ||
579 | |||
580 | ap = alloc_aggr_kprobe(p); | ||
581 | if (!ap) | ||
582 | return; | ||
583 | |||
584 | op = container_of(ap, struct optimized_kprobe, kp); | ||
585 | if (!arch_prepared_optinsn(&op->optinsn)) { | ||
586 | /* If failed to setup optimizing, fallback to kprobe */ | ||
587 | free_aggr_kprobe(ap); | ||
588 | return; | ||
589 | } | ||
590 | |||
591 | init_aggr_kprobe(ap, p); | ||
592 | optimize_kprobe(ap); | ||
593 | } | ||
594 | |||
595 | #ifdef CONFIG_SYSCTL | ||
596 | static void __kprobes optimize_all_kprobes(void) | ||
597 | { | ||
598 | struct hlist_head *head; | ||
599 | struct hlist_node *node; | ||
600 | struct kprobe *p; | ||
601 | unsigned int i; | ||
602 | |||
603 | /* If optimization is already allowed, just return */ | ||
604 | if (kprobes_allow_optimization) | ||
605 | return; | ||
606 | |||
607 | kprobes_allow_optimization = true; | ||
608 | mutex_lock(&text_mutex); | ||
609 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
610 | head = &kprobe_table[i]; | ||
611 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
612 | if (!kprobe_disabled(p)) | ||
613 | optimize_kprobe(p); | ||
614 | } | ||
615 | mutex_unlock(&text_mutex); | ||
616 | printk(KERN_INFO "Kprobes globally optimized\n"); | ||
617 | } | ||
618 | |||
619 | static void __kprobes unoptimize_all_kprobes(void) | ||
620 | { | ||
621 | struct hlist_head *head; | ||
622 | struct hlist_node *node; | ||
623 | struct kprobe *p; | ||
624 | unsigned int i; | ||
625 | |||
626 | /* If optimization is already prohibited, just return */ | ||
627 | if (!kprobes_allow_optimization) | ||
628 | return; | ||
629 | |||
630 | kprobes_allow_optimization = false; | ||
631 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
632 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
633 | mutex_lock(&text_mutex); | ||
634 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
635 | head = &kprobe_table[i]; | ||
636 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
637 | if (!kprobe_disabled(p)) | ||
638 | unoptimize_kprobe(p); | ||
639 | } | ||
640 | } | ||
641 | |||
642 | mutex_unlock(&text_mutex); | ||
643 | put_online_cpus(); | ||
644 | /* Allow all currently running kprobes to complete */ | ||
645 | synchronize_sched(); | ||
646 | } | ||
647 | |||
648 | int sysctl_kprobes_optimization; | ||
649 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | ||
650 | void __user *buffer, size_t *length, | ||
651 | loff_t *ppos) | ||
652 | { | ||
653 | int ret; | ||
654 | |||
655 | mutex_lock(&kprobe_mutex); | ||
656 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; | ||
657 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
658 | |||
659 | if (sysctl_kprobes_optimization) | ||
660 | optimize_all_kprobes(); | ||
661 | else | ||
662 | unoptimize_all_kprobes(); | ||
663 | mutex_unlock(&kprobe_mutex); | ||
664 | |||
665 | return ret; | ||
666 | } | ||
667 | #endif /* CONFIG_SYSCTL */ | ||
668 | |||
669 | static void __kprobes __arm_kprobe(struct kprobe *p) | ||
670 | { | ||
671 | struct kprobe *old_p; | ||
672 | |||
673 | /* Check collision with other optimized kprobes */ | ||
674 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
675 | if (unlikely(old_p)) | ||
676 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | ||
677 | |||
678 | arch_arm_kprobe(p); | ||
679 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | ||
680 | } | ||
681 | |||
682 | static void __kprobes __disarm_kprobe(struct kprobe *p) | ||
683 | { | ||
684 | struct kprobe *old_p; | ||
685 | |||
686 | unoptimize_kprobe(p); /* Try to unoptimize */ | ||
687 | arch_disarm_kprobe(p); | ||
688 | |||
689 | /* If another kprobe was blocked, optimize it. */ | ||
690 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
691 | if (unlikely(old_p)) | ||
692 | optimize_kprobe(old_p); | ||
693 | } | ||
694 | |||
695 | #else /* !CONFIG_OPTPROBES */ | ||
696 | |||
697 | #define optimize_kprobe(p) do {} while (0) | ||
698 | #define unoptimize_kprobe(p) do {} while (0) | ||
699 | #define kill_optimized_kprobe(p) do {} while (0) | ||
700 | #define prepare_optimized_kprobe(p) do {} while (0) | ||
701 | #define try_to_optimize_kprobe(p) do {} while (0) | ||
702 | #define __arm_kprobe(p) arch_arm_kprobe(p) | ||
703 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | ||
704 | |||
705 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
706 | { | ||
707 | kfree(p); | ||
708 | } | ||
709 | |||
710 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
711 | { | ||
712 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); | ||
713 | } | ||
714 | #endif /* CONFIG_OPTPROBES */ | ||
715 | |||
290 | /* Arm a kprobe with text_mutex */ | 716 | /* Arm a kprobe with text_mutex */ |
291 | static void __kprobes arm_kprobe(struct kprobe *kp) | 717 | static void __kprobes arm_kprobe(struct kprobe *kp) |
292 | { | 718 | { |
719 | /* | ||
720 | * Here, since __arm_kprobe() doesn't use stop_machine(), | ||
721 | * this doesn't cause deadlock on text_mutex. So, we don't | ||
722 | * need get_online_cpus(). | ||
723 | */ | ||
293 | mutex_lock(&text_mutex); | 724 | mutex_lock(&text_mutex); |
294 | arch_arm_kprobe(kp); | 725 | __arm_kprobe(kp); |
295 | mutex_unlock(&text_mutex); | 726 | mutex_unlock(&text_mutex); |
296 | } | 727 | } |
297 | 728 | ||
298 | /* Disarm a kprobe with text_mutex */ | 729 | /* Disarm a kprobe with text_mutex */ |
299 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 730 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
300 | { | 731 | { |
732 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
301 | mutex_lock(&text_mutex); | 733 | mutex_lock(&text_mutex); |
302 | arch_disarm_kprobe(kp); | 734 | __disarm_kprobe(kp); |
303 | mutex_unlock(&text_mutex); | 735 | mutex_unlock(&text_mutex); |
736 | put_online_cpus(); | ||
304 | } | 737 | } |
305 | 738 | ||
306 | /* | 739 | /* |
@@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
369 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 802 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) |
370 | { | 803 | { |
371 | struct kprobe *kp; | 804 | struct kprobe *kp; |
372 | if (p->pre_handler != aggr_pre_handler) { | 805 | if (!kprobe_aggrprobe(p)) { |
373 | p->nmissed++; | 806 | p->nmissed++; |
374 | } else { | 807 | } else { |
375 | list_for_each_entry_rcu(kp, &p->list, list) | 808 | list_for_each_entry_rcu(kp, &p->list, list) |
@@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
493 | } | 926 | } |
494 | 927 | ||
495 | /* | 928 | /* |
496 | * Keep all fields in the kprobe consistent | ||
497 | */ | ||
498 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
499 | { | ||
500 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
501 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Add the new probe to ap->list. Fail if this is the | 929 | * Add the new probe to ap->list. Fail if this is the |
506 | * second jprobe at the address - two jprobes can't coexist | 930 | * second jprobe at the address - two jprobes can't coexist |
507 | */ | 931 | */ |
508 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | 932 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) |
509 | { | 933 | { |
510 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 934 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
935 | |||
936 | if (p->break_handler || p->post_handler) | ||
937 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | ||
938 | |||
511 | if (p->break_handler) { | 939 | if (p->break_handler) { |
512 | if (ap->break_handler) | 940 | if (ap->break_handler) |
513 | return -EEXIST; | 941 | return -EEXIST; |
@@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
522 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 950 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
523 | if (!kprobes_all_disarmed) | 951 | if (!kprobes_all_disarmed) |
524 | /* Arm the breakpoint again. */ | 952 | /* Arm the breakpoint again. */ |
525 | arm_kprobe(ap); | 953 | __arm_kprobe(ap); |
526 | } | 954 | } |
527 | return 0; | 955 | return 0; |
528 | } | 956 | } |
@@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
531 | * Fill in the required fields of the "manager kprobe". Replace the | 959 | * Fill in the required fields of the "manager kprobe". Replace the |
532 | * earlier kprobe in the hlist with the manager kprobe | 960 | * earlier kprobe in the hlist with the manager kprobe |
533 | */ | 961 | */ |
534 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 962 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
535 | { | 963 | { |
964 | /* Copy p's insn slot to ap */ | ||
536 | copy_kprobe(p, ap); | 965 | copy_kprobe(p, ap); |
537 | flush_insn_slot(ap); | 966 | flush_insn_slot(ap); |
538 | ap->addr = p->addr; | 967 | ap->addr = p->addr; |
539 | ap->flags = p->flags; | 968 | ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; |
540 | ap->pre_handler = aggr_pre_handler; | 969 | ap->pre_handler = aggr_pre_handler; |
541 | ap->fault_handler = aggr_fault_handler; | 970 | ap->fault_handler = aggr_fault_handler; |
542 | /* We don't care the kprobe which has gone. */ | 971 | /* We don't care the kprobe which has gone. */ |
@@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
546 | ap->break_handler = aggr_break_handler; | 975 | ap->break_handler = aggr_break_handler; |
547 | 976 | ||
548 | INIT_LIST_HEAD(&ap->list); | 977 | INIT_LIST_HEAD(&ap->list); |
549 | list_add_rcu(&p->list, &ap->list); | 978 | INIT_HLIST_NODE(&ap->hlist); |
550 | 979 | ||
980 | list_add_rcu(&p->list, &ap->list); | ||
551 | hlist_replace_rcu(&p->hlist, &ap->hlist); | 981 | hlist_replace_rcu(&p->hlist, &ap->hlist); |
552 | } | 982 | } |
553 | 983 | ||
@@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
561 | int ret = 0; | 991 | int ret = 0; |
562 | struct kprobe *ap = old_p; | 992 | struct kprobe *ap = old_p; |
563 | 993 | ||
564 | if (old_p->pre_handler != aggr_pre_handler) { | 994 | if (!kprobe_aggrprobe(old_p)) { |
565 | /* If old_p is not an aggr_probe, create new aggr_kprobe. */ | 995 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ |
566 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 996 | ap = alloc_aggr_kprobe(old_p); |
567 | if (!ap) | 997 | if (!ap) |
568 | return -ENOMEM; | 998 | return -ENOMEM; |
569 | add_aggr_kprobe(ap, old_p); | 999 | init_aggr_kprobe(ap, old_p); |
570 | } | 1000 | } |
571 | 1001 | ||
572 | if (kprobe_gone(ap)) { | 1002 | if (kprobe_gone(ap)) { |
@@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
585 | */ | 1015 | */ |
586 | return ret; | 1016 | return ret; |
587 | 1017 | ||
1018 | /* Prepare optimized instructions if possible. */ | ||
1019 | prepare_optimized_kprobe(ap); | ||
1020 | |||
588 | /* | 1021 | /* |
589 | * Clear gone flag to prevent allocating new slot again, and | 1022 | * Clear gone flag to prevent allocating new slot again, and |
590 | * set disabled flag because it is not armed yet. | 1023 | * set disabled flag because it is not armed yet. |
@@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
593 | | KPROBE_FLAG_DISABLED; | 1026 | | KPROBE_FLAG_DISABLED; |
594 | } | 1027 | } |
595 | 1028 | ||
1029 | /* Copy ap's insn slot to p */ | ||
596 | copy_kprobe(ap, p); | 1030 | copy_kprobe(ap, p); |
597 | return add_new_kprobe(ap, p); | 1031 | return add_new_kprobe(ap, p); |
598 | } | 1032 | } |
@@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
743 | p->nmissed = 0; | 1177 | p->nmissed = 0; |
744 | INIT_LIST_HEAD(&p->list); | 1178 | INIT_LIST_HEAD(&p->list); |
745 | mutex_lock(&kprobe_mutex); | 1179 | mutex_lock(&kprobe_mutex); |
1180 | |||
1181 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | ||
1182 | mutex_lock(&text_mutex); | ||
1183 | |||
746 | old_p = get_kprobe(p->addr); | 1184 | old_p = get_kprobe(p->addr); |
747 | if (old_p) { | 1185 | if (old_p) { |
1186 | /* Since this may unoptimize old_p, locking text_mutex. */ | ||
748 | ret = register_aggr_kprobe(old_p, p); | 1187 | ret = register_aggr_kprobe(old_p, p); |
749 | goto out; | 1188 | goto out; |
750 | } | 1189 | } |
751 | 1190 | ||
752 | mutex_lock(&text_mutex); | ||
753 | ret = arch_prepare_kprobe(p); | 1191 | ret = arch_prepare_kprobe(p); |
754 | if (ret) | 1192 | if (ret) |
755 | goto out_unlock_text; | 1193 | goto out; |
756 | 1194 | ||
757 | INIT_HLIST_NODE(&p->hlist); | 1195 | INIT_HLIST_NODE(&p->hlist); |
758 | hlist_add_head_rcu(&p->hlist, | 1196 | hlist_add_head_rcu(&p->hlist, |
759 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1197 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
760 | 1198 | ||
761 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1199 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
762 | arch_arm_kprobe(p); | 1200 | __arm_kprobe(p); |
1201 | |||
1202 | /* Try to optimize kprobe */ | ||
1203 | try_to_optimize_kprobe(p); | ||
763 | 1204 | ||
764 | out_unlock_text: | ||
765 | mutex_unlock(&text_mutex); | ||
766 | out: | 1205 | out: |
1206 | mutex_unlock(&text_mutex); | ||
1207 | put_online_cpus(); | ||
767 | mutex_unlock(&kprobe_mutex); | 1208 | mutex_unlock(&kprobe_mutex); |
768 | 1209 | ||
769 | if (probed_mod) | 1210 | if (probed_mod) |
@@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
785 | return -EINVAL; | 1226 | return -EINVAL; |
786 | 1227 | ||
787 | if (old_p == p || | 1228 | if (old_p == p || |
788 | (old_p->pre_handler == aggr_pre_handler && | 1229 | (kprobe_aggrprobe(old_p) && |
789 | list_is_singular(&old_p->list))) { | 1230 | list_is_singular(&old_p->list))) { |
790 | /* | 1231 | /* |
791 | * Only probe on the hash list. Disarm only if kprobes are | 1232 | * Only probe on the hash list. Disarm only if kprobes are |
@@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
793 | * already have been removed. We save on flushing icache. | 1234 | * already have been removed. We save on flushing icache. |
794 | */ | 1235 | */ |
795 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1236 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) |
796 | disarm_kprobe(p); | 1237 | disarm_kprobe(old_p); |
797 | hlist_del_rcu(&old_p->hlist); | 1238 | hlist_del_rcu(&old_p->hlist); |
798 | } else { | 1239 | } else { |
799 | if (p->break_handler && !kprobe_gone(p)) | 1240 | if (p->break_handler && !kprobe_gone(p)) |
@@ -809,8 +1250,13 @@ noclean: | |||
809 | list_del_rcu(&p->list); | 1250 | list_del_rcu(&p->list); |
810 | if (!kprobe_disabled(old_p)) { | 1251 | if (!kprobe_disabled(old_p)) { |
811 | try_to_disable_aggr_kprobe(old_p); | 1252 | try_to_disable_aggr_kprobe(old_p); |
812 | if (!kprobes_all_disarmed && kprobe_disabled(old_p)) | 1253 | if (!kprobes_all_disarmed) { |
813 | disarm_kprobe(old_p); | 1254 | if (kprobe_disabled(old_p)) |
1255 | disarm_kprobe(old_p); | ||
1256 | else | ||
1257 | /* Try to optimize this probe again */ | ||
1258 | optimize_kprobe(old_p); | ||
1259 | } | ||
814 | } | 1260 | } |
815 | } | 1261 | } |
816 | return 0; | 1262 | return 0; |
@@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | |||
827 | old_p = list_entry(p->list.next, struct kprobe, list); | 1273 | old_p = list_entry(p->list.next, struct kprobe, list); |
828 | list_del(&p->list); | 1274 | list_del(&p->list); |
829 | arch_remove_kprobe(old_p); | 1275 | arch_remove_kprobe(old_p); |
830 | kfree(old_p); | 1276 | free_aggr_kprobe(old_p); |
831 | } | 1277 | } |
832 | } | 1278 | } |
833 | 1279 | ||
@@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1123 | struct kprobe *kp; | 1569 | struct kprobe *kp; |
1124 | 1570 | ||
1125 | p->flags |= KPROBE_FLAG_GONE; | 1571 | p->flags |= KPROBE_FLAG_GONE; |
1126 | if (p->pre_handler == aggr_pre_handler) { | 1572 | if (kprobe_aggrprobe(p)) { |
1127 | /* | 1573 | /* |
1128 | * If this is an aggr_kprobe, we have to list all the | 1574 | * If this is an aggr_kprobe, we have to list all the |
1129 | * chained probes and mark them GONE. | 1575 | * chained probes and mark them GONE. |
@@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1132 | kp->flags |= KPROBE_FLAG_GONE; | 1578 | kp->flags |= KPROBE_FLAG_GONE; |
1133 | p->post_handler = NULL; | 1579 | p->post_handler = NULL; |
1134 | p->break_handler = NULL; | 1580 | p->break_handler = NULL; |
1581 | kill_optimized_kprobe(p); | ||
1135 | } | 1582 | } |
1136 | /* | 1583 | /* |
1137 | * Here, we can remove insn_slot safely, because no thread calls | 1584 | * Here, we can remove insn_slot safely, because no thread calls |
@@ -1241,6 +1688,15 @@ static int __init init_kprobes(void) | |||
1241 | } | 1688 | } |
1242 | } | 1689 | } |
1243 | 1690 | ||
1691 | #if defined(CONFIG_OPTPROBES) | ||
1692 | #if defined(__ARCH_WANT_KPROBES_INSN_SLOT) | ||
1693 | /* Init kprobe_optinsn_slots */ | ||
1694 | kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; | ||
1695 | #endif | ||
1696 | /* By default, kprobes can be optimized */ | ||
1697 | kprobes_allow_optimization = true; | ||
1698 | #endif | ||
1699 | |||
1244 | /* By default, kprobes are armed */ | 1700 | /* By default, kprobes are armed */ |
1245 | kprobes_all_disarmed = false; | 1701 | kprobes_all_disarmed = false; |
1246 | 1702 | ||
@@ -1259,7 +1715,7 @@ static int __init init_kprobes(void) | |||
1259 | 1715 | ||
1260 | #ifdef CONFIG_DEBUG_FS | 1716 | #ifdef CONFIG_DEBUG_FS |
1261 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 1717 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, |
1262 | const char *sym, int offset,char *modname) | 1718 | const char *sym, int offset, char *modname, struct kprobe *pp) |
1263 | { | 1719 | { |
1264 | char *kprobe_type; | 1720 | char *kprobe_type; |
1265 | 1721 | ||
@@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
1269 | kprobe_type = "j"; | 1725 | kprobe_type = "j"; |
1270 | else | 1726 | else |
1271 | kprobe_type = "k"; | 1727 | kprobe_type = "k"; |
1728 | |||
1272 | if (sym) | 1729 | if (sym) |
1273 | seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", | 1730 | seq_printf(pi, "%p %s %s+0x%x %s ", |
1274 | p->addr, kprobe_type, sym, offset, | 1731 | p->addr, kprobe_type, sym, offset, |
1275 | (modname ? modname : " "), | 1732 | (modname ? modname : " ")); |
1276 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
1277 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | ||
1278 | "[DISABLED]" : "")); | ||
1279 | else | 1733 | else |
1280 | seq_printf(pi, "%p %s %p %s%s\n", | 1734 | seq_printf(pi, "%p %s %p ", |
1281 | p->addr, kprobe_type, p->addr, | 1735 | p->addr, kprobe_type, p->addr); |
1282 | (kprobe_gone(p) ? "[GONE]" : ""), | 1736 | |
1283 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | 1737 | if (!pp) |
1284 | "[DISABLED]" : "")); | 1738 | pp = p; |
1739 | seq_printf(pi, "%s%s%s\n", | ||
1740 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
1741 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | ||
1742 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | ||
1285 | } | 1743 | } |
1286 | 1744 | ||
1287 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 1745 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
@@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
1317 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1775 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1318 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 1776 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
1319 | &offset, &modname, namebuf); | 1777 | &offset, &modname, namebuf); |
1320 | if (p->pre_handler == aggr_pre_handler) { | 1778 | if (kprobe_aggrprobe(p)) { |
1321 | list_for_each_entry_rcu(kp, &p->list, list) | 1779 | list_for_each_entry_rcu(kp, &p->list, list) |
1322 | report_probe(pi, kp, sym, offset, modname); | 1780 | report_probe(pi, kp, sym, offset, modname, p); |
1323 | } else | 1781 | } else |
1324 | report_probe(pi, p, sym, offset, modname); | 1782 | report_probe(pi, p, sym, offset, modname, NULL); |
1325 | } | 1783 | } |
1326 | preempt_enable(); | 1784 | preempt_enable(); |
1327 | return 0; | 1785 | return 0; |
@@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) | |||
1399 | goto out; | 1857 | goto out; |
1400 | } | 1858 | } |
1401 | 1859 | ||
1402 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1403 | arm_kprobe(p); | ||
1404 | |||
1405 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
1406 | if (p != kp) | 1860 | if (p != kp) |
1407 | kp->flags &= ~KPROBE_FLAG_DISABLED; | 1861 | kp->flags &= ~KPROBE_FLAG_DISABLED; |
1862 | |||
1863 | if (!kprobes_all_disarmed && kprobe_disabled(p)) { | ||
1864 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
1865 | arm_kprobe(p); | ||
1866 | } | ||
1408 | out: | 1867 | out: |
1409 | mutex_unlock(&kprobe_mutex); | 1868 | mutex_unlock(&kprobe_mutex); |
1410 | return ret; | 1869 | return ret; |
@@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void) | |||
1424 | if (!kprobes_all_disarmed) | 1883 | if (!kprobes_all_disarmed) |
1425 | goto already_enabled; | 1884 | goto already_enabled; |
1426 | 1885 | ||
1886 | /* Arming kprobes doesn't optimize kprobe itself */ | ||
1427 | mutex_lock(&text_mutex); | 1887 | mutex_lock(&text_mutex); |
1428 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1888 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1429 | head = &kprobe_table[i]; | 1889 | head = &kprobe_table[i]; |
1430 | hlist_for_each_entry_rcu(p, node, head, hlist) | 1890 | hlist_for_each_entry_rcu(p, node, head, hlist) |
1431 | if (!kprobe_disabled(p)) | 1891 | if (!kprobe_disabled(p)) |
1432 | arch_arm_kprobe(p); | 1892 | __arm_kprobe(p); |
1433 | } | 1893 | } |
1434 | mutex_unlock(&text_mutex); | 1894 | mutex_unlock(&text_mutex); |
1435 | 1895 | ||
@@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void) | |||
1456 | 1916 | ||
1457 | kprobes_all_disarmed = true; | 1917 | kprobes_all_disarmed = true; |
1458 | printk(KERN_INFO "Kprobes globally disabled\n"); | 1918 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1919 | |||
1920 | /* | ||
1921 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1922 | * because disarming may also unoptimize kprobes. | ||
1923 | */ | ||
1924 | get_online_cpus(); | ||
1459 | mutex_lock(&text_mutex); | 1925 | mutex_lock(&text_mutex); |
1460 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1926 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1461 | head = &kprobe_table[i]; | 1927 | head = &kprobe_table[i]; |
1462 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1928 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1463 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 1929 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1464 | arch_disarm_kprobe(p); | 1930 | __disarm_kprobe(p); |
1465 | } | 1931 | } |
1466 | } | 1932 | } |
1467 | 1933 | ||
1468 | mutex_unlock(&text_mutex); | 1934 | mutex_unlock(&text_mutex); |
1935 | put_online_cpus(); | ||
1469 | mutex_unlock(&kprobe_mutex); | 1936 | mutex_unlock(&kprobe_mutex); |
1470 | /* Allow all currently running kprobes to complete */ | 1937 | /* Allow all currently running kprobes to complete */ |
1471 | synchronize_sched(); | 1938 | synchronize_sched(); |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..6b1ccc3f0205 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void) | |||
197 | goto group_exit; | 197 | goto group_exit; |
198 | } | 198 | } |
199 | 199 | ||
200 | /* create the /sys/kernel/uids/ directory */ | ||
201 | error = uids_sysfs_init(); | ||
202 | if (error) | ||
203 | goto notes_exit; | ||
204 | |||
205 | return 0; | 200 | return 0; |
206 | 201 | ||
207 | notes_exit: | ||
208 | if (notes_size > 0) | ||
209 | sysfs_remove_bin_file(kernel_kobj, &notes_attr); | ||
210 | group_exit: | 202 | group_exit: |
211 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | 203 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); |
212 | kset_exit: | 204 | kset_exit: |
diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e0..82ed0ea15194 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
101 | * | 101 | * |
102 | * Description: This helper function creates and names a kernel | 102 | * Description: This helper function creates and names a kernel |
103 | * thread. The thread will be stopped: use wake_up_process() to start | 103 | * thread. The thread will be stopped: use wake_up_process() to start |
104 | * it. See also kthread_run(), kthread_create_on_cpu(). | 104 | * it. See also kthread_run(). |
105 | * | 105 | * |
106 | * When woken, the thread will run @threadfn() with @data as its | 106 | * When woken, the thread will run @threadfn() with @data as its |
107 | * argument. @threadfn() can either call do_exit() directly if it is a | 107 | * argument. @threadfn() can either call do_exit() directly if it is a |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c62ec14609b9..0c30d0455de1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void) | |||
3809 | lockdep_print_held_locks(curr); | 3809 | lockdep_print_held_locks(curr); |
3810 | } | 3810 | } |
3811 | } | 3811 | } |
3812 | |||
3813 | void lockdep_rcu_dereference(const char *file, const int line) | ||
3814 | { | ||
3815 | struct task_struct *curr = current; | ||
3816 | |||
3817 | if (!debug_locks_off()) | ||
3818 | return; | ||
3819 | printk("\n===================================================\n"); | ||
3820 | printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); | ||
3821 | printk( "---------------------------------------------------\n"); | ||
3822 | printk("%s:%d invoked rcu_dereference_check() without protection!\n", | ||
3823 | file, line); | ||
3824 | printk("\nother info that might help us debug this:\n\n"); | ||
3825 | lockdep_print_held_locks(curr); | ||
3826 | printk("\nstack backtrace:\n"); | ||
3827 | dump_stack(); | ||
3828 | } | ||
3829 | EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); | ||
diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..c968d3606dca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) | |||
474 | 474 | ||
475 | INIT_LIST_HEAD(&mod->modules_which_use_me); | 475 | INIT_LIST_HEAD(&mod->modules_which_use_me); |
476 | for_each_possible_cpu(cpu) | 476 | for_each_possible_cpu(cpu) |
477 | local_set(__module_ref_addr(mod, cpu), 0); | 477 | per_cpu_ptr(mod->refptr, cpu)->count = 0; |
478 | |||
478 | /* Hold reference count during initialization. */ | 479 | /* Hold reference count during initialization. */ |
479 | local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); | 480 | __this_cpu_write(mod->refptr->count, 1); |
480 | /* Backwards compatibility macros put refcount during init. */ | 481 | /* Backwards compatibility macros put refcount during init. */ |
481 | mod->waiter = current; | 482 | mod->waiter = current; |
482 | } | 483 | } |
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) | |||
619 | int cpu; | 620 | int cpu; |
620 | 621 | ||
621 | for_each_possible_cpu(cpu) | 622 | for_each_possible_cpu(cpu) |
622 | total += local_read(__module_ref_addr(mod, cpu)); | 623 | total += per_cpu_ptr(mod->refptr, cpu)->count; |
623 | return total; | 624 | return total; |
624 | } | 625 | } |
625 | EXPORT_SYMBOL(module_refcount); | 626 | EXPORT_SYMBOL(module_refcount); |
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = { | |||
796 | void module_put(struct module *module) | 797 | void module_put(struct module *module) |
797 | { | 798 | { |
798 | if (module) { | 799 | if (module) { |
799 | unsigned int cpu = get_cpu(); | 800 | preempt_disable(); |
800 | local_dec(__module_ref_addr(module, cpu)); | 801 | __this_cpu_dec(module->refptr->count); |
802 | |||
801 | trace_module_put(module, _RET_IP_, | 803 | trace_module_put(module, _RET_IP_, |
802 | local_read(__module_ref_addr(module, cpu))); | 804 | __this_cpu_read(module->refptr->count)); |
803 | /* Maybe they're waiting for us to drop reference? */ | 805 | /* Maybe they're waiting for us to drop reference? */ |
804 | if (unlikely(!module_is_live(module))) | 806 | if (unlikely(!module_is_live(module))) |
805 | wake_up_process(module->waiter); | 807 | wake_up_process(module->waiter); |
806 | put_cpu(); | 808 | preempt_enable(); |
807 | } | 809 | } |
808 | } | 810 | } |
809 | EXPORT_SYMBOL(module_put); | 811 | EXPORT_SYMBOL(module_put); |
@@ -1083,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1083 | if (sattr->name == NULL) | 1085 | if (sattr->name == NULL) |
1084 | goto out; | 1086 | goto out; |
1085 | sect_attrs->nsections++; | 1087 | sect_attrs->nsections++; |
1088 | sysfs_attr_init(&sattr->mattr.attr); | ||
1086 | sattr->mattr.show = module_sect_show; | 1089 | sattr->mattr.show = module_sect_show; |
1087 | sattr->mattr.store = NULL; | 1090 | sattr->mattr.store = NULL; |
1088 | sattr->mattr.attr.name = sattr->name; | 1091 | sattr->mattr.attr.name = sattr->name; |
@@ -1178,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1178 | if (sect_empty(&sechdrs[i])) | 1181 | if (sect_empty(&sechdrs[i])) |
1179 | continue; | 1182 | continue; |
1180 | if (sechdrs[i].sh_type == SHT_NOTE) { | 1183 | if (sechdrs[i].sh_type == SHT_NOTE) { |
1184 | sysfs_bin_attr_init(nattr); | ||
1181 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; | 1185 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; |
1182 | nattr->attr.mode = S_IRUGO; | 1186 | nattr->attr.mode = S_IRUGO; |
1183 | nattr->size = sechdrs[i].sh_size; | 1187 | nattr->size = sechdrs[i].sh_size; |
@@ -1250,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod) | |||
1250 | if (!attr->test || | 1254 | if (!attr->test || |
1251 | (attr->test && attr->test(mod))) { | 1255 | (attr->test && attr->test(mod))) { |
1252 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1256 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
1257 | sysfs_attr_init(&temp_attr->attr); | ||
1253 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1258 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); |
1254 | ++temp_attr; | 1259 | ++temp_attr; |
1255 | } | 1260 | } |
@@ -1397,9 +1402,9 @@ static void free_module(struct module *mod) | |||
1397 | kfree(mod->args); | 1402 | kfree(mod->args); |
1398 | if (mod->percpu) | 1403 | if (mod->percpu) |
1399 | percpu_modfree(mod->percpu); | 1404 | percpu_modfree(mod->percpu); |
1400 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 1405 | #if defined(CONFIG_MODULE_UNLOAD) |
1401 | if (mod->refptr) | 1406 | if (mod->refptr) |
1402 | percpu_modfree(mod->refptr); | 1407 | free_percpu(mod->refptr); |
1403 | #endif | 1408 | #endif |
1404 | /* Free lock-classes: */ | 1409 | /* Free lock-classes: */ |
1405 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1410 | lockdep_free_key_range(mod->module_core, mod->core_size); |
@@ -2162,9 +2167,8 @@ static noinline struct module *load_module(void __user *umod, | |||
2162 | mod = (void *)sechdrs[modindex].sh_addr; | 2167 | mod = (void *)sechdrs[modindex].sh_addr; |
2163 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | 2168 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); |
2164 | 2169 | ||
2165 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2170 | #if defined(CONFIG_MODULE_UNLOAD) |
2166 | mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), | 2171 | mod->refptr = alloc_percpu(struct module_ref); |
2167 | mod->name); | ||
2168 | if (!mod->refptr) { | 2172 | if (!mod->refptr) { |
2169 | err = -ENOMEM; | 2173 | err = -ENOMEM; |
2170 | goto free_init; | 2174 | goto free_init; |
@@ -2396,8 +2400,8 @@ static noinline struct module *load_module(void __user *umod, | |||
2396 | kobject_put(&mod->mkobj.kobj); | 2400 | kobject_put(&mod->mkobj.kobj); |
2397 | free_unload: | 2401 | free_unload: |
2398 | module_unload_free(mod); | 2402 | module_unload_free(mod); |
2399 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2403 | #if defined(CONFIG_MODULE_UNLOAD) |
2400 | percpu_modfree(mod->refptr); | 2404 | free_percpu(mod->refptr); |
2401 | free_init: | 2405 | free_init: |
2402 | #endif | 2406 | #endif |
2403 | module_free(mod, mod->module_init); | 2407 | module_free(mod, mod->module_init); |
diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7643eb..2488ba7eb568 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
78 | int ret = NOTIFY_DONE; | 78 | int ret = NOTIFY_DONE; |
79 | struct notifier_block *nb, *next_nb; | 79 | struct notifier_block *nb, *next_nb; |
80 | 80 | ||
81 | nb = rcu_dereference(*nl); | 81 | nb = rcu_dereference_raw(*nl); |
82 | 82 | ||
83 | while (nb && nr_to_call) { | 83 | while (nb && nr_to_call) { |
84 | next_nb = rcu_dereference(nb->next); | 84 | next_nb = rcu_dereference_raw(nb->next); |
85 | 85 | ||
86 | #ifdef CONFIG_DEBUG_NOTIFIERS | 86 | #ifdef CONFIG_DEBUG_NOTIFIERS |
87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { | 87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { |
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, | |||
309 | * racy then it does not matter what the result of the test | 309 | * racy then it does not matter what the result of the test |
310 | * is, we re-check the list after having taken the lock anyway: | 310 | * is, we re-check the list after having taken the lock anyway: |
311 | */ | 311 | */ |
312 | if (rcu_dereference(nh->head)) { | 312 | if (rcu_dereference_raw(nh->head)) { |
313 | down_read(&nh->rwsem); | 313 | down_read(&nh->rwsem); |
314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, | 314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, |
315 | nr_calls); | 315 | nr_calls); |
diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..93caf65ff57c --- /dev/null +++ b/kernel/padata.c | |||
@@ -0,0 +1,696 @@ | |||
1 | /* | ||
2 | * padata.c - generic interface to process data streams in parallel | ||
3 | * | ||
4 | * Copyright (C) 2008, 2009 secunet Security Networks AG | ||
5 | * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms and conditions of the GNU General Public License, | ||
9 | * version 2, as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License along with | ||
17 | * this program; if not, write to the Free Software Foundation, Inc., | ||
18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/err.h> | ||
24 | #include <linux/cpu.h> | ||
25 | #include <linux/padata.h> | ||
26 | #include <linux/mutex.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/rcupdate.h> | ||
29 | |||
/*
 * Limits for sequence numbers and for objects in flight.
 *
 * The expansions are parenthesized so that they behave as a single
 * operand wherever they are used, e.g. in "MAX_SEQ_NR / num_cpus".
 */
#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM (10000 * NR_CPUS)
32 | |||
33 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | ||
34 | { | ||
35 | int cpu, target_cpu; | ||
36 | |||
37 | target_cpu = cpumask_first(pd->cpumask); | ||
38 | for (cpu = 0; cpu < cpu_index; cpu++) | ||
39 | target_cpu = cpumask_next(target_cpu, pd->cpumask); | ||
40 | |||
41 | return target_cpu; | ||
42 | } | ||
43 | |||
44 | static int padata_cpu_hash(struct padata_priv *padata) | ||
45 | { | ||
46 | int cpu_index; | ||
47 | struct parallel_data *pd; | ||
48 | |||
49 | pd = padata->pd; | ||
50 | |||
51 | /* | ||
52 | * Hash the sequence numbers to the cpus by taking | ||
53 | * seq_nr mod. number of cpus in use. | ||
54 | */ | ||
55 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); | ||
56 | |||
57 | return padata_index_to_cpu(pd, cpu_index); | ||
58 | } | ||
59 | |||
60 | static void padata_parallel_worker(struct work_struct *work) | ||
61 | { | ||
62 | struct padata_queue *queue; | ||
63 | struct parallel_data *pd; | ||
64 | struct padata_instance *pinst; | ||
65 | LIST_HEAD(local_list); | ||
66 | |||
67 | local_bh_disable(); | ||
68 | queue = container_of(work, struct padata_queue, pwork); | ||
69 | pd = queue->pd; | ||
70 | pinst = pd->pinst; | ||
71 | |||
72 | spin_lock(&queue->parallel.lock); | ||
73 | list_replace_init(&queue->parallel.list, &local_list); | ||
74 | spin_unlock(&queue->parallel.lock); | ||
75 | |||
76 | while (!list_empty(&local_list)) { | ||
77 | struct padata_priv *padata; | ||
78 | |||
79 | padata = list_entry(local_list.next, | ||
80 | struct padata_priv, list); | ||
81 | |||
82 | list_del_init(&padata->list); | ||
83 | |||
84 | padata->parallel(padata); | ||
85 | } | ||
86 | |||
87 | local_bh_enable(); | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * padata_do_parallel - padata parallelization function | ||
92 | * | ||
93 | * @pinst: padata instance | ||
94 | * @padata: object to be parallelized | ||
95 | * @cb_cpu: cpu the serialization callback function will run on, | ||
96 | * must be in the cpumask of padata. | ||
97 | * | ||
98 | * The parallelization callback function will run with BHs off. | ||
99 | * Note: Every object which is parallelized by padata_do_parallel | ||
100 | * must be seen by padata_do_serial. | ||
101 | */ | ||
102 | int padata_do_parallel(struct padata_instance *pinst, | ||
103 | struct padata_priv *padata, int cb_cpu) | ||
104 | { | ||
105 | int target_cpu, err; | ||
106 | struct padata_queue *queue; | ||
107 | struct parallel_data *pd; | ||
108 | |||
109 | rcu_read_lock_bh(); | ||
110 | |||
111 | pd = rcu_dereference(pinst->pd); | ||
112 | |||
113 | err = 0; | ||
114 | if (!(pinst->flags & PADATA_INIT)) | ||
115 | goto out; | ||
116 | |||
117 | err = -EBUSY; | ||
118 | if ((pinst->flags & PADATA_RESET)) | ||
119 | goto out; | ||
120 | |||
121 | if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) | ||
122 | goto out; | ||
123 | |||
124 | err = -EINVAL; | ||
125 | if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) | ||
126 | goto out; | ||
127 | |||
128 | err = -EINPROGRESS; | ||
129 | atomic_inc(&pd->refcnt); | ||
130 | padata->pd = pd; | ||
131 | padata->cb_cpu = cb_cpu; | ||
132 | |||
133 | if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) | ||
134 | atomic_set(&pd->seq_nr, -1); | ||
135 | |||
136 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); | ||
137 | |||
138 | target_cpu = padata_cpu_hash(padata); | ||
139 | queue = per_cpu_ptr(pd->queue, target_cpu); | ||
140 | |||
141 | spin_lock(&queue->parallel.lock); | ||
142 | list_add_tail(&padata->list, &queue->parallel.list); | ||
143 | spin_unlock(&queue->parallel.lock); | ||
144 | |||
145 | queue_work_on(target_cpu, pinst->wq, &queue->pwork); | ||
146 | |||
147 | out: | ||
148 | rcu_read_unlock_bh(); | ||
149 | |||
150 | return err; | ||
151 | } | ||
152 | EXPORT_SYMBOL(padata_do_parallel); | ||
153 | |||
154 | static struct padata_priv *padata_get_next(struct parallel_data *pd) | ||
155 | { | ||
156 | int cpu, num_cpus, empty, calc_seq_nr; | ||
157 | int seq_nr, next_nr, overrun, next_overrun; | ||
158 | struct padata_queue *queue, *next_queue; | ||
159 | struct padata_priv *padata; | ||
160 | struct padata_list *reorder; | ||
161 | |||
162 | empty = 0; | ||
163 | next_nr = -1; | ||
164 | next_overrun = 0; | ||
165 | next_queue = NULL; | ||
166 | |||
167 | num_cpus = cpumask_weight(pd->cpumask); | ||
168 | |||
169 | for_each_cpu(cpu, pd->cpumask) { | ||
170 | queue = per_cpu_ptr(pd->queue, cpu); | ||
171 | reorder = &queue->reorder; | ||
172 | |||
173 | /* | ||
174 | * Calculate the seq_nr of the object that should be | ||
175 | * next in this queue. | ||
176 | */ | ||
177 | overrun = 0; | ||
178 | calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) | ||
179 | + queue->cpu_index; | ||
180 | |||
181 | if (unlikely(calc_seq_nr > pd->max_seq_nr)) { | ||
182 | calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; | ||
183 | overrun = 1; | ||
184 | } | ||
185 | |||
186 | if (!list_empty(&reorder->list)) { | ||
187 | padata = list_entry(reorder->list.next, | ||
188 | struct padata_priv, list); | ||
189 | |||
190 | seq_nr = padata->seq_nr; | ||
191 | BUG_ON(calc_seq_nr != seq_nr); | ||
192 | } else { | ||
193 | seq_nr = calc_seq_nr; | ||
194 | empty++; | ||
195 | } | ||
196 | |||
197 | if (next_nr < 0 || seq_nr < next_nr | ||
198 | || (next_overrun && !overrun)) { | ||
199 | next_nr = seq_nr; | ||
200 | next_overrun = overrun; | ||
201 | next_queue = queue; | ||
202 | } | ||
203 | } | ||
204 | |||
205 | padata = NULL; | ||
206 | |||
207 | if (empty == num_cpus) | ||
208 | goto out; | ||
209 | |||
210 | reorder = &next_queue->reorder; | ||
211 | |||
212 | if (!list_empty(&reorder->list)) { | ||
213 | padata = list_entry(reorder->list.next, | ||
214 | struct padata_priv, list); | ||
215 | |||
216 | if (unlikely(next_overrun)) { | ||
217 | for_each_cpu(cpu, pd->cpumask) { | ||
218 | queue = per_cpu_ptr(pd->queue, cpu); | ||
219 | atomic_set(&queue->num_obj, 0); | ||
220 | } | ||
221 | } | ||
222 | |||
223 | spin_lock(&reorder->lock); | ||
224 | list_del_init(&padata->list); | ||
225 | atomic_dec(&pd->reorder_objects); | ||
226 | spin_unlock(&reorder->lock); | ||
227 | |||
228 | atomic_inc(&next_queue->num_obj); | ||
229 | |||
230 | goto out; | ||
231 | } | ||
232 | |||
233 | if (next_nr % num_cpus == next_queue->cpu_index) { | ||
234 | padata = ERR_PTR(-ENODATA); | ||
235 | goto out; | ||
236 | } | ||
237 | |||
238 | padata = ERR_PTR(-EINPROGRESS); | ||
239 | out: | ||
240 | return padata; | ||
241 | } | ||
242 | |||
243 | static void padata_reorder(struct parallel_data *pd) | ||
244 | { | ||
245 | struct padata_priv *padata; | ||
246 | struct padata_queue *queue; | ||
247 | struct padata_instance *pinst = pd->pinst; | ||
248 | |||
249 | try_again: | ||
250 | if (!spin_trylock_bh(&pd->lock)) | ||
251 | goto out; | ||
252 | |||
253 | while (1) { | ||
254 | padata = padata_get_next(pd); | ||
255 | |||
256 | if (!padata || PTR_ERR(padata) == -EINPROGRESS) | ||
257 | break; | ||
258 | |||
259 | if (PTR_ERR(padata) == -ENODATA) { | ||
260 | spin_unlock_bh(&pd->lock); | ||
261 | goto out; | ||
262 | } | ||
263 | |||
264 | queue = per_cpu_ptr(pd->queue, padata->cb_cpu); | ||
265 | |||
266 | spin_lock(&queue->serial.lock); | ||
267 | list_add_tail(&padata->list, &queue->serial.list); | ||
268 | spin_unlock(&queue->serial.lock); | ||
269 | |||
270 | queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); | ||
271 | } | ||
272 | |||
273 | spin_unlock_bh(&pd->lock); | ||
274 | |||
275 | if (atomic_read(&pd->reorder_objects)) | ||
276 | goto try_again; | ||
277 | |||
278 | out: | ||
279 | return; | ||
280 | } | ||
281 | |||
282 | static void padata_serial_worker(struct work_struct *work) | ||
283 | { | ||
284 | struct padata_queue *queue; | ||
285 | struct parallel_data *pd; | ||
286 | LIST_HEAD(local_list); | ||
287 | |||
288 | local_bh_disable(); | ||
289 | queue = container_of(work, struct padata_queue, swork); | ||
290 | pd = queue->pd; | ||
291 | |||
292 | spin_lock(&queue->serial.lock); | ||
293 | list_replace_init(&queue->serial.list, &local_list); | ||
294 | spin_unlock(&queue->serial.lock); | ||
295 | |||
296 | while (!list_empty(&local_list)) { | ||
297 | struct padata_priv *padata; | ||
298 | |||
299 | padata = list_entry(local_list.next, | ||
300 | struct padata_priv, list); | ||
301 | |||
302 | list_del_init(&padata->list); | ||
303 | |||
304 | padata->serial(padata); | ||
305 | atomic_dec(&pd->refcnt); | ||
306 | } | ||
307 | local_bh_enable(); | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * padata_do_serial - padata serialization function | ||
312 | * | ||
313 | * @padata: object to be serialized. | ||
314 | * | ||
315 | * padata_do_serial must be called for every parallelized object. | ||
316 | * The serialization callback function will run with BHs off. | ||
317 | */ | ||
318 | void padata_do_serial(struct padata_priv *padata) | ||
319 | { | ||
320 | int cpu; | ||
321 | struct padata_queue *queue; | ||
322 | struct parallel_data *pd; | ||
323 | |||
324 | pd = padata->pd; | ||
325 | |||
326 | cpu = get_cpu(); | ||
327 | queue = per_cpu_ptr(pd->queue, cpu); | ||
328 | |||
329 | spin_lock(&queue->reorder.lock); | ||
330 | atomic_inc(&pd->reorder_objects); | ||
331 | list_add_tail(&padata->list, &queue->reorder.list); | ||
332 | spin_unlock(&queue->reorder.lock); | ||
333 | |||
334 | put_cpu(); | ||
335 | |||
336 | padata_reorder(pd); | ||
337 | } | ||
338 | EXPORT_SYMBOL(padata_do_serial); | ||
339 | |||
340 | static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | ||
341 | const struct cpumask *cpumask) | ||
342 | { | ||
343 | int cpu, cpu_index, num_cpus; | ||
344 | struct padata_queue *queue; | ||
345 | struct parallel_data *pd; | ||
346 | |||
347 | cpu_index = 0; | ||
348 | |||
349 | pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); | ||
350 | if (!pd) | ||
351 | goto err; | ||
352 | |||
353 | pd->queue = alloc_percpu(struct padata_queue); | ||
354 | if (!pd->queue) | ||
355 | goto err_free_pd; | ||
356 | |||
357 | if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) | ||
358 | goto err_free_queue; | ||
359 | |||
360 | for_each_possible_cpu(cpu) { | ||
361 | queue = per_cpu_ptr(pd->queue, cpu); | ||
362 | |||
363 | queue->pd = pd; | ||
364 | |||
365 | if (cpumask_test_cpu(cpu, cpumask) | ||
366 | && cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
367 | queue->cpu_index = cpu_index; | ||
368 | cpu_index++; | ||
369 | } else | ||
370 | queue->cpu_index = -1; | ||
371 | |||
372 | INIT_LIST_HEAD(&queue->reorder.list); | ||
373 | INIT_LIST_HEAD(&queue->parallel.list); | ||
374 | INIT_LIST_HEAD(&queue->serial.list); | ||
375 | spin_lock_init(&queue->reorder.lock); | ||
376 | spin_lock_init(&queue->parallel.lock); | ||
377 | spin_lock_init(&queue->serial.lock); | ||
378 | |||
379 | INIT_WORK(&queue->pwork, padata_parallel_worker); | ||
380 | INIT_WORK(&queue->swork, padata_serial_worker); | ||
381 | atomic_set(&queue->num_obj, 0); | ||
382 | } | ||
383 | |||
384 | cpumask_and(pd->cpumask, cpumask, cpu_active_mask); | ||
385 | |||
386 | num_cpus = cpumask_weight(pd->cpumask); | ||
387 | pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; | ||
388 | |||
389 | atomic_set(&pd->seq_nr, -1); | ||
390 | atomic_set(&pd->reorder_objects, 0); | ||
391 | atomic_set(&pd->refcnt, 0); | ||
392 | pd->pinst = pinst; | ||
393 | spin_lock_init(&pd->lock); | ||
394 | |||
395 | return pd; | ||
396 | |||
397 | err_free_queue: | ||
398 | free_percpu(pd->queue); | ||
399 | err_free_pd: | ||
400 | kfree(pd); | ||
401 | err: | ||
402 | return NULL; | ||
403 | } | ||
404 | |||
405 | static void padata_free_pd(struct parallel_data *pd) | ||
406 | { | ||
407 | free_cpumask_var(pd->cpumask); | ||
408 | free_percpu(pd->queue); | ||
409 | kfree(pd); | ||
410 | } | ||
411 | |||
412 | static void padata_replace(struct padata_instance *pinst, | ||
413 | struct parallel_data *pd_new) | ||
414 | { | ||
415 | struct parallel_data *pd_old = pinst->pd; | ||
416 | |||
417 | pinst->flags |= PADATA_RESET; | ||
418 | |||
419 | rcu_assign_pointer(pinst->pd, pd_new); | ||
420 | |||
421 | synchronize_rcu(); | ||
422 | |||
423 | while (atomic_read(&pd_old->refcnt) != 0) | ||
424 | yield(); | ||
425 | |||
426 | flush_workqueue(pinst->wq); | ||
427 | |||
428 | padata_free_pd(pd_old); | ||
429 | |||
430 | pinst->flags &= ~PADATA_RESET; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * padata_set_cpumask - set the cpumask that padata should use | ||
435 | * | ||
436 | * @pinst: padata instance | ||
437 | * @cpumask: the cpumask to use | ||
438 | */ | ||
439 | int padata_set_cpumask(struct padata_instance *pinst, | ||
440 | cpumask_var_t cpumask) | ||
441 | { | ||
442 | struct parallel_data *pd; | ||
443 | int err = 0; | ||
444 | |||
445 | might_sleep(); | ||
446 | |||
447 | mutex_lock(&pinst->lock); | ||
448 | |||
449 | pd = padata_alloc_pd(pinst, cpumask); | ||
450 | if (!pd) { | ||
451 | err = -ENOMEM; | ||
452 | goto out; | ||
453 | } | ||
454 | |||
455 | cpumask_copy(pinst->cpumask, cpumask); | ||
456 | |||
457 | padata_replace(pinst, pd); | ||
458 | |||
459 | out: | ||
460 | mutex_unlock(&pinst->lock); | ||
461 | |||
462 | return err; | ||
463 | } | ||
464 | EXPORT_SYMBOL(padata_set_cpumask); | ||
465 | |||
466 | static int __padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
467 | { | ||
468 | struct parallel_data *pd; | ||
469 | |||
470 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
471 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
472 | if (!pd) | ||
473 | return -ENOMEM; | ||
474 | |||
475 | padata_replace(pinst, pd); | ||
476 | } | ||
477 | |||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * padata_add_cpu - add a cpu to the padata cpumask | ||
483 | * | ||
484 | * @pinst: padata instance | ||
485 | * @cpu: cpu to add | ||
486 | */ | ||
487 | int padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
488 | { | ||
489 | int err; | ||
490 | |||
491 | might_sleep(); | ||
492 | |||
493 | mutex_lock(&pinst->lock); | ||
494 | |||
495 | cpumask_set_cpu(cpu, pinst->cpumask); | ||
496 | err = __padata_add_cpu(pinst, cpu); | ||
497 | |||
498 | mutex_unlock(&pinst->lock); | ||
499 | |||
500 | return err; | ||
501 | } | ||
502 | EXPORT_SYMBOL(padata_add_cpu); | ||
503 | |||
504 | static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
505 | { | ||
506 | struct parallel_data *pd; | ||
507 | |||
508 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { | ||
509 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
510 | if (!pd) | ||
511 | return -ENOMEM; | ||
512 | |||
513 | padata_replace(pinst, pd); | ||
514 | } | ||
515 | |||
516 | return 0; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * padata_remove_cpu - remove a cpu from the padata cpumask | ||
521 | * | ||
522 | * @pinst: padata instance | ||
523 | * @cpu: cpu to remove | ||
524 | */ | ||
525 | int padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
526 | { | ||
527 | int err; | ||
528 | |||
529 | might_sleep(); | ||
530 | |||
531 | mutex_lock(&pinst->lock); | ||
532 | |||
533 | cpumask_clear_cpu(cpu, pinst->cpumask); | ||
534 | err = __padata_remove_cpu(pinst, cpu); | ||
535 | |||
536 | mutex_unlock(&pinst->lock); | ||
537 | |||
538 | return err; | ||
539 | } | ||
540 | EXPORT_SYMBOL(padata_remove_cpu); | ||
541 | |||
542 | /* | ||
543 | * padata_start - start the parallel processing | ||
544 | * | ||
545 | * @pinst: padata instance to start | ||
546 | */ | ||
547 | void padata_start(struct padata_instance *pinst) | ||
548 | { | ||
549 | might_sleep(); | ||
550 | |||
551 | mutex_lock(&pinst->lock); | ||
552 | pinst->flags |= PADATA_INIT; | ||
553 | mutex_unlock(&pinst->lock); | ||
554 | } | ||
555 | EXPORT_SYMBOL(padata_start); | ||
556 | |||
557 | /* | ||
558 | * padata_stop - stop the parallel processing | ||
559 | * | ||
560 | * @pinst: padata instance to stop | ||
561 | */ | ||
562 | void padata_stop(struct padata_instance *pinst) | ||
563 | { | ||
564 | might_sleep(); | ||
565 | |||
566 | mutex_lock(&pinst->lock); | ||
567 | pinst->flags &= ~PADATA_INIT; | ||
568 | mutex_unlock(&pinst->lock); | ||
569 | } | ||
570 | EXPORT_SYMBOL(padata_stop); | ||
571 | |||
572 | static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, | ||
573 | unsigned long action, void *hcpu) | ||
574 | { | ||
575 | int err; | ||
576 | struct padata_instance *pinst; | ||
577 | int cpu = (unsigned long)hcpu; | ||
578 | |||
579 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | ||
580 | |||
581 | switch (action) { | ||
582 | case CPU_ONLINE: | ||
583 | case CPU_ONLINE_FROZEN: | ||
584 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
585 | break; | ||
586 | mutex_lock(&pinst->lock); | ||
587 | err = __padata_add_cpu(pinst, cpu); | ||
588 | mutex_unlock(&pinst->lock); | ||
589 | if (err) | ||
590 | return NOTIFY_BAD; | ||
591 | break; | ||
592 | |||
593 | case CPU_DOWN_PREPARE: | ||
594 | case CPU_DOWN_PREPARE_FROZEN: | ||
595 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
596 | break; | ||
597 | mutex_lock(&pinst->lock); | ||
598 | err = __padata_remove_cpu(pinst, cpu); | ||
599 | mutex_unlock(&pinst->lock); | ||
600 | if (err) | ||
601 | return NOTIFY_BAD; | ||
602 | break; | ||
603 | |||
604 | case CPU_UP_CANCELED: | ||
605 | case CPU_UP_CANCELED_FROZEN: | ||
606 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
607 | break; | ||
608 | mutex_lock(&pinst->lock); | ||
609 | __padata_remove_cpu(pinst, cpu); | ||
610 | mutex_unlock(&pinst->lock); | ||
611 | |||
612 | case CPU_DOWN_FAILED: | ||
613 | case CPU_DOWN_FAILED_FROZEN: | ||
614 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
615 | break; | ||
616 | mutex_lock(&pinst->lock); | ||
617 | __padata_add_cpu(pinst, cpu); | ||
618 | mutex_unlock(&pinst->lock); | ||
619 | } | ||
620 | |||
621 | return NOTIFY_OK; | ||
622 | } | ||
623 | |||
624 | /* | ||
625 | * padata_alloc - allocate and initialize a padata instance | ||
626 | * | ||
627 | * @cpumask: cpumask that padata uses for parallelization | ||
628 | * @wq: workqueue to use for the allocated padata instance | ||
629 | */ | ||
630 | struct padata_instance *padata_alloc(const struct cpumask *cpumask, | ||
631 | struct workqueue_struct *wq) | ||
632 | { | ||
633 | int err; | ||
634 | struct padata_instance *pinst; | ||
635 | struct parallel_data *pd; | ||
636 | |||
637 | pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); | ||
638 | if (!pinst) | ||
639 | goto err; | ||
640 | |||
641 | pd = padata_alloc_pd(pinst, cpumask); | ||
642 | if (!pd) | ||
643 | goto err_free_inst; | ||
644 | |||
645 | if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) | ||
646 | goto err_free_pd; | ||
647 | |||
648 | rcu_assign_pointer(pinst->pd, pd); | ||
649 | |||
650 | pinst->wq = wq; | ||
651 | |||
652 | cpumask_copy(pinst->cpumask, cpumask); | ||
653 | |||
654 | pinst->flags = 0; | ||
655 | |||
656 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
657 | pinst->cpu_notifier.priority = 0; | ||
658 | err = register_hotcpu_notifier(&pinst->cpu_notifier); | ||
659 | if (err) | ||
660 | goto err_free_cpumask; | ||
661 | |||
662 | mutex_init(&pinst->lock); | ||
663 | |||
664 | return pinst; | ||
665 | |||
666 | err_free_cpumask: | ||
667 | free_cpumask_var(pinst->cpumask); | ||
668 | err_free_pd: | ||
669 | padata_free_pd(pd); | ||
670 | err_free_inst: | ||
671 | kfree(pinst); | ||
672 | err: | ||
673 | return NULL; | ||
674 | } | ||
675 | EXPORT_SYMBOL(padata_alloc); | ||
676 | |||
677 | /* | ||
678 | * padata_free - free a padata instance | ||
679 | * | ||
680 | * @ padata_inst: padata instance to free | ||
681 | */ | ||
682 | void padata_free(struct padata_instance *pinst) | ||
683 | { | ||
684 | padata_stop(pinst); | ||
685 | |||
686 | synchronize_rcu(); | ||
687 | |||
688 | while (atomic_read(&pinst->pd->refcnt) != 0) | ||
689 | yield(); | ||
690 | |||
691 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | ||
692 | padata_free_pd(pinst->pd); | ||
693 | free_cpumask_var(pinst->cpumask); | ||
694 | kfree(pinst); | ||
695 | } | ||
696 | EXPORT_SYMBOL(padata_free); | ||
diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b8..13d966b4c14a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | |||
36 | 36 | ||
37 | EXPORT_SYMBOL(panic_notifier_list); | 37 | EXPORT_SYMBOL(panic_notifier_list); |
38 | 38 | ||
39 | static long no_blink(long time) | ||
40 | { | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | /* Returns how long it waited in ms */ | 39 | /* Returns how long it waited in ms */ |
45 | long (*panic_blink)(long time); | 40 | long (*panic_blink)(long time); |
46 | EXPORT_SYMBOL(panic_blink); | 41 | EXPORT_SYMBOL(panic_blink); |
47 | 42 | ||
43 | static void panic_blink_one_second(void) | ||
44 | { | ||
45 | static long i = 0, end; | ||
46 | |||
47 | if (panic_blink) { | ||
48 | end = i + MSEC_PER_SEC; | ||
49 | |||
50 | while (i < end) { | ||
51 | i += panic_blink(i); | ||
52 | mdelay(1); | ||
53 | i++; | ||
54 | } | ||
55 | } else { | ||
56 | /* | ||
57 | * When running under a hypervisor a small mdelay may get | ||
58 | * rounded up to the hypervisor timeslice. For example, with | ||
59 | * a 1ms in 10ms hypervisor timeslice we might inflate a | ||
60 | * mdelay(1) loop by 10x. | ||
61 | * | ||
62 | * If we have nothing to blink, spin on 1 second calls to | ||
63 | * mdelay to avoid this. | ||
64 | */ | ||
65 | mdelay(MSEC_PER_SEC); | ||
66 | } | ||
67 | } | ||
68 | |||
48 | /** | 69 | /** |
49 | * panic - halt the system | 70 | * panic - halt the system |
50 | * @fmt: The text string to print | 71 | * @fmt: The text string to print |
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
95 | 116 | ||
96 | bust_spinlocks(0); | 117 | bust_spinlocks(0); |
97 | 118 | ||
98 | if (!panic_blink) | ||
99 | panic_blink = no_blink; | ||
100 | |||
101 | if (panic_timeout > 0) { | 119 | if (panic_timeout > 0) { |
102 | /* | 120 | /* |
103 | * Delay timeout seconds before rebooting the machine. | 121 | * Delay timeout seconds before rebooting the machine. |
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
105 | */ | 123 | */ |
106 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); | 124 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); |
107 | 125 | ||
108 | for (i = 0; i < panic_timeout*1000; ) { | 126 | for (i = 0; i < panic_timeout; i++) { |
109 | touch_nmi_watchdog(); | 127 | touch_nmi_watchdog(); |
110 | i += panic_blink(i); | 128 | panic_blink_one_second(); |
111 | mdelay(1); | ||
112 | i++; | ||
113 | } | 129 | } |
114 | /* | 130 | /* |
115 | * This will not be a clean reboot, with everything | 131 | * This will not be a clean reboot, with everything |
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
135 | } | 151 | } |
136 | #endif | 152 | #endif |
137 | local_irq_enable(); | 153 | local_irq_enable(); |
138 | for (i = 0; ; ) { | 154 | while (1) { |
139 | touch_softlockup_watchdog(); | 155 | touch_softlockup_watchdog(); |
140 | i += panic_blink(i); | 156 | panic_blink_one_second(); |
141 | mdelay(1); | ||
142 | i++; | ||
143 | } | 157 | } |
144 | } | 158 | } |
145 | 159 | ||
diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..d55a53ec9234 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/err.h> | 24 | #include <linux/err.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
27 | #include <linux/string.h> | ||
28 | 27 | ||
29 | #if 0 | 28 | #if 0 |
30 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
517 | new->grp.attrs = attrs; | 516 | new->grp.attrs = attrs; |
518 | 517 | ||
519 | /* Tack new one on the end. */ | 518 | /* Tack new one on the end. */ |
519 | sysfs_attr_init(&new->attrs[num].mattr.attr); | ||
520 | new->attrs[num].param = kp; | 520 | new->attrs[num].param = kp; |
521 | new->attrs[num].mattr.show = param_attr_show; | 521 | new->attrs[num].mattr.show = param_attr_show; |
522 | new->attrs[num].mattr.store = param_attr_store; | 522 | new->attrs[num].mattr.store = param_attr_store; |
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
723 | return ret; | 723 | return ret; |
724 | } | 724 | } |
725 | 725 | ||
726 | static struct sysfs_ops module_sysfs_ops = { | 726 | static const struct sysfs_ops module_sysfs_ops = { |
727 | .show = module_attr_show, | 727 | .show = module_attr_show, |
728 | .store = module_attr_store, | 728 | .store = module_attr_store, |
729 | }; | 729 | }; |
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
737 | return 0; | 737 | return 0; |
738 | } | 738 | } |
739 | 739 | ||
740 | static struct kset_uevent_ops module_uevent_ops = { | 740 | static const struct kset_uevent_ops module_uevent_ops = { |
741 | .filter = uevent_filter, | 741 | .filter = uevent_filter, |
742 | }; | 742 | }; |
743 | 743 | ||
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 482d5e1d3764..e68745053013 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -2595,7 +2595,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2595 | if (user_locked > user_lock_limit) | 2595 | if (user_locked > user_lock_limit) |
2596 | extra = user_locked - user_lock_limit; | 2596 | extra = user_locked - user_lock_limit; |
2597 | 2597 | ||
2598 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2598 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2599 | lock_limit >>= PAGE_SHIFT; | 2599 | lock_limit >>= PAGE_SHIFT; |
2600 | locked = vma->vm_mm->locked_vm + extra; | 2600 | locked = vma->vm_mm->locked_vm + extra; |
2601 | 2601 | ||
@@ -5466,13 +5466,16 @@ void __init perf_event_init(void) | |||
5466 | register_cpu_notifier(&perf_cpu_nb); | 5466 | register_cpu_notifier(&perf_cpu_nb); |
5467 | } | 5467 | } |
5468 | 5468 | ||
5469 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | 5469 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, |
5470 | struct sysdev_class_attribute *attr, | ||
5471 | char *buf) | ||
5470 | { | 5472 | { |
5471 | return sprintf(buf, "%d\n", perf_reserved_percpu); | 5473 | return sprintf(buf, "%d\n", perf_reserved_percpu); |
5472 | } | 5474 | } |
5473 | 5475 | ||
5474 | static ssize_t | 5476 | static ssize_t |
5475 | perf_set_reserve_percpu(struct sysdev_class *class, | 5477 | perf_set_reserve_percpu(struct sysdev_class *class, |
5478 | struct sysdev_class_attribute *attr, | ||
5476 | const char *buf, | 5479 | const char *buf, |
5477 | size_t count) | 5480 | size_t count) |
5478 | { | 5481 | { |
@@ -5501,13 +5504,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, | |||
5501 | return count; | 5504 | return count; |
5502 | } | 5505 | } |
5503 | 5506 | ||
5504 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | 5507 | static ssize_t perf_show_overcommit(struct sysdev_class *class, |
5508 | struct sysdev_class_attribute *attr, | ||
5509 | char *buf) | ||
5505 | { | 5510 | { |
5506 | return sprintf(buf, "%d\n", perf_overcommit); | 5511 | return sprintf(buf, "%d\n", perf_overcommit); |
5507 | } | 5512 | } |
5508 | 5513 | ||
5509 | static ssize_t | 5514 | static ssize_t |
5510 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | 5515 | perf_set_overcommit(struct sysdev_class *class, |
5516 | struct sysdev_class_attribute *attr, | ||
5517 | const char *buf, size_t count) | ||
5511 | { | 5518 | { |
5512 | unsigned long val; | 5519 | unsigned long val; |
5513 | int err; | 5520 | int err; |
diff --git a/kernel/pid.c b/kernel/pid.c index 2e17c9c92cbe..86b296943e5f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
367 | struct task_struct *result = NULL; | 367 | struct task_struct *result = NULL; |
368 | if (pid) { | 368 | if (pid) { |
369 | struct hlist_node *first; | 369 | struct hlist_node *first; |
370 | first = rcu_dereference(pid->tasks[type].first); | 370 | first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); |
371 | if (first) | 371 | if (first) |
372 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 372 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
373 | } | 373 | } |
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
376 | EXPORT_SYMBOL(pid_task); | 376 | EXPORT_SYMBOL(pid_task); |
377 | 377 | ||
378 | /* | 378 | /* |
379 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 379 | * Must be called under rcu_read_lock(). |
380 | */ | 380 | */ |
381 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 381 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
382 | { | 382 | { |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..1a22dfd42df9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
982 | int maxfire; | 982 | int maxfire; |
983 | struct list_head *timers = tsk->cpu_timers; | 983 | struct list_head *timers = tsk->cpu_timers; |
984 | struct signal_struct *const sig = tsk->signal; | 984 | struct signal_struct *const sig = tsk->signal; |
985 | unsigned long soft; | ||
985 | 986 | ||
986 | maxfire = 20; | 987 | maxfire = 20; |
987 | tsk->cputime_expires.prof_exp = cputime_zero; | 988 | tsk->cputime_expires.prof_exp = cputime_zero; |
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1030 | /* | 1031 | /* |
1031 | * Check for the special case thread timers. | 1032 | * Check for the special case thread timers. |
1032 | */ | 1033 | */ |
1033 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | 1034 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); |
1034 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | 1035 | if (soft != RLIM_INFINITY) { |
1035 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | 1036 | unsigned long hard = |
1037 | ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); | ||
1036 | 1038 | ||
1037 | if (hard != RLIM_INFINITY && | 1039 | if (hard != RLIM_INFINITY && |
1038 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | 1040 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { |
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1043 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1045 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
1044 | return; | 1046 | return; |
1045 | } | 1047 | } |
1046 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | 1048 | if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { |
1047 | /* | 1049 | /* |
1048 | * At the soft limit, send a SIGXCPU every second. | 1050 | * At the soft limit, send a SIGXCPU every second. |
1049 | */ | 1051 | */ |
1050 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | 1052 | if (soft < hard) { |
1051 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | 1053 | soft += USEC_PER_SEC; |
1052 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | 1054 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
1053 | USEC_PER_SEC; | ||
1054 | } | 1055 | } |
1055 | printk(KERN_INFO | 1056 | printk(KERN_INFO |
1056 | "RT Watchdog Timeout: %s[%d]\n", | 1057 | "RT Watchdog Timeout: %s[%d]\n", |
@@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1121 | unsigned long long sum_sched_runtime, sched_expires; | 1122 | unsigned long long sum_sched_runtime, sched_expires; |
1122 | struct list_head *timers = sig->cpu_timers; | 1123 | struct list_head *timers = sig->cpu_timers; |
1123 | struct task_cputime cputime; | 1124 | struct task_cputime cputime; |
1125 | unsigned long soft; | ||
1124 | 1126 | ||
1125 | /* | 1127 | /* |
1126 | * Don't sample the current process CPU clocks if there are no timers. | 1128 | * Don't sample the current process CPU clocks if there are no timers. |
@@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1193 | SIGPROF); | 1195 | SIGPROF); |
1194 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, | 1196 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, |
1195 | SIGVTALRM); | 1197 | SIGVTALRM); |
1196 | 1198 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | |
1197 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | 1199 | if (soft != RLIM_INFINITY) { |
1198 | unsigned long psecs = cputime_to_secs(ptime); | 1200 | unsigned long psecs = cputime_to_secs(ptime); |
1201 | unsigned long hard = | ||
1202 | ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); | ||
1199 | cputime_t x; | 1203 | cputime_t x; |
1200 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { | 1204 | if (psecs >= hard) { |
1201 | /* | 1205 | /* |
1202 | * At the hard limit, we just die. | 1206 | * At the hard limit, we just die. |
1203 | * No need to calculate anything else now. | 1207 | * No need to calculate anything else now. |
@@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk, | |||
1205 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1209 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
1206 | return; | 1210 | return; |
1207 | } | 1211 | } |
1208 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { | 1212 | if (psecs >= soft) { |
1209 | /* | 1213 | /* |
1210 | * At the soft limit, send a SIGXCPU every second. | 1214 | * At the soft limit, send a SIGXCPU every second. |
1211 | */ | 1215 | */ |
1212 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 1216 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
1213 | if (sig->rlim[RLIMIT_CPU].rlim_cur | 1217 | if (soft < hard) { |
1214 | < sig->rlim[RLIMIT_CPU].rlim_max) { | 1218 | soft++; |
1215 | sig->rlim[RLIMIT_CPU].rlim_cur++; | 1219 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; |
1216 | } | 1220 | } |
1217 | } | 1221 | } |
1218 | x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 1222 | x = secs_to_cputime(soft); |
1219 | if (cputime_eq(prof_expires, cputime_zero) || | 1223 | if (cputime_eq(prof_expires, cputime_zero) || |
1220 | cputime_lt(x, prof_expires)) { | 1224 | cputime_lt(x, prof_expires)) { |
1221 | prof_expires = x; | 1225 | prof_expires = x; |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 495440779ce3..00d1fda58ab6 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, | |||
256 | return 0; | 256 | return 0; |
257 | } | 257 | } |
258 | 258 | ||
259 | int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) | 259 | static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) |
260 | { | 260 | { |
261 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 261 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
262 | return 0; | 262 | return 0; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb2..5c36ea9d55d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -27,6 +27,15 @@ config PM_DEBUG | |||
27 | code. This is helpful when debugging and reporting PM bugs, like | 27 | code. This is helpful when debugging and reporting PM bugs, like |
28 | suspend support. | 28 | suspend support. |
29 | 29 | ||
30 | config PM_ADVANCED_DEBUG | ||
31 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
32 | depends on PM_DEBUG | ||
33 | default n | ||
34 | ---help--- | ||
35 | Add extra sysfs attributes allowing one to access some Power Management | ||
36 | fields of device objects from user space. If you are not a kernel | ||
37 | developer interested in debugging/testing Power Management, say "no". | ||
38 | |||
30 | config PM_VERBOSE | 39 | config PM_VERBOSE |
31 | bool "Verbose Power Management debugging" | 40 | bool "Verbose Power Management debugging" |
32 | depends on PM_DEBUG | 41 | depends on PM_DEBUG |
@@ -85,6 +94,11 @@ config PM_SLEEP | |||
85 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | 94 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE |
86 | default y | 95 | default y |
87 | 96 | ||
97 | config PM_SLEEP_ADVANCED_DEBUG | ||
98 | bool | ||
99 | depends on PM_ADVANCED_DEBUG | ||
100 | default n | ||
101 | |||
88 | config SUSPEND | 102 | config SUSPEND |
89 | bool "Suspend to RAM and standby" | 103 | bool "Suspend to RAM and standby" |
90 | depends on PM && ARCH_SUSPEND_POSSIBLE | 104 | depends on PM && ARCH_SUSPEND_POSSIBLE |
@@ -222,3 +236,8 @@ config PM_RUNTIME | |||
222 | and the bus type drivers of the buses the devices are on are | 236 | and the bus type drivers of the buses the devices are on are |
223 | responsible for the actual handling of the autosuspend requests and | 237 | responsible for the actual handling of the autosuspend requests and |
224 | wake-up events. | 238 | wake-up events. |
239 | |||
240 | config PM_OPS | ||
241 | bool | ||
242 | depends on PM_SLEEP || PM_RUNTIME | ||
243 | default y | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..da5288ec2392 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -323,6 +323,7 @@ static int create_image(int platform_mode) | |||
323 | int hibernation_snapshot(int platform_mode) | 323 | int hibernation_snapshot(int platform_mode) |
324 | { | 324 | { |
325 | int error; | 325 | int error; |
326 | gfp_t saved_mask; | ||
326 | 327 | ||
327 | error = platform_begin(platform_mode); | 328 | error = platform_begin(platform_mode); |
328 | if (error) | 329 | if (error) |
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) | |||
334 | goto Close; | 335 | goto Close; |
335 | 336 | ||
336 | suspend_console(); | 337 | suspend_console(); |
338 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
337 | error = dpm_suspend_start(PMSG_FREEZE); | 339 | error = dpm_suspend_start(PMSG_FREEZE); |
338 | if (error) | 340 | if (error) |
339 | goto Recover_platform; | 341 | goto Recover_platform; |
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) | |||
351 | 353 | ||
352 | dpm_resume_end(in_suspend ? | 354 | dpm_resume_end(in_suspend ? |
353 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 355 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
356 | set_gfp_allowed_mask(saved_mask); | ||
354 | resume_console(); | 357 | resume_console(); |
355 | Close: | 358 | Close: |
356 | platform_end(platform_mode); | 359 | platform_end(platform_mode); |
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) | |||
445 | int hibernation_restore(int platform_mode) | 448 | int hibernation_restore(int platform_mode) |
446 | { | 449 | { |
447 | int error; | 450 | int error; |
451 | gfp_t saved_mask; | ||
448 | 452 | ||
449 | pm_prepare_console(); | 453 | pm_prepare_console(); |
450 | suspend_console(); | 454 | suspend_console(); |
455 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
451 | error = dpm_suspend_start(PMSG_QUIESCE); | 456 | error = dpm_suspend_start(PMSG_QUIESCE); |
452 | if (!error) { | 457 | if (!error) { |
453 | error = resume_target_kernel(platform_mode); | 458 | error = resume_target_kernel(platform_mode); |
454 | dpm_resume_end(PMSG_RECOVER); | 459 | dpm_resume_end(PMSG_RECOVER); |
455 | } | 460 | } |
461 | set_gfp_allowed_mask(saved_mask); | ||
456 | resume_console(); | 462 | resume_console(); |
457 | pm_restore_console(); | 463 | pm_restore_console(); |
458 | return error; | 464 | return error; |
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) | |||
466 | int hibernation_platform_enter(void) | 472 | int hibernation_platform_enter(void) |
467 | { | 473 | { |
468 | int error; | 474 | int error; |
475 | gfp_t saved_mask; | ||
469 | 476 | ||
470 | if (!hibernation_ops) | 477 | if (!hibernation_ops) |
471 | return -ENOSYS; | 478 | return -ENOSYS; |
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void) | |||
481 | 488 | ||
482 | entering_platform_hibernation = true; | 489 | entering_platform_hibernation = true; |
483 | suspend_console(); | 490 | suspend_console(); |
491 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
484 | error = dpm_suspend_start(PMSG_HIBERNATE); | 492 | error = dpm_suspend_start(PMSG_HIBERNATE); |
485 | if (error) { | 493 | if (error) { |
486 | if (hibernation_ops->recover) | 494 | if (hibernation_ops->recover) |
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void) | |||
518 | Resume_devices: | 526 | Resume_devices: |
519 | entering_platform_hibernation = false; | 527 | entering_platform_hibernation = false; |
520 | dpm_resume_end(PMSG_RESTORE); | 528 | dpm_resume_end(PMSG_RESTORE); |
529 | set_gfp_allowed_mask(saved_mask); | ||
521 | resume_console(); | 530 | resume_console(); |
522 | 531 | ||
523 | Close: | 532 | Close: |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c7139053..b58800b21fc0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) | |||
44 | == NOTIFY_BAD) ? -EINVAL : 0; | 44 | == NOTIFY_BAD) ? -EINVAL : 0; |
45 | } | 45 | } |
46 | 46 | ||
47 | /* If set, devices may be suspended and resumed asynchronously. */ | ||
48 | int pm_async_enabled = 1; | ||
49 | |||
50 | static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
51 | char *buf) | ||
52 | { | ||
53 | return sprintf(buf, "%d\n", pm_async_enabled); | ||
54 | } | ||
55 | |||
56 | static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
57 | const char *buf, size_t n) | ||
58 | { | ||
59 | unsigned long val; | ||
60 | |||
61 | if (strict_strtoul(buf, 10, &val)) | ||
62 | return -EINVAL; | ||
63 | |||
64 | if (val > 1) | ||
65 | return -EINVAL; | ||
66 | |||
67 | pm_async_enabled = val; | ||
68 | return n; | ||
69 | } | ||
70 | |||
71 | power_attr(pm_async); | ||
72 | |||
47 | #ifdef CONFIG_PM_DEBUG | 73 | #ifdef CONFIG_PM_DEBUG |
48 | int pm_test_level = TEST_NONE; | 74 | int pm_test_level = TEST_NONE; |
49 | 75 | ||
@@ -208,9 +234,12 @@ static struct attribute * g[] = { | |||
208 | #ifdef CONFIG_PM_TRACE | 234 | #ifdef CONFIG_PM_TRACE |
209 | &pm_trace_attr.attr, | 235 | &pm_trace_attr.attr, |
210 | #endif | 236 | #endif |
211 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) | 237 | #ifdef CONFIG_PM_SLEEP |
238 | &pm_async_attr.attr, | ||
239 | #ifdef CONFIG_PM_DEBUG | ||
212 | &pm_test_attr.attr, | 240 | &pm_test_attr.attr, |
213 | #endif | 241 | #endif |
242 | #endif | ||
214 | NULL, | 243 | NULL, |
215 | }; | 244 | }; |
216 | 245 | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e4330..830cadecbdfc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void) | |||
1181 | 1181 | ||
1182 | memory_bm_position_reset(©_bm); | 1182 | memory_bm_position_reset(©_bm); |
1183 | 1183 | ||
1184 | while (to_free_normal > 0 && to_free_highmem > 0) { | 1184 | while (to_free_normal > 0 || to_free_highmem > 0) { |
1185 | unsigned long pfn = memory_bm_next_pfn(©_bm); | 1185 | unsigned long pfn = memory_bm_next_pfn(©_bm); |
1186 | struct page *page = pfn_to_page(pfn); | 1186 | struct page *page = pfn_to_page(pfn); |
1187 | 1187 | ||
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void) | |||
1500 | { | 1500 | { |
1501 | unsigned int nr_pages, nr_highmem; | 1501 | unsigned int nr_pages, nr_highmem; |
1502 | 1502 | ||
1503 | printk(KERN_INFO "PM: Creating hibernation image: \n"); | 1503 | printk(KERN_INFO "PM: Creating hibernation image:\n"); |
1504 | 1504 | ||
1505 | drain_local_pages(NULL); | 1505 | drain_local_pages(NULL); |
1506 | nr_pages = count_data_pages(); | 1506 | nr_pages = count_data_pages(); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..44cce10b582d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) | |||
189 | int suspend_devices_and_enter(suspend_state_t state) | 189 | int suspend_devices_and_enter(suspend_state_t state) |
190 | { | 190 | { |
191 | int error; | 191 | int error; |
192 | gfp_t saved_mask; | ||
192 | 193 | ||
193 | if (!suspend_ops) | 194 | if (!suspend_ops) |
194 | return -ENOSYS; | 195 | return -ENOSYS; |
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
199 | goto Close; | 200 | goto Close; |
200 | } | 201 | } |
201 | suspend_console(); | 202 | suspend_console(); |
203 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
202 | suspend_test_start(); | 204 | suspend_test_start(); |
203 | error = dpm_suspend_start(PMSG_SUSPEND); | 205 | error = dpm_suspend_start(PMSG_SUSPEND); |
204 | if (error) { | 206 | if (error) { |
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
215 | suspend_test_start(); | 217 | suspend_test_start(); |
216 | dpm_resume_end(PMSG_RESUME); | 218 | dpm_resume_end(PMSG_RESUME); |
217 | suspend_test_finish("resume devices"); | 219 | suspend_test_finish("resume devices"); |
220 | set_gfp_allowed_mask(saved_mask); | ||
218 | resume_console(); | 221 | resume_console(); |
219 | Close: | 222 | Close: |
220 | if (suspend_ops->end) | 223 | if (suspend_ops->end) |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9d..1d575733d4e1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p) | |||
657 | struct swsusp_info *header; | 657 | struct swsusp_info *header; |
658 | 658 | ||
659 | *flags_p = swsusp_header->flags; | 659 | *flags_p = swsusp_header->flags; |
660 | if (IS_ERR(resume_bdev)) { | ||
661 | pr_debug("PM: Image device not initialised\n"); | ||
662 | return PTR_ERR(resume_bdev); | ||
663 | } | ||
664 | 660 | ||
665 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 661 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
666 | error = snapshot_write_next(&snapshot, PAGE_SIZE); | 662 | error = snapshot_write_next(&snapshot, PAGE_SIZE); |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd1893..000000000000 --- a/kernel/power/swsusp.c +++ /dev/null | |||
@@ -1,58 +0,0 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/swsusp.c | ||
3 | * | ||
4 | * This file provides code to write suspend image to swap and read it back. | ||
5 | * | ||
6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | ||
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | ||
8 | * | ||
9 | * This file is released under the GPLv2. | ||
10 | * | ||
11 | * I'd like to thank the following people for their work: | ||
12 | * | ||
13 | * Pavel Machek <pavel@ucw.cz>: | ||
14 | * Modifications, defectiveness pointing, being with me at the very beginning, | ||
15 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | ||
16 | * | ||
17 | * Steve Doddi <dirk@loth.demon.co.uk>: | ||
18 | * Support the possibility of hardware state restoring. | ||
19 | * | ||
20 | * Raph <grey.havens@earthling.net>: | ||
21 | * Support for preserving states of network devices and virtual console | ||
22 | * (including X and svgatextmode) | ||
23 | * | ||
24 | * Kurt Garloff <garloff@suse.de>: | ||
25 | * Straightened the critical function in order to prevent compilers from | ||
26 | * playing tricks with local variables. | ||
27 | * | ||
28 | * Andreas Mohr <a.mohr@mailto.de> | ||
29 | * | ||
30 | * Alex Badea <vampire@go.ro>: | ||
31 | * Fixed runaway init | ||
32 | * | ||
33 | * Rafael J. Wysocki <rjw@sisk.pl> | ||
34 | * Reworked the freeing of memory and the handling of swap | ||
35 | * | ||
36 | * More state savers are welcome. Especially for the scsi layer... | ||
37 | * | ||
38 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | ||
39 | */ | ||
40 | |||
41 | #include <linux/mm.h> | ||
42 | #include <linux/suspend.h> | ||
43 | #include <linux/spinlock.h> | ||
44 | #include <linux/kernel.h> | ||
45 | #include <linux/major.h> | ||
46 | #include <linux/swap.h> | ||
47 | #include <linux/pm.h> | ||
48 | #include <linux/swapops.h> | ||
49 | #include <linux/bootmem.h> | ||
50 | #include <linux/syscalls.h> | ||
51 | #include <linux/highmem.h> | ||
52 | #include <linux/time.h> | ||
53 | #include <linux/rbtree.h> | ||
54 | #include <linux/io.h> | ||
55 | |||
56 | #include "power.h" | ||
57 | |||
58 | int in_suspend __nosavedata = 0; | ||
diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f0..4d2289626a84 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
195 | return res; | 195 | return res; |
196 | } | 196 | } |
197 | 197 | ||
198 | static void snapshot_deprecated_ioctl(unsigned int cmd) | ||
199 | { | ||
200 | if (printk_ratelimit()) | ||
201 | printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " | ||
202 | "be removed soon, update your suspend-to-disk " | ||
203 | "utilities\n", | ||
204 | __builtin_return_address(0), cmd); | ||
205 | } | ||
206 | |||
198 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, | 207 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, |
199 | unsigned long arg) | 208 | unsigned long arg) |
200 | { | 209 | { |
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
246 | data->frozen = 0; | 255 | data->frozen = 0; |
247 | break; | 256 | break; |
248 | 257 | ||
249 | case SNAPSHOT_CREATE_IMAGE: | ||
250 | case SNAPSHOT_ATOMIC_SNAPSHOT: | 258 | case SNAPSHOT_ATOMIC_SNAPSHOT: |
259 | snapshot_deprecated_ioctl(cmd); | ||
260 | case SNAPSHOT_CREATE_IMAGE: | ||
251 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 261 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
252 | error = -EPERM; | 262 | error = -EPERM; |
253 | break; | 263 | break; |
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
275 | data->ready = 0; | 285 | data->ready = 0; |
276 | break; | 286 | break; |
277 | 287 | ||
278 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
279 | case SNAPSHOT_SET_IMAGE_SIZE: | 288 | case SNAPSHOT_SET_IMAGE_SIZE: |
289 | snapshot_deprecated_ioctl(cmd); | ||
290 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
280 | image_size = arg; | 291 | image_size = arg; |
281 | break; | 292 | break; |
282 | 293 | ||
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
290 | error = put_user(size, (loff_t __user *)arg); | 301 | error = put_user(size, (loff_t __user *)arg); |
291 | break; | 302 | break; |
292 | 303 | ||
293 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
294 | case SNAPSHOT_AVAIL_SWAP: | 304 | case SNAPSHOT_AVAIL_SWAP: |
305 | snapshot_deprecated_ioctl(cmd); | ||
306 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
295 | size = count_swap_pages(data->swap, 1); | 307 | size = count_swap_pages(data->swap, 1); |
296 | size <<= PAGE_SHIFT; | 308 | size <<= PAGE_SHIFT; |
297 | error = put_user(size, (loff_t __user *)arg); | 309 | error = put_user(size, (loff_t __user *)arg); |
298 | break; | 310 | break; |
299 | 311 | ||
300 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
301 | case SNAPSHOT_GET_SWAP_PAGE: | 312 | case SNAPSHOT_GET_SWAP_PAGE: |
313 | snapshot_deprecated_ioctl(cmd); | ||
314 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
302 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 315 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
303 | error = -ENODEV; | 316 | error = -ENODEV; |
304 | break; | 317 | break; |
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
321 | break; | 334 | break; |
322 | 335 | ||
323 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ | 336 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ |
337 | snapshot_deprecated_ioctl(cmd); | ||
324 | if (!swsusp_swap_in_use()) { | 338 | if (!swsusp_swap_in_use()) { |
325 | /* | 339 | /* |
326 | * User space encodes device types as two-byte values, | 340 | * User space encodes device types as two-byte values, |
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
362 | break; | 376 | break; |
363 | 377 | ||
364 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | 378 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ |
379 | snapshot_deprecated_ioctl(cmd); | ||
365 | error = -EINVAL; | 380 | error = -EINVAL; |
366 | 381 | ||
367 | switch (arg) { | 382 | switch (arg) { |
diff --git a/kernel/printk.c b/kernel/printk.c index 1751c456b71f..75077ad0b537 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/kexec.h> | 35 | #include <linux/kexec.h> |
36 | #include <linux/ratelimit.h> | 36 | #include <linux/ratelimit.h> |
37 | #include <linux/kmsg_dump.h> | 37 | #include <linux/kmsg_dump.h> |
38 | #include <linux/syslog.h> | ||
38 | 39 | ||
39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
40 | 41 | ||
@@ -69,8 +70,6 @@ int console_printk[4] = { | |||
69 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 70 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
70 | }; | 71 | }; |
71 | 72 | ||
72 | static int saved_console_loglevel = -1; | ||
73 | |||
74 | /* | 73 | /* |
75 | * Low level drivers may need that to know if they can schedule in | 74 | * Low level drivers may need that to know if they can schedule in |
76 | * their unblank() callback or not. So let's export it. | 75 | * their unblank() callback or not. So let's export it. |
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN]; | |||
145 | static char *log_buf = __log_buf; | 144 | static char *log_buf = __log_buf; |
146 | static int log_buf_len = __LOG_BUF_LEN; | 145 | static int log_buf_len = __LOG_BUF_LEN; |
147 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 146 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ |
147 | static int saved_console_loglevel = -1; | ||
148 | 148 | ||
149 | #ifdef CONFIG_KEXEC | 149 | #ifdef CONFIG_KEXEC |
150 | /* | 150 | /* |
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void) | |||
258 | } | 258 | } |
259 | #endif | 259 | #endif |
260 | 260 | ||
261 | /* | 261 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
262 | * Commands to do_syslog: | ||
263 | * | ||
264 | * 0 -- Close the log. Currently a NOP. | ||
265 | * 1 -- Open the log. Currently a NOP. | ||
266 | * 2 -- Read from the log. | ||
267 | * 3 -- Read all messages remaining in the ring buffer. | ||
268 | * 4 -- Read and clear all messages remaining in the ring buffer | ||
269 | * 5 -- Clear ring buffer. | ||
270 | * 6 -- Disable printk's to console | ||
271 | * 7 -- Enable printk's to console | ||
272 | * 8 -- Set level of messages printed to console | ||
273 | * 9 -- Return number of unread characters in the log buffer | ||
274 | * 10 -- Return size of the log buffer | ||
275 | */ | ||
276 | int do_syslog(int type, char __user *buf, int len) | ||
277 | { | 262 | { |
278 | unsigned i, j, limit, count; | 263 | unsigned i, j, limit, count; |
279 | int do_clear = 0; | 264 | int do_clear = 0; |
280 | char c; | 265 | char c; |
281 | int error = 0; | 266 | int error = 0; |
282 | 267 | ||
283 | error = security_syslog(type); | 268 | error = security_syslog(type, from_file); |
284 | if (error) | 269 | if (error) |
285 | return error; | 270 | return error; |
286 | 271 | ||
287 | switch (type) { | 272 | switch (type) { |
288 | case 0: /* Close log */ | 273 | case SYSLOG_ACTION_CLOSE: /* Close log */ |
289 | break; | 274 | break; |
290 | case 1: /* Open log */ | 275 | case SYSLOG_ACTION_OPEN: /* Open log */ |
291 | break; | 276 | break; |
292 | case 2: /* Read from log */ | 277 | case SYSLOG_ACTION_READ: /* Read from log */ |
293 | error = -EINVAL; | 278 | error = -EINVAL; |
294 | if (!buf || len < 0) | 279 | if (!buf || len < 0) |
295 | goto out; | 280 | goto out; |
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
320 | if (!error) | 305 | if (!error) |
321 | error = i; | 306 | error = i; |
322 | break; | 307 | break; |
323 | case 4: /* Read/clear last kernel messages */ | 308 | /* Read/clear last kernel messages */ |
309 | case SYSLOG_ACTION_READ_CLEAR: | ||
324 | do_clear = 1; | 310 | do_clear = 1; |
325 | /* FALL THRU */ | 311 | /* FALL THRU */ |
326 | case 3: /* Read last kernel messages */ | 312 | /* Read last kernel messages */ |
313 | case SYSLOG_ACTION_READ_ALL: | ||
327 | error = -EINVAL; | 314 | error = -EINVAL; |
328 | if (!buf || len < 0) | 315 | if (!buf || len < 0) |
329 | goto out; | 316 | goto out; |
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len) | |||
376 | } | 363 | } |
377 | } | 364 | } |
378 | break; | 365 | break; |
379 | case 5: /* Clear ring buffer */ | 366 | /* Clear ring buffer */ |
367 | case SYSLOG_ACTION_CLEAR: | ||
380 | logged_chars = 0; | 368 | logged_chars = 0; |
381 | break; | 369 | break; |
382 | case 6: /* Disable logging to console */ | 370 | /* Disable logging to console */ |
371 | case SYSLOG_ACTION_CONSOLE_OFF: | ||
383 | if (saved_console_loglevel == -1) | 372 | if (saved_console_loglevel == -1) |
384 | saved_console_loglevel = console_loglevel; | 373 | saved_console_loglevel = console_loglevel; |
385 | console_loglevel = minimum_console_loglevel; | 374 | console_loglevel = minimum_console_loglevel; |
386 | break; | 375 | break; |
387 | case 7: /* Enable logging to console */ | 376 | /* Enable logging to console */ |
377 | case SYSLOG_ACTION_CONSOLE_ON: | ||
388 | if (saved_console_loglevel != -1) { | 378 | if (saved_console_loglevel != -1) { |
389 | console_loglevel = saved_console_loglevel; | 379 | console_loglevel = saved_console_loglevel; |
390 | saved_console_loglevel = -1; | 380 | saved_console_loglevel = -1; |
391 | } | 381 | } |
392 | break; | 382 | break; |
393 | case 8: /* Set level of messages printed to console */ | 383 | /* Set level of messages printed to console */ |
384 | case SYSLOG_ACTION_CONSOLE_LEVEL: | ||
394 | error = -EINVAL; | 385 | error = -EINVAL; |
395 | if (len < 1 || len > 8) | 386 | if (len < 1 || len > 8) |
396 | goto out; | 387 | goto out; |
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
401 | saved_console_loglevel = -1; | 392 | saved_console_loglevel = -1; |
402 | error = 0; | 393 | error = 0; |
403 | break; | 394 | break; |
404 | case 9: /* Number of chars in the log buffer */ | 395 | /* Number of chars in the log buffer */ |
396 | case SYSLOG_ACTION_SIZE_UNREAD: | ||
405 | error = log_end - log_start; | 397 | error = log_end - log_start; |
406 | break; | 398 | break; |
407 | case 10: /* Size of the log buffer */ | 399 | /* Size of the log buffer */ |
400 | case SYSLOG_ACTION_SIZE_BUFFER: | ||
408 | error = log_buf_len; | 401 | error = log_buf_len; |
409 | break; | 402 | break; |
410 | default: | 403 | default: |
@@ -417,7 +410,7 @@ out: | |||
417 | 410 | ||
418 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | 411 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) |
419 | { | 412 | { |
420 | return do_syslog(type, buf, len); | 413 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
421 | } | 414 | } |
422 | 415 | ||
423 | /* | 416 | /* |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 23bd09cd042e..42ad8ae729a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/regset.h> | ||
25 | 26 | ||
26 | 27 | ||
27 | /* | 28 | /* |
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data) | |||
511 | return 0; | 512 | return 0; |
512 | } | 513 | } |
513 | 514 | ||
515 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
516 | |||
517 | static const struct user_regset * | ||
518 | find_regset(const struct user_regset_view *view, unsigned int type) | ||
519 | { | ||
520 | const struct user_regset *regset; | ||
521 | int n; | ||
522 | |||
523 | for (n = 0; n < view->n; ++n) { | ||
524 | regset = view->regsets + n; | ||
525 | if (regset->core_note_type == type) | ||
526 | return regset; | ||
527 | } | ||
528 | |||
529 | return NULL; | ||
530 | } | ||
531 | |||
532 | static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | ||
533 | struct iovec *kiov) | ||
534 | { | ||
535 | const struct user_regset_view *view = task_user_regset_view(task); | ||
536 | const struct user_regset *regset = find_regset(view, type); | ||
537 | int regset_no; | ||
538 | |||
539 | if (!regset || (kiov->iov_len % regset->size) != 0) | ||
540 | return -EINVAL; | ||
541 | |||
542 | regset_no = regset - view->regsets; | ||
543 | kiov->iov_len = min(kiov->iov_len, | ||
544 | (__kernel_size_t) (regset->n * regset->size)); | ||
545 | |||
546 | if (req == PTRACE_GETREGSET) | ||
547 | return copy_regset_to_user(task, view, regset_no, 0, | ||
548 | kiov->iov_len, kiov->iov_base); | ||
549 | else | ||
550 | return copy_regset_from_user(task, view, regset_no, 0, | ||
551 | kiov->iov_len, kiov->iov_base); | ||
552 | } | ||
553 | |||
554 | #endif | ||
555 | |||
514 | int ptrace_request(struct task_struct *child, long request, | 556 | int ptrace_request(struct task_struct *child, long request, |
515 | long addr, long data) | 557 | long addr, long data) |
516 | { | 558 | { |
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
573 | return 0; | 615 | return 0; |
574 | return ptrace_resume(child, request, SIGKILL); | 616 | return ptrace_resume(child, request, SIGKILL); |
575 | 617 | ||
618 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
619 | case PTRACE_GETREGSET: | ||
620 | case PTRACE_SETREGSET: | ||
621 | { | ||
622 | struct iovec kiov; | ||
623 | struct iovec __user *uiov = (struct iovec __user *) data; | ||
624 | |||
625 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
626 | return -EFAULT; | ||
627 | |||
628 | if (__get_user(kiov.iov_base, &uiov->iov_base) || | ||
629 | __get_user(kiov.iov_len, &uiov->iov_len)) | ||
630 | return -EFAULT; | ||
631 | |||
632 | ret = ptrace_regset(child, request, addr, &kiov); | ||
633 | if (!ret) | ||
634 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
635 | break; | ||
636 | } | ||
637 | #endif | ||
576 | default: | 638 | default: |
577 | break; | 639 | break; |
578 | } | 640 | } |
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
711 | else | 773 | else |
712 | ret = ptrace_setsiginfo(child, &siginfo); | 774 | ret = ptrace_setsiginfo(child, &siginfo); |
713 | break; | 775 | break; |
776 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
777 | case PTRACE_GETREGSET: | ||
778 | case PTRACE_SETREGSET: | ||
779 | { | ||
780 | struct iovec kiov; | ||
781 | struct compat_iovec __user *uiov = | ||
782 | (struct compat_iovec __user *) datap; | ||
783 | compat_uptr_t ptr; | ||
784 | compat_size_t len; | ||
785 | |||
786 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
787 | return -EFAULT; | ||
788 | |||
789 | if (__get_user(ptr, &uiov->iov_base) || | ||
790 | __get_user(len, &uiov->iov_len)) | ||
791 | return -EFAULT; | ||
792 | |||
793 | kiov.iov_base = compat_ptr(ptr); | ||
794 | kiov.iov_len = len; | ||
795 | |||
796 | ret = ptrace_regset(child, request, addr, &kiov); | ||
797 | if (!ret) | ||
798 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
799 | break; | ||
800 | } | ||
801 | #endif | ||
714 | 802 | ||
715 | default: | 803 | default: |
716 | ret = ptrace_request(child, request, addr, data); | 804 | ret = ptrace_request(child, request, addr, data); |
diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 000000000000..74e2e6114927 --- /dev/null +++ b/kernel/range.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* | ||
2 | * Range add and subtract | ||
3 | */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/sort.h> | ||
7 | |||
8 | #include <linux/range.h> | ||
9 | |||
10 | #ifndef ARRAY_SIZE | ||
11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
12 | #endif | ||
13 | |||
14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | ||
15 | { | ||
16 | if (start >= end) | ||
17 | return nr_range; | ||
18 | |||
19 | /* Out of slots: */ | ||
20 | if (nr_range >= az) | ||
21 | return nr_range; | ||
22 | |||
23 | range[nr_range].start = start; | ||
24 | range[nr_range].end = end; | ||
25 | |||
26 | nr_range++; | ||
27 | |||
28 | return nr_range; | ||
29 | } | ||
30 | |||
31 | int add_range_with_merge(struct range *range, int az, int nr_range, | ||
32 | u64 start, u64 end) | ||
33 | { | ||
34 | int i; | ||
35 | |||
36 | if (start >= end) | ||
37 | return nr_range; | ||
38 | |||
39 | /* Try to merge it with old one: */ | ||
40 | for (i = 0; i < nr_range; i++) { | ||
41 | u64 final_start, final_end; | ||
42 | u64 common_start, common_end; | ||
43 | |||
44 | if (!range[i].end) | ||
45 | continue; | ||
46 | |||
47 | common_start = max(range[i].start, start); | ||
48 | common_end = min(range[i].end, end); | ||
49 | if (common_start > common_end) | ||
50 | continue; | ||
51 | |||
52 | final_start = min(range[i].start, start); | ||
53 | final_end = max(range[i].end, end); | ||
54 | |||
55 | range[i].start = final_start; | ||
56 | range[i].end = final_end; | ||
57 | return nr_range; | ||
58 | } | ||
59 | |||
60 | /* Need to add it: */ | ||
61 | return add_range(range, az, nr_range, start, end); | ||
62 | } | ||
63 | |||
64 | void subtract_range(struct range *range, int az, u64 start, u64 end) | ||
65 | { | ||
66 | int i, j; | ||
67 | |||
68 | if (start >= end) | ||
69 | return; | ||
70 | |||
71 | for (j = 0; j < az; j++) { | ||
72 | if (!range[j].end) | ||
73 | continue; | ||
74 | |||
75 | if (start <= range[j].start && end >= range[j].end) { | ||
76 | range[j].start = 0; | ||
77 | range[j].end = 0; | ||
78 | continue; | ||
79 | } | ||
80 | |||
81 | if (start <= range[j].start && end < range[j].end && | ||
82 | range[j].start < end) { | ||
83 | range[j].start = end; | ||
84 | continue; | ||
85 | } | ||
86 | |||
87 | |||
88 | if (start > range[j].start && end >= range[j].end && | ||
89 | range[j].end > start) { | ||
90 | range[j].end = start; | ||
91 | continue; | ||
92 | } | ||
93 | |||
94 | if (start > range[j].start && end < range[j].end) { | ||
95 | /* Find the new spare: */ | ||
96 | for (i = 0; i < az; i++) { | ||
97 | if (range[i].end == 0) | ||
98 | break; | ||
99 | } | ||
100 | if (i < az) { | ||
101 | range[i].end = range[j].end; | ||
102 | range[i].start = end; | ||
103 | } else { | ||
104 | printk(KERN_ERR "run of slot in ranges\n"); | ||
105 | } | ||
106 | range[j].end = start; | ||
107 | continue; | ||
108 | } | ||
109 | } | ||
110 | } | ||
111 | |||
112 | static int cmp_range(const void *x1, const void *x2) | ||
113 | { | ||
114 | const struct range *r1 = x1; | ||
115 | const struct range *r2 = x2; | ||
116 | s64 start1, start2; | ||
117 | |||
118 | start1 = r1->start; | ||
119 | start2 = r2->start; | ||
120 | |||
121 | return start1 - start2; | ||
122 | } | ||
123 | |||
124 | int clean_sort_range(struct range *range, int az) | ||
125 | { | ||
126 | int i, j, k = az - 1, nr_range = 0; | ||
127 | |||
128 | for (i = 0; i < k; i++) { | ||
129 | if (range[i].end) | ||
130 | continue; | ||
131 | for (j = k; j > i; j--) { | ||
132 | if (range[j].end) { | ||
133 | k = j; | ||
134 | break; | ||
135 | } | ||
136 | } | ||
137 | if (j == i) | ||
138 | break; | ||
139 | range[i].start = range[k].start; | ||
140 | range[i].end = range[k].end; | ||
141 | range[k].start = 0; | ||
142 | range[k].end = 0; | ||
143 | k--; | ||
144 | } | ||
145 | /* count it */ | ||
146 | for (i = 0; i < az; i++) { | ||
147 | if (!range[i].end) { | ||
148 | nr_range = i; | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | |||
153 | /* sort them */ | ||
154 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
155 | |||
156 | return nr_range; | ||
157 | } | ||
158 | |||
159 | void sort_range(struct range *range, int nr_range) | ||
160 | { | ||
161 | /* sort them */ | ||
162 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
163 | } | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 9b7fd4723878..f1125c1a6321 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -44,14 +44,43 @@ | |||
44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/kernel_stat.h> | ||
47 | 48 | ||
48 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
49 | static struct lock_class_key rcu_lock_key; | 50 | static struct lock_class_key rcu_lock_key; |
50 | struct lockdep_map rcu_lock_map = | 51 | struct lockdep_map rcu_lock_map = |
51 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | 52 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); |
52 | EXPORT_SYMBOL_GPL(rcu_lock_map); | 53 | EXPORT_SYMBOL_GPL(rcu_lock_map); |
54 | |||
55 | static struct lock_class_key rcu_bh_lock_key; | ||
56 | struct lockdep_map rcu_bh_lock_map = | ||
57 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); | ||
58 | EXPORT_SYMBOL_GPL(rcu_bh_lock_map); | ||
59 | |||
60 | static struct lock_class_key rcu_sched_lock_key; | ||
61 | struct lockdep_map rcu_sched_lock_map = | ||
62 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | ||
63 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | ||
53 | #endif | 64 | #endif |
54 | 65 | ||
66 | int rcu_scheduler_active __read_mostly; | ||
67 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
68 | |||
69 | /* | ||
70 | * This function is invoked towards the end of the scheduler's initialization | ||
71 | * process. Before this is called, the idle task might contain | ||
72 | * RCU read-side critical sections (during which time, this idle | ||
73 | * task is booting the system). After this function is called, the | ||
74 | * idle tasks are prohibited from containing RCU read-side critical | ||
75 | * sections. | ||
76 | */ | ||
77 | void rcu_scheduler_starting(void) | ||
78 | { | ||
79 | WARN_ON(num_online_cpus() != 1); | ||
80 | WARN_ON(nr_context_switches() > 0); | ||
81 | rcu_scheduler_active = 1; | ||
82 | } | ||
83 | |||
55 | /* | 84 | /* |
56 | * Awaken the corresponding synchronize_rcu() instance now that a | 85 | * Awaken the corresponding synchronize_rcu() instance now that a |
57 | * grace period has elapsed. | 86 | * grace period has elapsed. |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..58df55bf83ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | |||
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | ||
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | ||
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
64 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
65 | 68 | ||
66 | module_param(nreaders, int, 0444); | 69 | module_param(nreaders, int, 0444); |
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444); | |||
79 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | 82 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); |
80 | module_param(irqreader, int, 0444); | 83 | module_param(irqreader, int, 0444); |
81 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | 84 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); |
85 | module_param(fqs_duration, int, 0444); | ||
86 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | ||
87 | module_param(fqs_holdoff, int, 0444); | ||
88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | ||
89 | module_param(fqs_stutter, int, 0444); | ||
90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | ||
82 | module_param(torture_type, charp, 0444); | 91 | module_param(torture_type, charp, 0444); |
83 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
84 | 93 | ||
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; | |||
99 | static struct task_struct *stats_task; | 108 | static struct task_struct *stats_task; |
100 | static struct task_struct *shuffler_task; | 109 | static struct task_struct *shuffler_task; |
101 | static struct task_struct *stutter_task; | 110 | static struct task_struct *stutter_task; |
111 | static struct task_struct *fqs_task; | ||
102 | 112 | ||
103 | #define RCU_TORTURE_PIPE_LEN 10 | 113 | #define RCU_TORTURE_PIPE_LEN 10 |
104 | 114 | ||
@@ -263,6 +273,7 @@ struct rcu_torture_ops { | |||
263 | void (*deferred_free)(struct rcu_torture *p); | 273 | void (*deferred_free)(struct rcu_torture *p); |
264 | void (*sync)(void); | 274 | void (*sync)(void); |
265 | void (*cb_barrier)(void); | 275 | void (*cb_barrier)(void); |
276 | void (*fqs)(void); | ||
266 | int (*stats)(char *page); | 277 | int (*stats)(char *page); |
267 | int irq_capable; | 278 | int irq_capable; |
268 | char *name; | 279 | char *name; |
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
347 | .deferred_free = rcu_torture_deferred_free, | 358 | .deferred_free = rcu_torture_deferred_free, |
348 | .sync = synchronize_rcu, | 359 | .sync = synchronize_rcu, |
349 | .cb_barrier = rcu_barrier, | 360 | .cb_barrier = rcu_barrier, |
361 | .fqs = rcu_force_quiescent_state, | ||
350 | .stats = NULL, | 362 | .stats = NULL, |
351 | .irq_capable = 1, | 363 | .irq_capable = 1, |
352 | .name = "rcu" | 364 | .name = "rcu" |
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
388 | .deferred_free = rcu_sync_torture_deferred_free, | 400 | .deferred_free = rcu_sync_torture_deferred_free, |
389 | .sync = synchronize_rcu, | 401 | .sync = synchronize_rcu, |
390 | .cb_barrier = NULL, | 402 | .cb_barrier = NULL, |
403 | .fqs = rcu_force_quiescent_state, | ||
391 | .stats = NULL, | 404 | .stats = NULL, |
392 | .irq_capable = 1, | 405 | .irq_capable = 1, |
393 | .name = "rcu_sync" | 406 | .name = "rcu_sync" |
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
403 | .deferred_free = rcu_sync_torture_deferred_free, | 416 | .deferred_free = rcu_sync_torture_deferred_free, |
404 | .sync = synchronize_rcu_expedited, | 417 | .sync = synchronize_rcu_expedited, |
405 | .cb_barrier = NULL, | 418 | .cb_barrier = NULL, |
419 | .fqs = rcu_force_quiescent_state, | ||
406 | .stats = NULL, | 420 | .stats = NULL, |
407 | .irq_capable = 1, | 421 | .irq_capable = 1, |
408 | .name = "rcu_expedited" | 422 | .name = "rcu_expedited" |
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
465 | .deferred_free = rcu_bh_torture_deferred_free, | 479 | .deferred_free = rcu_bh_torture_deferred_free, |
466 | .sync = rcu_bh_torture_synchronize, | 480 | .sync = rcu_bh_torture_synchronize, |
467 | .cb_barrier = rcu_barrier_bh, | 481 | .cb_barrier = rcu_barrier_bh, |
482 | .fqs = rcu_bh_force_quiescent_state, | ||
468 | .stats = NULL, | 483 | .stats = NULL, |
469 | .irq_capable = 1, | 484 | .irq_capable = 1, |
470 | .name = "rcu_bh" | 485 | .name = "rcu_bh" |
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
480 | .deferred_free = rcu_sync_torture_deferred_free, | 495 | .deferred_free = rcu_sync_torture_deferred_free, |
481 | .sync = rcu_bh_torture_synchronize, | 496 | .sync = rcu_bh_torture_synchronize, |
482 | .cb_barrier = NULL, | 497 | .cb_barrier = NULL, |
498 | .fqs = rcu_bh_force_quiescent_state, | ||
483 | .stats = NULL, | 499 | .stats = NULL, |
484 | .irq_capable = 1, | 500 | .irq_capable = 1, |
485 | .name = "rcu_bh_sync" | 501 | .name = "rcu_bh_sync" |
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { | |||
621 | .deferred_free = rcu_sched_torture_deferred_free, | 637 | .deferred_free = rcu_sched_torture_deferred_free, |
622 | .sync = sched_torture_synchronize, | 638 | .sync = sched_torture_synchronize, |
623 | .cb_barrier = rcu_barrier_sched, | 639 | .cb_barrier = rcu_barrier_sched, |
640 | .fqs = rcu_sched_force_quiescent_state, | ||
624 | .stats = NULL, | 641 | .stats = NULL, |
625 | .irq_capable = 1, | 642 | .irq_capable = 1, |
626 | .name = "sched" | 643 | .name = "sched" |
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
636 | .deferred_free = rcu_sync_torture_deferred_free, | 653 | .deferred_free = rcu_sync_torture_deferred_free, |
637 | .sync = sched_torture_synchronize, | 654 | .sync = sched_torture_synchronize, |
638 | .cb_barrier = NULL, | 655 | .cb_barrier = NULL, |
656 | .fqs = rcu_sched_force_quiescent_state, | ||
639 | .stats = NULL, | 657 | .stats = NULL, |
640 | .name = "sched_sync" | 658 | .name = "sched_sync" |
641 | }; | 659 | }; |
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
650 | .deferred_free = rcu_sync_torture_deferred_free, | 668 | .deferred_free = rcu_sync_torture_deferred_free, |
651 | .sync = synchronize_sched_expedited, | 669 | .sync = synchronize_sched_expedited, |
652 | .cb_barrier = NULL, | 670 | .cb_barrier = NULL, |
671 | .fqs = rcu_sched_force_quiescent_state, | ||
653 | .stats = rcu_expedited_torture_stats, | 672 | .stats = rcu_expedited_torture_stats, |
654 | .irq_capable = 1, | 673 | .irq_capable = 1, |
655 | .name = "sched_expedited" | 674 | .name = "sched_expedited" |
656 | }; | 675 | }; |
657 | 676 | ||
658 | /* | 677 | /* |
678 | * RCU torture force-quiescent-state kthread. Repeatedly induces | ||
679 | * bursts of calls to force_quiescent_state(), increasing the probability | ||
680 | * of occurrence of some important types of race conditions. | ||
681 | */ | ||
682 | static int | ||
683 | rcu_torture_fqs(void *arg) | ||
684 | { | ||
685 | unsigned long fqs_resume_time; | ||
686 | int fqs_burst_remaining; | ||
687 | |||
688 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | ||
689 | do { | ||
690 | fqs_resume_time = jiffies + fqs_stutter * HZ; | ||
691 | while (jiffies - fqs_resume_time > LONG_MAX) { | ||
692 | schedule_timeout_interruptible(1); | ||
693 | } | ||
694 | fqs_burst_remaining = fqs_duration; | ||
695 | while (fqs_burst_remaining > 0) { | ||
696 | cur_ops->fqs(); | ||
697 | udelay(fqs_holdoff); | ||
698 | fqs_burst_remaining -= fqs_holdoff; | ||
699 | } | ||
700 | rcu_stutter_wait("rcu_torture_fqs"); | ||
701 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
702 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); | ||
703 | rcutorture_shutdown_absorb("rcu_torture_fqs"); | ||
704 | while (!kthread_should_stop()) | ||
705 | schedule_timeout_uninterruptible(1); | ||
706 | return 0; | ||
707 | } | ||
708 | |||
709 | /* | ||
659 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 710 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
660 | * for that pointed to by rcu_torture_current, freeing the old structure | 711 | * for that pointed to by rcu_torture_current, freeing the old structure |
661 | * after a series of grace periods (the "pipeline"). | 712 | * after a series of grace periods (the "pipeline"). |
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused) | |||
745 | 796 | ||
746 | idx = cur_ops->readlock(); | 797 | idx = cur_ops->readlock(); |
747 | completed = cur_ops->completed(); | 798 | completed = cur_ops->completed(); |
748 | p = rcu_dereference(rcu_torture_current); | 799 | p = rcu_dereference_check(rcu_torture_current, |
800 | rcu_read_lock_held() || | ||
801 | rcu_read_lock_bh_held() || | ||
802 | rcu_read_lock_sched_held() || | ||
803 | srcu_read_lock_held(&srcu_ctl)); | ||
749 | if (p == NULL) { | 804 | if (p == NULL) { |
750 | /* Leave because rcu_torture_writer is not yet underway */ | 805 | /* Leave because rcu_torture_writer is not yet underway */ |
751 | cur_ops->readunlock(idx); | 806 | cur_ops->readunlock(idx); |
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused) | |||
763 | /* Should not happen, but... */ | 818 | /* Should not happen, but... */ |
764 | pipe_count = RCU_TORTURE_PIPE_LEN; | 819 | pipe_count = RCU_TORTURE_PIPE_LEN; |
765 | } | 820 | } |
766 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 821 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
767 | completed = cur_ops->completed() - completed; | 822 | completed = cur_ops->completed() - completed; |
768 | if (completed > RCU_TORTURE_PIPE_LEN) { | 823 | if (completed > RCU_TORTURE_PIPE_LEN) { |
769 | /* Should not happen, but... */ | 824 | /* Should not happen, but... */ |
770 | completed = RCU_TORTURE_PIPE_LEN; | 825 | completed = RCU_TORTURE_PIPE_LEN; |
771 | } | 826 | } |
772 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 827 | __this_cpu_inc(rcu_torture_batch[completed]); |
773 | preempt_enable(); | 828 | preempt_enable(); |
774 | cur_ops->readunlock(idx); | 829 | cur_ops->readunlock(idx); |
775 | } | 830 | } |
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg) | |||
798 | do { | 853 | do { |
799 | if (irqreader && cur_ops->irq_capable) { | 854 | if (irqreader && cur_ops->irq_capable) { |
800 | if (!timer_pending(&t)) | 855 | if (!timer_pending(&t)) |
801 | mod_timer(&t, 1); | 856 | mod_timer(&t, jiffies + 1); |
802 | } | 857 | } |
803 | idx = cur_ops->readlock(); | 858 | idx = cur_ops->readlock(); |
804 | completed = cur_ops->completed(); | 859 | completed = cur_ops->completed(); |
805 | p = rcu_dereference(rcu_torture_current); | 860 | p = rcu_dereference_check(rcu_torture_current, |
861 | rcu_read_lock_held() || | ||
862 | rcu_read_lock_bh_held() || | ||
863 | rcu_read_lock_sched_held() || | ||
864 | srcu_read_lock_held(&srcu_ctl)); | ||
806 | if (p == NULL) { | 865 | if (p == NULL) { |
807 | /* Wait for rcu_torture_writer to get underway */ | 866 | /* Wait for rcu_torture_writer to get underway */ |
808 | cur_ops->readunlock(idx); | 867 | cur_ops->readunlock(idx); |
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg) | |||
818 | /* Should not happen, but... */ | 877 | /* Should not happen, but... */ |
819 | pipe_count = RCU_TORTURE_PIPE_LEN; | 878 | pipe_count = RCU_TORTURE_PIPE_LEN; |
820 | } | 879 | } |
821 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 880 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
822 | completed = cur_ops->completed() - completed; | 881 | completed = cur_ops->completed() - completed; |
823 | if (completed > RCU_TORTURE_PIPE_LEN) { | 882 | if (completed > RCU_TORTURE_PIPE_LEN) { |
824 | /* Should not happen, but... */ | 883 | /* Should not happen, but... */ |
825 | completed = RCU_TORTURE_PIPE_LEN; | 884 | completed = RCU_TORTURE_PIPE_LEN; |
826 | } | 885 | } |
827 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 886 | __this_cpu_inc(rcu_torture_batch[completed]); |
828 | preempt_enable(); | 887 | preempt_enable(); |
829 | cur_ops->readunlock(idx); | 888 | cur_ops->readunlock(idx); |
830 | schedule(); | 889 | schedule(); |
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag) | |||
1030 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1089 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1031 | "--- %s: nreaders=%d nfakewriters=%d " | 1090 | "--- %s: nreaders=%d nfakewriters=%d " |
1032 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1091 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1033 | "shuffle_interval=%d stutter=%d irqreader=%d\n", | 1092 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1093 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | ||
1034 | torture_type, tag, nrealreaders, nfakewriters, | 1094 | torture_type, tag, nrealreaders, nfakewriters, |
1035 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1095 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1036 | stutter, irqreader); | 1096 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); |
1037 | } | 1097 | } |
1038 | 1098 | ||
1039 | static struct notifier_block rcutorture_nb = { | 1099 | static struct notifier_block rcutorture_nb = { |
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void) | |||
1109 | } | 1169 | } |
1110 | stats_task = NULL; | 1170 | stats_task = NULL; |
1111 | 1171 | ||
1172 | if (fqs_task) { | ||
1173 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); | ||
1174 | kthread_stop(fqs_task); | ||
1175 | } | ||
1176 | fqs_task = NULL; | ||
1177 | |||
1112 | /* Wait for all RCU callbacks to fire. */ | 1178 | /* Wait for all RCU callbacks to fire. */ |
1113 | 1179 | ||
1114 | if (cur_ops->cb_barrier != NULL) | 1180 | if (cur_ops->cb_barrier != NULL) |
@@ -1154,6 +1220,11 @@ rcu_torture_init(void) | |||
1154 | mutex_unlock(&fullstop_mutex); | 1220 | mutex_unlock(&fullstop_mutex); |
1155 | return -EINVAL; | 1221 | return -EINVAL; |
1156 | } | 1222 | } |
1223 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | ||
1224 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | ||
1225 | "fqs_duration, fqs disabled.\n"); | ||
1226 | fqs_duration = 0; | ||
1227 | } | ||
1157 | if (cur_ops->init) | 1228 | if (cur_ops->init) |
1158 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1229 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
1159 | 1230 | ||
@@ -1282,6 +1353,19 @@ rcu_torture_init(void) | |||
1282 | goto unwind; | 1353 | goto unwind; |
1283 | } | 1354 | } |
1284 | } | 1355 | } |
1356 | if (fqs_duration < 0) | ||
1357 | fqs_duration = 0; | ||
1358 | if (fqs_duration) { | ||
1359 | /* Create the fqs thread */ | ||
1360 | fqs_task = kthread_run(rcu_torture_fqs, NULL, | ||
1361 | "rcu_torture_fqs"); | ||
1362 | if (IS_ERR(fqs_task)) { | ||
1363 | firsterr = PTR_ERR(fqs_task); | ||
1364 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); | ||
1365 | fqs_task = NULL; | ||
1366 | goto unwind; | ||
1367 | } | ||
1368 | } | ||
1285 | register_reboot_notifier(&rcutorture_nb); | 1369 | register_reboot_notifier(&rcutorture_nb); |
1286 | mutex_unlock(&fullstop_mutex); | 1370 | mutex_unlock(&fullstop_mutex); |
1287 | return 0; | 1371 | return 0; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae9598f798..3ec8160fc75f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -46,7 +46,6 @@ | |||
46 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
49 | #include <linux/kernel_stat.h> | ||
50 | 49 | ||
51 | #include "rcutree.h" | 50 | #include "rcutree.h" |
52 | 51 | ||
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
66 | .signaled = RCU_GP_IDLE, \ | 65 | .signaled = RCU_GP_IDLE, \ |
67 | .gpnum = -300, \ | 66 | .gpnum = -300, \ |
68 | .completed = -300, \ | 67 | .completed = -300, \ |
69 | .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ | 68 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ |
70 | .orphan_cbs_list = NULL, \ | 69 | .orphan_cbs_list = NULL, \ |
71 | .orphan_cbs_tail = &name.orphan_cbs_list, \ | 70 | .orphan_cbs_tail = &name.orphan_cbs_list, \ |
72 | .orphan_qlen = 0, \ | 71 | .orphan_qlen = 0, \ |
73 | .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ | 72 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ |
74 | .n_force_qs = 0, \ | 73 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 74 | .n_force_qs_ngp = 0, \ |
76 | } | 75 | } |
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
81 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 80 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
82 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 81 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
83 | 82 | ||
84 | static int rcu_scheduler_active __read_mostly; | ||
85 | |||
86 | |||
87 | /* | 83 | /* |
88 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 84 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
89 | * permit this function to be invoked without holding the root rcu_node | 85 | * permit this function to be invoked without holding the root rcu_node |
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void) | |||
157 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | 153 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
158 | 154 | ||
159 | /* | 155 | /* |
156 | * Force a quiescent state for RCU BH. | ||
157 | */ | ||
158 | void rcu_bh_force_quiescent_state(void) | ||
159 | { | ||
160 | force_quiescent_state(&rcu_bh_state, 0); | ||
161 | } | ||
162 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | ||
163 | |||
164 | /* | ||
165 | * Force a quiescent state for RCU-sched. | ||
166 | */ | ||
167 | void rcu_sched_force_quiescent_state(void) | ||
168 | { | ||
169 | force_quiescent_state(&rcu_sched_state, 0); | ||
170 | } | ||
171 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
172 | |||
173 | /* | ||
160 | * Does the CPU have callbacks ready to be invoked? | 174 | * Does the CPU have callbacks ready to be invoked? |
161 | */ | 175 | */ |
162 | static int | 176 | static int |
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
439 | 453 | ||
440 | /* Only let one CPU complain about others per time interval. */ | 454 | /* Only let one CPU complain about others per time interval. */ |
441 | 455 | ||
442 | spin_lock_irqsave(&rnp->lock, flags); | 456 | raw_spin_lock_irqsave(&rnp->lock, flags); |
443 | delta = jiffies - rsp->jiffies_stall; | 457 | delta = jiffies - rsp->jiffies_stall; |
444 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 458 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
445 | spin_unlock_irqrestore(&rnp->lock, flags); | 459 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
446 | return; | 460 | return; |
447 | } | 461 | } |
448 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 462 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
452 | * due to CPU offlining. | 466 | * due to CPU offlining. |
453 | */ | 467 | */ |
454 | rcu_print_task_stall(rnp); | 468 | rcu_print_task_stall(rnp); |
455 | spin_unlock_irqrestore(&rnp->lock, flags); | 469 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
456 | 470 | ||
457 | /* OK, time to rat on our buddy... */ | 471 | /* OK, time to rat on our buddy... */ |
458 | 472 | ||
459 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); | 473 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); |
460 | rcu_for_each_leaf_node(rsp, rnp) { | 474 | rcu_for_each_leaf_node(rsp, rnp) { |
475 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
461 | rcu_print_task_stall(rnp); | 476 | rcu_print_task_stall(rnp); |
477 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
462 | if (rnp->qsmask == 0) | 478 | if (rnp->qsmask == 0) |
463 | continue; | 479 | continue; |
464 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 480 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
469 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 485 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
470 | trigger_all_cpu_backtrace(); | 486 | trigger_all_cpu_backtrace(); |
471 | 487 | ||
488 | /* If so configured, complain about tasks blocking the grace period. */ | ||
489 | |||
490 | rcu_print_detail_task_stall(rsp); | ||
491 | |||
472 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 492 | force_quiescent_state(rsp, 0); /* Kick them all. */ |
473 | } | 493 | } |
474 | 494 | ||
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
481 | smp_processor_id(), jiffies - rsp->gp_start); | 501 | smp_processor_id(), jiffies - rsp->gp_start); |
482 | trigger_all_cpu_backtrace(); | 502 | trigger_all_cpu_backtrace(); |
483 | 503 | ||
484 | spin_lock_irqsave(&rnp->lock, flags); | 504 | raw_spin_lock_irqsave(&rnp->lock, flags); |
485 | if ((long)(jiffies - rsp->jiffies_stall) >= 0) | 505 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
486 | rsp->jiffies_stall = | 506 | rsp->jiffies_stall = |
487 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 507 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
488 | spin_unlock_irqrestore(&rnp->lock, flags); | 508 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
489 | 509 | ||
490 | set_need_resched(); /* kick ourselves to get things going. */ | 510 | set_need_resched(); /* kick ourselves to get things going. */ |
491 | } | 511 | } |
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) | |||
545 | local_irq_save(flags); | 565 | local_irq_save(flags); |
546 | rnp = rdp->mynode; | 566 | rnp = rdp->mynode; |
547 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ | 567 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ |
548 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 568 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
549 | local_irq_restore(flags); | 569 | local_irq_restore(flags); |
550 | return; | 570 | return; |
551 | } | 571 | } |
552 | __note_new_gpnum(rsp, rnp, rdp); | 572 | __note_new_gpnum(rsp, rnp, rdp); |
553 | spin_unlock_irqrestore(&rnp->lock, flags); | 573 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
554 | } | 574 | } |
555 | 575 | ||
556 | /* | 576 | /* |
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) | |||
609 | local_irq_save(flags); | 629 | local_irq_save(flags); |
610 | rnp = rdp->mynode; | 630 | rnp = rdp->mynode; |
611 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ | 631 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ |
612 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 632 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
613 | local_irq_restore(flags); | 633 | local_irq_restore(flags); |
614 | return; | 634 | return; |
615 | } | 635 | } |
616 | __rcu_process_gp_end(rsp, rnp, rdp); | 636 | __rcu_process_gp_end(rsp, rnp, rdp); |
617 | spin_unlock_irqrestore(&rnp->lock, flags); | 637 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
618 | } | 638 | } |
619 | 639 | ||
620 | /* | 640 | /* |
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
659 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 679 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; |
660 | struct rcu_node *rnp = rcu_get_root(rsp); | 680 | struct rcu_node *rnp = rcu_get_root(rsp); |
661 | 681 | ||
662 | if (!cpu_needs_another_gp(rsp, rdp)) { | 682 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
683 | if (cpu_needs_another_gp(rsp, rdp)) | ||
684 | rsp->fqs_need_gp = 1; | ||
663 | if (rnp->completed == rsp->completed) { | 685 | if (rnp->completed == rsp->completed) { |
664 | spin_unlock_irqrestore(&rnp->lock, flags); | 686 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
665 | return; | 687 | return; |
666 | } | 688 | } |
667 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 689 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
668 | 690 | ||
669 | /* | 691 | /* |
670 | * Propagate new ->completed value to rcu_node structures | 692 | * Propagate new ->completed value to rcu_node structures |
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
672 | * of the next grace period to process their callbacks. | 694 | * of the next grace period to process their callbacks. |
673 | */ | 695 | */ |
674 | rcu_for_each_node_breadth_first(rsp, rnp) { | 696 | rcu_for_each_node_breadth_first(rsp, rnp) { |
675 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 697 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
676 | rnp->completed = rsp->completed; | 698 | rnp->completed = rsp->completed; |
677 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 699 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
678 | } | 700 | } |
679 | local_irq_restore(flags); | 701 | local_irq_restore(flags); |
680 | return; | 702 | return; |
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
695 | rnp->completed = rsp->completed; | 717 | rnp->completed = rsp->completed; |
696 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 718 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
697 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 719 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
698 | spin_unlock_irqrestore(&rnp->lock, flags); | 720 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
699 | return; | 721 | return; |
700 | } | 722 | } |
701 | 723 | ||
702 | spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 724 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ |
703 | 725 | ||
704 | 726 | ||
705 | /* Exclude any concurrent CPU-hotplug operations. */ | 727 | /* Exclude any concurrent CPU-hotplug operations. */ |
706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 728 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
707 | 729 | ||
708 | /* | 730 | /* |
709 | * Set the quiescent-state-needed bits in all the rcu_node | 731 | * Set the quiescent-state-needed bits in all the rcu_node |
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
723 | * irqs disabled. | 745 | * irqs disabled. |
724 | */ | 746 | */ |
725 | rcu_for_each_node_breadth_first(rsp, rnp) { | 747 | rcu_for_each_node_breadth_first(rsp, rnp) { |
726 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 748 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
727 | rcu_preempt_check_blocked_tasks(rnp); | 749 | rcu_preempt_check_blocked_tasks(rnp); |
728 | rnp->qsmask = rnp->qsmaskinit; | 750 | rnp->qsmask = rnp->qsmaskinit; |
729 | rnp->gpnum = rsp->gpnum; | 751 | rnp->gpnum = rsp->gpnum; |
730 | rnp->completed = rsp->completed; | 752 | rnp->completed = rsp->completed; |
731 | if (rnp == rdp->mynode) | 753 | if (rnp == rdp->mynode) |
732 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 754 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
733 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 755 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
734 | } | 756 | } |
735 | 757 | ||
736 | rnp = rcu_get_root(rsp); | 758 | rnp = rcu_get_root(rsp); |
737 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 759 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
738 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 760 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
739 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 761 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
740 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 762 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
741 | } | 763 | } |
742 | 764 | ||
743 | /* | 765 | /* |
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
776 | if (!(rnp->qsmask & mask)) { | 798 | if (!(rnp->qsmask & mask)) { |
777 | 799 | ||
778 | /* Our bit has already been cleared, so done. */ | 800 | /* Our bit has already been cleared, so done. */ |
779 | spin_unlock_irqrestore(&rnp->lock, flags); | 801 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
780 | return; | 802 | return; |
781 | } | 803 | } |
782 | rnp->qsmask &= ~mask; | 804 | rnp->qsmask &= ~mask; |
783 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 805 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
784 | 806 | ||
785 | /* Other bits still set at this level, so done. */ | 807 | /* Other bits still set at this level, so done. */ |
786 | spin_unlock_irqrestore(&rnp->lock, flags); | 808 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
787 | return; | 809 | return; |
788 | } | 810 | } |
789 | mask = rnp->grpmask; | 811 | mask = rnp->grpmask; |
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
793 | 815 | ||
794 | break; | 816 | break; |
795 | } | 817 | } |
796 | spin_unlock_irqrestore(&rnp->lock, flags); | 818 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
797 | rnp_c = rnp; | 819 | rnp_c = rnp; |
798 | rnp = rnp->parent; | 820 | rnp = rnp->parent; |
799 | spin_lock_irqsave(&rnp->lock, flags); | 821 | raw_spin_lock_irqsave(&rnp->lock, flags); |
800 | WARN_ON_ONCE(rnp_c->qsmask); | 822 | WARN_ON_ONCE(rnp_c->qsmask); |
801 | } | 823 | } |
802 | 824 | ||
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
825 | struct rcu_node *rnp; | 847 | struct rcu_node *rnp; |
826 | 848 | ||
827 | rnp = rdp->mynode; | 849 | rnp = rdp->mynode; |
828 | spin_lock_irqsave(&rnp->lock, flags); | 850 | raw_spin_lock_irqsave(&rnp->lock, flags); |
829 | if (lastcomp != rnp->completed) { | 851 | if (lastcomp != rnp->completed) { |
830 | 852 | ||
831 | /* | 853 | /* |
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
837 | * race occurred. | 859 | * race occurred. |
838 | */ | 860 | */ |
839 | rdp->passed_quiesc = 0; /* try again later! */ | 861 | rdp->passed_quiesc = 0; /* try again later! */ |
840 | spin_unlock_irqrestore(&rnp->lock, flags); | 862 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
841 | return; | 863 | return; |
842 | } | 864 | } |
843 | mask = rdp->grpmask; | 865 | mask = rdp->grpmask; |
844 | if ((rnp->qsmask & mask) == 0) { | 866 | if ((rnp->qsmask & mask) == 0) { |
845 | spin_unlock_irqrestore(&rnp->lock, flags); | 867 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
846 | } else { | 868 | } else { |
847 | rdp->qs_pending = 0; | 869 | rdp->qs_pending = 0; |
848 | 870 | ||
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
906 | 928 | ||
907 | if (rdp->nxtlist == NULL) | 929 | if (rdp->nxtlist == NULL) |
908 | return; /* irqs disabled, so comparison is stable. */ | 930 | return; /* irqs disabled, so comparison is stable. */ |
909 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 931 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
910 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 932 | *rsp->orphan_cbs_tail = rdp->nxtlist; |
911 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 933 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; |
912 | rdp->nxtlist = NULL; | 934 | rdp->nxtlist = NULL; |
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
914 | rdp->nxttail[i] = &rdp->nxtlist; | 936 | rdp->nxttail[i] = &rdp->nxtlist; |
915 | rsp->orphan_qlen += rdp->qlen; | 937 | rsp->orphan_qlen += rdp->qlen; |
916 | rdp->qlen = 0; | 938 | rdp->qlen = 0; |
917 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 939 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
918 | } | 940 | } |
919 | 941 | ||
920 | /* | 942 | /* |
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
925 | unsigned long flags; | 947 | unsigned long flags; |
926 | struct rcu_data *rdp; | 948 | struct rcu_data *rdp; |
927 | 949 | ||
928 | spin_lock_irqsave(&rsp->onofflock, flags); | 950 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
929 | rdp = rsp->rda[smp_processor_id()]; | 951 | rdp = rsp->rda[smp_processor_id()]; |
930 | if (rsp->orphan_cbs_list == NULL) { | 952 | if (rsp->orphan_cbs_list == NULL) { |
931 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 953 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
932 | return; | 954 | return; |
933 | } | 955 | } |
934 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 956 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
937 | rsp->orphan_cbs_list = NULL; | 959 | rsp->orphan_cbs_list = NULL; |
938 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 960 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
939 | rsp->orphan_qlen = 0; | 961 | rsp->orphan_qlen = 0; |
940 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 962 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
941 | } | 963 | } |
942 | 964 | ||
943 | /* | 965 | /* |
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
953 | struct rcu_node *rnp; | 975 | struct rcu_node *rnp; |
954 | 976 | ||
955 | /* Exclude any attempts to start a new grace period. */ | 977 | /* Exclude any attempts to start a new grace period. */ |
956 | spin_lock_irqsave(&rsp->onofflock, flags); | 978 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
957 | 979 | ||
958 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 980 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
959 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ | 981 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ |
960 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 982 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
961 | do { | 983 | do { |
962 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 984 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
963 | rnp->qsmaskinit &= ~mask; | 985 | rnp->qsmaskinit &= ~mask; |
964 | if (rnp->qsmaskinit != 0) { | 986 | if (rnp->qsmaskinit != 0) { |
965 | if (rnp != rdp->mynode) | 987 | if (rnp != rdp->mynode) |
966 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 988 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
967 | break; | 989 | break; |
968 | } | 990 | } |
969 | if (rnp == rdp->mynode) | 991 | if (rnp == rdp->mynode) |
970 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 992 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
971 | else | 993 | else |
972 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 994 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
973 | mask = rnp->grpmask; | 995 | mask = rnp->grpmask; |
974 | rnp = rnp->parent; | 996 | rnp = rnp->parent; |
975 | } while (rnp != NULL); | 997 | } while (rnp != NULL); |
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
980 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1002 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock |
981 | * held leads to deadlock. | 1003 | * held leads to deadlock. |
982 | */ | 1004 | */ |
983 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1005 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
984 | rnp = rdp->mynode; | 1006 | rnp = rdp->mynode; |
985 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1007 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
986 | rcu_report_unblock_qs_rnp(rnp, flags); | 1008 | rcu_report_unblock_qs_rnp(rnp, flags); |
987 | else | 1009 | else |
988 | spin_unlock_irqrestore(&rnp->lock, flags); | 1010 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
989 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1011 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
990 | rcu_report_exp_rnp(rsp, rnp); | 1012 | rcu_report_exp_rnp(rsp, rnp); |
991 | 1013 | ||
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user) | |||
1144 | /* | 1166 | /* |
1145 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1167 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1146 | * have not yet encountered a quiescent state, using the function specified. | 1168 | * have not yet encountered a quiescent state, using the function specified. |
1147 | * Returns 1 if the current grace period ends while scanning (possibly | 1169 | * The caller must have suppressed start of new grace periods. |
1148 | * because we made it end). | ||
1149 | */ | 1170 | */ |
1150 | static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | 1171 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
1151 | int (*f)(struct rcu_data *)) | ||
1152 | { | 1172 | { |
1153 | unsigned long bit; | 1173 | unsigned long bit; |
1154 | int cpu; | 1174 | int cpu; |
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1158 | 1178 | ||
1159 | rcu_for_each_leaf_node(rsp, rnp) { | 1179 | rcu_for_each_leaf_node(rsp, rnp) { |
1160 | mask = 0; | 1180 | mask = 0; |
1161 | spin_lock_irqsave(&rnp->lock, flags); | 1181 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1162 | if (rnp->completed != lastcomp) { | 1182 | if (!rcu_gp_in_progress(rsp)) { |
1163 | spin_unlock_irqrestore(&rnp->lock, flags); | 1183 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1164 | return 1; | 1184 | return; |
1165 | } | 1185 | } |
1166 | if (rnp->qsmask == 0) { | 1186 | if (rnp->qsmask == 0) { |
1167 | spin_unlock_irqrestore(&rnp->lock, flags); | 1187 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1168 | continue; | 1188 | continue; |
1169 | } | 1189 | } |
1170 | cpu = rnp->grplo; | 1190 | cpu = rnp->grplo; |
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1173 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1193 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) |
1174 | mask |= bit; | 1194 | mask |= bit; |
1175 | } | 1195 | } |
1176 | if (mask != 0 && rnp->completed == lastcomp) { | 1196 | if (mask != 0) { |
1177 | 1197 | ||
1178 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 1198 | /* rcu_report_qs_rnp() releases rnp->lock. */ |
1179 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 1199 | rcu_report_qs_rnp(mask, rsp, rnp, flags); |
1180 | continue; | 1200 | continue; |
1181 | } | 1201 | } |
1182 | spin_unlock_irqrestore(&rnp->lock, flags); | 1202 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1183 | } | 1203 | } |
1184 | return 0; | ||
1185 | } | 1204 | } |
1186 | 1205 | ||
1187 | /* | 1206 | /* |
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1191 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | 1210 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) |
1192 | { | 1211 | { |
1193 | unsigned long flags; | 1212 | unsigned long flags; |
1194 | long lastcomp; | ||
1195 | struct rcu_node *rnp = rcu_get_root(rsp); | 1213 | struct rcu_node *rnp = rcu_get_root(rsp); |
1196 | u8 signaled; | ||
1197 | u8 forcenow; | ||
1198 | 1214 | ||
1199 | if (!rcu_gp_in_progress(rsp)) | 1215 | if (!rcu_gp_in_progress(rsp)) |
1200 | return; /* No grace period in progress, nothing to force. */ | 1216 | return; /* No grace period in progress, nothing to force. */ |
1201 | if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1217 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { |
1202 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1218 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ |
1203 | return; /* Someone else is already on the job. */ | 1219 | return; /* Someone else is already on the job. */ |
1204 | } | 1220 | } |
1205 | if (relaxed && | 1221 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) |
1206 | (long)(rsp->jiffies_force_qs - jiffies) >= 0) | 1222 | goto unlock_fqs_ret; /* no emergency and done recently. */ |
1207 | goto unlock_ret; /* no emergency and done recently. */ | ||
1208 | rsp->n_force_qs++; | 1223 | rsp->n_force_qs++; |
1209 | spin_lock(&rnp->lock); | 1224 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1210 | lastcomp = rsp->gpnum - 1; | ||
1211 | signaled = rsp->signaled; | ||
1212 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 1225 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
1213 | if(!rcu_gp_in_progress(rsp)) { | 1226 | if(!rcu_gp_in_progress(rsp)) { |
1214 | rsp->n_force_qs_ngp++; | 1227 | rsp->n_force_qs_ngp++; |
1215 | spin_unlock(&rnp->lock); | 1228 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1216 | goto unlock_ret; /* no GP in progress, time updated. */ | 1229 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
1217 | } | 1230 | } |
1218 | spin_unlock(&rnp->lock); | 1231 | rsp->fqs_active = 1; |
1219 | switch (signaled) { | 1232 | switch (rsp->signaled) { |
1220 | case RCU_GP_IDLE: | 1233 | case RCU_GP_IDLE: |
1221 | case RCU_GP_INIT: | 1234 | case RCU_GP_INIT: |
1222 | 1235 | ||
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1224 | 1237 | ||
1225 | case RCU_SAVE_DYNTICK: | 1238 | case RCU_SAVE_DYNTICK: |
1226 | 1239 | ||
1240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1227 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) | 1241 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) |
1228 | break; /* So gcc recognizes the dead code. */ | 1242 | break; /* So gcc recognizes the dead code. */ |
1229 | 1243 | ||
1230 | /* Record dyntick-idle state. */ | 1244 | /* Record dyntick-idle state. */ |
1231 | if (rcu_process_dyntick(rsp, lastcomp, | 1245 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
1232 | dyntick_save_progress_counter)) | 1246 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1233 | goto unlock_ret; | 1247 | if (rcu_gp_in_progress(rsp)) |
1234 | /* fall into next case. */ | ||
1235 | |||
1236 | case RCU_SAVE_COMPLETED: | ||
1237 | |||
1238 | /* Update state, record completion counter. */ | ||
1239 | forcenow = 0; | ||
1240 | spin_lock(&rnp->lock); | ||
1241 | if (lastcomp + 1 == rsp->gpnum && | ||
1242 | lastcomp == rsp->completed && | ||
1243 | rsp->signaled == signaled) { | ||
1244 | rsp->signaled = RCU_FORCE_QS; | 1248 | rsp->signaled = RCU_FORCE_QS; |
1245 | rsp->completed_fqs = lastcomp; | 1249 | break; |
1246 | forcenow = signaled == RCU_SAVE_COMPLETED; | ||
1247 | } | ||
1248 | spin_unlock(&rnp->lock); | ||
1249 | if (!forcenow) | ||
1250 | break; | ||
1251 | /* fall into next case. */ | ||
1252 | 1250 | ||
1253 | case RCU_FORCE_QS: | 1251 | case RCU_FORCE_QS: |
1254 | 1252 | ||
1255 | /* Check dyntick-idle state, send IPI to laggarts. */ | 1253 | /* Check dyntick-idle state, send IPI to laggarts. */ |
1256 | if (rcu_process_dyntick(rsp, rsp->completed_fqs, | 1254 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1257 | rcu_implicit_dynticks_qs)) | 1255 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); |
1258 | goto unlock_ret; | ||
1259 | 1256 | ||
1260 | /* Leave state in case more forcing is required. */ | 1257 | /* Leave state in case more forcing is required. */ |
1261 | 1258 | ||
1259 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
1262 | break; | 1260 | break; |
1263 | } | 1261 | } |
1264 | unlock_ret: | 1262 | rsp->fqs_active = 0; |
1265 | spin_unlock_irqrestore(&rsp->fqslock, flags); | 1263 | if (rsp->fqs_need_gp) { |
1264 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | ||
1265 | rsp->fqs_need_gp = 0; | ||
1266 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | ||
1267 | return; | ||
1268 | } | ||
1269 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1270 | unlock_fqs_ret: | ||
1271 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | ||
1266 | } | 1272 | } |
1267 | 1273 | ||
1268 | #else /* #ifdef CONFIG_SMP */ | 1274 | #else /* #ifdef CONFIG_SMP */ |
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1290 | * If an RCU GP has gone long enough, go check for dyntick | 1296 | * If an RCU GP has gone long enough, go check for dyntick |
1291 | * idle CPUs and, if needed, send resched IPIs. | 1297 | * idle CPUs and, if needed, send resched IPIs. |
1292 | */ | 1298 | */ |
1293 | if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1299 | if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1294 | force_quiescent_state(rsp, 1); | 1300 | force_quiescent_state(rsp, 1); |
1295 | 1301 | ||
1296 | /* | 1302 | /* |
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1304 | 1310 | ||
1305 | /* Does this CPU require a not-yet-started grace period? */ | 1311 | /* Does this CPU require a not-yet-started grace period? */ |
1306 | if (cpu_needs_another_gp(rsp, rdp)) { | 1312 | if (cpu_needs_another_gp(rsp, rdp)) { |
1307 | spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); | 1313 | raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); |
1308 | rcu_start_gp(rsp, flags); /* releases above lock */ | 1314 | rcu_start_gp(rsp, flags); /* releases above lock */ |
1309 | } | 1315 | } |
1310 | 1316 | ||
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
1335 | * grace-period manipulations above. | 1341 | * grace-period manipulations above. |
1336 | */ | 1342 | */ |
1337 | smp_mb(); /* See above block comment. */ | 1343 | smp_mb(); /* See above block comment. */ |
1344 | |||
1345 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | ||
1346 | rcu_needs_cpu_flush(); | ||
1338 | } | 1347 | } |
1339 | 1348 | ||
1340 | static void | 1349 | static void |
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1369 | unsigned long nestflag; | 1378 | unsigned long nestflag; |
1370 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 1379 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
1371 | 1380 | ||
1372 | spin_lock_irqsave(&rnp_root->lock, nestflag); | 1381 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); |
1373 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | 1382 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ |
1374 | } | 1383 | } |
1375 | 1384 | ||
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1387 | force_quiescent_state(rsp, 0); | 1396 | force_quiescent_state(rsp, 0); |
1388 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1397 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1389 | rdp->qlen_last_fqs_check = rdp->qlen; | 1398 | rdp->qlen_last_fqs_check = rdp->qlen; |
1390 | } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1399 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1391 | force_quiescent_state(rsp, 1); | 1400 | force_quiescent_state(rsp, 1); |
1392 | local_irq_restore(flags); | 1401 | local_irq_restore(flags); |
1393 | } | 1402 | } |
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1520 | 1529 | ||
1521 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ | 1530 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ |
1522 | if (rcu_gp_in_progress(rsp) && | 1531 | if (rcu_gp_in_progress(rsp) && |
1523 | ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { | 1532 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { |
1524 | rdp->n_rp_need_fqs++; | 1533 | rdp->n_rp_need_fqs++; |
1525 | return 1; | 1534 | return 1; |
1526 | } | 1535 | } |
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu) | |||
1545 | /* | 1554 | /* |
1546 | * Check to see if any future RCU-related work will need to be done | 1555 | * Check to see if any future RCU-related work will need to be done |
1547 | * by the current CPU, even if none need be done immediately, returning | 1556 | * by the current CPU, even if none need be done immediately, returning |
1548 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1557 | * 1 if so. |
1549 | * an exported member of the RCU API. | ||
1550 | */ | 1558 | */ |
1551 | int rcu_needs_cpu(int cpu) | 1559 | static int rcu_needs_cpu_quick_check(int cpu) |
1552 | { | 1560 | { |
1553 | /* RCU callbacks either ready or pending? */ | 1561 | /* RCU callbacks either ready or pending? */ |
1554 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1562 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu) | |||
1556 | rcu_preempt_needs_cpu(cpu); | 1564 | rcu_preempt_needs_cpu(cpu); |
1557 | } | 1565 | } |
1558 | 1566 | ||
1559 | /* | ||
1560 | * This function is invoked towards the end of the scheduler's initialization | ||
1561 | * process. Before this is called, the idle task might contain | ||
1562 | * RCU read-side critical sections (during which time, this idle | ||
1563 | * task is booting the system). After this function is called, the | ||
1564 | * idle tasks are prohibited from containing RCU read-side critical | ||
1565 | * sections. | ||
1566 | */ | ||
1567 | void rcu_scheduler_starting(void) | ||
1568 | { | ||
1569 | WARN_ON(num_online_cpus() != 1); | ||
1570 | WARN_ON(nr_context_switches() > 0); | ||
1571 | rcu_scheduler_active = 1; | ||
1572 | } | ||
1573 | |||
1574 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 1567 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
1575 | static atomic_t rcu_barrier_cpu_count; | 1568 | static atomic_t rcu_barrier_cpu_count; |
1576 | static DEFINE_MUTEX(rcu_barrier_mutex); | 1569 | static DEFINE_MUTEX(rcu_barrier_mutex); |
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1659 | struct rcu_node *rnp = rcu_get_root(rsp); | 1652 | struct rcu_node *rnp = rcu_get_root(rsp); |
1660 | 1653 | ||
1661 | /* Set up local state, ensuring consistent view of global state. */ | 1654 | /* Set up local state, ensuring consistent view of global state. */ |
1662 | spin_lock_irqsave(&rnp->lock, flags); | 1655 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1663 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 1656 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
1664 | rdp->nxtlist = NULL; | 1657 | rdp->nxtlist = NULL; |
1665 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1658 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1669 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 1662 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1670 | #endif /* #ifdef CONFIG_NO_HZ */ | 1663 | #endif /* #ifdef CONFIG_NO_HZ */ |
1671 | rdp->cpu = cpu; | 1664 | rdp->cpu = cpu; |
1672 | spin_unlock_irqrestore(&rnp->lock, flags); | 1665 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1673 | } | 1666 | } |
1674 | 1667 | ||
1675 | /* | 1668 | /* |
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1687 | struct rcu_node *rnp = rcu_get_root(rsp); | 1680 | struct rcu_node *rnp = rcu_get_root(rsp); |
1688 | 1681 | ||
1689 | /* Set up local state, ensuring consistent view of global state. */ | 1682 | /* Set up local state, ensuring consistent view of global state. */ |
1690 | spin_lock_irqsave(&rnp->lock, flags); | 1683 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1691 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1684 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
1692 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1685 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
1693 | rdp->beenonline = 1; /* We have now been online. */ | 1686 | rdp->beenonline = 1; /* We have now been online. */ |
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1695 | rdp->qlen_last_fqs_check = 0; | 1688 | rdp->qlen_last_fqs_check = 0; |
1696 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1689 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1697 | rdp->blimit = blimit; | 1690 | rdp->blimit = blimit; |
1698 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1691 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1699 | 1692 | ||
1700 | /* | 1693 | /* |
1701 | * A new grace period might start here. If so, we won't be part | 1694 | * A new grace period might start here. If so, we won't be part |
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1703 | */ | 1696 | */ |
1704 | 1697 | ||
1705 | /* Exclude any attempts to start a new GP on large systems. */ | 1698 | /* Exclude any attempts to start a new GP on large systems. */ |
1706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1699 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
1707 | 1700 | ||
1708 | /* Add CPU to rcu_node bitmasks. */ | 1701 | /* Add CPU to rcu_node bitmasks. */ |
1709 | rnp = rdp->mynode; | 1702 | rnp = rdp->mynode; |
1710 | mask = rdp->grpmask; | 1703 | mask = rdp->grpmask; |
1711 | do { | 1704 | do { |
1712 | /* Exclude any attempts to start a new GP on small systems. */ | 1705 | /* Exclude any attempts to start a new GP on small systems. */ |
1713 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 1706 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
1714 | rnp->qsmaskinit |= mask; | 1707 | rnp->qsmaskinit |= mask; |
1715 | mask = rnp->grpmask; | 1708 | mask = rnp->grpmask; |
1716 | if (rnp == rdp->mynode) { | 1709 | if (rnp == rdp->mynode) { |
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1718 | rdp->completed = rnp->completed; | 1711 | rdp->completed = rnp->completed; |
1719 | rdp->passed_quiesc_completed = rnp->completed - 1; | 1712 | rdp->passed_quiesc_completed = rnp->completed - 1; |
1720 | } | 1713 | } |
1721 | spin_unlock(&rnp->lock); /* irqs already disabled. */ | 1714 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
1722 | rnp = rnp->parent; | 1715 | rnp = rnp->parent; |
1723 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | 1716 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
1724 | 1717 | ||
1725 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 1718 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
1726 | } | 1719 | } |
1727 | 1720 | ||
1728 | static void __cpuinit rcu_online_cpu(int cpu) | 1721 | static void __cpuinit rcu_online_cpu(int cpu) |
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1806 | */ | 1799 | */ |
1807 | static void __init rcu_init_one(struct rcu_state *rsp) | 1800 | static void __init rcu_init_one(struct rcu_state *rsp) |
1808 | { | 1801 | { |
1802 | static char *buf[] = { "rcu_node_level_0", | ||
1803 | "rcu_node_level_1", | ||
1804 | "rcu_node_level_2", | ||
1805 | "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ | ||
1809 | int cpustride = 1; | 1806 | int cpustride = 1; |
1810 | int i; | 1807 | int i; |
1811 | int j; | 1808 | int j; |
1812 | struct rcu_node *rnp; | 1809 | struct rcu_node *rnp; |
1813 | 1810 | ||
1811 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | ||
1812 | |||
1814 | /* Initialize the level-tracking arrays. */ | 1813 | /* Initialize the level-tracking arrays. */ |
1815 | 1814 | ||
1816 | for (i = 1; i < NUM_RCU_LVLS; i++) | 1815 | for (i = 1; i < NUM_RCU_LVLS; i++) |
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1823 | cpustride *= rsp->levelspread[i]; | 1822 | cpustride *= rsp->levelspread[i]; |
1824 | rnp = rsp->level[i]; | 1823 | rnp = rsp->level[i]; |
1825 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 1824 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
1826 | spin_lock_init(&rnp->lock); | 1825 | raw_spin_lock_init(&rnp->lock); |
1827 | lockdep_set_class(&rnp->lock, &rcu_node_class[i]); | 1826 | lockdep_set_class_and_name(&rnp->lock, |
1827 | &rcu_node_class[i], buf[i]); | ||
1828 | rnp->gpnum = 0; | 1828 | rnp->gpnum = 0; |
1829 | rnp->qsmask = 0; | 1829 | rnp->qsmask = 0; |
1830 | rnp->qsmaskinit = 0; | 1830 | rnp->qsmaskinit = 0; |
@@ -1876,7 +1876,7 @@ do { \ | |||
1876 | 1876 | ||
1877 | void __init rcu_init(void) | 1877 | void __init rcu_init(void) |
1878 | { | 1878 | { |
1879 | int i; | 1879 | int cpu; |
1880 | 1880 | ||
1881 | rcu_bootup_announce(); | 1881 | rcu_bootup_announce(); |
1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
@@ -1896,8 +1896,8 @@ void __init rcu_init(void) | |||
1896 | * or the scheduler are operational. | 1896 | * or the scheduler are operational. |
1897 | */ | 1897 | */ |
1898 | cpu_notifier(rcu_cpu_notify, 0); | 1898 | cpu_notifier(rcu_cpu_notify, 0); |
1899 | for_each_online_cpu(i) | 1899 | for_each_online_cpu(cpu) |
1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); | 1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
1901 | } | 1901 | } |
1902 | 1902 | ||
1903 | #include "rcutree_plugin.h" | 1903 | #include "rcutree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046f63b2..1439eb504c22 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -90,12 +90,12 @@ struct rcu_dynticks { | |||
90 | * Definition for node within the RCU grace-period-detection hierarchy. | 90 | * Definition for node within the RCU grace-period-detection hierarchy. |
91 | */ | 91 | */ |
92 | struct rcu_node { | 92 | struct rcu_node { |
93 | spinlock_t lock; /* Root rcu_node's lock protects some */ | 93 | raw_spinlock_t lock; /* Root rcu_node's lock protects some */ |
94 | /* rcu_state fields as well as following. */ | 94 | /* rcu_state fields as well as following. */ |
95 | long gpnum; /* Current grace period for this node. */ | 95 | unsigned long gpnum; /* Current grace period for this node. */ |
96 | /* This will either be equal to or one */ | 96 | /* This will either be equal to or one */ |
97 | /* behind the root rcu_node's gpnum. */ | 97 | /* behind the root rcu_node's gpnum. */ |
98 | long completed; /* Last grace period completed for this node. */ | 98 | unsigned long completed; /* Last GP completed for this node. */ |
99 | /* This will either be equal to or one */ | 99 | /* This will either be equal to or one */ |
100 | /* behind the root rcu_node's gpnum. */ | 100 | /* behind the root rcu_node's gpnum. */ |
101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ | 101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ |
@@ -161,11 +161,11 @@ struct rcu_node { | |||
161 | /* Per-CPU data for read-copy update. */ | 161 | /* Per-CPU data for read-copy update. */ |
162 | struct rcu_data { | 162 | struct rcu_data { |
163 | /* 1) quiescent-state and grace-period handling : */ | 163 | /* 1) quiescent-state and grace-period handling : */ |
164 | long completed; /* Track rsp->completed gp number */ | 164 | unsigned long completed; /* Track rsp->completed gp number */ |
165 | /* in order to detect GP end. */ | 165 | /* in order to detect GP end. */ |
166 | long gpnum; /* Highest gp number that this CPU */ | 166 | unsigned long gpnum; /* Highest gp number that this CPU */ |
167 | /* is aware of having started. */ | 167 | /* is aware of having started. */ |
168 | long passed_quiesc_completed; | 168 | unsigned long passed_quiesc_completed; |
169 | /* Value of completed at time of qs. */ | 169 | /* Value of completed at time of qs. */ |
170 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 170 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
171 | bool qs_pending; /* Core waits for quiesc state. */ | 171 | bool qs_pending; /* Core waits for quiesc state. */ |
@@ -221,14 +221,14 @@ struct rcu_data { | |||
221 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 221 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
222 | 222 | ||
223 | /* 5) __rcu_pending() statistics. */ | 223 | /* 5) __rcu_pending() statistics. */ |
224 | long n_rcu_pending; /* rcu_pending() calls since boot. */ | 224 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
225 | long n_rp_qs_pending; | 225 | unsigned long n_rp_qs_pending; |
226 | long n_rp_cb_ready; | 226 | unsigned long n_rp_cb_ready; |
227 | long n_rp_cpu_needs_gp; | 227 | unsigned long n_rp_cpu_needs_gp; |
228 | long n_rp_gp_completed; | 228 | unsigned long n_rp_gp_completed; |
229 | long n_rp_gp_started; | 229 | unsigned long n_rp_gp_started; |
230 | long n_rp_need_fqs; | 230 | unsigned long n_rp_need_fqs; |
231 | long n_rp_need_nothing; | 231 | unsigned long n_rp_need_nothing; |
232 | 232 | ||
233 | int cpu; | 233 | int cpu; |
234 | }; | 234 | }; |
@@ -237,12 +237,11 @@ struct rcu_data { | |||
237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
240 | #define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ | 240 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
241 | #define RCU_FORCE_QS 4 /* Need to force quiescent state. */ | ||
242 | #ifdef CONFIG_NO_HZ | 241 | #ifdef CONFIG_NO_HZ |
243 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 242 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
244 | #else /* #ifdef CONFIG_NO_HZ */ | 243 | #else /* #ifdef CONFIG_NO_HZ */ |
245 | #define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED | 244 | #define RCU_SIGNAL_INIT RCU_FORCE_QS |
246 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 245 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
247 | 246 | ||
248 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 247 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
@@ -256,6 +255,9 @@ struct rcu_data { | |||
256 | 255 | ||
257 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 256 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
258 | 257 | ||
258 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | ||
259 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
260 | |||
259 | /* | 261 | /* |
260 | * RCU global state, including node hierarchy. This hierarchy is | 262 | * RCU global state, including node hierarchy. This hierarchy is |
261 | * represented in "heap" form in a dense array. The root (first level) | 263 | * represented in "heap" form in a dense array. The root (first level) |
@@ -277,12 +279,19 @@ struct rcu_state { | |||
277 | 279 | ||
278 | u8 signaled ____cacheline_internodealigned_in_smp; | 280 | u8 signaled ____cacheline_internodealigned_in_smp; |
279 | /* Force QS state. */ | 281 | /* Force QS state. */ |
280 | long gpnum; /* Current gp number. */ | 282 | u8 fqs_active; /* force_quiescent_state() */ |
281 | long completed; /* # of last completed gp. */ | 283 | /* is running. */ |
284 | u8 fqs_need_gp; /* A CPU was prevented from */ | ||
285 | /* starting a new grace */ | ||
286 | /* period because */ | ||
287 | /* force_quiescent_state() */ | ||
288 | /* was running. */ | ||
289 | unsigned long gpnum; /* Current gp number. */ | ||
290 | unsigned long completed; /* # of last completed gp. */ | ||
282 | 291 | ||
283 | /* End of fields guarded by root rcu_node's lock. */ | 292 | /* End of fields guarded by root rcu_node's lock. */ |
284 | 293 | ||
285 | spinlock_t onofflock; /* exclude on/offline and */ | 294 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
286 | /* starting new GP. Also */ | 295 | /* starting new GP. Also */ |
287 | /* protects the following */ | 296 | /* protects the following */ |
288 | /* orphan_cbs fields. */ | 297 | /* orphan_cbs fields. */ |
@@ -292,10 +301,8 @@ struct rcu_state { | |||
292 | /* going offline. */ | 301 | /* going offline. */ |
293 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | 302 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ |
294 | long orphan_qlen; /* Number of orphaned cbs. */ | 303 | long orphan_qlen; /* Number of orphaned cbs. */ |
295 | spinlock_t fqslock; /* Only one task forcing */ | 304 | raw_spinlock_t fqslock; /* Only one task forcing */ |
296 | /* quiescent states. */ | 305 | /* quiescent states. */ |
297 | long completed_fqs; /* Value of completed @ snap. */ | ||
298 | /* Protected by fqslock. */ | ||
299 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 306 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
300 | /* force_quiescent_state(). */ | 307 | /* force_quiescent_state(). */ |
301 | unsigned long n_force_qs; /* Number of calls to */ | 308 | unsigned long n_force_qs; /* Number of calls to */ |
@@ -319,8 +326,6 @@ struct rcu_state { | |||
319 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | 326 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ |
320 | /* GP were moved to root. */ | 327 | /* GP were moved to root. */ |
321 | 328 | ||
322 | #ifdef RCU_TREE_NONCORE | ||
323 | |||
324 | /* | 329 | /* |
325 | * RCU implementation internal declarations: | 330 | * RCU implementation internal declarations: |
326 | */ | 331 | */ |
@@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state; | |||
335 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 340 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
336 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 341 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
337 | 342 | ||
338 | #else /* #ifdef RCU_TREE_NONCORE */ | 343 | #ifndef RCU_TREE_NONCORE |
339 | 344 | ||
340 | /* Forward declarations for rcutree_plugin.h */ | 345 | /* Forward declarations for rcutree_plugin.h */ |
341 | static void rcu_bootup_announce(void); | 346 | static void rcu_bootup_announce(void); |
@@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
347 | unsigned long flags); | 352 | unsigned long flags); |
348 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 353 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
349 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 354 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
355 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | ||
350 | static void rcu_print_task_stall(struct rcu_node *rnp); | 356 | static void rcu_print_task_stall(struct rcu_node *rnp); |
351 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 357 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
352 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 358 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
@@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu); | |||
367 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 373 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
368 | static void rcu_preempt_send_cbs_to_orphanage(void); | 374 | static void rcu_preempt_send_cbs_to_orphanage(void); |
369 | static void __init __rcu_init_preempt(void); | 375 | static void __init __rcu_init_preempt(void); |
376 | static void rcu_needs_cpu_flush(void); | ||
370 | 377 | ||
371 | #endif /* #else #ifdef RCU_TREE_NONCORE */ | 378 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..464ad2cdee00 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -62,6 +62,15 @@ long rcu_batches_completed(void) | |||
62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * Force a quiescent state for preemptible RCU. | ||
66 | */ | ||
67 | void rcu_force_quiescent_state(void) | ||
68 | { | ||
69 | force_quiescent_state(&rcu_preempt_state, 0); | ||
70 | } | ||
71 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
72 | |||
73 | /* | ||
65 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 74 | * Record a preemptable-RCU quiescent state for the specified CPU. Note |
66 | * that this just means that the task currently running on the CPU is | 75 | * that this just means that the task currently running on the CPU is |
67 | * not in a quiescent state. There might be any number of tasks blocked | 76 | * not in a quiescent state. There might be any number of tasks blocked |
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
102 | /* Possibly blocking in an RCU read-side critical section. */ | 111 | /* Possibly blocking in an RCU read-side critical section. */ |
103 | rdp = rcu_preempt_state.rda[cpu]; | 112 | rdp = rcu_preempt_state.rda[cpu]; |
104 | rnp = rdp->mynode; | 113 | rnp = rdp->mynode; |
105 | spin_lock_irqsave(&rnp->lock, flags); | 114 | raw_spin_lock_irqsave(&rnp->lock, flags); |
106 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 115 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
107 | t->rcu_blocked_node = rnp; | 116 | t->rcu_blocked_node = rnp; |
108 | 117 | ||
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
123 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 132 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
124 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 133 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; |
125 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 134 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); |
126 | spin_unlock_irqrestore(&rnp->lock, flags); | 135 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
127 | } | 136 | } |
128 | 137 | ||
129 | /* | 138 | /* |
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
180 | struct rcu_node *rnp_p; | 189 | struct rcu_node *rnp_p; |
181 | 190 | ||
182 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 191 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
183 | spin_unlock_irqrestore(&rnp->lock, flags); | 192 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
184 | return; /* Still need more quiescent states! */ | 193 | return; /* Still need more quiescent states! */ |
185 | } | 194 | } |
186 | 195 | ||
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
197 | 206 | ||
198 | /* Report up the rest of the hierarchy. */ | 207 | /* Report up the rest of the hierarchy. */ |
199 | mask = rnp->grpmask; | 208 | mask = rnp->grpmask; |
200 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 209 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
201 | spin_lock(&rnp_p->lock); /* irqs already disabled. */ | 210 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ |
202 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | 211 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); |
203 | } | 212 | } |
204 | 213 | ||
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
248 | */ | 257 | */ |
249 | for (;;) { | 258 | for (;;) { |
250 | rnp = t->rcu_blocked_node; | 259 | rnp = t->rcu_blocked_node; |
251 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 260 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
252 | if (rnp == t->rcu_blocked_node) | 261 | if (rnp == t->rcu_blocked_node) |
253 | break; | 262 | break; |
254 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 263 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
255 | } | 264 | } |
256 | empty = !rcu_preempted_readers(rnp); | 265 | empty = !rcu_preempted_readers(rnp); |
257 | empty_exp = !rcu_preempted_readers_exp(rnp); | 266 | empty_exp = !rcu_preempted_readers_exp(rnp); |
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
265 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 274 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. |
266 | */ | 275 | */ |
267 | if (empty) | 276 | if (empty) |
268 | spin_unlock_irqrestore(&rnp->lock, flags); | 277 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
269 | else | 278 | else |
270 | rcu_report_unblock_qs_rnp(rnp, flags); | 279 | rcu_report_unblock_qs_rnp(rnp, flags); |
271 | 280 | ||
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void) | |||
295 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 304 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && |
296 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 305 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
297 | rcu_read_unlock_special(t); | 306 | rcu_read_unlock_special(t); |
307 | #ifdef CONFIG_PROVE_LOCKING | ||
308 | WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); | ||
309 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
298 | } | 310 | } |
299 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 311 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
300 | 312 | ||
301 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 313 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
302 | 314 | ||
315 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
316 | |||
317 | /* | ||
318 | * Dump detailed information for all tasks blocking the current RCU | ||
319 | * grace period on the specified rcu_node structure. | ||
320 | */ | ||
321 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
322 | { | ||
323 | unsigned long flags; | ||
324 | struct list_head *lp; | ||
325 | int phase; | ||
326 | struct task_struct *t; | ||
327 | |||
328 | if (rcu_preempted_readers(rnp)) { | ||
329 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
330 | phase = rnp->gpnum & 0x1; | ||
331 | lp = &rnp->blocked_tasks[phase]; | ||
332 | list_for_each_entry(t, lp, rcu_node_entry) | ||
333 | sched_show_task(t); | ||
334 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
335 | } | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * Dump detailed information for all tasks blocking the current RCU | ||
340 | * grace period. | ||
341 | */ | ||
342 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
343 | { | ||
344 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
345 | |||
346 | rcu_print_detail_task_stall_rnp(rnp); | ||
347 | rcu_for_each_leaf_node(rsp, rnp) | ||
348 | rcu_print_detail_task_stall_rnp(rnp); | ||
349 | } | ||
350 | |||
351 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
352 | |||
353 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
354 | { | ||
355 | } | ||
356 | |||
357 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
358 | |||
303 | /* | 359 | /* |
304 | * Scan the current list of tasks blocked within RCU read-side critical | 360 | * Scan the current list of tasks blocked within RCU read-side critical |
305 | * sections, printing out the tid of each. | 361 | * sections, printing out the tid of each. |
306 | */ | 362 | */ |
307 | static void rcu_print_task_stall(struct rcu_node *rnp) | 363 | static void rcu_print_task_stall(struct rcu_node *rnp) |
308 | { | 364 | { |
309 | unsigned long flags; | ||
310 | struct list_head *lp; | 365 | struct list_head *lp; |
311 | int phase; | 366 | int phase; |
312 | struct task_struct *t; | 367 | struct task_struct *t; |
313 | 368 | ||
314 | if (rcu_preempted_readers(rnp)) { | 369 | if (rcu_preempted_readers(rnp)) { |
315 | spin_lock_irqsave(&rnp->lock, flags); | ||
316 | phase = rnp->gpnum & 0x1; | 370 | phase = rnp->gpnum & 0x1; |
317 | lp = &rnp->blocked_tasks[phase]; | 371 | lp = &rnp->blocked_tasks[phase]; |
318 | list_for_each_entry(t, lp, rcu_node_entry) | 372 | list_for_each_entry(t, lp, rcu_node_entry) |
319 | printk(" P%d", t->pid); | 373 | printk(" P%d", t->pid); |
320 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
321 | } | 374 | } |
322 | } | 375 | } |
323 | 376 | ||
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
388 | lp_root = &rnp_root->blocked_tasks[i]; | 441 | lp_root = &rnp_root->blocked_tasks[i]; |
389 | while (!list_empty(lp)) { | 442 | while (!list_empty(lp)) { |
390 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 443 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); |
391 | spin_lock(&rnp_root->lock); /* irqs already disabled */ | 444 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
392 | list_del(&tp->rcu_node_entry); | 445 | list_del(&tp->rcu_node_entry); |
393 | tp->rcu_blocked_node = rnp_root; | 446 | tp->rcu_blocked_node = rnp_root; |
394 | list_add(&tp->rcu_node_entry, lp_root); | 447 | list_add(&tp->rcu_node_entry, lp_root); |
395 | spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 448 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ |
396 | } | 449 | } |
397 | } | 450 | } |
398 | return retval; | 451 | return retval; |
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
516 | unsigned long flags; | 569 | unsigned long flags; |
517 | unsigned long mask; | 570 | unsigned long mask; |
518 | 571 | ||
519 | spin_lock_irqsave(&rnp->lock, flags); | 572 | raw_spin_lock_irqsave(&rnp->lock, flags); |
520 | for (;;) { | 573 | for (;;) { |
521 | if (!sync_rcu_preempt_exp_done(rnp)) | 574 | if (!sync_rcu_preempt_exp_done(rnp)) |
522 | break; | 575 | break; |
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
525 | break; | 578 | break; |
526 | } | 579 | } |
527 | mask = rnp->grpmask; | 580 | mask = rnp->grpmask; |
528 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 581 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
529 | rnp = rnp->parent; | 582 | rnp = rnp->parent; |
530 | spin_lock(&rnp->lock); /* irqs already disabled */ | 583 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
531 | rnp->expmask &= ~mask; | 584 | rnp->expmask &= ~mask; |
532 | } | 585 | } |
533 | spin_unlock_irqrestore(&rnp->lock, flags); | 586 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
534 | } | 587 | } |
535 | 588 | ||
536 | /* | 589 | /* |
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
545 | { | 598 | { |
546 | int must_wait; | 599 | int must_wait; |
547 | 600 | ||
548 | spin_lock(&rnp->lock); /* irqs already disabled */ | 601 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
549 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 602 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); |
550 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 603 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); |
551 | must_wait = rcu_preempted_readers_exp(rnp); | 604 | must_wait = rcu_preempted_readers_exp(rnp); |
552 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 605 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
553 | if (!must_wait) | 606 | if (!must_wait) |
554 | rcu_report_exp_rnp(rsp, rnp); | 607 | rcu_report_exp_rnp(rsp, rnp); |
555 | } | 608 | } |
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void) | |||
594 | /* force all RCU readers onto blocked_tasks[]. */ | 647 | /* force all RCU readers onto blocked_tasks[]. */ |
595 | synchronize_sched_expedited(); | 648 | synchronize_sched_expedited(); |
596 | 649 | ||
597 | spin_lock_irqsave(&rsp->onofflock, flags); | 650 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
598 | 651 | ||
599 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 652 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
600 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 653 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
601 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 654 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
602 | rnp->expmask = rnp->qsmaskinit; | 655 | rnp->expmask = rnp->qsmaskinit; |
603 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 656 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
604 | } | 657 | } |
605 | 658 | ||
606 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 659 | /* Snapshot current state of ->blocked_tasks[] lists. */ |
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void) | |||
609 | if (NUM_RCU_NODES > 1) | 662 | if (NUM_RCU_NODES > 1) |
610 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | 663 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); |
611 | 664 | ||
612 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 665 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
613 | 666 | ||
614 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 667 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ |
615 | rnp = rcu_get_root(rsp); | 668 | rnp = rcu_get_root(rsp); |
@@ -713,6 +766,16 @@ long rcu_batches_completed(void) | |||
713 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 766 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
714 | 767 | ||
715 | /* | 768 | /* |
769 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
770 | * RCU, becomes the same as rcu-sched. | ||
771 | */ | ||
772 | void rcu_force_quiescent_state(void) | ||
773 | { | ||
774 | rcu_sched_force_quiescent_state(); | ||
775 | } | ||
776 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
777 | |||
778 | /* | ||
716 | * Because preemptable RCU does not exist, we never have to check for | 779 | * Because preemptable RCU does not exist, we never have to check for |
717 | * CPUs being in quiescent states. | 780 | * CPUs being in quiescent states. |
718 | */ | 781 | */ |
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) | |||
734 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 797 | /* Because preemptible RCU does not exist, no quieting of tasks. */ |
735 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 798 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) |
736 | { | 799 | { |
737 | spin_unlock_irqrestore(&rnp->lock, flags); | 800 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
738 | } | 801 | } |
739 | 802 | ||
740 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 803 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
745 | * Because preemptable RCU does not exist, we never have to check for | 808 | * Because preemptable RCU does not exist, we never have to check for |
746 | * tasks blocked within RCU read-side critical sections. | 809 | * tasks blocked within RCU read-side critical sections. |
747 | */ | 810 | */ |
811 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
812 | { | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * Because preemptable RCU does not exist, we never have to check for | ||
817 | * tasks blocked within RCU read-side critical sections. | ||
818 | */ | ||
748 | static void rcu_print_task_stall(struct rcu_node *rnp) | 819 | static void rcu_print_task_stall(struct rcu_node *rnp) |
749 | { | 820 | { |
750 | } | 821 | } |
@@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void) | |||
884 | } | 955 | } |
885 | 956 | ||
886 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 957 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
958 | |||
959 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | ||
960 | |||
961 | /* | ||
962 | * Check to see if any future RCU-related work will need to be done | ||
963 | * by the current CPU, even if none need be done immediately, returning | ||
964 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
965 | * an exported member of the RCU API. | ||
966 | * | ||
967 | * Because we have preemptible RCU, just check whether this CPU needs | ||
968 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | ||
969 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
970 | */ | ||
971 | int rcu_needs_cpu(int cpu) | ||
972 | { | ||
973 | return rcu_needs_cpu_quick_check(cpu); | ||
974 | } | ||
975 | |||
976 | /* | ||
977 | * Check to see if we need to continue a callback-flush operations to | ||
978 | * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle | ||
979 | * entry is not configured, so we never do need to. | ||
980 | */ | ||
981 | static void rcu_needs_cpu_flush(void) | ||
982 | { | ||
983 | } | ||
984 | |||
985 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
986 | |||
987 | #define RCU_NEEDS_CPU_FLUSHES 5 | ||
988 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
989 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
990 | |||
991 | /* | ||
992 | * Check to see if any future RCU-related work will need to be done | ||
993 | * by the current CPU, even if none need be done immediately, returning | ||
994 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
995 | * an exported member of the RCU API. | ||
996 | * | ||
997 | * Because we are not supporting preemptible RCU, attempt to accelerate | ||
998 | * any current grace periods so that RCU no longer needs this CPU, but | ||
999 | * only if all other CPUs are already in dynticks-idle mode. This will | ||
1000 | * allow the CPU cores to be powered down immediately, as opposed to after | ||
1001 | * waiting many milliseconds for grace periods to elapse. | ||
1002 | * | ||
1003 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | ||
1004 | * disabled, we do one pass of force_quiescent_state(), then do a | ||
1005 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | ||
1006 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | ||
1007 | */ | ||
1008 | int rcu_needs_cpu(int cpu) | ||
1009 | { | ||
1010 | int c = 0; | ||
1011 | int thatcpu; | ||
1012 | |||
1013 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | ||
1014 | for_each_cpu_not(thatcpu, nohz_cpu_mask) | ||
1015 | if (thatcpu != cpu) { | ||
1016 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
1017 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
1018 | return rcu_needs_cpu_quick_check(cpu); | ||
1019 | } | ||
1020 | |||
1021 | /* Check and update the rcu_dyntick_drain sequencing. */ | ||
1022 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
1023 | /* First time through, initialize the counter. */ | ||
1024 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | ||
1025 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
1026 | /* We have hit the limit, so time to give up. */ | ||
1027 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | ||
1028 | return rcu_needs_cpu_quick_check(cpu); | ||
1029 | } | ||
1030 | |||
1031 | /* Do one step pushing remaining RCU callbacks through. */ | ||
1032 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | ||
1033 | rcu_sched_qs(cpu); | ||
1034 | force_quiescent_state(&rcu_sched_state, 0); | ||
1035 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | ||
1036 | } | ||
1037 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | ||
1038 | rcu_bh_qs(cpu); | ||
1039 | force_quiescent_state(&rcu_bh_state, 0); | ||
1040 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | ||
1041 | } | ||
1042 | |||
1043 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | ||
1044 | if (c) { | ||
1045 | raise_softirq(RCU_SOFTIRQ); | ||
1046 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | ||
1047 | } | ||
1048 | return c; | ||
1049 | } | ||
1050 | |||
1051 | /* | ||
1052 | * Check to see if we need to continue a callback-flush operations to | ||
1053 | * allow the last CPU to enter dyntick-idle mode. | ||
1054 | */ | ||
1055 | static void rcu_needs_cpu_flush(void) | ||
1056 | { | ||
1057 | int cpu = smp_processor_id(); | ||
1058 | unsigned long flags; | ||
1059 | |||
1060 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) | ||
1061 | return; | ||
1062 | local_irq_save(flags); | ||
1063 | (void)rcu_needs_cpu(cpu); | ||
1064 | local_irq_restore(flags); | ||
1065 | } | ||
1066 | |||
1067 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9d2c88423b31..d45db2e35d27 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
50 | { | 50 | { |
51 | if (!rdp->beenonline) | 51 | if (!rdp->beenonline) |
52 | return; | 52 | return; |
53 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", | 53 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", |
54 | rdp->cpu, | 54 | rdp->cpu, |
55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
56 | rdp->completed, rdp->gpnum, | 56 | rdp->completed, rdp->gpnum, |
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
105 | { | 105 | { |
106 | if (!rdp->beenonline) | 106 | if (!rdp->beenonline) |
107 | return; | 107 | return; |
108 | seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", | 108 | seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", |
109 | rdp->cpu, | 109 | rdp->cpu, |
110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
111 | rdp->completed, rdp->gpnum, | 111 | rdp->completed, rdp->gpnum, |
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = { | |||
155 | 155 | ||
156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
157 | { | 157 | { |
158 | long gpnum; | 158 | unsigned long gpnum; |
159 | int level = 0; | 159 | int level = 0; |
160 | int phase; | 160 | int phase; |
161 | struct rcu_node *rnp; | 161 | struct rcu_node *rnp; |
162 | 162 | ||
163 | gpnum = rsp->gpnum; | 163 | gpnum = rsp->gpnum; |
164 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " | 164 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", |
166 | rsp->completed, gpnum, rsp->signaled, | 166 | rsp->completed, gpnum, rsp->signaled, |
167 | (long)(rsp->jiffies_force_qs - jiffies), | 167 | (long)(rsp->jiffies_force_qs - jiffies), |
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = { | |||
215 | static int show_rcugp(struct seq_file *m, void *unused) | 215 | static int show_rcugp(struct seq_file *m, void *unused) |
216 | { | 216 | { |
217 | #ifdef CONFIG_TREE_PREEMPT_RCU | 217 | #ifdef CONFIG_TREE_PREEMPT_RCU |
218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", | 218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", |
219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | 219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); |
220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", | 221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", |
222 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 222 | rcu_sched_state.completed, rcu_sched_state.gpnum); |
223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", | 223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", |
224 | rcu_bh_state.completed, rcu_bh_state.gpnum); | 224 | rcu_bh_state.completed, rcu_bh_state.gpnum); |
225 | return 0; | 225 | return 0; |
226 | } | 226 | } |
diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba3..3d97f2821611 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) | |||
1215 | /* | 1215 | /* |
1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
1217 | */ | 1217 | */ |
1218 | static int subbuf_splice_actor(struct file *in, | 1218 | static ssize_t subbuf_splice_actor(struct file *in, |
1219 | loff_t *ppos, | 1219 | loff_t *ppos, |
1220 | struct pipe_inode_info *pipe, | 1220 | struct pipe_inode_info *pipe, |
1221 | size_t len, | 1221 | size_t len, |
1222 | unsigned int flags, | 1222 | unsigned int flags, |
1223 | int *nonpad_ret) | 1223 | int *nonpad_ret) |
1224 | { | 1224 | { |
1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; | 1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; |
1226 | struct rchan_buf *rbuf = in->private_data; | 1226 | struct rchan_buf *rbuf = in->private_data; |
1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | 1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; |
1228 | uint64_t pos = (uint64_t) *ppos; | 1228 | uint64_t pos = (uint64_t) *ppos; |
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in, | |||
1241 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
1242 | .spd_release = relay_page_release, | 1242 | .spd_release = relay_page_release, |
1243 | }; | 1243 | }; |
1244 | ssize_t ret; | ||
1244 | 1245 | ||
1245 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1246 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
1246 | return 0; | 1247 | return 0; |
diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54b..2d5be5d9bf5f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old) | |||
188 | return -EINVAL; | 188 | return -EINVAL; |
189 | } | 189 | } |
190 | 190 | ||
191 | static void __release_child_resources(struct resource *r) | ||
192 | { | ||
193 | struct resource *tmp, *p; | ||
194 | resource_size_t size; | ||
195 | |||
196 | p = r->child; | ||
197 | r->child = NULL; | ||
198 | while (p) { | ||
199 | tmp = p; | ||
200 | p = p->sibling; | ||
201 | |||
202 | tmp->parent = NULL; | ||
203 | tmp->sibling = NULL; | ||
204 | __release_child_resources(tmp); | ||
205 | |||
206 | printk(KERN_DEBUG "release child resource %pR\n", tmp); | ||
207 | /* need to restore size, and keep flags */ | ||
208 | size = resource_size(tmp); | ||
209 | tmp->start = 0; | ||
210 | tmp->end = size - 1; | ||
211 | } | ||
212 | } | ||
213 | |||
214 | void release_child_resources(struct resource *r) | ||
215 | { | ||
216 | write_lock(&resource_lock); | ||
217 | __release_child_resources(r); | ||
218 | write_unlock(&resource_lock); | ||
219 | } | ||
220 | |||
191 | /** | 221 | /** |
192 | * request_resource - request and reserve an I/O or memory resource | 222 | * request_resource - request and reserve an I/O or memory resource |
193 | * @root: root resource descriptor | 223 | * @root: root resource descriptor |
@@ -274,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
274 | void *arg, int (*func)(unsigned long, unsigned long, void *)) | 304 | void *arg, int (*func)(unsigned long, unsigned long, void *)) |
275 | { | 305 | { |
276 | struct resource res; | 306 | struct resource res; |
277 | unsigned long pfn, len; | 307 | unsigned long pfn, end_pfn; |
278 | u64 orig_end; | 308 | u64 orig_end; |
279 | int ret = -1; | 309 | int ret = -1; |
280 | 310 | ||
@@ -284,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
284 | orig_end = res.end; | 314 | orig_end = res.end; |
285 | while ((res.start < res.end) && | 315 | while ((res.start < res.end) && |
286 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 316 | (find_next_system_ram(&res, "System RAM") >= 0)) { |
287 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); | 317 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
288 | len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); | 318 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
289 | ret = (*func)(pfn, len, arg); | 319 | if (end_pfn > pfn) |
320 | ret = (*func)(pfn, end_pfn - pfn, arg); | ||
290 | if (ret) | 321 | if (ret) |
291 | break; | 322 | break; |
292 | res.start = res.end + 1; | 323 | res.start = res.end + 1; |
@@ -297,14 +328,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
297 | 328 | ||
298 | #endif | 329 | #endif |
299 | 330 | ||
331 | static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) | ||
332 | { | ||
333 | return 1; | ||
334 | } | ||
335 | /* | ||
336 | * This generic page_is_ram() returns true if specified address is | ||
337 | * registered as "System RAM" in iomem_resource list. | ||
338 | */ | ||
339 | int __weak page_is_ram(unsigned long pfn) | ||
340 | { | ||
341 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | ||
342 | } | ||
343 | |||
300 | /* | 344 | /* |
301 | * Find empty slot in the resource tree given range and alignment. | 345 | * Find empty slot in the resource tree given range and alignment. |
302 | */ | 346 | */ |
303 | static int find_resource(struct resource *root, struct resource *new, | 347 | static int find_resource(struct resource *root, struct resource *new, |
304 | resource_size_t size, resource_size_t min, | 348 | resource_size_t size, resource_size_t min, |
305 | resource_size_t max, resource_size_t align, | 349 | resource_size_t max, resource_size_t align, |
306 | void (*alignf)(void *, struct resource *, | 350 | resource_size_t (*alignf)(void *, |
307 | resource_size_t, resource_size_t), | 351 | const struct resource *, |
352 | resource_size_t, | ||
353 | resource_size_t), | ||
308 | void *alignf_data) | 354 | void *alignf_data) |
309 | { | 355 | { |
310 | struct resource *this = root->child; | 356 | struct resource *this = root->child; |
@@ -330,7 +376,7 @@ static int find_resource(struct resource *root, struct resource *new, | |||
330 | tmp.end = max; | 376 | tmp.end = max; |
331 | tmp.start = ALIGN(tmp.start, align); | 377 | tmp.start = ALIGN(tmp.start, align); |
332 | if (alignf) | 378 | if (alignf) |
333 | alignf(alignf_data, &tmp, size, align); | 379 | tmp.start = alignf(alignf_data, &tmp, size, align); |
334 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 380 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { |
335 | new->start = tmp.start; | 381 | new->start = tmp.start; |
336 | new->end = tmp.start + size - 1; | 382 | new->end = tmp.start + size - 1; |
@@ -358,8 +404,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
358 | int allocate_resource(struct resource *root, struct resource *new, | 404 | int allocate_resource(struct resource *root, struct resource *new, |
359 | resource_size_t size, resource_size_t min, | 405 | resource_size_t size, resource_size_t min, |
360 | resource_size_t max, resource_size_t align, | 406 | resource_size_t max, resource_size_t align, |
361 | void (*alignf)(void *, struct resource *, | 407 | resource_size_t (*alignf)(void *, |
362 | resource_size_t, resource_size_t), | 408 | const struct resource *, |
409 | resource_size_t, | ||
410 | resource_size_t), | ||
363 | void *alignf_data) | 411 | void *alignf_data) |
364 | { | 412 | { |
365 | int err; | 413 | int err; |
diff --git a/kernel/sched.c b/kernel/sched.c index 3e71ebb101c2..150b6988de49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
233 | */ | 233 | */ |
234 | static DEFINE_MUTEX(sched_domains_mutex); | 234 | static DEFINE_MUTEX(sched_domains_mutex); |
235 | 235 | ||
236 | #ifdef CONFIG_GROUP_SCHED | 236 | #ifdef CONFIG_CGROUP_SCHED |
237 | 237 | ||
238 | #include <linux/cgroup.h> | 238 | #include <linux/cgroup.h> |
239 | 239 | ||
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups); | |||
243 | 243 | ||
244 | /* task group related information */ | 244 | /* task group related information */ |
245 | struct task_group { | 245 | struct task_group { |
246 | #ifdef CONFIG_CGROUP_SCHED | ||
247 | struct cgroup_subsys_state css; | 246 | struct cgroup_subsys_state css; |
248 | #endif | ||
249 | |||
250 | #ifdef CONFIG_USER_SCHED | ||
251 | uid_t uid; | ||
252 | #endif | ||
253 | 247 | ||
254 | #ifdef CONFIG_FAIR_GROUP_SCHED | 248 | #ifdef CONFIG_FAIR_GROUP_SCHED |
255 | /* schedulable entities of this group on each cpu */ | 249 | /* schedulable entities of this group on each cpu */ |
@@ -274,35 +268,7 @@ struct task_group { | |||
274 | struct list_head children; | 268 | struct list_head children; |
275 | }; | 269 | }; |
276 | 270 | ||
277 | #ifdef CONFIG_USER_SCHED | ||
278 | |||
279 | /* Helper function to pass uid information to create_sched_user() */ | ||
280 | void set_tg_uid(struct user_struct *user) | ||
281 | { | ||
282 | user->tg->uid = user->uid; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Root task group. | ||
287 | * Every UID task group (including init_task_group aka UID-0) will | ||
288 | * be a child to this group. | ||
289 | */ | ||
290 | struct task_group root_task_group; | ||
291 | |||
292 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
293 | /* Default task group's sched entity on each cpu */ | ||
294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
295 | /* Default task group's cfs_rq on each cpu */ | ||
296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); | ||
297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
298 | |||
299 | #ifdef CONFIG_RT_GROUP_SCHED | ||
300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); | ||
302 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
303 | #else /* !CONFIG_USER_SCHED */ | ||
304 | #define root_task_group init_task_group | 271 | #define root_task_group init_task_group |
305 | #endif /* CONFIG_USER_SCHED */ | ||
306 | 272 | ||
307 | /* task_group_lock serializes add/remove of task groups and also changes to | 273 | /* task_group_lock serializes add/remove of task groups and also changes to |
308 | * a task group's cpu shares. | 274 | * a task group's cpu shares. |
@@ -318,11 +284,7 @@ static int root_task_group_empty(void) | |||
318 | } | 284 | } |
319 | #endif | 285 | #endif |
320 | 286 | ||
321 | #ifdef CONFIG_USER_SCHED | ||
322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | ||
323 | #else /* !CONFIG_USER_SCHED */ | ||
324 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 287 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
325 | #endif /* CONFIG_USER_SCHED */ | ||
326 | 288 | ||
327 | /* | 289 | /* |
328 | * A weight of 0 or 1 can cause arithmetics problems. | 290 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
348 | { | 310 | { |
349 | struct task_group *tg; | 311 | struct task_group *tg; |
350 | 312 | ||
351 | #ifdef CONFIG_USER_SCHED | 313 | #ifdef CONFIG_CGROUP_SCHED |
352 | rcu_read_lock(); | ||
353 | tg = __task_cred(p)->user->tg; | ||
354 | rcu_read_unlock(); | ||
355 | #elif defined(CONFIG_CGROUP_SCHED) | ||
356 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 314 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
357 | struct task_group, css); | 315 | struct task_group, css); |
358 | #else | 316 | #else |
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
383 | return NULL; | 341 | return NULL; |
384 | } | 342 | } |
385 | 343 | ||
386 | #endif /* CONFIG_GROUP_SCHED */ | 344 | #endif /* CONFIG_CGROUP_SCHED */ |
387 | 345 | ||
388 | /* CFS-related fields in a runqueue */ | 346 | /* CFS-related fields in a runqueue */ |
389 | struct cfs_rq { | 347 | struct cfs_rq { |
@@ -478,7 +436,6 @@ struct rt_rq { | |||
478 | struct rq *rq; | 436 | struct rq *rq; |
479 | struct list_head leaf_rt_rq_list; | 437 | struct list_head leaf_rt_rq_list; |
480 | struct task_group *tg; | 438 | struct task_group *tg; |
481 | struct sched_rt_entity *rt_se; | ||
482 | #endif | 439 | #endif |
483 | }; | 440 | }; |
484 | 441 | ||
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq) | |||
645 | #endif | 602 | #endif |
646 | } | 603 | } |
647 | 604 | ||
605 | #define rcu_dereference_check_sched_domain(p) \ | ||
606 | rcu_dereference_check((p), \ | ||
607 | rcu_read_lock_sched_held() || \ | ||
608 | lockdep_is_held(&sched_domains_mutex)) | ||
609 | |||
648 | /* | 610 | /* |
649 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 611 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
650 | * See detach_destroy_domains: synchronize_sched for details. | 612 | * See detach_destroy_domains: synchronize_sched for details. |
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq) | |||
653 | * preempt-disabled sections. | 615 | * preempt-disabled sections. |
654 | */ | 616 | */ |
655 | #define for_each_domain(cpu, __sd) \ | 617 | #define for_each_domain(cpu, __sd) \ |
656 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 618 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
657 | 619 | ||
658 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 620 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
659 | #define this_rq() (&__get_cpu_var(runqueues)) | 621 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
941 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 903 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
942 | 904 | ||
943 | /* | 905 | /* |
906 | * Check whether the task is waking, we use this to synchronize against | ||
907 | * ttwu() so that task_cpu() reports a stable number. | ||
908 | * | ||
909 | * We need to make an exception for PF_STARTING tasks because the fork | ||
910 | * path might require task_rq_lock() to work, eg. it can call | ||
911 | * set_cpus_allowed_ptr() from the cpuset clone_ns code. | ||
912 | */ | ||
913 | static inline int task_is_waking(struct task_struct *p) | ||
914 | { | ||
915 | return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); | ||
916 | } | ||
917 | |||
918 | /* | ||
944 | * __task_rq_lock - lock the runqueue a given task resides on. | 919 | * __task_rq_lock - lock the runqueue a given task resides on. |
945 | * Must be called interrupts disabled. | 920 | * Must be called interrupts disabled. |
946 | */ | 921 | */ |
947 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 922 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
948 | __acquires(rq->lock) | 923 | __acquires(rq->lock) |
949 | { | 924 | { |
925 | struct rq *rq; | ||
926 | |||
950 | for (;;) { | 927 | for (;;) { |
951 | struct rq *rq = task_rq(p); | 928 | while (task_is_waking(p)) |
929 | cpu_relax(); | ||
930 | rq = task_rq(p); | ||
952 | raw_spin_lock(&rq->lock); | 931 | raw_spin_lock(&rq->lock); |
953 | if (likely(rq == task_rq(p))) | 932 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
954 | return rq; | 933 | return rq; |
955 | raw_spin_unlock(&rq->lock); | 934 | raw_spin_unlock(&rq->lock); |
956 | } | 935 | } |
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
967 | struct rq *rq; | 946 | struct rq *rq; |
968 | 947 | ||
969 | for (;;) { | 948 | for (;;) { |
949 | while (task_is_waking(p)) | ||
950 | cpu_relax(); | ||
970 | local_irq_save(*flags); | 951 | local_irq_save(*flags); |
971 | rq = task_rq(p); | 952 | rq = task_rq(p); |
972 | raw_spin_lock(&rq->lock); | 953 | raw_spin_lock(&rq->lock); |
973 | if (likely(rq == task_rq(p))) | 954 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
974 | return rq; | 955 | return rq; |
975 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 956 | raw_spin_unlock_irqrestore(&rq->lock, *flags); |
976 | } | 957 | } |
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = { | |||
1390 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1371 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1391 | }; | 1372 | }; |
1392 | 1373 | ||
1393 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
1394 | |||
1395 | /* | ||
1396 | * runqueue iterator, to support SMP load-balancing between different | ||
1397 | * scheduling classes, without having to expose their internal data | ||
1398 | * structures to the load-balancing proper: | ||
1399 | */ | ||
1400 | struct rq_iterator { | ||
1401 | void *arg; | ||
1402 | struct task_struct *(*start)(void *); | ||
1403 | struct task_struct *(*next)(void *); | ||
1404 | }; | ||
1405 | |||
1406 | #ifdef CONFIG_SMP | ||
1407 | static unsigned long | ||
1408 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1409 | unsigned long max_load_move, struct sched_domain *sd, | ||
1410 | enum cpu_idle_type idle, int *all_pinned, | ||
1411 | int *this_best_prio, struct rq_iterator *iterator); | ||
1412 | |||
1413 | static int | ||
1414 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1415 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1416 | struct rq_iterator *iterator); | ||
1417 | #endif | ||
1418 | |||
1419 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 1374 | /* Time spent by the tasks of the cpu accounting group executing in ... */ |
1420 | enum cpuacct_stat_index { | 1375 | enum cpuacct_stat_index { |
1421 | CPUACCT_STAT_USER, /* ... user mode */ | 1376 | CPUACCT_STAT_USER, /* ... user mode */ |
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type) | |||
1531 | 1486 | ||
1532 | static struct sched_group *group_of(int cpu) | 1487 | static struct sched_group *group_of(int cpu) |
1533 | { | 1488 | { |
1534 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | 1489 | struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); |
1535 | 1490 | ||
1536 | if (!sd) | 1491 | if (!sd) |
1537 | return NULL; | 1492 | return NULL; |
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1566 | 1521 | ||
1567 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1522 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1568 | 1523 | ||
1569 | static __read_mostly unsigned long *update_shares_data; | 1524 | static __read_mostly unsigned long __percpu *update_shares_data; |
1570 | 1525 | ||
1571 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1526 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1572 | 1527 | ||
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd) | |||
1701 | } | 1656 | } |
1702 | } | 1657 | } |
1703 | 1658 | ||
1704 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1705 | { | ||
1706 | if (root_task_group_empty()) | ||
1707 | return; | ||
1708 | |||
1709 | raw_spin_unlock(&rq->lock); | ||
1710 | update_shares(sd); | ||
1711 | raw_spin_lock(&rq->lock); | ||
1712 | } | ||
1713 | |||
1714 | static void update_h_load(long cpu) | 1659 | static void update_h_load(long cpu) |
1715 | { | 1660 | { |
1716 | if (root_task_group_empty()) | 1661 | if (root_task_group_empty()) |
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd) | |||
1725 | { | 1670 | { |
1726 | } | 1671 | } |
1727 | 1672 | ||
1728 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1729 | { | ||
1730 | } | ||
1731 | |||
1732 | #endif | 1673 | #endif |
1733 | 1674 | ||
1734 | #ifdef CONFIG_PREEMPT | 1675 | #ifdef CONFIG_PREEMPT |
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
1805 | raw_spin_unlock(&busiest->lock); | 1746 | raw_spin_unlock(&busiest->lock); |
1806 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1747 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
1807 | } | 1748 | } |
1749 | |||
1750 | /* | ||
1751 | * double_rq_lock - safely lock two runqueues | ||
1752 | * | ||
1753 | * Note this does not disable interrupts like task_rq_lock, | ||
1754 | * you need to do so manually before calling. | ||
1755 | */ | ||
1756 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1757 | __acquires(rq1->lock) | ||
1758 | __acquires(rq2->lock) | ||
1759 | { | ||
1760 | BUG_ON(!irqs_disabled()); | ||
1761 | if (rq1 == rq2) { | ||
1762 | raw_spin_lock(&rq1->lock); | ||
1763 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1764 | } else { | ||
1765 | if (rq1 < rq2) { | ||
1766 | raw_spin_lock(&rq1->lock); | ||
1767 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1768 | } else { | ||
1769 | raw_spin_lock(&rq2->lock); | ||
1770 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1771 | } | ||
1772 | } | ||
1773 | update_rq_clock(rq1); | ||
1774 | update_rq_clock(rq2); | ||
1775 | } | ||
1776 | |||
1777 | /* | ||
1778 | * double_rq_unlock - safely unlock two runqueues | ||
1779 | * | ||
1780 | * Note this does not restore interrupts like task_rq_unlock, | ||
1781 | * you need to do so manually after calling. | ||
1782 | */ | ||
1783 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1784 | __releases(rq1->lock) | ||
1785 | __releases(rq2->lock) | ||
1786 | { | ||
1787 | raw_spin_unlock(&rq1->lock); | ||
1788 | if (rq1 != rq2) | ||
1789 | raw_spin_unlock(&rq2->lock); | ||
1790 | else | ||
1791 | __release(rq2->lock); | ||
1792 | } | ||
1793 | |||
1808 | #endif | 1794 | #endif |
1809 | 1795 | ||
1810 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1796 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1834 | #endif | 1820 | #endif |
1835 | } | 1821 | } |
1836 | 1822 | ||
1837 | #include "sched_stats.h" | 1823 | static const struct sched_class rt_sched_class; |
1838 | #include "sched_idletask.c" | ||
1839 | #include "sched_fair.c" | ||
1840 | #include "sched_rt.c" | ||
1841 | #ifdef CONFIG_SCHED_DEBUG | ||
1842 | # include "sched_debug.c" | ||
1843 | #endif | ||
1844 | 1824 | ||
1845 | #define sched_class_highest (&rt_sched_class) | 1825 | #define sched_class_highest (&rt_sched_class) |
1846 | #define for_each_class(class) \ | 1826 | #define for_each_class(class) \ |
1847 | for (class = sched_class_highest; class; class = class->next) | 1827 | for (class = sched_class_highest; class; class = class->next) |
1848 | 1828 | ||
1829 | #include "sched_stats.h" | ||
1830 | |||
1849 | static void inc_nr_running(struct rq *rq) | 1831 | static void inc_nr_running(struct rq *rq) |
1850 | { | 1832 | { |
1851 | rq->nr_running++; | 1833 | rq->nr_running++; |
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample) | |||
1883 | *avg += diff >> 3; | 1865 | *avg += diff >> 3; |
1884 | } | 1866 | } |
1885 | 1867 | ||
1886 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1868 | static void |
1869 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
1887 | { | 1870 | { |
1888 | if (wakeup) | 1871 | if (wakeup) |
1889 | p->se.start_runtime = p->se.sum_exec_runtime; | 1872 | p->se.start_runtime = p->se.sum_exec_runtime; |
1890 | 1873 | ||
1891 | sched_info_queued(p); | 1874 | sched_info_queued(p); |
1892 | p->sched_class->enqueue_task(rq, p, wakeup); | 1875 | p->sched_class->enqueue_task(rq, p, wakeup, head); |
1893 | p->se.on_rq = 1; | 1876 | p->se.on_rq = 1; |
1894 | } | 1877 | } |
1895 | 1878 | ||
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1912 | } | 1895 | } |
1913 | 1896 | ||
1914 | /* | 1897 | /* |
1898 | * activate_task - move a task to the runqueue. | ||
1899 | */ | ||
1900 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1901 | { | ||
1902 | if (task_contributes_to_load(p)) | ||
1903 | rq->nr_uninterruptible--; | ||
1904 | |||
1905 | enqueue_task(rq, p, wakeup, false); | ||
1906 | inc_nr_running(rq); | ||
1907 | } | ||
1908 | |||
1909 | /* | ||
1910 | * deactivate_task - remove a task from the runqueue. | ||
1911 | */ | ||
1912 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1913 | { | ||
1914 | if (task_contributes_to_load(p)) | ||
1915 | rq->nr_uninterruptible++; | ||
1916 | |||
1917 | dequeue_task(rq, p, sleep); | ||
1918 | dec_nr_running(rq); | ||
1919 | } | ||
1920 | |||
1921 | #include "sched_idletask.c" | ||
1922 | #include "sched_fair.c" | ||
1923 | #include "sched_rt.c" | ||
1924 | #ifdef CONFIG_SCHED_DEBUG | ||
1925 | # include "sched_debug.c" | ||
1926 | #endif | ||
1927 | |||
1928 | /* | ||
1915 | * __normal_prio - return the priority that is based on the static prio | 1929 | * __normal_prio - return the priority that is based on the static prio |
1916 | */ | 1930 | */ |
1917 | static inline int __normal_prio(struct task_struct *p) | 1931 | static inline int __normal_prio(struct task_struct *p) |
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p) | |||
1957 | return p->prio; | 1971 | return p->prio; |
1958 | } | 1972 | } |
1959 | 1973 | ||
1960 | /* | ||
1961 | * activate_task - move a task to the runqueue. | ||
1962 | */ | ||
1963 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1964 | { | ||
1965 | if (task_contributes_to_load(p)) | ||
1966 | rq->nr_uninterruptible--; | ||
1967 | |||
1968 | enqueue_task(rq, p, wakeup); | ||
1969 | inc_nr_running(rq); | ||
1970 | } | ||
1971 | |||
1972 | /* | ||
1973 | * deactivate_task - remove a task from the runqueue. | ||
1974 | */ | ||
1975 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1976 | { | ||
1977 | if (task_contributes_to_load(p)) | ||
1978 | rq->nr_uninterruptible++; | ||
1979 | |||
1980 | dequeue_task(rq, p, sleep); | ||
1981 | dec_nr_running(rq); | ||
1982 | } | ||
1983 | |||
1984 | /** | 1974 | /** |
1985 | * task_curr - is this task currently executing on a CPU? | 1975 | * task_curr - is this task currently executing on a CPU? |
1986 | * @p: the task in question. | 1976 | * @p: the task in question. |
@@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2408 | __task_rq_unlock(rq); | 2398 | __task_rq_unlock(rq); |
2409 | 2399 | ||
2410 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2400 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2411 | if (cpu != orig_cpu) | 2401 | if (cpu != orig_cpu) { |
2402 | /* | ||
2403 | * Since we migrate the task without holding any rq->lock, | ||
2404 | * we need to be careful with task_rq_lock(), since that | ||
2405 | * might end up locking an invalid rq. | ||
2406 | */ | ||
2412 | set_task_cpu(p, cpu); | 2407 | set_task_cpu(p, cpu); |
2408 | } | ||
2413 | 2409 | ||
2414 | rq = __task_rq_lock(p); | 2410 | rq = cpu_rq(cpu); |
2411 | raw_spin_lock(&rq->lock); | ||
2415 | update_rq_clock(rq); | 2412 | update_rq_clock(rq); |
2416 | 2413 | ||
2414 | /* | ||
2415 | * We migrated the task without holding either rq->lock, however | ||
2416 | * since the task is not on the task list itself, nobody else | ||
2417 | * will try and migrate the task, hence the rq should match the | ||
2418 | * cpu we just moved it to. | ||
2419 | */ | ||
2420 | WARN_ON(task_cpu(p) != cpu); | ||
2417 | WARN_ON(p->state != TASK_WAKING); | 2421 | WARN_ON(p->state != TASK_WAKING); |
2418 | cpu = task_cpu(p); | ||
2419 | 2422 | ||
2420 | #ifdef CONFIG_SCHEDSTATS | 2423 | #ifdef CONFIG_SCHEDSTATS |
2421 | schedstat_inc(rq, ttwu_count); | 2424 | schedstat_inc(rq, ttwu_count); |
@@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2663 | set_task_cpu(p, cpu); | 2666 | set_task_cpu(p, cpu); |
2664 | #endif | 2667 | #endif |
2665 | 2668 | ||
2666 | rq = task_rq_lock(p, &flags); | 2669 | /* |
2670 | * Since the task is not on the rq and we still have TASK_WAKING set | ||
2671 | * nobody else will migrate this task. | ||
2672 | */ | ||
2673 | rq = cpu_rq(cpu); | ||
2674 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2675 | |||
2667 | BUG_ON(p->state != TASK_WAKING); | 2676 | BUG_ON(p->state != TASK_WAKING); |
2668 | p->state = TASK_RUNNING; | 2677 | p->state = TASK_RUNNING; |
2669 | update_rq_clock(rq); | 2678 | update_rq_clock(rq); |
@@ -3105,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq) | |||
3105 | #ifdef CONFIG_SMP | 3114 | #ifdef CONFIG_SMP |
3106 | 3115 | ||
3107 | /* | 3116 | /* |
3108 | * double_rq_lock - safely lock two runqueues | ||
3109 | * | ||
3110 | * Note this does not disable interrupts like task_rq_lock, | ||
3111 | * you need to do so manually before calling. | ||
3112 | */ | ||
3113 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
3114 | __acquires(rq1->lock) | ||
3115 | __acquires(rq2->lock) | ||
3116 | { | ||
3117 | BUG_ON(!irqs_disabled()); | ||
3118 | if (rq1 == rq2) { | ||
3119 | raw_spin_lock(&rq1->lock); | ||
3120 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
3121 | } else { | ||
3122 | if (rq1 < rq2) { | ||
3123 | raw_spin_lock(&rq1->lock); | ||
3124 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
3125 | } else { | ||
3126 | raw_spin_lock(&rq2->lock); | ||
3127 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
3128 | } | ||
3129 | } | ||
3130 | update_rq_clock(rq1); | ||
3131 | update_rq_clock(rq2); | ||
3132 | } | ||
3133 | |||
3134 | /* | ||
3135 | * double_rq_unlock - safely unlock two runqueues | ||
3136 | * | ||
3137 | * Note this does not restore interrupts like task_rq_unlock, | ||
3138 | * you need to do so manually after calling. | ||
3139 | */ | ||
3140 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
3141 | __releases(rq1->lock) | ||
3142 | __releases(rq2->lock) | ||
3143 | { | ||
3144 | raw_spin_unlock(&rq1->lock); | ||
3145 | if (rq1 != rq2) | ||
3146 | raw_spin_unlock(&rq2->lock); | ||
3147 | else | ||
3148 | __release(rq2->lock); | ||
3149 | } | ||
3150 | |||
3151 | /* | ||
3152 | * sched_exec - execve() is a valuable balancing opportunity, because at | 3117 | * sched_exec - execve() is a valuable balancing opportunity, because at |
3153 | * this point the task has the smallest effective memory and cache footprint. | 3118 | * this point the task has the smallest effective memory and cache footprint. |
3154 | */ | 3119 | */ |
@@ -3196,1771 +3161,6 @@ again: | |||
3196 | task_rq_unlock(rq, &flags); | 3161 | task_rq_unlock(rq, &flags); |
3197 | } | 3162 | } |
3198 | 3163 | ||
3199 | /* | ||
3200 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
3201 | * Both runqueues must be locked. | ||
3202 | */ | ||
3203 | static void pull_task(struct rq *src_rq, struct task_struct *p, | ||
3204 | struct rq *this_rq, int this_cpu) | ||
3205 | { | ||
3206 | deactivate_task(src_rq, p, 0); | ||
3207 | set_task_cpu(p, this_cpu); | ||
3208 | activate_task(this_rq, p, 0); | ||
3209 | check_preempt_curr(this_rq, p, 0); | ||
3210 | } | ||
3211 | |||
3212 | /* | ||
3213 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
3214 | */ | ||
3215 | static | ||
3216 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
3217 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3218 | int *all_pinned) | ||
3219 | { | ||
3220 | int tsk_cache_hot = 0; | ||
3221 | /* | ||
3222 | * We do not migrate tasks that are: | ||
3223 | * 1) running (obviously), or | ||
3224 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
3225 | * 3) are cache-hot on their current CPU. | ||
3226 | */ | ||
3227 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
3228 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
3229 | return 0; | ||
3230 | } | ||
3231 | *all_pinned = 0; | ||
3232 | |||
3233 | if (task_running(rq, p)) { | ||
3234 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
3235 | return 0; | ||
3236 | } | ||
3237 | |||
3238 | /* | ||
3239 | * Aggressive migration if: | ||
3240 | * 1) task is cache cold, or | ||
3241 | * 2) too many balance attempts have failed. | ||
3242 | */ | ||
3243 | |||
3244 | tsk_cache_hot = task_hot(p, rq->clock, sd); | ||
3245 | if (!tsk_cache_hot || | ||
3246 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
3247 | #ifdef CONFIG_SCHEDSTATS | ||
3248 | if (tsk_cache_hot) { | ||
3249 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
3250 | schedstat_inc(p, se.nr_forced_migrations); | ||
3251 | } | ||
3252 | #endif | ||
3253 | return 1; | ||
3254 | } | ||
3255 | |||
3256 | if (tsk_cache_hot) { | ||
3257 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
3258 | return 0; | ||
3259 | } | ||
3260 | return 1; | ||
3261 | } | ||
3262 | |||
3263 | static unsigned long | ||
3264 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3265 | unsigned long max_load_move, struct sched_domain *sd, | ||
3266 | enum cpu_idle_type idle, int *all_pinned, | ||
3267 | int *this_best_prio, struct rq_iterator *iterator) | ||
3268 | { | ||
3269 | int loops = 0, pulled = 0, pinned = 0; | ||
3270 | struct task_struct *p; | ||
3271 | long rem_load_move = max_load_move; | ||
3272 | |||
3273 | if (max_load_move == 0) | ||
3274 | goto out; | ||
3275 | |||
3276 | pinned = 1; | ||
3277 | |||
3278 | /* | ||
3279 | * Start the load-balancing iterator: | ||
3280 | */ | ||
3281 | p = iterator->start(iterator->arg); | ||
3282 | next: | ||
3283 | if (!p || loops++ > sysctl_sched_nr_migrate) | ||
3284 | goto out; | ||
3285 | |||
3286 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
3287 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3288 | p = iterator->next(iterator->arg); | ||
3289 | goto next; | ||
3290 | } | ||
3291 | |||
3292 | pull_task(busiest, p, this_rq, this_cpu); | ||
3293 | pulled++; | ||
3294 | rem_load_move -= p->se.load.weight; | ||
3295 | |||
3296 | #ifdef CONFIG_PREEMPT | ||
3297 | /* | ||
3298 | * NEWIDLE balancing is a source of latency, so preemptible kernels | ||
3299 | * will stop after the first task is pulled to minimize the critical | ||
3300 | * section. | ||
3301 | */ | ||
3302 | if (idle == CPU_NEWLY_IDLE) | ||
3303 | goto out; | ||
3304 | #endif | ||
3305 | |||
3306 | /* | ||
3307 | * We only want to steal up to the prescribed amount of weighted load. | ||
3308 | */ | ||
3309 | if (rem_load_move > 0) { | ||
3310 | if (p->prio < *this_best_prio) | ||
3311 | *this_best_prio = p->prio; | ||
3312 | p = iterator->next(iterator->arg); | ||
3313 | goto next; | ||
3314 | } | ||
3315 | out: | ||
3316 | /* | ||
3317 | * Right now, this is one of only two places pull_task() is called, | ||
3318 | * so we can safely collect pull_task() stats here rather than | ||
3319 | * inside pull_task(). | ||
3320 | */ | ||
3321 | schedstat_add(sd, lb_gained[idle], pulled); | ||
3322 | |||
3323 | if (all_pinned) | ||
3324 | *all_pinned = pinned; | ||
3325 | |||
3326 | return max_load_move - rem_load_move; | ||
3327 | } | ||
3328 | |||
3329 | /* | ||
3330 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
3331 | * this_rq, as part of a balancing operation within domain "sd". | ||
3332 | * Returns 1 if successful and 0 otherwise. | ||
3333 | * | ||
3334 | * Called with both runqueues locked. | ||
3335 | */ | ||
3336 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3337 | unsigned long max_load_move, | ||
3338 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3339 | int *all_pinned) | ||
3340 | { | ||
3341 | const struct sched_class *class = sched_class_highest; | ||
3342 | unsigned long total_load_moved = 0; | ||
3343 | int this_best_prio = this_rq->curr->prio; | ||
3344 | |||
3345 | do { | ||
3346 | total_load_moved += | ||
3347 | class->load_balance(this_rq, this_cpu, busiest, | ||
3348 | max_load_move - total_load_moved, | ||
3349 | sd, idle, all_pinned, &this_best_prio); | ||
3350 | class = class->next; | ||
3351 | |||
3352 | #ifdef CONFIG_PREEMPT | ||
3353 | /* | ||
3354 | * NEWIDLE balancing is a source of latency, so preemptible | ||
3355 | * kernels will stop after the first task is pulled to minimize | ||
3356 | * the critical section. | ||
3357 | */ | ||
3358 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3359 | break; | ||
3360 | #endif | ||
3361 | } while (class && max_load_move > total_load_moved); | ||
3362 | |||
3363 | return total_load_moved > 0; | ||
3364 | } | ||
3365 | |||
3366 | static int | ||
3367 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3368 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3369 | struct rq_iterator *iterator) | ||
3370 | { | ||
3371 | struct task_struct *p = iterator->start(iterator->arg); | ||
3372 | int pinned = 0; | ||
3373 | |||
3374 | while (p) { | ||
3375 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3376 | pull_task(busiest, p, this_rq, this_cpu); | ||
3377 | /* | ||
3378 | * Right now, this is only the second place pull_task() | ||
3379 | * is called, so we can safely collect pull_task() | ||
3380 | * stats here rather than inside pull_task(). | ||
3381 | */ | ||
3382 | schedstat_inc(sd, lb_gained[idle]); | ||
3383 | |||
3384 | return 1; | ||
3385 | } | ||
3386 | p = iterator->next(iterator->arg); | ||
3387 | } | ||
3388 | |||
3389 | return 0; | ||
3390 | } | ||
3391 | |||
3392 | /* | ||
3393 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
3394 | * part of active balancing operations within "domain". | ||
3395 | * Returns 1 if successful and 0 otherwise. | ||
3396 | * | ||
3397 | * Called with both runqueues locked. | ||
3398 | */ | ||
3399 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3400 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
3401 | { | ||
3402 | const struct sched_class *class; | ||
3403 | |||
3404 | for_each_class(class) { | ||
3405 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | ||
3406 | return 1; | ||
3407 | } | ||
3408 | |||
3409 | return 0; | ||
3410 | } | ||
3411 | /********** Helpers for find_busiest_group ************************/ | ||
3412 | /* | ||
3413 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
3414 | * during load balancing. | ||
3415 | */ | ||
3416 | struct sd_lb_stats { | ||
3417 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
3418 | struct sched_group *this; /* Local group in this sd */ | ||
3419 | unsigned long total_load; /* Total load of all groups in sd */ | ||
3420 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
3421 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
3422 | |||
3423 | /** Statistics of this group */ | ||
3424 | unsigned long this_load; | ||
3425 | unsigned long this_load_per_task; | ||
3426 | unsigned long this_nr_running; | ||
3427 | |||
3428 | /* Statistics of the busiest group */ | ||
3429 | unsigned long max_load; | ||
3430 | unsigned long busiest_load_per_task; | ||
3431 | unsigned long busiest_nr_running; | ||
3432 | |||
3433 | int group_imb; /* Is there imbalance in this sd */ | ||
3434 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3435 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3436 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3437 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3438 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3439 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3440 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3441 | #endif | ||
3442 | }; | ||
3443 | |||
3444 | /* | ||
3445 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
3446 | */ | ||
3447 | struct sg_lb_stats { | ||
3448 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
3449 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
3450 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
3451 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
3452 | unsigned long group_capacity; | ||
3453 | int group_imb; /* Is there an imbalance in the group ? */ | ||
3454 | }; | ||
3455 | |||
/**
 * group_first_cpu - Look up the first cpu of a sched_group.
 * @group: sched_group to inspect.
 *
 * Returns whatever cpumask_first() reports for the group's cpumask.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
3464 | |||
3465 | /** | ||
3466 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
3467 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
3468 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | ||
3469 | */ | ||
3470 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
3471 | enum cpu_idle_type idle) | ||
3472 | { | ||
3473 | int load_idx; | ||
3474 | |||
3475 | switch (idle) { | ||
3476 | case CPU_NOT_IDLE: | ||
3477 | load_idx = sd->busy_idx; | ||
3478 | break; | ||
3479 | |||
3480 | case CPU_NEWLY_IDLE: | ||
3481 | load_idx = sd->newidle_idx; | ||
3482 | break; | ||
3483 | default: | ||
3484 | load_idx = sd->idle_idx; | ||
3485 | break; | ||
3486 | } | ||
3487 | |||
3488 | return load_idx; | ||
3489 | } | ||
3490 | |||
3491 | |||
3492 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3493 | /** | ||
3494 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3495 | * the given sched_domain, during load balancing. | ||
3496 | * | ||
3497 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3498 | * @sds: Variable containing the statistics for sd. | ||
3499 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3500 | */ | ||
3501 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3502 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3503 | { | ||
3504 | /* | ||
3505 | * Busy processors will not participate in power savings | ||
3506 | * balance. | ||
3507 | */ | ||
3508 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3509 | sds->power_savings_balance = 0; | ||
3510 | else { | ||
3511 | sds->power_savings_balance = 1; | ||
3512 | sds->min_nr_running = ULONG_MAX; | ||
3513 | sds->leader_nr_running = 0; | ||
3514 | } | ||
3515 | } | ||
3516 | |||
3517 | /** | ||
3518 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3519 | * sched_domain while performing load balancing. | ||
3520 | * | ||
3521 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3522 | * @sds: Variable containing the statistics of the sched_domain | ||
3523 | * @local_group: Does group contain the CPU for which we're performing | ||
3524 | * load balancing ? | ||
3525 | * @sgs: Variable containing the statistics of the group. | ||
3526 | */ | ||
3527 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3528 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3529 | { | ||
3530 | |||
3531 | if (!sds->power_savings_balance) | ||
3532 | return; | ||
3533 | |||
3534 | /* | ||
3535 | * If the local group is idle or completely loaded | ||
3536 | * no need to do power savings balance at this domain | ||
3537 | */ | ||
3538 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3539 | !sds->this_nr_running)) | ||
3540 | sds->power_savings_balance = 0; | ||
3541 | |||
3542 | /* | ||
3543 | * If a group is already running at full capacity or idle, | ||
3544 | * don't include that group in power savings calculations | ||
3545 | */ | ||
3546 | if (!sds->power_savings_balance || | ||
3547 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3548 | !sgs->sum_nr_running) | ||
3549 | return; | ||
3550 | |||
3551 | /* | ||
3552 | * Calculate the group which has the least non-idle load. | ||
3553 | * This is the group from where we need to pick up the load | ||
3554 | * for saving power | ||
3555 | */ | ||
3556 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3557 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3558 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3559 | sds->group_min = group; | ||
3560 | sds->min_nr_running = sgs->sum_nr_running; | ||
3561 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3562 | sgs->sum_nr_running; | ||
3563 | } | ||
3564 | |||
3565 | /* | ||
3566 | * Calculate the group which is almost near its | ||
3567 | * capacity but still has some space to pick up some load | ||
3568 | * from other group and save more power | ||
3569 | */ | ||
3570 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3571 | return; | ||
3572 | |||
3573 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3574 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3575 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3576 | sds->group_leader = group; | ||
3577 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3578 | } | ||
3579 | } | ||
3580 | |||
3581 | /** | ||
3582 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3583 | * @sds: Variable containing the statistics of the sched_domain | ||
3584 | * under consideration. | ||
3585 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3586 | * @imbalance: Variable to store the imbalance. | ||
3587 | * | ||
3588 | * Description: | ||
3589 | * Check if we have potential to perform some power-savings balance. | ||
3590 | * If yes, set the busiest group to be the least loaded group in the | ||
3591 | * sched_domain, so that it's CPUs can be put to idle. | ||
3592 | * | ||
3593 | * Returns 1 if there is potential to perform power-savings balance. | ||
3594 | * Else returns 0. | ||
3595 | */ | ||
3596 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3597 | int this_cpu, unsigned long *imbalance) | ||
3598 | { | ||
3599 | if (!sds->power_savings_balance) | ||
3600 | return 0; | ||
3601 | |||
3602 | if (sds->this != sds->group_leader || | ||
3603 | sds->group_leader == sds->group_min) | ||
3604 | return 0; | ||
3605 | |||
3606 | *imbalance = sds->min_load_per_task; | ||
3607 | sds->busiest = sds->group_min; | ||
3608 | |||
3609 | return 1; | ||
3610 | |||
3611 | } | ||
3612 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3613 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3614 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3615 | { | ||
3616 | return; | ||
3617 | } | ||
3618 | |||
/* No-op variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3624 | |||
/*
 * Variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * always reports that no power-savings balance is possible.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
3630 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3631 | |||
3632 | |||
3633 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3634 | { | ||
3635 | return SCHED_LOAD_SCALE; | ||
3636 | } | ||
3637 | |||
3638 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3639 | { | ||
3640 | return default_scale_freq_power(sd, cpu); | ||
3641 | } | ||
3642 | |||
3643 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3644 | { | ||
3645 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3646 | unsigned long smt_gain = sd->smt_gain; | ||
3647 | |||
3648 | smt_gain /= weight; | ||
3649 | |||
3650 | return smt_gain; | ||
3651 | } | ||
3652 | |||
3653 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3654 | { | ||
3655 | return default_scale_smt_power(sd, cpu); | ||
3656 | } | ||
3657 | |||
3658 | unsigned long scale_rt_power(int cpu) | ||
3659 | { | ||
3660 | struct rq *rq = cpu_rq(cpu); | ||
3661 | u64 total, available; | ||
3662 | |||
3663 | sched_avg_update(rq); | ||
3664 | |||
3665 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3666 | available = total - rq->rt_avg; | ||
3667 | |||
3668 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3669 | total = SCHED_LOAD_SCALE; | ||
3670 | |||
3671 | total >>= SCHED_LOAD_SHIFT; | ||
3672 | |||
3673 | return div_u64(available, total); | ||
3674 | } | ||
3675 | |||
3676 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3677 | { | ||
3678 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3679 | unsigned long power = SCHED_LOAD_SCALE; | ||
3680 | struct sched_group *sdg = sd->groups; | ||
3681 | |||
3682 | if (sched_feat(ARCH_POWER)) | ||
3683 | power *= arch_scale_freq_power(sd, cpu); | ||
3684 | else | ||
3685 | power *= default_scale_freq_power(sd, cpu); | ||
3686 | |||
3687 | power >>= SCHED_LOAD_SHIFT; | ||
3688 | |||
3689 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3690 | if (sched_feat(ARCH_POWER)) | ||
3691 | power *= arch_scale_smt_power(sd, cpu); | ||
3692 | else | ||
3693 | power *= default_scale_smt_power(sd, cpu); | ||
3694 | |||
3695 | power >>= SCHED_LOAD_SHIFT; | ||
3696 | } | ||
3697 | |||
3698 | power *= scale_rt_power(cpu); | ||
3699 | power >>= SCHED_LOAD_SHIFT; | ||
3700 | |||
3701 | if (!power) | ||
3702 | power = 1; | ||
3703 | |||
3704 | sdg->cpu_power = power; | ||
3705 | } | ||
3706 | |||
3707 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3708 | { | ||
3709 | struct sched_domain *child = sd->child; | ||
3710 | struct sched_group *group, *sdg = sd->groups; | ||
3711 | unsigned long power; | ||
3712 | |||
3713 | if (!child) { | ||
3714 | update_cpu_power(sd, cpu); | ||
3715 | return; | ||
3716 | } | ||
3717 | |||
3718 | power = 0; | ||
3719 | |||
3720 | group = child->groups; | ||
3721 | do { | ||
3722 | power += group->cpu_power; | ||
3723 | group = group->next; | ||
3724 | } while (group != child->groups); | ||
3725 | |||
3726 | sdg->cpu_power = power; | ||
3727 | } | ||
3728 | |||
3729 | /** | ||
3730 | * update_sg_lb_stats - Collect one sched_group's load-balancing statistics into @sgs. | ||
3731 | * @sd: The sched_domain whose statistics are to be updated. | ||
3732 | * @group: sched_group whose statistics are to be updated. | ||
3733 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3734 | * @idle: Idle status of this_cpu | ||
3735 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
3736 | * @sd_idle: Idle status of the sched_domain containing group; cleared to 0 as soon as a visited cpu has a non-zero nr_running. | ||
3737 | * @local_group: Does group contain this_cpu. | ||
3738 | * @cpus: Set of cpus considered for load balancing; only cpus present in both @cpus and the group are visited. | ||
3739 | * @balance: Should we balance. May be NULL. When non-NULL it is set to 0 if @group is the local group, @idle is not CPU_NEWLY_IDLE and this_cpu is not the chosen balance cpu; the function then returns before avg_load, group_imb and group_capacity are filled in. | ||
3740 | * @sgs: variable to hold the statistics for this group. Counters are accumulated with +=, so the caller is expected to pass it zeroed. | ||
3741 | */ | ||
3742 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
3743 | struct sched_group *group, int this_cpu, | ||
3744 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
3745 | int local_group, const struct cpumask *cpus, | ||
3746 | int *balance, struct sg_lb_stats *sgs) | ||
3747 | { | ||
3748 | unsigned long load, max_cpu_load, min_cpu_load; | ||
3749 | int i; | ||
3750 | unsigned int balance_cpu = -1, first_idle_cpu = 0; /* first_idle_cpu is a flag: an idle cpu has already been chosen */ | ||
3751 | unsigned long sum_avg_load_per_task; | ||
3752 | unsigned long avg_load_per_task; | ||
3753 | |||
3754 | if (local_group) { | ||
3755 | balance_cpu = group_first_cpu(group); /* default balance cpu: first cpu of the local group */ | ||
3756 | if (balance_cpu == this_cpu) | ||
3757 | update_group_power(sd, this_cpu); /* only that first cpu refreshes the group power */ | ||
3758 | } | ||
3759 | |||
3760 | /* Tally up the load of all CPUs in the group */ | ||
3761 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3762 | max_cpu_load = 0; | ||
3763 | min_cpu_load = ~0UL; | ||
3764 | |||
3765 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
3766 | struct rq *rq = cpu_rq(i); | ||
3767 | |||
3768 | if (*sd_idle && rq->nr_running) | ||
3769 | *sd_idle = 0; | ||
3770 | |||
3771 | /* Bias balancing toward cpus of our domain */ | ||
3772 | if (local_group) { | ||
3773 | if (idle_cpu(i) && !first_idle_cpu) { /* the first idle cpu found replaces the default balance cpu */ | ||
3774 | first_idle_cpu = 1; | ||
3775 | balance_cpu = i; | ||
3776 | } | ||
3777 | |||
3778 | load = target_load(i, load_idx); | ||
3779 | } else { /* per-cpu max/min load is tracked for non-local groups only */ | ||
3780 | load = source_load(i, load_idx); | ||
3781 | if (load > max_cpu_load) | ||
3782 | max_cpu_load = load; | ||
3783 | if (min_cpu_load > load) | ||
3784 | min_cpu_load = load; | ||
3785 | } | ||
3786 | |||
3787 | sgs->group_load += load; | ||
3788 | sgs->sum_nr_running += rq->nr_running; | ||
3789 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
3790 | |||
3791 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
3792 | } | ||
3793 | |||
3794 | /* | ||
3795 | * First idle cpu or the first cpu(busiest) in this sched group | ||
3796 | * is eligible for doing load balancing at this and above | ||
3797 | * domains. In the newly idle case, we will allow all the cpu's | ||
3798 | * to do the newly idle load balance. | ||
3799 | */ | ||
3800 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
3801 | balance_cpu != this_cpu && balance) { | ||
3802 | *balance = 0; | ||
3803 | return; | ||
3804 | } | ||
3805 | |||
3806 | /* Adjust by relative CPU power of the group */ | ||
3807 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
3808 | |||
3809 | |||
3810 | /* | ||
3811 | * Consider the group unbalanced when the imbalance is larger | ||
3812 | * than the average weight of two tasks. | ||
3813 | * | ||
3814 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3815 | * might not be a suitable number - should we keep a | ||
3816 | * normalized nr_running number somewhere that negates | ||
3817 | * the hierarchy? | ||
3818 | */ | ||
3819 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / | ||
3820 | group->cpu_power; | ||
3821 | |||
3822 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) /* unsigned subtraction: wraps if no cpu updated max/min above */ | ||
3823 | sgs->group_imb = 1; | ||
3824 | |||
3825 | sgs->group_capacity = | ||
3826 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
3827 | } | ||
3828 | |||
3829 | /** | ||
3830 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
3831 | * @sd: sched_domain whose statistics are to be updated. | ||
3832 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3833 | * @idle: Idle status of this_cpu | ||
3834 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3835 | * @cpus: Set of cpus considered for load balancing. | ||
3836 | * @balance: Should we balance. | ||
3837 | * @sds: variable to hold the statistics for this sched_domain. | ||
3838 | */ | ||
3839 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
3840 | enum cpu_idle_type idle, int *sd_idle, | ||
3841 | const struct cpumask *cpus, int *balance, | ||
3842 | struct sd_lb_stats *sds) | ||
3843 | { | ||
3844 | struct sched_domain *child = sd->child; | ||
3845 | struct sched_group *group = sd->groups; | ||
3846 | struct sg_lb_stats sgs; | ||
3847 | int load_idx, prefer_sibling = 0; | ||
3848 | |||
3849 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3850 | prefer_sibling = 1; | ||
3851 | |||
3852 | init_sd_power_savings_stats(sd, sds, idle); | ||
3853 | load_idx = get_sd_load_idx(sd, idle); | ||
3854 | |||
3855 | do { | ||
3856 | int local_group; | ||
3857 | |||
3858 | local_group = cpumask_test_cpu(this_cpu, | ||
3859 | sched_group_cpus(group)); | ||
3860 | memset(&sgs, 0, sizeof(sgs)); | ||
3861 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
3862 | local_group, cpus, balance, &sgs); | ||
3863 | |||
3864 | if (local_group && balance && !(*balance)) | ||
3865 | return; | ||
3866 | |||
3867 | sds->total_load += sgs.group_load; | ||
3868 | sds->total_pwr += group->cpu_power; | ||
3869 | |||
3870 | /* | ||
3871 | * In case the child domain prefers tasks go to siblings | ||
3872 | * first, lower the group capacity to one so that we'll try | ||
3873 | * and move all the excess tasks away. | ||
3874 | */ | ||
3875 | if (prefer_sibling) | ||
3876 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3877 | |||
3878 | if (local_group) { | ||
3879 | sds->this_load = sgs.avg_load; | ||
3880 | sds->this = group; | ||
3881 | sds->this_nr_running = sgs.sum_nr_running; | ||
3882 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
3883 | } else if (sgs.avg_load > sds->max_load && | ||
3884 | (sgs.sum_nr_running > sgs.group_capacity || | ||
3885 | sgs.group_imb)) { | ||
3886 | sds->max_load = sgs.avg_load; | ||
3887 | sds->busiest = group; | ||
3888 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
3889 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
3890 | sds->group_imb = sgs.group_imb; | ||
3891 | } | ||
3892 | |||
3893 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
3894 | group = group->next; | ||
3895 | } while (group != sd->groups); | ||
3896 | } | ||
3897 | |||
3898 | /** | ||
3899 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
3900 | * amongst the groups of a sched_domain, during | ||
3901 | * load balancing. *@imbalance is written only when moving one task is judged worthwhile; otherwise it is left untouched. | ||
3902 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. sds->this_load_per_task is overwritten here. | ||
3903 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3904 | * @imbalance: Variable to store the imbalance. | ||
3905 | */ | ||
3906 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
3907 | int this_cpu, unsigned long *imbalance) | ||
3908 | { | ||
3909 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
3910 | unsigned int imbn = 2; | ||
3911 | |||
3912 | if (sds->this_nr_running) { | ||
3913 | sds->this_load_per_task /= sds->this_nr_running; /* turn the summed weighted load into a per-task average */ | ||
3914 | if (sds->busiest_load_per_task > | ||
3915 | sds->this_load_per_task) | ||
3916 | imbn = 1; | ||
3917 | } else | ||
3918 | sds->this_load_per_task = | ||
3919 | cpu_avg_load_per_task(this_cpu); | ||
3920 | |||
3921 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= | ||
3922 | sds->busiest_load_per_task * imbn) { /* load gap is big enough: ask for one busiest-group task worth of load */ | ||
3923 | *imbalance = sds->busiest_load_per_task; | ||
3924 | return; | ||
3925 | } | ||
3926 | |||
3927 | /* | ||
3928 | * OK, we don't have enough imbalance to justify moving tasks, | ||
3929 | * however we may be able to increase total CPU power used by | ||
3930 | * moving them. | ||
3931 | */ | ||
3932 | |||
3933 | pwr_now += sds->busiest->cpu_power * | ||
3934 | min(sds->busiest_load_per_task, sds->max_load); | ||
3935 | pwr_now += sds->this->cpu_power * | ||
3936 | min(sds->this_load_per_task, sds->this_load); | ||
3937 | pwr_now /= SCHED_LOAD_SCALE; | ||
3938 | |||
3939 | /* Amount of load we'd subtract */ | ||
3940 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3941 | sds->busiest->cpu_power; | ||
3942 | if (sds->max_load > tmp) /* guard keeps the unsigned subtraction below from wrapping */ | ||
3943 | pwr_move += sds->busiest->cpu_power * | ||
3944 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
3945 | |||
3946 | /* Amount of load we'd add */ | ||
3947 | if (sds->max_load * sds->busiest->cpu_power < | ||
3948 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
3949 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
3950 | sds->this->cpu_power; | ||
3951 | else | ||
3952 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3953 | sds->this->cpu_power; | ||
3954 | pwr_move += sds->this->cpu_power * | ||
3955 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
3956 | pwr_move /= SCHED_LOAD_SCALE; | ||
3957 | |||
3958 | /* Move if we gain throughput */ | ||
3959 | if (pwr_move > pwr_now) | ||
3960 | *imbalance = sds->busiest_load_per_task; | ||
3961 | } | ||
3962 | |||
3963 | /** | ||
3964 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
3965 | * groups of a given sched_domain during load balance. | ||
3966 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
3967 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
3968 | * @imbalance: The variable to store the imbalance. | ||
3969 | */ | ||
3970 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
3971 | unsigned long *imbalance) | ||
3972 | { | ||
3973 | unsigned long max_pull; | ||
3974 | /* | ||
3975 | * In the presence of smp nice balancing, certain scenarios can have | ||
3976 | * max load less than avg load(as we skip the groups at or below | ||
3977 | * its cpu_power, while calculating max_load..) | ||
3978 | */ | ||
3979 | if (sds->max_load < sds->avg_load) { | ||
3980 | *imbalance = 0; | ||
3981 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
3982 | } | ||
3983 | |||
3984 | /* Don't want to pull so many tasks that a group would go idle */ | ||
3985 | max_pull = min(sds->max_load - sds->avg_load, | ||
3986 | sds->max_load - sds->busiest_load_per_task); | ||
3987 | |||
3988 | /* How much load to actually move to equalise the imbalance */ | ||
3989 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
3990 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
3991 | / SCHED_LOAD_SCALE; | ||
3992 | |||
3993 | /* | ||
3994 | * if *imbalance is less than the average load per runnable task | ||
3995 | * there is no gaurantee that any tasks will be moved so we'll have | ||
3996 | * a think about bumping its value to force at least one task to be | ||
3997 | * moved | ||
3998 | */ | ||
3999 | if (*imbalance < sds->busiest_load_per_task) | ||
4000 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
4001 | |||
4002 | } | ||
4003 | /******* find_busiest_group() helpers end here *********************/ | ||
4004 | |||
4005 | /** | ||
4006 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
4007 | * if there is an imbalance. If there isn't an imbalance, and | ||
4008 | * the user has opted for power-savings, it returns a group whose | ||
4009 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
4010 | * such a group exists. | ||
4011 | * | ||
4012 | * Also calculates the amount of weighted load which should be moved | ||
4013 | * to restore balance. | ||
4014 | * | ||
4015 | * @sd: The sched_domain whose busiest group is to be returned. | ||
4016 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4017 | * @imbalance: Variable which stores amount of weighted load which should | ||
4018 | * be moved to restore balance/put a group to idle. | ||
4019 | * @idle: The idle status of this_cpu. | ||
4020 | * @sd_idle: The idleness of sd | ||
4021 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
4022 | * @balance: Pointer to a variable indicating if this_cpu | ||
4023 | * is the appropriate cpu to perform load balancing at this_level. | ||
4024 | * | ||
4025 | * Returns: - the busiest group if imbalance exists. | ||
4026 | * - If no imbalance and user has opted for power-savings balance, | ||
4027 | * return the least loaded group whose CPUs can be | ||
4028 | * put to idle by rebalancing its tasks onto our group. | ||
4029 | */ | ||
4030 | static struct sched_group * | ||
4031 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
4032 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4033 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
4034 | { | ||
4035 | struct sd_lb_stats sds; | ||
4036 | |||
4037 | memset(&sds, 0, sizeof(sds)); | ||
4038 | |||
4039 | /* | ||
4040 | * Compute the various statistics relavent for load balancing at | ||
4041 | * this level. | ||
4042 | */ | ||
4043 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
4044 | balance, &sds); | ||
4045 | |||
4046 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
4047 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
4048 | * at this level. | ||
4049 | * 2) There is no busy sibling group to pull from. | ||
4050 | * 3) This group is the busiest group. | ||
4051 | * 4) This group is more busy than the avg busieness at this | ||
4052 | * sched_domain. | ||
4053 | * 5) The imbalance is within the specified limit. | ||
4054 | * 6) Any rebalance would lead to ping-pong | ||
4055 | */ | ||
4056 | if (balance && !(*balance)) | ||
4057 | goto ret; | ||
4058 | |||
4059 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
4060 | goto out_balanced; | ||
4061 | |||
4062 | if (sds.this_load >= sds.max_load) | ||
4063 | goto out_balanced; | ||
4064 | |||
4065 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
4066 | |||
4067 | if (sds.this_load >= sds.avg_load) | ||
4068 | goto out_balanced; | ||
4069 | |||
4070 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
4071 | goto out_balanced; | ||
4072 | |||
4073 | sds.busiest_load_per_task /= sds.busiest_nr_running; | ||
4074 | if (sds.group_imb) | ||
4075 | sds.busiest_load_per_task = | ||
4076 | min(sds.busiest_load_per_task, sds.avg_load); | ||
4077 | |||
4078 | /* | ||
4079 | * We're trying to get all the cpus to the average_load, so we don't | ||
4080 | * want to push ourselves above the average load, nor do we wish to | ||
4081 | * reduce the max loaded cpu below the average load, as either of these | ||
4082 | * actions would just result in more rebalancing later, and ping-pong | ||
4083 | * tasks around. Thus we look for the minimum possible imbalance. | ||
4084 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
4085 | * be counted as no imbalance for these purposes -- we can't fix that | ||
4086 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
4087 | * appear as very large values with unsigned longs. | ||
4088 | */ | ||
4089 | if (sds.max_load <= sds.busiest_load_per_task) | ||
4090 | goto out_balanced; | ||
4091 | |||
4092 | /* Looks like there is an imbalance. Compute it */ | ||
4093 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
4094 | return sds.busiest; | ||
4095 | |||
4096 | out_balanced: | ||
4097 | /* | ||
4098 | * There is no obvious imbalance. But check if we can do some balancing | ||
4099 | * to save power. | ||
4100 | */ | ||
4101 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4102 | return sds.busiest; | ||
4103 | ret: | ||
4104 | *imbalance = 0; | ||
4105 | return NULL; | ||
4106 | } | ||
4107 | |||
4108 | /* | ||
4109 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
4110 | */ | ||
4111 | static struct rq * | ||
4112 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
4113 | unsigned long imbalance, const struct cpumask *cpus) | ||
4114 | { | ||
4115 | struct rq *busiest = NULL, *rq; | ||
4116 | unsigned long max_load = 0; | ||
4117 | int i; | ||
4118 | |||
4119 | for_each_cpu(i, sched_group_cpus(group)) { | ||
4120 | unsigned long power = power_of(i); | ||
4121 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
4122 | unsigned long wl; | ||
4123 | |||
4124 | if (!cpumask_test_cpu(i, cpus)) | ||
4125 | continue; | ||
4126 | |||
4127 | rq = cpu_rq(i); | ||
4128 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; | ||
4129 | wl /= power; | ||
4130 | |||
4131 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
4132 | continue; | ||
4133 | |||
4134 | if (wl > max_load) { | ||
4135 | max_load = wl; | ||
4136 | busiest = rq; | ||
4137 | } | ||
4138 | } | ||
4139 | |||
4140 | return busiest; | ||
4141 | } | ||
4142 | |||
4143 | /* | ||
4144 | * Max backoff if we encounter pinned tasks: load_balance() keeps doubling sd->balance_interval below this bound when all tasks were pinned. | ||
4145 | * The value is fairly arbitrary; it only needs to be large enough. | ||
4146 | */ | ||
4147 | #define MAX_PINNED_INTERVAL 512 | ||
4148 | |||
4149 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
4150 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
4151 | |||
4152 | /* | ||
4153 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4154 | * tasks if there is an imbalance. @balance must not be NULL: it is dereferenced unconditionally after find_busiest_group(). Returns the move_tasks() result when tasks were moved, otherwise 0, or -1 when sd_idle is clear, sd has SD_SHARE_CPUPOWER and test_sd_parent(sd, SD_POWERSAVINGS_BALANCE) is false. | ||
4155 | */ | ||
4156 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
4157 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
4158 | int *balance) | ||
4159 | { | ||
4160 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
4161 | struct sched_group *group; | ||
4162 | unsigned long imbalance; | ||
4163 | struct rq *busiest; | ||
4164 | unsigned long flags; | ||
4165 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4166 | |||
4167 | cpumask_copy(cpus, cpu_active_mask); /* start from all active cpus; cpus with only pinned tasks are removed below */ | ||
4168 | |||
4169 | /* | ||
4170 | * When power savings policy is enabled for the parent domain, idle | ||
4171 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4172 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
4173 | * portraying it as CPU_NOT_IDLE. | ||
4174 | */ | ||
4175 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
4176 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4177 | sd_idle = 1; | ||
4178 | |||
4179 | schedstat_inc(sd, lb_count[idle]); | ||
4180 | |||
4181 | redo: | ||
4182 | update_shares(sd); | ||
4183 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
4184 | cpus, balance); | ||
4185 | |||
4186 | if (*balance == 0) /* find_busiest_group() decided this_cpu should not balance at this level */ | ||
4187 | goto out_balanced; | ||
4188 | |||
4189 | if (!group) { | ||
4190 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
4191 | goto out_balanced; | ||
4192 | } | ||
4193 | |||
4194 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
4195 | if (!busiest) { | ||
4196 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
4197 | goto out_balanced; | ||
4198 | } | ||
4199 | |||
4200 | BUG_ON(busiest == this_rq); | ||
4201 | |||
4202 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
4203 | |||
4204 | ld_moved = 0; | ||
4205 | if (busiest->nr_running > 1) { | ||
4206 | /* | ||
4207 | * Attempt to move tasks. If find_busiest_group has found | ||
4208 | * an imbalance but busiest->nr_running <= 1, the group is | ||
4209 | * still unbalanced. ld_moved simply stays zero, so it is | ||
4210 | * correctly treated as an imbalance. | ||
4211 | */ | ||
4212 | local_irq_save(flags); | ||
4213 | double_rq_lock(this_rq, busiest); | ||
4214 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4215 | imbalance, sd, idle, &all_pinned); | ||
4216 | double_rq_unlock(this_rq, busiest); | ||
4217 | local_irq_restore(flags); | ||
4218 | |||
4219 | /* | ||
4220 | * some other cpu did the load balance for us. | ||
4221 | */ | ||
4222 | if (ld_moved && this_cpu != smp_processor_id()) | ||
4223 | resched_cpu(this_cpu); | ||
4224 | |||
4225 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
4226 | if (unlikely(all_pinned)) { | ||
4227 | cpumask_clear_cpu(cpu_of(busiest), cpus); /* exclude that cpu and retry with the remaining ones */ | ||
4228 | if (!cpumask_empty(cpus)) | ||
4229 | goto redo; | ||
4230 | goto out_balanced; | ||
4231 | } | ||
4232 | } | ||
4233 | |||
4234 | if (!ld_moved) { | ||
4235 | schedstat_inc(sd, lb_failed[idle]); | ||
4236 | sd->nr_balance_failed++; | ||
4237 | |||
4238 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { /* too many failed attempts: try active balancing */ | ||
4239 | |||
4240 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
4241 | |||
4242 | /* don't kick the migration_thread, if the curr | ||
4243 | * task on busiest cpu can't be moved to this_cpu | ||
4244 | */ | ||
4245 | if (!cpumask_test_cpu(this_cpu, | ||
4246 | &busiest->curr->cpus_allowed)) { | ||
4247 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
4248 | flags); | ||
4249 | all_pinned = 1; | ||
4250 | goto out_one_pinned; | ||
4251 | } | ||
4252 | |||
4253 | if (!busiest->active_balance) { /* claim the request under busiest->lock; the wakeup happens after unlocking */ | ||
4254 | busiest->active_balance = 1; | ||
4255 | busiest->push_cpu = this_cpu; | ||
4256 | active_balance = 1; | ||
4257 | } | ||
4258 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
4259 | if (active_balance) | ||
4260 | wake_up_process(busiest->migration_thread); | ||
4261 | |||
4262 | /* | ||
4263 | * We've kicked active balancing, reset the failure | ||
4264 | * counter. | ||
4265 | */ | ||
4266 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
4267 | } | ||
4268 | } else | ||
4269 | sd->nr_balance_failed = 0; | ||
4270 | |||
4271 | if (likely(!active_balance)) { | ||
4272 | /* We were unbalanced, so reset the balancing interval */ | ||
4273 | sd->balance_interval = sd->min_interval; | ||
4274 | } else { | ||
4275 | /* | ||
4276 | * If we've begun active balancing, start to back off. This | ||
4277 | * case may not be covered by the all_pinned logic if there | ||
4278 | * is only 1 task on the busy runqueue (because we don't call | ||
4279 | * move_tasks). | ||
4280 | */ | ||
4281 | if (sd->balance_interval < sd->max_interval) | ||
4282 | sd->balance_interval *= 2; | ||
4283 | } | ||
4284 | |||
4285 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4286 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4287 | ld_moved = -1; | ||
4288 | |||
4289 | goto out; | ||
4290 | |||
4291 | out_balanced: | ||
4292 | schedstat_inc(sd, lb_balanced[idle]); | ||
4293 | |||
4294 | sd->nr_balance_failed = 0; | ||
4295 | |||
4296 | out_one_pinned: | ||
4297 | /* tune up the balancing interval */ | ||
4298 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
4299 | (sd->balance_interval < sd->max_interval)) | ||
4300 | sd->balance_interval *= 2; | ||
4301 | |||
4302 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4303 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4304 | ld_moved = -1; | ||
4305 | else | ||
4306 | ld_moved = 0; | ||
4307 | out: | ||
4308 | if (ld_moved) /* also taken for ld_moved == -1 */ | ||
4309 | update_shares(sd); | ||
4310 | return ld_moved; | ||
4311 | } | ||
4312 | |||
4313 | /* | ||
4314 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4315 | * tasks if there is an imbalance. | ||
4316 | * | ||
4317 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | ||
4318 | * this_rq is locked. | ||
4319 | */ | ||
4320 | static int | ||
4321 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | ||
4322 | { | ||
4323 | struct sched_group *group; | ||
4324 | struct rq *busiest = NULL; | ||
4325 | unsigned long imbalance; | ||
4326 | int ld_moved = 0; | ||
4327 | int sd_idle = 0; | ||
4328 | int all_pinned = 0; | ||
4329 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4330 | |||
4331 | cpumask_copy(cpus, cpu_active_mask); | ||
4332 | |||
4333 | /* | ||
4334 | * When power savings policy is enabled for the parent domain, idle | ||
4335 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4336 | * let the state of idle sibling percolate up as IDLE, instead of | ||
4337 | * portraying it as CPU_NOT_IDLE. | ||
4338 | */ | ||
4339 | if (sd->flags & SD_SHARE_CPUPOWER && | ||
4340 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4341 | sd_idle = 1; | ||
4342 | |||
4343 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | ||
4344 | redo: | ||
4345 | update_shares_locked(this_rq, sd); | ||
4346 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | ||
4347 | &sd_idle, cpus, NULL); | ||
4348 | if (!group) { | ||
4349 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | ||
4350 | goto out_balanced; | ||
4351 | } | ||
4352 | |||
4353 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); | ||
4354 | if (!busiest) { | ||
4355 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | ||
4356 | goto out_balanced; | ||
4357 | } | ||
4358 | |||
4359 | BUG_ON(busiest == this_rq); | ||
4360 | |||
4361 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | ||
4362 | |||
4363 | ld_moved = 0; | ||
4364 | if (busiest->nr_running > 1) { | ||
4365 | /* Attempt to move tasks */ | ||
4366 | double_lock_balance(this_rq, busiest); | ||
4367 | /* this_rq->clock is already updated */ | ||
4368 | update_rq_clock(busiest); | ||
4369 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4370 | imbalance, sd, CPU_NEWLY_IDLE, | ||
4371 | &all_pinned); | ||
4372 | double_unlock_balance(this_rq, busiest); | ||
4373 | |||
4374 | if (unlikely(all_pinned)) { | ||
4375 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
4376 | if (!cpumask_empty(cpus)) | ||
4377 | goto redo; | ||
4378 | } | ||
4379 | } | ||
4380 | |||
4381 | if (!ld_moved) { | ||
4382 | int active_balance = 0; | ||
4383 | |||
4384 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | ||
4385 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4386 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4387 | return -1; | ||
4388 | |||
4389 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4390 | return -1; | ||
4391 | |||
4392 | if (sd->nr_balance_failed++ < 2) | ||
4393 | return -1; | ||
4394 | |||
4395 | /* | ||
4396 | * The only task running in a non-idle cpu can be moved to this | ||
4397 | * cpu in an attempt to completely freeup the other CPU | ||
4398 | * package. The same method used to move task in load_balance() | ||
4399 | * have been extended for load_balance_newidle() to speedup | ||
4400 | * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) | ||
4401 | * | ||
4402 | * The package power saving logic comes from | ||
4403 | * find_busiest_group(). If there are no imbalance, then | ||
4404 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4405 | * f_b_g() will select a group from which a running task may be | ||
4406 | * pulled to this cpu in order to make the other package idle. | ||
4407 | * If there is no opportunity to make a package idle and if | ||
4408 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4409 | * action will be taken in load_balance_newidle(). | ||
4410 | * | ||
4411 | * Under normal task pull operation due to imbalance, there | ||
4412 | * will be more than one task in the source run queue and | ||
4413 | * move_tasks() will succeed. ld_moved will be true and this | ||
4414 | * active balance code will not be triggered. | ||
4415 | */ | ||
4416 | |||
4417 | /* Lock busiest in correct order while this_rq is held */ | ||
4418 | double_lock_balance(this_rq, busiest); | ||
4419 | |||
4420 | /* | ||
4421 | * don't kick the migration_thread, if the curr | ||
4422 | * task on busiest cpu can't be moved to this_cpu | ||
4423 | */ | ||
4424 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | ||
4425 | double_unlock_balance(this_rq, busiest); | ||
4426 | all_pinned = 1; | ||
4427 | return ld_moved; | ||
4428 | } | ||
4429 | |||
4430 | if (!busiest->active_balance) { | ||
4431 | busiest->active_balance = 1; | ||
4432 | busiest->push_cpu = this_cpu; | ||
4433 | active_balance = 1; | ||
4434 | } | ||
4435 | |||
4436 | double_unlock_balance(this_rq, busiest); | ||
4437 | /* | ||
4438 | * Should not call ttwu while holding a rq->lock | ||
4439 | */ | ||
4440 | raw_spin_unlock(&this_rq->lock); | ||
4441 | if (active_balance) | ||
4442 | wake_up_process(busiest->migration_thread); | ||
4443 | raw_spin_lock(&this_rq->lock); | ||
4444 | |||
4445 | } else | ||
4446 | sd->nr_balance_failed = 0; | ||
4447 | |||
4448 | update_shares_locked(this_rq, sd); | ||
4449 | return ld_moved; | ||
4450 | |||
4451 | out_balanced: | ||
4452 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | ||
4453 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4454 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4455 | return -1; | ||
4456 | sd->nr_balance_failed = 0; | ||
4457 | |||
4458 | return 0; | ||
4459 | } | ||
4460 | |||
4461 | /* | ||
4462 | * idle_balance is called by schedule() if this_cpu is about to become | ||
4463 | * idle. Attempts to pull tasks from other CPUs. | ||
4464 | */ | ||
4465 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
4466 | { | ||
4467 | struct sched_domain *sd; | ||
4468 | int pulled_task = 0; | ||
4469 | unsigned long next_balance = jiffies + HZ; | ||
4470 | |||
4471 | this_rq->idle_stamp = this_rq->clock; | ||
4472 | |||
4473 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
4474 | return; | ||
4475 | |||
4476 | for_each_domain(this_cpu, sd) { | ||
4477 | unsigned long interval; | ||
4478 | |||
4479 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4480 | continue; | ||
4481 | |||
4482 | if (sd->flags & SD_BALANCE_NEWIDLE) | ||
4483 | /* If we've pulled tasks over stop searching: */ | ||
4484 | pulled_task = load_balance_newidle(this_cpu, this_rq, | ||
4485 | sd); | ||
4486 | |||
4487 | interval = msecs_to_jiffies(sd->balance_interval); | ||
4488 | if (time_after(next_balance, sd->last_balance + interval)) | ||
4489 | next_balance = sd->last_balance + interval; | ||
4490 | if (pulled_task) { | ||
4491 | this_rq->idle_stamp = 0; | ||
4492 | break; | ||
4493 | } | ||
4494 | } | ||
4495 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
4496 | /* | ||
4497 | * We are going idle. next_balance may be set based on | ||
4498 | * a busy processor. So reset next_balance. | ||
4499 | */ | ||
4500 | this_rq->next_balance = next_balance; | ||
4501 | } | ||
4502 | } | ||
4503 | |||
4504 | /* | ||
4505 | * active_load_balance is run by migration threads. It pushes running tasks | ||
4506 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
4507 | * running on each physical CPU where possible, and avoids physical / | ||
4508 | * logical imbalances. | ||
4509 | * | ||
4510 | * Called with busiest_rq locked. | ||
4511 | */ | ||
4512 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
4513 | { | ||
4514 | int target_cpu = busiest_rq->push_cpu; | ||
4515 | struct sched_domain *sd; | ||
4516 | struct rq *target_rq; | ||
4517 | |||
4518 | /* Is there any task to move? */ | ||
4519 | if (busiest_rq->nr_running <= 1) | ||
4520 | return; | ||
4521 | |||
4522 | target_rq = cpu_rq(target_cpu); | ||
4523 | |||
4524 | /* | ||
4525 | * This condition is "impossible", if it occurs | ||
4526 | * we need to fix it. Originally reported by | ||
4527 | * Bjorn Helgaas on a 128-cpu setup. | ||
4528 | */ | ||
4529 | BUG_ON(busiest_rq == target_rq); | ||
4530 | |||
4531 | /* move a task from busiest_rq to target_rq */ | ||
4532 | double_lock_balance(busiest_rq, target_rq); | ||
4533 | update_rq_clock(busiest_rq); | ||
4534 | update_rq_clock(target_rq); | ||
4535 | |||
4536 | /* Search for an sd spanning us and the target CPU. */ | ||
4537 | for_each_domain(target_cpu, sd) { | ||
4538 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
4539 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
4540 | break; | ||
4541 | } | ||
4542 | |||
4543 | if (likely(sd)) { | ||
4544 | schedstat_inc(sd, alb_count); | ||
4545 | |||
4546 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
4547 | sd, CPU_IDLE)) | ||
4548 | schedstat_inc(sd, alb_pushed); | ||
4549 | else | ||
4550 | schedstat_inc(sd, alb_failed); | ||
4551 | } | ||
4552 | double_unlock_balance(busiest_rq, target_rq); | ||
4553 | } | ||
4554 | |||
4555 | #ifdef CONFIG_NO_HZ | ||
4556 | static struct { | ||
4557 | atomic_t load_balancer; | ||
4558 | cpumask_var_t cpu_mask; | ||
4559 | cpumask_var_t ilb_grp_nohz_mask; | ||
4560 | } nohz ____cacheline_aligned = { | ||
4561 | .load_balancer = ATOMIC_INIT(-1), | ||
4562 | }; | ||
4563 | |||
4564 | int get_nohz_load_balancer(void) | ||
4565 | { | ||
4566 | return atomic_read(&nohz.load_balancer); | ||
4567 | } | ||
4568 | |||
4569 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
4570 | /** | ||
4571 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4572 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4573 | * be returned. | ||
4574 | * @flag: The flag to check for the lowest sched_domain | ||
4575 | * for the given cpu. | ||
4576 | * | ||
4577 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4578 | */ | ||
4579 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4580 | { | ||
4581 | struct sched_domain *sd; | ||
4582 | |||
4583 | for_each_domain(cpu, sd) | ||
4584 | if (sd && (sd->flags & flag)) | ||
4585 | break; | ||
4586 | |||
4587 | return sd; | ||
4588 | } | ||
4589 | |||
4590 | /** | ||
4591 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4592 | * @cpu: The cpu whose domains we're iterating over. | ||
4593 | * @sd: variable holding the value of the power_savings_sd | ||
4594 | * for cpu. | ||
4595 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4596 | * | ||
4597 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4598 | * set, starting from the lowest sched_domain to the highest. | ||
4599 | */ | ||
4600 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4601 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4602 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4603 | |||
4604 | /** | ||
4605 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4606 | * @ilb_group: group to be checked for semi-idleness | ||
4607 | * | ||
4608 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4609 | * | ||
4610 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
4611 | * and atleast one non-idle CPU. This helper function checks if the given | ||
4612 | * sched_group is semi-idle or not. | ||
4613 | */ | ||
4614 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4615 | { | ||
4616 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
4617 | sched_group_cpus(ilb_group)); | ||
4618 | |||
4619 | /* | ||
4620 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
4621 | * and atleast one idle cpu. | ||
4622 | */ | ||
4623 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
4624 | return 0; | ||
4625 | |||
4626 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
4627 | return 0; | ||
4628 | |||
4629 | return 1; | ||
4630 | } | ||
4631 | /** | ||
4632 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4633 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4634 | * | ||
4635 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4636 | * Else, returns >= nr_cpu_ids. | ||
4637 | * | ||
4638 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4639 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4640 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4641 | * when there are other idle cpu's which are better suited for that job. | ||
4642 | */ | ||
4643 | static int find_new_ilb(int cpu) | ||
4644 | { | ||
4645 | struct sched_domain *sd; | ||
4646 | struct sched_group *ilb_group; | ||
4647 | |||
4648 | /* | ||
4649 | * Have idle load balancer selection from semi-idle packages only | ||
4650 | * when power-aware load balancing is enabled | ||
4651 | */ | ||
4652 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4653 | goto out_done; | ||
4654 | |||
4655 | /* | ||
4656 | * Optimize for the case when we have no idle CPUs or only one | ||
4657 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4658 | */ | ||
4659 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
4660 | goto out_done; | ||
4661 | |||
4662 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4663 | ilb_group = sd->groups; | ||
4664 | |||
4665 | do { | ||
4666 | if (is_semi_idle_group(ilb_group)) | ||
4667 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
4668 | |||
4669 | ilb_group = ilb_group->next; | ||
4670 | |||
4671 | } while (ilb_group != sd->groups); | ||
4672 | } | ||
4673 | |||
4674 | out_done: | ||
4675 | return cpumask_first(nohz.cpu_mask); | ||
4676 | } | ||
4677 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4678 | static inline int find_new_ilb(int call_cpu) | ||
4679 | { | ||
4680 | return cpumask_first(nohz.cpu_mask); | ||
4681 | } | ||
4682 | #endif | ||
4683 | |||
4684 | /* | ||
4685 | * This routine will try to nominate the ilb (idle load balancing) | ||
4686 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
4687 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
4688 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
4689 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
4690 | * arrives... | ||
4691 | * | ||
4692 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
4693 | * for idle load balancing. ilb owner will still be part of | ||
4694 | * nohz.cpu_mask.. | ||
4695 | * | ||
4696 | * While stopping the tick, this cpu will become the ilb owner if there | ||
4697 | * is no other owner. And will be the owner till that cpu becomes busy | ||
4698 | * or if all cpus in the system stop their ticks at which point | ||
4699 | * there is no need for ilb owner. | ||
4700 | * | ||
4701 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
4702 | * next busy scheduler_tick() | ||
4703 | */ | ||
4704 | int select_nohz_load_balancer(int stop_tick) | ||
4705 | { | ||
4706 | int cpu = smp_processor_id(); | ||
4707 | |||
4708 | if (stop_tick) { | ||
4709 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
4710 | |||
4711 | if (!cpu_active(cpu)) { | ||
4712 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
4713 | return 0; | ||
4714 | |||
4715 | /* | ||
4716 | * If we are going offline and still the leader, | ||
4717 | * give up! | ||
4718 | */ | ||
4719 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4720 | BUG(); | ||
4721 | |||
4722 | return 0; | ||
4723 | } | ||
4724 | |||
4725 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
4726 | |||
4727 | /* time for ilb owner also to sleep */ | ||
4728 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
4729 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4730 | atomic_set(&nohz.load_balancer, -1); | ||
4731 | return 0; | ||
4732 | } | ||
4733 | |||
4734 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4735 | /* make me the ilb owner */ | ||
4736 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
4737 | return 1; | ||
4738 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4739 | int new_ilb; | ||
4740 | |||
4741 | if (!(sched_smt_power_savings || | ||
4742 | sched_mc_power_savings)) | ||
4743 | return 1; | ||
4744 | /* | ||
4745 | * Check to see if there is a more power-efficient | ||
4746 | * ilb. | ||
4747 | */ | ||
4748 | new_ilb = find_new_ilb(cpu); | ||
4749 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4750 | atomic_set(&nohz.load_balancer, -1); | ||
4751 | resched_cpu(new_ilb); | ||
4752 | return 0; | ||
4753 | } | ||
4754 | return 1; | ||
4755 | } | ||
4756 | } else { | ||
4757 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4758 | return 0; | ||
4759 | |||
4760 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4761 | |||
4762 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4763 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4764 | BUG(); | ||
4765 | } | ||
4766 | return 0; | ||
4767 | } | ||
4768 | #endif | ||
4769 | |||
4770 | static DEFINE_SPINLOCK(balancing); | ||
4771 | |||
4772 | /* | ||
4773 | * It checks each scheduling domain to see if it is due to be balanced, | ||
4774 | * and initiates a balancing operation if so. | ||
4775 | * | ||
4776 | * Balancing parameters are set up in arch_init_sched_domains. | ||
4777 | */ | ||
4778 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
4779 | { | ||
4780 | int balance = 1; | ||
4781 | struct rq *rq = cpu_rq(cpu); | ||
4782 | unsigned long interval; | ||
4783 | struct sched_domain *sd; | ||
4784 | /* Earliest time when we have to do rebalance again */ | ||
4785 | unsigned long next_balance = jiffies + 60*HZ; | ||
4786 | int update_next_balance = 0; | ||
4787 | int need_serialize; | ||
4788 | |||
4789 | for_each_domain(cpu, sd) { | ||
4790 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4791 | continue; | ||
4792 | |||
4793 | interval = sd->balance_interval; | ||
4794 | if (idle != CPU_IDLE) | ||
4795 | interval *= sd->busy_factor; | ||
4796 | |||
4797 | /* scale ms to jiffies */ | ||
4798 | interval = msecs_to_jiffies(interval); | ||
4799 | if (unlikely(!interval)) | ||
4800 | interval = 1; | ||
4801 | if (interval > HZ*NR_CPUS/10) | ||
4802 | interval = HZ*NR_CPUS/10; | ||
4803 | |||
4804 | need_serialize = sd->flags & SD_SERIALIZE; | ||
4805 | |||
4806 | if (need_serialize) { | ||
4807 | if (!spin_trylock(&balancing)) | ||
4808 | goto out; | ||
4809 | } | ||
4810 | |||
4811 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
4812 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
4813 | /* | ||
4814 | * We've pulled tasks over so either we're no | ||
4815 | * longer idle, or one of our SMT siblings is | ||
4816 | * not idle. | ||
4817 | */ | ||
4818 | idle = CPU_NOT_IDLE; | ||
4819 | } | ||
4820 | sd->last_balance = jiffies; | ||
4821 | } | ||
4822 | if (need_serialize) | ||
4823 | spin_unlock(&balancing); | ||
4824 | out: | ||
4825 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
4826 | next_balance = sd->last_balance + interval; | ||
4827 | update_next_balance = 1; | ||
4828 | } | ||
4829 | |||
4830 | /* | ||
4831 | * Stop the load balance at this level. There is another | ||
4832 | * CPU in our sched group which is doing load balancing more | ||
4833 | * actively. | ||
4834 | */ | ||
4835 | if (!balance) | ||
4836 | break; | ||
4837 | } | ||
4838 | |||
4839 | /* | ||
4840 | * next_balance will be updated only when there is a need. | ||
4841 | * When the cpu is attached to null domain for ex, it will not be | ||
4842 | * updated. | ||
4843 | */ | ||
4844 | if (likely(update_next_balance)) | ||
4845 | rq->next_balance = next_balance; | ||
4846 | } | ||
4847 | |||
4848 | /* | ||
4849 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
4850 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
4851 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
4852 | */ | ||
4853 | static void run_rebalance_domains(struct softirq_action *h) | ||
4854 | { | ||
4855 | int this_cpu = smp_processor_id(); | ||
4856 | struct rq *this_rq = cpu_rq(this_cpu); | ||
4857 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
4858 | CPU_IDLE : CPU_NOT_IDLE; | ||
4859 | |||
4860 | rebalance_domains(this_cpu, idle); | ||
4861 | |||
4862 | #ifdef CONFIG_NO_HZ | ||
4863 | /* | ||
4864 | * If this cpu is the owner for idle load balancing, then do the | ||
4865 | * balancing on behalf of the other idle cpus whose ticks are | ||
4866 | * stopped. | ||
4867 | */ | ||
4868 | if (this_rq->idle_at_tick && | ||
4869 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
4870 | struct rq *rq; | ||
4871 | int balance_cpu; | ||
4872 | |||
4873 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
4874 | if (balance_cpu == this_cpu) | ||
4875 | continue; | ||
4876 | |||
4877 | /* | ||
4878 | * If this cpu gets work to do, stop the load balancing | ||
4879 | * work being done for other cpus. Next load | ||
4880 | * balancing owner will pick it up. | ||
4881 | */ | ||
4882 | if (need_resched()) | ||
4883 | break; | ||
4884 | |||
4885 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
4886 | |||
4887 | rq = cpu_rq(balance_cpu); | ||
4888 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
4889 | this_rq->next_balance = rq->next_balance; | ||
4890 | } | ||
4891 | } | ||
4892 | #endif | ||
4893 | } | ||
4894 | |||
4895 | static inline int on_null_domain(int cpu) | ||
4896 | { | ||
4897 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
4898 | } | ||
4899 | |||
4900 | /* | ||
4901 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
4902 | * | ||
4903 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
4904 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
4905 | * if the whole system is idle. | ||
4906 | */ | ||
4907 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
4908 | { | ||
4909 | #ifdef CONFIG_NO_HZ | ||
4910 | /* | ||
4911 | * If we were in the nohz mode recently and busy at the current | ||
4912 | * scheduler tick, then check if we need to nominate new idle | ||
4913 | * load balancer. | ||
4914 | */ | ||
4915 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
4916 | rq->in_nohz_recently = 0; | ||
4917 | |||
4918 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4919 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4920 | atomic_set(&nohz.load_balancer, -1); | ||
4921 | } | ||
4922 | |||
4923 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4924 | int ilb = find_new_ilb(cpu); | ||
4925 | |||
4926 | if (ilb < nr_cpu_ids) | ||
4927 | resched_cpu(ilb); | ||
4928 | } | ||
4929 | } | ||
4930 | |||
4931 | /* | ||
4932 | * If this cpu is idle and doing idle load balancing for all the | ||
4933 | * cpus with ticks stopped, is it time for that to stop? | ||
4934 | */ | ||
4935 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
4936 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
4937 | resched_cpu(cpu); | ||
4938 | return; | ||
4939 | } | ||
4940 | |||
4941 | /* | ||
4942 | * If this cpu is idle and the idle load balancing is done by | ||
4943 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
4944 | */ | ||
4945 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
4946 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4947 | return; | ||
4948 | #endif | ||
4949 | /* Don't need to rebalance while attached to NULL domain */ | ||
4950 | if (time_after_eq(jiffies, rq->next_balance) && | ||
4951 | likely(!on_null_domain(cpu))) | ||
4952 | raise_softirq(SCHED_SOFTIRQ); | ||
4953 | } | ||
4954 | |||
4955 | #else /* CONFIG_SMP */ | ||
4956 | |||
4957 | /* | ||
4958 | * on UP we do not need to balance between CPUs: | ||
4959 | */ | ||
4960 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4961 | { | ||
4962 | } | ||
4963 | |||
4964 | #endif | 3164 | #endif |
4965 | 3165 | ||
4966 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3166 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -6060,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6060 | unsigned long flags; | 4260 | unsigned long flags; |
6061 | int oldprio, on_rq, running; | 4261 | int oldprio, on_rq, running; |
6062 | struct rq *rq; | 4262 | struct rq *rq; |
6063 | const struct sched_class *prev_class = p->sched_class; | 4263 | const struct sched_class *prev_class; |
6064 | 4264 | ||
6065 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4265 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
6066 | 4266 | ||
@@ -6068,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6068 | update_rq_clock(rq); | 4268 | update_rq_clock(rq); |
6069 | 4269 | ||
6070 | oldprio = p->prio; | 4270 | oldprio = p->prio; |
4271 | prev_class = p->sched_class; | ||
6071 | on_rq = p->se.on_rq; | 4272 | on_rq = p->se.on_rq; |
6072 | running = task_current(rq, p); | 4273 | running = task_current(rq, p); |
6073 | if (on_rq) | 4274 | if (on_rq) |
@@ -6085,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6085 | if (running) | 4286 | if (running) |
6086 | p->sched_class->set_curr_task(rq); | 4287 | p->sched_class->set_curr_task(rq); |
6087 | if (on_rq) { | 4288 | if (on_rq) { |
6088 | enqueue_task(rq, p, 0); | 4289 | enqueue_task(rq, p, 0, oldprio < prio); |
6089 | 4290 | ||
6090 | check_class_changed(rq, p, prev_class, oldprio, running); | 4291 | check_class_changed(rq, p, prev_class, oldprio, running); |
6091 | } | 4292 | } |
@@ -6129,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
6129 | delta = p->prio - old_prio; | 4330 | delta = p->prio - old_prio; |
6130 | 4331 | ||
6131 | if (on_rq) { | 4332 | if (on_rq) { |
6132 | enqueue_task(rq, p, 0); | 4333 | enqueue_task(rq, p, 0, false); |
6133 | /* | 4334 | /* |
6134 | * If the task increased its priority or is running and | 4335 | * If the task increased its priority or is running and |
6135 | * lowered its priority, then reschedule its CPU: | 4336 | * lowered its priority, then reschedule its CPU: |
@@ -6152,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice) | |||
6152 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4353 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
6153 | int nice_rlim = 20 - nice; | 4354 | int nice_rlim = 20 - nice; |
6154 | 4355 | ||
6155 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4356 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
6156 | capable(CAP_SYS_NICE)); | 4357 | capable(CAP_SYS_NICE)); |
6157 | } | 4358 | } |
6158 | 4359 | ||
@@ -6287,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6287 | { | 4488 | { |
6288 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4489 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
6289 | unsigned long flags; | 4490 | unsigned long flags; |
6290 | const struct sched_class *prev_class = p->sched_class; | 4491 | const struct sched_class *prev_class; |
6291 | struct rq *rq; | 4492 | struct rq *rq; |
6292 | int reset_on_fork; | 4493 | int reset_on_fork; |
6293 | 4494 | ||
@@ -6329,7 +4530,7 @@ recheck: | |||
6329 | 4530 | ||
6330 | if (!lock_task_sighand(p, &flags)) | 4531 | if (!lock_task_sighand(p, &flags)) |
6331 | return -ESRCH; | 4532 | return -ESRCH; |
6332 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4533 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); |
6333 | unlock_task_sighand(p, &flags); | 4534 | unlock_task_sighand(p, &flags); |
6334 | 4535 | ||
6335 | /* can't set/change the rt policy */ | 4536 | /* can't set/change the rt policy */ |
@@ -6401,6 +4602,7 @@ recheck: | |||
6401 | p->sched_reset_on_fork = reset_on_fork; | 4602 | p->sched_reset_on_fork = reset_on_fork; |
6402 | 4603 | ||
6403 | oldprio = p->prio; | 4604 | oldprio = p->prio; |
4605 | prev_class = p->sched_class; | ||
6404 | __setscheduler(rq, p, policy, param->sched_priority); | 4606 | __setscheduler(rq, p, policy, param->sched_priority); |
6405 | 4607 | ||
6406 | if (running) | 4608 | if (running) |
@@ -7151,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7151 | struct rq *rq; | 5353 | struct rq *rq; |
7152 | int ret = 0; | 5354 | int ret = 0; |
7153 | 5355 | ||
7154 | /* | ||
7155 | * Since we rely on wake-ups to migrate sleeping tasks, don't change | ||
7156 | * the ->cpus_allowed mask from under waking tasks, which would be | ||
7157 | * possible when we change rq->lock in ttwu(), so synchronize against | ||
7158 | * TASK_WAKING to avoid that. | ||
7159 | * | ||
7160 | * Make an exception for freshly cloned tasks, since cpuset namespaces | ||
7161 | * might move the task about, we have to validate the target in | ||
7162 | * wake_up_new_task() anyway since the cpu might have gone away. | ||
7163 | */ | ||
7164 | again: | ||
7165 | while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) | ||
7166 | cpu_relax(); | ||
7167 | |||
7168 | rq = task_rq_lock(p, &flags); | 5356 | rq = task_rq_lock(p, &flags); |
7169 | 5357 | ||
7170 | if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { | ||
7171 | task_rq_unlock(rq, &flags); | ||
7172 | goto again; | ||
7173 | } | ||
7174 | |||
7175 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5358 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
7176 | ret = -EINVAL; | 5359 | ret = -EINVAL; |
7177 | goto out; | 5360 | goto out; |
@@ -9223,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
9223 | 7406 | ||
9224 | #ifdef CONFIG_SCHED_MC | 7407 | #ifdef CONFIG_SCHED_MC |
9225 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 7408 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
7409 | struct sysdev_class_attribute *attr, | ||
9226 | char *page) | 7410 | char *page) |
9227 | { | 7411 | { |
9228 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7412 | return sprintf(page, "%u\n", sched_mc_power_savings); |
9229 | } | 7413 | } |
9230 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 7414 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
7415 | struct sysdev_class_attribute *attr, | ||
9231 | const char *buf, size_t count) | 7416 | const char *buf, size_t count) |
9232 | { | 7417 | { |
9233 | return sched_power_savings_store(buf, count, 0); | 7418 | return sched_power_savings_store(buf, count, 0); |
@@ -9239,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | |||
9239 | 7424 | ||
9240 | #ifdef CONFIG_SCHED_SMT | 7425 | #ifdef CONFIG_SCHED_SMT |
9241 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 7426 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
7427 | struct sysdev_class_attribute *attr, | ||
9242 | char *page) | 7428 | char *page) |
9243 | { | 7429 | { |
9244 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7430 | return sprintf(page, "%u\n", sched_smt_power_savings); |
9245 | } | 7431 | } |
9246 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 7432 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
7433 | struct sysdev_class_attribute *attr, | ||
9247 | const char *buf, size_t count) | 7434 | const char *buf, size_t count) |
9248 | { | 7435 | { |
9249 | return sched_power_savings_store(buf, count, 1); | 7436 | return sched_power_savings_store(buf, count, 1); |
@@ -9458,7 +7645,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
9458 | tg->rt_rq[cpu] = rt_rq; | 7645 | tg->rt_rq[cpu] = rt_rq; |
9459 | init_rt_rq(rt_rq, rq); | 7646 | init_rt_rq(rt_rq, rq); |
9460 | rt_rq->tg = tg; | 7647 | rt_rq->tg = tg; |
9461 | rt_rq->rt_se = rt_se; | ||
9462 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7648 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
9463 | if (add) | 7649 | if (add) |
9464 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7650 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
@@ -9489,9 +7675,6 @@ void __init sched_init(void) | |||
9489 | #ifdef CONFIG_RT_GROUP_SCHED | 7675 | #ifdef CONFIG_RT_GROUP_SCHED |
9490 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7676 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
9491 | #endif | 7677 | #endif |
9492 | #ifdef CONFIG_USER_SCHED | ||
9493 | alloc_size *= 2; | ||
9494 | #endif | ||
9495 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7678 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9496 | alloc_size += num_possible_cpus() * cpumask_size(); | 7679 | alloc_size += num_possible_cpus() * cpumask_size(); |
9497 | #endif | 7680 | #endif |
@@ -9505,13 +7688,6 @@ void __init sched_init(void) | |||
9505 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7688 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; |
9506 | ptr += nr_cpu_ids * sizeof(void **); | 7689 | ptr += nr_cpu_ids * sizeof(void **); |
9507 | 7690 | ||
9508 | #ifdef CONFIG_USER_SCHED | ||
9509 | root_task_group.se = (struct sched_entity **)ptr; | ||
9510 | ptr += nr_cpu_ids * sizeof(void **); | ||
9511 | |||
9512 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
9513 | ptr += nr_cpu_ids * sizeof(void **); | ||
9514 | #endif /* CONFIG_USER_SCHED */ | ||
9515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7691 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9516 | #ifdef CONFIG_RT_GROUP_SCHED | 7692 | #ifdef CONFIG_RT_GROUP_SCHED |
9517 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7693 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
@@ -9520,13 +7696,6 @@ void __init sched_init(void) | |||
9520 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7696 | init_task_group.rt_rq = (struct rt_rq **)ptr; |
9521 | ptr += nr_cpu_ids * sizeof(void **); | 7697 | ptr += nr_cpu_ids * sizeof(void **); |
9522 | 7698 | ||
9523 | #ifdef CONFIG_USER_SCHED | ||
9524 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
9525 | ptr += nr_cpu_ids * sizeof(void **); | ||
9526 | |||
9527 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
9528 | ptr += nr_cpu_ids * sizeof(void **); | ||
9529 | #endif /* CONFIG_USER_SCHED */ | ||
9530 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7699 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9531 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7700 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9532 | for_each_possible_cpu(i) { | 7701 | for_each_possible_cpu(i) { |
@@ -9546,22 +7715,13 @@ void __init sched_init(void) | |||
9546 | #ifdef CONFIG_RT_GROUP_SCHED | 7715 | #ifdef CONFIG_RT_GROUP_SCHED |
9547 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7716 | init_rt_bandwidth(&init_task_group.rt_bandwidth, |
9548 | global_rt_period(), global_rt_runtime()); | 7717 | global_rt_period(), global_rt_runtime()); |
9549 | #ifdef CONFIG_USER_SCHED | ||
9550 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
9551 | global_rt_period(), RUNTIME_INF); | ||
9552 | #endif /* CONFIG_USER_SCHED */ | ||
9553 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7718 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9554 | 7719 | ||
9555 | #ifdef CONFIG_GROUP_SCHED | 7720 | #ifdef CONFIG_CGROUP_SCHED |
9556 | list_add(&init_task_group.list, &task_groups); | 7721 | list_add(&init_task_group.list, &task_groups); |
9557 | INIT_LIST_HEAD(&init_task_group.children); | 7722 | INIT_LIST_HEAD(&init_task_group.children); |
9558 | 7723 | ||
9559 | #ifdef CONFIG_USER_SCHED | 7724 | #endif /* CONFIG_CGROUP_SCHED */ |
9560 | INIT_LIST_HEAD(&root_task_group.children); | ||
9561 | init_task_group.parent = &root_task_group; | ||
9562 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
9563 | #endif /* CONFIG_USER_SCHED */ | ||
9564 | #endif /* CONFIG_GROUP_SCHED */ | ||
9565 | 7725 | ||
9566 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | 7726 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP |
9567 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | 7727 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), |
@@ -9601,25 +7761,6 @@ void __init sched_init(void) | |||
9601 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7761 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). |
9602 | */ | 7762 | */ |
9603 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7763 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); |
9604 | #elif defined CONFIG_USER_SCHED | ||
9605 | root_task_group.shares = NICE_0_LOAD; | ||
9606 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
9607 | /* | ||
9608 | * In case of task-groups formed thr' the user id of tasks, | ||
9609 | * init_task_group represents tasks belonging to root user. | ||
9610 | * Hence it forms a sibling of all subsequent groups formed. | ||
9611 | * In this case, init_task_group gets only a fraction of overall | ||
9612 | * system cpu resource, based on the weight assigned to root | ||
9613 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
9614 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
9615 | * (init_tg_cfs_rq) and having one entity represent this group of | ||
9616 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
9617 | */ | ||
9618 | init_tg_cfs_entry(&init_task_group, | ||
9619 | &per_cpu(init_tg_cfs_rq, i), | ||
9620 | &per_cpu(init_sched_entity, i), i, 1, | ||
9621 | root_task_group.se[i]); | ||
9622 | |||
9623 | #endif | 7764 | #endif |
9624 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7765 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9625 | 7766 | ||
@@ -9628,12 +7769,6 @@ void __init sched_init(void) | |||
9628 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7769 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
9629 | #ifdef CONFIG_CGROUP_SCHED | 7770 | #ifdef CONFIG_CGROUP_SCHED |
9630 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | 7771 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); |
9631 | #elif defined CONFIG_USER_SCHED | ||
9632 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
9633 | init_tg_rt_entry(&init_task_group, | ||
9634 | &per_cpu(init_rt_rq_var, i), | ||
9635 | &per_cpu(init_sched_rt_entity, i), i, 1, | ||
9636 | root_task_group.rt_se[i]); | ||
9637 | #endif | 7772 | #endif |
9638 | #endif | 7773 | #endif |
9639 | 7774 | ||
@@ -9718,7 +7853,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
9718 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 7853 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
9719 | } | 7854 | } |
9720 | 7855 | ||
9721 | void __might_sleep(char *file, int line, int preempt_offset) | 7856 | void __might_sleep(const char *file, int line, int preempt_offset) |
9722 | { | 7857 | { |
9723 | #ifdef in_atomic | 7858 | #ifdef in_atomic |
9724 | static unsigned long prev_jiffy; /* ratelimiting */ | 7859 | static unsigned long prev_jiffy; /* ratelimiting */ |
@@ -10029,7 +8164,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
10029 | } | 8164 | } |
10030 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8165 | #endif /* CONFIG_RT_GROUP_SCHED */ |
10031 | 8166 | ||
10032 | #ifdef CONFIG_GROUP_SCHED | 8167 | #ifdef CONFIG_CGROUP_SCHED |
10033 | static void free_sched_group(struct task_group *tg) | 8168 | static void free_sched_group(struct task_group *tg) |
10034 | { | 8169 | { |
10035 | free_fair_sched_group(tg); | 8170 | free_fair_sched_group(tg); |
@@ -10134,11 +8269,11 @@ void sched_move_task(struct task_struct *tsk) | |||
10134 | if (unlikely(running)) | 8269 | if (unlikely(running)) |
10135 | tsk->sched_class->set_curr_task(rq); | 8270 | tsk->sched_class->set_curr_task(rq); |
10136 | if (on_rq) | 8271 | if (on_rq) |
10137 | enqueue_task(rq, tsk, 0); | 8272 | enqueue_task(rq, tsk, 0, false); |
10138 | 8273 | ||
10139 | task_rq_unlock(rq, &flags); | 8274 | task_rq_unlock(rq, &flags); |
10140 | } | 8275 | } |
10141 | #endif /* CONFIG_GROUP_SCHED */ | 8276 | #endif /* CONFIG_CGROUP_SCHED */ |
10142 | 8277 | ||
10143 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8278 | #ifdef CONFIG_FAIR_GROUP_SCHED |
10144 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | 8279 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
@@ -10280,13 +8415,6 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
10280 | runtime = d->rt_runtime; | 8415 | runtime = d->rt_runtime; |
10281 | } | 8416 | } |
10282 | 8417 | ||
10283 | #ifdef CONFIG_USER_SCHED | ||
10284 | if (tg == &root_task_group) { | ||
10285 | period = global_rt_period(); | ||
10286 | runtime = global_rt_runtime(); | ||
10287 | } | ||
10288 | #endif | ||
10289 | |||
10290 | /* | 8418 | /* |
10291 | * Cannot have more runtime than the period. | 8419 | * Cannot have more runtime than the period. |
10292 | */ | 8420 | */ |
@@ -10689,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
10689 | struct cpuacct { | 8817 | struct cpuacct { |
10690 | struct cgroup_subsys_state css; | 8818 | struct cgroup_subsys_state css; |
10691 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8819 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
10692 | u64 *cpuusage; | 8820 | u64 __percpu *cpuusage; |
10693 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8821 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
10694 | struct cpuacct *parent; | 8822 | struct cpuacct *parent; |
10695 | }; | 8823 | }; |
@@ -10906,12 +9034,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
10906 | } | 9034 | } |
10907 | 9035 | ||
10908 | /* | 9036 | /* |
9037 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9038 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9039 | * percpu_counter_add with values large enough to always overflow the | ||
9040 | * per cpu batch limit causing bad SMP scalability. | ||
9041 | * | ||
9042 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9043 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9044 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9045 | */ | ||
9046 | #ifdef CONFIG_SMP | ||
9047 | #define CPUACCT_BATCH \ | ||
9048 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9049 | #else | ||
9050 | #define CPUACCT_BATCH 0 | ||
9051 | #endif | ||
9052 | |||
9053 | /* | ||
10909 | * Charge the system/user time to the task's accounting group. | 9054 | * Charge the system/user time to the task's accounting group. |
10910 | */ | 9055 | */ |
10911 | static void cpuacct_update_stats(struct task_struct *tsk, | 9056 | static void cpuacct_update_stats(struct task_struct *tsk, |
10912 | enum cpuacct_stat_index idx, cputime_t val) | 9057 | enum cpuacct_stat_index idx, cputime_t val) |
10913 | { | 9058 | { |
10914 | struct cpuacct *ca; | 9059 | struct cpuacct *ca; |
9060 | int batch = CPUACCT_BATCH; | ||
10915 | 9061 | ||
10916 | if (unlikely(!cpuacct_subsys.active)) | 9062 | if (unlikely(!cpuacct_subsys.active)) |
10917 | return; | 9063 | return; |
@@ -10920,7 +9066,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, | |||
10920 | ca = task_ca(tsk); | 9066 | ca = task_ca(tsk); |
10921 | 9067 | ||
10922 | do { | 9068 | do { |
10923 | percpu_counter_add(&ca->cpustat[idx], val); | 9069 | __percpu_counter_add(&ca->cpustat[idx], val, batch); |
10924 | ca = ca->parent; | 9070 | ca = ca->parent; |
10925 | } while (ca); | 9071 | } while (ca); |
10926 | rcu_read_unlock(); | 9072 | rcu_read_unlock(); |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..82095bf2099f 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -47,9 +47,7 @@ static int convert_prio(int prio) | |||
47 | } | 47 | } |
48 | 48 | ||
49 | #define for_each_cpupri_active(array, idx) \ | 49 | #define for_each_cpupri_active(array, idx) \ |
50 | for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ | 50 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) |
51 | idx < CPUPRI_NR_PRIORITIES; \ | ||
52 | idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) | ||
53 | 51 | ||
54 | /** | 52 | /** |
55 | * cpupri_find - find the best (lowest-pri) CPU in the system | 53 | * cpupri_find - find the best (lowest-pri) CPU in the system |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8fe7ee81c552..3e1fd96c6cf9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq) | |||
1053 | * increased. Here we update the fair scheduling stats and | 1053 | * increased. Here we update the fair scheduling stats and |
1054 | * then put the task into the rbtree: | 1054 | * then put the task into the rbtree: |
1055 | */ | 1055 | */ |
1056 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 1056 | static void |
1057 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
1057 | { | 1058 | { |
1058 | struct cfs_rq *cfs_rq; | 1059 | struct cfs_rq *cfs_rq; |
1059 | struct sched_entity *se = &p->se; | 1060 | struct sched_entity *se = &p->se; |
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1815 | */ | 1816 | */ |
1816 | 1817 | ||
1817 | /* | 1818 | /* |
1818 | * Load-balancing iterator. Note: while the runqueue stays locked | 1819 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1819 | * during the whole iteration, the current task might be | 1820 | * Both runqueues must be locked. |
1820 | * dequeued so the iterator has to be dequeue-safe. Here we | ||
1821 | * achieve that by always pre-iterating before returning | ||
1822 | * the current task: | ||
1823 | */ | 1821 | */ |
1824 | static struct task_struct * | 1822 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
1825 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | 1823 | struct rq *this_rq, int this_cpu) |
1826 | { | 1824 | { |
1827 | struct task_struct *p = NULL; | 1825 | deactivate_task(src_rq, p, 0); |
1828 | struct sched_entity *se; | 1826 | set_task_cpu(p, this_cpu); |
1827 | activate_task(this_rq, p, 0); | ||
1828 | check_preempt_curr(this_rq, p, 0); | ||
1829 | } | ||
1829 | 1830 | ||
1830 | if (next == &cfs_rq->tasks) | 1831 | /* |
1831 | return NULL; | 1832 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1833 | */ | ||
1834 | static | ||
1835 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
1836 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1837 | int *all_pinned) | ||
1838 | { | ||
1839 | int tsk_cache_hot = 0; | ||
1840 | /* | ||
1841 | * We do not migrate tasks that are: | ||
1842 | * 1) running (obviously), or | ||
1843 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
1844 | * 3) are cache-hot on their current CPU. | ||
1845 | */ | ||
1846 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
1847 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
1848 | return 0; | ||
1849 | } | ||
1850 | *all_pinned = 0; | ||
1832 | 1851 | ||
1833 | se = list_entry(next, struct sched_entity, group_node); | 1852 | if (task_running(rq, p)) { |
1834 | p = task_of(se); | 1853 | schedstat_inc(p, se.nr_failed_migrations_running); |
1835 | cfs_rq->balance_iterator = next->next; | 1854 | return 0; |
1855 | } | ||
1836 | 1856 | ||
1837 | return p; | 1857 | /* |
1838 | } | 1858 | * Aggressive migration if: |
1859 | * 1) task is cache cold, or | ||
1860 | * 2) too many balance attempts have failed. | ||
1861 | */ | ||
1839 | 1862 | ||
1840 | static struct task_struct *load_balance_start_fair(void *arg) | 1863 | tsk_cache_hot = task_hot(p, rq->clock, sd); |
1841 | { | 1864 | if (!tsk_cache_hot || |
1842 | struct cfs_rq *cfs_rq = arg; | 1865 | sd->nr_balance_failed > sd->cache_nice_tries) { |
1866 | #ifdef CONFIG_SCHEDSTATS | ||
1867 | if (tsk_cache_hot) { | ||
1868 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
1869 | schedstat_inc(p, se.nr_forced_migrations); | ||
1870 | } | ||
1871 | #endif | ||
1872 | return 1; | ||
1873 | } | ||
1843 | 1874 | ||
1844 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); | 1875 | if (tsk_cache_hot) { |
1876 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
1877 | return 0; | ||
1878 | } | ||
1879 | return 1; | ||
1845 | } | 1880 | } |
1846 | 1881 | ||
1847 | static struct task_struct *load_balance_next_fair(void *arg) | 1882 | /* |
1883 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
1884 | * part of active balancing operations within "domain". | ||
1885 | * Returns 1 if successful and 0 otherwise. | ||
1886 | * | ||
1887 | * Called with both runqueues locked. | ||
1888 | */ | ||
1889 | static int | ||
1890 | move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1891 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
1848 | { | 1892 | { |
1849 | struct cfs_rq *cfs_rq = arg; | 1893 | struct task_struct *p, *n; |
1894 | struct cfs_rq *cfs_rq; | ||
1895 | int pinned = 0; | ||
1896 | |||
1897 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | ||
1898 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | ||
1899 | |||
1900 | if (!can_migrate_task(p, busiest, this_cpu, | ||
1901 | sd, idle, &pinned)) | ||
1902 | continue; | ||
1850 | 1903 | ||
1851 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1904 | pull_task(busiest, p, this_rq, this_cpu); |
1905 | /* | ||
1906 | * Right now, this is only the second place pull_task() | ||
1907 | * is called, so we can safely collect pull_task() | ||
1908 | * stats here rather than inside pull_task(). | ||
1909 | */ | ||
1910 | schedstat_inc(sd, lb_gained[idle]); | ||
1911 | return 1; | ||
1912 | } | ||
1913 | } | ||
1914 | |||
1915 | return 0; | ||
1852 | } | 1916 | } |
1853 | 1917 | ||
1854 | static unsigned long | 1918 | static unsigned long |
1855 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1919 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1856 | unsigned long max_load_move, struct sched_domain *sd, | 1920 | unsigned long max_load_move, struct sched_domain *sd, |
1857 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | 1921 | enum cpu_idle_type idle, int *all_pinned, |
1858 | struct cfs_rq *cfs_rq) | 1922 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) |
1859 | { | 1923 | { |
1860 | struct rq_iterator cfs_rq_iterator; | 1924 | int loops = 0, pulled = 0, pinned = 0; |
1925 | long rem_load_move = max_load_move; | ||
1926 | struct task_struct *p, *n; | ||
1861 | 1927 | ||
1862 | cfs_rq_iterator.start = load_balance_start_fair; | 1928 | if (max_load_move == 0) |
1863 | cfs_rq_iterator.next = load_balance_next_fair; | 1929 | goto out; |
1864 | cfs_rq_iterator.arg = cfs_rq; | ||
1865 | 1930 | ||
1866 | return balance_tasks(this_rq, this_cpu, busiest, | 1931 | pinned = 1; |
1867 | max_load_move, sd, idle, all_pinned, | 1932 | |
1868 | this_best_prio, &cfs_rq_iterator); | 1933 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
1934 | if (loops++ > sysctl_sched_nr_migrate) | ||
1935 | break; | ||
1936 | |||
1937 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
1938 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | ||
1939 | continue; | ||
1940 | |||
1941 | pull_task(busiest, p, this_rq, this_cpu); | ||
1942 | pulled++; | ||
1943 | rem_load_move -= p->se.load.weight; | ||
1944 | |||
1945 | #ifdef CONFIG_PREEMPT | ||
1946 | /* | ||
1947 | * NEWIDLE balancing is a source of latency, so preemptible | ||
1948 | * kernels will stop after the first task is pulled to minimize | ||
1949 | * the critical section. | ||
1950 | */ | ||
1951 | if (idle == CPU_NEWLY_IDLE) | ||
1952 | break; | ||
1953 | #endif | ||
1954 | |||
1955 | /* | ||
1956 | * We only want to steal up to the prescribed amount of | ||
1957 | * weighted load. | ||
1958 | */ | ||
1959 | if (rem_load_move <= 0) | ||
1960 | break; | ||
1961 | |||
1962 | if (p->prio < *this_best_prio) | ||
1963 | *this_best_prio = p->prio; | ||
1964 | } | ||
1965 | out: | ||
1966 | /* | ||
1967 | * Right now, this is one of only two places pull_task() is called, | ||
1968 | * so we can safely collect pull_task() stats here rather than | ||
1969 | * inside pull_task(). | ||
1970 | */ | ||
1971 | schedstat_add(sd, lb_gained[idle], pulled); | ||
1972 | |||
1973 | if (all_pinned) | ||
1974 | *all_pinned = pinned; | ||
1975 | |||
1976 | return max_load_move - rem_load_move; | ||
1869 | } | 1977 | } |
1870 | 1978 | ||
1871 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1979 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1897 | rem_load = (u64)rem_load_move * busiest_weight; | 2005 | rem_load = (u64)rem_load_move * busiest_weight; |
1898 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2006 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
1899 | 2007 | ||
1900 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, | 2008 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
1901 | rem_load, sd, idle, all_pinned, this_best_prio, | 2009 | rem_load, sd, idle, all_pinned, this_best_prio, |
1902 | tg->cfs_rq[busiest_cpu]); | 2010 | busiest_cfs_rq); |
1903 | 2011 | ||
1904 | if (!moved_load) | 2012 | if (!moved_load) |
1905 | continue; | 2013 | continue; |
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1922 | struct sched_domain *sd, enum cpu_idle_type idle, | 2030 | struct sched_domain *sd, enum cpu_idle_type idle, |
1923 | int *all_pinned, int *this_best_prio) | 2031 | int *all_pinned, int *this_best_prio) |
1924 | { | 2032 | { |
1925 | return __load_balance_fair(this_rq, this_cpu, busiest, | 2033 | return balance_tasks(this_rq, this_cpu, busiest, |
1926 | max_load_move, sd, idle, all_pinned, | 2034 | max_load_move, sd, idle, all_pinned, |
1927 | this_best_prio, &busiest->cfs); | 2035 | this_best_prio, &busiest->cfs); |
1928 | } | 2036 | } |
1929 | #endif | 2037 | #endif |
1930 | 2038 | ||
1931 | static int | 2039 | /* |
1932 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2040 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
1933 | struct sched_domain *sd, enum cpu_idle_type idle) | 2041 | * this_rq, as part of a balancing operation within domain "sd". |
2042 | * Returns 1 if successful and 0 otherwise. | ||
2043 | * | ||
2044 | * Called with both runqueues locked. | ||
2045 | */ | ||
2046 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2047 | unsigned long max_load_move, | ||
2048 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2049 | int *all_pinned) | ||
1934 | { | 2050 | { |
1935 | struct cfs_rq *busy_cfs_rq; | 2051 | unsigned long total_load_moved = 0, load_moved; |
1936 | struct rq_iterator cfs_rq_iterator; | 2052 | int this_best_prio = this_rq->curr->prio; |
1937 | 2053 | ||
1938 | cfs_rq_iterator.start = load_balance_start_fair; | 2054 | do { |
1939 | cfs_rq_iterator.next = load_balance_next_fair; | 2055 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2056 | max_load_move - total_load_moved, | ||
2057 | sd, idle, all_pinned, &this_best_prio); | ||
1940 | 2058 | ||
1941 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 2059 | total_load_moved += load_moved; |
2060 | |||
2061 | #ifdef CONFIG_PREEMPT | ||
1942 | /* | 2062 | /* |
1943 | * pass busy_cfs_rq argument into | 2063 | * NEWIDLE balancing is a source of latency, so preemptible |
1944 | * load_balance_[start|next]_fair iterators | 2064 | * kernels will stop after the first task is pulled to minimize |
2065 | * the critical section. | ||
1945 | */ | 2066 | */ |
1946 | cfs_rq_iterator.arg = busy_cfs_rq; | 2067 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) |
1947 | if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 2068 | break; |
1948 | &cfs_rq_iterator)) | 2069 | |
1949 | return 1; | 2070 | if (raw_spin_is_contended(&this_rq->lock) || |
2071 | raw_spin_is_contended(&busiest->lock)) | ||
2072 | break; | ||
2073 | #endif | ||
2074 | } while (load_moved && max_load_move > total_load_moved); | ||
2075 | |||
2076 | return total_load_moved > 0; | ||
2077 | } | ||
2078 | |||
2079 | /********** Helpers for find_busiest_group ************************/ | ||
2080 | /* | ||
2081 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
2082 | * during load balancing. | ||
2083 | */ | ||
2084 | struct sd_lb_stats { | ||
2085 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
2086 | struct sched_group *this; /* Local group in this sd */ | ||
2087 | unsigned long total_load; /* Total load of all groups in sd */ | ||
2088 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
2089 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
2090 | |||
2091 | /* Statistics of this group */ | ||
2092 | unsigned long this_load; | ||
2093 | unsigned long this_load_per_task; | ||
2094 | unsigned long this_nr_running; | ||
2095 | |||
2096 | /* Statistics of the busiest group */ | ||
2097 | unsigned long max_load; | ||
2098 | unsigned long busiest_load_per_task; | ||
2099 | unsigned long busiest_nr_running; | ||
2100 | unsigned long busiest_group_capacity; | ||
2101 | |||
2102 | int group_imb; /* Is there imbalance in this sd */ | ||
2103 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2104 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
2105 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
2106 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
2107 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
2108 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
2109 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
2110 | #endif | ||
2111 | }; | ||
2112 | |||
2113 | /* | ||
2114 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
2115 | */ | ||
2116 | struct sg_lb_stats { | ||
2117 | unsigned long avg_load; /* Avg load across the CPUs of the group */ | ||
2118 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
2119 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
2120 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
2121 | unsigned long group_capacity; | ||
2122 | int group_imb; /* Is there an imbalance in the group ? */ | ||
2123 | }; | ||
2124 | |||
2125 | /** | ||
2126 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
2127 | * @group: The group whose first cpu is to be returned. | ||
2128 | */ | ||
2129 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
2130 | { | ||
2131 | return cpumask_first(sched_group_cpus(group)); | ||
2132 | } | ||
2133 | |||
2134 | /** | ||
2135 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
2136 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
2137 | * @idle: The idle status of the CPU for whose sd the load_idx is obtained. | ||
2138 | */ | ||
2139 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
2140 | enum cpu_idle_type idle) | ||
2141 | { | ||
2142 | int load_idx; | ||
2143 | |||
2144 | switch (idle) { | ||
2145 | case CPU_NOT_IDLE: | ||
2146 | load_idx = sd->busy_idx; | ||
2147 | break; | ||
2148 | |||
2149 | case CPU_NEWLY_IDLE: | ||
2150 | load_idx = sd->newidle_idx; | ||
2151 | break; | ||
2152 | default: | ||
2153 | load_idx = sd->idle_idx; | ||
2154 | break; | ||
1950 | } | 2155 | } |
1951 | 2156 | ||
2157 | return load_idx; | ||
2158 | } | ||
2159 | |||
2160 | |||
2161 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2162 | /** | ||
2163 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
2164 | * the given sched_domain, during load balancing. | ||
2165 | * | ||
2166 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
2167 | * @sds: Variable containing the statistics for sd. | ||
2168 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
2169 | */ | ||
2170 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
2171 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
2172 | { | ||
2173 | /* | ||
2174 | * Busy processors will not participate in power savings | ||
2175 | * balance. | ||
2176 | */ | ||
2177 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2178 | sds->power_savings_balance = 0; | ||
2179 | else { | ||
2180 | sds->power_savings_balance = 1; | ||
2181 | sds->min_nr_running = ULONG_MAX; | ||
2182 | sds->leader_nr_running = 0; | ||
2183 | } | ||
2184 | } | ||
2185 | |||
2186 | /** | ||
2187 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
2188 | * sched_domain while performing load balancing. | ||
2189 | * | ||
2190 | * @group: sched_group belonging to the sched_domain under consideration. | ||
2191 | * @sds: Variable containing the statistics of the sched_domain | ||
2192 | * @local_group: Does group contain the CPU for which we're performing | ||
2193 | * load balancing ? | ||
2194 | * @sgs: Variable containing the statistics of the group. | ||
2195 | */ | ||
2196 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
2197 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
2198 | { | ||
2199 | |||
2200 | if (!sds->power_savings_balance) | ||
2201 | return; | ||
2202 | |||
2203 | /* | ||
2204 | * If the local group is idle or completely loaded | ||
2205 | * no need to do power savings balance at this domain | ||
2206 | */ | ||
2207 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
2208 | !sds->this_nr_running)) | ||
2209 | sds->power_savings_balance = 0; | ||
2210 | |||
2211 | /* | ||
2212 | * If a group is already running at full capacity or idle, | ||
2213 | * don't include that group in power savings calculations | ||
2214 | */ | ||
2215 | if (!sds->power_savings_balance || | ||
2216 | sgs->sum_nr_running >= sgs->group_capacity || | ||
2217 | !sgs->sum_nr_running) | ||
2218 | return; | ||
2219 | |||
2220 | /* | ||
2221 | * Calculate the group which has the least non-idle load. | ||
2222 | * This is the group from where we need to pick up the load | ||
2223 | * for saving power | ||
2224 | */ | ||
2225 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
2226 | (sgs->sum_nr_running == sds->min_nr_running && | ||
2227 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { /* tie: the group with the higher first cpu wins */ | ||
2228 | sds->group_min = group; | ||
2229 | sds->min_nr_running = sgs->sum_nr_running; | ||
2230 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
2231 | sgs->sum_nr_running; | ||
2232 | } | ||
2233 | |||
2234 | /* | ||
2235 | * Find the group that is closest to its capacity but still | ||
2236 | * has room for at least one more task, so that it can pick up | ||
2237 | * load from another group and save more power | ||
2238 | */ | ||
2239 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
2240 | return; | ||
2241 | |||
2242 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
2243 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
2244 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { /* tie: the group with the lower first cpu wins */ | ||
2245 | sds->group_leader = group; | ||
2246 | sds->leader_nr_running = sgs->sum_nr_running; | ||
2247 | } | ||
2248 | } | ||
2249 | |||
2250 | /** | ||
2251 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
2252 | * @sds: Variable containing the statistics of the sched_domain | ||
2253 | * under consideration. | ||
2254 | * @this_cpu: Cpu at which we're currently performing load-balancing (not read by this function). | ||
2255 | * @imbalance: Variable to store the imbalance. | ||
2256 | * | ||
2257 | * Description: | ||
2258 | * Check if we have potential to perform some power-savings balance. | ||
2259 | * If yes, set the busiest group to be the least loaded group in the | ||
2260 | * sched_domain, so that its CPUs can be put to idle. | ||
2261 | * | ||
2262 | * Returns 1 if there is potential to perform power-savings balance. | ||
2263 | * Else returns 0. | ||
2264 | */ | ||
2265 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
2266 | int this_cpu, unsigned long *imbalance) | ||
2267 | { | ||
2268 | if (!sds->power_savings_balance) | ||
2269 | return 0; | ||
2270 | /* Only pull when our own group is the leader and is not also group_min. */ |||
2271 | if (sds->this != sds->group_leader || | ||
2272 | sds->group_leader == sds->group_min) | ||
2273 | return 0; | ||
2274 | |||
2275 | *imbalance = sds->min_load_per_task; /* average task load of group_min */ | ||
2276 | sds->busiest = sds->group_min; | ||
2277 | |||
2278 | return 1; | ||
2279 | |||
2280 | } | ||
2281 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
2282 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
2283 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
2284 | { /* no-op stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is defined */ | ||
2285 | return; | ||
2286 | } | ||
2287 | |||
2288 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
2289 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
2290 | { /* no-op stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is defined */ | ||
2291 | return; | ||
2292 | } | ||
2293 | |||
2294 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
2295 | int this_cpu, unsigned long *imbalance) | ||
2296 | { /* stub: never reports a power-savings opportunity and leaves *imbalance untouched */ | ||
1952 | return 0; | 2297 | return 0; |
1953 | } | 2298 | } |
2299 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
2300 | |||
2301 | |||
2302 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
2303 | { /* default frequency factor: a constant that ignores both sd and cpu */ | ||
2304 | return SCHED_LOAD_SCALE; | ||
2305 | } | ||
2306 | |||
2307 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
2308 | { /* weak definition; this version simply forwards to the default factor */ | ||
2309 | return default_scale_freq_power(sd, cpu); | ||
2310 | } | ||
2311 | |||
2312 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
2313 | { | ||
2314 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
2315 | unsigned long smt_gain = sd->smt_gain; | ||
2316 | /* Split the domain's smt_gain evenly over the weight of its span; cpu is not used. */ |||
2317 | smt_gain /= weight; | ||
2318 | |||
2319 | return smt_gain; | ||
2320 | } | ||
2321 | |||
2322 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
2323 | { /* weak definition; this version simply forwards to the default factor */ | ||
2324 | return default_scale_smt_power(sd, cpu); | ||
2325 | } | ||
2326 | |||
2327 | unsigned long scale_rt_power(int cpu) | ||
2328 | { | ||
2329 | struct rq *rq = cpu_rq(cpu); | ||
2330 | u64 total, available; | ||
2331 | /* Returns (total - rq->rt_avg) divided by (total >> SCHED_LOAD_SHIFT) for this cpu's runqueue. */ |||
2332 | sched_avg_update(rq); | ||
2333 | |||
2334 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
2335 | available = total - rq->rt_avg; /* NOTE(review): u64 subtraction wraps if rq->rt_avg > total - confirm that cannot happen */ | ||
2336 | /* Clamp total from below before shifting; presumably this keeps the divisor non-zero - confirm SCHED_LOAD_SCALE vs SCHED_LOAD_SHIFT. */ |||
2337 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
2338 | total = SCHED_LOAD_SCALE; | ||
2339 | |||
2340 | total >>= SCHED_LOAD_SHIFT; | ||
2341 | |||
2342 | return div_u64(available, total); | ||
2343 | } | ||
2344 | |||
2345 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
2346 | { | ||
2347 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
2348 | unsigned long power = SCHED_LOAD_SCALE; | ||
2349 | struct sched_group *sdg = sd->groups; | ||
2350 | /* Apply each scaling factor in turn, shifting down by SCHED_LOAD_SHIFT after every multiply. */ |||
2351 | if (sched_feat(ARCH_POWER)) | ||
2352 | power *= arch_scale_freq_power(sd, cpu); | ||
2353 | else | ||
2354 | power *= default_scale_freq_power(sd, cpu); | ||
2355 | |||
2356 | power >>= SCHED_LOAD_SHIFT; | ||
2357 | /* The SMT factor only applies to SD_SHARE_CPUPOWER domains spanning more than one cpu. */ |||
2358 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
2359 | if (sched_feat(ARCH_POWER)) | ||
2360 | power *= arch_scale_smt_power(sd, cpu); | ||
2361 | else | ||
2362 | power *= default_scale_smt_power(sd, cpu); | ||
2363 | |||
2364 | power >>= SCHED_LOAD_SHIFT; | ||
2365 | } | ||
2366 | |||
2367 | power *= scale_rt_power(cpu); | ||
2368 | power >>= SCHED_LOAD_SHIFT; | ||
2369 | /* Never store zero: cpu_power is used as a divisor by the load-balancing code. */ |||
2370 | if (!power) | ||
2371 | power = 1; | ||
2372 | |||
2373 | sdg->cpu_power = power; | ||
2374 | } | ||
2375 | |||
2376 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
2377 | { | ||
2378 | struct sched_domain *child = sd->child; | ||
2379 | struct sched_group *group, *sdg = sd->groups; | ||
2380 | unsigned long power; | ||
2381 | /* A domain without a child derives its power from the cpu itself. */ |||
2382 | if (!child) { | ||
2383 | update_cpu_power(sd, cpu); | ||
2384 | return; | ||
2385 | } | ||
2386 | |||
2387 | power = 0; | ||
2388 | /* Otherwise sum cpu_power over the child domain's circular list of groups. */ |||
2389 | group = child->groups; | ||
2390 | do { | ||
2391 | power += group->cpu_power; | ||
2392 | group = group->next; | ||
2393 | } while (group != child->groups); | ||
2394 | |||
2395 | sdg->cpu_power = power; | ||
2396 | } | ||
2397 | |||
2398 | /** | ||
2399 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
2400 | * @sd: The sched_domain whose statistics are to be updated. | ||
2401 | * @group: sched_group whose statistics are to be updated. | ||
2402 | * @this_cpu: Cpu for which load balance is currently performed. | ||
2403 | * @idle: Idle status of this_cpu | ||
2404 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
2405 | * @sd_idle: Idle status of the sched_domain containing group; cleared as soon as a cpu with running tasks is seen. | ||
2406 | * @local_group: Does group contain this_cpu. | ||
2407 | * @cpus: Set of cpus considered for load balancing. | ||
2408 | * @balance: Should we balance; set to 0 when this_cpu is not the balance cpu of the local group (not done for CPU_NEWLY_IDLE). | ||
2409 | * @sgs: variable to hold the statistics for this group; sums are accumulated with +=, so it must start out zeroed. | ||
2410 | */ | ||
2411 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
2412 | struct sched_group *group, int this_cpu, | ||
2413 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
2414 | int local_group, const struct cpumask *cpus, | ||
2415 | int *balance, struct sg_lb_stats *sgs) | ||
2416 | { | ||
2417 | unsigned long load, max_cpu_load, min_cpu_load; | ||
2418 | int i; | ||
2419 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
2420 | unsigned long avg_load_per_task = 0; | ||
2421 | |||
2422 | if (local_group) | ||
2423 | balance_cpu = group_first_cpu(group); | ||
2424 | |||
2425 | /* Tally up the load of all CPUs in the group */ | ||
2426 | max_cpu_load = 0; | ||
2427 | min_cpu_load = ~0UL; | ||
2428 | |||
2429 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
2430 | struct rq *rq = cpu_rq(i); | ||
2431 | |||
2432 | if (*sd_idle && rq->nr_running) | ||
2433 | *sd_idle = 0; | ||
2434 | |||
2435 | /* Bias balancing toward cpus of our domain */ | ||
2436 | if (local_group) { | ||
2437 | if (idle_cpu(i) && !first_idle_cpu) { /* the first idle cpu replaces the group's first cpu as balance cpu */ | ||
2438 | first_idle_cpu = 1; | ||
2439 | balance_cpu = i; | ||
2440 | } | ||
2441 | |||
2442 | load = target_load(i, load_idx); | ||
2443 | } else { /* the per-cpu load extremes are only tracked for non-local groups */ | ||
2444 | load = source_load(i, load_idx); | ||
2445 | if (load > max_cpu_load) | ||
2446 | max_cpu_load = load; | ||
2447 | if (min_cpu_load > load) | ||
2448 | min_cpu_load = load; | ||
2449 | } | ||
2450 | |||
2451 | sgs->group_load += load; | ||
2452 | sgs->sum_nr_running += rq->nr_running; | ||
2453 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
2454 | |||
2455 | } | ||
2456 | |||
2457 | /* | ||
2458 | * First idle cpu or the first cpu(busiest) in this sched group | ||
2459 | * is eligible for doing load balancing at this and above | ||
2460 | * domains. In the newly idle case, we will allow all the cpu's | ||
2461 | * to do the newly idle load balance. | ||
2462 | */ | ||
2463 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
2464 | balance_cpu != this_cpu) { | ||
2465 | *balance = 0; | ||
2466 | return; | ||
2467 | } | ||
2468 | |||
2469 | update_group_power(sd, this_cpu); /* writes sd->groups->cpu_power */ | ||
2470 | |||
2471 | /* Adjust by relative CPU power of the group */ | ||
2472 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
2473 | |||
2474 | /* | ||
2475 | * Consider the group unbalanced when the imbalance is larger | ||
2476 | * than the average weight of two tasks. | ||
2477 | * | ||
2478 | * APZ: with cgroup the avg task weight can vary wildly and | ||
2479 | * might not be a suitable number - should we keep a | ||
2480 | * normalized nr_running number somewhere that negates | ||
2481 | * the hierarchy? | ||
2482 | */ | ||
2483 | if (sgs->sum_nr_running) | ||
2484 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | ||
2485 | |||
2486 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
2487 | sgs->group_imb = 1; | ||
2488 | /* Capacity is the group's cpu_power expressed in units of SCHED_LOAD_SCALE, rounded to nearest. */ |||
2489 | sgs->group_capacity = | ||
2490 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
2491 | } | ||
2492 | |||
2493 | /** | ||
2494 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | ||
2495 | * @sd: sched_domain whose statistics are to be updated. | ||
2496 | * @this_cpu: Cpu for which load balance is currently performed. | ||
2497 | * @idle: Idle status of this_cpu | ||
2498 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2499 | * @cpus: Set of cpus considered for load balancing. | ||
2500 | * @balance: Should we balance. | ||
2501 | * @sds: variable to hold the statistics for this sched_domain. | ||
2502 | */ | ||
2503 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
2504 | enum cpu_idle_type idle, int *sd_idle, | ||
2505 | const struct cpumask *cpus, int *balance, | ||
2506 | struct sd_lb_stats *sds) | ||
2507 | { | ||
2508 | struct sched_domain *child = sd->child; | ||
2509 | struct sched_group *group = sd->groups; | ||
2510 | struct sg_lb_stats sgs; | ||
2511 | int load_idx, prefer_sibling = 0; | ||
2512 | |||
2513 | if (child && child->flags & SD_PREFER_SIBLING) | ||
2514 | prefer_sibling = 1; | ||
2515 | |||
2516 | init_sd_power_savings_stats(sd, sds, idle); | ||
2517 | load_idx = get_sd_load_idx(sd, idle); | ||
2518 | /* Walk the domain's circular group list once, folding each group's stats into sds. */ |||
2519 | do { | ||
2520 | int local_group; | ||
2521 | |||
2522 | local_group = cpumask_test_cpu(this_cpu, | ||
2523 | sched_group_cpus(group)); | ||
2524 | memset(&sgs, 0, sizeof(sgs)); | ||
2525 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
2526 | local_group, cpus, balance, &sgs); | ||
2527 | /* update_sg_lb_stats() cleared *balance: this cpu is not the one to balance here. */ |||
2528 | if (local_group && !(*balance)) | ||
2529 | return; | ||
2530 | |||
2531 | sds->total_load += sgs.group_load; | ||
2532 | sds->total_pwr += group->cpu_power; | ||
2533 | |||
2534 | /* | ||
2535 | * In case the child domain prefers tasks go to siblings | ||
2536 | * first, lower the group capacity to one so that we'll try | ||
2537 | * and move all the excess tasks away. | ||
2538 | */ | ||
2539 | if (prefer_sibling) | ||
2540 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
2541 | /* Busiest candidate: highest avg_load among non-local groups that are over capacity or flagged group_imb. */ |||
2542 | if (local_group) { | ||
2543 | sds->this_load = sgs.avg_load; | ||
2544 | sds->this = group; | ||
2545 | sds->this_nr_running = sgs.sum_nr_running; | ||
2546 | sds->this_load_per_task = sgs.sum_weighted_load; /* still a sum; divided by this_nr_running in fix_small_imbalance() */ | ||
2547 | } else if (sgs.avg_load > sds->max_load && | ||
2548 | (sgs.sum_nr_running > sgs.group_capacity || | ||
2549 | sgs.group_imb)) { | ||
2550 | sds->max_load = sgs.avg_load; | ||
2551 | sds->busiest = group; | ||
2552 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
2553 | sds->busiest_group_capacity = sgs.group_capacity; | ||
2554 | sds->busiest_load_per_task = sgs.sum_weighted_load; /* still a sum; divided by busiest_nr_running in calculate_imbalance() */ | ||
2555 | sds->group_imb = sgs.group_imb; | ||
2556 | } | ||
2557 | |||
2558 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
2559 | group = group->next; | ||
2560 | } while (group != sd->groups); | ||
2561 | } | ||
2562 | |||
2563 | /** | ||
2564 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
2565 | * amongst the groups of a sched_domain, during | ||
2566 | * load balancing. | ||
2567 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
2568 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
2569 | * @imbalance: Variable to store the imbalance; only written when moving one busiest task is judged worthwhile. | ||
2570 | */ | ||
2571 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
2572 | int this_cpu, unsigned long *imbalance) | ||
2573 | { | ||
2574 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
2575 | unsigned int imbn = 2; | ||
2576 | unsigned long scaled_busy_load_per_task; | ||
2577 | /* imbn scales the threshold tested below; it drops to 1 when the busiest group's tasks are heavier than ours. */ |||
2578 | if (sds->this_nr_running) { | ||
2579 | sds->this_load_per_task /= sds->this_nr_running; | ||
2580 | if (sds->busiest_load_per_task > | ||
2581 | sds->this_load_per_task) | ||
2582 | imbn = 1; | ||
2583 | } else | ||
2584 | sds->this_load_per_task = | ||
2585 | cpu_avg_load_per_task(this_cpu); | ||
2586 | /* One busiest task's load, expressed relative to the busiest group's cpu_power. */ |||
2587 | scaled_busy_load_per_task = sds->busiest_load_per_task | ||
2588 | * SCHED_LOAD_SCALE; | ||
2589 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | ||
2590 | |||
2591 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | ||
2592 | (scaled_busy_load_per_task * imbn)) { | ||
2593 | *imbalance = sds->busiest_load_per_task; | ||
2594 | return; | ||
2595 | } | ||
2596 | |||
2597 | /* | ||
2598 | * OK, we don't have enough imbalance to justify moving tasks, | ||
2599 | * however we may be able to increase total CPU power used by | ||
2600 | * moving them. | ||
2601 | */ | ||
2602 | /* pwr_now: estimate for the current placement, summed over the busiest and the local group. */ |||
2603 | pwr_now += sds->busiest->cpu_power * | ||
2604 | min(sds->busiest_load_per_task, sds->max_load); | ||
2605 | pwr_now += sds->this->cpu_power * | ||
2606 | min(sds->this_load_per_task, sds->this_load); | ||
2607 | pwr_now /= SCHED_LOAD_SCALE; | ||
2608 | |||
2609 | /* Amount of load we'd subtract */ | ||
2610 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
2611 | sds->busiest->cpu_power; | ||
2612 | if (sds->max_load > tmp) | ||
2613 | pwr_move += sds->busiest->cpu_power * | ||
2614 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
2615 | |||
2616 | /* Amount of load we'd add */ | ||
2617 | if (sds->max_load * sds->busiest->cpu_power < | ||
2618 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
2619 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
2620 | sds->this->cpu_power; | ||
2621 | else | ||
2622 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
2623 | sds->this->cpu_power; | ||
2624 | pwr_move += sds->this->cpu_power * | ||
2625 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
2626 | pwr_move /= SCHED_LOAD_SCALE; | ||
2627 | |||
2628 | /* Move if we gain throughput */ | ||
2629 | if (pwr_move > pwr_now) | ||
2630 | *imbalance = sds->busiest_load_per_task; | ||
2631 | } | ||
2632 | |||
2633 | /** | ||
2634 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
2635 | * groups of a given sched_domain during load balance. | ||
2636 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
2637 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
2638 | * @imbalance: The variable to store the imbalance. | ||
2639 | */ | ||
2640 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
2641 | unsigned long *imbalance) | ||
2642 | { | ||
2643 | unsigned long max_pull, load_above_capacity = ~0UL; | ||
2644 | /* Turn the summed weighted load into a per-task average; busiest_nr_running must be non-zero here. */ |||
2645 | sds->busiest_load_per_task /= sds->busiest_nr_running; | ||
2646 | if (sds->group_imb) { | ||
2647 | sds->busiest_load_per_task = | ||
2648 | min(sds->busiest_load_per_task, sds->avg_load); | ||
2649 | } | ||
2650 | |||
2651 | /* | ||
2652 | * In the presence of smp nice balancing, certain scenarios can have | ||
2653 | * max load less than avg load(as we skip the groups at or below | ||
2654 | * its cpu_power, while calculating max_load..) | ||
2655 | */ | ||
2656 | if (sds->max_load < sds->avg_load) { | ||
2657 | *imbalance = 0; | ||
2658 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
2659 | } | ||
2660 | |||
2661 | if (!sds->group_imb) { | ||
2662 | /* | ||
2663 | * Don't want to pull so many tasks that a group would go idle. | ||
2664 | */ | ||
2665 | load_above_capacity = (sds->busiest_nr_running - | ||
2666 | sds->busiest_group_capacity); | ||
2667 | |||
2668 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | ||
2669 | |||
2670 | load_above_capacity /= sds->busiest->cpu_power; | ||
2671 | } | ||
2672 | |||
2673 | /* | ||
2674 | * We're trying to get all the cpus to the average_load, so we don't | ||
2675 | * want to push ourselves above the average load, nor do we wish to | ||
2676 | * reduce the max loaded cpu below the average load. At the same time, | ||
2677 | * we also don't want to reduce the group load below the group capacity | ||
2678 | * (so that we can implement power-savings policies etc). Thus we look | ||
2679 | * for the minimum possible imbalance. | ||
2680 | * Be careful of negative numbers as they'll appear as very large values | ||
2681 | * with unsigned longs. | ||
2682 | */ | ||
2683 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | ||
2684 | |||
2685 | /* How much load to actually move to equalise the imbalance */ | ||
2686 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
2687 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
2688 | / SCHED_LOAD_SCALE; | ||
2689 | |||
2690 | /* | ||
2691 | * if *imbalance is less than the average load per runnable task | ||
2692 | * there is no guarantee that any tasks will be moved so we'll have | ||
2693 | * a think about bumping its value to force at least one task to be | ||
2694 | * moved | ||
2695 | */ | ||
2696 | if (*imbalance < sds->busiest_load_per_task) | ||
2697 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
2698 | |||
2699 | } | ||
2700 | /******* find_busiest_group() helpers end here *********************/ | ||
2701 | |||
2702 | /** | ||
2703 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
2704 | * if there is an imbalance. If there isn't an imbalance, and | ||
2705 | * the user has opted for power-savings, it returns a group whose | ||
2706 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
2707 | * such a group exists. | ||
2708 | * | ||
2709 | * Also calculates the amount of weighted load which should be moved | ||
2710 | * to restore balance. | ||
2711 | * | ||
2712 | * @sd: The sched_domain whose busiest group is to be returned. | ||
2713 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
2714 | * @imbalance: Variable which stores amount of weighted load which should | ||
2715 | * be moved to restore balance/put a group to idle. | ||
2716 | * @idle: The idle status of this_cpu. | ||
2717 | * @sd_idle: The idleness of sd | ||
2718 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
2719 | * @balance: Pointer to a variable indicating if this_cpu | ||
2720 | * is the appropriate cpu to perform load balancing at this_level. | ||
2721 | * | ||
2722 | * Returns: - the busiest group if imbalance exists. | ||
2723 | * - If no imbalance and user has opted for power-savings balance, | ||
2724 | * return the least loaded group whose CPUs can be | ||
2725 | * put to idle by rebalancing its tasks onto our group. | ||
2726 | * - NULL (with *imbalance set to 0) otherwise. | ||
2727 | static struct sched_group * | ||
2728 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
2729 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
2730 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
2731 | { | ||
2732 | struct sd_lb_stats sds; | ||
2733 | |||
2734 | memset(&sds, 0, sizeof(sds)); | ||
2735 | |||
2736 | /* | ||
2737 | * Compute the various statistics relevant for load balancing at | ||
2738 | * this level. | ||
2739 | */ | ||
2740 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
2741 | balance, &sds); | ||
2742 | |||
2743 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
2744 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
2745 | * at this level. | ||
2746 | * 2) There is no busy sibling group to pull from. | ||
2747 | * 3) This group is the busiest group. | ||
2748 | * 4) This group is more busy than the avg busyness at this | ||
2749 | * sched_domain. | ||
2750 | * 5) The imbalance is within the specified limit. | ||
2751 | */ | ||
2752 | if (!(*balance)) | ||
2753 | goto ret; | ||
2754 | |||
2755 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
2756 | goto out_balanced; | ||
2757 | |||
2758 | if (sds.this_load >= sds.max_load) | ||
2759 | goto out_balanced; | ||
2760 | |||
2761 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
2762 | |||
2763 | if (sds.this_load >= sds.avg_load) | ||
2764 | goto out_balanced; | ||
2765 | /* imbalance_pct is a percentage: the busiest load must exceed ours by that ratio. */ |||
2766 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
2767 | goto out_balanced; | ||
2768 | |||
2769 | /* Looks like there is an imbalance. Compute it */ | ||
2770 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
2771 | return sds.busiest; | ||
2772 | |||
2773 | out_balanced: | ||
2774 | /* | ||
2775 | * There is no obvious imbalance. But check if we can do some balancing | ||
2776 | * to save power. | ||
2777 | */ | ||
2778 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
2779 | return sds.busiest; | ||
2780 | ret: | ||
2781 | *imbalance = 0; | ||
2782 | return NULL; | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * find_busiest_queue - find the busiest runqueue among the cpus in group that are also set in cpus; returns NULL if none qualifies. | ||
2787 | */ | ||
2788 | static struct rq * | ||
2789 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
2790 | unsigned long imbalance, const struct cpumask *cpus) | ||
2791 | { | ||
2792 | struct rq *busiest = NULL, *rq; | ||
2793 | unsigned long max_load = 0; | ||
2794 | int i; | ||
2795 | |||
2796 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2797 | unsigned long power = power_of(i); | ||
2798 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
2799 | unsigned long wl; | ||
2800 | |||
2801 | if (!cpumask_test_cpu(i, cpus)) | ||
2802 | continue; | ||
2803 | |||
2804 | rq = cpu_rq(i); | ||
2805 | wl = weighted_cpuload(i); | ||
2806 | |||
2807 | /* | ||
2808 | * When comparing with imbalance, use weighted_cpuload() | ||
2809 | * which is not scaled with the cpu power. | ||
2810 | */ | ||
2811 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
2812 | continue; /* skip a cpu whose single task outweighs the imbalance */ | ||
2813 | |||
2814 | /* | ||
2815 | * For the load comparisons with the other cpu's, consider | ||
2816 | * the weighted_cpuload() scaled with the cpu power, so that | ||
2817 | * the load can be moved away from the cpu that is potentially | ||
2818 | * running at a lower capacity. | ||
2819 | */ | ||
2820 | wl = (wl * SCHED_LOAD_SCALE) / power; | ||
2821 | |||
2822 | if (wl > max_load) { | ||
2823 | max_load = wl; | ||
2824 | busiest = rq; | ||
2825 | } | ||
2826 | } | ||
2827 | |||
2828 | return busiest; | ||
2829 | } | ||
2830 | |||
2831 | /* | ||
2832 | * Max backoff if we encounter pinned tasks: load_balance() stops doubling | ||
2833 | * sd->balance_interval for the all-pinned case at this bound. Pretty arbitrary value, but so long as it is large enough. | ||
2834 | */ | ||
2835 | #define MAX_PINNED_INTERVAL 512 | ||
2836 | |||
2837 | /* Per-cpu working cpumask for load_balance and load_balance_newidle. */ | ||
2838 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
2839 | |||
2840 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | ||
2841 | { /* Returns non-zero when load_balance() should kick active balancing on the busiest runqueue. */ | ||
2842 | if (idle == CPU_NEWLY_IDLE) { | ||
2843 | /* | ||
2844 | * The only task running in a non-idle cpu can be moved to this | ||
2845 | * cpu in an attempt to completely freeup the other CPU | ||
2846 | * package. | ||
2847 | * | ||
2848 | * The package power saving logic comes from | ||
2849 | * find_busiest_group(). If there are no imbalance, then | ||
2850 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
2851 | * f_b_g() will select a group from which a running task may be | ||
2852 | * pulled to this cpu in order to make the other package idle. | ||
2853 | * If there is no opportunity to make a package idle and if | ||
2854 | * there are no imbalance, then f_b_g() will return NULL and no | ||
2855 | * action will be taken in load_balance_newidle(). | ||
2856 | * | ||
2857 | * Under normal task pull operation due to imbalance, there | ||
2858 | * will be more than one task in the source run queue and | ||
2859 | * move_tasks() will succeed. ld_moved will be true and this | ||
2860 | * active balance code will not be triggered. | ||
2861 | */ | ||
2862 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
2863 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2864 | return 0; | ||
2865 | |||
2866 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
2867 | return 0; | ||
2868 | } | ||
2869 | /* Otherwise go active only once nr_balance_failed exceeds cache_nice_tries + 2. */ |||
2870 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | ||
2871 | } | ||
2872 | |||
2873 | /* | ||
2874 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
2875 | * tasks if there is an imbalance. Returns what move_tasks() reported, 0 when nothing was moved, or -1 when nothing was moved in a non-idle SD_SHARE_CPUPOWER domain whose parent lacks SD_POWERSAVINGS_BALANCE. | ||
2876 | */ | ||
2877 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
2878 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2879 | int *balance) | ||
2880 | { | ||
2881 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
2882 | struct sched_group *group; | ||
2883 | unsigned long imbalance; | ||
2884 | struct rq *busiest; | ||
2885 | unsigned long flags; | ||
2886 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
2887 | /* Start from every active cpu; a runqueue whose tasks are all pinned is removed before a redo. */ |||
2888 | cpumask_copy(cpus, cpu_active_mask); | ||
2889 | |||
2890 | /* | ||
2891 | * When power savings policy is enabled for the parent domain, idle | ||
2892 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
2893 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
2894 | * portraying it as CPU_NOT_IDLE. | ||
2895 | */ | ||
2896 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
2897 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2898 | sd_idle = 1; | ||
2899 | |||
2900 | schedstat_inc(sd, lb_count[idle]); | ||
2901 | |||
2902 | redo: | ||
2903 | update_shares(sd); | ||
2904 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
2905 | cpus, balance); | ||
2906 | |||
2907 | if (*balance == 0) | ||
2908 | goto out_balanced; | ||
2909 | |||
2910 | if (!group) { | ||
2911 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
2912 | goto out_balanced; | ||
2913 | } | ||
2914 | |||
2915 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
2916 | if (!busiest) { | ||
2917 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
2918 | goto out_balanced; | ||
2919 | } | ||
2920 | |||
2921 | BUG_ON(busiest == this_rq); | ||
2922 | |||
2923 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
2924 | |||
2925 | ld_moved = 0; | ||
2926 | if (busiest->nr_running > 1) { | ||
2927 | /* | ||
2928 | * Attempt to move tasks. If find_busiest_group has found | ||
2929 | * an imbalance but busiest->nr_running <= 1, the group is | ||
2930 | * still unbalanced. ld_moved simply stays zero, so it is | ||
2931 | * correctly treated as an imbalance. | ||
2932 | */ | ||
2933 | local_irq_save(flags); | ||
2934 | double_rq_lock(this_rq, busiest); | ||
2935 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
2936 | imbalance, sd, idle, &all_pinned); | ||
2937 | double_rq_unlock(this_rq, busiest); | ||
2938 | local_irq_restore(flags); | ||
2939 | |||
2940 | /* | ||
2941 | * some other cpu did the load balance for us. | ||
2942 | */ | ||
2943 | if (ld_moved && this_cpu != smp_processor_id()) | ||
2944 | resched_cpu(this_cpu); | ||
2945 | |||
2946 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
2947 | if (unlikely(all_pinned)) { | ||
2948 | cpumask_clear_cpu(cpu_of(busiest), cpus); /* exclude this runqueue's cpu from the retry */ | ||
2949 | if (!cpumask_empty(cpus)) | ||
2950 | goto redo; | ||
2951 | goto out_balanced; | ||
2952 | } | ||
2953 | } | ||
2954 | |||
2955 | if (!ld_moved) { | ||
2956 | schedstat_inc(sd, lb_failed[idle]); | ||
2957 | sd->nr_balance_failed++; | ||
2958 | |||
2959 | if (need_active_balance(sd, sd_idle, idle)) { | ||
2960 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
2961 | |||
2962 | /* don't kick the migration_thread, if the curr | ||
2963 | * task on busiest cpu can't be moved to this_cpu | ||
2964 | */ | ||
2965 | if (!cpumask_test_cpu(this_cpu, | ||
2966 | &busiest->curr->cpus_allowed)) { | ||
2967 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
2968 | flags); | ||
2969 | all_pinned = 1; | ||
2970 | goto out_one_pinned; | ||
2971 | } | ||
2972 | /* Claim active balancing only if no request is already pending on busiest. */ |||
2973 | if (!busiest->active_balance) { | ||
2974 | busiest->active_balance = 1; | ||
2975 | busiest->push_cpu = this_cpu; | ||
2976 | active_balance = 1; | ||
2977 | } | ||
2978 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
2979 | if (active_balance) | ||
2980 | wake_up_process(busiest->migration_thread); | ||
2981 | |||
2982 | /* | ||
2983 | * We've kicked active balancing, reset the failure | ||
2984 | * counter. | ||
2985 | */ | ||
2986 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
2987 | } | ||
2988 | } else | ||
2989 | sd->nr_balance_failed = 0; | ||
2990 | |||
2991 | if (likely(!active_balance)) { | ||
2992 | /* We were unbalanced, so reset the balancing interval */ | ||
2993 | sd->balance_interval = sd->min_interval; | ||
2994 | } else { | ||
2995 | /* | ||
2996 | * If we've begun active balancing, start to back off. This | ||
2997 | * case may not be covered by the all_pinned logic if there | ||
2998 | * is only 1 task on the busy runqueue (because we don't call | ||
2999 | * move_tasks). | ||
3000 | */ | ||
3001 | if (sd->balance_interval < sd->max_interval) | ||
3002 | sd->balance_interval *= 2; | ||
3003 | } | ||
3004 | /* Report -1 rather than 0 when nothing moved in a non-idle SD_SHARE_CPUPOWER domain without a power-savings parent. */ |||
3005 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3006 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3007 | ld_moved = -1; | ||
3008 | |||
3009 | goto out; | ||
3010 | |||
3011 | out_balanced: | ||
3012 | schedstat_inc(sd, lb_balanced[idle]); | ||
3013 | |||
3014 | sd->nr_balance_failed = 0; | ||
3015 | |||
3016 | out_one_pinned: | ||
3017 | /* tune up the balancing interval */ | ||
3018 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
3019 | (sd->balance_interval < sd->max_interval)) | ||
3020 | sd->balance_interval *= 2; | ||
3021 | |||
3022 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3023 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3024 | ld_moved = -1; | ||
3025 | else | ||
3026 | ld_moved = 0; | ||
3027 | out: | ||
3028 | if (ld_moved) | ||
3029 | update_shares(sd); | ||
3030 | return ld_moved; | ||
3031 | } | ||
3032 | |||
3033 | /* | ||
3034 | * idle_balance is called by schedule() if this_cpu is about to become | ||
3035 | * idle. Attempts to pull tasks from other CPUs. | ||
3036 | */ | ||
3037 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
3038 | { | ||
3039 | struct sched_domain *sd; | ||
3040 | int pulled_task = 0; | ||
3041 | unsigned long next_balance = jiffies + HZ; | ||
3042 | |||
3043 | this_rq->idle_stamp = this_rq->clock; | ||
3044 | |||
3045 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
3046 | return; | ||
3047 | |||
3048 | /* | ||
3049 | * Drop the rq->lock, but keep IRQ/preempt disabled. | ||
3050 | */ | ||
3051 | raw_spin_unlock(&this_rq->lock); | ||
3052 | |||
3053 | for_each_domain(this_cpu, sd) { | ||
3054 | unsigned long interval; | ||
3055 | int balance = 1; | ||
3056 | |||
3057 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
3058 | continue; | ||
3059 | |||
3060 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
3061 | /* If we've pulled tasks over stop searching: */ | ||
3062 | pulled_task = load_balance(this_cpu, this_rq, | ||
3063 | sd, CPU_NEWLY_IDLE, &balance); | ||
3064 | } | ||
3065 | |||
3066 | interval = msecs_to_jiffies(sd->balance_interval); | ||
3067 | if (time_after(next_balance, sd->last_balance + interval)) | ||
3068 | next_balance = sd->last_balance + interval; | ||
3069 | if (pulled_task) { | ||
3070 | this_rq->idle_stamp = 0; | ||
3071 | break; | ||
3072 | } | ||
3073 | } | ||
3074 | |||
3075 | raw_spin_lock(&this_rq->lock); | ||
3076 | |||
3077 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
3078 | /* | ||
3079 | * We are going idle. next_balance may be set based on | ||
3080 | * a busy processor. So reset next_balance. | ||
3081 | */ | ||
3082 | this_rq->next_balance = next_balance; | ||
3083 | } | ||
3084 | } | ||
3085 | |||
3086 | /* | ||
3087 | * active_load_balance is run by migration threads. It pushes running tasks | ||
3088 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
3089 | * running on each physical CPU where possible, and avoids physical / | ||
3090 | * logical imbalances. | ||
3091 | * | ||
3092 | * Called with busiest_rq locked. | ||
3093 | */ | ||
3094 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
3095 | { | ||
3096 | int target_cpu = busiest_rq->push_cpu; | ||
3097 | struct sched_domain *sd; | ||
3098 | struct rq *target_rq; | ||
3099 | |||
3100 | /* Is there any task to move? */ | ||
3101 | if (busiest_rq->nr_running <= 1) | ||
3102 | return; | ||
3103 | |||
3104 | target_rq = cpu_rq(target_cpu); | ||
3105 | |||
3106 | /* | ||
3107 | * This condition is "impossible", if it occurs | ||
3108 | * we need to fix it. Originally reported by | ||
3109 | * Bjorn Helgaas on a 128-cpu setup. | ||
3110 | */ | ||
3111 | BUG_ON(busiest_rq == target_rq); | ||
3112 | |||
3113 | /* move a task from busiest_rq to target_rq */ | ||
3114 | double_lock_balance(busiest_rq, target_rq); | ||
3115 | update_rq_clock(busiest_rq); | ||
3116 | update_rq_clock(target_rq); | ||
3117 | |||
3118 | /* Search for an sd spanning us and the target CPU. */ | ||
3119 | for_each_domain(target_cpu, sd) { | ||
3120 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
3121 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
3122 | break; | ||
3123 | } | ||
3124 | |||
3125 | if (likely(sd)) { | ||
3126 | schedstat_inc(sd, alb_count); | ||
3127 | |||
3128 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
3129 | sd, CPU_IDLE)) | ||
3130 | schedstat_inc(sd, alb_pushed); | ||
3131 | else | ||
3132 | schedstat_inc(sd, alb_failed); | ||
3133 | } | ||
3134 | double_unlock_balance(busiest_rq, target_rq); | ||
3135 | } | ||
3136 | |||
3137 | #ifdef CONFIG_NO_HZ | ||
3138 | static struct { | ||
3139 | atomic_t load_balancer; | ||
3140 | cpumask_var_t cpu_mask; | ||
3141 | cpumask_var_t ilb_grp_nohz_mask; | ||
3142 | } nohz ____cacheline_aligned = { | ||
3143 | .load_balancer = ATOMIC_INIT(-1), | ||
3144 | }; | ||
3145 | |||
3146 | int get_nohz_load_balancer(void) | ||
3147 | { | ||
3148 | return atomic_read(&nohz.load_balancer); | ||
3149 | } | ||
3150 | |||
3151 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3152 | /** | ||
3153 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
3154 | * @cpu: The cpu whose lowest level of sched domain is to | ||
3155 | * be returned. | ||
3156 | * @flag: The flag to check for the lowest sched_domain | ||
3157 | * for the given cpu. | ||
3158 | * | ||
3159 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
3160 | */ | ||
3161 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
3162 | { | ||
3163 | struct sched_domain *sd; | ||
3164 | |||
3165 | for_each_domain(cpu, sd) | ||
3166 | if (sd && (sd->flags & flag)) | ||
3167 | break; | ||
3168 | |||
3169 | return sd; | ||
3170 | } | ||
3171 | |||
3172 | /** | ||
3173 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
3174 | * @cpu: The cpu whose domains we're iterating over. | ||
3175 | * @sd: variable holding the value of the power_savings_sd | ||
3176 | * for cpu. | ||
3177 | * @flag: The flag to filter the sched_domains to be iterated. | ||
3178 | * | ||
3179 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
3180 | * set, starting from the lowest sched_domain to the highest. | ||
3181 | */ | ||
3182 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
3183 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
3184 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
3185 | |||
3186 | /** | ||
3187 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
3188 | * @ilb_group: group to be checked for semi-idleness | ||
3189 | * | ||
3190 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
3191 | * | ||
3192 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
3193 | * and atleast one non-idle CPU. This helper function checks if the given | ||
3194 | * sched_group is semi-idle or not. | ||
3195 | */ | ||
3196 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
3197 | { | ||
3198 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
3199 | sched_group_cpus(ilb_group)); | ||
3200 | |||
3201 | /* | ||
3202 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
3203 | * and atleast one idle cpu. | ||
3204 | */ | ||
3205 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
3206 | return 0; | ||
3207 | |||
3208 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
3209 | return 0; | ||
3210 | |||
3211 | return 1; | ||
3212 | } | ||
3213 | /** | ||
3214 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
3215 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
3216 | * | ||
3217 | * Returns: Returns the id of the idle load balancer if it exists, | ||
3218 | * Else, returns >= nr_cpu_ids. | ||
3219 | * | ||
3220 | * This algorithm picks the idle load balancer such that it belongs to a | ||
3221 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
3222 | * completely idle packages/cores just for the purpose of idle load balancing | ||
3223 | * when there are other idle cpu's which are better suited for that job. | ||
3224 | */ | ||
3225 | static int find_new_ilb(int cpu) | ||
3226 | { | ||
3227 | struct sched_domain *sd; | ||
3228 | struct sched_group *ilb_group; | ||
3229 | |||
3230 | /* | ||
3231 | * Have idle load balancer selection from semi-idle packages only | ||
3232 | * when power-aware load balancing is enabled | ||
3233 | */ | ||
3234 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
3235 | goto out_done; | ||
3236 | |||
3237 | /* | ||
3238 | * Optimize for the case when we have no idle CPUs or only one | ||
3239 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
3240 | */ | ||
3241 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
3242 | goto out_done; | ||
3243 | |||
3244 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
3245 | ilb_group = sd->groups; | ||
3246 | |||
3247 | do { | ||
3248 | if (is_semi_idle_group(ilb_group)) | ||
3249 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
3250 | |||
3251 | ilb_group = ilb_group->next; | ||
3252 | |||
3253 | } while (ilb_group != sd->groups); | ||
3254 | } | ||
3255 | |||
3256 | out_done: | ||
3257 | return cpumask_first(nohz.cpu_mask); | ||
3258 | } | ||
3259 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
3260 | static inline int find_new_ilb(int call_cpu) | ||
3261 | { | ||
3262 | return cpumask_first(nohz.cpu_mask); | ||
3263 | } | ||
3264 | #endif | ||
3265 | |||
3266 | /* | ||
3267 | * This routine will try to nominate the ilb (idle load balancing) | ||
3268 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
3269 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
3270 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3271 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3272 | * arrives... | ||
3273 | * | ||
3274 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3275 | * for idle load balancing. ilb owner will still be part of | ||
3276 | * nohz.cpu_mask.. | ||
3277 | * | ||
3278 | * While stopping the tick, this cpu will become the ilb owner if there | ||
3279 | * is no other owner. And will be the owner till that cpu becomes busy | ||
3280 | * or if all cpus in the system stop their ticks at which point | ||
3281 | * there is no need for ilb owner. | ||
3282 | * | ||
3283 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
3284 | * next busy scheduler_tick() | ||
3285 | */ | ||
3286 | int select_nohz_load_balancer(int stop_tick) | ||
3287 | { | ||
3288 | int cpu = smp_processor_id(); | ||
3289 | |||
3290 | if (stop_tick) { | ||
3291 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3292 | |||
3293 | if (!cpu_active(cpu)) { | ||
3294 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
3295 | return 0; | ||
3296 | |||
3297 | /* | ||
3298 | * If we are going offline and still the leader, | ||
3299 | * give up! | ||
3300 | */ | ||
3301 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3302 | BUG(); | ||
3303 | |||
3304 | return 0; | ||
3305 | } | ||
3306 | |||
3307 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
3308 | |||
3309 | /* time for ilb owner also to sleep */ | ||
3310 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
3311 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3312 | atomic_set(&nohz.load_balancer, -1); | ||
3313 | return 0; | ||
3314 | } | ||
3315 | |||
3316 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3317 | /* make me the ilb owner */ | ||
3318 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3319 | return 1; | ||
3320 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3321 | int new_ilb; | ||
3322 | |||
3323 | if (!(sched_smt_power_savings || | ||
3324 | sched_mc_power_savings)) | ||
3325 | return 1; | ||
3326 | /* | ||
3327 | * Check to see if there is a more power-efficient | ||
3328 | * ilb. | ||
3329 | */ | ||
3330 | new_ilb = find_new_ilb(cpu); | ||
3331 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
3332 | atomic_set(&nohz.load_balancer, -1); | ||
3333 | resched_cpu(new_ilb); | ||
3334 | return 0; | ||
3335 | } | ||
3336 | return 1; | ||
3337 | } | ||
3338 | } else { | ||
3339 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3340 | return 0; | ||
3341 | |||
3342 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3343 | |||
3344 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3345 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3346 | BUG(); | ||
3347 | } | ||
3348 | return 0; | ||
3349 | } | ||
3350 | #endif | ||
3351 | |||
3352 | static DEFINE_SPINLOCK(balancing); | ||
3353 | |||
3354 | /* | ||
3355 | * It checks each scheduling domain to see if it is due to be balanced, | ||
3356 | * and initiates a balancing operation if so. | ||
3357 | * | ||
3358 | * Balancing parameters are set up in arch_init_sched_domains. | ||
3359 | */ | ||
3360 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
3361 | { | ||
3362 | int balance = 1; | ||
3363 | struct rq *rq = cpu_rq(cpu); | ||
3364 | unsigned long interval; | ||
3365 | struct sched_domain *sd; | ||
3366 | /* Earliest time when we have to do rebalance again */ | ||
3367 | unsigned long next_balance = jiffies + 60*HZ; | ||
3368 | int update_next_balance = 0; | ||
3369 | int need_serialize; | ||
3370 | |||
3371 | for_each_domain(cpu, sd) { | ||
3372 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
3373 | continue; | ||
3374 | |||
3375 | interval = sd->balance_interval; | ||
3376 | if (idle != CPU_IDLE) | ||
3377 | interval *= sd->busy_factor; | ||
3378 | |||
3379 | /* scale ms to jiffies */ | ||
3380 | interval = msecs_to_jiffies(interval); | ||
3381 | if (unlikely(!interval)) | ||
3382 | interval = 1; | ||
3383 | if (interval > HZ*NR_CPUS/10) | ||
3384 | interval = HZ*NR_CPUS/10; | ||
3385 | |||
3386 | need_serialize = sd->flags & SD_SERIALIZE; | ||
3387 | |||
3388 | if (need_serialize) { | ||
3389 | if (!spin_trylock(&balancing)) | ||
3390 | goto out; | ||
3391 | } | ||
3392 | |||
3393 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
3394 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
3395 | /* | ||
3396 | * We've pulled tasks over so either we're no | ||
3397 | * longer idle, or one of our SMT siblings is | ||
3398 | * not idle. | ||
3399 | */ | ||
3400 | idle = CPU_NOT_IDLE; | ||
3401 | } | ||
3402 | sd->last_balance = jiffies; | ||
3403 | } | ||
3404 | if (need_serialize) | ||
3405 | spin_unlock(&balancing); | ||
3406 | out: | ||
3407 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
3408 | next_balance = sd->last_balance + interval; | ||
3409 | update_next_balance = 1; | ||
3410 | } | ||
3411 | |||
3412 | /* | ||
3413 | * Stop the load balance at this level. There is another | ||
3414 | * CPU in our sched group which is doing load balancing more | ||
3415 | * actively. | ||
3416 | */ | ||
3417 | if (!balance) | ||
3418 | break; | ||
3419 | } | ||
3420 | |||
3421 | /* | ||
3422 | * next_balance will be updated only when there is a need. | ||
3423 | * When the cpu is attached to null domain for ex, it will not be | ||
3424 | * updated. | ||
3425 | */ | ||
3426 | if (likely(update_next_balance)) | ||
3427 | rq->next_balance = next_balance; | ||
3428 | } | ||
3429 | |||
3430 | /* | ||
3431 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3432 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3433 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
3434 | */ | ||
3435 | static void run_rebalance_domains(struct softirq_action *h) | ||
3436 | { | ||
3437 | int this_cpu = smp_processor_id(); | ||
3438 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3439 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
3440 | CPU_IDLE : CPU_NOT_IDLE; | ||
3441 | |||
3442 | rebalance_domains(this_cpu, idle); | ||
3443 | |||
3444 | #ifdef CONFIG_NO_HZ | ||
3445 | /* | ||
3446 | * If this cpu is the owner for idle load balancing, then do the | ||
3447 | * balancing on behalf of the other idle cpus whose ticks are | ||
3448 | * stopped. | ||
3449 | */ | ||
3450 | if (this_rq->idle_at_tick && | ||
3451 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3452 | struct rq *rq; | ||
3453 | int balance_cpu; | ||
3454 | |||
3455 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3456 | if (balance_cpu == this_cpu) | ||
3457 | continue; | ||
3458 | |||
3459 | /* | ||
3460 | * If this cpu gets work to do, stop the load balancing | ||
3461 | * work being done for other cpus. Next load | ||
3462 | * balancing owner will pick it up. | ||
3463 | */ | ||
3464 | if (need_resched()) | ||
3465 | break; | ||
3466 | |||
3467 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3468 | |||
3469 | rq = cpu_rq(balance_cpu); | ||
3470 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3471 | this_rq->next_balance = rq->next_balance; | ||
3472 | } | ||
3473 | } | ||
3474 | #endif | ||
3475 | } | ||
3476 | |||
3477 | static inline int on_null_domain(int cpu) | ||
3478 | { | ||
3479 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
3480 | } | ||
3481 | |||
3482 | /* | ||
3483 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
3484 | * | ||
3485 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3486 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3487 | * if the whole system is idle. | ||
3488 | */ | ||
3489 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
3490 | { | ||
3491 | #ifdef CONFIG_NO_HZ | ||
3492 | /* | ||
3493 | * If we were in the nohz mode recently and busy at the current | ||
3494 | * scheduler tick, then check if we need to nominate new idle | ||
3495 | * load balancer. | ||
3496 | */ | ||
3497 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3498 | rq->in_nohz_recently = 0; | ||
3499 | |||
3500 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3501 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3502 | atomic_set(&nohz.load_balancer, -1); | ||
3503 | } | ||
3504 | |||
3505 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3506 | int ilb = find_new_ilb(cpu); | ||
3507 | |||
3508 | if (ilb < nr_cpu_ids) | ||
3509 | resched_cpu(ilb); | ||
3510 | } | ||
3511 | } | ||
3512 | |||
3513 | /* | ||
3514 | * If this cpu is idle and doing idle load balancing for all the | ||
3515 | * cpus with ticks stopped, is it time for that to stop? | ||
3516 | */ | ||
3517 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3518 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3519 | resched_cpu(cpu); | ||
3520 | return; | ||
3521 | } | ||
3522 | |||
3523 | /* | ||
3524 | * If this cpu is idle and the idle load balancing is done by | ||
3525 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
3526 | */ | ||
3527 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3528 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3529 | return; | ||
3530 | #endif | ||
3531 | /* Don't need to rebalance while attached to NULL domain */ | ||
3532 | if (time_after_eq(jiffies, rq->next_balance) && | ||
3533 | likely(!on_null_domain(cpu))) | ||
3534 | raise_softirq(SCHED_SOFTIRQ); | ||
3535 | } | ||
1954 | 3536 | ||
1955 | static void rq_online_fair(struct rq *rq) | 3537 | static void rq_online_fair(struct rq *rq) |
1956 | { | 3538 | { |
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq) | |||
1962 | update_sysctl(); | 3544 | update_sysctl(); |
1963 | } | 3545 | } |
1964 | 3546 | ||
3547 | #else /* CONFIG_SMP */ | ||
3548 | |||
3549 | /* | ||
3550 | * on UP we do not need to balance between CPUs: | ||
3551 | */ | ||
3552 | static inline void idle_balance(int cpu, struct rq *rq) | ||
3553 | { | ||
3554 | } | ||
3555 | |||
1965 | #endif /* CONFIG_SMP */ | 3556 | #endif /* CONFIG_SMP */ |
1966 | 3557 | ||
1967 | /* | 3558 | /* |
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq) | |||
2076 | } | 3667 | } |
2077 | #endif | 3668 | #endif |
2078 | 3669 | ||
2079 | unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 3670 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
2080 | { | 3671 | { |
2081 | struct sched_entity *se = &task->se; | 3672 | struct sched_entity *se = &task->se; |
2082 | unsigned int rr_interval = 0; | 3673 | unsigned int rr_interval = 0; |
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = { | |||
2108 | #ifdef CONFIG_SMP | 3699 | #ifdef CONFIG_SMP |
2109 | .select_task_rq = select_task_rq_fair, | 3700 | .select_task_rq = select_task_rq_fair, |
2110 | 3701 | ||
2111 | .load_balance = load_balance_fair, | ||
2112 | .move_one_task = move_one_task_fair, | ||
2113 | .rq_online = rq_online_fair, | 3702 | .rq_online = rq_online_fair, |
2114 | .rq_offline = rq_offline_fair, | 3703 | .rq_offline = rq_offline_fair, |
2115 | 3704 | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5f93b570d383..a8a6d8a50947 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | |||
44 | { | 44 | { |
45 | } | 45 | } |
46 | 46 | ||
47 | #ifdef CONFIG_SMP | ||
48 | static unsigned long | ||
49 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
50 | unsigned long max_load_move, | ||
51 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
52 | int *all_pinned, int *this_best_prio) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int | ||
58 | move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
59 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
60 | { | ||
61 | return 0; | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | 47 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
66 | { | 48 | { |
67 | } | 49 | } |
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, | |||
97 | check_preempt_curr(rq, p, 0); | 79 | check_preempt_curr(rq, p, 0); |
98 | } | 80 | } |
99 | 81 | ||
100 | unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
101 | { | 83 | { |
102 | return 0; | 84 | return 0; |
103 | } | 85 | } |
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { | |||
119 | 101 | ||
120 | #ifdef CONFIG_SMP | 102 | #ifdef CONFIG_SMP |
121 | .select_task_rq = select_task_rq_idle, | 103 | .select_task_rq = select_task_rq_idle, |
122 | |||
123 | .load_balance = load_balance_idle, | ||
124 | .move_one_task = move_one_task_idle, | ||
125 | #endif | 104 | #endif |
126 | 105 | ||
127 | .set_curr_task = set_curr_task_idle, | 106 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f48328ac216f..5a6ed1f0990a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
194 | return rt_se->my_q; | 194 | return rt_se->my_q; |
195 | } | 195 | } |
196 | 196 | ||
197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | 197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); |
198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | 198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); |
199 | 199 | ||
200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
201 | { | 201 | { |
202 | int this_cpu = smp_processor_id(); | ||
202 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 203 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
203 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 204 | struct sched_rt_entity *rt_se; |
205 | |||
206 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
204 | 207 | ||
205 | if (rt_rq->rt_nr_running) { | 208 | if (rt_rq->rt_nr_running) { |
206 | if (rt_se && !on_rt_rq(rt_se)) | 209 | if (rt_se && !on_rt_rq(rt_se)) |
207 | enqueue_rt_entity(rt_se); | 210 | enqueue_rt_entity(rt_se, false); |
208 | if (rt_rq->highest_prio.curr < curr->prio) | 211 | if (rt_rq->highest_prio.curr < curr->prio) |
209 | resched_task(curr); | 212 | resched_task(curr); |
210 | } | 213 | } |
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
212 | 215 | ||
213 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 216 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
214 | { | 217 | { |
215 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 218 | int this_cpu = smp_processor_id(); |
219 | struct sched_rt_entity *rt_se; | ||
220 | |||
221 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
216 | 222 | ||
217 | if (rt_se && on_rt_rq(rt_se)) | 223 | if (rt_se && on_rt_rq(rt_se)) |
218 | dequeue_rt_entity(rt_se); | 224 | dequeue_rt_entity(rt_se); |
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
803 | dec_rt_group(rt_se, rt_rq); | 809 | dec_rt_group(rt_se, rt_rq); |
804 | } | 810 | } |
805 | 811 | ||
806 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | 812 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
807 | { | 813 | { |
808 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 814 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
809 | struct rt_prio_array *array = &rt_rq->active; | 815 | struct rt_prio_array *array = &rt_rq->active; |
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
819 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
820 | return; | 826 | return; |
821 | 827 | ||
822 | list_add_tail(&rt_se->run_list, queue); | 828 | if (head) |
829 | list_add(&rt_se->run_list, queue); | ||
830 | else | ||
831 | list_add_tail(&rt_se->run_list, queue); | ||
823 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 832 | __set_bit(rt_se_prio(rt_se), array->bitmap); |
824 | 833 | ||
825 | inc_rt_tasks(rt_se, rt_rq); | 834 | inc_rt_tasks(rt_se, rt_rq); |
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
856 | } | 865 | } |
857 | } | 866 | } |
858 | 867 | ||
859 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | 868 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
860 | { | 869 | { |
861 | dequeue_rt_stack(rt_se); | 870 | dequeue_rt_stack(rt_se); |
862 | for_each_sched_rt_entity(rt_se) | 871 | for_each_sched_rt_entity(rt_se) |
863 | __enqueue_rt_entity(rt_se); | 872 | __enqueue_rt_entity(rt_se, head); |
864 | } | 873 | } |
865 | 874 | ||
866 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 875 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) |
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
871 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 880 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
872 | 881 | ||
873 | if (rt_rq && rt_rq->rt_nr_running) | 882 | if (rt_rq && rt_rq->rt_nr_running) |
874 | __enqueue_rt_entity(rt_se); | 883 | __enqueue_rt_entity(rt_se, false); |
875 | } | 884 | } |
876 | } | 885 | } |
877 | 886 | ||
878 | /* | 887 | /* |
879 | * Adding/removing a task to/from a priority array: | 888 | * Adding/removing a task to/from a priority array: |
880 | */ | 889 | */ |
881 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 890 | static void |
891 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
882 | { | 892 | { |
883 | struct sched_rt_entity *rt_se = &p->rt; | 893 | struct sched_rt_entity *rt_se = &p->rt; |
884 | 894 | ||
885 | if (wakeup) | 895 | if (wakeup) |
886 | rt_se->timeout = 0; | 896 | rt_se->timeout = 0; |
887 | 897 | ||
888 | enqueue_rt_entity(rt_se); | 898 | enqueue_rt_entity(rt_se, head); |
889 | 899 | ||
890 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 900 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
891 | enqueue_pushable_task(rq, p); | 901 | enqueue_pushable_task(rq, p); |
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1481 | push_rt_tasks(rq); | 1491 | push_rt_tasks(rq); |
1482 | } | 1492 | } |
1483 | 1493 | ||
1484 | static unsigned long | ||
1485 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1486 | unsigned long max_load_move, | ||
1487 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1488 | int *all_pinned, int *this_best_prio) | ||
1489 | { | ||
1490 | /* don't touch RT tasks */ | ||
1491 | return 0; | ||
1492 | } | ||
1493 | |||
1494 | static int | ||
1495 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1496 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
1497 | { | ||
1498 | /* don't touch RT tasks */ | ||
1499 | return 0; | ||
1500 | } | ||
1501 | |||
1502 | static void set_cpus_allowed_rt(struct task_struct *p, | 1494 | static void set_cpus_allowed_rt(struct task_struct *p, |
1503 | const struct cpumask *new_mask) | 1495 | const struct cpumask *new_mask) |
1504 | { | 1496 | { |
@@ -1670,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1670 | if (!p->signal) | 1662 | if (!p->signal) |
1671 | return; | 1663 | return; |
1672 | 1664 | ||
1673 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | 1665 | /* max may change after cur was read, this will be fixed next tick */ |
1674 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | 1666 | soft = task_rlimit(p, RLIMIT_RTTIME); |
1667 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | ||
1675 | 1668 | ||
1676 | if (soft != RLIM_INFINITY) { | 1669 | if (soft != RLIM_INFINITY) { |
1677 | unsigned long next; | 1670 | unsigned long next; |
@@ -1721,7 +1714,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
1721 | dequeue_pushable_task(rq, p); | 1714 | dequeue_pushable_task(rq, p); |
1722 | } | 1715 | } |
1723 | 1716 | ||
1724 | unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | 1717 | static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) |
1725 | { | 1718 | { |
1726 | /* | 1719 | /* |
1727 | * Time slice is 0 for SCHED_FIFO tasks | 1720 | * Time slice is 0 for SCHED_FIFO tasks |
@@ -1746,8 +1739,6 @@ static const struct sched_class rt_sched_class = { | |||
1746 | #ifdef CONFIG_SMP | 1739 | #ifdef CONFIG_SMP |
1747 | .select_task_rq = select_task_rq_rt, | 1740 | .select_task_rq = select_task_rq_rt, |
1748 | 1741 | ||
1749 | .load_balance = load_balance_rt, | ||
1750 | .move_one_task = move_one_task_rt, | ||
1751 | .set_cpus_allowed = set_cpus_allowed_rt, | 1742 | .set_cpus_allowed = set_cpus_allowed_rt, |
1752 | .rq_online = rq_online_rt, | 1743 | .rq_online = rq_online_rt, |
1753 | .rq_offline = rq_offline_rt, | 1744 | .rq_offline = rq_offline_rt, |
diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b9..dbd7fe073c55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -159,6 +159,10 @@ void recalc_sigpending(void) | |||
159 | 159 | ||
160 | /* Given the mask, find the first available signal that should be serviced. */ | 160 | /* Given the mask, find the first available signal that should be serviced. */ |
161 | 161 | ||
162 | #define SYNCHRONOUS_MASK \ | ||
163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | ||
164 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | ||
165 | |||
162 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
163 | { | 167 | { |
164 | unsigned long i, *s, *m, x; | 168 | unsigned long i, *s, *m, x; |
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) | |||
166 | 170 | ||
167 | s = pending->signal.sig; | 171 | s = pending->signal.sig; |
168 | m = mask->sig; | 172 | m = mask->sig; |
173 | |||
174 | /* | ||
175 | * Handle the first word specially: it contains the | ||
176 | * synchronous signals that need to be dequeued first. | ||
177 | */ | ||
178 | x = *s &~ *m; | ||
179 | if (x) { | ||
180 | if (x & SYNCHRONOUS_MASK) | ||
181 | x &= SYNCHRONOUS_MASK; | ||
182 | sig = ffz(~x) + 1; | ||
183 | return sig; | ||
184 | } | ||
185 | |||
169 | switch (_NSIG_WORDS) { | 186 | switch (_NSIG_WORDS) { |
170 | default: | 187 | default: |
171 | for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) | 188 | for (i = 1; i < _NSIG_WORDS; ++i) { |
172 | if ((x = *s &~ *m) != 0) { | 189 | x = *++s &~ *++m; |
173 | sig = ffz(~x) + i*_NSIG_BPW + 1; | 190 | if (!x) |
174 | break; | 191 | continue; |
175 | } | 192 | sig = ffz(~x) + i*_NSIG_BPW + 1; |
193 | break; | ||
194 | } | ||
176 | break; | 195 | break; |
177 | 196 | ||
178 | case 2: if ((x = s[0] &~ m[0]) != 0) | 197 | case 2: |
179 | sig = 1; | 198 | x = s[1] &~ m[1]; |
180 | else if ((x = s[1] &~ m[1]) != 0) | 199 | if (!x) |
181 | sig = _NSIG_BPW + 1; | ||
182 | else | ||
183 | break; | 200 | break; |
184 | sig += ffz(~x); | 201 | sig = ffz(~x) + _NSIG_BPW + 1; |
185 | break; | 202 | break; |
186 | 203 | ||
187 | case 1: if ((x = *s &~ *m) != 0) | 204 | case 1: |
188 | sig = ffz(~x) + 1; | 205 | /* Nothing to do */ |
189 | break; | 206 | break; |
190 | } | 207 | } |
191 | 208 | ||
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi | |||
228 | 245 | ||
229 | if (override_rlimit || | 246 | if (override_rlimit || |
230 | atomic_read(&user->sigpending) <= | 247 | atomic_read(&user->sigpending) <= |
231 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { | 248 | task_rlimit(t, RLIMIT_SIGPENDING)) { |
232 | q = kmem_cache_alloc(sigqueue_cachep, flags); | 249 | q = kmem_cache_alloc(sigqueue_cachep, flags); |
233 | } else { | 250 | } else { |
234 | print_dropped_signal(sig); | 251 | print_dropped_signal(sig); |
diff --git a/kernel/smp.c b/kernel/smp.c index f10408422444..9867b6bfefce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -12,8 +12,6 @@ | |||
12 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
13 | #include <linux/cpu.h> | 13 | #include <linux/cpu.h> |
14 | 14 | ||
15 | static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); | ||
16 | |||
17 | static struct { | 15 | static struct { |
18 | struct list_head queue; | 16 | struct list_head queue; |
19 | raw_spinlock_t lock; | 17 | raw_spinlock_t lock; |
@@ -33,12 +31,14 @@ struct call_function_data { | |||
33 | cpumask_var_t cpumask; | 31 | cpumask_var_t cpumask; |
34 | }; | 32 | }; |
35 | 33 | ||
34 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | ||
35 | |||
36 | struct call_single_queue { | 36 | struct call_single_queue { |
37 | struct list_head list; | 37 | struct list_head list; |
38 | raw_spinlock_t lock; | 38 | raw_spinlock_t lock; |
39 | }; | 39 | }; |
40 | 40 | ||
41 | static DEFINE_PER_CPU(struct call_function_data, cfd_data); | 41 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); |
42 | 42 | ||
43 | static int | 43 | static int |
44 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | 44 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) |
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void) | |||
256 | } | 256 | } |
257 | } | 257 | } |
258 | 258 | ||
259 | static DEFINE_PER_CPU(struct call_single_data, csd_data); | 259 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); |
260 | 260 | ||
261 | /* | 261 | /* |
262 | * smp_call_function_single - Run a function on a specific CPU | 262 | * smp_call_function_single - Run a function on a specific CPU |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 818d7d9aa03c..bde4295774c8 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,6 +34,30 @@ | |||
34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
38 | { | ||
39 | sp->completed = 0; | ||
40 | mutex_init(&sp->mutex); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | ||
43 | } | ||
44 | |||
45 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
46 | |||
47 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
48 | struct lock_class_key *key) | ||
49 | { | ||
50 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
51 | /* Don't re-initialize a lock while it is held. */ | ||
52 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
53 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
54 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
55 | return init_srcu_struct_fields(sp); | ||
56 | } | ||
57 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
58 | |||
59 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
60 | |||
37 | /** | 61 | /** |
38 | * init_srcu_struct - initialize a sleep-RCU structure | 62 | * init_srcu_struct - initialize a sleep-RCU structure |
39 | * @sp: structure to initialize. | 63 | * @sp: structure to initialize. |
@@ -44,13 +68,12 @@ | |||
44 | */ | 68 | */ |
45 | int init_srcu_struct(struct srcu_struct *sp) | 69 | int init_srcu_struct(struct srcu_struct *sp) |
46 | { | 70 | { |
47 | sp->completed = 0; | 71 | return init_srcu_struct_fields(sp); |
48 | mutex_init(&sp->mutex); | ||
49 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
50 | return (sp->per_cpu_ref ? 0 : -ENOMEM); | ||
51 | } | 72 | } |
52 | EXPORT_SYMBOL_GPL(init_srcu_struct); | 73 | EXPORT_SYMBOL_GPL(init_srcu_struct); |
53 | 74 | ||
75 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
76 | |||
54 | /* | 77 | /* |
55 | * srcu_readers_active_idx -- returns approximate number of readers | 78 | * srcu_readers_active_idx -- returns approximate number of readers |
56 | * active on the specified rank of per-CPU counters. | 79 | * active on the specified rank of per-CPU counters. |
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
100 | } | 123 | } |
101 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | 124 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
102 | 125 | ||
103 | /** | 126 | /* |
104 | * srcu_read_lock - register a new reader for an SRCU-protected structure. | ||
105 | * @sp: srcu_struct in which to register the new reader. | ||
106 | * | ||
107 | * Counts the new reader in the appropriate per-CPU element of the | 127 | * Counts the new reader in the appropriate per-CPU element of the |
108 | * srcu_struct. Must be called from process context. | 128 | * srcu_struct. Must be called from process context. |
109 | * Returns an index that must be passed to the matching srcu_read_unlock(). | 129 | * Returns an index that must be passed to the matching srcu_read_unlock(). |
110 | */ | 130 | */ |
111 | int srcu_read_lock(struct srcu_struct *sp) | 131 | int __srcu_read_lock(struct srcu_struct *sp) |
112 | { | 132 | { |
113 | int idx; | 133 | int idx; |
114 | 134 | ||
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp) | |||
120 | preempt_enable(); | 140 | preempt_enable(); |
121 | return idx; | 141 | return idx; |
122 | } | 142 | } |
123 | EXPORT_SYMBOL_GPL(srcu_read_lock); | 143 | EXPORT_SYMBOL_GPL(__srcu_read_lock); |
124 | 144 | ||
125 | /** | 145 | /* |
126 | * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. | ||
127 | * @sp: srcu_struct in which to unregister the old reader. | ||
128 | * @idx: return value from corresponding srcu_read_lock(). | ||
129 | * | ||
130 | * Removes the count for the old reader from the appropriate per-CPU | 146 | * Removes the count for the old reader from the appropriate per-CPU |
131 | * element of the srcu_struct. Note that this may well be a different | 147 | * element of the srcu_struct. Note that this may well be a different |
132 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | 148 | * CPU than that which was incremented by the corresponding srcu_read_lock(). |
133 | * Must be called from process context. | 149 | * Must be called from process context. |
134 | */ | 150 | */ |
135 | void srcu_read_unlock(struct srcu_struct *sp, int idx) | 151 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
136 | { | 152 | { |
137 | preempt_disable(); | 153 | preempt_disable(); |
138 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 154 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ |
139 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 155 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; |
140 | preempt_enable(); | 156 | preempt_enable(); |
141 | } | 157 | } |
142 | EXPORT_SYMBOL_GPL(srcu_read_unlock); | 158 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
143 | 159 | ||
144 | /* | 160 | /* |
145 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 161 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
146 | */ | 162 | */ |
147 | void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 163 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
148 | { | 164 | { |
149 | int idx; | 165 | int idx; |
150 | 166 | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..9bb9fb1bd79c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -45,7 +45,7 @@ static int refcount; | |||
45 | static struct workqueue_struct *stop_machine_wq; | 45 | static struct workqueue_struct *stop_machine_wq; |
46 | static struct stop_machine_data active, idle; | 46 | static struct stop_machine_data active, idle; |
47 | static const struct cpumask *active_cpus; | 47 | static const struct cpumask *active_cpus; |
48 | static void *stop_machine_work; | 48 | static void __percpu *stop_machine_work; |
49 | 49 | ||
50 | static void set_state(enum stopmachine_state newstate) | 50 | static void set_state(enum stopmachine_state newstate) |
51 | { | 51 | { |
diff --git a/kernel/sys.c b/kernel/sys.c index 18bde979f346..9814e43fb23b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -571,13 +571,7 @@ static int set_user(struct cred *new) | |||
571 | if (!new_user) | 571 | if (!new_user) |
572 | return -EAGAIN; | 572 | return -EAGAIN; |
573 | 573 | ||
574 | if (!task_can_switch_user(new_user, current)) { | 574 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && |
575 | free_uid(new_user); | ||
576 | return -EINVAL; | ||
577 | } | ||
578 | |||
579 | if (atomic_read(&new_user->processes) >= | ||
580 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | ||
581 | new_user != INIT_USER) { | 575 | new_user != INIT_USER) { |
582 | free_uid(new_user); | 576 | free_uid(new_user); |
583 | return -EAGAIN; | 577 | return -EAGAIN; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..0ef19c614f6d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/slow-work.h> | 51 | #include <linux/slow-work.h> |
52 | #include <linux/perf_event.h> | 52 | #include <linux/perf_event.h> |
53 | #include <linux/kprobes.h> | ||
53 | 54 | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | #include <asm/processor.h> | 56 | #include <asm/processor.h> |
@@ -1441,7 +1442,7 @@ static struct ctl_table fs_table[] = { | |||
1441 | }; | 1442 | }; |
1442 | 1443 | ||
1443 | static struct ctl_table debug_table[] = { | 1444 | static struct ctl_table debug_table[] = { |
1444 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | 1445 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) |
1445 | { | 1446 | { |
1446 | .procname = "exception-trace", | 1447 | .procname = "exception-trace", |
1447 | .data = &show_unhandled_signals, | 1448 | .data = &show_unhandled_signals, |
@@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = { | |||
1450 | .proc_handler = proc_dointvec | 1451 | .proc_handler = proc_dointvec |
1451 | }, | 1452 | }, |
1452 | #endif | 1453 | #endif |
1454 | #if defined(CONFIG_OPTPROBES) | ||
1455 | { | ||
1456 | .procname = "kprobes-optimization", | ||
1457 | .data = &sysctl_kprobes_optimization, | ||
1458 | .maxlen = sizeof(int), | ||
1459 | .mode = 0644, | ||
1460 | .proc_handler = proc_kprobes_optimization_handler, | ||
1461 | .extra1 = &zero, | ||
1462 | .extra2 = &one, | ||
1463 | }, | ||
1464 | #endif | ||
1453 | { } | 1465 | { } |
1454 | }; | 1466 | }; |
1455 | 1467 | ||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..8cd50d8f9bde 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1331 | ssize_t result; | 1331 | ssize_t result; |
1332 | char *pathname; | 1332 | char *pathname; |
1333 | int flags; | 1333 | int flags; |
1334 | int acc_mode, fmode; | 1334 | int acc_mode; |
1335 | 1335 | ||
1336 | pathname = sysctl_getname(name, nlen, &table); | 1336 | pathname = sysctl_getname(name, nlen, &table); |
1337 | result = PTR_ERR(pathname); | 1337 | result = PTR_ERR(pathname); |
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1342 | if (oldval && oldlen && newval && newlen) { | 1342 | if (oldval && oldlen && newval && newlen) { |
1343 | flags = O_RDWR; | 1343 | flags = O_RDWR; |
1344 | acc_mode = MAY_READ | MAY_WRITE; | 1344 | acc_mode = MAY_READ | MAY_WRITE; |
1345 | fmode = FMODE_READ | FMODE_WRITE; | ||
1346 | } else if (newval && newlen) { | 1345 | } else if (newval && newlen) { |
1347 | flags = O_WRONLY; | 1346 | flags = O_WRONLY; |
1348 | acc_mode = MAY_WRITE; | 1347 | acc_mode = MAY_WRITE; |
1349 | fmode = FMODE_WRITE; | ||
1350 | } else if (oldval && oldlen) { | 1348 | } else if (oldval && oldlen) { |
1351 | flags = O_RDONLY; | 1349 | flags = O_RDONLY; |
1352 | acc_mode = MAY_READ; | 1350 | acc_mode = MAY_READ; |
1353 | fmode = FMODE_READ; | ||
1354 | } else { | 1351 | } else { |
1355 | result = 0; | 1352 | result = 0; |
1356 | goto out_putname; | 1353 | goto out_putname; |
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1361 | if (result) | 1358 | if (result) |
1362 | goto out_putname; | 1359 | goto out_putname; |
1363 | 1360 | ||
1364 | result = may_open(&nd.path, acc_mode, fmode); | 1361 | result = may_open(&nd.path, acc_mode, flags); |
1365 | if (result) | 1362 | if (result) |
1366 | goto out_putpath; | 1363 | goto out_putpath; |
1367 | 1364 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa7..899ca51be5e8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -46,15 +46,13 @@ static struct genl_family family = { | |||
46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | 46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, |
47 | }; | 47 | }; |
48 | 48 | ||
49 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | 49 | static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { |
50 | __read_mostly = { | ||
51 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | 50 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, |
52 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | 51 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, |
53 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | 52 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
54 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | 53 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; |
55 | 54 | ||
56 | static struct nla_policy | 55 | static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { |
57 | cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { | ||
58 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, | 56 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, |
59 | }; | 57 | }; |
60 | 58 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 13700833c181..1f663d23e85e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -453,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; } | |||
453 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 453 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
454 | 454 | ||
455 | /** | 455 | /** |
456 | * clocksource_suspend - suspend the clocksource(s) | ||
457 | */ | ||
458 | void clocksource_suspend(void) | ||
459 | { | ||
460 | struct clocksource *cs; | ||
461 | |||
462 | list_for_each_entry_reverse(cs, &clocksource_list, list) | ||
463 | if (cs->suspend) | ||
464 | cs->suspend(cs); | ||
465 | } | ||
466 | |||
467 | /** | ||
456 | * clocksource_resume - resume the clocksource(s) | 468 | * clocksource_resume - resume the clocksource(s) |
457 | */ | 469 | */ |
458 | void clocksource_resume(void) | 470 | void clocksource_resume(void) |
@@ -461,7 +473,7 @@ void clocksource_resume(void) | |||
461 | 473 | ||
462 | list_for_each_entry(cs, &clocksource_list, list) | 474 | list_for_each_entry(cs, &clocksource_list, list) |
463 | if (cs->resume) | 475 | if (cs->resume) |
464 | cs->resume(); | 476 | cs->resume(cs); |
465 | 477 | ||
466 | clocksource_resume_watchdog(); | 478 | clocksource_resume_watchdog(); |
467 | } | 479 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910e..7c0f180d6e9d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -58,10 +58,10 @@ static s64 time_offset; | |||
58 | static long time_constant = 2; | 58 | static long time_constant = 2; |
59 | 59 | ||
60 | /* maximum error (usecs): */ | 60 | /* maximum error (usecs): */ |
61 | long time_maxerror = NTP_PHASE_LIMIT; | 61 | static long time_maxerror = NTP_PHASE_LIMIT; |
62 | 62 | ||
63 | /* estimated error (usecs): */ | 63 | /* estimated error (usecs): */ |
64 | long time_esterror = NTP_PHASE_LIMIT; | 64 | static long time_esterror = NTP_PHASE_LIMIT; |
65 | 65 | ||
66 | /* frequency offset (scaled nsecs/secs): */ | 66 | /* frequency offset (scaled nsecs/secs): */ |
67 | static s64 time_freq; | 67 | static s64 time_freq; |
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset) | |||
142 | * Select how the frequency is to be controlled | 142 | * Select how the frequency is to be controlled |
143 | * and in which mode (PLL or FLL). | 143 | * and in which mode (PLL or FLL). |
144 | */ | 144 | */ |
145 | secs = xtime.tv_sec - time_reftime; | 145 | secs = get_seconds() - time_reftime; |
146 | if (unlikely(time_status & STA_FREQHOLD)) | 146 | if (unlikely(time_status & STA_FREQHOLD)) |
147 | secs = 0; | 147 | secs = 0; |
148 | 148 | ||
149 | time_reftime = xtime.tv_sec; | 149 | time_reftime = get_seconds(); |
150 | 150 | ||
151 | offset64 = offset; | 151 | offset64 = offset; |
152 | freq_adj = (offset64 * secs) << | 152 | freq_adj = (offset64 * secs) << |
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
368 | * reference time to current time. | 368 | * reference time to current time. |
369 | */ | 369 | */ |
370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) | 370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) |
371 | time_reftime = xtime.tv_sec; | 371 | time_reftime = get_seconds(); |
372 | 372 | ||
373 | /* only set allowed bits */ | 373 | /* only set allowed bits */ |
374 | time_status &= STA_RONLY; | 374 | time_status &= STA_RONLY; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e2ab064c6d41..16736379a9ca 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
622 | write_sequnlock_irqrestore(&xtime_lock, flags); | 622 | write_sequnlock_irqrestore(&xtime_lock, flags); |
623 | 623 | ||
624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
625 | clocksource_suspend(); | ||
625 | 626 | ||
626 | return 0; | 627 | return 0; |
627 | } | 628 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 60e2ce0181ee..13e13d428cd3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -328,15 +328,6 @@ config BRANCH_TRACER | |||
328 | 328 | ||
329 | Say N if unsure. | 329 | Say N if unsure. |
330 | 330 | ||
331 | config POWER_TRACER | ||
332 | bool "Trace power consumption behavior" | ||
333 | depends on X86 | ||
334 | select GENERIC_TRACER | ||
335 | help | ||
336 | This tracer helps developers to analyze and optimize the kernel's | ||
337 | power management decisions, specifically the C-state and P-state | ||
338 | behavior. | ||
339 | |||
340 | config KSYM_TRACER | 331 | config KSYM_TRACER |
341 | bool "Trace read and write access on kernel memory locations" | 332 | bool "Trace read and write access on kernel memory locations" |
342 | depends on HAVE_HW_BREAKPOINT | 333 | depends on HAVE_HW_BREAKPOINT |
@@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE | |||
449 | 440 | ||
450 | config KPROBE_EVENT | 441 | config KPROBE_EVENT |
451 | depends on KPROBES | 442 | depends on KPROBES |
452 | depends on X86 | 443 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
453 | bool "Enable kprobes-based dynamic events" | 444 | bool "Enable kprobes-based dynamic events" |
454 | select TRACING | 445 | select TRACING |
455 | default y | 446 | default y |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d9d6206e0b14..07f945a99430 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
540 | if (ret) | 540 | if (ret) |
541 | return ret; | 541 | return ret; |
542 | 542 | ||
543 | if (copy_to_user(arg, &buts, sizeof(buts))) | 543 | if (copy_to_user(arg, &buts, sizeof(buts))) { |
544 | blk_trace_remove(q); | ||
544 | return -EFAULT; | 545 | return -EFAULT; |
545 | 546 | } | |
546 | return 0; | 547 | return 0; |
547 | } | 548 | } |
548 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 549 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1904797f4a8a..83783579378f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -2402,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
2402 | static DEFINE_MUTEX(graph_lock); | 2402 | static DEFINE_MUTEX(graph_lock); |
2403 | 2403 | ||
2404 | int ftrace_graph_count; | 2404 | int ftrace_graph_count; |
2405 | int ftrace_graph_filter_enabled; | ||
2405 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; | 2406 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; |
2406 | 2407 | ||
2407 | static void * | 2408 | static void * |
@@ -2424,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) | |||
2424 | mutex_lock(&graph_lock); | 2425 | mutex_lock(&graph_lock); |
2425 | 2426 | ||
2426 | /* Nothing, tell g_show to print all functions are enabled */ | 2427 | /* Nothing, tell g_show to print all functions are enabled */ |
2427 | if (!ftrace_graph_count && !*pos) | 2428 | if (!ftrace_graph_filter_enabled && !*pos) |
2428 | return (void *)1; | 2429 | return (void *)1; |
2429 | 2430 | ||
2430 | return __g_next(m, pos); | 2431 | return __g_next(m, pos); |
@@ -2470,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) | |||
2470 | mutex_lock(&graph_lock); | 2471 | mutex_lock(&graph_lock); |
2471 | if ((file->f_mode & FMODE_WRITE) && | 2472 | if ((file->f_mode & FMODE_WRITE) && |
2472 | (file->f_flags & O_TRUNC)) { | 2473 | (file->f_flags & O_TRUNC)) { |
2474 | ftrace_graph_filter_enabled = 0; | ||
2473 | ftrace_graph_count = 0; | 2475 | ftrace_graph_count = 0; |
2474 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); | 2476 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); |
2475 | } | 2477 | } |
@@ -2495,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2495 | struct dyn_ftrace *rec; | 2497 | struct dyn_ftrace *rec; |
2496 | struct ftrace_page *pg; | 2498 | struct ftrace_page *pg; |
2497 | int search_len; | 2499 | int search_len; |
2498 | int found = 0; | 2500 | int fail = 1; |
2499 | int type, not; | 2501 | int type, not; |
2500 | char *search; | 2502 | char *search; |
2501 | bool exists; | 2503 | bool exists; |
@@ -2506,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2506 | 2508 | ||
2507 | /* decode regex */ | 2509 | /* decode regex */ |
2508 | type = filter_parse_regex(buffer, strlen(buffer), &search, &not); | 2510 | type = filter_parse_regex(buffer, strlen(buffer), &search, &not); |
2509 | if (not) | 2511 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
2510 | return -EINVAL; | 2512 | return -EBUSY; |
2511 | 2513 | ||
2512 | search_len = strlen(search); | 2514 | search_len = strlen(search); |
2513 | 2515 | ||
2514 | mutex_lock(&ftrace_lock); | 2516 | mutex_lock(&ftrace_lock); |
2515 | do_for_each_ftrace_rec(pg, rec) { | 2517 | do_for_each_ftrace_rec(pg, rec) { |
2516 | 2518 | ||
2517 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
2518 | break; | ||
2519 | |||
2520 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 2519 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) |
2521 | continue; | 2520 | continue; |
2522 | 2521 | ||
2523 | if (ftrace_match_record(rec, search, search_len, type)) { | 2522 | if (ftrace_match_record(rec, search, search_len, type)) { |
2524 | /* ensure it is not already in the array */ | 2523 | /* if it is in the array */ |
2525 | exists = false; | 2524 | exists = false; |
2526 | for (i = 0; i < *idx; i++) | 2525 | for (i = 0; i < *idx; i++) { |
2527 | if (array[i] == rec->ip) { | 2526 | if (array[i] == rec->ip) { |
2528 | exists = true; | 2527 | exists = true; |
2529 | break; | 2528 | break; |
2530 | } | 2529 | } |
2531 | if (!exists) | 2530 | } |
2532 | array[(*idx)++] = rec->ip; | 2531 | |
2533 | found = 1; | 2532 | if (!not) { |
2533 | fail = 0; | ||
2534 | if (!exists) { | ||
2535 | array[(*idx)++] = rec->ip; | ||
2536 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
2537 | goto out; | ||
2538 | } | ||
2539 | } else { | ||
2540 | if (exists) { | ||
2541 | array[i] = array[--(*idx)]; | ||
2542 | array[*idx] = 0; | ||
2543 | fail = 0; | ||
2544 | } | ||
2545 | } | ||
2534 | } | 2546 | } |
2535 | } while_for_each_ftrace_rec(); | 2547 | } while_for_each_ftrace_rec(); |
2536 | 2548 | out: | |
2537 | mutex_unlock(&ftrace_lock); | 2549 | mutex_unlock(&ftrace_lock); |
2538 | 2550 | ||
2539 | return found ? 0 : -EINVAL; | 2551 | if (fail) |
2552 | return -EINVAL; | ||
2553 | |||
2554 | ftrace_graph_filter_enabled = 1; | ||
2555 | return 0; | ||
2540 | } | 2556 | } |
2541 | 2557 | ||
2542 | static ssize_t | 2558 | static ssize_t |
@@ -2546,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, | |||
2546 | struct trace_parser parser; | 2562 | struct trace_parser parser; |
2547 | ssize_t read, ret; | 2563 | ssize_t read, ret; |
2548 | 2564 | ||
2549 | if (!cnt || cnt < 0) | 2565 | if (!cnt) |
2550 | return 0; | 2566 | return 0; |
2551 | 2567 | ||
2552 | mutex_lock(&graph_lock); | 2568 | mutex_lock(&graph_lock); |
2553 | 2569 | ||
2554 | if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { | ||
2555 | ret = -EBUSY; | ||
2556 | goto out_unlock; | ||
2557 | } | ||
2558 | |||
2559 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { | 2570 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { |
2560 | ret = -ENOMEM; | 2571 | ret = -ENOMEM; |
2561 | goto out_unlock; | 2572 | goto out_unlock; |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..0287f9f52f5a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | 22 | ||
23 | #include <asm/local.h> | ||
23 | #include "trace.h" | 24 | #include "trace.h" |
24 | 25 | ||
25 | /* | 26 | /* |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/time.h> | 10 | #include <linux/time.h> |
11 | #include <asm/local.h> | ||
11 | 12 | ||
12 | struct rb_page { | 13 | struct rb_page { |
13 | u64 ts; | 14 | u64 ts; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eac6875cb990..ed01fdba4a55 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
33 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
34 | #include <linux/string.h> | 34 | #include <linux/string.h> |
35 | #include <linux/rwsem.h> | ||
35 | #include <linux/ctype.h> | 36 | #include <linux/ctype.h> |
36 | #include <linux/init.h> | 37 | #include <linux/init.h> |
37 | #include <linux/poll.h> | 38 | #include <linux/poll.h> |
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); | |||
91 | static inline void ftrace_disable_cpu(void) | 92 | static inline void ftrace_disable_cpu(void) |
92 | { | 93 | { |
93 | preempt_disable(); | 94 | preempt_disable(); |
94 | __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); | 95 | __this_cpu_inc(ftrace_cpu_disabled); |
95 | } | 96 | } |
96 | 97 | ||
97 | static inline void ftrace_enable_cpu(void) | 98 | static inline void ftrace_enable_cpu(void) |
98 | { | 99 | { |
99 | __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); | 100 | __this_cpu_dec(ftrace_cpu_disabled); |
100 | preempt_enable(); | 101 | preempt_enable(); |
101 | } | 102 | } |
102 | 103 | ||
103 | static cpumask_var_t __read_mostly tracing_buffer_mask; | 104 | static cpumask_var_t __read_mostly tracing_buffer_mask; |
104 | 105 | ||
105 | /* Define which cpu buffers are currently read in trace_pipe */ | ||
106 | static cpumask_var_t tracing_reader_cpumask; | ||
107 | |||
108 | #define for_each_tracing_cpu(cpu) \ | 106 | #define for_each_tracing_cpu(cpu) \ |
109 | for_each_cpu(cpu, tracing_buffer_mask) | 107 | for_each_cpu(cpu, tracing_buffer_mask) |
110 | 108 | ||
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; | |||
243 | 241 | ||
244 | /* | 242 | /* |
245 | * trace_types_lock is used to protect the trace_types list. | 243 | * trace_types_lock is used to protect the trace_types list. |
246 | * This lock is also used to keep user access serialized. | ||
247 | * Accesses from userspace will grab this lock while userspace | ||
248 | * activities happen inside the kernel. | ||
249 | */ | 244 | */ |
250 | static DEFINE_MUTEX(trace_types_lock); | 245 | static DEFINE_MUTEX(trace_types_lock); |
251 | 246 | ||
247 | /* | ||
248 | * serialize the access of the ring buffer | ||
249 | * | ||
250 | * ring buffer serializes readers, but it is low level protection. | ||
251 | * The validity of the events (which are returned by ring_buffer_peek() etc.) | ||
252 | * is not protected by the ring buffer. | ||
253 | * | ||
254 | * The content of events may become garbage if we allow another process to consume | ||
255 | * these events concurrently: | ||
256 | * A) the page of the consumed events may become a normal page | ||
257 | * (not reader page) in ring buffer, and this page will be rewritten | ||
258 | * by events producer. | ||
259 | * B) The page of the consumed events may become a page for splice_read, | ||
260 | * and this page will be returned to system. | ||
261 | * | ||
262 | * These primitives allow multiple processes to access different cpu ring buffers | ||
263 | * concurrently. | ||
264 | * | ||
265 | * These primitives don't distinguish read-only and read-consume access. | ||
266 | * Multiple read-only accesses are also serialized. | ||
267 | */ | ||
268 | |||
269 | #ifdef CONFIG_SMP | ||
270 | static DECLARE_RWSEM(all_cpu_access_lock); | ||
271 | static DEFINE_PER_CPU(struct mutex, cpu_access_lock); | ||
272 | |||
273 | static inline void trace_access_lock(int cpu) | ||
274 | { | ||
275 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
276 | /* gain it for accessing the whole ring buffer. */ | ||
277 | down_write(&all_cpu_access_lock); | ||
278 | } else { | ||
279 | /* gain it for accessing a cpu ring buffer. */ | ||
280 | |||
281 | /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ | ||
282 | down_read(&all_cpu_access_lock); | ||
283 | |||
284 | /* Secondly block other access to this @cpu ring buffer. */ | ||
285 | mutex_lock(&per_cpu(cpu_access_lock, cpu)); | ||
286 | } | ||
287 | } | ||
288 | |||
289 | static inline void trace_access_unlock(int cpu) | ||
290 | { | ||
291 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
292 | up_write(&all_cpu_access_lock); | ||
293 | } else { | ||
294 | mutex_unlock(&per_cpu(cpu_access_lock, cpu)); | ||
295 | up_read(&all_cpu_access_lock); | ||
296 | } | ||
297 | } | ||
298 | |||
299 | static inline void trace_access_lock_init(void) | ||
300 | { | ||
301 | int cpu; | ||
302 | |||
303 | for_each_possible_cpu(cpu) | ||
304 | mutex_init(&per_cpu(cpu_access_lock, cpu)); | ||
305 | } | ||
306 | |||
307 | #else | ||
308 | |||
309 | static DEFINE_MUTEX(access_lock); | ||
310 | |||
311 | static inline void trace_access_lock(int cpu) | ||
312 | { | ||
313 | (void)cpu; | ||
314 | mutex_lock(&access_lock); | ||
315 | } | ||
316 | |||
317 | static inline void trace_access_unlock(int cpu) | ||
318 | { | ||
319 | (void)cpu; | ||
320 | mutex_unlock(&access_lock); | ||
321 | } | ||
322 | |||
323 | static inline void trace_access_lock_init(void) | ||
324 | { | ||
325 | } | ||
326 | |||
327 | #endif | ||
328 | |||
252 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ | 329 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ |
253 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | 330 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); |
254 | 331 | ||
@@ -1089,7 +1166,7 @@ trace_function(struct trace_array *tr, | |||
1089 | struct ftrace_entry *entry; | 1166 | struct ftrace_entry *entry; |
1090 | 1167 | ||
1091 | /* If we are reading the ring buffer, don't trace */ | 1168 | /* If we are reading the ring buffer, don't trace */ |
1092 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 1169 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
1093 | return; | 1170 | return; |
1094 | 1171 | ||
1095 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), | 1172 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), |
@@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1320 | entry->fmt = fmt; | 1397 | entry->fmt = fmt; |
1321 | 1398 | ||
1322 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1399 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); |
1323 | if (!filter_check_discard(call, entry, buffer, event)) | 1400 | if (!filter_check_discard(call, entry, buffer, event)) { |
1324 | ring_buffer_unlock_commit(buffer, event); | 1401 | ring_buffer_unlock_commit(buffer, event); |
1402 | ftrace_trace_stack(buffer, flags, 6, pc); | ||
1403 | } | ||
1325 | 1404 | ||
1326 | out_unlock: | 1405 | out_unlock: |
1327 | arch_spin_unlock(&trace_buf_lock); | 1406 | arch_spin_unlock(&trace_buf_lock); |
@@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1394 | 1473 | ||
1395 | memcpy(&entry->buf, trace_buf, len); | 1474 | memcpy(&entry->buf, trace_buf, len); |
1396 | entry->buf[len] = '\0'; | 1475 | entry->buf[len] = '\0'; |
1397 | if (!filter_check_discard(call, entry, buffer, event)) | 1476 | if (!filter_check_discard(call, entry, buffer, event)) { |
1398 | ring_buffer_unlock_commit(buffer, event); | 1477 | ring_buffer_unlock_commit(buffer, event); |
1478 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | ||
1479 | } | ||
1399 | 1480 | ||
1400 | out_unlock: | 1481 | out_unlock: |
1401 | arch_spin_unlock(&trace_buf_lock); | 1482 | arch_spin_unlock(&trace_buf_lock); |
@@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
1585 | } | 1666 | } |
1586 | 1667 | ||
1587 | /* | 1668 | /* |
1588 | * No necessary locking here. The worst thing which can | ||
1589 | * happen is loosing events consumed at the same time | ||
1590 | * by a trace_pipe reader. | ||
1591 | * Other than that, we don't risk to crash the ring buffer | ||
1592 | * because it serializes the readers. | ||
1593 | * | ||
1594 | * The current tracer is copied to avoid a global locking | 1669 | * The current tracer is copied to avoid a global locking |
1595 | * all around. | 1670 | * all around. |
1596 | */ | 1671 | */ |
@@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1645 | } | 1720 | } |
1646 | 1721 | ||
1647 | trace_event_read_lock(); | 1722 | trace_event_read_lock(); |
1723 | trace_access_lock(cpu_file); | ||
1648 | return p; | 1724 | return p; |
1649 | } | 1725 | } |
1650 | 1726 | ||
1651 | static void s_stop(struct seq_file *m, void *p) | 1727 | static void s_stop(struct seq_file *m, void *p) |
1652 | { | 1728 | { |
1729 | struct trace_iterator *iter = m->private; | ||
1730 | |||
1653 | atomic_dec(&trace_record_cmdline_disabled); | 1731 | atomic_dec(&trace_record_cmdline_disabled); |
1732 | trace_access_unlock(iter->cpu_file); | ||
1654 | trace_event_read_unlock(); | 1733 | trace_event_read_unlock(); |
1655 | } | 1734 | } |
1656 | 1735 | ||
@@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
2841 | 2920 | ||
2842 | mutex_lock(&trace_types_lock); | 2921 | mutex_lock(&trace_types_lock); |
2843 | 2922 | ||
2844 | /* We only allow one reader per cpu */ | ||
2845 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | ||
2846 | if (!cpumask_empty(tracing_reader_cpumask)) { | ||
2847 | ret = -EBUSY; | ||
2848 | goto out; | ||
2849 | } | ||
2850 | cpumask_setall(tracing_reader_cpumask); | ||
2851 | } else { | ||
2852 | if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) | ||
2853 | cpumask_set_cpu(cpu_file, tracing_reader_cpumask); | ||
2854 | else { | ||
2855 | ret = -EBUSY; | ||
2856 | goto out; | ||
2857 | } | ||
2858 | } | ||
2859 | |||
2860 | /* create a buffer to store the information to pass to userspace */ | 2923 | /* create a buffer to store the information to pass to userspace */ |
2861 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2924 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); |
2862 | if (!iter) { | 2925 | if (!iter) { |
@@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
2912 | 2975 | ||
2913 | mutex_lock(&trace_types_lock); | 2976 | mutex_lock(&trace_types_lock); |
2914 | 2977 | ||
2915 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) | ||
2916 | cpumask_clear(tracing_reader_cpumask); | ||
2917 | else | ||
2918 | cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); | ||
2919 | |||
2920 | |||
2921 | if (iter->trace->pipe_close) | 2978 | if (iter->trace->pipe_close) |
2922 | iter->trace->pipe_close(iter); | 2979 | iter->trace->pipe_close(iter); |
2923 | 2980 | ||
@@ -3079,6 +3136,7 @@ waitagain: | |||
3079 | iter->pos = -1; | 3136 | iter->pos = -1; |
3080 | 3137 | ||
3081 | trace_event_read_lock(); | 3138 | trace_event_read_lock(); |
3139 | trace_access_lock(iter->cpu_file); | ||
3082 | while (find_next_entry_inc(iter) != NULL) { | 3140 | while (find_next_entry_inc(iter) != NULL) { |
3083 | enum print_line_t ret; | 3141 | enum print_line_t ret; |
3084 | int len = iter->seq.len; | 3142 | int len = iter->seq.len; |
@@ -3095,6 +3153,7 @@ waitagain: | |||
3095 | if (iter->seq.len >= cnt) | 3153 | if (iter->seq.len >= cnt) |
3096 | break; | 3154 | break; |
3097 | } | 3155 | } |
3156 | trace_access_unlock(iter->cpu_file); | ||
3098 | trace_event_read_unlock(); | 3157 | trace_event_read_unlock(); |
3099 | 3158 | ||
3100 | /* Now copy what we have to the user */ | 3159 | /* Now copy what we have to the user */ |
@@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3220 | } | 3279 | } |
3221 | 3280 | ||
3222 | trace_event_read_lock(); | 3281 | trace_event_read_lock(); |
3282 | trace_access_lock(iter->cpu_file); | ||
3223 | 3283 | ||
3224 | /* Fill as many pages as possible. */ | 3284 | /* Fill as many pages as possible. */ |
3225 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { | 3285 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { |
@@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3243 | trace_seq_init(&iter->seq); | 3303 | trace_seq_init(&iter->seq); |
3244 | } | 3304 | } |
3245 | 3305 | ||
3306 | trace_access_unlock(iter->cpu_file); | ||
3246 | trace_event_read_unlock(); | 3307 | trace_event_read_unlock(); |
3247 | mutex_unlock(&iter->mutex); | 3308 | mutex_unlock(&iter->mutex); |
3248 | 3309 | ||
@@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3544 | 3605 | ||
3545 | info->read = 0; | 3606 | info->read = 0; |
3546 | 3607 | ||
3608 | trace_access_lock(info->cpu); | ||
3547 | ret = ring_buffer_read_page(info->tr->buffer, | 3609 | ret = ring_buffer_read_page(info->tr->buffer, |
3548 | &info->spare, | 3610 | &info->spare, |
3549 | count, | 3611 | count, |
3550 | info->cpu, 0); | 3612 | info->cpu, 0); |
3613 | trace_access_unlock(info->cpu); | ||
3551 | if (ret < 0) | 3614 | if (ret < 0) |
3552 | return 0; | 3615 | return 0; |
3553 | 3616 | ||
@@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3675 | len &= PAGE_MASK; | 3738 | len &= PAGE_MASK; |
3676 | } | 3739 | } |
3677 | 3740 | ||
3741 | trace_access_lock(info->cpu); | ||
3678 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3742 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
3679 | 3743 | ||
3680 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { | 3744 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { |
@@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3722 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3786 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
3723 | } | 3787 | } |
3724 | 3788 | ||
3789 | trace_access_unlock(info->cpu); | ||
3725 | spd.nr_pages = i; | 3790 | spd.nr_pages = i; |
3726 | 3791 | ||
3727 | /* did we read anything? */ | 3792 | /* did we read anything? */ |
@@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void) | |||
4158 | struct dentry *d_tracer; | 4223 | struct dentry *d_tracer; |
4159 | int cpu; | 4224 | int cpu; |
4160 | 4225 | ||
4226 | trace_access_lock_init(); | ||
4227 | |||
4161 | d_tracer = tracing_init_dentry(); | 4228 | d_tracer = tracing_init_dentry(); |
4162 | 4229 | ||
4163 | trace_create_file("tracing_enabled", 0644, d_tracer, | 4230 | trace_create_file("tracing_enabled", 0644, d_tracer, |
@@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void) | |||
4392 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 4459 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4393 | goto out_free_buffer_mask; | 4460 | goto out_free_buffer_mask; |
4394 | 4461 | ||
4395 | if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) | ||
4396 | goto out_free_tracing_cpumask; | ||
4397 | |||
4398 | /* To save memory, keep the ring buffer size to its minimum */ | 4462 | /* To save memory, keep the ring buffer size to its minimum */ |
4399 | if (ring_buffer_expanded) | 4463 | if (ring_buffer_expanded) |
4400 | ring_buf_size = trace_buf_size; | 4464 | ring_buf_size = trace_buf_size; |
@@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void) | |||
4452 | return 0; | 4516 | return 0; |
4453 | 4517 | ||
4454 | out_free_cpumask: | 4518 | out_free_cpumask: |
4455 | free_cpumask_var(tracing_reader_cpumask); | ||
4456 | out_free_tracing_cpumask: | ||
4457 | free_cpumask_var(tracing_cpumask); | 4519 | free_cpumask_var(tracing_cpumask); |
4458 | out_free_buffer_mask: | 4520 | out_free_buffer_mask: |
4459 | free_cpumask_var(tracing_buffer_mask); | 4521 | free_cpumask_var(tracing_buffer_mask); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..fd05bcaf91b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); | |||
497 | #ifdef CONFIG_DYNAMIC_FTRACE | 497 | #ifdef CONFIG_DYNAMIC_FTRACE |
498 | /* TODO: make this variable */ | 498 | /* TODO: make this variable */ |
499 | #define FTRACE_GRAPH_MAX_FUNCS 32 | 499 | #define FTRACE_GRAPH_MAX_FUNCS 32 |
500 | extern int ftrace_graph_filter_enabled; | ||
500 | extern int ftrace_graph_count; | 501 | extern int ftrace_graph_count; |
501 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; | 502 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; |
502 | 503 | ||
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
504 | { | 505 | { |
505 | int i; | 506 | int i; |
506 | 507 | ||
507 | if (!ftrace_graph_count || test_tsk_trace_graph(current)) | 508 | if (!ftrace_graph_filter_enabled) |
508 | return 1; | 509 | return 1; |
509 | 510 | ||
510 | for (i = 0; i < ftrace_graph_count; i++) { | 511 | for (i = 0; i < ftrace_graph_count; i++) { |
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
791 | 792 | ||
792 | #undef FTRACE_ENTRY | 793 | #undef FTRACE_ENTRY |
793 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ | 794 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ |
794 | extern struct ftrace_event_call event_##call; | 795 | extern struct ftrace_event_call \ |
796 | __attribute__((__aligned__(4))) event_##call; | ||
795 | #undef FTRACE_ENTRY_DUP | 797 | #undef FTRACE_ENTRY_DUP |
796 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ | 798 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ |
797 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 799 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88c..b9bc4d470177 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) | |||
307 | return -1; | 307 | return -1; |
308 | if (percent_a > percent_b) | 308 | if (percent_a > percent_b) |
309 | return 1; | 309 | return 1; |
310 | else | 310 | |
311 | return 0; | 311 | if (a->incorrect < b->incorrect) |
312 | return -1; | ||
313 | if (a->incorrect > b->incorrect) | ||
314 | return 1; | ||
315 | |||
316 | /* | ||
317 | * Since the above shows worse (incorrect) cases | ||
318 | * first, we continue that by showing best (correct) | ||
319 | * cases last. | ||
320 | */ | ||
321 | if (a->correct > b->correct) | ||
322 | return -1; | ||
323 | if (a->correct < b->correct) | ||
324 | return 1; | ||
325 | |||
326 | return 0; | ||
312 | } | 327 | } |
313 | 328 | ||
314 | static struct tracer_stat annotated_branch_stats = { | 329 | static struct tracer_stat annotated_branch_stats = { |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..3f972ad98d04 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
60 | return 0; | 60 | return 0; |
61 | 61 | ||
62 | err: | 62 | err: |
63 | if (field) { | 63 | if (field) |
64 | kfree(field->name); | 64 | kfree(field->name); |
65 | kfree(field->type); | ||
66 | } | ||
67 | kfree(field); | 65 | kfree(field); |
68 | 66 | ||
69 | return -ENOMEM; | 67 | return -ENOMEM; |
@@ -520,41 +518,16 @@ out: | |||
520 | return ret; | 518 | return ret; |
521 | } | 519 | } |
522 | 520 | ||
523 | extern char *__bad_type_size(void); | ||
524 | |||
525 | #undef FIELD | ||
526 | #define FIELD(type, name) \ | ||
527 | sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ | ||
528 | #type, "common_" #name, offsetof(typeof(field), name), \ | ||
529 | sizeof(field.name), is_signed_type(type) | ||
530 | |||
531 | static int trace_write_header(struct trace_seq *s) | ||
532 | { | ||
533 | struct trace_entry field; | ||
534 | |||
535 | /* struct trace_entry */ | ||
536 | return trace_seq_printf(s, | ||
537 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
538 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
539 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
540 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
541 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
542 | "\n", | ||
543 | FIELD(unsigned short, type), | ||
544 | FIELD(unsigned char, flags), | ||
545 | FIELD(unsigned char, preempt_count), | ||
546 | FIELD(int, pid), | ||
547 | FIELD(int, lock_depth)); | ||
548 | } | ||
549 | |||
550 | static ssize_t | 521 | static ssize_t |
551 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | 522 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, |
552 | loff_t *ppos) | 523 | loff_t *ppos) |
553 | { | 524 | { |
554 | struct ftrace_event_call *call = filp->private_data; | 525 | struct ftrace_event_call *call = filp->private_data; |
526 | struct ftrace_event_field *field; | ||
555 | struct trace_seq *s; | 527 | struct trace_seq *s; |
528 | int common_field_count = 5; | ||
556 | char *buf; | 529 | char *buf; |
557 | int r; | 530 | int r = 0; |
558 | 531 | ||
559 | if (*ppos) | 532 | if (*ppos) |
560 | return 0; | 533 | return 0; |
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
565 | 538 | ||
566 | trace_seq_init(s); | 539 | trace_seq_init(s); |
567 | 540 | ||
568 | /* If any of the first writes fail, so will the show_format. */ | ||
569 | |||
570 | trace_seq_printf(s, "name: %s\n", call->name); | 541 | trace_seq_printf(s, "name: %s\n", call->name); |
571 | trace_seq_printf(s, "ID: %d\n", call->id); | 542 | trace_seq_printf(s, "ID: %d\n", call->id); |
572 | trace_seq_printf(s, "format:\n"); | 543 | trace_seq_printf(s, "format:\n"); |
573 | trace_write_header(s); | ||
574 | 544 | ||
575 | r = call->show_format(call, s); | 545 | list_for_each_entry_reverse(field, &call->fields, link) { |
546 | /* | ||
547 | * Smartly shows the array type (except dynamic arrays). | ||
548 | * Normal: | ||
549 | * field:TYPE VAR | ||
550 | * If TYPE := TYPE[LEN], it is shown: | ||
551 | * field:TYPE VAR[LEN] | ||
552 | */ | ||
553 | const char *array_descriptor = strchr(field->type, '['); | ||
554 | |||
555 | if (!strncmp(field->type, "__data_loc", 10)) | ||
556 | array_descriptor = NULL; | ||
557 | |||
558 | if (!array_descriptor) { | ||
559 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | ||
560 | "\tsize:%u;\tsigned:%d;\n", | ||
561 | field->type, field->name, field->offset, | ||
562 | field->size, !!field->is_signed); | ||
563 | } else { | ||
564 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | ||
565 | "\tsize:%u;\tsigned:%d;\n", | ||
566 | (int)(array_descriptor - field->type), | ||
567 | field->type, field->name, | ||
568 | array_descriptor, field->offset, | ||
569 | field->size, !!field->is_signed); | ||
570 | } | ||
571 | |||
572 | if (--common_field_count == 0) | ||
573 | r = trace_seq_printf(s, "\n"); | ||
574 | |||
575 | if (!r) | ||
576 | break; | ||
577 | } | ||
578 | |||
579 | if (r) | ||
580 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | ||
581 | call->print_fmt); | ||
582 | |||
576 | if (!r) { | 583 | if (!r) { |
577 | /* | 584 | /* |
578 | * ug! The format output is bigger than a PAGE!! | 585 | * ug! The format output is bigger than a PAGE!! |
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
948 | filter); | 955 | filter); |
949 | } | 956 | } |
950 | 957 | ||
951 | /* A trace may not want to export its format */ | ||
952 | if (!call->show_format) | ||
953 | return 0; | ||
954 | |||
955 | trace_create_file("format", 0444, call->dir, call, | 958 | trace_create_file("format", 0444, call->dir, call, |
956 | format); | 959 | format); |
957 | 960 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
62 | 62 | ||
63 | #include "trace_entries.h" | 63 | #include "trace_entries.h" |
64 | 64 | ||
65 | |||
66 | #undef __field | ||
67 | #define __field(type, item) \ | ||
68 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
69 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
70 | offsetof(typeof(field), item), \ | ||
71 | sizeof(field.item), is_signed_type(type)); \ | ||
72 | if (!ret) \ | ||
73 | return 0; | ||
74 | |||
75 | #undef __field_desc | ||
76 | #define __field_desc(type, container, item) \ | ||
77 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
78 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
79 | offsetof(typeof(field), container.item), \ | ||
80 | sizeof(field.container.item), \ | ||
81 | is_signed_type(type)); \ | ||
82 | if (!ret) \ | ||
83 | return 0; | ||
84 | |||
85 | #undef __array | ||
86 | #define __array(type, item, len) \ | ||
87 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
88 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
89 | offsetof(typeof(field), item), \ | ||
90 | sizeof(field.item), is_signed_type(type)); \ | ||
91 | if (!ret) \ | ||
92 | return 0; | ||
93 | |||
94 | #undef __array_desc | ||
95 | #define __array_desc(type, container, item, len) \ | ||
96 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
97 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
98 | offsetof(typeof(field), container.item), \ | ||
99 | sizeof(field.container.item), \ | ||
100 | is_signed_type(type)); \ | ||
101 | if (!ret) \ | ||
102 | return 0; | ||
103 | |||
104 | #undef __dynamic_array | ||
105 | #define __dynamic_array(type, item) \ | ||
106 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
107 | "offset:%zu;\tsize:0;\tsigned:%u;\n", \ | ||
108 | offsetof(typeof(field), item), \ | ||
109 | is_signed_type(type)); \ | ||
110 | if (!ret) \ | ||
111 | return 0; | ||
112 | |||
113 | #undef F_printk | ||
114 | #define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) | ||
115 | |||
116 | #undef __entry | ||
117 | #define __entry REC | ||
118 | |||
119 | #undef FTRACE_ENTRY | ||
120 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | ||
121 | static int \ | ||
122 | ftrace_format_##name(struct ftrace_event_call *unused, \ | ||
123 | struct trace_seq *s) \ | ||
124 | { \ | ||
125 | struct struct_name field __attribute__((unused)); \ | ||
126 | int ret = 0; \ | ||
127 | \ | ||
128 | tstruct; \ | ||
129 | \ | ||
130 | trace_seq_printf(s, "\nprint fmt: " print); \ | ||
131 | \ | ||
132 | return ret; \ | ||
133 | } | ||
134 | |||
135 | #include "trace_entries.h" | ||
136 | |||
137 | #undef __field | 65 | #undef __field |
138 | #define __field(type, item) \ | 66 | #define __field(type, item) \ |
139 | ret = trace_define_field(event_call, #type, #item, \ | 67 | ret = trace_define_field(event_call, #type, #item, \ |
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ | |||
175 | return ret; | 103 | return ret; |
176 | 104 | ||
177 | #undef __dynamic_array | 105 | #undef __dynamic_array |
178 | #define __dynamic_array(type, item) | 106 | #define __dynamic_array(type, item) \ |
107 | ret = trace_define_field(event_call, #type, #item, \ | ||
108 | offsetof(typeof(field), item), \ | ||
109 | 0, is_signed_type(type), FILTER_OTHER);\ | ||
110 | if (ret) \ | ||
111 | return ret; | ||
179 | 112 | ||
180 | #undef FTRACE_ENTRY | 113 | #undef FTRACE_ENTRY |
181 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 114 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ |
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
198 | return 0; | 131 | return 0; |
199 | } | 132 | } |
200 | 133 | ||
134 | #undef __entry | ||
135 | #define __entry REC | ||
136 | |||
201 | #undef __field | 137 | #undef __field |
202 | #define __field(type, item) | 138 | #define __field(type, item) |
203 | 139 | ||
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
213 | #undef __dynamic_array | 149 | #undef __dynamic_array |
214 | #define __dynamic_array(type, item) | 150 | #define __dynamic_array(type, item) |
215 | 151 | ||
152 | #undef F_printk | ||
153 | #define F_printk(fmt, args...) #fmt ", " __stringify(args) | ||
154 | |||
216 | #undef FTRACE_ENTRY | 155 | #undef FTRACE_ENTRY |
217 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ | 156 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ |
218 | \ | 157 | \ |
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ | |||
223 | .id = type, \ | 162 | .id = type, \ |
224 | .system = __stringify(TRACE_SYSTEM), \ | 163 | .system = __stringify(TRACE_SYSTEM), \ |
225 | .raw_init = ftrace_raw_init_event, \ | 164 | .raw_init = ftrace_raw_init_event, \ |
226 | .show_format = ftrace_format_##call, \ | 165 | .print_fmt = print, \ |
227 | .define_fields = ftrace_define_fields_##call, \ | 166 | .define_fields = ftrace_define_fields_##call, \ |
228 | }; \ | 167 | }; \ |
229 | 168 | ||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37cf..3fc2a575664f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -18,6 +18,7 @@ struct fgraph_cpu_data { | |||
18 | pid_t last_pid; | 18 | pid_t last_pid; |
19 | int depth; | 19 | int depth; |
20 | int ignore; | 20 | int ignore; |
21 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | ||
21 | }; | 22 | }; |
22 | 23 | ||
23 | struct fgraph_data { | 24 | struct fgraph_data { |
@@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr, | |||
187 | struct ring_buffer *buffer = tr->buffer; | 188 | struct ring_buffer *buffer = tr->buffer; |
188 | struct ftrace_graph_ent_entry *entry; | 189 | struct ftrace_graph_ent_entry *entry; |
189 | 190 | ||
190 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 191 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
191 | return 0; | 192 | return 0; |
192 | 193 | ||
193 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, | 194 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, |
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
212 | int cpu; | 213 | int cpu; |
213 | int pc; | 214 | int pc; |
214 | 215 | ||
215 | if (unlikely(!tr)) | ||
216 | return 0; | ||
217 | |||
218 | if (!ftrace_trace_task(current)) | 216 | if (!ftrace_trace_task(current)) |
219 | return 0; | 217 | return 0; |
220 | 218 | ||
221 | if (!ftrace_graph_addr(trace->func)) | 219 | /* trace it when it is nested in, or is, an enabled function. */ |
220 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | ||
222 | return 0; | 221 | return 0; |
223 | 222 | ||
224 | local_irq_save(flags); | 223 | local_irq_save(flags); |
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
231 | } else { | 230 | } else { |
232 | ret = 0; | 231 | ret = 0; |
233 | } | 232 | } |
234 | /* Only do the atomic if it is not already set */ | ||
235 | if (!test_tsk_trace_graph(current)) | ||
236 | set_tsk_trace_graph(current); | ||
237 | 233 | ||
238 | atomic_dec(&data->disabled); | 234 | atomic_dec(&data->disabled); |
239 | local_irq_restore(flags); | 235 | local_irq_restore(flags); |
@@ -251,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr, | |||
251 | struct ring_buffer *buffer = tr->buffer; | 247 | struct ring_buffer *buffer = tr->buffer; |
252 | struct ftrace_graph_ret_entry *entry; | 248 | struct ftrace_graph_ret_entry *entry; |
253 | 249 | ||
254 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 250 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
255 | return; | 251 | return; |
256 | 252 | ||
257 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, | 253 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, |
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace) | |||
281 | pc = preempt_count(); | 277 | pc = preempt_count(); |
282 | __trace_graph_return(tr, trace, flags, pc); | 278 | __trace_graph_return(tr, trace, flags, pc); |
283 | } | 279 | } |
284 | if (!trace->depth) | ||
285 | clear_tsk_trace_graph(current); | ||
286 | atomic_dec(&data->disabled); | 280 | atomic_dec(&data->disabled); |
287 | local_irq_restore(flags); | 281 | local_irq_restore(flags); |
288 | } | 282 | } |
289 | 283 | ||
284 | void set_graph_array(struct trace_array *tr) | ||
285 | { | ||
286 | graph_array = tr; | ||
287 | |||
288 | /* Make graph_array visible before we start tracing */ | ||
289 | |||
290 | smp_mb(); | ||
291 | } | ||
292 | |||
290 | static int graph_trace_init(struct trace_array *tr) | 293 | static int graph_trace_init(struct trace_array *tr) |
291 | { | 294 | { |
292 | int ret; | 295 | int ret; |
293 | 296 | ||
294 | graph_array = tr; | 297 | set_graph_array(tr); |
295 | ret = register_ftrace_graph(&trace_graph_return, | 298 | ret = register_ftrace_graph(&trace_graph_return, |
296 | &trace_graph_entry); | 299 | &trace_graph_entry); |
297 | if (ret) | 300 | if (ret) |
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr) | |||
301 | return 0; | 304 | return 0; |
302 | } | 305 | } |
303 | 306 | ||
304 | void set_graph_array(struct trace_array *tr) | ||
305 | { | ||
306 | graph_array = tr; | ||
307 | } | ||
308 | |||
309 | static void graph_trace_reset(struct trace_array *tr) | 307 | static void graph_trace_reset(struct trace_array *tr) |
310 | { | 308 | { |
311 | tracing_stop_cmdline_record(); | 309 | tracing_stop_cmdline_record(); |
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
673 | duration = graph_ret->rettime - graph_ret->calltime; | 671 | duration = graph_ret->rettime - graph_ret->calltime; |
674 | 672 | ||
675 | if (data) { | 673 | if (data) { |
674 | struct fgraph_cpu_data *cpu_data; | ||
676 | int cpu = iter->cpu; | 675 | int cpu = iter->cpu; |
677 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 676 | |
677 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
678 | 678 | ||
679 | /* | 679 | /* |
680 | * Comments display at + 1 to depth. Since | 680 | * Comments display at + 1 to depth. Since |
681 | * this is a leaf function, keep the comments | 681 | * this is a leaf function, keep the comments |
682 | * equal to this depth. | 682 | * equal to this depth. |
683 | */ | 683 | */ |
684 | *depth = call->depth - 1; | 684 | cpu_data->depth = call->depth - 1; |
685 | |||
686 | /* No need to keep this function around for this depth */ | ||
687 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
688 | cpu_data->enter_funcs[call->depth] = 0; | ||
685 | } | 689 | } |
686 | 690 | ||
687 | /* Overhead */ | 691 | /* Overhead */ |
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
721 | int i; | 725 | int i; |
722 | 726 | ||
723 | if (data) { | 727 | if (data) { |
728 | struct fgraph_cpu_data *cpu_data; | ||
724 | int cpu = iter->cpu; | 729 | int cpu = iter->cpu; |
725 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | ||
726 | 730 | ||
727 | *depth = call->depth; | 731 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); |
732 | cpu_data->depth = call->depth; | ||
733 | |||
734 | /* Save this function pointer to see if the exit matches */ | ||
735 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
736 | cpu_data->enter_funcs[call->depth] = call->func; | ||
728 | } | 737 | } |
729 | 738 | ||
730 | /* No overhead */ | 739 | /* No overhead */ |
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
854 | struct fgraph_data *data = iter->private; | 863 | struct fgraph_data *data = iter->private; |
855 | pid_t pid = ent->pid; | 864 | pid_t pid = ent->pid; |
856 | int cpu = iter->cpu; | 865 | int cpu = iter->cpu; |
866 | int func_match = 1; | ||
857 | int ret; | 867 | int ret; |
858 | int i; | 868 | int i; |
859 | 869 | ||
860 | if (data) { | 870 | if (data) { |
871 | struct fgraph_cpu_data *cpu_data; | ||
861 | int cpu = iter->cpu; | 872 | int cpu = iter->cpu; |
862 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 873 | |
874 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
863 | 875 | ||
864 | /* | 876 | /* |
865 | * Comments display at + 1 to depth. This is the | 877 | * Comments display at + 1 to depth. This is the |
866 | * return from a function, we now want the comments | 878 | * return from a function, we now want the comments |
867 | * to display at the same level of the bracket. | 879 | * to display at the same level of the bracket. |
868 | */ | 880 | */ |
869 | *depth = trace->depth - 1; | 881 | cpu_data->depth = trace->depth - 1; |
882 | |||
883 | if (trace->depth < FTRACE_RETFUNC_DEPTH) { | ||
884 | if (cpu_data->enter_funcs[trace->depth] != trace->func) | ||
885 | func_match = 0; | ||
886 | cpu_data->enter_funcs[trace->depth] = 0; | ||
887 | } | ||
870 | } | 888 | } |
871 | 889 | ||
872 | if (print_graph_prologue(iter, s, 0, 0)) | 890 | if (print_graph_prologue(iter, s, 0, 0)) |
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
891 | return TRACE_TYPE_PARTIAL_LINE; | 909 | return TRACE_TYPE_PARTIAL_LINE; |
892 | } | 910 | } |
893 | 911 | ||
894 | ret = trace_seq_printf(s, "}\n"); | 912 | /* |
895 | if (!ret) | 913 | * If the return function does not have a matching entry, |
896 | return TRACE_TYPE_PARTIAL_LINE; | 914 | * then the entry was lost. Instead of just printing |
915 | * the '}' and letting the user guess what function this | ||
916 | * belongs to, write out the function name. | ||
917 | */ | ||
918 | if (func_match) { | ||
919 | ret = trace_seq_printf(s, "}\n"); | ||
920 | if (!ret) | ||
921 | return TRACE_TYPE_PARTIAL_LINE; | ||
922 | } else { | ||
923 | ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); | ||
924 | if (!ret) | ||
925 | return TRACE_TYPE_PARTIAL_LINE; | ||
926 | } | ||
897 | 927 | ||
898 | /* Overrun */ | 928 | /* Overrun */ |
899 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { | 929 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 356c10227c98..505c92273b1a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv) | |||
635 | event = strchr(group, '/') + 1; | 635 | event = strchr(group, '/') + 1; |
636 | event[-1] = '\0'; | 636 | event[-1] = '\0'; |
637 | if (strlen(group) == 0) { | 637 | if (strlen(group) == 0) { |
638 | pr_info("Group name is not specifiled\n"); | 638 | pr_info("Group name is not specified\n"); |
639 | return -EINVAL; | 639 | return -EINVAL; |
640 | } | 640 | } |
641 | } | 641 | } |
642 | if (strlen(event) == 0) { | 642 | if (strlen(event) == 0) { |
643 | pr_info("Event name is not specifiled\n"); | 643 | pr_info("Event name is not specified\n"); |
644 | return -EINVAL; | 644 | return -EINVAL; |
645 | } | 645 | } |
646 | } | 646 | } |
@@ -1155,80 +1155,60 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1155 | return 0; | 1155 | return 0; |
1156 | } | 1156 | } |
1157 | 1157 | ||
1158 | static int __probe_event_show_format(struct trace_seq *s, | 1158 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) |
1159 | struct trace_probe *tp, const char *fmt, | ||
1160 | const char *arg) | ||
1161 | { | 1159 | { |
1162 | int i; | 1160 | int i; |
1161 | int pos = 0; | ||
1163 | 1162 | ||
1164 | /* Show format */ | 1163 | const char *fmt, *arg; |
1165 | if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) | ||
1166 | return 0; | ||
1167 | 1164 | ||
1168 | for (i = 0; i < tp->nr_args; i++) | 1165 | if (!probe_is_return(tp)) { |
1169 | if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) | 1166 | fmt = "(%lx)"; |
1170 | return 0; | 1167 | arg = "REC->" FIELD_STRING_IP; |
1168 | } else { | ||
1169 | fmt = "(%lx <- %lx)"; | ||
1170 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
1171 | } | ||
1171 | 1172 | ||
1172 | if (!trace_seq_printf(s, "\", %s", arg)) | 1173 | /* When len=0, we just calculate the needed length */ |
1173 | return 0; | 1174 | #define LEN_OR_ZERO (len ? len - pos : 0) |
1174 | 1175 | ||
1175 | for (i = 0; i < tp->nr_args; i++) | 1176 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); |
1176 | if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) | ||
1177 | return 0; | ||
1178 | 1177 | ||
1179 | return trace_seq_puts(s, "\n"); | 1178 | for (i = 0; i < tp->nr_args; i++) { |
1180 | } | 1179 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", |
1180 | tp->args[i].name); | ||
1181 | } | ||
1181 | 1182 | ||
1182 | #undef SHOW_FIELD | 1183 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
1183 | #define SHOW_FIELD(type, item, name) \ | ||
1184 | do { \ | ||
1185 | ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ | ||
1186 | "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ | ||
1187 | (unsigned int)offsetof(typeof(field), item),\ | ||
1188 | (unsigned int)sizeof(type), \ | ||
1189 | is_signed_type(type)); \ | ||
1190 | if (!ret) \ | ||
1191 | return 0; \ | ||
1192 | } while (0) | ||
1193 | 1184 | ||
1194 | static int kprobe_event_show_format(struct ftrace_event_call *call, | 1185 | for (i = 0; i < tp->nr_args; i++) { |
1195 | struct trace_seq *s) | 1186 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", |
1196 | { | 1187 | tp->args[i].name); |
1197 | struct kprobe_trace_entry field __attribute__((unused)); | 1188 | } |
1198 | int ret, i; | ||
1199 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1200 | |||
1201 | SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); | ||
1202 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | ||
1203 | 1189 | ||
1204 | /* Show fields */ | 1190 | #undef LEN_OR_ZERO |
1205 | for (i = 0; i < tp->nr_args; i++) | ||
1206 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | ||
1207 | trace_seq_puts(s, "\n"); | ||
1208 | 1191 | ||
1209 | return __probe_event_show_format(s, tp, "(%lx)", | 1192 | /* return the length of print_fmt */ |
1210 | "REC->" FIELD_STRING_IP); | 1193 | return pos; |
1211 | } | 1194 | } |
1212 | 1195 | ||
1213 | static int kretprobe_event_show_format(struct ftrace_event_call *call, | 1196 | static int set_print_fmt(struct trace_probe *tp) |
1214 | struct trace_seq *s) | ||
1215 | { | 1197 | { |
1216 | struct kretprobe_trace_entry field __attribute__((unused)); | 1198 | int len; |
1217 | int ret, i; | 1199 | char *print_fmt; |
1218 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1219 | 1200 | ||
1220 | SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); | 1201 | /* First: called with 0 length to calculate the needed length */ |
1221 | SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); | 1202 | len = __set_print_fmt(tp, NULL, 0); |
1222 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | 1203 | print_fmt = kmalloc(len + 1, GFP_KERNEL); |
1204 | if (!print_fmt) | ||
1205 | return -ENOMEM; | ||
1223 | 1206 | ||
1224 | /* Show fields */ | 1207 | /* Second: actually write the @print_fmt */ |
1225 | for (i = 0; i < tp->nr_args; i++) | 1208 | __set_print_fmt(tp, print_fmt, len + 1); |
1226 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | 1209 | tp->call.print_fmt = print_fmt; |
1227 | trace_seq_puts(s, "\n"); | ||
1228 | 1210 | ||
1229 | return __probe_event_show_format(s, tp, "(%lx <- %lx)", | 1211 | return 0; |
1230 | "REC->" FIELD_STRING_FUNC | ||
1231 | ", REC->" FIELD_STRING_RETIP); | ||
1232 | } | 1212 | } |
1233 | 1213 | ||
1234 | #ifdef CONFIG_PERF_EVENTS | 1214 | #ifdef CONFIG_PERF_EVENTS |
@@ -1359,18 +1339,20 @@ static int register_probe_event(struct trace_probe *tp) | |||
1359 | if (probe_is_return(tp)) { | 1339 | if (probe_is_return(tp)) { |
1360 | tp->event.trace = print_kretprobe_event; | 1340 | tp->event.trace = print_kretprobe_event; |
1361 | call->raw_init = probe_event_raw_init; | 1341 | call->raw_init = probe_event_raw_init; |
1362 | call->show_format = kretprobe_event_show_format; | ||
1363 | call->define_fields = kretprobe_event_define_fields; | 1342 | call->define_fields = kretprobe_event_define_fields; |
1364 | } else { | 1343 | } else { |
1365 | tp->event.trace = print_kprobe_event; | 1344 | tp->event.trace = print_kprobe_event; |
1366 | call->raw_init = probe_event_raw_init; | 1345 | call->raw_init = probe_event_raw_init; |
1367 | call->show_format = kprobe_event_show_format; | ||
1368 | call->define_fields = kprobe_event_define_fields; | 1346 | call->define_fields = kprobe_event_define_fields; |
1369 | } | 1347 | } |
1348 | if (set_print_fmt(tp) < 0) | ||
1349 | return -ENOMEM; | ||
1370 | call->event = &tp->event; | 1350 | call->event = &tp->event; |
1371 | call->id = register_ftrace_event(&tp->event); | 1351 | call->id = register_ftrace_event(&tp->event); |
1372 | if (!call->id) | 1352 | if (!call->id) { |
1353 | kfree(call->print_fmt); | ||
1373 | return -ENODEV; | 1354 | return -ENODEV; |
1355 | } | ||
1374 | call->enabled = 0; | 1356 | call->enabled = 0; |
1375 | call->regfunc = probe_event_enable; | 1357 | call->regfunc = probe_event_enable; |
1376 | call->unregfunc = probe_event_disable; | 1358 | call->unregfunc = probe_event_disable; |
@@ -1383,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp) | |||
1383 | ret = trace_add_event_call(call); | 1365 | ret = trace_add_event_call(call); |
1384 | if (ret) { | 1366 | if (ret) { |
1385 | pr_info("Failed to register kprobe event: %s\n", call->name); | 1367 | pr_info("Failed to register kprobe event: %s\n", call->name); |
1368 | kfree(call->print_fmt); | ||
1386 | unregister_ftrace_event(&tp->event); | 1369 | unregister_ftrace_event(&tp->event); |
1387 | } | 1370 | } |
1388 | return ret; | 1371 | return ret; |
@@ -1392,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
1392 | { | 1375 | { |
1393 | /* tp->event is unregistered in trace_remove_event_call() */ | 1376 | /* tp->event is unregistered in trace_remove_event_call() */ |
1394 | trace_remove_event_call(&tp->call); | 1377 | trace_remove_event_call(&tp->call); |
1378 | kfree(tp->call.print_fmt); | ||
1395 | } | 1379 | } |
1396 | 1380 | ||
1397 | /* Make a debugfs interface for controling probe points */ | 1381 | /* Make a debugfs interface for controling probe points */ |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4e332b9e449c..cba47d7935cc 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void); | |||
143 | #type, #name, offsetof(typeof(trace), name), \ | 143 | #type, #name, offsetof(typeof(trace), name), \ |
144 | sizeof(trace.name), is_signed_type(type) | 144 | sizeof(trace.name), is_signed_type(type) |
145 | 145 | ||
146 | int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) | 146 | static |
147 | int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | ||
147 | { | 148 | { |
148 | int i; | 149 | int i; |
149 | int ret; | 150 | int pos = 0; |
150 | struct syscall_metadata *entry = call->data; | ||
151 | struct syscall_trace_enter trace; | ||
152 | int offset = offsetof(struct syscall_trace_enter, args); | ||
153 | 151 | ||
154 | ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 152 | /* When len=0, we just calculate the needed length */ |
155 | "\tsigned:%u;\n", | 153 | #define LEN_OR_ZERO (len ? len - pos : 0) |
156 | SYSCALL_FIELD(int, nr)); | ||
157 | if (!ret) | ||
158 | return 0; | ||
159 | 154 | ||
155 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
160 | for (i = 0; i < entry->nb_args; i++) { | 156 | for (i = 0; i < entry->nb_args; i++) { |
161 | ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], | 157 | pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", |
162 | entry->args[i]); | 158 | entry->args[i], sizeof(unsigned long), |
163 | if (!ret) | 159 | i == entry->nb_args - 1 ? "" : ", "); |
164 | return 0; | ||
165 | ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" | ||
166 | "\tsigned:%u;\n", offset, | ||
167 | sizeof(unsigned long), | ||
168 | is_signed_type(unsigned long)); | ||
169 | if (!ret) | ||
170 | return 0; | ||
171 | offset += sizeof(unsigned long); | ||
172 | } | 160 | } |
161 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
173 | 162 | ||
174 | trace_seq_puts(s, "\nprint fmt: \""); | ||
175 | for (i = 0; i < entry->nb_args; i++) { | 163 | for (i = 0; i < entry->nb_args; i++) { |
176 | ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], | 164 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
177 | sizeof(unsigned long), | 165 | ", ((unsigned long)(REC->%s))", entry->args[i]); |
178 | i == entry->nb_args - 1 ? "" : ", "); | ||
179 | if (!ret) | ||
180 | return 0; | ||
181 | } | 166 | } |
182 | trace_seq_putc(s, '"'); | ||
183 | 167 | ||
184 | for (i = 0; i < entry->nb_args; i++) { | 168 | #undef LEN_OR_ZERO |
185 | ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", | ||
186 | entry->args[i]); | ||
187 | if (!ret) | ||
188 | return 0; | ||
189 | } | ||
190 | 169 | ||
191 | return trace_seq_putc(s, '\n'); | 170 | /* return the length of print_fmt */ |
171 | return pos; | ||
192 | } | 172 | } |
193 | 173 | ||
194 | int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) | 174 | static int set_syscall_print_fmt(struct ftrace_event_call *call) |
195 | { | 175 | { |
196 | int ret; | 176 | char *print_fmt; |
197 | struct syscall_trace_exit trace; | 177 | int len; |
178 | struct syscall_metadata *entry = call->data; | ||
198 | 179 | ||
199 | ret = trace_seq_printf(s, | 180 | if (entry->enter_event != call) { |
200 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 181 | call->print_fmt = "\"0x%lx\", REC->ret"; |
201 | "\tsigned:%u;\n" | ||
202 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | ||
203 | "\tsigned:%u;\n", | ||
204 | SYSCALL_FIELD(int, nr), | ||
205 | SYSCALL_FIELD(long, ret)); | ||
206 | if (!ret) | ||
207 | return 0; | 182 | return 0; |
183 | } | ||
208 | 184 | ||
209 | return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); | 185 | /* First: called with 0 length to calculate the needed length */ |
186 | len = __set_enter_print_fmt(entry, NULL, 0); | ||
187 | |||
188 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
189 | if (!print_fmt) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | /* Second: actually write the @print_fmt */ | ||
193 | __set_enter_print_fmt(entry, print_fmt, len + 1); | ||
194 | call->print_fmt = print_fmt; | ||
195 | |||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | static void free_syscall_print_fmt(struct ftrace_event_call *call) | ||
200 | { | ||
201 | struct syscall_metadata *entry = call->data; | ||
202 | |||
203 | if (entry->enter_event == call) | ||
204 | kfree(call->print_fmt); | ||
210 | } | 205 | } |
211 | 206 | ||
212 | int syscall_enter_define_fields(struct ftrace_event_call *call) | 207 | int syscall_enter_define_fields(struct ftrace_event_call *call) |
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
386 | { | 381 | { |
387 | int id; | 382 | int id; |
388 | 383 | ||
389 | id = register_ftrace_event(call->event); | 384 | if (set_syscall_print_fmt(call) < 0) |
390 | if (!id) | 385 | return -ENOMEM; |
391 | return -ENODEV; | 386 | |
392 | call->id = id; | 387 | id = trace_event_raw_init(call); |
393 | INIT_LIST_HEAD(&call->fields); | 388 | |
394 | return 0; | 389 | if (id < 0) { |
390 | free_syscall_print_fmt(call); | ||
391 | return id; | ||
392 | } | ||
393 | |||
394 | return id; | ||
395 | } | ||
396 | |||
397 | unsigned long __init arch_syscall_addr(int nr) | ||
398 | { | ||
399 | return (unsigned long)sys_call_table[nr]; | ||
395 | } | 400 | } |
396 | 401 | ||
397 | int __init init_ftrace_syscalls(void) | 402 | int __init init_ftrace_syscalls(void) |
@@ -552,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call) | |||
552 | ret = register_trace_sys_exit(prof_syscall_exit); | 557 | ret = register_trace_sys_exit(prof_syscall_exit); |
553 | if (ret) { | 558 | if (ret) { |
554 | pr_info("event trace: Could not activate" | 559 | pr_info("event trace: Could not activate" |
555 | "syscall entry trace point"); | 560 | "syscall exit trace point"); |
556 | } else { | 561 | } else { |
557 | set_bit(num, enabled_prof_exit_syscalls); | 562 | set_bit(num, enabled_prof_exit_syscalls); |
558 | sys_prof_refcount_exit++; | 563 | sys_prof_refcount_exit++; |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/tsacct_kern.h> | 21 | #include <linux/tsacct_kern.h> |
22 | #include <linux/acct.h> | 22 | #include <linux/acct.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/mm.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * fill in basic accounting fields | 27 | * fill in basic accounting fields |
diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70c..766467b3bcb7 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -56,9 +56,6 @@ struct user_struct root_user = { | |||
56 | .sigpending = ATOMIC_INIT(0), | 56 | .sigpending = ATOMIC_INIT(0), |
57 | .locked_shm = 0, | 57 | .locked_shm = 0, |
58 | .user_ns = &init_user_ns, | 58 | .user_ns = &init_user_ns, |
59 | #ifdef CONFIG_USER_SCHED | ||
60 | .tg = &init_task_group, | ||
61 | #endif | ||
62 | }; | 59 | }; |
63 | 60 | ||
64 | /* | 61 | /* |
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up) | |||
75 | put_user_ns(up->user_ns); | 72 | put_user_ns(up->user_ns); |
76 | } | 73 | } |
77 | 74 | ||
78 | #ifdef CONFIG_USER_SCHED | ||
79 | |||
80 | static void sched_destroy_user(struct user_struct *up) | ||
81 | { | ||
82 | sched_destroy_group(up->tg); | ||
83 | } | ||
84 | |||
85 | static int sched_create_user(struct user_struct *up) | ||
86 | { | ||
87 | int rc = 0; | ||
88 | |||
89 | up->tg = sched_create_group(&root_task_group); | ||
90 | if (IS_ERR(up->tg)) | ||
91 | rc = -ENOMEM; | ||
92 | |||
93 | set_tg_uid(up); | ||
94 | |||
95 | return rc; | ||
96 | } | ||
97 | |||
98 | #else /* CONFIG_USER_SCHED */ | ||
99 | |||
100 | static void sched_destroy_user(struct user_struct *up) { } | ||
101 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
102 | |||
103 | #endif /* CONFIG_USER_SCHED */ | ||
104 | |||
105 | #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) | ||
106 | |||
107 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | ||
108 | { | ||
109 | struct user_struct *user; | ||
110 | struct hlist_node *h; | ||
111 | |||
112 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | ||
113 | if (user->uid == uid) { | ||
114 | /* possibly resurrect an "almost deleted" object */ | ||
115 | if (atomic_inc_return(&user->__count) == 1) | ||
116 | cancel_delayed_work(&user->work); | ||
117 | return user; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | return NULL; | ||
122 | } | ||
123 | |||
124 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ | ||
125 | static DEFINE_MUTEX(uids_mutex); | ||
126 | |||
127 | static inline void uids_mutex_lock(void) | ||
128 | { | ||
129 | mutex_lock(&uids_mutex); | ||
130 | } | ||
131 | |||
132 | static inline void uids_mutex_unlock(void) | ||
133 | { | ||
134 | mutex_unlock(&uids_mutex); | ||
135 | } | ||
136 | |||
137 | /* uid directory attributes */ | ||
138 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
139 | static ssize_t cpu_shares_show(struct kobject *kobj, | ||
140 | struct kobj_attribute *attr, | ||
141 | char *buf) | ||
142 | { | ||
143 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
144 | |||
145 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); | ||
146 | } | ||
147 | |||
148 | static ssize_t cpu_shares_store(struct kobject *kobj, | ||
149 | struct kobj_attribute *attr, | ||
150 | const char *buf, size_t size) | ||
151 | { | ||
152 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
153 | unsigned long shares; | ||
154 | int rc; | ||
155 | |||
156 | sscanf(buf, "%lu", &shares); | ||
157 | |||
158 | rc = sched_group_set_shares(up->tg, shares); | ||
159 | |||
160 | return (rc ? rc : size); | ||
161 | } | ||
162 | |||
163 | static struct kobj_attribute cpu_share_attr = | ||
164 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
165 | #endif | ||
166 | |||
167 | #ifdef CONFIG_RT_GROUP_SCHED | ||
168 | static ssize_t cpu_rt_runtime_show(struct kobject *kobj, | ||
169 | struct kobj_attribute *attr, | ||
170 | char *buf) | ||
171 | { | ||
172 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
173 | |||
174 | return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); | ||
175 | } | ||
176 | |||
177 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | ||
178 | struct kobj_attribute *attr, | ||
179 | const char *buf, size_t size) | ||
180 | { | ||
181 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
182 | unsigned long rt_runtime; | ||
183 | int rc; | ||
184 | |||
185 | sscanf(buf, "%ld", &rt_runtime); | ||
186 | |||
187 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); | ||
188 | |||
189 | return (rc ? rc : size); | ||
190 | } | ||
191 | |||
192 | static struct kobj_attribute cpu_rt_runtime_attr = | ||
193 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | ||
194 | |||
195 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
196 | struct kobj_attribute *attr, | ||
197 | char *buf) | ||
198 | { | ||
199 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
200 | |||
201 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
202 | } | ||
203 | |||
204 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
205 | struct kobj_attribute *attr, | ||
206 | const char *buf, size_t size) | ||
207 | { | ||
208 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
209 | unsigned long rt_period; | ||
210 | int rc; | ||
211 | |||
212 | sscanf(buf, "%lu", &rt_period); | ||
213 | |||
214 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
215 | |||
216 | return (rc ? rc : size); | ||
217 | } | ||
218 | |||
219 | static struct kobj_attribute cpu_rt_period_attr = | ||
220 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
221 | #endif | ||
222 | |||
223 | /* default attributes per uid directory */ | ||
224 | static struct attribute *uids_attributes[] = { | ||
225 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
226 | &cpu_share_attr.attr, | ||
227 | #endif | ||
228 | #ifdef CONFIG_RT_GROUP_SCHED | ||
229 | &cpu_rt_runtime_attr.attr, | ||
230 | &cpu_rt_period_attr.attr, | ||
231 | #endif | ||
232 | NULL | ||
233 | }; | ||
234 | |||
235 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
236 | static void uids_release(struct kobject *kobj) | ||
237 | { | ||
238 | return; | ||
239 | } | ||
240 | |||
241 | static struct kobj_type uids_ktype = { | ||
242 | .sysfs_ops = &kobj_sysfs_ops, | ||
243 | .default_attrs = uids_attributes, | ||
244 | .release = uids_release, | ||
245 | }; | ||
246 | |||
247 | /* | ||
248 | * Create /sys/kernel/uids/<uid>/cpu_share file for this user | ||
249 | * We do not create this file for users in a user namespace (until | ||
250 | * sysfs tagging is implemented). | ||
251 | * | ||
252 | * See Documentation/scheduler/sched-design-CFS.txt for ramifications. | ||
253 | */ | ||
254 | static int uids_user_create(struct user_struct *up) | ||
255 | { | ||
256 | struct kobject *kobj = &up->kobj; | ||
257 | int error; | ||
258 | |||
259 | memset(kobj, 0, sizeof(struct kobject)); | ||
260 | if (up->user_ns != &init_user_ns) | ||
261 | return 0; | ||
262 | kobj->kset = uids_kset; | ||
263 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); | ||
264 | if (error) { | ||
265 | kobject_put(kobj); | ||
266 | goto done; | ||
267 | } | ||
268 | |||
269 | kobject_uevent(kobj, KOBJ_ADD); | ||
270 | done: | ||
271 | return error; | ||
272 | } | ||
273 | |||
274 | /* create these entries in sysfs: | ||
275 | * "/sys/kernel/uids" directory | ||
276 | * "/sys/kernel/uids/0" directory (for root user) | ||
277 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
278 | */ | ||
279 | int __init uids_sysfs_init(void) | ||
280 | { | ||
281 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); | ||
282 | if (!uids_kset) | ||
283 | return -ENOMEM; | ||
284 | |||
285 | return uids_user_create(&root_user); | ||
286 | } | ||
287 | |||
288 | /* delayed work function to remove sysfs directory for a user and free up | ||
289 | * corresponding structures. | ||
290 | */ | ||
291 | static void cleanup_user_struct(struct work_struct *w) | ||
292 | { | ||
293 | struct user_struct *up = container_of(w, struct user_struct, work.work); | ||
294 | unsigned long flags; | ||
295 | int remove_user = 0; | ||
296 | |||
297 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
298 | * atomic. | ||
299 | */ | ||
300 | uids_mutex_lock(); | ||
301 | |||
302 | spin_lock_irqsave(&uidhash_lock, flags); | ||
303 | if (atomic_read(&up->__count) == 0) { | ||
304 | uid_hash_remove(up); | ||
305 | remove_user = 1; | ||
306 | } | ||
307 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
308 | |||
309 | if (!remove_user) | ||
310 | goto done; | ||
311 | |||
312 | if (up->user_ns == &init_user_ns) { | ||
313 | kobject_uevent(&up->kobj, KOBJ_REMOVE); | ||
314 | kobject_del(&up->kobj); | ||
315 | kobject_put(&up->kobj); | ||
316 | } | ||
317 | |||
318 | sched_destroy_user(up); | ||
319 | key_put(up->uid_keyring); | ||
320 | key_put(up->session_keyring); | ||
321 | kmem_cache_free(uid_cachep, up); | ||
322 | |||
323 | done: | ||
324 | uids_mutex_unlock(); | ||
325 | } | ||
326 | |||
327 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
328 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
329 | * upon function exit. | ||
330 | */ | ||
331 | static void free_user(struct user_struct *up, unsigned long flags) | ||
332 | { | ||
333 | INIT_DELAYED_WORK(&up->work, cleanup_user_struct); | ||
334 | schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); | ||
335 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
336 | } | ||
337 | |||
338 | #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ | ||
339 | |||
340 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 75 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) |
341 | { | 76 | { |
342 | struct user_struct *user; | 77 | struct user_struct *user; |
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
352 | return NULL; | 87 | return NULL; |
353 | } | 88 | } |
354 | 89 | ||
355 | int uids_sysfs_init(void) { return 0; } | ||
356 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
357 | static inline void uids_mutex_lock(void) { } | ||
358 | static inline void uids_mutex_unlock(void) { } | ||
359 | |||
360 | /* IRQs are disabled and uidhash_lock is held upon function entry. | 90 | /* IRQs are disabled and uidhash_lock is held upon function entry. |
361 | * IRQ state (as stored in flags) is restored and uidhash_lock released | 91 | * IRQ state (as stored in flags) is restored and uidhash_lock released |
362 | * upon function exit. | 92 | * upon function exit. |
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
365 | { | 95 | { |
366 | uid_hash_remove(up); | 96 | uid_hash_remove(up); |
367 | spin_unlock_irqrestore(&uidhash_lock, flags); | 97 | spin_unlock_irqrestore(&uidhash_lock, flags); |
368 | sched_destroy_user(up); | ||
369 | key_put(up->uid_keyring); | 98 | key_put(up->uid_keyring); |
370 | key_put(up->session_keyring); | 99 | key_put(up->session_keyring); |
371 | kmem_cache_free(uid_cachep, up); | 100 | kmem_cache_free(uid_cachep, up); |
372 | } | 101 | } |
373 | 102 | ||
374 | #endif | ||
375 | |||
376 | #if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) | ||
377 | /* | ||
378 | * We need to check if a setuid can take place. This function should be called | ||
379 | * before successfully completing the setuid. | ||
380 | */ | ||
381 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
382 | { | ||
383 | |||
384 | return sched_rt_can_attach(up->tg, tsk); | ||
385 | |||
386 | } | ||
387 | #else | ||
388 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
389 | { | ||
390 | return 1; | ||
391 | } | ||
392 | #endif | ||
393 | |||
394 | /* | 103 | /* |
395 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 104 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
396 | * caller must undo that ref with free_uid(). | 105 | * caller must undo that ref with free_uid(). |
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
431 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() | 140 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
432 | * atomic. | 141 | * atomic. |
433 | */ | 142 | */ |
434 | uids_mutex_lock(); | ||
435 | |||
436 | spin_lock_irq(&uidhash_lock); | 143 | spin_lock_irq(&uidhash_lock); |
437 | up = uid_hash_find(uid, hashent); | 144 | up = uid_hash_find(uid, hashent); |
438 | spin_unlock_irq(&uidhash_lock); | 145 | spin_unlock_irq(&uidhash_lock); |
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
445 | new->uid = uid; | 152 | new->uid = uid; |
446 | atomic_set(&new->__count, 1); | 153 | atomic_set(&new->__count, 1); |
447 | 154 | ||
448 | if (sched_create_user(new) < 0) | ||
449 | goto out_free_user; | ||
450 | |||
451 | new->user_ns = get_user_ns(ns); | 155 | new->user_ns = get_user_ns(ns); |
452 | 156 | ||
453 | if (uids_user_create(new)) | ||
454 | goto out_destoy_sched; | ||
455 | |||
456 | /* | 157 | /* |
457 | * Before adding this, check whether we raced | 158 | * Before adding this, check whether we raced |
458 | * on adding the same user already.. | 159 | * on adding the same user already.. |
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
475 | spin_unlock_irq(&uidhash_lock); | 176 | spin_unlock_irq(&uidhash_lock); |
476 | } | 177 | } |
477 | 178 | ||
478 | uids_mutex_unlock(); | ||
479 | |||
480 | return up; | 179 | return up; |
481 | 180 | ||
482 | out_destoy_sched: | ||
483 | sched_destroy_user(new); | ||
484 | put_user_ns(new->user_ns); | 181 | put_user_ns(new->user_ns); |
485 | out_free_user: | ||
486 | kmem_cache_free(uid_cachep, new); | 182 | kmem_cache_free(uid_cachep, new); |
487 | out_unlock: | 183 | out_unlock: |
488 | uids_mutex_unlock(); | ||
489 | return NULL; | 184 | return NULL; |
490 | } | 185 | } |
491 | 186 | ||