author     Linus Torvalds <torvalds@linux-foundation.org>   2012-07-26 16:08:01 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-07-26 16:08:01 -0400
commit     79071638ce655c1f78a50d05c7dae0ad04a3e92a
tree       d9e76997c418b78a2485ac50d5970f7d420a5600
parent     44a6b8442190cf213081060b610dae2e822f802b
parent     8323f26ce3425460769605a6aece7a174edaa7d1
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1       2       4        8       16       32       64      128
  |..........................................................................
  | pre        30      41     118      645     3769     6214    12233    14312
  | post      299     603    1211     2418     4697     6847    11606    14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and various
  smaller tweaks."
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Fix race in task_group()
sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
sched: Reset loop counters if all tasks are pinned and we need to redo load balance
sched: Reorder 'struct lb_env' members to reduce its size
sched: Improve scalability via 'CPU buddies', which withstand random perturbations
cpusets: Remove/update outdated comments
cpusets, hotplug: Restructure functions that are invoked during hotplug
cpusets, hotplug: Implement cpuset tree traversal in a helper function
CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
sched/x86: Remove broken power estimation
 arch/x86/kernel/cpu/Makefile |   2
 arch/x86/kernel/cpu/sched.c  |  55
 include/linux/cpuset.h       |   4
 include/linux/init_task.h    |  12
 include/linux/sched.h        |   6
 kernel/cpuset.c              | 130
 kernel/sched/core.c          |  92
 kernel/sched/fair.c          | 113
 kernel/sched/sched.h         |  23
 9 files changed, 291 insertions(+), 146 deletions(-)
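
The headline scalability win comes from the "CPU buddies" change: update_top_cache_domain() now precomputes one idle_buddy CPU per sched_domain level at domain-build time, and select_idle_sibling() only probes those buddies at wakeup instead of scanning every group of the LLC domain. The standalone sketch below models only that lookup shape; the toy_domain struct, the cpu_is_idle[] array and pick_wake_cpu() are invented for illustration, and the affinity and RCU details of the real code are omitted.

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for a sched_domain hierarchy: each level already carries
 * one precomputed "buddy" CPU, as the patch sets up at domain build time. */
struct toy_domain {
        int idle_buddy;                 /* buddy picked when domains were built */
        struct toy_domain *child;       /* next lower (smaller) domain, or NULL */
};

static bool cpu_is_idle[8] = { [2] = true, [4] = true };

/* Wakeup-time lookup: O(levels) buddy probes instead of scanning every
 * group and CPU of the LLC domain, mirroring the reworked
 * select_idle_sibling() fast path. */
static int pick_wake_cpu(struct toy_domain *llc, int target)
{
        struct toy_domain *sd;

        for (sd = llc; sd; sd = sd->child) {
                if (cpu_is_idle[sd->idle_buddy])
                        return sd->idle_buddy;
        }
        return target;                  /* no idle buddy found */
}

int main(void)
{
        struct toy_domain smt  = { .idle_buddy = 4, .child = NULL };
        struct toy_domain core = { .idle_buddy = 2, .child = &smt };

        printf("wake on CPU %d\n", pick_wake_cpu(&core, 0));    /* prints 2 */
        return 0;
}
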
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index bac4c3804cc7..d30a6a9a0121 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
 
 obj-y                   := intel_cacheinfo.o scattered.o topology.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
-obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y                   += vmware.o hypervisor.o mshyperv.o
 obj-y                   += rdrand.o
 obj-y                   += match.o
 
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-        struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-        unsigned long ratio, flags;
-
-        local_irq_save(flags);
-        get_aperfmperf(&val);
-        local_irq_restore(flags);
-
-        ratio = calc_aperfmperf_ratio(old, &val);
-        *old = val;
-
-        return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * do aperf/mperf on the cpu level because it includes things
-         * like turbo mode, which are relevant to full cores.
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return scale_aperfmperf();
-
-        /*
-         * maybe have something cpufreq here
-         */
-
-        return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * aperf/mperf already includes the smt gain
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return SCHED_LOAD_SCALE;
-
-        return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 668f66baac7b..838320fc3d1d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,7 +20,7 @@ extern int number_of_cpusets;   /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
         partition_sched_domains(1, NULL, NULL);
 }
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8a7476186990..89f1cb1056f0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -123,8 +123,17 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+extern struct task_group root_task_group;
+
+#ifdef CONFIG_CGROUP_SCHED
+# define INIT_CGROUP_SCHED(tsk)                                         \
+        .sched_task_group = &root_task_group,
+#else
+# define INIT_CGROUP_SCHED(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)                                          \
         .perf_event_mutex =                                             \
                  __MUTEX_INITIALIZER(tsk.perf_event_mutex),             \
         .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
@@ -161,6 +170,7 @@ extern struct cred init_cred;
         },                                                              \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
         INIT_PUSHABLE_TASKS(tsk)                                        \
+        INIT_CGROUP_SCHED(tsk)                                          \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
         .real_parent    = &tsk,                                         \
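
INIT_CGROUP_SCHED() follows the usual init_task.h idiom: a helper macro that expands either to a designated-initializer fragment or to nothing, so the optional .sched_task_group field only shows up in INIT_TASK() when CONFIG_CGROUP_SCHED is set. A compileable toy version of that pattern, with all names invented for the sketch, looks roughly like this:

#include <stdio.h>

#define CONFIG_FEATURE_X 1      /* flip to 0: the field's initializer disappears */

struct group { const char *name; };
static struct group root_group = { "root" };

struct task {
        int pid;
#if CONFIG_FEATURE_X
        struct group *grp;
#endif
        const char *comm;
};

/* Expands to an initializer fragment or to nothing, the same trick
 * INIT_CGROUP_SCHED() uses for .sched_task_group inside INIT_TASK(). */
#if CONFIG_FEATURE_X
# define INIT_FEATURE_X(tsk)    .grp = &root_group,
#else
# define INIT_FEATURE_X(tsk)
#endif

#define INIT_TASK(tsk)                  \
{                                       \
        .pid    = 0,                    \
        INIT_FEATURE_X(tsk)             \
        .comm   = "swapper",            \
}

static struct task init_task = INIT_TASK(init_task);

int main(void)
{
#if CONFIG_FEATURE_X
        printf("%s in group %s\n", init_task.comm, init_task.grp->name);
#else
        printf("%s\n", init_task.comm);
#endif
        return 0;
}
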
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1a2ebd39b800..a721cef7e2d4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -949,6 +949,7 @@ struct sched_domain {
         unsigned int smt_gain;
         int flags;                      /* See SD_* */
         int level;
+        int idle_buddy;                 /* cpu assigned to select_idle_sibling() */
 
         /* Runtime fields. */
         unsigned long last_balance;     /* init to jiffies. units in jiffies */
@@ -1244,6 +1245,9 @@ struct task_struct {
         const struct sched_class *sched_class;
         struct sched_entity se;
         struct sched_rt_entity rt;
+#ifdef CONFIG_CGROUP_SCHED
+        struct task_group *sched_task_group;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
         /* list of struct preempt_notifier: */
@@ -2721,7 +2725,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
-#endif
+#endif /* CONFIG_CGROUP_SCHED */
 
 extern int task_can_switch_user(struct user_struct *up,
                                         struct task_struct *tsk);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
         CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+        CPUSET_CPU_OFFLINE,
+        CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+        struct cpuset *cp;
+        struct cpuset *child;   /* scans child cpusets of cp */
+        struct cgroup *cont;
+
+        if (list_empty(queue))
+                return NULL;
+
+        cp = list_first_entry(queue, struct cpuset, stack_list);
+        list_del(queue->next);
+        list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                child = cgroup_cs(cont);
+                list_add_tail(&child->stack_list, queue);
+        }
+
+        return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held.  We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'.  But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
         LIST_HEAD(queue);
         struct cpuset *cp;              /* scans cpusets being updated */
-        struct cpuset *child;           /* scans child cpusets of cp */
-        struct cgroup *cont;
         static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
         list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-        while (!list_empty(&queue)) {
-                cp = list_first_entry(&queue, struct cpuset, stack_list);
-                list_del(queue.next);
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                        child = cgroup_cs(cont);
-                        list_add_tail(&child->stack_list, &queue);
-                }
-
-                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-                        continue;
-
-                oldmems = cp->mems_allowed;
-
-                /* Remove offline cpus and mems from this cpuset. */
-                mutex_lock(&callback_mutex);
-                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_active_mask);
-                nodes_and(cp->mems_allowed, cp->mems_allowed,
-                          node_states[N_HIGH_MEMORY]);
-                mutex_unlock(&callback_mutex);
-
-                /* Move tasks from the empty cpuset to a parent */
-                if (cpumask_empty(cp->cpus_allowed) ||
-                    nodes_empty(cp->mems_allowed))
-                        remove_tasks_in_empty_cpuset(cp);
-                else {
-                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
-                }
-        }
+        switch (event) {
+        case CPUSET_CPU_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all cpus online */
+                        if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+                                continue;
+
+                        /* Remove offline cpus from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                                    cpu_active_mask);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (cpumask_empty(cp->cpus_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_cpumask(cp, NULL);
+                }
+                break;
+
+        case CPUSET_MEM_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all mems online */
+                        if (nodes_subset(cp->mems_allowed,
+                                         node_states[N_HIGH_MEMORY]))
+                                continue;
+
+                        oldmems = cp->mems_allowed;
+
+                        /* Remove offline mems from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        nodes_and(cp->mems_allowed, cp->mems_allowed,
+                                  node_states[N_HIGH_MEMORY]);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (nodes_empty(cp->mems_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_nodemask(cp, &oldmems, NULL);
+                }
+        }
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
         mutex_lock(&callback_mutex);
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
         mutex_unlock(&callback_mutex);
-        scan_for_empty_cpusets(&top_cpuset);
+
+        if (!cpu_online)
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
         ndoms = generate_sched_domains(&doms, &attr);
         cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                 unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
         case MEM_OFFLINE:
                 /*
                  * needn't update top_cpuset.mems_allowed explicitly because
-                 * scan_for_empty_cpusets() will update it.
+                 * scan_cpusets_upon_hotplug() will update it.
                  */
-                scan_for_empty_cpusets(&top_cpuset);
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
                 break;
         default:
                 break;
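
The new cpuset_next() helper is a plain queue-driven top-down walk: pop the head of the queue, enqueue its children, and hand the popped node back, so every cpuset is processed before any of its descendants. A minimal userspace sketch of the same traversal order follows; it uses a generic tree and an array-backed queue, not the real cpuset/cgroup structures.

#include <stdio.h>

struct node {
        const char *name;
        struct node *child[2];          /* at most two children in this sketch */
};

/* Same discipline as cpuset_next(): take the queue head, enqueue its
 * children, and return the head to the caller. */
static struct node *next_node(struct node **queue, int *head, int *tail)
{
        struct node *n;
        int i;

        if (*head == *tail)
                return NULL;

        n = queue[(*head)++];
        for (i = 0; i < 2; i++)
                if (n->child[i])
                        queue[(*tail)++] = n->child[i];
        return n;
}

int main(void)
{
        struct node a = { "child-a", { NULL, NULL } };
        struct node b = { "child-b", { NULL, NULL } };
        struct node root = { "root", { &a, &b } };
        struct node *queue[8], *n;
        int head = 0, tail = 0;

        queue[tail++] = &root;
        while ((n = next_node(queue, &head, &tail)) != NULL)
                printf("%s\n", n->name);        /* root, child-a, child-b */
        return 0;
}
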
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..5d011ef4c0df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
  *
  * sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
  *
  * Furthermore, all task_rq users should acquire both locks, see
  * task_rq_lock().
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
         int id = cpu;
 
         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd)
+        if (sd) {
+                struct sched_domain *tmp = sd;
+                struct sched_group *sg, *prev;
+                bool right;
+
+                /*
+                 * Traverse to first CPU in group, and count hops
+                 * to cpu from there, switching direction on each
+                 * hop, never ever pointing the last CPU rightward.
+                 */
+                do {
+                        id = cpumask_first(sched_domain_span(tmp));
+                        prev = sg = tmp->groups;
+                        right = 1;
+
+                        while (cpumask_first(sched_group_cpus(sg)) != id)
+                                sg = sg->next;
+
+                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+                                prev = sg;
+                                sg = sg->next;
+                                right = !right;
+                        }
+
+                        /* A CPU went down, never point back to domain start. */
+                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+                                right = false;
+
+                        sg = right ? sg->next : prev;
+                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+                } while ((tmp = tmp->child));
+
                 id = cpumask_first(sched_domain_span(sd));
+        }
 
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
         mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;     /* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                              void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
+        case CPU_ONLINE_FROZEN:
+        case CPU_DOWN_FAILED_FROZEN:
+
+                /*
+                 * num_cpus_frozen tracks how many CPUs are involved in suspend
+                 * resume sequence. As long as this is not the last online
+                 * operation in the resume sequence, just build a single sched
+                 * domain, ignoring cpusets.
+                 */
+                num_cpus_frozen--;
+                if (likely(num_cpus_frozen)) {
+                        partition_sched_domains(1, NULL, NULL);
+                        break;
+                }
+
+                /*
+                 * This is the last CPU online operation. So fall through and
+                 * restore the original sched domains by considering the
+                 * cpuset configurations.
+                 */
+
         case CPU_ONLINE:
         case CPU_DOWN_FAILED:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(true);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                                void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
         case CPU_DOWN_PREPARE:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(false);
+                break;
+        case CPU_DOWN_PREPARE_FROZEN:
+                num_cpus_frozen++;
+                partition_sched_domains(1, NULL, NULL);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+        struct task_group *tg;
         int on_rq, running;
         unsigned long flags;
         struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
+        tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+                                lockdep_is_held(&tsk->sighand->siglock)),
+                          struct task_group, css);
+        tg = autogroup_task_group(tsk, tg);
+        tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_move_group)
                 tsk->sched_class->task_move_group(tsk, on_rq);
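
The suspend/resume handling above hinges on the num_cpus_frozen counter: every frozen CPU-down bumps it and collapses scheduling to a single domain without touching cpusets, and only the frozen CPU-up that drops it back to zero falls through to rebuild domains from the cpuset configuration. Below is a deliberately simplified model of that counting logic; the event names and print statements are illustrative only, and the real offline path additionally prunes offline CPUs out of the cpusets.

#include <stdio.h>

enum hp_event { ONLINE, ONLINE_FROZEN, DOWN_PREPARE, DOWN_PREPARE_FROZEN };

static int num_cpus_frozen;     /* mirrors the counter added to core.c */

/* Prints which domain-rebuild path each hotplug event would take under
 * the new suspend/resume handling; purely illustrative. */
static void hotplug_event(enum hp_event e)
{
        switch (e) {
        case DOWN_PREPARE_FROZEN:       /* a CPU going down for suspend */
                num_cpus_frozen++;
                printf("suspend: single sched domain, cpusets untouched\n");
                break;
        case ONLINE_FROZEN:             /* a CPU coming back during resume */
                if (--num_cpus_frozen) {
                        printf("resume: still mid-resume, single domain\n");
                        break;
                }
                /* last CPU of the resume sequence: fall through */
        case ONLINE:
        case DOWN_PREPARE:
                printf("rebuild sched domains from the cpuset configuration\n");
                break;
        }
}

int main(void)
{
        hotplug_event(DOWN_PREPARE_FROZEN);     /* suspend: first CPU down */
        hotplug_event(DOWN_PREPARE_FROZEN);     /* suspend: second CPU down */
        hotplug_event(ONLINE_FROZEN);           /* resume: first CPU up */
        hotplug_event(ONLINE_FROZEN);           /* resume: last CPU up -> rebuild */
        return 0;
}
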
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         struct sched_domain *sd;
-        struct sched_group *sg;
-        int i;
 
         /*
          * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
                 return prev_cpu;
 
         /*
-         * Otherwise, iterate the domains and find an elegible idle cpu.
+         * Otherwise, check assigned siblings to find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
+
         for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (!idle_cpu(i))
-                                        goto next;
-                        }
-
-                        target = cpumask_first_and(sched_group_cpus(sg),
-                                        tsk_cpus_allowed(p));
-                        goto done;
-next:
-                        sg = sg->next;
-                } while (sg != sd->groups);
+                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+                        continue;
+                if (idle_cpu(sd->idle_buddy))
+                        return sd->idle_buddy;
         }
-done:
+
         return target;
 }
 
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
         struct sched_domain     *sd;
 
-        int                     src_cpu;
         struct rq               *src_rq;
+        int                     src_cpu;
 
         int                     dst_cpu;
         struct rq               *dst_rq;
 
+        struct cpumask          *dst_grpmask;
+        int                     new_dst_cpu;
         enum cpu_idle_type      idle;
         long                    imbalance;
         unsigned int            flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 3) are cache-hot on their current CPU.
          */
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+                int new_dst_cpu;
+
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+                /*
+                 * Remember if this task can be migrated to any other cpu in
+                 * our sched_group. We may want to revisit it if we couldn't
+                 * meet load balance goals by pulling other tasks on src_cpu.
+                 *
+                 * Also avoid computing new_dst_cpu if we have already computed
+                 * one in current iteration.
+                 */
+                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                        return 0;
+
+                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                                tsk_cpus_allowed(p));
+                if (new_dst_cpu < nr_cpu_ids) {
+                        env->flags |= LBF_SOME_PINNED;
+                        env->new_dst_cpu = new_dst_cpu;
+                }
                 return 0;
         }
+
+        /* Record that we found atleast one task that could run on dst_cpu */
         env->flags &= ~LBF_ALL_PINNED;
 
         if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
 {
-        int ld_moved, active_balance = 0;
+        int ld_moved, cur_ld_moved, active_balance = 0;
+        int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .sd             = sd,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_cpus(sd->groups),
                 .idle           = idle,
                 .loop_break     = sched_nr_migrate_break,
         };
 
         cpumask_copy(cpus, cpu_active_mask);
+        max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
         schedstat_inc(sd, lb_count[idle]);
 
@@ -4267,6 +4281,7 @@ redo:
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
         ld_moved = 0;
+        lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
                 double_rq_lock(this_rq, busiest);
                 if (!env.loop)
                         update_h_load(env.src_cpu);
-                ld_moved += move_tasks(&env);
+
+                /*
+                 * cur_ld_moved - load moved in current iteration
+                 * ld_moved    - cumulative load moved across iterations
+                 */
+                cur_ld_moved = move_tasks(&env);
+                ld_moved += cur_ld_moved;
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);
 
@@ -4296,14 +4317,52 @@ more_balance:
                 /*
                  * some other cpu did the load balance for us.
                  */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
+                if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                        resched_cpu(env.dst_cpu);
+
+                /*
+                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                 * us and move them to an alternate dst_cpu in our sched_group
+                 * where they can run. The upper limit on how many times we
+                 * iterate on same src_cpu is dependent on number of cpus in our
+                 * sched_group.
+                 *
+                 * This changes load balance semantics a bit on who can move
+                 * load to a given_cpu. In addition to the given_cpu itself
+                 * (or a ilb_cpu acting on its behalf where given_cpu is
+                 * nohz-idle), we now have balance_cpu in a position to move
+                 * load to given_cpu. In rare situations, this may cause
+                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                 * _independently_ and at _same_ time to move some load to
+                 * given_cpu) causing exceess load to be moved to given_cpu.
+                 * This however should not happen so much in practice and
+                 * moreover subsequent load balance cycles should correct the
+                 * excess load moved.
+                 */
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                                lb_iterations++ < max_lb_iterations) {
+
+                        this_rq          = cpu_rq(env.new_dst_cpu);
+                        env.dst_rq       = this_rq;
+                        env.dst_cpu      = env.new_dst_cpu;
+                        env.flags       &= ~LBF_SOME_PINNED;
+                        env.loop         = 0;
+                        env.loop_break   = sched_nr_migrate_break;
+                        /*
+                         * Go back to "more_balance" rather than "redo" since we
+                         * need to continue with same src_cpu.
+                         */
+                        goto more_balance;
+                }
 
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
+                        if (!cpumask_empty(cpus)) {
+                                env.loop = 0;
+                                env.loop_break = sched_nr_migrate_break;
                                 goto redo;
+                        }
                         goto out_balanced;
                 }
         }
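
The LBF_SOME_PINNED machinery in load_balance() reduces to a simple retry shape: while pulling work toward dst_cpu, remember one alternate CPU in the same group that a pinned task would accept, and if the imbalance survives the pass, rerun the same source against that alternate, bounded by the group size. The toy program below demonstrates only that shape with plain arrays; every name in it is invented, and none of the kernel's locking, weighting or statistics is modelled.

#include <stdio.h>
#include <stdbool.h>

#define NTASKS 4
#define NCPUS  3

/* allowed[t][c]: may task t run on cpu c (its affinity mask)? */
static const bool allowed[NTASKS][NCPUS] = {
        { true,  false, true  },        /* task 0: cpus 0 and 2 */
        { true,  false, true  },        /* task 1: cpus 0 and 2 */
        { true,  true,  false },        /* task 2: cpus 0 and 1 */
        { true,  true,  false },        /* task 3: cpus 0 and 1 */
};

int main(void)
{
        bool moved_task[NTASKS] = { false };
        int dst = 1, new_dst = -1;      /* first pull toward cpu 1 */
        int moved = 0, want = 3;        /* "imbalance": try to move 3 tasks */
        int iterations = 0, max_iterations = NCPUS;

retry:
        for (int t = 0; t < NTASKS && moved < want; t++) {
                if (moved_task[t])
                        continue;
                if (!allowed[t][dst]) {
                        /* pinned w.r.t. dst: remember another cpu it accepts */
                        if (new_dst < 0)
                                for (int c = 0; c < NCPUS; c++)
                                        if (c != dst && allowed[t][c]) {
                                                new_dst = c;
                                                break;
                                        }
                        continue;
                }
                printf("move task %d -> cpu %d\n", t, dst);
                moved_task[t] = true;
                moved++;
        }

        /* Imbalance left and a remembered alternate: retry the same source. */
        if (moved < want && new_dst >= 0 && ++iterations < max_iterations) {
                dst = new_dst;
                new_dst = -1;
                goto retry;
        }
        return 0;
}
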
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-        struct task_group *tg;
-        struct cgroup_subsys_state *css;
-
-        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                        lockdep_is_held(&p->pi_lock) ||
-                        lockdep_is_held(&task_rq(p)->lock));
-        tg = container_of(css, struct task_group, css);
-
-        return autogroup_task_group(p, tg);
+        return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
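
The task_group() change above is the race fix mentioned in the shortlog: instead of re-deriving the group from cgroup state that the cgroup core updates before ->attach() runs, the scheduler keeps its own copy in task_struct and only updates it from sched_move_task() while holding both pi_lock and rq->lock, so any reader holding either lock sees a stable value. A small userspace sketch of that "cached pointer updated under the reader's lock" pattern follows; the types are toys, one mutex stands in for both kernel locks, and it builds with -pthread.

#include <pthread.h>
#include <stdio.h>

struct toy_group { const char *name; };

struct toy_task {
        pthread_mutex_t rq_lock;                /* stands in for pi_lock + rq->lock */
        struct toy_group *sched_task_group;     /* the cached copy the patch adds */
};

/* task_group() after the patch: just return the cached pointer.
 * Callers are expected to hold rq_lock, like the kernel's rule of
 * holding either pi_lock or rq->lock. */
static struct toy_group *toy_task_group(struct toy_task *t)
{
        return t->sched_task_group;
}

/* sched_move_task() analogue: the only writer, and it writes under the
 * same lock the readers hold, so a reader never sees a half-moved task. */
static void toy_move_task(struct toy_task *t, struct toy_group *new_tg)
{
        pthread_mutex_lock(&t->rq_lock);
        t->sched_task_group = new_tg;
        pthread_mutex_unlock(&t->rq_lock);
}

int main(void)
{
        struct toy_group root = { "root" }, other = { "other" };
        struct toy_task t = { PTHREAD_MUTEX_INITIALIZER, &root };

        pthread_mutex_lock(&t.rq_lock);
        printf("group: %s\n", toy_task_group(&t)->name);        /* root */
        pthread_mutex_unlock(&t.rq_lock);

        toy_move_task(&t, &other);

        pthread_mutex_lock(&t.rq_lock);
        printf("group: %s\n", toy_task_group(&t)->name);        /* other */
        pthread_mutex_unlock(&t.rq_lock);
        return 0;
}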