author     Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 00:21:34 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 00:21:34 -0500
commit     86c6a2fddf0b89b494c7616f2c06cf915c4bff01 (patch)
tree       0e6930c93e5d49ead71b17fcadf0cc9ba28c3d2d
parent     bee2782f30f66898be3f74ad02e4d1f87a969694 (diff)
parent     fd7de1e8d5b2b2b35e71332fafb899f584597150 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

   - 'Nested Sleep Debugging', activated when CONFIG_DEBUG_ATOMIC_SLEEP=y.

     This instruments might_sleep() checks to catch places that nest
     blocking primitives - such as mutex usage in a wait loop. Such bugs
     can result in hard to debug races/hangs.

     Another category of invalid nesting that this facility will detect
     is the calling of blocking functions from within schedule() ->
     sched_submit_work() -> blk_schedule_flush_plug().

     There's some potential for false positives (if secondary blocking
     primitives themselves are not ready yet for this facility), but the
     kernel will warn once about such bugs per bootup, so the warning
     isn't much of a nuisance.

     This feature comes with a number of fixes, for problems uncovered
     with it, so no messages are expected normally.

   - Another round of sched/numa optimizations and refinements, for
     CONFIG_NUMA_BALANCING=y.

   - Another round of sched/dl fixes and refinements.

  Plus various smaller fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched: Add missing rcu protection to wake_up_all_idle_cpus
  sched/deadline: Introduce start_hrtick_dl() for !CONFIG_SCHED_HRTICK
  sched/numa: Init numa balancing fields of init_task
  sched/deadline: Remove unnecessary definitions in cpudeadline.h
  sched/cpupri: Remove unnecessary definitions in cpupri.h
  sched/deadline: Fix rq->dl.pushable_tasks bug in push_dl_task()
  sched/fair: Fix stale overloaded status in the busiest group finding logic
  sched: Move p->nr_cpus_allowed check to select_task_rq()
  sched/completion: Document when to use wait_for_completion_io_*()
  sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC
  sched/fair: Kill task_struct::numa_entry and numa_group::task_list
  sched: Refactor task_struct to use numa_faults instead of numa_* pointers
  sched/deadline: Don't check CONFIG_SMP in switched_from_dl()
  sched/deadline: Reschedule from switched_from_dl() after a successful pull
  sched/deadline: Push task away if the deadline is equal to curr during wakeup
  sched/deadline: Add deadline rq status print
  sched/deadline: Fix artificial overrun introduced by yield_task_dl()
  sched/rt: Clean up check_preempt_equal_prio()
  sched/core: Use dl_bw_of() under rcu_read_lock_sched()
  sched: Check if we got a shallowest_idle_cpu before searching for least_loaded_cpu
  ...
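To make the nested-sleep problem concrete, here is a minimal sketch of the pattern the new instrumentation flags (not code from this merge; my_wq, my_cond and my_lock are illustrative names): a blocking primitive is called after the task state has already been set to TASK_INTERRUPTIBLE, which can silently reset the state to TASK_RUNNING and turn the later schedule() into a missed sleep.

	#include <linux/wait.h>
	#include <linux/mutex.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* illustrative */
	static DEFINE_MUTEX(my_lock);		/* illustrative */
	static bool my_cond;			/* illustrative */

	static int wait_for_cond(void)
	{
		DEFINE_WAIT(wait);
		int ret = 0;

		for (;;) {
			prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
			if (my_cond)
				break;
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Nested blocking primitive: with CONFIG_DEBUG_ATOMIC_SLEEP=y
			 * the might_sleep() inside mutex_lock() now warns, because we
			 * are already in TASK_INTERRUPTIBLE here. */
			mutex_lock(&my_lock);
			/* recheck shared state under the lock */
			mutex_unlock(&my_lock);
			schedule();
		}
		finish_wait(&my_wq, &wait);
		return ret;
	}

Several hunks below (drivers/tty/n_tty.c, fs/notify/inotify/inotify_user.c, kernel/module.c, kernel/audit.c) resolve exactly this kind of loop, either by switching to DEFINE_WAIT_FUNC(wait, woken_wake_function) plus wait_woken(), or by using the wait_event_freezable() family, so the task state is only set inside the helper that actually sleeps.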
-rw-r--r--  arch/x86/include/asm/preempt.h        3
-rw-r--r--  drivers/tty/n_tty.c                  17
-rw-r--r--  fs/notify/inotify/inotify_user.c      9
-rw-r--r--  include/asm-generic/preempt.h         3
-rw-r--r--  include/linux/freezer.h              50
-rw-r--r--  include/linux/init_task.h            10
-rw-r--r--  include/linux/kernel.h                5
-rw-r--r--  include/linux/sched.h                87
-rw-r--r--  include/linux/wait.h                 80
-rw-r--r--  include/net/sock.h                    1
-rw-r--r--  include/trace/events/sched.h          9
-rw-r--r--  include/uapi/linux/sched.h            4
-rw-r--r--  kernel/audit.c                       11
-rw-r--r--  kernel/cpuset.c                      23
-rw-r--r--  kernel/exit.c                         5
-rw-r--r--  kernel/locking/mutex.c                8
-rw-r--r--  kernel/module.c                      30
-rw-r--r--  kernel/sched/completion.c             5
-rw-r--r--  kernel/sched/core.c                 241
-rw-r--r--  kernel/sched/cpudeadline.h            3
-rw-r--r--  kernel/sched/cpupri.h                 3
-rw-r--r--  kernel/sched/deadline.c              99
-rw-r--r--  kernel/sched/debug.c                 11
-rw-r--r--  kernel/sched/fair.c                 354
-rw-r--r--  kernel/sched/rt.c                    17
-rw-r--r--  kernel/sched/sched.h                 43
-rw-r--r--  kernel/sched/wait.c                  66
-rw-r--r--  kernel/smpboot.c                     15
-rw-r--r--  net/bluetooth/rfcomm/core.c          18
-rw-r--r--  net/core/dev.c                       10
-rw-r--r--  net/core/rtnetlink.c                 10
31 files changed, 915 insertions, 335 deletions
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 400873450e33..8f3271842533 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc)
30/* 30/*
31 * must be macros to avoid header recursion hell 31 * must be macros to avoid header recursion hell
32 */ 32 */
33#define task_preempt_count(p) \
34 (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
35
36#define init_task_preempt_count(p) do { \ 33#define init_task_preempt_count(p) do { \
37 task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ 34 task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
38} while (0) 35} while (0)
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 2e900a98c3e3..26f097f60b10 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2123,7 +2123,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
2123{ 2123{
2124 struct n_tty_data *ldata = tty->disc_data; 2124 struct n_tty_data *ldata = tty->disc_data;
2125 unsigned char __user *b = buf; 2125 unsigned char __user *b = buf;
2126 DECLARE_WAITQUEUE(wait, current); 2126 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2127 int c; 2127 int c;
2128 int minimum, time; 2128 int minimum, time;
2129 ssize_t retval = 0; 2129 ssize_t retval = 0;
@@ -2186,10 +2186,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
2186 nr--; 2186 nr--;
2187 break; 2187 break;
2188 } 2188 }
2189 /* This statement must be first before checking for input
2190 so that any interrupt will set the state back to
2191 TASK_RUNNING. */
2192 set_current_state(TASK_INTERRUPTIBLE);
2193 2189
2194 if (((minimum - (b - buf)) < ldata->minimum_to_wake) && 2190 if (((minimum - (b - buf)) < ldata->minimum_to_wake) &&
2195 ((minimum - (b - buf)) >= 1)) 2191 ((minimum - (b - buf)) >= 1))
@@ -2220,13 +2216,13 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
2220 n_tty_set_room(tty); 2216 n_tty_set_room(tty);
2221 up_read(&tty->termios_rwsem); 2217 up_read(&tty->termios_rwsem);
2222 2218
2223 timeout = schedule_timeout(timeout); 2219 timeout = wait_woken(&wait, TASK_INTERRUPTIBLE,
2220 timeout);
2224 2221
2225 down_read(&tty->termios_rwsem); 2222 down_read(&tty->termios_rwsem);
2226 continue; 2223 continue;
2227 } 2224 }
2228 } 2225 }
2229 __set_current_state(TASK_RUNNING);
2230 2226
2231 /* Deal with packet mode. */ 2227 /* Deal with packet mode. */
2232 if (packet && b == buf) { 2228 if (packet && b == buf) {
@@ -2273,7 +2269,6 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
2273 2269
2274 mutex_unlock(&ldata->atomic_read_lock); 2270 mutex_unlock(&ldata->atomic_read_lock);
2275 2271
2276 __set_current_state(TASK_RUNNING);
2277 if (b - buf) 2272 if (b - buf)
2278 retval = b - buf; 2273 retval = b - buf;
2279 2274
@@ -2306,7 +2301,7 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
2306 const unsigned char *buf, size_t nr) 2301 const unsigned char *buf, size_t nr)
2307{ 2302{
2308 const unsigned char *b = buf; 2303 const unsigned char *b = buf;
2309 DECLARE_WAITQUEUE(wait, current); 2304 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2310 int c; 2305 int c;
2311 ssize_t retval = 0; 2306 ssize_t retval = 0;
2312 2307
@@ -2324,7 +2319,6 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
2324 2319
2325 add_wait_queue(&tty->write_wait, &wait); 2320 add_wait_queue(&tty->write_wait, &wait);
2326 while (1) { 2321 while (1) {
2327 set_current_state(TASK_INTERRUPTIBLE);
2328 if (signal_pending(current)) { 2322 if (signal_pending(current)) {
2329 retval = -ERESTARTSYS; 2323 retval = -ERESTARTSYS;
2330 break; 2324 break;
@@ -2378,12 +2372,11 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
2378 } 2372 }
2379 up_read(&tty->termios_rwsem); 2373 up_read(&tty->termios_rwsem);
2380 2374
2381 schedule(); 2375 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2382 2376
2383 down_read(&tty->termios_rwsem); 2377 down_read(&tty->termios_rwsem);
2384 } 2378 }
2385break_out: 2379break_out:
2386 __set_current_state(TASK_RUNNING);
2387 remove_wait_queue(&tty->write_wait, &wait); 2380 remove_wait_queue(&tty->write_wait, &wait);
2388 if (b - buf != nr && tty->fasync) 2381 if (b - buf != nr && tty->fasync)
2389 set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); 2382 set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
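The n_tty conversion above replaces the open-coded set_current_state()/schedule() loops with the new woken-wait helpers. A hedged sketch of the pattern as relied on here, with generic names rather than the driver's (it mirrors the wait_finished_loading() helper added in kernel/module.c further down):

	/*
	 * woken_wake_function() marks the waiter with WQ_FLAG_WOKEN before
	 * waking it, and wait_woken() sleeps only if that flag is not already
	 * set, clearing it afterwards. A wakeup that races with the loop body
	 * (for instance while termios_rwsem is dropped and re-taken) is
	 * therefore not lost, and the body may call blocking primitives
	 * without tripping the nested-sleep debugging, because no task state
	 * is set until wait_woken() itself.
	 */
	static int wait_for_event(wait_queue_head_t *wq, bool (*event)(void))
	{
		DEFINE_WAIT_FUNC(wait, woken_wake_function);
		int ret = 0;

		add_wait_queue(wq, &wait);
		while (!event()) {
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
		}
		remove_wait_queue(wq, &wait);
		return ret;
	}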
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index daf76652fe58..283aa312d745 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -227,14 +227,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
227 struct fsnotify_event *kevent; 227 struct fsnotify_event *kevent;
228 char __user *start; 228 char __user *start;
229 int ret; 229 int ret;
230 DEFINE_WAIT(wait); 230 DEFINE_WAIT_FUNC(wait, woken_wake_function);
231 231
232 start = buf; 232 start = buf;
233 group = file->private_data; 233 group = file->private_data;
234 234
235 add_wait_queue(&group->notification_waitq, &wait);
235 while (1) { 236 while (1) {
236 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
237
238 mutex_lock(&group->notification_mutex); 237 mutex_lock(&group->notification_mutex);
239 kevent = get_one_event(group, count); 238 kevent = get_one_event(group, count);
240 mutex_unlock(&group->notification_mutex); 239 mutex_unlock(&group->notification_mutex);
@@ -264,10 +263,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
264 if (start != buf) 263 if (start != buf)
265 break; 264 break;
266 265
267 schedule(); 266 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
268 } 267 }
268 remove_wait_queue(&group->notification_waitq, &wait);
269 269
270 finish_wait(&group->notification_waitq, &wait);
271 if (start != buf && ret != -EFAULT) 270 if (start != buf && ret != -EFAULT)
272 ret = buf - start; 271 ret = buf - start;
273 return ret; 272 return ret;
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 1cd3f5d767a8..eb6f9e6c3075 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -23,9 +23,6 @@ static __always_inline void preempt_count_set(int pc)
23/* 23/*
24 * must be macros to avoid header recursion hell 24 * must be macros to avoid header recursion hell
25 */ 25 */
26#define task_preempt_count(p) \
27 (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED)
28
29#define init_task_preempt_count(p) do { \ 26#define init_task_preempt_count(p) do { \
30 task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ 27 task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
31} while (0) 28} while (0)
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 7fd81b8c4897..6b7fd9cf5ea2 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -246,15 +246,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
246 * defined in <linux/wait.h> 246 * defined in <linux/wait.h>
247 */ 247 */
248 248
249#define wait_event_freezekillable(wq, condition) \
250({ \
251 int __retval; \
252 freezer_do_not_count(); \
253 __retval = wait_event_killable(wq, (condition)); \
254 freezer_count(); \
255 __retval; \
256})
257
258/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 249/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
259#define wait_event_freezekillable_unsafe(wq, condition) \ 250#define wait_event_freezekillable_unsafe(wq, condition) \
260({ \ 251({ \
@@ -265,35 +256,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
265 __retval; \ 256 __retval; \
266}) 257})
267 258
268#define wait_event_freezable(wq, condition) \
269({ \
270 int __retval; \
271 freezer_do_not_count(); \
272 __retval = wait_event_interruptible(wq, (condition)); \
273 freezer_count(); \
274 __retval; \
275})
276
277#define wait_event_freezable_timeout(wq, condition, timeout) \
278({ \
279 long __retval = timeout; \
280 freezer_do_not_count(); \
281 __retval = wait_event_interruptible_timeout(wq, (condition), \
282 __retval); \
283 freezer_count(); \
284 __retval; \
285})
286
287#define wait_event_freezable_exclusive(wq, condition) \
288({ \
289 int __retval; \
290 freezer_do_not_count(); \
291 __retval = wait_event_interruptible_exclusive(wq, condition); \
292 freezer_count(); \
293 __retval; \
294})
295
296
297#else /* !CONFIG_FREEZER */ 259#else /* !CONFIG_FREEZER */
298static inline bool frozen(struct task_struct *p) { return false; } 260static inline bool frozen(struct task_struct *p) { return false; }
299static inline bool freezing(struct task_struct *p) { return false; } 261static inline bool freezing(struct task_struct *p) { return false; }
@@ -331,18 +293,6 @@ static inline void set_freezable(void) {}
331#define freezable_schedule_hrtimeout_range(expires, delta, mode) \ 293#define freezable_schedule_hrtimeout_range(expires, delta, mode) \
332 schedule_hrtimeout_range(expires, delta, mode) 294 schedule_hrtimeout_range(expires, delta, mode)
333 295
334#define wait_event_freezable(wq, condition) \
335 wait_event_interruptible(wq, condition)
336
337#define wait_event_freezable_timeout(wq, condition, timeout) \
338 wait_event_interruptible_timeout(wq, condition, timeout)
339
340#define wait_event_freezable_exclusive(wq, condition) \
341 wait_event_interruptible_exclusive(wq, condition)
342
343#define wait_event_freezekillable(wq, condition) \
344 wait_event_killable(wq, condition)
345
346#define wait_event_freezekillable_unsafe(wq, condition) \ 296#define wait_event_freezekillable_unsafe(wq, condition) \
347 wait_event_killable(wq, condition) 297 wait_event_killable(wq, condition)
348 298
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d996aef8044f..3037fc085e8e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -166,6 +166,15 @@ extern struct task_group root_task_group;
166# define INIT_RT_MUTEXES(tsk) 166# define INIT_RT_MUTEXES(tsk)
167#endif 167#endif
168 168
169#ifdef CONFIG_NUMA_BALANCING
170# define INIT_NUMA_BALANCING(tsk) \
171 .numa_preferred_nid = -1, \
172 .numa_group = NULL, \
173 .numa_faults = NULL,
174#else
175# define INIT_NUMA_BALANCING(tsk)
176#endif
177
169/* 178/*
170 * INIT_TASK is used to set up the first task table, touch at 179 * INIT_TASK is used to set up the first task table, touch at
171 * your own risk!. Base=0, limit=0x1fffff (=2MB) 180 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -237,6 +246,7 @@ extern struct task_group root_task_group;
237 INIT_CPUSET_SEQ(tsk) \ 246 INIT_CPUSET_SEQ(tsk) \
238 INIT_RT_MUTEXES(tsk) \ 247 INIT_RT_MUTEXES(tsk) \
239 INIT_VTIME(tsk) \ 248 INIT_VTIME(tsk) \
249 INIT_NUMA_BALANCING(tsk) \
240} 250}
241 251
242 252
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3d770f5564b8..446d76a87ba1 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -162,6 +162,7 @@ extern int _cond_resched(void);
162#endif 162#endif
163 163
164#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 164#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
165 void ___might_sleep(const char *file, int line, int preempt_offset);
165 void __might_sleep(const char *file, int line, int preempt_offset); 166 void __might_sleep(const char *file, int line, int preempt_offset);
166/** 167/**
167 * might_sleep - annotation for functions that can sleep 168 * might_sleep - annotation for functions that can sleep
@@ -175,10 +176,14 @@ extern int _cond_resched(void);
175 */ 176 */
176# define might_sleep() \ 177# define might_sleep() \
177 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) 178 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
179# define sched_annotate_sleep() __set_current_state(TASK_RUNNING)
178#else 180#else
181 static inline void ___might_sleep(const char *file, int line,
182 int preempt_offset) { }
179 static inline void __might_sleep(const char *file, int line, 183 static inline void __might_sleep(const char *file, int line,
180 int preempt_offset) { } 184 int preempt_offset) { }
181# define might_sleep() do { might_resched(); } while (0) 185# define might_sleep() do { might_resched(); } while (0)
186# define sched_annotate_sleep() do { } while (0)
182#endif 187#endif
183 188
184#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) 189#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
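The new sched_annotate_sleep() helper is for wait loops where a nested blocking call is known to be benign: under CONFIG_DEBUG_ATOMIC_SLEEP it resets the task state to TASK_RUNNING so the next __might_sleep() check does not warn. A minimal sketch of the intended use, modelled on the net/sock.h and kernel/exit.c hunks below (my_condition, my_timeo and my_lock are illustrative, not quoted from those files):

	/* Caller has already set TASK_INTERRUPTIBLE and may sleep here. */
	if (!my_condition)
		my_timeo = schedule_timeout(my_timeo);
	/* The lock taken below would otherwise trip the nested-sleep check,
	 * since the task state may still be != TASK_RUNNING; the annotation
	 * marks this as a known, intentional sleep. */
	sched_annotate_sleep();
	mutex_lock(&my_lock);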
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 706a9f744909..55f5ee7cc3d3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -243,6 +243,43 @@ extern char ___assert_task_state[1 - 2*!!(
243 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 243 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
244 (task->flags & PF_FROZEN) == 0) 244 (task->flags & PF_FROZEN) == 0)
245 245
246#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
247
248#define __set_task_state(tsk, state_value) \
249 do { \
250 (tsk)->task_state_change = _THIS_IP_; \
251 (tsk)->state = (state_value); \
252 } while (0)
253#define set_task_state(tsk, state_value) \
254 do { \
255 (tsk)->task_state_change = _THIS_IP_; \
256 set_mb((tsk)->state, (state_value)); \
257 } while (0)
258
259/*
260 * set_current_state() includes a barrier so that the write of current->state
261 * is correctly serialised wrt the caller's subsequent test of whether to
262 * actually sleep:
263 *
264 * set_current_state(TASK_UNINTERRUPTIBLE);
265 * if (do_i_need_to_sleep())
266 * schedule();
267 *
268 * If the caller does not need such serialisation then use __set_current_state()
269 */
270#define __set_current_state(state_value) \
271 do { \
272 current->task_state_change = _THIS_IP_; \
273 current->state = (state_value); \
274 } while (0)
275#define set_current_state(state_value) \
276 do { \
277 current->task_state_change = _THIS_IP_; \
278 set_mb(current->state, (state_value)); \
279 } while (0)
280
281#else
282
246#define __set_task_state(tsk, state_value) \ 283#define __set_task_state(tsk, state_value) \
247 do { (tsk)->state = (state_value); } while (0) 284 do { (tsk)->state = (state_value); } while (0)
248#define set_task_state(tsk, state_value) \ 285#define set_task_state(tsk, state_value) \
@@ -259,11 +296,13 @@ extern char ___assert_task_state[1 - 2*!!(
259 * 296 *
260 * If the caller does not need such serialisation then use __set_current_state() 297 * If the caller does not need such serialisation then use __set_current_state()
261 */ 298 */
262#define __set_current_state(state_value) \ 299#define __set_current_state(state_value) \
263 do { current->state = (state_value); } while (0) 300 do { current->state = (state_value); } while (0)
264#define set_current_state(state_value) \ 301#define set_current_state(state_value) \
265 set_mb(current->state, (state_value)) 302 set_mb(current->state, (state_value))
266 303
304#endif
305
267/* Task command name length */ 306/* Task command name length */
268#define TASK_COMM_LEN 16 307#define TASK_COMM_LEN 16
269 308
@@ -1558,28 +1597,23 @@ struct task_struct {
1558 struct numa_group *numa_group; 1597 struct numa_group *numa_group;
1559 1598
1560 /* 1599 /*
1561 * Exponential decaying average of faults on a per-node basis. 1600 * numa_faults is an array split into four regions:
1562 * Scheduling placement decisions are made based on the these counts. 1601 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
1563 * The values remain static for the duration of a PTE scan 1602 * in this precise order.
1603 *
1604 * faults_memory: Exponential decaying average of faults on a per-node
1605 * basis. Scheduling placement decisions are made based on these
1606 * counts. The values remain static for the duration of a PTE scan.
1607 * faults_cpu: Track the nodes the process was running on when a NUMA
1608 * hinting fault was incurred.
1609 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
1610 * during the current scan window. When the scan completes, the counts
1611 * in faults_memory and faults_cpu decay and these values are copied.
1564 */ 1612 */
1565 unsigned long *numa_faults_memory; 1613 unsigned long *numa_faults;
1566 unsigned long total_numa_faults; 1614 unsigned long total_numa_faults;
1567 1615
1568 /* 1616 /*
1569 * numa_faults_buffer records faults per node during the current
1570 * scan window. When the scan completes, the counts in
1571 * numa_faults_memory decay and these values are copied.
1572 */
1573 unsigned long *numa_faults_buffer_memory;
1574
1575 /*
1576 * Track the nodes the process was running on when a NUMA hinting
1577 * fault was incurred.
1578 */
1579 unsigned long *numa_faults_cpu;
1580 unsigned long *numa_faults_buffer_cpu;
1581
1582 /*
1583 * numa_faults_locality tracks if faults recorded during the last 1617 * numa_faults_locality tracks if faults recorded during the last
1584 * scan window were remote/local. The task scan period is adapted 1618 * scan window were remote/local. The task scan period is adapted
1585 * based on the locality of the faults with different weights 1619 * based on the locality of the faults with different weights
@@ -1661,6 +1695,9 @@ struct task_struct {
1661 unsigned int sequential_io; 1695 unsigned int sequential_io;
1662 unsigned int sequential_io_avg; 1696 unsigned int sequential_io_avg;
1663#endif 1697#endif
1698#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1699 unsigned long task_state_change;
1700#endif
1664}; 1701};
1665 1702
1666/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1703/* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2052,6 +2089,10 @@ static inline void tsk_restore_flags(struct task_struct *task,
2052 task->flags |= orig_flags & flags; 2089 task->flags |= orig_flags & flags;
2053} 2090}
2054 2091
2092extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
2093 const struct cpumask *trial);
2094extern int task_can_attach(struct task_struct *p,
2095 const struct cpumask *cs_cpus_allowed);
2055#ifdef CONFIG_SMP 2096#ifdef CONFIG_SMP
2056extern void do_set_cpus_allowed(struct task_struct *p, 2097extern void do_set_cpus_allowed(struct task_struct *p,
2057 const struct cpumask *new_mask); 2098 const struct cpumask *new_mask);
@@ -2760,7 +2801,7 @@ static inline int signal_pending_state(long state, struct task_struct *p)
2760extern int _cond_resched(void); 2801extern int _cond_resched(void);
2761 2802
2762#define cond_resched() ({ \ 2803#define cond_resched() ({ \
2763 __might_sleep(__FILE__, __LINE__, 0); \ 2804 ___might_sleep(__FILE__, __LINE__, 0); \
2764 _cond_resched(); \ 2805 _cond_resched(); \
2765}) 2806})
2766 2807
@@ -2773,14 +2814,14 @@ extern int __cond_resched_lock(spinlock_t *lock);
2773#endif 2814#endif
2774 2815
2775#define cond_resched_lock(lock) ({ \ 2816#define cond_resched_lock(lock) ({ \
2776 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ 2817 ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
2777 __cond_resched_lock(lock); \ 2818 __cond_resched_lock(lock); \
2778}) 2819})
2779 2820
2780extern int __cond_resched_softirq(void); 2821extern int __cond_resched_softirq(void);
2781 2822
2782#define cond_resched_softirq() ({ \ 2823#define cond_resched_softirq() ({ \
2783 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ 2824 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2784 __cond_resched_softirq(); \ 2825 __cond_resched_softirq(); \
2785}) 2826})
2786 2827
diff --git a/include/linux/wait.h b/include/linux/wait.h
index e4a8eb9312ea..2232ed16635a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -13,9 +13,12 @@ typedef struct __wait_queue wait_queue_t;
13typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); 13typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
14int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); 14int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
15 15
16/* __wait_queue::flags */
17#define WQ_FLAG_EXCLUSIVE 0x01
18#define WQ_FLAG_WOKEN 0x02
19
16struct __wait_queue { 20struct __wait_queue {
17 unsigned int flags; 21 unsigned int flags;
18#define WQ_FLAG_EXCLUSIVE 0x01
19 void *private; 22 void *private;
20 wait_queue_func_t func; 23 wait_queue_func_t func;
21 struct list_head task_list; 24 struct list_head task_list;
@@ -258,11 +261,37 @@ __out: __ret; \
258 */ 261 */
259#define wait_event(wq, condition) \ 262#define wait_event(wq, condition) \
260do { \ 263do { \
264 might_sleep(); \
261 if (condition) \ 265 if (condition) \
262 break; \ 266 break; \
263 __wait_event(wq, condition); \ 267 __wait_event(wq, condition); \
264} while (0) 268} while (0)
265 269
270#define __wait_event_freezable(wq, condition) \
271 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
272 schedule(); try_to_freeze())
273
274/**
275 * wait_event - sleep (or freeze) until a condition gets true
276 * @wq: the waitqueue to wait on
277 * @condition: a C expression for the event to wait for
278 *
279 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
280 * to system load) until the @condition evaluates to true. The
281 * @condition is checked each time the waitqueue @wq is woken up.
282 *
283 * wake_up() has to be called after changing any variable that could
284 * change the result of the wait condition.
285 */
286#define wait_event_freezable(wq, condition) \
287({ \
288 int __ret = 0; \
289 might_sleep(); \
290 if (!(condition)) \
291 __ret = __wait_event_freezable(wq, condition); \
292 __ret; \
293})
294
266#define __wait_event_timeout(wq, condition, timeout) \ 295#define __wait_event_timeout(wq, condition, timeout) \
267 ___wait_event(wq, ___wait_cond_timeout(condition), \ 296 ___wait_event(wq, ___wait_cond_timeout(condition), \
268 TASK_UNINTERRUPTIBLE, 0, timeout, \ 297 TASK_UNINTERRUPTIBLE, 0, timeout, \
@@ -290,11 +319,30 @@ do { \
290#define wait_event_timeout(wq, condition, timeout) \ 319#define wait_event_timeout(wq, condition, timeout) \
291({ \ 320({ \
292 long __ret = timeout; \ 321 long __ret = timeout; \
322 might_sleep(); \
293 if (!___wait_cond_timeout(condition)) \ 323 if (!___wait_cond_timeout(condition)) \
294 __ret = __wait_event_timeout(wq, condition, timeout); \ 324 __ret = __wait_event_timeout(wq, condition, timeout); \
295 __ret; \ 325 __ret; \
296}) 326})
297 327
328#define __wait_event_freezable_timeout(wq, condition, timeout) \
329 ___wait_event(wq, ___wait_cond_timeout(condition), \
330 TASK_INTERRUPTIBLE, 0, timeout, \
331 __ret = schedule_timeout(__ret); try_to_freeze())
332
333/*
334 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
335 * increasing load and is freezable.
336 */
337#define wait_event_freezable_timeout(wq, condition, timeout) \
338({ \
339 long __ret = timeout; \
340 might_sleep(); \
341 if (!___wait_cond_timeout(condition)) \
342 __ret = __wait_event_freezable_timeout(wq, condition, timeout); \
343 __ret; \
344})
345
298#define __wait_event_cmd(wq, condition, cmd1, cmd2) \ 346#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
299 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ 347 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
300 cmd1; schedule(); cmd2) 348 cmd1; schedule(); cmd2)
@@ -315,6 +363,7 @@ do { \
315 */ 363 */
316#define wait_event_cmd(wq, condition, cmd1, cmd2) \ 364#define wait_event_cmd(wq, condition, cmd1, cmd2) \
317do { \ 365do { \
366 might_sleep(); \
318 if (condition) \ 367 if (condition) \
319 break; \ 368 break; \
320 __wait_event_cmd(wq, condition, cmd1, cmd2); \ 369 __wait_event_cmd(wq, condition, cmd1, cmd2); \
@@ -342,6 +391,7 @@ do { \
342#define wait_event_interruptible(wq, condition) \ 391#define wait_event_interruptible(wq, condition) \
343({ \ 392({ \
344 int __ret = 0; \ 393 int __ret = 0; \
394 might_sleep(); \
345 if (!(condition)) \ 395 if (!(condition)) \
346 __ret = __wait_event_interruptible(wq, condition); \ 396 __ret = __wait_event_interruptible(wq, condition); \
347 __ret; \ 397 __ret; \
@@ -375,6 +425,7 @@ do { \
375#define wait_event_interruptible_timeout(wq, condition, timeout) \ 425#define wait_event_interruptible_timeout(wq, condition, timeout) \
376({ \ 426({ \
377 long __ret = timeout; \ 427 long __ret = timeout; \
428 might_sleep(); \
378 if (!___wait_cond_timeout(condition)) \ 429 if (!___wait_cond_timeout(condition)) \
379 __ret = __wait_event_interruptible_timeout(wq, \ 430 __ret = __wait_event_interruptible_timeout(wq, \
380 condition, timeout); \ 431 condition, timeout); \
@@ -425,6 +476,7 @@ do { \
425#define wait_event_hrtimeout(wq, condition, timeout) \ 476#define wait_event_hrtimeout(wq, condition, timeout) \
426({ \ 477({ \
427 int __ret = 0; \ 478 int __ret = 0; \
479 might_sleep(); \
428 if (!(condition)) \ 480 if (!(condition)) \
429 __ret = __wait_event_hrtimeout(wq, condition, timeout, \ 481 __ret = __wait_event_hrtimeout(wq, condition, timeout, \
430 TASK_UNINTERRUPTIBLE); \ 482 TASK_UNINTERRUPTIBLE); \
@@ -450,6 +502,7 @@ do { \
450#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ 502#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
451({ \ 503({ \
452 long __ret = 0; \ 504 long __ret = 0; \
505 might_sleep(); \
453 if (!(condition)) \ 506 if (!(condition)) \
454 __ret = __wait_event_hrtimeout(wq, condition, timeout, \ 507 __ret = __wait_event_hrtimeout(wq, condition, timeout, \
455 TASK_INTERRUPTIBLE); \ 508 TASK_INTERRUPTIBLE); \
@@ -463,12 +516,27 @@ do { \
463#define wait_event_interruptible_exclusive(wq, condition) \ 516#define wait_event_interruptible_exclusive(wq, condition) \
464({ \ 517({ \
465 int __ret = 0; \ 518 int __ret = 0; \
519 might_sleep(); \
466 if (!(condition)) \ 520 if (!(condition)) \
467 __ret = __wait_event_interruptible_exclusive(wq, condition);\ 521 __ret = __wait_event_interruptible_exclusive(wq, condition);\
468 __ret; \ 522 __ret; \
469}) 523})
470 524
471 525
526#define __wait_event_freezable_exclusive(wq, condition) \
527 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
528 schedule(); try_to_freeze())
529
530#define wait_event_freezable_exclusive(wq, condition) \
531({ \
532 int __ret = 0; \
533 might_sleep(); \
534 if (!(condition)) \
535 __ret = __wait_event_freezable_exclusive(wq, condition);\
536 __ret; \
537})
538
539
472#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ 540#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \
473({ \ 541({ \
474 int __ret = 0; \ 542 int __ret = 0; \
@@ -637,6 +705,7 @@ do { \
637#define wait_event_killable(wq, condition) \ 705#define wait_event_killable(wq, condition) \
638({ \ 706({ \
639 int __ret = 0; \ 707 int __ret = 0; \
708 might_sleep(); \
640 if (!(condition)) \ 709 if (!(condition)) \
641 __ret = __wait_event_killable(wq, condition); \ 710 __ret = __wait_event_killable(wq, condition); \
642 __ret; \ 711 __ret; \
@@ -830,6 +899,8 @@ void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int sta
830long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); 899long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
831void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); 900void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
832void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); 901void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
902long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
903int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
833int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 904int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
834int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 905int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
835 906
@@ -886,6 +957,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
886static inline int 957static inline int
887wait_on_bit(void *word, int bit, unsigned mode) 958wait_on_bit(void *word, int bit, unsigned mode)
888{ 959{
960 might_sleep();
889 if (!test_bit(bit, word)) 961 if (!test_bit(bit, word))
890 return 0; 962 return 0;
891 return out_of_line_wait_on_bit(word, bit, 963 return out_of_line_wait_on_bit(word, bit,
@@ -910,6 +982,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
910static inline int 982static inline int
911wait_on_bit_io(void *word, int bit, unsigned mode) 983wait_on_bit_io(void *word, int bit, unsigned mode)
912{ 984{
985 might_sleep();
913 if (!test_bit(bit, word)) 986 if (!test_bit(bit, word))
914 return 0; 987 return 0;
915 return out_of_line_wait_on_bit(word, bit, 988 return out_of_line_wait_on_bit(word, bit,
@@ -936,6 +1009,7 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
936static inline int 1009static inline int
937wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1010wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
938{ 1011{
1012 might_sleep();
939 if (!test_bit(bit, word)) 1013 if (!test_bit(bit, word))
940 return 0; 1014 return 0;
941 return out_of_line_wait_on_bit(word, bit, action, mode); 1015 return out_of_line_wait_on_bit(word, bit, action, mode);
@@ -963,6 +1037,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
963static inline int 1037static inline int
964wait_on_bit_lock(void *word, int bit, unsigned mode) 1038wait_on_bit_lock(void *word, int bit, unsigned mode)
965{ 1039{
1040 might_sleep();
966 if (!test_and_set_bit(bit, word)) 1041 if (!test_and_set_bit(bit, word))
967 return 0; 1042 return 0;
968 return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); 1043 return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
@@ -986,6 +1061,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
986static inline int 1061static inline int
987wait_on_bit_lock_io(void *word, int bit, unsigned mode) 1062wait_on_bit_lock_io(void *word, int bit, unsigned mode)
988{ 1063{
1064 might_sleep();
989 if (!test_and_set_bit(bit, word)) 1065 if (!test_and_set_bit(bit, word))
990 return 0; 1066 return 0;
991 return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); 1067 return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
@@ -1011,6 +1087,7 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
1011static inline int 1087static inline int
1012wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) 1088wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
1013{ 1089{
1090 might_sleep();
1014 if (!test_and_set_bit(bit, word)) 1091 if (!test_and_set_bit(bit, word))
1015 return 0; 1092 return 0;
1016 return out_of_line_wait_on_bit_lock(word, bit, action, mode); 1093 return out_of_line_wait_on_bit_lock(word, bit, action, mode);
@@ -1029,6 +1106,7 @@ wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned
1029static inline 1106static inline
1030int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) 1107int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
1031{ 1108{
1109 might_sleep();
1032 if (atomic_read(val) == 0) 1110 if (atomic_read(val) == 0)
1033 return 0; 1111 return 0;
1034 return out_of_line_wait_on_atomic_t(val, action, mode); 1112 return out_of_line_wait_on_atomic_t(val, action, mode);
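The freezable wait macros dropped from <linux/freezer.h> above reappear here, rebuilt on ___wait_event() with try_to_freeze() run after every wakeup, and the wait_event*() family now calls might_sleep() up front so misuse is caught even when the condition is already true. Usage stays a one-liner; for example, the kernel/audit.c hunk further down converts its kthread loop to (names taken from that hunk):

	/* sleep TASK_INTERRUPTIBLE until the queue is non-empty, freezing
	 * cooperatively after each wakeup */
	wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));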
diff --git a/include/net/sock.h b/include/net/sock.h
index 7db3db112baa..e6f235ebf6c9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -897,6 +897,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
897 if (!__rc) { \ 897 if (!__rc) { \
898 *(__timeo) = schedule_timeout(*(__timeo)); \ 898 *(__timeo) = schedule_timeout(*(__timeo)); \
899 } \ 899 } \
900 sched_annotate_sleep(); \
900 lock_sock(__sk); \ 901 lock_sock(__sk); \
901 __rc = __condition; \ 902 __rc = __condition; \
902 __rc; \ 903 __rc; \
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 0a68d5ae584e..30fedaf3e56a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -97,16 +97,19 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
97 long state = p->state; 97 long state = p->state;
98 98
99#ifdef CONFIG_PREEMPT 99#ifdef CONFIG_PREEMPT
100#ifdef CONFIG_SCHED_DEBUG
101 BUG_ON(p != current);
102#endif /* CONFIG_SCHED_DEBUG */
100 /* 103 /*
101 * For all intents and purposes a preempted task is a running task. 104 * For all intents and purposes a preempted task is a running task.
102 */ 105 */
103 if (task_preempt_count(p) & PREEMPT_ACTIVE) 106 if (preempt_count() & PREEMPT_ACTIVE)
104 state = TASK_RUNNING | TASK_STATE_MAX; 107 state = TASK_RUNNING | TASK_STATE_MAX;
105#endif 108#endif /* CONFIG_PREEMPT */
106 109
107 return state; 110 return state;
108} 111}
109#endif 112#endif /* CREATE_TRACE_POINTS */
110 113
111/* 114/*
112 * Tracepoint for task switches, performed by the scheduler: 115 * Tracepoint for task switches, performed by the scheduler:
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b932be9f5c5b..cc89ddefa926 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -23,8 +23,8 @@
23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) 24/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
25 and is now available for re-use. */ 25 and is now available for re-use. */
26#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 26#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
27#define CLONE_NEWIPC 0x08000000 /* New ipcs */ 27#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
28#define CLONE_NEWUSER 0x10000000 /* New user namespace */ 28#define CLONE_NEWUSER 0x10000000 /* New user namespace */
29#define CLONE_NEWPID 0x20000000 /* New pid namespace */ 29#define CLONE_NEWPID 0x20000000 /* New pid namespace */
30#define CLONE_NEWNET 0x40000000 /* New network namespace */ 30#define CLONE_NEWNET 0x40000000 /* New network namespace */
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
499 set_freezable(); 499 set_freezable();
500 while (!kthread_should_stop()) { 500 while (!kthread_should_stop()) {
501 struct sk_buff *skb; 501 struct sk_buff *skb;
502 DECLARE_WAITQUEUE(wait, current);
503 502
504 flush_hold_queue(); 503 flush_hold_queue();
505 504
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
514 audit_printk_skb(skb); 513 audit_printk_skb(skb);
515 continue; 514 continue;
516 } 515 }
517 set_current_state(TASK_INTERRUPTIBLE);
518 add_wait_queue(&kauditd_wait, &wait);
519 516
520 if (!skb_queue_len(&audit_skb_queue)) { 517 wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
521 try_to_freeze();
522 schedule();
523 }
524
525 __set_current_state(TASK_RUNNING);
526 remove_wait_queue(&kauditd_wait, &wait);
527 } 518 }
528 return 0; 519 return 0;
529} 520}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
506 goto out; 506 goto out;
507 } 507 }
508 508
509 /*
510 * We can't shrink if we won't have enough room for SCHED_DEADLINE
511 * tasks.
512 */
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
509 ret = 0; 519 ret = 0;
510out: 520out:
511 rcu_read_unlock(); 521 rcu_read_unlock();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1429 goto out_unlock; 1439 goto out_unlock;
1430 1440
1431 cgroup_taskset_for_each(task, tset) { 1441 cgroup_taskset_for_each(task, tset) {
1432 /* 1442 ret = task_can_attach(task, cs->cpus_allowed);
1433 * Kthreads which disallow setaffinity shouldn't be moved 1443 if (ret)
1434 * to a new cpuset; we don't want to change their cpu
1435 * affinity and isolating such threads by their set of
1436 * allowed nodes is unnecessary. Thus, cpusets are not
1437 * applicable for such threads. This prevents checking for
1438 * success of set_cpus_allowed_ptr() on all attached tasks
1439 * before cpus_allowed may be changed.
1440 */
1441 ret = -EINVAL;
1442 if (task->flags & PF_NO_SETAFFINITY)
1443 goto out_unlock; 1444 goto out_unlock;
1444 ret = security_task_setscheduler(task); 1445 ret = security_task_setscheduler(task);
1445 if (ret) 1446 if (ret)
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..232c4bc8bcc9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
997 997
998 get_task_struct(p); 998 get_task_struct(p);
999 read_unlock(&tasklist_lock); 999 read_unlock(&tasklist_lock);
1000 sched_annotate_sleep();
1001
1000 if ((exit_code & 0x7f) == 0) { 1002 if ((exit_code & 0x7f) == 0) {
1001 why = CLD_EXITED; 1003 why = CLD_EXITED;
1002 status = exit_code >> 8; 1004 status = exit_code >> 8;
@@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1079 * thread can reap it because we its state == DEAD/TRACE. 1081 * thread can reap it because we its state == DEAD/TRACE.
1080 */ 1082 */
1081 read_unlock(&tasklist_lock); 1083 read_unlock(&tasklist_lock);
1084 sched_annotate_sleep();
1082 1085
1083 retval = wo->wo_rusage 1086 retval = wo->wo_rusage
1084 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1087 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
@@ -1210,6 +1213,7 @@ unlock_sig:
1210 pid = task_pid_vnr(p); 1213 pid = task_pid_vnr(p);
1211 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1214 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1212 read_unlock(&tasklist_lock); 1215 read_unlock(&tasklist_lock);
1216 sched_annotate_sleep();
1213 1217
1214 if (unlikely(wo->wo_flags & WNOWAIT)) 1218 if (unlikely(wo->wo_flags & WNOWAIT))
1215 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1219 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1272 pid = task_pid_vnr(p); 1276 pid = task_pid_vnr(p);
1273 get_task_struct(p); 1277 get_task_struct(p);
1274 read_unlock(&tasklist_lock); 1278 read_unlock(&tasklist_lock);
1279 sched_annotate_sleep();
1275 1280
1276 if (!wo->wo_info) { 1281 if (!wo->wo_info) {
1277 retval = wo->wo_rusage 1282 retval = wo->wo_rusage
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
378 * reschedule now, before we try-lock the mutex. This avoids getting 378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex. 379 * scheduled out right after we obtained the mutex.
380 */ 380 */
381 if (need_resched()) 381 if (need_resched()) {
382 /*
383 * We _should_ have TASK_RUNNING here, but just in case
384 * we do not, make it so, otherwise we might get stuck.
385 */
386 __set_current_state(TASK_RUNNING);
382 schedule_preempt_disabled(); 387 schedule_preempt_disabled();
388 }
383 389
384 return false; 390 return false;
385} 391}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void)
3097} 3097}
3098 3098
3099/* 3099/*
3100 * Can't use wait_event_interruptible() because our condition
3101 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3102 */
3103static int wait_finished_loading(struct module *mod)
3104{
3105 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3106 int ret = 0;
3107
3108 add_wait_queue(&module_wq, &wait);
3109 for (;;) {
3110 if (finished_loading(mod->name))
3111 break;
3112
3113 if (signal_pending(current)) {
3114 ret = -ERESTARTSYS;
3115 break;
3116 }
3117
3118 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3119 }
3120 remove_wait_queue(&module_wq, &wait);
3121
3122 return ret;
3123}
3124
3125/*
3100 * We try to place it in the list now to make sure it's unique before 3126 * We try to place it in the list now to make sure it's unique before
3101 * we dedicate too many resources. In particular, temporary percpu 3127 * we dedicate too many resources. In particular, temporary percpu
3102 * memory exhaustion. 3128 * memory exhaustion.
@@ -3116,8 +3142,8 @@ again:
3116 || old->state == MODULE_STATE_UNFORMED) { 3142 || old->state == MODULE_STATE_UNFORMED) {
3117 /* Wait in case it fails to load. */ 3143 /* Wait in case it fails to load. */
3118 mutex_unlock(&module_mutex); 3144 mutex_unlock(&module_mutex);
3119 err = wait_event_interruptible(module_wq, 3145
3120 finished_loading(mod->name)); 3146 err = wait_finished_loading(mod);
3121 if (err) 3147 if (err)
3122 goto out_unlocked; 3148 goto out_unlocked;
3123 goto again; 3149 goto again;
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
148 * 148 *
149 * This waits to be signaled for completion of a specific task. It is NOT 149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting 150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO. 151 * for IO (which traditionally means blkio only).
152 */ 152 */
153void __sched wait_for_completion_io(struct completion *x) 153void __sched wait_for_completion_io(struct completion *x)
154{ 154{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
163 * 163 *
164 * This waits for either a completion of a specific task to be signaled or for a 164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not 165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO. 166 * interruptible. The caller is accounted as waiting for IO (which traditionally
167 * means blkio only).
167 * 168 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 169 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed. 170 * till timeout) if completed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e67a6e88e125..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
1008 return cpu_curr(task_cpu(p)) == p; 1008 return cpu_curr(task_cpu(p)) == p;
1009} 1009}
1010 1010
1011/*
1012 * Can drop rq->lock because from sched_class::switched_from() methods drop it.
1013 */
1011static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1014static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1012 const struct sched_class *prev_class, 1015 const struct sched_class *prev_class,
1013 int oldprio) 1016 int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1015 if (prev_class != p->sched_class) { 1018 if (prev_class != p->sched_class) {
1016 if (prev_class->switched_from) 1019 if (prev_class->switched_from)
1017 prev_class->switched_from(rq, p); 1020 prev_class->switched_from(rq, p);
1021 /* Possble rq->lock 'hole'. */
1018 p->sched_class->switched_to(rq, p); 1022 p->sched_class->switched_to(rq, p);
1019 } else if (oldprio != p->prio || dl_task(p)) 1023 } else if (oldprio != p->prio || dl_task(p))
1020 p->sched_class->prio_changed(rq, p, oldprio); 1024 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1054 * ttwu() will sort out the placement. 1058 * ttwu() will sort out the placement.
1055 */ 1059 */
1056 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1060 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1057 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1061 !p->on_rq);
1058 1062
1059#ifdef CONFIG_LOCKDEP 1063#ifdef CONFIG_LOCKDEP
1060 /* 1064 /*
@@ -1407,7 +1411,8 @@ out:
1407static inline 1411static inline
1408int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1412int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1409{ 1413{
1410 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1414 if (p->nr_cpus_allowed > 1)
1415 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1411 1416
1412 /* 1417 /*
1413 * In order not to call set_task_cpu() on a blocking task we need 1418 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
1623 struct rq *rq = cpu_rq(cpu); 1628 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags; 1629 unsigned long flags;
1625 1630
1626 if (!is_idle_task(rq->curr)) 1631 rcu_read_lock();
1627 return; 1632
1633 if (!is_idle_task(rcu_dereference(rq->curr)))
1634 goto out;
1628 1635
1629 if (set_nr_if_polling(rq->idle)) { 1636 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu); 1637 trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
1635 /* Else cpu is not in idle, do nothing here */ 1642 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags); 1643 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 } 1644 }
1645
1646out:
1647 rcu_read_unlock();
1638} 1648}
1639 1649
1640bool cpus_share_cache(int this_cpu, int that_cpu) 1650bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1853 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1863 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1854 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1864 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1855 p->numa_work.next = &p->numa_work; 1865 p->numa_work.next = &p->numa_work;
1856 p->numa_faults_memory = NULL; 1866 p->numa_faults = NULL;
1857 p->numa_faults_buffer_memory = NULL;
1858 p->last_task_numa_placement = 0; 1867 p->last_task_numa_placement = 0;
1859 p->last_sum_exec_runtime = 0; 1868 p->last_sum_exec_runtime = 0;
1860 1869
1861 INIT_LIST_HEAD(&p->numa_entry);
1862 p->numa_group = NULL; 1870 p->numa_group = NULL;
1863#endif /* CONFIG_NUMA_BALANCING */ 1871#endif /* CONFIG_NUMA_BALANCING */
1864} 1872}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
2034} 2042}
2035#endif 2043#endif
2036 2044
2037static inline
2038void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
2039{
2040 dl_b->total_bw -= tsk_bw;
2041}
2042
2043static inline
2044void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
2045{
2046 dl_b->total_bw += tsk_bw;
2047}
2048
2049static inline
2050bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
2051{
2052 return dl_b->bw != -1 &&
2053 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
2054}
2055
2056/* 2045/*
2057 * We must be sure that accepting a new task (or allowing changing the 2046 * We must be sure that accepting a new task (or allowing changing the
2058 * parameters of an existing one) is consistent with the bandwidth 2047 * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2220 2209
2221/** 2210/**
2222 * finish_task_switch - clean up after a task-switch 2211 * finish_task_switch - clean up after a task-switch
2223 * @rq: runqueue associated with task-switch
2224 * @prev: the thread we just switched away from. 2212 * @prev: the thread we just switched away from.
2225 * 2213 *
2226 * finish_task_switch must be called after the context switch, paired 2214 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2232 * so, we finish that here outside of the runqueue lock. (Doing it 2220 * so, we finish that here outside of the runqueue lock. (Doing it
2233 * with the lock held can cause deadlocks; see schedule() for 2221 * with the lock held can cause deadlocks; see schedule() for
2234 * details.) 2222 * details.)
2223 *
2224 * The context switch have flipped the stack from under us and restored the
2225 * local variables which were saved when this task called schedule() in the
2226 * past. prev == current is still correct but we need to recalculate this_rq
2227 * because prev may have moved to another CPU.
2235 */ 2228 */
2236static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2229static struct rq *finish_task_switch(struct task_struct *prev)
2237 __releases(rq->lock) 2230 __releases(rq->lock)
2238{ 2231{
2232 struct rq *rq = this_rq();
2239 struct mm_struct *mm = rq->prev_mm; 2233 struct mm_struct *mm = rq->prev_mm;
2240 long prev_state; 2234 long prev_state;
2241 2235
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2275 } 2269 }
2276 2270
2277 tick_nohz_task_switch(current); 2271 tick_nohz_task_switch(current);
2272 return rq;
2278} 2273}
2279 2274
2280#ifdef CONFIG_SMP 2275#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
2309asmlinkage __visible void schedule_tail(struct task_struct *prev) 2304asmlinkage __visible void schedule_tail(struct task_struct *prev)
2310 __releases(rq->lock) 2305 __releases(rq->lock)
2311{ 2306{
2312 struct rq *rq = this_rq(); 2307 struct rq *rq;
2313
2314 finish_task_switch(rq, prev);
2315 2308
2316 /* 2309 /* finish_task_switch() drops rq->lock and enables preemtion */
2317 * FIXME: do we need to worry about rq being invalidated by the 2310 preempt_disable();
2318 * task_switch? 2311 rq = finish_task_switch(prev);
2319 */
2320 post_schedule(rq); 2312 post_schedule(rq);
2313 preempt_enable();
2321 2314
2322 if (current->set_child_tid) 2315 if (current->set_child_tid)
2323 put_user(task_pid_vnr(current), current->set_child_tid); 2316 put_user(task_pid_vnr(current), current->set_child_tid);
2324} 2317}
2325 2318
2326/* 2319/*
2327 * context_switch - switch to the new MM and the new 2320 * context_switch - switch to the new MM and the new thread's register state.
2328 * thread's register state.
2329 */ 2321 */
2330static inline void 2322static inline struct rq *
2331context_switch(struct rq *rq, struct task_struct *prev, 2323context_switch(struct rq *rq, struct task_struct *prev,
2332 struct task_struct *next) 2324 struct task_struct *next)
2333{ 2325{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
2366 context_tracking_task_switch(prev, next); 2358 context_tracking_task_switch(prev, next);
2367 /* Here we just switch the register state and the stack. */ 2359 /* Here we just switch the register state and the stack. */
2368 switch_to(prev, next, prev); 2360 switch_to(prev, next, prev);
2369
2370 barrier(); 2361 barrier();
2371 /* 2362
2372 * this_rq must be evaluated again because prev may have moved 2363 return finish_task_switch(prev);
2373 * CPUs since it called schedule(), thus the 'rq' on its stack
2374 * frame will be invalid.
2375 */
2376 finish_task_switch(this_rq(), prev);
2377} 2364}
2378 2365
2379/* 2366/*
@@ -2826,15 +2813,8 @@ need_resched:
2826 rq->curr = next; 2813 rq->curr = next;
2827 ++*switch_count; 2814 ++*switch_count;
2828 2815
2829 context_switch(rq, prev, next); /* unlocks the rq */ 2816 rq = context_switch(rq, prev, next); /* unlocks the rq */
2830 /* 2817 cpu = cpu_of(rq);
2831 * The context switch have flipped the stack from under us
2832 * and restored the local variables which were saved when
2833 * this task called schedule() in the past. prev == current
2834 * is still correct, but it can be moved to another cpu/rq.
2835 */
2836 cpu = smp_processor_id();
2837 rq = cpu_rq(cpu);
2838 } else 2818 } else
2839 raw_spin_unlock_irq(&rq->lock); 2819 raw_spin_unlock_irq(&rq->lock);
2840 2820
@@ -4653,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
4653#endif 4633#endif
4654} 4634}
4655 4635
4636int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4637 const struct cpumask *trial)
4638{
4639 int ret = 1, trial_cpus;
4640 struct dl_bw *cur_dl_b;
4641 unsigned long flags;
4642
4643 rcu_read_lock_sched();
4644 cur_dl_b = dl_bw_of(cpumask_any(cur));
4645 trial_cpus = cpumask_weight(trial);
4646
4647 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4648 if (cur_dl_b->bw != -1 &&
4649 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4650 ret = 0;
4651 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4652 rcu_read_unlock_sched();
4653
4654 return ret;
4655}
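cpuset_cpumask_can_shrink() refuses a cpuset shrink when the deadline bandwidth already admitted no longer fits on the smaller CPU set, i.e. when bw * trial_cpus < total_bw. A standalone sketch with made-up numbers (the kernel stores both values as to_ratio() fixed point; plain integers are used here for clarity):

#include <stdio.h>

int main(void)
{
	long long bw = 50;	/* per-CPU deadline bandwidth cap, say 50% */
	long long total_bw = 120; /* bandwidth already admitted in the cpuset */

	for (int trial_cpus = 1; trial_cpus <= 4; trial_cpus++) {
		int can_shrink = !(bw != -1 && bw * trial_cpus < total_bw);

		printf("%d cpus: capacity %3lld vs used %lld -> can_shrink=%d\n",
		       trial_cpus, bw * trial_cpus, total_bw, can_shrink);
	}
	return 0;
}

With a 50% per-CPU cap and 120 units already admitted, shrinking to one or two CPUs is rejected while three or more CPUs is allowed.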
4656
4657int task_can_attach(struct task_struct *p,
4658 const struct cpumask *cs_cpus_allowed)
4659{
4660 int ret = 0;
4661
4662 /*
4663 * Kthreads which disallow setaffinity shouldn't be moved
4664 * to a new cpuset; we don't want to change their cpu
4665 * affinity and isolating such threads by their set of
4666 * allowed nodes is unnecessary. Thus, cpusets are not
4667 * applicable for such threads. This prevents checking for
4668 * success of set_cpus_allowed_ptr() on all attached tasks
4669 * before cpus_allowed may be changed.
4670 */
4671 if (p->flags & PF_NO_SETAFFINITY) {
4672 ret = -EINVAL;
4673 goto out;
4674 }
4675
4676#ifdef CONFIG_SMP
4677 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4678 cs_cpus_allowed)) {
4679 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4680 cs_cpus_allowed);
4681 struct dl_bw *dl_b;
4682 bool overflow;
4683 int cpus;
4684 unsigned long flags;
4685
4686 rcu_read_lock_sched();
4687 dl_b = dl_bw_of(dest_cpu);
4688 raw_spin_lock_irqsave(&dl_b->lock, flags);
4689 cpus = dl_bw_cpus(dest_cpu);
4690 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4691 if (overflow)
4692 ret = -EBUSY;
4693 else {
4694 /*
4695 * We reserve space for this task in the destination
4696 * root_domain, as we can't fail after this point.
4697 * We will free resources in the source root_domain
4698 * later on (see set_cpus_allowed_dl()).
4699 */
4700 __dl_add(dl_b, p->dl.dl_bw);
4701 }
4702 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4703 rcu_read_unlock_sched();
4704
4705 }
4706#endif
4707out:
4708 return ret;
4709}
4710
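task_can_attach() performs deadline admission control against the destination root_domain and, on success, immediately reserves the task's bandwidth there with __dl_add(); the matching release on the source side happens later, in set_cpus_allowed_dl() (see the deadline.c hunk further down). A hedged standalone sketch of that reserve-first, release-later ordering (simplified: the real __dl_overflow() also takes an old_bw argument, and everything runs under raw spinlocks):

#include <stdio.h>

struct dl_bw { long long bw, total_bw; };

/* Simplified admission test; the real helper also credits back old_bw. */
static int dl_overflow(struct dl_bw *b, int cpus, long long new_bw)
{
	return b->bw != -1 && b->bw * cpus < b->total_bw + new_bw;
}

int main(void)
{
	struct dl_bw src = { .bw = 50, .total_bw = 30 };	/* includes our 15 */
	struct dl_bw dst = { .bw = 50, .total_bw = 80 };
	long long task_bw = 15;
	int dst_cpus = 2;					/* capacity 100 */

	/* 1) task_can_attach(): check and reserve at the destination. */
	if (dl_overflow(&dst, dst_cpus, task_bw)) {
		printf("attach rejected: -EBUSY\n");
		return 0;
	}
	dst.total_bw += task_bw;				/* __dl_add() */

	/* 2) set_cpus_allowed_dl(), later: release from the source. */
	src.total_bw -= task_bw;				/* __dl_clear() */

	printf("src total_bw=%lld, dst total_bw=%lld\n",
	       src.total_bw, dst.total_bw);
	return 0;
}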
4656#ifdef CONFIG_SMP 4711#ifdef CONFIG_SMP
4657/* 4712/*
4658 * move_queued_task - move a queued task to new rq. 4713 * move_queued_task - move a queued task to new rq.
@@ -6103,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6103 6158
6104#ifdef CONFIG_NUMA 6159#ifdef CONFIG_NUMA
6105static int sched_domains_numa_levels; 6160static int sched_domains_numa_levels;
6161enum numa_topology_type sched_numa_topology_type;
6106static int *sched_domains_numa_distance; 6162static int *sched_domains_numa_distance;
6163int sched_max_numa_distance;
6107static struct cpumask ***sched_domains_numa_masks; 6164static struct cpumask ***sched_domains_numa_masks;
6108static int sched_domains_curr_level; 6165static int sched_domains_curr_level;
6109#endif 6166#endif
@@ -6275,7 +6332,7 @@ static void sched_numa_warn(const char *str)
6275 printk(KERN_WARNING "\n"); 6332 printk(KERN_WARNING "\n");
6276} 6333}
6277 6334
6278static bool find_numa_distance(int distance) 6335bool find_numa_distance(int distance)
6279{ 6336{
6280 int i; 6337 int i;
6281 6338
@@ -6290,6 +6347,56 @@ static bool find_numa_distance(int distance)
6290 return false; 6347 return false;
6291} 6348}
6292 6349
6350/*
6351 * A system can have three types of NUMA topology:
6352 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6353 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6354 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6355 *
6356 * The difference between a glueless mesh topology and a backplane
 6357 * topology lies in whether communication between nodes that are not
 6358 * directly connected goes through intermediary nodes (where programs
6359 * could run), or through backplane controllers. This affects
6360 * placement of programs.
6361 *
6362 * The type of topology can be discerned with the following tests:
6363 * - If the maximum distance between any nodes is 1 hop, the system
6364 * is directly connected.
6365 * - If for two nodes A and B, located N > 1 hops away from each other,
6366 * there is an intermediary node C, which is < N hops away from both
6367 * nodes A and B, the system is a glueless mesh.
6368 */
6369static void init_numa_topology_type(void)
6370{
6371 int a, b, c, n;
6372
6373 n = sched_max_numa_distance;
6374
6375 if (n <= 1)
6376 sched_numa_topology_type = NUMA_DIRECT;
6377
6378 for_each_online_node(a) {
6379 for_each_online_node(b) {
6380 /* Find two nodes furthest removed from each other. */
6381 if (node_distance(a, b) < n)
6382 continue;
6383
6384 /* Is there an intermediary node between a and b? */
6385 for_each_online_node(c) {
6386 if (node_distance(a, c) < n &&
6387 node_distance(b, c) < n) {
6388 sched_numa_topology_type =
6389 NUMA_GLUELESS_MESH;
6390 return;
6391 }
6392 }
6393
6394 sched_numa_topology_type = NUMA_BACKPLANE;
6395 return;
6396 }
6397 }
6398}
6399
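The classification test above can be exercised outside the kernel. A standalone sketch using a hypothetical 4-node ring SLIT, where the furthest pair of nodes can reach each other via a neighbour and the loop therefore reports a glueless mesh (the NUMA_DIRECT case, all nodes one hop apart, is assumed to have been handled beforehand, as in init_numa_topology_type()):

#include <stdio.h>

#define NR_NODES 4

/* Hypothetical SLIT for a 4-node ring: neighbours are 20 apart, the
 * opposite node is 30 away, so remote traffic can hop via a neighbour. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 20 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

int main(void)
{
	int max = 0;

	for (int a = 0; a < NR_NODES; a++)
		for (int b = 0; b < NR_NODES; b++)
			if (dist[a][b] > max)
				max = dist[a][b];

	/* Same test as init_numa_topology_type(): for the furthest pair,
	 * look for an intermediary node closer than 'max' to both ends. */
	for (int a = 0; a < NR_NODES; a++) {
		for (int b = 0; b < NR_NODES; b++) {
			if (dist[a][b] < max)
				continue;
			for (int c = 0; c < NR_NODES; c++) {
				if (dist[a][c] < max && dist[b][c] < max) {
					printf("NUMA_GLUELESS_MESH\n");
					return 0;
				}
			}
			printf("NUMA_BACKPLANE\n");
			return 0;
		}
	}
	return 0;
}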
6293static void sched_init_numa(void) 6400static void sched_init_numa(void)
6294{ 6401{
6295 int next_distance, curr_distance = node_distance(0, 0); 6402 int next_distance, curr_distance = node_distance(0, 0);
@@ -6426,6 +6533,9 @@ static void sched_init_numa(void)
6426 sched_domain_topology = tl; 6533 sched_domain_topology = tl;
6427 6534
6428 sched_domains_numa_levels = level; 6535 sched_domains_numa_levels = level;
6536 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6537
6538 init_numa_topology_type();
6429} 6539}
6430 6540
6431static void sched_domains_numa_masks_set(int cpu) 6541static void sched_domains_numa_masks_set(int cpu)
@@ -7178,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
7178 7288
7179void __might_sleep(const char *file, int line, int preempt_offset) 7289void __might_sleep(const char *file, int line, int preempt_offset)
7180{ 7290{
7291 /*
7292 * Blocking primitives will set (and therefore destroy) current->state,
7293 * since we will exit with TASK_RUNNING make sure we enter with it,
7294 * otherwise we will destroy state.
7295 */
7296 if (WARN_ONCE(current->state != TASK_RUNNING,
7297 "do not call blocking ops when !TASK_RUNNING; "
7298 "state=%lx set at [<%p>] %pS\n",
7299 current->state,
7300 (void *)current->task_state_change,
7301 (void *)current->task_state_change))
7302 __set_current_state(TASK_RUNNING);
7303
7304 ___might_sleep(file, line, preempt_offset);
7305}
7306EXPORT_SYMBOL(__might_sleep);
7307
7308void ___might_sleep(const char *file, int line, int preempt_offset)
7309{
7181 static unsigned long prev_jiffy; /* ratelimiting */ 7310 static unsigned long prev_jiffy; /* ratelimiting */
7182 7311
7183 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7312 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7209,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7209#endif 7338#endif
7210 dump_stack(); 7339 dump_stack();
7211} 7340}
7212EXPORT_SYMBOL(__might_sleep); 7341EXPORT_SYMBOL(___might_sleep);
7213#endif 7342#endif
7214 7343
7215#ifdef CONFIG_MAGIC_SYSRQ 7344#ifdef CONFIG_MAGIC_SYSRQ
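The split above leaves ___might_sleep() as the old preempt/RCU check and adds a task-state check to __might_sleep(): blocking primitives exit with the task in TASK_RUNNING, so entering one with any other state silently destroys the caller's wait state. A schematic kernel-style fragment (not a standalone program; wq, lock and done are hypothetical) of the nesting bug the new WARN_ONCE() flags:

DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
	if (done)
		break;

	mutex_lock(&lock);	/* __might_sleep() now warns: state != TASK_RUNNING */
	/* ... the mutex may sleep and clobber our task state ... */
	mutex_unlock(&lock);

	schedule();
}
finish_wait(&wq, &wait);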
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
32 29
33#endif /* _LINUX_CPUDL_H */ 30#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
26void cpupri_set(struct cpupri *cp, int cpu, int pri); 26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp); 27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp); 28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif 29#endif
33 30
34#endif /* _LINUX_CPUPRI_H */ 31#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 28fa9d9e9201..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
563{ 563{
564 struct hrtimer *timer = &dl_se->dl_timer; 564 struct hrtimer *timer = &dl_se->dl_timer;
565 565
566 if (hrtimer_active(timer)) {
567 hrtimer_try_to_cancel(timer);
568 return;
569 }
570
571 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 566 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
572 timer->function = dl_task_timer; 567 timer->function = dl_task_timer;
573} 568}
@@ -633,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
633 628
634 sched_rt_avg_update(rq, delta_exec); 629 sched_rt_avg_update(rq, delta_exec);
635 630
636 dl_se->runtime -= delta_exec; 631 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
637 if (dl_runtime_exceeded(rq, dl_se)) { 632 if (dl_runtime_exceeded(rq, dl_se)) {
638 __dequeue_task_dl(rq, curr, 0); 633 __dequeue_task_dl(rq, curr, 0);
639 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 634 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -933,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
933 struct task_struct *curr; 928 struct task_struct *curr;
934 struct rq *rq; 929 struct rq *rq;
935 930
936 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 931 if (sd_flag != SD_BALANCE_WAKE)
937 goto out; 932 goto out;
938 933
939 rq = cpu_rq(cpu); 934 rq = cpu_rq(cpu);
@@ -1018,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{ 1013{
1019 hrtick_start(rq, p->dl.runtime); 1014 hrtick_start(rq, p->dl.runtime);
1020} 1015}
1016#else /* !CONFIG_SCHED_HRTICK */
1017static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{
1019}
1021#endif 1020#endif
1022 1021
1023static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, 1022static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1071,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1071 /* Running task will never be pushed. */ 1070 /* Running task will never be pushed. */
1072 dequeue_pushable_dl_task(rq, p); 1071 dequeue_pushable_dl_task(rq, p);
1073 1072
1074#ifdef CONFIG_SCHED_HRTICK
1075 if (hrtick_enabled(rq)) 1073 if (hrtick_enabled(rq))
1076 start_hrtick_dl(rq, p); 1074 start_hrtick_dl(rq, p);
1077#endif
1078 1075
1079 set_post_schedule(rq); 1076 set_post_schedule(rq);
1080 1077
@@ -1093,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1093{ 1090{
1094 update_curr_dl(rq); 1091 update_curr_dl(rq);
1095 1092
1096#ifdef CONFIG_SCHED_HRTICK
1097 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1093 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1098 start_hrtick_dl(rq, p); 1094 start_hrtick_dl(rq, p);
1099#endif
1100} 1095}
1101 1096
1102static void task_fork_dl(struct task_struct *p) 1097static void task_fork_dl(struct task_struct *p)
@@ -1333,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
1333{ 1328{
1334 struct task_struct *next_task; 1329 struct task_struct *next_task;
1335 struct rq *later_rq; 1330 struct rq *later_rq;
1331 int ret = 0;
1336 1332
1337 if (!rq->dl.overloaded) 1333 if (!rq->dl.overloaded)
1338 return 0; 1334 return 0;
@@ -1378,7 +1374,6 @@ retry:
1378 * The task is still there. We don't try 1374 * The task is still there. We don't try
1379 * again, some other cpu will pull it when ready. 1375 * again, some other cpu will pull it when ready.
1380 */ 1376 */
1381 dequeue_pushable_dl_task(rq, next_task);
1382 goto out; 1377 goto out;
1383 } 1378 }
1384 1379
@@ -1394,6 +1389,7 @@ retry:
1394 deactivate_task(rq, next_task, 0); 1389 deactivate_task(rq, next_task, 0);
1395 set_task_cpu(next_task, later_rq->cpu); 1390 set_task_cpu(next_task, later_rq->cpu);
1396 activate_task(later_rq, next_task, 0); 1391 activate_task(later_rq, next_task, 0);
1392 ret = 1;
1397 1393
1398 resched_curr(later_rq); 1394 resched_curr(later_rq);
1399 1395
@@ -1402,7 +1398,7 @@ retry:
1402out: 1398out:
1403 put_task_struct(next_task); 1399 put_task_struct(next_task);
1404 1400
1405 return 1; 1401 return ret;
1406} 1402}
1407 1403
1408static void push_dl_tasks(struct rq *rq) 1404static void push_dl_tasks(struct rq *rq)
@@ -1508,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1508 p->nr_cpus_allowed > 1 && 1504 p->nr_cpus_allowed > 1 &&
1509 dl_task(rq->curr) && 1505 dl_task(rq->curr) &&
1510 (rq->curr->nr_cpus_allowed < 2 || 1506 (rq->curr->nr_cpus_allowed < 2 ||
1511 dl_entity_preempt(&rq->curr->dl, &p->dl))) { 1507 !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
1512 push_dl_tasks(rq); 1508 push_dl_tasks(rq);
1513 } 1509 }
1514} 1510}
@@ -1517,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1517 const struct cpumask *new_mask) 1513 const struct cpumask *new_mask)
1518{ 1514{
1519 struct rq *rq; 1515 struct rq *rq;
1516 struct root_domain *src_rd;
1520 int weight; 1517 int weight;
1521 1518
1522 BUG_ON(!dl_task(p)); 1519 BUG_ON(!dl_task(p));
1523 1520
1521 rq = task_rq(p);
1522 src_rd = rq->rd;
1523 /*
1524 * Migrating a SCHED_DEADLINE task between exclusive
1525 * cpusets (different root_domains) entails a bandwidth
1526 * update. We already made space for us in the destination
1527 * domain (see cpuset_can_attach()).
1528 */
1529 if (!cpumask_intersects(src_rd->span, new_mask)) {
1530 struct dl_bw *src_dl_b;
1531
1532 src_dl_b = dl_bw_of(cpu_of(rq));
1533 /*
1534 * We now free resources of the root_domain we are migrating
1535 * off. In the worst case, sched_setattr() may temporary fail
1536 * until we complete the update.
1537 */
1538 raw_spin_lock(&src_dl_b->lock);
1539 __dl_clear(src_dl_b, p->dl.dl_bw);
1540 raw_spin_unlock(&src_dl_b->lock);
1541 }
1542
1524 /* 1543 /*
1525 * Update only if the task is actually running (i.e., 1544 * Update only if the task is actually running (i.e.,
1526 * it is on the rq AND it is not throttled). 1545 * it is on the rq AND it is not throttled).
@@ -1537,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1537 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1556 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1538 return; 1557 return;
1539 1558
1540 rq = task_rq(p);
1541
1542 /* 1559 /*
1543 * The process used to be able to migrate OR it can now migrate 1560 * The process used to be able to migrate OR it can now migrate
1544 */ 1561 */
@@ -1586,22 +1603,48 @@ void init_sched_dl_class(void)
1586 1603
1587#endif /* CONFIG_SMP */ 1604#endif /* CONFIG_SMP */
1588 1605
1606/*
1607 * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
1608 */
1609static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1610{
1611 struct hrtimer *dl_timer = &p->dl.dl_timer;
1612
1613 /* Nobody will change task's class if pi_lock is held */
1614 lockdep_assert_held(&p->pi_lock);
1615
1616 if (hrtimer_active(dl_timer)) {
1617 int ret = hrtimer_try_to_cancel(dl_timer);
1618
1619 if (unlikely(ret == -1)) {
1620 /*
1621 * Note, p may migrate OR new deadline tasks
1622 * may appear in rq when we are unlocking it.
1623 * A caller of us must be fine with that.
1624 */
1625 raw_spin_unlock(&rq->lock);
1626 hrtimer_cancel(dl_timer);
1627 raw_spin_lock(&rq->lock);
1628 }
1629 }
1630}
1631
1589static void switched_from_dl(struct rq *rq, struct task_struct *p) 1632static void switched_from_dl(struct rq *rq, struct task_struct *p)
1590{ 1633{
1591 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1634 cancel_dl_timer(rq, p);
1592 hrtimer_try_to_cancel(&p->dl.dl_timer);
1593 1635
1594 __dl_clear_params(p); 1636 __dl_clear_params(p);
1595 1637
1596#ifdef CONFIG_SMP
1597 /* 1638 /*
1598 * Since this might be the only -deadline task on the rq, 1639 * Since this might be the only -deadline task on the rq,
1599 * this is the right place to try to pull some other one 1640 * this is the right place to try to pull some other one
1600 * from an overloaded cpu, if any. 1641 * from an overloaded cpu, if any.
1601 */ 1642 */
1602 if (!rq->dl.dl_nr_running) 1643 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
1603 pull_dl_task(rq); 1644 return;
1604#endif 1645
1646 if (pull_dl_task(rq))
1647 resched_curr(rq);
1605} 1648}
1606 1649
1607/* 1650/*
@@ -1622,7 +1665,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1622 1665
1623 if (task_on_rq_queued(p) && rq->curr != p) { 1666 if (task_on_rq_queued(p) && rq->curr != p) {
1624#ifdef CONFIG_SMP 1667#ifdef CONFIG_SMP
1625 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1668 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
1669 push_dl_task(rq) && rq != task_rq(p))
1626 /* Only reschedule if pushing failed */ 1670 /* Only reschedule if pushing failed */
1627 check_resched = 0; 1671 check_resched = 0;
1628#endif /* CONFIG_SMP */ 1672#endif /* CONFIG_SMP */
@@ -1704,3 +1748,12 @@ const struct sched_class dl_sched_class = {
1704 1748
1705 .update_curr = update_curr_dl, 1749 .update_curr = update_curr_dl,
1706}; 1750};
1751
1752#ifdef CONFIG_SCHED_DEBUG
1753extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1754
1755void print_dl_stats(struct seq_file *m, int cpu)
1756{
1757 print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
1758}
1759#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
261#undef P 261#undef P
262} 262}
263 263
264void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
265{
266 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
267 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
268}
269
264extern __read_mostly int sched_clock_running; 270extern __read_mostly int sched_clock_running;
265 271
266static void print_cpu(struct seq_file *m, int cpu) 272static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
329 spin_lock_irqsave(&sched_debug_lock, flags); 335 spin_lock_irqsave(&sched_debug_lock, flags);
330 print_cfs_stats(m, cpu); 336 print_cfs_stats(m, cpu);
331 print_rt_stats(m, cpu); 337 print_rt_stats(m, cpu);
338 print_dl_stats(m, cpu);
332 339
333 print_rq(m, rq, cpu); 340 print_rq(m, rq, cpu);
334 spin_unlock_irqrestore(&sched_debug_lock, flags); 341 spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
528 unsigned long nr_faults = -1; 535 unsigned long nr_faults = -1;
529 int cpu_current, home_node; 536 int cpu_current, home_node;
530 537
531 if (p->numa_faults_memory) 538 if (p->numa_faults)
532 nr_faults = p->numa_faults_memory[2*node + i]; 539 nr_faults = p->numa_faults[2*node + i];
533 540
534 cpu_current = !i ? (task_node(p) == node) : 541 cpu_current = !i ? (task_node(p) == node) :
535 (pol && node_isset(node, pol->v.nodes)); 542 (pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104b254c..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,7 +873,6 @@ struct numa_group {
873 spinlock_t lock; /* nr_tasks, tasks */ 873 spinlock_t lock; /* nr_tasks, tasks */
874 int nr_tasks; 874 int nr_tasks;
875 pid_t gid; 875 pid_t gid;
876 struct list_head task_list;
877 876
878 struct rcu_head rcu; 877 struct rcu_head rcu;
879 nodemask_t active_nodes; 878 nodemask_t active_nodes;
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
901 return p->numa_group ? p->numa_group->gid : 0; 900 return p->numa_group ? p->numa_group->gid : 0;
902} 901}
903 902
904static inline int task_faults_idx(int nid, int priv) 903/*
904 * The averaged statistics, shared & private, memory & cpu,
905 * occupy the first half of the array. The second half of the
906 * array is for current counters, which are averaged into the
907 * first set by task_numa_placement.
908 */
909static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
905{ 910{
906 return NR_NUMA_HINT_FAULT_TYPES * nid + priv; 911 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
907} 912}
908 913
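With the four numa_faults_* pointers folded into one array, task_faults_idx() linearizes (region, nid, priv) as NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv. A standalone sketch of the resulting offsets, assuming a 4-node machine and priv == 0 meaning the shared slot:

#include <stdio.h>

enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2	/* shared, private */

static const int nr_node_ids = 4;	/* assumed 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	printf("NUMA_MEM    nid 1 shared  -> %d\n", task_faults_idx(NUMA_MEM,    1, 0));
	printf("NUMA_CPU    nid 1 shared  -> %d\n", task_faults_idx(NUMA_CPU,    1, 0));
	printf("NUMA_MEMBUF nid 1 shared  -> %d\n", task_faults_idx(NUMA_MEMBUF, 1, 0));
	printf("NUMA_CPUBUF nid 1 private -> %d\n", task_faults_idx(NUMA_CPUBUF, 1, 1));
	return 0;
}

The whole array holds NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids entries, 32 in this example, matching the single kzalloc() later in task_numa_fault().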
909static inline unsigned long task_faults(struct task_struct *p, int nid) 914static inline unsigned long task_faults(struct task_struct *p, int nid)
910{ 915{
911 if (!p->numa_faults_memory) 916 if (!p->numa_faults)
912 return 0; 917 return 0;
913 918
914 return p->numa_faults_memory[task_faults_idx(nid, 0)] + 919 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
915 p->numa_faults_memory[task_faults_idx(nid, 1)]; 920 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
916} 921}
917 922
918static inline unsigned long group_faults(struct task_struct *p, int nid) 923static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
920 if (!p->numa_group) 925 if (!p->numa_group)
921 return 0; 926 return 0;
922 927
923 return p->numa_group->faults[task_faults_idx(nid, 0)] + 928 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
924 p->numa_group->faults[task_faults_idx(nid, 1)]; 929 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
925} 930}
926 931
927static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 932static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
928{ 933{
929 return group->faults_cpu[task_faults_idx(nid, 0)] + 934 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
930 group->faults_cpu[task_faults_idx(nid, 1)]; 935 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
936}
937
938/* Handle placement on systems where not all nodes are directly connected. */
939static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
940 int maxdist, bool task)
941{
942 unsigned long score = 0;
943 int node;
944
945 /*
946 * All nodes are directly connected, and the same distance
947 * from each other. No need for fancy placement algorithms.
948 */
949 if (sched_numa_topology_type == NUMA_DIRECT)
950 return 0;
951
952 /*
953 * This code is called for each node, introducing N^2 complexity,
954 * which should be ok given the number of nodes rarely exceeds 8.
955 */
956 for_each_online_node(node) {
957 unsigned long faults;
958 int dist = node_distance(nid, node);
959
960 /*
961 * The furthest away nodes in the system are not interesting
962 * for placement; nid was already counted.
963 */
964 if (dist == sched_max_numa_distance || node == nid)
965 continue;
966
967 /*
968 * On systems with a backplane NUMA topology, compare groups
969 * of nodes, and move tasks towards the group with the most
970 * memory accesses. When comparing two nodes at distance
971 * "hoplimit", only nodes closer by than "hoplimit" are part
972 * of each group. Skip other nodes.
973 */
974 if (sched_numa_topology_type == NUMA_BACKPLANE &&
975 dist > maxdist)
976 continue;
977
978 /* Add up the faults from nearby nodes. */
979 if (task)
980 faults = task_faults(p, node);
981 else
982 faults = group_faults(p, node);
983
984 /*
985 * On systems with a glueless mesh NUMA topology, there are
986 * no fixed "groups of nodes". Instead, nodes that are not
987 * directly connected bounce traffic through intermediate
988 * nodes; a numa_group can occupy any set of nodes.
989 * The further away a node is, the less the faults count.
990 * This seems to result in good task placement.
991 */
992 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
993 faults *= (sched_max_numa_distance - dist);
994 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
995 }
996
997 score += faults;
998 }
999
1000 return score;
931} 1001}
932 1002
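On a glueless mesh, score_nearby_nodes() discounts a nearby node's faults linearly with distance: faults * (sched_max_numa_distance - dist) / (sched_max_numa_distance - LOCAL_DISTANCE). A standalone sketch with assumed SLIT values, showing 900 faults counting in full at local distance and shrinking to a third of that at distance 30:

#include <stdio.h>

#define LOCAL_DISTANCE 10

int main(void)
{
	int sched_max_numa_distance = 40;	/* assumed largest SLIT value */
	unsigned long faults = 900;		/* faults seen on a nearby node */

	for (int dist = LOCAL_DISTANCE; dist < sched_max_numa_distance; dist += 10) {
		unsigned long scaled = faults * (sched_max_numa_distance - dist) /
				       (sched_max_numa_distance - LOCAL_DISTANCE);

		printf("distance %2d -> counts as %4lu faults\n", dist, scaled);
	}
	return 0;
}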
933/* 1003/*
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
936 * larger multiplier, in order to group tasks together that are almost 1006 * larger multiplier, in order to group tasks together that are almost
937 * evenly spread out between numa nodes. 1007 * evenly spread out between numa nodes.
938 */ 1008 */
939static inline unsigned long task_weight(struct task_struct *p, int nid) 1009static inline unsigned long task_weight(struct task_struct *p, int nid,
1010 int dist)
940{ 1011{
941 unsigned long total_faults; 1012 unsigned long faults, total_faults;
942 1013
943 if (!p->numa_faults_memory) 1014 if (!p->numa_faults)
944 return 0; 1015 return 0;
945 1016
946 total_faults = p->total_numa_faults; 1017 total_faults = p->total_numa_faults;
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
948 if (!total_faults) 1019 if (!total_faults)
949 return 0; 1020 return 0;
950 1021
951 return 1000 * task_faults(p, nid) / total_faults; 1022 faults = task_faults(p, nid);
1023 faults += score_nearby_nodes(p, nid, dist, true);
1024
1025 return 1000 * faults / total_faults;
952} 1026}
953 1027
954static inline unsigned long group_weight(struct task_struct *p, int nid) 1028static inline unsigned long group_weight(struct task_struct *p, int nid,
1029 int dist)
955{ 1030{
956 if (!p->numa_group || !p->numa_group->total_faults) 1031 unsigned long faults, total_faults;
1032
1033 if (!p->numa_group)
957 return 0; 1034 return 0;
958 1035
959 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 1036 total_faults = p->numa_group->total_faults;
1037
1038 if (!total_faults)
1039 return 0;
1040
1041 faults = group_faults(p, nid);
1042 faults += score_nearby_nodes(p, nid, dist, false);
1043
1044 return 1000 * faults / total_faults;
960} 1045}
961 1046
962bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1047bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1089,6 +1174,7 @@ struct task_numa_env {
1089 struct numa_stats src_stats, dst_stats; 1174 struct numa_stats src_stats, dst_stats;
1090 1175
1091 int imbalance_pct; 1176 int imbalance_pct;
1177 int dist;
1092 1178
1093 struct task_struct *best_task; 1179 struct task_struct *best_task;
1094 long best_imp; 1180 long best_imp;
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env,
1168 long load; 1254 long load;
1169 long imp = env->p->numa_group ? groupimp : taskimp; 1255 long imp = env->p->numa_group ? groupimp : taskimp;
1170 long moveimp = imp; 1256 long moveimp = imp;
1257 int dist = env->dist;
1171 1258
1172 rcu_read_lock(); 1259 rcu_read_lock();
1173 1260
@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
1208 * in any group then look only at task weights. 1295 * in any group then look only at task weights.
1209 */ 1296 */
1210 if (cur->numa_group == env->p->numa_group) { 1297 if (cur->numa_group == env->p->numa_group) {
1211 imp = taskimp + task_weight(cur, env->src_nid) - 1298 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1212 task_weight(cur, env->dst_nid); 1299 task_weight(cur, env->dst_nid, dist);
1213 /* 1300 /*
1214 * Add some hysteresis to prevent swapping the 1301 * Add some hysteresis to prevent swapping the
1215 * tasks within a group over tiny differences. 1302 * tasks within a group over tiny differences.
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
1223 * instead. 1310 * instead.
1224 */ 1311 */
1225 if (cur->numa_group) 1312 if (cur->numa_group)
1226 imp += group_weight(cur, env->src_nid) - 1313 imp += group_weight(cur, env->src_nid, dist) -
1227 group_weight(cur, env->dst_nid); 1314 group_weight(cur, env->dst_nid, dist);
1228 else 1315 else
1229 imp += task_weight(cur, env->src_nid) - 1316 imp += task_weight(cur, env->src_nid, dist) -
1230 task_weight(cur, env->dst_nid); 1317 task_weight(cur, env->dst_nid, dist);
1231 } 1318 }
1232 } 1319 }
1233 1320
@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
1326 }; 1413 };
1327 struct sched_domain *sd; 1414 struct sched_domain *sd;
1328 unsigned long taskweight, groupweight; 1415 unsigned long taskweight, groupweight;
1329 int nid, ret; 1416 int nid, ret, dist;
1330 long taskimp, groupimp; 1417 long taskimp, groupimp;
1331 1418
1332 /* 1419 /*
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
1354 return -EINVAL; 1441 return -EINVAL;
1355 } 1442 }
1356 1443
1357 taskweight = task_weight(p, env.src_nid);
1358 groupweight = group_weight(p, env.src_nid);
1359 update_numa_stats(&env.src_stats, env.src_nid);
1360 env.dst_nid = p->numa_preferred_nid; 1444 env.dst_nid = p->numa_preferred_nid;
1361 taskimp = task_weight(p, env.dst_nid) - taskweight; 1445 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1362 groupimp = group_weight(p, env.dst_nid) - groupweight; 1446 taskweight = task_weight(p, env.src_nid, dist);
1447 groupweight = group_weight(p, env.src_nid, dist);
1448 update_numa_stats(&env.src_stats, env.src_nid);
1449 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1450 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1363 update_numa_stats(&env.dst_stats, env.dst_nid); 1451 update_numa_stats(&env.dst_stats, env.dst_nid);
1364 1452
1365 /* Try to find a spot on the preferred nid. */ 1453 /* Try to find a spot on the preferred nid. */
1366 task_numa_find_cpu(&env, taskimp, groupimp); 1454 task_numa_find_cpu(&env, taskimp, groupimp);
1367 1455
1368 /* No space available on the preferred nid. Look elsewhere. */ 1456 /*
1369 if (env.best_cpu == -1) { 1457 * Look at other nodes in these cases:
1458 * - there is no space available on the preferred_nid
1459 * - the task is part of a numa_group that is interleaved across
1460 * multiple NUMA nodes; in order to better consolidate the group,
1461 * we need to check other locations.
1462 */
1463 if (env.best_cpu == -1 || (p->numa_group &&
1464 nodes_weight(p->numa_group->active_nodes) > 1)) {
1370 for_each_online_node(nid) { 1465 for_each_online_node(nid) {
1371 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1466 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1372 continue; 1467 continue;
1373 1468
1469 dist = node_distance(env.src_nid, env.dst_nid);
1470 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1471 dist != env.dist) {
1472 taskweight = task_weight(p, env.src_nid, dist);
1473 groupweight = group_weight(p, env.src_nid, dist);
1474 }
1475
1374 /* Only consider nodes where both task and groups benefit */ 1476 /* Only consider nodes where both task and groups benefit */
1375 taskimp = task_weight(p, nid) - taskweight; 1477 taskimp = task_weight(p, nid, dist) - taskweight;
1376 groupimp = group_weight(p, nid) - groupweight; 1478 groupimp = group_weight(p, nid, dist) - groupweight;
1377 if (taskimp < 0 && groupimp < 0) 1479 if (taskimp < 0 && groupimp < 0)
1378 continue; 1480 continue;
1379 1481
1482 env.dist = dist;
1380 env.dst_nid = nid; 1483 env.dst_nid = nid;
1381 update_numa_stats(&env.dst_stats, env.dst_nid); 1484 update_numa_stats(&env.dst_stats, env.dst_nid);
1382 task_numa_find_cpu(&env, taskimp, groupimp); 1485 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1431 unsigned long interval = HZ; 1534 unsigned long interval = HZ;
1432 1535
1433 /* This task has no NUMA fault statistics yet */ 1536 /* This task has no NUMA fault statistics yet */
1434 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1537 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1435 return; 1538 return;
1436 1539
1437 /* Periodically retry migrating the task to the preferred node */ 1540 /* Periodically retry migrating the task to the preferred node */
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1580 return delta; 1683 return delta;
1581} 1684}
1582 1685
1686/*
1687 * Determine the preferred nid for a task in a numa_group. This needs to
1688 * be done in a way that produces consistent results with group_weight,
1689 * otherwise workloads might not converge.
1690 */
1691static int preferred_group_nid(struct task_struct *p, int nid)
1692{
1693 nodemask_t nodes;
1694 int dist;
1695
1696 /* Direct connections between all NUMA nodes. */
1697 if (sched_numa_topology_type == NUMA_DIRECT)
1698 return nid;
1699
1700 /*
1701 * On a system with glueless mesh NUMA topology, group_weight
1702 * scores nodes according to the number of NUMA hinting faults on
1703 * both the node itself, and on nearby nodes.
1704 */
1705 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1706 unsigned long score, max_score = 0;
1707 int node, max_node = nid;
1708
1709 dist = sched_max_numa_distance;
1710
1711 for_each_online_node(node) {
1712 score = group_weight(p, node, dist);
1713 if (score > max_score) {
1714 max_score = score;
1715 max_node = node;
1716 }
1717 }
1718 return max_node;
1719 }
1720
1721 /*
1722 * Finding the preferred nid in a system with NUMA backplane
1723 * interconnect topology is more involved. The goal is to locate
1724 * tasks from numa_groups near each other in the system, and
1725 * untangle workloads from different sides of the system. This requires
1726 * searching down the hierarchy of node groups, recursively searching
1727 * inside the highest scoring group of nodes. The nodemask tricks
1728 * keep the complexity of the search down.
1729 */
1730 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0;
1733 nodemask_t max_group;
1734 int a, b;
1735
1736 /* Are there nodes at this distance from each other? */
1737 if (!find_numa_distance(dist))
1738 continue;
1739
1740 for_each_node_mask(a, nodes) {
1741 unsigned long faults = 0;
1742 nodemask_t this_group;
1743 nodes_clear(this_group);
1744
1745 /* Sum group's NUMA faults; includes a==b case. */
1746 for_each_node_mask(b, nodes) {
1747 if (node_distance(a, b) < dist) {
1748 faults += group_faults(p, b);
1749 node_set(b, this_group);
1750 node_clear(b, nodes);
1751 }
1752 }
1753
1754 /* Remember the top group. */
1755 if (faults > max_faults) {
1756 max_faults = faults;
1757 max_group = this_group;
1758 /*
1759 * subtle: at the smallest distance there is
1760 * just one node left in each "group", the
1761 * winner is the preferred nid.
1762 */
1763 nid = a;
1764 }
1765 }
1766 /* Next round, evaluate the nodes within max_group. */
1767 nodes = max_group;
1768 }
1769 return nid;
1770}
1771
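For the backplane case, preferred_group_nid() repeatedly sums group faults over nodes closer than the current distance, keeps the best-scoring group, and narrows the search into it until a single node remains. A simplified standalone sketch of that narrowing on a hypothetical two-pair backplane box (bitmasks stand in for nodemask_t, and the partitioning node_clear() trick is omitted):

#include <stdio.h>

#define NR_NODES 4
#define LOCAL_DISTANCE 10

/* Hypothetical backplane box: {0,1} and {2,3} are local pairs (20 apart)
 * that only reach each other through the backplane (30). */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};
static const unsigned long grp_faults[NR_NODES] = { 100, 150, 400, 50 };
static const int distances[] = { 30, 20 };	/* distinct remote distances */

int main(void)
{
	unsigned long nodes = (1UL << NR_NODES) - 1;	/* stand-in for a nodemask_t */
	int nid = 0;

	for (int i = 0; i < 2; i++) {
		int d = distances[i];
		unsigned long max_faults = 0, max_group = 0;

		for (int a = 0; a < NR_NODES; a++) {
			if (!(nodes & (1UL << a)))
				continue;
			unsigned long faults = 0, group = 0;

			/* Nodes closer than 'd' form one group; sum its faults. */
			for (int b = 0; b < NR_NODES; b++) {
				if ((nodes & (1UL << b)) && dist[a][b] < d) {
					faults += grp_faults[b];
					group |= 1UL << b;
				}
			}
			if (faults > max_faults) {
				max_faults = faults;
				max_group = group;
				nid = a;	/* at the smallest distance: the winner */
			}
		}
		nodes = max_group;	/* next round searches inside the best group */
		printf("d=%d: group=0x%lx faults=%lu -> nid=%d\n", d, nodes, max_faults, nid);
	}
	return 0;
}

The pair {2,3} wins the first round on total faults and node 2 wins within it, so the group's preferred nid converges on node 2.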
1583static void task_numa_placement(struct task_struct *p) 1772static void task_numa_placement(struct task_struct *p)
1584{ 1773{
1585 int seq, nid, max_nid = -1, max_group_nid = -1; 1774 int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
1607 1796
1608 /* Find the node with the highest number of faults */ 1797 /* Find the node with the highest number of faults */
1609 for_each_online_node(nid) { 1798 for_each_online_node(nid) {
1799 /* Keep track of the offsets in numa_faults array */
1800 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1610 unsigned long faults = 0, group_faults = 0; 1801 unsigned long faults = 0, group_faults = 0;
1611 int priv, i; 1802 int priv;
1612 1803
1613 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 1804 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1614 long diff, f_diff, f_weight; 1805 long diff, f_diff, f_weight;
1615 1806
1616 i = task_faults_idx(nid, priv); 1807 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1808 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1809 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1810 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1617 1811
1618 /* Decay existing window, copy faults since last scan */ 1812 /* Decay existing window, copy faults since last scan */
1619 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; 1813 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1620 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1814 fault_types[priv] += p->numa_faults[membuf_idx];
1621 p->numa_faults_buffer_memory[i] = 0; 1815 p->numa_faults[membuf_idx] = 0;
1622 1816
1623 /* 1817 /*
1624 * Normalize the faults_from, so all tasks in a group 1818 * Normalize the faults_from, so all tasks in a group
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
1628 * faults are less important. 1822 * faults are less important.
1629 */ 1823 */
1630 f_weight = div64_u64(runtime << 16, period + 1); 1824 f_weight = div64_u64(runtime << 16, period + 1);
1631 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / 1825 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1632 (total_faults + 1); 1826 (total_faults + 1);
1633 f_diff = f_weight - p->numa_faults_cpu[i] / 2; 1827 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1634 p->numa_faults_buffer_cpu[i] = 0; 1828 p->numa_faults[cpubuf_idx] = 0;
1635 1829
1636 p->numa_faults_memory[i] += diff; 1830 p->numa_faults[mem_idx] += diff;
1637 p->numa_faults_cpu[i] += f_diff; 1831 p->numa_faults[cpu_idx] += f_diff;
1638 faults += p->numa_faults_memory[i]; 1832 faults += p->numa_faults[mem_idx];
1639 p->total_numa_faults += diff; 1833 p->total_numa_faults += diff;
1640 if (p->numa_group) { 1834 if (p->numa_group) {
1641 /* safe because we can only change our own group */ 1835 /*
1642 p->numa_group->faults[i] += diff; 1836 * safe because we can only change our own group
1643 p->numa_group->faults_cpu[i] += f_diff; 1837 *
1838 * mem_idx represents the offset for a given
1839 * nid and priv in a specific region because it
1840 * is at the beginning of the numa_faults array.
1841 */
1842 p->numa_group->faults[mem_idx] += diff;
1843 p->numa_group->faults_cpu[mem_idx] += f_diff;
1644 p->numa_group->total_faults += diff; 1844 p->numa_group->total_faults += diff;
1645 group_faults += p->numa_group->faults[i]; 1845 group_faults += p->numa_group->faults[mem_idx];
1646 } 1846 }
1647 } 1847 }
1648 1848
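Each scan period folds the per-node buffer into the long-term counter as mem = mem/2 + membuf, which is what diff = membuf - mem/2 followed by mem += diff computes, giving the fault history a half-life of one scan. A standalone sketch of the decay (the CPU-fault side additionally normalizes the buffer by runtime via f_weight, omitted here):

#include <stdio.h>

int main(void)
{
	long mem = 0, membuf;
	long scans[] = { 400, 400, 0, 0 };	/* faults buffered per scan period */

	for (int i = 0; i < 4; i++) {
		membuf = scans[i];
		long diff = membuf - mem / 2;	/* decay old window, fold in new one */

		mem += diff;			/* i.e. mem = mem/2 + membuf */
		printf("scan %d: buffered=%ld, decayed total=%ld\n", i, membuf, mem);
	}
	return 0;
}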
@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
1662 if (p->numa_group) { 1862 if (p->numa_group) {
1663 update_numa_active_node_mask(p->numa_group); 1863 update_numa_active_node_mask(p->numa_group);
1664 spin_unlock_irq(group_lock); 1864 spin_unlock_irq(group_lock);
1665 max_nid = max_group_nid; 1865 max_nid = preferred_group_nid(p, max_group_nid);
1666 } 1866 }
1667 1867
1668 if (max_faults) { 1868 if (max_faults) {
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1705 1905
1706 atomic_set(&grp->refcount, 1); 1906 atomic_set(&grp->refcount, 1);
1707 spin_lock_init(&grp->lock); 1907 spin_lock_init(&grp->lock);
1708 INIT_LIST_HEAD(&grp->task_list);
1709 grp->gid = p->pid; 1908 grp->gid = p->pid;
1710 /* Second half of the array tracks nids where faults happen */ 1909 /* Second half of the array tracks nids where faults happen */
1711 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 1910 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1714 node_set(task_node(current), grp->active_nodes); 1913 node_set(task_node(current), grp->active_nodes);
1715 1914
1716 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1915 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1717 grp->faults[i] = p->numa_faults_memory[i]; 1916 grp->faults[i] = p->numa_faults[i];
1718 1917
1719 grp->total_faults = p->total_numa_faults; 1918 grp->total_faults = p->total_numa_faults;
1720 1919
1721 list_add(&p->numa_entry, &grp->task_list);
1722 grp->nr_tasks++; 1920 grp->nr_tasks++;
1723 rcu_assign_pointer(p->numa_group, grp); 1921 rcu_assign_pointer(p->numa_group, grp);
1724 } 1922 }
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1773 double_lock_irq(&my_grp->lock, &grp->lock); 1971 double_lock_irq(&my_grp->lock, &grp->lock);
1774 1972
1775 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1973 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1776 my_grp->faults[i] -= p->numa_faults_memory[i]; 1974 my_grp->faults[i] -= p->numa_faults[i];
1777 grp->faults[i] += p->numa_faults_memory[i]; 1975 grp->faults[i] += p->numa_faults[i];
1778 } 1976 }
1779 my_grp->total_faults -= p->total_numa_faults; 1977 my_grp->total_faults -= p->total_numa_faults;
1780 grp->total_faults += p->total_numa_faults; 1978 grp->total_faults += p->total_numa_faults;
1781 1979
1782 list_move(&p->numa_entry, &grp->task_list);
1783 my_grp->nr_tasks--; 1980 my_grp->nr_tasks--;
1784 grp->nr_tasks++; 1981 grp->nr_tasks++;
1785 1982
@@ -1799,27 +1996,23 @@ no_join:
1799void task_numa_free(struct task_struct *p) 1996void task_numa_free(struct task_struct *p)
1800{ 1997{
1801 struct numa_group *grp = p->numa_group; 1998 struct numa_group *grp = p->numa_group;
1802 void *numa_faults = p->numa_faults_memory; 1999 void *numa_faults = p->numa_faults;
1803 unsigned long flags; 2000 unsigned long flags;
1804 int i; 2001 int i;
1805 2002
1806 if (grp) { 2003 if (grp) {
1807 spin_lock_irqsave(&grp->lock, flags); 2004 spin_lock_irqsave(&grp->lock, flags);
1808 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1809 grp->faults[i] -= p->numa_faults_memory[i]; 2006 grp->faults[i] -= p->numa_faults[i];
1810 grp->total_faults -= p->total_numa_faults; 2007 grp->total_faults -= p->total_numa_faults;
1811 2008
1812 list_del(&p->numa_entry);
1813 grp->nr_tasks--; 2009 grp->nr_tasks--;
1814 spin_unlock_irqrestore(&grp->lock, flags); 2010 spin_unlock_irqrestore(&grp->lock, flags);
1815 RCU_INIT_POINTER(p->numa_group, NULL); 2011 RCU_INIT_POINTER(p->numa_group, NULL);
1816 put_numa_group(grp); 2012 put_numa_group(grp);
1817 } 2013 }
1818 2014
1819 p->numa_faults_memory = NULL; 2015 p->numa_faults = NULL;
1820 p->numa_faults_buffer_memory = NULL;
1821 p->numa_faults_cpu= NULL;
1822 p->numa_faults_buffer_cpu = NULL;
1823 kfree(numa_faults); 2016 kfree(numa_faults);
1824} 2017}
1825 2018
@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1842 return; 2035 return;
1843 2036
1844 /* Allocate buffer to track faults on a per-node basis */ 2037 /* Allocate buffer to track faults on a per-node basis */
1845 if (unlikely(!p->numa_faults_memory)) { 2038 if (unlikely(!p->numa_faults)) {
1846 int size = sizeof(*p->numa_faults_memory) * 2039 int size = sizeof(*p->numa_faults) *
1847 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2040 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1848 2041
1849 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2042 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1850 if (!p->numa_faults_memory) 2043 if (!p->numa_faults)
1851 return; 2044 return;
1852 2045
1853 BUG_ON(p->numa_faults_buffer_memory);
1854 /*
1855 * The averaged statistics, shared & private, memory & cpu,
1856 * occupy the first half of the array. The second half of the
1857 * array is for current counters, which are averaged into the
1858 * first set by task_numa_placement.
1859 */
1860 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1861 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1862 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1863 p->total_numa_faults = 0; 2046 p->total_numa_faults = 0;
1864 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2047 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1865 } 2048 }
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1899 if (migrated) 2082 if (migrated)
1900 p->numa_pages_migrated += pages; 2083 p->numa_pages_migrated += pages;
1901 2084
1902 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 2085 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
1903 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 2086 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
1904 p->numa_faults_locality[local] += pages; 2087 p->numa_faults_locality[local] += pages;
1905} 2088}
1906 2089
@@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4469 latest_idle_timestamp = rq->idle_stamp; 4652 latest_idle_timestamp = rq->idle_stamp;
4470 shallowest_idle_cpu = i; 4653 shallowest_idle_cpu = i;
4471 } 4654 }
4472 } else { 4655 } else if (shallowest_idle_cpu == -1) {
4473 load = weighted_cpuload(i); 4656 load = weighted_cpuload(i);
4474 if (load < min_load || (load == min_load && i == this_cpu)) { 4657 if (load < min_load || (load == min_load && i == this_cpu)) {
4475 min_load = load; 4658 min_load = load;
@@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4547 int want_affine = 0; 4730 int want_affine = 0;
4548 int sync = wake_flags & WF_SYNC; 4731 int sync = wake_flags & WF_SYNC;
4549 4732
4550 if (p->nr_cpus_allowed == 1)
4551 return prev_cpu;
4552
4553 if (sd_flag & SD_BALANCE_WAKE) 4733 if (sd_flag & SD_BALANCE_WAKE)
4554 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4734 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4555 4735
@@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5189 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5369 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5190 int src_nid, dst_nid; 5370 int src_nid, dst_nid;
5191 5371
5192 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5372 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5193 !(env->sd->flags & SD_NUMA)) { 5373 !(env->sd->flags & SD_NUMA)) {
5194 return false; 5374 return false;
5195 } 5375 }
@@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5228 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5408 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5229 return false; 5409 return false;
5230 5410
5231 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) 5411 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5232 return false; 5412 return false;
5233 5413
5234 src_nid = cpu_to_node(env->src_cpu); 5414 src_nid = cpu_to_node(env->src_cpu);
@@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6172 * with a large weight task outweighs the tasks on the system). 6352 * with a large weight task outweighs the tasks on the system).
6173 */ 6353 */
6174 if (prefer_sibling && sds->local && 6354 if (prefer_sibling && sds->local &&
6175 sds->local_stat.group_has_free_capacity) 6355 sds->local_stat.group_has_free_capacity) {
6176 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6356 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6357 sgs->group_type = group_classify(sg, sgs);
6358 }
6177 6359
6178 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6360 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6179 sds->busiest = sg; 6361 sds->busiest = sg;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca398084a..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1301 struct task_struct *curr; 1301 struct task_struct *curr;
1302 struct rq *rq; 1302 struct rq *rq;
1303 1303
1304 if (p->nr_cpus_allowed == 1)
1305 goto out;
1306
1307 /* For anything but wake ups, just return the task_cpu */ 1304 /* For anything but wake ups, just return the task_cpu */
1308 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1305 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1309 goto out; 1306 goto out;
@@ -1351,16 +1348,22 @@ out:
1351 1348
1352static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1349static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1353{ 1350{
1354 if (rq->curr->nr_cpus_allowed == 1) 1351 /*
1352 * Current can't be migrated, useless to reschedule,
1353 * let's hope p can move out.
1354 */
1355 if (rq->curr->nr_cpus_allowed == 1 ||
1356 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1355 return; 1357 return;
1356 1358
1359 /*
1360 * p is migratable, so let's not schedule it and
1361 * see if it is pushed or pulled somewhere else.
1362 */
1357 if (p->nr_cpus_allowed != 1 1363 if (p->nr_cpus_allowed != 1
1358 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1364 && cpupri_find(&rq->rd->cpupri, p, NULL))
1359 return; 1365 return;
1360 1366
1361 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1362 return;
1363
1364 /* 1367 /*
1365 * There appears to be other cpus that can accept 1368 * There appears to be other cpus that can accept
1366 * current and none to run 'p', so lets reschedule 1369 * current and none to run 'p', so lets reschedule
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2df8ef067cc5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
176 u64 bw, total_bw; 176 u64 bw, total_bw;
177}; 177};
178 178
179static inline
180void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
181{
182 dl_b->total_bw -= tsk_bw;
183}
184
185static inline
186void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
187{
188 dl_b->total_bw += tsk_bw;
189}
190
191static inline
192bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
193{
194 return dl_b->bw != -1 &&
195 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
196}
197
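__dl_overflow() is the admission test used when a reservation changes: the bandwidth the task already holds (old_bw) is handed back before the new request is counted. A standalone sketch with illustrative units:

#include <stdio.h>

/* Mirrors the new sched.h helper: admit 'new_bw' while crediting back 'old_bw'. */
static int dl_overflow(long long bw, int cpus, long long total_bw,
		       long long old_bw, long long new_bw)
{
	return bw != -1 && bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	long long bw = 50;		/* per-CPU cap */
	long long total_bw = 180;	/* already admitted on this root_domain */
	int cpus = 4;			/* capacity: 50 * 4 = 200 */

	/* A task holding 30 asks to change its reservation. */
	printf("30 -> 45: %s\n", dl_overflow(bw, cpus, total_bw, 30, 45) ?
	       "rejected" : "admitted");	/* 180 - 30 + 45 = 195 <= 200 */
	printf("30 -> 60: %s\n", dl_overflow(bw, cpus, total_bw, 30, 60) ?
	       "rejected" : "admitted");	/* 180 - 30 + 60 = 210 >  200 */
	return 0;
}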
179extern struct mutex sched_domains_mutex; 198extern struct mutex sched_domains_mutex;
180 199
181#ifdef CONFIG_CGROUP_SCHED 200#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
678 return rq->clock_task; 697 return rq->clock_task;
679} 698}
680 699
700#ifdef CONFIG_NUMA
701enum numa_topology_type {
702 NUMA_DIRECT,
703 NUMA_GLUELESS_MESH,
704 NUMA_BACKPLANE,
705};
706extern enum numa_topology_type sched_numa_topology_type;
707extern int sched_max_numa_distance;
708extern bool find_numa_distance(int distance);
709#endif
710
681#ifdef CONFIG_NUMA_BALANCING 711#ifdef CONFIG_NUMA_BALANCING
712/* The regions in numa_faults array from task_struct */
713enum numa_faults_stats {
714 NUMA_MEM = 0,
715 NUMA_CPU,
716 NUMA_MEMBUF,
717 NUMA_CPUBUF
718};
682extern void sched_setnuma(struct task_struct *p, int node); 719extern void sched_setnuma(struct task_struct *p, int node);
683extern int migrate_task_to(struct task_struct *p, int cpu); 720extern int migrate_task_to(struct task_struct *p, int cpu);
684extern int migrate_swap(struct task_struct *, struct task_struct *); 721extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
1127 void (*task_fork) (struct task_struct *p); 1164 void (*task_fork) (struct task_struct *p);
1128 void (*task_dead) (struct task_struct *p); 1165 void (*task_dead) (struct task_struct *p);
1129 1166
1167 /*
1168 * The switched_from() call is allowed to drop rq->lock, therefore we
 1169 * cannot assume the switched_from/switched_to pair is serialized by
1170 * rq->lock. They are however serialized by p->pi_lock.
1171 */
1130 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1172 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1131 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1173 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1132 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1174 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1504extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1546extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1505extern void print_cfs_stats(struct seq_file *m, int cpu); 1547extern void print_cfs_stats(struct seq_file *m, int cpu);
1506extern void print_rt_stats(struct seq_file *m, int cpu); 1548extern void print_rt_stats(struct seq_file *m, int cpu);
1549extern void print_dl_stats(struct seq_file *m, int cpu);
1507 1550
1508extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1551extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1509extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1552extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12#include <linux/kthread.h>
12 13
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 14void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 15{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
297} 298}
298EXPORT_SYMBOL(autoremove_wake_function); 299EXPORT_SYMBOL(autoremove_wake_function);
299 300
301static inline bool is_kthread_should_stop(void)
302{
303 return (current->flags & PF_KTHREAD) && kthread_should_stop();
304}
305
306/*
307 * DEFINE_WAIT_FUNC(wait, woken_wake_func);
308 *
309 * add_wait_queue(&wq, &wait);
310 * for (;;) {
311 * if (condition)
312 * break;
313 *
314 * p->state = mode; condition = true;
315 * smp_mb(); // A smp_wmb(); // C
 316 * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN;
317 * schedule() try_to_wake_up();
318 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
319 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
320 * smp_mb() // B smp_wmb(); // C
321 * wait->flags |= WQ_FLAG_WOKEN;
322 * }
323 * remove_wait_queue(&wq, &wait);
324 *
325 */
326long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
327{
328 set_current_state(mode); /* A */
329 /*
330 * The above implies an smp_mb(), which matches with the smp_wmb() from
331 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
332 * also observe all state before the wakeup.
333 */
334 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
335 timeout = schedule_timeout(timeout);
336 __set_current_state(TASK_RUNNING);
337
338 /*
339 * The below implies an smp_mb(), it too pairs with the smp_wmb() from
340 * woken_wake_function() such that we must either observe the wait
341 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
342 * an event.
343 */
344 set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
345
346 return timeout;
347}
348EXPORT_SYMBOL(wait_woken);
349
350int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
351{
352 /*
353 * Although this function is called under waitqueue lock, LOCK
 354 * doesn't imply write barrier and its users expect write
355 * barrier semantics on wakeup functions. The following
356 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
357 * and is paired with set_mb() in wait_woken().
358 */
359 smp_wmb(); /* C */
360 wait->flags |= WQ_FLAG_WOKEN;
361
362 return default_wake_function(wait, mode, sync, key);
363}
364EXPORT_SYMBOL(woken_wake_function);
365
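wait_woken()/woken_wake_function() replace the open-coded set_current_state()/schedule() loops that the nested-sleep checks now complain about: the wait entry's WQ_FLAG_WOKEN carries the wakeup across the window where the condition check itself may sleep. A kernel-style fragment mirroring the pattern documented above (have_work() and process_work() are hypothetical; the rfcomm_run() conversion further down is a real instance):

DEFINE_WAIT_FUNC(wait, woken_wake_function);

add_wait_queue(&wq, &wait);
while (!kthread_should_stop()) {
	if (have_work())			/* hypothetical condition */
		process_work();			/* hypothetical work function */

	wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&wq, &wait);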
300int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) 366int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
301{ 367{
302 struct wait_bit_key *key = arg; 368 struct wait_bit_key *key = arg;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 		preempt_disable();
 		if (kthread_should_stop()) {
-			set_current_state(TASK_RUNNING);
+			__set_current_state(TASK_RUNNING);
 			preempt_enable();
 			if (ht->cleanup)
 				ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
 		/* Check for state change setup */
 		switch (td->status) {
 		case HP_THREAD_NONE:
+			__set_current_state(TASK_RUNNING);
 			preempt_enable();
 			if (ht->setup)
 				ht->setup(td->cpu);
 			td->status = HP_THREAD_ACTIVE;
-			preempt_disable();
-			break;
+			continue;
+
 		case HP_THREAD_PARKED:
+			__set_current_state(TASK_RUNNING);
 			preempt_enable();
 			if (ht->unpark)
 				ht->unpark(td->cpu);
 			td->status = HP_THREAD_ACTIVE;
-			preempt_disable();
-			break;
+			continue;
 		}
 
 		if (!ht->thread_should_run(td->cpu)) {
-			preempt_enable();
+			preempt_enable_no_resched();
 			schedule();
 		} else {
-			set_current_state(TASK_RUNNING);
+			__set_current_state(TASK_RUNNING);
 			preempt_enable();
 			ht->thread_fn(td->cpu);
 		}
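
The hunk above also switches every transition back to TASK_RUNNING from set_current_state() to __set_current_state(). The underlying convention, sketched below for illustration (this is not code from the patch): set_current_state() implies a full memory barrier and is what you want when going to sleep, so the state store is ordered against the condition check that follows; returning to TASK_RUNNING cannot miss a wakeup, so the barrier-less store suffices.

#include <linux/kthread.h>
#include <linux/sched.h>

static void my_wait_for_stop(void)
{
	for (;;) {
		/* Barrier orders the state store against the check below. */
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;
		schedule();
	}
	/* Back to runnable: no wakeup can be lost, no barrier needed. */
	__set_current_state(TASK_RUNNING);
}
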
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index af73bc3acb40..410dd5e76c41 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -101,11 +101,11 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
 #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
 #define __get_rpn_parity(line)    (((line) >> 3) & 0x7)
 
+static DECLARE_WAIT_QUEUE_HEAD(rfcomm_wq);
+
 static void rfcomm_schedule(void)
 {
-	if (!rfcomm_thread)
-		return;
-	wake_up_process(rfcomm_thread);
+	wake_up_all(&rfcomm_wq);
 }
 
 /* ---- RFCOMM FCS computation ---- */
@@ -2086,24 +2086,22 @@ static void rfcomm_kill_listener(void)
 
 static int rfcomm_run(void *unused)
 {
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	BT_DBG("");
 
 	set_user_nice(current, -10);
 
 	rfcomm_add_listener(BDADDR_ANY);
 
-	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (kthread_should_stop())
-			break;
+	add_wait_queue(&rfcomm_wq, &wait);
+	while (!kthread_should_stop()) {
 
 		/* Process stuff */
 		rfcomm_process_sessions();
 
-		schedule();
+		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 	}
-	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&rfcomm_wq, &wait);
 
 	rfcomm_kill_listener();
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 945bbd001359..3acff0974560 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7200,11 +7200,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
 	 */
 	struct net *net;
 	bool unregistering;
-	DEFINE_WAIT(wait);
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
+	add_wait_queue(&netdev_unregistering_wq, &wait);
 	for (;;) {
-		prepare_to_wait(&netdev_unregistering_wq, &wait,
-				TASK_UNINTERRUPTIBLE);
 		unregistering = false;
 		rtnl_lock();
 		list_for_each_entry(net, net_list, exit_list) {
@@ -7216,9 +7215,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
 		if (!unregistering)
 			break;
 		__rtnl_unlock();
-		schedule();
+
+		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 	}
-	finish_wait(&netdev_unregistering_wq, &wait);
+	remove_wait_queue(&netdev_unregistering_wq, &wait);
 }
 
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
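
The same conversion is applied here and in the rtnetlink.c hunk below. The old loop called rtnl_lock(), which can itself sleep, after prepare_to_wait() had already put the task in TASK_UNINTERRUPTIBLE; the inner sleep resets the task state and can defeat the outer wait. With DEFINE_WAIT_FUNC(wait, woken_wake_function) the task state is only set inside wait_woken(), while a wakeup that races with the blocking work is remembered in WQ_FLAG_WOKEN. A hypothetical sketch of the resulting shape, with my_done_wq, my_wait_until_done and my_blocking_check standing in for the netdev-specific pieces:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_done_wq);

static void my_wait_until_done(bool (*my_blocking_check)(void))
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&my_done_wq, &wait);
	for (;;) {
		if (my_blocking_check())	/* may itself sleep */
			break;

		/* Returns immediately if a waker already set WQ_FLAG_WOKEN. */
		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&my_done_wq, &wait);
}
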
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 76321ea442c3..88e8de3b59b0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -365,11 +365,10 @@ static void rtnl_lock_unregistering_all(void)
 {
 	struct net *net;
 	bool unregistering;
-	DEFINE_WAIT(wait);
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
+	add_wait_queue(&netdev_unregistering_wq, &wait);
 	for (;;) {
-		prepare_to_wait(&netdev_unregistering_wq, &wait,
-				TASK_UNINTERRUPTIBLE);
 		unregistering = false;
 		rtnl_lock();
 		for_each_net(net) {
@@ -381,9 +380,10 @@ static void rtnl_lock_unregistering_all(void)
 		if (!unregistering)
 			break;
 		__rtnl_unlock();
-		schedule();
+
+		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 	}
-	finish_wait(&netdev_unregistering_wq, &wait);
+	remove_wait_queue(&netdev_unregistering_wq, &wait);
 }
 
 /**