Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/fork.c                                             |   8
-rw-r--r-- | kernel/futex.c                                            |  33
-rw-r--r-- | kernel/sched/Makefile                                     |   2
-rw-r--r-- | kernel/sched/auto_group.c                                 |   6
-rw-r--r-- | kernel/sched/auto_group.h                                 |   2
-rw-r--r-- | kernel/sched/core.c                                       |  92
-rw-r--r-- | kernel/sched/cputime.c                                    |   2
-rw-r--r-- | kernel/sched/deadline.c                                   |   2
-rw-r--r-- | kernel/sched/fair.c                                       | 276
-rw-r--r-- | kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) | 236
-rw-r--r-- | kernel/sched/rt.c                                         |   2
-rw-r--r-- | kernel/sched/sched.h                                      |  10
-rw-r--r-- | kernel/sched/stats.h                                      |  15
-rw-r--r-- | kernel/sched/wait.c                                       |   4
-rw-r--r-- | kernel/signal.c                                           |   6
-rw-r--r-- | kernel/time/posix-cpu-timers.c                            |  87
16 files changed, 424 insertions, 359 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) | |||
1091 | { | 1091 | { |
1092 | unsigned long cpu_limit; | 1092 | unsigned long cpu_limit; |
1093 | 1093 | ||
1094 | /* Thread group counters. */ | 1094 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
1095 | thread_group_cputime_init(sig); | ||
1096 | |||
1097 | cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | ||
1098 | if (cpu_limit != RLIM_INFINITY) { | 1095 | if (cpu_limit != RLIM_INFINITY) { |
1099 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); | 1096 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); |
1100 | sig->cputimer.running = 1; | 1097 | sig->cputimer.running = 1; |
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1396 | p->hardirq_context = 0; | 1393 | p->hardirq_context = 0; |
1397 | p->softirq_context = 0; | 1394 | p->softirq_context = 0; |
1398 | #endif | 1395 | #endif |
1396 | |||
1397 | p->pagefault_disabled = 0; | ||
1398 | |||
1399 | #ifdef CONFIG_LOCKDEP | 1399 | #ifdef CONFIG_LOCKDEP |
1400 | p->lockdep_depth = 0; /* no locks held yet */ | 1400 | p->lockdep_depth = 0; /* no locks held yet */ |
1401 | p->curr_chain_key = 0; | 1401 | p->curr_chain_key = 0; |
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..f9984c363e9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q) | |||
1090 | 1090 | ||
1091 | /* | 1091 | /* |
1092 | * The hash bucket lock must be held when this is called. | 1092 | * The hash bucket lock must be held when this is called. |
1093 | * Afterwards, the futex_q must not be accessed. | 1093 | * Afterwards, the futex_q must not be accessed. Callers |
1094 | * must ensure to later call wake_up_q() for the actual | ||
1095 | * wakeups to occur. | ||
1094 | */ | 1096 | */ |
1095 | static void wake_futex(struct futex_q *q) | 1097 | static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) |
1096 | { | 1098 | { |
1097 | struct task_struct *p = q->task; | 1099 | struct task_struct *p = q->task; |
1098 | 1100 | ||
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q) | |||
1100 | return; | 1102 | return; |
1101 | 1103 | ||
1102 | /* | 1104 | /* |
1103 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 1105 | * Queue the task for later wakeup for after we've released |
1104 | * a non-futex wake up happens on another CPU then the task | 1106 | * the hb->lock. wake_q_add() grabs reference to p. |
1105 | * might exit and p would dereference a non-existing task | ||
1106 | * struct. Prevent this by holding a reference on p across the | ||
1107 | * wake up. | ||
1108 | */ | 1107 | */ |
1109 | get_task_struct(p); | 1108 | wake_q_add(wake_q, p); |
1110 | |||
1111 | __unqueue_futex(q); | 1109 | __unqueue_futex(q); |
1112 | /* | 1110 | /* |
1113 | * The waiting task can free the futex_q as soon as | 1111 | * The waiting task can free the futex_q as soon as |
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q) | |||
1117 | */ | 1115 | */ |
1118 | smp_wmb(); | 1116 | smp_wmb(); |
1119 | q->lock_ptr = NULL; | 1117 | q->lock_ptr = NULL; |
1120 | |||
1121 | wake_up_state(p, TASK_NORMAL); | ||
1122 | put_task_struct(p); | ||
1123 | } | 1118 | } |
1124 | 1119 | ||
1125 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | 1120 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
1217 | struct futex_q *this, *next; | 1212 | struct futex_q *this, *next; |
1218 | union futex_key key = FUTEX_KEY_INIT; | 1213 | union futex_key key = FUTEX_KEY_INIT; |
1219 | int ret; | 1214 | int ret; |
1215 | WAKE_Q(wake_q); | ||
1220 | 1216 | ||
1221 | if (!bitset) | 1217 | if (!bitset) |
1222 | return -EINVAL; | 1218 | return -EINVAL; |
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
1244 | if (!(this->bitset & bitset)) | 1240 | if (!(this->bitset & bitset)) |
1245 | continue; | 1241 | continue; |
1246 | 1242 | ||
1247 | wake_futex(this); | 1243 | mark_wake_futex(&wake_q, this); |
1248 | if (++ret >= nr_wake) | 1244 | if (++ret >= nr_wake) |
1249 | break; | 1245 | break; |
1250 | } | 1246 | } |
1251 | } | 1247 | } |
1252 | 1248 | ||
1253 | spin_unlock(&hb->lock); | 1249 | spin_unlock(&hb->lock); |
1250 | wake_up_q(&wake_q); | ||
1254 | out_put_key: | 1251 | out_put_key: |
1255 | put_futex_key(&key); | 1252 | put_futex_key(&key); |
1256 | out: | 1253 | out: |
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | |||
1269 | struct futex_hash_bucket *hb1, *hb2; | 1266 | struct futex_hash_bucket *hb1, *hb2; |
1270 | struct futex_q *this, *next; | 1267 | struct futex_q *this, *next; |
1271 | int ret, op_ret; | 1268 | int ret, op_ret; |
1269 | WAKE_Q(wake_q); | ||
1272 | 1270 | ||
1273 | retry: | 1271 | retry: |
1274 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); | 1272 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
@@ -1320,7 +1318,7 @@ retry_private: | |||
1320 | ret = -EINVAL; | 1318 | ret = -EINVAL; |
1321 | goto out_unlock; | 1319 | goto out_unlock; |
1322 | } | 1320 | } |
1323 | wake_futex(this); | 1321 | mark_wake_futex(&wake_q, this); |
1324 | if (++ret >= nr_wake) | 1322 | if (++ret >= nr_wake) |
1325 | break; | 1323 | break; |
1326 | } | 1324 | } |
@@ -1334,7 +1332,7 @@ retry_private: | |||
1334 | ret = -EINVAL; | 1332 | ret = -EINVAL; |
1335 | goto out_unlock; | 1333 | goto out_unlock; |
1336 | } | 1334 | } |
1337 | wake_futex(this); | 1335 | mark_wake_futex(&wake_q, this); |
1338 | if (++op_ret >= nr_wake2) | 1336 | if (++op_ret >= nr_wake2) |
1339 | break; | 1337 | break; |
1340 | } | 1338 | } |
@@ -1344,6 +1342,7 @@ retry_private: | |||
1344 | 1342 | ||
1345 | out_unlock: | 1343 | out_unlock: |
1346 | double_unlock_hb(hb1, hb2); | 1344 | double_unlock_hb(hb1, hb2); |
1345 | wake_up_q(&wake_q); | ||
1347 | out_put_keys: | 1346 | out_put_keys: |
1348 | put_futex_key(&key2); | 1347 | put_futex_key(&key2); |
1349 | out_put_key1: | 1348 | out_put_key1: |
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
1503 | struct futex_pi_state *pi_state = NULL; | 1502 | struct futex_pi_state *pi_state = NULL; |
1504 | struct futex_hash_bucket *hb1, *hb2; | 1503 | struct futex_hash_bucket *hb1, *hb2; |
1505 | struct futex_q *this, *next; | 1504 | struct futex_q *this, *next; |
1505 | WAKE_Q(wake_q); | ||
1506 | 1506 | ||
1507 | if (requeue_pi) { | 1507 | if (requeue_pi) { |
1508 | /* | 1508 | /* |
@@ -1679,7 +1679,7 @@ retry_private: | |||
1679 | * woken by futex_unlock_pi(). | 1679 | * woken by futex_unlock_pi(). |
1680 | */ | 1680 | */ |
1681 | if (++task_count <= nr_wake && !requeue_pi) { | 1681 | if (++task_count <= nr_wake && !requeue_pi) { |
1682 | wake_futex(this); | 1682 | mark_wake_futex(&wake_q, this); |
1683 | continue; | 1683 | continue; |
1684 | } | 1684 | } |
1685 | 1685 | ||
@@ -1719,6 +1719,7 @@ retry_private: | |||
1719 | out_unlock: | 1719 | out_unlock: |
1720 | free_pi_state(pi_state); | 1720 | free_pi_state(pi_state); |
1721 | double_unlock_hb(hb1, hb2); | 1721 | double_unlock_hb(hb1, hb2); |
1722 | wake_up_q(&wake_q); | ||
1722 | hb_waiters_dec(hb2); | 1723 | hb_waiters_dec(hb2); |
1723 | 1724 | ||
1724 | /* | 1725 | /* |
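The futex hunks above all follow the same deferred-wakeup pattern: waiters are collected on an on-stack wake queue while hb->lock is held, and the actual wakeups are issued by wake_up_q() only after the lock has been dropped, so woken tasks do not immediately contend on a lock the waker still holds. A minimal sketch of that pattern, assuming the wake_q API added in kernel/sched/core.c below; the waiter structure, lock and list names here are illustrative and not from the patch:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustrative waiter object; not part of this commit. */
struct my_waiter {
	struct list_head	list;
	struct task_struct	*task;
};

static void wake_all_waiters(spinlock_t *lock, struct list_head *waiters)
{
	WAKE_Q(wake_q);			/* on-stack, context-local queue */
	struct my_waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		list_del_init(&w->list);
		wake_q_add(&wake_q, w->task);	/* grabs a task reference */
	}
	spin_unlock(lock);

	/* Wakeups (and the reference drops) happen without the lock held. */
	wake_up_q(&wake_q);
}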
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o | 14 | obj-y += core.o loadavg.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
16 | obj-y += wait.o completion.o idle.o | 16 | obj-y += wait.o completion.o idle.o |
17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | #include "sched.h" | 1 | #include "sched.h" |
4 | 2 | ||
5 | #include <linux/proc_fs.h> | 3 | #include <linux/proc_fs.h> |
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
141 | 139 | ||
142 | p->signal->autogroup = autogroup_kref_get(ag); | 140 | p->signal->autogroup = autogroup_kref_get(ag); |
143 | 141 | ||
144 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | 142 | if (!READ_ONCE(sysctl_sched_autogroup_enabled)) |
145 | goto out; | 143 | goto out; |
146 | 144 | ||
147 | for_each_thread(p, t) | 145 | for_each_thread(p, t) |
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
249 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 247 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
250 | } | 248 | } |
251 | #endif /* CONFIG_SCHED_DEBUG */ | 249 | #endif /* CONFIG_SCHED_DEBUG */ |
252 | |||
253 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); | |||
29 | static inline struct task_group * | 29 | static inline struct task_group * |
30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | 30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
31 | { | 31 | { |
32 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | 32 | int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); |
33 | 33 | ||
34 | if (enabled && task_wants_autogroup(p, tg)) | 34 | if (enabled && task_wants_autogroup(p, tg)) |
35 | return p->signal->autogroup->tg; | 35 | return p->signal->autogroup->tg; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffb..20b858f2db22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) | |||
511 | static bool set_nr_if_polling(struct task_struct *p) | 511 | static bool set_nr_if_polling(struct task_struct *p) |
512 | { | 512 | { |
513 | struct thread_info *ti = task_thread_info(p); | 513 | struct thread_info *ti = task_thread_info(p); |
514 | typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); | 514 | typeof(ti->flags) old, val = READ_ONCE(ti->flags); |
515 | 515 | ||
516 | for (;;) { | 516 | for (;;) { |
517 | if (!(val & _TIF_POLLING_NRFLAG)) | 517 | if (!(val & _TIF_POLLING_NRFLAG)) |
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p) | |||
541 | #endif | 541 | #endif |
542 | #endif | 542 | #endif |
543 | 543 | ||
544 | void wake_q_add(struct wake_q_head *head, struct task_struct *task) | ||
545 | { | ||
546 | struct wake_q_node *node = &task->wake_q; | ||
547 | |||
548 | /* | ||
549 | * Atomically grab the task, if ->wake_q is !nil already it means | ||
550 | * its already queued (either by us or someone else) and will get the | ||
551 | * wakeup due to that. | ||
552 | * | ||
553 | * This cmpxchg() implies a full barrier, which pairs with the write | ||
554 | * barrier implied by the wakeup in wake_up_list(). | ||
555 | */ | ||
556 | if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) | ||
557 | return; | ||
558 | |||
559 | get_task_struct(task); | ||
560 | |||
561 | /* | ||
562 | * The head is context local, there can be no concurrency. | ||
563 | */ | ||
564 | *head->lastp = node; | ||
565 | head->lastp = &node->next; | ||
566 | } | ||
567 | |||
568 | void wake_up_q(struct wake_q_head *head) | ||
569 | { | ||
570 | struct wake_q_node *node = head->first; | ||
571 | |||
572 | while (node != WAKE_Q_TAIL) { | ||
573 | struct task_struct *task; | ||
574 | |||
575 | task = container_of(node, struct task_struct, wake_q); | ||
576 | BUG_ON(!task); | ||
577 | /* task can safely be re-inserted now */ | ||
578 | node = node->next; | ||
579 | task->wake_q.next = NULL; | ||
580 | |||
581 | /* | ||
582 | * wake_up_process() implies a wmb() to pair with the queueing | ||
583 | * in wake_q_add() so as not to miss wakeups. | ||
584 | */ | ||
585 | wake_up_process(task); | ||
586 | put_task_struct(task); | ||
587 | } | ||
588 | } | ||
589 | |||
544 | /* | 590 | /* |
545 | * resched_curr - mark rq's current task 'to be rescheduled now'. | 591 | * resched_curr - mark rq's current task 'to be rescheduled now'. |
546 | * | 592 | * |
@@ -2397,9 +2443,9 @@ unsigned long nr_iowait_cpu(int cpu) | |||
2397 | 2443 | ||
2398 | void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) | 2444 | void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) |
2399 | { | 2445 | { |
2400 | struct rq *this = this_rq(); | 2446 | struct rq *rq = this_rq(); |
2401 | *nr_waiters = atomic_read(&this->nr_iowait); | 2447 | *nr_waiters = atomic_read(&rq->nr_iowait); |
2402 | *load = this->cpu_load[0]; | 2448 | *load = rq->load.weight; |
2403 | } | 2449 | } |
2404 | 2450 | ||
2405 | #ifdef CONFIG_SMP | 2451 | #ifdef CONFIG_SMP |
@@ -2497,6 +2543,7 @@ void scheduler_tick(void) | |||
2497 | update_rq_clock(rq); | 2543 | update_rq_clock(rq); |
2498 | curr->sched_class->task_tick(rq, curr, 0); | 2544 | curr->sched_class->task_tick(rq, curr, 0); |
2499 | update_cpu_load_active(rq); | 2545 | update_cpu_load_active(rq); |
2546 | calc_global_load_tick(rq); | ||
2500 | raw_spin_unlock(&rq->lock); | 2547 | raw_spin_unlock(&rq->lock); |
2501 | 2548 | ||
2502 | perf_event_task_tick(); | 2549 | perf_event_task_tick(); |
@@ -2525,7 +2572,7 @@ void scheduler_tick(void) | |||
2525 | u64 scheduler_tick_max_deferment(void) | 2572 | u64 scheduler_tick_max_deferment(void) |
2526 | { | 2573 | { |
2527 | struct rq *rq = this_rq(); | 2574 | struct rq *rq = this_rq(); |
2528 | unsigned long next, now = ACCESS_ONCE(jiffies); | 2575 | unsigned long next, now = READ_ONCE(jiffies); |
2529 | 2576 | ||
2530 | next = rq->last_sched_tick + HZ; | 2577 | next = rq->last_sched_tick + HZ; |
2531 | 2578 | ||
@@ -2726,9 +2773,7 @@ again: | |||
2726 | * - return from syscall or exception to user-space | 2773 | * - return from syscall or exception to user-space |
2727 | * - return from interrupt-handler to user-space | 2774 | * - return from interrupt-handler to user-space |
2728 | * | 2775 | * |
2729 | * WARNING: all callers must re-check need_resched() afterward and reschedule | 2776 | * WARNING: must be called with preemption disabled! |
2730 | * accordingly in case an event triggered the need for rescheduling (such as | ||
2731 | * an interrupt waking up a task) while preemption was disabled in __schedule(). | ||
2732 | */ | 2777 | */ |
2733 | static void __sched __schedule(void) | 2778 | static void __sched __schedule(void) |
2734 | { | 2779 | { |
@@ -2737,7 +2782,6 @@ static void __sched __schedule(void) | |||
2737 | struct rq *rq; | 2782 | struct rq *rq; |
2738 | int cpu; | 2783 | int cpu; |
2739 | 2784 | ||
2740 | preempt_disable(); | ||
2741 | cpu = smp_processor_id(); | 2785 | cpu = smp_processor_id(); |
2742 | rq = cpu_rq(cpu); | 2786 | rq = cpu_rq(cpu); |
2743 | rcu_note_context_switch(); | 2787 | rcu_note_context_switch(); |
@@ -2801,8 +2845,6 @@ static void __sched __schedule(void) | |||
2801 | raw_spin_unlock_irq(&rq->lock); | 2845 | raw_spin_unlock_irq(&rq->lock); |
2802 | 2846 | ||
2803 | post_schedule(rq); | 2847 | post_schedule(rq); |
2804 | |||
2805 | sched_preempt_enable_no_resched(); | ||
2806 | } | 2848 | } |
2807 | 2849 | ||
2808 | static inline void sched_submit_work(struct task_struct *tsk) | 2850 | static inline void sched_submit_work(struct task_struct *tsk) |
@@ -2823,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void) | |||
2823 | 2865 | ||
2824 | sched_submit_work(tsk); | 2866 | sched_submit_work(tsk); |
2825 | do { | 2867 | do { |
2868 | preempt_disable(); | ||
2826 | __schedule(); | 2869 | __schedule(); |
2870 | sched_preempt_enable_no_resched(); | ||
2827 | } while (need_resched()); | 2871 | } while (need_resched()); |
2828 | } | 2872 | } |
2829 | EXPORT_SYMBOL(schedule); | 2873 | EXPORT_SYMBOL(schedule); |
@@ -2862,15 +2906,14 @@ void __sched schedule_preempt_disabled(void) | |||
2862 | static void __sched notrace preempt_schedule_common(void) | 2906 | static void __sched notrace preempt_schedule_common(void) |
2863 | { | 2907 | { |
2864 | do { | 2908 | do { |
2865 | __preempt_count_add(PREEMPT_ACTIVE); | 2909 | preempt_active_enter(); |
2866 | __schedule(); | 2910 | __schedule(); |
2867 | __preempt_count_sub(PREEMPT_ACTIVE); | 2911 | preempt_active_exit(); |
2868 | 2912 | ||
2869 | /* | 2913 | /* |
2870 | * Check again in case we missed a preemption opportunity | 2914 | * Check again in case we missed a preemption opportunity |
2871 | * between schedule and now. | 2915 | * between schedule and now. |
2872 | */ | 2916 | */ |
2873 | barrier(); | ||
2874 | } while (need_resched()); | 2917 | } while (need_resched()); |
2875 | } | 2918 | } |
2876 | 2919 | ||
@@ -2917,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) | |||
2917 | return; | 2960 | return; |
2918 | 2961 | ||
2919 | do { | 2962 | do { |
2920 | __preempt_count_add(PREEMPT_ACTIVE); | 2963 | preempt_active_enter(); |
2921 | /* | 2964 | /* |
2922 | * Needs preempt disabled in case user_exit() is traced | 2965 | * Needs preempt disabled in case user_exit() is traced |
2923 | * and the tracer calls preempt_enable_notrace() causing | 2966 | * and the tracer calls preempt_enable_notrace() causing |
@@ -2927,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void) | |||
2927 | __schedule(); | 2970 | __schedule(); |
2928 | exception_exit(prev_ctx); | 2971 | exception_exit(prev_ctx); |
2929 | 2972 | ||
2930 | __preempt_count_sub(PREEMPT_ACTIVE); | 2973 | preempt_active_exit(); |
2931 | barrier(); | ||
2932 | } while (need_resched()); | 2974 | } while (need_resched()); |
2933 | } | 2975 | } |
2934 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | 2976 | EXPORT_SYMBOL_GPL(preempt_schedule_context); |
@@ -2952,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) | |||
2952 | prev_state = exception_enter(); | 2994 | prev_state = exception_enter(); |
2953 | 2995 | ||
2954 | do { | 2996 | do { |
2955 | __preempt_count_add(PREEMPT_ACTIVE); | 2997 | preempt_active_enter(); |
2956 | local_irq_enable(); | 2998 | local_irq_enable(); |
2957 | __schedule(); | 2999 | __schedule(); |
2958 | local_irq_disable(); | 3000 | local_irq_disable(); |
2959 | __preempt_count_sub(PREEMPT_ACTIVE); | 3001 | preempt_active_exit(); |
2960 | |||
2961 | /* | ||
2962 | * Check again in case we missed a preemption opportunity | ||
2963 | * between schedule and now. | ||
2964 | */ | ||
2965 | barrier(); | ||
2966 | } while (need_resched()); | 3002 | } while (need_resched()); |
2967 | 3003 | ||
2968 | exception_exit(prev_state); | 3004 | exception_exit(prev_state); |
@@ -5314,7 +5350,7 @@ static struct notifier_block migration_notifier = { | |||
5314 | .priority = CPU_PRI_MIGRATION, | 5350 | .priority = CPU_PRI_MIGRATION, |
5315 | }; | 5351 | }; |
5316 | 5352 | ||
5317 | static void __cpuinit set_cpu_rq_start_time(void) | 5353 | static void set_cpu_rq_start_time(void) |
5318 | { | 5354 | { |
5319 | int cpu = smp_processor_id(); | 5355 | int cpu = smp_processor_id(); |
5320 | struct rq *rq = cpu_rq(cpu); | 5356 | struct rq *rq = cpu_rq(cpu); |
@@ -7734,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg) | |||
7734 | return rt_runtime_us; | 7770 | return rt_runtime_us; |
7735 | } | 7771 | } |
7736 | 7772 | ||
7737 | static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | 7773 | static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) |
7738 | { | 7774 | { |
7739 | u64 rt_runtime, rt_period; | 7775 | u64 rt_runtime, rt_period; |
7740 | 7776 | ||
7741 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 7777 | rt_period = rt_period_us * NSEC_PER_USEC; |
7742 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 7778 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
7743 | 7779 | ||
7744 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7780 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
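The wake_q_add()/wake_up_q() helpers added above thread a singly linked list through task_struct::wake_q and use a reserved tail pointer both as list terminator and as an "already queued" marker for the cmpxchg(). The head and node types themselves are declared in include/linux/sched.h rather than in this diff, so the following is only a sketch of what the code above assumes them to look like, not part of the patch:

struct wake_q_node {
	struct wake_q_node *next;
};

struct wake_q_head {
	struct wake_q_node  *first;
	struct wake_q_node **lastp;
};

/*
 * Sentinel value: terminates the list and, being non-NULL, makes the
 * cmpxchg() in wake_q_add() fail for tasks that are already queued.
 */
#define WAKE_Q_TAIL ((struct wake_q_node *)0x01)

/* On-stack initializer, used as WAKE_Q(wake_q) in the futex code above. */
#define WAKE_Q(name) struct wake_q_head name = { WAKE_Q_TAIL, &name.first }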
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new) | |||
567 | { | 567 | { |
568 | cputime_t old; | 568 | cputime_t old; |
569 | 569 | ||
570 | while (new > (old = ACCESS_ONCE(*counter))) | 570 | while (new > (old = READ_ONCE(*counter))) |
571 | cmpxchg_cputime(counter, old, new); | 571 | cmpxchg_cputime(counter, old, new); |
572 | } | 572 | } |
573 | 573 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..890ce951c717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
995 | rq = cpu_rq(cpu); | 995 | rq = cpu_rq(cpu); |
996 | 996 | ||
997 | rcu_read_lock(); | 997 | rcu_read_lock(); |
998 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | 998 | curr = READ_ONCE(rq->curr); /* unlocked access */ |
999 | 999 | ||
1000 | /* | 1000 | /* |
1001 | * If we are dealing with a -deadline task, we must | 1001 | * If we are dealing with a -deadline task, we must |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) | |||
141 | * | 141 | * |
142 | * This idea comes from the SD scheduler of Con Kolivas: | 142 | * This idea comes from the SD scheduler of Con Kolivas: |
143 | */ | 143 | */ |
144 | static int get_update_sysctl_factor(void) | 144 | static unsigned int get_update_sysctl_factor(void) |
145 | { | 145 | { |
146 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | 146 | unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); |
147 | unsigned int factor; | 147 | unsigned int factor; |
148 | 148 | ||
149 | switch (sysctl_sched_tunable_scaling) { | 149 | switch (sysctl_sched_tunable_scaling) { |
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
576 | loff_t *ppos) | 576 | loff_t *ppos) |
577 | { | 577 | { |
578 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 578 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
579 | int factor = get_update_sysctl_factor(); | 579 | unsigned int factor = get_update_sysctl_factor(); |
580 | 580 | ||
581 | if (ret || !write) | 581 | if (ret || !write) |
582 | return ret; | 582 | return ret; |
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) | |||
834 | 834 | ||
835 | static unsigned int task_scan_min(struct task_struct *p) | 835 | static unsigned int task_scan_min(struct task_struct *p) |
836 | { | 836 | { |
837 | unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); | 837 | unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); |
838 | unsigned int scan, floor; | 838 | unsigned int scan, floor; |
839 | unsigned int windows = 1; | 839 | unsigned int windows = 1; |
840 | 840 | ||
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p) | |||
1794 | u64 runtime, period; | 1794 | u64 runtime, period; |
1795 | spinlock_t *group_lock = NULL; | 1795 | spinlock_t *group_lock = NULL; |
1796 | 1796 | ||
1797 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1797 | /* |
1798 | * The p->mm->numa_scan_seq field gets updated without | ||
1799 | * exclusive access. Use READ_ONCE() here to ensure | ||
1800 | * that the field is read in a single access: | ||
1801 | */ | ||
1802 | seq = READ_ONCE(p->mm->numa_scan_seq); | ||
1798 | if (p->numa_scan_seq == seq) | 1803 | if (p->numa_scan_seq == seq) |
1799 | return; | 1804 | return; |
1800 | p->numa_scan_seq = seq; | 1805 | p->numa_scan_seq = seq; |
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1938 | } | 1943 | } |
1939 | 1944 | ||
1940 | rcu_read_lock(); | 1945 | rcu_read_lock(); |
1941 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | 1946 | tsk = READ_ONCE(cpu_rq(cpu)->curr); |
1942 | 1947 | ||
1943 | if (!cpupid_match_pid(tsk, cpupid)) | 1948 | if (!cpupid_match_pid(tsk, cpupid)) |
1944 | goto no_join; | 1949 | goto no_join; |
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2107 | 2112 | ||
2108 | static void reset_ptenuma_scan(struct task_struct *p) | 2113 | static void reset_ptenuma_scan(struct task_struct *p) |
2109 | { | 2114 | { |
2110 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | 2115 | /* |
2116 | * We only did a read acquisition of the mmap sem, so | ||
2117 | * p->mm->numa_scan_seq is written to without exclusive access | ||
2118 | * and the update is not guaranteed to be atomic. That's not | ||
2119 | * much of an issue though, since this is just used for | ||
2120 | * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not | ||
2121 | * expensive, to avoid any form of compiler optimizations: | ||
2122 | */ | ||
2123 | WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); | ||
2111 | p->mm->numa_scan_offset = 0; | 2124 | p->mm->numa_scan_offset = 0; |
2112 | } | 2125 | } |
2113 | 2126 | ||
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4323 | } | 4336 | } |
4324 | 4337 | ||
4325 | #ifdef CONFIG_SMP | 4338 | #ifdef CONFIG_SMP |
4339 | |||
4340 | /* | ||
4341 | * per rq 'load' arrray crap; XXX kill this. | ||
4342 | */ | ||
4343 | |||
4344 | /* | ||
4345 | * The exact cpuload at various idx values, calculated at every tick would be | ||
4346 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
4347 | * | ||
4348 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
4349 | * on nth tick when cpu may be busy, then we have: | ||
4350 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
4351 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
4352 | * | ||
4353 | * decay_load_missed() below does efficient calculation of | ||
4354 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
4355 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
4356 | * | ||
4357 | * The calculation is approximated on a 128 point scale. | ||
4358 | * degrade_zero_ticks is the number of ticks after which load at any | ||
4359 | * particular idx is approximated to be zero. | ||
4360 | * degrade_factor is a precomputed table, a row for each load idx. | ||
4361 | * Each column corresponds to degradation factor for a power of two ticks, | ||
4362 | * based on 128 point scale. | ||
4363 | * Example: | ||
4364 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
4365 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
4366 | * | ||
4367 | * With this power of 2 load factors, we can degrade the load n times | ||
4368 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
4369 | * n mult/shifts needed by the exact degradation. | ||
4370 | */ | ||
4371 | #define DEGRADE_SHIFT 7 | ||
4372 | static const unsigned char | ||
4373 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
4374 | static const unsigned char | ||
4375 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
4376 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
4377 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
4378 | {96, 72, 40, 12, 1, 0, 0}, | ||
4379 | {112, 98, 75, 43, 15, 1, 0}, | ||
4380 | {120, 112, 98, 76, 45, 16, 2} }; | ||
4381 | |||
4382 | /* | ||
4383 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
4384 | * would be when CPU is idle and so we just decay the old load without | ||
4385 | * adding any new load. | ||
4386 | */ | ||
4387 | static unsigned long | ||
4388 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
4389 | { | ||
4390 | int j = 0; | ||
4391 | |||
4392 | if (!missed_updates) | ||
4393 | return load; | ||
4394 | |||
4395 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
4396 | return 0; | ||
4397 | |||
4398 | if (idx == 1) | ||
4399 | return load >> missed_updates; | ||
4400 | |||
4401 | while (missed_updates) { | ||
4402 | if (missed_updates % 2) | ||
4403 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
4404 | |||
4405 | missed_updates >>= 1; | ||
4406 | j++; | ||
4407 | } | ||
4408 | return load; | ||
4409 | } | ||
4410 | |||
4411 | /* | ||
4412 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
4413 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
4414 | * every tick. We fix it up based on jiffies. | ||
4415 | */ | ||
4416 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
4417 | unsigned long pending_updates) | ||
4418 | { | ||
4419 | int i, scale; | ||
4420 | |||
4421 | this_rq->nr_load_updates++; | ||
4422 | |||
4423 | /* Update our load: */ | ||
4424 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
4425 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
4426 | unsigned long old_load, new_load; | ||
4427 | |||
4428 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
4429 | |||
4430 | old_load = this_rq->cpu_load[i]; | ||
4431 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
4432 | new_load = this_load; | ||
4433 | /* | ||
4434 | * Round up the averaging division if load is increasing. This | ||
4435 | * prevents us from getting stuck on 9 if the load is 10, for | ||
4436 | * example. | ||
4437 | */ | ||
4438 | if (new_load > old_load) | ||
4439 | new_load += scale - 1; | ||
4440 | |||
4441 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
4442 | } | ||
4443 | |||
4444 | sched_avg_update(this_rq); | ||
4445 | } | ||
4446 | |||
4447 | #ifdef CONFIG_NO_HZ_COMMON | ||
4448 | /* | ||
4449 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
4450 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
4451 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
4452 | * | ||
4453 | * Therefore we cannot use the delta approach from the regular tick since that | ||
4454 | * would seriously skew the load calculation. However we'll make do for those | ||
4455 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
4456 | * (tick_nohz_idle_exit). | ||
4457 | * | ||
4458 | * This means we might still be one tick off for nohz periods. | ||
4459 | */ | ||
4460 | |||
4461 | /* | ||
4462 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
4463 | * idle balance. | ||
4464 | */ | ||
4465 | static void update_idle_cpu_load(struct rq *this_rq) | ||
4466 | { | ||
4467 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
4468 | unsigned long load = this_rq->cfs.runnable_load_avg; | ||
4469 | unsigned long pending_updates; | ||
4470 | |||
4471 | /* | ||
4472 | * bail if there's load or we're actually up-to-date. | ||
4473 | */ | ||
4474 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
4475 | return; | ||
4476 | |||
4477 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
4478 | this_rq->last_load_update_tick = curr_jiffies; | ||
4479 | |||
4480 | __update_cpu_load(this_rq, load, pending_updates); | ||
4481 | } | ||
4482 | |||
4483 | /* | ||
4484 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
4485 | */ | ||
4486 | void update_cpu_load_nohz(void) | ||
4487 | { | ||
4488 | struct rq *this_rq = this_rq(); | ||
4489 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
4490 | unsigned long pending_updates; | ||
4491 | |||
4492 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
4493 | return; | ||
4494 | |||
4495 | raw_spin_lock(&this_rq->lock); | ||
4496 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
4497 | if (pending_updates) { | ||
4498 | this_rq->last_load_update_tick = curr_jiffies; | ||
4499 | /* | ||
4500 | * We were idle, this means load 0, the current load might be | ||
4501 | * !0 due to remote wakeups and the sort. | ||
4502 | */ | ||
4503 | __update_cpu_load(this_rq, 0, pending_updates); | ||
4504 | } | ||
4505 | raw_spin_unlock(&this_rq->lock); | ||
4506 | } | ||
4507 | #endif /* CONFIG_NO_HZ */ | ||
4508 | |||
4509 | /* | ||
4510 | * Called from scheduler_tick() | ||
4511 | */ | ||
4512 | void update_cpu_load_active(struct rq *this_rq) | ||
4513 | { | ||
4514 | unsigned long load = this_rq->cfs.runnable_load_avg; | ||
4515 | /* | ||
4516 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
4517 | */ | ||
4518 | this_rq->last_load_update_tick = jiffies; | ||
4519 | __update_cpu_load(this_rq, load, 1); | ||
4520 | } | ||
4521 | |||
4326 | /* Used instead of source_load when we know the type == 0 */ | 4522 | /* Used instead of source_load when we know the type == 0 */ |
4327 | static unsigned long weighted_cpuload(const int cpu) | 4523 | static unsigned long weighted_cpuload(const int cpu) |
4328 | { | 4524 | { |
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu) | |||
4375 | static unsigned long cpu_avg_load_per_task(int cpu) | 4571 | static unsigned long cpu_avg_load_per_task(int cpu) |
4376 | { | 4572 | { |
4377 | struct rq *rq = cpu_rq(cpu); | 4573 | struct rq *rq = cpu_rq(cpu); |
4378 | unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); | 4574 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); |
4379 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4575 | unsigned long load_avg = rq->cfs.runnable_load_avg; |
4380 | 4576 | ||
4381 | if (nr_running) | 4577 | if (nr_running) |
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env) | |||
5467 | } | 5663 | } |
5468 | 5664 | ||
5469 | #ifdef CONFIG_NUMA_BALANCING | 5665 | #ifdef CONFIG_NUMA_BALANCING |
5470 | /* Returns true if the destination node has incurred more faults */ | 5666 | /* |
5667 | * Returns true if the destination node is the preferred node. | ||
5668 | * Needs to match fbq_classify_rq(): if there is a runnable task | ||
5669 | * that is not on its preferred node, we should identify it. | ||
5670 | */ | ||
5471 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | 5671 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) |
5472 | { | 5672 | { |
5473 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5673 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
5674 | unsigned long src_faults, dst_faults; | ||
5474 | int src_nid, dst_nid; | 5675 | int src_nid, dst_nid; |
5475 | 5676 | ||
5476 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | 5677 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || |
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
5484 | if (src_nid == dst_nid) | 5685 | if (src_nid == dst_nid) |
5485 | return false; | 5686 | return false; |
5486 | 5687 | ||
5487 | if (numa_group) { | ||
5488 | /* Task is already in the group's interleave set. */ | ||
5489 | if (node_isset(src_nid, numa_group->active_nodes)) | ||
5490 | return false; | ||
5491 | |||
5492 | /* Task is moving into the group's interleave set. */ | ||
5493 | if (node_isset(dst_nid, numa_group->active_nodes)) | ||
5494 | return true; | ||
5495 | |||
5496 | return group_faults(p, dst_nid) > group_faults(p, src_nid); | ||
5497 | } | ||
5498 | |||
5499 | /* Encourage migration to the preferred node. */ | 5688 | /* Encourage migration to the preferred node. */ |
5500 | if (dst_nid == p->numa_preferred_nid) | 5689 | if (dst_nid == p->numa_preferred_nid) |
5501 | return true; | 5690 | return true; |
5502 | 5691 | ||
5503 | return task_faults(p, dst_nid) > task_faults(p, src_nid); | 5692 | /* Migrating away from the preferred node is bad. */ |
5693 | if (src_nid == p->numa_preferred_nid) | ||
5694 | return false; | ||
5695 | |||
5696 | if (numa_group) { | ||
5697 | src_faults = group_faults(p, src_nid); | ||
5698 | dst_faults = group_faults(p, dst_nid); | ||
5699 | } else { | ||
5700 | src_faults = task_faults(p, src_nid); | ||
5701 | dst_faults = task_faults(p, dst_nid); | ||
5702 | } | ||
5703 | |||
5704 | return dst_faults > src_faults; | ||
5504 | } | 5705 | } |
5505 | 5706 | ||
5506 | 5707 | ||
5507 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | 5708 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) |
5508 | { | 5709 | { |
5509 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5710 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
5711 | unsigned long src_faults, dst_faults; | ||
5510 | int src_nid, dst_nid; | 5712 | int src_nid, dst_nid; |
5511 | 5713 | ||
5512 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5714 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5521 | if (src_nid == dst_nid) | 5723 | if (src_nid == dst_nid) |
5522 | return false; | 5724 | return false; |
5523 | 5725 | ||
5524 | if (numa_group) { | 5726 | /* Migrating away from the preferred node is bad. */ |
5525 | /* Task is moving within/into the group's interleave set. */ | 5727 | if (src_nid == p->numa_preferred_nid) |
5526 | if (node_isset(dst_nid, numa_group->active_nodes)) | 5728 | return true; |
5527 | return false; | ||
5528 | 5729 | ||
5529 | /* Task is moving out of the group's interleave set. */ | 5730 | /* Encourage migration to the preferred node. */ |
5530 | if (node_isset(src_nid, numa_group->active_nodes)) | 5731 | if (dst_nid == p->numa_preferred_nid) |
5531 | return true; | 5732 | return false; |
5532 | 5733 | ||
5533 | return group_faults(p, dst_nid) < group_faults(p, src_nid); | 5734 | if (numa_group) { |
5735 | src_faults = group_faults(p, src_nid); | ||
5736 | dst_faults = group_faults(p, dst_nid); | ||
5737 | } else { | ||
5738 | src_faults = task_faults(p, src_nid); | ||
5739 | dst_faults = task_faults(p, dst_nid); | ||
5534 | } | 5740 | } |
5535 | 5741 | ||
5536 | /* Migrating away from the preferred node is always bad. */ | 5742 | return dst_faults < src_faults; |
5537 | if (src_nid == p->numa_preferred_nid) | ||
5538 | return true; | ||
5539 | |||
5540 | return task_faults(p, dst_nid) < task_faults(p, src_nid); | ||
5541 | } | 5743 | } |
5542 | 5744 | ||
5543 | #else | 5745 | #else |
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu) | |||
6037 | * Since we're reading these variables without serialization make sure | 6239 | * Since we're reading these variables without serialization make sure |
6038 | * we read them once before doing sanity checks on them. | 6240 | * we read them once before doing sanity checks on them. |
6039 | */ | 6241 | */ |
6040 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 6242 | age_stamp = READ_ONCE(rq->age_stamp); |
6041 | avg = ACCESS_ONCE(rq->rt_avg); | 6243 | avg = READ_ONCE(rq->rt_avg); |
6042 | delta = __rq_clock_broken(rq) - age_stamp; | 6244 | delta = __rq_clock_broken(rq) - age_stamp; |
6043 | 6245 | ||
6044 | if (unlikely(delta < 0)) | 6246 | if (unlikely(delta < 0)) |
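For the cpu_load decay code moved into fair.c above, degrade_factor[idx][col] encodes ((2^idx - 1)/2^idx)^(2^col) on a 128-point scale, so decaying over n missed ticks costs one multiply/shift per set bit of n instead of n of them. A small userspace sketch that reproduces the computation (table and loop copied from the hunk, with the driver added purely for illustration):

#include <stdio.h>

#define DEGRADE_SHIFT 7
static const unsigned char degrade_zero_ticks[5] = { 0, 8, 32, 64, 128 };
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed;

	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* idx 2, 8 missed ticks: one table lookup, 1024 * 12 / 128 = 96 */
	printf("%lu\n", decay_load_missed(1024, 8, 2));
	return 0;
}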
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/proc.c | 2 | * kernel/sched/loadavg.c |
3 | * | 3 | * |
4 | * Kernel load calculations, forked from sched/core.c | 4 | * This file contains the magic bits required to compute the global loadavg |
5 | * figure. Its a silly number but people think its important. We go through | ||
6 | * great pains to make it work on big machines and tickless kernels. | ||
5 | */ | 7 | */ |
6 | 8 | ||
7 | #include <linux/export.h> | 9 | #include <linux/export.h> |
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq) | |||
81 | long nr_active, delta = 0; | 83 | long nr_active, delta = 0; |
82 | 84 | ||
83 | nr_active = this_rq->nr_running; | 85 | nr_active = this_rq->nr_running; |
84 | nr_active += (long) this_rq->nr_uninterruptible; | 86 | nr_active += (long)this_rq->nr_uninterruptible; |
85 | 87 | ||
86 | if (nr_active != this_rq->calc_load_active) { | 88 | if (nr_active != this_rq->calc_load_active) { |
87 | delta = nr_active - this_rq->calc_load_active; | 89 | delta = nr_active - this_rq->calc_load_active; |
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void) | |||
186 | delta = calc_load_fold_active(this_rq); | 188 | delta = calc_load_fold_active(this_rq); |
187 | if (delta) { | 189 | if (delta) { |
188 | int idx = calc_load_write_idx(); | 190 | int idx = calc_load_write_idx(); |
191 | |||
189 | atomic_long_add(delta, &calc_load_idle[idx]); | 192 | atomic_long_add(delta, &calc_load_idle[idx]); |
190 | } | 193 | } |
191 | } | 194 | } |
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | |||
241 | { | 244 | { |
242 | unsigned long result = 1UL << frac_bits; | 245 | unsigned long result = 1UL << frac_bits; |
243 | 246 | ||
244 | if (n) for (;;) { | 247 | if (n) { |
245 | if (n & 1) { | 248 | for (;;) { |
246 | result *= x; | 249 | if (n & 1) { |
247 | result += 1UL << (frac_bits - 1); | 250 | result *= x; |
248 | result >>= frac_bits; | 251 | result += 1UL << (frac_bits - 1); |
252 | result >>= frac_bits; | ||
253 | } | ||
254 | n >>= 1; | ||
255 | if (!n) | ||
256 | break; | ||
257 | x *= x; | ||
258 | x += 1UL << (frac_bits - 1); | ||
259 | x >>= frac_bits; | ||
249 | } | 260 | } |
250 | n >>= 1; | ||
251 | if (!n) | ||
252 | break; | ||
253 | x *= x; | ||
254 | x += 1UL << (frac_bits - 1); | ||
255 | x >>= frac_bits; | ||
256 | } | 261 | } |
257 | 262 | ||
258 | return result; | 263 | return result; |
@@ -285,7 +290,6 @@ static unsigned long | |||
285 | calc_load_n(unsigned long load, unsigned long exp, | 290 | calc_load_n(unsigned long load, unsigned long exp, |
286 | unsigned long active, unsigned int n) | 291 | unsigned long active, unsigned int n) |
287 | { | 292 | { |
288 | |||
289 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | 293 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); |
290 | } | 294 | } |
291 | 295 | ||
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { } | |||
339 | /* | 343 | /* |
340 | * calc_load - update the avenrun load estimates 10 ticks after the | 344 | * calc_load - update the avenrun load estimates 10 ticks after the |
341 | * CPUs have updated calc_load_tasks. | 345 | * CPUs have updated calc_load_tasks. |
346 | * | ||
347 | * Called from the global timer code. | ||
342 | */ | 348 | */ |
343 | void calc_global_load(unsigned long ticks) | 349 | void calc_global_load(unsigned long ticks) |
344 | { | 350 | { |
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks) | |||
370 | } | 376 | } |
371 | 377 | ||
372 | /* | 378 | /* |
373 | * Called from update_cpu_load() to periodically update this CPU's | 379 | * Called from scheduler_tick() to periodically update this CPU's |
374 | * active count. | 380 | * active count. |
375 | */ | 381 | */ |
376 | static void calc_load_account_active(struct rq *this_rq) | 382 | void calc_global_load_tick(struct rq *this_rq) |
377 | { | 383 | { |
378 | long delta; | 384 | long delta; |
379 | 385 | ||
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq) | |||
386 | 392 | ||
387 | this_rq->calc_load_update += LOAD_FREQ; | 393 | this_rq->calc_load_update += LOAD_FREQ; |
388 | } | 394 | } |
389 | |||
390 | /* | ||
391 | * End of global load-average stuff | ||
392 | */ | ||
393 | |||
394 | /* | ||
395 | * The exact cpuload at various idx values, calculated at every tick would be | ||
396 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
397 | * | ||
398 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
399 | * on nth tick when cpu may be busy, then we have: | ||
400 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
401 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
402 | * | ||
403 | * decay_load_missed() below does efficient calculation of | ||
404 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
405 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
406 | * | ||
407 | * The calculation is approximated on a 128 point scale. | ||
408 | * degrade_zero_ticks is the number of ticks after which load at any | ||
409 | * particular idx is approximated to be zero. | ||
410 | * degrade_factor is a precomputed table, a row for each load idx. | ||
411 | * Each column corresponds to degradation factor for a power of two ticks, | ||
412 | * based on 128 point scale. | ||
413 | * Example: | ||
414 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
415 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
416 | * | ||
417 | * With this power of 2 load factors, we can degrade the load n times | ||
418 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
419 | * n mult/shifts needed by the exact degradation. | ||
420 | */ | ||
421 | #define DEGRADE_SHIFT 7 | ||
422 | static const unsigned char | ||
423 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
424 | static const unsigned char | ||
425 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
426 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
427 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
428 | {96, 72, 40, 12, 1, 0, 0}, | ||
429 | {112, 98, 75, 43, 15, 1, 0}, | ||
430 | {120, 112, 98, 76, 45, 16, 2} }; | ||
431 | |||
432 | /* | ||
433 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
434 | * would be when CPU is idle and so we just decay the old load without | ||
435 | * adding any new load. | ||
436 | */ | ||
437 | static unsigned long | ||
438 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
439 | { | ||
440 | int j = 0; | ||
441 | |||
442 | if (!missed_updates) | ||
443 | return load; | ||
444 | |||
445 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
446 | return 0; | ||
447 | |||
448 | if (idx == 1) | ||
449 | return load >> missed_updates; | ||
450 | |||
451 | while (missed_updates) { | ||
452 | if (missed_updates % 2) | ||
453 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
454 | |||
455 | missed_updates >>= 1; | ||
456 | j++; | ||
457 | } | ||
458 | return load; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
463 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
464 | * every tick. We fix it up based on jiffies. | ||
465 | */ | ||
466 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
467 | unsigned long pending_updates) | ||
468 | { | ||
469 | int i, scale; | ||
470 | |||
471 | this_rq->nr_load_updates++; | ||
472 | |||
473 | /* Update our load: */ | ||
474 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
475 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
476 | unsigned long old_load, new_load; | ||
477 | |||
478 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
479 | |||
480 | old_load = this_rq->cpu_load[i]; | ||
481 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
482 | new_load = this_load; | ||
483 | /* | ||
484 | * Round up the averaging division if load is increasing. This | ||
485 | * prevents us from getting stuck on 9 if the load is 10, for | ||
486 | * example. | ||
487 | */ | ||
488 | if (new_load > old_load) | ||
489 | new_load += scale - 1; | ||
490 | |||
491 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
492 | } | ||
493 | |||
494 | sched_avg_update(this_rq); | ||
495 | } | ||
496 | |||
497 | #ifdef CONFIG_SMP | ||
498 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
499 | { | ||
500 | return rq->cfs.runnable_load_avg; | ||
501 | } | ||
502 | #else | ||
503 | static inline unsigned long get_rq_runnable_load(struct rq *rq) | ||
504 | { | ||
505 | return rq->load.weight; | ||
506 | } | ||
507 | #endif | ||
508 | |||
509 | #ifdef CONFIG_NO_HZ_COMMON | ||
510 | /* | ||
511 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
512 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
513 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
514 | * | ||
515 | * Therefore we cannot use the delta approach from the regular tick since that | ||
516 | * would seriously skew the load calculation. However we'll make do for those | ||
517 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
518 | * (tick_nohz_idle_exit). | ||
519 | * | ||
520 | * This means we might still be one tick off for nohz periods. | ||
521 | */ | ||
522 | |||
523 | /* | ||
524 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
525 | * idle balance. | ||
526 | */ | ||
527 | void update_idle_cpu_load(struct rq *this_rq) | ||
528 | { | ||
529 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
530 | unsigned long load = get_rq_runnable_load(this_rq); | ||
531 | unsigned long pending_updates; | ||
532 | |||
533 | /* | ||
534 | * bail if there's load or we're actually up-to-date. | ||
535 | */ | ||
536 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
537 | return; | ||
538 | |||
539 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
540 | this_rq->last_load_update_tick = curr_jiffies; | ||
541 | |||
542 | __update_cpu_load(this_rq, load, pending_updates); | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
547 | */ | ||
548 | void update_cpu_load_nohz(void) | ||
549 | { | ||
550 | struct rq *this_rq = this_rq(); | ||
551 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
552 | unsigned long pending_updates; | ||
553 | |||
554 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
555 | return; | ||
556 | |||
557 | raw_spin_lock(&this_rq->lock); | ||
558 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
559 | if (pending_updates) { | ||
560 | this_rq->last_load_update_tick = curr_jiffies; | ||
561 | /* | ||
562 | * We were idle, this means load 0, the current load might be | ||
563 | * !0 due to remote wakeups and the sort. | ||
564 | */ | ||
565 | __update_cpu_load(this_rq, 0, pending_updates); | ||
566 | } | ||
567 | raw_spin_unlock(&this_rq->lock); | ||
568 | } | ||
569 | #endif /* CONFIG_NO_HZ */ | ||
570 | |||
571 | /* | ||
572 | * Called from scheduler_tick() | ||
573 | */ | ||
574 | void update_cpu_load_active(struct rq *this_rq) | ||
575 | { | ||
576 | unsigned long load = get_rq_runnable_load(this_rq); | ||
577 | /* | ||
578 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
579 | */ | ||
580 | this_rq->last_load_update_tick = jiffies; | ||
581 | __update_cpu_load(this_rq, load, 1); | ||
582 | |||
583 | calc_load_account_active(this_rq); | ||
584 | } | ||
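What remains in loadavg.c after this split is the global load-average machinery: avenrun[] is an exponentially weighted moving average kept in FSHIFT-bit fixed point, and calc_load_n() above folds n missed LOAD_FREQ periods at once by raising the decay factor to the n-th power with fixed_power_int(). The underlying one-step update, calc_load(), is not in the hunks shown here, so the sketch below uses the usual kernel constants (FSHIFT, EXP_1) and should be read as an assumption rather than part of the patch:

#define FSHIFT	11			/* bits of fractional precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

/*
 * One LOAD_FREQ step of the moving average:
 *   load = load * exp + active * (1 - exp)
 * calc_load_n() applies this n times at once via exp^n from fixed_power_int().
 */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	unsigned long newload;

	newload = load * exp + active * (FIXED_1 - exp);
	if (active >= load)
		newload += FIXED_1 - 1;	/* round up while load is rising */

	return newload / FIXED_1;
}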
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
1323 | rq = cpu_rq(cpu); | 1323 | rq = cpu_rq(cpu); |
1324 | 1324 | ||
1325 | rcu_read_lock(); | 1325 | rcu_read_lock(); |
1326 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | 1326 | curr = READ_ONCE(rq->curr); /* unlocked access */ |
1327 | 1327 | ||
1328 | /* | 1328 | /* |
1329 | * If the current task on @p's runqueue is an RT task, then | 1329 | * If the current task on @p's runqueue is an RT task, then |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d85455539d5c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; | |||
26 | extern unsigned long calc_load_update; | 26 | extern unsigned long calc_load_update; |
27 | extern atomic_long_t calc_load_tasks; | 27 | extern atomic_long_t calc_load_tasks; |
28 | 28 | ||
29 | extern void calc_global_load_tick(struct rq *this_rq); | ||
29 | extern long calc_load_fold_active(struct rq *this_rq); | 30 | extern long calc_load_fold_active(struct rq *this_rq); |
31 | |||
32 | #ifdef CONFIG_SMP | ||
30 | extern void update_cpu_load_active(struct rq *this_rq); | 33 | extern void update_cpu_load_active(struct rq *this_rq); |
34 | #else | ||
35 | static inline void update_cpu_load_active(struct rq *this_rq) { } | ||
36 | #endif | ||
31 | 37 | ||
32 | /* | 38 | /* |
33 | * Helpers for converting nanosecond timing to jiffy resolution | 39 | * Helpers for converting nanosecond timing to jiffy resolution |
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | |||
707 | 713 | ||
708 | static inline u64 __rq_clock_broken(struct rq *rq) | 714 | static inline u64 __rq_clock_broken(struct rq *rq) |
709 | { | 715 | { |
710 | return ACCESS_ONCE(rq->clock); | 716 | return READ_ONCE(rq->clock); |
711 | } | 717 | } |
712 | 718 | ||
713 | static inline u64 rq_clock(struct rq *rq) | 719 | static inline u64 rq_clock(struct rq *rq) |
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
1298 | 1304 | ||
1299 | unsigned long to_ratio(u64 period, u64 runtime); | 1305 | unsigned long to_ratio(u64 period, u64 runtime); |
1300 | 1306 | ||
1301 | extern void update_idle_cpu_load(struct rq *this_rq); | ||
1302 | |||
1303 | extern void init_task_runnable_average(struct task_struct *p); | 1307 | extern void init_task_runnable_average(struct task_struct *p); |
1304 | 1308 | ||
1305 | static inline void add_nr_running(struct rq *rq, unsigned count) | 1309 | static inline void add_nr_running(struct rq *rq, unsigned count) |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) | |||
174 | { | 174 | { |
175 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 175 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
176 | 176 | ||
177 | if (!cputimer->running) | 177 | /* Check if cputimer isn't running. This is accessed without locking. */ |
178 | if (!READ_ONCE(cputimer->running)) | ||
178 | return false; | 179 | return false; |
179 | 180 | ||
180 | /* | 181 | /* |
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
215 | if (!cputimer_running(tsk)) | 216 | if (!cputimer_running(tsk)) |
216 | return; | 217 | return; |
217 | 218 | ||
218 | raw_spin_lock(&cputimer->lock); | 219 | atomic64_add(cputime, &cputimer->cputime_atomic.utime); |
219 | cputimer->cputime.utime += cputime; | ||
220 | raw_spin_unlock(&cputimer->lock); | ||
221 | } | 220 | } |
222 | 221 | ||
223 | /** | 222 | /** |
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
238 | if (!cputimer_running(tsk)) | 237 | if (!cputimer_running(tsk)) |
239 | return; | 238 | return; |
240 | 239 | ||
241 | raw_spin_lock(&cputimer->lock); | 240 | atomic64_add(cputime, &cputimer->cputime_atomic.stime); |
242 | cputimer->cputime.stime += cputime; | ||
243 | raw_spin_unlock(&cputimer->lock); | ||
244 | } | 241 | } |
245 | 242 | ||
246 | /** | 243 | /** |
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
261 | if (!cputimer_running(tsk)) | 258 | if (!cputimer_running(tsk)) |
262 | return; | 259 | return; |
263 | 260 | ||
264 | raw_spin_lock(&cputimer->lock); | 261 | atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); |
265 | cputimer->cputime.sum_exec_runtime += ns; | ||
266 | raw_spin_unlock(&cputimer->lock); | ||
267 | } | 262 | } |
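
The kernel/sched/stats.h hunks above replace a raw_spin_lock()-protected "+=" with a single atomic64_add() per field, so the accounting hot paths no longer serialise on cputimer->lock. A userspace C11 approximation of that pattern is sketched below; the struct and function names are invented for illustration. Readers then see per-field atomic values rather than a locked, mutually consistent snapshot, which the posix-cpu-timers changes later in this diff account for.

#include <stdatomic.h>
#include <stdint.h>

/* Sketch: three independent 64-bit accumulators, no lock required. */
struct cputime_atomic_sketch {
	_Atomic uint64_t utime;
	_Atomic uint64_t stime;
	_Atomic uint64_t sum_exec_runtime;
};

static inline void account_utime_sketch(struct cputime_atomic_sketch *ct,
					uint64_t delta)
{
	/* Relaxed ordering is enough for a pure statistics counter. */
	atomic_fetch_add_explicit(&ct->utime, delta, memory_order_relaxed);
}
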
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 852143a79f36..2ccec988d6b7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io); | |||
601 | 601 | ||
602 | __sched int bit_wait_timeout(struct wait_bit_key *word) | 602 | __sched int bit_wait_timeout(struct wait_bit_key *word) |
603 | { | 603 | { |
604 | unsigned long now = ACCESS_ONCE(jiffies); | 604 | unsigned long now = READ_ONCE(jiffies); |
605 | if (signal_pending_state(current->state, current)) | 605 | if (signal_pending_state(current->state, current)) |
606 | return 1; | 606 | return 1; |
607 | if (time_after_eq(now, word->timeout)) | 607 | if (time_after_eq(now, word->timeout)) |
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); | |||
613 | 613 | ||
614 | __sched int bit_wait_io_timeout(struct wait_bit_key *word) | 614 | __sched int bit_wait_io_timeout(struct wait_bit_key *word) |
615 | { | 615 | { |
616 | unsigned long now = ACCESS_ONCE(jiffies); | 616 | unsigned long now = READ_ONCE(jiffies); |
617 | if (signal_pending_state(current->state, current)) | 617 | if (signal_pending_state(current->state, current)) |
618 | return 1; | 618 | return 1; |
619 | if (time_after_eq(now, word->timeout)) | 619 | if (time_after_eq(now, word->timeout)) |
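
bit_wait_timeout() and bit_wait_io_timeout() above compare a READ_ONCE() snapshot of jiffies against word->timeout using time_after_eq(). The wrap-safe comparison behind that macro reduces to a signed subtraction; a stripped-down sketch with an invented name, omitting the typecheck() that the real <linux/jiffies.h> macro performs:

/* True once counter 'a' has reached or passed 'b', even if the
 * unsigned counter has wrapped in between. */
#define my_time_after_eq(a, b)	((long)((a) - (b)) >= 0)
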
diff --git a/kernel/signal.c b/kernel/signal.c index d51c5ddd855c..f19833b5db3c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig) | |||
245 | * RETURNS: | 245 | * RETURNS: |
246 | * %true if @mask is set, %false if made noop because @task was dying. | 246 | * %true if @mask is set, %false if made noop because @task was dying. |
247 | */ | 247 | */ |
248 | bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) | 248 | bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) |
249 | { | 249 | { |
250 | BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | | 250 | BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | |
251 | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); | 251 | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); |
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) | |||
297 | * CONTEXT: | 297 | * CONTEXT: |
298 | * Must be called with @task->sighand->siglock held. | 298 | * Must be called with @task->sighand->siglock held. |
299 | */ | 299 | */ |
300 | void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) | 300 | void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) |
301 | { | 301 | { |
302 | BUG_ON(mask & ~JOBCTL_PENDING_MASK); | 302 | BUG_ON(mask & ~JOBCTL_PENDING_MASK); |
303 | 303 | ||
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr) | |||
2000 | struct signal_struct *sig = current->signal; | 2000 | struct signal_struct *sig = current->signal; |
2001 | 2001 | ||
2002 | if (!(current->jobctl & JOBCTL_STOP_PENDING)) { | 2002 | if (!(current->jobctl & JOBCTL_STOP_PENDING)) { |
2003 | unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; | 2003 | unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; |
2004 | struct task_struct *t; | 2004 | struct task_struct *t; |
2005 | 2005 | ||
2006 | /* signr will be recorded in task->jobctl for retries */ | 2006 | /* signr will be recorded in task->jobctl for retries */ |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0075da74abf0..892e3dae0aac 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
196 | return 0; | 196 | return 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 199 | /* |
200 | * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg | ||
201 | * to avoid race conditions with concurrent updates to cputime. | ||
202 | */ | ||
203 | static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) | ||
200 | { | 204 | { |
201 | if (b->utime > a->utime) | 205 | u64 curr_cputime; |
202 | a->utime = b->utime; | 206 | retry: |
207 | curr_cputime = atomic64_read(cputime); | ||
208 | if (sum_cputime > curr_cputime) { | ||
209 | if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) | ||
210 | goto retry; | ||
211 | } | ||
212 | } | ||
203 | 213 | ||
204 | if (b->stime > a->stime) | 214 | static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) |
205 | a->stime = b->stime; | 215 | { |
216 | __update_gt_cputime(&cputime_atomic->utime, sum->utime); | ||
217 | __update_gt_cputime(&cputime_atomic->stime, sum->stime); | ||
218 | __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); | ||
219 | } | ||
206 | 220 | ||
207 | if (b->sum_exec_runtime > a->sum_exec_runtime) | 221 | /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ |
208 | a->sum_exec_runtime = b->sum_exec_runtime; | 222 | static inline void sample_cputime_atomic(struct task_cputime *times, |
223 | struct task_cputime_atomic *atomic_times) | ||
224 | { | ||
225 | times->utime = atomic64_read(&atomic_times->utime); | ||
226 | times->stime = atomic64_read(&atomic_times->stime); | ||
227 | times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); | ||
209 | } | 228 | } |
210 | 229 | ||
211 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | 230 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) |
212 | { | 231 | { |
213 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 232 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
214 | struct task_cputime sum; | 233 | struct task_cputime sum; |
215 | unsigned long flags; | ||
216 | 234 | ||
217 | if (!cputimer->running) { | 235 | /* Check if cputimer isn't running. This is accessed without locking. */ |
236 | if (!READ_ONCE(cputimer->running)) { | ||
218 | /* | 237 | /* |
219 | * The POSIX timer interface allows for absolute time expiry | 238 | * The POSIX timer interface allows for absolute time expiry |
220 | * values through the TIMER_ABSTIME flag, therefore we have | 239 | * values through the TIMER_ABSTIME flag, therefore we have |
221 | * to synchronize the timer to the clock every time we start | 240 | * to synchronize the timer to the clock every time we start it. |
222 | * it. | ||
223 | */ | 241 | */ |
224 | thread_group_cputime(tsk, &sum); | 242 | thread_group_cputime(tsk, &sum); |
225 | raw_spin_lock_irqsave(&cputimer->lock, flags); | 243 | update_gt_cputime(&cputimer->cputime_atomic, &sum); |
226 | cputimer->running = 1; | 244 | |
227 | update_gt_cputime(&cputimer->cputime, &sum); | 245 | /* |
228 | } else | 246 | * We're setting cputimer->running without a lock. Ensure |
229 | raw_spin_lock_irqsave(&cputimer->lock, flags); | 247 | * this only gets written to in one operation. We set |
230 | *times = cputimer->cputime; | 248 | * running after update_gt_cputime() as a small optimization, |
231 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); | 249 | * but barriers are not required because update_gt_cputime() |
250 | * can handle concurrent updates. | ||
251 | */ | ||
252 | WRITE_ONCE(cputimer->running, 1); | ||
253 | } | ||
254 | sample_cputime_atomic(times, &cputimer->cputime_atomic); | ||
232 | } | 255 | } |
233 | 256 | ||
234 | /* | 257 | /* |
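
__update_gt_cputime() above is a "never go backwards" update: read the current value and only install the new sample if it is larger, retrying when a concurrent writer got there first. A userspace C11 sketch of the same loop follows, with an invented function name; the kernel version does the equivalent with atomic64_cmpxchg() and a goto-based retry.

#include <stdatomic.h>
#include <stdint.h>

/* Advance *cur to sample if sample is larger; a failed CAS means a
 * concurrent update won, so re-check against the refreshed value. */
static void update_gt_sketch(_Atomic uint64_t *cur, uint64_t sample)
{
	uint64_t old = atomic_load_explicit(cur, memory_order_relaxed);

	while (sample > old &&
	       !atomic_compare_exchange_weak_explicit(cur, &old, sample,
						      memory_order_relaxed,
						      memory_order_relaxed))
		;	/* the failed CAS refreshed 'old'; loop re-checks */
}
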
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) | |||
582 | if (!task_cputime_zero(&tsk->cputime_expires)) | 605 | if (!task_cputime_zero(&tsk->cputime_expires)) |
583 | return false; | 606 | return false; |
584 | 607 | ||
585 | if (tsk->signal->cputimer.running) | 608 | /* Check if cputimer is running. This is accessed without locking. */ |
609 | if (READ_ONCE(tsk->signal->cputimer.running)) | ||
586 | return false; | 610 | return false; |
587 | 611 | ||
588 | return true; | 612 | return true; |
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
852 | /* | 876 | /* |
853 | * Check for the special case thread timers. | 877 | * Check for the special case thread timers. |
854 | */ | 878 | */ |
855 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); | 879 | soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); |
856 | if (soft != RLIM_INFINITY) { | 880 | if (soft != RLIM_INFINITY) { |
857 | unsigned long hard = | 881 | unsigned long hard = |
858 | ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); | 882 | READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); |
859 | 883 | ||
860 | if (hard != RLIM_INFINITY && | 884 | if (hard != RLIM_INFINITY && |
861 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | 885 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { |
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
882 | } | 906 | } |
883 | } | 907 | } |
884 | 908 | ||
885 | static void stop_process_timers(struct signal_struct *sig) | 909 | static inline void stop_process_timers(struct signal_struct *sig) |
886 | { | 910 | { |
887 | struct thread_group_cputimer *cputimer = &sig->cputimer; | 911 | struct thread_group_cputimer *cputimer = &sig->cputimer; |
888 | unsigned long flags; | ||
889 | 912 | ||
890 | raw_spin_lock_irqsave(&cputimer->lock, flags); | 913 | /* Turn off cputimer->running. This is done without locking. */ |
891 | cputimer->running = 0; | 914 | WRITE_ONCE(cputimer->running, 0); |
892 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); | ||
893 | } | 915 | } |
894 | 916 | ||
895 | static u32 onecputick; | 917 | static u32 onecputick; |
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk, | |||
958 | SIGPROF); | 980 | SIGPROF); |
959 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, | 981 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, |
960 | SIGVTALRM); | 982 | SIGVTALRM); |
961 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | 983 | soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
962 | if (soft != RLIM_INFINITY) { | 984 | if (soft != RLIM_INFINITY) { |
963 | unsigned long psecs = cputime_to_secs(ptime); | 985 | unsigned long psecs = cputime_to_secs(ptime); |
964 | unsigned long hard = | 986 | unsigned long hard = |
965 | ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); | 987 | READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); |
966 | cputime_t x; | 988 | cputime_t x; |
967 | if (psecs >= hard) { | 989 | if (psecs >= hard) { |
968 | /* | 990 | /* |
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1111 | } | 1133 | } |
1112 | 1134 | ||
1113 | sig = tsk->signal; | 1135 | sig = tsk->signal; |
1114 | if (sig->cputimer.running) { | 1136 | /* Check if cputimer is running. This is accessed without locking. */ |
1137 | if (READ_ONCE(sig->cputimer.running)) { | ||
1115 | struct task_cputime group_sample; | 1138 | struct task_cputime group_sample; |
1116 | 1139 | ||
1117 | raw_spin_lock(&sig->cputimer.lock); | 1140 | sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); |
1118 | group_sample = sig->cputimer.cputime; | ||
1119 | raw_spin_unlock(&sig->cputimer.lock); | ||
1120 | 1141 | ||
1121 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1142 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1122 | return 1; | 1143 | return 1; |
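
fastpath_timer_check() now takes its group sample with sample_cputime_atomic() instead of copying the struct under cputimer.lock. The three loads are individually atomic but not a consistent triple; that is acceptable for a fast path that only needs a "might anything have expired?" answer before falling back to the locked slow path. A self-contained C11 sketch with invented names:

#include <stdatomic.h>
#include <stdint.h>

struct atomic_cputime_sketch {
	_Atomic uint64_t utime, stime, sum_exec_runtime;
};

struct cputime_sample {
	uint64_t utime, stime, sum_exec_runtime;
};

/* Lockless snapshot: three relaxed loads, possibly from different
 * instants, which the expiry check tolerates. */
static void sample_sketch(struct cputime_sample *out,
			  struct atomic_cputime_sketch *in)
{
	out->utime            = atomic_load_explicit(&in->utime, memory_order_relaxed);
	out->stime            = atomic_load_explicit(&in->stime, memory_order_relaxed);
	out->sum_exec_runtime = atomic_load_explicit(&in->sum_exec_runtime, memory_order_relaxed);
}
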
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1157 | * If there are any active process wide timers (POSIX 1.b, itimers, | 1178 | * If there are any active process wide timers (POSIX 1.b, itimers, |
1158 | * RLIMIT_CPU) cputimer must be running. | 1179 | * RLIMIT_CPU) cputimer must be running. |
1159 | */ | 1180 | */ |
1160 | if (tsk->signal->cputimer.running) | 1181 | if (READ_ONCE(tsk->signal->cputimer.running)) |
1161 | check_process_timers(tsk, &firing); | 1182 | check_process_timers(tsk, &firing); |
1162 | 1183 | ||
1163 | /* | 1184 | /* |