Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c                                             |   8
-rw-r--r--  kernel/futex.c                                            |  33
-rw-r--r--  kernel/sched/Makefile                                     |   2
-rw-r--r--  kernel/sched/auto_group.c                                 |   6
-rw-r--r--  kernel/sched/auto_group.h                                 |   2
-rw-r--r--  kernel/sched/core.c                                       |  92
-rw-r--r--  kernel/sched/cputime.c                                    |   2
-rw-r--r--  kernel/sched/deadline.c                                   |   2
-rw-r--r--  kernel/sched/fair.c                                       | 276
-rw-r--r--  kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) | 236
-rw-r--r--  kernel/sched/rt.c                                         |   2
-rw-r--r--  kernel/sched/sched.h                                      |  10
-rw-r--r--  kernel/sched/stats.h                                      |  15
-rw-r--r--  kernel/sched/wait.c                                       |   4
-rw-r--r--  kernel/signal.c                                           |   6
-rw-r--r--  kernel/time/posix-cpu-timers.c                            |  87
16 files changed, 424 insertions(+), 359 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1091{ 1091{
1092 unsigned long cpu_limit; 1092 unsigned long cpu_limit;
1093 1093
1094 /* Thread group counters. */ 1094 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1095 thread_group_cputime_init(sig);
1096
1097 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1098 if (cpu_limit != RLIM_INFINITY) { 1095 if (cpu_limit != RLIM_INFINITY) {
1099 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1096 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1100 sig->cputimer.running = 1; 1097 sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1396 p->hardirq_context = 0; 1393 p->hardirq_context = 0;
1397 p->softirq_context = 0; 1394 p->softirq_context = 0;
1398#endif 1395#endif
1396
1397 p->pagefault_disabled = 0;
1398
1399#ifdef CONFIG_LOCKDEP 1399#ifdef CONFIG_LOCKDEP
1400 p->lockdep_depth = 0; /* no locks held yet */ 1400 p->lockdep_depth = 0; /* no locks held yet */
1401 p->curr_chain_key = 0; 1401 p->curr_chain_key = 0;
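Two things happen in the fork.c hunks above: posix_cpu_timers_init_group() drops the thread_group_cputime_init() call (now that the cputimer's spinlock is replaced by atomics in the stats.h and posix-cpu-timers.c hunks below, there is no per-group lock left for it to initialise), and copy_process() starts zeroing the new p->pagefault_disabled counter, part of moving pagefault_disable() off the preempt count in this same merge window. The ACCESS_ONCE() -> READ_ONCE() switch is the other recurring theme of the whole diff (futex.c, auto_group, core.c, cputime.c, deadline.c, fair.c, rt.c, sched.h, wait.c, posix-cpu-timers.c). Both primitives force a single, untorn load and forbid the compiler from refetching; READ_ONCE() is preferred because ACCESS_ONCE() misbehaves on non-scalar types with some compilers. Conceptually it is just a volatile cast; the sketch below is a simplification with made-up names, not the kernel's actual implementation (which routes through __read_once_size() to also handle small aggregates):

/* Simplified idea only -- MY_READ_ONCE/MY_WRITE_ONCE are illustrative names. */
#define MY_READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
#define MY_WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

With that, cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur) above is guaranteed to be one load of a value that setrlimit() may be changing concurrently.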
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..f9984c363e9a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
1090 1090
1091/* 1091/*
1092 * The hash bucket lock must be held when this is called. 1092 * The hash bucket lock must be held when this is called.
1093 * Afterwards, the futex_q must not be accessed. 1093 * Afterwards, the futex_q must not be accessed. Callers
1094 * must ensure to later call wake_up_q() for the actual
1095 * wakeups to occur.
1094 */ 1096 */
1095static void wake_futex(struct futex_q *q) 1097static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1096{ 1098{
1097 struct task_struct *p = q->task; 1099 struct task_struct *p = q->task;
1098 1100
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
1100 return; 1102 return;
1101 1103
1102 /* 1104 /*
1103 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1105 * Queue the task for later wakeup for after we've released
1104 * a non-futex wake up happens on another CPU then the task 1106 * the hb->lock. wake_q_add() grabs reference to p.
1105 * might exit and p would dereference a non-existing task
1106 * struct. Prevent this by holding a reference on p across the
1107 * wake up.
1108 */ 1107 */
1109 get_task_struct(p); 1108 wake_q_add(wake_q, p);
1110
1111 __unqueue_futex(q); 1109 __unqueue_futex(q);
1112 /* 1110 /*
1113 * The waiting task can free the futex_q as soon as 1111 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
1117 */ 1115 */
1118 smp_wmb(); 1116 smp_wmb();
1119 q->lock_ptr = NULL; 1117 q->lock_ptr = NULL;
1120
1121 wake_up_state(p, TASK_NORMAL);
1122 put_task_struct(p);
1123} 1118}
1124 1119
1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1120static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1217 struct futex_q *this, *next; 1212 struct futex_q *this, *next;
1218 union futex_key key = FUTEX_KEY_INIT; 1213 union futex_key key = FUTEX_KEY_INIT;
1219 int ret; 1214 int ret;
1215 WAKE_Q(wake_q);
1220 1216
1221 if (!bitset) 1217 if (!bitset)
1222 return -EINVAL; 1218 return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1244 if (!(this->bitset & bitset)) 1240 if (!(this->bitset & bitset))
1245 continue; 1241 continue;
1246 1242
1247 wake_futex(this); 1243 mark_wake_futex(&wake_q, this);
1248 if (++ret >= nr_wake) 1244 if (++ret >= nr_wake)
1249 break; 1245 break;
1250 } 1246 }
1251 } 1247 }
1252 1248
1253 spin_unlock(&hb->lock); 1249 spin_unlock(&hb->lock);
1250 wake_up_q(&wake_q);
1254out_put_key: 1251out_put_key:
1255 put_futex_key(&key); 1252 put_futex_key(&key);
1256out: 1253out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1269 struct futex_hash_bucket *hb1, *hb2; 1266 struct futex_hash_bucket *hb1, *hb2;
1270 struct futex_q *this, *next; 1267 struct futex_q *this, *next;
1271 int ret, op_ret; 1268 int ret, op_ret;
1269 WAKE_Q(wake_q);
1272 1270
1273retry: 1271retry:
1274 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1272 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
1320 ret = -EINVAL; 1318 ret = -EINVAL;
1321 goto out_unlock; 1319 goto out_unlock;
1322 } 1320 }
1323 wake_futex(this); 1321 mark_wake_futex(&wake_q, this);
1324 if (++ret >= nr_wake) 1322 if (++ret >= nr_wake)
1325 break; 1323 break;
1326 } 1324 }
@@ -1334,7 +1332,7 @@ retry_private:
1334 ret = -EINVAL; 1332 ret = -EINVAL;
1335 goto out_unlock; 1333 goto out_unlock;
1336 } 1334 }
1337 wake_futex(this); 1335 mark_wake_futex(&wake_q, this);
1338 if (++op_ret >= nr_wake2) 1336 if (++op_ret >= nr_wake2)
1339 break; 1337 break;
1340 } 1338 }
@@ -1344,6 +1342,7 @@ retry_private:
1344 1342
1345out_unlock: 1343out_unlock:
1346 double_unlock_hb(hb1, hb2); 1344 double_unlock_hb(hb1, hb2);
1345 wake_up_q(&wake_q);
1347out_put_keys: 1346out_put_keys:
1348 put_futex_key(&key2); 1347 put_futex_key(&key2);
1349out_put_key1: 1348out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1503 struct futex_pi_state *pi_state = NULL; 1502 struct futex_pi_state *pi_state = NULL;
1504 struct futex_hash_bucket *hb1, *hb2; 1503 struct futex_hash_bucket *hb1, *hb2;
1505 struct futex_q *this, *next; 1504 struct futex_q *this, *next;
1505 WAKE_Q(wake_q);
1506 1506
1507 if (requeue_pi) { 1507 if (requeue_pi) {
1508 /* 1508 /*
@@ -1679,7 +1679,7 @@ retry_private:
1679 * woken by futex_unlock_pi(). 1679 * woken by futex_unlock_pi().
1680 */ 1680 */
1681 if (++task_count <= nr_wake && !requeue_pi) { 1681 if (++task_count <= nr_wake && !requeue_pi) {
1682 wake_futex(this); 1682 mark_wake_futex(&wake_q, this);
1683 continue; 1683 continue;
1684 } 1684 }
1685 1685
@@ -1719,6 +1719,7 @@ retry_private:
1719out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state); 1720 free_pi_state(pi_state);
1721 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1722 wake_up_q(&wake_q);
1722 hb_waiters_dec(hb2); 1723 hb_waiters_dec(hb2);
1723 1724
1724 /* 1725 /*
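In futex.c, wake_futex() becomes mark_wake_futex(): instead of calling wake_up_state() with the hash-bucket lock held, each waiter is only queued on an on-stack wake_q (wake_q_add() takes the task reference that the old get_task_struct()/put_task_struct() pair provided), and the real wakeups happen in wake_up_q() after hb->lock (or both locks, for the wake_op and requeue paths) has been dropped. That shortens the lock hold time and avoids waking a task only to have it immediately contend on the still-held bucket lock. A minimal sketch of the pattern follows; wake_two() and @lock are made up for illustration, while WAKE_Q(), wake_q_add() and wake_up_q() are the real primitives implemented in the core.c hunk below:

#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustration only: wake two known tasks after dropping @lock. */
static void wake_two(spinlock_t *lock,
		     struct task_struct *t1, struct task_struct *t2)
{
	WAKE_Q(wake_q);			/* on-stack, context-local queue */

	spin_lock(lock);
	/* ... decide whom to wake while still holding the lock ... */
	wake_q_add(&wake_q, t1);	/* grabs a reference on t1 */
	wake_q_add(&wake_q, t2);	/* already-queued tasks are skipped */
	spin_unlock(lock);

	wake_up_q(&wake_q);		/* wake_up_process() + put, lock dropped */
}

Because wake_q_add() claims a task with cmpxchg() on task->wake_q.next, queueing the same task twice (even from two different wakers) is harmless: the second add is a no-op and the task still gets exactly one wakeup.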
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o loadavg.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o idle.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h" 1#include "sched.h"
4 2
5#include <linux/proc_fs.h> 3#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
141 139
142 p->signal->autogroup = autogroup_kref_get(ag); 140 p->signal->autogroup = autogroup_kref_get(ag);
143 141
144 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 142 if (!READ_ONCE(sysctl_sched_autogroup_enabled))
145 goto out; 143 goto out;
146 144
147 for_each_thread(p, t) 145 for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
249 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 247 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
250} 248}
251#endif /* CONFIG_SCHED_DEBUG */ 249#endif /* CONFIG_SCHED_DEBUG */
252
253#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
29static inline struct task_group * 29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg) 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{ 31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 32 int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
33 33
34 if (enabled && task_wants_autogroup(p, tg)) 34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg; 35 return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffb..20b858f2db22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
511static bool set_nr_if_polling(struct task_struct *p) 511static bool set_nr_if_polling(struct task_struct *p)
512{ 512{
513 struct thread_info *ti = task_thread_info(p); 513 struct thread_info *ti = task_thread_info(p);
514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 514 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
515 515
516 for (;;) { 516 for (;;) {
517 if (!(val & _TIF_POLLING_NRFLAG)) 517 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
541#endif 541#endif
542#endif 542#endif
543 543
544void wake_q_add(struct wake_q_head *head, struct task_struct *task)
545{
546 struct wake_q_node *node = &task->wake_q;
547
548 /*
549 * Atomically grab the task, if ->wake_q is !nil already it means
550 * its already queued (either by us or someone else) and will get the
551 * wakeup due to that.
552 *
553 * This cmpxchg() implies a full barrier, which pairs with the write
554 * barrier implied by the wakeup in wake_up_list().
555 */
556 if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
557 return;
558
559 get_task_struct(task);
560
561 /*
562 * The head is context local, there can be no concurrency.
563 */
564 *head->lastp = node;
565 head->lastp = &node->next;
566}
567
568void wake_up_q(struct wake_q_head *head)
569{
570 struct wake_q_node *node = head->first;
571
572 while (node != WAKE_Q_TAIL) {
573 struct task_struct *task;
574
575 task = container_of(node, struct task_struct, wake_q);
576 BUG_ON(!task);
577 /* task can safely be re-inserted now */
578 node = node->next;
579 task->wake_q.next = NULL;
580
581 /*
582 * wake_up_process() implies a wmb() to pair with the queueing
583 * in wake_q_add() so as not to miss wakeups.
584 */
585 wake_up_process(task);
586 put_task_struct(task);
587 }
588}
589
544/* 590/*
545 * resched_curr - mark rq's current task 'to be rescheduled now'. 591 * resched_curr - mark rq's current task 'to be rescheduled now'.
546 * 592 *
@@ -2397,9 +2443,9 @@ unsigned long nr_iowait_cpu(int cpu)
2397 2443
2398void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2444void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2399{ 2445{
2400 struct rq *this = this_rq(); 2446 struct rq *rq = this_rq();
2401 *nr_waiters = atomic_read(&this->nr_iowait); 2447 *nr_waiters = atomic_read(&rq->nr_iowait);
2402 *load = this->cpu_load[0]; 2448 *load = rq->load.weight;
2403} 2449}
2404 2450
2405#ifdef CONFIG_SMP 2451#ifdef CONFIG_SMP
@@ -2497,6 +2543,7 @@ void scheduler_tick(void)
2497 update_rq_clock(rq); 2543 update_rq_clock(rq);
2498 curr->sched_class->task_tick(rq, curr, 0); 2544 curr->sched_class->task_tick(rq, curr, 0);
2499 update_cpu_load_active(rq); 2545 update_cpu_load_active(rq);
2546 calc_global_load_tick(rq);
2500 raw_spin_unlock(&rq->lock); 2547 raw_spin_unlock(&rq->lock);
2501 2548
2502 perf_event_task_tick(); 2549 perf_event_task_tick();
@@ -2525,7 +2572,7 @@ void scheduler_tick(void)
2525u64 scheduler_tick_max_deferment(void) 2572u64 scheduler_tick_max_deferment(void)
2526{ 2573{
2527 struct rq *rq = this_rq(); 2574 struct rq *rq = this_rq();
2528 unsigned long next, now = ACCESS_ONCE(jiffies); 2575 unsigned long next, now = READ_ONCE(jiffies);
2529 2576
2530 next = rq->last_sched_tick + HZ; 2577 next = rq->last_sched_tick + HZ;
2531 2578
@@ -2726,9 +2773,7 @@ again:
2726 * - return from syscall or exception to user-space 2773 * - return from syscall or exception to user-space
2727 * - return from interrupt-handler to user-space 2774 * - return from interrupt-handler to user-space
2728 * 2775 *
2729 * WARNING: all callers must re-check need_resched() afterward and reschedule 2776 * WARNING: must be called with preemption disabled!
2730 * accordingly in case an event triggered the need for rescheduling (such as
2731 * an interrupt waking up a task) while preemption was disabled in __schedule().
2732 */ 2777 */
2733static void __sched __schedule(void) 2778static void __sched __schedule(void)
2734{ 2779{
@@ -2737,7 +2782,6 @@ static void __sched __schedule(void)
2737 struct rq *rq; 2782 struct rq *rq;
2738 int cpu; 2783 int cpu;
2739 2784
2740 preempt_disable();
2741 cpu = smp_processor_id(); 2785 cpu = smp_processor_id();
2742 rq = cpu_rq(cpu); 2786 rq = cpu_rq(cpu);
2743 rcu_note_context_switch(); 2787 rcu_note_context_switch();
@@ -2801,8 +2845,6 @@ static void __sched __schedule(void)
2801 raw_spin_unlock_irq(&rq->lock); 2845 raw_spin_unlock_irq(&rq->lock);
2802 2846
2803 post_schedule(rq); 2847 post_schedule(rq);
2804
2805 sched_preempt_enable_no_resched();
2806} 2848}
2807 2849
2808static inline void sched_submit_work(struct task_struct *tsk) 2850static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void)
2823 2865
2824 sched_submit_work(tsk); 2866 sched_submit_work(tsk);
2825 do { 2867 do {
2868 preempt_disable();
2826 __schedule(); 2869 __schedule();
2870 sched_preempt_enable_no_resched();
2827 } while (need_resched()); 2871 } while (need_resched());
2828} 2872}
2829EXPORT_SYMBOL(schedule); 2873EXPORT_SYMBOL(schedule);
@@ -2862,15 +2906,14 @@ void __sched schedule_preempt_disabled(void)
2862static void __sched notrace preempt_schedule_common(void) 2906static void __sched notrace preempt_schedule_common(void)
2863{ 2907{
2864 do { 2908 do {
2865 __preempt_count_add(PREEMPT_ACTIVE); 2909 preempt_active_enter();
2866 __schedule(); 2910 __schedule();
2867 __preempt_count_sub(PREEMPT_ACTIVE); 2911 preempt_active_exit();
2868 2912
2869 /* 2913 /*
2870 * Check again in case we missed a preemption opportunity 2914 * Check again in case we missed a preemption opportunity
2871 * between schedule and now. 2915 * between schedule and now.
2872 */ 2916 */
2873 barrier();
2874 } while (need_resched()); 2917 } while (need_resched());
2875} 2918}
2876 2919
@@ -2917,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2917 return; 2960 return;
2918 2961
2919 do { 2962 do {
2920 __preempt_count_add(PREEMPT_ACTIVE); 2963 preempt_active_enter();
2921 /* 2964 /*
2922 * Needs preempt disabled in case user_exit() is traced 2965 * Needs preempt disabled in case user_exit() is traced
2923 * and the tracer calls preempt_enable_notrace() causing 2966 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2927 __schedule(); 2970 __schedule();
2928 exception_exit(prev_ctx); 2971 exception_exit(prev_ctx);
2929 2972
2930 __preempt_count_sub(PREEMPT_ACTIVE); 2973 preempt_active_exit();
2931 barrier();
2932 } while (need_resched()); 2974 } while (need_resched());
2933} 2975}
2934EXPORT_SYMBOL_GPL(preempt_schedule_context); 2976EXPORT_SYMBOL_GPL(preempt_schedule_context);
@@ -2952,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
2952 prev_state = exception_enter(); 2994 prev_state = exception_enter();
2953 2995
2954 do { 2996 do {
2955 __preempt_count_add(PREEMPT_ACTIVE); 2997 preempt_active_enter();
2956 local_irq_enable(); 2998 local_irq_enable();
2957 __schedule(); 2999 __schedule();
2958 local_irq_disable(); 3000 local_irq_disable();
2959 __preempt_count_sub(PREEMPT_ACTIVE); 3001 preempt_active_exit();
2960
2961 /*
2962 * Check again in case we missed a preemption opportunity
2963 * between schedule and now.
2964 */
2965 barrier();
2966 } while (need_resched()); 3002 } while (need_resched());
2967 3003
2968 exception_exit(prev_state); 3004 exception_exit(prev_state);
@@ -5314,7 +5350,7 @@ static struct notifier_block migration_notifier = {
5314 .priority = CPU_PRI_MIGRATION, 5350 .priority = CPU_PRI_MIGRATION,
5315}; 5351};
5316 5352
5317static void __cpuinit set_cpu_rq_start_time(void) 5353static void set_cpu_rq_start_time(void)
5318{ 5354{
5319 int cpu = smp_processor_id(); 5355 int cpu = smp_processor_id();
5320 struct rq *rq = cpu_rq(cpu); 5356 struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
7734 return rt_runtime_us; 7770 return rt_runtime_us;
7735} 7771}
7736 7772
7737static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7773static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
7738{ 7774{
7739 u64 rt_runtime, rt_period; 7775 u64 rt_runtime, rt_period;
7740 7776
7741 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7777 rt_period = rt_period_us * NSEC_PER_USEC;
7742 rt_runtime = tg->rt_bandwidth.rt_runtime; 7778 rt_runtime = tg->rt_bandwidth.rt_runtime;
7743 7779
7744 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7780 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
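The core.c hunk adds the wake_q implementation used by futex.c above: wake_q_add() links task->wake_q onto a singly linked, on-stack list, claiming the task via cmpxchg() so concurrent wakers cannot double-queue it, and wake_up_q() later walks the list, calling wake_up_process() and dropping the reference for each entry. The companion declarations live in include/linux/sched.h, which is outside this kernel/-only diff; approximately (reproduced here for context, not guaranteed verbatim):

struct wake_q_node {
	struct wake_q_node *next;	/* embedded in task_struct as ->wake_q */
};

struct wake_q_head {
	struct wake_q_node *first;
	struct wake_q_node **lastp;
};

#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)

#define WAKE_Q(name)	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }

extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_up_q(struct wake_q_head *head);

The rest of the core.c changes are the PREEMPT_ACTIVE bookkeeping moving into preempt_active_enter()/preempt_active_exit() (defined in include/linux/preempt.h, also outside this diff), __schedule() now requiring its callers to have preemption disabled, scheduler_tick() calling the new calc_global_load_tick(), and get_iowait_load() reporting rq->load.weight instead of cpu_load[0].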
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
567{ 567{
568 cputime_t old; 568 cputime_t old;
569 569
570 while (new > (old = ACCESS_ONCE(*counter))) 570 while (new > (old = READ_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new); 571 cmpxchg_cputime(counter, old, new);
572} 572}
573 573
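cputime_advance() above is a lock-free "monotonic maximum": it only ever moves *counter forward, and it retries when the cmpxchg() observes that somebody else advanced the counter first. The same idiom shows up below as __update_gt_cputime() in posix-cpu-timers.c. Spelled out as a generic helper to make the retry logic explicit (advance_max() is a made-up name; the atomic64_t flavour matches the posix-cpu-timers variant rather than the cputime_t one here):

#include <linux/atomic.h>

/* Advance *counter to @new unless it is already >= @new. */
static void advance_max(atomic64_t *counter, u64 new)
{
	u64 old = atomic64_read(counter);

	while (old < new) {
		u64 seen = atomic64_cmpxchg(counter, old, new);

		if (seen == old)	/* we installed @new */
			break;
		old = seen;		/* lost the race; re-check against winner */
	}
}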
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..890ce951c717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
995 rq = cpu_rq(cpu); 995 rq = cpu_rq(cpu);
996 996
997 rcu_read_lock(); 997 rcu_read_lock();
998 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 998 curr = READ_ONCE(rq->curr); /* unlocked access */
999 999
1000 /* 1000 /*
1001 * If we are dealing with a -deadline task, we must 1001 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..0d4632f7799b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
141 * 141 *
142 * This idea comes from the SD scheduler of Con Kolivas: 142 * This idea comes from the SD scheduler of Con Kolivas:
143 */ 143 */
144static int get_update_sysctl_factor(void) 144static unsigned int get_update_sysctl_factor(void)
145{ 145{
146 unsigned int cpus = min_t(int, num_online_cpus(), 8); 146 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 unsigned int factor; 147 unsigned int factor;
148 148
149 switch (sysctl_sched_tunable_scaling) { 149 switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
576 loff_t *ppos) 576 loff_t *ppos)
577{ 577{
578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 578 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 int factor = get_update_sysctl_factor(); 579 unsigned int factor = get_update_sysctl_factor();
580 580
581 if (ret || !write) 581 if (ret || !write)
582 return ret; 582 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
834 834
835static unsigned int task_scan_min(struct task_struct *p) 835static unsigned int task_scan_min(struct task_struct *p)
836{ 836{
837 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); 837 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
838 unsigned int scan, floor; 838 unsigned int scan, floor;
839 unsigned int windows = 1; 839 unsigned int windows = 1;
840 840
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
1794 u64 runtime, period; 1794 u64 runtime, period;
1795 spinlock_t *group_lock = NULL; 1795 spinlock_t *group_lock = NULL;
1796 1796
1797 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1797 /*
1798 * The p->mm->numa_scan_seq field gets updated without
1799 * exclusive access. Use READ_ONCE() here to ensure
1800 * that the field is read in a single access:
1801 */
1802 seq = READ_ONCE(p->mm->numa_scan_seq);
1798 if (p->numa_scan_seq == seq) 1803 if (p->numa_scan_seq == seq)
1799 return; 1804 return;
1800 p->numa_scan_seq = seq; 1805 p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1938 } 1943 }
1939 1944
1940 rcu_read_lock(); 1945 rcu_read_lock();
1941 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); 1946 tsk = READ_ONCE(cpu_rq(cpu)->curr);
1942 1947
1943 if (!cpupid_match_pid(tsk, cpupid)) 1948 if (!cpupid_match_pid(tsk, cpupid))
1944 goto no_join; 1949 goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2107 2112
2108static void reset_ptenuma_scan(struct task_struct *p) 2113static void reset_ptenuma_scan(struct task_struct *p)
2109{ 2114{
2110 ACCESS_ONCE(p->mm->numa_scan_seq)++; 2115 /*
2116 * We only did a read acquisition of the mmap sem, so
2117 * p->mm->numa_scan_seq is written to without exclusive access
2118 * and the update is not guaranteed to be atomic. That's not
2119 * much of an issue though, since this is just used for
2120 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2121 * expensive, to avoid any form of compiler optimizations:
2122 */
2123 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2111 p->mm->numa_scan_offset = 0; 2124 p->mm->numa_scan_offset = 0;
2112} 2125}
2113 2126
@@ -4323,6 +4336,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4323} 4336}
4324 4337
4325#ifdef CONFIG_SMP 4338#ifdef CONFIG_SMP
4339
4340/*
4341 * per rq 'load' arrray crap; XXX kill this.
4342 */
4343
4344/*
4345 * The exact cpuload at various idx values, calculated at every tick would be
4346 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4347 *
4348 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4349 * on nth tick when cpu may be busy, then we have:
4350 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4351 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4352 *
4353 * decay_load_missed() below does efficient calculation of
4354 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4355 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4356 *
4357 * The calculation is approximated on a 128 point scale.
4358 * degrade_zero_ticks is the number of ticks after which load at any
4359 * particular idx is approximated to be zero.
4360 * degrade_factor is a precomputed table, a row for each load idx.
4361 * Each column corresponds to degradation factor for a power of two ticks,
4362 * based on 128 point scale.
4363 * Example:
4364 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4365 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4366 *
4367 * With this power of 2 load factors, we can degrade the load n times
4368 * by looking at 1 bits in n and doing as many mult/shift instead of
4369 * n mult/shifts needed by the exact degradation.
4370 */
4371#define DEGRADE_SHIFT 7
4372static const unsigned char
4373 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4374static const unsigned char
4375 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4376 {0, 0, 0, 0, 0, 0, 0, 0},
4377 {64, 32, 8, 0, 0, 0, 0, 0},
4378 {96, 72, 40, 12, 1, 0, 0},
4379 {112, 98, 75, 43, 15, 1, 0},
4380 {120, 112, 98, 76, 45, 16, 2} };
4381
4382/*
4383 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4384 * would be when CPU is idle and so we just decay the old load without
4385 * adding any new load.
4386 */
4387static unsigned long
4388decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4389{
4390 int j = 0;
4391
4392 if (!missed_updates)
4393 return load;
4394
4395 if (missed_updates >= degrade_zero_ticks[idx])
4396 return 0;
4397
4398 if (idx == 1)
4399 return load >> missed_updates;
4400
4401 while (missed_updates) {
4402 if (missed_updates % 2)
4403 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4404
4405 missed_updates >>= 1;
4406 j++;
4407 }
4408 return load;
4409}
4410
4411/*
4412 * Update rq->cpu_load[] statistics. This function is usually called every
4413 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4414 * every tick. We fix it up based on jiffies.
4415 */
4416static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4417 unsigned long pending_updates)
4418{
4419 int i, scale;
4420
4421 this_rq->nr_load_updates++;
4422
4423 /* Update our load: */
4424 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4425 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4426 unsigned long old_load, new_load;
4427
4428 /* scale is effectively 1 << i now, and >> i divides by scale */
4429
4430 old_load = this_rq->cpu_load[i];
4431 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4432 new_load = this_load;
4433 /*
4434 * Round up the averaging division if load is increasing. This
4435 * prevents us from getting stuck on 9 if the load is 10, for
4436 * example.
4437 */
4438 if (new_load > old_load)
4439 new_load += scale - 1;
4440
4441 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4442 }
4443
4444 sched_avg_update(this_rq);
4445}
4446
4447#ifdef CONFIG_NO_HZ_COMMON
4448/*
4449 * There is no sane way to deal with nohz on smp when using jiffies because the
4450 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4451 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4452 *
4453 * Therefore we cannot use the delta approach from the regular tick since that
4454 * would seriously skew the load calculation. However we'll make do for those
4455 * updates happening while idle (nohz_idle_balance) or coming out of idle
4456 * (tick_nohz_idle_exit).
4457 *
4458 * This means we might still be one tick off for nohz periods.
4459 */
4460
4461/*
4462 * Called from nohz_idle_balance() to update the load ratings before doing the
4463 * idle balance.
4464 */
4465static void update_idle_cpu_load(struct rq *this_rq)
4466{
4467 unsigned long curr_jiffies = READ_ONCE(jiffies);
4468 unsigned long load = this_rq->cfs.runnable_load_avg;
4469 unsigned long pending_updates;
4470
4471 /*
4472 * bail if there's load or we're actually up-to-date.
4473 */
4474 if (load || curr_jiffies == this_rq->last_load_update_tick)
4475 return;
4476
4477 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4478 this_rq->last_load_update_tick = curr_jiffies;
4479
4480 __update_cpu_load(this_rq, load, pending_updates);
4481}
4482
4483/*
4484 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4485 */
4486void update_cpu_load_nohz(void)
4487{
4488 struct rq *this_rq = this_rq();
4489 unsigned long curr_jiffies = READ_ONCE(jiffies);
4490 unsigned long pending_updates;
4491
4492 if (curr_jiffies == this_rq->last_load_update_tick)
4493 return;
4494
4495 raw_spin_lock(&this_rq->lock);
4496 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4497 if (pending_updates) {
4498 this_rq->last_load_update_tick = curr_jiffies;
4499 /*
4500 * We were idle, this means load 0, the current load might be
4501 * !0 due to remote wakeups and the sort.
4502 */
4503 __update_cpu_load(this_rq, 0, pending_updates);
4504 }
4505 raw_spin_unlock(&this_rq->lock);
4506}
4507#endif /* CONFIG_NO_HZ */
4508
4509/*
4510 * Called from scheduler_tick()
4511 */
4512void update_cpu_load_active(struct rq *this_rq)
4513{
4514 unsigned long load = this_rq->cfs.runnable_load_avg;
4515 /*
4516 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4517 */
4518 this_rq->last_load_update_tick = jiffies;
4519 __update_cpu_load(this_rq, load, 1);
4520}
4521
4326/* Used instead of source_load when we know the type == 0 */ 4522/* Used instead of source_load when we know the type == 0 */
4327static unsigned long weighted_cpuload(const int cpu) 4523static unsigned long weighted_cpuload(const int cpu)
4328{ 4524{
@@ -4375,7 +4571,7 @@ static unsigned long capacity_orig_of(int cpu)
4375static unsigned long cpu_avg_load_per_task(int cpu) 4571static unsigned long cpu_avg_load_per_task(int cpu)
4376{ 4572{
4377 struct rq *rq = cpu_rq(cpu); 4573 struct rq *rq = cpu_rq(cpu);
4378 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); 4574 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4379 unsigned long load_avg = rq->cfs.runnable_load_avg; 4575 unsigned long load_avg = rq->cfs.runnable_load_avg;
4380 4576
4381 if (nr_running) 4577 if (nr_running)
@@ -5467,10 +5663,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5467} 5663}
5468 5664
5469#ifdef CONFIG_NUMA_BALANCING 5665#ifdef CONFIG_NUMA_BALANCING
5470/* Returns true if the destination node has incurred more faults */ 5666/*
5667 * Returns true if the destination node is the preferred node.
5668 * Needs to match fbq_classify_rq(): if there is a runnable task
5669 * that is not on its preferred node, we should identify it.
5670 */
5471static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5671static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5472{ 5672{
5473 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5673 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5674 unsigned long src_faults, dst_faults;
5474 int src_nid, dst_nid; 5675 int src_nid, dst_nid;
5475 5676
5476 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5677 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5685,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5484 if (src_nid == dst_nid) 5685 if (src_nid == dst_nid)
5485 return false; 5686 return false;
5486 5687
5487 if (numa_group) {
5488 /* Task is already in the group's interleave set. */
5489 if (node_isset(src_nid, numa_group->active_nodes))
5490 return false;
5491
5492 /* Task is moving into the group's interleave set. */
5493 if (node_isset(dst_nid, numa_group->active_nodes))
5494 return true;
5495
5496 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5497 }
5498
5499 /* Encourage migration to the preferred node. */ 5688 /* Encourage migration to the preferred node. */
5500 if (dst_nid == p->numa_preferred_nid) 5689 if (dst_nid == p->numa_preferred_nid)
5501 return true; 5690 return true;
5502 5691
5503 return task_faults(p, dst_nid) > task_faults(p, src_nid); 5692 /* Migrating away from the preferred node is bad. */
5693 if (src_nid == p->numa_preferred_nid)
5694 return false;
5695
5696 if (numa_group) {
5697 src_faults = group_faults(p, src_nid);
5698 dst_faults = group_faults(p, dst_nid);
5699 } else {
5700 src_faults = task_faults(p, src_nid);
5701 dst_faults = task_faults(p, dst_nid);
5702 }
5703
5704 return dst_faults > src_faults;
5504} 5705}
5505 5706
5506 5707
5507static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5708static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5508{ 5709{
5509 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5710 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5711 unsigned long src_faults, dst_faults;
5510 int src_nid, dst_nid; 5712 int src_nid, dst_nid;
5511 5713
5512 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5714 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5723,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5521 if (src_nid == dst_nid) 5723 if (src_nid == dst_nid)
5522 return false; 5724 return false;
5523 5725
5524 if (numa_group) { 5726 /* Migrating away from the preferred node is bad. */
5525 /* Task is moving within/into the group's interleave set. */ 5727 if (src_nid == p->numa_preferred_nid)
5526 if (node_isset(dst_nid, numa_group->active_nodes)) 5728 return true;
5527 return false;
5528 5729
5529 /* Task is moving out of the group's interleave set. */ 5730 /* Encourage migration to the preferred node. */
5530 if (node_isset(src_nid, numa_group->active_nodes)) 5731 if (dst_nid == p->numa_preferred_nid)
5531 return true; 5732 return false;
5532 5733
5533 return group_faults(p, dst_nid) < group_faults(p, src_nid); 5734 if (numa_group) {
5735 src_faults = group_faults(p, src_nid);
5736 dst_faults = group_faults(p, dst_nid);
5737 } else {
5738 src_faults = task_faults(p, src_nid);
5739 dst_faults = task_faults(p, dst_nid);
5534 } 5740 }
5535 5741
5536 /* Migrating away from the preferred node is always bad. */ 5742 return dst_faults < src_faults;
5537 if (src_nid == p->numa_preferred_nid)
5538 return true;
5539
5540 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5541} 5743}
5542 5744
5543#else 5745#else
@@ -6037,8 +6239,8 @@ static unsigned long scale_rt_capacity(int cpu)
6037 * Since we're reading these variables without serialization make sure 6239 * Since we're reading these variables without serialization make sure
6038 * we read them once before doing sanity checks on them. 6240 * we read them once before doing sanity checks on them.
6039 */ 6241 */
6040 age_stamp = ACCESS_ONCE(rq->age_stamp); 6242 age_stamp = READ_ONCE(rq->age_stamp);
6041 avg = ACCESS_ONCE(rq->rt_avg); 6243 avg = READ_ONCE(rq->rt_avg);
6042 delta = __rq_clock_broken(rq) - age_stamp; 6244 delta = __rq_clock_broken(rq) - age_stamp;
6043 6245
6044 if (unlikely(delta < 0)) 6246 if (unlikely(delta < 0))
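Besides the READ_ONCE() conversions and the int -> unsigned int cleanups around get_update_sysctl_factor(), two substantive things happen in fair.c: the NUMA migrate_improves_locality()/migrate_degrades_locality() pair is rewritten so both helpers apply the same symmetric rules (preferred node first, then a plain src/dst fault comparison, using group faults when the task has a numa_group), and the rq->cpu_load[] decay machinery moves here from kernel/sched/proc.c (the large block removed from loadavg.c below). decay_load_missed() folds n missed ticks of load = load * (2^idx - 1) / 2^idx by walking the set bits of n against the precomputed 128-point degrade_factor table, so the cost is O(log n) multiplies instead of n. A slow reference loop, for illustration only (decay_load_exact() is a made-up name and not part of the patch):

/* Apply load *= (2^idx - 1) / 2^idx once per missed tick. */
static unsigned long decay_load_exact(unsigned long load,
				      unsigned long missed_updates, int idx)
{
	while (missed_updates--)
		load -= load >> idx;
	return load;
}

As a sanity check of the table comment above: for idx = 2 and 8 missed ticks, (3/4)^8 = 6561/65536 ~= 0.100, which is ~12.8 on the 128-point scale, matching the degrade_factor entry of 12 at "row 2, col 3".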
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
1/* 1/*
2 * kernel/sched/proc.c 2 * kernel/sched/loadavg.c
3 * 3 *
4 * Kernel load calculations, forked from sched/core.c 4 * This file contains the magic bits required to compute the global loadavg
5 * figure. Its a silly number but people think its important. We go through
6 * great pains to make it work on big machines and tickless kernels.
5 */ 7 */
6 8
7#include <linux/export.h> 9#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
81 long nr_active, delta = 0; 83 long nr_active, delta = 0;
82 84
83 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running;
84 nr_active += (long) this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
85 87
86 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
87 delta = nr_active - this_rq->calc_load_active; 89 delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
186 delta = calc_load_fold_active(this_rq); 188 delta = calc_load_fold_active(this_rq);
187 if (delta) { 189 if (delta) {
188 int idx = calc_load_write_idx(); 190 int idx = calc_load_write_idx();
191
189 atomic_long_add(delta, &calc_load_idle[idx]); 192 atomic_long_add(delta, &calc_load_idle[idx]);
190 } 193 }
191} 194}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
241{ 244{
242 unsigned long result = 1UL << frac_bits; 245 unsigned long result = 1UL << frac_bits;
243 246
244 if (n) for (;;) { 247 if (n) {
245 if (n & 1) { 248 for (;;) {
246 result *= x; 249 if (n & 1) {
247 result += 1UL << (frac_bits - 1); 250 result *= x;
248 result >>= frac_bits; 251 result += 1UL << (frac_bits - 1);
252 result >>= frac_bits;
253 }
254 n >>= 1;
255 if (!n)
256 break;
257 x *= x;
258 x += 1UL << (frac_bits - 1);
259 x >>= frac_bits;
249 } 260 }
250 n >>= 1;
251 if (!n)
252 break;
253 x *= x;
254 x += 1UL << (frac_bits - 1);
255 x >>= frac_bits;
256 } 261 }
257 262
258 return result; 263 return result;
@@ -285,7 +290,6 @@ static unsigned long
285calc_load_n(unsigned long load, unsigned long exp, 290calc_load_n(unsigned long load, unsigned long exp,
286 unsigned long active, unsigned int n) 291 unsigned long active, unsigned int n)
287{ 292{
288
289 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 293 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
290} 294}
291 295
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
339/* 343/*
340 * calc_load - update the avenrun load estimates 10 ticks after the 344 * calc_load - update the avenrun load estimates 10 ticks after the
341 * CPUs have updated calc_load_tasks. 345 * CPUs have updated calc_load_tasks.
346 *
347 * Called from the global timer code.
342 */ 348 */
343void calc_global_load(unsigned long ticks) 349void calc_global_load(unsigned long ticks)
344{ 350{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
370} 376}
371 377
372/* 378/*
373 * Called from update_cpu_load() to periodically update this CPU's 379 * Called from scheduler_tick() to periodically update this CPU's
374 * active count. 380 * active count.
375 */ 381 */
376static void calc_load_account_active(struct rq *this_rq) 382void calc_global_load_tick(struct rq *this_rq)
377{ 383{
378 long delta; 384 long delta;
379 385
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
386 392
387 this_rq->calc_load_update += LOAD_FREQ; 393 this_rq->calc_load_update += LOAD_FREQ;
388} 394}
389
390/*
391 * End of global load-average stuff
392 */
393
394/*
395 * The exact cpuload at various idx values, calculated at every tick would be
396 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
397 *
398 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
399 * on nth tick when cpu may be busy, then we have:
400 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
401 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
402 *
403 * decay_load_missed() below does efficient calculation of
404 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
405 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
406 *
407 * The calculation is approximated on a 128 point scale.
408 * degrade_zero_ticks is the number of ticks after which load at any
409 * particular idx is approximated to be zero.
410 * degrade_factor is a precomputed table, a row for each load idx.
411 * Each column corresponds to degradation factor for a power of two ticks,
412 * based on 128 point scale.
413 * Example:
414 * row 2, col 3 (=12) says that the degradation at load idx 2 after
415 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
416 *
417 * With this power of 2 load factors, we can degrade the load n times
418 * by looking at 1 bits in n and doing as many mult/shift instead of
419 * n mult/shifts needed by the exact degradation.
420 */
421#define DEGRADE_SHIFT 7
422static const unsigned char
423 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
424static const unsigned char
425 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
426 {0, 0, 0, 0, 0, 0, 0, 0},
427 {64, 32, 8, 0, 0, 0, 0, 0},
428 {96, 72, 40, 12, 1, 0, 0},
429 {112, 98, 75, 43, 15, 1, 0},
430 {120, 112, 98, 76, 45, 16, 2} };
431
432/*
433 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
434 * would be when CPU is idle and so we just decay the old load without
435 * adding any new load.
436 */
437static unsigned long
438decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
439{
440 int j = 0;
441
442 if (!missed_updates)
443 return load;
444
445 if (missed_updates >= degrade_zero_ticks[idx])
446 return 0;
447
448 if (idx == 1)
449 return load >> missed_updates;
450
451 while (missed_updates) {
452 if (missed_updates % 2)
453 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
454
455 missed_updates >>= 1;
456 j++;
457 }
458 return load;
459}
460
461/*
462 * Update rq->cpu_load[] statistics. This function is usually called every
463 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
464 * every tick. We fix it up based on jiffies.
465 */
466static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
467 unsigned long pending_updates)
468{
469 int i, scale;
470
471 this_rq->nr_load_updates++;
472
473 /* Update our load: */
474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
476 unsigned long old_load, new_load;
477
478 /* scale is effectively 1 << i now, and >> i divides by scale */
479
480 old_load = this_rq->cpu_load[i];
481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
482 new_load = this_load;
483 /*
484 * Round up the averaging division if load is increasing. This
485 * prevents us from getting stuck on 9 if the load is 10, for
486 * example.
487 */
488 if (new_load > old_load)
489 new_load += scale - 1;
490
491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
492 }
493
494 sched_avg_update(this_rq);
495}
496
497#ifdef CONFIG_SMP
498static inline unsigned long get_rq_runnable_load(struct rq *rq)
499{
500 return rq->cfs.runnable_load_avg;
501}
502#else
503static inline unsigned long get_rq_runnable_load(struct rq *rq)
504{
505 return rq->load.weight;
506}
507#endif
508
509#ifdef CONFIG_NO_HZ_COMMON
510/*
511 * There is no sane way to deal with nohz on smp when using jiffies because the
512 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
513 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
514 *
515 * Therefore we cannot use the delta approach from the regular tick since that
516 * would seriously skew the load calculation. However we'll make do for those
517 * updates happening while idle (nohz_idle_balance) or coming out of idle
518 * (tick_nohz_idle_exit).
519 *
520 * This means we might still be one tick off for nohz periods.
521 */
522
523/*
524 * Called from nohz_idle_balance() to update the load ratings before doing the
525 * idle balance.
526 */
527void update_idle_cpu_load(struct rq *this_rq)
528{
529 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
530 unsigned long load = get_rq_runnable_load(this_rq);
531 unsigned long pending_updates;
532
533 /*
534 * bail if there's load or we're actually up-to-date.
535 */
536 if (load || curr_jiffies == this_rq->last_load_update_tick)
537 return;
538
539 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
540 this_rq->last_load_update_tick = curr_jiffies;
541
542 __update_cpu_load(this_rq, load, pending_updates);
543}
544
545/*
546 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
547 */
548void update_cpu_load_nohz(void)
549{
550 struct rq *this_rq = this_rq();
551 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
552 unsigned long pending_updates;
553
554 if (curr_jiffies == this_rq->last_load_update_tick)
555 return;
556
557 raw_spin_lock(&this_rq->lock);
558 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
559 if (pending_updates) {
560 this_rq->last_load_update_tick = curr_jiffies;
561 /*
562 * We were idle, this means load 0, the current load might be
563 * !0 due to remote wakeups and the sort.
564 */
565 __update_cpu_load(this_rq, 0, pending_updates);
566 }
567 raw_spin_unlock(&this_rq->lock);
568}
569#endif /* CONFIG_NO_HZ */
570
571/*
572 * Called from scheduler_tick()
573 */
574void update_cpu_load_active(struct rq *this_rq)
575{
576 unsigned long load = get_rq_runnable_load(this_rq);
577 /*
578 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
579 */
580 this_rq->last_load_update_tick = jiffies;
581 __update_cpu_load(this_rq, load, 1);
582
583 calc_load_account_active(this_rq);
584}
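The proc.c -> loadavg.c rename leaves this file holding only the global load-average (avenrun) code: the cpu_load[] block at the bottom is the part that moved to fair.c, and calc_load_account_active(), previously chained off update_cpu_load_active(), is renamed calc_global_load_tick() and called directly from scheduler_tick() (see the core.c and sched.h hunks). For context, the avenrun figures are an exponential moving average kept in 11-bit fixed point (FIXED_1 = 1 << 11 = 2048, with EXP_1/EXP_5/EXP_15 ~= 1884/2014/2037, i.e. 2048 * e^(-5s/1min), e^(-5s/5min), e^(-5s/15min)). The core update, calc_load(), lives earlier in this file and is untouched by the patch; roughly (reproduced for context, not guaranteed verbatim):

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	unsigned long newload;

	newload = load * exp + active * (FIXED_1 - exp);
	if (active >= load)
		newload += FIXED_1 - 1;	/* round up while load is rising */

	return newload / FIXED_1;
}

calc_load_n() folds n missed LOAD_FREQ (5 s) periods at once by raising exp to the n-th power with fixed_power_int(), the exponentiation-by-squaring loop that the patch only re-indents.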
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1323 rq = cpu_rq(cpu); 1323 rq = cpu_rq(cpu);
1324 1324
1325 rcu_read_lock(); 1325 rcu_read_lock();
1326 curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 1326 curr = READ_ONCE(rq->curr); /* unlocked access */
1327 1327
1328 /* 1328 /*
1329 * If the current task on @p's runqueue is an RT task, then 1329 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d85455539d5c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
26extern unsigned long calc_load_update; 26extern unsigned long calc_load_update;
27extern atomic_long_t calc_load_tasks; 27extern atomic_long_t calc_load_tasks;
28 28
29extern void calc_global_load_tick(struct rq *this_rq);
29extern long calc_load_fold_active(struct rq *this_rq); 30extern long calc_load_fold_active(struct rq *this_rq);
31
32#ifdef CONFIG_SMP
30extern void update_cpu_load_active(struct rq *this_rq); 33extern void update_cpu_load_active(struct rq *this_rq);
34#else
35static inline void update_cpu_load_active(struct rq *this_rq) { }
36#endif
31 37
32/* 38/*
33 * Helpers for converting nanosecond timing to jiffy resolution 39 * Helpers for converting nanosecond timing to jiffy resolution
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
707 713
708static inline u64 __rq_clock_broken(struct rq *rq) 714static inline u64 __rq_clock_broken(struct rq *rq)
709{ 715{
710 return ACCESS_ONCE(rq->clock); 716 return READ_ONCE(rq->clock);
711} 717}
712 718
713static inline u64 rq_clock(struct rq *rq) 719static inline u64 rq_clock(struct rq *rq)
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1298 1304
1299unsigned long to_ratio(u64 period, u64 runtime); 1305unsigned long to_ratio(u64 period, u64 runtime);
1300 1306
1301extern void update_idle_cpu_load(struct rq *this_rq);
1302
1303extern void init_task_runnable_average(struct task_struct *p); 1307extern void init_task_runnable_average(struct task_struct *p);
1304 1308
1305static inline void add_nr_running(struct rq *rq, unsigned count) 1309static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
174{ 174{
175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 175 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
176 176
177 if (!cputimer->running) 177 /* Check if cputimer isn't running. This is accessed without locking. */
178 if (!READ_ONCE(cputimer->running))
178 return false; 179 return false;
179 180
180 /* 181 /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
215 if (!cputimer_running(tsk)) 216 if (!cputimer_running(tsk))
216 return; 217 return;
217 218
218 raw_spin_lock(&cputimer->lock); 219 atomic64_add(cputime, &cputimer->cputime_atomic.utime);
219 cputimer->cputime.utime += cputime;
220 raw_spin_unlock(&cputimer->lock);
221} 220}
222 221
223/** 222/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
238 if (!cputimer_running(tsk)) 237 if (!cputimer_running(tsk))
239 return; 238 return;
240 239
241 raw_spin_lock(&cputimer->lock); 240 atomic64_add(cputime, &cputimer->cputime_atomic.stime);
242 cputimer->cputime.stime += cputime;
243 raw_spin_unlock(&cputimer->lock);
244} 241}
245 242
246/** 243/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
261 if (!cputimer_running(tsk)) 258 if (!cputimer_running(tsk))
262 return; 259 return;
263 260
264 raw_spin_lock(&cputimer->lock); 261 atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
265 cputimer->cputime.sum_exec_runtime += ns;
266 raw_spin_unlock(&cputimer->lock);
267} 262}
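With stats.h, the per-tick accounting hot path no longer takes cputimer->lock: utime, stime and sum_exec_runtime become atomic64_t fields grouped in a struct task_cputime_atomic embedded in struct thread_group_cputimer, so account_group_user_time() and friends shrink to a single atomic64_add(), and cputimer_running() reads the unlocked running flag with READ_ONCE(). The type itself is declared in include/linux/sched.h, outside this kernel/-only diff; its approximate shape (reproduced for context, not guaranteed verbatim):

struct task_cputime_atomic {
	atomic64_t utime;
	atomic64_t stime;
	atomic64_t sum_exec_runtime;
};

struct thread_group_cputimer {
	struct task_cputime_atomic cputime_atomic;
	int running;
};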
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..2ccec988d6b7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
601 601
602__sched int bit_wait_timeout(struct wait_bit_key *word) 602__sched int bit_wait_timeout(struct wait_bit_key *word)
603{ 603{
604 unsigned long now = ACCESS_ONCE(jiffies); 604 unsigned long now = READ_ONCE(jiffies);
605 if (signal_pending_state(current->state, current)) 605 if (signal_pending_state(current->state, current))
606 return 1; 606 return 1;
607 if (time_after_eq(now, word->timeout)) 607 if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
613 613
614__sched int bit_wait_io_timeout(struct wait_bit_key *word) 614__sched int bit_wait_io_timeout(struct wait_bit_key *word)
615{ 615{
616 unsigned long now = ACCESS_ONCE(jiffies); 616 unsigned long now = READ_ONCE(jiffies);
617 if (signal_pending_state(current->state, current)) 617 if (signal_pending_state(current->state, current))
618 return 1; 618 return 1;
619 if (time_after_eq(now, word->timeout)) 619 if (time_after_eq(now, word->timeout))
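In wait.c only the jiffies snapshot changes; the comparison stays time_after_eq(), which is wrap-safe. Stripped of its typecheck() wrappers, that macro is essentially the signed-difference trick below (from include/linux/jiffies.h, shown here for context with an illustrative name), which gives the right answer across a jiffies wrap as long as the two stamps are less than LONG_MAX ticks apart:

/* Core of time_after_eq(a, b): true if a is at or after b. */
#define my_time_after_eq(a, b)	((long)((a) - (b)) >= 0)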
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
245 * RETURNS: 245 * RETURNS:
246 * %true if @mask is set, %false if made noop because @task was dying. 246 * %true if @mask is set, %false if made noop because @task was dying.
247 */ 247 */
248bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) 248bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
249{ 249{
250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | 250 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); 251 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
297 * CONTEXT: 297 * CONTEXT:
298 * Must be called with @task->sighand->siglock held. 298 * Must be called with @task->sighand->siglock held.
299 */ 299 */
300void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) 300void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
301{ 301{
302 BUG_ON(mask & ~JOBCTL_PENDING_MASK); 302 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
303 303
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
2000 struct signal_struct *sig = current->signal; 2000 struct signal_struct *sig = current->signal;
2001 2001
2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) { 2002 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
2003 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; 2003 unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
2004 struct task_struct *t; 2004 struct task_struct *t;
2005 2005
2006 /* signr will be recorded in task->jobctl for retries */ 2006 /* signr will be recorded in task->jobctl for retries */
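In signal.c, task_set_jobctl_pending(), task_clear_jobctl_pending() and do_signal_stop() switch their mask type from unsigned int to unsigned long to match task_struct::jobctl, which is an unsigned long; the JOBCTL_* constants are widened to 1UL shifts in include/linux/sched.h in the same series (outside this kernel/-only diff). Approximately, and only as an illustration of the header side:

#define JOBCTL_STOP_PENDING_BIT	17				/* unchanged */
#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)	/* was 1 << ... */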
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
196 return 0; 196 return 0;
197} 197}
198 198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 199/*
200 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
201 * to avoid race conditions with concurrent updates to cputime.
202 */
203static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
200{ 204{
201 if (b->utime > a->utime) 205 u64 curr_cputime;
202 a->utime = b->utime; 206retry:
207 curr_cputime = atomic64_read(cputime);
208 if (sum_cputime > curr_cputime) {
209 if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
210 goto retry;
211 }
212}
203 213
204 if (b->stime > a->stime) 214static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
205 a->stime = b->stime; 215{
216 __update_gt_cputime(&cputime_atomic->utime, sum->utime);
217 __update_gt_cputime(&cputime_atomic->stime, sum->stime);
218 __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
219}
206 220
207 if (b->sum_exec_runtime > a->sum_exec_runtime) 221/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
208 a->sum_exec_runtime = b->sum_exec_runtime; 222static inline void sample_cputime_atomic(struct task_cputime *times,
223 struct task_cputime_atomic *atomic_times)
224{
225 times->utime = atomic64_read(&atomic_times->utime);
226 times->stime = atomic64_read(&atomic_times->stime);
227 times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
209} 228}
210 229
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) 230void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{ 231{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 232 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum; 233 struct task_cputime sum;
215 unsigned long flags;
216 234
217 if (!cputimer->running) { 235 /* Check if cputimer isn't running. This is accessed without locking. */
236 if (!READ_ONCE(cputimer->running)) {
218 /* 237 /*
219 * The POSIX timer interface allows for absolute time expiry 238 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have 239 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start 240 * to synchronize the timer to the clock every time we start it.
222 * it.
223 */ 241 */
224 thread_group_cputime(tsk, &sum); 242 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags); 243 update_gt_cputime(&cputimer->cputime_atomic, &sum);
226 cputimer->running = 1; 244
227 update_gt_cputime(&cputimer->cputime, &sum); 245 /*
228 } else 246 * We're setting cputimer->running without a lock. Ensure
229 raw_spin_lock_irqsave(&cputimer->lock, flags); 247 * this only gets written to in one operation. We set
230 *times = cputimer->cputime; 248 * running after update_gt_cputime() as a small optimization,
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates.
251 */
252 WRITE_ONCE(cputimer->running, 1);
253 }
254 sample_cputime_atomic(times, &cputimer->cputime_atomic);
232} 255}
233 256
234/* 257/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
582 if (!task_cputime_zero(&tsk->cputime_expires)) 605 if (!task_cputime_zero(&tsk->cputime_expires))
583 return false; 606 return false;
584 607
585 if (tsk->signal->cputimer.running) 608 /* Check if cputimer is running. This is accessed without locking. */
609 if (READ_ONCE(tsk->signal->cputimer.running))
586 return false; 610 return false;
587 611
588 return true; 612 return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
852 /* 876 /*
853 * Check for the special case thread timers. 877 * Check for the special case thread timers.
854 */ 878 */
855 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); 879 soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
856 if (soft != RLIM_INFINITY) { 880 if (soft != RLIM_INFINITY) {
857 unsigned long hard = 881 unsigned long hard =
858 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); 882 READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
859 883
860 if (hard != RLIM_INFINITY && 884 if (hard != RLIM_INFINITY &&
861 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 885 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
882 } 906 }
883} 907}
884 908
885static void stop_process_timers(struct signal_struct *sig) 909static inline void stop_process_timers(struct signal_struct *sig)
886{ 910{
887 struct thread_group_cputimer *cputimer = &sig->cputimer; 911 struct thread_group_cputimer *cputimer = &sig->cputimer;
888 unsigned long flags;
889 912
890 raw_spin_lock_irqsave(&cputimer->lock, flags); 913 /* Turn off cputimer->running. This is done without locking. */
891 cputimer->running = 0; 914 WRITE_ONCE(cputimer->running, 0);
892 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
893} 915}
894 916
895static u32 onecputick; 917static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
958 SIGPROF); 980 SIGPROF);
959 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 981 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
960 SIGVTALRM); 982 SIGVTALRM);
961 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 983 soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
962 if (soft != RLIM_INFINITY) { 984 if (soft != RLIM_INFINITY) {
963 unsigned long psecs = cputime_to_secs(ptime); 985 unsigned long psecs = cputime_to_secs(ptime);
964 unsigned long hard = 986 unsigned long hard =
965 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); 987 READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
966 cputime_t x; 988 cputime_t x;
967 if (psecs >= hard) { 989 if (psecs >= hard) {
968 /* 990 /*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1111 } 1133 }
1112 1134
1113 sig = tsk->signal; 1135 sig = tsk->signal;
1114 if (sig->cputimer.running) { 1136 /* Check if cputimer is running. This is accessed without locking. */
1137 if (READ_ONCE(sig->cputimer.running)) {
1115 struct task_cputime group_sample; 1138 struct task_cputime group_sample;
1116 1139
1117 raw_spin_lock(&sig->cputimer.lock); 1140 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
1118 group_sample = sig->cputimer.cputime;
1119 raw_spin_unlock(&sig->cputimer.lock);
1120 1141
1121 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1142 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1122 return 1; 1143 return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1157 * If there are any active process wide timers (POSIX 1.b, itimers, 1178 * If there are any active process wide timers (POSIX 1.b, itimers,
1158 * RLIMIT_CPU) cputimer must be running. 1179 * RLIMIT_CPU) cputimer must be running.
1159 */ 1180 */
1160 if (tsk->signal->cputimer.running) 1181 if (READ_ONCE(tsk->signal->cputimer.running))
1161 check_process_timers(tsk, &firing); 1182 check_process_timers(tsk, &firing);
1162 1183
1163 /* 1184 /*