aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--kernel/sched/core.c68
1 files changed, 61 insertions, 7 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 96a4267e6020..9fd37169b302 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2089 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2089 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2090 p->state = TASK_WAKING; 2090 p->state = TASK_WAKING;
2091 2091
2092 if (p->in_iowait) {
2093 delayacct_blkio_end();
2094 atomic_dec(&task_rq(p)->nr_iowait);
2095 }
2096
2092 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 2097 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2093 if (task_cpu(p) != cpu) { 2098 if (task_cpu(p) != cpu) {
2094 wake_flags |= WF_MIGRATED; 2099 wake_flags |= WF_MIGRATED;
2095 set_task_cpu(p, cpu); 2100 set_task_cpu(p, cpu);
2096 } 2101 }
2102
2103#else /* CONFIG_SMP */
2104
2105 if (p->in_iowait) {
2106 delayacct_blkio_end();
2107 atomic_dec(&task_rq(p)->nr_iowait);
2108 }
2109
2097#endif /* CONFIG_SMP */ 2110#endif /* CONFIG_SMP */
2098 2111
2099 ttwu_queue(p, cpu, wake_flags); 2112 ttwu_queue(p, cpu, wake_flags);
@@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2143 2156
2144 trace_sched_waking(p); 2157 trace_sched_waking(p);
2145 2158
2146 if (!task_on_rq_queued(p)) 2159 if (!task_on_rq_queued(p)) {
2160 if (p->in_iowait) {
2161 delayacct_blkio_end();
2162 atomic_dec(&rq->nr_iowait);
2163 }
2147 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2164 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2165 }
2148 2166
2149 ttwu_do_wakeup(rq, p, 0, rf); 2167 ttwu_do_wakeup(rq, p, 0, rf);
2150 ttwu_stat(p, smp_processor_id(), 0); 2168 ttwu_stat(p, smp_processor_id(), 0);
@@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void)
2956 return sum; 2974 return sum;
2957} 2975}
2958 2976
2977/*
2978 * IO-wait accounting, and how it's mostly bollocks (on SMP).
2979 *
2980 * The idea behind IO-wait accounting is to account the idle time that we could
2981 * have spent running if it were not for IO. That is, if we were to improve the
2982 * storage performance, we'd have a proportional reduction in IO-wait time.
2983 *
2984 * This all works nicely on UP, where, when a task blocks on IO, we account
2985 * idle time as IO-wait, because if the storage were faster, it could've been
2986 * running and we'd not be idle.
2987 *
2988 * This has been extended to SMP, by doing the same for each CPU. This however
2989 * is broken.
2990 *
2991 * Imagine for instance the case where two tasks block on one CPU, only the one
2992 * CPU will have IO-wait accounted, while the other has regular idle. Even
2993 * though, if the storage were faster, both could've run at the same time,
2994 * utilising both CPUs.
2995 *
2996 * This means, that when looking globally, the current IO-wait accounting on
2997 * SMP is a lower bound, by reason of under-accounting.
2998 *
2999 * Worse, since the numbers are provided per CPU, they are sometimes
3000 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
3001 * associated with any one particular CPU, it can wake to another CPU than it
3002 * blocked on. This means the per CPU IO-wait number is meaningless.
3003 *
3004 * Task CPU affinities can make all that even more 'interesting'.
3005 */
3006
2959unsigned long nr_iowait(void) 3007unsigned long nr_iowait(void)
2960{ 3008{
2961 unsigned long i, sum = 0; 3009 unsigned long i, sum = 0;
@@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void)
2966 return sum; 3014 return sum;
2967} 3015}
2968 3016
3017/*
3018 * Consumers of these two interfaces, like for example the cpufreq menu
3019 * governor, are using nonsensical data: boosting frequency for a CPU that has
3020 * IO-wait which might not even end up running the task when it does become
3021 * runnable.
3022 */
3023
2969unsigned long nr_iowait_cpu(int cpu) 3024unsigned long nr_iowait_cpu(int cpu)
2970{ 3025{
2971 struct rq *this = cpu_rq(cpu); 3026 struct rq *this = cpu_rq(cpu);
@@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
3377 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3432 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3378 prev->on_rq = 0; 3433 prev->on_rq = 0;
3379 3434
3435 if (prev->in_iowait) {
3436 atomic_inc(&rq->nr_iowait);
3437 delayacct_blkio_start();
3438 }
3439
3380 /* 3440 /*
3381 * If a worker went to sleep, notify and ask workqueue 3441 * If a worker went to sleep, notify and ask workqueue
3382 * whether it wants to wake up a task to maintain 3442 * whether it wants to wake up a task to maintain
@@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
5075long __sched io_schedule_timeout(long timeout) 5135long __sched io_schedule_timeout(long timeout)
5076{ 5136{
5077 int old_iowait = current->in_iowait; 5137 int old_iowait = current->in_iowait;
5078 struct rq *rq;
5079 long ret; 5138 long ret;
5080 5139
5081 current->in_iowait = 1; 5140 current->in_iowait = 1;
5082 blk_schedule_flush_plug(current); 5141 blk_schedule_flush_plug(current);
5083 5142
5084 delayacct_blkio_start();
5085 rq = raw_rq();
5086 atomic_inc(&rq->nr_iowait);
5087 ret = schedule_timeout(timeout); 5143 ret = schedule_timeout(timeout);
5088 current->in_iowait = old_iowait; 5144 current->in_iowait = old_iowait;
5089 atomic_dec(&rq->nr_iowait);
5090 delayacct_blkio_end();
5091 5145
5092 return ret; 5146 return ret;
5093} 5147}