diff options
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | kernel/sched.c | 14 | ||||
-rw-r--r-- | kernel/sched_fair.c | 77 | ||||
-rw-r--r-- | kernel/sysctl.c | 11 |
4 files changed, 86 insertions, 17 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab91..322764e04052 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} | |||
1399 | 1399 | ||
1400 | extern void sched_idle_next(void); | 1400 | extern void sched_idle_next(void); |
1401 | 1401 | ||
1402 | extern unsigned int sysctl_sched_latency; | ||
1402 | extern unsigned int sysctl_sched_granularity; | 1403 | extern unsigned int sysctl_sched_granularity; |
1403 | extern unsigned int sysctl_sched_wakeup_granularity; | 1404 | extern unsigned int sysctl_sched_wakeup_granularity; |
1404 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1405 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0e..da26f46d50d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4911 | static inline void sched_init_granularity(void) | 4911 | static inline void sched_init_granularity(void) |
4912 | { | 4912 | { |
4913 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 4913 | unsigned int factor = 1 + ilog2(num_online_cpus()); |
4914 | const unsigned long gran_limit = 100000000; | 4914 | const unsigned long limit = 100000000; |
4915 | 4915 | ||
4916 | sysctl_sched_granularity *= factor; | 4916 | sysctl_sched_granularity *= factor; |
4917 | if (sysctl_sched_granularity > gran_limit) | 4917 | if (sysctl_sched_granularity > limit) |
4918 | sysctl_sched_granularity = gran_limit; | 4918 | sysctl_sched_granularity = limit; |
4919 | 4919 | ||
4920 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; | 4920 | sysctl_sched_latency *= factor; |
4921 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | 4921 | if (sysctl_sched_latency > limit) |
4922 | sysctl_sched_latency = limit; | ||
4923 | |||
4924 | sysctl_sched_runtime_limit = sysctl_sched_latency * 5; | ||
4925 | sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; | ||
4922 | } | 4926 | } |
4923 | 4927 | ||
4924 | #ifdef CONFIG_SMP | 4928 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2aa..0ba1e60f08d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -15,23 +15,32 @@ | |||
15 | * | 15 | * |
16 | * Scaled math optimizations by Thomas Gleixner | 16 | * Scaled math optimizations by Thomas Gleixner |
17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | 17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> |
18 | * | ||
19 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | ||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
18 | */ | 21 | */ |
19 | 22 | ||
20 | /* | 23 | /* |
21 | * Preemption granularity: | 24 | * Targeted preemption latency for CPU-bound tasks: |
22 | * (default: 10 msec, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
23 | * | 26 | * |
24 | * NOTE: this granularity value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
25 | * 'timeslice length' - timeslices in CFS will typically be somewhat | 28 | * 'timeslice length' - timeslices in CFS are of variable length. |
26 | * larger than this value. (to see the precise effective timeslice | 29 | * (to see the precise effective timeslice length of your workload, |
27 | * length of your workload, run vmstat and monitor the context-switches | 30 | * run vmstat and monitor the context-switches field) |
28 | * field) | ||
29 | * | 31 | * |
30 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * On SMP systems the value of this is multiplied by the log2 of the |
31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way |
32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | 34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) |
35 | * Targeted preemption latency for CPU-bound tasks: | ||
33 | */ | 36 | */ |
34 | unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; | 37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; |
38 | |||
39 | /* | ||
40 | * Minimal preemption granularity for CPU-bound tasks: | ||
41 | * (default: 2 msec, units: nanoseconds) | ||
42 | */ | ||
43 | unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; | ||
35 | 44 | ||
36 | /* | 45 | /* |
37 | * SCHED_BATCH wake-up granularity. | 46 | * SCHED_BATCH wake-up granularity. |
@@ -213,6 +222,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
213 | */ | 222 | */ |
214 | 223 | ||
215 | /* | 224 | /* |
225 | * Calculate the preemption granularity needed to schedule every | ||
226 | * runnable task once per sysctl_sched_latency amount of time. | ||
227 | * (down to a sensible low limit on granularity) | ||
228 | * | ||
229 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
230 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
231 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
232 | * for each task. We do finer and finer scheduling until we | ||
233 | * reach the minimum granularity value. | ||
234 | * | ||
235 | * To achieve this we use the following dynamic-granularity rule: | ||
236 | * | ||
237 | * gran = lat/nr - lat/nr/nr | ||
238 | * | ||
239 | * This comes out of the following equations: | ||
240 | * | ||
241 | * kA1 + gran = kB1 | ||
242 | * kB2 + gran = kA2 | ||
243 | * kA2 = kA1 | ||
244 | * kB2 = kB1 - d + d/nr | ||
245 | * lat = d * nr | ||
246 | * | ||
247 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
248 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
249 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
250 | * running, 'lat' is the period of each task. ('lat' is the | ||
251 | * sched_latency that we aim for.) | ||
252 | */ | ||
253 | static long | ||
254 | sched_granularity(struct cfs_rq *cfs_rq) | ||
255 | { | ||
256 | unsigned int gran = sysctl_sched_latency; | ||
257 | unsigned int nr = cfs_rq->nr_running; | ||
258 | |||
259 | if (nr > 1) { | ||
260 | gran = gran/nr - gran/nr/nr; | ||
261 | gran = max(gran, sysctl_sched_granularity); | ||
262 | } | ||
263 | |||
264 | return gran; | ||
265 | } | ||
266 | |||
267 | /* | ||
216 | * We rescale the rescheduling granularity of tasks according to their | 268 | * We rescale the rescheduling granularity of tasks according to their |
217 | * nice level, but only linearly, not exponentially: | 269 | * nice level, but only linearly, not exponentially: |
218 | */ | 270 | */ |
@@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
302 | delta_fair = calc_delta_fair(delta_exec, lw); | 354 | delta_fair = calc_delta_fair(delta_exec, lw); |
303 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | 355 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); |
304 | 356 | ||
305 | if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { | 357 | if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { |
306 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | 358 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); |
307 | delta = min(delta, (unsigned long)( | 359 | delta = min(delta, (unsigned long)( |
308 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | 360 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); |
@@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
689 | if (next == curr) | 741 | if (next == curr) |
690 | return; | 742 | return; |
691 | 743 | ||
692 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); | 744 | __check_preempt_curr_fair(cfs_rq, next, curr, |
745 | sched_granularity(cfs_rq)); | ||
693 | } | 746 | } |
694 | 747 | ||
695 | /************************************************** | 748 | /************************************************** |
@@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1034 | * it will preempt the parent: | 1087 | * it will preempt the parent: |
1035 | */ | 1088 | */ |
1036 | p->se.fair_key = current->se.fair_key - | 1089 | p->se.fair_key = current->se.fair_key - |
1037 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; | 1090 | niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; |
1038 | /* | 1091 | /* |
1039 | * The first wait is dominated by the child-runs-first logic, | 1092 | * The first wait is dominated by the child-runs-first logic, |
1040 | * so do not credit it with that waiting time yet: | 1093 | * so do not credit it with that waiting time yet: |
@@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1047 | * -granularity/2, so initialize the task with that: | 1100 | * -granularity/2, so initialize the task with that: |
1048 | */ | 1101 | */ |
1049 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1102 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) |
1050 | p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); | 1103 | p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); |
1051 | 1104 | ||
1052 | __enqueue_entity(cfs_rq, se); | 1105 | __enqueue_entity(cfs_rq, se); |
1053 | } | 1106 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085c..9e3d2960faf5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -233,6 +233,17 @@ static ctl_table kern_table[] = { | |||
233 | }, | 233 | }, |
234 | { | 234 | { |
235 | .ctl_name = CTL_UNNUMBERED, | 235 | .ctl_name = CTL_UNNUMBERED, |
236 | .procname = "sched_latency_ns", | ||
237 | .data = &sysctl_sched_latency, | ||
238 | .maxlen = sizeof(unsigned int), | ||
239 | .mode = 0644, | ||
240 | .proc_handler = &proc_dointvec_minmax, | ||
241 | .strategy = &sysctl_intvec, | ||
242 | .extra1 = &min_sched_granularity_ns, | ||
243 | .extra2 = &max_sched_granularity_ns, | ||
244 | }, | ||
245 | { | ||
246 | .ctl_name = CTL_UNNUMBERED, | ||
236 | .procname = "sched_wakeup_granularity_ns", | 247 | .procname = "sched_wakeup_granularity_ns", |
237 | .data = &sysctl_sched_wakeup_granularity, | 248 | .data = &sysctl_sched_wakeup_granularity, |
238 | .maxlen = sizeof(unsigned int), | 249 | .maxlen = sizeof(unsigned int), |