diff options
| -rw-r--r-- | include/linux/sched.h | 1 | ||||
| -rw-r--r-- | kernel/sched.c | 14 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 77 | ||||
| -rw-r--r-- | kernel/sysctl.c | 11 |
4 files changed, 86 insertions, 17 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab91..322764e04052 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} | |||
| 1399 | 1399 | ||
| 1400 | extern void sched_idle_next(void); | 1400 | extern void sched_idle_next(void); |
| 1401 | 1401 | ||
| 1402 | extern unsigned int sysctl_sched_latency; | ||
| 1402 | extern unsigned int sysctl_sched_granularity; | 1403 | extern unsigned int sysctl_sched_granularity; |
| 1403 | extern unsigned int sysctl_sched_wakeup_granularity; | 1404 | extern unsigned int sysctl_sched_wakeup_granularity; |
| 1404 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1405 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0e..da26f46d50d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
| 4911 | static inline void sched_init_granularity(void) | 4911 | static inline void sched_init_granularity(void) |
| 4912 | { | 4912 | { |
| 4913 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 4913 | unsigned int factor = 1 + ilog2(num_online_cpus()); |
| 4914 | const unsigned long gran_limit = 100000000; | 4914 | const unsigned long limit = 100000000; |
| 4915 | 4915 | ||
| 4916 | sysctl_sched_granularity *= factor; | 4916 | sysctl_sched_granularity *= factor; |
| 4917 | if (sysctl_sched_granularity > gran_limit) | 4917 | if (sysctl_sched_granularity > limit) |
| 4918 | sysctl_sched_granularity = gran_limit; | 4918 | sysctl_sched_granularity = limit; |
| 4919 | 4919 | ||
| 4920 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; | 4920 | sysctl_sched_latency *= factor; |
| 4921 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | 4921 | if (sysctl_sched_latency > limit) |
| 4922 | sysctl_sched_latency = limit; | ||
| 4923 | |||
| 4924 | sysctl_sched_runtime_limit = sysctl_sched_latency * 5; | ||
| 4925 | sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; | ||
| 4922 | } | 4926 | } |
| 4923 | 4927 | ||
| 4924 | #ifdef CONFIG_SMP | 4928 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2aa..0ba1e60f08d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -15,23 +15,32 @@ | |||
| 15 | * | 15 | * |
| 16 | * Scaled math optimizations by Thomas Gleixner | 16 | * Scaled math optimizations by Thomas Gleixner |
| 17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | 17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> |
| 18 | * | ||
| 19 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | ||
| 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
| 18 | */ | 21 | */ |
| 19 | 22 | ||
| 20 | /* | 23 | /* |
| 21 | * Preemption granularity: | 24 | * Targeted preemption latency for CPU-bound tasks: |
| 22 | * (default: 10 msec, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
| 23 | * | 26 | * |
| 24 | * NOTE: this granularity value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
| 25 | * 'timeslice length' - timeslices in CFS will typically be somewhat | 28 | * 'timeslice length' - timeslices in CFS are of variable length. |
| 26 | * larger than this value. (to see the precise effective timeslice | 29 | * (to see the precise effective timeslice length of your workload, |
| 27 | * length of your workload, run vmstat and monitor the context-switches | 30 | * run vmstat and monitor the context-switches field) |
| 28 | * field) | ||
| 29 | * | 31 | * |
| 30 | * On SMP systems the value of this is multiplied by 1 + log2 of the | 32 | * On SMP systems the value of this is multiplied by 1 + log2 of the |
| 31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way |
| 32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | 34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) |
| 35 | * Targeted preemption latency for CPU-bound tasks: | ||
| 33 | */ | 36 | */ |
| 34 | unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; | 37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; |
| 38 | |||
| 39 | /* | ||
| 40 | * Minimal preemption granularity for CPU-bound tasks: | ||
| 41 | * (default: 2 msec, units: nanoseconds) | ||
| 42 | */ | ||
| 43 | unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; | ||
| 35 | 44 | ||
| 36 | /* | 45 | /* |
| 37 | * SCHED_BATCH wake-up granularity. | 46 | * SCHED_BATCH wake-up granularity. |
| @@ -213,6 +222,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 213 | */ | 222 | */ |
| 214 | 223 | ||
| 215 | /* | 224 | /* |
| 225 | * Calculate the preemption granularity needed to schedule every | ||
| 226 | * runnable task once per sysctl_sched_latency amount of time. | ||
| 227 | * (down to a sensible low limit on granularity) | ||
| 228 | * | ||
| 229 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
| 230 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
| 231 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
| 232 | * for each task. We do finer and finer scheduling until we | ||
| 233 | * reach the minimum granularity value. | ||
| 234 | * | ||
| 235 | * To achieve this we use the following dynamic-granularity rule: | ||
| 236 | * | ||
| 237 | * gran = lat/nr - lat/nr/nr | ||
| 238 | * | ||
| 239 | * This comes out of the following equations: | ||
| 240 | * | ||
| 241 | * kA1 + gran = kB1 | ||
| 242 | * kB2 + gran = kA2 | ||
| 243 | * kA2 = kA1 | ||
| 244 | * kB2 = kB1 - d + d/nr | ||
| 245 | * lat = d * nr | ||
| 246 | * | ||
| 247 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
| 248 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
| 249 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
| 250 | * running, 'lat' is the period of each task. ('lat' is the | ||
| 251 | * sched_latency that we aim for.) | ||
| 252 | */ | ||
| 253 | static long | ||
| 254 | sched_granularity(struct cfs_rq *cfs_rq) | ||
| 255 | { | ||
| 256 | unsigned int gran = sysctl_sched_latency; | ||
| 257 | unsigned int nr = cfs_rq->nr_running; | ||
| 258 | |||
| 259 | if (nr > 1) { | ||
| 260 | gran = gran/nr - gran/nr/nr; | ||
| 261 | gran = max(gran, sysctl_sched_granularity); | ||
| 262 | } | ||
| 263 | |||
| 264 | return gran; | ||
| 265 | } | ||
| 266 | |||
| 267 | /* | ||
| 216 | * We rescale the rescheduling granularity of tasks according to their | 268 | * We rescale the rescheduling granularity of tasks according to their |
| 217 | * nice level, but only linearly, not exponentially: | 269 | * nice level, but only linearly, not exponentially: |
| 218 | */ | 270 | */ |
| @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 302 | delta_fair = calc_delta_fair(delta_exec, lw); | 354 | delta_fair = calc_delta_fair(delta_exec, lw); |
| 303 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | 355 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); |
| 304 | 356 | ||
| 305 | if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { | 357 | if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { |
| 306 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | 358 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); |
| 307 | delta = min(delta, (unsigned long)( | 359 | delta = min(delta, (unsigned long)( |
| 308 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | 360 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); |
| @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 689 | if (next == curr) | 741 | if (next == curr) |
| 690 | return; | 742 | return; |
| 691 | 743 | ||
| 692 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); | 744 | __check_preempt_curr_fair(cfs_rq, next, curr, |
| 745 | sched_granularity(cfs_rq)); | ||
| 693 | } | 746 | } |
| 694 | 747 | ||
| 695 | /************************************************** | 748 | /************************************************** |
| @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
| 1034 | * it will preempt the parent: | 1087 | * it will preempt the parent: |
| 1035 | */ | 1088 | */ |
| 1036 | p->se.fair_key = current->se.fair_key - | 1089 | p->se.fair_key = current->se.fair_key - |
| 1037 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; | 1090 | niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; |
| 1038 | /* | 1091 | /* |
| 1039 | * The first wait is dominated by the child-runs-first logic, | 1092 | * The first wait is dominated by the child-runs-first logic, |
| 1040 | * so do not credit it with that waiting time yet: | 1093 | * so do not credit it with that waiting time yet: |
| @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
| 1047 | * -granularity/2, so initialize the task with that: | 1100 | * -granularity/2, so initialize the task with that: |
| 1048 | */ | 1101 | */ |
| 1049 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1102 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) |
| 1050 | p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); | 1103 | p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); |
| 1051 | 1104 | ||
| 1052 | __enqueue_entity(cfs_rq, se); | 1105 | __enqueue_entity(cfs_rq, se); |
| 1053 | } | 1106 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085c..9e3d2960faf5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -233,6 +233,17 @@ static ctl_table kern_table[] = { | |||
| 233 | }, | 233 | }, |
| 234 | { | 234 | { |
| 235 | .ctl_name = CTL_UNNUMBERED, | 235 | .ctl_name = CTL_UNNUMBERED, |
| 236 | .procname = "sched_latency_ns", | ||
| 237 | .data = &sysctl_sched_latency, | ||
| 238 | .maxlen = sizeof(unsigned int), | ||
| 239 | .mode = 0644, | ||
| 240 | .proc_handler = &proc_dointvec_minmax, | ||
| 241 | .strategy = &sysctl_intvec, | ||
| 242 | .extra1 = &min_sched_granularity_ns, | ||
| 243 | .extra2 = &max_sched_granularity_ns, | ||
| 244 | }, | ||
| 245 | { | ||
| 246 | .ctl_name = CTL_UNNUMBERED, | ||
| 236 | .procname = "sched_wakeup_granularity_ns", | 247 | .procname = "sched_wakeup_granularity_ns", |
| 237 | .data = &sysctl_sched_wakeup_granularity, | 248 | .data = &sysctl_sched_wakeup_granularity, |
| 238 | .maxlen = sizeof(unsigned int), | 249 | .maxlen = sizeof(unsigned int), |
