| author | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2013-07-26 19:41:34 -0400 |
|---|---|---|
| committer | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2013-07-29 07:32:29 -0400 |
| commit | 148519120c6d1f19ad53349683aeae9f228b0b8d | |
| tree | 6d585444bbc27d2752ac1eb69180b7312150c5a5 | |
| parent | 228b30234f258a193317874854eee1ca7807186e | |
Revert "cpuidle: Quickly notice prediction failure for repeat mode"
Revert commit 69a37bea ("cpuidle: Quickly notice prediction failure for
repeat mode"), because it has been identified as the source of a
significant performance regression in v3.8 and later, as explained by
Jeremy Eder:
We believe we've identified a particular commit to the cpuidle code
that seems to be impacting the performance of a variety of workloads.
The simplest way to reproduce is with the netperf TCP_RR test, so
we're using that on a pair of Sandy Bridge based servers. We also
have data from a large database setup where performance is also
measurably and positively impacted, though that test data isn't
easily shareable.
Included below are test results from 3 test kernels:
kernel       reverts
-----------------------------------------------------------
1) vanilla   upstream (no reverts)
2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c
3) test      reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and
             e11538d1f03914eb92af5a1a378375c05ae8520c
In summary, netperf TCP_RR numbers improve by approximately 4%
after reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. When
69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency
never seems to get above 40%. Taking that patch out gets C0 near
100% quite often, and performance increases.
The data below are histograms of %c0 residency at 1-second sample
rates (using turbostat), taken while under the netperf test.
- If you look at the first 4 histograms, you can see %c0 residency
almost entirely in the 30-40% bin.
- The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4,
shows %c0 in the 80-100% bins (a minimal sketch of how such bins are
produced follows this list).
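For illustration only, here is a minimal sketch of this kind of binning
(an assumption about the presentation, not the actual test harness):
each per-second %c0 sample falls into one 10%-wide bucket, and one star
is printed per sample.

```c
/*
 * Minimal sketch (not the real tooling): bin per-second %c0 samples
 * into ten 10%-wide buckets and print one star per sample, giving
 * histograms shaped like the ones shown below.
 */
#include <stdio.h>

static void print_c0_histogram(const double *samples, int n)
{
	int bins[10] = { 0 };

	for (int i = 0; i < n; i++) {
		int b = (int)(samples[i] / 10.0);
		if (b > 9)
			b = 9;	/* clamp a 100.0% sample into the top bin */
		bins[b]++;
	}
	for (int b = 0; b < 10; b++) {
		printf("%9.4f - %9.4f [%3d]: ", b * 10.0, (b + 1) * 10.0, bins[b]);
		for (int i = 0; i < bins[b]; i++)
			putchar('*');
		putchar('\n');
	}
}

int main(void)
{
	double samples[] = { 35.2, 38.9, 33.1, 36.7, 95.4, 4.2 };

	print_c0_histogram(samples, sizeof(samples) / sizeof(samples[0]));
	return 0;
}
```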
Below each kernel name are the netperf TCP_RR trans/s numbers for
that kernel which can be disclosed publicly, comparing the 3 test
kernels. We ran a 4th test with the vanilla kernel where we also set
/dev/cpu_dma_latency=0; that boosts single-threaded TCP_RR performance
more than 11% above baseline, showing the overall impact.
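For reference, a minimal sketch of how such a c0 lock is typically
held (this uses the standard /dev/cpu_dma_latency PM QoS interface; it
is an illustration, not the test team's actual tooling). Writing a
32-bit 0 requests zero latency tolerance, and the request lasts only
while the file descriptor stays open:

```c
/*
 * Minimal sketch: hold a PM QoS latency request of 0us by keeping
 * /dev/cpu_dma_latency open, which keeps CPUs out of deep C-states.
 * The kernel drops the request automatically when the fd is closed.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t latency_us = 0;	/* 0 = no deep C-states allowed */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
		perror("write");
		close(fd);
		return 1;
	}
	pause();	/* hold the request (and the fd) until killed */
	close(fd);
	return 0;
}
```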
3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0):
TCP_RR trans/s 54323.78
-----------------------------------------------------------
3.10-rc2 vanilla RX (no reverts)
TCP_RR trans/s 48192.47
Receiver %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 0]:
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 59]: ***********************************************************
40.0000 - 50.0000 [ 1]: *
50.0000 - 60.0000 [ 0]:
60.0000 - 70.0000 [ 0]:
70.0000 - 80.0000 [ 0]:
80.0000 - 90.0000 [ 0]:
90.0000 - 100.0000 [ 0]:
Sender %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 0]:
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 11]: ***********
40.0000 - 50.0000 [ 49]: *************************************************
50.0000 - 60.0000 [ 0]:
60.0000 - 70.0000 [ 0]:
70.0000 - 80.0000 [ 0]:
80.0000 - 90.0000 [ 0]:
90.0000 - 100.0000 [ 0]:
-----------------------------------------------------------
3.10-rc2 perfteam2 RX (reverts commit
e11538d1f03914eb92af5a1a378375c05ae8520c)
TCP_RR trans/s 49698.69
Receiver %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 1]: *
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 59]: ***********************************************************
40.0000 - 50.0000 [ 0]:
50.0000 - 60.0000 [ 0]:
60.0000 - 70.0000 [ 0]:
70.0000 - 80.0000 [ 0]:
80.0000 - 90.0000 [ 0]:
90.0000 - 100.0000 [ 0]:
Sender %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 0]:
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 2]: **
40.0000 - 50.0000 [ 58]: **********************************************************
50.0000 - 60.0000 [ 0]:
60.0000 - 70.0000 [ 0]:
70.0000 - 80.0000 [ 0]:
80.0000 - 90.0000 [ 0]:
90.0000 - 100.0000 [ 0]:
-----------------------------------------------------------
3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4
and e11538d1f03914eb92af5a1a378375c05ae8520c)
TCP_RR trans/s 47766.95
Receiver %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 1]: *
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 27]: ***************************
40.0000 - 50.0000 [ 2]: **
50.0000 - 60.0000 [ 0]:
60.0000 - 70.0000 [ 2]: **
70.0000 - 80.0000 [ 0]:
80.0000 - 90.0000 [ 0]:
90.0000 - 100.0000 [ 28]: ****************************
Sender %c0
0.0000 - 10.0000 [ 1]: *
10.0000 - 20.0000 [ 0]:
20.0000 - 30.0000 [ 0]:
30.0000 - 40.0000 [ 11]: ***********
40.0000 - 50.0000 [ 0]:
50.0000 - 60.0000 [ 1]: *
60.0000 - 70.0000 [ 0]:
70.0000 - 80.0000 [ 3]: ***
80.0000 - 90.0000 [ 7]: *******
90.0000 - 100.0000 [ 38]: **************************************
These results demonstrate that reverting commit
69a37beabf1f0a6705c08e879bdd5d82ff6486c4 restores the CPU's tendency
to stay in the more responsive, performant C-states, and thus yields
measurably better performance.
Requested-by: Jeremy Eder <jeder@redhat.com>
Tested-by: Len Brown <len.brown@intel.com>
Cc: 3.8+ <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
```
 drivers/cpuidle/governors/menu.c | 73 ++-------------------
 include/linux/tick.h             |  6 --
 kernel/time/tick-sched.c         |  9 ++-------
 3 files changed, 6 insertions(+), 82 deletions(-)
```
```diff
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b69a87e22155..bc580b67a652 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -28,13 +28,6 @@
 #define MAX_INTERESTING 50000
 #define STDDEV_THRESH 400
 
-/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */
-#define MAX_DEVIATION 60
-
-static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
-static DEFINE_PER_CPU(int, hrtimer_status);
-/* menu hrtimer mode */
-enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};
 
 /*
  * Concepts and ideas behind the menu governor
@@ -198,42 +191,17 @@ static u64 div_round64(u64 dividend, u32 divisor)
 	return div_u64(dividend + (divisor / 2), divisor);
 }
 
-/* Cancel the hrtimer if it is not triggered yet */
-void menu_hrtimer_cancel(void)
-{
-	int cpu = smp_processor_id();
-	struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
-
-	/* The timer is still not time out*/
-	if (per_cpu(hrtimer_status, cpu)) {
-		hrtimer_cancel(hrtmr);
-		per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
-	}
-}
-EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
-
-/* Call back for hrtimer is triggered */
-static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
-{
-	int cpu = smp_processor_id();
-
-	per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
-
-	return HRTIMER_NORESTART;
-}
-
 /*
  * Try detecting repeating patterns by keeping track of the last 8
  * intervals, and checking if the standard deviation of that set
  * of points is below a threshold. If it is... then use the
  * average of these 8 points as the estimated value.
  */
-static u32 get_typical_interval(struct menu_device *data)
+static void get_typical_interval(struct menu_device *data)
 {
 	int i = 0, divisor = 0;
 	uint64_t max = 0, avg = 0, stddev = 0;
 	int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
-	unsigned int ret = 0;
 
 again:
 
@@ -274,16 +242,13 @@ again:
 	if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
 							|| stddev <= 20) {
 		data->predicted_us = avg;
-		ret = 1;
-		return ret;
+		return;
 
 	} else if ((divisor * 4) > INTERVALS * 3) {
 		/* Exclude the max interval */
 		thresh = max - 1;
 		goto again;
 	}
-
-	return ret;
 }
 
 /**
@@ -298,9 +263,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	int i;
 	int multiplier;
 	struct timespec t;
-	int repeat = 0, low_predicted = 0;
-	int cpu = smp_processor_id();
-	struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
 
 	if (data->needs_update) {
 		menu_update(drv, dev);
@@ -335,7 +297,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
 					 RESOLUTION * DECAY);
 
-	repeat = get_typical_interval(data);
+	get_typical_interval(data);
 
 	/*
 	 * We want to default to C1 (hlt), not to busy polling
@@ -356,10 +318,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 
 		if (s->disabled || su->disable)
 			continue;
-		if (s->target_residency > data->predicted_us) {
-			low_predicted = 1;
+		if (s->target_residency > data->predicted_us)
 			continue;
-		}
 		if (s->exit_latency > latency_req)
 			continue;
 		if (s->exit_latency * multiplier > data->predicted_us)
@@ -369,28 +329,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		data->exit_us = s->exit_latency;
 	}
 
-	/* not deepest C-state chosen for low predicted residency */
-	if (low_predicted) {
-		unsigned int timer_us = 0;
-
-		/*
-		 * Set a timer to detect whether this sleep is much
-		 * longer than repeat mode predicted. If the timer
-		 * triggers, the code will evaluate whether to put
-		 * the CPU into a deeper C-state.
-		 * The timer is cancelled on CPU wakeup.
-		 */
-		timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
-
-		if (repeat && (4 * timer_us < data->expected_us)) {
-			RCU_NONIDLE(hrtimer_start(hrtmr,
-				ns_to_ktime(1000 * timer_us),
-				HRTIMER_MODE_REL_PINNED));
-			/* In repeat case, menu hrtimer is started */
-			per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
-		}
-	}
-
 	return data->last_state_idx;
 }
 
@@ -481,9 +419,6 @@ static int menu_enable_device(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev)
 {
 	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
-	struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu);
-	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	t->function = menu_hrtimer_notify;
 
 	memset(data, 0, sizeof(struct menu_device));
 
```
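For context, the heart of the heuristic is the standard-deviation
check that get_typical_interval() still performs after this revert;
below is a standalone sketch of that check in floating point (an
illustration, not the kernel code: the kernel version uses integer
math, tracks a divisor, and retries after discarding the max outlier).

```c
/*
 * Standalone sketch of the repeat-pattern check, simplified from
 * get_typical_interval(): the divisor/outlier handling and the
 * integer-only arithmetic of the kernel version are omitted.
 */
#include <math.h>
#include <stdint.h>

#define INTERVALS 8

/*
 * Returns 1 when the last 8 idle intervals (in microseconds) look
 * repetitive enough that their average is a trustworthy prediction.
 */
static int intervals_look_repetitive(const uint32_t iv[INTERVALS], double *avg_out)
{
	double avg = 0.0, var = 0.0;

	for (int i = 0; i < INTERVALS; i++)
		avg += iv[i];
	avg /= INTERVALS;

	for (int i = 0; i < INTERVALS; i++)
		var += (iv[i] - avg) * (iv[i] - avg);
	var /= INTERVALS;

	*avg_out = avg;
	/* Same shape as the kernel test: avg > 6*stddev, or stddev <= 20us. */
	return (avg > 6.0 * sqrt(var)) || (sqrt(var) <= 20.0);
}
```

In the reverted code, a positive result from this check could arm the
per-CPU menu_hrtimer removed in the menu_select() hunk above as a
safety net for mispredicted short sleeps; it is that timer machinery
that this commit takes out.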
```diff
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9180f4b85e6d..62bd8b72873c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -174,10 +174,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
 #endif
 
 
-# ifdef CONFIG_CPU_IDLE_GOV_MENU
-extern void menu_hrtimer_cancel(void);
-# else
-static inline void menu_hrtimer_cancel(void) {}
-# endif /* CONFIG_CPU_IDLE_GOV_MENU */
-
 #endif
```
```diff
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e80183f4a6c4..e77edc97e036 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -827,13 +827,10 @@ void tick_nohz_irq_exit(void)
 {
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 
-	if (ts->inidle) {
-		/* Cancel the timer because CPU already waken up from the C-states*/
-		menu_hrtimer_cancel();
+	if (ts->inidle)
 		__tick_nohz_idle_enter(ts);
-	} else {
+	else
 		tick_nohz_full_stop_tick(ts);
-	}
 }
 
 /**
@@ -931,8 +928,6 @@ void tick_nohz_idle_exit(void)
 
 	ts->inidle = 0;
 
-	/* Cancel the timer because CPU already waken up from the C-states*/
-	menu_hrtimer_cancel();
 	if (ts->idle_active || ts->tick_stopped)
 		now = ktime_get();
 
```
