about · summary · refs · log · tree · commit · diff · stats
path: root/drivers/cpuidle
diff options
context:
space:
mode:
authorRafael J. Wysocki <rafael.j.wysocki@intel.com>2013-07-26 19:41:34 -0400
committerRafael J. Wysocki <rafael.j.wysocki@intel.com>2013-07-29 07:32:29 -0400
commit148519120c6d1f19ad53349683aeae9f228b0b8d (patch)
tree6d585444bbc27d2752ac1eb69180b7312150c5a5 /drivers/cpuidle
parent228b30234f258a193317874854eee1ca7807186e (diff)
Revert "cpuidle: Quickly notice prediction failure for repeat mode"
Revert commit 69a37bea (cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c 3) test reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 e11538d1f03914eb92af5a1a378375c05ae8520c In summary, netperf TCP_RR numbers improve by approximately 4% after reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. When 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency never seems to get above 40%. Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 
3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commit e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 
50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. Requested-by: Jeremy Eder <jeder@redhat.com> Tested-by: Len Brown <len.brown@intel.com> Cc: 3.8+ <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Diffstat (limited to 'drivers/cpuidle')
-rw-r--r--drivers/cpuidle/governors/menu.c73
1 file changed, 4 insertions(+), 69 deletions(-)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b69a87e22155..bc580b67a652 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -28,13 +28,6 @@
28#define MAX_INTERESTING 50000 28#define MAX_INTERESTING 50000
29#define STDDEV_THRESH 400 29#define STDDEV_THRESH 400
30 30
31/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */
32#define MAX_DEVIATION 60
33
34static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
35static DEFINE_PER_CPU(int, hrtimer_status);
36/* menu hrtimer mode */
37enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};
38 31
39/* 32/*
40 * Concepts and ideas behind the menu governor 33 * Concepts and ideas behind the menu governor
@@ -198,42 +191,17 @@ static u64 div_round64(u64 dividend, u32 divisor)
198 return div_u64(dividend + (divisor / 2), divisor); 191 return div_u64(dividend + (divisor / 2), divisor);
199} 192}
200 193
201/* Cancel the hrtimer if it is not triggered yet */
202void menu_hrtimer_cancel(void)
203{
204 int cpu = smp_processor_id();
205 struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
206
207 /* The timer is still not time out*/
208 if (per_cpu(hrtimer_status, cpu)) {
209 hrtimer_cancel(hrtmr);
210 per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
211 }
212}
213EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
214
215/* Call back for hrtimer is triggered */
216static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
217{
218 int cpu = smp_processor_id();
219
220 per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
221
222 return HRTIMER_NORESTART;
223}
224
225/* 194/*
226 * Try detecting repeating patterns by keeping track of the last 8 195 * Try detecting repeating patterns by keeping track of the last 8
227 * intervals, and checking if the standard deviation of that set 196 * intervals, and checking if the standard deviation of that set
228 * of points is below a threshold. If it is... then use the 197 * of points is below a threshold. If it is... then use the
229 * average of these 8 points as the estimated value. 198 * average of these 8 points as the estimated value.
230 */ 199 */
231static u32 get_typical_interval(struct menu_device *data) 200static void get_typical_interval(struct menu_device *data)
232{ 201{
233 int i = 0, divisor = 0; 202 int i = 0, divisor = 0;
234 uint64_t max = 0, avg = 0, stddev = 0; 203 uint64_t max = 0, avg = 0, stddev = 0;
235 int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */ 204 int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
236 unsigned int ret = 0;
237 205
238again: 206again:
239 207
@@ -274,16 +242,13 @@ again:
274 if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3)) 242 if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
275 || stddev <= 20) { 243 || stddev <= 20) {
276 data->predicted_us = avg; 244 data->predicted_us = avg;
277 ret = 1; 245 return;
278 return ret;
279 246
280 } else if ((divisor * 4) > INTERVALS * 3) { 247 } else if ((divisor * 4) > INTERVALS * 3) {
281 /* Exclude the max interval */ 248 /* Exclude the max interval */
282 thresh = max - 1; 249 thresh = max - 1;
283 goto again; 250 goto again;
284 } 251 }
285
286 return ret;
287} 252}
288 253
289/** 254/**
@@ -298,9 +263,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
298 int i; 263 int i;
299 int multiplier; 264 int multiplier;
300 struct timespec t; 265 struct timespec t;
301 int repeat = 0, low_predicted = 0;
302 int cpu = smp_processor_id();
303 struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
304 266
305 if (data->needs_update) { 267 if (data->needs_update) {
306 menu_update(drv, dev); 268 menu_update(drv, dev);
@@ -335,7 +297,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
335 data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket], 297 data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
336 RESOLUTION * DECAY); 298 RESOLUTION * DECAY);
337 299
338 repeat = get_typical_interval(data); 300 get_typical_interval(data);
339 301
340 /* 302 /*
341 * We want to default to C1 (hlt), not to busy polling 303 * We want to default to C1 (hlt), not to busy polling
@@ -356,10 +318,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
356 318
357 if (s->disabled || su->disable) 319 if (s->disabled || su->disable)
358 continue; 320 continue;
359 if (s->target_residency > data->predicted_us) { 321 if (s->target_residency > data->predicted_us)
360 low_predicted = 1;
361 continue; 322 continue;
362 }
363 if (s->exit_latency > latency_req) 323 if (s->exit_latency > latency_req)
364 continue; 324 continue;
365 if (s->exit_latency * multiplier > data->predicted_us) 325 if (s->exit_latency * multiplier > data->predicted_us)
@@ -369,28 +329,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
369 data->exit_us = s->exit_latency; 329 data->exit_us = s->exit_latency;
370 } 330 }
371 331
372 /* not deepest C-state chosen for low predicted residency */
373 if (low_predicted) {
374 unsigned int timer_us = 0;
375
376 /*
377 * Set a timer to detect whether this sleep is much
378 * longer than repeat mode predicted. If the timer
379 * triggers, the code will evaluate whether to put
380 * the CPU into a deeper C-state.
381 * The timer is cancelled on CPU wakeup.
382 */
383 timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
384
385 if (repeat && (4 * timer_us < data->expected_us)) {
386 RCU_NONIDLE(hrtimer_start(hrtmr,
387 ns_to_ktime(1000 * timer_us),
388 HRTIMER_MODE_REL_PINNED));
389 /* In repeat case, menu hrtimer is started */
390 per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
391 }
392 }
393
394 return data->last_state_idx; 332 return data->last_state_idx;
395} 333}
396 334
@@ -481,9 +419,6 @@ static int menu_enable_device(struct cpuidle_driver *drv,
481 struct cpuidle_device *dev) 419 struct cpuidle_device *dev)
482{ 420{
483 struct menu_device *data = &per_cpu(menu_devices, dev->cpu); 421 struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
484 struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu);
485 hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
486 t->function = menu_hrtimer_notify;
487 422
488 memset(data, 0, sizeof(struct menu_device)); 423 memset(data, 0, sizeof(struct menu_device));
489 424