aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cpuidle/sysfs.txt6
-rw-r--r--arch/x86/xen/smp_pv.c1
-rw-r--r--drivers/cpuidle/cpuidle.c10
-rw-r--r--drivers/cpuidle/governors/ladder.c3
-rw-r--r--drivers/cpuidle/governors/menu.c113
-rw-r--r--drivers/net/ethernet/sfc/mcdi.c2
-rw-r--r--include/linux/cpuidle.h8
-rw-r--r--include/linux/hrtimer.h1
-rw-r--r--include/linux/jiffies.h7
-rw-r--r--include/linux/tick.h25
-rw-r--r--kernel/power/qos.c2
-rw-r--r--kernel/sched/idle.c34
-rw-r--r--kernel/time/hrtimer.c53
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-sched.c250
-rw-r--r--kernel/time/tick-sched.h18
16 files changed, 415 insertions, 120 deletions
diff --git a/Documentation/cpuidle/sysfs.txt b/Documentation/cpuidle/sysfs.txt
index b6f44f490ed7..d1587f434e7b 100644
--- a/Documentation/cpuidle/sysfs.txt
+++ b/Documentation/cpuidle/sysfs.txt
@@ -40,6 +40,7 @@ total 0
40-r--r--r-- 1 root root 4096 Feb 8 10:42 latency 40-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
41-r--r--r-- 1 root root 4096 Feb 8 10:42 name 41-r--r--r-- 1 root root 4096 Feb 8 10:42 name
42-r--r--r-- 1 root root 4096 Feb 8 10:42 power 42-r--r--r-- 1 root root 4096 Feb 8 10:42 power
43-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
43-r--r--r-- 1 root root 4096 Feb 8 10:42 time 44-r--r--r-- 1 root root 4096 Feb 8 10:42 time
44-r--r--r-- 1 root root 4096 Feb 8 10:42 usage 45-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
45 46
@@ -50,6 +51,7 @@ total 0
50-r--r--r-- 1 root root 4096 Feb 8 10:42 latency 51-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
51-r--r--r-- 1 root root 4096 Feb 8 10:42 name 52-r--r--r-- 1 root root 4096 Feb 8 10:42 name
52-r--r--r-- 1 root root 4096 Feb 8 10:42 power 53-r--r--r-- 1 root root 4096 Feb 8 10:42 power
54-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
53-r--r--r-- 1 root root 4096 Feb 8 10:42 time 55-r--r--r-- 1 root root 4096 Feb 8 10:42 time
54-r--r--r-- 1 root root 4096 Feb 8 10:42 usage 56-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
55 57
@@ -60,6 +62,7 @@ total 0
60-r--r--r-- 1 root root 4096 Feb 8 10:42 latency 62-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
61-r--r--r-- 1 root root 4096 Feb 8 10:42 name 63-r--r--r-- 1 root root 4096 Feb 8 10:42 name
62-r--r--r-- 1 root root 4096 Feb 8 10:42 power 64-r--r--r-- 1 root root 4096 Feb 8 10:42 power
65-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
63-r--r--r-- 1 root root 4096 Feb 8 10:42 time 66-r--r--r-- 1 root root 4096 Feb 8 10:42 time
64-r--r--r-- 1 root root 4096 Feb 8 10:42 usage 67-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
65 68
@@ -70,6 +73,7 @@ total 0
70-r--r--r-- 1 root root 4096 Feb 8 10:42 latency 73-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
71-r--r--r-- 1 root root 4096 Feb 8 10:42 name 74-r--r--r-- 1 root root 4096 Feb 8 10:42 name
72-r--r--r-- 1 root root 4096 Feb 8 10:42 power 75-r--r--r-- 1 root root 4096 Feb 8 10:42 power
76-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
73-r--r--r-- 1 root root 4096 Feb 8 10:42 time 77-r--r--r-- 1 root root 4096 Feb 8 10:42 time
74-r--r--r-- 1 root root 4096 Feb 8 10:42 usage 78-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
75-------------------------------------------------------------------------------- 79--------------------------------------------------------------------------------
@@ -78,6 +82,8 @@ total 0
78* desc : Small description about the idle state (string) 82* desc : Small description about the idle state (string)
79* disable : Option to disable this idle state (bool) -> see note below 83* disable : Option to disable this idle state (bool) -> see note below
80* latency : Latency to exit out of this idle state (in microseconds) 84* latency : Latency to exit out of this idle state (in microseconds)
 85* residency : Time after which a state becomes more efficient than any
86 shallower state (in microseconds)
81* name : Name of the idle state (string) 87* name : Name of the idle state (string)
82* power : Power consumed while in this idle state (in milliwatts) 88* power : Power consumed while in this idle state (in milliwatts)
83* time : Total time spent in this idle state (in microseconds) 89* time : Total time spent in this idle state (in microseconds)
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index c0c756c76afe..2e20ae2fa2d6 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
425 * data back is to call: 425 * data back is to call:
426 */ 426 */
427 tick_nohz_idle_enter(); 427 tick_nohz_idle_enter();
428 tick_nohz_idle_stop_tick_protected();
428 429
429 cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); 430 cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
430} 431}
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 0003e9a02637..6df894d65d9e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
272 * 272 *
273 * @drv: the cpuidle driver 273 * @drv: the cpuidle driver
274 * @dev: the cpuidle device 274 * @dev: the cpuidle device
275 * @stop_tick: indication on whether or not to stop the tick
275 * 276 *
276 * Returns the index of the idle state. The return value must not be negative. 277 * Returns the index of the idle state. The return value must not be negative.
278 *
279 * The memory location pointed to by @stop_tick is expected to be written the
280 * 'false' boolean value if the scheduler tick should not be stopped before
281 * entering the returned state.
277 */ 282 */
278int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) 283int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
284 bool *stop_tick)
279{ 285{
280 return cpuidle_curr_governor->select(drv, dev); 286 return cpuidle_curr_governor->select(drv, dev, stop_tick);
281} 287}
282 288
283/** 289/**
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 1ad8745fd6d6..b24883f85c99 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -63,9 +63,10 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
63 * ladder_select_state - selects the next state to enter 63 * ladder_select_state - selects the next state to enter
64 * @drv: cpuidle driver 64 * @drv: cpuidle driver
65 * @dev: the CPU 65 * @dev: the CPU
66 * @dummy: not used
66 */ 67 */
67static int ladder_select_state(struct cpuidle_driver *drv, 68static int ladder_select_state(struct cpuidle_driver *drv,
68 struct cpuidle_device *dev) 69 struct cpuidle_device *dev, bool *dummy)
69{ 70{
70 struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); 71 struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
71 struct device *device = get_cpu_device(dev->cpu); 72 struct device *device = get_cpu_device(dev->cpu);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index aa390404e85f..1bfe03ceb236 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -123,6 +123,7 @@
123struct menu_device { 123struct menu_device {
124 int last_state_idx; 124 int last_state_idx;
125 int needs_update; 125 int needs_update;
126 int tick_wakeup;
126 127
127 unsigned int next_timer_us; 128 unsigned int next_timer_us;
128 unsigned int predicted_us; 129 unsigned int predicted_us;
@@ -279,8 +280,10 @@ again:
279 * menu_select - selects the next idle state to enter 280 * menu_select - selects the next idle state to enter
280 * @drv: cpuidle driver containing state data 281 * @drv: cpuidle driver containing state data
281 * @dev: the CPU 282 * @dev: the CPU
283 * @stop_tick: indication on whether or not to stop the tick
282 */ 284 */
283static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) 285static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
286 bool *stop_tick)
284{ 287{
285 struct menu_device *data = this_cpu_ptr(&menu_devices); 288 struct menu_device *data = this_cpu_ptr(&menu_devices);
286 struct device *device = get_cpu_device(dev->cpu); 289 struct device *device = get_cpu_device(dev->cpu);
@@ -292,6 +295,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
292 unsigned int expected_interval; 295 unsigned int expected_interval;
293 unsigned long nr_iowaiters, cpu_load; 296 unsigned long nr_iowaiters, cpu_load;
294 int resume_latency = dev_pm_qos_raw_read_value(device); 297 int resume_latency = dev_pm_qos_raw_read_value(device);
298 ktime_t delta_next;
295 299
296 if (data->needs_update) { 300 if (data->needs_update) {
297 menu_update(drv, dev); 301 menu_update(drv, dev);
@@ -303,11 +307,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
303 latency_req = resume_latency; 307 latency_req = resume_latency;
304 308
305 /* Special case when user has set very strict latency requirement */ 309 /* Special case when user has set very strict latency requirement */
306 if (unlikely(latency_req == 0)) 310 if (unlikely(latency_req == 0)) {
311 *stop_tick = false;
307 return 0; 312 return 0;
313 }
308 314
309 /* determine the expected residency time, round up */ 315 /* determine the expected residency time, round up */
310 data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); 316 data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
311 317
312 get_iowait_load(&nr_iowaiters, &cpu_load); 318 get_iowait_load(&nr_iowaiters, &cpu_load);
313 data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); 319 data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -346,14 +352,30 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
346 */ 352 */
347 data->predicted_us = min(data->predicted_us, expected_interval); 353 data->predicted_us = min(data->predicted_us, expected_interval);
348 354
349 /* 355 if (tick_nohz_tick_stopped()) {
350 * Use the performance multiplier and the user-configurable 356 /*
351 * latency_req to determine the maximum exit latency. 357 * If the tick is already stopped, the cost of possible short
352 */ 358 * idle duration misprediction is much higher, because the CPU
353 interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); 359 * may be stuck in a shallow idle state for a long time as a
354 if (latency_req > interactivity_req) 360 * result of it. In that case say we might mispredict and try
355 latency_req = interactivity_req; 361 * to force the CPU into a state for which we would have stopped
362 * the tick, unless a timer is going to expire really soon
363 * anyway.
364 */
365 if (data->predicted_us < TICK_USEC)
366 data->predicted_us = min_t(unsigned int, TICK_USEC,
367 ktime_to_us(delta_next));
368 } else {
369 /*
370 * Use the performance multiplier and the user-configurable
371 * latency_req to determine the maximum exit latency.
372 */
373 interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
374 if (latency_req > interactivity_req)
375 latency_req = interactivity_req;
376 }
356 377
378 expected_interval = data->predicted_us;
357 /* 379 /*
358 * Find the idle state with the lowest power while satisfying 380 * Find the idle state with the lowest power while satisfying
359 * our constraints. 381 * our constraints.
@@ -369,15 +391,52 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
369 idx = i; /* first enabled state */ 391 idx = i; /* first enabled state */
370 if (s->target_residency > data->predicted_us) 392 if (s->target_residency > data->predicted_us)
371 break; 393 break;
372 if (s->exit_latency > latency_req) 394 if (s->exit_latency > latency_req) {
395 /*
396 * If we break out of the loop for latency reasons, use
397 * the target residency of the selected state as the
398 * expected idle duration so that the tick is retained
399 * as long as that target residency is low enough.
400 */
401 expected_interval = drv->states[idx].target_residency;
373 break; 402 break;
374 403 }
375 idx = i; 404 idx = i;
376 } 405 }
377 406
378 if (idx == -1) 407 if (idx == -1)
379 idx = 0; /* No states enabled. Must use 0. */ 408 idx = 0; /* No states enabled. Must use 0. */
380 409
410 /*
411 * Don't stop the tick if the selected state is a polling one or if the
412 * expected idle duration is shorter than the tick period length.
413 */
414 if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
415 expected_interval < TICK_USEC) {
416 unsigned int delta_next_us = ktime_to_us(delta_next);
417
418 *stop_tick = false;
419
420 if (!tick_nohz_tick_stopped() && idx > 0 &&
421 drv->states[idx].target_residency > delta_next_us) {
422 /*
423 * The tick is not going to be stopped and the target
424 * residency of the state to be returned is not within
425 * the time until the next timer event including the
426 * tick, so try to correct that.
427 */
428 for (i = idx - 1; i >= 0; i--) {
429 if (drv->states[i].disabled ||
430 dev->states_usage[i].disable)
431 continue;
432
433 idx = i;
434 if (drv->states[i].target_residency <= delta_next_us)
435 break;
436 }
437 }
438 }
439
381 data->last_state_idx = idx; 440 data->last_state_idx = idx;
382 441
383 return data->last_state_idx; 442 return data->last_state_idx;
@@ -397,6 +456,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
397 456
398 data->last_state_idx = index; 457 data->last_state_idx = index;
399 data->needs_update = 1; 458 data->needs_update = 1;
459 data->tick_wakeup = tick_nohz_idle_got_tick();
400} 460}
401 461
402/** 462/**
@@ -427,14 +487,27 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
427 * assume the state was never reached and the exit latency is 0. 487 * assume the state was never reached and the exit latency is 0.
428 */ 488 */
429 489
430 /* measured value */ 490 if (data->tick_wakeup && data->next_timer_us > TICK_USEC) {
431 measured_us = cpuidle_get_last_residency(dev); 491 /*
432 492 * The nohz code said that there wouldn't be any events within
433 /* Deduct exit latency */ 493 * the tick boundary (if the tick was stopped), but the idle
434 if (measured_us > 2 * target->exit_latency) 494 * duration predictor had a differing opinion. Since the CPU
435 measured_us -= target->exit_latency; 495 * was woken up by a tick (that wasn't stopped after all), the
436 else 496 * predictor was not quite right, so assume that the CPU could
437 measured_us /= 2; 497 * have been idle long (but not forever) to help the idle
498 * duration predictor do a better job next time.
499 */
500 measured_us = 9 * MAX_INTERESTING / 10;
501 } else {
502 /* measured value */
503 measured_us = cpuidle_get_last_residency(dev);
504
505 /* Deduct exit latency */
506 if (measured_us > 2 * target->exit_latency)
507 measured_us -= target->exit_latency;
508 else
509 measured_us /= 2;
510 }
438 511
439 /* Make sure our coefficients do not exceed unity */ 512 /* Make sure our coefficients do not exceed unity */
440 if (measured_us > data->next_timer_us) 513 if (measured_us > data->next_timer_us)
diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 9c2567b0d93e..dfad93fca0a6 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic *efx)
375 * because generally mcdi responses are fast. After that, back off 375 * because generally mcdi responses are fast. After that, back off
376 * and poll once a jiffy (approximately) 376 * and poll once a jiffy (approximately)
377 */ 377 */
378 spins = TICK_USEC; 378 spins = USER_TICK_USEC;
379 finish = jiffies + MCDI_RPC_TIMEOUT; 379 finish = jiffies + MCDI_RPC_TIMEOUT;
380 380
381 while (1) { 381 while (1) {
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index a806e94c482f..1eefabf1621f 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct cpuidle_driver *drv,
135 struct cpuidle_device *dev); 135 struct cpuidle_device *dev);
136 136
137extern int cpuidle_select(struct cpuidle_driver *drv, 137extern int cpuidle_select(struct cpuidle_driver *drv,
138 struct cpuidle_device *dev); 138 struct cpuidle_device *dev,
139 bool *stop_tick);
139extern int cpuidle_enter(struct cpuidle_driver *drv, 140extern int cpuidle_enter(struct cpuidle_driver *drv,
140 struct cpuidle_device *dev, int index); 141 struct cpuidle_device *dev, int index);
141extern void cpuidle_reflect(struct cpuidle_device *dev, int index); 142extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -167,7 +168,7 @@ static inline bool cpuidle_not_available(struct cpuidle_driver *drv,
167 struct cpuidle_device *dev) 168 struct cpuidle_device *dev)
168{return true; } 169{return true; }
169static inline int cpuidle_select(struct cpuidle_driver *drv, 170static inline int cpuidle_select(struct cpuidle_driver *drv,
170 struct cpuidle_device *dev) 171 struct cpuidle_device *dev, bool *stop_tick)
171{return -ENODEV; } 172{return -ENODEV; }
172static inline int cpuidle_enter(struct cpuidle_driver *drv, 173static inline int cpuidle_enter(struct cpuidle_driver *drv,
173 struct cpuidle_device *dev, int index) 174 struct cpuidle_device *dev, int index)
@@ -250,7 +251,8 @@ struct cpuidle_governor {
250 struct cpuidle_device *dev); 251 struct cpuidle_device *dev);
251 252
252 int (*select) (struct cpuidle_driver *drv, 253 int (*select) (struct cpuidle_driver *drv,
253 struct cpuidle_device *dev); 254 struct cpuidle_device *dev,
255 bool *stop_tick);
254 void (*reflect) (struct cpuidle_device *dev, int index); 256 void (*reflect) (struct cpuidle_device *dev, int index);
255}; 257};
256 258
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 78f456fcd242..a2656c3ebe81 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -424,6 +424,7 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
424} 424}
425 425
426extern u64 hrtimer_get_next_event(void); 426extern u64 hrtimer_get_next_event(void);
427extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);
427 428
428extern bool hrtimer_active(const struct hrtimer *timer); 429extern bool hrtimer_active(const struct hrtimer *timer);
429 430
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 9385aa57497b..a27cf6652327 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -62,8 +62,11 @@ extern int register_refined_jiffies(long clock_tick_rate);
62/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ 62/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */
63#define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) 63#define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ)
64 64
65/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ 65/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
66#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) 66#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)
67
68/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
69#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
67 70
68#ifndef __jiffy_arch_data 71#ifndef __jiffy_arch_data
69#define __jiffy_arch_data 72#define __jiffy_arch_data
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7f8c9a127f5a..55388ab45fd4 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -115,27 +115,46 @@ enum tick_dep_bits {
115extern bool tick_nohz_enabled; 115extern bool tick_nohz_enabled;
116extern bool tick_nohz_tick_stopped(void); 116extern bool tick_nohz_tick_stopped(void);
117extern bool tick_nohz_tick_stopped_cpu(int cpu); 117extern bool tick_nohz_tick_stopped_cpu(int cpu);
118extern void tick_nohz_idle_stop_tick(void);
119extern void tick_nohz_idle_retain_tick(void);
120extern void tick_nohz_idle_restart_tick(void);
118extern void tick_nohz_idle_enter(void); 121extern void tick_nohz_idle_enter(void);
119extern void tick_nohz_idle_exit(void); 122extern void tick_nohz_idle_exit(void);
120extern void tick_nohz_irq_exit(void); 123extern void tick_nohz_irq_exit(void);
121extern ktime_t tick_nohz_get_sleep_length(void); 124extern bool tick_nohz_idle_got_tick(void);
125extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
122extern unsigned long tick_nohz_get_idle_calls(void); 126extern unsigned long tick_nohz_get_idle_calls(void);
123extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); 127extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
124extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 128extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
125extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); 129extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
130
131static inline void tick_nohz_idle_stop_tick_protected(void)
132{
133 local_irq_disable();
134 tick_nohz_idle_stop_tick();
135 local_irq_enable();
136}
137
126#else /* !CONFIG_NO_HZ_COMMON */ 138#else /* !CONFIG_NO_HZ_COMMON */
127#define tick_nohz_enabled (0) 139#define tick_nohz_enabled (0)
128static inline int tick_nohz_tick_stopped(void) { return 0; } 140static inline int tick_nohz_tick_stopped(void) { return 0; }
129static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } 141static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
142static inline void tick_nohz_idle_stop_tick(void) { }
143static inline void tick_nohz_idle_retain_tick(void) { }
144static inline void tick_nohz_idle_restart_tick(void) { }
130static inline void tick_nohz_idle_enter(void) { } 145static inline void tick_nohz_idle_enter(void) { }
131static inline void tick_nohz_idle_exit(void) { } 146static inline void tick_nohz_idle_exit(void) { }
147static inline bool tick_nohz_idle_got_tick(void) { return false; }
132 148
133static inline ktime_t tick_nohz_get_sleep_length(void) 149static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
134{ 150{
135 return NSEC_PER_SEC / HZ; 151 *delta_next = TICK_NSEC;
152 return *delta_next;
136} 153}
137static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } 154static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
138static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } 155static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
156
157static inline void tick_nohz_idle_stop_tick_protected(void) { }
139#endif /* !CONFIG_NO_HZ_COMMON */ 158#endif /* !CONFIG_NO_HZ_COMMON */
140 159
141#ifdef CONFIG_NO_HZ_FULL 160#ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..fa39092b7aea 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
295 * changed 295 * changed
296 */ 296 */
297 plist_del(node, &c->list); 297 plist_del(node, &c->list);
298 /* fall through */
298 case PM_QOS_ADD_REQ: 299 case PM_QOS_ADD_REQ:
299 plist_node_init(node, new_value); 300 plist_node_init(node, new_value);
300 plist_add(node, &c->list); 301 plist_add(node, &c->list);
@@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
367 break; 368 break;
368 case PM_QOS_UPDATE_REQ: 369 case PM_QOS_UPDATE_REQ:
369 pm_qos_flags_remove_req(pqf, req); 370 pm_qos_flags_remove_req(pqf, req);
371 /* fall through */
370 case PM_QOS_ADD_REQ: 372 case PM_QOS_ADD_REQ:
371 req->flags = val; 373 req->flags = val;
372 INIT_LIST_HEAD(&req->node); 374 INIT_LIST_HEAD(&req->node);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2975f195e1c4..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -141,13 +141,15 @@ static void cpuidle_idle_call(void)
141 } 141 }
142 142
143 /* 143 /*
144 * Tell the RCU framework we are entering an idle section, 144 * The RCU framework needs to be told that we are entering an idle
145 * so no more rcu read side critical sections and one more 145 * section, so no more rcu read side critical sections and one more
146 * step to the grace period 146 * step to the grace period
147 */ 147 */
148 rcu_idle_enter();
149 148
150 if (cpuidle_not_available(drv, dev)) { 149 if (cpuidle_not_available(drv, dev)) {
150 tick_nohz_idle_stop_tick();
151 rcu_idle_enter();
152
151 default_idle_call(); 153 default_idle_call();
152 goto exit_idle; 154 goto exit_idle;
153 } 155 }
@@ -164,20 +166,37 @@ static void cpuidle_idle_call(void)
164 166
165 if (idle_should_enter_s2idle() || dev->use_deepest_state) { 167 if (idle_should_enter_s2idle() || dev->use_deepest_state) {
166 if (idle_should_enter_s2idle()) { 168 if (idle_should_enter_s2idle()) {
169 rcu_idle_enter();
170
167 entered_state = cpuidle_enter_s2idle(drv, dev); 171 entered_state = cpuidle_enter_s2idle(drv, dev);
168 if (entered_state > 0) { 172 if (entered_state > 0) {
169 local_irq_enable(); 173 local_irq_enable();
170 goto exit_idle; 174 goto exit_idle;
171 } 175 }
176
177 rcu_idle_exit();
172 } 178 }
173 179
180 tick_nohz_idle_stop_tick();
181 rcu_idle_enter();
182
174 next_state = cpuidle_find_deepest_state(drv, dev); 183 next_state = cpuidle_find_deepest_state(drv, dev);
175 call_cpuidle(drv, dev, next_state); 184 call_cpuidle(drv, dev, next_state);
176 } else { 185 } else {
186 bool stop_tick = true;
187
177 /* 188 /*
178 * Ask the cpuidle framework to choose a convenient idle state. 189 * Ask the cpuidle framework to choose a convenient idle state.
179 */ 190 */
180 next_state = cpuidle_select(drv, dev); 191 next_state = cpuidle_select(drv, dev, &stop_tick);
192
193 if (stop_tick)
194 tick_nohz_idle_stop_tick();
195 else
196 tick_nohz_idle_retain_tick();
197
198 rcu_idle_enter();
199
181 entered_state = call_cpuidle(drv, dev, next_state); 200 entered_state = call_cpuidle(drv, dev, next_state);
182 /* 201 /*
183 * Give the governor an opportunity to reflect on the outcome 202 * Give the governor an opportunity to reflect on the outcome
@@ -222,6 +241,7 @@ static void do_idle(void)
222 rmb(); 241 rmb();
223 242
224 if (cpu_is_offline(cpu)) { 243 if (cpu_is_offline(cpu)) {
244 tick_nohz_idle_stop_tick_protected();
225 cpuhp_report_idle_dead(); 245 cpuhp_report_idle_dead();
226 arch_cpu_idle_dead(); 246 arch_cpu_idle_dead();
227 } 247 }
@@ -235,10 +255,12 @@ static void do_idle(void)
235 * broadcast device expired for us, we don't want to go deep 255 * broadcast device expired for us, we don't want to go deep
236 * idle as we know that the IPI is going to arrive right away. 256 * idle as we know that the IPI is going to arrive right away.
237 */ 257 */
238 if (cpu_idle_force_poll || tick_check_broadcast_expired()) 258 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
259 tick_nohz_idle_restart_tick();
239 cpu_idle_poll(); 260 cpu_idle_poll();
240 else 261 } else {
241 cpuidle_idle_call(); 262 cpuidle_idle_call();
263 }
242 arch_cpu_idle_exit(); 264 arch_cpu_idle_exit();
243 } 265 }
244 266
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9b082ce86325..eda1210ce50f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -480,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
480 while ((base = __next_base((cpu_base), &(active)))) 480 while ((base = __next_base((cpu_base), &(active))))
481 481
482static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, 482static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
483 const struct hrtimer *exclude,
483 unsigned int active, 484 unsigned int active,
484 ktime_t expires_next) 485 ktime_t expires_next)
485{ 486{
@@ -492,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
492 493
493 next = timerqueue_getnext(&base->active); 494 next = timerqueue_getnext(&base->active);
494 timer = container_of(next, struct hrtimer, node); 495 timer = container_of(next, struct hrtimer, node);
496 if (timer == exclude) {
497 /* Get to the next timer in the queue. */
498 next = timerqueue_iterate_next(next);
499 if (!next)
500 continue;
501
502 timer = container_of(next, struct hrtimer, node);
503 }
495 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 504 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
496 if (expires < expires_next) { 505 if (expires < expires_next) {
497 expires_next = expires; 506 expires_next = expires;
507
508 /* Skip cpu_base update if a timer is being excluded. */
509 if (exclude)
510 continue;
511
498 if (timer->is_soft) 512 if (timer->is_soft)
499 cpu_base->softirq_next_timer = timer; 513 cpu_base->softirq_next_timer = timer;
500 else 514 else
@@ -538,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
538 if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { 552 if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
539 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 553 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
540 cpu_base->softirq_next_timer = NULL; 554 cpu_base->softirq_next_timer = NULL;
541 expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); 555 expires_next = __hrtimer_next_event_base(cpu_base, NULL,
556 active, KTIME_MAX);
542 557
543 next_timer = cpu_base->softirq_next_timer; 558 next_timer = cpu_base->softirq_next_timer;
544 } 559 }
@@ -546,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
546 if (active_mask & HRTIMER_ACTIVE_HARD) { 561 if (active_mask & HRTIMER_ACTIVE_HARD) {
547 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 562 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
548 cpu_base->next_timer = next_timer; 563 cpu_base->next_timer = next_timer;
549 expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); 564 expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
565 expires_next);
550 } 566 }
551 567
552 return expires_next; 568 return expires_next;
@@ -1190,6 +1206,39 @@ u64 hrtimer_get_next_event(void)
1190 1206
1191 return expires; 1207 return expires;
1192} 1208}
1209
1210/**
1211 * hrtimer_next_event_without - time until next expiry event w/o one timer
1212 * @exclude: timer to exclude
1213 *
1214 * Returns the next expiry time over all timers except for the @exclude one or
1215 * KTIME_MAX if none of them is pending.
1216 */
1217u64 hrtimer_next_event_without(const struct hrtimer *exclude)
1218{
1219 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1220 u64 expires = KTIME_MAX;
1221 unsigned long flags;
1222
1223 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1224
1225 if (__hrtimer_hres_active(cpu_base)) {
1226 unsigned int active;
1227
1228 if (!cpu_base->softirq_activated) {
1229 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
1230 expires = __hrtimer_next_event_base(cpu_base, exclude,
1231 active, KTIME_MAX);
1232 }
1233 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
1234 expires = __hrtimer_next_event_base(cpu_base, exclude, active,
1235 expires);
1236 }
1237
1238 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1239
1240 return expires;
1241}
1193#endif 1242#endif
1194 1243
1195static inline int hrtimer_clockid_to_base(clockid_t clock_id) 1244static inline int hrtimer_clockid_to_base(clockid_t clock_id)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
31 31
32 32
33/* USER_HZ period (usecs): */ 33/* USER_HZ period (usecs): */
34unsigned long tick_usec = TICK_USEC; 34unsigned long tick_usec = USER_TICK_USEC;
35 35
36/* SHIFTED_HZ period (nsecs): */ 36/* SHIFTED_HZ period (nsecs): */
37unsigned long tick_nsec; 37unsigned long tick_nsec;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f3ab08caa2c3..646645e981f9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -122,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void)
122 return period; 122 return period;
123} 123}
124 124
125 125static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
126static void tick_sched_do_timer(ktime_t now)
127{ 126{
128 int cpu = smp_processor_id(); 127 int cpu = smp_processor_id();
129 128
@@ -143,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now)
143 /* Check, if the jiffies need an update */ 142 /* Check, if the jiffies need an update */
144 if (tick_do_timer_cpu == cpu) 143 if (tick_do_timer_cpu == cpu)
145 tick_do_update_jiffies64(now); 144 tick_do_update_jiffies64(now);
145
146 if (ts->inidle)
147 ts->got_idle_tick = 1;
146} 148}
147 149
148static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 150static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -474,7 +476,9 @@ __setup("nohz=", setup_tick_nohz);
474 476
475bool tick_nohz_tick_stopped(void) 477bool tick_nohz_tick_stopped(void)
476{ 478{
477 return __this_cpu_read(tick_cpu_sched.tick_stopped); 479 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
480
481 return ts->tick_stopped;
478} 482}
479 483
480bool tick_nohz_tick_stopped_cpu(int cpu) 484bool tick_nohz_tick_stopped_cpu(int cpu)
@@ -537,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
537 sched_clock_idle_wakeup_event(); 541 sched_clock_idle_wakeup_event();
538} 542}
539 543
540static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 544static void tick_nohz_start_idle(struct tick_sched *ts)
541{ 545{
542 ktime_t now = ktime_get(); 546 ts->idle_entrytime = ktime_get();
543
544 ts->idle_entrytime = now;
545 ts->idle_active = 1; 547 ts->idle_active = 1;
546 sched_clock_idle_sleep_event(); 548 sched_clock_idle_sleep_event();
547 return now;
548} 549}
549 550
550/** 551/**
@@ -653,13 +654,10 @@ static inline bool local_timer_softirq_pending(void)
653 return local_softirq_pending() & TIMER_SOFTIRQ; 654 return local_softirq_pending() & TIMER_SOFTIRQ;
654} 655}
655 656
656static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 657static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
657 ktime_t now, int cpu)
658{ 658{
659 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
660 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; 659 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
661 unsigned long seq, basejiff; 660 unsigned long seq, basejiff;
662 ktime_t tick;
663 661
664 /* Read jiffies and the time when jiffies were updated last */ 662 /* Read jiffies and the time when jiffies were updated last */
665 do { 663 do {
@@ -668,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
668 basejiff = jiffies; 666 basejiff = jiffies;
669 } while (read_seqretry(&jiffies_lock, seq)); 667 } while (read_seqretry(&jiffies_lock, seq));
670 ts->last_jiffies = basejiff; 668 ts->last_jiffies = basejiff;
669 ts->timer_expires_base = basemono;
671 670
672 /* 671 /*
673 * Keep the periodic tick, when RCU, architecture or irq_work 672 * Keep the periodic tick, when RCU, architecture or irq_work
@@ -712,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
712 * next period, so no point in stopping it either, bail. 711 * next period, so no point in stopping it either, bail.
713 */ 712 */
714 if (!ts->tick_stopped) { 713 if (!ts->tick_stopped) {
715 tick = 0; 714 ts->timer_expires = 0;
716 goto out; 715 goto out;
717 } 716 }
718 } 717 }
719 718
720 /* 719 /*
720 * If this CPU is the one which had the do_timer() duty last, we limit
721 * the sleep time to the timekeeping max_deferment value.
722 * Otherwise we can sleep as long as we want.
723 */
724 delta = timekeeping_max_deferment();
725 if (cpu != tick_do_timer_cpu &&
726 (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
727 delta = KTIME_MAX;
728
729 /* Calculate the next expiry time */
730 if (delta < (KTIME_MAX - basemono))
731 expires = basemono + delta;
732 else
733 expires = KTIME_MAX;
734
735 ts->timer_expires = min_t(u64, expires, next_tick);
736
737out:
738 return ts->timer_expires;
739}
740
741static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
742{
743 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
744 u64 basemono = ts->timer_expires_base;
745 u64 expires = ts->timer_expires;
746 ktime_t tick = expires;
747
748 /* Make sure we won't be trying to stop it twice in a row. */
749 ts->timer_expires_base = 0;
750
751 /*
721 * If this CPU is the one which updates jiffies, then give up 752 * If this CPU is the one which updates jiffies, then give up
722 * the assignment and let it be taken by the CPU which runs 753 * the assignment and let it be taken by the CPU which runs
723 * the tick timer next, which might be this CPU as well. If we 754 * the tick timer next, which might be this CPU as well. If we
724 * don't drop this here the jiffies might be stale and 755 * don't drop this here the jiffies might be stale and
725 * do_timer() never invoked. Keep track of the fact that it 756 * do_timer() never invoked. Keep track of the fact that it
726 * was the one which had the do_timer() duty last. If this CPU 757 * was the one which had the do_timer() duty last.
727 * is the one which had the do_timer() duty last, we limit the
728 * sleep time to the timekeeping max_deferment value.
729 * Otherwise we can sleep as long as we want.
730 */ 758 */
731 delta = timekeeping_max_deferment();
732 if (cpu == tick_do_timer_cpu) { 759 if (cpu == tick_do_timer_cpu) {
733 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 760 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
734 ts->do_timer_last = 1; 761 ts->do_timer_last = 1;
735 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 762 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
736 delta = KTIME_MAX;
737 ts->do_timer_last = 0; 763 ts->do_timer_last = 0;
738 } else if (!ts->do_timer_last) {
739 delta = KTIME_MAX;
740 } 764 }
741 765
742 /* Calculate the next expiry time */
743 if (delta < (KTIME_MAX - basemono))
744 expires = basemono + delta;
745 else
746 expires = KTIME_MAX;
747
748 expires = min_t(u64, expires, next_tick);
749 tick = expires;
750
751 /* Skip reprogram of event if its not changed */ 766 /* Skip reprogram of event if its not changed */
752 if (ts->tick_stopped && (expires == ts->next_tick)) { 767 if (ts->tick_stopped && (expires == ts->next_tick)) {
753 /* Sanity check: make sure clockevent is actually programmed */ 768 /* Sanity check: make sure clockevent is actually programmed */
754 if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) 769 if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
755 goto out; 770 return;
756 771
757 WARN_ON_ONCE(1); 772 WARN_ON_ONCE(1);
758 printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", 773 printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -786,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
786 if (unlikely(expires == KTIME_MAX)) { 801 if (unlikely(expires == KTIME_MAX)) {
787 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 802 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
788 hrtimer_cancel(&ts->sched_timer); 803 hrtimer_cancel(&ts->sched_timer);
789 goto out; 804 return;
790 } 805 }
791 806
792 hrtimer_set_expires(&ts->sched_timer, tick); 807 hrtimer_set_expires(&ts->sched_timer, tick);
@@ -795,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
795 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); 810 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
796 else 811 else
797 tick_program_event(tick, 1); 812 tick_program_event(tick, 1);
798out:
799 /*
800 * Update the estimated sleep length until the next timer
801 * (not only the tick).
802 */
803 ts->sleep_length = ktime_sub(dev->next_event, now);
804 return tick;
805} 813}
806 814
815static void tick_nohz_retain_tick(struct tick_sched *ts)
816{
817 ts->timer_expires_base = 0;
818}
819
820#ifdef CONFIG_NO_HZ_FULL
821static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
822{
823 if (tick_nohz_next_event(ts, cpu))
824 tick_nohz_stop_tick(ts, cpu);
825 else
826 tick_nohz_retain_tick(ts);
827}
828#endif /* CONFIG_NO_HZ_FULL */
829
807static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 830static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
808{ 831{
809 /* Update jiffies first */ 832 /* Update jiffies first */
@@ -839,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
839 return; 862 return;
840 863
841 if (can_stop_full_tick(cpu, ts)) 864 if (can_stop_full_tick(cpu, ts))
842 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 865 tick_nohz_stop_sched_tick(ts, cpu);
843 else if (ts->tick_stopped) 866 else if (ts->tick_stopped)
844 tick_nohz_restart_sched_tick(ts, ktime_get()); 867 tick_nohz_restart_sched_tick(ts, ktime_get());
845#endif 868#endif
@@ -865,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
865 return false; 888 return false;
866 } 889 }
867 890
868 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { 891 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
869 ts->sleep_length = NSEC_PER_SEC / HZ;
870 return false; 892 return false;
871 }
872 893
873 if (need_resched()) 894 if (need_resched())
874 return false; 895 return false;
@@ -903,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
903 return true; 924 return true;
904} 925}
905 926
906static void __tick_nohz_idle_enter(struct tick_sched *ts) 927static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
907{ 928{
908 ktime_t now, expires; 929 ktime_t expires;
909 int cpu = smp_processor_id(); 930 int cpu = smp_processor_id();
910 931
911 now = tick_nohz_start_idle(ts); 932 /*
933 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
934 * tick timer expiration time is known already.
935 */
936 if (ts->timer_expires_base)
937 expires = ts->timer_expires;
938 else if (can_stop_idle_tick(cpu, ts))
939 expires = tick_nohz_next_event(ts, cpu);
940 else
941 return;
942
943 ts->idle_calls++;
912 944
913 if (can_stop_idle_tick(cpu, ts)) { 945 if (expires > 0LL) {
914 int was_stopped = ts->tick_stopped; 946 int was_stopped = ts->tick_stopped;
915 947
916 ts->idle_calls++; 948 tick_nohz_stop_tick(ts, cpu);
917 949
918 expires = tick_nohz_stop_sched_tick(ts, now, cpu); 950 ts->idle_sleeps++;
919 if (expires > 0LL) { 951 ts->idle_expires = expires;
920 ts->idle_sleeps++;
921 ts->idle_expires = expires;
922 }
923 952
924 if (!was_stopped && ts->tick_stopped) { 953 if (!was_stopped && ts->tick_stopped) {
925 ts->idle_jiffies = ts->last_jiffies; 954 ts->idle_jiffies = ts->last_jiffies;
926 nohz_balance_enter_idle(cpu); 955 nohz_balance_enter_idle(cpu);
927 } 956 }
957 } else {
958 tick_nohz_retain_tick(ts);
928 } 959 }
929} 960}
930 961
931/** 962/**
932 * tick_nohz_idle_enter - stop the idle tick from the idle task 963 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
933 * 964 *
934 * When the next event is more than a tick into the future, stop the idle tick 965 * When the next event is more than a tick into the future, stop the idle tick
935 * Called when we start the idle loop. 966 */
936 * 967void tick_nohz_idle_stop_tick(void)
937 * The arch is responsible of calling: 968{
969 __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
970}
971
972void tick_nohz_idle_retain_tick(void)
973{
974 tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
975 /*
976 * Undo the effect of get_next_timer_interrupt() called from
977 * tick_nohz_next_event().
978 */
979 timer_clear_idle();
980}
981
982/**
983 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
938 * 984 *
939 * - rcu_idle_enter() after its last use of RCU before the CPU is put 985 * Called when we start the idle loop.
940 * to sleep.
941 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
942 */ 986 */
943void tick_nohz_idle_enter(void) 987void tick_nohz_idle_enter(void)
944{ 988{
@@ -949,8 +993,11 @@ void tick_nohz_idle_enter(void)
949 local_irq_disable(); 993 local_irq_disable();
950 994
951 ts = this_cpu_ptr(&tick_cpu_sched); 995 ts = this_cpu_ptr(&tick_cpu_sched);
996
997 WARN_ON_ONCE(ts->timer_expires_base);
998
952 ts->inidle = 1; 999 ts->inidle = 1;
953 __tick_nohz_idle_enter(ts); 1000 tick_nohz_start_idle(ts);
954 1001
955 local_irq_enable(); 1002 local_irq_enable();
956} 1003}
@@ -968,21 +1015,62 @@ void tick_nohz_irq_exit(void)
968 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1015 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
969 1016
970 if (ts->inidle) 1017 if (ts->inidle)
971 __tick_nohz_idle_enter(ts); 1018 tick_nohz_start_idle(ts);
972 else 1019 else
973 tick_nohz_full_update_tick(ts); 1020 tick_nohz_full_update_tick(ts);
974} 1021}
975 1022
976/** 1023/**
977 * tick_nohz_get_sleep_length - return the length of the current sleep 1024 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
1025 */
1026bool tick_nohz_idle_got_tick(void)
1027{
1028 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1029
1030 if (ts->got_idle_tick) {
1031 ts->got_idle_tick = 0;
1032 return true;
1033 }
1034 return false;
1035}
1036
1037/**
1038 * tick_nohz_get_sleep_length - return the expected length of the current sleep
1039 * @delta_next: duration until the next event if the tick cannot be stopped
978 * 1040 *
979 * Called from power state control code with interrupts disabled 1041 * Called from power state control code with interrupts disabled
980 */ 1042 */
981ktime_t tick_nohz_get_sleep_length(void) 1043ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
982{ 1044{
1045 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
983 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1046 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1047 int cpu = smp_processor_id();
1048 /*
1049 * The idle entry time is expected to be a sufficient approximation of
1050 * the current time at this point.
1051 */
1052 ktime_t now = ts->idle_entrytime;
1053 ktime_t next_event;
1054
1055 WARN_ON_ONCE(!ts->inidle);
1056
1057 *delta_next = ktime_sub(dev->next_event, now);
1058
1059 if (!can_stop_idle_tick(cpu, ts))
1060 return *delta_next;
1061
1062 next_event = tick_nohz_next_event(ts, cpu);
1063 if (!next_event)
1064 return *delta_next;
1065
1066 /*
1067 * If the next highres timer to expire is earlier than next_event, the
1068 * idle governor needs to know that.
1069 */
1070 next_event = min_t(u64, next_event,
1071 hrtimer_next_event_without(&ts->sched_timer));
984 1072
985 return ts->sleep_length; 1073 return ktime_sub(next_event, now);
986} 1074}
987 1075
988/** 1076/**
@@ -1031,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
1031#endif 1119#endif
1032} 1120}
1033 1121
1122static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
1123{
1124 tick_nohz_restart_sched_tick(ts, now);
1125 tick_nohz_account_idle_ticks(ts);
1126}
1127
1128void tick_nohz_idle_restart_tick(void)
1129{
1130 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1131
1132 if (ts->tick_stopped)
1133 __tick_nohz_idle_restart_tick(ts, ktime_get());
1134}
1135
1034/** 1136/**
1035 * tick_nohz_idle_exit - restart the idle tick from the idle task 1137 * tick_nohz_idle_exit - restart the idle tick from the idle task
1036 * 1138 *
@@ -1041,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
1041void tick_nohz_idle_exit(void) 1143void tick_nohz_idle_exit(void)
1042{ 1144{
1043 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1145 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1146 bool idle_active, tick_stopped;
1044 ktime_t now; 1147 ktime_t now;
1045 1148
1046 local_irq_disable(); 1149 local_irq_disable();
1047 1150
1048 WARN_ON_ONCE(!ts->inidle); 1151 WARN_ON_ONCE(!ts->inidle);
1152 WARN_ON_ONCE(ts->timer_expires_base);
1049 1153
1050 ts->inidle = 0; 1154 ts->inidle = 0;
1155 idle_active = ts->idle_active;
1156 tick_stopped = ts->tick_stopped;
1051 1157
1052 if (ts->idle_active || ts->tick_stopped) 1158 if (idle_active || tick_stopped)
1053 now = ktime_get(); 1159 now = ktime_get();
1054 1160
1055 if (ts->idle_active) 1161 if (idle_active)
1056 tick_nohz_stop_idle(ts, now); 1162 tick_nohz_stop_idle(ts, now);
1057 1163
1058 if (ts->tick_stopped) { 1164 if (tick_stopped)
1059 tick_nohz_restart_sched_tick(ts, now); 1165 __tick_nohz_idle_restart_tick(ts, now);
1060 tick_nohz_account_idle_ticks(ts);
1061 }
1062 1166
1063 local_irq_enable(); 1167 local_irq_enable();
1064} 1168}
@@ -1074,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
1074 1178
1075 dev->next_event = KTIME_MAX; 1179 dev->next_event = KTIME_MAX;
1076 1180
1077 tick_sched_do_timer(now); 1181 tick_sched_do_timer(ts, now);
1078 tick_sched_handle(ts, regs); 1182 tick_sched_handle(ts, regs);
1079 1183
1080 /* No need to reprogram if we are running tickless */ 1184 /* No need to reprogram if we are running tickless */
@@ -1169,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1169 struct pt_regs *regs = get_irq_regs(); 1273 struct pt_regs *regs = get_irq_regs();
1170 ktime_t now = ktime_get(); 1274 ktime_t now = ktime_get();
1171 1275
1172 tick_sched_do_timer(now); 1276 tick_sched_do_timer(ts, now);
1173 1277
1174 /* 1278 /*
1175 * Do not call, when we are not in irq context and have 1279 * Do not call, when we are not in irq context and have
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,31 +38,37 @@ enum tick_nohz_mode {
38 * @idle_exittime: Time when the idle state was left 38 * @idle_exittime: Time when the idle state was left
39 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped 39 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
40 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding 40 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
41 * @sleep_length: Duration of the current idle sleep 41 * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
42 * @timer_expires_base: Base time clock monotonic for @timer_expires
42 * @do_timer_last: CPU was the last one doing do_timer before going idle 43 * @do_timer_last: CPU was the last one doing do_timer before going idle
44 * @got_idle_tick: Tick timer function has run with @inidle set
43 */ 45 */
44struct tick_sched { 46struct tick_sched {
45 struct hrtimer sched_timer; 47 struct hrtimer sched_timer;
46 unsigned long check_clocks; 48 unsigned long check_clocks;
47 enum tick_nohz_mode nohz_mode; 49 enum tick_nohz_mode nohz_mode;
50
51 unsigned int inidle : 1;
52 unsigned int tick_stopped : 1;
53 unsigned int idle_active : 1;
54 unsigned int do_timer_last : 1;
55 unsigned int got_idle_tick : 1;
56
48 ktime_t last_tick; 57 ktime_t last_tick;
49 ktime_t next_tick; 58 ktime_t next_tick;
50 int inidle;
51 int tick_stopped;
52 unsigned long idle_jiffies; 59 unsigned long idle_jiffies;
53 unsigned long idle_calls; 60 unsigned long idle_calls;
54 unsigned long idle_sleeps; 61 unsigned long idle_sleeps;
55 int idle_active;
56 ktime_t idle_entrytime; 62 ktime_t idle_entrytime;
57 ktime_t idle_waketime; 63 ktime_t idle_waketime;
58 ktime_t idle_exittime; 64 ktime_t idle_exittime;
59 ktime_t idle_sleeptime; 65 ktime_t idle_sleeptime;
60 ktime_t iowait_sleeptime; 66 ktime_t iowait_sleeptime;
61 ktime_t sleep_length;
62 unsigned long last_jiffies; 67 unsigned long last_jiffies;
68 u64 timer_expires;
69 u64 timer_expires_base;
63 u64 next_timer; 70 u64 next_timer;
64 ktime_t idle_expires; 71 ktime_t idle_expires;
65 int do_timer_last;
66 atomic_t tick_dep_mask; 72 atomic_t tick_dep_mask;
67}; 73};
68 74