author     Linus Torvalds <torvalds@linux-foundation.org>   2019-09-17 15:35:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-09-17 15:35:15 -0400
commit     7f2444d38f6bbfa12bc15e2533d8f9daa85ca02b (patch)
tree       6506ec79036890edfd9797b001391a350b5ac10f /kernel
parent     c5f12fdb8bd873aa3ffdb79512e6bdac92b257b0 (diff)
parent     77b4b5420422fc037d00b8f3f0e89b2262e4ae29 (diff)
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull core timer updates from Thomas Gleixner:
"Timers and timekeeping updates:
- A large overhaul of the posix CPU timer code which is a preparation
for moving the CPU timer expiry out into task work so it can be
properly accounted on the task/process.
An update to the bogus permission checks will come later during the
merge window as feedback was not complete before heading off for
travel.
- Switch the timerqueue code to use cached rbtrees and get rid of the
home-brewed caching of the leftmost node.
- Consolidate hrtimer_init() + hrtimer_init_sleeper() calls into a
single function.
- Implement the separation of hrtimers to be forced to expire in hard
interrupt context even when PREEMPT_RT is enabled and mark the
affected timers accordingly.
- Implement a mechanism for hrtimers and the timer wheel to protect
RT against priority inversion and live lock issues when a (hr)timer
which should be canceled is currently executing the callback.
Instead of infinitely spinning, the task which tries to cancel the
timer blocks on a per cpu base expiry lock which is held and
released by the (hr)timer expiry code.
- Enable the Hyper-V TSC page based sched_clock for Hyper-V guests
resulting in faster access to timekeeping functions.
- Updates to various clocksource/clockevent drivers and their device
tree bindings.
- The usual small improvements all over the place"
* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (101 commits)
posix-cpu-timers: Fix permission check regression
posix-cpu-timers: Always clear head pointer on dequeue
hrtimer: Add a missing bracket and hide `migration_base' on !SMP
posix-cpu-timers: Make expiry_active check actually work correctly
posix-timers: Unbreak CONFIG_POSIX_TIMERS=n build
tick: Mark sched_timer to expire in hard interrupt context
hrtimer: Add kernel doc annotation for HRTIMER_MODE_HARD
x86/hyperv: Hide pv_ops access for CONFIG_PARAVIRT=n
posix-cpu-timers: Utilize timerqueue for storage
posix-cpu-timers: Move state tracking to struct posix_cputimers
posix-cpu-timers: Deduplicate rlimit handling
posix-cpu-timers: Remove pointless comparisons
posix-cpu-timers: Get rid of 64bit divisions
posix-cpu-timers: Consolidate timer expiry further
posix-cpu-timers: Get rid of zero checks
rlimit: Rewrite non-sensical RLIMIT_CPU comment
posix-cpu-timers: Respect INFINITY for hard RTTIME limit
posix-cpu-timers: Switch thread group sampling to array
posix-cpu-timers: Restructure expiry array
posix-cpu-timers: Remove cputime_expires
...
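
The hrtimer_init() + hrtimer_init_sleeper() consolidation can be seen in the futex, nanosleep and schedule_hrtimeout hunks further down. The sketch below mirrors the call pattern those sites end up with after this merge; the helper name example_sleep_until() and the return-code choice are illustrative assumptions, not kernel API:

#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/errno.h>

/* Illustrative helper: sleep until an absolute CLOCK_MONOTONIC expiry. */
static int example_sleep_until(ktime_t expires)
{
	struct hrtimer_sleeper t;

	/* One call replaces hrtimer_init_on_stack() + hrtimer_init_sleeper(). */
	hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_set_expires(&t.timer, expires);

	set_current_state(TASK_INTERRUPTIBLE);
	/* The sleeper, not the call site, decides hard vs. soft expiry on RT. */
	hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS);

	if (likely(t.task))
		schedule();

	hrtimer_cancel(&t.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&t.timer);

	/* t.task is cleared by the wakeup callback once the timer has fired. */
	return !t.task ? 0 : -EINTR;
}

The design point is that __hrtimer_init_sleeper() picks hard interrupt expiry once, based on task_is_realtime(current), so call sites such as futex and nanosleep no longer carry RT-specific logic.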
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c                     8
-rw-r--r--  kernel/fork.c                           34
-rw-r--r--  kernel/futex.c                          12
-rw-r--r--  kernel/sched/core.c                      6
-rw-r--r--  kernel/sched/deadline.c                  8
-rw-r--r--  kernel/sched/rt.c                       13
-rw-r--r--  kernel/sys.c                            16
-rw-r--r--  kernel/time/alarmtimer.c                16
-rw-r--r--  kernel/time/hrtimer.c                  235
-rw-r--r--  kernel/time/itimer.c                    12
-rw-r--r--  kernel/time/posix-cpu-timers.c        1010
-rw-r--r--  kernel/time/posix-timers.c              61
-rw-r--r--  kernel/time/posix-timers.h               1
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c    13
-rw-r--r--  kernel/time/tick-sched.c                17
-rw-r--r--  kernel/time/timer.c                    105
-rw-r--r--  kernel/watchdog.c                        4
17 files changed, 921 insertions, 650 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1c414b8866b4..4f08b17d6426 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1103,7 +1103,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) | |||
1103 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); | 1103 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); |
1104 | 1104 | ||
1105 | raw_spin_lock_init(&cpuctx->hrtimer_lock); | 1105 | raw_spin_lock_init(&cpuctx->hrtimer_lock); |
1106 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); | 1106 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); |
1107 | timer->function = perf_mux_hrtimer_handler; | 1107 | timer->function = perf_mux_hrtimer_handler; |
1108 | } | 1108 | } |
1109 | 1109 | ||
@@ -1121,7 +1121,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) | |||
1121 | if (!cpuctx->hrtimer_active) { | 1121 | if (!cpuctx->hrtimer_active) { |
1122 | cpuctx->hrtimer_active = 1; | 1122 | cpuctx->hrtimer_active = 1; |
1123 | hrtimer_forward_now(timer, cpuctx->hrtimer_interval); | 1123 | hrtimer_forward_now(timer, cpuctx->hrtimer_interval); |
1124 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); | 1124 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); |
1125 | } | 1125 | } |
1126 | raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); | 1126 | raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); |
1127 | 1127 | ||
@@ -9574,7 +9574,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) | |||
9574 | period = max_t(u64, 10000, hwc->sample_period); | 9574 | period = max_t(u64, 10000, hwc->sample_period); |
9575 | } | 9575 | } |
9576 | hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), | 9576 | hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), |
9577 | HRTIMER_MODE_REL_PINNED); | 9577 | HRTIMER_MODE_REL_PINNED_HARD); |
9578 | } | 9578 | } |
9579 | 9579 | ||
9580 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 9580 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
@@ -9596,7 +9596,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) | |||
9596 | if (!is_sampling_event(event)) | 9596 | if (!is_sampling_event(event)) |
9597 | return; | 9597 | return; |
9598 | 9598 | ||
9599 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 9599 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
9600 | hwc->hrtimer.function = perf_swevent_hrtimer; | 9600 | hwc->hrtimer.function = perf_swevent_hrtimer; |
9601 | 9601 | ||
9602 | /* | 9602 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d1cd06edbc1..53e780748fe3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1519,28 +1519,17 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
1519 | } | 1519 | } |
1520 | } | 1520 | } |
1521 | 1521 | ||
1522 | #ifdef CONFIG_POSIX_TIMERS | ||
1523 | /* | 1522 | /* |
1524 | * Initialize POSIX timer handling for a thread group. | 1523 | * Initialize POSIX timer handling for a thread group. |
1525 | */ | 1524 | */ |
1526 | static void posix_cpu_timers_init_group(struct signal_struct *sig) | 1525 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
1527 | { | 1526 | { |
1527 | struct posix_cputimers *pct = &sig->posix_cputimers; | ||
1528 | unsigned long cpu_limit; | 1528 | unsigned long cpu_limit; |
1529 | 1529 | ||
1530 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | 1530 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
1531 | if (cpu_limit != RLIM_INFINITY) { | 1531 | posix_cputimers_group_init(pct, cpu_limit); |
1532 | sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; | ||
1533 | sig->cputimer.running = true; | ||
1534 | } | ||
1535 | |||
1536 | /* The timer lists. */ | ||
1537 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | ||
1538 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | ||
1539 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | ||
1540 | } | 1532 | } |
1541 | #else | ||
1542 | static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } | ||
1543 | #endif | ||
1544 | 1533 | ||
1545 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | 1534 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) |
1546 | { | 1535 | { |
@@ -1642,23 +1631,6 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1642 | #endif | 1631 | #endif |
1643 | } | 1632 | } |
1644 | 1633 | ||
1645 | #ifdef CONFIG_POSIX_TIMERS | ||
1646 | /* | ||
1647 | * Initialize POSIX timer handling for a single task. | ||
1648 | */ | ||
1649 | static void posix_cpu_timers_init(struct task_struct *tsk) | ||
1650 | { | ||
1651 | tsk->cputime_expires.prof_exp = 0; | ||
1652 | tsk->cputime_expires.virt_exp = 0; | ||
1653 | tsk->cputime_expires.sched_exp = 0; | ||
1654 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); | ||
1655 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); | ||
1656 | INIT_LIST_HEAD(&tsk->cpu_timers[2]); | ||
1657 | } | ||
1658 | #else | ||
1659 | static inline void posix_cpu_timers_init(struct task_struct *tsk) { } | ||
1660 | #endif | ||
1661 | |||
1662 | static inline void init_task_pid_links(struct task_struct *task) | 1634 | static inline void init_task_pid_links(struct task_struct *task) |
1663 | { | 1635 | { |
1664 | enum pid_type type; | 1636 | enum pid_type type; |
@@ -1945,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
1945 | task_io_accounting_init(&p->ioac); | 1917 | task_io_accounting_init(&p->ioac); |
1946 | acct_clear_integrals(p); | 1918 | acct_clear_integrals(p); |
1947 | 1919 | ||
1948 | posix_cpu_timers_init(p); | 1920 | posix_cputimers_init(&p->posix_cputimers); |
1949 | 1921 | ||
1950 | p->io_context = NULL; | 1922 | p->io_context = NULL; |
1951 | audit_set_context(p, NULL); | 1923 | audit_set_context(p, NULL); |
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..bd18f60e4c6c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -487,11 +487,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, | |||
487 | if (!time) | 487 | if (!time) |
488 | return NULL; | 488 | return NULL; |
489 | 489 | ||
490 | hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? | 490 | hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? |
491 | CLOCK_REALTIME : CLOCK_MONOTONIC, | 491 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
492 | HRTIMER_MODE_ABS); | 492 | HRTIMER_MODE_ABS); |
493 | hrtimer_init_sleeper(timeout, current); | ||
494 | |||
495 | /* | 493 | /* |
496 | * If range_ns is 0, calling hrtimer_set_expires_range_ns() is | 494 | * If range_ns is 0, calling hrtimer_set_expires_range_ns() is |
497 | * effectively the same as calling hrtimer_set_expires(). | 495 | * effectively the same as calling hrtimer_set_expires(). |
@@ -2613,7 +2611,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
2613 | 2611 | ||
2614 | /* Arm the timer */ | 2612 | /* Arm the timer */ |
2615 | if (timeout) | 2613 | if (timeout) |
2616 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | 2614 | hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); |
2617 | 2615 | ||
2618 | /* | 2616 | /* |
2619 | * If we have been removed from the hash list, then another task | 2617 | * If we have been removed from the hash list, then another task |
@@ -2899,7 +2897,7 @@ retry_private: | |||
2899 | } | 2897 | } |
2900 | 2898 | ||
2901 | if (unlikely(to)) | 2899 | if (unlikely(to)) |
2902 | hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); | 2900 | hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); |
2903 | 2901 | ||
2904 | ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); | 2902 | ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); |
2905 | 2903 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 06961b997ed6..5e8387bdd09c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq) | |||
255 | { | 255 | { |
256 | struct hrtimer *timer = &rq->hrtick_timer; | 256 | struct hrtimer *timer = &rq->hrtick_timer; |
257 | 257 | ||
258 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); | 258 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); |
259 | } | 259 | } |
260 | 260 | ||
261 | /* | 261 | /* |
@@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
314 | */ | 314 | */ |
315 | delay = max_t(u64, delay, 10000LL); | 315 | delay = max_t(u64, delay, 10000LL); |
316 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), | 316 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), |
317 | HRTIMER_MODE_REL_PINNED); | 317 | HRTIMER_MODE_REL_PINNED_HARD); |
318 | } | 318 | } |
319 | #endif /* CONFIG_SMP */ | 319 | #endif /* CONFIG_SMP */ |
320 | 320 | ||
@@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq) | |||
328 | rq->hrtick_csd.info = rq; | 328 | rq->hrtick_csd.info = rq; |
329 | #endif | 329 | #endif |
330 | 330 | ||
331 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 331 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
332 | rq->hrtick_timer.function = hrtick; | 332 | rq->hrtick_timer.function = hrtick; |
333 | } | 333 | } |
334 | #else /* CONFIG_SCHED_HRTICK */ | 334 | #else /* CONFIG_SCHED_HRTICK */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 39dc9f74f289..2dc48720f189 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p) | |||
287 | 287 | ||
288 | dl_se->dl_non_contending = 1; | 288 | dl_se->dl_non_contending = 1; |
289 | get_task_struct(p); | 289 | get_task_struct(p); |
290 | hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); | 290 | hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); |
291 | } | 291 | } |
292 | 292 | ||
293 | static void task_contending(struct sched_dl_entity *dl_se, int flags) | 293 | static void task_contending(struct sched_dl_entity *dl_se, int flags) |
@@ -956,7 +956,7 @@ static int start_dl_timer(struct task_struct *p) | |||
956 | */ | 956 | */ |
957 | if (!hrtimer_is_queued(timer)) { | 957 | if (!hrtimer_is_queued(timer)) { |
958 | get_task_struct(p); | 958 | get_task_struct(p); |
959 | hrtimer_start(timer, act, HRTIMER_MODE_ABS); | 959 | hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); |
960 | } | 960 | } |
961 | 961 | ||
962 | return 1; | 962 | return 1; |
@@ -1086,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
1086 | { | 1086 | { |
1087 | struct hrtimer *timer = &dl_se->dl_timer; | 1087 | struct hrtimer *timer = &dl_se->dl_timer; |
1088 | 1088 | ||
1089 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1089 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
1090 | timer->function = dl_task_timer; | 1090 | timer->function = dl_task_timer; |
1091 | } | 1091 | } |
1092 | 1092 | ||
@@ -1325,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) | |||
1325 | { | 1325 | { |
1326 | struct hrtimer *timer = &dl_se->inactive_timer; | 1326 | struct hrtimer *timer = &dl_se->inactive_timer; |
1327 | 1327 | ||
1328 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1328 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
1329 | timer->function = inactive_task_timer; | 1329 | timer->function = inactive_task_timer; |
1330 | } | 1330 | } |
1331 | 1331 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 858c4cc6f99b..ebaa4e619684 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
45 | 45 | ||
46 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | 46 | raw_spin_lock_init(&rt_b->rt_runtime_lock); |
47 | 47 | ||
48 | hrtimer_init(&rt_b->rt_period_timer, | 48 | hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, |
49 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 49 | HRTIMER_MODE_REL_HARD); |
50 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 50 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
51 | } | 51 | } |
52 | 52 | ||
@@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
67 | * to update the period. | 67 | * to update the period. |
68 | */ | 68 | */ |
69 | hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); | 69 | hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); |
70 | hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); | 70 | hrtimer_start_expires(&rt_b->rt_period_timer, |
71 | HRTIMER_MODE_ABS_PINNED_HARD); | ||
71 | } | 72 | } |
72 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 73 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
73 | } | 74 | } |
@@ -2289,8 +2290,10 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
2289 | } | 2290 | } |
2290 | 2291 | ||
2291 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | 2292 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); |
2292 | if (p->rt.timeout > next) | 2293 | if (p->rt.timeout > next) { |
2293 | p->cputime_expires.sched_exp = p->se.sum_exec_runtime; | 2294 | posix_cputimers_rt_watchdog(&p->posix_cputimers, |
2295 | p->se.sum_exec_runtime); | ||
2296 | } | ||
2294 | } | 2297 | } |
2295 | } | 2298 | } |
2296 | #else | 2299 | #else |
diff --git a/kernel/sys.c b/kernel/sys.c
index d605fe5e58a5..a611d1d58c7d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1557,15 +1557,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, | |||
1557 | retval = -EPERM; | 1557 | retval = -EPERM; |
1558 | if (!retval) | 1558 | if (!retval) |
1559 | retval = security_task_setrlimit(tsk, resource, new_rlim); | 1559 | retval = security_task_setrlimit(tsk, resource, new_rlim); |
1560 | if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { | ||
1561 | /* | ||
1562 | * The caller is asking for an immediate RLIMIT_CPU | ||
1563 | * expiry. But we use the zero value to mean "it was | ||
1564 | * never set". So let's cheat and make it one second | ||
1565 | * instead | ||
1566 | */ | ||
1567 | new_rlim->rlim_cur = 1; | ||
1568 | } | ||
1569 | } | 1560 | } |
1570 | if (!retval) { | 1561 | if (!retval) { |
1571 | if (old_rlim) | 1562 | if (old_rlim) |
@@ -1576,10 +1567,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, | |||
1576 | task_unlock(tsk->group_leader); | 1567 | task_unlock(tsk->group_leader); |
1577 | 1568 | ||
1578 | /* | 1569 | /* |
1579 | * RLIMIT_CPU handling. Note that the kernel fails to return an error | 1570 | * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not |
1580 | * code if it rejected the user's attempt to set RLIMIT_CPU. This is a | 1571 | * infinite. In case of RLIM_INFINITY the posix CPU timer code |
1581 | * very long-standing error, and fixing it now risks breakage of | 1572 | * ignores the rlimit. |
1582 | * applications, so we live with it | ||
1583 | */ | 1573 | */ |
1584 | if (!retval && new_rlim && resource == RLIMIT_CPU && | 1574 | if (!retval && new_rlim && resource == RLIMIT_CPU && |
1585 | new_rlim->rlim_cur != RLIM_INFINITY && | 1575 | new_rlim->rlim_cur != RLIM_INFINITY && |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index b7d75a9e8ccf..271ce6c12907 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm) | |||
432 | int ret = alarm_try_to_cancel(alarm); | 432 | int ret = alarm_try_to_cancel(alarm); |
433 | if (ret >= 0) | 433 | if (ret >= 0) |
434 | return ret; | 434 | return ret; |
435 | cpu_relax(); | 435 | hrtimer_cancel_wait_running(&alarm->timer); |
436 | } | 436 | } |
437 | } | 437 | } |
438 | EXPORT_SYMBOL_GPL(alarm_cancel); | 438 | EXPORT_SYMBOL_GPL(alarm_cancel); |
@@ -606,6 +606,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) | |||
606 | } | 606 | } |
607 | 607 | ||
608 | /** | 608 | /** |
609 | * alarm_timer_wait_running - Posix timer callback to wait for a timer | ||
610 | * @timr: Pointer to the posixtimer data struct | ||
611 | * | ||
612 | * Called from the core code when timer cancel detected that the callback | ||
613 | * is running. @timr is unlocked and rcu read lock is held to prevent it | ||
614 | * from being freed. | ||
615 | */ | ||
616 | static void alarm_timer_wait_running(struct k_itimer *timr) | ||
617 | { | ||
618 | hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); | ||
619 | } | ||
620 | |||
621 | /** | ||
609 | * alarm_timer_arm - Posix timer callback to arm a timer | 622 | * alarm_timer_arm - Posix timer callback to arm a timer |
610 | * @timr: Pointer to the posixtimer data struct | 623 | * @timr: Pointer to the posixtimer data struct |
611 | * @expires: The new expiry time | 624 | * @expires: The new expiry time |
@@ -834,6 +847,7 @@ const struct k_clock alarm_clock = { | |||
834 | .timer_forward = alarm_timer_forward, | 847 | .timer_forward = alarm_timer_forward, |
835 | .timer_remaining = alarm_timer_remaining, | 848 | .timer_remaining = alarm_timer_remaining, |
836 | .timer_try_to_cancel = alarm_timer_try_to_cancel, | 849 | .timer_try_to_cancel = alarm_timer_try_to_cancel, |
850 | .timer_wait_running = alarm_timer_wait_running, | ||
837 | .nsleep = alarm_timer_nsleep, | 851 | .nsleep = alarm_timer_nsleep, |
838 | }; | 852 | }; |
839 | #endif /* CONFIG_POSIX_TIMERS */ | 853 | #endif /* CONFIG_POSIX_TIMERS */ |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5ee77f1a8a92..0d4dc241c0fb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = { | |||
140 | 140 | ||
141 | #define migration_base migration_cpu_base.clock_base[0] | 141 | #define migration_base migration_cpu_base.clock_base[0] |
142 | 142 | ||
143 | static inline bool is_migration_base(struct hrtimer_clock_base *base) | ||
144 | { | ||
145 | return base == &migration_base; | ||
146 | } | ||
147 | |||
143 | /* | 148 | /* |
144 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock | 149 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock |
145 | * means that all timers which are tied to this base via timer->base are | 150 | * means that all timers which are tied to this base via timer->base are |
@@ -264,6 +269,11 @@ again: | |||
264 | 269 | ||
265 | #else /* CONFIG_SMP */ | 270 | #else /* CONFIG_SMP */ |
266 | 271 | ||
272 | static inline bool is_migration_base(struct hrtimer_clock_base *base) | ||
273 | { | ||
274 | return false; | ||
275 | } | ||
276 | |||
267 | static inline struct hrtimer_clock_base * | 277 | static inline struct hrtimer_clock_base * |
268 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | 278 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) |
269 | { | 279 | { |
@@ -427,6 +437,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, | |||
427 | } | 437 | } |
428 | EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); | 438 | EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); |
429 | 439 | ||
440 | static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, | ||
441 | clockid_t clock_id, enum hrtimer_mode mode); | ||
442 | |||
443 | void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, | ||
444 | clockid_t clock_id, enum hrtimer_mode mode) | ||
445 | { | ||
446 | debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); | ||
447 | __hrtimer_init_sleeper(sl, clock_id, mode); | ||
448 | } | ||
449 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); | ||
450 | |||
430 | void destroy_hrtimer_on_stack(struct hrtimer *timer) | 451 | void destroy_hrtimer_on_stack(struct hrtimer *timer) |
431 | { | 452 | { |
432 | debug_object_free(timer, &hrtimer_debug_descr); | 453 | debug_object_free(timer, &hrtimer_debug_descr); |
@@ -1096,9 +1117,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1096 | 1117 | ||
1097 | /* | 1118 | /* |
1098 | * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft | 1119 | * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft |
1099 | * match. | 1120 | * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard |
1121 | * expiry mode because unmarked timers are moved to softirq expiry. | ||
1100 | */ | 1122 | */ |
1101 | WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); | 1123 | if (!IS_ENABLED(CONFIG_PREEMPT_RT)) |
1124 | WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); | ||
1125 | else | ||
1126 | WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); | ||
1102 | 1127 | ||
1103 | base = lock_hrtimer_base(timer, &flags); | 1128 | base = lock_hrtimer_base(timer, &flags); |
1104 | 1129 | ||
@@ -1147,6 +1172,93 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) | |||
1147 | } | 1172 | } |
1148 | EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | 1173 | EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); |
1149 | 1174 | ||
1175 | #ifdef CONFIG_PREEMPT_RT | ||
1176 | static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) | ||
1177 | { | ||
1178 | spin_lock_init(&base->softirq_expiry_lock); | ||
1179 | } | ||
1180 | |||
1181 | static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) | ||
1182 | { | ||
1183 | spin_lock(&base->softirq_expiry_lock); | ||
1184 | } | ||
1185 | |||
1186 | static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) | ||
1187 | { | ||
1188 | spin_unlock(&base->softirq_expiry_lock); | ||
1189 | } | ||
1190 | |||
1191 | /* | ||
1192 | * The counterpart to hrtimer_cancel_wait_running(). | ||
1193 | * | ||
1194 | * If there is a waiter for cpu_base->expiry_lock, then it was waiting for | ||
1195 | * the timer callback to finish. Drop expiry_lock and reacquire it. That | ||
1196 | * allows the waiter to acquire the lock and make progress. | ||
1197 | */ | ||
1198 | static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, | ||
1199 | unsigned long flags) | ||
1200 | { | ||
1201 | if (atomic_read(&cpu_base->timer_waiters)) { | ||
1202 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | ||
1203 | spin_unlock(&cpu_base->softirq_expiry_lock); | ||
1204 | spin_lock(&cpu_base->softirq_expiry_lock); | ||
1205 | raw_spin_lock_irq(&cpu_base->lock); | ||
1206 | } | ||
1207 | } | ||
1208 | |||
1209 | /* | ||
1210 | * This function is called on PREEMPT_RT kernels when the fast path | ||
1211 | * deletion of a timer failed because the timer callback function was | ||
1212 | * running. | ||
1213 | * | ||
1214 | * This prevents priority inversion: if the soft irq thread is preempted | ||
1215 | * in the middle of a timer callback, then calling del_timer_sync() can | ||
1216 | * lead to two issues: | ||
1217 | * | ||
1218 | * - If the caller is on a remote CPU then it has to spin wait for the timer | ||
1219 | * handler to complete. This can result in unbound priority inversion. | ||
1220 | * | ||
1221 | * - If the caller originates from the task which preempted the timer | ||
1222 | * handler on the same CPU, then spin waiting for the timer handler to | ||
1223 | * complete is never going to end. | ||
1224 | */ | ||
1225 | void hrtimer_cancel_wait_running(const struct hrtimer *timer) | ||
1226 | { | ||
1227 | /* Lockless read. Prevent the compiler from reloading it below */ | ||
1228 | struct hrtimer_clock_base *base = READ_ONCE(timer->base); | ||
1229 | |||
1230 | /* | ||
1231 | * Just relax if the timer expires in hard interrupt context or if | ||
1232 | * it is currently on the migration base. | ||
1233 | */ | ||
1234 | if (!timer->is_soft || is_migration_base(base)) { | ||
1235 | cpu_relax(); | ||
1236 | return; | ||
1237 | } | ||
1238 | |||
1239 | /* | ||
1240 | * Mark the base as contended and grab the expiry lock, which is | ||
1241 | * held by the softirq across the timer callback. Drop the lock | ||
1242 | * immediately so the softirq can expire the next timer. In theory | ||
1243 | * the timer could already be running again, but that's more than | ||
1244 | * unlikely and just causes another wait loop. | ||
1245 | */ | ||
1246 | atomic_inc(&base->cpu_base->timer_waiters); | ||
1247 | spin_lock_bh(&base->cpu_base->softirq_expiry_lock); | ||
1248 | atomic_dec(&base->cpu_base->timer_waiters); | ||
1249 | spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); | ||
1250 | } | ||
1251 | #else | ||
1252 | static inline void | ||
1253 | hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } | ||
1254 | static inline void | ||
1255 | hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } | ||
1256 | static inline void | ||
1257 | hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } | ||
1258 | static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, | ||
1259 | unsigned long flags) { } | ||
1260 | #endif | ||
1261 | |||
1150 | /** | 1262 | /** |
1151 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | 1263 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. |
1152 | * @timer: the timer to be cancelled | 1264 | * @timer: the timer to be cancelled |
@@ -1157,13 +1269,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | |||
1157 | */ | 1269 | */ |
1158 | int hrtimer_cancel(struct hrtimer *timer) | 1270 | int hrtimer_cancel(struct hrtimer *timer) |
1159 | { | 1271 | { |
1160 | for (;;) { | 1272 | int ret; |
1161 | int ret = hrtimer_try_to_cancel(timer); | ||
1162 | 1273 | ||
1163 | if (ret >= 0) | 1274 | do { |
1164 | return ret; | 1275 | ret = hrtimer_try_to_cancel(timer); |
1165 | cpu_relax(); | 1276 | |
1166 | } | 1277 | if (ret < 0) |
1278 | hrtimer_cancel_wait_running(timer); | ||
1279 | } while (ret < 0); | ||
1280 | return ret; | ||
1167 | } | 1281 | } |
1168 | EXPORT_SYMBOL_GPL(hrtimer_cancel); | 1282 | EXPORT_SYMBOL_GPL(hrtimer_cancel); |
1169 | 1283 | ||
@@ -1260,8 +1374,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1260 | enum hrtimer_mode mode) | 1374 | enum hrtimer_mode mode) |
1261 | { | 1375 | { |
1262 | bool softtimer = !!(mode & HRTIMER_MODE_SOFT); | 1376 | bool softtimer = !!(mode & HRTIMER_MODE_SOFT); |
1263 | int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; | ||
1264 | struct hrtimer_cpu_base *cpu_base; | 1377 | struct hrtimer_cpu_base *cpu_base; |
1378 | int base; | ||
1379 | |||
1380 | /* | ||
1381 | * On PREEMPT_RT enabled kernels hrtimers which are not explicitly | ||
1382 | * marked for hard interrupt expiry mode are moved into soft | ||
1383 | * interrupt context for latency reasons and because the callbacks | ||
1384 | * can invoke functions which might sleep on RT, e.g. spin_lock(). | ||
1385 | */ | ||
1386 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) | ||
1387 | softtimer = true; | ||
1265 | 1388 | ||
1266 | memset(timer, 0, sizeof(struct hrtimer)); | 1389 | memset(timer, 0, sizeof(struct hrtimer)); |
1267 | 1390 | ||
@@ -1275,8 +1398,10 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1275 | if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) | 1398 | if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) |
1276 | clock_id = CLOCK_MONOTONIC; | 1399 | clock_id = CLOCK_MONOTONIC; |
1277 | 1400 | ||
1401 | base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; | ||
1278 | base += hrtimer_clockid_to_base(clock_id); | 1402 | base += hrtimer_clockid_to_base(clock_id); |
1279 | timer->is_soft = softtimer; | 1403 | timer->is_soft = softtimer; |
1404 | timer->is_hard = !softtimer; | ||
1280 | timer->base = &cpu_base->clock_base[base]; | 1405 | timer->base = &cpu_base->clock_base[base]; |
1281 | timerqueue_init(&timer->node); | 1406 | timerqueue_init(&timer->node); |
1282 | } | 1407 | } |
@@ -1449,6 +1574,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, | |||
1449 | break; | 1574 | break; |
1450 | 1575 | ||
1451 | __run_hrtimer(cpu_base, base, timer, &basenow, flags); | 1576 | __run_hrtimer(cpu_base, base, timer, &basenow, flags); |
1577 | if (active_mask == HRTIMER_ACTIVE_SOFT) | ||
1578 | hrtimer_sync_wait_running(cpu_base, flags); | ||
1452 | } | 1579 | } |
1453 | } | 1580 | } |
1454 | } | 1581 | } |
@@ -1459,6 +1586,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) | |||
1459 | unsigned long flags; | 1586 | unsigned long flags; |
1460 | ktime_t now; | 1587 | ktime_t now; |
1461 | 1588 | ||
1589 | hrtimer_cpu_base_lock_expiry(cpu_base); | ||
1462 | raw_spin_lock_irqsave(&cpu_base->lock, flags); | 1590 | raw_spin_lock_irqsave(&cpu_base->lock, flags); |
1463 | 1591 | ||
1464 | now = hrtimer_update_base(cpu_base); | 1592 | now = hrtimer_update_base(cpu_base); |
@@ -1468,6 +1596,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) | |||
1468 | hrtimer_update_softirq_timer(cpu_base, true); | 1596 | hrtimer_update_softirq_timer(cpu_base, true); |
1469 | 1597 | ||
1470 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | 1598 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); |
1599 | hrtimer_cpu_base_unlock_expiry(cpu_base); | ||
1471 | } | 1600 | } |
1472 | 1601 | ||
1473 | #ifdef CONFIG_HIGH_RES_TIMERS | 1602 | #ifdef CONFIG_HIGH_RES_TIMERS |
@@ -1639,10 +1768,75 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) | |||
1639 | return HRTIMER_NORESTART; | 1768 | return HRTIMER_NORESTART; |
1640 | } | 1769 | } |
1641 | 1770 | ||
1642 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | 1771 | /** |
1772 | * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer | ||
1773 | * @sl: sleeper to be started | ||
1774 | * @mode: timer mode abs/rel | ||
1775 | * | ||
1776 | * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers | ||
1777 | * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) | ||
1778 | */ | ||
1779 | void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, | ||
1780 | enum hrtimer_mode mode) | ||
1781 | { | ||
1782 | /* | ||
1783 | * Make the enqueue delivery mode check work on RT. If the sleeper | ||
1784 | * was initialized for hard interrupt delivery, force the mode bit. | ||
1785 | * This is a special case for hrtimer_sleepers because | ||
1786 | * hrtimer_init_sleeper() determines the delivery mode on RT so the | ||
1787 | * fiddling with this decision is avoided at the call sites. | ||
1788 | */ | ||
1789 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) | ||
1790 | mode |= HRTIMER_MODE_HARD; | ||
1791 | |||
1792 | hrtimer_start_expires(&sl->timer, mode); | ||
1793 | } | ||
1794 | EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); | ||
1795 | |||
1796 | static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, | ||
1797 | clockid_t clock_id, enum hrtimer_mode mode) | ||
1643 | { | 1798 | { |
1799 | /* | ||
1800 | * On PREEMPT_RT enabled kernels hrtimers which are not explicitly | ||
1801 | * marked for hard interrupt expiry mode are moved into soft | ||
1802 | * interrupt context either for latency reasons or because the | ||
1803 | * hrtimer callback takes regular spinlocks or invokes other | ||
1804 | * functions which are not suitable for hard interrupt context on | ||
1805 | * PREEMPT_RT. | ||
1806 | * | ||
1807 | * The hrtimer_sleeper callback is RT compatible in hard interrupt | ||
1808 | * context, but there is a latency concern: Untrusted userspace can | ||
1809 | * spawn many threads which arm timers for the same expiry time on | ||
1810 | * the same CPU. That causes a latency spike due to the wakeup of | ||
1811 | * a gazillion threads. | ||
1812 | * | ||
1813 | * OTOH, privileged real-time user space applications rely on the | ||
1814 | * low latency of hard interrupt wakeups. If the current task is in | ||
1815 | * a real-time scheduling class, mark the mode for hard interrupt | ||
1816 | * expiry. | ||
1817 | */ | ||
1818 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) { | ||
1819 | if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) | ||
1820 | mode |= HRTIMER_MODE_HARD; | ||
1821 | } | ||
1822 | |||
1823 | __hrtimer_init(&sl->timer, clock_id, mode); | ||
1644 | sl->timer.function = hrtimer_wakeup; | 1824 | sl->timer.function = hrtimer_wakeup; |
1645 | sl->task = task; | 1825 | sl->task = current; |
1826 | } | ||
1827 | |||
1828 | /** | ||
1829 | * hrtimer_init_sleeper - initialize sleeper to the given clock | ||
1830 | * @sl: sleeper to be initialized | ||
1831 | * @clock_id: the clock to be used | ||
1832 | * @mode: timer mode abs/rel | ||
1833 | */ | ||
1834 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, | ||
1835 | enum hrtimer_mode mode) | ||
1836 | { | ||
1837 | debug_init(&sl->timer, clock_id, mode); | ||
1838 | __hrtimer_init_sleeper(sl, clock_id, mode); | ||
1839 | |||
1646 | } | 1840 | } |
1647 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); | 1841 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); |
1648 | 1842 | ||
@@ -1669,11 +1863,9 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1669 | { | 1863 | { |
1670 | struct restart_block *restart; | 1864 | struct restart_block *restart; |
1671 | 1865 | ||
1672 | hrtimer_init_sleeper(t, current); | ||
1673 | |||
1674 | do { | 1866 | do { |
1675 | set_current_state(TASK_INTERRUPTIBLE); | 1867 | set_current_state(TASK_INTERRUPTIBLE); |
1676 | hrtimer_start_expires(&t->timer, mode); | 1868 | hrtimer_sleeper_start_expires(t, mode); |
1677 | 1869 | ||
1678 | if (likely(t->task)) | 1870 | if (likely(t->task)) |
1679 | freezable_schedule(); | 1871 | freezable_schedule(); |
@@ -1707,10 +1899,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
1707 | struct hrtimer_sleeper t; | 1899 | struct hrtimer_sleeper t; |
1708 | int ret; | 1900 | int ret; |
1709 | 1901 | ||
1710 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, | 1902 | hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, |
1711 | HRTIMER_MODE_ABS); | 1903 | HRTIMER_MODE_ABS); |
1712 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | 1904 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); |
1713 | |||
1714 | ret = do_nanosleep(&t, HRTIMER_MODE_ABS); | 1905 | ret = do_nanosleep(&t, HRTIMER_MODE_ABS); |
1715 | destroy_hrtimer_on_stack(&t.timer); | 1906 | destroy_hrtimer_on_stack(&t.timer); |
1716 | return ret; | 1907 | return ret; |
@@ -1728,7 +1919,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, | |||
1728 | if (dl_task(current) || rt_task(current)) | 1919 | if (dl_task(current) || rt_task(current)) |
1729 | slack = 0; | 1920 | slack = 0; |
1730 | 1921 | ||
1731 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1922 | hrtimer_init_sleeper_on_stack(&t, clockid, mode); |
1732 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); | 1923 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); |
1733 | ret = do_nanosleep(&t, mode); | 1924 | ret = do_nanosleep(&t, mode); |
1734 | if (ret != -ERESTART_RESTARTBLOCK) | 1925 | if (ret != -ERESTART_RESTARTBLOCK) |
@@ -1809,6 +2000,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) | |||
1809 | cpu_base->softirq_next_timer = NULL; | 2000 | cpu_base->softirq_next_timer = NULL; |
1810 | cpu_base->expires_next = KTIME_MAX; | 2001 | cpu_base->expires_next = KTIME_MAX; |
1811 | cpu_base->softirq_expires_next = KTIME_MAX; | 2002 | cpu_base->softirq_expires_next = KTIME_MAX; |
2003 | hrtimer_cpu_base_init_expiry_lock(cpu_base); | ||
1812 | return 0; | 2004 | return 0; |
1813 | } | 2005 | } |
1814 | 2006 | ||
@@ -1927,12 +2119,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, | |||
1927 | return -EINTR; | 2119 | return -EINTR; |
1928 | } | 2120 | } |
1929 | 2121 | ||
1930 | hrtimer_init_on_stack(&t.timer, clock_id, mode); | 2122 | hrtimer_init_sleeper_on_stack(&t, clock_id, mode); |
1931 | hrtimer_set_expires_range_ns(&t.timer, *expires, delta); | 2123 | hrtimer_set_expires_range_ns(&t.timer, *expires, delta); |
1932 | 2124 | hrtimer_sleeper_start_expires(&t, mode); | |
1933 | hrtimer_init_sleeper(&t, current); | ||
1934 | |||
1935 | hrtimer_start_expires(&t.timer, mode); | ||
1936 | 2125 | ||
1937 | if (likely(t.task)) | 2126 | if (likely(t.task)) |
1938 | schedule(); | 2127 | schedule(); |
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 02068b2d5862..77f1e5635cc1 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
55 | val = it->expires; | 55 | val = it->expires; |
56 | interval = it->incr; | 56 | interval = it->incr; |
57 | if (val) { | 57 | if (val) { |
58 | struct task_cputime cputime; | 58 | u64 t, samples[CPUCLOCK_MAX]; |
59 | u64 t; | ||
60 | 59 | ||
61 | thread_group_cputimer(tsk, &cputime); | 60 | thread_group_sample_cputime(tsk, samples); |
62 | if (clock_id == CPUCLOCK_PROF) | 61 | t = samples[clock_id]; |
63 | t = cputime.utime + cputime.stime; | ||
64 | else | ||
65 | /* CPUCLOCK_VIRT */ | ||
66 | t = cputime.utime; | ||
67 | 62 | ||
68 | if (val < t) | 63 | if (val < t) |
69 | /* about to fire */ | 64 | /* about to fire */ |
@@ -213,6 +208,7 @@ again: | |||
213 | /* We are sharing ->siglock with it_real_fn() */ | 208 | /* We are sharing ->siglock with it_real_fn() */ |
214 | if (hrtimer_try_to_cancel(timer) < 0) { | 209 | if (hrtimer_try_to_cancel(timer) < 0) { |
215 | spin_unlock_irq(&tsk->sighand->siglock); | 210 | spin_unlock_irq(&tsk->sighand->siglock); |
211 | hrtimer_cancel_wait_running(timer); | ||
216 | goto again; | 212 | goto again; |
217 | } | 213 | } |
218 | expires = timeval_to_ktime(value->it_value); | 214 | expires = timeval_to_ktime(value->it_value); |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0a426f4e3125..92a431981b1c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -20,11 +20,20 @@ | |||
20 | 20 | ||
21 | static void posix_cpu_timer_rearm(struct k_itimer *timer); | 21 | static void posix_cpu_timer_rearm(struct k_itimer *timer); |
22 | 22 | ||
23 | void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) | ||
24 | { | ||
25 | posix_cputimers_init(pct); | ||
26 | if (cpu_limit != RLIM_INFINITY) { | ||
27 | pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; | ||
28 | pct->timers_active = true; | ||
29 | } | ||
30 | } | ||
31 | |||
23 | /* | 32 | /* |
24 | * Called after updating RLIMIT_CPU to run cpu timer and update | 33 | * Called after updating RLIMIT_CPU to run cpu timer and update |
25 | * tsk->signal->cputime_expires expiration cache if necessary. Needs | 34 | * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if |
26 | * siglock protection since other code may update expiration cache as | 35 | * necessary. Needs siglock protection since other code may update the |
27 | * well. | 36 | * expiration cache as well. |
28 | */ | 37 | */ |
29 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) | 38 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) |
30 | { | 39 | { |
@@ -35,46 +44,97 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) | |||
35 | spin_unlock_irq(&task->sighand->siglock); | 44 | spin_unlock_irq(&task->sighand->siglock); |
36 | } | 45 | } |
37 | 46 | ||
38 | static int check_clock(const clockid_t which_clock) | 47 | /* |
48 | * Functions for validating access to tasks. | ||
49 | */ | ||
50 | static struct task_struct *lookup_task(const pid_t pid, bool thread, | ||
51 | bool gettime) | ||
39 | { | 52 | { |
40 | int error = 0; | ||
41 | struct task_struct *p; | 53 | struct task_struct *p; |
42 | const pid_t pid = CPUCLOCK_PID(which_clock); | ||
43 | |||
44 | if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) | ||
45 | return -EINVAL; | ||
46 | 54 | ||
47 | if (pid == 0) | 55 | /* |
48 | return 0; | 56 | * If the encoded PID is 0, then the timer is targeted at current |
57 | * or the process to which current belongs. | ||
58 | */ | ||
59 | if (!pid) | ||
60 | return thread ? current : current->group_leader; | ||
49 | 61 | ||
50 | rcu_read_lock(); | ||
51 | p = find_task_by_vpid(pid); | 62 | p = find_task_by_vpid(pid); |
52 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? | 63 | if (!p) |
53 | same_thread_group(p, current) : has_group_leader_pid(p))) { | 64 | return p; |
54 | error = -EINVAL; | 65 | |
66 | if (thread) | ||
67 | return same_thread_group(p, current) ? p : NULL; | ||
68 | |||
69 | if (gettime) { | ||
70 | /* | ||
71 | * For clock_gettime(PROCESS) the task does not need to be | ||
72 | * the actual group leader. tsk->sighand gives | ||
73 | * access to the group's clock. | ||
74 | * | ||
75 | * Timers need the group leader because they take a | ||
76 | * reference on it and store the task pointer until the | ||
77 | * timer is destroyed. | ||
78 | */ | ||
79 | return (p == current || thread_group_leader(p)) ? p : NULL; | ||
55 | } | 80 | } |
81 | |||
82 | /* | ||
83 | * For processes require that p is group leader. | ||
84 | */ | ||
85 | return has_group_leader_pid(p) ? p : NULL; | ||
86 | } | ||
87 | |||
88 | static struct task_struct *__get_task_for_clock(const clockid_t clock, | ||
89 | bool getref, bool gettime) | ||
90 | { | ||
91 | const bool thread = !!CPUCLOCK_PERTHREAD(clock); | ||
92 | const pid_t pid = CPUCLOCK_PID(clock); | ||
93 | struct task_struct *p; | ||
94 | |||
95 | if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) | ||
96 | return NULL; | ||
97 | |||
98 | rcu_read_lock(); | ||
99 | p = lookup_task(pid, thread, gettime); | ||
100 | if (p && getref) | ||
101 | get_task_struct(p); | ||
56 | rcu_read_unlock(); | 102 | rcu_read_unlock(); |
103 | return p; | ||
104 | } | ||
57 | 105 | ||
58 | return error; | 106 | static inline struct task_struct *get_task_for_clock(const clockid_t clock) |
107 | { | ||
108 | return __get_task_for_clock(clock, true, false); | ||
109 | } | ||
110 | |||
111 | static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) | ||
112 | { | ||
113 | return __get_task_for_clock(clock, true, true); | ||
114 | } | ||
115 | |||
116 | static inline int validate_clock_permissions(const clockid_t clock) | ||
117 | { | ||
118 | return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; | ||
59 | } | 119 | } |
60 | 120 | ||
61 | /* | 121 | /* |
62 | * Update expiry time from increment, and increase overrun count, | 122 | * Update expiry time from increment, and increase overrun count, |
63 | * given the current clock sample. | 123 | * given the current clock sample. |
64 | */ | 124 | */ |
65 | static void bump_cpu_timer(struct k_itimer *timer, u64 now) | 125 | static u64 bump_cpu_timer(struct k_itimer *timer, u64 now) |
66 | { | 126 | { |
127 | u64 delta, incr, expires = timer->it.cpu.node.expires; | ||
67 | int i; | 128 | int i; |
68 | u64 delta, incr; | ||
69 | 129 | ||
70 | if (!timer->it_interval) | 130 | if (!timer->it_interval) |
71 | return; | 131 | return expires; |
72 | 132 | ||
73 | if (now < timer->it.cpu.expires) | 133 | if (now < expires) |
74 | return; | 134 | return expires; |
75 | 135 | ||
76 | incr = timer->it_interval; | 136 | incr = timer->it_interval; |
77 | delta = now + incr - timer->it.cpu.expires; | 137 | delta = now + incr - expires; |
78 | 138 | ||
79 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | 139 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ |
80 | for (i = 0; incr < delta - incr; i++) | 140 | for (i = 0; incr < delta - incr; i++) |
@@ -84,48 +144,26 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) | |||
84 | if (delta < incr) | 144 | if (delta < incr) |
85 | continue; | 145 | continue; |
86 | 146 | ||
87 | timer->it.cpu.expires += incr; | 147 | timer->it.cpu.node.expires += incr; |
88 | timer->it_overrun += 1LL << i; | 148 | timer->it_overrun += 1LL << i; |
89 | delta -= incr; | 149 | delta -= incr; |
90 | } | 150 | } |
151 | return timer->it.cpu.node.expires; | ||
91 | } | 152 | } |
92 | 153 | ||
93 | /** | 154 | /* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */ |
94 | * task_cputime_zero - Check a task_cputime struct for all zero fields. | 155 | static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct) |
95 | * | ||
96 | * @cputime: The struct to compare. | ||
97 | * | ||
98 | * Checks @cputime to see if all fields are zero. Returns true if all fields | ||
99 | * are zero, false if any field is nonzero. | ||
100 | */ | ||
101 | static inline int task_cputime_zero(const struct task_cputime *cputime) | ||
102 | { | 156 | { |
103 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) | 157 | return !(~pct->bases[CPUCLOCK_PROF].nextevt | |
104 | return 1; | 158 | ~pct->bases[CPUCLOCK_VIRT].nextevt | |
105 | return 0; | 159 | ~pct->bases[CPUCLOCK_SCHED].nextevt); |
106 | } | ||
107 | |||
108 | static inline u64 prof_ticks(struct task_struct *p) | ||
109 | { | ||
110 | u64 utime, stime; | ||
111 | |||
112 | task_cputime(p, &utime, &stime); | ||
113 | |||
114 | return utime + stime; | ||
115 | } | ||
116 | static inline u64 virt_ticks(struct task_struct *p) | ||
117 | { | ||
118 | u64 utime, stime; | ||
119 | |||
120 | task_cputime(p, &utime, &stime); | ||
121 | |||
122 | return utime; | ||
123 | } | 160 | } |
124 | 161 | ||
125 | static int | 162 | static int |
126 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) | 163 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) |
127 | { | 164 | { |
128 | int error = check_clock(which_clock); | 165 | int error = validate_clock_permissions(which_clock); |
166 | |||
129 | if (!error) { | 167 | if (!error) { |
130 | tp->tv_sec = 0; | 168 | tp->tv_sec = 0; |
131 | tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); | 169 | tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); |
@@ -142,42 +180,66 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) | |||
142 | } | 180 | } |
143 | 181 | ||
144 | static int | 182 | static int |
145 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) | 183 | posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) |
146 | { | 184 | { |
185 | int error = validate_clock_permissions(clock); | ||
186 | |||
147 | /* | 187 | /* |
148 | * You can never reset a CPU clock, but we check for other errors | 188 | * You can never reset a CPU clock, but we check for other errors |
149 | * in the call before failing with EPERM. | 189 | * in the call before failing with EPERM. |
150 | */ | 190 | */ |
151 | int error = check_clock(which_clock); | 191 | return error ? : -EPERM; |
152 | if (error == 0) { | ||
153 | error = -EPERM; | ||
154 | } | ||
155 | return error; | ||
156 | } | 192 | } |
157 | 193 | ||
158 | |||
159 | /* | 194 | /* |
160 | * Sample a per-thread clock for the given task. | 195 | * Sample a per-thread clock for the given task. clkid is validated. |
161 | */ | 196 | */ |
162 | static int cpu_clock_sample(const clockid_t which_clock, | 197 | static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) |
163 | struct task_struct *p, u64 *sample) | ||
164 | { | 198 | { |
165 | switch (CPUCLOCK_WHICH(which_clock)) { | 199 | u64 utime, stime; |
166 | default: | 200 | |
167 | return -EINVAL; | 201 | if (clkid == CPUCLOCK_SCHED) |
202 | return task_sched_runtime(p); | ||
203 | |||
204 | task_cputime(p, &utime, &stime); | ||
205 | |||
206 | switch (clkid) { | ||
168 | case CPUCLOCK_PROF: | 207 | case CPUCLOCK_PROF: |
169 | *sample = prof_ticks(p); | 208 | return utime + stime; |
170 | break; | ||
171 | case CPUCLOCK_VIRT: | 209 | case CPUCLOCK_VIRT: |
172 | *sample = virt_ticks(p); | 210 | return utime; |
173 | break; | 211 | default: |
174 | case CPUCLOCK_SCHED: | 212 | WARN_ON_ONCE(1); |
175 | *sample = task_sched_runtime(p); | ||
176 | break; | ||
177 | } | 213 | } |
178 | return 0; | 214 | return 0; |
179 | } | 215 | } |
180 | 216 | ||
217 | static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) | ||
218 | { | ||
219 | samples[CPUCLOCK_PROF] = stime + utime; | ||
220 | samples[CPUCLOCK_VIRT] = utime; | ||
221 | samples[CPUCLOCK_SCHED] = rtime; | ||
222 | } | ||
223 | |||
224 | static void task_sample_cputime(struct task_struct *p, u64 *samples) | ||
225 | { | ||
226 | u64 stime, utime; | ||
227 | |||
228 | task_cputime(p, &utime, &stime); | ||
229 | store_samples(samples, stime, utime, p->se.sum_exec_runtime); | ||
230 | } | ||
231 | |||
232 | static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, | ||
233 | u64 *samples) | ||
234 | { | ||
235 | u64 stime, utime, rtime; | ||
236 | |||
237 | utime = atomic64_read(&at->utime); | ||
238 | stime = atomic64_read(&at->stime); | ||
239 | rtime = atomic64_read(&at->sum_exec_runtime); | ||
240 | store_samples(samples, stime, utime, rtime); | ||
241 | } | ||
242 | |||
181 | /* | 243 | /* |
182 | * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg | 244 | * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg |
183 | * to avoid race conditions with concurrent updates to cputime. | 245 | * to avoid race conditions with concurrent updates to cputime. |
@@ -193,29 +255,56 @@ retry: | |||
193 | } | 255 | } |
194 | } | 256 | } |
195 | 257 | ||
196 | static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) | 258 | static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, |
259 | struct task_cputime *sum) | ||
197 | { | 260 | { |
198 | __update_gt_cputime(&cputime_atomic->utime, sum->utime); | 261 | __update_gt_cputime(&cputime_atomic->utime, sum->utime); |
199 | __update_gt_cputime(&cputime_atomic->stime, sum->stime); | 262 | __update_gt_cputime(&cputime_atomic->stime, sum->stime); |
200 | __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); | 263 | __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); |
201 | } | 264 | } |
202 | 265 | ||
203 | /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ | 266 | /** |
204 | static inline void sample_cputime_atomic(struct task_cputime *times, | 267 | * thread_group_sample_cputime - Sample cputime for a given task |
205 | struct task_cputime_atomic *atomic_times) | 268 | * @tsk: Task for which cputime needs to be started |
269 | * @samples: Storage for time samples | ||
270 | * | ||
271 | * Called from sys_getitimer() to calculate the expiry time of an active | ||
272 | * timer. That means group cputime accounting is already active. Called | ||
273 | * with task sighand lock held. | ||
274 | * | ||
275 | * Updates @times with an uptodate sample of the thread group cputimes. | ||
276 | */ | ||
277 | void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) | ||
206 | { | 278 | { |
207 | times->utime = atomic64_read(&atomic_times->utime); | 279 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
208 | times->stime = atomic64_read(&atomic_times->stime); | 280 | struct posix_cputimers *pct = &tsk->signal->posix_cputimers; |
209 | times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); | 281 | |
282 | WARN_ON_ONCE(!pct->timers_active); | ||
283 | |||
284 | proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); | ||
210 | } | 285 | } |
211 | 286 | ||
212 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | 287 | /** |
288 | * thread_group_start_cputime - Start cputime and return a sample | ||
289 | * @tsk: Task for which cputime needs to be started | ||
290 | * @samples: Storage for time samples | ||
291 | * | ||
292 | * The thread group cputime accounting is avoided when there are no posix | ||
293 | * CPU timers armed. Before starting a timer it's required to check whether | ||
294 | * the time accounting is active. If not, a full update of the atomic | ||
295 | * accounting store needs to be done and the accounting enabled. | ||
296 | * | ||
297 | * Updates @times with an uptodate sample of the thread group cputimes. | ||
298 | */ | ||
299 | static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) | ||
213 | { | 300 | { |
214 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 301 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
215 | struct task_cputime sum; | 302 | struct posix_cputimers *pct = &tsk->signal->posix_cputimers; |
216 | 303 | ||
217 | /* Check if cputimer isn't running. This is accessed without locking. */ | 304 | /* Check if cputimer isn't running. This is accessed without locking. */ |
218 | if (!READ_ONCE(cputimer->running)) { | 305 | if (!READ_ONCE(pct->timers_active)) { |
306 | struct task_cputime sum; | ||
307 | |||
219 | /* | 308 | /* |
220 | * The POSIX timer interface allows for absolute time expiry | 309 | * The POSIX timer interface allows for absolute time expiry |
221 | * values through the TIMER_ABSTIME flag, therefore we have | 310 | * values through the TIMER_ABSTIME flag, therefore we have |
@@ -225,94 +314,69 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
225 | update_gt_cputime(&cputimer->cputime_atomic, &sum); | 314 | update_gt_cputime(&cputimer->cputime_atomic, &sum); |
226 | 315 | ||
227 | /* | 316 | /* |
228 | * We're setting cputimer->running without a lock. Ensure | 317 | * We're setting timers_active without a lock. Ensure this |
229 | * this only gets written to in one operation. We set | 318 | * only gets written to in one operation. We set it after |
230 | * running after update_gt_cputime() as a small optimization, | 319 | * update_gt_cputime() as a small optimization, but |
231 | * but barriers are not required because update_gt_cputime() | 320 | * barriers are not required because update_gt_cputime() |
232 | * can handle concurrent updates. | 321 | * can handle concurrent updates. |
233 | */ | 322 | */ |
234 | WRITE_ONCE(cputimer->running, true); | 323 | WRITE_ONCE(pct->timers_active, true); |
235 | } | 324 | } |
236 | sample_cputime_atomic(times, &cputimer->cputime_atomic); | 325 | proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); |
237 | } | 326 | } |
238 | 327 | ||
239 | /* | 328 | static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) |
240 | * Sample a process (thread group) clock for the given group_leader task. | ||
241 | * Must be called with task sighand lock held for safe while_each_thread() | ||
242 | * traversal. | ||
243 | */ | ||
244 | static int cpu_clock_sample_group(const clockid_t which_clock, | ||
245 | struct task_struct *p, | ||
246 | u64 *sample) | ||
247 | { | 329 | { |
248 | struct task_cputime cputime; | 330 | struct task_cputime ct; |
249 | 331 | ||
250 | switch (CPUCLOCK_WHICH(which_clock)) { | 332 | thread_group_cputime(tsk, &ct); |
251 | default: | 333 | store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime); |
252 | return -EINVAL; | ||
253 | case CPUCLOCK_PROF: | ||
254 | thread_group_cputime(p, &cputime); | ||
255 | *sample = cputime.utime + cputime.stime; | ||
256 | break; | ||
257 | case CPUCLOCK_VIRT: | ||
258 | thread_group_cputime(p, &cputime); | ||
259 | *sample = cputime.utime; | ||
260 | break; | ||
261 | case CPUCLOCK_SCHED: | ||
262 | thread_group_cputime(p, &cputime); | ||
263 | *sample = cputime.sum_exec_runtime; | ||
264 | break; | ||
265 | } | ||
266 | return 0; | ||
267 | } | 334 | } |
268 | 335 | ||
269 | static int posix_cpu_clock_get_task(struct task_struct *tsk, | 336 | /* |
270 | const clockid_t which_clock, | 337 | * Sample a process (thread group) clock for the given task clkid. If the |
271 | struct timespec64 *tp) | 338 | * group's cputime accounting is already enabled, read the atomic |
339 | * store. Otherwise a full update is required. Task's sighand lock must be | ||
340 | * held to protect the task traversal on a full update. clkid is already | ||
341 | * validated. | ||
342 | */ | ||
343 | static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, | ||
344 | bool start) | ||
272 | { | 345 | { |
273 | int err = -EINVAL; | 346 | struct thread_group_cputimer *cputimer = &p->signal->cputimer; |
274 | u64 rtn; | 347 | struct posix_cputimers *pct = &p->signal->posix_cputimers; |
348 | u64 samples[CPUCLOCK_MAX]; | ||
275 | 349 | ||
276 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 350 | if (!READ_ONCE(pct->timers_active)) { |
277 | if (same_thread_group(tsk, current)) | 351 | if (start) |
278 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 352 | thread_group_start_cputime(p, samples); |
353 | else | ||
354 | __thread_group_cputime(p, samples); | ||
279 | } else { | 355 | } else { |
280 | if (tsk == current || thread_group_leader(tsk)) | 356 | proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); |
281 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | ||
282 | } | 357 | } |
283 | 358 | ||
284 | if (!err) | 359 | return samples[clkid]; |
285 | *tp = ns_to_timespec64(rtn); | ||
286 | |||
287 | return err; | ||
288 | } | 360 | } |
289 | 361 | ||
290 | 362 | static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) | |
291 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) | ||
292 | { | 363 | { |
293 | const pid_t pid = CPUCLOCK_PID(which_clock); | 364 | const clockid_t clkid = CPUCLOCK_WHICH(clock); |
294 | int err = -EINVAL; | 365 | struct task_struct *tsk; |
366 | u64 t; | ||
295 | 367 | ||
296 | if (pid == 0) { | 368 | tsk = get_task_for_clock_get(clock); |
297 | /* | 369 | if (!tsk) |
298 | * Special case constant value for our own clocks. | 370 | return -EINVAL; |
299 | * We don't have to do any lookup to find ourselves. | ||
300 | */ | ||
301 | err = posix_cpu_clock_get_task(current, which_clock, tp); | ||
302 | } else { | ||
303 | /* | ||
304 | * Find the given PID, and validate that the caller | ||
305 | * should be able to see it. | ||
306 | */ | ||
307 | struct task_struct *p; | ||
308 | rcu_read_lock(); | ||
309 | p = find_task_by_vpid(pid); | ||
310 | if (p) | ||
311 | err = posix_cpu_clock_get_task(p, which_clock, tp); | ||
312 | rcu_read_unlock(); | ||
313 | } | ||
314 | 371 | ||
315 | return err; | 372 | if (CPUCLOCK_PERTHREAD(clock)) |
373 | t = cpu_clock_sample(clkid, tsk); | ||
374 | else | ||
375 | t = cpu_clock_sample_group(clkid, tsk, false); | ||
376 | put_task_struct(tsk); | ||
377 | |||
378 | *tp = ns_to_timespec64(t); | ||
379 | return 0; | ||
316 | } | 380 | } |
317 | 381 | ||
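
For reference, posix_cpu_clock_get() above is the kernel side of clock_gettime() on the CPU-time clock IDs. A minimal userspace sketch of that interface (illustrative only, not part of this patch):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec proc_ts, thread_ts;

        /* Burn a little CPU so the CPU-time clocks have advanced. */
        for (volatile unsigned long i = 0; i < 50000000UL; i++)
            ;

        /* Both reads end up in posix_cpu_clock_get() in the kernel. */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc_ts);
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_ts);

        printf("process CPU time: %ld.%09ld s\n",
               (long)proc_ts.tv_sec, proc_ts.tv_nsec);
        printf("thread  CPU time: %ld.%09ld s\n",
               (long)thread_ts.tv_sec, thread_ts.tv_nsec);
        return 0;
    }
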
318 | /* | 382 | /* |
@@ -322,44 +386,15 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *t | |||
322 | */ | 386 | */ |
323 | static int posix_cpu_timer_create(struct k_itimer *new_timer) | 387 | static int posix_cpu_timer_create(struct k_itimer *new_timer) |
324 | { | 388 | { |
325 | int ret = 0; | 389 | struct task_struct *p = get_task_for_clock(new_timer->it_clock); |
326 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); | ||
327 | struct task_struct *p; | ||
328 | 390 | ||
329 | if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) | 391 | if (!p) |
330 | return -EINVAL; | 392 | return -EINVAL; |
331 | 393 | ||
332 | new_timer->kclock = &clock_posix_cpu; | 394 | new_timer->kclock = &clock_posix_cpu; |
333 | 395 | timerqueue_init(&new_timer->it.cpu.node); | |
334 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | ||
335 | |||
336 | rcu_read_lock(); | ||
337 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | ||
338 | if (pid == 0) { | ||
339 | p = current; | ||
340 | } else { | ||
341 | p = find_task_by_vpid(pid); | ||
342 | if (p && !same_thread_group(p, current)) | ||
343 | p = NULL; | ||
344 | } | ||
345 | } else { | ||
346 | if (pid == 0) { | ||
347 | p = current->group_leader; | ||
348 | } else { | ||
349 | p = find_task_by_vpid(pid); | ||
350 | if (p && !has_group_leader_pid(p)) | ||
351 | p = NULL; | ||
352 | } | ||
353 | } | ||
354 | new_timer->it.cpu.task = p; | 396 | new_timer->it.cpu.task = p; |
355 | if (p) { | 397 | return 0; |
356 | get_task_struct(p); | ||
357 | } else { | ||
358 | ret = -EINVAL; | ||
359 | } | ||
360 | rcu_read_unlock(); | ||
361 | |||
362 | return ret; | ||
363 | } | 398 | } |
364 | 399 | ||
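
posix_cpu_timer_create() and, further down, posix_cpu_timer_set() back the timer_create()/timer_settime() syscalls for CPU clocks. A hedged userspace sketch of that usage (illustrative only; link with -lrt on older glibc):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    static volatile sig_atomic_t fired;

    static void handler(int sig)
    {
        (void)sig;
        fired = 1;
    }

    int main(void)
    {
        struct sigevent sev = { 0 };
        struct itimerspec its = { 0 };
        timer_t timerid;

        signal(SIGRTMIN, handler);

        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGRTMIN;
        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &timerid)) {
            perror("timer_create");
            return EXIT_FAILURE;
        }

        /* Expire after 100ms of CPU time consumed by the whole process. */
        its.it_value.tv_nsec = 100 * 1000 * 1000;
        timer_settime(timerid, 0, &its, NULL);

        while (!fired)
            ;    /* burn CPU until the CPU timer fires */

        puts("CPU timer fired");
        timer_delete(timerid);
        return 0;
    }
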
365 | /* | 400 | /* |
@@ -370,12 +405,14 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
370 | */ | 405 | */ |
371 | static int posix_cpu_timer_del(struct k_itimer *timer) | 406 | static int posix_cpu_timer_del(struct k_itimer *timer) |
372 | { | 407 | { |
373 | int ret = 0; | 408 | struct cpu_timer *ctmr = &timer->it.cpu; |
374 | unsigned long flags; | 409 | struct task_struct *p = ctmr->task; |
375 | struct sighand_struct *sighand; | 410 | struct sighand_struct *sighand; |
376 | struct task_struct *p = timer->it.cpu.task; | 411 | unsigned long flags; |
412 | int ret = 0; | ||
377 | 413 | ||
378 | WARN_ON_ONCE(p == NULL); | 414 | if (WARN_ON_ONCE(!p)) |
415 | return -EINVAL; | ||
379 | 416 | ||
380 | /* | 417 | /* |
381 | * Protect against sighand release/switch in exit/exec and process/ | 418 | * Protect against sighand release/switch in exit/exec and process/ |
@@ -384,15 +421,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) | |||
384 | sighand = lock_task_sighand(p, &flags); | 421 | sighand = lock_task_sighand(p, &flags); |
385 | if (unlikely(sighand == NULL)) { | 422 | if (unlikely(sighand == NULL)) { |
386 | /* | 423 | /* |
387 | * We raced with the reaping of the task. | 424 | * This raced with the reaping of the task. The exit cleanup |
388 | * The deletion should have cleared us off the list. | 425 | * should have removed this timer from the timer queue. |
389 | */ | 426 | */ |
390 | WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); | 427 | WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); |
391 | } else { | 428 | } else { |
392 | if (timer->it.cpu.firing) | 429 | if (timer->it.cpu.firing) |
393 | ret = TIMER_RETRY; | 430 | ret = TIMER_RETRY; |
394 | else | 431 | else |
395 | list_del(&timer->it.cpu.entry); | 432 | cpu_timer_dequeue(ctmr); |
396 | 433 | ||
397 | unlock_task_sighand(p, &flags); | 434 | unlock_task_sighand(p, &flags); |
398 | } | 435 | } |
@@ -403,25 +440,30 @@ static int posix_cpu_timer_del(struct k_itimer *timer) | |||
403 | return ret; | 440 | return ret; |
404 | } | 441 | } |
405 | 442 | ||
406 | static void cleanup_timers_list(struct list_head *head) | 443 | static void cleanup_timerqueue(struct timerqueue_head *head) |
407 | { | 444 | { |
408 | struct cpu_timer_list *timer, *next; | 445 | struct timerqueue_node *node; |
446 | struct cpu_timer *ctmr; | ||
409 | 447 | ||
410 | list_for_each_entry_safe(timer, next, head, entry) | 448 | while ((node = timerqueue_getnext(head))) { |
411 | list_del_init(&timer->entry); | 449 | timerqueue_del(head, node); |
450 | ctmr = container_of(node, struct cpu_timer, node); | ||
451 | ctmr->head = NULL; | ||
452 | } | ||
412 | } | 453 | } |
413 | 454 | ||
414 | /* | 455 | /* |
415 | * Clean out CPU timers still ticking when a thread exited. The task | 456 | * Clean out CPU timers which are still armed when a thread exits. The |
416 | * pointer is cleared, and the expiry time is replaced with the residual | 457 | * timers are only removed from the list. No other updates are done. The |
417 | * time for later timer_gettime calls to return. | 458 | * corresponding posix timers are still accessible, but cannot be rearmed. |
459 | * | ||
418 | * This must be called with the siglock held. | 460 | * This must be called with the siglock held. |
419 | */ | 461 | */ |
420 | static void cleanup_timers(struct list_head *head) | 462 | static void cleanup_timers(struct posix_cputimers *pct) |
421 | { | 463 | { |
422 | cleanup_timers_list(head); | 464 | cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); |
423 | cleanup_timers_list(++head); | 465 | cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); |
424 | cleanup_timers_list(++head); | 466 | cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead); |
425 | } | 467 | } |
426 | 468 | ||
427 | /* | 469 | /* |
@@ -431,16 +473,11 @@ static void cleanup_timers(struct list_head *head) | |||
431 | */ | 473 | */ |
432 | void posix_cpu_timers_exit(struct task_struct *tsk) | 474 | void posix_cpu_timers_exit(struct task_struct *tsk) |
433 | { | 475 | { |
434 | cleanup_timers(tsk->cpu_timers); | 476 | cleanup_timers(&tsk->posix_cputimers); |
435 | } | 477 | } |
436 | void posix_cpu_timers_exit_group(struct task_struct *tsk) | 478 | void posix_cpu_timers_exit_group(struct task_struct *tsk) |
437 | { | 479 | { |
438 | cleanup_timers(tsk->signal->cpu_timers); | 480 | cleanup_timers(&tsk->signal->posix_cputimers); |
439 | } | ||
440 | |||
441 | static inline int expires_gt(u64 expires, u64 new_exp) | ||
442 | { | ||
443 | return expires == 0 || expires > new_exp; | ||
444 | } | 481 | } |
445 | 482 | ||
446 | /* | 483 | /* |
@@ -449,58 +486,33 @@ static inline int expires_gt(u64 expires, u64 new_exp) | |||
449 | */ | 486 | */ |
450 | static void arm_timer(struct k_itimer *timer) | 487 | static void arm_timer(struct k_itimer *timer) |
451 | { | 488 | { |
452 | struct task_struct *p = timer->it.cpu.task; | 489 | int clkidx = CPUCLOCK_WHICH(timer->it_clock); |
453 | struct list_head *head, *listpos; | 490 | struct cpu_timer *ctmr = &timer->it.cpu; |
454 | struct task_cputime *cputime_expires; | 491 | u64 newexp = cpu_timer_getexpires(ctmr); |
455 | struct cpu_timer_list *const nt = &timer->it.cpu; | 492 | struct task_struct *p = ctmr->task; |
456 | struct cpu_timer_list *next; | 493 | struct posix_cputimer_base *base; |
457 | 494 | ||
458 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 495 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) |
459 | head = p->cpu_timers; | 496 | base = p->posix_cputimers.bases + clkidx; |
460 | cputime_expires = &p->cputime_expires; | 497 | else |
461 | } else { | 498 | base = p->signal->posix_cputimers.bases + clkidx; |
462 | head = p->signal->cpu_timers; | 499 | |
463 | cputime_expires = &p->signal->cputime_expires; | 500 | if (!cpu_timer_enqueue(&base->tqhead, ctmr)) |
464 | } | 501 | return; |
465 | head += CPUCLOCK_WHICH(timer->it_clock); | ||
466 | |||
467 | listpos = head; | ||
468 | list_for_each_entry(next, head, entry) { | ||
469 | if (nt->expires < next->expires) | ||
470 | break; | ||
471 | listpos = &next->entry; | ||
472 | } | ||
473 | list_add(&nt->entry, listpos); | ||
474 | |||
475 | if (listpos == head) { | ||
476 | u64 exp = nt->expires; | ||
477 | 502 | ||
478 | /* | 503 | /* |
479 | * We are the new earliest-expiring POSIX 1.b timer, hence | 504 | * We are the new earliest-expiring POSIX 1.b timer, hence |
480 | * need to update expiration cache. Take into account that | 505 | * need to update expiration cache. Take into account that |
481 | * for process timers we share expiration cache with itimers | 506 | * for process timers we share expiration cache with itimers |
482 | * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. | 507 | * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. |
483 | */ | 508 | */ |
509 | if (newexp < base->nextevt) | ||
510 | base->nextevt = newexp; | ||
484 | 511 | ||
485 | switch (CPUCLOCK_WHICH(timer->it_clock)) { | 512 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) |
486 | case CPUCLOCK_PROF: | 513 | tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); |
487 | if (expires_gt(cputime_expires->prof_exp, exp)) | 514 | else |
488 | cputime_expires->prof_exp = exp; | 515 | tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); |
489 | break; | ||
490 | case CPUCLOCK_VIRT: | ||
491 | if (expires_gt(cputime_expires->virt_exp, exp)) | ||
492 | cputime_expires->virt_exp = exp; | ||
493 | break; | ||
494 | case CPUCLOCK_SCHED: | ||
495 | if (expires_gt(cputime_expires->sched_exp, exp)) | ||
496 | cputime_expires->sched_exp = exp; | ||
497 | break; | ||
498 | } | ||
499 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) | ||
500 | tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); | ||
501 | else | ||
502 | tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); | ||
503 | } | ||
504 | } | 516 | } |
505 | 517 | ||
506 | /* | 518 | /* |
@@ -508,24 +520,26 @@ static void arm_timer(struct k_itimer *timer) | |||
508 | */ | 520 | */ |
509 | static void cpu_timer_fire(struct k_itimer *timer) | 521 | static void cpu_timer_fire(struct k_itimer *timer) |
510 | { | 522 | { |
523 | struct cpu_timer *ctmr = &timer->it.cpu; | ||
524 | |||
511 | if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | 525 | if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { |
512 | /* | 526 | /* |
513 | * The user doesn't want any signal. | 527 | * The user doesn't want any signal. |
514 | */ | 528 | */ |
515 | timer->it.cpu.expires = 0; | 529 | cpu_timer_setexpires(ctmr, 0); |
516 | } else if (unlikely(timer->sigq == NULL)) { | 530 | } else if (unlikely(timer->sigq == NULL)) { |
517 | /* | 531 | /* |
518 | * This is a special case for clock_nanosleep, | 532 | * This is a special case for clock_nanosleep, |
519 | * not a normal timer from sys_timer_create. | 533 | * not a normal timer from sys_timer_create. |
520 | */ | 534 | */ |
521 | wake_up_process(timer->it_process); | 535 | wake_up_process(timer->it_process); |
522 | timer->it.cpu.expires = 0; | 536 | cpu_timer_setexpires(ctmr, 0); |
523 | } else if (!timer->it_interval) { | 537 | } else if (!timer->it_interval) { |
524 | /* | 538 | /* |
525 | * One-shot timer. Clear it as soon as it's fired. | 539 | * One-shot timer. Clear it as soon as it's fired. |
526 | */ | 540 | */ |
527 | posix_timer_event(timer, 0); | 541 | posix_timer_event(timer, 0); |
528 | timer->it.cpu.expires = 0; | 542 | cpu_timer_setexpires(ctmr, 0); |
529 | } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { | 543 | } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { |
530 | /* | 544 | /* |
531 | * The signal did not get queued because the signal | 545 | * The signal did not get queued because the signal |
@@ -539,33 +553,6 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
539 | } | 553 | } |
540 | 554 | ||
541 | /* | 555 | /* |
542 | * Sample a process (thread group) timer for the given group_leader task. | ||
543 | * Must be called with task sighand lock held for safe while_each_thread() | ||
544 | * traversal. | ||
545 | */ | ||
546 | static int cpu_timer_sample_group(const clockid_t which_clock, | ||
547 | struct task_struct *p, u64 *sample) | ||
548 | { | ||
549 | struct task_cputime cputime; | ||
550 | |||
551 | thread_group_cputimer(p, &cputime); | ||
552 | switch (CPUCLOCK_WHICH(which_clock)) { | ||
553 | default: | ||
554 | return -EINVAL; | ||
555 | case CPUCLOCK_PROF: | ||
556 | *sample = cputime.utime + cputime.stime; | ||
557 | break; | ||
558 | case CPUCLOCK_VIRT: | ||
559 | *sample = cputime.utime; | ||
560 | break; | ||
561 | case CPUCLOCK_SCHED: | ||
562 | *sample = cputime.sum_exec_runtime; | ||
563 | break; | ||
564 | } | ||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Guts of sys_timer_settime for CPU timers. | 556 | * Guts of sys_timer_settime for CPU timers. |
570 | * This is called with the timer locked and interrupts disabled. | 557 | * This is called with the timer locked and interrupts disabled. |
571 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 558 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
@@ -574,13 +561,16 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
574 | static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | 561 | static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, |
575 | struct itimerspec64 *new, struct itimerspec64 *old) | 562 | struct itimerspec64 *new, struct itimerspec64 *old) |
576 | { | 563 | { |
577 | unsigned long flags; | 564 | clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); |
578 | struct sighand_struct *sighand; | ||
579 | struct task_struct *p = timer->it.cpu.task; | ||
580 | u64 old_expires, new_expires, old_incr, val; | 565 | u64 old_expires, new_expires, old_incr, val; |
581 | int ret; | 566 | struct cpu_timer *ctmr = &timer->it.cpu; |
567 | struct task_struct *p = ctmr->task; | ||
568 | struct sighand_struct *sighand; | ||
569 | unsigned long flags; | ||
570 | int ret = 0; | ||
582 | 571 | ||
583 | WARN_ON_ONCE(p == NULL); | 572 | if (WARN_ON_ONCE(!p)) |
573 | return -EINVAL; | ||
584 | 574 | ||
585 | /* | 575 | /* |
586 | * Use the to_ktime conversion because that clamps the maximum | 576 | * Use the to_ktime conversion because that clamps the maximum |
@@ -597,22 +587,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
597 | * If p has just been reaped, we can no | 587 | * If p has just been reaped, we can no |
598 | * longer get any information about it at all. | 588 | * longer get any information about it at all. |
599 | */ | 589 | */ |
600 | if (unlikely(sighand == NULL)) { | 590 | if (unlikely(sighand == NULL)) |
601 | return -ESRCH; | 591 | return -ESRCH; |
602 | } | ||
603 | 592 | ||
604 | /* | 593 | /* |
605 | * Disarm any old timer after extracting its expiry time. | 594 | * Disarm any old timer after extracting its expiry time. |
606 | */ | 595 | */ |
607 | |||
608 | ret = 0; | ||
609 | old_incr = timer->it_interval; | 596 | old_incr = timer->it_interval; |
610 | old_expires = timer->it.cpu.expires; | 597 | old_expires = cpu_timer_getexpires(ctmr); |
598 | |||
611 | if (unlikely(timer->it.cpu.firing)) { | 599 | if (unlikely(timer->it.cpu.firing)) { |
612 | timer->it.cpu.firing = -1; | 600 | timer->it.cpu.firing = -1; |
613 | ret = TIMER_RETRY; | 601 | ret = TIMER_RETRY; |
614 | } else | 602 | } else { |
615 | list_del_init(&timer->it.cpu.entry); | 603 | cpu_timer_dequeue(ctmr); |
604 | } | ||
616 | 605 | ||
617 | /* | 606 | /* |
618 | * We need to sample the current value to convert the new | 607 | * We need to sample the current value to convert the new |
@@ -622,11 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
622 | * times (in arm_timer). With an absolute time, we must | 611 | * times (in arm_timer). With an absolute time, we must |
623 | * check if it's already passed. In short, we need a sample. | 612 | * check if it's already passed. In short, we need a sample. |
624 | */ | 613 | */ |
625 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 614 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) |
626 | cpu_clock_sample(timer->it_clock, p, &val); | 615 | val = cpu_clock_sample(clkid, p); |
627 | } else { | 616 | else |
628 | cpu_timer_sample_group(timer->it_clock, p, &val); | 617 | val = cpu_clock_sample_group(clkid, p, true); |
629 | } | ||
630 | 618 | ||
631 | if (old) { | 619 | if (old) { |
632 | if (old_expires == 0) { | 620 | if (old_expires == 0) { |
@@ -634,18 +622,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
634 | old->it_value.tv_nsec = 0; | 622 | old->it_value.tv_nsec = 0; |
635 | } else { | 623 | } else { |
636 | /* | 624 | /* |
637 | * Update the timer in case it has | 625 | * Update the timer in case it has overrun already. |
638 | * overrun already. If it has, | 626 | * If it has, we'll report it as having overrun and |
639 | * we'll report it as having overrun | 627 | * with the next reloaded timer already ticking, |
640 | * and with the next reloaded timer | 628 | * though we are swallowing that pending |
641 | * already ticking, though we are | 629 | * notification here to install the new setting. |
642 | * swallowing that pending | ||
643 | * notification here to install the | ||
644 | * new setting. | ||
645 | */ | 630 | */ |
646 | bump_cpu_timer(timer, val); | 631 | u64 exp = bump_cpu_timer(timer, val); |
647 | if (val < timer->it.cpu.expires) { | 632 | |
648 | old_expires = timer->it.cpu.expires - val; | 633 | if (val < exp) { |
634 | old_expires = exp - val; | ||
649 | old->it_value = ns_to_timespec64(old_expires); | 635 | old->it_value = ns_to_timespec64(old_expires); |
650 | } else { | 636 | } else { |
651 | old->it_value.tv_nsec = 1; | 637 | old->it_value.tv_nsec = 1; |
@@ -674,7 +660,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
674 | * For a timer with no notification action, we don't actually | 660 | * For a timer with no notification action, we don't actually |
675 | * arm the timer (we'll just fake it for timer_gettime). | 661 | * arm the timer (we'll just fake it for timer_gettime). |
676 | */ | 662 | */ |
677 | timer->it.cpu.expires = new_expires; | 663 | cpu_timer_setexpires(ctmr, new_expires); |
678 | if (new_expires != 0 && val < new_expires) { | 664 | if (new_expires != 0 && val < new_expires) { |
679 | arm_timer(timer); | 665 | arm_timer(timer); |
680 | } | 666 | } |
@@ -715,24 +701,27 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
715 | 701 | ||
716 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) | 702 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) |
717 | { | 703 | { |
718 | u64 now; | 704 | clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); |
719 | struct task_struct *p = timer->it.cpu.task; | 705 | struct cpu_timer *ctmr = &timer->it.cpu; |
706 | u64 now, expires = cpu_timer_getexpires(ctmr); | ||
707 | struct task_struct *p = ctmr->task; | ||
720 | 708 | ||
721 | WARN_ON_ONCE(p == NULL); | 709 | if (WARN_ON_ONCE(!p)) |
710 | return; | ||
722 | 711 | ||
723 | /* | 712 | /* |
724 | * Easy part: convert the reload time. | 713 | * Easy part: convert the reload time. |
725 | */ | 714 | */ |
726 | itp->it_interval = ktime_to_timespec64(timer->it_interval); | 715 | itp->it_interval = ktime_to_timespec64(timer->it_interval); |
727 | 716 | ||
728 | if (!timer->it.cpu.expires) | 717 | if (!expires) |
729 | return; | 718 | return; |
730 | 719 | ||
731 | /* | 720 | /* |
732 | * Sample the clock to take the difference with the expiry time. | 721 | * Sample the clock to take the difference with the expiry time. |
733 | */ | 722 | */ |
734 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 723 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { |
735 | cpu_clock_sample(timer->it_clock, p, &now); | 724 | now = cpu_clock_sample(clkid, p); |
736 | } else { | 725 | } else { |
737 | struct sighand_struct *sighand; | 726 | struct sighand_struct *sighand; |
738 | unsigned long flags; | 727 | unsigned long flags; |
@@ -747,18 +736,18 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp | |||
747 | /* | 736 | /* |
748 | * The process has been reaped. | 737 | * The process has been reaped. |
749 | * We can't even collect a sample any more. | 738 | * We can't even collect a sample any more. |
750 | * Call the timer disarmed, nothing else to do. | 739 | * Disarm the timer, nothing else to do. |
751 | */ | 740 | */ |
752 | timer->it.cpu.expires = 0; | 741 | cpu_timer_setexpires(ctmr, 0); |
753 | return; | 742 | return; |
754 | } else { | 743 | } else { |
755 | cpu_timer_sample_group(timer->it_clock, p, &now); | 744 | now = cpu_clock_sample_group(clkid, p, false); |
756 | unlock_task_sighand(p, &flags); | 745 | unlock_task_sighand(p, &flags); |
757 | } | 746 | } |
758 | } | 747 | } |
759 | 748 | ||
760 | if (now < timer->it.cpu.expires) { | 749 | if (now < expires) { |
761 | itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); | 750 | itp->it_value = ns_to_timespec64(expires - now); |
762 | } else { | 751 | } else { |
763 | /* | 752 | /* |
764 | * The timer should have expired already, but the firing | 753 | * The timer should have expired already, but the firing |
@@ -769,26 +758,42 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp | |||
769 | } | 758 | } |
770 | } | 759 | } |
771 | 760 | ||
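
posix_cpu_timer_get() above serves timer_gettime() for armed CPU timers. A small userspace sketch, again illustrative only and not part of the patch:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct sigevent sev = { .sigev_notify = SIGEV_NONE };
        struct itimerspec its = { .it_value.tv_sec = 5 };
        struct itimerspec left;
        timer_t timerid;

        timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid);
        timer_settime(timerid, 0, &its, NULL);

        /* Consume some CPU, then ask how much of the 5s budget remains. */
        for (volatile unsigned long i = 0; i < 100000000UL; i++)
            ;

        timer_gettime(timerid, &left);
        printf("remaining: %ld.%09ld s\n",
               (long)left.it_value.tv_sec, left.it_value.tv_nsec);

        timer_delete(timerid);
        return 0;
    }
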
772 | static unsigned long long | 761 | #define MAX_COLLECTED 20 |
773 | check_timers_list(struct list_head *timers, | ||
774 | struct list_head *firing, | ||
775 | unsigned long long curr) | ||
776 | { | ||
777 | int maxfire = 20; | ||
778 | 762 | ||
779 | while (!list_empty(timers)) { | 763 | static u64 collect_timerqueue(struct timerqueue_head *head, |
780 | struct cpu_timer_list *t; | 764 | struct list_head *firing, u64 now) |
765 | { | ||
766 | struct timerqueue_node *next; | ||
767 | int i = 0; | ||
768 | |||
769 | while ((next = timerqueue_getnext(head))) { | ||
770 | struct cpu_timer *ctmr; | ||
771 | u64 expires; | ||
772 | |||
773 | ctmr = container_of(next, struct cpu_timer, node); | ||
774 | expires = cpu_timer_getexpires(ctmr); | ||
775 | /* Limit the number of timers to expire at once */ | ||
776 | if (++i == MAX_COLLECTED || now < expires) | ||
777 | return expires; | ||
778 | |||
779 | ctmr->firing = 1; | ||
780 | cpu_timer_dequeue(ctmr); | ||
781 | list_add_tail(&ctmr->elist, firing); | ||
782 | } | ||
781 | 783 | ||
782 | t = list_first_entry(timers, struct cpu_timer_list, entry); | 784 | return U64_MAX; |
785 | } | ||
783 | 786 | ||
784 | if (!--maxfire || curr < t->expires) | 787 | static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, |
785 | return t->expires; | 788 | struct list_head *firing) |
789 | { | ||
790 | struct posix_cputimer_base *base = pct->bases; | ||
791 | int i; | ||
786 | 792 | ||
787 | t->firing = 1; | 793 | for (i = 0; i < CPUCLOCK_MAX; i++, base++) { |
788 | list_move_tail(&t->entry, firing); | 794 | base->nextevt = collect_timerqueue(&base->tqhead, firing, |
795 | samples[i]); | ||
789 | } | 796 | } |
790 | |||
791 | return 0; | ||
792 | } | 797 | } |
793 | 798 | ||
794 | static inline void check_dl_overrun(struct task_struct *tsk) | 799 | static inline void check_dl_overrun(struct task_struct *tsk) |
@@ -799,6 +804,20 @@ static inline void check_dl_overrun(struct task_struct *tsk) | |||
799 | } | 804 | } |
800 | } | 805 | } |
801 | 806 | ||
807 | static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) | ||
808 | { | ||
809 | if (time < limit) | ||
810 | return false; | ||
811 | |||
812 | if (print_fatal_signals) { | ||
813 | pr_info("%s Watchdog Timeout (%s): %s[%d]\n", | ||
814 | rt ? "RT" : "CPU", hard ? "hard" : "soft", | ||
815 | current->comm, task_pid_nr(current)); | ||
816 | } | ||
817 | __group_send_sig_info(signo, SEND_SIG_PRIV, current); | ||
818 | return true; | ||
819 | } | ||
820 | |||
802 | /* | 821 | /* |
803 | * Check for any per-thread CPU timers that have fired and move them off | 822 | * Check for any per-thread CPU timers that have fired and move them off |
804 | * the tsk->cpu_timers[N] list onto the firing list. Here we update the | 823 | * the tsk->cpu_timers[N] list onto the firing list. Here we update the |
@@ -807,76 +826,50 @@ static inline void check_dl_overrun(struct task_struct *tsk) | |||
807 | static void check_thread_timers(struct task_struct *tsk, | 826 | static void check_thread_timers(struct task_struct *tsk, |
808 | struct list_head *firing) | 827 | struct list_head *firing) |
809 | { | 828 | { |
810 | struct list_head *timers = tsk->cpu_timers; | 829 | struct posix_cputimers *pct = &tsk->posix_cputimers; |
811 | struct task_cputime *tsk_expires = &tsk->cputime_expires; | 830 | u64 samples[CPUCLOCK_MAX]; |
812 | u64 expires; | ||
813 | unsigned long soft; | 831 | unsigned long soft; |
814 | 832 | ||
815 | if (dl_task(tsk)) | 833 | if (dl_task(tsk)) |
816 | check_dl_overrun(tsk); | 834 | check_dl_overrun(tsk); |
817 | 835 | ||
818 | /* | 836 | if (expiry_cache_is_inactive(pct)) |
819 | * If cputime_expires is zero, then there are no active | ||
820 | * per thread CPU timers. | ||
821 | */ | ||
822 | if (task_cputime_zero(&tsk->cputime_expires)) | ||
823 | return; | 837 | return; |
824 | 838 | ||
825 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); | 839 | task_sample_cputime(tsk, samples); |
826 | tsk_expires->prof_exp = expires; | 840 | collect_posix_cputimers(pct, samples, firing); |
827 | |||
828 | expires = check_timers_list(++timers, firing, virt_ticks(tsk)); | ||
829 | tsk_expires->virt_exp = expires; | ||
830 | |||
831 | tsk_expires->sched_exp = check_timers_list(++timers, firing, | ||
832 | tsk->se.sum_exec_runtime); | ||
833 | 841 | ||
834 | /* | 842 | /* |
835 | * Check for the special case thread timers. | 843 | * Check for the special case thread timers. |
836 | */ | 844 | */ |
837 | soft = task_rlimit(tsk, RLIMIT_RTTIME); | 845 | soft = task_rlimit(tsk, RLIMIT_RTTIME); |
838 | if (soft != RLIM_INFINITY) { | 846 | if (soft != RLIM_INFINITY) { |
847 | /* Task RT timeout is accounted in jiffies. RTTIME is usec */ | ||
848 | unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); | ||
839 | unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); | 849 | unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); |
840 | 850 | ||
851 | /* At the hard limit, send SIGKILL. No further action. */ | ||
841 | if (hard != RLIM_INFINITY && | 852 | if (hard != RLIM_INFINITY && |
842 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | 853 | check_rlimit(rttime, hard, SIGKILL, true, true)) |
843 | /* | ||
844 | * At the hard limit, we just die. | ||
845 | * No need to calculate anything else now. | ||
846 | */ | ||
847 | if (print_fatal_signals) { | ||
848 | pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", | ||
849 | tsk->comm, task_pid_nr(tsk)); | ||
850 | } | ||
851 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
852 | return; | 854 | return; |
853 | } | 855 | |
854 | if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { | 856 | /* At the soft limit, send a SIGXCPU every second */ |
855 | /* | 857 | if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { |
856 | * At the soft limit, send a SIGXCPU every second. | 858 | soft += USEC_PER_SEC; |
857 | */ | 859 | tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
858 | if (soft < hard) { | ||
859 | soft += USEC_PER_SEC; | ||
860 | tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = | ||
861 | soft; | ||
862 | } | ||
863 | if (print_fatal_signals) { | ||
864 | pr_info("RT Watchdog Timeout (soft): %s[%d]\n", | ||
865 | tsk->comm, task_pid_nr(tsk)); | ||
866 | } | ||
867 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
868 | } | 860 | } |
869 | } | 861 | } |
870 | if (task_cputime_zero(tsk_expires)) | 862 | |
863 | if (expiry_cache_is_inactive(pct)) | ||
871 | tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); | 864 | tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); |
872 | } | 865 | } |
873 | 866 | ||
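
The RLIMIT_RTTIME handling above can be exercised from userspace roughly as follows. This is a sketch under the assumption that the caller is allowed to switch to SCHED_FIFO (root or CAP_SYS_NICE); it deliberately spins without blocking until the hard limit delivers SIGKILL, so treat it as illustrative only:

    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void on_xcpu(int sig)
    {
        (void)sig;
        write(STDERR_FILENO, "SIGXCPU: RT soft limit hit\n", 27);
    }

    int main(void)
    {
        /* RLIMIT_RTTIME is in microseconds of CPU time between blocking calls. */
        struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 2000000 };
        struct sched_param sp = { .sched_priority = 1 };

        signal(SIGXCPU, on_xcpu);
        setrlimit(RLIMIT_RTTIME, &rl);

        if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
            perror("sched_setscheduler");
            return 1;
        }

        /* Spin without blocking: SIGXCPU past the soft limit, SIGKILL at the hard one. */
        for (;;)
            ;
    }
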
874 | static inline void stop_process_timers(struct signal_struct *sig) | 867 | static inline void stop_process_timers(struct signal_struct *sig) |
875 | { | 868 | { |
876 | struct thread_group_cputimer *cputimer = &sig->cputimer; | 869 | struct posix_cputimers *pct = &sig->posix_cputimers; |
877 | 870 | ||
878 | /* Turn off cputimer->running. This is done without locking. */ | 871 | /* Turn off the active flag. This is done without locking. */ |
879 | WRITE_ONCE(cputimer->running, false); | 872 | WRITE_ONCE(pct->timers_active, false); |
880 | tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); | 873 | tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); |
881 | } | 874 | } |
882 | 875 | ||
@@ -898,7 +891,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
898 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); | 891 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); |
899 | } | 892 | } |
900 | 893 | ||
901 | if (it->expires && (!*expires || it->expires < *expires)) | 894 | if (it->expires && it->expires < *expires) |
902 | *expires = it->expires; | 895 | *expires = it->expires; |
903 | } | 896 | } |
904 | 897 | ||
@@ -911,87 +904,69 @@ static void check_process_timers(struct task_struct *tsk, | |||
911 | struct list_head *firing) | 904 | struct list_head *firing) |
912 | { | 905 | { |
913 | struct signal_struct *const sig = tsk->signal; | 906 | struct signal_struct *const sig = tsk->signal; |
914 | u64 utime, ptime, virt_expires, prof_expires; | 907 | struct posix_cputimers *pct = &sig->posix_cputimers; |
915 | u64 sum_sched_runtime, sched_expires; | 908 | u64 samples[CPUCLOCK_MAX]; |
916 | struct list_head *timers = sig->cpu_timers; | ||
917 | struct task_cputime cputime; | ||
918 | unsigned long soft; | 909 | unsigned long soft; |
919 | 910 | ||
920 | /* | 911 | /* |
921 | * If cputimer is not running, then there are no active | 912 | * If there are no active process wide timers (POSIX 1.b, itimers, |
922 | * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). | 913 | * RLIMIT_CPU), there is nothing to check. Also skip the process wide timer |
914 | * processing when there is already another task handling them. | ||
923 | */ | 915 | */ |
924 | if (!READ_ONCE(tsk->signal->cputimer.running)) | 916 | if (!READ_ONCE(pct->timers_active) || pct->expiry_active) |
925 | return; | 917 | return; |
926 | 918 | ||
927 | /* | 919 | /* |
928 | * Signify that a thread is checking for process timers. | 920 | * Signify that a thread is checking for process timers. |
929 | * Write access to this field is protected by the sighand lock. | 921 | * Write access to this field is protected by the sighand lock. |
930 | */ | 922 | */ |
931 | sig->cputimer.checking_timer = true; | 923 | pct->expiry_active = true; |
932 | 924 | ||
933 | /* | 925 | /* |
934 | * Collect the current process totals. | 926 | * Collect the current process totals. Group accounting is active |
927 | * so the sample can be taken directly. | ||
935 | */ | 928 | */ |
936 | thread_group_cputimer(tsk, &cputime); | 929 | proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); |
937 | utime = cputime.utime; | 930 | collect_posix_cputimers(pct, samples, firing); |
938 | ptime = utime + cputime.stime; | ||
939 | sum_sched_runtime = cputime.sum_exec_runtime; | ||
940 | |||
941 | prof_expires = check_timers_list(timers, firing, ptime); | ||
942 | virt_expires = check_timers_list(++timers, firing, utime); | ||
943 | sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); | ||
944 | 931 | ||
945 | /* | 932 | /* |
946 | * Check for the special case process timers. | 933 | * Check for the special case process timers. |
947 | */ | 934 | */ |
948 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, | 935 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], |
949 | SIGPROF); | 936 | &pct->bases[CPUCLOCK_PROF].nextevt, |
950 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, | 937 | samples[CPUCLOCK_PROF], SIGPROF); |
951 | SIGVTALRM); | 938 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], |
939 | &pct->bases[CPUCLOCK_VIRT].nextevt, | ||
940 | samples[CPUCLOCK_VIRT], SIGVTALRM); | ||
941 | |||
952 | soft = task_rlimit(tsk, RLIMIT_CPU); | 942 | soft = task_rlimit(tsk, RLIMIT_CPU); |
953 | if (soft != RLIM_INFINITY) { | 943 | if (soft != RLIM_INFINITY) { |
954 | unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); | 944 | /* RLIMIT_CPU is in seconds. Samples are nanoseconds */ |
955 | unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); | 945 | unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); |
956 | u64 x; | 946 | u64 ptime = samples[CPUCLOCK_PROF]; |
957 | if (psecs >= hard) { | 947 | u64 softns = (u64)soft * NSEC_PER_SEC; |
958 | /* | 948 | u64 hardns = (u64)hard * NSEC_PER_SEC; |
959 | * At the hard limit, we just die. | 949 | |
960 | * No need to calculate anything else now. | 950 | /* At the hard limit, send SIGKILL. No further action. */ |
961 | */ | 951 | if (hard != RLIM_INFINITY && |
962 | if (print_fatal_signals) { | 952 | check_rlimit(ptime, hardns, SIGKILL, false, true)) |
963 | pr_info("RT Watchdog Timeout (hard): %s[%d]\n", | ||
964 | tsk->comm, task_pid_nr(tsk)); | ||
965 | } | ||
966 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
967 | return; | 953 | return; |
954 | |||
955 | /* At the soft limit, send a SIGXCPU every second */ | ||
956 | if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { | ||
957 | sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; | ||
958 | softns += NSEC_PER_SEC; | ||
968 | } | 959 | } |
969 | if (psecs >= soft) { | 960 | |
970 | /* | 961 | /* Update the expiry cache */ |
971 | * At the soft limit, send a SIGXCPU every second. | 962 | if (softns < pct->bases[CPUCLOCK_PROF].nextevt) |
972 | */ | 963 | pct->bases[CPUCLOCK_PROF].nextevt = softns; |
973 | if (print_fatal_signals) { | ||
974 | pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", | ||
975 | tsk->comm, task_pid_nr(tsk)); | ||
976 | } | ||
977 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
978 | if (soft < hard) { | ||
979 | soft++; | ||
980 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; | ||
981 | } | ||
982 | } | ||
983 | x = soft * NSEC_PER_SEC; | ||
984 | if (!prof_expires || x < prof_expires) | ||
985 | prof_expires = x; | ||
986 | } | 964 | } |
987 | 965 | ||
988 | sig->cputime_expires.prof_exp = prof_expires; | 966 | if (expiry_cache_is_inactive(pct)) |
989 | sig->cputime_expires.virt_exp = virt_expires; | ||
990 | sig->cputime_expires.sched_exp = sched_expires; | ||
991 | if (task_cputime_zero(&sig->cputime_expires)) | ||
992 | stop_process_timers(sig); | 967 | stop_process_timers(sig); |
993 | 968 | ||
994 | sig->cputimer.checking_timer = false; | 969 | pct->expiry_active = false; |
995 | } | 970 | } |
996 | 971 | ||
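
The RLIMIT_CPU path in check_process_timers() corresponds to this classic userspace behaviour, sketched here for illustration (not part of the patch): the soft limit raises SIGXCPU once per additional second of CPU time, the hard limit ends in SIGKILL.

    #include <signal.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void on_xcpu(int sig)
    {
        (void)sig;
        write(STDERR_FILENO, "SIGXCPU\n", 8);
    }

    int main(void)
    {
        /* RLIMIT_CPU is in seconds of consumed CPU time. */
        struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };

        signal(SIGXCPU, on_xcpu);
        setrlimit(RLIMIT_CPU, &rl);

        /* Burn CPU: SIGXCPU around 1s and 2s, SIGKILL at the 3s hard limit. */
        for (;;)
            ;
    }
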
997 | /* | 972 | /* |
@@ -1000,18 +975,21 @@ static void check_process_timers(struct task_struct *tsk, | |||
1000 | */ | 975 | */ |
1001 | static void posix_cpu_timer_rearm(struct k_itimer *timer) | 976 | static void posix_cpu_timer_rearm(struct k_itimer *timer) |
1002 | { | 977 | { |
978 | clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); | ||
979 | struct cpu_timer *ctmr = &timer->it.cpu; | ||
980 | struct task_struct *p = ctmr->task; | ||
1003 | struct sighand_struct *sighand; | 981 | struct sighand_struct *sighand; |
1004 | unsigned long flags; | 982 | unsigned long flags; |
1005 | struct task_struct *p = timer->it.cpu.task; | ||
1006 | u64 now; | 983 | u64 now; |
1007 | 984 | ||
1008 | WARN_ON_ONCE(p == NULL); | 985 | if (WARN_ON_ONCE(!p)) |
986 | return; | ||
1009 | 987 | ||
1010 | /* | 988 | /* |
1011 | * Fetch the current sample and update the timer's expiry time. | 989 | * Fetch the current sample and update the timer's expiry time. |
1012 | */ | 990 | */ |
1013 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 991 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { |
1014 | cpu_clock_sample(timer->it_clock, p, &now); | 992 | now = cpu_clock_sample(clkid, p); |
1015 | bump_cpu_timer(timer, now); | 993 | bump_cpu_timer(timer, now); |
1016 | if (unlikely(p->exit_state)) | 994 | if (unlikely(p->exit_state)) |
1017 | return; | 995 | return; |
@@ -1031,13 +1009,13 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) | |||
1031 | * The process has been reaped. | 1009 | * The process has been reaped. |
1032 | * We can't even collect a sample any more. | 1010 | * We can't even collect a sample any more. |
1033 | */ | 1011 | */ |
1034 | timer->it.cpu.expires = 0; | 1012 | cpu_timer_setexpires(ctmr, 0); |
1035 | return; | 1013 | return; |
1036 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1014 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1037 | /* If the process is dying, no need to rearm */ | 1015 | /* If the process is dying, no need to rearm */ |
1038 | goto unlock; | 1016 | goto unlock; |
1039 | } | 1017 | } |
1040 | cpu_timer_sample_group(timer->it_clock, p, &now); | 1018 | now = cpu_clock_sample_group(clkid, p, true); |
1041 | bump_cpu_timer(timer, now); | 1019 | bump_cpu_timer(timer, now); |
1042 | /* Leave the sighand locked for the call below. */ | 1020 | /* Leave the sighand locked for the call below. */ |
1043 | } | 1021 | } |
@@ -1051,26 +1029,24 @@ unlock: | |||
1051 | } | 1029 | } |
1052 | 1030 | ||
1053 | /** | 1031 | /** |
1054 | * task_cputime_expired - Compare two task_cputime entities. | 1032 | * task_cputimers_expired - Check whether posix CPU timers are expired |
1055 | * | 1033 | * |
1056 | * @sample: The task_cputime structure to be checked for expiration. | 1034 | * @samples: Array of current samples for the CPUCLOCK clocks |
1057 | * @expires: Expiration times, against which @sample will be checked. | 1035 | * @pct: Pointer to a posix_cputimers container |
1058 | * | 1036 | * |
1059 | * Checks @sample against @expires to see if any field of @sample has expired. | 1037 | * Returns true if any member of @samples is greater than the corresponding |
1060 | * Returns true if any field of the former is greater than the corresponding | 1037 | * Returns true if any member of @samples is greater than or equal to the |
1061 | * field of the latter if the latter field is set. Otherwise returns false. | 1038 | * corresponding member of @pct->bases[CLK].nextevt. False otherwise. |
1062 | */ | 1039 | */ |
1063 | static inline int task_cputime_expired(const struct task_cputime *sample, | 1040 | static inline bool |
1064 | const struct task_cputime *expires) | 1041 | task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) |
1065 | { | 1042 | { |
1066 | if (expires->utime && sample->utime >= expires->utime) | 1043 | int i; |
1067 | return 1; | 1044 | |
1068 | if (expires->stime && sample->utime + sample->stime >= expires->stime) | 1045 | for (i = 0; i < CPUCLOCK_MAX; i++) { |
1069 | return 1; | 1046 | if (sample[i] >= pct->bases[i].nextevt) |
1070 | if (expires->sum_exec_runtime != 0 && | 1047 | return true; |
1071 | sample->sum_exec_runtime >= expires->sum_exec_runtime) | 1048 | } |
1072 | return 1; | 1049 | return false; |
1073 | return 0; | ||
1074 | } | 1050 | } |
1075 | 1051 | ||
1076 | /** | 1052 | /** |
@@ -1083,48 +1059,50 @@ static inline int task_cputime_expired(const struct task_cputime *sample, | |||
1083 | * timers and compare them with the corresponding expiration times. Return | 1059 | * timers and compare them with the corresponding expiration times. Return |
1084 | * true if a timer has expired, else return false. | 1060 | * true if a timer has expired, else return false. |
1085 | */ | 1061 | */ |
1086 | static inline int fastpath_timer_check(struct task_struct *tsk) | 1062 | static inline bool fastpath_timer_check(struct task_struct *tsk) |
1087 | { | 1063 | { |
1064 | struct posix_cputimers *pct = &tsk->posix_cputimers; | ||
1088 | struct signal_struct *sig; | 1065 | struct signal_struct *sig; |
1089 | 1066 | ||
1090 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1067 | if (!expiry_cache_is_inactive(pct)) { |
1091 | struct task_cputime task_sample; | 1068 | u64 samples[CPUCLOCK_MAX]; |
1092 | 1069 | ||
1093 | task_cputime(tsk, &task_sample.utime, &task_sample.stime); | 1070 | task_sample_cputime(tsk, samples); |
1094 | task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; | 1071 | if (task_cputimers_expired(samples, pct)) |
1095 | if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) | 1072 | return true; |
1096 | return 1; | ||
1097 | } | 1073 | } |
1098 | 1074 | ||
1099 | sig = tsk->signal; | 1075 | sig = tsk->signal; |
1076 | pct = &sig->posix_cputimers; | ||
1100 | /* | 1077 | /* |
1101 | * Check if thread group timers expired when the cputimer is | 1078 | * Check if thread group timers expired when timers are active and |
1102 | * running and no other thread in the group is already checking | 1079 | * no other thread in the group is already handling expiry for |
1103 | * for thread group cputimers. These fields are read without the | 1080 | * thread group cputimers. These fields are read without the |
1104 | * sighand lock. However, this is fine because this is meant to | 1081 | * sighand lock. However, this is fine because this is meant to be |
1105 | * be a fastpath heuristic to determine whether we should try to | 1082 | * a fastpath heuristic to determine whether we should try to |
1106 | * acquire the sighand lock to check/handle timers. | 1083 | * acquire the sighand lock to handle timer expiry. |
1107 | * | 1084 | * |
1108 | * In the worst case scenario, if 'running' or 'checking_timer' gets | 1085 | * In the worst case scenario, if concurrently timers_active is set |
1109 | * set but the current thread doesn't see the change yet, we'll wait | 1086 | * or expiry_active is cleared, but the current thread doesn't see |
1110 | * until the next thread in the group gets a scheduler interrupt to | 1087 | * the change yet, the timer checks are delayed until the next |
1111 | * handle the timer. This isn't an issue in practice because these | 1088 | * thread in the group gets a scheduler interrupt to handle the |
1112 | * types of delays with signals actually getting sent are expected. | 1089 | * timer. This isn't an issue in practice because these types of |
1090 | * delays with signals actually getting sent are expected. | ||
1113 | */ | 1091 | */ |
1114 | if (READ_ONCE(sig->cputimer.running) && | 1092 | if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { |
1115 | !READ_ONCE(sig->cputimer.checking_timer)) { | 1093 | u64 samples[CPUCLOCK_MAX]; |
1116 | struct task_cputime group_sample; | ||
1117 | 1094 | ||
1118 | sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); | 1095 | proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, |
1096 | samples); | ||
1119 | 1097 | ||
1120 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1098 | if (task_cputimers_expired(samples, pct)) |
1121 | return 1; | 1099 | return true; |
1122 | } | 1100 | } |
1123 | 1101 | ||
1124 | if (dl_task(tsk) && tsk->dl.dl_overrun) | 1102 | if (dl_task(tsk) && tsk->dl.dl_overrun) |
1125 | return 1; | 1103 | return true; |
1126 | 1104 | ||
1127 | return 0; | 1105 | return false; |
1128 | } | 1106 | } |
1129 | 1107 | ||
1130 | /* | 1108 | /* |
@@ -1132,11 +1110,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1132 | * already updated our counts. We need to check if any timers fire now. | 1110 | * already updated our counts. We need to check if any timers fire now. |
1133 | * Interrupts are disabled. | 1111 | * Interrupts are disabled. |
1134 | */ | 1112 | */ |
1135 | void run_posix_cpu_timers(struct task_struct *tsk) | 1113 | void run_posix_cpu_timers(void) |
1136 | { | 1114 | { |
1137 | LIST_HEAD(firing); | 1115 | struct task_struct *tsk = current; |
1138 | struct k_itimer *timer, *next; | 1116 | struct k_itimer *timer, *next; |
1139 | unsigned long flags; | 1117 | unsigned long flags; |
1118 | LIST_HEAD(firing); | ||
1140 | 1119 | ||
1141 | lockdep_assert_irqs_disabled(); | 1120 | lockdep_assert_irqs_disabled(); |
1142 | 1121 | ||
@@ -1174,11 +1153,11 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1174 | * each timer's lock before clearing its firing flag, so no | 1153 | * each timer's lock before clearing its firing flag, so no |
1175 | * timer call will interfere. | 1154 | * timer call will interfere. |
1176 | */ | 1155 | */ |
1177 | list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { | 1156 | list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { |
1178 | int cpu_firing; | 1157 | int cpu_firing; |
1179 | 1158 | ||
1180 | spin_lock(&timer->it_lock); | 1159 | spin_lock(&timer->it_lock); |
1181 | list_del_init(&timer->it.cpu.entry); | 1160 | list_del_init(&timer->it.cpu.elist); |
1182 | cpu_firing = timer->it.cpu.firing; | 1161 | cpu_firing = timer->it.cpu.firing; |
1183 | timer->it.cpu.firing = 0; | 1162 | timer->it.cpu.firing = 0; |
1184 | /* | 1163 | /* |
@@ -1196,16 +1175,18 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1196 | * Set one of the process-wide special case CPU timers or RLIMIT_CPU. | 1175 | * Set one of the process-wide special case CPU timers or RLIMIT_CPU. |
1197 | * The tsk->sighand->siglock must be held by the caller. | 1176 | * The tsk->sighand->siglock must be held by the caller. |
1198 | */ | 1177 | */ |
1199 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | 1178 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, |
1200 | u64 *newval, u64 *oldval) | 1179 | u64 *newval, u64 *oldval) |
1201 | { | 1180 | { |
1202 | u64 now; | 1181 | u64 now, *nextevt; |
1203 | int ret; | 1182 | |
1183 | if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) | ||
1184 | return; | ||
1204 | 1185 | ||
1205 | WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); | 1186 | nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; |
1206 | ret = cpu_timer_sample_group(clock_idx, tsk, &now); | 1187 | now = cpu_clock_sample_group(clkid, tsk, true); |
1207 | 1188 | ||
1208 | if (oldval && ret != -EINVAL) { | 1189 | if (oldval) { |
1209 | /* | 1190 | /* |
1210 | * We are setting itimer. The *oldval is absolute and we update | 1191 | * We are setting itimer. The *oldval is absolute and we update |
1211 | * it to be relative, *newval argument is relative and we update | 1192 | * it to be relative, *newval argument is relative and we update |
@@ -1226,19 +1207,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1226 | } | 1207 | } |
1227 | 1208 | ||
1228 | /* | 1209 | /* |
1229 | * Update expiration cache if we are the earliest timer, or eventually | 1210 | * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF |
1230 | * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. | 1211 | * expiry cache is also used by RLIMIT_CPU. |
1231 | */ | 1212 | */ |
1232 | switch (clock_idx) { | 1213 | if (*newval < *nextevt) |
1233 | case CPUCLOCK_PROF: | 1214 | *nextevt = *newval; |
1234 | if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) | ||
1235 | tsk->signal->cputime_expires.prof_exp = *newval; | ||
1236 | break; | ||
1237 | case CPUCLOCK_VIRT: | ||
1238 | if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) | ||
1239 | tsk->signal->cputime_expires.virt_exp = *newval; | ||
1240 | break; | ||
1241 | } | ||
1242 | 1215 | ||
1243 | tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); | 1216 | tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); |
1244 | } | 1217 | } |
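
set_process_cpu_timer() above is reached from the setitimer() family; for CPUCLOCK_PROF that is ITIMER_PROF. An illustrative userspace sketch (not part of the patch):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/time.h>

    static volatile sig_atomic_t ticks;

    static void on_prof(int sig)
    {
        (void)sig;
        ticks++;
    }

    int main(void)
    {
        struct itimerval itv = {
            .it_interval = { .tv_usec = 100000 },  /* re-arm every 100ms of CPU */
            .it_value    = { .tv_usec = 100000 },
        };

        signal(SIGPROF, on_prof);
        setitimer(ITIMER_PROF, &itv, NULL);

        while (ticks < 10)
            ;    /* burn CPU; SIGPROF fires per ~100ms of consumed CPU time */

        printf("got %d SIGPROF ticks\n", (int)ticks);
        return 0;
    }
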
@@ -1260,6 +1233,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1260 | timer.it_overrun = -1; | 1233 | timer.it_overrun = -1; |
1261 | error = posix_cpu_timer_create(&timer); | 1234 | error = posix_cpu_timer_create(&timer); |
1262 | timer.it_process = current; | 1235 | timer.it_process = current; |
1236 | |||
1263 | if (!error) { | 1237 | if (!error) { |
1264 | static struct itimerspec64 zero_it; | 1238 | static struct itimerspec64 zero_it; |
1265 | struct restart_block *restart; | 1239 | struct restart_block *restart; |
@@ -1275,7 +1249,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1275 | } | 1249 | } |
1276 | 1250 | ||
1277 | while (!signal_pending(current)) { | 1251 | while (!signal_pending(current)) { |
1278 | if (timer.it.cpu.expires == 0) { | 1252 | if (!cpu_timer_getexpires(&timer.it.cpu)) { |
1279 | /* | 1253 | /* |
1280 | * Our timer fired and was reset, below | 1254 | * Our timer fired and was reset, below |
1281 | * deletion can not fail. | 1255 | * deletion can not fail. |
@@ -1297,7 +1271,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1297 | /* | 1271 | /* |
1298 | * We were interrupted by a signal. | 1272 | * We were interrupted by a signal. |
1299 | */ | 1273 | */ |
1300 | expires = timer.it.cpu.expires; | 1274 | expires = cpu_timer_getexpires(&timer.it.cpu); |
1301 | error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); | 1275 | error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); |
1302 | if (!error) { | 1276 | if (!error) { |
1303 | /* | 1277 | /* |
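
do_cpu_nanosleep() above implements clock_nanosleep() on the CPU clocks. A hedged userspace sketch (not part of the patch; build with -pthread) in which a second thread burns CPU so the process clock actually advances while the main thread sleeps on it:

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    static void *burn(void *arg)
    {
        (void)arg;
        for (;;)
            ;    /* consume CPU so CLOCK_PROCESS_CPUTIME_ID keeps ticking */
    }

    int main(void)
    {
        struct timespec ts = { .tv_nsec = 200 * 1000 * 1000 };  /* 200ms of CPU */
        pthread_t tid;

        pthread_create(&tid, NULL, burn, NULL);

        /* Returns once the whole process has consumed ~200ms more CPU time. */
        clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &ts, NULL);
        puts("process consumed ~200ms of CPU time");
        return 0;
    }
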
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index d7f2d91acdac..0ec5b7a1d769 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -442,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void) | |||
442 | 442 | ||
443 | static void k_itimer_rcu_free(struct rcu_head *head) | 443 | static void k_itimer_rcu_free(struct rcu_head *head) |
444 | { | 444 | { |
445 | struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); | 445 | struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); |
446 | 446 | ||
447 | kmem_cache_free(posix_timers_cache, tmr); | 447 | kmem_cache_free(posix_timers_cache, tmr); |
448 | } | 448 | } |
@@ -459,7 +459,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
459 | } | 459 | } |
460 | put_pid(tmr->it_pid); | 460 | put_pid(tmr->it_pid); |
461 | sigqueue_free(tmr->sigq); | 461 | sigqueue_free(tmr->sigq); |
462 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); | 462 | call_rcu(&tmr->rcu, k_itimer_rcu_free); |
463 | } | 463 | } |
464 | 464 | ||
465 | static int common_timer_create(struct k_itimer *new_timer) | 465 | static int common_timer_create(struct k_itimer *new_timer) |
@@ -805,6 +805,35 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) | |||
805 | return hrtimer_try_to_cancel(&timr->it.real.timer); | 805 | return hrtimer_try_to_cancel(&timr->it.real.timer); |
806 | } | 806 | } |
807 | 807 | ||
808 | static void common_timer_wait_running(struct k_itimer *timer) | ||
809 | { | ||
810 | hrtimer_cancel_wait_running(&timer->it.real.timer); | ||
811 | } | ||
812 | |||
813 | /* | ||
814 | * On PREEMPT_RT this prevent priority inversion against softirq kthread in | ||
815 | * case it gets preempted while executing a timer callback. See comments in | ||
816 | * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a | ||
817 | * cpu_relax(). | ||
818 | */ | ||
819 | static struct k_itimer *timer_wait_running(struct k_itimer *timer, | ||
820 | unsigned long *flags) | ||
821 | { | ||
822 | const struct k_clock *kc = READ_ONCE(timer->kclock); | ||
823 | timer_t timer_id = READ_ONCE(timer->it_id); | ||
824 | |||
825 | /* Prevent kfree(timer) after dropping the lock */ | ||
826 | rcu_read_lock(); | ||
827 | unlock_timer(timer, *flags); | ||
828 | |||
829 | if (!WARN_ON_ONCE(!kc->timer_wait_running)) | ||
830 | kc->timer_wait_running(timer); | ||
831 | |||
832 | rcu_read_unlock(); | ||
833 | /* Relock the timer. It might no longer be hashed. */ | ||
834 | return lock_timer(timer_id, flags); | ||
835 | } | ||
836 | |||
808 | /* Set a POSIX.1b interval timer. */ | 837 | /* Set a POSIX.1b interval timer. */ |
809 | int common_timer_set(struct k_itimer *timr, int flags, | 838 | int common_timer_set(struct k_itimer *timr, int flags, |
810 | struct itimerspec64 *new_setting, | 839 | struct itimerspec64 *new_setting, |
@@ -844,13 +873,13 @@ int common_timer_set(struct k_itimer *timr, int flags, | |||
844 | return 0; | 873 | return 0; |
845 | } | 874 | } |
846 | 875 | ||
847 | static int do_timer_settime(timer_t timer_id, int flags, | 876 | static int do_timer_settime(timer_t timer_id, int tmr_flags, |
848 | struct itimerspec64 *new_spec64, | 877 | struct itimerspec64 *new_spec64, |
849 | struct itimerspec64 *old_spec64) | 878 | struct itimerspec64 *old_spec64) |
850 | { | 879 | { |
851 | const struct k_clock *kc; | 880 | const struct k_clock *kc; |
852 | struct k_itimer *timr; | 881 | struct k_itimer *timr; |
853 | unsigned long flag; | 882 | unsigned long flags; |
854 | int error = 0; | 883 | int error = 0; |
855 | 884 | ||
856 | if (!timespec64_valid(&new_spec64->it_interval) || | 885 | if (!timespec64_valid(&new_spec64->it_interval) || |
@@ -859,8 +888,9 @@ static int do_timer_settime(timer_t timer_id, int flags, | |||
859 | 888 | ||
860 | if (old_spec64) | 889 | if (old_spec64) |
861 | memset(old_spec64, 0, sizeof(*old_spec64)); | 890 | memset(old_spec64, 0, sizeof(*old_spec64)); |
891 | |||
892 | timr = lock_timer(timer_id, &flags); | ||
862 | retry: | 893 | retry: |
863 | timr = lock_timer(timer_id, &flag); | ||
864 | if (!timr) | 894 | if (!timr) |
865 | return -EINVAL; | 895 | return -EINVAL; |
866 | 896 | ||
@@ -868,13 +898,16 @@ retry: | |||
868 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) | 898 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
869 | error = -EINVAL; | 899 | error = -EINVAL; |
870 | else | 900 | else |
871 | error = kc->timer_set(timr, flags, new_spec64, old_spec64); | 901 | error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); |
872 | 902 | ||
873 | unlock_timer(timr, flag); | ||
874 | if (error == TIMER_RETRY) { | 903 | if (error == TIMER_RETRY) { |
875 | old_spec64 = NULL; // We already got the old time... | 904 | // We already got the old time... |
905 | old_spec64 = NULL; | ||
906 | /* Unlocks and relocks the timer if it still exists */ | ||
907 | timr = timer_wait_running(timr, &flags); | ||
876 | goto retry; | 908 | goto retry; |
877 | } | 909 | } |
910 | unlock_timer(timr, flags); | ||
878 | 911 | ||
879 | return error; | 912 | return error; |
880 | } | 913 | } |
@@ -951,13 +984,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) | |||
951 | struct k_itimer *timer; | 984 | struct k_itimer *timer; |
952 | unsigned long flags; | 985 | unsigned long flags; |
953 | 986 | ||
954 | retry_delete: | ||
955 | timer = lock_timer(timer_id, &flags); | 987 | timer = lock_timer(timer_id, &flags); |
988 | |||
989 | retry_delete: | ||
956 | if (!timer) | 990 | if (!timer) |
957 | return -EINVAL; | 991 | return -EINVAL; |
958 | 992 | ||
959 | if (timer_delete_hook(timer) == TIMER_RETRY) { | 993 | if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { |
960 | unlock_timer(timer, flags); | 994 | /* Unlocks and relocks the timer if it still exists */ |
995 | timer = timer_wait_running(timer, &flags); | ||
961 | goto retry_delete; | 996 | goto retry_delete; |
962 | } | 997 | } |
963 | 998 | ||
@@ -1238,6 +1273,7 @@ static const struct k_clock clock_realtime = { | |||
1238 | .timer_forward = common_hrtimer_forward, | 1273 | .timer_forward = common_hrtimer_forward, |
1239 | .timer_remaining = common_hrtimer_remaining, | 1274 | .timer_remaining = common_hrtimer_remaining, |
1240 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | 1275 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, |
1276 | .timer_wait_running = common_timer_wait_running, | ||
1241 | .timer_arm = common_hrtimer_arm, | 1277 | .timer_arm = common_hrtimer_arm, |
1242 | }; | 1278 | }; |
1243 | 1279 | ||
@@ -1253,6 +1289,7 @@ static const struct k_clock clock_monotonic = { | |||
1253 | .timer_forward = common_hrtimer_forward, | 1289 | .timer_forward = common_hrtimer_forward, |
1254 | .timer_remaining = common_hrtimer_remaining, | 1290 | .timer_remaining = common_hrtimer_remaining, |
1255 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | 1291 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, |
1292 | .timer_wait_running = common_timer_wait_running, | ||
1256 | .timer_arm = common_hrtimer_arm, | 1293 | .timer_arm = common_hrtimer_arm, |
1257 | }; | 1294 | }; |
1258 | 1295 | ||
@@ -1283,6 +1320,7 @@ static const struct k_clock clock_tai = { | |||
1283 | .timer_forward = common_hrtimer_forward, | 1320 | .timer_forward = common_hrtimer_forward, |
1284 | .timer_remaining = common_hrtimer_remaining, | 1321 | .timer_remaining = common_hrtimer_remaining, |
1285 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | 1322 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, |
1323 | .timer_wait_running = common_timer_wait_running, | ||
1286 | .timer_arm = common_hrtimer_arm, | 1324 | .timer_arm = common_hrtimer_arm, |
1287 | }; | 1325 | }; |
1288 | 1326 | ||
@@ -1298,6 +1336,7 @@ static const struct k_clock clock_boottime = { | |||
1298 | .timer_forward = common_hrtimer_forward, | 1336 | .timer_forward = common_hrtimer_forward, |
1299 | .timer_remaining = common_hrtimer_remaining, | 1337 | .timer_remaining = common_hrtimer_remaining, |
1300 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | 1338 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, |
1339 | .timer_wait_running = common_timer_wait_running, | ||
1301 | .timer_arm = common_hrtimer_arm, | 1340 | .timer_arm = common_hrtimer_arm, |
1302 | }; | 1341 | }; |
1303 | 1342 | ||
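The do_timer_settime() and timer_delete() hunks above share one pattern: keep the timer locked while calling the k_clock operation, and when it reports TIMER_RETRY because the callback is currently executing, drop the lock, wait for the callback via timer_wait_running(), relock and retry. The following is a standalone userspace sketch of that pattern, not kernel code; the model_* names, OP_RETRY and the pthread mutex are illustrative stand-ins, and the sketch ignores the case where the timer vanishes while unlocked.

/*
 * Userspace model of the lock / retry / wait-for-callback pattern above.
 * All identifiers are made up; OP_RETRY stands in for TIMER_RETRY, the
 * mutex for lock_timer()/unlock_timer(), and timer_wait_running for the
 * new kc->timer_wait_running() hook.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define OP_RETRY	1

struct model_timer {
	pthread_mutex_t	lock;
	bool		callback_running;
	int		busy_rounds;	/* pretend the callback ends eventually */
};

struct model_clock_ops {
	int	(*timer_set)(struct model_timer *t);
	void	(*timer_wait_running)(struct model_timer *t);
};

static int model_timer_set(struct model_timer *t)
{
	/* Report OP_RETRY while the timer callback is "running". */
	if (t->callback_running && t->busy_rounds-- > 0)
		return OP_RETRY;
	t->callback_running = false;
	return 0;
}

static void model_wait_running(struct model_timer *t)
{
	/*
	 * Stand-in for the wait hook: a real implementation blocks until
	 * the running callback has finished instead of spinning.
	 */
	(void)t;
}

static const struct model_clock_ops ops = {
	.timer_set		= model_timer_set,
	.timer_wait_running	= model_wait_running,
};

static int do_settime(struct model_timer *t)
{
	int err;

	pthread_mutex_lock(&t->lock);
retry:
	err = ops.timer_set(t);
	if (err == OP_RETRY) {
		/* Drop the lock, wait for the callback, relock and retry. */
		pthread_mutex_unlock(&t->lock);
		ops.timer_wait_running(t);
		pthread_mutex_lock(&t->lock);
		goto retry;
	}
	pthread_mutex_unlock(&t->lock);
	return err;
}

int main(void)
{
	struct model_timer t = {
		.lock			= PTHREAD_MUTEX_INITIALIZER,
		.callback_running	= true,
		.busy_rounds		= 3,
	};

	printf("settime -> %d\n", do_settime(&t));
	return 0;
}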
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index de5daa6d975a..897c29e162b9 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h | |||
@@ -24,6 +24,7 @@ struct k_clock { | |||
24 | int (*timer_try_to_cancel)(struct k_itimer *timr); | 24 | int (*timer_try_to_cancel)(struct k_itimer *timr); |
25 | void (*timer_arm)(struct k_itimer *timr, ktime_t expires, | 25 | void (*timer_arm)(struct k_itimer *timr, ktime_t expires, |
26 | bool absolute, bool sigev_none); | 26 | bool absolute, bool sigev_none); |
27 | void (*timer_wait_running)(struct k_itimer *timr); | ||
27 | }; | 28 | }; |
28 | 29 | ||
29 | extern const struct k_clock clock_posix_cpu; | 30 | extern const struct k_clock clock_posix_cpu; |
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 5be6154e2fd2..c1f5bb590b5e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -59,11 +59,16 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
59 | * hrtimer_{start/cancel} functions call into tracing, so | 59 | * hrtimer_{start/cancel} functions call into tracing, so |
60 | * calls to these functions must be bound within RCU_NONIDLE. | 60 | * calls to these functions must be bound within RCU_NONIDLE. |
61 | */ | 61 | */ |
62 | RCU_NONIDLE({ | 62 | RCU_NONIDLE( |
63 | { | ||
63 | bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; | 64 | bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; |
64 | if (bc_moved) | 65 | if (bc_moved) { |
65 | hrtimer_start(&bctimer, expires, | 66 | hrtimer_start(&bctimer, expires, |
66 | HRTIMER_MODE_ABS_PINNED);}); | 67 | HRTIMER_MODE_ABS_PINNED_HARD); |
68 | } | ||
69 | } | ||
70 | ); | ||
71 | |||
67 | if (bc_moved) { | 72 | if (bc_moved) { |
68 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ |
69 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); |
@@ -104,7 +109,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) | |||
104 | 109 | ||
105 | void tick_setup_hrtimer_broadcast(void) | 110 | void tick_setup_hrtimer_broadcast(void) |
106 | { | 111 | { |
107 | hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 112 | hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); |
108 | bctimer.function = bc_handler; | 113 | bctimer.function = bc_handler; |
109 | clockevents_register_device(&ce_broadcast_hrtimer); | 114 | clockevents_register_device(&ce_broadcast_hrtimer); |
110 | } | 115 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be9707f68024..955851748dc3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -634,10 +634,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
634 | /* Forward the time to expire in the future */ | 634 | /* Forward the time to expire in the future */ |
635 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 635 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
636 | 636 | ||
637 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) | 637 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { |
638 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); | 638 | hrtimer_start_expires(&ts->sched_timer, |
639 | else | 639 | HRTIMER_MODE_ABS_PINNED_HARD); |
640 | } else { | ||
640 | tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); | 641 | tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); |
642 | } | ||
641 | 643 | ||
642 | /* | 644 | /* |
643 | * Reset to make sure next tick stop doesn't get fooled by past | 645 | * Reset to make sure next tick stop doesn't get fooled by past |
@@ -802,7 +804,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) | |||
802 | } | 804 | } |
803 | 805 | ||
804 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | 806 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { |
805 | hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); | 807 | hrtimer_start(&ts->sched_timer, tick, |
808 | HRTIMER_MODE_ABS_PINNED_HARD); | ||
806 | } else { | 809 | } else { |
807 | hrtimer_set_expires(&ts->sched_timer, tick); | 810 | hrtimer_set_expires(&ts->sched_timer, tick); |
808 | tick_program_event(tick, 1); | 811 | tick_program_event(tick, 1); |
@@ -1230,7 +1233,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
1230 | * Recycle the hrtimer in ts, so we can share the | 1233 | * Recycle the hrtimer in ts, so we can share the |
1231 | * hrtimer_forward with the highres code. | 1234 | * hrtimer_forward with the highres code. |
1232 | */ | 1235 | */ |
1233 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1236 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); |
1234 | /* Get the next period */ | 1237 | /* Get the next period */ |
1235 | next = tick_init_jiffy_update(); | 1238 | next = tick_init_jiffy_update(); |
1236 | 1239 | ||
@@ -1327,7 +1330,7 @@ void tick_setup_sched_timer(void) | |||
1327 | /* | 1330 | /* |
1328 | * Emulate tick processing via per-CPU hrtimers: | 1331 | * Emulate tick processing via per-CPU hrtimers: |
1329 | */ | 1332 | */ |
1330 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1333 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); |
1331 | ts->sched_timer.function = tick_sched_timer; | 1334 | ts->sched_timer.function = tick_sched_timer; |
1332 | 1335 | ||
1333 | /* Get the next period (per-CPU) */ | 1336 | /* Get the next period (per-CPU) */ |
@@ -1342,7 +1345,7 @@ void tick_setup_sched_timer(void) | |||
1342 | } | 1345 | } |
1343 | 1346 | ||
1344 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 1347 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
1345 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); | 1348 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); |
1346 | tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); | 1349 | tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); |
1347 | } | 1350 | } |
1348 | #endif /* HIGH_RES_TIMERS */ | 1351 | #endif /* HIGH_RES_TIMERS */ |
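The tick broadcast and tick-sched hunks above only change the hrtimer mode so the tick emulation keeps expiring in hard interrupt context on PREEMPT_RT. A minimal module-style sketch of the new *_HARD mode usage follows; it is illustrative only and not part of this patch (hard_timer and hard_timer_fn are made-up names).

/*
 * Sketch: arm an hrtimer that always expires in hard interrupt context,
 * even on PREEMPT_RT, by using the HRTIMER_MODE_*_HARD variants.
 */
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer hard_timer;

static enum hrtimer_restart hard_timer_fn(struct hrtimer *t)
{
	pr_info("hard_timer expired in hard interrupt context\n");
	return HRTIMER_NORESTART;
}

static int __init hard_timer_init(void)
{
	hrtimer_init(&hard_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hard_timer.function = hard_timer_fn;
	hrtimer_start(&hard_timer, ms_to_ktime(100), HRTIMER_MODE_REL_HARD);
	return 0;
}

static void __exit hard_timer_exit(void)
{
	hrtimer_cancel(&hard_timer);
}

module_init(hard_timer_init);
module_exit(hard_timer_exit);
MODULE_LICENSE("GPL");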
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 343c7ba33b1c..0e315a2e77ae 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64); | |||
196 | struct timer_base { | 196 | struct timer_base { |
197 | raw_spinlock_t lock; | 197 | raw_spinlock_t lock; |
198 | struct timer_list *running_timer; | 198 | struct timer_list *running_timer; |
199 | #ifdef CONFIG_PREEMPT_RT | ||
200 | spinlock_t expiry_lock; | ||
201 | atomic_t timer_waiters; | ||
202 | #endif | ||
199 | unsigned long clk; | 203 | unsigned long clk; |
200 | unsigned long next_expiry; | 204 | unsigned long next_expiry; |
201 | unsigned int cpu; | 205 | unsigned int cpu; |
@@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
1227 | } | 1231 | } |
1228 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1232 | EXPORT_SYMBOL(try_to_del_timer_sync); |
1229 | 1233 | ||
1230 | #ifdef CONFIG_SMP | 1234 | #ifdef CONFIG_PREEMPT_RT |
1235 | static __init void timer_base_init_expiry_lock(struct timer_base *base) | ||
1236 | { | ||
1237 | spin_lock_init(&base->expiry_lock); | ||
1238 | } | ||
1239 | |||
1240 | static inline void timer_base_lock_expiry(struct timer_base *base) | ||
1241 | { | ||
1242 | spin_lock(&base->expiry_lock); | ||
1243 | } | ||
1244 | |||
1245 | static inline void timer_base_unlock_expiry(struct timer_base *base) | ||
1246 | { | ||
1247 | spin_unlock(&base->expiry_lock); | ||
1248 | } | ||
1249 | |||
1250 | /* | ||
1251 | * The counterpart to del_timer_wait_running(). | ||
1252 | * | ||
1253 | * If there is a waiter for base->expiry_lock, then it was waiting for the | ||
1254 | * timer callback to finish. Drop expiry_lock and reacquire it. That allows | ||
1255 | * the waiter to acquire the lock and make progress. | ||
1256 | */ | ||
1257 | static void timer_sync_wait_running(struct timer_base *base) | ||
1258 | { | ||
1259 | if (atomic_read(&base->timer_waiters)) { | ||
1260 | spin_unlock(&base->expiry_lock); | ||
1261 | spin_lock(&base->expiry_lock); | ||
1262 | } | ||
1263 | } | ||
1264 | |||
1265 | /* | ||
1266 | * This function is called on PREEMPT_RT kernels when the fast path | ||
1267 | * deletion of a timer failed because the timer callback function was | ||
1268 | * running. | ||
1269 | * | ||
1270 | * This prevents priority inversion if the softirq thread on a remote CPU | ||
1271 | * got preempted, and it prevents a live lock when the task which tries to | ||
1272 | * delete a timer preempted the softirq thread running the timer callback | ||
1273 | * function. | ||
1274 | */ | ||
1275 | static void del_timer_wait_running(struct timer_list *timer) | ||
1276 | { | ||
1277 | u32 tf; | ||
1278 | |||
1279 | tf = READ_ONCE(timer->flags); | ||
1280 | if (!(tf & TIMER_MIGRATING)) { | ||
1281 | struct timer_base *base = get_timer_base(tf); | ||
1282 | |||
1283 | /* | ||
1284 | * Mark the base as contended and grab the expiry lock, | ||
1285 | * which is held by the softirq across the timer | ||
1286 | * callback. Drop the lock immediately so the softirq can | ||
1287 | * expire the next timer. In theory the timer could already | ||
1288 | * be running again, but that's more than unlikely and just | ||
1289 | * causes another wait loop. | ||
1290 | */ | ||
1291 | atomic_inc(&base->timer_waiters); | ||
1292 | spin_lock_bh(&base->expiry_lock); | ||
1293 | atomic_dec(&base->timer_waiters); | ||
1294 | spin_unlock_bh(&base->expiry_lock); | ||
1295 | } | ||
1296 | } | ||
1297 | #else | ||
1298 | static inline void timer_base_init_expiry_lock(struct timer_base *base) { } | ||
1299 | static inline void timer_base_lock_expiry(struct timer_base *base) { } | ||
1300 | static inline void timer_base_unlock_expiry(struct timer_base *base) { } | ||
1301 | static inline void timer_sync_wait_running(struct timer_base *base) { } | ||
1302 | static inline void del_timer_wait_running(struct timer_list *timer) { } | ||
1303 | #endif | ||
1304 | |||
1305 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) | ||
1231 | /** | 1306 | /** |
1232 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1307 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
1233 | * @timer: the timer to be deactivated | 1308 | * @timer: the timer to be deactivated |
@@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
1266 | */ | 1341 | */ |
1267 | int del_timer_sync(struct timer_list *timer) | 1342 | int del_timer_sync(struct timer_list *timer) |
1268 | { | 1343 | { |
1344 | int ret; | ||
1345 | |||
1269 | #ifdef CONFIG_LOCKDEP | 1346 | #ifdef CONFIG_LOCKDEP |
1270 | unsigned long flags; | 1347 | unsigned long flags; |
1271 | 1348 | ||
@@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer) | |||
1283 | * could lead to deadlock. | 1360 | * could lead to deadlock. |
1284 | */ | 1361 | */ |
1285 | WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); | 1362 | WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); |
1286 | for (;;) { | 1363 | |
1287 | int ret = try_to_del_timer_sync(timer); | 1364 | do { |
1288 | if (ret >= 0) | 1365 | ret = try_to_del_timer_sync(timer); |
1289 | return ret; | 1366 | |
1290 | cpu_relax(); | 1367 | if (unlikely(ret < 0)) { |
1291 | } | 1368 | del_timer_wait_running(timer); |
1369 | cpu_relax(); | ||
1370 | } | ||
1371 | } while (ret < 0); | ||
1372 | |||
1373 | return ret; | ||
1292 | } | 1374 | } |
1293 | EXPORT_SYMBOL(del_timer_sync); | 1375 | EXPORT_SYMBOL(del_timer_sync); |
1294 | #endif | 1376 | #endif |
@@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) | |||
1360 | if (timer->flags & TIMER_IRQSAFE) { | 1442 | if (timer->flags & TIMER_IRQSAFE) { |
1361 | raw_spin_unlock(&base->lock); | 1443 | raw_spin_unlock(&base->lock); |
1362 | call_timer_fn(timer, fn, baseclk); | 1444 | call_timer_fn(timer, fn, baseclk); |
1445 | base->running_timer = NULL; | ||
1363 | raw_spin_lock(&base->lock); | 1446 | raw_spin_lock(&base->lock); |
1364 | } else { | 1447 | } else { |
1365 | raw_spin_unlock_irq(&base->lock); | 1448 | raw_spin_unlock_irq(&base->lock); |
1366 | call_timer_fn(timer, fn, baseclk); | 1449 | call_timer_fn(timer, fn, baseclk); |
1450 | base->running_timer = NULL; | ||
1451 | timer_sync_wait_running(base); | ||
1367 | raw_spin_lock_irq(&base->lock); | 1452 | raw_spin_lock_irq(&base->lock); |
1368 | } | 1453 | } |
1369 | } | 1454 | } |
@@ -1643,7 +1728,7 @@ void update_process_times(int user_tick) | |||
1643 | #endif | 1728 | #endif |
1644 | scheduler_tick(); | 1729 | scheduler_tick(); |
1645 | if (IS_ENABLED(CONFIG_POSIX_TIMERS)) | 1730 | if (IS_ENABLED(CONFIG_POSIX_TIMERS)) |
1646 | run_posix_cpu_timers(p); | 1731 | run_posix_cpu_timers(); |
1647 | } | 1732 | } |
1648 | 1733 | ||
1649 | /** | 1734 | /** |
@@ -1658,6 +1743,7 @@ static inline void __run_timers(struct timer_base *base) | |||
1658 | if (!time_after_eq(jiffies, base->clk)) | 1743 | if (!time_after_eq(jiffies, base->clk)) |
1659 | return; | 1744 | return; |
1660 | 1745 | ||
1746 | timer_base_lock_expiry(base); | ||
1661 | raw_spin_lock_irq(&base->lock); | 1747 | raw_spin_lock_irq(&base->lock); |
1662 | 1748 | ||
1663 | /* | 1749 | /* |
@@ -1684,8 +1770,8 @@ static inline void __run_timers(struct timer_base *base) | |||
1684 | while (levels--) | 1770 | while (levels--) |
1685 | expire_timers(base, heads + levels); | 1771 | expire_timers(base, heads + levels); |
1686 | } | 1772 | } |
1687 | base->running_timer = NULL; | ||
1688 | raw_spin_unlock_irq(&base->lock); | 1773 | raw_spin_unlock_irq(&base->lock); |
1774 | timer_base_unlock_expiry(base); | ||
1689 | } | 1775 | } |
1690 | 1776 | ||
1691 | /* | 1777 | /* |
@@ -1930,6 +2016,7 @@ static void __init init_timer_cpu(int cpu) | |||
1930 | base->cpu = cpu; | 2016 | base->cpu = cpu; |
1931 | raw_spin_lock_init(&base->lock); | 2017 | raw_spin_lock_init(&base->lock); |
1932 | base->clk = jiffies; | 2018 | base->clk = jiffies; |
2019 | timer_base_init_expiry_lock(base); | ||
1933 | } | 2020 | } |
1934 | } | 2021 | } |
1935 | 2022 | ||
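The expiry_lock changes to kernel/time/timer.c above implement a simple handshake: the softirq holds expiry_lock across each non-IRQSAFE callback and briefly drops it in timer_sync_wait_running() when a waiter has registered, while del_timer_sync() registers itself in timer_waiters and blocks on expiry_lock in del_timer_wait_running() instead of spinning. Below is a self-contained userspace model of that handshake built on pthreads; all names and the sleep-based callback are illustrative, not kernel code.

/*
 * Userspace model of the expiry_lock handshake: the "softirq" thread runs
 * the callback with expiry_lock held, and a canceller that finds the
 * callback running blocks on expiry_lock rather than spinning.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t expiry_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int timer_waiters;
static atomic_bool callback_running = true;

static void msleep_model(long ms)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = ms * 1000000L };
	nanosleep(&ts, NULL);
}

/* Models expire_timers(): the callback runs with expiry_lock held. */
static void *softirq_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&expiry_lock);
	msleep_model(100);			/* long-running timer callback */
	atomic_store(&callback_running, false);

	/* Models timer_sync_wait_running(): let a contending waiter in. */
	if (atomic_load(&timer_waiters)) {
		pthread_mutex_unlock(&expiry_lock);
		pthread_mutex_lock(&expiry_lock);
	}
	pthread_mutex_unlock(&expiry_lock);
	return NULL;
}

/* Models del_timer_wait_running(): block on expiry_lock, do not spin. */
static void wait_for_running_callback(void)
{
	atomic_fetch_add(&timer_waiters, 1);
	pthread_mutex_lock(&expiry_lock);
	atomic_fetch_sub(&timer_waiters, 1);
	pthread_mutex_unlock(&expiry_lock);
}

int main(void)
{
	pthread_t softirq;

	pthread_create(&softirq, NULL, softirq_thread, NULL);
	msleep_model(10);			/* let the "callback" start */

	/* Models del_timer_sync() finding the callback in flight. */
	while (atomic_load(&callback_running))
		wait_for_running_callback();

	pthread_join(softirq, NULL);
	printf("callback finished, timer can be deleted\n");
	return 0;
}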
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9e7b9306fe..f41334ef0971 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -490,10 +490,10 @@ static void watchdog_enable(unsigned int cpu) | |||
490 | * Start the timer first to prevent the NMI watchdog triggering | 490 | * Start the timer first to prevent the NMI watchdog triggering |
491 | * before the timer has a chance to fire. | 491 | * before the timer has a chance to fire. |
492 | */ | 492 | */ |
493 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 493 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
494 | hrtimer->function = watchdog_timer_fn; | 494 | hrtimer->function = watchdog_timer_fn; |
495 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | 495 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), |
496 | HRTIMER_MODE_REL_PINNED); | 496 | HRTIMER_MODE_REL_PINNED_HARD); |
497 | 497 | ||
498 | /* Initialize timestamp */ | 498 | /* Initialize timestamp */ |
499 | __touch_watchdog(); | 499 | __touch_watchdog(); |