author     Peter Zijlstra <peterz@infradead.org>      2015-06-11 08:46:48 -0400
committer  Thomas Gleixner <tglx@linutronix.de>       2015-06-18 18:09:56 -0400
commit     887d9dc989eb0154492e41e7c07492edbb088ba1
tree       53f3c08252d60c9ceb32fa488c1e0ab77a95b9ef
parent     c4bfa3f5f906aee2e084c5b1fb15caf876338ef8
hrtimer: Allow hrtimer::function() to free the timer
Currently an hrtimer callback function cannot free its own timer,
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
on it after the callback returns; freeing the timer from the callback
would therefore be a straightforward use-after-free.

Solve this by using a scheme similar to regular timers: track the
currently running timer in hrtimer_cpu_base::running.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 include/linux/hrtimer.h |  41
 kernel/time/hrtimer.c   | 114
 2 files changed, 107 insertions(+), 48 deletions(-)
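To make the commit's point concrete: below is a minimal, hypothetical sketch (not part of the patch) of the pattern this change makes legal — a one-shot timer whose callback frees the structure that contains it. The wrapper struct and the names oneshot_work, oneshot_expire and oneshot_schedule are invented for illustration; only the hrtimer and allocator calls are existing kernel APIs.

struct oneshot_work {
        struct hrtimer  timer;
        void            (*fn)(void *data);
        void            *data;
};

static enum hrtimer_restart oneshot_expire(struct hrtimer *t)
{
        struct oneshot_work *w = container_of(t, struct oneshot_work, timer);

        w->fn(w->data);
        kfree(w);                       /* allowed by this commit */
        return HRTIMER_NORESTART;       /* 'w' and the timer must not be touched again */
}

static int oneshot_schedule(void (*fn)(void *data), void *data, u64 delay_ns)
{
        struct oneshot_work *w = kmalloc(sizeof(*w), GFP_ATOMIC);

        if (!w)
                return -ENOMEM;

        w->fn = fn;
        w->data = data;
        hrtimer_init(&w->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        w->timer.function = oneshot_expire;
        hrtimer_start(&w->timer, ns_to_ktime(delay_ns), HRTIMER_MODE_REL);
        return 0;
}

Before this patch, __run_hrtimer() cleared HRTIMER_STATE_CALLBACK in timer->state after the callback returned, so the kfree() above would have been a use-after-free; with it, only cpu_base->running, which does not live inside the freed object, is written once the callback is done.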
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f9e57d3d126..5db055821ef3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00 inactive
  * 0x01 enqueued into rbtree
- * 0x02 callback function running
- * 0x04 timer is migrated to another cpu
  *
- * Special cases:
- * 0x03 callback function running and enqueued
- *      (was requeued on another CPU)
- * 0x05 timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *      timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE  0x00
 #define HRTIMER_STATE_ENQUEUED  0x01
-#define HRTIMER_STATE_CALLBACK  0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:               lock protecting the base and associated clock bases
  *                      and timers
+ * @seq:                seqcount around __run_hrtimer
+ * @running:            pointer to the currently running hrtimer
  * @cpu:                cpu number
  * @active_bases:       Bitfield to mark bases with active timers
  * @clock_was_set_seq:  Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
        raw_spinlock_t                  lock;
+       seqcount_t                      seq;
+       struct hrtimer                  *running;
        unsigned int                    cpu;
        unsigned int                    active_bases;
        unsigned int                    clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-       return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-       return timer->state & HRTIMER_STATE_CALLBACK;
+       return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1604157374d7..f026413de4d6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+       .seq = SEQCNT_ZERO(hrtimer_bases.seq),
        .clock_base =
        {
                {
@@ -111,6 +112,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+       .seq = SEQCNT_ZERO(migration_cpu_base),
+       .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base  migration_cpu_base.clock_base[0]
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 
        for (;;) {
                base = timer->base;
-               if (likely(base != NULL)) {
+               if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;
@@ -194,8 +207,8 @@ again:
        if (unlikely(hrtimer_callback_running(timer)))
                return base;
 
-       /* See the comment in lock_timer_base() */
-       timer->base = NULL;
+       /* See the comment in lock_hrtimer_base() */
+       timer->base = &migration_base;
        raw_spin_unlock(&base->cpu_base->lock);
        raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 
        base->cpu_base->active_bases |= 1 << base->index;
 
-       /*
-        * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-        * state of a possibly running callback.
-        */
-       timer->state |= HRTIMER_STATE_ENQUEUED;
+       timer->state = HRTIMER_STATE_ENQUEUED;
 
        return timerqueue_add(&base->active, &timer->node);
 }
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
        timer_stats_hrtimer_clear_start_info(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-       if (!restart) {
-               /*
-                * We must preserve the CALLBACK state flag here,
-                * otherwise we could move the timer base in
-                * switch_hrtimer_base.
-                */
-               state &= HRTIMER_STATE_CALLBACK;
-       }
+       if (!restart)
+               state = HRTIMER_STATE_INACTIVE;
+
        __remove_hrtimer(timer, base, state, reprogram);
        return 1;
 }
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+       struct hrtimer_cpu_base *cpu_base;
+       unsigned int seq;
+
+       do {
+               cpu_base = READ_ONCE(timer->base->cpu_base);
+               seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+               if (timer->state != HRTIMER_STATE_INACTIVE ||
+                   cpu_base->running == timer)
+                       return true;
+
+       } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+                cpu_base != READ_ONCE(timer->base->cpu_base));
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:   the timer is queued
+ *  - callback: the timer is being ran
+ *  - post:     the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                           struct hrtimer_clock_base *base,
                           struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
        enum hrtimer_restart (*fn)(struct hrtimer *);
        int restart;
 
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_held(&cpu_base->lock);
 
        debug_deactivate(timer);
-       __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+       cpu_base->running = timer;
+
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
+
+       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
 
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
        raw_spin_lock(&cpu_base->lock);
 
        /*
-        * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+        * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogramm the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base);
 
-       WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+       /*
+        * Separate the ->running assignment from the ->state assignment.
+        *
+        * As with a regular write barrier, this ensures the read side in
+        * hrtimer_active() cannot observe cpu_base->running == NULL &&
+        * timer->state == INACTIVE.
+        */
+       raw_write_seqcount_barrier(&cpu_base->seq);
 
-       timer->state &= ~HRTIMER_STATE_CALLBACK;
+       WARN_ON_ONCE(cpu_base->running != timer);
+       cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
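A note on the read side (an editor's gloss, not text from the patch): without the sequence count, hrtimer_active() could read timer->state as INACTIVE while the callback of one __run_hrtimer() invocation was still running, be delayed, and then read cpu_base->running as NULL only after that callback had finished and re-armed the timer — a false negative for a timer that was active the whole time. Each raw_write_seqcount_barrier() bumps the count, so a reader that straddles either barrier retries. That never-false-negative guarantee is what makes a caller-side wait loop like the following hypothetical sketch sound; the helper name is invented here, and it assumes nothing re-arms the timer concurrently.

/* Hypothetical sketch: wait for a timer we must not cancel to go idle. */
static void example_hrtimer_wait_idle(struct hrtimer *timer)
{
        /*
         * hrtimer_active() returns false only once the timer is neither
         * enqueued nor running its callback, so data the callback
         * dereferences may be torn down after this loop exits.
         */
        while (hrtimer_active(timer))
                cpu_relax();
}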