Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/hashtab.c                 |  84
-rw-r--r--  kernel/bpf/verifier.c                |   7
-rw-r--r--  kernel/events/core.c                 | 174
-rw-r--r--  kernel/events/uprobes.c              |   5
-rw-r--r--  kernel/futex.c                       |  23
-rw-r--r--  kernel/irq/affinity.c                |   2
-rw-r--r--  kernel/irq/chip.c                    |  11
-rw-r--r--  kernel/irq/manage.c                  |   8
-rw-r--r--  kernel/irq/msi.c                     |  11
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  |   2
-rw-r--r--  kernel/locking/qspinlock_stat.h      |   1
-rw-r--r--  kernel/power/hibernate.c             |   4
-rw-r--r--  kernel/power/snapshot.c              |  10
-rw-r--r--  kernel/printk/braille.c              |   4
-rw-r--r--  kernel/sched/core.c                  |  19
-rw-r--r--  kernel/sched/cpudeadline.c           |   2
-rw-r--r--  kernel/sched/cputime.c               |  41
-rw-r--r--  kernel/sched/deadline.c              |   5
-rw-r--r--  kernel/sched/fair.c                  |   2
-rw-r--r--  kernel/sysctl.c                      |  45
-rw-r--r--  kernel/time/timekeeping.c            |   5
-rw-r--r--  kernel/time/timekeeping_debug.c      |   9
-rw-r--r--  kernel/time/timer.c                  |   5
-rw-r--r--  kernel/trace/blktrace.c              |   2
24 files changed, 382 insertions, 99 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a19550d80ab1..3cfabdf7b942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL	(PERF_FLAG_FD_NO_GROUP |\
 			 PERF_FLAG_FD_OUTPUT  |\
 			 PERF_FLAG_PID_CGROUP |\
@@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 			 struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
@@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event,
 	 * maintained, otherwise bogus information is return
 	 * via read() for time_enabled, time_running:
 	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
+	if (event->state == PERF_EVENT_STATE_INACTIVE &&
+	    !event_filter_match(event)) {
 		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
@@ -2236,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	/*
+	 * Ensures that if we can observe event->ctx, both the event and ctx
+	 * will be 'complete'. See perf_iterate_sb_cpu().
+	 */
+	smp_store_release(&event->ctx, ctx);
+
 	if (!task) {
 		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
@@ -3490,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -5969,6 +6029,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		/*
+		 * Skip events that are not fully formed yet; ensure that
+		 * if we observe event->ctx, both event and ctx will be
+		 * complete enough. See perf_install_in_context().
+		 */
+		if (!smp_load_acquire(&event->ctx))
+			continue;
+
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
 		if (!event_filter_match(event))
@@ -6098,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info)
 {
 	struct perf_event *event = info;
 	struct pmu *pmu = event->pmu;
-	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 	struct remote_output ro = {
 		.rb	= event->rb,
 	};
@@ -6553,15 +6621,6 @@ got_name:
 }
 
 /*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
-/*
  * Check whether inode and address range match filter criteria.
  */
 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6622,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7774,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
@@ -7905,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			goto fail;
 		}
 
-		if (token == IF_SRC_FILE) {
-			filename = match_strdup(&args[2]);
+		if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+			int fpos = filter->range ? 2 : 1;
+
+			filename = match_strdup(&args[fpos]);
 			if (!filename) {
 				ret = -ENOMEM;
 				goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e2d2..46cb3a301bc1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
-#define FLAGS_SHARED		0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED		0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED		0x00
+#endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
 
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
 	if (!key->both.ptr)
 		return;
 
+	/*
+	 * On MMU less systems futexes are always "private" as there is no per
+	 * process address space. We need the smp wmb nevertheless - yes,
+	 * arch/blackfin has MMU less SMP ...
+	 */
+	if (!IS_ENABLED(CONFIG_MMU)) {
+		smp_mb(); /* explicit smp_mb(); (B) */
+		return;
+	}
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_MMU))
+		return;
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		iput(key->shared.inode);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f68959341c0f..32f6cfcff212 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		return NULL;
 	}
 
+	get_online_cpus();
 	if (max_vecs >= num_online_cpus()) {
 		cpumask_copy(affinity_mask, cpu_online_mask);
 		*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		}
 		*nr_vecs = vecs;
 	}
+	put_online_cpus();
 
 	return affinity_mask;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b4c1bc7c9ca2..637389088b3f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		/*
+		 * We're about to start this interrupt immediately,
+		 * hence the need to set the trigger configuration.
+		 * But the .set_type callback may have overridden the
+		 * flow handler, ignoring that we're dealing with a
+		 * chained interrupt. Reset it immediately because we
+		 * do know better.
+		 */
+		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+		desc->handle_irq = handle;
+
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 73a2b786b5e9..9530fcd27704 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->percpu_dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 54999350162c..19e9dfbe97fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		else
 			dev_dbg(dev, "irq [%d-%d] for MSI\n",
 				virq, virq + desc->nvec_used - 1);
+		/*
+		 * This flag is set by the PCI layer as we need to activate
+		 * the MSI entries before the PCI layer enables MSI in the
+		 * card. Otherwise the card latches a random msi message.
+		 */
+		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+			struct irq_data *irq_data;
+
+			irq_data = irq_domain_get_irq_data(domain, desc->irq);
+			irq_domain_activate_irq(irq_data);
+		}
 	}
 
 	return 0;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e69056c..8a99abf58080 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 				goto gotlock;
 			}
 		}
-		WRITE_ONCE(pn->state, vcpu_halted);
+		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..b9d031516254 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
 	 */
 	if ((counter == qstat_pv_latency_kick) ||
 	    (counter == qstat_pv_latency_wake)) {
-		stat = 0;
 		if (kicks)
 			stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
 	}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a881c6a7ba74..33c79b6105c5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
 	save_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
 	error = swsusp_arch_suspend();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
 	if (error)
 		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
 			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
 	if (!in_suspend)
 		events_check_enabled = false;
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
 
 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/prefetch.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2972,6 +2973,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
 /*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+	prefetch(curr);
+	prefetch(&curr->exec_start);
+}
+
+/*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
  * pending runtime that have not been accounted yet.
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		prefetch_curr_exec_start(p);
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
 	if (old_idx == IDX_INVALID) {
 		cp->size++;
-		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].dl = dl;
 		cp->elements[cp->size - 1].cpu = cpu;
 		cp->elements[cpu].idx = cp->size - 1;
 		cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 	cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
 	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}
 
 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+	cputime_t cputime, steal;
 
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	account_idle_time(jiffies_to_cputime(ticks));
+	cputime = jiffies_to_cputime(ticks);
+	steal = steal_account_process_time(ULONG_MAX);
+
+	if (steal >= cputime)
+		return;
+
+	cputime -= steal;
+	account_idle_time(cputime);
 }
 
 /*
@@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}
 
-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}
 
 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
 			    (__force u64)(stime + utime));
 
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}
 
-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
@@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;
 
+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 *
 	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
 	 */
-	if (unlikely(!rq->online))
+	if (unlikely(!rq->online)) {
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+	}
 
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,buffer,lenp,ppos,
-		    	    NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }
 
 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a5143c..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool is_max_delta;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
 	 * We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		expires = basem;
 		base->is_idle = false;
 	} else {
-		expires = basem + (nextevt - basej) * TICK_NSEC;
+		if (!is_max_delta)
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 		/*
 		 * If we expect to sleep more than a tick, mark the base idle:
 		 */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7598e6ca817a..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);