Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c | 8
-rw-r--r--  kernel/cpu.c | 5
-rw-r--r--  kernel/cpuset.c | 13
-rw-r--r--  kernel/events/core.c | 10
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/irq/chip.c | 16
-rw-r--r--  kernel/irq/manage.c | 127
-rw-r--r--  kernel/irq/msi.c | 11
-rw-r--r--  kernel/livepatch/core.c | 69
-rw-r--r--  kernel/locking/lockdep.c | 81
-rw-r--r--  kernel/locking/mcs_spinlock.h | 6
-rw-r--r--  kernel/locking/mutex.c | 51
-rw-r--r--  kernel/locking/osq_lock.c | 14
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 7
-rw-r--r--  kernel/locking/rwsem-xadd.c | 98
-rw-r--r--  kernel/locking/rwsem.c | 22
-rw-r--r--  kernel/locking/rwsem.h | 20
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/power/snapshot.c | 21
-rw-r--r--  kernel/sched/core.c | 119
-rw-r--r--  kernel/sched/deadline.c | 77
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 429
-rw-r--r--  kernel/sched/features.h | 13
-rw-r--r--  kernel/sched/idle.c | 5
-rw-r--r--  kernel/sched/rt.c | 181
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/time/Kconfig | 6
-rw-r--r--  kernel/time/Makefile | 6
-rw-r--r--  kernel/time/clockevents.c | 229
-rw-r--r--  kernel/time/clocksource.c | 173
-rw-r--r--  kernel/time/hrtimer.c | 9
-rw-r--r--  kernel/time/jiffies.c | 7
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/sched_clock.c | 236
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 11
-rw-r--r--  kernel/time/tick-broadcast.c | 179
-rw-r--r--  kernel/time/tick-common.c | 82
-rw-r--r--  kernel/time/tick-internal.h | 211
-rw-r--r--  kernel/time/tick-oneshot.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/tick-sched.h | 74
-rw-r--r--  kernel/time/timekeeping.c | 490
-rw-r--r--  kernel/time/timekeeping.h | 7
-rw-r--r--  kernel/time/timer.c | 149
-rw-r--r--  kernel/time/timer_list.c | 34
-rw-r--r--  kernel/trace/Kconfig | 28
-rw-r--r--  kernel/trace/ftrace.c | 44
-rw-r--r--  kernel/trace/ring_buffer.c | 10
-rw-r--r--  kernel/trace/trace.c | 491
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_entries.h | 6
-rw-r--r--  kernel/trace/trace_events.c | 153
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 7
-rw-r--r--  kernel/trace/trace_kprobe.c | 15
-rw-r--r--  kernel/trace/trace_probe.c | 19
-rw-r--r--  kernel/trace/trace_probe.h | 12
-rw-r--r--  kernel/trace/trace_stat.c | 10
-rw-r--r--  kernel/trace/trace_uprobe.c | 5
-rw-r--r--  kernel/workqueue.c | 847
63 files changed, 3351 insertions, 1727 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..a220fdb66568 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
3806 3806
3807static void pidlist_free(void *p) 3807static void pidlist_free(void *p)
3808{ 3808{
3809 if (is_vmalloc_addr(p)) 3809 kvfree(p);
3810 vfree(p);
3811 else
3812 kfree(p);
3813} 3810}
3814 3811
3815/* 3812/*
@@ -5040,6 +5037,9 @@ int __init cgroup_init(void)
5040 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); 5037 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5041 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); 5038 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5042 } 5039 }
5040
5041 if (ss->bind)
5042 ss->bind(init_css_set.subsys[ssid]);
5043 } 5043 }
5044 5044
5045 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 5045 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
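For context, the cgroup.c hunk above is a straight conversion to kvfree(), which accepts memory from either kmalloc() or vmalloc() and dispatches internally. A minimal sketch of the before/after pattern (helper names mirror the ones in the hunk, bodies are illustrative only):

#include <linux/mm.h>           /* kvfree(), is_vmalloc_addr() */
#include <linux/slab.h>         /* kfree() */
#include <linux/vmalloc.h>      /* vfree() */

/* Before: the caller had to know which allocator produced @p. */
static void example_pidlist_free_old(void *p)
{
        if (is_vmalloc_addr(p))
                vfree(p);
        else
                kfree(p);
}

/* After: kvfree() performs the same dispatch internally. */
static void example_pidlist_free_new(void *p)
{
        kvfree(p);
}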
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 57858cebd6b5..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <linux/tick.h>
23#include <trace/events/power.h> 24#include <trace/events/power.h>
24 25
25#include "smpboot.h" 26#include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
338 return err; 339 return err;
339 340
340 cpu_notify(CPU_DYING | param->mod, param->hcpu); 341 cpu_notify(CPU_DYING | param->mod, param->hcpu);
342 /* Give up timekeeping duties */
343 tick_handover_do_timer();
341 /* Park the stopper thread */ 344 /* Park the stopper thread */
342 kthread_park(current); 345 kthread_park(current);
343 return 0; 346 return 0;
@@ -413,10 +416,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
413 smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ 416 smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
414 per_cpu(cpu_dead_idle, cpu) = false; 417 per_cpu(cpu_dead_idle, cpu) = false;
415 418
419 hotplug_cpu__broadcast_tick_pull(cpu);
416 /* This actually kills the CPU. */ 420 /* This actually kills the CPU. */
417 __cpu_die(cpu); 421 __cpu_die(cpu);
418 422
419 /* CPU is completely dead: tell everyone. Too late to complain. */ 423 /* CPU is completely dead: tell everyone. Too late to complain. */
424 tick_cleanup_dead_cpu(cpu);
420 cpu_notify_nofail(CPU_DEAD | mod, hcpu); 425 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
421 426
422 check_for_tasks(cpu); 427 check_for_tasks(cpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..c68f0721df10 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
622 int csn; /* how many cpuset ptrs in csa so far */ 622 int csn; /* how many cpuset ptrs in csa so far */
623 int i, j, k; /* indices for partition finding loops */ 623 int i, j, k; /* indices for partition finding loops */
624 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 624 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
625 cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
625 struct sched_domain_attr *dattr; /* attributes for custom domains */ 626 struct sched_domain_attr *dattr; /* attributes for custom domains */
626 int ndoms = 0; /* number of sched domains in result */ 627 int ndoms = 0; /* number of sched domains in result */
627 int nslot; /* next empty doms[] struct cpumask slot */ 628 int nslot; /* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
631 dattr = NULL; 632 dattr = NULL;
632 csa = NULL; 633 csa = NULL;
633 634
635 if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
636 goto done;
637 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
638
634 /* Special case for the 99% of systems with one, full, sched domain */ 639 /* Special case for the 99% of systems with one, full, sched domain */
635 if (is_sched_load_balance(&top_cpuset)) { 640 if (is_sched_load_balance(&top_cpuset)) {
636 ndoms = 1; 641 ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
643 *dattr = SD_ATTR_INIT; 648 *dattr = SD_ATTR_INIT;
644 update_domain_attr_tree(dattr, &top_cpuset); 649 update_domain_attr_tree(dattr, &top_cpuset);
645 } 650 }
646 cpumask_copy(doms[0], top_cpuset.effective_cpus); 651 cpumask_and(doms[0], top_cpuset.effective_cpus,
652 non_isolated_cpus);
647 653
648 goto done; 654 goto done;
649 } 655 }
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
666 * the corresponding sched domain. 672 * the corresponding sched domain.
667 */ 673 */
668 if (!cpumask_empty(cp->cpus_allowed) && 674 if (!cpumask_empty(cp->cpus_allowed) &&
669 !is_sched_load_balance(cp)) 675 !(is_sched_load_balance(cp) &&
676 cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
670 continue; 677 continue;
671 678
672 if (is_sched_load_balance(cp)) 679 if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
748 755
749 if (apn == b->pn) { 756 if (apn == b->pn) {
750 cpumask_or(dp, dp, b->effective_cpus); 757 cpumask_or(dp, dp, b->effective_cpus);
758 cpumask_and(dp, dp, non_isolated_cpus);
751 if (dattr) 759 if (dattr)
752 update_domain_attr_tree(dattr + nslot, b); 760 update_domain_attr_tree(dattr + nslot, b);
753 761
@@ -760,6 +768,7 @@ restart:
760 BUG_ON(nslot != ndoms); 768 BUG_ON(nslot != ndoms);
761 769
762done: 770done:
771 free_cpumask_var(non_isolated_cpus);
763 kfree(csa); 772 kfree(csa);
764 773
765 /* 774 /*
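The cpuset.c change follows the common cpumask_var_t lifecycle: allocate the temporary mask, derive it with cpumask_andnot(), use it, and free it on the single exit path. A hedged, self-contained sketch of that pattern (the function name is invented; cpu_isolated_map is assumed to be visible to the caller, as it is to cpuset.c in this series):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/sched.h>        /* cpu_isolated_map declaration (assumed) */

/* Illustrative only: report which CPUs are eligible for load balancing. */
static int example_report_balance_cpus(void)
{
        cpumask_var_t non_isolated_cpus;

        if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
                return -ENOMEM;

        /* Everything possible minus the isolcpus= set. */
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);

        pr_info("CPUs %*pbl participate in load balancing\n",
                cpumask_pr_args(non_isolated_cpus));

        free_cpumask_var(non_isolated_cpus);
        return 0;
}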
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 453ef61311d4..2fabc0627165 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
4574{ 4574{
4575 struct perf_event *event = container_of(entry, 4575 struct perf_event *event = container_of(entry,
4576 struct perf_event, pending); 4576 struct perf_event, pending);
4577 int rctx;
4578
4579 rctx = perf_swevent_get_recursion_context();
4580 /*
4581 * If we 'fail' here, that's OK, it means recursion is already disabled
4582 * and we won't recurse 'further'.
4583 */
4577 4584
4578 if (event->pending_disable) { 4585 if (event->pending_disable) {
4579 event->pending_disable = 0; 4586 event->pending_disable = 0;
@@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
4584 event->pending_wakeup = 0; 4591 event->pending_wakeup = 0;
4585 perf_event_wakeup(event); 4592 perf_event_wakeup(event);
4586 } 4593 }
4594
4595 if (rctx >= 0)
4596 perf_swevent_put_recursion_context(rctx);
4587} 4597}
4588 4598
4589/* 4599/*
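The events/core.c hunk brackets the pending-IRQ work in a software-event recursion context so that code running from the irq_work cannot recurse back into the swevent machinery. A minimal sketch of that guard pattern, with an invented callback standing in for the real pending-event handling:

#include <linux/perf_event.h>

/* Illustrative guard around work that must not re-enter swevent code. */
static void example_guarded_work(void (*work)(void *), void *data)
{
        int rctx = perf_swevent_get_recursion_context();

        /*
         * A negative rctx means recursion is already disabled at this
         * context level; that is fine, we simply will not recurse further.
         */
        work(data);

        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
}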
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e3830e953..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
900 if (!p) 900 if (!p)
901 return -ESRCH; 901 return -ESRCH;
902 902
903 if (!p->mm) { 903 if (unlikely(p->flags & PF_KTHREAD)) {
904 put_task_struct(p); 904 put_task_struct(p);
905 return -EPERM; 905 return -EPERM;
906 } 906 }
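The futex.c change replaces a !p->mm test with a PF_KTHREAD check: a kernel thread can temporarily adopt a user mm via use_mm(), and an exiting user task can have already dropped its mm, so the flag is the reliable way to reject kernel threads as PI owners. A minimal sketch of the check (function name invented for illustration):

#include <linux/sched.h>

/* Illustrative: refuse to treat a kernel thread as a PI futex owner. */
static bool example_can_own_pi_futex(struct task_struct *p)
{
        /*
         * p->mm can be non-NULL for a kthread using use_mm(), and can be
         * NULL for an exiting user task, so the flag is the robust test.
         */
        return !(p->flags & PF_KTHREAD);
}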
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
948 948
949 return -ENOSYS; 949 return -ENOSYS;
950} 950}
951
952/**
953 * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
954 * @data: Pointer to interrupt specific data
955 * @on: Whether to set or reset the wake-up capability of this irq
956 *
957 * Conditional, as the underlying parent chip might not implement it.
958 */
959int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
960{
961 data = data->parent_data;
962 if (data->chip->irq_set_wake)
963 return data->chip->irq_set_wake(data, on);
964
965 return -ENOSYS;
966}
951#endif 967#endif
952 968
953/** 969/**
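irq_chip_set_wake_parent() exists so a child chip in a hierarchical irqdomain can forward wake configuration to its parent. In practice it is wired directly into the child's irq_chip callbacks next to the other *_parent helpers; a hedged sketch with an invented chip (assumes CONFIG_IRQ_DOMAIN_HIERARCHY):

#include <linux/irq.h>

/* Illustrative child chip that defers everything to its parent. */
static struct irq_chip example_child_chip = {
        .name           = "example-child",
        .irq_mask       = irq_chip_mask_parent,
        .irq_unmask     = irq_chip_unmask_parent,
        .irq_eoi        = irq_chip_eoi_parent,
        .irq_retrigger  = irq_chip_retrigger_hierarchy,
        .irq_set_wake   = irq_chip_set_wake_parent,
};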
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e691d5..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
68 * Do not use this for shutdown scenarios where you must be sure 68 * Do not use this for shutdown scenarios where you must be sure
69 * that all parts (hardirq and threaded handler) have completed. 69 * that all parts (hardirq and threaded handler) have completed.
70 * 70 *
71 * Returns: false if a threaded handler is active.
72 *
71 * This function may be called - with care - from IRQ context. 73 * This function may be called - with care - from IRQ context.
72 */ 74 */
73void synchronize_hardirq(unsigned int irq) 75bool synchronize_hardirq(unsigned int irq)
74{ 76{
75 struct irq_desc *desc = irq_to_desc(irq); 77 struct irq_desc *desc = irq_to_desc(irq);
76 78
77 if (desc) 79 if (desc) {
78 __synchronize_hardirq(desc); 80 __synchronize_hardirq(desc);
81 return !atomic_read(&desc->threads_active);
82 }
83
84 return true;
79} 85}
80EXPORT_SYMBOL(synchronize_hardirq); 86EXPORT_SYMBOL(synchronize_hardirq);
81 87
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq)
440} 446}
441EXPORT_SYMBOL(disable_irq); 447EXPORT_SYMBOL(disable_irq);
442 448
449/**
450 * disable_hardirq - disables an irq and waits for hardirq completion
451 * @irq: Interrupt to disable
452 *
453 * Disable the selected interrupt line. Enables and Disables are
454 * nested.
455 * This function waits for any pending hard IRQ handlers for this
456 * interrupt to complete before returning. If you use this function while
457 * holding a resource the hard IRQ handler may need you will deadlock.
458 *
459 * When used to optimistically disable an interrupt from atomic context
460 * the return value must be checked.
461 *
462 * Returns: false if a threaded handler is active.
463 *
464 * This function may be called - with care - from IRQ context.
465 */
466bool disable_hardirq(unsigned int irq)
467{
468 if (!__disable_irq_nosync(irq))
469 return synchronize_hardirq(irq);
470
471 return false;
472}
473EXPORT_SYMBOL_GPL(disable_hardirq);
474
443void __enable_irq(struct irq_desc *desc, unsigned int irq) 475void __enable_irq(struct irq_desc *desc, unsigned int irq)
444{ 476{
445 switch (desc->depth) { 477 switch (desc->depth) {
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1766 1798
1767 return retval; 1799 return retval;
1768} 1800}
1801
1802/**
 1803 * irq_get_irqchip_state - returns the irqchip state of an interrupt.
1804 * @irq: Interrupt line that is forwarded to a VM
1805 * @which: One of IRQCHIP_STATE_* the caller wants to know about
 1806 * @state: a pointer to a boolean where the state is to be stored
1807 *
1808 * This call snapshots the internal irqchip state of an
1809 * interrupt, returning into @state the bit corresponding to
 1810 * state @which.
1811 *
1812 * This function should be called with preemption disabled if the
1813 * interrupt controller has per-cpu registers.
1814 */
1815int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1816 bool *state)
1817{
1818 struct irq_desc *desc;
1819 struct irq_data *data;
1820 struct irq_chip *chip;
1821 unsigned long flags;
1822 int err = -EINVAL;
1823
1824 desc = irq_get_desc_buslock(irq, &flags, 0);
1825 if (!desc)
1826 return err;
1827
1828 data = irq_desc_get_irq_data(desc);
1829
1830 do {
1831 chip = irq_data_get_irq_chip(data);
1832 if (chip->irq_get_irqchip_state)
1833 break;
1834#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
1835 data = data->parent_data;
1836#else
1837 data = NULL;
1838#endif
1839 } while (data);
1840
1841 if (data)
1842 err = chip->irq_get_irqchip_state(data, which, state);
1843
1844 irq_put_desc_busunlock(desc, flags);
1845 return err;
1846}
1847
1848/**
1849 * irq_set_irqchip_state - set the state of a forwarded interrupt.
1850 * @irq: Interrupt line that is forwarded to a VM
1851 * @which: State to be restored (one of IRQCHIP_STATE_*)
1852 * @val: Value corresponding to @which
1853 *
1854 * This call sets the internal irqchip state of an interrupt,
1855 * depending on the value of @which.
1856 *
1857 * This function should be called with preemption disabled if the
1858 * interrupt controller has per-cpu registers.
1859 */
1860int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1861 bool val)
1862{
1863 struct irq_desc *desc;
1864 struct irq_data *data;
1865 struct irq_chip *chip;
1866 unsigned long flags;
1867 int err = -EINVAL;
1868
1869 desc = irq_get_desc_buslock(irq, &flags, 0);
1870 if (!desc)
1871 return err;
1872
1873 data = irq_desc_get_irq_data(desc);
1874
1875 do {
1876 chip = irq_data_get_irq_chip(data);
1877 if (chip->irq_set_irqchip_state)
1878 break;
1879#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
1880 data = data->parent_data;
1881#else
1882 data = NULL;
1883#endif
1884 } while (data);
1885
1886 if (data)
1887 err = chip->irq_set_irqchip_state(data, which, val);
1888
1889 irq_put_desc_busunlock(desc, flags);
1890 return err;
1891}
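disable_hardirq() lets atomic-context callers optimistically disable a line while waiting only for the hard handler, never for a threaded handler, which is why its return value must be checked. A hedged usage sketch; the function and fallback policy are invented for illustration:

#include <linux/interrupt.h>

/*
 * Illustrative, called from atomic context: true means the line is quiesced
 * (hard handler finished, no threaded handler running).
 */
static bool example_try_quiesce_irq(unsigned int irq)
{
        if (disable_hardirq(irq))
                return true;

        /* A threaded handler is still active: back off and retry later. */
        enable_irq(irq);
        return false;
}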
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..474de5cb394d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
310 struct msi_desc *desc; 310 struct msi_desc *desc;
311 311
312 for_each_msi_entry(desc, dev) { 312 for_each_msi_entry(desc, dev) {
313 irq_domain_free_irqs(desc->irq, desc->nvec_used); 313 /*
314 desc->irq = 0; 314 * We might have failed to allocate an MSI early
315 * enough that there is no IRQ associated to this
316 * entry. If that's the case, don't do anything.
317 */
318 if (desc->irq) {
319 irq_domain_free_irqs(desc->irq, desc->nvec_used);
320 desc->irq = 0;
321 }
315 } 322 }
316} 323}
317 324
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6b4c2e..284e2691e380 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -335,32 +335,20 @@ unlock:
335 rcu_read_unlock(); 335 rcu_read_unlock();
336} 336}
337 337
338static int klp_disable_func(struct klp_func *func) 338static void klp_disable_func(struct klp_func *func)
339{ 339{
340 struct klp_ops *ops; 340 struct klp_ops *ops;
341 int ret;
342
343 if (WARN_ON(func->state != KLP_ENABLED))
344 return -EINVAL;
345 341
346 if (WARN_ON(!func->old_addr)) 342 WARN_ON(func->state != KLP_ENABLED);
347 return -EINVAL; 343 WARN_ON(!func->old_addr);
348 344
349 ops = klp_find_ops(func->old_addr); 345 ops = klp_find_ops(func->old_addr);
350 if (WARN_ON(!ops)) 346 if (WARN_ON(!ops))
351 return -EINVAL; 347 return;
352 348
353 if (list_is_singular(&ops->func_stack)) { 349 if (list_is_singular(&ops->func_stack)) {
354 ret = unregister_ftrace_function(&ops->fops); 350 WARN_ON(unregister_ftrace_function(&ops->fops));
355 if (ret) { 351 WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
356 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
357 func->old_name, ret);
358 return ret;
359 }
360
361 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
362 if (ret)
363 pr_warn("function unregister succeeded but failed to clear the filter\n");
364 352
365 list_del_rcu(&func->stack_node); 353 list_del_rcu(&func->stack_node);
366 list_del(&ops->node); 354 list_del(&ops->node);
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func)
370 } 358 }
371 359
372 func->state = KLP_DISABLED; 360 func->state = KLP_DISABLED;
373
374 return 0;
375} 361}
376 362
377static int klp_enable_func(struct klp_func *func) 363static int klp_enable_func(struct klp_func *func)
@@ -432,23 +418,15 @@ err:
432 return ret; 418 return ret;
433} 419}
434 420
435static int klp_disable_object(struct klp_object *obj) 421static void klp_disable_object(struct klp_object *obj)
436{ 422{
437 struct klp_func *func; 423 struct klp_func *func;
438 int ret;
439 424
440 for (func = obj->funcs; func->old_name; func++) { 425 for (func = obj->funcs; func->old_name; func++)
441 if (func->state != KLP_ENABLED) 426 if (func->state == KLP_ENABLED)
442 continue; 427 klp_disable_func(func);
443
444 ret = klp_disable_func(func);
445 if (ret)
446 return ret;
447 }
448 428
449 obj->state = KLP_DISABLED; 429 obj->state = KLP_DISABLED;
450
451 return 0;
452} 430}
453 431
454static int klp_enable_object(struct klp_object *obj) 432static int klp_enable_object(struct klp_object *obj)
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj)
464 442
465 for (func = obj->funcs; func->old_name; func++) { 443 for (func = obj->funcs; func->old_name; func++) {
466 ret = klp_enable_func(func); 444 ret = klp_enable_func(func);
467 if (ret) 445 if (ret) {
468 goto unregister; 446 klp_disable_object(obj);
447 return ret;
448 }
469 } 449 }
470 obj->state = KLP_ENABLED; 450 obj->state = KLP_ENABLED;
471 451
472 return 0; 452 return 0;
473
474unregister:
475 WARN_ON(klp_disable_object(obj));
476 return ret;
477} 453}
478 454
479static int __klp_disable_patch(struct klp_patch *patch) 455static int __klp_disable_patch(struct klp_patch *patch)
480{ 456{
481 struct klp_object *obj; 457 struct klp_object *obj;
482 int ret;
483 458
484 /* enforce stacking: only the last enabled patch can be disabled */ 459 /* enforce stacking: only the last enabled patch can be disabled */
485 if (!list_is_last(&patch->list, &klp_patches) && 460 if (!list_is_last(&patch->list, &klp_patches) &&
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch)
489 pr_notice("disabling patch '%s'\n", patch->mod->name); 464 pr_notice("disabling patch '%s'\n", patch->mod->name);
490 465
491 for (obj = patch->objs; obj->funcs; obj++) { 466 for (obj = patch->objs; obj->funcs; obj++) {
492 if (obj->state != KLP_ENABLED) 467 if (obj->state == KLP_ENABLED)
493 continue; 468 klp_disable_object(obj);
494
495 ret = klp_disable_object(obj);
496 if (ret)
497 return ret;
498 } 469 }
499 470
500 patch->state = KLP_DISABLED; 471 patch->state = KLP_DISABLED;
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
553 pr_notice("enabling patch '%s'\n", patch->mod->name); 524 pr_notice("enabling patch '%s'\n", patch->mod->name);
554 525
555 for (obj = patch->objs; obj->funcs; obj++) { 526 for (obj = patch->objs; obj->funcs; obj++) {
556 klp_find_object_module(obj);
557
558 if (!klp_is_object_loaded(obj)) 527 if (!klp_is_object_loaded(obj))
559 continue; 528 continue;
560 529
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
945{ 914{
946 struct module *pmod = patch->mod; 915 struct module *pmod = patch->mod;
947 struct module *mod = obj->mod; 916 struct module *mod = obj->mod;
948 int ret;
949 917
950 if (patch->state == KLP_DISABLED) 918 if (patch->state == KLP_DISABLED)
951 goto disabled; 919 goto disabled;
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
953 pr_notice("reverting patch '%s' on unloading module '%s'\n", 921 pr_notice("reverting patch '%s' on unloading module '%s'\n",
954 pmod->name, mod->name); 922 pmod->name, mod->name);
955 923
956 ret = klp_disable_object(obj); 924 klp_disable_object(obj);
957 if (ret)
958 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
959 pmod->name, mod->name, ret);
960 925
961disabled: 926disabled:
962 klp_free_object_loaded(obj); 927 klp_free_object_loaded(obj);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
633 if (!new_class->name) 633 if (!new_class->name)
634 return 0; 634 return 0;
635 635
636 list_for_each_entry(class, &all_lock_classes, lock_entry) { 636 list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
637 if (new_class->key - new_class->subclass == class->key) 637 if (new_class->key - new_class->subclass == class->key)
638 return class->name_version; 638 return class->name_version;
639 if (class->name && !strcmp(class->name, new_class->name)) 639 if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
700 hash_head = classhashentry(key); 700 hash_head = classhashentry(key);
701 701
702 /* 702 /*
703 * We can walk the hash lockfree, because the hash only 703 * We do an RCU walk of the hash, see lockdep_free_key_range().
704 * grows, and we are careful when adding entries to the end:
705 */ 704 */
706 list_for_each_entry(class, hash_head, hash_entry) { 705 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
706 return NULL;
707
708 list_for_each_entry_rcu(class, hash_head, hash_entry) {
707 if (class->key == key) { 709 if (class->key == key) {
708 /* 710 /*
709 * Huh! same key, different name? Did someone trample 711 * Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
728 struct lockdep_subclass_key *key; 730 struct lockdep_subclass_key *key;
729 struct list_head *hash_head; 731 struct list_head *hash_head;
730 struct lock_class *class; 732 struct lock_class *class;
731 unsigned long flags; 733
734 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
732 735
733 class = look_up_lock_class(lock, subclass); 736 class = look_up_lock_class(lock, subclass);
734 if (likely(class)) 737 if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
750 key = lock->key->subkeys + subclass; 753 key = lock->key->subkeys + subclass;
751 hash_head = classhashentry(key); 754 hash_head = classhashentry(key);
752 755
753 raw_local_irq_save(flags);
754 if (!graph_lock()) { 756 if (!graph_lock()) {
755 raw_local_irq_restore(flags);
756 return NULL; 757 return NULL;
757 } 758 }
758 /* 759 /*
759 * We have to do the hash-walk again, to avoid races 760 * We have to do the hash-walk again, to avoid races
760 * with another CPU: 761 * with another CPU:
761 */ 762 */
762 list_for_each_entry(class, hash_head, hash_entry) 763 list_for_each_entry_rcu(class, hash_head, hash_entry) {
763 if (class->key == key) 764 if (class->key == key)
764 goto out_unlock_set; 765 goto out_unlock_set;
766 }
767
765 /* 768 /*
766 * Allocate a new key from the static array, and add it to 769 * Allocate a new key from the static array, and add it to
767 * the hash: 770 * the hash:
768 */ 771 */
769 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 772 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
770 if (!debug_locks_off_graph_unlock()) { 773 if (!debug_locks_off_graph_unlock()) {
771 raw_local_irq_restore(flags);
772 return NULL; 774 return NULL;
773 } 775 }
774 raw_local_irq_restore(flags);
775 776
776 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); 777 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
777 dump_stack(); 778 dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
798 799
799 if (verbose(class)) { 800 if (verbose(class)) {
800 graph_unlock(); 801 graph_unlock();
801 raw_local_irq_restore(flags);
802 802
803 printk("\nnew class %p: %s", class->key, class->name); 803 printk("\nnew class %p: %s", class->key, class->name);
804 if (class->name_version > 1) 804 if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
806 printk("\n"); 806 printk("\n");
807 dump_stack(); 807 dump_stack();
808 808
809 raw_local_irq_save(flags);
810 if (!graph_lock()) { 809 if (!graph_lock()) {
811 raw_local_irq_restore(flags);
812 return NULL; 810 return NULL;
813 } 811 }
814 } 812 }
815out_unlock_set: 813out_unlock_set:
816 graph_unlock(); 814 graph_unlock();
817 raw_local_irq_restore(flags);
818 815
819out_set_class_cache: 816out_set_class_cache:
820 if (!subclass || force) 817 if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
870 entry->distance = distance; 867 entry->distance = distance;
871 entry->trace = *trace; 868 entry->trace = *trace;
872 /* 869 /*
873 * Since we never remove from the dependency list, the list can 870 * Both allocation and removal are done under the graph lock; but
874 * be walked lockless by other CPUs, it's only allocation 871 * iteration is under RCU-sched; see look_up_lock_class() and
875 * that must be protected by the spinlock. But this also means 872 * lockdep_free_key_range().
876 * we must make new entries visible only once writes to the
877 * entry become visible - hence the RCU op:
878 */ 873 */
879 list_add_tail_rcu(&entry->entry, head); 874 list_add_tail_rcu(&entry->entry, head);
880 875
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
1025 else 1020 else
1026 head = &lock->class->locks_before; 1021 head = &lock->class->locks_before;
1027 1022
1028 list_for_each_entry(entry, head, entry) { 1023 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1024
1025 list_for_each_entry_rcu(entry, head, entry) {
1029 if (!lock_accessed(entry)) { 1026 if (!lock_accessed(entry)) {
1030 unsigned int cq_depth; 1027 unsigned int cq_depth;
1031 mark_lock_accessed(entry, lock); 1028 mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2022 * We can walk it lock-free, because entries only get added 2019 * We can walk it lock-free, because entries only get added
2023 * to the hash: 2020 * to the hash:
2024 */ 2021 */
2025 list_for_each_entry(chain, hash_head, entry) { 2022 list_for_each_entry_rcu(chain, hash_head, entry) {
2026 if (chain->chain_key == chain_key) { 2023 if (chain->chain_key == chain_key) {
2027cache_hit: 2024cache_hit:
2028 debug_atomic_inc(chain_lookup_hits); 2025 debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2996 if (unlikely(!debug_locks)) 2993 if (unlikely(!debug_locks))
2997 return; 2994 return;
2998 2995
2999 if (subclass) 2996 if (subclass) {
2997 unsigned long flags;
2998
2999 if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
3000 return;
3001
3002 raw_local_irq_save(flags);
3003 current->lockdep_recursion = 1;
3000 register_lock_class(lock, subclass, 1); 3004 register_lock_class(lock, subclass, 1);
3005 current->lockdep_recursion = 0;
3006 raw_local_irq_restore(flags);
3007 }
3001} 3008}
3002EXPORT_SYMBOL_GPL(lockdep_init_map); 3009EXPORT_SYMBOL_GPL(lockdep_init_map);
3003 3010
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
3887 return addr >= start && addr < start + size; 3894 return addr >= start && addr < start + size;
3888} 3895}
3889 3896
3897/*
3898 * Used in module.c to remove lock classes from memory that is going to be
3899 * freed; and possibly re-used by other modules.
3900 *
3901 * We will have had one sync_sched() before getting here, so we're guaranteed
3902 * nobody will look up these exact classes -- they're properly dead but still
3903 * allocated.
3904 */
3890void lockdep_free_key_range(void *start, unsigned long size) 3905void lockdep_free_key_range(void *start, unsigned long size)
3891{ 3906{
3892 struct lock_class *class, *next; 3907 struct lock_class *class;
3893 struct list_head *head; 3908 struct list_head *head;
3894 unsigned long flags; 3909 unsigned long flags;
3895 int i; 3910 int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
3905 head = classhash_table + i; 3920 head = classhash_table + i;
3906 if (list_empty(head)) 3921 if (list_empty(head))
3907 continue; 3922 continue;
3908 list_for_each_entry_safe(class, next, head, hash_entry) { 3923 list_for_each_entry_rcu(class, head, hash_entry) {
3909 if (within(class->key, start, size)) 3924 if (within(class->key, start, size))
3910 zap_class(class); 3925 zap_class(class);
3911 else if (within(class->name, start, size)) 3926 else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
3916 if (locked) 3931 if (locked)
3917 graph_unlock(); 3932 graph_unlock();
3918 raw_local_irq_restore(flags); 3933 raw_local_irq_restore(flags);
3934
3935 /*
3936 * Wait for any possible iterators from look_up_lock_class() to pass
3937 * before continuing to free the memory they refer to.
3938 *
 3939 * sync_sched() is sufficient because the read-side is IRQ-disabled.
3940 */
3941 synchronize_sched();
3942
3943 /*
3944 * XXX at this point we could return the resources to the pool;
3945 * instead we leak them. We would need to change to bitmap allocators
3946 * instead of the linear allocators we have now.
3947 */
3919} 3948}
3920 3949
3921void lockdep_reset_lock(struct lockdep_map *lock) 3950void lockdep_reset_lock(struct lockdep_map *lock)
3922{ 3951{
3923 struct lock_class *class, *next; 3952 struct lock_class *class;
3924 struct list_head *head; 3953 struct list_head *head;
3925 unsigned long flags; 3954 unsigned long flags;
3926 int i, j; 3955 int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3948 head = classhash_table + i; 3977 head = classhash_table + i;
3949 if (list_empty(head)) 3978 if (list_empty(head))
3950 continue; 3979 continue;
3951 list_for_each_entry_safe(class, next, head, hash_entry) { 3980 list_for_each_entry_rcu(class, head, hash_entry) {
3952 int match = 0; 3981 int match = 0;
3953 3982
3954 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) 3983 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
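The lockdep conversion is an instance of a general RCU list pattern: writers add and remove entries with the _rcu list primitives under the graph lock, readers iterate with list_for_each_entry_rcu() inside an IRQ-disabled section (which is why synchronize_sched() is the matching grace period), and memory is reused only after that grace period. A generic, hedged sketch of the same pattern with invented types:

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_class {
        struct list_head        hash_entry;
        int                     key;
};

static LIST_HEAD(example_hash);
static DEFINE_SPINLOCK(example_lock);

/* Reader: an IRQ-disabled region acts as the RCU-sched read-side section. */
static struct example_class *example_lookup(int key)
{
        struct example_class *c;

        list_for_each_entry_rcu(c, &example_hash, hash_entry)
                if (c->key == key)
                        return c;
        return NULL;
}

/* Writer: removal under the lock, memory reuse only after a grace period. */
static void example_zap(struct example_class *c)
{
        spin_lock(&example_lock);
        list_del_rcu(&c->hash_entry);
        spin_unlock(&example_lock);

        synchronize_sched();    /* wait out any IRQ-disabled readers */
        kfree(c);
}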
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
78 */ 78 */
79 return; 79 return;
80 } 80 }
81 ACCESS_ONCE(prev->next) = node; 81 WRITE_ONCE(prev->next, node);
82 82
83 /* Wait until the lock holder passes the lock down. */ 83 /* Wait until the lock holder passes the lock down. */
84 arch_mcs_spin_lock_contended(&node->locked); 84 arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
91static inline 91static inline
92void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) 92void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
93{ 93{
94 struct mcs_spinlock *next = ACCESS_ONCE(node->next); 94 struct mcs_spinlock *next = READ_ONCE(node->next);
95 95
96 if (likely(!next)) { 96 if (likely(!next)) {
97 /* 97 /*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
100 if (likely(cmpxchg(lock, node, NULL) == node)) 100 if (likely(cmpxchg(lock, node, NULL) == node))
101 return; 101 return;
102 /* Wait until the next pointer is set */ 102 /* Wait until the next pointer is set */
103 while (!(next = ACCESS_ONCE(node->next))) 103 while (!(next = READ_ONCE(node->next)))
104 cpu_relax_lowlatency(); 104 cpu_relax_lowlatency();
105 } 105 }
106 106
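The mcs_spinlock.h hunks are part of the tree-wide move from ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE(), which make the direction of the access explicit and, unlike the plain volatile cast, also work on non-scalar types. A hedged before/after sketch with an invented node type:

#include <linux/compiler.h>
#include <asm/processor.h>      /* cpu_relax() */

struct example_node {
        struct example_node     *next;
        int                     locked;
};

static void example_publish_and_wait(struct example_node *prev,
                                     struct example_node *node)
{
        /*
         * Old style, one macro for both directions:
         *      ACCESS_ONCE(prev->next) = node;
         *      while (!ACCESS_ONCE(node->locked))
         *              cpu_relax();
         */

        /* New style: loads and stores are spelled out explicitly. */
        WRITE_ONCE(prev->next, node);

        while (!READ_ONCE(node->locked))
                cpu_relax();
}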
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/debug_locks.h> 27#include <linux/debug_locks.h>
28#include "mcs_spinlock.h" 28#include <linux/osq_lock.h>
29 29
30/* 30/*
31 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 31 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
217} 217}
218 218
219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
220static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
221{
222 if (lock->owner != owner)
223 return false;
224
225 /*
226 * Ensure we emit the owner->on_cpu, dereference _after_ checking
227 * lock->owner still matches owner, if that fails, owner might
228 * point to free()d memory, if it still matches, the rcu_read_lock()
229 * ensures the memory stays valid.
230 */
231 barrier();
232
233 return owner->on_cpu;
234}
235
236/* 220/*
237 * Look out! "owner" is an entirely speculative pointer 221 * Look out! "owner" is an entirely speculative pointer
238 * access and not reliable. 222 * access and not reliable.
239 */ 223 */
240static noinline 224static noinline
241int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 225bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
242{ 226{
227 bool ret = true;
228
243 rcu_read_lock(); 229 rcu_read_lock();
244 while (owner_running(lock, owner)) { 230 while (lock->owner == owner) {
245 if (need_resched()) 231 /*
232 * Ensure we emit the owner->on_cpu, dereference _after_
233 * checking lock->owner still matches owner. If that fails,
234 * owner might point to freed memory. If it still matches,
235 * the rcu_read_lock() ensures the memory stays valid.
236 */
237 barrier();
238
239 if (!owner->on_cpu || need_resched()) {
240 ret = false;
246 break; 241 break;
242 }
247 243
248 cpu_relax_lowlatency(); 244 cpu_relax_lowlatency();
249 } 245 }
250 rcu_read_unlock(); 246 rcu_read_unlock();
251 247
252 /* 248 return ret;
253 * We break out the loop above on need_resched() and when the
254 * owner changed, which is a sign for heavy contention. Return
255 * success only when lock->owner is NULL.
256 */
257 return lock->owner == NULL;
258} 249}
259 250
260/* 251/*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
269 return 0; 260 return 0;
270 261
271 rcu_read_lock(); 262 rcu_read_lock();
272 owner = ACCESS_ONCE(lock->owner); 263 owner = READ_ONCE(lock->owner);
273 if (owner) 264 if (owner)
274 retval = owner->on_cpu; 265 retval = owner->on_cpu;
275 rcu_read_unlock(); 266 rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
343 * As such, when deadlock detection needs to be 334 * As such, when deadlock detection needs to be
344 * performed the optimistic spinning cannot be done. 335 * performed the optimistic spinning cannot be done.
345 */ 336 */
346 if (ACCESS_ONCE(ww->ctx)) 337 if (READ_ONCE(ww->ctx))
347 break; 338 break;
348 } 339 }
349 340
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
351 * If there's an owner, wait for it to either 342 * If there's an owner, wait for it to either
352 * release the lock or go to sleep. 343 * release the lock or go to sleep.
353 */ 344 */
354 owner = ACCESS_ONCE(lock->owner); 345 owner = READ_ONCE(lock->owner);
355 if (owner && !mutex_spin_on_owner(lock, owner)) 346 if (owner && !mutex_spin_on_owner(lock, owner))
356 break; 347 break;
357 348
@@ -490,7 +481,7 @@ static inline int __sched
490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 481__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
491{ 482{
492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 483 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 484 struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
494 485
495 if (!hold_ctx) 486 if (!hold_ctx)
496 return 0; 487 return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
98 98
99 prev = decode_cpu(old); 99 prev = decode_cpu(old);
100 node->prev = prev; 100 node->prev = prev;
101 ACCESS_ONCE(prev->next) = node; 101 WRITE_ONCE(prev->next, node);
102 102
103 /* 103 /*
104 * Normally @prev is untouchable after the above store; because at that 104 * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
109 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
110 */ 110 */
111 111
112 while (!ACCESS_ONCE(node->locked)) { 112 while (!READ_ONCE(node->locked)) {
113 /* 113 /*
114 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
115 */ 115 */
@@ -148,7 +148,7 @@ unqueue:
148 * Or we race against a concurrent unqueue()'s step-B, in which 148 * Or we race against a concurrent unqueue()'s step-B, in which
149 * case its step-C will write us a new @node->prev pointer. 149 * case its step-C will write us a new @node->prev pointer.
150 */ 150 */
151 prev = ACCESS_ONCE(node->prev); 151 prev = READ_ONCE(node->prev);
152 } 152 }
153 153
154 /* 154 /*
@@ -170,8 +170,8 @@ unqueue:
170 * it will wait in Step-A. 170 * it will wait in Step-A.
171 */ 171 */
172 172
173 ACCESS_ONCE(next->prev) = prev; 173 WRITE_ONCE(next->prev, prev);
174 ACCESS_ONCE(prev->next) = next; 174 WRITE_ONCE(prev->next, next);
175 175
176 return false; 176 return false;
177} 177}
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
193 node = this_cpu_ptr(&osq_node); 193 node = this_cpu_ptr(&osq_node);
194 next = xchg(&node->next, NULL); 194 next = xchg(&node->next, NULL);
195 if (next) { 195 if (next) {
196 ACCESS_ONCE(next->locked) = 1; 196 WRITE_ONCE(next->locked, 1);
197 return; 197 return;
198 } 198 }
199 199
200 next = osq_wait_next(lock, node, NULL); 200 next = osq_wait_next(lock, node, NULL);
201 if (next) 201 if (next)
202 ACCESS_ONCE(next->locked) = 1; 202 WRITE_ONCE(next->locked, 1);
203} 203}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
349 * 349 *
350 * @task: the task owning the mutex (owner) for which a chain walk is 350 * @task: the task owning the mutex (owner) for which a chain walk is
351 * probably needed 351 * probably needed
352 * @deadlock_detect: do we have to carry out deadlock detection? 352 * @chwalk: do we have to carry out deadlock detection?
353 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 353 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
354 * things for a task that has just got its priority adjusted, and 354 * things for a task that has just got its priority adjusted, and
355 * is waiting on a mutex) 355 * is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
85 85
86 list_del(&waiter->list); 86 list_del(&waiter->list);
87 tsk = waiter->task; 87 tsk = waiter->task;
88 /*
89 * Make sure we do not wakeup the next reader before
90 * setting the nil condition to grant the next reader;
91 * otherwise we could miss the wakeup on the other
92 * side and end up sleeping again. See the pairing
93 * in rwsem_down_read_failed().
94 */
88 smp_mb(); 95 smp_mb();
89 waiter->task = NULL; 96 waiter->task = NULL;
90 wake_up_process(tsk); 97 wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/osq_lock.h>
17 18
18#include "mcs_spinlock.h" 19#include "rwsem.h"
19 20
20/* 21/*
21 * Guide to the rw_semaphore's count field for common values. 22 * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
186 waiter = list_entry(next, struct rwsem_waiter, list); 187 waiter = list_entry(next, struct rwsem_waiter, list);
187 next = waiter->list.next; 188 next = waiter->list.next;
188 tsk = waiter->task; 189 tsk = waiter->task;
190 /*
191 * Make sure we do not wakeup the next reader before
192 * setting the nil condition to grant the next reader;
193 * otherwise we could miss the wakeup on the other
194 * side and end up sleeping again. See the pairing
195 * in rwsem_down_read_failed().
196 */
189 smp_mb(); 197 smp_mb();
190 waiter->task = NULL; 198 waiter->task = NULL;
191 wake_up_process(tsk); 199 wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
258 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
259 if (!list_is_singular(&sem->wait_list)) 267 if (!list_is_singular(&sem->wait_list))
260 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
269 rwsem_set_owner(sem);
261 return true; 270 return true;
262 } 271 }
263 272
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
270 */ 279 */
271static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 280static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
272{ 281{
273 long old, count = ACCESS_ONCE(sem->count); 282 long old, count = READ_ONCE(sem->count);
274 283
275 while (true) { 284 while (true) {
276 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 285 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
277 return false; 286 return false;
278 287
279 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); 288 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
280 if (old == count) 289 if (old == count) {
290 rwsem_set_owner(sem);
281 return true; 291 return true;
292 }
282 293
283 count = old; 294 count = old;
284 } 295 }
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
287static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 298static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
288{ 299{
289 struct task_struct *owner; 300 struct task_struct *owner;
290 bool on_cpu = false; 301 bool ret = true;
291 302
292 if (need_resched()) 303 if (need_resched())
293 return false; 304 return false;
294 305
295 rcu_read_lock(); 306 rcu_read_lock();
296 owner = ACCESS_ONCE(sem->owner); 307 owner = READ_ONCE(sem->owner);
297 if (owner) 308 if (!owner) {
298 on_cpu = owner->on_cpu; 309 long count = READ_ONCE(sem->count);
299 rcu_read_unlock(); 310 /*
300 311 * If sem->owner is not set, yet we have just recently entered the
301 /* 312 * slowpath with the lock being active, then there is a possibility
302 * If sem->owner is not set, yet we have just recently entered the 313 * reader(s) may have the lock. To be safe, bail spinning in these
303 * slowpath, then there is a possibility reader(s) may have the lock. 314 * situations.
304 * To be safe, avoid spinning in these situations. 315 */
305 */ 316 if (count & RWSEM_ACTIVE_MASK)
306 return on_cpu; 317 ret = false;
307} 318 goto done;
308 319 }
309static inline bool owner_running(struct rw_semaphore *sem,
310 struct task_struct *owner)
311{
312 if (sem->owner != owner)
313 return false;
314
315 /*
316 * Ensure we emit the owner->on_cpu, dereference _after_ checking
317 * sem->owner still matches owner, if that fails, owner might
318 * point to free()d memory, if it still matches, the rcu_read_lock()
319 * ensures the memory stays valid.
320 */
321 barrier();
322 320
323 return owner->on_cpu; 321 ret = owner->on_cpu;
322done:
323 rcu_read_unlock();
324 return ret;
324} 325}
325 326
326static noinline 327static noinline
327bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) 328bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
328{ 329{
330 long count;
331
329 rcu_read_lock(); 332 rcu_read_lock();
330 while (owner_running(sem, owner)) { 333 while (sem->owner == owner) {
331 if (need_resched()) 334 /*
332 break; 335 * Ensure we emit the owner->on_cpu, dereference _after_
336 * checking sem->owner still matches owner, if that fails,
337 * owner might point to free()d memory, if it still matches,
338 * the rcu_read_lock() ensures the memory stays valid.
339 */
340 barrier();
341
342 /* abort spinning when need_resched or owner is not running */
343 if (!owner->on_cpu || need_resched()) {
344 rcu_read_unlock();
345 return false;
346 }
333 347
334 cpu_relax_lowlatency(); 348 cpu_relax_lowlatency();
335 } 349 }
336 rcu_read_unlock(); 350 rcu_read_unlock();
337 351
352 if (READ_ONCE(sem->owner))
353 return true; /* new owner, continue spinning */
354
338 /* 355 /*
339 * We break out the loop above on need_resched() or when the 356 * When the owner is not set, the lock could be free or
340 * owner changed, which is a sign for heavy contention. Return 357 * held by readers. Check the counter to verify the
341 * success only when sem->owner is NULL. 358 * state.
342 */ 359 */
343 return sem->owner == NULL; 360 count = READ_ONCE(sem->count);
361 return (count == 0 || count == RWSEM_WAITING_BIAS);
344} 362}
345 363
346static bool rwsem_optimistic_spin(struct rw_semaphore *sem) 364static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
358 goto done; 376 goto done;
359 377
360 while (true) { 378 while (true) {
361 owner = ACCESS_ONCE(sem->owner); 379 owner = READ_ONCE(sem->owner);
362 if (owner && !rwsem_spin_on_owner(sem, owner)) 380 if (owner && !rwsem_spin_on_owner(sem, owner))
363 break; 381 break;
364 382
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
432 450
433 /* we're now waiting on the lock, but no longer actively locking */ 451 /* we're now waiting on the lock, but no longer actively locking */
434 if (waiting) { 452 if (waiting) {
435 count = ACCESS_ONCE(sem->count); 453 count = READ_ONCE(sem->count);
436 454
437 /* 455 /*
438 * If there were already threads queued before us and there are 456 * If there were already threads queued before us and there are
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12
13#include <linux/atomic.h> 12#include <linux/atomic.h>
14 13
15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 14#include "rwsem.h"
16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{
18 sem->owner = current;
19}
20
21static inline void rwsem_clear_owner(struct rw_semaphore *sem)
22{
23 sem->owner = NULL;
24}
25
26#else
27static inline void rwsem_set_owner(struct rw_semaphore *sem)
28{
29}
30
31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
32{
33}
34#endif
35 15
36/* 16/*
37 * lock for reading 17 * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
1#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
2static inline void rwsem_set_owner(struct rw_semaphore *sem)
3{
4 sem->owner = current;
5}
6
7static inline void rwsem_clear_owner(struct rw_semaphore *sem)
8{
9 sem->owner = NULL;
10}
11
12#else
13static inline void rwsem_set_owner(struct rw_semaphore *sem)
14{
15}
16
17static inline void rwsem_clear_owner(struct rw_semaphore *sem)
18{
19}
20#endif
diff --git a/kernel/module.c b/kernel/module.c
index b3d634ed06c9..650b038ae520 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1865,7 +1865,7 @@ static void free_module(struct module *mod)
1865 kfree(mod->args); 1865 kfree(mod->args);
1866 percpu_modfree(mod); 1866 percpu_modfree(mod);
1867 1867
1868 /* Free lock-classes: */ 1868 /* Free lock-classes; relies on the preceding sync_rcu(). */
1869 lockdep_free_key_range(mod->module_core, mod->core_size); 1869 lockdep_free_key_range(mod->module_core, mod->core_size);
1870 1870
1871 /* Finally, free the core (containing the module structure) */ 1871 /* Finally, free the core (containing the module structure) */
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
2479 return 0; 2479 return 0;
2480} 2480}
2481 2481
2482#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
2483
2484static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
2485{
2486 do {
2487 unsigned long n = min(len, COPY_CHUNK_SIZE);
2488
2489 if (copy_from_user(dst, usrc, n) != 0)
2490 return -EFAULT;
2491 cond_resched();
2492 dst += n;
2493 usrc += n;
2494 len -= n;
2495 } while (len);
2496 return 0;
2497}
2498
2482/* Sets info->hdr and info->len. */ 2499/* Sets info->hdr and info->len. */
2483static int copy_module_from_user(const void __user *umod, unsigned long len, 2500static int copy_module_from_user(const void __user *umod, unsigned long len,
2484 struct load_info *info) 2501 struct load_info *info)
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2498 if (!info->hdr) 2515 if (!info->hdr)
2499 return -ENOMEM; 2516 return -ENOMEM;
2500 2517
2501 if (copy_from_user(info->hdr, umod, info->len) != 0) { 2518 if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
2502 vfree(info->hdr); 2519 vfree(info->hdr);
2503 return -EFAULT; 2520 return -EFAULT;
2504 } 2521 }
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2753 mod->trace_events = section_objs(info, "_ftrace_events", 2770 mod->trace_events = section_objs(info, "_ftrace_events",
2754 sizeof(*mod->trace_events), 2771 sizeof(*mod->trace_events),
2755 &mod->num_trace_events); 2772 &mod->num_trace_events);
2773 mod->trace_enums = section_objs(info, "_ftrace_enum_map",
2774 sizeof(*mod->trace_enums),
2775 &mod->num_trace_enums);
2756#endif 2776#endif
2757#ifdef CONFIG_TRACING 2777#ifdef CONFIG_TRACING
2758 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 2778 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3349 module_bug_cleanup(mod); 3369 module_bug_cleanup(mod);
3350 mutex_unlock(&module_mutex); 3370 mutex_unlock(&module_mutex);
3351 3371
3352 /* Free lock-classes: */
3353 lockdep_free_key_range(mod->module_core, mod->core_size);
3354
3355 /* we can't deallocate the module until we clear memory protection */ 3372 /* we can't deallocate the module until we clear memory protection */
3356 unset_module_init_ro_nx(mod); 3373 unset_module_init_ro_nx(mod);
3357 unset_module_core_ro_nx(mod); 3374 unset_module_core_ro_nx(mod);
@@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3375 synchronize_rcu(); 3392 synchronize_rcu();
3376 mutex_unlock(&module_mutex); 3393 mutex_unlock(&module_mutex);
3377 free_module: 3394 free_module:
3395 /* Free lock-classes; relies on the preceding sync_rcu() */
3396 lockdep_free_key_range(mod->module_core, mod->core_size);
3397
3378 module_deallocate(mod, info); 3398 module_deallocate(mod, info);
3379 free_copy: 3399 free_copy:
3380 free_copy(info); 3400 free_copy(info);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c24d5a23bf93..5235dd4e1e2f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
955 } 955 }
956} 956}
957 957
958static bool is_nosave_page(unsigned long pfn)
959{
960 struct nosave_region *region;
961
962 list_for_each_entry(region, &nosave_regions, list) {
963 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
964 pr_err("PM: %#010llx in e820 nosave region: "
965 "[mem %#010llx-%#010llx]\n",
966 (unsigned long long) pfn << PAGE_SHIFT,
967 (unsigned long long) region->start_pfn << PAGE_SHIFT,
968 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
969 - 1);
970 return true;
971 }
972 }
973
974 return false;
975}
976
977/** 958/**
978 * create_basic_memory_bitmaps - create bitmaps needed for marking page 959 * create_basic_memory_bitmaps - create bitmaps needed for marking page
979 * frames that should not be saved and free page frames. The pointers 960 * frames that should not be saved and free page frames. The pointers
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
2042 do { 2023 do {
2043 pfn = memory_bm_next_pfn(bm); 2024 pfn = memory_bm_next_pfn(bm);
2044 if (likely(pfn != BM_END_OF_MAP)) { 2025 if (likely(pfn != BM_END_OF_MAP)) {
2045 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) 2026 if (likely(pfn_valid(pfn)))
2046 swsusp_set_page_free(pfn_to_page(pfn)); 2027 swsusp_set_page_free(pfn_to_page(pfn));
2047 else 2028 else
2048 return -EFAULT; 2029 return -EFAULT;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..2f7937ee9e3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running;
306 */ 306 */
307int sysctl_sched_rt_runtime = 950000; 307int sysctl_sched_rt_runtime = 950000;
308 308
309/* cpus with isolated domains */
310cpumask_var_t cpu_isolated_map;
311
309/* 312/*
310 * this_rq_lock - lock this runqueue and disable interrupts. 313 * this_rq_lock - lock this runqueue and disable interrupts.
311 */ 314 */
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void)
690bool sched_can_stop_tick(void) 693bool sched_can_stop_tick(void)
691{ 694{
692 /* 695 /*
696 * FIFO realtime policy runs the highest priority task. Other runnable
697 * tasks are of a lower priority. The scheduler tick does nothing.
698 */
699 if (current->policy == SCHED_FIFO)
700 return true;
701
702 /*
703 * Round-robin realtime tasks time slice with other tasks at the same
704 * realtime priority. Is this task the only one at this priority?
705 */
706 if (current->policy == SCHED_RR) {
707 struct sched_rt_entity *rt_se = &current->rt;
708
709 return rt_se->run_list.prev == rt_se->run_list.next;
710 }
711
712 /*
693 * More than one running task need preemption. 713 * More than one running task need preemption.
694 * nr_running update is assumed to be visible 714 * nr_running update is assumed to be visible
695 * after IPI is sent from wakers. 715 * after IPI is sent from wakers.
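
The two new checks above let the tick stop for a lone SCHED_FIFO task, and for a SCHED_RR task only when it is the sole entry on its priority list: in a circular doubly linked list, an entry whose prev and next point at the same node has no sibling. A minimal standalone C sketch of that list test (illustrative only; a hypothetical node type stands in for the kernel's struct list_head):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical circular doubly linked list node, mirroring list_head. */
struct node {
        struct node *prev, *next;
};

/* True when the entry's prev and next both point at the same node (the head). */
static bool only_entry_on_list(const struct node *entry)
{
        return entry->prev == entry->next;
}

int main(void)
{
        struct node head = { &head, &head }, a, b;

        /* enqueue a */
        a.prev = &head; a.next = &head; head.next = &a; head.prev = &a;
        printf("a alone: %d\n", only_entry_on_list(&a));       /* 1 */

        /* enqueue b after a */
        b.prev = &a; b.next = &head; a.next = &b; head.prev = &b;
        printf("a alone: %d\n", only_entry_on_list(&a));       /* 0 */
        return 0;
}
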
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
996 rq_clock_skip_update(rq, true); 1016 rq_clock_skip_update(rq, true);
997} 1017}
998 1018
1019static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
1020
1021void register_task_migration_notifier(struct notifier_block *n)
1022{
1023 atomic_notifier_chain_register(&task_migration_notifier, n);
1024}
1025
999#ifdef CONFIG_SMP 1026#ifdef CONFIG_SMP
1000void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1027void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1001{ 1028{
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1026 trace_sched_migrate_task(p, new_cpu); 1053 trace_sched_migrate_task(p, new_cpu);
1027 1054
1028 if (task_cpu(p) != new_cpu) { 1055 if (task_cpu(p) != new_cpu) {
1056 struct task_migration_notifier tmn;
1057
1029 if (p->sched_class->migrate_task_rq) 1058 if (p->sched_class->migrate_task_rq)
1030 p->sched_class->migrate_task_rq(p, new_cpu); 1059 p->sched_class->migrate_task_rq(p, new_cpu);
1031 p->se.nr_migrations++; 1060 p->se.nr_migrations++;
1032 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1061 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1062
1063 tmn.task = p;
1064 tmn.from_cpu = task_cpu(p);
1065 tmn.to_cpu = new_cpu;
1066
1067 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1033 } 1068 }
1034 1069
1035 __set_task_cpu(p, new_cpu); 1070 __set_task_cpu(p, new_cpu);
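
set_task_cpu() now publishes every migration on an atomic notifier chain, passing a struct task_migration_notifier with the task and the two CPU numbers. A hedged kernel-side sketch of a subscriber follows; it assumes the structure and register_task_migration_notifier() are declared in a header visible to the module (linux/sched.h in this series), and since the patch adds no unregister helper, a real module built this way could not unload cleanly.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/sched.h>

/* Hypothetical subscriber: log each cross-CPU move. */
static int demo_migration_cb(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        struct task_migration_notifier *tmn = data;

        pr_info("task %d moved cpu%d -> cpu%d\n",
                task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
        return NOTIFY_OK;
}

static struct notifier_block demo_migration_nb = {
        .notifier_call = demo_migration_cb,
};

static int __init demo_init(void)
{
        register_task_migration_notifier(&demo_migration_nb);
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");
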
@@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3034 } else { 3069 } else {
3035 if (dl_prio(oldprio)) 3070 if (dl_prio(oldprio))
3036 p->dl.dl_boosted = 0; 3071 p->dl.dl_boosted = 0;
3072 if (rt_prio(oldprio))
3073 p->rt.timeout = 0;
3037 p->sched_class = &fair_sched_class; 3074 p->sched_class = &fair_sched_class;
3038 } 3075 }
3039 3076
@@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
5318static int sched_cpu_inactive(struct notifier_block *nfb, 5355static int sched_cpu_inactive(struct notifier_block *nfb,
5319 unsigned long action, void *hcpu) 5356 unsigned long action, void *hcpu)
5320{ 5357{
5321 unsigned long flags;
5322 long cpu = (long)hcpu;
5323 struct dl_bw *dl_b;
5324
5325 switch (action & ~CPU_TASKS_FROZEN) { 5358 switch (action & ~CPU_TASKS_FROZEN) {
5326 case CPU_DOWN_PREPARE: 5359 case CPU_DOWN_PREPARE:
5327 set_cpu_active(cpu, false); 5360 set_cpu_active((long)hcpu, false);
5328
5329 /* explicitly allow suspend */
5330 if (!(action & CPU_TASKS_FROZEN)) {
5331 bool overflow;
5332 int cpus;
5333
5334 rcu_read_lock_sched();
5335 dl_b = dl_bw_of(cpu);
5336
5337 raw_spin_lock_irqsave(&dl_b->lock, flags);
5338 cpus = dl_bw_cpus(cpu);
5339 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5340 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5341
5342 rcu_read_unlock_sched();
5343
5344 if (overflow)
5345 return notifier_from_errno(-EBUSY);
5346 }
5347 return NOTIFY_OK; 5361 return NOTIFY_OK;
5362 default:
5363 return NOTIFY_DONE;
5348 } 5364 }
5349
5350 return NOTIFY_DONE;
5351} 5365}
5352 5366
5353static int __init migration_init(void) 5367static int __init migration_init(void)
@@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5428 break; 5442 break;
5429 } 5443 }
5430 5444
5431 /*
5432 * Even though we initialize ->capacity to something semi-sane,
5433 * we leave capacity_orig unset. This allows us to detect if
5434 * domain iteration is still funny without causing /0 traps.
5435 */
5436 if (!group->sgc->capacity_orig) {
5437 printk(KERN_CONT "\n");
5438 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
5439 break;
5440 }
5441
5442 if (!cpumask_weight(sched_group_cpus(group))) { 5445 if (!cpumask_weight(sched_group_cpus(group))) {
5443 printk(KERN_CONT "\n"); 5446 printk(KERN_CONT "\n");
5444 printk(KERN_ERR "ERROR: empty group\n"); 5447 printk(KERN_ERR "ERROR: empty group\n");
@@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5811 update_top_cache_domain(cpu); 5814 update_top_cache_domain(cpu);
5812} 5815}
5813 5816
5814/* cpus with isolated domains */
5815static cpumask_var_t cpu_isolated_map;
5816
5817/* Setup the mask of cpus configured for isolated domains */ 5817/* Setup the mask of cpus configured for isolated domains */
5818static int __init isolated_cpu_setup(char *str) 5818static int __init isolated_cpu_setup(char *str)
5819{ 5819{
@@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5922 * die on a /0 trap. 5922 * die on a /0 trap.
5923 */ 5923 */
5924 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 5924 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5925 sg->sgc->capacity_orig = sg->sgc->capacity;
5926 5925
5927 /* 5926 /*
5928 * Make sure the first group of this domain contains the 5927 * Make sure the first group of this domain contains the
@@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6233 */ 6232 */
6234 6233
6235 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6234 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6235 sd->flags |= SD_PREFER_SIBLING;
6236 sd->imbalance_pct = 110; 6236 sd->imbalance_pct = 110;
6237 sd->smt_gain = 1178; /* ~15% */ 6237 sd->smt_gain = 1178; /* ~15% */
6238 6238
@@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6998 */ 6998 */
6999 6999
7000 case CPU_ONLINE: 7000 case CPU_ONLINE:
7001 case CPU_DOWN_FAILED:
7002 cpuset_update_active_cpus(true); 7001 cpuset_update_active_cpus(true);
7003 break; 7002 break;
7004 default: 7003 default:
@@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7010static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7009static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7011 void *hcpu) 7010 void *hcpu)
7012{ 7011{
7013 switch (action) { 7012 unsigned long flags;
7013 long cpu = (long)hcpu;
7014 struct dl_bw *dl_b;
7015
7016 switch (action & ~CPU_TASKS_FROZEN) {
7014 case CPU_DOWN_PREPARE: 7017 case CPU_DOWN_PREPARE:
7018 /* explicitly allow suspend */
7019 if (!(action & CPU_TASKS_FROZEN)) {
7020 bool overflow;
7021 int cpus;
7022
7023 rcu_read_lock_sched();
7024 dl_b = dl_bw_of(cpu);
7025
7026 raw_spin_lock_irqsave(&dl_b->lock, flags);
7027 cpus = dl_bw_cpus(cpu);
7028 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7029 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7030
7031 rcu_read_unlock_sched();
7032
7033 if (overflow)
7034 return notifier_from_errno(-EBUSY);
7035 }
7015 cpuset_update_active_cpus(false); 7036 cpuset_update_active_cpus(false);
7016 break; 7037 break;
7017 case CPU_DOWN_PREPARE_FROZEN: 7038 case CPU_DOWN_PREPARE_FROZEN:
@@ -7156,8 +7177,8 @@ void __init sched_init(void)
7156 rq->calc_load_active = 0; 7177 rq->calc_load_active = 0;
7157 rq->calc_load_update = jiffies + LOAD_FREQ; 7178 rq->calc_load_update = jiffies + LOAD_FREQ;
7158 init_cfs_rq(&rq->cfs); 7179 init_cfs_rq(&rq->cfs);
7159 init_rt_rq(&rq->rt, rq); 7180 init_rt_rq(&rq->rt);
7160 init_dl_rq(&rq->dl, rq); 7181 init_dl_rq(&rq->dl);
7161#ifdef CONFIG_FAIR_GROUP_SCHED 7182#ifdef CONFIG_FAIR_GROUP_SCHED
7162 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7183 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7163 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7184 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7197,7 +7218,7 @@ void __init sched_init(void)
7197#ifdef CONFIG_SMP 7218#ifdef CONFIG_SMP
7198 rq->sd = NULL; 7219 rq->sd = NULL;
7199 rq->rd = NULL; 7220 rq->rd = NULL;
7200 rq->cpu_capacity = SCHED_CAPACITY_SCALE; 7221 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
7201 rq->post_schedule = 0; 7222 rq->post_schedule = 0;
7202 rq->active_balance = 0; 7223 rq->active_balance = 0;
7203 rq->next_balance = jiffies; 7224 rq->next_balance = jiffies;
@@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void)
7796} 7817}
7797#endif /* CONFIG_RT_GROUP_SCHED */ 7818#endif /* CONFIG_RT_GROUP_SCHED */
7798 7819
7799static int sched_dl_global_constraints(void) 7820static int sched_dl_global_validate(void)
7800{ 7821{
7801 u64 runtime = global_rt_runtime(); 7822 u64 runtime = global_rt_runtime();
7802 u64 period = global_rt_period(); 7823 u64 period = global_rt_period();
@@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
7897 if (ret) 7918 if (ret)
7898 goto undo; 7919 goto undo;
7899 7920
7900 ret = sched_rt_global_constraints(); 7921 ret = sched_dl_global_validate();
7901 if (ret) 7922 if (ret)
7902 goto undo; 7923 goto undo;
7903 7924
7904 ret = sched_dl_global_constraints(); 7925 ret = sched_rt_global_constraints();
7905 if (ret) 7926 if (ret)
7906 goto undo; 7927 goto undo;
7907 7928
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
69 dl_b->total_bw = 0; 69 dl_b->total_bw = 0;
70} 70}
71 71
72void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) 72void init_dl_rq(struct dl_rq *dl_rq)
73{ 73{
74 dl_rq->rb_root = RB_ROOT; 74 dl_rq->rb_root = RB_ROOT;
75 75
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
218 rq->post_schedule = has_pushable_dl_tasks(rq); 218 rq->post_schedule = has_pushable_dl_tasks(rq);
219} 219}
220 220
221static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
222
223static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
224{
225 struct rq *later_rq = NULL;
226 bool fallback = false;
227
228 later_rq = find_lock_later_rq(p, rq);
229
230 if (!later_rq) {
231 int cpu;
232
233 /*
234 * If we cannot preempt any rq, fall back to pick any
235 * online cpu.
236 */
237 fallback = true;
238 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
239 if (cpu >= nr_cpu_ids) {
240 /*
241 * Fail to find any suitable cpu.
242 * The task will never come back!
243 */
244 BUG_ON(dl_bandwidth_enabled());
245
246 /*
247 * If admission control is disabled we
248 * try a little harder to let the task
249 * run.
250 */
251 cpu = cpumask_any(cpu_active_mask);
252 }
253 later_rq = cpu_rq(cpu);
254 double_lock_balance(rq, later_rq);
255 }
256
257 deactivate_task(rq, p, 0);
258 set_task_cpu(p, later_rq->cpu);
259 activate_task(later_rq, p, ENQUEUE_REPLENISH);
260
261 if (!fallback)
262 resched_curr(later_rq);
263
264 double_unlock_balance(rq, later_rq);
265}
266
221#else 267#else
222 268
223static inline 269static inline
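
dl_task_offline_migration() above picks a destination for a deadline task whose runqueue went offline: preferably a later_rq, otherwise any CPU in the intersection of the active mask and the task's affinity, and only as a last resort (admission control disabled) any active CPU. A standalone sketch of that fallback selection, with plain bitmasks standing in for cpumasks:

#include <stdio.h>

#define NR_CPUS 8
#define NO_CPU  NR_CPUS

/* Return the lowest set bit < NR_CPUS, or NO_CPU if none. */
static int first_cpu(unsigned int mask)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (mask & (1u << cpu))
                        return cpu;
        return NO_CPU;
}

/* Mirror of the fallback choice: active & allowed first, then any active. */
static int pick_fallback_cpu(unsigned int active, unsigned int allowed)
{
        int cpu = first_cpu(active & allowed);

        if (cpu == NO_CPU)
                cpu = first_cpu(active);
        return cpu;
}

int main(void)
{
        /* CPUs 2,3 active; task allowed on 0,1 only -> fall back to CPU 2. */
        printf("%d\n", pick_fallback_cpu(0x0c, 0x03));
        /* CPUs 2,3 active; task allowed on 3 -> CPU 3. */
        printf("%d\n", pick_fallback_cpu(0x0c, 0x08));
        return 0;
}
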
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
514 unsigned long flags; 560 unsigned long flags;
515 struct rq *rq; 561 struct rq *rq;
516 562
517 rq = task_rq_lock(current, &flags); 563 rq = task_rq_lock(p, &flags);
518 564
519 /* 565 /*
520 * We need to take care of several possible races here: 566 * We need to take care of several possible races here:
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
536 sched_clock_tick(); 582 sched_clock_tick();
537 update_rq_clock(rq); 583 update_rq_clock(rq);
538 584
585#ifdef CONFIG_SMP
586 /*
587 * If we find that the rq the task was on is no longer
588 * available, we need to select a new rq.
589 */
590 if (unlikely(!rq->online)) {
591 dl_task_offline_migration(rq, p);
592 goto unlock;
593 }
594#endif
595
539 /* 596 /*
540 * If the throttle happened during sched-out; like: 597 * If the throttle happened during sched-out; like:
541 * 598 *
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
569 push_dl_task(rq); 626 push_dl_task(rq);
570#endif 627#endif
571unlock: 628unlock:
572 task_rq_unlock(rq, current, &flags); 629 task_rq_unlock(rq, p, &flags);
573 630
574 return HRTIMER_NORESTART; 631 return HRTIMER_NORESTART;
575} 632}
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq)
914 } 971 }
915 update_rq_clock(rq); 972 update_rq_clock(rq);
916 update_curr_dl(rq); 973 update_curr_dl(rq);
974 /*
975 * Tell update_rq_clock() that we've just updated,
976 * so we don't do microscopic update in schedule()
977 * and double the fastpath cost.
978 */
979 rq_clock_skip_update(rq, true);
917} 980}
918 981
919#ifdef CONFIG_SMP 982#ifdef CONFIG_SMP
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1659{ 1722{
1660 int check_resched = 1; 1723 int check_resched = 1;
1661 1724
1662 /*
1663 * If p is throttled, don't consider the possibility
1664 * of preempting rq->curr, the check will be done right
1665 * after its runtime will get replenished.
1666 */
1667 if (unlikely(p->dl.dl_throttled))
1668 return;
1669
1670 if (task_on_rq_queued(p) && rq->curr != p) { 1725 if (task_on_rq_queued(p) && rq->curr != p) {
1671#ifdef CONFIG_SMP 1726#ifdef CONFIG_SMP
1672 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && 1727 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
71 if (!se) { 71 if (!se) {
72 struct sched_avg *avg = &cpu_rq(cpu)->avg; 72 struct sched_avg *avg = &cpu_rq(cpu)->avg;
73 P(avg->runnable_avg_sum); 73 P(avg->runnable_avg_sum);
74 P(avg->runnable_avg_period); 74 P(avg->avg_period);
75 return; 75 return;
76 } 76 }
77 77
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
94 P(se->load.weight); 94 P(se->load.weight);
95#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
96 P(se->avg.runnable_avg_sum); 96 P(se->avg.runnable_avg_sum);
97 P(se->avg.runnable_avg_period); 97 P(se->avg.running_avg_sum);
98 P(se->avg.avg_period);
98 P(se->avg.load_avg_contrib); 99 P(se->avg.load_avg_contrib);
100 P(se->avg.utilization_avg_contrib);
99 P(se->avg.decay_count); 101 P(se->avg.decay_count);
100#endif 102#endif
101#undef PN 103#undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
214 cfs_rq->runnable_load_avg); 216 cfs_rq->runnable_load_avg);
215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", 217 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
216 cfs_rq->blocked_load_avg); 218 cfs_rq->blocked_load_avg);
219 SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
220 cfs_rq->utilization_load_avg);
217#ifdef CONFIG_FAIR_GROUP_SCHED 221#ifdef CONFIG_FAIR_GROUP_SCHED
218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", 222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
219 cfs_rq->tg_load_contrib); 223 cfs_rq->tg_load_contrib);
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
636 P(se.load.weight); 640 P(se.load.weight);
637#ifdef CONFIG_SMP 641#ifdef CONFIG_SMP
638 P(se.avg.runnable_avg_sum); 642 P(se.avg.runnable_avg_sum);
639 P(se.avg.runnable_avg_period); 643 P(se.avg.running_avg_sum);
644 P(se.avg.avg_period);
640 P(se.avg.load_avg_contrib); 645 P(se.avg.load_avg_contrib);
646 P(se.avg.utilization_avg_contrib);
641 P(se.avg.decay_count); 647 P(se.avg.decay_count);
642#endif 648#endif
643 P(policy); 649 P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcfe32088b37..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
670static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
671 671
672static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
673static inline void __update_task_entity_utilization(struct sched_entity *se);
673 674
674/* Give new task start runnable values to heavy its load in infant time */ 675/* Give new task start runnable values to heavy its load in infant time */
675void init_task_runnable_average(struct task_struct *p) 676void init_task_runnable_average(struct task_struct *p)
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
677 u32 slice; 678 u32 slice;
678 679
679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
680 p->se.avg.runnable_avg_sum = slice; 681 p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
681 p->se.avg.runnable_avg_period = slice; 682 p->se.avg.avg_period = slice;
682 __update_task_entity_contrib(&p->se); 683 __update_task_entity_contrib(&p->se);
684 __update_task_entity_utilization(&p->se);
683} 685}
684#else 686#else
685void init_task_runnable_average(struct task_struct *p) 687void init_task_runnable_average(struct task_struct *p)
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
1196static bool load_too_imbalanced(long src_load, long dst_load, 1198static bool load_too_imbalanced(long src_load, long dst_load,
1197 struct task_numa_env *env) 1199 struct task_numa_env *env)
1198{ 1200{
1199 long imb, old_imb;
1200 long orig_src_load, orig_dst_load;
1201 long src_capacity, dst_capacity; 1201 long src_capacity, dst_capacity;
1202 long orig_src_load;
1203 long load_a, load_b;
1204 long moved_load;
1205 long imb;
1202 1206
1203 /* 1207 /*
1204 * The load is corrected for the CPU capacity available on each node. 1208 * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1211 dst_capacity = env->dst_stats.compute_capacity; 1215 dst_capacity = env->dst_stats.compute_capacity;
1212 1216
1213 /* We care about the slope of the imbalance, not the direction. */ 1217 /* We care about the slope of the imbalance, not the direction. */
1214 if (dst_load < src_load) 1218 load_a = dst_load;
1215 swap(dst_load, src_load); 1219 load_b = src_load;
1220 if (load_a < load_b)
1221 swap(load_a, load_b);
1216 1222
1217 /* Is the difference below the threshold? */ 1223 /* Is the difference below the threshold? */
1218 imb = dst_load * src_capacity * 100 - 1224 imb = load_a * src_capacity * 100 -
1219 src_load * dst_capacity * env->imbalance_pct; 1225 load_b * dst_capacity * env->imbalance_pct;
1220 if (imb <= 0) 1226 if (imb <= 0)
1221 return false; 1227 return false;
1222 1228
1223 /* 1229 /*
1224 * The imbalance is above the allowed threshold. 1230 * The imbalance is above the allowed threshold.
1225 * Compare it with the old imbalance. 1231 * Allow a move that brings us closer to a balanced situation,
1232 * without moving things past the point of balance.
1226 */ 1233 */
1227 orig_src_load = env->src_stats.load; 1234 orig_src_load = env->src_stats.load;
1228 orig_dst_load = env->dst_stats.load;
1229 1235
1230 if (orig_dst_load < orig_src_load) 1236 /*
1231 swap(orig_dst_load, orig_src_load); 1237 * In a task swap, there will be one load moving from src to dst,
1232 1238 * and another moving back. This is the net sum of both moves.
1233 old_imb = orig_dst_load * src_capacity * 100 - 1239 * A simple task move will always have a positive value.
1234 orig_src_load * dst_capacity * env->imbalance_pct; 1240 * Allow the move if it brings the system closer to a balanced
1241 * situation, without crossing over the balance point.
1242 */
1243 moved_load = orig_src_load - src_load;
1235 1244
1236 /* Would this change make things worse? */ 1245 if (moved_load > 0)
1237 return (imb > old_imb); 1246 /* Moving src -> dst. Did we overshoot balance? */
1247 return src_load * dst_capacity < dst_load * src_capacity;
1248 else
1249 /* Moving dst -> src. Did we overshoot balance? */
1250 return dst_load * src_capacity < src_load * dst_capacity;
1238} 1251}
1239 1252
1240/* 1253/*
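
The rewritten load_too_imbalanced() first checks whether the capacity-weighted gap between the projected loads exceeds imbalance_pct, then uses the sign of the load already taken from the source (orig_src_load - src_load) to decide which direction must not cross the balance point. A standalone sketch of that decision, with loads and capacities as plain longs and imbalance_pct = 125 used purely as an example:

#include <stdbool.h>
#include <stdio.h>

/* Simplified mirror of the new decision; pct plays sd->imbalance_pct. */
static bool too_imbalanced(long src_load, long dst_load, long orig_src_load,
                           long src_cap, long dst_cap, long pct)
{
        long hi = dst_load > src_load ? dst_load : src_load;
        long lo = dst_load > src_load ? src_load : dst_load;
        long moved = orig_src_load - src_load;

        /* Is the capacity-weighted difference below the threshold? */
        if (hi * src_cap * 100 - lo * dst_cap * pct <= 0)
                return false;

        if (moved > 0)  /* moving src -> dst: did we overshoot balance? */
                return src_load * dst_cap < dst_load * src_cap;
        else            /* moving dst -> src: did we overshoot balance? */
                return dst_load * src_cap < src_load * dst_cap;
}

int main(void)
{
        /* Equal capacities: taking the source from 700 down to 300 while the
           destination ends at 600 crosses the balance point, so reject. */
        printf("%d\n", too_imbalanced(300, 600, 700, 1024, 1024, 125));
        return 0;
}
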
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1675 *period = now - p->last_task_numa_placement; 1688 *period = now - p->last_task_numa_placement;
1676 } else { 1689 } else {
1677 delta = p->se.avg.runnable_avg_sum; 1690 delta = p->se.avg.runnable_avg_sum;
1678 *period = p->se.avg.runnable_avg_period; 1691 *period = p->se.avg.avg_period;
1679 } 1692 }
1680 1693
1681 p->last_sum_exec_runtime = runtime; 1694 p->last_sum_exec_runtime = runtime;
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
1765 } 1778 }
1766 } 1779 }
1767 /* Next round, evaluate the nodes within max_group. */ 1780 /* Next round, evaluate the nodes within max_group. */
1781 if (!max_faults)
1782 break;
1768 nodes = max_group; 1783 nodes = max_group;
1769 } 1784 }
1770 return nid; 1785 return nid;
@@ -2165,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
2165 vma = mm->mmap; 2180 vma = mm->mmap;
2166 } 2181 }
2167 for (; vma; vma = vma->vm_next) { 2182 for (; vma; vma = vma->vm_next) {
2168 if (!vma_migratable(vma) || !vma_policy_mof(vma)) 2183 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2184 is_vm_hugetlb_page(vma)) {
2169 continue; 2185 continue;
2186 }
2170 2187
2171 /* 2188 /*
2172 * Shared library pages mapped by multiple processes are not 2189 * Shared library pages mapped by multiple processes are not
@@ -2501,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
2501 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) 2518 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2502 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2519 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2503 */ 2520 */
2504static __always_inline int __update_entity_runnable_avg(u64 now, 2521static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2505 struct sched_avg *sa, 2522 struct sched_avg *sa,
2506 int runnable) 2523 int runnable,
2524 int running)
2507{ 2525{
2508 u64 delta, periods; 2526 u64 delta, periods;
2509 u32 runnable_contrib; 2527 u32 runnable_contrib;
2510 int delta_w, decayed = 0; 2528 int delta_w, decayed = 0;
2529 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
2511 2530
2512 delta = now - sa->last_runnable_update; 2531 delta = now - sa->last_runnable_update;
2513 /* 2532 /*
@@ -2529,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2529 sa->last_runnable_update = now; 2548 sa->last_runnable_update = now;
2530 2549
2531 /* delta_w is the amount already accumulated against our next period */ 2550 /* delta_w is the amount already accumulated against our next period */
2532 delta_w = sa->runnable_avg_period % 1024; 2551 delta_w = sa->avg_period % 1024;
2533 if (delta + delta_w >= 1024) { 2552 if (delta + delta_w >= 1024) {
2534 /* period roll-over */ 2553 /* period roll-over */
2535 decayed = 1; 2554 decayed = 1;
@@ -2542,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2542 delta_w = 1024 - delta_w; 2561 delta_w = 1024 - delta_w;
2543 if (runnable) 2562 if (runnable)
2544 sa->runnable_avg_sum += delta_w; 2563 sa->runnable_avg_sum += delta_w;
2545 sa->runnable_avg_period += delta_w; 2564 if (running)
2565 sa->running_avg_sum += delta_w * scale_freq
2566 >> SCHED_CAPACITY_SHIFT;
2567 sa->avg_period += delta_w;
2546 2568
2547 delta -= delta_w; 2569 delta -= delta_w;
2548 2570
@@ -2552,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
2552 2574
2553 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, 2575 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2554 periods + 1); 2576 periods + 1);
2555 sa->runnable_avg_period = decay_load(sa->runnable_avg_period, 2577 sa->running_avg_sum = decay_load(sa->running_avg_sum,
2578 periods + 1);
2579 sa->avg_period = decay_load(sa->avg_period,
2556 periods + 1); 2580 periods + 1);
2557 2581
2558 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2582 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2559 runnable_contrib = __compute_runnable_contrib(periods); 2583 runnable_contrib = __compute_runnable_contrib(periods);
2560 if (runnable) 2584 if (runnable)
2561 sa->runnable_avg_sum += runnable_contrib; 2585 sa->runnable_avg_sum += runnable_contrib;
2562 sa->runnable_avg_period += runnable_contrib; 2586 if (running)
2587 sa->running_avg_sum += runnable_contrib * scale_freq
2588 >> SCHED_CAPACITY_SHIFT;
2589 sa->avg_period += runnable_contrib;
2563 } 2590 }
2564 2591
2565 /* Remainder of delta accrued against u_0` */ 2592 /* Remainder of delta accrued against u_0` */
2566 if (runnable) 2593 if (runnable)
2567 sa->runnable_avg_sum += delta; 2594 sa->runnable_avg_sum += delta;
2568 sa->runnable_avg_period += delta; 2595 if (running)
2596 sa->running_avg_sum += delta * scale_freq
2597 >> SCHED_CAPACITY_SHIFT;
2598 sa->avg_period += delta;
2569 2599
2570 return decayed; 2600 return decayed;
2571} 2601}
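
The accounting above now also maintains running_avg_sum, and each contribution to it is scaled by the current frequency capacity (contrib * scale_freq >> SCHED_CAPACITY_SHIFT), so time run at a lower frequency counts for proportionally less. A standalone sketch of just that scaling step (the geometric decay is left out):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

/*
 * Frequency-scaled contribution of delta_w microseconds of running time:
 * at half the maximum frequency (scale_freq = 512) only half of the
 * wall-clock running time is credited to running_avg_sum.
 */
static unsigned long scaled_running_contrib(unsigned long delta_w,
                                            unsigned long scale_freq)
{
        return (delta_w * scale_freq) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
        printf("%lu\n", scaled_running_contrib(1024, SCHED_CAPACITY_SCALE)); /* 1024 */
        printf("%lu\n", scaled_running_contrib(1024, 512));                  /* 512 */
        return 0;
}
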
@@ -2582,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2582 return 0; 2612 return 0;
2583 2613
2584 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2614 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2615 se->avg.utilization_avg_contrib =
2616 decay_load(se->avg.utilization_avg_contrib, decays);
2585 2617
2586 return decays; 2618 return decays;
2587} 2619}
@@ -2617,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2617 2649
2618 /* The fraction of a cpu used by this cfs_rq */ 2650 /* The fraction of a cpu used by this cfs_rq */
2619 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, 2651 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2620 sa->runnable_avg_period + 1); 2652 sa->avg_period + 1);
2621 contrib -= cfs_rq->tg_runnable_contrib; 2653 contrib -= cfs_rq->tg_runnable_contrib;
2622 2654
2623 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { 2655 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2670,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2670 2702
2671static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 2703static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2672{ 2704{
2673 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); 2705 __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
2706 runnable, runnable);
2674 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 2707 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2675} 2708}
2676#else /* CONFIG_FAIR_GROUP_SCHED */ 2709#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2688,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
2688 2721
2689 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ 2722 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2690 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); 2723 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2691 contrib /= (se->avg.runnable_avg_period + 1); 2724 contrib /= (se->avg.avg_period + 1);
2692 se->avg.load_avg_contrib = scale_load(contrib); 2725 se->avg.load_avg_contrib = scale_load(contrib);
2693} 2726}
2694 2727
@@ -2707,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
2707 return se->avg.load_avg_contrib - old_contrib; 2740 return se->avg.load_avg_contrib - old_contrib;
2708} 2741}
2709 2742
2743
2744static inline void __update_task_entity_utilization(struct sched_entity *se)
2745{
2746 u32 contrib;
2747
2748 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2749 contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
2750 contrib /= (se->avg.avg_period + 1);
2751 se->avg.utilization_avg_contrib = scale_load(contrib);
2752}
2753
2754static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
2755{
2756 long old_contrib = se->avg.utilization_avg_contrib;
2757
2758 if (entity_is_task(se))
2759 __update_task_entity_utilization(se);
2760 else
2761 se->avg.utilization_avg_contrib =
2762 group_cfs_rq(se)->utilization_load_avg;
2763
2764 return se->avg.utilization_avg_contrib - old_contrib;
2765}
2766
2710static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, 2767static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2711 long load_contrib) 2768 long load_contrib)
2712{ 2769{
@@ -2723,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
2723 int update_cfs_rq) 2780 int update_cfs_rq)
2724{ 2781{
2725 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2782 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2726 long contrib_delta; 2783 long contrib_delta, utilization_delta;
2784 int cpu = cpu_of(rq_of(cfs_rq));
2727 u64 now; 2785 u64 now;
2728 2786
2729 /* 2787 /*
@@ -2735,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
2735 else 2793 else
2736 now = cfs_rq_clock_task(group_cfs_rq(se)); 2794 now = cfs_rq_clock_task(group_cfs_rq(se));
2737 2795
2738 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) 2796 if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
2797 cfs_rq->curr == se))
2739 return; 2798 return;
2740 2799
2741 contrib_delta = __update_entity_load_avg_contrib(se); 2800 contrib_delta = __update_entity_load_avg_contrib(se);
2801 utilization_delta = __update_entity_utilization_avg_contrib(se);
2742 2802
2743 if (!update_cfs_rq) 2803 if (!update_cfs_rq)
2744 return; 2804 return;
2745 2805
2746 if (se->on_rq) 2806 if (se->on_rq) {
2747 cfs_rq->runnable_load_avg += contrib_delta; 2807 cfs_rq->runnable_load_avg += contrib_delta;
2748 else 2808 cfs_rq->utilization_load_avg += utilization_delta;
2809 } else {
2749 subtract_blocked_load_contrib(cfs_rq, -contrib_delta); 2810 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2811 }
2750} 2812}
2751 2813
2752/* 2814/*
@@ -2821,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2821 } 2883 }
2822 2884
2823 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; 2885 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2886 cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
2824 /* we force update consideration on load-balancer moves */ 2887 /* we force update consideration on load-balancer moves */
2825 update_cfs_rq_blocked_load(cfs_rq, !wakeup); 2888 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2826} 2889}
@@ -2839,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2839 update_cfs_rq_blocked_load(cfs_rq, !sleep); 2902 update_cfs_rq_blocked_load(cfs_rq, !sleep);
2840 2903
2841 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; 2904 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2905 cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
2842 if (sleep) { 2906 if (sleep) {
2843 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; 2907 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2844 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 2908 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3176,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3176 */ 3240 */
3177 update_stats_wait_end(cfs_rq, se); 3241 update_stats_wait_end(cfs_rq, se);
3178 __dequeue_entity(cfs_rq, se); 3242 __dequeue_entity(cfs_rq, se);
3243 update_entity_load_avg(se, 1);
3179 } 3244 }
3180 3245
3181 update_stats_curr_start(cfs_rq, se); 3246 update_stats_curr_start(cfs_rq, se);
@@ -4302,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
4302 return cpu_rq(cpu)->cpu_capacity; 4367 return cpu_rq(cpu)->cpu_capacity;
4303} 4368}
4304 4369
4370static unsigned long capacity_orig_of(int cpu)
4371{
4372 return cpu_rq(cpu)->cpu_capacity_orig;
4373}
4374
4305static unsigned long cpu_avg_load_per_task(int cpu) 4375static unsigned long cpu_avg_load_per_task(int cpu)
4306{ 4376{
4307 struct rq *rq = cpu_rq(cpu); 4377 struct rq *rq = cpu_rq(cpu);
@@ -4715,6 +4785,33 @@ next:
4715done: 4785done:
4716 return target; 4786 return target;
4717} 4787}
4788/*
4789 * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
4790 * tasks. The unit of the return value must be the same as that of capacity so
4791 * we can compare the usage with the capacity of the CPU that is available for
4792 * CFS tasks (ie cpu_capacity).
4793 * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
4794 * CPU. It represents the amount of utilization of a CPU in the range
4795 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
4796 * capacity of the CPU because it's about the running time on this CPU.
4797 * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
4798 * because of unfortunate rounding in avg_period and running_load_avg or just
4799 * after migrating tasks until the average stabilizes with the new running
4800 * time. So we need to check that the usage stays within the range
4801 * [0..cpu_capacity_orig] and cap if necessary.
4802 * Without capping the usage, a group could be seen as overloaded (CPU0 usage
4803 * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
4804 */
4805static int get_cpu_usage(int cpu)
4806{
4807 unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
4808 unsigned long capacity = capacity_orig_of(cpu);
4809
4810 if (usage >= SCHED_LOAD_SCALE)
4811 return capacity;
4812
4813 return (usage * capacity) >> SCHED_LOAD_SHIFT;
4814}
4718 4815
4719/* 4816/*
4720 * select_task_rq_fair: Select target runqueue for the waking task in domains 4817 * select_task_rq_fair: Select target runqueue for the waking task in domains
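
A standalone sketch of the capping arithmetic in get_cpu_usage() above; SCHED_LOAD_SCALE is assumed to be 1024, the usual configuration. A raw utilization of 1239 (about 121%) on a CPU whose original capacity is 800 is clamped to 800, while 512 maps to 400:

#include <stdio.h>

#define SCHED_LOAD_SHIFT        10              /* assumes the common 1024 scale */
#define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

/* Mirror of get_cpu_usage(): cap utilization to the CPU's original capacity. */
static unsigned long cpu_usage(unsigned long utilization_load_avg,
                               unsigned long capacity_orig)
{
        if (utilization_load_avg >= SCHED_LOAD_SCALE)
                return capacity_orig;

        return (utilization_load_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
        printf("%lu\n", cpu_usage(1239, 800));  /* clamped to 800 */
        printf("%lu\n", cpu_usage(512, 800));   /* 400 */
        return 0;
}
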
@@ -5841,12 +5938,12 @@ struct sg_lb_stats {
5841 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5938 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5842 unsigned long load_per_task; 5939 unsigned long load_per_task;
5843 unsigned long group_capacity; 5940 unsigned long group_capacity;
5941 unsigned long group_usage; /* Total usage of the group */
5844 unsigned int sum_nr_running; /* Nr tasks running in the group */ 5942 unsigned int sum_nr_running; /* Nr tasks running in the group */
5845 unsigned int group_capacity_factor;
5846 unsigned int idle_cpus; 5943 unsigned int idle_cpus;
5847 unsigned int group_weight; 5944 unsigned int group_weight;
5848 enum group_type group_type; 5945 enum group_type group_type;
5849 int group_has_free_capacity; 5946 int group_no_capacity;
5850#ifdef CONFIG_NUMA_BALANCING 5947#ifdef CONFIG_NUMA_BALANCING
5851 unsigned int nr_numa_running; 5948 unsigned int nr_numa_running;
5852 unsigned int nr_preferred_running; 5949 unsigned int nr_preferred_running;
@@ -5917,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
5917 return load_idx; 6014 return load_idx;
5918} 6015}
5919 6016
5920static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5921{
5922 return SCHED_CAPACITY_SCALE;
5923}
5924
5925unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5926{
5927 return default_scale_capacity(sd, cpu);
5928}
5929
5930static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) 6017static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5931{ 6018{
5932 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) 6019 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5943,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5943static unsigned long scale_rt_capacity(int cpu) 6030static unsigned long scale_rt_capacity(int cpu)
5944{ 6031{
5945 struct rq *rq = cpu_rq(cpu); 6032 struct rq *rq = cpu_rq(cpu);
5946 u64 total, available, age_stamp, avg; 6033 u64 total, used, age_stamp, avg;
5947 s64 delta; 6034 s64 delta;
5948 6035
5949 /* 6036 /*
@@ -5959,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
5959 6046
5960 total = sched_avg_period() + delta; 6047 total = sched_avg_period() + delta;
5961 6048
5962 if (unlikely(total < avg)) { 6049 used = div_u64(avg, total);
5963 /* Ensures that capacity won't end up being negative */
5964 available = 0;
5965 } else {
5966 available = total - avg;
5967 }
5968
5969 if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5970 total = SCHED_CAPACITY_SCALE;
5971 6050
5972 total >>= SCHED_CAPACITY_SHIFT; 6051 if (likely(used < SCHED_CAPACITY_SCALE))
6052 return SCHED_CAPACITY_SCALE - used;
5973 6053
5974 return div_u64(available, total); 6054 return 1;
5975} 6055}
5976 6056
5977static void update_cpu_capacity(struct sched_domain *sd, int cpu) 6057static void update_cpu_capacity(struct sched_domain *sd, int cpu)
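
scale_rt_capacity() above now simply divides the accumulated RT/IRQ time (rt_avg, pre-scaled into capacity units by the companion frequency-invariance change) by the averaging window and returns whatever is left of SCHED_CAPACITY_SCALE, never less than 1. A standalone sketch of that arithmetic:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024ULL

/* rt_avg is assumed pre-scaled, so rt_avg/total lands in [0..1024]. */
static unsigned long long remaining_cfs_capacity(unsigned long long rt_avg,
                                                 unsigned long long total)
{
        unsigned long long used = rt_avg / total;

        if (used < SCHED_CAPACITY_SCALE)
                return SCHED_CAPACITY_SCALE - used;

        return 1;       /* never report zero capacity */
}

int main(void)
{
        unsigned long long total = 1000000000ULL;       /* e.g. a 1s window in ns */

        /* 25% of the window spent in RT/IRQ context leaves 768 of 1024. */
        printf("%llu\n", remaining_cfs_capacity(total / 4 * SCHED_CAPACITY_SCALE, total));
        return 0;
}
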
@@ -5986,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5986 6066
5987 capacity >>= SCHED_CAPACITY_SHIFT; 6067 capacity >>= SCHED_CAPACITY_SHIFT;
5988 6068
5989 sdg->sgc->capacity_orig = capacity; 6069 cpu_rq(cpu)->cpu_capacity_orig = capacity;
5990
5991 if (sched_feat(ARCH_CAPACITY))
5992 capacity *= arch_scale_freq_capacity(sd, cpu);
5993 else
5994 capacity *= default_scale_capacity(sd, cpu);
5995
5996 capacity >>= SCHED_CAPACITY_SHIFT;
5997 6070
5998 capacity *= scale_rt_capacity(cpu); 6071 capacity *= scale_rt_capacity(cpu);
5999 capacity >>= SCHED_CAPACITY_SHIFT; 6072 capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6009,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6009{ 6082{
6010 struct sched_domain *child = sd->child; 6083 struct sched_domain *child = sd->child;
6011 struct sched_group *group, *sdg = sd->groups; 6084 struct sched_group *group, *sdg = sd->groups;
6012 unsigned long capacity, capacity_orig; 6085 unsigned long capacity;
6013 unsigned long interval; 6086 unsigned long interval;
6014 6087
6015 interval = msecs_to_jiffies(sd->balance_interval); 6088 interval = msecs_to_jiffies(sd->balance_interval);
@@ -6021,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6021 return; 6094 return;
6022 } 6095 }
6023 6096
6024 capacity_orig = capacity = 0; 6097 capacity = 0;
6025 6098
6026 if (child->flags & SD_OVERLAP) { 6099 if (child->flags & SD_OVERLAP) {
6027 /* 6100 /*
@@ -6041,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6041 * Use capacity_of(), which is set irrespective of domains 6114 * Use capacity_of(), which is set irrespective of domains
6042 * in update_cpu_capacity(). 6115 * in update_cpu_capacity().
6043 * 6116 *
6044 * This avoids capacity/capacity_orig from being 0 and 6117 * This avoids capacity from being 0 and
6045 * causing divide-by-zero issues on boot. 6118 * causing divide-by-zero issues on boot.
6046 *
6047 * Runtime updates will correct capacity_orig.
6048 */ 6119 */
6049 if (unlikely(!rq->sd)) { 6120 if (unlikely(!rq->sd)) {
6050 capacity_orig += capacity_of(cpu);
6051 capacity += capacity_of(cpu); 6121 capacity += capacity_of(cpu);
6052 continue; 6122 continue;
6053 } 6123 }
6054 6124
6055 sgc = rq->sd->groups->sgc; 6125 sgc = rq->sd->groups->sgc;
6056 capacity_orig += sgc->capacity_orig;
6057 capacity += sgc->capacity; 6126 capacity += sgc->capacity;
6058 } 6127 }
6059 } else { 6128 } else {
@@ -6064,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6064 6133
6065 group = child->groups; 6134 group = child->groups;
6066 do { 6135 do {
6067 capacity_orig += group->sgc->capacity_orig;
6068 capacity += group->sgc->capacity; 6136 capacity += group->sgc->capacity;
6069 group = group->next; 6137 group = group->next;
6070 } while (group != child->groups); 6138 } while (group != child->groups);
6071 } 6139 }
6072 6140
6073 sdg->sgc->capacity_orig = capacity_orig;
6074 sdg->sgc->capacity = capacity; 6141 sdg->sgc->capacity = capacity;
6075} 6142}
6076 6143
6077/* 6144/*
6078 * Try and fix up capacity for tiny siblings, this is needed when 6145 * Check whether the capacity of the rq has been noticeably reduced by side
6079 * things like SD_ASYM_PACKING need f_b_g to select another sibling 6146 * activity. The imbalance_pct is used for the threshold.
6080 * which on its own isn't powerful enough. 6147 * Return true if the capacity is reduced.
6081 *
6082 * See update_sd_pick_busiest() and check_asym_packing().
6083 */ 6148 */
6084static inline int 6149static inline int
6085fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 6150check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6086{ 6151{
6087 /* 6152 return ((rq->cpu_capacity * sd->imbalance_pct) <
6088 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE 6153 (rq->cpu_capacity_orig * 100));
6089 */
6090 if (!(sd->flags & SD_SHARE_CPUCAPACITY))
6091 return 0;
6092
6093 /*
6094 * If ~90% of the cpu_capacity is still there, we're good.
6095 */
6096 if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
6097 return 1;
6098
6099 return 0;
6100} 6154}
6101 6155
6102/* 6156/*
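
check_cpu_capacity() above reports a noticeably reduced CPU when its current capacity, inflated by imbalance_pct, still falls short of the original capacity; with the common default of 125 that means more than roughly 20% of the CPU has gone to side activity. A standalone sketch with example numbers (the 125 is only an assumption for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Mirror of check_cpu_capacity(); pct is sd->imbalance_pct (125 is typical). */
static bool capacity_reduced(unsigned long capacity,
                             unsigned long capacity_orig,
                             unsigned long pct)
{
        return capacity * pct < capacity_orig * 100;
}

int main(void)
{
        /* 800 left of 1024: 800*125 = 100000 < 102400, noticeably reduced. */
        printf("%d\n", capacity_reduced(800, 1024, 125));
        /* 1000 left of 1024: 1000*125 = 125000 >= 102400, fine. */
        printf("%d\n", capacity_reduced(1000, 1024, 125));
        return 0;
}
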
@@ -6134,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
6134} 6188}
6135 6189
6136/* 6190/*
6137 * Compute the group capacity factor. 6191 * group_has_capacity returns true if the group has spare capacity that could
6138 * 6192 * be used by some tasks.
6139 * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by 6193 * We consider that a group has spare capacity if the number of tasks is
6140 * first dividing out the smt factor and computing the actual number of cores 6194 * smaller than the number of CPUs or if the usage is lower than the available
6141 * and limit unit capacity with that. 6195 * capacity for CFS tasks.
6196 * For the latter, we use a threshold to stabilize the state, to take into
6197 * account the variance of the tasks' load and to return true if the available
6198 * capacity is meaningful for the load balancer.
6199 * As an example, an available capacity of 1% can appear but it doesn't bring
6200 * any benefit to the load balancer.
6142 */ 6201 */
6143static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) 6202static inline bool
6203group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6144{ 6204{
6145 unsigned int capacity_factor, smt, cpus; 6205 if (sgs->sum_nr_running < sgs->group_weight)
6146 unsigned int capacity, capacity_orig; 6206 return true;
6147 6207
6148 capacity = group->sgc->capacity; 6208 if ((sgs->group_capacity * 100) >
6149 capacity_orig = group->sgc->capacity_orig; 6209 (sgs->group_usage * env->sd->imbalance_pct))
6150 cpus = group->group_weight; 6210 return true;
6211
6212 return false;
6213}
6151 6214
6152 /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ 6215/*
6153 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); 6216 * group_is_overloaded returns true if the group has more tasks than it can
6154 capacity_factor = cpus / smt; /* cores */ 6217 * handle.
6218 * group_is_overloaded is not equal to !group_has_capacity because a group
6219 * with the exact right number of tasks has no more spare capacity but is not
6220 * overloaded so both group_has_capacity and group_is_overloaded return
6221 * false.
6222 */
6223static inline bool
6224group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6225{
6226 if (sgs->sum_nr_running <= sgs->group_weight)
6227 return false;
6155 6228
6156 capacity_factor = min_t(unsigned, 6229 if ((sgs->group_capacity * 100) <
6157 capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); 6230 (sgs->group_usage * env->sd->imbalance_pct))
6158 if (!capacity_factor) 6231 return true;
6159 capacity_factor = fix_small_capacity(env->sd, group);
6160 6232
6161 return capacity_factor; 6233 return false;
6162} 6234}
6163 6235
6164static enum group_type 6236static enum group_type group_classify(struct lb_env *env,
6165group_classify(struct sched_group *group, struct sg_lb_stats *sgs) 6237 struct sched_group *group,
6238 struct sg_lb_stats *sgs)
6166{ 6239{
6167 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6240 if (sgs->group_no_capacity)
6168 return group_overloaded; 6241 return group_overloaded;
6169 6242
6170 if (sg_imbalanced(group)) 6243 if (sg_imbalanced(group))
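
A standalone sketch of the two predicates above over a trimmed-down stats structure; the field names mirror sg_lb_stats and imbalance_pct = 125 is only an example. It also shows the case called out in the comment: a group with exactly as many tasks as CPUs and high usage reports neither spare capacity nor overload:

#include <stdbool.h>
#include <stdio.h>

struct group_stats {                    /* trimmed-down sg_lb_stats */
        unsigned int sum_nr_running;
        unsigned int group_weight;
        unsigned long group_capacity;
        unsigned long group_usage;
};

static bool group_has_capacity(const struct group_stats *s, unsigned int pct)
{
        if (s->sum_nr_running < s->group_weight)
                return true;
        return s->group_capacity * 100 > s->group_usage * pct;
}

static bool group_is_overloaded(const struct group_stats *s, unsigned int pct)
{
        if (s->sum_nr_running <= s->group_weight)
                return false;
        return s->group_capacity * 100 < s->group_usage * pct;
}

int main(void)
{
        /* 4 CPUs, 4 tasks, usage close to capacity: no spare capacity, not overloaded. */
        struct group_stats s = { 4, 4, 4096, 3900 };

        printf("has capacity: %d, overloaded: %d\n",
               group_has_capacity(&s, 125), group_is_overloaded(&s, 125));
        return 0;
}
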
@@ -6202,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6202 load = source_load(i, load_idx); 6275 load = source_load(i, load_idx);
6203 6276
6204 sgs->group_load += load; 6277 sgs->group_load += load;
6278 sgs->group_usage += get_cpu_usage(i);
6205 sgs->sum_nr_running += rq->cfs.h_nr_running; 6279 sgs->sum_nr_running += rq->cfs.h_nr_running;
6206 6280
6207 if (rq->nr_running > 1) 6281 if (rq->nr_running > 1)
@@ -6224,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6224 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6298 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6225 6299
6226 sgs->group_weight = group->group_weight; 6300 sgs->group_weight = group->group_weight;
6227 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6228 sgs->group_type = group_classify(group, sgs);
6229 6301
6230 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6302 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6231 sgs->group_has_free_capacity = 1; 6303 sgs->group_type = group_classify(env, group, sgs);
6232} 6304}
6233 6305
6234/** 6306/**
@@ -6350,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6350 6422
6351 /* 6423 /*
6352 * In case the child domain prefers tasks go to siblings 6424 * In case the child domain prefers tasks go to siblings
6353 * first, lower the sg capacity factor to one so that we'll try 6425 * first, lower the sg capacity so that we'll try
6354 * and move all the excess tasks away. We lower the capacity 6426 * and move all the excess tasks away. We lower the capacity
6355 * of a group only if the local group has the capacity to fit 6427 * of a group only if the local group has the capacity to fit
6356 * these excess tasks, i.e. nr_running < group_capacity_factor. The 6428 * these excess tasks. The extra check prevents the case where
6357 * extra check prevents the case where you always pull from the 6429 * you always pull from the heaviest group when it is already
6358 * heaviest group when it is already under-utilized (possible 6430 * under-utilized (possible with a large weight task outweighs
6359 * with a large weight task outweighs the tasks on the system). 6431 * the tasks on the system).
6360 */ 6432 */
6361 if (prefer_sibling && sds->local && 6433 if (prefer_sibling && sds->local &&
6362 sds->local_stat.group_has_free_capacity) { 6434 group_has_capacity(env, &sds->local_stat) &&
6363 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6435 (sgs->sum_nr_running > 1)) {
6364 sgs->group_type = group_classify(sg, sgs); 6436 sgs->group_no_capacity = 1;
6437 sgs->group_type = group_overloaded;
6365 } 6438 }
6366 6439
6367 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6440 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6541,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6541 */ 6614 */
6542 if (busiest->group_type == group_overloaded && 6615 if (busiest->group_type == group_overloaded &&
6543 local->group_type == group_overloaded) { 6616 local->group_type == group_overloaded) {
6544 load_above_capacity = 6617 load_above_capacity = busiest->sum_nr_running *
6545 (busiest->sum_nr_running - busiest->group_capacity_factor); 6618 SCHED_LOAD_SCALE;
6546 6619 if (load_above_capacity > busiest->group_capacity)
6547 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); 6620 load_above_capacity -= busiest->group_capacity;
6548 load_above_capacity /= busiest->group_capacity; 6621 else
6622 load_above_capacity = ~0UL;
6549 } 6623 }
6550 6624
6551 /* 6625 /*
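
A standalone sketch of the reworked excess-load computation above; SCHED_LOAD_SCALE is taken as 1024, and the saturation to ~0UL mirrors the else branch (the caller later takes a min() against this value):

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL  /* assumes the common configuration */

/* Excess load of an overloaded group: nr_running*SCALE above its capacity. */
static unsigned long load_above_capacity(unsigned int sum_nr_running,
                                         unsigned long group_capacity)
{
        unsigned long load = sum_nr_running * SCHED_LOAD_SCALE;

        if (load > group_capacity)
                return load - group_capacity;

        return ~0UL;    /* capacity not exceeded: saturate */
}

int main(void)
{
        /* 6 tasks on a group with capacity 4096 (4 full CPUs): 2048 excess. */
        printf("%lu\n", load_above_capacity(6, 4096));
        return 0;
}
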
@@ -6608,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6608 local = &sds.local_stat; 6682 local = &sds.local_stat;
6609 busiest = &sds.busiest_stat; 6683 busiest = &sds.busiest_stat;
6610 6684
6685 /* ASYM feature bypasses nice load balance check */
6611 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 6686 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6612 check_asym_packing(env, &sds)) 6687 check_asym_packing(env, &sds))
6613 return sds.busiest; 6688 return sds.busiest;
@@ -6628,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6628 goto force_balance; 6703 goto force_balance;
6629 6704
6630 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6705 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6631 if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && 6706 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6632 !busiest->group_has_free_capacity) 6707 busiest->group_no_capacity)
6633 goto force_balance; 6708 goto force_balance;
6634 6709
6635 /* 6710 /*
@@ -6688,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6688 int i; 6763 int i;
6689 6764
6690 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6765 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6691 unsigned long capacity, capacity_factor, wl; 6766 unsigned long capacity, wl;
6692 enum fbq_type rt; 6767 enum fbq_type rt;
6693 6768
6694 rq = cpu_rq(i); 6769 rq = cpu_rq(i);
@@ -6717,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6717 continue; 6792 continue;
6718 6793
6719 capacity = capacity_of(i); 6794 capacity = capacity_of(i);
6720 capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6721 if (!capacity_factor)
6722 capacity_factor = fix_small_capacity(env->sd, group);
6723 6795
6724 wl = weighted_cpuload(i); 6796 wl = weighted_cpuload(i);
6725 6797
@@ -6727,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6727 * When comparing with imbalance, use weighted_cpuload() 6799 * When comparing with imbalance, use weighted_cpuload()
6728 * which is not scaled with the cpu capacity. 6800 * which is not scaled with the cpu capacity.
6729 */ 6801 */
6730 if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) 6802
6803 if (rq->nr_running == 1 && wl > env->imbalance &&
6804 !check_cpu_capacity(rq, env->sd))
6731 continue; 6805 continue;
6732 6806
6733 /* 6807 /*
@@ -6775,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
6775 return 1; 6849 return 1;
6776 } 6850 }
6777 6851
6852 /*
6853 * The dst_cpu is idle and the src_cpu has only 1 CFS task.
6854 * It's worth migrating the task if the src_cpu's capacity is reduced
6855 * because of other sched_class activity or IRQs and more capacity stays
6856 * available on dst_cpu.
6857 */
6858 if ((env->idle != CPU_NOT_IDLE) &&
6859 (env->src_rq->cfs.h_nr_running == 1)) {
6860 if ((check_cpu_capacity(env->src_rq, sd)) &&
6861 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
6862 return 1;
6863 }
6864
6778 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 6865 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6779} 6866}
6780 6867
@@ -6874,6 +6961,9 @@ redo:
6874 6961
6875 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 6962 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6876 6963
6964 env.src_cpu = busiest->cpu;
6965 env.src_rq = busiest;
6966
6877 ld_moved = 0; 6967 ld_moved = 0;
6878 if (busiest->nr_running > 1) { 6968 if (busiest->nr_running > 1) {
6879 /* 6969 /*
@@ -6883,8 +6973,6 @@ redo:
6883 * correctly treated as an imbalance. 6973 * correctly treated as an imbalance.
6884 */ 6974 */
6885 env.flags |= LBF_ALL_PINNED; 6975 env.flags |= LBF_ALL_PINNED;
6886 env.src_cpu = busiest->cpu;
6887 env.src_rq = busiest;
6888 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6976 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6889 6977
6890more_balance: 6978more_balance:
@@ -7584,22 +7672,25 @@ end:
7584 7672
7585/* 7673/*
7586 * Current heuristic for kicking the idle load balancer in the presence 7674 * Current heuristic for kicking the idle load balancer in the presence
7587 * of an idle cpu is the system. 7675 * of an idle cpu in the system.
7588 * - This rq has more than one task. 7676 * - This rq has more than one task.
7589 * - At any scheduler domain level, this cpu's scheduler group has multiple 7677 * - This rq has at least one CFS task and the capacity of the CPU is
7590 * busy cpu's exceeding the group's capacity. 7678 * significantly reduced because of RT tasks or IRQs.
7679 * - At the parent of the LLC scheduler domain level, this cpu's scheduler
7680 * group has multiple busy cpus.
7591 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 7681 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7592 * domain span are idle. 7682 * domain span are idle.
7593 */ 7683 */
7594static inline int nohz_kick_needed(struct rq *rq) 7684static inline bool nohz_kick_needed(struct rq *rq)
7595{ 7685{
7596 unsigned long now = jiffies; 7686 unsigned long now = jiffies;
7597 struct sched_domain *sd; 7687 struct sched_domain *sd;
7598 struct sched_group_capacity *sgc; 7688 struct sched_group_capacity *sgc;
7599 int nr_busy, cpu = rq->cpu; 7689 int nr_busy, cpu = rq->cpu;
7690 bool kick = false;
7600 7691
7601 if (unlikely(rq->idle_balance)) 7692 if (unlikely(rq->idle_balance))
7602 return 0; 7693 return false;
7603 7694
7604 /* 7695 /*
7605 * We may be recently in ticked or tickless idle mode. At the first 7696 * We may be recently in ticked or tickless idle mode. At the first
@@ -7613,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
7613 * balancing. 7704 * balancing.
7614 */ 7705 */
7615 if (likely(!atomic_read(&nohz.nr_cpus))) 7706 if (likely(!atomic_read(&nohz.nr_cpus)))
7616 return 0; 7707 return false;
7617 7708
7618 if (time_before(now, nohz.next_balance)) 7709 if (time_before(now, nohz.next_balance))
7619 return 0; 7710 return false;
7620 7711
7621 if (rq->nr_running >= 2) 7712 if (rq->nr_running >= 2)
7622 goto need_kick; 7713 return true;
7623 7714
7624 rcu_read_lock(); 7715 rcu_read_lock();
7625 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 7716 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7626
7627 if (sd) { 7717 if (sd) {
7628 sgc = sd->groups->sgc; 7718 sgc = sd->groups->sgc;
7629 nr_busy = atomic_read(&sgc->nr_busy_cpus); 7719 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7630 7720
7631 if (nr_busy > 1) 7721 if (nr_busy > 1) {
7632 goto need_kick_unlock; 7722 kick = true;
7723 goto unlock;
7724 }
7725
7633 } 7726 }
7634 7727
7635 sd = rcu_dereference(per_cpu(sd_asym, cpu)); 7728 sd = rcu_dereference(rq->sd);
7729 if (sd) {
7730 if ((rq->cfs.h_nr_running >= 1) &&
7731 check_cpu_capacity(rq, sd)) {
7732 kick = true;
7733 goto unlock;
7734 }
7735 }
7636 7736
7737 sd = rcu_dereference(per_cpu(sd_asym, cpu));
7637 if (sd && (cpumask_first_and(nohz.idle_cpus_mask, 7738 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7638 sched_domain_span(sd)) < cpu)) 7739 sched_domain_span(sd)) < cpu)) {
7639 goto need_kick_unlock; 7740 kick = true;
7640 7741 goto unlock;
7641 rcu_read_unlock(); 7742 }
7642 return 0;
7643 7743
7644need_kick_unlock: 7744unlock:
7645 rcu_read_unlock(); 7745 rcu_read_unlock();
7646need_kick: 7746 return kick;
7647 return 1;
7648} 7747}
7649#else 7748#else
7650static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } 7749static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
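
A standalone sketch of the rewritten kick decision above as a pure predicate over a few sampled values; the struct and its fields are hypothetical stand-ins for rq state, imbalance_pct = 125 is an example, and the asym-packing and timing checks are left out:

#include <stdbool.h>
#include <stdio.h>

struct rq_snapshot {                    /* hypothetical sampled state */
        unsigned int nr_running;        /* all classes */
        unsigned int cfs_h_nr_running;  /* CFS tasks, incl. group hierarchies */
        unsigned long cpu_capacity;
        unsigned long cpu_capacity_orig;
        unsigned int nr_busy_in_group;  /* busy CPUs in the sd_busy group */
        unsigned int imbalance_pct;
};

static bool nohz_kick_needed(const struct rq_snapshot *rq)
{
        if (rq->nr_running >= 2)
                return true;

        if (rq->nr_busy_in_group > 1)
                return true;

        /* one CFS task but the CPU lost a noticeable part of its capacity */
        if (rq->cfs_h_nr_running >= 1 &&
            rq->cpu_capacity * rq->imbalance_pct <
            rq->cpu_capacity_orig * 100)
                return true;

        return false;
}

int main(void)
{
        struct rq_snapshot rq = { 1, 1, 700, 1024, 1, 125 };

        printf("%d\n", nohz_kick_needed(&rq)); /* 1: capacity squeezed */
        return 0;
}
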
@@ -7660,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
7660 enum cpu_idle_type idle = this_rq->idle_balance ? 7759 enum cpu_idle_type idle = this_rq->idle_balance ?
7661 CPU_IDLE : CPU_NOT_IDLE; 7760 CPU_IDLE : CPU_NOT_IDLE;
7662 7761
7663 rebalance_domains(this_rq, idle);
7664
7665 /* 7762 /*
7666 * If this cpu has a pending nohz_balance_kick, then do the 7763 * If this cpu has a pending nohz_balance_kick, then do the
7667 * balancing on behalf of the other idle cpus whose ticks are 7764 * balancing on behalf of the other idle cpus whose ticks are
7668 * stopped. 7765 * stopped. Do nohz_idle_balance *before* rebalance_domains to
7766 * give the idle cpus a chance to load balance. Else we may
7767 * load balance only within the local sched_domain hierarchy
7768 * and abort nohz_idle_balance altogether if we pull some load.
7669 */ 7769 */
7670 nohz_idle_balance(this_rq, idle); 7770 nohz_idle_balance(this_rq, idle);
7771 rebalance_domains(this_rq, idle);
7671} 7772}
7672 7773
7673/* 7774/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..91e33cd485f6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
56 */ 56 */
57SCHED_FEAT(TTWU_QUEUE, true) 57SCHED_FEAT(TTWU_QUEUE, true)
58 58
59#ifdef HAVE_RT_PUSH_IPI
60/*
61 * In order to avoid a thundering herd attack of CPUs that are
62 * lowering their priorities at the same time, and there being
63 * a single CPU that has an RT task that can migrate and is waiting
64 * to run, where the other CPUs will try to take that CPU's
65 * rq lock and possibly create a large contention, sending an
66 * IPI to that CPU and letting that CPU push the RT task to where
67 * it should go may be a better scenario.
68 */
69SCHED_FEAT(RT_PUSH_IPI, true)
70#endif
71
59SCHED_FEAT(FORCE_SD_OVERLAP, false) 72SCHED_FEAT(FORCE_SD_OVERLAP, false)
60SCHED_FEAT(RT_RUNTIME_SHARE, true) 73SCHED_FEAT(RT_RUNTIME_SHARE, true)
61SCHED_FEAT(LB_MIN, false) 74SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d27d36476dca..deef1caa94c6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void)
158 * is used from another cpu as a broadcast timer, this call may 158 * is used from another cpu as a broadcast timer, this call may
159 * fail if it is not available 159 * fail if it is not available
160 */ 160 */
161 if (broadcast && 161 if (broadcast && tick_broadcast_enter())
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
163 goto use_default; 162 goto use_default;
164 163
165 /* Take note of the planned idle state. */ 164 /* Take note of the planned idle state. */
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void)
176 idle_set_state(this_rq(), NULL); 175 idle_set_state(this_rq(), NULL);
177 176
178 if (broadcast) 177 if (broadcast)
179 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 178 tick_broadcast_exit();
180 179
181 /* 180 /*
182 * Give the governor an opportunity to reflect on the outcome 181 * Give the governor an opportunity to reflect on the outcome
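
For reference, a minimal sketch of the calling convention the idle path now relies on: tick_broadcast_enter() returns non-zero when the broadcast device cannot take over (fall back to a shallower state), and tick_broadcast_exit() undoes it on wakeup. The surrounding function is illustrative, not the real cpuidle code.

#include <linux/tick.h>
#include <linux/errno.h>
#include <linux/types.h>

static int enter_deep_idle_sketch(bool broadcast)
{
	if (broadcast && tick_broadcast_enter())
		return -EBUSY;	/* broadcast device unusable: caller picks a default state */

	/* ... enter the idle state; the local tick device may be stopped ... */

	if (broadcast)
		tick_broadcast_exit();

	return 0;
}
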
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f4d4b077eba0..575da76a3874 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
6#include "sched.h" 6#include "sched.h"
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/irq_work.h>
9 10
10int sched_rr_timeslice = RR_TIMESLICE; 11int sched_rr_timeslice = RR_TIMESLICE;
11 12
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
59 raw_spin_unlock(&rt_b->rt_runtime_lock); 60 raw_spin_unlock(&rt_b->rt_runtime_lock);
60} 61}
61 62
62void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 63#ifdef CONFIG_SMP
64static void push_irq_work_func(struct irq_work *work);
65#endif
66
67void init_rt_rq(struct rt_rq *rt_rq)
63{ 68{
64 struct rt_prio_array *array; 69 struct rt_prio_array *array;
65 int i; 70 int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
78 rt_rq->rt_nr_migratory = 0; 83 rt_rq->rt_nr_migratory = 0;
79 rt_rq->overloaded = 0; 84 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 85 plist_head_init(&rt_rq->pushable_tasks);
86
87#ifdef HAVE_RT_PUSH_IPI
88 rt_rq->push_flags = 0;
89 rt_rq->push_cpu = nr_cpu_ids;
90 raw_spin_lock_init(&rt_rq->push_lock);
91 init_irq_work(&rt_rq->push_work, push_irq_work_func);
81#endif 92#endif
93#endif /* CONFIG_SMP */
 82	/* We start in dequeued state, because no RT tasks are queued */ 94	/* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0; 95 rt_rq->rt_queued = 0;
84 96
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
193 if (!rt_se) 205 if (!rt_se)
194 goto err_free_rq; 206 goto err_free_rq;
195 207
196 init_rt_rq(rt_rq, cpu_rq(i)); 208 init_rt_rq(rt_rq);
197 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 209 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
198 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 210 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
199 } 211 }
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
1778 ; 1790 ;
1779} 1791}
1780 1792
1793#ifdef HAVE_RT_PUSH_IPI
1794/*
1795 * The search for the next cpu always starts at rq->cpu and ends
1796 * when we reach rq->cpu again. It will never return rq->cpu.
1797 * This returns the next cpu to check, or nr_cpu_ids if the loop
1798 * is complete.
1799 *
1800 * rq->rt.push_cpu holds the last cpu returned by this function,
1801 * or if this is the first instance, it must hold rq->cpu.
1802 */
1803static int rto_next_cpu(struct rq *rq)
1804{
1805 int prev_cpu = rq->rt.push_cpu;
1806 int cpu;
1807
1808 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
1809
1810 /*
1811 * If the previous cpu is less than the rq's CPU, then it already
1812 * passed the end of the mask, and has started from the beginning.
1813 * We end if the next CPU is greater or equal to rq's CPU.
1814 */
1815 if (prev_cpu < rq->cpu) {
1816 if (cpu >= rq->cpu)
1817 return nr_cpu_ids;
1818
1819 } else if (cpu >= nr_cpu_ids) {
1820 /*
1821 * We passed the end of the mask, start at the beginning.
1822 * If the result is greater or equal to the rq's CPU, then
1823 * the loop is finished.
1824 */
1825 cpu = cpumask_first(rq->rd->rto_mask);
1826 if (cpu >= rq->cpu)
1827 return nr_cpu_ids;
1828 }
1829 rq->rt.push_cpu = cpu;
1830
1831 /* Return cpu to let the caller know if the loop is finished or not */
1832 return cpu;
1833}
1834
1835static int find_next_push_cpu(struct rq *rq)
1836{
1837 struct rq *next_rq;
1838 int cpu;
1839
1840 while (1) {
1841 cpu = rto_next_cpu(rq);
1842 if (cpu >= nr_cpu_ids)
1843 break;
1844 next_rq = cpu_rq(cpu);
1845
1846 /* Make sure the next rq can push to this rq */
1847 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
1848 break;
1849 }
1850
1851 return cpu;
1852}
1853
1854#define RT_PUSH_IPI_EXECUTING 1
1855#define RT_PUSH_IPI_RESTART 2
1856
1857static void tell_cpu_to_push(struct rq *rq)
1858{
1859 int cpu;
1860
1861 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
1862 raw_spin_lock(&rq->rt.push_lock);
1863 /* Make sure it's still executing */
1864 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
1865 /*
1866 * Tell the IPI to restart the loop as things have
1867 * changed since it started.
1868 */
1869 rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
1870 raw_spin_unlock(&rq->rt.push_lock);
1871 return;
1872 }
1873 raw_spin_unlock(&rq->rt.push_lock);
1874 }
1875
1876 /* When here, there's no IPI going around */
1877
1878 rq->rt.push_cpu = rq->cpu;
1879 cpu = find_next_push_cpu(rq);
1880 if (cpu >= nr_cpu_ids)
1881 return;
1882
1883 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
1884
1885 irq_work_queue_on(&rq->rt.push_work, cpu);
1886}
1887
1888/* Called from hardirq context */
1889static void try_to_push_tasks(void *arg)
1890{
1891 struct rt_rq *rt_rq = arg;
1892 struct rq *rq, *src_rq;
1893 int this_cpu;
1894 int cpu;
1895
1896 this_cpu = rt_rq->push_cpu;
1897
1898 /* Paranoid check */
1899 BUG_ON(this_cpu != smp_processor_id());
1900
1901 rq = cpu_rq(this_cpu);
1902 src_rq = rq_of_rt_rq(rt_rq);
1903
1904again:
1905 if (has_pushable_tasks(rq)) {
1906 raw_spin_lock(&rq->lock);
1907 push_rt_task(rq);
1908 raw_spin_unlock(&rq->lock);
1909 }
1910
1911 /* Pass the IPI to the next rt overloaded queue */
1912 raw_spin_lock(&rt_rq->push_lock);
1913 /*
1914 * If the source queue changed since the IPI went out,
1915 * we need to restart the search from that CPU again.
1916 */
1917 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
1918 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
1919 rt_rq->push_cpu = src_rq->cpu;
1920 }
1921
1922 cpu = find_next_push_cpu(src_rq);
1923
1924 if (cpu >= nr_cpu_ids)
1925 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
1926 raw_spin_unlock(&rt_rq->push_lock);
1927
1928 if (cpu >= nr_cpu_ids)
1929 return;
1930
1931 /*
1932 * It is possible that a restart caused this CPU to be
1933 * chosen again. Don't bother with an IPI, just see if we
1934 * have more to push.
1935 */
1936 if (unlikely(cpu == rq->cpu))
1937 goto again;
1938
1939 /* Try the next RT overloaded CPU */
1940 irq_work_queue_on(&rt_rq->push_work, cpu);
1941}
1942
1943static void push_irq_work_func(struct irq_work *work)
1944{
1945 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
1946
1947 try_to_push_tasks(rt_rq);
1948}
1949#endif /* HAVE_RT_PUSH_IPI */
1950
1781static int pull_rt_task(struct rq *this_rq) 1951static int pull_rt_task(struct rq *this_rq)
1782{ 1952{
1783 int this_cpu = this_rq->cpu, ret = 0, cpu; 1953 int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
1793 */ 1963 */
1794 smp_rmb(); 1964 smp_rmb();
1795 1965
1966#ifdef HAVE_RT_PUSH_IPI
1967 if (sched_feat(RT_PUSH_IPI)) {
1968 tell_cpu_to_push(this_rq);
1969 return 0;
1970 }
1971#endif
1972
1796 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1973 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1797 if (this_cpu == cpu) 1974 if (this_cpu == cpu)
1798 continue; 1975 continue;
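
The wraparound rule in rto_next_cpu() is easiest to see in isolation. Below is a stand-alone userspace model (hypothetical CPU count and mask representation) of the same circular walk: start after this_cpu, wrap at the end of the overload mask, and stop once the walk reaches or passes this_cpu again, so the IPI chain visits each overloaded CPU at most once and never the originator.

#include <stdio.h>

#define NR_CPUS 8	/* hypothetical machine size */

/* Next overloaded CPU strictly after 'from', or NR_CPUS if none. */
static int next_set_cpu(const int *mask, int from)
{
	for (int cpu = from + 1; cpu < NR_CPUS; cpu++)
		if (mask[cpu])
			return cpu;
	return NR_CPUS;
}

/* Mirrors rto_next_cpu(): circular walk that never returns this_cpu. */
static int rto_walk_next(const int *rto_mask, int this_cpu, int prev_cpu)
{
	int cpu = next_set_cpu(rto_mask, prev_cpu);

	if (prev_cpu < this_cpu) {		/* already wrapped around once */
		if (cpu >= this_cpu)
			return NR_CPUS;		/* full circle: walk is done */
	} else if (cpu >= NR_CPUS) {		/* fell off the end: wrap */
		cpu = next_set_cpu(rto_mask, -1);
		if (cpu >= this_cpu)
			return NR_CPUS;
	}
	return cpu;
}

int main(void)
{
	int rto_mask[NR_CPUS] = { 0, 1, 0, 1, 0, 1, 0, 1 };	/* CPUs 1,3,5,7 overloaded */
	int this_cpu = 4, cpu = this_cpu;

	while ((cpu = rto_walk_next(rto_mask, this_cpu, cpu)) < NR_CPUS)
		printf("chain IPI to CPU %d\n", cpu);

	return 0;
}
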
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435a2779..e0e129993958 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/mutex.h> 6#include <linux/mutex.h>
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
9#include <linux/irq_work.h>
9#include <linux/tick.h> 10#include <linux/tick.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11 12
@@ -362,8 +363,14 @@ struct cfs_rq {
362 * Under CFS, load is tracked on a per-entity basis and aggregated up. 363 * Under CFS, load is tracked on a per-entity basis and aggregated up.
363 * This allows for the description of both thread and group usage (in 364 * This allows for the description of both thread and group usage (in
364 * the FAIR_GROUP_SCHED case). 365 * the FAIR_GROUP_SCHED case).
366 * runnable_load_avg is the sum of the load_avg_contrib of the
367 * sched_entities on the rq.
 368	 * blocked_load_avg is similar to runnable_load_avg, except that it is
 369	 * the same sum taken over the blocked sched_entities on the rq.
370 * utilization_load_avg is the sum of the average running time of the
371 * sched_entities on the rq.
365 */ 372 */
366 unsigned long runnable_load_avg, blocked_load_avg; 373 unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
367 atomic64_t decay_counter; 374 atomic64_t decay_counter;
368 u64 last_decay; 375 u64 last_decay;
369 atomic_long_t removed_load; 376 atomic_long_t removed_load;
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
418 return sysctl_sched_rt_runtime >= 0; 425 return sysctl_sched_rt_runtime >= 0;
419} 426}
420 427
428/* RT IPI pull logic requires IRQ_WORK */
429#ifdef CONFIG_IRQ_WORK
430# define HAVE_RT_PUSH_IPI
431#endif
432
421/* Real-Time classes' related field in a runqueue: */ 433/* Real-Time classes' related field in a runqueue: */
422struct rt_rq { 434struct rt_rq {
423 struct rt_prio_array active; 435 struct rt_prio_array active;
@@ -435,7 +447,13 @@ struct rt_rq {
435 unsigned long rt_nr_total; 447 unsigned long rt_nr_total;
436 int overloaded; 448 int overloaded;
437 struct plist_head pushable_tasks; 449 struct plist_head pushable_tasks;
450#ifdef HAVE_RT_PUSH_IPI
451 int push_flags;
452 int push_cpu;
453 struct irq_work push_work;
454 raw_spinlock_t push_lock;
438#endif 455#endif
456#endif /* CONFIG_SMP */
439 int rt_queued; 457 int rt_queued;
440 458
441 int rt_throttled; 459 int rt_throttled;
@@ -597,6 +615,7 @@ struct rq {
597 struct sched_domain *sd; 615 struct sched_domain *sd;
598 616
599 unsigned long cpu_capacity; 617 unsigned long cpu_capacity;
618 unsigned long cpu_capacity_orig;
600 619
601 unsigned char idle_balance; 620 unsigned char idle_balance;
602 /* For active balancing */ 621 /* For active balancing */
@@ -807,7 +826,7 @@ struct sched_group_capacity {
807 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity 826 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
808 * for a single CPU. 827 * for a single CPU.
809 */ 828 */
810 unsigned int capacity, capacity_orig; 829 unsigned int capacity;
811 unsigned long next_update; 830 unsigned long next_update;
812 int imbalance; /* XXX unrelated to capacity but shared group state */ 831 int imbalance; /* XXX unrelated to capacity but shared group state */
813 /* 832 /*
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
1368 1387
1369#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1370extern void sched_avg_update(struct rq *rq); 1389extern void sched_avg_update(struct rq *rq);
1390
1391#ifndef arch_scale_freq_capacity
1392static __always_inline
1393unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1394{
1395 return SCHED_CAPACITY_SCALE;
1396}
1397#endif
1398
1371static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1399static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1372{ 1400{
1373 rq->rt_avg += rt_delta; 1401 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
1374 sched_avg_update(rq); 1402 sched_avg_update(rq);
1375} 1403}
1376#else 1404#else
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1643extern void print_dl_stats(struct seq_file *m, int cpu); 1671extern void print_dl_stats(struct seq_file *m, int cpu);
1644 1672
1645extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1673extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1646extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1674extern void init_rt_rq(struct rt_rq *rt_rq);
1647extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); 1675extern void init_dl_rq(struct dl_rq *dl_rq);
1648 1676
1649extern void cfs_bandwidth_usage_inc(void); 1677extern void cfs_bandwidth_usage_inc(void);
1650extern void cfs_bandwidth_usage_dec(void); 1678extern void cfs_bandwidth_usage_dec(void);
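
The new arch_scale_freq_capacity() hook defaults to SCHED_CAPACITY_SCALE, so rt_avg accounting is unchanged unless an architecture overrides it. A small sketch of the intent (the 50% override value and function names here are hypothetical):

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* Hypothetical override: this CPU is currently clocked at 50% of its maximum. */
static unsigned long scale_freq_capacity_sketch(int cpu)
{
	return SCHED_CAPACITY_SCALE / 2;
}

/*
 * Mirrors sched_rt_avg_update(): rt_delta is scaled by the current frequency
 * capacity, so rt_avg accumulates frequency-invariant "work" rather than raw
 * wall-clock time; the capacity code later scales the sum back down.
 */
static void rt_avg_update_sketch(unsigned long long *rt_avg,
				 unsigned long long rt_delta, int cpu)
{
	*rt_avg += rt_delta * scale_freq_capacity_sketch(cpu);
}
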
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6e0031..ce410bb9f2e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = {
1228 .extra1 = &zero, 1228 .extra1 = &zero,
1229 }, 1229 },
1230 { 1230 {
1231 .procname = "dirtytime_expire_seconds",
1232 .data = &dirtytime_expire_interval,
1233 .maxlen = sizeof(dirty_expire_interval),
1234 .mode = 0644,
1235 .proc_handler = dirtytime_interval_handler,
1236 .extra1 = &zero,
1237 },
1238 {
1231 .procname = "nr_pdflush_threads", 1239 .procname = "nr_pdflush_threads",
1232 .mode = 0444 /* read-only */, 1240 .mode = 0444 /* read-only */,
1233 .proc_handler = pdflush_proc_obsolete, 1241 .proc_handler = pdflush_proc_obsolete,
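
Since vm_table is registered under the "vm" directory, the new entry should show up as /proc/sys/vm/dirtytime_expire_seconds. A small userspace sketch of adjusting it (the path is assumed from the table placement above; error handling kept minimal):

#include <stdio.h>

/* Assumed path, derived from the vm_table placement. */
#define DIRTYTIME_EXPIRE_PATH "/proc/sys/vm/dirtytime_expire_seconds"

int main(void)
{
	FILE *f = fopen(DIRTYTIME_EXPIRE_PATH, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expire dirtied-for-atime-only inodes after one hour. */
	fprintf(f, "%d\n", 3600);
	fclose(f);
	return 0;
}
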
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d626dc98e8df..579ce1b929af 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
33config GENERIC_CLOCKEVENTS 33config GENERIC_CLOCKEVENTS
34 bool 34 bool
35 35
36# Migration helper. Builds, but does not invoke
37config GENERIC_CLOCKEVENTS_BUILD
38 bool
39 default y
40 depends on GENERIC_CLOCKEVENTS
41
42# Architecture can handle broadcast in a driver-agnostic way 36# Architecture can handle broadcast in a driver-agnostic way
43config ARCH_HAS_TICK_BROADCAST 37config ARCH_HAS_TICK_BROADCAST
44 bool 38 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c09c07817d7a..01f0312419b3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
7ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) 6ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
8 obj-y += tick-broadcast.o 7 obj-y += tick-broadcast.o
9 obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o 8 obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
10endif 9endif
11obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 10obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
12obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 11obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
14obj-$(CONFIG_TIMER_STATS) += timer_stats.o 12obj-$(CONFIG_TIMER_STATS) += timer_stats.o
15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 13obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += test_udelay.o 14obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 55449909f114..25d942d1da27 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
94} 94}
95EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
96 96
97static int __clockevents_set_state(struct clock_event_device *dev,
98 enum clock_event_state state)
99{
100 /* Transition with legacy set_mode() callback */
101 if (dev->set_mode) {
102 /* Legacy callback doesn't support new modes */
103 if (state > CLOCK_EVT_STATE_ONESHOT)
104 return -ENOSYS;
105 /*
106 * 'clock_event_state' and 'clock_event_mode' have 1-to-1
107 * mapping until *_ONESHOT, and so a simple cast will work.
108 */
109 dev->set_mode((enum clock_event_mode)state, dev);
110 dev->mode = (enum clock_event_mode)state;
111 return 0;
112 }
113
114 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
115 return 0;
116
117 /* Transition with new state-specific callbacks */
118 switch (state) {
119 case CLOCK_EVT_STATE_DETACHED:
120 /*
121 * This is an internal state, which is guaranteed to go from
122 * SHUTDOWN to DETACHED. No driver interaction required.
123 */
124 return 0;
125
126 case CLOCK_EVT_STATE_SHUTDOWN:
127 return dev->set_state_shutdown(dev);
128
129 case CLOCK_EVT_STATE_PERIODIC:
130 /* Core internal bug */
131 if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
132 return -ENOSYS;
133 return dev->set_state_periodic(dev);
134
135 case CLOCK_EVT_STATE_ONESHOT:
136 /* Core internal bug */
137 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
138 return -ENOSYS;
139 return dev->set_state_oneshot(dev);
140
141 default:
142 return -ENOSYS;
143 }
144}
145
97/** 146/**
98 * clockevents_set_mode - set the operating mode of a clock event device 147 * clockevents_set_state - set the operating state of a clock event device
99 * @dev: device to modify 148 * @dev: device to modify
100 * @mode: new mode 149 * @state: new state
101 * 150 *
102 * Must be called with interrupts disabled ! 151 * Must be called with interrupts disabled !
103 */ 152 */
104void clockevents_set_mode(struct clock_event_device *dev, 153void clockevents_set_state(struct clock_event_device *dev,
105 enum clock_event_mode mode) 154 enum clock_event_state state)
106{ 155{
107 if (dev->mode != mode) { 156 if (dev->state != state) {
108 dev->set_mode(mode, dev); 157 if (__clockevents_set_state(dev, state))
109 dev->mode = mode; 158 return;
159
160 dev->state = state;
110 161
111 /* 162 /*
112 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 163 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
113 * on it, so fix it up and emit a warning: 164 * on it, so fix it up and emit a warning:
114 */ 165 */
115 if (mode == CLOCK_EVT_MODE_ONESHOT) { 166 if (state == CLOCK_EVT_STATE_ONESHOT) {
116 if (unlikely(!dev->mult)) { 167 if (unlikely(!dev->mult)) {
117 dev->mult = 1; 168 dev->mult = 1;
118 WARN_ON(1); 169 WARN_ON(1);
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
127 */ 178 */
128void clockevents_shutdown(struct clock_event_device *dev) 179void clockevents_shutdown(struct clock_event_device *dev)
129{ 180{
130 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 181 clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
131 dev->next_event.tv64 = KTIME_MAX; 182 dev->next_event.tv64 = KTIME_MAX;
132} 183}
133 184
185/**
186 * clockevents_tick_resume - Resume the tick device before using it again
187 * @dev: device to resume
188 */
189int clockevents_tick_resume(struct clock_event_device *dev)
190{
191 int ret = 0;
192
193 if (dev->set_mode) {
194 dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
195 dev->mode = CLOCK_EVT_MODE_RESUME;
196 } else if (dev->tick_resume) {
197 ret = dev->tick_resume(dev);
198 }
199
200 return ret;
201}
202
134#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST 203#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
135 204
136/* Limit min_delta to a jiffie */ 205/* Limit min_delta to a jiffie */
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
183 delta = dev->min_delta_ns; 252 delta = dev->min_delta_ns;
184 dev->next_event = ktime_add_ns(ktime_get(), delta); 253 dev->next_event = ktime_add_ns(ktime_get(), delta);
185 254
186 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 255 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
187 return 0; 256 return 0;
188 257
189 dev->retries++; 258 dev->retries++;
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
220 delta = dev->min_delta_ns; 289 delta = dev->min_delta_ns;
221 dev->next_event = ktime_add_ns(ktime_get(), delta); 290 dev->next_event = ktime_add_ns(ktime_get(), delta);
222 291
223 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 292 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
224 return 0; 293 return 0;
225 294
226 dev->retries++; 295 dev->retries++;
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
252 321
253 dev->next_event = expires; 322 dev->next_event = expires;
254 323
255 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 324 if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
256 return 0; 325 return 0;
257 326
258 /* Shortcut for clockevent devices that can deal with ktime. */ 327 /* Shortcut for clockevent devices that can deal with ktime. */
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
297 struct clock_event_device *dev, *newdev = NULL; 366 struct clock_event_device *dev, *newdev = NULL;
298 367
299 list_for_each_entry(dev, &clockevent_devices, list) { 368 list_for_each_entry(dev, &clockevent_devices, list) {
300 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) 369 if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
301 continue; 370 continue;
302 371
303 if (!tick_check_replacement(newdev, dev)) 372 if (!tick_check_replacement(newdev, dev))
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
323static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) 392static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
324{ 393{
325 /* Fast track. Device is unused */ 394 /* Fast track. Device is unused */
326 if (ced->mode == CLOCK_EVT_MODE_UNUSED) { 395 if (ced->state == CLOCK_EVT_STATE_DETACHED) {
327 list_del_init(&ced->list); 396 list_del_init(&ced->list);
328 return 0; 397 return 0;
329 } 398 }
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
373} 442}
374EXPORT_SYMBOL_GPL(clockevents_unbind); 443EXPORT_SYMBOL_GPL(clockevents_unbind);
375 444
445/* Sanity check of state transition callbacks */
446static int clockevents_sanity_check(struct clock_event_device *dev)
447{
448 /* Legacy set_mode() callback */
449 if (dev->set_mode) {
450 /* We shouldn't be supporting new modes now */
451 WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
452 dev->set_state_shutdown || dev->tick_resume);
453
454 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
455 return 0;
456 }
457
458 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
459 return 0;
460
461 /* New state-specific callbacks */
462 if (!dev->set_state_shutdown)
463 return -EINVAL;
464
465 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
466 !dev->set_state_periodic)
467 return -EINVAL;
468
469 if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
470 !dev->set_state_oneshot)
471 return -EINVAL;
472
473 return 0;
474}
475
376/** 476/**
377 * clockevents_register_device - register a clock event device 477 * clockevents_register_device - register a clock event device
378 * @dev: device to register 478 * @dev: device to register
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
381{ 481{
382 unsigned long flags; 482 unsigned long flags;
383 483
384 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 484 BUG_ON(clockevents_sanity_check(dev));
485
486 /* Initialize state to DETACHED */
487 dev->state = CLOCK_EVT_STATE_DETACHED;
488
385 if (!dev->cpumask) { 489 if (!dev->cpumask) {
386 WARN_ON(num_possible_cpus() > 1); 490 WARN_ON(num_possible_cpus() > 1);
387 dev->cpumask = cpumask_of(smp_processor_id()); 491 dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
445{ 549{
446 clockevents_config(dev, freq); 550 clockevents_config(dev, freq);
447 551
448 if (dev->mode == CLOCK_EVT_MODE_ONESHOT) 552 if (dev->state == CLOCK_EVT_STATE_ONESHOT)
449 return clockevents_program_event(dev, dev->next_event, false); 553 return clockevents_program_event(dev, dev->next_event, false);
450 554
451 if (dev->mode == CLOCK_EVT_MODE_PERIODIC) 555 if (dev->state == CLOCK_EVT_STATE_PERIODIC)
452 dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); 556 return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
453 557
454 return 0; 558 return 0;
455} 559}
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
491 * @old: device to release (can be NULL) 595 * @old: device to release (can be NULL)
492 * @new: device to request (can be NULL) 596 * @new: device to request (can be NULL)
493 * 597 *
494 * Called from the notifier chain. clockevents_lock is held already 598 * Called from various tick functions with clockevents_lock held and
599 * interrupts disabled.
495 */ 600 */
496void clockevents_exchange_device(struct clock_event_device *old, 601void clockevents_exchange_device(struct clock_event_device *old,
497 struct clock_event_device *new) 602 struct clock_event_device *new)
498{ 603{
499 unsigned long flags;
500
501 local_irq_save(flags);
502 /* 604 /*
503 * Caller releases a clock event device. We queue it into the 605 * Caller releases a clock event device. We queue it into the
504 * released list and do a notify add later. 606 * released list and do a notify add later.
505 */ 607 */
506 if (old) { 608 if (old) {
507 module_put(old->owner); 609 module_put(old->owner);
508 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 610 clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
509 list_del(&old->list); 611 list_del(&old->list);
510 list_add(&old->list, &clockevents_released); 612 list_add(&old->list, &clockevents_released);
511 } 613 }
512 614
513 if (new) { 615 if (new) {
514 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 616 BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
515 clockevents_shutdown(new); 617 clockevents_shutdown(new);
516 } 618 }
517 local_irq_restore(flags);
518} 619}
519 620
520/** 621/**
@@ -541,74 +642,40 @@ void clockevents_resume(void)
541 dev->resume(dev); 642 dev->resume(dev);
542} 643}
543 644
544#ifdef CONFIG_GENERIC_CLOCKEVENTS 645#ifdef CONFIG_HOTPLUG_CPU
545/** 646/**
546 * clockevents_notify - notification about relevant events 647 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
547 * Returns 0 on success, any other value on error
548 */ 648 */
549int clockevents_notify(unsigned long reason, void *arg) 649void tick_cleanup_dead_cpu(int cpu)
550{ 650{
551 struct clock_event_device *dev, *tmp; 651 struct clock_event_device *dev, *tmp;
552 unsigned long flags; 652 unsigned long flags;
553 int cpu, ret = 0;
554 653
555 raw_spin_lock_irqsave(&clockevents_lock, flags); 654 raw_spin_lock_irqsave(&clockevents_lock, flags);
556 655
557 switch (reason) { 656 tick_shutdown_broadcast_oneshot(cpu);
558 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 657 tick_shutdown_broadcast(cpu);
559 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 658 tick_shutdown(cpu);
560 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 659 /*
561 tick_broadcast_on_off(reason, arg); 660 * Unregister the clock event devices which were
562 break; 661 * released from the users in the notify chain.
563 662 */
564 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: 663 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
565 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: 664 list_del(&dev->list);
566 ret = tick_broadcast_oneshot_control(reason); 665 /*
567 break; 666 * Now check whether the CPU has left unused per cpu devices
568 667 */
569 case CLOCK_EVT_NOTIFY_CPU_DYING: 668 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
570 tick_handover_do_timer(arg); 669 if (cpumask_test_cpu(cpu, dev->cpumask) &&
571 break; 670 cpumask_weight(dev->cpumask) == 1 &&
572 671 !tick_is_broadcast_device(dev)) {
573 case CLOCK_EVT_NOTIFY_SUSPEND: 672 BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
574 tick_suspend();
575 tick_suspend_broadcast();
576 break;
577
578 case CLOCK_EVT_NOTIFY_RESUME:
579 tick_resume();
580 break;
581
582 case CLOCK_EVT_NOTIFY_CPU_DEAD:
583 tick_shutdown_broadcast_oneshot(arg);
584 tick_shutdown_broadcast(arg);
585 tick_shutdown(arg);
586 /*
587 * Unregister the clock event devices which were
588 * released from the users in the notify chain.
589 */
590 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
591 list_del(&dev->list); 673 list_del(&dev->list);
592 /*
593 * Now check whether the CPU has left unused per cpu devices
594 */
595 cpu = *((int *)arg);
596 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
597 if (cpumask_test_cpu(cpu, dev->cpumask) &&
598 cpumask_weight(dev->cpumask) == 1 &&
599 !tick_is_broadcast_device(dev)) {
600 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
601 list_del(&dev->list);
602 }
603 } 674 }
604 break;
605 default:
606 break;
607 } 675 }
608 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 676 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
609 return ret;
610} 677}
611EXPORT_SYMBOL_GPL(clockevents_notify); 678#endif
612 679
613#ifdef CONFIG_SYSFS 680#ifdef CONFIG_SYSFS
614struct bus_type clockevents_subsys = { 681struct bus_type clockevents_subsys = {
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
727} 794}
728device_initcall(clockevents_init_sysfs); 795device_initcall(clockevents_init_sysfs);
729#endif /* SYSFS */ 796#endif /* SYSFS */
730
731#endif /* GENERIC_CLOCK_EVENTS */
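
The per-state callbacks that __clockevents_set_state() and clockevents_sanity_check() expect replace the single set_mode() hook. A minimal driver-side sketch (hypothetical 'foo' hardware; only the state callbacks are shown, other fields omitted):

#include <linux/clockchips.h>

static int foo_shutdown(struct clock_event_device *evt)
{
	/* hypothetical: stop the hardware timer */
	return 0;
}

static int foo_set_periodic(struct clock_event_device *evt)
{
	/* hypothetical: program the timer to fire once per tick period */
	return 0;
}

static int foo_set_oneshot(struct clock_event_device *evt)
{
	/* hypothetical: switch the timer to single-shot operation */
	return 0;
}

static struct clock_event_device foo_clockevent = {
	.name			= "foo",
	.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_state_shutdown	= foo_shutdown,
	.set_state_periodic	= foo_set_periodic,
	.set_state_oneshot	= foo_set_oneshot,
	/* .rating, .cpumask, .set_next_event, etc. omitted */
};
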
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4892352f0e49..15facb1b9c60 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
142 schedule_work(&watchdog_work); 142 schedule_work(&watchdog_work);
143} 143}
144 144
145static void clocksource_unstable(struct clocksource *cs, int64_t delta)
146{
147 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
148 cs->name, delta);
149 __clocksource_unstable(cs);
150}
151
152/** 145/**
153 * clocksource_mark_unstable - mark clocksource unstable via watchdog 146 * clocksource_mark_unstable - mark clocksource unstable via watchdog
154 * @cs: clocksource to be marked unstable 147 * @cs: clocksource to be marked unstable
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
174static void clocksource_watchdog(unsigned long data) 167static void clocksource_watchdog(unsigned long data)
175{ 168{
176 struct clocksource *cs; 169 struct clocksource *cs;
177 cycle_t csnow, wdnow, delta; 170 cycle_t csnow, wdnow, cslast, wdlast, delta;
178 int64_t wd_nsec, cs_nsec; 171 int64_t wd_nsec, cs_nsec;
179 int next_cpu, reset_pending; 172 int next_cpu, reset_pending;
180 173
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
213 206
214 delta = clocksource_delta(csnow, cs->cs_last, cs->mask); 207 delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
215 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); 208 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
209 wdlast = cs->wd_last; /* save these in case we print them */
210 cslast = cs->cs_last;
216 cs->cs_last = csnow; 211 cs->cs_last = csnow;
217 cs->wd_last = wdnow; 212 cs->wd_last = wdnow;
218 213
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
221 216
222 /* Check the deviation from the watchdog clocksource. */ 217 /* Check the deviation from the watchdog clocksource. */
223 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 218 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
224 clocksource_unstable(cs, cs_nsec - wd_nsec); 219 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
220 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
221 watchdog->name, wdnow, wdlast, watchdog->mask);
222 pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
223 cs->name, csnow, cslast, cs->mask);
224 __clocksource_unstable(cs);
225 continue; 225 continue;
226 } 226 }
227 227
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
469 * @shift: cycle to nanosecond divisor (power of two) 469 * @shift: cycle to nanosecond divisor (power of two)
470 * @maxadj: maximum adjustment value to mult (~11%) 470 * @maxadj: maximum adjustment value to mult (~11%)
471 * @mask: bitmask for two's complement subtraction of non 64 bit counters 471 * @mask: bitmask for two's complement subtraction of non 64 bit counters
472 * @max_cyc: maximum cycle value before potential overflow (does not include
473 * any safety margin)
474 *
475 * NOTE: This function includes a safety margin of 50%, in other words, we
476 * return half the number of nanoseconds the hardware counter can technically
477 * cover. This is done so that we can potentially detect problems caused by
478 * delayed timers or bad hardware, which might result in time intervals that
479 * are larger then what the math used can handle without overflows.
472 */ 480 */
473u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) 481u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
474{ 482{
475 u64 max_nsecs, max_cycles; 483 u64 max_nsecs, max_cycles;
476 484
477 /* 485 /*
478 * Calculate the maximum number of cycles that we can pass to the 486 * Calculate the maximum number of cycles that we can pass to the
479 * cyc2ns function without overflowing a 64-bit signed result. The 487 * cyc2ns() function without overflowing a 64-bit result.
480 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
481 * which is equivalent to the below.
482 * max_cycles < (2^63)/(mult + maxadj)
483 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
484 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
485 * max_cycles < 2^(63 - log2(mult + maxadj))
486 * max_cycles < 1 << (63 - log2(mult + maxadj))
487 * Please note that we add 1 to the result of the log2 to account for
488 * any rounding errors, ensure the above inequality is satisfied and
489 * no overflow will occur.
490 */ 488 */
491 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); 489 max_cycles = ULLONG_MAX;
490 do_div(max_cycles, mult+maxadj);
492 491
493 /* 492 /*
494 * The actual maximum number of cycles we can defer the clocksource is 493 * The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
499 max_cycles = min(max_cycles, mask); 498 max_cycles = min(max_cycles, mask);
500 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); 499 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
501 500
501 /* return the max_cycles value as well if requested */
502 if (max_cyc)
503 *max_cyc = max_cycles;
504
505 /* Return 50% of the actual maximum, so we can detect bad values */
506 max_nsecs >>= 1;
507
502 return max_nsecs; 508 return max_nsecs;
503} 509}
504 510
505/** 511/**
506 * clocksource_max_deferment - Returns max time the clocksource can be deferred 512 * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
507 * @cs: Pointer to clocksource 513 * @cs: Pointer to clocksource to be updated
508 * 514 *
509 */ 515 */
510static u64 clocksource_max_deferment(struct clocksource *cs) 516static inline void clocksource_update_max_deferment(struct clocksource *cs)
511{ 517{
512 u64 max_nsecs; 518 cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
513 519 cs->maxadj, cs->mask,
514 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, 520 &cs->max_cycles);
515 cs->mask);
516 /*
517 * To ensure that the clocksource does not wrap whilst we are idle,
518 * limit the time the clocksource can be deferred by 12.5%. Please
519 * note a margin of 12.5% is used because this can be computed with
520 * a shift, versus say 10% which would require division.
521 */
522 return max_nsecs - (max_nsecs >> 3);
523} 521}
524 522
525#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 523#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs)
648} 646}
649 647
650/** 648/**
 651 * __clocksource_updatefreq_scale - Used to update clocksource with new freq 649 * __clocksource_update_freq_scale - Used to update clocksource with new freq
652 * @cs: clocksource to be registered 650 * @cs: clocksource to be registered
653 * @scale: Scale factor multiplied against freq to get clocksource hz 651 * @scale: Scale factor multiplied against freq to get clocksource hz
654 * @freq: clocksource frequency (cycles per second) divided by scale 652 * @freq: clocksource frequency (cycles per second) divided by scale
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs)
656 * This should only be called from the clocksource->enable() method. 654 * This should only be called from the clocksource->enable() method.
657 * 655 *
658 * This *SHOULD NOT* be called directly! Please use the 656 * This *SHOULD NOT* be called directly! Please use the
659 * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. 657 * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
658 * functions.
660 */ 659 */
661void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 660void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
662{ 661{
663 u64 sec; 662 u64 sec;
663
664 /* 664 /*
665 * Calc the maximum number of seconds which we can run before 665 * Default clocksources are *special* and self-define their mult/shift.
666 * wrapping around. For clocksources which have a mask > 32bit 666 * But, you're not special, so you should specify a freq value.
667 * we need to limit the max sleep time to have a good
668 * conversion precision. 10 minutes is still a reasonable
669 * amount. That results in a shift value of 24 for a
670 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
671 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
672 * margin as we do in clocksource_max_deferment()
673 */ 667 */
674 sec = (cs->mask - (cs->mask >> 3)); 668 if (freq) {
675 do_div(sec, freq); 669 /*
676 do_div(sec, scale); 670 * Calc the maximum number of seconds which we can run before
677 if (!sec) 671 * wrapping around. For clocksources which have a mask > 32-bit
678 sec = 1; 672 * we need to limit the max sleep time to have a good
679 else if (sec > 600 && cs->mask > UINT_MAX) 673 * conversion precision. 10 minutes is still a reasonable
680 sec = 600; 674 * amount. That results in a shift value of 24 for a
681 675 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
682 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 676 * ~ 0.06ppm granularity for NTP.
683 NSEC_PER_SEC / scale, sec * scale); 677 */
684 678 sec = cs->mask;
679 do_div(sec, freq);
680 do_div(sec, scale);
681 if (!sec)
682 sec = 1;
683 else if (sec > 600 && cs->mask > UINT_MAX)
684 sec = 600;
685
686 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
687 NSEC_PER_SEC / scale, sec * scale);
688 }
685 /* 689 /*
686 * for clocksources that have large mults, to avoid overflow. 690 * Ensure clocksources that have large 'mult' values don't overflow
687 * Since mult may be adjusted by ntp, add an safety extra margin 691 * when adjusted.
688 *
689 */ 692 */
690 cs->maxadj = clocksource_max_adjustment(cs); 693 cs->maxadj = clocksource_max_adjustment(cs);
691 while ((cs->mult + cs->maxadj < cs->mult) 694 while (freq && ((cs->mult + cs->maxadj < cs->mult)
692 || (cs->mult - cs->maxadj > cs->mult)) { 695 || (cs->mult - cs->maxadj > cs->mult))) {
693 cs->mult >>= 1; 696 cs->mult >>= 1;
694 cs->shift--; 697 cs->shift--;
695 cs->maxadj = clocksource_max_adjustment(cs); 698 cs->maxadj = clocksource_max_adjustment(cs);
696 } 699 }
697 700
698 cs->max_idle_ns = clocksource_max_deferment(cs); 701 /*
702 * Only warn for *special* clocksources that self-define
703 * their mult/shift values and don't specify a freq.
704 */
705 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
706 "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
707 cs->name);
708
709 clocksource_update_max_deferment(cs);
710
711 pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
712 cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
699} 713}
700EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 714EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
701 715
702/** 716/**
703 * __clocksource_register_scale - Used to install new clocksources 717 * __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
714{ 728{
715 729
716 /* Initialize mult/shift and max_idle_ns */ 730 /* Initialize mult/shift and max_idle_ns */
717 __clocksource_updatefreq_scale(cs, scale, freq); 731 __clocksource_update_freq_scale(cs, scale, freq);
718 732
719 /* Add clocksource to the clocksource list */ 733 /* Add clocksource to the clocksource list */
720 mutex_lock(&clocksource_mutex); 734 mutex_lock(&clocksource_mutex);
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
726} 740}
727EXPORT_SYMBOL_GPL(__clocksource_register_scale); 741EXPORT_SYMBOL_GPL(__clocksource_register_scale);
728 742
729
730/**
731 * clocksource_register - Used to install new clocksources
732 * @cs: clocksource to be registered
733 *
734 * Returns -EBUSY if registration fails, zero otherwise.
735 */
736int clocksource_register(struct clocksource *cs)
737{
738 /* calculate max adjustment for given mult/shift */
739 cs->maxadj = clocksource_max_adjustment(cs);
740 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
741 "Clocksource %s might overflow on 11%% adjustment\n",
742 cs->name);
743
744 /* calculate max idle time permitted for this clocksource */
745 cs->max_idle_ns = clocksource_max_deferment(cs);
746
747 mutex_lock(&clocksource_mutex);
748 clocksource_enqueue(cs);
749 clocksource_enqueue_watchdog(cs);
750 clocksource_select();
751 mutex_unlock(&clocksource_mutex);
752 return 0;
753}
754EXPORT_SYMBOL(clocksource_register);
755
756static void __clocksource_change_rating(struct clocksource *cs, int rating) 743static void __clocksource_change_rating(struct clocksource *cs, int rating)
757{ 744{
758 list_del(&cs->list); 745 list_del(&cs->list);
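
The reworked deferment math is simple enough to model outside the kernel: take ULLONG_MAX/(mult+maxadj) cycles, clamp to the counter mask, convert with the smallest adjusted mult, and keep only half as a safety margin. Stand-alone sketch with illustrative numbers (a 32-bit counter at roughly 24 MHz; maxadj set to 0 for brevity):

#include <stdint.h>
#include <stdio.h>

static uint64_t cyc2ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

/* Mirrors the reworked clocks_calc_max_nsecs(). */
static uint64_t max_nsecs_sketch(uint32_t mult, uint32_t shift, uint32_t maxadj,
				 uint64_t mask, uint64_t *max_cyc)
{
	uint64_t max_cycles = UINT64_MAX / (mult + maxadj);	/* no 64-bit overflow */

	if (max_cycles > mask)			/* cannot exceed the counter width */
		max_cycles = mask;
	if (max_cyc)
		*max_cyc = max_cycles;

	/* Convert with the smallest adjusted mult, then keep 50% headroom. */
	return cyc2ns(max_cycles, mult - maxadj, shift) >> 1;
}

int main(void)
{
	uint64_t max_cyc;
	/* ~24 MHz counter, 32 bits wide, shift 24 => mult ~699050667 */
	uint64_t ns = max_nsecs_sketch(699050667u, 24, 0, 0xffffffffu, &max_cyc);

	printf("max_cycles=%llu max_idle_ns=%llu\n",
	       (unsigned long long)max_cyc, (unsigned long long)ns);
	return 0;
}
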
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bee0c1f78091..76d4bd962b19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,7 +54,7 @@
54 54
55#include <trace/events/timer.h> 55#include <trace/events/timer.h>
56 56
57#include "timekeeping.h" 57#include "tick-internal.h"
58 58
59/* 59/*
60 * The timer bases: 60 * The timer bases:
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
1707 break; 1707 break;
1708 1708
1709#ifdef CONFIG_HOTPLUG_CPU 1709#ifdef CONFIG_HOTPLUG_CPU
1710 case CPU_DYING:
1711 case CPU_DYING_FROZEN:
1712 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1713 break;
1714 case CPU_DEAD: 1710 case CPU_DEAD:
1715 case CPU_DEAD_FROZEN: 1711 case CPU_DEAD_FROZEN:
1716 {
1717 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1718 migrate_hrtimers(scpu); 1712 migrate_hrtimers(scpu);
1719 break; 1713 break;
1720 }
1721#endif 1714#endif
1722 1715
1723 default: 1716 default:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..347fecf86a3f 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/init.h> 26#include <linux/init.h>
27 27
28#include "tick-internal.h" 28#include "timekeeping.h"
29 29
30/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
31 * denominator clock source which should function on 31 * denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
71 .mask = 0xffffffff, /*32bits*/ 71 .mask = 0xffffffff, /*32bits*/
72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
73 .shift = JIFFIES_SHIFT, 73 .shift = JIFFIES_SHIFT,
74 .max_cycles = 10,
74}; 75};
75 76
76__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); 77__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
94 95
95static int __init init_jiffies_clocksource(void) 96static int __init init_jiffies_clocksource(void)
96{ 97{
97 return clocksource_register(&clocksource_jiffies); 98 return __clocksource_register(&clocksource_jiffies);
98} 99}
99 100
100core_initcall(init_jiffies_clocksource); 101core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
130 131
131 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; 132 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
132 133
133 clocksource_register(&refined_jiffies); 134 __clocksource_register(&refined_jiffies);
134 return 0; 135 return 0;
135} 136}
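
With clocksource_register() removed, callers either provide a frequency so the core computes mult/shift, or (like the jiffies clocksource above) keep self-defined mult/shift and register with a zero frequency. A sketch of the two paths, assuming a hypothetical 'foo' clocksource with the read callback elided:

#include <linux/clocksource.h>

static cycle_t foo_read(struct clocksource *cs)
{
	/* hypothetical: read the free-running hardware counter */
	return 0;
}

static struct clocksource foo_clocksource = {
	.name	= "foo",
	.rating	= 300,
	.read	= foo_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init foo_clocksource_init(void)
{
	/* Common case: let the core derive mult/shift from the frequency. */
	return clocksource_register_hz(&foo_clocksource, 24000000);

	/*
	 * Self-defined mult/shift case (as the jiffies clocksource does):
	 *	return __clocksource_register(&foo_clocksource);
	 */
}
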
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 0f60b08a4f07..7a681003001c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h"
21#include "ntp_internal.h" 20#include "ntp_internal.h"
22 21
23/* 22/*
@@ -459,6 +458,16 @@ out:
459 return leap; 458 return leap;
460} 459}
461 460
461#ifdef CONFIG_GENERIC_CMOS_UPDATE
462int __weak update_persistent_clock64(struct timespec64 now64)
463{
464 struct timespec now;
465
466 now = timespec64_to_timespec(now64);
467 return update_persistent_clock(now);
468}
469#endif
470
462#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) 471#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
463static void sync_cmos_clock(struct work_struct *work); 472static void sync_cmos_clock(struct work_struct *work);
464 473
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work)
494 if (persistent_clock_is_local) 503 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 504 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 505#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(timespec64_to_timespec(adjust)); 506 fail = update_persistent_clock64(adjust);
498#endif 507#endif
508
499#ifdef CONFIG_RTC_SYSTOHC 509#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 510 if (fail == -ENODEV)
501 fail = rtc_set_ntp_time(adjust); 511 fail = rtc_set_ntp_time(adjust);
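
Architectures that can consume a timespec64 directly can provide a strong definition that overrides the __weak fallback added above; the RTC helper called in the body below is hypothetical.

#include <linux/time64.h>

/* Hypothetical platform RTC accessor. */
extern int foo_rtc_set_time(time64_t secs);

/* Strong definition: overrides the __weak fallback in ntp.c. */
int update_persistent_clock64(struct timespec64 now)
{
	return foo_rtc_set_time(now.tv_sec);
}
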
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter 2 * sched_clock.c: Generic sched_clock() support, to extend low level
3 * hardware time counters to full 64-bit ns values.
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
18#include <linux/seqlock.h> 19#include <linux/seqlock.h>
19#include <linux/bitops.h> 20#include <linux/bitops.h>
20 21
21struct clock_data { 22/**
22 ktime_t wrap_kt; 23 * struct clock_read_data - data required to read from sched_clock()
24 *
25 * @epoch_ns: sched_clock() value at last update
26 * @epoch_cyc: Clock cycle value at last update.
27 * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
28 * clocks.
29 * @read_sched_clock: Current clock source (or dummy source when suspended).
 30 * @mult:		Multiplier for scaled math conversion.
31 * @shift: Shift value for scaled math conversion.
32 *
33 * Care must be taken when updating this structure; it is read by
34 * some very hot code paths. It occupies <=40 bytes and, when combined
35 * with the seqcount used to synchronize access, comfortably fits into
36 * a 64 byte cache line.
37 */
38struct clock_read_data {
23 u64 epoch_ns; 39 u64 epoch_ns;
24 u64 epoch_cyc; 40 u64 epoch_cyc;
25 seqcount_t seq; 41 u64 sched_clock_mask;
26 unsigned long rate; 42 u64 (*read_sched_clock)(void);
27 u32 mult; 43 u32 mult;
28 u32 shift; 44 u32 shift;
29 bool suspended; 45};
46
47/**
48 * struct clock_data - all data needed for sched_clock() (including
49 * registration of a new clock source)
50 *
51 * @seq: Sequence counter for protecting updates. The lowest
52 * bit is the index for @read_data.
53 * @read_data: Data required to read from sched_clock.
54 * @wrap_kt: Duration for which clock can run before wrapping.
55 * @rate: Tick rate of the registered clock.
56 * @actual_read_sched_clock: Registered hardware level clock read function.
57 *
58 * The ordering of this structure has been chosen to optimize cache
59 * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
60 * into a single 64-byte cache line.
61 */
62struct clock_data {
63 seqcount_t seq;
64 struct clock_read_data read_data[2];
65 ktime_t wrap_kt;
66 unsigned long rate;
67
68 u64 (*actual_read_sched_clock)(void);
30}; 69};
31 70
32static struct hrtimer sched_clock_timer; 71static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
34 73
35core_param(irqtime, irqtime, int, 0400); 74core_param(irqtime, irqtime, int, 0400);
36 75
37static struct clock_data cd = {
38 .mult = NSEC_PER_SEC / HZ,
39};
40
41static u64 __read_mostly sched_clock_mask;
42
43static u64 notrace jiffy_sched_clock_read(void) 76static u64 notrace jiffy_sched_clock_read(void)
44{ 77{
45 /* 78 /*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 82 return (u64)(jiffies - INITIAL_JIFFIES);
50} 83}
51 84
52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 85static struct clock_data cd ____cacheline_aligned = {
86 .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
87 .read_sched_clock = jiffy_sched_clock_read, },
88 .actual_read_sched_clock = jiffy_sched_clock_read,
89};
53 90
54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 91static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
55{ 92{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
58 95
59unsigned long long notrace sched_clock(void) 96unsigned long long notrace sched_clock(void)
60{ 97{
61 u64 epoch_ns; 98 u64 cyc, res;
62 u64 epoch_cyc;
63 u64 cyc;
64 unsigned long seq; 99 unsigned long seq;
65 100 struct clock_read_data *rd;
66 if (cd.suspended)
67 return cd.epoch_ns;
68 101
69 do { 102 do {
70 seq = raw_read_seqcount_begin(&cd.seq); 103 seq = raw_read_seqcount(&cd.seq);
71 epoch_cyc = cd.epoch_cyc; 104 rd = cd.read_data + (seq & 1);
72 epoch_ns = cd.epoch_ns; 105
106 cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
107 rd->sched_clock_mask;
108 res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
73 } while (read_seqcount_retry(&cd.seq, seq)); 109 } while (read_seqcount_retry(&cd.seq, seq));
74 110
75 cyc = read_sched_clock(); 111 return res;
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 112}
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); 113
114/*
115 * Updating the data required to read the clock.
116 *
117 * sched_clock() will never observe mis-matched data even if called from
118 * an NMI. We do this by maintaining an odd/even copy of the data and
119 * steering sched_clock() to one or the other using a sequence counter.
120 * In order to preserve the data cache profile of sched_clock() as much
121 * as possible the system reverts back to the even copy when the update
122 * completes; the odd copy is used *only* during an update.
123 */
124static void update_clock_read_data(struct clock_read_data *rd)
125{
126 /* update the backup (odd) copy with the new data */
127 cd.read_data[1] = *rd;
128
129 /* steer readers towards the odd copy */
130 raw_write_seqcount_latch(&cd.seq);
131
 132	/* now it's safe for us to update the normal (even) copy */
133 cd.read_data[0] = *rd;
134
135 /* switch readers back to the even copy */
136 raw_write_seqcount_latch(&cd.seq);
78} 137}
79 138
80/* 139/*
81 * Atomically update the sched_clock epoch. 140 * Atomically update the sched_clock() epoch.
82 */ 141 */
83static void notrace update_sched_clock(void) 142static void update_sched_clock(void)
84{ 143{
85 unsigned long flags;
86 u64 cyc; 144 u64 cyc;
87 u64 ns; 145 u64 ns;
146 struct clock_read_data rd;
147
148 rd = cd.read_data[0];
149
150 cyc = cd.actual_read_sched_clock();
151 ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
152
153 rd.epoch_ns = ns;
154 rd.epoch_cyc = cyc;
88 155
89 cyc = read_sched_clock(); 156 update_clock_read_data(&rd);
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93
94 raw_local_irq_save(flags);
95 raw_write_seqcount_begin(&cd.seq);
96 cd.epoch_ns = ns;
97 cd.epoch_cyc = cyc;
98 raw_write_seqcount_end(&cd.seq);
99 raw_local_irq_restore(flags);
100} 157}
101 158
102static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) 159static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
103{ 160{
104 update_sched_clock(); 161 update_sched_clock();
105 hrtimer_forward_now(hrt, cd.wrap_kt); 162 hrtimer_forward_now(hrt, cd.wrap_kt);
163
106 return HRTIMER_RESTART; 164 return HRTIMER_RESTART;
107} 165}
108 166
109void __init sched_clock_register(u64 (*read)(void), int bits, 167void __init
110 unsigned long rate) 168sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
111{ 169{
112 u64 res, wrap, new_mask, new_epoch, cyc, ns; 170 u64 res, wrap, new_mask, new_epoch, cyc, ns;
113 u32 new_mult, new_shift; 171 u32 new_mult, new_shift;
114 ktime_t new_wrap_kt;
115 unsigned long r; 172 unsigned long r;
116 char r_unit; 173 char r_unit;
174 struct clock_read_data rd;
117 175
118 if (cd.rate > rate) 176 if (cd.rate > rate)
119 return; 177 return;
120 178
121 WARN_ON(!irqs_disabled()); 179 WARN_ON(!irqs_disabled());
122 180
123 /* calculate the mult/shift to convert counter ticks to ns. */ 181 /* Calculate the mult/shift to convert counter ticks to ns. */
124 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); 182 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
125 183
126 new_mask = CLOCKSOURCE_MASK(bits); 184 new_mask = CLOCKSOURCE_MASK(bits);
185 cd.rate = rate;
186
187 /* Calculate how many nanosecs until we risk wrapping */
188 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
189 cd.wrap_kt = ns_to_ktime(wrap);
127 190
128 /* calculate how many ns until we wrap */ 191 rd = cd.read_data[0];
129 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
130 new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
131 192
132 /* update epoch for new counter and update epoch_ns from old counter*/ 193 /* Update epoch for new counter and update 'epoch_ns' from old counter*/
133 new_epoch = read(); 194 new_epoch = read();
134 cyc = read_sched_clock(); 195 cyc = cd.actual_read_sched_clock();
135 ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 196 ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
136 cd.mult, cd.shift); 197 cd.actual_read_sched_clock = read;
137 198
138 raw_write_seqcount_begin(&cd.seq); 199 rd.read_sched_clock = read;
139 read_sched_clock = read; 200 rd.sched_clock_mask = new_mask;
140 sched_clock_mask = new_mask; 201 rd.mult = new_mult;
141 cd.rate = rate; 202 rd.shift = new_shift;
142 cd.wrap_kt = new_wrap_kt; 203 rd.epoch_cyc = new_epoch;
143 cd.mult = new_mult; 204 rd.epoch_ns = ns;
144 cd.shift = new_shift; 205
145 cd.epoch_cyc = new_epoch; 206 update_clock_read_data(&rd);
146 cd.epoch_ns = ns;
147 raw_write_seqcount_end(&cd.seq);
148 207
149 r = rate; 208 r = rate;
150 if (r >= 4000000) { 209 if (r >= 4000000) {
151 r /= 1000000; 210 r /= 1000000;
152 r_unit = 'M'; 211 r_unit = 'M';
153 } else if (r >= 1000) { 212 } else {
154 r /= 1000; 213 if (r >= 1000) {
155 r_unit = 'k'; 214 r /= 1000;
156 } else 215 r_unit = 'k';
157 r_unit = ' '; 216 } else {
158 217 r_unit = ' ';
159 /* calculate the ns resolution of this counter */ 218 }
219 }
220
221 /* Calculate the ns resolution of this counter */
160 res = cyc_to_ns(1ULL, new_mult, new_shift); 222 res = cyc_to_ns(1ULL, new_mult, new_shift);
161 223
162 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", 224 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
163 bits, r, r_unit, res, wrap); 225 bits, r, r_unit, res, wrap);
164 226
165 /* Enable IRQ time accounting if we have a fast enough sched_clock */ 227 /* Enable IRQ time accounting if we have a fast enough sched_clock() */
166 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 228 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
167 enable_sched_clock_irqtime(); 229 enable_sched_clock_irqtime();
168 230
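The registration path above comes down to fixed-point arithmetic: clocks_calc_mult_shift() picks a mult/shift pair that converts counter ticks to nanoseconds, the printed resolution is just the value of one tick under that conversion, and the wrap figure from clocks_calc_max_nsecs() bounds how long sched_clock_poll() may wait before re-arming the hrtimer and refreshing the epoch. A minimal standalone sketch of the conversion, assuming cyc_to_ns() is the usual (cyc * mult) >> shift form, which is not shown in these hunks; the 24 MHz figures are arbitrary example values:

/*
 * Sketch of the tick -> nanosecond conversion used by sched_clock_register().
 * Assumption: cyc_to_ns(cyc, mult, shift) is the fixed-point form
 * (cyc * mult) >> shift; the mult/shift pair below is an arbitrary example
 * for a 24 MHz counter, not the output of clocks_calc_mult_shift().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
        return (cyc * mult) >> shift;
}

int main(void)
{
        uint32_t mult = 2796202667u;    /* ~ (10^9 / 24e6) << 26 */
        uint32_t shift = 26;

        /* Resolution: one counter tick in ns (about 41.7ns at 24 MHz). */
        printf("resolution: %llu ns\n",
               (unsigned long long)cyc_to_ns(1, mult, shift));

        /* 24000 ticks (1ms of counting) converted to ns. */
        printf("1ms slice:  %llu ns\n",
               (unsigned long long)cyc_to_ns(24000, mult, shift));
        return 0;
}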
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
172void __init sched_clock_postinit(void) 234void __init sched_clock_postinit(void)
173{ 235{
174 /* 236 /*
175 * If no sched_clock function has been provided at that point, 237 * If no sched_clock() function has been provided at that point,
 176 * make it the final one. 238 * make it the final one.
177 */ 239 */
178 if (read_sched_clock == jiffy_sched_clock_read) 240 if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
179 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); 241 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
180 242
181 update_sched_clock(); 243 update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
189 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); 251 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
190} 252}
191 253
254/*
255 * Clock read function for use when the clock is suspended.
256 *
257 * This function makes it appear to sched_clock() as if the clock
258 * stopped counting at its last update.
259 *
260 * This function must only be called from the critical
261 * section in sched_clock(). It relies on the read_seqcount_retry()
262 * at the end of the critical section to be sure we observe the
263 * correct copy of 'epoch_cyc'.
264 */
265static u64 notrace suspended_sched_clock_read(void)
266{
267 unsigned long seq = raw_read_seqcount(&cd.seq);
268
269 return cd.read_data[seq & 1].epoch_cyc;
270}
271
192static int sched_clock_suspend(void) 272static int sched_clock_suspend(void)
193{ 273{
274 struct clock_read_data *rd = &cd.read_data[0];
275
194 update_sched_clock(); 276 update_sched_clock();
195 hrtimer_cancel(&sched_clock_timer); 277 hrtimer_cancel(&sched_clock_timer);
196 cd.suspended = true; 278 rd->read_sched_clock = suspended_sched_clock_read;
279
197 return 0; 280 return 0;
198} 281}
199 282
200static void sched_clock_resume(void) 283static void sched_clock_resume(void)
201{ 284{
202 cd.epoch_cyc = read_sched_clock(); 285 struct clock_read_data *rd = &cd.read_data[0];
286
287 rd->epoch_cyc = cd.actual_read_sched_clock();
203 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); 288 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
204 cd.suspended = false; 289 rd->read_sched_clock = cd.actual_read_sched_clock;
205} 290}
206 291
207static struct syscore_ops sched_clock_ops = { 292static struct syscore_ops sched_clock_ops = {
208 .suspend = sched_clock_suspend, 293 .suspend = sched_clock_suspend,
209 .resume = sched_clock_resume, 294 .resume = sched_clock_resume,
210}; 295};
211 296
212static int __init sched_clock_syscore_init(void) 297static int __init sched_clock_syscore_init(void)
213{ 298{
214 register_syscore_ops(&sched_clock_ops); 299 register_syscore_ops(&sched_clock_ops);
300
215 return 0; 301 return 0;
216} 302}
217device_initcall(sched_clock_syscore_init); 303device_initcall(sched_clock_syscore_init);
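The sched_clock.c changes above hinge on one pattern: the readable state lives in two 'struct clock_read_data' copies, update_clock_read_data() flips readers between them with raw_write_seqcount_latch(), and readers pick copy (seq & 1) and retry via read_seqcount_retry(). That is also what lets suspended_sched_clock_read() simply return the latched epoch_cyc, so the computed delta stays at zero while suspended, and the same two-copy update reappears later in update_fast_timekeeper() for the NMI-safe timekeeper. Below is a small user-space illustration of that latch shape; latch_write(), latch_read() and struct read_data are illustrative stand-ins built on plain C11 atomics, not the kernel primitives, and the data copies themselves are not made atomic here.

/*
 * User-space illustration of the seqcount latch shape used above. The
 * kernel uses raw_write_seqcount_latch()/raw_read_seqcount() and
 * read_seqcount_retry(); the names below are stand-ins, and the data
 * copies are not atomic here (the kernel handles that with its own
 * ordering rules).
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct read_data {
        uint64_t epoch_ns;
        uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;        /* even: copy 0 is live, odd: copy 1 */
static struct read_data data[2];        /* readers use data[seq & 1] */

static void latch_write(const struct read_data *rd)
{
        atomic_fetch_add(&seq, 1);      /* steer readers to data[1] */
        data[0] = *rd;                  /* rewrite the idle copy */
        atomic_fetch_add(&seq, 1);      /* steer readers back to data[0] */
        data[1] = *rd;                  /* bring the other copy up to date */
}

static struct read_data latch_read(void)
{
        struct read_data rd;
        unsigned int start;

        do {
                start = atomic_load(&seq);
                rd = data[start & 1];
        } while (atomic_load(&seq) != start);   /* retry if an update raced */

        return rd;
}

int main(void)
{
        struct read_data rd = { .epoch_ns = 1000, .epoch_cyc = 42 };

        latch_write(&rd);
        rd = latch_read();
        printf("epoch_ns=%llu epoch_cyc=%llu\n",
               (unsigned long long)rd.epoch_ns,
               (unsigned long long)rd.epoch_cyc);
        return 0;
}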
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5c697c..6aac4beedbbe 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
49 */ 49 */
50static int bc_set_next(ktime_t expires, struct clock_event_device *bc) 50static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
51{ 51{
52 int bc_moved;
52 /* 53 /*
53 * We try to cancel the timer first. If the callback is on 54 * We try to cancel the timer first. If the callback is on
54 * flight on some other cpu then we let it handle it. If we 55 * flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
60 * restart the timer because we are in the callback, but we 61 * restart the timer because we are in the callback, but we
61 * can set the expiry time and let the callback return 62 * can set the expiry time and let the callback return
62 * HRTIMER_RESTART. 63 * HRTIMER_RESTART.
64 *
65 * Since we are in the idle loop at this point and because
66 * hrtimer_{start/cancel} functions call into tracing,
67 * calls to these functions must be bound within RCU_NONIDLE.
63 */ 68 */
64 if (hrtimer_try_to_cancel(&bctimer) >= 0) { 69 RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
65 hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); 70 !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
71 0);
72 if (bc_moved) {
66 /* Bind the "device" to the cpu */ 73 /* Bind the "device" to the cpu */
67 bc->bound_on = smp_processor_id(); 74 bc->bound_on = smp_processor_id();
68 } else if (bc->bound_on == smp_processor_id()) { 75 } else if (bc->bound_on == smp_processor_id()) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec05e48..7e8ca4f448a8 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on; 33static cpumask_var_t tick_broadcast_on;
34static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
36static int tick_broadcast_force; 36static int tick_broadcast_forced;
37 37
38#ifdef CONFIG_TICK_ONESHOT 38#ifdef CONFIG_TICK_ONESHOT
39static void tick_broadcast_clear_oneshot(int cpu); 39static void tick_broadcast_clear_oneshot(int cpu);
40static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
40#else 41#else
41static inline void tick_broadcast_clear_oneshot(int cpu) { } 42static inline void tick_broadcast_clear_oneshot(int cpu) { }
43static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
42#endif 44#endif
43 45
44/* 46/*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
303 /* 305 /*
304 * The device is in periodic mode. No reprogramming necessary: 306 * The device is in periodic mode. No reprogramming necessary:
305 */ 307 */
306 if (dev->mode == CLOCK_EVT_MODE_PERIODIC) 308 if (dev->state == CLOCK_EVT_STATE_PERIODIC)
307 goto unlock; 309 goto unlock;
308 310
309 /* 311 /*
@@ -324,49 +326,54 @@ unlock:
324 raw_spin_unlock(&tick_broadcast_lock); 326 raw_spin_unlock(&tick_broadcast_lock);
325} 327}
326 328
327/* 329/**
328 * Powerstate information: The system enters/leaves a state, where 330 * tick_broadcast_control - Enable/disable or force broadcast mode
329 * affected devices might stop 331 * @mode: The selected broadcast mode
332 *
333 * Called when the system enters a state where affected tick devices
334 * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
335 *
336 * Called with interrupts disabled, so clockevents_lock is not
337 * required here because the local clock event device cannot go away
338 * under us.
330 */ 339 */
331static void tick_do_broadcast_on_off(unsigned long *reason) 340void tick_broadcast_control(enum tick_broadcast_mode mode)
332{ 341{
333 struct clock_event_device *bc, *dev; 342 struct clock_event_device *bc, *dev;
334 struct tick_device *td; 343 struct tick_device *td;
335 unsigned long flags;
336 int cpu, bc_stopped; 344 int cpu, bc_stopped;
337 345
338 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 346 td = this_cpu_ptr(&tick_cpu_device);
339
340 cpu = smp_processor_id();
341 td = &per_cpu(tick_cpu_device, cpu);
342 dev = td->evtdev; 347 dev = td->evtdev;
343 bc = tick_broadcast_device.evtdev;
344 348
345 /* 349 /*
346 * Is the device not affected by the powerstate ? 350 * Is the device not affected by the powerstate ?
347 */ 351 */
348 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) 352 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
349 goto out; 353 return;
350 354
351 if (!tick_device_is_functional(dev)) 355 if (!tick_device_is_functional(dev))
352 goto out; 356 return;
353 357
358 raw_spin_lock(&tick_broadcast_lock);
359 cpu = smp_processor_id();
360 bc = tick_broadcast_device.evtdev;
354 bc_stopped = cpumask_empty(tick_broadcast_mask); 361 bc_stopped = cpumask_empty(tick_broadcast_mask);
355 362
356 switch (*reason) { 363 switch (mode) {
357 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 364 case TICK_BROADCAST_FORCE:
358 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 365 tick_broadcast_forced = 1;
366 case TICK_BROADCAST_ON:
359 cpumask_set_cpu(cpu, tick_broadcast_on); 367 cpumask_set_cpu(cpu, tick_broadcast_on);
360 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 368 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
361 if (tick_broadcast_device.mode == 369 if (tick_broadcast_device.mode ==
362 TICKDEV_MODE_PERIODIC) 370 TICKDEV_MODE_PERIODIC)
363 clockevents_shutdown(dev); 371 clockevents_shutdown(dev);
364 } 372 }
365 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
366 tick_broadcast_force = 1;
367 break; 373 break;
368 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 374
369 if (tick_broadcast_force) 375 case TICK_BROADCAST_OFF:
376 if (tick_broadcast_forced)
370 break; 377 break;
371 cpumask_clear_cpu(cpu, tick_broadcast_on); 378 cpumask_clear_cpu(cpu, tick_broadcast_on);
372 if (!tick_device_is_functional(dev)) 379 if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
388 else 395 else
389 tick_broadcast_setup_oneshot(bc); 396 tick_broadcast_setup_oneshot(bc);
390 } 397 }
391out: 398 raw_spin_unlock(&tick_broadcast_lock);
392 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
393}
394
395/*
396 * Powerstate information: The system enters/leaves a state, where
397 * affected devices might stop.
398 */
399void tick_broadcast_on_off(unsigned long reason, int *oncpu)
400{
401 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
402 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
403 "offline CPU #%d\n", *oncpu);
404 else
405 tick_do_broadcast_on_off(&reason);
406} 399}
400EXPORT_SYMBOL_GPL(tick_broadcast_control);
407 401
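As the kerneldoc above notes, TICK_BROADCAST_FORCE cannot be undone: the FORCE case sets tick_broadcast_forced and deliberately falls through to the ON handling, while the OFF case returns early whenever the forced flag is set. A standalone model of just that flag and fall-through logic, with the cpumask and clockevents work stripped out and illustrative names:

/*
 * Standalone model of the ON/OFF/FORCE handling in tick_broadcast_control()
 * above. Only the forced-flag and fall-through logic is modelled; the names
 * are illustrative, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

enum bc_mode { BC_OFF, BC_ON, BC_FORCE };

static bool forced;
static bool broadcasting;

static void broadcast_control(enum bc_mode mode)
{
        switch (mode) {
        case BC_FORCE:
                forced = true;
                /* fall through: FORCE behaves like ON, permanently */
        case BC_ON:
                broadcasting = true;
                break;
        case BC_OFF:
                if (forced)
                        break;          /* FORCE cannot be undone */
                broadcasting = false;
                break;
        }
}

int main(void)
{
        broadcast_control(BC_ON);
        broadcast_control(BC_OFF);
        printf("after ON, OFF:    broadcasting=%d\n", broadcasting);    /* 0 */

        broadcast_control(BC_FORCE);
        broadcast_control(BC_OFF);
        printf("after FORCE, OFF: broadcasting=%d\n", broadcasting);    /* 1 */
        return 0;
}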
408/* 402/*
409 * Set the periodic handler depending on broadcast on/off 403 * Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
416 dev->event_handler = tick_handle_periodic_broadcast; 410 dev->event_handler = tick_handle_periodic_broadcast;
417} 411}
418 412
413#ifdef CONFIG_HOTPLUG_CPU
419/* 414/*
420 * Remove a CPU from broadcasting 415 * Remove a CPU from broadcasting
421 */ 416 */
422void tick_shutdown_broadcast(unsigned int *cpup) 417void tick_shutdown_broadcast(unsigned int cpu)
423{ 418{
424 struct clock_event_device *bc; 419 struct clock_event_device *bc;
425 unsigned long flags; 420 unsigned long flags;
426 unsigned int cpu = *cpup;
427 421
428 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 422 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
429 423
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
438 432
439 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 433 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
440} 434}
435#endif
441 436
442void tick_suspend_broadcast(void) 437void tick_suspend_broadcast(void)
443{ 438{
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
453 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 448 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
454} 449}
455 450
456int tick_resume_broadcast(void) 451/*
452 * This is called from tick_resume_local() on a resuming CPU. That's
453 * called from the core resume function, tick_unfreeze() and the magic XEN
454 * resume hackery.
455 *
456 * In none of these cases the broadcast device mode can change and the
457 * bit of the resuming CPU in the broadcast mask is safe as well.
458 */
459bool tick_resume_check_broadcast(void)
460{
461 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
462 return false;
463 else
464 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
465}
466
467void tick_resume_broadcast(void)
457{ 468{
458 struct clock_event_device *bc; 469 struct clock_event_device *bc;
459 unsigned long flags; 470 unsigned long flags;
460 int broadcast = 0;
461 471
462 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 472 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
463 473
464 bc = tick_broadcast_device.evtdev; 474 bc = tick_broadcast_device.evtdev;
465 475
466 if (bc) { 476 if (bc) {
467 clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); 477 clockevents_tick_resume(bc);
468 478
469 switch (tick_broadcast_device.mode) { 479 switch (tick_broadcast_device.mode) {
470 case TICKDEV_MODE_PERIODIC: 480 case TICKDEV_MODE_PERIODIC:
471 if (!cpumask_empty(tick_broadcast_mask)) 481 if (!cpumask_empty(tick_broadcast_mask))
472 tick_broadcast_start_periodic(bc); 482 tick_broadcast_start_periodic(bc);
473 broadcast = cpumask_test_cpu(smp_processor_id(),
474 tick_broadcast_mask);
475 break; 483 break;
476 case TICKDEV_MODE_ONESHOT: 484 case TICKDEV_MODE_ONESHOT:
477 if (!cpumask_empty(tick_broadcast_mask)) 485 if (!cpumask_empty(tick_broadcast_mask))
478 broadcast = tick_resume_broadcast_oneshot(bc); 486 tick_resume_broadcast_oneshot(bc);
479 break; 487 break;
480 } 488 }
481 } 489 }
482 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 490 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
483
484 return broadcast;
485} 491}
486 492
487
488#ifdef CONFIG_TICK_ONESHOT 493#ifdef CONFIG_TICK_ONESHOT
489 494
490static cpumask_var_t tick_broadcast_oneshot_mask; 495static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
532{ 537{
533 int ret; 538 int ret;
534 539
535 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 540 if (bc->state != CLOCK_EVT_STATE_ONESHOT)
536 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 541 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
537 542
538 ret = clockevents_program_event(bc, expires, force); 543 ret = clockevents_program_event(bc, expires, force);
539 if (!ret) 544 if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
541 return ret; 546 return ret;
542} 547}
543 548
544int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 549static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
545{ 550{
546 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 551 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
547 return 0;
548} 552}
549 553
550/* 554/*
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
562 * switched over, leave the device alone. 566 * switched over, leave the device alone.
563 */ 567 */
564 if (td->mode == TICKDEV_MODE_ONESHOT) { 568 if (td->mode == TICKDEV_MODE_ONESHOT) {
565 clockevents_set_mode(td->evtdev, 569 clockevents_set_state(td->evtdev,
566 CLOCK_EVT_MODE_ONESHOT); 570 CLOCK_EVT_STATE_ONESHOT);
567 } 571 }
568 } 572 }
569} 573}
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
666 if (dev->next_event.tv64 < bc->next_event.tv64) 670 if (dev->next_event.tv64 < bc->next_event.tv64)
667 return; 671 return;
668 } 672 }
669 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 673 clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
670} 674}
671 675
672static void broadcast_move_bc(int deadcpu) 676/**
673{ 677 * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
674 struct clock_event_device *bc = tick_broadcast_device.evtdev; 678 * @state: The target state (enter/exit)
675 679 *
 676 if (!bc || !broadcast_needs_cpu(bc, deadcpu)) 680 * The system enters/leaves a state where affected devices might stop.
677 return;
678 /* This moves the broadcast assignment to this cpu */
679 clockevents_program_event(bc, bc->next_event, 1);
680}
681
682/*
683 * Powerstate information: The system enters/leaves a state, where
684 * affected devices might stop
685 * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. 681 * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
682 *
683 * Called with interrupts disabled, so clockevents_lock is not
684 * required here because the local clock event device cannot go away
685 * under us.
686 */ 686 */
687int tick_broadcast_oneshot_control(unsigned long reason) 687int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
688{ 688{
689 struct clock_event_device *bc, *dev; 689 struct clock_event_device *bc, *dev;
690 struct tick_device *td; 690 struct tick_device *td;
691 unsigned long flags;
692 ktime_t now;
693 int cpu, ret = 0; 691 int cpu, ret = 0;
692 ktime_t now;
694 693
695 /* 694 /*
696 * Periodic mode does not care about the enter/exit of power 695 * Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
 703 * We are called with preemption disabled from the depth of the 702
704 * idle code, so we can't be moved away. 703 * idle code, so we can't be moved away.
705 */ 704 */
706 cpu = smp_processor_id(); 705 td = this_cpu_ptr(&tick_cpu_device);
707 td = &per_cpu(tick_cpu_device, cpu);
708 dev = td->evtdev; 706 dev = td->evtdev;
709 707
710 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 708 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
711 return 0; 709 return 0;
712 710
711 raw_spin_lock(&tick_broadcast_lock);
713 bc = tick_broadcast_device.evtdev; 712 bc = tick_broadcast_device.evtdev;
713 cpu = smp_processor_id();
714 714
715 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 715 if (state == TICK_BROADCAST_ENTER) {
716 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
717 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { 716 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
718 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); 717 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
719 broadcast_shutdown_local(bc, dev); 718 broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
741 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 740 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
742 } else { 741 } else {
743 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 742 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
744 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 743 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
745 /* 744 /*
746 * The cpu which was handling the broadcast 745 * The cpu which was handling the broadcast
747 * timer marked this cpu in the broadcast 746 * timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
805 } 804 }
806 } 805 }
807out: 806out:
808 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 807 raw_spin_unlock(&tick_broadcast_lock);
809 return ret; 808 return ret;
810} 809}
810EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
811 811
812/* 812/*
813 * Reset the one shot broadcast for a cpu 813 * Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
842 842
843 /* Set it up only once ! */ 843 /* Set it up only once ! */
844 if (bc->event_handler != tick_handle_oneshot_broadcast) { 844 if (bc->event_handler != tick_handle_oneshot_broadcast) {
845 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 845 int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
846 846
847 bc->event_handler = tick_handle_oneshot_broadcast; 847 bc->event_handler = tick_handle_oneshot_broadcast;
848 848
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
858 tick_broadcast_oneshot_mask, tmpmask); 858 tick_broadcast_oneshot_mask, tmpmask);
859 859
860 if (was_periodic && !cpumask_empty(tmpmask)) { 860 if (was_periodic && !cpumask_empty(tmpmask)) {
861 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 861 clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
862 tick_broadcast_init_next_event(tmpmask, 862 tick_broadcast_init_next_event(tmpmask,
863 tick_next_period); 863 tick_next_period);
864 tick_broadcast_set_event(bc, cpu, tick_next_period, 1); 864 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
894 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 894 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
895} 895}
896 896
897#ifdef CONFIG_HOTPLUG_CPU
898void hotplug_cpu__broadcast_tick_pull(int deadcpu)
899{
900 struct clock_event_device *bc;
901 unsigned long flags;
902
903 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
904 bc = tick_broadcast_device.evtdev;
905
906 if (bc && broadcast_needs_cpu(bc, deadcpu)) {
907 /* This moves the broadcast assignment to this CPU: */
908 clockevents_program_event(bc, bc->next_event, 1);
909 }
910 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
911}
897 912
898/* 913/*
899 * Remove a dead CPU from broadcasting 914 * Remove a dead CPU from broadcasting
900 */ 915 */
901void tick_shutdown_broadcast_oneshot(unsigned int *cpup) 916void tick_shutdown_broadcast_oneshot(unsigned int cpu)
902{ 917{
903 unsigned long flags; 918 unsigned long flags;
904 unsigned int cpu = *cpup;
905 919
906 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 920 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
907 921
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
913 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); 927 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
914 cpumask_clear_cpu(cpu, tick_broadcast_force_mask); 928 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
915 929
916 broadcast_move_bc(cpu);
917
918 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 930 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
919} 931}
932#endif
920 933
921/* 934/*
 922 * Check whether the broadcast device is in one shot mode 935
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f7c515595b42..3ae6afa1eb98 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
102 102
103 tick_periodic(cpu); 103 tick_periodic(cpu);
104 104
105 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 105 if (dev->state != CLOCK_EVT_STATE_ONESHOT)
106 return; 106 return;
107 for (;;) { 107 for (;;) {
108 /* 108 /*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
140 140
141 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && 141 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
142 !tick_broadcast_oneshot_active()) { 142 !tick_broadcast_oneshot_active()) {
143 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 143 clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
144 } else { 144 } else {
145 unsigned long seq; 145 unsigned long seq;
146 ktime_t next; 146 ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
150 next = tick_next_period; 150 next = tick_next_period;
151 } while (read_seqretry(&jiffies_lock, seq)); 151 } while (read_seqretry(&jiffies_lock, seq));
152 152
153 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 153 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
154 154
155 for (;;) { 155 for (;;) {
156 if (!clockevents_program_event(dev, next, false)) 156 if (!clockevents_program_event(dev, next, false))
@@ -332,14 +332,16 @@ out_bc:
332 tick_install_broadcast_device(newdev); 332 tick_install_broadcast_device(newdev);
333} 333}
334 334
335#ifdef CONFIG_HOTPLUG_CPU
335/* 336/*
336 * Transfer the do_timer job away from a dying cpu. 337 * Transfer the do_timer job away from a dying cpu.
337 * 338 *
 338 * Called with interrupts disabled. 339 * Called with interrupts disabled. No locking required. If
340 * tick_do_timer_cpu is owned by this cpu, nothing can change it.
339 */ 341 */
340void tick_handover_do_timer(int *cpup) 342void tick_handover_do_timer(void)
341{ 343{
342 if (*cpup == tick_do_timer_cpu) { 344 if (tick_do_timer_cpu == smp_processor_id()) {
343 int cpu = cpumask_first(cpu_online_mask); 345 int cpu = cpumask_first(cpu_online_mask);
344 346
345 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : 347 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
354 * access the hardware device itself. 356 * access the hardware device itself.
355 * We just set the mode and remove it from the lists. 357 * We just set the mode and remove it from the lists.
356 */ 358 */
357void tick_shutdown(unsigned int *cpup) 359void tick_shutdown(unsigned int cpu)
358{ 360{
359 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 361 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
360 struct clock_event_device *dev = td->evtdev; 362 struct clock_event_device *dev = td->evtdev;
361 363
362 td->mode = TICKDEV_MODE_PERIODIC; 364 td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
365 * Prevent that the clock events layer tries to call 367 * Prevent that the clock events layer tries to call
366 * the set mode function! 368 * the set mode function!
367 */ 369 */
370 dev->state = CLOCK_EVT_STATE_DETACHED;
368 dev->mode = CLOCK_EVT_MODE_UNUSED; 371 dev->mode = CLOCK_EVT_MODE_UNUSED;
369 clockevents_exchange_device(dev, NULL); 372 clockevents_exchange_device(dev, NULL);
370 dev->event_handler = clockevents_handle_noop; 373 dev->event_handler = clockevents_handle_noop;
371 td->evtdev = NULL; 374 td->evtdev = NULL;
372 } 375 }
373} 376}
377#endif
374 378
375void tick_suspend(void) 379/**
380 * tick_suspend_local - Suspend the local tick device
381 *
382 * Called from the local cpu for freeze with interrupts disabled.
383 *
384 * No locks required. Nothing can change the per cpu device.
385 */
386void tick_suspend_local(void)
376{ 387{
377 struct tick_device *td = this_cpu_ptr(&tick_cpu_device); 388 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
378 389
379 clockevents_shutdown(td->evtdev); 390 clockevents_shutdown(td->evtdev);
380} 391}
381 392
382void tick_resume(void) 393/**
394 * tick_resume_local - Resume the local tick device
395 *
396 * Called from the local CPU for unfreeze or XEN resume magic.
397 *
398 * No locks required. Nothing can change the per cpu device.
399 */
400void tick_resume_local(void)
383{ 401{
384 struct tick_device *td = this_cpu_ptr(&tick_cpu_device); 402 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
385 int broadcast = tick_resume_broadcast(); 403 bool broadcast = tick_resume_check_broadcast();
386
387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
388 404
405 clockevents_tick_resume(td->evtdev);
389 if (!broadcast) { 406 if (!broadcast) {
390 if (td->mode == TICKDEV_MODE_PERIODIC) 407 if (td->mode == TICKDEV_MODE_PERIODIC)
391 tick_setup_periodic(td->evtdev, 0); 408 tick_setup_periodic(td->evtdev, 0);
@@ -394,6 +411,35 @@ void tick_resume(void)
394 } 411 }
395} 412}
396 413
414/**
415 * tick_suspend - Suspend the tick and the broadcast device
416 *
417 * Called from syscore_suspend() via timekeeping_suspend with only one
418 * CPU online and interrupts disabled or from tick_unfreeze() under
419 * tick_freeze_lock.
420 *
421 * No locks required. Nothing can change the per cpu device.
422 */
423void tick_suspend(void)
424{
425 tick_suspend_local();
426 tick_suspend_broadcast();
427}
428
429/**
430 * tick_resume - Resume the tick and the broadcast device
431 *
432 * Called from syscore_resume() via timekeeping_resume with only one
433 * CPU online and interrupts disabled.
434 *
435 * No locks required. Nothing can change the per cpu device.
436 */
437void tick_resume(void)
438{
439 tick_resume_broadcast();
440 tick_resume_local();
441}
442
397static DEFINE_RAW_SPINLOCK(tick_freeze_lock); 443static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
398static unsigned int tick_freeze_depth; 444static unsigned int tick_freeze_depth;
399 445
@@ -411,12 +457,10 @@ void tick_freeze(void)
411 raw_spin_lock(&tick_freeze_lock); 457 raw_spin_lock(&tick_freeze_lock);
412 458
413 tick_freeze_depth++; 459 tick_freeze_depth++;
414 if (tick_freeze_depth == num_online_cpus()) { 460 if (tick_freeze_depth == num_online_cpus())
415 timekeeping_suspend(); 461 timekeeping_suspend();
416 } else { 462 else
417 tick_suspend(); 463 tick_suspend_local();
418 tick_suspend_broadcast();
419 }
420 464
421 raw_spin_unlock(&tick_freeze_lock); 465 raw_spin_unlock(&tick_freeze_lock);
422} 466}
@@ -437,7 +481,7 @@ void tick_unfreeze(void)
437 if (tick_freeze_depth == num_online_cpus()) 481 if (tick_freeze_depth == num_online_cpus())
438 timekeeping_resume(); 482 timekeeping_resume();
439 else 483 else
440 tick_resume(); 484 tick_resume_local();
441 485
442 tick_freeze_depth--; 486 tick_freeze_depth--;
443 487
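tick_freeze() and tick_unfreeze() above share one counter: each freezing CPU bumps tick_freeze_depth and only suspends its local tick, except for the last CPU, which suspends timekeeping as a whole; unfreezing mirrors this, so the first CPU out resumes timekeeping and the rest only resume their local tick. A toy model of that counting, with the locking and per-CPU devices omitted and illustrative names:

/*
 * Toy model of the tick_freeze()/tick_unfreeze() depth counting above.
 * Locking, per-CPU tick devices and broadcast handling are omitted and the
 * names are illustrative only.
 */
#include <stdio.h>

#define NR_CPUS 4

static int freeze_depth;

static void freeze_one_cpu(int cpu)
{
        freeze_depth++;
        if (freeze_depth == NR_CPUS)
                printf("cpu%d: last to freeze, suspend timekeeping\n", cpu);
        else
                printf("cpu%d: suspend local tick only\n", cpu);
}

static void unfreeze_one_cpu(int cpu)
{
        if (freeze_depth == NR_CPUS)
                printf("cpu%d: first to unfreeze, resume timekeeping\n", cpu);
        else
                printf("cpu%d: resume local tick only\n", cpu);
        freeze_depth--;
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                freeze_one_cpu(cpu);
        for (int cpu = NR_CPUS - 1; cpu >= 0; cpu--)
                unfreeze_one_cpu(cpu);
        return 0;
}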
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4f2c66..b64fdd8054c5 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,15 +5,12 @@
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7#include "timekeeping.h" 7#include "timekeeping.h"
8#include "tick-sched.h"
8 9
9extern seqlock_t jiffies_lock; 10#ifdef CONFIG_GENERIC_CLOCKEVENTS
10 11
11#define CS_NAME_LEN 32 12# define TICK_DO_TIMER_NONE -1
12 13# define TICK_DO_TIMER_BOOT -2
13#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
14
15#define TICK_DO_TIMER_NONE -1
16#define TICK_DO_TIMER_BOOT -2
17 14
18DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 15DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
19extern ktime_t tick_next_period; 16extern ktime_t tick_next_period;
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
23extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 20extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
24extern void tick_handle_periodic(struct clock_event_device *dev); 21extern void tick_handle_periodic(struct clock_event_device *dev);
25extern void tick_check_new_device(struct clock_event_device *dev); 22extern void tick_check_new_device(struct clock_event_device *dev);
26extern void tick_handover_do_timer(int *cpup); 23extern void tick_shutdown(unsigned int cpu);
27extern void tick_shutdown(unsigned int *cpup);
28extern void tick_suspend(void); 24extern void tick_suspend(void);
29extern void tick_resume(void); 25extern void tick_resume(void);
30extern bool tick_check_replacement(struct clock_event_device *curdev, 26extern bool tick_check_replacement(struct clock_event_device *curdev,
31 struct clock_event_device *newdev); 27 struct clock_event_device *newdev);
32extern void tick_install_replacement(struct clock_event_device *dev); 28extern void tick_install_replacement(struct clock_event_device *dev);
29extern int tick_is_oneshot_available(void);
30extern struct tick_device *tick_get_device(int cpu);
33 31
34extern void clockevents_shutdown(struct clock_event_device *dev); 32extern int clockevents_tick_resume(struct clock_event_device *dev);
33/* Check, if the device is functional or a dummy for broadcast */
34static inline int tick_device_is_functional(struct clock_event_device *dev)
35{
36 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
37}
35 38
39extern void clockevents_shutdown(struct clock_event_device *dev);
40extern void clockevents_exchange_device(struct clock_event_device *old,
41 struct clock_event_device *new);
42extern void clockevents_set_state(struct clock_event_device *dev,
43 enum clock_event_state state);
44extern int clockevents_program_event(struct clock_event_device *dev,
45 ktime_t expires, bool force);
46extern void clockevents_handle_noop(struct clock_event_device *dev);
47extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
36extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 48extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
37 49
38/* 50/* Broadcasting support */
39 * NO_HZ / high resolution timer shared code 51# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
40 */ 52extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
53extern void tick_install_broadcast_device(struct clock_event_device *dev);
54extern int tick_is_broadcast_device(struct clock_event_device *dev);
55extern void tick_shutdown_broadcast(unsigned int cpu);
56extern void tick_suspend_broadcast(void);
57extern void tick_resume_broadcast(void);
58extern bool tick_resume_check_broadcast(void);
59extern void tick_broadcast_init(void);
60extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
61extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
62extern struct tick_device *tick_get_broadcast_device(void);
63extern struct cpumask *tick_get_broadcast_mask(void);
64# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
65static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
66static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
67static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
68static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
69static inline void tick_shutdown_broadcast(unsigned int cpu) { }
70static inline void tick_suspend_broadcast(void) { }
71static inline void tick_resume_broadcast(void) { }
72static inline bool tick_resume_check_broadcast(void) { return false; }
73static inline void tick_broadcast_init(void) { }
74static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
75
76/* Set the periodic handler in non broadcast mode */
77static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
78{
79 dev->event_handler = tick_handle_periodic;
80}
81# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
82
83#else /* !GENERIC_CLOCKEVENTS: */
84static inline void tick_suspend(void) { }
85static inline void tick_resume(void) { }
86#endif /* !GENERIC_CLOCKEVENTS */
87
88/* Oneshot related functions */
41#ifdef CONFIG_TICK_ONESHOT 89#ifdef CONFIG_TICK_ONESHOT
42extern void tick_setup_oneshot(struct clock_event_device *newdev, 90extern void tick_setup_oneshot(struct clock_event_device *newdev,
43 void (*handler)(struct clock_event_device *), 91 void (*handler)(struct clock_event_device *),
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force);
46extern void tick_oneshot_notify(void); 94extern void tick_oneshot_notify(void);
47extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 95extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
48extern void tick_resume_oneshot(void); 96extern void tick_resume_oneshot(void);
49# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 97static inline bool tick_oneshot_possible(void) { return true; }
98extern int tick_oneshot_mode_active(void);
99extern void tick_clock_notify(void);
100extern int tick_check_oneshot_change(int allow_nohz);
101extern int tick_init_highres(void);
102#else /* !CONFIG_TICK_ONESHOT: */
103static inline
104void tick_setup_oneshot(struct clock_event_device *newdev,
105 void (*handler)(struct clock_event_device *),
106 ktime_t nextevt) { BUG(); }
107static inline void tick_resume_oneshot(void) { BUG(); }
108static inline int tick_program_event(ktime_t expires, int force) { return 0; }
109static inline void tick_oneshot_notify(void) { }
110static inline bool tick_oneshot_possible(void) { return false; }
111static inline int tick_oneshot_mode_active(void) { return 0; }
112static inline void tick_clock_notify(void) { }
113static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
114#endif /* !CONFIG_TICK_ONESHOT */
115
116/* Functions related to oneshot broadcasting */
117#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
50extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 118extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
51extern int tick_broadcast_oneshot_control(unsigned long reason);
52extern void tick_broadcast_switch_to_oneshot(void); 119extern void tick_broadcast_switch_to_oneshot(void);
53extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 120extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
54extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
55extern int tick_broadcast_oneshot_active(void); 121extern int tick_broadcast_oneshot_active(void);
56extern void tick_check_oneshot_broadcast_this_cpu(void); 122extern void tick_check_oneshot_broadcast_this_cpu(void);
57bool tick_broadcast_oneshot_available(void); 123bool tick_broadcast_oneshot_available(void);
58# else /* BROADCAST */ 124extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
59static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 125#else /* !(BROADCAST && ONESHOT): */
60{ 126static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
61 BUG();
62}
63static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
64static inline void tick_broadcast_switch_to_oneshot(void) { } 127static inline void tick_broadcast_switch_to_oneshot(void) { }
65static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 128static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
66static inline int tick_broadcast_oneshot_active(void) { return 0; } 129static inline int tick_broadcast_oneshot_active(void) { return 0; }
67static inline void tick_check_oneshot_broadcast_this_cpu(void) { } 130static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
68static inline bool tick_broadcast_oneshot_available(void) { return true; } 131static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
69# endif /* !BROADCAST */ 132#endif /* !(BROADCAST && ONESHOT) */
70
71#else /* !ONESHOT */
72static inline
73void tick_setup_oneshot(struct clock_event_device *newdev,
74 void (*handler)(struct clock_event_device *),
75 ktime_t nextevt)
76{
77 BUG();
78}
79static inline void tick_resume_oneshot(void)
80{
81 BUG();
82}
83static inline int tick_program_event(ktime_t expires, int force)
84{
85 return 0;
86}
87static inline void tick_oneshot_notify(void) { }
88static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
89{
90 BUG();
91}
92static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
93static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
94static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
95{
96 return 0;
97}
98static inline int tick_broadcast_oneshot_active(void) { return 0; }
99static inline bool tick_broadcast_oneshot_available(void) { return false; }
100#endif /* !TICK_ONESHOT */
101 133
102/* NO_HZ_FULL internal */ 134/* NO_HZ_FULL internal */
103#ifdef CONFIG_NO_HZ_FULL 135#ifdef CONFIG_NO_HZ_FULL
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void);
105# else 137# else
106static inline void tick_nohz_init(void) { } 138static inline void tick_nohz_init(void) { }
107#endif 139#endif
108
109/*
110 * Broadcasting support
111 */
112#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
113extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
114extern void tick_install_broadcast_device(struct clock_event_device *dev);
115extern int tick_is_broadcast_device(struct clock_event_device *dev);
116extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
117extern void tick_shutdown_broadcast(unsigned int *cpup);
118extern void tick_suspend_broadcast(void);
119extern int tick_resume_broadcast(void);
120extern void tick_broadcast_init(void);
121extern void
122tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
123int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
124
125#else /* !BROADCAST */
126
127static inline void tick_install_broadcast_device(struct clock_event_device *dev)
128{
129}
130
131static inline int tick_is_broadcast_device(struct clock_event_device *dev)
132{
133 return 0;
134}
135static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
136 int cpu)
137{
138 return 0;
139}
140static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
141static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
142static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
143static inline void tick_suspend_broadcast(void) { }
144static inline int tick_resume_broadcast(void) { return 0; }
145static inline void tick_broadcast_init(void) { }
146static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
147 u32 freq) { return -ENODEV; }
148
149/*
150 * Set the periodic handler in non broadcast mode
151 */
152static inline void tick_set_periodic_handler(struct clock_event_device *dev,
153 int broadcast)
154{
155 dev->event_handler = tick_handle_periodic;
156}
157#endif /* !BROADCAST */
158
159/*
160 * Check, if the device is functional or a dummy for broadcast
161 */
162static inline int tick_device_is_functional(struct clock_event_device *dev)
163{
164 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
165}
166
167int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
168
169#endif
170
171extern void do_timer(unsigned long ticks);
172extern void update_wall_time(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 7ce740e78e1b..67a64b1670bf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
38{ 38{
39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
40 40
41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 41 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
42 clockevents_program_event(dev, ktime_get(), true); 42 clockevents_program_event(dev, ktime_get(), true);
43} 43}
44 44
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
50 ktime_t next_event) 50 ktime_t next_event)
51{ 51{
52 newdev->event_handler = handler; 52 newdev->event_handler = handler;
53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 53 clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
54 clockevents_program_event(newdev, next_event, true); 54 clockevents_program_event(newdev, next_event, true);
55} 55}
56 56
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
81 81
82 td->mode = TICKDEV_MODE_ONESHOT; 82 td->mode = TICKDEV_MODE_ONESHOT;
83 dev->event_handler = handler; 83 dev->event_handler = handler;
84 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 84 clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
85 tick_broadcast_switch_to_oneshot(); 85 tick_broadcast_switch_to_oneshot();
86 return 0; 86 return 0;
87} 87}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a4c4edac4528..914259128145 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
34/* 34/*
35 * Per cpu nohz control structure 35 * Per cpu nohz control structure
36 */ 36 */
37DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 37static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
38 38
39/* 39/*
40 * The time, when the last jiffy update happened. Protected by jiffies_lock. 40 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
416 416
417__setup("nohz=", setup_tick_nohz); 417__setup("nohz=", setup_tick_nohz);
418 418
419int tick_nohz_tick_stopped(void)
420{
421 return __this_cpu_read(tick_cpu_sched.tick_stopped);
422}
423
419/** 424/**
420 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 425 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
421 * 426 *
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000000..28b5da3e1a17
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
1#ifndef _TICK_SCHED_H
2#define _TICK_SCHED_H
3
4#include <linux/hrtimer.h>
5
6enum tick_device_mode {
7 TICKDEV_MODE_PERIODIC,
8 TICKDEV_MODE_ONESHOT,
9};
10
11struct tick_device {
12 struct clock_event_device *evtdev;
13 enum tick_device_mode mode;
14};
15
16enum tick_nohz_mode {
17 NOHZ_MODE_INACTIVE,
18 NOHZ_MODE_LOWRES,
19 NOHZ_MODE_HIGHRES,
20};
21
22/**
23 * struct tick_sched - sched tick emulation and no idle tick control/stats
24 * @sched_timer: hrtimer to schedule the periodic tick in high
25 * resolution mode
26 * @last_tick: Store the last tick expiry time when the tick
27 * timer is modified for nohz sleeps. This is necessary
28 * to resume the tick timer operation in the timeline
29 * when the CPU returns from nohz sleep.
30 * @tick_stopped: Indicator that the idle tick has been stopped
31 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
32 * @idle_calls: Total number of idle calls
33 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
34 * @idle_entrytime: Time when the idle call was entered
35 * @idle_waketime: Time when the idle was interrupted
36 * @idle_exittime: Time when the idle state was left
37 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
38 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
39 * @sleep_length: Duration of the current idle sleep
40 * @do_timer_lst: CPU was the last one doing do_timer before going idle
41 */
42struct tick_sched {
43 struct hrtimer sched_timer;
44 unsigned long check_clocks;
45 enum tick_nohz_mode nohz_mode;
46 ktime_t last_tick;
47 int inidle;
48 int tick_stopped;
49 unsigned long idle_jiffies;
50 unsigned long idle_calls;
51 unsigned long idle_sleeps;
52 int idle_active;
53 ktime_t idle_entrytime;
54 ktime_t idle_waketime;
55 ktime_t idle_exittime;
56 ktime_t idle_sleeptime;
57 ktime_t iowait_sleeptime;
58 ktime_t sleep_length;
59 unsigned long last_jiffies;
60 unsigned long next_jiffies;
61 ktime_t idle_expires;
62 int do_timer_last;
63};
64
65extern struct tick_sched *tick_get_tick_sched(int cpu);
66
67extern void tick_setup_sched_timer(void);
68#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
69extern void tick_cancel_sched_timer(int cpu);
70#else
71static inline void tick_cancel_sched_timer(int cpu) { }
72#endif
73
74#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 91db94136c10..946acb72179f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -59,17 +59,15 @@ struct tk_fast {
59}; 59};
60 60
61static struct tk_fast tk_fast_mono ____cacheline_aligned; 61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62static struct tk_fast tk_fast_raw ____cacheline_aligned;
62 63
63/* flag for if timekeeping is suspended */ 64/* flag for if timekeeping is suspended */
64int __read_mostly timekeeping_suspended; 65int __read_mostly timekeeping_suspended;
65 66
66/* Flag for if there is a persistent clock on this platform */
67bool __read_mostly persistent_clock_exist = false;
68
69static inline void tk_normalize_xtime(struct timekeeper *tk) 67static inline void tk_normalize_xtime(struct timekeeper *tk)
70{ 68{
71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { 69 while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; 70 tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
73 tk->xtime_sec++; 71 tk->xtime_sec++;
74 } 72 }
75} 73}
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
79 struct timespec64 ts; 77 struct timespec64 ts;
80 78
81 ts.tv_sec = tk->xtime_sec; 79 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); 80 ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
83 return ts; 81 return ts;
84} 82}
85 83
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) 84static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
87{ 85{
88 tk->xtime_sec = ts->tv_sec; 86 tk->xtime_sec = ts->tv_sec;
89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; 87 tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
90} 88}
91 89
92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) 90static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
93{ 91{
94 tk->xtime_sec += ts->tv_sec; 92 tk->xtime_sec += ts->tv_sec;
95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; 93 tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
96 tk_normalize_xtime(tk); 94 tk_normalize_xtime(tk);
97} 95}
98 96
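The tkr_mono/tkr_raw fields touched above keep xtime_nsec in shifted nanoseconds (ns << shift), which is why tk_xtime() shifts right before filling tv_nsec and why tk_normalize_xtime() compares against NSEC_PER_SEC << shift when carrying whole seconds into xtime_sec; the extra bits preserve the sub-nanosecond remainder of the mult/shift conversion between updates. A minimal standalone sketch of that representation; the struct is a stand-in, not the kernel's timekeeper:

/*
 * Sketch of the shifted-nanosecond accumulator behind the fields above:
 * xtime_nsec stores ns << shift so sub-nanosecond remainders of the
 * mult/shift conversion are kept between updates. The struct is a stand-in,
 * not the kernel's timekeeper.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct toy_tk {
        uint64_t xtime_sec;
        uint64_t xtime_nsec;    /* nanoseconds << shift */
        uint32_t shift;
};

static void normalize(struct toy_tk *tk)
{
        while (tk->xtime_nsec >= (NSEC_PER_SEC << tk->shift)) {
                tk->xtime_nsec -= NSEC_PER_SEC << tk->shift;
                tk->xtime_sec++;
        }
}

int main(void)
{
        struct toy_tk tk = { .xtime_sec = 100, .shift = 8 };

        /* Accumulate 1.5s expressed in shifted nanoseconds. */
        tk.xtime_nsec += (uint64_t)1500000000 << tk.shift;
        normalize(&tk);

        /* Prints sec=101 nsec=500000000 */
        printf("sec=%llu nsec=%llu\n",
               (unsigned long long)tk.xtime_sec,
               (unsigned long long)(tk.xtime_nsec >> tk.shift));
        return 0;
}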
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
118 tk->offs_boot = ktime_add(tk->offs_boot, delta); 116 tk->offs_boot = ktime_add(tk->offs_boot, delta);
119} 117}
120 118
119#ifdef CONFIG_DEBUG_TIMEKEEPING
120#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
121/*
122 * These simple flag variables are managed
123 * without locks, which is racy, but ok since
124 * we don't really care about being super
125 * precise about how many events were seen,
126 * just that a problem was observed.
127 */
128static int timekeeping_underflow_seen;
129static int timekeeping_overflow_seen;
130
131/* last_warning is only modified under the timekeeping lock */
132static long timekeeping_last_warning;
133
134static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
135{
136
137 cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
138 const char *name = tk->tkr_mono.clock->name;
139
140 if (offset > max_cycles) {
141 printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
142 offset, name, max_cycles);
143 printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
144 } else {
145 if (offset > (max_cycles >> 1)) {
146 printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
147 offset, name, max_cycles >> 1);
148 printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
149 }
150 }
151
152 if (timekeeping_underflow_seen) {
153 if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
154 printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
155 printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
156 printk_deferred(" Your kernel is probably still fine.\n");
157 timekeeping_last_warning = jiffies;
158 }
159 timekeeping_underflow_seen = 0;
160 }
161
162 if (timekeeping_overflow_seen) {
163 if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
164 printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
165 printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
166 printk_deferred(" Your kernel is probably still fine.\n");
167 timekeeping_last_warning = jiffies;
168 }
169 timekeeping_overflow_seen = 0;
170 }
171}
172
173static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
174{
175 cycle_t now, last, mask, max, delta;
176 unsigned int seq;
177
178 /*
179 * Since we're called holding a seqlock, the data may shift
180 * under us while we're doing the calculation. This can cause
181 * false positives, since we'd note a problem but throw the
182 * results away. So nest another seqlock here to atomically
183 * grab the points we are checking with.
184 */
185 do {
186 seq = read_seqcount_begin(&tk_core.seq);
187 now = tkr->read(tkr->clock);
188 last = tkr->cycle_last;
189 mask = tkr->mask;
190 max = tkr->clock->max_cycles;
191 } while (read_seqcount_retry(&tk_core.seq, seq));
192
193 delta = clocksource_delta(now, last, mask);
194
195 /*
196 * Try to catch underflows by checking if we are seeing small
197 * mask-relative negative values.
198 */
199 if (unlikely((~delta & mask) < (mask >> 3))) {
200 timekeeping_underflow_seen = 1;
201 delta = 0;
202 }
203
204 /* Cap delta value to the max_cycles values to avoid mult overflows */
205 if (unlikely(delta > max)) {
206 timekeeping_overflow_seen = 1;
207 delta = tkr->clock->max_cycles;
208 }
209
210 return delta;
211}
212#else
213static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
214{
215}
216static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
217{
218 cycle_t cycle_now, delta;
219
220 /* read clocksource */
221 cycle_now = tkr->read(tkr->clock);
222
223 /* calculate the delta since the last update_wall_time */
224 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
225
226 return delta;
227}
228#endif
229
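The CONFIG_DEBUG_TIMEKEEPING path above recomputes the cycle delta so it can sanity-check it: a delta whose complement relative to the mask is small is treated as a clocksource underflow (a slightly negative reading) and zeroed, and a delta beyond max_cycles is capped so the later multiplication cannot overflow. A standalone sketch of those two checks, assuming clocksource_delta() is the usual masked subtraction (now - last) & mask, which is not shown in this hunk:

/*
 * Standalone sketch of the delta sanity checks in timekeeping_get_delta()
 * above. Assumption: clocksource_delta() is the usual masked subtraction
 * (now - last) & mask, which is not shown in this hunk.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t checked_delta(uint64_t now, uint64_t last,
                              uint64_t mask, uint64_t max_cycles)
{
        uint64_t delta = (now - last) & mask;

        /* A small mask-relative negative value: treat it as an underflow. */
        if ((~delta & mask) < (mask >> 3)) {
                printf("underflow seen, delta forced to 0\n");
                return 0;
        }

        /* Cap runaway deltas so the later multiplication cannot overflow. */
        if (delta > max_cycles) {
                printf("overflow seen, delta capped\n");
                return max_cycles;
        }

        return delta;
}

int main(void)
{
        uint64_t mask = (1ULL << 32) - 1;       /* 32-bit counter */
        uint64_t max  = 1ULL << 28;             /* example max_cycles */

        printf("%llu\n", (unsigned long long)checked_delta(1000, 900, mask, max));
        /* now < last looks like a huge masked delta whose complement is tiny */
        printf("%llu\n", (unsigned long long)checked_delta(900, 1000, mask, max));
        printf("%llu\n", (unsigned long long)checked_delta(1ULL << 30, 0, mask, max));
        return 0;
}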
121/** 230/**
122 * tk_setup_internals - Set up internals to use clocksource clock. 231 * tk_setup_internals - Set up internals to use clocksource clock.
123 * 232 *
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
135 u64 tmp, ntpinterval; 244 u64 tmp, ntpinterval;
136 struct clocksource *old_clock; 245 struct clocksource *old_clock;
137 246
138 old_clock = tk->tkr.clock; 247 old_clock = tk->tkr_mono.clock;
139 tk->tkr.clock = clock; 248 tk->tkr_mono.clock = clock;
140 tk->tkr.read = clock->read; 249 tk->tkr_mono.read = clock->read;
141 tk->tkr.mask = clock->mask; 250 tk->tkr_mono.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock); 251 tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
252
253 tk->tkr_raw.clock = clock;
254 tk->tkr_raw.read = clock->read;
255 tk->tkr_raw.mask = clock->mask;
256 tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
143 257
144 /* Do the ns -> cycle conversion first, using original mult */ 258 /* Do the ns -> cycle conversion first, using original mult */
145 tmp = NTP_INTERVAL_LENGTH; 259 tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
163 if (old_clock) { 277 if (old_clock) {
164 int shift_change = clock->shift - old_clock->shift; 278 int shift_change = clock->shift - old_clock->shift;
165 if (shift_change < 0) 279 if (shift_change < 0)
166 tk->tkr.xtime_nsec >>= -shift_change; 280 tk->tkr_mono.xtime_nsec >>= -shift_change;
167 else 281 else
168 tk->tkr.xtime_nsec <<= shift_change; 282 tk->tkr_mono.xtime_nsec <<= shift_change;
169 } 283 }
170 tk->tkr.shift = clock->shift; 284 tk->tkr_raw.xtime_nsec = 0;
285
286 tk->tkr_mono.shift = clock->shift;
287 tk->tkr_raw.shift = clock->shift;
171 288
172 tk->ntp_error = 0; 289 tk->ntp_error = 0;
173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 290 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
178 * active clocksource. These value will be adjusted via NTP 295 * active clocksource. These value will be adjusted via NTP
179 * to counteract clock drifting. 296 * to counteract clock drifting.
180 */ 297 */
181 tk->tkr.mult = clock->mult; 298 tk->tkr_mono.mult = clock->mult;
299 tk->tkr_raw.mult = clock->mult;
182 tk->ntp_err_mult = 0; 300 tk->ntp_err_mult = 0;
183} 301}
184 302
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
193 311
194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) 312static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
195{ 313{
196 cycle_t cycle_now, delta; 314 cycle_t delta;
197 s64 nsec; 315 s64 nsec;
198 316
199 /* read clocksource: */ 317 delta = timekeeping_get_delta(tkr);
200 cycle_now = tkr->read(tkr->clock);
201
202 /* calculate the delta since the last update_wall_time: */
203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
204 318
205 nsec = delta * tkr->mult + tkr->xtime_nsec; 319 nsec = delta * tkr->mult + tkr->xtime_nsec;
206 nsec >>= tkr->shift; 320 nsec >>= tkr->shift;
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
209 return nsec + arch_gettimeoffset(); 323 return nsec + arch_gettimeoffset();
210} 324}
211 325
212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
213{
214 struct clocksource *clock = tk->tkr.clock;
215 cycle_t cycle_now, delta;
216 s64 nsec;
217
218 /* read clocksource: */
219 cycle_now = tk->tkr.read(clock);
220
221 /* calculate the delta since the last update_wall_time: */
222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
223
224 /* convert delta to nanoseconds. */
225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
226
227 /* If arch requires, add in get_arch_timeoffset() */
228 return nsec + arch_gettimeoffset();
229}
230
231/** 326/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 327 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tkr: Timekeeping readout base from which we take the update 328 * @tkr: Timekeeping readout base from which we take the update
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
267 * slightly wrong timestamp (a few nanoseconds). See 362 * slightly wrong timestamp (a few nanoseconds). See
268 * @ktime_get_mono_fast_ns. 363 * @ktime_get_mono_fast_ns.
269 */ 364 */
270static void update_fast_timekeeper(struct tk_read_base *tkr) 365static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
271{ 366{
272 struct tk_read_base *base = tk_fast_mono.base; 367 struct tk_read_base *base = tkf->base;
273 368
274 /* Force readers off to base[1] */ 369 /* Force readers off to base[1] */
275 raw_write_seqcount_latch(&tk_fast_mono.seq); 370 raw_write_seqcount_latch(&tkf->seq);
276 371
277 /* Update base[0] */ 372 /* Update base[0] */
278 memcpy(base, tkr, sizeof(*base)); 373 memcpy(base, tkr, sizeof(*base));
279 374
280 /* Force readers back to base[0] */ 375 /* Force readers back to base[0] */
281 raw_write_seqcount_latch(&tk_fast_mono.seq); 376 raw_write_seqcount_latch(&tkf->seq);
282 377
283 /* Update base[1] */ 378 /* Update base[1] */
284 memcpy(base + 1, base, sizeof(*base)); 379 memcpy(base + 1, base, sizeof(*base));
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr)
316 * of the following timestamps. Callers need to be aware of that and 411 * of the following timestamps. Callers need to be aware of that and
317 * deal with it. 412 * deal with it.
318 */ 413 */
319u64 notrace ktime_get_mono_fast_ns(void) 414static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
320{ 415{
321 struct tk_read_base *tkr; 416 struct tk_read_base *tkr;
322 unsigned int seq; 417 unsigned int seq;
323 u64 now; 418 u64 now;
324 419
325 do { 420 do {
326 seq = raw_read_seqcount(&tk_fast_mono.seq); 421 seq = raw_read_seqcount(&tkf->seq);
327 tkr = tk_fast_mono.base + (seq & 0x01); 422 tkr = tkf->base + (seq & 0x01);
328 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); 423 now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
424 } while (read_seqcount_retry(&tkf->seq, seq));
329 425
330 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
331 return now; 426 return now;
332} 427}
428
429u64 ktime_get_mono_fast_ns(void)
430{
431 return __ktime_get_fast_ns(&tk_fast_mono);
432}
333EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); 433EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
334 434
435u64 ktime_get_raw_fast_ns(void)
436{
437 return __ktime_get_fast_ns(&tk_fast_raw);
438}
439EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
440
335/* Suspend-time cycles value for halted fast timekeeper. */ 441/* Suspend-time cycles value for halted fast timekeeper. */
336static cycle_t cycles_at_suspend; 442static cycle_t cycles_at_suspend;
337 443
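/*
 * The update_fast_timekeeper()/__ktime_get_fast_ns() pair above uses a
 * seqcount latch over two copies of the readout base.  A single-threaded
 * sketch of that indexing and retry logic follows; the real code gets its
 * memory ordering from raw_write_seqcount_latch()/raw_read_seqcount(),
 * which the plain variables here do not provide.
 */
#include <stdio.h>

struct snapshot { long value; };

static unsigned int seq;		/* even: base[0] live, odd: base[1] live */
static struct snapshot base[2];

static void writer_update(long v)
{
	seq++;				/* force readers off to base[1] */
	base[0].value = v;		/* update base[0] */
	seq++;				/* force readers back to base[0] */
	base[1] = base[0];		/* update the spare copy */
}

static long reader_get(void)
{
	unsigned int s;
	long v;

	do {
		s = seq;
		v = base[s & 1].value;	/* pick the copy not being rewritten */
	} while (s != seq);		/* retry if the writer moved meanwhile */

	return v;
}

int main(void)
{
	writer_update(42);
	printf("%ld\n", reader_get());
	return 0;
}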
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
353static void halt_fast_timekeeper(struct timekeeper *tk) 459static void halt_fast_timekeeper(struct timekeeper *tk)
354{ 460{
355 static struct tk_read_base tkr_dummy; 461 static struct tk_read_base tkr_dummy;
356 struct tk_read_base *tkr = &tk->tkr; 462 struct tk_read_base *tkr = &tk->tkr_mono;
357 463
358 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); 464 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
359 cycles_at_suspend = tkr->read(tkr->clock); 465 cycles_at_suspend = tkr->read(tkr->clock);
360 tkr_dummy.read = dummy_clock_read; 466 tkr_dummy.read = dummy_clock_read;
361 update_fast_timekeeper(&tkr_dummy); 467 update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
468
469 tkr = &tk->tkr_raw;
470 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
471 tkr_dummy.read = dummy_clock_read;
472 update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
362} 473}
363 474
364#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 475#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
369 480
370 xt = timespec64_to_timespec(tk_xtime(tk)); 481 xt = timespec64_to_timespec(tk_xtime(tk));
371 wm = timespec64_to_timespec(tk->wall_to_monotonic); 482 wm = timespec64_to_timespec(tk->wall_to_monotonic);
372 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, 483 update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
373 tk->tkr.cycle_last); 484 tk->tkr_mono.cycle_last);
374} 485}
375 486
376static inline void old_vsyscall_fixup(struct timekeeper *tk) 487static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
387 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD 498 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
388 * users are removed, this can be killed. 499 * users are removed, this can be killed.
389 */ 500 */
390 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); 501 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
391 tk->tkr.xtime_nsec -= remainder; 502 tk->tkr_mono.xtime_nsec -= remainder;
392 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; 503 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
393 tk->ntp_error += remainder << tk->ntp_error_shift; 504 tk->ntp_error += remainder << tk->ntp_error_shift;
394 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; 505 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
395} 506}
396#else 507#else
397#define old_vsyscall_fixup(tk) 508#define old_vsyscall_fixup(tk)
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
456 */ 567 */
457 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); 568 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
458 nsec = (u32) tk->wall_to_monotonic.tv_nsec; 569 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
459 tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); 570 tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
460 571
461 /* Update the monotonic raw base */ 572 /* Update the monotonic raw base */
462 tk->base_raw = timespec64_to_ktime(tk->raw_time); 573 tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
463 574
464 /* 575 /*
465 * The sum of the nanoseconds portions of xtime and 576 * The sum of the nanoseconds portions of xtime and
466 * wall_to_monotonic can be greater/equal one second. Take 577 * wall_to_monotonic can be greater/equal one second. Take
467 * this into account before updating tk->ktime_sec. 578 * this into account before updating tk->ktime_sec.
468 */ 579 */
469 nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); 580 nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
470 if (nsec >= NSEC_PER_SEC) 581 if (nsec >= NSEC_PER_SEC)
471 seconds++; 582 seconds++;
472 tk->ktime_sec = seconds; 583 tk->ktime_sec = seconds;
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
489 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 600 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
490 sizeof(tk_core.timekeeper)); 601 sizeof(tk_core.timekeeper));
491 602
492 update_fast_timekeeper(&tk->tkr); 603 update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
604 update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
493} 605}
494 606
495/** 607/**
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
501 */ 613 */
502static void timekeeping_forward_now(struct timekeeper *tk) 614static void timekeeping_forward_now(struct timekeeper *tk)
503{ 615{
504 struct clocksource *clock = tk->tkr.clock; 616 struct clocksource *clock = tk->tkr_mono.clock;
505 cycle_t cycle_now, delta; 617 cycle_t cycle_now, delta;
506 s64 nsec; 618 s64 nsec;
507 619
508 cycle_now = tk->tkr.read(clock); 620 cycle_now = tk->tkr_mono.read(clock);
509 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); 621 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
510 tk->tkr.cycle_last = cycle_now; 622 tk->tkr_mono.cycle_last = cycle_now;
623 tk->tkr_raw.cycle_last = cycle_now;
511 624
512 tk->tkr.xtime_nsec += delta * tk->tkr.mult; 625 tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
513 626
514 /* If arch requires, add in get_arch_timeoffset() */ 627 /* If arch requires, add in get_arch_timeoffset() */
515 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; 628 tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
516 629
517 tk_normalize_xtime(tk); 630 tk_normalize_xtime(tk);
518 631
519 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); 632 nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
520 timespec64_add_ns(&tk->raw_time, nsec); 633 timespec64_add_ns(&tk->raw_time, nsec);
521} 634}
522 635
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts)
537 seq = read_seqcount_begin(&tk_core.seq); 650 seq = read_seqcount_begin(&tk_core.seq);
538 651
539 ts->tv_sec = tk->xtime_sec; 652 ts->tv_sec = tk->xtime_sec;
540 nsecs = timekeeping_get_ns(&tk->tkr); 653 nsecs = timekeeping_get_ns(&tk->tkr_mono);
541 654
542 } while (read_seqcount_retry(&tk_core.seq, seq)); 655 } while (read_seqcount_retry(&tk_core.seq, seq));
543 656
@@ -577,8 +690,8 @@ ktime_t ktime_get(void)
577 690
578 do { 691 do {
579 seq = read_seqcount_begin(&tk_core.seq); 692 seq = read_seqcount_begin(&tk_core.seq);
580 base = tk->tkr.base_mono; 693 base = tk->tkr_mono.base;
581 nsecs = timekeeping_get_ns(&tk->tkr); 694 nsecs = timekeeping_get_ns(&tk->tkr_mono);
582 695
583 } while (read_seqcount_retry(&tk_core.seq, seq)); 696 } while (read_seqcount_retry(&tk_core.seq, seq));
584 697
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
603 716
604 do { 717 do {
605 seq = read_seqcount_begin(&tk_core.seq); 718 seq = read_seqcount_begin(&tk_core.seq);
606 base = ktime_add(tk->tkr.base_mono, *offset); 719 base = ktime_add(tk->tkr_mono.base, *offset);
607 nsecs = timekeeping_get_ns(&tk->tkr); 720 nsecs = timekeeping_get_ns(&tk->tkr_mono);
608 721
609 } while (read_seqcount_retry(&tk_core.seq, seq)); 722 } while (read_seqcount_retry(&tk_core.seq, seq));
610 723
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void)
645 758
646 do { 759 do {
647 seq = read_seqcount_begin(&tk_core.seq); 760 seq = read_seqcount_begin(&tk_core.seq);
648 base = tk->base_raw; 761 base = tk->tkr_raw.base;
649 nsecs = timekeeping_get_ns_raw(tk); 762 nsecs = timekeeping_get_ns(&tk->tkr_raw);
650 763
651 } while (read_seqcount_retry(&tk_core.seq, seq)); 764 } while (read_seqcount_retry(&tk_core.seq, seq));
652 765
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts)
674 do { 787 do {
675 seq = read_seqcount_begin(&tk_core.seq); 788 seq = read_seqcount_begin(&tk_core.seq);
676 ts->tv_sec = tk->xtime_sec; 789 ts->tv_sec = tk->xtime_sec;
677 nsec = timekeeping_get_ns(&tk->tkr); 790 nsec = timekeeping_get_ns(&tk->tkr_mono);
678 tomono = tk->wall_to_monotonic; 791 tomono = tk->wall_to_monotonic;
679 792
680 } while (read_seqcount_retry(&tk_core.seq, seq)); 793 } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
759 ts_real->tv_sec = tk->xtime_sec; 872 ts_real->tv_sec = tk->xtime_sec;
760 ts_real->tv_nsec = 0; 873 ts_real->tv_nsec = 0;
761 874
762 nsecs_raw = timekeeping_get_ns_raw(tk); 875 nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
763 nsecs_real = timekeeping_get_ns(&tk->tkr); 876 nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
764 877
765 } while (read_seqcount_retry(&tk_core.seq, seq)); 878 } while (read_seqcount_retry(&tk_core.seq, seq));
766 879
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data)
943 */ 1056 */
944 if (try_module_get(new->owner)) { 1057 if (try_module_get(new->owner)) {
945 if (!new->enable || new->enable(new) == 0) { 1058 if (!new->enable || new->enable(new) == 0) {
946 old = tk->tkr.clock; 1059 old = tk->tkr_mono.clock;
947 tk_setup_internals(tk, new); 1060 tk_setup_internals(tk, new);
948 if (old->disable) 1061 if (old->disable)
949 old->disable(old); 1062 old->disable(old);
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock)
971{ 1084{
972 struct timekeeper *tk = &tk_core.timekeeper; 1085 struct timekeeper *tk = &tk_core.timekeeper;
973 1086
974 if (tk->tkr.clock == clock) 1087 if (tk->tkr_mono.clock == clock)
975 return 0; 1088 return 0;
976 stop_machine(change_clocksource, clock, NULL); 1089 stop_machine(change_clocksource, clock, NULL);
977 tick_clock_notify(); 1090 tick_clock_notify();
978 return tk->tkr.clock == clock ? 0 : -1; 1091 return tk->tkr_mono.clock == clock ? 0 : -1;
979} 1092}
980 1093
981/** 1094/**
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts)
993 1106
994 do { 1107 do {
995 seq = read_seqcount_begin(&tk_core.seq); 1108 seq = read_seqcount_begin(&tk_core.seq);
996 nsecs = timekeeping_get_ns_raw(tk); 1109 nsecs = timekeeping_get_ns(&tk->tkr_raw);
997 ts64 = tk->raw_time; 1110 ts64 = tk->raw_time;
998 1111
999 } while (read_seqcount_retry(&tk_core.seq, seq)); 1112 } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void)
1016 do { 1129 do {
1017 seq = read_seqcount_begin(&tk_core.seq); 1130 seq = read_seqcount_begin(&tk_core.seq);
1018 1131
1019 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 1132 ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
1020 1133
1021 } while (read_seqcount_retry(&tk_core.seq, seq)); 1134 } while (read_seqcount_retry(&tk_core.seq, seq));
1022 1135
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void)
1035 do { 1148 do {
1036 seq = read_seqcount_begin(&tk_core.seq); 1149 seq = read_seqcount_begin(&tk_core.seq);
1037 1150
1038 ret = tk->tkr.clock->max_idle_ns; 1151 ret = tk->tkr_mono.clock->max_idle_ns;
1039 1152
1040 } while (read_seqcount_retry(&tk_core.seq, seq)); 1153 } while (read_seqcount_retry(&tk_core.seq, seq));
1041 1154
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
1057 ts->tv_nsec = 0; 1170 ts->tv_nsec = 0;
1058} 1171}
1059 1172
1173void __weak read_persistent_clock64(struct timespec64 *ts64)
1174{
1175 struct timespec ts;
1176
1177 read_persistent_clock(&ts);
1178 *ts64 = timespec_to_timespec64(ts);
1179}
1180
1060/** 1181/**
1061 * read_boot_clock - Return time of the system start. 1182 * read_boot_clock - Return time of the system start.
1062 * 1183 *
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts)
1072 ts->tv_nsec = 0; 1193 ts->tv_nsec = 0;
1073} 1194}
1074 1195
1196void __weak read_boot_clock64(struct timespec64 *ts64)
1197{
1198 struct timespec ts;
1199
1200 read_boot_clock(&ts);
1201 *ts64 = timespec_to_timespec64(ts);
1202}
1203
1204/* Flag for if timekeeping_resume() has injected sleeptime */
1205static bool sleeptime_injected;
1206
1207/* Flag for if there is a persistent clock on this platform */
1208static bool persistent_clock_exists;
1209
1075/* 1210/*
1076 * timekeeping_init - Initializes the clocksource and common timekeeping values 1211 * timekeeping_init - Initializes the clocksource and common timekeeping values
1077 */ 1212 */
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void)
1081 struct clocksource *clock; 1216 struct clocksource *clock;
1082 unsigned long flags; 1217 unsigned long flags;
1083 struct timespec64 now, boot, tmp; 1218 struct timespec64 now, boot, tmp;
1084 struct timespec ts;
1085 1219
1086 read_persistent_clock(&ts); 1220 read_persistent_clock64(&now);
1087 now = timespec_to_timespec64(ts);
1088 if (!timespec64_valid_strict(&now)) { 1221 if (!timespec64_valid_strict(&now)) {
1089 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1222 pr_warn("WARNING: Persistent clock returned invalid value!\n"
1090 " Check your CMOS/BIOS settings.\n"); 1223 " Check your CMOS/BIOS settings.\n");
1091 now.tv_sec = 0; 1224 now.tv_sec = 0;
1092 now.tv_nsec = 0; 1225 now.tv_nsec = 0;
1093 } else if (now.tv_sec || now.tv_nsec) 1226 } else if (now.tv_sec || now.tv_nsec)
1094 persistent_clock_exist = true; 1227 persistent_clock_exists = true;
1095 1228
1096 read_boot_clock(&ts); 1229 read_boot_clock64(&boot);
1097 boot = timespec_to_timespec64(ts);
1098 if (!timespec64_valid_strict(&boot)) { 1230 if (!timespec64_valid_strict(&boot)) {
1099 pr_warn("WARNING: Boot clock returned invalid value!\n" 1231 pr_warn("WARNING: Boot clock returned invalid value!\n"
1100 " Check your CMOS/BIOS settings.\n"); 1232 " Check your CMOS/BIOS settings.\n");
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void)
1114 tk_set_xtime(tk, &now); 1246 tk_set_xtime(tk, &now);
1115 tk->raw_time.tv_sec = 0; 1247 tk->raw_time.tv_sec = 0;
1116 tk->raw_time.tv_nsec = 0; 1248 tk->raw_time.tv_nsec = 0;
1117 tk->base_raw.tv64 = 0;
1118 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1249 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
1119 boot = tk_xtime(tk); 1250 boot = tk_xtime(tk);
1120 1251
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void)
1127 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1258 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1128} 1259}
1129 1260
1130/* time in seconds when suspend began */ 1261/* time in seconds when suspend began for persistent clock */
1131static struct timespec64 timekeeping_suspend_time; 1262static struct timespec64 timekeeping_suspend_time;
1132 1263
1133/** 1264/**
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1152 tk_debug_account_sleep_time(delta); 1283 tk_debug_account_sleep_time(delta);
1153} 1284}
1154 1285
1286#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
1287/**
1288 * We have three kinds of time sources to use for sleep time
 1289 * injection; the preference order is:
1290 * 1) non-stop clocksource
1291 * 2) persistent clock (ie: RTC accessible when irqs are off)
1292 * 3) RTC
1293 *
 1294 * 1) and 2) are used by timekeeping, 3) by the RTC subsystem.
 1295 * If the system has neither 1) nor 2), 3) is used as the last resort.
1296 *
1297 *
1298 * If timekeeping has injected sleeptime via either 1) or 2),
 1299 * 3) becomes unnecessary, so in this case we don't need to call
 1300 * rtc_resume(); this is what timekeeping_rtc_skipresume()
1301 * means.
1302 */
1303bool timekeeping_rtc_skipresume(void)
1304{
1305 return sleeptime_injected;
1306}
1307
1308/**
 1309 * Whether 1) can be used is only known once timekeeping_resume()
 1310 * runs, which happens after rtc_suspend(), so we cannot safely
 1311 * skip rtc_suspend() if the system has 1).
1312 *
 1313 * But if the system has 2), 2) will definitely be used, so in this
 1314 * case we don't need to call rtc_suspend(); this is what
1315 * timekeeping_rtc_skipsuspend() means.
1316 */
1317bool timekeeping_rtc_skipsuspend(void)
1318{
1319 return persistent_clock_exists;
1320}
1321
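/*
 * Sketch of how the RTC side is expected to consult the two helpers above;
 * the function name and flow are hypothetical and only illustrate the
 * intended split (the real consumer is the RTC class suspend/resume code).
 */
static int example_rtc_resume(struct timespec64 *rtc_sleep_delta)
{
	if (timekeeping_rtc_skipresume())
		return 0;	/* timekeeping already injected sleep time via 1) or 2) */

	/* otherwise fall back to the RTC-derived delta, source 3) */
	timekeeping_inject_sleeptime64(rtc_sleep_delta);
	return 0;
}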
1155/** 1322/**
1156 * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values 1323 * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
1157 * @delta: pointer to a timespec64 delta value 1324 * @delta: pointer to a timespec64 delta value
1158 * 1325 *
1159 * This hook is for architectures that cannot support read_persistent_clock 1326 * This hook is for architectures that cannot support read_persistent_clock64
1160 * because their RTC/persistent clock is only accessible when irqs are enabled. 1327 * because their RTC/persistent clock is only accessible when irqs are enabled.
1328 * and also don't have an effective nonstop clocksource.
1161 * 1329 *
1162 * This function should only be called by rtc_resume(), and allows 1330 * This function should only be called by rtc_resume(), and allows
1163 * a suspend offset to be injected into the timekeeping values. 1331 * a suspend offset to be injected into the timekeeping values.
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1167 struct timekeeper *tk = &tk_core.timekeeper; 1335 struct timekeeper *tk = &tk_core.timekeeper;
1168 unsigned long flags; 1336 unsigned long flags;
1169 1337
1170 /*
1171 * Make sure we don't set the clock twice, as timekeeping_resume()
1172 * already did it
1173 */
1174 if (has_persistent_clock())
1175 return;
1176
1177 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1338 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1178 write_seqcount_begin(&tk_core.seq); 1339 write_seqcount_begin(&tk_core.seq);
1179 1340
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1189 /* signal hrtimers about time change */ 1350 /* signal hrtimers about time change */
1190 clock_was_set(); 1351 clock_was_set();
1191} 1352}
1353#endif
1192 1354
1193/** 1355/**
1194 * timekeeping_resume - Resumes the generic timekeeping subsystem. 1356 * timekeeping_resume - Resumes the generic timekeeping subsystem.
1195 *
1196 * This is for the generic clocksource timekeeping.
1197 * xtime/wall_to_monotonic/jiffies/etc are
1198 * still managed by arch specific suspend/resume code.
1199 */ 1357 */
1200void timekeeping_resume(void) 1358void timekeeping_resume(void)
1201{ 1359{
1202 struct timekeeper *tk = &tk_core.timekeeper; 1360 struct timekeeper *tk = &tk_core.timekeeper;
1203 struct clocksource *clock = tk->tkr.clock; 1361 struct clocksource *clock = tk->tkr_mono.clock;
1204 unsigned long flags; 1362 unsigned long flags;
1205 struct timespec64 ts_new, ts_delta; 1363 struct timespec64 ts_new, ts_delta;
1206 struct timespec tmp;
1207 cycle_t cycle_now, cycle_delta; 1364 cycle_t cycle_now, cycle_delta;
1208 bool suspendtime_found = false;
1209 1365
1210 read_persistent_clock(&tmp); 1366 sleeptime_injected = false;
1211 ts_new = timespec_to_timespec64(tmp); 1367 read_persistent_clock64(&ts_new);
1212 1368
1213 clockevents_resume(); 1369 clockevents_resume();
1214 clocksource_resume(); 1370 clocksource_resume();
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void)
1228 * The less preferred source will only be tried if there is no better 1384 * The less preferred source will only be tried if there is no better
1229 * usable source. The rtc part is handled separately in rtc core code. 1385 * usable source. The rtc part is handled separately in rtc core code.
1230 */ 1386 */
1231 cycle_now = tk->tkr.read(clock); 1387 cycle_now = tk->tkr_mono.read(clock);
1232 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1388 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
1233 cycle_now > tk->tkr.cycle_last) { 1389 cycle_now > tk->tkr_mono.cycle_last) {
1234 u64 num, max = ULLONG_MAX; 1390 u64 num, max = ULLONG_MAX;
1235 u32 mult = clock->mult; 1391 u32 mult = clock->mult;
1236 u32 shift = clock->shift; 1392 u32 shift = clock->shift;
1237 s64 nsec = 0; 1393 s64 nsec = 0;
1238 1394
1239 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, 1395 cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
1240 tk->tkr.mask); 1396 tk->tkr_mono.mask);
1241 1397
1242 /* 1398 /*
1243 * "cycle_delta * mutl" may cause 64 bits overflow, if the 1399 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void)
1253 nsec += ((u64) cycle_delta * mult) >> shift; 1409 nsec += ((u64) cycle_delta * mult) >> shift;
1254 1410
1255 ts_delta = ns_to_timespec64(nsec); 1411 ts_delta = ns_to_timespec64(nsec);
1256 suspendtime_found = true; 1412 sleeptime_injected = true;
1257 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1413 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
1258 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); 1414 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
1259 suspendtime_found = true; 1415 sleeptime_injected = true;
1260 } 1416 }
1261 1417
1262 if (suspendtime_found) 1418 if (sleeptime_injected)
1263 __timekeeping_inject_sleeptime(tk, &ts_delta); 1419 __timekeeping_inject_sleeptime(tk, &ts_delta);
1264 1420
1265 /* Re-base the last cycle value */ 1421 /* Re-base the last cycle value */
1266 tk->tkr.cycle_last = cycle_now; 1422 tk->tkr_mono.cycle_last = cycle_now;
1423 tk->tkr_raw.cycle_last = cycle_now;
1424
1267 tk->ntp_error = 0; 1425 tk->ntp_error = 0;
1268 timekeeping_suspended = 0; 1426 timekeeping_suspended = 0;
1269 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1427 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void)
1272 1430
1273 touch_softlockup_watchdog(); 1431 touch_softlockup_watchdog();
1274 1432
1275 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 1433 tick_resume();
1276
1277 /* Resume hrtimers */
1278 hrtimers_resume(); 1434 hrtimers_resume();
1279} 1435}
1280 1436
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void)
1284 unsigned long flags; 1440 unsigned long flags;
1285 struct timespec64 delta, delta_delta; 1441 struct timespec64 delta, delta_delta;
1286 static struct timespec64 old_delta; 1442 static struct timespec64 old_delta;
1287 struct timespec tmp;
1288 1443
1289 read_persistent_clock(&tmp); 1444 read_persistent_clock64(&timekeeping_suspend_time);
1290 timekeeping_suspend_time = timespec_to_timespec64(tmp);
1291 1445
1292 /* 1446 /*
1293 * On some systems the persistent_clock can not be detected at 1447 * On some systems the persistent_clock can not be detected at
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void)
1295 * value returned, update the persistent_clock_exists flag. 1449 * value returned, update the persistent_clock_exists flag.
1296 */ 1450 */
1297 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) 1451 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
1298 persistent_clock_exist = true; 1452 persistent_clock_exists = true;
1299 1453
1300 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1454 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1301 write_seqcount_begin(&tk_core.seq); 1455 write_seqcount_begin(&tk_core.seq);
1302 timekeeping_forward_now(tk); 1456 timekeeping_forward_now(tk);
1303 timekeeping_suspended = 1; 1457 timekeeping_suspended = 1;
1304 1458
1305 /* 1459 if (persistent_clock_exists) {
1306 * To avoid drift caused by repeated suspend/resumes,
1307 * which each can add ~1 second drift error,
1308 * try to compensate so the difference in system time
1309 * and persistent_clock time stays close to constant.
1310 */
1311 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1312 delta_delta = timespec64_sub(delta, old_delta);
1313 if (abs(delta_delta.tv_sec) >= 2) {
1314 /* 1460 /*
1315 * if delta_delta is too large, assume time correction 1461 * To avoid drift caused by repeated suspend/resumes,
1316 * has occured and set old_delta to the current delta. 1462 * which each can add ~1 second drift error,
1463 * try to compensate so the difference in system time
1464 * and persistent_clock time stays close to constant.
1317 */ 1465 */
1318 old_delta = delta; 1466 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1319 } else { 1467 delta_delta = timespec64_sub(delta, old_delta);
1320 /* Otherwise try to adjust old_system to compensate */ 1468 if (abs(delta_delta.tv_sec) >= 2) {
1321 timekeeping_suspend_time = 1469 /*
1322 timespec64_add(timekeeping_suspend_time, delta_delta); 1470 * if delta_delta is too large, assume time correction
1471 * has occurred and set old_delta to the current delta.
1472 */
1473 old_delta = delta;
1474 } else {
1475 /* Otherwise try to adjust old_system to compensate */
1476 timekeeping_suspend_time =
1477 timespec64_add(timekeeping_suspend_time, delta_delta);
1478 }
1323 } 1479 }
1324 1480
1325 timekeeping_update(tk, TK_MIRROR); 1481 timekeeping_update(tk, TK_MIRROR);
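/*
 * The old_delta/delta_delta bookkeeping above, with invented numbers: if
 * xtime and the persistent clock disagreed by 10.3 s before the previous
 * suspend and by 10.8 s now, the 0.5 s difference is under the 2 s
 * threshold and gets folded back into timekeeping_suspend_time instead of
 * being treated as a real clock correction.
 */
#include <stdio.h>

int main(void)
{
	double old_delta = 10.3;	/* xtime - persistent clock, last suspend */
	double delta = 10.8;		/* the same difference, this suspend */
	double delta_delta = delta - old_delta;

	if (delta_delta >= 2.0 || delta_delta <= -2.0)
		printf("assume a time correction; reset old_delta to %.1f\n", delta);
	else
		printf("compensate: shift suspend timestamp by %.1f s\n", delta_delta);
	return 0;
}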
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void)
1327 write_seqcount_end(&tk_core.seq); 1483 write_seqcount_end(&tk_core.seq);
1328 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1484 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1329 1485
1330 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1486 tick_suspend();
1331 clocksource_suspend(); 1487 clocksource_suspend();
1332 clockevents_suspend(); 1488 clockevents_suspend();
1333 1489
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1416 * 1572 *
1417 * XXX - TODO: Doc ntp_error calculation. 1573 * XXX - TODO: Doc ntp_error calculation.
1418 */ 1574 */
1419 if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { 1575 if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
1420 /* NTP adjustment caused clocksource mult overflow */ 1576 /* NTP adjustment caused clocksource mult overflow */
1421 WARN_ON_ONCE(1); 1577 WARN_ON_ONCE(1);
1422 return; 1578 return;
1423 } 1579 }
1424 1580
1425 tk->tkr.mult += mult_adj; 1581 tk->tkr_mono.mult += mult_adj;
1426 tk->xtime_interval += interval; 1582 tk->xtime_interval += interval;
1427 tk->tkr.xtime_nsec -= offset; 1583 tk->tkr_mono.xtime_nsec -= offset;
1428 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1584 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1429} 1585}
1430 1586
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1486 tk->ntp_err_mult = 0; 1642 tk->ntp_err_mult = 0;
1487 } 1643 }
1488 1644
1489 if (unlikely(tk->tkr.clock->maxadj && 1645 if (unlikely(tk->tkr_mono.clock->maxadj &&
1490 (abs(tk->tkr.mult - tk->tkr.clock->mult) 1646 (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
1491 > tk->tkr.clock->maxadj))) { 1647 > tk->tkr_mono.clock->maxadj))) {
1492 printk_once(KERN_WARNING 1648 printk_once(KERN_WARNING
1493 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1649 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1494 tk->tkr.clock->name, (long)tk->tkr.mult, 1650 tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
1495 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); 1651 (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
1496 } 1652 }
1497 1653
1498 /* 1654 /*
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1509 * We'll correct this error next time through this function, when 1665 * We'll correct this error next time through this function, when
1510 * xtime_nsec is not as small. 1666 * xtime_nsec is not as small.
1511 */ 1667 */
1512 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { 1668 if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
1513 s64 neg = -(s64)tk->tkr.xtime_nsec; 1669 s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
1514 tk->tkr.xtime_nsec = 0; 1670 tk->tkr_mono.xtime_nsec = 0;
1515 tk->ntp_error += neg << tk->ntp_error_shift; 1671 tk->ntp_error += neg << tk->ntp_error_shift;
1516 } 1672 }
1517} 1673}
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1526 */ 1682 */
1527static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1683static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1528{ 1684{
1529 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; 1685 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
1530 unsigned int clock_set = 0; 1686 unsigned int clock_set = 0;
1531 1687
1532 while (tk->tkr.xtime_nsec >= nsecps) { 1688 while (tk->tkr_mono.xtime_nsec >= nsecps) {
1533 int leap; 1689 int leap;
1534 1690
1535 tk->tkr.xtime_nsec -= nsecps; 1691 tk->tkr_mono.xtime_nsec -= nsecps;
1536 tk->xtime_sec++; 1692 tk->xtime_sec++;
1537 1693
1538 /* Figure out if its a leap sec and apply if needed */ 1694 /* Figure out if its a leap sec and apply if needed */
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1577 1733
1578 /* Accumulate one shifted interval */ 1734 /* Accumulate one shifted interval */
1579 offset -= interval; 1735 offset -= interval;
1580 tk->tkr.cycle_last += interval; 1736 tk->tkr_mono.cycle_last += interval;
1737 tk->tkr_raw.cycle_last += interval;
1581 1738
1582 tk->tkr.xtime_nsec += tk->xtime_interval << shift; 1739 tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
1583 *clock_set |= accumulate_nsecs_to_secs(tk); 1740 *clock_set |= accumulate_nsecs_to_secs(tk);
1584 1741
1585 /* Accumulate raw time */ 1742 /* Accumulate raw time */
@@ -1622,14 +1779,17 @@ void update_wall_time(void)
1622#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1779#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1623 offset = real_tk->cycle_interval; 1780 offset = real_tk->cycle_interval;
1624#else 1781#else
1625 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), 1782 offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
1626 tk->tkr.cycle_last, tk->tkr.mask); 1783 tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
1627#endif 1784#endif
1628 1785
1629 /* Check if there's really nothing to do */ 1786 /* Check if there's really nothing to do */
1630 if (offset < real_tk->cycle_interval) 1787 if (offset < real_tk->cycle_interval)
1631 goto out; 1788 goto out;
1632 1789
1790 /* Do some additional sanity checking */
1791 timekeeping_check_update(real_tk, offset);
1792
1633 /* 1793 /*
1634 * With NO_HZ we may have to accumulate many cycle_intervals 1794 * With NO_HZ we may have to accumulate many cycle_intervals
1635 * (think "ticks") worth of time at once. To do this efficiently, 1795 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1784 do { 1944 do {
1785 seq = read_seqcount_begin(&tk_core.seq); 1945 seq = read_seqcount_begin(&tk_core.seq);
1786 1946
1787 base = tk->tkr.base_mono; 1947 base = tk->tkr_mono.base;
1788 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; 1948 nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
1789 1949
1790 *offs_real = tk->offs_real; 1950 *offs_real = tk->offs_real;
1791 *offs_boot = tk->offs_boot; 1951 *offs_boot = tk->offs_boot;
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1816 do { 1976 do {
1817 seq = read_seqcount_begin(&tk_core.seq); 1977 seq = read_seqcount_begin(&tk_core.seq);
1818 1978
1819 base = tk->tkr.base_mono; 1979 base = tk->tkr_mono.base;
1820 nsecs = timekeeping_get_ns(&tk->tkr); 1980 nsecs = timekeeping_get_ns(&tk->tkr_mono);
1821 1981
1822 *offs_real = tk->offs_real; 1982 *offs_real = tk->offs_real;
1823 *offs_boot = tk->offs_boot; 1983 *offs_boot = tk->offs_boot;
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 1d91416055d5..ead8794b9a4e 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts);
19extern int timekeeping_suspend(void); 19extern int timekeeping_suspend(void);
20extern void timekeeping_resume(void); 20extern void timekeeping_resume(void);
21 21
22extern void do_timer(unsigned long ticks);
23extern void update_wall_time(void);
24
25extern seqlock_t jiffies_lock;
26
27#define CS_NAME_LEN 32
28
22#endif 29#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..2ece3aa5069c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -90,8 +90,18 @@ struct tvec_base {
90 struct tvec tv5; 90 struct tvec tv5;
91} ____cacheline_aligned; 91} ____cacheline_aligned;
92 92
93/*
94 * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
 95 * made NULL special, hint: lock_timer_base()) and we cannot get a compile-time
96 * pointer to per-cpu entries because we don't know where we'll map the section,
97 * even for the boot cpu.
98 *
 99 * And so we use boot_tvec_bases for the boot CPU and per-cpu __tvec_bases for the
100 * rest of them.
101 */
93struct tvec_base boot_tvec_bases; 102struct tvec_base boot_tvec_bases;
94EXPORT_SYMBOL(boot_tvec_bases); 103EXPORT_SYMBOL(boot_tvec_bases);
104
95static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 105static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
96 106
97/* Functions below help us manage 'deferrable' flag */ 107/* Functions below help us manage 'deferrable' flag */
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
1027EXPORT_SYMBOL(try_to_del_timer_sync); 1037EXPORT_SYMBOL(try_to_del_timer_sync);
1028 1038
1029#ifdef CONFIG_SMP 1039#ifdef CONFIG_SMP
1040static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
1041
1030/** 1042/**
1031 * del_timer_sync - deactivate a timer and wait for the handler to finish. 1043 * del_timer_sync - deactivate a timer and wait for the handler to finish.
1032 * @timer: the timer to be deactivated 1044 * @timer: the timer to be deactivated
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1532} 1544}
1533EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1545EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1534 1546
1535static int init_timers_cpu(int cpu)
1536{
1537 int j;
1538 struct tvec_base *base;
1539 static char tvec_base_done[NR_CPUS];
1540
1541 if (!tvec_base_done[cpu]) {
1542 static char boot_done;
1543
1544 if (boot_done) {
1545 /*
1546 * The APs use this path later in boot
1547 */
1548 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1549 cpu_to_node(cpu));
1550 if (!base)
1551 return -ENOMEM;
1552
1553 /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
1554 if (WARN_ON(base != tbase_get_base(base))) {
1555 kfree(base);
1556 return -ENOMEM;
1557 }
1558 per_cpu(tvec_bases, cpu) = base;
1559 } else {
1560 /*
1561 * This is for the boot CPU - we use compile-time
1562 * static initialisation because per-cpu memory isn't
1563 * ready yet and because the memory allocators are not
1564 * initialised either.
1565 */
1566 boot_done = 1;
1567 base = &boot_tvec_bases;
1568 }
1569 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1;
1571 base->cpu = cpu;
1572 } else {
1573 base = per_cpu(tvec_bases, cpu);
1574 }
1575
1576
1577 for (j = 0; j < TVN_SIZE; j++) {
1578 INIT_LIST_HEAD(base->tv5.vec + j);
1579 INIT_LIST_HEAD(base->tv4.vec + j);
1580 INIT_LIST_HEAD(base->tv3.vec + j);
1581 INIT_LIST_HEAD(base->tv2.vec + j);
1582 }
1583 for (j = 0; j < TVR_SIZE; j++)
1584 INIT_LIST_HEAD(base->tv1.vec + j);
1585
1586 base->timer_jiffies = jiffies;
1587 base->next_timer = base->timer_jiffies;
1588 base->active_timers = 0;
1589 base->all_timers = 0;
1590 return 0;
1591}
1592
1593#ifdef CONFIG_HOTPLUG_CPU 1547#ifdef CONFIG_HOTPLUG_CPU
1594static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) 1548static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1595{ 1549{
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu)
1631 migrate_timer_list(new_base, old_base->tv5.vec + i); 1585 migrate_timer_list(new_base, old_base->tv5.vec + i);
1632 } 1586 }
1633 1587
1588 old_base->active_timers = 0;
1589 old_base->all_timers = 0;
1590
1634 spin_unlock(&old_base->lock); 1591 spin_unlock(&old_base->lock);
1635 spin_unlock_irq(&new_base->lock); 1592 spin_unlock_irq(&new_base->lock);
1636 put_cpu_var(tvec_bases); 1593 put_cpu_var(tvec_bases);
1637} 1594}
1638#endif /* CONFIG_HOTPLUG_CPU */
1639 1595
1640static int timer_cpu_notify(struct notifier_block *self, 1596static int timer_cpu_notify(struct notifier_block *self,
1641 unsigned long action, void *hcpu) 1597 unsigned long action, void *hcpu)
1642{ 1598{
1643 long cpu = (long)hcpu; 1599 switch (action) {
1644 int err;
1645
1646 switch(action) {
1647 case CPU_UP_PREPARE:
1648 case CPU_UP_PREPARE_FROZEN:
1649 err = init_timers_cpu(cpu);
1650 if (err < 0)
1651 return notifier_from_errno(err);
1652 break;
1653#ifdef CONFIG_HOTPLUG_CPU
1654 case CPU_DEAD: 1600 case CPU_DEAD:
1655 case CPU_DEAD_FROZEN: 1601 case CPU_DEAD_FROZEN:
1656 migrate_timers(cpu); 1602 migrate_timers((long)hcpu);
1657 break; 1603 break;
1658#endif
1659 default: 1604 default:
1660 break; 1605 break;
1661 } 1606 }
1607
1662 return NOTIFY_OK; 1608 return NOTIFY_OK;
1663} 1609}
1664 1610
1665static struct notifier_block timers_nb = { 1611static inline void timer_register_cpu_notifier(void)
1666 .notifier_call = timer_cpu_notify, 1612{
1667}; 1613 cpu_notifier(timer_cpu_notify, 0);
1614}
1615#else
1616static inline void timer_register_cpu_notifier(void) { }
1617#endif /* CONFIG_HOTPLUG_CPU */
1668 1618
1619static void __init init_timer_cpu(struct tvec_base *base, int cpu)
1620{
1621 int j;
1669 1622
1670void __init init_timers(void) 1623 BUG_ON(base != tbase_get_base(base));
1624
1625 base->cpu = cpu;
1626 per_cpu(tvec_bases, cpu) = base;
1627 spin_lock_init(&base->lock);
1628
1629 for (j = 0; j < TVN_SIZE; j++) {
1630 INIT_LIST_HEAD(base->tv5.vec + j);
1631 INIT_LIST_HEAD(base->tv4.vec + j);
1632 INIT_LIST_HEAD(base->tv3.vec + j);
1633 INIT_LIST_HEAD(base->tv2.vec + j);
1634 }
1635 for (j = 0; j < TVR_SIZE; j++)
1636 INIT_LIST_HEAD(base->tv1.vec + j);
1637
1638 base->timer_jiffies = jiffies;
1639 base->next_timer = base->timer_jiffies;
1640}
1641
1642static void __init init_timer_cpus(void)
1671{ 1643{
1672 int err; 1644 struct tvec_base *base;
1645 int local_cpu = smp_processor_id();
1646 int cpu;
1673 1647
1648 for_each_possible_cpu(cpu) {
1649 if (cpu == local_cpu)
1650 base = &boot_tvec_bases;
1651#ifdef CONFIG_SMP
1652 else
1653 base = per_cpu_ptr(&__tvec_bases, cpu);
1654#endif
1655
1656 init_timer_cpu(base, cpu);
1657 }
1658}
1659
1660void __init init_timers(void)
1661{
1674 /* ensure there are enough low bits for flags in timer->base pointer */ 1662 /* ensure there are enough low bits for flags in timer->base pointer */
1675 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); 1663 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1676 1664
1677 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1665 init_timer_cpus();
1678 (void *)(long)smp_processor_id());
1679 BUG_ON(err != NOTIFY_OK);
1680
1681 init_timer_stats(); 1666 init_timer_stats();
1682 register_cpu_notifier(&timers_nb); 1667 timer_register_cpu_notifier();
1683 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1668 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1684} 1669}
1685 1670
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 61ed862cdd37..e878c2e0ba45 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20 19
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22 21
22#include "tick-internal.h"
23 23
24struct timer_list_iter { 24struct timer_list_iter {
25 int cpu; 25 int cpu;
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 print_name_offset(m, dev->set_next_event); 228 print_name_offset(m, dev->set_next_event);
229 SEQ_printf(m, "\n"); 229 SEQ_printf(m, "\n");
230 230
231 SEQ_printf(m, " set_mode: "); 231 if (dev->set_mode) {
232 print_name_offset(m, dev->set_mode); 232 SEQ_printf(m, " set_mode: ");
233 SEQ_printf(m, "\n"); 233 print_name_offset(m, dev->set_mode);
234 SEQ_printf(m, "\n");
235 } else {
236 if (dev->set_state_shutdown) {
237 SEQ_printf(m, " shutdown: ");
238 print_name_offset(m, dev->set_state_shutdown);
239 SEQ_printf(m, "\n");
240 }
241
242 if (dev->set_state_periodic) {
243 SEQ_printf(m, " periodic: ");
244 print_name_offset(m, dev->set_state_periodic);
245 SEQ_printf(m, "\n");
246 }
247
248 if (dev->set_state_oneshot) {
249 SEQ_printf(m, " oneshot: ");
250 print_name_offset(m, dev->set_state_oneshot);
251 SEQ_printf(m, "\n");
252 }
253
254 if (dev->tick_resume) {
255 SEQ_printf(m, " resume: ");
256 print_name_offset(m, dev->tick_resume);
257 SEQ_printf(m, "\n");
258 }
259 }
234 260
235 SEQ_printf(m, " event_handler: "); 261 SEQ_printf(m, " event_handler: ");
236 print_name_offset(m, dev->event_handler); 262 print_name_offset(m, dev->event_handler);
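/*
 * With the per-state callbacks printed instead of set_mode, the tick
 * device section of /proc/timer_list is expected to look roughly like the
 * sample below on a converted driver (device and handler names invented):
 *
 *  Clock Event Device: lapic
 *   ...
 *   shutdown: lapic_timer_shutdown
 *   periodic: lapic_timer_set_periodic
 *   oneshot:  lapic_timer_set_oneshot
 *   event_handler: hrtimer_interrupt
 */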
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..fedbdd7d5d1e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -599,6 +599,34 @@ config RING_BUFFER_STARTUP_TEST
599 599
600 If unsure, say N 600 If unsure, say N
601 601
602config TRACE_ENUM_MAP_FILE
603 bool "Show enum mappings for trace events"
604 depends on TRACING
605 help
606 The "print fmt" of the trace events will show the enum names instead
607 of their values. This can cause problems for user space tools that
 608 use this string to parse the raw data, since user space does not know
609 how to convert the string to its value.
610
611 To fix this, there's a special macro in the kernel that can be used
612 to convert the enum into its value. If this macro is used, then the
613 print fmt strings will have the enums converted to their values.
614
615 If something does not get converted properly, this option can be
616 used to show what enums the kernel tried to convert.
617
618 This option is for debugging the enum conversions. A file is created
619 in the tracing directory called "enum_map" that will show the enum
620 names matched with their values and what trace event system they
 621 belong to.
622
 623 Normally, the mappings of strings to values are freed after
 624 boot or module load. With this option, they are not freed, as
625 they are needed for the "enum_map" file. Enabling this option will
626 increase the memory footprint of the running kernel.
627
628 If unsure, say N
629
602endif # FTRACE 630endif # FTRACE
603 631
604endif # TRACING_SUPPORT 632endif # TRACING_SUPPORT
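/*
 * The "special macro" the help text refers to is TRACE_DEFINE_ENUM(),
 * normally placed in a trace events header next to the TRACE_EVENT()
 * definitions.  A minimal, invented example of how such a header might
 * register its enums:
 */
#include <linux/tracepoint.h>

enum example_state { EXAMPLE_IDLE, EXAMPLE_BUSY };

TRACE_DEFINE_ENUM(EXAMPLE_IDLE);
TRACE_DEFINE_ENUM(EXAMPLE_BUSY);

/*
 * With CONFIG_TRACE_ENUM_MAP_FILE=y the registered mappings also appear in
 * the tracing directory's "enum_map" file, one "NAME VALUE (system)" line
 * per entry (see enum_map_show() later in this patch).
 */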
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f228024055b..02bece4a99ea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/suspend.h> 20#include <linux/suspend.h>
21#include <linux/debugfs.h> 21#include <linux/tracefs.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
@@ -249,6 +249,19 @@ static void update_function_graph_func(void);
249static inline void update_function_graph_func(void) { } 249static inline void update_function_graph_func(void) { }
250#endif 250#endif
251 251
252
253static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
254{
255 /*
256 * If this is a dynamic ops or we force list func,
257 * then it needs to call the list anyway.
258 */
259 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
260 return ftrace_ops_list_func;
261
262 return ftrace_ops_get_func(ops);
263}
264
252static void update_ftrace_function(void) 265static void update_ftrace_function(void)
253{ 266{
254 ftrace_func_t func; 267 ftrace_func_t func;
@@ -270,7 +283,7 @@ static void update_ftrace_function(void)
270 * then have the mcount trampoline call the function directly. 283 * then have the mcount trampoline call the function directly.
271 */ 284 */
272 } else if (ftrace_ops_list->next == &ftrace_list_end) { 285 } else if (ftrace_ops_list->next == &ftrace_list_end) {
273 func = ftrace_ops_get_func(ftrace_ops_list); 286 func = ftrace_ops_get_list_func(ftrace_ops_list);
274 287
275 } else { 288 } else {
276 /* Just use the default ftrace_ops */ 289 /* Just use the default ftrace_ops */
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = {
1008 .stat_show = function_stat_show 1021 .stat_show = function_stat_show
1009}; 1022};
1010 1023
1011static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1024static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
1012{ 1025{
1013 struct ftrace_profile_stat *stat; 1026 struct ftrace_profile_stat *stat;
1014 struct dentry *entry; 1027 struct dentry *entry;
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1044 } 1057 }
1045 } 1058 }
1046 1059
1047 entry = debugfs_create_file("function_profile_enabled", 0644, 1060 entry = tracefs_create_file("function_profile_enabled", 0644,
1048 d_tracer, NULL, &ftrace_profile_fops); 1061 d_tracer, NULL, &ftrace_profile_fops);
1049 if (!entry) 1062 if (!entry)
1050 pr_warning("Could not create debugfs " 1063 pr_warning("Could not create tracefs "
1051 "'function_profile_enabled' entry\n"); 1064 "'function_profile_enabled' entry\n");
1052} 1065}
1053 1066
1054#else /* CONFIG_FUNCTION_PROFILER */ 1067#else /* CONFIG_FUNCTION_PROFILER */
1055static __init void ftrace_profile_debugfs(struct dentry *d_tracer) 1068static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
1056{ 1069{
1057} 1070}
1058#endif /* CONFIG_FUNCTION_PROFILER */ 1071#endif /* CONFIG_FUNCTION_PROFILER */
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
4712 mutex_unlock(&ftrace_lock); 4725 mutex_unlock(&ftrace_lock);
4713} 4726}
4714 4727
4715static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4728static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
4716{ 4729{
4717 4730
4718 trace_create_file("available_filter_functions", 0444, 4731 trace_create_file("available_filter_functions", 0444,
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void)
5020} 5033}
5021core_initcall(ftrace_nodyn_init); 5034core_initcall(ftrace_nodyn_init);
5022 5035
5023static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 5036static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
5024static inline void ftrace_startup_enable(int command) { } 5037static inline void ftrace_startup_enable(int command) { }
5025static inline void ftrace_startup_all(int command) { } 5038static inline void ftrace_startup_all(int command) { }
5026/* Keep as macros so we do not need to define the commands */ 5039/* Keep as macros so we do not need to define the commands */
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
5209ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) 5222ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
5210{ 5223{
5211 /* 5224 /*
5212 * If this is a dynamic ops or we force list func,
5213 * then it needs to call the list anyway.
5214 */
5215 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
5216 return ftrace_ops_list_func;
5217
5218 /*
5219 * If the func handles its own recursion, call it directly. 5225 * If the func handles its own recursion, call it directly.
5220 * Otherwise call the recursion protected function that 5226 * Otherwise call the recursion protected function that
5221 * will call the ftrace ops function. 5227 * will call the ftrace ops function.
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = {
5473 .release = ftrace_pid_release, 5479 .release = ftrace_pid_release,
5474}; 5480};
5475 5481
5476static __init int ftrace_init_debugfs(void) 5482static __init int ftrace_init_tracefs(void)
5477{ 5483{
5478 struct dentry *d_tracer; 5484 struct dentry *d_tracer;
5479 5485
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void)
5481 if (IS_ERR(d_tracer)) 5487 if (IS_ERR(d_tracer))
5482 return 0; 5488 return 0;
5483 5489
5484 ftrace_init_dyn_debugfs(d_tracer); 5490 ftrace_init_dyn_tracefs(d_tracer);
5485 5491
5486 trace_create_file("set_ftrace_pid", 0644, d_tracer, 5492 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5487 NULL, &ftrace_pid_fops); 5493 NULL, &ftrace_pid_fops);
5488 5494
5489 ftrace_profile_debugfs(d_tracer); 5495 ftrace_profile_tracefs(d_tracer);
5490 5496
5491 return 0; 5497 return 0;
5492} 5498}
5493fs_initcall(ftrace_init_debugfs); 5499fs_initcall(ftrace_init_tracefs);
5494 5500
5495/** 5501/**
5496 * ftrace_kill - kill ftrace 5502 * ftrace_kill - kill ftrace
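/*
 * The debugfs_create_file() -> tracefs_create_file() conversions above go
 * with the standalone tracefs filesystem.  The files stay reachable under
 * the usual debugfs tracing directory, and tracefs can also be mounted on
 * its own, e.g.:
 *
 *   mount -t tracefs nodev /sys/kernel/tracing
 */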
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5040d44fe5a3..0315d43176d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context);
2679 2679
2680static __always_inline int trace_recursive_lock(void) 2680static __always_inline int trace_recursive_lock(void)
2681{ 2681{
2682 unsigned int val = this_cpu_read(current_context); 2682 unsigned int val = __this_cpu_read(current_context);
2683 int bit; 2683 int bit;
2684 2684
2685 if (in_interrupt()) { 2685 if (in_interrupt()) {
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void)
2696 return 1; 2696 return 1;
2697 2697
2698 val |= (1 << bit); 2698 val |= (1 << bit);
2699 this_cpu_write(current_context, val); 2699 __this_cpu_write(current_context, val);
2700 2700
2701 return 0; 2701 return 0;
2702} 2702}
2703 2703
2704static __always_inline void trace_recursive_unlock(void) 2704static __always_inline void trace_recursive_unlock(void)
2705{ 2705{
2706 unsigned int val = this_cpu_read(current_context); 2706 __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
2707
2708 val--;
2709 val &= this_cpu_read(current_context);
2710 this_cpu_write(current_context, val);
2711} 2707}
2712 2708
2713#else 2709#else
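/*
 * The new trace_recursive_unlock() one-liner is the classic "clear the
 * lowest set bit" trick: x & (x - 1), which is exactly what the old
 * val--; val &= <re-read value>; pair computed.  Stand-alone check with an
 * invented bit pattern:
 */
#include <stdio.h>

int main(void)
{
	unsigned int val = 0x6;			/* bits 1 and 2 set */
	unsigned int cleared = val & (val - 1);	/* 0x6 & 0x5 == 0x4 */

	printf("%#x -> %#x\n", val, cleared);	/* lowest set bit (bit 1) cleared */
	return 0;
}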
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62c6506d663f..91eecaaa43e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/debugfs.h> 22#include <linux/debugfs.h>
23#include <linux/tracefs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
25#include <linux/linkage.h> 26#include <linux/linkage.h>
@@ -31,6 +32,7 @@
31#include <linux/splice.h> 32#include <linux/splice.h>
32#include <linux/kdebug.h> 33#include <linux/kdebug.h>
33#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/mount.h>
34#include <linux/rwsem.h> 36#include <linux/rwsem.h>
35#include <linux/slab.h> 37#include <linux/slab.h>
36#include <linux/ctype.h> 38#include <linux/ctype.h>
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
123/* When set, tracing will stop when a WARN*() is hit */ 125/* When set, tracing will stop when a WARN*() is hit */
124int __disable_trace_on_warning; 126int __disable_trace_on_warning;
125 127
128#ifdef CONFIG_TRACE_ENUM_MAP_FILE
129/* Map of enums to their values, for "enum_map" file */
130struct trace_enum_map_head {
131 struct module *mod;
132 unsigned long length;
133};
134
135union trace_enum_map_item;
136
137struct trace_enum_map_tail {
138 /*
139 * "end" is first and points to NULL as it must be different
140 * than "mod" or "enum_string"
141 */
142 union trace_enum_map_item *next;
143 const char *end; /* points to NULL */
144};
145
146static DEFINE_MUTEX(trace_enum_mutex);
147
148/*
149 * The trace_enum_maps are saved in an array with two extra elements,
150 * one at the beginning, and one at the end. The beginning item contains
151 * the count of the saved maps (head.length), and the module they
152 * belong to if not built in (head.mod). The ending item contains a
153 * pointer to the next array of saved enum_map items.
154 */
155union trace_enum_map_item {
156 struct trace_enum_map map;
157 struct trace_enum_map_head head;
158 struct trace_enum_map_tail tail;
159};
160
161static union trace_enum_map_item *trace_enum_maps;
162#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
163
126static int tracing_set_tracer(struct trace_array *tr, const char *buf); 164static int tracing_set_tracer(struct trace_array *tr, const char *buf);
127 165
128#define MAX_TRACER_SIZE 100 166#define MAX_TRACER_SIZE 100
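
The comment block above describes how each set of enum maps is stored: one head element (module pointer plus count), the map entries themselves, and a tail element that chains to the next array. A minimal user-space sketch of that head/payload/tail layout, using made-up stand-in types rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>

union item;

struct map_ent  { const char *name; long value; };
struct head_ent { void *mod; unsigned long length; };
struct tail_ent { union item *next; const char *end; };

union item {
	struct map_ent  map;
	struct head_ent head;
	struct tail_ent tail;
};

/* Same arithmetic as trace_enum_jmp_to_tail(): skip the head plus all maps. */
static union item *jmp_to_tail(union item *p)
{
	return p + p->head.length + 1;
}

int main(void)
{
	static const struct map_ent src[] = {
		{ "EVENT_A", 1 }, { "EVENT_B", 2 }, { "EVENT_C", 3 },
	};
	unsigned long len = 3, i;
	union item *arr = calloc(len + 2, sizeof(*arr));

	if (!arr)
		return 1;
	arr[0].head.length = len;		/* head: number of maps */
	for (i = 0; i < len; i++)
		arr[i + 1].map = src[i];	/* payload entries */
	/* arr[len + 1] stays zeroed: tail.next == NULL terminates the chain */

	for (i = 0; i < len; i++)
		printf("%s = %ld\n", arr[i + 1].map.name, arr[i + 1].map.value);
	printf("chain ends: %s\n", jmp_to_tail(arr)->tail.next ? "no" : "yes");

	free(arr);
	return 0;
}
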
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
3908 .write = tracing_saved_cmdlines_size_write, 3946 .write = tracing_saved_cmdlines_size_write,
3909}; 3947};
3910 3948
3949#ifdef CONFIG_TRACE_ENUM_MAP_FILE
3950static union trace_enum_map_item *
3951update_enum_map(union trace_enum_map_item *ptr)
3952{
3953 if (!ptr->map.enum_string) {
3954 if (ptr->tail.next) {
3955 ptr = ptr->tail.next;
3956 /* Set ptr to the next real item (skip head) */
3957 ptr++;
3958 } else
3959 return NULL;
3960 }
3961 return ptr;
3962}
3963
3964static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
3965{
3966 union trace_enum_map_item *ptr = v;
3967
3968 /*
3969 * Paranoid! If ptr points to end, we don't want to increment past it.
3970 * This really should never happen.
3971 */
3972 ptr = update_enum_map(ptr);
3973 if (WARN_ON_ONCE(!ptr))
3974 return NULL;
3975
3976 ptr++;
3977
3978 (*pos)++;
3979
3980 ptr = update_enum_map(ptr);
3981
3982 return ptr;
3983}
3984
3985static void *enum_map_start(struct seq_file *m, loff_t *pos)
3986{
3987 union trace_enum_map_item *v;
3988 loff_t l = 0;
3989
3990 mutex_lock(&trace_enum_mutex);
3991
3992 v = trace_enum_maps;
3993 if (v)
3994 v++;
3995
3996 while (v && l < *pos) {
3997 v = enum_map_next(m, v, &l);
3998 }
3999
4000 return v;
4001}
4002
4003static void enum_map_stop(struct seq_file *m, void *v)
4004{
4005 mutex_unlock(&trace_enum_mutex);
4006}
4007
4008static int enum_map_show(struct seq_file *m, void *v)
4009{
4010 union trace_enum_map_item *ptr = v;
4011
4012 seq_printf(m, "%s %ld (%s)\n",
4013 ptr->map.enum_string, ptr->map.enum_value,
4014 ptr->map.system);
4015
4016 return 0;
4017}
4018
4019static const struct seq_operations tracing_enum_map_seq_ops = {
4020 .start = enum_map_start,
4021 .next = enum_map_next,
4022 .stop = enum_map_stop,
4023 .show = enum_map_show,
4024};
4025
4026static int tracing_enum_map_open(struct inode *inode, struct file *filp)
4027{
4028 if (tracing_disabled)
4029 return -ENODEV;
4030
4031 return seq_open(filp, &tracing_enum_map_seq_ops);
4032}
4033
4034static const struct file_operations tracing_enum_map_fops = {
4035 .open = tracing_enum_map_open,
4036 .read = seq_read,
4037 .llseek = seq_lseek,
4038 .release = seq_release,
4039};
4040
4041static inline union trace_enum_map_item *
4042trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
4043{
4044 /* Return tail of array given the head */
4045 return ptr + ptr->head.length + 1;
4046}
4047
4048static void
4049trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
4050 int len)
4051{
4052 struct trace_enum_map **stop;
4053 struct trace_enum_map **map;
4054 union trace_enum_map_item *map_array;
4055 union trace_enum_map_item *ptr;
4056
4057 stop = start + len;
4058
4059 /*
4060 * The trace_enum_maps contains the map plus a head and tail item,
4061 * where the head holds the module and length of array, and the
4062 * tail holds a pointer to the next list.
4063 */
4064 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
4065 if (!map_array) {
4066 pr_warning("Unable to allocate trace enum mapping\n");
4067 return;
4068 }
4069
4070 mutex_lock(&trace_enum_mutex);
4071
4072 if (!trace_enum_maps)
4073 trace_enum_maps = map_array;
4074 else {
4075 ptr = trace_enum_maps;
4076 for (;;) {
4077 ptr = trace_enum_jmp_to_tail(ptr);
4078 if (!ptr->tail.next)
4079 break;
4080 ptr = ptr->tail.next;
4081
4082 }
4083 ptr->tail.next = map_array;
4084 }
4085 map_array->head.mod = mod;
4086 map_array->head.length = len;
4087 map_array++;
4088
4089 for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
4090 map_array->map = **map;
4091 map_array++;
4092 }
4093 memset(map_array, 0, sizeof(*map_array));
4094
4095 mutex_unlock(&trace_enum_mutex);
4096}
4097
4098static void trace_create_enum_file(struct dentry *d_tracer)
4099{
4100 trace_create_file("enum_map", 0444, d_tracer,
4101 NULL, &tracing_enum_map_fops);
4102}
4103
4104#else /* CONFIG_TRACE_ENUM_MAP_FILE */
4105static inline void trace_create_enum_file(struct dentry *d_tracer) { }
4106static inline void trace_insert_enum_map_file(struct module *mod,
4107 struct trace_enum_map **start, int len) { }
4108#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
4109
4110static void trace_insert_enum_map(struct module *mod,
4111 struct trace_enum_map **start, int len)
4112{
4113 struct trace_enum_map **map;
4114
4115 if (len <= 0)
4116 return;
4117
4118 map = start;
4119
4120 trace_event_enum_update(map, len);
4121
4122 trace_insert_enum_map_file(mod, start, len);
4123}
4124
3911static ssize_t 4125static ssize_t
3912tracing_set_trace_read(struct file *filp, char __user *ubuf, 4126tracing_set_trace_read(struct file *filp, char __user *ubuf,
3913 size_t cnt, loff_t *ppos) 4127 size_t cnt, loff_t *ppos)
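
With the seq_file interface added above, the new enum_map file prints one "<enum_string> <value> (<system>)" line per mapping (see enum_map_show()). A small user-space reader, assuming the file sits under the usual debugfs/tracing mount point (adjust the path for your system):

#include <stdio.h>

int main(void)
{
	/* Path is an assumption: adjust to wherever tracefs shows up on your box. */
	const char *path = "/sys/kernel/debug/tracing/enum_map";
	char name[128], system[128];
	long value;
	FILE *fp = fopen(path, "r");

	if (!fp) {
		perror(path);
		return 1;
	}
	/* Each line is "<enum_string> <value> (<system>)", per enum_map_show(). */
	while (fscanf(fp, "%127s %ld (%127[^)])\n", name, &value, system) == 3)
		printf("%s/%s = %ld\n", system, name, value);

	fclose(fp);
	return 0;
}
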
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr)
4105 tr->current_trace = &nop_trace; 4319 tr->current_trace = &nop_trace;
4106} 4320}
4107 4321
4108static int tracing_set_tracer(struct trace_array *tr, const char *buf) 4322static void update_tracer_options(struct trace_array *tr, struct tracer *t)
4109{ 4323{
4110 static struct trace_option_dentry *topts; 4324 static struct trace_option_dentry *topts;
4325
4326 /* Only enable if the directory has been created already. */
4327 if (!tr->dir)
4328 return;
4329
4330 /* Currently, only the top instance has options */
4331 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
4332 return;
4333
4334 destroy_trace_option_files(topts);
4335 topts = create_trace_option_files(tr, t);
4336}
4337
4338static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4339{
4111 struct tracer *t; 4340 struct tracer *t;
4112#ifdef CONFIG_TRACER_MAX_TRACE 4341#ifdef CONFIG_TRACER_MAX_TRACE
4113 bool had_max_tr; 4342 bool had_max_tr;
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4172 free_snapshot(tr); 4401 free_snapshot(tr);
4173 } 4402 }
4174#endif 4403#endif
4175 /* Currently, only the top instance has options */ 4404 update_tracer_options(tr, t);
4176 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4177 destroy_trace_option_files(topts);
4178 topts = create_trace_option_files(tr, t);
4179 }
4180 4405
4181#ifdef CONFIG_TRACER_MAX_TRACE 4406#ifdef CONFIG_TRACER_MAX_TRACE
4182 if (t->use_max_tr && !had_max_tr) { 4407 if (t->use_max_tr && !had_max_tr) {
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
5817 6042
5818static struct dentry *tracing_get_dentry(struct trace_array *tr) 6043static struct dentry *tracing_get_dentry(struct trace_array *tr)
5819{ 6044{
6045 if (WARN_ON(!tr->dir))
6046 return ERR_PTR(-ENODEV);
6047
6048 /* Top directory uses NULL as the parent */
6049 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
6050 return NULL;
6051
6052 /* All sub buffers have a descriptor */
5820 return tr->dir; 6053 return tr->dir;
5821} 6054}
5822 6055
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5831 if (IS_ERR(d_tracer)) 6064 if (IS_ERR(d_tracer))
5832 return NULL; 6065 return NULL;
5833 6066
5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 6067 tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
5835 6068
5836 WARN_ONCE(!tr->percpu_dir, 6069 WARN_ONCE(!tr->percpu_dir,
5837 "Could not create debugfs directory 'per_cpu/%d'\n", cpu); 6070 "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
5838 6071
5839 return tr->percpu_dir; 6072 return tr->percpu_dir;
5840} 6073}
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
5851} 6084}
5852 6085
5853static void 6086static void
5854tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) 6087tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
5855{ 6088{
5856 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); 6089 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
5857 struct dentry *d_cpu; 6090 struct dentry *d_cpu;
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5861 return; 6094 return;
5862 6095
5863 snprintf(cpu_dir, 30, "cpu%ld", cpu); 6096 snprintf(cpu_dir, 30, "cpu%ld", cpu);
5864 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 6097 d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
5865 if (!d_cpu) { 6098 if (!d_cpu) {
5866 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 6099 pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
5867 return; 6100 return;
5868 } 6101 }
5869 6102
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name,
6015{ 6248{
6016 struct dentry *ret; 6249 struct dentry *ret;
6017 6250
6018 ret = debugfs_create_file(name, mode, parent, data, fops); 6251 ret = tracefs_create_file(name, mode, parent, data, fops);
6019 if (!ret) 6252 if (!ret)
6020 pr_warning("Could not create debugfs '%s' entry\n", name); 6253 pr_warning("Could not create tracefs '%s' entry\n", name);
6021 6254
6022 return ret; 6255 return ret;
6023} 6256}
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6034 if (IS_ERR(d_tracer)) 6267 if (IS_ERR(d_tracer))
6035 return NULL; 6268 return NULL;
6036 6269
6037 tr->options = debugfs_create_dir("options", d_tracer); 6270 tr->options = tracefs_create_dir("options", d_tracer);
6038 if (!tr->options) { 6271 if (!tr->options) {
6039 pr_warning("Could not create debugfs directory 'options'\n"); 6272 pr_warning("Could not create tracefs directory 'options'\n");
6040 return NULL; 6273 return NULL;
6041 } 6274 }
6042 6275
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
6105 return; 6338 return;
6106 6339
6107 for (cnt = 0; topts[cnt].opt; cnt++) 6340 for (cnt = 0; topts[cnt].opt; cnt++)
6108 debugfs_remove(topts[cnt].entry); 6341 tracefs_remove(topts[cnt].entry);
6109 6342
6110 kfree(topts); 6343 kfree(topts);
6111} 6344}
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = {
6194struct dentry *trace_instance_dir; 6427struct dentry *trace_instance_dir;
6195 6428
6196static void 6429static void
6197init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); 6430init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
6198 6431
6199static int 6432static int
6200allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) 6433allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr)
6271#endif 6504#endif
6272} 6505}
6273 6506
6274static int new_instance_create(const char *name) 6507static int instance_mkdir(const char *name)
6275{ 6508{
6276 struct trace_array *tr; 6509 struct trace_array *tr;
6277 int ret; 6510 int ret;
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name)
6310 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 6543 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
6311 goto out_free_tr; 6544 goto out_free_tr;
6312 6545
6313 tr->dir = debugfs_create_dir(name, trace_instance_dir); 6546 tr->dir = tracefs_create_dir(name, trace_instance_dir);
6314 if (!tr->dir) 6547 if (!tr->dir)
6315 goto out_free_tr; 6548 goto out_free_tr;
6316 6549
6317 ret = event_trace_add_tracer(tr->dir, tr); 6550 ret = event_trace_add_tracer(tr->dir, tr);
6318 if (ret) { 6551 if (ret) {
6319 debugfs_remove_recursive(tr->dir); 6552 tracefs_remove_recursive(tr->dir);
6320 goto out_free_tr; 6553 goto out_free_tr;
6321 } 6554 }
6322 6555
6323 init_tracer_debugfs(tr, tr->dir); 6556 init_tracer_tracefs(tr, tr->dir);
6324 6557
6325 list_add(&tr->list, &ftrace_trace_arrays); 6558 list_add(&tr->list, &ftrace_trace_arrays);
6326 6559
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name)
6341 6574
6342} 6575}
6343 6576
6344static int instance_delete(const char *name) 6577static int instance_rmdir(const char *name)
6345{ 6578{
6346 struct trace_array *tr; 6579 struct trace_array *tr;
6347 int found = 0; 6580 int found = 0;
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name)
6382 return ret; 6615 return ret;
6383} 6616}
6384 6617
6385static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
6386{
6387 struct dentry *parent;
6388 int ret;
6389
6390 /* Paranoid: Make sure the parent is the "instances" directory */
6391 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6392 if (WARN_ON_ONCE(parent != trace_instance_dir))
6393 return -ENOENT;
6394
6395 /*
6396 * The inode mutex is locked, but debugfs_create_dir() will also
6397 * take the mutex. As the instances directory can not be destroyed
6398 * or changed in any other way, it is safe to unlock it, and
6399 * let the dentry try. If two users try to make the same dir at
6400 * the same time, then the new_instance_create() will determine the
6401 * winner.
6402 */
6403 mutex_unlock(&inode->i_mutex);
6404
6405 ret = new_instance_create(dentry->d_iname);
6406
6407 mutex_lock(&inode->i_mutex);
6408
6409 return ret;
6410}
6411
6412static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6413{
6414 struct dentry *parent;
6415 int ret;
6416
6417 /* Paranoid: Make sure the parent is the "instances" directory */
6418 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6419 if (WARN_ON_ONCE(parent != trace_instance_dir))
6420 return -ENOENT;
6421
6422 /* The caller did a dget() on dentry */
6423 mutex_unlock(&dentry->d_inode->i_mutex);
6424
6425 /*
6426 * The inode mutex is locked, but debugfs_create_dir() will also
6427 * take the mutex. As the instances directory can not be destroyed
6428 * or changed in any other way, it is safe to unlock it, and
6429 * let the dentry try. If two users try to make the same dir at
6430 * the same time, then the instance_delete() will determine the
6431 * winner.
6432 */
6433 mutex_unlock(&inode->i_mutex);
6434
6435 ret = instance_delete(dentry->d_iname);
6436
6437 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
6438 mutex_lock(&dentry->d_inode->i_mutex);
6439
6440 return ret;
6441}
6442
6443static const struct inode_operations instance_dir_inode_operations = {
6444 .lookup = simple_lookup,
6445 .mkdir = instance_mkdir,
6446 .rmdir = instance_rmdir,
6447};
6448
6449static __init void create_trace_instances(struct dentry *d_tracer) 6618static __init void create_trace_instances(struct dentry *d_tracer)
6450{ 6619{
6451 trace_instance_dir = debugfs_create_dir("instances", d_tracer); 6620 trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
6621 instance_mkdir,
6622 instance_rmdir);
6452 if (WARN_ON(!trace_instance_dir)) 6623 if (WARN_ON(!trace_instance_dir))
6453 return; 6624 return;
6454
6455 /* Hijack the dir inode operations, to allow mkdir */
6456 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
6457} 6625}
6458 6626
6459static void 6627static void
6460init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) 6628init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
6461{ 6629{
6462 int cpu; 6630 int cpu;
6463 6631
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6511#endif 6679#endif
6512 6680
6513 for_each_tracing_cpu(cpu) 6681 for_each_tracing_cpu(cpu)
6514 tracing_init_debugfs_percpu(tr, cpu); 6682 tracing_init_tracefs_percpu(tr, cpu);
6515 6683
6516} 6684}
6517 6685
 6686static struct vfsmount *trace_automount(void *ignore)
6687{
6688 struct vfsmount *mnt;
6689 struct file_system_type *type;
6690
6691 /*
6692 * To maintain backward compatibility for tools that mount
6693 * debugfs to get to the tracing facility, tracefs is automatically
6694 * mounted to the debugfs/tracing directory.
6695 */
6696 type = get_fs_type("tracefs");
6697 if (!type)
6698 return NULL;
6699 mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
6700 put_filesystem(type);
6701 if (IS_ERR(mnt))
6702 return NULL;
6703 mntget(mnt);
6704
6705 return mnt;
6706}
6707
6518/** 6708/**
6519 * tracing_init_dentry - initialize top level trace array 6709 * tracing_init_dentry - initialize top level trace array
6520 * 6710 *
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void)
6526{ 6716{
6527 struct trace_array *tr = &global_trace; 6717 struct trace_array *tr = &global_trace;
6528 6718
6719 /* The top level trace array uses NULL as parent */
6529 if (tr->dir) 6720 if (tr->dir)
6530 return tr->dir; 6721 return NULL;
6531 6722
6532 if (WARN_ON(!debugfs_initialized())) 6723 if (WARN_ON(!debugfs_initialized()))
6533 return ERR_PTR(-ENODEV); 6724 return ERR_PTR(-ENODEV);
6534 6725
6535 tr->dir = debugfs_create_dir("tracing", NULL); 6726 /*
6536 6727 * As there may still be users that expect the tracing
6728 * files to exist in debugfs/tracing, we must automount
6729 * the tracefs file system there, so older tools still
 6730 * work with the newer kernel.
6731 */
6732 tr->dir = debugfs_create_automount("tracing", NULL,
6733 trace_automount, NULL);
6537 if (!tr->dir) { 6734 if (!tr->dir) {
6538 pr_warn_once("Could not create debugfs directory 'tracing'\n"); 6735 pr_warn_once("Could not create debugfs directory 'tracing'\n");
6539 return ERR_PTR(-ENOMEM); 6736 return ERR_PTR(-ENOMEM);
6540 } 6737 }
6541 6738
6542 return tr->dir; 6739 return NULL;
6740}
6741
6742extern struct trace_enum_map *__start_ftrace_enum_maps[];
6743extern struct trace_enum_map *__stop_ftrace_enum_maps[];
6744
6745static void __init trace_enum_init(void)
6746{
6747 int len;
6748
6749 len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
6750 trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
6751}
6752
6753#ifdef CONFIG_MODULES
6754static void trace_module_add_enums(struct module *mod)
6755{
6756 if (!mod->num_trace_enums)
6757 return;
6758
6759 /*
6760 * Modules with bad taint do not have events created, do
6761 * not bother with enums either.
6762 */
6763 if (trace_module_has_bad_taint(mod))
6764 return;
6765
6766 trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
6543} 6767}
6544 6768
6545static __init int tracer_init_debugfs(void) 6769#ifdef CONFIG_TRACE_ENUM_MAP_FILE
6770static void trace_module_remove_enums(struct module *mod)
6771{
6772 union trace_enum_map_item *map;
6773 union trace_enum_map_item **last = &trace_enum_maps;
6774
6775 if (!mod->num_trace_enums)
6776 return;
6777
6778 mutex_lock(&trace_enum_mutex);
6779
6780 map = trace_enum_maps;
6781
6782 while (map) {
6783 if (map->head.mod == mod)
6784 break;
6785 map = trace_enum_jmp_to_tail(map);
6786 last = &map->tail.next;
6787 map = map->tail.next;
6788 }
6789 if (!map)
6790 goto out;
6791
6792 *last = trace_enum_jmp_to_tail(map)->tail.next;
6793 kfree(map);
6794 out:
6795 mutex_unlock(&trace_enum_mutex);
6796}
6797#else
6798static inline void trace_module_remove_enums(struct module *mod) { }
6799#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
6800
6801static int trace_module_notify(struct notifier_block *self,
6802 unsigned long val, void *data)
6803{
6804 struct module *mod = data;
6805
6806 switch (val) {
6807 case MODULE_STATE_COMING:
6808 trace_module_add_enums(mod);
6809 break;
6810 case MODULE_STATE_GOING:
6811 trace_module_remove_enums(mod);
6812 break;
6813 }
6814
6815 return 0;
6816}
6817
6818static struct notifier_block trace_module_nb = {
6819 .notifier_call = trace_module_notify,
6820 .priority = 0,
6821};
6822#endif /* CONFIG_MODULES */
6823
6824static __init int tracer_init_tracefs(void)
6546{ 6825{
6547 struct dentry *d_tracer; 6826 struct dentry *d_tracer;
6548 6827
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void)
6552 if (IS_ERR(d_tracer)) 6831 if (IS_ERR(d_tracer))
6553 return 0; 6832 return 0;
6554 6833
6555 init_tracer_debugfs(&global_trace, d_tracer); 6834 init_tracer_tracefs(&global_trace, d_tracer);
6556 6835
6557 trace_create_file("tracing_thresh", 0644, d_tracer, 6836 trace_create_file("tracing_thresh", 0644, d_tracer,
6558 &global_trace, &tracing_thresh_fops); 6837 &global_trace, &tracing_thresh_fops);
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void)
6566 trace_create_file("saved_cmdlines_size", 0644, d_tracer, 6845 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6567 NULL, &tracing_saved_cmdlines_size_fops); 6846 NULL, &tracing_saved_cmdlines_size_fops);
6568 6847
6848 trace_enum_init();
6849
6850 trace_create_enum_file(d_tracer);
6851
6852#ifdef CONFIG_MODULES
6853 register_module_notifier(&trace_module_nb);
6854#endif
6855
6569#ifdef CONFIG_DYNAMIC_FTRACE 6856#ifdef CONFIG_DYNAMIC_FTRACE
6570 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6857 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6571 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6858 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void)
6575 6862
6576 create_trace_options_dir(&global_trace); 6863 create_trace_options_dir(&global_trace);
6577 6864
6865 /* If the tracer was started via cmdline, create options for it here */
6866 if (global_trace.current_trace != &nop_trace)
6867 update_tracer_options(&global_trace, global_trace.current_trace);
6868
6578 return 0; 6869 return 0;
6579} 6870}
6580 6871
@@ -6888,7 +7179,7 @@ void __init trace_init(void)
6888 tracepoint_printk = 0; 7179 tracepoint_printk = 0;
6889 } 7180 }
6890 tracer_alloc_buffers(); 7181 tracer_alloc_buffers();
6891 trace_event_init(); 7182 trace_event_init();
6892} 7183}
6893 7184
6894__init static int clear_boot_tracer(void) 7185__init static int clear_boot_tracer(void)
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void)
6910 return 0; 7201 return 0;
6911} 7202}
6912 7203
6913fs_initcall(tracer_init_debugfs); 7204fs_initcall(tracer_init_tracefs);
6914late_initcall(clear_boot_tracer); 7205late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dd8205a35760..d2612016de94 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,7 +334,7 @@ struct tracer_flags {
334 334
335 335
336/** 336/**
337 * struct tracer - a specific tracer and its callbacks to interact with debugfs 337 * struct tracer - a specific tracer and its callbacks to interact with tracefs
338 * @name: the name chosen to select it on the available_tracers file 338 * @name: the name chosen to select it on the available_tracers file
339 * @init: called when one switches to this tracer (echo name > current_tracer) 339 * @init: called when one switches to this tracer (echo name > current_tracer)
340 * @reset: called when one switches to another tracer 340 * @reset: called when one switches to another tracer
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { }
1309 1309
1310#ifdef CONFIG_EVENT_TRACING 1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void); 1311void trace_event_init(void);
1312void trace_event_enum_update(struct trace_enum_map **map, int len);
1312#else 1313#else
1313static inline void __init trace_event_init(void) { } 1314static inline void __init trace_event_init(void) { }
 1315static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
1314#endif 1316#endif
1315 1317
1316extern struct trace_iterator *tracepoint_print_iter; 1318extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e2d027ac66a2..ee7b94a4810a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%pf: %s", 226 F_printk("%ps: %s",
227 (void *)__entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%pf: %s", 241 F_printk("%ps: %s",
242 (void *)__entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243 243
244 FILTER_OTHER 244 FILTER_OTHER
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
253 __field( const char *, str ) 253 __field( const char *, str )
254 ), 254 ),
255 255
256 F_printk("%pf: %s", 256 F_printk("%ps: %s",
257 (void *)__entry->ip, __entry->str), 257 (void *)__entry->ip, __entry->str),
258 258
259 FILTER_OTHER 259 FILTER_OTHER
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index db54dda10ccc..7da1dfeb322e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
13#include <linux/workqueue.h> 13#include <linux/workqueue.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/debugfs.h> 16#include <linux/tracefs.h>
17#include <linux/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
480 return; 480 return;
481 481
482 if (!--dir->nr_events) { 482 if (!--dir->nr_events) {
483 debugfs_remove_recursive(dir->entry); 483 tracefs_remove_recursive(dir->entry);
484 list_del(&dir->list); 484 list_del(&dir->list);
485 __put_system_dir(dir); 485 __put_system_dir(dir);
486 } 486 }
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
499 } 499 }
500 spin_unlock(&dir->d_lock); 500 spin_unlock(&dir->d_lock);
501 501
502 debugfs_remove_recursive(dir); 502 tracefs_remove_recursive(dir);
503 } 503 }
504 504
505 list_del(&file->list); 505 list_del(&file->list);
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1526 } else 1526 } else
1527 __get_system(system); 1527 __get_system(system);
1528 1528
1529 dir->entry = debugfs_create_dir(name, parent); 1529 dir->entry = tracefs_create_dir(name, parent);
1530 if (!dir->entry) { 1530 if (!dir->entry) {
1531 pr_warn("Failed to create system directory %s\n", name); 1531 pr_warn("Failed to create system directory %s\n", name);
1532 __put_system(system); 1532 __put_system(system);
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1539 dir->subsystem = system; 1539 dir->subsystem = system;
1540 file->system = dir; 1540 file->system = dir;
1541 1541
1542 entry = debugfs_create_file("filter", 0644, dir->entry, dir, 1542 entry = tracefs_create_file("filter", 0644, dir->entry, dir,
1543 &ftrace_subsystem_filter_fops); 1543 &ftrace_subsystem_filter_fops);
1544 if (!entry) { 1544 if (!entry) {
1545 kfree(system->filter); 1545 kfree(system->filter);
1546 system->filter = NULL; 1546 system->filter = NULL;
1547 pr_warn("Could not create debugfs '%s/filter' entry\n", name); 1547 pr_warn("Could not create tracefs '%s/filter' entry\n", name);
1548 } 1548 }
1549 1549
1550 trace_create_file("enable", 0644, dir->entry, dir, 1550 trace_create_file("enable", 0644, dir->entry, dir,
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1585 d_events = parent; 1585 d_events = parent;
1586 1586
1587 name = ftrace_event_name(call); 1587 name = ftrace_event_name(call);
1588 file->dir = debugfs_create_dir(name, d_events); 1588 file->dir = tracefs_create_dir(name, d_events);
1589 if (!file->dir) { 1589 if (!file->dir) {
1590 pr_warn("Could not create debugfs '%s' directory\n", name); 1590 pr_warn("Could not create tracefs '%s' directory\n", name);
1591 return -1; 1591 return -1;
1592 } 1592 }
1593 1593
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
1704 return 0; 1704 return 0;
1705} 1705}
1706 1706
1707static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
1708{
1709 int rlen;
1710 int elen;
1711
1712 /* Find the length of the enum value as a string */
1713 elen = snprintf(ptr, 0, "%ld", map->enum_value);
1714 /* Make sure there's enough room to replace the string with the value */
1715 if (len < elen)
1716 return NULL;
1717
1718 snprintf(ptr, elen + 1, "%ld", map->enum_value);
1719
1720 /* Get the rest of the string of ptr */
1721 rlen = strlen(ptr + len);
1722 memmove(ptr + elen, ptr + len, rlen);
1723 /* Make sure we end the new string */
1724 ptr[elen + rlen] = 0;
1725
1726 return ptr + elen;
1727}
1728
1729static void update_event_printk(struct ftrace_event_call *call,
1730 struct trace_enum_map *map)
1731{
1732 char *ptr;
1733 int quote = 0;
1734 int len = strlen(map->enum_string);
1735
1736 for (ptr = call->print_fmt; *ptr; ptr++) {
1737 if (*ptr == '\\') {
1738 ptr++;
1739 /* paranoid */
1740 if (!*ptr)
1741 break;
1742 continue;
1743 }
1744 if (*ptr == '"') {
1745 quote ^= 1;
1746 continue;
1747 }
1748 if (quote)
1749 continue;
1750 if (isdigit(*ptr)) {
1751 /* skip numbers */
1752 do {
1753 ptr++;
1754 /* Check for alpha chars like ULL */
1755 } while (isalnum(*ptr));
1756 /*
1757 * A number must have some kind of delimiter after
1758 * it, and we can ignore that too.
1759 */
1760 continue;
1761 }
1762 if (isalpha(*ptr) || *ptr == '_') {
1763 if (strncmp(map->enum_string, ptr, len) == 0 &&
1764 !isalnum(ptr[len]) && ptr[len] != '_') {
1765 ptr = enum_replace(ptr, map, len);
1766 /* Hmm, enum string smaller than value */
1767 if (WARN_ON_ONCE(!ptr))
1768 return;
1769 /*
1770 * No need to decrement here, as enum_replace()
 1771 * returns the pointer to the character past
1772 * the enum, and two enums can not be placed
1773 * back to back without something in between.
1774 * We can skip that something in between.
1775 */
1776 continue;
1777 }
1778 skip_more:
1779 do {
1780 ptr++;
1781 } while (isalnum(*ptr) || *ptr == '_');
1782 /*
1783 * If what comes after this variable is a '.' or
1784 * '->' then we can continue to ignore that string.
1785 */
1786 if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
1787 ptr += *ptr == '.' ? 1 : 2;
1788 goto skip_more;
1789 }
1790 /*
1791 * Once again, we can skip the delimiter that came
1792 * after the string.
1793 */
1794 continue;
1795 }
1796 }
1797}
1798
1799void trace_event_enum_update(struct trace_enum_map **map, int len)
1800{
1801 struct ftrace_event_call *call, *p;
1802 const char *last_system = NULL;
1803 int last_i;
1804 int i;
1805
1806 down_write(&trace_event_sem);
1807 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1808 /* events are usually grouped together with systems */
1809 if (!last_system || call->class->system != last_system) {
1810 last_i = 0;
1811 last_system = call->class->system;
1812 }
1813
1814 for (i = last_i; i < len; i++) {
1815 if (call->class->system == map[i]->system) {
1816 /* Save the first system if need be */
1817 if (!last_i)
1818 last_i = i;
1819 update_event_printk(call, map[i]);
1820 }
1821 }
1822 }
1823 up_write(&trace_event_sem);
1824}
1825
1707static struct ftrace_event_file * 1826static struct ftrace_event_file *
1708trace_create_new_event(struct ftrace_event_call *call, 1827trace_create_new_event(struct ftrace_event_call *call,
1709 struct trace_array *tr) 1828 struct trace_array *tr)
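
enum_replace() in the hunk above substitutes an enum name inside print_fmt with its decimal value and shifts the remainder of the string left. A simplified user-space sketch of the same shift-and-substitute idea; it stages the digits through a scratch buffer rather than reusing the kernel's exact snprintf sequence, and the names in main() are invented:

#include <stdio.h>
#include <string.h>

/*
 * Replace the token of length @len at @ptr with the decimal form of @value,
 * shifting the tail of the string left, much like enum_replace() does for
 * print_fmt. Bails out if the printed value would be longer than the token.
 */
static char *replace_token(char *ptr, long value, int len)
{
	char num[32];
	int elen = snprintf(num, sizeof(num), "%ld", value);
	int rlen;

	if (elen < 0 || elen > len)
		return NULL;				/* value would not fit */

	rlen = strlen(ptr + len);			/* rest of the string */
	memmove(ptr + elen, ptr + len, rlen + 1);	/* shift tail + NUL left */
	memcpy(ptr, num, elen);				/* drop the digits in place */
	return ptr + elen;				/* just past the value */
}

int main(void)
{
	char fmt[] = "state == EVENT_RUNNING ? \"run\" : \"idle\"";
	char *hit = strstr(fmt, "EVENT_RUNNING");

	if (hit && replace_token(hit, 2, (int)strlen("EVENT_RUNNING")))
		printf("%s\n", fmt);	/* state == 2 ? "run" : "idle" */
	return 0;
}
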
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self,
1915 2034
1916static struct notifier_block trace_module_nb = { 2035static struct notifier_block trace_module_nb = {
1917 .notifier_call = trace_module_notify, 2036 .notifier_call = trace_module_notify,
1918 .priority = 0, 2037 .priority = 1, /* higher than trace.c module notify */
1919}; 2038};
1920#endif /* CONFIG_MODULES */ 2039#endif /* CONFIG_MODULES */
1921 2040
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; }
2228/* 2347/*
2229 * The top level array has already had its ftrace_event_file 2348 * The top level array has already had its ftrace_event_file
2230 * descriptors created in order to allow for early events to 2349 * descriptors created in order to allow for early events to
2231 * be recorded. This function is called after the debugfs has been 2350 * be recorded. This function is called after the tracefs has been
2232 * initialized, and we now have to create the files associated 2351 * initialized, and we now have to create the files associated
2233 * to the events. 2352 * to the events.
2234 */ 2353 */
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2311 struct dentry *d_events; 2430 struct dentry *d_events;
2312 struct dentry *entry; 2431 struct dentry *entry;
2313 2432
2314 entry = debugfs_create_file("set_event", 0644, parent, 2433 entry = tracefs_create_file("set_event", 0644, parent,
2315 tr, &ftrace_set_event_fops); 2434 tr, &ftrace_set_event_fops);
2316 if (!entry) { 2435 if (!entry) {
2317 pr_warn("Could not create debugfs 'set_event' entry\n"); 2436 pr_warn("Could not create tracefs 'set_event' entry\n");
2318 return -ENOMEM; 2437 return -ENOMEM;
2319 } 2438 }
2320 2439
2321 d_events = debugfs_create_dir("events", parent); 2440 d_events = tracefs_create_dir("events", parent);
2322 if (!d_events) { 2441 if (!d_events) {
2323 pr_warn("Could not create debugfs 'events' directory\n"); 2442 pr_warn("Could not create tracefs 'events' directory\n");
2324 return -ENOMEM; 2443 return -ENOMEM;
2325 } 2444 }
2326 2445
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr)
2412 2531
2413 down_write(&trace_event_sem); 2532 down_write(&trace_event_sem);
2414 __trace_remove_event_dirs(tr); 2533 __trace_remove_event_dirs(tr);
2415 debugfs_remove_recursive(tr->event_dir); 2534 tracefs_remove_recursive(tr->event_dir);
2416 up_write(&trace_event_sem); 2535 up_write(&trace_event_sem);
2417 2536
2418 tr->event_dir = NULL; 2537 tr->event_dir = NULL;
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void)
2534 if (IS_ERR(d_tracer)) 2653 if (IS_ERR(d_tracer))
2535 return 0; 2654 return 0;
2536 2655
2537 entry = debugfs_create_file("available_events", 0444, d_tracer, 2656 entry = tracefs_create_file("available_events", 0444, d_tracer,
2538 tr, &ftrace_avail_fops); 2657 tr, &ftrace_avail_fops);
2539 if (!entry) 2658 if (!entry)
2540 pr_warn("Could not create debugfs 'available_events' entry\n"); 2659 pr_warn("Could not create tracefs 'available_events' entry\n");
2541 2660
2542 if (trace_define_common_fields()) 2661 if (trace_define_common_fields())
2543 pr_warn("tracing: Failed to allocate common fields"); 2662 pr_warn("tracing: Failed to allocate common fields");
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 12e2b99be862..174a6a71146c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \
177 }, \ 177 }, \
178 .event.type = etype, \ 178 .event.type = etype, \
179 .print_fmt = print, \ 179 .print_fmt = print, \
180 .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ 180 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
181}; \ 181}; \
182struct ftrace_event_call __used \ 182struct ftrace_event_call __used \
183__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 183__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d25ad1526bb..9cfea4c6d314 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com> 6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 * 7 *
8 */ 8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 9#include <linux/uaccess.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/slab.h> 11#include <linux/slab.h>
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
151 * The curr_ret_stack is initialized to -1 and get increased 150 * The curr_ret_stack is initialized to -1 and get increased
152 * in this function. So it can be less than -1 only if it was 151 * in this function. So it can be less than -1 only if it was
153 * filtered out via ftrace_graph_notrace_addr() which can be 152 * filtered out via ftrace_graph_notrace_addr() which can be
154 * set from set_graph_notrace file in debugfs by user. 153 * set from set_graph_notrace file in tracefs by user.
155 */ 154 */
156 if (current->curr_ret_stack < -1) 155 if (current->curr_ret_stack < -1)
157 return -EBUSY; 156 return -EBUSY;
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = {
1432 .llseek = generic_file_llseek, 1431 .llseek = generic_file_llseek,
1433}; 1432};
1434 1433
1435static __init int init_graph_debugfs(void) 1434static __init int init_graph_tracefs(void)
1436{ 1435{
1437 struct dentry *d_tracer; 1436 struct dentry *d_tracer;
1438 1437
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void)
1445 1444
1446 return 0; 1445 return 0;
1447} 1446}
1448fs_initcall(init_graph_debugfs); 1447fs_initcall(init_graph_tracefs);
1449 1448
1450static __init int init_graph_trace(void) 1449static __init int init_graph_trace(void)
1451{ 1450{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..9ba3f43f580e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size)
250#define fetch_file_offset_string_size NULL 250#define fetch_file_offset_string_size NULL
251 251
252/* Fetch type information table */ 252/* Fetch type information table */
253const struct fetch_type kprobes_fetch_type_table[] = { 253static const struct fetch_type kprobes_fetch_type_table[] = {
254 /* Special types */ 254 /* Special types */
255 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, 255 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
256 sizeof(u32), 1, "__data_loc char[]"), 256 sizeof(u32), 1, "__data_loc char[]"),
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv)
760 760
761 /* Parse fetch argument */ 761 /* Parse fetch argument */
762 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, 762 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
763 is_return, true); 763 is_return, true,
764 kprobes_fetch_type_table);
764 if (ret) { 765 if (ret) {
765 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 766 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
766 goto error; 767 goto error;
@@ -1310,7 +1311,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
1310 return ret; 1311 return ret;
1311} 1312}
1312 1313
1313/* Make a debugfs interface for controlling probe points */ 1314/* Make a tracefs interface for controlling probe points */
1314static __init int init_kprobe_trace(void) 1315static __init int init_kprobe_trace(void)
1315{ 1316{
1316 struct dentry *d_tracer; 1317 struct dentry *d_tracer;
@@ -1323,20 +1324,20 @@ static __init int init_kprobe_trace(void)
1323 if (IS_ERR(d_tracer)) 1324 if (IS_ERR(d_tracer))
1324 return 0; 1325 return 0;
1325 1326
1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer, 1327 entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
1327 NULL, &kprobe_events_ops); 1328 NULL, &kprobe_events_ops);
1328 1329
1329 /* Event list interface */ 1330 /* Event list interface */
1330 if (!entry) 1331 if (!entry)
1331 pr_warning("Could not create debugfs " 1332 pr_warning("Could not create tracefs "
1332 "'kprobe_events' entry\n"); 1333 "'kprobe_events' entry\n");
1333 1334
1334 /* Profile interface */ 1335 /* Profile interface */
1335 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, 1336 entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
1336 NULL, &kprobe_profile_ops); 1337 NULL, &kprobe_profile_ops);
1337 1338
1338 if (!entry) 1339 if (!entry)
1339 pr_warning("Could not create debugfs " 1340 pr_warning("Could not create tracefs "
1340 "'kprobe_profile' entry\n"); 1341 "'kprobe_profile' entry\n");
1341 return 0; 1342 return 0;
1342} 1343}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index b983b2fd2ca1..1769a81da8a7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
356 356
357/* Recursive argument parser */ 357/* Recursive argument parser */
358static int parse_probe_arg(char *arg, const struct fetch_type *t, 358static int parse_probe_arg(char *arg, const struct fetch_type *t,
359 struct fetch_param *f, bool is_return, bool is_kprobe) 359 struct fetch_param *f, bool is_return, bool is_kprobe,
360 const struct fetch_type *ftbl)
360{ 361{
361 const struct fetch_type *ftbl;
362 unsigned long param; 362 unsigned long param;
363 long offset; 363 long offset;
364 char *tmp; 364 char *tmp;
365 int ret = 0; 365 int ret = 0;
366 366
367 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
368 BUG_ON(ftbl == NULL);
369
370 switch (arg[0]) { 367 switch (arg[0]) {
371 case '$': 368 case '$':
372 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); 369 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
447 dprm->fetch_size = get_fetch_size_function(t, 444 dprm->fetch_size = get_fetch_size_function(t,
448 dprm->fetch, ftbl); 445 dprm->fetch, ftbl);
449 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, 446 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
450 is_kprobe); 447 is_kprobe, ftbl);
451 if (ret) 448 if (ret)
452 kfree(dprm); 449 kfree(dprm);
453 else { 450 else {
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf,
505 502
506/* String length checking wrapper */ 503/* String length checking wrapper */
507int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 504int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
508 struct probe_arg *parg, bool is_return, bool is_kprobe) 505 struct probe_arg *parg, bool is_return, bool is_kprobe,
506 const struct fetch_type *ftbl)
509{ 507{
510 const struct fetch_type *ftbl;
511 const char *t; 508 const char *t;
512 int ret; 509 int ret;
513 510
514 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
515 BUG_ON(ftbl == NULL);
516
517 if (strlen(arg) > MAX_ARGSTR_LEN) { 511 if (strlen(arg) > MAX_ARGSTR_LEN) {
518 pr_info("Argument is too long.: %s\n", arg); 512 pr_info("Argument is too long.: %s\n", arg);
519 return -ENOSPC; 513 return -ENOSPC;
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
535 } 529 }
536 parg->offset = *size; 530 parg->offset = *size;
537 *size += parg->type->size; 531 *size += parg->type->size;
538 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); 532 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
533 is_kprobe, ftbl);
539 534
540 if (ret >= 0 && t != NULL) 535 if (ret >= 0 && t != NULL)
541 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); 536 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fbce16d..ab283e146b70 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/smp.h> 27#include <linux/smp.h>
28#include <linux/debugfs.h> 28#include <linux/tracefs.h>
29#include <linux/types.h> 29#include <linux/types.h>
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
229#define FETCH_TYPE_STRING 0 229#define FETCH_TYPE_STRING 0
230#define FETCH_TYPE_STRSIZE 1 230#define FETCH_TYPE_STRSIZE 1
231 231
232/*
233 * Fetch type information table.
234 * It's declared as a weak symbol due to conditional compilation.
235 */
236extern __weak const struct fetch_type kprobes_fetch_type_table[];
237extern __weak const struct fetch_type uprobes_fetch_type_table[];
238
239#ifdef CONFIG_KPROBE_EVENT 232#ifdef CONFIG_KPROBE_EVENT
240struct symbol_cache; 233struct symbol_cache;
241unsigned long update_symbol_cache(struct symbol_cache *sc); 234unsigned long update_symbol_cache(struct symbol_cache *sc);
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
333} 326}
334 327
335extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 328extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
336 struct probe_arg *parg, bool is_return, bool is_kprobe); 329 struct probe_arg *parg, bool is_return, bool is_kprobe,
330 const struct fetch_type *ftbl);
337 331
338extern int traceprobe_conflict_field_name(const char *name, 332extern int traceprobe_conflict_field_name(const char *name,
339 struct probe_arg *args, int narg); 333 struct probe_arg *args, int narg);
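
The trace_probe refactor above stops relying on weak kprobes/uprobes fetch-type tables and instead passes the table into traceprobe_parse_probe_arg() explicitly. A minimal sketch of that pass-the-table style, with invented names for illustration:

#include <stdio.h>
#include <string.h>

struct fetch_type { const char *name; int size; };

/* Each probe flavour would own a table like this and hand it down. */
static const struct fetch_type demo_types[] = {
	{ "u32", 4 }, { "u64", 8 }, { "string", -1 },
};

/* The table is an explicit parameter, not a weak global looked up inside. */
static const struct fetch_type *find_type(const char *name,
					  const struct fetch_type *tbl,
					  size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (strcmp(tbl[i].name, name) == 0)
			return &tbl[i];
	return NULL;
}

int main(void)
{
	const struct fetch_type *t;

	t = find_type("u64", demo_types,
		      sizeof(demo_types) / sizeof(demo_types[0]));
	if (t)
		printf("matched %s, size %d\n", t->name, t->size);
	return 0;
}
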
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75e19e86c954..6cf935316769 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/debugfs.h> 15#include <linux/tracefs.h>
16#include "trace_stat.h" 16#include "trace_stat.h"
17#include "trace.h" 17#include "trace.h"
18 18
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
65 65
66static void destroy_session(struct stat_session *session) 66static void destroy_session(struct stat_session *session)
67{ 67{
68 debugfs_remove(session->file); 68 tracefs_remove(session->file);
69 __reset_stat_session(session); 69 __reset_stat_session(session);
70 mutex_destroy(&session->stat_mutex); 70 mutex_destroy(&session->stat_mutex);
71 kfree(session); 71 kfree(session);
@@ -279,9 +279,9 @@ static int tracing_stat_init(void)
279 if (IS_ERR(d_tracing)) 279 if (IS_ERR(d_tracing))
280 return 0; 280 return 0;
281 281
282 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 282 stat_dir = tracefs_create_dir("trace_stat", d_tracing);
283 if (!stat_dir) 283 if (!stat_dir)
284 pr_warning("Could not create debugfs " 284 pr_warning("Could not create tracefs "
285 "'trace_stat' entry\n"); 285 "'trace_stat' entry\n");
286 return 0; 286 return 0;
287} 287}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
291 if (!stat_dir && tracing_stat_init()) 291 if (!stat_dir && tracing_stat_init())
292 return -ENODEV; 292 return -ENODEV;
293 293
294 session->file = debugfs_create_file(session->ts->name, 0644, 294 session->file = tracefs_create_file(session->ts->name, 0644,
295 stat_dir, 295 stat_dir,
296 session, &tracing_stat_fops); 296 session, &tracing_stat_fops);
297 if (!session->file) 297 if (!session->file)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7dc1c8abecd6..74865465e0b7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string)
196DEFINE_FETCH_file_offset(string_size) 196DEFINE_FETCH_file_offset(string_size)
197 197
198/* Fetch type information table */ 198/* Fetch type information table */
199const struct fetch_type uprobes_fetch_type_table[] = { 199static const struct fetch_type uprobes_fetch_type_table[] = {
200 /* Special types */ 200 /* Special types */
201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, 201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
202 sizeof(u32), 1, "__data_loc char[]"), 202 sizeof(u32), 1, "__data_loc char[]"),
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv)
535 535
536 /* Parse fetch argument */ 536 /* Parse fetch argument */
537 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, 537 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
538 is_return, false); 538 is_return, false,
539 uprobes_fetch_type_table);
539 if (ret) { 540 if (ret) {
540 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 541 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
541 goto error; 542 goto error;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41ff75b478c6..586ad91300b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
159 159
160 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
161 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
162 struct worker *manager; /* L: purely informational */
162 struct mutex attach_mutex; /* attach/detach exclusion */ 163 struct mutex attach_mutex; /* attach/detach exclusion */
163 struct list_head workers; /* A: attached workers */ 164 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */ 165 struct completion *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
230 */ 231 */
231struct workqueue_struct { 232struct workqueue_struct {
232 struct list_head pwqs; /* WR: all pwqs of this wq */ 233 struct list_head pwqs; /* WR: all pwqs of this wq */
233 struct list_head list; /* PL: list of all workqueues */ 234 struct list_head list; /* PR: list of all workqueues */
234 235
235 struct mutex mutex; /* protects this wq */ 236 struct mutex mutex; /* protects this wq */
236 int work_color; /* WQ: current work color */ 237 int work_color; /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
257#endif 258#endif
258 char name[WQ_NAME_LEN]; /* I: workqueue name */ 259 char name[WQ_NAME_LEN]; /* I: workqueue name */
259 260
261 /*
262 * Destruction of workqueue_struct is sched-RCU protected to allow
263 * walking the workqueues list without grabbing wq_pool_mutex.
264 * This is used to dump all workqueues from sysrq.
265 */
266 struct rcu_head rcu;
267
260 /* hot fields used during command issue, aligned to cacheline */ 268 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ 269 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ 270 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
288static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ 296static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
289static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ 297static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
290 298
291static LIST_HEAD(workqueues); /* PL: list of all workqueues */ 299static LIST_HEAD(workqueues); /* PR: list of all workqueues */
292static bool workqueue_freezing; /* PL: have wqs started freezing? */ 300static bool workqueue_freezing; /* PL: have wqs started freezing? */
293 301
294/* the per-cpu worker pools */ 302/* the per-cpu worker pools */
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
324static int worker_thread(void *__worker); 332static int worker_thread(void *__worker);
325static void copy_workqueue_attrs(struct workqueue_attrs *to, 333static void copy_workqueue_attrs(struct workqueue_attrs *to,
326 const struct workqueue_attrs *from); 334 const struct workqueue_attrs *from);
335static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
327 336
328#define CREATE_TRACE_POINTS 337#define CREATE_TRACE_POINTS
329#include <trace/events/workqueue.h> 338#include <trace/events/workqueue.h>
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker)
1911 */ 1920 */
1912 if (!mutex_trylock(&pool->manager_arb)) 1921 if (!mutex_trylock(&pool->manager_arb))
1913 return false; 1922 return false;
1923 pool->manager = worker;
1914 1924
1915 maybe_create_worker(pool); 1925 maybe_create_worker(pool);
1916 1926
1927 pool->manager = NULL;
1917 mutex_unlock(&pool->manager_arb); 1928 mutex_unlock(&pool->manager_arb);
1918 return true; 1929 return true;
1919} 1930}
@@ -2303,6 +2314,7 @@ repeat:
2303struct wq_barrier { 2314struct wq_barrier {
2304 struct work_struct work; 2315 struct work_struct work;
2305 struct completion done; 2316 struct completion done;
2317 struct task_struct *task; /* purely informational */
2306}; 2318};
2307 2319
2308static void wq_barrier_func(struct work_struct *work) 2320static void wq_barrier_func(struct work_struct *work)
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2351 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2363 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2352 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2364 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2353 init_completion(&barr->done); 2365 init_completion(&barr->done);
2366 barr->task = current;
2354 2367
2355 /* 2368 /*
2356 * If @target is currently being executed, schedule the 2369 * If @target is currently being executed, schedule the
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2989} 3002}
2990EXPORT_SYMBOL_GPL(execute_in_process_context); 3003EXPORT_SYMBOL_GPL(execute_in_process_context);
2991 3004
2992#ifdef CONFIG_SYSFS
2993/*
2994 * Workqueues with WQ_SYSFS flag set is visible to userland via
2995 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
2996 * following attributes.
2997 *
2998 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
2999 * max_active RW int : maximum number of in-flight work items
3000 *
3001 * Unbound workqueues have the following extra attributes.
3002 *
3003 * id RO int : the associated pool ID
3004 * nice RW int : nice value of the workers
3005 * cpumask RW mask : bitmask of allowed CPUs for the workers
3006 */
3007struct wq_device {
3008 struct workqueue_struct *wq;
3009 struct device dev;
3010};
3011
3012static struct workqueue_struct *dev_to_wq(struct device *dev)
3013{
3014 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3015
3016 return wq_dev->wq;
3017}
3018
3019static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3020 char *buf)
3021{
3022 struct workqueue_struct *wq = dev_to_wq(dev);
3023
3024 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3025}
3026static DEVICE_ATTR_RO(per_cpu);
3027
3028static ssize_t max_active_show(struct device *dev,
3029 struct device_attribute *attr, char *buf)
3030{
3031 struct workqueue_struct *wq = dev_to_wq(dev);
3032
3033 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3034}
3035
3036static ssize_t max_active_store(struct device *dev,
3037 struct device_attribute *attr, const char *buf,
3038 size_t count)
3039{
3040 struct workqueue_struct *wq = dev_to_wq(dev);
3041 int val;
3042
3043 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3044 return -EINVAL;
3045
3046 workqueue_set_max_active(wq, val);
3047 return count;
3048}
3049static DEVICE_ATTR_RW(max_active);
3050
3051static struct attribute *wq_sysfs_attrs[] = {
3052 &dev_attr_per_cpu.attr,
3053 &dev_attr_max_active.attr,
3054 NULL,
3055};
3056ATTRIBUTE_GROUPS(wq_sysfs);
3057
3058static ssize_t wq_pool_ids_show(struct device *dev,
3059 struct device_attribute *attr, char *buf)
3060{
3061 struct workqueue_struct *wq = dev_to_wq(dev);
3062 const char *delim = "";
3063 int node, written = 0;
3064
3065 rcu_read_lock_sched();
3066 for_each_node(node) {
3067 written += scnprintf(buf + written, PAGE_SIZE - written,
3068 "%s%d:%d", delim, node,
3069 unbound_pwq_by_node(wq, node)->pool->id);
3070 delim = " ";
3071 }
3072 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3073 rcu_read_unlock_sched();
3074
3075 return written;
3076}
3077
3078static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3079 char *buf)
3080{
3081 struct workqueue_struct *wq = dev_to_wq(dev);
3082 int written;
3083
3084 mutex_lock(&wq->mutex);
3085 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3086 mutex_unlock(&wq->mutex);
3087
3088 return written;
3089}
3090
3091/* prepare workqueue_attrs for sysfs store operations */
3092static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3093{
3094 struct workqueue_attrs *attrs;
3095
3096 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3097 if (!attrs)
3098 return NULL;
3099
3100 mutex_lock(&wq->mutex);
3101 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3102 mutex_unlock(&wq->mutex);
3103 return attrs;
3104}
3105
3106static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3107 const char *buf, size_t count)
3108{
3109 struct workqueue_struct *wq = dev_to_wq(dev);
3110 struct workqueue_attrs *attrs;
3111 int ret;
3112
3113 attrs = wq_sysfs_prep_attrs(wq);
3114 if (!attrs)
3115 return -ENOMEM;
3116
3117 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3118 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3119 ret = apply_workqueue_attrs(wq, attrs);
3120 else
3121 ret = -EINVAL;
3122
3123 free_workqueue_attrs(attrs);
3124 return ret ?: count;
3125}
3126
3127static ssize_t wq_cpumask_show(struct device *dev,
3128 struct device_attribute *attr, char *buf)
3129{
3130 struct workqueue_struct *wq = dev_to_wq(dev);
3131 int written;
3132
3133 mutex_lock(&wq->mutex);
3134 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
3135 cpumask_pr_args(wq->unbound_attrs->cpumask));
3136 mutex_unlock(&wq->mutex);
3137 return written;
3138}
3139
3140static ssize_t wq_cpumask_store(struct device *dev,
3141 struct device_attribute *attr,
3142 const char *buf, size_t count)
3143{
3144 struct workqueue_struct *wq = dev_to_wq(dev);
3145 struct workqueue_attrs *attrs;
3146 int ret;
3147
3148 attrs = wq_sysfs_prep_attrs(wq);
3149 if (!attrs)
3150 return -ENOMEM;
3151
3152 ret = cpumask_parse(buf, attrs->cpumask);
3153 if (!ret)
3154 ret = apply_workqueue_attrs(wq, attrs);
3155
3156 free_workqueue_attrs(attrs);
3157 return ret ?: count;
3158}
3159
3160static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3161 char *buf)
3162{
3163 struct workqueue_struct *wq = dev_to_wq(dev);
3164 int written;
3165
3166 mutex_lock(&wq->mutex);
3167 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3168 !wq->unbound_attrs->no_numa);
3169 mutex_unlock(&wq->mutex);
3170
3171 return written;
3172}
3173
3174static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3175 const char *buf, size_t count)
3176{
3177 struct workqueue_struct *wq = dev_to_wq(dev);
3178 struct workqueue_attrs *attrs;
3179 int v, ret;
3180
3181 attrs = wq_sysfs_prep_attrs(wq);
3182 if (!attrs)
3183 return -ENOMEM;
3184
3185 ret = -EINVAL;
3186 if (sscanf(buf, "%d", &v) == 1) {
3187 attrs->no_numa = !v;
3188 ret = apply_workqueue_attrs(wq, attrs);
3189 }
3190
3191 free_workqueue_attrs(attrs);
3192 return ret ?: count;
3193}
3194
3195static struct device_attribute wq_sysfs_unbound_attrs[] = {
3196 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3197 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3198 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3199 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3200 __ATTR_NULL,
3201};
3202
3203static struct bus_type wq_subsys = {
3204 .name = "workqueue",
3205 .dev_groups = wq_sysfs_groups,
3206};
3207
3208static int __init wq_sysfs_init(void)
3209{
3210 return subsys_virtual_register(&wq_subsys, NULL);
3211}
3212core_initcall(wq_sysfs_init);
3213
3214static void wq_device_release(struct device *dev)
3215{
3216 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3217
3218 kfree(wq_dev);
3219}
3220
3221/**
3222 * workqueue_sysfs_register - make a workqueue visible in sysfs
3223 * @wq: the workqueue to register
3224 *
3225 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3226 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3227 * which is the preferred method.
3228 *
3229 * A workqueue user should use this function directly iff it wants to apply
3230 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3231 * apply_workqueue_attrs() may race against userland updating the
3232 * attributes.
3233 *
3234 * Return: 0 on success, -errno on failure.
3235 */
3236int workqueue_sysfs_register(struct workqueue_struct *wq)
3237{
3238 struct wq_device *wq_dev;
3239 int ret;
3240
3241 /*
3242	 * Adjusting max_active or creating new pwqs by applying
3243 * attributes breaks ordering guarantee. Disallow exposing ordered
3244 * workqueues.
3245 */
3246 if (WARN_ON(wq->flags & __WQ_ORDERED))
3247 return -EINVAL;
3248
3249 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3250 if (!wq_dev)
3251 return -ENOMEM;
3252
3253 wq_dev->wq = wq;
3254 wq_dev->dev.bus = &wq_subsys;
3255 wq_dev->dev.init_name = wq->name;
3256 wq_dev->dev.release = wq_device_release;
3257
3258 /*
3259 * unbound_attrs are created separately. Suppress uevent until
3260 * everything is ready.
3261 */
3262 dev_set_uevent_suppress(&wq_dev->dev, true);
3263
3264 ret = device_register(&wq_dev->dev);
3265 if (ret) {
3266 kfree(wq_dev);
3267 wq->wq_dev = NULL;
3268 return ret;
3269 }
3270
3271 if (wq->flags & WQ_UNBOUND) {
3272 struct device_attribute *attr;
3273
3274 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3275 ret = device_create_file(&wq_dev->dev, attr);
3276 if (ret) {
3277 device_unregister(&wq_dev->dev);
3278 wq->wq_dev = NULL;
3279 return ret;
3280 }
3281 }
3282 }
3283
3284 dev_set_uevent_suppress(&wq_dev->dev, false);
3285 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3286 return 0;
3287}
3288
3289/**
3290 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3291 * @wq: the workqueue to unregister
3292 *
3293 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3294 */
3295static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3296{
3297 struct wq_device *wq_dev = wq->wq_dev;
3298
3299 if (!wq->wq_dev)
3300 return;
3301
3302 wq->wq_dev = NULL;
3303 device_unregister(&wq_dev->dev);
3304}
3305#else /* CONFIG_SYSFS */
3306static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3307#endif /* CONFIG_SYSFS */
3308
3309/** 3005/**
3310 * free_workqueue_attrs - free a workqueue_attrs 3006 * free_workqueue_attrs - free a workqueue_attrs
3311 * @attrs: workqueue_attrs to free 3007 * @attrs: workqueue_attrs to free
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool)
3424 return 0; 3120 return 0;
3425} 3121}
3426 3122
3123static void rcu_free_wq(struct rcu_head *rcu)
3124{
3125 struct workqueue_struct *wq =
3126 container_of(rcu, struct workqueue_struct, rcu);
3127
3128 if (!(wq->flags & WQ_UNBOUND))
3129 free_percpu(wq->cpu_pwqs);
3130 else
3131 free_workqueue_attrs(wq->unbound_attrs);
3132
3133 kfree(wq->rescuer);
3134 kfree(wq);
3135}
3136
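The new rcu_free_wq() callback lets the final free of a workqueue_struct be deferred behind a sched-RCU grace period (see the call_rcu_sched() calls added to pwq_unbound_release_workfn() and destroy_workqueue() below), so lockless walkers of the workqueues list never dereference a freed wq. A minimal sketch of the same deferred-free pattern, using a hypothetical struct foo in place of workqueue_struct:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* embedded callback head, as in workqueue_struct */
};

static void foo_rcu_free(struct rcu_head *rcu)
{
	/* recover the enclosing object from its rcu_head, then free it */
	struct foo *f = container_of(rcu, struct foo, rcu);

	kfree(f);
}

static void foo_release(struct foo *f)
{
	/*
	 * Readers under rcu_read_lock_sched() may still hold a pointer;
	 * defer the kfree() until after a sched-RCU grace period.
	 */
	call_rcu_sched(&f->rcu, foo_rcu_free);
}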
3427static void rcu_free_pool(struct rcu_head *rcu) 3137static void rcu_free_pool(struct rcu_head *rcu)
3428{ 3138{
3429 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3139 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
3601 3311
3602 /* 3312 /*
3603 * If we're the last pwq going away, @wq is already dead and no one 3313 * If we're the last pwq going away, @wq is already dead and no one
3604 * is gonna access it anymore. Free it. 3314 * is gonna access it anymore. Schedule RCU free.
3605 */ 3315 */
3606 if (is_last) { 3316 if (is_last)
3607 free_workqueue_attrs(wq->unbound_attrs); 3317 call_rcu_sched(&wq->rcu, rcu_free_wq);
3608 kfree(wq);
3609 }
3610} 3318}
3611 3319
3612/** 3320/**
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4143 pwq_adjust_max_active(pwq); 3851 pwq_adjust_max_active(pwq);
4144 mutex_unlock(&wq->mutex); 3852 mutex_unlock(&wq->mutex);
4145 3853
4146 list_add(&wq->list, &workqueues); 3854 list_add_tail_rcu(&wq->list, &workqueues);
4147 3855
4148 mutex_unlock(&wq_pool_mutex); 3856 mutex_unlock(&wq_pool_mutex);
4149 3857
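Switching the global workqueues list to list_add_tail_rcu() here (paired with list_del_rcu() in destroy_workqueue() below) is what lets show_workqueue_state() traverse the list under rcu_read_lock_sched() without taking wq_pool_mutex. A minimal sketch of that publish/traverse pairing on a hypothetical list (demo_* names are illustrative; the writer is assumed to hold its own mutex, just as workqueue code holds wq_pool_mutex):

#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct demo_entry {
	struct list_head node;
	int id;
};

static LIST_HEAD(demo_list);		/* hypothetical RCU-protected list */

/* writer side: publish a new entry; removal would use list_del_rcu() */
static void demo_add(struct demo_entry *e)
{
	list_add_tail_rcu(&e->node, &demo_list);
}

/* reader side: safe against concurrent add/del without the writer's lock */
static void demo_dump(void)
{
	struct demo_entry *e;

	rcu_read_lock_sched();
	list_for_each_entry_rcu(e, &demo_list, node)
		pr_info("entry %d\n", e->id);
	rcu_read_unlock_sched();
}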
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
4199 * flushing is complete in case freeze races us. 3907 * flushing is complete in case freeze races us.
4200 */ 3908 */
4201 mutex_lock(&wq_pool_mutex); 3909 mutex_lock(&wq_pool_mutex);
4202 list_del_init(&wq->list); 3910 list_del_rcu(&wq->list);
4203 mutex_unlock(&wq_pool_mutex); 3911 mutex_unlock(&wq_pool_mutex);
4204 3912
4205 workqueue_sysfs_unregister(wq); 3913 workqueue_sysfs_unregister(wq);
4206 3914
4207 if (wq->rescuer) { 3915 if (wq->rescuer)
4208 kthread_stop(wq->rescuer->task); 3916 kthread_stop(wq->rescuer->task);
4209 kfree(wq->rescuer);
4210 wq->rescuer = NULL;
4211 }
4212 3917
4213 if (!(wq->flags & WQ_UNBOUND)) { 3918 if (!(wq->flags & WQ_UNBOUND)) {
4214 /* 3919 /*
4215 * The base ref is never dropped on per-cpu pwqs. Directly 3920 * The base ref is never dropped on per-cpu pwqs. Directly
4216 * free the pwqs and wq. 3921 * schedule RCU free.
4217 */ 3922 */
4218 free_percpu(wq->cpu_pwqs); 3923 call_rcu_sched(&wq->rcu, rcu_free_wq);
4219 kfree(wq);
4220 } else { 3924 } else {
4221 /* 3925 /*
4222 * We're the sole accessor of @wq at this point. Directly 3926 * We're the sole accessor of @wq at this point. Directly
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
4437 } 4141 }
4438} 4142}
4439 4143
4144static void pr_cont_pool_info(struct worker_pool *pool)
4145{
4146 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
4147 if (pool->node != NUMA_NO_NODE)
4148 pr_cont(" node=%d", pool->node);
4149 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
4150}
4151
4152static void pr_cont_work(bool comma, struct work_struct *work)
4153{
4154 if (work->func == wq_barrier_func) {
4155 struct wq_barrier *barr;
4156
4157 barr = container_of(work, struct wq_barrier, work);
4158
4159 pr_cont("%s BAR(%d)", comma ? "," : "",
4160 task_pid_nr(barr->task));
4161 } else {
4162 pr_cont("%s %pf", comma ? "," : "", work->func);
4163 }
4164}
4165
4166static void show_pwq(struct pool_workqueue *pwq)
4167{
4168 struct worker_pool *pool = pwq->pool;
4169 struct work_struct *work;
4170 struct worker *worker;
4171 bool has_in_flight = false, has_pending = false;
4172 int bkt;
4173
4174 pr_info(" pwq %d:", pool->id);
4175 pr_cont_pool_info(pool);
4176
4177 pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
4178 !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
4179
4180 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
4181 if (worker->current_pwq == pwq) {
4182 has_in_flight = true;
4183 break;
4184 }
4185 }
4186 if (has_in_flight) {
4187 bool comma = false;
4188
4189 pr_info(" in-flight:");
4190 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
4191 if (worker->current_pwq != pwq)
4192 continue;
4193
4194 pr_cont("%s %d%s:%pf", comma ? "," : "",
4195 task_pid_nr(worker->task),
4196 worker == pwq->wq->rescuer ? "(RESCUER)" : "",
4197 worker->current_func);
4198 list_for_each_entry(work, &worker->scheduled, entry)
4199 pr_cont_work(false, work);
4200 comma = true;
4201 }
4202 pr_cont("\n");
4203 }
4204
4205 list_for_each_entry(work, &pool->worklist, entry) {
4206 if (get_work_pwq(work) == pwq) {
4207 has_pending = true;
4208 break;
4209 }
4210 }
4211 if (has_pending) {
4212 bool comma = false;
4213
4214 pr_info(" pending:");
4215 list_for_each_entry(work, &pool->worklist, entry) {
4216 if (get_work_pwq(work) != pwq)
4217 continue;
4218
4219 pr_cont_work(comma, work);
4220 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
4221 }
4222 pr_cont("\n");
4223 }
4224
4225 if (!list_empty(&pwq->delayed_works)) {
4226 bool comma = false;
4227
4228 pr_info(" delayed:");
4229 list_for_each_entry(work, &pwq->delayed_works, entry) {
4230 pr_cont_work(comma, work);
4231 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
4232 }
4233 pr_cont("\n");
4234 }
4235}
4236
4237/**
4238 * show_workqueue_state - dump workqueue state
4239 *
4240 * Called from a sysrq handler and prints out all busy workqueues and
4241 * pools.
4242 */
4243void show_workqueue_state(void)
4244{
4245 struct workqueue_struct *wq;
4246 struct worker_pool *pool;
4247 unsigned long flags;
4248 int pi;
4249
4250 rcu_read_lock_sched();
4251
4252 pr_info("Showing busy workqueues and worker pools:\n");
4253
4254 list_for_each_entry_rcu(wq, &workqueues, list) {
4255 struct pool_workqueue *pwq;
4256 bool idle = true;
4257
4258 for_each_pwq(pwq, wq) {
4259 if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
4260 idle = false;
4261 break;
4262 }
4263 }
4264 if (idle)
4265 continue;
4266
4267 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
4268
4269 for_each_pwq(pwq, wq) {
4270 spin_lock_irqsave(&pwq->pool->lock, flags);
4271 if (pwq->nr_active || !list_empty(&pwq->delayed_works))
4272 show_pwq(pwq);
4273 spin_unlock_irqrestore(&pwq->pool->lock, flags);
4274 }
4275 }
4276
4277 for_each_pool(pool, pi) {
4278 struct worker *worker;
4279 bool first = true;
4280
4281 spin_lock_irqsave(&pool->lock, flags);
4282 if (pool->nr_workers == pool->nr_idle)
4283 goto next_pool;
4284
4285 pr_info("pool %d:", pool->id);
4286 pr_cont_pool_info(pool);
4287 pr_cont(" workers=%d", pool->nr_workers);
4288 if (pool->manager)
4289 pr_cont(" manager: %d",
4290 task_pid_nr(pool->manager->task));
4291 list_for_each_entry(worker, &pool->idle_list, entry) {
4292 pr_cont(" %s%d", first ? "idle: " : "",
4293 task_pid_nr(worker->task));
4294 first = false;
4295 }
4296 pr_cont("\n");
4297 next_pool:
4298 spin_unlock_irqrestore(&pool->lock, flags);
4299 }
4300
4301 rcu_read_unlock_sched();
4302}
4303
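show_workqueue_state() is meant to be wired into debugging paths outside this file (the kerneldoc above mentions a sysrq handler). Assuming the declaration this series adds to <linux/workqueue.h>, a hypothetical caller needs nothing more than:

#include <linux/printk.h>
#include <linux/workqueue.h>

/* hypothetical driver debug hook: dump all busy workqueues when work appears stuck */
static void my_driver_report_stall(void)
{
	pr_warn("my_driver: queued work did not complete, dumping workqueue state\n");
	show_workqueue_state();
}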
4440/* 4304/*
4441 * CPU hotplug. 4305 * CPU hotplug.
4442 * 4306 *
@@ -4834,6 +4698,323 @@ out_unlock:
4834} 4698}
4835#endif /* CONFIG_FREEZER */ 4699#endif /* CONFIG_FREEZER */
4836 4700
4701#ifdef CONFIG_SYSFS
4702/*
4703 * Workqueues with the WQ_SYSFS flag set are visible to userland via
4704 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
4705 * following attributes.
4706 *
4707 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
4708 * max_active RW int : maximum number of in-flight work items
4709 *
4710 * Unbound workqueues have the following extra attributes.
4711 *
4712 * id RO int : the associated pool ID
4713 * nice RW int : nice value of the workers
4714 * cpumask RW mask : bitmask of allowed CPUs for the workers
4715 */
4716struct wq_device {
4717 struct workqueue_struct *wq;
4718 struct device dev;
4719};
4720
4721static struct workqueue_struct *dev_to_wq(struct device *dev)
4722{
4723 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
4724
4725 return wq_dev->wq;
4726}
4727
4728static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
4729 char *buf)
4730{
4731 struct workqueue_struct *wq = dev_to_wq(dev);
4732
4733 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
4734}
4735static DEVICE_ATTR_RO(per_cpu);
4736
4737static ssize_t max_active_show(struct device *dev,
4738 struct device_attribute *attr, char *buf)
4739{
4740 struct workqueue_struct *wq = dev_to_wq(dev);
4741
4742 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
4743}
4744
4745static ssize_t max_active_store(struct device *dev,
4746 struct device_attribute *attr, const char *buf,
4747 size_t count)
4748{
4749 struct workqueue_struct *wq = dev_to_wq(dev);
4750 int val;
4751
4752 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
4753 return -EINVAL;
4754
4755 workqueue_set_max_active(wq, val);
4756 return count;
4757}
4758static DEVICE_ATTR_RW(max_active);
4759
4760static struct attribute *wq_sysfs_attrs[] = {
4761 &dev_attr_per_cpu.attr,
4762 &dev_attr_max_active.attr,
4763 NULL,
4764};
4765ATTRIBUTE_GROUPS(wq_sysfs);
4766
4767static ssize_t wq_pool_ids_show(struct device *dev,
4768 struct device_attribute *attr, char *buf)
4769{
4770 struct workqueue_struct *wq = dev_to_wq(dev);
4771 const char *delim = "";
4772 int node, written = 0;
4773
4774 rcu_read_lock_sched();
4775 for_each_node(node) {
4776 written += scnprintf(buf + written, PAGE_SIZE - written,
4777 "%s%d:%d", delim, node,
4778 unbound_pwq_by_node(wq, node)->pool->id);
4779 delim = " ";
4780 }
4781 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
4782 rcu_read_unlock_sched();
4783
4784 return written;
4785}
4786
4787static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
4788 char *buf)
4789{
4790 struct workqueue_struct *wq = dev_to_wq(dev);
4791 int written;
4792
4793 mutex_lock(&wq->mutex);
4794 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
4795 mutex_unlock(&wq->mutex);
4796
4797 return written;
4798}
4799
4800/* prepare workqueue_attrs for sysfs store operations */
4801static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
4802{
4803 struct workqueue_attrs *attrs;
4804
4805 attrs = alloc_workqueue_attrs(GFP_KERNEL);
4806 if (!attrs)
4807 return NULL;
4808
4809 mutex_lock(&wq->mutex);
4810 copy_workqueue_attrs(attrs, wq->unbound_attrs);
4811 mutex_unlock(&wq->mutex);
4812 return attrs;
4813}
4814
4815static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
4816 const char *buf, size_t count)
4817{
4818 struct workqueue_struct *wq = dev_to_wq(dev);
4819 struct workqueue_attrs *attrs;
4820 int ret;
4821
4822 attrs = wq_sysfs_prep_attrs(wq);
4823 if (!attrs)
4824 return -ENOMEM;
4825
4826 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
4827 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
4828 ret = apply_workqueue_attrs(wq, attrs);
4829 else
4830 ret = -EINVAL;
4831
4832 free_workqueue_attrs(attrs);
4833 return ret ?: count;
4834}
4835
4836static ssize_t wq_cpumask_show(struct device *dev,
4837 struct device_attribute *attr, char *buf)
4838{
4839 struct workqueue_struct *wq = dev_to_wq(dev);
4840 int written;
4841
4842 mutex_lock(&wq->mutex);
4843 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
4844 cpumask_pr_args(wq->unbound_attrs->cpumask));
4845 mutex_unlock(&wq->mutex);
4846 return written;
4847}
4848
4849static ssize_t wq_cpumask_store(struct device *dev,
4850 struct device_attribute *attr,
4851 const char *buf, size_t count)
4852{
4853 struct workqueue_struct *wq = dev_to_wq(dev);
4854 struct workqueue_attrs *attrs;
4855 int ret;
4856
4857 attrs = wq_sysfs_prep_attrs(wq);
4858 if (!attrs)
4859 return -ENOMEM;
4860
4861 ret = cpumask_parse(buf, attrs->cpumask);
4862 if (!ret)
4863 ret = apply_workqueue_attrs(wq, attrs);
4864
4865 free_workqueue_attrs(attrs);
4866 return ret ?: count;
4867}
4868
4869static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
4870 char *buf)
4871{
4872 struct workqueue_struct *wq = dev_to_wq(dev);
4873 int written;
4874
4875 mutex_lock(&wq->mutex);
4876 written = scnprintf(buf, PAGE_SIZE, "%d\n",
4877 !wq->unbound_attrs->no_numa);
4878 mutex_unlock(&wq->mutex);
4879
4880 return written;
4881}
4882
4883static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
4884 const char *buf, size_t count)
4885{
4886 struct workqueue_struct *wq = dev_to_wq(dev);
4887 struct workqueue_attrs *attrs;
4888 int v, ret;
4889
4890 attrs = wq_sysfs_prep_attrs(wq);
4891 if (!attrs)
4892 return -ENOMEM;
4893
4894 ret = -EINVAL;
4895 if (sscanf(buf, "%d", &v) == 1) {
4896 attrs->no_numa = !v;
4897 ret = apply_workqueue_attrs(wq, attrs);
4898 }
4899
4900 free_workqueue_attrs(attrs);
4901 return ret ?: count;
4902}
4903
4904static struct device_attribute wq_sysfs_unbound_attrs[] = {
4905 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
4906 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
4907 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
4908 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
4909 __ATTR_NULL,
4910};
4911
4912static struct bus_type wq_subsys = {
4913 .name = "workqueue",
4914 .dev_groups = wq_sysfs_groups,
4915};
4916
4917static int __init wq_sysfs_init(void)
4918{
4919 return subsys_virtual_register(&wq_subsys, NULL);
4920}
4921core_initcall(wq_sysfs_init);
4922
4923static void wq_device_release(struct device *dev)
4924{
4925 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
4926
4927 kfree(wq_dev);
4928}
4929
4930/**
4931 * workqueue_sysfs_register - make a workqueue visible in sysfs
4932 * @wq: the workqueue to register
4933 *
4934 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
4935 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
4936 * which is the preferred method.
4937 *
4938 * A workqueue user should use this function directly iff it wants to apply
4939 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
4940 * apply_workqueue_attrs() may race against userland updating the
4941 * attributes.
4942 *
4943 * Return: 0 on success, -errno on failure.
4944 */
4945int workqueue_sysfs_register(struct workqueue_struct *wq)
4946{
4947 struct wq_device *wq_dev;
4948 int ret;
4949
4950 /*
4951	 * Adjusting max_active or creating new pwqs by applying
4952 * attributes breaks ordering guarantee. Disallow exposing ordered
4953 * workqueues.
4954 */
4955 if (WARN_ON(wq->flags & __WQ_ORDERED))
4956 return -EINVAL;
4957
4958 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
4959 if (!wq_dev)
4960 return -ENOMEM;
4961
4962 wq_dev->wq = wq;
4963 wq_dev->dev.bus = &wq_subsys;
4964 wq_dev->dev.init_name = wq->name;
4965 wq_dev->dev.release = wq_device_release;
4966
4967 /*
4968 * unbound_attrs are created separately. Suppress uevent until
4969 * everything is ready.
4970 */
4971 dev_set_uevent_suppress(&wq_dev->dev, true);
4972
4973 ret = device_register(&wq_dev->dev);
4974 if (ret) {
4975 kfree(wq_dev);
4976 wq->wq_dev = NULL;
4977 return ret;
4978 }
4979
4980 if (wq->flags & WQ_UNBOUND) {
4981 struct device_attribute *attr;
4982
4983 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
4984 ret = device_create_file(&wq_dev->dev, attr);
4985 if (ret) {
4986 device_unregister(&wq_dev->dev);
4987 wq->wq_dev = NULL;
4988 return ret;
4989 }
4990 }
4991 }
4992
4993 dev_set_uevent_suppress(&wq_dev->dev, false);
4994 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
4995 return 0;
4996}
4997
4998/**
4999 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
5000 * @wq: the workqueue to unregister
5001 *
5002 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
5003 */
5004static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
5005{
5006 struct wq_device *wq_dev = wq->wq_dev;
5007
5008 if (!wq->wq_dev)
5009 return;
5010
5011 wq->wq_dev = NULL;
5012 device_unregister(&wq_dev->dev);
5013}
5014#else /* CONFIG_SYSFS */
5015static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
5016#endif /* CONFIG_SYSFS */
5017
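As the workqueue_sysfs_register() kerneldoc above says, most users simply pass WQ_SYSFS to alloc_workqueue() and let registration happen automatically; calling it directly only makes sense when attributes must be applied before the workqueue shows up under /sys/bus/workqueue/devices. A hedged sketch of both paths (my_* names are hypothetical, error unwinding trimmed):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

/* common case: WQ_SYSFS makes alloc_workqueue() register the sysfs device itself */
static int my_init_simple(void)
{
	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_SYSFS, 0);
	return my_wq ? 0 : -ENOMEM;
}

/* rare case: tune attributes first, then expose the workqueue manually */
static int my_init_tuned(void)
{
	struct workqueue_attrs *attrs;
	int ret;

	my_wq = alloc_workqueue("my_wq_tuned", WQ_UNBOUND, 0);
	if (!my_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;
	attrs->nice = -5;		/* adjust before userland can see the workqueue */
	ret = apply_workqueue_attrs(my_wq, attrs);
	free_workqueue_attrs(attrs);
	if (ret)
		return ret;

	return workqueue_sysfs_register(my_wq);
}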
4837static void __init wq_numa_init(void) 5018static void __init wq_numa_init(void)
4838{ 5019{
4839 cpumask_var_t *tbl; 5020 cpumask_var_t *tbl;