Diffstat (limited to 'kernel')
63 files changed, 3351 insertions, 1727 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..a220fdb66568 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
 
 static void pidlist_free(void *p)
 {
-	if (is_vmalloc_addr(p))
-		vfree(p);
-	else
-		kfree(p);
+	kvfree(p);
 }
 
 /*
@@ -5040,6 +5037,9 @@ int __init cgroup_init(void)
 			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
 			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
 		}
+
+		if (ss->bind)
+			ss->bind(init_css_set.subsys[ssid]);
 	}
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
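A note on the first cgroup.c hunk above: kvfree() is safe there because the buffer may come from either kmalloc() or vmalloc(). A minimal sketch of that allocation/free pairing, for illustration only (the real pidlist_allocate() in cgroup.c uses a different size threshold; the "example_" names and PAGE_SIZE cutoff are assumptions):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

/* Illustrative allocator: small lists from the slab, large ones from vmalloc.
 * kvfree() releases either kind, which is what lets pidlist_free() shrink to
 * a single call. */
static void *example_pidlist_allocate(int count)
{
	if (count * sizeof(pid_t) > PAGE_SIZE)
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}

static void example_pidlist_free(void *p)
{
	kvfree(p);	/* works for both kmalloc()ed and vmalloc()ed memory */
}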
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 57858cebd6b5..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	/* Give up timekeeping duties */
+	tick_handover_do_timer();
 	/* Park the stopper thread */
 	kthread_park(current);
 	return 0;
@@ -413,10 +416,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
 	per_cpu(cpu_dead_idle, cpu) = false;
 
+	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone. Too late to complain. */
+	tick_cleanup_dead_cpu(cpu);
 	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
 	check_for_tasks(cpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..c68f0721df10 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
+	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+		goto done;
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms[0], top_cpuset.effective_cpus);
+		cpumask_and(doms[0], top_cpuset.effective_cpus,
+				     non_isolated_cpus);
 
 		goto done;
 	}
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 * the corresponding sched domain.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
-		    !is_sched_load_balance(cp))
+		    !(is_sched_load_balance(cp) &&
+		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
+				cpumask_and(dp, dp, non_isolated_cpus);
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -760,6 +768,7 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
+	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 453ef61311d4..2fabc0627165 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
 {
 	struct perf_event *event = container_of(entry,
 			struct perf_event, pending);
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
 
 	if (event->pending_disable) {
 		event->pending_disable = 0;
@@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
 }
 
 /*
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e3830e953..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 	if (!p)
 		return -ESRCH;
 
-	if (!p->mm) {
+	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
 		return -EPERM;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 
 	return -ENOSYS;
 }
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @on:		Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+	data = data->parent_data;
+	if (data->chip->irq_set_wake)
+		return data->chip->irq_set_wake(data, on);
+
+	return -ENOSYS;
+}
 #endif
 
 /**
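A hedged usage sketch for the new irq_chip_set_wake_parent() helper (illustrative, not taken from this patch): a child irqchip in an IRQ domain hierarchy that has no wake-up logic of its own can simply forward .irq_set_wake to its parent, the same way the existing *_parent helpers forward mask/unmask/eoi. The "example" names below are assumptions.

#include <linux/irq.h>

/* Illustrative child chip in a hierarchical IRQ domain; only a few callbacks
 * are shown. The *_parent helpers, including the new
 * irq_chip_set_wake_parent(), delegate the operation to the parent chip. */
static struct irq_chip example_child_irq_chip = {
	.name		= "example-child",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_eoi	= irq_chip_eoi_parent,
	.irq_set_wake	= irq_chip_set_wake_parent,
};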
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e691d5..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
  *	Do not use this for shutdown scenarios where you must be sure
  *	that all parts (hardirq and threaded handler) have completed.
  *
+ *	Returns: false if a threaded handler is active.
+ *
  *	This function may be called - with care - from IRQ context.
  */
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (desc)
+	if (desc) {
 		__synchronize_hardirq(desc);
+		return !atomic_read(&desc->threads_active);
+	}
+
+	return true;
 }
 EXPORT_SYMBOL(synchronize_hardirq);
 
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(disable_irq);
 
+/**
+ *	disable_hardirq - disables an irq and waits for hardirq completion
+ *	@irq: Interrupt to disable
+ *
+ *	Disable the selected interrupt line. Enables and Disables are
+ *	nested.
+ *	This function waits for any pending hard IRQ handlers for this
+ *	interrupt to complete before returning. If you use this function while
+ *	holding a resource the hard IRQ handler may need you will deadlock.
+ *
+ *	When used to optimistically disable an interrupt from atomic context
+ *	the return value must be checked.
+ *
+ *	Returns: false if a threaded handler is active.
+ *
+ *	This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+	if (!__disable_irq_nosync(irq))
+		return synchronize_hardirq(irq);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
 void __enable_irq(struct irq_desc *desc, unsigned int irq)
 {
 	switch (desc->depth) {
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 
 	return retval;
 }
+
+/**
+ *	irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ *	@irq: Interrupt line that is forwarded to a VM
+ *	@which: One of IRQCHIP_STATE_* the caller wants to know about
+ *	@state: a pointer to a boolean where the state is to be storeed
+ *
+ *	This call snapshots the internal irqchip state of an
+ *	interrupt, returning into @state the bit corresponding to
+ *	stage @which
+ *
+ *	This function should be called with preemption disabled if the
+ *	interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+			  bool *state)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int err = -EINVAL;
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return err;
+
+	data = irq_desc_get_irq_data(desc);
+
+	do {
+		chip = irq_data_get_irq_chip(data);
+		if (chip->irq_get_irqchip_state)
+			break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		data = data->parent_data;
+#else
+		data = NULL;
+#endif
+	} while (data);
+
+	if (data)
+		err = chip->irq_get_irqchip_state(data, which, state);
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
+
+/**
+ *	irq_set_irqchip_state - set the state of a forwarded interrupt.
+ *	@irq: Interrupt line that is forwarded to a VM
+ *	@which: State to be restored (one of IRQCHIP_STATE_*)
+ *	@val: Value corresponding to @which
+ *
+ *	This call sets the internal irqchip state of an interrupt,
+ *	depending on the value of @which.
+ *
+ *	This function should be called with preemption disabled if the
+ *	interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+			  bool val)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int err = -EINVAL;
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return err;
+
+	data = irq_desc_get_irq_data(desc);
+
+	do {
+		chip = irq_data_get_irq_chip(data);
+		if (chip->irq_set_irqchip_state)
+			break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		data = data->parent_data;
+#else
+		data = NULL;
+#endif
+	} while (data);
+
+	if (data)
+		err = chip->irq_set_irqchip_state(data, which, val);
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
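For illustration only (not part of the patch): the new state accessors are aimed at hypervisor/VFIO-style users that forward an interrupt line to a guest and need to save and later restore its pending state. The function name and flow below are assumptions; IRQCHIP_STATE_PENDING comes from the enum irqchip_irq_state introduced alongside this series.

#include <linux/interrupt.h>
#include <linux/irq.h>

/* Illustrative sketch: snapshot and re-inject the PENDING state of a
 * forwarded interrupt. Error handling is minimal. */
static int example_save_and_restore_pending(unsigned int irq)
{
	bool pending;
	int err;

	/* Snapshot the PENDING bit from the underlying irqchip. */
	err = irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending);
	if (err)
		return err;

	/* ... hand the line to the guest, later take it back ... */

	/* Write the saved state back into the irqchip. */
	return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, pending);
}

The other addition in this file, disable_hardirq(), returns false while a threaded handler is still running, so a caller that optimistically disables an interrupt from atomic context can check the result instead of blocking on the threaded part.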
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..474de5cb394d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
 	struct msi_desc *desc;
 
 	for_each_msi_entry(desc, dev) {
-		irq_domain_free_irqs(desc->irq, desc->nvec_used);
-		desc->irq = 0;
+		/*
+		 * We might have failed to allocate an MSI early
+		 * enough that there is no IRQ associated to this
+		 * entry. If that's the case, don't do anything.
+		 */
+		if (desc->irq) {
+			irq_domain_free_irqs(desc->irq, desc->nvec_used);
+			desc->irq = 0;
+		}
 	}
 }
 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6b4c2e..284e2691e380 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -335,32 +335,20 @@ unlock:
 	rcu_read_unlock();
 }
 
-static int klp_disable_func(struct klp_func *func)
+static void klp_disable_func(struct klp_func *func)
 {
 	struct klp_ops *ops;
-	int ret;
-
-	if (WARN_ON(func->state != KLP_ENABLED))
-		return -EINVAL;
 
-	if (WARN_ON(!func->old_addr))
-		return -EINVAL;
+	WARN_ON(func->state != KLP_ENABLED);
+	WARN_ON(!func->old_addr);
 
 	ops = klp_find_ops(func->old_addr);
 	if (WARN_ON(!ops))
-		return -EINVAL;
+		return;
 
 	if (list_is_singular(&ops->func_stack)) {
-		ret = unregister_ftrace_function(&ops->fops);
-		if (ret) {
-			pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
-			       func->old_name, ret);
-			return ret;
-		}
-
-		ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
-		if (ret)
-			pr_warn("function unregister succeeded but failed to clear the filter\n");
+		WARN_ON(unregister_ftrace_function(&ops->fops));
+		WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
 
 		list_del_rcu(&func->stack_node);
 		list_del(&ops->node);
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func)
 	}
 
 	func->state = KLP_DISABLED;
-
-	return 0;
 }
 
 static int klp_enable_func(struct klp_func *func)
@@ -432,23 +418,15 @@ err:
 	return ret;
 }
 
-static int klp_disable_object(struct klp_object *obj)
+static void klp_disable_object(struct klp_object *obj)
 {
 	struct klp_func *func;
-	int ret;
 
-	for (func = obj->funcs; func->old_name; func++) {
-		if (func->state != KLP_ENABLED)
-			continue;
-
-		ret = klp_disable_func(func);
-		if (ret)
-			return ret;
-	}
+	for (func = obj->funcs; func->old_name; func++)
+		if (func->state == KLP_ENABLED)
+			klp_disable_func(func);
 
 	obj->state = KLP_DISABLED;
-
-	return 0;
 }
 
 static int klp_enable_object(struct klp_object *obj)
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj)
 
 	for (func = obj->funcs; func->old_name; func++) {
 		ret = klp_enable_func(func);
-		if (ret)
-			goto unregister;
+		if (ret) {
+			klp_disable_object(obj);
+			return ret;
+		}
 	}
 	obj->state = KLP_ENABLED;
 
 	return 0;
-
-unregister:
-	WARN_ON(klp_disable_object(obj));
-	return ret;
 }
 
 static int __klp_disable_patch(struct klp_patch *patch)
 {
 	struct klp_object *obj;
-	int ret;
 
 	/* enforce stacking: only the last enabled patch can be disabled */
 	if (!list_is_last(&patch->list, &klp_patches) &&
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch)
 	pr_notice("disabling patch '%s'\n", patch->mod->name);
 
 	for (obj = patch->objs; obj->funcs; obj++) {
-		if (obj->state != KLP_ENABLED)
-			continue;
-
-		ret = klp_disable_object(obj);
-		if (ret)
-			return ret;
+		if (obj->state == KLP_ENABLED)
+			klp_disable_object(obj);
 	}
 
 	patch->state = KLP_DISABLED;
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
 	pr_notice("enabling patch '%s'\n", patch->mod->name);
 
 	for (obj = patch->objs; obj->funcs; obj++) {
-		klp_find_object_module(obj);
-
 		if (!klp_is_object_loaded(obj))
 			continue;
 
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
 {
 	struct module *pmod = patch->mod;
 	struct module *mod = obj->mod;
-	int ret;
 
 	if (patch->state == KLP_DISABLED)
 		goto disabled;
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
 	pr_notice("reverting patch '%s' on unloading module '%s'\n",
 		  pmod->name, mod->name);
 
-	ret = klp_disable_object(obj);
-	if (ret)
-		pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
-			pmod->name, mod->name, ret);
+	klp_disable_object(obj);
 
 disabled:
 	klp_free_object_loaded(obj);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
 	if (!new_class->name)
 		return 0;
 
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
+	list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
 		if (new_class->key - new_class->subclass == class->key)
 			return class->name_version;
 		if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	hash_head = classhashentry(key);
 
 	/*
-	 * We can walk the hash lockfree, because the hash only
-	 * grows, and we are careful when adding entries to the end:
+	 * We do an RCU walk of the hash, see lockdep_free_key_range().
 	 */
-	list_for_each_entry(class, hash_head, hash_entry) {
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return NULL;
+
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
-	unsigned long flags;
+
+	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
-	raw_local_irq_save(flags);
 	if (!graph_lock()) {
-		raw_local_irq_restore(flags);
 		return NULL;
 	}
 	/*
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry(class, hash_head, hash_entry)
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
+	}
+
 	/*
 	 * Allocate a new key from the static array, and add it to
 	 * the hash:
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		if (!debug_locks_off_graph_unlock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
-		raw_local_irq_restore(flags);
 
 		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
 		dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		graph_unlock();
-		raw_local_irq_restore(flags);
 
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		printk("\n");
 		dump_stack();
 
-		raw_local_irq_save(flags);
 		if (!graph_lock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
 	}
 out_unlock_set:
 	graph_unlock();
-	raw_local_irq_restore(flags);
 
 out_set_class_cache:
 	if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	entry->distance = distance;
 	entry->trace = *trace;
 	/*
-	 * Since we never remove from the dependency list, the list can
-	 * be walked lockless by other CPUs, it's only allocation
-	 * that must be protected by the spinlock. But this also means
-	 * we must make new entries visible only once writes to the
-	 * entry become visible - hence the RCU op:
+	 * Both allocation and removal are done under the graph lock; but
+	 * iteration is under RCU-sched; see look_up_lock_class() and
+	 * lockdep_free_key_range().
 	 */
 	list_add_tail_rcu(&entry->entry, head);
 
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
 		else
 			head = &lock->class->locks_before;
 
-		list_for_each_entry(entry, head, entry) {
+		DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+		list_for_each_entry_rcu(entry, head, entry) {
 			if (!lock_accessed(entry)) {
 				unsigned int cq_depth;
 				mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	list_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (unlikely(!debug_locks))
 		return;
 
-	if (subclass)
+	if (subclass) {
+		unsigned long flags;
+
+		if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+			return;
+
+		raw_local_irq_save(flags);
+		current->lockdep_recursion = 1;
 		register_lock_class(lock, subclass, 1);
+		current->lockdep_recursion = 0;
+		raw_local_irq_restore(flags);
+	}
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
 	return addr >= start && addr < start + size;
 }
 
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
 void lockdep_free_key_range(void *start, unsigned long size)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	if (locked)
 		graph_unlock();
 	raw_local_irq_restore(flags);
+
+	/*
+	 * Wait for any possible iterators from look_up_lock_class() to pass
+	 * before continuing to free the memory they refer to.
+	 *
+	 * sync_sched() is sufficient because the read-side is IRQ disable.
+	 */
+	synchronize_sched();
+
+	/*
+	 * XXX at this point we could return the resources to the pool;
+	 * instead we leak them. We would need to change to bitmap allocators
+	 * instead of the linear allocators we have now.
+	 */
 }
 
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		 */
 		return;
 	}
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/* Wait until the lock holder passes the lock down. */
 	arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 static inline
 void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 {
-	struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+	struct mcs_spinlock *next = READ_ONCE(node->next);
 
 	if (likely(!next)) {
 		/*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		if (likely(cmpxchg(lock, node, NULL) == node))
 			return;
 		/* Wait until the next pointer is set */
-		while (!(next = ACCESS_ONCE(node->next)))
+		while (!(next = READ_ONCE(node->next)))
 			cpu_relax_lowlatency();
 	}
 
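The ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() conversions in this and the following locking files are mechanical; the newer helpers make the direction of the access explicit and, unlike ACCESS_ONCE(), also handle non-scalar types. A hedged before/after sketch, assuming the struct mcs_spinlock from this header (the "example_" wrappers are made up):

#include <linux/compiler.h>

/* Old style, both directions through one macro:
 *	ACCESS_ONCE(prev->next) = node;
 *	next = ACCESS_ONCE(node->next);
 */
static void example_publish(struct mcs_spinlock *prev, struct mcs_spinlock *node)
{
	WRITE_ONCE(prev->next, node);	/* store, direction explicit */
}

static struct mcs_spinlock *example_read_next(struct mcs_spinlock *node)
{
	return READ_ONCE(node->next);	/* load, direction explicit */
}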
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
 
 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
 }
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-	if (lock->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * lock->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
-
-	return owner->on_cpu;
-}
-
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
  */
 static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
+	bool ret = true;
+
 	rcu_read_lock();
-	while (owner_running(lock, owner)) {
-		if (need_resched())
+	while (lock->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner->on_cpu || need_resched()) {
+			ret = false;
 			break;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
-	/*
-	 * We break out the loop above on need_resched() and when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when lock->owner is NULL.
-	 */
-	return lock->owner == NULL;
+	return ret;
 }
 
 /*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 		return 0;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(lock->owner);
+	owner = READ_ONCE(lock->owner);
 	if (owner)
 		retval = owner->on_cpu;
 	rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 			 * As such, when deadlock detection needs to be
 			 * performed the optimistic spinning cannot be done.
 			 */
-			if (ACCESS_ONCE(ww->ctx))
+			if (READ_ONCE(ww->ctx))
 				break;
 		}
 
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
-		owner = ACCESS_ONCE(lock->owner);
+		owner = READ_ONCE(lock->owner);
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
@@ -490,7 +481,7 @@ static inline int __sched
 __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+	struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
 
 	if (!hold_ctx)
 		return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
 	prev = decode_cpu(old);
 	node->prev = prev;
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/*
 	 * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * cmpxchg in an attempt to undo our queueing.
 	 */
 
-	while (!ACCESS_ONCE(node->locked)) {
+	while (!READ_ONCE(node->locked)) {
 		/*
 		 * If we need to reschedule bail... so we can block.
 		 */
@@ -148,7 +148,7 @@ unqueue:
 		 * Or we race against a concurrent unqueue()'s step-B, in which
 		 * case its step-C will write us a new @node->prev pointer.
 		 */
-		prev = ACCESS_ONCE(node->prev);
+		prev = READ_ONCE(node->prev);
 	}
 
 	/*
@@ -170,8 +170,8 @@ unqueue:
 	 * it will wait in Step-A.
 	 */
 
-	ACCESS_ONCE(next->prev) = prev;
-	ACCESS_ONCE(prev->next) = next;
+	WRITE_ONCE(next->prev, prev);
+	WRITE_ONCE(prev->next, next);
 
 	return false;
 }
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	node = this_cpu_ptr(&osq_node);
 	next = xchg(&node->next, NULL);
 	if (next) {
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 		return;
 	}
 
 	next = osq_wait_next(lock, node, NULL);
 	if (next)
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  *
  * @task:	the task owning the mutex (owner) for which a chain walk is
  *		probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk:	do we have to carry out deadlock detection?
  * @orig_lock:	the mutex (can be NULL if we are walking the chain to recheck
  *		things for a task that has just got its priority adjusted, and
  *		is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 
 		list_del(&waiter->list);
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
 
-#include "mcs_spinlock.h"
+#include "rwsem.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 			    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
 			if (!list_is_singular(&sem->wait_list))
 				rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+			rwsem_set_owner(sem);
 			return true;
 		}
 
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
  */
 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 {
-	long old, count = ACCESS_ONCE(sem->count);
+	long old, count = READ_ONCE(sem->count);
 
 	while (true) {
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;
 
 		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
-		if (old == count)
+		if (old == count) {
+			rwsem_set_owner(sem);
 			return true;
+		}
 
 		count = old;
 	}
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
-	bool on_cpu = false;
+	bool ret = true;
 
 	if (need_resched())
 		return false;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(sem->owner);
-	if (owner)
-		on_cpu = owner->on_cpu;
-	rcu_read_unlock();
-
-	/*
-	 * If sem->owner is not set, yet we have just recently entered the
-	 * slowpath, then there is a possibility reader(s) may have the lock.
-	 * To be safe, avoid spinning in these situations.
-	 */
-	return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
-				 struct task_struct *owner)
-{
-	if (sem->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * sem->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
+	owner = READ_ONCE(sem->owner);
+	if (!owner) {
+		long count = READ_ONCE(sem->count);
+		/*
+		 * If sem->owner is not set, yet we have just recently entered the
+		 * slowpath with the lock being active, then there is a possibility
+		 * reader(s) may have the lock. To be safe, bail spinning in these
+		 * situations.
+		 */
+		if (count & RWSEM_ACTIVE_MASK)
+			ret = false;
+		goto done;
+	}
 
-	return owner->on_cpu;
+	ret = owner->on_cpu;
+done:
+	rcu_read_unlock();
+	return ret;
 }
 
 static noinline
 bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 {
+	long count;
+
 	rcu_read_lock();
-	while (owner_running(sem, owner)) {
-		if (need_resched())
-			break;
+	while (sem->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking sem->owner still matches owner, if that fails,
+		 * owner might point to free()d memory, if it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		/* abort spinning when need_resched or owner is not running */
+		if (!owner->on_cpu || need_resched()) {
+			rcu_read_unlock();
+			return false;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
+	if (READ_ONCE(sem->owner))
+		return true; /* new owner, continue spinning */
+
 	/*
-	 * We break out the loop above on need_resched() or when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when sem->owner is NULL.
+	 * When the owner is not set, the lock could be free or
+	 * held by readers. Check the counter to verify the
+	 * state.
 	 */
-	return sem->owner == NULL;
+	count = READ_ONCE(sem->count);
+	return (count == 0 || count == RWSEM_WAITING_BIAS);
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 		goto done;
 
 	while (true) {
-		owner = ACCESS_ONCE(sem->owner);
+		owner = READ_ONCE(sem->owner);
 		if (owner && !rwsem_spin_on_owner(sem, owner))
 			break;
 
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	if (waiting) {
-		count = ACCESS_ONCE(sem->count);
+		count = READ_ONCE(sem->count);
 
 		/*
 		 * If there were already threads queued before us and there are
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rwsem.h>
-
 #include <linux/atomic.h>
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-	sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-	sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
 
 /*
  * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+	sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+	sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..650b038ae520 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) | |||
1865 | kfree(mod->args); | 1865 | kfree(mod->args); |
1866 | percpu_modfree(mod); | 1866 | percpu_modfree(mod); |
1867 | 1867 | ||
1868 | /* Free lock-classes: */ | 1868 | /* Free lock-classes; relies on the preceding sync_rcu(). */ |
1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1870 | 1870 | ||
1871 | /* Finally, free the core (containing the module structure) */ | 1871 | /* Finally, free the core (containing the module structure) */ |
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) | |||
2479 | return 0; | 2479 | return 0; |
2480 | } | 2480 | } |
2481 | 2481 | ||
2482 | #define COPY_CHUNK_SIZE (16*PAGE_SIZE) | ||
2483 | |||
2484 | static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) | ||
2485 | { | ||
2486 | do { | ||
2487 | unsigned long n = min(len, COPY_CHUNK_SIZE); | ||
2488 | |||
2489 | if (copy_from_user(dst, usrc, n) != 0) | ||
2490 | return -EFAULT; | ||
2491 | cond_resched(); | ||
2492 | dst += n; | ||
2493 | usrc += n; | ||
2494 | len -= n; | ||
2495 | } while (len); | ||
2496 | return 0; | ||
2497 | } | ||
2498 | |||
2482 | /* Sets info->hdr and info->len. */ | 2499 | /* Sets info->hdr and info->len. */ |
2483 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 2500 | static int copy_module_from_user(const void __user *umod, unsigned long len, |
2484 | struct load_info *info) | 2501 | struct load_info *info) |
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
2498 | if (!info->hdr) | 2515 | if (!info->hdr) |
2499 | return -ENOMEM; | 2516 | return -ENOMEM; |
2500 | 2517 | ||
2501 | if (copy_from_user(info->hdr, umod, info->len) != 0) { | 2518 | if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { |
2502 | vfree(info->hdr); | 2519 | vfree(info->hdr); |
2503 | return -EFAULT; | 2520 | return -EFAULT; |
2504 | } | 2521 | } |
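
copy_chunked_from_user() above breaks one potentially huge copy_from_user() into 16-page chunks with a cond_resched() between them, so a large module load no longer runs unpreempted for the whole copy. A userspace sketch of the same chunking loop, with memcpy() standing in for copy_from_user() and sched_yield() for cond_resched() (purely illustrative):

#include <string.h>
#include <sched.h>

#define CHUNK_SIZE (16 * 4096UL)        /* mirrors COPY_CHUNK_SIZE = 16*PAGE_SIZE */

static int copy_chunked(void *dst, const void *src, unsigned long len)
{
        do {
                unsigned long n = len < CHUNK_SIZE ? len : CHUNK_SIZE;

                memcpy(dst, src, n);    /* kernel: copy_from_user(), which may fail */
                sched_yield();          /* kernel: cond_resched() between chunks    */
                dst = (char *)dst + n;
                src = (const char *)src + n;
                len -= n;
        } while (len);
        return 0;
}

int main(void)
{
        static char src[100000] = "payload", dst[100000];

        return copy_chunked(dst, src, sizeof(src));
}
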
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
2753 | mod->trace_events = section_objs(info, "_ftrace_events", | 2770 | mod->trace_events = section_objs(info, "_ftrace_events", |
2754 | sizeof(*mod->trace_events), | 2771 | sizeof(*mod->trace_events), |
2755 | &mod->num_trace_events); | 2772 | &mod->num_trace_events); |
2773 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | ||
2774 | sizeof(*mod->trace_enums), | ||
2775 | &mod->num_trace_enums); | ||
2756 | #endif | 2776 | #endif |
2757 | #ifdef CONFIG_TRACING | 2777 | #ifdef CONFIG_TRACING |
2758 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2778 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
@@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3349 | module_bug_cleanup(mod); | 3369 | module_bug_cleanup(mod); |
3350 | mutex_unlock(&module_mutex); | 3370 | mutex_unlock(&module_mutex); |
3351 | 3371 | ||
3352 | /* Free lock-classes: */ | ||
3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3354 | |||
3355 | /* we can't deallocate the module until we clear memory protection */ | 3372 | /* we can't deallocate the module until we clear memory protection */ |
3356 | unset_module_init_ro_nx(mod); | 3373 | unset_module_init_ro_nx(mod); |
3357 | unset_module_core_ro_nx(mod); | 3374 | unset_module_core_ro_nx(mod); |
@@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3375 | synchronize_rcu(); | 3392 | synchronize_rcu(); |
3376 | mutex_unlock(&module_mutex); | 3393 | mutex_unlock(&module_mutex); |
3377 | free_module: | 3394 | free_module: |
3395 | /* Free lock-classes; relies on the preceding sync_rcu() */ | ||
3396 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3397 | |||
3378 | module_deallocate(mod, info); | 3398 | module_deallocate(mod, info); |
3379 | free_copy: | 3399 | free_copy: |
3380 | free_copy(info); | 3400 | free_copy(info); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
955 | } | 955 | } |
956 | } | 956 | } |
957 | 957 | ||
958 | static bool is_nosave_page(unsigned long pfn) | ||
959 | { | ||
960 | struct nosave_region *region; | ||
961 | |||
962 | list_for_each_entry(region, &nosave_regions, list) { | ||
963 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
964 | pr_err("PM: %#010llx in e820 nosave region: " | ||
965 | "[mem %#010llx-%#010llx]\n", | ||
966 | (unsigned long long) pfn << PAGE_SHIFT, | ||
967 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
968 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
969 | - 1); | ||
970 | return true; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | return false; | ||
975 | } | ||
976 | |||
977 | /** | 958 | /** |
978 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page |
979 | * frames that should not be saved and free page frames. The pointers | 960 | * frames that should not be saved and free page frames. The pointers |
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
2042 | do { | 2023 | do { |
2043 | pfn = memory_bm_next_pfn(bm); | 2024 | pfn = memory_bm_next_pfn(bm); |
2044 | if (likely(pfn != BM_END_OF_MAP)) { | 2025 | if (likely(pfn != BM_END_OF_MAP)) { |
2045 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) | 2026 | if (likely(pfn_valid(pfn))) |
2046 | swsusp_set_page_free(pfn_to_page(pfn)); | 2027 | swsusp_set_page_free(pfn_to_page(pfn)); |
2047 | else | 2028 | else |
2048 | return -EFAULT; | 2029 | return -EFAULT; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..2f7937ee9e3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
306 | */ | 306 | */ |
307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; |
308 | 308 | ||
309 | /* cpus with isolated domains */ | ||
310 | cpumask_var_t cpu_isolated_map; | ||
311 | |||
309 | /* | 312 | /* |
310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. |
311 | */ | 314 | */ |
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) |
691 | { | 694 | { |
692 | /* | 695 | /* |
696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
698 | */ | ||
699 | if (current->policy == SCHED_FIFO) | ||
700 | return true; | ||
701 | |||
702 | /* | ||
703 | * Round-robin realtime tasks time slice with other tasks at the same | ||
704 | * realtime priority. Is this task the only one at this priority? | ||
705 | */ | ||
706 | if (current->policy == SCHED_RR) { | ||
707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
708 | |||
709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
710 | } | ||
711 | |||
712 | /* | ||
693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. |
694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible |
695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. |
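
For SCHED_RR, the hunk above lets the tick stop only when the task is the sole entity queued at its priority, detected by the list node's prev and next pointing at the same place (the list head). A self-contained sketch of that check on a plain circular doubly-linked list (not the kernel's list_head API):

#include <stdio.h>

struct node {
        struct node *prev, *next;
};

/* With a head plus exactly one entry, the entry's prev and next both
 * point at the head; a second entry makes them differ. */
static int only_entry(const struct node *n)
{
        return n->prev == n->next;
}

int main(void)
{
        struct node head, a, b;

        /* head <-> a : a is alone at this priority */
        head.next = &a; head.prev = &a;
        a.prev = &head; a.next = &head;
        printf("a alone: %d\n", only_entry(&a));        /* 1 */

        /* head <-> a <-> b : a now has a round-robin peer */
        head.next = &a; head.prev = &b;
        a.prev = &head; a.next = &b;
        b.prev = &a;    b.next = &head;
        printf("a alone: %d\n", only_entry(&a));        /* 0 */
        return 0;
}
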
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); |
997 | } | 1017 | } |
998 | 1018 | ||
1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
1020 | |||
1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
1022 | { | ||
1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
1024 | } | ||
1025 | |||
999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP |
1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1001 | { | 1028 | { |
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); |
1027 | 1054 | ||
1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { |
1056 | struct task_migration_notifier tmn; | ||
1057 | |||
1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) |
1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); |
1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; |
1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
1062 | |||
1063 | tmn.task = p; | ||
1064 | tmn.from_cpu = task_cpu(p); | ||
1065 | tmn.to_cpu = new_cpu; | ||
1066 | |||
1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1033 | } | 1068 | } |
1034 | 1069 | ||
1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); |
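
set_task_cpu() now fills a task_migration_notifier and fires an atomic notifier chain so interested code can react when a task changes CPUs. A tiny single-threaded sketch of the same register-then-broadcast idea (simplified; the kernel chain is lock-free and the struct names below are mine):

#include <stdio.h>

struct migration_event { int from_cpu, to_cpu; };

struct notifier {
        int (*fn)(const struct migration_event *ev);
        struct notifier *next;
};

static struct notifier *chain;

static void notifier_register(struct notifier *n)
{
        n->next = chain;                /* the kernel does this atomically */
        chain = n;
}

static void notifier_call_chain(const struct migration_event *ev)
{
        for (struct notifier *n = chain; n; n = n->next)
                n->fn(ev);
}

static int print_migration(const struct migration_event *ev)
{
        printf("migrated: cpu%d -> cpu%d\n", ev->from_cpu, ev->to_cpu);
        return 0;
}

int main(void)
{
        struct notifier nb = { .fn = print_migration };
        struct migration_event ev = { .from_cpu = 0, .to_cpu = 3 };

        notifier_register(&nb);
        notifier_call_chain(&ev);       /* what set_task_cpu() does on a real move */
        return 0;
}
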
@@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3034 | } else { | 3069 | } else { |
3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) |
3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; |
3072 | if (rt_prio(oldprio)) | ||
3073 | p->rt.timeout = 0; | ||
3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; |
3038 | } | 3075 | } |
3039 | 3076 | ||
@@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, |
5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) |
5320 | { | 5357 | { |
5321 | unsigned long flags; | ||
5322 | long cpu = (long)hcpu; | ||
5323 | struct dl_bw *dl_b; | ||
5324 | |||
5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { |
5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: |
5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); |
5328 | |||
5329 | /* explicitly allow suspend */ | ||
5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5331 | bool overflow; | ||
5332 | int cpus; | ||
5333 | |||
5334 | rcu_read_lock_sched(); | ||
5335 | dl_b = dl_bw_of(cpu); | ||
5336 | |||
5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5338 | cpus = dl_bw_cpus(cpu); | ||
5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5341 | |||
5342 | rcu_read_unlock_sched(); | ||
5343 | |||
5344 | if (overflow) | ||
5345 | return notifier_from_errno(-EBUSY); | ||
5346 | } | ||
5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; |
5362 | default: | ||
5363 | return NOTIFY_DONE; | ||
5348 | } | 5364 | } |
5349 | |||
5350 | return NOTIFY_DONE; | ||
5351 | } | 5365 | } |
5352 | 5366 | ||
5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) |
@@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5428 | break; | 5442 | break; |
5429 | } | 5443 | } |
5430 | 5444 | ||
5431 | /* | ||
5432 | * Even though we initialize ->capacity to something semi-sane, | ||
5433 | * we leave capacity_orig unset. This allows us to detect if | ||
5434 | * domain iteration is still funny without causing /0 traps. | ||
5435 | */ | ||
5436 | if (!group->sgc->capacity_orig) { | ||
5437 | printk(KERN_CONT "\n"); | ||
5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
5439 | break; | ||
5440 | } | ||
5441 | |||
5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { |
5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); |
5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); |
@@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); |
5812 | } | 5815 | } |
5813 | 5816 | ||
5814 | /* cpus with isolated domains */ | ||
5815 | static cpumask_var_t cpu_isolated_map; | ||
5816 | |||
5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ |
5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) |
5819 | { | 5819 | { |
@@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
5923 | */ | 5923 | */ |
5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
5926 | 5925 | ||
5927 | /* | 5926 | /* |
5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
@@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6233 | */ | 6232 | */ |
6234 | 6233 | ||
6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6235 | sd->flags |= SD_PREFER_SIBLING; | ||
6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
6238 | 6238 | ||
@@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
6998 | */ | 6998 | */ |
6999 | 6999 | ||
7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
7001 | case CPU_DOWN_FAILED: | ||
7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
7003 | break; | 7002 | break; |
7004 | default: | 7003 | default: |
@@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7011 | void *hcpu) | 7010 | void *hcpu) |
7012 | { | 7011 | { |
7013 | switch (action) { | 7012 | unsigned long flags; |
7013 | long cpu = (long)hcpu; | ||
7014 | struct dl_bw *dl_b; | ||
7015 | |||
7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
7018 | /* explicitly allow suspend */ | ||
7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
7020 | bool overflow; | ||
7021 | int cpus; | ||
7022 | |||
7023 | rcu_read_lock_sched(); | ||
7024 | dl_b = dl_bw_of(cpu); | ||
7025 | |||
7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7027 | cpus = dl_bw_cpus(cpu); | ||
7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7030 | |||
7031 | rcu_read_unlock_sched(); | ||
7032 | |||
7033 | if (overflow) | ||
7034 | return notifier_from_errno(-EBUSY); | ||
7035 | } | ||
7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
7016 | break; | 7037 | break; |
7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
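
The admission check that moved into cpuset_cpu_inactive() refuses to take a CPU down when the remaining CPUs cannot carry the SCHED_DEADLINE bandwidth already admitted. A hedged simplification of that test (not the exact __dl_overflow() expression; bandwidths here are parts-per-1024 rather than the kernel's 20-bit fixed point):

#include <stdio.h>
#include <stdint.h>

#define BW_UNIT 1024

/* Removing a CPU overflows when the admitted bandwidth no longer fits
 * on the CPUs that will remain. */
static int dl_overflow_on_removal(uint64_t per_cpu_limit,
                                  uint64_t total_admitted,
                                  int cpus_remaining)
{
        return per_cpu_limit * cpus_remaining < total_admitted;
}

int main(void)
{
        uint64_t limit = (BW_UNIT * 95) / 100;  /* 95% per CPU, like the rt default */
        uint64_t admitted = 3 * (BW_UNIT / 2);  /* three tasks of 50% each          */

        printf("4->3 CPUs overflows: %d\n", dl_overflow_on_removal(limit, admitted, 3)); /* 0 */
        printf("2->1 CPU  overflows: %d\n", dl_overflow_on_removal(limit, admitted, 1)); /* 1 */
        return 0;
}
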
@@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
@@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
7796 | } | 7817 | } |
7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7798 | 7819 | ||
7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
7800 | { | 7821 | { |
7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
@@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7897 | if (ret) | 7918 | if (ret) |
7898 | goto undo; | 7919 | goto undo; |
7899 | 7920 | ||
7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
7901 | if (ret) | 7922 | if (ret) |
7902 | goto undo; | 7923 | goto undo; |
7903 | 7924 | ||
7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
7905 | if (ret) | 7926 | if (ret) |
7906 | goto undo; | 7927 | goto undo; |
7907 | 7928 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
73 | { | 73 | { |
74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
75 | 75 | ||
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
222 | |||
223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
224 | { | ||
225 | struct rq *later_rq = NULL; | ||
226 | bool fallback = false; | ||
227 | |||
228 | later_rq = find_lock_later_rq(p, rq); | ||
229 | |||
230 | if (!later_rq) { | ||
231 | int cpu; | ||
232 | |||
233 | /* | ||
234 | * If we cannot preempt any rq, fall back to pick any | ||
235 | * online cpu. | ||
236 | */ | ||
237 | fallback = true; | ||
238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
239 | if (cpu >= nr_cpu_ids) { | ||
240 | /* | ||
241 | * Failed to find any suitable cpu. | ||
242 | * The task will never come back! | ||
243 | */ | ||
244 | BUG_ON(dl_bandwidth_enabled()); | ||
245 | |||
246 | /* | ||
247 | * If admission control is disabled we | ||
248 | * try a little harder to let the task | ||
249 | * run. | ||
250 | */ | ||
251 | cpu = cpumask_any(cpu_active_mask); | ||
252 | } | ||
253 | later_rq = cpu_rq(cpu); | ||
254 | double_lock_balance(rq, later_rq); | ||
255 | } | ||
256 | |||
257 | deactivate_task(rq, p, 0); | ||
258 | set_task_cpu(p, later_rq->cpu); | ||
259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
260 | |||
261 | if (!fallback) | ||
262 | resched_curr(later_rq); | ||
263 | |||
264 | double_unlock_balance(rq, later_rq); | ||
265 | } | ||
266 | |||
221 | #else | 267 | #else |
222 | 268 | ||
223 | static inline | 269 | static inline |
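
dl_task_offline_migration() above pushes a throttled deadline task off a runqueue that has gone offline: it first tries a later_rq it can preempt, otherwise falls back to any CPU in the intersection of the active mask and the task's affinity, and only with admission control disabled to any active CPU at all. A small sketch of that fallback selection with plain bitmasks (illustrative, not the cpumask API):

#include <stdio.h>

/* Pick the lowest set bit; return -1 if the mask is empty. */
static int first_cpu(unsigned long mask)
{
        for (int cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
                if (mask & (1UL << cpu))
                        return cpu;
        return -1;
}

static int pick_fallback_cpu(unsigned long active, unsigned long allowed)
{
        int cpu = first_cpu(active & allowed);  /* prefer an allowed, online CPU */

        if (cpu < 0)
                cpu = first_cpu(active);        /* last resort: ignore affinity */
        return cpu;
}

int main(void)
{
        /* CPUs 1 and 2 online; the task was allowed only on CPU 0, which died. */
        printf("fallback cpu: %d\n", pick_fallback_cpu(0x6UL, 0x1UL));  /* -> 1 */
        return 0;
}
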
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
514 | unsigned long flags; | 560 | unsigned long flags; |
515 | struct rq *rq; | 561 | struct rq *rq; |
516 | 562 | ||
517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
518 | 564 | ||
519 | /* | 565 | /* |
520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
538 | 584 | ||
585 | #ifdef CONFIG_SMP | ||
586 | /* | ||
587 | * If we find that the rq the task was on is no longer | ||
588 | * available, we need to select a new rq. | ||
589 | */ | ||
590 | if (unlikely(!rq->online)) { | ||
591 | dl_task_offline_migration(rq, p); | ||
592 | goto unlock; | ||
593 | } | ||
594 | #endif | ||
595 | |||
539 | /* | 596 | /* |
540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
541 | * | 598 | * |
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
570 | #endif | 627 | #endif |
571 | unlock: | 628 | unlock: |
572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
573 | 630 | ||
574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
575 | } | 632 | } |
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
914 | } | 971 | } |
915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
974 | /* | ||
975 | * Tell update_rq_clock() that we've just updated, | ||
976 | * so we don't do microscopic update in schedule() | ||
977 | * and double the fastpath cost. | ||
978 | */ | ||
979 | rq_clock_skip_update(rq, true); | ||
917 | } | 980 | } |
918 | 981 | ||
919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1659 | { | 1722 | { |
1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
1661 | 1724 | ||
1662 | /* | ||
1663 | * If p is throttled, don't consider the possibility | ||
1664 | * of preempting rq->curr, the check will be done right | ||
1665 | * after its runtime will get replenished. | ||
1666 | */ | ||
1667 | if (unlikely(p->dl.dl_throttled)) | ||
1668 | return; | ||
1669 | |||
1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
71 | if (!se) { | 71 | if (!se) { |
72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
75 | return; | 75 | return; |
76 | } | 76 | } |
77 | 77 | ||
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
94 | P(se->load.weight); | 94 | P(se->load.weight); |
95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
98 | P(se->avg.avg_period); | ||
98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
100 | P(se->avg.utilization_avg_contrib); | ||
99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
100 | #endif | 102 | #endif |
101 | #undef PN | 103 | #undef PN |
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
220 | cfs_rq->utilization_load_avg); | ||
217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
636 | P(se.load.weight); | 640 | P(se.load.weight); |
637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
644 | P(se.avg.avg_period); | ||
640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
646 | P(se.avg.utilization_avg_contrib); | ||
641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
642 | #endif | 648 | #endif |
643 | P(policy); | 649 | P(policy); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bcfe32088b37..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
671 | 671 | ||
672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
673 | 674 | ||
674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
677 | u32 slice; | 678 | u32 slice; |
678 | 679 | ||
679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
684 | __update_task_entity_utilization(&p->se); | ||
683 | } | 685 | } |
684 | #else | 686 | #else |
685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
1198 | { | 1200 | { |
1199 | long imb, old_imb; | ||
1200 | long orig_src_load, orig_dst_load; | ||
1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
1202 | long orig_src_load; | ||
1203 | long load_a, load_b; | ||
1204 | long moved_load; | ||
1205 | long imb; | ||
1202 | 1206 | ||
1203 | /* | 1207 | /* |
1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
1212 | 1216 | ||
1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
1220 | if (load_a < load_b) | ||
1221 | swap(load_a, load_b); | ||
1216 | 1222 | ||
1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
1221 | return false; | 1227 | return false; |
1222 | 1228 | ||
1223 | /* | 1229 | /* |
1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
1232 | * without moving things past the point of balance. | ||
1226 | */ | 1233 | */ |
1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
1228 | orig_dst_load = env->dst_stats.load; | ||
1229 | 1235 | ||
1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
1241 | * situation, without crossing over the balance point. | ||
1242 | */ | ||
1243 | moved_load = orig_src_load - src_load; | ||
1235 | 1244 | ||
1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
1248 | else | ||
1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
1238 | } | 1251 | } |
1239 | 1252 | ||
1240 | /* | 1253 | /* |
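
The rewritten load_too_imbalanced() compares capacity-weighted loads: after ordering the two sides, the move is over the threshold when load_a * src_capacity * 100 exceeds load_b * dst_capacity * imbalance_pct, and a further check then rejects moves that overshoot the balance point. A standalone sketch of just the threshold part, with a worked example (variable names are mine):

#include <stdio.h>

/* "Too imbalanced" when the larger capacity-weighted load exceeds the
 * smaller one by more than the imbalance_pct slack (e.g. 125 = 25%). */
static int too_imbalanced(long src_load, long dst_load,
                          long src_capacity, long dst_capacity,
                          int imbalance_pct)
{
        long a = dst_load, b = src_load;

        if (a < b) { long t = a; a = b; b = t; }        /* slope only, not direction */

        return a * src_capacity * 100 > b * dst_capacity * imbalance_pct;
}

int main(void)
{
        /* Equal capacities: 1200 vs 1000 stays within 25%, 1300 vs 1000 does not. */
        printf("%d\n", too_imbalanced(1000, 1200, 1024, 1024, 125));    /* 0 */
        printf("%d\n", too_imbalanced(1000, 1300, 1024, 1024, 125));    /* 1 */
        return 0;
}
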
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1675 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
1676 | } else { | 1689 | } else { |
1677 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
1678 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
1679 | } | 1692 | } |
1680 | 1693 | ||
1681 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
1765 | } | 1778 | } |
1766 | } | 1779 | } |
1767 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
1781 | if (!max_faults) | ||
1782 | break; | ||
1768 | nodes = max_group; | 1783 | nodes = max_group; |
1769 | } | 1784 | } |
1770 | return nid; | 1785 | return nid; |
@@ -2165,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
2165 | vma = mm->mmap; | 2180 | vma = mm->mmap; |
2166 | } | 2181 | } |
2167 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { |
2168 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || |
2184 | is_vm_hugetlb_page(vma)) { | ||
2169 | continue; | 2185 | continue; |
2186 | } | ||
2170 | 2187 | ||
2171 | /* | 2188 | /* |
2172 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not |
@@ -2501,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2501 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
2502 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2503 | */ | 2520 | */ |
2504 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
2505 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
2506 | int runnable) | 2523 | int runnable, |
2524 | int running) | ||
2507 | { | 2525 | { |
2508 | u64 delta, periods; | 2526 | u64 delta, periods; |
2509 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
2510 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2511 | 2530 | ||
2512 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
2513 | /* | 2532 | /* |
@@ -2529,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2529 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
2530 | 2549 | ||
2531 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
2532 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
2533 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
2534 | /* period roll-over */ | 2553 | /* period roll-over */ |
2535 | decayed = 1; | 2554 | decayed = 1; |
@@ -2542,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2542 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
2543 | if (runnable) | 2562 | if (runnable) |
2544 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
2545 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
2565 | sa->running_avg_sum += delta_w * scale_freq | ||
2566 | >> SCHED_CAPACITY_SHIFT; | ||
2567 | sa->avg_period += delta_w; | ||
2546 | 2568 | ||
2547 | delta -= delta_w; | 2569 | delta -= delta_w; |
2548 | 2570 | ||
@@ -2552,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2552 | 2574 | ||
2553 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
2554 | periods + 1); | 2576 | periods + 1); |
2555 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
2578 | periods + 1); | ||
2579 | sa->avg_period = decay_load(sa->avg_period, | ||
2556 | periods + 1); | 2580 | periods + 1); |
2557 | 2581 | ||
2558 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2559 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
2560 | if (runnable) | 2584 | if (runnable) |
2561 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
2562 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
2588 | >> SCHED_CAPACITY_SHIFT; | ||
2589 | sa->avg_period += runnable_contrib; | ||
2563 | } | 2590 | } |
2564 | 2591 | ||
2565 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
2566 | if (runnable) | 2593 | if (runnable) |
2567 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
2568 | sa->runnable_avg_period += delta; | 2595 | if (running) |
2596 | sa->running_avg_sum += delta * scale_freq | ||
2597 | >> SCHED_CAPACITY_SHIFT; | ||
2598 | sa->avg_period += delta; | ||
2569 | 2599 | ||
2570 | return decayed; | 2600 | return decayed; |
2571 | } | 2601 | } |
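
The hunk above adds running_avg_sum next to runnable_avg_sum: it accumulates only the time the entity actually ran, scaled by the current frequency capacity (scale_freq >> SCHED_CAPACITY_SHIFT), and both sums decay geometrically with a factor y chosen so that y^32 = 1/2. A hedged floating-point sketch of that accumulation; the kernel splits deltas at 1024us period boundaries and uses fixed-point lookup tables rather than pow():

#include <stdio.h>
#include <math.h>

#define PERIOD_US 1024.0                /* one averaging period, ~1ms     */
#define CAP_SCALE 1024.0                /* SCHED_CAPACITY_SCALE           */

struct avg {
        double runnable_sum;            /* time runnable, decayed                   */
        double running_sum;             /* time running, freq-scaled and decayed    */
        double period_sum;              /* total elapsed time, decayed              */
};

/* y such that y^32 == 0.5, as encoded in the kernel's decay tables. */
static const double y = 0.97857206208770013;

static void update_avg(struct avg *a, double delta_us, int runnable,
                       int running, double scale_freq)
{
        double decay = pow(y, delta_us / PERIOD_US);

        /* decay the old sums, then accrue the new window */
        a->runnable_sum = a->runnable_sum * decay + (runnable ? delta_us : 0);
        a->running_sum  = a->running_sum  * decay +
                          (running ? delta_us * scale_freq / CAP_SCALE : 0);
        a->period_sum   = a->period_sum   * decay + delta_us;
}

int main(void)
{
        struct avg a = { 0, 0, 0 };

        /* 10ms running at full frequency, then 10ms idle at half frequency. */
        update_avg(&a, 10240, 1, 1, 1024);
        update_avg(&a, 10240, 0, 0, 512);
        printf("runnable/period = %.2f, running/period = %.2f\n",
               a.runnable_sum / a.period_sum, a.running_sum / a.period_sum);
        return 0;
}
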
@@ -2582,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
2582 | return 0; | 2612 | return 0; |
2583 | 2613 | ||
2584 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
2615 | se->avg.utilization_avg_contrib = | ||
2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
2585 | 2617 | ||
2586 | return decays; | 2618 | return decays; |
2587 | } | 2619 | } |
@@ -2617,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
2617 | 2649 | ||
2618 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
2619 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
2620 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
2621 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
2622 | 2654 | ||
2623 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
@@ -2670,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
2670 | 2702 | ||
2671 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
2672 | { | 2704 | { |
2673 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
2706 | runnable, runnable); | ||
2674 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
2675 | } | 2708 | } |
2676 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -2688,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
2688 | 2721 | ||
2689 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
2690 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
2691 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
2692 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
2693 | } | 2726 | } |
2694 | 2727 | ||
@@ -2707,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
2707 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
2708 | } | 2741 | } |
2709 | 2742 | ||
2743 | |||
2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
2745 | { | ||
2746 | u32 contrib; | ||
2747 | |||
2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
2750 | contrib /= (se->avg.avg_period + 1); | ||
2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
2752 | } | ||
2753 | |||
2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
2755 | { | ||
2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
2757 | |||
2758 | if (entity_is_task(se)) | ||
2759 | __update_task_entity_utilization(se); | ||
2760 | else | ||
2761 | se->avg.utilization_avg_contrib = | ||
2762 | group_cfs_rq(se)->utilization_load_avg; | ||
2763 | |||
2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
2765 | } | ||
2766 | |||
2710 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
2711 | long load_contrib) | 2768 | long load_contrib) |
2712 | { | 2769 | { |
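
__update_task_entity_utilization() turns the frequency-scaled running sum into a contribution in load units: contrib = running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1), so a task running roughly half the time at full capacity contributes about 512. The arithmetic in isolation (the window length below is only indicative of the PELT maximum):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE 1024UL

/* contrib = running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1) */
static unsigned long utilization_contrib(uint32_t running_avg_sum,
                                         uint32_t avg_period)
{
        return (unsigned long)running_avg_sum * SCHED_LOAD_SCALE /
               (avg_period + 1);
}

int main(void)
{
        /* running for half of a ~47ms averaging window -> about half the scale */
        printf("%lu\n", utilization_contrib(23500, 47000));     /* ~511 */
        return 0;
}
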
@@ -2723,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2723 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
2724 | { | 2781 | { |
2725 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2726 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2727 | u64 now; | 2785 | u64 now; |
2728 | 2786 | ||
2729 | /* | 2787 | /* |
@@ -2735,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2735 | else | 2793 | else |
2736 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
2737 | 2795 | ||
2738 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
2797 | cfs_rq->curr == se)) | ||
2739 | return; | 2798 | return; |
2740 | 2799 | ||
2741 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
2742 | 2802 | ||
2743 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
2744 | return; | 2804 | return; |
2745 | 2805 | ||
2746 | if (se->on_rq) | 2806 | if (se->on_rq) { |
2747 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
2748 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
2809 | } else { | ||
2749 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
2811 | } | ||
2750 | } | 2812 | } |
2751 | 2813 | ||
2752 | /* | 2814 | /* |
@@ -2821,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2821 | } | 2883 | } |
2822 | 2884 | ||
2823 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
2824 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
2825 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
2826 | } | 2889 | } |
@@ -2839,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2839 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
2840 | 2903 | ||
2841 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
2842 | if (sleep) { | 2906 | if (sleep) { |
2843 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
2844 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
@@ -3176,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3176 | */ | 3240 | */ |
3177 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
3178 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
3243 | update_entity_load_avg(se, 1); | ||
3179 | } | 3244 | } |
3180 | 3245 | ||
3181 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
@@ -4302,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
4302 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
4303 | } | 4368 | } |
4304 | 4369 | ||
4370 | static unsigned long capacity_orig_of(int cpu) | ||
4371 | { | ||
4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
4373 | } | ||
4374 | |||
4305 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
4306 | { | 4376 | { |
4307 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
@@ -4715,6 +4785,33 @@ next: | |||
4715 | done: | 4785 | done: |
4716 | return target; | 4786 | return target; |
4717 | } | 4787 | } |
4788 | /* | ||
4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
4790 | * tasks. The return value uses the same unit as capacity so we can | ||
4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
4792 | * tasks (i.e. cpu_capacity). | ||

4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
4799 | * after migrating tasks until the average stabilizes with the new running | ||
4800 | * time. So we need to check that the usage stays within the range | ||
4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
4804 | */ | ||
4805 | static int get_cpu_usage(int cpu) | ||
4806 | { | ||
4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
4809 | |||
4810 | if (usage >= SCHED_LOAD_SCALE) | ||
4811 | return capacity; | ||
4812 | |||
4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4814 | } | ||
4718 | 4815 | ||
4719 | /* | 4816 | /* |
4720 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
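
get_cpu_usage() above clamps the utilization signal so the reported usage never exceeds the CPU's original capacity, even when rounding or a fresh migration pushes utilization_load_avg past SCHED_LOAD_SCALE. The same clamping in isolation:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define SCHED_LOAD_SHIFT 10

static unsigned long cpu_usage(unsigned long util, unsigned long capacity_orig)
{
        if (util >= SCHED_LOAD_SCALE)   /* transiently over 100%?        */
                return capacity_orig;   /* cap at the CPU's own capacity */

        return (util * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
        /* An SMT sibling with capacity 589: 50% utilization, then an overshoot. */
        printf("%lu %lu\n", cpu_usage(512, 589), cpu_usage(1240, 589));
        return 0;
}
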
@@ -5841,12 +5938,12 @@ struct sg_lb_stats { | |||
5841 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5842 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
5843 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
5941 | unsigned long group_usage; /* Total usage of the group */ | ||
5844 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5845 | unsigned int group_capacity_factor; | ||
5846 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
5847 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
5848 | enum group_type group_type; | 5945 | enum group_type group_type; |
5849 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
5850 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
5851 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
5852 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
@@ -5917,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5917 | return load_idx; | 6014 | return load_idx; |
5918 | } | 6015 | } |
5919 | 6016 | ||
5920 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
5921 | { | ||
5922 | return SCHED_CAPACITY_SCALE; | ||
5923 | } | ||
5924 | |||
5925 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
5926 | { | ||
5927 | return default_scale_capacity(sd, cpu); | ||
5928 | } | ||
5929 | |||
5930 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5931 | { | 6018 | { |
5932 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
@@ -5943,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5943 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
5944 | { | 6031 | { |
5945 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
5946 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
5947 | s64 delta; | 6034 | s64 delta; |
5948 | 6035 | ||
5949 | /* | 6036 | /* |
@@ -5959,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
5959 | 6046 | ||
5960 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
5961 | 6048 | ||
5962 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
5963 | /* Ensures that capacity won't end up being negative */ | ||
5964 | available = 0; | ||
5965 | } else { | ||
5966 | available = total - avg; | ||
5967 | } | ||
5968 | |||
5969 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
5970 | total = SCHED_CAPACITY_SCALE; | ||
5971 | 6050 | ||
5972 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
6052 | return SCHED_CAPACITY_SCALE - used; | ||
5973 | 6053 | ||
5974 | return div_u64(available, total); | 6054 | return 1; |
5975 | } | 6055 | } |
5976 | 6056 | ||
5977 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
@@ -5986,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
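
scale_rt_capacity() is reduced above to: used = avg / total, and the capacity left for CFS is SCHED_CAPACITY_SCALE - used, floored at 1 so later divisions cannot hit zero. A short sketch of the new formula, assuming (as sched_rt_avg_update() appears to do) that rt_avg is accumulated pre-scaled by the capacity unit:

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024ULL

/* rt_avg carries the capacity scale, so rt_avg / period is directly
 * "capacity units consumed by RT/IRQ work" in the range 0..1024. */
static unsigned long scale_rt_capacity(uint64_t rt_avg, uint64_t period)
{
        uint64_t used = rt_avg / period;

        if (used < SCHED_CAPACITY_SCALE)
                return SCHED_CAPACITY_SCALE - used;

        return 1;       /* never return 0: callers divide by this */
}

int main(void)
{
        uint64_t period = 1000000;      /* arbitrary window for the sketch */

        /* 25% of the window spent in RT/IRQ leaves 768/1024 for CFS. */
        printf("%lu\n", scale_rt_capacity(period / 4 * SCHED_CAPACITY_SCALE, period));
        return 0;
}
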
5986 | 6066 | ||
5987 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
5988 | 6068 | ||
5989 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
5990 | |||
5991 | if (sched_feat(ARCH_CAPACITY)) | ||
5992 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
5993 | else | ||
5994 | capacity *= default_scale_capacity(sd, cpu); | ||
5995 | |||
5996 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
5997 | 6070 | ||
5998 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
5999 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
@@ -6009,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6009 | { | 6082 | { |
6010 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
6011 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
6012 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
6013 | unsigned long interval; | 6086 | unsigned long interval; |
6014 | 6087 | ||
6015 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -6021,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6021 | return; | 6094 | return; |
6022 | } | 6095 | } |
6023 | 6096 | ||
6024 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
6025 | 6098 | ||
6026 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
6027 | /* | 6100 | /* |
@@ -6041,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6041 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
6042 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
6043 | * | 6116 | * |
6044 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
6045 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
6046 | * | ||
6047 | * Runtime updates will correct capacity_orig. | ||
6048 | */ | 6119 | */ |
6049 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
6050 | capacity_orig += capacity_of(cpu); | ||
6051 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
6052 | continue; | 6122 | continue; |
6053 | } | 6123 | } |
6054 | 6124 | ||
6055 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
6056 | capacity_orig += sgc->capacity_orig; | ||
6057 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
6058 | } | 6127 | } |
6059 | } else { | 6128 | } else { |
@@ -6064,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6064 | 6133 | ||
6065 | group = child->groups; | 6134 | group = child->groups; |
6066 | do { | 6135 | do { |
6067 | capacity_orig += group->sgc->capacity_orig; | ||
6068 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
6069 | group = group->next; | 6137 | group = group->next; |
6070 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
6071 | } | 6139 | } |
6072 | 6140 | ||
6073 | sdg->sgc->capacity_orig = capacity_orig; | ||
6074 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
6075 | } | 6142 | } |
6076 | 6143 | ||
6077 | /* | 6144 | /* |
6078 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
6079 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
6080 | * which on its own isn't powerful enough. | 6147 | * Return true if the capacity is reduced.
6081 | * | ||
6082 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
6083 | */ | 6148 | */ |
6084 | static inline int | 6149 | static inline int |
6085 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
6086 | { | 6151 | { |
6087 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
6088 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
6089 | */ | ||
6090 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
6091 | return 0; | ||
6092 | |||
6093 | /* | ||
6094 | * If ~90% of the cpu_capacity is still there, we're good. | ||
6095 | */ | ||
6096 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
6097 | return 1; | ||
6098 | |||
6099 | return 0; | ||
6100 | } | 6154 | } |
6101 | 6155 | ||
6102 | /* | 6156 | /* |
@@ -6134,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6134 | } | 6188 | } |
6135 | 6189 | ||
6136 | /* | 6190 | /* |
6137 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
6138 | * | 6192 | * be used by some tasks. |
6139 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the number of tasks is
6140 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
6141 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
6197 | * account the variance of the tasks' load and to return true if the available | ||
6198 | * capacity in meaningful for the load balancer. | ||
6199 | * As an example, an available capacity of 1% can appear but it doesn't make | ||
6200 | * any benefit for the load balance. | ||
6142 | */ | 6201 | */ |
6143 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6144 | { | 6204 | { |
6145 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
6146 | unsigned int capacity, capacity_orig; | 6206 | return true; |
6147 | 6207 | ||
6148 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
6149 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
6150 | cpus = group->group_weight; | 6210 | return true; |
6211 | |||
6212 | return false; | ||
6213 | } | ||
6151 | 6214 | ||
6152 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* |
6153 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can |
6154 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. |
6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
6221 | * false. | ||
6222 | */ | ||
6223 | static inline bool | ||
6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6225 | { | ||
6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
6227 | return false; | ||
6155 | 6228 | ||
6156 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
6157 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
6158 | if (!capacity_factor) | 6231 | return true; |
6159 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6160 | 6232 | ||
6161 | return capacity_factor; | 6233 | return false; |
6162 | } | 6234 | } |
6163 | 6235 | ||
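To make the "exact fit" remark above concrete, here is a minimal userspace rendering of the two predicates, keeping only the fields they inspect; the group size, capacity, usage and imbalance_pct are invented. With four tasks on four CPUs running right at the available capacity, both predicates return false, exactly as the comment describes.

#include <stdio.h>
#include <stdbool.h>

struct sg_stats {			/* only what the two checks look at */
	unsigned long sum_nr_running;
	unsigned long group_weight;
	unsigned long group_capacity;
	unsigned long group_usage;
};

static bool group_has_capacity(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running < s->group_weight)
		return true;
	if (s->group_capacity * 100 > s->group_usage * imbalance_pct)
		return true;
	return false;
}

static bool group_is_overloaded(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running <= s->group_weight)
		return false;
	if (s->group_capacity * 100 < s->group_usage * imbalance_pct)
		return true;
	return false;
}

int main(void)
{
	/* 4 CPUs, 4 busy tasks, usage right at capacity: the "exact fit" case. */
	struct sg_stats s = { .sum_nr_running = 4, .group_weight = 4,
			      .group_capacity = 4096, .group_usage = 4096 };

	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&s, 125), group_is_overloaded(&s, 125));
	return 0;
}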
6164 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
6165 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
6238 | struct sg_lb_stats *sgs) | ||
6166 | { | 6239 | { |
6167 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
6168 | return group_overloaded; | 6241 | return group_overloaded; |
6169 | 6242 | ||
6170 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
@@ -6202,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6202 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
6203 | 6276 | ||
6204 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
6278 | sgs->group_usage += get_cpu_usage(i); | ||
6205 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6206 | 6280 | ||
6207 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
@@ -6224,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6224 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
6225 | 6299 | ||
6226 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
6227 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
6228 | sgs->group_type = group_classify(group, sgs); | ||
6229 | 6301 | ||
6230 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6231 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
6232 | } | 6304 | } |
6233 | 6305 | ||
6234 | /** | 6306 | /** |
@@ -6350,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6350 | 6422 | ||
6351 | /* | 6423 | /* |
6352 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
6353 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
6354 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
6355 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
6356 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
6357 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
6358 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible when a large weight task outweighs 
6359 | * when a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). 
6360 | */ | 6432 | */ |
6361 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
6362 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
6363 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
6364 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
6437 | sgs->group_type = group_overloaded; | ||
6365 | } | 6438 | } |
6366 | 6439 | ||
6367 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -6541,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6541 | */ | 6614 | */ |
6542 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
6543 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
6544 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
6545 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
6546 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
6547 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
6548 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
6622 | load_above_capacity = ~0UL; | ||
6549 | } | 6623 | } |
6550 | 6624 | ||
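A small worked example of the new load_above_capacity arithmetic; SCHED_LOAD_SCALE is 1024 on common configurations and the task count and group capacity below are invented. Each excess task counts as one full SCHED_LOAD_SCALE of load, and the value saturates to ~0UL when there is no excess, so that a later min() against this term is effectively a no-op.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* typical value, configuration dependent */

static unsigned long load_above_capacity(unsigned long sum_nr_running,
					 unsigned long group_capacity)
{
	unsigned long load = sum_nr_running * SCHED_LOAD_SCALE;

	if (load > group_capacity)
		return load - group_capacity;

	/* No excess: saturate so this term never limits the imbalance. */
	return ~0UL;
}

int main(void)
{
	/* 4 runnable tasks on a group with about 2.5 CPUs of capacity left. */
	printf("%lu\n", load_above_capacity(4, 2560));	/* 4*1024 - 2560 = 1536 */
	return 0;
}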
6551 | /* | 6625 | /* |
@@ -6608,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6608 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
6609 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
6610 | 6684 | ||
6685 | /* ASYM feature bypasses nice load balance check */ | ||
6611 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
6612 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
6613 | return sds.busiest; | 6688 | return sds.busiest; |
@@ -6628,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6628 | goto force_balance; | 6703 | goto force_balance; |
6629 | 6704 | ||
6630 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6631 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
6632 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
6633 | goto force_balance; | 6708 | goto force_balance; |
6634 | 6709 | ||
6635 | /* | 6710 | /* |
@@ -6688,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6688 | int i; | 6763 | int i; |
6689 | 6764 | ||
6690 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6691 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
6692 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
6693 | 6768 | ||
6694 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
@@ -6717,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6717 | continue; | 6792 | continue; |
6718 | 6793 | ||
6719 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
6720 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
6721 | if (!capacity_factor) | ||
6722 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6723 | 6795 | ||
6724 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
6725 | 6797 | ||
@@ -6727,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6727 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
6728 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
6729 | */ | 6801 | */ |
6730 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
6804 | !check_cpu_capacity(rq, env->sd)) | ||
6731 | continue; | 6805 | continue; |
6732 | 6806 | ||
6733 | /* | 6807 | /* |
@@ -6775,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
6775 | return 1; | 6849 | return 1; |
6776 | } | 6850 | } |
6777 | 6851 | ||
6852 | /* | ||
6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
6855 | * because of other sched_class or IRQs if more capacity stays | ||
6856 | * available on dst_cpu. | ||
6857 | */ | ||
6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
6862 | return 1; | ||
6863 | } | ||
6864 | |||
6778 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
6779 | } | 6866 | } |
6780 | 6867 | ||
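The new condition reads naturally as a standalone predicate. In the sketch below, the helper name worth_active_balance() and all the numbers are invented (imbalance_pct of 125, a source CPU that lost roughly 40% of its capacity to an RT hog, an untouched destination CPU):

#include <stdio.h>

static int check_cpu_capacity(unsigned long cap, unsigned long cap_orig,
			      unsigned int imbalance_pct)
{
	return (cap * imbalance_pct) < (cap_orig * 100);
}

/* dst is idle and src runs a single CFS task: migrate it if src has lost a
 * significant part of its capacity to RT/IRQ and dst clearly has more left. */
static int worth_active_balance(unsigned long src_cap, unsigned long src_cap_orig,
				unsigned long dst_cap, unsigned int imbalance_pct)
{
	return check_cpu_capacity(src_cap, src_cap_orig, imbalance_pct) &&
	       (src_cap * imbalance_pct < dst_cap * 100);
}

int main(void)
{
	/* src at 600/1024 because of an RT hog, dst untouched at 1024. */
	printf("%d\n", worth_active_balance(600, 1024, 1024, 125));	/* 1 */
	return 0;
}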
@@ -6874,6 +6961,9 @@ redo: | |||
6874 | 6961 | ||
6875 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
6876 | 6963 | ||
6964 | env.src_cpu = busiest->cpu; | ||
6965 | env.src_rq = busiest; | ||
6966 | |||
6877 | ld_moved = 0; | 6967 | ld_moved = 0; |
6878 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
6879 | /* | 6969 | /* |
@@ -6883,8 +6973,6 @@ redo: | |||
6883 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
6884 | */ | 6974 | */ |
6885 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
6886 | env.src_cpu = busiest->cpu; | ||
6887 | env.src_rq = busiest; | ||
6888 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
6889 | 6977 | ||
6890 | more_balance: | 6978 | more_balance: |
@@ -7584,22 +7672,25 @@ end: | |||
7584 | 7672 | ||
7585 | /* | 7673 | /* |
7586 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
7587 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
7588 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
7589 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
7590 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
7680 | * group has multiple busy cpus. | ||
7591 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7592 | * domain span are idle. | 7682 | * domain span are idle. |
7593 | */ | 7683 | */ |
7594 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
7595 | { | 7685 | { |
7596 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
7597 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
7598 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
7599 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
7690 | bool kick = false; | ||
7600 | 7691 | ||
7601 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
7602 | return 0; | 7693 | return false; |
7603 | 7694 | ||
7604 | /* | 7695 | /* |
7605 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
@@ -7613,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7613 | * balancing. | 7704 | * balancing. |
7614 | */ | 7705 | */ |
7615 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
7616 | return 0; | 7707 | return false; |
7617 | 7708 | ||
7618 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
7619 | return 0; | 7710 | return false; |
7620 | 7711 | ||
7621 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
7622 | goto need_kick; | 7713 | return true; |
7623 | 7714 | ||
7624 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
7625 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7626 | |||
7627 | if (sd) { | 7717 | if (sd) { |
7628 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
7629 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7630 | 7720 | ||
7631 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
7632 | goto need_kick_unlock; | 7722 | kick = true; |
7723 | goto unlock; | ||
7724 | } | ||
7725 | |||
7633 | } | 7726 | } |
7634 | 7727 | ||
7635 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
7729 | if (sd) { | ||
7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
7731 | check_cpu_capacity(rq, sd)) { | ||
7732 | kick = true; | ||
7733 | goto unlock; | ||
7734 | } | ||
7735 | } | ||
7636 | 7736 | ||
7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
7637 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
7638 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
7639 | goto need_kick_unlock; | 7740 | kick = true; |
7640 | 7741 | goto unlock; | |
7641 | rcu_read_unlock(); | 7742 | } |
7642 | return 0; | ||
7643 | 7743 | ||
7644 | need_kick_unlock: | 7744 | unlock: |
7645 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
7646 | need_kick: | 7746 | return kick; |
7647 | return 1; | ||
7648 | } | 7747 | } |
7649 | #else | 7748 | #else |
7650 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
@@ -7660,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
7660 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
7661 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
7662 | 7761 | ||
7663 | rebalance_domains(this_rq, idle); | ||
7664 | |||
7665 | /* | 7762 | /* |
7666 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
7667 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
7668 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
7766 | * give the idle cpus a chance to load balance. Else we may | ||
7767 | * load balance only within the local sched_domain hierarchy | ||
7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
7669 | */ | 7769 | */ |
7670 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
7771 | rebalance_domains(this_rq, idle); | ||
7671 | } | 7772 | } |
7672 | 7773 | ||
7673 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
56 | */ | 56 | */ |
57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
58 | 58 | ||
59 | #ifdef HAVE_RT_PUSH_IPI | ||
60 | /* | ||
61 | * In order to avoid a thundering herd of CPUs all lowering | ||
62 | * their priorities at the same time while a single CPU has an | ||
63 | * RT task that can migrate and is waiting to run, the other | ||
64 | * CPUs would all try to take that CPU's rq lock and could | ||
65 | * create heavy contention. Sending an IPI to that CPU and | ||
66 | * letting that CPU push the RT task to where it should go | ||
67 | * may be the better approach. | ||
68 | */ | ||
69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
70 | #endif | ||
71 | |||
59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d27d36476dca..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may |
159 | * fail if it is not available | 159 | * fail if it is not available |
160 | */ | 160 | */ |
161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) |
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
163 | goto use_default; | 162 | goto use_default; |
164 | 163 | ||
165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ |
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); |
177 | 176 | ||
178 | if (broadcast) | 177 | if (broadcast) |
179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); |
180 | 179 | ||
181 | /* | 180 | /* |
182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include "sched.h" | 6 | #include "sched.h" |
7 | 7 | ||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/irq_work.h> | ||
9 | 10 | ||
10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
11 | 12 | ||
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
60 | } | 61 | } |
61 | 62 | ||
62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
64 | static void push_irq_work_func(struct irq_work *work); | ||
65 | #endif | ||
66 | |||
67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
63 | { | 68 | { |
64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
65 | int i; | 70 | int i; |
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
86 | |||
87 | #ifdef HAVE_RT_PUSH_IPI | ||
88 | rt_rq->push_flags = 0; | ||
89 | rt_rq->push_cpu = nr_cpu_ids; | ||
90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
81 | #endif | 92 | #endif |
93 | #endif /* CONFIG_SMP */ | ||
82 | /* We start in dequeued state, because no RT tasks are queued */ | 94 |
83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
84 | 96 | ||
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
193 | if (!rt_se) | 205 | if (!rt_se) |
194 | goto err_free_rq; | 206 | goto err_free_rq; |
195 | 207 | ||
196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
199 | } | 211 | } |
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
1778 | ; | 1790 | ; |
1779 | } | 1791 | } |
1780 | 1792 | ||
1793 | #ifdef HAVE_RT_PUSH_IPI | ||
1794 | /* | ||
1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
1798 | * is complete. | ||
1799 | * | ||
1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
1801 | * or if this is the first instance, it must hold rq->cpu. | ||
1802 | */ | ||
1803 | static int rto_next_cpu(struct rq *rq) | ||
1804 | { | ||
1805 | int prev_cpu = rq->rt.push_cpu; | ||
1806 | int cpu; | ||
1807 | |||
1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
1809 | |||
1810 | /* | ||
1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
1812 | * passed the end of the mask, and has started from the beginning. | ||
1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
1814 | */ | ||
1815 | if (prev_cpu < rq->cpu) { | ||
1816 | if (cpu >= rq->cpu) | ||
1817 | return nr_cpu_ids; | ||
1818 | |||
1819 | } else if (cpu >= nr_cpu_ids) { | ||
1820 | /* | ||
1821 | * We passed the end of the mask, start at the beginning. | ||
1822 | * If the result is greater or equal to the rq's CPU, then | ||
1823 | * the loop is finished. | ||
1824 | */ | ||
1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
1826 | if (cpu >= rq->cpu) | ||
1827 | return nr_cpu_ids; | ||
1828 | } | ||
1829 | rq->rt.push_cpu = cpu; | ||
1830 | |||
1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
1832 | return cpu; | ||
1833 | } | ||
1834 | |||
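The wrap-around walk is easier to see outside the kernel. The sketch below replaces the cpumask with a toy bit array; NR_CPUS, the mask contents and the starting CPU are all made up. It visits every overloaded CPU exactly once, in mask order starting after the caller's CPU, and never returns the starting CPU itself.

#include <stdio.h>

#define NR_CPUS 8

/* Next set bit strictly after 'prev', or NR_CPUS if none. */
static int next_set(const int *mask, int prev)
{
	for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
		if (mask[cpu])
			return cpu;
	return NR_CPUS;
}

/* Walk the mask starting after 'start', wrapping around, and stop (return
 * NR_CPUS) once the walk would reach 'start' again. */
static int rto_next_cpu(const int *mask, int start, int prev)
{
	int cpu = next_set(mask, prev);

	if (prev < start)		/* already wrapped: stop at 'start' */
		return cpu >= start ? NR_CPUS : cpu;

	if (cpu >= NR_CPUS) {		/* hit the end: wrap to the beginning */
		cpu = next_set(mask, -1);
		if (cpu >= start)
			return NR_CPUS;
	}
	return cpu;
}

int main(void)
{
	int rto_mask[NR_CPUS] = { [1] = 1, [3] = 1, [6] = 1 };
	int start = 4, cpu = start;

	while ((cpu = rto_next_cpu(rto_mask, start, cpu)) < NR_CPUS)
		printf("visit cpu %d\n", cpu);	/* prints 6, 1, 3 */
	return 0;
}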
1835 | static int find_next_push_cpu(struct rq *rq) | ||
1836 | { | ||
1837 | struct rq *next_rq; | ||
1838 | int cpu; | ||
1839 | |||
1840 | while (1) { | ||
1841 | cpu = rto_next_cpu(rq); | ||
1842 | if (cpu >= nr_cpu_ids) | ||
1843 | break; | ||
1844 | next_rq = cpu_rq(cpu); | ||
1845 | |||
1846 | /* Make sure the next rq can push to this rq */ | ||
1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
1848 | break; | ||
1849 | } | ||
1850 | |||
1851 | return cpu; | ||
1852 | } | ||
1853 | |||
1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
1855 | #define RT_PUSH_IPI_RESTART 2 | ||
1856 | |||
1857 | static void tell_cpu_to_push(struct rq *rq) | ||
1858 | { | ||
1859 | int cpu; | ||
1860 | |||
1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1862 | raw_spin_lock(&rq->rt.push_lock); | ||
1863 | /* Make sure it's still executing */ | ||
1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1865 | /* | ||
1866 | * Tell the IPI to restart the loop as things have | ||
1867 | * changed since it started. | ||
1868 | */ | ||
1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
1871 | return; | ||
1872 | } | ||
1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
1874 | } | ||
1875 | |||
1876 | /* When here, there's no IPI going around */ | ||
1877 | |||
1878 | rq->rt.push_cpu = rq->cpu; | ||
1879 | cpu = find_next_push_cpu(rq); | ||
1880 | if (cpu >= nr_cpu_ids) | ||
1881 | return; | ||
1882 | |||
1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
1884 | |||
1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
1886 | } | ||
1887 | |||
1888 | /* Called from hardirq context */ | ||
1889 | static void try_to_push_tasks(void *arg) | ||
1890 | { | ||
1891 | struct rt_rq *rt_rq = arg; | ||
1892 | struct rq *rq, *src_rq; | ||
1893 | int this_cpu; | ||
1894 | int cpu; | ||
1895 | |||
1896 | this_cpu = rt_rq->push_cpu; | ||
1897 | |||
1898 | /* Paranoid check */ | ||
1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
1900 | |||
1901 | rq = cpu_rq(this_cpu); | ||
1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
1903 | |||
1904 | again: | ||
1905 | if (has_pushable_tasks(rq)) { | ||
1906 | raw_spin_lock(&rq->lock); | ||
1907 | push_rt_task(rq); | ||
1908 | raw_spin_unlock(&rq->lock); | ||
1909 | } | ||
1910 | |||
1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
1912 | raw_spin_lock(&rt_rq->push_lock); | ||
1913 | /* | ||
1914 | * If the source queue changed since the IPI went out, | ||
1915 | * we need to restart the search from that CPU again. | ||
1916 | */ | ||
1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
1919 | rt_rq->push_cpu = src_rq->cpu; | ||
1920 | } | ||
1921 | |||
1922 | cpu = find_next_push_cpu(src_rq); | ||
1923 | |||
1924 | if (cpu >= nr_cpu_ids) | ||
1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
1927 | |||
1928 | if (cpu >= nr_cpu_ids) | ||
1929 | return; | ||
1930 | |||
1931 | /* | ||
1932 | * It is possible that a restart caused this CPU to be | ||
1933 | * chosen again. Don't bother with an IPI, just see if we | ||
1934 | * have more to push. | ||
1935 | */ | ||
1936 | if (unlikely(cpu == rq->cpu)) | ||
1937 | goto again; | ||
1938 | |||
1939 | /* Try the next RT overloaded CPU */ | ||
1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
1941 | } | ||
1942 | |||
1943 | static void push_irq_work_func(struct irq_work *work) | ||
1944 | { | ||
1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
1946 | |||
1947 | try_to_push_tasks(rt_rq); | ||
1948 | } | ||
1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
1950 | |||
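The interaction of the two push_flags bits can be walked through single-threaded. The sketch below drops the locks and the real irq_work queueing and only shows how a second overload request, arriving while an IPI chain is already in flight, turns into a restart of that chain rather than a new one.

#include <stdio.h>

#define RT_PUSH_IPI_EXECUTING	1
#define RT_PUSH_IPI_RESTART	2

static int push_flags;

/* A new RT overload condition appears on the source rq. */
static void tell_cpu_to_push(void)
{
	if (push_flags & RT_PUSH_IPI_EXECUTING) {
		/* A chain is already running: just ask it to restart. */
		push_flags |= RT_PUSH_IPI_RESTART;
		return;
	}
	push_flags = RT_PUSH_IPI_EXECUTING;
	printf("queue irq_work on the first overloaded cpu\n");
}

/* Runs on each overloaded CPU in turn (called from a plain loop here). */
static void try_to_push_tasks(int more_cpus)
{
	if (push_flags & RT_PUSH_IPI_RESTART) {
		push_flags &= ~RT_PUSH_IPI_RESTART;
		printf("restart the search from the source cpu\n");
	}
	if (more_cpus)
		printf("pass the irq_work to the next overloaded cpu\n");
	else
		push_flags &= ~RT_PUSH_IPI_EXECUTING;
}

int main(void)
{
	tell_cpu_to_push();	/* starts the chain */
	tell_cpu_to_push();	/* second request only sets RESTART */
	try_to_push_tasks(1);	/* notices RESTART, keeps going */
	try_to_push_tasks(0);	/* chain done, EXECUTING cleared */
	return 0;
}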
1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
1782 | { | 1952 | { |
1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
1793 | */ | 1963 | */ |
1794 | smp_rmb(); | 1964 | smp_rmb(); |
1795 | 1965 | ||
1966 | #ifdef HAVE_RT_PUSH_IPI | ||
1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
1968 | tell_cpu_to_push(this_rq); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | #endif | ||
1972 | |||
1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
1798 | continue; | 1975 | continue; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
9 | #include <linux/irq_work.h> | ||
9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
11 | 12 | ||
@@ -362,8 +363,14 @@ struct cfs_rq { | |||
362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
367 | * sched_entities on the rq. | ||
368 | * blocked_load_avg is similar to runnable_load_avg except that it is | ||
369 | * summed over the blocked sched_entities on the rq. | ||
370 | * utilization_load_avg is the sum of the average running time of the | ||
371 | * sched_entities on the rq. | ||
365 | */ | 372 | */ |
366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
368 | u64 last_decay; | 375 | u64 last_decay; |
369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
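As a rough guide to which per-entity contribution feeds which of the three sums, here is a toy model with invented contribution values; the real update points involve PELT decay, migration and group entities and are not reproduced here.

#include <stdio.h>

struct toy_cfs_rq {
	unsigned long runnable_load_avg;	/* load_avg_contrib of queued entities */
	unsigned long blocked_load_avg;		/* same contribution, for blocked entities */
	unsigned long utilization_load_avg;	/* average running time of queued entities */
};

static void toy_enqueue(struct toy_cfs_rq *rq, unsigned long load_contrib,
			unsigned long util_contrib)
{
	rq->runnable_load_avg += load_contrib;
	rq->utilization_load_avg += util_contrib;
}

static void toy_sleep(struct toy_cfs_rq *rq, unsigned long load_contrib,
		      unsigned long util_contrib)
{
	rq->runnable_load_avg -= load_contrib;
	rq->blocked_load_avg += load_contrib;
	rq->utilization_load_avg -= util_contrib;
}

int main(void)
{
	struct toy_cfs_rq rq = { 0, 0, 0 };

	toy_enqueue(&rq, 512, 300);	/* a half-weight task, about 30% busy */
	toy_sleep(&rq, 512, 300);	/* it blocks: its load moves to the blocked sum */

	printf("%lu %lu %lu\n", rq.runnable_load_avg,
	       rq.blocked_load_avg, rq.utilization_load_avg);
	return 0;
}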
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
419 | } | 426 | } |
420 | 427 | ||
428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
429 | #ifdef CONFIG_IRQ_WORK | ||
430 | # define HAVE_RT_PUSH_IPI | ||
431 | #endif | ||
432 | |||
421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
422 | struct rt_rq { | 434 | struct rt_rq { |
423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
@@ -435,7 +447,13 @@ struct rt_rq { | |||
435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
436 | int overloaded; | 448 | int overloaded; |
437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
450 | #ifdef HAVE_RT_PUSH_IPI | ||
451 | int push_flags; | ||
452 | int push_cpu; | ||
453 | struct irq_work push_work; | ||
454 | raw_spinlock_t push_lock; | ||
438 | #endif | 455 | #endif |
456 | #endif /* CONFIG_SMP */ | ||
439 | int rt_queued; | 457 | int rt_queued; |
440 | 458 | ||
441 | int rt_throttled; | 459 | int rt_throttled; |
@@ -597,6 +615,7 @@ struct rq { | |||
597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
598 | 616 | ||
599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
618 | unsigned long cpu_capacity_orig; | ||
600 | 619 | ||
601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
602 | /* For active balancing */ | 621 | /* For active balancing */ |
@@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
808 | * for a single CPU. | 827 | * for a single CPU. |
809 | */ | 828 | */ |
810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
811 | unsigned long next_update; | 830 | unsigned long next_update; |
812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
813 | /* | 832 | /* |
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
1368 | 1387 | ||
1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
1390 | |||
1391 | #ifndef arch_scale_freq_capacity | ||
1392 | static __always_inline | ||
1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
1394 | { | ||
1395 | return SCHED_CAPACITY_SCALE; | ||
1396 | } | ||
1397 | #endif | ||
1398 | |||
1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1372 | { | 1400 | { |
1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
1375 | } | 1403 | } |
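A minimal arithmetic sketch of the frequency-weighted accumulation: with the default arch_scale_freq_capacity() the factor is SCHED_CAPACITY_SCALE, so nothing changes, while an architecture hook reporting a lower current frequency scales the accumulated RT/IRQ time accordingly. The 614 factor (about 60% of scale) and the 2 ms delta below are invented.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

int main(void)
{
	unsigned long long rt_avg = 0;
	unsigned long long rt_delta = 2000000;	/* ns of RT/IRQ time this period */
	unsigned long long freq_cap = 614;	/* hypothetical current-frequency capacity */

	/* Weight the RT/IRQ time by the current frequency's capacity,
	 * as the patched sched_rt_avg_update() does. */
	rt_avg += rt_delta * freq_cap;

	/* Back in SCHED_CAPACITY_SCALE units for comparison purposes. */
	printf("%llu\n", rt_avg / SCHED_CAPACITY_SCALE);
	return 0;
}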
1376 | #else | 1404 | #else |
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
1644 | 1672 | ||
1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
1648 | 1676 | ||
1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..ce410bb9f2e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = { | |||
1228 | .extra1 = &zero, | 1228 | .extra1 = &zero, |
1229 | }, | 1229 | }, |
1230 | { | 1230 | { |
1231 | .procname = "dirtytime_expire_seconds", | ||
1232 | .data = &dirtytime_expire_interval, | ||
1233 | .maxlen = sizeof(dirty_expire_interval), | ||
1234 | .mode = 0644, | ||
1235 | .proc_handler = dirtytime_interval_handler, | ||
1236 | .extra1 = &zero, | ||
1237 | }, | ||
1238 | { | ||
1231 | .procname = "nr_pdflush_threads", | 1239 | .procname = "nr_pdflush_threads", |
1232 | .mode = 0444 /* read-only */, | 1240 | .mode = 0444 /* read-only */, |
1233 | .proc_handler = pdflush_proc_obsolete, | 1241 | .proc_handler = pdflush_proc_obsolete, |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET | |||
33 | config GENERIC_CLOCKEVENTS | 33 | config GENERIC_CLOCKEVENTS |
34 | bool | 34 | bool |
35 | 35 | ||
36 | # Migration helper. Builds, but does not invoke | ||
37 | config GENERIC_CLOCKEVENTS_BUILD | ||
38 | bool | ||
39 | default y | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Architecture can handle broadcast in a driver-agnostic way | 36 | # Architecture can handle broadcast in a driver-agnostic way |
43 | config ARCH_HAS_TICK_BROADCAST | 37 | config ARCH_HAS_TICK_BROADCAST |
44 | bool | 38 | bool |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | |||
2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o |
4 | 4 | ||
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o |
6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
7 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 6 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) |
8 | obj-y += tick-broadcast.o | 7 | obj-y += tick-broadcast.o |
9 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o |
10 | endif | 9 | endif |
11 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 10 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o |
12 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 11 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o |
13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 12 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 13 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 14 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | |||
94 | } | 94 | } |
95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
96 | 96 | ||
97 | static int __clockevents_set_state(struct clock_event_device *dev, | ||
98 | enum clock_event_state state) | ||
99 | { | ||
100 | /* Transition with legacy set_mode() callback */ | ||
101 | if (dev->set_mode) { | ||
102 | /* Legacy callback doesn't support new modes */ | ||
103 | if (state > CLOCK_EVT_STATE_ONESHOT) | ||
104 | return -ENOSYS; | ||
105 | /* | ||
106 | * 'clock_event_state' and 'clock_event_mode' have 1-to-1 | ||
107 | * mapping until *_ONESHOT, and so a simple cast will work. | ||
108 | */ | ||
109 | dev->set_mode((enum clock_event_mode)state, dev); | ||
110 | dev->mode = (enum clock_event_mode)state; | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
115 | return 0; | ||
116 | |||
117 | /* Transition with new state-specific callbacks */ | ||
118 | switch (state) { | ||
119 | case CLOCK_EVT_STATE_DETACHED: | ||
120 | /* | ||
121 | * This is an internal state, which is guaranteed to go from | ||
122 | * SHUTDOWN to DETACHED. No driver interaction required. | ||
123 | */ | ||
124 | return 0; | ||
125 | |||
126 | case CLOCK_EVT_STATE_SHUTDOWN: | ||
127 | return dev->set_state_shutdown(dev); | ||
128 | |||
129 | case CLOCK_EVT_STATE_PERIODIC: | ||
130 | /* Core internal bug */ | ||
131 | if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) | ||
132 | return -ENOSYS; | ||
133 | return dev->set_state_periodic(dev); | ||
134 | |||
135 | case CLOCK_EVT_STATE_ONESHOT: | ||
136 | /* Core internal bug */ | ||
137 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
138 | return -ENOSYS; | ||
139 | return dev->set_state_oneshot(dev); | ||
140 | |||
141 | default: | ||
142 | return -ENOSYS; | ||
143 | } | ||
144 | } | ||
145 | |||
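The dispatch above can be mocked in userspace to show the control flow. The struct below keeps only the per-state callbacks named in this patch and replaces every kernel type with a local stand-in, so it is only a sketch of the shape of the new interface.

#include <stdio.h>
#include <errno.h>

enum toy_state { TOY_DETACHED, TOY_SHUTDOWN, TOY_PERIODIC, TOY_ONESHOT };

struct toy_dev {
	int (*set_state_shutdown)(struct toy_dev *d);
	int (*set_state_periodic)(struct toy_dev *d);
	int (*set_state_oneshot)(struct toy_dev *d);
	enum toy_state state;
};

static int toy_shutdown(struct toy_dev *d) { (void)d; puts("shutdown"); return 0; }
static int toy_periodic(struct toy_dev *d) { (void)d; puts("periodic"); return 0; }
static int toy_oneshot(struct toy_dev *d)  { (void)d; puts("oneshot");  return 0; }

static int toy_set_state(struct toy_dev *d, enum toy_state state)
{
	switch (state) {
	case TOY_DETACHED: return 0;	/* internal state, no driver callback */
	case TOY_SHUTDOWN: return d->set_state_shutdown(d);
	case TOY_PERIODIC: return d->set_state_periodic(d);
	case TOY_ONESHOT:  return d->set_state_oneshot(d);
	default:           return -ENOSYS;
	}
}

int main(void)
{
	struct toy_dev dev = {
		.set_state_shutdown = toy_shutdown,
		.set_state_periodic = toy_periodic,
		.set_state_oneshot  = toy_oneshot,
		.state = TOY_DETACHED,
	};

	if (!toy_set_state(&dev, TOY_ONESHOT))
		dev.state = TOY_ONESHOT;
	return 0;
}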
97 | /** | 146 | /** |
98 | * clockevents_set_mode - set the operating mode of a clock event device | 147 | * clockevents_set_state - set the operating state of a clock event device |
99 | * @dev: device to modify | 148 | * @dev: device to modify |
100 | * @mode: new mode | 149 | * @state: new state |
101 | * | 150 | * |
102 | * Must be called with interrupts disabled ! | 151 | * Must be called with interrupts disabled ! |
103 | */ | 152 | */ |
104 | void clockevents_set_mode(struct clock_event_device *dev, | 153 | void clockevents_set_state(struct clock_event_device *dev, |
105 | enum clock_event_mode mode) | 154 | enum clock_event_state state) |
106 | { | 155 | { |
107 | if (dev->mode != mode) { | 156 | if (dev->state != state) { |
108 | dev->set_mode(mode, dev); | 157 | if (__clockevents_set_state(dev, state)) |
109 | dev->mode = mode; | 158 | return; |
159 | |||
160 | dev->state = state; | ||
110 | 161 | ||
111 | /* | 162 | /* |
112 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 163 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash |
113 | * on it, so fix it up and emit a warning: | 164 | * on it, so fix it up and emit a warning: |
114 | */ | 165 | */ |
115 | if (mode == CLOCK_EVT_MODE_ONESHOT) { | 166 | if (state == CLOCK_EVT_STATE_ONESHOT) { |
116 | if (unlikely(!dev->mult)) { | 167 | if (unlikely(!dev->mult)) { |
117 | dev->mult = 1; | 168 | dev->mult = 1; |
118 | WARN_ON(1); | 169 | WARN_ON(1); |
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, | |||
127 | */ | 178 | */ |
128 | void clockevents_shutdown(struct clock_event_device *dev) | 179 | void clockevents_shutdown(struct clock_event_device *dev) |
129 | { | 180 | { |
130 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 181 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
131 | dev->next_event.tv64 = KTIME_MAX; | 182 | dev->next_event.tv64 = KTIME_MAX; |
132 | } | 183 | } |
133 | 184 | ||
185 | /** | ||
186 | * clockevents_tick_resume - Resume the tick device before using it again | ||
187 | * @dev: device to resume | ||
188 | */ | ||
189 | int clockevents_tick_resume(struct clock_event_device *dev) | ||
190 | { | ||
191 | int ret = 0; | ||
192 | |||
193 | if (dev->set_mode) { | ||
194 | dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); | ||
195 | dev->mode = CLOCK_EVT_MODE_RESUME; | ||
196 | } else if (dev->tick_resume) { | ||
197 | ret = dev->tick_resume(dev); | ||
198 | } | ||
199 | |||
200 | return ret; | ||
201 | } | ||
202 | |||
134 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 203 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST |
135 | 204 | ||
136 | /* Limit min_delta to a jiffie */ | 205 | /* Limit min_delta to a jiffie */ |
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
183 | delta = dev->min_delta_ns; | 252 | delta = dev->min_delta_ns; |
184 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 253 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
185 | 254 | ||
186 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 255 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
187 | return 0; | 256 | return 0; |
188 | 257 | ||
189 | dev->retries++; | 258 | dev->retries++; |
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
220 | delta = dev->min_delta_ns; | 289 | delta = dev->min_delta_ns; |
221 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 290 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
222 | 291 | ||
223 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 292 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
224 | return 0; | 293 | return 0; |
225 | 294 | ||
226 | dev->retries++; | 295 | dev->retries++; |
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
252 | 321 | ||
253 | dev->next_event = expires; | 322 | dev->next_event = expires; |
254 | 323 | ||
255 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 324 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
256 | return 0; | 325 | return 0; |
257 | 326 | ||
258 | /* Shortcut for clockevent devices that can deal with ktime. */ | 327 | /* Shortcut for clockevent devices that can deal with ktime. */ |
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
297 | struct clock_event_device *dev, *newdev = NULL; | 366 | struct clock_event_device *dev, *newdev = NULL; |
298 | 367 | ||
299 | list_for_each_entry(dev, &clockevent_devices, list) { | 368 | list_for_each_entry(dev, &clockevent_devices, list) { |
300 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | 369 | if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) |
301 | continue; | 370 | continue; |
302 | 371 | ||
303 | if (!tick_check_replacement(newdev, dev)) | 372 | if (!tick_check_replacement(newdev, dev)) |
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
323 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 392 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) |
324 | { | 393 | { |
325 | /* Fast track. Device is unused */ | 394 | /* Fast track. Device is unused */ |
326 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | 395 | if (ced->state == CLOCK_EVT_STATE_DETACHED) { |
327 | list_del_init(&ced->list); | 396 | list_del_init(&ced->list); |
328 | return 0; | 397 | return 0; |
329 | } | 398 | } |
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | |||
373 | } | 442 | } |
374 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 443 | EXPORT_SYMBOL_GPL(clockevents_unbind); |
375 | 444 | ||
445 | /* Sanity check of state transition callbacks */ | ||
446 | static int clockevents_sanity_check(struct clock_event_device *dev) | ||
447 | { | ||
448 | /* Legacy set_mode() callback */ | ||
449 | if (dev->set_mode) { | ||
450 | /* We shouldn't be supporting new modes now */ | ||
451 | WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || | ||
452 | dev->set_state_shutdown || dev->tick_resume); | ||
453 | |||
454 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
459 | return 0; | ||
460 | |||
461 | /* New state-specific callbacks */ | ||
462 | if (!dev->set_state_shutdown) | ||
463 | return -EINVAL; | ||
464 | |||
465 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | ||
466 | !dev->set_state_periodic) | ||
467 | return -EINVAL; | ||
468 | |||
469 | if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
470 | !dev->set_state_oneshot) | ||
471 | return -EINVAL; | ||
472 | |||
473 | return 0; | ||
474 | } | ||
475 | |||
376 | /** | 476 | /** |
377 | * clockevents_register_device - register a clock event device | 477 | * clockevents_register_device - register a clock event device |
378 | * @dev: device to register | 478 | * @dev: device to register |
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
381 | { | 481 | { |
382 | unsigned long flags; | 482 | unsigned long flags; |
383 | 483 | ||
384 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 484 | BUG_ON(clockevents_sanity_check(dev)); |
485 | |||
486 | /* Initialize state to DETACHED */ | ||
487 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
488 | |||
385 | if (!dev->cpumask) { | 489 | if (!dev->cpumask) { |
386 | WARN_ON(num_possible_cpus() > 1); | 490 | WARN_ON(num_possible_cpus() > 1); |
387 | dev->cpumask = cpumask_of(smp_processor_id()); | 491 | dev->cpumask = cpumask_of(smp_processor_id()); |
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
445 | { | 549 | { |
446 | clockevents_config(dev, freq); | 550 | clockevents_config(dev, freq); |
447 | 551 | ||
448 | if (dev->mode == CLOCK_EVT_MODE_ONESHOT) | 552 | if (dev->state == CLOCK_EVT_STATE_ONESHOT) |
449 | return clockevents_program_event(dev, dev->next_event, false); | 553 | return clockevents_program_event(dev, dev->next_event, false); |
450 | 554 | ||
451 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 555 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
452 | dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); | 556 | return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
453 | 557 | ||
454 | return 0; | 558 | return 0; |
455 | } | 559 | } |
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) | |||
491 | * @old: device to release (can be NULL) | 595 | * @old: device to release (can be NULL) |
492 | * @new: device to request (can be NULL) | 596 | * @new: device to request (can be NULL) |
493 | * | 597 | * |
494 | * Called from the notifier chain. clockevents_lock is held already | 598 | * Called from various tick functions with clockevents_lock held and |
599 | * interrupts disabled. | ||
495 | */ | 600 | */ |
496 | void clockevents_exchange_device(struct clock_event_device *old, | 601 | void clockevents_exchange_device(struct clock_event_device *old, |
497 | struct clock_event_device *new) | 602 | struct clock_event_device *new) |
498 | { | 603 | { |
499 | unsigned long flags; | ||
500 | |||
501 | local_irq_save(flags); | ||
502 | /* | 604 | /* |
503 | * Caller releases a clock event device. We queue it into the | 605 | * Caller releases a clock event device. We queue it into the |
504 | * released list and do a notify add later. | 606 | * released list and do a notify add later. |
505 | */ | 607 | */ |
506 | if (old) { | 608 | if (old) { |
507 | module_put(old->owner); | 609 | module_put(old->owner); |
508 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 610 | clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); |
509 | list_del(&old->list); | 611 | list_del(&old->list); |
510 | list_add(&old->list, &clockevents_released); | 612 | list_add(&old->list, &clockevents_released); |
511 | } | 613 | } |
512 | 614 | ||
513 | if (new) { | 615 | if (new) { |
514 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | 616 | BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); |
515 | clockevents_shutdown(new); | 617 | clockevents_shutdown(new); |
516 | } | 618 | } |
517 | local_irq_restore(flags); | ||
518 | } | 619 | } |
519 | 620 | ||
520 | /** | 621 | /** |
@@ -541,74 +642,40 @@ void clockevents_resume(void) | |||
541 | dev->resume(dev); | 642 | dev->resume(dev); |
542 | } | 643 | } |
543 | 644 | ||
544 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 645 | #ifdef CONFIG_HOTPLUG_CPU |
545 | /** | 646 | /** |
546 | * clockevents_notify - notification about relevant events | 647 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu |
547 | * Returns 0 on success, any other value on error | ||
548 | */ | 648 | */ |
549 | int clockevents_notify(unsigned long reason, void *arg) | 649 | void tick_cleanup_dead_cpu(int cpu) |
550 | { | 650 | { |
551 | struct clock_event_device *dev, *tmp; | 651 | struct clock_event_device *dev, *tmp; |
552 | unsigned long flags; | 652 | unsigned long flags; |
553 | int cpu, ret = 0; | ||
554 | 653 | ||
555 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 654 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
556 | 655 | ||
557 | switch (reason) { | 656 | tick_shutdown_broadcast_oneshot(cpu); |
558 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 657 | tick_shutdown_broadcast(cpu); |
559 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 658 | tick_shutdown(cpu); |
560 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 659 | /* |
561 | tick_broadcast_on_off(reason, arg); | 660 | * Unregister the clock event devices which were |
562 | break; | 661 | * released from the users in the notify chain. |
563 | 662 | */ | |
564 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | 663 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) |
565 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | 664 | list_del(&dev->list); |
566 | ret = tick_broadcast_oneshot_control(reason); | 665 | /* |
567 | break; | 666 | * Now check whether the CPU has left unused per cpu devices |
568 | 667 | */ | |
569 | case CLOCK_EVT_NOTIFY_CPU_DYING: | 668 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { |
570 | tick_handover_do_timer(arg); | 669 | if (cpumask_test_cpu(cpu, dev->cpumask) && |
571 | break; | 670 | cpumask_weight(dev->cpumask) == 1 && |
572 | 671 | !tick_is_broadcast_device(dev)) { | |
573 | case CLOCK_EVT_NOTIFY_SUSPEND: | 672 | BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); |
574 | tick_suspend(); | ||
575 | tick_suspend_broadcast(); | ||
576 | break; | ||
577 | |||
578 | case CLOCK_EVT_NOTIFY_RESUME: | ||
579 | tick_resume(); | ||
580 | break; | ||
581 | |||
582 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
583 | tick_shutdown_broadcast_oneshot(arg); | ||
584 | tick_shutdown_broadcast(arg); | ||
585 | tick_shutdown(arg); | ||
586 | /* | ||
587 | * Unregister the clock event devices which were | ||
588 | * released from the users in the notify chain. | ||
589 | */ | ||
590 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | ||
591 | list_del(&dev->list); | 673 | list_del(&dev->list); |
592 | /* | ||
593 | * Now check whether the CPU has left unused per cpu devices | ||
594 | */ | ||
595 | cpu = *((int *)arg); | ||
596 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | ||
597 | if (cpumask_test_cpu(cpu, dev->cpumask) && | ||
598 | cpumask_weight(dev->cpumask) == 1 && | ||
599 | !tick_is_broadcast_device(dev)) { | ||
600 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
601 | list_del(&dev->list); | ||
602 | } | ||
603 | } | 674 | } |
604 | break; | ||
605 | default: | ||
606 | break; | ||
607 | } | 675 | } |
608 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 676 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); |
609 | return ret; | ||
610 | } | 677 | } |
611 | EXPORT_SYMBOL_GPL(clockevents_notify); | 678 | #endif |
612 | 679 | ||
613 | #ifdef CONFIG_SYSFS | 680 | #ifdef CONFIG_SYSFS |
614 | struct bus_type clockevents_subsys = { | 681 | struct bus_type clockevents_subsys = { |
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) | |||
727 | } | 794 | } |
728 | device_initcall(clockevents_init_sysfs); | 795 | device_initcall(clockevents_init_sysfs); |
729 | #endif /* SYSFS */ | 796 | #endif /* SYSFS */ |
730 | |||
731 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
142 | schedule_work(&watchdog_work); | 142 | schedule_work(&watchdog_work); |
143 | } | 143 | } |
144 | 144 | ||
145 | static void clocksource_unstable(struct clocksource *cs, int64_t delta) | ||
146 | { | ||
147 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
148 | cs->name, delta); | ||
149 | __clocksource_unstable(cs); | ||
150 | } | ||
151 | |||
152 | /** | 145 | /** |
153 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 146 | * clocksource_mark_unstable - mark clocksource unstable via watchdog |
154 | * @cs: clocksource to be marked unstable | 147 | * @cs: clocksource to be marked unstable |
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
174 | static void clocksource_watchdog(unsigned long data) | 167 | static void clocksource_watchdog(unsigned long data) |
175 | { | 168 | { |
176 | struct clocksource *cs; | 169 | struct clocksource *cs; |
177 | cycle_t csnow, wdnow, delta; | 170 | cycle_t csnow, wdnow, cslast, wdlast, delta; |
178 | int64_t wd_nsec, cs_nsec; | 171 | int64_t wd_nsec, cs_nsec; |
179 | int next_cpu, reset_pending; | 172 | int next_cpu, reset_pending; |
180 | 173 | ||
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) | |||
213 | 206 | ||
214 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 207 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); |
215 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 208 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); |
209 | wdlast = cs->wd_last; /* save these in case we print them */ | ||
210 | cslast = cs->cs_last; | ||
216 | cs->cs_last = csnow; | 211 | cs->cs_last = csnow; |
217 | cs->wd_last = wdnow; | 212 | cs->wd_last = wdnow; |
218 | 213 | ||
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) | |||
221 | 216 | ||
222 | /* Check the deviation from the watchdog clocksource. */ | 217 | /* Check the deviation from the watchdog clocksource. */ |
223 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 218 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { |
224 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 219 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); |
220 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | ||
221 | watchdog->name, wdnow, wdlast, watchdog->mask); | ||
222 | pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", | ||
223 | cs->name, csnow, cslast, cs->mask); | ||
224 | __clocksource_unstable(cs); | ||
225 | continue; | 225 | continue; |
226 | } | 226 | } |
227 | 227 | ||
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
469 | * @shift: cycle to nanosecond divisor (power of two) | 469 | * @shift: cycle to nanosecond divisor (power of two) |
470 | * @maxadj: maximum adjustment value to mult (~11%) | 470 | * @maxadj: maximum adjustment value to mult (~11%) |
471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters |
472 | * @max_cyc: maximum cycle value before potential overflow (does not include | ||
473 | * any safety margin) | ||
474 | * | ||
475 | * NOTE: This function includes a safety margin of 50%, in other words, we | ||
476 | * return half the number of nanoseconds the hardware counter can technically | ||
477 | * cover. This is done so that we can potentially detect problems caused by | ||
478 | * delayed timers or bad hardware, which might result in time intervals that | ||
479 | * are larger than what the math used can handle without overflows. | ||
472 | */ | 480 | */ |
473 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | 481 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) |
474 | { | 482 | { |
475 | u64 max_nsecs, max_cycles; | 483 | u64 max_nsecs, max_cycles; |
476 | 484 | ||
477 | /* | 485 | /* |
478 | * Calculate the maximum number of cycles that we can pass to the | 486 | * Calculate the maximum number of cycles that we can pass to the |
479 | * cyc2ns function without overflowing a 64-bit signed result. The | 487 | * cyc2ns() function without overflowing a 64-bit result. |
480 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) | ||
481 | * which is equivalent to the below. | ||
482 | * max_cycles < (2^63)/(mult + maxadj) | ||
483 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) | ||
484 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) | ||
485 | * max_cycles < 2^(63 - log2(mult + maxadj)) | ||
486 | * max_cycles < 1 << (63 - log2(mult + maxadj)) | ||
487 | * Please note that we add 1 to the result of the log2 to account for | ||
488 | * any rounding errors, ensure the above inequality is satisfied and | ||
489 | * no overflow will occur. | ||
490 | */ | 488 | */ |
491 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); | 489 | max_cycles = ULLONG_MAX; |
490 | do_div(max_cycles, mult+maxadj); | ||
492 | 491 | ||
493 | /* | 492 | /* |
494 | * The actual maximum number of cycles we can defer the clocksource is | 493 | * The actual maximum number of cycles we can defer the clocksource is |
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | |||
499 | max_cycles = min(max_cycles, mask); | 498 | max_cycles = min(max_cycles, mask); |
500 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 499 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
501 | 500 | ||
501 | /* return the max_cycles value as well if requested */ | ||
502 | if (max_cyc) | ||
503 | *max_cyc = max_cycles; | ||
504 | |||
505 | /* Return 50% of the actual maximum, so we can detect bad values */ | ||
506 | max_nsecs >>= 1; | ||
507 | |||
502 | return max_nsecs; | 508 | return max_nsecs; |
503 | } | 509 | } |
504 | 510 | ||
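With the old power-of-two approximation gone, the rewritten clocks_calc_max_nsecs() computes the cycle limit exactly as ULLONG_MAX / (mult + maxadj), clamps it to the counter mask, converts it with the most pessimistic NTP-adjusted multiplier (mult - maxadj), optionally reports the cycle limit through *max_cyc, and finally returns only half of the result as the 50% safety margin described in the new kernel-doc. A stand-alone sketch of that arithmetic (user-space C; the example mult/shift/maxadj values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Same formula as the kernel's clocksource_cyc2ns(): ns = (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

static uint64_t max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
                          uint64_t mask, uint64_t *max_cyc)
{
        /* largest cycle count that cannot overflow 64 bits, even fully NTP-adjusted */
        uint64_t max_cycles = UINT64_MAX / (mult + maxadj);

        /* the counter itself cannot run past its mask */
        if (max_cycles > mask)
                max_cycles = mask;
        if (max_cyc)
                *max_cyc = max_cycles;

        /* convert with the slowest adjusted rate, then keep 50% headroom */
        return cyc2ns(max_cycles, mult - maxadj, shift) >> 1;
}

int main(void)
{
        uint64_t cyc;
        /* made-up example: ~24 MHz counter, shift 24, ~11% maxadj, 56-bit mask */
        uint64_t ns = max_nsecs(699050667u, 24, 76895573u, (1ULL << 56) - 1, &cyc);

        printf("max_cycles=%llu max_nsecs=%llu\n",
               (unsigned long long)cyc, (unsigned long long)ns);
        return 0;
}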
505 | /** | 511 | /** |
506 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 512 | * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles |
507 | * @cs: Pointer to clocksource | 513 | * @cs: Pointer to clocksource to be updated |
508 | * | 514 | * |
509 | */ | 515 | */ |
510 | static u64 clocksource_max_deferment(struct clocksource *cs) | 516 | static inline void clocksource_update_max_deferment(struct clocksource *cs) |
511 | { | 517 | { |
512 | u64 max_nsecs; | 518 | cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, |
513 | 519 | cs->maxadj, cs->mask, | |
514 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | 520 | &cs->max_cycles); |
515 | cs->mask); | ||
516 | /* | ||
517 | * To ensure that the clocksource does not wrap whilst we are idle, | ||
518 | * limit the time the clocksource can be deferred by 12.5%. Please | ||
519 | * note a margin of 12.5% is used because this can be computed with | ||
520 | * a shift, versus say 10% which would require division. | ||
521 | */ | ||
522 | return max_nsecs - (max_nsecs >> 3); | ||
523 | } | 521 | } |
524 | 522 | ||
525 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 523 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
648 | } | 646 | } |
649 | 647 | ||
650 | /** | 648 | /** |
651 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 649 | * __clocksource_update_freq_scale - Used to update clocksource with new freq |
652 | * @cs: clocksource to be registered | 650 | * @cs: clocksource to be registered |
653 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
654 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
656 | * This should only be called from the clocksource->enable() method. | 654 | * This should only be called from the clocksource->enable() method. |
657 | * | 655 | * |
658 | * This *SHOULD NOT* be called directly! Please use the | 656 | * This *SHOULD NOT* be called directly! Please use the |
659 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. | 657 | * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper |
658 | * functions. | ||
660 | */ | 659 | */ |
661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 660 | void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) |
662 | { | 661 | { |
663 | u64 sec; | 662 | u64 sec; |
663 | |||
664 | /* | 664 | /* |
665 | * Calc the maximum number of seconds which we can run before | 665 | * Default clocksources are *special* and self-define their mult/shift. |
666 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * But, you're not special, so you should specify a freq value. |
667 | * we need to limit the max sleep time to have a good | ||
668 | * conversion precision. 10 minutes is still a reasonable | ||
669 | * amount. That results in a shift value of 24 for a | ||
670 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
672 | * margin as we do in clocksource_max_deferment() | ||
673 | */ | 667 | */ |
674 | sec = (cs->mask - (cs->mask >> 3)); | 668 | if (freq) { |
675 | do_div(sec, freq); | 669 | /* |
676 | do_div(sec, scale); | 670 | * Calc the maximum number of seconds which we can run before |
677 | if (!sec) | 671 | * wrapping around. For clocksources which have a mask > 32-bit |
678 | sec = 1; | 672 | * we need to limit the max sleep time to have a good |
679 | else if (sec > 600 && cs->mask > UINT_MAX) | 673 | * conversion precision. 10 minutes is still a reasonable |
680 | sec = 600; | 674 | * amount. That results in a shift value of 24 for a |
681 | 675 | * clocksource with mask >= 40-bit and f >= 4GHz. That maps to | |
682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 676 | * ~ 0.06ppm granularity for NTP. |
683 | NSEC_PER_SEC / scale, sec * scale); | 677 | */ |
684 | 678 | sec = cs->mask; | |
679 | do_div(sec, freq); | ||
680 | do_div(sec, scale); | ||
681 | if (!sec) | ||
682 | sec = 1; | ||
683 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
684 | sec = 600; | ||
685 | |||
686 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | ||
687 | NSEC_PER_SEC / scale, sec * scale); | ||
688 | } | ||
685 | /* | 689 | /* |
686 | * for clocksources that have large mults, to avoid overflow. | 690 | * Ensure clocksources that have large 'mult' values don't overflow |
687 | * Since mult may be adjusted by ntp, add an safety extra margin | 691 | * when adjusted. |
688 | * | ||
689 | */ | 692 | */ |
690 | cs->maxadj = clocksource_max_adjustment(cs); | 693 | cs->maxadj = clocksource_max_adjustment(cs); |
691 | while ((cs->mult + cs->maxadj < cs->mult) | 694 | while (freq && ((cs->mult + cs->maxadj < cs->mult) |
692 | || (cs->mult - cs->maxadj > cs->mult)) { | 695 | || (cs->mult - cs->maxadj > cs->mult))) { |
693 | cs->mult >>= 1; | 696 | cs->mult >>= 1; |
694 | cs->shift--; | 697 | cs->shift--; |
695 | cs->maxadj = clocksource_max_adjustment(cs); | 698 | cs->maxadj = clocksource_max_adjustment(cs); |
696 | } | 699 | } |
697 | 700 | ||
698 | cs->max_idle_ns = clocksource_max_deferment(cs); | 701 | /* |
702 | * Only warn for *special* clocksources that self-define | ||
703 | * their mult/shift values and don't specify a freq. | ||
704 | */ | ||
705 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
706 | "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", | ||
707 | cs->name); | ||
708 | |||
709 | clocksource_update_max_deferment(cs); | ||
710 | |||
711 | pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", | ||
712 | cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); | ||
699 | } | 713 | } |
700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 714 | EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); |
701 | 715 | ||
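When a frequency is supplied, __clocksource_update_freq_scale() picks mult/shift so that ns = (cycles * mult) >> shift, capping the conversion range at 600 seconds for wide counters to keep the shift (and hence NTP granularity) reasonable, and then halves mult until an ~11% adjustment can no longer overflow. The mult/shift selection itself is done by clocks_calc_mult_shift(); the sketch below is a simplified user-space rendition of that helper (rounding details may differ) showing how the maxsec cap limits the shift that can be chosen:

#include <stdint.h>
#include <stdio.h>

/*
 * Pick mult/shift so that ns = (cycles * mult) >> shift approximates
 * cycles * (to / from), while 'maxsec' seconds worth of 'from' units,
 * scaled by mult, still fit in 64 bits.
 */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
                            uint32_t from, uint32_t to, uint32_t maxsec)
{
        uint64_t tmp;
        uint32_t sft, sftacc = 32;

        /* how many bits the accumulated product may use over 'maxsec' seconds */
        tmp = ((uint64_t)maxsec * from) >> 32;
        while (tmp) {
                tmp >>= 1;
                sftacc--;
        }

        /* largest shift whose rounded mult still fits in 'sftacc' bits */
        for (sft = 32; sft > 0; sft--) {
                tmp = (uint64_t)to << sft;
                tmp += from / 2;
                tmp /= from;
                if ((tmp >> sftacc) == 0)
                        break;
        }
        *mult = (uint32_t)tmp;
        *shift = sft;
}

int main(void)
{
        uint32_t mult, shift;

        /* e.g. a 24 MHz clocksource, conversion valid for up to 600 seconds */
        calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
        printf("mult=%u shift=%u -> 1 cycle ~= %llu ns\n", mult, shift,
               (unsigned long long)(((uint64_t)1 * mult) >> shift));
        return 0;
}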
702 | /** | 716 | /** |
703 | * __clocksource_register_scale - Used to install new clocksources | 717 | * __clocksource_register_scale - Used to install new clocksources |
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
714 | { | 728 | { |
715 | 729 | ||
716 | /* Initialize mult/shift and max_idle_ns */ | 730 | /* Initialize mult/shift and max_idle_ns */ |
717 | __clocksource_updatefreq_scale(cs, scale, freq); | 731 | __clocksource_update_freq_scale(cs, scale, freq); |
718 | 732 | ||
719 | /* Add clocksource to the clocksource list */ | 733 | /* Add clocksource to the clocksource list */ |
720 | mutex_lock(&clocksource_mutex); | 734 | mutex_lock(&clocksource_mutex); |
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
726 | } | 740 | } |
727 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 741 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); |
728 | 742 | ||
729 | |||
730 | /** | ||
731 | * clocksource_register - Used to install new clocksources | ||
732 | * @cs: clocksource to be registered | ||
733 | * | ||
734 | * Returns -EBUSY if registration fails, zero otherwise. | ||
735 | */ | ||
736 | int clocksource_register(struct clocksource *cs) | ||
737 | { | ||
738 | /* calculate max adjustment for given mult/shift */ | ||
739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
742 | cs->name); | ||
743 | |||
744 | /* calculate max idle time permitted for this clocksource */ | ||
745 | cs->max_idle_ns = clocksource_max_deferment(cs); | ||
746 | |||
747 | mutex_lock(&clocksource_mutex); | ||
748 | clocksource_enqueue(cs); | ||
749 | clocksource_enqueue_watchdog(cs); | ||
750 | clocksource_select(); | ||
751 | mutex_unlock(&clocksource_mutex); | ||
752 | return 0; | ||
753 | } | ||
754 | EXPORT_SYMBOL(clocksource_register); | ||
755 | |||
756 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 743 | static void __clocksource_change_rating(struct clocksource *cs, int rating) |
757 | { | 744 | { |
758 | list_del(&cs->list); | 745 | list_del(&cs->list); |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -54,7 +54,7 @@ | |||
54 | 54 | ||
55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> |
56 | 56 | ||
57 | #include "timekeeping.h" | 57 | #include "tick-internal.h" |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * The timer bases: | 60 | * The timer bases: |
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
1707 | break; | 1707 | break; |
1708 | 1708 | ||
1709 | #ifdef CONFIG_HOTPLUG_CPU | 1709 | #ifdef CONFIG_HOTPLUG_CPU |
1710 | case CPU_DYING: | ||
1711 | case CPU_DYING_FROZEN: | ||
1712 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); | ||
1713 | break; | ||
1714 | case CPU_DEAD: | 1710 | case CPU_DEAD: |
1715 | case CPU_DEAD_FROZEN: | 1711 | case CPU_DEAD_FROZEN: |
1716 | { | ||
1717 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | ||
1718 | migrate_hrtimers(scpu); | 1712 | migrate_hrtimers(scpu); |
1719 | break; | 1713 | break; |
1720 | } | ||
1721 | #endif | 1714 | #endif |
1722 | 1715 | ||
1723 | default: | 1716 | default: |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | 27 | ||
28 | #include "tick-internal.h" | 28 | #include "timekeeping.h" |
29 | 29 | ||
30 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
31 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { | |||
71 | .mask = 0xffffffff, /*32bits*/ | 71 | .mask = 0xffffffff, /*32bits*/ |
72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, |
74 | .max_cycles = 10, | ||
74 | }; | 75 | }; |
75 | 76 | ||
76 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 77 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); |
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); | |||
94 | 95 | ||
95 | static int __init init_jiffies_clocksource(void) | 96 | static int __init init_jiffies_clocksource(void) |
96 | { | 97 | { |
97 | return clocksource_register(&clocksource_jiffies); | 98 | return __clocksource_register(&clocksource_jiffies); |
98 | } | 99 | } |
99 | 100 | ||
100 | core_initcall(init_jiffies_clocksource); | 101 | core_initcall(init_jiffies_clocksource); |
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) | |||
130 | 131 | ||
131 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 132 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; |
132 | 133 | ||
133 | clocksource_register(&refined_jiffies); | 134 | __clocksource_register(&refined_jiffies); |
134 | return 0; | 135 | return 0; |
135 | } | 136 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> |
19 | 19 | ||
20 | #include "tick-internal.h" | ||
21 | #include "ntp_internal.h" | 20 | #include "ntp_internal.h" |
22 | 21 | ||
23 | /* | 22 | /* |
@@ -459,6 +458,16 @@ out: | |||
459 | return leap; | 458 | return leap; |
460 | } | 459 | } |
461 | 460 | ||
461 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
462 | int __weak update_persistent_clock64(struct timespec64 now64) | ||
463 | { | ||
464 | struct timespec now; | ||
465 | |||
466 | now = timespec64_to_timespec(now64); | ||
467 | return update_persistent_clock(now); | ||
468 | } | ||
469 | #endif | ||
470 | |||
462 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 471 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) |
463 | static void sync_cmos_clock(struct work_struct *work); | 472 | static void sync_cmos_clock(struct work_struct *work); |
464 | 473 | ||
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
494 | if (persistent_clock_is_local) | 503 | if (persistent_clock_is_local) |
495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 504 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); |
496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 505 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); | 506 | fail = update_persistent_clock64(adjust); |
498 | #endif | 507 | #endif |
508 | |||
499 | #ifdef CONFIG_RTC_SYSTOHC | 509 | #ifdef CONFIG_RTC_SYSTOHC |
500 | if (fail == -ENODEV) | 510 | if (fail == -ENODEV) |
501 | fail = rtc_set_ntp_time(adjust); | 511 | fail = rtc_set_ntp_time(adjust); |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | 2 | * sched_clock.c: Generic sched_clock() support, to extend low level |
3 | * hardware time counters to full 64-bit ns values. | ||
3 | * | 4 | * |
4 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as |
@@ -18,15 +19,53 @@ | |||
18 | #include <linux/seqlock.h> | 19 | #include <linux/seqlock.h> |
19 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
20 | 21 | ||
21 | struct clock_data { | 22 | /** |
22 | ktime_t wrap_kt; | 23 | * struct clock_read_data - data required to read from sched_clock() |
24 | * | ||
25 | * @epoch_ns: sched_clock() value at last update | ||
26 | * @epoch_cyc: Clock cycle value at last update. | ||
27 | * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit | ||
28 | * clocks. | ||
29 | * @read_sched_clock: Current clock source (or dummy source when suspended). | ||
30 | * @mult: Multiplier for scaled math conversion. | ||
31 | * @shift: Shift value for scaled math conversion. | ||
32 | * | ||
33 | * Care must be taken when updating this structure; it is read by | ||
34 | * some very hot code paths. It occupies <=40 bytes and, when combined | ||
35 | * with the seqcount used to synchronize access, comfortably fits into | ||
36 | * a 64 byte cache line. | ||
37 | */ | ||
38 | struct clock_read_data { | ||
23 | u64 epoch_ns; | 39 | u64 epoch_ns; |
24 | u64 epoch_cyc; | 40 | u64 epoch_cyc; |
25 | seqcount_t seq; | 41 | u64 sched_clock_mask; |
26 | unsigned long rate; | 42 | u64 (*read_sched_clock)(void); |
27 | u32 mult; | 43 | u32 mult; |
28 | u32 shift; | 44 | u32 shift; |
29 | bool suspended; | 45 | }; |
46 | |||
47 | /** | ||
48 | * struct clock_data - all data needed for sched_clock() (including | ||
49 | * registration of a new clock source) | ||
50 | * | ||
51 | * @seq: Sequence counter for protecting updates. The lowest | ||
52 | * bit is the index for @read_data. | ||
53 | * @read_data: Data required to read from sched_clock. | ||
54 | * @wrap_kt: Duration for which clock can run before wrapping. | ||
55 | * @rate: Tick rate of the registered clock. | ||
56 | * @actual_read_sched_clock: Registered hardware level clock read function. | ||
57 | * | ||
58 | * The ordering of this structure has been chosen to optimize cache | ||
59 | * performance. In particular 'seq' and 'read_data[0]' (combined) should fit | ||
60 | * into a single 64-byte cache line. | ||
61 | */ | ||
62 | struct clock_data { | ||
63 | seqcount_t seq; | ||
64 | struct clock_read_data read_data[2]; | ||
65 | ktime_t wrap_kt; | ||
66 | unsigned long rate; | ||
67 | |||
68 | u64 (*actual_read_sched_clock)(void); | ||
30 | }; | 69 | }; |
31 | 70 | ||
32 | static struct hrtimer sched_clock_timer; | 71 | static struct hrtimer sched_clock_timer; |
@@ -34,12 +73,6 @@ static int irqtime = -1; | |||
34 | 73 | ||
35 | core_param(irqtime, irqtime, int, 0400); | 74 | core_param(irqtime, irqtime, int, 0400); |
36 | 75 | ||
37 | static struct clock_data cd = { | ||
38 | .mult = NSEC_PER_SEC / HZ, | ||
39 | }; | ||
40 | |||
41 | static u64 __read_mostly sched_clock_mask; | ||
42 | |||
43 | static u64 notrace jiffy_sched_clock_read(void) | 76 | static u64 notrace jiffy_sched_clock_read(void) |
44 | { | 77 | { |
45 | /* | 78 | /* |
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
49 | return (u64)(jiffies - INITIAL_JIFFIES); | 82 | return (u64)(jiffies - INITIAL_JIFFIES); |
50 | } | 83 | } |
51 | 84 | ||
52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 85 | static struct clock_data cd ____cacheline_aligned = { |
86 | .read_data[0] = { .mult = NSEC_PER_SEC / HZ, | ||
87 | .read_sched_clock = jiffy_sched_clock_read, }, | ||
88 | .actual_read_sched_clock = jiffy_sched_clock_read, | ||
89 | }; | ||
53 | 90 | ||
54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 91 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
55 | { | 92 | { |
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
58 | 95 | ||
59 | unsigned long long notrace sched_clock(void) | 96 | unsigned long long notrace sched_clock(void) |
60 | { | 97 | { |
61 | u64 epoch_ns; | 98 | u64 cyc, res; |
62 | u64 epoch_cyc; | ||
63 | u64 cyc; | ||
64 | unsigned long seq; | 99 | unsigned long seq; |
65 | 100 | struct clock_read_data *rd; | |
66 | if (cd.suspended) | ||
67 | return cd.epoch_ns; | ||
68 | 101 | ||
69 | do { | 102 | do { |
70 | seq = raw_read_seqcount_begin(&cd.seq); | 103 | seq = raw_read_seqcount(&cd.seq); |
71 | epoch_cyc = cd.epoch_cyc; | 104 | rd = cd.read_data + (seq & 1); |
72 | epoch_ns = cd.epoch_ns; | 105 | |
106 | cyc = (rd->read_sched_clock() - rd->epoch_cyc) & | ||
107 | rd->sched_clock_mask; | ||
108 | res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); | ||
73 | } while (read_seqcount_retry(&cd.seq, seq)); | 109 | } while (read_seqcount_retry(&cd.seq, seq)); |
74 | 110 | ||
75 | cyc = read_sched_clock(); | 111 | return res; |
76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 112 | } |
77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | 113 | |
114 | /* | ||
115 | * Updating the data required to read the clock. | ||
116 | * | ||
117 | * sched_clock() will never observe mis-matched data even if called from | ||
118 | * an NMI. We do this by maintaining an odd/even copy of the data and | ||
119 | * steering sched_clock() to one or the other using a sequence counter. | ||
120 | * In order to preserve the data cache profile of sched_clock() as much | ||
121 | * as possible the system reverts back to the even copy when the update | ||
122 | * completes; the odd copy is used *only* during an update. | ||
123 | */ | ||
124 | static void update_clock_read_data(struct clock_read_data *rd) | ||
125 | { | ||
126 | /* update the backup (odd) copy with the new data */ | ||
127 | cd.read_data[1] = *rd; | ||
128 | |||
129 | /* steer readers towards the odd copy */ | ||
130 | raw_write_seqcount_latch(&cd.seq); | ||
131 | |||
132 | /* now it's safe for us to update the normal (even) copy */ | ||
133 | cd.read_data[0] = *rd; | ||
134 | |||
135 | /* switch readers back to the even copy */ | ||
136 | raw_write_seqcount_latch(&cd.seq); | ||
78 | } | 137 | } |
79 | 138 | ||
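The comment and helper above implement the classic seqcount-latch scheme: readers index read_data[seq & 1], and the writer bumps the sequence once to steer readers onto the odd copy, updates the even copy, then bumps it again, so an interrupted update never exposes a half-written snapshot, even to NMI context. A user-space model of the pattern is sketched below; it only shows the shape of the algorithm and glosses over the memory-ordering details that raw_write_seqcount_latch()/raw_read_seqcount() handle in the kernel:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct snapshot {
        uint64_t epoch_ns;
        uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;        /* lowest bit selects the copy readers use  */
static struct snapshot data[2];         /* [0] = even/normal copy, [1] = odd/backup */

/* Single writer: publish a new snapshot without ever leaving readers torn. */
static void publish(const struct snapshot *val)
{
        data[1] = *val;                 /* 1) update the backup (odd) copy           */
        atomic_fetch_add(&seq, 1);      /* 2) steer readers towards the odd copy     */
        data[0] = *val;                 /* 3) now the even copy can be updated       */
        atomic_fetch_add(&seq, 1);      /* 4) switch readers back to the even copy   */
}

/* Reader: safe even if it interrupts publish(), e.g. from signal/NMI-like context. */
static struct snapshot read_snapshot(void)
{
        struct snapshot snap;
        unsigned int s;

        do {
                s = atomic_load(&seq);
                snap = data[s & 1];
        } while (atomic_load(&seq) != s);

        return snap;
}

int main(void)
{
        struct snapshot s = { .epoch_ns = 1000, .epoch_cyc = 42 };

        publish(&s);
        s = read_snapshot();
        printf("%llu %llu\n", (unsigned long long)s.epoch_ns,
               (unsigned long long)s.epoch_cyc);
        return 0;
}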
80 | /* | 139 | /* |
81 | * Atomically update the sched_clock epoch. | 140 | * Atomically update the sched_clock() epoch. |
82 | */ | 141 | */ |
83 | static void notrace update_sched_clock(void) | 142 | static void update_sched_clock(void) |
84 | { | 143 | { |
85 | unsigned long flags; | ||
86 | u64 cyc; | 144 | u64 cyc; |
87 | u64 ns; | 145 | u64 ns; |
146 | struct clock_read_data rd; | ||
147 | |||
148 | rd = cd.read_data[0]; | ||
149 | |||
150 | cyc = cd.actual_read_sched_clock(); | ||
151 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | ||
152 | |||
153 | rd.epoch_ns = ns; | ||
154 | rd.epoch_cyc = cyc; | ||
88 | 155 | ||
89 | cyc = read_sched_clock(); | 156 | update_clock_read_data(&rd); |
90 | ns = cd.epoch_ns + | ||
91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
92 | cd.mult, cd.shift); | ||
93 | |||
94 | raw_local_irq_save(flags); | ||
95 | raw_write_seqcount_begin(&cd.seq); | ||
96 | cd.epoch_ns = ns; | ||
97 | cd.epoch_cyc = cyc; | ||
98 | raw_write_seqcount_end(&cd.seq); | ||
99 | raw_local_irq_restore(flags); | ||
100 | } | 157 | } |
101 | 158 | ||
102 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 159 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
103 | { | 160 | { |
104 | update_sched_clock(); | 161 | update_sched_clock(); |
105 | hrtimer_forward_now(hrt, cd.wrap_kt); | 162 | hrtimer_forward_now(hrt, cd.wrap_kt); |
163 | |||
106 | return HRTIMER_RESTART; | 164 | return HRTIMER_RESTART; |
107 | } | 165 | } |
108 | 166 | ||
109 | void __init sched_clock_register(u64 (*read)(void), int bits, | 167 | void __init |
110 | unsigned long rate) | 168 | sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) |
111 | { | 169 | { |
112 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 170 | u64 res, wrap, new_mask, new_epoch, cyc, ns; |
113 | u32 new_mult, new_shift; | 171 | u32 new_mult, new_shift; |
114 | ktime_t new_wrap_kt; | ||
115 | unsigned long r; | 172 | unsigned long r; |
116 | char r_unit; | 173 | char r_unit; |
174 | struct clock_read_data rd; | ||
117 | 175 | ||
118 | if (cd.rate > rate) | 176 | if (cd.rate > rate) |
119 | return; | 177 | return; |
120 | 178 | ||
121 | WARN_ON(!irqs_disabled()); | 179 | WARN_ON(!irqs_disabled()); |
122 | 180 | ||
123 | /* calculate the mult/shift to convert counter ticks to ns. */ | 181 | /* Calculate the mult/shift to convert counter ticks to ns. */ |
124 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 182 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); |
125 | 183 | ||
126 | new_mask = CLOCKSOURCE_MASK(bits); | 184 | new_mask = CLOCKSOURCE_MASK(bits); |
185 | cd.rate = rate; | ||
186 | |||
187 | /* Calculate how many nanosecs until we risk wrapping */ | ||
188 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); | ||
189 | cd.wrap_kt = ns_to_ktime(wrap); | ||
127 | 190 | ||
128 | /* calculate how many ns until we wrap */ | 191 | rd = cd.read_data[0]; |
129 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
130 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
131 | 192 | ||
132 | /* update epoch for new counter and update epoch_ns from old counter*/ | 193 | /* Update epoch for new counter and update 'epoch_ns' from old counter*/ |
133 | new_epoch = read(); | 194 | new_epoch = read(); |
134 | cyc = read_sched_clock(); | 195 | cyc = cd.actual_read_sched_clock(); |
135 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 196 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); |
136 | cd.mult, cd.shift); | 197 | cd.actual_read_sched_clock = read; |
137 | 198 | ||
138 | raw_write_seqcount_begin(&cd.seq); | 199 | rd.read_sched_clock = read; |
139 | read_sched_clock = read; | 200 | rd.sched_clock_mask = new_mask; |
140 | sched_clock_mask = new_mask; | 201 | rd.mult = new_mult; |
141 | cd.rate = rate; | 202 | rd.shift = new_shift; |
142 | cd.wrap_kt = new_wrap_kt; | 203 | rd.epoch_cyc = new_epoch; |
143 | cd.mult = new_mult; | 204 | rd.epoch_ns = ns; |
144 | cd.shift = new_shift; | 205 | |
145 | cd.epoch_cyc = new_epoch; | 206 | update_clock_read_data(&rd); |
146 | cd.epoch_ns = ns; | ||
147 | raw_write_seqcount_end(&cd.seq); | ||
148 | 207 | ||
149 | r = rate; | 208 | r = rate; |
150 | if (r >= 4000000) { | 209 | if (r >= 4000000) { |
151 | r /= 1000000; | 210 | r /= 1000000; |
152 | r_unit = 'M'; | 211 | r_unit = 'M'; |
153 | } else if (r >= 1000) { | 212 | } else { |
154 | r /= 1000; | 213 | if (r >= 1000) { |
155 | r_unit = 'k'; | 214 | r /= 1000; |
156 | } else | 215 | r_unit = 'k'; |
157 | r_unit = ' '; | 216 | } else { |
158 | 217 | r_unit = ' '; | |
159 | /* calculate the ns resolution of this counter */ | 218 | } |
219 | } | ||
220 | |||
221 | /* Calculate the ns resolution of this counter */ | ||
160 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 222 | res = cyc_to_ns(1ULL, new_mult, new_shift); |
161 | 223 | ||
162 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 224 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
163 | bits, r, r_unit, res, wrap); | 225 | bits, r, r_unit, res, wrap); |
164 | 226 | ||
165 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 227 | /* Enable IRQ time accounting if we have a fast enough sched_clock() */ |
166 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 228 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) |
167 | enable_sched_clock_irqtime(); | 229 | enable_sched_clock_irqtime(); |
168 | 230 | ||
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
172 | void __init sched_clock_postinit(void) | 234 | void __init sched_clock_postinit(void) |
173 | { | 235 | { |
174 | /* | 236 | /* |
175 | * If no sched_clock function has been provided at that point, | 237 | * If no sched_clock() function has been provided at that point, |
176 | * make it the final one. | 238 | * make it the final one. |
177 | */ | 239 | */ |
178 | if (read_sched_clock == jiffy_sched_clock_read) | 240 | if (cd.actual_read_sched_clock == jiffy_sched_clock_read) |
179 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 241 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
180 | 242 | ||
181 | update_sched_clock(); | 243 | update_sched_clock(); |
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) | |||
189 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 251 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
190 | } | 252 | } |
191 | 253 | ||
254 | /* | ||
255 | * Clock read function for use when the clock is suspended. | ||
256 | * | ||
257 | * This function makes it appear to sched_clock() as if the clock | ||
258 | * stopped counting at its last update. | ||
259 | * | ||
260 | * This function must only be called from the critical | ||
261 | * section in sched_clock(). It relies on the read_seqcount_retry() | ||
262 | * at the end of the critical section to be sure we observe the | ||
263 | * correct copy of 'epoch_cyc'. | ||
264 | */ | ||
265 | static u64 notrace suspended_sched_clock_read(void) | ||
266 | { | ||
267 | unsigned long seq = raw_read_seqcount(&cd.seq); | ||
268 | |||
269 | return cd.read_data[seq & 1].epoch_cyc; | ||
270 | } | ||
271 | |||
192 | static int sched_clock_suspend(void) | 272 | static int sched_clock_suspend(void) |
193 | { | 273 | { |
274 | struct clock_read_data *rd = &cd.read_data[0]; | ||
275 | |||
194 | update_sched_clock(); | 276 | update_sched_clock(); |
195 | hrtimer_cancel(&sched_clock_timer); | 277 | hrtimer_cancel(&sched_clock_timer); |
196 | cd.suspended = true; | 278 | rd->read_sched_clock = suspended_sched_clock_read; |
279 | |||
197 | return 0; | 280 | return 0; |
198 | } | 281 | } |
199 | 282 | ||
200 | static void sched_clock_resume(void) | 283 | static void sched_clock_resume(void) |
201 | { | 284 | { |
202 | cd.epoch_cyc = read_sched_clock(); | 285 | struct clock_read_data *rd = &cd.read_data[0]; |
286 | |||
287 | rd->epoch_cyc = cd.actual_read_sched_clock(); | ||
203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 288 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
204 | cd.suspended = false; | 289 | rd->read_sched_clock = cd.actual_read_sched_clock; |
205 | } | 290 | } |
206 | 291 | ||
207 | static struct syscore_ops sched_clock_ops = { | 292 | static struct syscore_ops sched_clock_ops = { |
208 | .suspend = sched_clock_suspend, | 293 | .suspend = sched_clock_suspend, |
209 | .resume = sched_clock_resume, | 294 | .resume = sched_clock_resume, |
210 | }; | 295 | }; |
211 | 296 | ||
212 | static int __init sched_clock_syscore_init(void) | 297 | static int __init sched_clock_syscore_init(void) |
213 | { | 298 | { |
214 | register_syscore_ops(&sched_clock_ops); | 299 | register_syscore_ops(&sched_clock_ops); |
300 | |||
215 | return 0; | 301 | return 0; |
216 | } | 302 | } |
217 | device_initcall(sched_clock_syscore_init); | 303 | device_initcall(sched_clock_syscore_init); |
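Suspend handling above no longer uses a 'suspended' flag: sched_clock_suspend() publishes one last epoch and then points read_sched_clock at suspended_sched_clock_read(), which returns that epoch's cycle value, so the computed delta is zero and sched_clock() appears frozen; sched_clock_resume() re-reads the hardware counter into epoch_cyc before restoring the real reader, so the time spent suspended is not counted. A toy model of that behaviour (all names here are illustrative, and one counter tick stands in for one nanosecond):

#include <stdint.h>
#include <stdio.h>

static uint64_t hw_cycles;                       /* pretend hardware counter        */
static uint64_t hw_read(void) { return hw_cycles; }

static uint64_t epoch_cyc, epoch_ns;             /* snapshot from the last update   */
static uint64_t (*read_cyc)(void) = hw_read;     /* active cycle reader             */

/* While "suspended", reads return the frozen epoch, so the delta is always zero. */
static uint64_t frozen_read(void) { return epoch_cyc; }

static uint64_t clock_ns(void)
{
        return epoch_ns + (read_cyc() - epoch_cyc);  /* 1 cycle == 1 ns in this model */
}

static void model_suspend(void)
{
        uint64_t cyc = hw_read();        /* one final update before freezing         */

        epoch_ns += cyc - epoch_cyc;
        epoch_cyc = cyc;
        read_cyc = frozen_read;
}

static void model_resume(void)
{
        epoch_cyc = hw_read();           /* discard the cycles spent suspended       */
        read_cyc = hw_read;
}

int main(void)
{
        hw_cycles = 100; printf("%llu\n", (unsigned long long)clock_ns()); /* 100       */
        model_suspend();
        hw_cycles = 500; printf("%llu\n", (unsigned long long)clock_ns()); /* still 100 */
        model_resume();
        hw_cycles = 530; printf("%llu\n", (unsigned long long)clock_ns()); /* 130       */
        return 0;
}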
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, | |||
49 | */ | 49 | */ |
50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) |
51 | { | 51 | { |
52 | int bc_moved; | ||
52 | /* | 53 | /* |
53 | * We try to cancel the timer first. If the callback is on | 54 | * We try to cancel the timer first. If the callback is on |
54 | * flight on some other cpu then we let it handle it. If we | 55 | * flight on some other cpu then we let it handle it. If we |
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
60 | * restart the timer because we are in the callback, but we | 61 | * restart the timer because we are in the callback, but we |
61 | * can set the expiry time and let the callback return | 62 | * can set the expiry time and let the callback return |
62 | * HRTIMER_RESTART. | 63 | * HRTIMER_RESTART. |
64 | * | ||
65 | * Since we are in the idle loop at this point and because | ||
66 | * hrtimer_{start/cancel} functions call into tracing, | ||
67 | * calls to these functions must be bound within RCU_NONIDLE. | ||
63 | */ | 68 | */ |
64 | if (hrtimer_try_to_cancel(&bctimer) >= 0) { | 69 | RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? |
65 | hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); | 70 | !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : |
71 | 0); | ||
72 | if (bc_moved) { | ||
66 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ |
67 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); |
68 | } else if (bc->bound_on == smp_processor_id()) { | 75 | } else if (bc->bound_on == smp_processor_id()) { |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; | |||
33 | static cpumask_var_t tick_broadcast_on; | 33 | static cpumask_var_t tick_broadcast_on; |
34 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; |
35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
36 | static int tick_broadcast_force; | 36 | static int tick_broadcast_forced; |
37 | 37 | ||
38 | #ifdef CONFIG_TICK_ONESHOT | 38 | #ifdef CONFIG_TICK_ONESHOT |
39 | static void tick_broadcast_clear_oneshot(int cpu); | 39 | static void tick_broadcast_clear_oneshot(int cpu); |
40 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
40 | #else | 41 | #else |
41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 42 | static inline void tick_broadcast_clear_oneshot(int cpu) { } |
43 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | ||
42 | #endif | 44 | #endif |
43 | 45 | ||
44 | /* | 46 | /* |
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
303 | /* | 305 | /* |
304 | * The device is in periodic mode. No reprogramming necessary: | 306 | * The device is in periodic mode. No reprogramming necessary: |
305 | */ | 307 | */ |
306 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 308 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
307 | goto unlock; | 309 | goto unlock; |
308 | 310 | ||
309 | /* | 311 | /* |
@@ -324,49 +326,54 @@ unlock: | |||
324 | raw_spin_unlock(&tick_broadcast_lock); | 326 | raw_spin_unlock(&tick_broadcast_lock); |
325 | } | 327 | } |
326 | 328 | ||
327 | /* | 329 | /** |
328 | * Powerstate information: The system enters/leaves a state, where | 330 | * tick_broadcast_control - Enable/disable or force broadcast mode |
329 | * affected devices might stop | 331 | * @mode: The selected broadcast mode |
332 | * | ||
333 | * Called when the system enters a state where affected tick devices | ||
334 | * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. | ||
335 | * | ||
336 | * Called with interrupts disabled, so clockevents_lock is not | ||
337 | * required here because the local clock event device cannot go away | ||
338 | * under us. | ||
330 | */ | 339 | */ |
331 | static void tick_do_broadcast_on_off(unsigned long *reason) | 340 | void tick_broadcast_control(enum tick_broadcast_mode mode) |
332 | { | 341 | { |
333 | struct clock_event_device *bc, *dev; | 342 | struct clock_event_device *bc, *dev; |
334 | struct tick_device *td; | 343 | struct tick_device *td; |
335 | unsigned long flags; | ||
336 | int cpu, bc_stopped; | 344 | int cpu, bc_stopped; |
337 | 345 | ||
338 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 346 | td = this_cpu_ptr(&tick_cpu_device); |
339 | |||
340 | cpu = smp_processor_id(); | ||
341 | td = &per_cpu(tick_cpu_device, cpu); | ||
342 | dev = td->evtdev; | 347 | dev = td->evtdev; |
343 | bc = tick_broadcast_device.evtdev; | ||
344 | 348 | ||
345 | /* | 349 | /* |
346 | * Is the device not affected by the powerstate ? | 350 | * Is the device not affected by the powerstate ? |
347 | */ | 351 | */ |
348 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 352 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
349 | goto out; | 353 | return; |
350 | 354 | ||
351 | if (!tick_device_is_functional(dev)) | 355 | if (!tick_device_is_functional(dev)) |
352 | goto out; | 356 | return; |
353 | 357 | ||
358 | raw_spin_lock(&tick_broadcast_lock); | ||
359 | cpu = smp_processor_id(); | ||
360 | bc = tick_broadcast_device.evtdev; | ||
354 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 361 | bc_stopped = cpumask_empty(tick_broadcast_mask); |
355 | 362 | ||
356 | switch (*reason) { | 363 | switch (mode) { |
357 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 364 | case TICK_BROADCAST_FORCE: |
358 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 365 | tick_broadcast_forced = 1; |
366 | case TICK_BROADCAST_ON: | ||
359 | cpumask_set_cpu(cpu, tick_broadcast_on); | 367 | cpumask_set_cpu(cpu, tick_broadcast_on); |
360 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 368 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { |
361 | if (tick_broadcast_device.mode == | 369 | if (tick_broadcast_device.mode == |
362 | TICKDEV_MODE_PERIODIC) | 370 | TICKDEV_MODE_PERIODIC) |
363 | clockevents_shutdown(dev); | 371 | clockevents_shutdown(dev); |
364 | } | 372 | } |
365 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
366 | tick_broadcast_force = 1; | ||
367 | break; | 373 | break; |
368 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 374 | |
369 | if (tick_broadcast_force) | 375 | case TICK_BROADCAST_OFF: |
376 | if (tick_broadcast_forced) | ||
370 | break; | 377 | break; |
371 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 378 | cpumask_clear_cpu(cpu, tick_broadcast_on); |
372 | if (!tick_device_is_functional(dev)) | 379 | if (!tick_device_is_functional(dev)) |
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
388 | else | 395 | else |
389 | tick_broadcast_setup_oneshot(bc); | 396 | tick_broadcast_setup_oneshot(bc); |
390 | } | 397 | } |
391 | out: | 398 | raw_spin_unlock(&tick_broadcast_lock); |
392 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * Powerstate information: The system enters/leaves a state, where | ||
397 | * affected devices might stop. | ||
398 | */ | ||
399 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
400 | { | ||
401 | if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) | ||
402 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " | ||
403 | "offline CPU #%d\n", *oncpu); | ||
404 | else | ||
405 | tick_do_broadcast_on_off(&reason); | ||
406 | } | 399 | } |
400 | EXPORT_SYMBOL_GPL(tick_broadcast_control); | ||
407 | 401 | ||
408 | /* | 402 | /* |
409 | * Set the periodic handler depending on broadcast on/off | 403 | * Set the periodic handler depending on broadcast on/off |
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
416 | dev->event_handler = tick_handle_periodic_broadcast; | 410 | dev->event_handler = tick_handle_periodic_broadcast; |
417 | } | 411 | } |
418 | 412 | ||
413 | #ifdef CONFIG_HOTPLUG_CPU | ||
419 | /* | 414 | /* |
420 | * Remove a CPU from broadcasting | 415 | * Remove a CPU from broadcasting |
421 | */ | 416 | */ |
422 | void tick_shutdown_broadcast(unsigned int *cpup) | 417 | void tick_shutdown_broadcast(unsigned int cpu) |
423 | { | 418 | { |
424 | struct clock_event_device *bc; | 419 | struct clock_event_device *bc; |
425 | unsigned long flags; | 420 | unsigned long flags; |
426 | unsigned int cpu = *cpup; | ||
427 | 421 | ||
428 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 422 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
429 | 423 | ||
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
438 | 432 | ||
439 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 433 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
440 | } | 434 | } |
435 | #endif | ||
441 | 436 | ||
442 | void tick_suspend_broadcast(void) | 437 | void tick_suspend_broadcast(void) |
443 | { | 438 | { |
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) | |||
453 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 448 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
454 | } | 449 | } |
455 | 450 | ||
456 | int tick_resume_broadcast(void) | 451 | /* |
452 | * This is called from tick_resume_local() on a resuming CPU. That's | ||
453 | * called from the core resume function, tick_unfreeze() and the magic XEN | ||
454 | * resume hackery. | ||
455 | * | ||
456 | * In none of these cases the broadcast device mode can change and the | ||
457 | * bit of the resuming CPU in the broadcast mask is safe as well. | ||
458 | */ | ||
459 | bool tick_resume_check_broadcast(void) | ||
460 | { | ||
461 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) | ||
462 | return false; | ||
463 | else | ||
464 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); | ||
465 | } | ||
466 | |||
467 | void tick_resume_broadcast(void) | ||
457 | { | 468 | { |
458 | struct clock_event_device *bc; | 469 | struct clock_event_device *bc; |
459 | unsigned long flags; | 470 | unsigned long flags; |
460 | int broadcast = 0; | ||
461 | 471 | ||
462 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 472 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
463 | 473 | ||
464 | bc = tick_broadcast_device.evtdev; | 474 | bc = tick_broadcast_device.evtdev; |
465 | 475 | ||
466 | if (bc) { | 476 | if (bc) { |
467 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | 477 | clockevents_tick_resume(bc); |
468 | 478 | ||
469 | switch (tick_broadcast_device.mode) { | 479 | switch (tick_broadcast_device.mode) { |
470 | case TICKDEV_MODE_PERIODIC: | 480 | case TICKDEV_MODE_PERIODIC: |
471 | if (!cpumask_empty(tick_broadcast_mask)) | 481 | if (!cpumask_empty(tick_broadcast_mask)) |
472 | tick_broadcast_start_periodic(bc); | 482 | tick_broadcast_start_periodic(bc); |
473 | broadcast = cpumask_test_cpu(smp_processor_id(), | ||
474 | tick_broadcast_mask); | ||
475 | break; | 483 | break; |
476 | case TICKDEV_MODE_ONESHOT: | 484 | case TICKDEV_MODE_ONESHOT: |
477 | if (!cpumask_empty(tick_broadcast_mask)) | 485 | if (!cpumask_empty(tick_broadcast_mask)) |
478 | broadcast = tick_resume_broadcast_oneshot(bc); | 486 | tick_resume_broadcast_oneshot(bc); |
479 | break; | 487 | break; |
480 | } | 488 | } |
481 | } | 489 | } |
482 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 490 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
483 | |||
484 | return broadcast; | ||
485 | } | 491 | } |
486 | 492 | ||
487 | |||
488 | #ifdef CONFIG_TICK_ONESHOT | 493 | #ifdef CONFIG_TICK_ONESHOT |
489 | 494 | ||
490 | static cpumask_var_t tick_broadcast_oneshot_mask; | 495 | static cpumask_var_t tick_broadcast_oneshot_mask; |
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
532 | { | 537 | { |
533 | int ret; | 538 | int ret; |
534 | 539 | ||
535 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 540 | if (bc->state != CLOCK_EVT_STATE_ONESHOT) |
536 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 541 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
537 | 542 | ||
538 | ret = clockevents_program_event(bc, expires, force); | 543 | ret = clockevents_program_event(bc, expires, force); |
539 | if (!ret) | 544 | if (!ret) |
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
541 | return ret; | 546 | return ret; |
542 | } | 547 | } |
543 | 548 | ||
544 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 549 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
545 | { | 550 | { |
546 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 551 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
547 | return 0; | ||
548 | } | 552 | } |
549 | 553 | ||
550 | /* | 554 | /* |
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) | |||
562 | * switched over, leave the device alone. | 566 | * switched over, leave the device alone. |
563 | */ | 567 | */ |
564 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 568 | if (td->mode == TICKDEV_MODE_ONESHOT) { |
565 | clockevents_set_mode(td->evtdev, | 569 | clockevents_set_state(td->evtdev, |
566 | CLOCK_EVT_MODE_ONESHOT); | 570 | CLOCK_EVT_STATE_ONESHOT); |
567 | } | 571 | } |
568 | } | 572 | } |
569 | } | 573 | } |
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, | |||
666 | if (dev->next_event.tv64 < bc->next_event.tv64) | 670 | if (dev->next_event.tv64 < bc->next_event.tv64) |
667 | return; | 671 | return; |
668 | } | 672 | } |
669 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 673 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
670 | } | 674 | } |
671 | 675 | ||
672 | static void broadcast_move_bc(int deadcpu) | 676 | /** |
673 | { | 677 | * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode |
674 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 678 | * @state: The target state (enter/exit) |
675 | 679 | * | |
676 | if (!bc || !broadcast_needs_cpu(bc, deadcpu)) | 680 | * The system enters/leaves a state, where affected devices might stop |
677 | return; | ||
678 | /* This moves the broadcast assignment to this cpu */ | ||
679 | clockevents_program_event(bc, bc->next_event, 1); | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * Powerstate information: The system enters/leaves a state, where | ||
684 | * affected devices might stop | ||
685 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 681 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. |
682 | * | ||
683 | * Called with interrupts disabled, so clockevents_lock is not | ||
684 | * required here because the local clock event device cannot go away | ||
685 | * under us. | ||
686 | */ | 686 | */ |
687 | int tick_broadcast_oneshot_control(unsigned long reason) | 687 | int tick_broadcast_oneshot_control(enum tick_broadcast_state state) |
688 | { | 688 | { |
689 | struct clock_event_device *bc, *dev; | 689 | struct clock_event_device *bc, *dev; |
690 | struct tick_device *td; | 690 | struct tick_device *td; |
691 | unsigned long flags; | ||
692 | ktime_t now; | ||
693 | int cpu, ret = 0; | 691 | int cpu, ret = 0; |
692 | ktime_t now; | ||
694 | 693 | ||
695 | /* | 694 | /* |
696 | * Periodic mode does not care about the enter/exit of power | 695 | * Periodic mode does not care about the enter/exit of power |
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
703 | * We are called with preemption disabled from the depth of the | 702 | * We are called with preemption disabled from the depth of the |
704 | * idle code, so we can't be moved away. | 703 | * idle code, so we can't be moved away. |
705 | */ | 704 | */ |
706 | cpu = smp_processor_id(); | 705 | td = this_cpu_ptr(&tick_cpu_device); |
707 | td = &per_cpu(tick_cpu_device, cpu); | ||
708 | dev = td->evtdev; | 706 | dev = td->evtdev; |
709 | 707 | ||
710 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 708 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
711 | return 0; | 709 | return 0; |
712 | 710 | ||
711 | raw_spin_lock(&tick_broadcast_lock); | ||
713 | bc = tick_broadcast_device.evtdev; | 712 | bc = tick_broadcast_device.evtdev; |
713 | cpu = smp_processor_id(); | ||
714 | 714 | ||
715 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 715 | if (state == TICK_BROADCAST_ENTER) { |
716 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
717 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 716 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { |
718 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 717 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); |
719 | broadcast_shutdown_local(bc, dev); | 718 | broadcast_shutdown_local(bc, dev); |
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
741 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 740 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
742 | } else { | 741 | } else { |
743 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 742 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { |
744 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 743 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
745 | /* | 744 | /* |
746 | * The cpu which was handling the broadcast | 745 | * The cpu which was handling the broadcast |
747 | * timer marked this cpu in the broadcast | 746 | * timer marked this cpu in the broadcast |
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
805 | } | 804 | } |
806 | } | 805 | } |
807 | out: | 806 | out: |
808 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 807 | raw_spin_unlock(&tick_broadcast_lock); |
809 | return ret; | 808 | return ret; |
810 | } | 809 | } |
810 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
811 | 811 | ||
812 | /* | 812 | /* |
813 | * Reset the one shot broadcast for a cpu | 813 | * Reset the one shot broadcast for a cpu |
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
842 | 842 | ||
843 | /* Set it up only once ! */ | 843 | /* Set it up only once ! */ |
844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
845 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 845 | int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; |
846 | 846 | ||
847 | bc->event_handler = tick_handle_oneshot_broadcast; | 847 | bc->event_handler = tick_handle_oneshot_broadcast; |
848 | 848 | ||
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
858 | tick_broadcast_oneshot_mask, tmpmask); | 858 | tick_broadcast_oneshot_mask, tmpmask); |
859 | 859 | ||
860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 860 | if (was_periodic && !cpumask_empty(tmpmask)) { |
861 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 861 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
862 | tick_broadcast_init_next_event(tmpmask, | 862 | tick_broadcast_init_next_event(tmpmask, |
863 | tick_next_period); | 863 | tick_next_period); |
864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); |
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) | |||
894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
895 | } | 895 | } |
896 | 896 | ||
897 | #ifdef CONFIG_HOTPLUG_CPU | ||
898 | void hotplug_cpu__broadcast_tick_pull(int deadcpu) | ||
899 | { | ||
900 | struct clock_event_device *bc; | ||
901 | unsigned long flags; | ||
902 | |||
903 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
904 | bc = tick_broadcast_device.evtdev; | ||
905 | |||
906 | if (bc && broadcast_needs_cpu(bc, deadcpu)) { | ||
907 | /* This moves the broadcast assignment to this CPU: */ | ||
908 | clockevents_program_event(bc, bc->next_event, 1); | ||
909 | } | ||
910 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
911 | } | ||
897 | 912 | ||
898 | /* | 913 | /* |
899 | * Remove a dead CPU from broadcasting | 914 | * Remove a dead CPU from broadcasting |
900 | */ | 915 | */ |
901 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 916 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) |
902 | { | 917 | { |
903 | unsigned long flags; | 918 | unsigned long flags; |
904 | unsigned int cpu = *cpup; | ||
905 | 919 | ||
906 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 920 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
907 | 921 | ||
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
913 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 927 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); |
914 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 928 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); |
915 | 929 | ||
916 | broadcast_move_bc(cpu); | ||
917 | |||
918 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 930 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
919 | } | 931 | } |
932 | #endif | ||
920 | 933 | ||
921 | /* | 934 | /* |
922 | * Check, whether the broadcast device is in one shot mode | 935 | * Check, whether the broadcast device is in one shot mode |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
102 | 102 | ||
103 | tick_periodic(cpu); | 103 | tick_periodic(cpu); |
104 | 104 | ||
105 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 105 | if (dev->state != CLOCK_EVT_STATE_ONESHOT) |
106 | return; | 106 | return; |
107 | for (;;) { | 107 | for (;;) { |
108 | /* | 108 | /* |
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
140 | 140 | ||
141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && |
142 | !tick_broadcast_oneshot_active()) { | 142 | !tick_broadcast_oneshot_active()) { |
143 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | 143 | clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
144 | } else { | 144 | } else { |
145 | unsigned long seq; | 145 | unsigned long seq; |
146 | ktime_t next; | 146 | ktime_t next; |
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
150 | next = tick_next_period; | 150 | next = tick_next_period; |
151 | } while (read_seqretry(&jiffies_lock, seq)); | 151 | } while (read_seqretry(&jiffies_lock, seq)); |
152 | 152 | ||
153 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 153 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
154 | 154 | ||
155 | for (;;) { | 155 | for (;;) { |
156 | if (!clockevents_program_event(dev, next, false)) | 156 | if (!clockevents_program_event(dev, next, false)) |
@@ -332,14 +332,16 @@ out_bc: | |||
332 | tick_install_broadcast_device(newdev); | 332 | tick_install_broadcast_device(newdev); |
333 | } | 333 | } |
334 | 334 | ||
335 | #ifdef CONFIG_HOTPLUG_CPU | ||
335 | /* | 336 | /* |
336 | * Transfer the do_timer job away from a dying cpu. | 337 | * Transfer the do_timer job away from a dying cpu. |
337 | * | 338 | * |
338 | * Called with interrupts disabled. | 339 | * Called with interrupts disabled. Not locking required. If |
340 | * tick_do_timer_cpu is owned by this cpu, nothing can change it. | ||
339 | */ | 341 | */ |
340 | void tick_handover_do_timer(int *cpup) | 342 | void tick_handover_do_timer(void) |
341 | { | 343 | { |
342 | if (*cpup == tick_do_timer_cpu) { | 344 | if (tick_do_timer_cpu == smp_processor_id()) { |
343 | int cpu = cpumask_first(cpu_online_mask); | 345 | int cpu = cpumask_first(cpu_online_mask); |
344 | 346 | ||
345 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 347 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : |
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) | |||
354 | * access the hardware device itself. | 356 | * access the hardware device itself. |
355 | * We just set the mode and remove it from the lists. | 357 | * We just set the mode and remove it from the lists. |
356 | */ | 358 | */ |
357 | void tick_shutdown(unsigned int *cpup) | 359 | void tick_shutdown(unsigned int cpu) |
358 | { | 360 | { |
359 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 361 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); |
360 | struct clock_event_device *dev = td->evtdev; | 362 | struct clock_event_device *dev = td->evtdev; |
361 | 363 | ||
362 | td->mode = TICKDEV_MODE_PERIODIC; | 364 | td->mode = TICKDEV_MODE_PERIODIC; |
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) | |||
365 | * Prevent that the clock events layer tries to call | 367 | * Prevent that the clock events layer tries to call |
366 | * the set mode function! | 368 | * the set mode function! |
367 | */ | 369 | */ |
370 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
368 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 371 | dev->mode = CLOCK_EVT_MODE_UNUSED; |
369 | clockevents_exchange_device(dev, NULL); | 372 | clockevents_exchange_device(dev, NULL); |
370 | dev->event_handler = clockevents_handle_noop; | 373 | dev->event_handler = clockevents_handle_noop; |
371 | td->evtdev = NULL; | 374 | td->evtdev = NULL; |
372 | } | 375 | } |
373 | } | 376 | } |
377 | #endif | ||
374 | 378 | ||
375 | void tick_suspend(void) | 379 | /** |
380 | * tick_suspend_local - Suspend the local tick device | ||
381 | * | ||
382 | * Called from the local cpu for freeze with interrupts disabled. | ||
383 | * | ||
384 | * No locks required. Nothing can change the per cpu device. | ||
385 | */ | ||
386 | void tick_suspend_local(void) | ||
376 | { | 387 | { |
377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 388 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
378 | 389 | ||
379 | clockevents_shutdown(td->evtdev); | 390 | clockevents_shutdown(td->evtdev); |
380 | } | 391 | } |
381 | 392 | ||
382 | void tick_resume(void) | 393 | /** |
394 | * tick_resume_local - Resume the local tick device | ||
395 | * | ||
396 | * Called from the local CPU for unfreeze or XEN resume magic. | ||
397 | * | ||
398 | * No locks required. Nothing can change the per cpu device. | ||
399 | */ | ||
400 | void tick_resume_local(void) | ||
383 | { | 401 | { |
384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 402 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
385 | int broadcast = tick_resume_broadcast(); | 403 | bool broadcast = tick_resume_check_broadcast(); |
386 | |||
387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | ||
388 | 404 | ||
405 | clockevents_tick_resume(td->evtdev); | ||
389 | if (!broadcast) { | 406 | if (!broadcast) { |
390 | if (td->mode == TICKDEV_MODE_PERIODIC) | 407 | if (td->mode == TICKDEV_MODE_PERIODIC) |
391 | tick_setup_periodic(td->evtdev, 0); | 408 | tick_setup_periodic(td->evtdev, 0); |
@@ -394,6 +411,35 @@ void tick_resume(void) | |||
394 | } | 411 | } |
395 | } | 412 | } |
396 | 413 | ||
414 | /** | ||
415 | * tick_suspend - Suspend the tick and the broadcast device | ||
416 | * | ||
417 | * Called from syscore_suspend() via timekeeping_suspend with only one | ||
418 | * CPU online and interrupts disabled or from tick_freeze() under | ||
419 | * tick_freeze_lock. | ||
420 | * | ||
421 | * No locks required. Nothing can change the per cpu device. | ||
422 | */ | ||
423 | void tick_suspend(void) | ||
424 | { | ||
425 | tick_suspend_local(); | ||
426 | tick_suspend_broadcast(); | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * tick_resume - Resume the tick and the broadcast device | ||
431 | * | ||
432 | * Called from syscore_resume() via timekeeping_resume with only one | ||
433 | * CPU online and interrupts disabled. | ||
434 | * | ||
435 | * No locks required. Nothing can change the per cpu device. | ||
436 | */ | ||
437 | void tick_resume(void) | ||
438 | { | ||
439 | tick_resume_broadcast(); | ||
440 | tick_resume_local(); | ||
441 | } | ||
442 | |||
397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 443 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); |
398 | static unsigned int tick_freeze_depth; | 444 | static unsigned int tick_freeze_depth; |
399 | 445 | ||
@@ -411,12 +457,10 @@ void tick_freeze(void) | |||
411 | raw_spin_lock(&tick_freeze_lock); | 457 | raw_spin_lock(&tick_freeze_lock); |
412 | 458 | ||
413 | tick_freeze_depth++; | 459 | tick_freeze_depth++; |
414 | if (tick_freeze_depth == num_online_cpus()) { | 460 | if (tick_freeze_depth == num_online_cpus()) |
415 | timekeeping_suspend(); | 461 | timekeeping_suspend(); |
416 | } else { | 462 | else |
417 | tick_suspend(); | 463 | tick_suspend_local(); |
418 | tick_suspend_broadcast(); | ||
419 | } | ||
420 | 464 | ||
421 | raw_spin_unlock(&tick_freeze_lock); | 465 | raw_spin_unlock(&tick_freeze_lock); |
422 | } | 466 | } |
@@ -437,7 +481,7 @@ void tick_unfreeze(void) | |||
437 | if (tick_freeze_depth == num_online_cpus()) | 481 | if (tick_freeze_depth == num_online_cpus()) |
438 | timekeeping_resume(); | 482 | timekeeping_resume(); |
439 | else | 483 | else |
440 | tick_resume(); | 484 | tick_resume_local(); |
441 | 485 | ||
442 | tick_freeze_depth--; | 486 | tick_freeze_depth--; |
443 | 487 | ||
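The tick_freeze()/tick_unfreeze() hunks above boil down to a depth counter held under tick_freeze_lock: every CPU entering freeze suspends only its local tick device, the last one in additionally suspends timekeeping, and unfreeze mirrors this with the first CPU out resuming timekeeping. A minimal user-space sketch of that counting pattern; the fake_* helpers and NUM_ONLINE_CPUS are illustrative stand-ins, not kernel APIs, and the real code serializes with the raw spinlock instead of running single-threaded.

#include <stdio.h>

#define NUM_ONLINE_CPUS 4

static unsigned int freeze_depth;

/* Illustrative stand-ins for the kernel calls; not real APIs. */
static void fake_tick_suspend_local(int cpu)  { printf("cpu%d: suspend local tick\n", cpu); }
static void fake_tick_resume_local(int cpu)   { printf("cpu%d: resume local tick\n", cpu); }
static void fake_timekeeping_suspend(void)    { puts("last cpu: suspend timekeeping (tick + broadcast)"); }
static void fake_timekeeping_resume(void)     { puts("first cpu: resume timekeeping (tick + broadcast)"); }

/* Mirrors tick_freeze(): the last CPU to freeze also suspends timekeeping. */
static void freeze(int cpu)
{
	freeze_depth++;
	if (freeze_depth == NUM_ONLINE_CPUS)
		fake_timekeeping_suspend();
	else
		fake_tick_suspend_local(cpu);
}

/* Mirrors tick_unfreeze(): the first CPU to unfreeze resumes timekeeping. */
static void unfreeze(int cpu)
{
	if (freeze_depth == NUM_ONLINE_CPUS)
		fake_timekeeping_resume();
	else
		fake_tick_resume_local(cpu);
	freeze_depth--;
}

int main(void)
{
	for (int cpu = 0; cpu < NUM_ONLINE_CPUS; cpu++)
		freeze(cpu);
	for (int cpu = NUM_ONLINE_CPUS - 1; cpu >= 0; cpu--)
		unfreeze(cpu);
	return 0;
}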
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -5,15 +5,12 @@ | |||
5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> |
6 | 6 | ||
7 | #include "timekeeping.h" | 7 | #include "timekeeping.h" |
8 | #include "tick-sched.h" | ||
8 | 9 | ||
9 | extern seqlock_t jiffies_lock; | 10 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
10 | 11 | ||
11 | #define CS_NAME_LEN 32 | 12 | # define TICK_DO_TIMER_NONE -1 |
12 | 13 | # define TICK_DO_TIMER_BOOT -2 | |
13 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
14 | |||
15 | #define TICK_DO_TIMER_NONE -1 | ||
16 | #define TICK_DO_TIMER_BOOT -2 | ||
17 | 14 | ||
18 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 15 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); |
19 | extern ktime_t tick_next_period; | 16 | extern ktime_t tick_next_period; |
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; | |||
23 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 20 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); |
24 | extern void tick_handle_periodic(struct clock_event_device *dev); | 21 | extern void tick_handle_periodic(struct clock_event_device *dev); |
25 | extern void tick_check_new_device(struct clock_event_device *dev); | 22 | extern void tick_check_new_device(struct clock_event_device *dev); |
26 | extern void tick_handover_do_timer(int *cpup); | 23 | extern void tick_shutdown(unsigned int cpu); |
27 | extern void tick_shutdown(unsigned int *cpup); | ||
28 | extern void tick_suspend(void); | 24 | extern void tick_suspend(void); |
29 | extern void tick_resume(void); | 25 | extern void tick_resume(void); |
30 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 26 | extern bool tick_check_replacement(struct clock_event_device *curdev, |
31 | struct clock_event_device *newdev); | 27 | struct clock_event_device *newdev); |
32 | extern void tick_install_replacement(struct clock_event_device *dev); | 28 | extern void tick_install_replacement(struct clock_event_device *dev); |
29 | extern int tick_is_oneshot_available(void); | ||
30 | extern struct tick_device *tick_get_device(int cpu); | ||
33 | 31 | ||
34 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern int clockevents_tick_resume(struct clock_event_device *dev); |
33 | /* Check, if the device is functional or a dummy for broadcast */ | ||
34 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
35 | { | ||
36 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
37 | } | ||
35 | 38 | ||
39 | extern void clockevents_shutdown(struct clock_event_device *dev); | ||
40 | extern void clockevents_exchange_device(struct clock_event_device *old, | ||
41 | struct clock_event_device *new); | ||
42 | extern void clockevents_set_state(struct clock_event_device *dev, | ||
43 | enum clock_event_state state); | ||
44 | extern int clockevents_program_event(struct clock_event_device *dev, | ||
45 | ktime_t expires, bool force); | ||
46 | extern void clockevents_handle_noop(struct clock_event_device *dev); | ||
47 | extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
36 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 48 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
37 | 49 | ||
38 | /* | 50 | /* Broadcasting support */ |
39 | * NO_HZ / high resolution timer shared code | 51 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
40 | */ | 52 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
53 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
54 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
55 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
56 | extern void tick_suspend_broadcast(void); | ||
57 | extern void tick_resume_broadcast(void); | ||
58 | extern bool tick_resume_check_broadcast(void); | ||
59 | extern void tick_broadcast_init(void); | ||
60 | extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
61 | extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
62 | extern struct tick_device *tick_get_broadcast_device(void); | ||
63 | extern struct cpumask *tick_get_broadcast_mask(void); | ||
64 | # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ | ||
65 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } | ||
66 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | ||
67 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | ||
68 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
69 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
70 | static inline void tick_suspend_broadcast(void) { } | ||
71 | static inline void tick_resume_broadcast(void) { } | ||
72 | static inline bool tick_resume_check_broadcast(void) { return false; } | ||
73 | static inline void tick_broadcast_init(void) { } | ||
74 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } | ||
75 | |||
76 | /* Set the periodic handler in non broadcast mode */ | ||
77 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
78 | { | ||
79 | dev->event_handler = tick_handle_periodic; | ||
80 | } | ||
81 | # endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ | ||
82 | |||
83 | #else /* !GENERIC_CLOCKEVENTS: */ | ||
84 | static inline void tick_suspend(void) { } | ||
85 | static inline void tick_resume(void) { } | ||
86 | #endif /* !GENERIC_CLOCKEVENTS */ | ||
87 | |||
88 | /* Oneshot related functions */ | ||
41 | #ifdef CONFIG_TICK_ONESHOT | 89 | #ifdef CONFIG_TICK_ONESHOT |
42 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 90 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
43 | void (*handler)(struct clock_event_device *), | 91 | void (*handler)(struct clock_event_device *), |
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); | |||
46 | extern void tick_oneshot_notify(void); | 94 | extern void tick_oneshot_notify(void); |
47 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 95 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
48 | extern void tick_resume_oneshot(void); | 96 | extern void tick_resume_oneshot(void); |
49 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 97 | static inline bool tick_oneshot_possible(void) { return true; } |
98 | extern int tick_oneshot_mode_active(void); | ||
99 | extern void tick_clock_notify(void); | ||
100 | extern int tick_check_oneshot_change(int allow_nohz); | ||
101 | extern int tick_init_highres(void); | ||
102 | #else /* !CONFIG_TICK_ONESHOT: */ | ||
103 | static inline | ||
104 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
105 | void (*handler)(struct clock_event_device *), | ||
106 | ktime_t nextevt) { BUG(); } | ||
107 | static inline void tick_resume_oneshot(void) { BUG(); } | ||
108 | static inline int tick_program_event(ktime_t expires, int force) { return 0; } | ||
109 | static inline void tick_oneshot_notify(void) { } | ||
110 | static inline bool tick_oneshot_possible(void) { return false; } | ||
111 | static inline int tick_oneshot_mode_active(void) { return 0; } | ||
112 | static inline void tick_clock_notify(void) { } | ||
113 | static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | ||
114 | #endif /* !CONFIG_TICK_ONESHOT */ | ||
115 | |||
116 | /* Functions related to oneshot broadcasting */ | ||
117 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | ||
50 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 118 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); |
51 | extern int tick_broadcast_oneshot_control(unsigned long reason); | ||
52 | extern void tick_broadcast_switch_to_oneshot(void); | 119 | extern void tick_broadcast_switch_to_oneshot(void); |
53 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 120 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); |
54 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
55 | extern int tick_broadcast_oneshot_active(void); | 121 | extern int tick_broadcast_oneshot_active(void); |
56 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 122 | extern void tick_check_oneshot_broadcast_this_cpu(void); |
57 | bool tick_broadcast_oneshot_available(void); | 123 | bool tick_broadcast_oneshot_available(void); |
58 | # else /* BROADCAST */ | 124 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); |
59 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 125 | #else /* !(BROADCAST && ONESHOT): */ |
60 | { | 126 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } |
61 | BUG(); | ||
62 | } | ||
63 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
64 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 127 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
65 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 128 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } |
66 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 129 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
67 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 130 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } |
68 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 131 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } |
69 | # endif /* !BROADCAST */ | 132 | #endif /* !(BROADCAST && ONESHOT) */ |
70 | |||
71 | #else /* !ONESHOT */ | ||
72 | static inline | ||
73 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
74 | void (*handler)(struct clock_event_device *), | ||
75 | ktime_t nextevt) | ||
76 | { | ||
77 | BUG(); | ||
78 | } | ||
79 | static inline void tick_resume_oneshot(void) | ||
80 | { | ||
81 | BUG(); | ||
82 | } | ||
83 | static inline int tick_program_event(ktime_t expires, int force) | ||
84 | { | ||
85 | return 0; | ||
86 | } | ||
87 | static inline void tick_oneshot_notify(void) { } | ||
88 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
89 | { | ||
90 | BUG(); | ||
91 | } | ||
92 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
93 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
94 | static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | ||
95 | { | ||
96 | return 0; | ||
97 | } | ||
98 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | ||
99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
100 | #endif /* !TICK_ONESHOT */ | ||
101 | 133 | ||
102 | /* NO_HZ_FULL internal */ | 134 | /* NO_HZ_FULL internal */ |
103 | #ifdef CONFIG_NO_HZ_FULL | 135 | #ifdef CONFIG_NO_HZ_FULL |
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void); | |||
105 | # else | 137 | # else |
106 | static inline void tick_nohz_init(void) { } | 138 | static inline void tick_nohz_init(void) { } |
107 | #endif | 139 | #endif |
108 | |||
109 | /* | ||
110 | * Broadcasting support | ||
111 | */ | ||
112 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
113 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
114 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
115 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
116 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
117 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
118 | extern void tick_suspend_broadcast(void); | ||
119 | extern int tick_resume_broadcast(void); | ||
120 | extern void tick_broadcast_init(void); | ||
121 | extern void | ||
122 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
123 | int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
124 | |||
125 | #else /* !BROADCAST */ | ||
126 | |||
127 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
132 | { | ||
133 | return 0; | ||
134 | } | ||
135 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
136 | int cpu) | ||
137 | { | ||
138 | return 0; | ||
139 | } | ||
140 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
141 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
142 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
143 | static inline void tick_suspend_broadcast(void) { } | ||
144 | static inline int tick_resume_broadcast(void) { return 0; } | ||
145 | static inline void tick_broadcast_init(void) { } | ||
146 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, | ||
147 | u32 freq) { return -ENODEV; } | ||
148 | |||
149 | /* | ||
150 | * Set the periodic handler in non broadcast mode | ||
151 | */ | ||
152 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
153 | int broadcast) | ||
154 | { | ||
155 | dev->event_handler = tick_handle_periodic; | ||
156 | } | ||
157 | #endif /* !BROADCAST */ | ||
158 | |||
159 | /* | ||
160 | * Check, if the device is functional or a dummy for broadcast | ||
161 | */ | ||
162 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
163 | { | ||
164 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
165 | } | ||
166 | |||
167 | int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
168 | |||
169 | #endif | ||
170 | |||
171 | extern void do_timer(unsigned long ticks); | ||
172 | extern void update_wall_time(void); | ||
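The reshuffled tick-internal.h follows the usual kernel header idiom: real declarations under the Kconfig symbol, empty static inline stubs otherwise, so call sites never need their own #ifdefs. A compilable user-space sketch of the same pattern, with an invented CONFIG_MY_FEATURE toggle standing in for options like CONFIG_GENERIC_CLOCKEVENTS_BROADCAST.

#include <stdio.h>

/* Toggle to mimic a Kconfig option; purely illustrative. */
#define CONFIG_MY_FEATURE 1

#ifdef CONFIG_MY_FEATURE
static void my_feature_setup(int cpu) { printf("setting up feature on cpu%d\n", cpu); }
static int  my_feature_query(void)    { return 1; }
#else  /* !CONFIG_MY_FEATURE: stubs keep callers free of #ifdefs */
static inline void my_feature_setup(int cpu) { (void)cpu; }
static inline int  my_feature_query(void)    { return 0; }
#endif

int main(void)
{
	my_feature_setup(0);			/* unconditional call site */
	printf("feature active: %d\n", my_feature_query());
	return 0;
}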
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void) | |||
38 | { | 38 | { |
39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
40 | 40 | ||
41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
42 | clockevents_program_event(dev, ktime_get(), true); | 42 | clockevents_program_event(dev, ktime_get(), true); |
43 | } | 43 | } |
44 | 44 | ||
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
50 | ktime_t next_event) | 50 | ktime_t next_event) |
51 | { | 51 | { |
52 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; |
53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); |
54 | clockevents_program_event(newdev, next_event, true); | 54 | clockevents_program_event(newdev, next_event, true); |
55 | } | 55 | } |
56 | 56 | ||
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
81 | 81 | ||
82 | td->mode = TICKDEV_MODE_ONESHOT; | 82 | td->mode = TICKDEV_MODE_ONESHOT; |
83 | dev->event_handler = handler; | 83 | dev->event_handler = handler; |
84 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 84 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
85 | tick_broadcast_switch_to_oneshot(); | 85 | tick_broadcast_switch_to_oneshot(); |
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -34,7 +34,7 @@ | |||
34 | /* | 34 | /* |
35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure |
36 | */ | 36 | */ |
37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) | |||
416 | 416 | ||
417 | __setup("nohz=", setup_tick_nohz); | 417 | __setup("nohz=", setup_tick_nohz); |
418 | 418 | ||
419 | int tick_nohz_tick_stopped(void) | ||
420 | { | ||
421 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | ||
422 | } | ||
423 | |||
419 | /** | 424 | /** |
420 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 425 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
421 | * | 426 | * |
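Making tick_cpu_sched static and adding the tick_nohz_tick_stopped() accessor hides the per-CPU nohz state behind a single query. A small user-space analogue of that encapsulation, with an explicit cpu index replacing the kernel's __this_cpu_read(); the struct and helper names are invented for the sketch.

#include <stdio.h>

#define NR_CPUS 4

/* Kept private to this file, mirroring the now-static per-CPU data above;
 * the accessor below is the only way in. */
struct tick_sched_sketch {
	int tick_stopped;
};

static struct tick_sched_sketch tick_cpu_sched[NR_CPUS];

/* Analogue of tick_nohz_tick_stopped() with an explicit cpu argument. */
static int tick_stopped_on(int cpu)
{
	return tick_cpu_sched[cpu].tick_stopped;
}

int main(void)
{
	tick_cpu_sched[2].tick_stopped = 1;	/* pretend cpu2 went nohz-idle */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d tick stopped: %d\n", cpu, tick_stopped_on(cpu));
	return 0;
}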
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h | |||
@@ -0,0 +1,74 @@ | |||
1 | #ifndef _TICK_SCHED_H | ||
2 | #define _TICK_SCHED_H | ||
3 | |||
4 | #include <linux/hrtimer.h> | ||
5 | |||
6 | enum tick_device_mode { | ||
7 | TICKDEV_MODE_PERIODIC, | ||
8 | TICKDEV_MODE_ONESHOT, | ||
9 | }; | ||
10 | |||
11 | struct tick_device { | ||
12 | struct clock_event_device *evtdev; | ||
13 | enum tick_device_mode mode; | ||
14 | }; | ||
15 | |||
16 | enum tick_nohz_mode { | ||
17 | NOHZ_MODE_INACTIVE, | ||
18 | NOHZ_MODE_LOWRES, | ||
19 | NOHZ_MODE_HIGHRES, | ||
20 | }; | ||
21 | |||
22 | /** | ||
23 | * struct tick_sched - sched tick emulation and no idle tick control/stats | ||
24 | * @sched_timer: hrtimer to schedule the periodic tick in high | ||
25 | * resolution mode | ||
26 | * @last_tick: Store the last tick expiry time when the tick | ||
27 | * timer is modified for nohz sleeps. This is necessary | ||
28 | * to resume the tick timer operation in the timeline | ||
29 | * when the CPU returns from nohz sleep. | ||
30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | ||
32 | * @idle_calls: Total number of idle calls | ||
33 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | ||
34 | * @idle_entrytime: Time when the idle call was entered | ||
35 | * @idle_waketime: Time when the idle was interrupted | ||
36 | * @idle_exittime: Time when the idle state was left | ||
37 | * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped | ||
38 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | ||
39 | * @sleep_length: Duration of the current idle sleep | ||
40 | * @do_timer_last: CPU was the last one doing do_timer before going idle | ||
41 | */ | ||
42 | struct tick_sched { | ||
43 | struct hrtimer sched_timer; | ||
44 | unsigned long check_clocks; | ||
45 | enum tick_nohz_mode nohz_mode; | ||
46 | ktime_t last_tick; | ||
47 | int inidle; | ||
48 | int tick_stopped; | ||
49 | unsigned long idle_jiffies; | ||
50 | unsigned long idle_calls; | ||
51 | unsigned long idle_sleeps; | ||
52 | int idle_active; | ||
53 | ktime_t idle_entrytime; | ||
54 | ktime_t idle_waketime; | ||
55 | ktime_t idle_exittime; | ||
56 | ktime_t idle_sleeptime; | ||
57 | ktime_t iowait_sleeptime; | ||
58 | ktime_t sleep_length; | ||
59 | unsigned long last_jiffies; | ||
60 | unsigned long next_jiffies; | ||
61 | ktime_t idle_expires; | ||
62 | int do_timer_last; | ||
63 | }; | ||
64 | |||
65 | extern struct tick_sched *tick_get_tick_sched(int cpu); | ||
66 | |||
67 | extern void tick_setup_sched_timer(void); | ||
68 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS | ||
69 | extern void tick_cancel_sched_timer(int cpu); | ||
70 | #else | ||
71 | static inline void tick_cancel_sched_timer(int cpu) { } | ||
72 | #endif | ||
73 | |||
74 | #endif | ||
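The accounting fields documented in struct tick_sched relate in a simple way: idle_sleeps counts the subset of idle_calls where the tick was actually stopped, and idle_sleeptime accumulates the nanoseconds slept in that state. A small sketch with invented numbers, showing the kind of summary a consumer of these counters (for instance the timer_list debug output) can derive.

#include <stdio.h>
#include <stdint.h>

typedef int64_t ktime_sketch_t;		/* stand-in for ktime_t, in nanoseconds */

/* Cut-down copy of the accounting fields from struct tick_sched above. */
struct tick_sched_stats {
	unsigned long idle_calls;	/* total idle entries */
	unsigned long idle_sleeps;	/* idle entries where the tick was stopped */
	ktime_sketch_t idle_sleeptime;	/* ns slept with the tick stopped */
};

int main(void)
{
	/* Invented numbers, just to show how the counters relate. */
	struct tick_sched_stats ts = {
		.idle_calls = 1000,
		.idle_sleeps = 800,
		.idle_sleeptime = 4800000000LL,	/* 4.8 s */
	};

	printf("tick stopped on %.1f%% of idle entries\n",
	       100.0 * ts.idle_sleeps / ts.idle_calls);
	printf("average nohz sleep: %.2f ms\n",
	       ts.idle_sleeptime / 1e6 / ts.idle_sleeps);
	return 0;
}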
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -59,17 +59,15 @@ struct tk_fast { | |||
59 | }; | 59 | }; |
60 | 60 | ||
61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; |
62 | static struct tk_fast tk_fast_raw ____cacheline_aligned; | ||
62 | 63 | ||
63 | /* flag for if timekeeping is suspended */ | 64 | /* flag for if timekeeping is suspended */ |
64 | int __read_mostly timekeeping_suspended; | 65 | int __read_mostly timekeeping_suspended; |
65 | 66 | ||
66 | /* Flag for if there is a persistent clock on this platform */ | ||
67 | bool __read_mostly persistent_clock_exist = false; | ||
68 | |||
69 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 67 | static inline void tk_normalize_xtime(struct timekeeper *tk) |
70 | { | 68 | { |
71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { | 69 | while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { |
72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; | 70 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
73 | tk->xtime_sec++; | 71 | tk->xtime_sec++; |
74 | } | 72 | } |
75 | } | 73 | } |
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) | |||
79 | struct timespec64 ts; | 77 | struct timespec64 ts; |
80 | 78 | ||
81 | ts.tv_sec = tk->xtime_sec; | 79 | ts.tv_sec = tk->xtime_sec; |
82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 80 | ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
83 | return ts; | 81 | return ts; |
84 | } | 82 | } |
85 | 83 | ||
86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 84 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) |
87 | { | 85 | { |
88 | tk->xtime_sec = ts->tv_sec; | 86 | tk->xtime_sec = ts->tv_sec; |
89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; | 87 | tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; |
90 | } | 88 | } |
91 | 89 | ||
92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 90 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) |
93 | { | 91 | { |
94 | tk->xtime_sec += ts->tv_sec; | 92 | tk->xtime_sec += ts->tv_sec; |
95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; | 93 | tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; |
96 | tk_normalize_xtime(tk); | 94 | tk_normalize_xtime(tk); |
97 | } | 95 | } |
98 | 96 | ||
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 116 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
119 | } | 117 | } |
120 | 118 | ||
119 | #ifdef CONFIG_DEBUG_TIMEKEEPING | ||
120 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | ||
121 | /* | ||
122 | * These simple flag variables are managed | ||
123 | * without locks, which is racy, but ok since | ||
124 | * we don't really care about being super | ||
125 | * precise about how many events were seen, | ||
126 | * just that a problem was observed. | ||
127 | */ | ||
128 | static int timekeeping_underflow_seen; | ||
129 | static int timekeeping_overflow_seen; | ||
130 | |||
131 | /* last_warning is only modified under the timekeeping lock */ | ||
132 | static long timekeeping_last_warning; | ||
133 | |||
134 | static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
135 | { | ||
136 | |||
137 | cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; | ||
138 | const char *name = tk->tkr_mono.clock->name; | ||
139 | |||
140 | if (offset > max_cycles) { | ||
141 | printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", | ||
142 | offset, name, max_cycles); | ||
143 | printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); | ||
144 | } else { | ||
145 | if (offset > (max_cycles >> 1)) { | ||
146 | printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", | ||
147 | offset, name, max_cycles >> 1); | ||
148 | printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | if (timekeeping_underflow_seen) { | ||
153 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
154 | printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); | ||
155 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
156 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
157 | timekeeping_last_warning = jiffies; | ||
158 | } | ||
159 | timekeeping_underflow_seen = 0; | ||
160 | } | ||
161 | |||
162 | if (timekeeping_overflow_seen) { | ||
163 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
164 | printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); | ||
165 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
166 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
167 | timekeeping_last_warning = jiffies; | ||
168 | } | ||
169 | timekeeping_overflow_seen = 0; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
174 | { | ||
175 | cycle_t now, last, mask, max, delta; | ||
176 | unsigned int seq; | ||
177 | |||
178 | /* | ||
179 | * Since we're called holding a seqlock, the data may shift | ||
180 | * under us while we're doing the calculation. This can cause | ||
181 | * false positives, since we'd note a problem but throw the | ||
182 | * results away. So nest another seqlock here to atomically | ||
183 | * grab the points we are checking with. | ||
184 | */ | ||
185 | do { | ||
186 | seq = read_seqcount_begin(&tk_core.seq); | ||
187 | now = tkr->read(tkr->clock); | ||
188 | last = tkr->cycle_last; | ||
189 | mask = tkr->mask; | ||
190 | max = tkr->clock->max_cycles; | ||
191 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
192 | |||
193 | delta = clocksource_delta(now, last, mask); | ||
194 | |||
195 | /* | ||
196 | * Try to catch underflows by checking if we are seeing small | ||
197 | * mask-relative negative values. | ||
198 | */ | ||
199 | if (unlikely((~delta & mask) < (mask >> 3))) { | ||
200 | timekeeping_underflow_seen = 1; | ||
201 | delta = 0; | ||
202 | } | ||
203 | |||
204 | /* Cap delta value to the max_cycles values to avoid mult overflows */ | ||
205 | if (unlikely(delta > max)) { | ||
206 | timekeeping_overflow_seen = 1; | ||
207 | delta = tkr->clock->max_cycles; | ||
208 | } | ||
209 | |||
210 | return delta; | ||
211 | } | ||
212 | #else | ||
213 | static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
214 | { | ||
215 | } | ||
216 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
217 | { | ||
218 | cycle_t cycle_now, delta; | ||
219 | |||
220 | /* read clocksource */ | ||
221 | cycle_now = tkr->read(tkr->clock); | ||
222 | |||
223 | /* calculate the delta since the last update_wall_time */ | ||
224 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
225 | |||
226 | return delta; | ||
227 | } | ||
228 | #endif | ||
229 | |||
121 | /** | 230 | /** |
122 | * tk_setup_internals - Set up internals to use clocksource clock. | 231 | * tk_setup_internals - Set up internals to use clocksource clock. |
123 | * | 232 | * |
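The CONFIG_DEBUG_TIMEKEEPING hunk above flags underflows by looking for "small mask-relative negative values": when the counter read lands just behind cycle_last, the masked delta wraps to a value close to the mask, so ~delta & mask becomes tiny. A standalone arithmetic sketch with a 16-bit counter and an invented max_cycles limit.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cycle_t;

/* Conceptually the same helper the kernel uses: masked counter difference. */
static cycle_t clocksource_delta_sketch(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}

int main(void)
{
	const cycle_t mask = 0xFFFF;		/* 16-bit free-running counter */
	const cycle_t max_cycles = 0x4000;	/* illustrative overflow limit */

	/* Underflow: 'now' is read slightly behind 'last'. */
	cycle_t delta = clocksource_delta_sketch(995, 1000, mask);
	printf("underflow candidate: delta=0x%llx, ~delta&mask=0x%llx, mask>>3=0x%llx\n",
	       (unsigned long long)delta,
	       (unsigned long long)(~delta & mask),
	       (unsigned long long)(mask >> 3));
	if ((~delta & mask) < (mask >> 3))
		puts(" -> small mask-relative negative value: treat as underflow, use delta = 0");

	/* Overflow: the counter ran longer than the clocksource can multiply safely. */
	delta = clocksource_delta_sketch(0x9100, 0x0100, mask);
	if (delta > max_cycles) {
		printf("overflow candidate: delta=0x%llx > max_cycles=0x%llx -> cap it\n",
		       (unsigned long long)delta, (unsigned long long)max_cycles);
		delta = max_cycles;
	}
	return 0;
}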
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
135 | u64 tmp, ntpinterval; | 244 | u64 tmp, ntpinterval; |
136 | struct clocksource *old_clock; | 245 | struct clocksource *old_clock; |
137 | 246 | ||
138 | old_clock = tk->tkr.clock; | 247 | old_clock = tk->tkr_mono.clock; |
139 | tk->tkr.clock = clock; | 248 | tk->tkr_mono.clock = clock; |
140 | tk->tkr.read = clock->read; | 249 | tk->tkr_mono.read = clock->read; |
141 | tk->tkr.mask = clock->mask; | 250 | tk->tkr_mono.mask = clock->mask; |
142 | tk->tkr.cycle_last = tk->tkr.read(clock); | 251 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); |
252 | |||
253 | tk->tkr_raw.clock = clock; | ||
254 | tk->tkr_raw.read = clock->read; | ||
255 | tk->tkr_raw.mask = clock->mask; | ||
256 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | ||
143 | 257 | ||
144 | /* Do the ns -> cycle conversion first, using original mult */ | 258 | /* Do the ns -> cycle conversion first, using original mult */ |
145 | tmp = NTP_INTERVAL_LENGTH; | 259 | tmp = NTP_INTERVAL_LENGTH; |
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
163 | if (old_clock) { | 277 | if (old_clock) { |
164 | int shift_change = clock->shift - old_clock->shift; | 278 | int shift_change = clock->shift - old_clock->shift; |
165 | if (shift_change < 0) | 279 | if (shift_change < 0) |
166 | tk->tkr.xtime_nsec >>= -shift_change; | 280 | tk->tkr_mono.xtime_nsec >>= -shift_change; |
167 | else | 281 | else |
168 | tk->tkr.xtime_nsec <<= shift_change; | 282 | tk->tkr_mono.xtime_nsec <<= shift_change; |
169 | } | 283 | } |
170 | tk->tkr.shift = clock->shift; | 284 | tk->tkr_raw.xtime_nsec = 0; |
285 | |||
286 | tk->tkr_mono.shift = clock->shift; | ||
287 | tk->tkr_raw.shift = clock->shift; | ||
171 | 288 | ||
172 | tk->ntp_error = 0; | 289 | tk->ntp_error = 0; |
173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 290 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
178 | * active clocksource. These value will be adjusted via NTP | 295 | * active clocksource. These value will be adjusted via NTP |
179 | * to counteract clock drifting. | 296 | * to counteract clock drifting. |
180 | */ | 297 | */ |
181 | tk->tkr.mult = clock->mult; | 298 | tk->tkr_mono.mult = clock->mult; |
299 | tk->tkr_raw.mult = clock->mult; | ||
182 | tk->ntp_err_mult = 0; | 300 | tk->ntp_err_mult = 0; |
183 | } | 301 | } |
184 | 302 | ||
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } | |||
193 | 311 | ||
194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 312 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) |
195 | { | 313 | { |
196 | cycle_t cycle_now, delta; | 314 | cycle_t delta; |
197 | s64 nsec; | 315 | s64 nsec; |
198 | 316 | ||
199 | /* read clocksource: */ | 317 | delta = timekeeping_get_delta(tkr); |
200 | cycle_now = tkr->read(tkr->clock); | ||
201 | |||
202 | /* calculate the delta since the last update_wall_time: */ | ||
203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
204 | 318 | ||
205 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 319 | nsec = delta * tkr->mult + tkr->xtime_nsec; |
206 | nsec >>= tkr->shift; | 320 | nsec >>= tkr->shift; |
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | |||
209 | return nsec + arch_gettimeoffset(); | 323 | return nsec + arch_gettimeoffset(); |
210 | } | 324 | } |
211 | 325 | ||
212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | ||
213 | { | ||
214 | struct clocksource *clock = tk->tkr.clock; | ||
215 | cycle_t cycle_now, delta; | ||
216 | s64 nsec; | ||
217 | |||
218 | /* read clocksource: */ | ||
219 | cycle_now = tk->tkr.read(clock); | ||
220 | |||
221 | /* calculate the delta since the last update_wall_time: */ | ||
222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | ||
223 | |||
224 | /* convert delta to nanoseconds. */ | ||
225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | ||
226 | |||
227 | /* If arch requires, add in get_arch_timeoffset() */ | ||
228 | return nsec + arch_gettimeoffset(); | ||
229 | } | ||
230 | |||
231 | /** | 326 | /** |
232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 327 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. |
233 | * @tkr: Timekeeping readout base from which we take the update | 328 | * @tkr: Timekeeping readout base from which we take the update |
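timekeeping_get_ns() above converts a cycle delta with the clocksource's fixed-point mult/shift pair and folds in the shifted nanosecond remainder kept in xtime_nsec. A worked example; the 10 MHz clocksource and its mult/shift values are invented for illustration (real pairs come from clocks_calc_mult_shift()).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t shift = 24;
	const uint32_t mult = 100u << shift;	/* 100 ns per cycle at 10 MHz */
	uint64_t xtime_nsec = 5ULL << shift;	/* 5 ns carried as shifted remainder */
	uint64_t delta = 12345;			/* cycles since cycle_last */

	/* nsec = (delta * mult + xtime_nsec) >> shift, as in timekeeping_get_ns() */
	uint64_t nsec = (delta * mult + xtime_nsec) >> shift;
	printf("%llu cycles -> %llu ns (1234500 ns plus the 5 ns remainder)\n",
	       (unsigned long long)delta, (unsigned long long)nsec);
	return 0;
}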
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
267 | * slightly wrong timestamp (a few nanoseconds). See | 362 | * slightly wrong timestamp (a few nanoseconds). See |
268 | * @ktime_get_mono_fast_ns. | 363 | * @ktime_get_mono_fast_ns. |
269 | */ | 364 | */ |
270 | static void update_fast_timekeeper(struct tk_read_base *tkr) | 365 | static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) |
271 | { | 366 | { |
272 | struct tk_read_base *base = tk_fast_mono.base; | 367 | struct tk_read_base *base = tkf->base; |
273 | 368 | ||
274 | /* Force readers off to base[1] */ | 369 | /* Force readers off to base[1] */ |
275 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 370 | raw_write_seqcount_latch(&tkf->seq); |
276 | 371 | ||
277 | /* Update base[0] */ | 372 | /* Update base[0] */ |
278 | memcpy(base, tkr, sizeof(*base)); | 373 | memcpy(base, tkr, sizeof(*base)); |
279 | 374 | ||
280 | /* Force readers back to base[0] */ | 375 | /* Force readers back to base[0] */ |
281 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 376 | raw_write_seqcount_latch(&tkf->seq); |
282 | 377 | ||
283 | /* Update base[1] */ | 378 | /* Update base[1] */ |
284 | memcpy(base + 1, base, sizeof(*base)); | 379 | memcpy(base + 1, base, sizeof(*base)); |
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) | |||
316 | * of the following timestamps. Callers need to be aware of that and | 411 | * of the following timestamps. Callers need to be aware of that and |
317 | * deal with it. | 412 | * deal with it. |
318 | */ | 413 | */ |
319 | u64 notrace ktime_get_mono_fast_ns(void) | 414 | static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) |
320 | { | 415 | { |
321 | struct tk_read_base *tkr; | 416 | struct tk_read_base *tkr; |
322 | unsigned int seq; | 417 | unsigned int seq; |
323 | u64 now; | 418 | u64 now; |
324 | 419 | ||
325 | do { | 420 | do { |
326 | seq = raw_read_seqcount(&tk_fast_mono.seq); | 421 | seq = raw_read_seqcount(&tkf->seq); |
327 | tkr = tk_fast_mono.base + (seq & 0x01); | 422 | tkr = tkf->base + (seq & 0x01); |
328 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | 423 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); |
424 | } while (read_seqcount_retry(&tkf->seq, seq)); | ||
329 | 425 | ||
330 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
331 | return now; | 426 | return now; |
332 | } | 427 | } |
428 | |||
429 | u64 ktime_get_mono_fast_ns(void) | ||
430 | { | ||
431 | return __ktime_get_fast_ns(&tk_fast_mono); | ||
432 | } | ||
333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 433 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); |
334 | 434 | ||
435 | u64 ktime_get_raw_fast_ns(void) | ||
436 | { | ||
437 | return __ktime_get_fast_ns(&tk_fast_raw); | ||
438 | } | ||
439 | EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); | ||
440 | |||
335 | /* Suspend-time cycles value for halted fast timekeeper. */ | 441 | /* Suspend-time cycles value for halted fast timekeeper. */ |
336 | static cycle_t cycles_at_suspend; | 442 | static cycle_t cycles_at_suspend; |
337 | 443 | ||
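update_fast_timekeeper() and __ktime_get_fast_ns() above implement a two-copy seqcount latch: the writer bumps the sequence to an odd value, updates base[0], bumps it back to even, then copies into base[1]; an NMI-safe reader picks base[seq & 1] and retries if the sequence moved. A single-threaded user-space sketch of just the index steering; the real code additionally needs the memory barriers provided by raw_write_seqcount_latch().

#include <stdio.h>
#include <stdint.h>

struct snapshot { uint64_t base_ns; };

static unsigned int seq;		/* even: base[0] is stable, odd: base[1] */
static struct snapshot base[2];

static void writer_update(uint64_t new_base_ns)
{
	seq++;				/* force readers off to base[1] */
	base[0].base_ns = new_base_ns;	/* update base[0] */
	seq++;				/* force readers back to base[0] */
	base[1] = base[0];		/* update base[1] */
}

static uint64_t reader_get(void)
{
	unsigned int s;
	uint64_t v;

	do {
		s = seq;
		v = base[s & 0x01].base_ns;	/* pick the copy not being written */
	} while (s != seq);			/* retry if the writer was active */
	return v;
}

int main(void)
{
	writer_update(1000);
	printf("read: %llu\n", (unsigned long long)reader_get());
	writer_update(2000);
	printf("read: %llu\n", (unsigned long long)reader_get());
	return 0;
}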
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs) | |||
353 | static void halt_fast_timekeeper(struct timekeeper *tk) | 459 | static void halt_fast_timekeeper(struct timekeeper *tk) |
354 | { | 460 | { |
355 | static struct tk_read_base tkr_dummy; | 461 | static struct tk_read_base tkr_dummy; |
356 | struct tk_read_base *tkr = &tk->tkr; | 462 | struct tk_read_base *tkr = &tk->tkr_mono; |
357 | 463 | ||
358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 464 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); |
359 | cycles_at_suspend = tkr->read(tkr->clock); | 465 | cycles_at_suspend = tkr->read(tkr->clock); |
360 | tkr_dummy.read = dummy_clock_read; | 466 | tkr_dummy.read = dummy_clock_read; |
361 | update_fast_timekeeper(&tkr_dummy); | 467 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); |
468 | |||
469 | tkr = &tk->tkr_raw; | ||
470 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
471 | tkr_dummy.read = dummy_clock_read; | ||
472 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | ||
362 | } | 473 | } |
363 | 474 | ||
364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 475 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD |
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) | |||
369 | 480 | ||
370 | xt = timespec64_to_timespec(tk_xtime(tk)); | 481 | xt = timespec64_to_timespec(tk_xtime(tk)); |
371 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 482 | wm = timespec64_to_timespec(tk->wall_to_monotonic); |
372 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | 483 | update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, |
373 | tk->tkr.cycle_last); | 484 | tk->tkr_mono.cycle_last); |
374 | } | 485 | } |
375 | 486 | ||
376 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 487 | static inline void old_vsyscall_fixup(struct timekeeper *tk) |
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
387 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 498 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD |
388 | * users are removed, this can be killed. | 499 | * users are removed, this can be killed. |
389 | */ | 500 | */ |
390 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | 501 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); |
391 | tk->tkr.xtime_nsec -= remainder; | 502 | tk->tkr_mono.xtime_nsec -= remainder; |
392 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | 503 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; |
393 | tk->ntp_error += remainder << tk->ntp_error_shift; | 504 | tk->ntp_error += remainder << tk->ntp_error_shift; |
394 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | 505 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; |
395 | } | 506 | } |
396 | #else | 507 | #else |
397 | #define old_vsyscall_fixup(tk) | 508 | #define old_vsyscall_fixup(tk) |
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
456 | */ | 567 | */ |
457 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 568 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); |
458 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 569 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
459 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 570 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
460 | 571 | ||
461 | /* Update the monotonic raw base */ | 572 | /* Update the monotonic raw base */ |
462 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 573 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); |
463 | 574 | ||
464 | /* | 575 | /* |
465 | * The sum of the nanoseconds portions of xtime and | 576 | * The sum of the nanoseconds portions of xtime and |
466 | * wall_to_monotonic can be greater/equal one second. Take | 577 | * wall_to_monotonic can be greater/equal one second. Take |
467 | * this into account before updating tk->ktime_sec. | 578 | * this into account before updating tk->ktime_sec. |
468 | */ | 579 | */ |
469 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 580 | nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
470 | if (nsec >= NSEC_PER_SEC) | 581 | if (nsec >= NSEC_PER_SEC) |
471 | seconds++; | 582 | seconds++; |
472 | tk->ktime_sec = seconds; | 583 | tk->ktime_sec = seconds; |
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 600 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
490 | sizeof(tk_core.timekeeper)); | 601 | sizeof(tk_core.timekeeper)); |
491 | 602 | ||
492 | update_fast_timekeeper(&tk->tkr); | 603 | update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); |
604 | update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); | ||
493 | } | 605 | } |
494 | 606 | ||
495 | /** | 607 | /** |
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
501 | */ | 613 | */ |
502 | static void timekeeping_forward_now(struct timekeeper *tk) | 614 | static void timekeeping_forward_now(struct timekeeper *tk) |
503 | { | 615 | { |
504 | struct clocksource *clock = tk->tkr.clock; | 616 | struct clocksource *clock = tk->tkr_mono.clock; |
505 | cycle_t cycle_now, delta; | 617 | cycle_t cycle_now, delta; |
506 | s64 nsec; | 618 | s64 nsec; |
507 | 619 | ||
508 | cycle_now = tk->tkr.read(clock); | 620 | cycle_now = tk->tkr_mono.read(clock); |
509 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | 621 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
510 | tk->tkr.cycle_last = cycle_now; | 622 | tk->tkr_mono.cycle_last = cycle_now; |
623 | tk->tkr_raw.cycle_last = cycle_now; | ||
511 | 624 | ||
512 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; | 625 | tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; |
513 | 626 | ||
514 | /* If arch requires, add in get_arch_timeoffset() */ | 627 | /* If arch requires, add in get_arch_timeoffset() */ |
515 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; | 628 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; |
516 | 629 | ||
517 | tk_normalize_xtime(tk); | 630 | tk_normalize_xtime(tk); |
518 | 631 | ||
519 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | 632 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); |
520 | timespec64_add_ns(&tk->raw_time, nsec); | 633 | timespec64_add_ns(&tk->raw_time, nsec); |
521 | } | 634 | } |
522 | 635 | ||
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) | |||
537 | seq = read_seqcount_begin(&tk_core.seq); | 650 | seq = read_seqcount_begin(&tk_core.seq); |
538 | 651 | ||
539 | ts->tv_sec = tk->xtime_sec; | 652 | ts->tv_sec = tk->xtime_sec; |
540 | nsecs = timekeeping_get_ns(&tk->tkr); | 653 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
541 | 654 | ||
542 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 655 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
543 | 656 | ||
@@ -577,8 +690,8 @@ ktime_t ktime_get(void) | |||
577 | 690 | ||
578 | do { | 691 | do { |
579 | seq = read_seqcount_begin(&tk_core.seq); | 692 | seq = read_seqcount_begin(&tk_core.seq); |
580 | base = tk->tkr.base_mono; | 693 | base = tk->tkr_mono.base; |
581 | nsecs = timekeeping_get_ns(&tk->tkr); | 694 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
582 | 695 | ||
583 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 696 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
584 | 697 | ||
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) | |||
603 | 716 | ||
604 | do { | 717 | do { |
605 | seq = read_seqcount_begin(&tk_core.seq); | 718 | seq = read_seqcount_begin(&tk_core.seq); |
606 | base = ktime_add(tk->tkr.base_mono, *offset); | 719 | base = ktime_add(tk->tkr_mono.base, *offset); |
607 | nsecs = timekeeping_get_ns(&tk->tkr); | 720 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
608 | 721 | ||
609 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 722 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
610 | 723 | ||
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void) | |||
645 | 758 | ||
646 | do { | 759 | do { |
647 | seq = read_seqcount_begin(&tk_core.seq); | 760 | seq = read_seqcount_begin(&tk_core.seq); |
648 | base = tk->base_raw; | 761 | base = tk->tkr_raw.base; |
649 | nsecs = timekeeping_get_ns_raw(tk); | 762 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
650 | 763 | ||
651 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 764 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
652 | 765 | ||
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
674 | do { | 787 | do { |
675 | seq = read_seqcount_begin(&tk_core.seq); | 788 | seq = read_seqcount_begin(&tk_core.seq); |
676 | ts->tv_sec = tk->xtime_sec; | 789 | ts->tv_sec = tk->xtime_sec; |
677 | nsec = timekeeping_get_ns(&tk->tkr); | 790 | nsec = timekeeping_get_ns(&tk->tkr_mono); |
678 | tomono = tk->wall_to_monotonic; | 791 | tomono = tk->wall_to_monotonic; |
679 | 792 | ||
680 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 793 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
759 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; |
760 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; |
761 | 874 | ||
762 | nsecs_raw = timekeeping_get_ns_raw(tk); | 875 | nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); |
763 | nsecs_real = timekeeping_get_ns(&tk->tkr); | 876 | nsecs_real = timekeeping_get_ns(&tk->tkr_mono); |
764 | 877 | ||
765 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
766 | 879 | ||
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data) | |||
943 | */ | 1056 | */ |
944 | if (try_module_get(new->owner)) { | 1057 | if (try_module_get(new->owner)) { |
945 | if (!new->enable || new->enable(new) == 0) { | 1058 | if (!new->enable || new->enable(new) == 0) { |
946 | old = tk->tkr.clock; | 1059 | old = tk->tkr_mono.clock; |
947 | tk_setup_internals(tk, new); | 1060 | tk_setup_internals(tk, new); |
948 | if (old->disable) | 1061 | if (old->disable) |
949 | old->disable(old); | 1062 | old->disable(old); |
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) | |||
971 | { | 1084 | { |
972 | struct timekeeper *tk = &tk_core.timekeeper; | 1085 | struct timekeeper *tk = &tk_core.timekeeper; |
973 | 1086 | ||
974 | if (tk->tkr.clock == clock) | 1087 | if (tk->tkr_mono.clock == clock) |
975 | return 0; | 1088 | return 0; |
976 | stop_machine(change_clocksource, clock, NULL); | 1089 | stop_machine(change_clocksource, clock, NULL); |
977 | tick_clock_notify(); | 1090 | tick_clock_notify(); |
978 | return tk->tkr.clock == clock ? 0 : -1; | 1091 | return tk->tkr_mono.clock == clock ? 0 : -1; |
979 | } | 1092 | } |
980 | 1093 | ||
981 | /** | 1094 | /** |
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) | |||
993 | 1106 | ||
994 | do { | 1107 | do { |
995 | seq = read_seqcount_begin(&tk_core.seq); | 1108 | seq = read_seqcount_begin(&tk_core.seq); |
996 | nsecs = timekeeping_get_ns_raw(tk); | 1109 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
997 | ts64 = tk->raw_time; | 1110 | ts64 = tk->raw_time; |
998 | 1111 | ||
999 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1112 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void) | |||
1016 | do { | 1129 | do { |
1017 | seq = read_seqcount_begin(&tk_core.seq); | 1130 | seq = read_seqcount_begin(&tk_core.seq); |
1018 | 1131 | ||
1019 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 1132 | ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
1020 | 1133 | ||
1021 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1134 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1022 | 1135 | ||
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void) | |||
1035 | do { | 1148 | do { |
1036 | seq = read_seqcount_begin(&tk_core.seq); | 1149 | seq = read_seqcount_begin(&tk_core.seq); |
1037 | 1150 | ||
1038 | ret = tk->tkr.clock->max_idle_ns; | 1151 | ret = tk->tkr_mono.clock->max_idle_ns; |
1039 | 1152 | ||
1040 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1153 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1041 | 1154 | ||
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) | |||
1057 | ts->tv_nsec = 0; | 1170 | ts->tv_nsec = 0; |
1058 | } | 1171 | } |
1059 | 1172 | ||
1173 | void __weak read_persistent_clock64(struct timespec64 *ts64) | ||
1174 | { | ||
1175 | struct timespec ts; | ||
1176 | |||
1177 | read_persistent_clock(&ts); | ||
1178 | *ts64 = timespec_to_timespec64(ts); | ||
1179 | } | ||
1180 | |||
1060 | /** | 1181 | /** |
1061 | * read_boot_clock - Return time of the system start. | 1182 | * read_boot_clock - Return time of the system start. |
1062 | * | 1183 | * |
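The __weak read_persistent_clock64() added above is a conversion shim: by default it forwards to the legacy 32-bit read_persistent_clock() and widens the result, while an architecture that already has a 64-bit reader can override it with a strong symbol. A user-space sketch of the same weak-default pattern using GCC/Clang's __attribute__((weak)); the *_sketch names and the fixed RTC value are made up.

#include <stdio.h>

struct ts32 { long sec; long nsec; };
struct ts64 { long long sec; long nsec; };

/* Legacy narrow interface, kept for "architectures" not yet converted. */
static void read_persistent_clock_sketch(struct ts32 *ts)
{
	ts->sec = 1700000000L;	/* made-up RTC reading */
	ts->nsec = 0;
}

/*
 * Weak default: forward to the 32-bit reader and widen the result.
 * A strong read_persistent_clock64_sketch() defined elsewhere would
 * replace this fallback at link time.
 */
__attribute__((weak)) void read_persistent_clock64_sketch(struct ts64 *ts64)
{
	struct ts32 ts;

	read_persistent_clock_sketch(&ts);
	ts64->sec = ts.sec;
	ts64->nsec = ts.nsec;
}

int main(void)
{
	struct ts64 now;

	read_persistent_clock64_sketch(&now);
	printf("persistent clock: %lld.%09ld\n", now.sec, now.nsec);
	return 0;
}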
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) | |||
1072 | ts->tv_nsec = 0; | 1193 | ts->tv_nsec = 0; |
1073 | } | 1194 | } |
1074 | 1195 | ||
1196 | void __weak read_boot_clock64(struct timespec64 *ts64) | ||
1197 | { | ||
1198 | struct timespec ts; | ||
1199 | |||
1200 | read_boot_clock(&ts); | ||
1201 | *ts64 = timespec_to_timespec64(ts); | ||
1202 | } | ||
1203 | |||
1204 | /* Flag for if timekeeping_resume() has injected sleeptime */ | ||
1205 | static bool sleeptime_injected; | ||
1206 | |||
1207 | /* Flag for if there is a persistent clock on this platform */ | ||
1208 | static bool persistent_clock_exists; | ||
1209 | |||
1075 | /* | 1210 | /* |
1076 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 1211 | * timekeeping_init - Initializes the clocksource and common timekeeping values |
1077 | */ | 1212 | */ |
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void) | |||
1081 | struct clocksource *clock; | 1216 | struct clocksource *clock; |
1082 | unsigned long flags; | 1217 | unsigned long flags; |
1083 | struct timespec64 now, boot, tmp; | 1218 | struct timespec64 now, boot, tmp; |
1084 | struct timespec ts; | ||
1085 | 1219 | ||
1086 | read_persistent_clock(&ts); | 1220 | read_persistent_clock64(&now); |
1087 | now = timespec_to_timespec64(ts); | ||
1088 | if (!timespec64_valid_strict(&now)) { | 1221 | if (!timespec64_valid_strict(&now)) { |
1089 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1222 | pr_warn("WARNING: Persistent clock returned invalid value!\n" |
1090 | " Check your CMOS/BIOS settings.\n"); | 1223 | " Check your CMOS/BIOS settings.\n"); |
1091 | now.tv_sec = 0; | 1224 | now.tv_sec = 0; |
1092 | now.tv_nsec = 0; | 1225 | now.tv_nsec = 0; |
1093 | } else if (now.tv_sec || now.tv_nsec) | 1226 | } else if (now.tv_sec || now.tv_nsec) |
1094 | persistent_clock_exist = true; | 1227 | persistent_clock_exists = true; |
1095 | 1228 | ||
1096 | read_boot_clock(&ts); | 1229 | read_boot_clock64(&boot); |
1097 | boot = timespec_to_timespec64(ts); | ||
1098 | if (!timespec64_valid_strict(&boot)) { | 1230 | if (!timespec64_valid_strict(&boot)) { |
1099 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1231 | pr_warn("WARNING: Boot clock returned invalid value!\n" |
1100 | " Check your CMOS/BIOS settings.\n"); | 1232 | " Check your CMOS/BIOS settings.\n"); |
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void) | |||
1114 | tk_set_xtime(tk, &now); | 1246 | tk_set_xtime(tk, &now); |
1115 | tk->raw_time.tv_sec = 0; | 1247 | tk->raw_time.tv_sec = 0; |
1116 | tk->raw_time.tv_nsec = 0; | 1248 | tk->raw_time.tv_nsec = 0; |
1117 | tk->base_raw.tv64 = 0; | ||
1118 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1249 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
1119 | boot = tk_xtime(tk); | 1250 | boot = tk_xtime(tk); |
1120 | 1251 | ||
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void) | |||
1127 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1258 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1128 | } | 1259 | } |
1129 | 1260 | ||
1130 | /* time in seconds when suspend began */ | 1261 | /* time in seconds when suspend began for persistent clock */ |
1131 | static struct timespec64 timekeeping_suspend_time; | 1262 | static struct timespec64 timekeeping_suspend_time; |
1132 | 1263 | ||
1133 | /** | 1264 | /** |
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
1152 | tk_debug_account_sleep_time(delta); | 1283 | tk_debug_account_sleep_time(delta); |
1153 | } | 1284 | } |
1154 | 1285 | ||
1286 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) | ||
1287 | /** | ||
1288 | * We have three kinds of time sources to use for sleep time | ||
1289 | * injection, the preference order is: | ||
1290 | * 1) non-stop clocksource | ||
1291 | * 2) persistent clock (ie: RTC accessible when irqs are off) | ||
1292 | * 3) RTC | ||
1293 | * | ||
1294 | * 1) and 2) are used by timekeeping, 3) by RTC subsystem. | ||
1295 | * If system has neither 1) nor 2), 3) will be used finally. | ||
1296 | * | ||
1297 | * | ||
1298 | * If timekeeping has injected sleeptime via either 1) or 2), | ||
1299 | * 3) becomes needless, so in this case we don't need to call | ||
1300 | * rtc_resume(), and this is what timekeeping_rtc_skipresume() | ||
1301 | * means. | ||
1302 | */ | ||
1303 | bool timekeeping_rtc_skipresume(void) | ||
1304 | { | ||
1305 | return sleeptime_injected; | ||
1306 | } | ||
1307 | |||
1308 | /** | ||
1309 | * Whether 1) can be used is only known in timekeeping_resume(), | ||
1310 | * which is invoked after rtc_suspend(), so rtc_suspend() cannot | ||
1311 | * be skipped safely just because the system has 1). | ||
1312 | * | ||
1313 | * But if system has 2), 2) will definitely be used, so in this | ||
1314 | * case we don't need to call rtc_suspend(), and this is what | ||
1315 | * timekeeping_rtc_skipsuspend() means. | ||
1316 | */ | ||
1317 | bool timekeeping_rtc_skipsuspend(void) | ||
1318 | { | ||
1319 | return persistent_clock_exists; | ||
1320 | } | ||
1321 | |||
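The preference order spelled out above reduces to a small decision at resume time. A minimal standalone sketch of that decision (the flag and helper names below are illustrative stand-ins, not the kernel's internal API):

    #include <stdbool.h>

    /* Illustrative state flags (assumptions, not the real kernel variables). */
    static bool nonstop_clocksource_usable;   /* 1) kept counting across suspend  */
    static bool persistent_clock_present;     /* 2) RTC readable with irqs off    */

    enum sleeptime_source { SRC_NONSTOP, SRC_PERSISTENT, SRC_RTC };

    /* Which source ends up accounting the time slept? */
    static enum sleeptime_source pick_sleeptime_source(void)
    {
            if (nonstop_clocksource_usable)
                    return SRC_NONSTOP;      /* handled in timekeeping_resume()        */
            if (persistent_clock_present)
                    return SRC_PERSISTENT;   /* also handled in timekeeping_resume()   */
            return SRC_RTC;                  /* rtc_resume() injects it as a fallback  */
    }

When either of the first two branches applies, sleeptime_injected ends up true and timekeeping_rtc_skipresume() tells rtc_resume() to stay out of the way.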
1155 | /** | 1322 | /** |
1156 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 1323 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values |
1157 | * @delta: pointer to a timespec64 delta value | 1324 | * @delta: pointer to a timespec64 delta value |
1158 | * | 1325 | * |
1159 | * This hook is for architectures that cannot support read_persistent_clock | 1326 | * This hook is for architectures that cannot support read_persistent_clock64 |
1160 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1327 | * because their RTC/persistent clock is only accessible when irqs are enabled. |
1328 | * and also don't have an effective nonstop clocksource. | ||
1161 | * | 1329 | * |
1162 | * This function should only be called by rtc_resume(), and allows | 1330 | * This function should only be called by rtc_resume(), and allows |
1163 | * a suspend offset to be injected into the timekeeping values. | 1331 | * a suspend offset to be injected into the timekeeping values. |
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1167 | struct timekeeper *tk = &tk_core.timekeeper; | 1335 | struct timekeeper *tk = &tk_core.timekeeper; |
1168 | unsigned long flags; | 1336 | unsigned long flags; |
1169 | 1337 | ||
1170 | /* | ||
1171 | * Make sure we don't set the clock twice, as timekeeping_resume() | ||
1172 | * already did it | ||
1173 | */ | ||
1174 | if (has_persistent_clock()) | ||
1175 | return; | ||
1176 | |||
1177 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1338 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1178 | write_seqcount_begin(&tk_core.seq); | 1339 | write_seqcount_begin(&tk_core.seq); |
1179 | 1340 | ||
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1189 | /* signal hrtimers about time change */ | 1350 | /* signal hrtimers about time change */ |
1190 | clock_was_set(); | 1351 | clock_was_set(); |
1191 | } | 1352 | } |
1353 | #endif | ||
1192 | 1354 | ||
1193 | /** | 1355 | /** |
1194 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 1356 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
1195 | * | ||
1196 | * This is for the generic clocksource timekeeping. | ||
1197 | * xtime/wall_to_monotonic/jiffies/etc are | ||
1198 | * still managed by arch specific suspend/resume code. | ||
1199 | */ | 1357 | */ |
1200 | void timekeeping_resume(void) | 1358 | void timekeeping_resume(void) |
1201 | { | 1359 | { |
1202 | struct timekeeper *tk = &tk_core.timekeeper; | 1360 | struct timekeeper *tk = &tk_core.timekeeper; |
1203 | struct clocksource *clock = tk->tkr.clock; | 1361 | struct clocksource *clock = tk->tkr_mono.clock; |
1204 | unsigned long flags; | 1362 | unsigned long flags; |
1205 | struct timespec64 ts_new, ts_delta; | 1363 | struct timespec64 ts_new, ts_delta; |
1206 | struct timespec tmp; | ||
1207 | cycle_t cycle_now, cycle_delta; | 1364 | cycle_t cycle_now, cycle_delta; |
1208 | bool suspendtime_found = false; | ||
1209 | 1365 | ||
1210 | read_persistent_clock(&tmp); | 1366 | sleeptime_injected = false; |
1211 | ts_new = timespec_to_timespec64(tmp); | 1367 | read_persistent_clock64(&ts_new); |
1212 | 1368 | ||
1213 | clockevents_resume(); | 1369 | clockevents_resume(); |
1214 | clocksource_resume(); | 1370 | clocksource_resume(); |
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void) | |||
1228 | * The less preferred source will only be tried if there is no better | 1384 | * The less preferred source will only be tried if there is no better |
1229 | * usable source. The rtc part is handled separately in rtc core code. | 1385 | * usable source. The rtc part is handled separately in rtc core code. |
1230 | */ | 1386 | */ |
1231 | cycle_now = tk->tkr.read(clock); | 1387 | cycle_now = tk->tkr_mono.read(clock); |
1232 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1388 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && |
1233 | cycle_now > tk->tkr.cycle_last) { | 1389 | cycle_now > tk->tkr_mono.cycle_last) { |
1234 | u64 num, max = ULLONG_MAX; | 1390 | u64 num, max = ULLONG_MAX; |
1235 | u32 mult = clock->mult; | 1391 | u32 mult = clock->mult; |
1236 | u32 shift = clock->shift; | 1392 | u32 shift = clock->shift; |
1237 | s64 nsec = 0; | 1393 | s64 nsec = 0; |
1238 | 1394 | ||
1239 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, | 1395 | cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, |
1240 | tk->tkr.mask); | 1396 | tk->tkr_mono.mask); |
1241 | 1397 | ||
1242 | /* | 1398 | /* |
1243 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1399 | * "cycle_delta * mutl" may cause 64 bits overflow, if the |
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void) | |||
1253 | nsec += ((u64) cycle_delta * mult) >> shift; | 1409 | nsec += ((u64) cycle_delta * mult) >> shift; |
1254 | 1410 | ||
1255 | ts_delta = ns_to_timespec64(nsec); | 1411 | ts_delta = ns_to_timespec64(nsec); |
1256 | suspendtime_found = true; | 1412 | sleeptime_injected = true; |
1257 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1413 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { |
1258 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 1414 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); |
1259 | suspendtime_found = true; | 1415 | sleeptime_injected = true; |
1260 | } | 1416 | } |
1261 | 1417 | ||
1262 | if (suspendtime_found) | 1418 | if (sleeptime_injected) |
1263 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1419 | __timekeeping_inject_sleeptime(tk, &ts_delta); |
1264 | 1420 | ||
1265 | /* Re-base the last cycle value */ | 1421 | /* Re-base the last cycle value */ |
1266 | tk->tkr.cycle_last = cycle_now; | 1422 | tk->tkr_mono.cycle_last = cycle_now; |
1423 | tk->tkr_raw.cycle_last = cycle_now; | ||
1424 | |||
1267 | tk->ntp_error = 0; | 1425 | tk->ntp_error = 0; |
1268 | timekeeping_suspended = 0; | 1426 | timekeeping_suspended = 0; |
1269 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1427 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
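The hunk above keeps the nonstop-clocksource math split up because, as the comment notes, "cycle_delta * mult" can overflow 64 bits when the suspend is long. A hedged back-of-the-envelope check of how quickly that happens (the 1 GHz counter and shift value of 24 are illustrative assumptions, not taken from any particular clocksource):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* ns = (cycles * mult) >> shift; for a 1 GHz counter with shift = 24,
             * mult is roughly 2^24 since one cycle equals one nanosecond. */
            uint64_t mult = (uint64_t)1 << 24;
            uint64_t max_cycles = UINT64_MAX / mult;       /* ~2^40 cycles      */
            double secs = (double)max_cycles / 1e9;        /* counter at 1 GHz  */

            printf("naive product overflows after ~%.0f s (~%.0f min) of suspend\n",
                   secs, secs / 60.0);
            return 0;
    }

That is roughly 18 minutes of suspend for these assumed values, which is the scenario the careful 64-bit math in timekeeping_resume() guards against.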
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void) | |||
1272 | 1430 | ||
1273 | touch_softlockup_watchdog(); | 1431 | touch_softlockup_watchdog(); |
1274 | 1432 | ||
1275 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 1433 | tick_resume(); |
1276 | |||
1277 | /* Resume hrtimers */ | ||
1278 | hrtimers_resume(); | 1434 | hrtimers_resume(); |
1279 | } | 1435 | } |
1280 | 1436 | ||
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void) | |||
1284 | unsigned long flags; | 1440 | unsigned long flags; |
1285 | struct timespec64 delta, delta_delta; | 1441 | struct timespec64 delta, delta_delta; |
1286 | static struct timespec64 old_delta; | 1442 | static struct timespec64 old_delta; |
1287 | struct timespec tmp; | ||
1288 | 1443 | ||
1289 | read_persistent_clock(&tmp); | 1444 | read_persistent_clock64(&timekeeping_suspend_time); |
1290 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
1291 | 1445 | ||
1292 | /* | 1446 | /* |
1293 | * On some systems the persistent_clock can not be detected at | 1447 | * On some systems the persistent_clock can not be detected at |
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void) | |||
1295 | * value returned, update the persistent_clock_exists flag. | 1449 | * value returned, update the persistent_clock_exists flag. |
1296 | */ | 1450 | */ |
1297 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 1451 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) |
1298 | persistent_clock_exist = true; | 1452 | persistent_clock_exists = true; |
1299 | 1453 | ||
1300 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1454 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1301 | write_seqcount_begin(&tk_core.seq); | 1455 | write_seqcount_begin(&tk_core.seq); |
1302 | timekeeping_forward_now(tk); | 1456 | timekeeping_forward_now(tk); |
1303 | timekeeping_suspended = 1; | 1457 | timekeeping_suspended = 1; |
1304 | 1458 | ||
1305 | /* | 1459 | if (persistent_clock_exists) { |
1306 | * To avoid drift caused by repeated suspend/resumes, | ||
1307 | * which each can add ~1 second drift error, | ||
1308 | * try to compensate so the difference in system time | ||
1309 | * and persistent_clock time stays close to constant. | ||
1310 | */ | ||
1311 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | ||
1312 | delta_delta = timespec64_sub(delta, old_delta); | ||
1313 | if (abs(delta_delta.tv_sec) >= 2) { | ||
1314 | /* | 1460 | /* |
1315 | * if delta_delta is too large, assume time correction | 1461 | * To avoid drift caused by repeated suspend/resumes, |
1316 | * has occured and set old_delta to the current delta. | 1462 | * which each can add ~1 second drift error, |
1463 | * try to compensate so the difference in system time | ||
1464 | * and persistent_clock time stays close to constant. | ||
1317 | */ | 1465 | */ |
1318 | old_delta = delta; | 1466 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); |
1319 | } else { | 1467 | delta_delta = timespec64_sub(delta, old_delta); |
1320 | /* Otherwise try to adjust old_system to compensate */ | 1468 | if (abs(delta_delta.tv_sec) >= 2) { |
1321 | timekeeping_suspend_time = | 1469 | /* |
1322 | timespec64_add(timekeeping_suspend_time, delta_delta); | 1470 | * if delta_delta is too large, assume time correction |
1471 | * has occurred and set old_delta to the current delta. | ||
1472 | */ | ||
1473 | old_delta = delta; | ||
1474 | } else { | ||
1475 | /* Otherwise try to adjust old_system to compensate */ | ||
1476 | timekeeping_suspend_time = | ||
1477 | timespec64_add(timekeeping_suspend_time, delta_delta); | ||
1478 | } | ||
1323 | } | 1479 | } |
1324 | 1480 | ||
1325 | timekeeping_update(tk, TK_MIRROR); | 1481 | timekeeping_update(tk, TK_MIRROR); |
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void) | |||
1327 | write_seqcount_end(&tk_core.seq); | 1483 | write_seqcount_end(&tk_core.seq); |
1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1484 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1329 | 1485 | ||
1330 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1486 | tick_suspend(); |
1331 | clocksource_suspend(); | 1487 | clocksource_suspend(); |
1332 | clockevents_suspend(); | 1488 | clockevents_suspend(); |
1333 | 1489 | ||
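The suspend-side drift compensation above is easier to see with the locking and struct plumbing stripped away. A whole-second sketch of the same idea (simplified to long long seconds; a sketch, not the kernel code):

    #include <stdlib.h>

    static long long old_delta_sec;   /* remembered across suspend cycles */

    static long long compensate_suspend_time(long long xtime_sec,
                                             long long suspend_time_sec)
    {
            long long delta = xtime_sec - suspend_time_sec;   /* system - persistent clock   */
            long long delta_delta = delta - old_delta_sec;    /* change since the last suspend */

            if (llabs(delta_delta) >= 2) {
                    /* A jump this big means the system clock was stepped (NTP, date -s);
                     * rebase rather than "compensating" a deliberate correction away. */
                    old_delta_sec = delta;
            } else {
                    /* Small jitter: fold it into the recorded suspend timestamp so the
                     * resume path does not keep re-adding it as sleep time. */
                    suspend_time_sec += delta_delta;
            }
            return suspend_time_sec;
    }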
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
1416 | * | 1572 | * |
1417 | * XXX - TODO: Doc ntp_error calculation. | 1573 | * XXX - TODO: Doc ntp_error calculation. |
1418 | */ | 1574 | */ |
1419 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | 1575 | if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { |
1420 | /* NTP adjustment caused clocksource mult overflow */ | 1576 | /* NTP adjustment caused clocksource mult overflow */ |
1421 | WARN_ON_ONCE(1); | 1577 | WARN_ON_ONCE(1); |
1422 | return; | 1578 | return; |
1423 | } | 1579 | } |
1424 | 1580 | ||
1425 | tk->tkr.mult += mult_adj; | 1581 | tk->tkr_mono.mult += mult_adj; |
1426 | tk->xtime_interval += interval; | 1582 | tk->xtime_interval += interval; |
1427 | tk->tkr.xtime_nsec -= offset; | 1583 | tk->tkr_mono.xtime_nsec -= offset; |
1428 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1584 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
1429 | } | 1585 | } |
1430 | 1586 | ||
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1486 | tk->ntp_err_mult = 0; | 1642 | tk->ntp_err_mult = 0; |
1487 | } | 1643 | } |
1488 | 1644 | ||
1489 | if (unlikely(tk->tkr.clock->maxadj && | 1645 | if (unlikely(tk->tkr_mono.clock->maxadj && |
1490 | (abs(tk->tkr.mult - tk->tkr.clock->mult) | 1646 | (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) |
1491 | > tk->tkr.clock->maxadj))) { | 1647 | > tk->tkr_mono.clock->maxadj))) { |
1492 | printk_once(KERN_WARNING | 1648 | printk_once(KERN_WARNING |
1493 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1649 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
1494 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1650 | tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, |
1495 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | 1651 | (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); |
1496 | } | 1652 | } |
1497 | 1653 | ||
1498 | /* | 1654 | /* |
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1509 | * We'll correct this error next time through this function, when | 1665 | * We'll correct this error next time through this function, when |
1510 | * xtime_nsec is not as small. | 1666 | * xtime_nsec is not as small. |
1511 | */ | 1667 | */ |
1512 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { | 1668 | if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { |
1513 | s64 neg = -(s64)tk->tkr.xtime_nsec; | 1669 | s64 neg = -(s64)tk->tkr_mono.xtime_nsec; |
1514 | tk->tkr.xtime_nsec = 0; | 1670 | tk->tkr_mono.xtime_nsec = 0; |
1515 | tk->ntp_error += neg << tk->ntp_error_shift; | 1671 | tk->ntp_error += neg << tk->ntp_error_shift; |
1516 | } | 1672 | } |
1517 | } | 1673 | } |
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1526 | */ | 1682 | */ |
1527 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1683 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1528 | { | 1684 | { |
1529 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; | 1685 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
1530 | unsigned int clock_set = 0; | 1686 | unsigned int clock_set = 0; |
1531 | 1687 | ||
1532 | while (tk->tkr.xtime_nsec >= nsecps) { | 1688 | while (tk->tkr_mono.xtime_nsec >= nsecps) { |
1533 | int leap; | 1689 | int leap; |
1534 | 1690 | ||
1535 | tk->tkr.xtime_nsec -= nsecps; | 1691 | tk->tkr_mono.xtime_nsec -= nsecps; |
1536 | tk->xtime_sec++; | 1692 | tk->xtime_sec++; |
1537 | 1693 | ||
1538 | /* Figure out if its a leap sec and apply if needed */ | 1694 | /* Figure out if its a leap sec and apply if needed */ |
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1577 | 1733 | ||
1578 | /* Accumulate one shifted interval */ | 1734 | /* Accumulate one shifted interval */ |
1579 | offset -= interval; | 1735 | offset -= interval; |
1580 | tk->tkr.cycle_last += interval; | 1736 | tk->tkr_mono.cycle_last += interval; |
1737 | tk->tkr_raw.cycle_last += interval; | ||
1581 | 1738 | ||
1582 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; | 1739 | tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; |
1583 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1740 | *clock_set |= accumulate_nsecs_to_secs(tk); |
1584 | 1741 | ||
1585 | /* Accumulate raw time */ | 1742 | /* Accumulate raw time */ |
@@ -1622,14 +1779,17 @@ void update_wall_time(void) | |||
1622 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1779 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1623 | offset = real_tk->cycle_interval; | 1780 | offset = real_tk->cycle_interval; |
1624 | #else | 1781 | #else |
1625 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), | 1782 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), |
1626 | tk->tkr.cycle_last, tk->tkr.mask); | 1783 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
1627 | #endif | 1784 | #endif |
1628 | 1785 | ||
1629 | /* Check if there's really nothing to do */ | 1786 | /* Check if there's really nothing to do */ |
1630 | if (offset < real_tk->cycle_interval) | 1787 | if (offset < real_tk->cycle_interval) |
1631 | goto out; | 1788 | goto out; |
1632 | 1789 | ||
1790 | /* Do some additional sanity checking */ | ||
1791 | timekeeping_check_update(real_tk, offset); | ||
1792 | |||
1633 | /* | 1793 | /* |
1634 | * With NO_HZ we may have to accumulate many cycle_intervals | 1794 | * With NO_HZ we may have to accumulate many cycle_intervals |
1635 | * (think "ticks") worth of time at once. To do this efficiently, | 1795 | * (think "ticks") worth of time at once. To do this efficiently, |
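With NO_HZ the backlog can be many tick intervals long, and update_wall_time() works it off in power-of-two multiples of the base interval rather than one tick at a time. A rough standalone sketch of that loop shape (an assumption-level simplification: no NTP tick-length clamping and no shift overflow guard):

    /* Consume "offset" cycles in chunks of interval << shift, biggest chunks first. */
    static unsigned long long accumulate(unsigned long long offset,
                                         unsigned long long interval)
    {
            int shift = 0;

            /* Start with the largest shift whose chunk still fits the backlog. */
            while ((interval << (shift + 1)) <= offset)
                    shift++;

            while (offset >= interval) {
                    while (offset < (interval << shift))
                            shift--;                     /* drop to a smaller chunk   */
                    offset -= interval << shift;         /* one accumulation step     */
            }
            return offset;                               /* remainder < one interval  */
    }

Each step corresponds to one logarithmic_accumulation() call above, which advances cycle_last by the same amount it removes from offset and bumps xtime_nsec by the shifted xtime_interval.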
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, | |||
1784 | do { | 1944 | do { |
1785 | seq = read_seqcount_begin(&tk_core.seq); | 1945 | seq = read_seqcount_begin(&tk_core.seq); |
1786 | 1946 | ||
1787 | base = tk->tkr.base_mono; | 1947 | base = tk->tkr_mono.base; |
1788 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; | 1948 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; |
1789 | 1949 | ||
1790 | *offs_real = tk->offs_real; | 1950 | *offs_real = tk->offs_real; |
1791 | *offs_boot = tk->offs_boot; | 1951 | *offs_boot = tk->offs_boot; |
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, | |||
1816 | do { | 1976 | do { |
1817 | seq = read_seqcount_begin(&tk_core.seq); | 1977 | seq = read_seqcount_begin(&tk_core.seq); |
1818 | 1978 | ||
1819 | base = tk->tkr.base_mono; | 1979 | base = tk->tkr_mono.base; |
1820 | nsecs = timekeeping_get_ns(&tk->tkr); | 1980 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
1821 | 1981 | ||
1822 | *offs_real = tk->offs_real; | 1982 | *offs_real = tk->offs_real; |
1823 | *offs_boot = tk->offs_boot; | 1983 | *offs_boot = tk->offs_boot; |
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); | |||
19 | extern int timekeeping_suspend(void); | 19 | extern int timekeeping_suspend(void); |
20 | extern void timekeeping_resume(void); | 20 | extern void timekeeping_resume(void); |
21 | 21 | ||
22 | extern void do_timer(unsigned long ticks); | ||
23 | extern void update_wall_time(void); | ||
24 | |||
25 | extern seqlock_t jiffies_lock; | ||
26 | |||
27 | #define CS_NAME_LEN 32 | ||
28 | |||
22 | #endif | 29 | #endif |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -90,8 +90,18 @@ struct tvec_base { | |||
90 | struct tvec tv5; | 90 | struct tvec tv5; |
91 | } ____cacheline_aligned; | 91 | } ____cacheline_aligned; |
92 | 92 | ||
93 | /* | ||
94 | * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've | ||
95 | * made NULL special, hint: lock_timer_base()) and we cannot get a compile time | ||
96 | * pointer to per-cpu entries because we don't know where we'll map the section, | ||
97 | * even for the boot cpu. | ||
98 | * | ||
99 | * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the | ||
100 | * rest of them. | ||
101 | */ | ||
93 | struct tvec_base boot_tvec_bases; | 102 | struct tvec_base boot_tvec_bases; |
94 | EXPORT_SYMBOL(boot_tvec_bases); | 103 | EXPORT_SYMBOL(boot_tvec_bases); |
104 | |||
95 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 105 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
96 | 106 | ||
97 | /* Functions below help us manage 'deferrable' flag */ | 107 | /* Functions below help us manage 'deferrable' flag */ |
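The comment above is about statically defined timers: they exist before any allocator or per-cpu area does, so their ->base field has to point at something real at compile time, and only boot_tvec_bases has a compile-time address. A simplified sketch of what an initializer like __TIMER_INITIALIZER() therefore has to produce (field names trimmed down; not the full kernel macro):

    struct tvec_base;                          /* opaque for this sketch            */
    extern struct tvec_base boot_tvec_bases;   /* the one compile-time address      */

    struct timer_list_sketch {
            struct tvec_base *base;            /* NULL is reserved by lock_timer_base() */
            void (*function)(unsigned long);
            unsigned long data;
    };

    #define TIMER_INITIALIZER_SKETCH(fn, d) {          \
            .base     = &boot_tvec_bases,              \
            .function = (fn),                          \
            .data     = (d),                           \
    }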
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
1027 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1037 | EXPORT_SYMBOL(try_to_del_timer_sync); |
1028 | 1038 | ||
1029 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP |
1040 | static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); | ||
1041 | |||
1030 | /** | 1042 | /** |
1031 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1043 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
1032 | * @timer: the timer to be deactivated | 1044 | * @timer: the timer to be deactivated |
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1532 | } | 1544 | } |
1533 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1545 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1534 | 1546 | ||
1535 | static int init_timers_cpu(int cpu) | ||
1536 | { | ||
1537 | int j; | ||
1538 | struct tvec_base *base; | ||
1539 | static char tvec_base_done[NR_CPUS]; | ||
1540 | |||
1541 | if (!tvec_base_done[cpu]) { | ||
1542 | static char boot_done; | ||
1543 | |||
1544 | if (boot_done) { | ||
1545 | /* | ||
1546 | * The APs use this path later in boot | ||
1547 | */ | ||
1548 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, | ||
1549 | cpu_to_node(cpu)); | ||
1550 | if (!base) | ||
1551 | return -ENOMEM; | ||
1552 | |||
1553 | /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ | ||
1554 | if (WARN_ON(base != tbase_get_base(base))) { | ||
1555 | kfree(base); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | per_cpu(tvec_bases, cpu) = base; | ||
1559 | } else { | ||
1560 | /* | ||
1561 | * This is for the boot CPU - we use compile-time | ||
1562 | * static initialisation because per-cpu memory isn't | ||
1563 | * ready yet and because the memory allocators are not | ||
1564 | * initialised either. | ||
1565 | */ | ||
1566 | boot_done = 1; | ||
1567 | base = &boot_tvec_bases; | ||
1568 | } | ||
1569 | spin_lock_init(&base->lock); | ||
1570 | tvec_base_done[cpu] = 1; | ||
1571 | base->cpu = cpu; | ||
1572 | } else { | ||
1573 | base = per_cpu(tvec_bases, cpu); | ||
1574 | } | ||
1575 | |||
1576 | |||
1577 | for (j = 0; j < TVN_SIZE; j++) { | ||
1578 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1579 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1580 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1581 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1582 | } | ||
1583 | for (j = 0; j < TVR_SIZE; j++) | ||
1584 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1585 | |||
1586 | base->timer_jiffies = jiffies; | ||
1587 | base->next_timer = base->timer_jiffies; | ||
1588 | base->active_timers = 0; | ||
1589 | base->all_timers = 0; | ||
1590 | return 0; | ||
1591 | } | ||
1592 | |||
1593 | #ifdef CONFIG_HOTPLUG_CPU | 1547 | #ifdef CONFIG_HOTPLUG_CPU |
1594 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 1548 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1595 | { | 1549 | { |
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) | |||
1631 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1585 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1632 | } | 1586 | } |
1633 | 1587 | ||
1588 | old_base->active_timers = 0; | ||
1589 | old_base->all_timers = 0; | ||
1590 | |||
1634 | spin_unlock(&old_base->lock); | 1591 | spin_unlock(&old_base->lock); |
1635 | spin_unlock_irq(&new_base->lock); | 1592 | spin_unlock_irq(&new_base->lock); |
1636 | put_cpu_var(tvec_bases); | 1593 | put_cpu_var(tvec_bases); |
1637 | } | 1594 | } |
1638 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1639 | 1595 | ||
1640 | static int timer_cpu_notify(struct notifier_block *self, | 1596 | static int timer_cpu_notify(struct notifier_block *self, |
1641 | unsigned long action, void *hcpu) | 1597 | unsigned long action, void *hcpu) |
1642 | { | 1598 | { |
1643 | long cpu = (long)hcpu; | 1599 | switch (action) { |
1644 | int err; | ||
1645 | |||
1646 | switch(action) { | ||
1647 | case CPU_UP_PREPARE: | ||
1648 | case CPU_UP_PREPARE_FROZEN: | ||
1649 | err = init_timers_cpu(cpu); | ||
1650 | if (err < 0) | ||
1651 | return notifier_from_errno(err); | ||
1652 | break; | ||
1653 | #ifdef CONFIG_HOTPLUG_CPU | ||
1654 | case CPU_DEAD: | 1600 | case CPU_DEAD: |
1655 | case CPU_DEAD_FROZEN: | 1601 | case CPU_DEAD_FROZEN: |
1656 | migrate_timers(cpu); | 1602 | migrate_timers((long)hcpu); |
1657 | break; | 1603 | break; |
1658 | #endif | ||
1659 | default: | 1604 | default: |
1660 | break; | 1605 | break; |
1661 | } | 1606 | } |
1607 | |||
1662 | return NOTIFY_OK; | 1608 | return NOTIFY_OK; |
1663 | } | 1609 | } |
1664 | 1610 | ||
1665 | static struct notifier_block timers_nb = { | 1611 | static inline void timer_register_cpu_notifier(void) |
1666 | .notifier_call = timer_cpu_notify, | 1612 | { |
1667 | }; | 1613 | cpu_notifier(timer_cpu_notify, 0); |
1614 | } | ||
1615 | #else | ||
1616 | static inline void timer_register_cpu_notifier(void) { } | ||
1617 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1668 | 1618 | ||
1619 | static void __init init_timer_cpu(struct tvec_base *base, int cpu) | ||
1620 | { | ||
1621 | int j; | ||
1669 | 1622 | ||
1670 | void __init init_timers(void) | 1623 | BUG_ON(base != tbase_get_base(base)); |
1624 | |||
1625 | base->cpu = cpu; | ||
1626 | per_cpu(tvec_bases, cpu) = base; | ||
1627 | spin_lock_init(&base->lock); | ||
1628 | |||
1629 | for (j = 0; j < TVN_SIZE; j++) { | ||
1630 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1631 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1632 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1633 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1634 | } | ||
1635 | for (j = 0; j < TVR_SIZE; j++) | ||
1636 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1637 | |||
1638 | base->timer_jiffies = jiffies; | ||
1639 | base->next_timer = base->timer_jiffies; | ||
1640 | } | ||
1641 | |||
1642 | static void __init init_timer_cpus(void) | ||
1671 | { | 1643 | { |
1672 | int err; | 1644 | struct tvec_base *base; |
1645 | int local_cpu = smp_processor_id(); | ||
1646 | int cpu; | ||
1673 | 1647 | ||
1648 | for_each_possible_cpu(cpu) { | ||
1649 | if (cpu == local_cpu) | ||
1650 | base = &boot_tvec_bases; | ||
1651 | #ifdef CONFIG_SMP | ||
1652 | else | ||
1653 | base = per_cpu_ptr(&__tvec_bases, cpu); | ||
1654 | #endif | ||
1655 | |||
1656 | init_timer_cpu(base, cpu); | ||
1657 | } | ||
1658 | } | ||
1659 | |||
1660 | void __init init_timers(void) | ||
1661 | { | ||
1674 | /* ensure there are enough low bits for flags in timer->base pointer */ | 1662 | /* ensure there are enough low bits for flags in timer->base pointer */ |
1675 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 1663 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); |
1676 | 1664 | ||
1677 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1665 | init_timer_cpus(); |
1678 | (void *)(long)smp_processor_id()); | ||
1679 | BUG_ON(err != NOTIFY_OK); | ||
1680 | |||
1681 | init_timer_stats(); | 1666 | init_timer_stats(); |
1682 | register_cpu_notifier(&timers_nb); | 1667 | timer_register_cpu_notifier(); |
1683 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1668 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
1684 | } | 1669 | } |
1685 | 1670 | ||
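Both the BUG_ON(base != tbase_get_base(base)) check in init_timer_cpu() and the BUILD_BUG_ON() in init_timers() exist because timer->base carries flag bits in its low-order bits, which only works if every tvec_base is sufficiently aligned. A small standalone illustration of that pointer-tagging scheme (the mask value and helper names are illustrative, not the kernel's exact definitions):

    #include <assert.h>
    #include <stdint.h>

    #define FLAG_MASK 0x3UL   /* two low bits available for flags (illustrative) */

    struct base_sketch { int dummy; } __attribute__((aligned(4)));

    static inline struct base_sketch *get_base(uintptr_t word)
    {
            return (struct base_sketch *)(word & ~FLAG_MASK);
    }

    int main(void)
    {
            static struct base_sketch base;
            uintptr_t word = (uintptr_t)&base | 0x1UL;   /* pointer plus one flag bit */

            assert(get_base(word) == &base);             /* flags strip off cleanly   */
            assert((word & FLAG_MASK) == 0x1UL);
            return 0;
    }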
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -16,10 +16,10 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/tick.h> | ||
20 | 19 | ||
21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
22 | 21 | ||
22 | #include "tick-internal.h" | ||
23 | 23 | ||
24 | struct timer_list_iter { | 24 | struct timer_list_iter { |
25 | int cpu; | 25 | int cpu; |
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
228 | print_name_offset(m, dev->set_next_event); | 228 | print_name_offset(m, dev->set_next_event); |
229 | SEQ_printf(m, "\n"); | 229 | SEQ_printf(m, "\n"); |
230 | 230 | ||
231 | SEQ_printf(m, " set_mode: "); | 231 | if (dev->set_mode) { |
232 | print_name_offset(m, dev->set_mode); | 232 | SEQ_printf(m, " set_mode: "); |
233 | SEQ_printf(m, "\n"); | 233 | print_name_offset(m, dev->set_mode); |
234 | SEQ_printf(m, "\n"); | ||
235 | } else { | ||
236 | if (dev->set_state_shutdown) { | ||
237 | SEQ_printf(m, " shutdown: "); | ||
238 | print_name_offset(m, dev->set_state_shutdown); | ||
239 | SEQ_printf(m, "\n"); | ||
240 | } | ||
241 | |||
242 | if (dev->set_state_periodic) { | ||
243 | SEQ_printf(m, " periodic: "); | ||
244 | print_name_offset(m, dev->set_state_periodic); | ||
245 | SEQ_printf(m, "\n"); | ||
246 | } | ||
247 | |||
248 | if (dev->set_state_oneshot) { | ||
249 | SEQ_printf(m, " oneshot: "); | ||
250 | print_name_offset(m, dev->set_state_oneshot); | ||
251 | SEQ_printf(m, "\n"); | ||
252 | } | ||
253 | |||
254 | if (dev->tick_resume) { | ||
255 | SEQ_printf(m, " resume: "); | ||
256 | print_name_offset(m, dev->tick_resume); | ||
257 | SEQ_printf(m, "\n"); | ||
258 | } | ||
259 | } | ||
234 | 260 | ||
235 | SEQ_printf(m, " event_handler: "); | 261 | SEQ_printf(m, " event_handler: "); |
236 | print_name_offset(m, dev->event_handler); | 262 | print_name_offset(m, dev->event_handler); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..fedbdd7d5d1e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -599,6 +599,34 @@ config RING_BUFFER_STARTUP_TEST | |||
599 | 599 | ||
600 | If unsure, say N | 600 | If unsure, say N |
601 | 601 | ||
602 | config TRACE_ENUM_MAP_FILE | ||
603 | bool "Show enum mappings for trace events" | ||
604 | depends on TRACING | ||
605 | help | ||
606 | The "print fmt" of the trace events will show the enum names instead | ||
607 | of their values. This can cause problems for user space tools that | ||
608 | use this string to parse the raw data as user space does not know | ||
609 | how to convert the string to its value. | ||
610 | |||
611 | To fix this, there's a special macro in the kernel that can be used | ||
612 | to convert the enum into its value. If this macro is used, then the | ||
613 | print fmt strings will have the enums converted to their values. | ||
614 | |||
615 | If something does not get converted properly, this option can be | ||
616 | used to show what enums the kernel tried to convert. | ||
617 | |||
618 | This option is for debugging the enum conversions. A file is created | ||
619 | in the tracing directory called "enum_map" that will show the enum | ||
620 | names matched with their values and what trace event system they | ||
621 | belong to. | ||
622 | |||
623 | Normally, the mapping of the strings to values will be freed after | ||
624 | boot up or module load. With this option, they will not be freed, as | ||
625 | they are needed for the "enum_map" file. Enabling this option will | ||
626 | increase the memory footprint of the running kernel. | ||
627 | |||
628 | If unsure, say N | ||
629 | |||
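The "special macro" the help text refers to is TRACE_DEFINE_ENUM(), placed in a trace event header next to the TRACE_EVENT() definitions. A hedged fragment showing the typical shape (the subsystem, event and enum names are made up for illustration, and this is a header fragment rather than a standalone file):

    /* In the subsystem's trace event header, after declaring the enum: */
    enum foo_state { FOO_IDLE, FOO_RUNNING, FOO_DEAD };

    TRACE_DEFINE_ENUM(FOO_IDLE);
    TRACE_DEFINE_ENUM(FOO_RUNNING);
    TRACE_DEFINE_ENUM(FOO_DEAD);

    TRACE_EVENT(foo_state_change,
            TP_PROTO(int state),
            TP_ARGS(state),
            TP_STRUCT__entry(__field(int, state)),
            TP_fast_assign(__entry->state = state;),
            /* With TRACE_DEFINE_ENUM() the symbols below are rewritten to their
             * numeric values in the event's "print fmt", so user space parsers work. */
            TP_printk("state=%s", __print_symbolic(__entry->state,
                            { FOO_IDLE,    "idle"    },
                            { FOO_RUNNING, "running" },
                            { FOO_DEAD,    "dead"    }))
    );

With CONFIG_TRACE_ENUM_MAP_FILE=y, those mappings are also kept after boot and listed in the "enum_map" file described above, tagged with the event system they belong to.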
602 | endif # FTRACE | 630 | endif # FTRACE |
603 | 631 | ||
604 | endif # TRACING_SUPPORT | 632 | endif # TRACING_SUPPORT |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/suspend.h> | 20 | #include <linux/suspend.h> |
21 | #include <linux/debugfs.h> | 21 | #include <linux/tracefs.h> |
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
@@ -249,6 +249,19 @@ static void update_function_graph_func(void); | |||
249 | static inline void update_function_graph_func(void) { } | 249 | static inline void update_function_graph_func(void) { } |
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | |||
253 | static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) | ||
254 | { | ||
255 | /* | ||
256 | * If this is a dynamic ops or we force list func, | ||
257 | * then it needs to call the list anyway. | ||
258 | */ | ||
259 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
260 | return ftrace_ops_list_func; | ||
261 | |||
262 | return ftrace_ops_get_func(ops); | ||
263 | } | ||
264 | |||
252 | static void update_ftrace_function(void) | 265 | static void update_ftrace_function(void) |
253 | { | 266 | { |
254 | ftrace_func_t func; | 267 | ftrace_func_t func; |
@@ -270,7 +283,7 @@ static void update_ftrace_function(void) | |||
270 | * then have the mcount trampoline call the function directly. | 283 | * then have the mcount trampoline call the function directly. |
271 | */ | 284 | */ |
272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 285 | } else if (ftrace_ops_list->next == &ftrace_list_end) { |
273 | func = ftrace_ops_get_func(ftrace_ops_list); | 286 | func = ftrace_ops_get_list_func(ftrace_ops_list); |
274 | 287 | ||
275 | } else { | 288 | } else { |
276 | /* Just use the default ftrace_ops */ | 289 | /* Just use the default ftrace_ops */ |
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { | |||
1008 | .stat_show = function_stat_show | 1021 | .stat_show = function_stat_show |
1009 | }; | 1022 | }; |
1010 | 1023 | ||
1011 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1024 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1012 | { | 1025 | { |
1013 | struct ftrace_profile_stat *stat; | 1026 | struct ftrace_profile_stat *stat; |
1014 | struct dentry *entry; | 1027 | struct dentry *entry; |
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
1044 | } | 1057 | } |
1045 | } | 1058 | } |
1046 | 1059 | ||
1047 | entry = debugfs_create_file("function_profile_enabled", 0644, | 1060 | entry = tracefs_create_file("function_profile_enabled", 0644, |
1048 | d_tracer, NULL, &ftrace_profile_fops); | 1061 | d_tracer, NULL, &ftrace_profile_fops); |
1049 | if (!entry) | 1062 | if (!entry) |
1050 | pr_warning("Could not create debugfs " | 1063 | pr_warning("Could not create tracefs " |
1051 | "'function_profile_enabled' entry\n"); | 1064 | "'function_profile_enabled' entry\n"); |
1052 | } | 1065 | } |
1053 | 1066 | ||
1054 | #else /* CONFIG_FUNCTION_PROFILER */ | 1067 | #else /* CONFIG_FUNCTION_PROFILER */ |
1055 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1068 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1056 | { | 1069 | { |
1057 | } | 1070 | } |
1058 | #endif /* CONFIG_FUNCTION_PROFILER */ | 1071 | #endif /* CONFIG_FUNCTION_PROFILER */ |
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) | |||
4712 | mutex_unlock(&ftrace_lock); | 4725 | mutex_unlock(&ftrace_lock); |
4713 | } | 4726 | } |
4714 | 4727 | ||
4715 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4728 | static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) |
4716 | { | 4729 | { |
4717 | 4730 | ||
4718 | trace_create_file("available_filter_functions", 0444, | 4731 | trace_create_file("available_filter_functions", 0444, |
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void) | |||
5020 | } | 5033 | } |
5021 | core_initcall(ftrace_nodyn_init); | 5034 | core_initcall(ftrace_nodyn_init); |
5022 | 5035 | ||
5023 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 5036 | static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } |
5024 | static inline void ftrace_startup_enable(int command) { } | 5037 | static inline void ftrace_startup_enable(int command) { } |
5025 | static inline void ftrace_startup_all(int command) { } | 5038 | static inline void ftrace_startup_all(int command) { } |
5026 | /* Keep as macros so we do not need to define the commands */ | 5039 | /* Keep as macros so we do not need to define the commands */ |
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | |||
5209 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 5222 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) |
5210 | { | 5223 | { |
5211 | /* | 5224 | /* |
5212 | * If this is a dynamic ops or we force list func, | ||
5213 | * then it needs to call the list anyway. | ||
5214 | */ | ||
5215 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
5216 | return ftrace_ops_list_func; | ||
5217 | |||
5218 | /* | ||
5219 | * If the func handles its own recursion, call it directly. | 5225 | * If the func handles its own recursion, call it directly. |
5220 | * Otherwise call the recursion protected function that | 5226 | * Otherwise call the recursion protected function that |
5221 | * will call the ftrace ops function. | 5227 | * will call the ftrace ops function. |
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
5473 | .release = ftrace_pid_release, | 5479 | .release = ftrace_pid_release, |
5474 | }; | 5480 | }; |
5475 | 5481 | ||
5476 | static __init int ftrace_init_debugfs(void) | 5482 | static __init int ftrace_init_tracefs(void) |
5477 | { | 5483 | { |
5478 | struct dentry *d_tracer; | 5484 | struct dentry *d_tracer; |
5479 | 5485 | ||
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void) | |||
5481 | if (IS_ERR(d_tracer)) | 5487 | if (IS_ERR(d_tracer)) |
5482 | return 0; | 5488 | return 0; |
5483 | 5489 | ||
5484 | ftrace_init_dyn_debugfs(d_tracer); | 5490 | ftrace_init_dyn_tracefs(d_tracer); |
5485 | 5491 | ||
5486 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 5492 | trace_create_file("set_ftrace_pid", 0644, d_tracer, |
5487 | NULL, &ftrace_pid_fops); | 5493 | NULL, &ftrace_pid_fops); |
5488 | 5494 | ||
5489 | ftrace_profile_debugfs(d_tracer); | 5495 | ftrace_profile_tracefs(d_tracer); |
5490 | 5496 | ||
5491 | return 0; | 5497 | return 0; |
5492 | } | 5498 | } |
5493 | fs_initcall(ftrace_init_debugfs); | 5499 | fs_initcall(ftrace_init_tracefs); |
5494 | 5500 | ||
5495 | /** | 5501 | /** |
5496 | * ftrace_kill - kill ftrace | 5502 | * ftrace_kill - kill ftrace |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); | |||
2679 | 2679 | ||
2680 | static __always_inline int trace_recursive_lock(void) | 2680 | static __always_inline int trace_recursive_lock(void) |
2681 | { | 2681 | { |
2682 | unsigned int val = this_cpu_read(current_context); | 2682 | unsigned int val = __this_cpu_read(current_context); |
2683 | int bit; | 2683 | int bit; |
2684 | 2684 | ||
2685 | if (in_interrupt()) { | 2685 | if (in_interrupt()) { |
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) | |||
2696 | return 1; | 2696 | return 1; |
2697 | 2697 | ||
2698 | val |= (1 << bit); | 2698 | val |= (1 << bit); |
2699 | this_cpu_write(current_context, val); | 2699 | __this_cpu_write(current_context, val); |
2700 | 2700 | ||
2701 | return 0; | 2701 | return 0; |
2702 | } | 2702 | } |
2703 | 2703 | ||
2704 | static __always_inline void trace_recursive_unlock(void) | 2704 | static __always_inline void trace_recursive_unlock(void) |
2705 | { | 2705 | { |
2706 | unsigned int val = this_cpu_read(current_context); | 2706 | __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); |
2707 | |||
2708 | val--; | ||
2709 | val &= this_cpu_read(current_context); | ||
2710 | this_cpu_write(current_context, val); | ||
2711 | } | 2707 | } |
2712 | 2708 | ||
2713 | #else | 2709 | #else |
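The rewritten trace_recursive_unlock() folds the old read/decrement/and/write sequence into a single __this_cpu_and() using the classic x & (x - 1) identity, which clears the lowest set bit, i.e. the bit set by the innermost context given how the lock path numbers them. A quick standalone check of the identity (the particular bit numbering here is illustrative):

    #include <assert.h>

    int main(void)
    {
            /* Two context bits set, e.g. an interrupt nested inside normal context. */
            unsigned int val = (1u << 3) | (1u << 1);   /* 0b1010 */

            val &= val - 1;                             /* clears the lowest set bit */
            assert(val == (1u << 3));                   /* 0b1000                    */
            return 0;
    }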
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62c6506d663f..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
23 | #include <linux/tracefs.h> | ||
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
25 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> |
@@ -31,6 +32,7 @@ | |||
31 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
32 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
33 | #include <linux/string.h> | 34 | #include <linux/string.h> |
35 | #include <linux/mount.h> | ||
34 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
35 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
36 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> |
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
123 | /* When set, tracing will stop when a WARN*() is hit */ | 125 | /* When set, tracing will stop when a WARN*() is hit */ |
124 | int __disable_trace_on_warning; | 126 | int __disable_trace_on_warning; |
125 | 127 | ||
128 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
129 | /* Map of enums to their values, for "enum_map" file */ | ||
130 | struct trace_enum_map_head { | ||
131 | struct module *mod; | ||
132 | unsigned long length; | ||
133 | }; | ||
134 | |||
135 | union trace_enum_map_item; | ||
136 | |||
137 | struct trace_enum_map_tail { | ||
138 | /* | ||
139 | * "end" is first and points to NULL as it must be different | ||
140 | * than "mod" or "enum_string" | ||
141 | */ | ||
142 | union trace_enum_map_item *next; | ||
143 | const char *end; /* points to NULL */ | ||
144 | }; | ||
145 | |||
146 | static DEFINE_MUTEX(trace_enum_mutex); | ||
147 | |||
148 | /* | ||
149 | * The trace_enum_maps are saved in an array with two extra elements, | ||
150 | * one at the beginning, and one at the end. The beginning item contains | ||
151 | * the count of the saved maps (head.length), and the module they | ||
152 | * belong to if not built in (head.mod). The ending item contains a | ||
153 | * pointer to the next array of saved enum_map items. | ||
154 | */ | ||
155 | union trace_enum_map_item { | ||
156 | struct trace_enum_map map; | ||
157 | struct trace_enum_map_head head; | ||
158 | struct trace_enum_map_tail tail; | ||
159 | }; | ||
160 | |||
161 | static union trace_enum_map_item *trace_enum_maps; | ||
162 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
163 | |||
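The head/map/tail layout described in the comment above is easiest to see for a concrete size. A sketch of the block built for a module contributing three enums (the field types are simplified guesses at the shape, not the exact kernel definitions):

    union enum_map_item_sketch {
            struct { void *mod; unsigned long length; } head;
            struct { const char *enum_string; unsigned long enum_value;
                     const char *system; } map;
            struct { union enum_map_item_sketch *next; const char *end; } tail;
    };

    /*
     * With len == 3 the allocation holds len + 2 items:
     *
     *   block[0]     head: { .mod = mod, .length = 3 }
     *   block[1..3]  map:  the three copied trace_enum_map entries
     *   block[4]     tail: { .next = next saved block or NULL, .end = NULL }
     *
     * "Jump to tail" is then just  block + block[0].head.length + 1,  and walking
     * the whole chain is: jump to tail, follow tail.next, step past that block's head.
     */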
126 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 164 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
127 | 165 | ||
128 | #define MAX_TRACER_SIZE 100 | 166 | #define MAX_TRACER_SIZE 100 |
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
3908 | .write = tracing_saved_cmdlines_size_write, | 3946 | .write = tracing_saved_cmdlines_size_write, |
3909 | }; | 3947 | }; |
3910 | 3948 | ||
3949 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
3950 | static union trace_enum_map_item * | ||
3951 | update_enum_map(union trace_enum_map_item *ptr) | ||
3952 | { | ||
3953 | if (!ptr->map.enum_string) { | ||
3954 | if (ptr->tail.next) { | ||
3955 | ptr = ptr->tail.next; | ||
3956 | /* Set ptr to the next real item (skip head) */ | ||
3957 | ptr++; | ||
3958 | } else | ||
3959 | return NULL; | ||
3960 | } | ||
3961 | return ptr; | ||
3962 | } | ||
3963 | |||
3964 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | ||
3965 | { | ||
3966 | union trace_enum_map_item *ptr = v; | ||
3967 | |||
3968 | /* | ||
3969 | * Paranoid! If ptr points to end, we don't want to increment past it. | ||
3970 | * This really should never happen. | ||
3971 | */ | ||
3972 | ptr = update_enum_map(ptr); | ||
3973 | if (WARN_ON_ONCE(!ptr)) | ||
3974 | return NULL; | ||
3975 | |||
3976 | ptr++; | ||
3977 | |||
3978 | (*pos)++; | ||
3979 | |||
3980 | ptr = update_enum_map(ptr); | ||
3981 | |||
3982 | return ptr; | ||
3983 | } | ||
3984 | |||
3985 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | ||
3986 | { | ||
3987 | union trace_enum_map_item *v; | ||
3988 | loff_t l = 0; | ||
3989 | |||
3990 | mutex_lock(&trace_enum_mutex); | ||
3991 | |||
3992 | v = trace_enum_maps; | ||
3993 | if (v) | ||
3994 | v++; | ||
3995 | |||
3996 | while (v && l < *pos) { | ||
3997 | v = enum_map_next(m, v, &l); | ||
3998 | } | ||
3999 | |||
4000 | return v; | ||
4001 | } | ||
4002 | |||
4003 | static void enum_map_stop(struct seq_file *m, void *v) | ||
4004 | { | ||
4005 | mutex_unlock(&trace_enum_mutex); | ||
4006 | } | ||
4007 | |||
4008 | static int enum_map_show(struct seq_file *m, void *v) | ||
4009 | { | ||
4010 | union trace_enum_map_item *ptr = v; | ||
4011 | |||
4012 | seq_printf(m, "%s %ld (%s)\n", | ||
4013 | ptr->map.enum_string, ptr->map.enum_value, | ||
4014 | ptr->map.system); | ||
4015 | |||
4016 | return 0; | ||
4017 | } | ||
4018 | |||
4019 | static const struct seq_operations tracing_enum_map_seq_ops = { | ||
4020 | .start = enum_map_start, | ||
4021 | .next = enum_map_next, | ||
4022 | .stop = enum_map_stop, | ||
4023 | .show = enum_map_show, | ||
4024 | }; | ||
4025 | |||
4026 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | ||
4027 | { | ||
4028 | if (tracing_disabled) | ||
4029 | return -ENODEV; | ||
4030 | |||
4031 | return seq_open(filp, &tracing_enum_map_seq_ops); | ||
4032 | } | ||
4033 | |||
4034 | static const struct file_operations tracing_enum_map_fops = { | ||
4035 | .open = tracing_enum_map_open, | ||
4036 | .read = seq_read, | ||
4037 | .llseek = seq_lseek, | ||
4038 | .release = seq_release, | ||
4039 | }; | ||
4040 | |||
4041 | static inline union trace_enum_map_item * | ||
4042 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | ||
4043 | { | ||
4044 | /* Return tail of array given the head */ | ||
4045 | return ptr + ptr->head.length + 1; | ||
4046 | } | ||
4047 | |||
4048 | static void | ||
4049 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | ||
4050 | int len) | ||
4051 | { | ||
4052 | struct trace_enum_map **stop; | ||
4053 | struct trace_enum_map **map; | ||
4054 | union trace_enum_map_item *map_array; | ||
4055 | union trace_enum_map_item *ptr; | ||
4056 | |||
4057 | stop = start + len; | ||
4058 | |||
4059 | /* | ||
4060 | * The trace_enum_maps contains the map plus a head and tail item, | ||
4061 | * where the head holds the module and length of array, and the | ||
4062 | * tail holds a pointer to the next list. | ||
4063 | */ | ||
4064 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | ||
4065 | if (!map_array) { | ||
4066 | pr_warning("Unable to allocate trace enum mapping\n"); | ||
4067 | return; | ||
4068 | } | ||
4069 | |||
4070 | mutex_lock(&trace_enum_mutex); | ||
4071 | |||
4072 | if (!trace_enum_maps) | ||
4073 | trace_enum_maps = map_array; | ||
4074 | else { | ||
4075 | ptr = trace_enum_maps; | ||
4076 | for (;;) { | ||
4077 | ptr = trace_enum_jmp_to_tail(ptr); | ||
4078 | if (!ptr->tail.next) | ||
4079 | break; | ||
4080 | ptr = ptr->tail.next; | ||
4081 | |||
4082 | } | ||
4083 | ptr->tail.next = map_array; | ||
4084 | } | ||
4085 | map_array->head.mod = mod; | ||
4086 | map_array->head.length = len; | ||
4087 | map_array++; | ||
4088 | |||
4089 | for (map = start; (unsigned long)map < (unsigned long)stop; map++) { | ||
4090 | map_array->map = **map; | ||
4091 | map_array++; | ||
4092 | } | ||
4093 | memset(map_array, 0, sizeof(*map_array)); | ||
4094 | |||
4095 | mutex_unlock(&trace_enum_mutex); | ||
4096 | } | ||
4097 | |||
4098 | static void trace_create_enum_file(struct dentry *d_tracer) | ||
4099 | { | ||
4100 | trace_create_file("enum_map", 0444, d_tracer, | ||
4101 | NULL, &tracing_enum_map_fops); | ||
4102 | } | ||
4103 | |||
4104 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4105 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | ||
4106 | static inline void trace_insert_enum_map_file(struct module *mod, | ||
4107 | struct trace_enum_map **start, int len) { } | ||
4108 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4109 | |||
4110 | static void trace_insert_enum_map(struct module *mod, | ||
4111 | struct trace_enum_map **start, int len) | ||
4112 | { | ||
4113 | struct trace_enum_map **map; | ||
4114 | |||
4115 | if (len <= 0) | ||
4116 | return; | ||
4117 | |||
4118 | map = start; | ||
4119 | |||
4120 | trace_event_enum_update(map, len); | ||
4121 | |||
4122 | trace_insert_enum_map_file(mod, start, len); | ||
4123 | } | ||
4124 | |||
3911 | static ssize_t | 4125 | static ssize_t |
3912 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 4126 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
3913 | size_t cnt, loff_t *ppos) | 4127 | size_t cnt, loff_t *ppos) |
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) | |||
4105 | tr->current_trace = &nop_trace; | 4319 | tr->current_trace = &nop_trace; |
4106 | } | 4320 | } |
4107 | 4321 | ||
4108 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | 4322 | static void update_tracer_options(struct trace_array *tr, struct tracer *t) |
4109 | { | 4323 | { |
4110 | static struct trace_option_dentry *topts; | 4324 | static struct trace_option_dentry *topts; |
4325 | |||
4326 | /* Only enable if the directory has been created already. */ | ||
4327 | if (!tr->dir) | ||
4328 | return; | ||
4329 | |||
4330 | /* Currently, only the top instance has options */ | ||
4331 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) | ||
4332 | return; | ||
4333 | |||
4334 | destroy_trace_option_files(topts); | ||
4335 | topts = create_trace_option_files(tr, t); | ||
4336 | } | ||
4337 | |||
4338 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | ||
4339 | { | ||
4111 | struct tracer *t; | 4340 | struct tracer *t; |
4112 | #ifdef CONFIG_TRACER_MAX_TRACE | 4341 | #ifdef CONFIG_TRACER_MAX_TRACE |
4113 | bool had_max_tr; | 4342 | bool had_max_tr; |
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
4172 | free_snapshot(tr); | 4401 | free_snapshot(tr); |
4173 | } | 4402 | } |
4174 | #endif | 4403 | #endif |
4175 | /* Currently, only the top instance has options */ | 4404 | update_tracer_options(tr, t); |
4176 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
4177 | destroy_trace_option_files(topts); | ||
4178 | topts = create_trace_option_files(tr, t); | ||
4179 | } | ||
4180 | 4405 | ||
4181 | #ifdef CONFIG_TRACER_MAX_TRACE | 4406 | #ifdef CONFIG_TRACER_MAX_TRACE |
4182 | if (t->use_max_tr && !had_max_tr) { | 4407 | if (t->use_max_tr && !had_max_tr) { |
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } | |||
5817 | 6042 | ||
5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 6043 | static struct dentry *tracing_get_dentry(struct trace_array *tr) |
5819 | { | 6044 | { |
6045 | if (WARN_ON(!tr->dir)) | ||
6046 | return ERR_PTR(-ENODEV); | ||
6047 | |||
6048 | /* Top directory uses NULL as the parent */ | ||
6049 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
6050 | return NULL; | ||
6051 | |||
6052 | /* All sub buffers have a descriptor */ | ||
5820 | return tr->dir; | 6053 | return tr->dir; |
5821 | } | 6054 | } |
5822 | 6055 | ||
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
5831 | if (IS_ERR(d_tracer)) | 6064 | if (IS_ERR(d_tracer)) |
5832 | return NULL; | 6065 | return NULL; |
5833 | 6066 | ||
5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 6067 | tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); |
5835 | 6068 | ||
5836 | WARN_ONCE(!tr->percpu_dir, | 6069 | WARN_ONCE(!tr->percpu_dir, |
5837 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); | 6070 | "Could not create tracefs directory 'per_cpu/%d'\n", cpu); |
5838 | 6071 | ||
5839 | return tr->percpu_dir; | 6072 | return tr->percpu_dir; |
5840 | } | 6073 | } |
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | |||
5851 | } | 6084 | } |
5852 | 6085 | ||
5853 | static void | 6086 | static void |
5854 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 6087 | tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) |
5855 | { | 6088 | { |
5856 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 6089 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); |
5857 | struct dentry *d_cpu; | 6090 | struct dentry *d_cpu; |
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
5861 | return; | 6094 | return; |
5862 | 6095 | ||
5863 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 6096 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
5864 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 6097 | d_cpu = tracefs_create_dir(cpu_dir, d_percpu); |
5865 | if (!d_cpu) { | 6098 | if (!d_cpu) { |
5866 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 6099 | pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); |
5867 | return; | 6100 | return; |
5868 | } | 6101 | } |
5869 | 6102 | ||
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, | |||
6015 | { | 6248 | { |
6016 | struct dentry *ret; | 6249 | struct dentry *ret; |
6017 | 6250 | ||
6018 | ret = debugfs_create_file(name, mode, parent, data, fops); | 6251 | ret = tracefs_create_file(name, mode, parent, data, fops); |
6019 | if (!ret) | 6252 | if (!ret) |
6020 | pr_warning("Could not create debugfs '%s' entry\n", name); | 6253 | pr_warning("Could not create tracefs '%s' entry\n", name); |
6021 | 6254 | ||
6022 | return ret; | 6255 | return ret; |
6023 | } | 6256 | } |
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
6034 | if (IS_ERR(d_tracer)) | 6267 | if (IS_ERR(d_tracer)) |
6035 | return NULL; | 6268 | return NULL; |
6036 | 6269 | ||
6037 | tr->options = debugfs_create_dir("options", d_tracer); | 6270 | tr->options = tracefs_create_dir("options", d_tracer); |
6038 | if (!tr->options) { | 6271 | if (!tr->options) { |
6039 | pr_warning("Could not create debugfs directory 'options'\n"); | 6272 | pr_warning("Could not create tracefs directory 'options'\n"); |
6040 | return NULL; | 6273 | return NULL; |
6041 | } | 6274 | } |
6042 | 6275 | ||
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
6105 | return; | 6338 | return; |
6106 | 6339 | ||
6107 | for (cnt = 0; topts[cnt].opt; cnt++) | 6340 | for (cnt = 0; topts[cnt].opt; cnt++) |
6108 | debugfs_remove(topts[cnt].entry); | 6341 | tracefs_remove(topts[cnt].entry); |
6109 | 6342 | ||
6110 | kfree(topts); | 6343 | kfree(topts); |
6111 | } | 6344 | } |
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { | |||
6194 | struct dentry *trace_instance_dir; | 6427 | struct dentry *trace_instance_dir; |
6195 | 6428 | ||
6196 | static void | 6429 | static void |
6197 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 6430 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); |
6198 | 6431 | ||
6199 | static int | 6432 | static int |
6200 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 6433 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) |
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) | |||
6271 | #endif | 6504 | #endif |
6272 | } | 6505 | } |
6273 | 6506 | ||
6274 | static int new_instance_create(const char *name) | 6507 | static int instance_mkdir(const char *name) |
6275 | { | 6508 | { |
6276 | struct trace_array *tr; | 6509 | struct trace_array *tr; |
6277 | int ret; | 6510 | int ret; |
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name) | |||
6310 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 6543 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) |
6311 | goto out_free_tr; | 6544 | goto out_free_tr; |
6312 | 6545 | ||
6313 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 6546 | tr->dir = tracefs_create_dir(name, trace_instance_dir); |
6314 | if (!tr->dir) | 6547 | if (!tr->dir) |
6315 | goto out_free_tr; | 6548 | goto out_free_tr; |
6316 | 6549 | ||
6317 | ret = event_trace_add_tracer(tr->dir, tr); | 6550 | ret = event_trace_add_tracer(tr->dir, tr); |
6318 | if (ret) { | 6551 | if (ret) { |
6319 | debugfs_remove_recursive(tr->dir); | 6552 | tracefs_remove_recursive(tr->dir); |
6320 | goto out_free_tr; | 6553 | goto out_free_tr; |
6321 | } | 6554 | } |
6322 | 6555 | ||
6323 | init_tracer_debugfs(tr, tr->dir); | 6556 | init_tracer_tracefs(tr, tr->dir); |
6324 | 6557 | ||
6325 | list_add(&tr->list, &ftrace_trace_arrays); | 6558 | list_add(&tr->list, &ftrace_trace_arrays); |
6326 | 6559 | ||
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name) | |||
6341 | 6574 | ||
6342 | } | 6575 | } |
6343 | 6576 | ||
6344 | static int instance_delete(const char *name) | 6577 | static int instance_rmdir(const char *name) |
6345 | { | 6578 | { |
6346 | struct trace_array *tr; | 6579 | struct trace_array *tr; |
6347 | int found = 0; | 6580 | int found = 0; |
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name) | |||
6382 | return ret; | 6615 | return ret; |
6383 | } | 6616 | } |
6384 | 6617 | ||
6385 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
6386 | { | ||
6387 | struct dentry *parent; | ||
6388 | int ret; | ||
6389 | |||
6390 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6391 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6392 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6393 | return -ENOENT; | ||
6394 | |||
6395 | /* | ||
6396 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6397 | * take the mutex. As the instances directory can not be destroyed | ||
6398 | * or changed in any other way, it is safe to unlock it, and | ||
6399 | * let the dentry try. If two users try to make the same dir at | ||
6400 | * the same time, then the new_instance_create() will determine the | ||
6401 | * winner. | ||
6402 | */ | ||
6403 | mutex_unlock(&inode->i_mutex); | ||
6404 | |||
6405 | ret = new_instance_create(dentry->d_iname); | ||
6406 | |||
6407 | mutex_lock(&inode->i_mutex); | ||
6408 | |||
6409 | return ret; | ||
6410 | } | ||
6411 | |||
6412 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
6413 | { | ||
6414 | struct dentry *parent; | ||
6415 | int ret; | ||
6416 | |||
6417 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6418 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6419 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6420 | return -ENOENT; | ||
6421 | |||
6422 | /* The caller did a dget() on dentry */ | ||
6423 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
6424 | |||
6425 | /* | ||
6426 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6427 | * take the mutex. As the instances directory can not be destroyed | ||
6428 | * or changed in any other way, it is safe to unlock it, and | ||
6429 | * let the dentry try. If two users try to make the same dir at | ||
6430 | * the same time, then the instance_delete() will determine the | ||
6431 | * winner. | ||
6432 | */ | ||
6433 | mutex_unlock(&inode->i_mutex); | ||
6434 | |||
6435 | ret = instance_delete(dentry->d_iname); | ||
6436 | |||
6437 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
6438 | mutex_lock(&dentry->d_inode->i_mutex); | ||
6439 | |||
6440 | return ret; | ||
6441 | } | ||
6442 | |||
6443 | static const struct inode_operations instance_dir_inode_operations = { | ||
6444 | .lookup = simple_lookup, | ||
6445 | .mkdir = instance_mkdir, | ||
6446 | .rmdir = instance_rmdir, | ||
6447 | }; | ||
6448 | |||
6449 | static __init void create_trace_instances(struct dentry *d_tracer) | 6618 | static __init void create_trace_instances(struct dentry *d_tracer) |
6450 | { | 6619 | { |
6451 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | 6620 | trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, |
6621 | instance_mkdir, | ||
6622 | instance_rmdir); | ||
6452 | if (WARN_ON(!trace_instance_dir)) | 6623 | if (WARN_ON(!trace_instance_dir)) |
6453 | return; | 6624 | return; |
6454 | |||
6455 | /* Hijack the dir inode operations, to allow mkdir */ | ||
6456 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
6457 | } | 6625 | } |
6458 | 6626 | ||
6459 | static void | 6627 | static void |
6460 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | 6628 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) |
6461 | { | 6629 | { |
6462 | int cpu; | 6630 | int cpu; |
6463 | 6631 | ||
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6511 | #endif | 6679 | #endif |
6512 | 6680 | ||
6513 | for_each_tracing_cpu(cpu) | 6681 | for_each_tracing_cpu(cpu) |
6514 | tracing_init_debugfs_percpu(tr, cpu); | 6682 | tracing_init_tracefs_percpu(tr, cpu); |
6515 | 6683 | ||
6516 | } | 6684 | } |
6517 | 6685 | ||
6686 | static struct vfsmount *trace_automount(void *ignore) | ||
6687 | { | ||
6688 | struct vfsmount *mnt; | ||
6689 | struct file_system_type *type; | ||
6690 | |||
6691 | /* | ||
6692 | * To maintain backward compatibility for tools that mount | ||
6693 | * debugfs to get to the tracing facility, tracefs is automatically | ||
6694 | * mounted to the debugfs/tracing directory. | ||
6695 | */ | ||
6696 | type = get_fs_type("tracefs"); | ||
6697 | if (!type) | ||
6698 | return NULL; | ||
6699 | mnt = vfs_kern_mount(type, 0, "tracefs", NULL); | ||
6700 | put_filesystem(type); | ||
6701 | if (IS_ERR(mnt)) | ||
6702 | return NULL; | ||
6703 | mntget(mnt); | ||
6704 | |||
6705 | return mnt; | ||
6706 | } | ||
6707 | |||
6518 | /** | 6708 | /** |
6519 | * tracing_init_dentry - initialize top level trace array | 6709 | * tracing_init_dentry - initialize top level trace array |
6520 | * | 6710 | * |
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void) | |||
6526 | { | 6716 | { |
6527 | struct trace_array *tr = &global_trace; | 6717 | struct trace_array *tr = &global_trace; |
6528 | 6718 | ||
6719 | /* The top level trace array uses NULL as parent */ | ||
6529 | if (tr->dir) | 6720 | if (tr->dir) |
6530 | return tr->dir; | 6721 | return NULL; |
6531 | 6722 | ||
6532 | if (WARN_ON(!debugfs_initialized())) | 6723 | if (WARN_ON(!debugfs_initialized())) |
6533 | return ERR_PTR(-ENODEV); | 6724 | return ERR_PTR(-ENODEV); |
6534 | 6725 | ||
6535 | tr->dir = debugfs_create_dir("tracing", NULL); | 6726 | /* |
6536 | 6727 | * As there may still be users that expect the tracing | |
6728 | * files to exist in debugfs/tracing, we must automount | ||
6729 | * the tracefs file system there, so older tools still | ||
6730 | * work with the newer kernel. | ||
6731 | */ | ||
6732 | tr->dir = debugfs_create_automount("tracing", NULL, | ||
6733 | trace_automount, NULL); | ||
6537 | if (!tr->dir) { | 6734 | if (!tr->dir) { |
6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 6735 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); |
6539 | return ERR_PTR(-ENOMEM); | 6736 | return ERR_PTR(-ENOMEM); |
6540 | } | 6737 | } |
6541 | 6738 | ||
6542 | return tr->dir; | 6739 | return NULL; |
6740 | } | ||
6741 | |||
6742 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | ||
6743 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | ||
6744 | |||
6745 | static void __init trace_enum_init(void) | ||
6746 | { | ||
6747 | int len; | ||
6748 | |||
6749 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | ||
6750 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | ||
6751 | } | ||
6752 | |||
6753 | #ifdef CONFIG_MODULES | ||
6754 | static void trace_module_add_enums(struct module *mod) | ||
6755 | { | ||
6756 | if (!mod->num_trace_enums) | ||
6757 | return; | ||
6758 | |||
6759 | /* | ||
6760 | * Modules with bad taint do not have events created, do | ||
6761 | * not bother with enums either. | ||
6762 | */ | ||
6763 | if (trace_module_has_bad_taint(mod)) | ||
6764 | return; | ||
6765 | |||
6766 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | ||
6543 | } | 6767 | } |
6544 | 6768 | ||
6545 | static __init int tracer_init_debugfs(void) | 6769 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE |
6770 | static void trace_module_remove_enums(struct module *mod) | ||
6771 | { | ||
6772 | union trace_enum_map_item *map; | ||
6773 | union trace_enum_map_item **last = &trace_enum_maps; | ||
6774 | |||
6775 | if (!mod->num_trace_enums) | ||
6776 | return; | ||
6777 | |||
6778 | mutex_lock(&trace_enum_mutex); | ||
6779 | |||
6780 | map = trace_enum_maps; | ||
6781 | |||
6782 | while (map) { | ||
6783 | if (map->head.mod == mod) | ||
6784 | break; | ||
6785 | map = trace_enum_jmp_to_tail(map); | ||
6786 | last = &map->tail.next; | ||
6787 | map = map->tail.next; | ||
6788 | } | ||
6789 | if (!map) | ||
6790 | goto out; | ||
6791 | |||
6792 | *last = trace_enum_jmp_to_tail(map)->tail.next; | ||
6793 | kfree(map); | ||
6794 | out: | ||
6795 | mutex_unlock(&trace_enum_mutex); | ||
6796 | } | ||
6797 | #else | ||
6798 | static inline void trace_module_remove_enums(struct module *mod) { } | ||
6799 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
6800 | |||
6801 | static int trace_module_notify(struct notifier_block *self, | ||
6802 | unsigned long val, void *data) | ||
6803 | { | ||
6804 | struct module *mod = data; | ||
6805 | |||
6806 | switch (val) { | ||
6807 | case MODULE_STATE_COMING: | ||
6808 | trace_module_add_enums(mod); | ||
6809 | break; | ||
6810 | case MODULE_STATE_GOING: | ||
6811 | trace_module_remove_enums(mod); | ||
6812 | break; | ||
6813 | } | ||
6814 | |||
6815 | return 0; | ||
6816 | } | ||
6817 | |||
6818 | static struct notifier_block trace_module_nb = { | ||
6819 | .notifier_call = trace_module_notify, | ||
6820 | .priority = 0, | ||
6821 | }; | ||
6822 | #endif /* CONFIG_MODULES */ | ||
6823 | |||
6824 | static __init int tracer_init_tracefs(void) | ||
6546 | { | 6825 | { |
6547 | struct dentry *d_tracer; | 6826 | struct dentry *d_tracer; |
6548 | 6827 | ||
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void) | |||
6552 | if (IS_ERR(d_tracer)) | 6831 | if (IS_ERR(d_tracer)) |
6553 | return 0; | 6832 | return 0; |
6554 | 6833 | ||
6555 | init_tracer_debugfs(&global_trace, d_tracer); | 6834 | init_tracer_tracefs(&global_trace, d_tracer); |
6556 | 6835 | ||
6557 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6836 | trace_create_file("tracing_thresh", 0644, d_tracer, |
6558 | &global_trace, &tracing_thresh_fops); | 6837 | &global_trace, &tracing_thresh_fops); |
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void) | |||
6566 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 6845 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, |
6567 | NULL, &tracing_saved_cmdlines_size_fops); | 6846 | NULL, &tracing_saved_cmdlines_size_fops); |
6568 | 6847 | ||
6848 | trace_enum_init(); | ||
6849 | |||
6850 | trace_create_enum_file(d_tracer); | ||
6851 | |||
6852 | #ifdef CONFIG_MODULES | ||
6853 | register_module_notifier(&trace_module_nb); | ||
6854 | #endif | ||
6855 | |||
6569 | #ifdef CONFIG_DYNAMIC_FTRACE | 6856 | #ifdef CONFIG_DYNAMIC_FTRACE |
6570 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6857 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
6571 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6858 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void) | |||
6575 | 6862 | ||
6576 | create_trace_options_dir(&global_trace); | 6863 | create_trace_options_dir(&global_trace); |
6577 | 6864 | ||
6865 | /* If the tracer was started via cmdline, create options for it here */ | ||
6866 | if (global_trace.current_trace != &nop_trace) | ||
6867 | update_tracer_options(&global_trace, global_trace.current_trace); | ||
6868 | |||
6578 | return 0; | 6869 | return 0; |
6579 | } | 6870 | } |
6580 | 6871 | ||
@@ -6888,7 +7179,7 @@ void __init trace_init(void) | |||
6888 | tracepoint_printk = 0; | 7179 | tracepoint_printk = 0; |
6889 | } | 7180 | } |
6890 | tracer_alloc_buffers(); | 7181 | tracer_alloc_buffers(); |
6891 | trace_event_init(); | 7182 | trace_event_init(); |
6892 | } | 7183 | } |
6893 | 7184 | ||
6894 | __init static int clear_boot_tracer(void) | 7185 | __init static int clear_boot_tracer(void) |
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void) | |||
6910 | return 0; | 7201 | return 0; |
6911 | } | 7202 | } |
6912 | 7203 | ||
6913 | fs_initcall(tracer_init_debugfs); | 7204 | fs_initcall(tracer_init_tracefs); |
6914 | late_initcall(clear_boot_tracer); | 7205 | late_initcall(clear_boot_tracer); |
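The trace.c hunks above stop creating a plain debugfs "tracing" directory and instead register it as an automount point that mounts tracefs on first access, so tools that still traverse debugfs/tracing keep working. A minimal userspace sketch of what that buys such a tool, assuming the usual /sys/kernel/tracing and /sys/kernel/debug/tracing locations; the paths and helper below are illustrative, not part of the patch:

/*
 * Illustrative only: with the automount above, both the native tracefs
 * mount point and the legacy debugfs/tracing path are expected to
 * expose the same files, so a tool can probe either.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static int open_tracing_file(const char *name)
{
	static const char *bases[] = {
		"/sys/kernel/tracing/",        /* tracefs mounted directly */
		"/sys/kernel/debug/tracing/",  /* legacy path, served via the automount */
	};
	char path[256];
	int fd, i;

	for (i = 0; i < 2; i++) {
		snprintf(path, sizeof(path), "%s%s", bases[i], name);
		fd = open(path, O_RDONLY);
		if (fd >= 0)
			return fd;
	}
	return -1;
}

int main(void)
{
	int fd = open_tracing_file("trace");

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* ... read trace data from fd here ... */
	close(fd);
	return 0;
}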
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -334,7 +334,7 @@ struct tracer_flags { | |||
334 | 334 | ||
335 | 335 | ||
336 | /** | 336 | /** |
337 | * struct tracer - a specific tracer and its callbacks to interact with debugfs | 337 | * struct tracer - a specific tracer and its callbacks to interact with tracefs |
338 | * @name: the name chosen to select it on the available_tracers file | 338 | * @name: the name chosen to select it on the available_tracers file |
339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 339 | * @init: called when one switches to this tracer (echo name > current_tracer) |
340 | * @reset: called when one switches to another tracer | 340 | * @reset: called when one switches to another tracer |
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { } | |||
1309 | 1309 | ||
1310 | #ifdef CONFIG_EVENT_TRACING | 1310 | #ifdef CONFIG_EVENT_TRACING |
1311 | void trace_event_init(void); | 1311 | void trace_event_init(void); |
1312 | void trace_event_enum_update(struct trace_enum_map **map, int len); | ||
1312 | #else | 1313 | #else |
1313 | static inline void __init trace_event_init(void) { } | 1314 | static inline void __init trace_event_init(void) { } |
1315 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | ||
1314 | #endif | 1316 | #endif |
1315 | 1317 | ||
1316 | extern struct trace_iterator *tracepoint_print_iter; | 1318 | extern struct trace_iterator *tracepoint_print_iter; |
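The trace.h hunk follows the usual compile-out pattern: a real prototype under CONFIG_EVENT_TRACING and an empty static inline stub otherwise, so callers never need #ifdef guards of their own. A small standalone sketch of that pattern, using a made-up FEATURE_ENUM_MAP switch rather than a real kernel config option:

#include <stdio.h>

#define FEATURE_ENUM_MAP 1	/* flip to 0 to compile the empty stub instead */

#if FEATURE_ENUM_MAP
static void enum_update(const char *name, long value)
{
	printf("updating %s -> %ld\n", name, value);
}
#else
/* stub: callers compile and link unchanged when the feature is off */
static inline void enum_update(const char *name, long value) { }
#endif

int main(void)
{
	enum_update("RUNNING", 0);	/* builds either way */
	return 0;
}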
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) |
224 | ), | 224 | ), |
225 | 225 | ||
226 | F_printk("%pf: %s", | 226 | F_printk("%ps: %s", |
227 | (void *)__entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), |
228 | 228 | ||
229 | FILTER_OTHER | 229 | FILTER_OTHER |
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, | |||
238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) |
239 | ), | 239 | ), |
240 | 240 | ||
241 | F_printk("%pf: %s", | 241 | F_printk("%ps: %s", |
242 | (void *)__entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), |
243 | 243 | ||
244 | FILTER_OTHER | 244 | FILTER_OTHER |
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, | |||
253 | __field( const char *, str ) | 253 | __field( const char *, str ) |
254 | ), | 254 | ), |
255 | 255 | ||
256 | F_printk("%pf: %s", | 256 | F_printk("%ps: %s", |
257 | (void *)__entry->ip, __entry->str), | 257 | (void *)__entry->ip, __entry->str), |
258 | 258 | ||
259 | FILTER_OTHER | 259 | FILTER_OTHER |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..7da1dfeb322e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> |
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
16 | #include <linux/debugfs.h> | 16 | #include <linux/tracefs.h> |
17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
480 | return; | 480 | return; |
481 | 481 | ||
482 | if (!--dir->nr_events) { | 482 | if (!--dir->nr_events) { |
483 | debugfs_remove_recursive(dir->entry); | 483 | tracefs_remove_recursive(dir->entry); |
484 | list_del(&dir->list); | 484 | list_del(&dir->list); |
485 | __put_system_dir(dir); | 485 | __put_system_dir(dir); |
486 | } | 486 | } |
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
499 | } | 499 | } |
500 | spin_unlock(&dir->d_lock); | 500 | spin_unlock(&dir->d_lock); |
501 | 501 | ||
502 | debugfs_remove_recursive(dir); | 502 | tracefs_remove_recursive(dir); |
503 | } | 503 | } |
504 | 504 | ||
505 | list_del(&file->list); | 505 | list_del(&file->list); |
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1526 | } else | 1526 | } else |
1527 | __get_system(system); | 1527 | __get_system(system); |
1528 | 1528 | ||
1529 | dir->entry = debugfs_create_dir(name, parent); | 1529 | dir->entry = tracefs_create_dir(name, parent); |
1530 | if (!dir->entry) { | 1530 | if (!dir->entry) { |
1531 | pr_warn("Failed to create system directory %s\n", name); | 1531 | pr_warn("Failed to create system directory %s\n", name); |
1532 | __put_system(system); | 1532 | __put_system(system); |
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1539 | dir->subsystem = system; | 1539 | dir->subsystem = system; |
1540 | file->system = dir; | 1540 | file->system = dir; |
1541 | 1541 | ||
1542 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, | 1542 | entry = tracefs_create_file("filter", 0644, dir->entry, dir, |
1543 | &ftrace_subsystem_filter_fops); | 1543 | &ftrace_subsystem_filter_fops); |
1544 | if (!entry) { | 1544 | if (!entry) { |
1545 | kfree(system->filter); | 1545 | kfree(system->filter); |
1546 | system->filter = NULL; | 1546 | system->filter = NULL; |
1547 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); | 1547 | pr_warn("Could not create tracefs '%s/filter' entry\n", name); |
1548 | } | 1548 | } |
1549 | 1549 | ||
1550 | trace_create_file("enable", 0644, dir->entry, dir, | 1550 | trace_create_file("enable", 0644, dir->entry, dir, |
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1585 | d_events = parent; | 1585 | d_events = parent; |
1586 | 1586 | ||
1587 | name = ftrace_event_name(call); | 1587 | name = ftrace_event_name(call); |
1588 | file->dir = debugfs_create_dir(name, d_events); | 1588 | file->dir = tracefs_create_dir(name, d_events); |
1589 | if (!file->dir) { | 1589 | if (!file->dir) { |
1590 | pr_warn("Could not create debugfs '%s' directory\n", name); | 1590 | pr_warn("Could not create tracefs '%s' directory\n", name); |
1591 | return -1; | 1591 | return -1; |
1592 | } | 1592 | } |
1593 | 1593 | ||
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod) | |||
1704 | return 0; | 1704 | return 0; |
1705 | } | 1705 | } |
1706 | 1706 | ||
1707 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | ||
1708 | { | ||
1709 | int rlen; | ||
1710 | int elen; | ||
1711 | |||
1712 | /* Find the length of the enum value as a string */ | ||
1713 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | ||
1714 | /* Make sure there's enough room to replace the string with the value */ | ||
1715 | if (len < elen) | ||
1716 | return NULL; | ||
1717 | |||
1718 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | ||
1719 | |||
1720 | /* Get the rest of the string of ptr */ | ||
1721 | rlen = strlen(ptr + len); | ||
1722 | memmove(ptr + elen, ptr + len, rlen); | ||
1723 | /* Make sure we end the new string */ | ||
1724 | ptr[elen + rlen] = 0; | ||
1725 | |||
1726 | return ptr + elen; | ||
1727 | } | ||
1728 | |||
1729 | static void update_event_printk(struct ftrace_event_call *call, | ||
1730 | struct trace_enum_map *map) | ||
1731 | { | ||
1732 | char *ptr; | ||
1733 | int quote = 0; | ||
1734 | int len = strlen(map->enum_string); | ||
1735 | |||
1736 | for (ptr = call->print_fmt; *ptr; ptr++) { | ||
1737 | if (*ptr == '\\') { | ||
1738 | ptr++; | ||
1739 | /* paranoid */ | ||
1740 | if (!*ptr) | ||
1741 | break; | ||
1742 | continue; | ||
1743 | } | ||
1744 | if (*ptr == '"') { | ||
1745 | quote ^= 1; | ||
1746 | continue; | ||
1747 | } | ||
1748 | if (quote) | ||
1749 | continue; | ||
1750 | if (isdigit(*ptr)) { | ||
1751 | /* skip numbers */ | ||
1752 | do { | ||
1753 | ptr++; | ||
1754 | /* Check for alpha chars like ULL */ | ||
1755 | } while (isalnum(*ptr)); | ||
1756 | /* | ||
1757 | * A number must have some kind of delimiter after | ||
1758 | * it, and we can ignore that too. | ||
1759 | */ | ||
1760 | continue; | ||
1761 | } | ||
1762 | if (isalpha(*ptr) || *ptr == '_') { | ||
1763 | if (strncmp(map->enum_string, ptr, len) == 0 && | ||
1764 | !isalnum(ptr[len]) && ptr[len] != '_') { | ||
1765 | ptr = enum_replace(ptr, map, len); | ||
1766 | /* Hmm, enum string smaller than value */ | ||
1767 | if (WARN_ON_ONCE(!ptr)) | ||
1768 | return; | ||
1769 | /* | ||
1770 | * No need to decrement here, as enum_replace() | ||
1771 | * returns the pointer to the character past | ||
1772 | * the enum, and two enums cannot be placed | ||
1773 | * back to back without something in between. | ||
1774 | * We can skip that something in between. | ||
1775 | */ | ||
1776 | continue; | ||
1777 | } | ||
1778 | skip_more: | ||
1779 | do { | ||
1780 | ptr++; | ||
1781 | } while (isalnum(*ptr) || *ptr == '_'); | ||
1782 | /* | ||
1783 | * If what comes after this variable is a '.' or | ||
1784 | * '->' then we can continue to ignore that string. | ||
1785 | */ | ||
1786 | if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { | ||
1787 | ptr += *ptr == '.' ? 1 : 2; | ||
1788 | goto skip_more; | ||
1789 | } | ||
1790 | /* | ||
1791 | * Once again, we can skip the delimiter that came | ||
1792 | * after the string. | ||
1793 | */ | ||
1794 | continue; | ||
1795 | } | ||
1796 | } | ||
1797 | } | ||
1798 | |||
1799 | void trace_event_enum_update(struct trace_enum_map **map, int len) | ||
1800 | { | ||
1801 | struct ftrace_event_call *call, *p; | ||
1802 | const char *last_system = NULL; | ||
1803 | int last_i; | ||
1804 | int i; | ||
1805 | |||
1806 | down_write(&trace_event_sem); | ||
1807 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | ||
1808 | /* events are usually grouped together with systems */ | ||
1809 | if (!last_system || call->class->system != last_system) { | ||
1810 | last_i = 0; | ||
1811 | last_system = call->class->system; | ||
1812 | } | ||
1813 | |||
1814 | for (i = last_i; i < len; i++) { | ||
1815 | if (call->class->system == map[i]->system) { | ||
1816 | /* Save the first system if need be */ | ||
1817 | if (!last_i) | ||
1818 | last_i = i; | ||
1819 | update_event_printk(call, map[i]); | ||
1820 | } | ||
1821 | } | ||
1822 | } | ||
1823 | up_write(&trace_event_sem); | ||
1824 | } | ||
1825 | |||
1707 | static struct ftrace_event_file * | 1826 | static struct ftrace_event_file * |
1708 | trace_create_new_event(struct ftrace_event_call *call, | 1827 | trace_create_new_event(struct ftrace_event_call *call, |
1709 | struct trace_array *tr) | 1828 | struct trace_array *tr) |
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
1915 | 2034 | ||
1916 | static struct notifier_block trace_module_nb = { | 2035 | static struct notifier_block trace_module_nb = { |
1917 | .notifier_call = trace_module_notify, | 2036 | .notifier_call = trace_module_notify, |
1918 | .priority = 0, | 2037 | .priority = 1, /* higher than trace.c module notify */ |
1919 | }; | 2038 | }; |
1920 | #endif /* CONFIG_MODULES */ | 2039 | #endif /* CONFIG_MODULES */ |
1921 | 2040 | ||
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; } | |||
2228 | /* | 2347 | /* |
2229 | * The top level array has already had its ftrace_event_file | 2348 | * The top level array has already had its ftrace_event_file |
2230 | * descriptors created in order to allow for early events to | 2349 | * descriptors created in order to allow for early events to |
2231 | * be recorded. This function is called after the debugfs has been | 2350 | * be recorded. This function is called after the tracefs has been |
2232 | * initialized, and we now have to create the files associated | 2351 | * initialized, and we now have to create the files associated |
2233 | * to the events. | 2352 | * to the events. |
2234 | */ | 2353 | */ |
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
2311 | struct dentry *d_events; | 2430 | struct dentry *d_events; |
2312 | struct dentry *entry; | 2431 | struct dentry *entry; |
2313 | 2432 | ||
2314 | entry = debugfs_create_file("set_event", 0644, parent, | 2433 | entry = tracefs_create_file("set_event", 0644, parent, |
2315 | tr, &ftrace_set_event_fops); | 2434 | tr, &ftrace_set_event_fops); |
2316 | if (!entry) { | 2435 | if (!entry) { |
2317 | pr_warn("Could not create debugfs 'set_event' entry\n"); | 2436 | pr_warn("Could not create tracefs 'set_event' entry\n"); |
2318 | return -ENOMEM; | 2437 | return -ENOMEM; |
2319 | } | 2438 | } |
2320 | 2439 | ||
2321 | d_events = debugfs_create_dir("events", parent); | 2440 | d_events = tracefs_create_dir("events", parent); |
2322 | if (!d_events) { | 2441 | if (!d_events) { |
2323 | pr_warn("Could not create debugfs 'events' directory\n"); | 2442 | pr_warn("Could not create tracefs 'events' directory\n"); |
2324 | return -ENOMEM; | 2443 | return -ENOMEM; |
2325 | } | 2444 | } |
2326 | 2445 | ||
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2412 | 2531 | ||
2413 | down_write(&trace_event_sem); | 2532 | down_write(&trace_event_sem); |
2414 | __trace_remove_event_dirs(tr); | 2533 | __trace_remove_event_dirs(tr); |
2415 | debugfs_remove_recursive(tr->event_dir); | 2534 | tracefs_remove_recursive(tr->event_dir); |
2416 | up_write(&trace_event_sem); | 2535 | up_write(&trace_event_sem); |
2417 | 2536 | ||
2418 | tr->event_dir = NULL; | 2537 | tr->event_dir = NULL; |
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void) | |||
2534 | if (IS_ERR(d_tracer)) | 2653 | if (IS_ERR(d_tracer)) |
2535 | return 0; | 2654 | return 0; |
2536 | 2655 | ||
2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2656 | entry = tracefs_create_file("available_events", 0444, d_tracer, |
2538 | tr, &ftrace_avail_fops); | 2657 | tr, &ftrace_avail_fops); |
2539 | if (!entry) | 2658 | if (!entry) |
2540 | pr_warn("Could not create debugfs 'available_events' entry\n"); | 2659 | pr_warn("Could not create tracefs 'available_events' entry\n"); |
2541 | 2660 | ||
2542 | if (trace_define_common_fields()) | 2661 | if (trace_define_common_fields()) |
2543 | pr_warn("tracing: Failed to allocate common fields"); | 2662 | pr_warn("tracing: Failed to allocate common fields"); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
177 | }, \ | 177 | }, \ |
178 | .event.type = etype, \ | 178 | .event.type = etype, \ |
179 | .print_fmt = print, \ | 179 | .print_fmt = print, \ |
180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ | 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ |
181 | }; \ | 181 | }; \ |
182 | struct ftrace_event_call __used \ | 182 | struct ftrace_event_call __used \ |
183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
12 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
151 | * The curr_ret_stack is initialized to -1 and get increased | 150 | * The curr_ret_stack is initialized to -1 and get increased |
152 | * in this function. So it can be less than -1 only if it was | 151 | * in this function. So it can be less than -1 only if it was |
153 | * filtered out via ftrace_graph_notrace_addr() which can be | 152 | * filtered out via ftrace_graph_notrace_addr() which can be |
154 | * set from set_graph_notrace file in debugfs by user. | 153 | * set from set_graph_notrace file in tracefs by user. |
155 | */ | 154 | */ |
156 | if (current->curr_ret_stack < -1) | 155 | if (current->curr_ret_stack < -1) |
157 | return -EBUSY; | 156 | return -EBUSY; |
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { | |||
1432 | .llseek = generic_file_llseek, | 1431 | .llseek = generic_file_llseek, |
1433 | }; | 1432 | }; |
1434 | 1433 | ||
1435 | static __init int init_graph_debugfs(void) | 1434 | static __init int init_graph_tracefs(void) |
1436 | { | 1435 | { |
1437 | struct dentry *d_tracer; | 1436 | struct dentry *d_tracer; |
1438 | 1437 | ||
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) | |||
1445 | 1444 | ||
1446 | return 0; | 1445 | return 0; |
1447 | } | 1446 | } |
1448 | fs_initcall(init_graph_debugfs); | 1447 | fs_initcall(init_graph_tracefs); |
1449 | 1448 | ||
1450 | static __init int init_graph_trace(void) | 1449 | static __init int init_graph_trace(void) |
1451 | { | 1450 | { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..9ba3f43f580e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) | |||
250 | #define fetch_file_offset_string_size NULL | 250 | #define fetch_file_offset_string_size NULL |
251 | 251 | ||
252 | /* Fetch type information table */ | 252 | /* Fetch type information table */ |
253 | const struct fetch_type kprobes_fetch_type_table[] = { | 253 | static const struct fetch_type kprobes_fetch_type_table[] = { |
254 | /* Special types */ | 254 | /* Special types */ |
255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
256 | sizeof(u32), 1, "__data_loc char[]"), | 256 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
760 | 760 | ||
761 | /* Parse fetch argument */ | 761 | /* Parse fetch argument */ |
762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, |
763 | is_return, true); | 763 | is_return, true, |
764 | kprobes_fetch_type_table); | ||
764 | if (ret) { | 765 | if (ret) { |
765 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 766 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
766 | goto error; | 767 | goto error; |
@@ -1310,7 +1311,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
1310 | return ret; | 1311 | return ret; |
1311 | } | 1312 | } |
1312 | 1313 | ||
1313 | /* Make a debugfs interface for controlling probe points */ | 1314 | /* Make a tracefs interface for controlling probe points */ |
1314 | static __init int init_kprobe_trace(void) | 1315 | static __init int init_kprobe_trace(void) |
1315 | { | 1316 | { |
1316 | struct dentry *d_tracer; | 1317 | struct dentry *d_tracer; |
@@ -1323,20 +1324,20 @@ static __init int init_kprobe_trace(void) | |||
1323 | if (IS_ERR(d_tracer)) | 1324 | if (IS_ERR(d_tracer)) |
1324 | return 0; | 1325 | return 0; |
1325 | 1326 | ||
1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1327 | entry = tracefs_create_file("kprobe_events", 0644, d_tracer, |
1327 | NULL, &kprobe_events_ops); | 1328 | NULL, &kprobe_events_ops); |
1328 | 1329 | ||
1329 | /* Event list interface */ | 1330 | /* Event list interface */ |
1330 | if (!entry) | 1331 | if (!entry) |
1331 | pr_warning("Could not create debugfs " | 1332 | pr_warning("Could not create tracefs " |
1332 | "'kprobe_events' entry\n"); | 1333 | "'kprobe_events' entry\n"); |
1333 | 1334 | ||
1334 | /* Profile interface */ | 1335 | /* Profile interface */ |
1335 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | 1336 | entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, |
1336 | NULL, &kprobe_profile_ops); | 1337 | NULL, &kprobe_profile_ops); |
1337 | 1338 | ||
1338 | if (!entry) | 1339 | if (!entry) |
1339 | pr_warning("Could not create debugfs " | 1340 | pr_warning("Could not create tracefs " |
1340 | "'kprobe_profile' entry\n"); | 1341 | "'kprobe_profile' entry\n"); |
1341 | return 0; | 1342 | return 0; |
1342 | } | 1343 | } |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
356 | 356 | ||
357 | /* Recursive argument parser */ | 357 | /* Recursive argument parser */ |
358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, |
359 | struct fetch_param *f, bool is_return, bool is_kprobe) | 359 | struct fetch_param *f, bool is_return, bool is_kprobe, |
360 | const struct fetch_type *ftbl) | ||
360 | { | 361 | { |
361 | const struct fetch_type *ftbl; | ||
362 | unsigned long param; | 362 | unsigned long param; |
363 | long offset; | 363 | long offset; |
364 | char *tmp; | 364 | char *tmp; |
365 | int ret = 0; | 365 | int ret = 0; |
366 | 366 | ||
367 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
368 | BUG_ON(ftbl == NULL); | ||
369 | |||
370 | switch (arg[0]) { | 367 | switch (arg[0]) { |
371 | case '$': | 368 | case '$': |
372 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 369 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); |
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
447 | dprm->fetch_size = get_fetch_size_function(t, | 444 | dprm->fetch_size = get_fetch_size_function(t, |
448 | dprm->fetch, ftbl); | 445 | dprm->fetch, ftbl); |
449 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 446 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, |
450 | is_kprobe); | 447 | is_kprobe, ftbl); |
451 | if (ret) | 448 | if (ret) |
452 | kfree(dprm); | 449 | kfree(dprm); |
453 | else { | 450 | else { |
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
505 | 502 | ||
506 | /* String length checking wrapper */ | 503 | /* String length checking wrapper */ |
507 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 504 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
508 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 505 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
506 | const struct fetch_type *ftbl) | ||
509 | { | 507 | { |
510 | const struct fetch_type *ftbl; | ||
511 | const char *t; | 508 | const char *t; |
512 | int ret; | 509 | int ret; |
513 | 510 | ||
514 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
515 | BUG_ON(ftbl == NULL); | ||
516 | |||
517 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 511 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
518 | pr_info("Argument is too long.: %s\n", arg); | 512 | pr_info("Argument is too long.: %s\n", arg); |
519 | return -ENOSPC; | 513 | return -ENOSPC; |
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
535 | } | 529 | } |
536 | parg->offset = *size; | 530 | parg->offset = *size; |
537 | *size += parg->type->size; | 531 | *size += parg->type->size; |
538 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | 532 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, |
533 | is_kprobe, ftbl); | ||
539 | 534 | ||
540 | if (ret >= 0 && t != NULL) | 535 | if (ret >= 0 && t != NULL) |
541 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 536 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/smp.h> | 27 | #include <linux/smp.h> |
28 | #include <linux/debugfs.h> | 28 | #include <linux/tracefs.h> |
29 | #include <linux/types.h> | 29 | #include <linux/types.h> |
30 | #include <linux/string.h> | 30 | #include <linux/string.h> |
31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
229 | #define FETCH_TYPE_STRING 0 | 229 | #define FETCH_TYPE_STRING 0 |
230 | #define FETCH_TYPE_STRSIZE 1 | 230 | #define FETCH_TYPE_STRSIZE 1 |
231 | 231 | ||
232 | /* | ||
233 | * Fetch type information table. | ||
234 | * It's declared as a weak symbol due to conditional compilation. | ||
235 | */ | ||
236 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
237 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
238 | |||
239 | #ifdef CONFIG_KPROBE_EVENT | 232 | #ifdef CONFIG_KPROBE_EVENT |
240 | struct symbol_cache; | 233 | struct symbol_cache; |
241 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 234 | unsigned long update_symbol_cache(struct symbol_cache *sc); |
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
333 | } | 326 | } |
334 | 327 | ||
335 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 328 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
336 | struct probe_arg *parg, bool is_return, bool is_kprobe); | 329 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
330 | const struct fetch_type *ftbl); | ||
337 | 331 | ||
338 | extern int traceprobe_conflict_field_name(const char *name, | 332 | extern int traceprobe_conflict_field_name(const char *name, |
339 | struct probe_arg *args, int narg); | 333 | struct probe_arg *args, int narg); |
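The trace_probe changes above drop the weak kprobes_fetch_type_table/uprobes_fetch_type_table symbols and instead have each caller pass its fetch-type table into traceprobe_parse_probe_arg(), making the dependency explicit in the signature and removing the BUG_ON(ftbl == NULL) check. A standalone sketch of that refactor pattern with made-up, heavily simplified types (not the kernel structures):

#include <stdio.h>
#include <string.h>

struct fetch_type {
	const char *name;
	int size;
};

/* each subsystem owns its own table and hands it to the parser */
static const struct fetch_type kprobe_types[] = {
	{ "u32", 4 }, { "u64", 8 }, { NULL, 0 },
};

static const struct fetch_type uprobe_types[] = {
	{ "string", 1 }, { "u32", 4 }, { NULL, 0 },
};

/* the table is now a parameter rather than a weak extern symbol */
static const struct fetch_type *find_fetch_type(const char *name,
						const struct fetch_type *ftbl)
{
	for (; ftbl->name; ftbl++)
		if (strcmp(ftbl->name, name) == 0)
			return ftbl;
	return NULL;
}

int main(void)
{
	const struct fetch_type *t;

	t = find_fetch_type("u64", kprobe_types);	/* kprobe-style caller */
	printf("kprobe u64 size: %d\n", t ? t->size : -1);

	t = find_fetch_type("string", uprobe_types);	/* uprobe-style caller */
	printf("uprobe string size: %d\n", t ? t->size : -1);
	return 0;
}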
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/list.h> | 12 | #include <linux/list.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> |
15 | #include <linux/debugfs.h> | 15 | #include <linux/tracefs.h> |
16 | #include "trace_stat.h" | 16 | #include "trace_stat.h" |
17 | #include "trace.h" | 17 | #include "trace.h" |
18 | 18 | ||
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) | |||
65 | 65 | ||
66 | static void destroy_session(struct stat_session *session) | 66 | static void destroy_session(struct stat_session *session) |
67 | { | 67 | { |
68 | debugfs_remove(session->file); | 68 | tracefs_remove(session->file); |
69 | __reset_stat_session(session); | 69 | __reset_stat_session(session); |
70 | mutex_destroy(&session->stat_mutex); | 70 | mutex_destroy(&session->stat_mutex); |
71 | kfree(session); | 71 | kfree(session); |
@@ -279,9 +279,9 @@ static int tracing_stat_init(void) | |||
279 | if (IS_ERR(d_tracing)) | 279 | if (IS_ERR(d_tracing)) |
280 | return 0; | 280 | return 0; |
281 | 281 | ||
282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = tracefs_create_dir("trace_stat", d_tracing); |
283 | if (!stat_dir) | 283 | if (!stat_dir) |
284 | pr_warning("Could not create debugfs " | 284 | pr_warning("Could not create tracefs " |
285 | "'trace_stat' entry\n"); | 285 | "'trace_stat' entry\n"); |
286 | return 0; | 286 | return 0; |
287 | } | 287 | } |
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) | |||
291 | if (!stat_dir && tracing_stat_init()) | 291 | if (!stat_dir && tracing_stat_init()) |
292 | return -ENODEV; | 292 | return -ENODEV; |
293 | 293 | ||
294 | session->file = debugfs_create_file(session->ts->name, 0644, | 294 | session->file = tracefs_create_file(session->ts->name, 0644, |
295 | stat_dir, | 295 | stat_dir, |
296 | session, &tracing_stat_fops); | 296 | session, &tracing_stat_fops); |
297 | if (!session->file) | 297 | if (!session->file) |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..74865465e0b7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) | |||
196 | DEFINE_FETCH_file_offset(string_size) | 196 | DEFINE_FETCH_file_offset(string_size) |
197 | 197 | ||
198 | /* Fetch type information table */ | 198 | /* Fetch type information table */ |
199 | const struct fetch_type uprobes_fetch_type_table[] = { | 199 | static const struct fetch_type uprobes_fetch_type_table[] = { |
200 | /* Special types */ | 200 | /* Special types */ |
201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
202 | sizeof(u32), 1, "__data_loc char[]"), | 202 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
535 | 535 | ||
536 | /* Parse fetch argument */ | 536 | /* Parse fetch argument */ |
537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, |
538 | is_return, false); | 538 | is_return, false, |
539 | uprobes_fetch_type_table); | ||
539 | if (ret) { | 540 | if (ret) { |
540 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 541 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
541 | goto error; | 542 | goto error; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -159,6 +159,7 @@ struct worker_pool { | |||
159 | 159 | ||
160 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ |
161 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ |
162 | struct worker *manager; /* L: purely informational */ | ||
162 | struct mutex attach_mutex; /* attach/detach exclusion */ | 163 | struct mutex attach_mutex; /* attach/detach exclusion */ |
163 | struct list_head workers; /* A: attached workers */ | 164 | struct list_head workers; /* A: attached workers */ |
164 | struct completion *detach_completion; /* all workers detached */ | 165 | struct completion *detach_completion; /* all workers detached */ |
@@ -230,7 +231,7 @@ struct wq_device; | |||
230 | */ | 231 | */ |
231 | struct workqueue_struct { | 232 | struct workqueue_struct { |
232 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 233 | struct list_head pwqs; /* WR: all pwqs of this wq */ |
233 | struct list_head list; /* PL: list of all workqueues */ | 234 | struct list_head list; /* PR: list of all workqueues */ |
234 | 235 | ||
235 | struct mutex mutex; /* protects this wq */ | 236 | struct mutex mutex; /* protects this wq */ |
236 | int work_color; /* WQ: current work color */ | 237 | int work_color; /* WQ: current work color */ |
@@ -257,6 +258,13 @@ struct workqueue_struct { | |||
257 | #endif | 258 | #endif |
258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 259 | char name[WQ_NAME_LEN]; /* I: workqueue name */ |
259 | 260 | ||
261 | /* | ||
262 | * Destruction of workqueue_struct is sched-RCU protected to allow | ||
263 | * walking the workqueues list without grabbing wq_pool_mutex. | ||
264 | * This is used to dump all workqueues from sysrq. | ||
265 | */ | ||
266 | struct rcu_head rcu; | ||
267 | |||
260 | /* hot fields used during command issue, aligned to cacheline */ | 268 | /* hot fields used during command issue, aligned to cacheline */ |
261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 269 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ |
262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 270 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ |
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
288 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 296 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ |
289 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 297 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ |
290 | 298 | ||
291 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | 299 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ |
292 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 300 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ |
293 | 301 | ||
294 | /* the per-cpu worker pools */ | 302 | /* the per-cpu worker pools */ |
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |||
324 | static int worker_thread(void *__worker); | 332 | static int worker_thread(void *__worker); |
325 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 333 | static void copy_workqueue_attrs(struct workqueue_attrs *to, |
326 | const struct workqueue_attrs *from); | 334 | const struct workqueue_attrs *from); |
335 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | ||
327 | 336 | ||
328 | #define CREATE_TRACE_POINTS | 337 | #define CREATE_TRACE_POINTS |
329 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> |
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) | |||
1911 | */ | 1920 | */ |
1912 | if (!mutex_trylock(&pool->manager_arb)) | 1921 | if (!mutex_trylock(&pool->manager_arb)) |
1913 | return false; | 1922 | return false; |
1923 | pool->manager = worker; | ||
1914 | 1924 | ||
1915 | maybe_create_worker(pool); | 1925 | maybe_create_worker(pool); |
1916 | 1926 | ||
1927 | pool->manager = NULL; | ||
1917 | mutex_unlock(&pool->manager_arb); | 1928 | mutex_unlock(&pool->manager_arb); |
1918 | return true; | 1929 | return true; |
1919 | } | 1930 | } |
@@ -2303,6 +2314,7 @@ repeat: | |||
2303 | struct wq_barrier { | 2314 | struct wq_barrier { |
2304 | struct work_struct work; | 2315 | struct work_struct work; |
2305 | struct completion done; | 2316 | struct completion done; |
2317 | struct task_struct *task; /* purely informational */ | ||
2306 | }; | 2318 | }; |
2307 | 2319 | ||
2308 | static void wq_barrier_func(struct work_struct *work) | 2320 | static void wq_barrier_func(struct work_struct *work) |
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
2351 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 2363 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
2352 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2364 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
2353 | init_completion(&barr->done); | 2365 | init_completion(&barr->done); |
2366 | barr->task = current; | ||
2354 | 2367 | ||
2355 | /* | 2368 | /* |
2356 | * If @target is currently being executed, schedule the | 2369 | * If @target is currently being executed, schedule the |
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
2989 | } | 3002 | } |
2990 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3003 | EXPORT_SYMBOL_GPL(execute_in_process_context); |
2991 | 3004 | ||
2992 | #ifdef CONFIG_SYSFS | ||
2993 | /* | ||
2994 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
2995 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
2996 | * following attributes. | ||
2997 | * | ||
2998 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
2999 | * max_active RW int : maximum number of in-flight work items | ||
3000 | * | ||
3001 | * Unbound workqueues have the following extra attributes. | ||
3002 | * | ||
3003 | * id RO int : the associated pool ID | ||
3004 | * nice RW int : nice value of the workers | ||
3005 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
3006 | */ | ||
3007 | struct wq_device { | ||
3008 | struct workqueue_struct *wq; | ||
3009 | struct device dev; | ||
3010 | }; | ||
3011 | |||
3012 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
3013 | { | ||
3014 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3015 | |||
3016 | return wq_dev->wq; | ||
3017 | } | ||
3018 | |||
3019 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
3020 | char *buf) | ||
3021 | { | ||
3022 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3023 | |||
3024 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
3025 | } | ||
3026 | static DEVICE_ATTR_RO(per_cpu); | ||
3027 | |||
3028 | static ssize_t max_active_show(struct device *dev, | ||
3029 | struct device_attribute *attr, char *buf) | ||
3030 | { | ||
3031 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3032 | |||
3033 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
3034 | } | ||
3035 | |||
3036 | static ssize_t max_active_store(struct device *dev, | ||
3037 | struct device_attribute *attr, const char *buf, | ||
3038 | size_t count) | ||
3039 | { | ||
3040 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3041 | int val; | ||
3042 | |||
3043 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
3044 | return -EINVAL; | ||
3045 | |||
3046 | workqueue_set_max_active(wq, val); | ||
3047 | return count; | ||
3048 | } | ||
3049 | static DEVICE_ATTR_RW(max_active); | ||
3050 | |||
3051 | static struct attribute *wq_sysfs_attrs[] = { | ||
3052 | &dev_attr_per_cpu.attr, | ||
3053 | &dev_attr_max_active.attr, | ||
3054 | NULL, | ||
3055 | }; | ||
3056 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3057 | |||
3058 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
3059 | struct device_attribute *attr, char *buf) | ||
3060 | { | ||
3061 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3062 | const char *delim = ""; | ||
3063 | int node, written = 0; | ||
3064 | |||
3065 | rcu_read_lock_sched(); | ||
3066 | for_each_node(node) { | ||
3067 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
3068 | "%s%d:%d", delim, node, | ||
3069 | unbound_pwq_by_node(wq, node)->pool->id); | ||
3070 | delim = " "; | ||
3071 | } | ||
3072 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
3073 | rcu_read_unlock_sched(); | ||
3074 | |||
3075 | return written; | ||
3076 | } | ||
3077 | |||
3078 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
3079 | char *buf) | ||
3080 | { | ||
3081 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3082 | int written; | ||
3083 | |||
3084 | mutex_lock(&wq->mutex); | ||
3085 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
3086 | mutex_unlock(&wq->mutex); | ||
3087 | |||
3088 | return written; | ||
3089 | } | ||
3090 | |||
3091 | /* prepare workqueue_attrs for sysfs store operations */ | ||
3092 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
3093 | { | ||
3094 | struct workqueue_attrs *attrs; | ||
3095 | |||
3096 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3097 | if (!attrs) | ||
3098 | return NULL; | ||
3099 | |||
3100 | mutex_lock(&wq->mutex); | ||
3101 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
3102 | mutex_unlock(&wq->mutex); | ||
3103 | return attrs; | ||
3104 | } | ||
3105 | |||
3106 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
3107 | const char *buf, size_t count) | ||
3108 | { | ||
3109 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3110 | struct workqueue_attrs *attrs; | ||
3111 | int ret; | ||
3112 | |||
3113 | attrs = wq_sysfs_prep_attrs(wq); | ||
3114 | if (!attrs) | ||
3115 | return -ENOMEM; | ||
3116 | |||
3117 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
3118 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
3119 | ret = apply_workqueue_attrs(wq, attrs); | ||
3120 | else | ||
3121 | ret = -EINVAL; | ||
3122 | |||
3123 | free_workqueue_attrs(attrs); | ||
3124 | return ret ?: count; | ||
3125 | } | ||
3126 | |||
3127 | static ssize_t wq_cpumask_show(struct device *dev, | ||
3128 | struct device_attribute *attr, char *buf) | ||
3129 | { | ||
3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3131 | int written; | ||
3132 | |||
3133 | mutex_lock(&wq->mutex); | ||
3134 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
3135 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
3136 | mutex_unlock(&wq->mutex); | ||
3137 | return written; | ||
3138 | } | ||
3139 | |||
3140 | static ssize_t wq_cpumask_store(struct device *dev, | ||
3141 | struct device_attribute *attr, | ||
3142 | const char *buf, size_t count) | ||
3143 | { | ||
3144 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3145 | struct workqueue_attrs *attrs; | ||
3146 | int ret; | ||
3147 | |||
3148 | attrs = wq_sysfs_prep_attrs(wq); | ||
3149 | if (!attrs) | ||
3150 | return -ENOMEM; | ||
3151 | |||
3152 | ret = cpumask_parse(buf, attrs->cpumask); | ||
3153 | if (!ret) | ||
3154 | ret = apply_workqueue_attrs(wq, attrs); | ||
3155 | |||
3156 | free_workqueue_attrs(attrs); | ||
3157 | return ret ?: count; | ||
3158 | } | ||
3159 | |||
3160 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
3161 | char *buf) | ||
3162 | { | ||
3163 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3164 | int written; | ||
3165 | |||
3166 | mutex_lock(&wq->mutex); | ||
3167 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
3168 | !wq->unbound_attrs->no_numa); | ||
3169 | mutex_unlock(&wq->mutex); | ||
3170 | |||
3171 | return written; | ||
3172 | } | ||
3173 | |||
3174 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
3175 | const char *buf, size_t count) | ||
3176 | { | ||
3177 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3178 | struct workqueue_attrs *attrs; | ||
3179 | int v, ret; | ||
3180 | |||
3181 | attrs = wq_sysfs_prep_attrs(wq); | ||
3182 | if (!attrs) | ||
3183 | return -ENOMEM; | ||
3184 | |||
3185 | ret = -EINVAL; | ||
3186 | if (sscanf(buf, "%d", &v) == 1) { | ||
3187 | attrs->no_numa = !v; | ||
3188 | ret = apply_workqueue_attrs(wq, attrs); | ||
3189 | } | ||
3190 | |||
3191 | free_workqueue_attrs(attrs); | ||
3192 | return ret ?: count; | ||
3193 | } | ||
3194 | |||
3195 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
3196 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
3197 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
3198 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
3199 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
3200 | __ATTR_NULL, | ||
3201 | }; | ||
3202 | |||
3203 | static struct bus_type wq_subsys = { | ||
3204 | .name = "workqueue", | ||
3205 | .dev_groups = wq_sysfs_groups, | ||
3206 | }; | ||
3207 | |||
3208 | static int __init wq_sysfs_init(void) | ||
3209 | { | ||
3210 | return subsys_virtual_register(&wq_subsys, NULL); | ||
3211 | } | ||
3212 | core_initcall(wq_sysfs_init); | ||
3213 | |||
3214 | static void wq_device_release(struct device *dev) | ||
3215 | { | ||
3216 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3217 | |||
3218 | kfree(wq_dev); | ||
3219 | } | ||
3220 | |||
3221 | /** | ||
3222 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
3223 | * @wq: the workqueue to register | ||
3224 | * | ||
3225 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
3226 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set, | ||
3227 | * which is the preferred method. | ||
3228 | * | ||
3229 | * A workqueue user should use this function directly iff it wants to apply | ||
3230 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
3231 | * apply_workqueue_attrs() may race against userland updating the | ||
3232 | * attributes. | ||
3233 | * | ||
3234 | * Return: 0 on success, -errno on failure. | ||
3235 | */ | ||
3236 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
3237 | { | ||
3238 | struct wq_device *wq_dev; | ||
3239 | int ret; | ||
3240 | |||
3241 | /* | ||
3242 | * Adjusting max_active or creating new pwqs by applying | ||
3243 | * attributes breaks the ordering guarantee. Disallow exposing ordered | ||
3244 | * workqueues. | ||
3245 | */ | ||
3246 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
3247 | return -EINVAL; | ||
3248 | |||
3249 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
3250 | if (!wq_dev) | ||
3251 | return -ENOMEM; | ||
3252 | |||
3253 | wq_dev->wq = wq; | ||
3254 | wq_dev->dev.bus = &wq_subsys; | ||
3255 | wq_dev->dev.init_name = wq->name; | ||
3256 | wq_dev->dev.release = wq_device_release; | ||
3257 | |||
3258 | /* | ||
3259 | * unbound_attrs are created separately. Suppress uevent until | ||
3260 | * everything is ready. | ||
3261 | */ | ||
3262 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
3263 | |||
3264 | ret = device_register(&wq_dev->dev); | ||
3265 | if (ret) { | ||
3266 | kfree(wq_dev); | ||
3267 | wq->wq_dev = NULL; | ||
3268 | return ret; | ||
3269 | } | ||
3270 | |||
3271 | if (wq->flags & WQ_UNBOUND) { | ||
3272 | struct device_attribute *attr; | ||
3273 | |||
3274 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
3275 | ret = device_create_file(&wq_dev->dev, attr); | ||
3276 | if (ret) { | ||
3277 | device_unregister(&wq_dev->dev); | ||
3278 | wq->wq_dev = NULL; | ||
3279 | return ret; | ||
3280 | } | ||
3281 | } | ||
3282 | } | ||
3283 | |||
3284 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
3285 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
3286 | return 0; | ||
3287 | } | ||
3288 | |||
3289 | /** | ||
3290 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
3291 | * @wq: the workqueue to unregister | ||
3292 | * | ||
3293 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
3294 | */ | ||
3295 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
3296 | { | ||
3297 | struct wq_device *wq_dev = wq->wq_dev; | ||
3298 | |||
3299 | if (!wq->wq_dev) | ||
3300 | return; | ||
3301 | |||
3302 | wq->wq_dev = NULL; | ||
3303 | device_unregister(&wq_dev->dev); | ||
3304 | } | ||
3305 | #else /* CONFIG_SYSFS */ | ||
3306 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
3307 | #endif /* CONFIG_SYSFS */ | ||
3308 | |||
3309 | /** | 3005 | /** |
3310 | * free_workqueue_attrs - free a workqueue_attrs | 3006 | * free_workqueue_attrs - free a workqueue_attrs |
3311 | * @attrs: workqueue_attrs to free | 3007 | * @attrs: workqueue_attrs to free |
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3424 | return 0; | 3120 | return 0; |
3425 | } | 3121 | } |
3426 | 3122 | ||
3123 | static void rcu_free_wq(struct rcu_head *rcu) | ||
3124 | { | ||
3125 | struct workqueue_struct *wq = | ||
3126 | container_of(rcu, struct workqueue_struct, rcu); | ||
3127 | |||
3128 | if (!(wq->flags & WQ_UNBOUND)) | ||
3129 | free_percpu(wq->cpu_pwqs); | ||
3130 | else | ||
3131 | free_workqueue_attrs(wq->unbound_attrs); | ||
3132 | |||
3133 | kfree(wq->rescuer); | ||
3134 | kfree(wq); | ||
3135 | } | ||
3136 | |||
3427 | static void rcu_free_pool(struct rcu_head *rcu) | 3137 | static void rcu_free_pool(struct rcu_head *rcu) |
3428 | { | 3138 | { |
3429 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3139 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); |
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
3601 | 3311 | ||
3602 | /* | 3312 | /* |
3603 | * If we're the last pwq going away, @wq is already dead and no one | 3313 | * If we're the last pwq going away, @wq is already dead and no one |
3604 | * is gonna access it anymore. Free it. | 3314 | * is gonna access it anymore. Schedule RCU free. |
3605 | */ | 3315 | */ |
3606 | if (is_last) { | 3316 | if (is_last) |
3607 | free_workqueue_attrs(wq->unbound_attrs); | 3317 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
3608 | kfree(wq); | ||
3609 | } | ||
3610 | } | 3318 | } |
3611 | 3319 | ||
3612 | /** | 3320 | /** |
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4143 | pwq_adjust_max_active(pwq); | 3851 | pwq_adjust_max_active(pwq); |
4144 | mutex_unlock(&wq->mutex); | 3852 | mutex_unlock(&wq->mutex); |
4145 | 3853 | ||
4146 | list_add(&wq->list, &workqueues); | 3854 | list_add_tail_rcu(&wq->list, &workqueues); |
4147 | 3855 | ||
4148 | mutex_unlock(&wq_pool_mutex); | 3856 | mutex_unlock(&wq_pool_mutex); |
4149 | 3857 | ||
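The switch to list_add_tail_rcu() here, together with the list_del_rcu() in destroy_workqueue() below and the call_rcu_sched() deferrals through rcu_free_wq(), is the usual sched-RCU publish/retract pattern for the global workqueues list. A generic sketch of both sides (illustrative only, reusing the names from this patch, error handling omitted):

	/* Writer side: publish/retract under wq_pool_mutex, free after a grace period. */
	mutex_lock(&wq_pool_mutex);
	list_add_tail_rcu(&wq->list, &workqueues);	/* publish */
	mutex_unlock(&wq_pool_mutex);

	mutex_lock(&wq_pool_mutex);
	list_del_rcu(&wq->list);			/* retract */
	mutex_unlock(&wq_pool_mutex);
	call_rcu_sched(&wq->rcu, rcu_free_wq);		/* free once sched-RCU readers are done */

	/* Reader side, as used by show_workqueue_state() further down. */
	rcu_read_lock_sched();
	list_for_each_entry_rcu(wq, &workqueues, list)
		pr_info("workqueue %s\n", wq->name);	/* @wq cannot be freed under us here */
	rcu_read_unlock_sched();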
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
4199 | * flushing is complete in case freeze races us. | 3907 | * flushing is complete in case freeze races us. |
4200 | */ | 3908 | */ |
4201 | mutex_lock(&wq_pool_mutex); | 3909 | mutex_lock(&wq_pool_mutex); |
4202 | list_del_init(&wq->list); | 3910 | list_del_rcu(&wq->list); |
4203 | mutex_unlock(&wq_pool_mutex); | 3911 | mutex_unlock(&wq_pool_mutex); |
4204 | 3912 | ||
4205 | workqueue_sysfs_unregister(wq); | 3913 | workqueue_sysfs_unregister(wq); |
4206 | 3914 | ||
4207 | if (wq->rescuer) { | 3915 | if (wq->rescuer) |
4208 | kthread_stop(wq->rescuer->task); | 3916 | kthread_stop(wq->rescuer->task); |
4209 | kfree(wq->rescuer); | ||
4210 | wq->rescuer = NULL; | ||
4211 | } | ||
4212 | 3917 | ||
4213 | if (!(wq->flags & WQ_UNBOUND)) { | 3918 | if (!(wq->flags & WQ_UNBOUND)) { |
4214 | /* | 3919 | /* |
4215 | * The base ref is never dropped on per-cpu pwqs. Directly | 3920 | * The base ref is never dropped on per-cpu pwqs. Directly |
4216 | * free the pwqs and wq. | 3921 | * schedule RCU free. |
4217 | */ | 3922 | */ |
4218 | free_percpu(wq->cpu_pwqs); | 3923 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
4219 | kfree(wq); | ||
4220 | } else { | 3924 | } else { |
4221 | /* | 3925 | /* |
4222 | * We're the sole accessor of @wq at this point. Directly | 3926 | * We're the sole accessor of @wq at this point. Directly |
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
4437 | } | 4141 | } |
4438 | } | 4142 | } |
4439 | 4143 | ||
4144 | static void pr_cont_pool_info(struct worker_pool *pool) | ||
4145 | { | ||
4146 | pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); | ||
4147 | if (pool->node != NUMA_NO_NODE) | ||
4148 | pr_cont(" node=%d", pool->node); | ||
4149 | pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); | ||
4150 | } | ||
4151 | |||
4152 | static void pr_cont_work(bool comma, struct work_struct *work) | ||
4153 | { | ||
4154 | if (work->func == wq_barrier_func) { | ||
4155 | struct wq_barrier *barr; | ||
4156 | |||
4157 | barr = container_of(work, struct wq_barrier, work); | ||
4158 | |||
4159 | pr_cont("%s BAR(%d)", comma ? "," : "", | ||
4160 | task_pid_nr(barr->task)); | ||
4161 | } else { | ||
4162 | pr_cont("%s %pf", comma ? "," : "", work->func); | ||
4163 | } | ||
4164 | } | ||
4165 | |||
4166 | static void show_pwq(struct pool_workqueue *pwq) | ||
4167 | { | ||
4168 | struct worker_pool *pool = pwq->pool; | ||
4169 | struct work_struct *work; | ||
4170 | struct worker *worker; | ||
4171 | bool has_in_flight = false, has_pending = false; | ||
4172 | int bkt; | ||
4173 | |||
4174 | pr_info(" pwq %d:", pool->id); | ||
4175 | pr_cont_pool_info(pool); | ||
4176 | |||
4177 | pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, | ||
4178 | !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); | ||
4179 | |||
4180 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4181 | if (worker->current_pwq == pwq) { | ||
4182 | has_in_flight = true; | ||
4183 | break; | ||
4184 | } | ||
4185 | } | ||
4186 | if (has_in_flight) { | ||
4187 | bool comma = false; | ||
4188 | |||
4189 | pr_info(" in-flight:"); | ||
4190 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4191 | if (worker->current_pwq != pwq) | ||
4192 | continue; | ||
4193 | |||
4194 | pr_cont("%s %d%s:%pf", comma ? "," : "", | ||
4195 | task_pid_nr(worker->task), | ||
4196 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | ||
4197 | worker->current_func); | ||
4198 | list_for_each_entry(work, &worker->scheduled, entry) | ||
4199 | pr_cont_work(false, work); | ||
4200 | comma = true; | ||
4201 | } | ||
4202 | pr_cont("\n"); | ||
4203 | } | ||
4204 | |||
4205 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4206 | if (get_work_pwq(work) == pwq) { | ||
4207 | has_pending = true; | ||
4208 | break; | ||
4209 | } | ||
4210 | } | ||
4211 | if (has_pending) { | ||
4212 | bool comma = false; | ||
4213 | |||
4214 | pr_info(" pending:"); | ||
4215 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4216 | if (get_work_pwq(work) != pwq) | ||
4217 | continue; | ||
4218 | |||
4219 | pr_cont_work(comma, work); | ||
4220 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4221 | } | ||
4222 | pr_cont("\n"); | ||
4223 | } | ||
4224 | |||
4225 | if (!list_empty(&pwq->delayed_works)) { | ||
4226 | bool comma = false; | ||
4227 | |||
4228 | pr_info(" delayed:"); | ||
4229 | list_for_each_entry(work, &pwq->delayed_works, entry) { | ||
4230 | pr_cont_work(comma, work); | ||
4231 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4232 | } | ||
4233 | pr_cont("\n"); | ||
4234 | } | ||
4235 | } | ||
4236 | |||
4237 | /** | ||
4238 | * show_workqueue_state - dump workqueue state | ||
4239 | * | ||
4240 | * Called from a sysrq handler and prints out all busy workqueues and | ||
4241 | * pools. | ||
4242 | */ | ||
4243 | void show_workqueue_state(void) | ||
4244 | { | ||
4245 | struct workqueue_struct *wq; | ||
4246 | struct worker_pool *pool; | ||
4247 | unsigned long flags; | ||
4248 | int pi; | ||
4249 | |||
4250 | rcu_read_lock_sched(); | ||
4251 | |||
4252 | pr_info("Showing busy workqueues and worker pools:\n"); | ||
4253 | |||
4254 | list_for_each_entry_rcu(wq, &workqueues, list) { | ||
4255 | struct pool_workqueue *pwq; | ||
4256 | bool idle = true; | ||
4257 | |||
4258 | for_each_pwq(pwq, wq) { | ||
4259 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { | ||
4260 | idle = false; | ||
4261 | break; | ||
4262 | } | ||
4263 | } | ||
4264 | if (idle) | ||
4265 | continue; | ||
4266 | |||
4267 | pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); | ||
4268 | |||
4269 | for_each_pwq(pwq, wq) { | ||
4270 | spin_lock_irqsave(&pwq->pool->lock, flags); | ||
4271 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | ||
4272 | show_pwq(pwq); | ||
4273 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | ||
4274 | } | ||
4275 | } | ||
4276 | |||
4277 | for_each_pool(pool, pi) { | ||
4278 | struct worker *worker; | ||
4279 | bool first = true; | ||
4280 | |||
4281 | spin_lock_irqsave(&pool->lock, flags); | ||
4282 | if (pool->nr_workers == pool->nr_idle) | ||
4283 | goto next_pool; | ||
4284 | |||
4285 | pr_info("pool %d:", pool->id); | ||
4286 | pr_cont_pool_info(pool); | ||
4287 | pr_cont(" workers=%d", pool->nr_workers); | ||
4288 | if (pool->manager) | ||
4289 | pr_cont(" manager: %d", | ||
4290 | task_pid_nr(pool->manager->task)); | ||
4291 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
4292 | pr_cont(" %s%d", first ? "idle: " : "", | ||
4293 | task_pid_nr(worker->task)); | ||
4294 | first = false; | ||
4295 | } | ||
4296 | pr_cont("\n"); | ||
4297 | next_pool: | ||
4298 | spin_unlock_irqrestore(&pool->lock, flags); | ||
4299 | } | ||
4300 | |||
4301 | rcu_read_unlock_sched(); | ||
4302 | } | ||
4303 | |||
4440 | /* | 4304 | /* |
4441 | * CPU hotplug. | 4305 | * CPU hotplug. |
4442 | * | 4306 | * |
@@ -4834,6 +4698,323 @@ out_unlock: | |||
4834 | } | 4698 | } |
4835 | #endif /* CONFIG_FREEZER */ | 4699 | #endif /* CONFIG_FREEZER */ |
4836 | 4700 | ||
4701 | #ifdef CONFIG_SYSFS | ||
4702 | /* | ||
4703 | * Workqueues with the WQ_SYSFS flag set are visible to userland via | ||
4704 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
4705 | * following attributes. | ||
4706 | * | ||
4707 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
4708 | * max_active RW int : maximum number of in-flight work items | ||
4709 | * | ||
4710 | * Unbound workqueues have the following extra attributes. | ||
4711 | * | ||
4712 | * pool_ids RO int : the associated pool IDs for each node | ||
4713 | * nice RW int : nice value of the workers | ||
4714 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
4715 | */ | ||
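For context, exposing a workqueue under this hierarchy only requires passing WQ_SYSFS at allocation time. The fragment below is a hypothetical driver init (name and flags are illustrative, not part of this patch):

	#include <linux/init.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *mydrv_wq;	/* illustrative */

	static int __init mydrv_init(void)
	{
		/*
		 * Appears as /sys/bus/workqueue/devices/mydrv_wq with the per_cpu
		 * and max_active attributes, plus the unbound-only ones above.
		 */
		mydrv_wq = alloc_workqueue("mydrv_wq", WQ_UNBOUND | WQ_SYSFS, 0);
		return mydrv_wq ? 0 : -ENOMEM;
	}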
4716 | struct wq_device { | ||
4717 | struct workqueue_struct *wq; | ||
4718 | struct device dev; | ||
4719 | }; | ||
4720 | |||
4721 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
4722 | { | ||
4723 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4724 | |||
4725 | return wq_dev->wq; | ||
4726 | } | ||
4727 | |||
4728 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
4729 | char *buf) | ||
4730 | { | ||
4731 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4732 | |||
4733 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
4734 | } | ||
4735 | static DEVICE_ATTR_RO(per_cpu); | ||
4736 | |||
4737 | static ssize_t max_active_show(struct device *dev, | ||
4738 | struct device_attribute *attr, char *buf) | ||
4739 | { | ||
4740 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4741 | |||
4742 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
4743 | } | ||
4744 | |||
4745 | static ssize_t max_active_store(struct device *dev, | ||
4746 | struct device_attribute *attr, const char *buf, | ||
4747 | size_t count) | ||
4748 | { | ||
4749 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4750 | int val; | ||
4751 | |||
4752 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
4753 | return -EINVAL; | ||
4754 | |||
4755 | workqueue_set_max_active(wq, val); | ||
4756 | return count; | ||
4757 | } | ||
4758 | static DEVICE_ATTR_RW(max_active); | ||
4759 | |||
4760 | static struct attribute *wq_sysfs_attrs[] = { | ||
4761 | &dev_attr_per_cpu.attr, | ||
4762 | &dev_attr_max_active.attr, | ||
4763 | NULL, | ||
4764 | }; | ||
4765 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
4766 | |||
4767 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
4768 | struct device_attribute *attr, char *buf) | ||
4769 | { | ||
4770 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4771 | const char *delim = ""; | ||
4772 | int node, written = 0; | ||
4773 | |||
4774 | rcu_read_lock_sched(); | ||
4775 | for_each_node(node) { | ||
4776 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
4777 | "%s%d:%d", delim, node, | ||
4778 | unbound_pwq_by_node(wq, node)->pool->id); | ||
4779 | delim = " "; | ||
4780 | } | ||
4781 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
4782 | rcu_read_unlock_sched(); | ||
4783 | |||
4784 | return written; | ||
4785 | } | ||
4786 | |||
4787 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
4788 | char *buf) | ||
4789 | { | ||
4790 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4791 | int written; | ||
4792 | |||
4793 | mutex_lock(&wq->mutex); | ||
4794 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
4795 | mutex_unlock(&wq->mutex); | ||
4796 | |||
4797 | return written; | ||
4798 | } | ||
4799 | |||
4800 | /* prepare workqueue_attrs for sysfs store operations */ | ||
4801 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
4802 | { | ||
4803 | struct workqueue_attrs *attrs; | ||
4804 | |||
4805 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
4806 | if (!attrs) | ||
4807 | return NULL; | ||
4808 | |||
4809 | mutex_lock(&wq->mutex); | ||
4810 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
4811 | mutex_unlock(&wq->mutex); | ||
4812 | return attrs; | ||
4813 | } | ||
4814 | |||
4815 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
4816 | const char *buf, size_t count) | ||
4817 | { | ||
4818 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4819 | struct workqueue_attrs *attrs; | ||
4820 | int ret; | ||
4821 | |||
4822 | attrs = wq_sysfs_prep_attrs(wq); | ||
4823 | if (!attrs) | ||
4824 | return -ENOMEM; | ||
4825 | |||
4826 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
4827 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
4828 | ret = apply_workqueue_attrs(wq, attrs); | ||
4829 | else | ||
4830 | ret = -EINVAL; | ||
4831 | |||
4832 | free_workqueue_attrs(attrs); | ||
4833 | return ret ?: count; | ||
4834 | } | ||
4835 | |||
4836 | static ssize_t wq_cpumask_show(struct device *dev, | ||
4837 | struct device_attribute *attr, char *buf) | ||
4838 | { | ||
4839 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4840 | int written; | ||
4841 | |||
4842 | mutex_lock(&wq->mutex); | ||
4843 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
4844 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
4845 | mutex_unlock(&wq->mutex); | ||
4846 | return written; | ||
4847 | } | ||
4848 | |||
4849 | static ssize_t wq_cpumask_store(struct device *dev, | ||
4850 | struct device_attribute *attr, | ||
4851 | const char *buf, size_t count) | ||
4852 | { | ||
4853 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4854 | struct workqueue_attrs *attrs; | ||
4855 | int ret; | ||
4856 | |||
4857 | attrs = wq_sysfs_prep_attrs(wq); | ||
4858 | if (!attrs) | ||
4859 | return -ENOMEM; | ||
4860 | |||
4861 | ret = cpumask_parse(buf, attrs->cpumask); | ||
4862 | if (!ret) | ||
4863 | ret = apply_workqueue_attrs(wq, attrs); | ||
4864 | |||
4865 | free_workqueue_attrs(attrs); | ||
4866 | return ret ?: count; | ||
4867 | } | ||
4868 | |||
4869 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
4870 | char *buf) | ||
4871 | { | ||
4872 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4873 | int written; | ||
4874 | |||
4875 | mutex_lock(&wq->mutex); | ||
4876 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
4877 | !wq->unbound_attrs->no_numa); | ||
4878 | mutex_unlock(&wq->mutex); | ||
4879 | |||
4880 | return written; | ||
4881 | } | ||
4882 | |||
4883 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
4884 | const char *buf, size_t count) | ||
4885 | { | ||
4886 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4887 | struct workqueue_attrs *attrs; | ||
4888 | int v, ret; | ||
4889 | |||
4890 | attrs = wq_sysfs_prep_attrs(wq); | ||
4891 | if (!attrs) | ||
4892 | return -ENOMEM; | ||
4893 | |||
4894 | ret = -EINVAL; | ||
4895 | if (sscanf(buf, "%d", &v) == 1) { | ||
4896 | attrs->no_numa = !v; | ||
4897 | ret = apply_workqueue_attrs(wq, attrs); | ||
4898 | } | ||
4899 | |||
4900 | free_workqueue_attrs(attrs); | ||
4901 | return ret ?: count; | ||
4902 | } | ||
4903 | |||
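The nice, cpumask and numa stores above all run the same copy-modify-apply-free cycle on a workqueue_attrs. Kernel-side code can drive unbound attributes the same way without going through sysfs; a minimal sketch, assuming a hypothetical helper and the GFP-taking alloc_workqueue_attrs() of this kernel version:

	/* Hypothetical: pin a driver's unbound workqueue to CPU0 at nice -5. */
	static int mydrv_tune_wq(struct workqueue_struct *wq)
	{
		struct workqueue_attrs *attrs;
		int ret;

		attrs = alloc_workqueue_attrs(GFP_KERNEL);
		if (!attrs)
			return -ENOMEM;

		attrs->nice = -5;	/* must stay within MIN_NICE..MAX_NICE, cf. wq_nice_store() */
		cpumask_copy(attrs->cpumask, cpumask_of(0));
		ret = apply_workqueue_attrs(wq, attrs);	/* same call the sysfs stores end up in */
		free_workqueue_attrs(attrs);
		return ret;
	}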
4904 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
4905 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
4906 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
4907 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
4908 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
4909 | __ATTR_NULL, | ||
4910 | }; | ||
4911 | |||
4912 | static struct bus_type wq_subsys = { | ||
4913 | .name = "workqueue", | ||
4914 | .dev_groups = wq_sysfs_groups, | ||
4915 | }; | ||
4916 | |||
4917 | static int __init wq_sysfs_init(void) | ||
4918 | { | ||
4919 | return subsys_virtual_register(&wq_subsys, NULL); | ||
4920 | } | ||
4921 | core_initcall(wq_sysfs_init); | ||
4922 | |||
4923 | static void wq_device_release(struct device *dev) | ||
4924 | { | ||
4925 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4926 | |||
4927 | kfree(wq_dev); | ||
4928 | } | ||
4929 | |||
4930 | /** | ||
4931 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
4932 | * @wq: the workqueue to register | ||
4933 | * | ||
4934 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
4935 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
4936 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set, | ||
4937 | * | ||
4938 | * A workqueue user should use this function directly iff it wants to apply | ||
4939 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
4940 | * apply_workqueue_attrs() may race against userland updating the | ||
4941 | * attributes. | ||
4942 | * | ||
4943 | * Return: 0 on success, -errno on failure. | ||
4944 | */ | ||
4945 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
4946 | { | ||
4947 | struct wq_device *wq_dev; | ||
4948 | int ret; | ||
4949 | |||
4950 | /* | ||
4951 | * Adjusting max_active or creating new pwqs by applying | ||
4952 | * attributes breaks the ordering guarantee. Disallow exposing ordered | ||
4953 | * workqueues. | ||
4954 | */ | ||
4955 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
4956 | return -EINVAL; | ||
4957 | |||
4958 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
4959 | if (!wq_dev) | ||
4960 | return -ENOMEM; | ||
4961 | |||
4962 | wq_dev->wq = wq; | ||
4963 | wq_dev->dev.bus = &wq_subsys; | ||
4964 | wq_dev->dev.init_name = wq->name; | ||
4965 | wq_dev->dev.release = wq_device_release; | ||
4966 | |||
4967 | /* | ||
4968 | * unbound_attrs are created separately. Suppress uevent until | ||
4969 | * everything is ready. | ||
4970 | */ | ||
4971 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
4972 | |||
4973 | ret = device_register(&wq_dev->dev); | ||
4974 | if (ret) { | ||
4975 | kfree(wq_dev); | ||
4976 | wq->wq_dev = NULL; | ||
4977 | return ret; | ||
4978 | } | ||
4979 | |||
4980 | if (wq->flags & WQ_UNBOUND) { | ||
4981 | struct device_attribute *attr; | ||
4982 | |||
4983 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
4984 | ret = device_create_file(&wq_dev->dev, attr); | ||
4985 | if (ret) { | ||
4986 | device_unregister(&wq_dev->dev); | ||
4987 | wq->wq_dev = NULL; | ||
4988 | return ret; | ||
4989 | } | ||
4990 | } | ||
4991 | } | ||
4992 | |||
4993 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
4994 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
4995 | return 0; | ||
4996 | } | ||
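Per the comment above, the direct call is only for users that must apply attributes before the workqueue becomes visible; under that assumption the sequence is roughly the following (illustrative sketch, error handling trimmed, attrs prepared by the caller):

	/* Allocate without WQ_SYSFS so nothing is exposed to userland yet. */
	wq = alloc_workqueue("mydrv_wq", WQ_UNBOUND, 0);
	if (!wq)
		return -ENOMEM;

	ret = apply_workqueue_attrs(wq, attrs);		/* cannot race a sysfs write yet */
	if (!ret)
		ret = workqueue_sysfs_register(wq);	/* now make it visible */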
4997 | |||
4998 | /** | ||
4999 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
5000 | * @wq: the workqueue to unregister | ||
5001 | * | ||
5002 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
5003 | */ | ||
5004 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
5005 | { | ||
5006 | struct wq_device *wq_dev = wq->wq_dev; | ||
5007 | |||
5008 | if (!wq->wq_dev) | ||
5009 | return; | ||
5010 | |||
5011 | wq->wq_dev = NULL; | ||
5012 | device_unregister(&wq_dev->dev); | ||
5013 | } | ||
5014 | #else /* CONFIG_SYSFS */ | ||
5015 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
5016 | #endif /* CONFIG_SYSFS */ | ||
5017 | |||
4837 | static void __init wq_numa_init(void) | 5018 | static void __init wq_numa_init(void) |
4838 | { | 5019 | { |
4839 | cpumask_var_t *tbl; | 5020 | cpumask_var_t *tbl; |