Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c               2
-rw-r--r--  kernel/events/core.c      23
-rw-r--r--  kernel/events/uprobes.c    6
-rw-r--r--  kernel/irq/manage.c        1
-rw-r--r--  kernel/kcov.c              9
-rw-r--r--  kernel/power/suspend.c     4
-rw-r--r--  kernel/printk/printk.c     4
-rw-r--r--  kernel/ptrace.c           16
-rw-r--r--  kernel/sched/core.c       16
-rw-r--r--  kernel/sched/fair.c       23
-rw-r--r--  kernel/sched/wait.c       10
-rw-r--r--  kernel/softirq.c           2
-rw-r--r--  kernel/time/alarmtimer.c   2
-rw-r--r--  kernel/time/timer.c       74
14 files changed, 124 insertions, 68 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5df20d6d1520..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -228,7 +228,7 @@ static struct {
 	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = {.name = "cpu_hotplug.lock" },
+	.dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
 #endif
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6e47e97b33f..0e292132efac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending);
+}
+
 static void perf_set_shadow_time(struct perf_event *event,
 				 struct perf_event_context *ctx,
 				 u64 tstamp)
@@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_disable = 1;
-		irq_work_queue(&event->pending);
+
+		perf_event_disable_inatomic(event);
 	}
 
 	READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -8855,7 +8861,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
 
 void perf_pmu_unregister(struct pmu *pmu)
 {
+	int remove_device;
+
 	mutex_lock(&pmus_lock);
+	remove_device = pmu_bus_running;
 	list_del_rcu(&pmu->entry);
 	mutex_unlock(&pmus_lock);
 
@@ -8869,10 +8878,12 @@ void perf_pmu_unregister(struct pmu *pmu)
 	free_percpu(pmu->pmu_disable_count);
 	if (pmu->type >= PERF_TYPE_MAX)
 		idr_remove(&pmu_idr, pmu->type);
-	if (pmu->nr_addr_filters)
-		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
-	device_del(pmu->dev);
-	put_device(pmu->dev);
+	if (remove_device) {
+		if (pmu->nr_addr_filters)
+			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+		device_del(pmu->dev);
+		put_device(pmu->dev);
+	}
 	free_pmu_context(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d4129bb05e5d..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 
 retry:
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+			&vma);
 	if (ret <= 0)
 		return ret;
 
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * but we treat this as a 'remote' access since it is
 	 * essentially a kernel access to the memory.
 	 */
-	result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+			NULL);
 	if (result < 0)
 		return result;
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..9c4d30483264 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_set_parent);
 #endif
 
 /*
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
+	 * The checks for whether we are in an interrupt are open-coded, because
+	 * 1. We can't use in_interrupt() here, since it also returns true
+	 *    when we are inside local_bh_disable() section.
+	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+	 *    since that leads to slower generated code (three separate tests,
+	 *    one for each of the flags).
 	 */
-	if (!t || in_interrupt())
+	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+							| NMI_MASK)))
 		return;
 	mode = READ_ONCE(t->kcov_mode);
 	if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)
 
 #ifndef CONFIG_SUSPEND_SKIP_SYNC
 	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	pr_info("PM: Syncing filesystems ... ");
 	sys_sync();
-	printk("done.\n");
+	pr_cont("done.\n");
 	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 #endif
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..de08fc90baaf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
 		cont_flush();
 	}
 
+	/* Skip empty continuation lines that couldn't be added - they just flush */
+	if (!text_len && (lflags & LOG_CONT))
+		return 0;
+
 	/* If it doesn't end in a newline, try to buffer the current line */
 	if (!(lflags & LOG_NEWLINE)) {
 		if (cont_add(facility, level, lflags, text, text_len))
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		int this_len, retval;
 
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-		retval = access_process_vm(tsk, src, buf, this_len, 0);
+		retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
 		if (copy_from_user(buf, src, this_len))
 			return -EFAULT;
-		retval = access_process_vm(tsk, dst, buf, this_len, 1);
+		retval = access_process_vm(tsk, dst, buf, this_len,
+				FOLL_FORCE | FOLL_WRITE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 	unsigned long tmp;
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
 	if (copied != sizeof(tmp))
 		return -EIO;
 	return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 {
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+	copied = access_process_vm(tsk, addr, &data, sizeof(data),
+			FOLL_FORCE | FOLL_WRITE);
 	return (copied == sizeof(data)) ? 0 : -EIO;
 }
 
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 	switch (request) {
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+		ret = access_process_vm(child, addr, &word, sizeof(word),
+				FOLL_FORCE);
 		if (ret != sizeof(word))
 			ret = -EIO;
 		else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_POKETEXT:
 	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		ret = access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE | FOLL_WRITE);
 		ret = (ret != sizeof(data) ? -EIO : 0);
 		break;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..42d4027f9e26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+	unsigned long val = (unsigned long)word << shift | bit;
+
+	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
+	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(bit_wait_table + i);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d4ad72f8f3c..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * will definitely be update (after enqueue).
 	 */
 	sa->period_contrib = 1023;
-	sa->load_avg = scale_load_down(se->load.weight);
+	/*
+	 * Tasks are intialized with full load to be seen as heavy tasks until
+	 * they get a chance to stabilize to their real load level.
+	 * Group entities are intialized with zero load to reflect the fact that
+	 * nothing has been attached to the task group yet.
+	 */
+	if (entity_is_task(se))
+		sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-	u64 avg_idle = this_rq()->avg_idle;
-	u64 avg_cost = this_sd->avg_scan_cost;
+	struct sched_domain *this_sd;
+	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
 	int cpu, wrap;
 
+	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+	if (!this_sd)
+		return -1;
+
+	avg_cost = this_sd->avg_scan_cost;
+
 	/*
 	 * Due to large variance we need a large fuzz factor; hackbench in
 	 * particularly is sensitive here.
@@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
 }
 EXPORT_SYMBOL(wake_up_bit);
 
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
-	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-	const struct zone *zone = page_zone(virt_to_page(word));
-	unsigned long val = (unsigned long)word << shift | bit;
-
-	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
 /*
  * Manipulate the atomic_t address to produce a better bit waitqueue table hash
  * index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1bf81ef91375..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 const char * const softirq_to_name[NR_SOFTIRQS] = {
-	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
 	"TASKLET", "SCHED", "HRTIMER", "RCU"
 };
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
 static int alarm_timer_create(struct k_itimer *new_timer)
 {
 	enum alarmtimer_type type;
-	struct alarm_base *base;
 
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 		return -EPERM;
 
 	type = clock2alarm(new_timer->it_clock);
-	base = &alarm_bases[type];
 	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
 	return 0;
 }
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d47980a1bc4..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 
 #ifdef CONFIG_NO_HZ_COMMON
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 #ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
+	unsigned long jnow = READ_ONCE(jiffies);
+
 	/*
 	 * We only forward the base when it's idle and we have a delta between
 	 * base clock and jiffies.
 	 */
-	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+	if (!base->is_idle || (long) (jnow - base->clk) < 2)
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jiffies))
-		base->clk = jiffies;
+	if (time_after(base->next_expiry, jnow))
+		base->clk = jnow;
 	else
 		base->clk = base->next_expiry;
 }
 #else
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 	return get_timer_this_cpu_base(tflags);
 }
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 static inline void forward_timer_base(struct timer_base *base) { }
 #endif
 
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-	struct timer_base *target = __get_target_base(base, tflags);
-
-	forward_timer_base(target);
-	return target;
-}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 {
 	for (;;) {
 		struct timer_base *base;
-		u32 tf = timer->flags;
+		u32 tf;
+
+		/*
+		 * We need to use READ_ONCE() here, otherwise the compiler
+		 * might re-read @tf between the check for TIMER_MIGRATING
+		 * and spin_lock().
+		 */
+		tf = READ_ONCE(timer->flags);
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	unsigned long clk = 0, flags;
 	int ret = 0;
 
+	BUG_ON(!timer->function);
+
 	/*
 	 * This is a common optimization triggered by the networking code - if
 	 * the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+
 		/*
-		 * Take the current timer_jiffies of base, but without holding
-		 * the lock!
+		 * We lock timer base and calculate the bucket index right
+		 * here. If the timer ends up in the same bucket, then we
+		 * just update the expiry time and avoid the whole
+		 * dequeue/enqueue dance.
 		 */
-		base = get_timer_base(timer->flags);
-		clk = base->clk;
+		base = lock_timer_base(timer, &flags);
 
+		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
 		/*
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 */
 		if (idx == timer_get_idx(timer)) {
 			timer->expires = expires;
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
+	} else {
+		base = lock_timer_base(timer, &flags);
 	}
 
 	timer_stats_timer_set_start_info(timer);
-	BUG_ON(!timer->function);
-
-	base = lock_timer_base(timer, &flags);
 
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	}
 
+	/* Try to forward a stale timer base clock */
+	forward_timer_base(base);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-	 * between calculating 'idx' and taking the lock, only enqueue_timer()
-	 * and trigger_dyntick_cpu() is required. Otherwise we need to
-	 * (re)calculate the wheel index via internal_add_timer().
+	 * between calculating 'idx' and possibly switching the base, only
+	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+	 * we need to (re)calculate the wheel index via
+	 * internal_add_timer().
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
-	 * We have a fresh next event. Check whether we can forward the base:
+	 * We have a fresh next event. Check whether we can forward the
+	 * base. We can only do that when @basej is past base->clk
+	 * otherwise we might rewind base->clk.
 	 */
-	if (time_after(nextevt, jiffies))
-		base->clk = jiffies;
-	else if (time_after(nextevt, base->clk))
-		base->clk = nextevt;
+	if (time_after(basej, base->clk)) {
+		if (time_after(nextevt, basej))
+			base->clk = basej;
+		else if (time_after(nextevt, base->clk))
+			base->clk = nextevt;
+	}
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;