author    Jens Axboe <axboe@kernel.dk>    2013-12-31 11:51:02 -0500
committer Jens Axboe <axboe@kernel.dk>    2013-12-31 11:51:02 -0500
commit    b28bc9b38c52f63f43e3fd875af982f2240a2859 (patch)
tree      76cdb7b52b58f5685993cc15ed81d1c903023358 /kernel
parent    8d30726912cb39c3a3ebde06214d54861f8fdde2 (diff)
parent    802eee95bde72fd0cd0f3a5b2098375a487d1eda (diff)
Merge tag 'v3.13-rc6' into for-3.14/core
Needed to bring blk-mq up to date, since changes have been going in
since for-3.14/core was established.
Fix up merge issues related to the immutable biovec changes.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
        block/blk-flush.c
        fs/btrfs/check-integrity.c
        fs/btrfs/extent_io.c
        fs/btrfs/scrub.c
        fs/logfs/dev_bdev.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore               |   1
-rw-r--r--  kernel/Makefile                 |   7
-rw-r--r--  kernel/bounds.c                 |   2
-rw-r--r--  kernel/cgroup.c                 |  85
-rw-r--r--  kernel/cpuset.c                 |   8
-rw-r--r--  kernel/events/core.c            |  29
-rw-r--r--  kernel/extable.c                |   4
-rw-r--r--  kernel/fork.c                   |   1
-rw-r--r--  kernel/freezer.c                |   6
-rw-r--r--  kernel/futex.c                  |   7
-rw-r--r--  kernel/irq/pm.c                 |   2
-rw-r--r--  kernel/kexec.c                  |   5
-rw-r--r--  kernel/padata.c                 |   9
-rw-r--r--  kernel/power/console.c          |   1
-rw-r--r--  kernel/rcu/tree_plugin.h        |   4
-rw-r--r--  kernel/reboot.c                 |   2
-rw-r--r--  kernel/sched/core.c             |  10
-rw-r--r--  kernel/sched/fair.c             | 178
-rw-r--r--  kernel/sched/rt.c               |  14
-rw-r--r--  kernel/system_certificates.S    |  14
-rw-r--r--  kernel/system_keyring.c         |   4
-rw-r--r--  kernel/time/tick-common.c       |  15
-rw-r--r--  kernel/time/tick-sched.c        |  25
-rw-r--r--  kernel/time/timekeeping.c       |   2
-rw-r--r--  kernel/timer.c                  |   5
-rw-r--r--  kernel/trace/ftrace.c           |  66
-rw-r--r--  kernel/trace/trace_event_perf.c |   8
-rw-r--r--  kernel/trace/trace_events.c     |   3
-rw-r--r--  kernel/trace/trace_syscalls.c   |  10
-rw-r--r--  kernel/user.c                   |   6
-rw-r--r--  kernel/workqueue.c              |  82
31 files changed, 375 insertions(+), 240 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
 config_data.gz
 timeconst.h
 hz.bc
+x509_certificate_list
diff --git a/kernel/Makefile b/kernel/Makefile
index bbaf7d59c1bb..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,9 +137,10 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
 ###############################################################################
 ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
 X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509
-X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
                                 $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
 
 ifeq ($(X509_CERTIFICATES),)
 $(warning *** No X.509 certificates found ***)
@@ -164,9 +165,9 @@ $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
 targets += $(obj)/.x509.list
 $(obj)/.x509.list:
         @echo $(X509_CERTIFICATES) >$@
+endif
 
 clean-files := x509_certificate_list .x509.list
-endif
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 ###############################################################################
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 5253204afdca..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,6 @@ void foo(void)
 #ifdef CONFIG_SMP
         DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
-        DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));
+        DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
         /* End of constants */
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                               bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
         struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
         INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
-        schedule_work(&cgrp->destroy_work);
+        queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -881,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 struct cgroup *cgrp = dentry->d_fsdata;
 
                 BUG_ON(!(cgroup_is_dead(cgrp)));
+
+                /*
+                 * XXX: cgrp->id is only used to look up css's.  As cgroup
+                 * and css's lifetimes will be decoupled, it should be made
+                 * per-subsystem and moved to css->id so that lookups are
+                 * successful until the target css is released.
+                 */
+                idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+                cgrp->id = -1;
+
                 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
         } else {
                 struct cfent *cfe = __d_cfe(dentry);
@@ -2421,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = {
         .read = seq_read,
         .write = cgroup_file_write,
         .llseek = seq_lseek,
-        .release = single_release,
+        .release = cgroup_file_release,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2482,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
                 ret = cft->release(inode, file);
         if (css->ss)
                 css_put(css);
+        if (file->f_op == &cgroup_seqfile_operations)
+                single_release(inode, file);
         return ret;
 }
 
@@ -4249,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
          * css_put().  dput() requires process context which we don't have.
          */
         INIT_WORK(&css->destroy_work, css_free_work_fn);
-        schedule_work(&css->destroy_work);
+        queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 static void css_release(struct percpu_ref *ref)
@@ -4257,6 +4278,7 @@ static void css_release(struct percpu_ref *ref)
         struct cgroup_subsys_state *css =
                 container_of(ref, struct cgroup_subsys_state, refcnt);
 
+        rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
         call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
@@ -4415,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
         root->number_of_cgroups++;
 
-        /* each css holds a ref to the cgroup's dentry and the parent css */
-        for_each_root_subsys(root, ss) {
-                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-                dget(dentry);
-                css_get(css->parent);
-        }
-
         /* hold a ref to the parent's dentry */
         dget(parent->dentry);
 
@@ -4434,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 if (err)
                         goto err_destroy;
 
+                /* each css holds a ref to the cgroup's dentry and parent css */
+                dget(dentry);
+                css_get(css->parent);
+
+                /* mark it consumed for error path */
+                css_ar[ss->subsys_id] = NULL;
+
                 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
                     parent->parent) {
                         pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4480,6 +4501,14 @@ err_free_cgrp:
         return err;
 
 err_destroy:
+        for_each_root_subsys(root, ss) {
+                struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
+
+                if (css) {
+                        percpu_ref_cancel_init(&css->refcnt);
+                        ss->css_free(css);
+                }
+        }
         cgroup_destroy_locked(cgrp);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4539,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
                 container_of(ref, struct cgroup_subsys_state, refcnt);
 
         INIT_WORK(&css->destroy_work, css_killed_work_fn);
-        schedule_work(&css->destroy_work);
+        queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 /**
@@ -4641,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
          * will be invoked to perform the rest of destruction once the
          * percpu refs of all css's are confirmed to be killed.
          */
-        for_each_root_subsys(cgrp->root, ss)
-                kill_css(cgroup_css(cgrp, ss));
+        for_each_root_subsys(cgrp->root, ss) {
+                struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
+                if (css)
+                        kill_css(css);
+        }
 
         /*
          * Mark @cgrp dead.  This prevents further task migration and child
@@ -4711,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
         /* delete this cgroup from parent->children */
         list_del_rcu(&cgrp->sibling);
 
-        /*
-         * We should remove the cgroup object from idr before its grace
-         * period starts, so we won't be looking up a cgroup while the
-         * cgroup is being freed.
-         */
-        idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-        cgrp->id = -1;
-
         dput(d);
 
         set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -5063,6 +5088,22 @@ out:
         return err;
 }
 
+static int __init cgroup_wq_init(void)
+{
+        /*
+         * There isn't much point in executing destruction path in
+         * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+         * Use 1 for @max_active.
+         *
+         * We would prefer to do this in cgroup_init() above, but that
+         * is called before init_workqueues(): so leave this until after.
+         */
+        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+        BUG_ON(!cgroup_destroy_wq);
+        return 0;
+}
+core_initcall(cgroup_wq_init);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
         need_loop = task_has_mempolicy(tsk) ||
                         !nodes_intersects(*newmems, tsk->mems_allowed);
 
-        if (need_loop)
+        if (need_loop) {
+                local_irq_disable();
                 write_seqcount_begin(&tsk->mems_allowed_seq);
+        }
 
         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
         mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
         mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
         tsk->mems_allowed = *newmems;
 
-        if (need_loop)
+        if (need_loop) {
                 write_seqcount_end(&tsk->mems_allowed_seq);
+                local_irq_enable();
+        }
 
         task_unlock(tsk);
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d724e7757cd1..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1396,6 +1396,8 @@ event_sched_out(struct perf_event *event,
         if (event->state != PERF_EVENT_STATE_ACTIVE)
                 return;
 
+        perf_pmu_disable(event->pmu);
+
         event->state = PERF_EVENT_STATE_INACTIVE;
         if (event->pending_disable) {
                 event->pending_disable = 0;
@@ -1412,6 +1414,8 @@ event_sched_out(struct perf_event *event,
                 ctx->nr_freq--;
         if (event->attr.exclusive || !cpuctx->active_oncpu)
                 cpuctx->exclusive = 0;
+
+        perf_pmu_enable(event->pmu);
 }
 
 static void
@@ -1652,6 +1656,7 @@ event_sched_in(struct perf_event *event,
                  struct perf_event_context *ctx)
 {
         u64 tstamp = perf_event_time(event);
+        int ret = 0;
 
         if (event->state <= PERF_EVENT_STATE_OFF)
                 return 0;
@@ -1674,10 +1679,13 @@
          */
         smp_wmb();
 
+        perf_pmu_disable(event->pmu);
+
         if (event->pmu->add(event, PERF_EF_START)) {
                 event->state = PERF_EVENT_STATE_INACTIVE;
                 event->oncpu = -1;
-                return -EAGAIN;
+                ret = -EAGAIN;
+                goto out;
         }
 
         event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1693,7 +1701,10 @@ event_sched_in(struct perf_event *event,
         if (event->attr.exclusive)
                 cpuctx->exclusive = 1;
 
-        return 0;
+out:
+        perf_pmu_enable(event->pmu);
+
+        return ret;
 }
 
 static int
@@ -2743,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                 if (!event_filter_match(event))
                         continue;
 
+                perf_pmu_disable(event->pmu);
+
                 hwc = &event->hw;
 
                 if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2752,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                 }
 
                 if (!event->attr.freq || !event->attr.sample_freq)
-                        continue;
+                        goto next;
 
                 /*
                  * stop the event and update event->count
@@ -2774,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                         perf_adjust_period(event, period, delta, false);
 
                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+        next:
+                perf_pmu_enable(event->pmu);
         }
 
         perf_pmu_enable(ctx->pmu);
@@ -5680,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event)
 {
         int cpu;
 
-        if (event->cpu != -1) {
-                swevent_hlist_put_cpu(event, event->cpu);
-                return;
-        }
-
         for_each_possible_cpu(cpu)
                 swevent_hlist_put_cpu(event, cpu);
 }
@@ -5718,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event)
         int err;
         int cpu, failed_cpu;
 
-        if (event->cpu != -1)
-                return swevent_hlist_get_cpu(event, event->cpu);
-
         get_online_cpus();
         for_each_possible_cpu(cpu) {
                 err = swevent_hlist_get_cpu(event, cpu);
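The perf_pmu_disable()/perf_pmu_enable() brackets added above nest: only the outermost pair actually touches the hardware. A rough sketch of that reference-counted pattern (a hypothetical toy_pmu type; the kernel tracks the count per PMU, per CPU):

#include <assert.h>

struct toy_pmu {
        int disable_count;
        void (*hw_disable)(struct toy_pmu *);
        void (*hw_enable)(struct toy_pmu *);
};

static void toy_pmu_disable(struct toy_pmu *pmu)
{
        if (pmu->disable_count++ == 0)
                pmu->hw_disable(pmu);   /* hardware stopped on first disable */
}

static void toy_pmu_enable(struct toy_pmu *pmu)
{
        assert(pmu->disable_count > 0);
        if (--pmu->disable_count == 0)
                pmu->hw_enable(pmu);    /* restarted at the outermost enable */
}

Because the count nests, the per-event brackets added here can sit inside the existing perf_pmu_disable(ctx->pmu) without re-enabling the PMU early.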
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 static inline int init_kernel_text(unsigned long addr)
 {
         if (addr >= (unsigned long)_sinittext &&
-            addr <= (unsigned long)_einittext)
+            addr < (unsigned long)_einittext)
                 return 1;
         return 0;
 }
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
 int core_kernel_text(unsigned long addr)
 {
         if (addr >= (unsigned long)_stext &&
-            addr <= (unsigned long)_etext)
+            addr < (unsigned long)_etext)
                 return 1;
 
         if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..5721f0e3f2da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
         spin_lock_init(&mm->page_table_lock);
         mm_init_aio(mm);
         mm_init_owner(mm, p);
+        clear_tlb_flush_pending(mm);
 
         if (likely(!mm_alloc_pgd(mm))) {
                 mm->def_flags = 0;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
 bool pm_freezing;
 bool pm_nosig_freezing;
 
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
 /* protects freezing and frozen transitions */
 static DEFINE_SPINLOCK(freezer_lock);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086f021d..f6ff0191ecf7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
                 return -EINVAL;
         address -= key->both.offset;
 
+        if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+                return -EFAULT;
+
         /*
          * PROCESS_PRIVATE futexes are fast.
          * As the mm cannot disappear under us and the 'key' only needs
@@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
          * but access_ok() should be faster than find_vma()
          */
         if (!fshared) {
-                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-                        return -EFAULT;
                 key->private.mm = mm;
                 key->private.address = address;
                 get_futex_key_refs(key);
@@ -288,7 +289,7 @@ again:
         put_page(page);
         /* serialize against __split_huge_page_splitting() */
         local_irq_disable();
-        if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+        if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
                 page_head = compound_head(page);
                 /*
                  * page_head is valid pointer but we must pin
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
                 bool is_early = desc->action &&
                         desc->action->flags & IRQF_EARLY_RESUME;
 
-                if (is_early != want_early)
+                if (!is_early && want_early)
                         continue;
 
                 raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc03627e..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
         .name  = "Crash kernel",
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
         } else
 #endif
         {
+                kexec_in_progress = true;
                 kernel_restart_prepare(NULL);
+                migrate_to_reboot_cpu();
                 printk(KERN_EMERG "Starting new kernel\n");
                 machine_shutdown();
         }
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..2abd25d79cc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 
 static int padata_cpu_hash(struct parallel_data *pd)
 {
+        unsigned int seq_nr;
         int cpu_index;
 
         /*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
          * seq_nr mod. number of cpus in use.
          */
 
-        spin_lock(&pd->seq_lock);
-        cpu_index =  pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
-        pd->seq_nr++;
-        spin_unlock(&pd->seq_lock);
+        seq_nr = atomic_inc_return(&pd->seq_nr);
+        cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
 
         return padata_index_to_cpu(pd, cpu_index);
 }
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
         padata_init_pqueues(pd);
         padata_init_squeues(pd);
         setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
-        pd->seq_nr = 0;
+        atomic_set(&pd->seq_nr, -1);
         atomic_set(&pd->reorder_objects, 0);
         atomic_set(&pd->refcnt, 0);
         pd->pinst = pinst;
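The padata change swaps a spinlock-protected counter for an atomic one: atomic_inc_return() hands every caller a unique ticket, and the modulo maps tickets round-robin onto the usable CPUs (seq_nr is seeded to -1 so the first ticket is 0). A standalone sketch of the same idea, with illustrative names:

#include <stdatomic.h>

static atomic_uint padata_seq;  /* C11 fetch_add returns the old value, so
                                 * starting at 0 matches the kernel's -1 seed
                                 * followed by atomic_inc_return() */

static int pick_round_robin_cpu(const int *cpus, unsigned ncpus)
{
        unsigned ticket = atomic_fetch_add(&padata_seq, 1);
        return cpus[ticket % ncpus];    /* spread tickets over usable CPUs */
}

The single atomic add replaces a lock/modify/unlock sequence, removing a contended spinlock from the parallel submission path.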
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
         list_for_each_entry(tmp, &pm_vt_switch_list, head) {
                 if (tmp->dev == dev) {
                         list_del(&tmp->head);
+                        kfree(tmp);
                         break;
                 }
         }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..08a765232432 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_enabled;
+extern int tick_nohz_active;
 
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
@@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu)
         int tne;
 
         /* Handle nohz enablement switches conservatively. */
-        tne = ACCESS_ONCE(tick_nohz_enabled);
+        tne = ACCESS_ONCE(tick_nohz_active);
         if (tne != rdtp->tick_nohz_enabled_snap) {
                 if (rcu_cpu_has_callbacks(cpu, NULL))
                         invoke_rcu_core(); /* force nohz to see update. */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-static void migrate_to_reboot_cpu(void)
+void migrate_to_reboot_cpu(void)
 {
         /* The boot cpu is always logical cpu 0 */
         int cpu = reboot_cpu;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
         } while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
 
 /*
  * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
         exception_exit(prev_state);
 }
 
-#endif /* CONFIG_PREEMPT */
-
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                           void *key)
 {
@@ -4762,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 cpumask_clear_cpu(rq->cpu, old_rd->span);
 
                 /*
-                 * If we dont want to free the old_rt yet then
+                 * If we dont want to free the old_rd yet then
                  * set old_rd to NULL to skip the freeing later
                  * in this function:
                  */
@@ -4903,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 static void update_top_cache_domain(int cpu)
 {
         struct sched_domain *sd;
+        struct sched_domain *busy_sd = NULL;
         int id = cpu;
         int size = 1;
 
@@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu)
         if (sd) {
                 id = cpumask_first(sched_domain_span(sd));
                 size = cpumask_weight(sched_domain_span(sd));
-                rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+                busy_sd = sd->parent; /* sd_busy */
         }
+        rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_size, cpu) = size;
@@ -5112,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                  * die on a /0 trap.
                  */
                 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+                sg->sgp->power_orig = sg->sgp->power;
 
                 /*
                  * Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
         update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST    (~0UL)
-#else
-# define WMULT_CONST    (1UL << 32)
-#endif
-
+#define WMULT_CONST     (~0U)
 #define WMULT_SHIFT     32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+        unsigned long w;
+
+        if (likely(lw->inv_weight))
+                return;
+
+        w = scale_load_down(lw->weight);
+
+        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                lw->inv_weight = 1;
+        else if (unlikely(!w))
+                lw->inv_weight = WMULT_CONST;
+        else
+                lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-                struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-        u64 tmp;
+        u64 fact = scale_load_down(weight);
+        int shift = WMULT_SHIFT;
 
-        /*
-         * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-         * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-         * 2^SCHED_LOAD_RESOLUTION.
-         */
-        if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-                tmp = (u64)delta_exec * scale_load_down(weight);
-        else
-                tmp = (u64)delta_exec;
+        __update_inv_weight(lw);
 
-        if (!lw->inv_weight) {
-                unsigned long w = scale_load_down(lw->weight);
-
-                if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                        lw->inv_weight = 1;
-                else if (unlikely(!w))
-                        lw->inv_weight = WMULT_CONST;
-                else
-                        lw->inv_weight = WMULT_CONST / w;
+        if (unlikely(fact >> 32)) {
+                while (fact >> 32) {
+                        fact >>= 1;
+                        shift--;
+                }
         }
 
-        /*
-         * Check whether we'd overflow the 64-bit multiplication:
-         */
-        if (unlikely(tmp > WMULT_CONST))
-                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                        WMULT_SHIFT/2);
-        else
-                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+        /* hint to use a 32x32->64 mul */
+        fact = (u64)(u32)fact * lw->inv_weight;
 
-        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+        while (fact >> 32) {
+                fact >>= 1;
+                shift--;
+        }
+
+        return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
         if (unlikely(se->load.weight != NICE_0_LOAD))
-                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
         return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         update_load_add(&lw, se->load.weight);
                         load = &lw;
                 }
-                slice = calc_delta_mine(slice, se->load.weight, load);
+                slice = __calc_delta(slice, se->load.weight, load);
         }
         return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-              unsigned long delta_exec)
-{
-        unsigned long delta_exec_weighted;
-
-        schedstat_set(curr->statistics.exec_max,
-                        max((u64)delta_exec, curr->statistics.exec_max));
-
-        curr->sum_exec_runtime += delta_exec;
-        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-        curr->vruntime += delta_exec_weighted;
-        update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
         struct sched_entity *curr = cfs_rq->curr;
         u64 now = rq_clock_task(rq_of(cfs_rq));
-        unsigned long delta_exec;
+        u64 delta_exec;
 
         if (unlikely(!curr))
                 return;
 
-        /*
-         * Get the amount of time the current task was running
-         * since the last time we changed load (this cannot
-         * overflow on 32 bits):
-         */
-        delta_exec = (unsigned long)(now - curr->exec_start);
-        if (!delta_exec)
+        delta_exec = now - curr->exec_start;
+        if (unlikely((s64)delta_exec <= 0))
                 return;
 
-        __update_curr(cfs_rq, curr, delta_exec);
         curr->exec_start = now;
 
+        schedstat_set(curr->statistics.exec_max,
+                      max(delta_exec, curr->statistics.exec_max));
+
+        curr->sum_exec_runtime += delta_exec;
+        schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+        curr->vruntime += calc_delta_fair(delta_exec, curr);
+        update_min_vruntime(cfs_rq);
+
         if (entity_is_task(curr)) {
                 struct task_struct *curtask = task_of(curr);
 
@@ -1752,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                         continue;
 
+                /*
+                 * Skip inaccessible VMAs to avoid any confusion between
+                 * PROT_NONE and NUMA hinting ptes
+                 */
+                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                        continue;
+
                 do {
                         start = max(start, vma->vm_start);
                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -3015,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         }
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                 return;
@@ -3574,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
         return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                   unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5379,10 +5370,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
                  */
 
                 for_each_cpu(cpu, sched_group_cpus(sdg)) {
-                        struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+                        struct sched_group_power *sgp;
+                        struct rq *rq = cpu_rq(cpu);
+
+                        /*
+                         * build_sched_domains() -> init_sched_groups_power()
+                         * gets here before we've attached the domains to the
+                         * runqueues.
+                         *
+                         * Use power_of(), which is set irrespective of domains
+                         * in update_cpu_power().
+                         *
+                         * This avoids power/power_orig from being 0 and
+                         * causing divide-by-zero issues on boot.
+                         *
+                         * Runtime updates will correct power_orig.
+                         */
+                        if (unlikely(!rq->sd)) {
+                                power_orig += power_of(cpu);
+                                power += power_of(cpu);
+                                continue;
+                        }
 
-                        power_orig += sg->sgp->power_orig;
-                        power += sg->sgp->power;
+                        sgp = rq->sd->groups->sgp;
+                        power_orig += sgp->power_orig;
+                        power += sgp->power;
                 }
         } else  {
                 /*
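The __calc_delta() rework above replaces a branchy 64-bit overflow dance with fixed-point arithmetic: delta_exec * weight / lw.weight becomes a multiply by a precomputed 32-bit inverse (~0U / weight) followed by a right shift, with every normalisation of fact into 32 bits compensated in the shift. A userspace rendering of the scheme (a sketch; a GCC/Clang __uint128_t stands in for the kernel's mul_u64_u32_shr(), and the kernel's load-scaling is omitted):

#include <stdint.h>

static uint64_t calc_delta(uint64_t delta_exec, uint64_t weight,
                           uint64_t lw_weight)
{
        uint32_t inv_weight;
        uint64_t fact = weight;
        int shift = 32;

        if (lw_weight >= ~0U)
                inv_weight = 1;         /* inverse would underflow: saturate */
        else if (lw_weight == 0)
                inv_weight = ~0U;
        else
                inv_weight = (uint32_t)(~0U / lw_weight);

        while (fact >> 32) {            /* squeeze fact into 32 bits ... */
                fact >>= 1;
                shift--;                /* ... folding the loss into the shift */
        }

        fact = (uint64_t)(uint32_t)fact * inv_weight;   /* 32x32->64 multiply */

        while (fact >> 32) {
                fact >>= 1;
                shift--;
        }

        /* equivalent of mul_u64_u32_shr(delta_exec, fact, shift) */
        return (uint64_t)(((__uint128_t)delta_exec * (uint32_t)fact) >> shift);
}

Since inv_weight approximates 2^32 / lw_weight, the final shift by 32 cancels it out, yielding delta_exec * weight / lw_weight with no 64-bit division on the hot path.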
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
         struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+        /*
+         * Change rq's cpupri only if rt_rq is the top queue.
+         */
+        if (&rq->rt != rt_rq)
+                return;
+#endif
         if (rq->online && prio < prev_prio)
                 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
         struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+        /*
+         * Change rq's cpupri only if rt_rq is the top queue.
+         */
+        if (&rq->rt != rt_rq)
+                return;
+#endif
         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
index 4aef390671cb..3e9868d47535 100644
--- a/kernel/system_certificates.S
+++ b/kernel/system_certificates.S
@@ -3,8 +3,18 @@
 
         __INITRODATA
 
+        .align 8
         .globl VMLINUX_SYMBOL(system_certificate_list)
 VMLINUX_SYMBOL(system_certificate_list):
+__cert_list_start:
         .incbin "kernel/x509_certificate_list"
-        .globl VMLINUX_SYMBOL(system_certificate_list_end)
-VMLINUX_SYMBOL(system_certificate_list_end):
+__cert_list_end:
+
+        .align 8
+        .globl VMLINUX_SYMBOL(system_certificate_list_size)
+VMLINUX_SYMBOL(system_certificate_list_size):
+#ifdef CONFIG_64BIT
+        .quad __cert_list_end - __cert_list_start
+#else
+        .long __cert_list_end - __cert_list_start
+#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 564dd93430a2..52ebc70263f4 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -22,7 +22,7 @@ struct key *system_trusted_keyring;
 EXPORT_SYMBOL_GPL(system_trusted_keyring);
 
 extern __initconst const u8 system_certificate_list[];
-extern __initconst const u8 system_certificate_list_end[];
+extern __initconst const unsigned long system_certificate_list_size;
 
 /*
  * Load the compiled-in keys
@@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void)
 
         pr_notice("Loading compiled-in X.509 certificates\n");
 
-        end = system_certificate_list_end;
         p = system_certificate_list;
+        end = p + system_certificate_list_size;
         while (p < end) {
                 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
                  * than 256 bytes in size.
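load_system_certificate_list() walks the blob by parsing each certificate's ASN.1 header rather than relying on an end-marker symbol: every cert is a DER SEQUENCE (tag 0x30) and, being larger than 256 bytes, always uses the long-form length 0x82 followed by a 16-bit big-endian size. A small sketch of that walk (an illustrative helper, not the kernel function):

#include <stddef.h>
#include <stdint.h>

/* Measure the certificate at p and return the start of the next one,
 * or NULL when the blob is exhausted or malformed. */
static const uint8_t *next_cert(const uint8_t *p, const uint8_t *end,
                                size_t *cert_len)
{
        if (end - p < 4 || p[0] != 0x30 || p[1] != 0x82)
                return NULL;                    /* not a long-form SEQUENCE */
        *cert_len = 4 + (((size_t)p[2] << 8) | p[3]);   /* header + payload */
        return p + *cert_len <= end ? p + *cert_len : NULL;
}

Each (p, *cert_len) window then gets loaded as one key, much as the while (p < end) loop above feeds each certificate to key_create_or_update().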
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..162b03ab0ad2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
  */
 ktime_t tick_next_period;
 ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ *    timekeeping lock all at once. Only the CPU which is assigned to do the
+ *    update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ *    TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ *    at it will take over and keep the time keeping alive.  The handover
+ *    procedure also covers cpu hotplug.
+ */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 
 /*
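The comment block added above describes a token-passing protocol: one CPU owns the do_timer() duty, and when it goes NOHZ-idle it drops the token so the next CPU to take a tick claims it. A simplified sketch of such a hand-off with C11 atomics (the kernel relies on tick ordering and jiffies locking rather than this exact CAS scheme):

#include <stdatomic.h>

#define DO_TIMER_NONE (-1)

static atomic_int do_timer_cpu = 0;     /* boot CPU starts as the owner */

static void do_timekeeping_update(void) { /* advance jiffies, etc. */ }

/* Called from every CPU's tick interrupt. */
static void tick_handler(int this_cpu)
{
        int owner = atomic_load(&do_timer_cpu);

        if (owner == DO_TIMER_NONE &&
            atomic_compare_exchange_strong(&do_timer_cpu, &owner, this_cpu))
                owner = this_cpu;               /* duty was dropped: claim it */

        if (owner == this_cpu)
                do_timekeeping_update();        /* only the owner touches the clock */
}

/* Called before a CPU stops its tick to go NOHZ-idle. */
static void tick_nohz_drop_duty(int this_cpu)
{
        int expected = this_cpu;
        atomic_compare_exchange_strong(&do_timer_cpu, &expected, DO_TIMER_NONE);
}

Only the owner ever writes the clock, which is what prevents the thundering herd on the timekeeping lock that the comment mentions.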
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..ea20f7d1ac2c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void) | |||
361 | /* | 361 | /* |
362 | * NO HZ enabled ? | 362 | * NO HZ enabled ? |
363 | */ | 363 | */ |
364 | int tick_nohz_enabled __read_mostly = 1; | 364 | static int tick_nohz_enabled __read_mostly = 1; |
365 | 365 | int tick_nohz_active __read_mostly; | |
366 | /* | 366 | /* |
367 | * Enable / Disable tickless mode | 367 | * Enable / Disable tickless mode |
368 | */ | 368 | */ |
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
465 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 465 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
466 | ktime_t now, idle; | 466 | ktime_t now, idle; |
467 | 467 | ||
468 | if (!tick_nohz_enabled) | 468 | if (!tick_nohz_active) |
469 | return -1; | 469 | return -1; |
470 | 470 | ||
471 | now = ktime_get(); | 471 | now = ktime_get(); |
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
506 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 506 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
507 | ktime_t now, iowait; | 507 | ktime_t now, iowait; |
508 | 508 | ||
509 | if (!tick_nohz_enabled) | 509 | if (!tick_nohz_active) |
510 | return -1; | 510 | return -1; |
511 | 511 | ||
512 | now = ktime_get(); | 512 | now = ktime_get(); |
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
711 | return false; | 711 | return false; |
712 | } | 712 | } |
713 | 713 | ||
714 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 714 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { |
715 | ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; | ||
715 | return false; | 716 | return false; |
717 | } | ||
716 | 718 | ||
717 | if (need_resched()) | 719 | if (need_resched()) |
718 | return false; | 720 | return false; |
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void) | |||
799 | local_irq_disable(); | 801 | local_irq_disable(); |
800 | 802 | ||
801 | ts = &__get_cpu_var(tick_cpu_sched); | 803 | ts = &__get_cpu_var(tick_cpu_sched); |
802 | /* | ||
803 | * set ts->inidle unconditionally. even if the system did not | ||
804 | * switch to nohz mode the cpu frequency governers rely on the | ||
805 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
806 | */ | ||
807 | ts->inidle = 1; | 804 | ts->inidle = 1; |
808 | __tick_nohz_idle_enter(ts); | 805 | __tick_nohz_idle_enter(ts); |
809 | 806 | ||
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
973 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 970 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
974 | ktime_t next; | 971 | ktime_t next; |
975 | 972 | ||
976 | if (!tick_nohz_enabled) | 973 | if (!tick_nohz_active) |
977 | return; | 974 | return; |
978 | 975 | ||
979 | local_irq_disable(); | 976 | local_irq_disable(); |
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
981 | local_irq_enable(); | 978 | local_irq_enable(); |
982 | return; | 979 | return; |
983 | } | 980 | } |
984 | 981 | tick_nohz_active = 1; | |
985 | ts->nohz_mode = NOHZ_MODE_LOWRES; | 982 | ts->nohz_mode = NOHZ_MODE_LOWRES; |
986 | 983 | ||
987 | /* | 984 | /* |
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void) | |||
1139 | } | 1136 | } |
1140 | 1137 | ||
1141 | #ifdef CONFIG_NO_HZ_COMMON | 1138 | #ifdef CONFIG_NO_HZ_COMMON |
1142 | if (tick_nohz_enabled) | 1139 | if (tick_nohz_enabled) { |
1143 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 1140 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
1141 | tick_nohz_active = 1; | ||
1142 | } | ||
1144 | #endif | 1143 | #endif |
1145 | } | 1144 | } |
1146 | #endif /* HIGH_RES_TIMERS */ | 1145 | #endif /* HIGH_RES_TIMERS */ |
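This rework splits one flag into two: tick_nohz_enabled records the user's request (boot parameter), while the new tick_nohz_active records whether NOHZ mode actually engaged. A hedged sketch of the resulting guard, with names taken from the diff but the body reduced to the essentials:

/* Sketch (not verbatim): the stats read path after the split. */
u64 get_cpu_idle_time_us_sketch(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	/*
	 * Idle accounting is only maintained once NOHZ has actually
	 * engaged, so the guard tests tick_nohz_active rather than the
	 * mere user request in tick_nohz_enabled.
	 */
	if (!tick_nohz_active)
		return -1;

	return ktime_to_us(ts->idle_sleeptime);	/* details elided */
}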
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3abf53418b67..87b4f00284c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
1347 | tk->xtime_nsec -= remainder; | 1347 | tk->xtime_nsec -= remainder; |
1348 | tk->xtime_nsec += 1ULL << tk->shift; | 1348 | tk->xtime_nsec += 1ULL << tk->shift; |
1349 | tk->ntp_error += remainder << tk->ntp_error_shift; | 1349 | tk->ntp_error += remainder << tk->ntp_error_shift; |
1350 | 1350 | tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; | |
1351 | } | 1351 | } |
1352 | #else | 1352 | #else |
1353 | #define old_vsyscall_fixup(tk) | 1353 | #define old_vsyscall_fixup(tk) |
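The one-line addition balances the books for the NTP error term: rounding xtime_nsec up to the next whole shifted nanosecond must not be mistaken for clock drift. A sketch of the complete fixup, with field names as in struct timekeeper and the net effect spelled out in the final comment:

static inline void old_vsyscall_fixup_sketch(struct timekeeper *tk)
{
	u64 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);

	/* Round xtime_nsec up to the next multiple of 2^shift ... */
	tk->xtime_nsec -= remainder;
	tk->xtime_nsec += 1ULL << tk->shift;

	/* ... and cancel the rounding out of the NTP error accumulator. */
	tk->ntp_error += remainder << tk->ntp_error_shift;
	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
	/* net: ntp_error += (remainder - 2^shift) << ntp_error_shift */
}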
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82fa966..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) | |||
1518 | /* | 1518 | /* |
1519 | * The APs use this path later in boot | 1519 | * The APs use this path later in boot |
1520 | */ | 1520 | */ |
1521 | base = kmalloc_node(sizeof(*base), | 1521 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, |
1522 | GFP_KERNEL | __GFP_ZERO, | 1522 | cpu_to_node(cpu)); |
1523 | cpu_to_node(cpu)); | ||
1524 | if (!base) | 1523 | if (!base) |
1525 | return -ENOMEM; | 1524 | return -ENOMEM; |
1526 | 1525 | ||
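A pure readability cleanup; the two spellings below allocate the same zeroed, node-local memory:

/* Before: open-coded zeroing flag. */
base = kmalloc_node(sizeof(*base), GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));

/* After: kzalloc_node() implies __GFP_ZERO. */
base = kzalloc_node(sizeof(*base), GFP_KERNEL, cpu_to_node(cpu));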
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..72a0f81dc5a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
367 | 367 | ||
368 | static int __register_ftrace_function(struct ftrace_ops *ops) | 368 | static int __register_ftrace_function(struct ftrace_ops *ops) |
369 | { | 369 | { |
370 | if (unlikely(ftrace_disabled)) | ||
371 | return -ENODEV; | ||
372 | |||
373 | if (FTRACE_WARN_ON(ops == &global_ops)) | 370 | if (FTRACE_WARN_ON(ops == &global_ops)) |
374 | return -EINVAL; | 371 | return -EINVAL; |
375 | 372 | ||
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
428 | { | 425 | { |
429 | int ret; | 426 | int ret; |
430 | 427 | ||
431 | if (ftrace_disabled) | ||
432 | return -ENODEV; | ||
433 | |||
434 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) | 428 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
435 | return -EBUSY; | 429 | return -EBUSY; |
436 | 430 | ||
@@ -781,7 +775,7 @@ static int ftrace_profile_init(void) | |||
781 | int cpu; | 775 | int cpu; |
782 | int ret = 0; | 776 | int ret = 0; |
783 | 777 | ||
784 | for_each_online_cpu(cpu) { | 778 | for_each_possible_cpu(cpu) { |
785 | ret = ftrace_profile_init_cpu(cpu); | 779 | ret = ftrace_profile_init_cpu(cpu); |
786 | if (ret) | 780 | if (ret) |
787 | break; | 781 | break; |
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command) | |||
2088 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2082 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
2089 | { | 2083 | { |
2090 | bool hash_enable = true; | 2084 | bool hash_enable = true; |
2085 | int ret; | ||
2091 | 2086 | ||
2092 | if (unlikely(ftrace_disabled)) | 2087 | if (unlikely(ftrace_disabled)) |
2093 | return -ENODEV; | 2088 | return -ENODEV; |
2094 | 2089 | ||
2090 | ret = __register_ftrace_function(ops); | ||
2091 | if (ret) | ||
2092 | return ret; | ||
2093 | |||
2095 | ftrace_start_up++; | 2094 | ftrace_start_up++; |
2096 | command |= FTRACE_UPDATE_CALLS; | 2095 | command |= FTRACE_UPDATE_CALLS; |
2097 | 2096 | ||
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
2113 | return 0; | 2112 | return 0; |
2114 | } | 2113 | } |
2115 | 2114 | ||
2116 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) | 2115 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) |
2117 | { | 2116 | { |
2118 | bool hash_disable = true; | 2117 | bool hash_disable = true; |
2118 | int ret; | ||
2119 | 2119 | ||
2120 | if (unlikely(ftrace_disabled)) | 2120 | if (unlikely(ftrace_disabled)) |
2121 | return; | 2121 | return -ENODEV; |
2122 | |||
2123 | ret = __unregister_ftrace_function(ops); | ||
2124 | if (ret) | ||
2125 | return ret; | ||
2122 | 2126 | ||
2123 | ftrace_start_up--; | 2127 | ftrace_start_up--; |
2124 | /* | 2128 | /* |
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2153 | } | 2157 | } |
2154 | 2158 | ||
2155 | if (!command || !ftrace_enabled) | 2159 | if (!command || !ftrace_enabled) |
2156 | return; | 2160 | return 0; |
2157 | 2161 | ||
2158 | ftrace_run_update_code(command); | 2162 | ftrace_run_update_code(command); |
2163 | return 0; | ||
2159 | } | 2164 | } |
2160 | 2165 | ||
2161 | static void ftrace_startup_sysctl(void) | 2166 | static void ftrace_startup_sysctl(void) |
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void) | |||
3060 | if (i == FTRACE_FUNC_HASHSIZE) | 3065 | if (i == FTRACE_FUNC_HASHSIZE) |
3061 | return; | 3066 | return; |
3062 | 3067 | ||
3063 | ret = __register_ftrace_function(&trace_probe_ops); | 3068 | ret = ftrace_startup(&trace_probe_ops, 0); |
3064 | if (!ret) | ||
3065 | ret = ftrace_startup(&trace_probe_ops, 0); | ||
3066 | 3069 | ||
3067 | ftrace_probe_registered = 1; | 3070 | ftrace_probe_registered = 1; |
3068 | } | 3071 | } |
3069 | 3072 | ||
3070 | static void __disable_ftrace_function_probe(void) | 3073 | static void __disable_ftrace_function_probe(void) |
3071 | { | 3074 | { |
3072 | int ret; | ||
3073 | int i; | 3075 | int i; |
3074 | 3076 | ||
3075 | if (!ftrace_probe_registered) | 3077 | if (!ftrace_probe_registered) |
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void) | |||
3082 | } | 3084 | } |
3083 | 3085 | ||
3084 | /* no more funcs left */ | 3086 | /* no more funcs left */ |
3085 | ret = __unregister_ftrace_function(&trace_probe_ops); | 3087 | ftrace_shutdown(&trace_probe_ops, 0); |
3086 | if (!ret) | ||
3087 | ftrace_shutdown(&trace_probe_ops, 0); | ||
3088 | 3088 | ||
3089 | ftrace_probe_registered = 0; | 3089 | ftrace_probe_registered = 0; |
3090 | } | 3090 | } |
@@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init); | |||
4366 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4366 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
4367 | static inline void ftrace_startup_enable(int command) { } | 4367 | static inline void ftrace_startup_enable(int command) { } |
4368 | /* Keep as macros so we do not need to define the commands */ | 4368 | /* Keep as macros so we do not need to define the commands */ |
4369 | # define ftrace_startup(ops, command) \ | 4369 | # define ftrace_startup(ops, command) \ |
4370 | ({ \ | 4370 | ({ \ |
4371 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | 4371 | int ___ret = __register_ftrace_function(ops); \ |
4372 | 0; \ | 4372 | if (!___ret) \ |
4373 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | ||
4374 | ___ret; \ | ||
4373 | }) | 4375 | }) |
4374 | # define ftrace_shutdown(ops, command) do { } while (0) | 4376 | # define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) |
4377 | |||
4375 | # define ftrace_startup_sysctl() do { } while (0) | 4378 | # define ftrace_startup_sysctl() do { } while (0) |
4376 | # define ftrace_shutdown_sysctl() do { } while (0) | 4379 | # define ftrace_shutdown_sysctl() do { } while (0) |
4377 | 4380 | ||
@@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops) | |||
4780 | 4783 | ||
4781 | mutex_lock(&ftrace_lock); | 4784 | mutex_lock(&ftrace_lock); |
4782 | 4785 | ||
4783 | ret = __register_ftrace_function(ops); | 4786 | ret = ftrace_startup(ops, 0); |
4784 | if (!ret) | ||
4785 | ret = ftrace_startup(ops, 0); | ||
4786 | 4787 | ||
4787 | mutex_unlock(&ftrace_lock); | 4788 | mutex_unlock(&ftrace_lock); |
4788 | 4789 | ||
@@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
4801 | int ret; | 4802 | int ret; |
4802 | 4803 | ||
4803 | mutex_lock(&ftrace_lock); | 4804 | mutex_lock(&ftrace_lock); |
4804 | ret = __unregister_ftrace_function(ops); | 4805 | ret = ftrace_shutdown(ops, 0); |
4805 | if (!ret) | ||
4806 | ftrace_shutdown(ops, 0); | ||
4807 | mutex_unlock(&ftrace_lock); | 4806 | mutex_unlock(&ftrace_lock); |
4808 | 4807 | ||
4809 | return ret; | 4808 | return ret; |
@@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, | |||
4997 | return NOTIFY_DONE; | 4996 | return NOTIFY_DONE; |
4998 | } | 4997 | } |
4999 | 4998 | ||
4999 | /* Just a placeholder for function graph */ | ||
5000 | static struct ftrace_ops fgraph_ops __read_mostly = { | ||
5001 | .func = ftrace_stub, | ||
5002 | .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | | ||
5003 | FTRACE_OPS_FL_RECURSION_SAFE, | ||
5004 | }; | ||
5005 | |||
5000 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, | 5006 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, |
5001 | trace_func_graph_ent_t entryfunc) | 5007 | trace_func_graph_ent_t entryfunc) |
5002 | { | 5008 | { |
@@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
5023 | ftrace_graph_return = retfunc; | 5029 | ftrace_graph_return = retfunc; |
5024 | ftrace_graph_entry = entryfunc; | 5030 | ftrace_graph_entry = entryfunc; |
5025 | 5031 | ||
5026 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | 5032 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); |
5027 | 5033 | ||
5028 | out: | 5034 | out: |
5029 | mutex_unlock(&ftrace_lock); | 5035 | mutex_unlock(&ftrace_lock); |
@@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void) | |||
5040 | ftrace_graph_active--; | 5046 | ftrace_graph_active--; |
5041 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5047 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
5042 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5048 | ftrace_graph_entry = ftrace_graph_entry_stub; |
5043 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); | 5049 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); |
5044 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5050 | unregister_pm_notifier(&ftrace_suspend_notifier); |
5045 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5051 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
5046 | 5052 | ||
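The net effect of the ftrace.c changes is a calling-convention cleanup: ftrace_startup() now registers the ops itself, and ftrace_shutdown() unregisters it (returning int so failures propagate). Callers shrink from the old two-step dance to a single call, as the diff to register_ftrace_function() above shows:

int register_ftrace_function(struct ftrace_ops *ops)
{
	int ret;

	mutex_lock(&ftrace_lock);

	/*
	 * Old pattern:
	 *	ret = __register_ftrace_function(ops);
	 *	if (!ret)
	 *		ret = ftrace_startup(ops, 0);
	 *
	 * New pattern -- registration folded into startup:
	 */
	ret = ftrace_startup(ops, 0);

	mutex_unlock(&ftrace_lock);
	return ret;
}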
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3b52ac..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,12 @@ static int total_ref_count; | |||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
26 | { | 26 | { |
27 | if (tp_event->perf_perm) { | ||
28 | int ret = tp_event->perf_perm(tp_event, p_event); | ||
29 | if (ret) | ||
30 | return ret; | ||
31 | } | ||
32 | |||
27 | /* The ftrace function trace is allowed only for root. */ | 33 | /* The ftrace function trace is allowed only for root. */ |
28 | if (ftrace_event_is_function(tp_event) && | 34 | if (ftrace_event_is_function(tp_event) && |
29 | perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | 35 | perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) |
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
173 | int perf_trace_init(struct perf_event *p_event) | 179 | int perf_trace_init(struct perf_event *p_event) |
174 | { | 180 | { |
175 | struct ftrace_event_call *tp_event; | 181 | struct ftrace_event_call *tp_event; |
176 | int event_id = p_event->attr.config; | 182 | u64 event_id = p_event->attr.config; |
177 | int ret = -EINVAL; | 183 | int ret = -EINVAL; |
178 | 184 | ||
179 | mutex_lock(&event_mutex); | 185 | mutex_lock(&event_mutex); |
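The new perf_perm hook lets an individual event veto perf access before the generic paranoia checks run. A hypothetical callback (the name and policy are invented for illustration; only the signature comes from the diff):

/* Hypothetical: restrict one event to CAP_SYS_ADMIN unconditionally. */
static int my_event_perf_perm(struct ftrace_event_call *call,
			      struct perf_event *p_event)
{
	return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/* Wired up at event definition time (illustrative):
 *	my_event_call.perf_perm = my_event_perf_perm;
 */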
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f919a2e21bf3..a11800ae96de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2314 | /* Disable any running events */ | 2314 | /* Disable any running events */ |
2315 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); | 2315 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); |
2316 | 2316 | ||
2317 | /* Accesses to events are within rcu_read_lock_sched() */ | ||
2318 | synchronize_sched(); | ||
2319 | |||
2317 | down_write(&trace_event_sem); | 2320 | down_write(&trace_event_sem); |
2318 | __trace_remove_event_dirs(tr); | 2321 | __trace_remove_event_dirs(tr); |
2319 | debugfs_remove_recursive(tr->event_dir); | 2322 | debugfs_remove_recursive(tr->event_dir); |
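The added synchronize_sched() enforces the classic RCU teardown order: stop publishing, wait out a grace period, then free. A sketch of that ordering, assuming readers use rcu_read_lock_sched() as the new comment states:

/* Writer side (teardown); the order of these steps is the point. */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); /* 1. disable  */
synchronize_sched();	/* 2. wait for in-flight sched-RCU readers      */
__trace_remove_event_dirs(tr);	/* 3. now safe to tear down            */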
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e4b6d11bdf78..ea90eb5f6f17 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, | |||
431 | if (!tr->sys_refcount_enter) | 431 | if (!tr->sys_refcount_enter) |
432 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); | 432 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); |
433 | mutex_unlock(&syscall_trace_lock); | 433 | mutex_unlock(&syscall_trace_lock); |
434 | /* | ||
435 | * Callers expect the event to be completely disabled on | ||
436 | * return, so wait for current handlers to finish. | ||
437 | */ | ||
438 | synchronize_sched(); | ||
439 | } | 434 | } |
440 | 435 | ||
441 | static int reg_event_syscall_exit(struct ftrace_event_file *file, | 436 | static int reg_event_syscall_exit(struct ftrace_event_file *file, |
@@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
474 | if (!tr->sys_refcount_exit) | 469 | if (!tr->sys_refcount_exit) |
475 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); | 470 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); |
476 | mutex_unlock(&syscall_trace_lock); | 471 | mutex_unlock(&syscall_trace_lock); |
477 | /* | ||
478 | * Callers expect the event to be completely disabled on | ||
479 | * return, so wait for current handlers to finish. | ||
480 | */ | ||
481 | synchronize_sched(); | ||
482 | } | 472 | } |
483 | 473 | ||
484 | static int __init init_syscall_trace(struct ftrace_event_call *call) | 474 | static int __init init_syscall_trace(struct ftrace_event_call *call) |
diff --git a/kernel/user.c b/kernel/user.c
index a3a0dbfda329..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,9 +51,9 @@ struct user_namespace init_user_ns = { | |||
51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
54 | #ifdef CONFIG_KEYS_KERBEROS_CACHE | 54 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
55 | .krb_cache_register_sem = | 55 | .persistent_keyring_register_sem = |
56 | __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), | 56 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), |
57 | #endif | 57 | #endif |
58 | }; | 58 | }; |
59 | EXPORT_SYMBOL_GPL(init_user_ns); | 59 | EXPORT_SYMBOL_GPL(init_user_ns); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..b010eac595d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); | |||
305 | /* I: attributes used when instantiating standard unbound pools on demand */ | 305 | /* I: attributes used when instantiating standard unbound pools on demand */ |
306 | static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; | 306 | static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; |
307 | 307 | ||
308 | /* I: attributes used when instantiating ordered pools on demand */ | ||
309 | static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; | ||
310 | |||
308 | struct workqueue_struct *system_wq __read_mostly; | 311 | struct workqueue_struct *system_wq __read_mostly; |
309 | EXPORT_SYMBOL(system_wq); | 312 | EXPORT_SYMBOL(system_wq); |
310 | struct workqueue_struct *system_highpri_wq __read_mostly; | 313 | struct workqueue_struct *system_highpri_wq __read_mostly; |
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } | |||
518 | static inline void debug_work_deactivate(struct work_struct *work) { } | 521 | static inline void debug_work_deactivate(struct work_struct *work) { } |
519 | #endif | 522 | #endif |
520 | 523 | ||
521 | /* allocate ID and assign it to @pool */ | 524 | /** |
525 | * worker_pool_assign_id - allocate ID and assign it to @pool | ||
526 | * @pool: the pool pointer of interest | ||
527 | * | ||
528 | * Returns 0 if an ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned | ||
529 | * successfully, -errno on failure. | ||
530 | */ | ||
522 | static int worker_pool_assign_id(struct worker_pool *pool) | 531 | static int worker_pool_assign_id(struct worker_pool *pool) |
523 | { | 532 | { |
524 | int ret; | 533 | int ret; |
525 | 534 | ||
526 | lockdep_assert_held(&wq_pool_mutex); | 535 | lockdep_assert_held(&wq_pool_mutex); |
527 | 536 | ||
528 | ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); | 537 | ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, |
538 | GFP_KERNEL); | ||
529 | if (ret >= 0) { | 539 | if (ret >= 0) { |
530 | pool->id = ret; | 540 | pool->id = ret; |
531 | return 0; | 541 | return 0; |
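idr_alloc() treats its end argument as an exclusive upper bound (when positive), so passing WORK_OFFQ_POOL_NONE guarantees at allocation time that every pool ID fits in the OFFQ pool-ID bit field; that is what makes the BUILD_BUG_ON removed from init_workqueues() below redundant. The call shape:

/* Allocate an ID in [0, WORK_OFFQ_POOL_NONE); fail rather than overflow. */
ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL);
if (ret >= 0)
	pool->id = ret;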
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, | |||
1320 | 1330 | ||
1321 | debug_work_activate(work); | 1331 | debug_work_activate(work); |
1322 | 1332 | ||
1323 | /* if dying, only works from the same workqueue are allowed */ | 1333 | /* if draining, only works from the same workqueue are allowed */ |
1324 | if (unlikely(wq->flags & __WQ_DRAINING) && | 1334 | if (unlikely(wq->flags & __WQ_DRAINING) && |
1325 | WARN_ON_ONCE(!is_chained_work(wq))) | 1335 | WARN_ON_ONCE(!is_chained_work(wq))) |
1326 | return; | 1336 | return; |
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1736 | if (IS_ERR(worker->task)) | 1746 | if (IS_ERR(worker->task)) |
1737 | goto fail; | 1747 | goto fail; |
1738 | 1748 | ||
1749 | set_user_nice(worker->task, pool->attrs->nice); | ||
1750 | |||
1751 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1752 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1753 | |||
1739 | /* | 1754 | /* |
1740 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | 1755 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any |
1741 | * online CPUs. It'll be re-applied when any of the CPUs come up. | 1756 | * online CPUs. It'll be re-applied when any of the CPUs come up. |
1742 | */ | 1757 | */ |
1743 | set_user_nice(worker->task, pool->attrs->nice); | ||
1744 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | 1758 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); |
1745 | 1759 | ||
1746 | /* prevent userland from meddling with cpumask of workqueue workers */ | ||
1747 | worker->task->flags |= PF_NO_SETAFFINITY; | ||
1748 | |||
1749 | /* | 1760 | /* |
1750 | * The caller is responsible for ensuring %POOL_DISASSOCIATED | 1761 | * The caller is responsible for ensuring %POOL_DISASSOCIATED |
1751 | * remains stable across this function. See the comments above the | 1762 | * remains stable across this function. See the comments above the |
@@ -2840,19 +2851,6 @@ already_gone: | |||
2840 | return false; | 2851 | return false; |
2841 | } | 2852 | } |
2842 | 2853 | ||
2843 | static bool __flush_work(struct work_struct *work) | ||
2844 | { | ||
2845 | struct wq_barrier barr; | ||
2846 | |||
2847 | if (start_flush_work(work, &barr)) { | ||
2848 | wait_for_completion(&barr.done); | ||
2849 | destroy_work_on_stack(&barr.work); | ||
2850 | return true; | ||
2851 | } else { | ||
2852 | return false; | ||
2853 | } | ||
2854 | } | ||
2855 | |||
2856 | /** | 2854 | /** |
2857 | * flush_work - wait for a work to finish executing the last queueing instance | 2855 | * flush_work - wait for a work to finish executing the last queueing instance |
2858 | * @work: the work to flush | 2856 | * @work: the work to flush |
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work) | |||
2866 | */ | 2864 | */ |
2867 | bool flush_work(struct work_struct *work) | 2865 | bool flush_work(struct work_struct *work) |
2868 | { | 2866 | { |
2867 | struct wq_barrier barr; | ||
2868 | |||
2869 | lock_map_acquire(&work->lockdep_map); | 2869 | lock_map_acquire(&work->lockdep_map); |
2870 | lock_map_release(&work->lockdep_map); | 2870 | lock_map_release(&work->lockdep_map); |
2871 | 2871 | ||
2872 | return __flush_work(work); | 2872 | if (start_flush_work(work, &barr)) { |
2873 | wait_for_completion(&barr.done); | ||
2874 | destroy_work_on_stack(&barr.work); | ||
2875 | return true; | ||
2876 | } else { | ||
2877 | return false; | ||
2878 | } | ||
2873 | } | 2879 | } |
2874 | EXPORT_SYMBOL_GPL(flush_work); | 2880 | EXPORT_SYMBOL_GPL(flush_work); |
2875 | 2881 | ||
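With __flush_work() gone, flush_work() is the single entry point again, and its return value says whether there was anything to wait for. A usage sketch:

/* Sketch: synchronize against the last queued instance of a work item. */
static void example_quiesce(struct work_struct *work)
{
	if (flush_work(work))
		pr_debug("waited for a pending/running instance\n");
	else
		pr_debug("work was already idle\n");
}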
@@ -4106,7 +4112,7 @@ out_unlock: | |||
4106 | static int alloc_and_link_pwqs(struct workqueue_struct *wq) | 4112 | static int alloc_and_link_pwqs(struct workqueue_struct *wq) |
4107 | { | 4113 | { |
4108 | bool highpri = wq->flags & WQ_HIGHPRI; | 4114 | bool highpri = wq->flags & WQ_HIGHPRI; |
4109 | int cpu; | 4115 | int cpu, ret; |
4110 | 4116 | ||
4111 | if (!(wq->flags & WQ_UNBOUND)) { | 4117 | if (!(wq->flags & WQ_UNBOUND)) { |
4112 | wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); | 4118 | wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); |
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) | |||
4126 | mutex_unlock(&wq->mutex); | 4132 | mutex_unlock(&wq->mutex); |
4127 | } | 4133 | } |
4128 | return 0; | 4134 | return 0; |
4135 | } else if (wq->flags & __WQ_ORDERED) { | ||
4136 | ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); | ||
4137 | /* there should be only a single pwq for the ordering guarantee */ | ||
4138 | WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || | ||
4139 | wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), | ||
4140 | "ordering guarantee broken for workqueue %s\n", wq->name); | ||
4141 | return ret; | ||
4129 | } else { | 4142 | } else { |
4130 | return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); | 4143 | return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); |
4131 | } | 4144 | } |
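With this branch in place, ordered workqueues get their pool_workqueues from the new ordered_wq_attrs instead of the unbound defaults. From the caller's point of view nothing changes; an ordered queue is still created the usual way (the queue name below is illustrative):

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	/* One work item executes at a time, in queueing order. */
	example_wq = alloc_ordered_workqueue("example_ordered", 0);
	return example_wq ? 0 : -ENOMEM;
}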
@@ -4814,14 +4827,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | |||
4814 | 4827 | ||
4815 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); | 4828 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); |
4816 | schedule_work_on(cpu, &wfc.work); | 4829 | schedule_work_on(cpu, &wfc.work); |
4817 | 4830 | flush_work(&wfc.work); | |
4818 | /* | ||
4819 | * The work item is on-stack and can't lead to deadlock through | ||
4820 | * flushing. Use __flush_work() to avoid spurious lockdep warnings | ||
4821 | * when work_on_cpu()s are nested. | ||
4822 | */ | ||
4823 | __flush_work(&wfc.work); | ||
4824 | |||
4825 | return wfc.ret; | 4831 | return wfc.ret; |
4826 | } | 4832 | } |
4827 | EXPORT_SYMBOL_GPL(work_on_cpu); | 4833 | EXPORT_SYMBOL_GPL(work_on_cpu); |
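The on-stack work item in work_on_cpu() can now be flushed with plain flush_work(), since the lockdep false positive that motivated __flush_work() is gone. Typical usage is unchanged; a sketch with a made-up payload:

/* work_on_cpu() runs fn(arg) synchronously on @cpu and returns its result. */
static long which_cpu_fn(void *arg)
{
	return smp_processor_id();	/* illustrative payload */
}

static long run_on_cpu1(void)
{
	return work_on_cpu(1, which_cpu_fn, NULL);	/* blocks until done */
}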
@@ -5009,10 +5015,6 @@ static int __init init_workqueues(void) | |||
5009 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5015 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
5010 | int i, cpu; | 5016 | int i, cpu; |
5011 | 5017 | ||
5012 | /* make sure we have enough bits for OFFQ pool ID */ | ||
5013 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < | ||
5014 | WORK_CPU_END * NR_STD_WORKER_POOLS); | ||
5015 | |||
5016 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5018 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5017 | 5019 | ||
5018 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5020 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
@@ -5051,13 +5053,23 @@ static int __init init_workqueues(void) | |||
5051 | } | 5053 | } |
5052 | } | 5054 | } |
5053 | 5055 | ||
5054 | /* create default unbound wq attrs */ | 5056 | /* create default unbound and ordered wq attrs */ |
5055 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { | 5057 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { |
5056 | struct workqueue_attrs *attrs; | 5058 | struct workqueue_attrs *attrs; |
5057 | 5059 | ||
5058 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | 5060 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); |
5059 | attrs->nice = std_nice[i]; | 5061 | attrs->nice = std_nice[i]; |
5060 | unbound_std_wq_attrs[i] = attrs; | 5062 | unbound_std_wq_attrs[i] = attrs; |
5063 | |||
5064 | /* | ||
5065 | * An ordered wq should have only one pwq as ordering is | ||
5066 | * guaranteed by max_active, which is enforced by pwqs. | ||
5067 | * Turn off NUMA so that dfl_pwq is used for all nodes. | ||
5068 | */ | ||
5069 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | ||
5070 | attrs->nice = std_nice[i]; | ||
5071 | attrs->no_numa = true; | ||
5072 | ordered_wq_attrs[i] = attrs; | ||
5061 | } | 5073 | } |
5062 | 5074 | ||
5063 | system_wq = alloc_workqueue("events", 0, 0); | 5075 | system_wq = alloc_workqueue("events", 0, 0); |
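This is the companion to the alloc_and_link_pwqs() change above: ordered attrs are the unbound defaults with NUMA turned off, because per-node pwqs would let work items on different nodes run concurrently and break the max_active=1 ordering guarantee. A sketch of the invariant an ordered queue must keep (assuming the internal pwqs list of struct workqueue_struct):

/* Sketch: an ordered wq must own exactly one pool_workqueue. */
static void assert_single_pwq(struct workqueue_struct *wq)
{
	WARN(!list_is_singular(&wq->pwqs),
	     "ordering guarantee broken for workqueue %s\n", wq->name);
}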