author		Jens Axboe <axboe@kernel.dk>	2013-12-31 11:51:02 -0500
committer	Jens Axboe <axboe@kernel.dk>	2013-12-31 11:51:02 -0500
commit		b28bc9b38c52f63f43e3fd875af982f2240a2859 (patch)
tree		76cdb7b52b58f5685993cc15ed81d1c903023358 /kernel
parent		8d30726912cb39c3a3ebde06214d54861f8fdde2 (diff)
parent		802eee95bde72fd0cd0f3a5b2098375a487d1eda (diff)
Merge tag 'v3.13-rc6' into for-3.14/core

Needed to bring blk-mq up to date, since changes have been going in
since for-3.14/core was established.

Fixup merge issues related to the immutable biovec changes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
	block/blk-flush.c
	fs/btrfs/check-integrity.c
	fs/btrfs/extent_io.c
	fs/btrfs/scrub.c
	fs/logfs/dev_bdev.c
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/.gitignore		|   1
-rw-r--r--	kernel/Makefile			|   7
-rw-r--r--	kernel/bounds.c			|   2
-rw-r--r--	kernel/cgroup.c			|  85
-rw-r--r--	kernel/cpuset.c			|   8
-rw-r--r--	kernel/events/core.c		|  29
-rw-r--r--	kernel/extable.c		|   4
-rw-r--r--	kernel/fork.c			|   1
-rw-r--r--	kernel/freezer.c		|   6
-rw-r--r--	kernel/futex.c			|   7
-rw-r--r--	kernel/irq/pm.c			|   2
-rw-r--r--	kernel/kexec.c			|   5
-rw-r--r--	kernel/padata.c			|   9
-rw-r--r--	kernel/power/console.c		|   1
-rw-r--r--	kernel/rcu/tree_plugin.h	|   4
-rw-r--r--	kernel/reboot.c			|   2
-rw-r--r--	kernel/sched/core.c		|  10
-rw-r--r--	kernel/sched/fair.c		| 178
-rw-r--r--	kernel/sched/rt.c		|  14
-rw-r--r--	kernel/system_certificates.S	|  14
-rw-r--r--	kernel/system_keyring.c		|   4
-rw-r--r--	kernel/time/tick-common.c	|  15
-rw-r--r--	kernel/time/tick-sched.c	|  25
-rw-r--r--	kernel/time/timekeeping.c	|   2
-rw-r--r--	kernel/timer.c			|   5
-rw-r--r--	kernel/trace/ftrace.c		|  66
-rw-r--r--	kernel/trace/trace_event_perf.c	|   8
-rw-r--r--	kernel/trace/trace_events.c	|   3
-rw-r--r--	kernel/trace/trace_syscalls.c	|  10
-rw-r--r--	kernel/user.c			|   6
-rw-r--r--	kernel/workqueue.c		|  82
31 files changed, 375 insertions(+), 240 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..790d83c7d160 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,3 +5,4 @@ config_data.h
 config_data.gz
 timeconst.h
 hz.bc
+x509_certificate_list
diff --git a/kernel/Makefile b/kernel/Makefile
index bbaf7d59c1bb..bc010ee272b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,9 +137,10 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
 ###############################################################################
 ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
 X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509
-X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
 				$(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
 
 ifeq ($(X509_CERTIFICATES),)
 $(warning *** No X.509 certificates found ***)
@@ -164,9 +165,9 @@ $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
 targets += $(obj)/.x509.list
 $(obj)/.x509.list:
 	@echo $(X509_CERTIFICATES) >$@
+endif
 
 clean-files := x509_certificate_list .x509.list
-endif
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 ###############################################################################
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 5253204afdca..9fd4246b04b8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,6 @@ void foo(void)
 #ifdef CONFIG_SMP
 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
-	DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));
+	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 	/* End of constants */
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..bc1dcabe9217 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
 	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
-	schedule_work(&cgrp->destroy_work);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -881,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		struct cgroup *cgrp = dentry->d_fsdata;
 
 		BUG_ON(!(cgroup_is_dead(cgrp)));
+
+		/*
+		 * XXX: cgrp->id is only used to look up css's.  As cgroup
+		 * and css's lifetimes will be decoupled, it should be made
+		 * per-subsystem and moved to css->id so that lookups are
+		 * successful until the target css is released.
+		 */
+		idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+		cgrp->id = -1;
+
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -2421,7 +2440,7 @@ static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = cgroup_file_release,
 };
 
 static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2482,6 +2501,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 		ret = cft->release(inode, file);
 	if (css->ss)
 		css_put(css);
+	if (file->f_op == &cgroup_seqfile_operations)
+		single_release(inode, file);
 	return ret;
 }
 
@@ -4249,7 +4270,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
 	 * css_put().  dput() requires process context which we don't have.
 	 */
 	INIT_WORK(&css->destroy_work, css_free_work_fn);
-	schedule_work(&css->destroy_work);
+	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 static void css_release(struct percpu_ref *ref)
@@ -4257,6 +4278,7 @@ static void css_release(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
+	rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
@@ -4415,14 +4437,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
 
-	/* each css holds a ref to the cgroup's dentry and the parent css */
-	for_each_root_subsys(root, ss) {
-		struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
-		dget(dentry);
-		css_get(css->parent);
-	}
-
 	/* hold a ref to the parent's dentry */
 	dget(parent->dentry);
 
@@ -4434,6 +4448,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		if (err)
 			goto err_destroy;
 
+		/* each css holds a ref to the cgroup's dentry and parent css */
+		dget(dentry);
+		css_get(css->parent);
+
+		/* mark it consumed for error path */
+		css_ar[ss->subsys_id] = NULL;
+
 		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
 		    parent->parent) {
 			pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4480,6 +4501,14 @@ err_free_cgrp:
 	return err;
 
 err_destroy:
+	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_cancel_init(&css->refcnt);
+			ss->css_free(css);
+		}
+	}
 	cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -4539,7 +4568,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
 	INIT_WORK(&css->destroy_work, css_killed_work_fn);
-	schedule_work(&css->destroy_work);
+	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
 /**
@@ -4641,8 +4670,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * will be invoked to perform the rest of destruction once the
 	 * percpu refs of all css's are confirmed to be killed.
 	 */
-	for_each_root_subsys(cgrp->root, ss)
-		kill_css(cgroup_css(cgrp, ss));
+	for_each_root_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
+		if (css)
+			kill_css(css);
+	}
 
 	/*
 	 * Mark @cgrp dead. This prevents further task migration and child
@@ -4711,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 
-	/*
-	 * We should remove the cgroup object from idr before its grace
-	 * period starts, so we won't be looking up a cgroup while the
-	 * cgroup is being freed.
-	 */
-	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-	cgrp->id = -1;
-
 	dput(d);
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -5063,6 +5088,22 @@ out:
 	return err;
 }
 
+static int __init cgroup_wq_init(void)
+{
+	/*
+	 * There isn't much point in executing destruction path in
+	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+	 * Use 1 for @max_active.
+	 *
+	 * We would prefer to do this in cgroup_init() above, but that
+	 * is called before init_workqueues(): so leave this until after.
+	 */
+	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+	BUG_ON(!cgroup_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup_wq_init);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
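
The cgroup hunks above move every destruction work item off system_wq onto a
dedicated cgroup_destroy_wq with max_active = 1, so a burst of concurrent
destructions can neither exhaust system_wq nor deadlock against it. A minimal
sketch of that same pattern, with hypothetical names and only the core
workqueue API assumed, not the cgroup code itself:

#include <linux/init.h>
#include <linux/workqueue.h>

/* Hypothetical driver-side analogue of cgroup_destroy_wq. */
static struct workqueue_struct *teardown_wq;

static void teardown_fn(struct work_struct *work)
{
	/* serialized teardown; cannot fill up system_wq's max_active */
}

static int __init teardown_wq_init(void)
{
	/* max_active = 1: teardown items run strictly one at a time */
	teardown_wq = alloc_workqueue("teardown", 0, 1);
	if (!teardown_wq)
		return -ENOMEM;
	return 0;
}
core_initcall(teardown_wq_init);

Callers then use queue_work(teardown_wq, &obj->work) where they previously
used schedule_work(&obj->work), exactly the substitution the diff makes.
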
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
 
-	if (need_loop)
+	if (need_loop) {
+		local_irq_disable();
 		write_seqcount_begin(&tsk->mems_allowed_seq);
+	}
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
 
-	if (need_loop)
+	if (need_loop) {
 		write_seqcount_end(&tsk->mems_allowed_seq);
+		local_irq_enable();
+	}
 
 	task_unlock(tsk);
 }
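
The cpuset hunks wrap the seqcount write section in local_irq_disable()/
local_irq_enable(): if an interrupt handler opened a read section on the same
CPU while the write section was held open, the reader would spin forever on
the odd sequence count. A sketch of the resulting write/read pairing, with a
hypothetical seqcount rather than the cpuset fields:

#include <linux/irqflags.h>
#include <linux/seqlock.h>

static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static unsigned long demo_val;

static void demo_update(unsigned long v)
{
	local_irq_disable();	/* keep same-CPU irq readers out */
	write_seqcount_begin(&demo_seq);
	demo_val = v;
	write_seqcount_end(&demo_seq);
	local_irq_enable();
}

static unsigned long demo_read(void)
{
	unsigned int seq;
	unsigned long v;

	do {
		seq = read_seqcount_begin(&demo_seq);
		v = demo_val;
	} while (read_seqcount_retry(&demo_seq, seq));
	return v;
}
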
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d724e7757cd1..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1396,6 +1396,8 @@ event_sched_out(struct perf_event *event,
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
+	perf_pmu_disable(event->pmu);
+
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
@@ -1412,6 +1414,8 @@ event_sched_out(struct perf_event *event,
 		ctx->nr_freq--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+
+	perf_pmu_enable(event->pmu);
 }
1416 1420
1417static void 1421static void
@@ -1652,6 +1656,7 @@ event_sched_in(struct perf_event *event,
 		 struct perf_event_context *ctx)
 {
 	u64 tstamp = perf_event_time(event);
+	int ret = 0;
 
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -1674,10 +1679,13 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
+	perf_pmu_disable(event->pmu);
+
 	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto out;
 	}
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1693,7 +1701,10 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
-	return 0;
+out:
+	perf_pmu_enable(event->pmu);
+
+	return ret;
 }
 
 static int
@@ -2743,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		perf_pmu_disable(event->pmu);
+
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2752,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
-			continue;
+			goto next;
 
 		/*
 		 * stop the event and update event->count
@@ -2774,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+	next:
+		perf_pmu_enable(event->pmu);
 	}
 
 	perf_pmu_enable(ctx->pmu);
@@ -5680,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event)
 {
 	int cpu;
 
-	if (event->cpu != -1) {
-		swevent_hlist_put_cpu(event, event->cpu);
-		return;
-	}
-
 	for_each_possible_cpu(cpu)
 		swevent_hlist_put_cpu(event, cpu);
 }
@@ -5718,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event)
 	int err;
 	int cpu, failed_cpu;
 
-	if (event->cpu != -1)
-		return swevent_hlist_get_cpu(event, event->cpu);
-
 	get_online_cpus();
 	for_each_possible_cpu(cpu) {
 		err = swevent_hlist_get_cpu(event, cpu);
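
The perf hunks bracket each per-event scheduling step with perf_pmu_disable()/
perf_pmu_enable(). Those calls nest by reference count, so the hardware is
reprogrammed only at the outermost enable and intermediate changes are
batched. A toy model of that refcount-nested bracketing, hypothetical names
only, not the perf API:

/* Toy model of nested disable/enable as used by perf_pmu_disable()
 * and perf_pmu_enable(); all names here are illustrative. */
struct toy_pmu {
	int disable_count;
};

static void toy_pmu_flush(struct toy_pmu *pmu)
{
	/* reprogram hardware once, with all batched changes applied */
}

static void toy_pmu_disable(struct toy_pmu *pmu)
{
	pmu->disable_count++;	/* first caller stops the PMU */
}

static void toy_pmu_enable(struct toy_pmu *pmu)
{
	if (--pmu->disable_count == 0)
		toy_pmu_flush(pmu);	/* outermost enable restarts it */
}
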
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 static inline int init_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext &&
-	    addr <= (unsigned long)_einittext)
+	    addr < (unsigned long)_einittext)
 		return 1;
 	return 0;
 }
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
 int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
-	    addr <= (unsigned long)_etext)
+	    addr < (unsigned long)_etext)
 		return 1;
 
 	if (system_state == SYSTEM_BOOTING &&
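
Both extable hunks turn the text-range tests into half-open comparisons:
_etext and _einittext label the first byte past their sections, so the old
"addr <= end" wrongly accepted one byte beyond the section. The general rule,
as a standalone illustrative helper:

#include <stdbool.h>
#include <stdint.h>

/* A section [start, end) never contains 'end' itself, because 'end'
 * names the first byte past the section. */
static bool addr_in_section(uintptr_t addr, uintptr_t start, uintptr_t end)
{
	return addr >= start && addr < end;	/* not addr <= end */
}
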
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be9548c..5721f0e3f2da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	spin_lock_init(&mm->page_table_lock);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	clear_tlb_flush_pending(mm);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b462fa197517..aa6a8aadb911 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
 bool pm_freezing;
 bool pm_nosig_freezing;
 
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
 /* protects freezing and frozen transitions */
 static DEFINE_SPINLOCK(freezer_lock);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086f021d..f6ff0191ecf7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 		return -EINVAL;
 	address -= key->both.offset;
 
+	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+		return -EFAULT;
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	 * but access_ok() should be faster than find_vma()
 	 */
 	if (!fshared) {
-		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-			return -EFAULT;
 		key->private.mm = mm;
 		key->private.address = address;
 		get_futex_key_refs(key);
@@ -288,7 +289,7 @@ again:
 		put_page(page);
 		/* serialize against __split_huge_page_splitting() */
 		local_irq_disable();
-		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+		if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
 			page_head = compound_head(page);
 			/*
 			 * page_head is valid pointer but we must pin
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
 		bool is_early = desc->action &&
 			desc->action->flags & IRQF_EARLY_RESUME;
 
-		if (is_early != want_early)
+		if (!is_early && want_early)
 			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc03627e..9c970167e402 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
 	.name = "Crash kernel",
@@ -1675,7 +1678,9 @@ int kernel_kexec(void)
 	} else
 #endif
 	{
+		kexec_in_progress = true;
 		kernel_restart_prepare(NULL);
+		migrate_to_reboot_cpu();
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
 	}
diff --git a/kernel/padata.c b/kernel/padata.c
index 07af2c95dcfe..2abd25d79cc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 
 static int padata_cpu_hash(struct parallel_data *pd)
 {
+	unsigned int seq_nr;
 	int cpu_index;
 
 	/*
@@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)
 	 * seq_nr mod. number of cpus in use.
 	 */
 
-	spin_lock(&pd->seq_lock);
-	cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
-	pd->seq_nr++;
-	spin_unlock(&pd->seq_lock);
+	seq_nr = atomic_inc_return(&pd->seq_nr);
+	cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
 
 	return padata_index_to_cpu(pd, cpu_index);
 }
@@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
 	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
-	pd->seq_nr = 0;
+	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
 	pd->pinst = pinst;
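
The padata hunks replace a spinlock-protected sequence counter with
atomic_inc_return(), so round-robin CPU selection no longer serializes
submitters; seeding the counter with -1 makes the first increment yield 0. A
standalone C11 sketch of the same lockless round robin, illustrative names
only:

#include <stdatomic.h>

/* Lockless round robin over nr_cpus workers; seeding with -1 makes
 * the first pick index 0, mirroring atomic_set(&pd->seq_nr, -1). */
static atomic_uint seq = ATOMIC_VAR_INIT(-1);

static unsigned int pick_worker(unsigned int nr_cpus)
{
	/* fetch_add returns the old value; +1 gives inc-and-return */
	unsigned int n = atomic_fetch_add(&seq, 1) + 1;
	return n % nr_cpus;
}
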
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 463aa6736751..eacb8bd8cab4 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev)
 	list_for_each_entry(tmp, &pm_vt_switch_list, head) {
 		if (tmp->dev == dev) {
 			list_del(&tmp->head);
+			kfree(tmp);
 			break;
 		}
 	}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..08a765232432 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_enabled;
+extern int tick_nohz_active;
 
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
@@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu)
 	int tne;
 
 	/* Handle nohz enablement switches conservatively. */
-	tne = ACCESS_ONCE(tick_nohz_enabled);
+	tne = ACCESS_ONCE(tick_nohz_active);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
 		if (rcu_cpu_has_callbacks(cpu, NULL))
 			invoke_rcu_core(); /* force nohz to see update. */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f813b3474646..662c83fc16b7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-static void migrate_to_reboot_cpu(void)
+void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
 	int cpu = reboot_cpu;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
 	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
 
 /*
  * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	exception_exit(prev_state);
 }
 
-#endif /* CONFIG_PREEMPT */
-
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
@@ -4762,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
 		/*
-		 * If we dont want to free the old_rt yet then
+		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
@@ -4903,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
+	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+		busy_sd = sd->parent; /* sd_busy */
 	}
+	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
@@ -5112,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 	 * die on a /0 trap.
 	 */
 	sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+	sg->sgp->power_orig = sg->sgp->power;
 
 	/*
 	 * Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	__update_inv_weight(lw);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
+
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
235 237
236 238
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-		      max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
@@ -1752,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -3015,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				   unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5379,10 +5370,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
 	 */
 
 	for_each_cpu(cpu, sched_group_cpus(sdg)) {
-		struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+		struct sched_group_power *sgp;
+		struct rq *rq = cpu_rq(cpu);
+
+		/*
+		 * build_sched_domains() -> init_sched_groups_power()
+		 * gets here before we've attached the domains to the
+		 * runqueues.
+		 *
+		 * Use power_of(), which is set irrespective of domains
+		 * in update_cpu_power().
+		 *
+		 * This avoids power/power_orig from being 0 and
+		 * causing divide-by-zero issues on boot.
+		 *
+		 * Runtime updates will correct power_orig.
+		 */
+		if (unlikely(!rq->sd)) {
+			power_orig += power_of(cpu);
+			power += power_of(cpu);
+			continue;
+		}
 
-		power_orig += sg->sgp->power_orig;
-		power += sg->sgp->power;
+		sgp = rq->sd->groups->sgp;
+		power_orig += sgp->power_orig;
+		power += sgp->power;
 		}
 	} else {
 		/*
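
The __calc_delta rewrite above computes delta_exec * weight / lw->weight
entirely in 32.32 fixed point: inv_weight caches 2^32 / weight, the shift
loops keep the multiplier within 32 bits, and mul_u64_u32_shr() performs the
final widening multiply and shift. A self-contained userspace model of the
same arithmetic, a sketch assuming a compiler with unsigned __int128 (names
mirror the kernel's but nothing here is the kernel implementation):

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

/* (val * mul) >> shift with a full-width intermediate product. */
static uint64_t mul_u64_u32_shr(uint64_t val, uint32_t mul, unsigned shift)
{
	return (uint64_t)(((unsigned __int128)val * mul) >> shift);
}

/* delta * weight / lw_weight in 32.32 fixed point. */
static uint64_t calc_delta(uint64_t delta, unsigned long weight,
			   unsigned long lw_weight)
{
	uint32_t inv_weight = lw_weight ? WMULT_CONST / lw_weight : WMULT_CONST;
	uint64_t fact = weight;
	int shift = WMULT_SHIFT;

	while (fact >> 32) {		/* keep the multiplier in 32 bits */
		fact >>= 1;
		shift--;
	}
	fact = (uint64_t)(uint32_t)fact * inv_weight;
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}
	return mul_u64_u32_shr(delta, fact, shift);
}

int main(void)
{
	/* 3 ms of runtime, weight 1024, runqueue weight 3072: ~1000000 ns */
	printf("%llu\n", (unsigned long long)calc_delta(3000000, 1024, 3072));
	return 0;
}
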
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Change rq's cpupri only if rt_rq is the top queue.
+	 */
+	if (&rq->rt != rt_rq)
+		return;
+#endif
 	if (rq->online && prio < prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
910{ 917{
911 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
912 919
920#ifdef CONFIG_RT_GROUP_SCHED
921 /*
922 * Change rq's cpupri only if rt_rq is the top queue.
923 */
924 if (&rq->rt != rt_rq)
925 return;
926#endif
913 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 927 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
914 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 928 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
915} 929}
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
index 4aef390671cb..3e9868d47535 100644
--- a/kernel/system_certificates.S
+++ b/kernel/system_certificates.S
@@ -3,8 +3,18 @@
 
 	__INITRODATA
 
+	.align 8
 	.globl VMLINUX_SYMBOL(system_certificate_list)
 VMLINUX_SYMBOL(system_certificate_list):
+__cert_list_start:
 	.incbin "kernel/x509_certificate_list"
-	.globl VMLINUX_SYMBOL(system_certificate_list_end)
-VMLINUX_SYMBOL(system_certificate_list_end):
+__cert_list_end:
+
+	.align 8
+	.globl VMLINUX_SYMBOL(system_certificate_list_size)
+VMLINUX_SYMBOL(system_certificate_list_size):
+#ifdef CONFIG_64BIT
+	.quad __cert_list_end - __cert_list_start
+#else
+	.long __cert_list_end - __cert_list_start
+#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 564dd93430a2..52ebc70263f4 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -22,7 +22,7 @@ struct key *system_trusted_keyring;
 EXPORT_SYMBOL_GPL(system_trusted_keyring);
 
 extern __initconst const u8 system_certificate_list[];
-extern __initconst const u8 system_certificate_list_end[];
+extern __initconst const unsigned long system_certificate_list_size;
 
 /*
  * Load the compiled-in keys
@@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void)
 
 	pr_notice("Loading compiled-in X.509 certificates\n");
 
-	end = system_certificate_list_end;
 	p = system_certificate_list;
+	end = p + system_certificate_list_size;
 	while (p < end) {
 		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more
 		 * than 256 bytes in size.
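
With system_certificate_list_size now emitted as data rather than an end
label, the loader walks the blob by parsing each certificate's ASN.1 SEQUENCE
header to find its length. A hedged sketch of that header walk (the real loop
lives in load_system_certificate_list(); this standalone helper only computes
one cert's size and omits length forms the sketch doesn't cover):

#include <stddef.h>
#include <stdint.h>

/* Illustrative DER walk: each X.509 cert starts with 0x30 (SEQUENCE)
 * followed by a length field. Returns total cert size, 0 on bad data. */
static size_t der_cert_size(const uint8_t *p, size_t remaining)
{
	size_t hdr = 2, len;

	if (remaining < 2 || p[0] != 0x30)
		return 0;
	if (p[1] < 0x80) {			/* short-form length */
		len = p[1];
	} else if (p[1] == 0x82 && remaining >= 4) {	/* 2-byte length */
		len = ((size_t)p[2] << 8) | p[3];
		hdr = 4;
	} else {
		return 0;	/* other length forms omitted in this sketch */
	}
	return (hdr + len <= remaining) ? hdr + len : 0;
}
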
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..162b03ab0ad2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
  */
 ktime_t tick_next_period;
 ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ *    timekeeping lock all at once. Only the CPU which is assigned to do the
+ *    update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ *    TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ *    at it will take over and keep the time keeping alive.  The handover
+ *    procedure also covers cpu hotplug.
+ */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 
 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..ea20f7d1ac2c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
 /*
  * NO HZ enabled ?
  */
-int tick_nohz_enabled __read_mostly = 1;
-
+static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_active __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, idle;
 
-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;
 
 	now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, iowait;
 
-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;
 
 	now = ktime_get();
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}
 
-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
 		return false;
+	}
 
 	if (need_resched())
 		return false;
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void)
 	local_irq_disable();
 
 	ts = &__get_cpu_var(tick_cpu_sched);
-	/*
-	 * set ts->inidle unconditionally. even if the system did not
-	 * switch to nohz mode the cpu frequency governers rely on the
-	 * update of the idle time accounting in tick_nohz_start_idle().
-	 */
 	ts->inidle = 1;
 	__tick_nohz_idle_enter(ts);
 
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t next;
 
-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return;
 
 	local_irq_disable();
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
 		local_irq_enable();
 		return;
 	}
-
+	tick_nohz_active = 1;
 	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
 	/*
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void)
 	}
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled)
+	if (tick_nohz_enabled) {
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
+		tick_nohz_active = 1;
+	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3abf53418b67..87b4f00284c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
 	tk->xtime_nsec -= remainder;
 	tk->xtime_nsec += 1ULL << tk->shift;
 	tk->ntp_error += remainder << tk->ntp_error_shift;
-
+	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
 }
 #else
 #define old_vsyscall_fixup(tk)
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82fa966..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
 		/*
 		 * The APs use this path later in boot
 		 */
-		base = kmalloc_node(sizeof(*base),
-				    GFP_KERNEL | __GFP_ZERO,
-				    cpu_to_node(cpu));
+		base = kzalloc_node(sizeof(*base), GFP_KERNEL,
+				    cpu_to_node(cpu));
 		if (!base)
 			return -ENOMEM;
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..72a0f81dc5a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	if (unlikely(ftrace_disabled))
-		return -ENODEV;
-
 	if (FTRACE_WARN_ON(ops == &global_ops))
 		return -EINVAL;
 
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	if (ftrace_disabled)
-		return -ENODEV;
-
 	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
 		return -EBUSY;
 
@@ -781,7 +775,7 @@ static int ftrace_profile_init(void)
 	int cpu;
 	int ret = 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		ret = ftrace_profile_init_cpu(cpu);
 		if (ret)
 			break;
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)
 static int ftrace_startup(struct ftrace_ops *ops, int command)
 {
 	bool hash_enable = true;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
+	ret = __register_ftrace_function(ops);
+	if (ret)
+		return ret;
+
 	ftrace_start_up++;
 	command |= FTRACE_UPDATE_CALLS;
 
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
 	return 0;
 }
 
-static void ftrace_shutdown(struct ftrace_ops *ops, int command)
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 {
 	bool hash_disable = true;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
-		return;
+		return -ENODEV;
+
+	ret = __unregister_ftrace_function(ops);
+	if (ret)
+		return ret;
 
 	ftrace_start_up--;
 	/*
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
 	}
 
 	if (!command || !ftrace_enabled)
-		return;
+		return 0;
 
 	ftrace_run_update_code(command);
+	return 0;
 }
 
 static void ftrace_startup_sysctl(void)
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)
 	if (i == FTRACE_FUNC_HASHSIZE)
 		return;
 
-	ret = __register_ftrace_function(&trace_probe_ops);
-	if (!ret)
-		ret = ftrace_startup(&trace_probe_ops, 0);
+	ret = ftrace_startup(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 1;
 }
 
 static void __disable_ftrace_function_probe(void)
 {
-	int ret;
 	int i;
 
 	if (!ftrace_probe_registered)
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)
 	}
 
 	/* no more funcs left */
-	ret = __unregister_ftrace_function(&trace_probe_ops);
-	if (!ret)
-		ftrace_shutdown(&trace_probe_ops, 0);
+	ftrace_shutdown(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 0;
 }
@@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 /* Keep as macros so we do not need to define the commands */
 # define ftrace_startup(ops, command)			\
 	({						\
-		(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
-		0;					\
+		int ___ret = __register_ftrace_function(ops);	\
+		if (!___ret)				\
+			(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
+		___ret;					\
 	})
-# define ftrace_shutdown(ops, command)	do { } while (0)
+# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
+
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 
@@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
4780 4783
4781 mutex_lock(&ftrace_lock); 4784 mutex_lock(&ftrace_lock);
4782 4785
4783 ret = __register_ftrace_function(ops); 4786 ret = ftrace_startup(ops, 0);
4784 if (!ret)
4785 ret = ftrace_startup(ops, 0);
4786 4787
4787 mutex_unlock(&ftrace_lock); 4788 mutex_unlock(&ftrace_lock);
4788 4789
@@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
4801 int ret; 4802 int ret;
4802 4803
4803 mutex_lock(&ftrace_lock); 4804 mutex_lock(&ftrace_lock);
4804 ret = __unregister_ftrace_function(ops); 4805 ret = ftrace_shutdown(ops, 0);
4805 if (!ret)
4806 ftrace_shutdown(ops, 0);
4807 mutex_unlock(&ftrace_lock); 4806 mutex_unlock(&ftrace_lock);
4808 4807
4809 return ret; 4808 return ret;
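A minimal sketch, not part of the patch, of the caller pattern that results once registration is folded into ftrace_startup()/ftrace_shutdown(): an out-of-tree tracer only uses the public register_ftrace_function()/unregister_ftrace_function() pair, and any failure inside ftrace_startup() is now reported back through that single call. The names my_callback/my_ops are placeholders; the callback signature assumed is the 3.13-era ftrace_ops one.

#include <linux/ftrace.h>
#include <linux/module.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	/* runs for every traced function entry */
}

static struct ftrace_ops my_ops = {
	.func  = my_callback,
	.flags = FTRACE_OPS_FL_RECURSION_SAFE,
};

static int __init my_init(void)
{
	/* a failure inside ftrace_startup() now propagates out here
	 * instead of leaving a half-registered ops behind */
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}
module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");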
@@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
4997 return NOTIFY_DONE; 4996 return NOTIFY_DONE;
4998} 4997}
4999 4998
4999/* Just a place holder for function graph */
5000static struct ftrace_ops fgraph_ops __read_mostly = {
5001 .func = ftrace_stub,
5002 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5003 FTRACE_OPS_FL_RECURSION_SAFE,
5004};
5005
5000int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5006int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5001 trace_func_graph_ent_t entryfunc) 5007 trace_func_graph_ent_t entryfunc)
5002{ 5008{
@@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5023 ftrace_graph_return = retfunc; 5029 ftrace_graph_return = retfunc;
5024 ftrace_graph_entry = entryfunc; 5030 ftrace_graph_entry = entryfunc;
5025 5031
5026 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5032 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
5027 5033
5028out: 5034out:
5029 mutex_unlock(&ftrace_lock); 5035 mutex_unlock(&ftrace_lock);
@@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void)
5040 ftrace_graph_active--; 5046 ftrace_graph_active--;
5041 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5047 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5042 ftrace_graph_entry = ftrace_graph_entry_stub; 5048 ftrace_graph_entry = ftrace_graph_entry_stub;
5043 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5049 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
5044 unregister_pm_notifier(&ftrace_suspend_notifier); 5050 unregister_pm_notifier(&ftrace_suspend_notifier);
5045 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5051 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5046 5052
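A minimal sketch of the function-graph registration path that now accounts its ops to the fgraph_ops stub above instead of global_ops; the handler names are placeholders, and the non-zero return from the entry handler simply means "trace this call".

#include <linux/ftrace.h>
#include <linux/module.h>

static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return 1;	/* trace this call */
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
	/* runs as the traced function returns */
}

static int __init my_graph_init(void)
{
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}
module_init(my_graph_init);
MODULE_LICENSE("GPL");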
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3b52ac..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,12 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 if (tp_event->perf_perm) {
28 int ret = tp_event->perf_perm(tp_event, p_event);
29 if (ret)
30 return ret;
31 }
32
27 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
173int perf_trace_init(struct perf_event *p_event) 179int perf_trace_init(struct perf_event *p_event)
174{ 180{
175 struct ftrace_event_call *tp_event; 181 struct ftrace_event_call *tp_event;
176 int event_id = p_event->attr.config; 182 u64 event_id = p_event->attr.config;
177 int ret = -EINVAL; 183 int ret = -EINVAL;
178 184
179 mutex_lock(&event_mutex); 185 mutex_lock(&event_mutex);
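A hedged sketch of a callback matching the ->perf_perm hook called in the hunk above: it receives the event and the perf_event and returns 0 to allow or a negative errno to refuse. How the pointer is installed on a given event is not shown in this hunk, so the wiring comment below is illustrative only, as are the names.

static int my_event_perm(struct ftrace_event_call *call,
			 struct perf_event *p_event)
{
	/* example policy: restrict this event to CAP_SYS_ADMIN */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

/* illustrative wiring, not from the patch:
 *	tp_event->perf_perm = my_event_perm;
 */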
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f919a2e21bf3..a11800ae96de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr)
2314 /* Disable any running events */ 2314 /* Disable any running events */
2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2316 2316
2317 /* Accesses to events happen within rcu_read_lock_sched() */
2318 synchronize_sched();
2319
2317 down_write(&trace_event_sem); 2320 down_write(&trace_event_sem);
2318 __trace_remove_event_dirs(tr); 2321 __trace_remove_event_dirs(tr);
2319 debugfs_remove_recursive(tr->event_dir); 2322 debugfs_remove_recursive(tr->event_dir);
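A small sketch of the pairing the added synchronize_sched() relies on, assuming the readers run under sched-RCU as the new comment states; the function names are illustrative.

#include <linux/rcupdate.h>

static void reader_side(void)
{
	rcu_read_lock_sched();
	/* ... access per-event data that the writer is about to tear down ... */
	rcu_read_unlock_sched();
}

static void writer_side(void)
{
	synchronize_sched();	/* every reader_side() in flight has finished */
	/* now safe to remove the event directories */
}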
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e4b6d11bdf78..ea90eb5f6f17 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
431 if (!tr->sys_refcount_enter) 431 if (!tr->sys_refcount_enter)
432 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 432 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
433 mutex_unlock(&syscall_trace_lock); 433 mutex_unlock(&syscall_trace_lock);
434 /*
435 * Callers expect the event to be completely disabled on
436 * return, so wait for current handlers to finish.
437 */
438 synchronize_sched();
439} 434}
440 435
441static int reg_event_syscall_exit(struct ftrace_event_file *file, 436static int reg_event_syscall_exit(struct ftrace_event_file *file,
@@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
474 if (!tr->sys_refcount_exit) 469 if (!tr->sys_refcount_exit)
475 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 470 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
476 mutex_unlock(&syscall_trace_lock); 471 mutex_unlock(&syscall_trace_lock);
477 /*
478 * Callers expect the event to be completely disabled on
479 * return, so wait for current handlers to finish.
480 */
481 synchronize_sched();
482} 472}
483 473
484static int __init init_syscall_trace(struct ftrace_event_call *call) 474static int __init init_syscall_trace(struct ftrace_event_call *call)
diff --git a/kernel/user.c b/kernel/user.c
index a3a0dbfda329..c006131beb77 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,9 +51,9 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_KEYS_KERBEROS_CACHE 54#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .krb_cache_register_sem = 55 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), 56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
57#endif 57#endif
58}; 58};
59EXPORT_SYMBOL_GPL(init_user_ns); 59EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..b010eac595d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
305/* I: attributes used when instantiating standard unbound pools on demand */ 305/* I: attributes used when instantiating standard unbound pools on demand */
306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 306static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
307 307
308/* I: attributes used when instantiating ordered pools on demand */
309static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
310
308struct workqueue_struct *system_wq __read_mostly; 311struct workqueue_struct *system_wq __read_mostly;
309EXPORT_SYMBOL(system_wq); 312EXPORT_SYMBOL(system_wq);
310struct workqueue_struct *system_highpri_wq __read_mostly; 313struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
518static inline void debug_work_deactivate(struct work_struct *work) { } 521static inline void debug_work_deactivate(struct work_struct *work) { }
519#endif 522#endif
520 523
521/* allocate ID and assign it to @pool */ 524/**
 525 * worker_pool_assign_id - allocate ID and assign it to @pool
526 * @pool: the pool pointer of interest
527 *
528 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
529 * successfully, -errno on failure.
530 */
522static int worker_pool_assign_id(struct worker_pool *pool) 531static int worker_pool_assign_id(struct worker_pool *pool)
523{ 532{
524 int ret; 533 int ret;
525 534
526 lockdep_assert_held(&wq_pool_mutex); 535 lockdep_assert_held(&wq_pool_mutex);
527 536
528 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 537 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
538 GFP_KERNEL);
529 if (ret >= 0) { 539 if (ret >= 0) {
530 pool->id = ret; 540 pool->id = ret;
531 return 0; 541 return 0;
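A minimal sketch of the bounded idr_alloc() pattern used above: passing an upper limit makes idr_alloc() fail with -ENOSPC instead of handing out an ID that would not fit, which is what capping pool IDs below WORK_OFFQ_POOL_NONE buys. The names example_idr/example_assign_id are placeholders.

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(example_idr);

static int example_assign_id(void *obj, int limit)
{
	/* returns an ID in [0, limit) on success, a negative errno otherwise */
	return idr_alloc(&example_idr, obj, 0, limit, GFP_KERNEL);
}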
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
1320 1330
1321 debug_work_activate(work); 1331 debug_work_activate(work);
1322 1332
1323 /* if dying, only works from the same workqueue are allowed */ 1333 /* if draining, only works from the same workqueue are allowed */
1324 if (unlikely(wq->flags & __WQ_DRAINING) && 1334 if (unlikely(wq->flags & __WQ_DRAINING) &&
1325 WARN_ON_ONCE(!is_chained_work(wq))) 1335 WARN_ON_ONCE(!is_chained_work(wq)))
1326 return; 1336 return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
1736 if (IS_ERR(worker->task)) 1746 if (IS_ERR(worker->task))
1737 goto fail; 1747 goto fail;
1738 1748
1749 set_user_nice(worker->task, pool->attrs->nice);
1750
1751 /* prevent userland from meddling with cpumask of workqueue workers */
1752 worker->task->flags |= PF_NO_SETAFFINITY;
1753
1739 /* 1754 /*
1740 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1755 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1741 * online CPUs. It'll be re-applied when any of the CPUs come up. 1756 * online CPUs. It'll be re-applied when any of the CPUs come up.
1742 */ 1757 */
1743 set_user_nice(worker->task, pool->attrs->nice);
1744 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1758 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1745 1759
1746 /* prevent userland from meddling with cpumask of workqueue workers */
1747 worker->task->flags |= PF_NO_SETAFFINITY;
1748
1749 /* 1760 /*
1750 * The caller is responsible for ensuring %POOL_DISASSOCIATED 1761 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1751 * remains stable across this function. See the comments above the 1762 * remains stable across this function. See the comments above the
@@ -2840,19 +2851,6 @@ already_gone:
2840 return false; 2851 return false;
2841} 2852}
2842 2853
2843static bool __flush_work(struct work_struct *work)
2844{
2845 struct wq_barrier barr;
2846
2847 if (start_flush_work(work, &barr)) {
2848 wait_for_completion(&barr.done);
2849 destroy_work_on_stack(&barr.work);
2850 return true;
2851 } else {
2852 return false;
2853 }
2854}
2855
2856/** 2854/**
2857 * flush_work - wait for a work to finish executing the last queueing instance 2855 * flush_work - wait for a work to finish executing the last queueing instance
2858 * @work: the work to flush 2856 * @work: the work to flush
@@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work)
2866 */ 2864 */
2867bool flush_work(struct work_struct *work) 2865bool flush_work(struct work_struct *work)
2868{ 2866{
2867 struct wq_barrier barr;
2868
2869 lock_map_acquire(&work->lockdep_map); 2869 lock_map_acquire(&work->lockdep_map);
2870 lock_map_release(&work->lockdep_map); 2870 lock_map_release(&work->lockdep_map);
2871 2871
2872 return __flush_work(work); 2872 if (start_flush_work(work, &barr)) {
2873 wait_for_completion(&barr.done);
2874 destroy_work_on_stack(&barr.work);
2875 return true;
2876 } else {
2877 return false;
2878 }
2873} 2879}
2874EXPORT_SYMBOL_GPL(flush_work); 2880EXPORT_SYMBOL_GPL(flush_work);
2875 2881
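A minimal usage sketch of the caller-visible contract, which the refactor above leaves unchanged: flush_work() returns true when it had to wait for the last queueing instance and false when the work item was already idle. my_work_fn/my_work are placeholder names.

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	/* the deferred job */
}
static DECLARE_WORK(my_work, my_work_fn);

static void example(void)
{
	schedule_work(&my_work);
	flush_work(&my_work);	/* my_work_fn() has completed on return */
}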
@@ -4106,7 +4112,7 @@ out_unlock:
4106static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4112static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4107{ 4113{
4108 bool highpri = wq->flags & WQ_HIGHPRI; 4114 bool highpri = wq->flags & WQ_HIGHPRI;
4109 int cpu; 4115 int cpu, ret;
4110 4116
4111 if (!(wq->flags & WQ_UNBOUND)) { 4117 if (!(wq->flags & WQ_UNBOUND)) {
4112 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); 4118 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4126 mutex_unlock(&wq->mutex); 4132 mutex_unlock(&wq->mutex);
4127 } 4133 }
4128 return 0; 4134 return 0;
4135 } else if (wq->flags & __WQ_ORDERED) {
4136 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4137 /* there should only be single pwq for ordering guarantee */
4138 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4139 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4140 "ordering guarantee broken for workqueue %s\n", wq->name);
4141 return ret;
4129 } else { 4142 } else {
4130 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4143 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
4131 } 4144 }
@@ -4814,14 +4827,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4814 4827
4815 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4828 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4816 schedule_work_on(cpu, &wfc.work); 4829 schedule_work_on(cpu, &wfc.work);
4817 4830 flush_work(&wfc.work);
4818 /*
4819 * The work item is on-stack and can't lead to deadlock through
4820 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4821 * when work_on_cpu()s are nested.
4822 */
4823 __flush_work(&wfc.work);
4824
4825 return wfc.ret; 4831 return wfc.ret;
4826} 4832}
4827EXPORT_SYMBOL_GPL(work_on_cpu); 4833EXPORT_SYMBOL_GPL(work_on_cpu);
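A minimal caller-side sketch of work_on_cpu(), whose internals the hunk above simplifies back to a plain flush_work(); the function and names below are illustrative.

#include <linux/workqueue.h>

static long read_local_state(void *arg)
{
	/* runs in process context on the requested CPU */
	return 0;
}

static long example(int cpu)
{
	return work_on_cpu(cpu, read_local_state, NULL);
}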
@@ -5009,10 +5015,6 @@ static int __init init_workqueues(void)
5009 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5015 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5010 int i, cpu; 5016 int i, cpu;
5011 5017
5012 /* make sure we have enough bits for OFFQ pool ID */
5013 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
5014 WORK_CPU_END * NR_STD_WORKER_POOLS);
5015
5016 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5018 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5017 5019
5018 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5020 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5053,23 @@ static int __init init_workqueues(void)
5051 } 5053 }
5052 } 5054 }
5053 5055
5054 /* create default unbound wq attrs */ 5056 /* create default unbound and ordered wq attrs */
5055 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5057 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5056 struct workqueue_attrs *attrs; 5058 struct workqueue_attrs *attrs;
5057 5059
5058 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5060 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5059 attrs->nice = std_nice[i]; 5061 attrs->nice = std_nice[i];
5060 unbound_std_wq_attrs[i] = attrs; 5062 unbound_std_wq_attrs[i] = attrs;
5063
5064 /*
5065 * An ordered wq should have only one pwq as ordering is
5066 * guaranteed by max_active which is enforced by pwqs.
5067 * Turn off NUMA so that dfl_pwq is used for all nodes.
5068 */
5069 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5070 attrs->nice = std_nice[i];
5071 attrs->no_numa = true;
5072 ordered_wq_attrs[i] = attrs;
5061 } 5073 }
5062 5074
5063 system_wq = alloc_workqueue("events", 0, 0); 5075 system_wq = alloc_workqueue("events", 0, 0);