diff options
author | Mauro Carvalho Chehab <mchehab@redhat.com> | 2013-01-11 10:28:19 -0500 |
---|---|---|
committer | Mauro Carvalho Chehab <mchehab@redhat.com> | 2013-01-11 10:28:19 -0500 |
commit | 734d1ece37fbf3d2ddfc71bc6c69e0fe35f02542 (patch) | |
tree | c4805dd7e746b1feb9e09e9849f3245d0b2c0c6b /kernel | |
parent | 216c82c6aba63eeb49d7654b448e0d47bea255bb (diff) | |
parent | 9931faca02c604c22335f5a935a501bb2ace6e20 (diff) |
Merge tag 'v3.8-rc3' into v4l_for_linus
Linux 3.8-rc3
* tag 'v3.8-rc3': (11110 commits)
Linux 3.8-rc3
mm: reinstante dropped pmd_trans_splitting() check
cred: Remove tgcred pointer from struct cred
drm/ttm: fix fence locking in ttm_buffer_object_transfer
ARM: clps711x: Fix bad merge of clockevents setup
ARM: highbank: save and restore L2 cache and GIC on suspend
ARM: highbank: add a power request clear
ARM: highbank: fix secondary boot and hotplug
ARM: highbank: fix typos with hignbank in power request functions
ARM: dts: fix highbank cpu mpidr values
ARM: dts: add device_type prop to cpu nodes on Calxeda platforms
drm/prime: drop reference on imported dma-buf come from gem
xen/netfront: improve truesize tracking
ARM: mx5: Fix MX53 flexcan2 clock
ARM: OMAP2+: am33xx-hwmod: Fix wrongly terminated am33xx_usbss_mpu_irqs array
sctp: fix Kconfig bug in default cookie hmac selection
EDAC: Cleanup device deregistering path
EDAC: Fix EDAC Kconfig menu
EDAC: Fix kernel panic on module unloading
ALSA: hda - add mute LED for HP Pavilion 17 (Realtek codec)
...
Diffstat (limited to 'kernel')
99 files changed, 4859 insertions, 2771 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 86e3285ae7e5..6c072b6da239 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | |||
54 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 54 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
55 | obj-$(CONFIG_UID16) += uid16.o | 55 | obj-$(CONFIG_UID16) += uid16.o |
56 | obj-$(CONFIG_MODULES) += module.o | 56 | obj-$(CONFIG_MODULES) += module.o |
57 | obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o | 57 | obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o |
58 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 58 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
59 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 59 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
60 | obj-$(CONFIG_KEXEC) += kexec.o | 60 | obj-$(CONFIG_KEXEC) += kexec.o |
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | |||
110 | obj-$(CONFIG_PADATA) += padata.o | 110 | obj-$(CONFIG_PADATA) += padata.o |
111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
113 | obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o | ||
113 | 114 | ||
114 | $(obj)/configs.o: $(obj)/config_data.h | 115 | $(obj)/configs.o: $(obj)/config_data.h |
115 | 116 | ||
@@ -136,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y) | |||
136 | # | 137 | # |
137 | # Pull the signing certificate and any extra certificates into the kernel | 138 | # Pull the signing certificate and any extra certificates into the kernel |
138 | # | 139 | # |
140 | |||
141 | quiet_cmd_touch = TOUCH $@ | ||
142 | cmd_touch = touch $@ | ||
143 | |||
139 | extra_certificates: | 144 | extra_certificates: |
140 | touch $@ | 145 | $(call cmd,touch) |
141 | 146 | ||
142 | kernel/modsign_pubkey.o: signing_key.x509 extra_certificates | 147 | kernel/modsign_certificate.o: signing_key.x509 extra_certificates |
143 | 148 | ||
144 | ############################################################################### | 149 | ############################################################################### |
145 | # | 150 | # |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ed206fd88cca..e81175ef25f8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p) | |||
249 | list_del_rcu(&chunk->hash); | 249 | list_del_rcu(&chunk->hash); |
250 | spin_unlock(&hash_lock); | 250 | spin_unlock(&hash_lock); |
251 | spin_unlock(&entry->lock); | 251 | spin_unlock(&entry->lock); |
252 | fsnotify_destroy_mark(entry); | 252 | fsnotify_destroy_mark(entry, audit_tree_group); |
253 | goto out; | 253 | goto out; |
254 | } | 254 | } |
255 | 255 | ||
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p) | |||
291 | owner->root = new; | 291 | owner->root = new; |
292 | spin_unlock(&hash_lock); | 292 | spin_unlock(&hash_lock); |
293 | spin_unlock(&entry->lock); | 293 | spin_unlock(&entry->lock); |
294 | fsnotify_destroy_mark(entry); | 294 | fsnotify_destroy_mark(entry, audit_tree_group); |
295 | fsnotify_put_mark(&new->mark); /* drop initial reference */ | 295 | fsnotify_put_mark(&new->mark); /* drop initial reference */ |
296 | goto out; | 296 | goto out; |
297 | 297 | ||
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
331 | spin_unlock(&hash_lock); | 331 | spin_unlock(&hash_lock); |
332 | chunk->dead = 1; | 332 | chunk->dead = 1; |
333 | spin_unlock(&entry->lock); | 333 | spin_unlock(&entry->lock); |
334 | fsnotify_destroy_mark(entry); | 334 | fsnotify_destroy_mark(entry, audit_tree_group); |
335 | fsnotify_put_mark(entry); | 335 | fsnotify_put_mark(entry); |
336 | return 0; | 336 | return 0; |
337 | } | 337 | } |
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
412 | spin_unlock(&chunk_entry->lock); | 412 | spin_unlock(&chunk_entry->lock); |
413 | spin_unlock(&old_entry->lock); | 413 | spin_unlock(&old_entry->lock); |
414 | 414 | ||
415 | fsnotify_destroy_mark(chunk_entry); | 415 | fsnotify_destroy_mark(chunk_entry, audit_tree_group); |
416 | 416 | ||
417 | fsnotify_put_mark(chunk_entry); | 417 | fsnotify_put_mark(chunk_entry); |
418 | fsnotify_put_mark(old_entry); | 418 | fsnotify_put_mark(old_entry); |
@@ -443,7 +443,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
443 | spin_unlock(&hash_lock); | 443 | spin_unlock(&hash_lock); |
444 | spin_unlock(&chunk_entry->lock); | 444 | spin_unlock(&chunk_entry->lock); |
445 | spin_unlock(&old_entry->lock); | 445 | spin_unlock(&old_entry->lock); |
446 | fsnotify_destroy_mark(old_entry); | 446 | fsnotify_destroy_mark(old_entry, audit_tree_group); |
447 | fsnotify_put_mark(chunk_entry); /* drop initial reference */ | 447 | fsnotify_put_mark(chunk_entry); /* drop initial reference */ |
448 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ | 448 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ |
449 | return 0; | 449 | return 0; |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9a9ae6e3d290..4a599f699adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c | |||
@@ -350,7 +350,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
350 | } | 350 | } |
351 | mutex_unlock(&audit_filter_mutex); | 351 | mutex_unlock(&audit_filter_mutex); |
352 | 352 | ||
353 | fsnotify_destroy_mark(&parent->mark); | 353 | fsnotify_destroy_mark(&parent->mark, audit_watch_group); |
354 | } | 354 | } |
355 | 355 | ||
356 | /* Get path information necessary for adding watches. */ | 356 | /* Get path information necessary for adding watches. */ |
@@ -457,7 +457,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) | |||
457 | 457 | ||
458 | if (list_empty(&parent->watches)) { | 458 | if (list_empty(&parent->watches)) { |
459 | audit_get_parent(parent); | 459 | audit_get_parent(parent); |
460 | fsnotify_destroy_mark(&parent->mark); | 460 | fsnotify_destroy_mark(&parent->mark, audit_watch_group); |
461 | audit_put_parent(parent); | 461 | audit_put_parent(parent); |
462 | } | 462 | } |
463 | } | 463 | } |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2f186ed80c40..e37e6a12c5e3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -200,7 +200,6 @@ struct audit_context { | |||
200 | struct list_head names_list; /* anchor for struct audit_names->list */ | 200 | struct list_head names_list; /* anchor for struct audit_names->list */ |
201 | char * filterkey; /* key for rule that triggered record */ | 201 | char * filterkey; /* key for rule that triggered record */ |
202 | struct path pwd; | 202 | struct path pwd; |
203 | struct audit_context *previous; /* For nested syscalls */ | ||
204 | struct audit_aux_data *aux; | 203 | struct audit_aux_data *aux; |
205 | struct audit_aux_data *aux_pids; | 204 | struct audit_aux_data *aux_pids; |
206 | struct sockaddr_storage *sockaddr; | 205 | struct sockaddr_storage *sockaddr; |
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk) | |||
1091 | 1090 | ||
1092 | static inline void audit_free_context(struct audit_context *context) | 1091 | static inline void audit_free_context(struct audit_context *context) |
1093 | { | 1092 | { |
1094 | struct audit_context *previous; | 1093 | audit_free_names(context); |
1095 | int count = 0; | 1094 | unroll_tree_refs(context, NULL, 0); |
1096 | 1095 | free_tree_refs(context); | |
1097 | do { | 1096 | audit_free_aux(context); |
1098 | previous = context->previous; | 1097 | kfree(context->filterkey); |
1099 | if (previous || (count && count < 10)) { | 1098 | kfree(context->sockaddr); |
1100 | ++count; | 1099 | kfree(context); |
1101 | printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" | ||
1102 | " freeing multiple contexts (%d)\n", | ||
1103 | context->serial, context->major, | ||
1104 | context->name_count, count); | ||
1105 | } | ||
1106 | audit_free_names(context); | ||
1107 | unroll_tree_refs(context, NULL, 0); | ||
1108 | free_tree_refs(context); | ||
1109 | audit_free_aux(context); | ||
1110 | kfree(context->filterkey); | ||
1111 | kfree(context->sockaddr); | ||
1112 | kfree(context); | ||
1113 | context = previous; | ||
1114 | } while (context); | ||
1115 | if (count >= 10) | ||
1116 | printk(KERN_ERR "audit: freed %d contexts\n", count); | ||
1117 | } | 1100 | } |
1118 | 1101 | ||
1119 | void audit_log_task_context(struct audit_buffer *ab) | 1102 | void audit_log_task_context(struct audit_buffer *ab) |
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1159 | cred = current_cred(); | 1142 | cred = current_cred(); |
1160 | 1143 | ||
1161 | spin_lock_irq(&tsk->sighand->siglock); | 1144 | spin_lock_irq(&tsk->sighand->siglock); |
1162 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 1145 | if (tsk->signal && tsk->signal->tty) |
1163 | tty = tsk->signal->tty->name; | 1146 | tty = tsk->signal->tty->name; |
1164 | else | 1147 | else |
1165 | tty = "(none)"; | 1148 | tty = "(none)"; |
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major, | |||
1783 | if (!context) | 1766 | if (!context) |
1784 | return; | 1767 | return; |
1785 | 1768 | ||
1786 | /* | ||
1787 | * This happens only on certain architectures that make system | ||
1788 | * calls in kernel_thread via the entry.S interface, instead of | ||
1789 | * with direct calls. (If you are porting to a new | ||
1790 | * architecture, hitting this condition can indicate that you | ||
1791 | * got the _exit/_leave calls backward in entry.S.) | ||
1792 | * | ||
1793 | * i386 no | ||
1794 | * x86_64 no | ||
1795 | * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) | ||
1796 | * | ||
1797 | * This also happens with vm86 emulation in a non-nested manner | ||
1798 | * (entries without exits), so this case must be caught. | ||
1799 | */ | ||
1800 | if (context->in_syscall) { | ||
1801 | struct audit_context *newctx; | ||
1802 | |||
1803 | #if AUDIT_DEBUG | ||
1804 | printk(KERN_ERR | ||
1805 | "audit(:%d) pid=%d in syscall=%d;" | ||
1806 | " entering syscall=%d\n", | ||
1807 | context->serial, tsk->pid, context->major, major); | ||
1808 | #endif | ||
1809 | newctx = audit_alloc_context(context->state); | ||
1810 | if (newctx) { | ||
1811 | newctx->previous = context; | ||
1812 | context = newctx; | ||
1813 | tsk->audit_context = newctx; | ||
1814 | } else { | ||
1815 | /* If we can't alloc a new context, the best we | ||
1816 | * can do is to leak memory (any pending putname | ||
1817 | * will be lost). The only other alternative is | ||
1818 | * to abandon auditing. */ | ||
1819 | audit_zero_context(context, context->state); | ||
1820 | } | ||
1821 | } | ||
1822 | BUG_ON(context->in_syscall || context->name_count); | 1769 | BUG_ON(context->in_syscall || context->name_count); |
1823 | 1770 | ||
1824 | if (!audit_enabled) | 1771 | if (!audit_enabled) |
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code) | |||
1881 | if (!list_empty(&context->killed_trees)) | 1828 | if (!list_empty(&context->killed_trees)) |
1882 | audit_kill_trees(&context->killed_trees); | 1829 | audit_kill_trees(&context->killed_trees); |
1883 | 1830 | ||
1884 | if (context->previous) { | 1831 | audit_free_names(context); |
1885 | struct audit_context *new_context = context->previous; | 1832 | unroll_tree_refs(context, NULL, 0); |
1886 | context->previous = NULL; | 1833 | audit_free_aux(context); |
1887 | audit_free_context(context); | 1834 | context->aux = NULL; |
1888 | tsk->audit_context = new_context; | 1835 | context->aux_pids = NULL; |
1889 | } else { | 1836 | context->target_pid = 0; |
1890 | audit_free_names(context); | 1837 | context->target_sid = 0; |
1891 | unroll_tree_refs(context, NULL, 0); | 1838 | context->sockaddr_len = 0; |
1892 | audit_free_aux(context); | 1839 | context->type = 0; |
1893 | context->aux = NULL; | 1840 | context->fds[0] = -1; |
1894 | context->aux_pids = NULL; | 1841 | if (context->state != AUDIT_RECORD_CONTEXT) { |
1895 | context->target_pid = 0; | 1842 | kfree(context->filterkey); |
1896 | context->target_sid = 0; | 1843 | context->filterkey = NULL; |
1897 | context->sockaddr_len = 0; | ||
1898 | context->type = 0; | ||
1899 | context->fds[0] = -1; | ||
1900 | if (context->state != AUDIT_RECORD_CONTEXT) { | ||
1901 | kfree(context->filterkey); | ||
1902 | context->filterkey = NULL; | ||
1903 | } | ||
1904 | tsk->audit_context = context; | ||
1905 | } | 1844 | } |
1845 | tsk->audit_context = context; | ||
1906 | } | 1846 | } |
1907 | 1847 | ||
1908 | static inline void handle_one(const struct inode *inode) | 1848 | static inline void handle_one(const struct inode *inode) |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..4855892798fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -138,6 +138,9 @@ struct cgroupfs_root { | |||
138 | /* Hierarchy-specific flags */ | 138 | /* Hierarchy-specific flags */ |
139 | unsigned long flags; | 139 | unsigned long flags; |
140 | 140 | ||
141 | /* IDs for cgroups in this hierarchy */ | ||
142 | struct ida cgroup_ida; | ||
143 | |||
141 | /* The path to use for release notifications. */ | 144 | /* The path to use for release notifications. */ |
142 | char release_agent_path[PATH_MAX]; | 145 | char release_agent_path[PATH_MAX]; |
143 | 146 | ||
@@ -171,8 +174,8 @@ struct css_id { | |||
171 | * The css to which this ID points. This pointer is set to valid value | 174 | * The css to which this ID points. This pointer is set to valid value |
172 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 175 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
173 | * This pointer is expected to be RCU-safe because destroy() | 176 | * This pointer is expected to be RCU-safe because destroy() |
174 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 177 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
175 | * css_tryget() should be used for avoiding race. | 178 | * should be used for avoiding race. |
176 | */ | 179 | */ |
177 | struct cgroup_subsys_state __rcu *css; | 180 | struct cgroup_subsys_state __rcu *css; |
178 | /* | 181 | /* |
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
242 | */ | 245 | */ |
243 | static int need_forkexit_callback __read_mostly; | 246 | static int need_forkexit_callback __read_mostly; |
244 | 247 | ||
248 | static int cgroup_destroy_locked(struct cgroup *cgrp); | ||
249 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | ||
250 | struct cftype cfts[], bool is_add); | ||
251 | |||
245 | #ifdef CONFIG_PROVE_LOCKING | 252 | #ifdef CONFIG_PROVE_LOCKING |
246 | int cgroup_lock_is_held(void) | 253 | int cgroup_lock_is_held(void) |
247 | { | 254 | { |
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
294 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 301 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
295 | } | 302 | } |
296 | 303 | ||
297 | static int clone_children(const struct cgroup *cgrp) | ||
298 | { | ||
299 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
300 | } | ||
301 | |||
302 | /* | 304 | /* |
303 | * for_each_subsys() allows you to iterate on each subsystem attached to | 305 | * for_each_subsys() allows you to iterate on each subsystem attached to |
304 | * an active hierarchy | 306 | * an active hierarchy |
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
782 | * The task_lock() exception | 784 | * The task_lock() exception |
783 | * | 785 | * |
784 | * The need for this exception arises from the action of | 786 | * The need for this exception arises from the action of |
785 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with | 787 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
786 | * another. It does so using cgroup_mutex, however there are | 788 | * another. It does so using cgroup_mutex, however there are |
787 | * several performance critical places that need to reference | 789 | * several performance critical places that need to reference |
788 | * task->cgroup without the expense of grabbing a system global | 790 | * task->cgroup without the expense of grabbing a system global |
789 | * mutex. Therefore except as noted below, when dereferencing or, as | 791 | * mutex. Therefore except as noted below, when dereferencing or, as |
790 | * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use | 792 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
791 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 793 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
792 | * the task_struct routinely used for such matters. | 794 | * the task_struct routinely used for such matters. |
793 | * | 795 | * |
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
854 | return inode; | 856 | return inode; |
855 | } | 857 | } |
856 | 858 | ||
857 | /* | ||
858 | * Call subsys's pre_destroy handler. | ||
859 | * This is called before css refcnt check. | ||
860 | */ | ||
861 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
862 | { | ||
863 | struct cgroup_subsys *ss; | ||
864 | int ret = 0; | ||
865 | |||
866 | for_each_subsys(cgrp->root, ss) { | ||
867 | if (!ss->pre_destroy) | ||
868 | continue; | ||
869 | |||
870 | ret = ss->pre_destroy(cgrp); | ||
871 | if (ret) { | ||
872 | /* ->pre_destroy() failure is being deprecated */ | ||
873 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
874 | break; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
882 | { | 860 | { |
883 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 861 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
898 | * Release the subsystem state objects. | 876 | * Release the subsystem state objects. |
899 | */ | 877 | */ |
900 | for_each_subsys(cgrp->root, ss) | 878 | for_each_subsys(cgrp->root, ss) |
901 | ss->destroy(cgrp); | 879 | ss->css_free(cgrp); |
902 | 880 | ||
903 | cgrp->root->number_of_cgroups--; | 881 | cgrp->root->number_of_cgroups--; |
904 | mutex_unlock(&cgroup_mutex); | 882 | mutex_unlock(&cgroup_mutex); |
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
917 | 895 | ||
918 | simple_xattrs_free(&cgrp->xattrs); | 896 | simple_xattrs_free(&cgrp->xattrs); |
919 | 897 | ||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
920 | kfree_rcu(cgrp, rcu_head); | 899 | kfree_rcu(cgrp, rcu_head); |
921 | } else { | 900 | } else { |
922 | struct cfent *cfe = __d_cfe(dentry); | 901 | struct cfent *cfe = __d_cfe(dentry); |
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
987 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 966 | if (!test_bit(ss->subsys_id, &subsys_mask)) |
988 | continue; | 967 | continue; |
989 | list_for_each_entry(set, &ss->cftsets, node) | 968 | list_for_each_entry(set, &ss->cftsets, node) |
990 | cgroup_rm_file(cgrp, set->cfts); | 969 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); |
991 | } | 970 | } |
992 | if (base_files) { | 971 | if (base_files) { |
993 | while (!list_empty(&cgrp->files)) | 972 | while (!list_empty(&cgrp->files)) |
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
1015 | } | 994 | } |
1016 | 995 | ||
1017 | /* | 996 | /* |
1018 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
1019 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
1020 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
1021 | * to zero, soon. | ||
1022 | * | ||
1023 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
1024 | */ | ||
1025 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1026 | |||
1027 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1028 | { | ||
1029 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1030 | wake_up_all(&cgroup_rmdir_waitq); | ||
1031 | } | ||
1032 | |||
1033 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1034 | { | ||
1035 | css_get(css); | ||
1036 | } | ||
1037 | |||
1038 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1039 | { | ||
1040 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1041 | css_put(css); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 997 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1046 | * any duplicate ones that parse_cgroupfs_options took. If this function | 998 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1047 | * returns an error, no reference counts are touched. | 999 | * returns an error, no reference counts are touched. |
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1150 | seq_puts(seq, ",xattr"); | 1102 | seq_puts(seq, ",xattr"); |
1151 | if (strlen(root->release_agent_path)) | 1103 | if (strlen(root->release_agent_path)) |
1152 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1104 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1153 | if (clone_children(&root->top_cgroup)) | 1105 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) |
1154 | seq_puts(seq, ",clone_children"); | 1106 | seq_puts(seq, ",clone_children"); |
1155 | if (strlen(root->name)) | 1107 | if (strlen(root->name)) |
1156 | seq_printf(seq, ",name=%s", root->name); | 1108 | seq_printf(seq, ",name=%s", root->name); |
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts { | |||
1162 | unsigned long subsys_mask; | 1114 | unsigned long subsys_mask; |
1163 | unsigned long flags; | 1115 | unsigned long flags; |
1164 | char *release_agent; | 1116 | char *release_agent; |
1165 | bool clone_children; | 1117 | bool cpuset_clone_children; |
1166 | char *name; | 1118 | char *name; |
1167 | /* User explicitly requested empty subsystem */ | 1119 | /* User explicitly requested empty subsystem */ |
1168 | bool none; | 1120 | bool none; |
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1213 | continue; | 1165 | continue; |
1214 | } | 1166 | } |
1215 | if (!strcmp(token, "clone_children")) { | 1167 | if (!strcmp(token, "clone_children")) { |
1216 | opts->clone_children = true; | 1168 | opts->cpuset_clone_children = true; |
1217 | continue; | 1169 | continue; |
1218 | } | 1170 | } |
1219 | if (!strcmp(token, "xattr")) { | 1171 | if (!strcmp(token, "xattr")) { |
@@ -1381,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1381 | if (ret) | 1333 | if (ret) |
1382 | goto out_unlock; | 1334 | goto out_unlock; |
1383 | 1335 | ||
1384 | /* See feature-removal-schedule.txt */ | ||
1385 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) | 1336 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) |
1386 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1337 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", |
1387 | task_tgid_nr(current), current->comm); | 1338 | task_tgid_nr(current), current->comm); |
@@ -1397,14 +1348,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1397 | goto out_unlock; | 1348 | goto out_unlock; |
1398 | } | 1349 | } |
1399 | 1350 | ||
1351 | /* | ||
1352 | * Clear out the files of subsystems that should be removed, do | ||
1353 | * this before rebind_subsystems, since rebind_subsystems may | ||
1354 | * change this hierarchy's subsys_list. | ||
1355 | */ | ||
1356 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1357 | |||
1400 | ret = rebind_subsystems(root, opts.subsys_mask); | 1358 | ret = rebind_subsystems(root, opts.subsys_mask); |
1401 | if (ret) { | 1359 | if (ret) { |
1360 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1361 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1402 | drop_parsed_module_refcounts(opts.subsys_mask); | 1362 | drop_parsed_module_refcounts(opts.subsys_mask); |
1403 | goto out_unlock; | 1363 | goto out_unlock; |
1404 | } | 1364 | } |
1405 | 1365 | ||
1406 | /* clear out any existing files and repopulate subsystem files */ | ||
1407 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1408 | /* re-populate subsystem files */ | 1366 | /* re-populate subsystem files */ |
1409 | cgroup_populate_dir(cgrp, false, added_mask); | 1367 | cgroup_populate_dir(cgrp, false, added_mask); |
1410 | 1368 | ||
@@ -1432,6 +1390,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1432 | INIT_LIST_HEAD(&cgrp->children); | 1390 | INIT_LIST_HEAD(&cgrp->children); |
1433 | INIT_LIST_HEAD(&cgrp->files); | 1391 | INIT_LIST_HEAD(&cgrp->files); |
1434 | INIT_LIST_HEAD(&cgrp->css_sets); | 1392 | INIT_LIST_HEAD(&cgrp->css_sets); |
1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | ||
1435 | INIT_LIST_HEAD(&cgrp->release_list); | 1394 | INIT_LIST_HEAD(&cgrp->release_list); |
1436 | INIT_LIST_HEAD(&cgrp->pidlists); | 1395 | INIT_LIST_HEAD(&cgrp->pidlists); |
1437 | mutex_init(&cgrp->pidlist_mutex); | 1396 | mutex_init(&cgrp->pidlist_mutex); |
@@ -1450,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1450 | root->number_of_cgroups = 1; | 1409 | root->number_of_cgroups = 1; |
1451 | cgrp->root = root; | 1410 | cgrp->root = root; |
1452 | cgrp->top_cgroup = cgrp; | 1411 | cgrp->top_cgroup = cgrp; |
1453 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1454 | init_cgroup_housekeeping(cgrp); | 1412 | init_cgroup_housekeeping(cgrp); |
1413 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1455 | } | 1414 | } |
1456 | 1415 | ||
1457 | static bool init_root_id(struct cgroupfs_root *root) | 1416 | static bool init_root_id(struct cgroupfs_root *root) |
@@ -1518,12 +1477,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1518 | 1477 | ||
1519 | root->subsys_mask = opts->subsys_mask; | 1478 | root->subsys_mask = opts->subsys_mask; |
1520 | root->flags = opts->flags; | 1479 | root->flags = opts->flags; |
1480 | ida_init(&root->cgroup_ida); | ||
1521 | if (opts->release_agent) | 1481 | if (opts->release_agent) |
1522 | strcpy(root->release_agent_path, opts->release_agent); | 1482 | strcpy(root->release_agent_path, opts->release_agent); |
1523 | if (opts->name) | 1483 | if (opts->name) |
1524 | strcpy(root->name, opts->name); | 1484 | strcpy(root->name, opts->name); |
1525 | if (opts->clone_children) | 1485 | if (opts->cpuset_clone_children) |
1526 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | 1486 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); |
1527 | return root; | 1487 | return root; |
1528 | } | 1488 | } |
1529 | 1489 | ||
@@ -1536,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) | |||
1536 | spin_lock(&hierarchy_id_lock); | 1496 | spin_lock(&hierarchy_id_lock); |
1537 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1497 | ida_remove(&hierarchy_ida, root->hierarchy_id); |
1538 | spin_unlock(&hierarchy_id_lock); | 1498 | spin_unlock(&hierarchy_id_lock); |
1499 | ida_destroy(&root->cgroup_ida); | ||
1539 | kfree(root); | 1500 | kfree(root); |
1540 | } | 1501 | } |
1541 | 1502 | ||
@@ -1701,7 +1662,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1701 | 1662 | ||
1702 | free_cg_links(&tmp_cg_links); | 1663 | free_cg_links(&tmp_cg_links); |
1703 | 1664 | ||
1704 | BUG_ON(!list_empty(&root_cgrp->sibling)); | ||
1705 | BUG_ON(!list_empty(&root_cgrp->children)); | 1665 | BUG_ON(!list_empty(&root_cgrp->children)); |
1706 | BUG_ON(root->number_of_cgroups != 1); | 1666 | BUG_ON(root->number_of_cgroups != 1); |
1707 | 1667 | ||
@@ -1750,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1750 | 1710 | ||
1751 | BUG_ON(root->number_of_cgroups != 1); | 1711 | BUG_ON(root->number_of_cgroups != 1); |
1752 | BUG_ON(!list_empty(&cgrp->children)); | 1712 | BUG_ON(!list_empty(&cgrp->children)); |
1753 | BUG_ON(!list_empty(&cgrp->sibling)); | ||
1754 | 1713 | ||
1755 | mutex_lock(&cgroup_mutex); | 1714 | mutex_lock(&cgroup_mutex); |
1756 | mutex_lock(&cgroup_root_mutex); | 1715 | mutex_lock(&cgroup_root_mutex); |
@@ -1808,9 +1767,11 @@ static struct kobject *cgroup_kobj; | |||
1808 | */ | 1767 | */ |
1809 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1768 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1810 | { | 1769 | { |
1770 | struct dentry *dentry = cgrp->dentry; | ||
1811 | char *start; | 1771 | char *start; |
1812 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1772 | |
1813 | cgroup_lock_is_held()); | 1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1774 | "cgroup_path() called without proper locking"); | ||
1814 | 1775 | ||
1815 | if (!dentry || cgrp == dummytop) { | 1776 | if (!dentry || cgrp == dummytop) { |
1816 | /* | 1777 | /* |
@@ -1821,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1821 | return 0; | 1782 | return 0; |
1822 | } | 1783 | } |
1823 | 1784 | ||
1824 | start = buf + buflen; | 1785 | start = buf + buflen - 1; |
1825 | 1786 | ||
1826 | *--start = '\0'; | 1787 | *start = '\0'; |
1827 | for (;;) { | 1788 | for (;;) { |
1828 | int len = dentry->d_name.len; | 1789 | int len = dentry->d_name.len; |
1829 | 1790 | ||
@@ -1834,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1834 | if (!cgrp) | 1795 | if (!cgrp) |
1835 | break; | 1796 | break; |
1836 | 1797 | ||
1837 | dentry = rcu_dereference_check(cgrp->dentry, | 1798 | dentry = cgrp->dentry; |
1838 | cgroup_lock_is_held()); | ||
1839 | if (!cgrp->parent) | 1799 | if (!cgrp->parent) |
1840 | continue; | 1800 | continue; |
1841 | if (--start < buf) | 1801 | if (--start < buf) |
@@ -1930,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1930 | /* | 1890 | /* |
1931 | * cgroup_task_migrate - move a task from one cgroup to another. | 1891 | * cgroup_task_migrate - move a task from one cgroup to another. |
1932 | * | 1892 | * |
1933 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1893 | * Must be called with cgroup_mutex and threadgroup locked. |
1934 | * will already exist. If not set, this function might sleep, and can fail with | ||
1935 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. | ||
1936 | */ | 1894 | */ |
1937 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1895 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1938 | struct task_struct *tsk, struct css_set *newcg) | 1896 | struct task_struct *tsk, struct css_set *newcg) |
@@ -2025,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
2025 | } | 1983 | } |
2026 | 1984 | ||
2027 | synchronize_rcu(); | 1985 | synchronize_rcu(); |
2028 | |||
2029 | /* | ||
2030 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
2031 | * is no longer empty. | ||
2032 | */ | ||
2033 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2034 | out: | 1986 | out: |
2035 | if (retval) { | 1987 | if (retval) { |
2036 | for_each_subsys(root, ss) { | 1988 | for_each_subsys(root, ss) { |
@@ -2200,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2200 | * step 5: success! and cleanup | 2152 | * step 5: success! and cleanup |
2201 | */ | 2153 | */ |
2202 | synchronize_rcu(); | 2154 | synchronize_rcu(); |
2203 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2204 | retval = 0; | 2155 | retval = 0; |
2205 | out_put_css_set_refs: | 2156 | out_put_css_set_refs: |
2206 | if (retval) { | 2157 | if (retval) { |
@@ -2711,10 +2662,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2711 | 2662 | ||
2712 | /* start off with i_nlink == 2 (for "." entry) */ | 2663 | /* start off with i_nlink == 2 (for "." entry) */ |
2713 | inc_nlink(inode); | 2664 | inc_nlink(inode); |
2665 | inc_nlink(dentry->d_parent->d_inode); | ||
2714 | 2666 | ||
2715 | /* start with the directory inode held, so that we can | 2667 | /* |
2716 | * populate it without racing with another mkdir */ | 2668 | * Control reaches here with cgroup_mutex held. |
2717 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 2669 | * @inode->i_mutex should nest outside cgroup_mutex but we |
2670 | * want to populate it immediately without releasing | ||
2671 | * cgroup_mutex. As @inode isn't visible to anyone else | ||
2672 | * yet, trylock will always succeed without affecting | ||
2673 | * lockdep checks. | ||
2674 | */ | ||
2675 | WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); | ||
2718 | } else if (S_ISREG(mode)) { | 2676 | } else if (S_ISREG(mode)) { |
2719 | inode->i_size = 0; | 2677 | inode->i_size = 0; |
2720 | inode->i_fop = &cgroup_file_operations; | 2678 | inode->i_fop = &cgroup_file_operations; |
@@ -2725,32 +2683,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2725 | return 0; | 2683 | return 0; |
2726 | } | 2684 | } |
2727 | 2685 | ||
2728 | /* | ||
2729 | * cgroup_create_dir - create a directory for an object. | ||
2730 | * @cgrp: the cgroup we create the directory for. It must have a valid | ||
2731 | * ->parent field. And we are going to fill its ->dentry field. | ||
2732 | * @dentry: dentry of the new cgroup | ||
2733 | * @mode: mode to set on new directory. | ||
2734 | */ | ||
2735 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | ||
2736 | umode_t mode) | ||
2737 | { | ||
2738 | struct dentry *parent; | ||
2739 | int error = 0; | ||
2740 | |||
2741 | parent = cgrp->parent->dentry; | ||
2742 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); | ||
2743 | if (!error) { | ||
2744 | dentry->d_fsdata = cgrp; | ||
2745 | inc_nlink(parent->d_inode); | ||
2746 | rcu_assign_pointer(cgrp->dentry, dentry); | ||
2747 | dget(dentry); | ||
2748 | } | ||
2749 | dput(dentry); | ||
2750 | |||
2751 | return error; | ||
2752 | } | ||
2753 | |||
2754 | /** | 2686 | /** |
2755 | * cgroup_file_mode - deduce file mode of a control file | 2687 | * cgroup_file_mode - deduce file mode of a control file |
2756 | * @cft: the control file in question | 2688 | * @cft: the control file in question |
@@ -2791,12 +2723,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2791 | 2723 | ||
2792 | simple_xattrs_init(&cft->xattrs); | 2724 | simple_xattrs_init(&cft->xattrs); |
2793 | 2725 | ||
2794 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2795 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2796 | return 0; | ||
2797 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2798 | return 0; | ||
2799 | |||
2800 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2726 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2801 | strcpy(name, subsys->name); | 2727 | strcpy(name, subsys->name); |
2802 | strcat(name, "."); | 2728 | strcat(name, "."); |
@@ -2837,6 +2763,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2837 | int err, ret = 0; | 2763 | int err, ret = 0; |
2838 | 2764 | ||
2839 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2765 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2766 | /* does cft->flags tell us to skip this file on @cgrp? */ | ||
2767 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2768 | continue; | ||
2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2770 | continue; | ||
2771 | |||
2840 | if (is_add) | 2772 | if (is_add) |
2841 | err = cgroup_add_file(cgrp, subsys, cft); | 2773 | err = cgroup_add_file(cgrp, subsys, cft); |
2842 | else | 2774 | else |
@@ -3044,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void) | |||
3044 | write_unlock(&css_set_lock); | 2976 | write_unlock(&css_set_lock); |
3045 | } | 2977 | } |
3046 | 2978 | ||
2979 | /** | ||
2980 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | ||
2981 | * @pos: the current position (%NULL to initiate traversal) | ||
2982 | * @cgroup: cgroup whose descendants to walk | ||
2983 | * | ||
2984 | * To be used by cgroup_for_each_descendant_pre(). Find the next | ||
2985 | * descendant to visit for pre-order traversal of @cgroup's descendants. | ||
2986 | */ | ||
2987 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | ||
2988 | struct cgroup *cgroup) | ||
2989 | { | ||
2990 | struct cgroup *next; | ||
2991 | |||
2992 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
2993 | |||
2994 | /* if first iteration, pretend we just visited @cgroup */ | ||
2995 | if (!pos) { | ||
2996 | if (list_empty(&cgroup->children)) | ||
2997 | return NULL; | ||
2998 | pos = cgroup; | ||
2999 | } | ||
3000 | |||
3001 | /* visit the first child if exists */ | ||
3002 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | ||
3003 | if (next) | ||
3004 | return next; | ||
3005 | |||
3006 | /* no child, visit my or the closest ancestor's next sibling */ | ||
3007 | do { | ||
3008 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | ||
3009 | sibling); | ||
3010 | if (&next->sibling != &pos->parent->children) | ||
3011 | return next; | ||
3012 | |||
3013 | pos = pos->parent; | ||
3014 | } while (pos != cgroup); | ||
3015 | |||
3016 | return NULL; | ||
3017 | } | ||
3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | ||
3019 | |||
3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | ||
3021 | { | ||
3022 | struct cgroup *last; | ||
3023 | |||
3024 | do { | ||
3025 | last = pos; | ||
3026 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | ||
3027 | sibling); | ||
3028 | } while (pos); | ||
3029 | |||
3030 | return last; | ||
3031 | } | ||
3032 | |||
3033 | /** | ||
3034 | * cgroup_next_descendant_post - find the next descendant for post-order walk | ||
3035 | * @pos: the current position (%NULL to initiate traversal) | ||
3036 | * @cgroup: cgroup whose descendants to walk | ||
3037 | * | ||
3038 | * To be used by cgroup_for_each_descendant_post(). Find the next | ||
3039 | * descendant to visit for post-order traversal of @cgroup's descendants. | ||
3040 | */ | ||
3041 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | ||
3042 | struct cgroup *cgroup) | ||
3043 | { | ||
3044 | struct cgroup *next; | ||
3045 | |||
3046 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3047 | |||
3048 | /* if first iteration, visit the leftmost descendant */ | ||
3049 | if (!pos) { | ||
3050 | next = cgroup_leftmost_descendant(cgroup); | ||
3051 | return next != cgroup ? next : NULL; | ||
3052 | } | ||
3053 | |||
3054 | /* if there's an unvisited sibling, visit its leftmost descendant */ | ||
3055 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3056 | if (&next->sibling != &pos->parent->children) | ||
3057 | return cgroup_leftmost_descendant(next); | ||
3058 | |||
3059 | /* no sibling left, visit parent */ | ||
3060 | next = pos->parent; | ||
3061 | return next != cgroup ? next : NULL; | ||
3062 | } | ||
3063 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3064 | |||
3047 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3065 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
3048 | __acquires(css_set_lock) | 3066 | __acquires(css_set_lock) |
3049 | { | 3067 | { |
@@ -3390,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3390 | { | 3408 | { |
3391 | struct cgroup_pidlist *l; | 3409 | struct cgroup_pidlist *l; |
3392 | /* don't need task_nsproxy() if we're looking at ourself */ | 3410 | /* don't need task_nsproxy() if we're looking at ourself */ |
3393 | struct pid_namespace *ns = current->nsproxy->pid_ns; | 3411 | struct pid_namespace *ns = task_active_pid_ns(current); |
3394 | 3412 | ||
3395 | /* | 3413 | /* |
3396 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3414 | * We can't drop the pidlist_mutex before taking the l->mutex in case |
@@ -3757,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3757 | if (flags & POLLHUP) { | 3775 | if (flags & POLLHUP) { |
3758 | __remove_wait_queue(event->wqh, &event->wait); | 3776 | __remove_wait_queue(event->wqh, &event->wait); |
3759 | spin_lock(&cgrp->event_list_lock); | 3777 | spin_lock(&cgrp->event_list_lock); |
3760 | list_del(&event->list); | 3778 | list_del_init(&event->list); |
3761 | spin_unlock(&cgrp->event_list_lock); | 3779 | spin_unlock(&cgrp->event_list_lock); |
3762 | /* | 3780 | /* |
3763 | * We are in atomic context, but cgroup_event_remove() may | 3781 | * We are in atomic context, but cgroup_event_remove() may |
@@ -3894,7 +3912,7 @@ fail: | |||
3894 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 3912 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, |
3895 | struct cftype *cft) | 3913 | struct cftype *cft) |
3896 | { | 3914 | { |
3897 | return clone_children(cgrp); | 3915 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3898 | } | 3916 | } |
3899 | 3917 | ||
3900 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 3918 | static int cgroup_clone_children_write(struct cgroup *cgrp, |
@@ -3902,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3902 | u64 val) | 3920 | u64 val) |
3903 | { | 3921 | { |
3904 | if (val) | 3922 | if (val) |
3905 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3923 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3906 | else | 3924 | else |
3907 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3925 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3908 | return 0; | 3926 | return 0; |
3909 | } | 3927 | } |
3910 | 3928 | ||
@@ -4017,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4017 | css->flags = 0; | 4035 | css->flags = 0; |
4018 | css->id = NULL; | 4036 | css->id = NULL; |
4019 | if (cgrp == dummytop) | 4037 | if (cgrp == dummytop) |
4020 | set_bit(CSS_ROOT, &css->flags); | 4038 | css->flags |= CSS_ROOT; |
4021 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4039 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
4022 | cgrp->subsys[ss->subsys_id] = css; | 4040 | cgrp->subsys[ss->subsys_id] = css; |
4023 | 4041 | ||
4024 | /* | 4042 | /* |
4025 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 4043 | * css holds an extra ref to @cgrp->dentry which is put on the last |
4026 | * which is put on the last css_put(). dput() requires process | 4044 | * css_put(). dput() requires process context, which css_put() may |
4027 | * context, which css_put() may be called without. @css->dput_work | 4045 | * be called without. @css->dput_work will be used to invoke |
4028 | * will be used to invoke dput() asynchronously from css_put(). | 4046 | * dput() asynchronously from css_put(). |
4029 | */ | 4047 | */ |
4030 | INIT_WORK(&css->dput_work, css_dput_fn); | 4048 | INIT_WORK(&css->dput_work, css_dput_fn); |
4031 | if (ss->__DEPRECATED_clear_css_refs) | 4049 | } |
4032 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | 4050 | |
4051 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | ||
4052 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4053 | { | ||
4054 | int ret = 0; | ||
4055 | |||
4056 | lockdep_assert_held(&cgroup_mutex); | ||
4057 | |||
4058 | if (ss->css_online) | ||
4059 | ret = ss->css_online(cgrp); | ||
4060 | if (!ret) | ||
4061 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | ||
4062 | return ret; | ||
4063 | } | ||
4064 | |||
4065 | /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ | ||
4066 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4067 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4068 | { | ||
4069 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4070 | |||
4071 | lockdep_assert_held(&cgroup_mutex); | ||
4072 | |||
4073 | if (!(css->flags & CSS_ONLINE)) | ||
4074 | return; | ||
4075 | |||
4076 | /* | ||
4077 | * css_offline() should be called with cgroup_mutex unlocked. See | ||
4078 | * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for | ||
4079 | * details. This temporary unlocking should go away once | ||
4080 | * cgroup_mutex is unexported from controllers. | ||
4081 | */ | ||
4082 | if (ss->css_offline) { | ||
4083 | mutex_unlock(&cgroup_mutex); | ||
4084 | ss->css_offline(cgrp); | ||
4085 | mutex_lock(&cgroup_mutex); | ||
4086 | } | ||
4087 | |||
4088 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | ||
4033 | } | 4089 | } |
4034 | 4090 | ||
4035 | /* | 4091 | /* |
@@ -4049,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4049 | struct cgroup_subsys *ss; | 4105 | struct cgroup_subsys *ss; |
4050 | struct super_block *sb = root->sb; | 4106 | struct super_block *sb = root->sb; |
4051 | 4107 | ||
4108 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | ||
4052 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4109 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
4053 | if (!cgrp) | 4110 | if (!cgrp) |
4054 | return -ENOMEM; | 4111 | return -ENOMEM; |
4055 | 4112 | ||
4113 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | ||
4114 | if (cgrp->id < 0) | ||
4115 | goto err_free_cgrp; | ||
4116 | |||
4117 | /* | ||
4118 | * Only live parents can have children. Note that the liveliness | ||
4119 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4120 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4121 | * anyway so that locking is contained inside cgroup proper and we | ||
4122 | * don't get nasty surprises if we ever grow another caller. | ||
4123 | */ | ||
4124 | if (!cgroup_lock_live_group(parent)) { | ||
4125 | err = -ENODEV; | ||
4126 | goto err_free_id; | ||
4127 | } | ||
4128 | |||
4056 | /* Grab a reference on the superblock so the hierarchy doesn't | 4129 | /* Grab a reference on the superblock so the hierarchy doesn't |
4057 | * get deleted on unmount if there are child cgroups. This | 4130 | * get deleted on unmount if there are child cgroups. This |
4058 | * can be done outside cgroup_mutex, since the sb can't | 4131 | * can be done outside cgroup_mutex, since the sb can't |
@@ -4060,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4060 | * fs */ | 4133 | * fs */ |
4061 | atomic_inc(&sb->s_active); | 4134 | atomic_inc(&sb->s_active); |
4062 | 4135 | ||
4063 | mutex_lock(&cgroup_mutex); | ||
4064 | |||
4065 | init_cgroup_housekeeping(cgrp); | 4136 | init_cgroup_housekeeping(cgrp); |
4066 | 4137 | ||
4067 | cgrp->parent = parent; | 4138 | cgrp->parent = parent; |
@@ -4071,26 +4142,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4071 | if (notify_on_release(parent)) | 4142 | if (notify_on_release(parent)) |
4072 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4143 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
4073 | 4144 | ||
4074 | if (clone_children(parent)) | 4145 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4075 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 4146 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4076 | 4147 | ||
4077 | for_each_subsys(root, ss) { | 4148 | for_each_subsys(root, ss) { |
4078 | struct cgroup_subsys_state *css; | 4149 | struct cgroup_subsys_state *css; |
4079 | 4150 | ||
4080 | css = ss->create(cgrp); | 4151 | css = ss->css_alloc(cgrp); |
4081 | if (IS_ERR(css)) { | 4152 | if (IS_ERR(css)) { |
4082 | err = PTR_ERR(css); | 4153 | err = PTR_ERR(css); |
4083 | goto err_destroy; | 4154 | goto err_free_all; |
4084 | } | 4155 | } |
4085 | init_cgroup_css(css, ss, cgrp); | 4156 | init_cgroup_css(css, ss, cgrp); |
4086 | if (ss->use_id) { | 4157 | if (ss->use_id) { |
4087 | err = alloc_css_id(ss, parent, cgrp); | 4158 | err = alloc_css_id(ss, parent, cgrp); |
4088 | if (err) | 4159 | if (err) |
4089 | goto err_destroy; | 4160 | goto err_free_all; |
4090 | } | 4161 | } |
4091 | /* At error, ->destroy() callback has to free assigned ID. */ | 4162 | } |
4092 | if (clone_children(parent) && ss->post_clone) | 4163 | |
4093 | ss->post_clone(cgrp); | 4164 | /* |
4165 | * Create directory. cgroup_create_file() returns with the new | ||
4166 | * directory locked on success so that it can be populated without | ||
4167 | * dropping cgroup_mutex. | ||
4168 | */ | ||
4169 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | ||
4170 | if (err < 0) | ||
4171 | goto err_free_all; | ||
4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | ||
4173 | |||
4174 | /* allocation complete, commit to creation */ | ||
4175 | dentry->d_fsdata = cgrp; | ||
4176 | cgrp->dentry = dentry; | ||
4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | ||
4179 | root->number_of_cgroups++; | ||
4180 | |||
4181 | /* each css holds a ref to the cgroup's dentry */ | ||
4182 | for_each_subsys(root, ss) | ||
4183 | dget(dentry); | ||
4184 | |||
4185 | /* creation succeeded, notify subsystems */ | ||
4186 | for_each_subsys(root, ss) { | ||
4187 | err = online_css(ss, cgrp); | ||
4188 | if (err) | ||
4189 | goto err_destroy; | ||
4094 | 4190 | ||
4095 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4191 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4096 | parent->parent) { | 4192 | parent->parent) { |
@@ -4102,50 +4198,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4102 | } | 4198 | } |
4103 | } | 4199 | } |
4104 | 4200 | ||
4105 | list_add(&cgrp->sibling, &cgrp->parent->children); | ||
4106 | root->number_of_cgroups++; | ||
4107 | |||
4108 | err = cgroup_create_dir(cgrp, dentry, mode); | ||
4109 | if (err < 0) | ||
4110 | goto err_remove; | ||
4111 | |||
4112 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4113 | for_each_subsys(root, ss) | ||
4114 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4115 | dget(dentry); | ||
4116 | |||
4117 | /* The cgroup directory was pre-locked for us */ | ||
4118 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | ||
4119 | |||
4120 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4121 | |||
4122 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4201 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); |
4123 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4202 | if (err) |
4203 | goto err_destroy; | ||
4124 | 4204 | ||
4125 | mutex_unlock(&cgroup_mutex); | 4205 | mutex_unlock(&cgroup_mutex); |
4126 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4206 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
4127 | 4207 | ||
4128 | return 0; | 4208 | return 0; |
4129 | 4209 | ||
4130 | err_remove: | 4210 | err_free_all: |
4131 | |||
4132 | list_del(&cgrp->sibling); | ||
4133 | root->number_of_cgroups--; | ||
4134 | |||
4135 | err_destroy: | ||
4136 | |||
4137 | for_each_subsys(root, ss) { | 4211 | for_each_subsys(root, ss) { |
4138 | if (cgrp->subsys[ss->subsys_id]) | 4212 | if (cgrp->subsys[ss->subsys_id]) |
4139 | ss->destroy(cgrp); | 4213 | ss->css_free(cgrp); |
4140 | } | 4214 | } |
4141 | |||
4142 | mutex_unlock(&cgroup_mutex); | 4215 | mutex_unlock(&cgroup_mutex); |
4143 | |||
4144 | /* Release the reference count that we took on the superblock */ | 4216 | /* Release the reference count that we took on the superblock */ |
4145 | deactivate_super(sb); | 4217 | deactivate_super(sb); |
4146 | 4218 | err_free_id: | |
4219 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | ||
4220 | err_free_cgrp: | ||
4147 | kfree(cgrp); | 4221 | kfree(cgrp); |
4148 | return err; | 4222 | return err; |
4223 | |||
4224 | err_destroy: | ||
4225 | cgroup_destroy_locked(cgrp); | ||
4226 | mutex_unlock(&cgroup_mutex); | ||
4227 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
4228 | return err; | ||
4149 | } | 4229 | } |
4150 | 4230 | ||
4151 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 4231 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
@@ -4197,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4197 | return 0; | 4277 | return 0; |
4198 | } | 4278 | } |
4199 | 4279 | ||
4200 | /* | 4280 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4201 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4281 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4202 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4203 | * busy subsystems. Call with cgroup_mutex held | ||
4204 | * | ||
4205 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4206 | * not, cgroup removal behaves differently. | ||
4207 | * | ||
4208 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4209 | * cgroup removal can be committed. This is implemented by | ||
4210 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4211 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4212 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4213 | * removed as soon as the existing user (memcg) is updated. | ||
4214 | * | ||
4215 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4216 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4217 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4218 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4219 | * is put so that dentry destruction happens only after all css's are | ||
4220 | * released. | ||
4221 | */ | ||
4222 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4223 | { | 4282 | { |
4283 | struct dentry *d = cgrp->dentry; | ||
4284 | struct cgroup *parent = cgrp->parent; | ||
4285 | DEFINE_WAIT(wait); | ||
4286 | struct cgroup_event *event, *tmp; | ||
4224 | struct cgroup_subsys *ss; | 4287 | struct cgroup_subsys *ss; |
4225 | unsigned long flags; | 4288 | LIST_HEAD(tmp_list); |
4226 | bool failed = false; | 4289 | |
4290 | lockdep_assert_held(&d->d_inode->i_mutex); | ||
4291 | lockdep_assert_held(&cgroup_mutex); | ||
4227 | 4292 | ||
4228 | local_irq_save(flags); | 4293 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) |
4294 | return -EBUSY; | ||
4229 | 4295 | ||
4230 | /* | 4296 | /* |
4231 | * Block new css_tryget() by deactivating refcnt. If all refcnts | 4297 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4232 | * for subsystems w/ clear_css_refs set were 1 at the moment of | 4298 | * removed. This makes future css_tryget() and child creation |
4233 | * deactivation, we succeeded. | 4299 | * attempts fail thus maintaining the removal conditions verified |
4300 | * above. | ||
4234 | */ | 4301 | */ |
4235 | for_each_subsys(cgrp->root, ss) { | 4302 | for_each_subsys(cgrp->root, ss) { |
4236 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4303 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4237 | 4304 | ||
4238 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4305 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4239 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4306 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4240 | |||
4241 | if (ss->__DEPRECATED_clear_css_refs) | ||
4242 | failed |= css_refcnt(css) != 1; | ||
4243 | } | ||
4244 | |||
4245 | /* | ||
4246 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4247 | * restore refcnts to positive values. Either way, all in-progress | ||
4248 | * css_tryget() will be released. | ||
4249 | */ | ||
4250 | for_each_subsys(cgrp->root, ss) { | ||
4251 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4252 | |||
4253 | if (!failed) { | ||
4254 | set_bit(CSS_REMOVED, &css->flags); | ||
4255 | css_put(css); | ||
4256 | } else { | ||
4257 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4258 | } | ||
4259 | } | 4307 | } |
4308 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4260 | 4309 | ||
4261 | local_irq_restore(flags); | 4310 | /* tell subsystems to initate destruction */ |
4262 | return !failed; | 4311 | for_each_subsys(cgrp->root, ss) |
4263 | } | 4312 | offline_css(ss, cgrp); |
4264 | |||
4265 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4266 | { | ||
4267 | struct cgroup *cgrp = dentry->d_fsdata; | ||
4268 | struct dentry *d; | ||
4269 | struct cgroup *parent; | ||
4270 | DEFINE_WAIT(wait); | ||
4271 | struct cgroup_event *event, *tmp; | ||
4272 | int ret; | ||
4273 | |||
4274 | /* the vfs holds both inode->i_mutex already */ | ||
4275 | again: | ||
4276 | mutex_lock(&cgroup_mutex); | ||
4277 | if (atomic_read(&cgrp->count) != 0) { | ||
4278 | mutex_unlock(&cgroup_mutex); | ||
4279 | return -EBUSY; | ||
4280 | } | ||
4281 | if (!list_empty(&cgrp->children)) { | ||
4282 | mutex_unlock(&cgroup_mutex); | ||
4283 | return -EBUSY; | ||
4284 | } | ||
4285 | mutex_unlock(&cgroup_mutex); | ||
4286 | |||
4287 | /* | ||
4288 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
4289 | * in racy cases, subsystem may have to get css->refcnt after | ||
4290 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
4291 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
4292 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
4293 | * and subsystem's reference count handling. Please see css_get/put | ||
4294 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4295 | */ | ||
4296 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4297 | 4313 | ||
4298 | /* | 4314 | /* |
4299 | * Call pre_destroy handlers of subsys. Notify subsystems | 4315 | * Put all the base refs. Each css holds an extra reference to the |
4300 | * that rmdir() request comes. | 4316 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4317 | * refs. On the last put of each css, whenever that may be, the | ||
4318 | * extra dentry ref is put so that dentry destruction happens only | ||
4319 | * after all css's are released. | ||
4301 | */ | 4320 | */ |
4302 | ret = cgroup_call_pre_destroy(cgrp); | 4321 | for_each_subsys(cgrp->root, ss) |
4303 | if (ret) { | 4322 | css_put(cgrp->subsys[ss->subsys_id]); |
4304 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4305 | return ret; | ||
4306 | } | ||
4307 | |||
4308 | mutex_lock(&cgroup_mutex); | ||
4309 | parent = cgrp->parent; | ||
4310 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | ||
4311 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4312 | mutex_unlock(&cgroup_mutex); | ||
4313 | return -EBUSY; | ||
4314 | } | ||
4315 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | ||
4316 | if (!cgroup_clear_css_refs(cgrp)) { | ||
4317 | mutex_unlock(&cgroup_mutex); | ||
4318 | /* | ||
4319 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4320 | * prepare_to_wait(), we need to check this flag. | ||
4321 | */ | ||
4322 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4323 | schedule(); | ||
4324 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4325 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4326 | if (signal_pending(current)) | ||
4327 | return -EINTR; | ||
4328 | goto again; | ||
4329 | } | ||
4330 | /* NO css_tryget() can success after here. */ | ||
4331 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4332 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4333 | 4323 | ||
4334 | raw_spin_lock(&release_list_lock); | 4324 | raw_spin_lock(&release_list_lock); |
4335 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4336 | if (!list_empty(&cgrp->release_list)) | 4325 | if (!list_empty(&cgrp->release_list)) |
4337 | list_del_init(&cgrp->release_list); | 4326 | list_del_init(&cgrp->release_list); |
4338 | raw_spin_unlock(&release_list_lock); | 4327 | raw_spin_unlock(&release_list_lock); |
4339 | 4328 | ||
4340 | /* delete this cgroup from parent->children */ | 4329 | /* delete this cgroup from parent->children */ |
4341 | list_del_init(&cgrp->sibling); | 4330 | list_del_rcu(&cgrp->sibling); |
4342 | |||
4343 | list_del_init(&cgrp->allcg_node); | 4331 | list_del_init(&cgrp->allcg_node); |
4344 | 4332 | ||
4345 | d = dget(cgrp->dentry); | 4333 | dget(d); |
4346 | |||
4347 | cgroup_d_remove_dir(d); | 4334 | cgroup_d_remove_dir(d); |
4348 | dput(d); | 4335 | dput(d); |
4349 | 4336 | ||
@@ -4353,21 +4340,35 @@ again: | |||
4353 | /* | 4340 | /* |
4354 | * Unregister events and notify userspace. | 4341 | * Unregister events and notify userspace. |
4355 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4342 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4356 | * directory to avoid race between userspace and kernelspace | 4343 | * directory to avoid race between userspace and kernelspace. Use |
4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4357 | */ | 4347 | */ |
4358 | spin_lock(&cgrp->event_list_lock); | 4348 | spin_lock(&cgrp->event_list_lock); |
4359 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | 4349 | list_splice_init(&cgrp->event_list, &tmp_list); |
4360 | list_del(&event->list); | 4350 | spin_unlock(&cgrp->event_list_lock); |
4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4352 | list_del_init(&event->list); | ||
4361 | remove_wait_queue(event->wqh, &event->wait); | 4353 | remove_wait_queue(event->wqh, &event->wait); |
4362 | eventfd_signal(event->eventfd, 1); | 4354 | eventfd_signal(event->eventfd, 1); |
4363 | schedule_work(&event->remove); | 4355 | schedule_work(&event->remove); |
4364 | } | 4356 | } |
4365 | spin_unlock(&cgrp->event_list_lock); | ||
4366 | 4357 | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | return 0; | 4358 | return 0; |
4369 | } | 4359 | } |
4370 | 4360 | ||
4361 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4362 | { | ||
4363 | int ret; | ||
4364 | |||
4365 | mutex_lock(&cgroup_mutex); | ||
4366 | ret = cgroup_destroy_locked(dentry->d_fsdata); | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | |||
4369 | return ret; | ||
4370 | } | ||
4371 | |||
4371 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | 4372 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) |
4372 | { | 4373 | { |
4373 | INIT_LIST_HEAD(&ss->cftsets); | 4374 | INIT_LIST_HEAD(&ss->cftsets); |
@@ -4388,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4388 | 4389 | ||
4389 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4390 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4390 | 4391 | ||
4392 | mutex_lock(&cgroup_mutex); | ||
4393 | |||
4391 | /* init base cftset */ | 4394 | /* init base cftset */ |
4392 | cgroup_init_cftsets(ss); | 4395 | cgroup_init_cftsets(ss); |
4393 | 4396 | ||
4394 | /* Create the top cgroup state for this subsystem */ | 4397 | /* Create the top cgroup state for this subsystem */ |
4395 | list_add(&ss->sibling, &rootnode.subsys_list); | 4398 | list_add(&ss->sibling, &rootnode.subsys_list); |
4396 | ss->root = &rootnode; | 4399 | ss->root = &rootnode; |
4397 | css = ss->create(dummytop); | 4400 | css = ss->css_alloc(dummytop); |
4398 | /* We don't handle early failures gracefully */ | 4401 | /* We don't handle early failures gracefully */ |
4399 | BUG_ON(IS_ERR(css)); | 4402 | BUG_ON(IS_ERR(css)); |
4400 | init_cgroup_css(css, ss, dummytop); | 4403 | init_cgroup_css(css, ss, dummytop); |
@@ -4403,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4403 | * pointer to this state - since the subsystem is | 4406 | * pointer to this state - since the subsystem is |
4404 | * newly registered, all tasks and hence the | 4407 | * newly registered, all tasks and hence the |
4405 | * init_css_set is in the subsystem's top cgroup. */ | 4408 | * init_css_set is in the subsystem's top cgroup. */ |
4406 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4409 | init_css_set.subsys[ss->subsys_id] = css; |
4407 | 4410 | ||
4408 | need_forkexit_callback |= ss->fork || ss->exit; | 4411 | need_forkexit_callback |= ss->fork || ss->exit; |
4409 | 4412 | ||
@@ -4413,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4413 | BUG_ON(!list_empty(&init_task.tasks)); | 4416 | BUG_ON(!list_empty(&init_task.tasks)); |
4414 | 4417 | ||
4415 | ss->active = 1; | 4418 | ss->active = 1; |
4419 | BUG_ON(online_css(ss, dummytop)); | ||
4420 | |||
4421 | mutex_unlock(&cgroup_mutex); | ||
4416 | 4422 | ||
4417 | /* this function shouldn't be used with modular subsystems, since they | 4423 | /* this function shouldn't be used with modular subsystems, since they |
4418 | * need to register a subsys_id, among other things */ | 4424 | * need to register a subsys_id, among other things */ |
@@ -4430,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4430 | */ | 4436 | */ |
4431 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4437 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4432 | { | 4438 | { |
4433 | int i; | ||
4434 | struct cgroup_subsys_state *css; | 4439 | struct cgroup_subsys_state *css; |
4440 | int i, ret; | ||
4435 | 4441 | ||
4436 | /* check name and function validity */ | 4442 | /* check name and function validity */ |
4437 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4438 | ss->create == NULL || ss->destroy == NULL) | 4444 | ss->css_alloc == NULL || ss->css_free == NULL) |
4439 | return -EINVAL; | 4445 | return -EINVAL; |
4440 | 4446 | ||
4441 | /* | 4447 | /* |
@@ -4464,10 +4470,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4464 | subsys[ss->subsys_id] = ss; | 4470 | subsys[ss->subsys_id] = ss; |
4465 | 4471 | ||
4466 | /* | 4472 | /* |
4467 | * no ss->create seems to need anything important in the ss struct, so | 4473 | * no ss->css_alloc seems to need anything important in the ss |
4468 | * this can happen first (i.e. before the rootnode attachment). | 4474 | * struct, so this can happen first (i.e. before the rootnode |
4475 | * attachment). | ||
4469 | */ | 4476 | */ |
4470 | css = ss->create(dummytop); | 4477 | css = ss->css_alloc(dummytop); |
4471 | if (IS_ERR(css)) { | 4478 | if (IS_ERR(css)) { |
4472 | /* failure case - need to deassign the subsys[] slot. */ | 4479 | /* failure case - need to deassign the subsys[] slot. */ |
4473 | subsys[ss->subsys_id] = NULL; | 4480 | subsys[ss->subsys_id] = NULL; |
@@ -4482,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4482 | init_cgroup_css(css, ss, dummytop); | 4489 | init_cgroup_css(css, ss, dummytop); |
4483 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4490 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4484 | if (ss->use_id) { | 4491 | if (ss->use_id) { |
4485 | int ret = cgroup_init_idr(ss, css); | 4492 | ret = cgroup_init_idr(ss, css); |
4486 | if (ret) { | 4493 | if (ret) |
4487 | dummytop->subsys[ss->subsys_id] = NULL; | 4494 | goto err_unload; |
4488 | ss->destroy(dummytop); | ||
4489 | subsys[ss->subsys_id] = NULL; | ||
4490 | mutex_unlock(&cgroup_mutex); | ||
4491 | return ret; | ||
4492 | } | ||
4493 | } | 4495 | } |
4494 | 4496 | ||
4495 | /* | 4497 | /* |
@@ -4522,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4522 | write_unlock(&css_set_lock); | 4524 | write_unlock(&css_set_lock); |
4523 | 4525 | ||
4524 | ss->active = 1; | 4526 | ss->active = 1; |
4527 | ret = online_css(ss, dummytop); | ||
4528 | if (ret) | ||
4529 | goto err_unload; | ||
4525 | 4530 | ||
4526 | /* success! */ | 4531 | /* success! */ |
4527 | mutex_unlock(&cgroup_mutex); | 4532 | mutex_unlock(&cgroup_mutex); |
4528 | return 0; | 4533 | return 0; |
4534 | |||
4535 | err_unload: | ||
4536 | mutex_unlock(&cgroup_mutex); | ||
4537 | /* @ss can't be mounted here as try_module_get() would fail */ | ||
4538 | cgroup_unload_subsys(ss); | ||
4539 | return ret; | ||
4529 | } | 4540 | } |
4530 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4541 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4531 | 4542 | ||
@@ -4552,6 +4563,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4552 | BUG_ON(ss->root != &rootnode); | 4563 | BUG_ON(ss->root != &rootnode); |
4553 | 4564 | ||
4554 | mutex_lock(&cgroup_mutex); | 4565 | mutex_lock(&cgroup_mutex); |
4566 | |||
4567 | offline_css(ss, dummytop); | ||
4568 | ss->active = 0; | ||
4569 | |||
4570 | if (ss->use_id) { | ||
4571 | idr_remove_all(&ss->idr); | ||
4572 | idr_destroy(&ss->idr); | ||
4573 | } | ||
4574 | |||
4555 | /* deassign the subsys_id */ | 4575 | /* deassign the subsys_id */ |
4556 | subsys[ss->subsys_id] = NULL; | 4576 | subsys[ss->subsys_id] = NULL; |
4557 | 4577 | ||
@@ -4567,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | struct css_set *cg = link->cg; | 4587 | struct css_set *cg = link->cg; |
4568 | 4588 | ||
4569 | hlist_del(&cg->hlist); | 4589 | hlist_del(&cg->hlist); |
4570 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
4571 | cg->subsys[ss->subsys_id] = NULL; | 4590 | cg->subsys[ss->subsys_id] = NULL; |
4572 | hhead = css_set_hash(cg->subsys); | 4591 | hhead = css_set_hash(cg->subsys); |
4573 | hlist_add_head(&cg->hlist, hhead); | 4592 | hlist_add_head(&cg->hlist, hhead); |
@@ -4575,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4575 | write_unlock(&css_set_lock); | 4594 | write_unlock(&css_set_lock); |
4576 | 4595 | ||
4577 | /* | 4596 | /* |
4578 | * remove subsystem's css from the dummytop and free it - need to free | 4597 | * remove subsystem's css from the dummytop and free it - need to |
4579 | * before marking as null because ss->destroy needs the cgrp->subsys | 4598 | * free before marking as null because ss->css_free needs the |
4580 | * pointer to find their state. note that this also takes care of | 4599 | * cgrp->subsys pointer to find their state. note that this also |
4581 | * freeing the css_id. | 4600 | * takes care of freeing the css_id. |
4582 | */ | 4601 | */ |
4583 | ss->destroy(dummytop); | 4602 | ss->css_free(dummytop); |
4584 | dummytop->subsys[ss->subsys_id] = NULL; | 4603 | dummytop->subsys[ss->subsys_id] = NULL; |
4585 | 4604 | ||
4586 | mutex_unlock(&cgroup_mutex); | 4605 | mutex_unlock(&cgroup_mutex); |
@@ -4624,8 +4643,8 @@ int __init cgroup_init_early(void) | |||
4624 | 4643 | ||
4625 | BUG_ON(!ss->name); | 4644 | BUG_ON(!ss->name); |
4626 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4645 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4627 | BUG_ON(!ss->create); | 4646 | BUG_ON(!ss->css_alloc); |
4628 | BUG_ON(!ss->destroy); | 4647 | BUG_ON(!ss->css_free); |
4629 | if (ss->subsys_id != i) { | 4648 | if (ss->subsys_id != i) { |
4630 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4649 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4631 | ss->name, ss->subsys_id); | 4650 | ss->name, ss->subsys_id); |
@@ -4832,44 +4851,19 @@ void cgroup_fork(struct task_struct *child) | |||
4832 | } | 4851 | } |
4833 | 4852 | ||
4834 | /** | 4853 | /** |
4835 | * cgroup_fork_callbacks - run fork callbacks | ||
4836 | * @child: the new task | ||
4837 | * | ||
4838 | * Called on a new task very soon before adding it to the | ||
4839 | * tasklist. No need to take any locks since no-one can | ||
4840 | * be operating on this task. | ||
4841 | */ | ||
4842 | void cgroup_fork_callbacks(struct task_struct *child) | ||
4843 | { | ||
4844 | if (need_forkexit_callback) { | ||
4845 | int i; | ||
4846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4847 | struct cgroup_subsys *ss = subsys[i]; | ||
4848 | |||
4849 | /* | ||
4850 | * forkexit callbacks are only supported for | ||
4851 | * builtin subsystems. | ||
4852 | */ | ||
4853 | if (!ss || ss->module) | ||
4854 | continue; | ||
4855 | |||
4856 | if (ss->fork) | ||
4857 | ss->fork(child); | ||
4858 | } | ||
4859 | } | ||
4860 | } | ||
4861 | |||
4862 | /** | ||
4863 | * cgroup_post_fork - called on a new task after adding it to the task list | 4854 | * cgroup_post_fork - called on a new task after adding it to the task list |
4864 | * @child: the task in question | 4855 | * @child: the task in question |
4865 | * | 4856 | * |
4866 | * Adds the task to the list running through its css_set if necessary. | 4857 | * Adds the task to the list running through its css_set if necessary and |
4867 | * Has to be after the task is visible on the task list in case we race | 4858 | * call the subsystem fork() callbacks. Has to be after the task is |
4868 | * with the first call to cgroup_iter_start() - to guarantee that the | 4859 | * visible on the task list in case we race with the first call to |
4869 | * new task ends up on its list. | 4860 | * cgroup_iter_start() - to guarantee that the new task ends up on its |
4861 | * list. | ||
4870 | */ | 4862 | */ |
4871 | void cgroup_post_fork(struct task_struct *child) | 4863 | void cgroup_post_fork(struct task_struct *child) |
4872 | { | 4864 | { |
4865 | int i; | ||
4866 | |||
4873 | /* | 4867 | /* |
4874 | * use_task_css_set_links is set to 1 before we walk the tasklist | 4868 | * use_task_css_set_links is set to 1 before we walk the tasklist |
4875 | * under the tasklist_lock and we read it here after we added the child | 4869 | * under the tasklist_lock and we read it here after we added the child |
@@ -4889,7 +4883,30 @@ void cgroup_post_fork(struct task_struct *child) | |||
4889 | task_unlock(child); | 4883 | task_unlock(child); |
4890 | write_unlock(&css_set_lock); | 4884 | write_unlock(&css_set_lock); |
4891 | } | 4885 | } |
4886 | |||
4887 | /* | ||
4888 | * Call ss->fork(). This must happen after @child is linked on | ||
4889 | * css_set; otherwise, @child might change state between ->fork() | ||
4890 | * and addition to css_set. | ||
4891 | */ | ||
4892 | if (need_forkexit_callback) { | ||
4893 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4894 | struct cgroup_subsys *ss = subsys[i]; | ||
4895 | |||
4896 | /* | ||
4897 | * fork/exit callbacks are supported only for | ||
4898 | * builtin subsystems and we don't need further | ||
4899 | * synchronization as they never go away. | ||
4900 | */ | ||
4901 | if (!ss || ss->module) | ||
4902 | continue; | ||
4903 | |||
4904 | if (ss->fork) | ||
4905 | ss->fork(child); | ||
4906 | } | ||
4907 | } | ||
4892 | } | 4908 | } |
4909 | |||
4893 | /** | 4910 | /** |
4894 | * cgroup_exit - detach cgroup from exiting task | 4911 | * cgroup_exit - detach cgroup from exiting task |
4895 | * @tsk: pointer to task_struct of exiting process | 4912 | * @tsk: pointer to task_struct of exiting process |
@@ -5022,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5022 | /* Caller must verify that the css is not for root cgroup */ | 5039 | /* Caller must verify that the css is not for root cgroup */ |
5023 | bool __css_tryget(struct cgroup_subsys_state *css) | 5040 | bool __css_tryget(struct cgroup_subsys_state *css) |
5024 | { | 5041 | { |
5025 | do { | 5042 | while (true) { |
5026 | int v = css_refcnt(css); | 5043 | int t, v; |
5027 | 5044 | ||
5028 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 5045 | v = css_refcnt(css); |
5046 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5047 | if (likely(t == v)) | ||
5029 | return true; | 5048 | return true; |
5049 | else if (t < 0) | ||
5050 | return false; | ||
5030 | cpu_relax(); | 5051 | cpu_relax(); |
5031 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 5052 | } |
5032 | |||
5033 | return false; | ||
5034 | } | 5053 | } |
5035 | EXPORT_SYMBOL_GPL(__css_tryget); | 5054 | EXPORT_SYMBOL_GPL(__css_tryget); |
5036 | 5055 | ||
@@ -5049,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5049 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 5068 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5050 | check_for_release(cgrp); | 5069 | check_for_release(cgrp); |
5051 | } | 5070 | } |
5052 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5053 | break; | 5071 | break; |
5054 | case 0: | 5072 | case 0: |
5055 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 5073 | schedule_work(&css->dput_work); |
5056 | schedule_work(&css->dput_work); | ||
5057 | break; | 5074 | break; |
5058 | } | 5075 | } |
5059 | rcu_read_unlock(); | 5076 | rcu_read_unlock(); |
@@ -5439,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5439 | } | 5456 | } |
5440 | 5457 | ||
5441 | #ifdef CONFIG_CGROUP_DEBUG | 5458 | #ifdef CONFIG_CGROUP_DEBUG |
5442 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | 5459 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) |
5443 | { | 5460 | { |
5444 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5461 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5445 | 5462 | ||
@@ -5449,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | |||
5449 | return css; | 5466 | return css; |
5450 | } | 5467 | } |
5451 | 5468 | ||
5452 | static void debug_destroy(struct cgroup *cont) | 5469 | static void debug_css_free(struct cgroup *cont) |
5453 | { | 5470 | { |
5454 | kfree(cont->subsys[debug_subsys_id]); | 5471 | kfree(cont->subsys[debug_subsys_id]); |
5455 | } | 5472 | } |
@@ -5578,8 +5595,8 @@ static struct cftype debug_files[] = { | |||
5578 | 5595 | ||
5579 | struct cgroup_subsys debug_subsys = { | 5596 | struct cgroup_subsys debug_subsys = { |
5580 | .name = "debug", | 5597 | .name = "debug", |
5581 | .create = debug_create, | 5598 | .css_alloc = debug_css_alloc, |
5582 | .destroy = debug_destroy, | 5599 | .css_free = debug_css_free, |
5583 | .subsys_id = debug_subsys_id, | 5600 | .subsys_id = debug_subsys_id, |
5584 | .base_cftypes = debug_files, | 5601 | .base_cftypes = debug_files, |
5585 | }; | 5602 | }; |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce98981..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -22,24 +22,33 @@ | |||
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | 24 | ||
25 | enum freezer_state { | 25 | /* |
26 | CGROUP_THAWED = 0, | 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
27 | CGROUP_FREEZING, | 27 | * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared |
28 | CGROUP_FROZEN, | 28 | * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING |
29 | * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of | ||
30 | * its ancestors has FREEZING_SELF set. | ||
31 | */ | ||
32 | enum freezer_state_flags { | ||
33 | CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ | ||
34 | CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ | ||
35 | CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ | ||
36 | CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ | ||
37 | |||
38 | /* mask for all FREEZING flags */ | ||
39 | CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, | ||
29 | }; | 40 | }; |
30 | 41 | ||
31 | struct freezer { | 42 | struct freezer { |
32 | struct cgroup_subsys_state css; | 43 | struct cgroup_subsys_state css; |
33 | enum freezer_state state; | 44 | unsigned int state; |
34 | spinlock_t lock; /* protects _writes_ to state */ | 45 | spinlock_t lock; |
35 | }; | 46 | }; |
36 | 47 | ||
37 | static inline struct freezer *cgroup_freezer( | 48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) |
38 | struct cgroup *cgroup) | ||
39 | { | 49 | { |
40 | return container_of( | 50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), |
41 | cgroup_subsys_state(cgroup, freezer_subsys_id), | 51 | struct freezer, css); |
42 | struct freezer, css); | ||
43 | } | 52 | } |
44 | 53 | ||
45 | static inline struct freezer *task_freezer(struct task_struct *task) | 54 | static inline struct freezer *task_freezer(struct task_struct *task) |
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 57 | struct freezer, css); |
49 | } | 58 | } |
50 | 59 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | ||
61 | { | ||
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | ||
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | ||
68 | |||
51 | bool cgroup_freezing(struct task_struct *task) | 69 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 70 | { |
53 | enum freezer_state state; | ||
54 | bool ret; | 71 | bool ret; |
55 | 72 | ||
56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
57 | state = task_freezer(task)->state; | 74 | ret = task_freezer(task)->state & CGROUP_FREEZING; |
58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; | ||
59 | rcu_read_unlock(); | 75 | rcu_read_unlock(); |
60 | 76 | ||
61 | return ret; | 77 | return ret; |
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task) | |||
65 | * cgroups_write_string() limits the size of freezer state strings to | 81 | * cgroups_write_string() limits the size of freezer state strings to |
66 | * CGROUP_LOCAL_BUFFER_SIZE | 82 | * CGROUP_LOCAL_BUFFER_SIZE |
67 | */ | 83 | */ |
68 | static const char *freezer_state_strs[] = { | 84 | static const char *freezer_state_strs(unsigned int state) |
69 | "THAWED", | 85 | { |
70 | "FREEZING", | 86 | if (state & CGROUP_FROZEN) |
71 | "FROZEN", | 87 | return "FROZEN"; |
88 | if (state & CGROUP_FREEZING) | ||
89 | return "FREEZING"; | ||
90 | return "THAWED"; | ||
72 | }; | 91 | }; |
73 | 92 | ||
74 | /* | ||
75 | * State diagram | ||
76 | * Transitions are caused by userspace writes to the freezer.state file. | ||
77 | * The values in parenthesis are state labels. The rest are edge labels. | ||
78 | * | ||
79 | * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) | ||
80 | * ^ ^ | | | ||
81 | * | \_______THAWED_______/ | | ||
82 | * \__________________________THAWED____________/ | ||
83 | */ | ||
84 | |||
85 | struct cgroup_subsys freezer_subsys; | 93 | struct cgroup_subsys freezer_subsys; |
86 | 94 | ||
87 | /* Locks taken and their ordering | 95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) |
88 | * ------------------------------ | ||
89 | * cgroup_mutex (AKA cgroup_lock) | ||
90 | * freezer->lock | ||
91 | * css_set_lock | ||
92 | * task->alloc_lock (AKA task_lock) | ||
93 | * task->sighand->siglock | ||
94 | * | ||
95 | * cgroup code forces css_set_lock to be taken before task->alloc_lock | ||
96 | * | ||
97 | * freezer_create(), freezer_destroy(): | ||
98 | * cgroup_mutex [ by cgroup core ] | ||
99 | * | ||
100 | * freezer_can_attach(): | ||
101 | * cgroup_mutex (held by caller of can_attach) | ||
102 | * | ||
103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | ||
104 | * freezer->lock | ||
105 | * sighand->siglock (if the cgroup is freezing) | ||
106 | * | ||
107 | * freezer_read(): | ||
108 | * cgroup_mutex | ||
109 | * freezer->lock | ||
110 | * write_lock css_set_lock (cgroup iterator start) | ||
111 | * task->alloc_lock | ||
112 | * read_lock css_set_lock (cgroup iterator start) | ||
113 | * | ||
114 | * freezer_write() (freeze): | ||
115 | * cgroup_mutex | ||
116 | * freezer->lock | ||
117 | * write_lock css_set_lock (cgroup iterator start) | ||
118 | * task->alloc_lock | ||
119 | * read_lock css_set_lock (cgroup iterator start) | ||
120 | * sighand->siglock (fake signal delivery inside freeze_task()) | ||
121 | * | ||
122 | * freezer_write() (unfreeze): | ||
123 | * cgroup_mutex | ||
124 | * freezer->lock | ||
125 | * write_lock css_set_lock (cgroup iterator start) | ||
126 | * task->alloc_lock | ||
127 | * read_lock css_set_lock (cgroup iterator start) | ||
128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) | ||
129 | * sighand->siglock | ||
130 | */ | ||
131 | static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | ||
132 | { | 96 | { |
133 | struct freezer *freezer; | 97 | struct freezer *freezer; |
134 | 98 | ||
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | |||
137 | return ERR_PTR(-ENOMEM); | 101 | return ERR_PTR(-ENOMEM); |
138 | 102 | ||
139 | spin_lock_init(&freezer->lock); | 103 | spin_lock_init(&freezer->lock); |
140 | freezer->state = CGROUP_THAWED; | ||
141 | return &freezer->css; | 104 | return &freezer->css; |
142 | } | 105 | } |
143 | 106 | ||
144 | static void freezer_destroy(struct cgroup *cgroup) | 107 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | ||
109 | * @cgroup: cgroup being created | ||
110 | * | ||
111 | * We're committing to creation of @cgroup. Mark it online and inherit | ||
112 | * parent's freezing state while holding both parent's and our | ||
113 | * freezer->lock. | ||
114 | */ | ||
115 | static int freezer_css_online(struct cgroup *cgroup) | ||
116 | { | ||
117 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
118 | struct freezer *parent = parent_freezer(freezer); | ||
119 | |||
120 | /* | ||
121 | * The following double locking and freezing state inheritance | ||
122 | * guarantee that @cgroup can never escape ancestors' freezing | ||
123 | * states. See cgroup_for_each_descendant_pre() for details. | ||
124 | */ | ||
125 | if (parent) | ||
126 | spin_lock_irq(&parent->lock); | ||
127 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
128 | |||
129 | freezer->state |= CGROUP_FREEZER_ONLINE; | ||
130 | |||
131 | if (parent && (parent->state & CGROUP_FREEZING)) { | ||
132 | freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; | ||
133 | atomic_inc(&system_freezing_cnt); | ||
134 | } | ||
135 | |||
136 | spin_unlock(&freezer->lock); | ||
137 | if (parent) | ||
138 | spin_unlock_irq(&parent->lock); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * freezer_css_offline - initiate destruction of @cgroup | ||
145 | * @cgroup: cgroup being destroyed | ||
146 | * | ||
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | ||
148 | * if it was holding one. | ||
149 | */ | ||
150 | static void freezer_css_offline(struct cgroup *cgroup) | ||
145 | { | 151 | { |
146 | struct freezer *freezer = cgroup_freezer(cgroup); | 152 | struct freezer *freezer = cgroup_freezer(cgroup); |
147 | 153 | ||
148 | if (freezer->state != CGROUP_THAWED) | 154 | spin_lock_irq(&freezer->lock); |
155 | |||
156 | if (freezer->state & CGROUP_FREEZING) | ||
149 | atomic_dec(&system_freezing_cnt); | 157 | atomic_dec(&system_freezing_cnt); |
150 | kfree(freezer); | 158 | |
159 | freezer->state = 0; | ||
160 | |||
161 | spin_unlock_irq(&freezer->lock); | ||
151 | } | 162 | } |
152 | 163 | ||
153 | /* task is frozen or will freeze immediately when next it gets woken */ | 164 | static void freezer_css_free(struct cgroup *cgroup) |
154 | static bool is_task_frozen_enough(struct task_struct *task) | ||
155 | { | 165 | { |
156 | return frozen(task) || | 166 | kfree(cgroup_freezer(cgroup)); |
157 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
158 | } | 167 | } |
159 | 168 | ||
160 | /* | 169 | /* |
161 | * The call to cgroup_lock() in the freezer.state write method prevents | 170 | * Tasks can be migrated into a different freezer anytime regardless of its |
162 | * a write to that file racing against an attach, and hence the | 171 | * current state. freezer_attach() is responsible for making new tasks |
163 | * can_attach() result will remain valid until the attach completes. | 172 | * conform to the current state. |
173 | * | ||
174 | * Freezer state changes and task migration are synchronized via | ||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | ||
176 | * current state and all following state changes can see the new tasks. | ||
164 | */ | 177 | */ |
165 | static int freezer_can_attach(struct cgroup *new_cgroup, | 178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) |
166 | struct cgroup_taskset *tset) | ||
167 | { | 179 | { |
168 | struct freezer *freezer; | 180 | struct freezer *freezer = cgroup_freezer(new_cgrp); |
169 | struct task_struct *task; | 181 | struct task_struct *task; |
182 | bool clear_frozen = false; | ||
183 | |||
184 | spin_lock_irq(&freezer->lock); | ||
170 | 185 | ||
171 | /* | 186 | /* |
172 | * Anything frozen can't move or be moved to/from. | 187 | * Make the new tasks conform to the current state of @new_cgrp. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | ||
189 | * revert it to FREEZING and let update_if_frozen() determine the | ||
190 | * correct state later. | ||
191 | * | ||
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | ||
193 | * current state before executing the following - !frozen tasks may | ||
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | ||
173 | */ | 195 | */ |
174 | cgroup_taskset_for_each(task, new_cgroup, tset) | 196 | cgroup_taskset_for_each(task, new_cgrp, tset) { |
175 | if (cgroup_freezing(task)) | 197 | if (!(freezer->state & CGROUP_FREEZING)) { |
176 | return -EBUSY; | 198 | __thaw_task(task); |
199 | } else { | ||
200 | freeze_task(task); | ||
201 | freezer->state &= ~CGROUP_FROZEN; | ||
202 | clear_frozen = true; | ||
203 | } | ||
204 | } | ||
177 | 205 | ||
178 | freezer = cgroup_freezer(new_cgroup); | 206 | spin_unlock_irq(&freezer->lock); |
179 | if (freezer->state != CGROUP_THAWED) | ||
180 | return -EBUSY; | ||
181 | 207 | ||
182 | return 0; | 208 | /* |
209 | * Propagate FROZEN clearing upwards. We may race with | ||
210 | * update_if_frozen(), but as long as both work bottom-up, either | ||
211 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
212 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
213 | * left FROZEN. | ||
214 | */ | ||
215 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | ||
216 | spin_lock_irq(&freezer->lock); | ||
217 | freezer->state &= ~CGROUP_FROZEN; | ||
218 | clear_frozen = freezer->state & CGROUP_FREEZING; | ||
219 | spin_unlock_irq(&freezer->lock); | ||
220 | } | ||
183 | } | 221 | } |
184 | 222 | ||
185 | static void freezer_fork(struct task_struct *task) | 223 | static void freezer_fork(struct task_struct *task) |
186 | { | 224 | { |
187 | struct freezer *freezer; | 225 | struct freezer *freezer; |
188 | 226 | ||
189 | /* | ||
190 | * No lock is needed, since the task isn't on tasklist yet, | ||
191 | * so it can't be moved to another cgroup, which means the | ||
192 | * freezer won't be removed and will be valid during this | ||
193 | * function call. Nevertheless, apply RCU read-side critical | ||
194 | * section to suppress RCU lockdep false positives. | ||
195 | */ | ||
196 | rcu_read_lock(); | 227 | rcu_read_lock(); |
197 | freezer = task_freezer(task); | 228 | freezer = task_freezer(task); |
198 | rcu_read_unlock(); | ||
199 | 229 | ||
200 | /* | 230 | /* |
201 | * The root cgroup is non-freezable, so we can skip the | 231 | * The root cgroup is non-freezable, so we can skip the |
202 | * following check. | 232 | * following check. |
203 | */ | 233 | */ |
204 | if (!freezer->css.cgroup->parent) | 234 | if (!freezer->css.cgroup->parent) |
205 | return; | 235 | goto out; |
206 | 236 | ||
207 | spin_lock_irq(&freezer->lock); | 237 | spin_lock_irq(&freezer->lock); |
208 | BUG_ON(freezer->state == CGROUP_FROZEN); | 238 | if (freezer->state & CGROUP_FREEZING) |
209 | |||
210 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | ||
211 | if (freezer->state == CGROUP_FREEZING) | ||
212 | freeze_task(task); | 239 | freeze_task(task); |
213 | spin_unlock_irq(&freezer->lock); | 240 | spin_unlock_irq(&freezer->lock); |
241 | out: | ||
242 | rcu_read_unlock(); | ||
214 | } | 243 | } |
215 | 244 | ||
216 | /* | 245 | /** |
217 | * caller must hold freezer->lock | 246 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | ||
248 | * | ||
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | ||
250 | * calling this function. If the current state is FREEZING but not FROZEN, | ||
251 | * this function checks whether all tasks of this cgroup and the descendant | ||
252 | * cgroups finished freezing and, if so, sets FROZEN. | ||
253 | * | ||
254 | * The caller is responsible for grabbing RCU read lock and calling | ||
255 | * update_if_frozen() on all descendants prior to invoking this function. | ||
256 | * | ||
257 | * Task states and freezer state might disagree while tasks are being | ||
258 | * migrated into or out of @cgroup, so we can't verify task states against | ||
259 | * @freezer state here. See freezer_attach() for details. | ||
218 | */ | 260 | */ |
219 | static void update_if_frozen(struct cgroup *cgroup, | 261 | static void update_if_frozen(struct cgroup *cgroup) |
220 | struct freezer *freezer) | ||
221 | { | 262 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
264 | struct cgroup *pos; | ||
222 | struct cgroup_iter it; | 265 | struct cgroup_iter it; |
223 | struct task_struct *task; | 266 | struct task_struct *task; |
224 | unsigned int nfrozen = 0, ntotal = 0; | ||
225 | enum freezer_state old_state = freezer->state; | ||
226 | 267 | ||
227 | cgroup_iter_start(cgroup, &it); | 268 | WARN_ON_ONCE(!rcu_read_lock_held()); |
228 | while ((task = cgroup_iter_next(cgroup, &it))) { | 269 | |
229 | ntotal++; | 270 | spin_lock_irq(&freezer->lock); |
230 | if (freezing(task) && is_task_frozen_enough(task)) | 271 | |
231 | nfrozen++; | 272 | if (!(freezer->state & CGROUP_FREEZING) || |
273 | (freezer->state & CGROUP_FROZEN)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | /* are all (live) children frozen? */ | ||
277 | cgroup_for_each_child(pos, cgroup) { | ||
278 | struct freezer *child = cgroup_freezer(pos); | ||
279 | |||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | ||
281 | !(child->state & CGROUP_FROZEN)) | ||
282 | goto out_unlock; | ||
232 | } | 283 | } |
233 | 284 | ||
234 | if (old_state == CGROUP_THAWED) { | 285 | /* are all tasks frozen? */ |
235 | BUG_ON(nfrozen > 0); | 286 | cgroup_iter_start(cgroup, &it); |
236 | } else if (old_state == CGROUP_FREEZING) { | 287 | |
237 | if (nfrozen == ntotal) | 288 | while ((task = cgroup_iter_next(cgroup, &it))) { |
238 | freezer->state = CGROUP_FROZEN; | 289 | if (freezing(task)) { |
239 | } else { /* old_state == CGROUP_FROZEN */ | 290 | /* |
240 | BUG_ON(nfrozen != ntotal); | 291 | * freezer_should_skip() indicates that the task |
292 | * should be skipped when determining freezing | ||
293 | * completion. Consider it frozen in addition to | ||
294 | * the usual frozen condition. | ||
295 | */ | ||
296 | if (!frozen(task) && !freezer_should_skip(task)) | ||
297 | goto out_iter_end; | ||
298 | } | ||
241 | } | 299 | } |
242 | 300 | ||
301 | freezer->state |= CGROUP_FROZEN; | ||
302 | out_iter_end: | ||
243 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
304 | out_unlock: | ||
305 | spin_unlock_irq(&freezer->lock); | ||
244 | } | 306 | } |
245 | 307 | ||
246 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, |
247 | struct seq_file *m) | 309 | struct seq_file *m) |
248 | { | 310 | { |
249 | struct freezer *freezer; | 311 | struct cgroup *pos; |
250 | enum freezer_state state; | ||
251 | 312 | ||
252 | if (!cgroup_lock_live_group(cgroup)) | 313 | rcu_read_lock(); |
253 | return -ENODEV; | ||
254 | 314 | ||
255 | freezer = cgroup_freezer(cgroup); | 315 | /* update states bottom-up */ |
256 | spin_lock_irq(&freezer->lock); | 316 | cgroup_for_each_descendant_post(pos, cgroup) |
257 | state = freezer->state; | 317 | update_if_frozen(pos); |
258 | if (state == CGROUP_FREEZING) { | 318 | update_if_frozen(cgroup); |
259 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 319 | |
260 | * only partially frozen when we exitted write. */ | 320 | rcu_read_unlock(); |
261 | update_if_frozen(cgroup, freezer); | ||
262 | state = freezer->state; | ||
263 | } | ||
264 | spin_unlock_irq(&freezer->lock); | ||
265 | cgroup_unlock(); | ||
266 | 321 | ||
267 | seq_puts(m, freezer_state_strs[state]); | 322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); |
268 | seq_putc(m, '\n'); | 323 | seq_putc(m, '\n'); |
269 | return 0; | 324 | return 0; |
270 | } | 325 | } |
271 | 326 | ||
272 | static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 327 | static void freeze_cgroup(struct freezer *freezer) |
273 | { | 328 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | ||
274 | struct cgroup_iter it; | 330 | struct cgroup_iter it; |
275 | struct task_struct *task; | 331 | struct task_struct *task; |
276 | unsigned int num_cant_freeze_now = 0; | ||
277 | 332 | ||
278 | cgroup_iter_start(cgroup, &it); | 333 | cgroup_iter_start(cgroup, &it); |
279 | while ((task = cgroup_iter_next(cgroup, &it))) { | 334 | while ((task = cgroup_iter_next(cgroup, &it))) |
280 | if (!freeze_task(task)) | 335 | freeze_task(task); |
281 | continue; | ||
282 | if (is_task_frozen_enough(task)) | ||
283 | continue; | ||
284 | if (!freezing(task) && !freezer_should_skip(task)) | ||
285 | num_cant_freeze_now++; | ||
286 | } | ||
287 | cgroup_iter_end(cgroup, &it); | 336 | cgroup_iter_end(cgroup, &it); |
288 | |||
289 | return num_cant_freeze_now ? -EBUSY : 0; | ||
290 | } | 337 | } |
291 | 338 | ||
292 | static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 339 | static void unfreeze_cgroup(struct freezer *freezer) |
293 | { | 340 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | ||
294 | struct cgroup_iter it; | 342 | struct cgroup_iter it; |
295 | struct task_struct *task; | 343 | struct task_struct *task; |
296 | 344 | ||
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | cgroup_iter_end(cgroup, &it); | 348 | cgroup_iter_end(cgroup, &it); |
301 | } | 349 | } |
302 | 350 | ||
303 | static int freezer_change_state(struct cgroup *cgroup, | 351 | /** |
304 | enum freezer_state goal_state) | 352 | * freezer_apply_state - apply state change to a single cgroup_freezer |
353 | * @freezer: freezer to apply state change to | ||
354 | * @freeze: whether to freeze or unfreeze | ||
355 | * @state: CGROUP_FREEZING_* flag to set or clear | ||
356 | * | ||
357 | * Set or clear @state on @freezer according to @freeze, and perform | ||
358 | * freezing or thawing as necessary. | ||
359 | */ | ||
360 | static void freezer_apply_state(struct freezer *freezer, bool freeze, | ||
361 | unsigned int state) | ||
305 | { | 362 | { |
306 | struct freezer *freezer; | 363 | /* also synchronizes against task migration, see freezer_attach() */ |
307 | int retval = 0; | 364 | lockdep_assert_held(&freezer->lock); |
308 | |||
309 | freezer = cgroup_freezer(cgroup); | ||
310 | 365 | ||
311 | spin_lock_irq(&freezer->lock); | 366 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
367 | return; | ||
312 | 368 | ||
313 | update_if_frozen(cgroup, freezer); | 369 | if (freeze) { |
314 | 370 | if (!(freezer->state & CGROUP_FREEZING)) | |
315 | switch (goal_state) { | ||
316 | case CGROUP_THAWED: | ||
317 | if (freezer->state != CGROUP_THAWED) | ||
318 | atomic_dec(&system_freezing_cnt); | ||
319 | freezer->state = CGROUP_THAWED; | ||
320 | unfreeze_cgroup(cgroup, freezer); | ||
321 | break; | ||
322 | case CGROUP_FROZEN: | ||
323 | if (freezer->state == CGROUP_THAWED) | ||
324 | atomic_inc(&system_freezing_cnt); | 371 | atomic_inc(&system_freezing_cnt); |
325 | freezer->state = CGROUP_FREEZING; | 372 | freezer->state |= state; |
326 | retval = try_to_freeze_cgroup(cgroup, freezer); | 373 | freeze_cgroup(freezer); |
327 | break; | 374 | } else { |
328 | default: | 375 | bool was_freezing = freezer->state & CGROUP_FREEZING; |
329 | BUG(); | 376 | |
377 | freezer->state &= ~state; | ||
378 | |||
379 | if (!(freezer->state & CGROUP_FREEZING)) { | ||
380 | if (was_freezing) | ||
381 | atomic_dec(&system_freezing_cnt); | ||
382 | freezer->state &= ~CGROUP_FROZEN; | ||
383 | unfreeze_cgroup(freezer); | ||
384 | } | ||
330 | } | 385 | } |
386 | } | ||
331 | 387 | ||
388 | /** | ||
389 | * freezer_change_state - change the freezing state of a cgroup_freezer | ||
390 | * @freezer: freezer of interest | ||
391 | * @freeze: whether to freeze or thaw | ||
392 | * | ||
393 | * Freeze or thaw @freezer according to @freeze. The operations are | ||
394 | * recursive - all descendants of @freezer will be affected. | ||
395 | */ | ||
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | ||
397 | { | ||
398 | struct cgroup *pos; | ||
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
332 | spin_unlock_irq(&freezer->lock); | 403 | spin_unlock_irq(&freezer->lock); |
333 | 404 | ||
334 | return retval; | 405 | /* |
406 | * Update all its descendants in pre-order traversal. Each | ||
407 | * descendant will try to inherit its parent's FREEZING state as | ||
408 | * CGROUP_FREEZING_PARENT. | ||
409 | */ | ||
410 | rcu_read_lock(); | ||
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | ||
412 | struct freezer *pos_f = cgroup_freezer(pos); | ||
413 | struct freezer *parent = parent_freezer(pos_f); | ||
414 | |||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_post_create(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | ||
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | ||
422 | CGROUP_FREEZING_PARENT); | ||
423 | spin_unlock_irq(&pos_f->lock); | ||
424 | } | ||
425 | rcu_read_unlock(); | ||
335 | } | 426 | } |
336 | 427 | ||
337 | static int freezer_write(struct cgroup *cgroup, | 428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, |
338 | struct cftype *cft, | ||
339 | const char *buffer) | 429 | const char *buffer) |
340 | { | 430 | { |
341 | int retval; | 431 | bool freeze; |
342 | enum freezer_state goal_state; | ||
343 | 432 | ||
344 | if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) | 433 | if (strcmp(buffer, freezer_state_strs(0)) == 0) |
345 | goal_state = CGROUP_THAWED; | 434 | freeze = false; |
346 | else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) | 435 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) |
347 | goal_state = CGROUP_FROZEN; | 436 | freeze = true; |
348 | else | 437 | else |
349 | return -EINVAL; | 438 | return -EINVAL; |
350 | 439 | ||
351 | if (!cgroup_lock_live_group(cgroup)) | 440 | freezer_change_state(cgroup_freezer(cgroup), freeze); |
352 | return -ENODEV; | 441 | return 0; |
353 | retval = freezer_change_state(cgroup, goal_state); | 442 | } |
354 | cgroup_unlock(); | 443 | |
355 | return retval; | 444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) |
445 | { | ||
446 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
447 | |||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | ||
449 | } | ||
450 | |||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | ||
452 | { | ||
453 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
454 | |||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | ||
356 | } | 456 | } |
357 | 457 | ||
358 | static struct cftype files[] = { | 458 | static struct cftype files[] = { |
@@ -362,23 +462,27 @@ static struct cftype files[] = { | |||
362 | .read_seq_string = freezer_read, | 462 | .read_seq_string = freezer_read, |
363 | .write_string = freezer_write, | 463 | .write_string = freezer_write, |
364 | }, | 464 | }, |
465 | { | ||
466 | .name = "self_freezing", | ||
467 | .flags = CFTYPE_NOT_ON_ROOT, | ||
468 | .read_u64 = freezer_self_freezing_read, | ||
469 | }, | ||
470 | { | ||
471 | .name = "parent_freezing", | ||
472 | .flags = CFTYPE_NOT_ON_ROOT, | ||
473 | .read_u64 = freezer_parent_freezing_read, | ||
474 | }, | ||
365 | { } /* terminate */ | 475 | { } /* terminate */ |
366 | }; | 476 | }; |
367 | 477 | ||
368 | struct cgroup_subsys freezer_subsys = { | 478 | struct cgroup_subsys freezer_subsys = { |
369 | .name = "freezer", | 479 | .name = "freezer", |
370 | .create = freezer_create, | 480 | .css_alloc = freezer_css_alloc, |
371 | .destroy = freezer_destroy, | 481 | .css_online = freezer_css_online, |
482 | .css_offline = freezer_css_offline, | ||
483 | .css_free = freezer_css_free, | ||
372 | .subsys_id = freezer_subsys_id, | 484 | .subsys_id = freezer_subsys_id, |
373 | .can_attach = freezer_can_attach, | 485 | .attach = freezer_attach, |
374 | .fork = freezer_fork, | 486 | .fork = freezer_fork, |
375 | .base_cftypes = files, | 487 | .base_cftypes = files, |
376 | |||
377 | /* | ||
378 | * freezer subsys doesn't handle hierarchy at all. Frozen state | ||
379 | * should be inherited through the hierarchy - if a parent is | ||
380 | * frozen, all its children should be frozen. Fix it and remove | ||
381 | * the following. | ||
382 | */ | ||
383 | .broken_hierarchy = true, | ||
384 | }; | 488 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1215 | return 0; | 1215 | return 0; |
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | #ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL | ||
1219 | asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | ||
1220 | struct compat_timespec __user *interval) | ||
1221 | { | ||
1222 | struct timespec t; | ||
1223 | int ret; | ||
1224 | mm_segment_t old_fs = get_fs(); | ||
1225 | |||
1226 | set_fs(KERNEL_DS); | ||
1227 | ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); | ||
1228 | set_fs(old_fs); | ||
1229 | if (put_compat_timespec(&t, interval)) | ||
1230 | return -EFAULT; | ||
1231 | return ret; | ||
1232 | } | ||
1233 | #endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ | ||
1234 | |||
1218 | /* | 1235 | /* |
1219 | * Allocate user-space memory for the duration of a single system call, | 1236 | * Allocate user-space memory for the duration of a single system call, |
1220 | * in order to marshall parameters inside a compat thunk. | 1237 | * in order to marshall parameters inside a compat thunk. |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 000000000000..e0e07fd55508 --- /dev/null +++ b/kernel/context_tracking.c | |||
@@ -0,0 +1,83 @@ | |||
1 | #include <linux/context_tracking.h> | ||
2 | #include <linux/rcupdate.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/hardirq.h> | ||
6 | |||
7 | struct context_tracking { | ||
8 | /* | ||
9 | * When active is false, hooks are not set to | ||
10 | * minimize overhead: TIF flags are cleared | ||
11 | * and calls to user_enter/exit are ignored. This | ||
12 | * may be further optimized using static keys. | ||
13 | */ | ||
14 | bool active; | ||
15 | enum { | ||
16 | IN_KERNEL = 0, | ||
17 | IN_USER, | ||
18 | } state; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | ||
22 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
23 | .active = true, | ||
24 | #endif | ||
25 | }; | ||
26 | |||
27 | void user_enter(void) | ||
28 | { | ||
29 | unsigned long flags; | ||
30 | |||
31 | /* | ||
32 | * Some contexts may involve an exception occurring in an irq, | ||
33 | * leading to that nesting: | ||
34 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
35 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
36 | * helpers are enough to protect RCU uses inside the exception. So | ||
37 | * just return immediately if we detect we are in an IRQ. | ||
38 | */ | ||
39 | if (in_interrupt()) | ||
40 | return; | ||
41 | |||
42 | WARN_ON_ONCE(!current->mm); | ||
43 | |||
44 | local_irq_save(flags); | ||
45 | if (__this_cpu_read(context_tracking.active) && | ||
46 | __this_cpu_read(context_tracking.state) != IN_USER) { | ||
47 | __this_cpu_write(context_tracking.state, IN_USER); | ||
48 | rcu_user_enter(); | ||
49 | } | ||
50 | local_irq_restore(flags); | ||
51 | } | ||
52 | |||
53 | void user_exit(void) | ||
54 | { | ||
55 | unsigned long flags; | ||
56 | |||
57 | /* | ||
58 | * Some contexts may involve an exception occurring in an irq, | ||
59 | * leading to that nesting: | ||
60 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
61 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
62 | * helpers are enough to protect RCU uses inside the exception. So | ||
63 | * just return immediately if we detect we are in an IRQ. | ||
64 | */ | ||
65 | if (in_interrupt()) | ||
66 | return; | ||
67 | |||
68 | local_irq_save(flags); | ||
69 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | ||
70 | __this_cpu_write(context_tracking.state, IN_KERNEL); | ||
71 | rcu_user_exit(); | ||
72 | } | ||
73 | local_irq_restore(flags); | ||
74 | } | ||
75 | |||
76 | void context_tracking_task_switch(struct task_struct *prev, | ||
77 | struct task_struct *next) | ||
78 | { | ||
79 | if (__this_cpu_read(context_tracking.active)) { | ||
80 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
81 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
82 | } | ||
83 | } | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 42bd331ee0ab..3046a503242c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -348,11 +348,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
348 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 348 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
349 | struct task_struct *idle; | 349 | struct task_struct *idle; |
350 | 350 | ||
351 | if (cpu_online(cpu) || !cpu_present(cpu)) | ||
352 | return -EINVAL; | ||
353 | |||
354 | cpu_hotplug_begin(); | 351 | cpu_hotplug_begin(); |
355 | 352 | ||
353 | if (cpu_online(cpu) || !cpu_present(cpu)) { | ||
354 | ret = -EINVAL; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
356 | idle = idle_thread_get(cpu); | 358 | idle = idle_thread_get(cpu); |
357 | if (IS_ERR(idle)) { | 359 | if (IS_ERR(idle)) { |
358 | ret = PTR_ERR(idle); | 360 | ret = PTR_ERR(idle); |
@@ -601,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, | |||
601 | 603 | ||
602 | static int __init cpu_hotplug_pm_sync_init(void) | 604 | static int __init cpu_hotplug_pm_sync_init(void) |
603 | { | 605 | { |
606 | /* | ||
607 | * cpu_hotplug_pm_callback has higher priority than x86 | ||
608 | * bsp_pm_callback which depends on cpu_hotplug_pm_callback | ||
609 | * to disable cpu hotplug to avoid cpu hotplug race. | ||
610 | */ | ||
604 | pm_notifier(cpu_hotplug_pm_callback, 0); | 611 | pm_notifier(cpu_hotplug_pm_callback, 0); |
605 | return 0; | 612 | return 0; |
606 | } | 613 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
302 | * are online, with memory. If none are online with memory, walk | 302 | * are online, with memory. If none are online with memory, walk |
303 | * up the cpuset hierarchy until we find one that does have some | 303 | * up the cpuset hierarchy until we find one that does have some |
304 | * online mems. If we get all the way to the top and still haven't | 304 | * online mems. If we get all the way to the top and still haven't |
305 | * found any online mems, return node_states[N_HIGH_MEMORY]. | 305 | * found any online mems, return node_states[N_MEMORY]. |
306 | * | 306 | * |
307 | * One way or another, we guarantee to return some non-empty subset | 307 | * One way or another, we guarantee to return some non-empty subset |
308 | * of node_states[N_HIGH_MEMORY]. | 308 | * of node_states[N_MEMORY]. |
309 | * | 309 | * |
310 | * Call with callback_mutex held. | 310 | * Call with callback_mutex held. |
311 | */ | 311 | */ |
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
314 | { | 314 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 315 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_HIGH_MEMORY])) | 316 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 317 | cs = cs->parent; |
318 | if (cs) | 318 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 319 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_HIGH_MEMORY]); | 320 | node_states[N_MEMORY]); |
321 | else | 321 | else |
322 | *pmask = node_states[N_HIGH_MEMORY]; | 322 | *pmask = node_states[N_MEMORY]; |
323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | 323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1100 | return -ENOMEM; | 1100 | return -ENOMEM; |
1101 | 1101 | ||
1102 | /* | 1102 | /* |
1103 | * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY]; | 1103 | * top_cpuset.mems_allowed tracks node_states[N_MEMORY]; |
1104 | * it's read-only | 1104 | * it's read-only |
1105 | */ | 1105 | */ |
1106 | if (cs == &top_cpuset) { | 1106 | if (cs == &top_cpuset) { |
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1122 | goto done; | 1122 | goto done; |
1123 | 1123 | ||
1124 | if (!nodes_subset(trialcs->mems_allowed, | 1124 | if (!nodes_subset(trialcs->mems_allowed, |
1125 | node_states[N_HIGH_MEMORY])) { | 1125 | node_states[N_MEMORY])) { |
1126 | retval = -EINVAL; | 1126 | retval = -EINVAL; |
1127 | goto done; | 1127 | goto done; |
1128 | } | 1128 | } |
@@ -1784,56 +1784,20 @@ static struct cftype files[] = { | |||
1784 | }; | 1784 | }; |
1785 | 1785 | ||
1786 | /* | 1786 | /* |
1787 | * post_clone() is called during cgroup_create() when the | 1787 | * cpuset_css_alloc - allocate a cpuset css |
1788 | * clone_children mount argument was specified. The cgroup | ||
1789 | * can not yet have any tasks. | ||
1790 | * | ||
1791 | * Currently we refuse to set up the cgroup - thereby | ||
1792 | * refusing the task to be entered, and as a result refusing | ||
1793 | * the sys_unshare() or clone() which initiated it - if any | ||
1794 | * sibling cpusets have exclusive cpus or mem. | ||
1795 | * | ||
1796 | * If this becomes a problem for some users who wish to | ||
1797 | * allow that scenario, then cpuset_post_clone() could be | ||
1798 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1799 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex | ||
1800 | * held. | ||
1801 | */ | ||
1802 | static void cpuset_post_clone(struct cgroup *cgroup) | ||
1803 | { | ||
1804 | struct cgroup *parent, *child; | ||
1805 | struct cpuset *cs, *parent_cs; | ||
1806 | |||
1807 | parent = cgroup->parent; | ||
1808 | list_for_each_entry(child, &parent->children, sibling) { | ||
1809 | cs = cgroup_cs(child); | ||
1810 | if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) | ||
1811 | return; | ||
1812 | } | ||
1813 | cs = cgroup_cs(cgroup); | ||
1814 | parent_cs = cgroup_cs(parent); | ||
1815 | |||
1816 | mutex_lock(&callback_mutex); | ||
1817 | cs->mems_allowed = parent_cs->mems_allowed; | ||
1818 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | ||
1819 | mutex_unlock(&callback_mutex); | ||
1820 | return; | ||
1821 | } | ||
1822 | |||
1823 | /* | ||
1824 | * cpuset_create - create a cpuset | ||
1825 | * cont: control group that the new cpuset will be part of | 1788 | * cont: control group that the new cpuset will be part of |
1826 | */ | 1789 | */ |
1827 | 1790 | ||
1828 | static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | 1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1829 | { | 1792 | { |
1830 | struct cpuset *cs; | 1793 | struct cgroup *parent_cg = cont->parent; |
1831 | struct cpuset *parent; | 1794 | struct cgroup *tmp_cg; |
1795 | struct cpuset *parent, *cs; | ||
1832 | 1796 | ||
1833 | if (!cont->parent) { | 1797 | if (!parent_cg) |
1834 | return &top_cpuset.css; | 1798 | return &top_cpuset.css; |
1835 | } | 1799 | parent = cgroup_cs(parent_cg); |
1836 | parent = cgroup_cs(cont->parent); | 1800 | |
1837 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); |
1838 | if (!cs) | 1802 | if (!cs) |
1839 | return ERR_PTR(-ENOMEM); | 1803 | return ERR_PTR(-ENOMEM); |
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1855 | 1819 | ||
1856 | cs->parent = parent; | 1820 | cs->parent = parent; |
1857 | number_of_cpusets++; | 1821 | number_of_cpusets++; |
1858 | return &cs->css ; | 1822 | |
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | ||
1824 | goto skip_clone; | ||
1825 | |||
1826 | /* | ||
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | ||
1828 | * set. This flag handling is implemented in cgroup core for | ||
1829 | * historical reasons - the flag may be specified during mount. | ||
1830 | * | ||
1831 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | ||
1832 | * refuse to clone the configuration - thereby refusing the task to | ||
1833 | * be entered, and as a result refusing the sys_unshare() or | ||
1834 | * clone() which initiated it. If this becomes a problem for some | ||
1835 | * users who wish to allow that scenario, then this could be | ||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1837 | * (and likewise for mems) to the new cgroup. | ||
1838 | */ | ||
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | ||
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | ||
1841 | |||
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | ||
1843 | goto skip_clone; | ||
1844 | } | ||
1845 | |||
1846 | mutex_lock(&callback_mutex); | ||
1847 | cs->mems_allowed = parent->mems_allowed; | ||
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | ||
1849 | mutex_unlock(&callback_mutex); | ||
1850 | skip_clone: | ||
1851 | return &cs->css; | ||
1859 | } | 1852 | } |
1860 | 1853 | ||
1861 | /* | 1854 | /* |
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1864 | * will call async_rebuild_sched_domains(). | 1857 | * will call async_rebuild_sched_domains(). |
1865 | */ | 1858 | */ |
1866 | 1859 | ||
1867 | static void cpuset_destroy(struct cgroup *cont) | 1860 | static void cpuset_css_free(struct cgroup *cont) |
1868 | { | 1861 | { |
1869 | struct cpuset *cs = cgroup_cs(cont); | 1862 | struct cpuset *cs = cgroup_cs(cont); |
1870 | 1863 | ||
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont) | |||
1878 | 1871 | ||
1879 | struct cgroup_subsys cpuset_subsys = { | 1872 | struct cgroup_subsys cpuset_subsys = { |
1880 | .name = "cpuset", | 1873 | .name = "cpuset", |
1881 | .create = cpuset_create, | 1874 | .css_alloc = cpuset_css_alloc, |
1882 | .destroy = cpuset_destroy, | 1875 | .css_free = cpuset_css_free, |
1883 | .can_attach = cpuset_can_attach, | 1876 | .can_attach = cpuset_can_attach, |
1884 | .attach = cpuset_attach, | 1877 | .attach = cpuset_attach, |
1885 | .post_clone = cpuset_post_clone, | ||
1886 | .subsys_id = cpuset_subsys_id, | 1878 | .subsys_id = cpuset_subsys_id, |
1887 | .base_cftypes = files, | 1879 | .base_cftypes = files, |
1888 | .early_init = 1, | 1880 | .early_init = 1, |
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) | |||
2034 | * before dropping down to the next. It always processes a node before | 2026 | * before dropping down to the next. It always processes a node before |
2035 | * any of its children. | 2027 | * any of its children. |
2036 | * | 2028 | * |
2037 | * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY | 2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY |
2038 | * if all present pages from a node are offlined. | 2030 | * if all present pages from a node are offlined. |
2039 | */ | 2031 | */ |
2040 | static void | 2032 | static void |
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2073 | 2065 | ||
2074 | /* Continue past cpusets with all mems online */ | 2066 | /* Continue past cpusets with all mems online */ |
2075 | if (nodes_subset(cp->mems_allowed, | 2067 | if (nodes_subset(cp->mems_allowed, |
2076 | node_states[N_HIGH_MEMORY])) | 2068 | node_states[N_MEMORY])) |
2077 | continue; | 2069 | continue; |
2078 | 2070 | ||
2079 | oldmems = cp->mems_allowed; | 2071 | oldmems = cp->mems_allowed; |
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2081 | /* Remove offline mems from this cpuset. */ | 2073 | /* Remove offline mems from this cpuset. */ |
2082 | mutex_lock(&callback_mutex); | 2074 | mutex_lock(&callback_mutex); |
2083 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
2084 | node_states[N_HIGH_MEMORY]); | 2076 | node_states[N_MEMORY]); |
2085 | mutex_unlock(&callback_mutex); | 2077 | mutex_unlock(&callback_mutex); |
2086 | 2078 | ||
2087 | /* Move tasks from the empty cpuset to a parent */ | 2079 | /* Move tasks from the empty cpuset to a parent */ |
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2134 | 2126 | ||
2135 | #ifdef CONFIG_MEMORY_HOTPLUG | 2127 | #ifdef CONFIG_MEMORY_HOTPLUG |
2136 | /* | 2128 | /* |
2137 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2129 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
2138 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. | 2130 | * Call this routine anytime after node_states[N_MEMORY] changes. |
2139 | * See cpuset_update_active_cpus() for CPU hotplug handling. | 2131 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
2140 | */ | 2132 | */ |
2141 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2133 | static int cpuset_track_online_nodes(struct notifier_block *self, |
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2148 | case MEM_ONLINE: | 2140 | case MEM_ONLINE: |
2149 | oldmems = top_cpuset.mems_allowed; | 2141 | oldmems = top_cpuset.mems_allowed; |
2150 | mutex_lock(&callback_mutex); | 2142 | mutex_lock(&callback_mutex); |
2151 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2152 | mutex_unlock(&callback_mutex); | 2144 | mutex_unlock(&callback_mutex); |
2153 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | 2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
2154 | break; | 2146 | break; |
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2177 | void __init cpuset_init_smp(void) | 2169 | void __init cpuset_init_smp(void) |
2178 | { | 2170 | { |
2179 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2171 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2180 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2172 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2181 | 2173 | ||
2182 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2183 | 2175 | ||
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) | |||
2245 | * | 2237 | * |
2246 | * Description: Returns the nodemask_t mems_allowed of the cpuset | 2238 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
2247 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2239 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2248 | * subset of node_states[N_HIGH_MEMORY], even if this means going outside the | 2240 | * subset of node_states[N_MEMORY], even if this means going outside the |
2249 | * tasks cpuset. | 2241 | * tasks cpuset. |
2250 | **/ | 2242 | **/ |
2251 | 2243 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index 48cea3da6d05..e0573a43c7df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -30,17 +30,6 @@ | |||
30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * The common credentials for the initial task's thread group | ||
34 | */ | ||
35 | #ifdef CONFIG_KEYS | ||
36 | static struct thread_group_cred init_tgcred = { | ||
37 | .usage = ATOMIC_INIT(2), | ||
38 | .tgid = 0, | ||
39 | .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), | ||
40 | }; | ||
41 | #endif | ||
42 | |||
43 | /* | ||
44 | * The initial credentials for the initial task | 33 | * The initial credentials for the initial task |
45 | */ | 34 | */ |
46 | struct cred init_cred = { | 35 | struct cred init_cred = { |
@@ -65,9 +54,6 @@ struct cred init_cred = { | |||
65 | .user = INIT_USER, | 54 | .user = INIT_USER, |
66 | .user_ns = &init_user_ns, | 55 | .user_ns = &init_user_ns, |
67 | .group_info = &init_groups, | 56 | .group_info = &init_groups, |
68 | #ifdef CONFIG_KEYS | ||
69 | .tgcred = &init_tgcred, | ||
70 | #endif | ||
71 | }; | 57 | }; |
72 | 58 | ||
73 | static inline void set_cred_subscribers(struct cred *cred, int n) | 59 | static inline void set_cred_subscribers(struct cred *cred, int n) |
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) | |||
96 | } | 82 | } |
97 | 83 | ||
98 | /* | 84 | /* |
99 | * Dispose of the shared task group credentials | ||
100 | */ | ||
101 | #ifdef CONFIG_KEYS | ||
102 | static void release_tgcred_rcu(struct rcu_head *rcu) | ||
103 | { | ||
104 | struct thread_group_cred *tgcred = | ||
105 | container_of(rcu, struct thread_group_cred, rcu); | ||
106 | |||
107 | BUG_ON(atomic_read(&tgcred->usage) != 0); | ||
108 | |||
109 | key_put(tgcred->session_keyring); | ||
110 | key_put(tgcred->process_keyring); | ||
111 | kfree(tgcred); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | /* | ||
116 | * Release a set of thread group credentials. | ||
117 | */ | ||
118 | static void release_tgcred(struct cred *cred) | ||
119 | { | ||
120 | #ifdef CONFIG_KEYS | ||
121 | struct thread_group_cred *tgcred = cred->tgcred; | ||
122 | |||
123 | if (atomic_dec_and_test(&tgcred->usage)) | ||
124 | call_rcu(&tgcred->rcu, release_tgcred_rcu); | ||
125 | #endif | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * The RCU callback to actually dispose of a set of credentials | 85 | * The RCU callback to actually dispose of a set of credentials |
130 | */ | 86 | */ |
131 | static void put_cred_rcu(struct rcu_head *rcu) | 87 | static void put_cred_rcu(struct rcu_head *rcu) |
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
150 | #endif | 106 | #endif |
151 | 107 | ||
152 | security_cred_free(cred); | 108 | security_cred_free(cred); |
109 | key_put(cred->session_keyring); | ||
110 | key_put(cred->process_keyring); | ||
153 | key_put(cred->thread_keyring); | 111 | key_put(cred->thread_keyring); |
154 | key_put(cred->request_key_auth); | 112 | key_put(cred->request_key_auth); |
155 | release_tgcred(cred); | ||
156 | if (cred->group_info) | 113 | if (cred->group_info) |
157 | put_group_info(cred->group_info); | 114 | put_group_info(cred->group_info); |
158 | free_uid(cred->user); | 115 | free_uid(cred->user); |
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) | |||
246 | if (!new) | 203 | if (!new) |
247 | return NULL; | 204 | return NULL; |
248 | 205 | ||
249 | #ifdef CONFIG_KEYS | ||
250 | new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); | ||
251 | if (!new->tgcred) { | ||
252 | kmem_cache_free(cred_jar, new); | ||
253 | return NULL; | ||
254 | } | ||
255 | atomic_set(&new->tgcred->usage, 1); | ||
256 | #endif | ||
257 | |||
258 | atomic_set(&new->usage, 1); | 206 | atomic_set(&new->usage, 1); |
259 | #ifdef CONFIG_DEBUG_CREDENTIALS | 207 | #ifdef CONFIG_DEBUG_CREDENTIALS |
260 | new->magic = CRED_MAGIC; | 208 | new->magic = CRED_MAGIC; |
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void) | |||
308 | get_user_ns(new->user_ns); | 256 | get_user_ns(new->user_ns); |
309 | 257 | ||
310 | #ifdef CONFIG_KEYS | 258 | #ifdef CONFIG_KEYS |
259 | key_get(new->session_keyring); | ||
260 | key_get(new->process_keyring); | ||
311 | key_get(new->thread_keyring); | 261 | key_get(new->thread_keyring); |
312 | key_get(new->request_key_auth); | 262 | key_get(new->request_key_auth); |
313 | atomic_inc(&new->tgcred->usage); | ||
314 | #endif | 263 | #endif |
315 | 264 | ||
316 | #ifdef CONFIG_SECURITY | 265 | #ifdef CONFIG_SECURITY |
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); | |||
334 | */ | 283 | */ |
335 | struct cred *prepare_exec_creds(void) | 284 | struct cred *prepare_exec_creds(void) |
336 | { | 285 | { |
337 | struct thread_group_cred *tgcred = NULL; | ||
338 | struct cred *new; | 286 | struct cred *new; |
339 | 287 | ||
340 | #ifdef CONFIG_KEYS | ||
341 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
342 | if (!tgcred) | ||
343 | return NULL; | ||
344 | #endif | ||
345 | |||
346 | new = prepare_creds(); | 288 | new = prepare_creds(); |
347 | if (!new) { | 289 | if (!new) |
348 | kfree(tgcred); | ||
349 | return new; | 290 | return new; |
350 | } | ||
351 | 291 | ||
352 | #ifdef CONFIG_KEYS | 292 | #ifdef CONFIG_KEYS |
353 | /* newly exec'd tasks don't get a thread keyring */ | 293 | /* newly exec'd tasks don't get a thread keyring */ |
354 | key_put(new->thread_keyring); | 294 | key_put(new->thread_keyring); |
355 | new->thread_keyring = NULL; | 295 | new->thread_keyring = NULL; |
356 | 296 | ||
357 | /* create a new per-thread-group creds for all this set of threads to | ||
358 | * share */ | ||
359 | memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); | ||
360 | |||
361 | atomic_set(&tgcred->usage, 1); | ||
362 | spin_lock_init(&tgcred->lock); | ||
363 | |||
364 | /* inherit the session keyring; new process keyring */ | 297 | /* inherit the session keyring; new process keyring */ |
365 | key_get(tgcred->session_keyring); | 298 | key_put(new->process_keyring); |
366 | tgcred->process_keyring = NULL; | 299 | new->process_keyring = NULL; |
367 | |||
368 | release_tgcred(new); | ||
369 | new->tgcred = tgcred; | ||
370 | #endif | 300 | #endif |
371 | 301 | ||
372 | return new; | 302 | return new; |
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) | |||
383 | */ | 313 | */ |
384 | int copy_creds(struct task_struct *p, unsigned long clone_flags) | 314 | int copy_creds(struct task_struct *p, unsigned long clone_flags) |
385 | { | 315 | { |
386 | #ifdef CONFIG_KEYS | ||
387 | struct thread_group_cred *tgcred; | ||
388 | #endif | ||
389 | struct cred *new; | 316 | struct cred *new; |
390 | int ret; | 317 | int ret; |
391 | 318 | ||
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
425 | install_thread_keyring_to_cred(new); | 352 | install_thread_keyring_to_cred(new); |
426 | } | 353 | } |
427 | 354 | ||
428 | /* we share the process and session keyrings between all the threads in | 355 | /* The process keyring is only shared between the threads in a process; |
429 | * a process - this is slightly icky as we violate COW credentials a | 356 | * anything outside of those threads doesn't inherit. |
430 | * bit */ | 357 | */ |
431 | if (!(clone_flags & CLONE_THREAD)) { | 358 | if (!(clone_flags & CLONE_THREAD)) { |
432 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | 359 | key_put(new->process_keyring); |
433 | if (!tgcred) { | 360 | new->process_keyring = NULL; |
434 | ret = -ENOMEM; | ||
435 | goto error_put; | ||
436 | } | ||
437 | atomic_set(&tgcred->usage, 1); | ||
438 | spin_lock_init(&tgcred->lock); | ||
439 | tgcred->process_keyring = NULL; | ||
440 | tgcred->session_keyring = key_get(new->tgcred->session_keyring); | ||
441 | |||
442 | release_tgcred(new); | ||
443 | new->tgcred = tgcred; | ||
444 | } | 361 | } |
445 | #endif | 362 | #endif |
446 | 363 | ||
@@ -455,6 +372,31 @@ error_put: | |||
455 | return ret; | 372 | return ret; |
456 | } | 373 | } |
457 | 374 | ||
375 | static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) | ||
376 | { | ||
377 | const struct user_namespace *set_ns = set->user_ns; | ||
378 | const struct user_namespace *subset_ns = subset->user_ns; | ||
379 | |||
380 | /* If the two credentials are in the same user namespace see if | ||
381 | * the capabilities of subset are a subset of set. | ||
382 | */ | ||
383 | if (set_ns == subset_ns) | ||
384 | return cap_issubset(subset->cap_permitted, set->cap_permitted); | ||
385 | |||
386 | /* The credentials are in a different user namespaces | ||
387 | * therefore one is a subset of the other only if a set is an | ||
388 | * ancestor of subset and set->euid is owner of subset or one | ||
389 | * of subsets ancestors. | ||
390 | */ | ||
391 | for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { | ||
392 | if ((set_ns == subset_ns->parent) && | ||
393 | uid_eq(subset_ns->owner, set->euid)) | ||
394 | return true; | ||
395 | } | ||
396 | |||
397 | return false; | ||
398 | } | ||
399 | |||
458 | /** | 400 | /** |
459 | * commit_creds - Install new credentials upon the current task | 401 | * commit_creds - Install new credentials upon the current task |
460 | * @new: The credentials to be assigned | 402 | * @new: The credentials to be assigned |
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new) | |||
493 | !gid_eq(old->egid, new->egid) || | 435 | !gid_eq(old->egid, new->egid) || |
494 | !uid_eq(old->fsuid, new->fsuid) || | 436 | !uid_eq(old->fsuid, new->fsuid) || |
495 | !gid_eq(old->fsgid, new->fsgid) || | 437 | !gid_eq(old->fsgid, new->fsgid) || |
496 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 438 | !cred_cap_issubset(old, new)) { |
497 | if (task->mm) | 439 | if (task->mm) |
498 | set_dumpable(task->mm, suid_dumpable); | 440 | set_dumpable(task->mm, suid_dumpable); |
499 | task->pdeath_signal = 0; | 441 | task->pdeath_signal = 0; |
@@ -643,9 +585,6 @@ void __init cred_init(void) | |||
643 | */ | 585 | */ |
644 | struct cred *prepare_kernel_cred(struct task_struct *daemon) | 586 | struct cred *prepare_kernel_cred(struct task_struct *daemon) |
645 | { | 587 | { |
646 | #ifdef CONFIG_KEYS | ||
647 | struct thread_group_cred *tgcred; | ||
648 | #endif | ||
649 | const struct cred *old; | 588 | const struct cred *old; |
650 | struct cred *new; | 589 | struct cred *new; |
651 | 590 | ||
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
653 | if (!new) | 592 | if (!new) |
654 | return NULL; | 593 | return NULL; |
655 | 594 | ||
656 | #ifdef CONFIG_KEYS | ||
657 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
658 | if (!tgcred) { | ||
659 | kmem_cache_free(cred_jar, new); | ||
660 | return NULL; | ||
661 | } | ||
662 | #endif | ||
663 | |||
664 | kdebug("prepare_kernel_cred() alloc %p", new); | 595 | kdebug("prepare_kernel_cred() alloc %p", new); |
665 | 596 | ||
666 | if (daemon) | 597 | if (daemon) |
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | get_group_info(new->group_info); | 609 | get_group_info(new->group_info); |
679 | 610 | ||
680 | #ifdef CONFIG_KEYS | 611 | #ifdef CONFIG_KEYS |
681 | atomic_set(&tgcred->usage, 1); | 612 | new->session_keyring = NULL; |
682 | spin_lock_init(&tgcred->lock); | 613 | new->process_keyring = NULL; |
683 | tgcred->process_keyring = NULL; | ||
684 | tgcred->session_keyring = NULL; | ||
685 | new->tgcred = tgcred; | ||
686 | new->request_key_auth = NULL; | ||
687 | new->thread_keyring = NULL; | 614 | new->thread_keyring = NULL; |
615 | new->request_key_auth = NULL; | ||
688 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 616 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
689 | #endif | 617 | #endif |
690 | 618 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134d..301079d06f24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6155 | 6155 | ||
6156 | event->parent = parent_event; | 6156 | event->parent = parent_event; |
6157 | 6157 | ||
6158 | event->ns = get_pid_ns(current->nsproxy->pid_ns); | 6158 | event->ns = get_pid_ns(task_active_pid_ns(current)); |
6159 | event->id = atomic64_inc_return(&perf_event_id); | 6159 | event->id = atomic64_inc_return(&perf_event_id); |
6160 | 6160 | ||
6161 | event->state = PERF_EVENT_STATE_INACTIVE; | 6161 | event->state = PERF_EVENT_STATE_INACTIVE; |
@@ -7434,7 +7434,7 @@ unlock: | |||
7434 | device_initcall(perf_event_sysfs_init); | 7434 | device_initcall(perf_event_sysfs_init); |
7435 | 7435 | ||
7436 | #ifdef CONFIG_CGROUP_PERF | 7436 | #ifdef CONFIG_CGROUP_PERF |
7437 | static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | 7437 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) |
7438 | { | 7438 | { |
7439 | struct perf_cgroup *jc; | 7439 | struct perf_cgroup *jc; |
7440 | 7440 | ||
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | |||
7451 | return &jc->css; | 7451 | return &jc->css; |
7452 | } | 7452 | } |
7453 | 7453 | ||
7454 | static void perf_cgroup_destroy(struct cgroup *cont) | 7454 | static void perf_cgroup_css_free(struct cgroup *cont) |
7455 | { | 7455 | { |
7456 | struct perf_cgroup *jc; | 7456 | struct perf_cgroup *jc; |
7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7492 | struct cgroup_subsys perf_subsys = { | 7492 | struct cgroup_subsys perf_subsys = { |
7493 | .name = "perf_event", | 7493 | .name = "perf_event", |
7494 | .subsys_id = perf_subsys_id, | 7494 | .subsys_id = perf_subsys_id, |
7495 | .create = perf_cgroup_create, | 7495 | .css_alloc = perf_cgroup_css_alloc, |
7496 | .destroy = perf_cgroup_destroy, | 7496 | .css_free = perf_cgroup_css_free, |
7497 | .exit = perf_cgroup_exit, | 7497 | .exit = perf_cgroup_exit, |
7498 | .attach = perf_cgroup_attach, | 7498 | .attach = perf_cgroup_attach, |
7499 | 7499 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cc4e7e42e68..dea7acfbb071 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | 33 | #include <linux/ptrace.h> /* user_enable_single_step */ |
34 | #include <linux/kdebug.h> /* notifier mechanism */ | 34 | #include <linux/kdebug.h> /* notifier mechanism */ |
35 | #include "../../mm/internal.h" /* munlock_vma_page */ | 35 | #include "../../mm/internal.h" /* munlock_vma_page */ |
36 | #include <linux/percpu-rwsem.h> | ||
36 | 37 | ||
37 | #include <linux/uprobes.h> | 38 | #include <linux/uprobes.h> |
38 | 39 | ||
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | |||
71 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | 72 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; |
72 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | 73 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) |
73 | 74 | ||
75 | static struct percpu_rw_semaphore dup_mmap_sem; | ||
76 | |||
74 | /* | 77 | /* |
75 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | 78 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe |
76 | * events active at this time. Probably a fine grained per inode count is | 79 | * events active at this time. Probably a fine grained per inode count is |
@@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
766 | struct map_info *info; | 769 | struct map_info *info; |
767 | int err = 0; | 770 | int err = 0; |
768 | 771 | ||
772 | percpu_down_write(&dup_mmap_sem); | ||
769 | info = build_map_info(uprobe->inode->i_mapping, | 773 | info = build_map_info(uprobe->inode->i_mapping, |
770 | uprobe->offset, is_register); | 774 | uprobe->offset, is_register); |
771 | if (IS_ERR(info)) | 775 | if (IS_ERR(info)) { |
772 | return PTR_ERR(info); | 776 | err = PTR_ERR(info); |
777 | goto out; | ||
778 | } | ||
773 | 779 | ||
774 | while (info) { | 780 | while (info) { |
775 | struct mm_struct *mm = info->mm; | 781 | struct mm_struct *mm = info->mm; |
@@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
799 | mmput(mm); | 805 | mmput(mm); |
800 | info = free_map_info(info); | 806 | info = free_map_info(info); |
801 | } | 807 | } |
802 | 808 | out: | |
809 | percpu_up_write(&dup_mmap_sem); | ||
803 | return err; | 810 | return err; |
804 | } | 811 | } |
805 | 812 | ||
@@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm) | |||
1131 | kfree(area); | 1138 | kfree(area); |
1132 | } | 1139 | } |
1133 | 1140 | ||
1141 | void uprobe_start_dup_mmap(void) | ||
1142 | { | ||
1143 | percpu_down_read(&dup_mmap_sem); | ||
1144 | } | ||
1145 | |||
1146 | void uprobe_end_dup_mmap(void) | ||
1147 | { | ||
1148 | percpu_up_read(&dup_mmap_sem); | ||
1149 | } | ||
1150 | |||
1134 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) | 1151 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) |
1135 | { | 1152 | { |
1136 | newmm->uprobes_state.xol_area = NULL; | 1153 | newmm->uprobes_state.xol_area = NULL; |
@@ -1199,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot | |||
1199 | vaddr = kmap_atomic(area->page); | 1216 | vaddr = kmap_atomic(area->page); |
1200 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | 1217 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); |
1201 | kunmap_atomic(vaddr); | 1218 | kunmap_atomic(vaddr); |
1219 | /* | ||
1220 | * We probably need flush_icache_user_range() but it needs vma. | ||
1221 | * This should work on supported architectures too. | ||
1222 | */ | ||
1223 | flush_dcache_page(area->page); | ||
1202 | 1224 | ||
1203 | return current->utask->xol_vaddr; | 1225 | return current->utask->xol_vaddr; |
1204 | } | 1226 | } |
@@ -1430,16 +1452,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
1430 | return uprobe; | 1452 | return uprobe; |
1431 | } | 1453 | } |
1432 | 1454 | ||
1433 | void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) | ||
1434 | { | ||
1435 | user_enable_single_step(current); | ||
1436 | } | ||
1437 | |||
1438 | void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) | ||
1439 | { | ||
1440 | user_disable_single_step(current); | ||
1441 | } | ||
1442 | |||
1443 | /* | 1455 | /* |
1444 | * Run handler and ask thread to singlestep. | 1456 | * Run handler and ask thread to singlestep. |
1445 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1457 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
@@ -1493,7 +1505,6 @@ static void handle_swbp(struct pt_regs *regs) | |||
1493 | goto out; | 1505 | goto out; |
1494 | 1506 | ||
1495 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | 1507 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { |
1496 | arch_uprobe_enable_step(&uprobe->arch); | ||
1497 | utask->active_uprobe = uprobe; | 1508 | utask->active_uprobe = uprobe; |
1498 | utask->state = UTASK_SSTEP; | 1509 | utask->state = UTASK_SSTEP; |
1499 | return; | 1510 | return; |
@@ -1525,7 +1536,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
1525 | else | 1536 | else |
1526 | WARN_ON_ONCE(1); | 1537 | WARN_ON_ONCE(1); |
1527 | 1538 | ||
1528 | arch_uprobe_disable_step(&uprobe->arch); | ||
1529 | put_uprobe(uprobe); | 1539 | put_uprobe(uprobe); |
1530 | utask->active_uprobe = NULL; | 1540 | utask->active_uprobe = NULL; |
1531 | utask->state = UTASK_RUNNING; | 1541 | utask->state = UTASK_RUNNING; |
@@ -1604,6 +1614,9 @@ static int __init init_uprobes(void) | |||
1604 | mutex_init(&uprobes_mmap_mutex[i]); | 1614 | mutex_init(&uprobes_mmap_mutex[i]); |
1605 | } | 1615 | } |
1606 | 1616 | ||
1617 | if (percpu_init_rwsem(&dup_mmap_sem)) | ||
1618 | return -ENOMEM; | ||
1619 | |||
1607 | return register_die_notifier(&uprobe_exception_nb); | 1620 | return register_die_notifier(&uprobe_exception_nb); |
1608 | } | 1621 | } |
1609 | module_init(init_uprobes); | 1622 | module_init(init_uprobes); |
diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..b4df21937216 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
87 | } | 75 | } |
88 | list_del_rcu(&p->thread_group); | 76 | list_del_rcu(&p->thread_group); |
89 | } | 77 | } |
@@ -322,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
322 | } | 310 | } |
323 | } | 311 | } |
324 | 312 | ||
325 | /** | ||
326 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | ||
327 | * | ||
328 | * If a kernel thread is launched as a result of a system call, or if | ||
329 | * it ever exits, it should generally reparent itself to kthreadd so it | ||
330 | * isn't in the way of other processes and is correctly cleaned up on exit. | ||
331 | * | ||
332 | * The various task state such as scheduling policy and priority may have | ||
333 | * been inherited from a user process, so we reset them to sane values here. | ||
334 | * | ||
335 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. | ||
336 | */ | ||
337 | static void reparent_to_kthreadd(void) | ||
338 | { | ||
339 | write_lock_irq(&tasklist_lock); | ||
340 | |||
341 | ptrace_unlink(current); | ||
342 | /* Reparent to init */ | ||
343 | current->real_parent = current->parent = kthreadd_task; | ||
344 | list_move_tail(&current->sibling, &current->real_parent->children); | ||
345 | |||
346 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | ||
347 | current->exit_signal = SIGCHLD; | ||
348 | |||
349 | if (task_nice(current) < 0) | ||
350 | set_user_nice(current, 0); | ||
351 | /* cpus_allowed? */ | ||
352 | /* rt_priority? */ | ||
353 | /* signals? */ | ||
354 | memcpy(current->signal->rlim, init_task.signal->rlim, | ||
355 | sizeof(current->signal->rlim)); | ||
356 | |||
357 | atomic_inc(&init_cred.usage); | ||
358 | commit_creds(&init_cred); | ||
359 | write_unlock_irq(&tasklist_lock); | ||
360 | } | ||
361 | |||
362 | void __set_special_pids(struct pid *pid) | 313 | void __set_special_pids(struct pid *pid) |
363 | { | 314 | { |
364 | struct task_struct *curr = current->group_leader; | 315 | struct task_struct *curr = current->group_leader; |
@@ -370,13 +321,6 @@ void __set_special_pids(struct pid *pid) | |||
370 | change_pid(curr, PIDTYPE_PGID, pid); | 321 | change_pid(curr, PIDTYPE_PGID, pid); |
371 | } | 322 | } |
372 | 323 | ||
373 | static void set_special_pids(struct pid *pid) | ||
374 | { | ||
375 | write_lock_irq(&tasklist_lock); | ||
376 | __set_special_pids(pid); | ||
377 | write_unlock_irq(&tasklist_lock); | ||
378 | } | ||
379 | |||
380 | /* | 324 | /* |
381 | * Let kernel threads use this to say that they allow a certain signal. | 325 | * Let kernel threads use this to say that they allow a certain signal. |
382 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 326 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
@@ -416,54 +360,6 @@ int disallow_signal(int sig) | |||
416 | 360 | ||
417 | EXPORT_SYMBOL(disallow_signal); | 361 | EXPORT_SYMBOL(disallow_signal); |
418 | 362 | ||
419 | /* | ||
420 | * Put all the gunge required to become a kernel thread without | ||
421 | * attached user resources in one place where it belongs. | ||
422 | */ | ||
423 | |||
424 | void daemonize(const char *name, ...) | ||
425 | { | ||
426 | va_list args; | ||
427 | sigset_t blocked; | ||
428 | |||
429 | va_start(args, name); | ||
430 | vsnprintf(current->comm, sizeof(current->comm), name, args); | ||
431 | va_end(args); | ||
432 | |||
433 | /* | ||
434 | * If we were started as result of loading a module, close all of the | ||
435 | * user space pages. We don't need them, and if we didn't close them | ||
436 | * they would be locked into memory. | ||
437 | */ | ||
438 | exit_mm(current); | ||
439 | /* | ||
440 | * We don't want to get frozen, in case system-wide hibernation | ||
441 | * or suspend transition begins right now. | ||
442 | */ | ||
443 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | ||
444 | |||
445 | if (current->nsproxy != &init_nsproxy) { | ||
446 | get_nsproxy(&init_nsproxy); | ||
447 | switch_task_namespaces(current, &init_nsproxy); | ||
448 | } | ||
449 | set_special_pids(&init_struct_pid); | ||
450 | proc_clear_tty(current); | ||
451 | |||
452 | /* Block and flush all signals */ | ||
453 | sigfillset(&blocked); | ||
454 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
455 | flush_signals(current); | ||
456 | |||
457 | /* Become as one with the init task */ | ||
458 | |||
459 | daemonize_fs_struct(); | ||
460 | daemonize_descriptors(); | ||
461 | |||
462 | reparent_to_kthreadd(); | ||
463 | } | ||
464 | |||
465 | EXPORT_SYMBOL(daemonize); | ||
466 | |||
467 | #ifdef CONFIG_MM_OWNER | 363 | #ifdef CONFIG_MM_OWNER |
468 | /* | 364 | /* |
469 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 365 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
@@ -1186,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1186 | * as other threads in the parent group can be right | 1082 | * as other threads in the parent group can be right |
1187 | * here reaping other children at the same time. | 1083 | * here reaping other children at the same time. |
1188 | * | 1084 | * |
1189 | * We use thread_group_times() to get times for the thread | 1085 | * We use thread_group_cputime_adjusted() to get times for the thread |
1190 | * group, which consolidates times for all threads in the | 1086 | * group, which consolidates times for all threads in the |
1191 | * group including the group leader. | 1087 | * group including the group leader. |
1192 | */ | 1088 | */ |
1193 | thread_group_times(p, &tgutime, &tgstime); | 1089 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1194 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1090 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1195 | psig = p->real_parent->signal; | 1091 | psig = p->real_parent->signal; |
1196 | sig = p->signal; | 1092 | sig = p->signal; |
diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..65ca6d27f24e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) | |||
146 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 146 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
147 | int node) | 147 | int node) |
148 | { | 148 | { |
149 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 149 | struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, |
150 | THREAD_SIZE_ORDER); | 150 | THREAD_SIZE_ORDER); |
151 | 151 | ||
152 | return page ? page_address(page) : NULL; | 152 | return page ? page_address(page) : NULL; |
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | |||
154 | 154 | ||
155 | static inline void free_thread_info(struct thread_info *ti) | 155 | static inline void free_thread_info(struct thread_info *ti) |
156 | { | 156 | { |
157 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 157 | free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
158 | } | 158 | } |
159 | # else | 159 | # else |
160 | static struct kmem_cache *thread_info_cache; | 160 | static struct kmem_cache *thread_info_cache; |
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
352 | unsigned long charge; | 352 | unsigned long charge; |
353 | struct mempolicy *pol; | 353 | struct mempolicy *pol; |
354 | 354 | ||
355 | uprobe_start_dup_mmap(); | ||
355 | down_write(&oldmm->mmap_sem); | 356 | down_write(&oldmm->mmap_sem); |
356 | flush_cache_dup_mm(oldmm); | 357 | flush_cache_dup_mm(oldmm); |
357 | uprobe_dup_mmap(oldmm, mm); | 358 | uprobe_dup_mmap(oldmm, mm); |
@@ -469,6 +470,7 @@ out: | |||
469 | up_write(&mm->mmap_sem); | 470 | up_write(&mm->mmap_sem); |
470 | flush_tlb_mm(oldmm); | 471 | flush_tlb_mm(oldmm); |
471 | up_write(&oldmm->mmap_sem); | 472 | up_write(&oldmm->mmap_sem); |
473 | uprobe_end_dup_mmap(); | ||
472 | return retval; | 474 | return retval; |
473 | fail_nomem_anon_vma_fork: | 475 | fail_nomem_anon_vma_fork: |
474 | mpol_put(pol); | 476 | mpol_put(pol); |
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
821 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
822 | mm->pmd_huge_pte = NULL; | 824 | mm->pmd_huge_pte = NULL; |
823 | #endif | 825 | #endif |
826 | #ifdef CONFIG_NUMA_BALANCING | ||
827 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
828 | #endif | ||
824 | if (!mm_init(mm, tsk)) | 829 | if (!mm_init(mm, tsk)) |
825 | goto fail_nomem; | 830 | goto fail_nomem; |
826 | 831 | ||
@@ -1039,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1039 | atomic_set(&sig->live, 1); | 1044 | atomic_set(&sig->live, 1); |
1040 | atomic_set(&sig->sigcnt, 1); | 1045 | atomic_set(&sig->sigcnt, 1); |
1041 | init_waitqueue_head(&sig->wait_chldexit); | 1046 | init_waitqueue_head(&sig->wait_chldexit); |
1042 | if (clone_flags & CLONE_NEWPID) | ||
1043 | sig->flags |= SIGNAL_UNKILLABLE; | ||
1044 | sig->curr_target = tsk; | 1047 | sig->curr_target = tsk; |
1045 | init_sigpending(&sig->shared_pending); | 1048 | init_sigpending(&sig->shared_pending); |
1046 | INIT_LIST_HEAD(&sig->posix_timers); | 1049 | INIT_LIST_HEAD(&sig->posix_timers); |
@@ -1127,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) | |||
1127 | */ | 1130 | */ |
1128 | static struct task_struct *copy_process(unsigned long clone_flags, | 1131 | static struct task_struct *copy_process(unsigned long clone_flags, |
1129 | unsigned long stack_start, | 1132 | unsigned long stack_start, |
1130 | struct pt_regs *regs, | ||
1131 | unsigned long stack_size, | 1133 | unsigned long stack_size, |
1132 | int __user *child_tidptr, | 1134 | int __user *child_tidptr, |
1133 | struct pid *pid, | 1135 | struct pid *pid, |
@@ -1135,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1135 | { | 1137 | { |
1136 | int retval; | 1138 | int retval; |
1137 | struct task_struct *p; | 1139 | struct task_struct *p; |
1138 | int cgroup_callbacks_done = 0; | ||
1139 | 1140 | ||
1140 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1141 | return ERR_PTR(-EINVAL); | 1142 | return ERR_PTR(-EINVAL); |
@@ -1165,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1165 | current->signal->flags & SIGNAL_UNKILLABLE) | 1166 | current->signal->flags & SIGNAL_UNKILLABLE) |
1166 | return ERR_PTR(-EINVAL); | 1167 | return ERR_PTR(-EINVAL); |
1167 | 1168 | ||
1169 | /* | ||
1170 | * If the new process will be in a different pid namespace | ||
1171 | * don't allow the creation of threads. | ||
1172 | */ | ||
1173 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && | ||
1174 | (task_active_pid_ns(current) != current->nsproxy->pid_ns)) | ||
1175 | return ERR_PTR(-EINVAL); | ||
1176 | |||
1168 | retval = security_task_create(clone_flags); | 1177 | retval = security_task_create(clone_flags); |
1169 | if (retval) | 1178 | if (retval) |
1170 | goto fork_out; | 1179 | goto fork_out; |
@@ -1222,7 +1231,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1222 | p->utime = p->stime = p->gtime = 0; | 1231 | p->utime = p->stime = p->gtime = 0; |
1223 | p->utimescaled = p->stimescaled = 0; | 1232 | p->utimescaled = p->stimescaled = 0; |
1224 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1233 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1225 | p->prev_utime = p->prev_stime = 0; | 1234 | p->prev_cputime.utime = p->prev_cputime.stime = 0; |
1226 | #endif | 1235 | #endif |
1227 | #if defined(SPLIT_RSS_COUNTING) | 1236 | #if defined(SPLIT_RSS_COUNTING) |
1228 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1237 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
@@ -1320,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1320 | retval = copy_io(clone_flags, p); | 1329 | retval = copy_io(clone_flags, p); |
1321 | if (retval) | 1330 | if (retval) |
1322 | goto bad_fork_cleanup_namespaces; | 1331 | goto bad_fork_cleanup_namespaces; |
1323 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); | 1332 | retval = copy_thread(clone_flags, stack_start, stack_size, p); |
1324 | if (retval) | 1333 | if (retval) |
1325 | goto bad_fork_cleanup_io; | 1334 | goto bad_fork_cleanup_io; |
1326 | 1335 | ||
@@ -1393,12 +1402,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1393 | INIT_LIST_HEAD(&p->thread_group); | 1402 | INIT_LIST_HEAD(&p->thread_group); |
1394 | p->task_works = NULL; | 1403 | p->task_works = NULL; |
1395 | 1404 | ||
1396 | /* Now that the task is set up, run cgroup callbacks if | ||
1397 | * necessary. We need to run them before the task is visible | ||
1398 | * on the tasklist. */ | ||
1399 | cgroup_fork_callbacks(p); | ||
1400 | cgroup_callbacks_done = 1; | ||
1401 | |||
1402 | /* Need tasklist lock for parent etc handling! */ | 1405 | /* Need tasklist lock for parent etc handling! */ |
1403 | write_lock_irq(&tasklist_lock); | 1406 | write_lock_irq(&tasklist_lock); |
1404 | 1407 | ||
@@ -1441,8 +1444,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1441 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); | 1444 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
1442 | 1445 | ||
1443 | if (thread_group_leader(p)) { | 1446 | if (thread_group_leader(p)) { |
1444 | if (is_child_reaper(pid)) | 1447 | if (is_child_reaper(pid)) { |
1445 | p->nsproxy->pid_ns->child_reaper = p; | 1448 | ns_of_pid(pid)->child_reaper = p; |
1449 | p->signal->flags |= SIGNAL_UNKILLABLE; | ||
1450 | } | ||
1446 | 1451 | ||
1447 | p->signal->leader_pid = pid; | 1452 | p->signal->leader_pid = pid; |
1448 | p->signal->tty = tty_kref_get(current->signal->tty); | 1453 | p->signal->tty = tty_kref_get(current->signal->tty); |
@@ -1476,8 +1481,6 @@ bad_fork_cleanup_io: | |||
1476 | if (p->io_context) | 1481 | if (p->io_context) |
1477 | exit_io_context(p); | 1482 | exit_io_context(p); |
1478 | bad_fork_cleanup_namespaces: | 1483 | bad_fork_cleanup_namespaces: |
1479 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1480 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1481 | exit_task_namespaces(p); | 1484 | exit_task_namespaces(p); |
1482 | bad_fork_cleanup_mm: | 1485 | bad_fork_cleanup_mm: |
1483 | if (p->mm) | 1486 | if (p->mm) |
@@ -1503,7 +1506,7 @@ bad_fork_cleanup_cgroup: | |||
1503 | #endif | 1506 | #endif |
1504 | if (clone_flags & CLONE_THREAD) | 1507 | if (clone_flags & CLONE_THREAD) |
1505 | threadgroup_change_end(current); | 1508 | threadgroup_change_end(current); |
1506 | cgroup_exit(p, cgroup_callbacks_done); | 1509 | cgroup_exit(p, 0); |
1507 | delayacct_tsk_free(p); | 1510 | delayacct_tsk_free(p); |
1508 | module_put(task_thread_info(p)->exec_domain->module); | 1511 | module_put(task_thread_info(p)->exec_domain->module); |
1509 | bad_fork_cleanup_count: | 1512 | bad_fork_cleanup_count: |
@@ -1515,12 +1518,6 @@ fork_out: | |||
1515 | return ERR_PTR(retval); | 1518 | return ERR_PTR(retval); |
1516 | } | 1519 | } |
1517 | 1520 | ||
1518 | noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | ||
1519 | { | ||
1520 | memset(regs, 0, sizeof(struct pt_regs)); | ||
1521 | return regs; | ||
1522 | } | ||
1523 | |||
1524 | static inline void init_idle_pids(struct pid_link *links) | 1521 | static inline void init_idle_pids(struct pid_link *links) |
1525 | { | 1522 | { |
1526 | enum pid_type type; | 1523 | enum pid_type type; |
@@ -1534,10 +1531,7 @@ static inline void init_idle_pids(struct pid_link *links) | |||
1534 | struct task_struct * __cpuinit fork_idle(int cpu) | 1531 | struct task_struct * __cpuinit fork_idle(int cpu) |
1535 | { | 1532 | { |
1536 | struct task_struct *task; | 1533 | struct task_struct *task; |
1537 | struct pt_regs regs; | 1534 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); |
1538 | |||
1539 | task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, | ||
1540 | &init_struct_pid, 0); | ||
1541 | if (!IS_ERR(task)) { | 1535 | if (!IS_ERR(task)) { |
1542 | init_idle_pids(task->pids); | 1536 | init_idle_pids(task->pids); |
1543 | init_idle(task, cpu); | 1537 | init_idle(task, cpu); |
@@ -1554,7 +1548,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) | |||
1554 | */ | 1548 | */ |
1555 | long do_fork(unsigned long clone_flags, | 1549 | long do_fork(unsigned long clone_flags, |
1556 | unsigned long stack_start, | 1550 | unsigned long stack_start, |
1557 | struct pt_regs *regs, | ||
1558 | unsigned long stack_size, | 1551 | unsigned long stack_size, |
1559 | int __user *parent_tidptr, | 1552 | int __user *parent_tidptr, |
1560 | int __user *child_tidptr) | 1553 | int __user *child_tidptr) |
@@ -1567,15 +1560,9 @@ long do_fork(unsigned long clone_flags, | |||
1567 | * Do some preliminary argument and permissions checking before we | 1560 | * Do some preliminary argument and permissions checking before we |
1568 | * actually start allocating stuff | 1561 | * actually start allocating stuff |
1569 | */ | 1562 | */ |
1570 | if (clone_flags & CLONE_NEWUSER) { | 1563 | if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { |
1571 | if (clone_flags & CLONE_THREAD) | 1564 | if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) |
1572 | return -EINVAL; | 1565 | return -EINVAL; |
1573 | /* hopefully this check will go away when userns support is | ||
1574 | * complete | ||
1575 | */ | ||
1576 | if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || | ||
1577 | !capable(CAP_SETGID)) | ||
1578 | return -EPERM; | ||
1579 | } | 1566 | } |
1580 | 1567 | ||
1581 | /* | 1568 | /* |
@@ -1584,7 +1571,7 @@ long do_fork(unsigned long clone_flags, | |||
1584 | * requested, no event is reported; otherwise, report if the event | 1571 | * requested, no event is reported; otherwise, report if the event |
1585 | * for the type of forking is enabled. | 1572 | * for the type of forking is enabled. |
1586 | */ | 1573 | */ |
1587 | if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { | 1574 | if (!(clone_flags & CLONE_UNTRACED)) { |
1588 | if (clone_flags & CLONE_VFORK) | 1575 | if (clone_flags & CLONE_VFORK) |
1589 | trace = PTRACE_EVENT_VFORK; | 1576 | trace = PTRACE_EVENT_VFORK; |
1590 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | 1577 | else if ((clone_flags & CSIGNAL) != SIGCHLD) |
@@ -1596,7 +1583,7 @@ long do_fork(unsigned long clone_flags, | |||
1596 | trace = 0; | 1583 | trace = 0; |
1597 | } | 1584 | } |
1598 | 1585 | ||
1599 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1586 | p = copy_process(clone_flags, stack_start, stack_size, |
1600 | child_tidptr, NULL, trace); | 1587 | child_tidptr, NULL, trace); |
1601 | /* | 1588 | /* |
1602 | * Do this prior waking up the new thread - the thread pointer | 1589 | * Do this prior waking up the new thread - the thread pointer |
@@ -1634,15 +1621,56 @@ long do_fork(unsigned long clone_flags, | |||
1634 | return nr; | 1621 | return nr; |
1635 | } | 1622 | } |
1636 | 1623 | ||
1637 | #ifdef CONFIG_GENERIC_KERNEL_THREAD | ||
1638 | /* | 1624 | /* |
1639 | * Create a kernel thread. | 1625 | * Create a kernel thread. |
1640 | */ | 1626 | */ |
1641 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) | 1627 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
1642 | { | 1628 | { |
1643 | return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, | 1629 | return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
1644 | (unsigned long)arg, NULL, NULL); | 1630 | (unsigned long)arg, NULL, NULL); |
1645 | } | 1631 | } |
1632 | |||
1633 | #ifdef __ARCH_WANT_SYS_FORK | ||
1634 | SYSCALL_DEFINE0(fork) | ||
1635 | { | ||
1636 | #ifdef CONFIG_MMU | ||
1637 | return do_fork(SIGCHLD, 0, 0, NULL, NULL); | ||
1638 | #else | ||
1639 | /* can not support in nommu mode */ | ||
1640 | return(-EINVAL); | ||
1641 | #endif | ||
1642 | } | ||
1643 | #endif | ||
1644 | |||
1645 | #ifdef __ARCH_WANT_SYS_VFORK | ||
1646 | SYSCALL_DEFINE0(vfork) | ||
1647 | { | ||
1648 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, | ||
1649 | 0, NULL, NULL); | ||
1650 | } | ||
1651 | #endif | ||
1652 | |||
1653 | #ifdef __ARCH_WANT_SYS_CLONE | ||
1654 | #ifdef CONFIG_CLONE_BACKWARDS | ||
1655 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1656 | int __user *, parent_tidptr, | ||
1657 | int, tls_val, | ||
1658 | int __user *, child_tidptr) | ||
1659 | #elif defined(CONFIG_CLONE_BACKWARDS2) | ||
1660 | SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, | ||
1661 | int __user *, parent_tidptr, | ||
1662 | int __user *, child_tidptr, | ||
1663 | int, tls_val) | ||
1664 | #else | ||
1665 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1666 | int __user *, parent_tidptr, | ||
1667 | int __user *, child_tidptr, | ||
1668 | int, tls_val) | ||
1669 | #endif | ||
1670 | { | ||
1671 | return do_fork(clone_flags, newsp, 0, | ||
1672 | parent_tidptr, child_tidptr); | ||
1673 | } | ||
1646 | #endif | 1674 | #endif |
1647 | 1675 | ||
1648 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | 1676 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN |
@@ -1694,7 +1722,8 @@ static int check_unshare_flags(unsigned long unshare_flags) | |||
1694 | { | 1722 | { |
1695 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1723 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1696 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1724 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1697 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | 1725 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
1726 | CLONE_NEWUSER|CLONE_NEWPID)) | ||
1698 | return -EINVAL; | 1727 | return -EINVAL; |
1699 | /* | 1728 | /* |
1700 | * Not implemented, but pretend it works if there is nothing to | 1729 | * Not implemented, but pretend it works if there is nothing to |
@@ -1761,19 +1790,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1761 | { | 1790 | { |
1762 | struct fs_struct *fs, *new_fs = NULL; | 1791 | struct fs_struct *fs, *new_fs = NULL; |
1763 | struct files_struct *fd, *new_fd = NULL; | 1792 | struct files_struct *fd, *new_fd = NULL; |
1793 | struct cred *new_cred = NULL; | ||
1764 | struct nsproxy *new_nsproxy = NULL; | 1794 | struct nsproxy *new_nsproxy = NULL; |
1765 | int do_sysvsem = 0; | 1795 | int do_sysvsem = 0; |
1766 | int err; | 1796 | int err; |
1767 | 1797 | ||
1768 | err = check_unshare_flags(unshare_flags); | 1798 | /* |
1769 | if (err) | 1799 | * If unsharing a user namespace must also unshare the thread. |
1770 | goto bad_unshare_out; | 1800 | */ |
1771 | 1801 | if (unshare_flags & CLONE_NEWUSER) | |
1802 | unshare_flags |= CLONE_THREAD; | ||
1803 | /* | ||
1804 | * If unsharing a pid namespace must also unshare the thread. | ||
1805 | */ | ||
1806 | if (unshare_flags & CLONE_NEWPID) | ||
1807 | unshare_flags |= CLONE_THREAD; | ||
1808 | /* | ||
1809 | * If unsharing a thread from a thread group, must also unshare vm. | ||
1810 | */ | ||
1811 | if (unshare_flags & CLONE_THREAD) | ||
1812 | unshare_flags |= CLONE_VM; | ||
1813 | /* | ||
1814 | * If unsharing vm, must also unshare signal handlers. | ||
1815 | */ | ||
1816 | if (unshare_flags & CLONE_VM) | ||
1817 | unshare_flags |= CLONE_SIGHAND; | ||
1772 | /* | 1818 | /* |
1773 | * If unsharing namespace, must also unshare filesystem information. | 1819 | * If unsharing namespace, must also unshare filesystem information. |
1774 | */ | 1820 | */ |
1775 | if (unshare_flags & CLONE_NEWNS) | 1821 | if (unshare_flags & CLONE_NEWNS) |
1776 | unshare_flags |= CLONE_FS; | 1822 | unshare_flags |= CLONE_FS; |
1823 | |||
1824 | err = check_unshare_flags(unshare_flags); | ||
1825 | if (err) | ||
1826 | goto bad_unshare_out; | ||
1777 | /* | 1827 | /* |
1778 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1828 | * CLONE_NEWIPC must also detach from the undolist: after switching |
1779 | * to a new ipc namespace, the semaphore arrays from the old | 1829 | * to a new ipc namespace, the semaphore arrays from the old |
@@ -1787,11 +1837,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1787 | err = unshare_fd(unshare_flags, &new_fd); | 1837 | err = unshare_fd(unshare_flags, &new_fd); |
1788 | if (err) | 1838 | if (err) |
1789 | goto bad_unshare_cleanup_fs; | 1839 | goto bad_unshare_cleanup_fs; |
1790 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); | 1840 | err = unshare_userns(unshare_flags, &new_cred); |
1791 | if (err) | 1841 | if (err) |
1792 | goto bad_unshare_cleanup_fd; | 1842 | goto bad_unshare_cleanup_fd; |
1843 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | ||
1844 | new_cred, new_fs); | ||
1845 | if (err) | ||
1846 | goto bad_unshare_cleanup_cred; | ||
1793 | 1847 | ||
1794 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { | 1848 | if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { |
1795 | if (do_sysvsem) { | 1849 | if (do_sysvsem) { |
1796 | /* | 1850 | /* |
1797 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1851 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
@@ -1824,11 +1878,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1824 | } | 1878 | } |
1825 | 1879 | ||
1826 | task_unlock(current); | 1880 | task_unlock(current); |
1881 | |||
1882 | if (new_cred) { | ||
1883 | /* Install the new user namespace */ | ||
1884 | commit_creds(new_cred); | ||
1885 | new_cred = NULL; | ||
1886 | } | ||
1827 | } | 1887 | } |
1828 | 1888 | ||
1829 | if (new_nsproxy) | 1889 | if (new_nsproxy) |
1830 | put_nsproxy(new_nsproxy); | 1890 | put_nsproxy(new_nsproxy); |
1831 | 1891 | ||
1892 | bad_unshare_cleanup_cred: | ||
1893 | if (new_cred) | ||
1894 | put_cred(new_cred); | ||
1832 | bad_unshare_cleanup_fd: | 1895 | bad_unshare_cleanup_fd: |
1833 | if (new_fd) | 1896 | if (new_fd) |
1834 | put_files_struct(new_fd); | 1897 | put_files_struct(new_fd); |
diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) | |||
116 | return false; | 116 | return false; |
117 | } | 117 | } |
118 | 118 | ||
119 | if (!(p->flags & PF_KTHREAD)) { | 119 | if (!(p->flags & PF_KTHREAD)) |
120 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
121 | /* | 121 | else |
122 | * fake_signal_wake_up() goes through p's scheduler | ||
123 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
124 | * TASK_RUNNING transition can't race with task state | ||
125 | * testing in try_to_freeze_tasks(). | ||
126 | */ | ||
127 | } else { | ||
128 | wake_up_state(p, TASK_INTERRUPTIBLE); | 122 | wake_up_state(p, TASK_INTERRUPTIBLE); |
129 | } | ||
130 | 123 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | 124 | spin_unlock_irqrestore(&freezer_lock, flags); |
132 | return true; | 125 | return true; |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 57d86d07221e..3aca9f29d30e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq) | |||
272 | 272 | ||
273 | raw_spin_lock_irq(&desc->lock); | 273 | raw_spin_lock_irq(&desc->lock); |
274 | 274 | ||
275 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 276 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 277 | ||
277 | action = desc->action; | 278 | action = desc->action; |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4e69e24d3d7d..96f3a1d9c379 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
177 | irq_base = irq_alloc_descs(first_irq, first_irq, size, | 177 | irq_base = irq_alloc_descs(first_irq, first_irq, size, |
178 | of_node_to_nid(of_node)); | 178 | of_node_to_nid(of_node)); |
179 | if (irq_base < 0) { | 179 | if (irq_base < 0) { |
180 | WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", | 180 | pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", |
181 | first_irq); | 181 | first_irq); |
182 | irq_base = first_irq; | 182 | irq_base = first_irq; |
183 | } | 183 | } |
184 | } else | 184 | } else |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c69326aa773..e49a288fa479 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
616 | return ret; | 616 | return ret; |
617 | } | 617 | } |
618 | 618 | ||
619 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
620 | int irq_set_parent(int irq, int parent_irq) | ||
621 | { | ||
622 | unsigned long flags; | ||
623 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); | ||
624 | |||
625 | if (!desc) | ||
626 | return -EINVAL; | ||
627 | |||
628 | desc->parent_irq = parent_irq; | ||
629 | |||
630 | irq_put_desc_unlock(desc, flags); | ||
631 | return 0; | ||
632 | } | ||
633 | #endif | ||
634 | |||
619 | /* | 635 | /* |
620 | * Default primary interrupt handler for threaded interrupts. Is | 636 | * Default primary interrupt handler for threaded interrupts. Is |
621 | * assigned as primary handler when request_threaded_irq is called | 637 | * assigned as primary handler when request_threaded_irq is called |
@@ -716,6 +732,7 @@ static void | |||
716 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 732 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
717 | { | 733 | { |
718 | cpumask_var_t mask; | 734 | cpumask_var_t mask; |
735 | bool valid = true; | ||
719 | 736 | ||
720 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) | 737 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) |
721 | return; | 738 | return; |
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
730 | } | 747 | } |
731 | 748 | ||
732 | raw_spin_lock_irq(&desc->lock); | 749 | raw_spin_lock_irq(&desc->lock); |
733 | cpumask_copy(mask, desc->irq_data.affinity); | 750 | /* |
751 | * This code is triggered unconditionally. Check the affinity | ||
752 | * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. | ||
753 | */ | ||
754 | if (desc->irq_data.affinity) | ||
755 | cpumask_copy(mask, desc->irq_data.affinity); | ||
756 | else | ||
757 | valid = false; | ||
734 | raw_spin_unlock_irq(&desc->lock); | 758 | raw_spin_unlock_irq(&desc->lock); |
735 | 759 | ||
736 | set_cpus_allowed_ptr(current, mask); | 760 | if (valid) |
761 | set_cpus_allowed_ptr(current, mask); | ||
737 | free_cpumask_var(mask); | 762 | free_cpumask_var(mask); |
738 | } | 763 | } |
739 | #else | 764 | #else |
@@ -793,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) | |||
793 | action = kthread_data(tsk); | 818 | action = kthread_data(tsk); |
794 | 819 | ||
795 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 820 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
796 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | 821 | tsk->comm, tsk->pid, action->irq); |
797 | 822 | ||
798 | 823 | ||
799 | desc = irq_to_desc(action->irq); | 824 | desc = irq_to_desc(action->irq); |
@@ -833,6 +858,8 @@ static int irq_thread(void *data) | |||
833 | init_task_work(&on_exit_work, irq_thread_dtor); | 858 | init_task_work(&on_exit_work, irq_thread_dtor); |
834 | task_work_add(current, &on_exit_work, false); | 859 | task_work_add(current, &on_exit_work, false); |
835 | 860 | ||
861 | irq_thread_check_affinity(desc, action); | ||
862 | |||
836 | while (!irq_wait_for_interrupt(action)) { | 863 | while (!irq_wait_for_interrupt(action)) { |
837 | irqreturn_t action_ret; | 864 | irqreturn_t action_ret; |
838 | 865 | ||
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
936 | */ | 963 | */ |
937 | get_task_struct(t); | 964 | get_task_struct(t); |
938 | new->thread = t; | 965 | new->thread = t; |
966 | /* | ||
967 | * Tell the thread to set its affinity. This is | ||
968 | * important for shared interrupt handlers as we do | ||
969 | * not invoke setup_affinity() for the secondary | ||
970 | * handlers as everything is already set up. Even for | ||
971 | * interrupts marked with IRQF_NO_BALANCE this is | ||
972 | * correct as we want the thread to move to the cpu(s) | ||
973 | * on which the requesting code placed the interrupt. | ||
974 | */ | ||
975 | set_bit(IRQTF_AFFINITY, &new->thread_flags); | ||
939 | } | 976 | } |
940 | 977 | ||
941 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | 978 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 6454db7b6a4d..9065107f083e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
74 | if (!desc->irq_data.chip->irq_retrigger || | 74 | if (!desc->irq_data.chip->irq_retrigger || |
75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
76 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 76 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
77 | /* | ||
78 | * If the interrupt has a parent irq and runs | ||
79 | * in the thread context of the parent irq, | ||
80 | * retrigger the parent. | ||
81 | */ | ||
82 | if (desc->parent_irq && | ||
83 | irq_settings_is_nested_thread(desc)) | ||
84 | irq = desc->parent_irq; | ||
77 | /* Set it pending and activate the softirq: */ | 85 | /* Set it pending and activate the softirq: */ |
78 | set_bit(irq, irqs_resend); | 86 | set_bit(irq, irqs_resend); |
79 | tasklet_schedule(&resend_tasklet); | 87 | tasklet_schedule(&resend_tasklet); |
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 30b7b225306c..e30ac0fe61c3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/string.h> | 4 | #include <linux/string.h> |
5 | #include <linux/random.h> | 5 | #include <linux/random.h> |
6 | #include <linux/module.h> | 6 | #include <linux/module.h> |
7 | #include <linux/ptrace.h> | ||
7 | #include <linux/init.h> | 8 | #include <linux/init.h> |
8 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
9 | #include <linux/cache.h> | 10 | #include <linux/cache.h> |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e386831..0023a87e8de6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data) | |||
219 | 219 | ||
220 | commit_creds(new); | 220 | commit_creds(new); |
221 | 221 | ||
222 | retval = kernel_execve(sub_info->path, | 222 | retval = do_execve(sub_info->path, |
223 | (const char *const *)sub_info->argv, | 223 | (const char __user *const __user *)sub_info->argv, |
224 | (const char *const *)sub_info->envp); | 224 | (const char __user *const __user *)sub_info->envp); |
225 | if (!retval) | 225 | if (!retval) |
226 | return 0; | 226 | return 0; |
227 | 227 | ||
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..6ada93c23a9a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |||
26 | static struct kobj_attribute _name##_attr = \ | 26 | static struct kobj_attribute _name##_attr = \ |
27 | __ATTR(_name, 0644, _name##_show, _name##_store) | 27 | __ATTR(_name, 0644, _name##_show, _name##_store) |
28 | 28 | ||
29 | #if defined(CONFIG_HOTPLUG) | ||
30 | /* current uevent sequence number */ | 29 | /* current uevent sequence number */ |
31 | static ssize_t uevent_seqnum_show(struct kobject *kobj, | 30 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
32 | struct kobj_attribute *attr, char *buf) | 31 | struct kobj_attribute *attr, char *buf) |
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, | |||
54 | return count; | 53 | return count; |
55 | } | 54 | } |
56 | KERNEL_ATTR_RW(uevent_helper); | 55 | KERNEL_ATTR_RW(uevent_helper); |
57 | #endif | 56 | |
58 | 57 | ||
59 | #ifdef CONFIG_PROFILING | 58 | #ifdef CONFIG_PROFILING |
60 | static ssize_t profiling_show(struct kobject *kobj, | 59 | static ssize_t profiling_show(struct kobject *kobj, |
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj, | |||
141 | } | 140 | } |
142 | KERNEL_ATTR_RO(fscaps); | 141 | KERNEL_ATTR_RO(fscaps); |
143 | 142 | ||
143 | int rcu_expedited; | ||
144 | static ssize_t rcu_expedited_show(struct kobject *kobj, | ||
145 | struct kobj_attribute *attr, char *buf) | ||
146 | { | ||
147 | return sprintf(buf, "%d\n", rcu_expedited); | ||
148 | } | ||
149 | static ssize_t rcu_expedited_store(struct kobject *kobj, | ||
150 | struct kobj_attribute *attr, | ||
151 | const char *buf, size_t count) | ||
152 | { | ||
153 | if (kstrtoint(buf, 0, &rcu_expedited)) | ||
154 | return -EINVAL; | ||
155 | |||
156 | return count; | ||
157 | } | ||
158 | KERNEL_ATTR_RW(rcu_expedited); | ||
159 | |||
144 | /* | 160 | /* |
145 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 161 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
146 | */ | 162 | */ |
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj); | |||
169 | 185 | ||
170 | static struct attribute * kernel_attrs[] = { | 186 | static struct attribute * kernel_attrs[] = { |
171 | &fscaps_attr.attr, | 187 | &fscaps_attr.attr, |
172 | #if defined(CONFIG_HOTPLUG) | ||
173 | &uevent_seqnum_attr.attr, | 188 | &uevent_seqnum_attr.attr, |
174 | &uevent_helper_attr.attr, | 189 | &uevent_helper_attr.attr, |
175 | #endif | ||
176 | #ifdef CONFIG_PROFILING | 190 | #ifdef CONFIG_PROFILING |
177 | &profiling_attr.attr, | 191 | &profiling_attr.attr, |
178 | #endif | 192 | #endif |
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = { | |||
182 | &kexec_crash_size_attr.attr, | 196 | &kexec_crash_size_attr.attr, |
183 | &vmcoreinfo_attr.attr, | 197 | &vmcoreinfo_attr.attr, |
184 | #endif | 198 | #endif |
199 | &rcu_expedited_attr.attr, | ||
185 | NULL | 200 | NULL |
186 | }; | 201 | }; |
187 | 202 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index 29fb60caecb5..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -428,7 +428,7 @@ int kthreadd(void *unused) | |||
428 | set_task_comm(tsk, "kthreadd"); | 428 | set_task_comm(tsk, "kthreadd"); |
429 | ignore_signals(tsk); | 429 | ignore_signals(tsk); |
430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
431 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 431 | set_mems_allowed(node_states[N_MEMORY]); |
432 | 432 | ||
433 | current->flags |= PF_NOFREEZE; | 433 | current->flags |= PF_NOFREEZE; |
434 | 434 | ||
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 91c32a0b612c..b2c71c5873e4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v) | |||
39 | 39 | ||
40 | static void print_name(struct seq_file *m, struct lock_class *class) | 40 | static void print_name(struct seq_file *m, struct lock_class *class) |
41 | { | 41 | { |
42 | char str[128]; | 42 | char str[KSYM_NAME_LEN]; |
43 | const char *name = class->name; | 43 | const char *name = class->name; |
44 | 44 | ||
45 | if (!name) { | 45 | if (!name) { |
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/kernel/modsign_certificate.S | |||
@@ -0,0 +1,19 @@ | |||
1 | /* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ | ||
2 | #ifndef SYMBOL_PREFIX | ||
3 | #define ASM_SYMBOL(sym) sym | ||
4 | #else | ||
5 | #define PASTE2(x,y) x##y | ||
6 | #define PASTE(x,y) PASTE2(x,y) | ||
7 | #define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) | ||
8 | #endif | ||
9 | |||
10 | #define GLOBAL(name) \ | ||
11 | .globl ASM_SYMBOL(name); \ | ||
12 | ASM_SYMBOL(name): | ||
13 | |||
14 | .section ".init.data","aw" | ||
15 | |||
16 | GLOBAL(modsign_certificate_list) | ||
17 | .incbin "signing_key.x509" | ||
18 | .incbin "extra_certificates" | ||
19 | GLOBAL(modsign_certificate_list_end) | ||
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 767e559dfb10..2b6e69909c39 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c | |||
@@ -20,12 +20,6 @@ struct key *modsign_keyring; | |||
20 | 20 | ||
21 | extern __initdata const u8 modsign_certificate_list[]; | 21 | extern __initdata const u8 modsign_certificate_list[]; |
22 | extern __initdata const u8 modsign_certificate_list_end[]; | 22 | extern __initdata const u8 modsign_certificate_list_end[]; |
23 | asm(".section .init.data,\"aw\"\n" | ||
24 | SYMBOL_PREFIX "modsign_certificate_list:\n" | ||
25 | ".incbin \"signing_key.x509\"\n" | ||
26 | ".incbin \"extra_certificates\"\n" | ||
27 | SYMBOL_PREFIX "modsign_certificate_list_end:" | ||
28 | ); | ||
29 | 23 | ||
30 | /* | 24 | /* |
31 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice | 25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice |
@@ -40,18 +34,15 @@ static __init int module_verify_init(void) | |||
40 | { | 34 | { |
41 | pr_notice("Initialise module verification\n"); | 35 | pr_notice("Initialise module verification\n"); |
42 | 36 | ||
43 | modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", | 37 | modsign_keyring = keyring_alloc(".module_sign", |
44 | KUIDT_INIT(0), KGIDT_INIT(0), | 38 | KUIDT_INIT(0), KGIDT_INIT(0), |
45 | current_cred(), | 39 | current_cred(), |
46 | (KEY_POS_ALL & ~KEY_POS_SETATTR) | | 40 | ((KEY_POS_ALL & ~KEY_POS_SETATTR) | |
47 | KEY_USR_VIEW | KEY_USR_READ, | 41 | KEY_USR_VIEW | KEY_USR_READ), |
48 | KEY_ALLOC_NOT_IN_QUOTA); | 42 | KEY_ALLOC_NOT_IN_QUOTA, NULL); |
49 | if (IS_ERR(modsign_keyring)) | 43 | if (IS_ERR(modsign_keyring)) |
50 | panic("Can't allocate module signing keyring\n"); | 44 | panic("Can't allocate module signing keyring\n"); |
51 | 45 | ||
52 | if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) | ||
53 | panic("Can't instantiate module signing keyring\n"); | ||
54 | |||
55 | return 0; | 46 | return 0; |
56 | } | 47 | } |
57 | 48 | ||
diff --git a/kernel/module.c b/kernel/module.c index 6e48c3a43599..250092c1d57d 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/ftrace_event.h> | 21 | #include <linux/ftrace_event.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/kallsyms.h> | 23 | #include <linux/kallsyms.h> |
24 | #include <linux/file.h> | ||
24 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
25 | #include <linux/sysfs.h> | 26 | #include <linux/sysfs.h> |
26 | #include <linux/kernel.h> | 27 | #include <linux/kernel.h> |
@@ -28,6 +29,7 @@ | |||
28 | #include <linux/vmalloc.h> | 29 | #include <linux/vmalloc.h> |
29 | #include <linux/elf.h> | 30 | #include <linux/elf.h> |
30 | #include <linux/proc_fs.h> | 31 | #include <linux/proc_fs.h> |
32 | #include <linux/security.h> | ||
31 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
32 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
33 | #include <linux/fcntl.h> | 35 | #include <linux/fcntl.h> |
@@ -59,6 +61,7 @@ | |||
59 | #include <linux/pfn.h> | 61 | #include <linux/pfn.h> |
60 | #include <linux/bsearch.h> | 62 | #include <linux/bsearch.h> |
61 | #include <linux/fips.h> | 63 | #include <linux/fips.h> |
64 | #include <uapi/linux/module.h> | ||
62 | #include "module-internal.h" | 65 | #include "module-internal.h" |
63 | 66 | ||
64 | #define CREATE_TRACE_POINTS | 67 | #define CREATE_TRACE_POINTS |
@@ -372,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms, | |||
372 | printk(KERN_WARNING "Symbol %s is being used " | 375 | printk(KERN_WARNING "Symbol %s is being used " |
373 | "by a non-GPL module, which will not " | 376 | "by a non-GPL module, which will not " |
374 | "be allowed in the future\n", fsa->name); | 377 | "be allowed in the future\n", fsa->name); |
375 | printk(KERN_WARNING "Please see the file " | ||
376 | "Documentation/feature-removal-schedule.txt " | ||
377 | "in the kernel source tree for more details.\n"); | ||
378 | } | 378 | } |
379 | } | 379 | } |
380 | 380 | ||
@@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
2282 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; | 2282 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
2283 | Elf_Shdr *strsect = info->sechdrs + info->index.str; | 2283 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
2284 | const Elf_Sym *src; | 2284 | const Elf_Sym *src; |
2285 | unsigned int i, nsrc, ndst, strtab_size; | 2285 | unsigned int i, nsrc, ndst, strtab_size = 0; |
2286 | 2286 | ||
2287 | /* Put symbol section at end of init part of module. */ | 2287 | /* Put symbol section at end of init part of module. */ |
2288 | symsect->sh_flags |= SHF_ALLOC; | 2288 | symsect->sh_flags |= SHF_ALLOC; |
@@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
2293 | src = (void *)info->hdr + symsect->sh_offset; | 2293 | src = (void *)info->hdr + symsect->sh_offset; |
2294 | nsrc = symsect->sh_size / sizeof(*src); | 2294 | nsrc = symsect->sh_size / sizeof(*src); |
2295 | 2295 | ||
2296 | /* strtab always starts with a nul, so offset 0 is the empty string. */ | ||
2297 | strtab_size = 1; | ||
2298 | |||
2299 | /* Compute total space required for the core symbols' strtab. */ | 2296 | /* Compute total space required for the core symbols' strtab. */ |
2300 | for (ndst = i = 0; i < nsrc; i++) { | 2297 | for (ndst = i = 0; i < nsrc; i++) { |
2301 | if (i == 0 || | 2298 | if (i == 0 || |
@@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
2337 | mod->core_symtab = dst = mod->module_core + info->symoffs; | 2334 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2338 | mod->core_strtab = s = mod->module_core + info->stroffs; | 2335 | mod->core_strtab = s = mod->module_core + info->stroffs; |
2339 | src = mod->symtab; | 2336 | src = mod->symtab; |
2340 | *s++ = 0; | ||
2341 | for (ndst = i = 0; i < mod->num_symtab; i++) { | 2337 | for (ndst = i = 0; i < mod->num_symtab; i++) { |
2342 | if (i == 0 || | 2338 | if (i == 0 || |
2343 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { | 2339 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { |
@@ -2378,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) | |||
2378 | 2374 | ||
2379 | void * __weak module_alloc(unsigned long size) | 2375 | void * __weak module_alloc(unsigned long size) |
2380 | { | 2376 | { |
2381 | return size == 0 ? NULL : vmalloc_exec(size); | 2377 | return vmalloc_exec(size); |
2382 | } | 2378 | } |
2383 | 2379 | ||
2384 | static void *module_alloc_update_bounds(unsigned long size) | 2380 | static void *module_alloc_update_bounds(unsigned long size) |
@@ -2425,18 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod, | |||
2425 | #endif | 2421 | #endif |
2426 | 2422 | ||
2427 | #ifdef CONFIG_MODULE_SIG | 2423 | #ifdef CONFIG_MODULE_SIG |
2428 | static int module_sig_check(struct load_info *info, | 2424 | static int module_sig_check(struct load_info *info) |
2429 | const void *mod, unsigned long *_len) | ||
2430 | { | 2425 | { |
2431 | int err = -ENOKEY; | 2426 | int err = -ENOKEY; |
2432 | unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; | 2427 | const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; |
2433 | unsigned long len = *_len; | 2428 | const void *mod = info->hdr; |
2434 | 2429 | ||
2435 | if (len > markerlen && | 2430 | if (info->len > markerlen && |
2436 | memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { | 2431 | memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { |
2437 | /* We truncate the module to discard the signature */ | 2432 | /* We truncate the module to discard the signature */ |
2438 | *_len -= markerlen; | 2433 | info->len -= markerlen; |
2439 | err = mod_verify_sig(mod, _len); | 2434 | err = mod_verify_sig(mod, &info->len); |
2440 | } | 2435 | } |
2441 | 2436 | ||
2442 | if (!err) { | 2437 | if (!err) { |
@@ -2454,59 +2449,107 @@ static int module_sig_check(struct load_info *info, | |||
2454 | return err; | 2449 | return err; |
2455 | } | 2450 | } |
2456 | #else /* !CONFIG_MODULE_SIG */ | 2451 | #else /* !CONFIG_MODULE_SIG */ |
2457 | static int module_sig_check(struct load_info *info, | 2452 | static int module_sig_check(struct load_info *info) |
2458 | void *mod, unsigned long *len) | ||
2459 | { | 2453 | { |
2460 | return 0; | 2454 | return 0; |
2461 | } | 2455 | } |
2462 | #endif /* !CONFIG_MODULE_SIG */ | 2456 | #endif /* !CONFIG_MODULE_SIG */ |
2463 | 2457 | ||
2464 | /* Sets info->hdr, info->len and info->sig_ok. */ | 2458 | /* Sanity checks against invalid binaries, wrong arch, weird elf version. */ |
2465 | static int copy_and_check(struct load_info *info, | 2459 | static int elf_header_check(struct load_info *info) |
2466 | const void __user *umod, unsigned long len, | 2460 | { |
2467 | const char __user *uargs) | 2461 | if (info->len < sizeof(*(info->hdr))) |
2462 | return -ENOEXEC; | ||
2463 | |||
2464 | if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
2465 | || info->hdr->e_type != ET_REL | ||
2466 | || !elf_check_arch(info->hdr) | ||
2467 | || info->hdr->e_shentsize != sizeof(Elf_Shdr)) | ||
2468 | return -ENOEXEC; | ||
2469 | |||
2470 | if (info->hdr->e_shoff >= info->len | ||
2471 | || (info->hdr->e_shnum * sizeof(Elf_Shdr) > | ||
2472 | info->len - info->hdr->e_shoff)) | ||
2473 | return -ENOEXEC; | ||
2474 | |||
2475 | return 0; | ||
2476 | } | ||
2477 | |||
2478 | /* Sets info->hdr and info->len. */ | ||
2479 | static int copy_module_from_user(const void __user *umod, unsigned long len, | ||
2480 | struct load_info *info) | ||
2468 | { | 2481 | { |
2469 | int err; | 2482 | int err; |
2470 | Elf_Ehdr *hdr; | ||
2471 | 2483 | ||
2472 | if (len < sizeof(*hdr)) | 2484 | info->len = len; |
2485 | if (info->len < sizeof(*(info->hdr))) | ||
2473 | return -ENOEXEC; | 2486 | return -ENOEXEC; |
2474 | 2487 | ||
2488 | err = security_kernel_module_from_file(NULL); | ||
2489 | if (err) | ||
2490 | return err; | ||
2491 | |||
2475 | /* Suck in entire file: we'll want most of it. */ | 2492 | /* Suck in entire file: we'll want most of it. */ |
2476 | if ((hdr = vmalloc(len)) == NULL) | 2493 | info->hdr = vmalloc(info->len); |
2494 | if (!info->hdr) | ||
2477 | return -ENOMEM; | 2495 | return -ENOMEM; |
2478 | 2496 | ||
2479 | if (copy_from_user(hdr, umod, len) != 0) { | 2497 | if (copy_from_user(info->hdr, umod, info->len) != 0) { |
2480 | err = -EFAULT; | 2498 | vfree(info->hdr); |
2481 | goto free_hdr; | 2499 | return -EFAULT; |
2482 | } | 2500 | } |
2483 | 2501 | ||
2484 | err = module_sig_check(info, hdr, &len); | 2502 | return 0; |
2503 | } | ||
2504 | |||
2505 | /* Sets info->hdr and info->len. */ | ||
2506 | static int copy_module_from_fd(int fd, struct load_info *info) | ||
2507 | { | ||
2508 | struct file *file; | ||
2509 | int err; | ||
2510 | struct kstat stat; | ||
2511 | loff_t pos; | ||
2512 | ssize_t bytes = 0; | ||
2513 | |||
2514 | file = fget(fd); | ||
2515 | if (!file) | ||
2516 | return -ENOEXEC; | ||
2517 | |||
2518 | err = security_kernel_module_from_file(file); | ||
2485 | if (err) | 2519 | if (err) |
2486 | goto free_hdr; | 2520 | goto out; |
2487 | 2521 | ||
2488 | /* Sanity checks against insmoding binaries or wrong arch, | 2522 | err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); |
2489 | weird elf version */ | 2523 | if (err) |
2490 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 | 2524 | goto out; |
2491 | || hdr->e_type != ET_REL | ||
2492 | || !elf_check_arch(hdr) | ||
2493 | || hdr->e_shentsize != sizeof(Elf_Shdr)) { | ||
2494 | err = -ENOEXEC; | ||
2495 | goto free_hdr; | ||
2496 | } | ||
2497 | 2525 | ||
2498 | if (hdr->e_shoff >= len || | 2526 | if (stat.size > INT_MAX) { |
2499 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | 2527 | err = -EFBIG; |
2500 | err = -ENOEXEC; | 2528 | goto out; |
2501 | goto free_hdr; | 2529 | } |
2530 | info->hdr = vmalloc(stat.size); | ||
2531 | if (!info->hdr) { | ||
2532 | err = -ENOMEM; | ||
2533 | goto out; | ||
2502 | } | 2534 | } |
2503 | 2535 | ||
2504 | info->hdr = hdr; | 2536 | pos = 0; |
2505 | info->len = len; | 2537 | while (pos < stat.size) { |
2506 | return 0; | 2538 | bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, |
2539 | stat.size - pos); | ||
2540 | if (bytes < 0) { | ||
2541 | vfree(info->hdr); | ||
2542 | err = bytes; | ||
2543 | goto out; | ||
2544 | } | ||
2545 | if (bytes == 0) | ||
2546 | break; | ||
2547 | pos += bytes; | ||
2548 | } | ||
2549 | info->len = pos; | ||
2507 | 2550 | ||
2508 | free_hdr: | 2551 | out: |
2509 | vfree(hdr); | 2552 | fput(file); |
2510 | return err; | 2553 | return err; |
2511 | } | 2554 | } |
2512 | 2555 | ||
@@ -2515,7 +2558,7 @@ static void free_copy(struct load_info *info) | |||
2515 | vfree(info->hdr); | 2558 | vfree(info->hdr); |
2516 | } | 2559 | } |
2517 | 2560 | ||
2518 | static int rewrite_section_headers(struct load_info *info) | 2561 | static int rewrite_section_headers(struct load_info *info, int flags) |
2519 | { | 2562 | { |
2520 | unsigned int i; | 2563 | unsigned int i; |
2521 | 2564 | ||
@@ -2543,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info) | |||
2543 | } | 2586 | } |
2544 | 2587 | ||
2545 | /* Track but don't keep modinfo and version sections. */ | 2588 | /* Track but don't keep modinfo and version sections. */ |
2546 | info->index.vers = find_sec(info, "__versions"); | 2589 | if (flags & MODULE_INIT_IGNORE_MODVERSIONS) |
2590 | info->index.vers = 0; /* Pretend no __versions section! */ | ||
2591 | else | ||
2592 | info->index.vers = find_sec(info, "__versions"); | ||
2547 | info->index.info = find_sec(info, ".modinfo"); | 2593 | info->index.info = find_sec(info, ".modinfo"); |
2548 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2594 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; |
2549 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2595 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -2558,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info) | |||
2558 | * Return the temporary module pointer (we'll replace it with the final | 2604 | * Return the temporary module pointer (we'll replace it with the final |
2559 | * one when we move the module sections around). | 2605 | * one when we move the module sections around). |
2560 | */ | 2606 | */ |
2561 | static struct module *setup_load_info(struct load_info *info) | 2607 | static struct module *setup_load_info(struct load_info *info, int flags) |
2562 | { | 2608 | { |
2563 | unsigned int i; | 2609 | unsigned int i; |
2564 | int err; | 2610 | int err; |
@@ -2569,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info) | |||
2569 | info->secstrings = (void *)info->hdr | 2615 | info->secstrings = (void *)info->hdr |
2570 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; | 2616 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; |
2571 | 2617 | ||
2572 | err = rewrite_section_headers(info); | 2618 | err = rewrite_section_headers(info, flags); |
2573 | if (err) | 2619 | if (err) |
2574 | return ERR_PTR(err); | 2620 | return ERR_PTR(err); |
2575 | 2621 | ||
@@ -2607,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info) | |||
2607 | return mod; | 2653 | return mod; |
2608 | } | 2654 | } |
2609 | 2655 | ||
2610 | static int check_modinfo(struct module *mod, struct load_info *info) | 2656 | static int check_modinfo(struct module *mod, struct load_info *info, int flags) |
2611 | { | 2657 | { |
2612 | const char *modmagic = get_modinfo(info, "vermagic"); | 2658 | const char *modmagic = get_modinfo(info, "vermagic"); |
2613 | int err; | 2659 | int err; |
2614 | 2660 | ||
2661 | if (flags & MODULE_INIT_IGNORE_VERMAGIC) | ||
2662 | modmagic = NULL; | ||
2663 | |||
2615 | /* This is allowed: modprobe --force will invalidate it. */ | 2664 | /* This is allowed: modprobe --force will invalidate it. */ |
2616 | if (!modmagic) { | 2665 | if (!modmagic) { |
2617 | err = try_to_force_load(mod, "bad vermagic"); | 2666 | err = try_to_force_load(mod, "bad vermagic"); |
@@ -2741,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2741 | memset(ptr, 0, mod->core_size); | 2790 | memset(ptr, 0, mod->core_size); |
2742 | mod->module_core = ptr; | 2791 | mod->module_core = ptr; |
2743 | 2792 | ||
2744 | ptr = module_alloc_update_bounds(mod->init_size); | 2793 | if (mod->init_size) { |
2745 | /* | 2794 | ptr = module_alloc_update_bounds(mod->init_size); |
2746 | * The pointer to this block is stored in the module structure | 2795 | /* |
2747 | * which is inside the block. This block doesn't need to be | 2796 | * The pointer to this block is stored in the module structure |
2748 | * scanned as it contains data and code that will be freed | 2797 | * which is inside the block. This block doesn't need to be |
2749 | * after the module is initialized. | 2798 | * scanned as it contains data and code that will be freed |
2750 | */ | 2799 | * after the module is initialized. |
2751 | kmemleak_ignore(ptr); | 2800 | */ |
2752 | if (!ptr && mod->init_size) { | 2801 | kmemleak_ignore(ptr); |
2753 | module_free(mod, mod->module_core); | 2802 | if (!ptr) { |
2754 | return -ENOMEM; | 2803 | module_free(mod, mod->module_core); |
2755 | } | 2804 | return -ENOMEM; |
2756 | memset(ptr, 0, mod->init_size); | 2805 | } |
2757 | mod->module_init = ptr; | 2806 | memset(ptr, 0, mod->init_size); |
2807 | mod->module_init = ptr; | ||
2808 | } else | ||
2809 | mod->module_init = NULL; | ||
2758 | 2810 | ||
2759 | /* Transfer each section which specifies SHF_ALLOC */ | 2811 | /* Transfer each section which specifies SHF_ALLOC */ |
2760 | pr_debug("final section addresses:\n"); | 2812 | pr_debug("final section addresses:\n"); |
@@ -2847,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | |||
2847 | return 0; | 2899 | return 0; |
2848 | } | 2900 | } |
2849 | 2901 | ||
2850 | static struct module *layout_and_allocate(struct load_info *info) | 2902 | static struct module *layout_and_allocate(struct load_info *info, int flags) |
2851 | { | 2903 | { |
2852 | /* Module within temporary copy. */ | 2904 | /* Module within temporary copy. */ |
2853 | struct module *mod; | 2905 | struct module *mod; |
2854 | Elf_Shdr *pcpusec; | 2906 | Elf_Shdr *pcpusec; |
2855 | int err; | 2907 | int err; |
2856 | 2908 | ||
2857 | mod = setup_load_info(info); | 2909 | mod = setup_load_info(info, flags); |
2858 | if (IS_ERR(mod)) | 2910 | if (IS_ERR(mod)) |
2859 | return mod; | 2911 | return mod; |
2860 | 2912 | ||
2861 | err = check_modinfo(mod, info); | 2913 | err = check_modinfo(mod, info, flags); |
2862 | if (err) | 2914 | if (err) |
2863 | return ERR_PTR(err); | 2915 | return ERR_PTR(err); |
2864 | 2916 | ||
@@ -2945,33 +2997,124 @@ static bool finished_loading(const char *name) | |||
2945 | return ret; | 2997 | return ret; |
2946 | } | 2998 | } |
2947 | 2999 | ||
3000 | /* Call module constructors. */ | ||
3001 | static void do_mod_ctors(struct module *mod) | ||
3002 | { | ||
3003 | #ifdef CONFIG_CONSTRUCTORS | ||
3004 | unsigned long i; | ||
3005 | |||
3006 | for (i = 0; i < mod->num_ctors; i++) | ||
3007 | mod->ctors[i](); | ||
3008 | #endif | ||
3009 | } | ||
3010 | |||
3011 | /* This is where the real work happens */ | ||
3012 | static int do_init_module(struct module *mod) | ||
3013 | { | ||
3014 | int ret = 0; | ||
3015 | |||
3016 | blocking_notifier_call_chain(&module_notify_list, | ||
3017 | MODULE_STATE_COMING, mod); | ||
3018 | |||
3019 | /* Set RO and NX regions for core */ | ||
3020 | set_section_ro_nx(mod->module_core, | ||
3021 | mod->core_text_size, | ||
3022 | mod->core_ro_size, | ||
3023 | mod->core_size); | ||
3024 | |||
3025 | /* Set RO and NX regions for init */ | ||
3026 | set_section_ro_nx(mod->module_init, | ||
3027 | mod->init_text_size, | ||
3028 | mod->init_ro_size, | ||
3029 | mod->init_size); | ||
3030 | |||
3031 | do_mod_ctors(mod); | ||
3032 | /* Start the module */ | ||
3033 | if (mod->init != NULL) | ||
3034 | ret = do_one_initcall(mod->init); | ||
3035 | if (ret < 0) { | ||
3036 | /* Init routine failed: abort. Try to protect us from | ||
3037 | buggy refcounters. */ | ||
3038 | mod->state = MODULE_STATE_GOING; | ||
3039 | synchronize_sched(); | ||
3040 | module_put(mod); | ||
3041 | blocking_notifier_call_chain(&module_notify_list, | ||
3042 | MODULE_STATE_GOING, mod); | ||
3043 | free_module(mod); | ||
3044 | wake_up_all(&module_wq); | ||
3045 | return ret; | ||
3046 | } | ||
3047 | if (ret > 0) { | ||
3048 | printk(KERN_WARNING | ||
3049 | "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" | ||
3050 | "%s: loading module anyway...\n", | ||
3051 | __func__, mod->name, ret, | ||
3052 | __func__); | ||
3053 | dump_stack(); | ||
3054 | } | ||
3055 | |||
3056 | /* Now it's a first class citizen! */ | ||
3057 | mod->state = MODULE_STATE_LIVE; | ||
3058 | blocking_notifier_call_chain(&module_notify_list, | ||
3059 | MODULE_STATE_LIVE, mod); | ||
3060 | |||
3061 | /* We need to finish all async code before the module init sequence is done */ | ||
3062 | async_synchronize_full(); | ||
3063 | |||
3064 | mutex_lock(&module_mutex); | ||
3065 | /* Drop initial reference. */ | ||
3066 | module_put(mod); | ||
3067 | trim_init_extable(mod); | ||
3068 | #ifdef CONFIG_KALLSYMS | ||
3069 | mod->num_symtab = mod->core_num_syms; | ||
3070 | mod->symtab = mod->core_symtab; | ||
3071 | mod->strtab = mod->core_strtab; | ||
3072 | #endif | ||
3073 | unset_module_init_ro_nx(mod); | ||
3074 | module_free(mod, mod->module_init); | ||
3075 | mod->module_init = NULL; | ||
3076 | mod->init_size = 0; | ||
3077 | mod->init_ro_size = 0; | ||
3078 | mod->init_text_size = 0; | ||
3079 | mutex_unlock(&module_mutex); | ||
3080 | wake_up_all(&module_wq); | ||
3081 | |||
3082 | return 0; | ||
3083 | } | ||
3084 | |||
3085 | static int may_init_module(void) | ||
3086 | { | ||
3087 | if (!capable(CAP_SYS_MODULE) || modules_disabled) | ||
3088 | return -EPERM; | ||
3089 | |||
3090 | return 0; | ||
3091 | } | ||
3092 | |||
2948 | /* Allocate and load the module: note that size of section 0 is always | 3093 | /* Allocate and load the module: note that size of section 0 is always |
2949 | zero, and we rely on this for optional sections. */ | 3094 | zero, and we rely on this for optional sections. */ |
2950 | static struct module *load_module(void __user *umod, | 3095 | static int load_module(struct load_info *info, const char __user *uargs, |
2951 | unsigned long len, | 3096 | int flags) |
2952 | const char __user *uargs) | ||
2953 | { | 3097 | { |
2954 | struct load_info info = { NULL, }; | ||
2955 | struct module *mod, *old; | 3098 | struct module *mod, *old; |
2956 | long err; | 3099 | long err; |
2957 | 3100 | ||
2958 | pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", | 3101 | err = module_sig_check(info); |
2959 | umod, len, uargs); | 3102 | if (err) |
3103 | goto free_copy; | ||
2960 | 3104 | ||
2961 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | 3105 | err = elf_header_check(info); |
2962 | err = copy_and_check(&info, umod, len, uargs); | ||
2963 | if (err) | 3106 | if (err) |
2964 | return ERR_PTR(err); | 3107 | goto free_copy; |
2965 | 3108 | ||
2966 | /* Figure out module layout, and allocate all the memory. */ | 3109 | /* Figure out module layout, and allocate all the memory. */ |
2967 | mod = layout_and_allocate(&info); | 3110 | mod = layout_and_allocate(info, flags); |
2968 | if (IS_ERR(mod)) { | 3111 | if (IS_ERR(mod)) { |
2969 | err = PTR_ERR(mod); | 3112 | err = PTR_ERR(mod); |
2970 | goto free_copy; | 3113 | goto free_copy; |
2971 | } | 3114 | } |
2972 | 3115 | ||
2973 | #ifdef CONFIG_MODULE_SIG | 3116 | #ifdef CONFIG_MODULE_SIG |
2974 | mod->sig_ok = info.sig_ok; | 3117 | mod->sig_ok = info->sig_ok; |
2975 | if (!mod->sig_ok) | 3118 | if (!mod->sig_ok) |
2976 | add_taint_module(mod, TAINT_FORCED_MODULE); | 3119 | add_taint_module(mod, TAINT_FORCED_MODULE); |
2977 | #endif | 3120 | #endif |
@@ -2983,25 +3126,25 @@ static struct module *load_module(void __user *umod, | |||
2983 | 3126 | ||
2984 | /* Now we've got everything in the final locations, we can | 3127 | /* Now we've got everything in the final locations, we can |
2985 | * find optional sections. */ | 3128 | * find optional sections. */ |
2986 | find_module_sections(mod, &info); | 3129 | find_module_sections(mod, info); |
2987 | 3130 | ||
2988 | err = check_module_license_and_versions(mod); | 3131 | err = check_module_license_and_versions(mod); |
2989 | if (err) | 3132 | if (err) |
2990 | goto free_unload; | 3133 | goto free_unload; |
2991 | 3134 | ||
2992 | /* Set up MODINFO_ATTR fields */ | 3135 | /* Set up MODINFO_ATTR fields */ |
2993 | setup_modinfo(mod, &info); | 3136 | setup_modinfo(mod, info); |
2994 | 3137 | ||
2995 | /* Fix up syms, so that st_value is a pointer to location. */ | 3138 | /* Fix up syms, so that st_value is a pointer to location. */ |
2996 | err = simplify_symbols(mod, &info); | 3139 | err = simplify_symbols(mod, info); |
2997 | if (err < 0) | 3140 | if (err < 0) |
2998 | goto free_modinfo; | 3141 | goto free_modinfo; |
2999 | 3142 | ||
3000 | err = apply_relocations(mod, &info); | 3143 | err = apply_relocations(mod, info); |
3001 | if (err < 0) | 3144 | if (err < 0) |
3002 | goto free_modinfo; | 3145 | goto free_modinfo; |
3003 | 3146 | ||
3004 | err = post_relocation(mod, &info); | 3147 | err = post_relocation(mod, info); |
3005 | if (err < 0) | 3148 | if (err < 0) |
3006 | goto free_modinfo; | 3149 | goto free_modinfo; |
3007 | 3150 | ||
@@ -3041,14 +3184,14 @@ again: | |||
3041 | } | 3184 | } |
3042 | 3185 | ||
3043 | /* This has to be done once we're sure module name is unique. */ | 3186 | /* This has to be done once we're sure module name is unique. */ |
3044 | dynamic_debug_setup(info.debug, info.num_debug); | 3187 | dynamic_debug_setup(info->debug, info->num_debug); |
3045 | 3188 | ||
3046 | /* Find duplicate symbols */ | 3189 | /* Find duplicate symbols */ |
3047 | err = verify_export_symbols(mod); | 3190 | err = verify_export_symbols(mod); |
3048 | if (err < 0) | 3191 | if (err < 0) |
3049 | goto ddebug; | 3192 | goto ddebug; |
3050 | 3193 | ||
3051 | module_bug_finalize(info.hdr, info.sechdrs, mod); | 3194 | module_bug_finalize(info->hdr, info->sechdrs, mod); |
3052 | list_add_rcu(&mod->list, &modules); | 3195 | list_add_rcu(&mod->list, &modules); |
3053 | mutex_unlock(&module_mutex); | 3196 | mutex_unlock(&module_mutex); |
3054 | 3197 | ||
@@ -3059,16 +3202,17 @@ again: | |||
3059 | goto unlink; | 3202 | goto unlink; |
3060 | 3203 | ||
3061 | /* Link in to syfs. */ | 3204 | /* Link in to syfs. */ |
3062 | err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); | 3205 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); |
3063 | if (err < 0) | 3206 | if (err < 0) |
3064 | goto unlink; | 3207 | goto unlink; |
3065 | 3208 | ||
3066 | /* Get rid of temporary copy. */ | 3209 | /* Get rid of temporary copy. */ |
3067 | free_copy(&info); | 3210 | free_copy(info); |
3068 | 3211 | ||
3069 | /* Done! */ | 3212 | /* Done! */ |
3070 | trace_module_load(mod); | 3213 | trace_module_load(mod); |
3071 | return mod; | 3214 | |
3215 | return do_init_module(mod); | ||
3072 | 3216 | ||
3073 | unlink: | 3217 | unlink: |
3074 | mutex_lock(&module_mutex); | 3218 | mutex_lock(&module_mutex); |
@@ -3077,7 +3221,7 @@ again: | |||
3077 | module_bug_cleanup(mod); | 3221 | module_bug_cleanup(mod); |
3078 | wake_up_all(&module_wq); | 3222 | wake_up_all(&module_wq); |
3079 | ddebug: | 3223 | ddebug: |
3080 | dynamic_debug_remove(info.debug); | 3224 | dynamic_debug_remove(info->debug); |
3081 | unlock: | 3225 | unlock: |
3082 | mutex_unlock(&module_mutex); | 3226 | mutex_unlock(&module_mutex); |
3083 | synchronize_sched(); | 3227 | synchronize_sched(); |
@@ -3089,106 +3233,52 @@ again: | |||
3089 | free_unload: | 3233 | free_unload: |
3090 | module_unload_free(mod); | 3234 | module_unload_free(mod); |
3091 | free_module: | 3235 | free_module: |
3092 | module_deallocate(mod, &info); | 3236 | module_deallocate(mod, info); |
3093 | free_copy: | 3237 | free_copy: |
3094 | free_copy(&info); | 3238 | free_copy(info); |
3095 | return ERR_PTR(err); | 3239 | return err; |
3096 | } | ||
3097 | |||
3098 | /* Call module constructors. */ | ||
3099 | static void do_mod_ctors(struct module *mod) | ||
3100 | { | ||
3101 | #ifdef CONFIG_CONSTRUCTORS | ||
3102 | unsigned long i; | ||
3103 | |||
3104 | for (i = 0; i < mod->num_ctors; i++) | ||
3105 | mod->ctors[i](); | ||
3106 | #endif | ||
3107 | } | 3240 | } |
3108 | 3241 | ||
3109 | /* This is where the real work happens */ | ||
3110 | SYSCALL_DEFINE3(init_module, void __user *, umod, | 3242 | SYSCALL_DEFINE3(init_module, void __user *, umod, |
3111 | unsigned long, len, const char __user *, uargs) | 3243 | unsigned long, len, const char __user *, uargs) |
3112 | { | 3244 | { |
3113 | struct module *mod; | 3245 | int err; |
3114 | int ret = 0; | 3246 | struct load_info info = { }; |
3115 | 3247 | ||
3116 | /* Must have permission */ | 3248 | err = may_init_module(); |
3117 | if (!capable(CAP_SYS_MODULE) || modules_disabled) | 3249 | if (err) |
3118 | return -EPERM; | 3250 | return err; |
3119 | 3251 | ||
3120 | /* Do all the hard work */ | 3252 | pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", |
3121 | mod = load_module(umod, len, uargs); | 3253 | umod, len, uargs); |
3122 | if (IS_ERR(mod)) | ||
3123 | return PTR_ERR(mod); | ||
3124 | 3254 | ||
3125 | blocking_notifier_call_chain(&module_notify_list, | 3255 | err = copy_module_from_user(umod, len, &info); |
3126 | MODULE_STATE_COMING, mod); | 3256 | if (err) |
3257 | return err; | ||
3127 | 3258 | ||
3128 | /* Set RO and NX regions for core */ | 3259 | return load_module(&info, uargs, 0); |
3129 | set_section_ro_nx(mod->module_core, | 3260 | } |
3130 | mod->core_text_size, | ||
3131 | mod->core_ro_size, | ||
3132 | mod->core_size); | ||
3133 | 3261 | ||
3134 | /* Set RO and NX regions for init */ | 3262 | SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |
3135 | set_section_ro_nx(mod->module_init, | 3263 | { |
3136 | mod->init_text_size, | 3264 | int err; |
3137 | mod->init_ro_size, | 3265 | struct load_info info = { }; |
3138 | mod->init_size); | ||
3139 | 3266 | ||
3140 | do_mod_ctors(mod); | 3267 | err = may_init_module(); |
3141 | /* Start the module */ | 3268 | if (err) |
3142 | if (mod->init != NULL) | 3269 | return err; |
3143 | ret = do_one_initcall(mod->init); | ||
3144 | if (ret < 0) { | ||
3145 | /* Init routine failed: abort. Try to protect us from | ||
3146 | buggy refcounters. */ | ||
3147 | mod->state = MODULE_STATE_GOING; | ||
3148 | synchronize_sched(); | ||
3149 | module_put(mod); | ||
3150 | blocking_notifier_call_chain(&module_notify_list, | ||
3151 | MODULE_STATE_GOING, mod); | ||
3152 | free_module(mod); | ||
3153 | wake_up_all(&module_wq); | ||
3154 | return ret; | ||
3155 | } | ||
3156 | if (ret > 0) { | ||
3157 | printk(KERN_WARNING | ||
3158 | "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" | ||
3159 | "%s: loading module anyway...\n", | ||
3160 | __func__, mod->name, ret, | ||
3161 | __func__); | ||
3162 | dump_stack(); | ||
3163 | } | ||
3164 | 3270 | ||
3165 | /* Now it's a first class citizen! */ | 3271 | pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); |
3166 | mod->state = MODULE_STATE_LIVE; | ||
3167 | blocking_notifier_call_chain(&module_notify_list, | ||
3168 | MODULE_STATE_LIVE, mod); | ||
3169 | 3272 | ||
3170 | /* We need to finish all async code before the module init sequence is done */ | 3273 | if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS |
3171 | async_synchronize_full(); | 3274 | |MODULE_INIT_IGNORE_VERMAGIC)) |
3275 | return -EINVAL; | ||
3172 | 3276 | ||
3173 | mutex_lock(&module_mutex); | 3277 | err = copy_module_from_fd(fd, &info); |
3174 | /* Drop initial reference. */ | 3278 | if (err) |
3175 | module_put(mod); | 3279 | return err; |
3176 | trim_init_extable(mod); | ||
3177 | #ifdef CONFIG_KALLSYMS | ||
3178 | mod->num_symtab = mod->core_num_syms; | ||
3179 | mod->symtab = mod->core_symtab; | ||
3180 | mod->strtab = mod->core_strtab; | ||
3181 | #endif | ||
3182 | unset_module_init_ro_nx(mod); | ||
3183 | module_free(mod, mod->module_init); | ||
3184 | mod->module_init = NULL; | ||
3185 | mod->init_size = 0; | ||
3186 | mod->init_ro_size = 0; | ||
3187 | mod->init_text_size = 0; | ||
3188 | mutex_unlock(&module_mutex); | ||
3189 | wake_up_all(&module_wq); | ||
3190 | 3280 | ||
3191 | return 0; | 3281 | return load_module(&info, uargs, flags); |
3192 | } | 3282 | } |
3193 | 3283 | ||
3194 | static inline int within(unsigned long addr, void *start, unsigned long size) | 3284 | static inline int within(unsigned long addr, void *start, unsigned long size) |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b576f7f14bc6..78e2ecb20165 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) | |||
57 | * leave it to the caller to do proper locking and attach it to task. | 57 | * leave it to the caller to do proper locking and attach it to task. |
58 | */ | 58 | */ |
59 | static struct nsproxy *create_new_namespaces(unsigned long flags, | 59 | static struct nsproxy *create_new_namespaces(unsigned long flags, |
60 | struct task_struct *tsk, struct fs_struct *new_fs) | 60 | struct task_struct *tsk, struct user_namespace *user_ns, |
61 | struct fs_struct *new_fs) | ||
61 | { | 62 | { |
62 | struct nsproxy *new_nsp; | 63 | struct nsproxy *new_nsp; |
63 | int err; | 64 | int err; |
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
66 | if (!new_nsp) | 67 | if (!new_nsp) |
67 | return ERR_PTR(-ENOMEM); | 68 | return ERR_PTR(-ENOMEM); |
68 | 69 | ||
69 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); | 70 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); |
70 | if (IS_ERR(new_nsp->mnt_ns)) { | 71 | if (IS_ERR(new_nsp->mnt_ns)) { |
71 | err = PTR_ERR(new_nsp->mnt_ns); | 72 | err = PTR_ERR(new_nsp->mnt_ns); |
72 | goto out_ns; | 73 | goto out_ns; |
73 | } | 74 | } |
74 | 75 | ||
75 | new_nsp->uts_ns = copy_utsname(flags, tsk); | 76 | new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); |
76 | if (IS_ERR(new_nsp->uts_ns)) { | 77 | if (IS_ERR(new_nsp->uts_ns)) { |
77 | err = PTR_ERR(new_nsp->uts_ns); | 78 | err = PTR_ERR(new_nsp->uts_ns); |
78 | goto out_uts; | 79 | goto out_uts; |
79 | } | 80 | } |
80 | 81 | ||
81 | new_nsp->ipc_ns = copy_ipcs(flags, tsk); | 82 | new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); |
82 | if (IS_ERR(new_nsp->ipc_ns)) { | 83 | if (IS_ERR(new_nsp->ipc_ns)) { |
83 | err = PTR_ERR(new_nsp->ipc_ns); | 84 | err = PTR_ERR(new_nsp->ipc_ns); |
84 | goto out_ipc; | 85 | goto out_ipc; |
85 | } | 86 | } |
86 | 87 | ||
87 | new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); | 88 | new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); |
88 | if (IS_ERR(new_nsp->pid_ns)) { | 89 | if (IS_ERR(new_nsp->pid_ns)) { |
89 | err = PTR_ERR(new_nsp->pid_ns); | 90 | err = PTR_ERR(new_nsp->pid_ns); |
90 | goto out_pid; | 91 | goto out_pid; |
91 | } | 92 | } |
92 | 93 | ||
93 | new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); | 94 | new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); |
94 | if (IS_ERR(new_nsp->net_ns)) { | 95 | if (IS_ERR(new_nsp->net_ns)) { |
95 | err = PTR_ERR(new_nsp->net_ns); | 96 | err = PTR_ERR(new_nsp->net_ns); |
96 | goto out_net; | 97 | goto out_net; |
@@ -122,6 +123,7 @@ out_ns: | |||
122 | int copy_namespaces(unsigned long flags, struct task_struct *tsk) | 123 | int copy_namespaces(unsigned long flags, struct task_struct *tsk) |
123 | { | 124 | { |
124 | struct nsproxy *old_ns = tsk->nsproxy; | 125 | struct nsproxy *old_ns = tsk->nsproxy; |
126 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); | ||
125 | struct nsproxy *new_ns; | 127 | struct nsproxy *new_ns; |
126 | int err = 0; | 128 | int err = 0; |
127 | 129 | ||
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
134 | CLONE_NEWPID | CLONE_NEWNET))) | 136 | CLONE_NEWPID | CLONE_NEWNET))) |
135 | return 0; | 137 | return 0; |
136 | 138 | ||
137 | if (!capable(CAP_SYS_ADMIN)) { | 139 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { |
138 | err = -EPERM; | 140 | err = -EPERM; |
139 | goto out; | 141 | goto out; |
140 | } | 142 | } |
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
151 | goto out; | 153 | goto out; |
152 | } | 154 | } |
153 | 155 | ||
154 | new_ns = create_new_namespaces(flags, tsk, tsk->fs); | 156 | new_ns = create_new_namespaces(flags, tsk, |
157 | task_cred_xxx(tsk, user_ns), tsk->fs); | ||
155 | if (IS_ERR(new_ns)) { | 158 | if (IS_ERR(new_ns)) { |
156 | err = PTR_ERR(new_ns); | 159 | err = PTR_ERR(new_ns); |
157 | goto out; | 160 | goto out; |
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) | |||
183 | * On success, returns the new nsproxy. | 186 | * On success, returns the new nsproxy. |
184 | */ | 187 | */ |
185 | int unshare_nsproxy_namespaces(unsigned long unshare_flags, | 188 | int unshare_nsproxy_namespaces(unsigned long unshare_flags, |
186 | struct nsproxy **new_nsp, struct fs_struct *new_fs) | 189 | struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) |
187 | { | 190 | { |
191 | struct user_namespace *user_ns; | ||
188 | int err = 0; | 192 | int err = 0; |
189 | 193 | ||
190 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | 194 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
191 | CLONE_NEWNET))) | 195 | CLONE_NEWNET | CLONE_NEWPID))) |
192 | return 0; | 196 | return 0; |
193 | 197 | ||
194 | if (!capable(CAP_SYS_ADMIN)) | 198 | user_ns = new_cred ? new_cred->user_ns : current_user_ns(); |
199 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
195 | return -EPERM; | 200 | return -EPERM; |
196 | 201 | ||
197 | *new_nsp = create_new_namespaces(unshare_flags, current, | 202 | *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, |
198 | new_fs ? new_fs : current->fs); | 203 | new_fs ? new_fs : current->fs); |
199 | if (IS_ERR(*new_nsp)) { | 204 | if (IS_ERR(*new_nsp)) { |
200 | err = PTR_ERR(*new_nsp); | 205 | err = PTR_ERR(*new_nsp); |
201 | goto out; | 206 | goto out; |
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
241 | struct file *file; | 246 | struct file *file; |
242 | int err; | 247 | int err; |
243 | 248 | ||
244 | if (!capable(CAP_SYS_ADMIN)) | ||
245 | return -EPERM; | ||
246 | |||
247 | file = proc_ns_fget(fd); | 249 | file = proc_ns_fget(fd); |
248 | if (IS_ERR(file)) | 250 | if (IS_ERR(file)) |
249 | return PTR_ERR(file); | 251 | return PTR_ERR(file); |
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
254 | if (nstype && (ops->type != nstype)) | 256 | if (nstype && (ops->type != nstype)) |
255 | goto out; | 257 | goto out; |
256 | 258 | ||
257 | new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); | 259 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); |
258 | if (IS_ERR(new_nsproxy)) { | 260 | if (IS_ERR(new_nsproxy)) { |
259 | err = PTR_ERR(new_nsproxy); | 261 | err = PTR_ERR(new_nsproxy); |
260 | goto out; | 262 | goto out; |
diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
171 | { | 171 | { |
172 | int cpu, num_cpus; | 172 | int cpu, num_cpus; |
173 | unsigned int next_nr, next_index; | 173 | unsigned int next_nr, next_index; |
174 | struct padata_parallel_queue *queue, *next_queue; | 174 | struct padata_parallel_queue *next_queue; |
175 | struct padata_priv *padata; | 175 | struct padata_priv *padata; |
176 | struct padata_list *reorder; | 176 | struct padata_list *reorder; |
177 | 177 | ||
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
204 | goto out; | 204 | goto out; |
205 | } | 205 | } |
206 | 206 | ||
207 | queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); | 207 | if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { |
208 | if (queue->cpu_index == next_queue->cpu_index) { | ||
209 | padata = ERR_PTR(-ENODATA); | 208 | padata = ERR_PTR(-ENODATA); |
210 | goto out; | 209 | goto out; |
211 | } | 210 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index aebd4f5aaf41..de9af600006f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Generic pidhash and scalable, time-bounded PID allocator | 2 | * Generic pidhash and scalable, time-bounded PID allocator |
3 | * | 3 | * |
4 | * (C) 2002-2003 William Irwin, IBM | 4 | * (C) 2002-2003 Nadia Yvette Chambers, IBM |
5 | * (C) 2004 William Irwin, Oracle | 5 | * (C) 2004 Nadia Yvette Chambers, Oracle |
6 | * (C) 2002-2004 Ingo Molnar, Red Hat | 6 | * (C) 2002-2004 Ingo Molnar, Red Hat |
7 | * | 7 | * |
8 | * pid-structures are backing objects for tasks sharing a given ID to chain | 8 | * pid-structures are backing objects for tasks sharing a given ID to chain |
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/proc_fs.h> | ||
39 | 40 | ||
40 | #define pid_hashfn(nr, ns) \ | 41 | #define pid_hashfn(nr, ns) \ |
41 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 42 | hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) |
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { | |||
78 | .last_pid = 0, | 79 | .last_pid = 0, |
79 | .level = 0, | 80 | .level = 0, |
80 | .child_reaper = &init_task, | 81 | .child_reaper = &init_task, |
82 | .user_ns = &init_user_ns, | ||
83 | .proc_inum = PROC_PID_INIT_INO, | ||
81 | }; | 84 | }; |
82 | EXPORT_SYMBOL_GPL(init_pid_ns); | 85 | EXPORT_SYMBOL_GPL(init_pid_ns); |
83 | 86 | ||
84 | int is_container_init(struct task_struct *tsk) | ||
85 | { | ||
86 | int ret = 0; | ||
87 | struct pid *pid; | ||
88 | |||
89 | rcu_read_lock(); | ||
90 | pid = task_pid(tsk); | ||
91 | if (pid != NULL && pid->numbers[pid->level].nr == 1) | ||
92 | ret = 1; | ||
93 | rcu_read_unlock(); | ||
94 | |||
95 | return ret; | ||
96 | } | ||
97 | EXPORT_SYMBOL(is_container_init); | ||
98 | |||
99 | /* | 87 | /* |
100 | * Note: disable interrupts while the pidmap_lock is held as an | 88 | * Note: disable interrupts while the pidmap_lock is held as an |
101 | * interrupt might come in and do read_lock(&tasklist_lock). | 89 | * interrupt might come in and do read_lock(&tasklist_lock). |
@@ -269,8 +257,23 @@ void free_pid(struct pid *pid) | |||
269 | unsigned long flags; | 257 | unsigned long flags; |
270 | 258 | ||
271 | spin_lock_irqsave(&pidmap_lock, flags); | 259 | spin_lock_irqsave(&pidmap_lock, flags); |
272 | for (i = 0; i <= pid->level; i++) | 260 | for (i = 0; i <= pid->level; i++) { |
273 | hlist_del_rcu(&pid->numbers[i].pid_chain); | 261 | struct upid *upid = pid->numbers + i; |
262 | struct pid_namespace *ns = upid->ns; | ||
263 | hlist_del_rcu(&upid->pid_chain); | ||
264 | switch(--ns->nr_hashed) { | ||
265 | case 1: | ||
266 | /* When all that is left in the pid namespace | ||
267 | * is the reaper wake up the reaper. The reaper | ||
268 | * may be sleeping in zap_pid_ns_processes(). | ||
269 | */ | ||
270 | wake_up_process(ns->child_reaper); | ||
271 | break; | ||
272 | case 0: | ||
273 | schedule_work(&ns->proc_work); | ||
274 | break; | ||
275 | } | ||
276 | } | ||
274 | spin_unlock_irqrestore(&pidmap_lock, flags); | 277 | spin_unlock_irqrestore(&pidmap_lock, flags); |
275 | 278 | ||
276 | for (i = 0; i <= pid->level; i++) | 279 | for (i = 0; i <= pid->level; i++) |
@@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
292 | goto out; | 295 | goto out; |
293 | 296 | ||
294 | tmp = ns; | 297 | tmp = ns; |
298 | pid->level = ns->level; | ||
295 | for (i = ns->level; i >= 0; i--) { | 299 | for (i = ns->level; i >= 0; i--) { |
296 | nr = alloc_pidmap(tmp); | 300 | nr = alloc_pidmap(tmp); |
297 | if (nr < 0) | 301 | if (nr < 0) |
@@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
302 | tmp = tmp->parent; | 306 | tmp = tmp->parent; |
303 | } | 307 | } |
304 | 308 | ||
309 | if (unlikely(is_child_reaper(pid))) { | ||
310 | if (pid_ns_prepare_proc(ns)) | ||
311 | goto out_free; | ||
312 | } | ||
313 | |||
305 | get_pid_ns(ns); | 314 | get_pid_ns(ns); |
306 | pid->level = ns->level; | ||
307 | atomic_set(&pid->count, 1); | 315 | atomic_set(&pid->count, 1); |
308 | for (type = 0; type < PIDTYPE_MAX; ++type) | 316 | for (type = 0; type < PIDTYPE_MAX; ++type) |
309 | INIT_HLIST_HEAD(&pid->tasks[type]); | 317 | INIT_HLIST_HEAD(&pid->tasks[type]); |
310 | 318 | ||
311 | upid = pid->numbers + ns->level; | 319 | upid = pid->numbers + ns->level; |
312 | spin_lock_irq(&pidmap_lock); | 320 | spin_lock_irq(&pidmap_lock); |
313 | for ( ; upid >= pid->numbers; --upid) | 321 | if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) |
322 | goto out_unlock; | ||
323 | for ( ; upid >= pid->numbers; --upid) { | ||
314 | hlist_add_head_rcu(&upid->pid_chain, | 324 | hlist_add_head_rcu(&upid->pid_chain, |
315 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 325 | &pid_hash[pid_hashfn(upid->nr, upid->ns)]); |
326 | upid->ns->nr_hashed++; | ||
327 | } | ||
316 | spin_unlock_irq(&pidmap_lock); | 328 | spin_unlock_irq(&pidmap_lock); |
317 | 329 | ||
318 | out: | 330 | out: |
319 | return pid; | 331 | return pid; |
320 | 332 | ||
333 | out_unlock: | ||
334 | spin_unlock(&pidmap_lock); | ||
321 | out_free: | 335 | out_free: |
322 | while (++i <= ns->level) | 336 | while (++i <= ns->level) |
323 | free_pidmap(pid->numbers + i); | 337 | free_pidmap(pid->numbers + i); |
@@ -327,6 +341,13 @@ out_free: | |||
327 | goto out; | 341 | goto out; |
328 | } | 342 | } |
329 | 343 | ||
344 | void disable_pid_allocation(struct pid_namespace *ns) | ||
345 | { | ||
346 | spin_lock_irq(&pidmap_lock); | ||
347 | ns->nr_hashed &= ~PIDNS_HASH_ADDING; | ||
348 | spin_unlock_irq(&pidmap_lock); | ||
349 | } | ||
350 | |||
330 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) | 351 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) |
331 | { | 352 | { |
332 | struct hlist_node *elem; | 353 | struct hlist_node *elem; |
@@ -344,7 +365,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); | |||
344 | 365 | ||
345 | struct pid *find_vpid(int nr) | 366 | struct pid *find_vpid(int nr) |
346 | { | 367 | { |
347 | return find_pid_ns(nr, current->nsproxy->pid_ns); | 368 | return find_pid_ns(nr, task_active_pid_ns(current)); |
348 | } | 369 | } |
349 | EXPORT_SYMBOL_GPL(find_vpid); | 370 | EXPORT_SYMBOL_GPL(find_vpid); |
350 | 371 | ||
@@ -428,7 +449,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | |||
428 | 449 | ||
429 | struct task_struct *find_task_by_vpid(pid_t vnr) | 450 | struct task_struct *find_task_by_vpid(pid_t vnr) |
430 | { | 451 | { |
431 | return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); | 452 | return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); |
432 | } | 453 | } |
433 | 454 | ||
434 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) | 455 | struct pid *get_task_pid(struct task_struct *task, enum pid_type type) |
@@ -483,7 +504,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); | |||
483 | 504 | ||
484 | pid_t pid_vnr(struct pid *pid) | 505 | pid_t pid_vnr(struct pid *pid) |
485 | { | 506 | { |
486 | return pid_nr_ns(pid, current->nsproxy->pid_ns); | 507 | return pid_nr_ns(pid, task_active_pid_ns(current)); |
487 | } | 508 | } |
488 | EXPORT_SYMBOL_GPL(pid_vnr); | 509 | EXPORT_SYMBOL_GPL(pid_vnr); |
489 | 510 | ||
@@ -494,7 +515,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, | |||
494 | 515 | ||
495 | rcu_read_lock(); | 516 | rcu_read_lock(); |
496 | if (!ns) | 517 | if (!ns) |
497 | ns = current->nsproxy->pid_ns; | 518 | ns = task_active_pid_ns(current); |
498 | if (likely(pid_alive(task))) { | 519 | if (likely(pid_alive(task))) { |
499 | if (type != PIDTYPE_PID) | 520 | if (type != PIDTYPE_PID) |
500 | task = task->group_leader; | 521 | task = task->group_leader; |
@@ -558,6 +579,9 @@ void __init pidhash_init(void) | |||
558 | 579 | ||
559 | void __init pidmap_init(void) | 580 | void __init pidmap_init(void) |
560 | { | 581 | { |
582 | /* Verify no one has done anything silly */ | ||
583 | BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); | ||
584 | |||
561 | /* bump default and minimum pid_max based on number of cpus */ | 585 | /* bump default and minimum pid_max based on number of cpus */ |
562 | pid_max = min(pid_max_max, max_t(int, pid_max, | 586 | pid_max = min(pid_max_max, max_t(int, pid_max, |
563 | PIDS_PER_CPU_DEFAULT * num_possible_cpus())); | 587 | PIDS_PER_CPU_DEFAULT * num_possible_cpus())); |
@@ -569,6 +593,7 @@ void __init pidmap_init(void) | |||
569 | /* Reserve PID 0. We never call free_pidmap(0) */ | 593 | /* Reserve PID 0. We never call free_pidmap(0) */ |
570 | set_bit(0, init_pid_ns.pidmap[0].page); | 594 | set_bit(0, init_pid_ns.pidmap[0].page); |
571 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 595 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
596 | init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; | ||
572 | 597 | ||
573 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 598 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
574 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 599 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7b07cc0dfb75..c1c3dc1c6023 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -10,6 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/pid.h> | 11 | #include <linux/pid.h> |
12 | #include <linux/pid_namespace.h> | 12 | #include <linux/pid_namespace.h> |
13 | #include <linux/user_namespace.h> | ||
13 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
14 | #include <linux/err.h> | 15 | #include <linux/err.h> |
15 | #include <linux/acct.h> | 16 | #include <linux/acct.h> |
@@ -71,10 +72,17 @@ err_alloc: | |||
71 | return NULL; | 72 | return NULL; |
72 | } | 73 | } |
73 | 74 | ||
75 | static void proc_cleanup_work(struct work_struct *work) | ||
76 | { | ||
77 | struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); | ||
78 | pid_ns_release_proc(ns); | ||
79 | } | ||
80 | |||
74 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 81 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
75 | #define MAX_PID_NS_LEVEL 32 | 82 | #define MAX_PID_NS_LEVEL 32 |
76 | 83 | ||
77 | static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) | 84 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, |
85 | struct pid_namespace *parent_pid_ns) | ||
78 | { | 86 | { |
79 | struct pid_namespace *ns; | 87 | struct pid_namespace *ns; |
80 | unsigned int level = parent_pid_ns->level + 1; | 88 | unsigned int level = parent_pid_ns->level + 1; |
@@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
99 | if (ns->pid_cachep == NULL) | 107 | if (ns->pid_cachep == NULL) |
100 | goto out_free_map; | 108 | goto out_free_map; |
101 | 109 | ||
110 | err = proc_alloc_inum(&ns->proc_inum); | ||
111 | if (err) | ||
112 | goto out_free_map; | ||
113 | |||
102 | kref_init(&ns->kref); | 114 | kref_init(&ns->kref); |
103 | ns->level = level; | 115 | ns->level = level; |
104 | ns->parent = get_pid_ns(parent_pid_ns); | 116 | ns->parent = get_pid_ns(parent_pid_ns); |
117 | ns->user_ns = get_user_ns(user_ns); | ||
118 | ns->nr_hashed = PIDNS_HASH_ADDING; | ||
119 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | ||
105 | 120 | ||
106 | set_bit(0, ns->pidmap[0].page); | 121 | set_bit(0, ns->pidmap[0].page); |
107 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 122 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); |
@@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
109 | for (i = 1; i < PIDMAP_ENTRIES; i++) | 124 | for (i = 1; i < PIDMAP_ENTRIES; i++) |
110 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 125 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
111 | 126 | ||
112 | err = pid_ns_prepare_proc(ns); | ||
113 | if (err) | ||
114 | goto out_put_parent_pid_ns; | ||
115 | |||
116 | return ns; | 127 | return ns; |
117 | 128 | ||
118 | out_put_parent_pid_ns: | ||
119 | put_pid_ns(parent_pid_ns); | ||
120 | out_free_map: | 129 | out_free_map: |
121 | kfree(ns->pidmap[0].page); | 130 | kfree(ns->pidmap[0].page); |
122 | out_free: | 131 | out_free: |
@@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
129 | { | 138 | { |
130 | int i; | 139 | int i; |
131 | 140 | ||
141 | proc_free_inum(ns->proc_inum); | ||
132 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 142 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
133 | kfree(ns->pidmap[i].page); | 143 | kfree(ns->pidmap[i].page); |
144 | put_user_ns(ns->user_ns); | ||
134 | kmem_cache_free(pid_ns_cachep, ns); | 145 | kmem_cache_free(pid_ns_cachep, ns); |
135 | } | 146 | } |
136 | 147 | ||
137 | struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) | 148 | struct pid_namespace *copy_pid_ns(unsigned long flags, |
149 | struct user_namespace *user_ns, struct pid_namespace *old_ns) | ||
138 | { | 150 | { |
139 | if (!(flags & CLONE_NEWPID)) | 151 | if (!(flags & CLONE_NEWPID)) |
140 | return get_pid_ns(old_ns); | 152 | return get_pid_ns(old_ns); |
141 | if (flags & (CLONE_THREAD|CLONE_PARENT)) | 153 | if (task_active_pid_ns(current) != old_ns) |
142 | return ERR_PTR(-EINVAL); | 154 | return ERR_PTR(-EINVAL); |
143 | return create_pid_namespace(old_ns); | 155 | return create_pid_namespace(user_ns, old_ns); |
144 | } | 156 | } |
145 | 157 | ||
146 | static void free_pid_ns(struct kref *kref) | 158 | static void free_pid_ns(struct kref *kref) |
@@ -170,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
170 | int rc; | 182 | int rc; |
171 | struct task_struct *task, *me = current; | 183 | struct task_struct *task, *me = current; |
172 | 184 | ||
185 | /* Don't allow any more processes into the pid namespace */ | ||
186 | disable_pid_allocation(pid_ns); | ||
187 | |||
173 | /* Ignore SIGCHLD causing any terminated children to autoreap */ | 188 | /* Ignore SIGCHLD causing any terminated children to autoreap */ |
174 | spin_lock_irq(&me->sighand->siglock); | 189 | spin_lock_irq(&me->sighand->siglock); |
175 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | 190 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; |
@@ -211,22 +226,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
211 | 226 | ||
212 | /* | 227 | /* |
213 | * sys_wait4() above can't reap the TASK_DEAD children. | 228 | * sys_wait4() above can't reap the TASK_DEAD children. |
214 | * Make sure they all go away, see __unhash_process(). | 229 | * Make sure they all go away, see free_pid(). |
215 | */ | 230 | */ |
216 | for (;;) { | 231 | for (;;) { |
217 | bool need_wait = false; | 232 | set_current_state(TASK_UNINTERRUPTIBLE); |
218 | 233 | if (pid_ns->nr_hashed == 1) | |
219 | read_lock(&tasklist_lock); | ||
220 | if (!list_empty(&current->children)) { | ||
221 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
222 | need_wait = true; | ||
223 | } | ||
224 | read_unlock(&tasklist_lock); | ||
225 | |||
226 | if (!need_wait) | ||
227 | break; | 234 | break; |
228 | schedule(); | 235 | schedule(); |
229 | } | 236 | } |
237 | __set_current_state(TASK_RUNNING); | ||
230 | 238 | ||
231 | if (pid_ns->reboot) | 239 | if (pid_ns->reboot) |
232 | current->signal->group_exit_code = pid_ns->reboot; | 240 | current->signal->group_exit_code = pid_ns->reboot; |
@@ -239,9 +247,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
239 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 247 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
240 | void __user *buffer, size_t *lenp, loff_t *ppos) | 248 | void __user *buffer, size_t *lenp, loff_t *ppos) |
241 | { | 249 | { |
250 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
242 | struct ctl_table tmp = *table; | 251 | struct ctl_table tmp = *table; |
243 | 252 | ||
244 | if (write && !capable(CAP_SYS_ADMIN)) | 253 | if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) |
245 | return -EPERM; | 254 | return -EPERM; |
246 | 255 | ||
247 | /* | 256 | /* |
@@ -250,7 +259,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, | |||
250 | * it should synchronize its usage with external means. | 259 | * it should synchronize its usage with external means. |
251 | */ | 260 | */ |
252 | 261 | ||
253 | tmp.data = &current->nsproxy->pid_ns->last_pid; | 262 | tmp.data = &pid_ns->last_pid;
254 | return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); | 263 | return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
255 | } | 264 | } |
256 | 265 | ||
@@ -299,6 +308,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
299 | return 0; | 308 | return 0; |
300 | } | 309 | } |
301 | 310 | ||
311 | static void *pidns_get(struct task_struct *task) | ||
312 | { | ||
313 | struct pid_namespace *ns; | ||
314 | |||
315 | rcu_read_lock(); | ||
316 | ns = get_pid_ns(task_active_pid_ns(task)); | ||
317 | rcu_read_unlock(); | ||
318 | |||
319 | return ns; | ||
320 | } | ||
321 | |||
322 | static void pidns_put(void *ns) | ||
323 | { | ||
324 | put_pid_ns(ns); | ||
325 | } | ||
326 | |||
327 | static int pidns_install(struct nsproxy *nsproxy, void *ns) | ||
328 | { | ||
329 | struct pid_namespace *active = task_active_pid_ns(current); | ||
330 | struct pid_namespace *ancestor, *new = ns; | ||
331 | |||
332 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | ||
333 | !nsown_capable(CAP_SYS_ADMIN)) | ||
334 | return -EPERM; | ||
335 | |||
336 | /* | ||
337 | * Only allow entering the current active pid namespace | ||
338 | * or a child of the current active pid namespace. | ||
339 | * | ||
340 | * This is required for fork to return a usable pid value and | ||
341 | * this maintains the property that processes and their | ||
342 | * children can not escape their current pid namespace. | ||
343 | */ | ||
344 | if (new->level < active->level) | ||
345 | return -EINVAL; | ||
346 | |||
347 | ancestor = new; | ||
348 | while (ancestor->level > active->level) | ||
349 | ancestor = ancestor->parent; | ||
350 | if (ancestor != active) | ||
351 | return -EINVAL; | ||
352 | |||
353 | put_pid_ns(nsproxy->pid_ns); | ||
354 | nsproxy->pid_ns = get_pid_ns(new); | ||
355 | return 0; | ||
356 | } | ||
357 | |||
358 | static unsigned int pidns_inum(void *ns) | ||
359 | { | ||
360 | struct pid_namespace *pid_ns = ns; | ||
361 | return pid_ns->proc_inum; | ||
362 | } | ||
363 | |||
364 | const struct proc_ns_operations pidns_operations = { | ||
365 | .name = "pid", | ||
366 | .type = CLONE_NEWPID, | ||
367 | .get = pidns_get, | ||
368 | .put = pidns_put, | ||
369 | .install = pidns_install, | ||
370 | .inum = pidns_inum, | ||
371 | }; | ||
372 | |||
302 | static __init int pid_namespaces_init(void) | 373 | static __init int pid_namespaces_init(void) |
303 | { | 374 | { |
304 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 375 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..a278cad1d5d6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <asm/uaccess.h> | 9 | #include <asm/uaccess.h> |
10 | #include <linux/kernel_stat.h> | 10 | #include <linux/kernel_stat.h> |
11 | #include <trace/events/timer.h> | 11 | #include <trace/events/timer.h> |
12 | #include <linux/random.h> | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * Called after updating RLIMIT_CPU to run cpu timer and update | 15 | * Called after updating RLIMIT_CPU to run cpu timer and update |
@@ -217,30 +218,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
217 | return 0; | 218 | return 0; |
218 | } | 219 | } |
219 | 220 | ||
220 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
221 | { | ||
222 | struct signal_struct *sig = tsk->signal; | ||
223 | struct task_struct *t; | ||
224 | |||
225 | times->utime = sig->utime; | ||
226 | times->stime = sig->stime; | ||
227 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
228 | |||
229 | rcu_read_lock(); | ||
230 | /* make sure we can trust tsk->thread_group list */ | ||
231 | if (!likely(pid_alive(tsk))) | ||
232 | goto out; | ||
233 | |||
234 | t = tsk; | ||
235 | do { | ||
236 | times->utime += t->utime; | ||
237 | times->stime += t->stime; | ||
238 | times->sum_exec_runtime += task_sched_runtime(t); | ||
239 | } while_each_thread(tsk, t); | ||
240 | out: | ||
241 | rcu_read_unlock(); | ||
242 | } | ||
243 | |||
244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 221 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
245 | { | 222 | { |
246 | if (b->utime > a->utime) | 223 | if (b->utime > a->utime) |
@@ -494,6 +471,8 @@ static void cleanup_timers(struct list_head *head, | |||
494 | */ | 471 | */ |
495 | void posix_cpu_timers_exit(struct task_struct *tsk) | 472 | void posix_cpu_timers_exit(struct task_struct *tsk) |
496 | { | 473 | { |
474 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, | ||
475 | sizeof(unsigned long long)); | ||
497 | cleanup_timers(tsk->cpu_timers, | 476 | cleanup_timers(tsk->cpu_timers, |
498 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); | 477 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); |
499 | 478 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index f458238109cc..1c16f9167de1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
59 | { | 59 | { |
60 | unsigned long val; | 60 | unsigned long val; |
61 | 61 | ||
62 | if (strict_strtoul(buf, 10, &val)) | 62 | if (kstrtoul(buf, 10, &val)) |
63 | return -EINVAL; | 63 | return -EINVAL; |
64 | 64 | ||
65 | if (val > 1) | 65 | if (val > 1) |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e13..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
48 | if (p == current || !freeze_task(p)) | 48 | if (p == current || !freeze_task(p)) |
49 | continue; | 49 | continue; |
50 | 50 | ||
51 | /* | 51 | if (!freezer_should_skip(p)) |
52 | * Now that we've done set_freeze_flag, don't | ||
53 | * perturb a task in TASK_STOPPED or TASK_TRACED. | ||
54 | * It is "frozen enough". If the task does wake | ||
55 | * up, it will immediately call try_to_freeze. | ||
56 | * | ||
57 | * Because freeze_task() goes through p's scheduler lock, it's | ||
58 | * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING | ||
59 | * transition can't race with task state testing here. | ||
60 | */ | ||
61 | if (!task_is_stopped_or_traced(p) && | ||
62 | !freezer_should_skip(p)) | ||
63 | todo++; | 52 | todo++; |
64 | } while_each_thread(g, p); | 53 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 54 | read_unlock(&tasklist_lock); |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 846bd42c7ed1..9322ff7eaad6 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | |||
213 | } | 213 | } |
214 | 214 | ||
215 | /** | 215 | /** |
216 | * pm_qos_flags_remove_req - Remove device PM QoS flags request. | ||
217 | * @pqf: Device PM QoS flags set to remove the request from. | ||
218 | * @req: Request to remove from the set. | ||
219 | */ | ||
220 | static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, | ||
221 | struct pm_qos_flags_request *req) | ||
222 | { | ||
223 | s32 val = 0; | ||
224 | |||
225 | list_del(&req->node); | ||
226 | list_for_each_entry(req, &pqf->list, node) | ||
227 | val |= req->flags; | ||
228 | |||
229 | pqf->effective_flags = val; | ||
230 | } | ||
231 | |||
232 | /** | ||
233 | * pm_qos_update_flags - Update a set of PM QoS flags. | ||
234 | * @pqf: Set of flags to update. | ||
235 | * @req: Request to add to the set, to modify, or to remove from the set. | ||
236 | * @action: Action to take on the set. | ||
237 | * @val: Value of the request to add or modify. | ||
238 | * | ||
239 | * Update the given set of PM QoS flags and call notifiers if the aggregate | ||
240 | * value has changed. Returns 1 if the aggregate constraint value has changed, | ||
241 | * 0 otherwise. | ||
242 | */ | ||
243 | bool pm_qos_update_flags(struct pm_qos_flags *pqf, | ||
244 | struct pm_qos_flags_request *req, | ||
245 | enum pm_qos_req_action action, s32 val) | ||
246 | { | ||
247 | unsigned long irqflags; | ||
248 | s32 prev_value, curr_value; | ||
249 | |||
250 | spin_lock_irqsave(&pm_qos_lock, irqflags); | ||
251 | |||
252 | prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
253 | |||
254 | switch (action) { | ||
255 | case PM_QOS_REMOVE_REQ: | ||
256 | pm_qos_flags_remove_req(pqf, req); | ||
257 | break; | ||
258 | case PM_QOS_UPDATE_REQ: | ||
259 | pm_qos_flags_remove_req(pqf, req); | ||
260 | case PM_QOS_ADD_REQ: | ||
261 | req->flags = val; | ||
262 | INIT_LIST_HEAD(&req->node); | ||
263 | list_add_tail(&req->node, &pqf->list); | ||
264 | pqf->effective_flags |= val; | ||
265 | break; | ||
266 | default: | ||
267 | /* no action */ | ||
268 | ; | ||
269 | } | ||
270 | |||
271 | curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
272 | |||
273 | spin_unlock_irqrestore(&pm_qos_lock, irqflags); | ||
274 | |||
275 | return prev_value != curr_value; | ||
276 | } | ||
277 | |||
278 | /** | ||
216 | * pm_qos_request - returns current system wide qos expectation | 279 | * pm_qos_request - returns current system wide qos expectation |
217 | * @pm_qos_class: identification of which qos value is requested | 280 | * @pm_qos_class: identification of which qos value is requested |
218 | * | 281 | * |
@@ -500,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
500 | } else { | 563 | } else { |
501 | ascii_value[count] = '\0'; | 564 | ascii_value[count] = '\0'; |
502 | } | 565 | } |
503 | ret = strict_strtoul(ascii_value, 16, &ulval); | 566 | ret = kstrtoul(ascii_value, 16, &ulval); |
504 | if (ret) { | 567 | if (ret) { |
505 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | 568 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); |
506 | return -EINVAL; | 569 | return -EINVAL; |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3c9d764eb0d8..7c33ed200410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) | |||
126 | 126 | ||
127 | /* Figure out where to put the new node */ | 127 | /* Figure out where to put the new node */ |
128 | while (*new) { | 128 | while (*new) { |
129 | ext = container_of(*new, struct swsusp_extent, node); | 129 | ext = rb_entry(*new, struct swsusp_extent, node); |
130 | parent = *new; | 130 | parent = *new; |
131 | if (swap_offset < ext->start) { | 131 | if (swap_offset < ext->start) { |
132 | /* Try to merge */ | 132 | /* Try to merge */ |
diff --git a/kernel/printk.c b/kernel/printk.c index 2d607f4d1797..357f714ddd49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); | |||
87 | struct console *console_drivers; | 87 | struct console *console_drivers; |
88 | EXPORT_SYMBOL_GPL(console_drivers); | 88 | EXPORT_SYMBOL_GPL(console_drivers); |
89 | 89 | ||
90 | #ifdef CONFIG_LOCKDEP | ||
91 | static struct lockdep_map console_lock_dep_map = { | ||
92 | .name = "console_lock" | ||
93 | }; | ||
94 | #endif | ||
95 | |||
90 | /* | 96 | /* |
91 | * This is used for debugging the mess that is the VT code by | 97 | * This is used for debugging the mess that is the VT code by |
92 | * keeping track if we have the console semaphore held. It's | 98 | * keeping track if we have the console semaphore held. It's |
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early) | |||
741 | free, (free * 100) / __LOG_BUF_LEN); | 747 | free, (free * 100) / __LOG_BUF_LEN); |
742 | } | 748 | } |
743 | 749 | ||
750 | static bool __read_mostly ignore_loglevel; | ||
751 | |||
752 | static int __init ignore_loglevel_setup(char *str) | ||
753 | { | ||
754 | ignore_loglevel = 1; | ||
755 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
756 | |||
757 | return 0; | ||
758 | } | ||
759 | |||
760 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
761 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
762 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
763 | "print all kernel messages to the console."); | ||
764 | |||
744 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 765 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
745 | 766 | ||
746 | static int boot_delay; /* msecs delay after each printk during bootup */ | 767 | static int boot_delay; /* msecs delay after each printk during bootup */ |
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str) | |||
764 | } | 785 | } |
765 | __setup("boot_delay=", boot_delay_setup); | 786 | __setup("boot_delay=", boot_delay_setup); |
766 | 787 | ||
767 | static void boot_delay_msec(void) | 788 | static void boot_delay_msec(int level) |
768 | { | 789 | { |
769 | unsigned long long k; | 790 | unsigned long long k; |
770 | unsigned long timeout; | 791 | unsigned long timeout; |
771 | 792 | ||
772 | if (boot_delay == 0 || system_state != SYSTEM_BOOTING) | 793 | if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) |
794 | || (level >= console_loglevel && !ignore_loglevel)) { | ||
773 | return; | 795 | return; |
796 | } | ||
774 | 797 | ||
775 | k = (unsigned long long)loops_per_msec * boot_delay; | 798 | k = (unsigned long long)loops_per_msec * boot_delay; |
776 | 799 | ||
@@ -789,7 +812,7 @@ static void boot_delay_msec(void) | |||
789 | } | 812 | } |
790 | } | 813 | } |
791 | #else | 814 | #else |
792 | static inline void boot_delay_msec(void) | 815 | static inline void boot_delay_msec(int level) |
793 | { | 816 | { |
794 | } | 817 | } |
795 | #endif | 818 | #endif |
@@ -847,10 +870,11 @@ static size_t print_time(u64 ts, char *buf) | |||
847 | if (!printk_time) | 870 | if (!printk_time) |
848 | return 0; | 871 | return 0; |
849 | 872 | ||
873 | rem_nsec = do_div(ts, 1000000000); | ||
874 | |||
850 | if (!buf) | 875 | if (!buf) |
851 | return 15; | 876 | return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); |
852 | 877 | ||
853 | rem_nsec = do_div(ts, 1000000000); | ||
854 | return sprintf(buf, "[%5lu.%06lu] ", | 878 | return sprintf(buf, "[%5lu.%06lu] ", |
855 | (unsigned long)ts, rem_nsec / 1000); | 879 | (unsigned long)ts, rem_nsec / 1000); |
856 | } | 880 | } |
@@ -1232,21 +1256,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
1232 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1256 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
1233 | } | 1257 | } |
1234 | 1258 | ||
1235 | static bool __read_mostly ignore_loglevel; | ||
1236 | |||
1237 | static int __init ignore_loglevel_setup(char *str) | ||
1238 | { | ||
1239 | ignore_loglevel = 1; | ||
1240 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
1241 | |||
1242 | return 0; | ||
1243 | } | ||
1244 | |||
1245 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
1246 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
1247 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
1248 | "print all kernel messages to the console."); | ||
1249 | |||
1250 | /* | 1259 | /* |
1251 | * Call the console drivers, asking them to write out | 1260 | * Call the console drivers, asking them to write out |
1252 | * log_buf[start] to log_buf[end - 1]. | 1261 | * log_buf[start] to log_buf[end - 1]. |
@@ -1492,7 +1501,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1492 | int this_cpu; | 1501 | int this_cpu; |
1493 | int printed_len = 0; | 1502 | int printed_len = 0; |
1494 | 1503 | ||
1495 | boot_delay_msec(); | 1504 | boot_delay_msec(level); |
1496 | printk_delay(); | 1505 | printk_delay(); |
1497 | 1506 | ||
1498 | /* This stops the holder of console_sem just where we want him */ | 1507 | /* This stops the holder of console_sem just where we want him */ |
@@ -1908,12 +1917,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1908 | */ | 1917 | */ |
1909 | void console_lock(void) | 1918 | void console_lock(void) |
1910 | { | 1919 | { |
1911 | BUG_ON(in_interrupt()); | 1920 | might_sleep(); |
1921 | |||
1912 | down(&console_sem); | 1922 | down(&console_sem); |
1913 | if (console_suspended) | 1923 | if (console_suspended) |
1914 | return; | 1924 | return; |
1915 | console_locked = 1; | 1925 | console_locked = 1; |
1916 | console_may_schedule = 1; | 1926 | console_may_schedule = 1; |
1927 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1917 | } | 1928 | } |
1918 | EXPORT_SYMBOL(console_lock); | 1929 | EXPORT_SYMBOL(console_lock); |
1919 | 1930 | ||
@@ -1935,6 +1946,7 @@ int console_trylock(void) | |||
1935 | } | 1946 | } |
1936 | console_locked = 1; | 1947 | console_locked = 1; |
1937 | console_may_schedule = 0; | 1948 | console_may_schedule = 0; |
1949 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1938 | return 1; | 1950 | return 1; |
1939 | } | 1951 | } |
1940 | EXPORT_SYMBOL(console_trylock); | 1952 | EXPORT_SYMBOL(console_trylock); |
@@ -2095,6 +2107,7 @@ skip: | |||
2095 | local_irq_restore(flags); | 2107 | local_irq_restore(flags); |
2096 | } | 2108 | } |
2097 | console_locked = 0; | 2109 | console_locked = 0; |
2110 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2098 | 2111 | ||
2099 | /* Release the exclusive_console once it is used */ | 2112 | /* Release the exclusive_console once it is used */ |
2100 | if (unlikely(exclusive_console)) | 2113 | if (unlikely(exclusive_console)) |
diff --git a/kernel/profile.c b/kernel/profile.c index 76b8e77773ee..1f391819c42f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -8,9 +8,10 @@ | |||
8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, | 8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, |
9 | * Red Hat, July 2004 | 9 | * Red Hat, July 2004 |
10 | * Consolidation of architecture support code for profiling, | 10 | * Consolidation of architecture support code for profiling, |
11 | * William Irwin, Oracle, July 2004 | 11 | * Nadia Yvette Chambers, Oracle, July 2004 |
12 | * Amortized hit count accounting via per-cpu open-addressed hashtables | 12 | * Amortized hit count accounting via per-cpu open-addressed hashtables |
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, Nadia Yvette Chambers, |
14 | * Oracle, 2004 | ||
14 | */ | 15 | */ |
15 | 16 | ||
16 | #include <linux/export.h> | 17 | #include <linux/export.h> |
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook); | |||
256 | * pagetable hash functions, but uses a full hashtable full of finite | 257 | * pagetable hash functions, but uses a full hashtable full of finite |
257 | * collision chains, not just pairs of them. | 258 | * collision chains, not just pairs of them. |
258 | * | 259 | * |
259 | * -- wli | 260 | * -- nyc |
260 | */ | 261 | */ |
261 | static void __profile_flip_buffers(void *unused) | 262 | static void __profile_flip_buffers(void *unused) |
262 | { | 263 | { |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f5e55dda955..1599157336a6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -215,8 +215,12 @@ ok: | |||
215 | smp_rmb(); | 215 | smp_rmb(); |
216 | if (task->mm) | 216 | if (task->mm) |
217 | dumpable = get_dumpable(task->mm); | 217 | dumpable = get_dumpable(task->mm); |
218 | if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) | 218 | rcu_read_lock(); |
219 | if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { | ||
220 | rcu_read_unlock(); | ||
219 | return -EPERM; | 221 | return -EPERM; |
222 | } | ||
223 | rcu_read_unlock(); | ||
220 | 224 | ||
221 | return security_ptrace_access_check(task, mode); | 225 | return security_ptrace_access_check(task, mode); |
222 | } | 226 | } |
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
280 | 284 | ||
281 | if (seize) | 285 | if (seize) |
282 | flags |= PT_SEIZED; | 286 | flags |= PT_SEIZED; |
283 | if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) | 287 | rcu_read_lock(); |
288 | if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) | ||
284 | flags |= PT_PTRACE_CAP; | 289 | flags |= PT_PTRACE_CAP; |
290 | rcu_read_unlock(); | ||
285 | task->ptrace = flags; | 291 | task->ptrace = flags; |
286 | 292 | ||
287 | __ptrace_link(task, current); | 293 | __ptrace_link(task, current); |
@@ -457,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) | |||
457 | return; | 463 | return; |
458 | 464 | ||
459 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 465 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
466 | if (unlikely(p->ptrace & PT_EXITKILL)) | ||
467 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | ||
468 | |||
460 | if (__ptrace_detach(tracer, p)) | 469 | if (__ptrace_detach(tracer, p)) |
461 | list_add(&p->ptrace_entry, &ptrace_dead); | 470 | list_add(&p->ptrace_entry, &ptrace_dead); |
462 | } | 471 | } |
diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc6515..20dfba576c2b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | |||
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | extern int rcu_expedited; | ||
113 | |||
112 | #endif /* __LINUX_RCU_H */ | 114 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 29ca1c6da594..a2cf76177b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -46,12 +46,15 @@ | |||
46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
49 | #include <linux/module.h> | ||
49 | 50 | ||
50 | #define CREATE_TRACE_POINTS | 51 | #define CREATE_TRACE_POINTS |
51 | #include <trace/events/rcu.h> | 52 | #include <trace/events/rcu.h> |
52 | 53 | ||
53 | #include "rcu.h" | 54 | #include "rcu.h" |
54 | 55 | ||
56 | module_param(rcu_expedited, int, 0); | ||
57 | |||
55 | #ifdef CONFIG_PREEMPT_RCU | 58 | #ifdef CONFIG_PREEMPT_RCU |
56 | 59 | ||
57 | /* | 60 | /* |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e4c6a598d6f7..e7dce58f9c2a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); | |||
195 | */ | 195 | */ |
196 | int rcu_is_cpu_rrupt_from_idle(void) | 196 | int rcu_is_cpu_rrupt_from_idle(void) |
197 | { | 197 | { |
198 | return rcu_dynticks_nesting <= 0; | 198 | return rcu_dynticks_nesting <= 1; |
199 | } | 199 | } |
200 | 200 | ||
201 | /* | 201 | /* |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3d0190282204..f85016a2309b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -706,7 +706,10 @@ void synchronize_rcu(void) | |||
706 | return; | 706 | return; |
707 | 707 | ||
708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | 708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ |
709 | rcu_barrier(); | 709 | if (rcu_expedited) |
710 | synchronize_rcu_expedited(); | ||
711 | else | ||
712 | rcu_barrier(); | ||
710 | } | 713 | } |
711 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 714 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
712 | 715 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a..31dea01c85fd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title) | |||
339 | 339 | ||
340 | struct rcu_torture_ops { | 340 | struct rcu_torture_ops { |
341 | void (*init)(void); | 341 | void (*init)(void); |
342 | void (*cleanup)(void); | ||
343 | int (*readlock)(void); | 342 | int (*readlock)(void); |
344 | void (*read_delay)(struct rcu_random_state *rrsp); | 343 | void (*read_delay)(struct rcu_random_state *rrsp); |
345 | void (*readunlock)(int idx); | 344 | void (*readunlock)(int idx); |
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
431 | 430 | ||
432 | static struct rcu_torture_ops rcu_ops = { | 431 | static struct rcu_torture_ops rcu_ops = { |
433 | .init = NULL, | 432 | .init = NULL, |
434 | .cleanup = NULL, | ||
435 | .readlock = rcu_torture_read_lock, | 433 | .readlock = rcu_torture_read_lock, |
436 | .read_delay = rcu_read_delay, | 434 | .read_delay = rcu_read_delay, |
437 | .readunlock = rcu_torture_read_unlock, | 435 | .readunlock = rcu_torture_read_unlock, |
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void) | |||
475 | 473 | ||
476 | static struct rcu_torture_ops rcu_sync_ops = { | 474 | static struct rcu_torture_ops rcu_sync_ops = { |
477 | .init = rcu_sync_torture_init, | 475 | .init = rcu_sync_torture_init, |
478 | .cleanup = NULL, | ||
479 | .readlock = rcu_torture_read_lock, | 476 | .readlock = rcu_torture_read_lock, |
480 | .read_delay = rcu_read_delay, | 477 | .read_delay = rcu_read_delay, |
481 | .readunlock = rcu_torture_read_unlock, | 478 | .readunlock = rcu_torture_read_unlock, |
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
493 | 490 | ||
494 | static struct rcu_torture_ops rcu_expedited_ops = { | 491 | static struct rcu_torture_ops rcu_expedited_ops = { |
495 | .init = rcu_sync_torture_init, | 492 | .init = rcu_sync_torture_init, |
496 | .cleanup = NULL, | ||
497 | .readlock = rcu_torture_read_lock, | 493 | .readlock = rcu_torture_read_lock, |
498 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 494 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
499 | .readunlock = rcu_torture_read_unlock, | 495 | .readunlock = rcu_torture_read_unlock, |
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
536 | 532 | ||
537 | static struct rcu_torture_ops rcu_bh_ops = { | 533 | static struct rcu_torture_ops rcu_bh_ops = { |
538 | .init = NULL, | 534 | .init = NULL, |
539 | .cleanup = NULL, | ||
540 | .readlock = rcu_bh_torture_read_lock, | 535 | .readlock = rcu_bh_torture_read_lock, |
541 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 536 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
542 | .readunlock = rcu_bh_torture_read_unlock, | 537 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
553 | 548 | ||
554 | static struct rcu_torture_ops rcu_bh_sync_ops = { | 549 | static struct rcu_torture_ops rcu_bh_sync_ops = { |
555 | .init = rcu_sync_torture_init, | 550 | .init = rcu_sync_torture_init, |
556 | .cleanup = NULL, | ||
557 | .readlock = rcu_bh_torture_read_lock, | 551 | .readlock = rcu_bh_torture_read_lock, |
558 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 552 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
559 | .readunlock = rcu_bh_torture_read_unlock, | 553 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
570 | 564 | ||
571 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | 565 | static struct rcu_torture_ops rcu_bh_expedited_ops = { |
572 | .init = rcu_sync_torture_init, | 566 | .init = rcu_sync_torture_init, |
573 | .cleanup = NULL, | ||
574 | .readlock = rcu_bh_torture_read_lock, | 567 | .readlock = rcu_bh_torture_read_lock, |
575 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 568 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
576 | .readunlock = rcu_bh_torture_read_unlock, | 569 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
589 | * Definitions for srcu torture testing. | 582 | * Definitions for srcu torture testing. |
590 | */ | 583 | */ |
591 | 584 | ||
592 | static struct srcu_struct srcu_ctl; | 585 | DEFINE_STATIC_SRCU(srcu_ctl); |
593 | |||
594 | static void srcu_torture_init(void) | ||
595 | { | ||
596 | init_srcu_struct(&srcu_ctl); | ||
597 | rcu_sync_torture_init(); | ||
598 | } | ||
599 | |||
600 | static void srcu_torture_cleanup(void) | ||
601 | { | ||
602 | synchronize_srcu(&srcu_ctl); | ||
603 | cleanup_srcu_struct(&srcu_ctl); | ||
604 | } | ||
605 | 586 | ||
606 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) | 587 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
607 | { | 588 | { |
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page) | |||
672 | } | 653 | } |
673 | 654 | ||
674 | static struct rcu_torture_ops srcu_ops = { | 655 | static struct rcu_torture_ops srcu_ops = { |
675 | .init = srcu_torture_init, | 656 | .init = rcu_sync_torture_init, |
676 | .cleanup = srcu_torture_cleanup, | ||
677 | .readlock = srcu_torture_read_lock, | 657 | .readlock = srcu_torture_read_lock, |
678 | .read_delay = srcu_read_delay, | 658 | .read_delay = srcu_read_delay, |
679 | .readunlock = srcu_torture_read_unlock, | 659 | .readunlock = srcu_torture_read_unlock, |
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { | |||
687 | }; | 667 | }; |
688 | 668 | ||
689 | static struct rcu_torture_ops srcu_sync_ops = { | 669 | static struct rcu_torture_ops srcu_sync_ops = { |
690 | .init = srcu_torture_init, | 670 | .init = rcu_sync_torture_init, |
691 | .cleanup = srcu_torture_cleanup, | ||
692 | .readlock = srcu_torture_read_lock, | 671 | .readlock = srcu_torture_read_lock, |
693 | .read_delay = srcu_read_delay, | 672 | .read_delay = srcu_read_delay, |
694 | .readunlock = srcu_torture_read_unlock, | 673 | .readunlock = srcu_torture_read_unlock, |
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | |||
712 | } | 691 | } |
713 | 692 | ||
714 | static struct rcu_torture_ops srcu_raw_ops = { | 693 | static struct rcu_torture_ops srcu_raw_ops = { |
715 | .init = srcu_torture_init, | 694 | .init = rcu_sync_torture_init, |
716 | .cleanup = srcu_torture_cleanup, | ||
717 | .readlock = srcu_torture_read_lock_raw, | 695 | .readlock = srcu_torture_read_lock_raw, |
718 | .read_delay = srcu_read_delay, | 696 | .read_delay = srcu_read_delay, |
719 | .readunlock = srcu_torture_read_unlock_raw, | 697 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
727 | }; | 705 | }; |
728 | 706 | ||
729 | static struct rcu_torture_ops srcu_raw_sync_ops = { | 707 | static struct rcu_torture_ops srcu_raw_sync_ops = { |
730 | .init = srcu_torture_init, | 708 | .init = rcu_sync_torture_init, |
731 | .cleanup = srcu_torture_cleanup, | ||
732 | .readlock = srcu_torture_read_lock_raw, | 709 | .readlock = srcu_torture_read_lock_raw, |
733 | .read_delay = srcu_read_delay, | 710 | .read_delay = srcu_read_delay, |
734 | .readunlock = srcu_torture_read_unlock_raw, | 711 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) | |||
747 | } | 724 | } |
748 | 725 | ||
749 | static struct rcu_torture_ops srcu_expedited_ops = { | 726 | static struct rcu_torture_ops srcu_expedited_ops = { |
750 | .init = srcu_torture_init, | 727 | .init = rcu_sync_torture_init, |
751 | .cleanup = srcu_torture_cleanup, | ||
752 | .readlock = srcu_torture_read_lock, | 728 | .readlock = srcu_torture_read_lock, |
753 | .read_delay = srcu_read_delay, | 729 | .read_delay = srcu_read_delay, |
754 | .readunlock = srcu_torture_read_unlock, | 730 | .readunlock = srcu_torture_read_unlock, |
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
783 | 759 | ||
784 | static struct rcu_torture_ops sched_ops = { | 760 | static struct rcu_torture_ops sched_ops = { |
785 | .init = rcu_sync_torture_init, | 761 | .init = rcu_sync_torture_init, |
786 | .cleanup = NULL, | ||
787 | .readlock = sched_torture_read_lock, | 762 | .readlock = sched_torture_read_lock, |
788 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 763 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
789 | .readunlock = sched_torture_read_unlock, | 764 | .readunlock = sched_torture_read_unlock, |
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { | |||
799 | 774 | ||
800 | static struct rcu_torture_ops sched_sync_ops = { | 775 | static struct rcu_torture_ops sched_sync_ops = { |
801 | .init = rcu_sync_torture_init, | 776 | .init = rcu_sync_torture_init, |
802 | .cleanup = NULL, | ||
803 | .readlock = sched_torture_read_lock, | 777 | .readlock = sched_torture_read_lock, |
804 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 778 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
805 | .readunlock = sched_torture_read_unlock, | 779 | .readunlock = sched_torture_read_unlock, |
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
814 | 788 | ||
815 | static struct rcu_torture_ops sched_expedited_ops = { | 789 | static struct rcu_torture_ops sched_expedited_ops = { |
816 | .init = rcu_sync_torture_init, | 790 | .init = rcu_sync_torture_init, |
817 | .cleanup = NULL, | ||
818 | .readlock = sched_torture_read_lock, | 791 | .readlock = sched_torture_read_lock, |
819 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 792 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
820 | .readunlock = sched_torture_read_unlock, | 793 | .readunlock = sched_torture_read_unlock, |
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1396 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1369 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1397 | "test_boost=%d/%d test_boost_interval=%d " | 1370 | "test_boost=%d/%d test_boost_interval=%d " |
1398 | "test_boost_duration=%d shutdown_secs=%d " | 1371 | "test_boost_duration=%d shutdown_secs=%d " |
1372 | "stall_cpu=%d stall_cpu_holdoff=%d " | ||
1373 | "n_barrier_cbs=%d " | ||
1399 | "onoff_interval=%d onoff_holdoff=%d\n", | 1374 | "onoff_interval=%d onoff_holdoff=%d\n", |
1400 | torture_type, tag, nrealreaders, nfakewriters, | 1375 | torture_type, tag, nrealreaders, nfakewriters, |
1401 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1376 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1402 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1377 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1403 | test_boost, cur_ops->can_boost, | 1378 | test_boost, cur_ops->can_boost, |
1404 | test_boost_interval, test_boost_duration, shutdown_secs, | 1379 | test_boost_interval, test_boost_duration, shutdown_secs, |
1380 | stall_cpu, stall_cpu_holdoff, | ||
1381 | n_barrier_cbs, | ||
1405 | onoff_interval, onoff_holdoff); | 1382 | onoff_interval, onoff_holdoff); |
1406 | } | 1383 | } |
1407 | 1384 | ||
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg) | |||
1502 | unsigned long delta; | 1479 | unsigned long delta; |
1503 | int maxcpu = -1; | 1480 | int maxcpu = -1; |
1504 | DEFINE_RCU_RANDOM(rand); | 1481 | DEFINE_RCU_RANDOM(rand); |
1482 | int ret; | ||
1505 | unsigned long starttime; | 1483 | unsigned long starttime; |
1506 | 1484 | ||
1507 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | 1485 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); |
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg) | |||
1522 | torture_type, cpu); | 1500 | torture_type, cpu); |
1523 | starttime = jiffies; | 1501 | starttime = jiffies; |
1524 | n_offline_attempts++; | 1502 | n_offline_attempts++; |
1525 | if (cpu_down(cpu) == 0) { | 1503 | ret = cpu_down(cpu); |
1504 | if (ret) { | ||
1505 | if (verbose) | ||
1506 | pr_alert("%s" TORTURE_FLAG | ||
1507 | "rcu_torture_onoff task: offline %d failed: errno %d\n", | ||
1508 | torture_type, cpu, ret); | ||
1509 | } else { | ||
1526 | if (verbose) | 1510 | if (verbose) |
1527 | pr_alert("%s" TORTURE_FLAG | 1511 | pr_alert("%s" TORTURE_FLAG |
1528 | "rcu_torture_onoff task: offlined %d\n", | 1512 | "rcu_torture_onoff task: offlined %d\n", |
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void) | |||
1936 | 1920 | ||
1937 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 1921 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
1938 | 1922 | ||
1939 | if (cur_ops->cleanup) | ||
1940 | cur_ops->cleanup(); | ||
1941 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) | 1923 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1942 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1924 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1943 | else if (n_online_successes != n_online_attempts || | 1925 | else if (n_online_successes != n_online_attempts || |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..e441b77b614e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
68 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 69 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 70 | .fqs_state = RCU_GP_IDLE, \ |
71 | .gpnum = -300, \ | 71 | .gpnum = 0UL - 300UL, \ |
72 | .completed = -300, \ | 72 | .completed = 0UL - 300UL, \ |
73 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ | 73 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
@@ -207,18 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); | |||
207 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 207 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
208 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 208 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
209 | .dynticks = ATOMIC_INIT(1), | 209 | .dynticks = ATOMIC_INIT(1), |
210 | #if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) | ||
211 | .ignore_user_qs = true, | ||
212 | #endif | ||
213 | }; | 210 | }; |
214 | 211 | ||
215 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 212 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
216 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 213 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
217 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 214 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
218 | 215 | ||
219 | module_param(blimit, int, 0444); | 216 | module_param(blimit, long, 0444); |
220 | module_param(qhimark, int, 0444); | 217 | module_param(qhimark, long, 0444); |
221 | module_param(qlowmark, int, 0444); | 218 | module_param(qlowmark, long, 0444); |
222 | 219 | ||
223 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 220 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
224 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 221 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
@@ -303,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | |||
303 | static int | 300 | static int |
304 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | 301 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) |
305 | { | 302 | { |
306 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; | 303 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && |
304 | rdp->nxttail[RCU_DONE_TAIL] != NULL; | ||
307 | } | 305 | } |
308 | 306 | ||
309 | /* | 307 | /* |
@@ -312,8 +310,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
312 | static int | 310 | static int |
313 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 311 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
314 | { | 312 | { |
315 | return *rdp->nxttail[RCU_DONE_TAIL + | 313 | struct rcu_head **ntp; |
316 | ACCESS_ONCE(rsp->completed) != rdp->completed] && | 314 | |
315 | ntp = rdp->nxttail[RCU_DONE_TAIL + | ||
316 | (ACCESS_ONCE(rsp->completed) != rdp->completed)]; | ||
317 | return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && | ||
317 | !rcu_gp_in_progress(rsp); | 318 | !rcu_gp_in_progress(rsp); |
318 | } | 319 | } |
319 | 320 | ||
@@ -416,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
416 | */ | 417 | */ |
417 | void rcu_user_enter(void) | 418 | void rcu_user_enter(void) |
418 | { | 419 | { |
419 | unsigned long flags; | 420 | rcu_eqs_enter(1); |
420 | struct rcu_dynticks *rdtp; | ||
421 | |||
422 | /* | ||
423 | * Some contexts may involve an exception occuring in an irq, | ||
424 | * leading to that nesting: | ||
425 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
426 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
427 | * helpers are enough to protect RCU uses inside the exception. So | ||
428 | * just return immediately if we detect we are in an IRQ. | ||
429 | */ | ||
430 | if (in_interrupt()) | ||
431 | return; | ||
432 | |||
433 | WARN_ON_ONCE(!current->mm); | ||
434 | |||
435 | local_irq_save(flags); | ||
436 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
437 | if (!rdtp->ignore_user_qs && !rdtp->in_user) { | ||
438 | rdtp->in_user = true; | ||
439 | rcu_eqs_enter(true); | ||
440 | } | ||
441 | local_irq_restore(flags); | ||
442 | } | 421 | } |
443 | 422 | ||
444 | /** | 423 | /** |
@@ -575,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); | |||
575 | */ | 554 | */ |
576 | void rcu_user_exit(void) | 555 | void rcu_user_exit(void) |
577 | { | 556 | { |
578 | unsigned long flags; | 557 | rcu_eqs_exit(1); |
579 | struct rcu_dynticks *rdtp; | ||
580 | |||
581 | /* | ||
582 | * Some contexts may involve an exception occuring in an irq, | ||
583 | * leading to that nesting: | ||
584 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
585 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
586 | * helpers are enough to protect RCU uses inside the exception. So | ||
587 | * just return immediately if we detect we are in an IRQ. | ||
588 | */ | ||
589 | if (in_interrupt()) | ||
590 | return; | ||
591 | |||
592 | local_irq_save(flags); | ||
593 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
594 | if (rdtp->in_user) { | ||
595 | rdtp->in_user = false; | ||
596 | rcu_eqs_exit(true); | ||
597 | } | ||
598 | local_irq_restore(flags); | ||
599 | } | 558 | } |
600 | 559 | ||
601 | /** | 560 | /** |
@@ -718,21 +677,6 @@ int rcu_is_cpu_idle(void) | |||
718 | } | 677 | } |
719 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 678 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
720 | 679 | ||
721 | #ifdef CONFIG_RCU_USER_QS | ||
722 | void rcu_user_hooks_switch(struct task_struct *prev, | ||
723 | struct task_struct *next) | ||
724 | { | ||
725 | struct rcu_dynticks *rdtp; | ||
726 | |||
727 | /* Interrupts are disabled in context switch */ | ||
728 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
729 | if (!rdtp->ignore_user_qs) { | ||
730 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
731 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
732 | } | ||
733 | } | ||
734 | #endif /* #ifdef CONFIG_RCU_USER_QS */ | ||
735 | |||
736 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 680 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
737 | 681 | ||
738 | /* | 682 | /* |
@@ -873,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
873 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); | 817 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); |
874 | } | 818 | } |
875 | 819 | ||
820 | /* | ||
821 | * Dump stacks of all tasks running on stalled CPUs. This is a fallback | ||
822 | * for architectures that do not implement trigger_all_cpu_backtrace(). | ||
823 | * The NMI-triggered stack traces are more accurate because they are | ||
824 | * printed by the target CPU. | ||
825 | */ | ||
826 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | ||
827 | { | ||
828 | int cpu; | ||
829 | unsigned long flags; | ||
830 | struct rcu_node *rnp; | ||
831 | |||
832 | rcu_for_each_leaf_node(rsp, rnp) { | ||
833 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
834 | if (rnp->qsmask != 0) { | ||
835 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
836 | if (rnp->qsmask & (1UL << cpu)) | ||
837 | dump_cpu_task(rnp->grplo + cpu); | ||
838 | } | ||
839 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
840 | } | ||
841 | } | ||
842 | |||
876 | static void print_other_cpu_stall(struct rcu_state *rsp) | 843 | static void print_other_cpu_stall(struct rcu_state *rsp) |
877 | { | 844 | { |
878 | int cpu; | 845 | int cpu; |
@@ -880,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
880 | unsigned long flags; | 847 | unsigned long flags; |
881 | int ndetected = 0; | 848 | int ndetected = 0; |
882 | struct rcu_node *rnp = rcu_get_root(rsp); | 849 | struct rcu_node *rnp = rcu_get_root(rsp); |
850 | long totqlen = 0; | ||
883 | 851 | ||
884 | /* Only let one CPU complain about others per time interval. */ | 852 | /* Only let one CPU complain about others per time interval. */ |
885 | 853 | ||
@@ -924,12 +892,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
924 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 892 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
925 | 893 | ||
926 | print_cpu_stall_info_end(); | 894 | print_cpu_stall_info_end(); |
927 | printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", | 895 | for_each_possible_cpu(cpu) |
928 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 896 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
897 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | ||
898 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | ||
899 | rsp->gpnum, rsp->completed, totqlen); | ||
929 | if (ndetected == 0) | 900 | if (ndetected == 0) |
930 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | 901 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); |
931 | else if (!trigger_all_cpu_backtrace()) | 902 | else if (!trigger_all_cpu_backtrace()) |
932 | dump_stack(); | 903 | rcu_dump_cpu_stacks(rsp); |
933 | 904 | ||
934 | /* Complain about tasks blocking the grace period. */ | 905 | /* Complain about tasks blocking the grace period. */ |
935 | 906 | ||
@@ -940,8 +911,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
940 | 911 | ||
941 | static void print_cpu_stall(struct rcu_state *rsp) | 912 | static void print_cpu_stall(struct rcu_state *rsp) |
942 | { | 913 | { |
914 | int cpu; | ||
943 | unsigned long flags; | 915 | unsigned long flags; |
944 | struct rcu_node *rnp = rcu_get_root(rsp); | 916 | struct rcu_node *rnp = rcu_get_root(rsp); |
917 | long totqlen = 0; | ||
945 | 918 | ||
946 | /* | 919 | /* |
947 | * OK, time to rat on ourselves... | 920 | * OK, time to rat on ourselves... |
@@ -952,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
952 | print_cpu_stall_info_begin(); | 925 | print_cpu_stall_info_begin(); |
953 | print_cpu_stall_info(rsp, smp_processor_id()); | 926 | print_cpu_stall_info(rsp, smp_processor_id()); |
954 | print_cpu_stall_info_end(); | 927 | print_cpu_stall_info_end(); |
955 | printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); | 928 | for_each_possible_cpu(cpu) |
929 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | ||
930 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | ||
931 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | ||
956 | if (!trigger_all_cpu_backtrace()) | 932 | if (!trigger_all_cpu_backtrace()) |
957 | dump_stack(); | 933 | dump_stack(); |
958 | 934 | ||
@@ -1091,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp) | |||
1091 | rdp->nxtlist = NULL; | 1067 | rdp->nxtlist = NULL; |
1092 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1068 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1093 | rdp->nxttail[i] = &rdp->nxtlist; | 1069 | rdp->nxttail[i] = &rdp->nxtlist; |
1070 | init_nocb_callback_list(rdp); | ||
1094 | } | 1071 | } |
1095 | 1072 | ||
1096 | /* | 1073 | /* |
@@ -1404,15 +1381,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
1404 | !cpu_needs_another_gp(rsp, rdp)) { | 1381 | !cpu_needs_another_gp(rsp, rdp)) { |
1405 | /* | 1382 | /* |
1406 | * Either we have not yet spawned the grace-period | 1383 | * Either we have not yet spawned the grace-period |
1407 | * task or this CPU does not need another grace period. | 1384 | * task, this CPU does not need another grace period, |
1385 | * or a grace period is already in progress. | ||
1408 | * Either way, don't start a new grace period. | 1386 | * Either way, don't start a new grace period. |
1409 | */ | 1387 | */ |
1410 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1388 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1411 | return; | 1389 | return; |
1412 | } | 1390 | } |
1413 | 1391 | ||
1392 | /* | ||
1393 | * Because there is no grace period in progress right now, | ||
1394 | * any callbacks we have up to this point will be satisfied | ||
1395 | * by the next grace period. So promote all callbacks to be | ||
1396 | * handled after the end of the next grace period. If the | ||
1397 | * CPU is not yet aware of the end of the previous grace period, | ||
1398 | * we need to allow for the callback advancement that will | ||
1399 | * occur when it does become aware. Deadlock prevents us from | ||
1400 | * making it aware at this point: We cannot acquire a leaf | ||
1401 | * rcu_node ->lock while holding the root rcu_node ->lock. | ||
1402 | */ | ||
1403 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1404 | if (rdp->completed == rsp->completed) | ||
1405 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1406 | |||
1414 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1407 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1415 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1408 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ |
1409 | |||
1410 | /* Ensure that CPU is aware of completion of last grace period. */ | ||
1411 | rcu_process_gp_end(rsp, rdp); | ||
1412 | local_irq_restore(flags); | ||
1413 | |||
1414 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
1416 | wake_up(&rsp->gp_wq); | 1415 | wake_up(&rsp->gp_wq); |
1417 | } | 1416 | } |
1418 | 1417 | ||
@@ -1573,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1573 | /* | 1572 | /* |
1574 | * Send the specified CPU's RCU callbacks to the orphanage. The | 1573 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1575 | * specified CPU must be offline, and the caller must hold the | 1574 | * specified CPU must be offline, and the caller must hold the |
1576 | * ->onofflock. | 1575 | * ->orphan_lock. |
1577 | */ | 1576 | */ |
1578 | static void | 1577 | static void |
1579 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 1578 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
1580 | struct rcu_node *rnp, struct rcu_data *rdp) | 1579 | struct rcu_node *rnp, struct rcu_data *rdp) |
1581 | { | 1580 | { |
1581 | /* No-CBs CPUs do not have orphanable callbacks. */ | ||
1582 | if (is_nocb_cpu(rdp->cpu)) | ||
1583 | return; | ||
1584 | |||
1582 | /* | 1585 | /* |
1583 | * Orphan the callbacks. First adjust the counts. This is safe | 1586 | * Orphan the callbacks. First adjust the counts. This is safe |
1584 | * because ->onofflock excludes _rcu_barrier()'s adoption of | 1587 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
1585 | * the callbacks, thus no memory barrier is required. | 1588 | * cannot be running now. Thus no memory barrier is required. |
1586 | */ | 1589 | */ |
1587 | if (rdp->nxtlist != NULL) { | 1590 | if (rdp->nxtlist != NULL) { |
1588 | rsp->qlen_lazy += rdp->qlen_lazy; | 1591 | rsp->qlen_lazy += rdp->qlen_lazy; |
@@ -1623,13 +1626,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1623 | 1626 | ||
1624 | /* | 1627 | /* |
1625 | * Adopt the RCU callbacks from the specified rcu_state structure's | 1628 | * Adopt the RCU callbacks from the specified rcu_state structure's |
1626 | * orphanage. The caller must hold the ->onofflock. | 1629 | * orphanage. The caller must hold the ->orphan_lock. |
1627 | */ | 1630 | */ |
1628 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | 1631 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) |
1629 | { | 1632 | { |
1630 | int i; | 1633 | int i; |
1631 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 1634 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
1632 | 1635 | ||
1636 | /* No-CBs CPUs are handled specially. */ | ||
1637 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) | ||
1638 | return; | ||
1639 | |||
1633 | /* Do the accounting first. */ | 1640 | /* Do the accounting first. */ |
1634 | rdp->qlen_lazy += rsp->qlen_lazy; | 1641 | rdp->qlen_lazy += rsp->qlen_lazy; |
1635 | rdp->qlen += rsp->qlen; | 1642 | rdp->qlen += rsp->qlen; |
@@ -1702,7 +1709,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1702 | 1709 | ||
1703 | /* Exclude any attempts to start a new grace period. */ | 1710 | /* Exclude any attempts to start a new grace period. */ |
1704 | mutex_lock(&rsp->onoff_mutex); | 1711 | mutex_lock(&rsp->onoff_mutex); |
1705 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1712 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); |
1706 | 1713 | ||
1707 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 1714 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
1708 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 1715 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
@@ -1729,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1729 | /* | 1736 | /* |
1730 | * We still hold the leaf rcu_node structure lock here, and | 1737 | * We still hold the leaf rcu_node structure lock here, and |
1731 | * irqs are still disabled. The reason for this subterfuge is | 1738 | * irqs are still disabled. The reason for this subterfuge is |
1732 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1739 | * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock |
1733 | * held leads to deadlock. | 1740 | * held leads to deadlock. |
1734 | */ | 1741 | */ |
1735 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1742 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ |
1736 | rnp = rdp->mynode; | 1743 | rnp = rdp->mynode; |
1737 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1744 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
1738 | rcu_report_unblock_qs_rnp(rnp, flags); | 1745 | rcu_report_unblock_qs_rnp(rnp, flags); |
@@ -1769,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1769 | { | 1776 | { |
1770 | unsigned long flags; | 1777 | unsigned long flags; |
1771 | struct rcu_head *next, *list, **tail; | 1778 | struct rcu_head *next, *list, **tail; |
1772 | int bl, count, count_lazy, i; | 1779 | long bl, count, count_lazy; |
1780 | int i; | ||
1773 | 1781 | ||
1774 | /* If no callbacks are ready, just return.*/ | 1782 | /* If no callbacks are ready, just return.*/ |
1775 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1783 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -2107,9 +2115,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2107 | } | 2115 | } |
2108 | } | 2116 | } |
2109 | 2117 | ||
2118 | /* | ||
2119 | * Helper function for call_rcu() and friends. The cpu argument will | ||
2120 | * normally be -1, indicating "currently running CPU". It may specify | ||
2121 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | ||
2122 | * is expected to specify a CPU. | ||
2123 | */ | ||
2110 | static void | 2124 | static void |
2111 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 2125 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
2112 | struct rcu_state *rsp, bool lazy) | 2126 | struct rcu_state *rsp, int cpu, bool lazy) |
2113 | { | 2127 | { |
2114 | unsigned long flags; | 2128 | unsigned long flags; |
2115 | struct rcu_data *rdp; | 2129 | struct rcu_data *rdp; |
@@ -2129,9 +2143,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2129 | rdp = this_cpu_ptr(rsp->rda); | 2143 | rdp = this_cpu_ptr(rsp->rda); |
2130 | 2144 | ||
2131 | /* Add the callback to our list. */ | 2145 | /* Add the callback to our list. */ |
2132 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { | 2146 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { |
2147 | int offline; | ||
2148 | |||
2149 | if (cpu != -1) | ||
2150 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2151 | offline = !__call_rcu_nocb(rdp, head, lazy); | ||
2152 | WARN_ON_ONCE(offline); | ||
2133 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2153 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ |
2134 | WARN_ON_ONCE(1); | ||
2135 | local_irq_restore(flags); | 2154 | local_irq_restore(flags); |
2136 | return; | 2155 | return; |
2137 | } | 2156 | } |
@@ -2160,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2160 | */ | 2179 | */ |
2161 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2180 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
2162 | { | 2181 | { |
2163 | __call_rcu(head, func, &rcu_sched_state, 0); | 2182 | __call_rcu(head, func, &rcu_sched_state, -1, 0); |
2164 | } | 2183 | } |
2165 | EXPORT_SYMBOL_GPL(call_rcu_sched); | 2184 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
2166 | 2185 | ||
@@ -2169,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
2169 | */ | 2188 | */ |
2170 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2189 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
2171 | { | 2190 | { |
2172 | __call_rcu(head, func, &rcu_bh_state, 0); | 2191 | __call_rcu(head, func, &rcu_bh_state, -1, 0); |
2173 | } | 2192 | } |
2174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 2193 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
2175 | 2194 | ||
@@ -2205,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void) | |||
2205 | * rcu_read_lock_sched(). | 2224 | * rcu_read_lock_sched(). |
2206 | * | 2225 | * |
2207 | * This means that all preempt_disable code sequences, including NMI and | 2226 | * This means that all preempt_disable code sequences, including NMI and |
2208 | * hardware-interrupt handlers, in progress on entry will have completed | 2227 | * non-threaded hardware-interrupt handlers, in progress on entry will |
2209 | * before this primitive returns. However, this does not guarantee that | 2228 | * have completed before this primitive returns. However, this does not |
2210 | * softirq handlers will have completed, since in some kernels, these | 2229 | * guarantee that softirq handlers will have completed, since in some |
2211 | * handlers can run in process context, and can block. | 2230 | * kernels, these handlers can run in process context, and can block. |
2231 | * | ||
2232 | * Note that this guarantee implies further memory-ordering guarantees. | ||
2233 | * On systems with more than one CPU, when synchronize_sched() returns, | ||
2234 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
2235 | * end of its last RCU-sched read-side critical section whose beginning | ||
2236 | * preceded the call to synchronize_sched(). In addition, each CPU having | ||
2237 | * an RCU read-side critical section that extends beyond the return from | ||
2238 | * synchronize_sched() is guaranteed to have executed a full memory barrier | ||
2239 | * after the beginning of synchronize_sched() and before the beginning of | ||
2240 | * that RCU read-side critical section. Note that these guarantees include | ||
2241 | * CPUs that are offline, idle, or executing in user mode, as well as CPUs | ||
2242 | * that are executing in the kernel. | ||
2243 | * | ||
2244 | * Furthermore, if CPU A invoked synchronize_sched(), which returned | ||
2245 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
2246 | * to have executed a full memory barrier during the execution of | ||
2247 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but | ||
2248 | * again only if the system has more than one CPU). | ||
2212 | * | 2249 | * |
2213 | * This primitive provides the guarantees made by the (now removed) | 2250 | * This primitive provides the guarantees made by the (now removed) |
2214 | * synchronize_kernel() API. In contrast, synchronize_rcu() only | 2251 | * synchronize_kernel() API. In contrast, synchronize_rcu() only |
@@ -2224,7 +2261,10 @@ void synchronize_sched(void) | |||
2224 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 2261 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); |
2225 | if (rcu_blocking_is_gp()) | 2262 | if (rcu_blocking_is_gp()) |
2226 | return; | 2263 | return; |
2227 | wait_rcu_gp(call_rcu_sched); | 2264 | if (rcu_expedited) |
2265 | synchronize_sched_expedited(); | ||
2266 | else | ||
2267 | wait_rcu_gp(call_rcu_sched); | ||
2228 | } | 2268 | } |
2229 | EXPORT_SYMBOL_GPL(synchronize_sched); | 2269 | EXPORT_SYMBOL_GPL(synchronize_sched); |
2230 | 2270 | ||
@@ -2236,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
2236 | * read-side critical sections have completed. RCU read-side critical | 2276 | * read-side critical sections have completed. RCU read-side critical |
2237 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), | 2277 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), |
2238 | * and may be nested. | 2278 | * and may be nested. |
2279 | * | ||
2280 | * See the description of synchronize_sched() for more detailed information | ||
2281 | * on memory ordering guarantees. | ||
2239 | */ | 2282 | */ |
2240 | void synchronize_rcu_bh(void) | 2283 | void synchronize_rcu_bh(void) |
2241 | { | 2284 | { |
@@ -2245,13 +2288,13 @@ void synchronize_rcu_bh(void) | |||
2245 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 2288 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); |
2246 | if (rcu_blocking_is_gp()) | 2289 | if (rcu_blocking_is_gp()) |
2247 | return; | 2290 | return; |
2248 | wait_rcu_gp(call_rcu_bh); | 2291 | if (rcu_expedited) |
2292 | synchronize_rcu_bh_expedited(); | ||
2293 | else | ||
2294 | wait_rcu_gp(call_rcu_bh); | ||
2249 | } | 2295 | } |
2250 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 2296 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
2251 | 2297 | ||
2252 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
2253 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
2254 | |||
2255 | static int synchronize_sched_expedited_cpu_stop(void *data) | 2298 | static int synchronize_sched_expedited_cpu_stop(void *data) |
2256 | { | 2299 | { |
2257 | /* | 2300 | /* |
@@ -2308,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
2308 | */ | 2351 | */ |
2309 | void synchronize_sched_expedited(void) | 2352 | void synchronize_sched_expedited(void) |
2310 | { | 2353 | { |
2311 | int firstsnap, s, snap, trycount = 0; | 2354 | long firstsnap, s, snap; |
2355 | int trycount = 0; | ||
2356 | struct rcu_state *rsp = &rcu_sched_state; | ||
2357 | |||
2358 | /* | ||
2359 | * If we are in danger of counter wrap, just do synchronize_sched(). | ||
2360 | * By allowing sync_sched_expedited_started to advance no more than | ||
2361 | * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring | ||
2362 | * that more than 3.5 billion CPUs would be required to force a | ||
2363 | * counter wrap on a 32-bit system. Quite a few more CPUs would of | ||
2364 | * course be required on a 64-bit system. | ||
2365 | */ | ||
2366 | if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), | ||
2367 | (ulong)atomic_long_read(&rsp->expedited_done) + | ||
2368 | ULONG_MAX / 8)) { | ||
2369 | synchronize_sched(); | ||
2370 | atomic_long_inc(&rsp->expedited_wrap); | ||
2371 | return; | ||
2372 | } | ||
2312 | 2373 | ||
2313 | /* Note that atomic_inc_return() implies full memory barrier. */ | 2374 | /* |
2314 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | 2375 | * Take a ticket. Note that atomic_inc_return() implies a |
2376 | * full memory barrier. | ||
2377 | */ | ||
2378 | snap = atomic_long_inc_return(&rsp->expedited_start); | ||
2379 | firstsnap = snap; | ||
2315 | get_online_cpus(); | 2380 | get_online_cpus(); |
2316 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2381 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
2317 | 2382 | ||
@@ -2323,48 +2388,65 @@ void synchronize_sched_expedited(void) | |||
2323 | synchronize_sched_expedited_cpu_stop, | 2388 | synchronize_sched_expedited_cpu_stop, |
2324 | NULL) == -EAGAIN) { | 2389 | NULL) == -EAGAIN) { |
2325 | put_online_cpus(); | 2390 | put_online_cpus(); |
2391 | atomic_long_inc(&rsp->expedited_tryfail); | ||
2392 | |||
2393 | /* Check to see if someone else did our work for us. */ | ||
2394 | s = atomic_long_read(&rsp->expedited_done); | ||
2395 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
2396 | /* ensure test happens before caller kfree */ | ||
2397 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2398 | atomic_long_inc(&rsp->expedited_workdone1); | ||
2399 | return; | ||
2400 | } | ||
2326 | 2401 | ||
2327 | /* No joy, try again later. Or just synchronize_sched(). */ | 2402 | /* No joy, try again later. Or just synchronize_sched(). */ |
2328 | if (trycount++ < 10) { | 2403 | if (trycount++ < 10) { |
2329 | udelay(trycount * num_online_cpus()); | 2404 | udelay(trycount * num_online_cpus()); |
2330 | } else { | 2405 | } else { |
2331 | synchronize_sched(); | 2406 | wait_rcu_gp(call_rcu_sched); |
2407 | atomic_long_inc(&rsp->expedited_normal); | ||
2332 | return; | 2408 | return; |
2333 | } | 2409 | } |
2334 | 2410 | ||
2335 | /* Check to see if someone else did our work for us. */ | 2411 | /* Recheck to see if someone else did our work for us. */ |
2336 | s = atomic_read(&sync_sched_expedited_done); | 2412 | s = atomic_long_read(&rsp->expedited_done); |
2337 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | 2413 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
2338 | smp_mb(); /* ensure test happens before caller kfree */ | 2414 | /* ensure test happens before caller kfree */ |
2415 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2416 | atomic_long_inc(&rsp->expedited_workdone2); | ||
2339 | return; | 2417 | return; |
2340 | } | 2418 | } |
2341 | 2419 | ||
2342 | /* | 2420 | /* |
2343 | * Refetching sync_sched_expedited_started allows later | 2421 | * Refetching sync_sched_expedited_started allows later |
2344 | * callers to piggyback on our grace period. We subtract | 2422 | * callers to piggyback on our grace period. We retry |
2345 | * 1 to get the same token that the last incrementer got. | 2423 | * after they started, so our grace period works for them, |
2346 | * We retry after they started, so our grace period works | 2424 | * and they started after our first try, so their grace |
2347 | * for them, and they started after our first try, so their | 2425 | * period works for us. |
2348 | * grace period works for us. | ||
2349 | */ | 2426 | */ |
2350 | get_online_cpus(); | 2427 | get_online_cpus(); |
2351 | snap = atomic_read(&sync_sched_expedited_started); | 2428 | snap = atomic_long_read(&rsp->expedited_start); |
2352 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 2429 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
2353 | } | 2430 | } |
2431 | atomic_long_inc(&rsp->expedited_stoppedcpus); | ||
2354 | 2432 | ||
2355 | /* | 2433 | /* |
2356 | * Everyone up to our most recent fetch is covered by our grace | 2434 | * Everyone up to our most recent fetch is covered by our grace |
2357 | * period. Update the counter, but only if our work is still | 2435 | * period. Update the counter, but only if our work is still |
2358 | * relevant -- which it won't be if someone who started later | 2436 | * relevant -- which it won't be if someone who started later |
2359 | * than we did beat us to the punch. | 2437 | * than we did already did their update. |
2360 | */ | 2438 | */ |
2361 | do { | 2439 | do { |
2362 | s = atomic_read(&sync_sched_expedited_done); | 2440 | atomic_long_inc(&rsp->expedited_done_tries); |
2363 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | 2441 | s = atomic_long_read(&rsp->expedited_done); |
2364 | smp_mb(); /* ensure test happens before caller kfree */ | 2442 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { |
2443 | /* ensure test happens before caller kfree */ | ||
2444 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2445 | atomic_long_inc(&rsp->expedited_done_lost); | ||
2365 | break; | 2446 | break; |
2366 | } | 2447 | } |
2367 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | 2448 | } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); |
2449 | atomic_long_inc(&rsp->expedited_done_exit); | ||
2368 | 2450 | ||
2369 | put_online_cpus(); | 2451 | put_online_cpus(); |
2370 | } | 2452 | } |
@@ -2558,9 +2640,17 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2558 | * When that callback is invoked, we will know that all of the | 2640 | * When that callback is invoked, we will know that all of the |
2559 | * corresponding CPU's preceding callbacks have been invoked. | 2641 | * corresponding CPU's preceding callbacks have been invoked. |
2560 | */ | 2642 | */ |
2561 | for_each_online_cpu(cpu) { | 2643 | for_each_possible_cpu(cpu) { |
2644 | if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) | ||
2645 | continue; | ||
2562 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2646 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2563 | if (ACCESS_ONCE(rdp->qlen)) { | 2647 | if (is_nocb_cpu(cpu)) { |
2648 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | ||
2649 | rsp->n_barrier_done); | ||
2650 | atomic_inc(&rsp->barrier_cpu_count); | ||
2651 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, | ||
2652 | rsp, cpu, 0); | ||
2653 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2564 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 2654 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
2565 | rsp->n_barrier_done); | 2655 | rsp->n_barrier_done); |
2566 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 2656 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
@@ -2634,6 +2724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
2634 | #endif | 2724 | #endif |
2635 | rdp->cpu = cpu; | 2725 | rdp->cpu = cpu; |
2636 | rdp->rsp = rsp; | 2726 | rdp->rsp = rsp; |
2727 | rcu_boot_init_nocb_percpu_data(rdp); | ||
2637 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2728 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2638 | } | 2729 | } |
2639 | 2730 | ||
@@ -2715,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2715 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2806 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2716 | struct rcu_node *rnp = rdp->mynode; | 2807 | struct rcu_node *rnp = rdp->mynode; |
2717 | struct rcu_state *rsp; | 2808 | struct rcu_state *rsp; |
2809 | int ret = NOTIFY_OK; | ||
2718 | 2810 | ||
2719 | trace_rcu_utilization("Start CPU hotplug"); | 2811 | trace_rcu_utilization("Start CPU hotplug"); |
2720 | switch (action) { | 2812 | switch (action) { |
@@ -2728,7 +2820,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2728 | rcu_boost_kthread_setaffinity(rnp, -1); | 2820 | rcu_boost_kthread_setaffinity(rnp, -1); |
2729 | break; | 2821 | break; |
2730 | case CPU_DOWN_PREPARE: | 2822 | case CPU_DOWN_PREPARE: |
2731 | rcu_boost_kthread_setaffinity(rnp, cpu); | 2823 | if (nocb_cpu_expendable(cpu)) |
2824 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
2825 | else | ||
2826 | ret = NOTIFY_BAD; | ||
2732 | break; | 2827 | break; |
2733 | case CPU_DYING: | 2828 | case CPU_DYING: |
2734 | case CPU_DYING_FROZEN: | 2829 | case CPU_DYING_FROZEN: |
@@ -2752,7 +2847,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2752 | break; | 2847 | break; |
2753 | } | 2848 | } |
2754 | trace_rcu_utilization("End CPU hotplug"); | 2849 | trace_rcu_utilization("End CPU hotplug"); |
2755 | return NOTIFY_OK; | 2850 | return ret; |
2756 | } | 2851 | } |
2757 | 2852 | ||
2758 | /* | 2853 | /* |
@@ -2772,6 +2867,7 @@ static int __init rcu_spawn_gp_kthread(void) | |||
2772 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2867 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2773 | rsp->gp_kthread = t; | 2868 | rsp->gp_kthread = t; |
2774 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2869 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2870 | rcu_spawn_nocb_kthreads(rsp); | ||
2775 | } | 2871 | } |
2776 | return 0; | 2872 | return 0; |
2777 | } | 2873 | } |
@@ -2967,6 +3063,7 @@ void __init rcu_init(void) | |||
2967 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 3063 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
2968 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3064 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
2969 | __rcu_init_preempt(); | 3065 | __rcu_init_preempt(); |
3066 | rcu_init_nocb(); | ||
2970 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3067 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
2971 | 3068 | ||
2972 | /* | 3069 | /* |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a240f032848e..4b69291b093d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -287,6 +287,7 @@ struct rcu_data { | |||
287 | long qlen_last_fqs_check; | 287 | long qlen_last_fqs_check; |
288 | /* qlen at last check for QS forcing */ | 288 | /* qlen at last check for QS forcing */ |
289 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 289 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
290 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
290 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | 291 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
291 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | 292 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
292 | unsigned long n_force_qs_snap; | 293 | unsigned long n_force_qs_snap; |
@@ -317,6 +318,18 @@ struct rcu_data { | |||
317 | struct rcu_head oom_head; | 318 | struct rcu_head oom_head; |
318 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 319 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
319 | 320 | ||
321 | /* 7) Callback offloading. */ | ||
322 | #ifdef CONFIG_RCU_NOCB_CPU | ||
323 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | ||
324 | struct rcu_head **nocb_tail; | ||
325 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | ||
326 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | ||
327 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
328 | int nocb_p_count_lazy; /* (approximate). */ | ||
329 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | ||
330 | struct task_struct *nocb_kthread; | ||
331 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
332 | |||
320 | int cpu; | 333 | int cpu; |
321 | struct rcu_state *rsp; | 334 | struct rcu_state *rsp; |
322 | }; | 335 | }; |
@@ -369,6 +382,12 @@ struct rcu_state { | |||
369 | struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */ | 382 | struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */ |
370 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 383 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
371 | void (*func)(struct rcu_head *head)); | 384 | void (*func)(struct rcu_head *head)); |
385 | #ifdef CONFIG_RCU_NOCB_CPU | ||
386 | void (*call_remote)(struct rcu_head *head, | ||
387 | void (*func)(struct rcu_head *head)); | ||
388 | /* call_rcu() flavor, but for */ | ||
389 | /* placing on remote CPU. */ | ||
390 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
372 | 391 | ||
373 | /* The following fields are guarded by the root rcu_node's lock. */ | 392 | /* The following fields are guarded by the root rcu_node's lock. */ |
374 | 393 | ||
@@ -383,9 +402,8 @@ struct rcu_state { | |||
383 | 402 | ||
384 | /* End of fields guarded by root rcu_node's lock. */ | 403 | /* End of fields guarded by root rcu_node's lock. */ |
385 | 404 | ||
386 | raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; | 405 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
387 | /* exclude on/offline and */ | 406 | /* Protect following fields. */ |
388 | /* starting new GP. */ | ||
389 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 407 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ |
390 | /* need a grace period. */ | 408 | /* need a grace period. */ |
391 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 409 | struct rcu_head **orphan_nxttail; /* Tail of above. */ |
@@ -394,7 +412,7 @@ struct rcu_state { | |||
394 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 412 | struct rcu_head **orphan_donetail; /* Tail of above. */ |
395 | long qlen_lazy; /* Number of lazy callbacks. */ | 413 | long qlen_lazy; /* Number of lazy callbacks. */ |
396 | long qlen; /* Total number of callbacks. */ | 414 | long qlen; /* Total number of callbacks. */ |
397 | /* End of fields guarded by onofflock. */ | 415 | /* End of fields guarded by orphan_lock. */ |
398 | 416 | ||
399 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | 417 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ |
400 | 418 | ||
@@ -405,6 +423,18 @@ struct rcu_state { | |||
405 | /* _rcu_barrier(). */ | 423 | /* _rcu_barrier(). */ |
406 | /* End of fields guarded by barrier_mutex. */ | 424 | /* End of fields guarded by barrier_mutex. */ |
407 | 425 | ||
426 | atomic_long_t expedited_start; /* Starting ticket. */ | ||
427 | atomic_long_t expedited_done; /* Done ticket. */ | ||
428 | atomic_long_t expedited_wrap; /* # near-wrap incidents. */ | ||
429 | atomic_long_t expedited_tryfail; /* # acquisition failures. */ | ||
430 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
431 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
432 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | ||
433 | atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ | ||
434 | atomic_long_t expedited_done_tries; /* # tries to update _done. */ | ||
435 | atomic_long_t expedited_done_lost; /* # times beaten to _done. */ | ||
436 | atomic_long_t expedited_done_exit; /* # times exited _done loop. */ | ||
437 | |||
408 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 438 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
409 | /* force_quiescent_state(). */ | 439 | /* force_quiescent_state(). */ |
410 | unsigned long n_force_qs; /* Number of calls to */ | 440 | unsigned long n_force_qs; /* Number of calls to */ |
@@ -428,6 +458,8 @@ struct rcu_state { | |||
428 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | 458 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ |
429 | 459 | ||
430 | extern struct list_head rcu_struct_flavors; | 460 | extern struct list_head rcu_struct_flavors; |
461 | |||
462 | /* Sequence through rcu_state structures for each RCU flavor. */ | ||
431 | #define for_each_rcu_flavor(rsp) \ | 463 | #define for_each_rcu_flavor(rsp) \ |
432 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 464 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
433 | 465 | ||
@@ -504,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
504 | static void print_cpu_stall_info_end(void); | 536 | static void print_cpu_stall_info_end(void); |
505 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 537 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
506 | static void increment_cpu_stall_ticks(void); | 538 | static void increment_cpu_stall_ticks(void); |
539 | static bool is_nocb_cpu(int cpu); | ||
540 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
541 | bool lazy); | ||
542 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
543 | struct rcu_data *rdp); | ||
544 | static bool nocb_cpu_expendable(int cpu); | ||
545 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | ||
546 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | ||
547 | static void init_nocb_callback_list(struct rcu_data *rdp); | ||
548 | static void __init rcu_init_nocb(void); | ||
507 | 549 | ||
508 | #endif /* #ifndef RCU_TREE_NONCORE */ | 550 | #endif /* #ifndef RCU_TREE_NONCORE */ |
551 | |||
552 | #ifdef CONFIG_RCU_TRACE | ||
553 | #ifdef CONFIG_RCU_NOCB_CPU | ||
554 | /* Sum up queue lengths for tracing. */ | ||
555 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
556 | { | ||
557 | *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; | ||
558 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; | ||
559 | } | ||
560 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
561 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
562 | { | ||
563 | *ql = 0; | ||
564 | *qll = 0; | ||
565 | } | ||
566 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
567 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f92115488187..f6e5ec2932b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/gfp.h> | ||
28 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
29 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
30 | 31 | ||
@@ -36,6 +37,14 @@ | |||
36 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 37 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO |
37 | #endif | 38 | #endif |
38 | 39 | ||
40 | #ifdef CONFIG_RCU_NOCB_CPU | ||
41 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | ||
42 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | ||
43 | static bool rcu_nocb_poll; /* Offload kthreads are to poll. */ | ||
44 | module_param(rcu_nocb_poll, bool, 0444); | ||
45 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
47 | |||
39 | /* | 48 | /* |
40 | * Check the RCU kernel configuration parameters and print informative | 49 | * Check the RCU kernel configuration parameters and print informative |
41 | * messages about anything out of the ordinary. If you like #ifdef, you | 50 | * messages about anything out of the ordinary. If you like #ifdef, you |
@@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) | |||
76 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 85 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
77 | if (nr_cpu_ids != NR_CPUS) | 86 | if (nr_cpu_ids != NR_CPUS) |
78 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 87 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
88 | #ifdef CONFIG_RCU_NOCB_CPU | ||
89 | if (have_rcu_nocb_mask) { | ||
90 | if (cpumask_test_cpu(0, rcu_nocb_mask)) { | ||
91 | cpumask_clear_cpu(0, rcu_nocb_mask); | ||
92 | pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); | ||
93 | } | ||
94 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
95 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | ||
96 | if (rcu_nocb_poll) | ||
97 | pr_info("\tExperimental polled no-CBs CPUs.\n"); | ||
98 | } | ||
99 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
79 | } | 100 | } |
80 | 101 | ||
81 | #ifdef CONFIG_TREE_PREEMPT_RCU | 102 | #ifdef CONFIG_TREE_PREEMPT_RCU |
@@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) | |||
642 | */ | 663 | */ |
643 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 664 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
644 | { | 665 | { |
645 | __call_rcu(head, func, &rcu_preempt_state, 0); | 666 | __call_rcu(head, func, &rcu_preempt_state, -1, 0); |
646 | } | 667 | } |
647 | EXPORT_SYMBOL_GPL(call_rcu); | 668 | EXPORT_SYMBOL_GPL(call_rcu); |
648 | 669 | ||
@@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
656 | void kfree_call_rcu(struct rcu_head *head, | 677 | void kfree_call_rcu(struct rcu_head *head, |
657 | void (*func)(struct rcu_head *rcu)) | 678 | void (*func)(struct rcu_head *rcu)) |
658 | { | 679 | { |
659 | __call_rcu(head, func, &rcu_preempt_state, 1); | 680 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); |
660 | } | 681 | } |
661 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 682 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
662 | 683 | ||
@@ -670,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); | |||
670 | * concurrently with new RCU read-side critical sections that began while | 691 | * concurrently with new RCU read-side critical sections that began while |
671 | * synchronize_rcu() was waiting. RCU read-side critical sections are | 692 | * synchronize_rcu() was waiting. RCU read-side critical sections are |
672 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | 693 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. |
694 | * | ||
695 | * See the description of synchronize_sched() for more detailed information | ||
696 | * on memory ordering guarantees. | ||
673 | */ | 697 | */ |
674 | void synchronize_rcu(void) | 698 | void synchronize_rcu(void) |
675 | { | 699 | { |
@@ -679,7 +703,10 @@ void synchronize_rcu(void) | |||
679 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 703 | "Illegal synchronize_rcu() in RCU read-side critical section"); |
680 | if (!rcu_scheduler_active) | 704 | if (!rcu_scheduler_active) |
681 | return; | 705 | return; |
682 | wait_rcu_gp(call_rcu); | 706 | if (rcu_expedited) |
707 | synchronize_rcu_expedited(); | ||
708 | else | ||
709 | wait_rcu_gp(call_rcu); | ||
683 | } | 710 | } |
684 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 711 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
685 | 712 | ||
@@ -757,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
757 | * grace period for the specified rcu_node structure. If there are no such | 784 | * grace period for the specified rcu_node structure. If there are no such |
758 | * tasks, report it up the rcu_node hierarchy. | 785 | * tasks, report it up the rcu_node hierarchy. |
759 | * | 786 | * |
760 | * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. | 787 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude |
788 | * CPU hotplug operations. | ||
761 | */ | 789 | */ |
762 | static void | 790 | static void |
763 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 791 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
@@ -831,7 +859,7 @@ void synchronize_rcu_expedited(void) | |||
831 | udelay(trycount * num_online_cpus()); | 859 | udelay(trycount * num_online_cpus()); |
832 | } else { | 860 | } else { |
833 | put_online_cpus(); | 861 | put_online_cpus(); |
834 | synchronize_rcu(); | 862 | wait_rcu_gp(call_rcu); |
835 | return; | 863 | return; |
836 | } | 864 | } |
837 | } | 865 | } |
@@ -875,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
875 | 903 | ||
876 | /** | 904 | /** |
877 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 905 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
906 | * | ||
907 | * Note that this primitive does not necessarily wait for an RCU grace period | ||
908 | * to complete. For example, if there are no RCU callbacks queued anywhere | ||
909 | * in the system, then rcu_barrier() is within its rights to return | ||
910 | * immediately, without waiting for anything, much less an RCU grace period. | ||
878 | */ | 911 | */ |
879 | void rcu_barrier(void) | 912 | void rcu_barrier(void) |
880 | { | 913 | { |
@@ -1013,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
1013 | void kfree_call_rcu(struct rcu_head *head, | 1046 | void kfree_call_rcu(struct rcu_head *head, |
1014 | void (*func)(struct rcu_head *rcu)) | 1047 | void (*func)(struct rcu_head *rcu)) |
1015 | { | 1048 | { |
1016 | __call_rcu(head, func, &rcu_sched_state, 1); | 1049 | __call_rcu(head, func, &rcu_sched_state, -1, 1); |
1017 | } | 1050 | } |
1018 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 1051 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
1019 | 1052 | ||
@@ -2092,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) | |||
2092 | } | 2125 | } |
2093 | 2126 | ||
2094 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 2127 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
2128 | |||
2129 | #ifdef CONFIG_RCU_NOCB_CPU | ||
2130 | |||
2131 | /* | ||
2132 | * Offload callback processing from the boot-time-specified set of CPUs | ||
2133 | * specified by rcu_nocb_mask. For each CPU in the set, there is a | ||
2134 | * kthread created that pulls the callbacks from the corresponding CPU, | ||
2135 | * waits for a grace period to elapse, and invokes the callbacks. | ||
2136 | * The no-CBs CPUs do a wake_up() on their kthread when they insert | ||
2137 | * a callback into any empty list, unless the rcu_nocb_poll boot parameter | ||
2138 | * has been specified, in which case each kthread actively polls its | ||
2139 | * CPU. (Which isn't so great for energy efficiency, but which does | ||
2140 | * reduce RCU's overhead on that CPU.) | ||
2141 | * | ||
2142 | * This is intended to be used in conjunction with Frederic Weisbecker's | ||
2143 | * adaptive-idle work, which would seriously reduce OS jitter on CPUs | ||
2144 | * running CPU-bound user-mode computations. | ||
2145 | * | ||
2146 | * Offloading of callback processing could also in theory be used as | ||
2147 | * an energy-efficiency measure because CPUs with no RCU callbacks | ||
2148 | * queued are more aggressive about entering dyntick-idle mode. | ||
2149 | */ | ||
2150 | |||
2151 | |||
2152 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | ||
2153 | static int __init rcu_nocb_setup(char *str) | ||
2154 | { | ||
2155 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
2156 | have_rcu_nocb_mask = true; | ||
2157 | cpulist_parse(str, rcu_nocb_mask); | ||
2158 | return 1; | ||
2159 | } | ||
2160 | __setup("rcu_nocbs=", rcu_nocb_setup); | ||
2161 | |||
2162 | /* Is the specified CPU a no-CBs CPU? */ | ||
2163 | static bool is_nocb_cpu(int cpu) | ||
2164 | { | ||
2165 | if (have_rcu_nocb_mask) | ||
2166 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | ||
2167 | return false; | ||
2168 | } | ||
2169 | |||
2170 | /* | ||
2171 | * Enqueue the specified string of rcu_head structures onto the specified | ||
2172 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | ||
2173 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
2174 | * counts are supplied by rhcount and rhcount_lazy. | ||
2175 | * | ||
2176 | * If warranted, also wake up the kthread servicing this CPU's queues. | ||
2177 | */ | ||
2178 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | ||
2179 | struct rcu_head *rhp, | ||
2180 | struct rcu_head **rhtp, | ||
2181 | int rhcount, int rhcount_lazy) | ||
2182 | { | ||
2183 | int len; | ||
2184 | struct rcu_head **old_rhpp; | ||
2185 | struct task_struct *t; | ||
2186 | |||
2187 | /* Enqueue the callback on the nocb list and update counts. */ | ||
2188 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
2189 | ACCESS_ONCE(*old_rhpp) = rhp; | ||
2190 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
2191 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
2192 | |||
2193 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
2194 | t = ACCESS_ONCE(rdp->nocb_kthread); | ||
2195 | if (rcu_nocb_poll | !t) | ||
2196 | return; | ||
2197 | len = atomic_long_read(&rdp->nocb_q_count); | ||
2198 | if (old_rhpp == &rdp->nocb_head) { | ||
2199 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | ||
2200 | rdp->qlen_last_fqs_check = 0; | ||
2201 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | ||
2202 | wake_up_process(t); /* ... or if many callbacks queued. */ | ||
2203 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | ||
2204 | } | ||
2205 | return; | ||
2206 | } | ||
2207 | |||
2208 | /* | ||
2209 | * This is a helper for __call_rcu(), which invokes this when the normal | ||
2210 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
2211 | * function returns failure back to __call_rcu(), which can complain | ||
2212 | * appropriately. | ||
2213 | * | ||
2214 | * Otherwise, this function queues the callback where the corresponding | ||
2215 | * "rcuo" kthread can find it. | ||
2216 | */ | ||
2217 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2218 | bool lazy) | ||
2219 | { | ||
2220 | |||
2221 | if (!is_nocb_cpu(rdp->cpu)) | ||
2222 | return 0; | ||
2223 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | ||
2224 | return 1; | ||
2225 | } | ||
2226 | |||
2227 | /* | ||
2228 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
2229 | * not a no-CBs CPU. | ||
2230 | */ | ||
2231 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2232 | struct rcu_data *rdp) | ||
2233 | { | ||
2234 | long ql = rsp->qlen; | ||
2235 | long qll = rsp->qlen_lazy; | ||
2236 | |||
2237 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | ||
2238 | if (!is_nocb_cpu(smp_processor_id())) | ||
2239 | return 0; | ||
2240 | rsp->qlen = 0; | ||
2241 | rsp->qlen_lazy = 0; | ||
2242 | |||
2243 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | ||
2244 | if (rsp->orphan_donelist != NULL) { | ||
2245 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | ||
2246 | rsp->orphan_donetail, ql, qll); | ||
2247 | ql = qll = 0; | ||
2248 | rsp->orphan_donelist = NULL; | ||
2249 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2250 | } | ||
2251 | if (rsp->orphan_nxtlist != NULL) { | ||
2252 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | ||
2253 | rsp->orphan_nxttail, ql, qll); | ||
2254 | ql = qll = 0; | ||
2255 | rsp->orphan_nxtlist = NULL; | ||
2256 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2257 | } | ||
2258 | return 1; | ||
2259 | } | ||
2260 | |||
2261 | /* | ||
2262 | * There must be at least one non-no-CBs CPU in operation at any given | ||
2263 | * time, because no-CBs CPUs are not capable of initiating grace periods | ||
2264 | * independently. This function therefore complains if the specified | ||
2265 | * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to | ||
2266 | * avoid offlining the last such CPU. (Recursion is a wonderful thing, | ||
2267 | * but you have to have a base case!) | ||
2268 | */ | ||
2269 | static bool nocb_cpu_expendable(int cpu) | ||
2270 | { | ||
2271 | cpumask_var_t non_nocb_cpus; | ||
2272 | int ret; | ||
2273 | |||
2274 | /* | ||
2275 | * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, | ||
2276 | * then offlining this CPU is harmless. Let it happen. | ||
2277 | */ | ||
2278 | if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) | ||
2279 | return 1; | ||
2280 | |||
2281 | /* If no memory, play it safe and keep the CPU around. */ | ||
2282 | if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) | ||
2283 | return 0; | ||
2284 | cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); | ||
2285 | cpumask_clear_cpu(cpu, non_nocb_cpus); | ||
2286 | ret = !cpumask_empty(non_nocb_cpus); | ||
2287 | free_cpumask_var(non_nocb_cpus); | ||
2288 | return ret; | ||
2289 | } | ||
2290 | |||
2291 | /* | ||
2292 | * Helper structure for remote registry of RCU callbacks. | ||
2293 | * This is needed for when a no-CBs CPU needs to start a grace period. | ||
2294 | * If it just invokes call_rcu(), the resulting callback will be queued, | ||
2295 | * which can result in deadlock. | ||
2296 | */ | ||
2297 | struct rcu_head_remote { | ||
2298 | struct rcu_head *rhp; | ||
2299 | call_rcu_func_t *crf; | ||
2300 | void (*func)(struct rcu_head *rhp); | ||
2301 | }; | ||
2302 | |||
2303 | /* | ||
2304 | * Register a callback as specified by the rcu_head_remote struct. | ||
2305 | * This function is intended to be invoked via smp_call_function_single(). | ||
2306 | */ | ||
2307 | static void call_rcu_local(void *arg) | ||
2308 | { | ||
2309 | struct rcu_head_remote *rhrp = | ||
2310 | container_of(arg, struct rcu_head_remote, rhp); | ||
2311 | |||
2312 | rhrp->crf(rhrp->rhp, rhrp->func); | ||
2313 | } | ||
2314 | |||
2315 | /* | ||
2316 | * Set up an rcu_head_remote structure and then invoke call_rcu_local() | ||
2317 | * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via | ||
2318 | * smp_call_function_single(). | ||
2319 | */ | ||
2320 | static void invoke_crf_remote(struct rcu_head *rhp, | ||
2321 | void (*func)(struct rcu_head *rhp), | ||
2322 | call_rcu_func_t crf) | ||
2323 | { | ||
2324 | struct rcu_head_remote rhr; | ||
2325 | |||
2326 | rhr.rhp = rhp; | ||
2327 | rhr.crf = crf; | ||
2328 | rhr.func = func; | ||
2329 | smp_call_function_single(0, call_rcu_local, &rhr, 1); | ||
2330 | } | ||
2331 | |||
2332 | /* | ||
2333 | * Helper functions to be passed to wait_rcu_gp(), each of which | ||
2334 | * invokes invoke_crf_remote() to register a callback appropriately. | ||
2335 | */ | ||
2336 | static void __maybe_unused | ||
2337 | call_rcu_preempt_remote(struct rcu_head *rhp, | ||
2338 | void (*func)(struct rcu_head *rhp)) | ||
2339 | { | ||
2340 | invoke_crf_remote(rhp, func, call_rcu); | ||
2341 | } | ||
2342 | static void call_rcu_bh_remote(struct rcu_head *rhp, | ||
2343 | void (*func)(struct rcu_head *rhp)) | ||
2344 | { | ||
2345 | invoke_crf_remote(rhp, func, call_rcu_bh); | ||
2346 | } | ||
2347 | static void call_rcu_sched_remote(struct rcu_head *rhp, | ||
2348 | void (*func)(struct rcu_head *rhp)) | ||
2349 | { | ||
2350 | invoke_crf_remote(rhp, func, call_rcu_sched); | ||
2351 | } | ||
2352 | |||
2353 | /* | ||
2354 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | ||
2355 | * callbacks queued by the corresponding no-CBs CPU. | ||
2356 | */ | ||
2357 | static int rcu_nocb_kthread(void *arg) | ||
2358 | { | ||
2359 | int c, cl; | ||
2360 | struct rcu_head *list; | ||
2361 | struct rcu_head *next; | ||
2362 | struct rcu_head **tail; | ||
2363 | struct rcu_data *rdp = arg; | ||
2364 | |||
2365 | /* Each pass through this loop invokes one batch of callbacks */ | ||
2366 | for (;;) { | ||
2367 | /* If not polling, wait for next batch of callbacks. */ | ||
2368 | if (!rcu_nocb_poll) | ||
2369 | wait_event(rdp->nocb_wq, rdp->nocb_head); | ||
2370 | list = ACCESS_ONCE(rdp->nocb_head); | ||
2371 | if (!list) { | ||
2372 | schedule_timeout_interruptible(1); | ||
2373 | continue; | ||
2374 | } | ||
2375 | |||
2376 | /* | ||
2377 | * Extract queued callbacks, update counts, and wait | ||
2378 | * for a grace period to elapse. | ||
2379 | */ | ||
2380 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2381 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2382 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2383 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2384 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
2385 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
2386 | wait_rcu_gp(rdp->rsp->call_remote); | ||
2387 | |||
2388 | /* Each pass through the following loop invokes a callback. */ | ||
2389 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | ||
2390 | c = cl = 0; | ||
2391 | while (list) { | ||
2392 | next = list->next; | ||
2393 | /* Wait for enqueuing to complete, if needed. */ | ||
2394 | while (next == NULL && &list->next != tail) { | ||
2395 | schedule_timeout_interruptible(1); | ||
2396 | next = list->next; | ||
2397 | } | ||
2398 | debug_rcu_head_unqueue(list); | ||
2399 | local_bh_disable(); | ||
2400 | if (__rcu_reclaim(rdp->rsp->name, list)) | ||
2401 | cl++; | ||
2402 | c++; | ||
2403 | local_bh_enable(); | ||
2404 | list = next; | ||
2405 | } | ||
2406 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | ||
2407 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | ||
2408 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | ||
2409 | rdp->n_nocbs_invoked += c; | ||
2410 | } | ||
2411 | return 0; | ||
2412 | } | ||
2413 | |||
2414 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | ||
2415 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2416 | { | ||
2417 | rdp->nocb_tail = &rdp->nocb_head; | ||
2418 | init_waitqueue_head(&rdp->nocb_wq); | ||
2419 | } | ||
2420 | |||
2421 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | ||
2422 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2423 | { | ||
2424 | int cpu; | ||
2425 | struct rcu_data *rdp; | ||
2426 | struct task_struct *t; | ||
2427 | |||
2428 | if (rcu_nocb_mask == NULL) | ||
2429 | return; | ||
2430 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
2431 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2432 | t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); | ||
2433 | BUG_ON(IS_ERR(t)); | ||
2434 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
2435 | } | ||
2436 | } | ||
2437 | |||
2438 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
2439 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2440 | { | ||
2441 | if (rcu_nocb_mask == NULL || | ||
2442 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
2443 | return; | ||
2444 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2445 | } | ||
2446 | |||
2447 | /* Initialize the ->call_remote fields in the rcu_state structures. */ | ||
2448 | static void __init rcu_init_nocb(void) | ||
2449 | { | ||
2450 | #ifdef CONFIG_PREEMPT_RCU | ||
2451 | rcu_preempt_state.call_remote = call_rcu_preempt_remote; | ||
2452 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2453 | rcu_bh_state.call_remote = call_rcu_bh_remote; | ||
2454 | rcu_sched_state.call_remote = call_rcu_sched_remote; | ||
2455 | } | ||
2456 | |||
2457 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2458 | |||
2459 | static bool is_nocb_cpu(int cpu) | ||
2460 | { | ||
2461 | return false; | ||
2462 | } | ||
2463 | |||
2464 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2465 | bool lazy) | ||
2466 | { | ||
2467 | return 0; | ||
2468 | } | ||
2469 | |||
2470 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2471 | struct rcu_data *rdp) | ||
2472 | { | ||
2473 | return 0; | ||
2474 | } | ||
2475 | |||
2476 | static bool nocb_cpu_expendable(int cpu) | ||
2477 | { | ||
2478 | return 1; | ||
2479 | } | ||
2480 | |||
2481 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2482 | { | ||
2483 | } | ||
2484 | |||
2485 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2486 | { | ||
2487 | } | ||
2488 | |||
2489 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2490 | { | ||
2491 | } | ||
2492 | |||
2493 | static void __init rcu_init_nocb(void) | ||
2494 | { | ||
2495 | } | ||
2496 | |||
2497 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 693513bc50e6..0d095dcaa670 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,29 +46,58 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | static int show_rcubarrier(struct seq_file *m, void *unused) | 49 | #define ulong2long(a) (*(long *)(&(a))) |
50 | |||
51 | static int r_open(struct inode *inode, struct file *file, | ||
52 | const struct seq_operations *op) | ||
50 | { | 53 | { |
51 | struct rcu_state *rsp; | 54 | int ret = seq_open(file, op); |
55 | if (!ret) { | ||
56 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
57 | m->private = inode->i_private; | ||
58 | } | ||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
63 | { | ||
64 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
65 | *pos = cpumask_next(*pos - 1, cpu_possible_mask); | ||
66 | if ((*pos) < nr_cpu_ids) | ||
67 | return per_cpu_ptr(rsp->rda, *pos); | ||
68 | return NULL; | ||
69 | } | ||
52 | 70 | ||
53 | for_each_rcu_flavor(rsp) | 71 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
54 | seq_printf(m, "%s: bcc: %d nbd: %lu\n", | 72 | { |
55 | rsp->name, | 73 | (*pos)++; |
56 | atomic_read(&rsp->barrier_cpu_count), | 74 | return r_start(m, pos); |
57 | rsp->n_barrier_done); | 75 | } |
76 | |||
77 | static void r_stop(struct seq_file *m, void *v) | ||
78 | { | ||
79 | } | ||
80 | |||
81 | static int show_rcubarrier(struct seq_file *m, void *v) | ||
82 | { | ||
83 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
84 | seq_printf(m, "bcc: %d nbd: %lu\n", | ||
85 | atomic_read(&rsp->barrier_cpu_count), | ||
86 | rsp->n_barrier_done); | ||
58 | return 0; | 87 | return 0; |
59 | } | 88 | } |
60 | 89 | ||
61 | static int rcubarrier_open(struct inode *inode, struct file *file) | 90 | static int rcubarrier_open(struct inode *inode, struct file *file) |
62 | { | 91 | { |
63 | return single_open(file, show_rcubarrier, NULL); | 92 | return single_open(file, show_rcubarrier, inode->i_private); |
64 | } | 93 | } |
65 | 94 | ||
66 | static const struct file_operations rcubarrier_fops = { | 95 | static const struct file_operations rcubarrier_fops = { |
67 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
68 | .open = rcubarrier_open, | 97 | .open = rcubarrier_open, |
69 | .read = seq_read, | 98 | .read = seq_read, |
70 | .llseek = seq_lseek, | 99 | .llseek = no_llseek, |
71 | .release = single_release, | 100 | .release = seq_release, |
72 | }; | 101 | }; |
73 | 102 | ||
74 | #ifdef CONFIG_RCU_BOOST | 103 | #ifdef CONFIG_RCU_BOOST |
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status) | |||
84 | 113 | ||
85 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 114 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
86 | { | 115 | { |
116 | long ql, qll; | ||
117 | |||
87 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
88 | return; | 119 | return; |
89 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", |
90 | rdp->cpu, | 121 | rdp->cpu, |
91 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
92 | rdp->completed, rdp->gpnum, | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
93 | rdp->passed_quiesce, rdp->qs_pending); | 124 | rdp->passed_quiesce, rdp->qs_pending); |
94 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 125 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
95 | atomic_read(&rdp->dynticks->dynticks), | 126 | atomic_read(&rdp->dynticks->dynticks), |
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
97 | rdp->dynticks->dynticks_nmi_nesting, | 128 | rdp->dynticks->dynticks_nmi_nesting, |
98 | rdp->dynticks_fqs); | 129 | rdp->dynticks_fqs); |
99 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 130 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
131 | rcu_nocb_q_lengths(rdp, &ql, &qll); | ||
132 | qll += rdp->qlen_lazy; | ||
133 | ql += rdp->qlen; | ||
100 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 134 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
101 | rdp->qlen_lazy, rdp->qlen, | 135 | qll, ql, |
102 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 136 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
103 | rdp->nxttail[RCU_NEXT_TAIL]], | 137 | rdp->nxttail[RCU_NEXT_TAIL]], |
104 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 138 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != |
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
114 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | 148 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); |
115 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 149 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
116 | seq_printf(m, " b=%ld", rdp->blimit); | 150 | seq_printf(m, " b=%ld", rdp->blimit); |
117 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 151 | seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", |
118 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 152 | rdp->n_cbs_invoked, rdp->n_nocbs_invoked, |
153 | rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
119 | } | 154 | } |
120 | 155 | ||
121 | static int show_rcudata(struct seq_file *m, void *unused) | 156 | static int show_rcudata(struct seq_file *m, void *v) |
122 | { | 157 | { |
123 | int cpu; | 158 | print_one_rcu_data(m, (struct rcu_data *)v); |
124 | struct rcu_state *rsp; | ||
125 | |||
126 | for_each_rcu_flavor(rsp) { | ||
127 | seq_printf(m, "%s:\n", rsp->name); | ||
128 | for_each_possible_cpu(cpu) | ||
129 | print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); | ||
130 | } | ||
131 | return 0; | 159 | return 0; |
132 | } | 160 | } |
133 | 161 | ||
162 | static const struct seq_operations rcudate_op = { | ||
163 | .start = r_start, | ||
164 | .next = r_next, | ||
165 | .stop = r_stop, | ||
166 | .show = show_rcudata, | ||
167 | }; | ||
168 | |||
134 | static int rcudata_open(struct inode *inode, struct file *file) | 169 | static int rcudata_open(struct inode *inode, struct file *file) |
135 | { | 170 | { |
136 | return single_open(file, show_rcudata, NULL); | 171 | return r_open(inode, file, &rcudate_op); |
137 | } | 172 | } |
138 | 173 | ||
139 | static const struct file_operations rcudata_fops = { | 174 | static const struct file_operations rcudata_fops = { |
140 | .owner = THIS_MODULE, | 175 | .owner = THIS_MODULE, |
141 | .open = rcudata_open, | 176 | .open = rcudata_open, |
142 | .read = seq_read, | 177 | .read = seq_read, |
143 | .llseek = seq_lseek, | 178 | .llseek = no_llseek, |
144 | .release = single_release, | 179 | .release = seq_release, |
145 | }; | 180 | }; |
146 | 181 | ||
147 | static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | 182 | static int show_rcuexp(struct seq_file *m, void *v) |
148 | { | ||
149 | if (!rdp->beenonline) | ||
150 | return; | ||
151 | seq_printf(m, "%d,%s,%lu,%lu,%d,%d", | ||
152 | rdp->cpu, | ||
153 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | ||
154 | rdp->completed, rdp->gpnum, | ||
155 | rdp->passed_quiesce, rdp->qs_pending); | ||
156 | seq_printf(m, ",%d,%llx,%d,%lu", | ||
157 | atomic_read(&rdp->dynticks->dynticks), | ||
158 | rdp->dynticks->dynticks_nesting, | ||
159 | rdp->dynticks->dynticks_nmi_nesting, | ||
160 | rdp->dynticks_fqs); | ||
161 | seq_printf(m, ",%lu", rdp->offline_fqs); | ||
162 | seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, | ||
163 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
164 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
165 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
166 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
167 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
168 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
169 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
170 | #ifdef CONFIG_RCU_BOOST | ||
171 | seq_printf(m, ",%d,\"%c\"", | ||
172 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
173 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
174 | rdp->cpu))); | ||
175 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
176 | seq_printf(m, ",%ld", rdp->blimit); | ||
177 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
178 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
179 | } | ||
180 | |||
181 | static int show_rcudata_csv(struct seq_file *m, void *unused) | ||
182 | { | 183 | { |
183 | int cpu; | 184 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
184 | struct rcu_state *rsp; | 185 | |
185 | 186 | seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", | |
186 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); | 187 | atomic_long_read(&rsp->expedited_start), |
187 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 188 | atomic_long_read(&rsp->expedited_done), |
188 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | 189 | atomic_long_read(&rsp->expedited_wrap), |
189 | #ifdef CONFIG_RCU_BOOST | 190 | atomic_long_read(&rsp->expedited_tryfail), |
190 | seq_puts(m, "\"kt\",\"ktl\""); | 191 | atomic_long_read(&rsp->expedited_workdone1), |
191 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 192 | atomic_long_read(&rsp->expedited_workdone2), |
192 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | 193 | atomic_long_read(&rsp->expedited_normal), |
193 | for_each_rcu_flavor(rsp) { | 194 | atomic_long_read(&rsp->expedited_stoppedcpus), |
194 | seq_printf(m, "\"%s:\"\n", rsp->name); | 195 | atomic_long_read(&rsp->expedited_done_tries), |
195 | for_each_possible_cpu(cpu) | 196 | atomic_long_read(&rsp->expedited_done_lost), |
196 | print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); | 197 | atomic_long_read(&rsp->expedited_done_exit)); |
197 | } | ||
198 | return 0; | 198 | return 0; |
199 | } | 199 | } |
200 | 200 | ||
201 | static int rcudata_csv_open(struct inode *inode, struct file *file) | 201 | static int rcuexp_open(struct inode *inode, struct file *file) |
202 | { | 202 | { |
203 | return single_open(file, show_rcudata_csv, NULL); | 203 | return single_open(file, show_rcuexp, inode->i_private); |
204 | } | 204 | } |
205 | 205 | ||
206 | static const struct file_operations rcudata_csv_fops = { | 206 | static const struct file_operations rcuexp_fops = { |
207 | .owner = THIS_MODULE, | 207 | .owner = THIS_MODULE, |
208 | .open = rcudata_csv_open, | 208 | .open = rcuexp_open, |
209 | .read = seq_read, | 209 | .read = seq_read, |
210 | .llseek = seq_lseek, | 210 | .llseek = no_llseek, |
211 | .release = single_release, | 211 | .release = seq_release, |
212 | }; | 212 | }; |
213 | 213 | ||
214 | #ifdef CONFIG_RCU_BOOST | 214 | #ifdef CONFIG_RCU_BOOST |
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = { | |||
254 | .owner = THIS_MODULE, | 254 | .owner = THIS_MODULE, |
255 | .open = rcu_node_boost_open, | 255 | .open = rcu_node_boost_open, |
256 | .read = seq_read, | 256 | .read = seq_read, |
257 | .llseek = seq_lseek, | 257 | .llseek = no_llseek, |
258 | .release = single_release, | 258 | .release = single_release, |
259 | }; | 259 | }; |
260 | 260 | ||
261 | /* | 261 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
262 | * Create the rcuboost debugfs entry. Standard error return. | ||
263 | */ | ||
264 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
265 | { | ||
266 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
267 | &rcu_node_boost_fops); | ||
268 | } | ||
269 | |||
270 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
271 | |||
272 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
273 | { | ||
274 | return 0; /* There cannot be an error if we didn't create it! */ | ||
275 | } | ||
276 | |||
277 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
278 | 262 | ||
279 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 263 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
280 | { | 264 | { |
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
283 | struct rcu_node *rnp; | 267 | struct rcu_node *rnp; |
284 | 268 | ||
285 | gpnum = rsp->gpnum; | 269 | gpnum = rsp->gpnum; |
286 | seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", | 270 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", |
287 | rsp->name, rsp->completed, gpnum, rsp->fqs_state, | 271 | ulong2long(rsp->completed), ulong2long(gpnum), |
272 | rsp->fqs_state, | ||
288 | (long)(rsp->jiffies_force_qs - jiffies), | 273 | (long)(rsp->jiffies_force_qs - jiffies), |
289 | (int)(jiffies & 0xffff)); | 274 | (int)(jiffies & 0xffff)); |
290 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 275 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
306 | seq_puts(m, "\n"); | 291 | seq_puts(m, "\n"); |
307 | } | 292 | } |
308 | 293 | ||
309 | static int show_rcuhier(struct seq_file *m, void *unused) | 294 | static int show_rcuhier(struct seq_file *m, void *v) |
310 | { | 295 | { |
311 | struct rcu_state *rsp; | 296 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
312 | 297 | print_one_rcu_state(m, rsp); | |
313 | for_each_rcu_flavor(rsp) | ||
314 | print_one_rcu_state(m, rsp); | ||
315 | return 0; | 298 | return 0; |
316 | } | 299 | } |
317 | 300 | ||
318 | static int rcuhier_open(struct inode *inode, struct file *file) | 301 | static int rcuhier_open(struct inode *inode, struct file *file) |
319 | { | 302 | { |
320 | return single_open(file, show_rcuhier, NULL); | 303 | return single_open(file, show_rcuhier, inode->i_private); |
321 | } | 304 | } |
322 | 305 | ||
323 | static const struct file_operations rcuhier_fops = { | 306 | static const struct file_operations rcuhier_fops = { |
324 | .owner = THIS_MODULE, | 307 | .owner = THIS_MODULE, |
325 | .open = rcuhier_open, | 308 | .open = rcuhier_open, |
326 | .read = seq_read, | 309 | .read = seq_read, |
327 | .llseek = seq_lseek, | 310 | .llseek = no_llseek, |
328 | .release = single_release, | 311 | .release = seq_release, |
329 | }; | 312 | }; |
330 | 313 | ||
331 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | 314 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) |
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
338 | struct rcu_node *rnp = &rsp->node[0]; | 321 | struct rcu_node *rnp = &rsp->node[0]; |
339 | 322 | ||
340 | raw_spin_lock_irqsave(&rnp->lock, flags); | 323 | raw_spin_lock_irqsave(&rnp->lock, flags); |
341 | completed = rsp->completed; | 324 | completed = ACCESS_ONCE(rsp->completed); |
342 | gpnum = rsp->gpnum; | 325 | gpnum = ACCESS_ONCE(rsp->gpnum); |
343 | if (rsp->completed == rsp->gpnum) | 326 | if (completed == gpnum) |
344 | gpage = 0; | 327 | gpage = 0; |
345 | else | 328 | else |
346 | gpage = jiffies - rsp->gp_start; | 329 | gpage = jiffies - rsp->gp_start; |
347 | gpmax = rsp->gp_max; | 330 | gpmax = rsp->gp_max; |
348 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 331 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
349 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | 332 | seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", |
350 | rsp->name, completed, gpnum, gpage, gpmax); | 333 | ulong2long(completed), ulong2long(gpnum), gpage, gpmax); |
351 | } | 334 | } |
352 | 335 | ||
353 | static int show_rcugp(struct seq_file *m, void *unused) | 336 | static int show_rcugp(struct seq_file *m, void *v) |
354 | { | 337 | { |
355 | struct rcu_state *rsp; | 338 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
356 | 339 | show_one_rcugp(m, rsp); | |
357 | for_each_rcu_flavor(rsp) | ||
358 | show_one_rcugp(m, rsp); | ||
359 | return 0; | 340 | return 0; |
360 | } | 341 | } |
361 | 342 | ||
362 | static int rcugp_open(struct inode *inode, struct file *file) | 343 | static int rcugp_open(struct inode *inode, struct file *file) |
363 | { | 344 | { |
364 | return single_open(file, show_rcugp, NULL); | 345 | return single_open(file, show_rcugp, inode->i_private); |
365 | } | 346 | } |
366 | 347 | ||
367 | static const struct file_operations rcugp_fops = { | 348 | static const struct file_operations rcugp_fops = { |
368 | .owner = THIS_MODULE, | 349 | .owner = THIS_MODULE, |
369 | .open = rcugp_open, | 350 | .open = rcugp_open, |
370 | .read = seq_read, | 351 | .read = seq_read, |
371 | .llseek = seq_lseek, | 352 | .llseek = no_llseek, |
372 | .release = single_release, | 353 | .release = seq_release, |
373 | }; | 354 | }; |
374 | 355 | ||
375 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 356 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
376 | { | 357 | { |
358 | if (!rdp->beenonline) | ||
359 | return; | ||
377 | seq_printf(m, "%3d%cnp=%ld ", | 360 | seq_printf(m, "%3d%cnp=%ld ", |
378 | rdp->cpu, | 361 | rdp->cpu, |
379 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 362 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
389 | rdp->n_rp_need_nothing); | 372 | rdp->n_rp_need_nothing); |
390 | } | 373 | } |
391 | 374 | ||
392 | static int show_rcu_pending(struct seq_file *m, void *unused) | 375 | static int show_rcu_pending(struct seq_file *m, void *v) |
393 | { | 376 | { |
394 | int cpu; | 377 | print_one_rcu_pending(m, (struct rcu_data *)v); |
395 | struct rcu_data *rdp; | ||
396 | struct rcu_state *rsp; | ||
397 | |||
398 | for_each_rcu_flavor(rsp) { | ||
399 | seq_printf(m, "%s:\n", rsp->name); | ||
400 | for_each_possible_cpu(cpu) { | ||
401 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
402 | if (rdp->beenonline) | ||
403 | print_one_rcu_pending(m, rdp); | ||
404 | } | ||
405 | } | ||
406 | return 0; | 378 | return 0; |
407 | } | 379 | } |
408 | 380 | ||
381 | static const struct seq_operations rcu_pending_op = { | ||
382 | .start = r_start, | ||
383 | .next = r_next, | ||
384 | .stop = r_stop, | ||
385 | .show = show_rcu_pending, | ||
386 | }; | ||
387 | |||
409 | static int rcu_pending_open(struct inode *inode, struct file *file) | 388 | static int rcu_pending_open(struct inode *inode, struct file *file) |
410 | { | 389 | { |
411 | return single_open(file, show_rcu_pending, NULL); | 390 | return r_open(inode, file, &rcu_pending_op); |
412 | } | 391 | } |
413 | 392 | ||
414 | static const struct file_operations rcu_pending_fops = { | 393 | static const struct file_operations rcu_pending_fops = { |
415 | .owner = THIS_MODULE, | 394 | .owner = THIS_MODULE, |
416 | .open = rcu_pending_open, | 395 | .open = rcu_pending_open, |
417 | .read = seq_read, | 396 | .read = seq_read, |
418 | .llseek = seq_lseek, | 397 | .llseek = no_llseek, |
419 | .release = single_release, | 398 | .release = seq_release, |
420 | }; | 399 | }; |
421 | 400 | ||
422 | static int show_rcutorture(struct seq_file *m, void *unused) | 401 | static int show_rcutorture(struct seq_file *m, void *unused) |
@@ -446,43 +425,58 @@ static struct dentry *rcudir; | |||
446 | 425 | ||
447 | static int __init rcutree_trace_init(void) | 426 | static int __init rcutree_trace_init(void) |
448 | { | 427 | { |
428 | struct rcu_state *rsp; | ||
449 | struct dentry *retval; | 429 | struct dentry *retval; |
430 | struct dentry *rspdir; | ||
450 | 431 | ||
451 | rcudir = debugfs_create_dir("rcu", NULL); | 432 | rcudir = debugfs_create_dir("rcu", NULL); |
452 | if (!rcudir) | 433 | if (!rcudir) |
453 | goto free_out; | 434 | goto free_out; |
454 | 435 | ||
455 | retval = debugfs_create_file("rcubarrier", 0444, rcudir, | 436 | for_each_rcu_flavor(rsp) { |
456 | NULL, &rcubarrier_fops); | 437 | rspdir = debugfs_create_dir(rsp->name, rcudir); |
457 | if (!retval) | 438 | if (!rspdir) |
458 | goto free_out; | 439 | goto free_out; |
459 | 440 | ||
460 | retval = debugfs_create_file("rcudata", 0444, rcudir, | 441 | retval = debugfs_create_file("rcudata", 0444, |
461 | NULL, &rcudata_fops); | 442 | rspdir, rsp, &rcudata_fops); |
462 | if (!retval) | 443 | if (!retval) |
463 | goto free_out; | 444 | goto free_out; |
464 | 445 | ||
465 | retval = debugfs_create_file("rcudata.csv", 0444, rcudir, | 446 | retval = debugfs_create_file("rcuexp", 0444, |
466 | NULL, &rcudata_csv_fops); | 447 | rspdir, rsp, &rcuexp_fops); |
467 | if (!retval) | 448 | if (!retval) |
468 | goto free_out; | 449 | goto free_out; |
469 | 450 | ||
470 | if (rcu_boost_trace_create_file(rcudir)) | 451 | retval = debugfs_create_file("rcu_pending", 0444, |
471 | goto free_out; | 452 | rspdir, rsp, &rcu_pending_fops); |
453 | if (!retval) | ||
454 | goto free_out; | ||
455 | |||
456 | retval = debugfs_create_file("rcubarrier", 0444, | ||
457 | rspdir, rsp, &rcubarrier_fops); | ||
458 | if (!retval) | ||
459 | goto free_out; | ||
472 | 460 | ||
473 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 461 | #ifdef CONFIG_RCU_BOOST |
474 | if (!retval) | 462 | if (rsp == &rcu_preempt_state) { |
475 | goto free_out; | 463 | retval = debugfs_create_file("rcuboost", 0444, |
464 | rspdir, NULL, &rcu_node_boost_fops); | ||
465 | if (!retval) | ||
466 | goto free_out; | ||
467 | } | ||
468 | #endif | ||
476 | 469 | ||
477 | retval = debugfs_create_file("rcuhier", 0444, rcudir, | 470 | retval = debugfs_create_file("rcugp", 0444, |
478 | NULL, &rcuhier_fops); | 471 | rspdir, rsp, &rcugp_fops); |
479 | if (!retval) | 472 | if (!retval) |
480 | goto free_out; | 473 | goto free_out; |
481 | 474 | ||
482 | retval = debugfs_create_file("rcu_pending", 0444, rcudir, | 475 | retval = debugfs_create_file("rcuhier", 0444, |
483 | NULL, &rcu_pending_fops); | 476 | rspdir, rsp, &rcuhier_fops); |
484 | if (!retval) | 477 | if (!retval) |
485 | goto free_out; | 478 | goto free_out; |
479 | } | ||
486 | 480 | ||
487 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | 481 | retval = debugfs_create_file("rcutorture", 0444, rcudir, |
488 | NULL, &rcutorture_fops); | 482 | NULL, &rcutorture_fops); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ad581aa2369a..ff55247e7049 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | |||
86 | return __res_counter_charge(counter, val, limit_fail_at, true); | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
87 | } | 87 | } |
88 | 88 | ||
89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
90 | { | 90 | { |
91 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
92 | val = counter->usage; | 92 | val = counter->usage; |
93 | 93 | ||
94 | counter->usage -= val; | 94 | counter->usage -= val; |
95 | return counter->usage; | ||
95 | } | 96 | } |
96 | 97 | ||
97 | void res_counter_uncharge_until(struct res_counter *counter, | 98 | u64 res_counter_uncharge_until(struct res_counter *counter, |
98 | struct res_counter *top, | 99 | struct res_counter *top, |
99 | unsigned long val) | 100 | unsigned long val) |
100 | { | 101 | { |
101 | unsigned long flags; | 102 | unsigned long flags; |
102 | struct res_counter *c; | 103 | struct res_counter *c; |
104 | u64 ret = 0; | ||
103 | 105 | ||
104 | local_irq_save(flags); | 106 | local_irq_save(flags); |
105 | for (c = counter; c != top; c = c->parent) { | 107 | for (c = counter; c != top; c = c->parent) { |
108 | u64 r; | ||
106 | spin_lock(&c->lock); | 109 | spin_lock(&c->lock); |
107 | res_counter_uncharge_locked(c, val); | 110 | r = res_counter_uncharge_locked(c, val); |
111 | if (c == counter) | ||
112 | ret = r; | ||
108 | spin_unlock(&c->lock); | 113 | spin_unlock(&c->lock); |
109 | } | 114 | } |
110 | local_irq_restore(flags); | 115 | local_irq_restore(flags); |
116 | return ret; | ||
111 | } | 117 | } |
112 | 118 | ||
113 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 119 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) |
114 | { | 120 | { |
115 | res_counter_uncharge_until(counter, NULL, val); | 121 | return res_counter_uncharge_until(counter, NULL, val); |
116 | } | 122 | } |
117 | 123 | ||
118 | static inline unsigned long long * | 124 | static inline unsigned long long * |
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
192 | *res = PAGE_ALIGN(*res); | 198 | *res = PAGE_ALIGN(*res); |
193 | return 0; | 199 | return 0; |
194 | } | 200 | } |
195 | |||
196 | int res_counter_write(struct res_counter *counter, int member, | ||
197 | const char *buf, write_strategy_fn write_strategy) | ||
198 | { | ||
199 | char *end; | ||
200 | unsigned long flags; | ||
201 | unsigned long long tmp, *val; | ||
202 | |||
203 | if (write_strategy) { | ||
204 | if (write_strategy(buf, &tmp)) | ||
205 | return -EINVAL; | ||
206 | } else { | ||
207 | tmp = simple_strtoull(buf, &end, 10); | ||
208 | if (*end != '\0') | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | spin_lock_irqsave(&counter->lock, flags); | ||
212 | val = res_counter_member(counter, member); | ||
213 | *val = tmp; | ||
214 | spin_unlock_irqrestore(&counter->lock, flags); | ||
215 | return 0; | ||
216 | } | ||
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 15f60d01198b..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
143 | 143 | ||
144 | p->signal->autogroup = autogroup_kref_get(ag); | 144 | p->signal->autogroup = autogroup_kref_get(ag); |
145 | 145 | ||
146 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | ||
147 | goto out; | ||
148 | |||
146 | t = p; | 149 | t = p; |
147 | do { | 150 | do { |
148 | sched_move_task(t); | 151 | sched_move_task(t); |
149 | } while_each_thread(p, t); | 152 | } while_each_thread(p, t); |
150 | 153 | ||
154 | out: | ||
151 | unlock_task_sighand(p, &flags); | 155 | unlock_task_sighand(p, &flags); |
152 | autogroup_kref_put(prev); | 156 | autogroup_kref_put(prev); |
153 | } | 157 | } |
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 443232ebbb53..8bd047142816 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h | |||
@@ -4,6 +4,11 @@ | |||
4 | #include <linux/rwsem.h> | 4 | #include <linux/rwsem.h> |
5 | 5 | ||
6 | struct autogroup { | 6 | struct autogroup { |
7 | /* | ||
8 | * reference doesn't mean how many thread attach to this | ||
9 | * autogroup now. It just stands for the number of task | ||
10 | * could use this autogroup. | ||
11 | */ | ||
7 | struct kref kref; | 12 | struct kref kref; |
8 | struct task_group *tg; | 13 | struct task_group *tg; |
9 | struct rw_semaphore lock; | 14 | struct rw_semaphore lock; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..257002c13bb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -72,6 +72,7 @@ | |||
72 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
74 | #include <linux/binfmts.h> | 74 | #include <linux/binfmts.h> |
75 | #include <linux/context_tracking.h> | ||
75 | 76 | ||
76 | #include <asm/switch_to.h> | 77 | #include <asm/switch_to.h> |
77 | #include <asm/tlb.h> | 78 | #include <asm/tlb.h> |
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { }; | |||
192 | static void sched_feat_enable(int i) { }; | 193 | static void sched_feat_enable(int i) { }; |
193 | #endif /* HAVE_JUMP_LABEL */ | 194 | #endif /* HAVE_JUMP_LABEL */ |
194 | 195 | ||
195 | static ssize_t | 196 | static int sched_feat_set(char *cmp) |
196 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
197 | size_t cnt, loff_t *ppos) | ||
198 | { | 197 | { |
199 | char buf[64]; | ||
200 | char *cmp; | ||
201 | int neg = 0; | ||
202 | int i; | 198 | int i; |
203 | 199 | int neg = 0; | |
204 | if (cnt > 63) | ||
205 | cnt = 63; | ||
206 | |||
207 | if (copy_from_user(&buf, ubuf, cnt)) | ||
208 | return -EFAULT; | ||
209 | |||
210 | buf[cnt] = 0; | ||
211 | cmp = strstrip(buf); | ||
212 | 200 | ||
213 | if (strncmp(cmp, "NO_", 3) == 0) { | 201 | if (strncmp(cmp, "NO_", 3) == 0) { |
214 | neg = 1; | 202 | neg = 1; |
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
228 | } | 216 | } |
229 | } | 217 | } |
230 | 218 | ||
219 | return i; | ||
220 | } | ||
221 | |||
222 | static ssize_t | ||
223 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
224 | size_t cnt, loff_t *ppos) | ||
225 | { | ||
226 | char buf[64]; | ||
227 | char *cmp; | ||
228 | int i; | ||
229 | |||
230 | if (cnt > 63) | ||
231 | cnt = 63; | ||
232 | |||
233 | if (copy_from_user(&buf, ubuf, cnt)) | ||
234 | return -EFAULT; | ||
235 | |||
236 | buf[cnt] = 0; | ||
237 | cmp = strstrip(buf); | ||
238 | |||
239 | i = sched_feat_set(cmp); | ||
231 | if (i == __SCHED_FEAT_NR) | 240 | if (i == __SCHED_FEAT_NR) |
232 | return -EINVAL; | 241 | return -EINVAL; |
233 | 242 | ||
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
922 | rq->skip_clock_update = 1; | 931 | rq->skip_clock_update = 1; |
923 | } | 932 | } |
924 | 933 | ||
934 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
935 | |||
936 | void register_task_migration_notifier(struct notifier_block *n) | ||
937 | { | ||
938 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
939 | } | ||
940 | |||
925 | #ifdef CONFIG_SMP | 941 | #ifdef CONFIG_SMP |
926 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 942 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
927 | { | 943 | { |
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
952 | trace_sched_migrate_task(p, new_cpu); | 968 | trace_sched_migrate_task(p, new_cpu); |
953 | 969 | ||
954 | if (task_cpu(p) != new_cpu) { | 970 | if (task_cpu(p) != new_cpu) { |
971 | struct task_migration_notifier tmn; | ||
972 | |||
973 | if (p->sched_class->migrate_task_rq) | ||
974 | p->sched_class->migrate_task_rq(p, new_cpu); | ||
955 | p->se.nr_migrations++; | 975 | p->se.nr_migrations++; |
956 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 976 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
977 | |||
978 | tmn.task = p; | ||
979 | tmn.from_cpu = task_cpu(p); | ||
980 | tmn.to_cpu = new_cpu; | ||
981 | |||
982 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
957 | } | 983 | } |
958 | 984 | ||
959 | __set_task_cpu(p, new_cpu); | 985 | __set_task_cpu(p, new_cpu); |
@@ -1524,6 +1550,15 @@ static void __sched_fork(struct task_struct *p) | |||
1524 | p->se.vruntime = 0; | 1550 | p->se.vruntime = 0; |
1525 | INIT_LIST_HEAD(&p->se.group_node); | 1551 | INIT_LIST_HEAD(&p->se.group_node); |
1526 | 1552 | ||
1553 | /* | ||
1554 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
1555 | * removed when useful for applications beyond shares distribution (e.g. | ||
1556 | * load-balance). | ||
1557 | */ | ||
1558 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1559 | p->se.avg.runnable_avg_period = 0; | ||
1560 | p->se.avg.runnable_avg_sum = 0; | ||
1561 | #endif | ||
1527 | #ifdef CONFIG_SCHEDSTATS | 1562 | #ifdef CONFIG_SCHEDSTATS |
1528 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1563 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1529 | #endif | 1564 | #endif |
@@ -1533,7 +1568,40 @@ static void __sched_fork(struct task_struct *p) | |||
1533 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1568 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1534 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1569 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1535 | #endif | 1570 | #endif |
1571 | |||
1572 | #ifdef CONFIG_NUMA_BALANCING | ||
1573 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | ||
1574 | p->mm->numa_next_scan = jiffies; | ||
1575 | p->mm->numa_next_reset = jiffies; | ||
1576 | p->mm->numa_scan_seq = 0; | ||
1577 | } | ||
1578 | |||
1579 | p->node_stamp = 0ULL; | ||
1580 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | ||
1581 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1582 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | ||
1583 | p->numa_work.next = &p->numa_work; | ||
1584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1585 | } | ||
1586 | |||
1587 | #ifdef CONFIG_NUMA_BALANCING | ||
1588 | #ifdef CONFIG_SCHED_DEBUG | ||
1589 | void set_numabalancing_state(bool enabled) | ||
1590 | { | ||
1591 | if (enabled) | ||
1592 | sched_feat_set("NUMA"); | ||
1593 | else | ||
1594 | sched_feat_set("NO_NUMA"); | ||
1536 | } | 1595 | } |
1596 | #else | ||
1597 | __read_mostly bool numabalancing_enabled; | ||
1598 | |||
1599 | void set_numabalancing_state(bool enabled) | ||
1600 | { | ||
1601 | numabalancing_enabled = enabled; | ||
1602 | } | ||
1603 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1604 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1537 | 1605 | ||
1538 | /* | 1606 | /* |
1539 | * fork()/clone()-time setup: | 1607 | * fork()/clone()-time setup: |
@@ -1886,8 +1954,8 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1886 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 1954 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
1887 | #endif | 1955 | #endif |
1888 | 1956 | ||
1957 | context_tracking_task_switch(prev, next); | ||
1889 | /* Here we just switch the register state and the stack. */ | 1958 | /* Here we just switch the register state and the stack. */ |
1890 | rcu_switch(prev, next); | ||
1891 | switch_to(prev, next, prev); | 1959 | switch_to(prev, next, prev); |
1892 | 1960 | ||
1893 | barrier(); | 1961 | barrier(); |
@@ -2911,7 +2979,7 @@ asmlinkage void __sched schedule(void) | |||
2911 | } | 2979 | } |
2912 | EXPORT_SYMBOL(schedule); | 2980 | EXPORT_SYMBOL(schedule); |
2913 | 2981 | ||
2914 | #ifdef CONFIG_RCU_USER_QS | 2982 | #ifdef CONFIG_CONTEXT_TRACKING |
2915 | asmlinkage void __sched schedule_user(void) | 2983 | asmlinkage void __sched schedule_user(void) |
2916 | { | 2984 | { |
2917 | /* | 2985 | /* |
@@ -2920,9 +2988,9 @@ asmlinkage void __sched schedule_user(void) | |||
2920 | * we haven't yet exited the RCU idle mode. Do it here manually until | 2988 | * we haven't yet exited the RCU idle mode. Do it here manually until |
2921 | * we find a better solution. | 2989 | * we find a better solution. |
2922 | */ | 2990 | */ |
2923 | rcu_user_exit(); | 2991 | user_exit(); |
2924 | schedule(); | 2992 | schedule(); |
2925 | rcu_user_enter(); | 2993 | user_enter(); |
2926 | } | 2994 | } |
2927 | #endif | 2995 | #endif |
2928 | 2996 | ||
@@ -3027,7 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3027 | /* Catch callers which need to be fixed */ | 3095 | /* Catch callers which need to be fixed */ |
3028 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3096 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3029 | 3097 | ||
3030 | rcu_user_exit(); | 3098 | user_exit(); |
3031 | do { | 3099 | do { |
3032 | add_preempt_count(PREEMPT_ACTIVE); | 3100 | add_preempt_count(PREEMPT_ACTIVE); |
3033 | local_irq_enable(); | 3101 | local_irq_enable(); |
@@ -4029,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4029 | goto out_free_cpus_allowed; | 4097 | goto out_free_cpus_allowed; |
4030 | } | 4098 | } |
4031 | retval = -EPERM; | 4099 | retval = -EPERM; |
4032 | if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) | 4100 | if (!check_same_owner(p)) { |
4033 | goto out_unlock; | 4101 | rcu_read_lock(); |
4102 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | ||
4103 | rcu_read_unlock(); | ||
4104 | goto out_unlock; | ||
4105 | } | ||
4106 | rcu_read_unlock(); | ||
4107 | } | ||
4034 | 4108 | ||
4035 | retval = security_task_setscheduler(p); | 4109 | retval = security_task_setscheduler(p); |
4036 | if (retval) | 4110 | if (retval) |
@@ -4474,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; | |||
4474 | void sched_show_task(struct task_struct *p) | 4548 | void sched_show_task(struct task_struct *p) |
4475 | { | 4549 | { |
4476 | unsigned long free = 0; | 4550 | unsigned long free = 0; |
4551 | int ppid; | ||
4477 | unsigned state; | 4552 | unsigned state; |
4478 | 4553 | ||
4479 | state = p->state ? __ffs(p->state) + 1 : 0; | 4554 | state = p->state ? __ffs(p->state) + 1 : 0; |
@@ -4493,8 +4568,11 @@ void sched_show_task(struct task_struct *p) | |||
4493 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4568 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4494 | free = stack_not_used(p); | 4569 | free = stack_not_used(p); |
4495 | #endif | 4570 | #endif |
4571 | rcu_read_lock(); | ||
4572 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
4573 | rcu_read_unlock(); | ||
4496 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4574 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
4497 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), | 4575 | task_pid_nr(p), ppid, |
4498 | (unsigned long)task_thread_info(p)->flags); | 4576 | (unsigned long)task_thread_info(p)->flags); |
4499 | 4577 | ||
4500 | show_stack(p, NULL); | 4578 | show_stack(p, NULL); |
@@ -7468,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7468 | struct task_group, css); | 7546 | struct task_group, css); |
7469 | } | 7547 | } |
7470 | 7548 | ||
7471 | static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | 7549 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) |
7472 | { | 7550 | { |
7473 | struct task_group *tg, *parent; | 7551 | struct task_group *tg, *parent; |
7474 | 7552 | ||
@@ -7485,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | |||
7485 | return &tg->css; | 7563 | return &tg->css; |
7486 | } | 7564 | } |
7487 | 7565 | ||
7488 | static void cpu_cgroup_destroy(struct cgroup *cgrp) | 7566 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
7489 | { | 7567 | { |
7490 | struct task_group *tg = cgroup_tg(cgrp); | 7568 | struct task_group *tg = cgroup_tg(cgrp); |
7491 | 7569 | ||
@@ -7845,8 +7923,8 @@ static struct cftype cpu_files[] = { | |||
7845 | 7923 | ||
7846 | struct cgroup_subsys cpu_cgroup_subsys = { | 7924 | struct cgroup_subsys cpu_cgroup_subsys = { |
7847 | .name = "cpu", | 7925 | .name = "cpu", |
7848 | .create = cpu_cgroup_create, | 7926 | .css_alloc = cpu_cgroup_css_alloc, |
7849 | .destroy = cpu_cgroup_destroy, | 7927 | .css_free = cpu_cgroup_css_free, |
7850 | .can_attach = cpu_cgroup_can_attach, | 7928 | .can_attach = cpu_cgroup_can_attach, |
7851 | .attach = cpu_cgroup_attach, | 7929 | .attach = cpu_cgroup_attach, |
7852 | .exit = cpu_cgroup_exit, | 7930 | .exit = cpu_cgroup_exit, |
@@ -7869,7 +7947,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7869 | struct cpuacct root_cpuacct; | 7947 | struct cpuacct root_cpuacct; |
7870 | 7948 | ||
7871 | /* create a new cpu accounting group */ | 7949 | /* create a new cpu accounting group */ |
7872 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7950 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) |
7873 | { | 7951 | { |
7874 | struct cpuacct *ca; | 7952 | struct cpuacct *ca; |
7875 | 7953 | ||
@@ -7899,7 +7977,7 @@ out: | |||
7899 | } | 7977 | } |
7900 | 7978 | ||
7901 | /* destroy an existing cpu accounting group */ | 7979 | /* destroy an existing cpu accounting group */ |
7902 | static void cpuacct_destroy(struct cgroup *cgrp) | 7980 | static void cpuacct_css_free(struct cgroup *cgrp) |
7903 | { | 7981 | { |
7904 | struct cpuacct *ca = cgroup_ca(cgrp); | 7982 | struct cpuacct *ca = cgroup_ca(cgrp); |
7905 | 7983 | ||
@@ -8070,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
8070 | 8148 | ||
8071 | struct cgroup_subsys cpuacct_subsys = { | 8149 | struct cgroup_subsys cpuacct_subsys = { |
8072 | .name = "cpuacct", | 8150 | .name = "cpuacct", |
8073 | .create = cpuacct_create, | 8151 | .css_alloc = cpuacct_css_alloc, |
8074 | .destroy = cpuacct_destroy, | 8152 | .css_free = cpuacct_css_free, |
8075 | .subsys_id = cpuacct_subsys_id, | 8153 | .subsys_id = cpuacct_subsys_id, |
8076 | .base_cftypes = files, | 8154 | .base_cftypes = files, |
8077 | }; | 8155 | }; |
8078 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8156 | #endif /* CONFIG_CGROUP_CPUACCT */ |
8157 | |||
8158 | void dump_cpu_task(int cpu) | ||
8159 | { | ||
8160 | pr_info("Task dump for CPU %d:\n", cpu); | ||
8161 | sched_show_task(cpu_curr(cpu)); | ||
8162 | } | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a6..293b202fcf79 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); | |||
43 | * Called before incrementing preempt_count on {soft,}irq_enter | 43 | * Called before incrementing preempt_count on {soft,}irq_enter |
44 | * and before decrementing preempt_count on {soft,}irq_exit. | 44 | * and before decrementing preempt_count on {soft,}irq_exit. |
45 | */ | 45 | */ |
46 | void vtime_account(struct task_struct *curr) | 46 | void irqtime_account_irq(struct task_struct *curr) |
47 | { | 47 | { |
48 | unsigned long flags; | 48 | unsigned long flags; |
49 | s64 delta; | 49 | s64 delta; |
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) | |||
73 | irq_time_write_end(); | 73 | irq_time_write_end(); |
74 | local_irq_restore(flags); | 74 | local_irq_restore(flags); |
75 | } | 75 | } |
76 | EXPORT_SYMBOL_GPL(vtime_account); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
77 | 77 | ||
78 | static int irqtime_account_hi_update(void) | 78 | static int irqtime_account_hi_update(void) |
79 | { | 79 | { |
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) | |||
288 | return false; | 288 | return false; |
289 | } | 289 | } |
290 | 290 | ||
291 | /* | ||
292 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | ||
293 | * tasks (sum on group iteration) belonging to @tsk's group. | ||
294 | */ | ||
295 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
296 | { | ||
297 | struct signal_struct *sig = tsk->signal; | ||
298 | struct task_struct *t; | ||
299 | |||
300 | times->utime = sig->utime; | ||
301 | times->stime = sig->stime; | ||
302 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
303 | |||
304 | rcu_read_lock(); | ||
305 | /* make sure we can trust tsk->thread_group list */ | ||
306 | if (!likely(pid_alive(tsk))) | ||
307 | goto out; | ||
308 | |||
309 | t = tsk; | ||
310 | do { | ||
311 | times->utime += t->utime; | ||
312 | times->stime += t->stime; | ||
313 | times->sum_exec_runtime += task_sched_runtime(t); | ||
314 | } while_each_thread(tsk, t); | ||
315 | out: | ||
316 | rcu_read_unlock(); | ||
317 | } | ||
318 | |||
291 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 319 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
292 | 320 | ||
293 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 321 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) | |||
417 | * Use precise platform statistics if available: | 445 | * Use precise platform statistics if available: |
418 | */ | 446 | */ |
419 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 447 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
420 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 448 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
421 | { | 449 | { |
422 | *ut = p->utime; | 450 | *ut = p->utime; |
423 | *st = p->stime; | 451 | *st = p->stime; |
424 | } | 452 | } |
425 | 453 | ||
426 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 454 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
427 | { | 455 | { |
428 | struct task_cputime cputime; | 456 | struct task_cputime cputime; |
429 | 457 | ||
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
433 | *st = cputime.stime; | 461 | *st = cputime.stime; |
434 | } | 462 | } |
435 | 463 | ||
464 | void vtime_account_system_irqsafe(struct task_struct *tsk) | ||
465 | { | ||
466 | unsigned long flags; | ||
467 | |||
468 | local_irq_save(flags); | ||
469 | vtime_account_system(tsk); | ||
470 | local_irq_restore(flags); | ||
471 | } | ||
472 | EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); | ||
473 | |||
474 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
475 | void vtime_task_switch(struct task_struct *prev) | ||
476 | { | ||
477 | if (is_idle_task(prev)) | ||
478 | vtime_account_idle(prev); | ||
479 | else | ||
480 | vtime_account_system(prev); | ||
481 | |||
482 | vtime_account_user(prev); | ||
483 | arch_vtime_task_switch(prev); | ||
484 | } | ||
485 | #endif | ||
486 | |||
436 | /* | 487 | /* |
437 | * Archs that account the whole time spent in the idle task | 488 | * Archs that account the whole time spent in the idle task |
438 | * (outside irq) as idle time can rely on this and just implement | 489 | * (outside irq) as idle time can rely on this and just implement |
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
444 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 495 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
445 | void vtime_account(struct task_struct *tsk) | 496 | void vtime_account(struct task_struct *tsk) |
446 | { | 497 | { |
447 | unsigned long flags; | ||
448 | |||
449 | local_irq_save(flags); | ||
450 | |||
451 | if (in_interrupt() || !is_idle_task(tsk)) | 498 | if (in_interrupt() || !is_idle_task(tsk)) |
452 | vtime_account_system(tsk); | 499 | vtime_account_system(tsk); |
453 | else | 500 | else |
454 | vtime_account_idle(tsk); | 501 | vtime_account_idle(tsk); |
455 | |||
456 | local_irq_restore(flags); | ||
457 | } | 502 | } |
458 | EXPORT_SYMBOL_GPL(vtime_account); | 503 | EXPORT_SYMBOL_GPL(vtime_account); |
459 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 504 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | |||
478 | return (__force cputime_t) temp; | 523 | return (__force cputime_t) temp; |
479 | } | 524 | } |
480 | 525 | ||
481 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 526 | /* |
527 | * Adjust tick based cputime random precision against scheduler | ||
528 | * runtime accounting. | ||
529 | */ | ||
530 | static void cputime_adjust(struct task_cputime *curr, | ||
531 | struct cputime *prev, | ||
532 | cputime_t *ut, cputime_t *st) | ||
482 | { | 533 | { |
483 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | 534 | cputime_t rtime, utime, total; |
535 | |||
536 | utime = curr->utime; | ||
537 | total = utime + curr->stime; | ||
484 | 538 | ||
485 | /* | 539 | /* |
486 | * Use CFS's precise accounting: | 540 | * Tick based cputime accounting depends on random scheduling |
541 | * timeslices of a task to be interrupted or not by the timer. | ||
542 | * Depending on these circumstances, the number of these interrupts | ||
543 | * may be over or under-optimistic, matching the real user and system | ||
544 | * cputime with a variable precision. | ||
545 | * | ||
546 | * Fix this by scaling these tick based values against the total | ||
547 | * runtime accounted by the CFS scheduler. | ||
487 | */ | 548 | */ |
488 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 549 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
489 | 550 | ||
490 | if (total) | 551 | if (total) |
491 | utime = scale_utime(utime, rtime, total); | 552 | utime = scale_utime(utime, rtime, total); |
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
493 | utime = rtime; | 554 | utime = rtime; |
494 | 555 | ||
495 | /* | 556 | /* |
496 | * Compare with previous values, to keep monotonicity: | 557 | * If the tick based count grows faster than the scheduler one, |
558 | * the result of the scaling may go backward. | ||
559 | * Let's enforce monotonicity. | ||
497 | */ | 560 | */ |
498 | p->prev_utime = max(p->prev_utime, utime); | 561 | prev->utime = max(prev->utime, utime); |
499 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | 562 | prev->stime = max(prev->stime, rtime - prev->utime); |
500 | 563 | ||
501 | *ut = p->prev_utime; | 564 | *ut = prev->utime; |
502 | *st = p->prev_stime; | 565 | *st = prev->stime; |
566 | } | ||
567 | |||
568 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
569 | { | ||
570 | struct task_cputime cputime = { | ||
571 | .utime = p->utime, | ||
572 | .stime = p->stime, | ||
573 | .sum_exec_runtime = p->se.sum_exec_runtime, | ||
574 | }; | ||
575 | |||
576 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | ||
503 | } | 577 | } |
504 | 578 | ||
505 | /* | 579 | /* |
506 | * Must be called with siglock held. | 580 | * Must be called with siglock held. |
507 | */ | 581 | */ |
508 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 582 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
509 | { | 583 | { |
510 | struct signal_struct *sig = p->signal; | ||
511 | struct task_cputime cputime; | 584 | struct task_cputime cputime; |
512 | cputime_t rtime, utime, total; | ||
513 | 585 | ||
514 | thread_group_cputime(p, &cputime); | 586 | thread_group_cputime(p, &cputime); |
515 | 587 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | |
516 | total = cputime.utime + cputime.stime; | ||
517 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
518 | |||
519 | if (total) | ||
520 | utime = scale_utime(cputime.utime, rtime, total); | ||
521 | else | ||
522 | utime = rtime; | ||
523 | |||
524 | sig->prev_utime = max(sig->prev_utime, utime); | ||
525 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
526 | |||
527 | *ut = sig->prev_utime; | ||
528 | *st = sig->prev_stime; | ||
529 | } | 588 | } |
530 | #endif | 589 | #endif |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) | 61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
62 | { | 62 | { |
63 | struct sched_entity *se = tg->se[cpu]; | 63 | struct sched_entity *se = tg->se[cpu]; |
64 | if (!se) | ||
65 | return; | ||
66 | 64 | ||
67 | #define P(F) \ | 65 | #define P(F) \ |
68 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 66 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
69 | #define PN(F) \ | 67 | #define PN(F) \ |
70 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 68 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
71 | 69 | ||
70 | if (!se) { | ||
71 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | ||
72 | P(avg->runnable_avg_sum); | ||
73 | P(avg->runnable_avg_period); | ||
74 | return; | ||
75 | } | ||
76 | |||
77 | |||
72 | PN(se->exec_start); | 78 | PN(se->exec_start); |
73 | PN(se->vruntime); | 79 | PN(se->vruntime); |
74 | PN(se->sum_exec_runtime); | 80 | PN(se->sum_exec_runtime); |
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
85 | P(se->statistics.wait_count); | 91 | P(se->statistics.wait_count); |
86 | #endif | 92 | #endif |
87 | P(se->load.weight); | 93 | P(se->load.weight); |
94 | #ifdef CONFIG_SMP | ||
95 | P(se->avg.runnable_avg_sum); | ||
96 | P(se->avg.runnable_avg_period); | ||
97 | P(se->avg.load_avg_contrib); | ||
98 | P(se->avg.decay_count); | ||
99 | #endif | ||
88 | #undef PN | 100 | #undef PN |
89 | #undef P | 101 | #undef P |
90 | } | 102 | } |
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 218 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 219 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 220 | #ifdef CONFIG_SMP |
209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", | 221 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", |
210 | SPLIT_NS(cfs_rq->load_avg)); | 222 | cfs_rq->runnable_load_avg); |
211 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | 223 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", |
212 | SPLIT_NS(cfs_rq->load_period)); | 224 | cfs_rq->blocked_load_avg); |
213 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | 225 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", |
214 | cfs_rq->load_contribution); | 226 | atomic64_read(&cfs_rq->tg->load_avg)); |
215 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | 227 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", |
216 | atomic_read(&cfs_rq->tg->load_weight)); | 228 | cfs_rq->tg_load_contrib); |
229 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | ||
230 | cfs_rq->tg_runnable_contrib); | ||
231 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | ||
232 | atomic_read(&cfs_rq->tg->runnable_avg)); | ||
217 | #endif | 233 | #endif |
218 | 234 | ||
219 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 235 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..5eea8707234a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -26,6 +26,9 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 28 | #include <linux/interrupt.h> |
29 | #include <linux/mempolicy.h> | ||
30 | #include <linux/migrate.h> | ||
31 | #include <linux/task_work.h> | ||
29 | 32 | ||
30 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
31 | 34 | ||
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
259 | return grp->my_q; | 262 | return grp->my_q; |
260 | } | 263 | } |
261 | 264 | ||
265 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
266 | int force_update); | ||
267 | |||
262 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 268 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
263 | { | 269 | { |
264 | if (!cfs_rq->on_list) { | 270 | if (!cfs_rq->on_list) { |
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
278 | } | 284 | } |
279 | 285 | ||
280 | cfs_rq->on_list = 1; | 286 | cfs_rq->on_list = 1; |
287 | /* We should have no load, but we need to update last_decay. */ | ||
288 | update_cfs_rq_blocked_load(cfs_rq, 0); | ||
281 | } | 289 | } |
282 | } | 290 | } |
283 | 291 | ||
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 661 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
654 | } | 662 | } |
655 | 663 | ||
656 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
657 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | ||
658 | |||
659 | /* | 664 | /* |
660 | * Update the current task's runtime statistics. Skip current tasks that | 665 | * Update the current task's runtime statistics. Skip current tasks that |
661 | * are not in our scheduling class. | 666 | * are not in our scheduling class. |
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
675 | 680 | ||
676 | curr->vruntime += delta_exec_weighted; | 681 | curr->vruntime += delta_exec_weighted; |
677 | update_min_vruntime(cfs_rq); | 682 | update_min_vruntime(cfs_rq); |
678 | |||
679 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
680 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
681 | #endif | ||
682 | } | 683 | } |
683 | 684 | ||
684 | static void update_curr(struct cfs_rq *cfs_rq) | 685 | static void update_curr(struct cfs_rq *cfs_rq) |
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | * Scheduling class queueing methods: | 777 | * Scheduling class queueing methods: |
777 | */ | 778 | */ |
778 | 779 | ||
780 | #ifdef CONFIG_NUMA_BALANCING | ||
781 | /* | ||
782 | * numa task sample period in ms | ||
783 | */ | ||
784 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | ||
785 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | ||
786 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
787 | |||
788 | /* Portion of address space to scan in MB */ | ||
789 | unsigned int sysctl_numa_balancing_scan_size = 256; | ||
790 | |||
791 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | ||
792 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | ||
793 | |||
794 | static void task_numa_placement(struct task_struct *p) | ||
795 | { | ||
796 | int seq; | ||
797 | |||
798 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | ||
799 | return; | ||
800 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | ||
801 | if (p->numa_scan_seq == seq) | ||
802 | return; | ||
803 | p->numa_scan_seq = seq; | ||
804 | |||
805 | /* FIXME: Scheduling placement policy hints go here */ | ||
806 | } | ||
807 | |||
808 | /* | ||
809 | * Got a PROT_NONE fault for a page on @node. | ||
810 | */ | ||
811 | void task_numa_fault(int node, int pages, bool migrated) | ||
812 | { | ||
813 | struct task_struct *p = current; | ||
814 | |||
815 | if (!sched_feat_numa(NUMA)) | ||
816 | return; | ||
817 | |||
818 | /* FIXME: Allocate task-specific structure for placement policy here */ | ||
819 | |||
820 | /* | ||
821 | * If pages are properly placed (did not migrate) then scan slower. | ||
822 | * This is reset periodically in case of phase changes | ||
823 | */ | ||
824 | if (!migrated) | ||
825 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | ||
826 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
827 | |||
828 | task_numa_placement(p); | ||
829 | } | ||
830 | |||
831 | static void reset_ptenuma_scan(struct task_struct *p) | ||
832 | { | ||
833 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | ||
834 | p->mm->numa_scan_offset = 0; | ||
835 | } | ||
836 | |||
837 | /* | ||
838 | * The expensive part of numa migration is done from task_work context. | ||
839 | * Triggered from task_tick_numa(). | ||
840 | */ | ||
841 | void task_numa_work(struct callback_head *work) | ||
842 | { | ||
843 | unsigned long migrate, next_scan, now = jiffies; | ||
844 | struct task_struct *p = current; | ||
845 | struct mm_struct *mm = p->mm; | ||
846 | struct vm_area_struct *vma; | ||
847 | unsigned long start, end; | ||
848 | long pages; | ||
849 | |||
850 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | ||
851 | |||
852 | work->next = work; /* protect against double add */ | ||
853 | /* | ||
854 | * Who cares about NUMA placement when they're dying. | ||
855 | * | ||
856 | * NOTE: make sure not to dereference p->mm before this check, | ||
857 | * exit_task_work() happens _after_ exit_mm() so we could be called | ||
858 | * without p->mm even though we still had it when we enqueued this | ||
859 | * work. | ||
860 | */ | ||
861 | if (p->flags & PF_EXITING) | ||
862 | return; | ||
863 | |||
864 | /* | ||
865 | * We do not care about task placement until a task runs on a node | ||
866 | * other than the first one used by the address space. This is | ||
867 | * largely because migrations are driven by what CPU the task | ||
868 | * is running on. If it's never scheduled on another node, it'll | ||
869 | * not migrate so why bother trapping the fault. | ||
870 | */ | ||
871 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
872 | mm->first_nid = numa_node_id(); | ||
873 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
874 | /* Are we running on a new node yet? */ | ||
875 | if (numa_node_id() == mm->first_nid && | ||
876 | !sched_feat_numa(NUMA_FORCE)) | ||
877 | return; | ||
878 | |||
879 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
880 | } | ||
881 | |||
882 | /* | ||
883 | * Reset the scan period if enough time has gone by. Objective is that | ||
884 | * scanning will be reduced if pages are properly placed. As tasks | ||
885 | * can enter different phases this needs to be re-examined. Lacking | ||
886 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
887 | */ | ||
888 | migrate = mm->numa_next_reset; | ||
889 | if (time_after(now, migrate)) { | ||
890 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
891 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
892 | xchg(&mm->numa_next_reset, next_scan); | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | * Enforce maximal scan/migration frequency.. | ||
897 | */ | ||
898 | migrate = mm->numa_next_scan; | ||
899 | if (time_before(now, migrate)) | ||
900 | return; | ||
901 | |||
902 | if (p->numa_scan_period == 0) | ||
903 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
904 | |||
905 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | ||
906 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | ||
907 | return; | ||
908 | |||
909 | /* | ||
910 | * Do not set pte_numa if the current running node is rate-limited. | ||
911 | * This loses statistics on the fault but if we are unwilling to | ||
912 | * migrate to this node, it is less likely we can do useful work | ||
913 | */ | ||
914 | if (migrate_ratelimited(numa_node_id())) | ||
915 | return; | ||
916 | |||
917 | start = mm->numa_scan_offset; | ||
918 | pages = sysctl_numa_balancing_scan_size; | ||
919 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | ||
920 | if (!pages) | ||
921 | return; | ||
922 | |||
923 | down_read(&mm->mmap_sem); | ||
924 | vma = find_vma(mm, start); | ||
925 | if (!vma) { | ||
926 | reset_ptenuma_scan(p); | ||
927 | start = 0; | ||
928 | vma = mm->mmap; | ||
929 | } | ||
930 | for (; vma; vma = vma->vm_next) { | ||
931 | if (!vma_migratable(vma)) | ||
932 | continue; | ||
933 | |||
934 | /* Skip small VMAs. They are not likely to be of relevance */ | ||
935 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | ||
936 | continue; | ||
937 | |||
938 | do { | ||
939 | start = max(start, vma->vm_start); | ||
940 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | ||
941 | end = min(end, vma->vm_end); | ||
942 | pages -= change_prot_numa(vma, start, end); | ||
943 | |||
944 | start = end; | ||
945 | if (pages <= 0) | ||
946 | goto out; | ||
947 | } while (end != vma->vm_end); | ||
948 | } | ||
949 | |||
950 | out: | ||
951 | /* | ||
952 | * It is possible to reach the end of the VMA list but the last few VMAs are | ||
953 | * not guaranteed to be vma_migratable. If they are not, we would find the | ||
954 | * !migratable VMA on the next scan but not reset the scanner to the start | ||
955 | * so check it now. | ||
956 | */ | ||
957 | if (vma) | ||
958 | mm->numa_scan_offset = start; | ||
959 | else | ||
960 | reset_ptenuma_scan(p); | ||
961 | up_read(&mm->mmap_sem); | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Drive the periodic memory faults.. | ||
966 | */ | ||
967 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
968 | { | ||
969 | struct callback_head *work = &curr->numa_work; | ||
970 | u64 period, now; | ||
971 | |||
972 | /* | ||
973 | * We don't care about NUMA placement if we don't have memory. | ||
974 | */ | ||
975 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | ||
976 | return; | ||
977 | |||
978 | /* | ||
979 | * Using runtime rather than walltime has the dual advantage that | ||
980 | * we (mostly) drive the selection from busy threads and that the | ||
981 | * task needs to have done some actual work before we bother with | ||
982 | * NUMA placement. | ||
983 | */ | ||
984 | now = curr->se.sum_exec_runtime; | ||
985 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | ||
986 | |||
987 | if (now - curr->node_stamp > period) { | ||
988 | if (!curr->node_stamp) | ||
989 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
990 | curr->node_stamp = now; | ||
991 | |||
992 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | ||
993 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | ||
994 | task_work_add(curr, work, true); | ||
995 | } | ||
996 | } | ||
997 | } | ||
998 | #else | ||
999 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
1000 | { | ||
1001 | } | ||
1002 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1003 | |||
779 | static void | 1004 | static void |
780 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1005 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
781 | { | 1006 | { |
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
801 | } | 1026 | } |
802 | 1027 | ||
803 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1028 | #ifdef CONFIG_FAIR_GROUP_SCHED |
804 | /* we need this in update_cfs_load and load-balance functions below */ | ||
805 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
806 | # ifdef CONFIG_SMP | 1029 | # ifdef CONFIG_SMP |
807 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
808 | int global_update) | ||
809 | { | ||
810 | struct task_group *tg = cfs_rq->tg; | ||
811 | long load_avg; | ||
812 | |||
813 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
814 | load_avg -= cfs_rq->load_contribution; | ||
815 | |||
816 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
817 | atomic_add(load_avg, &tg->load_weight); | ||
818 | cfs_rq->load_contribution += load_avg; | ||
819 | } | ||
820 | } | ||
821 | |||
822 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
823 | { | ||
824 | u64 period = sysctl_sched_shares_window; | ||
825 | u64 now, delta; | ||
826 | unsigned long load = cfs_rq->load.weight; | ||
827 | |||
828 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) | ||
829 | return; | ||
830 | |||
831 | now = rq_of(cfs_rq)->clock_task; | ||
832 | delta = now - cfs_rq->load_stamp; | ||
833 | |||
834 | /* truncate load history at 4 idle periods */ | ||
835 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
836 | now - cfs_rq->load_last > 4 * period) { | ||
837 | cfs_rq->load_period = 0; | ||
838 | cfs_rq->load_avg = 0; | ||
839 | delta = period - 1; | ||
840 | } | ||
841 | |||
842 | cfs_rq->load_stamp = now; | ||
843 | cfs_rq->load_unacc_exec_time = 0; | ||
844 | cfs_rq->load_period += delta; | ||
845 | if (load) { | ||
846 | cfs_rq->load_last = now; | ||
847 | cfs_rq->load_avg += delta * load; | ||
848 | } | ||
849 | |||
850 | /* consider updating load contribution on each fold or truncate */ | ||
851 | if (global_update || cfs_rq->load_period > period | ||
852 | || !cfs_rq->load_period) | ||
853 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
854 | |||
855 | while (cfs_rq->load_period > period) { | ||
856 | /* | ||
857 | * Inline assembly required to prevent the compiler | ||
858 | * optimising this loop into a divmod call. | ||
859 | * See __iter_div_u64_rem() for another example of this. | ||
860 | */ | ||
861 | asm("" : "+rm" (cfs_rq->load_period)); | ||
862 | cfs_rq->load_period /= 2; | ||
863 | cfs_rq->load_avg /= 2; | ||
864 | } | ||
865 | |||
866 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
867 | list_del_leaf_cfs_rq(cfs_rq); | ||
868 | } | ||
869 | |||
870 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | 1030 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) |
871 | { | 1031 | { |
872 | long tg_weight; | 1032 | long tg_weight; |
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
876 | * to gain a more accurate current total weight. See | 1036 | * to gain a more accurate current total weight. See |
877 | * update_cfs_rq_load_contribution(). | 1037 | * update_cfs_rq_load_contribution(). |
878 | */ | 1038 | */ |
879 | tg_weight = atomic_read(&tg->load_weight); | 1039 | tg_weight = atomic64_read(&tg->load_avg); |
880 | tg_weight -= cfs_rq->load_contribution; | 1040 | tg_weight -= cfs_rq->tg_load_contrib; |
881 | tg_weight += cfs_rq->load.weight; | 1041 | tg_weight += cfs_rq->load.weight; |
882 | 1042 | ||
883 | return tg_weight; | 1043 | return tg_weight; |
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
901 | 1061 | ||
902 | return shares; | 1062 | return shares; |
903 | } | 1063 | } |
904 | |||
905 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
906 | { | ||
907 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
908 | update_cfs_load(cfs_rq, 0); | ||
909 | update_cfs_shares(cfs_rq); | ||
910 | } | ||
911 | } | ||
912 | # else /* CONFIG_SMP */ | 1064 | # else /* CONFIG_SMP */ |
913 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
914 | { | ||
915 | } | ||
916 | |||
917 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 1065 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
918 | { | 1066 | { |
919 | return tg->shares; | 1067 | return tg->shares; |
920 | } | 1068 | } |
921 | |||
922 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
923 | { | ||
924 | } | ||
925 | # endif /* CONFIG_SMP */ | 1069 | # endif /* CONFIG_SMP */ |
926 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 1070 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
927 | unsigned long weight) | 1071 | unsigned long weight) |
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
939 | account_entity_enqueue(cfs_rq, se); | 1083 | account_entity_enqueue(cfs_rq, se); |
940 | } | 1084 | } |
941 | 1085 | ||
1086 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
1087 | |||
942 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 1088 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
943 | { | 1089 | { |
944 | struct task_group *tg; | 1090 | struct task_group *tg; |
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
958 | reweight_entity(cfs_rq_of(se), se, shares); | 1104 | reweight_entity(cfs_rq_of(se), se, shares); |
959 | } | 1105 | } |
960 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 1106 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
961 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | 1107 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
962 | { | 1108 | { |
963 | } | 1109 | } |
1110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
964 | 1111 | ||
965 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 1112 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ |
1113 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1114 | /* | ||
1115 | * We choose a half-life close to 1 scheduling period. | ||
1116 | * Note: The tables below are dependent on this value. | ||
1117 | */ | ||
1118 | #define LOAD_AVG_PERIOD 32 | ||
1119 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
1120 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ | ||
1121 | |||
1122 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | ||
1123 | static const u32 runnable_avg_yN_inv[] = { | ||
1124 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | ||
1125 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | ||
1126 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | ||
1127 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | ||
1128 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | ||
1129 | 0x85aac367, 0x82cd8698, | ||
1130 | }; | ||
1131 | |||
1132 | /* | ||
1133 | * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent | ||
1134 | * over-estimates when re-combining. | ||
1135 | */ | ||
1136 | static const u32 runnable_avg_yN_sum[] = { | ||
1137 | 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, | ||
1138 | 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, | ||
1139 | 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, | ||
1140 | }; | ||
1141 | |||
1142 | /* | ||
1143 | * Approximate: | ||
1144 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | ||
1145 | */ | ||
1146 | static __always_inline u64 decay_load(u64 val, u64 n) | ||
966 | { | 1147 | { |
1148 | unsigned int local_n; | ||
1149 | |||
1150 | if (!n) | ||
1151 | return val; | ||
1152 | else if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
1153 | return 0; | ||
1154 | |||
1155 | /* after bounds checking we can collapse to 32-bit */ | ||
1156 | local_n = n; | ||
1157 | |||
1158 | /* | ||
1159 | * As y^PERIOD = 1/2, we can combine | ||
1160 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | ||
1161 | * With a look-up table which covers k^n (n<PERIOD) | ||
1162 | * | ||
1163 | * To achieve constant time decay_load. | ||
1164 | */ | ||
1165 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | ||
1166 | val >>= local_n / LOAD_AVG_PERIOD; | ||
1167 | local_n %= LOAD_AVG_PERIOD; | ||
1168 | } | ||
1169 | |||
1170 | val *= runnable_avg_yN_inv[local_n]; | ||
1171 | /* We don't use SRR here since we always want to round down. */ | ||
1172 | return val >> 32; | ||
967 | } | 1173 | } |
968 | 1174 | ||
969 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | 1175 | /* |
1176 | * For updates fully spanning n periods, the contribution to runnable | ||
1177 | * average will be: \Sum 1024*y^n | ||
1178 | * | ||
1179 | * We can compute this reasonably efficiently by combining: | ||
1180 | * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} | ||
1181 | */ | ||
1182 | static u32 __compute_runnable_contrib(u64 n) | ||
970 | { | 1183 | { |
1184 | u32 contrib = 0; | ||
1185 | |||
1186 | if (likely(n <= LOAD_AVG_PERIOD)) | ||
1187 | return runnable_avg_yN_sum[n]; | ||
1188 | else if (unlikely(n >= LOAD_AVG_MAX_N)) | ||
1189 | return LOAD_AVG_MAX; | ||
1190 | |||
1191 | /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ | ||
1192 | do { | ||
1193 | contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ | ||
1194 | contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; | ||
1195 | |||
1196 | n -= LOAD_AVG_PERIOD; | ||
1197 | } while (n > LOAD_AVG_PERIOD); | ||
1198 | |||
1199 | contrib = decay_load(contrib, n); | ||
1200 | return contrib + runnable_avg_yN_sum[n]; | ||
971 | } | 1201 | } |
972 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 1202 | |
1203 | /* | ||
1204 | * We can represent the historical contribution to runnable average as the | ||
1205 | * coefficients of a geometric series. To do this we sub-divide our runnable | ||
1206 | * history into segments of approximately 1ms (1024us); label the segment that | ||
1207 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | ||
1208 | * | ||
1209 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | ||
1210 | * p0 p1 p2 | ||
1211 | * (now) (~1ms ago) (~2ms ago) | ||
1212 | * | ||
1213 | * Let u_i denote the fraction of p_i that the entity was runnable. | ||
1214 | * | ||
1215 | * We then designate the fractions u_i as our co-efficients, yielding the | ||
1216 | * following representation of historical load: | ||
1217 | * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | ||
1218 | * | ||
1219 | * We choose y based on the width of a reasonable scheduling period, fixing: | ||
1220 | * y^32 = 0.5 | ||
1221 | * | ||
1222 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | ||
1223 | * approximately half as much as the contribution to load within the last ms | ||
1224 | * (u_0). | ||
1225 | * | ||
1226 | * When a period "rolls over" and we have new u_0`, multiplying the previous | ||
1227 | * sum again by y is sufficient to update: | ||
1228 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | ||
1229 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | ||
1230 | */ | ||
1231 | static __always_inline int __update_entity_runnable_avg(u64 now, | ||
1232 | struct sched_avg *sa, | ||
1233 | int runnable) | ||
1234 | { | ||
1235 | u64 delta, periods; | ||
1236 | u32 runnable_contrib; | ||
1237 | int delta_w, decayed = 0; | ||
1238 | |||
1239 | delta = now - sa->last_runnable_update; | ||
1240 | /* | ||
1241 | * This should only happen when time goes backwards, which it | ||
1242 | * unfortunately does during sched clock init when we swap over to TSC. | ||
1243 | */ | ||
1244 | if ((s64)delta < 0) { | ||
1245 | sa->last_runnable_update = now; | ||
1246 | return 0; | ||
1247 | } | ||
1248 | |||
1249 | /* | ||
1250 | * Use 1024ns as the unit of measurement since it's a reasonable | ||
1251 | * approximation of 1us and fast to compute. | ||
1252 | */ | ||
1253 | delta >>= 10; | ||
1254 | if (!delta) | ||
1255 | return 0; | ||
1256 | sa->last_runnable_update = now; | ||
1257 | |||
1258 | /* delta_w is the amount already accumulated against our next period */ | ||
1259 | delta_w = sa->runnable_avg_period % 1024; | ||
1260 | if (delta + delta_w >= 1024) { | ||
1261 | /* period roll-over */ | ||
1262 | decayed = 1; | ||
1263 | |||
1264 | /* | ||
1265 | * Now that we know we're crossing a period boundary, figure | ||
1266 | * out how much from delta we need to complete the current | ||
1267 | * period and accrue it. | ||
1268 | */ | ||
1269 | delta_w = 1024 - delta_w; | ||
1270 | if (runnable) | ||
1271 | sa->runnable_avg_sum += delta_w; | ||
1272 | sa->runnable_avg_period += delta_w; | ||
1273 | |||
1274 | delta -= delta_w; | ||
1275 | |||
1276 | /* Figure out how many additional periods this update spans */ | ||
1277 | periods = delta / 1024; | ||
1278 | delta %= 1024; | ||
1279 | |||
1280 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | ||
1281 | periods + 1); | ||
1282 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | ||
1283 | periods + 1); | ||
1284 | |||
1285 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | ||
1286 | runnable_contrib = __compute_runnable_contrib(periods); | ||
1287 | if (runnable) | ||
1288 | sa->runnable_avg_sum += runnable_contrib; | ||
1289 | sa->runnable_avg_period += runnable_contrib; | ||
1290 | } | ||
1291 | |||
1292 | /* Remainder of delta accrued against u_0` */ | ||
1293 | if (runnable) | ||
1294 | sa->runnable_avg_sum += delta; | ||
1295 | sa->runnable_avg_period += delta; | ||
1296 | |||
1297 | return decayed; | ||
1298 | } | ||
1299 | |||
1300 | /* Synchronize an entity's decay with its parenting cfs_rq.*/ | ||
1301 | static inline u64 __synchronize_entity_decay(struct sched_entity *se) | ||
1302 | { | ||
1303 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1304 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | ||
1305 | |||
1306 | decays -= se->avg.decay_count; | ||
1307 | if (!decays) | ||
1308 | return 0; | ||
1309 | |||
1310 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | ||
1311 | se->avg.decay_count = 0; | ||
1312 | |||
1313 | return decays; | ||
1314 | } | ||
1315 | |||
1316 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1317 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1318 | int force_update) | ||
1319 | { | ||
1320 | struct task_group *tg = cfs_rq->tg; | ||
1321 | s64 tg_contrib; | ||
1322 | |||
1323 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | ||
1324 | tg_contrib -= cfs_rq->tg_load_contrib; | ||
1325 | |||
1326 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | ||
1327 | atomic64_add(tg_contrib, &tg->load_avg); | ||
1328 | cfs_rq->tg_load_contrib += tg_contrib; | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | /* | ||
1333 | * Aggregate cfs_rq runnable averages into an equivalent task_group | ||
1334 | * representation for computing load contributions. | ||
1335 | */ | ||
1336 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1337 | struct cfs_rq *cfs_rq) | ||
1338 | { | ||
1339 | struct task_group *tg = cfs_rq->tg; | ||
1340 | long contrib; | ||
1341 | |||
1342 | /* The fraction of a cpu used by this cfs_rq */ | ||
1343 | contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, | ||
1344 | sa->runnable_avg_period + 1); | ||
1345 | contrib -= cfs_rq->tg_runnable_contrib; | ||
1346 | |||
1347 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | ||
1348 | atomic_add(contrib, &tg->runnable_avg); | ||
1349 | cfs_rq->tg_runnable_contrib += contrib; | ||
1350 | } | ||
1351 | } | ||
1352 | |||
1353 | static inline void __update_group_entity_contrib(struct sched_entity *se) | ||
1354 | { | ||
1355 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
1356 | struct task_group *tg = cfs_rq->tg; | ||
1357 | int runnable_avg; | ||
1358 | |||
1359 | u64 contrib; | ||
1360 | |||
1361 | contrib = cfs_rq->tg_load_contrib * tg->shares; | ||
1362 | se->avg.load_avg_contrib = div64_u64(contrib, | ||
1363 | atomic64_read(&tg->load_avg) + 1); | ||
1364 | |||
1365 | /* | ||
1366 | * For group entities we need to compute a correction term in the case | ||
1367 | * that they are consuming <1 cpu so that we would contribute the same | ||
1368 | * load as a task of equal weight. | ||
1369 | * | ||
1370 | * Explicitly co-ordinating this measurement would be expensive, but | ||
1371 | * fortunately the sum of each cpus contribution forms a usable | ||
1372 | * lower-bound on the true value. | ||
1373 | * | ||
1374 | * Consider the aggregate of 2 contributions. Either they are disjoint | ||
1375 | * (and the sum represents true value) or they are not disjoint and we are | ||
1376 | * understating by the aggregate of their overlap. | ||
1377 | * | ||
1378 | * Extending this to N cpus, for a given overlap, the maximum amount we | ||
1379 | * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of | ||
1380 | * cpus that overlap for this interval and w_i is the interval width. | ||
1381 | * | ||
1382 | * On a small machine; the first term is well-bounded which bounds the | ||
1383 | * total error since w_i is a subset of the period. Whereas on a | ||
1384 | * larger machine, while this first term can be larger, if w_i is of | ||
1385 | * consequential size we are guaranteed to see n_i*w_i quickly converge to | ||
1386 | * our upper bound of 1-cpu. | ||
1387 | */ | ||
1388 | runnable_avg = atomic_read(&tg->runnable_avg); | ||
1389 | if (runnable_avg < NICE_0_LOAD) { | ||
1390 | se->avg.load_avg_contrib *= runnable_avg; | ||
1391 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | ||
1392 | } | ||
1393 | } | ||
1394 | #else | ||
1395 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1396 | int force_update) {} | ||
1397 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1398 | struct cfs_rq *cfs_rq) {} | ||
1399 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | ||
1400 | #endif | ||
1401 | |||
1402 | static inline void __update_task_entity_contrib(struct sched_entity *se) | ||
1403 | { | ||
1404 | u32 contrib; | ||
1405 | |||
1406 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
1407 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | ||
1408 | contrib /= (se->avg.runnable_avg_period + 1); | ||
1409 | se->avg.load_avg_contrib = scale_load(contrib); | ||
1410 | } | ||
1411 | |||
1412 | /* Compute the current contribution to load_avg by se, return any delta */ | ||
1413 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | ||
1414 | { | ||
1415 | long old_contrib = se->avg.load_avg_contrib; | ||
1416 | |||
1417 | if (entity_is_task(se)) { | ||
1418 | __update_task_entity_contrib(se); | ||
1419 | } else { | ||
1420 | __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); | ||
1421 | __update_group_entity_contrib(se); | ||
1422 | } | ||
1423 | |||
1424 | return se->avg.load_avg_contrib - old_contrib; | ||
1425 | } | ||
1426 | |||
1427 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | ||
1428 | long load_contrib) | ||
1429 | { | ||
1430 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | ||
1431 | cfs_rq->blocked_load_avg -= load_contrib; | ||
1432 | else | ||
1433 | cfs_rq->blocked_load_avg = 0; | ||
1434 | } | ||
1435 | |||
1436 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
1437 | |||
1438 | /* Update a sched_entity's runnable average */ | ||
1439 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1440 | int update_cfs_rq) | ||
1441 | { | ||
1442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1443 | long contrib_delta; | ||
1444 | u64 now; | ||
1445 | |||
1446 | /* | ||
1447 | * For a group entity we need to use the cfs_rq_clock_task() of the cfs_rq | ||
1448 | * it owns, in case it is the parent of a throttled hierarchy. | ||
1449 | */ | ||
1450 | if (entity_is_task(se)) | ||
1451 | now = cfs_rq_clock_task(cfs_rq); | ||
1452 | else | ||
1453 | now = cfs_rq_clock_task(group_cfs_rq(se)); | ||
1454 | |||
1455 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | ||
1456 | return; | ||
1457 | |||
1458 | contrib_delta = __update_entity_load_avg_contrib(se); | ||
1459 | |||
1460 | if (!update_cfs_rq) | ||
1461 | return; | ||
1462 | |||
1463 | if (se->on_rq) | ||
1464 | cfs_rq->runnable_load_avg += contrib_delta; | ||
1465 | else | ||
1466 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | ||
1467 | } | ||
1468 | |||
1469 | /* | ||
1470 | * Decay the load contributed by all blocked children and account this so that | ||
1471 | * their contribution may be appropriately discounted when they wake up. | ||
1472 | */ | ||
1473 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | ||
1474 | { | ||
1475 | u64 now = cfs_rq_clock_task(cfs_rq) >> 20; | ||
1476 | u64 decays; | ||
1477 | |||
1478 | decays = now - cfs_rq->last_decay; | ||
1479 | if (!decays && !force_update) | ||
1480 | return; | ||
1481 | |||
1482 | if (atomic64_read(&cfs_rq->removed_load)) { | ||
1483 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | ||
1484 | subtract_blocked_load_contrib(cfs_rq, removed_load); | ||
1485 | } | ||
1486 | |||
1487 | if (decays) { | ||
1488 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | ||
1489 | decays); | ||
1490 | atomic64_add(decays, &cfs_rq->decay_counter); | ||
1491 | cfs_rq->last_decay = now; | ||
1492 | } | ||
1493 | |||
1494 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | ||
1495 | } | ||
1496 | |||
1497 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
1498 | { | ||
1499 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | ||
1500 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
1501 | } | ||
1502 | |||
1503 | /* Add the load generated by se into cfs_rq's child load-average */ | ||
1504 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1505 | struct sched_entity *se, | ||
1506 | int wakeup) | ||
1507 | { | ||
1508 | /* | ||
1509 | * We track migrations using entity decay_count <= 0, on a wake-up | ||
1510 | * migration we use a negative decay count to track the remote decays | ||
1511 | * accumulated while sleeping. | ||
1512 | */ | ||
1513 | if (unlikely(se->avg.decay_count <= 0)) { | ||
1514 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | ||
1515 | if (se->avg.decay_count) { | ||
1516 | /* | ||
1517 | * In a wake-up migration we have to approximate the | ||
1518 | * time sleeping. This is because we can't synchronize | ||
1519 | * clock_task between the two cpus, and it is not | ||
1520 | * guaranteed to be read-safe. Instead, we can | ||
1521 | * approximate this using our carried decays, which are | ||
1522 | * explicitly atomically readable. | ||
1523 | */ | ||
1524 | se->avg.last_runnable_update -= (-se->avg.decay_count) | ||
1525 | << 20; | ||
1526 | update_entity_load_avg(se, 0); | ||
1527 | /* Indicate that we're now synchronized and on-rq */ | ||
1528 | se->avg.decay_count = 0; | ||
1529 | } | ||
1530 | wakeup = 0; | ||
1531 | } else { | ||
1532 | __synchronize_entity_decay(se); | ||
1533 | } | ||
1534 | |||
1535 | /* migrated tasks did not contribute to our blocked load */ | ||
1536 | if (wakeup) { | ||
1537 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | ||
1538 | update_entity_load_avg(se, 0); | ||
1539 | } | ||
1540 | |||
1541 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | ||
1542 | /* we force update consideration on load-balancer moves */ | ||
1543 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | ||
1544 | } | ||
1545 | |||
1546 | /* | ||
1547 | * Remove se's load from this cfs_rq child load-average, if the entity is | ||
1548 | * transitioning to a blocked state we track its projected decay using | ||
1549 | * blocked_load_avg. | ||
1550 | */ | ||
1551 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1552 | struct sched_entity *se, | ||
1553 | int sleep) | ||
1554 | { | ||
1555 | update_entity_load_avg(se, 1); | ||
1556 | /* we force update consideration on load-balancer moves */ | ||
1557 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | ||
1558 | |||
1559 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | ||
1560 | if (sleep) { | ||
1561 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | ||
1562 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
1563 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | ||
1564 | } | ||
1565 | #else | ||
1566 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1567 | int update_cfs_rq) {} | ||
1568 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | ||
1569 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1570 | struct sched_entity *se, | ||
1571 | int wakeup) {} | ||
1572 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1573 | struct sched_entity *se, | ||
1574 | int sleep) {} | ||
1575 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
1576 | int force_update) {} | ||
1577 | #endif | ||
973 | 1578 | ||
974 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1579 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
975 | { | 1580 | { |
@@ -1096,7 +1701,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1096 | * Update run-time statistics of the 'current'. | 1701 | * Update run-time statistics of the 'current'. |
1097 | */ | 1702 | */ |
1098 | update_curr(cfs_rq); | 1703 | update_curr(cfs_rq); |
1099 | update_cfs_load(cfs_rq, 0); | 1704 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); |
1100 | account_entity_enqueue(cfs_rq, se); | 1705 | account_entity_enqueue(cfs_rq, se); |
1101 | update_cfs_shares(cfs_rq); | 1706 | update_cfs_shares(cfs_rq); |
1102 | 1707 | ||
@@ -1171,6 +1776,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1171 | * Update run-time statistics of the 'current'. | 1776 | * Update run-time statistics of the 'current'. |
1172 | */ | 1777 | */ |
1173 | update_curr(cfs_rq); | 1778 | update_curr(cfs_rq); |
1779 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); | ||
1174 | 1780 | ||
1175 | update_stats_dequeue(cfs_rq, se); | 1781 | update_stats_dequeue(cfs_rq, se); |
1176 | if (flags & DEQUEUE_SLEEP) { | 1782 | if (flags & DEQUEUE_SLEEP) { |
@@ -1191,7 +1797,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1191 | if (se != cfs_rq->curr) | 1797 | if (se != cfs_rq->curr) |
1192 | __dequeue_entity(cfs_rq, se); | 1798 | __dequeue_entity(cfs_rq, se); |
1193 | se->on_rq = 0; | 1799 | se->on_rq = 0; |
1194 | update_cfs_load(cfs_rq, 0); | ||
1195 | account_entity_dequeue(cfs_rq, se); | 1800 | account_entity_dequeue(cfs_rq, se); |
1196 | 1801 | ||
1197 | /* | 1802 | /* |
@@ -1340,6 +1945,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1340 | update_stats_wait_start(cfs_rq, prev); | 1945 | update_stats_wait_start(cfs_rq, prev); |
1341 | /* Put 'current' back into the tree. */ | 1946 | /* Put 'current' back into the tree. */ |
1342 | __enqueue_entity(cfs_rq, prev); | 1947 | __enqueue_entity(cfs_rq, prev); |
1948 | /* in !on_rq case, update occurred at dequeue */ | ||
1949 | update_entity_load_avg(prev, 1); | ||
1343 | } | 1950 | } |
1344 | cfs_rq->curr = NULL; | 1951 | cfs_rq->curr = NULL; |
1345 | } | 1952 | } |
@@ -1353,9 +1960,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1353 | update_curr(cfs_rq); | 1960 | update_curr(cfs_rq); |
1354 | 1961 | ||
1355 | /* | 1962 | /* |
1356 | * Update share accounting for long-running entities. | 1963 | * Ensure that runnable average is periodically updated. |
1357 | */ | 1964 | */ |
1358 | update_entity_shares_tick(cfs_rq); | 1965 | update_entity_load_avg(curr, 1); |
1966 | update_cfs_rq_blocked_load(cfs_rq, 1); | ||
1359 | 1967 | ||
1360 | #ifdef CONFIG_SCHED_HRTICK | 1968 | #ifdef CONFIG_SCHED_HRTICK |
1361 | /* | 1969 | /* |
@@ -1448,6 +2056,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
1448 | return &tg->cfs_bandwidth; | 2056 | return &tg->cfs_bandwidth; |
1449 | } | 2057 | } |
1450 | 2058 | ||
2059 | /* rq->clock_task normalized against any time this cfs_rq has spent throttled */ | ||
2060 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | ||
2061 | { | ||
2062 | if (unlikely(cfs_rq->throttle_count)) | ||
2063 | return cfs_rq->throttled_clock_task; | ||
2064 | |||
2065 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | ||
2066 | } | ||
2067 | |||
1451 | /* returns 0 on failure to allocate runtime */ | 2068 | /* returns 0 on failure to allocate runtime */ |
1452 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 2069 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1453 | { | 2070 | { |
@@ -1592,14 +2209,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
1592 | cfs_rq->throttle_count--; | 2209 | cfs_rq->throttle_count--; |
1593 | #ifdef CONFIG_SMP | 2210 | #ifdef CONFIG_SMP |
1594 | if (!cfs_rq->throttle_count) { | 2211 | if (!cfs_rq->throttle_count) { |
1595 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | 2212 | /* adjust cfs_rq_clock_task() */ |
1596 | 2213 | cfs_rq->throttled_clock_task_time += rq->clock_task - | |
1597 | /* leaving throttled state, advance shares averaging windows */ | 2214 | cfs_rq->throttled_clock_task; |
1598 | cfs_rq->load_stamp += delta; | ||
1599 | cfs_rq->load_last += delta; | ||
1600 | |||
1601 | /* update entity weight now that we are on_rq again */ | ||
1602 | update_cfs_shares(cfs_rq); | ||
1603 | } | 2215 | } |
1604 | #endif | 2216 | #endif |
1605 | 2217 | ||
@@ -1611,9 +2223,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
1611 | struct rq *rq = data; | 2223 | struct rq *rq = data; |
1612 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 2224 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
1613 | 2225 | ||
1614 | /* group is entering throttled state, record last load */ | 2226 | /* group is entering throttled state, stop time */ |
1615 | if (!cfs_rq->throttle_count) | 2227 | if (!cfs_rq->throttle_count) |
1616 | update_cfs_load(cfs_rq, 0); | 2228 | cfs_rq->throttled_clock_task = rq->clock_task; |
1617 | cfs_rq->throttle_count++; | 2229 | cfs_rq->throttle_count++; |
1618 | 2230 | ||
1619 | return 0; | 2231 | return 0; |
@@ -1628,7 +2240,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1628 | 2240 | ||
1629 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 2241 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; |
1630 | 2242 | ||
1631 | /* account load preceding throttle */ | 2243 | /* freeze hierarchy runnable averages while throttled */ |
1632 | rcu_read_lock(); | 2244 | rcu_read_lock(); |
1633 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | 2245 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); |
1634 | rcu_read_unlock(); | 2246 | rcu_read_unlock(); |
@@ -1652,7 +2264,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1652 | rq->nr_running -= task_delta; | 2264 | rq->nr_running -= task_delta; |
1653 | 2265 | ||
1654 | cfs_rq->throttled = 1; | 2266 | cfs_rq->throttled = 1; |
1655 | cfs_rq->throttled_timestamp = rq->clock; | 2267 | cfs_rq->throttled_clock = rq->clock; |
1656 | raw_spin_lock(&cfs_b->lock); | 2268 | raw_spin_lock(&cfs_b->lock); |
1657 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 2269 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
1658 | raw_spin_unlock(&cfs_b->lock); | 2270 | raw_spin_unlock(&cfs_b->lock); |
@@ -1670,10 +2282,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1670 | 2282 | ||
1671 | cfs_rq->throttled = 0; | 2283 | cfs_rq->throttled = 0; |
1672 | raw_spin_lock(&cfs_b->lock); | 2284 | raw_spin_lock(&cfs_b->lock); |
1673 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | 2285 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; |
1674 | list_del_rcu(&cfs_rq->throttled_list); | 2286 | list_del_rcu(&cfs_rq->throttled_list); |
1675 | raw_spin_unlock(&cfs_b->lock); | 2287 | raw_spin_unlock(&cfs_b->lock); |
1676 | cfs_rq->throttled_timestamp = 0; | ||
1677 | 2288 | ||
1678 | update_rq_clock(rq); | 2289 | update_rq_clock(rq); |
1679 | /* update hierarchical throttle state */ | 2290 | /* update hierarchical throttle state */ |
@@ -2073,8 +2684,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2073 | } | 2684 | } |
2074 | 2685 | ||
2075 | #else /* CONFIG_CFS_BANDWIDTH */ | 2686 | #else /* CONFIG_CFS_BANDWIDTH */ |
2076 | static __always_inline | 2687 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
2077 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} | 2688 | { |
2689 | return rq_of(cfs_rq)->clock_task; | ||
2690 | } | ||
2691 | |||
2692 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
2693 | unsigned long delta_exec) {} | ||
2078 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2694 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2079 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 2695 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
2080 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2696 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -2207,12 +2823,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2207 | if (cfs_rq_throttled(cfs_rq)) | 2823 | if (cfs_rq_throttled(cfs_rq)) |
2208 | break; | 2824 | break; |
2209 | 2825 | ||
2210 | update_cfs_load(cfs_rq, 0); | ||
2211 | update_cfs_shares(cfs_rq); | 2826 | update_cfs_shares(cfs_rq); |
2827 | update_entity_load_avg(se, 1); | ||
2212 | } | 2828 | } |
2213 | 2829 | ||
2214 | if (!se) | 2830 | if (!se) { |
2831 | update_rq_runnable_avg(rq, rq->nr_running); | ||
2215 | inc_nr_running(rq); | 2832 | inc_nr_running(rq); |
2833 | } | ||
2216 | hrtick_update(rq); | 2834 | hrtick_update(rq); |
2217 | } | 2835 | } |
2218 | 2836 | ||
@@ -2266,12 +2884,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2266 | if (cfs_rq_throttled(cfs_rq)) | 2884 | if (cfs_rq_throttled(cfs_rq)) |
2267 | break; | 2885 | break; |
2268 | 2886 | ||
2269 | update_cfs_load(cfs_rq, 0); | ||
2270 | update_cfs_shares(cfs_rq); | 2887 | update_cfs_shares(cfs_rq); |
2888 | update_entity_load_avg(se, 1); | ||
2271 | } | 2889 | } |
2272 | 2890 | ||
2273 | if (!se) | 2891 | if (!se) { |
2274 | dec_nr_running(rq); | 2892 | dec_nr_running(rq); |
2893 | update_rq_runnable_avg(rq, 1); | ||
2894 | } | ||
2275 | hrtick_update(rq); | 2895 | hrtick_update(rq); |
2276 | } | 2896 | } |
2277 | 2897 | ||
@@ -2781,6 +3401,37 @@ unlock: | |||
2781 | 3401 | ||
2782 | return new_cpu; | 3402 | return new_cpu; |
2783 | } | 3403 | } |
3404 | |||
3405 | /* | ||
3406 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
3407 | * removed when useful for applications beyond shares distribution (e.g. | ||
3408 | * load-balance). | ||
3409 | */ | ||
3410 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
3411 | /* | ||
3412 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | ||
3413 | * cfs_rq_of(p) references at time of call are still valid and identify the | ||
3414 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | ||
3415 | * other assumptions, including the state of rq->lock, should be made. | ||
3416 | */ | ||
3417 | static void | ||
3418 | migrate_task_rq_fair(struct task_struct *p, int next_cpu) | ||
3419 | { | ||
3420 | struct sched_entity *se = &p->se; | ||
3421 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
3422 | |||
3423 | /* | ||
3424 | * Load tracking: accumulate removed load so that it can be processed | ||
3425 | * when we next update owning cfs_rq under rq->lock. Tasks contribute | ||
3426 | * to blocked load iff they have a positive decay-count. It can never | ||
3427 | * be negative here since on-rq tasks have decay-count == 0. | ||
3428 | */ | ||
3429 | if (se->avg.decay_count) { | ||
3430 | se->avg.decay_count = -__synchronize_entity_decay(se); | ||
3431 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | ||
3432 | } | ||
3433 | } | ||
3434 | #endif | ||
2784 | #endif /* CONFIG_SMP */ | 3435 | #endif /* CONFIG_SMP */ |
2785 | 3436 | ||
2786 | static unsigned long | 3437 | static unsigned long |
@@ -2907,7 +3558,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
2907 | * Batch and idle tasks do not preempt non-idle tasks (their preemption | 3558 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
2908 | * is driven by the tick): | 3559 | * is driven by the tick): |
2909 | */ | 3560 | */ |
2910 | if (unlikely(p->policy != SCHED_NORMAL)) | 3561 | if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) |
2911 | return; | 3562 | return; |
2912 | 3563 | ||
2913 | find_matching_se(&se, &pse); | 3564 | find_matching_se(&se, &pse); |
@@ -3033,8 +3684,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3033 | 3684 | ||
3034 | #ifdef CONFIG_SMP | 3685 | #ifdef CONFIG_SMP |
3035 | /************************************************** | 3686 | /************************************************** |
3036 | * Fair scheduling class load-balancing methods: | 3687 | * Fair scheduling class load-balancing methods. |
3037 | */ | 3688 | * |
3689 | * BASICS | ||
3690 | * | ||
3691 | * The purpose of load-balancing is to achieve the same basic fairness the | ||
3692 | * per-cpu scheduler provides, namely provide a proportional amount of compute | ||
3693 | * time to each task. This is expressed in the following equation: | ||
3694 | * | ||
3695 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | ||
3696 | * | ||
3697 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | ||
3698 | * W_i,0 is defined as: | ||
3699 | * | ||
3700 | * W_i,0 = \Sum_j w_i,j (2) | ||
3701 | * | ||
3702 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | ||
3703 | * is derived from the nice value as per prio_to_weight[]. | ||
3704 | * | ||
3705 | * The weight average is an exponential decay average of the instantaneous | ||
3706 | * weight: | ||
3707 | * | ||
3708 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | ||
3709 | * | ||
3710 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | ||
3711 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | ||
3712 | * can also include other factors [XXX]. | ||
3713 | * | ||
3714 | * To achieve this balance we define a measure of imbalance which follows | ||
3715 | * directly from (1): | ||
3716 | * | ||
3717 | * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) | ||
3718 | * | ||
3719 | * We then move tasks around to minimize the imbalance. In the continuous | ||
3720 | * function space it is obvious this converges, in the discrete case we get | ||
3721 | * a few fun cases generally called infeasible weight scenarios. | ||
3722 | * | ||
3723 | * [XXX expand on: | ||
3724 | * - infeasible weights; | ||
3725 | * - local vs global optima in the discrete case. ] | ||
3726 | * | ||
3727 | * | ||
3728 | * SCHED DOMAINS | ||
3729 | * | ||
3730 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | ||
3731 | * for all i,j solution, we create a tree of cpus that follows the hardware | ||
3732 | * topology where each level pairs two lower groups (or better). This results | ||
3733 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | ||
3734 | * tree to only the first of the previous level and we decrease the frequency | ||
3735 | * of load-balance at each level inv. proportional to the number of cpus in | ||
3736 | * the groups. | ||
3737 | * | ||
3738 | * This yields: | ||
3739 | * | ||
3740 | *   log_2 n     1     n | ||
3741 | *   \Sum     { --- * --- * 2^i } = O(n)                            (5) | ||
3742 | *   i = 0      2^i   2^i | ||
3743 | *                             `- size of each group | ||
3744 | *     |         |     `- number of cpus doing load-balance | ||
3745 | *     |         `- freq | ||
3746 | *     `- sum over all levels | ||
3747 | * | ||
3748 | * Coupled with a limit on how many tasks we can migrate every balance pass, | ||
3749 | * this makes (5) the runtime complexity of the balancer. | ||
3750 | * | ||
3751 | * An important property here is that each CPU is still (indirectly) connected | ||
3752 | * to every other cpu in at most O(log n) steps: | ||
3753 | * | ||
3754 | * The adjacency matrix of the resulting graph is given by: | ||
3755 | * | ||
3756 | *           log_2 n | ||
3757 | *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6) | ||
3758 | *           k = 0 | ||
3759 | * | ||
3760 | * And you'll find that: | ||
3761 | * | ||
3762 | * A^(log_2 n)_i,j != 0 for all i,j (7) | ||
3763 | * | ||
3764 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | ||
3765 | * The task movement gives a factor of O(m), giving a convergence complexity | ||
3766 | * of: | ||
3767 | * | ||
3768 | * O(nm log n), n := nr_cpus, m := nr_tasks (8) | ||
3769 | * | ||
3770 | * | ||
3771 | * WORK CONSERVING | ||
3772 | * | ||
3773 | * In order to avoid CPUs going idle while there's still work to do, new idle | ||
3774 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | ||
3775 | * tree itself instead of relying on other CPUs to bring it work. | ||
3776 | * | ||
3777 | * This adds some complexity to both (5) and (8) but it reduces the total idle | ||
3778 | * time. | ||
3779 | * | ||
3780 | * [XXX more?] | ||
3781 | * | ||
3782 | * | ||
3783 | * CGROUPS | ||
3784 | * | ||
3785 | * Cgroups make a horror show out of (2), instead of a simple sum we get: | ||
3786 | * | ||
3787 | *                                s_k,i | ||
3788 | *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9) | ||
3789 | *                                 S_k | ||
3790 | * | ||
3791 | * Where | ||
3792 | * | ||
3793 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | ||
3794 | * | ||
3795 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | ||
3796 | * | ||
3797 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | ||
3798 | * property. | ||
3799 | * | ||
3800 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | ||
3801 | * rewrite all of this once again.] | ||
3802 | */ | ||
3038 | 3803 | ||
3039 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 3804 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
3040 | 3805 | ||
@@ -3300,52 +4065,58 @@ next: | |||
3300 | /* | 4065 | /* |
3301 | * update tg->load_weight by folding this cpu's load_avg | 4066 | * update tg->load_weight by folding this cpu's load_avg |
3302 | */ | 4067 | */ |
3303 | static int update_shares_cpu(struct task_group *tg, int cpu) | 4068 | static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) |
3304 | { | 4069 | { |
3305 | struct cfs_rq *cfs_rq; | 4070 | struct sched_entity *se = tg->se[cpu]; |
3306 | unsigned long flags; | 4071 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; |
3307 | struct rq *rq; | ||
3308 | |||
3309 | if (!tg->se[cpu]) | ||
3310 | return 0; | ||
3311 | |||
3312 | rq = cpu_rq(cpu); | ||
3313 | cfs_rq = tg->cfs_rq[cpu]; | ||
3314 | |||
3315 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
3316 | |||
3317 | update_rq_clock(rq); | ||
3318 | update_cfs_load(cfs_rq, 1); | ||
3319 | 4072 | ||
3320 | /* | 4073 | /* throttled entities do not contribute to load */ |
3321 | * We need to update shares after updating tg->load_weight in | 4074 | if (throttled_hierarchy(cfs_rq)) |
3322 | * order to adjust the weight of groups with long running tasks. | 4075 | return; |
3323 | */ | ||
3324 | update_cfs_shares(cfs_rq); | ||
3325 | 4076 | ||
3326 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 4077 | update_cfs_rq_blocked_load(cfs_rq, 1); |
3327 | 4078 | ||
3328 | return 0; | 4079 | if (se) { |
4080 | update_entity_load_avg(se, 1); | ||
4081 | /* | ||
4082 | * We pivot on our runnable average having decayed to zero for | ||
4083 | * list removal. This generally implies that all our children | ||
4084 | * have also been removed (modulo rounding error or bandwidth | ||
4085 | * control); however, such cases are rare and we can fix these | ||
4086 | * at enqueue. | ||
4087 | * | ||
4088 | * TODO: fix up out-of-order children on enqueue. | ||
4089 | */ | ||
4090 | if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) | ||
4091 | list_del_leaf_cfs_rq(cfs_rq); | ||
4092 | } else { | ||
4093 | struct rq *rq = rq_of(cfs_rq); | ||
4094 | update_rq_runnable_avg(rq, rq->nr_running); | ||
4095 | } | ||
3329 | } | 4096 | } |
3330 | 4097 | ||
3331 | static void update_shares(int cpu) | 4098 | static void update_blocked_averages(int cpu) |
3332 | { | 4099 | { |
3333 | struct cfs_rq *cfs_rq; | ||
3334 | struct rq *rq = cpu_rq(cpu); | 4100 | struct rq *rq = cpu_rq(cpu); |
4101 | struct cfs_rq *cfs_rq; | ||
4102 | unsigned long flags; | ||
3335 | 4103 | ||
3336 | rcu_read_lock(); | 4104 | raw_spin_lock_irqsave(&rq->lock, flags); |
4105 | update_rq_clock(rq); | ||
3337 | /* | 4106 | /* |
3338 | * Iterates the task_group tree in a bottom up fashion, see | 4107 | * Iterates the task_group tree in a bottom up fashion, see |
3339 | * list_add_leaf_cfs_rq() for details. | 4108 | * list_add_leaf_cfs_rq() for details. |
3340 | */ | 4109 | */ |
3341 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 4110 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3342 | /* throttled entities do not contribute to load */ | 4111 | /* |
3343 | if (throttled_hierarchy(cfs_rq)) | 4112 | * Note: We may want to consider periodically releasing |
3344 | continue; | 4113 | * rq->lock about these updates so that creating many task |
3345 | 4114 | * groups does not result in continually extending hold time. | |
3346 | update_shares_cpu(cfs_rq->tg, cpu); | 4115 | */ |
4116 | __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); | ||
3347 | } | 4117 | } |
3348 | rcu_read_unlock(); | 4118 | |
4119 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
3349 | } | 4120 | } |
3350 | 4121 | ||
3351 | /* | 4122 | /* |
@@ -3397,7 +4168,7 @@ static unsigned long task_h_load(struct task_struct *p) | |||
3397 | return load; | 4168 | return load; |
3398 | } | 4169 | } |
3399 | #else | 4170 | #else |
3400 | static inline void update_shares(int cpu) | 4171 | static inline void update_blocked_averages(int cpu) |
3401 | { | 4172 | { |
3402 | } | 4173 | } |
3403 | 4174 | ||
@@ -4457,12 +5228,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
4457 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5228 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
4458 | return; | 5229 | return; |
4459 | 5230 | ||
5231 | update_rq_runnable_avg(this_rq, 1); | ||
5232 | |||
4460 | /* | 5233 | /* |
4461 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5234 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
4462 | */ | 5235 | */ |
4463 | raw_spin_unlock(&this_rq->lock); | 5236 | raw_spin_unlock(&this_rq->lock); |
4464 | 5237 | ||
4465 | update_shares(this_cpu); | 5238 | update_blocked_averages(this_cpu); |
4466 | rcu_read_lock(); | 5239 | rcu_read_lock(); |
4467 | for_each_domain(this_cpu, sd) { | 5240 | for_each_domain(this_cpu, sd) { |
4468 | unsigned long interval; | 5241 | unsigned long interval; |
@@ -4717,7 +5490,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4717 | int update_next_balance = 0; | 5490 | int update_next_balance = 0; |
4718 | int need_serialize; | 5491 | int need_serialize; |
4719 | 5492 | ||
4720 | update_shares(cpu); | 5493 | update_blocked_averages(cpu); |
4721 | 5494 | ||
4722 | rcu_read_lock(); | 5495 | rcu_read_lock(); |
4723 | for_each_domain(cpu, sd) { | 5496 | for_each_domain(cpu, sd) { |
@@ -4954,6 +5727,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4954 | cfs_rq = cfs_rq_of(se); | 5727 | cfs_rq = cfs_rq_of(se); |
4955 | entity_tick(cfs_rq, se, queued); | 5728 | entity_tick(cfs_rq, se, queued); |
4956 | } | 5729 | } |
5730 | |||
5731 | if (sched_feat_numa(NUMA)) | ||
5732 | task_tick_numa(rq, curr); | ||
5733 | |||
5734 | update_rq_runnable_avg(rq, 1); | ||
4957 | } | 5735 | } |
4958 | 5736 | ||
4959 | /* | 5737 | /* |
@@ -5046,6 +5824,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5046 | place_entity(cfs_rq, se, 0); | 5824 | place_entity(cfs_rq, se, 0); |
5047 | se->vruntime -= cfs_rq->min_vruntime; | 5825 | se->vruntime -= cfs_rq->min_vruntime; |
5048 | } | 5826 | } |
5827 | |||
5828 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5829 | /* | ||
5830 | * Remove our load from contribution when we leave sched_fair | ||
5831 | * and ensure we don't carry in an old decay_count if we | ||
5832 | * switch back. | ||
5833 | */ | ||
5834 | if (p->se.avg.decay_count) { | ||
5835 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | ||
5836 | __synchronize_entity_decay(&p->se); | ||
5837 | subtract_blocked_load_contrib(cfs_rq, | ||
5838 | p->se.avg.load_avg_contrib); | ||
5839 | } | ||
5840 | #endif | ||
5049 | } | 5841 | } |
5050 | 5842 | ||
5051 | /* | 5843 | /* |
@@ -5092,11 +5884,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5092 | #ifndef CONFIG_64BIT | 5884 | #ifndef CONFIG_64BIT |
5093 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5885 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5094 | #endif | 5886 | #endif |
5887 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5888 | atomic64_set(&cfs_rq->decay_counter, 1); | ||
5889 | atomic64_set(&cfs_rq->removed_load, 0); | ||
5890 | #endif | ||
5095 | } | 5891 | } |
5096 | 5892 | ||
5097 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
5098 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5894 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
5099 | { | 5895 | { |
5896 | struct cfs_rq *cfs_rq; | ||
5100 | /* | 5897 | /* |
5101 | * If the task was not on the rq at the time of this cgroup movement | 5898 | * If the task was not on the rq at the time of this cgroup movement |
5102 | * it must have been asleep, sleeping tasks keep their ->vruntime | 5899 | * it must have been asleep, sleeping tasks keep their ->vruntime |
@@ -5128,8 +5925,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
5128 | if (!on_rq) | 5925 | if (!on_rq) |
5129 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5926 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
5130 | set_task_rq(p, task_cpu(p)); | 5927 | set_task_rq(p, task_cpu(p)); |
5131 | if (!on_rq) | 5928 | if (!on_rq) { |
5132 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5929 | cfs_rq = cfs_rq_of(&p->se); |
5930 | p->se.vruntime += cfs_rq->min_vruntime; | ||
5931 | #ifdef CONFIG_SMP | ||
5932 | /* | ||
5933 | * migrate_task_rq_fair() will have removed our previous | ||
5934 | * contribution, but we must synchronize for ongoing future | ||
5935 | * decay. | ||
5936 | */ | ||
5937 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
5938 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | ||
5939 | #endif | ||
5940 | } | ||
5133 | } | 5941 | } |
5134 | 5942 | ||
5135 | void free_fair_sched_group(struct task_group *tg) | 5943 | void free_fair_sched_group(struct task_group *tg) |
@@ -5214,10 +6022,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
5214 | 6022 | ||
5215 | cfs_rq->tg = tg; | 6023 | cfs_rq->tg = tg; |
5216 | cfs_rq->rq = rq; | 6024 | cfs_rq->rq = rq; |
5217 | #ifdef CONFIG_SMP | ||
5218 | /* allow initial update_cfs_load() to truncate */ | ||
5219 | cfs_rq->load_stamp = 1; | ||
5220 | #endif | ||
5221 | init_cfs_rq_runtime(cfs_rq); | 6025 | init_cfs_rq_runtime(cfs_rq); |
5222 | 6026 | ||
5223 | tg->cfs_rq[cpu] = cfs_rq; | 6027 | tg->cfs_rq[cpu] = cfs_rq; |
@@ -5319,7 +6123,9 @@ const struct sched_class fair_sched_class = { | |||
5319 | 6123 | ||
5320 | #ifdef CONFIG_SMP | 6124 | #ifdef CONFIG_SMP |
5321 | .select_task_rq = select_task_rq_fair, | 6125 | .select_task_rq = select_task_rq_fair, |
5322 | 6126 | #ifdef CONFIG_FAIR_GROUP_SCHED | |
6127 | .migrate_task_rq = migrate_task_rq_fair, | ||
6128 | #endif | ||
5323 | .rq_online = rq_online_fair, | 6129 | .rq_online = rq_online_fair, |
5324 | .rq_offline = rq_offline_fair, | 6130 | .rq_offline = rq_offline_fair, |
5325 | 6131 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true) | |||
32 | SCHED_FEAT(CACHE_HOT_BUDDY, true) | 32 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * Allow wakeup-time preemption of the current task: | ||
36 | */ | ||
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | ||
38 | |||
39 | /* | ||
35 | * Use arch dependent cpu power functions | 40 | * Use arch dependent cpu power functions |
36 | */ | 41 | */ |
37 | SCHED_FEAT(ARCH_POWER, true) | 42 | SCHED_FEAT(ARCH_POWER, true) |
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
61 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
62 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
63 | SCHED_FEAT(LB_MIN, false) | 68 | SCHED_FEAT(LB_MIN, false) |
69 | |||
70 | /* | ||
71 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
72 | * at runtime if running on a NUMA machine. Can be controlled via | ||
73 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | ||
74 | * for debugging the core machinery. | ||
75 | */ | ||
76 | #ifdef CONFIG_NUMA_BALANCING | ||
77 | SCHED_FEAT(NUMA, false) | ||
78 | SCHED_FEAT(NUMA_FORCE, false) | ||
79 | #endif | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -112,6 +112,8 @@ struct task_group { | |||
112 | unsigned long shares; | 112 | unsigned long shares; |
113 | 113 | ||
114 | atomic_t load_weight; | 114 | atomic_t load_weight; |
115 | atomic64_t load_avg; | ||
116 | atomic_t runnable_avg; | ||
115 | #endif | 117 | #endif |
116 | 118 | ||
117 | #ifdef CONFIG_RT_GROUP_SCHED | 119 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -222,22 +224,29 @@ struct cfs_rq { | |||
222 | unsigned int nr_spread_over; | 224 | unsigned int nr_spread_over; |
223 | #endif | 225 | #endif |
224 | 226 | ||
227 | #ifdef CONFIG_SMP | ||
228 | /* | ||
229 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
230 | * removed when useful for applications beyond shares distribution (e.g. | ||
231 | * load-balance). | ||
232 | */ | ||
225 | #ifdef CONFIG_FAIR_GROUP_SCHED | 233 | #ifdef CONFIG_FAIR_GROUP_SCHED |
226 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
227 | |||
228 | /* | 234 | /* |
229 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 235 | * CFS Load tracking |
230 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 236 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
231 | * (like users, containers etc.) | 237 | * This allows for the description of both thread and group usage (in |
232 | * | 238 | * the FAIR_GROUP_SCHED case). |
233 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
234 | * list is used during load balance. | ||
235 | */ | 239 | */ |
236 | int on_list; | 240 | u64 runnable_load_avg, blocked_load_avg; |
237 | struct list_head leaf_cfs_rq_list; | 241 | atomic64_t decay_counter, removed_load; |
238 | struct task_group *tg; /* group that "owns" this runqueue */ | 242 | u64 last_decay; |
243 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
244 | /* These always depend on CONFIG_FAIR_GROUP_SCHED */ | ||
245 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
246 | u32 tg_runnable_contrib; | ||
247 | u64 tg_load_contrib; | ||
248 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
239 | 249 | ||
240 | #ifdef CONFIG_SMP | ||
241 | /* | 250 | /* |
242 | * h_load = weight * f(tg) | 251 | * h_load = weight * f(tg) |
243 | * | 252 | * |
@@ -245,26 +254,30 @@ struct cfs_rq { | |||
245 | * this group. | 254 | * this group. |
246 | */ | 255 | */ |
247 | unsigned long h_load; | 256 | unsigned long h_load; |
257 | #endif /* CONFIG_SMP */ | ||
258 | |||
259 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
260 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
248 | 261 | ||
249 | /* | 262 | /* |
250 | * Maintaining per-cpu shares distribution for group scheduling | 263 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
264 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
265 | * (like users, containers etc.) | ||
251 | * | 266 | * |
252 | * load_stamp is the last time we updated the load average | 267 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
253 | * load_last is the last time we updated the load average and saw load | 268 | * list is used during load balance. |
254 | * load_unacc_exec_time is currently unaccounted execution time | ||
255 | */ | 269 | */ |
256 | u64 load_avg; | 270 | int on_list; |
257 | u64 load_period; | 271 | struct list_head leaf_cfs_rq_list; |
258 | u64 load_stamp, load_last, load_unacc_exec_time; | 272 | struct task_group *tg; /* group that "owns" this runqueue */ |
259 | 273 | ||
260 | unsigned long load_contribution; | ||
261 | #endif /* CONFIG_SMP */ | ||
262 | #ifdef CONFIG_CFS_BANDWIDTH | 274 | #ifdef CONFIG_CFS_BANDWIDTH |
263 | int runtime_enabled; | 275 | int runtime_enabled; |
264 | u64 runtime_expires; | 276 | u64 runtime_expires; |
265 | s64 runtime_remaining; | 277 | s64 runtime_remaining; |
266 | 278 | ||
267 | u64 throttled_timestamp; | 279 | u64 throttled_clock, throttled_clock_task; |
280 | u64 throttled_clock_task_time; | ||
268 | int throttled, throttle_count; | 281 | int throttled, throttle_count; |
269 | struct list_head throttled_list; | 282 | struct list_head throttled_list; |
270 | #endif /* CONFIG_CFS_BANDWIDTH */ | 283 | #endif /* CONFIG_CFS_BANDWIDTH */ |
@@ -467,6 +480,8 @@ struct rq { | |||
467 | #ifdef CONFIG_SMP | 480 | #ifdef CONFIG_SMP |
468 | struct llist_head wake_list; | 481 | struct llist_head wake_list; |
469 | #endif | 482 | #endif |
483 | |||
484 | struct sched_avg avg; | ||
470 | }; | 485 | }; |
471 | 486 | ||
472 | static inline int cpu_of(struct rq *rq) | 487 | static inline int cpu_of(struct rq *rq) |
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
648 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
649 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
650 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | ||
667 | #define sched_feat_numa(x) sched_feat(x) | ||
668 | #ifdef CONFIG_SCHED_DEBUG | ||
669 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
670 | #else | ||
671 | extern bool numabalancing_enabled; | ||
672 | #endif /* CONFIG_SCHED_DEBUG */ | ||
673 | #else | ||
674 | #define sched_feat_numa(x) (0) | ||
675 | #define numabalancing_enabled (0) | ||
676 | #endif /* CONFIG_NUMA_BALANCING */ | ||
677 | |||
651 | static inline u64 global_rt_period(void) | 678 | static inline u64 global_rt_period(void) |
652 | { | 679 | { |
653 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 680 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
@@ -1212,4 +1239,3 @@ static inline u64 irq_time_read(int cpu) | |||
1212 | } | 1239 | } |
1213 | #endif /* CONFIG_64BIT */ | 1240 | #endif /* CONFIG_64BIT */ |
1214 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1241 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1215 | |||
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) | |||
396 | #ifdef CONFIG_SECCOMP_FILTER | 396 | #ifdef CONFIG_SECCOMP_FILTER |
397 | case SECCOMP_MODE_FILTER: { | 397 | case SECCOMP_MODE_FILTER: { |
398 | int data; | 398 | int data; |
399 | struct pt_regs *regs = task_pt_regs(current); | ||
399 | ret = seccomp_run_filters(this_syscall); | 400 | ret = seccomp_run_filters(this_syscall); |
400 | data = ret & SECCOMP_RET_DATA; | 401 | data = ret & SECCOMP_RET_DATA; |
401 | ret &= SECCOMP_RET_ACTION; | 402 | ret &= SECCOMP_RET_ACTION; |
402 | switch (ret) { | 403 | switch (ret) { |
403 | case SECCOMP_RET_ERRNO: | 404 | case SECCOMP_RET_ERRNO: |
404 | /* Set the low-order 16-bits as an errno. */ | 405 | /* Set the low-order 16-bits as an errno. */ |
405 | syscall_set_return_value(current, task_pt_regs(current), | 406 | syscall_set_return_value(current, regs, |
406 | -data, 0); | 407 | -data, 0); |
407 | goto skip; | 408 | goto skip; |
408 | case SECCOMP_RET_TRAP: | 409 | case SECCOMP_RET_TRAP: |
409 | /* Show the handler the original registers. */ | 410 | /* Show the handler the original registers. */ |
410 | syscall_rollback(current, task_pt_regs(current)); | 411 | syscall_rollback(current, regs); |
411 | /* Let the filter pass back 16 bits of data. */ | 412 | /* Let the filter pass back 16 bits of data. */ |
412 | seccomp_send_sigsys(this_syscall, data); | 413 | seccomp_send_sigsys(this_syscall, data); |
413 | goto skip; | 414 | goto skip; |
414 | case SECCOMP_RET_TRACE: | 415 | case SECCOMP_RET_TRACE: |
415 | /* Skip these calls if there is no tracer. */ | 416 | /* Skip these calls if there is no tracer. */ |
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | 417 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { |
418 | syscall_set_return_value(current, regs, | ||
419 | -ENOSYS, 0); | ||
417 | goto skip; | 420 | goto skip; |
421 | } | ||
418 | /* Allow the BPF to provide the event message */ | 422 | /* Allow the BPF to provide the event message */ |
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 423 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
420 | /* | 424 | /* |
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) | |||
425 | */ | 429 | */ |
426 | if (fatal_signal_pending(current)) | 430 | if (fatal_signal_pending(current)) |
427 | break; | 431 | break; |
432 | if (syscall_get_nr(current, regs) < 0) | ||
433 | goto skip; /* Explicit request to skip. */ | ||
434 | |||
428 | return 0; | 435 | return 0; |
429 | case SECCOMP_RET_ALLOW: | 436 | case SECCOMP_RET_ALLOW: |
430 | return 0; | 437 | return 0; |
diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..372771e948c2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/nsproxy.h> | 31 | #include <linux/nsproxy.h> |
32 | #include <linux/user_namespace.h> | 32 | #include <linux/user_namespace.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/compat.h> | ||
34 | #define CREATE_TRACE_POINTS | 35 | #define CREATE_TRACE_POINTS |
35 | #include <trace/events/signal.h> | 36 | #include <trace/events/signal.h> |
36 | 37 | ||
@@ -1159,8 +1160,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1159 | return __send_signal(sig, info, t, group, from_ancestor_ns); | 1160 | return __send_signal(sig, info, t, group, from_ancestor_ns); |
1160 | } | 1161 | } |
1161 | 1162 | ||
1162 | static void print_fatal_signal(struct pt_regs *regs, int signr) | 1163 | static void print_fatal_signal(int signr) |
1163 | { | 1164 | { |
1165 | struct pt_regs *regs = signal_pt_regs(); | ||
1164 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 1166 | printk("%s/%d: potentially unexpected fatal signal %d.\n", |
1165 | current->comm, task_pid_nr(current), signr); | 1167 | current->comm, task_pid_nr(current), signr); |
1166 | 1168 | ||
@@ -1752,7 +1754,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1752 | * see comment in do_notify_parent() about the following 4 lines | 1754 | * see comment in do_notify_parent() about the following 4 lines |
1753 | */ | 1755 | */ |
1754 | rcu_read_lock(); | 1756 | rcu_read_lock(); |
1755 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1757 | info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); |
1756 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); | 1758 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1757 | rcu_read_unlock(); | 1759 | rcu_read_unlock(); |
1758 | 1760 | ||
@@ -1908,7 +1910,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1908 | preempt_disable(); | 1910 | preempt_disable(); |
1909 | read_unlock(&tasklist_lock); | 1911 | read_unlock(&tasklist_lock); |
1910 | preempt_enable_no_resched(); | 1912 | preempt_enable_no_resched(); |
1911 | schedule(); | 1913 | freezable_schedule(); |
1912 | } else { | 1914 | } else { |
1913 | /* | 1915 | /* |
1914 | * By the time we got the lock, our tracer went away. | 1916 | * By the time we got the lock, our tracer went away. |
@@ -1930,13 +1932,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1930 | } | 1932 | } |
1931 | 1933 | ||
1932 | /* | 1934 | /* |
1933 | * While in TASK_TRACED, we were considered "frozen enough". | ||
1934 | * Now that we woke up, it's crucial if we're supposed to be | ||
1935 | * frozen that we freeze now before running anything substantial. | ||
1936 | */ | ||
1937 | try_to_freeze(); | ||
1938 | |||
1939 | /* | ||
1940 | * We are back. Now reacquire the siglock before touching | 1935 | * We are back. Now reacquire the siglock before touching |
1941 | * last_siginfo, so that we are sure to have synchronized with | 1936 | * last_siginfo, so that we are sure to have synchronized with |
1942 | * any signal-sending on another CPU that wants to examine it. | 1937 | * any signal-sending on another CPU that wants to examine it. |
@@ -2092,7 +2087,7 @@ static bool do_signal_stop(int signr) | |||
2092 | } | 2087 | } |
2093 | 2088 | ||
2094 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2089 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
2095 | schedule(); | 2090 | freezable_schedule(); |
2096 | return true; | 2091 | return true; |
2097 | } else { | 2092 | } else { |
2098 | /* | 2093 | /* |
@@ -2138,10 +2133,9 @@ static void do_jobctl_trap(void) | |||
2138 | } | 2133 | } |
2139 | } | 2134 | } |
2140 | 2135 | ||
2141 | static int ptrace_signal(int signr, siginfo_t *info, | 2136 | static int ptrace_signal(int signr, siginfo_t *info) |
2142 | struct pt_regs *regs, void *cookie) | ||
2143 | { | 2137 | { |
2144 | ptrace_signal_deliver(regs, cookie); | 2138 | ptrace_signal_deliver(); |
2145 | /* | 2139 | /* |
2146 | * We do not check sig_kernel_stop(signr) but set this marker | 2140 | * We do not check sig_kernel_stop(signr) but set this marker |
2147 | * unconditionally because we do not know whether debugger will | 2141 | * unconditionally because we do not know whether debugger will |
@@ -2200,15 +2194,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2200 | if (unlikely(uprobe_deny_signal())) | 2194 | if (unlikely(uprobe_deny_signal())) |
2201 | return 0; | 2195 | return 0; |
2202 | 2196 | ||
2203 | relock: | ||
2204 | /* | 2197 | /* |
2205 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2198 | * Do this once, we can't return to user-mode if freezing() == T. |
2206 | * While in TASK_STOPPED, we were considered "frozen enough". | 2199 | * do_signal_stop() and ptrace_stop() do freezable_schedule() and |
2207 | * Now that we woke up, it's crucial if we're supposed to be | 2200 | * thus do not need another check after return. |
2208 | * frozen that we freeze now before running anything substantial. | ||
2209 | */ | 2201 | */ |
2210 | try_to_freeze(); | 2202 | try_to_freeze(); |
2211 | 2203 | ||
2204 | relock: | ||
2212 | spin_lock_irq(&sighand->siglock); | 2205 | spin_lock_irq(&sighand->siglock); |
2213 | /* | 2206 | /* |
2214 | * Every stopped thread goes here after wakeup. Check to see if | 2207 | * Every stopped thread goes here after wakeup. Check to see if |
@@ -2265,8 +2258,7 @@ relock: | |||
2265 | break; /* will return 0 */ | 2258 | break; /* will return 0 */ |
2266 | 2259 | ||
2267 | if (unlikely(current->ptrace) && signr != SIGKILL) { | 2260 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
2268 | signr = ptrace_signal(signr, info, | 2261 | signr = ptrace_signal(signr, info); |
2269 | regs, cookie); | ||
2270 | if (!signr) | 2262 | if (!signr) |
2271 | continue; | 2263 | continue; |
2272 | } | 2264 | } |
@@ -2351,7 +2343,7 @@ relock: | |||
2351 | 2343 | ||
2352 | if (sig_kernel_coredump(signr)) { | 2344 | if (sig_kernel_coredump(signr)) { |
2353 | if (print_fatal_signals) | 2345 | if (print_fatal_signals) |
2354 | print_fatal_signal(regs, info->si_signo); | 2346 | print_fatal_signal(info->si_signo); |
2355 | /* | 2347 | /* |
2356 | * If it was able to dump core, this kills all | 2348 | * If it was able to dump core, this kills all |
2357 | * other threads in the group and synchronizes with | 2349 | * other threads in the group and synchronizes with |
@@ -2360,7 +2352,7 @@ relock: | |||
2360 | * first and our do_group_exit call below will use | 2352 | * first and our do_group_exit call below will use |
2361 | * that value and ignore the one we pass it. | 2353 | * that value and ignore the one we pass it. |
2362 | */ | 2354 | */ |
2363 | do_coredump(info, regs); | 2355 | do_coredump(info); |
2364 | } | 2356 | } |
2365 | 2357 | ||
2366 | /* | 2358 | /* |
@@ -2536,11 +2528,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) | |||
2536 | */ | 2528 | */ |
2537 | void set_current_blocked(sigset_t *newset) | 2529 | void set_current_blocked(sigset_t *newset) |
2538 | { | 2530 | { |
2539 | struct task_struct *tsk = current; | ||
2540 | sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2531 | sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); |
2541 | spin_lock_irq(&tsk->sighand->siglock); | 2532 | __set_current_blocked(newset); |
2542 | __set_task_blocked(tsk, newset); | ||
2543 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2544 | } | 2533 | } |
2545 | 2534 | ||
2546 | void __set_current_blocked(const sigset_t *newset) | 2535 | void __set_current_blocked(const sigset_t *newset) |
@@ -3103,6 +3092,79 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
3103 | out: | 3092 | out: |
3104 | return error; | 3093 | return error; |
3105 | } | 3094 | } |
3095 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3096 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | ||
3097 | { | ||
3098 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | ||
3099 | } | ||
3100 | #endif | ||
3101 | |||
3102 | int restore_altstack(const stack_t __user *uss) | ||
3103 | { | ||
3104 | int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); | ||
3105 | /* squash all but EFAULT for now */ | ||
3106 | return err == -EFAULT ? err : 0; | ||
3107 | } | ||
3108 | |||
3109 | int __save_altstack(stack_t __user *uss, unsigned long sp) | ||
3110 | { | ||
3111 | struct task_struct *t = current; | ||
3112 | return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | | ||
3113 | __put_user(sas_ss_flags(sp), &uss->ss_flags) | | ||
3114 | __put_user(t->sas_ss_size, &uss->ss_size); | ||
3115 | } | ||
3116 | |||
3117 | #ifdef CONFIG_COMPAT | ||
3118 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3119 | asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, | ||
3120 | compat_stack_t __user *uoss_ptr) | ||
3121 | { | ||
3122 | stack_t uss, uoss; | ||
3123 | int ret; | ||
3124 | mm_segment_t seg; | ||
3125 | |||
3126 | if (uss_ptr) { | ||
3127 | compat_stack_t uss32; | ||
3128 | |||
3129 | memset(&uss, 0, sizeof(stack_t)); | ||
3130 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) | ||
3131 | return -EFAULT; | ||
3132 | uss.ss_sp = compat_ptr(uss32.ss_sp); | ||
3133 | uss.ss_flags = uss32.ss_flags; | ||
3134 | uss.ss_size = uss32.ss_size; | ||
3135 | } | ||
3136 | seg = get_fs(); | ||
3137 | set_fs(KERNEL_DS); | ||
3138 | ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), | ||
3139 | (stack_t __force __user *) &uoss, | ||
3140 | compat_user_stack_pointer()); | ||
3141 | set_fs(seg); | ||
3142 | if (ret >= 0 && uoss_ptr) { | ||
3143 | if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || | ||
3144 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | ||
3145 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | ||
3146 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | ||
3147 | ret = -EFAULT; | ||
3148 | } | ||
3149 | return ret; | ||
3150 | } | ||
3151 | |||
3152 | int compat_restore_altstack(const compat_stack_t __user *uss) | ||
3153 | { | ||
3154 | int err = compat_sys_sigaltstack(uss, NULL); | ||
3155 | /* squash all but -EFAULT for now */ | ||
3156 | return err == -EFAULT ? err : 0; | ||
3157 | } | ||
3158 | |||
3159 | int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | ||
3160 | { | ||
3161 | struct task_struct *t = current; | ||
3162 | return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | | ||
3163 | __put_user(sas_ss_flags(sp), &uss->ss_flags) | | ||
3164 | __put_user(t->sas_ss_size, &uss->ss_size); | ||
3165 | } | ||
3166 | #endif | ||
3167 | #endif | ||
3106 | 3168 | ||
3107 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 3169 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
3108 | 3170 | ||
@@ -3139,7 +3201,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3139 | if (nset) { | 3201 | if (nset) { |
3140 | if (copy_from_user(&new_set, nset, sizeof(*nset))) | 3202 | if (copy_from_user(&new_set, nset, sizeof(*nset))) |
3141 | return -EFAULT; | 3203 | return -EFAULT; |
3142 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
3143 | 3204 | ||
3144 | new_blocked = current->blocked; | 3205 | new_blocked = current->blocked; |
3145 | 3206 | ||
@@ -3157,7 +3218,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3157 | return -EINVAL; | 3218 | return -EINVAL; |
3158 | } | 3219 | } |
3159 | 3220 | ||
3160 | __set_current_blocked(&new_blocked); | 3221 | set_current_blocked(&new_blocked); |
3161 | } | 3222 | } |
3162 | 3223 | ||
3163 | if (oset) { | 3224 | if (oset) { |
@@ -3221,6 +3282,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3221 | int old = current->blocked.sig[0]; | 3282 | int old = current->blocked.sig[0]; |
3222 | sigset_t newset; | 3283 | sigset_t newset; |
3223 | 3284 | ||
3285 | siginitset(&newset, newmask); | ||
3224 | set_current_blocked(&newset); | 3286 | set_current_blocked(&newset); |
3225 | 3287 | ||
3226 | return old; | 3288 | return old; |
diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc0c2c9..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
221 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
222 | 222 | ||
223 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
224 | vtime_account(current); | 224 | vtime_account_irq_enter(current); |
225 | 225 | ||
226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
227 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
@@ -272,7 +272,7 @@ restart: | |||
272 | 272 | ||
273 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
274 | 274 | ||
275 | vtime_account(current); | 275 | vtime_account_irq_exit(current); |
276 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
278 | } | 278 | } |
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void) | |||
341 | */ | 341 | */ |
342 | void irq_exit(void) | 342 | void irq_exit(void) |
343 | { | 343 | { |
344 | vtime_account(current); | 344 | vtime_account_irq_exit(current); |
345 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd844..2b859828cdc3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -16,8 +16,10 @@ | |||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2006 | 18 | * Copyright (C) IBM Corporation, 2006 |
19 | * Copyright (C) Fujitsu, 2012 | ||
19 | * | 20 | * |
20 | * Author: Paul McKenney <paulmck@us.ibm.com> | 21 | * Author: Paul McKenney <paulmck@us.ibm.com> |
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
21 | * | 23 | * |
22 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
23 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
@@ -34,6 +36,10 @@ | |||
34 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 37 | #include <linux/srcu.h> |
36 | 38 | ||
39 | #include <trace/events/rcu.h> | ||
40 | |||
41 | #include "rcu.h" | ||
42 | |||
37 | /* | 43 | /* |
38 | * Initialize an rcu_batch structure to empty. | 44 | * Initialize an rcu_batch structure to empty. |
39 | */ | 45 | */ |
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | |||
92 | } | 98 | } |
93 | } | 99 | } |
94 | 100 | ||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
98 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 101 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
99 | { | 102 | { |
100 | sp->completed = 0; | 103 | sp->completed = 0; |
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
464 | */ | 467 | */ |
465 | void synchronize_srcu(struct srcu_struct *sp) | 468 | void synchronize_srcu(struct srcu_struct *sp) |
466 | { | 469 | { |
467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); | 470 | __synchronize_srcu(sp, rcu_expedited |
471 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | ||
472 | : SYNCHRONIZE_SRCU_TRYCOUNT); | ||
468 | } | 473 | } |
469 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 474 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
470 | 475 | ||
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp) | |||
637 | /* | 642 | /* |
638 | * This is the work-queue function that handles SRCU grace periods. | 643 | * This is the work-queue function that handles SRCU grace periods. |
639 | */ | 644 | */ |
640 | static void process_srcu(struct work_struct *work) | 645 | void process_srcu(struct work_struct *work) |
641 | { | 646 | { |
642 | struct srcu_struct *sp; | 647 | struct srcu_struct *sp; |
643 | 648 | ||
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work) | |||
648 | srcu_invoke_callbacks(sp); | 653 | srcu_invoke_callbacks(sp); |
649 | srcu_reschedule(sp); | 654 | srcu_reschedule(sp); |
650 | } | 655 | } |
656 | EXPORT_SYMBOL_GPL(process_srcu); | ||
diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) | |||
1046 | cputime_t tgutime, tgstime, cutime, cstime; | 1046 | cputime_t tgutime, tgstime, cutime, cstime; |
1047 | 1047 | ||
1048 | spin_lock_irq(¤t->sighand->siglock); | 1048 | spin_lock_irq(¤t->sighand->siglock); |
1049 | thread_group_times(current, &tgutime, &tgstime); | 1049 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
1050 | cutime = current->signal->cutime; | 1050 | cutime = current->signal->cutime; |
1051 | cstime = current->signal->cstime; | 1051 | cstime = current->signal->cstime; |
1052 | spin_unlock_irq(¤t->sighand->siglock); | 1052 | spin_unlock_irq(¤t->sighand->siglock); |
@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1704 | utime = stime = 0; | 1704 | utime = stime = 0; |
1705 | 1705 | ||
1706 | if (who == RUSAGE_THREAD) { | 1706 | if (who == RUSAGE_THREAD) { |
1707 | task_times(current, &utime, &stime); | 1707 | task_cputime_adjusted(current, &utime, &stime); |
1708 | accumulate_thread_rusage(p, r); | 1708 | accumulate_thread_rusage(p, r); |
1709 | maxrss = p->signal->maxrss; | 1709 | maxrss = p->signal->maxrss; |
1710 | goto out; | 1710 | goto out; |
@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1730 | break; | 1730 | break; |
1731 | 1731 | ||
1732 | case RUSAGE_SELF: | 1732 | case RUSAGE_SELF: |
1733 | thread_group_times(p, &tgutime, &tgstime); | 1733 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1734 | utime += tgutime; | 1734 | utime += tgutime; |
1735 | stime += tgstime; | 1735 | stime += tgstime; |
1736 | r->ru_nvcsw += p->signal->nvcsw; | 1736 | r->ru_nvcsw += p->signal->nvcsw; |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); | |||
25 | cond_syscall(sys_kexec_load); | 25 | cond_syscall(sys_kexec_load); |
26 | cond_syscall(compat_sys_kexec_load); | 26 | cond_syscall(compat_sys_kexec_load); |
27 | cond_syscall(sys_init_module); | 27 | cond_syscall(sys_init_module); |
28 | cond_syscall(sys_finit_module); | ||
28 | cond_syscall(sys_delete_module); | 29 | cond_syscall(sys_delete_module); |
29 | cond_syscall(sys_socketpair); | 30 | cond_syscall(sys_socketpair); |
30 | cond_syscall(sys_bind); | 31 | cond_syscall(sys_bind); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ | |||
256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
257 | static int min_wakeup_granularity_ns; /* 0 usecs */ | 257 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
259 | #ifdef CONFIG_SMP | ||
259 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
260 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
261 | #endif | 262 | #endif /* CONFIG_SMP */ |
263 | #endif /* CONFIG_SCHED_DEBUG */ | ||
262 | 264 | ||
263 | #ifdef CONFIG_COMPACTION | 265 | #ifdef CONFIG_COMPACTION |
264 | static int min_extfrag_threshold; | 266 | static int min_extfrag_threshold; |
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { | |||
301 | .extra1 = &min_wakeup_granularity_ns, | 303 | .extra1 = &min_wakeup_granularity_ns, |
302 | .extra2 = &max_wakeup_granularity_ns, | 304 | .extra2 = &max_wakeup_granularity_ns, |
303 | }, | 305 | }, |
306 | #ifdef CONFIG_SMP | ||
304 | { | 307 | { |
305 | .procname = "sched_tunable_scaling", | 308 | .procname = "sched_tunable_scaling", |
306 | .data = &sysctl_sched_tunable_scaling, | 309 | .data = &sysctl_sched_tunable_scaling, |
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = { | |||
347 | .extra1 = &zero, | 350 | .extra1 = &zero, |
348 | .extra2 = &one, | 351 | .extra2 = &one, |
349 | }, | 352 | }, |
350 | #endif | 353 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | ||
355 | { | ||
356 | .procname = "numa_balancing_scan_delay_ms", | ||
357 | .data = &sysctl_numa_balancing_scan_delay, | ||
358 | .maxlen = sizeof(unsigned int), | ||
359 | .mode = 0644, | ||
360 | .proc_handler = proc_dointvec, | ||
361 | }, | ||
362 | { | ||
363 | .procname = "numa_balancing_scan_period_min_ms", | ||
364 | .data = &sysctl_numa_balancing_scan_period_min, | ||
365 | .maxlen = sizeof(unsigned int), | ||
366 | .mode = 0644, | ||
367 | .proc_handler = proc_dointvec, | ||
368 | }, | ||
369 | { | ||
370 | .procname = "numa_balancing_scan_period_reset", | ||
371 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
372 | .maxlen = sizeof(unsigned int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
377 | .procname = "numa_balancing_scan_period_max_ms", | ||
378 | .data = &sysctl_numa_balancing_scan_period_max, | ||
379 | .maxlen = sizeof(unsigned int), | ||
380 | .mode = 0644, | ||
381 | .proc_handler = proc_dointvec, | ||
382 | }, | ||
383 | { | ||
384 | .procname = "numa_balancing_scan_size_mb", | ||
385 | .data = &sysctl_numa_balancing_scan_size, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec, | ||
389 | }, | ||
390 | #endif /* CONFIG_NUMA_BALANCING */ | ||
391 | #endif /* CONFIG_SCHED_DEBUG */ | ||
351 | { | 392 | { |
352 | .procname = "sched_rt_period_us", | 393 | .procname = "sched_rt_period_us", |
353 | .data = &sysctl_sched_rt_period, | 394 | .data = &sysctl_sched_rt_period, |
@@ -565,7 +606,7 @@ static struct ctl_table kern_table[] = { | |||
565 | .extra2 = &one, | 606 | .extra2 = &one, |
566 | }, | 607 | }, |
567 | #endif | 608 | #endif |
568 | #ifdef CONFIG_HOTPLUG | 609 | |
569 | { | 610 | { |
570 | .procname = "hotplug", | 611 | .procname = "hotplug", |
571 | .data = &uevent_helper, | 612 | .data = &uevent_helper, |
@@ -573,7 +614,7 @@ static struct ctl_table kern_table[] = { | |||
573 | .mode = 0644, | 614 | .mode = 0644, |
574 | .proc_handler = proc_dostring, | 615 | .proc_handler = proc_dostring, |
575 | }, | 616 | }, |
576 | #endif | 617 | |
577 | #ifdef CONFIG_CHR_DEV_SG | 618 | #ifdef CONFIG_CHR_DEV_SG |
578 | { | 619 | { |
579 | .procname = "sg-big-buff", | 620 | .procname = "sg-big-buff", |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1344 | goto out_putname; | 1344 | goto out_putname; |
1345 | } | 1345 | } |
1346 | 1346 | ||
1347 | mnt = current->nsproxy->pid_ns->proc_mnt; | 1347 | mnt = task_active_pid_ns(current)->proc_mnt; |
1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); | 1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); |
1349 | result = PTR_ERR(file); | 1349 | result = PTR_ERR(file); |
1350 | if (IS_ERR(file)) | 1350 | if (IS_ERR(file)) |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 6629bf7b5285..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) | |||
58 | return (cycle_t) jiffies; | 58 | return (cycle_t) jiffies; |
59 | } | 59 | } |
60 | 60 | ||
61 | struct clocksource clocksource_jiffies = { | 61 | static struct clocksource clocksource_jiffies = { |
62 | .name = "jiffies", | 62 | .name = "jiffies", |
63 | .rating = 1, /* lowest valid rating*/ | 63 | .rating = 1, /* lowest valid rating*/ |
64 | .read = jiffies_read, | 64 | .read = jiffies_read, |
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = { | |||
67 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
68 | }; | 68 | }; |
69 | 69 | ||
70 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | ||
71 | |||
70 | #if (BITS_PER_LONG < 64) | 72 | #if (BITS_PER_LONG < 64) |
71 | u64 get_jiffies_64(void) | 73 | u64 get_jiffies_64(void) |
72 | { | 74 | { |
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void) | |||
74 | u64 ret; | 76 | u64 ret; |
75 | 77 | ||
76 | do { | 78 | do { |
77 | seq = read_seqbegin(&xtime_lock); | 79 | seq = read_seqbegin(&jiffies_lock); |
78 | ret = jiffies_64; | 80 | ret = jiffies_64; |
79 | } while (read_seqretry(&xtime_lock, seq)); | 81 | } while (read_seqretry(&jiffies_lock, seq)); |
80 | return ret; | 82 | return ret; |
81 | } | 83 | } |
82 | EXPORT_SYMBOL(get_jiffies_64); | 84 | EXPORT_SYMBOL(get_jiffies_64); |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) | |||
63 | static void tick_periodic(int cpu) | 63 | static void tick_periodic(int cpu) |
64 | { | 64 | { |
65 | if (tick_do_timer_cpu == cpu) { | 65 | if (tick_do_timer_cpu == cpu) { |
66 | write_seqlock(&xtime_lock); | 66 | write_seqlock(&jiffies_lock); |
67 | 67 | ||
68 | /* Keep track of the next tick event */ | 68 | /* Keep track of the next tick event */ |
69 | tick_next_period = ktime_add(tick_next_period, tick_period); | 69 | tick_next_period = ktime_add(tick_next_period, tick_period); |
70 | 70 | ||
71 | do_timer(1); | 71 | do_timer(1); |
72 | write_sequnlock(&xtime_lock); | 72 | write_sequnlock(&jiffies_lock); |
73 | } | 73 | } |
74 | 74 | ||
75 | update_process_times(user_mode(get_irq_regs())); | 75 | update_process_times(user_mode(get_irq_regs())); |
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
130 | ktime_t next; | 130 | ktime_t next; |
131 | 131 | ||
132 | do { | 132 | do { |
133 | seq = read_seqbegin(&xtime_lock); | 133 | seq = read_seqbegin(&jiffies_lock); |
134 | next = tick_next_period; | 134 | next = tick_next_period; |
135 | } while (read_seqretry(&xtime_lock, seq)); | 135 | } while (read_seqretry(&jiffies_lock, seq)); |
136 | 136 | ||
137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
138 | 138 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
141 | #endif | 141 | #endif |
142 | 142 | ||
143 | extern void do_timer(unsigned long ticks); | 143 | extern void do_timer(unsigned long ticks); |
144 | extern seqlock_t xtime_lock; | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..d58e552d9fd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -31,7 +31,7 @@ | |||
31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * The time, when the last jiffy update happened. Protected by xtime_lock. | 34 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
35 | */ | 35 | */ |
36 | static ktime_t last_jiffies_update; | 36 | static ktime_t last_jiffies_update; |
37 | 37 | ||
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
49 | ktime_t delta; | 49 | ktime_t delta; |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Do a quick check without holding xtime_lock: | 52 | * Do a quick check without holding jiffies_lock: |
53 | */ | 53 | */ |
54 | delta = ktime_sub(now, last_jiffies_update); | 54 | delta = ktime_sub(now, last_jiffies_update); |
55 | if (delta.tv64 < tick_period.tv64) | 55 | if (delta.tv64 < tick_period.tv64) |
56 | return; | 56 | return; |
57 | 57 | ||
58 | /* Reevalute with xtime_lock held */ | 58 | /* Reevalute with jiffies_lock held */ |
59 | write_seqlock(&xtime_lock); | 59 | write_seqlock(&jiffies_lock); |
60 | 60 | ||
61 | delta = ktime_sub(now, last_jiffies_update); | 61 | delta = ktime_sub(now, last_jiffies_update); |
62 | if (delta.tv64 >= tick_period.tv64) { | 62 | if (delta.tv64 >= tick_period.tv64) { |
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
79 | /* Keep the tick_next_period variable up to date */ | 79 | /* Keep the tick_next_period variable up to date */ |
80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
81 | } | 81 | } |
82 | write_sequnlock(&xtime_lock); | 82 | write_sequnlock(&jiffies_lock); |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void) | |||
89 | { | 89 | { |
90 | ktime_t period; | 90 | ktime_t period; |
91 | 91 | ||
92 | write_seqlock(&xtime_lock); | 92 | write_seqlock(&jiffies_lock); |
93 | /* Did we start the jiffies update yet ? */ | 93 | /* Did we start the jiffies update yet ? */ |
94 | if (last_jiffies_update.tv64 == 0) | 94 | if (last_jiffies_update.tv64 == 0) |
95 | last_jiffies_update = tick_next_period; | 95 | last_jiffies_update = tick_next_period; |
96 | period = last_jiffies_update; | 96 | period = last_jiffies_update; |
97 | write_sequnlock(&xtime_lock); | 97 | write_sequnlock(&jiffies_lock); |
98 | return period; | 98 | return period; |
99 | } | 99 | } |
100 | 100 | ||
101 | |||
102 | static void tick_sched_do_timer(ktime_t now) | ||
103 | { | ||
104 | int cpu = smp_processor_id(); | ||
105 | |||
106 | #ifdef CONFIG_NO_HZ | ||
107 | /* | ||
108 | * Check if the do_timer duty was dropped. We don't care about | ||
109 | * concurrency: This happens only when the cpu in charge went | ||
110 | * into a long sleep. If two cpus happen to assign themself to | ||
111 | * this duty, then the jiffies update is still serialized by | ||
112 | * jiffies_lock. | ||
113 | */ | ||
114 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
115 | tick_do_timer_cpu = cpu; | ||
116 | #endif | ||
117 | |||
118 | /* Check, if the jiffies need an update */ | ||
119 | if (tick_do_timer_cpu == cpu) | ||
120 | tick_do_update_jiffies64(now); | ||
121 | } | ||
122 | |||
123 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | ||
124 | { | ||
125 | #ifdef CONFIG_NO_HZ | ||
126 | /* | ||
127 | * When we are idle and the tick is stopped, we have to touch | ||
128 | * the watchdog as we might not schedule for a really long | ||
129 | * time. This happens on complete idle SMP systems while | ||
130 | * waiting on the login prompt. We also increment the "start of | ||
131 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
132 | * when we go busy again does not account too much ticks. | ||
133 | */ | ||
134 | if (ts->tick_stopped) { | ||
135 | touch_softlockup_watchdog(); | ||
136 | if (is_idle_task(current)) | ||
137 | ts->idle_jiffies++; | ||
138 | } | ||
139 | #endif | ||
140 | update_process_times(user_mode(regs)); | ||
141 | profile_tick(CPU_PROFILING); | ||
142 | } | ||
143 | |||
101 | /* | 144 | /* |
102 | * NOHZ - aka dynamic tick functionality | 145 | * NOHZ - aka dynamic tick functionality |
103 | */ | 146 | */ |
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
282 | 325 | ||
283 | /* Read jiffies and the time when jiffies were updated last */ | 326 | /* Read jiffies and the time when jiffies were updated last */ |
284 | do { | 327 | do { |
285 | seq = read_seqbegin(&xtime_lock); | 328 | seq = read_seqbegin(&jiffies_lock); |
286 | last_update = last_jiffies_update; | 329 | last_update = last_jiffies_update; |
287 | last_jiffies = jiffies; | 330 | last_jiffies = jiffies; |
288 | time_delta = timekeeping_max_deferment(); | 331 | time_delta = timekeeping_max_deferment(); |
289 | } while (read_seqretry(&xtime_lock, seq)); | 332 | } while (read_seqretry(&jiffies_lock, seq)); |
290 | 333 | ||
291 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || | 334 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
292 | arch_needs_cpu(cpu)) { | 335 | arch_needs_cpu(cpu)) { |
@@ -526,6 +569,8 @@ void tick_nohz_irq_exit(void) | |||
526 | if (!ts->inidle) | 569 | if (!ts->inidle) |
527 | return; | 570 | return; |
528 | 571 | ||
572 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
573 | menu_hrtimer_cancel(); | ||
529 | __tick_nohz_idle_enter(ts); | 574 | __tick_nohz_idle_enter(ts); |
530 | } | 575 | } |
531 | 576 | ||
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void) | |||
621 | 666 | ||
622 | ts->inidle = 0; | 667 | ts->inidle = 0; |
623 | 668 | ||
669 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
670 | menu_hrtimer_cancel(); | ||
624 | if (ts->idle_active || ts->tick_stopped) | 671 | if (ts->idle_active || ts->tick_stopped) |
625 | now = ktime_get(); | 672 | now = ktime_get(); |
626 | 673 | ||
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
648 | { | 695 | { |
649 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 696 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
650 | struct pt_regs *regs = get_irq_regs(); | 697 | struct pt_regs *regs = get_irq_regs(); |
651 | int cpu = smp_processor_id(); | ||
652 | ktime_t now = ktime_get(); | 698 | ktime_t now = ktime_get(); |
653 | 699 | ||
654 | dev->next_event.tv64 = KTIME_MAX; | 700 | dev->next_event.tv64 = KTIME_MAX; |
655 | 701 | ||
656 | /* | 702 | tick_sched_do_timer(now); |
657 | * Check if the do_timer duty was dropped. We don't care about | 703 | tick_sched_handle(ts, regs); |
658 | * concurrency: This happens only when the cpu in charge went | ||
659 | * into a long sleep. If two cpus happen to assign themself to | ||
660 | * this duty, then the jiffies update is still serialized by | ||
661 | * xtime_lock. | ||
662 | */ | ||
663 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
664 | tick_do_timer_cpu = cpu; | ||
665 | |||
666 | /* Check, if the jiffies need an update */ | ||
667 | if (tick_do_timer_cpu == cpu) | ||
668 | tick_do_update_jiffies64(now); | ||
669 | |||
670 | /* | ||
671 | * When we are idle and the tick is stopped, we have to touch | ||
672 | * the watchdog as we might not schedule for a really long | ||
673 | * time. This happens on complete idle SMP systems while | ||
674 | * waiting on the login prompt. We also increment the "start | ||
675 | * of idle" jiffy stamp so the idle accounting adjustment we | ||
676 | * do when we go busy again does not account too much ticks. | ||
677 | */ | ||
678 | if (ts->tick_stopped) { | ||
679 | touch_softlockup_watchdog(); | ||
680 | ts->idle_jiffies++; | ||
681 | } | ||
682 | |||
683 | update_process_times(user_mode(regs)); | ||
684 | profile_tick(CPU_PROFILING); | ||
685 | 704 | ||
686 | while (tick_nohz_reprogram(ts, now)) { | 705 | while (tick_nohz_reprogram(ts, now)) { |
687 | now = ktime_get(); | 706 | now = ktime_get(); |
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu) | |||
794 | #ifdef CONFIG_HIGH_RES_TIMERS | 813 | #ifdef CONFIG_HIGH_RES_TIMERS |
795 | /* | 814 | /* |
796 | * We rearm the timer until we get disabled by the idle code. | 815 | * We rearm the timer until we get disabled by the idle code. |
797 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 816 | * Called with interrupts disabled. |
798 | */ | 817 | */ |
799 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 818 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
800 | { | 819 | { |
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
802 | container_of(timer, struct tick_sched, sched_timer); | 821 | container_of(timer, struct tick_sched, sched_timer); |
803 | struct pt_regs *regs = get_irq_regs(); | 822 | struct pt_regs *regs = get_irq_regs(); |
804 | ktime_t now = ktime_get(); | 823 | ktime_t now = ktime_get(); |
805 | int cpu = smp_processor_id(); | ||
806 | 824 | ||
807 | #ifdef CONFIG_NO_HZ | 825 | tick_sched_do_timer(now); |
808 | /* | ||
809 | * Check if the do_timer duty was dropped. We don't care about | ||
810 | * concurrency: This happens only when the cpu in charge went | ||
811 | * into a long sleep. If two cpus happen to assign themself to | ||
812 | * this duty, then the jiffies update is still serialized by | ||
813 | * xtime_lock. | ||
814 | */ | ||
815 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
816 | tick_do_timer_cpu = cpu; | ||
817 | #endif | ||
818 | |||
819 | /* Check, if the jiffies need an update */ | ||
820 | if (tick_do_timer_cpu == cpu) | ||
821 | tick_do_update_jiffies64(now); | ||
822 | 826 | ||
823 | /* | 827 | /* |
824 | * Do not call, when we are not in irq context and have | 828 | * Do not call, when we are not in irq context and have |
825 | * no valid regs pointer | 829 | * no valid regs pointer |
826 | */ | 830 | */ |
827 | if (regs) { | 831 | if (regs) |
828 | /* | 832 | tick_sched_handle(ts, regs); |
829 | * When we are idle and the tick is stopped, we have to touch | ||
830 | * the watchdog as we might not schedule for a really long | ||
831 | * time. This happens on complete idle SMP systems while | ||
832 | * waiting on the login prompt. We also increment the "start of | ||
833 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
834 | * when we go busy again does not account too much ticks. | ||
835 | */ | ||
836 | if (ts->tick_stopped) { | ||
837 | touch_softlockup_watchdog(); | ||
838 | if (is_idle_task(current)) | ||
839 | ts->idle_jiffies++; | ||
840 | } | ||
841 | update_process_times(user_mode(regs)); | ||
842 | profile_tick(CPU_PROFILING); | ||
843 | } | ||
844 | 833 | ||
845 | hrtimer_forward(timer, now, tick_period); | 834 | hrtimer_forward(timer, now, tick_period); |
846 | 835 | ||
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void) | |||
874 | /* Get the next period (per cpu) */ | 863 | /* Get the next period (per cpu) */ |
875 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 864 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
876 | 865 | ||
877 | /* Offset the tick to avert xtime_lock contention. */ | 866 | /* Offset the tick to avert jiffies_lock contention. */ |
878 | if (sched_skew_tick) { | 867 | if (sched_skew_tick) { |
879 | u64 offset = ktime_to_ns(tick_period) >> 1; | 868 | u64 offset = ktime_to_ns(tick_period) >> 1; |
880 | do_div(offset, num_possible_cpus()); | 869 | do_div(offset, num_possible_cpus()); |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null | |||
@@ -1,193 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Intel Corporation. | ||
3 | * Author: Patrick Ohly <patrick.ohly@intel.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | */ | ||
19 | |||
20 | #include <linux/timecompare.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/math64.h> | ||
24 | #include <linux/kernel.h> | ||
25 | |||
26 | /* | ||
27 | * fixed point arithmetic scale factor for skew | ||
28 | * | ||
29 | * Usually one would measure skew in ppb (parts per billion, 1e9), but | ||
30 | * using a factor of 2 simplifies the math. | ||
31 | */ | ||
32 | #define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) | ||
33 | |||
34 | ktime_t timecompare_transform(struct timecompare *sync, | ||
35 | u64 source_tstamp) | ||
36 | { | ||
37 | u64 nsec; | ||
38 | |||
39 | nsec = source_tstamp + sync->offset; | ||
40 | nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / | ||
41 | TIMECOMPARE_SKEW_RESOLUTION; | ||
42 | |||
43 | return ns_to_ktime(nsec); | ||
44 | } | ||
45 | EXPORT_SYMBOL_GPL(timecompare_transform); | ||
46 | |||
47 | int timecompare_offset(struct timecompare *sync, | ||
48 | s64 *offset, | ||
49 | u64 *source_tstamp) | ||
50 | { | ||
51 | u64 start_source = 0, end_source = 0; | ||
52 | struct { | ||
53 | s64 offset; | ||
54 | s64 duration_target; | ||
55 | } buffer[10], sample, *samples; | ||
56 | int counter = 0, i; | ||
57 | int used; | ||
58 | int index; | ||
59 | int num_samples = sync->num_samples; | ||
60 | |||
61 | if (num_samples > ARRAY_SIZE(buffer)) { | ||
62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | ||
63 | if (!samples) { | ||
64 | samples = buffer; | ||
65 | num_samples = ARRAY_SIZE(buffer); | ||
66 | } | ||
67 | } else { | ||
68 | samples = buffer; | ||
69 | } | ||
70 | |||
71 | /* run until we have enough valid samples, but do not try forever */ | ||
72 | i = 0; | ||
73 | counter = 0; | ||
74 | while (1) { | ||
75 | u64 ts; | ||
76 | ktime_t start, end; | ||
77 | |||
78 | start = sync->target(); | ||
79 | ts = timecounter_read(sync->source); | ||
80 | end = sync->target(); | ||
81 | |||
82 | if (!i) | ||
83 | start_source = ts; | ||
84 | |||
85 | /* ignore negative durations */ | ||
86 | sample.duration_target = ktime_to_ns(ktime_sub(end, start)); | ||
87 | if (sample.duration_target >= 0) { | ||
88 | /* | ||
89 | * assume symetric delay to and from source: | ||
90 | * average target time corresponds to measured | ||
91 | * source time | ||
92 | */ | ||
93 | sample.offset = | ||
94 | (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - | ||
95 | ts; | ||
96 | |||
97 | /* simple insertion sort based on duration */ | ||
98 | index = counter - 1; | ||
99 | while (index >= 0) { | ||
100 | if (samples[index].duration_target < | ||
101 | sample.duration_target) | ||
102 | break; | ||
103 | samples[index + 1] = samples[index]; | ||
104 | index--; | ||
105 | } | ||
106 | samples[index + 1] = sample; | ||
107 | counter++; | ||
108 | } | ||
109 | |||
110 | i++; | ||
111 | if (counter >= num_samples || i >= 100000) { | ||
112 | end_source = ts; | ||
113 | break; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | *source_tstamp = (end_source + start_source) / 2; | ||
118 | |||
119 | /* remove outliers by only using 75% of the samples */ | ||
120 | used = counter * 3 / 4; | ||
121 | if (!used) | ||
122 | used = counter; | ||
123 | if (used) { | ||
124 | /* calculate average */ | ||
125 | s64 off = 0; | ||
126 | for (index = 0; index < used; index++) | ||
127 | off += samples[index].offset; | ||
128 | *offset = div_s64(off, used); | ||
129 | } | ||
130 | |||
131 | if (samples && samples != buffer) | ||
132 | kfree(samples); | ||
133 | |||
134 | return used; | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(timecompare_offset); | ||
137 | |||
138 | void __timecompare_update(struct timecompare *sync, | ||
139 | u64 source_tstamp) | ||
140 | { | ||
141 | s64 offset; | ||
142 | u64 average_time; | ||
143 | |||
144 | if (!timecompare_offset(sync, &offset, &average_time)) | ||
145 | return; | ||
146 | |||
147 | if (!sync->last_update) { | ||
148 | sync->last_update = average_time; | ||
149 | sync->offset = offset; | ||
150 | sync->skew = 0; | ||
151 | } else { | ||
152 | s64 delta_nsec = average_time - sync->last_update; | ||
153 | |||
154 | /* avoid division by negative or small deltas */ | ||
155 | if (delta_nsec >= 10000) { | ||
156 | s64 delta_offset_nsec = offset - sync->offset; | ||
157 | s64 skew; /* delta_offset_nsec * | ||
158 | TIMECOMPARE_SKEW_RESOLUTION / | ||
159 | delta_nsec */ | ||
160 | u64 divisor; | ||
161 | |||
162 | /* div_s64() is limited to 32 bit divisor */ | ||
163 | skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; | ||
164 | divisor = delta_nsec; | ||
165 | while (unlikely(divisor >= ((s64)1) << 32)) { | ||
166 | /* divide both by 2; beware, right shift | ||
167 | of negative value has undefined | ||
168 | behavior and can only be used for | ||
169 | the positive divisor */ | ||
170 | skew = div_s64(skew, 2); | ||
171 | divisor >>= 1; | ||
172 | } | ||
173 | skew = div_s64(skew, divisor); | ||
174 | |||
175 | /* | ||
176 | * Calculate new overall skew as 4/16 the | ||
177 | * old value and 12/16 the new one. This is | ||
178 | * a rather arbitrary tradeoff between | ||
179 | * only using the latest measurement (0/16 and | ||
180 | * 16/16) and even more weight on past measurements. | ||
181 | */ | ||
182 | #define TIMECOMPARE_NEW_SKEW_PER_16 12 | ||
183 | sync->skew = | ||
184 | div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * | ||
185 | sync->skew + | ||
186 | TIMECOMPARE_NEW_SKEW_PER_16 * skew, | ||
187 | 16); | ||
188 | sync->last_update = average_time; | ||
189 | sync->offset = offset; | ||
190 | } | ||
191 | } | ||
192 | } | ||
193 | EXPORT_SYMBOL_GPL(__timecompare_update); | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..cbc6acb0db3f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -21,16 +21,11 @@ | |||
21 | #include <linux/time.h> | 21 | #include <linux/time.h> |
22 | #include <linux/tick.h> | 22 | #include <linux/tick.h> |
23 | #include <linux/stop_machine.h> | 23 | #include <linux/stop_machine.h> |
24 | #include <linux/pvclock_gtod.h> | ||
24 | 25 | ||
25 | 26 | ||
26 | static struct timekeeper timekeeper; | 27 | static struct timekeeper timekeeper; |
27 | 28 | ||
28 | /* | ||
29 | * This read-write spinlock protects us from races in SMP while | ||
30 | * playing with xtime. | ||
31 | */ | ||
32 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
33 | |||
34 | /* flag for if timekeeping is suspended */ | 29 | /* flag for if timekeeping is suspended */ |
35 | int __read_mostly timekeeping_suspended; | 30 | int __read_mostly timekeeping_suspended; |
36 | 31 | ||
@@ -180,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
180 | return nsec + arch_gettimeoffset(); | 175 | return nsec + arch_gettimeoffset(); |
181 | } | 176 | } |
182 | 177 | ||
178 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | ||
179 | |||
180 | static void update_pvclock_gtod(struct timekeeper *tk) | ||
181 | { | ||
182 | raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * pvclock_gtod_register_notifier - register a pvclock timedata update listener | ||
187 | * | ||
188 | * Takes timekeeper.lock for writing internally | ||
189 | */ | ||
190 | int pvclock_gtod_register_notifier(struct notifier_block *nb) | ||
191 | { | ||
192 | struct timekeeper *tk = &timekeeper; | ||
193 | unsigned long flags; | ||
194 | int ret; | ||
195 | |||
196 | write_seqlock_irqsave(&tk->lock, flags); | ||
197 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); | ||
198 | /* update timekeeping data */ | ||
199 | update_pvclock_gtod(tk); | ||
200 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
201 | |||
202 | return ret; | ||
203 | } | ||
204 | EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); | ||
205 | |||
206 | /** | ||
207 | * pvclock_gtod_unregister_notifier - unregister a pvclock | ||
208 | * timedata update listener | ||
209 | * | ||
210 | * Takes timekeeper.lock for writing internally | ||
211 | */ | ||
212 | int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | ||
213 | { | ||
214 | struct timekeeper *tk = &timekeeper; | ||
215 | unsigned long flags; | ||
216 | int ret; | ||
217 | |||
218 | write_seqlock_irqsave(&tk->lock, flags); | ||
219 | ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); | ||
220 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
221 | |||
222 | return ret; | ||
223 | } | ||
224 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | ||
225 | |||
183 | /* must hold write on timekeeper.lock */ | 226 | /* must hold write on timekeeper.lock */ |
184 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) | 227 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
185 | { | 228 | { |
@@ -188,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) | |||
188 | ntp_clear(); | 231 | ntp_clear(); |
189 | } | 232 | } |
190 | update_vsyscall(tk); | 233 | update_vsyscall(tk); |
234 | update_pvclock_gtod(tk); | ||
191 | } | 235 | } |
192 | 236 | ||
193 | /** | 237 | /** |
@@ -1299,9 +1343,7 @@ struct timespec get_monotonic_coarse(void) | |||
1299 | } | 1343 | } |
1300 | 1344 | ||
1301 | /* | 1345 | /* |
1302 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 1346 | * Must hold jiffies_lock |
1303 | * without sampling the sequence number in xtime_lock. | ||
1304 | * jiffies is defined in the linker script... | ||
1305 | */ | 1347 | */ |
1306 | void do_timer(unsigned long ticks) | 1348 | void do_timer(unsigned long ticks) |
1307 | { | 1349 | { |
@@ -1389,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | |||
1389 | */ | 1431 | */ |
1390 | void xtime_update(unsigned long ticks) | 1432 | void xtime_update(unsigned long ticks) |
1391 | { | 1433 | { |
1392 | write_seqlock(&xtime_lock); | 1434 | write_seqlock(&jiffies_lock); |
1393 | do_timer(ticks); | 1435 | do_timer(ticks); |
1394 | write_sequnlock(&xtime_lock); | 1436 | write_sequnlock(&jiffies_lock); |
1395 | } | 1437 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4cea4f41c1d9..5d89335a485f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -119,6 +119,7 @@ config TRACING | |||
119 | select BINARY_PRINTF | 119 | select BINARY_PRINTF |
120 | select EVENT_TRACING | 120 | select EVENT_TRACING |
121 | select TRACE_CLOCK | 121 | select TRACE_CLOCK |
122 | select IRQ_WORK | ||
122 | 123 | ||
123 | config GENERIC_TRACER | 124 | config GENERIC_TRACER |
124 | bool | 125 | bool |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9dcf15d38380..3ffe4c5ad3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * Based on code in the latency_tracer, that is: | 10 | * Based on code in the latency_tracer, that is: |
11 | * | 11 | * |
12 | * Copyright (C) 2004-2006 Ingo Molnar | 12 | * Copyright (C) 2004-2006 Ingo Molnar |
13 | * Copyright (C) 2004 William Lee Irwin III | 13 | * Copyright (C) 2004 Nadia Yvette Chambers |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/stop_machine.h> | 16 | #include <linux/stop_machine.h> |
@@ -2437,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
2437 | { | 2437 | { |
2438 | iter->pos = 0; | 2438 | iter->pos = 0; |
2439 | iter->func_pos = 0; | 2439 | iter->func_pos = 0; |
2440 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | 2440 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); |
2441 | } | 2441 | } |
2442 | 2442 | ||
2443 | static void *t_start(struct seq_file *m, loff_t *pos) | 2443 | static void *t_start(struct seq_file *m, loff_t *pos) |
@@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) | |||
2675 | } | 2675 | } |
2676 | 2676 | ||
2677 | loff_t | 2677 | loff_t |
2678 | ftrace_regex_lseek(struct file *file, loff_t offset, int origin) | 2678 | ftrace_regex_lseek(struct file *file, loff_t offset, int whence) |
2679 | { | 2679 | { |
2680 | loff_t ret; | 2680 | loff_t ret; |
2681 | 2681 | ||
2682 | if (file->f_mode & FMODE_READ) | 2682 | if (file->f_mode & FMODE_READ) |
2683 | ret = seq_lseek(file, offset, origin); | 2683 | ret = seq_lseek(file, offset, whence); |
2684 | else | 2684 | else |
2685 | file->f_pos = ret = 1; | 2685 | file->f_pos = ret = 1; |
2686 | 2686 | ||
@@ -2868,7 +2868,7 @@ static int __init ftrace_mod_cmd_init(void) | |||
2868 | { | 2868 | { |
2869 | return register_ftrace_command(&ftrace_mod_cmd); | 2869 | return register_ftrace_command(&ftrace_mod_cmd); |
2870 | } | 2870 | } |
2871 | device_initcall(ftrace_mod_cmd_init); | 2871 | core_initcall(ftrace_mod_cmd_init); |
2872 | 2872 | ||
2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | 2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, |
2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
@@ -4055,7 +4055,7 @@ static int __init ftrace_nodyn_init(void) | |||
4055 | ftrace_enabled = 1; | 4055 | ftrace_enabled = 1; |
4056 | return 0; | 4056 | return 0; |
4057 | } | 4057 | } |
4058 | device_initcall(ftrace_nodyn_init); | 4058 | core_initcall(ftrace_nodyn_init); |
4059 | 4059 | ||
4060 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4060 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
4061 | static inline void ftrace_startup_enable(int command) { } | 4061 | static inline void ftrace_startup_enable(int command) { } |
@@ -4381,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, | |||
4381 | if (strlen(tmp) == 0) | 4381 | if (strlen(tmp) == 0) |
4382 | return 1; | 4382 | return 1; |
4383 | 4383 | ||
4384 | ret = strict_strtol(tmp, 10, &val); | 4384 | ret = kstrtol(tmp, 10, &val); |
4385 | if (ret < 0) | 4385 | if (ret < 0) |
4386 | return ret; | 4386 | return ret; |
4387 | 4387 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b979426d16c6..ce8514feedcd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu { | |||
460 | unsigned long lost_events; | 460 | unsigned long lost_events; |
461 | unsigned long last_overrun; | 461 | unsigned long last_overrun; |
462 | local_t entries_bytes; | 462 | local_t entries_bytes; |
463 | local_t commit_overrun; | ||
464 | local_t overrun; | ||
465 | local_t entries; | 463 | local_t entries; |
464 | local_t overrun; | ||
465 | local_t commit_overrun; | ||
466 | local_t dropped_events; | ||
466 | local_t committing; | 467 | local_t committing; |
467 | local_t commits; | 468 | local_t commits; |
468 | unsigned long read; | 469 | unsigned long read; |
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
1396 | struct list_head *head_page_with_bit; | 1397 | struct list_head *head_page_with_bit; |
1397 | 1398 | ||
1398 | head_page = &rb_set_head_page(cpu_buffer)->list; | 1399 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1400 | if (!head_page) | ||
1401 | break; | ||
1399 | prev_page = head_page->prev; | 1402 | prev_page = head_page->prev; |
1400 | 1403 | ||
1401 | first_page = pages->next; | 1404 | first_page = pages->next; |
@@ -1820,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | |||
1820 | } | 1823 | } |
1821 | 1824 | ||
1822 | /** | 1825 | /** |
1823 | * ring_buffer_update_event - update event type and data | 1826 | * rb_update_event - update event type and data |
1824 | * @event: the even to update | 1827 | * @event: the even to update |
1825 | * @type: the type of event | 1828 | * @type: the type of event |
1826 | * @length: the size of the event field in the ring buffer | 1829 | * @length: the size of the event field in the ring buffer |
@@ -2155,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
2155 | * If we are not in overwrite mode, | 2158 | * If we are not in overwrite mode, |
2156 | * this is easy, just stop here. | 2159 | * this is easy, just stop here. |
2157 | */ | 2160 | */ |
2158 | if (!(buffer->flags & RB_FL_OVERWRITE)) | 2161 | if (!(buffer->flags & RB_FL_OVERWRITE)) { |
2162 | local_inc(&cpu_buffer->dropped_events); | ||
2159 | goto out_reset; | 2163 | goto out_reset; |
2164 | } | ||
2160 | 2165 | ||
2161 | ret = rb_handle_head_page(cpu_buffer, | 2166 | ret = rb_handle_head_page(cpu_buffer, |
2162 | tail_page, | 2167 | tail_page, |
@@ -2720,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | |||
2720 | * and not the length of the event which would hold the header. | 2725 | * and not the length of the event which would hold the header. |
2721 | */ | 2726 | */ |
2722 | int ring_buffer_write(struct ring_buffer *buffer, | 2727 | int ring_buffer_write(struct ring_buffer *buffer, |
2723 | unsigned long length, | 2728 | unsigned long length, |
2724 | void *data) | 2729 | void *data) |
2725 | { | 2730 | { |
2726 | struct ring_buffer_per_cpu *cpu_buffer; | 2731 | struct ring_buffer_per_cpu *cpu_buffer; |
2727 | struct ring_buffer_event *event; | 2732 | struct ring_buffer_event *event; |
@@ -2929,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | |||
2929 | * @buffer: The ring buffer | 2934 | * @buffer: The ring buffer |
2930 | * @cpu: The per CPU buffer to read from. | 2935 | * @cpu: The per CPU buffer to read from. |
2931 | */ | 2936 | */ |
2932 | unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | 2937 | u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) |
2933 | { | 2938 | { |
2934 | unsigned long flags; | 2939 | unsigned long flags; |
2935 | struct ring_buffer_per_cpu *cpu_buffer; | 2940 | struct ring_buffer_per_cpu *cpu_buffer; |
2936 | struct buffer_page *bpage; | 2941 | struct buffer_page *bpage; |
2937 | unsigned long ret; | 2942 | u64 ret = 0; |
2938 | 2943 | ||
2939 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2944 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2940 | return 0; | 2945 | return 0; |
@@ -2949,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | |||
2949 | bpage = cpu_buffer->reader_page; | 2954 | bpage = cpu_buffer->reader_page; |
2950 | else | 2955 | else |
2951 | bpage = rb_set_head_page(cpu_buffer); | 2956 | bpage = rb_set_head_page(cpu_buffer); |
2952 | ret = bpage->page->time_stamp; | 2957 | if (bpage) |
2958 | ret = bpage->page->time_stamp; | ||
2953 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 2959 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
2954 | 2960 | ||
2955 | return ret; | 2961 | return ret; |
@@ -2995,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | |||
2995 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 3001 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2996 | 3002 | ||
2997 | /** | 3003 | /** |
2998 | * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer | 3004 | * ring_buffer_overrun_cpu - get the number of overruns caused by the ring |
3005 | * buffer wrapping around (only if RB_FL_OVERWRITE is on). | ||
2999 | * @buffer: The ring buffer | 3006 | * @buffer: The ring buffer |
3000 | * @cpu: The per CPU buffer to get the number of overruns from | 3007 | * @cpu: The per CPU buffer to get the number of overruns from |
3001 | */ | 3008 | */ |
@@ -3015,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3015 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); | 3022 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); |
3016 | 3023 | ||
3017 | /** | 3024 | /** |
3018 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits | 3025 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by |
3026 | * commits failing due to the buffer wrapping around while there are uncommitted | ||
3027 | * events, such as during an interrupt storm. | ||
3019 | * @buffer: The ring buffer | 3028 | * @buffer: The ring buffer |
3020 | * @cpu: The per CPU buffer to get the number of overruns from | 3029 | * @cpu: The per CPU buffer to get the number of overruns from |
3021 | */ | 3030 | */ |
@@ -3036,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3036 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); | 3045 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); |
3037 | 3046 | ||
3038 | /** | 3047 | /** |
3048 | * ring_buffer_dropped_events_cpu - get the number of dropped events caused by | ||
3049 | * the ring buffer filling up (only if RB_FL_OVERWRITE is off). | ||
3050 | * @buffer: The ring buffer | ||
3051 | * @cpu: The per CPU buffer to get the number of overruns from | ||
3052 | */ | ||
3053 | unsigned long | ||
3054 | ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) | ||
3055 | { | ||
3056 | struct ring_buffer_per_cpu *cpu_buffer; | ||
3057 | unsigned long ret; | ||
3058 | |||
3059 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3060 | return 0; | ||
3061 | |||
3062 | cpu_buffer = buffer->buffers[cpu]; | ||
3063 | ret = local_read(&cpu_buffer->dropped_events); | ||
3064 | |||
3065 | return ret; | ||
3066 | } | ||
3067 | EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); | ||
3068 | |||
3069 | /** | ||
3039 | * ring_buffer_entries - get the number of entries in a buffer | 3070 | * ring_buffer_entries - get the number of entries in a buffer |
3040 | * @buffer: The ring buffer | 3071 | * @buffer: The ring buffer |
3041 | * | 3072 | * |
@@ -3260,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
3260 | * Splice the empty reader page into the list around the head. | 3291 | * Splice the empty reader page into the list around the head. |
3261 | */ | 3292 | */ |
3262 | reader = rb_set_head_page(cpu_buffer); | 3293 | reader = rb_set_head_page(cpu_buffer); |
3294 | if (!reader) | ||
3295 | goto out; | ||
3263 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); | 3296 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); |
3264 | cpu_buffer->reader_page->list.prev = reader->list.prev; | 3297 | cpu_buffer->reader_page->list.prev = reader->list.prev; |
3265 | 3298 | ||
@@ -3778,12 +3811,17 @@ void | |||
3778 | ring_buffer_read_finish(struct ring_buffer_iter *iter) | 3811 | ring_buffer_read_finish(struct ring_buffer_iter *iter) |
3779 | { | 3812 | { |
3780 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3813 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3814 | unsigned long flags; | ||
3781 | 3815 | ||
3782 | /* | 3816 | /* |
3783 | * Ring buffer is disabled from recording, here's a good place | 3817 | * Ring buffer is disabled from recording, here's a good place |
3784 | * to check the integrity of the ring buffer. | 3818 | * to check the integrity of the ring buffer. |
3819 | * Must prevent readers from trying to read, as the check | ||
3820 | * clears the HEAD page and readers require it. | ||
3785 | */ | 3821 | */ |
3822 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
3786 | rb_check_pages(cpu_buffer); | 3823 | rb_check_pages(cpu_buffer); |
3824 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
3787 | 3825 | ||
3788 | atomic_dec(&cpu_buffer->record_disabled); | 3826 | atomic_dec(&cpu_buffer->record_disabled); |
3789 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | 3827 | atomic_dec(&cpu_buffer->buffer->resize_disabled); |
@@ -3864,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3864 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3902 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
3865 | cpu_buffer->reader_page->read = 0; | 3903 | cpu_buffer->reader_page->read = 0; |
3866 | 3904 | ||
3867 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3868 | local_set(&cpu_buffer->entries_bytes, 0); | 3905 | local_set(&cpu_buffer->entries_bytes, 0); |
3869 | local_set(&cpu_buffer->overrun, 0); | 3906 | local_set(&cpu_buffer->overrun, 0); |
3907 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3908 | local_set(&cpu_buffer->dropped_events, 0); | ||
3870 | local_set(&cpu_buffer->entries, 0); | 3909 | local_set(&cpu_buffer->entries, 0); |
3871 | local_set(&cpu_buffer->committing, 0); | 3910 | local_set(&cpu_buffer->committing, 0); |
3872 | local_set(&cpu_buffer->commits, 0); | 3911 | local_set(&cpu_buffer->commits, 0); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 31e4f55773f1..e5125677efa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Based on code from the latency_tracer, that is: | 10 | * Based on code from the latency_tracer, that is: |
11 | * Copyright (C) 2004-2006 Ingo Molnar | 11 | * Copyright (C) 2004-2006 Ingo Molnar |
12 | * Copyright (C) 2004 William Lee Irwin III | 12 | * Copyright (C) 2004 Nadia Yvette Chambers |
13 | */ | 13 | */ |
14 | #include <linux/ring_buffer.h> | 14 | #include <linux/ring_buffer.h> |
15 | #include <generated/utsrelease.h> | 15 | #include <generated/utsrelease.h> |
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/irq_work.h> | ||
22 | #include <linux/debugfs.h> | 23 | #include <linux/debugfs.h> |
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) | |||
78 | } | 79 | } |
79 | 80 | ||
80 | /* | 81 | /* |
82 | * To prevent the comm cache from being overwritten when no | ||
83 | * tracing is active, only save the comm when a trace event | ||
84 | * occurred. | ||
85 | */ | ||
86 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | ||
87 | |||
88 | /* | ||
89 | * When a reader is waiting for data, then this variable is | ||
90 | * set to true. | ||
91 | */ | ||
92 | static bool trace_wakeup_needed; | ||
93 | |||
94 | static struct irq_work trace_work_wakeup; | ||
95 | |||
96 | /* | ||
81 | * Kill all tracing for good (never come back). | 97 | * Kill all tracing for good (never come back). |
82 | * It is initialized to 1 but will turn to zero if the initialization | 98 | * It is initialized to 1 but will turn to zero if the initialization |
83 | * of the tracer is successful. But that is the only place that sets | 99 | * of the tracer is successful. But that is the only place that sets |
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str) | |||
139 | } | 155 | } |
140 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | 156 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); |
141 | 157 | ||
158 | |||
159 | static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; | ||
160 | static char *trace_boot_options __initdata; | ||
161 | |||
162 | static int __init set_trace_boot_options(char *str) | ||
163 | { | ||
164 | strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); | ||
165 | trace_boot_options = trace_boot_options_buf; | ||
166 | return 0; | ||
167 | } | ||
168 | __setup("trace_options=", set_trace_boot_options); | ||
169 | |||
142 | unsigned long long ns2usecs(cycle_t nsec) | 170 | unsigned long long ns2usecs(cycle_t nsec) |
143 | { | 171 | { |
144 | nsec += 500; | 172 | nsec += 500; |
@@ -198,20 +226,9 @@ static struct trace_array max_tr; | |||
198 | 226 | ||
199 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); | 227 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); |
200 | 228 | ||
201 | /* tracer_enabled is used to toggle activation of a tracer */ | ||
202 | static int tracer_enabled = 1; | ||
203 | |||
204 | /** | ||
205 | * tracing_is_enabled - return tracer_enabled status | ||
206 | * | ||
207 | * This function is used by other tracers to know the status | ||
208 | * of the tracer_enabled flag. Tracers may use this function | ||
209 | * to know if it should enable their features when starting | ||
210 | * up. See irqsoff tracer for an example (start_irqsoff_tracer). | ||
211 | */ | ||
212 | int tracing_is_enabled(void) | 229 | int tracing_is_enabled(void) |
213 | { | 230 | { |
214 | return tracer_enabled; | 231 | return tracing_is_on(); |
215 | } | 232 | } |
216 | 233 | ||
217 | /* | 234 | /* |
@@ -333,12 +350,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
333 | static int trace_stop_count; | 350 | static int trace_stop_count; |
334 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 351 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
335 | 352 | ||
336 | static void wakeup_work_handler(struct work_struct *work) | 353 | /** |
354 | * trace_wake_up - wake up tasks waiting for trace input | ||
355 | * | ||
356 | * Schedules a delayed work to wake up any task that is blocked on the | ||
357 | * trace_wait queue. These is used with trace_poll for tasks polling the | ||
358 | * trace. | ||
359 | */ | ||
360 | static void trace_wake_up(struct irq_work *work) | ||
337 | { | 361 | { |
338 | wake_up(&trace_wait); | 362 | wake_up_all(&trace_wait); |
339 | } | ||
340 | 363 | ||
341 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | 364 | } |
342 | 365 | ||
343 | /** | 366 | /** |
344 | * tracing_on - enable tracing buffers | 367 | * tracing_on - enable tracing buffers |
@@ -393,22 +416,6 @@ int tracing_is_on(void) | |||
393 | } | 416 | } |
394 | EXPORT_SYMBOL_GPL(tracing_is_on); | 417 | EXPORT_SYMBOL_GPL(tracing_is_on); |
395 | 418 | ||
396 | /** | ||
397 | * trace_wake_up - wake up tasks waiting for trace input | ||
398 | * | ||
399 | * Schedules a delayed work to wake up any task that is blocked on the | ||
400 | * trace_wait queue. These is used with trace_poll for tasks polling the | ||
401 | * trace. | ||
402 | */ | ||
403 | void trace_wake_up(void) | ||
404 | { | ||
405 | const unsigned long delay = msecs_to_jiffies(2); | ||
406 | |||
407 | if (trace_flags & TRACE_ITER_BLOCK) | ||
408 | return; | ||
409 | schedule_delayed_work(&wakeup_work, delay); | ||
410 | } | ||
411 | |||
412 | static int __init set_buf_size(char *str) | 419 | static int __init set_buf_size(char *str) |
413 | { | 420 | { |
414 | unsigned long buf_size; | 421 | unsigned long buf_size; |
@@ -431,7 +438,7 @@ static int __init set_tracing_thresh(char *str) | |||
431 | 438 | ||
432 | if (!str) | 439 | if (!str) |
433 | return 0; | 440 | return 0; |
434 | ret = strict_strtoul(str, 0, &threshold); | 441 | ret = kstrtoul(str, 0, &threshold); |
435 | if (ret < 0) | 442 | if (ret < 0) |
436 | return 0; | 443 | return 0; |
437 | tracing_thresh = threshold * 1000; | 444 | tracing_thresh = threshold * 1000; |
@@ -477,10 +484,12 @@ static const char *trace_options[] = { | |||
477 | static struct { | 484 | static struct { |
478 | u64 (*func)(void); | 485 | u64 (*func)(void); |
479 | const char *name; | 486 | const char *name; |
487 | int in_ns; /* is this clock in nanoseconds? */ | ||
480 | } trace_clocks[] = { | 488 | } trace_clocks[] = { |
481 | { trace_clock_local, "local" }, | 489 | { trace_clock_local, "local", 1 }, |
482 | { trace_clock_global, "global" }, | 490 | { trace_clock_global, "global", 1 }, |
483 | { trace_clock_counter, "counter" }, | 491 | { trace_clock_counter, "counter", 0 }, |
492 | ARCH_TRACE_CLOCKS | ||
484 | }; | 493 | }; |
485 | 494 | ||
486 | int trace_clock_id; | 495 | int trace_clock_id; |
@@ -757,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
757 | } | 766 | } |
758 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 767 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
759 | 768 | ||
769 | static void default_wait_pipe(struct trace_iterator *iter) | ||
770 | { | ||
771 | DEFINE_WAIT(wait); | ||
772 | |||
773 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
774 | |||
775 | /* | ||
776 | * The events can happen in critical sections where | ||
777 | * checking a work queue can cause deadlocks. | ||
778 | * After adding a task to the queue, this flag is set | ||
779 | * only to notify events to try to wake up the queue | ||
780 | * using irq_work. | ||
781 | * | ||
782 | * We don't clear it even if the buffer is no longer | ||
783 | * empty. The flag only causes the next event to run | ||
784 | * irq_work to do the work queue wake up. The worse | ||
785 | * that can happen if we race with !trace_empty() is that | ||
786 | * an event will cause an irq_work to try to wake up | ||
787 | * an empty queue. | ||
788 | * | ||
789 | * There's no reason to protect this flag either, as | ||
790 | * the work queue and irq_work logic will do the necessary | ||
791 | * synchronization for the wake ups. The only thing | ||
792 | * that is necessary is that the wake up happens after | ||
793 | * a task has been queued. It's OK for spurious wake ups. | ||
794 | */ | ||
795 | trace_wakeup_needed = true; | ||
796 | |||
797 | if (trace_empty(iter)) | ||
798 | schedule(); | ||
799 | |||
800 | finish_wait(&trace_wait, &wait); | ||
801 | } | ||
802 | |||
760 | /** | 803 | /** |
761 | * register_tracer - register a tracer with the ftrace system. | 804 | * register_tracer - register a tracer with the ftrace system. |
762 | * @type - the plugin for the tracer | 805 | * @type - the plugin for the tracer |
@@ -875,32 +918,6 @@ int register_tracer(struct tracer *type) | |||
875 | return ret; | 918 | return ret; |
876 | } | 919 | } |
877 | 920 | ||
878 | void unregister_tracer(struct tracer *type) | ||
879 | { | ||
880 | struct tracer **t; | ||
881 | |||
882 | mutex_lock(&trace_types_lock); | ||
883 | for (t = &trace_types; *t; t = &(*t)->next) { | ||
884 | if (*t == type) | ||
885 | goto found; | ||
886 | } | ||
887 | pr_info("Tracer %s not registered\n", type->name); | ||
888 | goto out; | ||
889 | |||
890 | found: | ||
891 | *t = (*t)->next; | ||
892 | |||
893 | if (type == current_trace && tracer_enabled) { | ||
894 | tracer_enabled = 0; | ||
895 | tracing_stop(); | ||
896 | if (current_trace->stop) | ||
897 | current_trace->stop(&global_trace); | ||
898 | current_trace = &nop_trace; | ||
899 | } | ||
900 | out: | ||
901 | mutex_unlock(&trace_types_lock); | ||
902 | } | ||
903 | |||
904 | void tracing_reset(struct trace_array *tr, int cpu) | 921 | void tracing_reset(struct trace_array *tr, int cpu) |
905 | { | 922 | { |
906 | struct ring_buffer *buffer = tr->buffer; | 923 | struct ring_buffer *buffer = tr->buffer; |
@@ -1131,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[]) | |||
1131 | 1148 | ||
1132 | void tracing_record_cmdline(struct task_struct *tsk) | 1149 | void tracing_record_cmdline(struct task_struct *tsk) |
1133 | { | 1150 | { |
1134 | if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || | 1151 | if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) |
1135 | !tracing_is_on()) | ||
1136 | return; | 1152 | return; |
1137 | 1153 | ||
1154 | if (!__this_cpu_read(trace_cmdline_save)) | ||
1155 | return; | ||
1156 | |||
1157 | __this_cpu_write(trace_cmdline_save, false); | ||
1158 | |||
1138 | trace_save_cmdline(tsk); | 1159 | trace_save_cmdline(tsk); |
1139 | } | 1160 | } |
1140 | 1161 | ||
@@ -1178,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
1178 | return event; | 1199 | return event; |
1179 | } | 1200 | } |
1180 | 1201 | ||
1202 | void | ||
1203 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | ||
1204 | { | ||
1205 | __this_cpu_write(trace_cmdline_save, true); | ||
1206 | if (trace_wakeup_needed) { | ||
1207 | trace_wakeup_needed = false; | ||
1208 | /* irq_work_queue() supplies it's own memory barriers */ | ||
1209 | irq_work_queue(&trace_work_wakeup); | ||
1210 | } | ||
1211 | ring_buffer_unlock_commit(buffer, event); | ||
1212 | } | ||
1213 | |||
1181 | static inline void | 1214 | static inline void |
1182 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1215 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1183 | struct ring_buffer_event *event, | 1216 | struct ring_buffer_event *event, |
1184 | unsigned long flags, int pc, | 1217 | unsigned long flags, int pc) |
1185 | int wake) | ||
1186 | { | 1218 | { |
1187 | ring_buffer_unlock_commit(buffer, event); | 1219 | __buffer_unlock_commit(buffer, event); |
1188 | 1220 | ||
1189 | ftrace_trace_stack(buffer, flags, 6, pc); | 1221 | ftrace_trace_stack(buffer, flags, 6, pc); |
1190 | ftrace_trace_userstack(buffer, flags, pc); | 1222 | ftrace_trace_userstack(buffer, flags, pc); |
1191 | |||
1192 | if (wake) | ||
1193 | trace_wake_up(); | ||
1194 | } | 1223 | } |
1195 | 1224 | ||
1196 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1225 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1197 | struct ring_buffer_event *event, | 1226 | struct ring_buffer_event *event, |
1198 | unsigned long flags, int pc) | 1227 | unsigned long flags, int pc) |
1199 | { | 1228 | { |
1200 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1229 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1201 | } | 1230 | } |
1231 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); | ||
1202 | 1232 | ||
1203 | struct ring_buffer_event * | 1233 | struct ring_buffer_event * |
1204 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, | 1234 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, |
@@ -1215,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, | |||
1215 | struct ring_buffer_event *event, | 1245 | struct ring_buffer_event *event, |
1216 | unsigned long flags, int pc) | 1246 | unsigned long flags, int pc) |
1217 | { | 1247 | { |
1218 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1248 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1219 | } | 1249 | } |
1220 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); | 1250 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); |
1221 | 1251 | ||
1222 | void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, | 1252 | void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, |
1223 | struct ring_buffer_event *event, | 1253 | struct ring_buffer_event *event, |
1224 | unsigned long flags, int pc) | 1254 | unsigned long flags, int pc, |
1255 | struct pt_regs *regs) | ||
1225 | { | 1256 | { |
1226 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); | 1257 | __buffer_unlock_commit(buffer, event); |
1227 | } | ||
1228 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); | ||
1229 | |||
1230 | void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, | ||
1231 | struct ring_buffer_event *event, | ||
1232 | unsigned long flags, int pc, | ||
1233 | struct pt_regs *regs) | ||
1234 | { | ||
1235 | ring_buffer_unlock_commit(buffer, event); | ||
1236 | 1258 | ||
1237 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); | 1259 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); |
1238 | ftrace_trace_userstack(buffer, flags, pc); | 1260 | ftrace_trace_userstack(buffer, flags, pc); |
1239 | } | 1261 | } |
1240 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); | 1262 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); |
1241 | 1263 | ||
1242 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, | 1264 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, |
1243 | struct ring_buffer_event *event) | 1265 | struct ring_buffer_event *event) |
@@ -1269,7 +1291,7 @@ trace_function(struct trace_array *tr, | |||
1269 | entry->parent_ip = parent_ip; | 1291 | entry->parent_ip = parent_ip; |
1270 | 1292 | ||
1271 | if (!filter_check_discard(call, entry, buffer, event)) | 1293 | if (!filter_check_discard(call, entry, buffer, event)) |
1272 | ring_buffer_unlock_commit(buffer, event); | 1294 | __buffer_unlock_commit(buffer, event); |
1273 | } | 1295 | } |
1274 | 1296 | ||
1275 | void | 1297 | void |
@@ -1362,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1362 | entry->size = trace.nr_entries; | 1384 | entry->size = trace.nr_entries; |
1363 | 1385 | ||
1364 | if (!filter_check_discard(call, entry, buffer, event)) | 1386 | if (!filter_check_discard(call, entry, buffer, event)) |
1365 | ring_buffer_unlock_commit(buffer, event); | 1387 | __buffer_unlock_commit(buffer, event); |
1366 | 1388 | ||
1367 | out: | 1389 | out: |
1368 | /* Again, don't let gcc optimize things here */ | 1390 | /* Again, don't let gcc optimize things here */ |
@@ -1458,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1458 | 1480 | ||
1459 | save_stack_trace_user(&trace); | 1481 | save_stack_trace_user(&trace); |
1460 | if (!filter_check_discard(call, entry, buffer, event)) | 1482 | if (!filter_check_discard(call, entry, buffer, event)) |
1461 | ring_buffer_unlock_commit(buffer, event); | 1483 | __buffer_unlock_commit(buffer, event); |
1462 | 1484 | ||
1463 | out_drop_count: | 1485 | out_drop_count: |
1464 | __this_cpu_dec(user_stack_count); | 1486 | __this_cpu_dec(user_stack_count); |
@@ -1559,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void) | |||
1559 | return -ENOMEM; | 1581 | return -ENOMEM; |
1560 | } | 1582 | } |
1561 | 1583 | ||
1584 | static int buffers_allocated; | ||
1585 | |||
1562 | void trace_printk_init_buffers(void) | 1586 | void trace_printk_init_buffers(void) |
1563 | { | 1587 | { |
1564 | static int buffers_allocated; | ||
1565 | |||
1566 | if (buffers_allocated) | 1588 | if (buffers_allocated) |
1567 | return; | 1589 | return; |
1568 | 1590 | ||
@@ -1571,7 +1593,38 @@ void trace_printk_init_buffers(void) | |||
1571 | 1593 | ||
1572 | pr_info("ftrace: Allocated trace_printk buffers\n"); | 1594 | pr_info("ftrace: Allocated trace_printk buffers\n"); |
1573 | 1595 | ||
1596 | /* Expand the buffers to set size */ | ||
1597 | tracing_update_buffers(); | ||
1598 | |||
1574 | buffers_allocated = 1; | 1599 | buffers_allocated = 1; |
1600 | |||
1601 | /* | ||
1602 | * trace_printk_init_buffers() can be called by modules. | ||
1603 | * If that happens, then we need to start cmdline recording | ||
1604 | * directly here. If the global_trace.buffer is already | ||
1605 | * allocated here, then this was called by module code. | ||
1606 | */ | ||
1607 | if (global_trace.buffer) | ||
1608 | tracing_start_cmdline_record(); | ||
1609 | } | ||
1610 | |||
1611 | void trace_printk_start_comm(void) | ||
1612 | { | ||
1613 | /* Start tracing comms if trace printk is set */ | ||
1614 | if (!buffers_allocated) | ||
1615 | return; | ||
1616 | tracing_start_cmdline_record(); | ||
1617 | } | ||
1618 | |||
1619 | static void trace_printk_start_stop_comm(int enabled) | ||
1620 | { | ||
1621 | if (!buffers_allocated) | ||
1622 | return; | ||
1623 | |||
1624 | if (enabled) | ||
1625 | tracing_start_cmdline_record(); | ||
1626 | else | ||
1627 | tracing_stop_cmdline_record(); | ||
1575 | } | 1628 | } |
1576 | 1629 | ||
1577 | /** | 1630 | /** |
@@ -1622,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1622 | 1675 | ||
1623 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); | 1676 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1624 | if (!filter_check_discard(call, entry, buffer, event)) { | 1677 | if (!filter_check_discard(call, entry, buffer, event)) { |
1625 | ring_buffer_unlock_commit(buffer, event); | 1678 | __buffer_unlock_commit(buffer, event); |
1626 | ftrace_trace_stack(buffer, flags, 6, pc); | 1679 | ftrace_trace_stack(buffer, flags, 6, pc); |
1627 | } | 1680 | } |
1628 | 1681 | ||
@@ -1693,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1693 | memcpy(&entry->buf, tbuffer, len); | 1746 | memcpy(&entry->buf, tbuffer, len); |
1694 | entry->buf[len] = '\0'; | 1747 | entry->buf[len] = '\0'; |
1695 | if (!filter_check_discard(call, entry, buffer, event)) { | 1748 | if (!filter_check_discard(call, entry, buffer, event)) { |
1696 | ring_buffer_unlock_commit(buffer, event); | 1749 | __buffer_unlock_commit(buffer, event); |
1697 | ftrace_trace_stack(buffer, flags, 6, pc); | 1750 | ftrace_trace_stack(buffer, flags, 6, pc); |
1698 | } | 1751 | } |
1699 | out: | 1752 | out: |
@@ -2426,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2426 | if (ring_buffer_overruns(iter->tr->buffer)) | 2479 | if (ring_buffer_overruns(iter->tr->buffer)) |
2427 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | 2480 | iter->iter_flags |= TRACE_FILE_ANNOTATE; |
2428 | 2481 | ||
2482 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
2483 | if (trace_clocks[trace_clock_id].in_ns) | ||
2484 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
2485 | |||
2429 | /* stop the trace while dumping */ | 2486 | /* stop the trace while dumping */ |
2430 | tracing_stop(); | 2487 | tracing_stop(); |
2431 | 2488 | ||
@@ -2794,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2794 | 2851 | ||
2795 | if (mask == TRACE_ITER_OVERWRITE) | 2852 | if (mask == TRACE_ITER_OVERWRITE) |
2796 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | 2853 | ring_buffer_change_overwrite(global_trace.buffer, enabled); |
2854 | |||
2855 | if (mask == TRACE_ITER_PRINTK) | ||
2856 | trace_printk_start_stop_comm(enabled); | ||
2797 | } | 2857 | } |
2798 | 2858 | ||
2799 | static ssize_t | 2859 | static int trace_set_options(char *option) |
2800 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2801 | size_t cnt, loff_t *ppos) | ||
2802 | { | 2860 | { |
2803 | char buf[64]; | ||
2804 | char *cmp; | 2861 | char *cmp; |
2805 | int neg = 0; | 2862 | int neg = 0; |
2806 | int ret; | 2863 | int ret = 0; |
2807 | int i; | 2864 | int i; |
2808 | 2865 | ||
2809 | if (cnt >= sizeof(buf)) | 2866 | cmp = strstrip(option); |
2810 | return -EINVAL; | ||
2811 | |||
2812 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2813 | return -EFAULT; | ||
2814 | |||
2815 | buf[cnt] = 0; | ||
2816 | cmp = strstrip(buf); | ||
2817 | 2867 | ||
2818 | if (strncmp(cmp, "no", 2) == 0) { | 2868 | if (strncmp(cmp, "no", 2) == 0) { |
2819 | neg = 1; | 2869 | neg = 1; |
@@ -2832,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
2832 | mutex_lock(&trace_types_lock); | 2882 | mutex_lock(&trace_types_lock); |
2833 | ret = set_tracer_option(current_trace, cmp, neg); | 2883 | ret = set_tracer_option(current_trace, cmp, neg); |
2834 | mutex_unlock(&trace_types_lock); | 2884 | mutex_unlock(&trace_types_lock); |
2835 | if (ret) | ||
2836 | return ret; | ||
2837 | } | 2885 | } |
2838 | 2886 | ||
2887 | return ret; | ||
2888 | } | ||
2889 | |||
2890 | static ssize_t | ||
2891 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2892 | size_t cnt, loff_t *ppos) | ||
2893 | { | ||
2894 | char buf[64]; | ||
2895 | |||
2896 | if (cnt >= sizeof(buf)) | ||
2897 | return -EINVAL; | ||
2898 | |||
2899 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2900 | return -EFAULT; | ||
2901 | |||
2902 | trace_set_options(buf); | ||
2903 | |||
2839 | *ppos += cnt; | 2904 | *ppos += cnt; |
2840 | 2905 | ||
2841 | return cnt; | 2906 | return cnt; |
@@ -2940,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = { | |||
2940 | }; | 3005 | }; |
2941 | 3006 | ||
2942 | static ssize_t | 3007 | static ssize_t |
2943 | tracing_ctrl_read(struct file *filp, char __user *ubuf, | ||
2944 | size_t cnt, loff_t *ppos) | ||
2945 | { | ||
2946 | char buf[64]; | ||
2947 | int r; | ||
2948 | |||
2949 | r = sprintf(buf, "%u\n", tracer_enabled); | ||
2950 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
2951 | } | ||
2952 | |||
2953 | static ssize_t | ||
2954 | tracing_ctrl_write(struct file *filp, const char __user *ubuf, | ||
2955 | size_t cnt, loff_t *ppos) | ||
2956 | { | ||
2957 | struct trace_array *tr = filp->private_data; | ||
2958 | unsigned long val; | ||
2959 | int ret; | ||
2960 | |||
2961 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
2962 | if (ret) | ||
2963 | return ret; | ||
2964 | |||
2965 | val = !!val; | ||
2966 | |||
2967 | mutex_lock(&trace_types_lock); | ||
2968 | if (tracer_enabled ^ val) { | ||
2969 | |||
2970 | /* Only need to warn if this is used to change the state */ | ||
2971 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2972 | |||
2973 | if (val) { | ||
2974 | tracer_enabled = 1; | ||
2975 | if (current_trace->start) | ||
2976 | current_trace->start(tr); | ||
2977 | tracing_start(); | ||
2978 | } else { | ||
2979 | tracer_enabled = 0; | ||
2980 | tracing_stop(); | ||
2981 | if (current_trace->stop) | ||
2982 | current_trace->stop(tr); | ||
2983 | } | ||
2984 | } | ||
2985 | mutex_unlock(&trace_types_lock); | ||
2986 | |||
2987 | *ppos += cnt; | ||
2988 | |||
2989 | return cnt; | ||
2990 | } | ||
2991 | |||
2992 | static ssize_t | ||
2993 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 3008 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
2994 | size_t cnt, loff_t *ppos) | 3009 | size_t cnt, loff_t *ppos) |
2995 | { | 3010 | { |
@@ -3019,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) | |||
3019 | tr->data[cpu]->entries = val; | 3034 | tr->data[cpu]->entries = val; |
3020 | } | 3035 | } |
3021 | 3036 | ||
3037 | /* resize @tr's buffer to the size of @size_tr's entries */ | ||
3038 | static int resize_buffer_duplicate_size(struct trace_array *tr, | ||
3039 | struct trace_array *size_tr, int cpu_id) | ||
3040 | { | ||
3041 | int cpu, ret = 0; | ||
3042 | |||
3043 | if (cpu_id == RING_BUFFER_ALL_CPUS) { | ||
3044 | for_each_tracing_cpu(cpu) { | ||
3045 | ret = ring_buffer_resize(tr->buffer, | ||
3046 | size_tr->data[cpu]->entries, cpu); | ||
3047 | if (ret < 0) | ||
3048 | break; | ||
3049 | tr->data[cpu]->entries = size_tr->data[cpu]->entries; | ||
3050 | } | ||
3051 | } else { | ||
3052 | ret = ring_buffer_resize(tr->buffer, | ||
3053 | size_tr->data[cpu_id]->entries, cpu_id); | ||
3054 | if (ret == 0) | ||
3055 | tr->data[cpu_id]->entries = | ||
3056 | size_tr->data[cpu_id]->entries; | ||
3057 | } | ||
3058 | |||
3059 | return ret; | ||
3060 | } | ||
3061 | |||
3022 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | 3062 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) |
3023 | { | 3063 | { |
3024 | int ret; | 3064 | int ret; |
@@ -3030,6 +3070,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3030 | */ | 3070 | */ |
3031 | ring_buffer_expanded = 1; | 3071 | ring_buffer_expanded = 1; |
3032 | 3072 | ||
3073 | /* May be called before buffers are initialized */ | ||
3074 | if (!global_trace.buffer) | ||
3075 | return 0; | ||
3076 | |||
3033 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); | 3077 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
3034 | if (ret < 0) | 3078 | if (ret < 0) |
3035 | return ret; | 3079 | return ret; |
@@ -3039,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3039 | 3083 | ||
3040 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); | 3084 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
3041 | if (ret < 0) { | 3085 | if (ret < 0) { |
3042 | int r = 0; | 3086 | int r = resize_buffer_duplicate_size(&global_trace, |
3043 | 3087 | &global_trace, cpu); | |
3044 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3045 | int i; | ||
3046 | for_each_tracing_cpu(i) { | ||
3047 | r = ring_buffer_resize(global_trace.buffer, | ||
3048 | global_trace.data[i]->entries, | ||
3049 | i); | ||
3050 | if (r < 0) | ||
3051 | break; | ||
3052 | } | ||
3053 | } else { | ||
3054 | r = ring_buffer_resize(global_trace.buffer, | ||
3055 | global_trace.data[cpu]->entries, | ||
3056 | cpu); | ||
3057 | } | ||
3058 | |||
3059 | if (r < 0) { | 3088 | if (r < 0) { |
3060 | /* | 3089 | /* |
3061 | * AARGH! We are left with different | 3090 | * AARGH! We are left with different |
@@ -3193,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) | |||
3193 | 3222 | ||
3194 | topts = create_trace_option_files(t); | 3223 | topts = create_trace_option_files(t); |
3195 | if (t->use_max_tr) { | 3224 | if (t->use_max_tr) { |
3196 | int cpu; | ||
3197 | /* we need to make per cpu buffer sizes equivalent */ | 3225 | /* we need to make per cpu buffer sizes equivalent */ |
3198 | for_each_tracing_cpu(cpu) { | 3226 | ret = resize_buffer_duplicate_size(&max_tr, &global_trace, |
3199 | ret = ring_buffer_resize(max_tr.buffer, | 3227 | RING_BUFFER_ALL_CPUS); |
3200 | global_trace.data[cpu]->entries, | 3228 | if (ret < 0) |
3201 | cpu); | 3229 | goto out; |
3202 | if (ret < 0) | ||
3203 | goto out; | ||
3204 | max_tr.data[cpu]->entries = | ||
3205 | global_trace.data[cpu]->entries; | ||
3206 | } | ||
3207 | } | 3230 | } |
3208 | 3231 | ||
3209 | if (t->init) { | 3232 | if (t->init) { |
@@ -3325,6 +3348,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3325 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | 3348 | if (trace_flags & TRACE_ITER_LATENCY_FMT) |
3326 | iter->iter_flags |= TRACE_FILE_LAT_FMT; | 3349 | iter->iter_flags |= TRACE_FILE_LAT_FMT; |
3327 | 3350 | ||
3351 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
3352 | if (trace_clocks[trace_clock_id].in_ns) | ||
3353 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
3354 | |||
3328 | iter->cpu_file = cpu_file; | 3355 | iter->cpu_file = cpu_file; |
3329 | iter->tr = &global_trace; | 3356 | iter->tr = &global_trace; |
3330 | mutex_init(&iter->mutex); | 3357 | mutex_init(&iter->mutex); |
@@ -3385,19 +3412,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
3385 | } | 3412 | } |
3386 | } | 3413 | } |
3387 | 3414 | ||
3388 | |||
3389 | void default_wait_pipe(struct trace_iterator *iter) | ||
3390 | { | ||
3391 | DEFINE_WAIT(wait); | ||
3392 | |||
3393 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
3394 | |||
3395 | if (trace_empty(iter)) | ||
3396 | schedule(); | ||
3397 | |||
3398 | finish_wait(&trace_wait, &wait); | ||
3399 | } | ||
3400 | |||
3401 | /* | 3415 | /* |
3402 | * This is a make-shift waitqueue. | 3416 | * This is a make-shift waitqueue. |
3403 | * A tracer might use this callback on some rare cases: | 3417 | * A tracer might use this callback on some rare cases: |
@@ -3438,7 +3452,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3438 | return -EINTR; | 3452 | return -EINTR; |
3439 | 3453 | ||
3440 | /* | 3454 | /* |
3441 | * We block until we read something and tracing is disabled. | 3455 | * We block until we read something and tracing is enabled. |
3442 | * We still block if tracing is disabled, but we have never | 3456 | * We still block if tracing is disabled, but we have never |
3443 | * read anything. This allows a user to cat this file, and | 3457 | * read anything. This allows a user to cat this file, and |
3444 | * then enable tracing. But after we have read something, | 3458 | * then enable tracing. But after we have read something, |
@@ -3446,7 +3460,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3446 | * | 3460 | * |
3447 | * iter->pos will be 0 if we haven't read anything. | 3461 | * iter->pos will be 0 if we haven't read anything. |
3448 | */ | 3462 | */ |
3449 | if (!tracer_enabled && iter->pos) | 3463 | if (tracing_is_enabled() && iter->pos) |
3450 | break; | 3464 | break; |
3451 | } | 3465 | } |
3452 | 3466 | ||
@@ -3955,7 +3969,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3955 | } else | 3969 | } else |
3956 | entry->buf[cnt] = '\0'; | 3970 | entry->buf[cnt] = '\0'; |
3957 | 3971 | ||
3958 | ring_buffer_unlock_commit(buffer, event); | 3972 | __buffer_unlock_commit(buffer, event); |
3959 | 3973 | ||
3960 | written = cnt; | 3974 | written = cnt; |
3961 | 3975 | ||
@@ -4016,6 +4030,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4016 | if (max_tr.buffer) | 4030 | if (max_tr.buffer) |
4017 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); | 4031 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); |
4018 | 4032 | ||
4033 | /* | ||
4034 | * New clock may not be consistent with the previous clock. | ||
4035 | * Reset the buffer so that it doesn't have incomparable timestamps. | ||
4036 | */ | ||
4037 | tracing_reset_online_cpus(&global_trace); | ||
4038 | if (max_tr.buffer) | ||
4039 | tracing_reset_online_cpus(&max_tr); | ||
4040 | |||
4019 | mutex_unlock(&trace_types_lock); | 4041 | mutex_unlock(&trace_types_lock); |
4020 | 4042 | ||
4021 | *fpos += cnt; | 4043 | *fpos += cnt; |
@@ -4037,13 +4059,6 @@ static const struct file_operations tracing_max_lat_fops = { | |||
4037 | .llseek = generic_file_llseek, | 4059 | .llseek = generic_file_llseek, |
4038 | }; | 4060 | }; |
4039 | 4061 | ||
4040 | static const struct file_operations tracing_ctrl_fops = { | ||
4041 | .open = tracing_open_generic, | ||
4042 | .read = tracing_ctrl_read, | ||
4043 | .write = tracing_ctrl_write, | ||
4044 | .llseek = generic_file_llseek, | ||
4045 | }; | ||
4046 | |||
4047 | static const struct file_operations set_tracer_fops = { | 4062 | static const struct file_operations set_tracer_fops = { |
4048 | .open = tracing_open_generic, | 4063 | .open = tracing_open_generic, |
4049 | .read = tracing_set_trace_read, | 4064 | .read = tracing_set_trace_read, |
@@ -4260,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4260 | return -ENOMEM; | 4275 | return -ENOMEM; |
4261 | 4276 | ||
4262 | if (*ppos & (PAGE_SIZE - 1)) { | 4277 | if (*ppos & (PAGE_SIZE - 1)) { |
4263 | WARN_ONCE(1, "Ftrace: previous read must page-align\n"); | ||
4264 | ret = -EINVAL; | 4278 | ret = -EINVAL; |
4265 | goto out; | 4279 | goto out; |
4266 | } | 4280 | } |
4267 | 4281 | ||
4268 | if (len & (PAGE_SIZE - 1)) { | 4282 | if (len & (PAGE_SIZE - 1)) { |
4269 | WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); | ||
4270 | if (len < PAGE_SIZE) { | 4283 | if (len < PAGE_SIZE) { |
4271 | ret = -EINVAL; | 4284 | ret = -EINVAL; |
4272 | goto out; | 4285 | goto out; |
@@ -4377,13 +4390,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4377 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | 4390 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); |
4378 | trace_seq_printf(s, "bytes: %ld\n", cnt); | 4391 | trace_seq_printf(s, "bytes: %ld\n", cnt); |
4379 | 4392 | ||
4380 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | 4393 | if (trace_clocks[trace_clock_id].in_ns) { |
4381 | usec_rem = do_div(t, USEC_PER_SEC); | 4394 | /* local or global for trace_clock */ |
4382 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); | 4395 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); |
4396 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4397 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", | ||
4398 | t, usec_rem); | ||
4383 | 4399 | ||
4384 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | 4400 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); |
4385 | usec_rem = do_div(t, USEC_PER_SEC); | 4401 | usec_rem = do_div(t, USEC_PER_SEC); |
4386 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | 4402 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); |
4403 | } else { | ||
4404 | /* counter or tsc mode for trace_clock */ | ||
4405 | trace_seq_printf(s, "oldest event ts: %llu\n", | ||
4406 | ring_buffer_oldest_event_ts(tr->buffer, cpu)); | ||
4407 | |||
4408 | trace_seq_printf(s, "now ts: %llu\n", | ||
4409 | ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4410 | } | ||
4411 | |||
4412 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); | ||
4413 | trace_seq_printf(s, "dropped events: %ld\n", cnt); | ||
4387 | 4414 | ||
4388 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4415 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
4389 | 4416 | ||
@@ -4815,9 +4842,6 @@ static __init int tracer_init_debugfs(void) | |||
4815 | 4842 | ||
4816 | d_tracer = tracing_init_dentry(); | 4843 | d_tracer = tracing_init_dentry(); |
4817 | 4844 | ||
4818 | trace_create_file("tracing_enabled", 0644, d_tracer, | ||
4819 | &global_trace, &tracing_ctrl_fops); | ||
4820 | |||
4821 | trace_create_file("trace_options", 0644, d_tracer, | 4845 | trace_create_file("trace_options", 0644, d_tracer, |
4822 | NULL, &tracing_iter_fops); | 4846 | NULL, &tracing_iter_fops); |
4823 | 4847 | ||
@@ -5089,6 +5113,7 @@ __init static int tracer_alloc_buffers(void) | |||
5089 | 5113 | ||
5090 | /* Only allocate trace_printk buffers if a trace_printk exists */ | 5114 | /* Only allocate trace_printk buffers if a trace_printk exists */ |
5091 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | 5115 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) |
5116 | /* Must be called before global_trace.buffer is allocated */ | ||
5092 | trace_printk_init_buffers(); | 5117 | trace_printk_init_buffers(); |
5093 | 5118 | ||
5094 | /* To save memory, keep the ring buffer size to its minimum */ | 5119 | /* To save memory, keep the ring buffer size to its minimum */ |
@@ -5136,6 +5161,7 @@ __init static int tracer_alloc_buffers(void) | |||
5136 | #endif | 5161 | #endif |
5137 | 5162 | ||
5138 | trace_init_cmdlines(); | 5163 | trace_init_cmdlines(); |
5164 | init_irq_work(&trace_work_wakeup, trace_wake_up); | ||
5139 | 5165 | ||
5140 | register_tracer(&nop_trace); | 5166 | register_tracer(&nop_trace); |
5141 | current_trace = &nop_trace; | 5167 | current_trace = &nop_trace; |
@@ -5147,6 +5173,13 @@ __init static int tracer_alloc_buffers(void) | |||
5147 | 5173 | ||
5148 | register_die_notifier(&trace_die_notifier); | 5174 | register_die_notifier(&trace_die_notifier); |
5149 | 5175 | ||
5176 | while (trace_boot_options) { | ||
5177 | char *option; | ||
5178 | |||
5179 | option = strsep(&trace_boot_options, ","); | ||
5180 | trace_set_options(option); | ||
5181 | } | ||
5182 | |||
5150 | return 0; | 5183 | return 0; |
5151 | 5184 | ||
5152 | out_free_cpumask: | 5185 | out_free_cpumask: |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c15f528c1af4..c75d7988902c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -285,8 +285,8 @@ struct tracer { | |||
285 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 285 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
286 | struct tracer *next; | 286 | struct tracer *next; |
287 | struct tracer_flags *flags; | 287 | struct tracer_flags *flags; |
288 | int print_max; | 288 | bool print_max; |
289 | int use_max_tr; | 289 | bool use_max_tr; |
290 | }; | 290 | }; |
291 | 291 | ||
292 | 292 | ||
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) | |||
327 | 327 | ||
328 | int tracer_init(struct tracer *t, struct trace_array *tr); | 328 | int tracer_init(struct tracer *t, struct trace_array *tr); |
329 | int tracing_is_enabled(void); | 329 | int tracing_is_enabled(void); |
330 | void trace_wake_up(void); | ||
331 | void tracing_reset(struct trace_array *tr, int cpu); | 330 | void tracing_reset(struct trace_array *tr, int cpu); |
332 | void tracing_reset_online_cpus(struct trace_array *tr); | 331 | void tracing_reset_online_cpus(struct trace_array *tr); |
333 | void tracing_reset_current(int cpu); | 332 | void tracing_reset_current(int cpu); |
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
349 | unsigned long len, | 348 | unsigned long len, |
350 | unsigned long flags, | 349 | unsigned long flags, |
351 | int pc); | 350 | int pc); |
352 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | ||
353 | struct ring_buffer_event *event, | ||
354 | unsigned long flags, int pc); | ||
355 | 351 | ||
356 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | 352 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, |
357 | struct trace_array_cpu *data); | 353 | struct trace_array_cpu *data); |
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | |||
359 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | 355 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, |
360 | int *ent_cpu, u64 *ent_ts); | 356 | int *ent_cpu, u64 *ent_ts); |
361 | 357 | ||
358 | void __buffer_unlock_commit(struct ring_buffer *buffer, | ||
359 | struct ring_buffer_event *event); | ||
360 | |||
362 | int trace_empty(struct trace_iterator *iter); | 361 | int trace_empty(struct trace_iterator *iter); |
363 | 362 | ||
364 | void *trace_find_next_entry_inc(struct trace_iterator *iter); | 363 | void *trace_find_next_entry_inc(struct trace_iterator *iter); |
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
367 | 366 | ||
368 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 367 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
369 | 368 | ||
370 | void default_wait_pipe(struct trace_iterator *iter); | ||
371 | void poll_wait_pipe(struct trace_iterator *iter); | 369 | void poll_wait_pipe(struct trace_iterator *iter); |
372 | 370 | ||
373 | void ftrace(struct trace_array *tr, | 371 | void ftrace(struct trace_array *tr, |
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); | |||
407 | void tracing_stop_sched_switch_record(void); | 405 | void tracing_stop_sched_switch_record(void); |
408 | void tracing_start_sched_switch_record(void); | 406 | void tracing_start_sched_switch_record(void); |
409 | int register_tracer(struct tracer *type); | 407 | int register_tracer(struct tracer *type); |
410 | void unregister_tracer(struct tracer *type); | ||
411 | int is_tracing_stopped(void); | 408 | int is_tracing_stopped(void); |
412 | enum trace_file_type { | ||
413 | TRACE_FILE_LAT_FMT = 1, | ||
414 | TRACE_FILE_ANNOTATE = 2, | ||
415 | }; | ||
416 | 409 | ||
417 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | 410 | extern cpumask_var_t __read_mostly tracing_buffer_mask; |
418 | 411 | ||
@@ -841,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[]; | |||
841 | extern const char *__stop___trace_bprintk_fmt[]; | 834 | extern const char *__stop___trace_bprintk_fmt[]; |
842 | 835 | ||
843 | void trace_printk_init_buffers(void); | 836 | void trace_printk_init_buffers(void); |
837 | void trace_printk_start_comm(void); | ||
844 | 838 | ||
845 | #undef FTRACE_ENTRY | 839 | #undef FTRACE_ENTRY |
846 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 840 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5f..95e96842ed29 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
77 | entry->correct = val == expect; | 77 | entry->correct = val == expect; |
78 | 78 | ||
79 | if (!filter_check_discard(call, entry, buffer, event)) | 79 | if (!filter_check_discard(call, entry, buffer, event)) |
80 | ring_buffer_unlock_commit(buffer, event); | 80 | __buffer_unlock_commit(buffer, event); |
81 | 81 | ||
82 | out: | 82 | out: |
83 | atomic_dec(&tr->data[cpu]->disabled); | 83 | atomic_dec(&tr->data[cpu]->disabled); |
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void) | |||
199 | } | 199 | } |
200 | return register_tracer(&branch_trace); | 200 | return register_tracer(&branch_trace); |
201 | } | 201 | } |
202 | device_initcall(init_branch_tracer); | 202 | core_initcall(init_branch_tracer); |
203 | 203 | ||
204 | #else | 204 | #else |
205 | static inline | 205 | static inline |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d608d09d08c0..880073d0b946 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p) | |||
491 | mutex_unlock(&event_mutex); | 491 | mutex_unlock(&event_mutex); |
492 | } | 492 | } |
493 | 493 | ||
494 | static int | ||
495 | ftrace_event_seq_open(struct inode *inode, struct file *file) | ||
496 | { | ||
497 | const struct seq_operations *seq_ops; | ||
498 | |||
499 | if ((file->f_mode & FMODE_WRITE) && | ||
500 | (file->f_flags & O_TRUNC)) | ||
501 | ftrace_clear_events(); | ||
502 | |||
503 | seq_ops = inode->i_private; | ||
504 | return seq_open(file, seq_ops); | ||
505 | } | ||
506 | |||
507 | static ssize_t | 494 | static ssize_t |
508 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 495 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
509 | loff_t *ppos) | 496 | loff_t *ppos) |
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
980 | return r; | 967 | return r; |
981 | } | 968 | } |
982 | 969 | ||
970 | static int ftrace_event_avail_open(struct inode *inode, struct file *file); | ||
971 | static int ftrace_event_set_open(struct inode *inode, struct file *file); | ||
972 | |||
983 | static const struct seq_operations show_event_seq_ops = { | 973 | static const struct seq_operations show_event_seq_ops = { |
984 | .start = t_start, | 974 | .start = t_start, |
985 | .next = t_next, | 975 | .next = t_next, |
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = { | |||
995 | }; | 985 | }; |
996 | 986 | ||
997 | static const struct file_operations ftrace_avail_fops = { | 987 | static const struct file_operations ftrace_avail_fops = { |
998 | .open = ftrace_event_seq_open, | 988 | .open = ftrace_event_avail_open, |
999 | .read = seq_read, | 989 | .read = seq_read, |
1000 | .llseek = seq_lseek, | 990 | .llseek = seq_lseek, |
1001 | .release = seq_release, | 991 | .release = seq_release, |
1002 | }; | 992 | }; |
1003 | 993 | ||
1004 | static const struct file_operations ftrace_set_event_fops = { | 994 | static const struct file_operations ftrace_set_event_fops = { |
1005 | .open = ftrace_event_seq_open, | 995 | .open = ftrace_event_set_open, |
1006 | .read = seq_read, | 996 | .read = seq_read, |
1007 | .write = ftrace_event_write, | 997 | .write = ftrace_event_write, |
1008 | .llseek = seq_lseek, | 998 | .llseek = seq_lseek, |
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void) | |||
1078 | return d_events; | 1068 | return d_events; |
1079 | } | 1069 | } |
1080 | 1070 | ||
1071 | static int | ||
1072 | ftrace_event_avail_open(struct inode *inode, struct file *file) | ||
1073 | { | ||
1074 | const struct seq_operations *seq_ops = &show_event_seq_ops; | ||
1075 | |||
1076 | return seq_open(file, seq_ops); | ||
1077 | } | ||
1078 | |||
1079 | static int | ||
1080 | ftrace_event_set_open(struct inode *inode, struct file *file) | ||
1081 | { | ||
1082 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; | ||
1083 | |||
1084 | if ((file->f_mode & FMODE_WRITE) && | ||
1085 | (file->f_flags & O_TRUNC)) | ||
1086 | ftrace_clear_events(); | ||
1087 | |||
1088 | return seq_open(file, seq_ops); | ||
1089 | } | ||
1090 | |||
1081 | static struct dentry * | 1091 | static struct dentry * |
1082 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1092 | event_subsystem_dir(const char *name, struct dentry *d_events) |
1083 | { | 1093 | { |
@@ -1489,6 +1499,9 @@ static __init int event_trace_enable(void) | |||
1489 | if (ret) | 1499 | if (ret) |
1490 | pr_warn("Failed to enable trace event: %s\n", token); | 1500 | pr_warn("Failed to enable trace event: %s\n", token); |
1491 | } | 1501 | } |
1502 | |||
1503 | trace_printk_start_comm(); | ||
1504 | |||
1492 | return 0; | 1505 | return 0; |
1493 | } | 1506 | } |
1494 | 1507 | ||
@@ -1505,15 +1518,13 @@ static __init int event_trace_init(void) | |||
1505 | return 0; | 1518 | return 0; |
1506 | 1519 | ||
1507 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 1520 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
1508 | (void *)&show_event_seq_ops, | 1521 | NULL, &ftrace_avail_fops); |
1509 | &ftrace_avail_fops); | ||
1510 | if (!entry) | 1522 | if (!entry) |
1511 | pr_warning("Could not create debugfs " | 1523 | pr_warning("Could not create debugfs " |
1512 | "'available_events' entry\n"); | 1524 | "'available_events' entry\n"); |
1513 | 1525 | ||
1514 | entry = debugfs_create_file("set_event", 0644, d_tracer, | 1526 | entry = debugfs_create_file("set_event", 0644, d_tracer, |
1515 | (void *)&show_set_event_seq_ops, | 1527 | NULL, &ftrace_set_event_fops); |
1516 | &ftrace_set_event_fops); | ||
1517 | if (!entry) | 1528 | if (!entry) |
1518 | pr_warning("Could not create debugfs " | 1529 | pr_warning("Could not create debugfs " |
1519 | "'set_event' entry\n"); | 1530 | "'set_event' entry\n"); |
@@ -1749,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, | |||
1749 | entry->ip = ip; | 1760 | entry->ip = ip; |
1750 | entry->parent_ip = parent_ip; | 1761 | entry->parent_ip = parent_ip; |
1751 | 1762 | ||
1752 | trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); | 1763 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
1753 | 1764 | ||
1754 | out: | 1765 | out: |
1755 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1766 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c154797a7ff7..e5b0ca8b8d4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps, | |||
1000 | } | 1000 | } |
1001 | } else { | 1001 | } else { |
1002 | if (field->is_signed) | 1002 | if (field->is_signed) |
1003 | ret = strict_strtoll(pred->regex.pattern, 0, &val); | 1003 | ret = kstrtoll(pred->regex.pattern, 0, &val); |
1004 | else | 1004 | else |
1005 | ret = strict_strtoull(pred->regex.pattern, 0, &val); | 1005 | ret = kstrtoull(pred->regex.pattern, 0, &val); |
1006 | if (ret) { | 1006 | if (ret) { |
1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); | 1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); |
1008 | return -EINVAL; | 1008 | return -EINVAL; |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 507a7a9630bf..8e3ad8082ab7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/ring_buffer.h> | 12 | #include <linux/ring_buffer.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -366,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
366 | * We use the callback data field (which is a pointer) | 366 | * We use the callback data field (which is a pointer) |
367 | * as our counter. | 367 | * as our counter. |
368 | */ | 368 | */ |
369 | ret = strict_strtoul(number, 0, (unsigned long *)&count); | 369 | ret = kstrtoul(number, 0, (unsigned long *)&count); |
370 | if (ret) | 370 | if (ret) |
371 | return ret; | 371 | return ret; |
372 | 372 | ||
@@ -411,5 +411,4 @@ static __init int init_function_trace(void) | |||
411 | init_func_cmd_traceon(); | 411 | init_func_cmd_traceon(); |
412 | return register_tracer(&function_trace); | 412 | return register_tracer(&function_trace); |
413 | } | 413 | } |
414 | device_initcall(init_function_trace); | 414 | core_initcall(init_function_trace); |
415 | |||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 99b4378393d5..4edb4b74eb7e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
223 | entry = ring_buffer_event_data(event); | 223 | entry = ring_buffer_event_data(event); |
224 | entry->graph_ent = *trace; | 224 | entry->graph_ent = *trace; |
225 | if (!filter_current_check_discard(buffer, call, entry, event)) | 225 | if (!filter_current_check_discard(buffer, call, entry, event)) |
226 | ring_buffer_unlock_commit(buffer, event); | 226 | __buffer_unlock_commit(buffer, event); |
227 | 227 | ||
228 | return 1; | 228 | return 1; |
229 | } | 229 | } |
@@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr, | |||
327 | entry = ring_buffer_event_data(event); | 327 | entry = ring_buffer_event_data(event); |
328 | entry->ret = *trace; | 328 | entry->ret = *trace; |
329 | if (!filter_current_check_discard(buffer, call, entry, event)) | 329 | if (!filter_current_check_discard(buffer, call, entry, event)) |
330 | ring_buffer_unlock_commit(buffer, event); | 330 | __buffer_unlock_commit(buffer, event); |
331 | } | 331 | } |
332 | 332 | ||
333 | void trace_graph_return(struct ftrace_graph_ret *trace) | 333 | void trace_graph_return(struct ftrace_graph_ret *trace) |
@@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void) | |||
1474 | return register_tracer(&graph_trace); | 1474 | return register_tracer(&graph_trace); |
1475 | } | 1475 | } |
1476 | 1476 | ||
1477 | device_initcall(init_graph_trace); | 1477 | core_initcall(init_graph_trace); |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d98ee8283b29..713a2cac4881 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * From code in the latency_tracer, that is: | 7 | * From code in the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/kallsyms.h> | 12 | #include <linux/kallsyms.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -604,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
604 | .reset = irqsoff_tracer_reset, | 604 | .reset = irqsoff_tracer_reset, |
605 | .start = irqsoff_tracer_start, | 605 | .start = irqsoff_tracer_start, |
606 | .stop = irqsoff_tracer_stop, | 606 | .stop = irqsoff_tracer_stop, |
607 | .print_max = 1, | 607 | .print_max = true, |
608 | .print_header = irqsoff_print_header, | 608 | .print_header = irqsoff_print_header, |
609 | .print_line = irqsoff_print_line, | 609 | .print_line = irqsoff_print_line, |
610 | .flags = &tracer_flags, | 610 | .flags = &tracer_flags, |
@@ -614,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
614 | #endif | 614 | #endif |
615 | .open = irqsoff_trace_open, | 615 | .open = irqsoff_trace_open, |
616 | .close = irqsoff_trace_close, | 616 | .close = irqsoff_trace_close, |
617 | .use_max_tr = 1, | 617 | .use_max_tr = true, |
618 | }; | 618 | }; |
619 | # define register_irqsoff(trace) register_tracer(&trace) | 619 | # define register_irqsoff(trace) register_tracer(&trace) |
620 | #else | 620 | #else |
@@ -637,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
637 | .reset = irqsoff_tracer_reset, | 637 | .reset = irqsoff_tracer_reset, |
638 | .start = irqsoff_tracer_start, | 638 | .start = irqsoff_tracer_start, |
639 | .stop = irqsoff_tracer_stop, | 639 | .stop = irqsoff_tracer_stop, |
640 | .print_max = 1, | 640 | .print_max = true, |
641 | .print_header = irqsoff_print_header, | 641 | .print_header = irqsoff_print_header, |
642 | .print_line = irqsoff_print_line, | 642 | .print_line = irqsoff_print_line, |
643 | .flags = &tracer_flags, | 643 | .flags = &tracer_flags, |
@@ -647,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
647 | #endif | 647 | #endif |
648 | .open = irqsoff_trace_open, | 648 | .open = irqsoff_trace_open, |
649 | .close = irqsoff_trace_close, | 649 | .close = irqsoff_trace_close, |
650 | .use_max_tr = 1, | 650 | .use_max_tr = true, |
651 | }; | 651 | }; |
652 | # define register_preemptoff(trace) register_tracer(&trace) | 652 | # define register_preemptoff(trace) register_tracer(&trace) |
653 | #else | 653 | #else |
@@ -672,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
672 | .reset = irqsoff_tracer_reset, | 672 | .reset = irqsoff_tracer_reset, |
673 | .start = irqsoff_tracer_start, | 673 | .start = irqsoff_tracer_start, |
674 | .stop = irqsoff_tracer_stop, | 674 | .stop = irqsoff_tracer_stop, |
675 | .print_max = 1, | 675 | .print_max = true, |
676 | .print_header = irqsoff_print_header, | 676 | .print_header = irqsoff_print_header, |
677 | .print_line = irqsoff_print_line, | 677 | .print_line = irqsoff_print_line, |
678 | .flags = &tracer_flags, | 678 | .flags = &tracer_flags, |
@@ -682,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
682 | #endif | 682 | #endif |
683 | .open = irqsoff_trace_open, | 683 | .open = irqsoff_trace_open, |
684 | .close = irqsoff_trace_close, | 684 | .close = irqsoff_trace_close, |
685 | .use_max_tr = 1, | 685 | .use_max_tr = true, |
686 | }; | 686 | }; |
687 | 687 | ||
688 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 688 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
@@ -698,4 +698,4 @@ __init static int init_irqsoff_tracer(void) | |||
698 | 698 | ||
699 | return 0; | 699 | return 0; |
700 | } | 700 | } |
701 | device_initcall(init_irqsoff_tracer); | 701 | core_initcall(init_irqsoff_tracer); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a2117043bb1..1865d5f76538 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv) | |||
444 | return -EINVAL; | 444 | return -EINVAL; |
445 | } | 445 | } |
446 | /* an address specified */ | 446 | /* an address specified */ |
447 | ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); | 447 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); |
448 | if (ret) { | 448 | if (ret) { |
449 | pr_info("Failed to parse address.\n"); | 449 | pr_info("Failed to parse address.\n"); |
450 | return ret; | 450 | return ret; |
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
752 | 752 | ||
753 | if (!filter_current_check_discard(buffer, call, entry, event)) | 753 | if (!filter_current_check_discard(buffer, call, entry, event)) |
754 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 754 | trace_buffer_unlock_commit_regs(buffer, event, |
755 | irq_flags, pc, regs); | 755 | irq_flags, pc, regs); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Kretprobe handler */ | 758 | /* Kretprobe handler */ |
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
785 | 785 | ||
786 | if (!filter_current_check_discard(buffer, call, entry, event)) | 786 | if (!filter_current_check_discard(buffer, call, entry, event)) |
787 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 787 | trace_buffer_unlock_commit_regs(buffer, event, |
788 | irq_flags, pc, regs); | 788 | irq_flags, pc, regs); |
789 | } | 789 | } |
790 | 790 | ||
791 | /* Event entry printers */ | 791 | /* Event entry printers */ |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 123b189c732c..194d79602dc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) | |||
610 | return trace_print_lat_fmt(s, entry); | 610 | return trace_print_lat_fmt(s, entry); |
611 | } | 611 | } |
612 | 612 | ||
613 | static unsigned long preempt_mark_thresh = 100; | 613 | static unsigned long preempt_mark_thresh_us = 100; |
614 | 614 | ||
615 | static int | 615 | static int |
616 | lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, | 616 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) |
617 | unsigned long rel_usecs) | ||
618 | { | 617 | { |
619 | return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, | 618 | unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; |
620 | rel_usecs > preempt_mark_thresh ? '!' : | 619 | unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; |
621 | rel_usecs > 1 ? '+' : ' '); | 620 | unsigned long long abs_ts = iter->ts - iter->tr->time_start; |
621 | unsigned long long rel_ts = next_ts - iter->ts; | ||
622 | struct trace_seq *s = &iter->seq; | ||
623 | |||
624 | if (in_ns) { | ||
625 | abs_ts = ns2usecs(abs_ts); | ||
626 | rel_ts = ns2usecs(rel_ts); | ||
627 | } | ||
628 | |||
629 | if (verbose && in_ns) { | ||
630 | unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); | ||
631 | unsigned long abs_msec = (unsigned long)abs_ts; | ||
632 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); | ||
633 | unsigned long rel_msec = (unsigned long)rel_ts; | ||
634 | |||
635 | return trace_seq_printf( | ||
636 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", | ||
637 | ns2usecs(iter->ts), | ||
638 | abs_msec, abs_usec, | ||
639 | rel_msec, rel_usec); | ||
640 | } else if (verbose && !in_ns) { | ||
641 | return trace_seq_printf( | ||
642 | s, "[%016llx] %lld (+%lld): ", | ||
643 | iter->ts, abs_ts, rel_ts); | ||
644 | } else if (!verbose && in_ns) { | ||
645 | return trace_seq_printf( | ||
646 | s, " %4lldus%c: ", | ||
647 | abs_ts, | ||
648 | rel_ts > preempt_mark_thresh_us ? '!' : | ||
649 | rel_ts > 1 ? '+' : ' '); | ||
650 | } else { /* !verbose && !in_ns */ | ||
651 | return trace_seq_printf(s, " %4lld: ", abs_ts); | ||
652 | } | ||
622 | } | 653 | } |
623 | 654 | ||
624 | int trace_print_context(struct trace_iterator *iter) | 655 | int trace_print_context(struct trace_iterator *iter) |
625 | { | 656 | { |
626 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
627 | struct trace_entry *entry = iter->ent; | 658 | struct trace_entry *entry = iter->ent; |
628 | unsigned long long t = ns2usecs(iter->ts); | 659 | unsigned long long t; |
629 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 660 | unsigned long secs, usec_rem; |
630 | unsigned long secs = (unsigned long)t; | ||
631 | char comm[TASK_COMM_LEN]; | 661 | char comm[TASK_COMM_LEN]; |
632 | int ret; | 662 | int ret; |
633 | 663 | ||
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter) | |||
644 | return 0; | 674 | return 0; |
645 | } | 675 | } |
646 | 676 | ||
647 | return trace_seq_printf(s, " %5lu.%06lu: ", | 677 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { |
648 | secs, usec_rem); | 678 | t = ns2usecs(iter->ts); |
679 | usec_rem = do_div(t, USEC_PER_SEC); | ||
680 | secs = (unsigned long)t; | ||
681 | return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); | ||
682 | } else | ||
683 | return trace_seq_printf(s, " %12llu: ", iter->ts); | ||
649 | } | 684 | } |
650 | 685 | ||
651 | int trace_print_lat_context(struct trace_iterator *iter) | 686 | int trace_print_lat_context(struct trace_iterator *iter) |
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
659 | *next_entry = trace_find_next_entry(iter, NULL, | 694 | *next_entry = trace_find_next_entry(iter, NULL, |
660 | &next_ts); | 695 | &next_ts); |
661 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); | 696 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); |
662 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); | ||
663 | unsigned long rel_usecs; | ||
664 | 697 | ||
665 | /* Restore the original ent_size */ | 698 | /* Restore the original ent_size */ |
666 | iter->ent_size = ent_size; | 699 | iter->ent_size = ent_size; |
667 | 700 | ||
668 | if (!next_entry) | 701 | if (!next_entry) |
669 | next_ts = iter->ts; | 702 | next_ts = iter->ts; |
670 | rel_usecs = ns2usecs(next_ts - iter->ts); | ||
671 | 703 | ||
672 | if (verbose) { | 704 | if (verbose) { |
673 | char comm[TASK_COMM_LEN]; | 705 | char comm[TASK_COMM_LEN]; |
674 | 706 | ||
675 | trace_find_cmdline(entry->pid, comm); | 707 | trace_find_cmdline(entry->pid, comm); |
676 | 708 | ||
677 | ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" | 709 | ret = trace_seq_printf( |
678 | " %ld.%03ldms (+%ld.%03ldms): ", comm, | 710 | s, "%16s %5d %3d %d %08x %08lx ", |
679 | entry->pid, iter->cpu, entry->flags, | 711 | comm, entry->pid, iter->cpu, entry->flags, |
680 | entry->preempt_count, iter->idx, | 712 | entry->preempt_count, iter->idx); |
681 | ns2usecs(iter->ts), | ||
682 | abs_usecs / USEC_PER_MSEC, | ||
683 | abs_usecs % USEC_PER_MSEC, | ||
684 | rel_usecs / USEC_PER_MSEC, | ||
685 | rel_usecs % USEC_PER_MSEC); | ||
686 | } else { | 713 | } else { |
687 | ret = lat_print_generic(s, entry, iter->cpu); | 714 | ret = lat_print_generic(s, entry, iter->cpu); |
688 | if (ret) | ||
689 | ret = lat_print_timestamp(s, abs_usecs, rel_usecs); | ||
690 | } | 715 | } |
691 | 716 | ||
717 | if (ret) | ||
718 | ret = lat_print_timestamp(iter, next_ts); | ||
719 | |||
692 | return ret; | 720 | return ret; |
693 | } | 721 | } |
694 | 722 | ||
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daa9980153af..412e959709b4 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
441 | goto fail; | 441 | goto fail; |
442 | 442 | ||
443 | type++; | 443 | type++; |
444 | if (strict_strtoul(type, 0, &bs)) | 444 | if (kstrtoul(type, 0, &bs)) |
445 | goto fail; | 445 | goto fail; |
446 | 446 | ||
447 | switch (bs) { | 447 | switch (bs) { |
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | |||
501 | 501 | ||
502 | tmp = strchr(symbol, '+'); | 502 | tmp = strchr(symbol, '+'); |
503 | if (tmp) { | 503 | if (tmp) { |
504 | /* skip sign because strict_strtol doesn't accept '+' */ | 504 | /* skip sign because kstrtoul doesn't accept '+' */ |
505 | ret = strict_strtoul(tmp + 1, 0, offset); | 505 | ret = kstrtoul(tmp + 1, 0, offset); |
506 | if (ret) | 506 | if (ret) |
507 | return ret; | 507 | return ret; |
508 | 508 | ||
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
533 | else | 533 | else |
534 | ret = -EINVAL; | 534 | ret = -EINVAL; |
535 | } else if (isdigit(arg[5])) { | 535 | } else if (isdigit(arg[5])) { |
536 | ret = strict_strtoul(arg + 5, 10, &param); | 536 | ret = kstrtoul(arg + 5, 10, &param); |
537 | if (ret || param > PARAM_MAX_STACK) | 537 | if (ret || param > PARAM_MAX_STACK) |
538 | ret = -EINVAL; | 538 | ret = -EINVAL; |
539 | else { | 539 | else { |
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
579 | 579 | ||
580 | case '@': /* memory or symbol */ | 580 | case '@': /* memory or symbol */ |
581 | if (isdigit(arg[1])) { | 581 | if (isdigit(arg[1])) { |
582 | ret = strict_strtoul(arg + 1, 0, &param); | 582 | ret = kstrtoul(arg + 1, 0, &param); |
583 | if (ret) | 583 | if (ret) |
584 | break; | 584 | break; |
585 | 585 | ||
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
597 | break; | 597 | break; |
598 | 598 | ||
599 | case '+': /* deref memory */ | 599 | case '+': /* deref memory */ |
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | 600 | arg++; /* Skip '+', because kstrtol() rejects it. */ |
601 | case '-': | 601 | case '-': |
602 | tmp = strchr(arg, '('); | 602 | tmp = strchr(arg, '('); |
603 | if (!tmp) | 603 | if (!tmp) |
604 | break; | 604 | break; |
605 | 605 | ||
606 | *tmp = '\0'; | 606 | *tmp = '\0'; |
607 | ret = strict_strtol(arg, 0, &offset); | 607 | ret = kstrtol(arg, 0, &offset); |
608 | 608 | ||
609 | if (ret) | 609 | if (ret) |
610 | break; | 610 | break; |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a18456..3374c792ccd8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
102 | entry->next_cpu = task_cpu(wakee); | 102 | entry->next_cpu = task_cpu(wakee); |
103 | 103 | ||
104 | if (!filter_check_discard(call, entry, buffer, event)) | 104 | if (!filter_check_discard(call, entry, buffer, event)) |
105 | ring_buffer_unlock_commit(buffer, event); | 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
106 | ftrace_trace_stack(tr->buffer, flags, 6, pc); | ||
107 | ftrace_trace_userstack(tr->buffer, flags, pc); | ||
108 | } | 106 | } |
109 | 107 | ||
110 | static void | 108 | static void |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 02170c00c413..9fe45fcefca0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
@@ -589,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
589 | .reset = wakeup_tracer_reset, | 589 | .reset = wakeup_tracer_reset, |
590 | .start = wakeup_tracer_start, | 590 | .start = wakeup_tracer_start, |
591 | .stop = wakeup_tracer_stop, | 591 | .stop = wakeup_tracer_stop, |
592 | .print_max = 1, | 592 | .print_max = true, |
593 | .print_header = wakeup_print_header, | 593 | .print_header = wakeup_print_header, |
594 | .print_line = wakeup_print_line, | 594 | .print_line = wakeup_print_line, |
595 | .flags = &tracer_flags, | 595 | .flags = &tracer_flags, |
@@ -599,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
599 | #endif | 599 | #endif |
600 | .open = wakeup_trace_open, | 600 | .open = wakeup_trace_open, |
601 | .close = wakeup_trace_close, | 601 | .close = wakeup_trace_close, |
602 | .use_max_tr = 1, | 602 | .use_max_tr = true, |
603 | }; | 603 | }; |
604 | 604 | ||
605 | static struct tracer wakeup_rt_tracer __read_mostly = | 605 | static struct tracer wakeup_rt_tracer __read_mostly = |
@@ -610,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
610 | .start = wakeup_tracer_start, | 610 | .start = wakeup_tracer_start, |
611 | .stop = wakeup_tracer_stop, | 611 | .stop = wakeup_tracer_stop, |
612 | .wait_pipe = poll_wait_pipe, | 612 | .wait_pipe = poll_wait_pipe, |
613 | .print_max = 1, | 613 | .print_max = true, |
614 | .print_header = wakeup_print_header, | 614 | .print_header = wakeup_print_header, |
615 | .print_line = wakeup_print_line, | 615 | .print_line = wakeup_print_line, |
616 | .flags = &tracer_flags, | 616 | .flags = &tracer_flags, |
@@ -620,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
620 | #endif | 620 | #endif |
621 | .open = wakeup_trace_open, | 621 | .open = wakeup_trace_open, |
622 | .close = wakeup_trace_close, | 622 | .close = wakeup_trace_close, |
623 | .use_max_tr = 1, | 623 | .use_max_tr = true, |
624 | }; | 624 | }; |
625 | 625 | ||
626 | __init static int init_wakeup_tracer(void) | 626 | __init static int init_wakeup_tracer(void) |
@@ -637,4 +637,4 @@ __init static int init_wakeup_tracer(void) | |||
637 | 637 | ||
638 | return 0; | 638 | return 0; |
639 | } | 639 | } |
640 | device_initcall(init_wakeup_tracer); | 640 | core_initcall(init_wakeup_tracer); |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2c00a691a540..47623169a815 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
320 | int (*func)(void)) | 320 | int (*func)(void)) |
321 | { | 321 | { |
322 | int save_ftrace_enabled = ftrace_enabled; | 322 | int save_ftrace_enabled = ftrace_enabled; |
323 | int save_tracer_enabled = tracer_enabled; | ||
324 | unsigned long count; | 323 | unsigned long count; |
325 | char *func_name; | 324 | char *func_name; |
326 | int ret; | 325 | int ret; |
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
331 | 330 | ||
332 | /* enable tracing, and record the filter function */ | 331 | /* enable tracing, and record the filter function */ |
333 | ftrace_enabled = 1; | 332 | ftrace_enabled = 1; |
334 | tracer_enabled = 1; | ||
335 | 333 | ||
336 | /* passed in by parameter to fool gcc from optimizing */ | 334 | /* passed in by parameter to fool gcc from optimizing */ |
337 | func(); | 335 | func(); |
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
395 | 393 | ||
396 | out: | 394 | out: |
397 | ftrace_enabled = save_ftrace_enabled; | 395 | ftrace_enabled = save_ftrace_enabled; |
398 | tracer_enabled = save_tracer_enabled; | ||
399 | 396 | ||
400 | /* Enable tracing on all functions again */ | 397 | /* Enable tracing on all functions again */ |
401 | ftrace_set_global_filter(NULL, 0, 1); | 398 | ftrace_set_global_filter(NULL, 0, 1); |
@@ -452,7 +449,6 @@ static int | |||
452 | trace_selftest_function_recursion(void) | 449 | trace_selftest_function_recursion(void) |
453 | { | 450 | { |
454 | int save_ftrace_enabled = ftrace_enabled; | 451 | int save_ftrace_enabled = ftrace_enabled; |
455 | int save_tracer_enabled = tracer_enabled; | ||
456 | char *func_name; | 452 | char *func_name; |
457 | int len; | 453 | int len; |
458 | int ret; | 454 | int ret; |
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void) | |||
465 | 461 | ||
466 | /* enable tracing, and record the filter function */ | 462 | /* enable tracing, and record the filter function */ |
467 | ftrace_enabled = 1; | 463 | ftrace_enabled = 1; |
468 | tracer_enabled = 1; | ||
469 | 464 | ||
470 | /* Handle PPC64 '.' name */ | 465 | /* Handle PPC64 '.' name */ |
471 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 466 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
@@ -534,7 +529,6 @@ trace_selftest_function_recursion(void) | |||
534 | ret = 0; | 529 | ret = 0; |
535 | out: | 530 | out: |
536 | ftrace_enabled = save_ftrace_enabled; | 531 | ftrace_enabled = save_ftrace_enabled; |
537 | tracer_enabled = save_tracer_enabled; | ||
538 | 532 | ||
539 | return ret; | 533 | return ret; |
540 | } | 534 | } |
@@ -569,7 +563,6 @@ static int | |||
569 | trace_selftest_function_regs(void) | 563 | trace_selftest_function_regs(void) |
570 | { | 564 | { |
571 | int save_ftrace_enabled = ftrace_enabled; | 565 | int save_ftrace_enabled = ftrace_enabled; |
572 | int save_tracer_enabled = tracer_enabled; | ||
573 | char *func_name; | 566 | char *func_name; |
574 | int len; | 567 | int len; |
575 | int ret; | 568 | int ret; |
@@ -586,7 +579,6 @@ trace_selftest_function_regs(void) | |||
586 | 579 | ||
587 | /* enable tracing, and record the filter function */ | 580 | /* enable tracing, and record the filter function */ |
588 | ftrace_enabled = 1; | 581 | ftrace_enabled = 1; |
589 | tracer_enabled = 1; | ||
590 | 582 | ||
591 | /* Handle PPC64 '.' name */ | 583 | /* Handle PPC64 '.' name */ |
592 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 584 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
@@ -648,7 +640,6 @@ trace_selftest_function_regs(void) | |||
648 | ret = 0; | 640 | ret = 0; |
649 | out: | 641 | out: |
650 | ftrace_enabled = save_ftrace_enabled; | 642 | ftrace_enabled = save_ftrace_enabled; |
651 | tracer_enabled = save_tracer_enabled; | ||
652 | 643 | ||
653 | return ret; | 644 | return ret; |
654 | } | 645 | } |
@@ -662,7 +653,6 @@ int | |||
662 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | 653 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) |
663 | { | 654 | { |
664 | int save_ftrace_enabled = ftrace_enabled; | 655 | int save_ftrace_enabled = ftrace_enabled; |
665 | int save_tracer_enabled = tracer_enabled; | ||
666 | unsigned long count; | 656 | unsigned long count; |
667 | int ret; | 657 | int ret; |
668 | 658 | ||
@@ -671,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
671 | 661 | ||
672 | /* start the tracing */ | 662 | /* start the tracing */ |
673 | ftrace_enabled = 1; | 663 | ftrace_enabled = 1; |
674 | tracer_enabled = 1; | ||
675 | 664 | ||
676 | ret = tracer_init(trace, tr); | 665 | ret = tracer_init(trace, tr); |
677 | if (ret) { | 666 | if (ret) { |
@@ -708,7 +697,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
708 | ret = trace_selftest_function_regs(); | 697 | ret = trace_selftest_function_regs(); |
709 | out: | 698 | out: |
710 | ftrace_enabled = save_ftrace_enabled; | 699 | ftrace_enabled = save_ftrace_enabled; |
711 | tracer_enabled = save_tracer_enabled; | ||
712 | 700 | ||
713 | /* kill ftrace totally if we failed */ | 701 | /* kill ftrace totally if we failed */ |
714 | if (ret) | 702 | if (ret) |
@@ -1106,6 +1094,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1106 | tracing_stop(); | 1094 | tracing_stop(); |
1107 | /* check both trace buffers */ | 1095 | /* check both trace buffers */ |
1108 | ret = trace_test_buffer(tr, NULL); | 1096 | ret = trace_test_buffer(tr, NULL); |
1097 | printk("ret = %d\n", ret); | ||
1109 | if (!ret) | 1098 | if (!ret) |
1110 | ret = trace_test_buffer(&max_tr, &count); | 1099 | ret = trace_test_buffer(&max_tr, &count); |
1111 | 1100 | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0c1b165778e5..42ca822fc701 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -33,7 +33,6 @@ static unsigned long max_stack_size; | |||
33 | static arch_spinlock_t max_stack_lock = | 33 | static arch_spinlock_t max_stack_lock = |
34 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 34 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
35 | 35 | ||
36 | static int stack_trace_disabled __read_mostly; | ||
37 | static DEFINE_PER_CPU(int, trace_active); | 36 | static DEFINE_PER_CPU(int, trace_active); |
38 | static DEFINE_MUTEX(stack_sysctl_mutex); | 37 | static DEFINE_MUTEX(stack_sysctl_mutex); |
39 | 38 | ||
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
116 | { | 115 | { |
117 | int cpu; | 116 | int cpu; |
118 | 117 | ||
119 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | ||
120 | return; | ||
121 | |||
122 | preempt_disable_notrace(); | 118 | preempt_disable_notrace(); |
123 | 119 | ||
124 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 2485a7d09b11..7609dd6714c2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event, | |||
21 | static int syscall_exit_register(struct ftrace_event_call *event, | 21 | static int syscall_exit_register(struct ftrace_event_call *event, |
22 | enum trace_reg type, void *data); | 22 | enum trace_reg type, void *data); |
23 | 23 | ||
24 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | ||
25 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | ||
26 | |||
27 | static struct list_head * | 24 | static struct list_head * |
28 | syscall_get_enter_fields(struct ftrace_event_call *call) | 25 | syscall_get_enter_fields(struct ftrace_event_call *call) |
29 | { | 26 | { |
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
32 | return &entry->enter_fields; | 29 | return &entry->enter_fields; |
33 | } | 30 | } |
34 | 31 | ||
35 | struct trace_event_functions enter_syscall_print_funcs = { | ||
36 | .trace = print_syscall_enter, | ||
37 | }; | ||
38 | |||
39 | struct trace_event_functions exit_syscall_print_funcs = { | ||
40 | .trace = print_syscall_exit, | ||
41 | }; | ||
42 | |||
43 | struct ftrace_event_class event_class_syscall_enter = { | ||
44 | .system = "syscalls", | ||
45 | .reg = syscall_enter_register, | ||
46 | .define_fields = syscall_enter_define_fields, | ||
47 | .get_fields = syscall_get_enter_fields, | ||
48 | .raw_init = init_syscall_trace, | ||
49 | }; | ||
50 | |||
51 | struct ftrace_event_class event_class_syscall_exit = { | ||
52 | .system = "syscalls", | ||
53 | .reg = syscall_exit_register, | ||
54 | .define_fields = syscall_exit_define_fields, | ||
55 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
56 | .raw_init = init_syscall_trace, | ||
57 | }; | ||
58 | |||
59 | extern struct syscall_metadata *__start_syscalls_metadata[]; | 32 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
60 | extern struct syscall_metadata *__stop_syscalls_metadata[]; | 33 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
61 | 34 | ||
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
432 | mutex_unlock(&syscall_trace_lock); | 405 | mutex_unlock(&syscall_trace_lock); |
433 | } | 406 | } |
434 | 407 | ||
435 | int init_syscall_trace(struct ftrace_event_call *call) | 408 | static int init_syscall_trace(struct ftrace_event_call *call) |
436 | { | 409 | { |
437 | int id; | 410 | int id; |
438 | int num; | 411 | int num; |
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
457 | return id; | 430 | return id; |
458 | } | 431 | } |
459 | 432 | ||
433 | struct trace_event_functions enter_syscall_print_funcs = { | ||
434 | .trace = print_syscall_enter, | ||
435 | }; | ||
436 | |||
437 | struct trace_event_functions exit_syscall_print_funcs = { | ||
438 | .trace = print_syscall_exit, | ||
439 | }; | ||
440 | |||
441 | struct ftrace_event_class event_class_syscall_enter = { | ||
442 | .system = "syscalls", | ||
443 | .reg = syscall_enter_register, | ||
444 | .define_fields = syscall_enter_define_fields, | ||
445 | .get_fields = syscall_get_enter_fields, | ||
446 | .raw_init = init_syscall_trace, | ||
447 | }; | ||
448 | |||
449 | struct ftrace_event_class event_class_syscall_exit = { | ||
450 | .system = "syscalls", | ||
451 | .reg = syscall_exit_register, | ||
452 | .define_fields = syscall_exit_define_fields, | ||
453 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
454 | .raw_init = init_syscall_trace, | ||
455 | }; | ||
456 | |||
460 | unsigned long __init __weak arch_syscall_addr(int nr) | 457 | unsigned long __init __weak arch_syscall_addr(int nr) |
461 | { | 458 | { |
462 | return (unsigned long)sys_call_table[nr]; | 459 | return (unsigned long)sys_call_table[nr]; |
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
537 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 534 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
538 | } | 535 | } |
539 | 536 | ||
540 | int perf_sysenter_enable(struct ftrace_event_call *call) | 537 | static int perf_sysenter_enable(struct ftrace_event_call *call) |
541 | { | 538 | { |
542 | int ret = 0; | 539 | int ret = 0; |
543 | int num; | 540 | int num; |
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) | |||
558 | return ret; | 555 | return ret; |
559 | } | 556 | } |
560 | 557 | ||
561 | void perf_sysenter_disable(struct ftrace_event_call *call) | 558 | static void perf_sysenter_disable(struct ftrace_event_call *call) |
562 | { | 559 | { |
563 | int num; | 560 | int num; |
564 | 561 | ||
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
615 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 612 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
616 | } | 613 | } |
617 | 614 | ||
618 | int perf_sysexit_enable(struct ftrace_event_call *call) | 615 | static int perf_sysexit_enable(struct ftrace_event_call *call) |
619 | { | 616 | { |
620 | int ret = 0; | 617 | int ret = 0; |
621 | int num; | 618 | int num; |
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) | |||
636 | return ret; | 633 | return ret; |
637 | } | 634 | } |
638 | 635 | ||
639 | void perf_sysexit_disable(struct ftrace_event_call *call) | 636 | static void perf_sysexit_disable(struct ftrace_event_call *call) |
640 | { | 637 | { |
641 | int num; | 638 | int num; |
642 | 639 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..c86e6d4f67fb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/uprobes.h> | 23 | #include <linux/uprobes.h> |
24 | #include <linux/namei.h> | 24 | #include <linux/namei.h> |
25 | #include <linux/string.h> | ||
25 | 26 | ||
26 | #include "trace_probe.h" | 27 | #include "trace_probe.h" |
27 | 28 | ||
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
189 | if (argv[0][0] == '-') | 190 | if (argv[0][0] == '-') |
190 | is_delete = true; | 191 | is_delete = true; |
191 | else if (argv[0][0] != 'p') { | 192 | else if (argv[0][0] != 'p') { |
192 | pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); | 193 | pr_info("Probe definition must be started with 'p' or '-'.\n"); |
193 | return -EINVAL; | 194 | return -EINVAL; |
194 | } | 195 | } |
195 | 196 | ||
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
252 | if (ret) | 253 | if (ret) |
253 | goto fail_address_parse; | 254 | goto fail_address_parse; |
254 | 255 | ||
255 | ret = strict_strtoul(arg, 0, &offset); | 256 | ret = kstrtoul(arg, 0, &offset); |
256 | if (ret) | 257 | if (ret) |
257 | goto fail_address_parse; | 258 | goto fail_address_parse; |
258 | 259 | ||
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) | |||
263 | 264 | ||
264 | /* setup a probe */ | 265 | /* setup a probe */ |
265 | if (!event) { | 266 | if (!event) { |
266 | char *tail = strrchr(filename, '/'); | 267 | char *tail; |
267 | char *ptr; | 268 | char *ptr; |
268 | 269 | ||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | 270 | tail = kstrdup(kbasename(filename), GFP_KERNEL); |
270 | if (!ptr) { | 271 | if (!tail) { |
271 | ret = -ENOMEM; | 272 | ret = -ENOMEM; |
272 | goto fail_address_parse; | 273 | goto fail_address_parse; |
273 | } | 274 | } |
274 | 275 | ||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | 276 | ptr = strpbrk(tail, ".-_"); |
277 | if (ptr) | 277 | if (ptr) |
278 | *ptr = '\0'; | 278 | *ptr = '\0'; |
diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9ec..33acb5e53a5f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/export.h> | 17 | #include <linux/export.h> |
18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
19 | #include <linux/proc_fs.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * userns count is 1 for root user, 1 for init_uts_ns, | 22 | * userns count is 1 for root user, 1 for init_uts_ns, |
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { | |||
51 | }, | 52 | }, |
52 | .owner = GLOBAL_ROOT_UID, | 53 | .owner = GLOBAL_ROOT_UID, |
53 | .group = GLOBAL_ROOT_GID, | 54 | .group = GLOBAL_ROOT_GID, |
55 | .proc_inum = PROC_USER_INIT_INO, | ||
54 | }; | 56 | }; |
55 | EXPORT_SYMBOL_GPL(init_user_ns); | 57 | EXPORT_SYMBOL_GPL(init_user_ns); |
56 | 58 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 456a6b9fba34..2b042c42fbc4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/proc_fs.h> | ||
12 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 14 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | 15 | #include <linux/securebits.h> |
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; | |||
26 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 27 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, |
27 | struct uid_gid_map *map); | 28 | struct uid_gid_map *map); |
28 | 29 | ||
30 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | ||
31 | { | ||
32 | /* Start with the same capabilities as init but useless for doing | ||
33 | * anything as the capabilities are bound to the new user namespace. | ||
34 | */ | ||
35 | cred->securebits = SECUREBITS_DEFAULT; | ||
36 | cred->cap_inheritable = CAP_EMPTY_SET; | ||
37 | cred->cap_permitted = CAP_FULL_SET; | ||
38 | cred->cap_effective = CAP_FULL_SET; | ||
39 | cred->cap_bset = CAP_FULL_SET; | ||
40 | #ifdef CONFIG_KEYS | ||
41 | key_put(cred->request_key_auth); | ||
42 | cred->request_key_auth = NULL; | ||
43 | #endif | ||
44 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | ||
45 | cred->user_ns = user_ns; | ||
46 | } | ||
47 | |||
29 | /* | 48 | /* |
30 | * Create a new user namespace, deriving the creator from the user in the | 49 | * Create a new user namespace, deriving the creator from the user in the |
31 | * passed credentials, and replacing that user with the new root user for the | 50 | * passed credentials, and replacing that user with the new root user for the |
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new) | |||
39 | struct user_namespace *ns, *parent_ns = new->user_ns; | 58 | struct user_namespace *ns, *parent_ns = new->user_ns; |
40 | kuid_t owner = new->euid; | 59 | kuid_t owner = new->euid; |
41 | kgid_t group = new->egid; | 60 | kgid_t group = new->egid; |
61 | int ret; | ||
42 | 62 | ||
43 | /* The creator needs a mapping in the parent user namespace | 63 | /* The creator needs a mapping in the parent user namespace |
44 | * or else we won't be able to reasonably tell userspace who | 64 | * or else we won't be able to reasonably tell userspace who |
@@ -52,38 +72,45 @@ int create_user_ns(struct cred *new) | |||
52 | if (!ns) | 72 | if (!ns) |
53 | return -ENOMEM; | 73 | return -ENOMEM; |
54 | 74 | ||
75 | ret = proc_alloc_inum(&ns->proc_inum); | ||
76 | if (ret) { | ||
77 | kmem_cache_free(user_ns_cachep, ns); | ||
78 | return ret; | ||
79 | } | ||
80 | |||
55 | kref_init(&ns->kref); | 81 | kref_init(&ns->kref); |
82 | /* Leave the new->user_ns reference with the new user namespace. */ | ||
56 | ns->parent = parent_ns; | 83 | ns->parent = parent_ns; |
57 | ns->owner = owner; | 84 | ns->owner = owner; |
58 | ns->group = group; | 85 | ns->group = group; |
59 | 86 | ||
60 | /* Start with the same capabilities as init but useless for doing | 87 | set_cred_user_ns(new, ns); |
61 | * anything as the capabilities are bound to the new user namespace. | ||
62 | */ | ||
63 | new->securebits = SECUREBITS_DEFAULT; | ||
64 | new->cap_inheritable = CAP_EMPTY_SET; | ||
65 | new->cap_permitted = CAP_FULL_SET; | ||
66 | new->cap_effective = CAP_FULL_SET; | ||
67 | new->cap_bset = CAP_FULL_SET; | ||
68 | #ifdef CONFIG_KEYS | ||
69 | key_put(new->request_key_auth); | ||
70 | new->request_key_auth = NULL; | ||
71 | #endif | ||
72 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | ||
73 | |||
74 | /* Leave the new->user_ns reference with the new user namespace. */ | ||
75 | /* Leave the reference to our user_ns with the new cred. */ | ||
76 | new->user_ns = ns; | ||
77 | 88 | ||
78 | return 0; | 89 | return 0; |
79 | } | 90 | } |
80 | 91 | ||
92 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | ||
93 | { | ||
94 | struct cred *cred; | ||
95 | |||
96 | if (!(unshare_flags & CLONE_NEWUSER)) | ||
97 | return 0; | ||
98 | |||
99 | cred = prepare_creds(); | ||
100 | if (!cred) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | *new_cred = cred; | ||
104 | return create_user_ns(cred); | ||
105 | } | ||
106 | |||
81 | void free_user_ns(struct kref *kref) | 107 | void free_user_ns(struct kref *kref) |
82 | { | 108 | { |
83 | struct user_namespace *parent, *ns = | 109 | struct user_namespace *parent, *ns = |
84 | container_of(kref, struct user_namespace, kref); | 110 | container_of(kref, struct user_namespace, kref); |
85 | 111 | ||
86 | parent = ns->parent; | 112 | parent = ns->parent; |
113 | proc_free_inum(ns->proc_inum); | ||
87 | kmem_cache_free(user_ns_cachep, ns); | 114 | kmem_cache_free(user_ns_cachep, ns); |
88 | put_user_ns(parent); | 115 | put_user_ns(parent); |
89 | } | 116 | } |
@@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) | |||
372 | struct user_namespace *lower_ns; | 399 | struct user_namespace *lower_ns; |
373 | uid_t lower; | 400 | uid_t lower; |
374 | 401 | ||
375 | lower_ns = current_user_ns(); | 402 | lower_ns = seq_user_ns(seq); |
376 | if ((lower_ns == ns) && lower_ns->parent) | 403 | if ((lower_ns == ns) && lower_ns->parent) |
377 | lower_ns = lower_ns->parent; | 404 | lower_ns = lower_ns->parent; |
378 | 405 | ||
@@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) | |||
393 | struct user_namespace *lower_ns; | 420 | struct user_namespace *lower_ns; |
394 | gid_t lower; | 421 | gid_t lower; |
395 | 422 | ||
396 | lower_ns = current_user_ns(); | 423 | lower_ns = seq_user_ns(seq); |
397 | if ((lower_ns == ns) && lower_ns->parent) | 424 | if ((lower_ns == ns) && lower_ns->parent) |
398 | lower_ns = lower_ns->parent; | 425 | lower_ns = lower_ns->parent; |
399 | 426 | ||
@@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz | |||
669 | { | 696 | { |
670 | struct seq_file *seq = file->private_data; | 697 | struct seq_file *seq = file->private_data; |
671 | struct user_namespace *ns = seq->private; | 698 | struct user_namespace *ns = seq->private; |
699 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
672 | 700 | ||
673 | if (!ns->parent) | 701 | if (!ns->parent) |
674 | return -EPERM; | 702 | return -EPERM; |
675 | 703 | ||
704 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
705 | return -EPERM; | ||
706 | |||
676 | return map_write(file, buf, size, ppos, CAP_SETUID, | 707 | return map_write(file, buf, size, ppos, CAP_SETUID, |
677 | &ns->uid_map, &ns->parent->uid_map); | 708 | &ns->uid_map, &ns->parent->uid_map); |
678 | } | 709 | } |
@@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz | |||
681 | { | 712 | { |
682 | struct seq_file *seq = file->private_data; | 713 | struct seq_file *seq = file->private_data; |
683 | struct user_namespace *ns = seq->private; | 714 | struct user_namespace *ns = seq->private; |
715 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
684 | 716 | ||
685 | if (!ns->parent) | 717 | if (!ns->parent) |
686 | return -EPERM; | 718 | return -EPERM; |
687 | 719 | ||
720 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
721 | return -EPERM; | ||
722 | |||
688 | return map_write(file, buf, size, ppos, CAP_SETGID, | 723 | return map_write(file, buf, size, ppos, CAP_SETGID, |
689 | &ns->gid_map, &ns->parent->gid_map); | 724 | &ns->gid_map, &ns->parent->gid_map); |
690 | } | 725 | } |
@@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t | |||
709 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 744 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, |
710 | struct uid_gid_map *new_map) | 745 | struct uid_gid_map *new_map) |
711 | { | 746 | { |
747 | /* Allow mapping to your own filesystem ids */ | ||
748 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { | ||
749 | u32 id = new_map->extent[0].lower_first; | ||
750 | if (cap_setid == CAP_SETUID) { | ||
751 | kuid_t uid = make_kuid(ns->parent, id); | ||
752 | if (uid_eq(uid, current_fsuid())) | ||
753 | return true; | ||
754 | } | ||
755 | else if (cap_setid == CAP_SETGID) { | ||
756 | kgid_t gid = make_kgid(ns->parent, id); | ||
757 | if (gid_eq(gid, current_fsgid())) | ||
758 | return true; | ||
759 | } | ||
760 | } | ||
761 | |||
712 | /* Allow anyone to set a mapping that doesn't require privilege */ | 762 | /* Allow anyone to set a mapping that doesn't require privilege */ |
713 | if (!cap_valid(cap_setid)) | 763 | if (!cap_valid(cap_setid)) |
714 | return true; | 764 | return true; |
@@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | |||
722 | return false; | 772 | return false; |
723 | } | 773 | } |
724 | 774 | ||
775 | static void *userns_get(struct task_struct *task) | ||
776 | { | ||
777 | struct user_namespace *user_ns; | ||
778 | |||
779 | rcu_read_lock(); | ||
780 | user_ns = get_user_ns(__task_cred(task)->user_ns); | ||
781 | rcu_read_unlock(); | ||
782 | |||
783 | return user_ns; | ||
784 | } | ||
785 | |||
786 | static void userns_put(void *ns) | ||
787 | { | ||
788 | put_user_ns(ns); | ||
789 | } | ||
790 | |||
791 | static int userns_install(struct nsproxy *nsproxy, void *ns) | ||
792 | { | ||
793 | struct user_namespace *user_ns = ns; | ||
794 | struct cred *cred; | ||
795 | |||
796 | /* Don't allow gaining capabilities by reentering | ||
797 | * the same user namespace. | ||
798 | */ | ||
799 | if (user_ns == current_user_ns()) | ||
800 | return -EINVAL; | ||
801 | |||
802 | /* Threaded processes may not enter a different user namespace */ | ||
803 | if (atomic_read(&current->mm->mm_users) > 1) | ||
804 | return -EINVAL; | ||
805 | |||
806 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
807 | return -EPERM; | ||
808 | |||
809 | cred = prepare_creds(); | ||
810 | if (!cred) | ||
811 | return -ENOMEM; | ||
812 | |||
813 | put_user_ns(cred->user_ns); | ||
814 | set_cred_user_ns(cred, get_user_ns(user_ns)); | ||
815 | |||
816 | return commit_creds(cred); | ||
817 | } | ||
818 | |||
819 | static unsigned int userns_inum(void *ns) | ||
820 | { | ||
821 | struct user_namespace *user_ns = ns; | ||
822 | return user_ns->proc_inum; | ||
823 | } | ||
824 | |||
825 | const struct proc_ns_operations userns_operations = { | ||
826 | .name = "user", | ||
827 | .type = CLONE_NEWUSER, | ||
828 | .get = userns_get, | ||
829 | .put = userns_put, | ||
830 | .install = userns_install, | ||
831 | .inum = userns_inum, | ||
832 | }; | ||
833 | |||
725 | static __init int user_namespaces_init(void) | 834 | static __init int user_namespaces_init(void) |
726 | { | 835 | { |
727 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | 836 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); |
diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3fd..08b197e8c485 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) | |||
32 | * @old_ns: namespace to clone | 32 | * @old_ns: namespace to clone |
33 | * Return NULL on error (failure to kmalloc), new ns otherwise | 33 | * Return NULL on error (failure to kmalloc), new ns otherwise |
34 | */ | 34 | */ |
35 | static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | 35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, |
36 | struct uts_namespace *old_ns) | 36 | struct uts_namespace *old_ns) |
37 | { | 37 | { |
38 | struct uts_namespace *ns; | 38 | struct uts_namespace *ns; |
39 | int err; | ||
39 | 40 | ||
40 | ns = create_uts_ns(); | 41 | ns = create_uts_ns(); |
41 | if (!ns) | 42 | if (!ns) |
42 | return ERR_PTR(-ENOMEM); | 43 | return ERR_PTR(-ENOMEM); |
43 | 44 | ||
45 | err = proc_alloc_inum(&ns->proc_inum); | ||
46 | if (err) { | ||
47 | kfree(ns); | ||
48 | return ERR_PTR(err); | ||
49 | } | ||
50 | |||
44 | down_read(&uts_sem); | 51 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 52 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); | 53 | ns->user_ns = get_user_ns(user_ns); |
47 | up_read(&uts_sem); | 54 | up_read(&uts_sem); |
48 | return ns; | 55 | return ns; |
49 | } | 56 | } |
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
55 | * versa. | 62 | * versa. |
56 | */ | 63 | */ |
57 | struct uts_namespace *copy_utsname(unsigned long flags, | 64 | struct uts_namespace *copy_utsname(unsigned long flags, |
58 | struct task_struct *tsk) | 65 | struct user_namespace *user_ns, struct uts_namespace *old_ns) |
59 | { | 66 | { |
60 | struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; | ||
61 | struct uts_namespace *new_ns; | 67 | struct uts_namespace *new_ns; |
62 | 68 | ||
63 | BUG_ON(!old_ns); | 69 | BUG_ON(!old_ns); |
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, | |||
66 | if (!(flags & CLONE_NEWUTS)) | 72 | if (!(flags & CLONE_NEWUTS)) |
67 | return old_ns; | 73 | return old_ns; |
68 | 74 | ||
69 | new_ns = clone_uts_ns(tsk, old_ns); | 75 | new_ns = clone_uts_ns(user_ns, old_ns); |
70 | 76 | ||
71 | put_uts_ns(old_ns); | 77 | put_uts_ns(old_ns); |
72 | return new_ns; | 78 | return new_ns; |
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) | |||
78 | 84 | ||
79 | ns = container_of(kref, struct uts_namespace, kref); | 85 | ns = container_of(kref, struct uts_namespace, kref); |
80 | put_user_ns(ns->user_ns); | 86 | put_user_ns(ns->user_ns); |
87 | proc_free_inum(ns->proc_inum); | ||
81 | kfree(ns); | 88 | kfree(ns); |
82 | } | 89 | } |
83 | 90 | ||
@@ -102,19 +109,32 @@ static void utsns_put(void *ns) | |||
102 | put_uts_ns(ns); | 109 | put_uts_ns(ns); |
103 | } | 110 | } |
104 | 111 | ||
105 | static int utsns_install(struct nsproxy *nsproxy, void *ns) | 112 | static int utsns_install(struct nsproxy *nsproxy, void *new) |
106 | { | 113 | { |
114 | struct uts_namespace *ns = new; | ||
115 | |||
116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | ||
117 | !nsown_capable(CAP_SYS_ADMIN)) | ||
118 | return -EPERM; | ||
119 | |||
107 | get_uts_ns(ns); | 120 | get_uts_ns(ns); |
108 | put_uts_ns(nsproxy->uts_ns); | 121 | put_uts_ns(nsproxy->uts_ns); |
109 | nsproxy->uts_ns = ns; | 122 | nsproxy->uts_ns = ns; |
110 | return 0; | 123 | return 0; |
111 | } | 124 | } |
112 | 125 | ||
126 | static unsigned int utsns_inum(void *vp) | ||
127 | { | ||
128 | struct uts_namespace *ns = vp; | ||
129 | |||
130 | return ns->proc_inum; | ||
131 | } | ||
132 | |||
113 | const struct proc_ns_operations utsns_operations = { | 133 | const struct proc_ns_operations utsns_operations = { |
114 | .name = "uts", | 134 | .name = "uts", |
115 | .type = CLONE_NEWUTS, | 135 | .type = CLONE_NEWUTS, |
116 | .get = utsns_get, | 136 | .get = utsns_get, |
117 | .put = utsns_put, | 137 | .put = utsns_put, |
118 | .install = utsns_install, | 138 | .install = utsns_install, |
139 | .inum = utsns_inum, | ||
119 | }; | 140 | }; |
120 | |||
diff --git a/kernel/wait.c b/kernel/wait.c index 7fdd9eaca2c3..6698e0c04ead 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Generic waiting primitives. | 2 | * Generic waiting primitives. |
3 | * | 3 | * |
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c8c21be11ab4..75a2ab3d0b02 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -31,6 +31,7 @@ | |||
31 | int watchdog_enabled = 1; | 31 | int watchdog_enabled = 1; |
32 | int __read_mostly watchdog_thresh = 10; | 32 | int __read_mostly watchdog_thresh = 10; |
33 | static int __read_mostly watchdog_disabled; | 33 | static int __read_mostly watchdog_disabled; |
34 | static u64 __read_mostly sample_period; | ||
34 | 35 | ||
35 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 36 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
36 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 37 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
@@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) | |||
116 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | 117 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
117 | } | 118 | } |
118 | 119 | ||
119 | static u64 get_sample_period(void) | 120 | static void set_sample_period(void) |
120 | { | 121 | { |
121 | /* | 122 | /* |
122 | * convert watchdog_thresh from seconds to ns | 123 | * convert watchdog_thresh from seconds to ns |
@@ -125,7 +126,7 @@ static u64 get_sample_period(void) | |||
125 | * and hard thresholds) to increment before the | 126 | * and hard thresholds) to increment before the |
126 | * hardlockup detector generates a warning | 127 | * hardlockup detector generates a warning |
127 | */ | 128 | */ |
128 | return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); | 129 | sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); |
129 | } | 130 | } |
130 | 131 | ||
131 | /* Commands for resetting the watchdog */ | 132 | /* Commands for resetting the watchdog */ |
@@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
275 | wake_up_process(__this_cpu_read(softlockup_watchdog)); | 276 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
276 | 277 | ||
277 | /* .. and repeat */ | 278 | /* .. and repeat */ |
278 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 279 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); |
279 | 280 | ||
280 | if (touch_ts == 0) { | 281 | if (touch_ts == 0) { |
281 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { | 282 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
@@ -343,6 +344,10 @@ static void watchdog_enable(unsigned int cpu) | |||
343 | { | 344 | { |
344 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 345 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
345 | 346 | ||
347 | /* kick off the timer for the hardlockup detector */ | ||
348 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
349 | hrtimer->function = watchdog_timer_fn; | ||
350 | |||
346 | if (!watchdog_enabled) { | 351 | if (!watchdog_enabled) { |
347 | kthread_park(current); | 352 | kthread_park(current); |
348 | return; | 353 | return; |
@@ -351,12 +356,8 @@ static void watchdog_enable(unsigned int cpu) | |||
351 | /* Enable the perf event */ | 356 | /* Enable the perf event */ |
352 | watchdog_nmi_enable(cpu); | 357 | watchdog_nmi_enable(cpu); |
353 | 358 | ||
354 | /* kick off the timer for the hardlockup detector */ | ||
355 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
356 | hrtimer->function = watchdog_timer_fn; | ||
357 | |||
358 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | 359 | /* done here because hrtimer_start can only pin to smp_processor_id() */ |
359 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | 360 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), |
360 | HRTIMER_MODE_REL_PINNED); | 361 | HRTIMER_MODE_REL_PINNED); |
361 | 362 | ||
362 | /* initialize timestamp */ | 363 | /* initialize timestamp */ |
@@ -368,9 +369,6 @@ static void watchdog_disable(unsigned int cpu) | |||
368 | { | 369 | { |
369 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 370 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
370 | 371 | ||
371 | if (!watchdog_enabled) | ||
372 | return; | ||
373 | |||
374 | watchdog_set_prio(SCHED_NORMAL, 0); | 372 | watchdog_set_prio(SCHED_NORMAL, 0); |
375 | hrtimer_cancel(hrtimer); | 373 | hrtimer_cancel(hrtimer); |
376 | /* disable the perf event */ | 374 | /* disable the perf event */ |
@@ -386,7 +384,7 @@ static int watchdog_should_run(unsigned int cpu) | |||
386 | /* | 384 | /* |
387 | * The watchdog thread function - touches the timestamp. | 385 | * The watchdog thread function - touches the timestamp. |
388 | * | 386 | * |
389 | * It only runs once every get_sample_period() seconds (4 seconds by | 387 | * It only runs once every sample_period seconds (4 seconds by |
390 | * default) to reset the softlockup timestamp. If this gets delayed | 388 | * default) to reset the softlockup timestamp. If this gets delayed |
391 | * for more than 2*watchdog_thresh seconds then the debug-printout | 389 | * for more than 2*watchdog_thresh seconds then the debug-printout |
392 | * triggers in watchdog_timer_fn(). | 390 | * triggers in watchdog_timer_fn(). |
@@ -519,6 +517,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
519 | if (ret || !write) | 517 | if (ret || !write) |
520 | return ret; | 518 | return ret; |
521 | 519 | ||
520 | set_sample_period(); | ||
522 | if (watchdog_enabled && watchdog_thresh) | 521 | if (watchdog_enabled && watchdog_thresh) |
523 | watchdog_enable_all_cpus(); | 522 | watchdog_enable_all_cpus(); |
524 | else | 523 | else |
@@ -540,6 +539,7 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
540 | 539 | ||
541 | void __init lockup_detector_init(void) | 540 | void __init lockup_detector_init(void) |
542 | { | 541 | { |
542 | set_sample_period(); | ||
543 | if (smpboot_register_percpu_thread(&watchdog_threads)) { | 543 | if (smpboot_register_percpu_thread(&watchdog_threads)) { |
544 | pr_err("Failed to create watchdog threads, disabled\n"); | 544 | pr_err("Failed to create watchdog threads, disabled\n"); |
545 | watchdog_disabled = -ENODEV; | 545 | watchdog_disabled = -ENODEV; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1dae900df798..fbc6576a83c3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -739,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
739 | { | 739 | { |
740 | struct worker *worker = kthread_data(task); | 740 | struct worker *worker = kthread_data(task); |
741 | 741 | ||
742 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 742 | if (!(worker->flags & WORKER_NOT_RUNNING)) { |
743 | WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); | ||
743 | atomic_inc(get_pool_nr_running(worker->pool)); | 744 | atomic_inc(get_pool_nr_running(worker->pool)); |
745 | } | ||
744 | } | 746 | } |
745 | 747 | ||
746 | /** | 748 | /** |
@@ -3485,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work) | |||
3485 | unsigned int ret = 0; | 3487 | unsigned int ret = 0; |
3486 | 3488 | ||
3487 | if (!gcwq) | 3489 | if (!gcwq) |
3488 | return false; | 3490 | return 0; |
3489 | 3491 | ||
3490 | spin_lock_irqsave(&gcwq->lock, flags); | 3492 | spin_lock_irqsave(&gcwq->lock, flags); |
3491 | 3493 | ||