Diffstat (limited to 'kernel')
87 files changed, 4073 insertions, 2445 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..ac0d533eb7de 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | |||
110 | obj-$(CONFIG_PADATA) += padata.o | 110 | obj-$(CONFIG_PADATA) += padata.o |
111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 111 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 112 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
113 | obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o | ||
113 | 114 | ||
114 | $(obj)/configs.o: $(obj)/config_data.h | 115 | $(obj)/configs.o: $(obj)/config_data.h |
115 | 116 | ||
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2f186ed80c40..e37e6a12c5e3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ struct audit_context { | |||
200 | struct list_head names_list; /* anchor for struct audit_names->list */ | 200 | struct list_head names_list; /* anchor for struct audit_names->list */ |
201 | char * filterkey; /* key for rule that triggered record */ | 201 | char * filterkey; /* key for rule that triggered record */ |
202 | struct path pwd; | 202 | struct path pwd; |
203 | struct audit_context *previous; /* For nested syscalls */ | ||
204 | struct audit_aux_data *aux; | 203 | struct audit_aux_data *aux; |
205 | struct audit_aux_data *aux_pids; | 204 | struct audit_aux_data *aux_pids; |
206 | struct sockaddr_storage *sockaddr; | 205 | struct sockaddr_storage *sockaddr; |
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk) | |||
1091 | 1090 | ||
1092 | static inline void audit_free_context(struct audit_context *context) | 1091 | static inline void audit_free_context(struct audit_context *context) |
1093 | { | 1092 | { |
1094 | struct audit_context *previous; | 1093 | audit_free_names(context); |
1095 | int count = 0; | 1094 | unroll_tree_refs(context, NULL, 0); |
1096 | 1095 | free_tree_refs(context); | |
1097 | do { | 1096 | audit_free_aux(context); |
1098 | previous = context->previous; | 1097 | kfree(context->filterkey); |
1099 | if (previous || (count && count < 10)) { | 1098 | kfree(context->sockaddr); |
1100 | ++count; | 1099 | kfree(context); |
1101 | printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" | ||
1102 | " freeing multiple contexts (%d)\n", | ||
1103 | context->serial, context->major, | ||
1104 | context->name_count, count); | ||
1105 | } | ||
1106 | audit_free_names(context); | ||
1107 | unroll_tree_refs(context, NULL, 0); | ||
1108 | free_tree_refs(context); | ||
1109 | audit_free_aux(context); | ||
1110 | kfree(context->filterkey); | ||
1111 | kfree(context->sockaddr); | ||
1112 | kfree(context); | ||
1113 | context = previous; | ||
1114 | } while (context); | ||
1115 | if (count >= 10) | ||
1116 | printk(KERN_ERR "audit: freed %d contexts\n", count); | ||
1117 | } | 1100 | } |
1118 | 1101 | ||
1119 | void audit_log_task_context(struct audit_buffer *ab) | 1102 | void audit_log_task_context(struct audit_buffer *ab) |
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1159 | cred = current_cred(); | 1142 | cred = current_cred(); |
1160 | 1143 | ||
1161 | spin_lock_irq(&tsk->sighand->siglock); | 1144 | spin_lock_irq(&tsk->sighand->siglock); |
1162 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 1145 | if (tsk->signal && tsk->signal->tty) |
1163 | tty = tsk->signal->tty->name; | 1146 | tty = tsk->signal->tty->name; |
1164 | else | 1147 | else |
1165 | tty = "(none)"; | 1148 | tty = "(none)"; |
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major, | |||
1783 | if (!context) | 1766 | if (!context) |
1784 | return; | 1767 | return; |
1785 | 1768 | ||
1786 | /* | ||
1787 | * This happens only on certain architectures that make system | ||
1788 | * calls in kernel_thread via the entry.S interface, instead of | ||
1789 | * with direct calls. (If you are porting to a new | ||
1790 | * architecture, hitting this condition can indicate that you | ||
1791 | * got the _exit/_leave calls backward in entry.S.) | ||
1792 | * | ||
1793 | * i386 no | ||
1794 | * x86_64 no | ||
1795 | * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) | ||
1796 | * | ||
1797 | * This also happens with vm86 emulation in a non-nested manner | ||
1798 | * (entries without exits), so this case must be caught. | ||
1799 | */ | ||
1800 | if (context->in_syscall) { | ||
1801 | struct audit_context *newctx; | ||
1802 | |||
1803 | #if AUDIT_DEBUG | ||
1804 | printk(KERN_ERR | ||
1805 | "audit(:%d) pid=%d in syscall=%d;" | ||
1806 | " entering syscall=%d\n", | ||
1807 | context->serial, tsk->pid, context->major, major); | ||
1808 | #endif | ||
1809 | newctx = audit_alloc_context(context->state); | ||
1810 | if (newctx) { | ||
1811 | newctx->previous = context; | ||
1812 | context = newctx; | ||
1813 | tsk->audit_context = newctx; | ||
1814 | } else { | ||
1815 | /* If we can't alloc a new context, the best we | ||
1816 | * can do is to leak memory (any pending putname | ||
1817 | * will be lost). The only other alternative is | ||
1818 | * to abandon auditing. */ | ||
1819 | audit_zero_context(context, context->state); | ||
1820 | } | ||
1821 | } | ||
1822 | BUG_ON(context->in_syscall || context->name_count); | 1769 | BUG_ON(context->in_syscall || context->name_count); |
1823 | 1770 | ||
1824 | if (!audit_enabled) | 1771 | if (!audit_enabled) |
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code) | |||
1881 | if (!list_empty(&context->killed_trees)) | 1828 | if (!list_empty(&context->killed_trees)) |
1882 | audit_kill_trees(&context->killed_trees); | 1829 | audit_kill_trees(&context->killed_trees); |
1883 | 1830 | ||
1884 | if (context->previous) { | 1831 | audit_free_names(context); |
1885 | struct audit_context *new_context = context->previous; | 1832 | unroll_tree_refs(context, NULL, 0); |
1886 | context->previous = NULL; | 1833 | audit_free_aux(context); |
1887 | audit_free_context(context); | 1834 | context->aux = NULL; |
1888 | tsk->audit_context = new_context; | 1835 | context->aux_pids = NULL; |
1889 | } else { | 1836 | context->target_pid = 0; |
1890 | audit_free_names(context); | 1837 | context->target_sid = 0; |
1891 | unroll_tree_refs(context, NULL, 0); | 1838 | context->sockaddr_len = 0; |
1892 | audit_free_aux(context); | 1839 | context->type = 0; |
1893 | context->aux = NULL; | 1840 | context->fds[0] = -1; |
1894 | context->aux_pids = NULL; | 1841 | if (context->state != AUDIT_RECORD_CONTEXT) { |
1895 | context->target_pid = 0; | 1842 | kfree(context->filterkey); |
1896 | context->target_sid = 0; | 1843 | context->filterkey = NULL; |
1897 | context->sockaddr_len = 0; | ||
1898 | context->type = 0; | ||
1899 | context->fds[0] = -1; | ||
1900 | if (context->state != AUDIT_RECORD_CONTEXT) { | ||
1901 | kfree(context->filterkey); | ||
1902 | context->filterkey = NULL; | ||
1903 | } | ||
1904 | tsk->audit_context = context; | ||
1905 | } | 1844 | } |
1845 | tsk->audit_context = context; | ||
1906 | } | 1846 | } |
1907 | 1847 | ||
1908 | static inline void handle_one(const struct inode *inode) | 1848 | static inline void handle_one(const struct inode *inode) |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0dbfba2efa77..4855892798fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root { | |||
138 | /* Hierarchy-specific flags */ | 138 | /* Hierarchy-specific flags */ |
139 | unsigned long flags; | 139 | unsigned long flags; |
140 | 140 | ||
141 | /* IDs for cgroups in this hierarchy */ | ||
142 | struct ida cgroup_ida; | ||
143 | |||
141 | /* The path to use for release notifications. */ | 144 | /* The path to use for release notifications. */ |
142 | char release_agent_path[PATH_MAX]; | 145 | char release_agent_path[PATH_MAX]; |
143 | 146 | ||
@@ -171,8 +174,8 @@ struct css_id { | |||
171 | * The css to which this ID points. This pointer is set to valid value | 174 | * The css to which this ID points. This pointer is set to valid value |
172 | * after cgroup is populated. If cgroup is removed, this will be NULL. | 175 | * after cgroup is populated. If cgroup is removed, this will be NULL. |
173 | * This pointer is expected to be RCU-safe because destroy() | 176 | * This pointer is expected to be RCU-safe because destroy() |
174 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 177 | * is called after synchronize_rcu(). But for safe use, css_tryget() |
175 | * css_tryget() should be used for avoiding race. | 178 | * should be used for avoiding race. |
176 | */ | 179 | */ |
177 | struct cgroup_subsys_state __rcu *css; | 180 | struct cgroup_subsys_state __rcu *css; |
178 | /* | 181 | /* |
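The reworded comment above points at the usual lookup pattern: dereference id->css under RCU, then pin the result with css_tryget() before using it. A minimal sketch of that pattern (not part of the patch; assumes the caller already has a struct css_id pointer):

	static struct cgroup_subsys_state *css_from_id_example(struct css_id *id)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = rcu_dereference(id->css);
		if (css && !css_tryget(css))
			css = NULL;		/* raced with cgroup removal */
		rcu_read_unlock();

		return css;			/* caller drops the reference with css_put() */
	}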
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
242 | */ | 245 | */ |
243 | static int need_forkexit_callback __read_mostly; | 246 | static int need_forkexit_callback __read_mostly; |
244 | 247 | ||
248 | static int cgroup_destroy_locked(struct cgroup *cgrp); | ||
249 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | ||
250 | struct cftype cfts[], bool is_add); | ||
251 | |||
245 | #ifdef CONFIG_PROVE_LOCKING | 252 | #ifdef CONFIG_PROVE_LOCKING |
246 | int cgroup_lock_is_held(void) | 253 | int cgroup_lock_is_held(void) |
247 | { | 254 | { |
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
294 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 301 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
295 | } | 302 | } |
296 | 303 | ||
297 | static int clone_children(const struct cgroup *cgrp) | ||
298 | { | ||
299 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
300 | } | ||
301 | |||
302 | /* | 304 | /* |
303 | * for_each_subsys() allows you to iterate on each subsystem attached to | 305 | * for_each_subsys() allows you to iterate on each subsystem attached to |
304 | * an active hierarchy | 306 | * an active hierarchy |
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
782 | * The task_lock() exception | 784 | * The task_lock() exception |
783 | * | 785 | * |
784 | * The need for this exception arises from the action of | 786 | * The need for this exception arises from the action of |
785 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with | 787 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
786 | * another. It does so using cgroup_mutex, however there are | 788 | * another. It does so using cgroup_mutex, however there are |
787 | * several performance critical places that need to reference | 789 | * several performance critical places that need to reference |
788 | * task->cgroup without the expense of grabbing a system global | 790 | * task->cgroup without the expense of grabbing a system global |
789 | * mutex. Therefore except as noted below, when dereferencing or, as | 791 | * mutex. Therefore except as noted below, when dereferencing or, as |
790 | * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use | 792 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
791 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 793 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
792 | * the task_struct routinely used for such matters. | 794 | * the task_struct routinely used for such matters. |
793 | * | 795 | * |
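As the comment block above spells out, a task's cgroup pointer may be read with cgroup_mutex held, under rcu_read_lock(), or under task_lock() on that task, since cgroup_attach_task() takes task_lock() when rewriting it. A short sketch of the task_lock() variant (cpuset_subsys_id is used purely as an example subsystem id):

	/* sketch: snapshot @tsk's cpuset cgroup without taking cgroup_mutex */
	static struct cgroup *get_task_cpuset_cgroup(struct task_struct *tsk)
	{
		struct cgroup *cgrp;

		task_lock(tsk);
		cgrp = task_cgroup(tsk, cpuset_subsys_id);
		task_unlock(tsk);

		return cgrp;	/* only a snapshot; may change once the lock is dropped */
	}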
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
854 | return inode; | 856 | return inode; |
855 | } | 857 | } |
856 | 858 | ||
857 | /* | ||
858 | * Call subsys's pre_destroy handler. | ||
859 | * This is called before css refcnt check. | ||
860 | */ | ||
861 | static int cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
862 | { | ||
863 | struct cgroup_subsys *ss; | ||
864 | int ret = 0; | ||
865 | |||
866 | for_each_subsys(cgrp->root, ss) { | ||
867 | if (!ss->pre_destroy) | ||
868 | continue; | ||
869 | |||
870 | ret = ss->pre_destroy(cgrp); | ||
871 | if (ret) { | ||
872 | /* ->pre_destroy() failure is being deprecated */ | ||
873 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
874 | break; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
882 | { | 860 | { |
883 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 861 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
898 | * Release the subsystem state objects. | 876 | * Release the subsystem state objects. |
899 | */ | 877 | */ |
900 | for_each_subsys(cgrp->root, ss) | 878 | for_each_subsys(cgrp->root, ss) |
901 | ss->destroy(cgrp); | 879 | ss->css_free(cgrp); |
902 | 880 | ||
903 | cgrp->root->number_of_cgroups--; | 881 | cgrp->root->number_of_cgroups--; |
904 | mutex_unlock(&cgroup_mutex); | 882 | mutex_unlock(&cgroup_mutex); |
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
917 | 895 | ||
918 | simple_xattrs_free(&cgrp->xattrs); | 896 | simple_xattrs_free(&cgrp->xattrs); |
919 | 897 | ||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
920 | kfree_rcu(cgrp, rcu_head); | 899 | kfree_rcu(cgrp, rcu_head); |
921 | } else { | 900 | } else { |
922 | struct cfent *cfe = __d_cfe(dentry); | 901 | struct cfent *cfe = __d_cfe(dentry); |
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
987 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 966 | if (!test_bit(ss->subsys_id, &subsys_mask)) |
988 | continue; | 967 | continue; |
989 | list_for_each_entry(set, &ss->cftsets, node) | 968 | list_for_each_entry(set, &ss->cftsets, node) |
990 | cgroup_rm_file(cgrp, set->cfts); | 969 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); |
991 | } | 970 | } |
992 | if (base_files) { | 971 | if (base_files) { |
993 | while (!list_empty(&cgrp->files)) | 972 | while (!list_empty(&cgrp->files)) |
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
1015 | } | 994 | } |
1016 | 995 | ||
1017 | /* | 996 | /* |
1018 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
1019 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
1020 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
1021 | * to zero, soon. | ||
1022 | * | ||
1023 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
1024 | */ | ||
1025 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
1026 | |||
1027 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
1028 | { | ||
1029 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
1030 | wake_up_all(&cgroup_rmdir_waitq); | ||
1031 | } | ||
1032 | |||
1033 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
1034 | { | ||
1035 | css_get(css); | ||
1036 | } | ||
1037 | |||
1038 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
1039 | { | ||
1040 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
1041 | css_put(css); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 997 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
1046 | * any duplicate ones that parse_cgroupfs_options took. If this function | 998 | * any duplicate ones that parse_cgroupfs_options took. If this function |
1047 | * returns an error, no reference counts are touched. | 999 | * returns an error, no reference counts are touched. |
@@ -1150,7 +1102,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1150 | seq_puts(seq, ",xattr"); | 1102 | seq_puts(seq, ",xattr"); |
1151 | if (strlen(root->release_agent_path)) | 1103 | if (strlen(root->release_agent_path)) |
1152 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1104 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1153 | if (clone_children(&root->top_cgroup)) | 1105 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) |
1154 | seq_puts(seq, ",clone_children"); | 1106 | seq_puts(seq, ",clone_children"); |
1155 | if (strlen(root->name)) | 1107 | if (strlen(root->name)) |
1156 | seq_printf(seq, ",name=%s", root->name); | 1108 | seq_printf(seq, ",name=%s", root->name); |
@@ -1162,7 +1114,7 @@ struct cgroup_sb_opts { | |||
1162 | unsigned long subsys_mask; | 1114 | unsigned long subsys_mask; |
1163 | unsigned long flags; | 1115 | unsigned long flags; |
1164 | char *release_agent; | 1116 | char *release_agent; |
1165 | bool clone_children; | 1117 | bool cpuset_clone_children; |
1166 | char *name; | 1118 | char *name; |
1167 | /* User explicitly requested empty subsystem */ | 1119 | /* User explicitly requested empty subsystem */ |
1168 | bool none; | 1120 | bool none; |
@@ -1213,7 +1165,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1213 | continue; | 1165 | continue; |
1214 | } | 1166 | } |
1215 | if (!strcmp(token, "clone_children")) { | 1167 | if (!strcmp(token, "clone_children")) { |
1216 | opts->clone_children = true; | 1168 | opts->cpuset_clone_children = true; |
1217 | continue; | 1169 | continue; |
1218 | } | 1170 | } |
1219 | if (!strcmp(token, "xattr")) { | 1171 | if (!strcmp(token, "xattr")) { |
@@ -1381,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1381 | if (ret) | 1333 | if (ret) |
1382 | goto out_unlock; | 1334 | goto out_unlock; |
1383 | 1335 | ||
1384 | /* See feature-removal-schedule.txt */ | ||
1385 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) | 1336 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) |
1386 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1337 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", |
1387 | task_tgid_nr(current), current->comm); | 1338 | task_tgid_nr(current), current->comm); |
@@ -1397,14 +1348,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1397 | goto out_unlock; | 1348 | goto out_unlock; |
1398 | } | 1349 | } |
1399 | 1350 | ||
1351 | /* | ||
1352 | * Clear out the files of subsystems that should be removed, do | ||
1353 | * this before rebind_subsystems, since rebind_subsystems may | ||
1354 | * change this hierarchy's subsys_list. | ||
1355 | */ | ||
1356 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1357 | |||
1400 | ret = rebind_subsystems(root, opts.subsys_mask); | 1358 | ret = rebind_subsystems(root, opts.subsys_mask); |
1401 | if (ret) { | 1359 | if (ret) { |
1360 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1361 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1402 | drop_parsed_module_refcounts(opts.subsys_mask); | 1362 | drop_parsed_module_refcounts(opts.subsys_mask); |
1403 | goto out_unlock; | 1363 | goto out_unlock; |
1404 | } | 1364 | } |
1405 | 1365 | ||
1406 | /* clear out any existing files and repopulate subsystem files */ | ||
1407 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1408 | /* re-populate subsystem files */ | 1366 | /* re-populate subsystem files */ |
1409 | cgroup_populate_dir(cgrp, false, added_mask); | 1367 | cgroup_populate_dir(cgrp, false, added_mask); |
1410 | 1368 | ||
@@ -1432,6 +1390,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1432 | INIT_LIST_HEAD(&cgrp->children); | 1390 | INIT_LIST_HEAD(&cgrp->children); |
1433 | INIT_LIST_HEAD(&cgrp->files); | 1391 | INIT_LIST_HEAD(&cgrp->files); |
1434 | INIT_LIST_HEAD(&cgrp->css_sets); | 1392 | INIT_LIST_HEAD(&cgrp->css_sets); |
1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | ||
1435 | INIT_LIST_HEAD(&cgrp->release_list); | 1394 | INIT_LIST_HEAD(&cgrp->release_list); |
1436 | INIT_LIST_HEAD(&cgrp->pidlists); | 1395 | INIT_LIST_HEAD(&cgrp->pidlists); |
1437 | mutex_init(&cgrp->pidlist_mutex); | 1396 | mutex_init(&cgrp->pidlist_mutex); |
@@ -1450,8 +1409,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1450 | root->number_of_cgroups = 1; | 1409 | root->number_of_cgroups = 1; |
1451 | cgrp->root = root; | 1410 | cgrp->root = root; |
1452 | cgrp->top_cgroup = cgrp; | 1411 | cgrp->top_cgroup = cgrp; |
1453 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1454 | init_cgroup_housekeeping(cgrp); | 1412 | init_cgroup_housekeeping(cgrp); |
1413 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1455 | } | 1414 | } |
1456 | 1415 | ||
1457 | static bool init_root_id(struct cgroupfs_root *root) | 1416 | static bool init_root_id(struct cgroupfs_root *root) |
@@ -1518,12 +1477,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1518 | 1477 | ||
1519 | root->subsys_mask = opts->subsys_mask; | 1478 | root->subsys_mask = opts->subsys_mask; |
1520 | root->flags = opts->flags; | 1479 | root->flags = opts->flags; |
1480 | ida_init(&root->cgroup_ida); | ||
1521 | if (opts->release_agent) | 1481 | if (opts->release_agent) |
1522 | strcpy(root->release_agent_path, opts->release_agent); | 1482 | strcpy(root->release_agent_path, opts->release_agent); |
1523 | if (opts->name) | 1483 | if (opts->name) |
1524 | strcpy(root->name, opts->name); | 1484 | strcpy(root->name, opts->name); |
1525 | if (opts->clone_children) | 1485 | if (opts->cpuset_clone_children) |
1526 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | 1486 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); |
1527 | return root; | 1487 | return root; |
1528 | } | 1488 | } |
1529 | 1489 | ||
@@ -1536,6 +1496,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) | |||
1536 | spin_lock(&hierarchy_id_lock); | 1496 | spin_lock(&hierarchy_id_lock); |
1537 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1497 | ida_remove(&hierarchy_ida, root->hierarchy_id); |
1538 | spin_unlock(&hierarchy_id_lock); | 1498 | spin_unlock(&hierarchy_id_lock); |
1499 | ida_destroy(&root->cgroup_ida); | ||
1539 | kfree(root); | 1500 | kfree(root); |
1540 | } | 1501 | } |
1541 | 1502 | ||
@@ -1701,7 +1662,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1701 | 1662 | ||
1702 | free_cg_links(&tmp_cg_links); | 1663 | free_cg_links(&tmp_cg_links); |
1703 | 1664 | ||
1704 | BUG_ON(!list_empty(&root_cgrp->sibling)); | ||
1705 | BUG_ON(!list_empty(&root_cgrp->children)); | 1665 | BUG_ON(!list_empty(&root_cgrp->children)); |
1706 | BUG_ON(root->number_of_cgroups != 1); | 1666 | BUG_ON(root->number_of_cgroups != 1); |
1707 | 1667 | ||
@@ -1750,7 +1710,6 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1750 | 1710 | ||
1751 | BUG_ON(root->number_of_cgroups != 1); | 1711 | BUG_ON(root->number_of_cgroups != 1); |
1752 | BUG_ON(!list_empty(&cgrp->children)); | 1712 | BUG_ON(!list_empty(&cgrp->children)); |
1753 | BUG_ON(!list_empty(&cgrp->sibling)); | ||
1754 | 1713 | ||
1755 | mutex_lock(&cgroup_mutex); | 1714 | mutex_lock(&cgroup_mutex); |
1756 | mutex_lock(&cgroup_root_mutex); | 1715 | mutex_lock(&cgroup_root_mutex); |
@@ -1808,9 +1767,11 @@ static struct kobject *cgroup_kobj; | |||
1808 | */ | 1767 | */ |
1809 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1768 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1810 | { | 1769 | { |
1770 | struct dentry *dentry = cgrp->dentry; | ||
1811 | char *start; | 1771 | char *start; |
1812 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1772 | |
1813 | cgroup_lock_is_held()); | 1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1774 | "cgroup_path() called without proper locking"); | ||
1814 | 1775 | ||
1815 | if (!dentry || cgrp == dummytop) { | 1776 | if (!dentry || cgrp == dummytop) { |
1816 | /* | 1777 | /* |
@@ -1821,9 +1782,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1821 | return 0; | 1782 | return 0; |
1822 | } | 1783 | } |
1823 | 1784 | ||
1824 | start = buf + buflen; | 1785 | start = buf + buflen - 1; |
1825 | 1786 | ||
1826 | *--start = '\0'; | 1787 | *start = '\0'; |
1827 | for (;;) { | 1788 | for (;;) { |
1828 | int len = dentry->d_name.len; | 1789 | int len = dentry->d_name.len; |
1829 | 1790 | ||
@@ -1834,8 +1795,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1834 | if (!cgrp) | 1795 | if (!cgrp) |
1835 | break; | 1796 | break; |
1836 | 1797 | ||
1837 | dentry = rcu_dereference_check(cgrp->dentry, | 1798 | dentry = cgrp->dentry; |
1838 | cgroup_lock_is_held()); | ||
1839 | if (!cgrp->parent) | 1799 | if (!cgrp->parent) |
1840 | continue; | 1800 | continue; |
1841 | if (--start < buf) | 1801 | if (--start < buf) |
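With the rewrite above, cgroup_path() no longer wraps its dentry accesses in rcu_dereference_check(); instead the new rcu_lockdep_assert() requires the caller to hold either rcu_read_lock() or cgroup_mutex. A usage sketch under RCU (buffer handling is illustrative):

	static void print_cgroup_path(struct cgroup *cgrp)
	{
		char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

		if (!buf)
			return;

		rcu_read_lock();
		if (!cgroup_path(cgrp, buf, PATH_MAX))
			pr_info("cgroup path: %s\n", buf);
		rcu_read_unlock();

		kfree(buf);
	}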
@@ -1930,9 +1890,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1930 | /* | 1890 | /* |
1931 | * cgroup_task_migrate - move a task from one cgroup to another. | 1891 | * cgroup_task_migrate - move a task from one cgroup to another. |
1932 | * | 1892 | * |
1933 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1893 | * Must be called with cgroup_mutex and threadgroup locked. |
1934 | * will already exist. If not set, this function might sleep, and can fail with | ||
1935 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. | ||
1936 | */ | 1894 | */ |
1937 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1895 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1938 | struct task_struct *tsk, struct css_set *newcg) | 1896 | struct task_struct *tsk, struct css_set *newcg) |
@@ -2025,12 +1983,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
2025 | } | 1983 | } |
2026 | 1984 | ||
2027 | synchronize_rcu(); | 1985 | synchronize_rcu(); |
2028 | |||
2029 | /* | ||
2030 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | ||
2031 | * is no longer empty. | ||
2032 | */ | ||
2033 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2034 | out: | 1986 | out: |
2035 | if (retval) { | 1987 | if (retval) { |
2036 | for_each_subsys(root, ss) { | 1988 | for_each_subsys(root, ss) { |
@@ -2200,7 +2152,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2200 | * step 5: success! and cleanup | 2152 | * step 5: success! and cleanup |
2201 | */ | 2153 | */ |
2202 | synchronize_rcu(); | 2154 | synchronize_rcu(); |
2203 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2204 | retval = 0; | 2155 | retval = 0; |
2205 | out_put_css_set_refs: | 2156 | out_put_css_set_refs: |
2206 | if (retval) { | 2157 | if (retval) { |
@@ -2711,10 +2662,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2711 | 2662 | ||
2712 | /* start off with i_nlink == 2 (for "." entry) */ | 2663 | /* start off with i_nlink == 2 (for "." entry) */ |
2713 | inc_nlink(inode); | 2664 | inc_nlink(inode); |
2665 | inc_nlink(dentry->d_parent->d_inode); | ||
2714 | 2666 | ||
2715 | /* start with the directory inode held, so that we can | 2667 | /* |
2716 | * populate it without racing with another mkdir */ | 2668 | * Control reaches here with cgroup_mutex held. |
2717 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 2669 | * @inode->i_mutex should nest outside cgroup_mutex but we |
2670 | * want to populate it immediately without releasing | ||
2671 | * cgroup_mutex. As @inode isn't visible to anyone else | ||
2672 | * yet, trylock will always succeed without affecting | ||
2673 | * lockdep checks. | ||
2674 | */ | ||
2675 | WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); | ||
2718 | } else if (S_ISREG(mode)) { | 2676 | } else if (S_ISREG(mode)) { |
2719 | inode->i_size = 0; | 2677 | inode->i_size = 0; |
2720 | inode->i_fop = &cgroup_file_operations; | 2678 | inode->i_fop = &cgroup_file_operations; |
@@ -2725,32 +2683,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
2725 | return 0; | 2683 | return 0; |
2726 | } | 2684 | } |
2727 | 2685 | ||
2728 | /* | ||
2729 | * cgroup_create_dir - create a directory for an object. | ||
2730 | * @cgrp: the cgroup we create the directory for. It must have a valid | ||
2731 | * ->parent field. And we are going to fill its ->dentry field. | ||
2732 | * @dentry: dentry of the new cgroup | ||
2733 | * @mode: mode to set on new directory. | ||
2734 | */ | ||
2735 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | ||
2736 | umode_t mode) | ||
2737 | { | ||
2738 | struct dentry *parent; | ||
2739 | int error = 0; | ||
2740 | |||
2741 | parent = cgrp->parent->dentry; | ||
2742 | error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); | ||
2743 | if (!error) { | ||
2744 | dentry->d_fsdata = cgrp; | ||
2745 | inc_nlink(parent->d_inode); | ||
2746 | rcu_assign_pointer(cgrp->dentry, dentry); | ||
2747 | dget(dentry); | ||
2748 | } | ||
2749 | dput(dentry); | ||
2750 | |||
2751 | return error; | ||
2752 | } | ||
2753 | |||
2754 | /** | 2686 | /** |
2755 | * cgroup_file_mode - deduce file mode of a control file | 2687 | * cgroup_file_mode - deduce file mode of a control file |
2756 | * @cft: the control file in question | 2688 | * @cft: the control file in question |
@@ -2791,12 +2723,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2791 | 2723 | ||
2792 | simple_xattrs_init(&cft->xattrs); | 2724 | simple_xattrs_init(&cft->xattrs); |
2793 | 2725 | ||
2794 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2795 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2796 | return 0; | ||
2797 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2798 | return 0; | ||
2799 | |||
2800 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2726 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2801 | strcpy(name, subsys->name); | 2727 | strcpy(name, subsys->name); |
2802 | strcat(name, "."); | 2728 | strcat(name, "."); |
@@ -2837,6 +2763,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2837 | int err, ret = 0; | 2763 | int err, ret = 0; |
2838 | 2764 | ||
2839 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2765 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2766 | /* does cft->flags tell us to skip this file on @cgrp? */ | ||
2767 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2768 | continue; | ||
2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2770 | continue; | ||
2771 | |||
2840 | if (is_add) | 2772 | if (is_add) |
2841 | err = cgroup_add_file(cgrp, subsys, cft); | 2773 | err = cgroup_add_file(cgrp, subsys, cft); |
2842 | else | 2774 | else |
@@ -3044,6 +2976,92 @@ static void cgroup_enable_task_cg_lists(void) | |||
3044 | write_unlock(&css_set_lock); | 2976 | write_unlock(&css_set_lock); |
3045 | } | 2977 | } |
3046 | 2978 | ||
2979 | /** | ||
2980 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | ||
2981 | * @pos: the current position (%NULL to initiate traversal) | ||
2982 | * @cgroup: cgroup whose descendants to walk | ||
2983 | * | ||
2984 | * To be used by cgroup_for_each_descendant_pre(). Find the next | ||
2985 | * descendant to visit for pre-order traversal of @cgroup's descendants. | ||
2986 | */ | ||
2987 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | ||
2988 | struct cgroup *cgroup) | ||
2989 | { | ||
2990 | struct cgroup *next; | ||
2991 | |||
2992 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
2993 | |||
2994 | /* if first iteration, pretend we just visited @cgroup */ | ||
2995 | if (!pos) { | ||
2996 | if (list_empty(&cgroup->children)) | ||
2997 | return NULL; | ||
2998 | pos = cgroup; | ||
2999 | } | ||
3000 | |||
3001 | /* visit the first child if exists */ | ||
3002 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | ||
3003 | if (next) | ||
3004 | return next; | ||
3005 | |||
3006 | /* no child, visit my or the closest ancestor's next sibling */ | ||
3007 | do { | ||
3008 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | ||
3009 | sibling); | ||
3010 | if (&next->sibling != &pos->parent->children) | ||
3011 | return next; | ||
3012 | |||
3013 | pos = pos->parent; | ||
3014 | } while (pos != cgroup); | ||
3015 | |||
3016 | return NULL; | ||
3017 | } | ||
3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | ||
3019 | |||
3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | ||
3021 | { | ||
3022 | struct cgroup *last; | ||
3023 | |||
3024 | do { | ||
3025 | last = pos; | ||
3026 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | ||
3027 | sibling); | ||
3028 | } while (pos); | ||
3029 | |||
3030 | return last; | ||
3031 | } | ||
3032 | |||
3033 | /** | ||
3034 | * cgroup_next_descendant_post - find the next descendant for post-order walk | ||
3035 | * @pos: the current position (%NULL to initiate traversal) | ||
3036 | * @cgroup: cgroup whose descendants to walk | ||
3037 | * | ||
3038 | * To be used by cgroup_for_each_descendant_post(). Find the next | ||
3039 | * descendant to visit for post-order traversal of @cgroup's descendants. | ||
3040 | */ | ||
3041 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | ||
3042 | struct cgroup *cgroup) | ||
3043 | { | ||
3044 | struct cgroup *next; | ||
3045 | |||
3046 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3047 | |||
3048 | /* if first iteration, visit the leftmost descendant */ | ||
3049 | if (!pos) { | ||
3050 | next = cgroup_leftmost_descendant(cgroup); | ||
3051 | return next != cgroup ? next : NULL; | ||
3052 | } | ||
3053 | |||
3054 | /* if there's an unvisited sibling, visit its leftmost descendant */ | ||
3055 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3056 | if (&next->sibling != &pos->parent->children) | ||
3057 | return cgroup_leftmost_descendant(next); | ||
3058 | |||
3059 | /* no sibling left, visit parent */ | ||
3060 | next = pos->parent; | ||
3061 | return next != cgroup ? next : NULL; | ||
3062 | } | ||
3063 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3064 | |||
3047 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3065 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
3048 | __acquires(css_set_lock) | 3066 | __acquires(css_set_lock) |
3049 | { | 3067 | { |
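The two iterators added above are meant to be driven in a loop under rcu_read_lock(), as the WARN_ON_ONCE() checks enforce; the convenience macros named in the kerneldoc (cgroup_for_each_descendant_pre() and the post-order variant) are expected to live in include/linux/cgroup.h and are not part of this kernel/-only diff. A direct usage sketch of the exported helper:

	/* pre-order walk of every descendant of @cgrp */
	struct cgroup *pos = NULL;

	rcu_read_lock();
	while ((pos = cgroup_next_descendant_pre(pos, cgrp)))
		pr_debug("visiting %s\n", pos->dentry->d_name.name);	/* illustrative action */
	rcu_read_unlock();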
@@ -3757,7 +3775,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3757 | if (flags & POLLHUP) { | 3775 | if (flags & POLLHUP) { |
3758 | __remove_wait_queue(event->wqh, &event->wait); | 3776 | __remove_wait_queue(event->wqh, &event->wait); |
3759 | spin_lock(&cgrp->event_list_lock); | 3777 | spin_lock(&cgrp->event_list_lock); |
3760 | list_del(&event->list); | 3778 | list_del_init(&event->list); |
3761 | spin_unlock(&cgrp->event_list_lock); | 3779 | spin_unlock(&cgrp->event_list_lock); |
3762 | /* | 3780 | /* |
3763 | * We are in atomic context, but cgroup_event_remove() may | 3781 | * We are in atomic context, but cgroup_event_remove() may |
@@ -3894,7 +3912,7 @@ fail: | |||
3894 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 3912 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, |
3895 | struct cftype *cft) | 3913 | struct cftype *cft) |
3896 | { | 3914 | { |
3897 | return clone_children(cgrp); | 3915 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3898 | } | 3916 | } |
3899 | 3917 | ||
3900 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 3918 | static int cgroup_clone_children_write(struct cgroup *cgrp, |
@@ -3902,9 +3920,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3902 | u64 val) | 3920 | u64 val) |
3903 | { | 3921 | { |
3904 | if (val) | 3922 | if (val) |
3905 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3923 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3906 | else | 3924 | else |
3907 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3925 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
3908 | return 0; | 3926 | return 0; |
3909 | } | 3927 | } |
3910 | 3928 | ||
@@ -4017,19 +4035,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4017 | css->flags = 0; | 4035 | css->flags = 0; |
4018 | css->id = NULL; | 4036 | css->id = NULL; |
4019 | if (cgrp == dummytop) | 4037 | if (cgrp == dummytop) |
4020 | set_bit(CSS_ROOT, &css->flags); | 4038 | css->flags |= CSS_ROOT; |
4021 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4039 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
4022 | cgrp->subsys[ss->subsys_id] = css; | 4040 | cgrp->subsys[ss->subsys_id] = css; |
4023 | 4041 | ||
4024 | /* | 4042 | /* |
4025 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | 4043 | * css holds an extra ref to @cgrp->dentry which is put on the last |
4026 | * which is put on the last css_put(). dput() requires process | 4044 | * css_put(). dput() requires process context, which css_put() may |
4027 | * context, which css_put() may be called without. @css->dput_work | 4045 | * be called without. @css->dput_work will be used to invoke |
4028 | * will be used to invoke dput() asynchronously from css_put(). | 4046 | * dput() asynchronously from css_put(). |
4029 | */ | 4047 | */ |
4030 | INIT_WORK(&css->dput_work, css_dput_fn); | 4048 | INIT_WORK(&css->dput_work, css_dput_fn); |
4031 | if (ss->__DEPRECATED_clear_css_refs) | 4049 | } |
4032 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | 4050 | |
4051 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | ||
4052 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4053 | { | ||
4054 | int ret = 0; | ||
4055 | |||
4056 | lockdep_assert_held(&cgroup_mutex); | ||
4057 | |||
4058 | if (ss->css_online) | ||
4059 | ret = ss->css_online(cgrp); | ||
4060 | if (!ret) | ||
4061 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | ||
4062 | return ret; | ||
4063 | } | ||
4064 | |||
4065 | /* if the CSS is online, invoke ->pre_destroy() on it and mark it offline */ | ||
4066 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
4067 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4068 | { | ||
4069 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4070 | |||
4071 | lockdep_assert_held(&cgroup_mutex); | ||
4072 | |||
4073 | if (!(css->flags & CSS_ONLINE)) | ||
4074 | return; | ||
4075 | |||
4076 | /* | ||
4077 | * css_offline() should be called with cgroup_mutex unlocked. See | ||
4078 | * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for | ||
4079 | * details. This temporary unlocking should go away once | ||
4080 | * cgroup_mutex is unexported from controllers. | ||
4081 | */ | ||
4082 | if (ss->css_offline) { | ||
4083 | mutex_unlock(&cgroup_mutex); | ||
4084 | ss->css_offline(cgrp); | ||
4085 | mutex_lock(&cgroup_mutex); | ||
4086 | } | ||
4087 | |||
4088 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | ||
4033 | } | 4089 | } |
4034 | 4090 | ||
4035 | /* | 4091 | /* |
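Taken together with the rename to ss->css_alloc()/ss->css_free() elsewhere in this patch (see cgroup_create() and cgroup_diput()), a controller's lifecycle now splits into four steps: allocate, online, offline, free. A sketch of the resulting callback set (a hypothetical "demo" controller; demo_subsys_id is a placeholder, not a real subsystem id):

	static struct cgroup_subsys_state *demo_css_alloc(struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

		return css ?: ERR_PTR(-ENOMEM);	/* cgroup_create() handles the error */
	}

	static int demo_css_online(struct cgroup *cgrp)
	{
		/* may fail; cgroup_create() then unwinds via cgroup_destroy_locked() */
		return 0;
	}

	static void demo_css_offline(struct cgroup *cgrp)
	{
		/* invoked by offline_css() above, with cgroup_mutex temporarily dropped */
	}

	static void demo_css_free(struct cgroup *cgrp)
	{
		kfree(cgrp->subsys[demo_subsys_id]);
	}

	struct cgroup_subsys demo_subsys = {
		.name		= "demo",
		.subsys_id	= demo_subsys_id,
		.css_alloc	= demo_css_alloc,
		.css_online	= demo_css_online,
		.css_offline	= demo_css_offline,
		.css_free	= demo_css_free,
	};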
@@ -4049,10 +4105,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4049 | struct cgroup_subsys *ss; | 4105 | struct cgroup_subsys *ss; |
4050 | struct super_block *sb = root->sb; | 4106 | struct super_block *sb = root->sb; |
4051 | 4107 | ||
4108 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | ||
4052 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4109 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
4053 | if (!cgrp) | 4110 | if (!cgrp) |
4054 | return -ENOMEM; | 4111 | return -ENOMEM; |
4055 | 4112 | ||
4113 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | ||
4114 | if (cgrp->id < 0) | ||
4115 | goto err_free_cgrp; | ||
4116 | |||
4117 | /* | ||
4118 | * Only live parents can have children. Note that the liveliness | ||
4119 | * check isn't strictly necessary because cgroup_mkdir() and | ||
4120 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
4121 | * anyway so that locking is contained inside cgroup proper and we | ||
4122 | * don't get nasty surprises if we ever grow another caller. | ||
4123 | */ | ||
4124 | if (!cgroup_lock_live_group(parent)) { | ||
4125 | err = -ENODEV; | ||
4126 | goto err_free_id; | ||
4127 | } | ||
4128 | |||
4056 | /* Grab a reference on the superblock so the hierarchy doesn't | 4129 | /* Grab a reference on the superblock so the hierarchy doesn't |
4057 | * get deleted on unmount if there are child cgroups. This | 4130 | * get deleted on unmount if there are child cgroups. This |
4058 | * can be done outside cgroup_mutex, since the sb can't | 4131 | * can be done outside cgroup_mutex, since the sb can't |
@@ -4060,8 +4133,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4060 | * fs */ | 4133 | * fs */ |
4061 | atomic_inc(&sb->s_active); | 4134 | atomic_inc(&sb->s_active); |
4062 | 4135 | ||
4063 | mutex_lock(&cgroup_mutex); | ||
4064 | |||
4065 | init_cgroup_housekeeping(cgrp); | 4136 | init_cgroup_housekeeping(cgrp); |
4066 | 4137 | ||
4067 | cgrp->parent = parent; | 4138 | cgrp->parent = parent; |
@@ -4071,26 +4142,51 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4071 | if (notify_on_release(parent)) | 4142 | if (notify_on_release(parent)) |
4072 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4143 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
4073 | 4144 | ||
4074 | if (clone_children(parent)) | 4145 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4075 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 4146 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4076 | 4147 | ||
4077 | for_each_subsys(root, ss) { | 4148 | for_each_subsys(root, ss) { |
4078 | struct cgroup_subsys_state *css; | 4149 | struct cgroup_subsys_state *css; |
4079 | 4150 | ||
4080 | css = ss->create(cgrp); | 4151 | css = ss->css_alloc(cgrp); |
4081 | if (IS_ERR(css)) { | 4152 | if (IS_ERR(css)) { |
4082 | err = PTR_ERR(css); | 4153 | err = PTR_ERR(css); |
4083 | goto err_destroy; | 4154 | goto err_free_all; |
4084 | } | 4155 | } |
4085 | init_cgroup_css(css, ss, cgrp); | 4156 | init_cgroup_css(css, ss, cgrp); |
4086 | if (ss->use_id) { | 4157 | if (ss->use_id) { |
4087 | err = alloc_css_id(ss, parent, cgrp); | 4158 | err = alloc_css_id(ss, parent, cgrp); |
4088 | if (err) | 4159 | if (err) |
4089 | goto err_destroy; | 4160 | goto err_free_all; |
4090 | } | 4161 | } |
4091 | /* At error, ->destroy() callback has to free assigned ID. */ | 4162 | } |
4092 | if (clone_children(parent) && ss->post_clone) | 4163 | |
4093 | ss->post_clone(cgrp); | 4164 | /* |
4165 | * Create directory. cgroup_create_file() returns with the new | ||
4166 | * directory locked on success so that it can be populated without | ||
4167 | * dropping cgroup_mutex. | ||
4168 | */ | ||
4169 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | ||
4170 | if (err < 0) | ||
4171 | goto err_free_all; | ||
4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | ||
4173 | |||
4174 | /* allocation complete, commit to creation */ | ||
4175 | dentry->d_fsdata = cgrp; | ||
4176 | cgrp->dentry = dentry; | ||
4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | ||
4179 | root->number_of_cgroups++; | ||
4180 | |||
4181 | /* each css holds a ref to the cgroup's dentry */ | ||
4182 | for_each_subsys(root, ss) | ||
4183 | dget(dentry); | ||
4184 | |||
4185 | /* creation succeeded, notify subsystems */ | ||
4186 | for_each_subsys(root, ss) { | ||
4187 | err = online_css(ss, cgrp); | ||
4188 | if (err) | ||
4189 | goto err_destroy; | ||
4094 | 4190 | ||
4095 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4191 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
4096 | parent->parent) { | 4192 | parent->parent) { |
@@ -4102,50 +4198,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4102 | } | 4198 | } |
4103 | } | 4199 | } |
4104 | 4200 | ||
4105 | list_add(&cgrp->sibling, &cgrp->parent->children); | ||
4106 | root->number_of_cgroups++; | ||
4107 | |||
4108 | err = cgroup_create_dir(cgrp, dentry, mode); | ||
4109 | if (err < 0) | ||
4110 | goto err_remove; | ||
4111 | |||
4112 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4113 | for_each_subsys(root, ss) | ||
4114 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4115 | dget(dentry); | ||
4116 | |||
4117 | /* The cgroup directory was pre-locked for us */ | ||
4118 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | ||
4119 | |||
4120 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4121 | |||
4122 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4201 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); |
4123 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4202 | if (err) |
4203 | goto err_destroy; | ||
4124 | 4204 | ||
4125 | mutex_unlock(&cgroup_mutex); | 4205 | mutex_unlock(&cgroup_mutex); |
4126 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4206 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
4127 | 4207 | ||
4128 | return 0; | 4208 | return 0; |
4129 | 4209 | ||
4130 | err_remove: | 4210 | err_free_all: |
4131 | |||
4132 | list_del(&cgrp->sibling); | ||
4133 | root->number_of_cgroups--; | ||
4134 | |||
4135 | err_destroy: | ||
4136 | |||
4137 | for_each_subsys(root, ss) { | 4211 | for_each_subsys(root, ss) { |
4138 | if (cgrp->subsys[ss->subsys_id]) | 4212 | if (cgrp->subsys[ss->subsys_id]) |
4139 | ss->destroy(cgrp); | 4213 | ss->css_free(cgrp); |
4140 | } | 4214 | } |
4141 | |||
4142 | mutex_unlock(&cgroup_mutex); | 4215 | mutex_unlock(&cgroup_mutex); |
4143 | |||
4144 | /* Release the reference count that we took on the superblock */ | 4216 | /* Release the reference count that we took on the superblock */ |
4145 | deactivate_super(sb); | 4217 | deactivate_super(sb); |
4146 | 4218 | err_free_id: | |
4219 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | ||
4220 | err_free_cgrp: | ||
4147 | kfree(cgrp); | 4221 | kfree(cgrp); |
4148 | return err; | 4222 | return err; |
4223 | |||
4224 | err_destroy: | ||
4225 | cgroup_destroy_locked(cgrp); | ||
4226 | mutex_unlock(&cgroup_mutex); | ||
4227 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
4228 | return err; | ||
4149 | } | 4229 | } |
4150 | 4230 | ||
4151 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 4231 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
@@ -4197,153 +4277,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
4197 | return 0; | 4277 | return 0; |
4198 | } | 4278 | } |
4199 | 4279 | ||
4200 | /* | 4280 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4201 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4281 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4202 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
4203 | * busy subsystems. Call with cgroup_mutex held | ||
4204 | * | ||
4205 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4206 | * not, cgroup removal behaves differently. | ||
4207 | * | ||
4208 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4209 | * cgroup removal can be committed. This is implemented by | ||
4210 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4211 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4212 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4213 | * removed as soon as the existing user (memcg) is updated. | ||
4214 | * | ||
4215 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4216 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4217 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4218 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4219 | * is put so that dentry destruction happens only after all css's are | ||
4220 | * released. | ||
4221 | */ | ||
4222 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
4223 | { | 4282 | { |
4283 | struct dentry *d = cgrp->dentry; | ||
4284 | struct cgroup *parent = cgrp->parent; | ||
4285 | DEFINE_WAIT(wait); | ||
4286 | struct cgroup_event *event, *tmp; | ||
4224 | struct cgroup_subsys *ss; | 4287 | struct cgroup_subsys *ss; |
4225 | unsigned long flags; | 4288 | LIST_HEAD(tmp_list); |
4226 | bool failed = false; | 4289 | |
4290 | lockdep_assert_held(&d->d_inode->i_mutex); | ||
4291 | lockdep_assert_held(&cgroup_mutex); | ||
4227 | 4292 | ||
4228 | local_irq_save(flags); | 4293 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) |
4294 | return -EBUSY; | ||
4229 | 4295 | ||
4230 | /* | 4296 | /* |
4231 | * Block new css_tryget() by deactivating refcnt. If all refcnts | 4297 | * Block new css_tryget() by deactivating refcnt and mark @cgrp |
4232 | * for subsystems w/ clear_css_refs set were 1 at the moment of | 4298 | * removed. This makes future css_tryget() and child creation |
4233 | * deactivation, we succeeded. | 4299 | * attempts fail thus maintaining the removal conditions verified |
4300 | * above. | ||
4234 | */ | 4301 | */ |
4235 | for_each_subsys(cgrp->root, ss) { | 4302 | for_each_subsys(cgrp->root, ss) { |
4236 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4303 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4237 | 4304 | ||
4238 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4305 | WARN_ON(atomic_read(&css->refcnt) < 0); |
4239 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4306 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
4240 | |||
4241 | if (ss->__DEPRECATED_clear_css_refs) | ||
4242 | failed |= css_refcnt(css) != 1; | ||
4243 | } | ||
4244 | |||
4245 | /* | ||
4246 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4247 | * restore refcnts to positive values. Either way, all in-progress | ||
4248 | * css_tryget() will be released. | ||
4249 | */ | ||
4250 | for_each_subsys(cgrp->root, ss) { | ||
4251 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4252 | |||
4253 | if (!failed) { | ||
4254 | set_bit(CSS_REMOVED, &css->flags); | ||
4255 | css_put(css); | ||
4256 | } else { | ||
4257 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
4258 | } | ||
4259 | } | 4307 | } |
4308 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4260 | 4309 | ||
4261 | local_irq_restore(flags); | 4310 | /* tell subsystems to initiate destruction */ |
4262 | return !failed; | 4311 | for_each_subsys(cgrp->root, ss) |
4263 | } | 4312 | offline_css(ss, cgrp); |
4264 | |||
4265 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4266 | { | ||
4267 | struct cgroup *cgrp = dentry->d_fsdata; | ||
4268 | struct dentry *d; | ||
4269 | struct cgroup *parent; | ||
4270 | DEFINE_WAIT(wait); | ||
4271 | struct cgroup_event *event, *tmp; | ||
4272 | int ret; | ||
4273 | |||
4274 | /* the vfs holds both inode->i_mutex already */ | ||
4275 | again: | ||
4276 | mutex_lock(&cgroup_mutex); | ||
4277 | if (atomic_read(&cgrp->count) != 0) { | ||
4278 | mutex_unlock(&cgroup_mutex); | ||
4279 | return -EBUSY; | ||
4280 | } | ||
4281 | if (!list_empty(&cgrp->children)) { | ||
4282 | mutex_unlock(&cgroup_mutex); | ||
4283 | return -EBUSY; | ||
4284 | } | ||
4285 | mutex_unlock(&cgroup_mutex); | ||
4286 | |||
4287 | /* | ||
4288 | * In general, subsystem has no css->refcnt after pre_destroy(). But | ||
4289 | * in racy cases, subsystem may have to get css->refcnt after | ||
4290 | * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes | ||
4291 | * make rmdir return -EBUSY too often. To avoid that, we use waitqueue | ||
4292 | * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir | ||
4293 | * and subsystem's reference count handling. Please see css_get/put | ||
4294 | * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. | ||
4295 | */ | ||
4296 | set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4297 | 4313 | ||
4298 | /* | 4314 | /* |
4299 | * Call pre_destroy handlers of subsys. Notify subsystems | 4315 | * Put all the base refs. Each css holds an extra reference to the |
4300 | * that rmdir() request comes. | 4316 | * cgroup's dentry and cgroup removal proceeds regardless of css |
4317 | * refs. On the last put of each css, whenever that may be, the | ||
4318 | * extra dentry ref is put so that dentry destruction happens only | ||
4319 | * after all css's are released. | ||
4301 | */ | 4320 | */ |
4302 | ret = cgroup_call_pre_destroy(cgrp); | 4321 | for_each_subsys(cgrp->root, ss) |
4303 | if (ret) { | 4322 | css_put(cgrp->subsys[ss->subsys_id]); |
4304 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4305 | return ret; | ||
4306 | } | ||
4307 | |||
4308 | mutex_lock(&cgroup_mutex); | ||
4309 | parent = cgrp->parent; | ||
4310 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | ||
4311 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4312 | mutex_unlock(&cgroup_mutex); | ||
4313 | return -EBUSY; | ||
4314 | } | ||
4315 | prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); | ||
4316 | if (!cgroup_clear_css_refs(cgrp)) { | ||
4317 | mutex_unlock(&cgroup_mutex); | ||
4318 | /* | ||
4319 | * Because someone may call cgroup_wakeup_rmdir_waiter() before | ||
4320 | * prepare_to_wait(), we need to check this flag. | ||
4321 | */ | ||
4322 | if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) | ||
4323 | schedule(); | ||
4324 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4325 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4326 | if (signal_pending(current)) | ||
4327 | return -EINTR; | ||
4328 | goto again; | ||
4329 | } | ||
4330 | /* NO css_tryget() can success after here. */ | ||
4331 | finish_wait(&cgroup_rmdir_waitq, &wait); | ||
4332 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | ||
4333 | 4323 | ||
4334 | raw_spin_lock(&release_list_lock); | 4324 | raw_spin_lock(&release_list_lock); |
4335 | set_bit(CGRP_REMOVED, &cgrp->flags); | ||
4336 | if (!list_empty(&cgrp->release_list)) | 4325 | if (!list_empty(&cgrp->release_list)) |
4337 | list_del_init(&cgrp->release_list); | 4326 | list_del_init(&cgrp->release_list); |
4338 | raw_spin_unlock(&release_list_lock); | 4327 | raw_spin_unlock(&release_list_lock); |
4339 | 4328 | ||
4340 | /* delete this cgroup from parent->children */ | 4329 | /* delete this cgroup from parent->children */ |
4341 | list_del_init(&cgrp->sibling); | 4330 | list_del_rcu(&cgrp->sibling); |
4342 | |||
4343 | list_del_init(&cgrp->allcg_node); | 4331 | list_del_init(&cgrp->allcg_node); |
4344 | 4332 | ||
4345 | d = dget(cgrp->dentry); | 4333 | dget(d); |
4346 | |||
4347 | cgroup_d_remove_dir(d); | 4334 | cgroup_d_remove_dir(d); |
4348 | dput(d); | 4335 | dput(d); |
4349 | 4336 | ||
@@ -4353,21 +4340,35 @@ again: | |||
4353 | /* | 4340 | /* |
4354 | * Unregister events and notify userspace. | 4341 | * Unregister events and notify userspace. |
4355 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4342 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4356 | * directory to avoid race between userspace and kernelspace | 4343 | * directory to avoid race between userspace and kernelspace. Use |
4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4357 | */ | 4347 | */ |
4358 | spin_lock(&cgrp->event_list_lock); | 4348 | spin_lock(&cgrp->event_list_lock); |
4359 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | 4349 | list_splice_init(&cgrp->event_list, &tmp_list); |
4360 | list_del(&event->list); | 4350 | spin_unlock(&cgrp->event_list_lock); |
4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4352 | list_del_init(&event->list); | ||
4361 | remove_wait_queue(event->wqh, &event->wait); | 4353 | remove_wait_queue(event->wqh, &event->wait); |
4362 | eventfd_signal(event->eventfd, 1); | 4354 | eventfd_signal(event->eventfd, 1); |
4363 | schedule_work(&event->remove); | 4355 | schedule_work(&event->remove); |
4364 | } | 4356 | } |
4365 | spin_unlock(&cgrp->event_list_lock); | ||
4366 | 4357 | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | return 0; | 4358 | return 0; |
4369 | } | 4359 | } |
4370 | 4360 | ||
4361 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
4362 | { | ||
4363 | int ret; | ||
4364 | |||
4365 | mutex_lock(&cgroup_mutex); | ||
4366 | ret = cgroup_destroy_locked(dentry->d_fsdata); | ||
4367 | mutex_unlock(&cgroup_mutex); | ||
4368 | |||
4369 | return ret; | ||
4370 | } | ||
4371 | |||
4371 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | 4372 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) |
4372 | { | 4373 | { |
4373 | INIT_LIST_HEAD(&ss->cftsets); | 4374 | INIT_LIST_HEAD(&ss->cftsets); |
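The hunk above stops walking cgrp->event_list in place and instead splices it onto a temporary list, so remove_wait_queue() and eventfd_signal() run without event_list_lock held. A minimal userspace sketch of that splice-under-lock idiom, with a pthread mutex standing in for the spinlock and a hand-rolled singly linked list instead of list_head (all names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct event {
	struct event *next;
	int id;
};

static pthread_mutex_t event_lock = PTHREAD_MUTEX_INITIALIZER;
static struct event *event_list;	/* protected by event_lock */

/*
 * Detach the whole list while holding the lock, then do the per-entry
 * work (which may take other locks) with event_lock already dropped.
 */
static void drain_events(void)
{
	struct event *tmp_list, *ev;

	pthread_mutex_lock(&event_lock);
	tmp_list = event_list;		/* "splice": steal the entire list */
	event_list = NULL;
	pthread_mutex_unlock(&event_lock);

	while ((ev = tmp_list) != NULL) {
		tmp_list = ev->next;
		printf("handling event %d outside the lock\n", ev->id);
		free(ev);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct event *ev = malloc(sizeof(*ev));

		ev->id = i;
		ev->next = event_list;
		event_list = ev;
	}
	drain_events();
	return 0;
}

The lock only covers the splice, never the per-entry work, which is what lets cgroup_event_wake() take the wait queue lock without deadlocking against event_list_lock.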
@@ -4388,13 +4389,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4388 | 4389 | ||
4389 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4390 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4390 | 4391 | ||
4392 | mutex_lock(&cgroup_mutex); | ||
4393 | |||
4391 | /* init base cftset */ | 4394 | /* init base cftset */ |
4392 | cgroup_init_cftsets(ss); | 4395 | cgroup_init_cftsets(ss); |
4393 | 4396 | ||
4394 | /* Create the top cgroup state for this subsystem */ | 4397 | /* Create the top cgroup state for this subsystem */ |
4395 | list_add(&ss->sibling, &rootnode.subsys_list); | 4398 | list_add(&ss->sibling, &rootnode.subsys_list); |
4396 | ss->root = &rootnode; | 4399 | ss->root = &rootnode; |
4397 | css = ss->create(dummytop); | 4400 | css = ss->css_alloc(dummytop); |
4398 | /* We don't handle early failures gracefully */ | 4401 | /* We don't handle early failures gracefully */ |
4399 | BUG_ON(IS_ERR(css)); | 4402 | BUG_ON(IS_ERR(css)); |
4400 | init_cgroup_css(css, ss, dummytop); | 4403 | init_cgroup_css(css, ss, dummytop); |
@@ -4403,7 +4406,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4403 | * pointer to this state - since the subsystem is | 4406 | * pointer to this state - since the subsystem is |
4404 | * newly registered, all tasks and hence the | 4407 | * newly registered, all tasks and hence the |
4405 | * init_css_set is in the subsystem's top cgroup. */ | 4408 | * init_css_set is in the subsystem's top cgroup. */ |
4406 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 4409 | init_css_set.subsys[ss->subsys_id] = css; |
4407 | 4410 | ||
4408 | need_forkexit_callback |= ss->fork || ss->exit; | 4411 | need_forkexit_callback |= ss->fork || ss->exit; |
4409 | 4412 | ||
@@ -4413,6 +4416,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4413 | BUG_ON(!list_empty(&init_task.tasks)); | 4416 | BUG_ON(!list_empty(&init_task.tasks)); |
4414 | 4417 | ||
4415 | ss->active = 1; | 4418 | ss->active = 1; |
4419 | BUG_ON(online_css(ss, dummytop)); | ||
4420 | |||
4421 | mutex_unlock(&cgroup_mutex); | ||
4416 | 4422 | ||
4417 | /* this function shouldn't be used with modular subsystems, since they | 4423 | /* this function shouldn't be used with modular subsystems, since they |
4418 | * need to register a subsys_id, among other things */ | 4424 | * need to register a subsys_id, among other things */ |
@@ -4430,12 +4436,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4430 | */ | 4436 | */ |
4431 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | 4437 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) |
4432 | { | 4438 | { |
4433 | int i; | ||
4434 | struct cgroup_subsys_state *css; | 4439 | struct cgroup_subsys_state *css; |
4440 | int i, ret; | ||
4435 | 4441 | ||
4436 | /* check name and function validity */ | 4442 | /* check name and function validity */ |
4437 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
4438 | ss->create == NULL || ss->destroy == NULL) | 4444 | ss->css_alloc == NULL || ss->css_free == NULL) |
4439 | return -EINVAL; | 4445 | return -EINVAL; |
4440 | 4446 | ||
4441 | /* | 4447 | /* |
@@ -4464,10 +4470,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4464 | subsys[ss->subsys_id] = ss; | 4470 | subsys[ss->subsys_id] = ss; |
4465 | 4471 | ||
4466 | /* | 4472 | /* |
4467 | * no ss->create seems to need anything important in the ss struct, so | 4473 | * no ss->css_alloc seems to need anything important in the ss |
4468 | * this can happen first (i.e. before the rootnode attachment). | 4474 | * struct, so this can happen first (i.e. before the rootnode |
4475 | * attachment). | ||
4469 | */ | 4476 | */ |
4470 | css = ss->create(dummytop); | 4477 | css = ss->css_alloc(dummytop); |
4471 | if (IS_ERR(css)) { | 4478 | if (IS_ERR(css)) { |
4472 | /* failure case - need to deassign the subsys[] slot. */ | 4479 | /* failure case - need to deassign the subsys[] slot. */ |
4473 | subsys[ss->subsys_id] = NULL; | 4480 | subsys[ss->subsys_id] = NULL; |
@@ -4482,14 +4489,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4482 | init_cgroup_css(css, ss, dummytop); | 4489 | init_cgroup_css(css, ss, dummytop); |
4483 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4490 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4484 | if (ss->use_id) { | 4491 | if (ss->use_id) { |
4485 | int ret = cgroup_init_idr(ss, css); | 4492 | ret = cgroup_init_idr(ss, css); |
4486 | if (ret) { | 4493 | if (ret) |
4487 | dummytop->subsys[ss->subsys_id] = NULL; | 4494 | goto err_unload; |
4488 | ss->destroy(dummytop); | ||
4489 | subsys[ss->subsys_id] = NULL; | ||
4490 | mutex_unlock(&cgroup_mutex); | ||
4491 | return ret; | ||
4492 | } | ||
4493 | } | 4495 | } |
4494 | 4496 | ||
4495 | /* | 4497 | /* |
@@ -4522,10 +4524,19 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4522 | write_unlock(&css_set_lock); | 4524 | write_unlock(&css_set_lock); |
4523 | 4525 | ||
4524 | ss->active = 1; | 4526 | ss->active = 1; |
4527 | ret = online_css(ss, dummytop); | ||
4528 | if (ret) | ||
4529 | goto err_unload; | ||
4525 | 4530 | ||
4526 | /* success! */ | 4531 | /* success! */ |
4527 | mutex_unlock(&cgroup_mutex); | 4532 | mutex_unlock(&cgroup_mutex); |
4528 | return 0; | 4533 | return 0; |
4534 | |||
4535 | err_unload: | ||
4536 | mutex_unlock(&cgroup_mutex); | ||
4537 | /* @ss can't be mounted here as try_module_get() would fail */ | ||
4538 | cgroup_unload_subsys(ss); | ||
4539 | return ret; | ||
4529 | } | 4540 | } |
4530 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | 4541 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); |
4531 | 4542 | ||
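cgroup_load_subsys() now routes both failure points, cgroup_init_idr() and online_css(), through a single err_unload label that drops cgroup_mutex and calls cgroup_unload_subsys(). A small standalone sketch of that single-exit unwinding style, with made-up step and teardown names:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for the two setup steps that can fail. */
static int init_step_a(void) { return 0; }
static int init_step_b(void) { return -1; }	/* pretend this one fails */
static void teardown(void) { puts("teardown: undoing partial setup"); }

static int load_subsys(void)
{
	int ret;

	ret = init_step_a();
	if (ret)
		goto err_unload;

	ret = init_step_b();
	if (ret)
		goto err_unload;

	puts("loaded");
	return 0;

err_unload:
	/* one place undoes everything done so far */
	teardown();
	return ret;
}

int main(void)
{
	return load_subsys() ? EXIT_FAILURE : EXIT_SUCCESS;
}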
@@ -4552,6 +4563,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4552 | BUG_ON(ss->root != &rootnode); | 4563 | BUG_ON(ss->root != &rootnode); |
4553 | 4564 | ||
4554 | mutex_lock(&cgroup_mutex); | 4565 | mutex_lock(&cgroup_mutex); |
4566 | |||
4567 | offline_css(ss, dummytop); | ||
4568 | ss->active = 0; | ||
4569 | |||
4570 | if (ss->use_id) { | ||
4571 | idr_remove_all(&ss->idr); | ||
4572 | idr_destroy(&ss->idr); | ||
4573 | } | ||
4574 | |||
4555 | /* deassign the subsys_id */ | 4575 | /* deassign the subsys_id */ |
4556 | subsys[ss->subsys_id] = NULL; | 4576 | subsys[ss->subsys_id] = NULL; |
4557 | 4577 | ||
@@ -4567,7 +4587,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | struct css_set *cg = link->cg; | 4587 | struct css_set *cg = link->cg; |
4568 | 4588 | ||
4569 | hlist_del(&cg->hlist); | 4589 | hlist_del(&cg->hlist); |
4570 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
4571 | cg->subsys[ss->subsys_id] = NULL; | 4590 | cg->subsys[ss->subsys_id] = NULL; |
4572 | hhead = css_set_hash(cg->subsys); | 4591 | hhead = css_set_hash(cg->subsys); |
4573 | hlist_add_head(&cg->hlist, hhead); | 4592 | hlist_add_head(&cg->hlist, hhead); |
@@ -4575,12 +4594,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4575 | write_unlock(&css_set_lock); | 4594 | write_unlock(&css_set_lock); |
4576 | 4595 | ||
4577 | /* | 4596 | /* |
4578 | * remove subsystem's css from the dummytop and free it - need to free | 4597 | * remove subsystem's css from the dummytop and free it - need to |
4579 | * before marking as null because ss->destroy needs the cgrp->subsys | 4598 | * free before marking as null because ss->css_free needs the |
4580 | * pointer to find their state. note that this also takes care of | 4599 | * cgrp->subsys pointer to find their state. note that this also |
4581 | * freeing the css_id. | 4600 | * takes care of freeing the css_id. |
4582 | */ | 4601 | */ |
4583 | ss->destroy(dummytop); | 4602 | ss->css_free(dummytop); |
4584 | dummytop->subsys[ss->subsys_id] = NULL; | 4603 | dummytop->subsys[ss->subsys_id] = NULL; |
4585 | 4604 | ||
4586 | mutex_unlock(&cgroup_mutex); | 4605 | mutex_unlock(&cgroup_mutex); |
@@ -4624,8 +4643,8 @@ int __init cgroup_init_early(void) | |||
4624 | 4643 | ||
4625 | BUG_ON(!ss->name); | 4644 | BUG_ON(!ss->name); |
4626 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4645 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4627 | BUG_ON(!ss->create); | 4646 | BUG_ON(!ss->css_alloc); |
4628 | BUG_ON(!ss->destroy); | 4647 | BUG_ON(!ss->css_free); |
4629 | if (ss->subsys_id != i) { | 4648 | if (ss->subsys_id != i) { |
4630 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", | 4649 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
4631 | ss->name, ss->subsys_id); | 4650 | ss->name, ss->subsys_id); |
@@ -4832,44 +4851,19 @@ void cgroup_fork(struct task_struct *child) | |||
4832 | } | 4851 | } |
4833 | 4852 | ||
4834 | /** | 4853 | /** |
4835 | * cgroup_fork_callbacks - run fork callbacks | ||
4836 | * @child: the new task | ||
4837 | * | ||
4838 | * Called on a new task very soon before adding it to the | ||
4839 | * tasklist. No need to take any locks since no-one can | ||
4840 | * be operating on this task. | ||
4841 | */ | ||
4842 | void cgroup_fork_callbacks(struct task_struct *child) | ||
4843 | { | ||
4844 | if (need_forkexit_callback) { | ||
4845 | int i; | ||
4846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4847 | struct cgroup_subsys *ss = subsys[i]; | ||
4848 | |||
4849 | /* | ||
4850 | * forkexit callbacks are only supported for | ||
4851 | * builtin subsystems. | ||
4852 | */ | ||
4853 | if (!ss || ss->module) | ||
4854 | continue; | ||
4855 | |||
4856 | if (ss->fork) | ||
4857 | ss->fork(child); | ||
4858 | } | ||
4859 | } | ||
4860 | } | ||
4861 | |||
4862 | /** | ||
4863 | * cgroup_post_fork - called on a new task after adding it to the task list | 4854 | * cgroup_post_fork - called on a new task after adding it to the task list |
4864 | * @child: the task in question | 4855 | * @child: the task in question |
4865 | * | 4856 | * |
4866 | * Adds the task to the list running through its css_set if necessary. | 4857 | * Adds the task to the list running through its css_set if necessary and |
4867 | * Has to be after the task is visible on the task list in case we race | 4858 | * calls the subsystem fork() callbacks. Has to be after the task is |
4868 | * with the first call to cgroup_iter_start() - to guarantee that the | 4859 | * visible on the task list in case we race with the first call to |
4869 | * new task ends up on its list. | 4860 | * cgroup_iter_start() - to guarantee that the new task ends up on its |
4861 | * list. | ||
4870 | */ | 4862 | */ |
4871 | void cgroup_post_fork(struct task_struct *child) | 4863 | void cgroup_post_fork(struct task_struct *child) |
4872 | { | 4864 | { |
4865 | int i; | ||
4866 | |||
4873 | /* | 4867 | /* |
4874 | * use_task_css_set_links is set to 1 before we walk the tasklist | 4868 | * use_task_css_set_links is set to 1 before we walk the tasklist |
4875 | * under the tasklist_lock and we read it here after we added the child | 4869 | * under the tasklist_lock and we read it here after we added the child |
@@ -4889,7 +4883,30 @@ void cgroup_post_fork(struct task_struct *child) | |||
4889 | task_unlock(child); | 4883 | task_unlock(child); |
4890 | write_unlock(&css_set_lock); | 4884 | write_unlock(&css_set_lock); |
4891 | } | 4885 | } |
4886 | |||
4887 | /* | ||
4888 | * Call ss->fork(). This must happen after @child is linked on | ||
4889 | * css_set; otherwise, @child might change state between ->fork() | ||
4890 | * and addition to css_set. | ||
4891 | */ | ||
4892 | if (need_forkexit_callback) { | ||
4893 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4894 | struct cgroup_subsys *ss = subsys[i]; | ||
4895 | |||
4896 | /* | ||
4897 | * fork/exit callbacks are supported only for | ||
4898 | * builtin subsystems and we don't need further | ||
4899 | * synchronization as they never go away. | ||
4900 | */ | ||
4901 | if (!ss || ss->module) | ||
4902 | continue; | ||
4903 | |||
4904 | if (ss->fork) | ||
4905 | ss->fork(child); | ||
4906 | } | ||
4907 | } | ||
4892 | } | 4908 | } |
4909 | |||
4893 | /** | 4910 | /** |
4894 | * cgroup_exit - detach cgroup from exiting task | 4911 | * cgroup_exit - detach cgroup from exiting task |
4895 | * @tsk: pointer to task_struct of exiting process | 4912 | * @tsk: pointer to task_struct of exiting process |
@@ -5022,15 +5039,17 @@ static void check_for_release(struct cgroup *cgrp) | |||
5022 | /* Caller must verify that the css is not for root cgroup */ | 5039 | /* Caller must verify that the css is not for root cgroup */ |
5023 | bool __css_tryget(struct cgroup_subsys_state *css) | 5040 | bool __css_tryget(struct cgroup_subsys_state *css) |
5024 | { | 5041 | { |
5025 | do { | 5042 | while (true) { |
5026 | int v = css_refcnt(css); | 5043 | int t, v; |
5027 | 5044 | ||
5028 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | 5045 | v = css_refcnt(css); |
5046 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
5047 | if (likely(t == v)) | ||
5029 | return true; | 5048 | return true; |
5049 | else if (t < 0) | ||
5050 | return false; | ||
5030 | cpu_relax(); | 5051 | cpu_relax(); |
5031 | } while (!test_bit(CSS_REMOVED, &css->flags)); | 5052 | } |
5032 | |||
5033 | return false; | ||
5034 | } | 5053 | } |
5035 | EXPORT_SYMBOL_GPL(__css_tryget); | 5054 | EXPORT_SYMBOL_GPL(__css_tryget); |
5036 | 5055 | ||
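The rewritten __css_tryget() drops the CSS_REMOVED test and instead keeps retrying the compare-and-swap, giving up as soon as the refcount it reads back is negative. A userspace sketch of the same shape using C11 atomics; obj_tryget() and the "negative means dying" convention are illustrative, not the kernel API (the real code also masks a bias out of the count):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Convention borrowed from the hunk above: a negative count means the
 * object is being torn down and no new references may be taken.
 */
static bool obj_tryget(atomic_int *refcnt)
{
	int v = atomic_load(refcnt);

	while (v >= 0) {
		/* on failure, v is reloaded with the current value */
		if (atomic_compare_exchange_weak(refcnt, &v, v + 1))
			return true;
	}
	return false;
}

int main(void)
{
	atomic_int live = 1, dying = -1;

	printf("live:  %s\n", obj_tryget(&live) ? "got ref" : "refused");
	printf("dying: %s\n", obj_tryget(&dying) ? "got ref" : "refused");
	return 0;
}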
@@ -5049,11 +5068,9 @@ void __css_put(struct cgroup_subsys_state *css) | |||
5049 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 5068 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
5050 | check_for_release(cgrp); | 5069 | check_for_release(cgrp); |
5051 | } | 5070 | } |
5052 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
5053 | break; | 5071 | break; |
5054 | case 0: | 5072 | case 0: |
5055 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | 5073 | schedule_work(&css->dput_work); |
5056 | schedule_work(&css->dput_work); | ||
5057 | break; | 5074 | break; |
5058 | } | 5075 | } |
5059 | rcu_read_unlock(); | 5076 | rcu_read_unlock(); |
@@ -5439,7 +5456,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5439 | } | 5456 | } |
5440 | 5457 | ||
5441 | #ifdef CONFIG_CGROUP_DEBUG | 5458 | #ifdef CONFIG_CGROUP_DEBUG |
5442 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | 5459 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) |
5443 | { | 5460 | { |
5444 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5461 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5445 | 5462 | ||
@@ -5449,7 +5466,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) | |||
5449 | return css; | 5466 | return css; |
5450 | } | 5467 | } |
5451 | 5468 | ||
5452 | static void debug_destroy(struct cgroup *cont) | 5469 | static void debug_css_free(struct cgroup *cont) |
5453 | { | 5470 | { |
5454 | kfree(cont->subsys[debug_subsys_id]); | 5471 | kfree(cont->subsys[debug_subsys_id]); |
5455 | } | 5472 | } |
@@ -5578,8 +5595,8 @@ static struct cftype debug_files[] = { | |||
5578 | 5595 | ||
5579 | struct cgroup_subsys debug_subsys = { | 5596 | struct cgroup_subsys debug_subsys = { |
5580 | .name = "debug", | 5597 | .name = "debug", |
5581 | .create = debug_create, | 5598 | .css_alloc = debug_css_alloc, |
5582 | .destroy = debug_destroy, | 5599 | .css_free = debug_css_free, |
5583 | .subsys_id = debug_subsys_id, | 5600 | .subsys_id = debug_subsys_id, |
5584 | .base_cftypes = debug_files, | 5601 | .base_cftypes = debug_files, |
5585 | }; | 5602 | }; |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce98981..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -22,24 +22,33 @@ | |||
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | 24 | ||
25 | enum freezer_state { | 25 | /* |
26 | CGROUP_THAWED = 0, | 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
27 | CGROUP_FREEZING, | 27 | * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared |
28 | CGROUP_FROZEN, | 28 | * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING |
29 | * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of | ||
30 | * its ancestors has FREEZING_SELF set. | ||
31 | */ | ||
32 | enum freezer_state_flags { | ||
33 | CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ | ||
34 | CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ | ||
35 | CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ | ||
36 | CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ | ||
37 | |||
38 | /* mask for all FREEZING flags */ | ||
39 | CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, | ||
29 | }; | 40 | }; |
30 | 41 | ||
31 | struct freezer { | 42 | struct freezer { |
32 | struct cgroup_subsys_state css; | 43 | struct cgroup_subsys_state css; |
33 | enum freezer_state state; | 44 | unsigned int state; |
34 | spinlock_t lock; /* protects _writes_ to state */ | 45 | spinlock_t lock; |
35 | }; | 46 | }; |
36 | 47 | ||
37 | static inline struct freezer *cgroup_freezer( | 48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) |
38 | struct cgroup *cgroup) | ||
39 | { | 49 | { |
40 | return container_of( | 50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), |
41 | cgroup_subsys_state(cgroup, freezer_subsys_id), | 51 | struct freezer, css); |
42 | struct freezer, css); | ||
43 | } | 52 | } |
44 | 53 | ||
45 | static inline struct freezer *task_freezer(struct task_struct *task) | 54 | static inline struct freezer *task_freezer(struct task_struct *task) |
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 57 | struct freezer, css); |
49 | } | 58 | } |
50 | 59 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | ||
61 | { | ||
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | ||
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | ||
68 | |||
51 | bool cgroup_freezing(struct task_struct *task) | 69 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 70 | { |
53 | enum freezer_state state; | ||
54 | bool ret; | 71 | bool ret; |
55 | 72 | ||
56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
57 | state = task_freezer(task)->state; | 74 | ret = task_freezer(task)->state & CGROUP_FREEZING; |
58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; | ||
59 | rcu_read_unlock(); | 75 | rcu_read_unlock(); |
60 | 76 | ||
61 | return ret; | 77 | return ret; |
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task) | |||
65 | * cgroups_write_string() limits the size of freezer state strings to | 81 | * cgroups_write_string() limits the size of freezer state strings to |
66 | * CGROUP_LOCAL_BUFFER_SIZE | 82 | * CGROUP_LOCAL_BUFFER_SIZE |
67 | */ | 83 | */ |
68 | static const char *freezer_state_strs[] = { | 84 | static const char *freezer_state_strs(unsigned int state) |
69 | "THAWED", | 85 | { |
70 | "FREEZING", | 86 | if (state & CGROUP_FROZEN) |
71 | "FROZEN", | 87 | return "FROZEN"; |
88 | if (state & CGROUP_FREEZING) | ||
89 | return "FREEZING"; | ||
90 | return "THAWED"; | ||
72 | }; | 91 | }; |
73 | 92 | ||
74 | /* | ||
75 | * State diagram | ||
76 | * Transitions are caused by userspace writes to the freezer.state file. | ||
77 | * The values in parenthesis are state labels. The rest are edge labels. | ||
78 | * | ||
79 | * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) | ||
80 | * ^ ^ | | | ||
81 | * | \_______THAWED_______/ | | ||
82 | * \__________________________THAWED____________/ | ||
83 | */ | ||
84 | |||
85 | struct cgroup_subsys freezer_subsys; | 93 | struct cgroup_subsys freezer_subsys; |
86 | 94 | ||
87 | /* Locks taken and their ordering | 95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) |
88 | * ------------------------------ | ||
89 | * cgroup_mutex (AKA cgroup_lock) | ||
90 | * freezer->lock | ||
91 | * css_set_lock | ||
92 | * task->alloc_lock (AKA task_lock) | ||
93 | * task->sighand->siglock | ||
94 | * | ||
95 | * cgroup code forces css_set_lock to be taken before task->alloc_lock | ||
96 | * | ||
97 | * freezer_create(), freezer_destroy(): | ||
98 | * cgroup_mutex [ by cgroup core ] | ||
99 | * | ||
100 | * freezer_can_attach(): | ||
101 | * cgroup_mutex (held by caller of can_attach) | ||
102 | * | ||
103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | ||
104 | * freezer->lock | ||
105 | * sighand->siglock (if the cgroup is freezing) | ||
106 | * | ||
107 | * freezer_read(): | ||
108 | * cgroup_mutex | ||
109 | * freezer->lock | ||
110 | * write_lock css_set_lock (cgroup iterator start) | ||
111 | * task->alloc_lock | ||
112 | * read_lock css_set_lock (cgroup iterator start) | ||
113 | * | ||
114 | * freezer_write() (freeze): | ||
115 | * cgroup_mutex | ||
116 | * freezer->lock | ||
117 | * write_lock css_set_lock (cgroup iterator start) | ||
118 | * task->alloc_lock | ||
119 | * read_lock css_set_lock (cgroup iterator start) | ||
120 | * sighand->siglock (fake signal delivery inside freeze_task()) | ||
121 | * | ||
122 | * freezer_write() (unfreeze): | ||
123 | * cgroup_mutex | ||
124 | * freezer->lock | ||
125 | * write_lock css_set_lock (cgroup iterator start) | ||
126 | * task->alloc_lock | ||
127 | * read_lock css_set_lock (cgroup iterator start) | ||
128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) | ||
129 | * sighand->siglock | ||
130 | */ | ||
131 | static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | ||
132 | { | 96 | { |
133 | struct freezer *freezer; | 97 | struct freezer *freezer; |
134 | 98 | ||
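With the freezer state turned into a bitmask, freezer_state_strs() above reports FROZEN whenever CGROUP_FROZEN is set, FREEZING when only a FREEZING bit is set, and THAWED otherwise. A tiny standalone check of that precedence, with the flag values copied from the new enum:

#include <stdio.h>

enum {
	CGROUP_FREEZER_ONLINE	= 1 << 0,
	CGROUP_FREEZING_SELF	= 1 << 1,
	CGROUP_FREEZING_PARENT	= 1 << 2,
	CGROUP_FROZEN		= 1 << 3,
	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};

static const char *state_str(unsigned int state)
{
	if (state & CGROUP_FROZEN)
		return "FROZEN";
	if (state & CGROUP_FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	/* online but neither freezing nor frozen */
	printf("%s\n", state_str(CGROUP_FREEZER_ONLINE));
	/* an ancestor is freezing, no local freeze requested yet */
	printf("%s\n", state_str(CGROUP_FREEZER_ONLINE | CGROUP_FREEZING_PARENT));
	/* freezing requested locally and completed */
	printf("%s\n", state_str(CGROUP_FREEZER_ONLINE | CGROUP_FREEZING_SELF | CGROUP_FROZEN));
	return 0;
}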
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) | |||
137 | return ERR_PTR(-ENOMEM); | 101 | return ERR_PTR(-ENOMEM); |
138 | 102 | ||
139 | spin_lock_init(&freezer->lock); | 103 | spin_lock_init(&freezer->lock); |
140 | freezer->state = CGROUP_THAWED; | ||
141 | return &freezer->css; | 104 | return &freezer->css; |
142 | } | 105 | } |
143 | 106 | ||
144 | static void freezer_destroy(struct cgroup *cgroup) | 107 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | ||
109 | * @cgroup: cgroup being created | ||
110 | * | ||
111 | * We're committing to creation of @cgroup. Mark it online and inherit | ||
112 | * parent's freezing state while holding both parent's and our | ||
113 | * freezer->lock. | ||
114 | */ | ||
115 | static int freezer_css_online(struct cgroup *cgroup) | ||
116 | { | ||
117 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
118 | struct freezer *parent = parent_freezer(freezer); | ||
119 | |||
120 | /* | ||
121 | * The following double locking and freezing state inheritance | ||
122 | * guarantee that @cgroup can never escape ancestors' freezing | ||
123 | * states. See cgroup_for_each_descendant_pre() for details. | ||
124 | */ | ||
125 | if (parent) | ||
126 | spin_lock_irq(&parent->lock); | ||
127 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
128 | |||
129 | freezer->state |= CGROUP_FREEZER_ONLINE; | ||
130 | |||
131 | if (parent && (parent->state & CGROUP_FREEZING)) { | ||
132 | freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; | ||
133 | atomic_inc(&system_freezing_cnt); | ||
134 | } | ||
135 | |||
136 | spin_unlock(&freezer->lock); | ||
137 | if (parent) | ||
138 | spin_unlock_irq(&parent->lock); | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * freezer_css_offline - initiate destruction of @cgroup | ||
145 | * @cgroup: cgroup being destroyed | ||
146 | * | ||
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_cnt | ||
148 | * if it was holding one. | ||
149 | */ | ||
150 | static void freezer_css_offline(struct cgroup *cgroup) | ||
145 | { | 151 | { |
146 | struct freezer *freezer = cgroup_freezer(cgroup); | 152 | struct freezer *freezer = cgroup_freezer(cgroup); |
147 | 153 | ||
148 | if (freezer->state != CGROUP_THAWED) | 154 | spin_lock_irq(&freezer->lock); |
155 | |||
156 | if (freezer->state & CGROUP_FREEZING) | ||
149 | atomic_dec(&system_freezing_cnt); | 157 | atomic_dec(&system_freezing_cnt); |
150 | kfree(freezer); | 158 | |
159 | freezer->state = 0; | ||
160 | |||
161 | spin_unlock_irq(&freezer->lock); | ||
151 | } | 162 | } |
152 | 163 | ||
153 | /* task is frozen or will freeze immediately when next it gets woken */ | 164 | static void freezer_css_free(struct cgroup *cgroup) |
154 | static bool is_task_frozen_enough(struct task_struct *task) | ||
155 | { | 165 | { |
156 | return frozen(task) || | 166 | kfree(cgroup_freezer(cgroup)); |
157 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
158 | } | 167 | } |
159 | 168 | ||
160 | /* | 169 | /* |
161 | * The call to cgroup_lock() in the freezer.state write method prevents | 170 | * Tasks can be migrated into a different freezer anytime regardless of its |
162 | * a write to that file racing against an attach, and hence the | 171 | * current state. freezer_attach() is responsible for making new tasks |
163 | * can_attach() result will remain valid until the attach completes. | 172 | * conform to the current state. |
173 | * | ||
174 | * Freezer state changes and task migration are synchronized via | ||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | ||
176 | * current state and all following state changes can see the new tasks. | ||
164 | */ | 177 | */ |
165 | static int freezer_can_attach(struct cgroup *new_cgroup, | 178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) |
166 | struct cgroup_taskset *tset) | ||
167 | { | 179 | { |
168 | struct freezer *freezer; | 180 | struct freezer *freezer = cgroup_freezer(new_cgrp); |
169 | struct task_struct *task; | 181 | struct task_struct *task; |
182 | bool clear_frozen = false; | ||
183 | |||
184 | spin_lock_irq(&freezer->lock); | ||
170 | 185 | ||
171 | /* | 186 | /* |
172 | * Anything frozen can't move or be moved to/from. | 187 | * Make the new tasks conform to the current state of @new_cgrp. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | ||
189 | * revert it to FREEZING and let update_if_frozen() determine the | ||
190 | * correct state later. | ||
191 | * | ||
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | ||
193 | * current state before executing the following - !frozen tasks may | ||
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | ||
173 | */ | 195 | */ |
174 | cgroup_taskset_for_each(task, new_cgroup, tset) | 196 | cgroup_taskset_for_each(task, new_cgrp, tset) { |
175 | if (cgroup_freezing(task)) | 197 | if (!(freezer->state & CGROUP_FREEZING)) { |
176 | return -EBUSY; | 198 | __thaw_task(task); |
199 | } else { | ||
200 | freeze_task(task); | ||
201 | freezer->state &= ~CGROUP_FROZEN; | ||
202 | clear_frozen = true; | ||
203 | } | ||
204 | } | ||
177 | 205 | ||
178 | freezer = cgroup_freezer(new_cgroup); | 206 | spin_unlock_irq(&freezer->lock); |
179 | if (freezer->state != CGROUP_THAWED) | ||
180 | return -EBUSY; | ||
181 | 207 | ||
182 | return 0; | 208 | /* |
209 | * Propagate FROZEN clearing upwards. We may race with | ||
210 | * update_if_frozen(), but as long as both work bottom-up, either | ||
211 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
212 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
213 | * left FROZEN. | ||
214 | */ | ||
215 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | ||
216 | spin_lock_irq(&freezer->lock); | ||
217 | freezer->state &= ~CGROUP_FROZEN; | ||
218 | clear_frozen = freezer->state & CGROUP_FREEZING; | ||
219 | spin_unlock_irq(&freezer->lock); | ||
220 | } | ||
183 | } | 221 | } |
184 | 222 | ||
185 | static void freezer_fork(struct task_struct *task) | 223 | static void freezer_fork(struct task_struct *task) |
186 | { | 224 | { |
187 | struct freezer *freezer; | 225 | struct freezer *freezer; |
188 | 226 | ||
189 | /* | ||
190 | * No lock is needed, since the task isn't on tasklist yet, | ||
191 | * so it can't be moved to another cgroup, which means the | ||
192 | * freezer won't be removed and will be valid during this | ||
193 | * function call. Nevertheless, apply RCU read-side critical | ||
194 | * section to suppress RCU lockdep false positives. | ||
195 | */ | ||
196 | rcu_read_lock(); | 227 | rcu_read_lock(); |
197 | freezer = task_freezer(task); | 228 | freezer = task_freezer(task); |
198 | rcu_read_unlock(); | ||
199 | 229 | ||
200 | /* | 230 | /* |
201 | * The root cgroup is non-freezable, so we can skip the | 231 | * The root cgroup is non-freezable, so we can skip the |
202 | * following check. | 232 | * following check. |
203 | */ | 233 | */ |
204 | if (!freezer->css.cgroup->parent) | 234 | if (!freezer->css.cgroup->parent) |
205 | return; | 235 | goto out; |
206 | 236 | ||
207 | spin_lock_irq(&freezer->lock); | 237 | spin_lock_irq(&freezer->lock); |
208 | BUG_ON(freezer->state == CGROUP_FROZEN); | 238 | if (freezer->state & CGROUP_FREEZING) |
209 | |||
210 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | ||
211 | if (freezer->state == CGROUP_FREEZING) | ||
212 | freeze_task(task); | 239 | freeze_task(task); |
213 | spin_unlock_irq(&freezer->lock); | 240 | spin_unlock_irq(&freezer->lock); |
241 | out: | ||
242 | rcu_read_unlock(); | ||
214 | } | 243 | } |
215 | 244 | ||
216 | /* | 245 | /** |
217 | * caller must hold freezer->lock | 246 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | ||
248 | * | ||
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | ||
250 | * calling this function. If the current state is FREEZING but not FROZEN, | ||
251 | * this function checks whether all tasks of this cgroup and the descendant | ||
252 | * cgroups finished freezing and, if so, sets FROZEN. | ||
253 | * | ||
254 | * The caller is responsible for grabbing RCU read lock and calling | ||
255 | * update_if_frozen() on all descendants prior to invoking this function. | ||
256 | * | ||
257 | * Task states and freezer state might disagree while tasks are being | ||
258 | * migrated into or out of @cgroup, so we can't verify task states against | ||
259 | * @freezer state here. See freezer_attach() for details. | ||
218 | */ | 260 | */ |
219 | static void update_if_frozen(struct cgroup *cgroup, | 261 | static void update_if_frozen(struct cgroup *cgroup) |
220 | struct freezer *freezer) | ||
221 | { | 262 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
264 | struct cgroup *pos; | ||
222 | struct cgroup_iter it; | 265 | struct cgroup_iter it; |
223 | struct task_struct *task; | 266 | struct task_struct *task; |
224 | unsigned int nfrozen = 0, ntotal = 0; | ||
225 | enum freezer_state old_state = freezer->state; | ||
226 | 267 | ||
227 | cgroup_iter_start(cgroup, &it); | 268 | WARN_ON_ONCE(!rcu_read_lock_held()); |
228 | while ((task = cgroup_iter_next(cgroup, &it))) { | 269 | |
229 | ntotal++; | 270 | spin_lock_irq(&freezer->lock); |
230 | if (freezing(task) && is_task_frozen_enough(task)) | 271 | |
231 | nfrozen++; | 272 | if (!(freezer->state & CGROUP_FREEZING) || |
273 | (freezer->state & CGROUP_FROZEN)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | /* are all (live) children frozen? */ | ||
277 | cgroup_for_each_child(pos, cgroup) { | ||
278 | struct freezer *child = cgroup_freezer(pos); | ||
279 | |||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | ||
281 | !(child->state & CGROUP_FROZEN)) | ||
282 | goto out_unlock; | ||
232 | } | 283 | } |
233 | 284 | ||
234 | if (old_state == CGROUP_THAWED) { | 285 | /* are all tasks frozen? */ |
235 | BUG_ON(nfrozen > 0); | 286 | cgroup_iter_start(cgroup, &it); |
236 | } else if (old_state == CGROUP_FREEZING) { | 287 | |
237 | if (nfrozen == ntotal) | 288 | while ((task = cgroup_iter_next(cgroup, &it))) { |
238 | freezer->state = CGROUP_FROZEN; | 289 | if (freezing(task)) { |
239 | } else { /* old_state == CGROUP_FROZEN */ | 290 | /* |
240 | BUG_ON(nfrozen != ntotal); | 291 | * freezer_should_skip() indicates that the task |
292 | * should be skipped when determining freezing | ||
293 | * completion. Consider it frozen in addition to | ||
294 | * the usual frozen condition. | ||
295 | */ | ||
296 | if (!frozen(task) && !freezer_should_skip(task)) | ||
297 | goto out_iter_end; | ||
298 | } | ||
241 | } | 299 | } |
242 | 300 | ||
301 | freezer->state |= CGROUP_FROZEN; | ||
302 | out_iter_end: | ||
243 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
304 | out_unlock: | ||
305 | spin_unlock_irq(&freezer->lock); | ||
244 | } | 306 | } |
245 | 307 | ||
246 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, |
247 | struct seq_file *m) | 309 | struct seq_file *m) |
248 | { | 310 | { |
249 | struct freezer *freezer; | 311 | struct cgroup *pos; |
250 | enum freezer_state state; | ||
251 | 312 | ||
252 | if (!cgroup_lock_live_group(cgroup)) | 313 | rcu_read_lock(); |
253 | return -ENODEV; | ||
254 | 314 | ||
255 | freezer = cgroup_freezer(cgroup); | 315 | /* update states bottom-up */ |
256 | spin_lock_irq(&freezer->lock); | 316 | cgroup_for_each_descendant_post(pos, cgroup) |
257 | state = freezer->state; | 317 | update_if_frozen(pos); |
258 | if (state == CGROUP_FREEZING) { | 318 | update_if_frozen(cgroup); |
259 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 319 | |
260 | * only partially frozen when we exitted write. */ | 320 | rcu_read_unlock(); |
261 | update_if_frozen(cgroup, freezer); | ||
262 | state = freezer->state; | ||
263 | } | ||
264 | spin_unlock_irq(&freezer->lock); | ||
265 | cgroup_unlock(); | ||
266 | 321 | ||
267 | seq_puts(m, freezer_state_strs[state]); | 322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); |
268 | seq_putc(m, '\n'); | 323 | seq_putc(m, '\n'); |
269 | return 0; | 324 | return 0; |
270 | } | 325 | } |
271 | 326 | ||
272 | static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 327 | static void freeze_cgroup(struct freezer *freezer) |
273 | { | 328 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | ||
274 | struct cgroup_iter it; | 330 | struct cgroup_iter it; |
275 | struct task_struct *task; | 331 | struct task_struct *task; |
276 | unsigned int num_cant_freeze_now = 0; | ||
277 | 332 | ||
278 | cgroup_iter_start(cgroup, &it); | 333 | cgroup_iter_start(cgroup, &it); |
279 | while ((task = cgroup_iter_next(cgroup, &it))) { | 334 | while ((task = cgroup_iter_next(cgroup, &it))) |
280 | if (!freeze_task(task)) | 335 | freeze_task(task); |
281 | continue; | ||
282 | if (is_task_frozen_enough(task)) | ||
283 | continue; | ||
284 | if (!freezing(task) && !freezer_should_skip(task)) | ||
285 | num_cant_freeze_now++; | ||
286 | } | ||
287 | cgroup_iter_end(cgroup, &it); | 336 | cgroup_iter_end(cgroup, &it); |
288 | |||
289 | return num_cant_freeze_now ? -EBUSY : 0; | ||
290 | } | 337 | } |
291 | 338 | ||
292 | static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | 339 | static void unfreeze_cgroup(struct freezer *freezer) |
293 | { | 340 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | ||
294 | struct cgroup_iter it; | 342 | struct cgroup_iter it; |
295 | struct task_struct *task; | 343 | struct task_struct *task; |
296 | 344 | ||
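update_if_frozen() in the hunk above only promotes FREEZING to FROZEN once every online child is already FROZEN and every task is either frozen or allowed to be skipped. A condensed model of that bottom-up check over a toy tree; plain structs, no locking, names illustrative:

#include <stdbool.h>
#include <stdio.h>

struct node {
	bool freezing;
	bool frozen;
	int nr_children;
	struct node **children;
	int nr_tasks;
	const bool *task_frozen;	/* per-task "already frozen" flags */
};

/*
 * Callers are expected to have updated all descendants first (post-order),
 * mirroring how freezer_read() walks descendants before the cgroup itself.
 */
static void update_if_frozen(struct node *n)
{
	if (!n->freezing || n->frozen)
		return;

	for (int i = 0; i < n->nr_children; i++)
		if (!n->children[i]->frozen)
			return;

	for (int i = 0; i < n->nr_tasks; i++)
		if (!n->task_frozen[i])
			return;

	n->frozen = true;
}

int main(void)
{
	const bool child_tasks[] = { true };
	const bool parent_tasks[] = { true, true };
	struct node child = { .freezing = true, .nr_tasks = 1, .task_frozen = child_tasks };
	struct node *kids[] = { &child };
	struct node parent = { .freezing = true, .nr_children = 1, .children = kids,
			       .nr_tasks = 2, .task_frozen = parent_tasks };

	update_if_frozen(&child);	/* bottom-up: child first */
	update_if_frozen(&parent);
	printf("child frozen: %d, parent frozen: %d\n", child.frozen, parent.frozen);
	return 0;
}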
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | cgroup_iter_end(cgroup, &it); | 348 | cgroup_iter_end(cgroup, &it); |
301 | } | 349 | } |
302 | 350 | ||
303 | static int freezer_change_state(struct cgroup *cgroup, | 351 | /** |
304 | enum freezer_state goal_state) | 352 | * freezer_apply_state - apply state change to a single cgroup_freezer |
353 | * @freezer: freezer to apply state change to | ||
354 | * @freeze: whether to freeze or unfreeze | ||
355 | * @state: CGROUP_FREEZING_* flag to set or clear | ||
356 | * | ||
357 | * Set or clear @state on @cgroup according to @freeze, and perform | ||
358 | * freezing or thawing as necessary. | ||
359 | */ | ||
360 | static void freezer_apply_state(struct freezer *freezer, bool freeze, | ||
361 | unsigned int state) | ||
305 | { | 362 | { |
306 | struct freezer *freezer; | 363 | /* also synchronizes against task migration, see freezer_attach() */ |
307 | int retval = 0; | 364 | lockdep_assert_held(&freezer->lock); |
308 | |||
309 | freezer = cgroup_freezer(cgroup); | ||
310 | 365 | ||
311 | spin_lock_irq(&freezer->lock); | 366 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
367 | return; | ||
312 | 368 | ||
313 | update_if_frozen(cgroup, freezer); | 369 | if (freeze) { |
314 | 370 | if (!(freezer->state & CGROUP_FREEZING)) | |
315 | switch (goal_state) { | ||
316 | case CGROUP_THAWED: | ||
317 | if (freezer->state != CGROUP_THAWED) | ||
318 | atomic_dec(&system_freezing_cnt); | ||
319 | freezer->state = CGROUP_THAWED; | ||
320 | unfreeze_cgroup(cgroup, freezer); | ||
321 | break; | ||
322 | case CGROUP_FROZEN: | ||
323 | if (freezer->state == CGROUP_THAWED) | ||
324 | atomic_inc(&system_freezing_cnt); | 371 | atomic_inc(&system_freezing_cnt); |
325 | freezer->state = CGROUP_FREEZING; | 372 | freezer->state |= state; |
326 | retval = try_to_freeze_cgroup(cgroup, freezer); | 373 | freeze_cgroup(freezer); |
327 | break; | 374 | } else { |
328 | default: | 375 | bool was_freezing = freezer->state & CGROUP_FREEZING; |
329 | BUG(); | 376 | |
377 | freezer->state &= ~state; | ||
378 | |||
379 | if (!(freezer->state & CGROUP_FREEZING)) { | ||
380 | if (was_freezing) | ||
381 | atomic_dec(&system_freezing_cnt); | ||
382 | freezer->state &= ~CGROUP_FROZEN; | ||
383 | unfreeze_cgroup(freezer); | ||
384 | } | ||
330 | } | 385 | } |
386 | } | ||
331 | 387 | ||
388 | /** | ||
389 | * freezer_change_state - change the freezing state of a cgroup_freezer | ||
390 | * @freezer: freezer of interest | ||
391 | * @freeze: whether to freeze or thaw | ||
392 | * | ||
393 | * Freeze or thaw @freezer according to @freeze. The operations are | ||
394 | * recursive - all descendants of @freezer will be affected. | ||
395 | */ | ||
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | ||
397 | { | ||
398 | struct cgroup *pos; | ||
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
332 | spin_unlock_irq(&freezer->lock); | 403 | spin_unlock_irq(&freezer->lock); |
333 | 404 | ||
334 | return retval; | 405 | /* |
406 | * Update all its descendants in pre-order traversal. Each | ||
407 | * descendant will try to inherit its parent's FREEZING state as | ||
408 | * CGROUP_FREEZING_PARENT. | ||
409 | */ | ||
410 | rcu_read_lock(); | ||
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | ||
412 | struct freezer *pos_f = cgroup_freezer(pos); | ||
413 | struct freezer *parent = parent_freezer(pos_f); | ||
414 | |||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_css_online(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | ||
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | ||
422 | CGROUP_FREEZING_PARENT); | ||
423 | spin_unlock_irq(&pos_f->lock); | ||
424 | } | ||
425 | rcu_read_unlock(); | ||
335 | } | 426 | } |
336 | 427 | ||
337 | static int freezer_write(struct cgroup *cgroup, | 428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, |
338 | struct cftype *cft, | ||
339 | const char *buffer) | 429 | const char *buffer) |
340 | { | 430 | { |
341 | int retval; | 431 | bool freeze; |
342 | enum freezer_state goal_state; | ||
343 | 432 | ||
344 | if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) | 433 | if (strcmp(buffer, freezer_state_strs(0)) == 0) |
345 | goal_state = CGROUP_THAWED; | 434 | freeze = false; |
346 | else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) | 435 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) |
347 | goal_state = CGROUP_FROZEN; | 436 | freeze = true; |
348 | else | 437 | else |
349 | return -EINVAL; | 438 | return -EINVAL; |
350 | 439 | ||
351 | if (!cgroup_lock_live_group(cgroup)) | 440 | freezer_change_state(cgroup_freezer(cgroup), freeze); |
352 | return -ENODEV; | 441 | return 0; |
353 | retval = freezer_change_state(cgroup, goal_state); | 442 | } |
354 | cgroup_unlock(); | 443 | |
355 | return retval; | 444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) |
445 | { | ||
446 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
447 | |||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | ||
449 | } | ||
450 | |||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | ||
452 | { | ||
453 | struct freezer *freezer = cgroup_freezer(cgroup); | ||
454 | |||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | ||
356 | } | 456 | } |
357 | 457 | ||
358 | static struct cftype files[] = { | 458 | static struct cftype files[] = { |
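freezer_apply_state() in the hunk above sets or clears one FREEZING bit per call: the global freezing counter is bumped when the first FREEZING bit appears and dropped when the last one goes away, at which point FROZEN is cleared too. A compact single-cgroup model of that bookkeeping, with a plain int standing in for system_freezing_cnt and no locking shown:

#include <stdbool.h>
#include <stdio.h>

enum {
	FREEZING_SELF	= 1 << 1,
	FREEZING_PARENT	= 1 << 2,
	FROZEN		= 1 << 3,
	FREEZING	= FREEZING_SELF | FREEZING_PARENT,
};

static int freezing_cnt;	/* stands in for system_freezing_cnt */

static void apply_state(unsigned int *state, bool freeze, unsigned int bit)
{
	if (freeze) {
		if (!(*state & FREEZING))
			freezing_cnt++;		/* first FREEZING bit for this group */
		*state |= bit;
	} else {
		bool was_freezing = *state & FREEZING;

		*state &= ~bit;
		if (!(*state & FREEZING)) {
			if (was_freezing)
				freezing_cnt--;
			*state &= ~FROZEN;	/* no longer freezing, so not frozen */
		}
	}
}

int main(void)
{
	unsigned int state = 0;

	apply_state(&state, true, FREEZING_SELF);	/* user wrote FROZEN */
	apply_state(&state, true, FREEZING_PARENT);	/* an ancestor started freezing */
	apply_state(&state, false, FREEZING_SELF);	/* user wrote THAWED */
	printf("state=%#x freezing_cnt=%d\n", state, freezing_cnt);	/* parent bit left, cnt=1 */
	apply_state(&state, false, FREEZING_PARENT);
	printf("state=%#x freezing_cnt=%d\n", state, freezing_cnt);	/* 0, 0 */
	return 0;
}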
@@ -362,23 +462,27 @@ static struct cftype files[] = { | |||
362 | .read_seq_string = freezer_read, | 462 | .read_seq_string = freezer_read, |
363 | .write_string = freezer_write, | 463 | .write_string = freezer_write, |
364 | }, | 464 | }, |
465 | { | ||
466 | .name = "self_freezing", | ||
467 | .flags = CFTYPE_NOT_ON_ROOT, | ||
468 | .read_u64 = freezer_self_freezing_read, | ||
469 | }, | ||
470 | { | ||
471 | .name = "parent_freezing", | ||
472 | .flags = CFTYPE_NOT_ON_ROOT, | ||
473 | .read_u64 = freezer_parent_freezing_read, | ||
474 | }, | ||
365 | { } /* terminate */ | 475 | { } /* terminate */ |
366 | }; | 476 | }; |
367 | 477 | ||
368 | struct cgroup_subsys freezer_subsys = { | 478 | struct cgroup_subsys freezer_subsys = { |
369 | .name = "freezer", | 479 | .name = "freezer", |
370 | .create = freezer_create, | 480 | .css_alloc = freezer_css_alloc, |
371 | .destroy = freezer_destroy, | 481 | .css_online = freezer_css_online, |
482 | .css_offline = freezer_css_offline, | ||
483 | .css_free = freezer_css_free, | ||
372 | .subsys_id = freezer_subsys_id, | 484 | .subsys_id = freezer_subsys_id, |
373 | .can_attach = freezer_can_attach, | 485 | .attach = freezer_attach, |
374 | .fork = freezer_fork, | 486 | .fork = freezer_fork, |
375 | .base_cftypes = files, | 487 | .base_cftypes = files, |
376 | |||
377 | /* | ||
378 | * freezer subsys doesn't handle hierarchy at all. Frozen state | ||
379 | * should be inherited through the hierarchy - if a parent is | ||
380 | * frozen, all its children should be frozen. Fix it and remove | ||
381 | * the following. | ||
382 | */ | ||
383 | .broken_hierarchy = true, | ||
384 | }; | 488 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1215 | return 0; | 1215 | return 0; |
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | #ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL | ||
1219 | asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | ||
1220 | struct compat_timespec __user *interval) | ||
1221 | { | ||
1222 | struct timespec t; | ||
1223 | int ret; | ||
1224 | mm_segment_t old_fs = get_fs(); | ||
1225 | |||
1226 | set_fs(KERNEL_DS); | ||
1227 | ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); | ||
1228 | set_fs(old_fs); | ||
1229 | if (put_compat_timespec(&t, interval)) | ||
1230 | return -EFAULT; | ||
1231 | return ret; | ||
1232 | } | ||
1233 | #endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ | ||
1234 | |||
1218 | /* | 1235 | /* |
1219 | * Allocate user-space memory for the duration of a single system call, | 1236 | * Allocate user-space memory for the duration of a single system call, |
1220 | * in order to marshall parameters inside a compat thunk. | 1237 | * in order to marshall parameters inside a compat thunk. |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 000000000000..e0e07fd55508 --- /dev/null +++ b/kernel/context_tracking.c | |||
@@ -0,0 +1,83 @@ | |||
1 | #include <linux/context_tracking.h> | ||
2 | #include <linux/rcupdate.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/hardirq.h> | ||
6 | |||
7 | struct context_tracking { | ||
8 | /* | ||
9 | * When active is false, hooks are not set to | ||
10 | * minimize overhead: TIF flags are cleared | ||
11 | * and calls to user_enter/exit are ignored. This | ||
12 | * may be further optimized using static keys. | ||
13 | */ | ||
14 | bool active; | ||
15 | enum { | ||
16 | IN_KERNEL = 0, | ||
17 | IN_USER, | ||
18 | } state; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | ||
22 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
23 | .active = true, | ||
24 | #endif | ||
25 | }; | ||
26 | |||
27 | void user_enter(void) | ||
28 | { | ||
29 | unsigned long flags; | ||
30 | |||
31 | /* | ||
32 | * Some contexts may involve an exception occurring in an irq, | ||
33 | * leading to that nesting: | ||
34 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
35 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
36 | * helpers are enough to protect RCU uses inside the exception. So | ||
37 | * just return immediately if we detect we are in an IRQ. | ||
38 | */ | ||
39 | if (in_interrupt()) | ||
40 | return; | ||
41 | |||
42 | WARN_ON_ONCE(!current->mm); | ||
43 | |||
44 | local_irq_save(flags); | ||
45 | if (__this_cpu_read(context_tracking.active) && | ||
46 | __this_cpu_read(context_tracking.state) != IN_USER) { | ||
47 | __this_cpu_write(context_tracking.state, IN_USER); | ||
48 | rcu_user_enter(); | ||
49 | } | ||
50 | local_irq_restore(flags); | ||
51 | } | ||
52 | |||
53 | void user_exit(void) | ||
54 | { | ||
55 | unsigned long flags; | ||
56 | |||
57 | /* | ||
58 | * Some contexts may involve an exception occurring in an irq, | ||
59 | * leading to that nesting: | ||
60 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
61 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
62 | * helpers are enough to protect RCU uses inside the exception. So | ||
63 | * just return immediately if we detect we are in an IRQ. | ||
64 | */ | ||
65 | if (in_interrupt()) | ||
66 | return; | ||
67 | |||
68 | local_irq_save(flags); | ||
69 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | ||
70 | __this_cpu_write(context_tracking.state, IN_KERNEL); | ||
71 | rcu_user_exit(); | ||
72 | } | ||
73 | local_irq_restore(flags); | ||
74 | } | ||
75 | |||
76 | void context_tracking_task_switch(struct task_struct *prev, | ||
77 | struct task_struct *next) | ||
78 | { | ||
79 | if (__this_cpu_read(context_tracking.active)) { | ||
80 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
81 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
82 | } | ||
83 | } | ||
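The new kernel/context_tracking.c keeps per-CPU state: whether tracking is active and whether the CPU is currently in user or kernel mode, and it skips transitions entirely from interrupt context. A thread-local userspace cartoon of that state machine, with in_interrupt() and the RCU hooks replaced by stubs (names illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

struct context_tracking {
	bool active;
	enum { IN_KERNEL = 0, IN_USER } state;
};

static _Thread_local struct context_tracking ct = { .active = true };
static _Thread_local bool in_interrupt;		/* stub for in_interrupt() */

static void rcu_user_enter(void) { puts("rcu: user enter"); }	/* stub */
static void rcu_user_exit(void)  { puts("rcu: user exit"); }	/* stub */

static void user_enter(void)
{
	if (in_interrupt)	/* exceptions in IRQs are covered by rcu_irq_*() */
		return;
	if (ct.active && ct.state != IN_USER) {
		ct.state = IN_USER;
		rcu_user_enter();
	}
}

static void user_exit(void)
{
	if (in_interrupt)
		return;
	if (ct.state == IN_USER) {
		ct.state = IN_KERNEL;
		rcu_user_exit();
	}
}

int main(void)
{
	user_enter();		/* returning to user space */
	user_enter();		/* already IN_USER, nothing happens */
	in_interrupt = true;
	user_exit();		/* ignored from (simulated) interrupt context */
	in_interrupt = false;
	user_exit();		/* syscall/exception entry back into the kernel */
	return 0;
}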
diff --git a/kernel/cpu.c b/kernel/cpu.c index 42bd331ee0ab..3046a503242c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -348,11 +348,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
348 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 348 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
349 | struct task_struct *idle; | 349 | struct task_struct *idle; |
350 | 350 | ||
351 | if (cpu_online(cpu) || !cpu_present(cpu)) | ||
352 | return -EINVAL; | ||
353 | |||
354 | cpu_hotplug_begin(); | 351 | cpu_hotplug_begin(); |
355 | 352 | ||
353 | if (cpu_online(cpu) || !cpu_present(cpu)) { | ||
354 | ret = -EINVAL; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
356 | idle = idle_thread_get(cpu); | 358 | idle = idle_thread_get(cpu); |
357 | if (IS_ERR(idle)) { | 359 | if (IS_ERR(idle)) { |
358 | ret = PTR_ERR(idle); | 360 | ret = PTR_ERR(idle); |
@@ -601,6 +603,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, | |||
601 | 603 | ||
602 | static int __init cpu_hotplug_pm_sync_init(void) | 604 | static int __init cpu_hotplug_pm_sync_init(void) |
603 | { | 605 | { |
606 | /* | ||
607 | * cpu_hotplug_pm_callback has higher priority than x86 | ||
608 | * bsp_pm_callback, which depends on cpu_hotplug_pm_callback | ||
609 | * disabling cpu hotplug first to avoid a cpu hotplug race. | ||
610 | */ | ||
604 | pm_notifier(cpu_hotplug_pm_callback, 0); | 611 | pm_notifier(cpu_hotplug_pm_callback, 0); |
605 | return 0; | 612 | return 0; |
606 | } | 613 | } |
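The _cpu_up() change above moves the cpu_online()/cpu_present() test inside the region opened by cpu_hotplug_begin(), so the condition can no longer change between the check and the work it gates. A generic sketch of that check-under-lock pattern with a pthread mutex as a stand-in:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static bool cpu_online_flag;	/* only changes with hotplug_lock held */

static int bring_up(void)
{
	int ret = 0;

	pthread_mutex_lock(&hotplug_lock);	/* like cpu_hotplug_begin() */

	/* check *after* locking: nobody can flip the flag underneath us */
	if (cpu_online_flag) {
		ret = -1;			/* -EINVAL in the hunk above */
		goto out;
	}

	cpu_online_flag = true;			/* the actual bring-up work */
out:
	pthread_mutex_unlock(&hotplug_lock);	/* like cpu_hotplug_done() */
	return ret;
}

int main(void)
{
	printf("first  bring_up: %d\n", bring_up());
	printf("second bring_up: %d\n", bring_up());
	return 0;
}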
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
302 | * are online, with memory. If none are online with memory, walk | 302 | * are online, with memory. If none are online with memory, walk |
303 | * up the cpuset hierarchy until we find one that does have some | 303 | * up the cpuset hierarchy until we find one that does have some |
304 | * online mems. If we get all the way to the top and still haven't | 304 | * online mems. If we get all the way to the top and still haven't |
305 | * found any online mems, return node_states[N_HIGH_MEMORY]. | 305 | * found any online mems, return node_states[N_MEMORY]. |
306 | * | 306 | * |
307 | * One way or another, we guarantee to return some non-empty subset | 307 | * One way or another, we guarantee to return some non-empty subset |
308 | * of node_states[N_HIGH_MEMORY]. | 308 | * of node_states[N_MEMORY]. |
309 | * | 309 | * |
310 | * Call with callback_mutex held. | 310 | * Call with callback_mutex held. |
311 | */ | 311 | */ |
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 313 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
314 | { | 314 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 315 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_HIGH_MEMORY])) | 316 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 317 | cs = cs->parent; |
318 | if (cs) | 318 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 319 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_HIGH_MEMORY]); | 320 | node_states[N_MEMORY]); |
321 | else | 321 | else |
322 | *pmask = node_states[N_HIGH_MEMORY]; | 322 | *pmask = node_states[N_MEMORY]; |
323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | 323 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); |
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
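guarantee_online_mems() climbs the cpuset hierarchy until it finds an ancestor whose mems_allowed still intersects node_states[N_MEMORY], and falls back to the whole online mask at the top. A small sketch of that walk-up-until-the-intersection-is-non-empty loop using plain bitmasks (struct and function names illustrative):

#include <stdio.h>

struct cpuset_node {
	struct cpuset_node *parent;
	unsigned long mems_allowed;	/* bit n set == node n allowed */
};

/*
 * Return some non-empty subset of @online_mems, preferring the closest
 * ancestor of @cs (including @cs itself) that still has online memory.
 */
static unsigned long online_mems_for(struct cpuset_node *cs, unsigned long online_mems)
{
	while (cs && !(cs->mems_allowed & online_mems))
		cs = cs->parent;
	return cs ? (cs->mems_allowed & online_mems) : online_mems;
}

int main(void)
{
	struct cpuset_node top  = { .parent = NULL, .mems_allowed = 0xf };
	struct cpuset_node mid  = { .parent = &top, .mems_allowed = 0x6 };
	struct cpuset_node leaf = { .parent = &mid, .mems_allowed = 0x1 };
	unsigned long online = 0xe;	/* node 0 went offline */

	/* leaf only allowed node 0, so fall back to mid: 0x6 & 0xe = 0x6 */
	printf("%#lx\n", online_mems_for(&leaf, online));
	return 0;
}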
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1100 | return -ENOMEM; | 1100 | return -ENOMEM; |
1101 | 1101 | ||
1102 | /* | 1102 | /* |
1103 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | 1103 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; |
1104 | * it's read-only | 1104 | * it's read-only |
1105 | */ | 1105 | */ |
1106 | if (cs == &top_cpuset) { | 1106 | if (cs == &top_cpuset) { |
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1122 | goto done; | 1122 | goto done; |
1123 | 1123 | ||
1124 | if (!nodes_subset(trialcs->mems_allowed, | 1124 | if (!nodes_subset(trialcs->mems_allowed, |
1125 | node_states[N_HIGH_MEMORY])) { | 1125 | node_states[N_MEMORY])) { |
1126 | retval = -EINVAL; | 1126 | retval = -EINVAL; |
1127 | goto done; | 1127 | goto done; |
1128 | } | 1128 | } |
@@ -1784,56 +1784,20 @@ static struct cftype files[] = { | |||
1784 | }; | 1784 | }; |
1785 | 1785 | ||
1786 | /* | 1786 | /* |
1787 | * post_clone() is called during cgroup_create() when the | 1787 | * cpuset_css_alloc - allocate a cpuset css |
1788 | * clone_children mount argument was specified. The cgroup | ||
1789 | * can not yet have any tasks. | ||
1790 | * | ||
1791 | * Currently we refuse to set up the cgroup - thereby | ||
1792 | * refusing the task to be entered, and as a result refusing | ||
1793 | * the sys_unshare() or clone() which initiated it - if any | ||
1794 | * sibling cpusets have exclusive cpus or mem. | ||
1795 | * | ||
1796 | * If this becomes a problem for some users who wish to | ||
1797 | * allow that scenario, then cpuset_post_clone() could be | ||
1798 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1799 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex | ||
1800 | * held. | ||
1801 | */ | ||
1802 | static void cpuset_post_clone(struct cgroup *cgroup) | ||
1803 | { | ||
1804 | struct cgroup *parent, *child; | ||
1805 | struct cpuset *cs, *parent_cs; | ||
1806 | |||
1807 | parent = cgroup->parent; | ||
1808 | list_for_each_entry(child, &parent->children, sibling) { | ||
1809 | cs = cgroup_cs(child); | ||
1810 | if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) | ||
1811 | return; | ||
1812 | } | ||
1813 | cs = cgroup_cs(cgroup); | ||
1814 | parent_cs = cgroup_cs(parent); | ||
1815 | |||
1816 | mutex_lock(&callback_mutex); | ||
1817 | cs->mems_allowed = parent_cs->mems_allowed; | ||
1818 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | ||
1819 | mutex_unlock(&callback_mutex); | ||
1820 | return; | ||
1821 | } | ||
1822 | |||
1823 | /* | ||
1824 | * cpuset_create - create a cpuset | ||
1825 | * cont: control group that the new cpuset will be part of | 1788 | * cont: control group that the new cpuset will be part of |
1826 | */ | 1789 | */ |
1827 | 1790 | ||
1828 | static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | 1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1829 | { | 1792 | { |
1830 | struct cpuset *cs; | 1793 | struct cgroup *parent_cg = cont->parent; |
1831 | struct cpuset *parent; | 1794 | struct cgroup *tmp_cg; |
1795 | struct cpuset *parent, *cs; | ||
1832 | 1796 | ||
1833 | if (!cont->parent) { | 1797 | if (!parent_cg) |
1834 | return &top_cpuset.css; | 1798 | return &top_cpuset.css; |
1835 | } | 1799 | parent = cgroup_cs(parent_cg); |
1836 | parent = cgroup_cs(cont->parent); | 1800 | |
1837 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); |
1838 | if (!cs) | 1802 | if (!cs) |
1839 | return ERR_PTR(-ENOMEM); | 1803 | return ERR_PTR(-ENOMEM); |
@@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1855 | 1819 | ||
1856 | cs->parent = parent; | 1820 | cs->parent = parent; |
1857 | number_of_cpusets++; | 1821 | number_of_cpusets++; |
1858 | return &cs->css ; | 1822 | |
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | ||
1824 | goto skip_clone; | ||
1825 | |||
1826 | /* | ||
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | ||
1828 | * set. This flag handling is implemented in cgroup core for | ||
1829 | * historical reasons - the flag may be specified during mount. | ||
1830 | * | ||
1831 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | ||
1832 | * refuse to clone the configuration - thereby refusing the task to | ||
1833 | * be entered, and as a result refusing the sys_unshare() or | ||
1834 | * clone() which initiated it. If this becomes a problem for some | ||
1835 | * users who wish to allow that scenario, then this could be | ||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | ||
1837 | * (and likewise for mems) to the new cgroup. | ||
1838 | */ | ||
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | ||
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | ||
1841 | |||
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | ||
1843 | goto skip_clone; | ||
1844 | } | ||
1845 | |||
1846 | mutex_lock(&callback_mutex); | ||
1847 | cs->mems_allowed = parent->mems_allowed; | ||
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | ||
1849 | mutex_unlock(&callback_mutex); | ||
1850 | skip_clone: | ||
1851 | return &cs->css; | ||
1859 | } | 1852 | } |
1860 | 1853 | ||
1861 | /* | 1854 | /* |
@@ -1864,7 +1857,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) | |||
1864 | * will call async_rebuild_sched_domains(). | 1857 | * will call async_rebuild_sched_domains(). |
1865 | */ | 1858 | */ |
1866 | 1859 | ||
1867 | static void cpuset_destroy(struct cgroup *cont) | 1860 | static void cpuset_css_free(struct cgroup *cont) |
1868 | { | 1861 | { |
1869 | struct cpuset *cs = cgroup_cs(cont); | 1862 | struct cpuset *cs = cgroup_cs(cont); |
1870 | 1863 | ||
@@ -1878,11 +1871,10 @@ static void cpuset_destroy(struct cgroup *cont) | |||
1878 | 1871 | ||
1879 | struct cgroup_subsys cpuset_subsys = { | 1872 | struct cgroup_subsys cpuset_subsys = { |
1880 | .name = "cpuset", | 1873 | .name = "cpuset", |
1881 | .create = cpuset_create, | 1874 | .css_alloc = cpuset_css_alloc, |
1882 | .destroy = cpuset_destroy, | 1875 | .css_free = cpuset_css_free, |
1883 | .can_attach = cpuset_can_attach, | 1876 | .can_attach = cpuset_can_attach, |
1884 | .attach = cpuset_attach, | 1877 | .attach = cpuset_attach, |
1885 | .post_clone = cpuset_post_clone, | ||
1886 | .subsys_id = cpuset_subsys_id, | 1878 | .subsys_id = cpuset_subsys_id, |
1887 | .base_cftypes = files, | 1879 | .base_cftypes = files, |
1888 | .early_init = 1, | 1880 | .early_init = 1, |
@@ -2034,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) | |||
2034 | * before dropping down to the next. It always processes a node before | 2026 | * before dropping down to the next. It always processes a node before |
2035 | * any of its children. | 2027 | * any of its children. |
2036 | * | 2028 | * |
2037 | * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY | 2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY |
2038 | * if all present pages from a node are offlined. | 2030 | * if all present pages from a node are offlined. |
2039 | */ | 2031 | */ |
2040 | static void | 2032 | static void |
@@ -2073,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2073 | 2065 | ||
2074 | /* Continue past cpusets with all mems online */ | 2066 | /* Continue past cpusets with all mems online */ |
2075 | if (nodes_subset(cp->mems_allowed, | 2067 | if (nodes_subset(cp->mems_allowed, |
2076 | node_states[N_HIGH_MEMORY])) | 2068 | node_states[N_MEMORY])) |
2077 | continue; | 2069 | continue; |
2078 | 2070 | ||
2079 | oldmems = cp->mems_allowed; | 2071 | oldmems = cp->mems_allowed; |
@@ -2081,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | |||
2081 | /* Remove offline mems from this cpuset. */ | 2073 | /* Remove offline mems from this cpuset. */ |
2082 | mutex_lock(&callback_mutex); | 2074 | mutex_lock(&callback_mutex); |
2083 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
2084 | node_states[N_HIGH_MEMORY]); | 2076 | node_states[N_MEMORY]); |
2085 | mutex_unlock(&callback_mutex); | 2077 | mutex_unlock(&callback_mutex); |
2086 | 2078 | ||
2087 | /* Move tasks from the empty cpuset to a parent */ | 2079 | /* Move tasks from the empty cpuset to a parent */ |
@@ -2134,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2134 | 2126 | ||
2135 | #ifdef CONFIG_MEMORY_HOTPLUG | 2127 | #ifdef CONFIG_MEMORY_HOTPLUG |
2136 | /* | 2128 | /* |
2137 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2129 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
2138 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. | 2130 | * Call this routine anytime after node_states[N_MEMORY] changes. |
2139 | * See cpuset_update_active_cpus() for CPU hotplug handling. | 2131 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
2140 | */ | 2132 | */ |
2141 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2133 | static int cpuset_track_online_nodes(struct notifier_block *self, |
@@ -2148,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2148 | case MEM_ONLINE: | 2140 | case MEM_ONLINE: |
2149 | oldmems = top_cpuset.mems_allowed; | 2141 | oldmems = top_cpuset.mems_allowed; |
2150 | mutex_lock(&callback_mutex); | 2142 | mutex_lock(&callback_mutex); |
2151 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2152 | mutex_unlock(&callback_mutex); | 2144 | mutex_unlock(&callback_mutex); |
2153 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | 2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
2154 | break; | 2146 | break; |
@@ -2177,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2177 | void __init cpuset_init_smp(void) | 2169 | void __init cpuset_init_smp(void) |
2178 | { | 2170 | { |
2179 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2171 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2180 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2172 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2181 | 2173 | ||
2182 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2183 | 2175 | ||
@@ -2245,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) | |||
2245 | * | 2237 | * |
2246 | * Description: Returns the nodemask_t mems_allowed of the cpuset | 2238 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
2247 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2239 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2248 | * subset of node_states[N_HIGH_MEMORY], even if this means going outside the | 2240 | * subset of node_states[N_MEMORY], even if this means going outside the |
2249 | * tasks cpuset. | 2241 | * tasks cpuset. |
2250 | **/ | 2242 | **/ |
2251 | 2243 | ||
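
The cpuset hunks above fold the old post_clone callback into cpuset_css_alloc(): when a cgroup is created with the clone-children flag set, the child starts out with the parent's cpus_allowed and mems_allowed unless a sibling cpuset is cpu- or mem-exclusive. A minimal userspace sketch of that behaviour follows; the v1 cpuset mount point and the child directory name are assumptions for illustration, not something this series prescribes.

/* Hypothetical userspace sketch: enable clone-children on a v1 cpuset
 * hierarchy and create a child that inherits the parent's masks.
 * The mount point /sys/fs/cgroup/cpuset and the child name "child"
 * are assumptions for illustration.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	/* Ask cgroup core to copy the parent's configuration into newly
	 * created children (the CGRP_CPUSET_CLONE_CHILDREN flag above).
	 */
	if (write_str("/sys/fs/cgroup/cpuset/cgroup.clone_children", "1"))
		perror("clone_children");

	/* The new child should come up with cpus/mems already populated,
	 * provided no sibling cpuset is cpu- or mem-exclusive.
	 */
	if (mkdir("/sys/fs/cgroup/cpuset/child", 0755) && errno != EEXIST)
		perror("mkdir");

	fd = open("/sys/fs/cgroup/cpuset/child/cpuset.cpus", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("inherited cpus: %s", buf);
		}
		close(fd);
	}
	return 0;
}

If any sibling had cpuset.cpu_exclusive or cpuset.mem_exclusive set, the skip_clone path above leaves the new child's masks empty and the inheritance simply does not happen.
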
diff --git a/kernel/cred.c b/kernel/cred.c index 709d521903f6..e0573a43c7df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -30,17 +30,6 @@ | |||
30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * The common credentials for the initial task's thread group | ||
34 | */ | ||
35 | #ifdef CONFIG_KEYS | ||
36 | static struct thread_group_cred init_tgcred = { | ||
37 | .usage = ATOMIC_INIT(2), | ||
38 | .tgid = 0, | ||
39 | .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), | ||
40 | }; | ||
41 | #endif | ||
42 | |||
43 | /* | ||
44 | * The initial credentials for the initial task | 33 | * The initial credentials for the initial task |
45 | */ | 34 | */ |
46 | struct cred init_cred = { | 35 | struct cred init_cred = { |
@@ -65,9 +54,6 @@ struct cred init_cred = { | |||
65 | .user = INIT_USER, | 54 | .user = INIT_USER, |
66 | .user_ns = &init_user_ns, | 55 | .user_ns = &init_user_ns, |
67 | .group_info = &init_groups, | 56 | .group_info = &init_groups, |
68 | #ifdef CONFIG_KEYS | ||
69 | .tgcred = &init_tgcred, | ||
70 | #endif | ||
71 | }; | 57 | }; |
72 | 58 | ||
73 | static inline void set_cred_subscribers(struct cred *cred, int n) | 59 | static inline void set_cred_subscribers(struct cred *cred, int n) |
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) | |||
96 | } | 82 | } |
97 | 83 | ||
98 | /* | 84 | /* |
99 | * Dispose of the shared task group credentials | ||
100 | */ | ||
101 | #ifdef CONFIG_KEYS | ||
102 | static void release_tgcred_rcu(struct rcu_head *rcu) | ||
103 | { | ||
104 | struct thread_group_cred *tgcred = | ||
105 | container_of(rcu, struct thread_group_cred, rcu); | ||
106 | |||
107 | BUG_ON(atomic_read(&tgcred->usage) != 0); | ||
108 | |||
109 | key_put(tgcred->session_keyring); | ||
110 | key_put(tgcred->process_keyring); | ||
111 | kfree(tgcred); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | /* | ||
116 | * Release a set of thread group credentials. | ||
117 | */ | ||
118 | static void release_tgcred(struct cred *cred) | ||
119 | { | ||
120 | #ifdef CONFIG_KEYS | ||
121 | struct thread_group_cred *tgcred = cred->tgcred; | ||
122 | |||
123 | if (atomic_dec_and_test(&tgcred->usage)) | ||
124 | call_rcu(&tgcred->rcu, release_tgcred_rcu); | ||
125 | #endif | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * The RCU callback to actually dispose of a set of credentials | 85 | * The RCU callback to actually dispose of a set of credentials |
130 | */ | 86 | */ |
131 | static void put_cred_rcu(struct rcu_head *rcu) | 87 | static void put_cred_rcu(struct rcu_head *rcu) |
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
150 | #endif | 106 | #endif |
151 | 107 | ||
152 | security_cred_free(cred); | 108 | security_cred_free(cred); |
109 | key_put(cred->session_keyring); | ||
110 | key_put(cred->process_keyring); | ||
153 | key_put(cred->thread_keyring); | 111 | key_put(cred->thread_keyring); |
154 | key_put(cred->request_key_auth); | 112 | key_put(cred->request_key_auth); |
155 | release_tgcred(cred); | ||
156 | if (cred->group_info) | 113 | if (cred->group_info) |
157 | put_group_info(cred->group_info); | 114 | put_group_info(cred->group_info); |
158 | free_uid(cred->user); | 115 | free_uid(cred->user); |
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) | |||
246 | if (!new) | 203 | if (!new) |
247 | return NULL; | 204 | return NULL; |
248 | 205 | ||
249 | #ifdef CONFIG_KEYS | ||
250 | new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); | ||
251 | if (!new->tgcred) { | ||
252 | kmem_cache_free(cred_jar, new); | ||
253 | return NULL; | ||
254 | } | ||
255 | atomic_set(&new->tgcred->usage, 1); | ||
256 | #endif | ||
257 | |||
258 | atomic_set(&new->usage, 1); | 206 | atomic_set(&new->usage, 1); |
259 | #ifdef CONFIG_DEBUG_CREDENTIALS | 207 | #ifdef CONFIG_DEBUG_CREDENTIALS |
260 | new->magic = CRED_MAGIC; | 208 | new->magic = CRED_MAGIC; |
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void) | |||
308 | get_user_ns(new->user_ns); | 256 | get_user_ns(new->user_ns); |
309 | 257 | ||
310 | #ifdef CONFIG_KEYS | 258 | #ifdef CONFIG_KEYS |
259 | key_get(new->session_keyring); | ||
260 | key_get(new->process_keyring); | ||
311 | key_get(new->thread_keyring); | 261 | key_get(new->thread_keyring); |
312 | key_get(new->request_key_auth); | 262 | key_get(new->request_key_auth); |
313 | atomic_inc(&new->tgcred->usage); | ||
314 | #endif | 263 | #endif |
315 | 264 | ||
316 | #ifdef CONFIG_SECURITY | 265 | #ifdef CONFIG_SECURITY |
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); | |||
334 | */ | 283 | */ |
335 | struct cred *prepare_exec_creds(void) | 284 | struct cred *prepare_exec_creds(void) |
336 | { | 285 | { |
337 | struct thread_group_cred *tgcred = NULL; | ||
338 | struct cred *new; | 286 | struct cred *new; |
339 | 287 | ||
340 | #ifdef CONFIG_KEYS | ||
341 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
342 | if (!tgcred) | ||
343 | return NULL; | ||
344 | #endif | ||
345 | |||
346 | new = prepare_creds(); | 288 | new = prepare_creds(); |
347 | if (!new) { | 289 | if (!new) |
348 | kfree(tgcred); | ||
349 | return new; | 290 | return new; |
350 | } | ||
351 | 291 | ||
352 | #ifdef CONFIG_KEYS | 292 | #ifdef CONFIG_KEYS |
353 | /* newly exec'd tasks don't get a thread keyring */ | 293 | /* newly exec'd tasks don't get a thread keyring */ |
354 | key_put(new->thread_keyring); | 294 | key_put(new->thread_keyring); |
355 | new->thread_keyring = NULL; | 295 | new->thread_keyring = NULL; |
356 | 296 | ||
357 | /* create a new per-thread-group creds for all this set of threads to | ||
358 | * share */ | ||
359 | memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); | ||
360 | |||
361 | atomic_set(&tgcred->usage, 1); | ||
362 | spin_lock_init(&tgcred->lock); | ||
363 | |||
364 | /* inherit the session keyring; new process keyring */ | 297 | /* inherit the session keyring; new process keyring */ |
365 | key_get(tgcred->session_keyring); | 298 | key_put(new->process_keyring); |
366 | tgcred->process_keyring = NULL; | 299 | new->process_keyring = NULL; |
367 | |||
368 | release_tgcred(new); | ||
369 | new->tgcred = tgcred; | ||
370 | #endif | 300 | #endif |
371 | 301 | ||
372 | return new; | 302 | return new; |
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) | |||
383 | */ | 313 | */ |
384 | int copy_creds(struct task_struct *p, unsigned long clone_flags) | 314 | int copy_creds(struct task_struct *p, unsigned long clone_flags) |
385 | { | 315 | { |
386 | #ifdef CONFIG_KEYS | ||
387 | struct thread_group_cred *tgcred; | ||
388 | #endif | ||
389 | struct cred *new; | 316 | struct cred *new; |
390 | int ret; | 317 | int ret; |
391 | 318 | ||
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
425 | install_thread_keyring_to_cred(new); | 352 | install_thread_keyring_to_cred(new); |
426 | } | 353 | } |
427 | 354 | ||
428 | /* we share the process and session keyrings between all the threads in | 355 | /* The process keyring is only shared between the threads in a process; |
429 | * a process - this is slightly icky as we violate COW credentials a | 356 | * anything outside of those threads doesn't inherit. |
430 | * bit */ | 357 | */ |
431 | if (!(clone_flags & CLONE_THREAD)) { | 358 | if (!(clone_flags & CLONE_THREAD)) { |
432 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | 359 | key_put(new->process_keyring); |
433 | if (!tgcred) { | 360 | new->process_keyring = NULL; |
434 | ret = -ENOMEM; | ||
435 | goto error_put; | ||
436 | } | ||
437 | atomic_set(&tgcred->usage, 1); | ||
438 | spin_lock_init(&tgcred->lock); | ||
439 | tgcred->process_keyring = NULL; | ||
440 | tgcred->session_keyring = key_get(new->tgcred->session_keyring); | ||
441 | |||
442 | release_tgcred(new); | ||
443 | new->tgcred = tgcred; | ||
444 | } | 361 | } |
445 | #endif | 362 | #endif |
446 | 363 | ||
@@ -668,9 +585,6 @@ void __init cred_init(void) | |||
668 | */ | 585 | */ |
669 | struct cred *prepare_kernel_cred(struct task_struct *daemon) | 586 | struct cred *prepare_kernel_cred(struct task_struct *daemon) |
670 | { | 587 | { |
671 | #ifdef CONFIG_KEYS | ||
672 | struct thread_group_cred *tgcred; | ||
673 | #endif | ||
674 | const struct cred *old; | 588 | const struct cred *old; |
675 | struct cred *new; | 589 | struct cred *new; |
676 | 590 | ||
@@ -678,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | if (!new) | 592 | if (!new) |
679 | return NULL; | 593 | return NULL; |
680 | 594 | ||
681 | #ifdef CONFIG_KEYS | ||
682 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
683 | if (!tgcred) { | ||
684 | kmem_cache_free(cred_jar, new); | ||
685 | return NULL; | ||
686 | } | ||
687 | #endif | ||
688 | |||
689 | kdebug("prepare_kernel_cred() alloc %p", new); | 595 | kdebug("prepare_kernel_cred() alloc %p", new); |
690 | 596 | ||
691 | if (daemon) | 597 | if (daemon) |
@@ -703,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
703 | get_group_info(new->group_info); | 609 | get_group_info(new->group_info); |
704 | 610 | ||
705 | #ifdef CONFIG_KEYS | 611 | #ifdef CONFIG_KEYS |
706 | atomic_set(&tgcred->usage, 1); | 612 | new->session_keyring = NULL; |
707 | spin_lock_init(&tgcred->lock); | 613 | new->process_keyring = NULL; |
708 | tgcred->process_keyring = NULL; | ||
709 | tgcred->session_keyring = NULL; | ||
710 | new->tgcred = tgcred; | ||
711 | new->request_key_auth = NULL; | ||
712 | new->thread_keyring = NULL; | 614 | new->thread_keyring = NULL; |
615 | new->request_key_auth = NULL; | ||
713 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 616 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
714 | #endif | 617 | #endif |
715 | 618 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index 738f3564e83b..301079d06f24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -7434,7 +7434,7 @@ unlock: | |||
7434 | device_initcall(perf_event_sysfs_init); | 7434 | device_initcall(perf_event_sysfs_init); |
7435 | 7435 | ||
7436 | #ifdef CONFIG_CGROUP_PERF | 7436 | #ifdef CONFIG_CGROUP_PERF |
7437 | static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | 7437 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) |
7438 | { | 7438 | { |
7439 | struct perf_cgroup *jc; | 7439 | struct perf_cgroup *jc; |
7440 | 7440 | ||
@@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) | |||
7451 | return &jc->css; | 7451 | return &jc->css; |
7452 | } | 7452 | } |
7453 | 7453 | ||
7454 | static void perf_cgroup_destroy(struct cgroup *cont) | 7454 | static void perf_cgroup_css_free(struct cgroup *cont) |
7455 | { | 7455 | { |
7456 | struct perf_cgroup *jc; | 7456 | struct perf_cgroup *jc; |
7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7457 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
@@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7492 | struct cgroup_subsys perf_subsys = { | 7492 | struct cgroup_subsys perf_subsys = { |
7493 | .name = "perf_event", | 7493 | .name = "perf_event", |
7494 | .subsys_id = perf_subsys_id, | 7494 | .subsys_id = perf_subsys_id, |
7495 | .create = perf_cgroup_create, | 7495 | .css_alloc = perf_cgroup_css_alloc, |
7496 | .destroy = perf_cgroup_destroy, | 7496 | .css_free = perf_cgroup_css_free, |
7497 | .exit = perf_cgroup_exit, | 7497 | .exit = perf_cgroup_exit, |
7498 | .attach = perf_cgroup_attach, | 7498 | .attach = perf_cgroup_attach, |
7499 | 7499 | ||
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9a7b487c6fe2..fe8a916507ed 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
111 | * Count the number of breakpoints of the same type and same task. | 111 | * Count the number of breakpoints of the same type and same task. |
112 | * The given event must not be on the list. | 112 | * The given event must not be on the list. |
113 | */ | 113 | */ |
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | 114 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) |
115 | { | 115 | { |
116 | struct task_struct *tsk = bp->hw.bp_target; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
117 | struct perf_event *iter; | 117 | struct perf_event *iter; |
118 | int count = 0; | 118 | int count = 0; |
119 | 119 | ||
120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) | 121 | if (iter->hw.bp_target == tsk && |
122 | find_slot_idx(iter) == type && | ||
123 | cpu == iter->cpu) | ||
122 | count += hw_breakpoint_weight(iter); | 124 | count += hw_breakpoint_weight(iter); |
123 | } | 125 | } |
124 | 126 | ||
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
141 | if (!tsk) | 143 | if (!tsk) |
142 | slots->pinned += max_task_bp_pinned(cpu, type); | 144 | slots->pinned += max_task_bp_pinned(cpu, type); |
143 | else | 145 | else |
144 | slots->pinned += task_bp_pinned(bp, type); | 146 | slots->pinned += task_bp_pinned(cpu, bp, type); |
145 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | 147 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); |
146 | 148 | ||
147 | return; | 149 | return; |
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
154 | if (!tsk) | 156 | if (!tsk) |
155 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); |
156 | else | 158 | else |
157 | nr += task_bp_pinned(bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); |
158 | 160 | ||
159 | if (nr > slots->pinned) | 161 | if (nr > slots->pinned) |
160 | slots->pinned = nr; | 162 | slots->pinned = nr; |
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, | |||
188 | int old_idx = 0; | 190 | int old_idx = 0; |
189 | int idx = 0; | 191 | int idx = 0; |
190 | 192 | ||
191 | old_count = task_bp_pinned(bp, type); | 193 | old_count = task_bp_pinned(cpu, bp, type); |
192 | old_idx = old_count - 1; | 194 | old_idx = old_count - 1; |
193 | idx = old_idx + weight; | 195 | idx = old_idx + weight; |
194 | 196 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cc4e7e42e68..dea7acfbb071 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | 33 | #include <linux/ptrace.h> /* user_enable_single_step */ |
34 | #include <linux/kdebug.h> /* notifier mechanism */ | 34 | #include <linux/kdebug.h> /* notifier mechanism */ |
35 | #include "../../mm/internal.h" /* munlock_vma_page */ | 35 | #include "../../mm/internal.h" /* munlock_vma_page */ |
36 | #include <linux/percpu-rwsem.h> | ||
36 | 37 | ||
37 | #include <linux/uprobes.h> | 38 | #include <linux/uprobes.h> |
38 | 39 | ||
@@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | |||
71 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | 72 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; |
72 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | 73 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) |
73 | 74 | ||
75 | static struct percpu_rw_semaphore dup_mmap_sem; | ||
76 | |||
74 | /* | 77 | /* |
75 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | 78 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe |
76 | * events active at this time. Probably a fine grained per inode count is | 79 | * events active at this time. Probably a fine grained per inode count is |
@@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
766 | struct map_info *info; | 769 | struct map_info *info; |
767 | int err = 0; | 770 | int err = 0; |
768 | 771 | ||
772 | percpu_down_write(&dup_mmap_sem); | ||
769 | info = build_map_info(uprobe->inode->i_mapping, | 773 | info = build_map_info(uprobe->inode->i_mapping, |
770 | uprobe->offset, is_register); | 774 | uprobe->offset, is_register); |
771 | if (IS_ERR(info)) | 775 | if (IS_ERR(info)) { |
772 | return PTR_ERR(info); | 776 | err = PTR_ERR(info); |
777 | goto out; | ||
778 | } | ||
773 | 779 | ||
774 | while (info) { | 780 | while (info) { |
775 | struct mm_struct *mm = info->mm; | 781 | struct mm_struct *mm = info->mm; |
@@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
799 | mmput(mm); | 805 | mmput(mm); |
800 | info = free_map_info(info); | 806 | info = free_map_info(info); |
801 | } | 807 | } |
802 | 808 | out: | |
809 | percpu_up_write(&dup_mmap_sem); | ||
803 | return err; | 810 | return err; |
804 | } | 811 | } |
805 | 812 | ||
@@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm) | |||
1131 | kfree(area); | 1138 | kfree(area); |
1132 | } | 1139 | } |
1133 | 1140 | ||
1141 | void uprobe_start_dup_mmap(void) | ||
1142 | { | ||
1143 | percpu_down_read(&dup_mmap_sem); | ||
1144 | } | ||
1145 | |||
1146 | void uprobe_end_dup_mmap(void) | ||
1147 | { | ||
1148 | percpu_up_read(&dup_mmap_sem); | ||
1149 | } | ||
1150 | |||
1134 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) | 1151 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) |
1135 | { | 1152 | { |
1136 | newmm->uprobes_state.xol_area = NULL; | 1153 | newmm->uprobes_state.xol_area = NULL; |
@@ -1199,6 +1216,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot | |||
1199 | vaddr = kmap_atomic(area->page); | 1216 | vaddr = kmap_atomic(area->page); |
1200 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | 1217 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); |
1201 | kunmap_atomic(vaddr); | 1218 | kunmap_atomic(vaddr); |
1219 | /* | ||
1220 | * We probably need flush_icache_user_range() but it needs vma. | ||
1221 | * This should work on supported architectures too. | ||
1222 | */ | ||
1223 | flush_dcache_page(area->page); | ||
1202 | 1224 | ||
1203 | return current->utask->xol_vaddr; | 1225 | return current->utask->xol_vaddr; |
1204 | } | 1226 | } |
@@ -1430,16 +1452,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
1430 | return uprobe; | 1452 | return uprobe; |
1431 | } | 1453 | } |
1432 | 1454 | ||
1433 | void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) | ||
1434 | { | ||
1435 | user_enable_single_step(current); | ||
1436 | } | ||
1437 | |||
1438 | void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) | ||
1439 | { | ||
1440 | user_disable_single_step(current); | ||
1441 | } | ||
1442 | |||
1443 | /* | 1455 | /* |
1444 | * Run handler and ask thread to singlestep. | 1456 | * Run handler and ask thread to singlestep. |
1445 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1457 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
@@ -1493,7 +1505,6 @@ static void handle_swbp(struct pt_regs *regs) | |||
1493 | goto out; | 1505 | goto out; |
1494 | 1506 | ||
1495 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | 1507 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { |
1496 | arch_uprobe_enable_step(&uprobe->arch); | ||
1497 | utask->active_uprobe = uprobe; | 1508 | utask->active_uprobe = uprobe; |
1498 | utask->state = UTASK_SSTEP; | 1509 | utask->state = UTASK_SSTEP; |
1499 | return; | 1510 | return; |
@@ -1525,7 +1536,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
1525 | else | 1536 | else |
1526 | WARN_ON_ONCE(1); | 1537 | WARN_ON_ONCE(1); |
1527 | 1538 | ||
1528 | arch_uprobe_disable_step(&uprobe->arch); | ||
1529 | put_uprobe(uprobe); | 1539 | put_uprobe(uprobe); |
1530 | utask->active_uprobe = NULL; | 1540 | utask->active_uprobe = NULL; |
1531 | utask->state = UTASK_RUNNING; | 1541 | utask->state = UTASK_RUNNING; |
@@ -1604,6 +1614,9 @@ static int __init init_uprobes(void) | |||
1604 | mutex_init(&uprobes_mmap_mutex[i]); | 1614 | mutex_init(&uprobes_mmap_mutex[i]); |
1605 | } | 1615 | } |
1606 | 1616 | ||
1617 | if (percpu_init_rwsem(&dup_mmap_sem)) | ||
1618 | return -ENOMEM; | ||
1619 | |||
1607 | return register_die_notifier(&uprobe_exception_nb); | 1620 | return register_die_notifier(&uprobe_exception_nb); |
1608 | } | 1621 | } |
1609 | module_init(init_uprobes); | 1622 | module_init(init_uprobes); |
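
The uprobes hunks introduce dup_mmap_sem, a percpu_rw_semaphore that lets the frequent path (dup_mmap() bracketed by uprobe_start_dup_mmap()/uprobe_end_dup_mmap()) take a cheap per-cpu read lock, while the rare register_for_each_vma() path excludes all duplicators with a write lock. A minimal sketch of the same pattern, using only the APIs visible in the hunk, is below; guarded_fast_path()/guarded_slow_path() are illustrative names, not kernel functions.

/* Minimal sketch of the pattern: frequent readers take the per-cpu
 * rwsem for read, the rare writer excludes them all.
 */
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore guard_sem;

static int guard_init(void)
{
	return percpu_init_rwsem(&guard_sem);	/* can fail, e.g. -ENOMEM */
}

static void guarded_fast_path(void)
{
	percpu_down_read(&guard_sem);	/* cheap, many readers in parallel */
	/* ... duplicate the mmap; cannot race with the writer ... */
	percpu_up_read(&guard_sem);
}

static void guarded_slow_path(void)
{
	percpu_down_write(&guard_sem);	/* waits until all readers drain */
	/* ... walk every mm and install or remove breakpoints ... */
	percpu_up_write(&guard_sem);
}
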
diff --git a/kernel/exit.c b/kernel/exit.c index d7fe58db4527..b4df21937216 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -310,43 +310,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
310 | } | 310 | } |
311 | } | 311 | } |
312 | 312 | ||
313 | /** | ||
314 | * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd | ||
315 | * | ||
316 | * If a kernel thread is launched as a result of a system call, or if | ||
317 | * it ever exits, it should generally reparent itself to kthreadd so it | ||
318 | * isn't in the way of other processes and is correctly cleaned up on exit. | ||
319 | * | ||
320 | * The various task state such as scheduling policy and priority may have | ||
321 | * been inherited from a user process, so we reset them to sane values here. | ||
322 | * | ||
323 | * NOTE that reparent_to_kthreadd() gives the caller full capabilities. | ||
324 | */ | ||
325 | static void reparent_to_kthreadd(void) | ||
326 | { | ||
327 | write_lock_irq(&tasklist_lock); | ||
328 | |||
329 | ptrace_unlink(current); | ||
330 | /* Reparent to init */ | ||
331 | current->real_parent = current->parent = kthreadd_task; | ||
332 | list_move_tail(¤t->sibling, ¤t->real_parent->children); | ||
333 | |||
334 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | ||
335 | current->exit_signal = SIGCHLD; | ||
336 | |||
337 | if (task_nice(current) < 0) | ||
338 | set_user_nice(current, 0); | ||
339 | /* cpus_allowed? */ | ||
340 | /* rt_priority? */ | ||
341 | /* signals? */ | ||
342 | memcpy(current->signal->rlim, init_task.signal->rlim, | ||
343 | sizeof(current->signal->rlim)); | ||
344 | |||
345 | atomic_inc(&init_cred.usage); | ||
346 | commit_creds(&init_cred); | ||
347 | write_unlock_irq(&tasklist_lock); | ||
348 | } | ||
349 | |||
350 | void __set_special_pids(struct pid *pid) | 313 | void __set_special_pids(struct pid *pid) |
351 | { | 314 | { |
352 | struct task_struct *curr = current->group_leader; | 315 | struct task_struct *curr = current->group_leader; |
@@ -358,13 +321,6 @@ void __set_special_pids(struct pid *pid) | |||
358 | change_pid(curr, PIDTYPE_PGID, pid); | 321 | change_pid(curr, PIDTYPE_PGID, pid); |
359 | } | 322 | } |
360 | 323 | ||
361 | static void set_special_pids(struct pid *pid) | ||
362 | { | ||
363 | write_lock_irq(&tasklist_lock); | ||
364 | __set_special_pids(pid); | ||
365 | write_unlock_irq(&tasklist_lock); | ||
366 | } | ||
367 | |||
368 | /* | 324 | /* |
369 | * Let kernel threads use this to say that they allow a certain signal. | 325 | * Let kernel threads use this to say that they allow a certain signal. |
370 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 326 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
@@ -404,54 +360,6 @@ int disallow_signal(int sig) | |||
404 | 360 | ||
405 | EXPORT_SYMBOL(disallow_signal); | 361 | EXPORT_SYMBOL(disallow_signal); |
406 | 362 | ||
407 | /* | ||
408 | * Put all the gunge required to become a kernel thread without | ||
409 | * attached user resources in one place where it belongs. | ||
410 | */ | ||
411 | |||
412 | void daemonize(const char *name, ...) | ||
413 | { | ||
414 | va_list args; | ||
415 | sigset_t blocked; | ||
416 | |||
417 | va_start(args, name); | ||
418 | vsnprintf(current->comm, sizeof(current->comm), name, args); | ||
419 | va_end(args); | ||
420 | |||
421 | /* | ||
422 | * If we were started as result of loading a module, close all of the | ||
423 | * user space pages. We don't need them, and if we didn't close them | ||
424 | * they would be locked into memory. | ||
425 | */ | ||
426 | exit_mm(current); | ||
427 | /* | ||
428 | * We don't want to get frozen, in case system-wide hibernation | ||
429 | * or suspend transition begins right now. | ||
430 | */ | ||
431 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | ||
432 | |||
433 | if (current->nsproxy != &init_nsproxy) { | ||
434 | get_nsproxy(&init_nsproxy); | ||
435 | switch_task_namespaces(current, &init_nsproxy); | ||
436 | } | ||
437 | set_special_pids(&init_struct_pid); | ||
438 | proc_clear_tty(current); | ||
439 | |||
440 | /* Block and flush all signals */ | ||
441 | sigfillset(&blocked); | ||
442 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
443 | flush_signals(current); | ||
444 | |||
445 | /* Become as one with the init task */ | ||
446 | |||
447 | daemonize_fs_struct(); | ||
448 | daemonize_descriptors(); | ||
449 | |||
450 | reparent_to_kthreadd(); | ||
451 | } | ||
452 | |||
453 | EXPORT_SYMBOL(daemonize); | ||
454 | |||
455 | #ifdef CONFIG_MM_OWNER | 363 | #ifdef CONFIG_MM_OWNER |
456 | /* | 364 | /* |
457 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 365 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
@@ -1174,11 +1082,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1174 | * as other threads in the parent group can be right | 1082 | * as other threads in the parent group can be right |
1175 | * here reaping other children at the same time. | 1083 | * here reaping other children at the same time. |
1176 | * | 1084 | * |
1177 | * We use thread_group_times() to get times for the thread | 1085 | * We use thread_group_cputime_adjusted() to get times for the thread |
1178 | * group, which consolidates times for all threads in the | 1086 | * group, which consolidates times for all threads in the |
1179 | * group including the group leader. | 1087 | * group including the group leader. |
1180 | */ | 1088 | */ |
1181 | thread_group_times(p, &tgutime, &tgstime); | 1089 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1182 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1090 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1183 | psig = p->real_parent->signal; | 1091 | psig = p->real_parent->signal; |
1184 | sig = p->signal; | 1092 | sig = p->signal; |
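
daemonize() and reparent_to_kthreadd() are removed above because in-tree users are expected to create kernel threads through the kthread API, which never attaches user-space mappings, files or signal state in the first place. The sketch below shows that replacement under stated assumptions: my_worker() and its one-second sleep loop are illustrative, while kthread_run() and kthread_should_stop() are the existing kthread interfaces.

/* Illustrative replacement for a daemonize() caller: start the worker
 * with the kthread API so it never has user resources to shed.
 */
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* do periodic work */
	return 0;
}

static struct task_struct *start_my_worker(void)
{
	/* Returns the task or an ERR_PTR(); already parented to kthreadd. */
	return kthread_run(my_worker, NULL, "my_worker");
}
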
diff --git a/kernel/fork.c b/kernel/fork.c index 38e53b87402c..c36c4e301efe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
352 | unsigned long charge; | 352 | unsigned long charge; |
353 | struct mempolicy *pol; | 353 | struct mempolicy *pol; |
354 | 354 | ||
355 | uprobe_start_dup_mmap(); | ||
355 | down_write(&oldmm->mmap_sem); | 356 | down_write(&oldmm->mmap_sem); |
356 | flush_cache_dup_mm(oldmm); | 357 | flush_cache_dup_mm(oldmm); |
357 | uprobe_dup_mmap(oldmm, mm); | 358 | uprobe_dup_mmap(oldmm, mm); |
@@ -469,6 +470,7 @@ out: | |||
469 | up_write(&mm->mmap_sem); | 470 | up_write(&mm->mmap_sem); |
470 | flush_tlb_mm(oldmm); | 471 | flush_tlb_mm(oldmm); |
471 | up_write(&oldmm->mmap_sem); | 472 | up_write(&oldmm->mmap_sem); |
473 | uprobe_end_dup_mmap(); | ||
472 | return retval; | 474 | return retval; |
473 | fail_nomem_anon_vma_fork: | 475 | fail_nomem_anon_vma_fork: |
474 | mpol_put(pol); | 476 | mpol_put(pol); |
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
821 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
822 | mm->pmd_huge_pte = NULL; | 824 | mm->pmd_huge_pte = NULL; |
823 | #endif | 825 | #endif |
826 | #ifdef CONFIG_NUMA_BALANCING | ||
827 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
828 | #endif | ||
824 | if (!mm_init(mm, tsk)) | 829 | if (!mm_init(mm, tsk)) |
825 | goto fail_nomem; | 830 | goto fail_nomem; |
826 | 831 | ||
@@ -1125,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) | |||
1125 | */ | 1130 | */ |
1126 | static struct task_struct *copy_process(unsigned long clone_flags, | 1131 | static struct task_struct *copy_process(unsigned long clone_flags, |
1127 | unsigned long stack_start, | 1132 | unsigned long stack_start, |
1128 | struct pt_regs *regs, | ||
1129 | unsigned long stack_size, | 1133 | unsigned long stack_size, |
1130 | int __user *child_tidptr, | 1134 | int __user *child_tidptr, |
1131 | struct pid *pid, | 1135 | struct pid *pid, |
@@ -1133,7 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1133 | { | 1137 | { |
1134 | int retval; | 1138 | int retval; |
1135 | struct task_struct *p; | 1139 | struct task_struct *p; |
1136 | int cgroup_callbacks_done = 0; | ||
1137 | 1140 | ||
1138 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1139 | return ERR_PTR(-EINVAL); | 1142 | return ERR_PTR(-EINVAL); |
@@ -1220,7 +1223,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1220 | p->utime = p->stime = p->gtime = 0; | 1223 | p->utime = p->stime = p->gtime = 0; |
1221 | p->utimescaled = p->stimescaled = 0; | 1224 | p->utimescaled = p->stimescaled = 0; |
1222 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1225 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1223 | p->prev_utime = p->prev_stime = 0; | 1226 | p->prev_cputime.utime = p->prev_cputime.stime = 0; |
1224 | #endif | 1227 | #endif |
1225 | #if defined(SPLIT_RSS_COUNTING) | 1228 | #if defined(SPLIT_RSS_COUNTING) |
1226 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1229 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
@@ -1318,7 +1321,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1318 | retval = copy_io(clone_flags, p); | 1321 | retval = copy_io(clone_flags, p); |
1319 | if (retval) | 1322 | if (retval) |
1320 | goto bad_fork_cleanup_namespaces; | 1323 | goto bad_fork_cleanup_namespaces; |
1321 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); | 1324 | retval = copy_thread(clone_flags, stack_start, stack_size, p); |
1322 | if (retval) | 1325 | if (retval) |
1323 | goto bad_fork_cleanup_io; | 1326 | goto bad_fork_cleanup_io; |
1324 | 1327 | ||
@@ -1391,12 +1394,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1391 | INIT_LIST_HEAD(&p->thread_group); | 1394 | INIT_LIST_HEAD(&p->thread_group); |
1392 | p->task_works = NULL; | 1395 | p->task_works = NULL; |
1393 | 1396 | ||
1394 | /* Now that the task is set up, run cgroup callbacks if | ||
1395 | * necessary. We need to run them before the task is visible | ||
1396 | * on the tasklist. */ | ||
1397 | cgroup_fork_callbacks(p); | ||
1398 | cgroup_callbacks_done = 1; | ||
1399 | |||
1400 | /* Need tasklist lock for parent etc handling! */ | 1397 | /* Need tasklist lock for parent etc handling! */ |
1401 | write_lock_irq(&tasklist_lock); | 1398 | write_lock_irq(&tasklist_lock); |
1402 | 1399 | ||
@@ -1501,7 +1498,7 @@ bad_fork_cleanup_cgroup: | |||
1501 | #endif | 1498 | #endif |
1502 | if (clone_flags & CLONE_THREAD) | 1499 | if (clone_flags & CLONE_THREAD) |
1503 | threadgroup_change_end(current); | 1500 | threadgroup_change_end(current); |
1504 | cgroup_exit(p, cgroup_callbacks_done); | 1501 | cgroup_exit(p, 0); |
1505 | delayacct_tsk_free(p); | 1502 | delayacct_tsk_free(p); |
1506 | module_put(task_thread_info(p)->exec_domain->module); | 1503 | module_put(task_thread_info(p)->exec_domain->module); |
1507 | bad_fork_cleanup_count: | 1504 | bad_fork_cleanup_count: |
@@ -1513,12 +1510,6 @@ fork_out: | |||
1513 | return ERR_PTR(retval); | 1510 | return ERR_PTR(retval); |
1514 | } | 1511 | } |
1515 | 1512 | ||
1516 | noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | ||
1517 | { | ||
1518 | memset(regs, 0, sizeof(struct pt_regs)); | ||
1519 | return regs; | ||
1520 | } | ||
1521 | |||
1522 | static inline void init_idle_pids(struct pid_link *links) | 1513 | static inline void init_idle_pids(struct pid_link *links) |
1523 | { | 1514 | { |
1524 | enum pid_type type; | 1515 | enum pid_type type; |
@@ -1532,10 +1523,7 @@ static inline void init_idle_pids(struct pid_link *links) | |||
1532 | struct task_struct * __cpuinit fork_idle(int cpu) | 1523 | struct task_struct * __cpuinit fork_idle(int cpu) |
1533 | { | 1524 | { |
1534 | struct task_struct *task; | 1525 | struct task_struct *task; |
1535 | struct pt_regs regs; | 1526 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); |
1536 | |||
1537 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, | ||
1538 | &init_struct_pid, 0); | ||
1539 | if (!IS_ERR(task)) { | 1527 | if (!IS_ERR(task)) { |
1540 | init_idle_pids(task->pids); | 1528 | init_idle_pids(task->pids); |
1541 | init_idle(task, cpu); | 1529 | init_idle(task, cpu); |
@@ -1552,7 +1540,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) | |||
1552 | */ | 1540 | */ |
1553 | long do_fork(unsigned long clone_flags, | 1541 | long do_fork(unsigned long clone_flags, |
1554 | unsigned long stack_start, | 1542 | unsigned long stack_start, |
1555 | struct pt_regs *regs, | ||
1556 | unsigned long stack_size, | 1543 | unsigned long stack_size, |
1557 | int __user *parent_tidptr, | 1544 | int __user *parent_tidptr, |
1558 | int __user *child_tidptr) | 1545 | int __user *child_tidptr) |
@@ -1576,7 +1563,7 @@ long do_fork(unsigned long clone_flags, | |||
1576 | * requested, no event is reported; otherwise, report if the event | 1563 | * requested, no event is reported; otherwise, report if the event |
1577 | * for the type of forking is enabled. | 1564 | * for the type of forking is enabled. |
1578 | */ | 1565 | */ |
1579 | if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { | 1566 | if (!(clone_flags & CLONE_UNTRACED)) { |
1580 | if (clone_flags & CLONE_VFORK) | 1567 | if (clone_flags & CLONE_VFORK) |
1581 | trace = PTRACE_EVENT_VFORK; | 1568 | trace = PTRACE_EVENT_VFORK; |
1582 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | 1569 | else if ((clone_flags & CSIGNAL) != SIGCHLD) |
@@ -1588,7 +1575,7 @@ long do_fork(unsigned long clone_flags, | |||
1588 | trace = 0; | 1575 | trace = 0; |
1589 | } | 1576 | } |
1590 | 1577 | ||
1591 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1578 | p = copy_process(clone_flags, stack_start, stack_size, |
1592 | child_tidptr, NULL, trace); | 1579 | child_tidptr, NULL, trace); |
1593 | /* | 1580 | /* |
1594 | * Do this prior waking up the new thread - the thread pointer | 1581 | * Do this prior waking up the new thread - the thread pointer |
@@ -1632,11 +1619,54 @@ long do_fork(unsigned long clone_flags, | |||
1632 | */ | 1619 | */ |
1633 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) | 1620 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
1634 | { | 1621 | { |
1635 | return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, | 1622 | return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
1636 | (unsigned long)arg, NULL, NULL); | 1623 | (unsigned long)arg, NULL, NULL); |
1637 | } | 1624 | } |
1638 | #endif | 1625 | #endif |
1639 | 1626 | ||
1627 | #ifdef __ARCH_WANT_SYS_FORK | ||
1628 | SYSCALL_DEFINE0(fork) | ||
1629 | { | ||
1630 | #ifdef CONFIG_MMU | ||
1631 | return do_fork(SIGCHLD, 0, 0, NULL, NULL); | ||
1632 | #else | ||
1633 | /* cannot be supported in nommu mode */ | ||
1634 | return(-EINVAL); | ||
1635 | #endif | ||
1636 | } | ||
1637 | #endif | ||
1638 | |||
1639 | #ifdef __ARCH_WANT_SYS_VFORK | ||
1640 | SYSCALL_DEFINE0(vfork) | ||
1641 | { | ||
1642 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, | ||
1643 | 0, NULL, NULL); | ||
1644 | } | ||
1645 | #endif | ||
1646 | |||
1647 | #ifdef __ARCH_WANT_SYS_CLONE | ||
1648 | #ifdef CONFIG_CLONE_BACKWARDS | ||
1649 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1650 | int __user *, parent_tidptr, | ||
1651 | int, tls_val, | ||
1652 | int __user *, child_tidptr) | ||
1653 | #elif defined(CONFIG_CLONE_BACKWARDS2) | ||
1654 | SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, | ||
1655 | int __user *, parent_tidptr, | ||
1656 | int __user *, child_tidptr, | ||
1657 | int, tls_val) | ||
1658 | #else | ||
1659 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1660 | int __user *, parent_tidptr, | ||
1661 | int __user *, child_tidptr, | ||
1662 | int, tls_val) | ||
1663 | #endif | ||
1664 | { | ||
1665 | return do_fork(clone_flags, newsp, 0, | ||
1666 | parent_tidptr, child_tidptr); | ||
1667 | } | ||
1668 | #endif | ||
1669 | |||
1640 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | 1670 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN |
1641 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1671 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
1642 | #endif | 1672 | #endif |
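
The fork.c hunks drop the pt_regs argument from do_fork()/copy_process() and add generic sys_fork/sys_vfork/sys_clone definitions, with CONFIG_CLONE_BACKWARDS and CONFIG_CLONE_BACKWARDS2 capturing the per-architecture argument order of clone(2). The userspace sketch below illustrates the default ordering (e.g. x86-64); treat the exact ABI claim as an assumption to be checked against your architecture.

/* Hedged userspace illustration of why three SYSCALL_DEFINE5 variants
 * exist: the raw clone() argument order is an ABI property of each
 * architecture. The order used here matches the default variant above;
 * on CLONE_BACKWARDS architectures tls precedes child_tidptr instead.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* flags, new stack (0 = reuse caller's, copy-on-write),
	 * parent_tid, child_tid, tls -- with only SIGCHLD this is fork().
	 */
	long pid = syscall(SYS_clone, SIGCHLD, 0UL, NULL, NULL, 0UL);

	if (pid == 0) {
		_exit(0);		/* child */
	} else if (pid > 0) {
		waitpid(pid, NULL, 0);	/* parent reaps the child */
		printf("cloned child %ld\n", pid);
	} else {
		perror("clone");
	}
	return 0;
}
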
diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) | |||
116 | return false; | 116 | return false; |
117 | } | 117 | } |
118 | 118 | ||
119 | if (!(p->flags & PF_KTHREAD)) { | 119 | if (!(p->flags & PF_KTHREAD)) |
120 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
121 | /* | 121 | else |
122 | * fake_signal_wake_up() goes through p's scheduler | ||
123 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
124 | * TASK_RUNNING transition can't race with task state | ||
125 | * testing in try_to_freeze_tasks(). | ||
126 | */ | ||
127 | } else { | ||
128 | wake_up_state(p, TASK_INTERRUPTIBLE); | 122 | wake_up_state(p, TASK_INTERRUPTIBLE); |
129 | } | ||
130 | 123 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | 124 | spin_unlock_irqrestore(&freezer_lock, flags); |
132 | return true; | 125 | return true; |
diff --git a/kernel/futex.c b/kernel/futex.c index 3717e7b306e0..19eb089ca003 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
716 | struct futex_pi_state **ps, | 716 | struct futex_pi_state **ps, |
717 | struct task_struct *task, int set_waiters) | 717 | struct task_struct *task, int set_waiters) |
718 | { | 718 | { |
719 | int lock_taken, ret, ownerdied = 0; | 719 | int lock_taken, ret, force_take = 0; |
720 | u32 uval, newval, curval, vpid = task_pid_vnr(task); | 720 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
721 | 721 | ||
722 | retry: | 722 | retry: |
@@ -755,17 +755,15 @@ retry: | |||
755 | newval = curval | FUTEX_WAITERS; | 755 | newval = curval | FUTEX_WAITERS; |
756 | 756 | ||
757 | /* | 757 | /* |
758 | * There are two cases, where a futex might have no owner (the | 758 | * Should we force take the futex? See below. |
759 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
760 | * case. We also do an unconditional take over, when the owner | ||
761 | * of the futex died. | ||
762 | * | ||
763 | * This is safe as we are protected by the hash bucket lock ! | ||
764 | */ | 759 | */ |
765 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | 760 | if (unlikely(force_take)) { |
766 | /* Keep the OWNER_DIED bit */ | 761 | /* |
762 | * Keep the OWNER_DIED and the WAITERS bit and set the | ||
763 | * new TID value. | ||
764 | */ | ||
767 | newval = (curval & ~FUTEX_TID_MASK) | vpid; | 765 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
768 | ownerdied = 0; | 766 | force_take = 0; |
769 | lock_taken = 1; | 767 | lock_taken = 1; |
770 | } | 768 | } |
771 | 769 | ||
@@ -775,7 +773,7 @@ retry: | |||
775 | goto retry; | 773 | goto retry; |
776 | 774 | ||
777 | /* | 775 | /* |
778 | * We took the lock due to owner died take over. | 776 | * We took the lock due to forced take over. |
779 | */ | 777 | */ |
780 | if (unlikely(lock_taken)) | 778 | if (unlikely(lock_taken)) |
781 | return 1; | 779 | return 1; |
@@ -790,20 +788,25 @@ retry: | |||
790 | switch (ret) { | 788 | switch (ret) { |
791 | case -ESRCH: | 789 | case -ESRCH: |
792 | /* | 790 | /* |
793 | * No owner found for this futex. Check if the | 791 | * We failed to find an owner for this |
794 | * OWNER_DIED bit is set to figure out whether | 792 | * futex. So we have no pi_state to block |
795 | * this is a robust futex or not. | 793 | * on. This can happen in two cases: |
794 | * | ||
795 | * 1) The owner died | ||
796 | * 2) A stale FUTEX_WAITERS bit | ||
797 | * | ||
798 | * Re-read the futex value. | ||
796 | */ | 799 | */ |
797 | if (get_futex_value_locked(&curval, uaddr)) | 800 | if (get_futex_value_locked(&curval, uaddr)) |
798 | return -EFAULT; | 801 | return -EFAULT; |
799 | 802 | ||
800 | /* | 803 | /* |
801 | * We simply start over in case of a robust | 804 | * If the owner died or we have a stale |
802 | * futex. The code above will take the futex | 805 | * WAITERS bit the owner TID in the user space |
803 | * and return happy. | 806 | * futex is 0. |
804 | */ | 807 | */ |
805 | if (curval & FUTEX_OWNER_DIED) { | 808 | if (!(curval & FUTEX_TID_MASK)) { |
806 | ownerdied = 1; | 809 | force_take = 1; |
807 | goto retry; | 810 | goto retry; |
808 | } | 811 | } |
809 | default: | 812 | default: |
@@ -840,6 +843,9 @@ static void wake_futex(struct futex_q *q) | |||
840 | { | 843 | { |
841 | struct task_struct *p = q->task; | 844 | struct task_struct *p = q->task; |
842 | 845 | ||
846 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | ||
847 | return; | ||
848 | |||
843 | /* | 849 | /* |
844 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 850 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
845 | * a non-futex wake up happens on another CPU then the task | 851 | * a non-futex wake up happens on another CPU then the task |
@@ -1075,6 +1081,10 @@ retry_private: | |||
1075 | 1081 | ||
1076 | plist_for_each_entry_safe(this, next, head, list) { | 1082 | plist_for_each_entry_safe(this, next, head, list) { |
1077 | if (match_futex (&this->key, &key1)) { | 1083 | if (match_futex (&this->key, &key1)) { |
1084 | if (this->pi_state || this->rt_waiter) { | ||
1085 | ret = -EINVAL; | ||
1086 | goto out_unlock; | ||
1087 | } | ||
1078 | wake_futex(this); | 1088 | wake_futex(this); |
1079 | if (++ret >= nr_wake) | 1089 | if (++ret >= nr_wake) |
1080 | break; | 1090 | break; |
@@ -1087,6 +1097,10 @@ retry_private: | |||
1087 | op_ret = 0; | 1097 | op_ret = 0; |
1088 | plist_for_each_entry_safe(this, next, head, list) { | 1098 | plist_for_each_entry_safe(this, next, head, list) { |
1089 | if (match_futex (&this->key, &key2)) { | 1099 | if (match_futex (&this->key, &key2)) { |
1100 | if (this->pi_state || this->rt_waiter) { | ||
1101 | ret = -EINVAL; | ||
1102 | goto out_unlock; | ||
1103 | } | ||
1090 | wake_futex(this); | 1104 | wake_futex(this); |
1091 | if (++op_ret >= nr_wake2) | 1105 | if (++op_ret >= nr_wake2) |
1092 | break; | 1106 | break; |
@@ -1095,6 +1109,7 @@ retry_private: | |||
1095 | ret += op_ret; | 1109 | ret += op_ret; |
1096 | } | 1110 | } |
1097 | 1111 | ||
1112 | out_unlock: | ||
1098 | double_unlock_hb(hb1, hb2); | 1113 | double_unlock_hb(hb1, hb2); |
1099 | out_put_keys: | 1114 | out_put_keys: |
1100 | put_futex_key(&key2); | 1115 | put_futex_key(&key2); |
@@ -1384,9 +1399,13 @@ retry_private: | |||
1384 | /* | 1399 | /* |
1385 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always | 1400 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always |
1386 | * be paired with each other and no other futex ops. | 1401 | * be paired with each other and no other futex ops. |
1402 | * | ||
1403 | * We should never be requeueing a futex_q with a pi_state, | ||
1404 | * which is awaiting a futex_unlock_pi(). | ||
1387 | */ | 1405 | */ |
1388 | if ((requeue_pi && !this->rt_waiter) || | 1406 | if ((requeue_pi && !this->rt_waiter) || |
1389 | (!requeue_pi && this->rt_waiter)) { | 1407 | (!requeue_pi && this->rt_waiter) || |
1408 | this->pi_state) { | ||
1390 | ret = -EINVAL; | 1409 | ret = -EINVAL; |
1391 | break; | 1410 | break; |
1392 | } | 1411 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 57d86d07221e..3aca9f29d30e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq) | |||
272 | 272 | ||
273 | raw_spin_lock_irq(&desc->lock); | 273 | raw_spin_lock_irq(&desc->lock); |
274 | 274 | ||
275 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 276 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 277 | ||
277 | action = desc->action; | 278 | action = desc->action; |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4e69e24d3d7d..96f3a1d9c379 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
177 | irq_base = irq_alloc_descs(first_irq, first_irq, size, | 177 | irq_base = irq_alloc_descs(first_irq, first_irq, size, |
178 | of_node_to_nid(of_node)); | 178 | of_node_to_nid(of_node)); |
179 | if (irq_base < 0) { | 179 | if (irq_base < 0) { |
180 | WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", | 180 | pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", |
181 | first_irq); | 181 | first_irq); |
182 | irq_base = first_irq; | 182 | irq_base = first_irq; |
183 | } | 183 | } |
184 | } else | 184 | } else |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c69326aa773..35c70c9e24d8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
616 | return ret; | 616 | return ret; |
617 | } | 617 | } |
618 | 618 | ||
619 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
620 | int irq_set_parent(int irq, int parent_irq) | ||
621 | { | ||
622 | unsigned long flags; | ||
623 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); | ||
624 | |||
625 | if (!desc) | ||
626 | return -EINVAL; | ||
627 | |||
628 | desc->parent_irq = parent_irq; | ||
629 | |||
630 | irq_put_desc_unlock(desc, flags); | ||
631 | return 0; | ||
632 | } | ||
633 | #endif | ||
634 | |||
619 | /* | 635 | /* |
620 | * Default primary interrupt handler for threaded interrupts. Is | 636 | * Default primary interrupt handler for threaded interrupts. Is |
621 | * assigned as primary handler when request_threaded_irq is called | 637 | * assigned as primary handler when request_threaded_irq is called |
@@ -716,6 +732,7 @@ static void | |||
716 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 732 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
717 | { | 733 | { |
718 | cpumask_var_t mask; | 734 | cpumask_var_t mask; |
735 | bool valid = true; | ||
719 | 736 | ||
720 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) | 737 | if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) |
721 | return; | 738 | return; |
@@ -730,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
730 | } | 747 | } |
731 | 748 | ||
732 | raw_spin_lock_irq(&desc->lock); | 749 | raw_spin_lock_irq(&desc->lock); |
733 | cpumask_copy(mask, desc->irq_data.affinity); | 750 | /* |
751 | * This code is triggered unconditionally. Check the affinity | ||
752 | * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. | ||
753 | */ | ||
754 | if (desc->irq_data.affinity) | ||
755 | cpumask_copy(mask, desc->irq_data.affinity); | ||
756 | else | ||
757 | valid = false; | ||
734 | raw_spin_unlock_irq(&desc->lock); | 758 | raw_spin_unlock_irq(&desc->lock); |
735 | 759 | ||
736 | set_cpus_allowed_ptr(current, mask); | 760 | if (valid) |
761 | set_cpus_allowed_ptr(current, mask); | ||
737 | free_cpumask_var(mask); | 762 | free_cpumask_var(mask); |
738 | } | 763 | } |
739 | #else | 764 | #else |
@@ -833,6 +858,8 @@ static int irq_thread(void *data) | |||
833 | init_task_work(&on_exit_work, irq_thread_dtor); | 858 | init_task_work(&on_exit_work, irq_thread_dtor); |
834 | task_work_add(current, &on_exit_work, false); | 859 | task_work_add(current, &on_exit_work, false); |
835 | 860 | ||
861 | irq_thread_check_affinity(desc, action); | ||
862 | |||
836 | while (!irq_wait_for_interrupt(action)) { | 863 | while (!irq_wait_for_interrupt(action)) { |
837 | irqreturn_t action_ret; | 864 | irqreturn_t action_ret; |
838 | 865 | ||
@@ -936,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
936 | */ | 963 | */ |
937 | get_task_struct(t); | 964 | get_task_struct(t); |
938 | new->thread = t; | 965 | new->thread = t; |
966 | /* | ||
967 | * Tell the thread to set its affinity. This is | ||
968 | * important for shared interrupt handlers as we do | ||
969 | * not invoke setup_affinity() for the secondary | ||
970 | * handlers as everything is already set up. Even for | ||
971 | * interrupts marked with IRQF_NOBALANCING this is | ||
972 | * correct as we want the thread to move to the cpu(s) | ||
973 | * on which the requesting code placed the interrupt. | ||
974 | */ | ||
975 | set_bit(IRQTF_AFFINITY, &new->thread_flags); | ||
939 | } | 976 | } |
940 | 977 | ||
941 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | 978 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 6454db7b6a4d..9065107f083e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
74 | if (!desc->irq_data.chip->irq_retrigger || | 74 | if (!desc->irq_data.chip->irq_retrigger || |
75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 75 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
76 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 76 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
77 | /* | ||
78 | * If the interrupt has a parent irq and runs | ||
79 | * in the thread context of the parent irq, | ||
80 | * retrigger the parent. | ||
81 | */ | ||
82 | if (desc->parent_irq && | ||
83 | irq_settings_is_nested_thread(desc)) | ||
84 | irq = desc->parent_irq; | ||
77 | /* Set it pending and activate the softirq: */ | 85 | /* Set it pending and activate the softirq: */ |
78 | set_bit(irq, irqs_resend); | 86 | set_bit(irq, irqs_resend); |
79 | tasklet_schedule(&resend_tasklet); | 87 | tasklet_schedule(&resend_tasklet); |
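Editor's note: the parent/child relationship consulted by this resend path is declared by the driver that owns the nested handler. A hypothetical driver fragment, assuming the irq_set_parent()/irq_set_nested_thread() helpers from the genirq core (irq and parent_irq are placeholders):

    /* Declare that "irq" is handled in the thread context of "parent_irq",
     * so a software resend retriggers the parent rather than the child. */
    irq_set_parent(irq, parent_irq);
    irq_set_nested_thread(irq, true);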
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..6ada93c23a9a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |||
26 | static struct kobj_attribute _name##_attr = \ | 26 | static struct kobj_attribute _name##_attr = \ |
27 | __ATTR(_name, 0644, _name##_show, _name##_store) | 27 | __ATTR(_name, 0644, _name##_show, _name##_store) |
28 | 28 | ||
29 | #if defined(CONFIG_HOTPLUG) | ||
30 | /* current uevent sequence number */ | 29 | /* current uevent sequence number */ |
31 | static ssize_t uevent_seqnum_show(struct kobject *kobj, | 30 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
32 | struct kobj_attribute *attr, char *buf) | 31 | struct kobj_attribute *attr, char *buf) |
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, | |||
54 | return count; | 53 | return count; |
55 | } | 54 | } |
56 | KERNEL_ATTR_RW(uevent_helper); | 55 | KERNEL_ATTR_RW(uevent_helper); |
57 | #endif | 56 | |
58 | 57 | ||
59 | #ifdef CONFIG_PROFILING | 58 | #ifdef CONFIG_PROFILING |
60 | static ssize_t profiling_show(struct kobject *kobj, | 59 | static ssize_t profiling_show(struct kobject *kobj, |
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj, | |||
141 | } | 140 | } |
142 | KERNEL_ATTR_RO(fscaps); | 141 | KERNEL_ATTR_RO(fscaps); |
143 | 142 | ||
143 | int rcu_expedited; | ||
144 | static ssize_t rcu_expedited_show(struct kobject *kobj, | ||
145 | struct kobj_attribute *attr, char *buf) | ||
146 | { | ||
147 | return sprintf(buf, "%d\n", rcu_expedited); | ||
148 | } | ||
149 | static ssize_t rcu_expedited_store(struct kobject *kobj, | ||
150 | struct kobj_attribute *attr, | ||
151 | const char *buf, size_t count) | ||
152 | { | ||
153 | if (kstrtoint(buf, 0, &rcu_expedited)) | ||
154 | return -EINVAL; | ||
155 | |||
156 | return count; | ||
157 | } | ||
158 | KERNEL_ATTR_RW(rcu_expedited); | ||
159 | |||
144 | /* | 160 | /* |
145 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 161 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
146 | */ | 162 | */ |
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj); | |||
169 | 185 | ||
170 | static struct attribute * kernel_attrs[] = { | 186 | static struct attribute * kernel_attrs[] = { |
171 | &fscaps_attr.attr, | 187 | &fscaps_attr.attr, |
172 | #if defined(CONFIG_HOTPLUG) | ||
173 | &uevent_seqnum_attr.attr, | 188 | &uevent_seqnum_attr.attr, |
174 | &uevent_helper_attr.attr, | 189 | &uevent_helper_attr.attr, |
175 | #endif | ||
176 | #ifdef CONFIG_PROFILING | 190 | #ifdef CONFIG_PROFILING |
177 | &profiling_attr.attr, | 191 | &profiling_attr.attr, |
178 | #endif | 192 | #endif |
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = { | |||
182 | &kexec_crash_size_attr.attr, | 196 | &kexec_crash_size_attr.attr, |
183 | &vmcoreinfo_attr.attr, | 197 | &vmcoreinfo_attr.attr, |
184 | #endif | 198 | #endif |
199 | &rcu_expedited_attr.attr, | ||
185 | NULL | 200 | NULL |
186 | }; | 201 | }; |
187 | 202 | ||
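Editor's note: with the CONFIG_HOTPLUG guards dropped, the uevent attributes are now built unconditionally, and the new attribute wired into kernel_attrs[] appears as /sys/kernel/rcu_expedited. Writing a nonzero value there (for example, echo 1 > /sys/kernel/rcu_expedited), or setting the rcu_expedited module parameter added in kernel/rcupdate.c below, steers the synchronize_*() primitives onto their expedited paths, as the rcutiny hunk further down illustrates.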
diff --git a/kernel/kthread.c b/kernel/kthread.c index 29fb60caecb5..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -428,7 +428,7 @@ int kthreadd(void *unused) | |||
428 | set_task_comm(tsk, "kthreadd"); | 428 | set_task_comm(tsk, "kthreadd"); |
429 | ignore_signals(tsk); | 429 | ignore_signals(tsk); |
430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 430 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
431 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 431 | set_mems_allowed(node_states[N_MEMORY]); |
432 | 432 | ||
433 | current->flags |= PF_NOFREEZE; | 433 | current->flags |= PF_NOFREEZE; |
434 | 434 | ||
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 91c32a0b612c..b2c71c5873e4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v) | |||
39 | 39 | ||
40 | static void print_name(struct seq_file *m, struct lock_class *class) | 40 | static void print_name(struct seq_file *m, struct lock_class *class) |
41 | { | 41 | { |
42 | char str[128]; | 42 | char str[KSYM_NAME_LEN]; |
43 | const char *name = class->name; | 43 | const char *name = class->name; |
44 | 44 | ||
45 | if (!name) { | 45 | if (!name) { |
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 4646eb2c3820..767e559dfb10 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c | |||
@@ -21,10 +21,10 @@ struct key *modsign_keyring; | |||
21 | extern __initdata const u8 modsign_certificate_list[]; | 21 | extern __initdata const u8 modsign_certificate_list[]; |
22 | extern __initdata const u8 modsign_certificate_list_end[]; | 22 | extern __initdata const u8 modsign_certificate_list_end[]; |
23 | asm(".section .init.data,\"aw\"\n" | 23 | asm(".section .init.data,\"aw\"\n" |
24 | "modsign_certificate_list:\n" | 24 | SYMBOL_PREFIX "modsign_certificate_list:\n" |
25 | ".incbin \"signing_key.x509\"\n" | 25 | ".incbin \"signing_key.x509\"\n" |
26 | ".incbin \"extra_certificates\"\n" | 26 | ".incbin \"extra_certificates\"\n" |
27 | "modsign_certificate_list_end:" | 27 | SYMBOL_PREFIX "modsign_certificate_list_end:" |
28 | ); | 28 | ); |
29 | 29 | ||
30 | /* | 30 | /* |
diff --git a/kernel/module.c b/kernel/module.c index 6085f5ef88ea..808bd62e1723 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -372,9 +372,6 @@ static bool check_symbol(const struct symsearch *syms, | |||
372 | printk(KERN_WARNING "Symbol %s is being used " | 372 | printk(KERN_WARNING "Symbol %s is being used " |
373 | "by a non-GPL module, which will not " | 373 | "by a non-GPL module, which will not " |
374 | "be allowed in the future\n", fsa->name); | 374 | "be allowed in the future\n", fsa->name); |
375 | printk(KERN_WARNING "Please see the file " | ||
376 | "Documentation/feature-removal-schedule.txt " | ||
377 | "in the kernel source tree for more details.\n"); | ||
378 | } | 375 | } |
379 | } | 376 | } |
380 | 377 | ||
@@ -2293,12 +2290,17 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
2293 | src = (void *)info->hdr + symsect->sh_offset; | 2290 | src = (void *)info->hdr + symsect->sh_offset; |
2294 | nsrc = symsect->sh_size / sizeof(*src); | 2291 | nsrc = symsect->sh_size / sizeof(*src); |
2295 | 2292 | ||
2293 | /* strtab always starts with a nul, so offset 0 is the empty string. */ | ||
2294 | strtab_size = 1; | ||
2295 | |||
2296 | /* Compute total space required for the core symbols' strtab. */ | 2296 | /* Compute total space required for the core symbols' strtab. */ |
2297 | for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) | 2297 | for (ndst = i = 0; i < nsrc; i++) { |
2298 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { | 2298 | if (i == 0 || |
2299 | strtab_size += strlen(&info->strtab[src->st_name]) + 1; | 2299 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { |
2300 | strtab_size += strlen(&info->strtab[src[i].st_name])+1; | ||
2300 | ndst++; | 2301 | ndst++; |
2301 | } | 2302 | } |
2303 | } | ||
2302 | 2304 | ||
2303 | /* Append room for core symbols at end of core part. */ | 2305 | /* Append room for core symbols at end of core part. */ |
2304 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 2306 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
@@ -2332,15 +2334,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
2332 | mod->core_symtab = dst = mod->module_core + info->symoffs; | 2334 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2333 | mod->core_strtab = s = mod->module_core + info->stroffs; | 2335 | mod->core_strtab = s = mod->module_core + info->stroffs; |
2334 | src = mod->symtab; | 2336 | src = mod->symtab; |
2335 | *dst = *src; | ||
2336 | *s++ = 0; | 2337 | *s++ = 0; |
2337 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2338 | for (ndst = i = 0; i < mod->num_symtab; i++) { |
2338 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) | 2339 | if (i == 0 || |
2339 | continue; | 2340 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { |
2340 | 2341 | dst[ndst] = src[i]; | |
2341 | dst[ndst] = *src; | 2342 | dst[ndst++].st_name = s - mod->core_strtab; |
2342 | dst[ndst++].st_name = s - mod->core_strtab; | 2343 | s += strlcpy(s, &mod->strtab[src[i].st_name], |
2343 | s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; | 2344 | KSYM_NAME_LEN) + 1; |
2345 | } | ||
2344 | } | 2346 | } |
2345 | mod->core_num_syms = ndst; | 2347 | mod->core_num_syms = ndst; |
2346 | } | 2348 | } |
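Editor's note: the layout_symtab()/add_kallsyms() rework stops special-casing symbol 0 outside the loop and walks the table by index instead, keeping entry 0 plus every core symbol, sizing the compacted string table in a first pass and copying strings while fixing up st_name in a second. A userspace model of that two-pass compaction, with made-up symbol names and a trivial keep predicate, is sketched below.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct sym { const char *name; int keep; };

    int main(void)
    {
            struct sym src[] = { { "", 1 }, { "local_tmp", 0 }, { "core_fn", 1 } };
            size_t nsrc = sizeof(src) / sizeof(src[0]);
            size_t strtab_size = 1;         /* slot 0 is the mandatory empty string */
            size_t ndst = 0;

            /* Pass 1: how much string space do the kept symbols need? */
            for (size_t i = 0; i < nsrc; i++)
                    if (i == 0 || src[i].keep) {
                            strtab_size += strlen(src[i].name) + 1;
                            ndst++;
                    }

            char *strtab = calloc(1, strtab_size);
            size_t *st_name = calloc(ndst, sizeof(*st_name));
            char *s = strtab + 1;           /* leave the leading NUL in place */

            /* Pass 2: copy kept symbols and record their new string offsets. */
            for (size_t i = 0, d = 0; i < nsrc; i++)
                    if (i == 0 || src[i].keep) {
                            st_name[d] = (size_t)(s - strtab);
                            s += strlen(strcpy(s, src[i].name)) + 1;
                            d++;
                    }

            printf("%zu symbols, %zu string bytes, last st_name=%zu\n",
                   ndst, strtab_size, st_name[ndst - 1]);
            free(strtab);
            free(st_name);
            return 0;
    }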
diff --git a/kernel/module_signing.c b/kernel/module_signing.c index ea1b1df5dbb0..f2970bddc5ea 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c | |||
@@ -27,13 +27,13 @@ | |||
27 | * - Information block | 27 | * - Information block |
28 | */ | 28 | */ |
29 | struct module_signature { | 29 | struct module_signature { |
30 | enum pkey_algo algo : 8; /* Public-key crypto algorithm */ | 30 | u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ |
31 | enum pkey_hash_algo hash : 8; /* Digest algorithm */ | 31 | u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ |
32 | enum pkey_id_type id_type : 8; /* Key identifier type */ | 32 | u8 id_type; /* Key identifier type [enum pkey_id_type] */ |
33 | u8 signer_len; /* Length of signer's name */ | 33 | u8 signer_len; /* Length of signer's name */ |
34 | u8 key_id_len; /* Length of key identifier */ | 34 | u8 key_id_len; /* Length of key identifier */ |
35 | u8 __pad[3]; | 35 | u8 __pad[3]; |
36 | __be32 sig_len; /* Length of signature data */ | 36 | __be32 sig_len; /* Length of signature data */ |
37 | }; | 37 | }; |
38 | 38 | ||
39 | /* | 39 | /* |
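Editor's note: the switch from enum bitfields to plain u8 in struct module_signature matters because this structure describes bytes appended to the module file, and the storage unit, width and signedness of an enum bitfield are implementation-defined, so the on-disk layout must not be left to the compiler. A userspace illustration of the same idea; all *_example names are invented for the sketch, and the enum values merely mirror the kernel's enum pkey_algo.

    #include <assert.h>
    #include <stdint.h>

    /* Readable values, kept as an enum in code only. */
    enum pkey_algo_example { PKEY_ALGO_DSA = 0, PKEY_ALGO_RSA = 1 };

    /* Fixed-width fields pin the on-disk layout; an enum bitfield's
     * representation could silently change it between compilers/ABIs. */
    struct sig_header_example {
            uint8_t  algo;          /* holds an enum pkey_algo_example value */
            uint8_t  hash;
            uint8_t  id_type;
            uint8_t  signer_len;
            uint8_t  key_id_len;
            uint8_t  pad[3];
            uint32_t sig_len;       /* stored big-endian; convert before use */
    };

    static_assert(sizeof(struct sig_header_example) == 12, "layout drifted");

    int main(void)
    {
            struct sig_header_example h = { .algo = PKEY_ALGO_RSA };
            return h.algo == PKEY_ALGO_RSA ? 0 : 1;
    }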
diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
171 | { | 171 | { |
172 | int cpu, num_cpus; | 172 | int cpu, num_cpus; |
173 | unsigned int next_nr, next_index; | 173 | unsigned int next_nr, next_index; |
174 | struct padata_parallel_queue *queue, *next_queue; | 174 | struct padata_parallel_queue *next_queue; |
175 | struct padata_priv *padata; | 175 | struct padata_priv *padata; |
176 | struct padata_list *reorder; | 176 | struct padata_list *reorder; |
177 | 177 | ||
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
204 | goto out; | 204 | goto out; |
205 | } | 205 | } |
206 | 206 | ||
207 | queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); | 207 | if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { |
208 | if (queue->cpu_index == next_queue->cpu_index) { | ||
209 | padata = ERR_PTR(-ENODATA); | 208 | padata = ERR_PTR(-ENODATA); |
210 | goto out; | 209 | goto out; |
211 | } | 210 | } |
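Editor's note: the padata change reads the local CPU's cpu_index with a single __this_cpu_read() instead of first materialising the per-CPU pointer via per_cpu_ptr(pd->pqueue, smp_processor_id()). The result is equivalent because padata_get_next() is reached from the reorder path, which already runs in a context where the CPU cannot change under it, and the shorthand drops the extra pointer arithmetic along with the now-unused queue local.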
diff --git a/kernel/pid.c b/kernel/pid.c index 3026ddae0a34..36aa02ff17d6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Generic pidhash and scalable, time-bounded PID allocator | 2 | * Generic pidhash and scalable, time-bounded PID allocator |
3 | * | 3 | * |
4 | * (C) 2002-2003 William Irwin, IBM | 4 | * (C) 2002-2003 Nadia Yvette Chambers, IBM |
5 | * (C) 2004 William Irwin, Oracle | 5 | * (C) 2004 Nadia Yvette Chambers, Oracle |
6 | * (C) 2002-2004 Ingo Molnar, Red Hat | 6 | * (C) 2002-2004 Ingo Molnar, Red Hat |
7 | * | 7 | * |
8 | * pid-structures are backing objects for tasks sharing a given ID to chain | 8 | * pid-structures are backing objects for tasks sharing a given ID to chain |
@@ -84,21 +84,6 @@ struct pid_namespace init_pid_ns = { | |||
84 | }; | 84 | }; |
85 | EXPORT_SYMBOL_GPL(init_pid_ns); | 85 | EXPORT_SYMBOL_GPL(init_pid_ns); |
86 | 86 | ||
87 | int is_container_init(struct task_struct *tsk) | ||
88 | { | ||
89 | int ret = 0; | ||
90 | struct pid *pid; | ||
91 | |||
92 | rcu_read_lock(); | ||
93 | pid = task_pid(tsk); | ||
94 | if (pid != NULL && pid->numbers[pid->level].nr == 1) | ||
95 | ret = 1; | ||
96 | rcu_read_unlock(); | ||
97 | |||
98 | return ret; | ||
99 | } | ||
100 | EXPORT_SYMBOL(is_container_init); | ||
101 | |||
102 | /* | 87 | /* |
103 | * Note: disable interrupts while the pidmap_lock is held as an | 88 | * Note: disable interrupts while the pidmap_lock is held as an |
104 | * interrupt might come in and do read_lock(&tasklist_lock). | 89 | * interrupt might come in and do read_lock(&tasklist_lock). |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..d73840271dce 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
217 | return 0; | 217 | return 0; |
218 | } | 218 | } |
219 | 219 | ||
220 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
221 | { | ||
222 | struct signal_struct *sig = tsk->signal; | ||
223 | struct task_struct *t; | ||
224 | |||
225 | times->utime = sig->utime; | ||
226 | times->stime = sig->stime; | ||
227 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
228 | |||
229 | rcu_read_lock(); | ||
230 | /* make sure we can trust tsk->thread_group list */ | ||
231 | if (!likely(pid_alive(tsk))) | ||
232 | goto out; | ||
233 | |||
234 | t = tsk; | ||
235 | do { | ||
236 | times->utime += t->utime; | ||
237 | times->stime += t->stime; | ||
238 | times->sum_exec_runtime += task_sched_runtime(t); | ||
239 | } while_each_thread(tsk, t); | ||
240 | out: | ||
241 | rcu_read_unlock(); | ||
242 | } | ||
243 | |||
244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 220 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
245 | { | 221 | { |
246 | if (b->utime > a->utime) | 222 | if (b->utime > a->utime) |
diff --git a/kernel/power/main.c b/kernel/power/main.c index f458238109cc..1c16f9167de1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
59 | { | 59 | { |
60 | unsigned long val; | 60 | unsigned long val; |
61 | 61 | ||
62 | if (strict_strtoul(buf, 10, &val)) | 62 | if (kstrtoul(buf, 10, &val)) |
63 | return -EINVAL; | 63 | return -EINVAL; |
64 | 64 | ||
65 | if (val > 1) | 65 | if (val > 1) |
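Editor's note: strict_strtoul() is the older name for this helper; the kstrto*() family is the current interface and behaves the same way here, parsing the whole buffer (a single trailing newline is tolerated) and returning 0 on success or a negative errno. A sketch of the usual pattern inside a sysfs store callback, not a standalone program:

    unsigned long val;
    int err;

    /* kstrtoul(): base 10, rejects trailing junk, reports -EINVAL/-ERANGE */
    err = kstrtoul(buf, 10, &val);
    if (err)
            return err;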
diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e13..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
48 | if (p == current || !freeze_task(p)) | 48 | if (p == current || !freeze_task(p)) |
49 | continue; | 49 | continue; |
50 | 50 | ||
51 | /* | 51 | if (!freezer_should_skip(p)) |
52 | * Now that we've done set_freeze_flag, don't | ||
53 | * perturb a task in TASK_STOPPED or TASK_TRACED. | ||
54 | * It is "frozen enough". If the task does wake | ||
55 | * up, it will immediately call try_to_freeze. | ||
56 | * | ||
57 | * Because freeze_task() goes through p's scheduler lock, it's | ||
58 | * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING | ||
59 | * transition can't race with task state testing here. | ||
60 | */ | ||
61 | if (!task_is_stopped_or_traced(p) && | ||
62 | !freezer_should_skip(p)) | ||
63 | todo++; | 52 | todo++; |
64 | } while_each_thread(g, p); | 53 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 54 | read_unlock(&tasklist_lock); |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 846bd42c7ed1..9322ff7eaad6 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | |||
213 | } | 213 | } |
214 | 214 | ||
215 | /** | 215 | /** |
216 | * pm_qos_flags_remove_req - Remove device PM QoS flags request. | ||
217 | * @pqf: Device PM QoS flags set to remove the request from. | ||
218 | * @req: Request to remove from the set. | ||
219 | */ | ||
220 | static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, | ||
221 | struct pm_qos_flags_request *req) | ||
222 | { | ||
223 | s32 val = 0; | ||
224 | |||
225 | list_del(&req->node); | ||
226 | list_for_each_entry(req, &pqf->list, node) | ||
227 | val |= req->flags; | ||
228 | |||
229 | pqf->effective_flags = val; | ||
230 | } | ||
231 | |||
232 | /** | ||
233 | * pm_qos_update_flags - Update a set of PM QoS flags. | ||
234 | * @pqf: Set of flags to update. | ||
235 | * @req: Request to add to the set, to modify, or to remove from the set. | ||
236 | * @action: Action to take on the set. | ||
237 | * @val: Value of the request to add or modify. | ||
238 | * | ||
239 | * Update the given set of PM QoS flags and call notifiers if the aggregate | ||
240 | * value has changed. Returns 1 if the aggregate constraint value has changed, | ||
241 | * 0 otherwise. | ||
242 | */ | ||
243 | bool pm_qos_update_flags(struct pm_qos_flags *pqf, | ||
244 | struct pm_qos_flags_request *req, | ||
245 | enum pm_qos_req_action action, s32 val) | ||
246 | { | ||
247 | unsigned long irqflags; | ||
248 | s32 prev_value, curr_value; | ||
249 | |||
250 | spin_lock_irqsave(&pm_qos_lock, irqflags); | ||
251 | |||
252 | prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
253 | |||
254 | switch (action) { | ||
255 | case PM_QOS_REMOVE_REQ: | ||
256 | pm_qos_flags_remove_req(pqf, req); | ||
257 | break; | ||
258 | case PM_QOS_UPDATE_REQ: | ||
259 | pm_qos_flags_remove_req(pqf, req); | ||
260 | case PM_QOS_ADD_REQ: | ||
261 | req->flags = val; | ||
262 | INIT_LIST_HEAD(&req->node); | ||
263 | list_add_tail(&req->node, &pqf->list); | ||
264 | pqf->effective_flags |= val; | ||
265 | break; | ||
266 | default: | ||
267 | /* no action */ | ||
268 | ; | ||
269 | } | ||
270 | |||
271 | curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; | ||
272 | |||
273 | spin_unlock_irqrestore(&pm_qos_lock, irqflags); | ||
274 | |||
275 | return prev_value != curr_value; | ||
276 | } | ||
277 | |||
278 | /** | ||
216 | * pm_qos_request - returns current system wide qos expectation | 279 | * pm_qos_request - returns current system wide qos expectation |
217 | * @pm_qos_class: identification of which qos value is requested | 280 | * @pm_qos_class: identification of which qos value is requested |
218 | * | 281 | * |
@@ -500,7 +563,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
500 | } else { | 563 | } else { |
501 | ascii_value[count] = '\0'; | 564 | ascii_value[count] = '\0'; |
502 | } | 565 | } |
503 | ret = strict_strtoul(ascii_value, 16, &ulval); | 566 | ret = kstrtoul(ascii_value, 16, &ulval); |
504 | if (ret) { | 567 | if (ret) { |
505 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | 568 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); |
506 | return -EINVAL; | 569 | return -EINVAL; |
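Editor's note: two details of pm_qos_update_flags() are easy to miss. The PM_QOS_UPDATE_REQ case deliberately falls through into PM_QOS_ADD_REQ (an update is implemented as remove-then-re-add), and because the aggregate is the bitwise OR of all outstanding requests, adding can simply OR the new flags in while removal must rescan the remaining requests, which is what pm_qos_flags_remove_req() does. A toy model of that asymmetry:

    #include <stdio.h>

    int main(void)
    {
            int reqs[] = { 0x1, 0x2, 0x2 };         /* outstanding flag requests */
            int n = 3, effective = 0;

            for (int i = 0; i < n; i++)             /* add: OR each request in */
                    effective |= reqs[i];

            reqs[0] = 0;                            /* drop the 0x1 request ... */
            effective = 0;                          /* ... then recompute by rescanning */
            for (int i = 0; i < n; i++)
                    effective |= reqs[i];

            printf("effective flags: %#x\n", effective);    /* prints 0x2 */
            return 0;
    }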
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3c9d764eb0d8..7c33ed200410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) | |||
126 | 126 | ||
127 | /* Figure out where to put the new node */ | 127 | /* Figure out where to put the new node */ |
128 | while (*new) { | 128 | while (*new) { |
129 | ext = container_of(*new, struct swsusp_extent, node); | 129 | ext = rb_entry(*new, struct swsusp_extent, node); |
130 | parent = *new; | 130 | parent = *new; |
131 | if (swap_offset < ext->start) { | 131 | if (swap_offset < ext->start) { |
132 | /* Try to merge */ | 132 | /* Try to merge */ |
diff --git a/kernel/printk.c b/kernel/printk.c index 2d607f4d1797..19c0d7bcf24a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); | |||
87 | struct console *console_drivers; | 87 | struct console *console_drivers; |
88 | EXPORT_SYMBOL_GPL(console_drivers); | 88 | EXPORT_SYMBOL_GPL(console_drivers); |
89 | 89 | ||
90 | #ifdef CONFIG_LOCKDEP | ||
91 | static struct lockdep_map console_lock_dep_map = { | ||
92 | .name = "console_lock" | ||
93 | }; | ||
94 | #endif | ||
95 | |||
90 | /* | 96 | /* |
91 | * This is used for debugging the mess that is the VT code by | 97 | * This is used for debugging the mess that is the VT code by |
92 | * keeping track if we have the console semaphore held. It's | 98 | * keeping track if we have the console semaphore held. It's |
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early) | |||
741 | free, (free * 100) / __LOG_BUF_LEN); | 747 | free, (free * 100) / __LOG_BUF_LEN); |
742 | } | 748 | } |
743 | 749 | ||
750 | static bool __read_mostly ignore_loglevel; | ||
751 | |||
752 | static int __init ignore_loglevel_setup(char *str) | ||
753 | { | ||
754 | ignore_loglevel = 1; | ||
755 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
756 | |||
757 | return 0; | ||
758 | } | ||
759 | |||
760 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
761 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
762 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
763 | "print all kernel messages to the console."); | ||
764 | |||
744 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 765 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
745 | 766 | ||
746 | static int boot_delay; /* msecs delay after each printk during bootup */ | 767 | static int boot_delay; /* msecs delay after each printk during bootup */ |
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str) | |||
764 | } | 785 | } |
765 | __setup("boot_delay=", boot_delay_setup); | 786 | __setup("boot_delay=", boot_delay_setup); |
766 | 787 | ||
767 | static void boot_delay_msec(void) | 788 | static void boot_delay_msec(int level) |
768 | { | 789 | { |
769 | unsigned long long k; | 790 | unsigned long long k; |
770 | unsigned long timeout; | 791 | unsigned long timeout; |
771 | 792 | ||
772 | if (boot_delay == 0 || system_state != SYSTEM_BOOTING) | 793 | if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) |
794 | || (level >= console_loglevel && !ignore_loglevel)) { | ||
773 | return; | 795 | return; |
796 | } | ||
774 | 797 | ||
775 | k = (unsigned long long)loops_per_msec * boot_delay; | 798 | k = (unsigned long long)loops_per_msec * boot_delay; |
776 | 799 | ||
@@ -789,7 +812,7 @@ static void boot_delay_msec(void) | |||
789 | } | 812 | } |
790 | } | 813 | } |
791 | #else | 814 | #else |
792 | static inline void boot_delay_msec(void) | 815 | static inline void boot_delay_msec(int level) |
793 | { | 816 | { |
794 | } | 817 | } |
795 | #endif | 818 | #endif |
@@ -1232,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
1232 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1255 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
1233 | } | 1256 | } |
1234 | 1257 | ||
1235 | static bool __read_mostly ignore_loglevel; | ||
1236 | |||
1237 | static int __init ignore_loglevel_setup(char *str) | ||
1238 | { | ||
1239 | ignore_loglevel = 1; | ||
1240 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
1241 | |||
1242 | return 0; | ||
1243 | } | ||
1244 | |||
1245 | early_param("ignore_loglevel", ignore_loglevel_setup); | ||
1246 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
1247 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
1248 | "print all kernel messages to the console."); | ||
1249 | |||
1250 | /* | 1258 | /* |
1251 | * Call the console drivers, asking them to write out | 1259 | * Call the console drivers, asking them to write out |
1252 | * log_buf[start] to log_buf[end - 1]. | 1260 | * log_buf[start] to log_buf[end - 1]. |
@@ -1492,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1492 | int this_cpu; | 1500 | int this_cpu; |
1493 | int printed_len = 0; | 1501 | int printed_len = 0; |
1494 | 1502 | ||
1495 | boot_delay_msec(); | 1503 | boot_delay_msec(level); |
1496 | printk_delay(); | 1504 | printk_delay(); |
1497 | 1505 | ||
1498 | /* This stops the holder of console_sem just where we want him */ | 1506 | /* This stops the holder of console_sem just where we want him */ |
@@ -1908,12 +1916,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1908 | */ | 1916 | */ |
1909 | void console_lock(void) | 1917 | void console_lock(void) |
1910 | { | 1918 | { |
1911 | BUG_ON(in_interrupt()); | 1919 | might_sleep(); |
1920 | |||
1912 | down(&console_sem); | 1921 | down(&console_sem); |
1913 | if (console_suspended) | 1922 | if (console_suspended) |
1914 | return; | 1923 | return; |
1915 | console_locked = 1; | 1924 | console_locked = 1; |
1916 | console_may_schedule = 1; | 1925 | console_may_schedule = 1; |
1926 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1917 | } | 1927 | } |
1918 | EXPORT_SYMBOL(console_lock); | 1928 | EXPORT_SYMBOL(console_lock); |
1919 | 1929 | ||
@@ -1935,6 +1945,7 @@ int console_trylock(void) | |||
1935 | } | 1945 | } |
1936 | console_locked = 1; | 1946 | console_locked = 1; |
1937 | console_may_schedule = 0; | 1947 | console_may_schedule = 0; |
1948 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1938 | return 1; | 1949 | return 1; |
1939 | } | 1950 | } |
1940 | EXPORT_SYMBOL(console_trylock); | 1951 | EXPORT_SYMBOL(console_trylock); |
@@ -2095,6 +2106,7 @@ skip: | |||
2095 | local_irq_restore(flags); | 2106 | local_irq_restore(flags); |
2096 | } | 2107 | } |
2097 | console_locked = 0; | 2108 | console_locked = 0; |
2109 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2098 | 2110 | ||
2099 | /* Release the exclusive_console once it is used */ | 2111 | /* Release the exclusive_console once it is used */ |
2100 | if (unlikely(exclusive_console)) | 2112 | if (unlikely(exclusive_console)) |
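Editor's note: three printk changes are bundled here. console_lock() gains a dedicated lockdep_map so ordering problems around the console semaphore become visible to lockdep (semaphores themselves are not tracked), with mutex_acquire()/mutex_release() annotations placed where the lock is taken and released; the hard BUG_ON(in_interrupt()) becomes a might_sleep() check, which (with CONFIG_DEBUG_ATOMIC_SLEEP) flags any atomic-context caller rather than only interrupt context; and ignore_loglevel moves above boot_delay_msec() so the boot delay can now be skipped for messages whose level would be filtered from the console anyway, which is why boot_delay_msec() grows a level argument.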
diff --git a/kernel/profile.c b/kernel/profile.c index 76b8e77773ee..1f391819c42f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -8,9 +8,10 @@ | |||
8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, | 8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, |
9 | * Red Hat, July 2004 | 9 | * Red Hat, July 2004 |
10 | * Consolidation of architecture support code for profiling, | 10 | * Consolidation of architecture support code for profiling, |
11 | * William Irwin, Oracle, July 2004 | 11 | * Nadia Yvette Chambers, Oracle, July 2004 |
12 | * Amortized hit count accounting via per-cpu open-addressed hashtables | 12 | * Amortized hit count accounting via per-cpu open-addressed hashtables |
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, Nadia Yvette Chambers, |
14 | * Oracle, 2004 | ||
14 | */ | 15 | */ |
15 | 16 | ||
16 | #include <linux/export.h> | 17 | #include <linux/export.h> |
@@ -256,7 +257,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook); | |||
256 | * pagetable hash functions, but uses a full hashtable full of finite | 257 | * pagetable hash functions, but uses a full hashtable full of finite |
257 | * collision chains, not just pairs of them. | 258 | * collision chains, not just pairs of them. |
258 | * | 259 | * |
259 | * -- wli | 260 | * -- nyc |
260 | */ | 261 | */ |
261 | static void __profile_flip_buffers(void *unused) | 262 | static void __profile_flip_buffers(void *unused) |
262 | { | 263 | { |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7b09b88862cc..1599157336a6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -463,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer) | |||
463 | return; | 463 | return; |
464 | 464 | ||
465 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 465 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
466 | if (unlikely(p->ptrace & PT_EXITKILL)) | ||
467 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | ||
468 | |||
466 | if (__ptrace_detach(tracer, p)) | 469 | if (__ptrace_detach(tracer, p)) |
467 | list_add(&p->ptrace_entry, &ptrace_dead); | 470 | list_add(&p->ptrace_entry, &ptrace_dead); |
468 | } | 471 | } |
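Editor's note: PT_EXITKILL is the kernel-internal flag behind the PTRACE_O_EXITKILL option added in the same release; a tracee carrying it is sent SIGKILL from exit_ptrace() when its tracer goes away, instead of being silently detached. A minimal userspace sketch of requesting it (error handling omitted; the constant is defined locally in case the libc headers predate it):

    #include <sys/ptrace.h>
    #include <sys/wait.h>
    #include <signal.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef PTRACE_O_EXITKILL
    #define PTRACE_O_EXITKILL 0x00100000    /* value from <linux/ptrace.h>, Linux >= 3.8 */
    #endif

    int main(void)
    {
            pid_t pid = fork();

            if (pid == 0) {                         /* tracee */
                    ptrace(PTRACE_TRACEME, 0, 0, 0);
                    raise(SIGSTOP);                 /* hand control to the tracer */
                    pause();
                    _exit(0);
            }
            waitpid(pid, NULL, 0);                  /* tracee is now stopped */
            /* If this tracer dies for any reason, the kernel SIGKILLs the tracee. */
            ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_EXITKILL);
            printf("tracee %d will not outlive us\n", pid);
            kill(pid, SIGKILL);                     /* tidy up for the example */
            waitpid(pid, NULL, 0);
            return 0;
    }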
diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc6515..20dfba576c2b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | |||
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | extern int rcu_expedited; | ||
113 | |||
112 | #endif /* __LINUX_RCU_H */ | 114 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 29ca1c6da594..a2cf76177b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -46,12 +46,15 @@ | |||
46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
49 | #include <linux/module.h> | ||
49 | 50 | ||
50 | #define CREATE_TRACE_POINTS | 51 | #define CREATE_TRACE_POINTS |
51 | #include <trace/events/rcu.h> | 52 | #include <trace/events/rcu.h> |
52 | 53 | ||
53 | #include "rcu.h" | 54 | #include "rcu.h" |
54 | 55 | ||
56 | module_param(rcu_expedited, int, 0); | ||
57 | |||
55 | #ifdef CONFIG_PREEMPT_RCU | 58 | #ifdef CONFIG_PREEMPT_RCU |
56 | 59 | ||
57 | /* | 60 | /* |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e4c6a598d6f7..e7dce58f9c2a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); | |||
195 | */ | 195 | */ |
196 | int rcu_is_cpu_rrupt_from_idle(void) | 196 | int rcu_is_cpu_rrupt_from_idle(void) |
197 | { | 197 | { |
198 | return rcu_dynticks_nesting <= 0; | 198 | return rcu_dynticks_nesting <= 1; |
199 | } | 199 | } |
200 | 200 | ||
201 | /* | 201 | /* |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3d0190282204..f85016a2309b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -706,7 +706,10 @@ void synchronize_rcu(void) | |||
706 | return; | 706 | return; |
707 | 707 | ||
708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | 708 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ |
709 | rcu_barrier(); | 709 | if (rcu_expedited) |
710 | synchronize_rcu_expedited(); | ||
711 | else | ||
712 | rcu_barrier(); | ||
710 | } | 713 | } |
711 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 714 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
712 | 715 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a..31dea01c85fd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title) | |||
339 | 339 | ||
340 | struct rcu_torture_ops { | 340 | struct rcu_torture_ops { |
341 | void (*init)(void); | 341 | void (*init)(void); |
342 | void (*cleanup)(void); | ||
343 | int (*readlock)(void); | 342 | int (*readlock)(void); |
344 | void (*read_delay)(struct rcu_random_state *rrsp); | 343 | void (*read_delay)(struct rcu_random_state *rrsp); |
345 | void (*readunlock)(int idx); | 344 | void (*readunlock)(int idx); |
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
431 | 430 | ||
432 | static struct rcu_torture_ops rcu_ops = { | 431 | static struct rcu_torture_ops rcu_ops = { |
433 | .init = NULL, | 432 | .init = NULL, |
434 | .cleanup = NULL, | ||
435 | .readlock = rcu_torture_read_lock, | 433 | .readlock = rcu_torture_read_lock, |
436 | .read_delay = rcu_read_delay, | 434 | .read_delay = rcu_read_delay, |
437 | .readunlock = rcu_torture_read_unlock, | 435 | .readunlock = rcu_torture_read_unlock, |
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void) | |||
475 | 473 | ||
476 | static struct rcu_torture_ops rcu_sync_ops = { | 474 | static struct rcu_torture_ops rcu_sync_ops = { |
477 | .init = rcu_sync_torture_init, | 475 | .init = rcu_sync_torture_init, |
478 | .cleanup = NULL, | ||
479 | .readlock = rcu_torture_read_lock, | 476 | .readlock = rcu_torture_read_lock, |
480 | .read_delay = rcu_read_delay, | 477 | .read_delay = rcu_read_delay, |
481 | .readunlock = rcu_torture_read_unlock, | 478 | .readunlock = rcu_torture_read_unlock, |
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
493 | 490 | ||
494 | static struct rcu_torture_ops rcu_expedited_ops = { | 491 | static struct rcu_torture_ops rcu_expedited_ops = { |
495 | .init = rcu_sync_torture_init, | 492 | .init = rcu_sync_torture_init, |
496 | .cleanup = NULL, | ||
497 | .readlock = rcu_torture_read_lock, | 493 | .readlock = rcu_torture_read_lock, |
498 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 494 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
499 | .readunlock = rcu_torture_read_unlock, | 495 | .readunlock = rcu_torture_read_unlock, |
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
536 | 532 | ||
537 | static struct rcu_torture_ops rcu_bh_ops = { | 533 | static struct rcu_torture_ops rcu_bh_ops = { |
538 | .init = NULL, | 534 | .init = NULL, |
539 | .cleanup = NULL, | ||
540 | .readlock = rcu_bh_torture_read_lock, | 535 | .readlock = rcu_bh_torture_read_lock, |
541 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 536 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
542 | .readunlock = rcu_bh_torture_read_unlock, | 537 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
553 | 548 | ||
554 | static struct rcu_torture_ops rcu_bh_sync_ops = { | 549 | static struct rcu_torture_ops rcu_bh_sync_ops = { |
555 | .init = rcu_sync_torture_init, | 550 | .init = rcu_sync_torture_init, |
556 | .cleanup = NULL, | ||
557 | .readlock = rcu_bh_torture_read_lock, | 551 | .readlock = rcu_bh_torture_read_lock, |
558 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 552 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
559 | .readunlock = rcu_bh_torture_read_unlock, | 553 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
570 | 564 | ||
571 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | 565 | static struct rcu_torture_ops rcu_bh_expedited_ops = { |
572 | .init = rcu_sync_torture_init, | 566 | .init = rcu_sync_torture_init, |
573 | .cleanup = NULL, | ||
574 | .readlock = rcu_bh_torture_read_lock, | 567 | .readlock = rcu_bh_torture_read_lock, |
575 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 568 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
576 | .readunlock = rcu_bh_torture_read_unlock, | 569 | .readunlock = rcu_bh_torture_read_unlock, |
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
589 | * Definitions for srcu torture testing. | 582 | * Definitions for srcu torture testing. |
590 | */ | 583 | */ |
591 | 584 | ||
592 | static struct srcu_struct srcu_ctl; | 585 | DEFINE_STATIC_SRCU(srcu_ctl); |
593 | |||
594 | static void srcu_torture_init(void) | ||
595 | { | ||
596 | init_srcu_struct(&srcu_ctl); | ||
597 | rcu_sync_torture_init(); | ||
598 | } | ||
599 | |||
600 | static void srcu_torture_cleanup(void) | ||
601 | { | ||
602 | synchronize_srcu(&srcu_ctl); | ||
603 | cleanup_srcu_struct(&srcu_ctl); | ||
604 | } | ||
605 | 586 | ||
606 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) | 587 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
607 | { | 588 | { |
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page) | |||
672 | } | 653 | } |
673 | 654 | ||
674 | static struct rcu_torture_ops srcu_ops = { | 655 | static struct rcu_torture_ops srcu_ops = { |
675 | .init = srcu_torture_init, | 656 | .init = rcu_sync_torture_init, |
676 | .cleanup = srcu_torture_cleanup, | ||
677 | .readlock = srcu_torture_read_lock, | 657 | .readlock = srcu_torture_read_lock, |
678 | .read_delay = srcu_read_delay, | 658 | .read_delay = srcu_read_delay, |
679 | .readunlock = srcu_torture_read_unlock, | 659 | .readunlock = srcu_torture_read_unlock, |
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { | |||
687 | }; | 667 | }; |
688 | 668 | ||
689 | static struct rcu_torture_ops srcu_sync_ops = { | 669 | static struct rcu_torture_ops srcu_sync_ops = { |
690 | .init = srcu_torture_init, | 670 | .init = rcu_sync_torture_init, |
691 | .cleanup = srcu_torture_cleanup, | ||
692 | .readlock = srcu_torture_read_lock, | 671 | .readlock = srcu_torture_read_lock, |
693 | .read_delay = srcu_read_delay, | 672 | .read_delay = srcu_read_delay, |
694 | .readunlock = srcu_torture_read_unlock, | 673 | .readunlock = srcu_torture_read_unlock, |
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | |||
712 | } | 691 | } |
713 | 692 | ||
714 | static struct rcu_torture_ops srcu_raw_ops = { | 693 | static struct rcu_torture_ops srcu_raw_ops = { |
715 | .init = srcu_torture_init, | 694 | .init = rcu_sync_torture_init, |
716 | .cleanup = srcu_torture_cleanup, | ||
717 | .readlock = srcu_torture_read_lock_raw, | 695 | .readlock = srcu_torture_read_lock_raw, |
718 | .read_delay = srcu_read_delay, | 696 | .read_delay = srcu_read_delay, |
719 | .readunlock = srcu_torture_read_unlock_raw, | 697 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
727 | }; | 705 | }; |
728 | 706 | ||
729 | static struct rcu_torture_ops srcu_raw_sync_ops = { | 707 | static struct rcu_torture_ops srcu_raw_sync_ops = { |
730 | .init = srcu_torture_init, | 708 | .init = rcu_sync_torture_init, |
731 | .cleanup = srcu_torture_cleanup, | ||
732 | .readlock = srcu_torture_read_lock_raw, | 709 | .readlock = srcu_torture_read_lock_raw, |
733 | .read_delay = srcu_read_delay, | 710 | .read_delay = srcu_read_delay, |
734 | .readunlock = srcu_torture_read_unlock_raw, | 711 | .readunlock = srcu_torture_read_unlock_raw, |
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) | |||
747 | } | 724 | } |
748 | 725 | ||
749 | static struct rcu_torture_ops srcu_expedited_ops = { | 726 | static struct rcu_torture_ops srcu_expedited_ops = { |
750 | .init = srcu_torture_init, | 727 | .init = rcu_sync_torture_init, |
751 | .cleanup = srcu_torture_cleanup, | ||
752 | .readlock = srcu_torture_read_lock, | 728 | .readlock = srcu_torture_read_lock, |
753 | .read_delay = srcu_read_delay, | 729 | .read_delay = srcu_read_delay, |
754 | .readunlock = srcu_torture_read_unlock, | 730 | .readunlock = srcu_torture_read_unlock, |
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
783 | 759 | ||
784 | static struct rcu_torture_ops sched_ops = { | 760 | static struct rcu_torture_ops sched_ops = { |
785 | .init = rcu_sync_torture_init, | 761 | .init = rcu_sync_torture_init, |
786 | .cleanup = NULL, | ||
787 | .readlock = sched_torture_read_lock, | 762 | .readlock = sched_torture_read_lock, |
788 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 763 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
789 | .readunlock = sched_torture_read_unlock, | 764 | .readunlock = sched_torture_read_unlock, |
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { | |||
799 | 774 | ||
800 | static struct rcu_torture_ops sched_sync_ops = { | 775 | static struct rcu_torture_ops sched_sync_ops = { |
801 | .init = rcu_sync_torture_init, | 776 | .init = rcu_sync_torture_init, |
802 | .cleanup = NULL, | ||
803 | .readlock = sched_torture_read_lock, | 777 | .readlock = sched_torture_read_lock, |
804 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 778 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
805 | .readunlock = sched_torture_read_unlock, | 779 | .readunlock = sched_torture_read_unlock, |
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
814 | 788 | ||
815 | static struct rcu_torture_ops sched_expedited_ops = { | 789 | static struct rcu_torture_ops sched_expedited_ops = { |
816 | .init = rcu_sync_torture_init, | 790 | .init = rcu_sync_torture_init, |
817 | .cleanup = NULL, | ||
818 | .readlock = sched_torture_read_lock, | 791 | .readlock = sched_torture_read_lock, |
819 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 792 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
820 | .readunlock = sched_torture_read_unlock, | 793 | .readunlock = sched_torture_read_unlock, |
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1396 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1369 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1397 | "test_boost=%d/%d test_boost_interval=%d " | 1370 | "test_boost=%d/%d test_boost_interval=%d " |
1398 | "test_boost_duration=%d shutdown_secs=%d " | 1371 | "test_boost_duration=%d shutdown_secs=%d " |
1372 | "stall_cpu=%d stall_cpu_holdoff=%d " | ||
1373 | "n_barrier_cbs=%d " | ||
1399 | "onoff_interval=%d onoff_holdoff=%d\n", | 1374 | "onoff_interval=%d onoff_holdoff=%d\n", |
1400 | torture_type, tag, nrealreaders, nfakewriters, | 1375 | torture_type, tag, nrealreaders, nfakewriters, |
1401 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1376 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1402 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1377 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1403 | test_boost, cur_ops->can_boost, | 1378 | test_boost, cur_ops->can_boost, |
1404 | test_boost_interval, test_boost_duration, shutdown_secs, | 1379 | test_boost_interval, test_boost_duration, shutdown_secs, |
1380 | stall_cpu, stall_cpu_holdoff, | ||
1381 | n_barrier_cbs, | ||
1405 | onoff_interval, onoff_holdoff); | 1382 | onoff_interval, onoff_holdoff); |
1406 | } | 1383 | } |
1407 | 1384 | ||
@@ -1502,6 +1479,7 @@ rcu_torture_onoff(void *arg) | |||
1502 | unsigned long delta; | 1479 | unsigned long delta; |
1503 | int maxcpu = -1; | 1480 | int maxcpu = -1; |
1504 | DEFINE_RCU_RANDOM(rand); | 1481 | DEFINE_RCU_RANDOM(rand); |
1482 | int ret; | ||
1505 | unsigned long starttime; | 1483 | unsigned long starttime; |
1506 | 1484 | ||
1507 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | 1485 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); |
@@ -1522,7 +1500,13 @@ rcu_torture_onoff(void *arg) | |||
1522 | torture_type, cpu); | 1500 | torture_type, cpu); |
1523 | starttime = jiffies; | 1501 | starttime = jiffies; |
1524 | n_offline_attempts++; | 1502 | n_offline_attempts++; |
1525 | if (cpu_down(cpu) == 0) { | 1503 | ret = cpu_down(cpu); |
1504 | if (ret) { | ||
1505 | if (verbose) | ||
1506 | pr_alert("%s" TORTURE_FLAG | ||
1507 | "rcu_torture_onoff task: offline %d failed: errno %d\n", | ||
1508 | torture_type, cpu, ret); | ||
1509 | } else { | ||
1526 | if (verbose) | 1510 | if (verbose) |
1527 | pr_alert("%s" TORTURE_FLAG | 1511 | pr_alert("%s" TORTURE_FLAG |
1528 | "rcu_torture_onoff task: offlined %d\n", | 1512 | "rcu_torture_onoff task: offlined %d\n", |
@@ -1936,8 +1920,6 @@ rcu_torture_cleanup(void) | |||
1936 | 1920 | ||
1937 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 1921 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
1938 | 1922 | ||
1939 | if (cur_ops->cleanup) | ||
1940 | cur_ops->cleanup(); | ||
1941 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) | 1923 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1942 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1924 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1943 | else if (n_online_successes != n_online_attempts || | 1925 | else if (n_online_successes != n_online_attempts || |
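Editor's note: the srcu torture variants no longer need init/cleanup hooks because DEFINE_STATIC_SRCU() yields an srcu_struct that is valid from the start, which in turn lets the ->cleanup member disappear from rcu_torture_ops entirely. As a reminder of the API this relies on, a sketch with an illustrative domain name (not code from this patch):

    DEFINE_STATIC_SRCU(my_srcu);            /* no init_srcu_struct()/cleanup needed */

    int idx = srcu_read_lock(&my_srcu);     /* reader side */
    /* ... access SRCU-protected data ... */
    srcu_read_unlock(&my_srcu, idx);

    synchronize_srcu(&my_srcu);             /* updater waits for readers */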
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..e441b77b614e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
68 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 69 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 70 | .fqs_state = RCU_GP_IDLE, \ |
71 | .gpnum = -300, \ | 71 | .gpnum = 0UL - 300UL, \ |
72 | .completed = -300, \ | 72 | .completed = 0UL - 300UL, \ |
73 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ | 73 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
@@ -207,18 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); | |||
207 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 207 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
208 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 208 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
209 | .dynticks = ATOMIC_INIT(1), | 209 | .dynticks = ATOMIC_INIT(1), |
210 | #if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) | ||
211 | .ignore_user_qs = true, | ||
212 | #endif | ||
213 | }; | 210 | }; |
214 | 211 | ||
215 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 212 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
216 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 213 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
217 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 214 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
218 | 215 | ||
219 | module_param(blimit, int, 0444); | 216 | module_param(blimit, long, 0444); |
220 | module_param(qhimark, int, 0444); | 217 | module_param(qhimark, long, 0444); |
221 | module_param(qlowmark, int, 0444); | 218 | module_param(qlowmark, long, 0444); |
222 | 219 | ||
223 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 220 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
224 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 221 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
@@ -303,7 +300,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | |||
303 | static int | 300 | static int |
304 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | 301 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) |
305 | { | 302 | { |
306 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; | 303 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && |
304 | rdp->nxttail[RCU_DONE_TAIL] != NULL; | ||
307 | } | 305 | } |
308 | 306 | ||
309 | /* | 307 | /* |
@@ -312,8 +310,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
312 | static int | 310 | static int |
313 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 311 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
314 | { | 312 | { |
315 | return *rdp->nxttail[RCU_DONE_TAIL + | 313 | struct rcu_head **ntp; |
316 | ACCESS_ONCE(rsp->completed) != rdp->completed] && | 314 | |
315 | ntp = rdp->nxttail[RCU_DONE_TAIL + | ||
316 | (ACCESS_ONCE(rsp->completed) != rdp->completed)]; | ||
317 | return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && | ||
317 | !rcu_gp_in_progress(rsp); | 318 | !rcu_gp_in_progress(rsp); |
318 | } | 319 | } |
319 | 320 | ||
@@ -416,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
416 | */ | 417 | */ |
417 | void rcu_user_enter(void) | 418 | void rcu_user_enter(void) |
418 | { | 419 | { |
419 | unsigned long flags; | 420 | rcu_eqs_enter(1); |
420 | struct rcu_dynticks *rdtp; | ||
421 | |||
422 | /* | ||
423 | * Some contexts may involve an exception occuring in an irq, | ||
424 | * leading to that nesting: | ||
425 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
426 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
427 | * helpers are enough to protect RCU uses inside the exception. So | ||
428 | * just return immediately if we detect we are in an IRQ. | ||
429 | */ | ||
430 | if (in_interrupt()) | ||
431 | return; | ||
432 | |||
433 | WARN_ON_ONCE(!current->mm); | ||
434 | |||
435 | local_irq_save(flags); | ||
436 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
437 | if (!rdtp->ignore_user_qs && !rdtp->in_user) { | ||
438 | rdtp->in_user = true; | ||
439 | rcu_eqs_enter(true); | ||
440 | } | ||
441 | local_irq_restore(flags); | ||
442 | } | 421 | } |
443 | 422 | ||
444 | /** | 423 | /** |
@@ -575,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); | |||
575 | */ | 554 | */ |
576 | void rcu_user_exit(void) | 555 | void rcu_user_exit(void) |
577 | { | 556 | { |
578 | unsigned long flags; | 557 | rcu_eqs_exit(1); |
579 | struct rcu_dynticks *rdtp; | ||
580 | |||
581 | /* | ||
582 | * Some contexts may involve an exception occuring in an irq, | ||
583 | * leading to that nesting: | ||
584 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
585 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
586 | * helpers are enough to protect RCU uses inside the exception. So | ||
587 | * just return immediately if we detect we are in an IRQ. | ||
588 | */ | ||
589 | if (in_interrupt()) | ||
590 | return; | ||
591 | |||
592 | local_irq_save(flags); | ||
593 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
594 | if (rdtp->in_user) { | ||
595 | rdtp->in_user = false; | ||
596 | rcu_eqs_exit(true); | ||
597 | } | ||
598 | local_irq_restore(flags); | ||
599 | } | 558 | } |
600 | 559 | ||
601 | /** | 560 | /** |
@@ -718,21 +677,6 @@ int rcu_is_cpu_idle(void) | |||
718 | } | 677 | } |
719 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 678 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
720 | 679 | ||
721 | #ifdef CONFIG_RCU_USER_QS | ||
722 | void rcu_user_hooks_switch(struct task_struct *prev, | ||
723 | struct task_struct *next) | ||
724 | { | ||
725 | struct rcu_dynticks *rdtp; | ||
726 | |||
727 | /* Interrupts are disabled in context switch */ | ||
728 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
729 | if (!rdtp->ignore_user_qs) { | ||
730 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
731 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
732 | } | ||
733 | } | ||
734 | #endif /* #ifdef CONFIG_RCU_USER_QS */ | ||
735 | |||
736 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 680 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
737 | 681 | ||
738 | /* | 682 | /* |
@@ -873,6 +817,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
873 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); | 817 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); |
874 | } | 818 | } |
875 | 819 | ||
820 | /* | ||
821 | * Dump stacks of all tasks running on stalled CPUs. This is a fallback | ||
822 | * for architectures that do not implement trigger_all_cpu_backtrace(). | ||
823 | * The NMI-triggered stack traces are more accurate because they are | ||
824 | * printed by the target CPU. | ||
825 | */ | ||
826 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | ||
827 | { | ||
828 | int cpu; | ||
829 | unsigned long flags; | ||
830 | struct rcu_node *rnp; | ||
831 | |||
832 | rcu_for_each_leaf_node(rsp, rnp) { | ||
833 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
834 | if (rnp->qsmask != 0) { | ||
835 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
836 | if (rnp->qsmask & (1UL << cpu)) | ||
837 | dump_cpu_task(rnp->grplo + cpu); | ||
838 | } | ||
839 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
840 | } | ||
841 | } | ||
842 | |||
876 | static void print_other_cpu_stall(struct rcu_state *rsp) | 843 | static void print_other_cpu_stall(struct rcu_state *rsp) |
877 | { | 844 | { |
878 | int cpu; | 845 | int cpu; |
@@ -880,6 +847,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
880 | unsigned long flags; | 847 | unsigned long flags; |
881 | int ndetected = 0; | 848 | int ndetected = 0; |
882 | struct rcu_node *rnp = rcu_get_root(rsp); | 849 | struct rcu_node *rnp = rcu_get_root(rsp); |
850 | long totqlen = 0; | ||
883 | 851 | ||
884 | /* Only let one CPU complain about others per time interval. */ | 852 | /* Only let one CPU complain about others per time interval. */ |
885 | 853 | ||
@@ -924,12 +892,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
924 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 892 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
925 | 893 | ||
926 | print_cpu_stall_info_end(); | 894 | print_cpu_stall_info_end(); |
927 | printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", | 895 | for_each_possible_cpu(cpu) |
928 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 896 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
897 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | ||
898 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | ||
899 | rsp->gpnum, rsp->completed, totqlen); | ||
929 | if (ndetected == 0) | 900 | if (ndetected == 0) |
930 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | 901 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); |
931 | else if (!trigger_all_cpu_backtrace()) | 902 | else if (!trigger_all_cpu_backtrace()) |
932 | dump_stack(); | 903 | rcu_dump_cpu_stacks(rsp); |
933 | 904 | ||
934 | /* Complain about tasks blocking the grace period. */ | 905 | /* Complain about tasks blocking the grace period. */ |
935 | 906 | ||
@@ -940,8 +911,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
940 | 911 | ||
941 | static void print_cpu_stall(struct rcu_state *rsp) | 912 | static void print_cpu_stall(struct rcu_state *rsp) |
942 | { | 913 | { |
914 | int cpu; | ||
943 | unsigned long flags; | 915 | unsigned long flags; |
944 | struct rcu_node *rnp = rcu_get_root(rsp); | 916 | struct rcu_node *rnp = rcu_get_root(rsp); |
917 | long totqlen = 0; | ||
945 | 918 | ||
946 | /* | 919 | /* |
947 | * OK, time to rat on ourselves... | 920 | * OK, time to rat on ourselves... |
@@ -952,7 +925,10 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
952 | print_cpu_stall_info_begin(); | 925 | print_cpu_stall_info_begin(); |
953 | print_cpu_stall_info(rsp, smp_processor_id()); | 926 | print_cpu_stall_info(rsp, smp_processor_id()); |
954 | print_cpu_stall_info_end(); | 927 | print_cpu_stall_info_end(); |
955 | printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); | 928 | for_each_possible_cpu(cpu) |
929 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | ||
930 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | ||
931 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | ||
956 | if (!trigger_all_cpu_backtrace()) | 932 | if (!trigger_all_cpu_backtrace()) |
957 | dump_stack(); | 933 | dump_stack(); |
958 | 934 | ||
@@ -1091,6 +1067,7 @@ static void init_callback_list(struct rcu_data *rdp) | |||
1091 | rdp->nxtlist = NULL; | 1067 | rdp->nxtlist = NULL; |
1092 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1068 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1093 | rdp->nxttail[i] = &rdp->nxtlist; | 1069 | rdp->nxttail[i] = &rdp->nxtlist; |
1070 | init_nocb_callback_list(rdp); | ||
1094 | } | 1071 | } |
1095 | 1072 | ||
1096 | /* | 1073 | /* |
@@ -1404,15 +1381,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
1404 | !cpu_needs_another_gp(rsp, rdp)) { | 1381 | !cpu_needs_another_gp(rsp, rdp)) { |
1405 | /* | 1382 | /* |
1406 | * Either we have not yet spawned the grace-period | 1383 | * Either we have not yet spawned the grace-period |
1407 | * task or this CPU does not need another grace period. | 1384 | * task, this CPU does not need another grace period, |
1385 | * or a grace period is already in progress. | ||
1408 | * Either way, don't start a new grace period. | 1386 | * Either way, don't start a new grace period. |
1409 | */ | 1387 | */ |
1410 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1388 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1411 | return; | 1389 | return; |
1412 | } | 1390 | } |
1413 | 1391 | ||
1392 | /* | ||
1393 | * Because there is no grace period in progress right now, | ||
1394 | * any callbacks we have up to this point will be satisfied | ||
1395 | * by the next grace period. So promote all callbacks to be | ||
1396 | * handled after the end of the next grace period. If the | ||
1397 | * CPU is not yet aware of the end of the previous grace period, | ||
1398 | * we need to allow for the callback advancement that will | ||
1399 | * occur when it does become aware. Deadlock prevents us from | ||
1400 | * making it aware at this point: We cannot acquire a leaf | ||
1401 | * rcu_node ->lock while holding the root rcu_node ->lock. | ||
1402 | */ | ||
1403 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1404 | if (rdp->completed == rsp->completed) | ||
1405 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
1406 | |||
1414 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1407 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1415 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1408 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ |
1409 | |||
1410 | /* Ensure that CPU is aware of completion of last grace period. */ | ||
1411 | rcu_process_gp_end(rsp, rdp); | ||
1412 | local_irq_restore(flags); | ||
1413 | |||
1414 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
1416 | wake_up(&rsp->gp_wq); | 1415 | wake_up(&rsp->gp_wq); |
1417 | } | 1416 | } |
1418 | 1417 | ||
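
The promotion step added above is nothing more than tail-pointer bookkeeping: the per-CPU callback list is one singly linked list carved into segments by an array of tail pointers, so "promote every queued callback to wait for the next grace period" is just two pointer assignments. A minimal userspace sketch of that idea (not kernel code; the segment names mirror RCU_WAIT_TAIL and friends, everything else is illustrative):

        #include <stdio.h>
        #include <stddef.h>

        struct cb { struct cb *next; int id; };

        #define WAIT_TAIL       0       /* CBs waiting for the current GP */
        #define NEXT_READY_TAIL 1       /* CBs waiting for the next GP */
        #define NEXT_TAIL       2       /* newly arrived CBs */
        #define NSEG            3

        struct cpu_cbs {
                struct cb *head;
                struct cb **tail[NSEG]; /* tail[i]: where segment i ends */
        };

        /* Mirrors the two assignments added to rcu_start_gp() above. */
        static void promote_all(struct cpu_cbs *c)
        {
                c->tail[NEXT_READY_TAIL] = c->tail[NEXT_TAIL];
                c->tail[WAIT_TAIL] = c->tail[NEXT_TAIL];
        }

        int main(void)
        {
                struct cb a = { NULL, 1 }, b = { NULL, 2 };
                struct cpu_cbs c = { NULL, { &c.head, &c.head, &c.head } };

                /* Enqueue two callbacks into the NEXT segment. */
                *c.tail[NEXT_TAIL] = &a; c.tail[NEXT_TAIL] = &a.next;
                *c.tail[NEXT_TAIL] = &b; c.tail[NEXT_TAIL] = &b.next;

                promote_all(&c);
                printf("all callbacks now in the WAIT segment: %d\n",
                       c.tail[WAIT_TAIL] == c.tail[NEXT_TAIL]);
                return 0;
        }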
@@ -1573,16 +1572,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1573 | /* | 1572 | /* |
1574 | * Send the specified CPU's RCU callbacks to the orphanage. The | 1573 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1575 | * specified CPU must be offline, and the caller must hold the | 1574 | * specified CPU must be offline, and the caller must hold the |
1576 | * ->onofflock. | 1575 | * ->orphan_lock. |
1577 | */ | 1576 | */ |
1578 | static void | 1577 | static void |
1579 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 1578 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
1580 | struct rcu_node *rnp, struct rcu_data *rdp) | 1579 | struct rcu_node *rnp, struct rcu_data *rdp) |
1581 | { | 1580 | { |
1581 | /* No-CBs CPUs do not have orphanable callbacks. */ | ||
1582 | if (is_nocb_cpu(rdp->cpu)) | ||
1583 | return; | ||
1584 | |||
1582 | /* | 1585 | /* |
1583 | * Orphan the callbacks. First adjust the counts. This is safe | 1586 | * Orphan the callbacks. First adjust the counts. This is safe |
1584 | * because ->onofflock excludes _rcu_barrier()'s adoption of | 1587 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
1585 | * the callbacks, thus no memory barrier is required. | 1588 | * cannot be running now. Thus no memory barrier is required. |
1586 | */ | 1589 | */ |
1587 | if (rdp->nxtlist != NULL) { | 1590 | if (rdp->nxtlist != NULL) { |
1588 | rsp->qlen_lazy += rdp->qlen_lazy; | 1591 | rsp->qlen_lazy += rdp->qlen_lazy; |
@@ -1623,13 +1626,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1623 | 1626 | ||
1624 | /* | 1627 | /* |
1625 | * Adopt the RCU callbacks from the specified rcu_state structure's | 1628 | * Adopt the RCU callbacks from the specified rcu_state structure's |
1626 | * orphanage. The caller must hold the ->onofflock. | 1629 | * orphanage. The caller must hold the ->orphan_lock. |
1627 | */ | 1630 | */ |
1628 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | 1631 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) |
1629 | { | 1632 | { |
1630 | int i; | 1633 | int i; |
1631 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 1634 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
1632 | 1635 | ||
1636 | /* No-CBs CPUs are handled specially. */ | ||
1637 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) | ||
1638 | return; | ||
1639 | |||
1633 | /* Do the accounting first. */ | 1640 | /* Do the accounting first. */ |
1634 | rdp->qlen_lazy += rsp->qlen_lazy; | 1641 | rdp->qlen_lazy += rsp->qlen_lazy; |
1635 | rdp->qlen += rsp->qlen; | 1642 | rdp->qlen += rsp->qlen; |
@@ -1702,7 +1709,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1702 | 1709 | ||
1703 | /* Exclude any attempts to start a new grace period. */ | 1710 | /* Exclude any attempts to start a new grace period. */ |
1704 | mutex_lock(&rsp->onoff_mutex); | 1711 | mutex_lock(&rsp->onoff_mutex); |
1705 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1712 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); |
1706 | 1713 | ||
1707 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 1714 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
1708 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 1715 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
@@ -1729,10 +1736,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1729 | /* | 1736 | /* |
1730 | * We still hold the leaf rcu_node structure lock here, and | 1737 | * We still hold the leaf rcu_node structure lock here, and |
1731 | * irqs are still disabled. The reason for this subterfuge is | 1738 | * irqs are still disabled. The reason for this subterfuge is |
1732 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1739 | * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock |
1733 | * held leads to deadlock. | 1740 | * held leads to deadlock. |
1734 | */ | 1741 | */ |
1735 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1742 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ |
1736 | rnp = rdp->mynode; | 1743 | rnp = rdp->mynode; |
1737 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1744 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
1738 | rcu_report_unblock_qs_rnp(rnp, flags); | 1745 | rcu_report_unblock_qs_rnp(rnp, flags); |
@@ -1769,7 +1776,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1769 | { | 1776 | { |
1770 | unsigned long flags; | 1777 | unsigned long flags; |
1771 | struct rcu_head *next, *list, **tail; | 1778 | struct rcu_head *next, *list, **tail; |
1772 | int bl, count, count_lazy, i; | 1779 | long bl, count, count_lazy; |
1780 | int i; | ||
1773 | 1781 | ||
1774 | /* If no callbacks are ready, just return.*/ | 1782 | /* If no callbacks are ready, just return.*/ |
1775 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1783 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -2107,9 +2115,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2107 | } | 2115 | } |
2108 | } | 2116 | } |
2109 | 2117 | ||
2118 | /* | ||
2119 | * Helper function for call_rcu() and friends. The cpu argument will | ||
2120 | * normally be -1, indicating "currently running CPU". It may specify | ||
2121 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | ||
2122 | * is expected to specify a CPU. | ||
2123 | */ | ||
2110 | static void | 2124 | static void |
2111 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 2125 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
2112 | struct rcu_state *rsp, bool lazy) | 2126 | struct rcu_state *rsp, int cpu, bool lazy) |
2113 | { | 2127 | { |
2114 | unsigned long flags; | 2128 | unsigned long flags; |
2115 | struct rcu_data *rdp; | 2129 | struct rcu_data *rdp; |
@@ -2129,9 +2143,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2129 | rdp = this_cpu_ptr(rsp->rda); | 2143 | rdp = this_cpu_ptr(rsp->rda); |
2130 | 2144 | ||
2131 | /* Add the callback to our list. */ | 2145 | /* Add the callback to our list. */ |
2132 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { | 2146 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { |
2147 | int offline; | ||
2148 | |||
2149 | if (cpu != -1) | ||
2150 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2151 | offline = !__call_rcu_nocb(rdp, head, lazy); | ||
2152 | WARN_ON_ONCE(offline); | ||
2133 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2153 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ |
2134 | WARN_ON_ONCE(1); | ||
2135 | local_irq_restore(flags); | 2154 | local_irq_restore(flags); |
2136 | return; | 2155 | return; |
2137 | } | 2156 | } |
@@ -2160,7 +2179,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2160 | */ | 2179 | */ |
2161 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2180 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
2162 | { | 2181 | { |
2163 | __call_rcu(head, func, &rcu_sched_state, 0); | 2182 | __call_rcu(head, func, &rcu_sched_state, -1, 0); |
2164 | } | 2183 | } |
2165 | EXPORT_SYMBOL_GPL(call_rcu_sched); | 2184 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
2166 | 2185 | ||
@@ -2169,7 +2188,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
2169 | */ | 2188 | */ |
2170 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 2189 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
2171 | { | 2190 | { |
2172 | __call_rcu(head, func, &rcu_bh_state, 0); | 2191 | __call_rcu(head, func, &rcu_bh_state, -1, 0); |
2173 | } | 2192 | } |
2174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 2193 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
2175 | 2194 | ||
@@ -2205,10 +2224,28 @@ static inline int rcu_blocking_is_gp(void) | |||
2205 | * rcu_read_lock_sched(). | 2224 | * rcu_read_lock_sched(). |
2206 | * | 2225 | * |
2207 | * This means that all preempt_disable code sequences, including NMI and | 2226 | * This means that all preempt_disable code sequences, including NMI and |
2208 | * hardware-interrupt handlers, in progress on entry will have completed | 2227 | * non-threaded hardware-interrupt handlers, in progress on entry will |
2209 | * before this primitive returns. However, this does not guarantee that | 2228 | * have completed before this primitive returns. However, this does not |
2210 | * softirq handlers will have completed, since in some kernels, these | 2229 | * guarantee that softirq handlers will have completed, since in some |
2211 | * handlers can run in process context, and can block. | 2230 | * kernels, these handlers can run in process context, and can block. |
2231 | * | ||
2232 | * Note that this guarantee implies further memory-ordering guarantees. | ||
2233 | * On systems with more than one CPU, when synchronize_sched() returns, | ||
2234 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
2235 | * end of its last RCU-sched read-side critical section whose beginning | ||
2236 | * preceded the call to synchronize_sched(). In addition, each CPU having | ||
2237 | * an RCU read-side critical section that extends beyond the return from | ||
2238 | * synchronize_sched() is guaranteed to have executed a full memory barrier | ||
2239 | * after the beginning of synchronize_sched() and before the beginning of | ||
2240 | * that RCU read-side critical section. Note that these guarantees include | ||
2241 | * CPUs that are offline, idle, or executing in user mode, as well as CPUs | ||
2242 | * that are executing in the kernel. | ||
2243 | * | ||
2244 | * Furthermore, if CPU A invoked synchronize_sched(), which returned | ||
2245 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
2246 | * to have executed a full memory barrier during the execution of | ||
2247 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but | ||
2248 | * again only if the system has more than one CPU). | ||
2212 | * | 2249 | * |
2213 | * This primitive provides the guarantees made by the (now removed) | 2250 | * This primitive provides the guarantees made by the (now removed) |
2214 | * synchronize_kernel() API. In contrast, synchronize_rcu() only | 2251 | * synchronize_kernel() API. In contrast, synchronize_rcu() only |
@@ -2224,7 +2261,10 @@ void synchronize_sched(void) | |||
2224 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 2261 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); |
2225 | if (rcu_blocking_is_gp()) | 2262 | if (rcu_blocking_is_gp()) |
2226 | return; | 2263 | return; |
2227 | wait_rcu_gp(call_rcu_sched); | 2264 | if (rcu_expedited) |
2265 | synchronize_sched_expedited(); | ||
2266 | else | ||
2267 | wait_rcu_gp(call_rcu_sched); | ||
2228 | } | 2268 | } |
2229 | EXPORT_SYMBOL_GPL(synchronize_sched); | 2269 | EXPORT_SYMBOL_GPL(synchronize_sched); |
2230 | 2270 | ||
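
For readers unfamiliar with how the memory-ordering guarantee documented above is normally consumed, here is a hedged kernel-style sketch (not part of this patch; struct foo, global_foo and both functions are made-up names): readers run under rcu_read_lock_sched(), and the updater may free the old version only after synchronize_sched() returns.

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int a;
        };

        static struct foo __rcu *global_foo;

        int reader(void)
        {
                struct foo *p;
                int val = -1;

                rcu_read_lock_sched();          /* disables preemption */
                p = rcu_dereference_sched(global_foo);
                if (p)
                        val = p->a;
                rcu_read_unlock_sched();
                return val;
        }

        void updater(struct foo *newp)
        {
                struct foo *oldp = rcu_dereference_protected(global_foo, 1);

                rcu_assign_pointer(global_foo, newp);
                synchronize_sched();    /* all pre-existing readers done... */
                kfree(oldp);            /* ...so the old version may be freed */
        }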
@@ -2236,6 +2276,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
2236 | * read-side critical sections have completed. RCU read-side critical | 2276 | * read-side critical sections have completed. RCU read-side critical |
2237 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), | 2277 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), |
2238 | * and may be nested. | 2278 | * and may be nested. |
2279 | * | ||
2280 | * See the description of synchronize_sched() for more detailed information | ||
2281 | * on memory ordering guarantees. | ||
2239 | */ | 2282 | */ |
2240 | void synchronize_rcu_bh(void) | 2283 | void synchronize_rcu_bh(void) |
2241 | { | 2284 | { |
@@ -2245,13 +2288,13 @@ void synchronize_rcu_bh(void) | |||
2245 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 2288 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); |
2246 | if (rcu_blocking_is_gp()) | 2289 | if (rcu_blocking_is_gp()) |
2247 | return; | 2290 | return; |
2248 | wait_rcu_gp(call_rcu_bh); | 2291 | if (rcu_expedited) |
2292 | synchronize_rcu_bh_expedited(); | ||
2293 | else | ||
2294 | wait_rcu_gp(call_rcu_bh); | ||
2249 | } | 2295 | } |
2250 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 2296 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
2251 | 2297 | ||
2252 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
2253 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
2254 | |||
2255 | static int synchronize_sched_expedited_cpu_stop(void *data) | 2298 | static int synchronize_sched_expedited_cpu_stop(void *data) |
2256 | { | 2299 | { |
2257 | /* | 2300 | /* |
@@ -2308,10 +2351,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
2308 | */ | 2351 | */ |
2309 | void synchronize_sched_expedited(void) | 2352 | void synchronize_sched_expedited(void) |
2310 | { | 2353 | { |
2311 | int firstsnap, s, snap, trycount = 0; | 2354 | long firstsnap, s, snap; |
2355 | int trycount = 0; | ||
2356 | struct rcu_state *rsp = &rcu_sched_state; | ||
2357 | |||
2358 | /* | ||
2359 | * If we are in danger of counter wrap, just do synchronize_sched(). | ||
2360 | * By allowing ->expedited_start to advance no more than | ||
2361 | * ULONG_MAX/8 ahead of ->expedited_done, we are ensuring | ||
2362 | * that more than 3.5 billion CPUs would be required to force a | ||
2363 | * counter wrap on a 32-bit system. Quite a few more CPUs would of | ||
2364 | * course be required on a 64-bit system. | ||
2365 | */ | ||
2366 | if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), | ||
2367 | (ulong)atomic_long_read(&rsp->expedited_done) + | ||
2368 | ULONG_MAX / 8)) { | ||
2369 | synchronize_sched(); | ||
2370 | atomic_long_inc(&rsp->expedited_wrap); | ||
2371 | return; | ||
2372 | } | ||
2312 | 2373 | ||
2313 | /* Note that atomic_inc_return() implies full memory barrier. */ | 2374 | /* |
2314 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | 2375 | * Take a ticket. Note that atomic_inc_return() implies a |
2376 | * full memory barrier. | ||
2377 | */ | ||
2378 | snap = atomic_long_inc_return(&rsp->expedited_start); | ||
2379 | firstsnap = snap; | ||
2315 | get_online_cpus(); | 2380 | get_online_cpus(); |
2316 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2381 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
2317 | 2382 | ||
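
The wrap check above leans on ULONG_CMP_GE(), whose definition in include/linux/rcupdate.h of this era is essentially the modular comparison reproduced below. A small userspace demonstration (not kernel code) of why the ticket counters tolerate wrapping as long as start never runs more than ULONG_MAX/8 ahead of done:

        #include <stdio.h>
        #include <limits.h>

        /* As in include/linux/rcupdate.h: wrap-tolerant "a is at or ahead of b". */
        #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

        int main(void)
        {
                unsigned long done  = ULONG_MAX - 2;    /* about to wrap */
                unsigned long start = done + 10;        /* has wrapped past zero */

                /* start is still "ahead of" done despite the numeric wrap. */
                printf("start ahead of done: %d\n", ULONG_CMP_GE(start, done));

                /* The guard above falls back to synchronize_sched() only when
                   start gets ULONG_MAX/8 or more ahead of done. */
                printf("start too far ahead: %d\n",
                       ULONG_CMP_GE(start, done + ULONG_MAX / 8));
                return 0;
        }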
@@ -2323,48 +2388,65 @@ void synchronize_sched_expedited(void) | |||
2323 | synchronize_sched_expedited_cpu_stop, | 2388 | synchronize_sched_expedited_cpu_stop, |
2324 | NULL) == -EAGAIN) { | 2389 | NULL) == -EAGAIN) { |
2325 | put_online_cpus(); | 2390 | put_online_cpus(); |
2391 | atomic_long_inc(&rsp->expedited_tryfail); | ||
2392 | |||
2393 | /* Check to see if someone else did our work for us. */ | ||
2394 | s = atomic_long_read(&rsp->expedited_done); | ||
2395 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
2396 | /* ensure test happens before caller kfree */ | ||
2397 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2398 | atomic_long_inc(&rsp->expedited_workdone1); | ||
2399 | return; | ||
2400 | } | ||
2326 | 2401 | ||
2327 | /* No joy, try again later. Or just synchronize_sched(). */ | 2402 | /* No joy, try again later. Or just synchronize_sched(). */ |
2328 | if (trycount++ < 10) { | 2403 | if (trycount++ < 10) { |
2329 | udelay(trycount * num_online_cpus()); | 2404 | udelay(trycount * num_online_cpus()); |
2330 | } else { | 2405 | } else { |
2331 | synchronize_sched(); | 2406 | wait_rcu_gp(call_rcu_sched); |
2407 | atomic_long_inc(&rsp->expedited_normal); | ||
2332 | return; | 2408 | return; |
2333 | } | 2409 | } |
2334 | 2410 | ||
2335 | /* Check to see if someone else did our work for us. */ | 2411 | /* Recheck to see if someone else did our work for us. */ |
2336 | s = atomic_read(&sync_sched_expedited_done); | 2412 | s = atomic_long_read(&rsp->expedited_done); |
2337 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | 2413 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
2338 | smp_mb(); /* ensure test happens before caller kfree */ | 2414 | /* ensure test happens before caller kfree */ |
2415 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2416 | atomic_long_inc(&rsp->expedited_workdone2); | ||
2339 | return; | 2417 | return; |
2340 | } | 2418 | } |
2341 | 2419 | ||
2342 | /* | 2420 | /* |
2343 | * Refetching sync_sched_expedited_started allows later | 2421 | * Refetching sync_sched_expedited_started allows later |
2344 | * callers to piggyback on our grace period. We subtract | 2422 | * callers to piggyback on our grace period. We retry |
2345 | * 1 to get the same token that the last incrementer got. | 2423 | * after they started, so our grace period works for them, |
2346 | * We retry after they started, so our grace period works | 2424 | * and they started after our first try, so their grace |
2347 | * for them, and they started after our first try, so their | 2425 | * period works for us. |
2348 | * grace period works for us. | ||
2349 | */ | 2426 | */ |
2350 | get_online_cpus(); | 2427 | get_online_cpus(); |
2351 | snap = atomic_read(&sync_sched_expedited_started); | 2428 | snap = atomic_long_read(&rsp->expedited_start); |
2352 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 2429 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
2353 | } | 2430 | } |
2431 | atomic_long_inc(&rsp->expedited_stoppedcpus); | ||
2354 | 2432 | ||
2355 | /* | 2433 | /* |
2356 | * Everyone up to our most recent fetch is covered by our grace | 2434 | * Everyone up to our most recent fetch is covered by our grace |
2357 | * period. Update the counter, but only if our work is still | 2435 | * period. Update the counter, but only if our work is still |
2358 | * relevant -- which it won't be if someone who started later | 2436 | * relevant -- which it won't be if someone who started later |
2359 | * than we did beat us to the punch. | 2437 | * than we did already did their update. |
2360 | */ | 2438 | */ |
2361 | do { | 2439 | do { |
2362 | s = atomic_read(&sync_sched_expedited_done); | 2440 | atomic_long_inc(&rsp->expedited_done_tries); |
2363 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | 2441 | s = atomic_long_read(&rsp->expedited_done); |
2364 | smp_mb(); /* ensure test happens before caller kfree */ | 2442 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { |
2443 | /* ensure test happens before caller kfree */ | ||
2444 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
2445 | atomic_long_inc(&rsp->expedited_done_lost); | ||
2365 | break; | 2446 | break; |
2366 | } | 2447 | } |
2367 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | 2448 | } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); |
2449 | atomic_long_inc(&rsp->expedited_done_exit); | ||
2368 | 2450 | ||
2369 | put_online_cpus(); | 2451 | put_online_cpus(); |
2370 | } | 2452 | } |
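
The loop above implements a ticket scheme: ->expedited_start hands out tickets, ->expedited_done records the newest ticket whose grace period has finished, and any waiter whose snapshot is already covered by done can return without doing the expensive work itself. A single-threaded userspace C11 model of that bookkeeping (not kernel code; the kernel compares with ULONG_CMP_GE() to tolerate wrap, and do_expensive_grace_period() stands in for try_stop_cpus()):

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_long expedited_start;     /* tickets handed out */
        static atomic_long expedited_done;      /* newest completed ticket */

        static void do_expensive_grace_period(void) { /* placeholder */ }

        static void expedited_wait(void)
        {
                long snap = atomic_fetch_add(&expedited_start, 1) + 1; /* take ticket */
                long s;

                /* Did a later caller's grace period already cover us? */
                if (atomic_load(&expedited_done) >= snap)
                        return;                         /* piggyback */

                do_expensive_grace_period();

                /* Advance done to our ticket unless someone newer already did. */
                s = atomic_load(&expedited_done);
                while (s < snap &&
                       !atomic_compare_exchange_weak(&expedited_done, &s, snap))
                        ;       /* s is reloaded by the failed compare-exchange */
        }

        int main(void)
        {
                expedited_wait();
                expedited_wait();
                printf("start=%ld done=%ld\n",
                       atomic_load(&expedited_start), atomic_load(&expedited_done));
                return 0;
        }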
@@ -2558,9 +2640,17 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2558 | * When that callback is invoked, we will know that all of the | 2640 | * When that callback is invoked, we will know that all of the |
2559 | * corresponding CPU's preceding callbacks have been invoked. | 2641 | * corresponding CPU's preceding callbacks have been invoked. |
2560 | */ | 2642 | */ |
2561 | for_each_online_cpu(cpu) { | 2643 | for_each_possible_cpu(cpu) { |
2644 | if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) | ||
2645 | continue; | ||
2562 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2646 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2563 | if (ACCESS_ONCE(rdp->qlen)) { | 2647 | if (is_nocb_cpu(cpu)) { |
2648 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | ||
2649 | rsp->n_barrier_done); | ||
2650 | atomic_inc(&rsp->barrier_cpu_count); | ||
2651 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, | ||
2652 | rsp, cpu, 0); | ||
2653 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2564 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 2654 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
2565 | rsp->n_barrier_done); | 2655 | rsp->n_barrier_done); |
2566 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 2656 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
@@ -2634,6 +2724,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
2634 | #endif | 2724 | #endif |
2635 | rdp->cpu = cpu; | 2725 | rdp->cpu = cpu; |
2636 | rdp->rsp = rsp; | 2726 | rdp->rsp = rsp; |
2727 | rcu_boot_init_nocb_percpu_data(rdp); | ||
2637 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2728 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2638 | } | 2729 | } |
2639 | 2730 | ||
@@ -2715,6 +2806,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2715 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2806 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2716 | struct rcu_node *rnp = rdp->mynode; | 2807 | struct rcu_node *rnp = rdp->mynode; |
2717 | struct rcu_state *rsp; | 2808 | struct rcu_state *rsp; |
2809 | int ret = NOTIFY_OK; | ||
2718 | 2810 | ||
2719 | trace_rcu_utilization("Start CPU hotplug"); | 2811 | trace_rcu_utilization("Start CPU hotplug"); |
2720 | switch (action) { | 2812 | switch (action) { |
@@ -2728,7 +2820,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2728 | rcu_boost_kthread_setaffinity(rnp, -1); | 2820 | rcu_boost_kthread_setaffinity(rnp, -1); |
2729 | break; | 2821 | break; |
2730 | case CPU_DOWN_PREPARE: | 2822 | case CPU_DOWN_PREPARE: |
2731 | rcu_boost_kthread_setaffinity(rnp, cpu); | 2823 | if (nocb_cpu_expendable(cpu)) |
2824 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
2825 | else | ||
2826 | ret = NOTIFY_BAD; | ||
2732 | break; | 2827 | break; |
2733 | case CPU_DYING: | 2828 | case CPU_DYING: |
2734 | case CPU_DYING_FROZEN: | 2829 | case CPU_DYING_FROZEN: |
@@ -2752,7 +2847,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2752 | break; | 2847 | break; |
2753 | } | 2848 | } |
2754 | trace_rcu_utilization("End CPU hotplug"); | 2849 | trace_rcu_utilization("End CPU hotplug"); |
2755 | return NOTIFY_OK; | 2850 | return ret; |
2756 | } | 2851 | } |
2757 | 2852 | ||
2758 | /* | 2853 | /* |
@@ -2772,6 +2867,7 @@ static int __init rcu_spawn_gp_kthread(void) | |||
2772 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2867 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2773 | rsp->gp_kthread = t; | 2868 | rsp->gp_kthread = t; |
2774 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2869 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2870 | rcu_spawn_nocb_kthreads(rsp); | ||
2775 | } | 2871 | } |
2776 | return 0; | 2872 | return 0; |
2777 | } | 2873 | } |
@@ -2967,6 +3063,7 @@ void __init rcu_init(void) | |||
2967 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 3063 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
2968 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3064 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
2969 | __rcu_init_preempt(); | 3065 | __rcu_init_preempt(); |
3066 | rcu_init_nocb(); | ||
2970 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3067 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
2971 | 3068 | ||
2972 | /* | 3069 | /* |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a240f032848e..4b69291b093d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -287,6 +287,7 @@ struct rcu_data { | |||
287 | long qlen_last_fqs_check; | 287 | long qlen_last_fqs_check; |
288 | /* qlen at last check for QS forcing */ | 288 | /* qlen at last check for QS forcing */ |
289 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 289 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
290 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
290 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | 291 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
291 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | 292 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
292 | unsigned long n_force_qs_snap; | 293 | unsigned long n_force_qs_snap; |
@@ -317,6 +318,18 @@ struct rcu_data { | |||
317 | struct rcu_head oom_head; | 318 | struct rcu_head oom_head; |
318 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 319 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
319 | 320 | ||
321 | /* 7) Callback offloading. */ | ||
322 | #ifdef CONFIG_RCU_NOCB_CPU | ||
323 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | ||
324 | struct rcu_head **nocb_tail; | ||
325 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | ||
326 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | ||
327 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
328 | int nocb_p_count_lazy; /* (approximate). */ | ||
329 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | ||
330 | struct task_struct *nocb_kthread; | ||
331 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
332 | |||
320 | int cpu; | 333 | int cpu; |
321 | struct rcu_state *rsp; | 334 | struct rcu_state *rsp; |
322 | }; | 335 | }; |
@@ -369,6 +382,12 @@ struct rcu_state { | |||
369 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 382 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
370 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 383 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
371 | void (*func)(struct rcu_head *head)); | 384 | void (*func)(struct rcu_head *head)); |
385 | #ifdef CONFIG_RCU_NOCB_CPU | ||
386 | void (*call_remote)(struct rcu_head *head, | ||
387 | void (*func)(struct rcu_head *head)); | ||
388 | /* call_rcu() flavor, but for */ | ||
389 | /* placing on remote CPU. */ | ||
390 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
372 | 391 | ||
373 | /* The following fields are guarded by the root rcu_node's lock. */ | 392 | /* The following fields are guarded by the root rcu_node's lock. */ |
374 | 393 | ||
@@ -383,9 +402,8 @@ struct rcu_state { | |||
383 | 402 | ||
384 | /* End of fields guarded by root rcu_node's lock. */ | 403 | /* End of fields guarded by root rcu_node's lock. */ |
385 | 404 | ||
386 | raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; | 405 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
387 | /* exclude on/offline and */ | 406 | /* Protect following fields. */ |
388 | /* starting new GP. */ | ||
389 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 407 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ |
390 | /* need a grace period. */ | 408 | /* need a grace period. */ |
391 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 409 | struct rcu_head **orphan_nxttail; /* Tail of above. */ |
@@ -394,7 +412,7 @@ struct rcu_state { | |||
394 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 412 | struct rcu_head **orphan_donetail; /* Tail of above. */ |
395 | long qlen_lazy; /* Number of lazy callbacks. */ | 413 | long qlen_lazy; /* Number of lazy callbacks. */ |
396 | long qlen; /* Total number of callbacks. */ | 414 | long qlen; /* Total number of callbacks. */ |
397 | /* End of fields guarded by onofflock. */ | 415 | /* End of fields guarded by orphan_lock. */ |
398 | 416 | ||
399 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | 417 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ |
400 | 418 | ||
@@ -405,6 +423,18 @@ struct rcu_state { | |||
405 | /* _rcu_barrier(). */ | 423 | /* _rcu_barrier(). */ |
406 | /* End of fields guarded by barrier_mutex. */ | 424 | /* End of fields guarded by barrier_mutex. */ |
407 | 425 | ||
426 | atomic_long_t expedited_start; /* Starting ticket. */ | ||
427 | atomic_long_t expedited_done; /* Done ticket. */ | ||
428 | atomic_long_t expedited_wrap; /* # near-wrap incidents. */ | ||
429 | atomic_long_t expedited_tryfail; /* # acquisition failures. */ | ||
430 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
431 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
432 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | ||
433 | atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ | ||
434 | atomic_long_t expedited_done_tries; /* # tries to update _done. */ | ||
435 | atomic_long_t expedited_done_lost; /* # times beaten to _done. */ | ||
436 | atomic_long_t expedited_done_exit; /* # times exited _done loop. */ | ||
437 | |||
408 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 438 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
409 | /* force_quiescent_state(). */ | 439 | /* force_quiescent_state(). */ |
410 | unsigned long n_force_qs; /* Number of calls to */ | 440 | unsigned long n_force_qs; /* Number of calls to */ |
@@ -428,6 +458,8 @@ struct rcu_state { | |||
428 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | 458 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ |
429 | 459 | ||
430 | extern struct list_head rcu_struct_flavors; | 460 | extern struct list_head rcu_struct_flavors; |
461 | |||
462 | /* Sequence through rcu_state structures for each RCU flavor. */ | ||
431 | #define for_each_rcu_flavor(rsp) \ | 463 | #define for_each_rcu_flavor(rsp) \ |
432 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 464 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
433 | 465 | ||
@@ -504,5 +536,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
504 | static void print_cpu_stall_info_end(void); | 536 | static void print_cpu_stall_info_end(void); |
505 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 537 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
506 | static void increment_cpu_stall_ticks(void); | 538 | static void increment_cpu_stall_ticks(void); |
539 | static bool is_nocb_cpu(int cpu); | ||
540 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
541 | bool lazy); | ||
542 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
543 | struct rcu_data *rdp); | ||
544 | static bool nocb_cpu_expendable(int cpu); | ||
545 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | ||
546 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | ||
547 | static void init_nocb_callback_list(struct rcu_data *rdp); | ||
548 | static void __init rcu_init_nocb(void); | ||
507 | 549 | ||
508 | #endif /* #ifndef RCU_TREE_NONCORE */ | 550 | #endif /* #ifndef RCU_TREE_NONCORE */ |
551 | |||
552 | #ifdef CONFIG_RCU_TRACE | ||
553 | #ifdef CONFIG_RCU_NOCB_CPU | ||
554 | /* Sum up queue lengths for tracing. */ | ||
555 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
556 | { | ||
557 | *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; | ||
558 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; | ||
559 | } | ||
560 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
561 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
562 | { | ||
563 | *ql = 0; | ||
564 | *qll = 0; | ||
565 | } | ||
566 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
567 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f92115488187..f6e5ec2932b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/gfp.h> | ||
28 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
29 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
30 | 31 | ||
@@ -36,6 +37,14 @@ | |||
36 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 37 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO |
37 | #endif | 38 | #endif |
38 | 39 | ||
40 | #ifdef CONFIG_RCU_NOCB_CPU | ||
41 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | ||
42 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | ||
43 | static bool rcu_nocb_poll; /* Offload kthreads are to poll. */ | ||
44 | module_param(rcu_nocb_poll, bool, 0444); | ||
45 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
47 | |||
39 | /* | 48 | /* |
40 | * Check the RCU kernel configuration parameters and print informative | 49 | * Check the RCU kernel configuration parameters and print informative |
41 | * messages about anything out of the ordinary. If you like #ifdef, you | 50 | * messages about anything out of the ordinary. If you like #ifdef, you |
@@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) | |||
76 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 85 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
77 | if (nr_cpu_ids != NR_CPUS) | 86 | if (nr_cpu_ids != NR_CPUS) |
78 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 87 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
88 | #ifdef CONFIG_RCU_NOCB_CPU | ||
89 | if (have_rcu_nocb_mask) { | ||
90 | if (cpumask_test_cpu(0, rcu_nocb_mask)) { | ||
91 | cpumask_clear_cpu(0, rcu_nocb_mask); | ||
92 | pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); | ||
93 | } | ||
94 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
95 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | ||
96 | if (rcu_nocb_poll) | ||
97 | pr_info("\tExperimental polled no-CBs CPUs.\n"); | ||
98 | } | ||
99 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
79 | } | 100 | } |
80 | 101 | ||
81 | #ifdef CONFIG_TREE_PREEMPT_RCU | 102 | #ifdef CONFIG_TREE_PREEMPT_RCU |
@@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) | |||
642 | */ | 663 | */ |
643 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 664 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
644 | { | 665 | { |
645 | __call_rcu(head, func, &rcu_preempt_state, 0); | 666 | __call_rcu(head, func, &rcu_preempt_state, -1, 0); |
646 | } | 667 | } |
647 | EXPORT_SYMBOL_GPL(call_rcu); | 668 | EXPORT_SYMBOL_GPL(call_rcu); |
648 | 669 | ||
@@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
656 | void kfree_call_rcu(struct rcu_head *head, | 677 | void kfree_call_rcu(struct rcu_head *head, |
657 | void (*func)(struct rcu_head *rcu)) | 678 | void (*func)(struct rcu_head *rcu)) |
658 | { | 679 | { |
659 | __call_rcu(head, func, &rcu_preempt_state, 1); | 680 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); |
660 | } | 681 | } |
661 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 682 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
662 | 683 | ||
@@ -670,6 +691,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); | |||
670 | * concurrently with new RCU read-side critical sections that began while | 691 | * concurrently with new RCU read-side critical sections that began while |
671 | * synchronize_rcu() was waiting. RCU read-side critical sections are | 692 | * synchronize_rcu() was waiting. RCU read-side critical sections are |
672 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | 693 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. |
694 | * | ||
695 | * See the description of synchronize_sched() for more detailed information | ||
696 | * on memory ordering guarantees. | ||
673 | */ | 697 | */ |
674 | void synchronize_rcu(void) | 698 | void synchronize_rcu(void) |
675 | { | 699 | { |
@@ -679,7 +703,10 @@ void synchronize_rcu(void) | |||
679 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 703 | "Illegal synchronize_rcu() in RCU read-side critical section"); |
680 | if (!rcu_scheduler_active) | 704 | if (!rcu_scheduler_active) |
681 | return; | 705 | return; |
682 | wait_rcu_gp(call_rcu); | 706 | if (rcu_expedited) |
707 | synchronize_rcu_expedited(); | ||
708 | else | ||
709 | wait_rcu_gp(call_rcu); | ||
683 | } | 710 | } |
684 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 711 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
685 | 712 | ||
@@ -757,7 +784,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
757 | * grace period for the specified rcu_node structure. If there are no such | 784 | * grace period for the specified rcu_node structure. If there are no such |
758 | * tasks, report it up the rcu_node hierarchy. | 785 | * tasks, report it up the rcu_node hierarchy. |
759 | * | 786 | * |
760 | * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. | 787 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude |
788 | * CPU hotplug operations. | ||
761 | */ | 789 | */ |
762 | static void | 790 | static void |
763 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 791 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
@@ -831,7 +859,7 @@ void synchronize_rcu_expedited(void) | |||
831 | udelay(trycount * num_online_cpus()); | 859 | udelay(trycount * num_online_cpus()); |
832 | } else { | 860 | } else { |
833 | put_online_cpus(); | 861 | put_online_cpus(); |
834 | synchronize_rcu(); | 862 | wait_rcu_gp(call_rcu); |
835 | return; | 863 | return; |
836 | } | 864 | } |
837 | } | 865 | } |
@@ -875,6 +903,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
875 | 903 | ||
876 | /** | 904 | /** |
877 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 905 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
906 | * | ||
907 | * Note that this primitive does not necessarily wait for an RCU grace period | ||
908 | * to complete. For example, if there are no RCU callbacks queued anywhere | ||
909 | * in the system, then rcu_barrier() is within its rights to return | ||
910 | * immediately, without waiting for anything, much less an RCU grace period. | ||
878 | */ | 911 | */ |
879 | void rcu_barrier(void) | 912 | void rcu_barrier(void) |
880 | { | 913 | { |
@@ -1013,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
1013 | void kfree_call_rcu(struct rcu_head *head, | 1046 | void kfree_call_rcu(struct rcu_head *head, |
1014 | void (*func)(struct rcu_head *rcu)) | 1047 | void (*func)(struct rcu_head *rcu)) |
1015 | { | 1048 | { |
1016 | __call_rcu(head, func, &rcu_sched_state, 1); | 1049 | __call_rcu(head, func, &rcu_sched_state, -1, 1); |
1017 | } | 1050 | } |
1018 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 1051 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
1019 | 1052 | ||
@@ -2092,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) | |||
2092 | } | 2125 | } |
2093 | 2126 | ||
2094 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 2127 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
2128 | |||
2129 | #ifdef CONFIG_RCU_NOCB_CPU | ||
2130 | |||
2131 | /* | ||
2132 | * Offload callback processing from the boot-time-specified set of CPUs | ||
2133 | * specified by rcu_nocb_mask. For each CPU in the set, there is a | ||
2134 | * kthread created that pulls the callbacks from the corresponding CPU, | ||
2135 | * waits for a grace period to elapse, and invokes the callbacks. | ||
2136 | * The no-CBs CPUs do a wake_up() on their kthread when they insert | ||
2137 | * a callback into any empty list, unless the rcu_nocb_poll boot parameter | ||
2138 | * has been specified, in which case each kthread actively polls its | ||
2139 | * CPU. (Which isn't so great for energy efficiency, but which does | ||
2140 | * reduce RCU's overhead on that CPU.) | ||
2141 | * | ||
2142 | * This is intended to be used in conjunction with Frederic Weisbecker's | ||
2143 | * adaptive-idle work, which would seriously reduce OS jitter on CPUs | ||
2144 | * running CPU-bound user-mode computations. | ||
2145 | * | ||
2146 | * Offloading of callback processing could also in theory be used as | ||
2147 | * an energy-efficiency measure because CPUs with no RCU callbacks | ||
2148 | * queued are more aggressive about entering dyntick-idle mode. | ||
2149 | */ | ||
2150 | |||
2151 | |||
2152 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | ||
2153 | static int __init rcu_nocb_setup(char *str) | ||
2154 | { | ||
2155 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
2156 | have_rcu_nocb_mask = true; | ||
2157 | cpulist_parse(str, rcu_nocb_mask); | ||
2158 | return 1; | ||
2159 | } | ||
2160 | __setup("rcu_nocbs=", rcu_nocb_setup); | ||
2161 | |||
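
The rcu_nocbs= parameter registered above takes a CPU list such as "1-3,7" on the kernel command line; cpulist_parse() turns it into the rcu_nocb_mask cpumask. A rough userspace model of that parsing (not kernel code; a 64-bit bitmask stands in for a cpumask_var_t):

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        static unsigned long parse_cpulist(const char *str)
        {
                unsigned long mask = 0;
                char *dup = strdup(str), *tok, *save = NULL;

                for (tok = strtok_r(dup, ",", &save); tok;
                     tok = strtok_r(NULL, ",", &save)) {
                        char *dash = strchr(tok, '-');
                        int lo = atoi(tok);
                        int hi = dash ? atoi(dash + 1) : lo;

                        for (int cpu = lo; cpu <= hi && cpu < 64; cpu++)
                                mask |= 1UL << cpu;
                }
                free(dup);
                return mask;
        }

        int main(void)
        {
                /* prints 0x8e: CPUs 1, 2, 3 and 7 */
                printf("rcu_nocbs=1-3,7 -> mask %#lx\n", parse_cpulist("1-3,7"));
                return 0;
        }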
2162 | /* Is the specified CPU a no-CBs CPU? */ | ||
2163 | static bool is_nocb_cpu(int cpu) | ||
2164 | { | ||
2165 | if (have_rcu_nocb_mask) | ||
2166 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | ||
2167 | return false; | ||
2168 | } | ||
2169 | |||
2170 | /* | ||
2171 | * Enqueue the specified string of rcu_head structures onto the specified | ||
2172 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | ||
2173 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
2174 | * counts are supplied by rhcount and rhcount_lazy. | ||
2175 | * | ||
2176 | * If warranted, also wake up the kthread servicing this CPU's queues. | ||
2177 | */ | ||
2178 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | ||
2179 | struct rcu_head *rhp, | ||
2180 | struct rcu_head **rhtp, | ||
2181 | int rhcount, int rhcount_lazy) | ||
2182 | { | ||
2183 | int len; | ||
2184 | struct rcu_head **old_rhpp; | ||
2185 | struct task_struct *t; | ||
2186 | |||
2187 | /* Enqueue the callback on the nocb list and update counts. */ | ||
2188 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
2189 | ACCESS_ONCE(*old_rhpp) = rhp; | ||
2190 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
2191 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
2192 | |||
2193 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
2194 | t = ACCESS_ONCE(rdp->nocb_kthread); | ||
2195 | if (rcu_nocb_poll | !t) | ||
2196 | return; | ||
2197 | len = atomic_long_read(&rdp->nocb_q_count); | ||
2198 | if (old_rhpp == &rdp->nocb_head) { | ||
2199 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | ||
2200 | rdp->qlen_last_fqs_check = 0; | ||
2201 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | ||
2202 | wake_up_process(t); /* ... or if many callbacks queued. */ | ||
2203 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | ||
2204 | } | ||
2205 | return; | ||
2206 | } | ||
2207 | |||
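
The enqueue above is a lock-free multi-producer append: xchg() swings the shared tail to the new callback's ->next first, and only then is the old tail's ->next filled in, so a consumer walking the list may briefly see a NULL link before the chain is complete (the kthread further down copes with exactly that). A userspace C11 sketch of the same two-step append (not kernel code; counts and wakeups omitted):

        #include <stdatomic.h>
        #include <stddef.h>
        #include <stdio.h>

        struct cb { struct cb *next; int id; };

        struct nocb_queue {
                struct cb *head;
                struct cb ** _Atomic tail;      /* last ->next pointer, or &head */
        };

        static void nocb_enqueue(struct nocb_queue *q, struct cb *rhp)
        {
                struct cb **old_tail;

                rhp->next = NULL;
                /* Step 1: claim our slot by swinging the tail. */
                old_tail = atomic_exchange(&q->tail, &rhp->next);
                /* Step 2: publish the link; until now the list looks truncated. */
                *old_tail = rhp;                /* kernel wraps this in ACCESS_ONCE() */
        }

        int main(void)
        {
                struct cb a = { NULL, 1 }, b = { NULL, 2 };
                struct nocb_queue q = { NULL, &q.head };

                nocb_enqueue(&q, &a);
                nocb_enqueue(&q, &b);
                printf("first queued callback: %d\n", q.head->id);
                return 0;
        }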
2208 | /* | ||
2209 | * This is a helper for __call_rcu(), which invokes this when the normal | ||
2210 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
2211 | * function returns failure back to __call_rcu(), which can complain | ||
2212 | * appropriately. | ||
2213 | * | ||
2214 | * Otherwise, this function queues the callback where the corresponding | ||
2215 | * "rcuo" kthread can find it. | ||
2216 | */ | ||
2217 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2218 | bool lazy) | ||
2219 | { | ||
2220 | |||
2221 | if (!is_nocb_cpu(rdp->cpu)) | ||
2222 | return 0; | ||
2223 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | ||
2224 | return 1; | ||
2225 | } | ||
2226 | |||
2227 | /* | ||
2228 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
2229 | * not a no-CBs CPU. | ||
2230 | */ | ||
2231 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2232 | struct rcu_data *rdp) | ||
2233 | { | ||
2234 | long ql = rsp->qlen; | ||
2235 | long qll = rsp->qlen_lazy; | ||
2236 | |||
2237 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | ||
2238 | if (!is_nocb_cpu(smp_processor_id())) | ||
2239 | return 0; | ||
2240 | rsp->qlen = 0; | ||
2241 | rsp->qlen_lazy = 0; | ||
2242 | |||
2243 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | ||
2244 | if (rsp->orphan_donelist != NULL) { | ||
2245 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | ||
2246 | rsp->orphan_donetail, ql, qll); | ||
2247 | ql = qll = 0; | ||
2248 | rsp->orphan_donelist = NULL; | ||
2249 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2250 | } | ||
2251 | if (rsp->orphan_nxtlist != NULL) { | ||
2252 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | ||
2253 | rsp->orphan_nxttail, ql, qll); | ||
2254 | ql = qll = 0; | ||
2255 | rsp->orphan_nxtlist = NULL; | ||
2256 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2257 | } | ||
2258 | return 1; | ||
2259 | } | ||
2260 | |||
2261 | /* | ||
2262 | * There must be at least one non-no-CBs CPU in operation at any given | ||
2263 | * time, because no-CBs CPUs are not capable of initiating grace periods | ||
2264 | * independently. This function therefore complains if the specified | ||
2265 | * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to | ||
2266 | * avoid offlining the last such CPU. (Recursion is a wonderful thing, | ||
2267 | * but you have to have a base case!) | ||
2268 | */ | ||
2269 | static bool nocb_cpu_expendable(int cpu) | ||
2270 | { | ||
2271 | cpumask_var_t non_nocb_cpus; | ||
2272 | int ret; | ||
2273 | |||
2274 | /* | ||
2275 | * If there are no no-CBs CPUs or if this CPU is not a no-CBs CPU, | ||
2276 | * then offlining this CPU is harmless. Let it happen. | ||
2277 | */ | ||
2278 | if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) | ||
2279 | return 1; | ||
2280 | |||
2281 | /* If no memory, play it safe and keep the CPU around. */ | ||
2282 | if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) | ||
2283 | return 0; | ||
2284 | cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); | ||
2285 | cpumask_clear_cpu(cpu, non_nocb_cpus); | ||
2286 | ret = !cpumask_empty(non_nocb_cpus); | ||
2287 | free_cpumask_var(non_nocb_cpus); | ||
2288 | return ret; | ||
2289 | } | ||
2290 | |||
2291 | /* | ||
2292 | * Helper structure for remote registry of RCU callbacks. | ||
2293 | * This is needed for when a no-CBs CPU needs to start a grace period. | ||
2294 | * If it just invokes call_rcu(), the resulting callback will be queued, | ||
2295 | * which can result in deadlock. | ||
2296 | */ | ||
2297 | struct rcu_head_remote { | ||
2298 | struct rcu_head *rhp; | ||
2299 | call_rcu_func_t *crf; | ||
2300 | void (*func)(struct rcu_head *rhp); | ||
2301 | }; | ||
2302 | |||
2303 | /* | ||
2304 | * Register a callback as specified by the rcu_head_remote struct. | ||
2305 | * This function is intended to be invoked via smp_call_function_single(). | ||
2306 | */ | ||
2307 | static void call_rcu_local(void *arg) | ||
2308 | { | ||
2309 | struct rcu_head_remote *rhrp = | ||
2310 | container_of(arg, struct rcu_head_remote, rhp); | ||
2311 | |||
2312 | rhrp->crf(rhrp->rhp, rhrp->func); | ||
2313 | } | ||
2314 | |||
2315 | /* | ||
2316 | * Set up an rcu_head_remote structure and then invoke call_rcu_local() | ||
2317 | * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via | ||
2318 | * smp_call_function_single(). | ||
2319 | */ | ||
2320 | static void invoke_crf_remote(struct rcu_head *rhp, | ||
2321 | void (*func)(struct rcu_head *rhp), | ||
2322 | call_rcu_func_t crf) | ||
2323 | { | ||
2324 | struct rcu_head_remote rhr; | ||
2325 | |||
2326 | rhr.rhp = rhp; | ||
2327 | rhr.crf = crf; | ||
2328 | rhr.func = func; | ||
2329 | smp_call_function_single(0, call_rcu_local, &rhr, 1); | ||
2330 | } | ||
2331 | |||
2332 | /* | ||
2333 | * Helper functions to be passed to wait_rcu_gp(), each of which | ||
2334 | * invokes invoke_crf_remote() to register a callback appropriately. | ||
2335 | */ | ||
2336 | static void __maybe_unused | ||
2337 | call_rcu_preempt_remote(struct rcu_head *rhp, | ||
2338 | void (*func)(struct rcu_head *rhp)) | ||
2339 | { | ||
2340 | invoke_crf_remote(rhp, func, call_rcu); | ||
2341 | } | ||
2342 | static void call_rcu_bh_remote(struct rcu_head *rhp, | ||
2343 | void (*func)(struct rcu_head *rhp)) | ||
2344 | { | ||
2345 | invoke_crf_remote(rhp, func, call_rcu_bh); | ||
2346 | } | ||
2347 | static void call_rcu_sched_remote(struct rcu_head *rhp, | ||
2348 | void (*func)(struct rcu_head *rhp)) | ||
2349 | { | ||
2350 | invoke_crf_remote(rhp, func, call_rcu_sched); | ||
2351 | } | ||
2352 | |||
2353 | /* | ||
2354 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | ||
2355 | * callbacks queued by the corresponding no-CBs CPU. | ||
2356 | */ | ||
2357 | static int rcu_nocb_kthread(void *arg) | ||
2358 | { | ||
2359 | int c, cl; | ||
2360 | struct rcu_head *list; | ||
2361 | struct rcu_head *next; | ||
2362 | struct rcu_head **tail; | ||
2363 | struct rcu_data *rdp = arg; | ||
2364 | |||
2365 | /* Each pass through this loop invokes one batch of callbacks */ | ||
2366 | for (;;) { | ||
2367 | /* If not polling, wait for next batch of callbacks. */ | ||
2368 | if (!rcu_nocb_poll) | ||
2369 | wait_event(rdp->nocb_wq, rdp->nocb_head); | ||
2370 | list = ACCESS_ONCE(rdp->nocb_head); | ||
2371 | if (!list) { | ||
2372 | schedule_timeout_interruptible(1); | ||
2373 | continue; | ||
2374 | } | ||
2375 | |||
2376 | /* | ||
2377 | * Extract queued callbacks, update counts, and wait | ||
2378 | * for a grace period to elapse. | ||
2379 | */ | ||
2380 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2381 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2382 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2383 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2384 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
2385 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
2386 | wait_rcu_gp(rdp->rsp->call_remote); | ||
2387 | |||
2388 | /* Each pass through the following loop invokes a callback. */ | ||
2389 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | ||
2390 | c = cl = 0; | ||
2391 | while (list) { | ||
2392 | next = list->next; | ||
2393 | /* Wait for enqueuing to complete, if needed. */ | ||
2394 | while (next == NULL && &list->next != tail) { | ||
2395 | schedule_timeout_interruptible(1); | ||
2396 | next = list->next; | ||
2397 | } | ||
2398 | debug_rcu_head_unqueue(list); | ||
2399 | local_bh_disable(); | ||
2400 | if (__rcu_reclaim(rdp->rsp->name, list)) | ||
2401 | cl++; | ||
2402 | c++; | ||
2403 | local_bh_enable(); | ||
2404 | list = next; | ||
2405 | } | ||
2406 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | ||
2407 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | ||
2408 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | ||
2409 | rdp->n_nocbs_invoked += c; | ||
2410 | } | ||
2411 | return 0; | ||
2412 | } | ||
2413 | |||
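
Complementing the enqueue sketch earlier: the kthread above detaches the entire pending list in one shot (NULL the head, xchg the tail back to &head) and only then waits for a grace period and invokes the batch, so the hot enqueue path never needs a lock. A self-contained userspace model of just that detach step (not kernel code):

        #include <stdatomic.h>
        #include <stddef.h>
        #include <stdio.h>

        struct cb { struct cb *next; int id; };

        struct nocb_queue {
                struct cb *head;
                struct cb ** _Atomic tail;      /* last ->next pointer, or &head */
        };

        int main(void)
        {
                struct cb b = { NULL, 2 }, a = { &b, 1 };
                struct nocb_queue q = { &a, &b.next };  /* two callbacks queued */
                struct cb *list, **tail;

                /* Detach everything queued so far, as rcu_nocb_kthread() does. */
                list = q.head;
                q.head = NULL;
                tail = atomic_exchange(&q.tail, &q.head);

                /* The real kthread calls wait_rcu_gp() here, then invokes the
                   batch; "tail" lets it spot an enqueue racing with the detach. */
                for (; list; list = list->next)
                        printf("invoke callback %d\n", list->id);
                (void)tail;
                return 0;
        }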
2414 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | ||
2415 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2416 | { | ||
2417 | rdp->nocb_tail = &rdp->nocb_head; | ||
2418 | init_waitqueue_head(&rdp->nocb_wq); | ||
2419 | } | ||
2420 | |||
2421 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | ||
2422 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2423 | { | ||
2424 | int cpu; | ||
2425 | struct rcu_data *rdp; | ||
2426 | struct task_struct *t; | ||
2427 | |||
2428 | if (rcu_nocb_mask == NULL) | ||
2429 | return; | ||
2430 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
2431 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2432 | t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); | ||
2433 | BUG_ON(IS_ERR(t)); | ||
2434 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
2435 | } | ||
2436 | } | ||
2437 | |||
2438 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
2439 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2440 | { | ||
2441 | if (rcu_nocb_mask == NULL || | ||
2442 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
2443 | return; | ||
2444 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2445 | } | ||
2446 | |||
2447 | /* Initialize the ->call_remote fields in the rcu_state structures. */ | ||
2448 | static void __init rcu_init_nocb(void) | ||
2449 | { | ||
2450 | #ifdef CONFIG_PREEMPT_RCU | ||
2451 | rcu_preempt_state.call_remote = call_rcu_preempt_remote; | ||
2452 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2453 | rcu_bh_state.call_remote = call_rcu_bh_remote; | ||
2454 | rcu_sched_state.call_remote = call_rcu_sched_remote; | ||
2455 | } | ||
2456 | |||
2457 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2458 | |||
2459 | static bool is_nocb_cpu(int cpu) | ||
2460 | { | ||
2461 | return false; | ||
2462 | } | ||
2463 | |||
2464 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2465 | bool lazy) | ||
2466 | { | ||
2467 | return 0; | ||
2468 | } | ||
2469 | |||
2470 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2471 | struct rcu_data *rdp) | ||
2472 | { | ||
2473 | return 0; | ||
2474 | } | ||
2475 | |||
2476 | static bool nocb_cpu_expendable(int cpu) | ||
2477 | { | ||
2478 | return 1; | ||
2479 | } | ||
2480 | |||
2481 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2482 | { | ||
2483 | } | ||
2484 | |||
2485 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2486 | { | ||
2487 | } | ||
2488 | |||
2489 | static void init_nocb_callback_list(struct rcu_data *rdp) | ||
2490 | { | ||
2491 | } | ||
2492 | |||
2493 | static void __init rcu_init_nocb(void) | ||
2494 | { | ||
2495 | } | ||
2496 | |||
2497 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 693513bc50e6..0d095dcaa670 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,29 +46,58 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | static int show_rcubarrier(struct seq_file *m, void *unused) | 49 | #define ulong2long(a) (*(long *)(&(a))) |
50 | |||
51 | static int r_open(struct inode *inode, struct file *file, | ||
52 | const struct seq_operations *op) | ||
50 | { | 53 | { |
51 | struct rcu_state *rsp; | 54 | int ret = seq_open(file, op); |
55 | if (!ret) { | ||
56 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
57 | m->private = inode->i_private; | ||
58 | } | ||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
63 | { | ||
64 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
65 | *pos = cpumask_next(*pos - 1, cpu_possible_mask); | ||
66 | if ((*pos) < nr_cpu_ids) | ||
67 | return per_cpu_ptr(rsp->rda, *pos); | ||
68 | return NULL; | ||
69 | } | ||
52 | 70 | ||
53 | for_each_rcu_flavor(rsp) | 71 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
54 | seq_printf(m, "%s: bcc: %d nbd: %lu\n", | 72 | { |
55 | rsp->name, | 73 | (*pos)++; |
56 | atomic_read(&rsp->barrier_cpu_count), | 74 | return r_start(m, pos); |
57 | rsp->n_barrier_done); | 75 | } |
76 | |||
77 | static void r_stop(struct seq_file *m, void *v) | ||
78 | { | ||
79 | } | ||
80 | |||
81 | static int show_rcubarrier(struct seq_file *m, void *v) | ||
82 | { | ||
83 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
84 | seq_printf(m, "bcc: %d nbd: %lu\n", | ||
85 | atomic_read(&rsp->barrier_cpu_count), | ||
86 | rsp->n_barrier_done); | ||
58 | return 0; | 87 | return 0; |
59 | } | 88 | } |
60 | 89 | ||
61 | static int rcubarrier_open(struct inode *inode, struct file *file) | 90 | static int rcubarrier_open(struct inode *inode, struct file *file) |
62 | { | 91 | { |
63 | return single_open(file, show_rcubarrier, NULL); | 92 | return single_open(file, show_rcubarrier, inode->i_private); |
64 | } | 93 | } |
65 | 94 | ||
66 | static const struct file_operations rcubarrier_fops = { | 95 | static const struct file_operations rcubarrier_fops = { |
67 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
68 | .open = rcubarrier_open, | 97 | .open = rcubarrier_open, |
69 | .read = seq_read, | 98 | .read = seq_read, |
70 | .llseek = seq_lseek, | 99 | .llseek = no_llseek, |
71 | .release = single_release, | 100 | .release = seq_release, |
72 | }; | 101 | }; |
73 | 102 | ||
74 | #ifdef CONFIG_RCU_BOOST | 103 | #ifdef CONFIG_RCU_BOOST |
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status) | |||
84 | 113 | ||
85 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 114 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
86 | { | 115 | { |
116 | long ql, qll; | ||
117 | |||
87 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
88 | return; | 119 | return; |
89 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", |
90 | rdp->cpu, | 121 | rdp->cpu, |
91 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
92 | rdp->completed, rdp->gpnum, | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
93 | rdp->passed_quiesce, rdp->qs_pending); | 124 | rdp->passed_quiesce, rdp->qs_pending); |
94 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 125 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
95 | atomic_read(&rdp->dynticks->dynticks), | 126 | atomic_read(&rdp->dynticks->dynticks), |
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
97 | rdp->dynticks->dynticks_nmi_nesting, | 128 | rdp->dynticks->dynticks_nmi_nesting, |
98 | rdp->dynticks_fqs); | 129 | rdp->dynticks_fqs); |
99 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 130 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
131 | rcu_nocb_q_lengths(rdp, &ql, &qll); | ||
132 | qll += rdp->qlen_lazy; | ||
133 | ql += rdp->qlen; | ||
100 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 134 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
101 | rdp->qlen_lazy, rdp->qlen, | 135 | qll, ql, |
102 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 136 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
103 | rdp->nxttail[RCU_NEXT_TAIL]], | 137 | rdp->nxttail[RCU_NEXT_TAIL]], |
104 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 138 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != |
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
114 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | 148 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); |
115 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 149 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
116 | seq_printf(m, " b=%ld", rdp->blimit); | 150 | seq_printf(m, " b=%ld", rdp->blimit); |
117 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 151 | seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", |
118 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 152 | rdp->n_cbs_invoked, rdp->n_nocbs_invoked, |
153 | rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
119 | } | 154 | } |
120 | 155 | ||
121 | static int show_rcudata(struct seq_file *m, void *unused) | 156 | static int show_rcudata(struct seq_file *m, void *v) |
122 | { | 157 | { |
123 | int cpu; | 158 | print_one_rcu_data(m, (struct rcu_data *)v); |
124 | struct rcu_state *rsp; | ||
125 | |||
126 | for_each_rcu_flavor(rsp) { | ||
127 | seq_printf(m, "%s:\n", rsp->name); | ||
128 | for_each_possible_cpu(cpu) | ||
129 | print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); | ||
130 | } | ||
131 | return 0; | 159 | return 0; |
132 | } | 160 | } |
133 | 161 | ||
162 | static const struct seq_operations rcudate_op = { | ||
163 | .start = r_start, | ||
164 | .next = r_next, | ||
165 | .stop = r_stop, | ||
166 | .show = show_rcudata, | ||
167 | }; | ||
168 | |||
134 | static int rcudata_open(struct inode *inode, struct file *file) | 169 | static int rcudata_open(struct inode *inode, struct file *file) |
135 | { | 170 | { |
136 | return single_open(file, show_rcudata, NULL); | 171 | return r_open(inode, file, &rcudate_op); |
137 | } | 172 | } |
138 | 173 | ||
139 | static const struct file_operations rcudata_fops = { | 174 | static const struct file_operations rcudata_fops = { |
140 | .owner = THIS_MODULE, | 175 | .owner = THIS_MODULE, |
141 | .open = rcudata_open, | 176 | .open = rcudata_open, |
142 | .read = seq_read, | 177 | .read = seq_read, |
143 | .llseek = seq_lseek, | 178 | .llseek = no_llseek, |
144 | .release = single_release, | 179 | .release = seq_release, |
145 | }; | 180 | }; |
146 | 181 | ||
147 | static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | 182 | static int show_rcuexp(struct seq_file *m, void *v) |
148 | { | ||
149 | if (!rdp->beenonline) | ||
150 | return; | ||
151 | seq_printf(m, "%d,%s,%lu,%lu,%d,%d", | ||
152 | rdp->cpu, | ||
153 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | ||
154 | rdp->completed, rdp->gpnum, | ||
155 | rdp->passed_quiesce, rdp->qs_pending); | ||
156 | seq_printf(m, ",%d,%llx,%d,%lu", | ||
157 | atomic_read(&rdp->dynticks->dynticks), | ||
158 | rdp->dynticks->dynticks_nesting, | ||
159 | rdp->dynticks->dynticks_nmi_nesting, | ||
160 | rdp->dynticks_fqs); | ||
161 | seq_printf(m, ",%lu", rdp->offline_fqs); | ||
162 | seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, | ||
163 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
164 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
165 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
166 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
167 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
168 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
169 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
170 | #ifdef CONFIG_RCU_BOOST | ||
171 | seq_printf(m, ",%d,\"%c\"", | ||
172 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
173 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
174 | rdp->cpu))); | ||
175 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
176 | seq_printf(m, ",%ld", rdp->blimit); | ||
177 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
178 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
179 | } | ||
180 | |||
181 | static int show_rcudata_csv(struct seq_file *m, void *unused) | ||
182 | { | 183 | { |
183 | int cpu; | 184 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
184 | struct rcu_state *rsp; | 185 | |
185 | 186 | seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", | |
186 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); | 187 | atomic_long_read(&rsp->expedited_start), |
187 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 188 | atomic_long_read(&rsp->expedited_done), |
188 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | 189 | atomic_long_read(&rsp->expedited_wrap), |
189 | #ifdef CONFIG_RCU_BOOST | 190 | atomic_long_read(&rsp->expedited_tryfail), |
190 | seq_puts(m, "\"kt\",\"ktl\""); | 191 | atomic_long_read(&rsp->expedited_workdone1), |
191 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 192 | atomic_long_read(&rsp->expedited_workdone2), |
192 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | 193 | atomic_long_read(&rsp->expedited_normal), |
193 | for_each_rcu_flavor(rsp) { | 194 | atomic_long_read(&rsp->expedited_stoppedcpus), |
194 | seq_printf(m, "\"%s:\"\n", rsp->name); | 195 | atomic_long_read(&rsp->expedited_done_tries), |
195 | for_each_possible_cpu(cpu) | 196 | atomic_long_read(&rsp->expedited_done_lost), |
196 | print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); | 197 | atomic_long_read(&rsp->expedited_done_exit)); |
197 | } | ||
198 | return 0; | 198 | return 0; |
199 | } | 199 | } |
200 | 200 | ||
201 | static int rcudata_csv_open(struct inode *inode, struct file *file) | 201 | static int rcuexp_open(struct inode *inode, struct file *file) |
202 | { | 202 | { |
203 | return single_open(file, show_rcudata_csv, NULL); | 203 | return single_open(file, show_rcuexp, inode->i_private); |
204 | } | 204 | } |
205 | 205 | ||
206 | static const struct file_operations rcudata_csv_fops = { | 206 | static const struct file_operations rcuexp_fops = { |
207 | .owner = THIS_MODULE, | 207 | .owner = THIS_MODULE, |
208 | .open = rcudata_csv_open, | 208 | .open = rcuexp_open, |
209 | .read = seq_read, | 209 | .read = seq_read, |
210 | .llseek = seq_lseek, | 210 | .llseek = no_llseek, |
211 | .release = single_release, | 211 | .release = seq_release, |
212 | }; | 212 | }; |
213 | 213 | ||
214 | #ifdef CONFIG_RCU_BOOST | 214 | #ifdef CONFIG_RCU_BOOST |
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = { | |||
254 | .owner = THIS_MODULE, | 254 | .owner = THIS_MODULE, |
255 | .open = rcu_node_boost_open, | 255 | .open = rcu_node_boost_open, |
256 | .read = seq_read, | 256 | .read = seq_read, |
257 | .llseek = seq_lseek, | 257 | .llseek = no_llseek, |
258 | .release = single_release, | 258 | .release = single_release, |
259 | }; | 259 | }; |
260 | 260 | ||
261 | /* | 261 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
262 | * Create the rcuboost debugfs entry. Standard error return. | ||
263 | */ | ||
264 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
265 | { | ||
266 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
267 | &rcu_node_boost_fops); | ||
268 | } | ||
269 | |||
270 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
271 | |||
272 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
273 | { | ||
274 | return 0; /* There cannot be an error if we didn't create it! */ | ||
275 | } | ||
276 | |||
277 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
278 | 262 | ||
279 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 263 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
280 | { | 264 | { |
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
283 | struct rcu_node *rnp; | 267 | struct rcu_node *rnp; |
284 | 268 | ||
285 | gpnum = rsp->gpnum; | 269 | gpnum = rsp->gpnum; |
286 | seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", | 270 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", |
287 | rsp->name, rsp->completed, gpnum, rsp->fqs_state, | 271 | ulong2long(rsp->completed), ulong2long(gpnum), |
272 | rsp->fqs_state, | ||
288 | (long)(rsp->jiffies_force_qs - jiffies), | 273 | (long)(rsp->jiffies_force_qs - jiffies), |
289 | (int)(jiffies & 0xffff)); | 274 | (int)(jiffies & 0xffff)); |
290 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 275 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
306 | seq_puts(m, "\n"); | 291 | seq_puts(m, "\n"); |
307 | } | 292 | } |
308 | 293 | ||
309 | static int show_rcuhier(struct seq_file *m, void *unused) | 294 | static int show_rcuhier(struct seq_file *m, void *v) |
310 | { | 295 | { |
311 | struct rcu_state *rsp; | 296 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
312 | 297 | print_one_rcu_state(m, rsp); | |
313 | for_each_rcu_flavor(rsp) | ||
314 | print_one_rcu_state(m, rsp); | ||
315 | return 0; | 298 | return 0; |
316 | } | 299 | } |
317 | 300 | ||
318 | static int rcuhier_open(struct inode *inode, struct file *file) | 301 | static int rcuhier_open(struct inode *inode, struct file *file) |
319 | { | 302 | { |
320 | return single_open(file, show_rcuhier, NULL); | 303 | return single_open(file, show_rcuhier, inode->i_private); |
321 | } | 304 | } |
322 | 305 | ||
323 | static const struct file_operations rcuhier_fops = { | 306 | static const struct file_operations rcuhier_fops = { |
324 | .owner = THIS_MODULE, | 307 | .owner = THIS_MODULE, |
325 | .open = rcuhier_open, | 308 | .open = rcuhier_open, |
326 | .read = seq_read, | 309 | .read = seq_read, |
327 | .llseek = seq_lseek, | 310 | .llseek = no_llseek, |
328 | .release = single_release, | 311 | .release = seq_release, |
329 | }; | 312 | }; |
330 | 313 | ||
331 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | 314 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) |
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
338 | struct rcu_node *rnp = &rsp->node[0]; | 321 | struct rcu_node *rnp = &rsp->node[0]; |
339 | 322 | ||
340 | raw_spin_lock_irqsave(&rnp->lock, flags); | 323 | raw_spin_lock_irqsave(&rnp->lock, flags); |
341 | completed = rsp->completed; | 324 | completed = ACCESS_ONCE(rsp->completed); |
342 | gpnum = rsp->gpnum; | 325 | gpnum = ACCESS_ONCE(rsp->gpnum); |
343 | if (rsp->completed == rsp->gpnum) | 326 | if (completed == gpnum) |
344 | gpage = 0; | 327 | gpage = 0; |
345 | else | 328 | else |
346 | gpage = jiffies - rsp->gp_start; | 329 | gpage = jiffies - rsp->gp_start; |
347 | gpmax = rsp->gp_max; | 330 | gpmax = rsp->gp_max; |
348 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 331 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
349 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | 332 | seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", |
350 | rsp->name, completed, gpnum, gpage, gpmax); | 333 | ulong2long(completed), ulong2long(gpnum), gpage, gpmax); |
351 | } | 334 | } |
352 | 335 | ||
353 | static int show_rcugp(struct seq_file *m, void *unused) | 336 | static int show_rcugp(struct seq_file *m, void *v) |
354 | { | 337 | { |
355 | struct rcu_state *rsp; | 338 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
356 | 339 | show_one_rcugp(m, rsp); | |
357 | for_each_rcu_flavor(rsp) | ||
358 | show_one_rcugp(m, rsp); | ||
359 | return 0; | 340 | return 0; |
360 | } | 341 | } |
361 | 342 | ||
362 | static int rcugp_open(struct inode *inode, struct file *file) | 343 | static int rcugp_open(struct inode *inode, struct file *file) |
363 | { | 344 | { |
364 | return single_open(file, show_rcugp, NULL); | 345 | return single_open(file, show_rcugp, inode->i_private); |
365 | } | 346 | } |
366 | 347 | ||
367 | static const struct file_operations rcugp_fops = { | 348 | static const struct file_operations rcugp_fops = { |
368 | .owner = THIS_MODULE, | 349 | .owner = THIS_MODULE, |
369 | .open = rcugp_open, | 350 | .open = rcugp_open, |
370 | .read = seq_read, | 351 | .read = seq_read, |
371 | .llseek = seq_lseek, | 352 | .llseek = no_llseek, |
372 | .release = single_release, | 353 | .release = seq_release, |
373 | }; | 354 | }; |
374 | 355 | ||
375 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 356 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
376 | { | 357 | { |
358 | if (!rdp->beenonline) | ||
359 | return; | ||
377 | seq_printf(m, "%3d%cnp=%ld ", | 360 | seq_printf(m, "%3d%cnp=%ld ", |
378 | rdp->cpu, | 361 | rdp->cpu, |
379 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 362 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
389 | rdp->n_rp_need_nothing); | 372 | rdp->n_rp_need_nothing); |
390 | } | 373 | } |
391 | 374 | ||
392 | static int show_rcu_pending(struct seq_file *m, void *unused) | 375 | static int show_rcu_pending(struct seq_file *m, void *v) |
393 | { | 376 | { |
394 | int cpu; | 377 | print_one_rcu_pending(m, (struct rcu_data *)v); |
395 | struct rcu_data *rdp; | ||
396 | struct rcu_state *rsp; | ||
397 | |||
398 | for_each_rcu_flavor(rsp) { | ||
399 | seq_printf(m, "%s:\n", rsp->name); | ||
400 | for_each_possible_cpu(cpu) { | ||
401 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
402 | if (rdp->beenonline) | ||
403 | print_one_rcu_pending(m, rdp); | ||
404 | } | ||
405 | } | ||
406 | return 0; | 378 | return 0; |
407 | } | 379 | } |
408 | 380 | ||
381 | static const struct seq_operations rcu_pending_op = { | ||
382 | .start = r_start, | ||
383 | .next = r_next, | ||
384 | .stop = r_stop, | ||
385 | .show = show_rcu_pending, | ||
386 | }; | ||
387 | |||
409 | static int rcu_pending_open(struct inode *inode, struct file *file) | 388 | static int rcu_pending_open(struct inode *inode, struct file *file) |
410 | { | 389 | { |
411 | return single_open(file, show_rcu_pending, NULL); | 390 | return r_open(inode, file, &rcu_pending_op); |
412 | } | 391 | } |
413 | 392 | ||
414 | static const struct file_operations rcu_pending_fops = { | 393 | static const struct file_operations rcu_pending_fops = { |
415 | .owner = THIS_MODULE, | 394 | .owner = THIS_MODULE, |
416 | .open = rcu_pending_open, | 395 | .open = rcu_pending_open, |
417 | .read = seq_read, | 396 | .read = seq_read, |
418 | .llseek = seq_lseek, | 397 | .llseek = no_llseek, |
419 | .release = single_release, | 398 | .release = seq_release, |
420 | }; | 399 | }; |
421 | 400 | ||
422 | static int show_rcutorture(struct seq_file *m, void *unused) | 401 | static int show_rcutorture(struct seq_file *m, void *unused) |
@@ -446,43 +425,58 @@ static struct dentry *rcudir; | |||
446 | 425 | ||
447 | static int __init rcutree_trace_init(void) | 426 | static int __init rcutree_trace_init(void) |
448 | { | 427 | { |
428 | struct rcu_state *rsp; | ||
449 | struct dentry *retval; | 429 | struct dentry *retval; |
430 | struct dentry *rspdir; | ||
450 | 431 | ||
451 | rcudir = debugfs_create_dir("rcu", NULL); | 432 | rcudir = debugfs_create_dir("rcu", NULL); |
452 | if (!rcudir) | 433 | if (!rcudir) |
453 | goto free_out; | 434 | goto free_out; |
454 | 435 | ||
455 | retval = debugfs_create_file("rcubarrier", 0444, rcudir, | 436 | for_each_rcu_flavor(rsp) { |
456 | NULL, &rcubarrier_fops); | 437 | rspdir = debugfs_create_dir(rsp->name, rcudir); |
457 | if (!retval) | 438 | if (!rspdir) |
458 | goto free_out; | 439 | goto free_out; |
459 | 440 | ||
460 | retval = debugfs_create_file("rcudata", 0444, rcudir, | 441 | retval = debugfs_create_file("rcudata", 0444, |
461 | NULL, &rcudata_fops); | 442 | rspdir, rsp, &rcudata_fops); |
462 | if (!retval) | 443 | if (!retval) |
463 | goto free_out; | 444 | goto free_out; |
464 | 445 | ||
465 | retval = debugfs_create_file("rcudata.csv", 0444, rcudir, | 446 | retval = debugfs_create_file("rcuexp", 0444, |
466 | NULL, &rcudata_csv_fops); | 447 | rspdir, rsp, &rcuexp_fops); |
467 | if (!retval) | 448 | if (!retval) |
468 | goto free_out; | 449 | goto free_out; |
469 | 450 | ||
470 | if (rcu_boost_trace_create_file(rcudir)) | 451 | retval = debugfs_create_file("rcu_pending", 0444, |
471 | goto free_out; | 452 | rspdir, rsp, &rcu_pending_fops); |
453 | if (!retval) | ||
454 | goto free_out; | ||
455 | |||
456 | retval = debugfs_create_file("rcubarrier", 0444, | ||
457 | rspdir, rsp, &rcubarrier_fops); | ||
458 | if (!retval) | ||
459 | goto free_out; | ||
472 | 460 | ||
473 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 461 | #ifdef CONFIG_RCU_BOOST |
474 | if (!retval) | 462 | if (rsp == &rcu_preempt_state) { |
475 | goto free_out; | 463 | retval = debugfs_create_file("rcuboost", 0444, |
464 | rspdir, NULL, &rcu_node_boost_fops); | ||
465 | if (!retval) | ||
466 | goto free_out; | ||
467 | } | ||
468 | #endif | ||
476 | 469 | ||
477 | retval = debugfs_create_file("rcuhier", 0444, rcudir, | 470 | retval = debugfs_create_file("rcugp", 0444, |
478 | NULL, &rcuhier_fops); | 471 | rspdir, rsp, &rcugp_fops); |
479 | if (!retval) | 472 | if (!retval) |
480 | goto free_out; | 473 | goto free_out; |
481 | 474 | ||
482 | retval = debugfs_create_file("rcu_pending", 0444, rcudir, | 475 | retval = debugfs_create_file("rcuhier", 0444, |
483 | NULL, &rcu_pending_fops); | 476 | rspdir, rsp, &rcuhier_fops); |
484 | if (!retval) | 477 | if (!retval) |
485 | goto free_out; | 478 | goto free_out; |
479 | } | ||
486 | 480 | ||
487 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | 481 | retval = debugfs_create_file("rcutorture", 0444, rcudir, |
488 | NULL, &rcutorture_fops); | 482 | NULL, &rcutorture_fops); |
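Note: the rcutree_trace.c rework above replaces the single flat debugfs directory, where every show function looped over all RCU flavors, with one subdirectory per flavor; the flavor's rcu_state is passed through inode->i_private, the whole-state files keep using single_open(), and the per-CPU files (rcudata, rcu_pending) switch to a seq_file iterator that walks cpu_possible_mask. A minimal sketch of that iterator pattern, with placeholder names (my_state, my_pcpu) rather than names from the patch:

#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

struct my_pcpu { int cpu; unsigned long count; };
struct my_state { struct my_pcpu __percpu *pcpu; };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	struct my_state *st = m->private;	/* stashed from inode->i_private at open */

	*pos = cpumask_next(*pos - 1, cpu_possible_mask);
	if (*pos < nr_cpu_ids)
		return per_cpu_ptr(st->pcpu, *pos);	/* one record per possible CPU */
	return NULL;					/* end of sequence */
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return my_start(m, pos);
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	struct my_pcpu *p = v;

	seq_printf(m, "%3d count=%lu\n", p->cpu, p->count);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};

The split keeps single_open() where one show call covers the whole state (rcugp, rcuhier, rcubarrier, rcuexp) and uses the iterator where output is naturally per CPU, so nothing has to be buffered for all CPUs at once.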
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ad581aa2369a..3920d593e63c 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
192 | *res = PAGE_ALIGN(*res); | 192 | *res = PAGE_ALIGN(*res); |
193 | return 0; | 193 | return 0; |
194 | } | 194 | } |
195 | |||
196 | int res_counter_write(struct res_counter *counter, int member, | ||
197 | const char *buf, write_strategy_fn write_strategy) | ||
198 | { | ||
199 | char *end; | ||
200 | unsigned long flags; | ||
201 | unsigned long long tmp, *val; | ||
202 | |||
203 | if (write_strategy) { | ||
204 | if (write_strategy(buf, &tmp)) | ||
205 | return -EINVAL; | ||
206 | } else { | ||
207 | tmp = simple_strtoull(buf, &end, 10); | ||
208 | if (*end != '\0') | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | spin_lock_irqsave(&counter->lock, flags); | ||
212 | val = res_counter_member(counter, member); | ||
213 | *val = tmp; | ||
214 | spin_unlock_irqrestore(&counter->lock, flags); | ||
215 | return 0; | ||
216 | } | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f5eb1838b3e..257002c13bb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -72,6 +72,7 @@ | |||
72 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
74 | #include <linux/binfmts.h> | 74 | #include <linux/binfmts.h> |
75 | #include <linux/context_tracking.h> | ||
75 | 76 | ||
76 | #include <asm/switch_to.h> | 77 | #include <asm/switch_to.h> |
77 | #include <asm/tlb.h> | 78 | #include <asm/tlb.h> |
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { }; | |||
192 | static void sched_feat_enable(int i) { }; | 193 | static void sched_feat_enable(int i) { }; |
193 | #endif /* HAVE_JUMP_LABEL */ | 194 | #endif /* HAVE_JUMP_LABEL */ |
194 | 195 | ||
195 | static ssize_t | 196 | static int sched_feat_set(char *cmp) |
196 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
197 | size_t cnt, loff_t *ppos) | ||
198 | { | 197 | { |
199 | char buf[64]; | ||
200 | char *cmp; | ||
201 | int neg = 0; | ||
202 | int i; | 198 | int i; |
203 | 199 | int neg = 0; | |
204 | if (cnt > 63) | ||
205 | cnt = 63; | ||
206 | |||
207 | if (copy_from_user(&buf, ubuf, cnt)) | ||
208 | return -EFAULT; | ||
209 | |||
210 | buf[cnt] = 0; | ||
211 | cmp = strstrip(buf); | ||
212 | 200 | ||
213 | if (strncmp(cmp, "NO_", 3) == 0) { | 201 | if (strncmp(cmp, "NO_", 3) == 0) { |
214 | neg = 1; | 202 | neg = 1; |
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
228 | } | 216 | } |
229 | } | 217 | } |
230 | 218 | ||
219 | return i; | ||
220 | } | ||
221 | |||
222 | static ssize_t | ||
223 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
224 | size_t cnt, loff_t *ppos) | ||
225 | { | ||
226 | char buf[64]; | ||
227 | char *cmp; | ||
228 | int i; | ||
229 | |||
230 | if (cnt > 63) | ||
231 | cnt = 63; | ||
232 | |||
233 | if (copy_from_user(&buf, ubuf, cnt)) | ||
234 | return -EFAULT; | ||
235 | |||
236 | buf[cnt] = 0; | ||
237 | cmp = strstrip(buf); | ||
238 | |||
239 | i = sched_feat_set(cmp); | ||
231 | if (i == __SCHED_FEAT_NR) | 240 | if (i == __SCHED_FEAT_NR) |
232 | return -EINVAL; | 241 | return -EINVAL; |
233 | 242 | ||
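Note: the parsing body of sched_feat_write() is split out into sched_feat_set() so that kernel code can toggle scheduler features directly instead of going through the debugfs write path; the NUMA-balancing hunk further down in this file relies on exactly that. Sketch of the in-kernel usage, mirroring set_numabalancing_state() later in this diff:

	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");	/* the NO_ prefix clears the feature */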
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
922 | rq->skip_clock_update = 1; | 931 | rq->skip_clock_update = 1; |
923 | } | 932 | } |
924 | 933 | ||
934 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
935 | |||
936 | void register_task_migration_notifier(struct notifier_block *n) | ||
937 | { | ||
938 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
939 | } | ||
940 | |||
925 | #ifdef CONFIG_SMP | 941 | #ifdef CONFIG_SMP |
926 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 942 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
927 | { | 943 | { |
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
952 | trace_sched_migrate_task(p, new_cpu); | 968 | trace_sched_migrate_task(p, new_cpu); |
953 | 969 | ||
954 | if (task_cpu(p) != new_cpu) { | 970 | if (task_cpu(p) != new_cpu) { |
971 | struct task_migration_notifier tmn; | ||
972 | |||
973 | if (p->sched_class->migrate_task_rq) | ||
974 | p->sched_class->migrate_task_rq(p, new_cpu); | ||
955 | p->se.nr_migrations++; | 975 | p->se.nr_migrations++; |
956 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 976 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
977 | |||
978 | tmn.task = p; | ||
979 | tmn.from_cpu = task_cpu(p); | ||
980 | tmn.to_cpu = new_cpu; | ||
981 | |||
982 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
957 | } | 983 | } |
958 | 984 | ||
959 | __set_task_cpu(p, new_cpu); | 985 | __set_task_cpu(p, new_cpu); |
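Note: set_task_cpu() now publishes cross-CPU migrations on an atomic notifier chain; an interested subsystem registers once with register_task_migration_notifier() and receives the task plus source and destination CPUs in a struct task_migration_notifier. A sketch of such a consumer (my_migration_cb and my_migration_nb are placeholders, not code from this patch):

static int my_migration_cb(struct notifier_block *nb, unsigned long action,
			   void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("task %d migrating %d -> %d\n",
		 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block my_migration_nb = {
	.notifier_call = my_migration_cb,
};

/* during subsystem init: */
register_task_migration_notifier(&my_migration_nb);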
@@ -1524,6 +1550,15 @@ static void __sched_fork(struct task_struct *p) | |||
1524 | p->se.vruntime = 0; | 1550 | p->se.vruntime = 0; |
1525 | INIT_LIST_HEAD(&p->se.group_node); | 1551 | INIT_LIST_HEAD(&p->se.group_node); |
1526 | 1552 | ||
1553 | /* | ||
1554 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
1555 | * removed when useful for applications beyond shares distribution (e.g. | ||
1556 | * load-balance). | ||
1557 | */ | ||
1558 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1559 | p->se.avg.runnable_avg_period = 0; | ||
1560 | p->se.avg.runnable_avg_sum = 0; | ||
1561 | #endif | ||
1527 | #ifdef CONFIG_SCHEDSTATS | 1562 | #ifdef CONFIG_SCHEDSTATS |
1528 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1563 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1529 | #endif | 1564 | #endif |
@@ -1533,8 +1568,41 @@ static void __sched_fork(struct task_struct *p) | |||
1533 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1568 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1534 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1569 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1535 | #endif | 1570 | #endif |
1571 | |||
1572 | #ifdef CONFIG_NUMA_BALANCING | ||
1573 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | ||
1574 | p->mm->numa_next_scan = jiffies; | ||
1575 | p->mm->numa_next_reset = jiffies; | ||
1576 | p->mm->numa_scan_seq = 0; | ||
1577 | } | ||
1578 | |||
1579 | p->node_stamp = 0ULL; | ||
1580 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | ||
1581 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1582 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | ||
1583 | p->numa_work.next = &p->numa_work; | ||
1584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1536 | } | 1585 | } |
1537 | 1586 | ||
1587 | #ifdef CONFIG_NUMA_BALANCING | ||
1588 | #ifdef CONFIG_SCHED_DEBUG | ||
1589 | void set_numabalancing_state(bool enabled) | ||
1590 | { | ||
1591 | if (enabled) | ||
1592 | sched_feat_set("NUMA"); | ||
1593 | else | ||
1594 | sched_feat_set("NO_NUMA"); | ||
1595 | } | ||
1596 | #else | ||
1597 | __read_mostly bool numabalancing_enabled; | ||
1598 | |||
1599 | void set_numabalancing_state(bool enabled) | ||
1600 | { | ||
1601 | numabalancing_enabled = enabled; | ||
1602 | } | ||
1603 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1604 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1605 | |||
1538 | /* | 1606 | /* |
1539 | * fork()/clone()-time setup: | 1607 | * fork()/clone()-time setup: |
1540 | */ | 1608 | */ |
@@ -1886,8 +1954,8 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1886 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 1954 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
1887 | #endif | 1955 | #endif |
1888 | 1956 | ||
1957 | context_tracking_task_switch(prev, next); | ||
1889 | /* Here we just switch the register state and the stack. */ | 1958 | /* Here we just switch the register state and the stack. */ |
1890 | rcu_switch(prev, next); | ||
1891 | switch_to(prev, next, prev); | 1959 | switch_to(prev, next, prev); |
1892 | 1960 | ||
1893 | barrier(); | 1961 | barrier(); |
@@ -2911,7 +2979,7 @@ asmlinkage void __sched schedule(void) | |||
2911 | } | 2979 | } |
2912 | EXPORT_SYMBOL(schedule); | 2980 | EXPORT_SYMBOL(schedule); |
2913 | 2981 | ||
2914 | #ifdef CONFIG_RCU_USER_QS | 2982 | #ifdef CONFIG_CONTEXT_TRACKING |
2915 | asmlinkage void __sched schedule_user(void) | 2983 | asmlinkage void __sched schedule_user(void) |
2916 | { | 2984 | { |
2917 | /* | 2985 | /* |
@@ -2920,9 +2988,9 @@ asmlinkage void __sched schedule_user(void) | |||
2920 | * we haven't yet exited the RCU idle mode. Do it here manually until | 2988 | * we haven't yet exited the RCU idle mode. Do it here manually until |
2921 | * we find a better solution. | 2989 | * we find a better solution. |
2922 | */ | 2990 | */ |
2923 | rcu_user_exit(); | 2991 | user_exit(); |
2924 | schedule(); | 2992 | schedule(); |
2925 | rcu_user_enter(); | 2993 | user_enter(); |
2926 | } | 2994 | } |
2927 | #endif | 2995 | #endif |
2928 | 2996 | ||
@@ -3027,7 +3095,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3027 | /* Catch callers which need to be fixed */ | 3095 | /* Catch callers which need to be fixed */ |
3028 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3096 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3029 | 3097 | ||
3030 | rcu_user_exit(); | 3098 | user_exit(); |
3031 | do { | 3099 | do { |
3032 | add_preempt_count(PREEMPT_ACTIVE); | 3100 | add_preempt_count(PREEMPT_ACTIVE); |
3033 | local_irq_enable(); | 3101 | local_irq_enable(); |
@@ -4480,6 +4548,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; | |||
4480 | void sched_show_task(struct task_struct *p) | 4548 | void sched_show_task(struct task_struct *p) |
4481 | { | 4549 | { |
4482 | unsigned long free = 0; | 4550 | unsigned long free = 0; |
4551 | int ppid; | ||
4483 | unsigned state; | 4552 | unsigned state; |
4484 | 4553 | ||
4485 | state = p->state ? __ffs(p->state) + 1 : 0; | 4554 | state = p->state ? __ffs(p->state) + 1 : 0; |
@@ -4499,8 +4568,11 @@ void sched_show_task(struct task_struct *p) | |||
4499 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4568 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4500 | free = stack_not_used(p); | 4569 | free = stack_not_used(p); |
4501 | #endif | 4570 | #endif |
4571 | rcu_read_lock(); | ||
4572 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
4573 | rcu_read_unlock(); | ||
4502 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4574 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
4503 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), | 4575 | task_pid_nr(p), ppid, |
4504 | (unsigned long)task_thread_info(p)->flags); | 4576 | (unsigned long)task_thread_info(p)->flags); |
4505 | 4577 | ||
4506 | show_stack(p, NULL); | 4578 | show_stack(p, NULL); |
@@ -7474,7 +7546,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7474 | struct task_group, css); | 7546 | struct task_group, css); |
7475 | } | 7547 | } |
7476 | 7548 | ||
7477 | static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | 7549 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) |
7478 | { | 7550 | { |
7479 | struct task_group *tg, *parent; | 7551 | struct task_group *tg, *parent; |
7480 | 7552 | ||
@@ -7491,7 +7563,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) | |||
7491 | return &tg->css; | 7563 | return &tg->css; |
7492 | } | 7564 | } |
7493 | 7565 | ||
7494 | static void cpu_cgroup_destroy(struct cgroup *cgrp) | 7566 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
7495 | { | 7567 | { |
7496 | struct task_group *tg = cgroup_tg(cgrp); | 7568 | struct task_group *tg = cgroup_tg(cgrp); |
7497 | 7569 | ||
@@ -7851,8 +7923,8 @@ static struct cftype cpu_files[] = { | |||
7851 | 7923 | ||
7852 | struct cgroup_subsys cpu_cgroup_subsys = { | 7924 | struct cgroup_subsys cpu_cgroup_subsys = { |
7853 | .name = "cpu", | 7925 | .name = "cpu", |
7854 | .create = cpu_cgroup_create, | 7926 | .css_alloc = cpu_cgroup_css_alloc, |
7855 | .destroy = cpu_cgroup_destroy, | 7927 | .css_free = cpu_cgroup_css_free, |
7856 | .can_attach = cpu_cgroup_can_attach, | 7928 | .can_attach = cpu_cgroup_can_attach, |
7857 | .attach = cpu_cgroup_attach, | 7929 | .attach = cpu_cgroup_attach, |
7858 | .exit = cpu_cgroup_exit, | 7930 | .exit = cpu_cgroup_exit, |
@@ -7875,7 +7947,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7875 | struct cpuacct root_cpuacct; | 7947 | struct cpuacct root_cpuacct; |
7876 | 7948 | ||
7877 | /* create a new cpu accounting group */ | 7949 | /* create a new cpu accounting group */ |
7878 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7950 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) |
7879 | { | 7951 | { |
7880 | struct cpuacct *ca; | 7952 | struct cpuacct *ca; |
7881 | 7953 | ||
@@ -7905,7 +7977,7 @@ out: | |||
7905 | } | 7977 | } |
7906 | 7978 | ||
7907 | /* destroy an existing cpu accounting group */ | 7979 | /* destroy an existing cpu accounting group */ |
7908 | static void cpuacct_destroy(struct cgroup *cgrp) | 7980 | static void cpuacct_css_free(struct cgroup *cgrp) |
7909 | { | 7981 | { |
7910 | struct cpuacct *ca = cgroup_ca(cgrp); | 7982 | struct cpuacct *ca = cgroup_ca(cgrp); |
7911 | 7983 | ||
@@ -8076,9 +8148,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
8076 | 8148 | ||
8077 | struct cgroup_subsys cpuacct_subsys = { | 8149 | struct cgroup_subsys cpuacct_subsys = { |
8078 | .name = "cpuacct", | 8150 | .name = "cpuacct", |
8079 | .create = cpuacct_create, | 8151 | .css_alloc = cpuacct_css_alloc, |
8080 | .destroy = cpuacct_destroy, | 8152 | .css_free = cpuacct_css_free, |
8081 | .subsys_id = cpuacct_subsys_id, | 8153 | .subsys_id = cpuacct_subsys_id, |
8082 | .base_cftypes = files, | 8154 | .base_cftypes = files, |
8083 | }; | 8155 | }; |
8084 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8156 | #endif /* CONFIG_CGROUP_CPUACCT */ |
8157 | |||
8158 | void dump_cpu_task(int cpu) | ||
8159 | { | ||
8160 | pr_info("Task dump for CPU %d:\n", cpu); | ||
8161 | sched_show_task(cpu_curr(cpu)); | ||
8162 | } | ||
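Note: the cgroup hooks here are renamed from .create/.destroy to .css_alloc/.css_free to follow the renamed cgroup_subsys callbacks, and a small dump_cpu_task() helper is added at the end of core.c so other code can print whatever is currently running on a given CPU, for example from stall or hang reporting paths. A hypothetical caller (report_stalled_cpu is a placeholder, not part of this patch):

static void report_stalled_cpu(int cpu)
{
	pr_err("CPU %d looks wedged\n", cpu);
	dump_cpu_task(cpu);	/* pr_info header + sched_show_task(cpu_curr(cpu)) */
}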
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a6..293b202fcf79 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); | |||
43 | * Called before incrementing preempt_count on {soft,}irq_enter | 43 | * Called before incrementing preempt_count on {soft,}irq_enter |
44 | * and before decrementing preempt_count on {soft,}irq_exit. | 44 | * and before decrementing preempt_count on {soft,}irq_exit. |
45 | */ | 45 | */ |
46 | void vtime_account(struct task_struct *curr) | 46 | void irqtime_account_irq(struct task_struct *curr) |
47 | { | 47 | { |
48 | unsigned long flags; | 48 | unsigned long flags; |
49 | s64 delta; | 49 | s64 delta; |
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) | |||
73 | irq_time_write_end(); | 73 | irq_time_write_end(); |
74 | local_irq_restore(flags); | 74 | local_irq_restore(flags); |
75 | } | 75 | } |
76 | EXPORT_SYMBOL_GPL(vtime_account); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
77 | 77 | ||
78 | static int irqtime_account_hi_update(void) | 78 | static int irqtime_account_hi_update(void) |
79 | { | 79 | { |
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) | |||
288 | return false; | 288 | return false; |
289 | } | 289 | } |
290 | 290 | ||
291 | /* | ||
292 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | ||
293 | * tasks (sum on group iteration) belonging to @tsk's group. | ||
294 | */ | ||
295 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | ||
296 | { | ||
297 | struct signal_struct *sig = tsk->signal; | ||
298 | struct task_struct *t; | ||
299 | |||
300 | times->utime = sig->utime; | ||
301 | times->stime = sig->stime; | ||
302 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
303 | |||
304 | rcu_read_lock(); | ||
305 | /* make sure we can trust tsk->thread_group list */ | ||
306 | if (!likely(pid_alive(tsk))) | ||
307 | goto out; | ||
308 | |||
309 | t = tsk; | ||
310 | do { | ||
311 | times->utime += t->utime; | ||
312 | times->stime += t->stime; | ||
313 | times->sum_exec_runtime += task_sched_runtime(t); | ||
314 | } while_each_thread(tsk, t); | ||
315 | out: | ||
316 | rcu_read_unlock(); | ||
317 | } | ||
318 | |||
291 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 319 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
292 | 320 | ||
293 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 321 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) | |||
417 | * Use precise platform statistics if available: | 445 | * Use precise platform statistics if available: |
418 | */ | 446 | */ |
419 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 447 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
420 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 448 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
421 | { | 449 | { |
422 | *ut = p->utime; | 450 | *ut = p->utime; |
423 | *st = p->stime; | 451 | *st = p->stime; |
424 | } | 452 | } |
425 | 453 | ||
426 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 454 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
427 | { | 455 | { |
428 | struct task_cputime cputime; | 456 | struct task_cputime cputime; |
429 | 457 | ||
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
433 | *st = cputime.stime; | 461 | *st = cputime.stime; |
434 | } | 462 | } |
435 | 463 | ||
464 | void vtime_account_system_irqsafe(struct task_struct *tsk) | ||
465 | { | ||
466 | unsigned long flags; | ||
467 | |||
468 | local_irq_save(flags); | ||
469 | vtime_account_system(tsk); | ||
470 | local_irq_restore(flags); | ||
471 | } | ||
472 | EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); | ||
473 | |||
474 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
475 | void vtime_task_switch(struct task_struct *prev) | ||
476 | { | ||
477 | if (is_idle_task(prev)) | ||
478 | vtime_account_idle(prev); | ||
479 | else | ||
480 | vtime_account_system(prev); | ||
481 | |||
482 | vtime_account_user(prev); | ||
483 | arch_vtime_task_switch(prev); | ||
484 | } | ||
485 | #endif | ||
486 | |||
436 | /* | 487 | /* |
437 | * Archs that account the whole time spent in the idle task | 488 | * Archs that account the whole time spent in the idle task |
438 | * (outside irq) as idle time can rely on this and just implement | 489 | * (outside irq) as idle time can rely on this and just implement |
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
444 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 495 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
445 | void vtime_account(struct task_struct *tsk) | 496 | void vtime_account(struct task_struct *tsk) |
446 | { | 497 | { |
447 | unsigned long flags; | ||
448 | |||
449 | local_irq_save(flags); | ||
450 | |||
451 | if (in_interrupt() || !is_idle_task(tsk)) | 498 | if (in_interrupt() || !is_idle_task(tsk)) |
452 | vtime_account_system(tsk); | 499 | vtime_account_system(tsk); |
453 | else | 500 | else |
454 | vtime_account_idle(tsk); | 501 | vtime_account_idle(tsk); |
455 | |||
456 | local_irq_restore(flags); | ||
457 | } | 502 | } |
458 | EXPORT_SYMBOL_GPL(vtime_account); | 503 | EXPORT_SYMBOL_GPL(vtime_account); |
459 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 504 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | |||
478 | return (__force cputime_t) temp; | 523 | return (__force cputime_t) temp; |
479 | } | 524 | } |
480 | 525 | ||
481 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 526 | /* |
527 | * Adjust tick based cputime random precision against scheduler | ||
528 | * runtime accounting. | ||
529 | */ | ||
530 | static void cputime_adjust(struct task_cputime *curr, | ||
531 | struct cputime *prev, | ||
532 | cputime_t *ut, cputime_t *st) | ||
482 | { | 533 | { |
483 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | 534 | cputime_t rtime, utime, total; |
535 | |||
536 | utime = curr->utime; | ||
537 | total = utime + curr->stime; | ||
484 | 538 | ||
485 | /* | 539 | /* |
486 | * Use CFS's precise accounting: | 540 | * Tick based cputime accounting depend on random scheduling |
541 | * timeslices of a task to be interrupted or not by the timer. | ||
542 | * Depending on these circumstances, the number of these interrupts | ||
543 | * may be over or under-optimistic, matching the real user and system | ||
544 | * cputime with a variable precision. | ||
545 | * | ||
546 | * Fix this by scaling these tick based values against the total | ||
547 | * runtime accounted by the CFS scheduler. | ||
487 | */ | 548 | */ |
488 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 549 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); |
489 | 550 | ||
490 | if (total) | 551 | if (total) |
491 | utime = scale_utime(utime, rtime, total); | 552 | utime = scale_utime(utime, rtime, total); |
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
493 | utime = rtime; | 554 | utime = rtime; |
494 | 555 | ||
495 | /* | 556 | /* |
496 | * Compare with previous values, to keep monotonicity: | 557 | * If the tick based count grows faster than the scheduler one, |
558 | * the result of the scaling may go backward. | ||
559 | * Let's enforce monotonicity. | ||
497 | */ | 560 | */ |
498 | p->prev_utime = max(p->prev_utime, utime); | 561 | prev->utime = max(prev->utime, utime); |
499 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | 562 | prev->stime = max(prev->stime, rtime - prev->utime); |
500 | 563 | ||
501 | *ut = p->prev_utime; | 564 | *ut = prev->utime; |
502 | *st = p->prev_stime; | 565 | *st = prev->stime; |
566 | } | ||
567 | |||
568 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
569 | { | ||
570 | struct task_cputime cputime = { | ||
571 | .utime = p->utime, | ||
572 | .stime = p->stime, | ||
573 | .sum_exec_runtime = p->se.sum_exec_runtime, | ||
574 | }; | ||
575 | |||
576 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | ||
503 | } | 577 | } |
504 | 578 | ||
505 | /* | 579 | /* |
506 | * Must be called with siglock held. | 580 | * Must be called with siglock held. |
507 | */ | 581 | */ |
508 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 582 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
509 | { | 583 | { |
510 | struct signal_struct *sig = p->signal; | ||
511 | struct task_cputime cputime; | 584 | struct task_cputime cputime; |
512 | cputime_t rtime, utime, total; | ||
513 | 585 | ||
514 | thread_group_cputime(p, &cputime); | 586 | thread_group_cputime(p, &cputime); |
515 | 587 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); | |
516 | total = cputime.utime + cputime.stime; | ||
517 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
518 | |||
519 | if (total) | ||
520 | utime = scale_utime(cputime.utime, rtime, total); | ||
521 | else | ||
522 | utime = rtime; | ||
523 | |||
524 | sig->prev_utime = max(sig->prev_utime, utime); | ||
525 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
526 | |||
527 | *ut = sig->prev_utime; | ||
528 | *st = sig->prev_stime; | ||
529 | } | 588 | } |
530 | #endif | 589 | #endif |
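Note: the tick-based versus scheduler-runtime reconciliation that used to be duplicated in task_times() and thread_group_times() is now a single cputime_adjust() helper: it scales the tick-based user share onto CFS's precise sum_exec_runtime and clamps against the cached prev values so the reported times never move backwards. A standalone illustration of the arithmetic using plain integers (not kernel code; names and types are simplified):

#include <stdio.h>

struct prev_cputime { unsigned long long utime, stime; };

static void cputime_adjust_demo(unsigned long long utime, unsigned long long stime,
				unsigned long long rtime, struct prev_cputime *prev,
				unsigned long long *ut, unsigned long long *st)
{
	unsigned long long total = utime + stime;

	/* Scale the tick-based user share onto the precise runtime. */
	if (total)
		utime = rtime * utime / total;
	else
		utime = rtime;

	/* Enforce monotonicity against previously reported values. */
	if (utime > prev->utime)
		prev->utime = utime;
	if (rtime - prev->utime > prev->stime)
		prev->stime = rtime - prev->utime;

	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_cputime prev = { 0, 0 };
	unsigned long long ut, st;

	cputime_adjust_demo(300, 100, 500, &prev, &ut, &st);
	printf("ut=%llu st=%llu\n", ut, st);	/* 375 and 125: the 3:1 tick ratio scaled to 500 */
	return 0;
}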
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) | 61 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
62 | { | 62 | { |
63 | struct sched_entity *se = tg->se[cpu]; | 63 | struct sched_entity *se = tg->se[cpu]; |
64 | if (!se) | ||
65 | return; | ||
66 | 64 | ||
67 | #define P(F) \ | 65 | #define P(F) \ |
68 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 66 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
69 | #define PN(F) \ | 67 | #define PN(F) \ |
70 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 68 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
71 | 69 | ||
70 | if (!se) { | ||
71 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | ||
72 | P(avg->runnable_avg_sum); | ||
73 | P(avg->runnable_avg_period); | ||
74 | return; | ||
75 | } | ||
76 | |||
77 | |||
72 | PN(se->exec_start); | 78 | PN(se->exec_start); |
73 | PN(se->vruntime); | 79 | PN(se->vruntime); |
74 | PN(se->sum_exec_runtime); | 80 | PN(se->sum_exec_runtime); |
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
85 | P(se->statistics.wait_count); | 91 | P(se->statistics.wait_count); |
86 | #endif | 92 | #endif |
87 | P(se->load.weight); | 93 | P(se->load.weight); |
94 | #ifdef CONFIG_SMP | ||
95 | P(se->avg.runnable_avg_sum); | ||
96 | P(se->avg.runnable_avg_period); | ||
97 | P(se->avg.load_avg_contrib); | ||
98 | P(se->avg.decay_count); | ||
99 | #endif | ||
88 | #undef PN | 100 | #undef PN |
89 | #undef P | 101 | #undef P |
90 | } | 102 | } |
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 218 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 219 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 220 | #ifdef CONFIG_SMP |
209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", | 221 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", |
210 | SPLIT_NS(cfs_rq->load_avg)); | 222 | cfs_rq->runnable_load_avg); |
211 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | 223 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", |
212 | SPLIT_NS(cfs_rq->load_period)); | 224 | cfs_rq->blocked_load_avg); |
213 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | 225 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", |
214 | cfs_rq->load_contribution); | 226 | atomic64_read(&cfs_rq->tg->load_avg)); |
215 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | 227 | SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", |
216 | atomic_read(&cfs_rq->tg->load_weight)); | 228 | cfs_rq->tg_load_contrib); |
229 | SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", | ||
230 | cfs_rq->tg_runnable_contrib); | ||
231 | SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", | ||
232 | atomic_read(&cfs_rq->tg->runnable_avg)); | ||
217 | #endif | 233 | #endif |
218 | 234 | ||
219 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 235 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..4603d6cb9e25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -26,6 +26,9 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 28 | #include <linux/interrupt.h> |
29 | #include <linux/mempolicy.h> | ||
30 | #include <linux/migrate.h> | ||
31 | #include <linux/task_work.h> | ||
29 | 32 | ||
30 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
31 | 34 | ||
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
259 | return grp->my_q; | 262 | return grp->my_q; |
260 | } | 263 | } |
261 | 264 | ||
265 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
266 | int force_update); | ||
267 | |||
262 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 268 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
263 | { | 269 | { |
264 | if (!cfs_rq->on_list) { | 270 | if (!cfs_rq->on_list) { |
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
278 | } | 284 | } |
279 | 285 | ||
280 | cfs_rq->on_list = 1; | 286 | cfs_rq->on_list = 1; |
287 | /* We should have no load, but we need to update last_decay. */ | ||
288 | update_cfs_rq_blocked_load(cfs_rq, 0); | ||
281 | } | 289 | } |
282 | } | 290 | } |
283 | 291 | ||
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 661 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
654 | } | 662 | } |
655 | 663 | ||
656 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
657 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | ||
658 | |||
659 | /* | 664 | /* |
660 | * Update the current task's runtime statistics. Skip current tasks that | 665 | * Update the current task's runtime statistics. Skip current tasks that |
661 | * are not in our scheduling class. | 666 | * are not in our scheduling class. |
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
675 | 680 | ||
676 | curr->vruntime += delta_exec_weighted; | 681 | curr->vruntime += delta_exec_weighted; |
677 | update_min_vruntime(cfs_rq); | 682 | update_min_vruntime(cfs_rq); |
678 | |||
679 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
680 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
681 | #endif | ||
682 | } | 683 | } |
683 | 684 | ||
684 | static void update_curr(struct cfs_rq *cfs_rq) | 685 | static void update_curr(struct cfs_rq *cfs_rq) |
@@ -776,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | * Scheduling class queueing methods: | 777 | * Scheduling class queueing methods: |
777 | */ | 778 | */ |
778 | 779 | ||
780 | #ifdef CONFIG_NUMA_BALANCING | ||
781 | /* | ||
782 | * numa task sample period in ms | ||
783 | */ | ||
784 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | ||
785 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | ||
786 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
787 | |||
788 | /* Portion of address space to scan in MB */ | ||
789 | unsigned int sysctl_numa_balancing_scan_size = 256; | ||
790 | |||
791 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | ||
792 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | ||
793 | |||
794 | static void task_numa_placement(struct task_struct *p) | ||
795 | { | ||
796 | int seq = ACCESS_ONCE(p->mm->numa_scan_seq); | ||
797 | |||
798 | if (p->numa_scan_seq == seq) | ||
799 | return; | ||
800 | p->numa_scan_seq = seq; | ||
801 | |||
802 | /* FIXME: Scheduling placement policy hints go here */ | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Got a PROT_NONE fault for a page on @node. | ||
807 | */ | ||
808 | void task_numa_fault(int node, int pages, bool migrated) | ||
809 | { | ||
810 | struct task_struct *p = current; | ||
811 | |||
812 | if (!sched_feat_numa(NUMA)) | ||
813 | return; | ||
814 | |||
815 | /* FIXME: Allocate task-specific structure for placement policy here */ | ||
816 | |||
817 | /* | ||
818 | * If pages are properly placed (did not migrate) then scan slower. | ||
819 | * This is reset periodically in case of phase changes | ||
820 | */ | ||
821 | if (!migrated) | ||
822 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | ||
823 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
824 | |||
825 | task_numa_placement(p); | ||
826 | } | ||
827 | |||
828 | static void reset_ptenuma_scan(struct task_struct *p) | ||
829 | { | ||
830 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | ||
831 | p->mm->numa_scan_offset = 0; | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * The expensive part of numa migration is done from task_work context. | ||
836 | * Triggered from task_tick_numa(). | ||
837 | */ | ||
838 | void task_numa_work(struct callback_head *work) | ||
839 | { | ||
840 | unsigned long migrate, next_scan, now = jiffies; | ||
841 | struct task_struct *p = current; | ||
842 | struct mm_struct *mm = p->mm; | ||
843 | struct vm_area_struct *vma; | ||
844 | unsigned long start, end; | ||
845 | long pages; | ||
846 | |||
847 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | ||
848 | |||
849 | work->next = work; /* protect against double add */ | ||
850 | /* | ||
851 | * Who cares about NUMA placement when they're dying. | ||
852 | * | ||
853 | * NOTE: make sure not to dereference p->mm before this check, | ||
854 | * exit_task_work() happens _after_ exit_mm() so we could be called | ||
855 | * without p->mm even though we still had it when we enqueued this | ||
856 | * work. | ||
857 | */ | ||
858 | if (p->flags & PF_EXITING) | ||
859 | return; | ||
860 | |||
861 | /* | ||
862 | * We do not care about task placement until a task runs on a node | ||
863 | * other than the first one used by the address space. This is | ||
864 | * largely because migrations are driven by what CPU the task | ||
865 | * is running on. If it's never scheduled on another node, it'll | ||
866 | * not migrate so why bother trapping the fault. | ||
867 | */ | ||
868 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
869 | mm->first_nid = numa_node_id(); | ||
870 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
871 | /* Are we running on a new node yet? */ | ||
872 | if (numa_node_id() == mm->first_nid && | ||
873 | !sched_feat_numa(NUMA_FORCE)) | ||
874 | return; | ||
875 | |||
876 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
877 | } | ||
878 | |||
879 | /* | ||
880 | * Reset the scan period if enough time has gone by. Objective is that | ||
881 | * scanning will be reduced if pages are properly placed. As tasks | ||
882 | * can enter different phases this needs to be re-examined. Lacking | ||
883 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
884 | */ | ||
885 | migrate = mm->numa_next_reset; | ||
886 | if (time_after(now, migrate)) { | ||
887 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
888 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
889 | xchg(&mm->numa_next_reset, next_scan); | ||
890 | } | ||
891 | |||
892 | /* | ||
893 | * Enforce maximal scan/migration frequency.. | ||
894 | */ | ||
895 | migrate = mm->numa_next_scan; | ||
896 | if (time_before(now, migrate)) | ||
897 | return; | ||
898 | |||
899 | if (p->numa_scan_period == 0) | ||
900 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
901 | |||
902 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | ||
903 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | ||
904 | return; | ||
905 | |||
906 | /* | ||
907 | * Do not set pte_numa if the current running node is rate-limited. | ||
908 | * This loses statistics on the fault but if we are unwilling to | ||
909 | * migrate to this node, it is less likely we can do useful work | ||
910 | */ | ||
911 | if (migrate_ratelimited(numa_node_id())) | ||
912 | return; | ||
913 | |||
914 | start = mm->numa_scan_offset; | ||
915 | pages = sysctl_numa_balancing_scan_size; | ||
916 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | ||
917 | if (!pages) | ||
918 | return; | ||
919 | |||
920 | down_read(&mm->mmap_sem); | ||
921 | vma = find_vma(mm, start); | ||
922 | if (!vma) { | ||
923 | reset_ptenuma_scan(p); | ||
924 | start = 0; | ||
925 | vma = mm->mmap; | ||
926 | } | ||
927 | for (; vma; vma = vma->vm_next) { | ||
928 | if (!vma_migratable(vma)) | ||
929 | continue; | ||
930 | |||
931 | /* Skip small VMAs. They are not likely to be of relevance */ | ||
932 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | ||
933 | continue; | ||
934 | |||
935 | do { | ||
936 | start = max(start, vma->vm_start); | ||
937 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | ||
938 | end = min(end, vma->vm_end); | ||
939 | pages -= change_prot_numa(vma, start, end); | ||
940 | |||
941 | start = end; | ||
942 | if (pages <= 0) | ||
943 | goto out; | ||
944 | } while (end != vma->vm_end); | ||
945 | } | ||
946 | |||
947 | out: | ||
948 | /* | ||
949 | * It is possible to reach the end of the VMA list but the last few VMAs are | ||
950 | * not guaranteed to the vma_migratable. If they are not, we would find the | ||
951 | * !migratable VMA on the next scan but not reset the scanner to the start | ||
952 | * so check it now. | ||
953 | */ | ||
954 | if (vma) | ||
955 | mm->numa_scan_offset = start; | ||
956 | else | ||
957 | reset_ptenuma_scan(p); | ||
958 | up_read(&mm->mmap_sem); | ||
959 | } | ||
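The scan budget above is specified in megabytes and converted to base pages by `pages <<= 20 - PAGE_SHIFT`, then consumed in HPAGE_SIZE-aligned chunks per VMA. A standalone userspace sketch of that arithmetic (not kernel code; the 4 KiB page size, 2 MiB huge page size and 256 MB scan size are assumptions matching common x86 defaults):

```c
/* Sketch only: mirrors the MB-to-pages shift used by task_numa_work(). */
#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4 KiB base pages */
#define HPAGE_SHIFT	21	/* assumed 2 MiB huge pages */

int main(void)
{
	unsigned long scan_size_mb = 256;			/* assumed sysctl value, in MB */
	long pages = scan_size_mb << (20 - PAGE_SHIFT);		/* MB -> base pages */

	printf("%lu MB = %ld pages = %ld huge-page-aligned chunks\n",
	       scan_size_mb, pages, (pages << PAGE_SHIFT) >> HPAGE_SHIFT);
	return 0;	/* prints: 256 MB = 65536 pages = 128 huge-page-aligned chunks */
}
```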
960 | |||
961 | /* | ||
962 | * Drive the periodic memory faults.. | ||
963 | */ | ||
964 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
965 | { | ||
966 | struct callback_head *work = &curr->numa_work; | ||
967 | u64 period, now; | ||
968 | |||
969 | /* | ||
970 | * We don't care about NUMA placement if we don't have memory. | ||
971 | */ | ||
972 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | ||
973 | return; | ||
974 | |||
975 | /* | ||
976 | * Using runtime rather than walltime has the dual advantage that | ||
977 | * we (mostly) drive the selection from busy threads and that the | ||
978 | * task needs to have done some actual work before we bother with | ||
979 | * NUMA placement. | ||
980 | */ | ||
981 | now = curr->se.sum_exec_runtime; | ||
982 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | ||
983 | |||
984 | if (now - curr->node_stamp > period) { | ||
985 | if (!curr->node_stamp) | ||
986 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
987 | curr->node_stamp = now; | ||
988 | |||
989 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | ||
990 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | ||
991 | task_work_add(curr, work, true); | ||
992 | } | ||
993 | } | ||
994 | } | ||
995 | #else | ||
996 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
997 | { | ||
998 | } | ||
999 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1000 | |||
779 | static void | 1001 | static void |
780 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1002 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
781 | { | 1003 | { |
@@ -801,72 +1023,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
801 | } | 1023 | } |
802 | 1024 | ||
803 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1025 | #ifdef CONFIG_FAIR_GROUP_SCHED |
804 | /* we need this in update_cfs_load and load-balance functions below */ | ||
805 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
806 | # ifdef CONFIG_SMP | 1026 | # ifdef CONFIG_SMP |
807 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
808 | int global_update) | ||
809 | { | ||
810 | struct task_group *tg = cfs_rq->tg; | ||
811 | long load_avg; | ||
812 | |||
813 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
814 | load_avg -= cfs_rq->load_contribution; | ||
815 | |||
816 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
817 | atomic_add(load_avg, &tg->load_weight); | ||
818 | cfs_rq->load_contribution += load_avg; | ||
819 | } | ||
820 | } | ||
821 | |||
822 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
823 | { | ||
824 | u64 period = sysctl_sched_shares_window; | ||
825 | u64 now, delta; | ||
826 | unsigned long load = cfs_rq->load.weight; | ||
827 | |||
828 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) | ||
829 | return; | ||
830 | |||
831 | now = rq_of(cfs_rq)->clock_task; | ||
832 | delta = now - cfs_rq->load_stamp; | ||
833 | |||
834 | /* truncate load history at 4 idle periods */ | ||
835 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
836 | now - cfs_rq->load_last > 4 * period) { | ||
837 | cfs_rq->load_period = 0; | ||
838 | cfs_rq->load_avg = 0; | ||
839 | delta = period - 1; | ||
840 | } | ||
841 | |||
842 | cfs_rq->load_stamp = now; | ||
843 | cfs_rq->load_unacc_exec_time = 0; | ||
844 | cfs_rq->load_period += delta; | ||
845 | if (load) { | ||
846 | cfs_rq->load_last = now; | ||
847 | cfs_rq->load_avg += delta * load; | ||
848 | } | ||
849 | |||
850 | /* consider updating load contribution on each fold or truncate */ | ||
851 | if (global_update || cfs_rq->load_period > period | ||
852 | || !cfs_rq->load_period) | ||
853 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
854 | |||
855 | while (cfs_rq->load_period > period) { | ||
856 | /* | ||
857 | * Inline assembly required to prevent the compiler | ||
858 | * optimising this loop into a divmod call. | ||
859 | * See __iter_div_u64_rem() for another example of this. | ||
860 | */ | ||
861 | asm("" : "+rm" (cfs_rq->load_period)); | ||
862 | cfs_rq->load_period /= 2; | ||
863 | cfs_rq->load_avg /= 2; | ||
864 | } | ||
865 | |||
866 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
867 | list_del_leaf_cfs_rq(cfs_rq); | ||
868 | } | ||
869 | |||
870 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | 1027 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) |
871 | { | 1028 | { |
872 | long tg_weight; | 1029 | long tg_weight; |
@@ -876,8 +1033,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
876 | * to gain a more accurate current total weight. See | 1033 | * to gain a more accurate current total weight. See |
877 | * update_cfs_rq_load_contribution(). | 1034 | * update_cfs_rq_load_contribution(). |
878 | */ | 1035 | */ |
879 | tg_weight = atomic_read(&tg->load_weight); | 1036 | tg_weight = atomic64_read(&tg->load_avg); |
880 | tg_weight -= cfs_rq->load_contribution; | 1037 | tg_weight -= cfs_rq->tg_load_contrib; |
881 | tg_weight += cfs_rq->load.weight; | 1038 | tg_weight += cfs_rq->load.weight; |
882 | 1039 | ||
883 | return tg_weight; | 1040 | return tg_weight; |
@@ -901,27 +1058,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
901 | 1058 | ||
902 | return shares; | 1059 | return shares; |
903 | } | 1060 | } |
904 | |||
905 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
906 | { | ||
907 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
908 | update_cfs_load(cfs_rq, 0); | ||
909 | update_cfs_shares(cfs_rq); | ||
910 | } | ||
911 | } | ||
912 | # else /* CONFIG_SMP */ | 1061 | # else /* CONFIG_SMP */ |
913 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
914 | { | ||
915 | } | ||
916 | |||
917 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 1062 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
918 | { | 1063 | { |
919 | return tg->shares; | 1064 | return tg->shares; |
920 | } | 1065 | } |
921 | |||
922 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
923 | { | ||
924 | } | ||
925 | # endif /* CONFIG_SMP */ | 1066 | # endif /* CONFIG_SMP */ |
926 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 1067 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
927 | unsigned long weight) | 1068 | unsigned long weight) |
@@ -939,6 +1080,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
939 | account_entity_enqueue(cfs_rq, se); | 1080 | account_entity_enqueue(cfs_rq, se); |
940 | } | 1081 | } |
941 | 1082 | ||
1083 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
1084 | |||
942 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 1085 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
943 | { | 1086 | { |
944 | struct task_group *tg; | 1087 | struct task_group *tg; |
@@ -958,18 +1101,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
958 | reweight_entity(cfs_rq_of(se), se, shares); | 1101 | reweight_entity(cfs_rq_of(se), se, shares); |
959 | } | 1102 | } |
960 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 1103 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
961 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | 1104 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
962 | { | 1105 | { |
963 | } | 1106 | } |
1107 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
964 | 1108 | ||
965 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 1109 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ |
1110 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | ||
1111 | /* | ||
1112 | * We choose a half-life close to 1 scheduling period. | ||
1113 | * Note: The tables below are dependent on this value. | ||
1114 | */ | ||
1115 | #define LOAD_AVG_PERIOD 32 | ||
1116 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
1117 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ | ||
1118 | |||
1119 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | ||
1120 | static const u32 runnable_avg_yN_inv[] = { | ||
1121 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | ||
1122 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | ||
1123 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | ||
1124 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | ||
1125 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | ||
1126 | 0x85aac367, 0x82cd8698, | ||
1127 | }; | ||
1128 | |||
1129 | /* | ||
1130 | * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent | ||
1131 | * over-estimates when re-combining. | ||
1132 | */ | ||
1133 | static const u32 runnable_avg_yN_sum[] = { | ||
1134 | 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, | ||
1135 | 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, | ||
1136 | 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, | ||
1137 | }; | ||
1138 | |||
1139 | /* | ||
1140 | * Approximate: | ||
1141 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | ||
1142 | */ | ||
1143 | static __always_inline u64 decay_load(u64 val, u64 n) | ||
1144 | { | ||
1145 | unsigned int local_n; | ||
1146 | |||
1147 | if (!n) | ||
1148 | return val; | ||
1149 | else if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
1150 | return 0; | ||
1151 | |||
1152 | /* after bounds checking we can collapse to 32-bit */ | ||
1153 | local_n = n; | ||
1154 | |||
1155 | /* | ||
1156 | * As y^PERIOD = 1/2, we can combine | ||
1157 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) | ||
1158 | * With a look-up table which covers y^n (n < PERIOD) | ||
1159 | * | ||
1160 | * To achieve constant time decay_load. | ||
1161 | */ | ||
1162 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | ||
1163 | val >>= local_n / LOAD_AVG_PERIOD; | ||
1164 | local_n %= LOAD_AVG_PERIOD; | ||
1165 | } | ||
1166 | |||
1167 | val *= runnable_avg_yN_inv[local_n]; | ||
1168 | /* We don't use SRR here since we always want to round down. */ | ||
1169 | return val >> 32; | ||
1170 | } | ||
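decay_load() reduces n to whole 32-period halvings (a right shift) plus one table lookup for the remainder. As a quick check that the fixed-point multiply-and-shift tracks val * 0.5^(n/32), here is a standalone userspace sketch (not kernel code) that reuses the inverse table from above and compares against libm; the table values round down, as the comment says:

```c
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32

/* Copied from runnable_avg_yN_inv[] above: 2^32 * y^n for n = 0..31. */
static const uint32_t yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/* Same shape as the kernel's decay_load(): shift for whole periods, table for the rest. */
static uint64_t decay(uint64_t val, uint64_t n)
{
	if (!n)
		return val;
	if (n > LOAD_AVG_PERIOD * 63)
		return 0;
	val >>= n / LOAD_AVG_PERIOD;
	n %= LOAD_AVG_PERIOD;
	return (val * yN_inv[n]) >> 32;
}

int main(void)
{
	uint64_t val = 47742;	/* LOAD_AVG_MAX */

	for (unsigned int n = 0; n <= 96; n += 8)
		printf("n=%2u  table=%llu  exact=%.1f\n", n,
		       (unsigned long long)decay(val, n), val * pow(0.5, n / 32.0));
	return 0;
}
```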
1171 | |||
1172 | /* | ||
1173 | * For updates fully spanning n periods, the contribution to runnable | ||
1174 | * average will be: \Sum 1024*y^n | ||
1175 | * | ||
1176 | * We can compute this reasonably efficiently by combining: | ||
1177 | * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} | ||
1178 | */ | ||
1179 | static u32 __compute_runnable_contrib(u64 n) | ||
966 | { | 1180 | { |
1181 | u32 contrib = 0; | ||
1182 | |||
1183 | if (likely(n <= LOAD_AVG_PERIOD)) | ||
1184 | return runnable_avg_yN_sum[n]; | ||
1185 | else if (unlikely(n >= LOAD_AVG_MAX_N)) | ||
1186 | return LOAD_AVG_MAX; | ||
1187 | |||
1188 | /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */ | ||
1189 | do { | ||
1190 | contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ | ||
1191 | contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; | ||
1192 | |||
1193 | n -= LOAD_AVG_PERIOD; | ||
1194 | } while (n > LOAD_AVG_PERIOD); | ||
1195 | |||
1196 | contrib = decay_load(contrib, n); | ||
1197 | return contrib + runnable_avg_yN_sum[n]; | ||
1198 | } | ||
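A back-of-envelope check on the constants used here (my arithmetic, not taken from the patch): with $y$ fixed by $y^{32} = 1/2$, i.e. $y = 2^{-1/32} \approx 0.97857$, the fully saturated series is bounded by

$$\sum_{k=0}^{\infty} 1024\,y^{k} \;=\; \frac{1024}{1-y} \;\approx\; 47788,$$

and the floored look-up tables saturate a little below that, at LOAD_AVG_MAX = 47742, which is reached after LOAD_AVG_MAX_N = 345 full periods.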
1199 | |||
1200 | /* | ||
1201 | * We can represent the historical contribution to runnable average as the | ||
1202 | * coefficients of a geometric series. To do this we sub-divide our runnable | ||
1203 | * history into segments of approximately 1ms (1024us); label the segment that | ||
1204 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | ||
1205 | * | ||
1206 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | ||
1207 | * p0 p1 p2 | ||
1208 | * (now) (~1ms ago) (~2ms ago) | ||
1209 | * | ||
1210 | * Let u_i denote the fraction of p_i that the entity was runnable. | ||
1211 | * | ||
1212 | * We then designate the fractions u_i as our co-efficients, yielding the | ||
1213 | * following representation of historical load: | ||
1214 | * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | ||
1215 | * | ||
1216 | * We choose y based on the width of a reasonable scheduling period, fixing: | ||
1217 | * y^32 = 0.5 | ||
1218 | * | ||
1219 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | ||
1220 | * approximately half as much as the contribution to load within the last ms | ||
1221 | * (u_0). | ||
1222 | * | ||
1223 | * When a period "rolls over" and we have new u_0`, multiplying the previous | ||
1224 | * sum again by y is sufficient to update: | ||
1225 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | ||
1226 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | ||
1227 | */ | ||
1228 | static __always_inline int __update_entity_runnable_avg(u64 now, | ||
1229 | struct sched_avg *sa, | ||
1230 | int runnable) | ||
1231 | { | ||
1232 | u64 delta, periods; | ||
1233 | u32 runnable_contrib; | ||
1234 | int delta_w, decayed = 0; | ||
1235 | |||
1236 | delta = now - sa->last_runnable_update; | ||
1237 | /* | ||
1238 | * This should only happen when time goes backwards, which it | ||
1239 | * unfortunately does during sched clock init when we swap over to TSC. | ||
1240 | */ | ||
1241 | if ((s64)delta < 0) { | ||
1242 | sa->last_runnable_update = now; | ||
1243 | return 0; | ||
1244 | } | ||
1245 | |||
1246 | /* | ||
1247 | * Use 1024ns as the unit of measurement since it's a reasonable | ||
1248 | * approximation of 1us and fast to compute. | ||
1249 | */ | ||
1250 | delta >>= 10; | ||
1251 | if (!delta) | ||
1252 | return 0; | ||
1253 | sa->last_runnable_update = now; | ||
1254 | |||
1255 | /* delta_w is the amount already accumulated against our next period */ | ||
1256 | delta_w = sa->runnable_avg_period % 1024; | ||
1257 | if (delta + delta_w >= 1024) { | ||
1258 | /* period roll-over */ | ||
1259 | decayed = 1; | ||
1260 | |||
1261 | /* | ||
1262 | * Now that we know we're crossing a period boundary, figure | ||
1263 | * out how much from delta we need to complete the current | ||
1264 | * period and accrue it. | ||
1265 | */ | ||
1266 | delta_w = 1024 - delta_w; | ||
1267 | if (runnable) | ||
1268 | sa->runnable_avg_sum += delta_w; | ||
1269 | sa->runnable_avg_period += delta_w; | ||
1270 | |||
1271 | delta -= delta_w; | ||
1272 | |||
1273 | /* Figure out how many additional periods this update spans */ | ||
1274 | periods = delta / 1024; | ||
1275 | delta %= 1024; | ||
1276 | |||
1277 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | ||
1278 | periods + 1); | ||
1279 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | ||
1280 | periods + 1); | ||
1281 | |||
1282 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | ||
1283 | runnable_contrib = __compute_runnable_contrib(periods); | ||
1284 | if (runnable) | ||
1285 | sa->runnable_avg_sum += runnable_contrib; | ||
1286 | sa->runnable_avg_period += runnable_contrib; | ||
1287 | } | ||
1288 | |||
1289 | /* Remainder of delta accrued against u_0` */ | ||
1290 | if (runnable) | ||
1291 | sa->runnable_avg_sum += delta; | ||
1292 | sa->runnable_avg_period += delta; | ||
1293 | |||
1294 | return decayed; | ||
967 | } | 1295 | } |
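To make the period roll-over concrete, here is a simplified floating-point model (a sketch, not the kernel's fixed-point code: it updates once per whole 1024 µs period and ignores the delta_w partial-period accounting). A task that is runnable 1 ms out of every 4 ms sees its tracked ratio settle near 25%:

```c
#include <stdio.h>
#include <math.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5, as above */
	double sum = 0.0, period = 0.0;

	/* 4 ms cycle: 1 ms runnable, 3 ms blocked, for ~2 seconds. */
	for (int ms = 0; ms < 2000; ms++) {
		int runnable = (ms % 4) == 0;

		sum = sum * y + (runnable ? 1024.0 : 0.0);	/* decay history, add u_0 */
		period = period * y + 1024.0;
		if (ms % 500 == 499)
			printf("t=%4d ms  runnable fraction=%.3f\n", ms + 1, sum / period);
	}
	return 0;	/* the fraction settles near 0.25 */
}
```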
968 | 1296 | ||
969 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | 1297 | /* Synchronize an entity's decay with its parenting cfs_rq.*/ |
1298 | static inline u64 __synchronize_entity_decay(struct sched_entity *se) | ||
970 | { | 1299 | { |
1300 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1301 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | ||
1302 | |||
1303 | decays -= se->avg.decay_count; | ||
1304 | if (!decays) | ||
1305 | return 0; | ||
1306 | |||
1307 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | ||
1308 | se->avg.decay_count = 0; | ||
1309 | |||
1310 | return decays; | ||
1311 | } | ||
1312 | |||
1313 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1314 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1315 | int force_update) | ||
1316 | { | ||
1317 | struct task_group *tg = cfs_rq->tg; | ||
1318 | s64 tg_contrib; | ||
1319 | |||
1320 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | ||
1321 | tg_contrib -= cfs_rq->tg_load_contrib; | ||
1322 | |||
1323 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | ||
1324 | atomic64_add(tg_contrib, &tg->load_avg); | ||
1325 | cfs_rq->tg_load_contrib += tg_contrib; | ||
1326 | } | ||
1327 | } | ||
1328 | |||
1329 | /* | ||
1330 | * Aggregate cfs_rq runnable averages into an equivalent task_group | ||
1331 | * representation for computing load contributions. | ||
1332 | */ | ||
1333 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1334 | struct cfs_rq *cfs_rq) | ||
1335 | { | ||
1336 | struct task_group *tg = cfs_rq->tg; | ||
1337 | long contrib; | ||
1338 | |||
1339 | /* The fraction of a cpu used by this cfs_rq */ | ||
1340 | contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, | ||
1341 | sa->runnable_avg_period + 1); | ||
1342 | contrib -= cfs_rq->tg_runnable_contrib; | ||
1343 | |||
1344 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | ||
1345 | atomic_add(contrib, &tg->runnable_avg); | ||
1346 | cfs_rq->tg_runnable_contrib += contrib; | ||
1347 | } | ||
1348 | } | ||
1349 | |||
1350 | static inline void __update_group_entity_contrib(struct sched_entity *se) | ||
1351 | { | ||
1352 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
1353 | struct task_group *tg = cfs_rq->tg; | ||
1354 | int runnable_avg; | ||
1355 | |||
1356 | u64 contrib; | ||
1357 | |||
1358 | contrib = cfs_rq->tg_load_contrib * tg->shares; | ||
1359 | se->avg.load_avg_contrib = div64_u64(contrib, | ||
1360 | atomic64_read(&tg->load_avg) + 1); | ||
1361 | |||
1362 | /* | ||
1363 | * For group entities we need to compute a correction term in the case | ||
1364 | * that they are consuming <1 cpu so that we would contribute the same | ||
1365 | * load as a task of equal weight. | ||
1366 | * | ||
1367 | * Explicitly co-ordinating this measurement would be expensive, but | ||
1368 | * fortunately the sum of each cpu's contribution forms a usable | ||
1369 | * lower-bound on the true value. | ||
1370 | * | ||
1371 | * Consider the aggregate of 2 contributions. Either they are disjoint | ||
1372 | * (and the sum represents the true value) or they overlap and we are | ||
1373 | * understating by the aggregate of their overlap. | ||
1374 | * | ||
1375 | * Extending this to N cpus, for a given overlap, the maximum amount we | ||
1376 | * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of | ||
1377 | * cpus that overlap for this interval and w_i is the interval width. | ||
1378 | * | ||
1379 | * On a small machine, the first term is well-bounded, which bounds the | ||
1380 | * total error since w_i is a subset of the period. Whereas on a | ||
1381 | * larger machine, while this first term can be larger, any w_i of | ||
1382 | * consequential size is guaranteed to see n_i*w_i quickly converge to | ||
1383 | * our upper bound of 1-cpu. | ||
1384 | */ | ||
1385 | runnable_avg = atomic_read(&tg->runnable_avg); | ||
1386 | if (runnable_avg < NICE_0_LOAD) { | ||
1387 | se->avg.load_avg_contrib *= runnable_avg; | ||
1388 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | ||
1389 | } | ||
1390 | } | ||
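As a worked example of the <1 cpu correction above (my numbers; they assume the default resolution where NICE_0_LOAD = 1024 and NICE_0_SHIFT = 10): a group whose members together average a quarter of one cpu of runnable time has runnable_avg of roughly 256, so its group entity's contribution is scaled by

$$\frac{\text{runnable\_avg}}{2^{\text{NICE\_0\_SHIFT}}} = \frac{256}{1024} = \frac{1}{4},$$

mirroring how a task of equal weight that is runnable a quarter of the time contributes roughly a quarter of its weight.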
1391 | #else | ||
1392 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | ||
1393 | int force_update) {} | ||
1394 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | ||
1395 | struct cfs_rq *cfs_rq) {} | ||
1396 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | ||
1397 | #endif | ||
1398 | |||
1399 | static inline void __update_task_entity_contrib(struct sched_entity *se) | ||
1400 | { | ||
1401 | u32 contrib; | ||
1402 | |||
1403 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
1404 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | ||
1405 | contrib /= (se->avg.runnable_avg_period + 1); | ||
1406 | se->avg.load_avg_contrib = scale_load(contrib); | ||
971 | } | 1407 | }
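Plugging numbers into the task formula above (my arithmetic; assumes the default scale where a nice-0 weight is 1024):

$$\text{load\_avg\_contrib} \;\approx\; w \cdot \frac{\text{runnable\_avg\_sum}}{\text{runnable\_avg\_period} + 1},$$

so a nice-0 task that has been runnable the whole time contributes about 1024, and one runnable half the time contributes about 512.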
972 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 1408 | |
1409 | /* Compute the current contribution to load_avg by se, return any delta */ | ||
1410 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | ||
1411 | { | ||
1412 | long old_contrib = se->avg.load_avg_contrib; | ||
1413 | |||
1414 | if (entity_is_task(se)) { | ||
1415 | __update_task_entity_contrib(se); | ||
1416 | } else { | ||
1417 | __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); | ||
1418 | __update_group_entity_contrib(se); | ||
1419 | } | ||
1420 | |||
1421 | return se->avg.load_avg_contrib - old_contrib; | ||
1422 | } | ||
1423 | |||
1424 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | ||
1425 | long load_contrib) | ||
1426 | { | ||
1427 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | ||
1428 | cfs_rq->blocked_load_avg -= load_contrib; | ||
1429 | else | ||
1430 | cfs_rq->blocked_load_avg = 0; | ||
1431 | } | ||
1432 | |||
1433 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
1434 | |||
1435 | /* Update a sched_entity's runnable average */ | ||
1436 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1437 | int update_cfs_rq) | ||
1438 | { | ||
1439 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1440 | long contrib_delta; | ||
1441 | u64 now; | ||
1442 | |||
1443 | /* | ||
1444 | * For a group entity we need to use the clock of its owned cfs_rq, in | ||
1445 | * case it is the parent of a throttled hierarchy. | ||
1446 | */ | ||
1447 | if (entity_is_task(se)) | ||
1448 | now = cfs_rq_clock_task(cfs_rq); | ||
1449 | else | ||
1450 | now = cfs_rq_clock_task(group_cfs_rq(se)); | ||
1451 | |||
1452 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | ||
1453 | return; | ||
1454 | |||
1455 | contrib_delta = __update_entity_load_avg_contrib(se); | ||
1456 | |||
1457 | if (!update_cfs_rq) | ||
1458 | return; | ||
1459 | |||
1460 | if (se->on_rq) | ||
1461 | cfs_rq->runnable_load_avg += contrib_delta; | ||
1462 | else | ||
1463 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | ||
1464 | } | ||
1465 | |||
1466 | /* | ||
1467 | * Decay the load contributed by all blocked children and account this so that | ||
1468 | * their contribution may appropriately discounted when they wake up. | ||
1469 | */ | ||
1470 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | ||
1471 | { | ||
1472 | u64 now = cfs_rq_clock_task(cfs_rq) >> 20; | ||
1473 | u64 decays; | ||
1474 | |||
1475 | decays = now - cfs_rq->last_decay; | ||
1476 | if (!decays && !force_update) | ||
1477 | return; | ||
1478 | |||
1479 | if (atomic64_read(&cfs_rq->removed_load)) { | ||
1480 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | ||
1481 | subtract_blocked_load_contrib(cfs_rq, removed_load); | ||
1482 | } | ||
1483 | |||
1484 | if (decays) { | ||
1485 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | ||
1486 | decays); | ||
1487 | atomic64_add(decays, &cfs_rq->decay_counter); | ||
1488 | cfs_rq->last_decay = now; | ||
1489 | } | ||
1490 | |||
1491 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | ||
1492 | } | ||
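A note on the `>> 20` above (my reading of the units, not spelled out in the patch): cfs_rq_clock_task() returns nanoseconds, and

$$2^{20}\ \text{ns} = 1\,048\,576\ \text{ns} = 1024\ \mu\text{s},$$

i.e. exactly one load-tracking period, so `decays` counts whole periods; enqueue_entity_load_avg() below reverses the same conversion with `<< 20` when it approximates how long a waking, migrated task has been decaying remotely.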
1493 | |||
1494 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
1495 | { | ||
1496 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | ||
1497 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
1498 | } | ||
1499 | |||
1500 | /* Add the load generated by se into cfs_rq's child load-average */ | ||
1501 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1502 | struct sched_entity *se, | ||
1503 | int wakeup) | ||
1504 | { | ||
1505 | /* | ||
1506 | * We track migrations using entity decay_count <= 0; on a wake-up | ||
1507 | * migration we use a negative decay count to track the remote decays | ||
1508 | * accumulated while sleeping. | ||
1509 | */ | ||
1510 | if (unlikely(se->avg.decay_count <= 0)) { | ||
1511 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | ||
1512 | if (se->avg.decay_count) { | ||
1513 | /* | ||
1514 | * In a wake-up migration we have to approximate the | ||
1515 | * time sleeping. This is because we can't synchronize | ||
1516 | * clock_task between the two cpus, and it is not | ||
1517 | * guaranteed to be read-safe. Instead, we can | ||
1518 | * approximate this using our carried decays, which are | ||
1519 | * explicitly atomically readable. | ||
1520 | */ | ||
1521 | se->avg.last_runnable_update -= (-se->avg.decay_count) | ||
1522 | << 20; | ||
1523 | update_entity_load_avg(se, 0); | ||
1524 | /* Indicate that we're now synchronized and on-rq */ | ||
1525 | se->avg.decay_count = 0; | ||
1526 | } | ||
1527 | wakeup = 0; | ||
1528 | } else { | ||
1529 | __synchronize_entity_decay(se); | ||
1530 | } | ||
1531 | |||
1532 | /* migrated tasks did not contribute to our blocked load */ | ||
1533 | if (wakeup) { | ||
1534 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | ||
1535 | update_entity_load_avg(se, 0); | ||
1536 | } | ||
1537 | |||
1538 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | ||
1539 | /* we force update consideration on load-balancer moves */ | ||
1540 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | ||
1541 | } | ||
1542 | |||
1543 | /* | ||
1544 | * Remove se's load from this cfs_rq child load-average; if the entity is | ||
1545 | * transitioning to a blocked state we track its projected decay using | ||
1546 | * blocked_load_avg. | ||
1547 | */ | ||
1548 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1549 | struct sched_entity *se, | ||
1550 | int sleep) | ||
1551 | { | ||
1552 | update_entity_load_avg(se, 1); | ||
1553 | /* we force update consideration on load-balancer moves */ | ||
1554 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | ||
1555 | |||
1556 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | ||
1557 | if (sleep) { | ||
1558 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | ||
1559 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
1560 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | ||
1561 | } | ||
1562 | #else | ||
1563 | static inline void update_entity_load_avg(struct sched_entity *se, | ||
1564 | int update_cfs_rq) {} | ||
1565 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | ||
1566 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1567 | struct sched_entity *se, | ||
1568 | int wakeup) {} | ||
1569 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | ||
1570 | struct sched_entity *se, | ||
1571 | int sleep) {} | ||
1572 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | ||
1573 | int force_update) {} | ||
1574 | #endif | ||
973 | 1575 | ||
974 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1576 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
975 | { | 1577 | { |
@@ -1096,7 +1698,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1096 | * Update run-time statistics of the 'current'. | 1698 | * Update run-time statistics of the 'current'. |
1097 | */ | 1699 | */ |
1098 | update_curr(cfs_rq); | 1700 | update_curr(cfs_rq); |
1099 | update_cfs_load(cfs_rq, 0); | 1701 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); |
1100 | account_entity_enqueue(cfs_rq, se); | 1702 | account_entity_enqueue(cfs_rq, se); |
1101 | update_cfs_shares(cfs_rq); | 1703 | update_cfs_shares(cfs_rq); |
1102 | 1704 | ||
@@ -1171,6 +1773,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1171 | * Update run-time statistics of the 'current'. | 1773 | * Update run-time statistics of the 'current'. |
1172 | */ | 1774 | */ |
1173 | update_curr(cfs_rq); | 1775 | update_curr(cfs_rq); |
1776 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); | ||
1174 | 1777 | ||
1175 | update_stats_dequeue(cfs_rq, se); | 1778 | update_stats_dequeue(cfs_rq, se); |
1176 | if (flags & DEQUEUE_SLEEP) { | 1779 | if (flags & DEQUEUE_SLEEP) { |
@@ -1191,7 +1794,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1191 | if (se != cfs_rq->curr) | 1794 | if (se != cfs_rq->curr) |
1192 | __dequeue_entity(cfs_rq, se); | 1795 | __dequeue_entity(cfs_rq, se); |
1193 | se->on_rq = 0; | 1796 | se->on_rq = 0; |
1194 | update_cfs_load(cfs_rq, 0); | ||
1195 | account_entity_dequeue(cfs_rq, se); | 1797 | account_entity_dequeue(cfs_rq, se); |
1196 | 1798 | ||
1197 | /* | 1799 | /* |
@@ -1340,6 +1942,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1340 | update_stats_wait_start(cfs_rq, prev); | 1942 | update_stats_wait_start(cfs_rq, prev); |
1341 | /* Put 'current' back into the tree. */ | 1943 | /* Put 'current' back into the tree. */ |
1342 | __enqueue_entity(cfs_rq, prev); | 1944 | __enqueue_entity(cfs_rq, prev); |
1945 | /* in !on_rq case, update occurred at dequeue */ | ||
1946 | update_entity_load_avg(prev, 1); | ||
1343 | } | 1947 | } |
1344 | cfs_rq->curr = NULL; | 1948 | cfs_rq->curr = NULL; |
1345 | } | 1949 | } |
@@ -1353,9 +1957,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1353 | update_curr(cfs_rq); | 1957 | update_curr(cfs_rq); |
1354 | 1958 | ||
1355 | /* | 1959 | /* |
1356 | * Update share accounting for long-running entities. | 1960 | * Ensure that runnable average is periodically updated. |
1357 | */ | 1961 | */ |
1358 | update_entity_shares_tick(cfs_rq); | 1962 | update_entity_load_avg(curr, 1); |
1963 | update_cfs_rq_blocked_load(cfs_rq, 1); | ||
1359 | 1964 | ||
1360 | #ifdef CONFIG_SCHED_HRTICK | 1965 | #ifdef CONFIG_SCHED_HRTICK |
1361 | /* | 1966 | /* |
@@ -1448,6 +2053,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
1448 | return &tg->cfs_bandwidth; | 2053 | return &tg->cfs_bandwidth; |
1449 | } | 2054 | } |
1450 | 2055 | ||
2056 | /* rq->clock_task normalized against any time this cfs_rq has spent throttled */ | ||
2057 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | ||
2058 | { | ||
2059 | if (unlikely(cfs_rq->throttle_count)) | ||
2060 | return cfs_rq->throttled_clock_task; | ||
2061 | |||
2062 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | ||
2063 | } | ||
2064 | |||
1451 | /* returns 0 on failure to allocate runtime */ | 2065 | /* returns 0 on failure to allocate runtime */ |
1452 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 2066 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1453 | { | 2067 | { |
@@ -1592,14 +2206,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
1592 | cfs_rq->throttle_count--; | 2206 | cfs_rq->throttle_count--; |
1593 | #ifdef CONFIG_SMP | 2207 | #ifdef CONFIG_SMP |
1594 | if (!cfs_rq->throttle_count) { | 2208 | if (!cfs_rq->throttle_count) { |
1595 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | 2209 | /* adjust cfs_rq_clock_task() */ |
1596 | 2210 | cfs_rq->throttled_clock_task_time += rq->clock_task - | |
1597 | /* leaving throttled state, advance shares averaging windows */ | 2211 | cfs_rq->throttled_clock_task; |
1598 | cfs_rq->load_stamp += delta; | ||
1599 | cfs_rq->load_last += delta; | ||
1600 | |||
1601 | /* update entity weight now that we are on_rq again */ | ||
1602 | update_cfs_shares(cfs_rq); | ||
1603 | } | 2212 | } |
1604 | #endif | 2213 | #endif |
1605 | 2214 | ||
@@ -1611,9 +2220,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
1611 | struct rq *rq = data; | 2220 | struct rq *rq = data; |
1612 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 2221 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
1613 | 2222 | ||
1614 | /* group is entering throttled state, record last load */ | 2223 | /* group is entering throttled state, stop time */ |
1615 | if (!cfs_rq->throttle_count) | 2224 | if (!cfs_rq->throttle_count) |
1616 | update_cfs_load(cfs_rq, 0); | 2225 | cfs_rq->throttled_clock_task = rq->clock_task; |
1617 | cfs_rq->throttle_count++; | 2226 | cfs_rq->throttle_count++; |
1618 | 2227 | ||
1619 | return 0; | 2228 | return 0; |
@@ -1628,7 +2237,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1628 | 2237 | ||
1629 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 2238 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; |
1630 | 2239 | ||
1631 | /* account load preceding throttle */ | 2240 | /* freeze hierarchy runnable averages while throttled */ |
1632 | rcu_read_lock(); | 2241 | rcu_read_lock(); |
1633 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | 2242 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); |
1634 | rcu_read_unlock(); | 2243 | rcu_read_unlock(); |
@@ -1652,7 +2261,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1652 | rq->nr_running -= task_delta; | 2261 | rq->nr_running -= task_delta; |
1653 | 2262 | ||
1654 | cfs_rq->throttled = 1; | 2263 | cfs_rq->throttled = 1; |
1655 | cfs_rq->throttled_timestamp = rq->clock; | 2264 | cfs_rq->throttled_clock = rq->clock; |
1656 | raw_spin_lock(&cfs_b->lock); | 2265 | raw_spin_lock(&cfs_b->lock); |
1657 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 2266 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
1658 | raw_spin_unlock(&cfs_b->lock); | 2267 | raw_spin_unlock(&cfs_b->lock); |
@@ -1670,10 +2279,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1670 | 2279 | ||
1671 | cfs_rq->throttled = 0; | 2280 | cfs_rq->throttled = 0; |
1672 | raw_spin_lock(&cfs_b->lock); | 2281 | raw_spin_lock(&cfs_b->lock); |
1673 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | 2282 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; |
1674 | list_del_rcu(&cfs_rq->throttled_list); | 2283 | list_del_rcu(&cfs_rq->throttled_list); |
1675 | raw_spin_unlock(&cfs_b->lock); | 2284 | raw_spin_unlock(&cfs_b->lock); |
1676 | cfs_rq->throttled_timestamp = 0; | ||
1677 | 2285 | ||
1678 | update_rq_clock(rq); | 2286 | update_rq_clock(rq); |
1679 | /* update hierarchical throttle state */ | 2287 | /* update hierarchical throttle state */ |
@@ -2073,8 +2681,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2073 | } | 2681 | } |
2074 | 2682 | ||
2075 | #else /* CONFIG_CFS_BANDWIDTH */ | 2683 | #else /* CONFIG_CFS_BANDWIDTH */ |
2076 | static __always_inline | 2684 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
2077 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} | 2685 | { |
2686 | return rq_of(cfs_rq)->clock_task; | ||
2687 | } | ||
2688 | |||
2689 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
2690 | unsigned long delta_exec) {} | ||
2078 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2691 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2079 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 2692 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
2080 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2693 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -2207,12 +2820,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2207 | if (cfs_rq_throttled(cfs_rq)) | 2820 | if (cfs_rq_throttled(cfs_rq)) |
2208 | break; | 2821 | break; |
2209 | 2822 | ||
2210 | update_cfs_load(cfs_rq, 0); | ||
2211 | update_cfs_shares(cfs_rq); | 2823 | update_cfs_shares(cfs_rq); |
2824 | update_entity_load_avg(se, 1); | ||
2212 | } | 2825 | } |
2213 | 2826 | ||
2214 | if (!se) | 2827 | if (!se) { |
2828 | update_rq_runnable_avg(rq, rq->nr_running); | ||
2215 | inc_nr_running(rq); | 2829 | inc_nr_running(rq); |
2830 | } | ||
2216 | hrtick_update(rq); | 2831 | hrtick_update(rq); |
2217 | } | 2832 | } |
2218 | 2833 | ||
@@ -2266,12 +2881,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2266 | if (cfs_rq_throttled(cfs_rq)) | 2881 | if (cfs_rq_throttled(cfs_rq)) |
2267 | break; | 2882 | break; |
2268 | 2883 | ||
2269 | update_cfs_load(cfs_rq, 0); | ||
2270 | update_cfs_shares(cfs_rq); | 2884 | update_cfs_shares(cfs_rq); |
2885 | update_entity_load_avg(se, 1); | ||
2271 | } | 2886 | } |
2272 | 2887 | ||
2273 | if (!se) | 2888 | if (!se) { |
2274 | dec_nr_running(rq); | 2889 | dec_nr_running(rq); |
2890 | update_rq_runnable_avg(rq, 1); | ||
2891 | } | ||
2275 | hrtick_update(rq); | 2892 | hrtick_update(rq); |
2276 | } | 2893 | } |
2277 | 2894 | ||
@@ -2781,6 +3398,37 @@ unlock: | |||
2781 | 3398 | ||
2782 | return new_cpu; | 3399 | return new_cpu; |
2783 | } | 3400 | } |
3401 | |||
3402 | /* | ||
3403 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
3404 | * removed when useful for applications beyond shares distribution (e.g. | ||
3405 | * load-balance). | ||
3406 | */ | ||
3407 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
3408 | /* | ||
3409 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | ||
3410 | * cfs_rq_of(p) references at time of call are still valid and identify the | ||
3411 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | ||
3412 | * other assumptions, including the state of rq->lock, should be made. | ||
3413 | */ | ||
3414 | static void | ||
3415 | migrate_task_rq_fair(struct task_struct *p, int next_cpu) | ||
3416 | { | ||
3417 | struct sched_entity *se = &p->se; | ||
3418 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
3419 | |||
3420 | /* | ||
3421 | * Load tracking: accumulate removed load so that it can be processed | ||
3422 | * when we next update owning cfs_rq under rq->lock. Tasks contribute | ||
3423 | * to blocked load iff they have a positive decay-count. It can never | ||
3424 | * be negative here since on-rq tasks have decay-count == 0. | ||
3425 | */ | ||
3426 | if (se->avg.decay_count) { | ||
3427 | se->avg.decay_count = -__synchronize_entity_decay(se); | ||
3428 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | ||
3429 | } | ||
3430 | } | ||
3431 | #endif | ||
2784 | #endif /* CONFIG_SMP */ | 3432 | #endif /* CONFIG_SMP */ |
2785 | 3433 | ||
2786 | static unsigned long | 3434 | static unsigned long |
@@ -2907,7 +3555,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
2907 | * Batch and idle tasks do not preempt non-idle tasks (their preemption | 3555 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
2908 | * is driven by the tick): | 3556 | * is driven by the tick): |
2909 | */ | 3557 | */ |
2910 | if (unlikely(p->policy != SCHED_NORMAL)) | 3558 | if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) |
2911 | return; | 3559 | return; |
2912 | 3560 | ||
2913 | find_matching_se(&se, &pse); | 3561 | find_matching_se(&se, &pse); |
@@ -3033,8 +3681,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3033 | 3681 | ||
3034 | #ifdef CONFIG_SMP | 3682 | #ifdef CONFIG_SMP |
3035 | /************************************************** | 3683 | /************************************************** |
3036 | * Fair scheduling class load-balancing methods: | 3684 | * Fair scheduling class load-balancing methods. |
3037 | */ | 3685 | * |
3686 | * BASICS | ||
3687 | * | ||
3688 | * The purpose of load-balancing is to achieve the same basic fairness the | ||
3689 | * per-cpu scheduler provides, namely provide a proportional amount of compute | ||
3690 | * time to each task. This is expressed in the following equation: | ||
3691 | * | ||
3692 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | ||
3693 | * | ||
3694 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | ||
3695 | * W_i,0 is defined as: | ||
3696 | * | ||
3697 | * W_i,0 = \Sum_j w_i,j (2) | ||
3698 | * | ||
3699 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | ||
3700 | * is derived from the nice value as per prio_to_weight[]. | ||
3701 | * | ||
3702 | * The weight average is an exponential decay average of the instantaneous | ||
3703 | * weight: | ||
3704 | * | ||
3705 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | ||
3706 | * | ||
3707 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | ||
3708 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | ||
3709 | * can also include other factors [XXX]. | ||
3710 | * | ||
3711 | * To achieve this balance we define a measure of imbalance which follows | ||
3712 | * directly from (1): | ||
3713 | * | ||
3714 | * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) | ||
3715 | * | ||
3716 | * We then move tasks around to minimize the imbalance. In the continuous | ||
3717 | * function space it is obvious this converges; in the discrete case we get | ||
3718 | * a few fun cases generally called infeasible weight scenarios. | ||
3719 | * | ||
3720 | * [XXX expand on: | ||
3721 | * - infeasible weights; | ||
3722 | * - local vs global optima in the discrete case. ] | ||
3723 | * | ||
3724 | * | ||
3725 | * SCHED DOMAINS | ||
3726 | * | ||
3727 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | ||
3728 | * for all i,j solution, we create a tree of cpus that follows the hardware | ||
3729 | * topology where each level pairs two lower groups (or better). This results | ||
3730 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | ||
3731 | * tree to only the first of the previous level and we decrease the frequency | ||
3732 | * of load-balance at each level inv. proportional to the number of cpus in | ||
3733 | * the groups. | ||
3734 | * | ||
3735 | * This yields: | ||
3736 | * | ||
3737 | * | ||
3738 | * \Sum_{i = 0..log_2 n} (1/2^i) * (n/2^i) * 2^i = O(n) (5) | ||
3739 | * | ||
3740 | * where, at each level i: 1/2^i is the load-balance frequency, n/2^i is | ||
3741 | * the number of cpus doing load-balance, and 2^i is the size of each | ||
3742 | * group. | ||
3743 | * | ||
3744 | * | ||
3745 | * Coupled with a limit on how many tasks we can migrate every balance pass, | ||
3746 | * this makes (5) the runtime complexity of the balancer. | ||
3747 | * | ||
3748 | * An important property here is that each CPU is still (indirectly) connected | ||
3749 | * to every other cpu in at most O(log n) steps: | ||
3750 | * | ||
3751 | * The adjacency matrix of the resulting graph is given by: | ||
3752 | * | ||
3753 | * | ||
3754 | * A_i,j = \Union_{k = 0..log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | ||
3755 | * | ||
3756 | * | ||
3757 | * And you'll find that: | ||
3758 | * | ||
3759 | * A^(log_2 n)_i,j != 0 for all i,j (7) | ||
3760 | * | ||
3761 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | ||
3762 | * The task movement gives a factor of O(m), giving a convergence complexity | ||
3763 | * of: | ||
3764 | * | ||
3765 | * O(nm log n), n := nr_cpus, m := nr_tasks (8) | ||
3766 | * | ||
3767 | * | ||
3768 | * WORK CONSERVING | ||
3769 | * | ||
3770 | * In order to avoid CPUs going idle while there's still work to do, new idle | ||
3771 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | ||
3772 | * tree itself instead of relying on other CPUs to bring it work. | ||
3773 | * | ||
3774 | * This adds some complexity to both (5) and (8) but it reduces the total idle | ||
3775 | * time. | ||
3776 | * | ||
3777 | * [XXX more?] | ||
3778 | * | ||
3779 | * | ||
3780 | * CGROUPS | ||
3781 | * | ||
3782 | * Cgroups make a horror show out of (2), instead of a simple sum we get: | ||
3783 | * | ||
3784 | * | ||
3785 | * W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k) (9) | ||
3786 | * | ||
3787 | * | ||
3788 | * Where | ||
3789 | * | ||
3790 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | ||
3791 | * | ||
3792 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | ||
3793 | * | ||
3794 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | ||
3795 | * property. | ||
3796 | * | ||
3797 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | ||
3798 | * rewrite all of this once again.] | ||
3799 | */ | ||
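Spelling out how (5) collapses to O(n) (my derivation; the comment states only the result): at level i the balance frequency is 1/2^i, only n/2^i cpus run it, and each pass walks a group of 2^i cpus, so the per-level cost is n/2^i and

$$\sum_{i=0}^{\log_2 n} \frac{1}{2^i}\cdot\frac{n}{2^i}\cdot 2^{i} \;=\; n \sum_{i=0}^{\log_2 n} 2^{-i} \;<\; 2n \;=\; O(n).$$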
3038 | 3800 | ||
3039 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 3801 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
3040 | 3802 | ||
@@ -3300,52 +4062,58 @@ next: | |||
3300 | /* | 4062 | /* |
3301 | * update tg->load_weight by folding this cpu's load_avg | 4063 | * update tg->load_weight by folding this cpu's load_avg |
3302 | */ | 4064 | */ |
3303 | static int update_shares_cpu(struct task_group *tg, int cpu) | 4065 | static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) |
3304 | { | 4066 | { |
3305 | struct cfs_rq *cfs_rq; | 4067 | struct sched_entity *se = tg->se[cpu]; |
3306 | unsigned long flags; | 4068 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; |
3307 | struct rq *rq; | ||
3308 | |||
3309 | if (!tg->se[cpu]) | ||
3310 | return 0; | ||
3311 | |||
3312 | rq = cpu_rq(cpu); | ||
3313 | cfs_rq = tg->cfs_rq[cpu]; | ||
3314 | |||
3315 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
3316 | |||
3317 | update_rq_clock(rq); | ||
3318 | update_cfs_load(cfs_rq, 1); | ||
3319 | 4069 | ||
3320 | /* | 4070 | /* throttled entities do not contribute to load */ |
3321 | * We need to update shares after updating tg->load_weight in | 4071 | if (throttled_hierarchy(cfs_rq)) |
3322 | * order to adjust the weight of groups with long running tasks. | 4072 | return; |
3323 | */ | ||
3324 | update_cfs_shares(cfs_rq); | ||
3325 | 4073 | ||
3326 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 4074 | update_cfs_rq_blocked_load(cfs_rq, 1); |
3327 | 4075 | ||
3328 | return 0; | 4076 | if (se) { |
4077 | update_entity_load_avg(se, 1); | ||
4078 | /* | ||
4079 | * We pivot on our runnable average having decayed to zero for | ||
4080 | * list removal. This generally implies that all our children | ||
4081 | * have also been removed (modulo rounding error or bandwidth | ||
4082 | * control); however, such cases are rare and we can fix these | ||
4083 | * at enqueue. | ||
4084 | * | ||
4085 | * TODO: fix up out-of-order children on enqueue. | ||
4086 | */ | ||
4087 | if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) | ||
4088 | list_del_leaf_cfs_rq(cfs_rq); | ||
4089 | } else { | ||
4090 | struct rq *rq = rq_of(cfs_rq); | ||
4091 | update_rq_runnable_avg(rq, rq->nr_running); | ||
4092 | } | ||
3329 | } | 4093 | } |
3330 | 4094 | ||
3331 | static void update_shares(int cpu) | 4095 | static void update_blocked_averages(int cpu) |
3332 | { | 4096 | { |
3333 | struct cfs_rq *cfs_rq; | ||
3334 | struct rq *rq = cpu_rq(cpu); | 4097 | struct rq *rq = cpu_rq(cpu); |
4098 | struct cfs_rq *cfs_rq; | ||
4099 | unsigned long flags; | ||
3335 | 4100 | ||
3336 | rcu_read_lock(); | 4101 | raw_spin_lock_irqsave(&rq->lock, flags); |
4102 | update_rq_clock(rq); | ||
3337 | /* | 4103 | /* |
3338 | * Iterates the task_group tree in a bottom up fashion, see | 4104 | * Iterates the task_group tree in a bottom up fashion, see |
3339 | * list_add_leaf_cfs_rq() for details. | 4105 | * list_add_leaf_cfs_rq() for details. |
3340 | */ | 4106 | */ |
3341 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 4107 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3342 | /* throttled entities do not contribute to load */ | 4108 | /* |
3343 | if (throttled_hierarchy(cfs_rq)) | 4109 | * Note: We may want to consider periodically releasing |
3344 | continue; | 4110 | * rq->lock about these updates so that creating many task |
3345 | 4111 | * groups does not result in continually extending hold time. | |
3346 | update_shares_cpu(cfs_rq->tg, cpu); | 4112 | */ |
4113 | __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); | ||
3347 | } | 4114 | } |
3348 | rcu_read_unlock(); | 4115 | |
4116 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
3349 | } | 4117 | } |
3350 | 4118 | ||
3351 | /* | 4119 | /* |
@@ -3397,7 +4165,7 @@ static unsigned long task_h_load(struct task_struct *p) | |||
3397 | return load; | 4165 | return load; |
3398 | } | 4166 | } |
3399 | #else | 4167 | #else |
3400 | static inline void update_shares(int cpu) | 4168 | static inline void update_blocked_averages(int cpu) |
3401 | { | 4169 | { |
3402 | } | 4170 | } |
3403 | 4171 | ||
@@ -4457,12 +5225,14 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
4457 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5225 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
4458 | return; | 5226 | return; |
4459 | 5227 | ||
5228 | update_rq_runnable_avg(this_rq, 1); | ||
5229 | |||
4460 | /* | 5230 | /* |
4461 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5231 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
4462 | */ | 5232 | */ |
4463 | raw_spin_unlock(&this_rq->lock); | 5233 | raw_spin_unlock(&this_rq->lock); |
4464 | 5234 | ||
4465 | update_shares(this_cpu); | 5235 | update_blocked_averages(this_cpu); |
4466 | rcu_read_lock(); | 5236 | rcu_read_lock(); |
4467 | for_each_domain(this_cpu, sd) { | 5237 | for_each_domain(this_cpu, sd) { |
4468 | unsigned long interval; | 5238 | unsigned long interval; |
@@ -4717,7 +5487,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4717 | int update_next_balance = 0; | 5487 | int update_next_balance = 0; |
4718 | int need_serialize; | 5488 | int need_serialize; |
4719 | 5489 | ||
4720 | update_shares(cpu); | 5490 | update_blocked_averages(cpu); |
4721 | 5491 | ||
4722 | rcu_read_lock(); | 5492 | rcu_read_lock(); |
4723 | for_each_domain(cpu, sd) { | 5493 | for_each_domain(cpu, sd) { |
@@ -4954,6 +5724,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4954 | cfs_rq = cfs_rq_of(se); | 5724 | cfs_rq = cfs_rq_of(se); |
4955 | entity_tick(cfs_rq, se, queued); | 5725 | entity_tick(cfs_rq, se, queued); |
4956 | } | 5726 | } |
5727 | |||
5728 | if (sched_feat_numa(NUMA)) | ||
5729 | task_tick_numa(rq, curr); | ||
5730 | |||
5731 | update_rq_runnable_avg(rq, 1); | ||
4957 | } | 5732 | } |
4958 | 5733 | ||
4959 | /* | 5734 | /* |
@@ -5046,6 +5821,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5046 | place_entity(cfs_rq, se, 0); | 5821 | place_entity(cfs_rq, se, 0); |
5047 | se->vruntime -= cfs_rq->min_vruntime; | 5822 | se->vruntime -= cfs_rq->min_vruntime; |
5048 | } | 5823 | } |
5824 | |||
5825 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5826 | /* | ||
5827 | * Remove our load from contribution when we leave sched_fair | ||
5828 | * and ensure we don't carry in an old decay_count if we | ||
5829 | * switch back. | ||
5830 | */ | ||
5831 | if (p->se.avg.decay_count) { | ||
5832 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | ||
5833 | __synchronize_entity_decay(&p->se); | ||
5834 | subtract_blocked_load_contrib(cfs_rq, | ||
5835 | p->se.avg.load_avg_contrib); | ||
5836 | } | ||
5837 | #endif | ||
5049 | } | 5838 | } |
5050 | 5839 | ||
5051 | /* | 5840 | /* |
@@ -5092,11 +5881,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5092 | #ifndef CONFIG_64BIT | 5881 | #ifndef CONFIG_64BIT |
5093 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5882 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5094 | #endif | 5883 | #endif |
5884 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5885 | atomic64_set(&cfs_rq->decay_counter, 1); | ||
5886 | atomic64_set(&cfs_rq->removed_load, 0); | ||
5887 | #endif | ||
5095 | } | 5888 | } |
5096 | 5889 | ||
5097 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5890 | #ifdef CONFIG_FAIR_GROUP_SCHED |
5098 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5891 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
5099 | { | 5892 | { |
5893 | struct cfs_rq *cfs_rq; | ||
5100 | /* | 5894 | /* |
5101 | * If the task was not on the rq at the time of this cgroup movement | 5895 | * If the task was not on the rq at the time of this cgroup movement |
5102 | * it must have been asleep, sleeping tasks keep their ->vruntime | 5896 | * it must have been asleep, sleeping tasks keep their ->vruntime |
@@ -5128,8 +5922,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
5128 | if (!on_rq) | 5922 | if (!on_rq) |
5129 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5923 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
5130 | set_task_rq(p, task_cpu(p)); | 5924 | set_task_rq(p, task_cpu(p)); |
5131 | if (!on_rq) | 5925 | if (!on_rq) { |
5132 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5926 | cfs_rq = cfs_rq_of(&p->se); |
5927 | p->se.vruntime += cfs_rq->min_vruntime; | ||
5928 | #ifdef CONFIG_SMP | ||
5929 | /* | ||
5930 | * migrate_task_rq_fair() will have removed our previous | ||
5931 | * contribution, but we must synchronize for ongoing future | ||
5932 | * decay. | ||
5933 | */ | ||
5934 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
5935 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | ||
5936 | #endif | ||
5937 | } | ||
5133 | } | 5938 | } |
5134 | 5939 | ||
5135 | void free_fair_sched_group(struct task_group *tg) | 5940 | void free_fair_sched_group(struct task_group *tg) |
@@ -5214,10 +6019,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
5214 | 6019 | ||
5215 | cfs_rq->tg = tg; | 6020 | cfs_rq->tg = tg; |
5216 | cfs_rq->rq = rq; | 6021 | cfs_rq->rq = rq; |
5217 | #ifdef CONFIG_SMP | ||
5218 | /* allow initial update_cfs_load() to truncate */ | ||
5219 | cfs_rq->load_stamp = 1; | ||
5220 | #endif | ||
5221 | init_cfs_rq_runtime(cfs_rq); | 6022 | init_cfs_rq_runtime(cfs_rq); |
5222 | 6023 | ||
5223 | tg->cfs_rq[cpu] = cfs_rq; | 6024 | tg->cfs_rq[cpu] = cfs_rq; |
@@ -5319,7 +6120,9 @@ const struct sched_class fair_sched_class = { | |||
5319 | 6120 | ||
5320 | #ifdef CONFIG_SMP | 6121 | #ifdef CONFIG_SMP |
5321 | .select_task_rq = select_task_rq_fair, | 6122 | .select_task_rq = select_task_rq_fair, |
5322 | 6123 | #ifdef CONFIG_FAIR_GROUP_SCHED | |
6124 | .migrate_task_rq = migrate_task_rq_fair, | ||
6125 | #endif | ||
5323 | .rq_online = rq_online_fair, | 6126 | .rq_online = rq_online_fair, |
5324 | .rq_offline = rq_offline_fair, | 6127 | .rq_offline = rq_offline_fair, |
5325 | 6128 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true) | |||
32 | SCHED_FEAT(CACHE_HOT_BUDDY, true) | 32 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * Allow wakeup-time preemption of the current task: | ||
36 | */ | ||
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | ||
38 | |||
39 | /* | ||
35 | * Use arch dependent cpu power functions | 40 | * Use arch dependent cpu power functions |
36 | */ | 41 | */ |
37 | SCHED_FEAT(ARCH_POWER, true) | 42 | SCHED_FEAT(ARCH_POWER, true) |
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
61 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
62 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
63 | SCHED_FEAT(LB_MIN, false) | 68 | SCHED_FEAT(LB_MIN, false) |
69 | |||
70 | /* | ||
71 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
72 | * at runtime if running on a NUMA machine. Can be controlled via | ||
73 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | ||
74 | * for debugging the core machinery. | ||
75 | */ | ||
76 | #ifdef CONFIG_NUMA_BALANCING | ||
77 | SCHED_FEAT(NUMA, false) | ||
78 | SCHED_FEAT(NUMA_FORCE, false) | ||
79 | #endif | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -112,6 +112,8 @@ struct task_group { | |||
112 | unsigned long shares; | 112 | unsigned long shares; |
113 | 113 | ||
114 | atomic_t load_weight; | 114 | atomic_t load_weight; |
115 | atomic64_t load_avg; | ||
116 | atomic_t runnable_avg; | ||
115 | #endif | 117 | #endif |
116 | 118 | ||
117 | #ifdef CONFIG_RT_GROUP_SCHED | 119 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -222,22 +224,29 @@ struct cfs_rq { | |||
222 | unsigned int nr_spread_over; | 224 | unsigned int nr_spread_over; |
223 | #endif | 225 | #endif |
224 | 226 | ||
227 | #ifdef CONFIG_SMP | ||
228 | /* | ||
229 | * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be | ||
230 | * removed when useful for applications beyond shares distribution (e.g. | ||
231 | * load-balance). | ||
232 | */ | ||
225 | #ifdef CONFIG_FAIR_GROUP_SCHED | 233 | #ifdef CONFIG_FAIR_GROUP_SCHED |
226 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
227 | |||
228 | /* | 234 | /* |
229 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 235 | * CFS Load tracking |
230 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 236 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
231 | * (like users, containers etc.) | 237 | * This allows for the description of both thread and group usage (in |
232 | * | 238 | * the FAIR_GROUP_SCHED case). |
233 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
234 | * list is used during load balance. | ||
235 | */ | 239 | */ |
236 | int on_list; | 240 | u64 runnable_load_avg, blocked_load_avg; |
237 | struct list_head leaf_cfs_rq_list; | 241 | atomic64_t decay_counter, removed_load; |
238 | struct task_group *tg; /* group that "owns" this runqueue */ | 242 | u64 last_decay; |
243 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
244 | /* These always depend on CONFIG_FAIR_GROUP_SCHED */ | ||
245 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
246 | u32 tg_runnable_contrib; | ||
247 | u64 tg_load_contrib; | ||
248 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
239 | 249 | ||
240 | #ifdef CONFIG_SMP | ||
241 | /* | 250 | /* |
242 | * h_load = weight * f(tg) | 251 | * h_load = weight * f(tg) |
243 | * | 252 | * |
@@ -245,26 +254,30 @@ struct cfs_rq { | |||
245 | * this group. | 254 | * this group. |
246 | */ | 255 | */ |
247 | unsigned long h_load; | 256 | unsigned long h_load; |
257 | #endif /* CONFIG_SMP */ | ||
258 | |||
259 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
260 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
248 | 261 | ||
249 | /* | 262 | /* |
250 | * Maintaining per-cpu shares distribution for group scheduling | 263 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
264 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
265 | * (like users, containers etc.) | ||
251 | * | 266 | * |
252 | * load_stamp is the last time we updated the load average | 267 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
253 | * load_last is the last time we updated the load average and saw load | 268 | * list is used during load balance. |
254 | * load_unacc_exec_time is currently unaccounted execution time | ||
255 | */ | 269 | */ |
256 | u64 load_avg; | 270 | int on_list; |
257 | u64 load_period; | 271 | struct list_head leaf_cfs_rq_list; |
258 | u64 load_stamp, load_last, load_unacc_exec_time; | 272 | struct task_group *tg; /* group that "owns" this runqueue */ |
259 | 273 | ||
260 | unsigned long load_contribution; | ||
261 | #endif /* CONFIG_SMP */ | ||
262 | #ifdef CONFIG_CFS_BANDWIDTH | 274 | #ifdef CONFIG_CFS_BANDWIDTH |
263 | int runtime_enabled; | 275 | int runtime_enabled; |
264 | u64 runtime_expires; | 276 | u64 runtime_expires; |
265 | s64 runtime_remaining; | 277 | s64 runtime_remaining; |
266 | 278 | ||
267 | u64 throttled_timestamp; | 279 | u64 throttled_clock, throttled_clock_task; |
280 | u64 throttled_clock_task_time; | ||
268 | int throttled, throttle_count; | 281 | int throttled, throttle_count; |
269 | struct list_head throttled_list; | 282 | struct list_head throttled_list; |
270 | #endif /* CONFIG_CFS_BANDWIDTH */ | 283 | #endif /* CONFIG_CFS_BANDWIDTH */ |
@@ -467,6 +480,8 @@ struct rq { | |||
467 | #ifdef CONFIG_SMP | 480 | #ifdef CONFIG_SMP |
468 | struct llist_head wake_list; | 481 | struct llist_head wake_list; |
469 | #endif | 482 | #endif |
483 | |||
484 | struct sched_avg avg; | ||
470 | }; | 485 | }; |
471 | 486 | ||
472 | static inline int cpu_of(struct rq *rq) | 487 | static inline int cpu_of(struct rq *rq) |
@@ -648,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
648 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
649 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
650 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | ||
667 | #define sched_feat_numa(x) sched_feat(x) | ||
668 | #ifdef CONFIG_SCHED_DEBUG | ||
669 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
670 | #else | ||
671 | extern bool numabalancing_enabled; | ||
672 | #endif /* CONFIG_SCHED_DEBUG */ | ||
673 | #else | ||
674 | #define sched_feat_numa(x) (0) | ||
675 | #define numabalancing_enabled (0) | ||
676 | #endif /* CONFIG_NUMA_BALANCING */ | ||
677 | |||
651 | static inline u64 global_rt_period(void) | 678 | static inline u64 global_rt_period(void) |
652 | { | 679 | { |
653 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 680 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
@@ -1212,4 +1239,3 @@ static inline u64 irq_time_read(int cpu) | |||
1212 | } | 1239 | } |
1213 | #endif /* CONFIG_64BIT */ | 1240 | #endif /* CONFIG_64BIT */ |
1214 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1241 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1215 | |||
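sched_feat() and the new sched_feat_numa() wrapper shown above reduce to a single bit test against the sysctl_sched_features word, and sched_feat_numa() compiles to a constant 0 when CONFIG_NUMA_BALANCING is off. A compressed illustration of that pattern, with made-up feature names rather than the kernel's generated enum:

    #include <stdio.h>

    /* One bit per feature; a feature test is a mask against the current
     * feature word, mirroring sched_feat() in kernel/sched/sched.h. */
    enum { FEAT_WAKEUP_PREEMPTION, FEAT_NUMA, NR_FEATS };

    static unsigned long feat_word = 1UL << FEAT_WAKEUP_PREEMPTION;

    #define feat_enabled(x) (!!(feat_word & (1UL << (x))))

    int main(void)
    {
            printf("wakeup preemption: %d, numa: %d\n",
                   feat_enabled(FEAT_WAKEUP_PREEMPTION),
                   feat_enabled(FEAT_NUMA));
            return 0;
    }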
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) | |||
396 | #ifdef CONFIG_SECCOMP_FILTER | 396 | #ifdef CONFIG_SECCOMP_FILTER |
397 | case SECCOMP_MODE_FILTER: { | 397 | case SECCOMP_MODE_FILTER: { |
398 | int data; | 398 | int data; |
399 | struct pt_regs *regs = task_pt_regs(current); | ||
399 | ret = seccomp_run_filters(this_syscall); | 400 | ret = seccomp_run_filters(this_syscall); |
400 | data = ret & SECCOMP_RET_DATA; | 401 | data = ret & SECCOMP_RET_DATA; |
401 | ret &= SECCOMP_RET_ACTION; | 402 | ret &= SECCOMP_RET_ACTION; |
402 | switch (ret) { | 403 | switch (ret) { |
403 | case SECCOMP_RET_ERRNO: | 404 | case SECCOMP_RET_ERRNO: |
404 | /* Set the low-order 16-bits as an errno. */ | 405 | /* Set the low-order 16-bits as an errno. */ |
405 | syscall_set_return_value(current, task_pt_regs(current), | 406 | syscall_set_return_value(current, regs, |
406 | -data, 0); | 407 | -data, 0); |
407 | goto skip; | 408 | goto skip; |
408 | case SECCOMP_RET_TRAP: | 409 | case SECCOMP_RET_TRAP: |
409 | /* Show the handler the original registers. */ | 410 | /* Show the handler the original registers. */ |
410 | syscall_rollback(current, task_pt_regs(current)); | 411 | syscall_rollback(current, regs); |
411 | /* Let the filter pass back 16 bits of data. */ | 412 | /* Let the filter pass back 16 bits of data. */ |
412 | seccomp_send_sigsys(this_syscall, data); | 413 | seccomp_send_sigsys(this_syscall, data); |
413 | goto skip; | 414 | goto skip; |
414 | case SECCOMP_RET_TRACE: | 415 | case SECCOMP_RET_TRACE: |
415 | /* Skip these calls if there is no tracer. */ | 416 | /* Skip these calls if there is no tracer. */ |
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | 417 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { |
418 | syscall_set_return_value(current, regs, | ||
419 | -ENOSYS, 0); | ||
417 | goto skip; | 420 | goto skip; |
421 | } | ||
418 | /* Allow the BPF to provide the event message */ | 422 | /* Allow the BPF to provide the event message */ |
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 423 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
420 | /* | 424 | /* |
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) | |||
425 | */ | 429 | */ |
426 | if (fatal_signal_pending(current)) | 430 | if (fatal_signal_pending(current)) |
427 | break; | 431 | break; |
432 | if (syscall_get_nr(current, regs) < 0) | ||
433 | goto skip; /* Explicit request to skip. */ | ||
434 | |||
428 | return 0; | 435 | return 0; |
429 | case SECCOMP_RET_ALLOW: | 436 | case SECCOMP_RET_ALLOW: |
430 | return 0; | 437 | return 0; |
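A seccomp filter verdict packs an action in its upper bits and 16 bits of filter-chosen data in the lower bits; the hunk above caches task_pt_regs() once and now also forces -ENOSYS when a traced syscall has no listening tracer. A hedged userspace sketch of the unpacking step only (the mask and action values below are illustrative placeholders, not copied from the UAPI header):

    #include <stdio.h>
    #include <stdint.h>

    #define RET_DATA_MASK   0x0000ffffu     /* low 16 bits: filter data */
    #define RET_ACTION_MASK 0xffff0000u     /* remaining bits: the action */
    #define RET_ERRNO       0x00050000u     /* hypothetical "fail with errno" code */

    int main(void)
    {
            uint32_t ret = RET_ERRNO | 7;   /* filter asked for errno 7 */
            uint32_t data = ret & RET_DATA_MASK;

            if ((ret & RET_ACTION_MASK) == RET_ERRNO)
                    printf("syscall return value would be %d\n", -(int)data);
            return 0;
    }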
diff --git a/kernel/signal.c b/kernel/signal.c index b2445d86f226..580a91e63471 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1159,8 +1159,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1159 | return __send_signal(sig, info, t, group, from_ancestor_ns); | 1159 | return __send_signal(sig, info, t, group, from_ancestor_ns); |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | static void print_fatal_signal(struct pt_regs *regs, int signr) | 1162 | static void print_fatal_signal(int signr) |
1163 | { | 1163 | { |
1164 | struct pt_regs *regs = signal_pt_regs(); | ||
1164 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 1165 | printk("%s/%d: potentially unexpected fatal signal %d.\n", |
1165 | current->comm, task_pid_nr(current), signr); | 1166 | current->comm, task_pid_nr(current), signr); |
1166 | 1167 | ||
@@ -1908,7 +1909,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1908 | preempt_disable(); | 1909 | preempt_disable(); |
1909 | read_unlock(&tasklist_lock); | 1910 | read_unlock(&tasklist_lock); |
1910 | preempt_enable_no_resched(); | 1911 | preempt_enable_no_resched(); |
1911 | schedule(); | 1912 | freezable_schedule(); |
1912 | } else { | 1913 | } else { |
1913 | /* | 1914 | /* |
1914 | * By the time we got the lock, our tracer went away. | 1915 | * By the time we got the lock, our tracer went away. |
@@ -1930,13 +1931,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1930 | } | 1931 | } |
1931 | 1932 | ||
1932 | /* | 1933 | /* |
1933 | * While in TASK_TRACED, we were considered "frozen enough". | ||
1934 | * Now that we woke up, it's crucial if we're supposed to be | ||
1935 | * frozen that we freeze now before running anything substantial. | ||
1936 | */ | ||
1937 | try_to_freeze(); | ||
1938 | |||
1939 | /* | ||
1940 | * We are back. Now reacquire the siglock before touching | 1934 | * We are back. Now reacquire the siglock before touching |
1941 | * last_siginfo, so that we are sure to have synchronized with | 1935 | * last_siginfo, so that we are sure to have synchronized with |
1942 | * any signal-sending on another CPU that wants to examine it. | 1936 | * any signal-sending on another CPU that wants to examine it. |
@@ -2092,7 +2086,7 @@ static bool do_signal_stop(int signr) | |||
2092 | } | 2086 | } |
2093 | 2087 | ||
2094 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2088 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
2095 | schedule(); | 2089 | freezable_schedule(); |
2096 | return true; | 2090 | return true; |
2097 | } else { | 2091 | } else { |
2098 | /* | 2092 | /* |
@@ -2138,10 +2132,9 @@ static void do_jobctl_trap(void) | |||
2138 | } | 2132 | } |
2139 | } | 2133 | } |
2140 | 2134 | ||
2141 | static int ptrace_signal(int signr, siginfo_t *info, | 2135 | static int ptrace_signal(int signr, siginfo_t *info) |
2142 | struct pt_regs *regs, void *cookie) | ||
2143 | { | 2136 | { |
2144 | ptrace_signal_deliver(regs, cookie); | 2137 | ptrace_signal_deliver(); |
2145 | /* | 2138 | /* |
2146 | * We do not check sig_kernel_stop(signr) but set this marker | 2139 | * We do not check sig_kernel_stop(signr) but set this marker |
2147 | * unconditionally because we do not know whether debugger will | 2140 | * unconditionally because we do not know whether debugger will |
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2200 | if (unlikely(uprobe_deny_signal())) | 2193 | if (unlikely(uprobe_deny_signal())) |
2201 | return 0; | 2194 | return 0; |
2202 | 2195 | ||
2203 | relock: | ||
2204 | /* | 2196 | /* |
2205 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2197 | * Do this once, we can't return to user-mode if freezing() == T. |
2206 | * While in TASK_STOPPED, we were considered "frozen enough". | 2198 | * do_signal_stop() and ptrace_stop() do freezable_schedule() and |
2207 | * Now that we woke up, it's crucial if we're supposed to be | 2199 | * thus do not need another check after return. |
2208 | * frozen that we freeze now before running anything substantial. | ||
2209 | */ | 2200 | */ |
2210 | try_to_freeze(); | 2201 | try_to_freeze(); |
2211 | 2202 | ||
2203 | relock: | ||
2212 | spin_lock_irq(&sighand->siglock); | 2204 | spin_lock_irq(&sighand->siglock); |
2213 | /* | 2205 | /* |
2214 | * Every stopped thread goes here after wakeup. Check to see if | 2206 | * Every stopped thread goes here after wakeup. Check to see if |
@@ -2265,8 +2257,7 @@ relock: | |||
2265 | break; /* will return 0 */ | 2257 | break; /* will return 0 */ |
2266 | 2258 | ||
2267 | if (unlikely(current->ptrace) && signr != SIGKILL) { | 2259 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
2268 | signr = ptrace_signal(signr, info, | 2260 | signr = ptrace_signal(signr, info); |
2269 | regs, cookie); | ||
2270 | if (!signr) | 2261 | if (!signr) |
2271 | continue; | 2262 | continue; |
2272 | } | 2263 | } |
@@ -2351,7 +2342,7 @@ relock: | |||
2351 | 2342 | ||
2352 | if (sig_kernel_coredump(signr)) { | 2343 | if (sig_kernel_coredump(signr)) { |
2353 | if (print_fatal_signals) | 2344 | if (print_fatal_signals) |
2354 | print_fatal_signal(regs, info->si_signo); | 2345 | print_fatal_signal(info->si_signo); |
2355 | /* | 2346 | /* |
2356 | * If it was able to dump core, this kills all | 2347 | * If it was able to dump core, this kills all |
2357 | * other threads in the group and synchronizes with | 2348 | * other threads in the group and synchronizes with |
@@ -2360,7 +2351,7 @@ relock: | |||
2360 | * first and our do_group_exit call below will use | 2351 | * first and our do_group_exit call below will use |
2361 | * that value and ignore the one we pass it. | 2352 | * that value and ignore the one we pass it. |
2362 | */ | 2353 | */ |
2363 | do_coredump(info, regs); | 2354 | do_coredump(info); |
2364 | } | 2355 | } |
2365 | 2356 | ||
2366 | /* | 2357 | /* |
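Both stop paths above now sleep via freezable_schedule() instead of schedule() followed by a later try_to_freeze(), which is why get_signal_to_deliver() can hoist its single try_to_freeze() above the relock label. Conceptually the helper is roughly the following freezer handshake (a sketch of include/linux/freezer.h, not a verbatim copy):

    /* Leave freezer accounting, sleep, and rejoin it: being frozen while
     * blocked in schedule() here is safe, so no extra freeze check is
     * needed after waking up. */
    static inline void freezable_schedule_sketch(void)
    {
            freezer_do_not_count();
            schedule();
            freezer_count();
    }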
diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc0c2c9..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
221 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
222 | 222 | ||
223 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
224 | vtime_account(current); | 224 | vtime_account_irq_enter(current); |
225 | 225 | ||
226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
227 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
@@ -272,7 +272,7 @@ restart: | |||
272 | 272 | ||
273 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
274 | 274 | ||
275 | vtime_account(current); | 275 | vtime_account_irq_exit(current); |
276 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
278 | } | 278 | } |
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void) | |||
341 | */ | 341 | */ |
342 | void irq_exit(void) | 342 | void irq_exit(void) |
343 | { | 343 | { |
344 | vtime_account(current); | 344 | vtime_account_irq_exit(current); |
345 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd844..2b859828cdc3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -16,8 +16,10 @@ | |||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2006 | 18 | * Copyright (C) IBM Corporation, 2006 |
19 | * Copyright (C) Fujitsu, 2012 | ||
19 | * | 20 | * |
20 | * Author: Paul McKenney <paulmck@us.ibm.com> | 21 | * Author: Paul McKenney <paulmck@us.ibm.com> |
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
21 | * | 23 | * |
22 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
23 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
@@ -34,6 +36,10 @@ | |||
34 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 37 | #include <linux/srcu.h> |
36 | 38 | ||
39 | #include <trace/events/rcu.h> | ||
40 | |||
41 | #include "rcu.h" | ||
42 | |||
37 | /* | 43 | /* |
38 | * Initialize an rcu_batch structure to empty. | 44 | * Initialize an rcu_batch structure to empty. |
39 | */ | 45 | */ |
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | |||
92 | } | 98 | } |
93 | } | 99 | } |
94 | 100 | ||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
98 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 101 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
99 | { | 102 | { |
100 | sp->completed = 0; | 103 | sp->completed = 0; |
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
464 | */ | 467 | */ |
465 | void synchronize_srcu(struct srcu_struct *sp) | 468 | void synchronize_srcu(struct srcu_struct *sp) |
466 | { | 469 | { |
467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); | 470 | __synchronize_srcu(sp, rcu_expedited |
471 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | ||
472 | : SYNCHRONIZE_SRCU_TRYCOUNT); | ||
468 | } | 473 | } |
469 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 474 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
470 | 475 | ||
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp) | |||
637 | /* | 642 | /* |
638 | * This is the work-queue function that handles SRCU grace periods. | 643 | * This is the work-queue function that handles SRCU grace periods. |
639 | */ | 644 | */ |
640 | static void process_srcu(struct work_struct *work) | 645 | void process_srcu(struct work_struct *work) |
641 | { | 646 | { |
642 | struct srcu_struct *sp; | 647 | struct srcu_struct *sp; |
643 | 648 | ||
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work) | |||
648 | srcu_invoke_callbacks(sp); | 653 | srcu_invoke_callbacks(sp); |
649 | srcu_reschedule(sp); | 654 | srcu_reschedule(sp); |
650 | } | 655 | } |
656 | EXPORT_SYMBOL_GPL(process_srcu); | ||
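Exporting process_srcu() and picking the try count from rcu_expedited changes nothing for SRCU users; the usual pairing of a read-side critical section with a grace-period wait still looks roughly like this (the data and domain names are made up, the SRCU calls are the real API):

    #include <linux/srcu.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    static struct srcu_struct my_srcu;      /* init_srcu_struct() at init time */
    static int *shared_ptr;

    static int reader(void)
    {
            int idx, val = 0;
            int *p;

            idx = srcu_read_lock(&my_srcu);
            p = srcu_dereference(shared_ptr, &my_srcu);
            if (p)
                    val = *p;
            srcu_read_unlock(&my_srcu, idx);
            return val;
    }

    static void update(int *newp)
    {
            int *old = shared_ptr;

            rcu_assign_pointer(shared_ptr, newp);
            synchronize_srcu(&my_srcu);     /* wait out pre-existing readers */
            kfree(old);
    }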
diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) | |||
1046 | cputime_t tgutime, tgstime, cutime, cstime; | 1046 | cputime_t tgutime, tgstime, cutime, cstime; |
1047 | 1047 | ||
1048 | spin_lock_irq(¤t->sighand->siglock); | 1048 | spin_lock_irq(¤t->sighand->siglock); |
1049 | thread_group_times(current, &tgutime, &tgstime); | 1049 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
1050 | cutime = current->signal->cutime; | 1050 | cutime = current->signal->cutime; |
1051 | cstime = current->signal->cstime; | 1051 | cstime = current->signal->cstime; |
1052 | spin_unlock_irq(¤t->sighand->siglock); | 1052 | spin_unlock_irq(¤t->sighand->siglock); |
@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1704 | utime = stime = 0; | 1704 | utime = stime = 0; |
1705 | 1705 | ||
1706 | if (who == RUSAGE_THREAD) { | 1706 | if (who == RUSAGE_THREAD) { |
1707 | task_times(current, &utime, &stime); | 1707 | task_cputime_adjusted(current, &utime, &stime); |
1708 | accumulate_thread_rusage(p, r); | 1708 | accumulate_thread_rusage(p, r); |
1709 | maxrss = p->signal->maxrss; | 1709 | maxrss = p->signal->maxrss; |
1710 | goto out; | 1710 | goto out; |
@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1730 | break; | 1730 | break; |
1731 | 1731 | ||
1732 | case RUSAGE_SELF: | 1732 | case RUSAGE_SELF: |
1733 | thread_group_times(p, &tgutime, &tgstime); | 1733 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1734 | utime += tgutime; | 1734 | utime += tgutime; |
1735 | stime += tgstime; | 1735 | stime += tgstime; |
1736 | r->ru_nvcsw += p->signal->nvcsw; | 1736 | r->ru_nvcsw += p->signal->nvcsw; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ | |||
256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
257 | static int min_wakeup_granularity_ns; /* 0 usecs */ | 257 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
259 | #ifdef CONFIG_SMP | ||
259 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
260 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
261 | #endif | 262 | #endif /* CONFIG_SMP */ |
263 | #endif /* CONFIG_SCHED_DEBUG */ | ||
262 | 264 | ||
263 | #ifdef CONFIG_COMPACTION | 265 | #ifdef CONFIG_COMPACTION |
264 | static int min_extfrag_threshold; | 266 | static int min_extfrag_threshold; |
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { | |||
301 | .extra1 = &min_wakeup_granularity_ns, | 303 | .extra1 = &min_wakeup_granularity_ns, |
302 | .extra2 = &max_wakeup_granularity_ns, | 304 | .extra2 = &max_wakeup_granularity_ns, |
303 | }, | 305 | }, |
306 | #ifdef CONFIG_SMP | ||
304 | { | 307 | { |
305 | .procname = "sched_tunable_scaling", | 308 | .procname = "sched_tunable_scaling", |
306 | .data = &sysctl_sched_tunable_scaling, | 309 | .data = &sysctl_sched_tunable_scaling, |
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = { | |||
347 | .extra1 = &zero, | 350 | .extra1 = &zero, |
348 | .extra2 = &one, | 351 | .extra2 = &one, |
349 | }, | 352 | }, |
350 | #endif | 353 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | ||
355 | { | ||
356 | .procname = "numa_balancing_scan_delay_ms", | ||
357 | .data = &sysctl_numa_balancing_scan_delay, | ||
358 | .maxlen = sizeof(unsigned int), | ||
359 | .mode = 0644, | ||
360 | .proc_handler = proc_dointvec, | ||
361 | }, | ||
362 | { | ||
363 | .procname = "numa_balancing_scan_period_min_ms", | ||
364 | .data = &sysctl_numa_balancing_scan_period_min, | ||
365 | .maxlen = sizeof(unsigned int), | ||
366 | .mode = 0644, | ||
367 | .proc_handler = proc_dointvec, | ||
368 | }, | ||
369 | { | ||
370 | .procname = "numa_balancing_scan_period_reset", | ||
371 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
372 | .maxlen = sizeof(unsigned int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
377 | .procname = "numa_balancing_scan_period_max_ms", | ||
378 | .data = &sysctl_numa_balancing_scan_period_max, | ||
379 | .maxlen = sizeof(unsigned int), | ||
380 | .mode = 0644, | ||
381 | .proc_handler = proc_dointvec, | ||
382 | }, | ||
383 | { | ||
384 | .procname = "numa_balancing_scan_size_mb", | ||
385 | .data = &sysctl_numa_balancing_scan_size, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec, | ||
389 | }, | ||
390 | #endif /* CONFIG_NUMA_BALANCING */ | ||
391 | #endif /* CONFIG_SCHED_DEBUG */ | ||
351 | { | 392 | { |
352 | .procname = "sched_rt_period_us", | 393 | .procname = "sched_rt_period_us", |
353 | .data = &sysctl_sched_rt_period, | 394 | .data = &sysctl_sched_rt_period, |
@@ -565,7 +606,7 @@ static struct ctl_table kern_table[] = { | |||
565 | .extra2 = &one, | 606 | .extra2 = &one, |
566 | }, | 607 | }, |
567 | #endif | 608 | #endif |
568 | #ifdef CONFIG_HOTPLUG | 609 | |
569 | { | 610 | { |
570 | .procname = "hotplug", | 611 | .procname = "hotplug", |
571 | .data = &uevent_helper, | 612 | .data = &uevent_helper, |
@@ -573,7 +614,7 @@ static struct ctl_table kern_table[] = { | |||
573 | .mode = 0644, | 614 | .mode = 0644, |
574 | .proc_handler = proc_dostring, | 615 | .proc_handler = proc_dostring, |
575 | }, | 616 | }, |
576 | #endif | 617 | |
577 | #ifdef CONFIG_CHR_DEV_SG | 618 | #ifdef CONFIG_CHR_DEV_SG |
578 | { | 619 | { |
579 | .procname = "sg-big-buff", | 620 | .procname = "sg-big-buff", |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 6629bf7b5285..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) | |||
58 | return (cycle_t) jiffies; | 58 | return (cycle_t) jiffies; |
59 | } | 59 | } |
60 | 60 | ||
61 | struct clocksource clocksource_jiffies = { | 61 | static struct clocksource clocksource_jiffies = { |
62 | .name = "jiffies", | 62 | .name = "jiffies", |
63 | .rating = 1, /* lowest valid rating*/ | 63 | .rating = 1, /* lowest valid rating*/ |
64 | .read = jiffies_read, | 64 | .read = jiffies_read, |
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = { | |||
67 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
68 | }; | 68 | }; |
69 | 69 | ||
70 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | ||
71 | |||
70 | #if (BITS_PER_LONG < 64) | 72 | #if (BITS_PER_LONG < 64) |
71 | u64 get_jiffies_64(void) | 73 | u64 get_jiffies_64(void) |
72 | { | 74 | { |
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void) | |||
74 | u64 ret; | 76 | u64 ret; |
75 | 77 | ||
76 | do { | 78 | do { |
77 | seq = read_seqbegin(&xtime_lock); | 79 | seq = read_seqbegin(&jiffies_lock); |
78 | ret = jiffies_64; | 80 | ret = jiffies_64; |
79 | } while (read_seqretry(&xtime_lock, seq)); | 81 | } while (read_seqretry(&jiffies_lock, seq)); |
80 | return ret; | 82 | return ret; |
81 | } | 83 | } |
82 | EXPORT_SYMBOL(get_jiffies_64); | 84 | EXPORT_SYMBOL(get_jiffies_64); |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) | |||
63 | static void tick_periodic(int cpu) | 63 | static void tick_periodic(int cpu) |
64 | { | 64 | { |
65 | if (tick_do_timer_cpu == cpu) { | 65 | if (tick_do_timer_cpu == cpu) { |
66 | write_seqlock(&xtime_lock); | 66 | write_seqlock(&jiffies_lock); |
67 | 67 | ||
68 | /* Keep track of the next tick event */ | 68 | /* Keep track of the next tick event */ |
69 | tick_next_period = ktime_add(tick_next_period, tick_period); | 69 | tick_next_period = ktime_add(tick_next_period, tick_period); |
70 | 70 | ||
71 | do_timer(1); | 71 | do_timer(1); |
72 | write_sequnlock(&xtime_lock); | 72 | write_sequnlock(&jiffies_lock); |
73 | } | 73 | } |
74 | 74 | ||
75 | update_process_times(user_mode(get_irq_regs())); | 75 | update_process_times(user_mode(get_irq_regs())); |
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
130 | ktime_t next; | 130 | ktime_t next; |
131 | 131 | ||
132 | do { | 132 | do { |
133 | seq = read_seqbegin(&xtime_lock); | 133 | seq = read_seqbegin(&jiffies_lock); |
134 | next = tick_next_period; | 134 | next = tick_next_period; |
135 | } while (read_seqretry(&xtime_lock, seq)); | 135 | } while (read_seqretry(&jiffies_lock, seq)); |
136 | 136 | ||
137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
138 | 138 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
141 | #endif | 141 | #endif |
142 | 142 | ||
143 | extern void do_timer(unsigned long ticks); | 143 | extern void do_timer(unsigned long ticks); |
144 | extern seqlock_t xtime_lock; | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..d58e552d9fd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -31,7 +31,7 @@ | |||
31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * The time, when the last jiffy update happened. Protected by xtime_lock. | 34 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
35 | */ | 35 | */ |
36 | static ktime_t last_jiffies_update; | 36 | static ktime_t last_jiffies_update; |
37 | 37 | ||
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
49 | ktime_t delta; | 49 | ktime_t delta; |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Do a quick check without holding xtime_lock: | 52 | * Do a quick check without holding jiffies_lock: |
53 | */ | 53 | */ |
54 | delta = ktime_sub(now, last_jiffies_update); | 54 | delta = ktime_sub(now, last_jiffies_update); |
55 | if (delta.tv64 < tick_period.tv64) | 55 | if (delta.tv64 < tick_period.tv64) |
56 | return; | 56 | return; |
57 | 57 | ||
58 | /* Reevaluate with xtime_lock held */ | 58 | /* Reevaluate with jiffies_lock held */ |
59 | write_seqlock(&xtime_lock); | 59 | write_seqlock(&jiffies_lock); |
60 | 60 | ||
61 | delta = ktime_sub(now, last_jiffies_update); | 61 | delta = ktime_sub(now, last_jiffies_update); |
62 | if (delta.tv64 >= tick_period.tv64) { | 62 | if (delta.tv64 >= tick_period.tv64) { |
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
79 | /* Keep the tick_next_period variable up to date */ | 79 | /* Keep the tick_next_period variable up to date */ |
80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 80 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
81 | } | 81 | } |
82 | write_sequnlock(&xtime_lock); | 82 | write_sequnlock(&jiffies_lock); |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
@@ -89,15 +89,58 @@ static ktime_t tick_init_jiffy_update(void) | |||
89 | { | 89 | { |
90 | ktime_t period; | 90 | ktime_t period; |
91 | 91 | ||
92 | write_seqlock(&xtime_lock); | 92 | write_seqlock(&jiffies_lock); |
93 | /* Did we start the jiffies update yet ? */ | 93 | /* Did we start the jiffies update yet ? */ |
94 | if (last_jiffies_update.tv64 == 0) | 94 | if (last_jiffies_update.tv64 == 0) |
95 | last_jiffies_update = tick_next_period; | 95 | last_jiffies_update = tick_next_period; |
96 | period = last_jiffies_update; | 96 | period = last_jiffies_update; |
97 | write_sequnlock(&xtime_lock); | 97 | write_sequnlock(&jiffies_lock); |
98 | return period; | 98 | return period; |
99 | } | 99 | } |
100 | 100 | ||
101 | |||
102 | static void tick_sched_do_timer(ktime_t now) | ||
103 | { | ||
104 | int cpu = smp_processor_id(); | ||
105 | |||
106 | #ifdef CONFIG_NO_HZ | ||
107 | /* | ||
108 | * Check if the do_timer duty was dropped. We don't care about | ||
109 | * concurrency: This happens only when the cpu in charge went | ||
110 | * into a long sleep. If two cpus happen to assign themselves to | ||
111 | * this duty, then the jiffies update is still serialized by | ||
112 | * jiffies_lock. | ||
113 | */ | ||
114 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
115 | tick_do_timer_cpu = cpu; | ||
116 | #endif | ||
117 | |||
118 | /* Check, if the jiffies need an update */ | ||
119 | if (tick_do_timer_cpu == cpu) | ||
120 | tick_do_update_jiffies64(now); | ||
121 | } | ||
122 | |||
123 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | ||
124 | { | ||
125 | #ifdef CONFIG_NO_HZ | ||
126 | /* | ||
127 | * When we are idle and the tick is stopped, we have to touch | ||
128 | * the watchdog as we might not schedule for a really long | ||
129 | * time. This happens on complete idle SMP systems while | ||
130 | * waiting on the login prompt. We also increment the "start of | ||
131 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
132 | * when we go busy again does not account too many ticks. | ||
133 | */ | ||
134 | if (ts->tick_stopped) { | ||
135 | touch_softlockup_watchdog(); | ||
136 | if (is_idle_task(current)) | ||
137 | ts->idle_jiffies++; | ||
138 | } | ||
139 | #endif | ||
140 | update_process_times(user_mode(regs)); | ||
141 | profile_tick(CPU_PROFILING); | ||
142 | } | ||
143 | |||
101 | /* | 144 | /* |
102 | * NOHZ - aka dynamic tick functionality | 145 | * NOHZ - aka dynamic tick functionality |
103 | */ | 146 | */ |
@@ -282,11 +325,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
282 | 325 | ||
283 | /* Read jiffies and the time when jiffies were updated last */ | 326 | /* Read jiffies and the time when jiffies were updated last */ |
284 | do { | 327 | do { |
285 | seq = read_seqbegin(&xtime_lock); | 328 | seq = read_seqbegin(&jiffies_lock); |
286 | last_update = last_jiffies_update; | 329 | last_update = last_jiffies_update; |
287 | last_jiffies = jiffies; | 330 | last_jiffies = jiffies; |
288 | time_delta = timekeeping_max_deferment(); | 331 | time_delta = timekeeping_max_deferment(); |
289 | } while (read_seqretry(&xtime_lock, seq)); | 332 | } while (read_seqretry(&jiffies_lock, seq)); |
290 | 333 | ||
291 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || | 334 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
292 | arch_needs_cpu(cpu)) { | 335 | arch_needs_cpu(cpu)) { |
@@ -526,6 +569,8 @@ void tick_nohz_irq_exit(void) | |||
526 | if (!ts->inidle) | 569 | if (!ts->inidle) |
527 | return; | 570 | return; |
528 | 571 | ||
572 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
573 | menu_hrtimer_cancel(); | ||
529 | __tick_nohz_idle_enter(ts); | 574 | __tick_nohz_idle_enter(ts); |
530 | } | 575 | } |
531 | 576 | ||
@@ -621,6 +666,8 @@ void tick_nohz_idle_exit(void) | |||
621 | 666 | ||
622 | ts->inidle = 0; | 667 | ts->inidle = 0; |
623 | 668 | ||
669 | /* Cancel the timer because the CPU has already woken up from the C-states */ | ||
670 | menu_hrtimer_cancel(); | ||
624 | if (ts->idle_active || ts->tick_stopped) | 671 | if (ts->idle_active || ts->tick_stopped) |
625 | now = ktime_get(); | 672 | now = ktime_get(); |
626 | 673 | ||
@@ -648,40 +695,12 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
648 | { | 695 | { |
649 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 696 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
650 | struct pt_regs *regs = get_irq_regs(); | 697 | struct pt_regs *regs = get_irq_regs(); |
651 | int cpu = smp_processor_id(); | ||
652 | ktime_t now = ktime_get(); | 698 | ktime_t now = ktime_get(); |
653 | 699 | ||
654 | dev->next_event.tv64 = KTIME_MAX; | 700 | dev->next_event.tv64 = KTIME_MAX; |
655 | 701 | ||
656 | /* | 702 | tick_sched_do_timer(now); |
657 | * Check if the do_timer duty was dropped. We don't care about | 703 | tick_sched_handle(ts, regs); |
658 | * concurrency: This happens only when the cpu in charge went | ||
659 | * into a long sleep. If two cpus happen to assign themself to | ||
660 | * this duty, then the jiffies update is still serialized by | ||
661 | * xtime_lock. | ||
662 | */ | ||
663 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
664 | tick_do_timer_cpu = cpu; | ||
665 | |||
666 | /* Check, if the jiffies need an update */ | ||
667 | if (tick_do_timer_cpu == cpu) | ||
668 | tick_do_update_jiffies64(now); | ||
669 | |||
670 | /* | ||
671 | * When we are idle and the tick is stopped, we have to touch | ||
672 | * the watchdog as we might not schedule for a really long | ||
673 | * time. This happens on complete idle SMP systems while | ||
674 | * waiting on the login prompt. We also increment the "start | ||
675 | * of idle" jiffy stamp so the idle accounting adjustment we | ||
676 | * do when we go busy again does not account too much ticks. | ||
677 | */ | ||
678 | if (ts->tick_stopped) { | ||
679 | touch_softlockup_watchdog(); | ||
680 | ts->idle_jiffies++; | ||
681 | } | ||
682 | |||
683 | update_process_times(user_mode(regs)); | ||
684 | profile_tick(CPU_PROFILING); | ||
685 | 704 | ||
686 | while (tick_nohz_reprogram(ts, now)) { | 705 | while (tick_nohz_reprogram(ts, now)) { |
687 | now = ktime_get(); | 706 | now = ktime_get(); |
@@ -794,7 +813,7 @@ void tick_check_idle(int cpu) | |||
794 | #ifdef CONFIG_HIGH_RES_TIMERS | 813 | #ifdef CONFIG_HIGH_RES_TIMERS |
795 | /* | 814 | /* |
796 | * We rearm the timer until we get disabled by the idle code. | 815 | * We rearm the timer until we get disabled by the idle code. |
797 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 816 | * Called with interrupts disabled. |
798 | */ | 817 | */ |
799 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 818 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
800 | { | 819 | { |
@@ -802,45 +821,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
802 | container_of(timer, struct tick_sched, sched_timer); | 821 | container_of(timer, struct tick_sched, sched_timer); |
803 | struct pt_regs *regs = get_irq_regs(); | 822 | struct pt_regs *regs = get_irq_regs(); |
804 | ktime_t now = ktime_get(); | 823 | ktime_t now = ktime_get(); |
805 | int cpu = smp_processor_id(); | ||
806 | 824 | ||
807 | #ifdef CONFIG_NO_HZ | 825 | tick_sched_do_timer(now); |
808 | /* | ||
809 | * Check if the do_timer duty was dropped. We don't care about | ||
810 | * concurrency: This happens only when the cpu in charge went | ||
811 | * into a long sleep. If two cpus happen to assign themself to | ||
812 | * this duty, then the jiffies update is still serialized by | ||
813 | * xtime_lock. | ||
814 | */ | ||
815 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
816 | tick_do_timer_cpu = cpu; | ||
817 | #endif | ||
818 | |||
819 | /* Check, if the jiffies need an update */ | ||
820 | if (tick_do_timer_cpu == cpu) | ||
821 | tick_do_update_jiffies64(now); | ||
822 | 826 | ||
823 | /* | 827 | /* |
824 | * Do not call, when we are not in irq context and have | 828 | * Do not call, when we are not in irq context and have |
825 | * no valid regs pointer | 829 | * no valid regs pointer |
826 | */ | 830 | */ |
827 | if (regs) { | 831 | if (regs) |
828 | /* | 832 | tick_sched_handle(ts, regs); |
829 | * When we are idle and the tick is stopped, we have to touch | ||
830 | * the watchdog as we might not schedule for a really long | ||
831 | * time. This happens on complete idle SMP systems while | ||
832 | * waiting on the login prompt. We also increment the "start of | ||
833 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
834 | * when we go busy again does not account too much ticks. | ||
835 | */ | ||
836 | if (ts->tick_stopped) { | ||
837 | touch_softlockup_watchdog(); | ||
838 | if (is_idle_task(current)) | ||
839 | ts->idle_jiffies++; | ||
840 | } | ||
841 | update_process_times(user_mode(regs)); | ||
842 | profile_tick(CPU_PROFILING); | ||
843 | } | ||
844 | 833 | ||
845 | hrtimer_forward(timer, now, tick_period); | 834 | hrtimer_forward(timer, now, tick_period); |
846 | 835 | ||
@@ -874,7 +863,7 @@ void tick_setup_sched_timer(void) | |||
874 | /* Get the next period (per cpu) */ | 863 | /* Get the next period (per cpu) */ |
875 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 864 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
876 | 865 | ||
877 | /* Offset the tick to avert xtime_lock contention. */ | 866 | /* Offset the tick to avert jiffies_lock contention. */ |
878 | if (sched_skew_tick) { | 867 | if (sched_skew_tick) { |
879 | u64 offset = ktime_to_ns(tick_period) >> 1; | 868 | u64 offset = ktime_to_ns(tick_period) >> 1; |
880 | do_div(offset, num_possible_cpus()); | 869 | do_div(offset, num_possible_cpus()); |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null | |||
@@ -1,193 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Intel Corporation. | ||
3 | * Author: Patrick Ohly <patrick.ohly@intel.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | */ | ||
19 | |||
20 | #include <linux/timecompare.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/math64.h> | ||
24 | #include <linux/kernel.h> | ||
25 | |||
26 | /* | ||
27 | * fixed point arithmetic scale factor for skew | ||
28 | * | ||
29 | * Usually one would measure skew in ppb (parts per billion, 1e9), but | ||
30 | * using a factor of 2 simplifies the math. | ||
31 | */ | ||
32 | #define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) | ||
33 | |||
34 | ktime_t timecompare_transform(struct timecompare *sync, | ||
35 | u64 source_tstamp) | ||
36 | { | ||
37 | u64 nsec; | ||
38 | |||
39 | nsec = source_tstamp + sync->offset; | ||
40 | nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / | ||
41 | TIMECOMPARE_SKEW_RESOLUTION; | ||
42 | |||
43 | return ns_to_ktime(nsec); | ||
44 | } | ||
45 | EXPORT_SYMBOL_GPL(timecompare_transform); | ||
46 | |||
47 | int timecompare_offset(struct timecompare *sync, | ||
48 | s64 *offset, | ||
49 | u64 *source_tstamp) | ||
50 | { | ||
51 | u64 start_source = 0, end_source = 0; | ||
52 | struct { | ||
53 | s64 offset; | ||
54 | s64 duration_target; | ||
55 | } buffer[10], sample, *samples; | ||
56 | int counter = 0, i; | ||
57 | int used; | ||
58 | int index; | ||
59 | int num_samples = sync->num_samples; | ||
60 | |||
61 | if (num_samples > ARRAY_SIZE(buffer)) { | ||
62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | ||
63 | if (!samples) { | ||
64 | samples = buffer; | ||
65 | num_samples = ARRAY_SIZE(buffer); | ||
66 | } | ||
67 | } else { | ||
68 | samples = buffer; | ||
69 | } | ||
70 | |||
71 | /* run until we have enough valid samples, but do not try forever */ | ||
72 | i = 0; | ||
73 | counter = 0; | ||
74 | while (1) { | ||
75 | u64 ts; | ||
76 | ktime_t start, end; | ||
77 | |||
78 | start = sync->target(); | ||
79 | ts = timecounter_read(sync->source); | ||
80 | end = sync->target(); | ||
81 | |||
82 | if (!i) | ||
83 | start_source = ts; | ||
84 | |||
85 | /* ignore negative durations */ | ||
86 | sample.duration_target = ktime_to_ns(ktime_sub(end, start)); | ||
87 | if (sample.duration_target >= 0) { | ||
88 | /* | ||
89 | * assume symetric delay to and from source: | ||
90 | * average target time corresponds to measured | ||
91 | * source time | ||
92 | */ | ||
93 | sample.offset = | ||
94 | (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - | ||
95 | ts; | ||
96 | |||
97 | /* simple insertion sort based on duration */ | ||
98 | index = counter - 1; | ||
99 | while (index >= 0) { | ||
100 | if (samples[index].duration_target < | ||
101 | sample.duration_target) | ||
102 | break; | ||
103 | samples[index + 1] = samples[index]; | ||
104 | index--; | ||
105 | } | ||
106 | samples[index + 1] = sample; | ||
107 | counter++; | ||
108 | } | ||
109 | |||
110 | i++; | ||
111 | if (counter >= num_samples || i >= 100000) { | ||
112 | end_source = ts; | ||
113 | break; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | *source_tstamp = (end_source + start_source) / 2; | ||
118 | |||
119 | /* remove outliers by only using 75% of the samples */ | ||
120 | used = counter * 3 / 4; | ||
121 | if (!used) | ||
122 | used = counter; | ||
123 | if (used) { | ||
124 | /* calculate average */ | ||
125 | s64 off = 0; | ||
126 | for (index = 0; index < used; index++) | ||
127 | off += samples[index].offset; | ||
128 | *offset = div_s64(off, used); | ||
129 | } | ||
130 | |||
131 | if (samples && samples != buffer) | ||
132 | kfree(samples); | ||
133 | |||
134 | return used; | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(timecompare_offset); | ||
137 | |||
138 | void __timecompare_update(struct timecompare *sync, | ||
139 | u64 source_tstamp) | ||
140 | { | ||
141 | s64 offset; | ||
142 | u64 average_time; | ||
143 | |||
144 | if (!timecompare_offset(sync, &offset, &average_time)) | ||
145 | return; | ||
146 | |||
147 | if (!sync->last_update) { | ||
148 | sync->last_update = average_time; | ||
149 | sync->offset = offset; | ||
150 | sync->skew = 0; | ||
151 | } else { | ||
152 | s64 delta_nsec = average_time - sync->last_update; | ||
153 | |||
154 | /* avoid division by negative or small deltas */ | ||
155 | if (delta_nsec >= 10000) { | ||
156 | s64 delta_offset_nsec = offset - sync->offset; | ||
157 | s64 skew; /* delta_offset_nsec * | ||
158 | TIMECOMPARE_SKEW_RESOLUTION / | ||
159 | delta_nsec */ | ||
160 | u64 divisor; | ||
161 | |||
162 | /* div_s64() is limited to 32 bit divisor */ | ||
163 | skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; | ||
164 | divisor = delta_nsec; | ||
165 | while (unlikely(divisor >= ((s64)1) << 32)) { | ||
166 | /* divide both by 2; beware, right shift | ||
167 | of negative value has undefined | ||
168 | behavior and can only be used for | ||
169 | the positive divisor */ | ||
170 | skew = div_s64(skew, 2); | ||
171 | divisor >>= 1; | ||
172 | } | ||
173 | skew = div_s64(skew, divisor); | ||
174 | |||
175 | /* | ||
176 | * Calculate new overall skew as 4/16 the | ||
177 | * old value and 12/16 the new one. This is | ||
178 | * a rather arbitrary tradeoff between | ||
179 | * only using the latest measurement (0/16 and | ||
180 | * 16/16) and even more weight on past measurements. | ||
181 | */ | ||
182 | #define TIMECOMPARE_NEW_SKEW_PER_16 12 | ||
183 | sync->skew = | ||
184 | div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * | ||
185 | sync->skew + | ||
186 | TIMECOMPARE_NEW_SKEW_PER_16 * skew, | ||
187 | 16); | ||
188 | sync->last_update = average_time; | ||
189 | sync->offset = offset; | ||
190 | } | ||
191 | } | ||
192 | } | ||
193 | EXPORT_SYMBOL_GPL(__timecompare_update); | ||
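The deleted timecompare helper estimated a clock's skew in fixed point with a 2^30 scale factor and then applied the stored offset plus skew-scaled elapsed time to each source timestamp. A small worked example of that arithmetic, detached from the removed kernel structures:

    #include <stdio.h>
    #include <stdint.h>

    #define SKEW_RESOLUTION (1LL << 30)     /* the scale the removed code used */

    int main(void)
    {
            int64_t skew = 0.0001 * SKEW_RESOLUTION;  /* ~100 ppm fast -> ~107374 */
            int64_t elapsed_ns = 1000000000;          /* 1 s since the last update */
            int64_t corr = elapsed_ns * skew / SKEW_RESOLUTION;

            /* roughly 100000 ns of correction per second of elapsed time */
            printf("%lld ns\n", (long long)corr);
            return 0;
    }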
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..cbc6acb0db3f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -21,16 +21,11 @@ | |||
21 | #include <linux/time.h> | 21 | #include <linux/time.h> |
22 | #include <linux/tick.h> | 22 | #include <linux/tick.h> |
23 | #include <linux/stop_machine.h> | 23 | #include <linux/stop_machine.h> |
24 | #include <linux/pvclock_gtod.h> | ||
24 | 25 | ||
25 | 26 | ||
26 | static struct timekeeper timekeeper; | 27 | static struct timekeeper timekeeper; |
27 | 28 | ||
28 | /* | ||
29 | * This read-write spinlock protects us from races in SMP while | ||
30 | * playing with xtime. | ||
31 | */ | ||
32 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
33 | |||
34 | /* flag for if timekeeping is suspended */ | 29 | /* flag for if timekeeping is suspended */ |
35 | int __read_mostly timekeeping_suspended; | 30 | int __read_mostly timekeeping_suspended; |
36 | 31 | ||
@@ -180,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
180 | return nsec + arch_gettimeoffset(); | 175 | return nsec + arch_gettimeoffset(); |
181 | } | 176 | } |
182 | 177 | ||
178 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | ||
179 | |||
180 | static void update_pvclock_gtod(struct timekeeper *tk) | ||
181 | { | ||
182 | raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * pvclock_gtod_register_notifier - register a pvclock timedata update listener | ||
187 | * | ||
188 | * Must hold write on timekeeper.lock | ||
189 | */ | ||
190 | int pvclock_gtod_register_notifier(struct notifier_block *nb) | ||
191 | { | ||
192 | struct timekeeper *tk = &timekeeper; | ||
193 | unsigned long flags; | ||
194 | int ret; | ||
195 | |||
196 | write_seqlock_irqsave(&tk->lock, flags); | ||
197 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); | ||
198 | /* update timekeeping data */ | ||
199 | update_pvclock_gtod(tk); | ||
200 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
201 | |||
202 | return ret; | ||
203 | } | ||
204 | EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); | ||
205 | |||
206 | /** | ||
207 | * pvclock_gtod_unregister_notifier - unregister a pvclock | ||
208 | * timedata update listener | ||
209 | * | ||
210 | * Must hold write on timekeeper.lock | ||
211 | */ | ||
212 | int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | ||
213 | { | ||
214 | struct timekeeper *tk = &timekeeper; | ||
215 | unsigned long flags; | ||
216 | int ret; | ||
217 | |||
218 | write_seqlock_irqsave(&tk->lock, flags); | ||
219 | ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); | ||
220 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
221 | |||
222 | return ret; | ||
223 | } | ||
224 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | ||
225 | |||
183 | /* must hold write on timekeeper.lock */ | 226 | /* must hold write on timekeeper.lock */ |
184 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) | 227 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
185 | { | 228 | { |
@@ -188,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) | |||
188 | ntp_clear(); | 231 | ntp_clear(); |
189 | } | 232 | } |
190 | update_vsyscall(tk); | 233 | update_vsyscall(tk); |
234 | update_pvclock_gtod(tk); | ||
191 | } | 235 | } |
192 | 236 | ||
193 | /** | 237 | /** |
@@ -1299,9 +1343,7 @@ struct timespec get_monotonic_coarse(void) | |||
1299 | } | 1343 | } |
1300 | 1344 | ||
1301 | /* | 1345 | /* |
1302 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 1346 | * Must hold jiffies_lock |
1303 | * without sampling the sequence number in xtime_lock. | ||
1304 | * jiffies is defined in the linker script... | ||
1305 | */ | 1347 | */ |
1306 | void do_timer(unsigned long ticks) | 1348 | void do_timer(unsigned long ticks) |
1307 | { | 1349 | { |
@@ -1389,7 +1431,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | |||
1389 | */ | 1431 | */ |
1390 | void xtime_update(unsigned long ticks) | 1432 | void xtime_update(unsigned long ticks) |
1391 | { | 1433 | { |
1392 | write_seqlock(&xtime_lock); | 1434 | write_seqlock(&jiffies_lock); |
1393 | do_timer(ticks); | 1435 | do_timer(ticks); |
1394 | write_sequnlock(&xtime_lock); | 1436 | write_sequnlock(&jiffies_lock); |
1395 | } | 1437 | } |
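The new pvclock_gtod notifier chain lets a paravirtualized clock (KVM's pvclock is the expected consumer) observe every timekeeping update. A hedged sketch of a listener built on the standard notifier_block API; the callback name and body are invented:

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/pvclock_gtod.h>

    static int my_gtod_update(struct notifier_block *nb, unsigned long unused,
                              void *priv)
    {
            /* priv is the struct timekeeper that was just updated; copy out
             * whatever fields the paravirt clock needs here. */
            return NOTIFY_OK;
    }

    static struct notifier_block my_gtod_nb = {
            .notifier_call = my_gtod_update,
    };

    static int __init my_gtod_init(void)
    {
            return pvclock_gtod_register_notifier(&my_gtod_nb);
    }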
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4cea4f41c1d9..5d89335a485f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -119,6 +119,7 @@ config TRACING | |||
119 | select BINARY_PRINTF | 119 | select BINARY_PRINTF |
120 | select EVENT_TRACING | 120 | select EVENT_TRACING |
121 | select TRACE_CLOCK | 121 | select TRACE_CLOCK |
122 | select IRQ_WORK | ||
122 | 123 | ||
123 | config GENERIC_TRACER | 124 | config GENERIC_TRACER |
124 | bool | 125 | bool |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9dcf15d38380..3ffe4c5ad3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * Based on code in the latency_tracer, that is: | 10 | * Based on code in the latency_tracer, that is: |
11 | * | 11 | * |
12 | * Copyright (C) 2004-2006 Ingo Molnar | 12 | * Copyright (C) 2004-2006 Ingo Molnar |
13 | * Copyright (C) 2004 William Lee Irwin III | 13 | * Copyright (C) 2004 Nadia Yvette Chambers |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/stop_machine.h> | 16 | #include <linux/stop_machine.h> |
@@ -2437,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
2437 | { | 2437 | { |
2438 | iter->pos = 0; | 2438 | iter->pos = 0; |
2439 | iter->func_pos = 0; | 2439 | iter->func_pos = 0; |
2440 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | 2440 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); |
2441 | } | 2441 | } |
2442 | 2442 | ||
2443 | static void *t_start(struct seq_file *m, loff_t *pos) | 2443 | static void *t_start(struct seq_file *m, loff_t *pos) |
@@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) | |||
2675 | } | 2675 | } |
2676 | 2676 | ||
2677 | loff_t | 2677 | loff_t |
2678 | ftrace_regex_lseek(struct file *file, loff_t offset, int origin) | 2678 | ftrace_regex_lseek(struct file *file, loff_t offset, int whence) |
2679 | { | 2679 | { |
2680 | loff_t ret; | 2680 | loff_t ret; |
2681 | 2681 | ||
2682 | if (file->f_mode & FMODE_READ) | 2682 | if (file->f_mode & FMODE_READ) |
2683 | ret = seq_lseek(file, offset, origin); | 2683 | ret = seq_lseek(file, offset, whence); |
2684 | else | 2684 | else |
2685 | file->f_pos = ret = 1; | 2685 | file->f_pos = ret = 1; |
2686 | 2686 | ||
@@ -2868,7 +2868,7 @@ static int __init ftrace_mod_cmd_init(void) | |||
2868 | { | 2868 | { |
2869 | return register_ftrace_command(&ftrace_mod_cmd); | 2869 | return register_ftrace_command(&ftrace_mod_cmd); |
2870 | } | 2870 | } |
2871 | device_initcall(ftrace_mod_cmd_init); | 2871 | core_initcall(ftrace_mod_cmd_init); |
2872 | 2872 | ||
2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | 2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, |
2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
@@ -4055,7 +4055,7 @@ static int __init ftrace_nodyn_init(void) | |||
4055 | ftrace_enabled = 1; | 4055 | ftrace_enabled = 1; |
4056 | return 0; | 4056 | return 0; |
4057 | } | 4057 | } |
4058 | device_initcall(ftrace_nodyn_init); | 4058 | core_initcall(ftrace_nodyn_init); |
4059 | 4059 | ||
4060 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4060 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
4061 | static inline void ftrace_startup_enable(int command) { } | 4061 | static inline void ftrace_startup_enable(int command) { } |
@@ -4381,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, | |||
4381 | if (strlen(tmp) == 0) | 4381 | if (strlen(tmp) == 0) |
4382 | return 1; | 4382 | return 1; |
4383 | 4383 | ||
4384 | ret = strict_strtol(tmp, 10, &val); | 4384 | ret = kstrtol(tmp, 10, &val); |
4385 | if (ret < 0) | 4385 | if (ret < 0) |
4386 | return ret; | 4386 | return ret; |
4387 | 4387 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b979426d16c6..ce8514feedcd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -460,9 +460,10 @@ struct ring_buffer_per_cpu { | |||
460 | unsigned long lost_events; | 460 | unsigned long lost_events; |
461 | unsigned long last_overrun; | 461 | unsigned long last_overrun; |
462 | local_t entries_bytes; | 462 | local_t entries_bytes; |
463 | local_t commit_overrun; | ||
464 | local_t overrun; | ||
465 | local_t entries; | 463 | local_t entries; |
464 | local_t overrun; | ||
465 | local_t commit_overrun; | ||
466 | local_t dropped_events; | ||
466 | local_t committing; | 467 | local_t committing; |
467 | local_t commits; | 468 | local_t commits; |
468 | unsigned long read; | 469 | unsigned long read; |
@@ -1396,6 +1397,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
1396 | struct list_head *head_page_with_bit; | 1397 | struct list_head *head_page_with_bit; |
1397 | 1398 | ||
1398 | head_page = &rb_set_head_page(cpu_buffer)->list; | 1399 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1400 | if (!head_page) | ||
1401 | break; | ||
1399 | prev_page = head_page->prev; | 1402 | prev_page = head_page->prev; |
1400 | 1403 | ||
1401 | first_page = pages->next; | 1404 | first_page = pages->next; |
@@ -1820,7 +1823,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | |||
1820 | } | 1823 | } |
1821 | 1824 | ||
1822 | /** | 1825 | /** |
1823 | * ring_buffer_update_event - update event type and data | 1826 | * rb_update_event - update event type and data |
1824 | * @event: the event to update | 1827 | * @event: the event to update |
1825 | * @type: the type of event | 1828 | * @type: the type of event |
1826 | * @length: the size of the event field in the ring buffer | 1829 | * @length: the size of the event field in the ring buffer |
@@ -2155,8 +2158,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
2155 | * If we are not in overwrite mode, | 2158 | * If we are not in overwrite mode, |
2156 | * this is easy, just stop here. | 2159 | * this is easy, just stop here. |
2157 | */ | 2160 | */ |
2158 | if (!(buffer->flags & RB_FL_OVERWRITE)) | 2161 | if (!(buffer->flags & RB_FL_OVERWRITE)) { |
2162 | local_inc(&cpu_buffer->dropped_events); | ||
2159 | goto out_reset; | 2163 | goto out_reset; |
2164 | } | ||
2160 | 2165 | ||
2161 | ret = rb_handle_head_page(cpu_buffer, | 2166 | ret = rb_handle_head_page(cpu_buffer, |
2162 | tail_page, | 2167 | tail_page, |
@@ -2720,8 +2725,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | |||
2720 | * and not the length of the event which would hold the header. | 2725 | * and not the length of the event which would hold the header. |
2721 | */ | 2726 | */ |
2722 | int ring_buffer_write(struct ring_buffer *buffer, | 2727 | int ring_buffer_write(struct ring_buffer *buffer, |
2723 | unsigned long length, | 2728 | unsigned long length, |
2724 | void *data) | 2729 | void *data) |
2725 | { | 2730 | { |
2726 | struct ring_buffer_per_cpu *cpu_buffer; | 2731 | struct ring_buffer_per_cpu *cpu_buffer; |
2727 | struct ring_buffer_event *event; | 2732 | struct ring_buffer_event *event; |
@@ -2929,12 +2934,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | |||
2929 | * @buffer: The ring buffer | 2934 | * @buffer: The ring buffer |
2930 | * @cpu: The per CPU buffer to read from. | 2935 | * @cpu: The per CPU buffer to read from. |
2931 | */ | 2936 | */ |
2932 | unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | 2937 | u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) |
2933 | { | 2938 | { |
2934 | unsigned long flags; | 2939 | unsigned long flags; |
2935 | struct ring_buffer_per_cpu *cpu_buffer; | 2940 | struct ring_buffer_per_cpu *cpu_buffer; |
2936 | struct buffer_page *bpage; | 2941 | struct buffer_page *bpage; |
2937 | unsigned long ret; | 2942 | u64 ret = 0; |
2938 | 2943 | ||
2939 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2944 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2940 | return 0; | 2945 | return 0; |
@@ -2949,7 +2954,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | |||
2949 | bpage = cpu_buffer->reader_page; | 2954 | bpage = cpu_buffer->reader_page; |
2950 | else | 2955 | else |
2951 | bpage = rb_set_head_page(cpu_buffer); | 2956 | bpage = rb_set_head_page(cpu_buffer); |
2952 | ret = bpage->page->time_stamp; | 2957 | if (bpage) |
2958 | ret = bpage->page->time_stamp; | ||
2953 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 2959 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
2954 | 2960 | ||
2955 | return ret; | 2961 | return ret; |
@@ -2995,7 +3001,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | |||
2995 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 3001 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2996 | 3002 | ||
2997 | /** | 3003 | /** |
2998 | * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer | 3004 | * ring_buffer_overrun_cpu - get the number of overruns caused by the ring |
3005 | * buffer wrapping around (only if RB_FL_OVERWRITE is on). | ||
2999 | * @buffer: The ring buffer | 3006 | * @buffer: The ring buffer |
3000 | * @cpu: The per CPU buffer to get the number of overruns from | 3007 | * @cpu: The per CPU buffer to get the number of overruns from |
3001 | */ | 3008 | */ |
@@ -3015,7 +3022,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3015 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); | 3022 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); |
3016 | 3023 | ||
3017 | /** | 3024 | /** |
3018 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits | 3025 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by |
3026 | * commits failing due to the buffer wrapping around while there are uncommitted | ||
3027 | * events, such as during an interrupt storm. | ||
3019 | * @buffer: The ring buffer | 3028 | * @buffer: The ring buffer |
3020 | * @cpu: The per CPU buffer to get the number of overruns from | 3029 | * @cpu: The per CPU buffer to get the number of overruns from |
3021 | */ | 3030 | */ |
@@ -3036,6 +3045,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
3036 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); | 3045 | EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); |
3037 | 3046 | ||
3038 | /** | 3047 | /** |
3048 | * ring_buffer_dropped_events_cpu - get the number of dropped events caused by | ||
3049 | * the ring buffer filling up (only if RB_FL_OVERWRITE is off). | ||
3050 | * @buffer: The ring buffer | ||
3051 | * @cpu: The per CPU buffer to get the number of dropped events from | ||
3052 | */ | ||
3053 | unsigned long | ||
3054 | ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) | ||
3055 | { | ||
3056 | struct ring_buffer_per_cpu *cpu_buffer; | ||
3057 | unsigned long ret; | ||
3058 | |||
3059 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3060 | return 0; | ||
3061 | |||
3062 | cpu_buffer = buffer->buffers[cpu]; | ||
3063 | ret = local_read(&cpu_buffer->dropped_events); | ||
3064 | |||
3065 | return ret; | ||
3066 | } | ||
3067 | EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); | ||
3068 | |||
3069 | /** | ||
3039 | * ring_buffer_entries - get the number of entries in a buffer | 3070 | * ring_buffer_entries - get the number of entries in a buffer |
3040 | * @buffer: The ring buffer | 3071 | * @buffer: The ring buffer |
3041 | * | 3072 | * |
@@ -3260,6 +3291,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
3260 | * Splice the empty reader page into the list around the head. | 3291 | * Splice the empty reader page into the list around the head. |
3261 | */ | 3292 | */ |
3262 | reader = rb_set_head_page(cpu_buffer); | 3293 | reader = rb_set_head_page(cpu_buffer); |
3294 | if (!reader) | ||
3295 | goto out; | ||
3263 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); | 3296 | cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); |
3264 | cpu_buffer->reader_page->list.prev = reader->list.prev; | 3297 | cpu_buffer->reader_page->list.prev = reader->list.prev; |
3265 | 3298 | ||
@@ -3778,12 +3811,17 @@ void | |||
3778 | ring_buffer_read_finish(struct ring_buffer_iter *iter) | 3811 | ring_buffer_read_finish(struct ring_buffer_iter *iter) |
3779 | { | 3812 | { |
3780 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3813 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3814 | unsigned long flags; | ||
3781 | 3815 | ||
3782 | /* | 3816 | /* |
3783 | * Ring buffer is disabled from recording, here's a good place | 3817 | * Ring buffer is disabled from recording, here's a good place |
3784 | * to check the integrity of the ring buffer. | 3818 | * to check the integrity of the ring buffer. |
3819 | * Must prevent readers from trying to read, as the check | ||
3820 | * clears the HEAD page and readers require it. | ||
3785 | */ | 3821 | */ |
3822 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
3786 | rb_check_pages(cpu_buffer); | 3823 | rb_check_pages(cpu_buffer); |
3824 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
3787 | 3825 | ||
3788 | atomic_dec(&cpu_buffer->record_disabled); | 3826 | atomic_dec(&cpu_buffer->record_disabled); |
3789 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | 3827 | atomic_dec(&cpu_buffer->buffer->resize_disabled); |
@@ -3864,9 +3902,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3864 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3902 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
3865 | cpu_buffer->reader_page->read = 0; | 3903 | cpu_buffer->reader_page->read = 0; |
3866 | 3904 | ||
3867 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3868 | local_set(&cpu_buffer->entries_bytes, 0); | 3905 | local_set(&cpu_buffer->entries_bytes, 0); |
3869 | local_set(&cpu_buffer->overrun, 0); | 3906 | local_set(&cpu_buffer->overrun, 0); |
3907 | local_set(&cpu_buffer->commit_overrun, 0); | ||
3908 | local_set(&cpu_buffer->dropped_events, 0); | ||
3870 | local_set(&cpu_buffer->entries, 0); | 3909 | local_set(&cpu_buffer->entries, 0); |
3871 | local_set(&cpu_buffer->committing, 0); | 3910 | local_set(&cpu_buffer->committing, 0); |
3872 | local_set(&cpu_buffer->commits, 0); | 3911 | local_set(&cpu_buffer->commits, 0); |
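For context on the new dropped_events counter: the per-CPU buffer now tracks three distinct kinds of loss, each with its own accessor. A hedged sketch of an in-kernel caller follows; report_losses() is a hypothetical helper, not part of this patch, but the three accessors are the ones shown above.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

/* Hypothetical helper: print the three per-CPU loss counters. */
static void report_losses(struct ring_buffer *buffer, int cpu)
{
	/* old events overwritten because the buffer wrapped (overwrite mode on) */
	unsigned long overrun = ring_buffer_overrun_cpu(buffer, cpu);
	/* events lost because the buffer wrapped into still-uncommitted events */
	unsigned long commit_overrun = ring_buffer_commit_overrun_cpu(buffer, cpu);
	/* writes refused because the buffer filled up (overwrite mode off) */
	unsigned long dropped = ring_buffer_dropped_events_cpu(buffer, cpu);

	pr_info("cpu%d: overrun=%lu commit_overrun=%lu dropped_events=%lu\n",
		cpu, overrun, commit_overrun, dropped);
}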
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 31e4f55773f1..61e081b4ba11 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Based on code from the latency_tracer, that is: | 10 | * Based on code from the latency_tracer, that is: |
11 | * Copyright (C) 2004-2006 Ingo Molnar | 11 | * Copyright (C) 2004-2006 Ingo Molnar |
12 | * Copyright (C) 2004 William Lee Irwin III | 12 | * Copyright (C) 2004 Nadia Yvette Chambers |
13 | */ | 13 | */ |
14 | #include <linux/ring_buffer.h> | 14 | #include <linux/ring_buffer.h> |
15 | #include <generated/utsrelease.h> | 15 | #include <generated/utsrelease.h> |
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/irq_work.h> | ||
22 | #include <linux/debugfs.h> | 23 | #include <linux/debugfs.h> |
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
@@ -78,6 +79,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) | |||
78 | } | 79 | } |
79 | 80 | ||
80 | /* | 81 | /* |
82 | * To prevent the comm cache from being overwritten when no | ||
83 | * tracing is active, only save the comm when a trace event | ||
84 | * occurred. | ||
85 | */ | ||
86 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | ||
87 | |||
88 | /* | ||
89 | * When a reader is waiting for data, then this variable is | ||
90 | * set to true. | ||
91 | */ | ||
92 | static bool trace_wakeup_needed; | ||
93 | |||
94 | static struct irq_work trace_work_wakeup; | ||
95 | |||
96 | /* | ||
81 | * Kill all tracing for good (never come back). | 97 | * Kill all tracing for good (never come back). |
82 | * It is initialized to 1 but will turn to zero if the initialization | 98 | * It is initialized to 1 but will turn to zero if the initialization |
83 | * of the tracer is successful. But that is the only place that sets | 99 | * of the tracer is successful. But that is the only place that sets |
@@ -139,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str) | |||
139 | } | 155 | } |
140 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | 156 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); |
141 | 157 | ||
158 | |||
159 | static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; | ||
160 | static char *trace_boot_options __initdata; | ||
161 | |||
162 | static int __init set_trace_boot_options(char *str) | ||
163 | { | ||
164 | strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); | ||
165 | trace_boot_options = trace_boot_options_buf; | ||
166 | return 0; | ||
167 | } | ||
168 | __setup("trace_options=", set_trace_boot_options); | ||
169 | |||
142 | unsigned long long ns2usecs(cycle_t nsec) | 170 | unsigned long long ns2usecs(cycle_t nsec) |
143 | { | 171 | { |
144 | nsec += 500; | 172 | nsec += 500; |
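The trace_options= handling added above follows the usual early-boot pattern: a __setup() hook stashes the raw string in an __initdata buffer, and a later init step splits it on commas. A sketch of that pattern under invented names (myopts=, my_boot_buf, apply_my_boot_opts), not the patch's own:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>

#define MY_BOOT_BUF_SIZE 100		/* illustrative size */

static char my_boot_buf[MY_BOOT_BUF_SIZE] __initdata;
static char *my_boot_opts __initdata;

/* Runs very early during parameter parsing; just remember the string. */
static int __init set_my_boot_opts(char *str)
{
	strlcpy(my_boot_buf, str, MY_BOOT_BUF_SIZE);	/* always NUL-terminates */
	my_boot_opts = my_boot_buf;
	return 1;
}
__setup("myopts=", set_my_boot_opts);

/* Called once the subsystem is up: apply each comma-separated token. */
static void __init apply_my_boot_opts(void)
{
	while (my_boot_opts) {
		char *opt = strsep(&my_boot_opts, ",");

		pr_info("boot option token: %s\n", opt);	/* hand off to a real parser here */
	}
}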
@@ -198,20 +226,9 @@ static struct trace_array max_tr; | |||
198 | 226 | ||
199 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); | 227 | static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); |
200 | 228 | ||
201 | /* tracer_enabled is used to toggle activation of a tracer */ | ||
202 | static int tracer_enabled = 1; | ||
203 | |||
204 | /** | ||
205 | * tracing_is_enabled - return tracer_enabled status | ||
206 | * | ||
207 | * This function is used by other tracers to know the status | ||
208 | * of the tracer_enabled flag. Tracers may use this function | ||
209 | * to know if it should enable their features when starting | ||
210 | * up. See irqsoff tracer for an example (start_irqsoff_tracer). | ||
211 | */ | ||
212 | int tracing_is_enabled(void) | 229 | int tracing_is_enabled(void) |
213 | { | 230 | { |
214 | return tracer_enabled; | 231 | return tracing_is_on(); |
215 | } | 232 | } |
216 | 233 | ||
217 | /* | 234 | /* |
@@ -333,12 +350,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
333 | static int trace_stop_count; | 350 | static int trace_stop_count; |
334 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 351 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
335 | 352 | ||
336 | static void wakeup_work_handler(struct work_struct *work) | 353 | /** |
354 | * trace_wake_up - wake up tasks waiting for trace input | ||
355 | * | ||
356 | * Wakes up (from irq_work context) any task that is blocked on the | ||
357 | * trace_wait queue. This is used with trace_poll for tasks polling the | ||
358 | * trace. | ||
359 | */ | ||
360 | static void trace_wake_up(struct irq_work *work) | ||
337 | { | 361 | { |
338 | wake_up(&trace_wait); | 362 | wake_up_all(&trace_wait); |
339 | } | ||
340 | 363 | ||
341 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | 364 | } |
342 | 365 | ||
343 | /** | 366 | /** |
344 | * tracing_on - enable tracing buffers | 367 | * tracing_on - enable tracing buffers |
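The new trace_wake_up() is an irq_work handler rather than a delayed work item, so the tracing hot path never calls wake_up() directly. A minimal sketch of the producer side of that pattern, with invented demo_* names:

#include <linux/irq_work.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static struct irq_work demo_wakeup_work;

/* Runs later from a safe IRQ context, so taking the waitqueue lock is fine. */
static void demo_do_wakeup(struct irq_work *work)
{
	wake_up_all(&demo_wait);
}

static void demo_init(void)
{
	init_irq_work(&demo_wakeup_work, demo_do_wakeup);
}

/* Hot path, possibly with locks held or IRQs off: only queue the irq_work. */
static void demo_event_committed(void)
{
	irq_work_queue(&demo_wakeup_work);
}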
@@ -393,22 +416,6 @@ int tracing_is_on(void) | |||
393 | } | 416 | } |
394 | EXPORT_SYMBOL_GPL(tracing_is_on); | 417 | EXPORT_SYMBOL_GPL(tracing_is_on); |
395 | 418 | ||
396 | /** | ||
397 | * trace_wake_up - wake up tasks waiting for trace input | ||
398 | * | ||
399 | * Schedules a delayed work to wake up any task that is blocked on the | ||
400 | * trace_wait queue. These is used with trace_poll for tasks polling the | ||
401 | * trace. | ||
402 | */ | ||
403 | void trace_wake_up(void) | ||
404 | { | ||
405 | const unsigned long delay = msecs_to_jiffies(2); | ||
406 | |||
407 | if (trace_flags & TRACE_ITER_BLOCK) | ||
408 | return; | ||
409 | schedule_delayed_work(&wakeup_work, delay); | ||
410 | } | ||
411 | |||
412 | static int __init set_buf_size(char *str) | 419 | static int __init set_buf_size(char *str) |
413 | { | 420 | { |
414 | unsigned long buf_size; | 421 | unsigned long buf_size; |
@@ -431,7 +438,7 @@ static int __init set_tracing_thresh(char *str) | |||
431 | 438 | ||
432 | if (!str) | 439 | if (!str) |
433 | return 0; | 440 | return 0; |
434 | ret = strict_strtoul(str, 0, &threshold); | 441 | ret = kstrtoul(str, 0, &threshold); |
435 | if (ret < 0) | 442 | if (ret < 0) |
436 | return 0; | 443 | return 0; |
437 | tracing_thresh = threshold * 1000; | 444 | tracing_thresh = threshold * 1000; |
@@ -477,10 +484,12 @@ static const char *trace_options[] = { | |||
477 | static struct { | 484 | static struct { |
478 | u64 (*func)(void); | 485 | u64 (*func)(void); |
479 | const char *name; | 486 | const char *name; |
487 | int in_ns; /* is this clock in nanoseconds? */ | ||
480 | } trace_clocks[] = { | 488 | } trace_clocks[] = { |
481 | { trace_clock_local, "local" }, | 489 | { trace_clock_local, "local", 1 }, |
482 | { trace_clock_global, "global" }, | 490 | { trace_clock_global, "global", 1 }, |
483 | { trace_clock_counter, "counter" }, | 491 | { trace_clock_counter, "counter", 0 }, |
492 | ARCH_TRACE_CLOCKS | ||
484 | }; | 493 | }; |
485 | 494 | ||
486 | int trace_clock_id; | 495 | int trace_clock_id; |
@@ -757,6 +766,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
757 | } | 766 | } |
758 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 767 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
759 | 768 | ||
769 | static void default_wait_pipe(struct trace_iterator *iter) | ||
770 | { | ||
771 | DEFINE_WAIT(wait); | ||
772 | |||
773 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
774 | |||
775 | /* | ||
776 | * The events can happen in critical sections where | ||
777 | * checking a work queue can cause deadlocks. | ||
778 | * After adding a task to the queue, this flag is set | ||
779 | * only to notify events to try to wake up the queue | ||
780 | * using irq_work. | ||
781 | * | ||
782 | * We don't clear it even if the buffer is no longer | ||
783 | * empty. The flag only causes the next event to run | ||
784 | * irq_work to do the work queue wake up. The worse | ||
785 | * that can happen if we race with !trace_empty() is that | ||
786 | * an event will cause an irq_work to try to wake up | ||
787 | * an empty queue. | ||
788 | * | ||
789 | * There's no reason to protect this flag either, as | ||
790 | * the work queue and irq_work logic will do the necessary | ||
791 | * synchronization for the wake ups. The only thing | ||
792 | * that is necessary is that the wake up happens after | ||
793 | * a task has been queued. It's OK for spurious wake ups. | ||
794 | */ | ||
795 | trace_wakeup_needed = true; | ||
796 | |||
797 | if (trace_empty(iter)) | ||
798 | schedule(); | ||
799 | |||
800 | finish_wait(&trace_wait, &wait); | ||
801 | } | ||
802 | |||
760 | /** | 803 | /** |
761 | * register_tracer - register a tracer with the ftrace system. | 804 | * register_tracer - register a tracer with the ftrace system. |
762 | * @type - the plugin for the tracer | 805 | * @type - the plugin for the tracer |
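default_wait_pipe() above is the consumer half of that handshake. A companion sketch, again with an invented demo_wait queue mirroring the producer sketch earlier: register on the queue first, re-check the condition, then sleep, so a wakeup racing with the check is not lost.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

static void demo_wait_for_data(bool (*have_data)(void))
{
	DEFINE_WAIT(wait);

	/* Queue ourselves before testing, so a concurrent wakeup sees us. */
	prepare_to_wait(&demo_wait, &wait, TASK_INTERRUPTIBLE);

	if (!have_data())
		schedule();

	finish_wait(&demo_wait, &wait);
}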
@@ -875,32 +918,6 @@ int register_tracer(struct tracer *type) | |||
875 | return ret; | 918 | return ret; |
876 | } | 919 | } |
877 | 920 | ||
878 | void unregister_tracer(struct tracer *type) | ||
879 | { | ||
880 | struct tracer **t; | ||
881 | |||
882 | mutex_lock(&trace_types_lock); | ||
883 | for (t = &trace_types; *t; t = &(*t)->next) { | ||
884 | if (*t == type) | ||
885 | goto found; | ||
886 | } | ||
887 | pr_info("Tracer %s not registered\n", type->name); | ||
888 | goto out; | ||
889 | |||
890 | found: | ||
891 | *t = (*t)->next; | ||
892 | |||
893 | if (type == current_trace && tracer_enabled) { | ||
894 | tracer_enabled = 0; | ||
895 | tracing_stop(); | ||
896 | if (current_trace->stop) | ||
897 | current_trace->stop(&global_trace); | ||
898 | current_trace = &nop_trace; | ||
899 | } | ||
900 | out: | ||
901 | mutex_unlock(&trace_types_lock); | ||
902 | } | ||
903 | |||
904 | void tracing_reset(struct trace_array *tr, int cpu) | 921 | void tracing_reset(struct trace_array *tr, int cpu) |
905 | { | 922 | { |
906 | struct ring_buffer *buffer = tr->buffer; | 923 | struct ring_buffer *buffer = tr->buffer; |
@@ -1131,10 +1148,14 @@ void trace_find_cmdline(int pid, char comm[]) | |||
1131 | 1148 | ||
1132 | void tracing_record_cmdline(struct task_struct *tsk) | 1149 | void tracing_record_cmdline(struct task_struct *tsk) |
1133 | { | 1150 | { |
1134 | if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || | 1151 | if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) |
1135 | !tracing_is_on()) | ||
1136 | return; | 1152 | return; |
1137 | 1153 | ||
1154 | if (!__this_cpu_read(trace_cmdline_save)) | ||
1155 | return; | ||
1156 | |||
1157 | __this_cpu_write(trace_cmdline_save, false); | ||
1158 | |||
1138 | trace_save_cmdline(tsk); | 1159 | trace_save_cmdline(tsk); |
1139 | } | 1160 | } |
1140 | 1161 | ||
@@ -1178,27 +1199,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
1178 | return event; | 1199 | return event; |
1179 | } | 1200 | } |
1180 | 1201 | ||
1202 | void | ||
1203 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | ||
1204 | { | ||
1205 | __this_cpu_write(trace_cmdline_save, true); | ||
1206 | if (trace_wakeup_needed) { | ||
1207 | trace_wakeup_needed = false; | ||
1208 | /* irq_work_queue() supplies its own memory barriers */ | ||
1209 | irq_work_queue(&trace_work_wakeup); | ||
1210 | } | ||
1211 | ring_buffer_unlock_commit(buffer, event); | ||
1212 | } | ||
1213 | |||
1181 | static inline void | 1214 | static inline void |
1182 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1215 | __trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1183 | struct ring_buffer_event *event, | 1216 | struct ring_buffer_event *event, |
1184 | unsigned long flags, int pc, | 1217 | unsigned long flags, int pc) |
1185 | int wake) | ||
1186 | { | 1218 | { |
1187 | ring_buffer_unlock_commit(buffer, event); | 1219 | __buffer_unlock_commit(buffer, event); |
1188 | 1220 | ||
1189 | ftrace_trace_stack(buffer, flags, 6, pc); | 1221 | ftrace_trace_stack(buffer, flags, 6, pc); |
1190 | ftrace_trace_userstack(buffer, flags, pc); | 1222 | ftrace_trace_userstack(buffer, flags, pc); |
1191 | |||
1192 | if (wake) | ||
1193 | trace_wake_up(); | ||
1194 | } | 1223 | } |
1195 | 1224 | ||
1196 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | 1225 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, |
1197 | struct ring_buffer_event *event, | 1226 | struct ring_buffer_event *event, |
1198 | unsigned long flags, int pc) | 1227 | unsigned long flags, int pc) |
1199 | { | 1228 | { |
1200 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1229 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1201 | } | 1230 | } |
1231 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); | ||
1202 | 1232 | ||
1203 | struct ring_buffer_event * | 1233 | struct ring_buffer_event * |
1204 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, | 1234 | trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, |
@@ -1215,29 +1245,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, | |||
1215 | struct ring_buffer_event *event, | 1245 | struct ring_buffer_event *event, |
1216 | unsigned long flags, int pc) | 1246 | unsigned long flags, int pc) |
1217 | { | 1247 | { |
1218 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); | 1248 | __trace_buffer_unlock_commit(buffer, event, flags, pc); |
1219 | } | 1249 | } |
1220 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); | 1250 | EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); |
1221 | 1251 | ||
1222 | void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, | 1252 | void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, |
1223 | struct ring_buffer_event *event, | 1253 | struct ring_buffer_event *event, |
1224 | unsigned long flags, int pc) | 1254 | unsigned long flags, int pc, |
1255 | struct pt_regs *regs) | ||
1225 | { | 1256 | { |
1226 | __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); | 1257 | __buffer_unlock_commit(buffer, event); |
1227 | } | ||
1228 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); | ||
1229 | |||
1230 | void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, | ||
1231 | struct ring_buffer_event *event, | ||
1232 | unsigned long flags, int pc, | ||
1233 | struct pt_regs *regs) | ||
1234 | { | ||
1235 | ring_buffer_unlock_commit(buffer, event); | ||
1236 | 1258 | ||
1237 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); | 1259 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); |
1238 | ftrace_trace_userstack(buffer, flags, pc); | 1260 | ftrace_trace_userstack(buffer, flags, pc); |
1239 | } | 1261 | } |
1240 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); | 1262 | EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); |
1241 | 1263 | ||
1242 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, | 1264 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, |
1243 | struct ring_buffer_event *event) | 1265 | struct ring_buffer_event *event) |
@@ -1269,7 +1291,7 @@ trace_function(struct trace_array *tr, | |||
1269 | entry->parent_ip = parent_ip; | 1291 | entry->parent_ip = parent_ip; |
1270 | 1292 | ||
1271 | if (!filter_check_discard(call, entry, buffer, event)) | 1293 | if (!filter_check_discard(call, entry, buffer, event)) |
1272 | ring_buffer_unlock_commit(buffer, event); | 1294 | __buffer_unlock_commit(buffer, event); |
1273 | } | 1295 | } |
1274 | 1296 | ||
1275 | void | 1297 | void |
@@ -1362,7 +1384,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1362 | entry->size = trace.nr_entries; | 1384 | entry->size = trace.nr_entries; |
1363 | 1385 | ||
1364 | if (!filter_check_discard(call, entry, buffer, event)) | 1386 | if (!filter_check_discard(call, entry, buffer, event)) |
1365 | ring_buffer_unlock_commit(buffer, event); | 1387 | __buffer_unlock_commit(buffer, event); |
1366 | 1388 | ||
1367 | out: | 1389 | out: |
1368 | /* Again, don't let gcc optimize things here */ | 1390 | /* Again, don't let gcc optimize things here */ |
@@ -1458,7 +1480,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1458 | 1480 | ||
1459 | save_stack_trace_user(&trace); | 1481 | save_stack_trace_user(&trace); |
1460 | if (!filter_check_discard(call, entry, buffer, event)) | 1482 | if (!filter_check_discard(call, entry, buffer, event)) |
1461 | ring_buffer_unlock_commit(buffer, event); | 1483 | __buffer_unlock_commit(buffer, event); |
1462 | 1484 | ||
1463 | out_drop_count: | 1485 | out_drop_count: |
1464 | __this_cpu_dec(user_stack_count); | 1486 | __this_cpu_dec(user_stack_count); |
@@ -1559,10 +1581,10 @@ static int alloc_percpu_trace_buffer(void) | |||
1559 | return -ENOMEM; | 1581 | return -ENOMEM; |
1560 | } | 1582 | } |
1561 | 1583 | ||
1584 | static int buffers_allocated; | ||
1585 | |||
1562 | void trace_printk_init_buffers(void) | 1586 | void trace_printk_init_buffers(void) |
1563 | { | 1587 | { |
1564 | static int buffers_allocated; | ||
1565 | |||
1566 | if (buffers_allocated) | 1588 | if (buffers_allocated) |
1567 | return; | 1589 | return; |
1568 | 1590 | ||
@@ -1571,7 +1593,38 @@ void trace_printk_init_buffers(void) | |||
1571 | 1593 | ||
1572 | pr_info("ftrace: Allocated trace_printk buffers\n"); | 1594 | pr_info("ftrace: Allocated trace_printk buffers\n"); |
1573 | 1595 | ||
1596 | /* Expand the buffers to set size */ | ||
1597 | tracing_update_buffers(); | ||
1598 | |||
1574 | buffers_allocated = 1; | 1599 | buffers_allocated = 1; |
1600 | |||
1601 | /* | ||
1602 | * trace_printk_init_buffers() can be called by modules. | ||
1603 | * If that happens, then we need to start cmdline recording | ||
1604 | * directly here. If the global_trace.buffer is already | ||
1605 | * allocated here, then this was called by module code. | ||
1606 | */ | ||
1607 | if (global_trace.buffer) | ||
1608 | tracing_start_cmdline_record(); | ||
1609 | } | ||
1610 | |||
1611 | void trace_printk_start_comm(void) | ||
1612 | { | ||
1613 | /* Start tracing comms if trace printk is set */ | ||
1614 | if (!buffers_allocated) | ||
1615 | return; | ||
1616 | tracing_start_cmdline_record(); | ||
1617 | } | ||
1618 | |||
1619 | static void trace_printk_start_stop_comm(int enabled) | ||
1620 | { | ||
1621 | if (!buffers_allocated) | ||
1622 | return; | ||
1623 | |||
1624 | if (enabled) | ||
1625 | tracing_start_cmdline_record(); | ||
1626 | else | ||
1627 | tracing_stop_cmdline_record(); | ||
1575 | } | 1628 | } |
1576 | 1629 | ||
1577 | /** | 1630 | /** |
@@ -1622,7 +1675,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1622 | 1675 | ||
1623 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); | 1676 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1624 | if (!filter_check_discard(call, entry, buffer, event)) { | 1677 | if (!filter_check_discard(call, entry, buffer, event)) { |
1625 | ring_buffer_unlock_commit(buffer, event); | 1678 | __buffer_unlock_commit(buffer, event); |
1626 | ftrace_trace_stack(buffer, flags, 6, pc); | 1679 | ftrace_trace_stack(buffer, flags, 6, pc); |
1627 | } | 1680 | } |
1628 | 1681 | ||
@@ -1693,7 +1746,7 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1693 | memcpy(&entry->buf, tbuffer, len); | 1746 | memcpy(&entry->buf, tbuffer, len); |
1694 | entry->buf[len] = '\0'; | 1747 | entry->buf[len] = '\0'; |
1695 | if (!filter_check_discard(call, entry, buffer, event)) { | 1748 | if (!filter_check_discard(call, entry, buffer, event)) { |
1696 | ring_buffer_unlock_commit(buffer, event); | 1749 | __buffer_unlock_commit(buffer, event); |
1697 | ftrace_trace_stack(buffer, flags, 6, pc); | 1750 | ftrace_trace_stack(buffer, flags, 6, pc); |
1698 | } | 1751 | } |
1699 | out: | 1752 | out: |
@@ -2426,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2426 | if (ring_buffer_overruns(iter->tr->buffer)) | 2479 | if (ring_buffer_overruns(iter->tr->buffer)) |
2427 | iter->iter_flags |= TRACE_FILE_ANNOTATE; | 2480 | iter->iter_flags |= TRACE_FILE_ANNOTATE; |
2428 | 2481 | ||
2482 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
2483 | if (trace_clocks[trace_clock_id].in_ns) | ||
2484 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
2485 | |||
2429 | /* stop the trace while dumping */ | 2486 | /* stop the trace while dumping */ |
2430 | tracing_stop(); | 2487 | tracing_stop(); |
2431 | 2488 | ||
@@ -2794,26 +2851,19 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2794 | 2851 | ||
2795 | if (mask == TRACE_ITER_OVERWRITE) | 2852 | if (mask == TRACE_ITER_OVERWRITE) |
2796 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | 2853 | ring_buffer_change_overwrite(global_trace.buffer, enabled); |
2854 | |||
2855 | if (mask == TRACE_ITER_PRINTK) | ||
2856 | trace_printk_start_stop_comm(enabled); | ||
2797 | } | 2857 | } |
2798 | 2858 | ||
2799 | static ssize_t | 2859 | static int trace_set_options(char *option) |
2800 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2801 | size_t cnt, loff_t *ppos) | ||
2802 | { | 2860 | { |
2803 | char buf[64]; | ||
2804 | char *cmp; | 2861 | char *cmp; |
2805 | int neg = 0; | 2862 | int neg = 0; |
2806 | int ret; | 2863 | int ret = 0; |
2807 | int i; | 2864 | int i; |
2808 | 2865 | ||
2809 | if (cnt >= sizeof(buf)) | 2866 | cmp = strstrip(option); |
2810 | return -EINVAL; | ||
2811 | |||
2812 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2813 | return -EFAULT; | ||
2814 | |||
2815 | buf[cnt] = 0; | ||
2816 | cmp = strstrip(buf); | ||
2817 | 2867 | ||
2818 | if (strncmp(cmp, "no", 2) == 0) { | 2868 | if (strncmp(cmp, "no", 2) == 0) { |
2819 | neg = 1; | 2869 | neg = 1; |
@@ -2832,10 +2882,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
2832 | mutex_lock(&trace_types_lock); | 2882 | mutex_lock(&trace_types_lock); |
2833 | ret = set_tracer_option(current_trace, cmp, neg); | 2883 | ret = set_tracer_option(current_trace, cmp, neg); |
2834 | mutex_unlock(&trace_types_lock); | 2884 | mutex_unlock(&trace_types_lock); |
2835 | if (ret) | ||
2836 | return ret; | ||
2837 | } | 2885 | } |
2838 | 2886 | ||
2887 | return ret; | ||
2888 | } | ||
2889 | |||
2890 | static ssize_t | ||
2891 | tracing_trace_options_write(struct file *filp, const char __user *ubuf, | ||
2892 | size_t cnt, loff_t *ppos) | ||
2893 | { | ||
2894 | char buf[64]; | ||
2895 | |||
2896 | if (cnt >= sizeof(buf)) | ||
2897 | return -EINVAL; | ||
2898 | |||
2899 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2900 | return -EFAULT; | ||
2901 | |||
2902 | trace_set_options(buf); | ||
2903 | |||
2839 | *ppos += cnt; | 2904 | *ppos += cnt; |
2840 | 2905 | ||
2841 | return cnt; | 2906 | return cnt; |
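trace_set_options()/tracing_trace_options_write() above split the work into a small copy-from-user shim and a reusable parser. A hedged sketch of the same shape, with a hypothetical demo_option_write() that bounds the copy, strips whitespace, and detects the "no" prefix:

#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/errno.h>

static ssize_t demo_option_write(const char __user *ubuf, size_t cnt,
				 char *name, size_t namelen, int *neg)
{
	char buf[64];
	char *cmp;

	if (cnt >= sizeof(buf))
		return -EINVAL;
	if (copy_from_user(buf, ubuf, cnt))
		return -EFAULT;
	buf[cnt] = '\0';

	cmp = strstrip(buf);		/* drop leading/trailing whitespace */
	*neg = 0;
	if (strncmp(cmp, "no", 2) == 0) {
		*neg = 1;
		cmp += 2;
	}
	strlcpy(name, cmp, namelen);

	return cnt;
}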
@@ -2940,56 +3005,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = { | |||
2940 | }; | 3005 | }; |
2941 | 3006 | ||
2942 | static ssize_t | 3007 | static ssize_t |
2943 | tracing_ctrl_read(struct file *filp, char __user *ubuf, | ||
2944 | size_t cnt, loff_t *ppos) | ||
2945 | { | ||
2946 | char buf[64]; | ||
2947 | int r; | ||
2948 | |||
2949 | r = sprintf(buf, "%u\n", tracer_enabled); | ||
2950 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
2951 | } | ||
2952 | |||
2953 | static ssize_t | ||
2954 | tracing_ctrl_write(struct file *filp, const char __user *ubuf, | ||
2955 | size_t cnt, loff_t *ppos) | ||
2956 | { | ||
2957 | struct trace_array *tr = filp->private_data; | ||
2958 | unsigned long val; | ||
2959 | int ret; | ||
2960 | |||
2961 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
2962 | if (ret) | ||
2963 | return ret; | ||
2964 | |||
2965 | val = !!val; | ||
2966 | |||
2967 | mutex_lock(&trace_types_lock); | ||
2968 | if (tracer_enabled ^ val) { | ||
2969 | |||
2970 | /* Only need to warn if this is used to change the state */ | ||
2971 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2972 | |||
2973 | if (val) { | ||
2974 | tracer_enabled = 1; | ||
2975 | if (current_trace->start) | ||
2976 | current_trace->start(tr); | ||
2977 | tracing_start(); | ||
2978 | } else { | ||
2979 | tracer_enabled = 0; | ||
2980 | tracing_stop(); | ||
2981 | if (current_trace->stop) | ||
2982 | current_trace->stop(tr); | ||
2983 | } | ||
2984 | } | ||
2985 | mutex_unlock(&trace_types_lock); | ||
2986 | |||
2987 | *ppos += cnt; | ||
2988 | |||
2989 | return cnt; | ||
2990 | } | ||
2991 | |||
2992 | static ssize_t | ||
2993 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 3008 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
2994 | size_t cnt, loff_t *ppos) | 3009 | size_t cnt, loff_t *ppos) |
2995 | { | 3010 | { |
@@ -3030,6 +3045,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | |||
3030 | */ | 3045 | */ |
3031 | ring_buffer_expanded = 1; | 3046 | ring_buffer_expanded = 1; |
3032 | 3047 | ||
3048 | /* May be called before buffers are initialized */ | ||
3049 | if (!global_trace.buffer) | ||
3050 | return 0; | ||
3051 | |||
3033 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); | 3052 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
3034 | if (ret < 0) | 3053 | if (ret < 0) |
3035 | return ret; | 3054 | return ret; |
@@ -3325,6 +3344,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3325 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | 3344 | if (trace_flags & TRACE_ITER_LATENCY_FMT) |
3326 | iter->iter_flags |= TRACE_FILE_LAT_FMT; | 3345 | iter->iter_flags |= TRACE_FILE_LAT_FMT; |
3327 | 3346 | ||
3347 | /* Output in nanoseconds only if we are using a clock in nanoseconds. */ | ||
3348 | if (trace_clocks[trace_clock_id].in_ns) | ||
3349 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | ||
3350 | |||
3328 | iter->cpu_file = cpu_file; | 3351 | iter->cpu_file = cpu_file; |
3329 | iter->tr = &global_trace; | 3352 | iter->tr = &global_trace; |
3330 | mutex_init(&iter->mutex); | 3353 | mutex_init(&iter->mutex); |
@@ -3385,19 +3408,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
3385 | } | 3408 | } |
3386 | } | 3409 | } |
3387 | 3410 | ||
3388 | |||
3389 | void default_wait_pipe(struct trace_iterator *iter) | ||
3390 | { | ||
3391 | DEFINE_WAIT(wait); | ||
3392 | |||
3393 | prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); | ||
3394 | |||
3395 | if (trace_empty(iter)) | ||
3396 | schedule(); | ||
3397 | |||
3398 | finish_wait(&trace_wait, &wait); | ||
3399 | } | ||
3400 | |||
3401 | /* | 3411 | /* |
3402 | * This is a make-shift waitqueue. | 3412 | * This is a make-shift waitqueue. |
3403 | * A tracer might use this callback on some rare cases: | 3413 | * A tracer might use this callback on some rare cases: |
@@ -3438,7 +3448,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3438 | return -EINTR; | 3448 | return -EINTR; |
3439 | 3449 | ||
3440 | /* | 3450 | /* |
3441 | * We block until we read something and tracing is disabled. | 3451 | * We block until we read something and tracing is enabled. |
3442 | * We still block if tracing is disabled, but we have never | 3452 | * We still block if tracing is disabled, but we have never |
3443 | * read anything. This allows a user to cat this file, and | 3453 | * read anything. This allows a user to cat this file, and |
3444 | * then enable tracing. But after we have read something, | 3454 | * then enable tracing. But after we have read something, |
@@ -3446,7 +3456,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3446 | * | 3456 | * |
3447 | * iter->pos will be 0 if we haven't read anything. | 3457 | * iter->pos will be 0 if we haven't read anything. |
3448 | */ | 3458 | */ |
3449 | if (!tracer_enabled && iter->pos) | 3459 | if (tracing_is_enabled() && iter->pos) |
3450 | break; | 3460 | break; |
3451 | } | 3461 | } |
3452 | 3462 | ||
@@ -3955,7 +3965,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3955 | } else | 3965 | } else |
3956 | entry->buf[cnt] = '\0'; | 3966 | entry->buf[cnt] = '\0'; |
3957 | 3967 | ||
3958 | ring_buffer_unlock_commit(buffer, event); | 3968 | __buffer_unlock_commit(buffer, event); |
3959 | 3969 | ||
3960 | written = cnt; | 3970 | written = cnt; |
3961 | 3971 | ||
@@ -4016,6 +4026,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4016 | if (max_tr.buffer) | 4026 | if (max_tr.buffer) |
4017 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); | 4027 | ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); |
4018 | 4028 | ||
4029 | /* | ||
4030 | * New clock may not be consistent with the previous clock. | ||
4031 | * Reset the buffer so that it doesn't have incomparable timestamps. | ||
4032 | */ | ||
4033 | tracing_reset_online_cpus(&global_trace); | ||
4034 | if (max_tr.buffer) | ||
4035 | tracing_reset_online_cpus(&max_tr); | ||
4036 | |||
4019 | mutex_unlock(&trace_types_lock); | 4037 | mutex_unlock(&trace_types_lock); |
4020 | 4038 | ||
4021 | *fpos += cnt; | 4039 | *fpos += cnt; |
@@ -4037,13 +4055,6 @@ static const struct file_operations tracing_max_lat_fops = { | |||
4037 | .llseek = generic_file_llseek, | 4055 | .llseek = generic_file_llseek, |
4038 | }; | 4056 | }; |
4039 | 4057 | ||
4040 | static const struct file_operations tracing_ctrl_fops = { | ||
4041 | .open = tracing_open_generic, | ||
4042 | .read = tracing_ctrl_read, | ||
4043 | .write = tracing_ctrl_write, | ||
4044 | .llseek = generic_file_llseek, | ||
4045 | }; | ||
4046 | |||
4047 | static const struct file_operations set_tracer_fops = { | 4058 | static const struct file_operations set_tracer_fops = { |
4048 | .open = tracing_open_generic, | 4059 | .open = tracing_open_generic, |
4049 | .read = tracing_set_trace_read, | 4060 | .read = tracing_set_trace_read, |
@@ -4377,13 +4388,27 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4377 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | 4388 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); |
4378 | trace_seq_printf(s, "bytes: %ld\n", cnt); | 4389 | trace_seq_printf(s, "bytes: %ld\n", cnt); |
4379 | 4390 | ||
4380 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | 4391 | if (trace_clocks[trace_clock_id].in_ns) { |
4381 | usec_rem = do_div(t, USEC_PER_SEC); | 4392 | /* local or global for trace_clock */ |
4382 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); | 4393 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); |
4394 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4395 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", | ||
4396 | t, usec_rem); | ||
4397 | |||
4398 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4399 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4400 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4401 | } else { | ||
4402 | /* counter or tsc mode for trace_clock */ | ||
4403 | trace_seq_printf(s, "oldest event ts: %llu\n", | ||
4404 | ring_buffer_oldest_event_ts(tr->buffer, cpu)); | ||
4405 | |||
4406 | trace_seq_printf(s, "now ts: %llu\n", | ||
4407 | ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4408 | } | ||
4383 | 4409 | ||
4384 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | 4410 | cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); |
4385 | usec_rem = do_div(t, USEC_PER_SEC); | 4411 | trace_seq_printf(s, "dropped events: %ld\n", cnt); |
4386 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4387 | 4412 | ||
4388 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4413 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
4389 | 4414 | ||
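The stats file above only formats "sec.usec" when the selected clock is in nanoseconds; counter/tsc clocks print the raw value. A sketch of the nanosecond branch's arithmetic, mirroring ns2usecs() plus do_div(); demo_print_ts() is a hypothetical helper:

#include <linux/seq_file.h>
#include <linux/time.h>
#include <asm/div64.h>

static void demo_print_ts(struct seq_file *m, u64 ts_ns)
{
	u64 t = ts_ns + 500;		/* round to the nearest microsecond */
	unsigned long usec_rem;

	do_div(t, 1000);			/* nanoseconds -> microseconds */
	usec_rem = do_div(t, USEC_PER_SEC);	/* t becomes seconds, remainder is usecs */

	seq_printf(m, "ts: %5llu.%06lu\n", t, usec_rem);
}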
@@ -4815,9 +4840,6 @@ static __init int tracer_init_debugfs(void) | |||
4815 | 4840 | ||
4816 | d_tracer = tracing_init_dentry(); | 4841 | d_tracer = tracing_init_dentry(); |
4817 | 4842 | ||
4818 | trace_create_file("tracing_enabled", 0644, d_tracer, | ||
4819 | &global_trace, &tracing_ctrl_fops); | ||
4820 | |||
4821 | trace_create_file("trace_options", 0644, d_tracer, | 4843 | trace_create_file("trace_options", 0644, d_tracer, |
4822 | NULL, &tracing_iter_fops); | 4844 | NULL, &tracing_iter_fops); |
4823 | 4845 | ||
@@ -5089,6 +5111,7 @@ __init static int tracer_alloc_buffers(void) | |||
5089 | 5111 | ||
5090 | /* Only allocate trace_printk buffers if a trace_printk exists */ | 5112 | /* Only allocate trace_printk buffers if a trace_printk exists */ |
5091 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | 5113 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) |
5114 | /* Must be called before global_trace.buffer is allocated */ | ||
5092 | trace_printk_init_buffers(); | 5115 | trace_printk_init_buffers(); |
5093 | 5116 | ||
5094 | /* To save memory, keep the ring buffer size to its minimum */ | 5117 | /* To save memory, keep the ring buffer size to its minimum */ |
@@ -5136,6 +5159,7 @@ __init static int tracer_alloc_buffers(void) | |||
5136 | #endif | 5159 | #endif |
5137 | 5160 | ||
5138 | trace_init_cmdlines(); | 5161 | trace_init_cmdlines(); |
5162 | init_irq_work(&trace_work_wakeup, trace_wake_up); | ||
5139 | 5163 | ||
5140 | register_tracer(&nop_trace); | 5164 | register_tracer(&nop_trace); |
5141 | current_trace = &nop_trace; | 5165 | current_trace = &nop_trace; |
@@ -5147,6 +5171,13 @@ __init static int tracer_alloc_buffers(void) | |||
5147 | 5171 | ||
5148 | register_die_notifier(&trace_die_notifier); | 5172 | register_die_notifier(&trace_die_notifier); |
5149 | 5173 | ||
5174 | while (trace_boot_options) { | ||
5175 | char *option; | ||
5176 | |||
5177 | option = strsep(&trace_boot_options, ","); | ||
5178 | trace_set_options(option); | ||
5179 | } | ||
5180 | |||
5150 | return 0; | 5181 | return 0; |
5151 | 5182 | ||
5152 | out_free_cpumask: | 5183 | out_free_cpumask: |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c15f528c1af4..c75d7988902c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -285,8 +285,8 @@ struct tracer { | |||
285 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 285 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
286 | struct tracer *next; | 286 | struct tracer *next; |
287 | struct tracer_flags *flags; | 287 | struct tracer_flags *flags; |
288 | int print_max; | 288 | bool print_max; |
289 | int use_max_tr; | 289 | bool use_max_tr; |
290 | }; | 290 | }; |
291 | 291 | ||
292 | 292 | ||
@@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) | |||
327 | 327 | ||
328 | int tracer_init(struct tracer *t, struct trace_array *tr); | 328 | int tracer_init(struct tracer *t, struct trace_array *tr); |
329 | int tracing_is_enabled(void); | 329 | int tracing_is_enabled(void); |
330 | void trace_wake_up(void); | ||
331 | void tracing_reset(struct trace_array *tr, int cpu); | 330 | void tracing_reset(struct trace_array *tr, int cpu); |
332 | void tracing_reset_online_cpus(struct trace_array *tr); | 331 | void tracing_reset_online_cpus(struct trace_array *tr); |
333 | void tracing_reset_current(int cpu); | 332 | void tracing_reset_current(int cpu); |
@@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
349 | unsigned long len, | 348 | unsigned long len, |
350 | unsigned long flags, | 349 | unsigned long flags, |
351 | int pc); | 350 | int pc); |
352 | void trace_buffer_unlock_commit(struct ring_buffer *buffer, | ||
353 | struct ring_buffer_event *event, | ||
354 | unsigned long flags, int pc); | ||
355 | 351 | ||
356 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | 352 | struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, |
357 | struct trace_array_cpu *data); | 353 | struct trace_array_cpu *data); |
@@ -359,6 +355,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | |||
359 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | 355 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, |
360 | int *ent_cpu, u64 *ent_ts); | 356 | int *ent_cpu, u64 *ent_ts); |
361 | 357 | ||
358 | void __buffer_unlock_commit(struct ring_buffer *buffer, | ||
359 | struct ring_buffer_event *event); | ||
360 | |||
362 | int trace_empty(struct trace_iterator *iter); | 361 | int trace_empty(struct trace_iterator *iter); |
363 | 362 | ||
364 | void *trace_find_next_entry_inc(struct trace_iterator *iter); | 363 | void *trace_find_next_entry_inc(struct trace_iterator *iter); |
@@ -367,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
367 | 366 | ||
368 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 367 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
369 | 368 | ||
370 | void default_wait_pipe(struct trace_iterator *iter); | ||
371 | void poll_wait_pipe(struct trace_iterator *iter); | 369 | void poll_wait_pipe(struct trace_iterator *iter); |
372 | 370 | ||
373 | void ftrace(struct trace_array *tr, | 371 | void ftrace(struct trace_array *tr, |
@@ -407,12 +405,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); | |||
407 | void tracing_stop_sched_switch_record(void); | 405 | void tracing_stop_sched_switch_record(void); |
408 | void tracing_start_sched_switch_record(void); | 406 | void tracing_start_sched_switch_record(void); |
409 | int register_tracer(struct tracer *type); | 407 | int register_tracer(struct tracer *type); |
410 | void unregister_tracer(struct tracer *type); | ||
411 | int is_tracing_stopped(void); | 408 | int is_tracing_stopped(void); |
412 | enum trace_file_type { | ||
413 | TRACE_FILE_LAT_FMT = 1, | ||
414 | TRACE_FILE_ANNOTATE = 2, | ||
415 | }; | ||
416 | 409 | ||
417 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | 410 | extern cpumask_var_t __read_mostly tracing_buffer_mask; |
418 | 411 | ||
@@ -841,6 +834,7 @@ extern const char *__start___trace_bprintk_fmt[]; | |||
841 | extern const char *__stop___trace_bprintk_fmt[]; | 834 | extern const char *__stop___trace_bprintk_fmt[]; |
842 | 835 | ||
843 | void trace_printk_init_buffers(void); | 836 | void trace_printk_init_buffers(void); |
837 | void trace_printk_start_comm(void); | ||
844 | 838 | ||
845 | #undef FTRACE_ENTRY | 839 | #undef FTRACE_ENTRY |
846 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 840 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5f..95e96842ed29 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) | |||
77 | entry->correct = val == expect; | 77 | entry->correct = val == expect; |
78 | 78 | ||
79 | if (!filter_check_discard(call, entry, buffer, event)) | 79 | if (!filter_check_discard(call, entry, buffer, event)) |
80 | ring_buffer_unlock_commit(buffer, event); | 80 | __buffer_unlock_commit(buffer, event); |
81 | 81 | ||
82 | out: | 82 | out: |
83 | atomic_dec(&tr->data[cpu]->disabled); | 83 | atomic_dec(&tr->data[cpu]->disabled); |
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void) | |||
199 | } | 199 | } |
200 | return register_tracer(&branch_trace); | 200 | return register_tracer(&branch_trace); |
201 | } | 201 | } |
202 | device_initcall(init_branch_tracer); | 202 | core_initcall(init_branch_tracer); |
203 | 203 | ||
204 | #else | 204 | #else |
205 | static inline | 205 | static inline |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d608d09d08c0..880073d0b946 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p) | |||
491 | mutex_unlock(&event_mutex); | 491 | mutex_unlock(&event_mutex); |
492 | } | 492 | } |
493 | 493 | ||
494 | static int | ||
495 | ftrace_event_seq_open(struct inode *inode, struct file *file) | ||
496 | { | ||
497 | const struct seq_operations *seq_ops; | ||
498 | |||
499 | if ((file->f_mode & FMODE_WRITE) && | ||
500 | (file->f_flags & O_TRUNC)) | ||
501 | ftrace_clear_events(); | ||
502 | |||
503 | seq_ops = inode->i_private; | ||
504 | return seq_open(file, seq_ops); | ||
505 | } | ||
506 | |||
507 | static ssize_t | 494 | static ssize_t |
508 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 495 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
509 | loff_t *ppos) | 496 | loff_t *ppos) |
@@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
980 | return r; | 967 | return r; |
981 | } | 968 | } |
982 | 969 | ||
970 | static int ftrace_event_avail_open(struct inode *inode, struct file *file); | ||
971 | static int ftrace_event_set_open(struct inode *inode, struct file *file); | ||
972 | |||
983 | static const struct seq_operations show_event_seq_ops = { | 973 | static const struct seq_operations show_event_seq_ops = { |
984 | .start = t_start, | 974 | .start = t_start, |
985 | .next = t_next, | 975 | .next = t_next, |
@@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = { | |||
995 | }; | 985 | }; |
996 | 986 | ||
997 | static const struct file_operations ftrace_avail_fops = { | 987 | static const struct file_operations ftrace_avail_fops = { |
998 | .open = ftrace_event_seq_open, | 988 | .open = ftrace_event_avail_open, |
999 | .read = seq_read, | 989 | .read = seq_read, |
1000 | .llseek = seq_lseek, | 990 | .llseek = seq_lseek, |
1001 | .release = seq_release, | 991 | .release = seq_release, |
1002 | }; | 992 | }; |
1003 | 993 | ||
1004 | static const struct file_operations ftrace_set_event_fops = { | 994 | static const struct file_operations ftrace_set_event_fops = { |
1005 | .open = ftrace_event_seq_open, | 995 | .open = ftrace_event_set_open, |
1006 | .read = seq_read, | 996 | .read = seq_read, |
1007 | .write = ftrace_event_write, | 997 | .write = ftrace_event_write, |
1008 | .llseek = seq_lseek, | 998 | .llseek = seq_lseek, |
@@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void) | |||
1078 | return d_events; | 1068 | return d_events; |
1079 | } | 1069 | } |
1080 | 1070 | ||
1071 | static int | ||
1072 | ftrace_event_avail_open(struct inode *inode, struct file *file) | ||
1073 | { | ||
1074 | const struct seq_operations *seq_ops = &show_event_seq_ops; | ||
1075 | |||
1076 | return seq_open(file, seq_ops); | ||
1077 | } | ||
1078 | |||
1079 | static int | ||
1080 | ftrace_event_set_open(struct inode *inode, struct file *file) | ||
1081 | { | ||
1082 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; | ||
1083 | |||
1084 | if ((file->f_mode & FMODE_WRITE) && | ||
1085 | (file->f_flags & O_TRUNC)) | ||
1086 | ftrace_clear_events(); | ||
1087 | |||
1088 | return seq_open(file, seq_ops); | ||
1089 | } | ||
1090 | |||
1081 | static struct dentry * | 1091 | static struct dentry * |
1082 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1092 | event_subsystem_dir(const char *name, struct dentry *d_events) |
1083 | { | 1093 | { |
@@ -1489,6 +1499,9 @@ static __init int event_trace_enable(void) | |||
1489 | if (ret) | 1499 | if (ret) |
1490 | pr_warn("Failed to enable trace event: %s\n", token); | 1500 | pr_warn("Failed to enable trace event: %s\n", token); |
1491 | } | 1501 | } |
1502 | |||
1503 | trace_printk_start_comm(); | ||
1504 | |||
1492 | return 0; | 1505 | return 0; |
1493 | } | 1506 | } |
1494 | 1507 | ||
@@ -1505,15 +1518,13 @@ static __init int event_trace_init(void) | |||
1505 | return 0; | 1518 | return 0; |
1506 | 1519 | ||
1507 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 1520 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
1508 | (void *)&show_event_seq_ops, | 1521 | NULL, &ftrace_avail_fops); |
1509 | &ftrace_avail_fops); | ||
1510 | if (!entry) | 1522 | if (!entry) |
1511 | pr_warning("Could not create debugfs " | 1523 | pr_warning("Could not create debugfs " |
1512 | "'available_events' entry\n"); | 1524 | "'available_events' entry\n"); |
1513 | 1525 | ||
1514 | entry = debugfs_create_file("set_event", 0644, d_tracer, | 1526 | entry = debugfs_create_file("set_event", 0644, d_tracer, |
1515 | (void *)&show_set_event_seq_ops, | 1527 | NULL, &ftrace_set_event_fops); |
1516 | &ftrace_set_event_fops); | ||
1517 | if (!entry) | 1528 | if (!entry) |
1518 | pr_warning("Could not create debugfs " | 1529 | pr_warning("Could not create debugfs " |
1519 | "'set_event' entry\n"); | 1530 | "'set_event' entry\n"); |
@@ -1749,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, | |||
1749 | entry->ip = ip; | 1760 | entry->ip = ip; |
1750 | entry->parent_ip = parent_ip; | 1761 | entry->parent_ip = parent_ip; |
1751 | 1762 | ||
1752 | trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); | 1763 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
1753 | 1764 | ||
1754 | out: | 1765 | out: |
1755 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1766 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
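The available_events/set_event change replaces one generic open (which read its seq_operations from inode->i_private) with a dedicated open per file. A self-contained sketch of that shape with invented demo_* names; the trivial iterator just emits one line, and a writable file could also clear state in its open on O_TRUNC, as set_event does above.

#include <linux/seq_file.h>
#include <linux/fs.h>

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return *pos == 0 ? SEQ_START_TOKEN : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "demo\n");
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

/* Each file gets its own open, bound to its own seq_operations. */
static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
	.open	 = demo_open,
	.read	 = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};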
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c154797a7ff7..e5b0ca8b8d4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps, | |||
1000 | } | 1000 | } |
1001 | } else { | 1001 | } else { |
1002 | if (field->is_signed) | 1002 | if (field->is_signed) |
1003 | ret = strict_strtoll(pred->regex.pattern, 0, &val); | 1003 | ret = kstrtoll(pred->regex.pattern, 0, &val); |
1004 | else | 1004 | else |
1005 | ret = strict_strtoull(pred->regex.pattern, 0, &val); | 1005 | ret = kstrtoull(pred->regex.pattern, 0, &val); |
1006 | if (ret) { | 1006 | if (ret) { |
1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); | 1007 | parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); |
1008 | return -EINVAL; | 1008 | return -EINVAL; |
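The strict_strto*() calls are converted to the kstrto*() family here and elsewhere in the series; both return 0 on success and a negative errno on malformed input or overflow, which is why the call sites only check "if (ret)". A small sketch with a hypothetical demo_parse_number():

#include <linux/kernel.h>

static int demo_parse_number(const char *str, bool is_signed, u64 *out)
{
	int ret;

	if (is_signed) {
		s64 sval;

		/* base 0: accepts decimal, 0x... hex and 0... octal */
		ret = kstrtoll(str, 0, &sval);
		if (ret)
			return ret;	/* -EINVAL or -ERANGE */
		*out = (u64)sval;
		return 0;
	}

	return kstrtoull(str, 0, out);
}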
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 507a7a9630bf..8e3ad8082ab7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/ring_buffer.h> | 12 | #include <linux/ring_buffer.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -366,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
366 | * We use the callback data field (which is a pointer) | 366 | * We use the callback data field (which is a pointer) |
367 | * as our counter. | 367 | * as our counter. |
368 | */ | 368 | */ |
369 | ret = strict_strtoul(number, 0, (unsigned long *)&count); | 369 | ret = kstrtoul(number, 0, (unsigned long *)&count); |
370 | if (ret) | 370 | if (ret) |
371 | return ret; | 371 | return ret; |
372 | 372 | ||
@@ -411,5 +411,4 @@ static __init int init_function_trace(void) | |||
411 | init_func_cmd_traceon(); | 411 | init_func_cmd_traceon(); |
412 | return register_tracer(&function_trace); | 412 | return register_tracer(&function_trace); |
413 | } | 413 | } |
414 | device_initcall(init_function_trace); | 414 | core_initcall(init_function_trace); |
415 | |||
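trace_functions.c, like the other tracers touched below, also moves its registration from device_initcall() to core_initcall(), so the tracer is set up much earlier in boot: initcall levels run in the order core, postcore, arch, subsys, fs, device, late. A tiny, hypothetical illustration of the registration shape:

#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical early setup; the real tracers call register_tracer() here. */
static int __init my_early_setup(void)
{
	pr_info("registered at core_initcall time\n");
	return 0;
}
/* core_initcall (level 1) runs well before device_initcall (level 6),
 * so the facility exists by the time later init code wants it. */
core_initcall(my_early_setup);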
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 99b4378393d5..4edb4b74eb7e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
223 | entry = ring_buffer_event_data(event); | 223 | entry = ring_buffer_event_data(event); |
224 | entry->graph_ent = *trace; | 224 | entry->graph_ent = *trace; |
225 | if (!filter_current_check_discard(buffer, call, entry, event)) | 225 | if (!filter_current_check_discard(buffer, call, entry, event)) |
226 | ring_buffer_unlock_commit(buffer, event); | 226 | __buffer_unlock_commit(buffer, event); |
227 | 227 | ||
228 | return 1; | 228 | return 1; |
229 | } | 229 | } |
@@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr, | |||
327 | entry = ring_buffer_event_data(event); | 327 | entry = ring_buffer_event_data(event); |
328 | entry->ret = *trace; | 328 | entry->ret = *trace; |
329 | if (!filter_current_check_discard(buffer, call, entry, event)) | 329 | if (!filter_current_check_discard(buffer, call, entry, event)) |
330 | ring_buffer_unlock_commit(buffer, event); | 330 | __buffer_unlock_commit(buffer, event); |
331 | } | 331 | } |
332 | 332 | ||
333 | void trace_graph_return(struct ftrace_graph_ret *trace) | 333 | void trace_graph_return(struct ftrace_graph_ret *trace) |
@@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void) | |||
1474 | return register_tracer(&graph_trace); | 1474 | return register_tracer(&graph_trace); |
1475 | } | 1475 | } |
1476 | 1476 | ||
1477 | device_initcall(init_graph_trace); | 1477 | core_initcall(init_graph_trace); |
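The graph-tracer hunks commit events through a trace-local __buffer_unlock_commit() helper instead of calling ring_buffer_unlock_commit() directly. Independent of that helper, the general reserve/fill/commit discipline against the generic ring-buffer API looks roughly like the sketch below; the event layout and the -EBUSY return policy are assumptions for illustration only.

#include <linux/errno.h>
#include <linux/ring_buffer.h>

/* Hypothetical event payload; the real tracers define theirs via the
 * ftrace event macros. */
struct my_event {
	unsigned long ip;
	unsigned long parent_ip;
};

/* Reserve space, fill the payload, commit - the discipline the graph
 * tracer hunks above follow (minus filtering and the local helper). */
static int my_record(struct ring_buffer *buffer,
		     unsigned long ip, unsigned long parent_ip)
{
	struct ring_buffer_event *event;
	struct my_event *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return -EBUSY;	/* buffer full or recording disabled */

	entry = ring_buffer_event_data(event);
	entry->ip = ip;
	entry->parent_ip = parent_ip;

	return ring_buffer_unlock_commit(buffer, event);
}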
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d98ee8283b29..713a2cac4881 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * From code in the latency_tracer, that is: | 7 | * From code in the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/kallsyms.h> | 12 | #include <linux/kallsyms.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -604,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
604 | .reset = irqsoff_tracer_reset, | 604 | .reset = irqsoff_tracer_reset, |
605 | .start = irqsoff_tracer_start, | 605 | .start = irqsoff_tracer_start, |
606 | .stop = irqsoff_tracer_stop, | 606 | .stop = irqsoff_tracer_stop, |
607 | .print_max = 1, | 607 | .print_max = true, |
608 | .print_header = irqsoff_print_header, | 608 | .print_header = irqsoff_print_header, |
609 | .print_line = irqsoff_print_line, | 609 | .print_line = irqsoff_print_line, |
610 | .flags = &tracer_flags, | 610 | .flags = &tracer_flags, |
@@ -614,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
614 | #endif | 614 | #endif |
615 | .open = irqsoff_trace_open, | 615 | .open = irqsoff_trace_open, |
616 | .close = irqsoff_trace_close, | 616 | .close = irqsoff_trace_close, |
617 | .use_max_tr = 1, | 617 | .use_max_tr = true, |
618 | }; | 618 | }; |
619 | # define register_irqsoff(trace) register_tracer(&trace) | 619 | # define register_irqsoff(trace) register_tracer(&trace) |
620 | #else | 620 | #else |
@@ -637,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
637 | .reset = irqsoff_tracer_reset, | 637 | .reset = irqsoff_tracer_reset, |
638 | .start = irqsoff_tracer_start, | 638 | .start = irqsoff_tracer_start, |
639 | .stop = irqsoff_tracer_stop, | 639 | .stop = irqsoff_tracer_stop, |
640 | .print_max = 1, | 640 | .print_max = true, |
641 | .print_header = irqsoff_print_header, | 641 | .print_header = irqsoff_print_header, |
642 | .print_line = irqsoff_print_line, | 642 | .print_line = irqsoff_print_line, |
643 | .flags = &tracer_flags, | 643 | .flags = &tracer_flags, |
@@ -647,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
647 | #endif | 647 | #endif |
648 | .open = irqsoff_trace_open, | 648 | .open = irqsoff_trace_open, |
649 | .close = irqsoff_trace_close, | 649 | .close = irqsoff_trace_close, |
650 | .use_max_tr = 1, | 650 | .use_max_tr = true, |
651 | }; | 651 | }; |
652 | # define register_preemptoff(trace) register_tracer(&trace) | 652 | # define register_preemptoff(trace) register_tracer(&trace) |
653 | #else | 653 | #else |
@@ -672,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
672 | .reset = irqsoff_tracer_reset, | 672 | .reset = irqsoff_tracer_reset, |
673 | .start = irqsoff_tracer_start, | 673 | .start = irqsoff_tracer_start, |
674 | .stop = irqsoff_tracer_stop, | 674 | .stop = irqsoff_tracer_stop, |
675 | .print_max = 1, | 675 | .print_max = true, |
676 | .print_header = irqsoff_print_header, | 676 | .print_header = irqsoff_print_header, |
677 | .print_line = irqsoff_print_line, | 677 | .print_line = irqsoff_print_line, |
678 | .flags = &tracer_flags, | 678 | .flags = &tracer_flags, |
@@ -682,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
682 | #endif | 682 | #endif |
683 | .open = irqsoff_trace_open, | 683 | .open = irqsoff_trace_open, |
684 | .close = irqsoff_trace_close, | 684 | .close = irqsoff_trace_close, |
685 | .use_max_tr = 1, | 685 | .use_max_tr = true, |
686 | }; | 686 | }; |
687 | 687 | ||
688 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 688 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
@@ -698,4 +698,4 @@ __init static int init_irqsoff_tracer(void) | |||
698 | 698 | ||
699 | return 0; | 699 | return 0; |
700 | } | 700 | } |
701 | device_initcall(init_irqsoff_tracer); | 701 | core_initcall(init_irqsoff_tracer); |
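The irqsoff, preemptoff and preemptirqsoff hunks flip .print_max and .use_max_tr from 1 to true, which reads naturally once those struct tracer members take bool values. A trivial stand-in with a hypothetical flags struct:

#include <linux/types.h>

/* Hypothetical stand-in; after this series the real struct tracer members
 * are assigned true/false rather than bare integers. */
struct my_tracer_flags {
	bool print_max;
	bool use_max_tr;
};

static const struct my_tracer_flags irqsoff_flags = {
	.print_max  = true,	/* reads as intent rather than a magic 1 */
	.use_max_tr = true,
};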
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a2117043bb1..1865d5f76538 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv) | |||
444 | return -EINVAL; | 444 | return -EINVAL; |
445 | } | 445 | } |
446 | /* an address specified */ | 446 | /* an address specified */ |
447 | ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); | 447 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); |
448 | if (ret) { | 448 | if (ret) { |
449 | pr_info("Failed to parse address.\n"); | 449 | pr_info("Failed to parse address.\n"); |
450 | return ret; | 450 | return ret; |
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 751 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
752 | 752 | ||
753 | if (!filter_current_check_discard(buffer, call, entry, event)) | 753 | if (!filter_current_check_discard(buffer, call, entry, event)) |
754 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 754 | trace_buffer_unlock_commit_regs(buffer, event, |
755 | irq_flags, pc, regs); | 755 | irq_flags, pc, regs); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Kretprobe handler */ | 758 | /* Kretprobe handler */ |
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 784 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
785 | 785 | ||
786 | if (!filter_current_check_discard(buffer, call, entry, event)) | 786 | if (!filter_current_check_discard(buffer, call, entry, event)) |
787 | trace_nowake_buffer_unlock_commit_regs(buffer, event, | 787 | trace_buffer_unlock_commit_regs(buffer, event, |
788 | irq_flags, pc, regs); | 788 | irq_flags, pc, regs); |
789 | } | 789 | } |
790 | 790 | ||
791 | /* Event entry printers */ | 791 | /* Event entry printers */ |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 123b189c732c..194d79602dc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) | |||
610 | return trace_print_lat_fmt(s, entry); | 610 | return trace_print_lat_fmt(s, entry); |
611 | } | 611 | } |
612 | 612 | ||
613 | static unsigned long preempt_mark_thresh = 100; | 613 | static unsigned long preempt_mark_thresh_us = 100; |
614 | 614 | ||
615 | static int | 615 | static int |
616 | lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, | 616 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) |
617 | unsigned long rel_usecs) | ||
618 | { | 617 | { |
619 | return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, | 618 | unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; |
620 | rel_usecs > preempt_mark_thresh ? '!' : | 619 | unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; |
621 | rel_usecs > 1 ? '+' : ' '); | 620 | unsigned long long abs_ts = iter->ts - iter->tr->time_start; |
621 | unsigned long long rel_ts = next_ts - iter->ts; | ||
622 | struct trace_seq *s = &iter->seq; | ||
623 | |||
624 | if (in_ns) { | ||
625 | abs_ts = ns2usecs(abs_ts); | ||
626 | rel_ts = ns2usecs(rel_ts); | ||
627 | } | ||
628 | |||
629 | if (verbose && in_ns) { | ||
630 | unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); | ||
631 | unsigned long abs_msec = (unsigned long)abs_ts; | ||
632 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); | ||
633 | unsigned long rel_msec = (unsigned long)rel_ts; | ||
634 | |||
635 | return trace_seq_printf( | ||
636 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", | ||
637 | ns2usecs(iter->ts), | ||
638 | abs_msec, abs_usec, | ||
639 | rel_msec, rel_usec); | ||
640 | } else if (verbose && !in_ns) { | ||
641 | return trace_seq_printf( | ||
642 | s, "[%016llx] %lld (+%lld): ", | ||
643 | iter->ts, abs_ts, rel_ts); | ||
644 | } else if (!verbose && in_ns) { | ||
645 | return trace_seq_printf( | ||
646 | s, " %4lldus%c: ", | ||
647 | abs_ts, | ||
648 | rel_ts > preempt_mark_thresh_us ? '!' : | ||
649 | rel_ts > 1 ? '+' : ' '); | ||
650 | } else { /* !verbose && !in_ns */ | ||
651 | return trace_seq_printf(s, " %4lld: ", abs_ts); | ||
652 | } | ||
622 | } | 653 | } |
623 | 654 | ||
624 | int trace_print_context(struct trace_iterator *iter) | 655 | int trace_print_context(struct trace_iterator *iter) |
625 | { | 656 | { |
626 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
627 | struct trace_entry *entry = iter->ent; | 658 | struct trace_entry *entry = iter->ent; |
628 | unsigned long long t = ns2usecs(iter->ts); | 659 | unsigned long long t; |
629 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 660 | unsigned long secs, usec_rem; |
630 | unsigned long secs = (unsigned long)t; | ||
631 | char comm[TASK_COMM_LEN]; | 661 | char comm[TASK_COMM_LEN]; |
632 | int ret; | 662 | int ret; |
633 | 663 | ||
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter) | |||
644 | return 0; | 674 | return 0; |
645 | } | 675 | } |
646 | 676 | ||
647 | return trace_seq_printf(s, " %5lu.%06lu: ", | 677 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { |
648 | secs, usec_rem); | 678 | t = ns2usecs(iter->ts); |
679 | usec_rem = do_div(t, USEC_PER_SEC); | ||
680 | secs = (unsigned long)t; | ||
681 | return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); | ||
682 | } else | ||
683 | return trace_seq_printf(s, " %12llu: ", iter->ts); | ||
649 | } | 684 | } |
650 | 685 | ||
651 | int trace_print_lat_context(struct trace_iterator *iter) | 686 | int trace_print_lat_context(struct trace_iterator *iter) |
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
659 | *next_entry = trace_find_next_entry(iter, NULL, | 694 | *next_entry = trace_find_next_entry(iter, NULL, |
660 | &next_ts); | 695 | &next_ts); |
661 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); | 696 | unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); |
662 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); | ||
663 | unsigned long rel_usecs; | ||
664 | 697 | ||
665 | /* Restore the original ent_size */ | 698 | /* Restore the original ent_size */ |
666 | iter->ent_size = ent_size; | 699 | iter->ent_size = ent_size; |
667 | 700 | ||
668 | if (!next_entry) | 701 | if (!next_entry) |
669 | next_ts = iter->ts; | 702 | next_ts = iter->ts; |
670 | rel_usecs = ns2usecs(next_ts - iter->ts); | ||
671 | 703 | ||
672 | if (verbose) { | 704 | if (verbose) { |
673 | char comm[TASK_COMM_LEN]; | 705 | char comm[TASK_COMM_LEN]; |
674 | 706 | ||
675 | trace_find_cmdline(entry->pid, comm); | 707 | trace_find_cmdline(entry->pid, comm); |
676 | 708 | ||
677 | ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" | 709 | ret = trace_seq_printf( |
678 | " %ld.%03ldms (+%ld.%03ldms): ", comm, | 710 | s, "%16s %5d %3d %d %08x %08lx ", |
679 | entry->pid, iter->cpu, entry->flags, | 711 | comm, entry->pid, iter->cpu, entry->flags, |
680 | entry->preempt_count, iter->idx, | 712 | entry->preempt_count, iter->idx); |
681 | ns2usecs(iter->ts), | ||
682 | abs_usecs / USEC_PER_MSEC, | ||
683 | abs_usecs % USEC_PER_MSEC, | ||
684 | rel_usecs / USEC_PER_MSEC, | ||
685 | rel_usecs % USEC_PER_MSEC); | ||
686 | } else { | 713 | } else { |
687 | ret = lat_print_generic(s, entry, iter->cpu); | 714 | ret = lat_print_generic(s, entry, iter->cpu); |
688 | if (ret) | ||
689 | ret = lat_print_timestamp(s, abs_usecs, rel_usecs); | ||
690 | } | 715 | } |
691 | 716 | ||
717 | if (ret) | ||
718 | ret = lat_print_timestamp(iter, next_ts); | ||
719 | |||
692 | return ret; | 720 | return ret; |
693 | } | 721 | } |
694 | 722 | ||
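The trace_output.c changes split timestamp printing by whether the trace clock counts in nanoseconds (TRACE_FILE_TIME_IN_NS) and keep the non-verbose latency format of absolute microseconds plus a '!' or '+' delay marker. A hedged sketch of that non-verbose case, writing into a plain buffer instead of a trace_seq and using an assumed threshold constant:

#include <linux/kernel.h>
#include <linux/time.h>
#include <asm/div64.h>

#define MY_MARK_THRESH_US 100	/* assumed, mirrors preempt_mark_thresh_us */

/* Hypothetical formatter for the non-verbose, nanosecond-clock case:
 * absolute microseconds plus a '!' / '+' marker for large deltas. */
static int my_format_lat_stamp(char *buf, size_t len, u64 abs_ns, u64 rel_ns)
{
	u64 abs_us = abs_ns;
	u64 rel_us = rel_ns;

	/* do_div() divides in place and returns the remainder. */
	do_div(abs_us, NSEC_PER_USEC);
	do_div(rel_us, NSEC_PER_USEC);

	return scnprintf(buf, len, " %4lluus%c: ", abs_us,
			 rel_us > MY_MARK_THRESH_US ? '!' :
			 rel_us > 1 ? '+' : ' ');
}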
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daa9980153af..412e959709b4 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
441 | goto fail; | 441 | goto fail; |
442 | 442 | ||
443 | type++; | 443 | type++; |
444 | if (strict_strtoul(type, 0, &bs)) | 444 | if (kstrtoul(type, 0, &bs)) |
445 | goto fail; | 445 | goto fail; |
446 | 446 | ||
447 | switch (bs) { | 447 | switch (bs) { |
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | |||
501 | 501 | ||
502 | tmp = strchr(symbol, '+'); | 502 | tmp = strchr(symbol, '+'); |
503 | if (tmp) { | 503 | if (tmp) { |
504 | /* skip sign because strict_strtol doesn't accept '+' */ | 504 | /* skip sign because kstrtoul doesn't accept '+' */ |
505 | ret = strict_strtoul(tmp + 1, 0, offset); | 505 | ret = kstrtoul(tmp + 1, 0, offset); |
506 | if (ret) | 506 | if (ret) |
507 | return ret; | 507 | return ret; |
508 | 508 | ||
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
533 | else | 533 | else |
534 | ret = -EINVAL; | 534 | ret = -EINVAL; |
535 | } else if (isdigit(arg[5])) { | 535 | } else if (isdigit(arg[5])) { |
536 | ret = strict_strtoul(arg + 5, 10, ¶m); | 536 | ret = kstrtoul(arg + 5, 10, ¶m); |
537 | if (ret || param > PARAM_MAX_STACK) | 537 | if (ret || param > PARAM_MAX_STACK) |
538 | ret = -EINVAL; | 538 | ret = -EINVAL; |
539 | else { | 539 | else { |
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
579 | 579 | ||
580 | case '@': /* memory or symbol */ | 580 | case '@': /* memory or symbol */ |
581 | if (isdigit(arg[1])) { | 581 | if (isdigit(arg[1])) { |
582 | ret = strict_strtoul(arg + 1, 0, ¶m); | 582 | ret = kstrtoul(arg + 1, 0, ¶m); |
583 | if (ret) | 583 | if (ret) |
584 | break; | 584 | break; |
585 | 585 | ||
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
597 | break; | 597 | break; |
598 | 598 | ||
599 | case '+': /* deref memory */ | 599 | case '+': /* deref memory */ |
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | 600 | arg++; /* Skip '+', because kstrtol() rejects it. */ |
601 | case '-': | 601 | case '-': |
602 | tmp = strchr(arg, '('); | 602 | tmp = strchr(arg, '('); |
603 | if (!tmp) | 603 | if (!tmp) |
604 | break; | 604 | break; |
605 | 605 | ||
606 | *tmp = '\0'; | 606 | *tmp = '\0'; |
607 | ret = strict_strtol(arg, 0, &offset); | 607 | ret = kstrtol(arg, 0, &offset); |
608 | 608 | ||
609 | if (ret) | 609 | if (ret) |
610 | break; | 610 | break; |
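The trace_probe.c hunks only rename the parser calls (strict_strtoul/strict_strtol become kstrtoul/kstrtol); the surrounding logic, such as skipping a leading '+' before handing the digits to the parser, is unchanged. Roughly the shape of the symbol+offset split, as a simplified, hypothetical helper:

#include <linux/kernel.h>
#include <linux/string.h>

/* Simplified, hypothetical version of the "symbol+offset" split above. */
static int my_split_symbol_offset(char *symbol, unsigned long *offset)
{
	char *tmp;
	int ret;

	*offset = 0;

	tmp = strchr(symbol, '+');
	if (!tmp)
		return 0;		/* plain symbol, no offset */

	/* Skip the '+' itself before handing the digits to kstrtoul(). */
	ret = kstrtoul(tmp + 1, 0, offset);
	if (ret)
		return ret;

	*tmp = '\0';			/* terminate the symbol name */
	return 0;
}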
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a18456..3374c792ccd8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
102 | entry->next_cpu = task_cpu(wakee); | 102 | entry->next_cpu = task_cpu(wakee); |
103 | 103 | ||
104 | if (!filter_check_discard(call, entry, buffer, event)) | 104 | if (!filter_check_discard(call, entry, buffer, event)) |
105 | ring_buffer_unlock_commit(buffer, event); | 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); |
106 | ftrace_trace_stack(tr->buffer, flags, 6, pc); | ||
107 | ftrace_trace_userstack(tr->buffer, flags, pc); | ||
108 | } | 106 | } |
109 | 107 | ||
110 | static void | 108 | static void |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 02170c00c413..9fe45fcefca0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Based on code from the latency_tracer, that is: | 7 | * Based on code from the latency_tracer, that is: |
8 | * | 8 | * |
9 | * Copyright (C) 2004-2006 Ingo Molnar | 9 | * Copyright (C) 2004-2006 Ingo Molnar |
10 | * Copyright (C) 2004 William Lee Irwin III | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
@@ -589,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
589 | .reset = wakeup_tracer_reset, | 589 | .reset = wakeup_tracer_reset, |
590 | .start = wakeup_tracer_start, | 590 | .start = wakeup_tracer_start, |
591 | .stop = wakeup_tracer_stop, | 591 | .stop = wakeup_tracer_stop, |
592 | .print_max = 1, | 592 | .print_max = true, |
593 | .print_header = wakeup_print_header, | 593 | .print_header = wakeup_print_header, |
594 | .print_line = wakeup_print_line, | 594 | .print_line = wakeup_print_line, |
595 | .flags = &tracer_flags, | 595 | .flags = &tracer_flags, |
@@ -599,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
599 | #endif | 599 | #endif |
600 | .open = wakeup_trace_open, | 600 | .open = wakeup_trace_open, |
601 | .close = wakeup_trace_close, | 601 | .close = wakeup_trace_close, |
602 | .use_max_tr = 1, | 602 | .use_max_tr = true, |
603 | }; | 603 | }; |
604 | 604 | ||
605 | static struct tracer wakeup_rt_tracer __read_mostly = | 605 | static struct tracer wakeup_rt_tracer __read_mostly = |
@@ -610,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
610 | .start = wakeup_tracer_start, | 610 | .start = wakeup_tracer_start, |
611 | .stop = wakeup_tracer_stop, | 611 | .stop = wakeup_tracer_stop, |
612 | .wait_pipe = poll_wait_pipe, | 612 | .wait_pipe = poll_wait_pipe, |
613 | .print_max = 1, | 613 | .print_max = true, |
614 | .print_header = wakeup_print_header, | 614 | .print_header = wakeup_print_header, |
615 | .print_line = wakeup_print_line, | 615 | .print_line = wakeup_print_line, |
616 | .flags = &tracer_flags, | 616 | .flags = &tracer_flags, |
@@ -620,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
620 | #endif | 620 | #endif |
621 | .open = wakeup_trace_open, | 621 | .open = wakeup_trace_open, |
622 | .close = wakeup_trace_close, | 622 | .close = wakeup_trace_close, |
623 | .use_max_tr = 1, | 623 | .use_max_tr = true, |
624 | }; | 624 | }; |
625 | 625 | ||
626 | __init static int init_wakeup_tracer(void) | 626 | __init static int init_wakeup_tracer(void) |
@@ -637,4 +637,4 @@ __init static int init_wakeup_tracer(void) | |||
637 | 637 | ||
638 | return 0; | 638 | return 0; |
639 | } | 639 | } |
640 | device_initcall(init_wakeup_tracer); | 640 | core_initcall(init_wakeup_tracer); |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2c00a691a540..47623169a815 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
320 | int (*func)(void)) | 320 | int (*func)(void)) |
321 | { | 321 | { |
322 | int save_ftrace_enabled = ftrace_enabled; | 322 | int save_ftrace_enabled = ftrace_enabled; |
323 | int save_tracer_enabled = tracer_enabled; | ||
324 | unsigned long count; | 323 | unsigned long count; |
325 | char *func_name; | 324 | char *func_name; |
326 | int ret; | 325 | int ret; |
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
331 | 330 | ||
332 | /* enable tracing, and record the filter function */ | 331 | /* enable tracing, and record the filter function */ |
333 | ftrace_enabled = 1; | 332 | ftrace_enabled = 1; |
334 | tracer_enabled = 1; | ||
335 | 333 | ||
336 | /* passed in by parameter to fool gcc from optimizing */ | 334 | /* passed in by parameter to fool gcc from optimizing */ |
337 | func(); | 335 | func(); |
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
395 | 393 | ||
396 | out: | 394 | out: |
397 | ftrace_enabled = save_ftrace_enabled; | 395 | ftrace_enabled = save_ftrace_enabled; |
398 | tracer_enabled = save_tracer_enabled; | ||
399 | 396 | ||
400 | /* Enable tracing on all functions again */ | 397 | /* Enable tracing on all functions again */ |
401 | ftrace_set_global_filter(NULL, 0, 1); | 398 | ftrace_set_global_filter(NULL, 0, 1); |
@@ -452,7 +449,6 @@ static int | |||
452 | trace_selftest_function_recursion(void) | 449 | trace_selftest_function_recursion(void) |
453 | { | 450 | { |
454 | int save_ftrace_enabled = ftrace_enabled; | 451 | int save_ftrace_enabled = ftrace_enabled; |
455 | int save_tracer_enabled = tracer_enabled; | ||
456 | char *func_name; | 452 | char *func_name; |
457 | int len; | 453 | int len; |
458 | int ret; | 454 | int ret; |
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void) | |||
465 | 461 | ||
466 | /* enable tracing, and record the filter function */ | 462 | /* enable tracing, and record the filter function */ |
467 | ftrace_enabled = 1; | 463 | ftrace_enabled = 1; |
468 | tracer_enabled = 1; | ||
469 | 464 | ||
470 | /* Handle PPC64 '.' name */ | 465 | /* Handle PPC64 '.' name */ |
471 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 466 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
@@ -534,7 +529,6 @@ trace_selftest_function_recursion(void) | |||
534 | ret = 0; | 529 | ret = 0; |
535 | out: | 530 | out: |
536 | ftrace_enabled = save_ftrace_enabled; | 531 | ftrace_enabled = save_ftrace_enabled; |
537 | tracer_enabled = save_tracer_enabled; | ||
538 | 532 | ||
539 | return ret; | 533 | return ret; |
540 | } | 534 | } |
@@ -569,7 +563,6 @@ static int | |||
569 | trace_selftest_function_regs(void) | 563 | trace_selftest_function_regs(void) |
570 | { | 564 | { |
571 | int save_ftrace_enabled = ftrace_enabled; | 565 | int save_ftrace_enabled = ftrace_enabled; |
572 | int save_tracer_enabled = tracer_enabled; | ||
573 | char *func_name; | 566 | char *func_name; |
574 | int len; | 567 | int len; |
575 | int ret; | 568 | int ret; |
@@ -586,7 +579,6 @@ trace_selftest_function_regs(void) | |||
586 | 579 | ||
587 | /* enable tracing, and record the filter function */ | 580 | /* enable tracing, and record the filter function */ |
588 | ftrace_enabled = 1; | 581 | ftrace_enabled = 1; |
589 | tracer_enabled = 1; | ||
590 | 582 | ||
591 | /* Handle PPC64 '.' name */ | 583 | /* Handle PPC64 '.' name */ |
592 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 584 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
@@ -648,7 +640,6 @@ trace_selftest_function_regs(void) | |||
648 | ret = 0; | 640 | ret = 0; |
649 | out: | 641 | out: |
650 | ftrace_enabled = save_ftrace_enabled; | 642 | ftrace_enabled = save_ftrace_enabled; |
651 | tracer_enabled = save_tracer_enabled; | ||
652 | 643 | ||
653 | return ret; | 644 | return ret; |
654 | } | 645 | } |
@@ -662,7 +653,6 @@ int | |||
662 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | 653 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) |
663 | { | 654 | { |
664 | int save_ftrace_enabled = ftrace_enabled; | 655 | int save_ftrace_enabled = ftrace_enabled; |
665 | int save_tracer_enabled = tracer_enabled; | ||
666 | unsigned long count; | 656 | unsigned long count; |
667 | int ret; | 657 | int ret; |
668 | 658 | ||
@@ -671,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
671 | 661 | ||
672 | /* start the tracing */ | 662 | /* start the tracing */ |
673 | ftrace_enabled = 1; | 663 | ftrace_enabled = 1; |
674 | tracer_enabled = 1; | ||
675 | 664 | ||
676 | ret = tracer_init(trace, tr); | 665 | ret = tracer_init(trace, tr); |
677 | if (ret) { | 666 | if (ret) { |
@@ -708,7 +697,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
708 | ret = trace_selftest_function_regs(); | 697 | ret = trace_selftest_function_regs(); |
709 | out: | 698 | out: |
710 | ftrace_enabled = save_ftrace_enabled; | 699 | ftrace_enabled = save_ftrace_enabled; |
711 | tracer_enabled = save_tracer_enabled; | ||
712 | 700 | ||
713 | /* kill ftrace totally if we failed */ | 701 | /* kill ftrace totally if we failed */ |
714 | if (ret) | 702 | if (ret) |
@@ -1106,6 +1094,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1106 | tracing_stop(); | 1094 | tracing_stop(); |
1107 | /* check both trace buffers */ | 1095 | /* check both trace buffers */ |
1108 | ret = trace_test_buffer(tr, NULL); | 1096 | ret = trace_test_buffer(tr, NULL); |
1097 | printk("ret = %d\n", ret); | ||
1109 | if (!ret) | 1098 | if (!ret) |
1110 | ret = trace_test_buffer(&max_tr, &count); | 1099 | ret = trace_test_buffer(&max_tr, &count); |
1111 | 1100 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 2485a7d09b11..7609dd6714c2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event, | |||
21 | static int syscall_exit_register(struct ftrace_event_call *event, | 21 | static int syscall_exit_register(struct ftrace_event_call *event, |
22 | enum trace_reg type, void *data); | 22 | enum trace_reg type, void *data); |
23 | 23 | ||
24 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | ||
25 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | ||
26 | |||
27 | static struct list_head * | 24 | static struct list_head * |
28 | syscall_get_enter_fields(struct ftrace_event_call *call) | 25 | syscall_get_enter_fields(struct ftrace_event_call *call) |
29 | { | 26 | { |
@@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
32 | return &entry->enter_fields; | 29 | return &entry->enter_fields; |
33 | } | 30 | } |
34 | 31 | ||
35 | struct trace_event_functions enter_syscall_print_funcs = { | ||
36 | .trace = print_syscall_enter, | ||
37 | }; | ||
38 | |||
39 | struct trace_event_functions exit_syscall_print_funcs = { | ||
40 | .trace = print_syscall_exit, | ||
41 | }; | ||
42 | |||
43 | struct ftrace_event_class event_class_syscall_enter = { | ||
44 | .system = "syscalls", | ||
45 | .reg = syscall_enter_register, | ||
46 | .define_fields = syscall_enter_define_fields, | ||
47 | .get_fields = syscall_get_enter_fields, | ||
48 | .raw_init = init_syscall_trace, | ||
49 | }; | ||
50 | |||
51 | struct ftrace_event_class event_class_syscall_exit = { | ||
52 | .system = "syscalls", | ||
53 | .reg = syscall_exit_register, | ||
54 | .define_fields = syscall_exit_define_fields, | ||
55 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
56 | .raw_init = init_syscall_trace, | ||
57 | }; | ||
58 | |||
59 | extern struct syscall_metadata *__start_syscalls_metadata[]; | 32 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
60 | extern struct syscall_metadata *__stop_syscalls_metadata[]; | 33 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
61 | 34 | ||
@@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
432 | mutex_unlock(&syscall_trace_lock); | 405 | mutex_unlock(&syscall_trace_lock); |
433 | } | 406 | } |
434 | 407 | ||
435 | int init_syscall_trace(struct ftrace_event_call *call) | 408 | static int init_syscall_trace(struct ftrace_event_call *call) |
436 | { | 409 | { |
437 | int id; | 410 | int id; |
438 | int num; | 411 | int num; |
@@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
457 | return id; | 430 | return id; |
458 | } | 431 | } |
459 | 432 | ||
433 | struct trace_event_functions enter_syscall_print_funcs = { | ||
434 | .trace = print_syscall_enter, | ||
435 | }; | ||
436 | |||
437 | struct trace_event_functions exit_syscall_print_funcs = { | ||
438 | .trace = print_syscall_exit, | ||
439 | }; | ||
440 | |||
441 | struct ftrace_event_class event_class_syscall_enter = { | ||
442 | .system = "syscalls", | ||
443 | .reg = syscall_enter_register, | ||
444 | .define_fields = syscall_enter_define_fields, | ||
445 | .get_fields = syscall_get_enter_fields, | ||
446 | .raw_init = init_syscall_trace, | ||
447 | }; | ||
448 | |||
449 | struct ftrace_event_class event_class_syscall_exit = { | ||
450 | .system = "syscalls", | ||
451 | .reg = syscall_exit_register, | ||
452 | .define_fields = syscall_exit_define_fields, | ||
453 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), | ||
454 | .raw_init = init_syscall_trace, | ||
455 | }; | ||
456 | |||
460 | unsigned long __init __weak arch_syscall_addr(int nr) | 457 | unsigned long __init __weak arch_syscall_addr(int nr) |
461 | { | 458 | { |
462 | return (unsigned long)sys_call_table[nr]; | 459 | return (unsigned long)sys_call_table[nr]; |
@@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
537 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 534 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
538 | } | 535 | } |
539 | 536 | ||
540 | int perf_sysenter_enable(struct ftrace_event_call *call) | 537 | static int perf_sysenter_enable(struct ftrace_event_call *call) |
541 | { | 538 | { |
542 | int ret = 0; | 539 | int ret = 0; |
543 | int num; | 540 | int num; |
@@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) | |||
558 | return ret; | 555 | return ret; |
559 | } | 556 | } |
560 | 557 | ||
561 | void perf_sysenter_disable(struct ftrace_event_call *call) | 558 | static void perf_sysenter_disable(struct ftrace_event_call *call) |
562 | { | 559 | { |
563 | int num; | 560 | int num; |
564 | 561 | ||
@@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
615 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 612 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
616 | } | 613 | } |
617 | 614 | ||
618 | int perf_sysexit_enable(struct ftrace_event_call *call) | 615 | static int perf_sysexit_enable(struct ftrace_event_call *call) |
619 | { | 616 | { |
620 | int ret = 0; | 617 | int ret = 0; |
621 | int num; | 618 | int num; |
@@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) | |||
636 | return ret; | 633 | return ret; |
637 | } | 634 | } |
638 | 635 | ||
639 | void perf_sysexit_disable(struct ftrace_event_call *call) | 636 | static void perf_sysexit_disable(struct ftrace_event_call *call) |
640 | { | 637 | { |
641 | int num; | 638 | int num; |
642 | 639 | ||
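The trace_syscalls.c hunks give the registration callbacks static linkage and move the event-class and print-funcs definitions below init_syscall_trace(), so the file-scope initializers can name the now-static functions without the forward declarations that were removed at the top. A minimal illustration of that C ordering constraint, with hypothetical types:

#include <linux/kernel.h>

/* Hypothetical event class with the same shape as the hunk above. */
struct my_event_class {
	const char *system;
	int (*raw_init)(void);
};

/* The callback is static, so its definition (or a declaration) must
 * appear before any file-scope initializer that names it ... */
static int my_raw_init(void)
{
	return 0;
}

/* ... which is why the real class definitions moved below
 * init_syscall_trace() once that function lost extern linkage. */
static struct my_event_class my_class __maybe_unused = {
	.system   = "example",
	.raw_init = my_raw_init,
};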
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..c86e6d4f67fb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/uprobes.h> | 23 | #include <linux/uprobes.h> |
24 | #include <linux/namei.h> | 24 | #include <linux/namei.h> |
25 | #include <linux/string.h> | ||
25 | 26 | ||
26 | #include "trace_probe.h" | 27 | #include "trace_probe.h" |
27 | 28 | ||
@@ -189,7 +190,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
189 | if (argv[0][0] == '-') | 190 | if (argv[0][0] == '-') |
190 | is_delete = true; | 191 | is_delete = true; |
191 | else if (argv[0][0] != 'p') { | 192 | else if (argv[0][0] != 'p') { |
192 | pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); | 193 | pr_info("Probe definition must be started with 'p' or '-'.\n"); |
193 | return -EINVAL; | 194 | return -EINVAL; |
194 | } | 195 | } |
195 | 196 | ||
@@ -252,7 +253,7 @@ static int create_trace_uprobe(int argc, char **argv) | |||
252 | if (ret) | 253 | if (ret) |
253 | goto fail_address_parse; | 254 | goto fail_address_parse; |
254 | 255 | ||
255 | ret = strict_strtoul(arg, 0, &offset); | 256 | ret = kstrtoul(arg, 0, &offset); |
256 | if (ret) | 257 | if (ret) |
257 | goto fail_address_parse; | 258 | goto fail_address_parse; |
258 | 259 | ||
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) | |||
263 | 264 | ||
264 | /* setup a probe */ | 265 | /* setup a probe */ |
265 | if (!event) { | 266 | if (!event) { |
266 | char *tail = strrchr(filename, '/'); | 267 | char *tail; |
267 | char *ptr; | 268 | char *ptr; |
268 | 269 | ||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | 270 | tail = kstrdup(kbasename(filename), GFP_KERNEL); |
270 | if (!ptr) { | 271 | if (!tail) { |
271 | ret = -ENOMEM; | 272 | ret = -ENOMEM; |
272 | goto fail_address_parse; | 273 | goto fail_address_parse; |
273 | } | 274 | } |
274 | 275 | ||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | 276 | ptr = strpbrk(tail, ".-_"); |
277 | if (ptr) | 277 | if (ptr) |
278 | *ptr = '\0'; | 278 | *ptr = '\0'; |
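The uprobe hunk swaps an open-coded strrchr(filename, '/') dance for kbasename() from <linux/string.h>, duplicating the basename before truncating it at the first '.', '-' or '_' to form a default event name. A hedged sketch of that derivation outside the probe code; the helper name and its behaviour on allocation failure are assumptions.

#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical helper: derive a default event name from a binary path,
 * e.g. "/usr/bin/my-app" -> "my".  The caller owns (and kfree()s) the
 * result; NULL means allocation failure. */
static char *my_default_event_name(const char *filename)
{
	char *name, *ptr;

	/* kbasename() points just past the last '/', or at the string itself. */
	name = kstrdup(kbasename(filename), GFP_KERNEL);
	if (!name)
		return NULL;

	ptr = strpbrk(name, ".-_");
	if (ptr)
		*ptr = '\0';

	return name;
}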
diff --git a/kernel/wait.c b/kernel/wait.c index 7fdd9eaca2c3..6698e0c04ead 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Generic waiting primitives. | 2 | * Generic waiting primitives. |
3 | * | 3 | * |
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9d4c8d5a1f53..997c6a16ec22 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -31,6 +31,7 @@ | |||
31 | int watchdog_enabled = 1; | 31 | int watchdog_enabled = 1; |
32 | int __read_mostly watchdog_thresh = 10; | 32 | int __read_mostly watchdog_thresh = 10; |
33 | static int __read_mostly watchdog_disabled; | 33 | static int __read_mostly watchdog_disabled; |
34 | static u64 __read_mostly sample_period; | ||
34 | 35 | ||
35 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 36 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
36 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 37 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
@@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) | |||
116 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | 117 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
117 | } | 118 | } |
118 | 119 | ||
119 | static unsigned long get_sample_period(void) | 120 | static void set_sample_period(void) |
120 | { | 121 | { |
121 | /* | 122 | /* |
122 | * convert watchdog_thresh from seconds to ns | 123 | * convert watchdog_thresh from seconds to ns |
@@ -125,7 +126,7 @@ static unsigned long get_sample_period(void) | |||
125 | * and hard thresholds) to increment before the | 126 | * and hard thresholds) to increment before the |
126 | * hardlockup detector generates a warning | 127 | * hardlockup detector generates a warning |
127 | */ | 128 | */ |
128 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); | 129 | sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); |
129 | } | 130 | } |
130 | 131 | ||
131 | /* Commands for resetting the watchdog */ | 132 | /* Commands for resetting the watchdog */ |
@@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
275 | wake_up_process(__this_cpu_read(softlockup_watchdog)); | 276 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
276 | 277 | ||
277 | /* .. and repeat */ | 278 | /* .. and repeat */ |
278 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 279 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); |
279 | 280 | ||
280 | if (touch_ts == 0) { | 281 | if (touch_ts == 0) { |
281 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { | 282 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
@@ -356,7 +357,7 @@ static void watchdog_enable(unsigned int cpu) | |||
356 | hrtimer->function = watchdog_timer_fn; | 357 | hrtimer->function = watchdog_timer_fn; |
357 | 358 | ||
358 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | 359 | /* done here because hrtimer_start can only pin to smp_processor_id() */ |
359 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | 360 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), |
360 | HRTIMER_MODE_REL_PINNED); | 361 | HRTIMER_MODE_REL_PINNED); |
361 | 362 | ||
362 | /* initialize timestamp */ | 363 | /* initialize timestamp */ |
@@ -368,6 +369,9 @@ static void watchdog_disable(unsigned int cpu) | |||
368 | { | 369 | { |
369 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 370 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
370 | 371 | ||
372 | if (!watchdog_enabled) | ||
373 | return; | ||
374 | |||
371 | watchdog_set_prio(SCHED_NORMAL, 0); | 375 | watchdog_set_prio(SCHED_NORMAL, 0); |
372 | hrtimer_cancel(hrtimer); | 376 | hrtimer_cancel(hrtimer); |
373 | /* disable the perf event */ | 377 | /* disable the perf event */ |
@@ -383,7 +387,7 @@ static int watchdog_should_run(unsigned int cpu) | |||
383 | /* | 387 | /* |
384 | * The watchdog thread function - touches the timestamp. | 388 | * The watchdog thread function - touches the timestamp. |
385 | * | 389 | * |
386 | * It only runs once every get_sample_period() seconds (4 seconds by | 390 | * It only runs once every sample_period seconds (4 seconds by |
387 | * default) to reset the softlockup timestamp. If this gets delayed | 391 | * default) to reset the softlockup timestamp. If this gets delayed |
388 | * for more than 2*watchdog_thresh seconds then the debug-printout | 392 | * for more than 2*watchdog_thresh seconds then the debug-printout |
389 | * triggers in watchdog_timer_fn(). | 393 | * triggers in watchdog_timer_fn(). |
@@ -516,6 +520,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
516 | if (ret || !write) | 520 | if (ret || !write) |
517 | return ret; | 521 | return ret; |
518 | 522 | ||
523 | set_sample_period(); | ||
519 | if (watchdog_enabled && watchdog_thresh) | 524 | if (watchdog_enabled && watchdog_thresh) |
520 | watchdog_enable_all_cpus(); | 525 | watchdog_enable_all_cpus(); |
521 | else | 526 | else |
@@ -537,6 +542,7 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
537 | 542 | ||
538 | void __init lockup_detector_init(void) | 543 | void __init lockup_detector_init(void) |
539 | { | 544 | { |
545 | set_sample_period(); | ||
540 | if (smpboot_register_percpu_thread(&watchdog_threads)) { | 546 | if (smpboot_register_percpu_thread(&watchdog_threads)) { |
541 | pr_err("Failed to create watchdog threads, disabled\n"); | 547 | pr_err("Failed to create watchdog threads, disabled\n"); |
542 | watchdog_disabled = -ENODEV; | 548 | watchdog_disabled = -ENODEV; |
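The watchdog now computes its period once in set_sample_period(), at lockup_detector_init() and again when the proc knob changes, and caches it in a u64 instead of recomputing get_sample_period() on every timer round; doing the multiply in 64 bits also keeps a larger threshold from overflowing a 32-bit unsigned long. A small sketch of the calculation, with get_softlockup_thresh() stood in by a plain variable:

#include <linux/kernel.h>
#include <linux/time.h>

static unsigned int my_softlockup_thresh = 20;	/* stand-in for get_softlockup_thresh() */
static u64 my_sample_period;

/* Period = threshold / 5, in nanoseconds, so the hrtimer fires several
 * times within the softlockup window.  The multiply is done in 64 bits
 * so a larger threshold cannot overflow a 32-bit unsigned long. */
static void my_set_sample_period(void)
{
	my_sample_period = my_softlockup_thresh * ((u64)NSEC_PER_SEC / 5);
}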
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 042d221d33cc..fbc6576a83c3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -739,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
739 | { | 739 | { |
740 | struct worker *worker = kthread_data(task); | 740 | struct worker *worker = kthread_data(task); |
741 | 741 | ||
742 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 742 | if (!(worker->flags & WORKER_NOT_RUNNING)) { |
743 | WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); | ||
743 | atomic_inc(get_pool_nr_running(worker->pool)); | 744 | atomic_inc(get_pool_nr_running(worker->pool)); |
745 | } | ||
744 | } | 746 | } |
745 | 747 | ||
746 | /** | 748 | /** |
@@ -1361,8 +1363,19 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | |||
1361 | 1363 | ||
1362 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || | 1364 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || |
1363 | timer->data != (unsigned long)dwork); | 1365 | timer->data != (unsigned long)dwork); |
1364 | BUG_ON(timer_pending(timer)); | 1366 | WARN_ON_ONCE(timer_pending(timer)); |
1365 | BUG_ON(!list_empty(&work->entry)); | 1367 | WARN_ON_ONCE(!list_empty(&work->entry)); |
1368 | |||
1369 | /* | ||
1370 | * If @delay is 0, queue @dwork->work immediately. This is for | ||
1371 | * both optimization and correctness. The earliest @timer can | ||
1372 | * expire is on the closest next tick and delayed_work users depend | ||
1373 | * on that there's no such delay when @delay is 0. | ||
1374 | */ | ||
1375 | if (!delay) { | ||
1376 | __queue_work(cpu, wq, &dwork->work); | ||
1377 | return; | ||
1378 | } | ||
1366 | 1379 | ||
1367 | timer_stats_timer_set_start_info(&dwork->timer); | 1380 | timer_stats_timer_set_start_info(&dwork->timer); |
1368 | 1381 | ||
@@ -1417,9 +1430,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
1417 | bool ret = false; | 1430 | bool ret = false; |
1418 | unsigned long flags; | 1431 | unsigned long flags; |
1419 | 1432 | ||
1420 | if (!delay) | ||
1421 | return queue_work_on(cpu, wq, &dwork->work); | ||
1422 | |||
1423 | /* read the comment in __queue_work() */ | 1433 | /* read the comment in __queue_work() */ |
1424 | local_irq_save(flags); | 1434 | local_irq_save(flags); |
1425 | 1435 | ||
@@ -2407,8 +2417,10 @@ static int rescuer_thread(void *__wq) | |||
2407 | repeat: | 2417 | repeat: |
2408 | set_current_state(TASK_INTERRUPTIBLE); | 2418 | set_current_state(TASK_INTERRUPTIBLE); |
2409 | 2419 | ||
2410 | if (kthread_should_stop()) | 2420 | if (kthread_should_stop()) { |
2421 | __set_current_state(TASK_RUNNING); | ||
2411 | return 0; | 2422 | return 0; |
2423 | } | ||
2412 | 2424 | ||
2413 | /* | 2425 | /* |
2414 | * See whether any cpu is asking for help. Unbounded | 2426 | * See whether any cpu is asking for help. Unbounded |
@@ -3475,7 +3487,7 @@ unsigned int work_busy(struct work_struct *work) | |||
3475 | unsigned int ret = 0; | 3487 | unsigned int ret = 0; |
3476 | 3488 | ||
3477 | if (!gcwq) | 3489 | if (!gcwq) |
3478 | return false; | 3490 | return 0; |
3479 | 3491 | ||
3480 | spin_lock_irqsave(&gcwq->lock, flags); | 3492 | spin_lock_irqsave(&gcwq->lock, flags); |
3481 | 3493 | ||
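The workqueue hunks downgrade the queueing sanity checks from BUG_ON() to WARN_ON_ONCE() and move the delay == 0 special case into __queue_delayed_work(), so a zero delay is queued immediately rather than waiting out the timer's next-tick granularity. From a caller's point of view this is exercised with an ordinary delayed work item; a hedged module sketch with hypothetical names:

#include <linux/module.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	pr_info("delayed work ran\n");
}

static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

static int __init my_init(void)
{
	/*
	 * With the hunk above, a zero delay is handled inside
	 * __queue_delayed_work() and the work is queued immediately,
	 * rather than waiting for the timer's next-tick granularity.
	 * A non-zero delay still goes through the timer path.
	 */
	schedule_delayed_work(&my_dwork, 0);
	return 0;
}

static void __exit my_exit(void)
{
	cancel_delayed_work_sync(&my_dwork);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");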