diff options
Diffstat (limited to 'kernel')
100 files changed, 5872 insertions, 3527 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..0b5ff083fa22 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o jump_label.o |
14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o | ||
15 | obj-y += groups.o | 14 | obj-y += groups.o |
16 | 15 | ||
17 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER |
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg | |||
23 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 22 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
24 | CFLAGS_REMOVE_sched_clock.o = -pg | 23 | CFLAGS_REMOVE_sched_clock.o = -pg |
25 | CFLAGS_REMOVE_perf_event.o = -pg | 24 | CFLAGS_REMOVE_perf_event.o = -pg |
25 | CFLAGS_REMOVE_irq_work.o = -pg | ||
26 | endif | 26 | endif |
27 | 27 | ||
28 | obj-$(CONFIG_FREEZER) += freezer.o | 28 | obj-$(CONFIG_FREEZER) += freezer.o |
@@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o | |||
86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o | 86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o |
87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o | 87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o |
88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o | 88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o |
89 | obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o | ||
89 | obj-$(CONFIG_RELAY) += relay.o | 90 | obj-$(CONFIG_RELAY) += relay.o |
90 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 91 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
91 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 92 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
@@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/ | |||
100 | obj-$(CONFIG_X86_DS) += trace/ | 101 | obj-$(CONFIG_X86_DS) += trace/ |
101 | obj-$(CONFIG_RING_BUFFER) += trace/ | 102 | obj-$(CONFIG_RING_BUFFER) += trace/ |
102 | obj-$(CONFIG_SMP) += sched_cpupri.o | 103 | obj-$(CONFIG_SMP) += sched_cpupri.o |
104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | ||
103 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 105 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
104 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 106 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
105 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 107 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..5cf366965d0c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -52,7 +52,6 @@ | |||
52 | #include <linux/cgroupstats.h> | 52 | #include <linux/cgroupstats.h> |
53 | #include <linux/hash.h> | 53 | #include <linux/hash.h> |
54 | #include <linux/namei.h> | 54 | #include <linux/namei.h> |
55 | #include <linux/smp_lock.h> | ||
56 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
57 | #include <linux/idr.h> | 56 | #include <linux/idr.h> |
58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
@@ -138,7 +137,7 @@ struct css_id { | |||
138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 137 | * is called after synchronize_rcu(). But for safe use, css_is_removed() |
139 | * css_tryget() should be used for avoiding race. | 138 | * css_tryget() should be used for avoiding race. |
140 | */ | 139 | */ |
141 | struct cgroup_subsys_state *css; | 140 | struct cgroup_subsys_state __rcu *css; |
142 | /* | 141 | /* |
143 | * ID of this css. | 142 | * ID of this css. |
144 | */ | 143 | */ |
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 243 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
245 | } | 244 | } |
246 | 245 | ||
246 | static int clone_children(const struct cgroup *cgrp) | ||
247 | { | ||
248 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
249 | } | ||
250 | |||
247 | /* | 251 | /* |
248 | * for_each_subsys() allows you to iterate on each subsystem attached to | 252 | * for_each_subsys() allows you to iterate on each subsystem attached to |
249 | * an active hierarchy | 253 | * an active hierarchy |
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
778 | struct inode *inode = new_inode(sb); | 782 | struct inode *inode = new_inode(sb); |
779 | 783 | ||
780 | if (inode) { | 784 | if (inode) { |
785 | inode->i_ino = get_next_ino(); | ||
781 | inode->i_mode = mode; | 786 | inode->i_mode = mode; |
782 | inode->i_uid = current_fsuid(); | 787 | inode->i_uid = current_fsuid(); |
783 | inode->i_gid = current_fsgid(); | 788 | inode->i_gid = current_fsgid(); |
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1040 | seq_puts(seq, ",noprefix"); | 1045 | seq_puts(seq, ",noprefix"); |
1041 | if (strlen(root->release_agent_path)) | 1046 | if (strlen(root->release_agent_path)) |
1042 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1047 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1048 | if (clone_children(&root->top_cgroup)) | ||
1049 | seq_puts(seq, ",clone_children"); | ||
1043 | if (strlen(root->name)) | 1050 | if (strlen(root->name)) |
1044 | seq_printf(seq, ",name=%s", root->name); | 1051 | seq_printf(seq, ",name=%s", root->name); |
1045 | mutex_unlock(&cgroup_mutex); | 1052 | mutex_unlock(&cgroup_mutex); |
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts { | |||
1050 | unsigned long subsys_bits; | 1057 | unsigned long subsys_bits; |
1051 | unsigned long flags; | 1058 | unsigned long flags; |
1052 | char *release_agent; | 1059 | char *release_agent; |
1060 | bool clone_children; | ||
1053 | char *name; | 1061 | char *name; |
1054 | /* User explicitly requested empty subsystem */ | 1062 | /* User explicitly requested empty subsystem */ |
1055 | bool none; | 1063 | bool none; |
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts { | |||
1066 | */ | 1074 | */ |
1067 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1075 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1068 | { | 1076 | { |
1069 | char *token, *o = data ?: "all"; | 1077 | char *token, *o = data; |
1078 | bool all_ss = false, one_ss = false; | ||
1070 | unsigned long mask = (unsigned long)-1; | 1079 | unsigned long mask = (unsigned long)-1; |
1071 | int i; | 1080 | int i; |
1072 | bool module_pin_failed = false; | 1081 | bool module_pin_failed = false; |
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1082 | while ((token = strsep(&o, ",")) != NULL) { | 1091 | while ((token = strsep(&o, ",")) != NULL) { |
1083 | if (!*token) | 1092 | if (!*token) |
1084 | return -EINVAL; | 1093 | return -EINVAL; |
1085 | if (!strcmp(token, "all")) { | 1094 | if (!strcmp(token, "none")) { |
1086 | /* Add all non-disabled subsystems */ | ||
1087 | opts->subsys_bits = 0; | ||
1088 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1089 | struct cgroup_subsys *ss = subsys[i]; | ||
1090 | if (ss == NULL) | ||
1091 | continue; | ||
1092 | if (!ss->disabled) | ||
1093 | opts->subsys_bits |= 1ul << i; | ||
1094 | } | ||
1095 | } else if (!strcmp(token, "none")) { | ||
1096 | /* Explicitly have no subsystems */ | 1095 | /* Explicitly have no subsystems */ |
1097 | opts->none = true; | 1096 | opts->none = true; |
1098 | } else if (!strcmp(token, "noprefix")) { | 1097 | continue; |
1098 | } | ||
1099 | if (!strcmp(token, "all")) { | ||
1100 | /* Mutually exclusive option 'all' + subsystem name */ | ||
1101 | if (one_ss) | ||
1102 | return -EINVAL; | ||
1103 | all_ss = true; | ||
1104 | continue; | ||
1105 | } | ||
1106 | if (!strcmp(token, "noprefix")) { | ||
1099 | set_bit(ROOT_NOPREFIX, &opts->flags); | 1107 | set_bit(ROOT_NOPREFIX, &opts->flags); |
1100 | } else if (!strncmp(token, "release_agent=", 14)) { | 1108 | continue; |
1109 | } | ||
1110 | if (!strcmp(token, "clone_children")) { | ||
1111 | opts->clone_children = true; | ||
1112 | continue; | ||
1113 | } | ||
1114 | if (!strncmp(token, "release_agent=", 14)) { | ||
1101 | /* Specifying two release agents is forbidden */ | 1115 | /* Specifying two release agents is forbidden */ |
1102 | if (opts->release_agent) | 1116 | if (opts->release_agent) |
1103 | return -EINVAL; | 1117 | return -EINVAL; |
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1105 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); | 1119 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
1106 | if (!opts->release_agent) | 1120 | if (!opts->release_agent) |
1107 | return -ENOMEM; | 1121 | return -ENOMEM; |
1108 | } else if (!strncmp(token, "name=", 5)) { | 1122 | continue; |
1123 | } | ||
1124 | if (!strncmp(token, "name=", 5)) { | ||
1109 | const char *name = token + 5; | 1125 | const char *name = token + 5; |
1110 | /* Can't specify an empty name */ | 1126 | /* Can't specify an empty name */ |
1111 | if (!strlen(name)) | 1127 | if (!strlen(name)) |
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1127 | GFP_KERNEL); | 1143 | GFP_KERNEL); |
1128 | if (!opts->name) | 1144 | if (!opts->name) |
1129 | return -ENOMEM; | 1145 | return -ENOMEM; |
1130 | } else { | 1146 | |
1131 | struct cgroup_subsys *ss; | 1147 | continue; |
1132 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1148 | } |
1133 | ss = subsys[i]; | 1149 | |
1134 | if (ss == NULL) | 1150 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1135 | continue; | 1151 | struct cgroup_subsys *ss = subsys[i]; |
1136 | if (!strcmp(token, ss->name)) { | 1152 | if (ss == NULL) |
1137 | if (!ss->disabled) | 1153 | continue; |
1138 | set_bit(i, &opts->subsys_bits); | 1154 | if (strcmp(token, ss->name)) |
1139 | break; | 1155 | continue; |
1140 | } | 1156 | if (ss->disabled) |
1141 | } | 1157 | continue; |
1142 | if (i == CGROUP_SUBSYS_COUNT) | 1158 | |
1143 | return -ENOENT; | 1159 | /* Mutually exclusive option 'all' + subsystem name */ |
1160 | if (all_ss) | ||
1161 | return -EINVAL; | ||
1162 | set_bit(i, &opts->subsys_bits); | ||
1163 | one_ss = true; | ||
1164 | |||
1165 | break; | ||
1166 | } | ||
1167 | if (i == CGROUP_SUBSYS_COUNT) | ||
1168 | return -ENOENT; | ||
1169 | } | ||
1170 | |||
1171 | /* | ||
1172 | * If the 'all' option was specified select all the subsystems, | ||
1173 | * otherwise if 'all', 'none' and a subsystem name options were not | ||
1174 | * specified, let's default to 'all' | ||
1175 | */ | ||
1176 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | ||
1177 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1178 | struct cgroup_subsys *ss = subsys[i]; | ||
1179 | if (ss == NULL) | ||
1180 | continue; | ||
1181 | if (ss->disabled) | ||
1182 | continue; | ||
1183 | set_bit(i, &opts->subsys_bits); | ||
1144 | } | 1184 | } |
1145 | } | 1185 | } |
1146 | 1186 | ||
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1222 | struct cgroup *cgrp = &root->top_cgroup; | 1262 | struct cgroup *cgrp = &root->top_cgroup; |
1223 | struct cgroup_sb_opts opts; | 1263 | struct cgroup_sb_opts opts; |
1224 | 1264 | ||
1225 | lock_kernel(); | ||
1226 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1265 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1227 | mutex_lock(&cgroup_mutex); | 1266 | mutex_lock(&cgroup_mutex); |
1228 | 1267 | ||
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1255 | kfree(opts.name); | 1294 | kfree(opts.name); |
1256 | mutex_unlock(&cgroup_mutex); | 1295 | mutex_unlock(&cgroup_mutex); |
1257 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1296 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1258 | unlock_kernel(); | ||
1259 | return ret; | 1297 | return ret; |
1260 | } | 1298 | } |
1261 | 1299 | ||
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1357 | strcpy(root->release_agent_path, opts->release_agent); | 1395 | strcpy(root->release_agent_path, opts->release_agent); |
1358 | if (opts->name) | 1396 | if (opts->name) |
1359 | strcpy(root->name, opts->name); | 1397 | strcpy(root->name, opts->name); |
1398 | if (opts->clone_children) | ||
1399 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | ||
1360 | return root; | 1400 | return root; |
1361 | } | 1401 | } |
1362 | 1402 | ||
@@ -1568,7 +1608,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1568 | out_err: | 1608 | out_err: |
1569 | kfree(opts.release_agent); | 1609 | kfree(opts.release_agent); |
1570 | kfree(opts.name); | 1610 | kfree(opts.name); |
1571 | |||
1572 | return ret; | 1611 | return ret; |
1573 | } | 1612 | } |
1574 | 1613 | ||
@@ -1883,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
1883 | const char *buffer) | 1922 | const char *buffer) |
1884 | { | 1923 | { |
1885 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 1924 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
1925 | if (strlen(buffer) >= PATH_MAX) | ||
1926 | return -EINVAL; | ||
1886 | if (!cgroup_lock_live_group(cgrp)) | 1927 | if (!cgroup_lock_live_group(cgrp)) |
1887 | return -ENODEV; | 1928 | return -ENODEV; |
1888 | strcpy(cgrp->root->release_agent_path, buffer); | 1929 | strcpy(cgrp->root->release_agent_path, buffer); |
@@ -3176,6 +3217,23 @@ fail: | |||
3176 | return ret; | 3217 | return ret; |
3177 | } | 3218 | } |
3178 | 3219 | ||
3220 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | ||
3221 | struct cftype *cft) | ||
3222 | { | ||
3223 | return clone_children(cgrp); | ||
3224 | } | ||
3225 | |||
3226 | static int cgroup_clone_children_write(struct cgroup *cgrp, | ||
3227 | struct cftype *cft, | ||
3228 | u64 val) | ||
3229 | { | ||
3230 | if (val) | ||
3231 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3232 | else | ||
3233 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3234 | return 0; | ||
3235 | } | ||
3236 | |||
3179 | /* | 3237 | /* |
3180 | * for the common functions, 'private' gives the type of file | 3238 | * for the common functions, 'private' gives the type of file |
3181 | */ | 3239 | */ |
@@ -3206,6 +3264,11 @@ static struct cftype files[] = { | |||
3206 | .write_string = cgroup_write_event_control, | 3264 | .write_string = cgroup_write_event_control, |
3207 | .mode = S_IWUGO, | 3265 | .mode = S_IWUGO, |
3208 | }, | 3266 | }, |
3267 | { | ||
3268 | .name = "cgroup.clone_children", | ||
3269 | .read_u64 = cgroup_clone_children_read, | ||
3270 | .write_u64 = cgroup_clone_children_write, | ||
3271 | }, | ||
3209 | }; | 3272 | }; |
3210 | 3273 | ||
3211 | static struct cftype cft_release_agent = { | 3274 | static struct cftype cft_release_agent = { |
@@ -3335,6 +3398,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3335 | if (notify_on_release(parent)) | 3398 | if (notify_on_release(parent)) |
3336 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3399 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3337 | 3400 | ||
3401 | if (clone_children(parent)) | ||
3402 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3403 | |||
3338 | for_each_subsys(root, ss) { | 3404 | for_each_subsys(root, ss) { |
3339 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3405 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
3340 | 3406 | ||
@@ -3349,6 +3415,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3349 | goto err_destroy; | 3415 | goto err_destroy; |
3350 | } | 3416 | } |
3351 | /* At error, ->destroy() callback has to free assigned ID. */ | 3417 | /* At error, ->destroy() callback has to free assigned ID. */ |
3418 | if (clone_children(parent) && ss->post_clone) | ||
3419 | ss->post_clone(ss, cgrp); | ||
3352 | } | 3420 | } |
3353 | 3421 | ||
3354 | cgroup_lock_hierarchy(root); | 3422 | cgroup_lock_hierarchy(root); |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88f..e7bebb7c6c38 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 48 | struct freezer, css); |
49 | } | 49 | } |
50 | 50 | ||
51 | int cgroup_freezing_or_frozen(struct task_struct *task) | 51 | static inline int __cgroup_freezing_or_frozen(struct task_struct *task) |
52 | { | 52 | { |
53 | struct freezer *freezer; | 53 | enum freezer_state state = task_freezer(task)->state; |
54 | enum freezer_state state; | 54 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); |
55 | } | ||
55 | 56 | ||
57 | int cgroup_freezing_or_frozen(struct task_struct *task) | ||
58 | { | ||
59 | int result; | ||
56 | task_lock(task); | 60 | task_lock(task); |
57 | freezer = task_freezer(task); | 61 | result = __cgroup_freezing_or_frozen(task); |
58 | if (!freezer->css.cgroup->parent) | ||
59 | state = CGROUP_THAWED; /* root cgroup can't be frozen */ | ||
60 | else | ||
61 | state = freezer->state; | ||
62 | task_unlock(task); | 62 | task_unlock(task); |
63 | 63 | return result; | |
64 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); | ||
65 | } | 64 | } |
66 | 65 | ||
67 | /* | 66 | /* |
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
154 | kfree(cgroup_freezer(cgroup)); | 153 | kfree(cgroup_freezer(cgroup)); |
155 | } | 154 | } |
156 | 155 | ||
157 | /* Task is frozen or will freeze immediately when next it gets woken */ | ||
158 | static bool is_task_frozen_enough(struct task_struct *task) | ||
159 | { | ||
160 | return frozen(task) || | ||
161 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
162 | } | ||
163 | |||
164 | /* | 156 | /* |
165 | * The call to cgroup_lock() in the freezer.state write method prevents | 157 | * The call to cgroup_lock() in the freezer.state write method prevents |
166 | * a write to that file racing against an attach, and hence the | 158 | * a write to that file racing against an attach, and hence the |
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
174 | 166 | ||
175 | /* | 167 | /* |
176 | * Anything frozen can't move or be moved to/from. | 168 | * Anything frozen can't move or be moved to/from. |
177 | * | ||
178 | * Since orig_freezer->state == FROZEN means that @task has been | ||
179 | * frozen, so it's sufficient to check the latter condition. | ||
180 | */ | 169 | */ |
181 | 170 | ||
182 | if (is_task_frozen_enough(task)) | 171 | freezer = cgroup_freezer(new_cgroup); |
172 | if (freezer->state != CGROUP_THAWED) | ||
183 | return -EBUSY; | 173 | return -EBUSY; |
184 | 174 | ||
185 | freezer = cgroup_freezer(new_cgroup); | 175 | rcu_read_lock(); |
186 | if (freezer->state == CGROUP_FROZEN) | 176 | if (__cgroup_freezing_or_frozen(task)) { |
177 | rcu_read_unlock(); | ||
187 | return -EBUSY; | 178 | return -EBUSY; |
179 | } | ||
180 | rcu_read_unlock(); | ||
188 | 181 | ||
189 | if (threadgroup) { | 182 | if (threadgroup) { |
190 | struct task_struct *c; | 183 | struct task_struct *c; |
191 | 184 | ||
192 | rcu_read_lock(); | 185 | rcu_read_lock(); |
193 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | 186 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { |
194 | if (is_task_frozen_enough(c)) { | 187 | if (__cgroup_freezing_or_frozen(c)) { |
195 | rcu_read_unlock(); | 188 | rcu_read_unlock(); |
196 | return -EBUSY; | 189 | return -EBUSY; |
197 | } | 190 | } |
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
236 | /* | 229 | /* |
237 | * caller must hold freezer->lock | 230 | * caller must hold freezer->lock |
238 | */ | 231 | */ |
239 | static void update_freezer_state(struct cgroup *cgroup, | 232 | static void update_if_frozen(struct cgroup *cgroup, |
240 | struct freezer *freezer) | 233 | struct freezer *freezer) |
241 | { | 234 | { |
242 | struct cgroup_iter it; | 235 | struct cgroup_iter it; |
243 | struct task_struct *task; | 236 | struct task_struct *task; |
244 | unsigned int nfrozen = 0, ntotal = 0; | 237 | unsigned int nfrozen = 0, ntotal = 0; |
238 | enum freezer_state old_state = freezer->state; | ||
245 | 239 | ||
246 | cgroup_iter_start(cgroup, &it); | 240 | cgroup_iter_start(cgroup, &it); |
247 | while ((task = cgroup_iter_next(cgroup, &it))) { | 241 | while ((task = cgroup_iter_next(cgroup, &it))) { |
248 | ntotal++; | 242 | ntotal++; |
249 | if (is_task_frozen_enough(task)) | 243 | if (frozen(task)) |
250 | nfrozen++; | 244 | nfrozen++; |
251 | } | 245 | } |
252 | 246 | ||
253 | /* | 247 | if (old_state == CGROUP_THAWED) { |
254 | * Transition to FROZEN when no new tasks can be added ensures | 248 | BUG_ON(nfrozen > 0); |
255 | * that we never exist in the FROZEN state while there are unfrozen | 249 | } else if (old_state == CGROUP_FREEZING) { |
256 | * tasks. | 250 | if (nfrozen == ntotal) |
257 | */ | 251 | freezer->state = CGROUP_FROZEN; |
258 | if (nfrozen == ntotal) | 252 | } else { /* old_state == CGROUP_FROZEN */ |
259 | freezer->state = CGROUP_FROZEN; | 253 | BUG_ON(nfrozen != ntotal); |
260 | else if (nfrozen > 0) | 254 | } |
261 | freezer->state = CGROUP_FREEZING; | 255 | |
262 | else | ||
263 | freezer->state = CGROUP_THAWED; | ||
264 | cgroup_iter_end(cgroup, &it); | 256 | cgroup_iter_end(cgroup, &it); |
265 | } | 257 | } |
266 | 258 | ||
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | |||
279 | if (state == CGROUP_FREEZING) { | 271 | if (state == CGROUP_FREEZING) { |
280 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 272 | /* We change from FREEZING to FROZEN lazily if the cgroup was |
281 | * only partially frozen when we exited write. */ | 273 | * only partially frozen when we exited write. */ |
282 | update_freezer_state(cgroup, freezer); | 274 | update_if_frozen(cgroup, freezer); |
283 | state = freezer->state; | 275 | state = freezer->state; |
284 | } | 276 | } |
285 | spin_unlock_irq(&freezer->lock); | 277 | spin_unlock_irq(&freezer->lock); |
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
301 | while ((task = cgroup_iter_next(cgroup, &it))) { | 293 | while ((task = cgroup_iter_next(cgroup, &it))) { |
302 | if (!freeze_task(task, true)) | 294 | if (!freeze_task(task, true)) |
303 | continue; | 295 | continue; |
304 | if (is_task_frozen_enough(task)) | 296 | if (frozen(task)) |
305 | continue; | 297 | continue; |
306 | if (!freezing(task) && !freezer_should_skip(task)) | 298 | if (!freezing(task) && !freezer_should_skip(task)) |
307 | num_cant_freeze_now++; | 299 | num_cant_freeze_now++; |
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup, | |||
335 | 327 | ||
336 | spin_lock_irq(&freezer->lock); | 328 | spin_lock_irq(&freezer->lock); |
337 | 329 | ||
338 | update_freezer_state(cgroup, freezer); | 330 | update_if_frozen(cgroup, freezer); |
339 | if (goal_state == freezer->state) | 331 | if (goal_state == freezer->state) |
340 | goto out; | 332 | goto out; |
341 | 333 | ||
diff --git a/kernel/configs.c b/kernel/configs.c index abaee684ecbf..b4066b44a99d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf, | |||
66 | static const struct file_operations ikconfig_file_ops = { | 66 | static const struct file_operations ikconfig_file_ops = { |
67 | .owner = THIS_MODULE, | 67 | .owner = THIS_MODULE, |
68 | .read = ikconfig_read_current, | 68 | .read = ikconfig_read_current, |
69 | .llseek = default_llseek, | ||
69 | }; | 70 | }; |
70 | 71 | ||
71 | static int __init ikconfig_init(void) | 72 | static int __init ikconfig_init(void) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1397 | if (tsk->flags & PF_THREAD_BOUND) | 1397 | if (tsk->flags & PF_THREAD_BOUND) |
1398 | return -EINVAL; | 1398 | return -EINVAL; |
1399 | 1399 | ||
1400 | ret = security_task_setscheduler(tsk, 0, NULL); | 1400 | ret = security_task_setscheduler(tsk); |
1401 | if (ret) | 1401 | if (ret) |
1402 | return ret; | 1402 | return ret; |
1403 | if (threadgroup) { | 1403 | if (threadgroup) { |
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1405 | 1405 | ||
1406 | rcu_read_lock(); | 1406 | rcu_read_lock(); |
1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { |
1408 | ret = security_task_setscheduler(c, 0, NULL); | 1408 | ret = security_task_setscheduler(c); |
1409 | if (ret) { | 1409 | if (ret) { |
1410 | rcu_read_unlock(); | 1410 | rcu_read_unlock(); |
1411 | return ret; | 1411 | return ret; |
diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe7..6a1aa004e376 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); | |||
325 | 325 | ||
326 | /* | 326 | /* |
327 | * Prepare credentials for current to perform an execve() | 327 | * Prepare credentials for current to perform an execve() |
328 | * - The caller must hold current->cred_guard_mutex | 328 | * - The caller must hold ->cred_guard_mutex |
329 | */ | 329 | */ |
330 | struct cred *prepare_exec_creds(void) | 330 | struct cred *prepare_exec_creds(void) |
331 | { | 331 | { |
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
384 | struct cred *new; | 384 | struct cred *new; |
385 | int ret; | 385 | int ret; |
386 | 386 | ||
387 | mutex_init(&p->cred_guard_mutex); | ||
388 | |||
389 | if ( | 387 | if ( |
390 | #ifdef CONFIG_KEYS | 388 | #ifdef CONFIG_KEYS |
391 | !p->cred->thread_keyring && | 389 | !p->cred->thread_keyring && |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index de407c78178d..fec596da9bd0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/pid.h> | 47 | #include <linux/pid.h> |
48 | #include <linux/smp.h> | 48 | #include <linux/smp.h> |
49 | #include <linux/mm.h> | 49 | #include <linux/mm.h> |
50 | #include <linux/rcupdate.h> | ||
50 | 51 | ||
51 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
52 | #include <asm/byteorder.h> | 53 | #include <asm/byteorder.h> |
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { | |||
109 | */ | 110 | */ |
110 | atomic_t kgdb_active = ATOMIC_INIT(-1); | 111 | atomic_t kgdb_active = ATOMIC_INIT(-1); |
111 | EXPORT_SYMBOL_GPL(kgdb_active); | 112 | EXPORT_SYMBOL_GPL(kgdb_active); |
113 | static DEFINE_RAW_SPINLOCK(dbg_master_lock); | ||
114 | static DEFINE_RAW_SPINLOCK(dbg_slave_lock); | ||
112 | 115 | ||
113 | /* | 116 | /* |
114 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early | 117 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early |
115 | * bootup code (which might not have percpu set up yet): | 118 | * bootup code (which might not have percpu set up yet): |
116 | */ | 119 | */ |
117 | static atomic_t passive_cpu_wait[NR_CPUS]; | 120 | static atomic_t masters_in_kgdb; |
118 | static atomic_t cpu_in_kgdb[NR_CPUS]; | 121 | static atomic_t slaves_in_kgdb; |
119 | static atomic_t kgdb_break_tasklet_var; | 122 | static atomic_t kgdb_break_tasklet_var; |
120 | atomic_t kgdb_setting_breakpoint; | 123 | atomic_t kgdb_setting_breakpoint; |
121 | 124 | ||
@@ -457,26 +460,32 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
457 | return 1; | 460 | return 1; |
458 | } | 461 | } |
459 | 462 | ||
460 | static void dbg_cpu_switch(int cpu, int next_cpu) | 463 | static void dbg_touch_watchdogs(void) |
461 | { | 464 | { |
462 | /* Mark the cpu we are switching away from as a slave when it | 465 | touch_softlockup_watchdog_sync(); |
463 | * holds the kgdb_active token. This must be done so that the | 466 | clocksource_touch_watchdog(); |
464 | * that all the cpus wait in for the debug core will not enter | 467 | rcu_cpu_stall_reset(); |
465 | * again as the master. */ | ||
466 | if (cpu == atomic_read(&kgdb_active)) { | ||
467 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
468 | kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; | ||
469 | } | ||
470 | kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; | ||
471 | } | 468 | } |
472 | 469 | ||
473 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) | 470 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, |
471 | int exception_state) | ||
474 | { | 472 | { |
475 | unsigned long flags; | 473 | unsigned long flags; |
476 | int sstep_tries = 100; | 474 | int sstep_tries = 100; |
477 | int error; | 475 | int error; |
478 | int i, cpu; | 476 | int cpu; |
479 | int trace_on = 0; | 477 | int trace_on = 0; |
478 | int online_cpus = num_online_cpus(); | ||
479 | |||
480 | kgdb_info[ks->cpu].enter_kgdb++; | ||
481 | kgdb_info[ks->cpu].exception_state |= exception_state; | ||
482 | |||
483 | if (exception_state == DCPU_WANT_MASTER) | ||
484 | atomic_inc(&masters_in_kgdb); | ||
485 | else | ||
486 | atomic_inc(&slaves_in_kgdb); | ||
487 | kgdb_disable_hw_debug(ks->linux_regs); | ||
488 | |||
480 | acquirelock: | 489 | acquirelock: |
481 | /* | 490 | /* |
482 | * Interrupts will be restored by the 'trap return' code, except when | 491 | * Interrupts will be restored by the 'trap return' code, except when |
@@ -489,14 +498,15 @@ acquirelock: | |||
489 | kgdb_info[cpu].task = current; | 498 | kgdb_info[cpu].task = current; |
490 | kgdb_info[cpu].ret_state = 0; | 499 | kgdb_info[cpu].ret_state = 0; |
491 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; | 500 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; |
492 | /* | ||
493 | * Make sure the above info reaches the primary CPU before | ||
494 | * our cpu_in_kgdb[] flag setting does: | ||
495 | */ | ||
496 | atomic_inc(&cpu_in_kgdb[cpu]); | ||
497 | 501 | ||
498 | if (exception_level == 1) | 502 | /* Make sure the above info reaches the primary CPU */ |
503 | smp_mb(); | ||
504 | |||
505 | if (exception_level == 1) { | ||
506 | if (raw_spin_trylock(&dbg_master_lock)) | ||
507 | atomic_xchg(&kgdb_active, cpu); | ||
499 | goto cpu_master_loop; | 508 | goto cpu_master_loop; |
509 | } | ||
500 | 510 | ||
501 | /* | 511 | /* |
502 | * CPU will loop if it is a slave or request to become a kgdb | 512 | * CPU will loop if it is a slave or request to become a kgdb |
@@ -508,10 +518,12 @@ cpu_loop: | |||
508 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; | 518 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; |
509 | goto cpu_master_loop; | 519 | goto cpu_master_loop; |
510 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { | 520 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { |
511 | if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) | 521 | if (raw_spin_trylock(&dbg_master_lock)) { |
522 | atomic_xchg(&kgdb_active, cpu); | ||
512 | break; | 523 | break; |
524 | } | ||
513 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { | 525 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { |
514 | if (!atomic_read(&passive_cpu_wait[cpu])) | 526 | if (!raw_spin_is_locked(&dbg_slave_lock)) |
515 | goto return_normal; | 527 | goto return_normal; |
516 | } else { | 528 | } else { |
517 | return_normal: | 529 | return_normal: |
@@ -522,9 +534,12 @@ return_normal: | |||
522 | arch_kgdb_ops.correct_hw_break(); | 534 | arch_kgdb_ops.correct_hw_break(); |
523 | if (trace_on) | 535 | if (trace_on) |
524 | tracing_on(); | 536 | tracing_on(); |
525 | atomic_dec(&cpu_in_kgdb[cpu]); | 537 | kgdb_info[cpu].exception_state &= |
526 | touch_softlockup_watchdog_sync(); | 538 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
527 | clocksource_touch_watchdog(); | 539 | kgdb_info[cpu].enter_kgdb--; |
540 | smp_mb__before_atomic_dec(); | ||
541 | atomic_dec(&slaves_in_kgdb); | ||
542 | dbg_touch_watchdogs(); | ||
528 | local_irq_restore(flags); | 543 | local_irq_restore(flags); |
529 | return 0; | 544 | return 0; |
530 | } | 545 | } |
@@ -541,8 +556,8 @@ return_normal: | |||
541 | (kgdb_info[cpu].task && | 556 | (kgdb_info[cpu].task && |
542 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { | 557 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { |
543 | atomic_set(&kgdb_active, -1); | 558 | atomic_set(&kgdb_active, -1); |
544 | touch_softlockup_watchdog_sync(); | 559 | raw_spin_unlock(&dbg_master_lock); |
545 | clocksource_touch_watchdog(); | 560 | dbg_touch_watchdogs(); |
546 | local_irq_restore(flags); | 561 | local_irq_restore(flags); |
547 | 562 | ||
548 | goto acquirelock; | 563 | goto acquirelock; |
@@ -563,16 +578,12 @@ return_normal: | |||
563 | if (dbg_io_ops->pre_exception) | 578 | if (dbg_io_ops->pre_exception) |
564 | dbg_io_ops->pre_exception(); | 579 | dbg_io_ops->pre_exception(); |
565 | 580 | ||
566 | kgdb_disable_hw_debug(ks->linux_regs); | ||
567 | |||
568 | /* | 581 | /* |
569 | * Get the passive CPU lock which will hold all the non-primary | 582 | * Get the passive CPU lock which will hold all the non-primary |
570 | * CPU in a spin state while the debugger is active | 583 | * CPU in a spin state while the debugger is active |
571 | */ | 584 | */ |
572 | if (!kgdb_single_step) { | 585 | if (!kgdb_single_step) |
573 | for (i = 0; i < NR_CPUS; i++) | 586 | raw_spin_lock(&dbg_slave_lock); |
574 | atomic_inc(&passive_cpu_wait[i]); | ||
575 | } | ||
576 | 587 | ||
577 | #ifdef CONFIG_SMP | 588 | #ifdef CONFIG_SMP |
578 | /* Signal the other CPUs to enter kgdb_wait() */ | 589 | /* Signal the other CPUs to enter kgdb_wait() */ |
@@ -583,10 +594,9 @@ return_normal: | |||
583 | /* | 594 | /* |
584 | * Wait for the other CPUs to be notified and be waiting for us: | 595 | * Wait for the other CPUs to be notified and be waiting for us: |
585 | */ | 596 | */ |
586 | for_each_online_cpu(i) { | 597 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + |
587 | while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) | 598 | atomic_read(&slaves_in_kgdb)) != online_cpus) |
588 | cpu_relax(); | 599 | cpu_relax(); |
589 | } | ||
590 | 600 | ||
591 | /* | 601 | /* |
592 | * At this point the primary processor is completely | 602 | * At this point the primary processor is completely |
@@ -615,7 +625,8 @@ cpu_master_loop: | |||
615 | if (error == DBG_PASS_EVENT) { | 625 | if (error == DBG_PASS_EVENT) { |
616 | dbg_kdb_mode = !dbg_kdb_mode; | 626 | dbg_kdb_mode = !dbg_kdb_mode; |
617 | } else if (error == DBG_SWITCH_CPU_EVENT) { | 627 | } else if (error == DBG_SWITCH_CPU_EVENT) { |
618 | dbg_cpu_switch(cpu, dbg_switch_cpu); | 628 | kgdb_info[dbg_switch_cpu].exception_state |= |
629 | DCPU_NEXT_MASTER; | ||
619 | goto cpu_loop; | 630 | goto cpu_loop; |
620 | } else { | 631 | } else { |
621 | kgdb_info[cpu].ret_state = error; | 632 | kgdb_info[cpu].ret_state = error; |
@@ -627,24 +638,11 @@ cpu_master_loop: | |||
627 | if (dbg_io_ops->post_exception) | 638 | if (dbg_io_ops->post_exception) |
628 | dbg_io_ops->post_exception(); | 639 | dbg_io_ops->post_exception(); |
629 | 640 | ||
630 | atomic_dec(&cpu_in_kgdb[ks->cpu]); | ||
631 | |||
632 | if (!kgdb_single_step) { | 641 | if (!kgdb_single_step) { |
633 | for (i = NR_CPUS-1; i >= 0; i--) | 642 | raw_spin_unlock(&dbg_slave_lock); |
634 | atomic_dec(&passive_cpu_wait[i]); | 643 | /* Wait till all the CPUs have quit from the debugger. */ |
635 | /* | 644 | while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) |
636 | * Wait till all the CPUs have quit from the debugger, | 645 | cpu_relax(); |
637 | * but allow a CPU that hit an exception and is | ||
638 | * waiting to become the master to remain in the debug | ||
639 | * core. | ||
640 | */ | ||
641 | for_each_online_cpu(i) { | ||
642 | while (kgdb_do_roundup && | ||
643 | atomic_read(&cpu_in_kgdb[i]) && | ||
644 | !(kgdb_info[i].exception_state & | ||
645 | DCPU_WANT_MASTER)) | ||
646 | cpu_relax(); | ||
647 | } | ||
648 | } | 646 | } |
649 | 647 | ||
650 | kgdb_restore: | 648 | kgdb_restore: |
@@ -655,12 +653,20 @@ kgdb_restore: | |||
655 | else | 653 | else |
656 | kgdb_sstep_pid = 0; | 654 | kgdb_sstep_pid = 0; |
657 | } | 655 | } |
656 | if (arch_kgdb_ops.correct_hw_break) | ||
657 | arch_kgdb_ops.correct_hw_break(); | ||
658 | if (trace_on) | 658 | if (trace_on) |
659 | tracing_on(); | 659 | tracing_on(); |
660 | |||
661 | kgdb_info[cpu].exception_state &= | ||
662 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | ||
663 | kgdb_info[cpu].enter_kgdb--; | ||
664 | smp_mb__before_atomic_dec(); | ||
665 | atomic_dec(&masters_in_kgdb); | ||
660 | /* Free kgdb_active */ | 666 | /* Free kgdb_active */ |
661 | atomic_set(&kgdb_active, -1); | 667 | atomic_set(&kgdb_active, -1); |
662 | touch_softlockup_watchdog_sync(); | 668 | raw_spin_unlock(&dbg_master_lock); |
663 | clocksource_touch_watchdog(); | 669 | dbg_touch_watchdogs(); |
664 | local_irq_restore(flags); | 670 | local_irq_restore(flags); |
665 | 671 | ||
666 | return kgdb_info[cpu].ret_state; | 672 | return kgdb_info[cpu].ret_state; |
@@ -678,7 +684,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
678 | { | 684 | { |
679 | struct kgdb_state kgdb_var; | 685 | struct kgdb_state kgdb_var; |
680 | struct kgdb_state *ks = &kgdb_var; | 686 | struct kgdb_state *ks = &kgdb_var; |
681 | int ret; | ||
682 | 687 | ||
683 | ks->cpu = raw_smp_processor_id(); | 688 | ks->cpu = raw_smp_processor_id(); |
684 | ks->ex_vector = evector; | 689 | ks->ex_vector = evector; |
@@ -689,11 +694,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
689 | 694 | ||
690 | if (kgdb_reenter_check(ks)) | 695 | if (kgdb_reenter_check(ks)) |
691 | return 0; /* Ouch, double exception ! */ | 696 | return 0; /* Ouch, double exception ! */ |
692 | kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; | 697 | if (kgdb_info[ks->cpu].enter_kgdb != 0) |
693 | ret = kgdb_cpu_enter(ks, regs); | 698 | return 0; |
694 | kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | | 699 | |
695 | DCPU_IS_SLAVE); | 700 | return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
696 | return ret; | ||
697 | } | 701 | } |
698 | 702 | ||
699 | int kgdb_nmicallback(int cpu, void *regs) | 703 | int kgdb_nmicallback(int cpu, void *regs) |
@@ -706,12 +710,9 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
706 | ks->cpu = cpu; | 710 | ks->cpu = cpu; |
707 | ks->linux_regs = regs; | 711 | ks->linux_regs = regs; |
708 | 712 | ||
709 | if (!atomic_read(&cpu_in_kgdb[cpu]) && | 713 | if (kgdb_info[ks->cpu].enter_kgdb == 0 && |
710 | atomic_read(&kgdb_active) != -1 && | 714 | raw_spin_is_locked(&dbg_master_lock)) { |
711 | atomic_read(&kgdb_active) != cpu) { | 715 | kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); |
712 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
713 | kgdb_cpu_enter(ks, regs); | ||
714 | kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; | ||
715 | return 0; | 716 | return 0; |
716 | } | 717 | } |
717 | #endif | 718 | #endif |
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index c5d753d80f67..3494c28a7e7a 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
@@ -40,6 +40,7 @@ struct debuggerinfo_struct { | |||
40 | int exception_state; | 40 | int exception_state; |
41 | int ret_state; | 41 | int ret_state; |
42 | int irq_depth; | 42 | int irq_depth; |
43 | int enter_kgdb; | ||
43 | }; | 44 | }; |
44 | 45 | ||
45 | extern struct debuggerinfo_struct kgdb_info[]; | 46 | extern struct debuggerinfo_struct kgdb_info[]; |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index bf6e8270e957..dd0b1b7dd02c 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
86 | } | 86 | } |
87 | /* Set initial kdb state variables */ | 87 | /* Set initial kdb state variables */ |
88 | KDB_STATE_CLEAR(KGDB_TRANS); | 88 | KDB_STATE_CLEAR(KGDB_TRANS); |
89 | kdb_initial_cpu = ks->cpu; | 89 | kdb_initial_cpu = atomic_read(&kgdb_active); |
90 | kdb_current_task = kgdb_info[ks->cpu].task; | 90 | kdb_current_task = kgdb_info[ks->cpu].task; |
91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; | 91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; |
92 | /* Remove any breakpoints as needed by kdb and clear single step */ | 92 | /* Remove any breakpoints as needed by kdb and clear single step */ |
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks) | |||
105 | ks->pass_exception = 1; | 105 | ks->pass_exception = 1; |
106 | KDB_FLAG_SET(CATASTROPHIC); | 106 | KDB_FLAG_SET(CATASTROPHIC); |
107 | } | 107 | } |
108 | kdb_initial_cpu = ks->cpu; | ||
109 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 108 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
110 | KDB_STATE_CLEAR(SSBPT); | 109 | KDB_STATE_CLEAR(SSBPT); |
111 | KDB_STATE_CLEAR(DOING_SS); | 110 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index c9b7f4f90bba..96fdaac46a80 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...) | |||
823 | 823 | ||
824 | return r; | 824 | return r; |
825 | } | 825 | } |
826 | 826 | EXPORT_SYMBOL_GPL(kdb_printf); | |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index caf057a3de0e..d7bda21a106b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv) | |||
1749 | int nextarg; | 1749 | int nextarg; |
1750 | long offset; | 1750 | long offset; |
1751 | 1751 | ||
1752 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1753 | kdb_printf("go must execute on the entry cpu, " | ||
1754 | "please use \"cpu %d\" and then execute go\n", | ||
1755 | kdb_initial_cpu); | ||
1756 | return KDB_BADCPUNUM; | ||
1757 | } | ||
1752 | if (argc == 1) { | 1758 | if (argc == 1) { |
1753 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1754 | kdb_printf("go <address> must be issued from the " | ||
1755 | "initial cpu, do cpu %d first\n", | ||
1756 | kdb_initial_cpu); | ||
1757 | return KDB_ARGCOUNT; | ||
1758 | } | ||
1759 | nextarg = 1; | 1759 | nextarg = 1; |
1760 | diag = kdbgetaddrarg(argc, argv, &nextarg, | 1760 | diag = kdbgetaddrarg(argc, argv, &nextarg, |
1761 | &addr, &offset, NULL); | 1761 | &addr, &offset, NULL); |
@@ -2783,6 +2783,8 @@ int kdb_register_repeat(char *cmd, | |||
2783 | 2783 | ||
2784 | return 0; | 2784 | return 0; |
2785 | } | 2785 | } |
2786 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | ||
2787 | |||
2786 | 2788 | ||
2787 | /* | 2789 | /* |
2788 | * kdb_register - Compatibility register function for commands that do | 2790 | * kdb_register - Compatibility register function for commands that do |
@@ -2805,6 +2807,7 @@ int kdb_register(char *cmd, | |||
2805 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2807 | return kdb_register_repeat(cmd, func, usage, help, minlen, |
2806 | KDB_REPEAT_NONE); | 2808 | KDB_REPEAT_NONE); |
2807 | } | 2809 | } |
2810 | EXPORT_SYMBOL_GPL(kdb_register); | ||
2808 | 2811 | ||
2809 | /* | 2812 | /* |
2810 | * kdb_unregister - This function is used to unregister a kernel | 2813 | * kdb_unregister - This function is used to unregister a kernel |
@@ -2823,7 +2826,7 @@ int kdb_unregister(char *cmd) | |||
2823 | /* | 2826 | /* |
2824 | * find the command. | 2827 | * find the command. |
2825 | */ | 2828 | */ |
2826 | for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { | 2829 | for_each_kdbcmd(kp, i) { |
2827 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { | 2830 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { |
2828 | kp->cmd_name = NULL; | 2831 | kp->cmd_name = NULL; |
2829 | return 0; | 2832 | return 0; |
@@ -2833,6 +2836,7 @@ int kdb_unregister(char *cmd) | |||
2833 | /* Couldn't find it. */ | 2836 | /* Couldn't find it. */ |
2834 | return 1; | 2837 | return 1; |
2835 | } | 2838 | } |
2839 | EXPORT_SYMBOL_GPL(kdb_unregister); | ||
2836 | 2840 | ||
2837 | /* Initialize the kdb command table. */ | 2841 | /* Initialize the kdb command table. */ |
2838 | static void __init kdb_inittab(void) | 2842 | static void __init kdb_inittab(void) |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index be775f7e81e0..35d69ed1dfb5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -15,29 +15,6 @@ | |||
15 | #include <linux/kgdb.h> | 15 | #include <linux/kgdb.h> |
16 | #include "../debug_core.h" | 16 | #include "../debug_core.h" |
17 | 17 | ||
18 | /* Kernel Debugger Error codes. Must not overlap with command codes. */ | ||
19 | #define KDB_NOTFOUND (-1) | ||
20 | #define KDB_ARGCOUNT (-2) | ||
21 | #define KDB_BADWIDTH (-3) | ||
22 | #define KDB_BADRADIX (-4) | ||
23 | #define KDB_NOTENV (-5) | ||
24 | #define KDB_NOENVVALUE (-6) | ||
25 | #define KDB_NOTIMP (-7) | ||
26 | #define KDB_ENVFULL (-8) | ||
27 | #define KDB_ENVBUFFULL (-9) | ||
28 | #define KDB_TOOMANYBPT (-10) | ||
29 | #define KDB_TOOMANYDBREGS (-11) | ||
30 | #define KDB_DUPBPT (-12) | ||
31 | #define KDB_BPTNOTFOUND (-13) | ||
32 | #define KDB_BADMODE (-14) | ||
33 | #define KDB_BADINT (-15) | ||
34 | #define KDB_INVADDRFMT (-16) | ||
35 | #define KDB_BADREG (-17) | ||
36 | #define KDB_BADCPUNUM (-18) | ||
37 | #define KDB_BADLENGTH (-19) | ||
38 | #define KDB_NOBP (-20) | ||
39 | #define KDB_BADADDR (-21) | ||
40 | |||
41 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ | 18 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ |
42 | #define KDB_CMD_GO (-1001) | 19 | #define KDB_CMD_GO (-1001) |
43 | #define KDB_CMD_CPU (-1002) | 20 | #define KDB_CMD_CPU (-1002) |
@@ -93,17 +70,6 @@ | |||
93 | */ | 70 | */ |
94 | #define KDB_MAXBPT 16 | 71 | #define KDB_MAXBPT 16 |
95 | 72 | ||
96 | /* Maximum number of arguments to a function */ | ||
97 | #define KDB_MAXARGS 16 | ||
98 | |||
99 | typedef enum { | ||
100 | KDB_REPEAT_NONE = 0, /* Do not repeat this command */ | ||
101 | KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ | ||
102 | KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ | ||
103 | } kdb_repeat_t; | ||
104 | |||
105 | typedef int (*kdb_func_t)(int, const char **); | ||
106 | |||
107 | /* Symbol table format returned by kallsyms. */ | 73 | /* Symbol table format returned by kallsyms. */ |
108 | typedef struct __ksymtab { | 74 | typedef struct __ksymtab { |
109 | unsigned long value; /* Address of symbol */ | 75 | unsigned long value; /* Address of symbol */ |
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag); | |||
123 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); | 89 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); |
124 | 90 | ||
125 | /* Exported Symbols for kernel loadable modules to use. */ | 91 | /* Exported Symbols for kernel loadable modules to use. */ |
126 | extern int kdb_register(char *, kdb_func_t, char *, char *, short); | ||
127 | extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, | ||
128 | short, kdb_repeat_t); | ||
129 | extern int kdb_unregister(char *); | ||
130 | |||
131 | extern int kdb_getarea_size(void *, unsigned long, size_t); | 92 | extern int kdb_getarea_size(void *, unsigned long, size_t); |
132 | extern int kdb_putarea_size(unsigned long, void *, size_t); | 93 | extern int kdb_putarea_size(unsigned long, void *, size_t); |
133 | 94 | ||
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); | |||
144 | extern int kdb_putword(unsigned long, unsigned long, size_t); | 105 | extern int kdb_putword(unsigned long, unsigned long, size_t); |
145 | 106 | ||
146 | extern int kdbgetularg(const char *, unsigned long *); | 107 | extern int kdbgetularg(const char *, unsigned long *); |
108 | extern int kdbgetu64arg(const char *, u64 *); | ||
147 | extern char *kdbgetenv(const char *); | 109 | extern char *kdbgetenv(const char *); |
148 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, | 110 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, |
149 | long *, char **); | 111 | long *, char **); |
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p); | |||
255 | extern void kdb_print_nameval(const char *name, unsigned long val); | 217 | extern void kdb_print_nameval(const char *name, unsigned long val); |
256 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 218 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
257 | extern void kdb_meminfo_proc_show(void); | 219 | extern void kdb_meminfo_proc_show(void); |
258 | #ifdef CONFIG_KALLSYMS | ||
259 | extern const char *kdb_walk_kallsyms(loff_t *pos); | ||
260 | #else /* ! CONFIG_KALLSYMS */ | ||
261 | static inline const char *kdb_walk_kallsyms(loff_t *pos) | ||
262 | { | ||
263 | return NULL; | ||
264 | } | ||
265 | #endif /* ! CONFIG_KALLSYMS */ | ||
266 | extern char *kdb_getstr(char *, size_t, char *); | 220 | extern char *kdb_getstr(char *, size_t, char *); |
267 | 221 | ||
268 | /* Defines for kdb_symbol_print */ | 222 | /* Defines for kdb_symbol_print */ |
diff --git a/kernel/early_res.c b/kernel/early_res.c deleted file mode 100644 index 7bfae887f211..000000000000 --- a/kernel/early_res.c +++ /dev/null | |||
@@ -1,590 +0,0 @@ | |||
1 | /* | ||
2 | * early_res, could be used to replace bootmem | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/bootmem.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/early_res.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/kmemleak.h> | ||
12 | |||
13 | /* | ||
14 | * Early reserved memory areas. | ||
15 | */ | ||
16 | /* | ||
17 | * need to make sure this one is bigger enough before | ||
18 | * find_fw_memmap_area could be used | ||
19 | */ | ||
20 | #define MAX_EARLY_RES_X 32 | ||
21 | |||
22 | struct early_res { | ||
23 | u64 start, end; | ||
24 | char name[15]; | ||
25 | char overlap_ok; | ||
26 | }; | ||
27 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
28 | |||
29 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
30 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
31 | static int early_res_count __initdata; | ||
32 | |||
33 | static int __init find_overlapped_early(u64 start, u64 end) | ||
34 | { | ||
35 | int i; | ||
36 | struct early_res *r; | ||
37 | |||
38 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
39 | r = &early_res[i]; | ||
40 | if (end > r->start && start < r->end) | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | return i; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Drop the i-th range from the early reservation map, | ||
49 | * by copying any higher ranges down one over it, and | ||
50 | * clearing what had been the last slot. | ||
51 | */ | ||
52 | static void __init drop_range(int i) | ||
53 | { | ||
54 | int j; | ||
55 | |||
56 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
57 | ; | ||
58 | |||
59 | memmove(&early_res[i], &early_res[i + 1], | ||
60 | (j - 1 - i) * sizeof(struct early_res)); | ||
61 | |||
62 | early_res[j - 1].end = 0; | ||
63 | early_res_count--; | ||
64 | } | ||
65 | |||
66 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
67 | { | ||
68 | u64 common_start, common_end; | ||
69 | u64 old_start, old_end; | ||
70 | |||
71 | old_start = early_res[i].start; | ||
72 | old_end = early_res[i].end; | ||
73 | common_start = max(old_start, start); | ||
74 | common_end = min(old_end, end); | ||
75 | |||
76 | /* no overlap ? */ | ||
77 | if (common_start >= common_end) | ||
78 | return; | ||
79 | |||
80 | if (old_start < common_start) { | ||
81 | /* make head segment */ | ||
82 | early_res[i].end = common_start; | ||
83 | if (old_end > common_end) { | ||
84 | char name[15]; | ||
85 | |||
86 | /* | ||
87 | * Save a local copy of the name, since the | ||
88 | * early_res array could get resized inside | ||
89 | * reserve_early_without_check() -> | ||
90 | * __check_and_double_early_res(), which would | ||
91 | * make the current name pointer invalid. | ||
92 | */ | ||
93 | strncpy(name, early_res[i].name, | ||
94 | sizeof(early_res[i].name) - 1); | ||
95 | /* add another for left over on tail */ | ||
96 | reserve_early_without_check(common_end, old_end, name); | ||
97 | } | ||
98 | return; | ||
99 | } else { | ||
100 | if (old_end > common_end) { | ||
101 | /* reuse the entry for tail left */ | ||
102 | early_res[i].start = common_end; | ||
103 | return; | ||
104 | } | ||
105 | /* all covered */ | ||
106 | drop_range(i); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Split any existing ranges that: | ||
112 | * 1) are marked 'overlap_ok', and | ||
113 | * 2) overlap with the stated range [start, end) | ||
114 | * into whatever portion (if any) of the existing range is entirely | ||
115 | * below or entirely above the stated range. Drop the portion | ||
116 | * of the existing range that overlaps with the stated range, | ||
117 | * which will allow the caller of this routine to then add that | ||
118 | * stated range without conflicting with any existing range. | ||
119 | */ | ||
120 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
121 | { | ||
122 | int i; | ||
123 | struct early_res *r; | ||
124 | u64 lower_start, lower_end; | ||
125 | u64 upper_start, upper_end; | ||
126 | char name[15]; | ||
127 | |||
128 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
129 | r = &early_res[i]; | ||
130 | |||
131 | /* Continue past non-overlapping ranges */ | ||
132 | if (end <= r->start || start >= r->end) | ||
133 | continue; | ||
134 | |||
135 | /* | ||
136 | * Leave non-ok overlaps as is; let caller | ||
137 | * panic "Overlapping early reservations" | ||
138 | * when it hits this overlap. | ||
139 | */ | ||
140 | if (!r->overlap_ok) | ||
141 | return; | ||
142 | |||
143 | /* | ||
144 | * We have an ok overlap. We will drop it from the early | ||
145 | * reservation map, and add back in any non-overlapping | ||
146 | * portions (lower or upper) as separate, overlap_ok, | ||
147 | * non-overlapping ranges. | ||
148 | */ | ||
149 | |||
150 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
151 | strncpy(name, r->name, sizeof(name) - 1); | ||
152 | |||
153 | lower_start = lower_end = 0; | ||
154 | upper_start = upper_end = 0; | ||
155 | if (r->start < start) { | ||
156 | lower_start = r->start; | ||
157 | lower_end = start; | ||
158 | } | ||
159 | if (r->end > end) { | ||
160 | upper_start = end; | ||
161 | upper_end = r->end; | ||
162 | } | ||
163 | |||
164 | /* 2. Drop the original ok overlapping range */ | ||
165 | drop_range(i); | ||
166 | |||
167 | i--; /* resume for-loop on copied down entry */ | ||
168 | |||
169 | /* 3. Add back in any non-overlapping ranges. */ | ||
170 | if (lower_end) | ||
171 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
172 | if (upper_end) | ||
173 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
178 | int overlap_ok) | ||
179 | { | ||
180 | int i; | ||
181 | struct early_res *r; | ||
182 | |||
183 | i = find_overlapped_early(start, end); | ||
184 | if (i >= max_early_res) | ||
185 | panic("Too many early reservations"); | ||
186 | r = &early_res[i]; | ||
187 | if (r->end) | ||
188 | panic("Overlapping early reservations " | ||
189 | "%llx-%llx %s to %llx-%llx %s\n", | ||
190 | start, end - 1, name ? name : "", r->start, | ||
191 | r->end - 1, r->name); | ||
192 | r->start = start; | ||
193 | r->end = end; | ||
194 | r->overlap_ok = overlap_ok; | ||
195 | if (name) | ||
196 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
197 | early_res_count++; | ||
198 | } | ||
199 | |||
200 | /* | ||
201 | * A few early reservtations come here. | ||
202 | * | ||
203 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
204 | * is ok for these reservations to overlap an earlier reservation. | ||
205 | * Rather it means that it is ok for subsequent reservations to | ||
206 | * overlap this one. | ||
207 | * | ||
208 | * Use this entry point to reserve early ranges when you are doing | ||
209 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
210 | * just in case, and don't mind a subsequent overlapping reservation | ||
211 | * that is known to be needed. | ||
212 | * | ||
213 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
214 | * It would be needed if we had two colliding 'overlap_ok' | ||
215 | * reservations, so that the second such would not panic on the | ||
216 | * overlap with the first. We don't have any such as of this | ||
217 | * writing, but might as well tolerate such if it happens in | ||
218 | * the future. | ||
219 | */ | ||
220 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
221 | { | ||
222 | drop_overlaps_that_are_ok(start, end); | ||
223 | __reserve_early(start, end, name, 1); | ||
224 | } | ||
225 | |||
226 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
227 | { | ||
228 | u64 start, end, size, mem; | ||
229 | struct early_res *new; | ||
230 | |||
231 | /* do we have enough slots left ? */ | ||
232 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
233 | return; | ||
234 | |||
235 | /* double it */ | ||
236 | mem = -1ULL; | ||
237 | size = sizeof(struct early_res) * max_early_res * 2; | ||
238 | if (early_res == early_res_x) | ||
239 | start = 0; | ||
240 | else | ||
241 | start = early_res[0].end; | ||
242 | end = ex_start; | ||
243 | if (start + size < end) | ||
244 | mem = find_fw_memmap_area(start, end, size, | ||
245 | sizeof(struct early_res)); | ||
246 | if (mem == -1ULL) { | ||
247 | start = ex_end; | ||
248 | end = get_max_mapped(); | ||
249 | if (start + size < end) | ||
250 | mem = find_fw_memmap_area(start, end, size, | ||
251 | sizeof(struct early_res)); | ||
252 | } | ||
253 | if (mem == -1ULL) | ||
254 | panic("can not find more space for early_res array"); | ||
255 | |||
256 | new = __va(mem); | ||
257 | /* save the first one for own */ | ||
258 | new[0].start = mem; | ||
259 | new[0].end = mem + size; | ||
260 | new[0].overlap_ok = 0; | ||
261 | /* copy old to new */ | ||
262 | if (early_res == early_res_x) { | ||
263 | memcpy(&new[1], &early_res[0], | ||
264 | sizeof(struct early_res) * max_early_res); | ||
265 | memset(&new[max_early_res+1], 0, | ||
266 | sizeof(struct early_res) * (max_early_res - 1)); | ||
267 | early_res_count++; | ||
268 | } else { | ||
269 | memcpy(&new[1], &early_res[1], | ||
270 | sizeof(struct early_res) * (max_early_res - 1)); | ||
271 | memset(&new[max_early_res], 0, | ||
272 | sizeof(struct early_res) * max_early_res); | ||
273 | } | ||
274 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
275 | early_res = new; | ||
276 | max_early_res *= 2; | ||
277 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
278 | max_early_res, mem, mem + size - 1); | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Most early reservations come here. | ||
283 | * | ||
284 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
285 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
286 | * range without risk of panic'ing on an overlapping overlap_ok | ||
287 | * early reservation. | ||
288 | */ | ||
289 | void __init reserve_early(u64 start, u64 end, char *name) | ||
290 | { | ||
291 | if (start >= end) | ||
292 | return; | ||
293 | |||
294 | __check_and_double_early_res(start, end); | ||
295 | |||
296 | drop_overlaps_that_are_ok(start, end); | ||
297 | __reserve_early(start, end, name, 0); | ||
298 | } | ||
299 | |||
300 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
301 | { | ||
302 | struct early_res *r; | ||
303 | |||
304 | if (start >= end) | ||
305 | return; | ||
306 | |||
307 | __check_and_double_early_res(start, end); | ||
308 | |||
309 | r = &early_res[early_res_count]; | ||
310 | |||
311 | r->start = start; | ||
312 | r->end = end; | ||
313 | r->overlap_ok = 0; | ||
314 | if (name) | ||
315 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
316 | early_res_count++; | ||
317 | } | ||
318 | |||
319 | void __init free_early(u64 start, u64 end) | ||
320 | { | ||
321 | struct early_res *r; | ||
322 | int i; | ||
323 | |||
324 | kmemleak_free_part(__va(start), end - start); | ||
325 | |||
326 | i = find_overlapped_early(start, end); | ||
327 | r = &early_res[i]; | ||
328 | if (i >= max_early_res || r->end != end || r->start != start) | ||
329 | panic("free_early on not reserved area: %llx-%llx!", | ||
330 | start, end - 1); | ||
331 | |||
332 | drop_range(i); | ||
333 | } | ||
334 | |||
335 | void __init free_early_partial(u64 start, u64 end) | ||
336 | { | ||
337 | struct early_res *r; | ||
338 | int i; | ||
339 | |||
340 | kmemleak_free_part(__va(start), end - start); | ||
341 | |||
342 | if (start == end) | ||
343 | return; | ||
344 | |||
345 | if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) | ||
346 | return; | ||
347 | |||
348 | try_next: | ||
349 | i = find_overlapped_early(start, end); | ||
350 | if (i >= max_early_res) | ||
351 | return; | ||
352 | |||
353 | r = &early_res[i]; | ||
354 | /* hole ? */ | ||
355 | if (r->end >= end && r->start <= start) { | ||
356 | drop_range_partial(i, start, end); | ||
357 | return; | ||
358 | } | ||
359 | |||
360 | drop_range_partial(i, start, end); | ||
361 | goto try_next; | ||
362 | } | ||
363 | |||
364 | #ifdef CONFIG_NO_BOOTMEM | ||
365 | static void __init subtract_early_res(struct range *range, int az) | ||
366 | { | ||
367 | int i, count; | ||
368 | u64 final_start, final_end; | ||
369 | int idx = 0; | ||
370 | |||
371 | count = 0; | ||
372 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
373 | count++; | ||
374 | |||
375 | /* need to skip first one ?*/ | ||
376 | if (early_res != early_res_x) | ||
377 | idx = 1; | ||
378 | |||
379 | #define DEBUG_PRINT_EARLY_RES 1 | ||
380 | |||
381 | #if DEBUG_PRINT_EARLY_RES | ||
382 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
383 | #endif | ||
384 | for (i = idx; i < count; i++) { | ||
385 | struct early_res *r = &early_res[i]; | ||
386 | #if DEBUG_PRINT_EARLY_RES | ||
387 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
388 | r->start, r->end, r->name); | ||
389 | #endif | ||
390 | final_start = PFN_DOWN(r->start); | ||
391 | final_end = PFN_UP(r->end); | ||
392 | if (final_start >= final_end) | ||
393 | continue; | ||
394 | subtract_range(range, az, final_start, final_end); | ||
395 | } | ||
396 | |||
397 | } | ||
398 | |||
399 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
400 | { | ||
401 | int i, count; | ||
402 | u64 start = 0, end; | ||
403 | u64 size; | ||
404 | u64 mem; | ||
405 | struct range *range; | ||
406 | int nr_range; | ||
407 | |||
408 | count = 0; | ||
409 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
410 | count++; | ||
411 | |||
412 | count *= 2; | ||
413 | |||
414 | size = sizeof(struct range) * count; | ||
415 | end = get_max_mapped(); | ||
416 | #ifdef MAX_DMA32_PFN | ||
417 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
418 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
419 | #endif | ||
420 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
421 | if (mem == -1ULL) | ||
422 | panic("can not find more space for range free"); | ||
423 | |||
424 | range = __va(mem); | ||
425 | /* use early_node_map[] and early_res to get range array at first */ | ||
426 | memset(range, 0, size); | ||
427 | nr_range = 0; | ||
428 | |||
429 | /* need to go over early_node_map to find out good range for node */ | ||
430 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
431 | #ifdef CONFIG_X86_32 | ||
432 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
433 | #endif | ||
434 | subtract_early_res(range, count); | ||
435 | nr_range = clean_sort_range(range, count); | ||
436 | |||
437 | /* need to clear it ? */ | ||
438 | if (nodeid == MAX_NUMNODES) { | ||
439 | memset(&early_res[0], 0, | ||
440 | sizeof(struct early_res) * max_early_res); | ||
441 | early_res = NULL; | ||
442 | max_early_res = 0; | ||
443 | } | ||
444 | |||
445 | *rangep = range; | ||
446 | return nr_range; | ||
447 | } | ||
448 | #else | ||
449 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
450 | { | ||
451 | int i, count; | ||
452 | u64 final_start, final_end; | ||
453 | int idx = 0; | ||
454 | |||
455 | count = 0; | ||
456 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
457 | count++; | ||
458 | |||
459 | /* need to skip first one ?*/ | ||
460 | if (early_res != early_res_x) | ||
461 | idx = 1; | ||
462 | |||
463 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
464 | count - idx, max_early_res, start, end); | ||
465 | for (i = idx; i < count; i++) { | ||
466 | struct early_res *r = &early_res[i]; | ||
467 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
468 | r->start, r->end, r->name); | ||
469 | final_start = max(start, r->start); | ||
470 | final_end = min(end, r->end); | ||
471 | if (final_start >= final_end) { | ||
472 | printk(KERN_CONT "\n"); | ||
473 | continue; | ||
474 | } | ||
475 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
476 | final_start, final_end); | ||
477 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
478 | BOOTMEM_DEFAULT); | ||
479 | } | ||
480 | /* clear them */ | ||
481 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
482 | early_res = NULL; | ||
483 | max_early_res = 0; | ||
484 | early_res_count = 0; | ||
485 | } | ||
486 | #endif | ||
487 | |||
488 | /* Check for already reserved areas */ | ||
489 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
490 | { | ||
491 | int i; | ||
492 | u64 addr = *addrp; | ||
493 | int changed = 0; | ||
494 | struct early_res *r; | ||
495 | again: | ||
496 | i = find_overlapped_early(addr, addr + size); | ||
497 | r = &early_res[i]; | ||
498 | if (i < max_early_res && r->end) { | ||
499 | *addrp = addr = round_up(r->end, align); | ||
500 | changed = 1; | ||
501 | goto again; | ||
502 | } | ||
503 | return changed; | ||
504 | } | ||
505 | |||
506 | /* Check for already reserved areas */ | ||
507 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
508 | { | ||
509 | int i; | ||
510 | u64 addr = *addrp, last; | ||
511 | u64 size = *sizep; | ||
512 | int changed = 0; | ||
513 | again: | ||
514 | last = addr + size; | ||
515 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
516 | struct early_res *r = &early_res[i]; | ||
517 | if (last > r->start && addr < r->start) { | ||
518 | size = r->start - addr; | ||
519 | changed = 1; | ||
520 | goto again; | ||
521 | } | ||
522 | if (last > r->end && addr < r->end) { | ||
523 | addr = round_up(r->end, align); | ||
524 | size = last - addr; | ||
525 | changed = 1; | ||
526 | goto again; | ||
527 | } | ||
528 | if (last <= r->end && addr >= r->start) { | ||
529 | (*sizep)++; | ||
530 | return 0; | ||
531 | } | ||
532 | } | ||
533 | if (changed) { | ||
534 | *addrp = addr; | ||
535 | *sizep = size; | ||
536 | } | ||
537 | return changed; | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Find a free area with specified alignment in a specific range. | ||
542 | * only with the area.between start to end is active range from early_node_map | ||
543 | * so they are good as RAM | ||
544 | */ | ||
545 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
546 | u64 size, u64 align) | ||
547 | { | ||
548 | u64 addr, last; | ||
549 | |||
550 | addr = round_up(ei_start, align); | ||
551 | if (addr < start) | ||
552 | addr = round_up(start, align); | ||
553 | if (addr >= ei_last) | ||
554 | goto out; | ||
555 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
556 | ; | ||
557 | last = addr + size; | ||
558 | if (last > ei_last) | ||
559 | goto out; | ||
560 | if (last > end) | ||
561 | goto out; | ||
562 | |||
563 | return addr; | ||
564 | |||
565 | out: | ||
566 | return -1ULL; | ||
567 | } | ||
568 | |||
569 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
570 | u64 *sizep, u64 align) | ||
571 | { | ||
572 | u64 addr, last; | ||
573 | |||
574 | addr = round_up(ei_start, align); | ||
575 | if (addr < start) | ||
576 | addr = round_up(start, align); | ||
577 | if (addr >= ei_last) | ||
578 | goto out; | ||
579 | *sizep = ei_last - addr; | ||
580 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
581 | ; | ||
582 | last = addr + *sizep; | ||
583 | if (last > ei_last) | ||
584 | goto out; | ||
585 | |||
586 | return addr; | ||
587 | |||
588 | out: | ||
589 | return -1ULL; | ||
590 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..b194febf5799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/perf_event.h> | 50 | #include <linux/perf_event.h> |
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | ||
53 | 54 | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
@@ -149,9 +150,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
149 | { | 150 | { |
150 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 151 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
151 | 152 | ||
152 | #ifdef CONFIG_PERF_EVENTS | 153 | perf_event_delayed_put(tsk); |
153 | WARN_ON_ONCE(tsk->perf_event_ctxp); | ||
154 | #endif | ||
155 | trace_sched_process_free(tsk); | 154 | trace_sched_process_free(tsk); |
156 | put_task_struct(tsk); | 155 | put_task_struct(tsk); |
157 | } | 156 | } |
@@ -689,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) | |||
689 | enter_lazy_tlb(mm, current); | 688 | enter_lazy_tlb(mm, current); |
690 | /* We don't want this task to be frozen prematurely */ | 689 | /* We don't want this task to be frozen prematurely */ |
691 | clear_freeze_flag(tsk); | 690 | clear_freeze_flag(tsk); |
691 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
692 | atomic_dec(&mm->oom_disable_count); | ||
692 | task_unlock(tsk); | 693 | task_unlock(tsk); |
693 | mm_update_next_owner(mm); | 694 | mm_update_next_owner(mm); |
694 | mmput(mm); | 695 | mmput(mm); |
@@ -702,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) | |||
702 | * space. | 703 | * space. |
703 | */ | 704 | */ |
704 | static struct task_struct *find_new_reaper(struct task_struct *father) | 705 | static struct task_struct *find_new_reaper(struct task_struct *father) |
706 | __releases(&tasklist_lock) | ||
707 | __acquires(&tasklist_lock) | ||
705 | { | 708 | { |
706 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 709 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
707 | struct task_struct *thread; | 710 | struct task_struct *thread; |
diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/perf_event.h> | 65 | #include <linux/perf_event.h> |
66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
68 | #include <linux/oom.h> | ||
68 | 69 | ||
69 | #include <asm/pgtable.h> | 70 | #include <asm/pgtable.h> |
70 | #include <asm/pgalloc.h> | 71 | #include <asm/pgalloc.h> |
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
488 | mm->cached_hole_size = ~0UL; | 489 | mm->cached_hole_size = ~0UL; |
489 | mm_init_aio(mm); | 490 | mm_init_aio(mm); |
490 | mm_init_owner(mm, p); | 491 | mm_init_owner(mm, p); |
492 | atomic_set(&mm->oom_disable_count, 0); | ||
491 | 493 | ||
492 | if (likely(!mm_alloc_pgd(mm))) { | 494 | if (likely(!mm_alloc_pgd(mm))) { |
493 | mm->def_flags = 0; | 495 | mm->def_flags = 0; |
@@ -741,6 +743,8 @@ good_mm: | |||
741 | /* Initializing for Swap token stuff */ | 743 | /* Initializing for Swap token stuff */ |
742 | mm->token_priority = 0; | 744 | mm->token_priority = 0; |
743 | mm->last_interval = 0; | 745 | mm->last_interval = 0; |
746 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
747 | atomic_inc(&mm->oom_disable_count); | ||
744 | 748 | ||
745 | tsk->mm = mm; | 749 | tsk->mm = mm; |
746 | tsk->active_mm = mm; | 750 | tsk->active_mm = mm; |
@@ -904,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
904 | sig->oom_adj = current->signal->oom_adj; | 908 | sig->oom_adj = current->signal->oom_adj; |
905 | sig->oom_score_adj = current->signal->oom_score_adj; | 909 | sig->oom_score_adj = current->signal->oom_score_adj; |
906 | 910 | ||
911 | mutex_init(&sig->cred_guard_mutex); | ||
912 | |||
907 | return 0; | 913 | return 0; |
908 | } | 914 | } |
909 | 915 | ||
@@ -1299,8 +1305,13 @@ bad_fork_cleanup_io: | |||
1299 | bad_fork_cleanup_namespaces: | 1305 | bad_fork_cleanup_namespaces: |
1300 | exit_task_namespaces(p); | 1306 | exit_task_namespaces(p); |
1301 | bad_fork_cleanup_mm: | 1307 | bad_fork_cleanup_mm: |
1302 | if (p->mm) | 1308 | if (p->mm) { |
1309 | task_lock(p); | ||
1310 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1311 | atomic_dec(&p->mm->oom_disable_count); | ||
1312 | task_unlock(p); | ||
1303 | mmput(p->mm); | 1313 | mmput(p->mm); |
1314 | } | ||
1304 | bad_fork_cleanup_signal: | 1315 | bad_fork_cleanup_signal: |
1305 | if (!(clone_flags & CLONE_THREAD)) | 1316 | if (!(clone_flags & CLONE_THREAD)) |
1306 | free_signal_struct(p->signal); | 1317 | free_signal_struct(p->signal); |
@@ -1693,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1693 | active_mm = current->active_mm; | 1704 | active_mm = current->active_mm; |
1694 | current->mm = new_mm; | 1705 | current->mm = new_mm; |
1695 | current->active_mm = new_mm; | 1706 | current->active_mm = new_mm; |
1707 | if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | ||
1708 | atomic_dec(&mm->oom_disable_count); | ||
1709 | atomic_inc(&new_mm->oom_disable_count); | ||
1710 | } | ||
1696 | activate_mm(active_mm, new_mm); | 1711 | activate_mm(active_mm, new_mm); |
1697 | new_mm = mm; | 1712 | new_mm = mm; |
1698 | } | 1713 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..6c683b37f2ce 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -91,6 +91,7 @@ struct futex_pi_state { | |||
91 | 91 | ||
92 | /** | 92 | /** |
93 | * struct futex_q - The hashed futex queue entry, one per waiting task | 93 | * struct futex_q - The hashed futex queue entry, one per waiting task |
94 | * @list: priority-sorted list of tasks waiting on this futex | ||
94 | * @task: the task waiting on the futex | 95 | * @task: the task waiting on the futex |
95 | * @lock_ptr: the hash bucket lock | 96 | * @lock_ptr: the hash bucket lock |
96 | * @key: the key the futex is hashed on | 97 | * @key: the key the futex is hashed on |
@@ -104,7 +105,7 @@ struct futex_pi_state { | |||
104 | * | 105 | * |
105 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 106 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
106 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. | 107 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. |
107 | * The order of wakup is always to make the first condition true, then | 108 | * The order of wakeup is always to make the first condition true, then |
108 | * the second. | 109 | * the second. |
109 | * | 110 | * |
110 | * PI futexes are typically woken before they are removed from the hash list via | 111 | * PI futexes are typically woken before they are removed from the hash list via |
@@ -168,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) | |||
168 | 169 | ||
169 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 170 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
170 | case FUT_OFF_INODE: | 171 | case FUT_OFF_INODE: |
171 | atomic_inc(&key->shared.inode->i_count); | 172 | ihold(key->shared.inode); |
172 | break; | 173 | break; |
173 | case FUT_OFF_MMSHARED: | 174 | case FUT_OFF_MMSHARED: |
174 | atomic_inc(&key->private.mm->mm_count); | 175 | atomic_inc(&key->private.mm->mm_count); |
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) | |||
295 | * Slow path to fixup the fault we just took in the atomic write | 296 | * Slow path to fixup the fault we just took in the atomic write |
296 | * access to @uaddr. | 297 | * access to @uaddr. |
297 | * | 298 | * |
298 | * We have no generic implementation of a non destructive write to the | 299 | * We have no generic implementation of a non-destructive write to the |
299 | * user address. We know that we faulted in the atomic pagefault | 300 | * user address. We know that we faulted in the atomic pagefault |
300 | * disabled section so we can as well avoid the #PF overhead by | 301 | * disabled section so we can as well avoid the #PF overhead by |
301 | * calling get_user_pages() right away. | 302 | * calling get_user_pages() right away. |
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
515 | */ | 516 | */ |
516 | pi_state = this->pi_state; | 517 | pi_state = this->pi_state; |
517 | /* | 518 | /* |
518 | * Userspace might have messed up non PI and PI futexes | 519 | * Userspace might have messed up non-PI and PI futexes |
519 | */ | 520 | */ |
520 | if (unlikely(!pi_state)) | 521 | if (unlikely(!pi_state)) |
521 | return -EINVAL; | 522 | return -EINVAL; |
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) | |||
736 | 737 | ||
737 | /* | 738 | /* |
738 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 739 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
739 | * a non futex wake up happens on another CPU then the task | 740 | * a non-futex wake up happens on another CPU then the task |
740 | * might exit and p would dereference a non existing task | 741 | * might exit and p would dereference a non-existing task |
741 | * struct. Prevent this by holding a reference on p across the | 742 | * struct. Prevent this by holding a reference on p across the |
742 | * wake up. | 743 | * wake up. |
743 | */ | 744 | */ |
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1131 | 1132 | ||
1132 | /** | 1133 | /** |
1133 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1134 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
1134 | * uaddr1: source futex user address | 1135 | * @uaddr1: source futex user address |
1135 | * uaddr2: target futex user address | 1136 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
1136 | * nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1137 | * @uaddr2: target futex user address |
1137 | * nr_requeue: number of waiters to requeue (0-INT_MAX) | 1138 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
1138 | * requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1139 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
1140 | * @cmpval: @uaddr1 expected value (or %NULL) | ||
1141 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1139 | * pi futex (pi to pi requeue is not supported) | 1142 | * pi futex (pi to pi requeue is not supported) |
1140 | * | 1143 | * |
1141 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1144 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
@@ -1360,10 +1363,10 @@ out: | |||
1360 | 1363 | ||
1361 | /* The key must be already stored in q->key. */ | 1364 | /* The key must be already stored in q->key. */ |
1362 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | 1365 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) |
1366 | __acquires(&hb->lock) | ||
1363 | { | 1367 | { |
1364 | struct futex_hash_bucket *hb; | 1368 | struct futex_hash_bucket *hb; |
1365 | 1369 | ||
1366 | get_futex_key_refs(&q->key); | ||
1367 | hb = hash_futex(&q->key); | 1370 | hb = hash_futex(&q->key); |
1368 | q->lock_ptr = &hb->lock; | 1371 | q->lock_ptr = &hb->lock; |
1369 | 1372 | ||
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
1373 | 1376 | ||
1374 | static inline void | 1377 | static inline void |
1375 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1378 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
1379 | __releases(&hb->lock) | ||
1376 | { | 1380 | { |
1377 | spin_unlock(&hb->lock); | 1381 | spin_unlock(&hb->lock); |
1378 | drop_futex_key_refs(&q->key); | ||
1379 | } | 1382 | } |
1380 | 1383 | ||
1381 | /** | 1384 | /** |
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1391 | * an example). | 1394 | * an example). |
1392 | */ | 1395 | */ |
1393 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | 1396 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
1397 | __releases(&hb->lock) | ||
1394 | { | 1398 | { |
1395 | int prio; | 1399 | int prio; |
1396 | 1400 | ||
@@ -1471,6 +1475,7 @@ retry: | |||
1471 | * and dropped here. | 1475 | * and dropped here. |
1472 | */ | 1476 | */ |
1473 | static void unqueue_me_pi(struct futex_q *q) | 1477 | static void unqueue_me_pi(struct futex_q *q) |
1478 | __releases(q->lock_ptr) | ||
1474 | { | 1479 | { |
1475 | WARN_ON(plist_node_empty(&q->list)); | 1480 | WARN_ON(plist_node_empty(&q->list)); |
1476 | plist_del(&q->list, &q->list.plist); | 1481 | plist_del(&q->list, &q->list.plist); |
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1480 | q->pi_state = NULL; | 1485 | q->pi_state = NULL; |
1481 | 1486 | ||
1482 | spin_unlock(q->lock_ptr); | 1487 | spin_unlock(q->lock_ptr); |
1483 | |||
1484 | drop_futex_key_refs(&q->key); | ||
1485 | } | 1488 | } |
1486 | 1489 | ||
1487 | /* | 1490 | /* |
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1812 | } | 1815 | } |
1813 | 1816 | ||
1814 | retry: | 1817 | retry: |
1815 | /* Prepare to wait on uaddr. */ | 1818 | /* |
1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments | ||
1820 | * q.key refs. | ||
1821 | */ | ||
1816 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1822 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
1817 | if (ret) | 1823 | if (ret) |
1818 | goto out; | 1824 | goto out; |
@@ -1822,28 +1828,27 @@ retry: | |||
1822 | 1828 | ||
1823 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1829 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1824 | ret = 0; | 1830 | ret = 0; |
1831 | /* unqueue_me() drops q.key ref */ | ||
1825 | if (!unqueue_me(&q)) | 1832 | if (!unqueue_me(&q)) |
1826 | goto out_put_key; | 1833 | goto out; |
1827 | ret = -ETIMEDOUT; | 1834 | ret = -ETIMEDOUT; |
1828 | if (to && !to->task) | 1835 | if (to && !to->task) |
1829 | goto out_put_key; | 1836 | goto out; |
1830 | 1837 | ||
1831 | /* | 1838 | /* |
1832 | * We expect signal_pending(current), but we might be the | 1839 | * We expect signal_pending(current), but we might be the |
1833 | * victim of a spurious wakeup as well. | 1840 | * victim of a spurious wakeup as well. |
1834 | */ | 1841 | */ |
1835 | if (!signal_pending(current)) { | 1842 | if (!signal_pending(current)) |
1836 | put_futex_key(fshared, &q.key); | ||
1837 | goto retry; | 1843 | goto retry; |
1838 | } | ||
1839 | 1844 | ||
1840 | ret = -ERESTARTSYS; | 1845 | ret = -ERESTARTSYS; |
1841 | if (!abs_time) | 1846 | if (!abs_time) |
1842 | goto out_put_key; | 1847 | goto out; |
1843 | 1848 | ||
1844 | restart = ¤t_thread_info()->restart_block; | 1849 | restart = ¤t_thread_info()->restart_block; |
1845 | restart->fn = futex_wait_restart; | 1850 | restart->fn = futex_wait_restart; |
1846 | restart->futex.uaddr = (u32 *)uaddr; | 1851 | restart->futex.uaddr = uaddr; |
1847 | restart->futex.val = val; | 1852 | restart->futex.val = val; |
1848 | restart->futex.time = abs_time->tv64; | 1853 | restart->futex.time = abs_time->tv64; |
1849 | restart->futex.bitset = bitset; | 1854 | restart->futex.bitset = bitset; |
@@ -1856,8 +1861,6 @@ retry: | |||
1856 | 1861 | ||
1857 | ret = -ERESTART_RESTARTBLOCK; | 1862 | ret = -ERESTART_RESTARTBLOCK; |
1858 | 1863 | ||
1859 | out_put_key: | ||
1860 | put_futex_key(fshared, &q.key); | ||
1861 | out: | 1864 | out: |
1862 | if (to) { | 1865 | if (to) { |
1863 | hrtimer_cancel(&to->timer); | 1866 | hrtimer_cancel(&to->timer); |
@@ -1869,7 +1872,7 @@ out: | |||
1869 | 1872 | ||
1870 | static long futex_wait_restart(struct restart_block *restart) | 1873 | static long futex_wait_restart(struct restart_block *restart) |
1871 | { | 1874 | { |
1872 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1875 | u32 __user *uaddr = restart->futex.uaddr; |
1873 | int fshared = 0; | 1876 | int fshared = 0; |
1874 | ktime_t t, *tp = NULL; | 1877 | ktime_t t, *tp = NULL; |
1875 | 1878 | ||
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2236 | q.rt_waiter = &rt_waiter; | 2239 | q.rt_waiter = &rt_waiter; |
2237 | q.requeue_pi_key = &key2; | 2240 | q.requeue_pi_key = &key2; |
2238 | 2241 | ||
2239 | /* Prepare to wait on uaddr. */ | 2242 | /* |
2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | ||
2244 | * count. | ||
2245 | */ | ||
2240 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
2241 | if (ret) | 2247 | if (ret) |
2242 | goto out_key2; | 2248 | goto out_key2; |
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2254 | * In order for us to be here, we know our q.key == key2, and since | 2260 | * In order for us to be here, we know our q.key == key2, and since |
2255 | * we took the hb->lock above, we also know that futex_requeue() has | 2261 | * we took the hb->lock above, we also know that futex_requeue() has |
2256 | * completed and we no longer have to concern ourselves with a wakeup | 2262 | * completed and we no longer have to concern ourselves with a wakeup |
2257 | * race with the atomic proxy lock acquition by the requeue code. | 2263 | * race with the atomic proxy lock acquisition by the requeue code. The |
2264 | * futex_requeue dropped our key1 reference and incremented our key2 | ||
2265 | * reference count. | ||
2258 | */ | 2266 | */ |
2259 | 2267 | ||
2260 | /* Check if the requeue code acquired the second futex for us. */ | 2268 | /* Check if the requeue code acquired the second futex for us. */ |
@@ -2458,7 +2466,7 @@ retry: | |||
2458 | */ | 2466 | */ |
2459 | static inline int fetch_robust_entry(struct robust_list __user **entry, | 2467 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
2460 | struct robust_list __user * __user *head, | 2468 | struct robust_list __user * __user *head, |
2461 | int *pi) | 2469 | unsigned int *pi) |
2462 | { | 2470 | { |
2463 | unsigned long uentry; | 2471 | unsigned long uentry; |
2464 | 2472 | ||
@@ -2647,7 +2655,7 @@ static int __init futex_init(void) | |||
2647 | * of the complex code paths. Also we want to prevent | 2655 | * of the complex code paths. Also we want to prevent |
2648 | * registration of robust lists in that case. NULL is | 2656 | * registration of robust lists in that case. NULL is |
2649 | * guaranteed to fault and we get -EFAULT on functional | 2657 | * guaranteed to fault and we get -EFAULT on functional |
2650 | * implementation, the non functional ones will return | 2658 | * implementation, the non-functional ones will return |
2651 | * -ENOSYS. | 2659 | * -ENOSYS. |
2652 | */ | 2660 | */ |
2653 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2661 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..06da4dfc339b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -19,7 +19,7 @@ | |||
19 | */ | 19 | */ |
20 | static inline int | 20 | static inline int |
21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | 21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
22 | compat_uptr_t __user *head, int *pi) | 22 | compat_uptr_t __user *head, unsigned int *pi) |
23 | { | 23 | { |
24 | if (get_user(*uentry, head)) | 24 | if (get_user(*uentry, head)) |
25 | return -EFAULT; | 25 | return -EFAULT; |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index f83972b16564..9bd0934f6c33 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len, | |||
561 | static const struct file_operations gcov_reset_fops = { | 561 | static const struct file_operations gcov_reset_fops = { |
562 | .write = reset_write, | 562 | .write = reset_write, |
563 | .read = reset_read, | 563 | .read = reset_read, |
564 | .llseek = noop_llseek, | ||
564 | }; | 565 | }; |
565 | 566 | ||
566 | /* | 567 | /* |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1decafbb6b1a..72206cf5c6cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -931,6 +931,7 @@ static inline int | |||
931 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | 931 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) |
932 | { | 932 | { |
933 | if (hrtimer_is_queued(timer)) { | 933 | if (hrtimer_is_queued(timer)) { |
934 | unsigned long state; | ||
934 | int reprogram; | 935 | int reprogram; |
935 | 936 | ||
936 | /* | 937 | /* |
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
944 | debug_deactivate(timer); | 945 | debug_deactivate(timer); |
945 | timer_stats_hrtimer_clear_start_info(timer); | 946 | timer_stats_hrtimer_clear_start_info(timer); |
946 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 947 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); |
947 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | 948 | /* |
948 | reprogram); | 949 | * We must preserve the CALLBACK state flag here, |
950 | * otherwise we could move the timer base in | ||
951 | * switch_hrtimer_base. | ||
952 | */ | ||
953 | state = timer->state & HRTIMER_STATE_CALLBACK; | ||
954 | __remove_hrtimer(timer, base, state, reprogram); | ||
949 | return 1; | 955 | return 1; |
950 | } | 956 | } |
951 | return 0; | 957 | return 0; |
@@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
1231 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | 1237 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); |
1232 | enqueue_hrtimer(timer, base); | 1238 | enqueue_hrtimer(timer, base); |
1233 | } | 1239 | } |
1240 | |||
1241 | WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); | ||
1242 | |||
1234 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1243 | timer->state &= ~HRTIMER_STATE_CALLBACK; |
1235 | } | 1244 | } |
1236 | 1245 | ||
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
99 | " disables this message.\n"); | 99 | " disables this message.\n"); |
100 | sched_show_task(t); | 100 | sched_show_task(t); |
101 | __debug_show_held_locks(t); | 101 | debug_show_held_locks(t); |
102 | 102 | ||
103 | touch_nmi_watchdog(); | 103 | touch_nmi_watchdog(); |
104 | 104 | ||
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
111 | * periodically exit the critical section and enter a new one. | 111 | * periodically exit the critical section and enter a new one. |
112 | * | 112 | * |
113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order | 113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order |
114 | * exit the grace period. For classic RCU, a reschedule is required. | 114 | * to exit the grace period. For classic RCU, a reschedule is required. |
115 | */ | 115 | */ |
116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) | 116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) |
117 | { | 117 | { |
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c7c2aed9e2dc..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
113 | */ | 113 | */ |
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | 114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) |
115 | { | 115 | { |
116 | struct perf_event_context *ctx = bp->ctx; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
117 | struct perf_event *iter; | 117 | struct perf_event *iter; |
118 | int count = 0; | 118 | int count = 0; |
119 | 119 | ||
120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
121 | if (iter->ctx == ctx && find_slot_idx(iter) == type) | 121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) |
122 | count += hw_breakpoint_weight(iter); | 122 | count += hw_breakpoint_weight(iter); |
123 | } | 123 | } |
124 | 124 | ||
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
134 | enum bp_type_idx type) | 134 | enum bp_type_idx type) |
135 | { | 135 | { |
136 | int cpu = bp->cpu; | 136 | int cpu = bp->cpu; |
137 | struct task_struct *tsk = bp->ctx->task; | 137 | struct task_struct *tsk = bp->hw.bp_target; |
138 | 138 | ||
139 | if (cpu >= 0) { | 139 | if (cpu >= 0) { |
140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); | 140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); |
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
213 | int weight) | 213 | int weight) |
214 | { | 214 | { |
215 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
216 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->hw.bp_target; |
217 | 217 | ||
218 | /* Pinned counter cpu profiling */ | 218 | /* Pinned counter cpu profiling */ |
219 | if (!tsk) { | 219 | if (!tsk) { |
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
433 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
434 | struct task_struct *tsk) | 434 | struct task_struct *tsk) |
435 | { | 435 | { |
436 | return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), | 436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); |
437 | triggered); | ||
438 | } | 437 | } |
439 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
440 | 439 | ||
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
516 | get_online_cpus(); | 515 | get_online_cpus(); |
517 | for_each_online_cpu(cpu) { | 516 | for_each_online_cpu(cpu) { |
518 | pevent = per_cpu_ptr(cpu_events, cpu); | 517 | pevent = per_cpu_ptr(cpu_events, cpu); |
519 | bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); | 518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); |
520 | 519 | ||
521 | *pevent = bp; | 520 | *pevent = bp; |
522 | 521 | ||
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { | |||
566 | .priority = 0x7fffffff | 565 | .priority = 0x7fffffff |
567 | }; | 566 | }; |
568 | 567 | ||
568 | static void bp_perf_event_destroy(struct perf_event *event) | ||
569 | { | ||
570 | release_bp_slot(event); | ||
571 | } | ||
572 | |||
573 | static int hw_breakpoint_event_init(struct perf_event *bp) | ||
574 | { | ||
575 | int err; | ||
576 | |||
577 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) | ||
578 | return -ENOENT; | ||
579 | |||
580 | err = register_perf_hw_breakpoint(bp); | ||
581 | if (err) | ||
582 | return err; | ||
583 | |||
584 | bp->destroy = bp_perf_event_destroy; | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int hw_breakpoint_add(struct perf_event *bp, int flags) | ||
590 | { | ||
591 | if (!(flags & PERF_EF_START)) | ||
592 | bp->hw.state = PERF_HES_STOPPED; | ||
593 | |||
594 | return arch_install_hw_breakpoint(bp); | ||
595 | } | ||
596 | |||
597 | static void hw_breakpoint_del(struct perf_event *bp, int flags) | ||
598 | { | ||
599 | arch_uninstall_hw_breakpoint(bp); | ||
600 | } | ||
601 | |||
602 | static void hw_breakpoint_start(struct perf_event *bp, int flags) | ||
603 | { | ||
604 | bp->hw.state = 0; | ||
605 | } | ||
606 | |||
607 | static void hw_breakpoint_stop(struct perf_event *bp, int flags) | ||
608 | { | ||
609 | bp->hw.state = PERF_HES_STOPPED; | ||
610 | } | ||
611 | |||
612 | static struct pmu perf_breakpoint = { | ||
613 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | ||
614 | |||
615 | .event_init = hw_breakpoint_event_init, | ||
616 | .add = hw_breakpoint_add, | ||
617 | .del = hw_breakpoint_del, | ||
618 | .start = hw_breakpoint_start, | ||
619 | .stop = hw_breakpoint_stop, | ||
620 | .read = hw_breakpoint_pmu_read, | ||
621 | }; | ||
622 | |||
569 | static int __init init_hw_breakpoint(void) | 623 | static int __init init_hw_breakpoint(void) |
570 | { | 624 | { |
571 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void) | |||
587 | 641 | ||
588 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
589 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint); | ||
645 | |||
590 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
591 | 647 | ||
592 | err_alloc: | 648 | err_alloc: |
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void) | |||
602 | core_initcall(init_hw_breakpoint); | 658 | core_initcall(init_hw_breakpoint); |
603 | 659 | ||
604 | 660 | ||
605 | struct pmu perf_ops_bp = { | ||
606 | .enable = arch_install_hw_breakpoint, | ||
607 | .disable = arch_uninstall_hw_breakpoint, | ||
608 | .read = hw_breakpoint_pmu_read, | ||
609 | }; | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 000000000000..31d766bf5d2e --- /dev/null +++ b/kernel/irq/Kconfig | |||
@@ -0,0 +1,53 @@ | |||
1 | config HAVE_GENERIC_HARDIRQS | ||
2 | def_bool n | ||
3 | |||
4 | if HAVE_GENERIC_HARDIRQS | ||
5 | menu "IRQ subsystem" | ||
6 | # | ||
7 | # Interrupt subsystem related configuration options | ||
8 | # | ||
9 | config GENERIC_HARDIRQS | ||
10 | def_bool y | ||
11 | |||
12 | config GENERIC_HARDIRQS_NO__DO_IRQ | ||
13 | def_bool y | ||
14 | |||
15 | # Select this to disable the deprecated stuff | ||
16 | config GENERIC_HARDIRQS_NO_DEPRECATED | ||
17 | def_bool n | ||
18 | |||
19 | # Options selectable by the architecture code | ||
20 | config HAVE_SPARSE_IRQ | ||
21 | def_bool n | ||
22 | |||
23 | config GENERIC_IRQ_PROBE | ||
24 | def_bool n | ||
25 | |||
26 | config GENERIC_PENDING_IRQ | ||
27 | def_bool n | ||
28 | |||
29 | config AUTO_IRQ_AFFINITY | ||
30 | def_bool n | ||
31 | |||
32 | config IRQ_PER_CPU | ||
33 | def_bool n | ||
34 | |||
35 | config HARDIRQS_SW_RESEND | ||
36 | def_bool n | ||
37 | |||
38 | config SPARSE_IRQ | ||
39 | bool "Support sparse irq numbering" | ||
40 | depends on HAVE_SPARSE_IRQ | ||
41 | ---help--- | ||
42 | |||
43 | Sparse irq numbering is useful for distro kernels that want | ||
44 | to define a high CONFIG_NR_CPUS value but still want to have | ||
45 | low kernel memory footprint on smaller machines. | ||
46 | |||
47 | ( Sparse irqs can also be beneficial on NUMA boxes, as they spread | ||
48 | out the interrupt descriptors in a more NUMA-friendly way. ) | ||
49 | |||
50 | If you don't know what to do here, say N. | ||
51 | |||
52 | endmenu | ||
53 | endif | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419d..54329cd7b3ee 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
6 | obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o | ||
7 | obj-$(CONFIG_PM_SLEEP) += pm.o | 6 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef110..505798f86c36 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void) | |||
57 | * Some chips need to know about probing in | 57 | * Some chips need to know about probing in |
58 | * progress: | 58 | * progress: |
59 | */ | 59 | */ |
60 | if (desc->chip->set_type) | 60 | if (desc->irq_data.chip->irq_set_type) |
61 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | 61 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
62 | desc->chip->startup(i); | 62 | IRQ_TYPE_PROBE); |
63 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
63 | } | 64 | } |
64 | raw_spin_unlock_irq(&desc->lock); | 65 | raw_spin_unlock_irq(&desc->lock); |
65 | } | 66 | } |
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void) | |||
76 | raw_spin_lock_irq(&desc->lock); | 77 | raw_spin_lock_irq(&desc->lock); |
77 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 78 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
78 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 79 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; |
79 | if (desc->chip->startup(i)) | 80 | if (desc->irq_data.chip->irq_startup(&desc->irq_data)) |
80 | desc->status |= IRQ_PENDING; | 81 | desc->status |= IRQ_PENDING; |
81 | } | 82 | } |
82 | raw_spin_unlock_irq(&desc->lock); | 83 | raw_spin_unlock_irq(&desc->lock); |
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void) | |||
98 | /* It triggered already - consider it spurious. */ | 99 | /* It triggered already - consider it spurious. */ |
99 | if (!(status & IRQ_WAITING)) { | 100 | if (!(status & IRQ_WAITING)) { |
100 | desc->status = status & ~IRQ_AUTODETECT; | 101 | desc->status = status & ~IRQ_AUTODETECT; |
101 | desc->chip->shutdown(i); | 102 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
102 | } else | 103 | } else |
103 | if (i < 32) | 104 | if (i < 32) |
104 | mask |= 1 << i; | 105 | mask |= 1 << i; |
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
137 | mask |= 1 << i; | 138 | mask |= 1 << i; |
138 | 139 | ||
139 | desc->status = status & ~IRQ_AUTODETECT; | 140 | desc->status = status & ~IRQ_AUTODETECT; |
140 | desc->chip->shutdown(i); | 141 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
141 | } | 142 | } |
142 | raw_spin_unlock_irq(&desc->lock); | 143 | raw_spin_unlock_irq(&desc->lock); |
143 | } | 144 | } |
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val) | |||
181 | nr_of_irqs++; | 182 | nr_of_irqs++; |
182 | } | 183 | } |
183 | desc->status = status & ~IRQ_AUTODETECT; | 184 | desc->status = status & ~IRQ_AUTODETECT; |
184 | desc->chip->shutdown(i); | 185 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
185 | } | 186 | } |
186 | raw_spin_unlock_irq(&desc->lock); | 187 | raw_spin_unlock_irq(&desc->lock); |
187 | } | 188 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f8..baa5c4acad83 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -18,108 +18,6 @@ | |||
18 | 18 | ||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) | ||
22 | { | ||
23 | struct irq_desc *desc; | ||
24 | unsigned long flags; | ||
25 | |||
26 | desc = irq_to_desc(irq); | ||
27 | if (!desc) { | ||
28 | WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); | ||
29 | return; | ||
30 | } | ||
31 | |||
32 | /* Ensure we don't have left over values from a previous use of this irq */ | ||
33 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
34 | desc->status = IRQ_DISABLED; | ||
35 | desc->chip = &no_irq_chip; | ||
36 | desc->handle_irq = handle_bad_irq; | ||
37 | desc->depth = 1; | ||
38 | desc->msi_desc = NULL; | ||
39 | desc->handler_data = NULL; | ||
40 | if (!keep_chip_data) | ||
41 | desc->chip_data = NULL; | ||
42 | desc->action = NULL; | ||
43 | desc->irq_count = 0; | ||
44 | desc->irqs_unhandled = 0; | ||
45 | #ifdef CONFIG_SMP | ||
46 | cpumask_setall(desc->affinity); | ||
47 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
48 | cpumask_clear(desc->pending_mask); | ||
49 | #endif | ||
50 | #endif | ||
51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
52 | } | ||
53 | |||
54 | /** | ||
55 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
56 | * @irq: irq number to initialize | ||
57 | */ | ||
58 | void dynamic_irq_init(unsigned int irq) | ||
59 | { | ||
60 | dynamic_irq_init_x(irq, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
65 | * @irq: irq number to initialize | ||
66 | * | ||
67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
68 | */ | ||
69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
70 | { | ||
71 | dynamic_irq_init_x(irq, true); | ||
72 | } | ||
73 | |||
74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
75 | { | ||
76 | struct irq_desc *desc = irq_to_desc(irq); | ||
77 | unsigned long flags; | ||
78 | |||
79 | if (!desc) { | ||
80 | WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); | ||
81 | return; | ||
82 | } | ||
83 | |||
84 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
85 | if (desc->action) { | ||
86 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
87 | WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", | ||
88 | irq); | ||
89 | return; | ||
90 | } | ||
91 | desc->msi_desc = NULL; | ||
92 | desc->handler_data = NULL; | ||
93 | if (!keep_chip_data) | ||
94 | desc->chip_data = NULL; | ||
95 | desc->handle_irq = handle_bad_irq; | ||
96 | desc->chip = &no_irq_chip; | ||
97 | desc->name = NULL; | ||
98 | clear_kstat_irqs(desc); | ||
99 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
100 | } | ||
101 | |||
102 | /** | ||
103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
104 | * @irq: irq number to initialize | ||
105 | */ | ||
106 | void dynamic_irq_cleanup(unsigned int irq) | ||
107 | { | ||
108 | dynamic_irq_cleanup_x(irq, false); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
113 | * @irq: irq number to initialize | ||
114 | * | ||
115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
116 | */ | ||
117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
118 | { | ||
119 | dynamic_irq_cleanup_x(irq, true); | ||
120 | } | ||
121 | |||
122 | |||
123 | /** | 21 | /** |
124 | * set_irq_chip - set the irq chip for an irq | 22 | * set_irq_chip - set the irq chip for an irq |
125 | * @irq: irq number | 23 | * @irq: irq number |
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) | |||
140 | 38 | ||
141 | raw_spin_lock_irqsave(&desc->lock, flags); | 39 | raw_spin_lock_irqsave(&desc->lock, flags); |
142 | irq_chip_set_defaults(chip); | 40 | irq_chip_set_defaults(chip); |
143 | desc->chip = chip; | 41 | desc->irq_data.chip = chip; |
144 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 42 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
145 | 43 | ||
146 | return 0; | 44 | return 0; |
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data) | |||
193 | } | 91 | } |
194 | 92 | ||
195 | raw_spin_lock_irqsave(&desc->lock, flags); | 93 | raw_spin_lock_irqsave(&desc->lock, flags); |
196 | desc->handler_data = data; | 94 | desc->irq_data.handler_data = data; |
197 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 95 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
198 | return 0; | 96 | return 0; |
199 | } | 97 | } |
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) | |||
218 | } | 116 | } |
219 | 117 | ||
220 | raw_spin_lock_irqsave(&desc->lock, flags); | 118 | raw_spin_lock_irqsave(&desc->lock, flags); |
221 | desc->msi_desc = entry; | 119 | desc->irq_data.msi_desc = entry; |
222 | if (entry) | 120 | if (entry) |
223 | entry->irq = irq; | 121 | entry->irq = irq; |
224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 122 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data) | |||
243 | return -EINVAL; | 141 | return -EINVAL; |
244 | } | 142 | } |
245 | 143 | ||
246 | if (!desc->chip) { | 144 | if (!desc->irq_data.chip) { |
247 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | 145 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); |
248 | return -EINVAL; | 146 | return -EINVAL; |
249 | } | 147 | } |
250 | 148 | ||
251 | raw_spin_lock_irqsave(&desc->lock, flags); | 149 | raw_spin_lock_irqsave(&desc->lock, flags); |
252 | desc->chip_data = data; | 150 | desc->irq_data.chip_data = data; |
253 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 151 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
254 | 152 | ||
255 | return 0; | 153 | return 0; |
256 | } | 154 | } |
257 | EXPORT_SYMBOL(set_irq_chip_data); | 155 | EXPORT_SYMBOL(set_irq_chip_data); |
258 | 156 | ||
157 | struct irq_data *irq_get_irq_data(unsigned int irq) | ||
158 | { | ||
159 | struct irq_desc *desc = irq_to_desc(irq); | ||
160 | |||
161 | return desc ? &desc->irq_data : NULL; | ||
162 | } | ||
163 | EXPORT_SYMBOL_GPL(irq_get_irq_data); | ||
164 | |||
259 | /** | 165 | /** |
260 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq | 166 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq |
261 | * | 167 | * |
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); | |||
287 | /* | 193 | /* |
288 | * default enable function | 194 | * default enable function |
289 | */ | 195 | */ |
290 | static void default_enable(unsigned int irq) | 196 | static void default_enable(struct irq_data *data) |
291 | { | 197 | { |
292 | struct irq_desc *desc = irq_to_desc(irq); | 198 | struct irq_desc *desc = irq_data_to_desc(data); |
293 | 199 | ||
294 | desc->chip->unmask(irq); | 200 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
295 | desc->status &= ~IRQ_MASKED; | 201 | desc->status &= ~IRQ_MASKED; |
296 | } | 202 | } |
297 | 203 | ||
298 | /* | 204 | /* |
299 | * default disable function | 205 | * default disable function |
300 | */ | 206 | */ |
301 | static void default_disable(unsigned int irq) | 207 | static void default_disable(struct irq_data *data) |
302 | { | 208 | { |
303 | } | 209 | } |
304 | 210 | ||
305 | /* | 211 | /* |
306 | * default startup function | 212 | * default startup function |
307 | */ | 213 | */ |
308 | static unsigned int default_startup(unsigned int irq) | 214 | static unsigned int default_startup(struct irq_data *data) |
309 | { | 215 | { |
310 | struct irq_desc *desc = irq_to_desc(irq); | 216 | struct irq_desc *desc = irq_data_to_desc(data); |
311 | 217 | ||
312 | desc->chip->enable(irq); | 218 | desc->irq_data.chip->irq_enable(data); |
313 | return 0; | 219 | return 0; |
314 | } | 220 | } |
315 | 221 | ||
316 | /* | 222 | /* |
317 | * default shutdown function | 223 | * default shutdown function |
318 | */ | 224 | */ |
319 | static void default_shutdown(unsigned int irq) | 225 | static void default_shutdown(struct irq_data *data) |
320 | { | 226 | { |
321 | struct irq_desc *desc = irq_to_desc(irq); | 227 | struct irq_desc *desc = irq_data_to_desc(data); |
322 | 228 | ||
323 | desc->chip->mask(irq); | 229 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
324 | desc->status |= IRQ_MASKED; | 230 | desc->status |= IRQ_MASKED; |
325 | } | 231 | } |
326 | 232 | ||
233 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
234 | /* Temporary migration helpers */ | ||
235 | static void compat_irq_mask(struct irq_data *data) | ||
236 | { | ||
237 | data->chip->mask(data->irq); | ||
238 | } | ||
239 | |||
240 | static void compat_irq_unmask(struct irq_data *data) | ||
241 | { | ||
242 | data->chip->unmask(data->irq); | ||
243 | } | ||
244 | |||
245 | static void compat_irq_ack(struct irq_data *data) | ||
246 | { | ||
247 | data->chip->ack(data->irq); | ||
248 | } | ||
249 | |||
250 | static void compat_irq_mask_ack(struct irq_data *data) | ||
251 | { | ||
252 | data->chip->mask_ack(data->irq); | ||
253 | } | ||
254 | |||
255 | static void compat_irq_eoi(struct irq_data *data) | ||
256 | { | ||
257 | data->chip->eoi(data->irq); | ||
258 | } | ||
259 | |||
260 | static void compat_irq_enable(struct irq_data *data) | ||
261 | { | ||
262 | data->chip->enable(data->irq); | ||
263 | } | ||
264 | |||
265 | static void compat_irq_disable(struct irq_data *data) | ||
266 | { | ||
267 | data->chip->disable(data->irq); | ||
268 | } | ||
269 | |||
270 | static void compat_irq_shutdown(struct irq_data *data) | ||
271 | { | ||
272 | data->chip->shutdown(data->irq); | ||
273 | } | ||
274 | |||
275 | static unsigned int compat_irq_startup(struct irq_data *data) | ||
276 | { | ||
277 | return data->chip->startup(data->irq); | ||
278 | } | ||
279 | |||
280 | static int compat_irq_set_affinity(struct irq_data *data, | ||
281 | const struct cpumask *dest, bool force) | ||
282 | { | ||
283 | return data->chip->set_affinity(data->irq, dest); | ||
284 | } | ||
285 | |||
286 | static int compat_irq_set_type(struct irq_data *data, unsigned int type) | ||
287 | { | ||
288 | return data->chip->set_type(data->irq, type); | ||
289 | } | ||
290 | |||
291 | static int compat_irq_set_wake(struct irq_data *data, unsigned int on) | ||
292 | { | ||
293 | return data->chip->set_wake(data->irq, on); | ||
294 | } | ||
295 | |||
296 | static int compat_irq_retrigger(struct irq_data *data) | ||
297 | { | ||
298 | return data->chip->retrigger(data->irq); | ||
299 | } | ||
300 | |||
301 | static void compat_bus_lock(struct irq_data *data) | ||
302 | { | ||
303 | data->chip->bus_lock(data->irq); | ||
304 | } | ||
305 | |||
306 | static void compat_bus_sync_unlock(struct irq_data *data) | ||
307 | { | ||
308 | data->chip->bus_sync_unlock(data->irq); | ||
309 | } | ||
310 | #endif | ||
311 | |||
327 | /* | 312 | /* |
328 | * Fixup enable/disable function pointers | 313 | * Fixup enable/disable function pointers |
329 | */ | 314 | */ |
330 | void irq_chip_set_defaults(struct irq_chip *chip) | 315 | void irq_chip_set_defaults(struct irq_chip *chip) |
331 | { | 316 | { |
332 | if (!chip->enable) | 317 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
333 | chip->enable = default_enable; | ||
334 | if (!chip->disable) | ||
335 | chip->disable = default_disable; | ||
336 | if (!chip->startup) | ||
337 | chip->startup = default_startup; | ||
338 | /* | 318 | /* |
339 | * We use chip->disable, when the user provided its own. When | 319 | * Compat fixup functions need to be before we set the |
340 | * we have default_disable set for chip->disable, then we need | 320 | * defaults for enable/disable/startup/shutdown |
321 | */ | ||
322 | if (chip->enable) | ||
323 | chip->irq_enable = compat_irq_enable; | ||
324 | if (chip->disable) | ||
325 | chip->irq_disable = compat_irq_disable; | ||
326 | if (chip->shutdown) | ||
327 | chip->irq_shutdown = compat_irq_shutdown; | ||
328 | if (chip->startup) | ||
329 | chip->irq_startup = compat_irq_startup; | ||
330 | #endif | ||
331 | /* | ||
332 | * The real defaults | ||
333 | */ | ||
334 | if (!chip->irq_enable) | ||
335 | chip->irq_enable = default_enable; | ||
336 | if (!chip->irq_disable) | ||
337 | chip->irq_disable = default_disable; | ||
338 | if (!chip->irq_startup) | ||
339 | chip->irq_startup = default_startup; | ||
340 | /* | ||
341 | * We use chip->irq_disable, when the user provided its own. When | ||
342 | * we have default_disable set for chip->irq_disable, then we need | ||
341 | * to use default_shutdown, otherwise the irq line is not | 343 | * to use default_shutdown, otherwise the irq line is not |
342 | * disabled on free_irq(): | 344 | * disabled on free_irq(): |
343 | */ | 345 | */ |
344 | if (!chip->shutdown) | 346 | if (!chip->irq_shutdown) |
345 | chip->shutdown = chip->disable != default_disable ? | 347 | chip->irq_shutdown = chip->irq_disable != default_disable ? |
346 | chip->disable : default_shutdown; | 348 | chip->irq_disable : default_shutdown; |
347 | if (!chip->name) | 349 | |
348 | chip->name = chip->typename; | 350 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
349 | if (!chip->end) | 351 | if (!chip->end) |
350 | chip->end = dummy_irq_chip.end; | 352 | chip->end = dummy_irq_chip.end; |
353 | |||
354 | /* | ||
355 | * Now fix up the remaining compat handlers | ||
356 | */ | ||
357 | if (chip->bus_lock) | ||
358 | chip->irq_bus_lock = compat_bus_lock; | ||
359 | if (chip->bus_sync_unlock) | ||
360 | chip->irq_bus_sync_unlock = compat_bus_sync_unlock; | ||
361 | if (chip->mask) | ||
362 | chip->irq_mask = compat_irq_mask; | ||
363 | if (chip->unmask) | ||
364 | chip->irq_unmask = compat_irq_unmask; | ||
365 | if (chip->ack) | ||
366 | chip->irq_ack = compat_irq_ack; | ||
367 | if (chip->mask_ack) | ||
368 | chip->irq_mask_ack = compat_irq_mask_ack; | ||
369 | if (chip->eoi) | ||
370 | chip->irq_eoi = compat_irq_eoi; | ||
371 | if (chip->set_affinity) | ||
372 | chip->irq_set_affinity = compat_irq_set_affinity; | ||
373 | if (chip->set_type) | ||
374 | chip->irq_set_type = compat_irq_set_type; | ||
375 | if (chip->set_wake) | ||
376 | chip->irq_set_wake = compat_irq_set_wake; | ||
377 | if (chip->retrigger) | ||
378 | chip->irq_retrigger = compat_irq_retrigger; | ||
379 | #endif | ||
351 | } | 380 | } |
352 | 381 | ||
353 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | 382 | static inline void mask_ack_irq(struct irq_desc *desc) |
354 | { | 383 | { |
355 | if (desc->chip->mask_ack) | 384 | if (desc->irq_data.chip->irq_mask_ack) |
356 | desc->chip->mask_ack(irq); | 385 | desc->irq_data.chip->irq_mask_ack(&desc->irq_data); |
357 | else { | 386 | else { |
358 | desc->chip->mask(irq); | 387 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
359 | if (desc->chip->ack) | 388 | if (desc->irq_data.chip->irq_ack) |
360 | desc->chip->ack(irq); | 389 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
361 | } | 390 | } |
362 | desc->status |= IRQ_MASKED; | 391 | desc->status |= IRQ_MASKED; |
363 | } | 392 | } |
364 | 393 | ||
365 | static inline void mask_irq(struct irq_desc *desc, int irq) | 394 | static inline void mask_irq(struct irq_desc *desc) |
366 | { | 395 | { |
367 | if (desc->chip->mask) { | 396 | if (desc->irq_data.chip->irq_mask) { |
368 | desc->chip->mask(irq); | 397 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
369 | desc->status |= IRQ_MASKED; | 398 | desc->status |= IRQ_MASKED; |
370 | } | 399 | } |
371 | } | 400 | } |
372 | 401 | ||
373 | static inline void unmask_irq(struct irq_desc *desc, int irq) | 402 | static inline void unmask_irq(struct irq_desc *desc) |
374 | { | 403 | { |
375 | if (desc->chip->unmask) { | 404 | if (desc->irq_data.chip->irq_unmask) { |
376 | desc->chip->unmask(irq); | 405 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
377 | desc->status &= ~IRQ_MASKED; | 406 | desc->status &= ~IRQ_MASKED; |
378 | } | 407 | } |
379 | } | 408 | } |
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
476 | irqreturn_t action_ret; | 505 | irqreturn_t action_ret; |
477 | 506 | ||
478 | raw_spin_lock(&desc->lock); | 507 | raw_spin_lock(&desc->lock); |
479 | mask_ack_irq(desc, irq); | 508 | mask_ack_irq(desc); |
480 | 509 | ||
481 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 510 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
482 | goto out_unlock; | 511 | goto out_unlock; |
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
502 | desc->status &= ~IRQ_INPROGRESS; | 531 | desc->status &= ~IRQ_INPROGRESS; |
503 | 532 | ||
504 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) | 533 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) |
505 | unmask_irq(desc, irq); | 534 | unmask_irq(desc); |
506 | out_unlock: | 535 | out_unlock: |
507 | raw_spin_unlock(&desc->lock); | 536 | raw_spin_unlock(&desc->lock); |
508 | } | 537 | } |
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
539 | action = desc->action; | 568 | action = desc->action; |
540 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 569 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
541 | desc->status |= IRQ_PENDING; | 570 | desc->status |= IRQ_PENDING; |
542 | mask_irq(desc, irq); | 571 | mask_irq(desc); |
543 | goto out; | 572 | goto out; |
544 | } | 573 | } |
545 | 574 | ||
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
554 | raw_spin_lock(&desc->lock); | 583 | raw_spin_lock(&desc->lock); |
555 | desc->status &= ~IRQ_INPROGRESS; | 584 | desc->status &= ~IRQ_INPROGRESS; |
556 | out: | 585 | out: |
557 | desc->chip->eoi(irq); | 586 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
558 | 587 | ||
559 | raw_spin_unlock(&desc->lock); | 588 | raw_spin_unlock(&desc->lock); |
560 | } | 589 | } |
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
590 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | 619 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || |
591 | !desc->action)) { | 620 | !desc->action)) { |
592 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 621 | desc->status |= (IRQ_PENDING | IRQ_MASKED); |
593 | mask_ack_irq(desc, irq); | 622 | mask_ack_irq(desc); |
594 | goto out_unlock; | 623 | goto out_unlock; |
595 | } | 624 | } |
596 | kstat_incr_irqs_this_cpu(irq, desc); | 625 | kstat_incr_irqs_this_cpu(irq, desc); |
597 | 626 | ||
598 | /* Start handling the irq */ | 627 | /* Start handling the irq */ |
599 | if (desc->chip->ack) | 628 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
600 | desc->chip->ack(irq); | ||
601 | 629 | ||
602 | /* Mark the IRQ currently in progress.*/ | 630 | /* Mark the IRQ currently in progress.*/ |
603 | desc->status |= IRQ_INPROGRESS; | 631 | desc->status |= IRQ_INPROGRESS; |
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
607 | irqreturn_t action_ret; | 635 | irqreturn_t action_ret; |
608 | 636 | ||
609 | if (unlikely(!action)) { | 637 | if (unlikely(!action)) { |
610 | mask_irq(desc, irq); | 638 | mask_irq(desc); |
611 | goto out_unlock; | 639 | goto out_unlock; |
612 | } | 640 | } |
613 | 641 | ||
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
619 | if (unlikely((desc->status & | 647 | if (unlikely((desc->status & |
620 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 648 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == |
621 | (IRQ_PENDING | IRQ_MASKED))) { | 649 | (IRQ_PENDING | IRQ_MASKED))) { |
622 | unmask_irq(desc, irq); | 650 | unmask_irq(desc); |
623 | } | 651 | } |
624 | 652 | ||
625 | desc->status &= ~IRQ_PENDING; | 653 | desc->status &= ~IRQ_PENDING; |
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
650 | 678 | ||
651 | kstat_incr_irqs_this_cpu(irq, desc); | 679 | kstat_incr_irqs_this_cpu(irq, desc); |
652 | 680 | ||
653 | if (desc->chip->ack) | 681 | if (desc->irq_data.chip->irq_ack) |
654 | desc->chip->ack(irq); | 682 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
655 | 683 | ||
656 | action_ret = handle_IRQ_event(irq, desc->action); | 684 | action_ret = handle_IRQ_event(irq, desc->action); |
657 | if (!noirqdebug) | 685 | if (!noirqdebug) |
658 | note_interrupt(irq, desc, action_ret); | 686 | note_interrupt(irq, desc, action_ret); |
659 | 687 | ||
660 | if (desc->chip->eoi) | 688 | if (desc->irq_data.chip->irq_eoi) |
661 | desc->chip->eoi(irq); | 689 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
662 | } | 690 | } |
663 | 691 | ||
664 | void | 692 | void |
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
676 | 704 | ||
677 | if (!handle) | 705 | if (!handle) |
678 | handle = handle_bad_irq; | 706 | handle = handle_bad_irq; |
679 | else if (desc->chip == &no_irq_chip) { | 707 | else if (desc->irq_data.chip == &no_irq_chip) { |
680 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 708 | printk(KERN_WARNING "Trying to install %sinterrupt handler " |
681 | "for IRQ%d\n", is_chained ? "chained " : "", irq); | 709 | "for IRQ%d\n", is_chained ? "chained " : "", irq); |
682 | /* | 710 | /* |
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
686 | * prevent us to setup the interrupt at all. Switch it to | 714 | * prevent us to setup the interrupt at all. Switch it to |
687 | * dummy_irq_chip for easy transition. | 715 | * dummy_irq_chip for easy transition. |
688 | */ | 716 | */ |
689 | desc->chip = &dummy_irq_chip; | 717 | desc->irq_data.chip = &dummy_irq_chip; |
690 | } | 718 | } |
691 | 719 | ||
692 | chip_bus_lock(irq, desc); | 720 | chip_bus_lock(desc); |
693 | raw_spin_lock_irqsave(&desc->lock, flags); | 721 | raw_spin_lock_irqsave(&desc->lock, flags); |
694 | 722 | ||
695 | /* Uninstall? */ | 723 | /* Uninstall? */ |
696 | if (handle == handle_bad_irq) { | 724 | if (handle == handle_bad_irq) { |
697 | if (desc->chip != &no_irq_chip) | 725 | if (desc->irq_data.chip != &no_irq_chip) |
698 | mask_ack_irq(desc, irq); | 726 | mask_ack_irq(desc); |
699 | desc->status |= IRQ_DISABLED; | 727 | desc->status |= IRQ_DISABLED; |
700 | desc->depth = 1; | 728 | desc->depth = 1; |
701 | } | 729 | } |
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
706 | desc->status &= ~IRQ_DISABLED; | 734 | desc->status &= ~IRQ_DISABLED; |
707 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | 735 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; |
708 | desc->depth = 0; | 736 | desc->depth = 0; |
709 | desc->chip->startup(irq); | 737 | desc->irq_data.chip->irq_startup(&desc->irq_data); |
710 | } | 738 | } |
711 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 739 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
712 | chip_bus_sync_unlock(irq, desc); | 740 | chip_bus_sync_unlock(desc); |
713 | } | 741 | } |
714 | EXPORT_SYMBOL_GPL(__set_irq_handler); | 742 | EXPORT_SYMBOL_GPL(__set_irq_handler); |
715 | 743 | ||
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
729 | __set_irq_handler(irq, handle, 0, name); | 757 | __set_irq_handler(irq, handle, 0, name); |
730 | } | 758 | } |
731 | 759 | ||
732 | void set_irq_noprobe(unsigned int irq) | 760 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
733 | { | 761 | { |
734 | struct irq_desc *desc = irq_to_desc(irq); | 762 | struct irq_desc *desc = irq_to_desc(irq); |
735 | unsigned long flags; | 763 | unsigned long flags; |
736 | 764 | ||
737 | if (!desc) { | 765 | if (!desc) |
738 | printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); | ||
739 | return; | 766 | return; |
740 | } | ||
741 | |||
742 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
743 | desc->status |= IRQ_NOPROBE; | ||
744 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
745 | } | ||
746 | |||
747 | void set_irq_probe(unsigned int irq) | ||
748 | { | ||
749 | struct irq_desc *desc = irq_to_desc(irq); | ||
750 | unsigned long flags; | ||
751 | 767 | ||
752 | if (!desc) { | 768 | /* Sanitize flags */ |
753 | printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); | 769 | set &= IRQF_MODIFY_MASK; |
754 | return; | 770 | clr &= IRQF_MODIFY_MASK; |
755 | } | ||
756 | 771 | ||
757 | raw_spin_lock_irqsave(&desc->lock, flags); | 772 | raw_spin_lock_irqsave(&desc->lock, flags); |
758 | desc->status &= ~IRQ_NOPROBE; | 773 | desc->status &= ~clr; |
774 | desc->status |= set; | ||
759 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 775 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
760 | } | 776 | } |
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 000000000000..20dc5474947e --- /dev/null +++ b/kernel/irq/dummychip.c | |||
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the dummy interrupt chip implementation | ||
6 | */ | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/irq.h> | ||
9 | |||
10 | #include "internals.h" | ||
11 | |||
12 | /* | ||
13 | * What should we do if we get a hw irq event on an illegal vector? | ||
14 | * Each architecture has to answer this themself. | ||
15 | */ | ||
16 | static void ack_bad(struct irq_data *data) | ||
17 | { | ||
18 | struct irq_desc *desc = irq_data_to_desc(data); | ||
19 | |||
20 | print_irq_desc(data->irq, desc); | ||
21 | ack_bad_irq(data->irq); | ||
22 | } | ||
23 | |||
24 | /* | ||
25 | * NOP functions | ||
26 | */ | ||
27 | static void noop(struct irq_data *data) { } | ||
28 | |||
29 | static unsigned int noop_ret(struct irq_data *data) | ||
30 | { | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
35 | static void compat_noop(unsigned int irq) { } | ||
36 | #define END_INIT .end = compat_noop | ||
37 | #else | ||
38 | #define END_INIT | ||
39 | #endif | ||
40 | |||
41 | /* | ||
42 | * Generic no controller implementation | ||
43 | */ | ||
44 | struct irq_chip no_irq_chip = { | ||
45 | .name = "none", | ||
46 | .irq_startup = noop_ret, | ||
47 | .irq_shutdown = noop, | ||
48 | .irq_enable = noop, | ||
49 | .irq_disable = noop, | ||
50 | .irq_ack = ack_bad, | ||
51 | END_INIT | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * Generic dummy implementation which can be used for | ||
56 | * real dumb interrupt sources | ||
57 | */ | ||
58 | struct irq_chip dummy_irq_chip = { | ||
59 | .name = "dummy", | ||
60 | .irq_startup = noop_ret, | ||
61 | .irq_shutdown = noop, | ||
62 | .irq_enable = noop, | ||
63 | .irq_disable = noop, | ||
64 | .irq_ack = noop, | ||
65 | .irq_mask = noop, | ||
66 | .irq_unmask = noop, | ||
67 | END_INIT | ||
68 | }; | ||
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c6911223..e2347eb63306 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -11,24 +11,15 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
14 | #include <linux/sched.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/random.h> | 14 | #include <linux/random.h> |
15 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
19 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
20 | #include <linux/rculist.h> | 18 | |
21 | #include <linux/hash.h> | ||
22 | #include <linux/radix-tree.h> | ||
23 | #include <trace/events/irq.h> | 19 | #include <trace/events/irq.h> |
24 | 20 | ||
25 | #include "internals.h" | 21 | #include "internals.h" |
26 | 22 | ||
27 | /* | ||
28 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
29 | */ | ||
30 | struct lock_class_key irq_desc_lock_class; | ||
31 | |||
32 | /** | 23 | /** |
33 | * handle_bad_irq - handle spurious and unhandled irqs | 24 | * handle_bad_irq - handle spurious and unhandled irqs |
34 | * @irq: the interrupt number | 25 | * @irq: the interrupt number |
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) | |||
43 | ack_bad_irq(irq); | 34 | ack_bad_irq(irq); |
44 | } | 35 | } |
45 | 36 | ||
46 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | ||
47 | static void __init init_irq_default_affinity(void) | ||
48 | { | ||
49 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
50 | cpumask_setall(irq_default_affinity); | ||
51 | } | ||
52 | #else | ||
53 | static void __init init_irq_default_affinity(void) | ||
54 | { | ||
55 | } | ||
56 | #endif | ||
57 | |||
58 | /* | ||
59 | * Linux has a controller-independent interrupt architecture. | ||
60 | * Every controller has a 'controller-template', that is used | ||
61 | * by the main code to do the right thing. Each driver-visible | ||
62 | * interrupt source is transparently wired to the appropriate | ||
63 | * controller. Thus drivers need not be aware of the | ||
64 | * interrupt-controller. | ||
65 | * | ||
66 | * The code is designed to be easily extended with new/different | ||
67 | * interrupt controllers, without having to do assembly magic or | ||
68 | * having to touch the generic code. | ||
69 | * | ||
70 | * Controller mappings for all interrupt sources: | ||
71 | */ | ||
72 | int nr_irqs = NR_IRQS; | ||
73 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
74 | |||
75 | #ifdef CONFIG_SPARSE_IRQ | ||
76 | |||
77 | static struct irq_desc irq_desc_init = { | ||
78 | .irq = -1, | ||
79 | .status = IRQ_DISABLED, | ||
80 | .chip = &no_irq_chip, | ||
81 | .handle_irq = handle_bad_irq, | ||
82 | .depth = 1, | ||
83 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
84 | }; | ||
85 | |||
86 | void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | ||
87 | { | ||
88 | void *ptr; | ||
89 | |||
90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | ||
91 | GFP_ATOMIC, node); | ||
92 | |||
93 | /* | ||
94 | * don't overwite if can not get new one | ||
95 | * init_copy_kstat_irqs() could still use old one | ||
96 | */ | ||
97 | if (ptr) { | ||
98 | printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); | ||
99 | desc->kstat_irqs = ptr; | ||
100 | } | ||
101 | } | ||
102 | |||
103 | static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | ||
104 | { | ||
105 | memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); | ||
106 | |||
107 | raw_spin_lock_init(&desc->lock); | ||
108 | desc->irq = irq; | ||
109 | #ifdef CONFIG_SMP | ||
110 | desc->node = node; | ||
111 | #endif | ||
112 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
113 | init_kstat_irqs(desc, node, nr_cpu_ids); | ||
114 | if (!desc->kstat_irqs) { | ||
115 | printk(KERN_ERR "can not alloc kstat_irqs\n"); | ||
116 | BUG_ON(1); | ||
117 | } | ||
118 | if (!alloc_desc_masks(desc, node, false)) { | ||
119 | printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); | ||
120 | BUG_ON(1); | ||
121 | } | ||
122 | init_desc_masks(desc); | ||
123 | arch_init_chip_data(desc, node); | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Protect the sparse_irqs: | ||
128 | */ | ||
129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | ||
130 | |||
131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); | ||
132 | |||
133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
134 | { | ||
135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
136 | } | ||
137 | |||
138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
139 | { | ||
140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
141 | } | ||
142 | |||
143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
144 | { | ||
145 | void **ptr; | ||
146 | |||
147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
148 | if (ptr) | ||
149 | radix_tree_replace_slot(ptr, desc); | ||
150 | } | ||
151 | |||
152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | ||
153 | [0 ... NR_IRQS_LEGACY-1] = { | ||
154 | .irq = -1, | ||
155 | .status = IRQ_DISABLED, | ||
156 | .chip = &no_irq_chip, | ||
157 | .handle_irq = handle_bad_irq, | ||
158 | .depth = 1, | ||
159 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static unsigned int *kstat_irqs_legacy; | ||
164 | |||
165 | int __init early_irq_init(void) | ||
166 | { | ||
167 | struct irq_desc *desc; | ||
168 | int legacy_count; | ||
169 | int node; | ||
170 | int i; | ||
171 | |||
172 | init_irq_default_affinity(); | ||
173 | |||
174 | /* initialize nr_irqs based on nr_cpu_ids */ | ||
175 | arch_probe_nr_irqs(); | ||
176 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); | ||
177 | |||
178 | desc = irq_desc_legacy; | ||
179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | ||
180 | node = first_online_node; | ||
181 | |||
182 | /* allocate based on nr_cpu_ids */ | ||
183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | ||
184 | sizeof(int), GFP_NOWAIT, node); | ||
185 | |||
186 | for (i = 0; i < legacy_count; i++) { | ||
187 | desc[i].irq = i; | ||
188 | #ifdef CONFIG_SMP | ||
189 | desc[i].node = node; | ||
190 | #endif | ||
191 | desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; | ||
192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
193 | alloc_desc_masks(&desc[i], node, true); | ||
194 | init_desc_masks(&desc[i]); | ||
195 | set_irq_desc(i, &desc[i]); | ||
196 | } | ||
197 | |||
198 | return arch_early_irq_init(); | ||
199 | } | ||
200 | |||
201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | ||
202 | { | ||
203 | struct irq_desc *desc; | ||
204 | unsigned long flags; | ||
205 | |||
206 | if (irq >= nr_irqs) { | ||
207 | WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", | ||
208 | irq, nr_irqs); | ||
209 | return NULL; | ||
210 | } | ||
211 | |||
212 | desc = irq_to_desc(irq); | ||
213 | if (desc) | ||
214 | return desc; | ||
215 | |||
216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
217 | |||
218 | /* We have to check it to avoid races with another CPU */ | ||
219 | desc = irq_to_desc(irq); | ||
220 | if (desc) | ||
221 | goto out_unlock; | ||
222 | |||
223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
224 | |||
225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | ||
226 | if (!desc) { | ||
227 | printk(KERN_ERR "can not alloc irq_desc\n"); | ||
228 | BUG_ON(1); | ||
229 | } | ||
230 | init_one_irq_desc(irq, desc, node); | ||
231 | |||
232 | set_irq_desc(irq, desc); | ||
233 | |||
234 | out_unlock: | ||
235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
236 | |||
237 | return desc; | ||
238 | } | ||
239 | |||
240 | #else /* !CONFIG_SPARSE_IRQ */ | ||
241 | |||
242 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
243 | [0 ... NR_IRQS-1] = { | ||
244 | .status = IRQ_DISABLED, | ||
245 | .chip = &no_irq_chip, | ||
246 | .handle_irq = handle_bad_irq, | ||
247 | .depth = 1, | ||
248 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
249 | } | ||
250 | }; | ||
251 | |||
252 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
253 | int __init early_irq_init(void) | ||
254 | { | ||
255 | struct irq_desc *desc; | ||
256 | int count; | ||
257 | int i; | ||
258 | |||
259 | init_irq_default_affinity(); | ||
260 | |||
261 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
262 | |||
263 | desc = irq_desc; | ||
264 | count = ARRAY_SIZE(irq_desc); | ||
265 | |||
266 | for (i = 0; i < count; i++) { | ||
267 | desc[i].irq = i; | ||
268 | alloc_desc_masks(&desc[i], 0, true); | ||
269 | init_desc_masks(&desc[i]); | ||
270 | desc[i].kstat_irqs = kstat_irqs_all[i]; | ||
271 | } | ||
272 | return arch_early_irq_init(); | ||
273 | } | ||
274 | |||
275 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
276 | { | ||
277 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
278 | } | ||
279 | |||
280 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) | ||
281 | { | ||
282 | return irq_to_desc(irq); | ||
283 | } | ||
284 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
285 | |||
286 | void clear_kstat_irqs(struct irq_desc *desc) | ||
287 | { | ||
288 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * What should we do if we get a hw irq event on an illegal vector? | ||
293 | * Each architecture has to answer this themself. | ||
294 | */ | ||
295 | static void ack_bad(unsigned int irq) | ||
296 | { | ||
297 | struct irq_desc *desc = irq_to_desc(irq); | ||
298 | |||
299 | print_irq_desc(irq, desc); | ||
300 | ack_bad_irq(irq); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * NOP functions | ||
305 | */ | ||
306 | static void noop(unsigned int irq) | ||
307 | { | ||
308 | } | ||
309 | |||
310 | static unsigned int noop_ret(unsigned int irq) | ||
311 | { | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Generic no controller implementation | ||
317 | */ | ||
318 | struct irq_chip no_irq_chip = { | ||
319 | .name = "none", | ||
320 | .startup = noop_ret, | ||
321 | .shutdown = noop, | ||
322 | .enable = noop, | ||
323 | .disable = noop, | ||
324 | .ack = ack_bad, | ||
325 | .end = noop, | ||
326 | }; | ||
327 | |||
328 | /* | ||
329 | * Generic dummy implementation which can be used for | ||
330 | * real dumb interrupt sources | ||
331 | */ | ||
332 | struct irq_chip dummy_irq_chip = { | ||
333 | .name = "dummy", | ||
334 | .startup = noop_ret, | ||
335 | .shutdown = noop, | ||
336 | .enable = noop, | ||
337 | .disable = noop, | ||
338 | .ack = noop, | ||
339 | .mask = noop, | ||
340 | .unmask = noop, | ||
341 | .end = noop, | ||
342 | }; | ||
343 | |||
344 | /* | 37 | /* |
345 | * Special, empty irq handler: | 38 | * Special, empty irq handler: |
346 | */ | 39 | */ |
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq) | |||
457 | /* | 150 | /* |
458 | * No locking required for CPU-local interrupts: | 151 | * No locking required for CPU-local interrupts: |
459 | */ | 152 | */ |
460 | if (desc->chip->ack) | 153 | if (desc->irq_data.chip->ack) |
461 | desc->chip->ack(irq); | 154 | desc->irq_data.chip->ack(irq); |
462 | if (likely(!(desc->status & IRQ_DISABLED))) { | 155 | if (likely(!(desc->status & IRQ_DISABLED))) { |
463 | action_ret = handle_IRQ_event(irq, desc->action); | 156 | action_ret = handle_IRQ_event(irq, desc->action); |
464 | if (!noirqdebug) | 157 | if (!noirqdebug) |
465 | note_interrupt(irq, desc, action_ret); | 158 | note_interrupt(irq, desc, action_ret); |
466 | } | 159 | } |
467 | desc->chip->end(irq); | 160 | desc->irq_data.chip->end(irq); |
468 | return 1; | 161 | return 1; |
469 | } | 162 | } |
470 | 163 | ||
471 | raw_spin_lock(&desc->lock); | 164 | raw_spin_lock(&desc->lock); |
472 | if (desc->chip->ack) | 165 | if (desc->irq_data.chip->ack) |
473 | desc->chip->ack(irq); | 166 | desc->irq_data.chip->ack(irq); |
474 | /* | 167 | /* |
475 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 168 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
476 | * WAITING is used by probe to mark irqs that are being tested | 169 | * WAITING is used by probe to mark irqs that are being tested |
@@ -530,27 +223,9 @@ out: | |||
530 | * The ->end() handler has to deal with interrupts which got | 223 | * The ->end() handler has to deal with interrupts which got |
531 | * disabled while the handler was running. | 224 | * disabled while the handler was running. |
532 | */ | 225 | */ |
533 | desc->chip->end(irq); | 226 | desc->irq_data.chip->end(irq); |
534 | raw_spin_unlock(&desc->lock); | 227 | raw_spin_unlock(&desc->lock); |
535 | 228 | ||
536 | return 1; | 229 | return 1; |
537 | } | 230 | } |
538 | #endif | 231 | #endif |
539 | |||
540 | void early_init_irq_lock_class(void) | ||
541 | { | ||
542 | struct irq_desc *desc; | ||
543 | int i; | ||
544 | |||
545 | for_each_irq_desc(i, desc) { | ||
546 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
547 | } | ||
548 | } | ||
549 | |||
550 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | ||
551 | { | ||
552 | struct irq_desc *desc = irq_to_desc(irq); | ||
553 | return desc ? desc->kstat_irqs[cpu] : 0; | ||
554 | } | ||
555 | EXPORT_SYMBOL(kstat_irqs_cpu); | ||
556 | |||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0b..4571ae7e085a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -1,9 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * IRQ subsystem internal functions and variables: | 2 | * IRQ subsystem internal functions and variables: |
3 | */ | 3 | */ |
4 | #include <linux/irqdesc.h> | ||
4 | 5 | ||
5 | extern int noirqdebug; | 6 | extern int noirqdebug; |
6 | 7 | ||
8 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | ||
9 | |||
7 | /* Set default functions for irq_chip structures: */ | 10 | /* Set default functions for irq_chip structures: */ |
8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | 11 | extern void irq_chip_set_defaults(struct irq_chip *chip); |
9 | 12 | ||
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
15 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 18 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
16 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 19 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
17 | 20 | ||
18 | extern struct lock_class_key irq_desc_lock_class; | ||
19 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 21 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
20 | extern void clear_kstat_irqs(struct irq_desc *desc); | ||
21 | extern raw_spinlock_t sparse_irq_lock; | ||
22 | 22 | ||
23 | #ifdef CONFIG_SPARSE_IRQ | 23 | /* Resending of interrupts :*/ |
24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); | 24 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); |
25 | #endif | ||
26 | 25 | ||
27 | #ifdef CONFIG_PROC_FS | 26 | #ifdef CONFIG_PROC_FS |
28 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); | 27 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); |
28 | extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); | ||
29 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 29 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
30 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); | 30 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); |
31 | #else | 31 | #else |
32 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } | 32 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } |
33 | static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } | ||
33 | static inline void register_handler_proc(unsigned int irq, | 34 | static inline void register_handler_proc(unsigned int irq, |
34 | struct irqaction *action) { } | 35 | struct irqaction *action) { } |
35 | static inline void unregister_handler_proc(unsigned int irq, | 36 | static inline void unregister_handler_proc(unsigned int irq, |
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq); | |||
40 | 41 | ||
41 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 42 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
42 | 43 | ||
44 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
45 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) | ||
46 | { | ||
47 | if (desc->irq_data.chip && desc->irq_data.chip->end) | ||
48 | desc->irq_data.chip->end(irq); | ||
49 | } | ||
50 | #else | ||
51 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } | ||
52 | #endif | ||
53 | |||
43 | /* Inline functions for support of irq chips on slow busses */ | 54 | /* Inline functions for support of irq chips on slow busses */ |
44 | static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) | 55 | static inline void chip_bus_lock(struct irq_desc *desc) |
45 | { | 56 | { |
46 | if (unlikely(desc->chip->bus_lock)) | 57 | if (unlikely(desc->irq_data.chip->irq_bus_lock)) |
47 | desc->chip->bus_lock(irq); | 58 | desc->irq_data.chip->irq_bus_lock(&desc->irq_data); |
48 | } | 59 | } |
49 | 60 | ||
50 | static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) | 61 | static inline void chip_bus_sync_unlock(struct irq_desc *desc) |
51 | { | 62 | { |
52 | if (unlikely(desc->chip->bus_sync_unlock)) | 63 | if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) |
53 | desc->chip->bus_sync_unlock(irq); | 64 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); |
54 | } | 65 | } |
55 | 66 | ||
56 | /* | 67 | /* |
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
67 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | 78 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); |
68 | printk("->handle_irq(): %p, ", desc->handle_irq); | 79 | printk("->handle_irq(): %p, ", desc->handle_irq); |
69 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | 80 | print_symbol("%s\n", (unsigned long)desc->handle_irq); |
70 | printk("->chip(): %p, ", desc->chip); | 81 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); |
71 | print_symbol("%s\n", (unsigned long)desc->chip); | 82 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); |
72 | printk("->action(): %p\n", desc->action); | 83 | printk("->action(): %p\n", desc->action); |
73 | if (desc->action) { | 84 | if (desc->action) { |
74 | printk("->action->handler(): %p, ", desc->action->handler); | 85 | printk("->action->handler(): %p, ", desc->action->handler); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 000000000000..9988d03797f5 --- /dev/null +++ b/kernel/irq/irqdesc.c | |||
@@ -0,0 +1,410 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the interrupt descriptor management code | ||
6 | * | ||
7 | * Detailed information is available in Documentation/DocBook/genericirq | ||
8 | * | ||
9 | */ | ||
10 | #include <linux/irq.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/kernel_stat.h> | ||
15 | #include <linux/radix-tree.h> | ||
16 | #include <linux/bitmap.h> | ||
17 | |||
18 | #include "internals.h" | ||
19 | |||
20 | /* | ||
21 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
22 | */ | ||
23 | static struct lock_class_key irq_desc_lock_class; | ||
24 | |||
25 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | ||
26 | static void __init init_irq_default_affinity(void) | ||
27 | { | ||
28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
29 | cpumask_setall(irq_default_affinity); | ||
30 | } | ||
31 | #else | ||
32 | static void __init init_irq_default_affinity(void) | ||
33 | { | ||
34 | } | ||
35 | #endif | ||
36 | |||
37 | #ifdef CONFIG_SMP | ||
38 | static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | ||
39 | { | ||
40 | if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) | ||
41 | return -ENOMEM; | ||
42 | |||
43 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
44 | if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { | ||
45 | free_cpumask_var(desc->irq_data.affinity); | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | #endif | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static void desc_smp_init(struct irq_desc *desc, int node) | ||
53 | { | ||
54 | desc->irq_data.node = node; | ||
55 | cpumask_copy(desc->irq_data.affinity, irq_default_affinity); | ||
56 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
57 | cpumask_clear(desc->pending_mask); | ||
58 | #endif | ||
59 | } | ||
60 | |||
61 | static inline int desc_node(struct irq_desc *desc) | ||
62 | { | ||
63 | return desc->irq_data.node; | ||
64 | } | ||
65 | |||
66 | #else | ||
67 | static inline int | ||
68 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | ||
69 | static inline void desc_smp_init(struct irq_desc *desc, int node) { } | ||
70 | static inline int desc_node(struct irq_desc *desc) { return 0; } | ||
71 | #endif | ||
72 | |||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | ||
74 | { | ||
75 | desc->irq_data.irq = irq; | ||
76 | desc->irq_data.chip = &no_irq_chip; | ||
77 | desc->irq_data.chip_data = NULL; | ||
78 | desc->irq_data.handler_data = NULL; | ||
79 | desc->irq_data.msi_desc = NULL; | ||
80 | desc->status = IRQ_DEFAULT_INIT_FLAGS; | ||
81 | desc->handle_irq = handle_bad_irq; | ||
82 | desc->depth = 1; | ||
83 | desc->irq_count = 0; | ||
84 | desc->irqs_unhandled = 0; | ||
85 | desc->name = NULL; | ||
86 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | ||
87 | desc_smp_init(desc, node); | ||
88 | } | ||
89 | |||
90 | int nr_irqs = NR_IRQS; | ||
91 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
92 | |||
93 | static DEFINE_MUTEX(sparse_irq_lock); | ||
94 | static DECLARE_BITMAP(allocated_irqs, NR_IRQS); | ||
95 | |||
96 | #ifdef CONFIG_SPARSE_IRQ | ||
97 | |||
98 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | ||
99 | |||
100 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | ||
101 | { | ||
102 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
103 | } | ||
104 | |||
105 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
106 | { | ||
107 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
108 | } | ||
109 | |||
110 | static void delete_irq_desc(unsigned int irq) | ||
111 | { | ||
112 | radix_tree_delete(&irq_desc_tree, irq); | ||
113 | } | ||
114 | |||
115 | #ifdef CONFIG_SMP | ||
116 | static void free_masks(struct irq_desc *desc) | ||
117 | { | ||
118 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
119 | free_cpumask_var(desc->pending_mask); | ||
120 | #endif | ||
121 | free_cpumask_var(desc->irq_data.affinity); | ||
122 | } | ||
123 | #else | ||
124 | static inline void free_masks(struct irq_desc *desc) { } | ||
125 | #endif | ||
126 | |||
127 | static struct irq_desc *alloc_desc(int irq, int node) | ||
128 | { | ||
129 | struct irq_desc *desc; | ||
130 | gfp_t gfp = GFP_KERNEL; | ||
131 | |||
132 | desc = kzalloc_node(sizeof(*desc), gfp, node); | ||
133 | if (!desc) | ||
134 | return NULL; | ||
135 | /* allocate based on nr_cpu_ids */ | ||
136 | desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), | ||
137 | gfp, node); | ||
138 | if (!desc->kstat_irqs) | ||
139 | goto err_desc; | ||
140 | |||
141 | if (alloc_masks(desc, gfp, node)) | ||
142 | goto err_kstat; | ||
143 | |||
144 | raw_spin_lock_init(&desc->lock); | ||
145 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
146 | |||
147 | desc_set_defaults(irq, desc, node); | ||
148 | |||
149 | return desc; | ||
150 | |||
151 | err_kstat: | ||
152 | kfree(desc->kstat_irqs); | ||
153 | err_desc: | ||
154 | kfree(desc); | ||
155 | return NULL; | ||
156 | } | ||
157 | |||
158 | static void free_desc(unsigned int irq) | ||
159 | { | ||
160 | struct irq_desc *desc = irq_to_desc(irq); | ||
161 | |||
162 | unregister_irq_proc(irq, desc); | ||
163 | |||
164 | mutex_lock(&sparse_irq_lock); | ||
165 | delete_irq_desc(irq); | ||
166 | mutex_unlock(&sparse_irq_lock); | ||
167 | |||
168 | free_masks(desc); | ||
169 | kfree(desc->kstat_irqs); | ||
170 | kfree(desc); | ||
171 | } | ||
172 | |||
173 | static int alloc_descs(unsigned int start, unsigned int cnt, int node) | ||
174 | { | ||
175 | struct irq_desc *desc; | ||
176 | int i; | ||
177 | |||
178 | for (i = 0; i < cnt; i++) { | ||
179 | desc = alloc_desc(start + i, node); | ||
180 | if (!desc) | ||
181 | goto err; | ||
182 | mutex_lock(&sparse_irq_lock); | ||
183 | irq_insert_desc(start + i, desc); | ||
184 | mutex_unlock(&sparse_irq_lock); | ||
185 | } | ||
186 | return start; | ||
187 | |||
188 | err: | ||
189 | for (i--; i >= 0; i--) | ||
190 | free_desc(start + i); | ||
191 | |||
192 | mutex_lock(&sparse_irq_lock); | ||
193 | bitmap_clear(allocated_irqs, start, cnt); | ||
194 | mutex_unlock(&sparse_irq_lock); | ||
195 | return -ENOMEM; | ||
196 | } | ||
197 | |||
198 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | ||
199 | { | ||
200 | int res = irq_alloc_descs(irq, irq, 1, node); | ||
201 | |||
202 | if (res == -EEXIST || res == irq) | ||
203 | return irq_to_desc(irq); | ||
204 | return NULL; | ||
205 | } | ||
206 | |||
207 | int __init early_irq_init(void) | ||
208 | { | ||
209 | int i, initcnt, node = first_online_node; | ||
210 | struct irq_desc *desc; | ||
211 | |||
212 | init_irq_default_affinity(); | ||
213 | |||
214 | /* Let arch update nr_irqs and return the nr of preallocated irqs */ | ||
215 | initcnt = arch_probe_nr_irqs(); | ||
216 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | ||
217 | |||
218 | for (i = 0; i < initcnt; i++) { | ||
219 | desc = alloc_desc(i, node); | ||
220 | set_bit(i, allocated_irqs); | ||
221 | irq_insert_desc(i, desc); | ||
222 | } | ||
223 | return arch_early_irq_init(); | ||
224 | } | ||
225 | |||
226 | #else /* !CONFIG_SPARSE_IRQ */ | ||
227 | |||
228 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
229 | [0 ... NR_IRQS-1] = { | ||
230 | .status = IRQ_DEFAULT_INIT_FLAGS, | ||
231 | .handle_irq = handle_bad_irq, | ||
232 | .depth = 1, | ||
233 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
234 | } | ||
235 | }; | ||
236 | |||
237 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
238 | int __init early_irq_init(void) | ||
239 | { | ||
240 | int count, i, node = first_online_node; | ||
241 | struct irq_desc *desc; | ||
242 | |||
243 | init_irq_default_affinity(); | ||
244 | |||
245 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
246 | |||
247 | desc = irq_desc; | ||
248 | count = ARRAY_SIZE(irq_desc); | ||
249 | |||
250 | for (i = 0; i < count; i++) { | ||
251 | desc[i].irq_data.irq = i; | ||
252 | desc[i].irq_data.chip = &no_irq_chip; | ||
253 | desc[i].kstat_irqs = kstat_irqs_all[i]; | ||
254 | alloc_masks(desc + i, GFP_KERNEL, node); | ||
255 | desc_smp_init(desc + i, node); | ||
256 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
257 | } | ||
258 | return arch_early_irq_init(); | ||
259 | } | ||
260 | |||
261 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
262 | { | ||
263 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
264 | } | ||
265 | |||
/*
 * With the static descriptor table there is nothing to allocate, so this
 * is a plain lookup; @node is not used.
 */
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
	struct irq_desc *desc = irq_to_desc(irq);

	return desc;
}
270 | |||
/*
 * "Freeing" a statically allocated descriptor just hands it to
 * dynamic_irq_cleanup().
 */
static void free_desc(unsigned int irq)
{
	dynamic_irq_cleanup(irq);
}
275 | |||
/*
 * Nothing to allocate in the !CONFIG_SPARSE_IRQ case: report @start as the
 * first irq of the range.  @cnt and @node are not used.
 */
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
	const int first = start;

	return first;
}
280 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
281 | |||
282 | /* Dynamic interrupt handling */ | ||
283 | |||
284 | /** | ||
285 | * irq_free_descs - free irq descriptors | ||
286 | * @from: Start of descriptor range | ||
287 | * @cnt: Number of consecutive irqs to free | ||
288 | */ | ||
289 | void irq_free_descs(unsigned int from, unsigned int cnt) | ||
290 | { | ||
291 | int i; | ||
292 | |||
293 | if (from >= nr_irqs || (from + cnt) > nr_irqs) | ||
294 | return; | ||
295 | |||
296 | for (i = 0; i < cnt; i++) | ||
297 | free_desc(from + i); | ||
298 | |||
299 | mutex_lock(&sparse_irq_lock); | ||
300 | bitmap_clear(allocated_irqs, from, cnt); | ||
301 | mutex_unlock(&sparse_irq_lock); | ||
302 | } | ||
303 | |||
304 | /** | ||
305 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | ||
306 | * @irq: Allocate for specific irq number if irq >= 0 | ||
307 | * @from: Start the search from this irq number | ||
308 | * @cnt: Number of consecutive irqs to allocate. | ||
309 | * @node: Preferred node on which the irq descriptor should be allocated | ||
310 | * | ||
311 | * Returns the first irq number or error code | ||
312 | */ | ||
313 | int __ref | ||
314 | irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | ||
315 | { | ||
316 | int start, ret; | ||
317 | |||
318 | if (!cnt) | ||
319 | return -EINVAL; | ||
320 | |||
321 | mutex_lock(&sparse_irq_lock); | ||
322 | |||
323 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | ||
324 | ret = -EEXIST; | ||
325 | if (irq >=0 && start != irq) | ||
326 | goto err; | ||
327 | |||
328 | ret = -ENOMEM; | ||
329 | if (start >= nr_irqs) | ||
330 | goto err; | ||
331 | |||
332 | bitmap_set(allocated_irqs, start, cnt); | ||
333 | mutex_unlock(&sparse_irq_lock); | ||
334 | return alloc_descs(start, cnt, node); | ||
335 | |||
336 | err: | ||
337 | mutex_unlock(&sparse_irq_lock); | ||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | /** | ||
342 | * irq_reserve_irqs - mark irqs allocated | ||
343 | * @from: mark from irq number | ||
344 | * @cnt: number of irqs to mark | ||
345 | * | ||
346 | * Returns 0 on success or an appropriate error code | ||
347 | */ | ||
348 | int irq_reserve_irqs(unsigned int from, unsigned int cnt) | ||
349 | { | ||
350 | unsigned int start; | ||
351 | int ret = 0; | ||
352 | |||
353 | if (!cnt || (from + cnt) > nr_irqs) | ||
354 | return -EINVAL; | ||
355 | |||
356 | mutex_lock(&sparse_irq_lock); | ||
357 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | ||
358 | if (start == from) | ||
359 | bitmap_set(allocated_irqs, start, cnt); | ||
360 | else | ||
361 | ret = -EEXIST; | ||
362 | mutex_unlock(&sparse_irq_lock); | ||
363 | return ret; | ||
364 | } | ||
365 | |||
366 | /** | ||
367 | * irq_get_next_irq - get next allocated irq number | ||
368 | * @offset: where to start the search | ||
369 | * | ||
370 | * Returns next irq number after offset or nr_irqs if none is found. | ||
371 | */ | ||
372 | unsigned int irq_get_next_irq(unsigned int offset) | ||
373 | { | ||
374 | return find_next_bit(allocated_irqs, nr_irqs, offset); | ||
375 | } | ||
376 | |||
377 | /** | ||
378 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
379 | * @irq: irq number to initialize | ||
380 | */ | ||
381 | void dynamic_irq_cleanup(unsigned int irq) | ||
382 | { | ||
383 | struct irq_desc *desc = irq_to_desc(irq); | ||
384 | unsigned long flags; | ||
385 | |||
386 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
387 | desc_set_defaults(irq, desc, desc_node(desc)); | ||
388 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
389 | } | ||
390 | |||
391 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | ||
392 | { | ||
393 | struct irq_desc *desc = irq_to_desc(irq); | ||
394 | return desc ? desc->kstat_irqs[cpu] : 0; | ||
395 | } | ||
396 | |||
#ifdef CONFIG_GENERIC_HARDIRQS
/*
 * Return the interrupt count of @irq summed over all possible CPUs, or 0
 * when @irq has no descriptor.
 */
unsigned int kstat_irqs(unsigned int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);
	/* Unsigned like the counters and the return type: no signed detour. */
	unsigned int sum = 0;
	int cpu;

	if (!desc)
		return 0;
	for_each_possible_cpu(cpu)
		sum += desc->kstat_irqs[cpu];
	return sum;
}
#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c3003e9d91a3..644e8d5fa367 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) | |||
73 | { | 73 | { |
74 | struct irq_desc *desc = irq_to_desc(irq); | 74 | struct irq_desc *desc = irq_to_desc(irq); |
75 | 75 | ||
76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || | 76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || |
77 | !desc->chip->set_affinity) | 77 | !desc->irq_data.chip->irq_set_affinity) |
78 | return 0; | 78 | return 0; |
79 | 79 | ||
80 | return 1; | 80 | return 1; |
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | 109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) |
110 | { | 110 | { |
111 | struct irq_desc *desc = irq_to_desc(irq); | 111 | struct irq_desc *desc = irq_to_desc(irq); |
112 | struct irq_chip *chip = desc->irq_data.chip; | ||
112 | unsigned long flags; | 113 | unsigned long flags; |
113 | 114 | ||
114 | if (!desc->chip->set_affinity) | 115 | if (!chip->irq_set_affinity) |
115 | return -EINVAL; | 116 | return -EINVAL; |
116 | 117 | ||
117 | raw_spin_lock_irqsave(&desc->lock, flags); | 118 | raw_spin_lock_irqsave(&desc->lock, flags); |
118 | 119 | ||
119 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 120 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
120 | if (desc->status & IRQ_MOVE_PCNTXT) { | 121 | if (desc->status & IRQ_MOVE_PCNTXT) { |
121 | if (!desc->chip->set_affinity(irq, cpumask)) { | 122 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { |
122 | cpumask_copy(desc->affinity, cpumask); | 123 | cpumask_copy(desc->irq_data.affinity, cpumask); |
123 | irq_set_thread_affinity(desc); | 124 | irq_set_thread_affinity(desc); |
124 | } | 125 | } |
125 | } | 126 | } |
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | |||
128 | cpumask_copy(desc->pending_mask, cpumask); | 129 | cpumask_copy(desc->pending_mask, cpumask); |
129 | } | 130 | } |
130 | #else | 131 | #else |
131 | if (!desc->chip->set_affinity(irq, cpumask)) { | 132 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { |
132 | cpumask_copy(desc->affinity, cpumask); | 133 | cpumask_copy(desc->irq_data.affinity, cpumask); |
133 | irq_set_thread_affinity(desc); | 134 | irq_set_thread_affinity(desc); |
134 | } | 135 | } |
135 | #endif | 136 | #endif |
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
168 | * one of the targets is online. | 169 | * one of the targets is online. |
169 | */ | 170 | */ |
170 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { | 171 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { |
171 | if (cpumask_any_and(desc->affinity, cpu_online_mask) | 172 | if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) |
172 | < nr_cpu_ids) | 173 | < nr_cpu_ids) |
173 | goto set_affinity; | 174 | goto set_affinity; |
174 | else | 175 | else |
175 | desc->status &= ~IRQ_AFFINITY_SET; | 176 | desc->status &= ~IRQ_AFFINITY_SET; |
176 | } | 177 | } |
177 | 178 | ||
178 | cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); | 179 | cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); |
179 | set_affinity: | 180 | set_affinity: |
180 | desc->chip->set_affinity(irq, desc->affinity); | 181 | desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); |
181 | 182 | ||
182 | return 0; | 183 | return 0; |
183 | } | 184 | } |
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
223 | 224 | ||
224 | if (!desc->depth++) { | 225 | if (!desc->depth++) { |
225 | desc->status |= IRQ_DISABLED; | 226 | desc->status |= IRQ_DISABLED; |
226 | desc->chip->disable(irq); | 227 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
227 | } | 228 | } |
228 | } | 229 | } |
229 | 230 | ||
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq) | |||
246 | if (!desc) | 247 | if (!desc) |
247 | return; | 248 | return; |
248 | 249 | ||
249 | chip_bus_lock(irq, desc); | 250 | chip_bus_lock(desc); |
250 | raw_spin_lock_irqsave(&desc->lock, flags); | 251 | raw_spin_lock_irqsave(&desc->lock, flags); |
251 | __disable_irq(desc, irq, false); | 252 | __disable_irq(desc, irq, false); |
252 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 253 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
253 | chip_bus_sync_unlock(irq, desc); | 254 | chip_bus_sync_unlock(desc); |
254 | } | 255 | } |
255 | EXPORT_SYMBOL(disable_irq_nosync); | 256 | EXPORT_SYMBOL(disable_irq_nosync); |
256 | 257 | ||
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
313 | * IRQ line is re-enabled. | 314 | * IRQ line is re-enabled. |
314 | * | 315 | * |
315 | * This function may be called from IRQ context only when | 316 | * This function may be called from IRQ context only when |
316 | * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! | 317 | * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! |
317 | */ | 318 | */ |
318 | void enable_irq(unsigned int irq) | 319 | void enable_irq(unsigned int irq) |
319 | { | 320 | { |
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq) | |||
323 | if (!desc) | 324 | if (!desc) |
324 | return; | 325 | return; |
325 | 326 | ||
326 | chip_bus_lock(irq, desc); | 327 | chip_bus_lock(desc); |
327 | raw_spin_lock_irqsave(&desc->lock, flags); | 328 | raw_spin_lock_irqsave(&desc->lock, flags); |
328 | __enable_irq(desc, irq, false); | 329 | __enable_irq(desc, irq, false); |
329 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 330 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
330 | chip_bus_sync_unlock(irq, desc); | 331 | chip_bus_sync_unlock(desc); |
331 | } | 332 | } |
332 | EXPORT_SYMBOL(enable_irq); | 333 | EXPORT_SYMBOL(enable_irq); |
333 | 334 | ||
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
336 | struct irq_desc *desc = irq_to_desc(irq); | 337 | struct irq_desc *desc = irq_to_desc(irq); |
337 | int ret = -ENXIO; | 338 | int ret = -ENXIO; |
338 | 339 | ||
339 | if (desc->chip->set_wake) | 340 | if (desc->irq_data.chip->irq_set_wake) |
340 | ret = desc->chip->set_wake(irq, on); | 341 | ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); |
341 | 342 | ||
342 | return ret; | 343 | return ret; |
343 | } | 344 | } |
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) | |||
429 | } | 430 | } |
430 | 431 | ||
431 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 432 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
432 | unsigned long flags) | 433 | unsigned long flags) |
433 | { | 434 | { |
434 | int ret; | 435 | int ret; |
435 | struct irq_chip *chip = desc->chip; | 436 | struct irq_chip *chip = desc->irq_data.chip; |
436 | 437 | ||
437 | if (!chip || !chip->set_type) { | 438 | if (!chip || !chip->irq_set_type) { |
438 | /* | 439 | /* |
439 | * IRQF_TRIGGER_* but the PIC does not support multiple | 440 | * IRQF_TRIGGER_* but the PIC does not support multiple |
440 | * flow-types? | 441 | * flow-types? |
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
445 | } | 446 | } |
446 | 447 | ||
447 | /* caller masked out all except trigger mode flags */ | 448 | /* caller masked out all except trigger mode flags */ |
448 | ret = chip->set_type(irq, flags); | 449 | ret = chip->irq_set_type(&desc->irq_data, flags); |
449 | 450 | ||
450 | if (ret) | 451 | if (ret) |
451 | pr_err("setting trigger mode %d for irq %u failed (%pF)\n", | 452 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", |
452 | (int)flags, irq, chip->set_type); | 453 | flags, irq, chip->irq_set_type); |
453 | else { | 454 | else { |
454 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) | 455 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) |
455 | flags |= IRQ_LEVEL; | 456 | flags |= IRQ_LEVEL; |
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
457 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); | 458 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); |
458 | desc->status |= flags; | 459 | desc->status |= flags; |
459 | 460 | ||
460 | if (chip != desc->chip) | 461 | if (chip != desc->irq_data.chip) |
461 | irq_chip_set_defaults(desc->chip); | 462 | irq_chip_set_defaults(desc->irq_data.chip); |
462 | } | 463 | } |
463 | 464 | ||
464 | return ret; | 465 | return ret; |
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
507 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 508 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) |
508 | { | 509 | { |
509 | again: | 510 | again: |
510 | chip_bus_lock(irq, desc); | 511 | chip_bus_lock(desc); |
511 | raw_spin_lock_irq(&desc->lock); | 512 | raw_spin_lock_irq(&desc->lock); |
512 | 513 | ||
513 | /* | 514 | /* |
@@ -521,17 +522,17 @@ again: | |||
521 | */ | 522 | */ |
522 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | 523 | if (unlikely(desc->status & IRQ_INPROGRESS)) { |
523 | raw_spin_unlock_irq(&desc->lock); | 524 | raw_spin_unlock_irq(&desc->lock); |
524 | chip_bus_sync_unlock(irq, desc); | 525 | chip_bus_sync_unlock(desc); |
525 | cpu_relax(); | 526 | cpu_relax(); |
526 | goto again; | 527 | goto again; |
527 | } | 528 | } |
528 | 529 | ||
529 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 530 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { |
530 | desc->status &= ~IRQ_MASKED; | 531 | desc->status &= ~IRQ_MASKED; |
531 | desc->chip->unmask(irq); | 532 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
532 | } | 533 | } |
533 | raw_spin_unlock_irq(&desc->lock); | 534 | raw_spin_unlock_irq(&desc->lock); |
534 | chip_bus_sync_unlock(irq, desc); | 535 | chip_bus_sync_unlock(desc); |
535 | } | 536 | } |
536 | 537 | ||
537 | #ifdef CONFIG_SMP | 538 | #ifdef CONFIG_SMP |
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
556 | } | 557 | } |
557 | 558 | ||
558 | raw_spin_lock_irq(&desc->lock); | 559 | raw_spin_lock_irq(&desc->lock); |
559 | cpumask_copy(mask, desc->affinity); | 560 | cpumask_copy(mask, desc->irq_data.affinity); |
560 | raw_spin_unlock_irq(&desc->lock); | 561 | raw_spin_unlock_irq(&desc->lock); |
561 | 562 | ||
562 | set_cpus_allowed_ptr(current, mask); | 563 | set_cpus_allowed_ptr(current, mask); |
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
657 | if (!desc) | 658 | if (!desc) |
658 | return -EINVAL; | 659 | return -EINVAL; |
659 | 660 | ||
660 | if (desc->chip == &no_irq_chip) | 661 | if (desc->irq_data.chip == &no_irq_chip) |
661 | return -ENOSYS; | 662 | return -ENOSYS; |
662 | /* | 663 | /* |
663 | * Some drivers like serial.c use request_irq() heavily, | 664 | * Some drivers like serial.c use request_irq() heavily, |
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
752 | } | 753 | } |
753 | 754 | ||
754 | if (!shared) { | 755 | if (!shared) { |
755 | irq_chip_set_defaults(desc->chip); | 756 | irq_chip_set_defaults(desc->irq_data.chip); |
756 | 757 | ||
757 | init_waitqueue_head(&desc->wait_for_threads); | 758 | init_waitqueue_head(&desc->wait_for_threads); |
758 | 759 | ||
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
779 | if (!(desc->status & IRQ_NOAUTOEN)) { | 780 | if (!(desc->status & IRQ_NOAUTOEN)) { |
780 | desc->depth = 0; | 781 | desc->depth = 0; |
781 | desc->status &= ~IRQ_DISABLED; | 782 | desc->status &= ~IRQ_DISABLED; |
782 | desc->chip->startup(irq); | 783 | desc->irq_data.chip->irq_startup(&desc->irq_data); |
783 | } else | 784 | } else |
784 | /* Undo nested disables: */ | 785 | /* Undo nested disables: */ |
785 | desc->depth = 1; | 786 | desc->depth = 1; |
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
912 | 913 | ||
913 | /* Currently used only by UML, might disappear one day: */ | 914 | /* Currently used only by UML, might disappear one day: */ |
914 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 915 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
915 | if (desc->chip->release) | 916 | if (desc->irq_data.chip->release) |
916 | desc->chip->release(irq, dev_id); | 917 | desc->irq_data.chip->release(irq, dev_id); |
917 | #endif | 918 | #endif |
918 | 919 | ||
919 | /* If this was the last handler, shut down the IRQ line: */ | 920 | /* If this was the last handler, shut down the IRQ line: */ |
920 | if (!desc->action) { | 921 | if (!desc->action) { |
921 | desc->status |= IRQ_DISABLED; | 922 | desc->status |= IRQ_DISABLED; |
922 | if (desc->chip->shutdown) | 923 | if (desc->irq_data.chip->irq_shutdown) |
923 | desc->chip->shutdown(irq); | 924 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
924 | else | 925 | else |
925 | desc->chip->disable(irq); | 926 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
926 | } | 927 | } |
927 | 928 | ||
928 | #ifdef CONFIG_SMP | 929 | #ifdef CONFIG_SMP |
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
997 | if (!desc) | 998 | if (!desc) |
998 | return; | 999 | return; |
999 | 1000 | ||
1000 | chip_bus_lock(irq, desc); | 1001 | chip_bus_lock(desc); |
1001 | kfree(__free_irq(irq, dev_id)); | 1002 | kfree(__free_irq(irq, dev_id)); |
1002 | chip_bus_sync_unlock(irq, desc); | 1003 | chip_bus_sync_unlock(desc); |
1003 | } | 1004 | } |
1004 | EXPORT_SYMBOL(free_irq); | 1005 | EXPORT_SYMBOL(free_irq); |
1005 | 1006 | ||
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1086 | action->name = devname; | 1087 | action->name = devname; |
1087 | action->dev_id = dev_id; | 1088 | action->dev_id = dev_id; |
1088 | 1089 | ||
1089 | chip_bus_lock(irq, desc); | 1090 | chip_bus_lock(desc); |
1090 | retval = __setup_irq(irq, desc, action); | 1091 | retval = __setup_irq(irq, desc, action); |
1091 | chip_bus_sync_unlock(irq, desc); | 1092 | chip_bus_sync_unlock(desc); |
1092 | 1093 | ||
1093 | if (retval) | 1094 | if (retval) |
1094 | kfree(action); | 1095 | kfree(action); |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 241962280836..1d2541940480 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -7,6 +7,7 @@ | |||
7 | void move_masked_irq(int irq) | 7 | void move_masked_irq(int irq) |
8 | { | 8 | { |
9 | struct irq_desc *desc = irq_to_desc(irq); | 9 | struct irq_desc *desc = irq_to_desc(irq); |
10 | struct irq_chip *chip = desc->irq_data.chip; | ||
10 | 11 | ||
11 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 12 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) |
12 | return; | 13 | return; |
@@ -24,7 +25,7 @@ void move_masked_irq(int irq) | |||
24 | if (unlikely(cpumask_empty(desc->pending_mask))) | 25 | if (unlikely(cpumask_empty(desc->pending_mask))) |
25 | return; | 26 | return; |
26 | 27 | ||
27 | if (!desc->chip->set_affinity) | 28 | if (!chip->irq_set_affinity) |
28 | return; | 29 | return; |
29 | 30 | ||
30 | assert_raw_spin_locked(&desc->lock); | 31 | assert_raw_spin_locked(&desc->lock); |
@@ -43,8 +44,9 @@ void move_masked_irq(int irq) | |||
43 | */ | 44 | */ |
44 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) |
45 | < nr_cpu_ids)) | 46 | < nr_cpu_ids)) |
46 | if (!desc->chip->set_affinity(irq, desc->pending_mask)) { | 47 | if (!chip->irq_set_affinity(&desc->irq_data, |
47 | cpumask_copy(desc->affinity, desc->pending_mask); | 48 | desc->pending_mask, false)) { |
49 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
48 | irq_set_thread_affinity(desc); | 50 | irq_set_thread_affinity(desc); |
49 | } | 51 | } |
50 | 52 | ||
@@ -61,8 +63,8 @@ void move_native_irq(int irq) | |||
61 | if (unlikely(desc->status & IRQ_DISABLED)) | 63 | if (unlikely(desc->status & IRQ_DISABLED)) |
62 | return; | 64 | return; |
63 | 65 | ||
64 | desc->chip->mask(irq); | 66 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
65 | move_masked_irq(irq); | 67 | move_masked_irq(irq); |
66 | desc->chip->unmask(irq); | 68 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
67 | } | 69 | } |
68 | 70 | ||
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845665ac..000000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null | |||
@@ -1,120 +0,0 @@ | |||
1 | /* | ||
2 | * NUMA irq-desc migration code | ||
3 | * | ||
4 | * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to | ||
5 | * the new "home node" of the IRQ. | ||
6 | */ | ||
7 | |||
8 | #include <linux/irq.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | |||
15 | #include "internals.h" | ||
16 | |||
17 | static void init_copy_kstat_irqs(struct irq_desc *old_desc, | ||
18 | struct irq_desc *desc, | ||
19 | int node, int nr) | ||
20 | { | ||
21 | init_kstat_irqs(desc, node, nr); | ||
22 | |||
23 | if (desc->kstat_irqs != old_desc->kstat_irqs) | ||
24 | memcpy(desc->kstat_irqs, old_desc->kstat_irqs, | ||
25 | nr * sizeof(*desc->kstat_irqs)); | ||
26 | } | ||
27 | |||
28 | static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) | ||
29 | { | ||
30 | if (old_desc->kstat_irqs == desc->kstat_irqs) | ||
31 | return; | ||
32 | |||
33 | kfree(old_desc->kstat_irqs); | ||
34 | old_desc->kstat_irqs = NULL; | ||
35 | } | ||
36 | |||
37 | static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, | ||
38 | struct irq_desc *desc, int node) | ||
39 | { | ||
40 | memcpy(desc, old_desc, sizeof(struct irq_desc)); | ||
41 | if (!alloc_desc_masks(desc, node, false)) { | ||
42 | printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " | ||
43 | "for migration.\n", irq); | ||
44 | return false; | ||
45 | } | ||
46 | raw_spin_lock_init(&desc->lock); | ||
47 | desc->node = node; | ||
48 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
49 | init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); | ||
50 | init_copy_desc_masks(old_desc, desc); | ||
51 | arch_init_copy_chip_data(old_desc, desc, node); | ||
52 | return true; | ||
53 | } | ||
54 | |||
/*
 * Release what @old_desc owns after it has been replaced by @desc:
 * counters, cpumasks and arch chip data.  The descriptor itself is not
 * freed here.
 */
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
{
	free_kstat_irqs(old_desc, desc);
	free_desc_masks(old_desc, desc);
	arch_free_chip_data(old_desc, desc);
}
61 | |||
62 | static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | ||
63 | int node) | ||
64 | { | ||
65 | struct irq_desc *desc; | ||
66 | unsigned int irq; | ||
67 | unsigned long flags; | ||
68 | |||
69 | irq = old_desc->irq; | ||
70 | |||
71 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
72 | |||
73 | /* We have to check it to avoid races with another CPU */ | ||
74 | desc = irq_to_desc(irq); | ||
75 | |||
76 | if (desc && old_desc != desc) | ||
77 | goto out_unlock; | ||
78 | |||
79 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
80 | if (!desc) { | ||
81 | printk(KERN_ERR "irq %d: can not get new irq_desc " | ||
82 | "for migration.\n", irq); | ||
83 | /* still use old one */ | ||
84 | desc = old_desc; | ||
85 | goto out_unlock; | ||
86 | } | ||
87 | if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { | ||
88 | /* still use old one */ | ||
89 | kfree(desc); | ||
90 | desc = old_desc; | ||
91 | goto out_unlock; | ||
92 | } | ||
93 | |||
94 | replace_irq_desc(irq, desc); | ||
95 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
96 | |||
97 | /* free the old one */ | ||
98 | free_one_irq_desc(old_desc, desc); | ||
99 | kfree(old_desc); | ||
100 | |||
101 | return desc; | ||
102 | |||
103 | out_unlock: | ||
104 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
105 | |||
106 | return desc; | ||
107 | } | ||
108 | |||
109 | struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) | ||
110 | { | ||
111 | /* those static or target node is -1, do not move them */ | ||
112 | if (desc->irq < NR_IRQS_LEGACY || node == -1) | ||
113 | return desc; | ||
114 | |||
115 | if (desc->node != node) | ||
116 | desc = __real_move_irq_desc(desc, node); | ||
117 | |||
118 | return desc; | ||
119 | } | ||
120 | |||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..01b1d3a88983 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; | |||
21 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 21 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
22 | { | 22 | { |
23 | struct irq_desc *desc = irq_to_desc((long)m->private); | 23 | struct irq_desc *desc = irq_to_desc((long)m->private); |
24 | const struct cpumask *mask = desc->affinity; | 24 | const struct cpumask *mask = desc->irq_data.affinity; |
25 | 25 | ||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 26 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
27 | if (desc->status & IRQ_MOVE_PENDING) | 27 | if (desc->status & IRQ_MOVE_PENDING) |
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
65 | cpumask_var_t new_value; | 65 | cpumask_var_t new_value; |
66 | int err; | 66 | int err; |
67 | 67 | ||
68 | if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || | 68 | if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || |
69 | irq_balancing_disabled(irq)) | 69 | irq_balancing_disabled(irq)) |
70 | return -EIO; | 70 | return -EIO; |
71 | 71 | ||
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) | |||
185 | { | 185 | { |
186 | struct irq_desc *desc = irq_to_desc((long) m->private); | 186 | struct irq_desc *desc = irq_to_desc((long) m->private); |
187 | 187 | ||
188 | seq_printf(m, "%d\n", desc->node); | 188 | seq_printf(m, "%d\n", desc->irq_data.node); |
189 | return 0; | 189 | return 0; |
190 | } | 190 | } |
191 | 191 | ||
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
269 | { | 269 | { |
270 | char name [MAX_NAMELEN]; | 270 | char name [MAX_NAMELEN]; |
271 | 271 | ||
272 | if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) | 272 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) |
273 | return; | 273 | return; |
274 | 274 | ||
275 | memset(name, 0, MAX_NAMELEN); | 275 | memset(name, 0, MAX_NAMELEN); |
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
297 | &irq_spurious_proc_fops, (void *)(long)irq); | 297 | &irq_spurious_proc_fops, (void *)(long)irq); |
298 | } | 298 | } |
299 | 299 | ||
300 | void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | ||
301 | { | ||
302 | char name [MAX_NAMELEN]; | ||
303 | |||
304 | if (!root_irq_dir || !desc->dir) | ||
305 | return; | ||
306 | #ifdef CONFIG_SMP | ||
307 | remove_proc_entry("smp_affinity", desc->dir); | ||
308 | remove_proc_entry("affinity_hint", desc->dir); | ||
309 | remove_proc_entry("node", desc->dir); | ||
310 | #endif | ||
311 | remove_proc_entry("spurious", desc->dir); | ||
312 | |||
313 | memset(name, 0, MAX_NAMELEN); | ||
314 | sprintf(name, "%u", irq); | ||
315 | remove_proc_entry(name, root_irq_dir); | ||
316 | } | ||
317 | |||
300 | #undef MAX_NAMELEN | 318 | #undef MAX_NAMELEN |
301 | 319 | ||
302 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 320 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a2..891115a929aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
60 | /* | 60 | /* |
61 | * Make sure the interrupt is enabled, before resending it: | 61 | * Make sure the interrupt is enabled, before resending it: |
62 | */ | 62 | */ |
63 | desc->chip->enable(irq); | 63 | desc->irq_data.chip->irq_enable(&desc->irq_data); |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * We do not resend level type interrupts. Level type | 66 | * We do not resend level type interrupts. Level type |
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { |
71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; |
72 | 72 | ||
73 | if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { | 73 | if (!desc->irq_data.chip->irq_retrigger || |
74 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | ||
74 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 75 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
75 | /* Set it pending and activate the softirq: */ | 76 | /* Set it pending and activate the softirq: */ |
76 | set_bit(irq, irqs_resend); | 77 | set_bit(irq, irqs_resend); |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534f..3089d3b9d5f3 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/moduleparam.h> | 14 | #include <linux/moduleparam.h> |
15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
16 | 16 | ||
17 | #include "internals.h" | ||
18 | |||
17 | static int irqfixup __read_mostly; | 19 | static int irqfixup __read_mostly; |
18 | 20 | ||
19 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) | 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) |
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) | |||
78 | * If we did actual work for the real IRQ line we must let the | 80 | * If we did actual work for the real IRQ line we must let the |
79 | * IRQ controller clean up too | 81 | * IRQ controller clean up too |
80 | */ | 82 | */ |
81 | if (work && desc->chip && desc->chip->end) | 83 | if (work) |
82 | desc->chip->end(irq); | 84 | irq_end(irq, desc); |
83 | raw_spin_unlock(&desc->lock); | 85 | raw_spin_unlock(&desc->lock); |
84 | 86 | ||
85 | return ok; | 87 | return ok; |
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
254 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 256 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
255 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; | 257 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; |
256 | desc->depth++; | 258 | desc->depth++; |
257 | desc->chip->disable(irq); | 259 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
258 | 260 | ||
259 | mod_timer(&poll_spurious_irq_timer, | 261 | mod_timer(&poll_spurious_irq_timer, |
260 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 262 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
3 | * | ||
4 | * Provides a framework for enqueueing and running callbacks from hardirq | ||
5 | * context. The enqueueing is NMI-safe. | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | #include <linux/hardirq.h> | ||
12 | |||
13 | /* | ||
14 | * An entry can be in one of four states: | ||
15 | * | ||
16 | * free NULL, 0 -> {claimed} : free to be used | ||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | ||
18 | * pending next, 3 -> {busy} : queued, pending callback | ||
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | ||
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | ||
24 | |||
/* Flag bits kept in the two low bits of irq_work::next. */
#define IRQ_WORK_PENDING	1UL
#define IRQ_WORK_BUSY		2UL
/* Mask covering both flag bits. */
#define IRQ_WORK_FLAGS		(IRQ_WORK_PENDING | IRQ_WORK_BUSY)
28 | |||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | ||
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
/*
 * Build a tagged pointer: the address @entry with the bits of @flags
 * OR-ed into it.
 */
static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
{
	return (struct irq_work *)((unsigned long)entry | flags);
}
47 | |||
/* Per-CPU head of the list of queued irq_work entries (linked through ->next). */
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | |
49 | |||
50 | /* | ||
51 | * Claim the entry so that no one else will poke at it. | ||
52 | */ | ||
53 | static bool irq_work_claim(struct irq_work *entry) | ||
54 | { | ||
55 | struct irq_work *next, *nflags; | ||
56 | |||
57 | do { | ||
58 | next = entry->next; | ||
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | ||
60 | return false; | ||
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | ||
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | ||
63 | |||
64 | return true; | ||
65 | } | ||
66 | |||
67 | |||
68 | void __weak arch_irq_work_raise(void) | ||
69 | { | ||
70 | /* | ||
71 | * Lame architectures will get the timer tick callback | ||
72 | */ | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Queue the entry and raise the IPI if needed. | ||
77 | */ | ||
78 | static void __irq_work_queue(struct irq_work *entry) | ||
79 | { | ||
80 | struct irq_work **head, *next; | ||
81 | |||
82 | head = &get_cpu_var(irq_work_list); | ||
83 | |||
84 | do { | ||
85 | next = *head; | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (cmpxchg(head, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | ||
91 | if (!irq_work_next(entry)) | ||
92 | arch_irq_work_raise(); | ||
93 | |||
94 | put_cpu_var(irq_work_list); | ||
95 | } | ||
96 | |||
/*
 * Enqueue the irq_work @entry.
 *
 * Returns true on success and false when @entry was already enqueued by
 * someone else.  An entry can be re-enqueued while its callback is still
 * in progress.
 */
bool irq_work_queue(struct irq_work *entry)
{
	/* Already enqueued, can't do! */
	if (!irq_work_claim(entry))
		return false;

	__irq_work_queue(entry);
	return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
116 | |||
117 | /* | ||
118 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | ||
119 | * context with local IRQs disabled. | ||
120 | */ | ||
121 | void irq_work_run(void) | ||
122 | { | ||
123 | struct irq_work *list, **head; | ||
124 | |||
125 | head = &__get_cpu_var(irq_work_list); | ||
126 | if (*head == NULL) | ||
127 | return; | ||
128 | |||
129 | BUG_ON(!in_irq()); | ||
130 | BUG_ON(!irqs_disabled()); | ||
131 | |||
132 | list = xchg(head, NULL); | ||
133 | while (list != NULL) { | ||
134 | struct irq_work *entry = list; | ||
135 | |||
136 | list = irq_work_next(list); | ||
137 | |||
138 | /* | ||
139 | * Clear the PENDING bit, after this point the @entry | ||
140 | * can be re-used. | ||
141 | */ | ||
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | ||
143 | entry->func(entry); | ||
144 | /* | ||
145 | * Clear the BUSY bit and return to the free state if | ||
146 | * no-one else claimed it meanwhile. | ||
147 | */ | ||
148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | ||
149 | } | ||
150 | } | ||
151 | EXPORT_SYMBOL_GPL(irq_work_run); | ||
152 | |||
153 | /* | ||
154 | * Synchronize against the irq_work @entry, ensures the entry is not | ||
155 | * currently in use. | ||
156 | */ | ||
157 | void irq_work_sync(struct irq_work *entry) | ||
158 | { | ||
159 | WARN_ON_ONCE(irqs_disabled()); | ||
160 | |||
161 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | ||
162 | cpu_relax(); | ||
163 | } | ||
164 | EXPORT_SYMBOL_GPL(irq_work_sync); | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..7be868bf25c6 --- /dev/null +++ b/kernel/jump_label.c | |||
@@ -0,0 +1,429 @@ | |||
1 | /* | ||
2 | * jump label support | ||
3 | * | ||
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | ||
5 | * | ||
6 | */ | ||
7 | #include <linux/jump_label.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/uaccess.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/jhash.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/sort.h> | ||
15 | #include <linux/err.h> | ||
16 | |||
17 | #ifdef HAVE_JUMP_LABEL | ||
18 | |||
/* The hash table has (1 << JUMP_LABEL_HASH_BITS) buckets. */
19 | #define JUMP_LABEL_HASH_BITS 6 | |
20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | |
/* Buckets of struct jump_label_entry, indexed by the masked jhash() of the key. */
21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | |
22 | |
23 | /* mutex to protect coming/going of the jump_label table */ | |
24 | static DEFINE_MUTEX(jump_label_mutex); | |
25 | |||
/*
 * One hash-table node per jump label key.  @table/@nr_entries describe the
 * run of jump_entry records registered together with the key (NULL/0 when
 * the node was created for a module-only key); @modules lists the
 * per-module runs for the same key.
 */
26 | struct jump_label_entry { | |
27 | struct hlist_node hlist; | |
28 | struct jump_entry *table; | |
29 | int nr_entries; | |
30 | /* hang modules off here */ | |
31 | struct hlist_head modules; | |
32 | unsigned long key; | |
33 | }; | |
34 | |||
/*
 * A module's run of jump_entry records for one key; linked into the
 * owning jump_label_entry's ->modules list and removed when @mod goes away.
 */
35 | struct jump_label_module_entry { | |
36 | struct hlist_node hlist; | |
37 | struct jump_entry *table; | |
38 | int nr_entries; | |
39 | struct module *mod; | |
40 | }; | |
41 | |||
42 | static int jump_label_cmp(const void *a, const void *b) | ||
43 | { | ||
44 | const struct jump_entry *jea = a; | ||
45 | const struct jump_entry *jeb = b; | ||
46 | |||
47 | if (jea->key < jeb->key) | ||
48 | return -1; | ||
49 | |||
50 | if (jea->key > jeb->key) | ||
51 | return 1; | ||
52 | |||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | static void | ||
57 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | ||
58 | { | ||
59 | unsigned long size; | ||
60 | |||
61 | size = (((unsigned long)stop - (unsigned long)start) | ||
62 | / sizeof(struct jump_entry)); | ||
63 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | ||
64 | } | ||
65 | |||
66 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | ||
67 | { | ||
68 | struct hlist_head *head; | ||
69 | struct hlist_node *node; | ||
70 | struct jump_label_entry *e; | ||
71 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
72 | |||
73 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
74 | hlist_for_each_entry(e, node, head, hlist) { | ||
75 | if (key == e->key) | ||
76 | return e; | ||
77 | } | ||
78 | return NULL; | ||
79 | } | ||
80 | |||
81 | static struct jump_label_entry * | ||
82 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
83 | { | ||
84 | struct hlist_head *head; | ||
85 | struct jump_label_entry *e; | ||
86 | u32 hash; | ||
87 | |||
88 | e = get_jump_label_entry(key); | ||
89 | if (e) | ||
90 | return ERR_PTR(-EEXIST); | ||
91 | |||
92 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
93 | if (!e) | ||
94 | return ERR_PTR(-ENOMEM); | ||
95 | |||
96 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
97 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
98 | e->key = key; | ||
99 | e->table = table; | ||
100 | e->nr_entries = nr_entries; | ||
101 | INIT_HLIST_HEAD(&(e->modules)); | ||
102 | hlist_add_head(&e->hlist, head); | ||
103 | return e; | ||
104 | } | ||
105 | |||
106 | static int | ||
107 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | ||
108 | { | ||
109 | struct jump_entry *iter, *iter_begin; | ||
110 | struct jump_label_entry *entry; | ||
111 | int count; | ||
112 | |||
113 | sort_jump_label_entries(start, stop); | ||
114 | iter = start; | ||
115 | while (iter < stop) { | ||
116 | entry = get_jump_label_entry(iter->key); | ||
117 | if (!entry) { | ||
118 | iter_begin = iter; | ||
119 | count = 0; | ||
120 | while ((iter < stop) && | ||
121 | (iter->key == iter_begin->key)) { | ||
122 | iter++; | ||
123 | count++; | ||
124 | } | ||
125 | entry = add_jump_label_entry(iter_begin->key, | ||
126 | count, iter_begin); | ||
127 | if (IS_ERR(entry)) | ||
128 | return PTR_ERR(entry); | ||
129 | } else { | ||
130 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
131 | return -1; | ||
132 | } | ||
133 | } | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | /*** | ||
138 | * jump_label_update - update jump label text | ||
139 | * @key - key value associated with a a jump label | ||
140 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
141 | * | ||
142 | * Will enable/disable the jump for jump label @key, depending on the | ||
143 | * value of @type. | ||
144 | * | ||
145 | */ | ||
146 | |||
147 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
148 | { | ||
149 | struct jump_entry *iter; | ||
150 | struct jump_label_entry *entry; | ||
151 | struct hlist_node *module_node; | ||
152 | struct jump_label_module_entry *e_module; | ||
153 | int count; | ||
154 | |||
155 | mutex_lock(&jump_label_mutex); | ||
156 | entry = get_jump_label_entry((jump_label_t)key); | ||
157 | if (entry) { | ||
158 | count = entry->nr_entries; | ||
159 | iter = entry->table; | ||
160 | while (count--) { | ||
161 | if (kernel_text_address(iter->code)) | ||
162 | arch_jump_label_transform(iter, type); | ||
163 | iter++; | ||
164 | } | ||
165 | /* eanble/disable jump labels in modules */ | ||
166 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
167 | hlist) { | ||
168 | count = e_module->nr_entries; | ||
169 | iter = e_module->table; | ||
170 | while (count--) { | ||
171 | if (kernel_text_address(iter->code)) | ||
172 | arch_jump_label_transform(iter, type); | ||
173 | iter++; | ||
174 | } | ||
175 | } | ||
176 | } | ||
177 | mutex_unlock(&jump_label_mutex); | ||
178 | } | ||
179 | |||
180 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | ||
181 | { | ||
182 | if (entry->code <= (unsigned long)end && | ||
183 | entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) | ||
184 | return 1; | ||
185 | |||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | #ifdef CONFIG_MODULES | ||
190 | |||
191 | static int module_conflict(void *start, void *end) | ||
192 | { | ||
193 | struct hlist_head *head; | ||
194 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
195 | struct jump_label_entry *e; | ||
196 | struct jump_label_module_entry *e_module; | ||
197 | struct jump_entry *iter; | ||
198 | int i, count; | ||
199 | int conflict = 0; | ||
200 | |||
201 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
202 | head = &jump_label_table[i]; | ||
203 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
204 | hlist_for_each_entry_safe(e_module, module_node, | ||
205 | module_node_next, | ||
206 | &(e->modules), hlist) { | ||
207 | count = e_module->nr_entries; | ||
208 | iter = e_module->table; | ||
209 | while (count--) { | ||
210 | if (addr_conflict(iter, start, end)) { | ||
211 | conflict = 1; | ||
212 | goto out; | ||
213 | } | ||
214 | iter++; | ||
215 | } | ||
216 | } | ||
217 | } | ||
218 | } | ||
219 | out: | ||
220 | return conflict; | ||
221 | } | ||
222 | |||
223 | #endif | ||
224 | |||
225 | /*** | ||
226 | * jump_label_text_reserved - check if addr range is reserved | ||
227 | * @start: start text addr | ||
228 | * @end: end text addr | ||
229 | * | ||
230 | * checks if the text addr located between @start and @end | ||
231 | * overlaps with any of the jump label patch addresses. Code | ||
232 | * that wants to modify kernel text should first verify that | ||
233 | * it does not overlap with any of the jump label addresses. | ||
234 | * | ||
235 | * returns 1 if there is an overlap, 0 otherwise | ||
236 | */ | ||
237 | int jump_label_text_reserved(void *start, void *end) | ||
238 | { | ||
239 | struct jump_entry *iter; | ||
240 | struct jump_entry *iter_start = __start___jump_table; | ||
241 | struct jump_entry *iter_stop = __start___jump_table; | ||
242 | int conflict = 0; | ||
243 | |||
244 | mutex_lock(&jump_label_mutex); | ||
245 | iter = iter_start; | ||
246 | while (iter < iter_stop) { | ||
247 | if (addr_conflict(iter, start, end)) { | ||
248 | conflict = 1; | ||
249 | goto out; | ||
250 | } | ||
251 | iter++; | ||
252 | } | ||
253 | |||
254 | /* now check modules */ | ||
255 | #ifdef CONFIG_MODULES | ||
256 | conflict = module_conflict(start, end); | ||
257 | #endif | ||
258 | out: | ||
259 | mutex_unlock(&jump_label_mutex); | ||
260 | return conflict; | ||
261 | } | ||
262 | |||
263 | static __init int init_jump_label(void) | ||
264 | { | ||
265 | int ret; | ||
266 | struct jump_entry *iter_start = __start___jump_table; | ||
267 | struct jump_entry *iter_stop = __stop___jump_table; | ||
268 | struct jump_entry *iter; | ||
269 | |||
270 | mutex_lock(&jump_label_mutex); | ||
271 | ret = build_jump_label_hashtable(__start___jump_table, | ||
272 | __stop___jump_table); | ||
273 | iter = iter_start; | ||
274 | while (iter < iter_stop) { | ||
275 | arch_jump_label_text_poke_early(iter->code); | ||
276 | iter++; | ||
277 | } | ||
278 | mutex_unlock(&jump_label_mutex); | ||
279 | return ret; | ||
280 | } | ||
281 | early_initcall(init_jump_label); | ||
282 | |||
283 | #ifdef CONFIG_MODULES | ||
284 | |||
285 | static struct jump_label_module_entry * | ||
286 | add_jump_label_module_entry(struct jump_label_entry *entry, | ||
287 | struct jump_entry *iter_begin, | ||
288 | int count, struct module *mod) | ||
289 | { | ||
290 | struct jump_label_module_entry *e; | ||
291 | |||
292 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | ||
293 | if (!e) | ||
294 | return ERR_PTR(-ENOMEM); | ||
295 | e->mod = mod; | ||
296 | e->nr_entries = count; | ||
297 | e->table = iter_begin; | ||
298 | hlist_add_head(&e->hlist, &entry->modules); | ||
299 | return e; | ||
300 | } | ||
301 | |||
302 | static int add_jump_label_module(struct module *mod) | ||
303 | { | ||
304 | struct jump_entry *iter, *iter_begin; | ||
305 | struct jump_label_entry *entry; | ||
306 | struct jump_label_module_entry *module_entry; | ||
307 | int count; | ||
308 | |||
309 | /* if the module doesn't have jump label entries, just return */ | ||
310 | if (!mod->num_jump_entries) | ||
311 | return 0; | ||
312 | |||
313 | sort_jump_label_entries(mod->jump_entries, | ||
314 | mod->jump_entries + mod->num_jump_entries); | ||
315 | iter = mod->jump_entries; | ||
316 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
317 | entry = get_jump_label_entry(iter->key); | ||
318 | iter_begin = iter; | ||
319 | count = 0; | ||
320 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | ||
321 | (iter->key == iter_begin->key)) { | ||
322 | iter++; | ||
323 | count++; | ||
324 | } | ||
325 | if (!entry) { | ||
326 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
327 | if (IS_ERR(entry)) | ||
328 | return PTR_ERR(entry); | ||
329 | } | ||
330 | module_entry = add_jump_label_module_entry(entry, iter_begin, | ||
331 | count, mod); | ||
332 | if (IS_ERR(module_entry)) | ||
333 | return PTR_ERR(module_entry); | ||
334 | } | ||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | static void remove_jump_label_module(struct module *mod) | ||
339 | { | ||
340 | struct hlist_head *head; | ||
341 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
342 | struct jump_label_entry *e; | ||
343 | struct jump_label_module_entry *e_module; | ||
344 | int i; | ||
345 | |||
346 | /* if the module doesn't have jump label entries, just return */ | ||
347 | if (!mod->num_jump_entries) | ||
348 | return; | ||
349 | |||
350 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
351 | head = &jump_label_table[i]; | ||
352 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
353 | hlist_for_each_entry_safe(e_module, module_node, | ||
354 | module_node_next, | ||
355 | &(e->modules), hlist) { | ||
356 | if (e_module->mod == mod) { | ||
357 | hlist_del(&e_module->hlist); | ||
358 | kfree(e_module); | ||
359 | } | ||
360 | } | ||
361 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | ||
362 | hlist_del(&e->hlist); | ||
363 | kfree(e); | ||
364 | } | ||
365 | } | ||
366 | } | ||
367 | } | ||
368 | |||
369 | static int | ||
370 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | ||
371 | void *data) | ||
372 | { | ||
373 | struct module *mod = data; | ||
374 | int ret = 0; | ||
375 | |||
376 | switch (val) { | ||
377 | case MODULE_STATE_COMING: | ||
378 | mutex_lock(&jump_label_mutex); | ||
379 | ret = add_jump_label_module(mod); | ||
380 | if (ret) | ||
381 | remove_jump_label_module(mod); | ||
382 | mutex_unlock(&jump_label_mutex); | ||
383 | break; | ||
384 | case MODULE_STATE_GOING: | ||
385 | mutex_lock(&jump_label_mutex); | ||
386 | remove_jump_label_module(mod); | ||
387 | mutex_unlock(&jump_label_mutex); | ||
388 | break; | ||
389 | } | ||
390 | return ret; | ||
391 | } | ||
392 | |||
393 | /*** | ||
394 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
395 | * @mod: module to patch | ||
396 | * | ||
397 | * Allow for run-time selection of the optimal nops. Before the module | ||
398 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
399 | * the arch specific jump label code. | ||
400 | */ | ||
401 | void jump_label_apply_nops(struct module *mod) | ||
402 | { | ||
403 | struct jump_entry *iter; | ||
404 | |||
405 | /* if the module doesn't have jump label entries, just return */ | ||
406 | if (!mod->num_jump_entries) | ||
407 | return; | ||
408 | |||
409 | iter = mod->jump_entries; | ||
410 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
411 | arch_jump_label_text_poke_early(iter->code); | ||
412 | iter++; | ||
413 | } | ||
414 | } | ||
415 | |||
/* Notifier block that routes module state changes to jump_label_module_notify(). */
416 | struct notifier_block jump_label_module_nb = { | |
417 | .notifier_call = jump_label_module_notify, | |
418 | .priority = 0, | |
419 | }; | |
420 | |||
421 | static __init int init_jump_label_module(void) | ||
422 | { | ||
423 | return register_module_notifier(&jump_label_module_nb); | ||
424 | } | ||
425 | early_initcall(init_jump_label_module); | ||
426 | |||
427 | #endif /* CONFIG_MODULES */ | ||
428 | |||
429 | #endif | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d6730..b55045bc7563 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
816 | 816 | ||
817 | ptr = kmap(page); | 817 | ptr = kmap(page); |
818 | /* Start with a clear page */ | 818 | /* Start with a clear page */ |
819 | memset(ptr, 0, PAGE_SIZE); | 819 | clear_page(ptr); |
820 | ptr += maddr & ~PAGE_MASK; | 820 | ptr += maddr & ~PAGE_MASK; |
821 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | 821 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); |
822 | if (mchunk > mbytes) | 822 | if (mchunk > mbytes) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..99865c33a60d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
48 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/jump_label.h> | ||
50 | 51 | ||
51 | #include <asm-generic/sections.h> | 52 | #include <asm-generic/sections.h> |
52 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | |||
73 | /* NOTE: change this value only with kprobe_mutex held */ | 74 | /* NOTE: change this value only with kprobe_mutex held */ |
74 | static bool kprobes_all_disarmed; | 75 | static bool kprobes_all_disarmed; |
75 | 76 | ||
76 | static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 77 | /* This protects kprobe_table and optimizing_list */ |
78 | static DEFINE_MUTEX(kprobe_mutex); | ||
77 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
78 | static struct { | 80 | static struct { |
79 | spinlock_t lock ____cacheline_aligned_in_smp; | 81 | spinlock_t lock ____cacheline_aligned_in_smp; |
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p) | |||
399 | * Return an optimized kprobe whose optimizing code replaces | 401 | * Return an optimized kprobe whose optimizing code replaces |
400 | * instructions including addr (exclude breakpoint). | 402 | * instructions including addr (exclude breakpoint). |
401 | */ | 403 | */ |
402 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | 404 | static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) |
403 | { | 405 | { |
404 | int i; | 406 | int i; |
405 | struct kprobe *p = NULL; | 407 | struct kprobe *p = NULL; |
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
594 | } | 596 | } |
595 | 597 | ||
596 | #ifdef CONFIG_SYSCTL | 598 | #ifdef CONFIG_SYSCTL |
599 | /* This should be called with kprobe_mutex locked */ | ||
597 | static void __kprobes optimize_all_kprobes(void) | 600 | static void __kprobes optimize_all_kprobes(void) |
598 | { | 601 | { |
599 | struct hlist_head *head; | 602 | struct hlist_head *head; |
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void) | |||
606 | return; | 609 | return; |
607 | 610 | ||
608 | kprobes_allow_optimization = true; | 611 | kprobes_allow_optimization = true; |
609 | mutex_lock(&text_mutex); | ||
610 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 612 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
611 | head = &kprobe_table[i]; | 613 | head = &kprobe_table[i]; |
612 | hlist_for_each_entry_rcu(p, node, head, hlist) | 614 | hlist_for_each_entry_rcu(p, node, head, hlist) |
613 | if (!kprobe_disabled(p)) | 615 | if (!kprobe_disabled(p)) |
614 | optimize_kprobe(p); | 616 | optimize_kprobe(p); |
615 | } | 617 | } |
616 | mutex_unlock(&text_mutex); | ||
617 | printk(KERN_INFO "Kprobes globally optimized\n"); | 618 | printk(KERN_INFO "Kprobes globally optimized\n"); |
618 | } | 619 | } |
619 | 620 | ||
621 | /* This should be called with kprobe_mutex locked */ | ||
620 | static void __kprobes unoptimize_all_kprobes(void) | 622 | static void __kprobes unoptimize_all_kprobes(void) |
621 | { | 623 | { |
622 | struct hlist_head *head; | 624 | struct hlist_head *head; |
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
831 | 833 | ||
832 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | 834 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, |
833 | struct hlist_head **head, unsigned long *flags) | 835 | struct hlist_head **head, unsigned long *flags) |
836 | __acquires(hlist_lock) | ||
834 | { | 837 | { |
835 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 838 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
836 | spinlock_t *hlist_lock; | 839 | spinlock_t *hlist_lock; |
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
842 | 845 | ||
843 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 846 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
844 | unsigned long *flags) | 847 | unsigned long *flags) |
848 | __acquires(hlist_lock) | ||
845 | { | 849 | { |
846 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 850 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
847 | spin_lock_irqsave(hlist_lock, *flags); | 851 | spin_lock_irqsave(hlist_lock, *flags); |
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, | |||
849 | 853 | ||
850 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 854 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
851 | unsigned long *flags) | 855 | unsigned long *flags) |
856 | __releases(hlist_lock) | ||
852 | { | 857 | { |
853 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 858 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
854 | spinlock_t *hlist_lock; | 859 | spinlock_t *hlist_lock; |
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
857 | spin_unlock_irqrestore(hlist_lock, *flags); | 862 | spin_unlock_irqrestore(hlist_lock, *flags); |
858 | } | 863 | } |
859 | 864 | ||
860 | void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) | 865 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
866 | unsigned long *flags) | ||
867 | __releases(hlist_lock) | ||
861 | { | 868 | { |
862 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 869 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
863 | spin_unlock_irqrestore(hlist_lock, *flags); | 870 | spin_unlock_irqrestore(hlist_lock, *flags); |
@@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1141 | preempt_disable(); | 1148 | preempt_disable(); |
1142 | if (!kernel_text_address((unsigned long) p->addr) || | 1149 | if (!kernel_text_address((unsigned long) p->addr) || |
1143 | in_kprobes_functions((unsigned long) p->addr) || | 1150 | in_kprobes_functions((unsigned long) p->addr) || |
1144 | ftrace_text_reserved(p->addr, p->addr)) { | 1151 | ftrace_text_reserved(p->addr, p->addr) || |
1152 | jump_label_text_reserved(p->addr, p->addr)) { | ||
1145 | preempt_enable(); | 1153 | preempt_enable(); |
1146 | return -EINVAL; | 1154 | return -EINVAL; |
1147 | } | 1155 | } |
@@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) | |||
1339 | if (num <= 0) | 1347 | if (num <= 0) |
1340 | return -EINVAL; | 1348 | return -EINVAL; |
1341 | for (i = 0; i < num; i++) { | 1349 | for (i = 0; i < num; i++) { |
1342 | unsigned long addr; | 1350 | unsigned long addr, offset; |
1343 | jp = jps[i]; | 1351 | jp = jps[i]; |
1344 | addr = arch_deref_entry_point(jp->entry); | 1352 | addr = arch_deref_entry_point(jp->entry); |
1345 | 1353 | ||
1346 | if (!kernel_text_address(addr)) | 1354 | /* Verify probepoint is a function entry point */ |
1347 | ret = -EINVAL; | 1355 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && |
1348 | else { | 1356 | offset == 0) { |
1349 | /* Todo: Verify probepoint is a function entry point */ | ||
1350 | jp->kp.pre_handler = setjmp_pre_handler; | 1357 | jp->kp.pre_handler = setjmp_pre_handler; |
1351 | jp->kp.break_handler = longjmp_break_handler; | 1358 | jp->kp.break_handler = longjmp_break_handler; |
1352 | ret = register_kprobe(&jp->kp); | 1359 | ret = register_kprobe(&jp->kp); |
1353 | } | 1360 | } else |
1361 | ret = -EINVAL; | ||
1362 | |||
1354 | if (ret < 0) { | 1363 | if (ret < 0) { |
1355 | if (i > 0) | 1364 | if (i > 0) |
1356 | unregister_jprobes(jps, i); | 1365 | unregister_jprobes(jps, i); |
@@ -1992,6 +2001,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
1992 | static const struct file_operations fops_kp = { | 2001 | static const struct file_operations fops_kp = { |
1993 | .read = read_enabled_file_bool, | 2002 | .read = read_enabled_file_bool, |
1994 | .write = write_enabled_file_bool, | 2003 | .write = write_enabled_file_bool, |
2004 | .llseek = default_llseek, | ||
1995 | }; | 2005 | }; |
1996 | 2006 | ||
1997 | static int __kprobes debugfs_kprobe_init(void) | 2007 | static int __kprobes debugfs_kprobe_init(void) |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
639 | } | 639 | } |
640 | #endif | 640 | #endif |
641 | 641 | ||
642 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
643 | debug_locks_off(); | ||
644 | printk(KERN_ERR | ||
645 | "BUG: looking up invalid subclass: %u\n", subclass); | ||
646 | printk(KERN_ERR | ||
647 | "turning off the locking correctness validator.\n"); | ||
648 | dump_stack(); | ||
649 | return NULL; | ||
650 | } | ||
651 | |||
642 | /* | 652 | /* |
643 | * Static locks do not have their class-keys yet - for them the key | 653 | * Static locks do not have their class-keys yet - for them the key |
644 | * is the lock object itself: | 654 | * is the lock object itself: |
@@ -774,7 +784,9 @@ out_unlock_set: | |||
774 | raw_local_irq_restore(flags); | 784 | raw_local_irq_restore(flags); |
775 | 785 | ||
776 | if (!subclass || force) | 786 | if (!subclass || force) |
777 | lock->class_cache = class; | 787 | lock->class_cache[0] = class; |
788 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | ||
789 | lock->class_cache[subclass] = class; | ||
778 | 790 | ||
779 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 791 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
780 | return NULL; | 792 | return NULL; |
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2679 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2691 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
2680 | struct lock_class_key *key, int subclass) | 2692 | struct lock_class_key *key, int subclass) |
2681 | { | 2693 | { |
2682 | lock->class_cache = NULL; | 2694 | int i; |
2695 | |||
2696 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
2697 | lock->class_cache[i] = NULL; | ||
2698 | |||
2683 | #ifdef CONFIG_LOCK_STAT | 2699 | #ifdef CONFIG_LOCK_STAT |
2684 | lock->cpu = raw_smp_processor_id(); | 2700 | lock->cpu = raw_smp_processor_id(); |
2685 | #endif | 2701 | #endif |
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2739 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2755 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2740 | return 0; | 2756 | return 0; |
2741 | 2757 | ||
2742 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
2743 | debug_locks_off(); | ||
2744 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
2745 | printk("turning off the locking correctness validator.\n"); | ||
2746 | dump_stack(); | ||
2747 | return 0; | ||
2748 | } | ||
2749 | |||
2750 | if (lock->key == &__lockdep_no_validate__) | 2758 | if (lock->key == &__lockdep_no_validate__) |
2751 | check = 1; | 2759 | check = 1; |
2752 | 2760 | ||
2753 | if (!subclass) | 2761 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
2754 | class = lock->class_cache; | 2762 | class = lock->class_cache[subclass]; |
2755 | /* | 2763 | /* |
2756 | * Not cached yet or subclass? | 2764 | * Not cached? |
2757 | */ | 2765 | */ |
2758 | if (unlikely(!class)) { | 2766 | if (unlikely(!class)) { |
2759 | class = register_lock_class(lock, subclass, 0); | 2767 | class = register_lock_class(lock, subclass, 0); |
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
2918 | return 1; | 2926 | return 1; |
2919 | 2927 | ||
2920 | if (hlock->references) { | 2928 | if (hlock->references) { |
2921 | struct lock_class *class = lock->class_cache; | 2929 | struct lock_class *class = lock->class_cache[0]; |
2922 | 2930 | ||
2923 | if (!class) | 2931 | if (!class) |
2924 | class = look_up_lock_class(lock, 0); | 2932 | class = look_up_lock_class(lock, 0); |
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3559 | if (list_empty(head)) | 3567 | if (list_empty(head)) |
3560 | continue; | 3568 | continue; |
3561 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3569 | list_for_each_entry_safe(class, next, head, hash_entry) { |
3562 | if (unlikely(class == lock->class_cache)) { | 3570 | int match = 0; |
3571 | |||
3572 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | ||
3573 | match |= class == lock->class_cache[j]; | ||
3574 | |||
3575 | if (unlikely(match)) { | ||
3563 | if (debug_locks_off_graph_unlock()) | 3576 | if (debug_locks_off_graph_unlock()) |
3564 | WARN_ON(1); | 3577 | WARN_ON(1); |
3565 | goto out_restore; | 3578 | goto out_restore; |
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); | |||
3775 | * Careful: only use this function if you are sure that | 3788 | * Careful: only use this function if you are sure that |
3776 | * the task cannot run in parallel! | 3789 | * the task cannot run in parallel! |
3777 | */ | 3790 | */ |
3778 | void __debug_show_held_locks(struct task_struct *task) | 3791 | void debug_show_held_locks(struct task_struct *task) |
3779 | { | 3792 | { |
3780 | if (unlikely(!debug_locks)) { | 3793 | if (unlikely(!debug_locks)) { |
3781 | printk("INFO: lockdep is turned off.\n"); | 3794 | printk("INFO: lockdep is turned off.\n"); |
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) | |||
3783 | } | 3796 | } |
3784 | lockdep_print_held_locks(task); | 3797 | lockdep_print_held_locks(task); |
3785 | } | 3798 | } |
3786 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3787 | |||
3788 | void debug_show_held_locks(struct task_struct *task) | ||
3789 | { | ||
3790 | __debug_show_held_locks(task); | ||
3791 | } | ||
3792 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3799 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3793 | 3800 | ||
3794 | void lockdep_sys_exit(void) | 3801 | void lockdep_sys_exit(void) |
diff --git a/kernel/module.c b/kernel/module.c index ccd641991842..437a74a7524a 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/async.h> | 55 | #include <linux/async.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | ||
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/module.h> | 61 | #include <trace/events/module.h> |
@@ -2036,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info) | |||
2036 | { | 2037 | { |
2037 | } | 2038 | } |
2038 | 2039 | ||
2039 | static void add_kallsyms(struct module *mod, struct load_info *info) | 2040 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
2040 | { | 2041 | { |
2041 | } | 2042 | } |
2042 | #endif /* CONFIG_KALLSYMS */ | 2043 | #endif /* CONFIG_KALLSYMS */ |
@@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2309 | sizeof(*mod->tracepoints), | 2310 | sizeof(*mod->tracepoints), |
2310 | &mod->num_tracepoints); | 2311 | &mod->num_tracepoints); |
2311 | #endif | 2312 | #endif |
2313 | #ifdef HAVE_JUMP_LABEL | ||
2314 | mod->jump_entries = section_objs(info, "__jump_table", | ||
2315 | sizeof(*mod->jump_entries), | ||
2316 | &mod->num_jump_entries); | ||
2317 | #endif | ||
2312 | #ifdef CONFIG_EVENT_TRACING | 2318 | #ifdef CONFIG_EVENT_TRACING |
2313 | mod->trace_events = section_objs(info, "_ftrace_events", | 2319 | mod->trace_events = section_objs(info, "_ftrace_events", |
2314 | sizeof(*mod->trace_events), | 2320 | sizeof(*mod->trace_events), |
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe0..2c98ad94ba0e 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c | |||
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | |||
85 | return ERR_PTR(-EPERM); | 85 | return ERR_PTR(-EPERM); |
86 | if (!cgroup_is_descendant(cgroup, current)) | 86 | if (!cgroup_is_descendant(cgroup, current)) |
87 | return ERR_PTR(-EPERM); | 87 | return ERR_PTR(-EPERM); |
88 | if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { | ||
89 | printk("ns_cgroup can't be created with parent " | ||
90 | "'clone_children' set.\n"); | ||
91 | return ERR_PTR(-EINVAL); | ||
92 | } | ||
93 | |||
94 | printk_once("ns_cgroup deprecated: consider using the " | ||
95 | "'clone_children' flag without the ns_cgroup.\n"); | ||
88 | 96 | ||
89 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | 97 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); |
90 | if (!ns_cgroup) | 98 | if (!ns_cgroup) |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b98bed3d8182..517d827f4982 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -31,24 +31,18 @@ | |||
31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
33 | #include <linux/ftrace_event.h> | 33 | #include <linux/ftrace_event.h> |
34 | #include <linux/hw_breakpoint.h> | ||
35 | 34 | ||
36 | #include <asm/irq_regs.h> | 35 | #include <asm/irq_regs.h> |
37 | 36 | ||
38 | /* | 37 | atomic_t perf_task_events __read_mostly; |
39 | * Each CPU has a list of per CPU events: | ||
40 | */ | ||
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
42 | |||
43 | int perf_max_events __read_mostly = 1; | ||
44 | static int perf_reserved_percpu __read_mostly; | ||
45 | static int perf_overcommit __read_mostly = 1; | ||
46 | |||
47 | static atomic_t nr_events __read_mostly; | ||
48 | static atomic_t nr_mmap_events __read_mostly; | 38 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 39 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 40 | static atomic_t nr_task_events __read_mostly; |
51 | 41 | ||
42 | static LIST_HEAD(pmus); | ||
43 | static DEFINE_MUTEX(pmus_lock); | ||
44 | static struct srcu_struct pmus_srcu; | ||
45 | |||
52 | /* | 46 | /* |
53 | * perf event paranoia level: | 47 | * perf event paranoia level: |
54 | * -1 - not paranoid at all | 48 | * -1 - not paranoid at all |
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
67 | 61 | ||
68 | static atomic64_t perf_event_id; | 62 | static atomic64_t perf_event_id; |
69 | 63 | ||
70 | /* | 64 | void __weak perf_event_print_debug(void) { } |
71 | * Lock for (sysadmin-configurable) event reservations: | ||
72 | */ | ||
73 | static DEFINE_SPINLOCK(perf_resource_lock); | ||
74 | 65 | ||
75 | /* | 66 | extern __weak const char *perf_pmu_name(void) |
76 | * Architecture provided APIs - weak aliases: | ||
77 | */ | ||
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | ||
79 | { | 67 | { |
80 | return NULL; | 68 | return "pmu"; |
81 | } | 69 | } |
82 | 70 | ||
83 | void __weak hw_perf_disable(void) { barrier(); } | 71 | void perf_pmu_disable(struct pmu *pmu) |
84 | void __weak hw_perf_enable(void) { barrier(); } | 72 | { |
85 | 73 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | |
86 | void __weak perf_event_print_debug(void) { } | 74 | if (!(*count)++) |
87 | 75 | pmu->pmu_disable(pmu); | |
88 | static DEFINE_PER_CPU(int, perf_disable_count); | 76 | } |
89 | 77 | ||
90 | void perf_disable(void) | 78 | void perf_pmu_enable(struct pmu *pmu) |
91 | { | 79 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) | 80 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
93 | hw_perf_disable(); | 81 | if (!--(*count)) |
82 | pmu->pmu_enable(pmu); | ||
94 | } | 83 | } |
95 | 84 | ||
96 | void perf_enable(void) | 85 | static DEFINE_PER_CPU(struct list_head, rotation_list); |
86 | |||
87 | /* | ||
88 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
89 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
90 | * disabled, while rotate_context is called from IRQ context. | ||
91 | */ | ||
92 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
97 | { | 93 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) | 94 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
99 | hw_perf_enable(); | 95 | struct list_head *head = &__get_cpu_var(rotation_list); |
96 | |||
97 | WARN_ON(!irqs_disabled()); | ||
98 | |||
99 | if (list_empty(&cpuctx->rotation_list)) | ||
100 | list_add(&cpuctx->rotation_list, head); | ||
100 | } | 101 | } |
101 | 102 | ||
102 | static void get_ctx(struct perf_event_context *ctx) | 103 | static void get_ctx(struct perf_event_context *ctx) |
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
151 | * the context could get moved to another task. | 152 | * the context could get moved to another task. |
152 | */ | 153 | */ |
153 | static struct perf_event_context * | 154 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 155 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
155 | { | 156 | { |
156 | struct perf_event_context *ctx; | 157 | struct perf_event_context *ctx; |
157 | 158 | ||
158 | rcu_read_lock(); | 159 | rcu_read_lock(); |
159 | retry: | 160 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); | 161 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
161 | if (ctx) { | 162 | if (ctx) { |
162 | /* | 163 | /* |
163 | * If this context is a clone of another, it might | 164 | * If this context is a clone of another, it might |
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
170 | * can't get swapped on us any more. | 171 | * can't get swapped on us any more. |
171 | */ | 172 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 173 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 174 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 175 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; | 176 | goto retry; |
176 | } | 177 | } |
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
189 | * can't get swapped to another task. This also increments its | 190 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. | 191 | * reference count so that the context can't get freed. |
191 | */ | 192 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 193 | static struct perf_event_context * |
194 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
193 | { | 195 | { |
194 | struct perf_event_context *ctx; | 196 | struct perf_event_context *ctx; |
195 | unsigned long flags; | 197 | unsigned long flags; |
196 | 198 | ||
197 | ctx = perf_lock_task_context(task, &flags); | 199 | ctx = perf_lock_task_context(task, ctxn, &flags); |
198 | if (ctx) { | 200 | if (ctx) { |
199 | ++ctx->pin_count; | 201 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 202 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
302 | } | 304 | } |
303 | 305 | ||
304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 306 | list_add_rcu(&event->event_entry, &ctx->event_list); |
307 | if (!ctx->nr_events) | ||
308 | perf_pmu_rotate_start(ctx->pmu); | ||
305 | ctx->nr_events++; | 309 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) | 310 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; | 311 | ctx->nr_stat++; |
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) | |||
311 | { | 315 | { |
312 | struct perf_event *group_leader = event->group_leader; | 316 | struct perf_event *group_leader = event->group_leader; |
313 | 317 | ||
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | 318 | /* |
319 | * We can have double attach due to group movement in perf_event_open. | ||
320 | */ | ||
321 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
322 | return; | ||
323 | |||
315 | event->attach_state |= PERF_ATTACH_GROUP; | 324 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | 325 | ||
317 | if (group_leader == event) | 326 | if (group_leader == event) |
@@ -436,7 +445,7 @@ event_sched_out(struct perf_event *event, | |||
436 | event->state = PERF_EVENT_STATE_OFF; | 445 | event->state = PERF_EVENT_STATE_OFF; |
437 | } | 446 | } |
438 | event->tstamp_stopped = ctx->time; | 447 | event->tstamp_stopped = ctx->time; |
439 | event->pmu->disable(event); | 448 | event->pmu->del(event, 0); |
440 | event->oncpu = -1; | 449 | event->oncpu = -1; |
441 | 450 | ||
442 | if (!is_software_event(event)) | 451 | if (!is_software_event(event)) |
@@ -466,6 +475,12 @@ group_sched_out(struct perf_event *group_event, | |||
466 | cpuctx->exclusive = 0; | 475 | cpuctx->exclusive = 0; |
467 | } | 476 | } |
468 | 477 | ||
478 | static inline struct perf_cpu_context * | ||
479 | __get_cpu_context(struct perf_event_context *ctx) | ||
480 | { | ||
481 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
482 | } | ||
483 | |||
469 | /* | 484 | /* |
470 | * Cross CPU call to remove a performance event | 485 | * Cross CPU call to remove a performance event |
471 | * | 486 | * |
@@ -474,9 +489,9 @@ group_sched_out(struct perf_event *group_event, | |||
474 | */ | 489 | */ |
475 | static void __perf_event_remove_from_context(void *info) | 490 | static void __perf_event_remove_from_context(void *info) |
476 | { | 491 | { |
477 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
478 | struct perf_event *event = info; | 492 | struct perf_event *event = info; |
479 | struct perf_event_context *ctx = event->ctx; | 493 | struct perf_event_context *ctx = event->ctx; |
494 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
480 | 495 | ||
481 | /* | 496 | /* |
482 | * If this is a task context, we need to check whether it is | 497 | * If this is a task context, we need to check whether it is |
@@ -487,27 +502,11 @@ static void __perf_event_remove_from_context(void *info) | |||
487 | return; | 502 | return; |
488 | 503 | ||
489 | raw_spin_lock(&ctx->lock); | 504 | raw_spin_lock(&ctx->lock); |
490 | /* | ||
491 | * Protect the list operation against NMI by disabling the | ||
492 | * events on a global level. | ||
493 | */ | ||
494 | perf_disable(); | ||
495 | 505 | ||
496 | event_sched_out(event, cpuctx, ctx); | 506 | event_sched_out(event, cpuctx, ctx); |
497 | 507 | ||
498 | list_del_event(event, ctx); | 508 | list_del_event(event, ctx); |
499 | 509 | ||
500 | if (!ctx->task) { | ||
501 | /* | ||
502 | * Allow more per task events with respect to the | ||
503 | * reservation: | ||
504 | */ | ||
505 | cpuctx->max_pertask = | ||
506 | min(perf_max_events - ctx->nr_events, | ||
507 | perf_max_events - perf_reserved_percpu); | ||
508 | } | ||
509 | |||
510 | perf_enable(); | ||
511 | raw_spin_unlock(&ctx->lock); | 510 | raw_spin_unlock(&ctx->lock); |
512 | } | 511 | } |
513 | 512 | ||
@@ -572,8 +571,8 @@ retry: | |||
572 | static void __perf_event_disable(void *info) | 571 | static void __perf_event_disable(void *info) |
573 | { | 572 | { |
574 | struct perf_event *event = info; | 573 | struct perf_event *event = info; |
575 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
576 | struct perf_event_context *ctx = event->ctx; | 574 | struct perf_event_context *ctx = event->ctx; |
575 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
577 | 576 | ||
578 | /* | 577 | /* |
579 | * If this is a per-task event, need to check whether this | 578 | * If this is a per-task event, need to check whether this |
@@ -628,7 +627,7 @@ void perf_event_disable(struct perf_event *event) | |||
628 | return; | 627 | return; |
629 | } | 628 | } |
630 | 629 | ||
631 | retry: | 630 | retry: |
632 | task_oncpu_function_call(task, __perf_event_disable, event); | 631 | task_oncpu_function_call(task, __perf_event_disable, event); |
633 | 632 | ||
634 | raw_spin_lock_irq(&ctx->lock); | 633 | raw_spin_lock_irq(&ctx->lock); |
@@ -667,7 +666,7 @@ event_sched_in(struct perf_event *event, | |||
667 | */ | 666 | */ |
668 | smp_wmb(); | 667 | smp_wmb(); |
669 | 668 | ||
670 | if (event->pmu->enable(event)) { | 669 | if (event->pmu->add(event, PERF_EF_START)) { |
671 | event->state = PERF_EVENT_STATE_INACTIVE; | 670 | event->state = PERF_EVENT_STATE_INACTIVE; |
672 | event->oncpu = -1; | 671 | event->oncpu = -1; |
673 | return -EAGAIN; | 672 | return -EAGAIN; |
@@ -691,22 +690,17 @@ group_sched_in(struct perf_event *group_event, | |||
691 | struct perf_event_context *ctx) | 690 | struct perf_event_context *ctx) |
692 | { | 691 | { |
693 | struct perf_event *event, *partial_group = NULL; | 692 | struct perf_event *event, *partial_group = NULL; |
694 | const struct pmu *pmu = group_event->pmu; | 693 | struct pmu *pmu = group_event->pmu; |
695 | bool txn = false; | 694 | u64 now = ctx->time; |
695 | bool simulate = false; | ||
696 | 696 | ||
697 | if (group_event->state == PERF_EVENT_STATE_OFF) | 697 | if (group_event->state == PERF_EVENT_STATE_OFF) |
698 | return 0; | 698 | return 0; |
699 | 699 | ||
700 | /* Check if group transaction availabe */ | 700 | pmu->start_txn(pmu); |
701 | if (pmu->start_txn) | ||
702 | txn = true; | ||
703 | |||
704 | if (txn) | ||
705 | pmu->start_txn(pmu); | ||
706 | 701 | ||
707 | if (event_sched_in(group_event, cpuctx, ctx)) { | 702 | if (event_sched_in(group_event, cpuctx, ctx)) { |
708 | if (txn) | 703 | pmu->cancel_txn(pmu); |
709 | pmu->cancel_txn(pmu); | ||
710 | return -EAGAIN; | 704 | return -EAGAIN; |
711 | } | 705 | } |
712 | 706 | ||
@@ -720,23 +714,38 @@ group_sched_in(struct perf_event *group_event, | |||
720 | } | 714 | } |
721 | } | 715 | } |
722 | 716 | ||
723 | if (!txn || !pmu->commit_txn(pmu)) | 717 | if (!pmu->commit_txn(pmu)) |
724 | return 0; | 718 | return 0; |
725 | 719 | ||
726 | group_error: | 720 | group_error: |
727 | /* | 721 | /* |
728 | * Groups can be scheduled in as one unit only, so undo any | 722 | * Groups can be scheduled in as one unit only, so undo any |
729 | * partial group before returning: | 723 | * partial group before returning: |
724 | * The events up to the failed event are scheduled out normally, | ||
725 | * tstamp_stopped will be updated. | ||
726 | * | ||
727 | * The failed events and the remaining siblings need to have | ||
728 | * their timings updated as if they had gone thru event_sched_in() | ||
729 | * and event_sched_out(). This is required to get consistent timings | ||
730 | * across the group. This also takes care of the case where the group | ||
731 | * could never be scheduled by ensuring tstamp_stopped is set to mark | ||
732 | * the time the event was actually stopped, such that time delta | ||
733 | * calculation in update_event_times() is correct. | ||
730 | */ | 734 | */ |
731 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 735 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
732 | if (event == partial_group) | 736 | if (event == partial_group) |
733 | break; | 737 | simulate = true; |
734 | event_sched_out(event, cpuctx, ctx); | 738 | |
739 | if (simulate) { | ||
740 | event->tstamp_running += now - event->tstamp_stopped; | ||
741 | event->tstamp_stopped = now; | ||
742 | } else { | ||
743 | event_sched_out(event, cpuctx, ctx); | ||
744 | } | ||
735 | } | 745 | } |
736 | event_sched_out(group_event, cpuctx, ctx); | 746 | event_sched_out(group_event, cpuctx, ctx); |
737 | 747 | ||
738 | if (txn) | 748 | pmu->cancel_txn(pmu); |
739 | pmu->cancel_txn(pmu); | ||
740 | 749 | ||
741 | return -EAGAIN; | 750 | return -EAGAIN; |
742 | } | 751 | } |
@@ -789,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event, | |||
789 | */ | 798 | */ |
790 | static void __perf_install_in_context(void *info) | 799 | static void __perf_install_in_context(void *info) |
791 | { | 800 | { |
792 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
793 | struct perf_event *event = info; | 801 | struct perf_event *event = info; |
794 | struct perf_event_context *ctx = event->ctx; | 802 | struct perf_event_context *ctx = event->ctx; |
795 | struct perf_event *leader = event->group_leader; | 803 | struct perf_event *leader = event->group_leader; |
804 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
796 | int err; | 805 | int err; |
797 | 806 | ||
798 | /* | 807 | /* |
@@ -812,12 +821,6 @@ static void __perf_install_in_context(void *info) | |||
812 | ctx->is_active = 1; | 821 | ctx->is_active = 1; |
813 | update_context_time(ctx); | 822 | update_context_time(ctx); |
814 | 823 | ||
815 | /* | ||
816 | * Protect the list operation against NMI by disabling the | ||
817 | * events on a global level. NOP for non NMI based events. | ||
818 | */ | ||
819 | perf_disable(); | ||
820 | |||
821 | add_event_to_ctx(event, ctx); | 824 | add_event_to_ctx(event, ctx); |
822 | 825 | ||
823 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 826 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -855,12 +858,7 @@ static void __perf_install_in_context(void *info) | |||
855 | } | 858 | } |
856 | } | 859 | } |
857 | 860 | ||
858 | if (!err && !ctx->task && cpuctx->max_pertask) | 861 | unlock: |
859 | cpuctx->max_pertask--; | ||
860 | |||
861 | unlock: | ||
862 | perf_enable(); | ||
863 | |||
864 | raw_spin_unlock(&ctx->lock); | 862 | raw_spin_unlock(&ctx->lock); |
865 | } | 863 | } |
866 | 864 | ||
@@ -883,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
883 | { | 881 | { |
884 | struct task_struct *task = ctx->task; | 882 | struct task_struct *task = ctx->task; |
885 | 883 | ||
884 | event->ctx = ctx; | ||
885 | |||
886 | if (!task) { | 886 | if (!task) { |
887 | /* | 887 | /* |
888 | * Per cpu events are installed via an smp call and | 888 | * Per cpu events are installed via an smp call and |
@@ -931,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
931 | 931 | ||
932 | event->state = PERF_EVENT_STATE_INACTIVE; | 932 | event->state = PERF_EVENT_STATE_INACTIVE; |
933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
934 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 934 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { |
936 | sub->tstamp_enabled = | 936 | sub->tstamp_enabled = |
937 | ctx->time - sub->total_time_enabled; | 937 | ctx->time - sub->total_time_enabled; |
938 | } | ||
939 | } | ||
938 | } | 940 | } |
939 | 941 | ||
940 | /* | 942 | /* |
@@ -943,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
943 | static void __perf_event_enable(void *info) | 945 | static void __perf_event_enable(void *info) |
944 | { | 946 | { |
945 | struct perf_event *event = info; | 947 | struct perf_event *event = info; |
946 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
947 | struct perf_event_context *ctx = event->ctx; | 948 | struct perf_event_context *ctx = event->ctx; |
948 | struct perf_event *leader = event->group_leader; | 949 | struct perf_event *leader = event->group_leader; |
950 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
949 | int err; | 951 | int err; |
950 | 952 | ||
951 | /* | 953 | /* |
@@ -979,12 +981,10 @@ static void __perf_event_enable(void *info) | |||
979 | if (!group_can_go_on(event, cpuctx, 1)) { | 981 | if (!group_can_go_on(event, cpuctx, 1)) { |
980 | err = -EEXIST; | 982 | err = -EEXIST; |
981 | } else { | 983 | } else { |
982 | perf_disable(); | ||
983 | if (event == leader) | 984 | if (event == leader) |
984 | err = group_sched_in(event, cpuctx, ctx); | 985 | err = group_sched_in(event, cpuctx, ctx); |
985 | else | 986 | else |
986 | err = event_sched_in(event, cpuctx, ctx); | 987 | err = event_sched_in(event, cpuctx, ctx); |
987 | perf_enable(); | ||
988 | } | 988 | } |
989 | 989 | ||
990 | if (err) { | 990 | if (err) { |
@@ -1000,7 +1000,7 @@ static void __perf_event_enable(void *info) | |||
1000 | } | 1000 | } |
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | unlock: | 1003 | unlock: |
1004 | raw_spin_unlock(&ctx->lock); | 1004 | raw_spin_unlock(&ctx->lock); |
1005 | } | 1005 | } |
1006 | 1006 | ||
@@ -1041,7 +1041,7 @@ void perf_event_enable(struct perf_event *event) | |||
1041 | if (event->state == PERF_EVENT_STATE_ERROR) | 1041 | if (event->state == PERF_EVENT_STATE_ERROR) |
1042 | event->state = PERF_EVENT_STATE_OFF; | 1042 | event->state = PERF_EVENT_STATE_OFF; |
1043 | 1043 | ||
1044 | retry: | 1044 | retry: |
1045 | raw_spin_unlock_irq(&ctx->lock); | 1045 | raw_spin_unlock_irq(&ctx->lock); |
1046 | task_oncpu_function_call(task, __perf_event_enable, event); | 1046 | task_oncpu_function_call(task, __perf_event_enable, event); |
1047 | 1047 | ||
@@ -1061,7 +1061,7 @@ void perf_event_enable(struct perf_event *event) | |||
1061 | if (event->state == PERF_EVENT_STATE_OFF) | 1061 | if (event->state == PERF_EVENT_STATE_OFF) |
1062 | __perf_event_mark_enabled(event, ctx); | 1062 | __perf_event_mark_enabled(event, ctx); |
1063 | 1063 | ||
1064 | out: | 1064 | out: |
1065 | raw_spin_unlock_irq(&ctx->lock); | 1065 | raw_spin_unlock_irq(&ctx->lock); |
1066 | } | 1066 | } |
1067 | 1067 | ||
@@ -1092,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1092 | struct perf_event *event; | 1092 | struct perf_event *event; |
1093 | 1093 | ||
1094 | raw_spin_lock(&ctx->lock); | 1094 | raw_spin_lock(&ctx->lock); |
1095 | perf_pmu_disable(ctx->pmu); | ||
1095 | ctx->is_active = 0; | 1096 | ctx->is_active = 0; |
1096 | if (likely(!ctx->nr_events)) | 1097 | if (likely(!ctx->nr_events)) |
1097 | goto out; | 1098 | goto out; |
1098 | update_context_time(ctx); | 1099 | update_context_time(ctx); |
1099 | 1100 | ||
1100 | perf_disable(); | ||
1101 | if (!ctx->nr_active) | 1101 | if (!ctx->nr_active) |
1102 | goto out_enable; | 1102 | goto out; |
1103 | 1103 | ||
1104 | if (event_type & EVENT_PINNED) | 1104 | if (event_type & EVENT_PINNED) { |
1105 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1105 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1106 | group_sched_out(event, cpuctx, ctx); | 1106 | group_sched_out(event, cpuctx, ctx); |
1107 | } | ||
1107 | 1108 | ||
1108 | if (event_type & EVENT_FLEXIBLE) | 1109 | if (event_type & EVENT_FLEXIBLE) { |
1109 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1110 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1110 | group_sched_out(event, cpuctx, ctx); | 1111 | group_sched_out(event, cpuctx, ctx); |
1111 | 1112 | } | |
1112 | out_enable: | 1113 | out: |
1113 | perf_enable(); | 1114 | perf_pmu_enable(ctx->pmu); |
1114 | out: | ||
1115 | raw_spin_unlock(&ctx->lock); | 1115 | raw_spin_unlock(&ctx->lock); |
1116 | } | 1116 | } |
1117 | 1117 | ||
@@ -1209,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1209 | } | 1209 | } |
1210 | } | 1210 | } |
1211 | 1211 | ||
1212 | /* | 1212 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1213 | * Called from scheduler to remove the events of the current task, | 1213 | struct task_struct *next) |
1214 | * with interrupts disabled. | ||
1215 | * | ||
1216 | * We stop each event and update the event value in event->count. | ||
1217 | * | ||
1218 | * This does not protect us against NMI, but disable() | ||
1219 | * sets the disabled bit in the control field of event _before_ | ||
1220 | * accessing the event control register. If a NMI hits, then it will | ||
1221 | * not restart the event. | ||
1222 | */ | ||
1223 | void perf_event_task_sched_out(struct task_struct *task, | ||
1224 | struct task_struct *next) | ||
1225 | { | 1214 | { |
1226 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1215 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1227 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1228 | struct perf_event_context *next_ctx; | 1216 | struct perf_event_context *next_ctx; |
1229 | struct perf_event_context *parent; | 1217 | struct perf_event_context *parent; |
1218 | struct perf_cpu_context *cpuctx; | ||
1230 | int do_switch = 1; | 1219 | int do_switch = 1; |
1231 | 1220 | ||
1232 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1221 | if (likely(!ctx)) |
1222 | return; | ||
1233 | 1223 | ||
1234 | if (likely(!ctx || !cpuctx->task_ctx)) | 1224 | cpuctx = __get_cpu_context(ctx); |
1225 | if (!cpuctx->task_ctx) | ||
1235 | return; | 1226 | return; |
1236 | 1227 | ||
1237 | rcu_read_lock(); | 1228 | rcu_read_lock(); |
1238 | parent = rcu_dereference(ctx->parent_ctx); | 1229 | parent = rcu_dereference(ctx->parent_ctx); |
1239 | next_ctx = next->perf_event_ctxp; | 1230 | next_ctx = next->perf_event_ctxp[ctxn]; |
1240 | if (parent && next_ctx && | 1231 | if (parent && next_ctx && |
1241 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1232 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1242 | /* | 1233 | /* |
@@ -1255,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1255 | * XXX do we need a memory barrier of sorts | 1246 | * XXX do we need a memory barrier of sorts |
1256 | * wrt to rcu_dereference() of perf_event_ctxp | 1247 | * wrt to rcu_dereference() of perf_event_ctxp |
1257 | */ | 1248 | */ |
1258 | task->perf_event_ctxp = next_ctx; | 1249 | task->perf_event_ctxp[ctxn] = next_ctx; |
1259 | next->perf_event_ctxp = ctx; | 1250 | next->perf_event_ctxp[ctxn] = ctx; |
1260 | ctx->task = next; | 1251 | ctx->task = next; |
1261 | next_ctx->task = task; | 1252 | next_ctx->task = task; |
1262 | do_switch = 0; | 1253 | do_switch = 0; |
@@ -1274,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1274 | } | 1265 | } |
1275 | } | 1266 | } |
1276 | 1267 | ||
1268 | #define for_each_task_context_nr(ctxn) \ | ||
1269 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
1270 | |||
1271 | /* | ||
1272 | * Called from scheduler to remove the events of the current task, | ||
1273 | * with interrupts disabled. | ||
1274 | * | ||
1275 | * We stop each event and update the event value in event->count. | ||
1276 | * | ||
1277 | * This does not protect us against NMI, but disable() | ||
1278 | * sets the disabled bit in the control field of event _before_ | ||
1279 | * accessing the event control register. If a NMI hits, then it will | ||
1280 | * not restart the event. | ||
1281 | */ | ||
1282 | void __perf_event_task_sched_out(struct task_struct *task, | ||
1283 | struct task_struct *next) | ||
1284 | { | ||
1285 | int ctxn; | ||
1286 | |||
1287 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
1288 | |||
1289 | for_each_task_context_nr(ctxn) | ||
1290 | perf_event_context_sched_out(task, ctxn, next); | ||
1291 | } | ||
1292 | |||
1277 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1293 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1278 | enum event_type_t event_type) | 1294 | enum event_type_t event_type) |
1279 | { | 1295 | { |
1280 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1296 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1281 | 1297 | ||
1282 | if (!cpuctx->task_ctx) | 1298 | if (!cpuctx->task_ctx) |
1283 | return; | 1299 | return; |
@@ -1292,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1292 | /* | 1308 | /* |
1293 | * Called with IRQs disabled | 1309 | * Called with IRQs disabled |
1294 | */ | 1310 | */ |
1295 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
1296 | { | ||
1297 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * Called with IRQs disabled | ||
1302 | */ | ||
1303 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1311 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1304 | enum event_type_t event_type) | 1312 | enum event_type_t event_type) |
1305 | { | 1313 | { |
@@ -1350,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1350 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1358 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1351 | continue; | 1359 | continue; |
1352 | 1360 | ||
1353 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1361 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1354 | if (group_sched_in(event, cpuctx, ctx)) | 1362 | if (group_sched_in(event, cpuctx, ctx)) |
1355 | can_add_hw = 0; | 1363 | can_add_hw = 0; |
1364 | } | ||
1356 | } | 1365 | } |
1357 | } | 1366 | } |
1358 | 1367 | ||
@@ -1368,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1368 | 1377 | ||
1369 | ctx->timestamp = perf_clock(); | 1378 | ctx->timestamp = perf_clock(); |
1370 | 1379 | ||
1371 | perf_disable(); | ||
1372 | |||
1373 | /* | 1380 | /* |
1374 | * First go through the list and put on any pinned groups | 1381 | * First go through the list and put on any pinned groups |
1375 | * in order to give them the best chance of going on. | 1382 | * in order to give them the best chance of going on. |
@@ -1381,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1381 | if (event_type & EVENT_FLEXIBLE) | 1388 | if (event_type & EVENT_FLEXIBLE) |
1382 | ctx_flexible_sched_in(ctx, cpuctx); | 1389 | ctx_flexible_sched_in(ctx, cpuctx); |
1383 | 1390 | ||
1384 | perf_enable(); | 1391 | out: |
1385 | out: | ||
1386 | raw_spin_unlock(&ctx->lock); | 1392 | raw_spin_unlock(&ctx->lock); |
1387 | } | 1393 | } |
1388 | 1394 | ||
@@ -1394,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
1394 | ctx_sched_in(ctx, cpuctx, event_type); | 1400 | ctx_sched_in(ctx, cpuctx, event_type); |
1395 | } | 1401 | } |
1396 | 1402 | ||
1397 | static void task_ctx_sched_in(struct task_struct *task, | 1403 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
1398 | enum event_type_t event_type) | 1404 | enum event_type_t event_type) |
1399 | { | 1405 | { |
1400 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1406 | struct perf_cpu_context *cpuctx; |
1401 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1402 | 1407 | ||
1403 | if (likely(!ctx)) | 1408 | cpuctx = __get_cpu_context(ctx); |
1404 | return; | ||
1405 | if (cpuctx->task_ctx == ctx) | 1409 | if (cpuctx->task_ctx == ctx) |
1406 | return; | 1410 | return; |
1411 | |||
1407 | ctx_sched_in(ctx, cpuctx, event_type); | 1412 | ctx_sched_in(ctx, cpuctx, event_type); |
1408 | cpuctx->task_ctx = ctx; | 1413 | cpuctx->task_ctx = ctx; |
1409 | } | 1414 | } |
1410 | /* | ||
1411 | * Called from scheduler to add the events of the current task | ||
1412 | * with interrupts disabled. | ||
1413 | * | ||
1414 | * We restore the event value and then enable it. | ||
1415 | * | ||
1416 | * This does not protect us against NMI, but enable() | ||
1417 | * sets the enabled bit in the control field of event _before_ | ||
1418 | * accessing the event control register. If a NMI hits, then it will | ||
1419 | * keep the event running. | ||
1420 | */ | ||
1421 | void perf_event_task_sched_in(struct task_struct *task) | ||
1422 | { | ||
1423 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1424 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1425 | 1415 | ||
1426 | if (likely(!ctx)) | 1416 | void perf_event_context_sched_in(struct perf_event_context *ctx) |
1427 | return; | 1417 | { |
1418 | struct perf_cpu_context *cpuctx; | ||
1428 | 1419 | ||
1420 | cpuctx = __get_cpu_context(ctx); | ||
1429 | if (cpuctx->task_ctx == ctx) | 1421 | if (cpuctx->task_ctx == ctx) |
1430 | return; | 1422 | return; |
1431 | 1423 | ||
1432 | perf_disable(); | 1424 | perf_pmu_disable(ctx->pmu); |
1433 | |||
1434 | /* | 1425 | /* |
1435 | * We want to keep the following priority order: | 1426 | * We want to keep the following priority order: |
1436 | * cpu pinned (that don't need to move), task pinned, | 1427 | * cpu pinned (that don't need to move), task pinned, |
@@ -1444,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
1444 | 1435 | ||
1445 | cpuctx->task_ctx = ctx; | 1436 | cpuctx->task_ctx = ctx; |
1446 | 1437 | ||
1447 | perf_enable(); | 1438 | /* |
1439 | * Since these rotations are per-cpu, we need to ensure the | ||
1440 | * cpu-context we got scheduled on is actually rotating. | ||
1441 | */ | ||
1442 | perf_pmu_rotate_start(ctx->pmu); | ||
1443 | perf_pmu_enable(ctx->pmu); | ||
1444 | } | ||
1445 | |||
1446 | /* | ||
1447 | * Called from scheduler to add the events of the current task | ||
1448 | * with interrupts disabled. | ||
1449 | * | ||
1450 | * We restore the event value and then enable it. | ||
1451 | * | ||
1452 | * This does not protect us against NMI, but enable() | ||
1453 | * sets the enabled bit in the control field of event _before_ | ||
1454 | * accessing the event control register. If a NMI hits, then it will | ||
1455 | * keep the event running. | ||
1456 | */ | ||
1457 | void __perf_event_task_sched_in(struct task_struct *task) | ||
1458 | { | ||
1459 | struct perf_event_context *ctx; | ||
1460 | int ctxn; | ||
1461 | |||
1462 | for_each_task_context_nr(ctxn) { | ||
1463 | ctx = task->perf_event_ctxp[ctxn]; | ||
1464 | if (likely(!ctx)) | ||
1465 | continue; | ||
1466 | |||
1467 | perf_event_context_sched_in(ctx); | ||
1468 | } | ||
1448 | } | 1469 | } |
1449 | 1470 | ||
1450 | #define MAX_INTERRUPTS (~0ULL) | 1471 | #define MAX_INTERRUPTS (~0ULL) |
@@ -1524,22 +1545,6 @@ do { \ | |||
1524 | return div64_u64(dividend, divisor); | 1545 | return div64_u64(dividend, divisor); |
1525 | } | 1546 | } |
1526 | 1547 | ||
1527 | static void perf_event_stop(struct perf_event *event) | ||
1528 | { | ||
1529 | if (!event->pmu->stop) | ||
1530 | return event->pmu->disable(event); | ||
1531 | |||
1532 | return event->pmu->stop(event); | ||
1533 | } | ||
1534 | |||
1535 | static int perf_event_start(struct perf_event *event) | ||
1536 | { | ||
1537 | if (!event->pmu->start) | ||
1538 | return event->pmu->enable(event); | ||
1539 | |||
1540 | return event->pmu->start(event); | ||
1541 | } | ||
1542 | |||
1543 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1548 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1544 | { | 1549 | { |
1545 | struct hw_perf_event *hwc = &event->hw; | 1550 | struct hw_perf_event *hwc = &event->hw; |
@@ -1559,15 +1564,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1559 | hwc->sample_period = sample_period; | 1564 | hwc->sample_period = sample_period; |
1560 | 1565 | ||
1561 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 1566 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1562 | perf_disable(); | 1567 | event->pmu->stop(event, PERF_EF_UPDATE); |
1563 | perf_event_stop(event); | ||
1564 | local64_set(&hwc->period_left, 0); | 1568 | local64_set(&hwc->period_left, 0); |
1565 | perf_event_start(event); | 1569 | event->pmu->start(event, PERF_EF_RELOAD); |
1566 | perf_enable(); | ||
1567 | } | 1570 | } |
1568 | } | 1571 | } |
1569 | 1572 | ||
1570 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1573 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
1571 | { | 1574 | { |
1572 | struct perf_event *event; | 1575 | struct perf_event *event; |
1573 | struct hw_perf_event *hwc; | 1576 | struct hw_perf_event *hwc; |
@@ -1592,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1592 | */ | 1595 | */ |
1593 | if (interrupts == MAX_INTERRUPTS) { | 1596 | if (interrupts == MAX_INTERRUPTS) { |
1594 | perf_log_throttle(event, 1); | 1597 | perf_log_throttle(event, 1); |
1595 | perf_disable(); | 1598 | event->pmu->start(event, 0); |
1596 | event->pmu->unthrottle(event); | ||
1597 | perf_enable(); | ||
1598 | } | 1599 | } |
1599 | 1600 | ||
1600 | if (!event->attr.freq || !event->attr.sample_freq) | 1601 | if (!event->attr.freq || !event->attr.sample_freq) |
1601 | continue; | 1602 | continue; |
1602 | 1603 | ||
1603 | perf_disable(); | ||
1604 | event->pmu->read(event); | 1604 | event->pmu->read(event); |
1605 | now = local64_read(&event->count); | 1605 | now = local64_read(&event->count); |
1606 | delta = now - hwc->freq_count_stamp; | 1606 | delta = now - hwc->freq_count_stamp; |
1607 | hwc->freq_count_stamp = now; | 1607 | hwc->freq_count_stamp = now; |
1608 | 1608 | ||
1609 | if (delta > 0) | 1609 | if (delta > 0) |
1610 | perf_adjust_period(event, TICK_NSEC, delta); | 1610 | perf_adjust_period(event, period, delta); |
1611 | perf_enable(); | ||
1612 | } | 1611 | } |
1613 | raw_spin_unlock(&ctx->lock); | 1612 | raw_spin_unlock(&ctx->lock); |
1614 | } | 1613 | } |
@@ -1626,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1626 | raw_spin_unlock(&ctx->lock); | 1625 | raw_spin_unlock(&ctx->lock); |
1627 | } | 1626 | } |
1628 | 1627 | ||
1629 | void perf_event_task_tick(struct task_struct *curr) | 1628 | /* |
1629 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
1630 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
1631 | * disabled, while rotate_context is called from IRQ context. | ||
1632 | */ | ||
1633 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
1630 | { | 1634 | { |
1631 | struct perf_cpu_context *cpuctx; | 1635 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
1632 | struct perf_event_context *ctx; | 1636 | struct perf_event_context *ctx = NULL; |
1633 | int rotate = 0; | 1637 | int rotate = 0, remove = 1; |
1634 | |||
1635 | if (!atomic_read(&nr_events)) | ||
1636 | return; | ||
1637 | 1638 | ||
1638 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1639 | if (cpuctx->ctx.nr_events) { |
1639 | if (cpuctx->ctx.nr_events && | 1640 | remove = 0; |
1640 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 1641 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
1641 | rotate = 1; | 1642 | rotate = 1; |
1643 | } | ||
1642 | 1644 | ||
1643 | ctx = curr->perf_event_ctxp; | 1645 | ctx = cpuctx->task_ctx; |
1644 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 1646 | if (ctx && ctx->nr_events) { |
1645 | rotate = 1; | 1647 | remove = 0; |
1648 | if (ctx->nr_events != ctx->nr_active) | ||
1649 | rotate = 1; | ||
1650 | } | ||
1646 | 1651 | ||
1647 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1652 | perf_pmu_disable(cpuctx->ctx.pmu); |
1653 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
1648 | if (ctx) | 1654 | if (ctx) |
1649 | perf_ctx_adjust_freq(ctx); | 1655 | perf_ctx_adjust_freq(ctx, interval); |
1650 | 1656 | ||
1651 | if (!rotate) | 1657 | if (!rotate) |
1652 | return; | 1658 | goto done; |
1653 | 1659 | ||
1654 | perf_disable(); | ||
1655 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1660 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1656 | if (ctx) | 1661 | if (ctx) |
1657 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1662 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1662,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1662 | 1667 | ||
1663 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1668 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1664 | if (ctx) | 1669 | if (ctx) |
1665 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1670 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1666 | perf_enable(); | 1671 | |
1672 | done: | ||
1673 | if (remove) | ||
1674 | list_del_init(&cpuctx->rotation_list); | ||
1675 | |||
1676 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
1677 | } | ||
1678 | |||
1679 | void perf_event_task_tick(void) | ||
1680 | { | ||
1681 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
1682 | struct perf_cpu_context *cpuctx, *tmp; | ||
1683 | |||
1684 | WARN_ON(!irqs_disabled()); | ||
1685 | |||
1686 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
1687 | if (cpuctx->jiffies_interval == 1 || | ||
1688 | !(jiffies % cpuctx->jiffies_interval)) | ||
1689 | perf_rotate_context(cpuctx); | ||
1690 | } | ||
1667 | } | 1691 | } |
1668 | 1692 | ||
1669 | static int event_enable_on_exec(struct perf_event *event, | 1693 | static int event_enable_on_exec(struct perf_event *event, |
@@ -1685,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event, | |||
1685 | * Enable all of a task's events that have been marked enable-on-exec. | 1709 | * Enable all of a task's events that have been marked enable-on-exec. |
1686 | * This expects task == current. | 1710 | * This expects task == current. |
1687 | */ | 1711 | */ |
1688 | static void perf_event_enable_on_exec(struct task_struct *task) | 1712 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
1689 | { | 1713 | { |
1690 | struct perf_event_context *ctx; | ||
1691 | struct perf_event *event; | 1714 | struct perf_event *event; |
1692 | unsigned long flags; | 1715 | unsigned long flags; |
1693 | int enabled = 0; | 1716 | int enabled = 0; |
1694 | int ret; | 1717 | int ret; |
1695 | 1718 | ||
1696 | local_irq_save(flags); | 1719 | local_irq_save(flags); |
1697 | ctx = task->perf_event_ctxp; | ||
1698 | if (!ctx || !ctx->nr_events) | 1720 | if (!ctx || !ctx->nr_events) |
1699 | goto out; | 1721 | goto out; |
1700 | 1722 | ||
1701 | __perf_event_task_sched_out(ctx); | 1723 | task_ctx_sched_out(ctx, EVENT_ALL); |
1702 | 1724 | ||
1703 | raw_spin_lock(&ctx->lock); | 1725 | raw_spin_lock(&ctx->lock); |
1704 | 1726 | ||
@@ -1722,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1722 | 1744 | ||
1723 | raw_spin_unlock(&ctx->lock); | 1745 | raw_spin_unlock(&ctx->lock); |
1724 | 1746 | ||
1725 | perf_event_task_sched_in(task); | 1747 | perf_event_context_sched_in(ctx); |
1726 | out: | 1748 | out: |
1727 | local_irq_restore(flags); | 1749 | local_irq_restore(flags); |
1728 | } | 1750 | } |
1729 | 1751 | ||
@@ -1732,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1732 | */ | 1754 | */ |
1733 | static void __perf_event_read(void *info) | 1755 | static void __perf_event_read(void *info) |
1734 | { | 1756 | { |
1735 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1736 | struct perf_event *event = info; | 1757 | struct perf_event *event = info; |
1737 | struct perf_event_context *ctx = event->ctx; | 1758 | struct perf_event_context *ctx = event->ctx; |
1759 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1738 | 1760 | ||
1739 | /* | 1761 | /* |
1740 | * If this is a task context, we need to check whether it is | 1762 | * If this is a task context, we need to check whether it is |
@@ -1773,7 +1795,13 @@ static u64 perf_event_read(struct perf_event *event) | |||
1773 | unsigned long flags; | 1795 | unsigned long flags; |
1774 | 1796 | ||
1775 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1797 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1776 | update_context_time(ctx); | 1798 | /* |
1799 | * may read while context is not active | ||
1800 | * (e.g., thread is blocked), in that case | ||
1801 | * we cannot update context time | ||
1802 | */ | ||
1803 | if (ctx->is_active) | ||
1804 | update_context_time(ctx); | ||
1777 | update_event_times(event); | 1805 | update_event_times(event); |
1778 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1806 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1779 | } | 1807 | } |
@@ -1782,11 +1810,219 @@ static u64 perf_event_read(struct perf_event *event) | |||
1782 | } | 1810 | } |
1783 | 1811 | ||
1784 | /* | 1812 | /* |
1785 | * Initialize the perf_event context in a task_struct: | 1813 | * Callchain support |
1786 | */ | 1814 | */ |
1815 | |||
1816 | struct callchain_cpus_entries { | ||
1817 | struct rcu_head rcu_head; | ||
1818 | struct perf_callchain_entry *cpu_entries[0]; | ||
1819 | }; | ||
1820 | |||
1821 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
1822 | static atomic_t nr_callchain_events; | ||
1823 | static DEFINE_MUTEX(callchain_mutex); | ||
1824 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
1825 | |||
1826 | |||
1827 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
1828 | struct pt_regs *regs) | ||
1829 | { | ||
1830 | } | ||
1831 | |||
1832 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
1833 | struct pt_regs *regs) | ||
1834 | { | ||
1835 | } | ||
1836 | |||
1837 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
1838 | { | ||
1839 | struct callchain_cpus_entries *entries; | ||
1840 | int cpu; | ||
1841 | |||
1842 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
1843 | |||
1844 | for_each_possible_cpu(cpu) | ||
1845 | kfree(entries->cpu_entries[cpu]); | ||
1846 | |||
1847 | kfree(entries); | ||
1848 | } | ||
1849 | |||
1850 | static void release_callchain_buffers(void) | ||
1851 | { | ||
1852 | struct callchain_cpus_entries *entries; | ||
1853 | |||
1854 | entries = callchain_cpus_entries; | ||
1855 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
1856 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
1857 | } | ||
1858 | |||
1859 | static int alloc_callchain_buffers(void) | ||
1860 | { | ||
1861 | int cpu; | ||
1862 | int size; | ||
1863 | struct callchain_cpus_entries *entries; | ||
1864 | |||
1865 | /* | ||
1866 | * We can't use the percpu allocation API for data that can be | ||
1867 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
1868 | * until that gets sorted out. | ||
1869 | */ | ||
1870 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | ||
1871 | num_possible_cpus(); | ||
1872 | |||
1873 | entries = kzalloc(size, GFP_KERNEL); | ||
1874 | if (!entries) | ||
1875 | return -ENOMEM; | ||
1876 | |||
1877 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
1878 | |||
1879 | for_each_possible_cpu(cpu) { | ||
1880 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
1881 | cpu_to_node(cpu)); | ||
1882 | if (!entries->cpu_entries[cpu]) | ||
1883 | goto fail; | ||
1884 | } | ||
1885 | |||
1886 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
1887 | |||
1888 | return 0; | ||
1889 | |||
1890 | fail: | ||
1891 | for_each_possible_cpu(cpu) | ||
1892 | kfree(entries->cpu_entries[cpu]); | ||
1893 | kfree(entries); | ||
1894 | |||
1895 | return -ENOMEM; | ||
1896 | } | ||
1897 | |||
1898 | static int get_callchain_buffers(void) | ||
1899 | { | ||
1900 | int err = 0; | ||
1901 | int count; | ||
1902 | |||
1903 | mutex_lock(&callchain_mutex); | ||
1904 | |||
1905 | count = atomic_inc_return(&nr_callchain_events); | ||
1906 | if (WARN_ON_ONCE(count < 1)) { | ||
1907 | err = -EINVAL; | ||
1908 | goto exit; | ||
1909 | } | ||
1910 | |||
1911 | if (count > 1) { | ||
1912 | /* If the allocation failed, give up */ | ||
1913 | if (!callchain_cpus_entries) | ||
1914 | err = -ENOMEM; | ||
1915 | goto exit; | ||
1916 | } | ||
1917 | |||
1918 | err = alloc_callchain_buffers(); | ||
1919 | if (err) | ||
1920 | release_callchain_buffers(); | ||
1921 | exit: | ||
1922 | mutex_unlock(&callchain_mutex); | ||
1923 | |||
1924 | return err; | ||
1925 | } | ||
1926 | |||
1927 | static void put_callchain_buffers(void) | ||
1928 | { | ||
1929 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
1930 | release_callchain_buffers(); | ||
1931 | mutex_unlock(&callchain_mutex); | ||
1932 | } | ||
1933 | } | ||
1934 | |||
1935 | static int get_recursion_context(int *recursion) | ||
1936 | { | ||
1937 | int rctx; | ||
1938 | |||
1939 | if (in_nmi()) | ||
1940 | rctx = 3; | ||
1941 | else if (in_irq()) | ||
1942 | rctx = 2; | ||
1943 | else if (in_softirq()) | ||
1944 | rctx = 1; | ||
1945 | else | ||
1946 | rctx = 0; | ||
1947 | |||
1948 | if (recursion[rctx]) | ||
1949 | return -1; | ||
1950 | |||
1951 | recursion[rctx]++; | ||
1952 | barrier(); | ||
1953 | |||
1954 | return rctx; | ||
1955 | } | ||
1956 | |||
1957 | static inline void put_recursion_context(int *recursion, int rctx) | ||
1958 | { | ||
1959 | barrier(); | ||
1960 | recursion[rctx]--; | ||
1961 | } | ||
1962 | |||
1963 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
1964 | { | ||
1965 | int cpu; | ||
1966 | struct callchain_cpus_entries *entries; | ||
1967 | |||
1968 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
1969 | if (*rctx == -1) | ||
1970 | return NULL; | ||
1971 | |||
1972 | entries = rcu_dereference(callchain_cpus_entries); | ||
1973 | if (!entries) | ||
1974 | return NULL; | ||
1975 | |||
1976 | cpu = smp_processor_id(); | ||
1977 | |||
1978 | return &entries->cpu_entries[cpu][*rctx]; | ||
1979 | } | ||
1980 | |||
1787 | static void | 1981 | static void |
1788 | __perf_event_init_context(struct perf_event_context *ctx, | 1982 | put_callchain_entry(int rctx) |
1789 | struct task_struct *task) | 1983 | { |
1984 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
1985 | } | ||
1986 | |||
1987 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1988 | { | ||
1989 | int rctx; | ||
1990 | struct perf_callchain_entry *entry; | ||
1991 | |||
1992 | |||
1993 | entry = get_callchain_entry(&rctx); | ||
1994 | if (rctx == -1) | ||
1995 | return NULL; | ||
1996 | |||
1997 | if (!entry) | ||
1998 | goto exit_put; | ||
1999 | |||
2000 | entry->nr = 0; | ||
2001 | |||
2002 | if (!user_mode(regs)) { | ||
2003 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2004 | perf_callchain_kernel(entry, regs); | ||
2005 | if (current->mm) | ||
2006 | regs = task_pt_regs(current); | ||
2007 | else | ||
2008 | regs = NULL; | ||
2009 | } | ||
2010 | |||
2011 | if (regs) { | ||
2012 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2013 | perf_callchain_user(entry, regs); | ||
2014 | } | ||
2015 | |||
2016 | exit_put: | ||
2017 | put_callchain_entry(rctx); | ||
2018 | |||
2019 | return entry; | ||
2020 | } | ||
2021 | |||
2022 | /* | ||
2023 | * Initialize the perf_event context in a task_struct: | ||
2024 | */ | ||
2025 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
1790 | { | 2026 | { |
1791 | raw_spin_lock_init(&ctx->lock); | 2027 | raw_spin_lock_init(&ctx->lock); |
1792 | mutex_init(&ctx->mutex); | 2028 | mutex_init(&ctx->mutex); |
@@ -1794,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
1794 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2030 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1795 | INIT_LIST_HEAD(&ctx->event_list); | 2031 | INIT_LIST_HEAD(&ctx->event_list); |
1796 | atomic_set(&ctx->refcount, 1); | 2032 | atomic_set(&ctx->refcount, 1); |
1797 | ctx->task = task; | ||
1798 | } | 2033 | } |
1799 | 2034 | ||
1800 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2035 | static struct perf_event_context * |
2036 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
1801 | { | 2037 | { |
1802 | struct perf_event_context *ctx; | 2038 | struct perf_event_context *ctx; |
1803 | struct perf_cpu_context *cpuctx; | ||
1804 | struct task_struct *task; | ||
1805 | unsigned long flags; | ||
1806 | int err; | ||
1807 | |||
1808 | if (pid == -1 && cpu != -1) { | ||
1809 | /* Must be root to operate on a CPU event: */ | ||
1810 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
1811 | return ERR_PTR(-EACCES); | ||
1812 | 2039 | ||
1813 | if (cpu < 0 || cpu >= nr_cpumask_bits) | 2040 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
1814 | return ERR_PTR(-EINVAL); | 2041 | if (!ctx) |
2042 | return NULL; | ||
1815 | 2043 | ||
1816 | /* | 2044 | __perf_event_init_context(ctx); |
1817 | * We could be clever and allow to attach a event to an | 2045 | if (task) { |
1818 | * offline CPU and activate it when the CPU comes up, but | 2046 | ctx->task = task; |
1819 | * that's for later. | 2047 | get_task_struct(task); |
1820 | */ | 2048 | } |
1821 | if (!cpu_online(cpu)) | 2049 | ctx->pmu = pmu; |
1822 | return ERR_PTR(-ENODEV); | ||
1823 | 2050 | ||
1824 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2051 | return ctx; |
1825 | ctx = &cpuctx->ctx; | 2052 | } |
1826 | get_ctx(ctx); | ||
1827 | 2053 | ||
1828 | return ctx; | 2054 | static struct task_struct * |
1829 | } | 2055 | find_lively_task_by_vpid(pid_t vpid) |
2056 | { | ||
2057 | struct task_struct *task; | ||
2058 | int err; | ||
1830 | 2059 | ||
1831 | rcu_read_lock(); | 2060 | rcu_read_lock(); |
1832 | if (!pid) | 2061 | if (!vpid) |
1833 | task = current; | 2062 | task = current; |
1834 | else | 2063 | else |
1835 | task = find_task_by_vpid(pid); | 2064 | task = find_task_by_vpid(vpid); |
1836 | if (task) | 2065 | if (task) |
1837 | get_task_struct(task); | 2066 | get_task_struct(task); |
1838 | rcu_read_unlock(); | 2067 | rcu_read_unlock(); |
@@ -1852,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
1852 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2081 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
1853 | goto errout; | 2082 | goto errout; |
1854 | 2083 | ||
1855 | retry: | 2084 | return task; |
1856 | ctx = perf_lock_task_context(task, &flags); | 2085 | errout: |
2086 | put_task_struct(task); | ||
2087 | return ERR_PTR(err); | ||
2088 | |||
2089 | } | ||
2090 | |||
2091 | static struct perf_event_context * | ||
2092 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
2093 | { | ||
2094 | struct perf_event_context *ctx; | ||
2095 | struct perf_cpu_context *cpuctx; | ||
2096 | unsigned long flags; | ||
2097 | int ctxn, err; | ||
2098 | |||
2099 | if (!task && cpu != -1) { | ||
2100 | /* Must be root to operate on a CPU event: */ | ||
2101 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
2102 | return ERR_PTR(-EACCES); | ||
2103 | |||
2104 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2105 | return ERR_PTR(-EINVAL); | ||
2106 | |||
2107 | /* | ||
2108 | * We could be clever and allow to attach a event to an | ||
2109 | * offline CPU and activate it when the CPU comes up, but | ||
2110 | * that's for later. | ||
2111 | */ | ||
2112 | if (!cpu_online(cpu)) | ||
2113 | return ERR_PTR(-ENODEV); | ||
2114 | |||
2115 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
2116 | ctx = &cpuctx->ctx; | ||
2117 | get_ctx(ctx); | ||
2118 | |||
2119 | return ctx; | ||
2120 | } | ||
2121 | |||
2122 | err = -EINVAL; | ||
2123 | ctxn = pmu->task_ctx_nr; | ||
2124 | if (ctxn < 0) | ||
2125 | goto errout; | ||
2126 | |||
2127 | retry: | ||
2128 | ctx = perf_lock_task_context(task, ctxn, &flags); | ||
1857 | if (ctx) { | 2129 | if (ctx) { |
1858 | unclone_ctx(ctx); | 2130 | unclone_ctx(ctx); |
1859 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2131 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1860 | } | 2132 | } |
1861 | 2133 | ||
1862 | if (!ctx) { | 2134 | if (!ctx) { |
1863 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2135 | ctx = alloc_perf_context(pmu, task); |
1864 | err = -ENOMEM; | 2136 | err = -ENOMEM; |
1865 | if (!ctx) | 2137 | if (!ctx) |
1866 | goto errout; | 2138 | goto errout; |
1867 | __perf_event_init_context(ctx, task); | 2139 | |
1868 | get_ctx(ctx); | 2140 | get_ctx(ctx); |
1869 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2141 | |
2142 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | ||
1870 | /* | 2143 | /* |
1871 | * We raced with some other task; use | 2144 | * We raced with some other task; use |
1872 | * the context they set. | 2145 | * the context they set. |
1873 | */ | 2146 | */ |
2147 | put_task_struct(task); | ||
1874 | kfree(ctx); | 2148 | kfree(ctx); |
1875 | goto retry; | 2149 | goto retry; |
1876 | } | 2150 | } |
1877 | get_task_struct(task); | ||
1878 | } | 2151 | } |
1879 | 2152 | ||
1880 | put_task_struct(task); | ||
1881 | return ctx; | 2153 | return ctx; |
1882 | 2154 | ||
1883 | errout: | 2155 | errout: |
1884 | put_task_struct(task); | ||
1885 | return ERR_PTR(err); | 2156 | return ERR_PTR(err); |
1886 | } | 2157 | } |
1887 | 2158 | ||
@@ -1898,21 +2169,23 @@ static void free_event_rcu(struct rcu_head *head) | |||
1898 | kfree(event); | 2169 | kfree(event); |
1899 | } | 2170 | } |
1900 | 2171 | ||
1901 | static void perf_pending_sync(struct perf_event *event); | ||
1902 | static void perf_buffer_put(struct perf_buffer *buffer); | 2172 | static void perf_buffer_put(struct perf_buffer *buffer); |
1903 | 2173 | ||
1904 | static void free_event(struct perf_event *event) | 2174 | static void free_event(struct perf_event *event) |
1905 | { | 2175 | { |
1906 | perf_pending_sync(event); | 2176 | irq_work_sync(&event->pending); |
1907 | 2177 | ||
1908 | if (!event->parent) { | 2178 | if (!event->parent) { |
1909 | atomic_dec(&nr_events); | 2179 | if (event->attach_state & PERF_ATTACH_TASK) |
2180 | jump_label_dec(&perf_task_events); | ||
1910 | if (event->attr.mmap || event->attr.mmap_data) | 2181 | if (event->attr.mmap || event->attr.mmap_data) |
1911 | atomic_dec(&nr_mmap_events); | 2182 | atomic_dec(&nr_mmap_events); |
1912 | if (event->attr.comm) | 2183 | if (event->attr.comm) |
1913 | atomic_dec(&nr_comm_events); | 2184 | atomic_dec(&nr_comm_events); |
1914 | if (event->attr.task) | 2185 | if (event->attr.task) |
1915 | atomic_dec(&nr_task_events); | 2186 | atomic_dec(&nr_task_events); |
2187 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
2188 | put_callchain_buffers(); | ||
1916 | } | 2189 | } |
1917 | 2190 | ||
1918 | if (event->buffer) { | 2191 | if (event->buffer) { |
@@ -1923,7 +2196,9 @@ static void free_event(struct perf_event *event) | |||
1923 | if (event->destroy) | 2196 | if (event->destroy) |
1924 | event->destroy(event); | 2197 | event->destroy(event); |
1925 | 2198 | ||
1926 | put_ctx(event->ctx); | 2199 | if (event->ctx) |
2200 | put_ctx(event->ctx); | ||
2201 | |||
1927 | call_rcu(&event->rcu_head, free_event_rcu); | 2202 | call_rcu(&event->rcu_head, free_event_rcu); |
1928 | } | 2203 | } |
1929 | 2204 | ||
@@ -2342,6 +2617,9 @@ int perf_event_task_disable(void) | |||
2342 | 2617 | ||
2343 | static int perf_event_index(struct perf_event *event) | 2618 | static int perf_event_index(struct perf_event *event) |
2344 | { | 2619 | { |
2620 | if (event->hw.state & PERF_HES_STOPPED) | ||
2621 | return 0; | ||
2622 | |||
2345 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2623 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2346 | return 0; | 2624 | return 0; |
2347 | 2625 | ||
@@ -2845,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
2845 | } | 3123 | } |
2846 | } | 3124 | } |
2847 | 3125 | ||
2848 | /* | 3126 | static void perf_pending_event(struct irq_work *entry) |
2849 | * Pending wakeups | ||
2850 | * | ||
2851 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
2852 | * | ||
2853 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
2854 | * single linked list and use cmpxchg() to add entries lockless. | ||
2855 | */ | ||
2856 | |||
2857 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
2858 | { | 3127 | { |
2859 | struct perf_event *event = container_of(entry, | 3128 | struct perf_event *event = container_of(entry, |
2860 | struct perf_event, pending); | 3129 | struct perf_event, pending); |
@@ -2870,99 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
2870 | } | 3139 | } |
2871 | } | 3140 | } |
2872 | 3141 | ||
2873 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
2874 | |||
2875 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
2876 | PENDING_TAIL, | ||
2877 | }; | ||
2878 | |||
2879 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
2880 | void (*func)(struct perf_pending_entry *)) | ||
2881 | { | ||
2882 | struct perf_pending_entry **head; | ||
2883 | |||
2884 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
2885 | return; | ||
2886 | |||
2887 | entry->func = func; | ||
2888 | |||
2889 | head = &get_cpu_var(perf_pending_head); | ||
2890 | |||
2891 | do { | ||
2892 | entry->next = *head; | ||
2893 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
2894 | |||
2895 | set_perf_event_pending(); | ||
2896 | |||
2897 | put_cpu_var(perf_pending_head); | ||
2898 | } | ||
2899 | |||
2900 | static int __perf_pending_run(void) | ||
2901 | { | ||
2902 | struct perf_pending_entry *list; | ||
2903 | int nr = 0; | ||
2904 | |||
2905 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
2906 | while (list != PENDING_TAIL) { | ||
2907 | void (*func)(struct perf_pending_entry *); | ||
2908 | struct perf_pending_entry *entry = list; | ||
2909 | |||
2910 | list = list->next; | ||
2911 | |||
2912 | func = entry->func; | ||
2913 | entry->next = NULL; | ||
2914 | /* | ||
2915 | * Ensure we observe the unqueue before we issue the wakeup, | ||
2916 | * so that we won't be waiting forever. | ||
2917 | * -- see perf_not_pending(). | ||
2918 | */ | ||
2919 | smp_wmb(); | ||
2920 | |||
2921 | func(entry); | ||
2922 | nr++; | ||
2923 | } | ||
2924 | |||
2925 | return nr; | ||
2926 | } | ||
2927 | |||
2928 | static inline int perf_not_pending(struct perf_event *event) | ||
2929 | { | ||
2930 | /* | ||
2931 | * If we flush on whatever cpu we run, there is a chance we don't | ||
2932 | * need to wait. | ||
2933 | */ | ||
2934 | get_cpu(); | ||
2935 | __perf_pending_run(); | ||
2936 | put_cpu(); | ||
2937 | |||
2938 | /* | ||
2939 | * Ensure we see the proper queue state before going to sleep | ||
2940 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
2941 | */ | ||
2942 | smp_rmb(); | ||
2943 | return event->pending.next == NULL; | ||
2944 | } | ||
2945 | |||
2946 | static void perf_pending_sync(struct perf_event *event) | ||
2947 | { | ||
2948 | wait_event(event->waitq, perf_not_pending(event)); | ||
2949 | } | ||
2950 | |||
2951 | void perf_event_do_pending(void) | ||
2952 | { | ||
2953 | __perf_pending_run(); | ||
2954 | } | ||
2955 | |||
2956 | /* | ||
2957 | * Callchain support -- arch specific | ||
2958 | */ | ||
2959 | |||
2960 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2961 | { | ||
2962 | return NULL; | ||
2963 | } | ||
2964 | |||
2965 | |||
2966 | /* | 3142 | /* |
2967 | * We assume there is only KVM supporting the callbacks. | 3143 | * We assume there is only KVM supporting the callbacks. |
2968 | * Later on, we might change it to a list if there is | 3144 | * Later on, we might change it to a list if there is |
@@ -3012,8 +3188,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
3012 | 3188 | ||
3013 | if (handle->nmi) { | 3189 | if (handle->nmi) { |
3014 | handle->event->pending_wakeup = 1; | 3190 | handle->event->pending_wakeup = 1; |
3015 | perf_pending_queue(&handle->event->pending, | 3191 | irq_work_queue(&handle->event->pending); |
3016 | perf_pending_event); | ||
3017 | } else | 3192 | } else |
3018 | perf_event_wakeup(handle->event); | 3193 | perf_event_wakeup(handle->event); |
3019 | } | 3194 | } |
@@ -3069,7 +3244,7 @@ again: | |||
3069 | if (handle->wakeup != local_read(&buffer->wakeup)) | 3244 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3070 | perf_output_wakeup(handle); | 3245 | perf_output_wakeup(handle); |
3071 | 3246 | ||
3072 | out: | 3247 | out: |
3073 | preempt_enable(); | 3248 | preempt_enable(); |
3074 | } | 3249 | } |
3075 | 3250 | ||
@@ -3457,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
3457 | struct perf_output_handle handle; | 3632 | struct perf_output_handle handle; |
3458 | struct perf_event_header header; | 3633 | struct perf_event_header header; |
3459 | 3634 | ||
3635 | /* protect the callchain buffers */ | ||
3636 | rcu_read_lock(); | ||
3637 | |||
3460 | perf_prepare_sample(&header, data, event, regs); | 3638 | perf_prepare_sample(&header, data, event, regs); |
3461 | 3639 | ||
3462 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 3640 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3463 | return; | 3641 | goto exit; |
3464 | 3642 | ||
3465 | perf_output_sample(&handle, &header, data, event); | 3643 | perf_output_sample(&handle, &header, data, event); |
3466 | 3644 | ||
3467 | perf_output_end(&handle); | 3645 | perf_output_end(&handle); |
3646 | |||
3647 | exit: | ||
3648 | rcu_read_unlock(); | ||
3468 | } | 3649 | } |
3469 | 3650 | ||
3470 | /* | 3651 | /* |
@@ -3578,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
3578 | static void perf_event_task_event(struct perf_task_event *task_event) | 3759 | static void perf_event_task_event(struct perf_task_event *task_event) |
3579 | { | 3760 | { |
3580 | struct perf_cpu_context *cpuctx; | 3761 | struct perf_cpu_context *cpuctx; |
3581 | struct perf_event_context *ctx = task_event->task_ctx; | 3762 | struct perf_event_context *ctx; |
3763 | struct pmu *pmu; | ||
3764 | int ctxn; | ||
3582 | 3765 | ||
3583 | rcu_read_lock(); | 3766 | rcu_read_lock(); |
3584 | cpuctx = &get_cpu_var(perf_cpu_context); | 3767 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3585 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3768 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3586 | if (!ctx) | 3769 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3587 | ctx = rcu_dereference(current->perf_event_ctxp); | 3770 | |
3588 | if (ctx) | 3771 | ctx = task_event->task_ctx; |
3589 | perf_event_task_ctx(ctx, task_event); | 3772 | if (!ctx) { |
3590 | put_cpu_var(perf_cpu_context); | 3773 | ctxn = pmu->task_ctx_nr; |
3774 | if (ctxn < 0) | ||
3775 | goto next; | ||
3776 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3777 | } | ||
3778 | if (ctx) | ||
3779 | perf_event_task_ctx(ctx, task_event); | ||
3780 | next: | ||
3781 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3782 | } | ||
3591 | rcu_read_unlock(); | 3783 | rcu_read_unlock(); |
3592 | } | 3784 | } |
3593 | 3785 | ||
@@ -3692,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3692 | { | 3884 | { |
3693 | struct perf_cpu_context *cpuctx; | 3885 | struct perf_cpu_context *cpuctx; |
3694 | struct perf_event_context *ctx; | 3886 | struct perf_event_context *ctx; |
3695 | unsigned int size; | ||
3696 | char comm[TASK_COMM_LEN]; | 3887 | char comm[TASK_COMM_LEN]; |
3888 | unsigned int size; | ||
3889 | struct pmu *pmu; | ||
3890 | int ctxn; | ||
3697 | 3891 | ||
3698 | memset(comm, 0, sizeof(comm)); | 3892 | memset(comm, 0, sizeof(comm)); |
3699 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 3893 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -3705,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3705 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 3899 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3706 | 3900 | ||
3707 | rcu_read_lock(); | 3901 | rcu_read_lock(); |
3708 | cpuctx = &get_cpu_var(perf_cpu_context); | 3902 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3709 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3903 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3710 | ctx = rcu_dereference(current->perf_event_ctxp); | 3904 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3711 | if (ctx) | 3905 | |
3712 | perf_event_comm_ctx(ctx, comm_event); | 3906 | ctxn = pmu->task_ctx_nr; |
3713 | put_cpu_var(perf_cpu_context); | 3907 | if (ctxn < 0) |
3908 | goto next; | ||
3909 | |||
3910 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3911 | if (ctx) | ||
3912 | perf_event_comm_ctx(ctx, comm_event); | ||
3913 | next: | ||
3914 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3915 | } | ||
3714 | rcu_read_unlock(); | 3916 | rcu_read_unlock(); |
3715 | } | 3917 | } |
3716 | 3918 | ||
3717 | void perf_event_comm(struct task_struct *task) | 3919 | void perf_event_comm(struct task_struct *task) |
3718 | { | 3920 | { |
3719 | struct perf_comm_event comm_event; | 3921 | struct perf_comm_event comm_event; |
3922 | struct perf_event_context *ctx; | ||
3923 | int ctxn; | ||
3720 | 3924 | ||
3721 | if (task->perf_event_ctxp) | 3925 | for_each_task_context_nr(ctxn) { |
3722 | perf_event_enable_on_exec(task); | 3926 | ctx = task->perf_event_ctxp[ctxn]; |
3927 | if (!ctx) | ||
3928 | continue; | ||
3929 | |||
3930 | perf_event_enable_on_exec(ctx); | ||
3931 | } | ||
3723 | 3932 | ||
3724 | if (!atomic_read(&nr_comm_events)) | 3933 | if (!atomic_read(&nr_comm_events)) |
3725 | return; | 3934 | return; |
@@ -3821,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3821 | char tmp[16]; | 4030 | char tmp[16]; |
3822 | char *buf = NULL; | 4031 | char *buf = NULL; |
3823 | const char *name; | 4032 | const char *name; |
4033 | struct pmu *pmu; | ||
4034 | int ctxn; | ||
3824 | 4035 | ||
3825 | memset(tmp, 0, sizeof(tmp)); | 4036 | memset(tmp, 0, sizeof(tmp)); |
3826 | 4037 | ||
@@ -3873,12 +4084,23 @@ got_name: | |||
3873 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4084 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3874 | 4085 | ||
3875 | rcu_read_lock(); | 4086 | rcu_read_lock(); |
3876 | cpuctx = &get_cpu_var(perf_cpu_context); | 4087 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3877 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4088 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3878 | ctx = rcu_dereference(current->perf_event_ctxp); | 4089 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
3879 | if (ctx) | 4090 | vma->vm_flags & VM_EXEC); |
3880 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4091 | |
3881 | put_cpu_var(perf_cpu_context); | 4092 | ctxn = pmu->task_ctx_nr; |
4093 | if (ctxn < 0) | ||
4094 | goto next; | ||
4095 | |||
4096 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4097 | if (ctx) { | ||
4098 | perf_event_mmap_ctx(ctx, mmap_event, | ||
4099 | vma->vm_flags & VM_EXEC); | ||
4100 | } | ||
4101 | next: | ||
4102 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4103 | } | ||
3882 | rcu_read_unlock(); | 4104 | rcu_read_unlock(); |
3883 | 4105 | ||
3884 | kfree(buf); | 4106 | kfree(buf); |
@@ -3960,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3960 | struct hw_perf_event *hwc = &event->hw; | 4182 | struct hw_perf_event *hwc = &event->hw; |
3961 | int ret = 0; | 4183 | int ret = 0; |
3962 | 4184 | ||
3963 | throttle = (throttle && event->pmu->unthrottle != NULL); | ||
3964 | |||
3965 | if (!throttle) { | 4185 | if (!throttle) { |
3966 | hwc->interrupts++; | 4186 | hwc->interrupts++; |
3967 | } else { | 4187 | } else { |
@@ -4004,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4004 | event->pending_kill = POLL_HUP; | 4224 | event->pending_kill = POLL_HUP; |
4005 | if (nmi) { | 4225 | if (nmi) { |
4006 | event->pending_disable = 1; | 4226 | event->pending_disable = 1; |
4007 | perf_pending_queue(&event->pending, | 4227 | irq_work_queue(&event->pending); |
4008 | perf_pending_event); | ||
4009 | } else | 4228 | } else |
4010 | perf_event_disable(event); | 4229 | perf_event_disable(event); |
4011 | } | 4230 | } |
@@ -4029,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
4029 | * Generic software event infrastructure | 4248 | * Generic software event infrastructure |
4030 | */ | 4249 | */ |
4031 | 4250 | ||
4251 | struct swevent_htable { | ||
4252 | struct swevent_hlist *swevent_hlist; | ||
4253 | struct mutex hlist_mutex; | ||
4254 | int hlist_refcount; | ||
4255 | |||
4256 | /* Recursion avoidance in each contexts */ | ||
4257 | int recursion[PERF_NR_CONTEXTS]; | ||
4258 | }; | ||
4259 | |||
4260 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
4261 | |||
4032 | /* | 4262 | /* |
4033 | * We directly increment event->count and keep a second value in | 4263 | * We directly increment event->count and keep a second value in |
4034 | * event->hw.period_left to count intervals. This period event | 4264 | * event->hw.period_left to count intervals. This period event |
@@ -4086,7 +4316,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4086 | } | 4316 | } |
4087 | } | 4317 | } |
4088 | 4318 | ||
4089 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 4319 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
4090 | int nmi, struct perf_sample_data *data, | 4320 | int nmi, struct perf_sample_data *data, |
4091 | struct pt_regs *regs) | 4321 | struct pt_regs *regs) |
4092 | { | 4322 | { |
@@ -4112,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4112 | static int perf_exclude_event(struct perf_event *event, | 4342 | static int perf_exclude_event(struct perf_event *event, |
4113 | struct pt_regs *regs) | 4343 | struct pt_regs *regs) |
4114 | { | 4344 | { |
4345 | if (event->hw.state & PERF_HES_STOPPED) | ||
4346 | return 0; | ||
4347 | |||
4115 | if (regs) { | 4348 | if (regs) { |
4116 | if (event->attr.exclude_user && user_mode(regs)) | 4349 | if (event->attr.exclude_user && user_mode(regs)) |
4117 | return 1; | 4350 | return 1; |
@@ -4158,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
4158 | 4391 | ||
4159 | /* For the read side: events when they trigger */ | 4392 | /* For the read side: events when they trigger */ |
4160 | static inline struct hlist_head * | 4393 | static inline struct hlist_head * |
4161 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4394 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
4162 | { | 4395 | { |
4163 | struct swevent_hlist *hlist; | 4396 | struct swevent_hlist *hlist; |
4164 | 4397 | ||
4165 | hlist = rcu_dereference(ctx->swevent_hlist); | 4398 | hlist = rcu_dereference(swhash->swevent_hlist); |
4166 | if (!hlist) | 4399 | if (!hlist) |
4167 | return NULL; | 4400 | return NULL; |
4168 | 4401 | ||
@@ -4171,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
4171 | 4404 | ||
4172 | /* For the event head insertion and removal in the hlist */ | 4405 | /* For the event head insertion and removal in the hlist */ |
4173 | static inline struct hlist_head * | 4406 | static inline struct hlist_head * |
4174 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 4407 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
4175 | { | 4408 | { |
4176 | struct swevent_hlist *hlist; | 4409 | struct swevent_hlist *hlist; |
4177 | u32 event_id = event->attr.config; | 4410 | u32 event_id = event->attr.config; |
@@ -4182,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
4182 | * and release. Which makes the protected version suitable here. | 4415 | * and release. Which makes the protected version suitable here. |
4183 | * The context lock guarantees that. | 4416 | * The context lock guarantees that. |
4184 | */ | 4417 | */ |
4185 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 4418 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
4186 | lockdep_is_held(&event->ctx->lock)); | 4419 | lockdep_is_held(&event->ctx->lock)); |
4187 | if (!hlist) | 4420 | if (!hlist) |
4188 | return NULL; | 4421 | return NULL; |
@@ -4195,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
4195 | struct perf_sample_data *data, | 4428 | struct perf_sample_data *data, |
4196 | struct pt_regs *regs) | 4429 | struct pt_regs *regs) |
4197 | { | 4430 | { |
4198 | struct perf_cpu_context *cpuctx; | 4431 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4199 | struct perf_event *event; | 4432 | struct perf_event *event; |
4200 | struct hlist_node *node; | 4433 | struct hlist_node *node; |
4201 | struct hlist_head *head; | 4434 | struct hlist_head *head; |
4202 | 4435 | ||
4203 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4204 | |||
4205 | rcu_read_lock(); | 4436 | rcu_read_lock(); |
4206 | 4437 | head = find_swevent_head_rcu(swhash, type, event_id); | |
4207 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
4208 | |||
4209 | if (!head) | 4438 | if (!head) |
4210 | goto end; | 4439 | goto end; |
4211 | 4440 | ||
4212 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4441 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4213 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4442 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4214 | perf_swevent_add(event, nr, nmi, data, regs); | 4443 | perf_swevent_event(event, nr, nmi, data, regs); |
4215 | } | 4444 | } |
4216 | end: | 4445 | end: |
4217 | rcu_read_unlock(); | 4446 | rcu_read_unlock(); |
@@ -4219,33 +4448,17 @@ end: | |||
4219 | 4448 | ||
4220 | int perf_swevent_get_recursion_context(void) | 4449 | int perf_swevent_get_recursion_context(void) |
4221 | { | 4450 | { |
4222 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4451 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4223 | int rctx; | ||
4224 | |||
4225 | if (in_nmi()) | ||
4226 | rctx = 3; | ||
4227 | else if (in_irq()) | ||
4228 | rctx = 2; | ||
4229 | else if (in_softirq()) | ||
4230 | rctx = 1; | ||
4231 | else | ||
4232 | rctx = 0; | ||
4233 | |||
4234 | if (cpuctx->recursion[rctx]) | ||
4235 | return -1; | ||
4236 | |||
4237 | cpuctx->recursion[rctx]++; | ||
4238 | barrier(); | ||
4239 | 4452 | ||
4240 | return rctx; | 4453 | return get_recursion_context(swhash->recursion); |
4241 | } | 4454 | } |
4242 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4455 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4243 | 4456 | ||
4244 | void inline perf_swevent_put_recursion_context(int rctx) | 4457 | void inline perf_swevent_put_recursion_context(int rctx) |
4245 | { | 4458 | { |
4246 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4459 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4247 | barrier(); | 4460 | |
4248 | cpuctx->recursion[rctx]--; | 4461 | put_recursion_context(swhash->recursion, rctx); |
4249 | } | 4462 | } |
4250 | 4463 | ||
4251 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4464 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
@@ -4271,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
4271 | { | 4484 | { |
4272 | } | 4485 | } |
4273 | 4486 | ||
4274 | static int perf_swevent_enable(struct perf_event *event) | 4487 | static int perf_swevent_add(struct perf_event *event, int flags) |
4275 | { | 4488 | { |
4489 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
4276 | struct hw_perf_event *hwc = &event->hw; | 4490 | struct hw_perf_event *hwc = &event->hw; |
4277 | struct perf_cpu_context *cpuctx; | ||
4278 | struct hlist_head *head; | 4491 | struct hlist_head *head; |
4279 | 4492 | ||
4280 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4281 | |||
4282 | if (hwc->sample_period) { | 4493 | if (hwc->sample_period) { |
4283 | hwc->last_period = hwc->sample_period; | 4494 | hwc->last_period = hwc->sample_period; |
4284 | perf_swevent_set_period(event); | 4495 | perf_swevent_set_period(event); |
4285 | } | 4496 | } |
4286 | 4497 | ||
4287 | head = find_swevent_head(cpuctx, event); | 4498 | hwc->state = !(flags & PERF_EF_START); |
4499 | |||
4500 | head = find_swevent_head(swhash, event); | ||
4288 | if (WARN_ON_ONCE(!head)) | 4501 | if (WARN_ON_ONCE(!head)) |
4289 | return -EINVAL; | 4502 | return -EINVAL; |
4290 | 4503 | ||
@@ -4293,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event) | |||
4293 | return 0; | 4506 | return 0; |
4294 | } | 4507 | } |
4295 | 4508 | ||
4296 | static void perf_swevent_disable(struct perf_event *event) | 4509 | static void perf_swevent_del(struct perf_event *event, int flags) |
4297 | { | 4510 | { |
4298 | hlist_del_rcu(&event->hlist_entry); | 4511 | hlist_del_rcu(&event->hlist_entry); |
4299 | } | 4512 | } |
4300 | 4513 | ||
4301 | static void perf_swevent_void(struct perf_event *event) | 4514 | static void perf_swevent_start(struct perf_event *event, int flags) |
4302 | { | 4515 | { |
4516 | event->hw.state = 0; | ||
4303 | } | 4517 | } |
4304 | 4518 | ||
4305 | static int perf_swevent_int(struct perf_event *event) | 4519 | static void perf_swevent_stop(struct perf_event *event, int flags) |
4306 | { | 4520 | { |
4307 | return 0; | 4521 | event->hw.state = PERF_HES_STOPPED; |
4308 | } | 4522 | } |
4309 | 4523 | ||
4310 | static const struct pmu perf_ops_generic = { | ||
4311 | .enable = perf_swevent_enable, | ||
4312 | .disable = perf_swevent_disable, | ||
4313 | .start = perf_swevent_int, | ||
4314 | .stop = perf_swevent_void, | ||
4315 | .read = perf_swevent_read, | ||
4316 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
4317 | }; | ||
4318 | |||
4319 | /* | ||
4320 | * hrtimer based swevent callback | ||
4321 | */ | ||
4322 | |||
4323 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4324 | { | ||
4325 | enum hrtimer_restart ret = HRTIMER_RESTART; | ||
4326 | struct perf_sample_data data; | ||
4327 | struct pt_regs *regs; | ||
4328 | struct perf_event *event; | ||
4329 | u64 period; | ||
4330 | |||
4331 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
4332 | event->pmu->read(event); | ||
4333 | |||
4334 | perf_sample_data_init(&data, 0); | ||
4335 | data.period = event->hw.last_period; | ||
4336 | regs = get_irq_regs(); | ||
4337 | |||
4338 | if (regs && !perf_exclude_event(event, regs)) { | ||
4339 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4340 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4341 | ret = HRTIMER_NORESTART; | ||
4342 | } | ||
4343 | |||
4344 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4345 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4346 | |||
4347 | return ret; | ||
4348 | } | ||
4349 | |||
4350 | static void perf_swevent_start_hrtimer(struct perf_event *event) | ||
4351 | { | ||
4352 | struct hw_perf_event *hwc = &event->hw; | ||
4353 | |||
4354 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4355 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4356 | if (hwc->sample_period) { | ||
4357 | u64 period; | ||
4358 | |||
4359 | if (hwc->remaining) { | ||
4360 | if (hwc->remaining < 0) | ||
4361 | period = 10000; | ||
4362 | else | ||
4363 | period = hwc->remaining; | ||
4364 | hwc->remaining = 0; | ||
4365 | } else { | ||
4366 | period = max_t(u64, 10000, hwc->sample_period); | ||
4367 | } | ||
4368 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4369 | ns_to_ktime(period), 0, | ||
4370 | HRTIMER_MODE_REL, 0); | ||
4371 | } | ||
4372 | } | ||
4373 | |||
4374 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4375 | { | ||
4376 | struct hw_perf_event *hwc = &event->hw; | ||
4377 | |||
4378 | if (hwc->sample_period) { | ||
4379 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4380 | hwc->remaining = ktime_to_ns(remaining); | ||
4381 | |||
4382 | hrtimer_cancel(&hwc->hrtimer); | ||
4383 | } | ||
4384 | } | ||
4385 | |||
4386 | /* | ||
4387 | * Software event: cpu wall time clock | ||
4388 | */ | ||
4389 | |||
4390 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
4391 | { | ||
4392 | int cpu = raw_smp_processor_id(); | ||
4393 | s64 prev; | ||
4394 | u64 now; | ||
4395 | |||
4396 | now = cpu_clock(cpu); | ||
4397 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4398 | local64_add(now - prev, &event->count); | ||
4399 | } | ||
4400 | |||
4401 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
4402 | { | ||
4403 | struct hw_perf_event *hwc = &event->hw; | ||
4404 | int cpu = raw_smp_processor_id(); | ||
4405 | |||
4406 | local64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
4407 | perf_swevent_start_hrtimer(event); | ||
4408 | |||
4409 | return 0; | ||
4410 | } | ||
4411 | |||
4412 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
4413 | { | ||
4414 | perf_swevent_cancel_hrtimer(event); | ||
4415 | cpu_clock_perf_event_update(event); | ||
4416 | } | ||
4417 | |||
4418 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
4419 | { | ||
4420 | cpu_clock_perf_event_update(event); | ||
4421 | } | ||
4422 | |||
4423 | static const struct pmu perf_ops_cpu_clock = { | ||
4424 | .enable = cpu_clock_perf_event_enable, | ||
4425 | .disable = cpu_clock_perf_event_disable, | ||
4426 | .read = cpu_clock_perf_event_read, | ||
4427 | }; | ||
4428 | |||
4429 | /* | ||
4430 | * Software event: task time clock | ||
4431 | */ | ||
4432 | |||
4433 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
4434 | { | ||
4435 | u64 prev; | ||
4436 | s64 delta; | ||
4437 | |||
4438 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4439 | delta = now - prev; | ||
4440 | local64_add(delta, &event->count); | ||
4441 | } | ||
4442 | |||
4443 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
4444 | { | ||
4445 | struct hw_perf_event *hwc = &event->hw; | ||
4446 | u64 now; | ||
4447 | |||
4448 | now = event->ctx->time; | ||
4449 | |||
4450 | local64_set(&hwc->prev_count, now); | ||
4451 | |||
4452 | perf_swevent_start_hrtimer(event); | ||
4453 | |||
4454 | return 0; | ||
4455 | } | ||
4456 | |||
4457 | static void task_clock_perf_event_disable(struct perf_event *event) | ||
4458 | { | ||
4459 | perf_swevent_cancel_hrtimer(event); | ||
4460 | task_clock_perf_event_update(event, event->ctx->time); | ||
4461 | |||
4462 | } | ||
4463 | |||
4464 | static void task_clock_perf_event_read(struct perf_event *event) | ||
4465 | { | ||
4466 | u64 time; | ||
4467 | |||
4468 | if (!in_nmi()) { | ||
4469 | update_context_time(event->ctx); | ||
4470 | time = event->ctx->time; | ||
4471 | } else { | ||
4472 | u64 now = perf_clock(); | ||
4473 | u64 delta = now - event->ctx->timestamp; | ||
4474 | time = event->ctx->time + delta; | ||
4475 | } | ||
4476 | |||
4477 | task_clock_perf_event_update(event, time); | ||
4478 | } | ||
4479 | |||
4480 | static const struct pmu perf_ops_task_clock = { | ||
4481 | .enable = task_clock_perf_event_enable, | ||
4482 | .disable = task_clock_perf_event_disable, | ||
4483 | .read = task_clock_perf_event_read, | ||
4484 | }; | ||
4485 | |||
4486 | /* Deref the hlist from the update side */ | 4524 | /* Deref the hlist from the update side */ |
4487 | static inline struct swevent_hlist * | 4525 | static inline struct swevent_hlist * |
4488 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 4526 | swevent_hlist_deref(struct swevent_htable *swhash) |
4489 | { | 4527 | { |
4490 | return rcu_dereference_protected(cpuctx->swevent_hlist, | 4528 | return rcu_dereference_protected(swhash->swevent_hlist, |
4491 | lockdep_is_held(&cpuctx->hlist_mutex)); | 4529 | lockdep_is_held(&swhash->hlist_mutex)); |
4492 | } | 4530 | } |
4493 | 4531 | ||
4494 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4532 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
@@ -4499,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
4499 | kfree(hlist); | 4537 | kfree(hlist); |
4500 | } | 4538 | } |
4501 | 4539 | ||
4502 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4540 | static void swevent_hlist_release(struct swevent_htable *swhash) |
4503 | { | 4541 | { |
4504 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 4542 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
4505 | 4543 | ||
4506 | if (!hlist) | 4544 | if (!hlist) |
4507 | return; | 4545 | return; |
4508 | 4546 | ||
4509 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4547 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
4510 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4548 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
4511 | } | 4549 | } |
4512 | 4550 | ||
4513 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4551 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4514 | { | 4552 | { |
4515 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4553 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4516 | 4554 | ||
4517 | mutex_lock(&cpuctx->hlist_mutex); | 4555 | mutex_lock(&swhash->hlist_mutex); |
4518 | 4556 | ||
4519 | if (!--cpuctx->hlist_refcount) | 4557 | if (!--swhash->hlist_refcount) |
4520 | swevent_hlist_release(cpuctx); | 4558 | swevent_hlist_release(swhash); |
4521 | 4559 | ||
4522 | mutex_unlock(&cpuctx->hlist_mutex); | 4560 | mutex_unlock(&swhash->hlist_mutex); |
4523 | } | 4561 | } |
4524 | 4562 | ||
4525 | static void swevent_hlist_put(struct perf_event *event) | 4563 | static void swevent_hlist_put(struct perf_event *event) |
@@ -4537,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
4537 | 4575 | ||
4538 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 4576 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4539 | { | 4577 | { |
4540 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4578 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4541 | int err = 0; | 4579 | int err = 0; |
4542 | 4580 | ||
4543 | mutex_lock(&cpuctx->hlist_mutex); | 4581 | mutex_lock(&swhash->hlist_mutex); |
4544 | 4582 | ||
4545 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 4583 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
4546 | struct swevent_hlist *hlist; | 4584 | struct swevent_hlist *hlist; |
4547 | 4585 | ||
4548 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4586 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
@@ -4550,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
4550 | err = -ENOMEM; | 4588 | err = -ENOMEM; |
4551 | goto exit; | 4589 | goto exit; |
4552 | } | 4590 | } |
4553 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 4591 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
4554 | } | 4592 | } |
4555 | cpuctx->hlist_refcount++; | 4593 | swhash->hlist_refcount++; |
4556 | exit: | 4594 | exit: |
4557 | mutex_unlock(&cpuctx->hlist_mutex); | 4595 | mutex_unlock(&swhash->hlist_mutex); |
4558 | 4596 | ||
4559 | return err; | 4597 | return err; |
4560 | } | 4598 | } |
@@ -4578,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4578 | put_online_cpus(); | 4616 | put_online_cpus(); |
4579 | 4617 | ||
4580 | return 0; | 4618 | return 0; |
4581 | fail: | 4619 | fail: |
4582 | for_each_possible_cpu(cpu) { | 4620 | for_each_possible_cpu(cpu) { |
4583 | if (cpu == failed_cpu) | 4621 | if (cpu == failed_cpu) |
4584 | break; | 4622 | break; |
@@ -4589,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4589 | return err; | 4627 | return err; |
4590 | } | 4628 | } |
4591 | 4629 | ||
4592 | #ifdef CONFIG_EVENT_TRACING | 4630 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4631 | |||
4632 | static void sw_perf_event_destroy(struct perf_event *event) | ||
4633 | { | ||
4634 | u64 event_id = event->attr.config; | ||
4635 | |||
4636 | WARN_ON(event->parent); | ||
4637 | |||
4638 | jump_label_dec(&perf_swevent_enabled[event_id]); | ||
4639 | swevent_hlist_put(event); | ||
4640 | } | ||
4641 | |||
4642 | static int perf_swevent_init(struct perf_event *event) | ||
4643 | { | ||
4644 | int event_id = event->attr.config; | ||
4645 | |||
4646 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4647 | return -ENOENT; | ||
4648 | |||
4649 | switch (event_id) { | ||
4650 | case PERF_COUNT_SW_CPU_CLOCK: | ||
4651 | case PERF_COUNT_SW_TASK_CLOCK: | ||
4652 | return -ENOENT; | ||
4653 | |||
4654 | default: | ||
4655 | break; | ||
4656 | } | ||
4657 | |||
4658 | if (event_id > PERF_COUNT_SW_MAX) | ||
4659 | return -ENOENT; | ||
4660 | |||
4661 | if (!event->parent) { | ||
4662 | int err; | ||
4593 | 4663 | ||
4594 | static const struct pmu perf_ops_tracepoint = { | 4664 | err = swevent_hlist_get(event); |
4595 | .enable = perf_trace_enable, | 4665 | if (err) |
4596 | .disable = perf_trace_disable, | 4666 | return err; |
4597 | .start = perf_swevent_int, | 4667 | |
4598 | .stop = perf_swevent_void, | 4668 | jump_label_inc(&perf_swevent_enabled[event_id]); |
4669 | event->destroy = sw_perf_event_destroy; | ||
4670 | } | ||
4671 | |||
4672 | return 0; | ||
4673 | } | ||
4674 | |||
4675 | static struct pmu perf_swevent = { | ||
4676 | .task_ctx_nr = perf_sw_context, | ||
4677 | |||
4678 | .event_init = perf_swevent_init, | ||
4679 | .add = perf_swevent_add, | ||
4680 | .del = perf_swevent_del, | ||
4681 | .start = perf_swevent_start, | ||
4682 | .stop = perf_swevent_stop, | ||
4599 | .read = perf_swevent_read, | 4683 | .read = perf_swevent_read, |
4600 | .unthrottle = perf_swevent_void, | ||
4601 | }; | 4684 | }; |
4602 | 4685 | ||
4686 | #ifdef CONFIG_EVENT_TRACING | ||
4687 | |||
4603 | static int perf_tp_filter_match(struct perf_event *event, | 4688 | static int perf_tp_filter_match(struct perf_event *event, |
4604 | struct perf_sample_data *data) | 4689 | struct perf_sample_data *data) |
4605 | { | 4690 | { |
@@ -4643,7 +4728,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4643 | 4728 | ||
4644 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4729 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4645 | if (perf_tp_event_match(event, &data, regs)) | 4730 | if (perf_tp_event_match(event, &data, regs)) |
4646 | perf_swevent_add(event, count, 1, &data, regs); | 4731 | perf_swevent_event(event, count, 1, &data, regs); |
4647 | } | 4732 | } |
4648 | 4733 | ||
4649 | perf_swevent_put_recursion_context(rctx); | 4734 | perf_swevent_put_recursion_context(rctx); |
@@ -4655,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
4655 | perf_trace_destroy(event); | 4740 | perf_trace_destroy(event); |
4656 | } | 4741 | } |
4657 | 4742 | ||
4658 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4743 | static int perf_tp_event_init(struct perf_event *event) |
4659 | { | 4744 | { |
4660 | int err; | 4745 | int err; |
4661 | 4746 | ||
4747 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
4748 | return -ENOENT; | ||
4749 | |||
4662 | /* | 4750 | /* |
4663 | * Raw tracepoint data is a severe data leak, only allow root to | 4751 | * Raw tracepoint data is a severe data leak, only allow root to |
4664 | * have these. | 4752 | * have these. |
@@ -4666,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
4666 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | 4754 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
4667 | perf_paranoid_tracepoint_raw() && | 4755 | perf_paranoid_tracepoint_raw() && |
4668 | !capable(CAP_SYS_ADMIN)) | 4756 | !capable(CAP_SYS_ADMIN)) |
4669 | return ERR_PTR(-EPERM); | 4757 | return -EPERM; |
4670 | 4758 | ||
4671 | err = perf_trace_init(event); | 4759 | err = perf_trace_init(event); |
4672 | if (err) | 4760 | if (err) |
4673 | return NULL; | 4761 | return err; |
4674 | 4762 | ||
4675 | event->destroy = tp_perf_event_destroy; | 4763 | event->destroy = tp_perf_event_destroy; |
4676 | 4764 | ||
4677 | return &perf_ops_tracepoint; | 4765 | return 0; |
4766 | } | ||
4767 | |||
4768 | static struct pmu perf_tracepoint = { | ||
4769 | .task_ctx_nr = perf_sw_context, | ||
4770 | |||
4771 | .event_init = perf_tp_event_init, | ||
4772 | .add = perf_trace_add, | ||
4773 | .del = perf_trace_del, | ||
4774 | .start = perf_swevent_start, | ||
4775 | .stop = perf_swevent_stop, | ||
4776 | .read = perf_swevent_read, | ||
4777 | }; | ||
4778 | |||
4779 | static inline void perf_tp_register(void) | ||
4780 | { | ||
4781 | perf_pmu_register(&perf_tracepoint); | ||
4678 | } | 4782 | } |
4679 | 4783 | ||
4680 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4784 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4702,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4702 | 4806 | ||
4703 | #else | 4807 | #else |
4704 | 4808 | ||
4705 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4809 | static inline void perf_tp_register(void) |
4706 | { | 4810 | { |
4707 | return NULL; | ||
4708 | } | 4811 | } |
4709 | 4812 | ||
4710 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4813 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4719,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4719 | #endif /* CONFIG_EVENT_TRACING */ | 4822 | #endif /* CONFIG_EVENT_TRACING */ |
4720 | 4823 | ||
4721 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4824 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4722 | static void bp_perf_event_destroy(struct perf_event *event) | 4825 | void perf_bp_event(struct perf_event *bp, void *data) |
4723 | { | 4826 | { |
4724 | release_bp_slot(event); | 4827 | struct perf_sample_data sample; |
4828 | struct pt_regs *regs = data; | ||
4829 | |||
4830 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
4831 | |||
4832 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
4833 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
4725 | } | 4834 | } |
4835 | #endif | ||
4726 | 4836 | ||
4727 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4837 | /* |
4838 | * hrtimer based swevent callback | ||
4839 | */ | ||
4840 | |||
4841 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4728 | { | 4842 | { |
4729 | int err; | 4843 | enum hrtimer_restart ret = HRTIMER_RESTART; |
4844 | struct perf_sample_data data; | ||
4845 | struct pt_regs *regs; | ||
4846 | struct perf_event *event; | ||
4847 | u64 period; | ||
4730 | 4848 | ||
4731 | err = register_perf_hw_breakpoint(bp); | 4849 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4732 | if (err) | 4850 | event->pmu->read(event); |
4733 | return ERR_PTR(err); | 4851 | |
4852 | perf_sample_data_init(&data, 0); | ||
4853 | data.period = event->hw.last_period; | ||
4854 | regs = get_irq_regs(); | ||
4855 | |||
4856 | if (regs && !perf_exclude_event(event, regs)) { | ||
4857 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4858 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4859 | ret = HRTIMER_NORESTART; | ||
4860 | } | ||
4734 | 4861 | ||
4735 | bp->destroy = bp_perf_event_destroy; | 4862 | period = max_t(u64, 10000, event->hw.sample_period); |
4863 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4736 | 4864 | ||
4737 | return &perf_ops_bp; | 4865 | return ret; |
4738 | } | 4866 | } |
4739 | 4867 | ||
4740 | void perf_bp_event(struct perf_event *bp, void *data) | 4868 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4741 | { | 4869 | { |
4742 | struct perf_sample_data sample; | 4870 | struct hw_perf_event *hwc = &event->hw; |
4743 | struct pt_regs *regs = data; | ||
4744 | 4871 | ||
4745 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 4872 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4873 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4874 | if (hwc->sample_period) { | ||
4875 | s64 period = local64_read(&hwc->period_left); | ||
4876 | |||
4877 | if (period) { | ||
4878 | if (period < 0) | ||
4879 | period = 10000; | ||
4880 | |||
4881 | local64_set(&hwc->period_left, 0); | ||
4882 | } else { | ||
4883 | period = max_t(u64, 10000, hwc->sample_period); | ||
4884 | } | ||
4885 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4886 | ns_to_ktime(period), 0, | ||
4887 | HRTIMER_MODE_REL_PINNED, 0); | ||
4888 | } | ||
4889 | } | ||
4890 | |||
4891 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4892 | { | ||
4893 | struct hw_perf_event *hwc = &event->hw; | ||
4746 | 4894 | ||
4747 | if (!perf_exclude_event(bp, regs)) | 4895 | if (hwc->sample_period) { |
4748 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4896 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
4897 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
4898 | |||
4899 | hrtimer_cancel(&hwc->hrtimer); | ||
4900 | } | ||
4749 | } | 4901 | } |
4750 | #else | 4902 | |
4751 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4903 | /* |
4904 | * Software event: cpu wall time clock | ||
4905 | */ | ||
4906 | |||
4907 | static void cpu_clock_event_update(struct perf_event *event) | ||
4752 | { | 4908 | { |
4753 | return NULL; | 4909 | s64 prev; |
4910 | u64 now; | ||
4911 | |||
4912 | now = local_clock(); | ||
4913 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4914 | local64_add(now - prev, &event->count); | ||
4754 | } | 4915 | } |
4755 | 4916 | ||
4756 | void perf_bp_event(struct perf_event *bp, void *regs) | 4917 | static void cpu_clock_event_start(struct perf_event *event, int flags) |
4757 | { | 4918 | { |
4919 | local64_set(&event->hw.prev_count, local_clock()); | ||
4920 | perf_swevent_start_hrtimer(event); | ||
4758 | } | 4921 | } |
4759 | #endif | ||
4760 | 4922 | ||
4761 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 4923 | static void cpu_clock_event_stop(struct perf_event *event, int flags) |
4924 | { | ||
4925 | perf_swevent_cancel_hrtimer(event); | ||
4926 | cpu_clock_event_update(event); | ||
4927 | } | ||
4762 | 4928 | ||
4763 | static void sw_perf_event_destroy(struct perf_event *event) | 4929 | static int cpu_clock_event_add(struct perf_event *event, int flags) |
4764 | { | 4930 | { |
4765 | u64 event_id = event->attr.config; | 4931 | if (flags & PERF_EF_START) |
4932 | cpu_clock_event_start(event, flags); | ||
4766 | 4933 | ||
4767 | WARN_ON(event->parent); | 4934 | return 0; |
4935 | } | ||
4768 | 4936 | ||
4769 | atomic_dec(&perf_swevent_enabled[event_id]); | 4937 | static void cpu_clock_event_del(struct perf_event *event, int flags) |
4770 | swevent_hlist_put(event); | 4938 | { |
4939 | cpu_clock_event_stop(event, flags); | ||
4771 | } | 4940 | } |
4772 | 4941 | ||
4773 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 4942 | static void cpu_clock_event_read(struct perf_event *event) |
4774 | { | 4943 | { |
4775 | const struct pmu *pmu = NULL; | 4944 | cpu_clock_event_update(event); |
4776 | u64 event_id = event->attr.config; | 4945 | } |
4946 | |||
4947 | static int cpu_clock_event_init(struct perf_event *event) | ||
4948 | { | ||
4949 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4950 | return -ENOENT; | ||
4951 | |||
4952 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
4953 | return -ENOENT; | ||
4954 | |||
4955 | return 0; | ||
4956 | } | ||
4957 | |||
4958 | static struct pmu perf_cpu_clock = { | ||
4959 | .task_ctx_nr = perf_sw_context, | ||
4960 | |||
4961 | .event_init = cpu_clock_event_init, | ||
4962 | .add = cpu_clock_event_add, | ||
4963 | .del = cpu_clock_event_del, | ||
4964 | .start = cpu_clock_event_start, | ||
4965 | .stop = cpu_clock_event_stop, | ||
4966 | .read = cpu_clock_event_read, | ||
4967 | }; | ||
4968 | |||
4969 | /* | ||
4970 | * Software event: task time clock | ||
4971 | */ | ||
4972 | |||
4973 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
4974 | { | ||
4975 | u64 prev; | ||
4976 | s64 delta; | ||
4977 | |||
4978 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4979 | delta = now - prev; | ||
4980 | local64_add(delta, &event->count); | ||
4981 | } | ||
4982 | |||
4983 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
4984 | { | ||
4985 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
4986 | perf_swevent_start_hrtimer(event); | ||
4987 | } | ||
4988 | |||
4989 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
4990 | { | ||
4991 | perf_swevent_cancel_hrtimer(event); | ||
4992 | task_clock_event_update(event, event->ctx->time); | ||
4993 | } | ||
4994 | |||
4995 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
4996 | { | ||
4997 | if (flags & PERF_EF_START) | ||
4998 | task_clock_event_start(event, flags); | ||
4999 | |||
5000 | return 0; | ||
5001 | } | ||
5002 | |||
5003 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
5004 | { | ||
5005 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
5006 | } | ||
5007 | |||
5008 | static void task_clock_event_read(struct perf_event *event) | ||
5009 | { | ||
5010 | u64 time; | ||
5011 | |||
5012 | if (!in_nmi()) { | ||
5013 | update_context_time(event->ctx); | ||
5014 | time = event->ctx->time; | ||
5015 | } else { | ||
5016 | u64 now = perf_clock(); | ||
5017 | u64 delta = now - event->ctx->timestamp; | ||
5018 | time = event->ctx->time + delta; | ||
5019 | } | ||
5020 | |||
5021 | task_clock_event_update(event, time); | ||
5022 | } | ||
5023 | |||
5024 | static int task_clock_event_init(struct perf_event *event) | ||
5025 | { | ||
5026 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5027 | return -ENOENT; | ||
5028 | |||
5029 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
5030 | return -ENOENT; | ||
5031 | |||
5032 | return 0; | ||
5033 | } | ||
5034 | |||
5035 | static struct pmu perf_task_clock = { | ||
5036 | .task_ctx_nr = perf_sw_context, | ||
5037 | |||
5038 | .event_init = task_clock_event_init, | ||
5039 | .add = task_clock_event_add, | ||
5040 | .del = task_clock_event_del, | ||
5041 | .start = task_clock_event_start, | ||
5042 | .stop = task_clock_event_stop, | ||
5043 | .read = task_clock_event_read, | ||
5044 | }; | ||
5045 | |||
5046 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
5047 | { | ||
5048 | } | ||
5049 | |||
5050 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
5051 | { | ||
5052 | return 0; | ||
5053 | } | ||
5054 | |||
5055 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
5056 | { | ||
5057 | perf_pmu_disable(pmu); | ||
5058 | } | ||
5059 | |||
5060 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
5061 | { | ||
5062 | perf_pmu_enable(pmu); | ||
5063 | return 0; | ||
5064 | } | ||
5065 | |||
5066 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
5067 | { | ||
5068 | perf_pmu_enable(pmu); | ||
5069 | } | ||
5070 | |||
5071 | /* | ||
5072 | * Ensures all contexts with the same task_ctx_nr have the same | ||
5073 | * pmu_cpu_context too. | ||
5074 | */ | ||
5075 | static void *find_pmu_context(int ctxn) | ||
5076 | { | ||
5077 | struct pmu *pmu; | ||
5078 | |||
5079 | if (ctxn < 0) | ||
5080 | return NULL; | ||
4777 | 5081 | ||
5082 | list_for_each_entry(pmu, &pmus, entry) { | ||
5083 | if (pmu->task_ctx_nr == ctxn) | ||
5084 | return pmu->pmu_cpu_context; | ||
5085 | } | ||
5086 | |||
5087 | return NULL; | ||
5088 | } | ||
5089 | |||
5090 | static void free_pmu_context(void * __percpu cpu_context) | ||
5091 | { | ||
5092 | struct pmu *pmu; | ||
5093 | |||
5094 | mutex_lock(&pmus_lock); | ||
4778 | /* | 5095 | /* |
4779 | * Software events (currently) can't in general distinguish | 5096 | * Like a real lame refcount. |
4780 | * between user, kernel and hypervisor events. | ||
4781 | * However, context switches and cpu migrations are considered | ||
4782 | * to be kernel events, and page faults are never hypervisor | ||
4783 | * events. | ||
4784 | */ | 5097 | */ |
4785 | switch (event_id) { | 5098 | list_for_each_entry(pmu, &pmus, entry) { |
4786 | case PERF_COUNT_SW_CPU_CLOCK: | 5099 | if (pmu->pmu_cpu_context == cpu_context) |
4787 | pmu = &perf_ops_cpu_clock; | 5100 | goto out; |
5101 | } | ||
4788 | 5102 | ||
4789 | break; | 5103 | free_percpu(cpu_context); |
4790 | case PERF_COUNT_SW_TASK_CLOCK: | 5104 | out: |
4791 | /* | 5105 | mutex_unlock(&pmus_lock); |
4792 | * If the user instantiates this as a per-cpu event, | 5106 | } |
4793 | * use the cpu_clock event instead. | ||
4794 | */ | ||
4795 | if (event->ctx->task) | ||
4796 | pmu = &perf_ops_task_clock; | ||
4797 | else | ||
4798 | pmu = &perf_ops_cpu_clock; | ||
4799 | 5107 | ||
4800 | break; | 5108 | int perf_pmu_register(struct pmu *pmu) |
4801 | case PERF_COUNT_SW_PAGE_FAULTS: | 5109 | { |
4802 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 5110 | int cpu, ret; |
4803 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | ||
4804 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | ||
4805 | case PERF_COUNT_SW_CPU_MIGRATIONS: | ||
4806 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | ||
4807 | case PERF_COUNT_SW_EMULATION_FAULTS: | ||
4808 | if (!event->parent) { | ||
4809 | int err; | ||
4810 | |||
4811 | err = swevent_hlist_get(event); | ||
4812 | if (err) | ||
4813 | return ERR_PTR(err); | ||
4814 | 5111 | ||
4815 | atomic_inc(&perf_swevent_enabled[event_id]); | 5112 | mutex_lock(&pmus_lock); |
4816 | event->destroy = sw_perf_event_destroy; | 5113 | ret = -ENOMEM; |
5114 | pmu->pmu_disable_count = alloc_percpu(int); | ||
5115 | if (!pmu->pmu_disable_count) | ||
5116 | goto unlock; | ||
5117 | |||
5118 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | ||
5119 | if (pmu->pmu_cpu_context) | ||
5120 | goto got_cpu_context; | ||
5121 | |||
5122 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
5123 | if (!pmu->pmu_cpu_context) | ||
5124 | goto free_pdc; | ||
5125 | |||
5126 | for_each_possible_cpu(cpu) { | ||
5127 | struct perf_cpu_context *cpuctx; | ||
5128 | |||
5129 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5130 | __perf_event_init_context(&cpuctx->ctx); | ||
5131 | cpuctx->ctx.type = cpu_context; | ||
5132 | cpuctx->ctx.pmu = pmu; | ||
5133 | cpuctx->jiffies_interval = 1; | ||
5134 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
5135 | } | ||
5136 | |||
5137 | got_cpu_context: | ||
5138 | if (!pmu->start_txn) { | ||
5139 | if (pmu->pmu_enable) { | ||
5140 | /* | ||
5141 | * If we have pmu_enable/pmu_disable calls, install | ||
5142 | * transaction stubs that use that to try and batch | ||
5143 | * hardware accesses. | ||
5144 | */ | ||
5145 | pmu->start_txn = perf_pmu_start_txn; | ||
5146 | pmu->commit_txn = perf_pmu_commit_txn; | ||
5147 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
5148 | } else { | ||
5149 | pmu->start_txn = perf_pmu_nop_void; | ||
5150 | pmu->commit_txn = perf_pmu_nop_int; | ||
5151 | pmu->cancel_txn = perf_pmu_nop_void; | ||
5152 | } | ||
5153 | } | ||
5154 | |||
5155 | if (!pmu->pmu_enable) { | ||
5156 | pmu->pmu_enable = perf_pmu_nop_void; | ||
5157 | pmu->pmu_disable = perf_pmu_nop_void; | ||
5158 | } | ||
5159 | |||
5160 | list_add_rcu(&pmu->entry, &pmus); | ||
5161 | ret = 0; | ||
5162 | unlock: | ||
5163 | mutex_unlock(&pmus_lock); | ||
5164 | |||
5165 | return ret; | ||
5166 | |||
5167 | free_pdc: | ||
5168 | free_percpu(pmu->pmu_disable_count); | ||
5169 | goto unlock; | ||
5170 | } | ||
5171 | |||
5172 | void perf_pmu_unregister(struct pmu *pmu) | ||
5173 | { | ||
5174 | mutex_lock(&pmus_lock); | ||
5175 | list_del_rcu(&pmu->entry); | ||
5176 | mutex_unlock(&pmus_lock); | ||
5177 | |||
5178 | /* | ||
5179 | * We dereference the pmu list under both SRCU and regular RCU, so | ||
5180 | * synchronize against both of those. | ||
5181 | */ | ||
5182 | synchronize_srcu(&pmus_srcu); | ||
5183 | synchronize_rcu(); | ||
5184 | |||
5185 | free_percpu(pmu->pmu_disable_count); | ||
5186 | free_pmu_context(pmu->pmu_cpu_context); | ||
5187 | } | ||
5188 | |||
5189 | struct pmu *perf_init_event(struct perf_event *event) | ||
5190 | { | ||
5191 | struct pmu *pmu = NULL; | ||
5192 | int idx; | ||
5193 | |||
5194 | idx = srcu_read_lock(&pmus_srcu); | ||
5195 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
5196 | int ret = pmu->event_init(event); | ||
5197 | if (!ret) | ||
5198 | goto unlock; | ||
5199 | |||
5200 | if (ret != -ENOENT) { | ||
5201 | pmu = ERR_PTR(ret); | ||
5202 | goto unlock; | ||
4817 | } | 5203 | } |
4818 | pmu = &perf_ops_generic; | ||
4819 | break; | ||
4820 | } | 5204 | } |
5205 | pmu = ERR_PTR(-ENOENT); | ||
5206 | unlock: | ||
5207 | srcu_read_unlock(&pmus_srcu, idx); | ||
4821 | 5208 | ||
4822 | return pmu; | 5209 | return pmu; |
4823 | } | 5210 | } |
@@ -4826,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
4826 | * Allocate and initialize a event structure | 5213 | * Allocate and initialize a event structure |
4827 | */ | 5214 | */ |
4828 | static struct perf_event * | 5215 | static struct perf_event * |
4829 | perf_event_alloc(struct perf_event_attr *attr, | 5216 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
4830 | int cpu, | 5217 | struct task_struct *task, |
4831 | struct perf_event_context *ctx, | 5218 | struct perf_event *group_leader, |
4832 | struct perf_event *group_leader, | 5219 | struct perf_event *parent_event, |
4833 | struct perf_event *parent_event, | 5220 | perf_overflow_handler_t overflow_handler) |
4834 | perf_overflow_handler_t overflow_handler, | 5221 | { |
4835 | gfp_t gfpflags) | 5222 | struct pmu *pmu; |
4836 | { | ||
4837 | const struct pmu *pmu; | ||
4838 | struct perf_event *event; | 5223 | struct perf_event *event; |
4839 | struct hw_perf_event *hwc; | 5224 | struct hw_perf_event *hwc; |
4840 | long err; | 5225 | long err; |
4841 | 5226 | ||
4842 | event = kzalloc(sizeof(*event), gfpflags); | 5227 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
4843 | if (!event) | 5228 | if (!event) |
4844 | return ERR_PTR(-ENOMEM); | 5229 | return ERR_PTR(-ENOMEM); |
4845 | 5230 | ||
@@ -4857,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4857 | INIT_LIST_HEAD(&event->event_entry); | 5242 | INIT_LIST_HEAD(&event->event_entry); |
4858 | INIT_LIST_HEAD(&event->sibling_list); | 5243 | INIT_LIST_HEAD(&event->sibling_list); |
4859 | init_waitqueue_head(&event->waitq); | 5244 | init_waitqueue_head(&event->waitq); |
5245 | init_irq_work(&event->pending, perf_pending_event); | ||
4860 | 5246 | ||
4861 | mutex_init(&event->mmap_mutex); | 5247 | mutex_init(&event->mmap_mutex); |
4862 | 5248 | ||
@@ -4864,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4864 | event->attr = *attr; | 5250 | event->attr = *attr; |
4865 | event->group_leader = group_leader; | 5251 | event->group_leader = group_leader; |
4866 | event->pmu = NULL; | 5252 | event->pmu = NULL; |
4867 | event->ctx = ctx; | ||
4868 | event->oncpu = -1; | 5253 | event->oncpu = -1; |
4869 | 5254 | ||
4870 | event->parent = parent_event; | 5255 | event->parent = parent_event; |
@@ -4874,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4874 | 5259 | ||
4875 | event->state = PERF_EVENT_STATE_INACTIVE; | 5260 | event->state = PERF_EVENT_STATE_INACTIVE; |
4876 | 5261 | ||
5262 | if (task) { | ||
5263 | event->attach_state = PERF_ATTACH_TASK; | ||
5264 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
5265 | /* | ||
5266 | * hw_breakpoint is a bit difficult here.. | ||
5267 | */ | ||
5268 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
5269 | event->hw.bp_target = task; | ||
5270 | #endif | ||
5271 | } | ||
5272 | |||
4877 | if (!overflow_handler && parent_event) | 5273 | if (!overflow_handler && parent_event) |
4878 | overflow_handler = parent_event->overflow_handler; | 5274 | overflow_handler = parent_event->overflow_handler; |
4879 | 5275 | ||
@@ -4898,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4898 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 5294 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4899 | goto done; | 5295 | goto done; |
4900 | 5296 | ||
4901 | switch (attr->type) { | 5297 | pmu = perf_init_event(event); |
4902 | case PERF_TYPE_RAW: | ||
4903 | case PERF_TYPE_HARDWARE: | ||
4904 | case PERF_TYPE_HW_CACHE: | ||
4905 | pmu = hw_perf_event_init(event); | ||
4906 | break; | ||
4907 | |||
4908 | case PERF_TYPE_SOFTWARE: | ||
4909 | pmu = sw_perf_event_init(event); | ||
4910 | break; | ||
4911 | |||
4912 | case PERF_TYPE_TRACEPOINT: | ||
4913 | pmu = tp_perf_event_init(event); | ||
4914 | break; | ||
4915 | |||
4916 | case PERF_TYPE_BREAKPOINT: | ||
4917 | pmu = bp_perf_event_init(event); | ||
4918 | break; | ||
4919 | |||
4920 | 5298 | ||
4921 | default: | ||
4922 | break; | ||
4923 | } | ||
4924 | done: | 5299 | done: |
4925 | err = 0; | 5300 | err = 0; |
4926 | if (!pmu) | 5301 | if (!pmu) |
@@ -4938,13 +5313,21 @@ done: | |||
4938 | event->pmu = pmu; | 5313 | event->pmu = pmu; |
4939 | 5314 | ||
4940 | if (!event->parent) { | 5315 | if (!event->parent) { |
4941 | atomic_inc(&nr_events); | 5316 | if (event->attach_state & PERF_ATTACH_TASK) |
5317 | jump_label_inc(&perf_task_events); | ||
4942 | if (event->attr.mmap || event->attr.mmap_data) | 5318 | if (event->attr.mmap || event->attr.mmap_data) |
4943 | atomic_inc(&nr_mmap_events); | 5319 | atomic_inc(&nr_mmap_events); |
4944 | if (event->attr.comm) | 5320 | if (event->attr.comm) |
4945 | atomic_inc(&nr_comm_events); | 5321 | atomic_inc(&nr_comm_events); |
4946 | if (event->attr.task) | 5322 | if (event->attr.task) |
4947 | atomic_inc(&nr_task_events); | 5323 | atomic_inc(&nr_task_events); |
5324 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
5325 | err = get_callchain_buffers(); | ||
5326 | if (err) { | ||
5327 | free_event(event); | ||
5328 | return ERR_PTR(err); | ||
5329 | } | ||
5330 | } | ||
4948 | } | 5331 | } |
4949 | 5332 | ||
4950 | return event; | 5333 | return event; |
@@ -5092,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5092 | struct perf_event_attr __user *, attr_uptr, | 5475 | struct perf_event_attr __user *, attr_uptr, |
5093 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5476 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5094 | { | 5477 | { |
5095 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 5478 | struct perf_event *group_leader = NULL, *output_event = NULL; |
5479 | struct perf_event *event, *sibling; | ||
5096 | struct perf_event_attr attr; | 5480 | struct perf_event_attr attr; |
5097 | struct perf_event_context *ctx; | 5481 | struct perf_event_context *ctx; |
5098 | struct file *event_file = NULL; | 5482 | struct file *event_file = NULL; |
5099 | struct file *group_file = NULL; | 5483 | struct file *group_file = NULL; |
5484 | struct task_struct *task = NULL; | ||
5485 | struct pmu *pmu; | ||
5100 | int event_fd; | 5486 | int event_fd; |
5487 | int move_group = 0; | ||
5101 | int fput_needed = 0; | 5488 | int fput_needed = 0; |
5102 | int err; | 5489 | int err; |
5103 | 5490 | ||
@@ -5123,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5123 | if (event_fd < 0) | 5510 | if (event_fd < 0) |
5124 | return event_fd; | 5511 | return event_fd; |
5125 | 5512 | ||
5126 | /* | ||
5127 | * Get the target context (task or percpu): | ||
5128 | */ | ||
5129 | ctx = find_get_context(pid, cpu); | ||
5130 | if (IS_ERR(ctx)) { | ||
5131 | err = PTR_ERR(ctx); | ||
5132 | goto err_fd; | ||
5133 | } | ||
5134 | |||
5135 | if (group_fd != -1) { | 5513 | if (group_fd != -1) { |
5136 | group_leader = perf_fget_light(group_fd, &fput_needed); | 5514 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5137 | if (IS_ERR(group_leader)) { | 5515 | if (IS_ERR(group_leader)) { |
5138 | err = PTR_ERR(group_leader); | 5516 | err = PTR_ERR(group_leader); |
5139 | goto err_put_context; | 5517 | goto err_fd; |
5140 | } | 5518 | } |
5141 | group_file = group_leader->filp; | 5519 | group_file = group_leader->filp; |
5142 | if (flags & PERF_FLAG_FD_OUTPUT) | 5520 | if (flags & PERF_FLAG_FD_OUTPUT) |
@@ -5145,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5145 | group_leader = NULL; | 5523 | group_leader = NULL; |
5146 | } | 5524 | } |
5147 | 5525 | ||
5526 | if (pid != -1) { | ||
5527 | task = find_lively_task_by_vpid(pid); | ||
5528 | if (IS_ERR(task)) { | ||
5529 | err = PTR_ERR(task); | ||
5530 | goto err_group_fd; | ||
5531 | } | ||
5532 | } | ||
5533 | |||
5534 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
5535 | if (IS_ERR(event)) { | ||
5536 | err = PTR_ERR(event); | ||
5537 | goto err_task; | ||
5538 | } | ||
5539 | |||
5540 | /* | ||
5541 | * Special case software events and allow them to be part of | ||
5542 | * any hardware group. | ||
5543 | */ | ||
5544 | pmu = event->pmu; | ||
5545 | |||
5546 | if (group_leader && | ||
5547 | (is_software_event(event) != is_software_event(group_leader))) { | ||
5548 | if (is_software_event(event)) { | ||
5549 | /* | ||
5550 | * If event and group_leader are not both a software | ||
5551 | * event, and event is, then group leader is not. | ||
5552 | * | ||
5553 | * Allow the addition of software events to !software | ||
5554 | * groups, this is safe because software events never | ||
5555 | * fail to schedule. | ||
5556 | */ | ||
5557 | pmu = group_leader->pmu; | ||
5558 | } else if (is_software_event(group_leader) && | ||
5559 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
5560 | /* | ||
5561 | * In case the group is a pure software group, and we | ||
5562 | * try to add a hardware event, move the whole group to | ||
5563 | * the hardware context. | ||
5564 | */ | ||
5565 | move_group = 1; | ||
5566 | } | ||
5567 | } | ||
5568 | |||
5569 | /* | ||
5570 | * Get the target context (task or percpu): | ||
5571 | */ | ||
5572 | ctx = find_get_context(pmu, task, cpu); | ||
5573 | if (IS_ERR(ctx)) { | ||
5574 | err = PTR_ERR(ctx); | ||
5575 | goto err_alloc; | ||
5576 | } | ||
5577 | |||
5148 | /* | 5578 | /* |
5149 | * Look up the group leader (we will attach this event to it): | 5579 | * Look up the group leader (we will attach this event to it): |
5150 | */ | 5580 | */ |
@@ -5156,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5156 | * becoming part of another group-sibling): | 5586 | * becoming part of another group-sibling): |
5157 | */ | 5587 | */ |
5158 | if (group_leader->group_leader != group_leader) | 5588 | if (group_leader->group_leader != group_leader) |
5159 | goto err_put_context; | 5589 | goto err_context; |
5160 | /* | 5590 | /* |
5161 | * Do not allow to attach to a group in a different | 5591 | * Do not allow to attach to a group in a different |
5162 | * task or CPU context: | 5592 | * task or CPU context: |
5163 | */ | 5593 | */ |
5164 | if (group_leader->ctx != ctx) | 5594 | if (move_group) { |
5165 | goto err_put_context; | 5595 | if (group_leader->ctx->type != ctx->type) |
5596 | goto err_context; | ||
5597 | } else { | ||
5598 | if (group_leader->ctx != ctx) | ||
5599 | goto err_context; | ||
5600 | } | ||
5601 | |||
5166 | /* | 5602 | /* |
5167 | * Only a group leader can be exclusive or pinned | 5603 | * Only a group leader can be exclusive or pinned |
5168 | */ | 5604 | */ |
5169 | if (attr.exclusive || attr.pinned) | 5605 | if (attr.exclusive || attr.pinned) |
5170 | goto err_put_context; | 5606 | goto err_context; |
5171 | } | ||
5172 | |||
5173 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
5174 | NULL, NULL, GFP_KERNEL); | ||
5175 | if (IS_ERR(event)) { | ||
5176 | err = PTR_ERR(event); | ||
5177 | goto err_put_context; | ||
5178 | } | 5607 | } |
5179 | 5608 | ||
5180 | if (output_event) { | 5609 | if (output_event) { |
5181 | err = perf_event_set_output(event, output_event); | 5610 | err = perf_event_set_output(event, output_event); |
5182 | if (err) | 5611 | if (err) |
5183 | goto err_free_put_context; | 5612 | goto err_context; |
5184 | } | 5613 | } |
5185 | 5614 | ||
5186 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5615 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5187 | if (IS_ERR(event_file)) { | 5616 | if (IS_ERR(event_file)) { |
5188 | err = PTR_ERR(event_file); | 5617 | err = PTR_ERR(event_file); |
5189 | goto err_free_put_context; | 5618 | goto err_context; |
5619 | } | ||
5620 | |||
5621 | if (move_group) { | ||
5622 | struct perf_event_context *gctx = group_leader->ctx; | ||
5623 | |||
5624 | mutex_lock(&gctx->mutex); | ||
5625 | perf_event_remove_from_context(group_leader); | ||
5626 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5627 | group_entry) { | ||
5628 | perf_event_remove_from_context(sibling); | ||
5629 | put_ctx(gctx); | ||
5630 | } | ||
5631 | mutex_unlock(&gctx->mutex); | ||
5632 | put_ctx(gctx); | ||
5190 | } | 5633 | } |
5191 | 5634 | ||
5192 | event->filp = event_file; | 5635 | event->filp = event_file; |
5193 | WARN_ON_ONCE(ctx->parent_ctx); | 5636 | WARN_ON_ONCE(ctx->parent_ctx); |
5194 | mutex_lock(&ctx->mutex); | 5637 | mutex_lock(&ctx->mutex); |
5638 | |||
5639 | if (move_group) { | ||
5640 | perf_install_in_context(ctx, group_leader, cpu); | ||
5641 | get_ctx(ctx); | ||
5642 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5643 | group_entry) { | ||
5644 | perf_install_in_context(ctx, sibling, cpu); | ||
5645 | get_ctx(ctx); | ||
5646 | } | ||
5647 | } | ||
5648 | |||
5195 | perf_install_in_context(ctx, event, cpu); | 5649 | perf_install_in_context(ctx, event, cpu); |
5196 | ++ctx->generation; | 5650 | ++ctx->generation; |
5197 | mutex_unlock(&ctx->mutex); | 5651 | mutex_unlock(&ctx->mutex); |
@@ -5212,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5212 | fd_install(event_fd, event_file); | 5666 | fd_install(event_fd, event_file); |
5213 | return event_fd; | 5667 | return event_fd; |
5214 | 5668 | ||
5215 | err_free_put_context: | 5669 | err_context: |
5670 | put_ctx(ctx); | ||
5671 | err_alloc: | ||
5216 | free_event(event); | 5672 | free_event(event); |
5217 | err_put_context: | 5673 | err_task: |
5674 | if (task) | ||
5675 | put_task_struct(task); | ||
5676 | err_group_fd: | ||
5218 | fput_light(group_file, fput_needed); | 5677 | fput_light(group_file, fput_needed); |
5219 | put_ctx(ctx); | ||
5220 | err_fd: | 5678 | err_fd: |
5221 | put_unused_fd(event_fd); | 5679 | put_unused_fd(event_fd); |
5222 | return err; | 5680 | return err; |
@@ -5227,32 +5685,31 @@ err_fd: | |||
5227 | * | 5685 | * |
5228 | * @attr: attributes of the counter to create | 5686 | * @attr: attributes of the counter to create |
5229 | * @cpu: cpu in which the counter is bound | 5687 | * @cpu: cpu in which the counter is bound |
5230 | * @pid: task to profile | 5688 | * @task: task to profile (NULL for percpu) |
5231 | */ | 5689 | */ |
5232 | struct perf_event * | 5690 | struct perf_event * |
5233 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 5691 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5234 | pid_t pid, | 5692 | struct task_struct *task, |
5235 | perf_overflow_handler_t overflow_handler) | 5693 | perf_overflow_handler_t overflow_handler) |
5236 | { | 5694 | { |
5237 | struct perf_event *event; | ||
5238 | struct perf_event_context *ctx; | 5695 | struct perf_event_context *ctx; |
5696 | struct perf_event *event; | ||
5239 | int err; | 5697 | int err; |
5240 | 5698 | ||
5241 | /* | 5699 | /* |
5242 | * Get the target context (task or percpu): | 5700 | * Get the target context (task or percpu): |
5243 | */ | 5701 | */ |
5244 | 5702 | ||
5245 | ctx = find_get_context(pid, cpu); | 5703 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
5246 | if (IS_ERR(ctx)) { | ||
5247 | err = PTR_ERR(ctx); | ||
5248 | goto err_exit; | ||
5249 | } | ||
5250 | |||
5251 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
5252 | NULL, overflow_handler, GFP_KERNEL); | ||
5253 | if (IS_ERR(event)) { | 5704 | if (IS_ERR(event)) { |
5254 | err = PTR_ERR(event); | 5705 | err = PTR_ERR(event); |
5255 | goto err_put_context; | 5706 | goto err; |
5707 | } | ||
5708 | |||
5709 | ctx = find_get_context(event->pmu, task, cpu); | ||
5710 | if (IS_ERR(ctx)) { | ||
5711 | err = PTR_ERR(ctx); | ||
5712 | goto err_free; | ||
5256 | } | 5713 | } |
5257 | 5714 | ||
5258 | event->filp = NULL; | 5715 | event->filp = NULL; |
@@ -5270,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5270 | 5727 | ||
5271 | return event; | 5728 | return event; |
5272 | 5729 | ||
5273 | err_put_context: | 5730 | err_free: |
5274 | put_ctx(ctx); | 5731 | free_event(event); |
5275 | err_exit: | 5732 | err: |
5276 | return ERR_PTR(err); | 5733 | return ERR_PTR(err); |
5277 | } | 5734 | } |
5278 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 5735 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
5279 | 5736 | ||
5280 | /* | ||
5281 | * inherit a event from parent task to child task: | ||
5282 | */ | ||
5283 | static struct perf_event * | ||
5284 | inherit_event(struct perf_event *parent_event, | ||
5285 | struct task_struct *parent, | ||
5286 | struct perf_event_context *parent_ctx, | ||
5287 | struct task_struct *child, | ||
5288 | struct perf_event *group_leader, | ||
5289 | struct perf_event_context *child_ctx) | ||
5290 | { | ||
5291 | struct perf_event *child_event; | ||
5292 | |||
5293 | /* | ||
5294 | * Instead of creating recursive hierarchies of events, | ||
5295 | * we link inherited events back to the original parent, | ||
5296 | * which has a filp for sure, which we use as the reference | ||
5297 | * count: | ||
5298 | */ | ||
5299 | if (parent_event->parent) | ||
5300 | parent_event = parent_event->parent; | ||
5301 | |||
5302 | child_event = perf_event_alloc(&parent_event->attr, | ||
5303 | parent_event->cpu, child_ctx, | ||
5304 | group_leader, parent_event, | ||
5305 | NULL, GFP_KERNEL); | ||
5306 | if (IS_ERR(child_event)) | ||
5307 | return child_event; | ||
5308 | get_ctx(child_ctx); | ||
5309 | |||
5310 | /* | ||
5311 | * Make the child state follow the state of the parent event, | ||
5312 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5313 | * so we won't race with perf_event_{en, dis}able_family. | ||
5314 | */ | ||
5315 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5316 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5317 | else | ||
5318 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5319 | |||
5320 | if (parent_event->attr.freq) { | ||
5321 | u64 sample_period = parent_event->hw.sample_period; | ||
5322 | struct hw_perf_event *hwc = &child_event->hw; | ||
5323 | |||
5324 | hwc->sample_period = sample_period; | ||
5325 | hwc->last_period = sample_period; | ||
5326 | |||
5327 | local64_set(&hwc->period_left, sample_period); | ||
5328 | } | ||
5329 | |||
5330 | child_event->overflow_handler = parent_event->overflow_handler; | ||
5331 | |||
5332 | /* | ||
5333 | * Link it up in the child's context: | ||
5334 | */ | ||
5335 | add_event_to_ctx(child_event, child_ctx); | ||
5336 | |||
5337 | /* | ||
5338 | * Get a reference to the parent filp - we will fput it | ||
5339 | * when the child event exits. This is safe to do because | ||
5340 | * we are in the parent and we know that the filp still | ||
5341 | * exists and has a nonzero count: | ||
5342 | */ | ||
5343 | atomic_long_inc(&parent_event->filp->f_count); | ||
5344 | |||
5345 | /* | ||
5346 | * Link this into the parent event's child list | ||
5347 | */ | ||
5348 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
5349 | mutex_lock(&parent_event->child_mutex); | ||
5350 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
5351 | mutex_unlock(&parent_event->child_mutex); | ||
5352 | |||
5353 | return child_event; | ||
5354 | } | ||
5355 | |||
5356 | static int inherit_group(struct perf_event *parent_event, | ||
5357 | struct task_struct *parent, | ||
5358 | struct perf_event_context *parent_ctx, | ||
5359 | struct task_struct *child, | ||
5360 | struct perf_event_context *child_ctx) | ||
5361 | { | ||
5362 | struct perf_event *leader; | ||
5363 | struct perf_event *sub; | ||
5364 | struct perf_event *child_ctr; | ||
5365 | |||
5366 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
5367 | child, NULL, child_ctx); | ||
5368 | if (IS_ERR(leader)) | ||
5369 | return PTR_ERR(leader); | ||
5370 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
5371 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
5372 | child, leader, child_ctx); | ||
5373 | if (IS_ERR(child_ctr)) | ||
5374 | return PTR_ERR(child_ctr); | ||
5375 | } | ||
5376 | return 0; | ||
5377 | } | ||
5378 | |||
5379 | static void sync_child_event(struct perf_event *child_event, | 5737 | static void sync_child_event(struct perf_event *child_event, |
5380 | struct task_struct *child) | 5738 | struct task_struct *child) |
5381 | { | 5739 | { |
@@ -5432,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
5432 | } | 5790 | } |
5433 | } | 5791 | } |
5434 | 5792 | ||
5435 | /* | 5793 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
5436 | * When a child task exits, feed back event values to parent events. | ||
5437 | */ | ||
5438 | void perf_event_exit_task(struct task_struct *child) | ||
5439 | { | 5794 | { |
5440 | struct perf_event *child_event, *tmp; | 5795 | struct perf_event *child_event, *tmp; |
5441 | struct perf_event_context *child_ctx; | 5796 | struct perf_event_context *child_ctx; |
5442 | unsigned long flags; | 5797 | unsigned long flags; |
5443 | 5798 | ||
5444 | if (likely(!child->perf_event_ctxp)) { | 5799 | if (likely(!child->perf_event_ctxp[ctxn])) { |
5445 | perf_event_task(child, NULL, 0); | 5800 | perf_event_task(child, NULL, 0); |
5446 | return; | 5801 | return; |
5447 | } | 5802 | } |
@@ -5453,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
5453 | * scheduled, so we are now safe from rescheduling changing | 5808 | * scheduled, so we are now safe from rescheduling changing |
5454 | * our context. | 5809 | * our context. |
5455 | */ | 5810 | */ |
5456 | child_ctx = child->perf_event_ctxp; | 5811 | child_ctx = child->perf_event_ctxp[ctxn]; |
5457 | __perf_event_task_sched_out(child_ctx); | 5812 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5458 | 5813 | ||
5459 | /* | 5814 | /* |
5460 | * Take the context lock here so that if find_get_context is | 5815 | * Take the context lock here so that if find_get_context is |
@@ -5462,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
5462 | * incremented the context's refcount before we do put_ctx below. | 5817 | * incremented the context's refcount before we do put_ctx below. |
5463 | */ | 5818 | */ |
5464 | raw_spin_lock(&child_ctx->lock); | 5819 | raw_spin_lock(&child_ctx->lock); |
5465 | child->perf_event_ctxp = NULL; | 5820 | child->perf_event_ctxp[ctxn] = NULL; |
5466 | /* | 5821 | /* |
5467 | * If this context is a clone; unclone it so it can't get | 5822 | * If this context is a clone; unclone it so it can't get |
5468 | * swapped to another process while we're removing all | 5823 | * swapped to another process while we're removing all |
@@ -5515,6 +5870,17 @@ again: | |||
5515 | put_ctx(child_ctx); | 5870 | put_ctx(child_ctx); |
5516 | } | 5871 | } |
5517 | 5872 | ||
5873 | /* | ||
5874 | * When a child task exits, feed back event values to parent events. | ||
5875 | */ | ||
5876 | void perf_event_exit_task(struct task_struct *child) | ||
5877 | { | ||
5878 | int ctxn; | ||
5879 | |||
5880 | for_each_task_context_nr(ctxn) | ||
5881 | perf_event_exit_task_context(child, ctxn); | ||
5882 | } | ||
5883 | |||
5518 | static void perf_free_event(struct perf_event *event, | 5884 | static void perf_free_event(struct perf_event *event, |
5519 | struct perf_event_context *ctx) | 5885 | struct perf_event_context *ctx) |
5520 | { | 5886 | { |
@@ -5536,48 +5902,166 @@ static void perf_free_event(struct perf_event *event, | |||
5536 | 5902 | ||
5537 | /* | 5903 | /* |
5538 | * free an unexposed, unused context as created by inheritance by | 5904 | * free an unexposed, unused context as created by inheritance by |
5539 | * init_task below, used by fork() in case of fail. | 5905 | * perf_event_init_task below, used by fork() in case of fail. |
5540 | */ | 5906 | */ |
5541 | void perf_event_free_task(struct task_struct *task) | 5907 | void perf_event_free_task(struct task_struct *task) |
5542 | { | 5908 | { |
5543 | struct perf_event_context *ctx = task->perf_event_ctxp; | 5909 | struct perf_event_context *ctx; |
5544 | struct perf_event *event, *tmp; | 5910 | struct perf_event *event, *tmp; |
5911 | int ctxn; | ||
5545 | 5912 | ||
5546 | if (!ctx) | 5913 | for_each_task_context_nr(ctxn) { |
5547 | return; | 5914 | ctx = task->perf_event_ctxp[ctxn]; |
5915 | if (!ctx) | ||
5916 | continue; | ||
5548 | 5917 | ||
5549 | mutex_lock(&ctx->mutex); | 5918 | mutex_lock(&ctx->mutex); |
5550 | again: | 5919 | again: |
5551 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 5920 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
5552 | perf_free_event(event, ctx); | 5921 | group_entry) |
5922 | perf_free_event(event, ctx); | ||
5553 | 5923 | ||
5554 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 5924 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5555 | group_entry) | 5925 | group_entry) |
5556 | perf_free_event(event, ctx); | 5926 | perf_free_event(event, ctx); |
5557 | 5927 | ||
5558 | if (!list_empty(&ctx->pinned_groups) || | 5928 | if (!list_empty(&ctx->pinned_groups) || |
5559 | !list_empty(&ctx->flexible_groups)) | 5929 | !list_empty(&ctx->flexible_groups)) |
5560 | goto again; | 5930 | goto again; |
5561 | 5931 | ||
5562 | mutex_unlock(&ctx->mutex); | 5932 | mutex_unlock(&ctx->mutex); |
5563 | 5933 | ||
5564 | put_ctx(ctx); | 5934 | put_ctx(ctx); |
5935 | } | ||
5936 | } | ||
5937 | |||
5938 | void perf_event_delayed_put(struct task_struct *task) | ||
5939 | { | ||
5940 | int ctxn; | ||
5941 | |||
5942 | for_each_task_context_nr(ctxn) | ||
5943 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
5944 | } | ||
5945 | |||
5946 | /* | ||
5947 | * inherit a event from parent task to child task: | ||
5948 | */ | ||
5949 | static struct perf_event * | ||
5950 | inherit_event(struct perf_event *parent_event, | ||
5951 | struct task_struct *parent, | ||
5952 | struct perf_event_context *parent_ctx, | ||
5953 | struct task_struct *child, | ||
5954 | struct perf_event *group_leader, | ||
5955 | struct perf_event_context *child_ctx) | ||
5956 | { | ||
5957 | struct perf_event *child_event; | ||
5958 | unsigned long flags; | ||
5959 | |||
5960 | /* | ||
5961 | * Instead of creating recursive hierarchies of events, | ||
5962 | * we link inherited events back to the original parent, | ||
5963 | * which has a filp for sure, which we use as the reference | ||
5964 | * count: | ||
5965 | */ | ||
5966 | if (parent_event->parent) | ||
5967 | parent_event = parent_event->parent; | ||
5968 | |||
5969 | child_event = perf_event_alloc(&parent_event->attr, | ||
5970 | parent_event->cpu, | ||
5971 | child, | ||
5972 | group_leader, parent_event, | ||
5973 | NULL); | ||
5974 | if (IS_ERR(child_event)) | ||
5975 | return child_event; | ||
5976 | get_ctx(child_ctx); | ||
5977 | |||
5978 | /* | ||
5979 | * Make the child state follow the state of the parent event, | ||
5980 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5981 | * so we won't race with perf_event_{en, dis}able_family. | ||
5982 | */ | ||
5983 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5984 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5985 | else | ||
5986 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5987 | |||
5988 | if (parent_event->attr.freq) { | ||
5989 | u64 sample_period = parent_event->hw.sample_period; | ||
5990 | struct hw_perf_event *hwc = &child_event->hw; | ||
5991 | |||
5992 | hwc->sample_period = sample_period; | ||
5993 | hwc->last_period = sample_period; | ||
5994 | |||
5995 | local64_set(&hwc->period_left, sample_period); | ||
5996 | } | ||
5997 | |||
5998 | child_event->ctx = child_ctx; | ||
5999 | child_event->overflow_handler = parent_event->overflow_handler; | ||
6000 | |||
6001 | /* | ||
6002 | * Link it up in the child's context: | ||
6003 | */ | ||
6004 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
6005 | add_event_to_ctx(child_event, child_ctx); | ||
6006 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
6007 | |||
6008 | /* | ||
6009 | * Get a reference to the parent filp - we will fput it | ||
6010 | * when the child event exits. This is safe to do because | ||
6011 | * we are in the parent and we know that the filp still | ||
6012 | * exists and has a nonzero count: | ||
6013 | */ | ||
6014 | atomic_long_inc(&parent_event->filp->f_count); | ||
6015 | |||
6016 | /* | ||
6017 | * Link this into the parent event's child list | ||
6018 | */ | ||
6019 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
6020 | mutex_lock(&parent_event->child_mutex); | ||
6021 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
6022 | mutex_unlock(&parent_event->child_mutex); | ||
6023 | |||
6024 | return child_event; | ||
6025 | } | ||
6026 | |||
6027 | static int inherit_group(struct perf_event *parent_event, | ||
6028 | struct task_struct *parent, | ||
6029 | struct perf_event_context *parent_ctx, | ||
6030 | struct task_struct *child, | ||
6031 | struct perf_event_context *child_ctx) | ||
6032 | { | ||
6033 | struct perf_event *leader; | ||
6034 | struct perf_event *sub; | ||
6035 | struct perf_event *child_ctr; | ||
6036 | |||
6037 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
6038 | child, NULL, child_ctx); | ||
6039 | if (IS_ERR(leader)) | ||
6040 | return PTR_ERR(leader); | ||
6041 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
6042 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
6043 | child, leader, child_ctx); | ||
6044 | if (IS_ERR(child_ctr)) | ||
6045 | return PTR_ERR(child_ctr); | ||
6046 | } | ||
6047 | return 0; | ||
5565 | } | 6048 | } |
5566 | 6049 | ||
5567 | static int | 6050 | static int |
5568 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 6051 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5569 | struct perf_event_context *parent_ctx, | 6052 | struct perf_event_context *parent_ctx, |
5570 | struct task_struct *child, | 6053 | struct task_struct *child, int ctxn, |
5571 | int *inherited_all) | 6054 | int *inherited_all) |
5572 | { | 6055 | { |
5573 | int ret; | 6056 | int ret; |
5574 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 6057 | struct perf_event_context *child_ctx; |
5575 | 6058 | ||
5576 | if (!event->attr.inherit) { | 6059 | if (!event->attr.inherit) { |
5577 | *inherited_all = 0; | 6060 | *inherited_all = 0; |
5578 | return 0; | 6061 | return 0; |
5579 | } | 6062 | } |
5580 | 6063 | ||
6064 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5581 | if (!child_ctx) { | 6065 | if (!child_ctx) { |
5582 | /* | 6066 | /* |
5583 | * This is executed from the parent task context, so | 6067 | * This is executed from the parent task context, so |
@@ -5586,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5586 | * child. | 6070 | * child. |
5587 | */ | 6071 | */ |
5588 | 6072 | ||
5589 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 6073 | child_ctx = alloc_perf_context(event->pmu, child); |
5590 | GFP_KERNEL); | ||
5591 | if (!child_ctx) | 6074 | if (!child_ctx) |
5592 | return -ENOMEM; | 6075 | return -ENOMEM; |
5593 | 6076 | ||
5594 | __perf_event_init_context(child_ctx, child); | 6077 | child->perf_event_ctxp[ctxn] = child_ctx; |
5595 | child->perf_event_ctxp = child_ctx; | ||
5596 | get_task_struct(child); | ||
5597 | } | 6078 | } |
5598 | 6079 | ||
5599 | ret = inherit_group(event, parent, parent_ctx, | 6080 | ret = inherit_group(event, parent, parent_ctx, |
@@ -5605,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5605 | return ret; | 6086 | return ret; |
5606 | } | 6087 | } |
5607 | 6088 | ||
5608 | |||
5609 | /* | 6089 | /* |
5610 | * Initialize the perf_event context in task_struct | 6090 | * Initialize the perf_event context in task_struct |
5611 | */ | 6091 | */ |
5612 | int perf_event_init_task(struct task_struct *child) | 6092 | int perf_event_init_context(struct task_struct *child, int ctxn) |
5613 | { | 6093 | { |
5614 | struct perf_event_context *child_ctx, *parent_ctx; | 6094 | struct perf_event_context *child_ctx, *parent_ctx; |
5615 | struct perf_event_context *cloned_ctx; | 6095 | struct perf_event_context *cloned_ctx; |
@@ -5618,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child) | |||
5618 | int inherited_all = 1; | 6098 | int inherited_all = 1; |
5619 | int ret = 0; | 6099 | int ret = 0; |
5620 | 6100 | ||
5621 | child->perf_event_ctxp = NULL; | 6101 | child->perf_event_ctxp[ctxn] = NULL; |
5622 | 6102 | ||
5623 | mutex_init(&child->perf_event_mutex); | 6103 | mutex_init(&child->perf_event_mutex); |
5624 | INIT_LIST_HEAD(&child->perf_event_list); | 6104 | INIT_LIST_HEAD(&child->perf_event_list); |
5625 | 6105 | ||
5626 | if (likely(!parent->perf_event_ctxp)) | 6106 | if (likely(!parent->perf_event_ctxp[ctxn])) |
5627 | return 0; | 6107 | return 0; |
5628 | 6108 | ||
5629 | /* | 6109 | /* |
5630 | * If the parent's context is a clone, pin it so it won't get | 6110 | * If the parent's context is a clone, pin it so it won't get |
5631 | * swapped under us. | 6111 | * swapped under us. |
5632 | */ | 6112 | */ |
5633 | parent_ctx = perf_pin_task_context(parent); | 6113 | parent_ctx = perf_pin_task_context(parent, ctxn); |
5634 | 6114 | ||
5635 | /* | 6115 | /* |
5636 | * No need to check if parent_ctx != NULL here; since we saw | 6116 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -5650,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child) | |||
5650 | * the list, not manipulating it: | 6130 | * the list, not manipulating it: |
5651 | */ | 6131 | */ |
5652 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 6132 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5653 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6133 | ret = inherit_task_group(event, parent, parent_ctx, |
5654 | &inherited_all); | 6134 | child, ctxn, &inherited_all); |
5655 | if (ret) | 6135 | if (ret) |
5656 | break; | 6136 | break; |
5657 | } | 6137 | } |
5658 | 6138 | ||
5659 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6139 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5660 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6140 | ret = inherit_task_group(event, parent, parent_ctx, |
5661 | &inherited_all); | 6141 | child, ctxn, &inherited_all); |
5662 | if (ret) | 6142 | if (ret) |
5663 | break; | 6143 | break; |
5664 | } | 6144 | } |
5665 | 6145 | ||
5666 | child_ctx = child->perf_event_ctxp; | 6146 | child_ctx = child->perf_event_ctxp[ctxn]; |
5667 | 6147 | ||
5668 | if (child_ctx && inherited_all) { | 6148 | if (child_ctx && inherited_all) { |
5669 | /* | 6149 | /* |
@@ -5692,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child) | |||
5692 | return ret; | 6172 | return ret; |
5693 | } | 6173 | } |
5694 | 6174 | ||
6175 | /* | ||
6176 | * Initialize the perf_event context in task_struct | ||
6177 | */ | ||
6178 | int perf_event_init_task(struct task_struct *child) | ||
6179 | { | ||
6180 | int ctxn, ret; | ||
6181 | |||
6182 | for_each_task_context_nr(ctxn) { | ||
6183 | ret = perf_event_init_context(child, ctxn); | ||
6184 | if (ret) | ||
6185 | return ret; | ||
6186 | } | ||
6187 | |||
6188 | return 0; | ||
6189 | } | ||
6190 | |||
5695 | static void __init perf_event_init_all_cpus(void) | 6191 | static void __init perf_event_init_all_cpus(void) |
5696 | { | 6192 | { |
6193 | struct swevent_htable *swhash; | ||
5697 | int cpu; | 6194 | int cpu; |
5698 | struct perf_cpu_context *cpuctx; | ||
5699 | 6195 | ||
5700 | for_each_possible_cpu(cpu) { | 6196 | for_each_possible_cpu(cpu) { |
5701 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6197 | swhash = &per_cpu(swevent_htable, cpu); |
5702 | mutex_init(&cpuctx->hlist_mutex); | 6198 | mutex_init(&swhash->hlist_mutex); |
5703 | __perf_event_init_context(&cpuctx->ctx, NULL); | 6199 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
5704 | } | 6200 | } |
5705 | } | 6201 | } |
5706 | 6202 | ||
5707 | static void __cpuinit perf_event_init_cpu(int cpu) | 6203 | static void __cpuinit perf_event_init_cpu(int cpu) |
5708 | { | 6204 | { |
5709 | struct perf_cpu_context *cpuctx; | 6205 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5710 | |||
5711 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5712 | 6206 | ||
5713 | spin_lock(&perf_resource_lock); | 6207 | mutex_lock(&swhash->hlist_mutex); |
5714 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | 6208 | if (swhash->hlist_refcount > 0) { |
5715 | spin_unlock(&perf_resource_lock); | ||
5716 | |||
5717 | mutex_lock(&cpuctx->hlist_mutex); | ||
5718 | if (cpuctx->hlist_refcount > 0) { | ||
5719 | struct swevent_hlist *hlist; | 6209 | struct swevent_hlist *hlist; |
5720 | 6210 | ||
5721 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 6211 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
5722 | WARN_ON_ONCE(!hlist); | 6212 | WARN_ON(!hlist); |
5723 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 6213 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
5724 | } | 6214 | } |
5725 | mutex_unlock(&cpuctx->hlist_mutex); | 6215 | mutex_unlock(&swhash->hlist_mutex); |
5726 | } | 6216 | } |
5727 | 6217 | ||
5728 | #ifdef CONFIG_HOTPLUG_CPU | 6218 | #ifdef CONFIG_HOTPLUG_CPU |
5729 | static void __perf_event_exit_cpu(void *info) | 6219 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
5730 | { | 6220 | { |
5731 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 6221 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
5732 | struct perf_event_context *ctx = &cpuctx->ctx; | 6222 | |
6223 | WARN_ON(!irqs_disabled()); | ||
6224 | |||
6225 | list_del_init(&cpuctx->rotation_list); | ||
6226 | } | ||
6227 | |||
6228 | static void __perf_event_exit_context(void *__info) | ||
6229 | { | ||
6230 | struct perf_event_context *ctx = __info; | ||
5733 | struct perf_event *event, *tmp; | 6231 | struct perf_event *event, *tmp; |
5734 | 6232 | ||
6233 | perf_pmu_rotate_stop(ctx->pmu); | ||
6234 | |||
5735 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6235 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5736 | __perf_event_remove_from_context(event); | 6236 | __perf_event_remove_from_context(event); |
5737 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 6237 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5738 | __perf_event_remove_from_context(event); | 6238 | __perf_event_remove_from_context(event); |
5739 | } | 6239 | } |
6240 | |||
6241 | static void perf_event_exit_cpu_context(int cpu) | ||
6242 | { | ||
6243 | struct perf_event_context *ctx; | ||
6244 | struct pmu *pmu; | ||
6245 | int idx; | ||
6246 | |||
6247 | idx = srcu_read_lock(&pmus_srcu); | ||
6248 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
6249 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
6250 | |||
6251 | mutex_lock(&ctx->mutex); | ||
6252 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
6253 | mutex_unlock(&ctx->mutex); | ||
6254 | } | ||
6255 | srcu_read_unlock(&pmus_srcu, idx); | ||
6256 | } | ||
6257 | |||
5740 | static void perf_event_exit_cpu(int cpu) | 6258 | static void perf_event_exit_cpu(int cpu) |
5741 | { | 6259 | { |
5742 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 6260 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5743 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
5744 | 6261 | ||
5745 | mutex_lock(&cpuctx->hlist_mutex); | 6262 | mutex_lock(&swhash->hlist_mutex); |
5746 | swevent_hlist_release(cpuctx); | 6263 | swevent_hlist_release(swhash); |
5747 | mutex_unlock(&cpuctx->hlist_mutex); | 6264 | mutex_unlock(&swhash->hlist_mutex); |
5748 | 6265 | ||
5749 | mutex_lock(&ctx->mutex); | 6266 | perf_event_exit_cpu_context(cpu); |
5750 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
5751 | mutex_unlock(&ctx->mutex); | ||
5752 | } | 6267 | } |
5753 | #else | 6268 | #else |
5754 | static inline void perf_event_exit_cpu(int cpu) { } | 6269 | static inline void perf_event_exit_cpu(int cpu) { } |
@@ -5778,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5778 | return NOTIFY_OK; | 6293 | return NOTIFY_OK; |
5779 | } | 6294 | } |
5780 | 6295 | ||
5781 | /* | ||
5782 | * This has to have a higher priority than migration_notifier in sched.c. | ||
5783 | */ | ||
5784 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
5785 | .notifier_call = perf_cpu_notify, | ||
5786 | .priority = 20, | ||
5787 | }; | ||
5788 | |||
5789 | void __init perf_event_init(void) | 6296 | void __init perf_event_init(void) |
5790 | { | 6297 | { |
5791 | perf_event_init_all_cpus(); | 6298 | perf_event_init_all_cpus(); |
5792 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 6299 | init_srcu_struct(&pmus_srcu); |
5793 | (void *)(long)smp_processor_id()); | 6300 | perf_pmu_register(&perf_swevent); |
5794 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 6301 | perf_pmu_register(&perf_cpu_clock); |
5795 | (void *)(long)smp_processor_id()); | 6302 | perf_pmu_register(&perf_task_clock); |
5796 | register_cpu_notifier(&perf_cpu_nb); | 6303 | perf_tp_register(); |
5797 | } | 6304 | perf_cpu_notifier(perf_cpu_notify); |
5798 | |||
5799 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | ||
5800 | struct sysdev_class_attribute *attr, | ||
5801 | char *buf) | ||
5802 | { | ||
5803 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
5804 | } | ||
5805 | |||
5806 | static ssize_t | ||
5807 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
5808 | struct sysdev_class_attribute *attr, | ||
5809 | const char *buf, | ||
5810 | size_t count) | ||
5811 | { | ||
5812 | struct perf_cpu_context *cpuctx; | ||
5813 | unsigned long val; | ||
5814 | int err, cpu, mpt; | ||
5815 | |||
5816 | err = strict_strtoul(buf, 10, &val); | ||
5817 | if (err) | ||
5818 | return err; | ||
5819 | if (val > perf_max_events) | ||
5820 | return -EINVAL; | ||
5821 | |||
5822 | spin_lock(&perf_resource_lock); | ||
5823 | perf_reserved_percpu = val; | ||
5824 | for_each_online_cpu(cpu) { | ||
5825 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5826 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
5827 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
5828 | perf_max_events - perf_reserved_percpu); | ||
5829 | cpuctx->max_pertask = mpt; | ||
5830 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
5831 | } | ||
5832 | spin_unlock(&perf_resource_lock); | ||
5833 | |||
5834 | return count; | ||
5835 | } | ||
5836 | |||
5837 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | ||
5838 | struct sysdev_class_attribute *attr, | ||
5839 | char *buf) | ||
5840 | { | ||
5841 | return sprintf(buf, "%d\n", perf_overcommit); | ||
5842 | } | ||
5843 | |||
5844 | static ssize_t | ||
5845 | perf_set_overcommit(struct sysdev_class *class, | ||
5846 | struct sysdev_class_attribute *attr, | ||
5847 | const char *buf, size_t count) | ||
5848 | { | ||
5849 | unsigned long val; | ||
5850 | int err; | ||
5851 | |||
5852 | err = strict_strtoul(buf, 10, &val); | ||
5853 | if (err) | ||
5854 | return err; | ||
5855 | if (val > 1) | ||
5856 | return -EINVAL; | ||
5857 | |||
5858 | spin_lock(&perf_resource_lock); | ||
5859 | perf_overcommit = val; | ||
5860 | spin_unlock(&perf_resource_lock); | ||
5861 | |||
5862 | return count; | ||
5863 | } | ||
5864 | |||
5865 | static SYSDEV_CLASS_ATTR( | ||
5866 | reserve_percpu, | ||
5867 | 0644, | ||
5868 | perf_show_reserve_percpu, | ||
5869 | perf_set_reserve_percpu | ||
5870 | ); | ||
5871 | |||
5872 | static SYSDEV_CLASS_ATTR( | ||
5873 | overcommit, | ||
5874 | 0644, | ||
5875 | perf_show_overcommit, | ||
5876 | perf_set_overcommit | ||
5877 | ); | ||
5878 | |||
5879 | static struct attribute *perfclass_attrs[] = { | ||
5880 | &attr_reserve_percpu.attr, | ||
5881 | &attr_overcommit.attr, | ||
5882 | NULL | ||
5883 | }; | ||
5884 | |||
5885 | static struct attribute_group perfclass_attr_group = { | ||
5886 | .attrs = perfclass_attrs, | ||
5887 | .name = "perf_events", | ||
5888 | }; | ||
5889 | |||
5890 | static int __init perf_event_sysfs_init(void) | ||
5891 | { | ||
5892 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
5893 | &perfclass_attr_group); | ||
5894 | } | 6305 | } |
5895 | device_initcall(perf_event_sysfs_init); | ||
diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
401 | struct task_struct *result = NULL; | 401 | struct task_struct *result = NULL; |
402 | if (pid) { | 402 | if (pid) { |
403 | struct hlist_node *first; | 403 | struct hlist_node *first; |
404 | first = rcu_dereference_check(pid->tasks[type].first, | 404 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
405 | rcu_read_lock_held() || | 405 | rcu_read_lock_held() || |
406 | lockdep_tasklist_lock_is_held()); | 406 | lockdep_tasklist_lock_is_held()); |
407 | if (first) | 407 | if (first) |
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); | |||
416 | */ | 416 | */ |
417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
418 | { | 418 | { |
419 | rcu_lockdep_assert(rcu_read_lock_held()); | ||
419 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 420 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
420 | } | 421 | } |
421 | 422 | ||
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 645e541a45f6..c7a8f453919e 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = { | |||
110 | .write = pm_qos_power_write, | 110 | .write = pm_qos_power_write, |
111 | .open = pm_qos_power_open, | 111 | .open = pm_qos_power_open, |
112 | .release = pm_qos_power_release, | 112 | .release = pm_qos_power_release, |
113 | .llseek = noop_llseek, | ||
113 | }; | 114 | }; |
114 | 115 | ||
115 | /* unlocked internal variant */ | 116 | /* unlocked internal variant */ |
@@ -398,7 +399,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
398 | } else | 399 | } else |
399 | return -EINVAL; | 400 | return -EINVAL; |
400 | 401 | ||
401 | pm_qos_req = (struct pm_qos_request_list *)filp->private_data; | 402 | pm_qos_req = filp->private_data; |
402 | pm_qos_update_request(pm_qos_req, value); | 403 | pm_qos_update_request(pm_qos_req, value); |
403 | 404 | ||
404 | return count; | 405 | return count; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a6952e..29bff6117abc 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP | |||
86 | depends on SMP | 86 | depends on SMP |
87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | 87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE |
88 | depends on PM_SLEEP | 88 | depends on PM_SLEEP |
89 | select HOTPLUG | ||
89 | select HOTPLUG_CPU | 90 | select HOTPLUG_CPU |
90 | default y | 91 | default y |
91 | 92 | ||
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER | |||
137 | config HIBERNATION | 138 | config HIBERNATION |
138 | bool "Hibernation (aka 'suspend to disk')" | 139 | bool "Hibernation (aka 'suspend to disk')" |
139 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 140 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
141 | select LZO_COMPRESS | ||
142 | select LZO_DECOMPRESS | ||
140 | select SUSPEND_NVS if HAS_IOMEM | 143 | select SUSPEND_NVS if HAS_IOMEM |
141 | ---help--- | 144 | ---help--- |
142 | Enable the suspend to disk (STD) functionality, which is usually | 145 | Enable the suspend to disk (STD) functionality, which is usually |
@@ -242,3 +245,17 @@ config PM_OPS | |||
242 | bool | 245 | bool |
243 | depends on PM_SLEEP || PM_RUNTIME | 246 | depends on PM_SLEEP || PM_RUNTIME |
244 | default y | 247 | default y |
248 | |||
249 | config PM_OPP | ||
250 | bool "Operating Performance Point (OPP) Layer library" | ||
251 | depends on PM | ||
252 | ---help--- | ||
253 | SOCs have a standard set of tuples consisting of frequency and | ||
254 | voltage pairs that the device will support per voltage domain. This | ||
255 | is called Operating Performance Point or OPP. The actual definitions | ||
256 | of OPP vary over silicon within the same family of devices. | ||
257 | |||
258 | OPP layer organizes the data internally using device pointers | ||
259 | representing individual voltage domains and provides SOC | ||
260 | implementations a ready to use framework to manage OPPs. | ||
261 | For more information, read <file:Documentation/power/opp.txt> | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..657272e91d0a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include "power.h" | 29 | #include "power.h" |
30 | 30 | ||
31 | 31 | ||
32 | static int nocompress = 0; | ||
32 | static int noresume = 0; | 33 | static int noresume = 0; |
33 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 34 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
34 | dev_t swsusp_resume_device; | 35 | dev_t swsusp_resume_device; |
@@ -638,6 +639,8 @@ int hibernate(void) | |||
638 | 639 | ||
639 | if (hibernation_mode == HIBERNATION_PLATFORM) | 640 | if (hibernation_mode == HIBERNATION_PLATFORM) |
640 | flags |= SF_PLATFORM_MODE; | 641 | flags |= SF_PLATFORM_MODE; |
642 | if (nocompress) | ||
643 | flags |= SF_NOCOMPRESS_MODE; | ||
641 | pr_debug("PM: writing image.\n"); | 644 | pr_debug("PM: writing image.\n"); |
642 | error = swsusp_write(flags); | 645 | error = swsusp_write(flags); |
643 | swsusp_free(); | 646 | swsusp_free(); |
@@ -705,7 +708,7 @@ static int software_resume(void) | |||
705 | goto Unlock; | 708 | goto Unlock; |
706 | } | 709 | } |
707 | 710 | ||
708 | pr_debug("PM: Checking image partition %s\n", resume_file); | 711 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); |
709 | 712 | ||
710 | /* Check if the device is there */ | 713 | /* Check if the device is there */ |
711 | swsusp_resume_device = name_to_dev_t(resume_file); | 714 | swsusp_resume_device = name_to_dev_t(resume_file); |
@@ -730,10 +733,10 @@ static int software_resume(void) | |||
730 | } | 733 | } |
731 | 734 | ||
732 | Check_image: | 735 | Check_image: |
733 | pr_debug("PM: Resume from partition %d:%d\n", | 736 | pr_debug("PM: Hibernation image partition %d:%d present\n", |
734 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | 737 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); |
735 | 738 | ||
736 | pr_debug("PM: Checking hibernation image.\n"); | 739 | pr_debug("PM: Looking for hibernation image.\n"); |
737 | error = swsusp_check(); | 740 | error = swsusp_check(); |
738 | if (error) | 741 | if (error) |
739 | goto Unlock; | 742 | goto Unlock; |
@@ -765,14 +768,14 @@ static int software_resume(void) | |||
765 | goto Done; | 768 | goto Done; |
766 | } | 769 | } |
767 | 770 | ||
768 | pr_debug("PM: Reading hibernation image.\n"); | 771 | pr_debug("PM: Loading hibernation image.\n"); |
769 | 772 | ||
770 | error = swsusp_read(&flags); | 773 | error = swsusp_read(&flags); |
771 | swsusp_close(FMODE_READ); | 774 | swsusp_close(FMODE_READ); |
772 | if (!error) | 775 | if (!error) |
773 | hibernation_restore(flags & SF_PLATFORM_MODE); | 776 | hibernation_restore(flags & SF_PLATFORM_MODE); |
774 | 777 | ||
775 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 778 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); |
776 | swsusp_free(); | 779 | swsusp_free(); |
777 | thaw_processes(); | 780 | thaw_processes(); |
778 | Done: | 781 | Done: |
@@ -785,7 +788,7 @@ static int software_resume(void) | |||
785 | /* For success case, the suspend path will release the lock */ | 788 | /* For success case, the suspend path will release the lock */ |
786 | Unlock: | 789 | Unlock: |
787 | mutex_unlock(&pm_mutex); | 790 | mutex_unlock(&pm_mutex); |
788 | pr_debug("PM: Resume from disk failed.\n"); | 791 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); |
789 | return error; | 792 | return error; |
790 | close_finish: | 793 | close_finish: |
791 | swsusp_close(FMODE_READ); | 794 | swsusp_close(FMODE_READ); |
@@ -1004,6 +1007,15 @@ static int __init resume_offset_setup(char *str) | |||
1004 | return 1; | 1007 | return 1; |
1005 | } | 1008 | } |
1006 | 1009 | ||
1010 | static int __init hibernate_setup(char *str) | ||
1011 | { | ||
1012 | if (!strncmp(str, "noresume", 8)) | ||
1013 | noresume = 1; | ||
1014 | else if (!strncmp(str, "nocompress", 10)) | ||
1015 | nocompress = 1; | ||
1016 | return 1; | ||
1017 | } | ||
1018 | |||
1007 | static int __init noresume_setup(char *str) | 1019 | static int __init noresume_setup(char *str) |
1008 | { | 1020 | { |
1009 | noresume = 1; | 1021 | noresume = 1; |
@@ -1013,3 +1025,4 @@ static int __init noresume_setup(char *str) | |||
1013 | __setup("noresume", noresume_setup); | 1025 | __setup("noresume", noresume_setup); |
1014 | __setup("resume_offset=", resume_offset_setup); | 1026 | __setup("resume_offset=", resume_offset_setup); |
1015 | __setup("resume=", resume_setup); | 1027 | __setup("resume=", resume_setup); |
1028 | __setup("hibernate=", hibernate_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 62b0bc6e4983..7b5db6a8561e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -237,18 +237,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
237 | struct kobj_attribute *attr, | 237 | struct kobj_attribute *attr, |
238 | char *buf) | 238 | char *buf) |
239 | { | 239 | { |
240 | unsigned long val; | 240 | unsigned int val; |
241 | 241 | ||
242 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; | 242 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; |
243 | } | 243 | } |
244 | 244 | ||
245 | static ssize_t wakeup_count_store(struct kobject *kobj, | 245 | static ssize_t wakeup_count_store(struct kobject *kobj, |
246 | struct kobj_attribute *attr, | 246 | struct kobj_attribute *attr, |
247 | const char *buf, size_t n) | 247 | const char *buf, size_t n) |
248 | { | 248 | { |
249 | unsigned long val; | 249 | unsigned int val; |
250 | 250 | ||
251 | if (sscanf(buf, "%lu", &val) == 1) { | 251 | if (sscanf(buf, "%u", &val) == 1) { |
252 | if (pm_save_wakeup_count(val)) | 252 | if (pm_save_wakeup_count(val)) |
253 | return n; | 253 | return n; |
254 | } | 254 | } |
@@ -281,12 +281,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
281 | } | 281 | } |
282 | 282 | ||
283 | power_attr(pm_trace); | 283 | power_attr(pm_trace); |
284 | |||
285 | static ssize_t pm_trace_dev_match_show(struct kobject *kobj, | ||
286 | struct kobj_attribute *attr, | ||
287 | char *buf) | ||
288 | { | ||
289 | return show_trace_dev_match(buf, PAGE_SIZE); | ||
290 | } | ||
291 | |||
292 | static ssize_t | ||
293 | pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
294 | const char *buf, size_t n) | ||
295 | { | ||
296 | return -EINVAL; | ||
297 | } | ||
298 | |||
299 | power_attr(pm_trace_dev_match); | ||
300 | |||
284 | #endif /* CONFIG_PM_TRACE */ | 301 | #endif /* CONFIG_PM_TRACE */ |
285 | 302 | ||
286 | static struct attribute * g[] = { | 303 | static struct attribute * g[] = { |
287 | &state_attr.attr, | 304 | &state_attr.attr, |
288 | #ifdef CONFIG_PM_TRACE | 305 | #ifdef CONFIG_PM_TRACE |
289 | &pm_trace_attr.attr, | 306 | &pm_trace_attr.attr, |
307 | &pm_trace_dev_match_attr.attr, | ||
290 | #endif | 308 | #endif |
291 | #ifdef CONFIG_PM_SLEEP | 309 | #ifdef CONFIG_PM_SLEEP |
292 | &pm_async_attr.attr, | 310 | &pm_async_attr.attr, |
@@ -308,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
308 | 326 | ||
309 | static int __init pm_start_workqueue(void) | 327 | static int __init pm_start_workqueue(void) |
310 | { | 328 | { |
311 | pm_wq = create_freezeable_workqueue("pm"); | 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); |
312 | 330 | ||
313 | return pm_wq ? 0 : -ENOMEM; | 331 | return pm_wq ? 0 : -ENOMEM; |
314 | } | 332 | } |
@@ -321,6 +339,7 @@ static int __init pm_init(void) | |||
321 | int error = pm_start_workqueue(); | 339 | int error = pm_start_workqueue(); |
322 | if (error) | 340 | if (error) |
323 | return error; | 341 | return error; |
342 | hibernate_image_size_init(); | ||
324 | power_kobj = kobject_create_and_add("power", NULL); | 343 | power_kobj = kobject_create_and_add("power", NULL); |
325 | if (!power_kobj) | 344 | if (!power_kobj) |
326 | return -ENOMEM; | 345 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270fe382d..03634be55f62 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -14,6 +14,9 @@ struct swsusp_info { | |||
14 | } __attribute__((aligned(PAGE_SIZE))); | 14 | } __attribute__((aligned(PAGE_SIZE))); |
15 | 15 | ||
16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* kernel/power/snapshot.c */ | ||
18 | extern void __init hibernate_image_size_init(void); | ||
19 | |||
17 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 20 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
18 | /* Maximum size of architecture specific data in a hibernation header */ | 21 | /* Maximum size of architecture specific data in a hibernation header */ |
19 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) | 22 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) |
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
49 | extern int hibernation_snapshot(int platform_mode); | 52 | extern int hibernation_snapshot(int platform_mode); |
50 | extern int hibernation_restore(int platform_mode); | 53 | extern int hibernation_restore(int platform_mode); |
51 | extern int hibernation_platform_enter(void); | 54 | extern int hibernation_platform_enter(void); |
52 | #endif | 55 | |
56 | #else /* !CONFIG_HIBERNATION */ | ||
57 | |||
58 | static inline void hibernate_image_size_init(void) {} | ||
59 | #endif /* !CONFIG_HIBERNATION */ | ||
53 | 60 | ||
54 | extern int pfn_is_nosave(unsigned long); | 61 | extern int pfn_is_nosave(unsigned long); |
55 | 62 | ||
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void); | |||
134 | * the image header. | 141 | * the image header. |
135 | */ | 142 | */ |
136 | #define SF_PLATFORM_MODE 1 | 143 | #define SF_PLATFORM_MODE 1 |
144 | #define SF_NOCOMPRESS_MODE 2 | ||
137 | 145 | ||
138 | /* kernel/power/hibernate.c */ | 146 | /* kernel/power/hibernate.c */ |
139 | extern int swsusp_check(void); | 147 | extern int swsusp_check(void); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 028a99598f49..e50b4c1b2a0f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
40 | struct timeval start, end; | 40 | struct timeval start, end; |
41 | u64 elapsed_csecs64; | 41 | u64 elapsed_csecs64; |
42 | unsigned int elapsed_csecs; | 42 | unsigned int elapsed_csecs; |
43 | bool wakeup = false; | ||
43 | 44 | ||
44 | do_gettimeofday(&start); | 45 | do_gettimeofday(&start); |
45 | 46 | ||
@@ -78,6 +79,11 @@ static int try_to_freeze_tasks(bool sig_only) | |||
78 | if (!todo || time_after(jiffies, end_time)) | 79 | if (!todo || time_after(jiffies, end_time)) |
79 | break; | 80 | break; |
80 | 81 | ||
82 | if (!pm_check_wakeup_events()) { | ||
83 | wakeup = true; | ||
84 | break; | ||
85 | } | ||
86 | |||
81 | /* | 87 | /* |
82 | * We need to retry, but first give the freezing tasks some | 88 | * We need to retry, but first give the freezing tasks some |
83 | * time to enter the refrigerator. | 89 | * time to enter the refrigerator. |
@@ -97,8 +103,9 @@ static int try_to_freeze_tasks(bool sig_only) | |||
97 | * but it cleans up leftover PF_FREEZE requests. | 103 | * but it cleans up leftover PF_FREEZE requests. |
98 | */ | 104 | */ |
99 | printk("\n"); | 105 | printk("\n"); |
100 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 106 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " |
101 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 107 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
108 | wakeup ? "aborted" : "failed", | ||
102 | elapsed_csecs / 100, elapsed_csecs % 100, | 109 | elapsed_csecs / 100, elapsed_csecs % 100, |
103 | todo - wq_busy, wq_busy); | 110 | todo - wq_busy, wq_busy); |
104 | 111 | ||
@@ -107,7 +114,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
107 | read_lock(&tasklist_lock); | 114 | read_lock(&tasklist_lock); |
108 | do_each_thread(g, p) { | 115 | do_each_thread(g, p) { |
109 | task_lock(p); | 116 | task_lock(p); |
110 | if (freezing(p) && !freezer_should_skip(p)) | 117 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) |
111 | sched_show_task(p); | 118 | sched_show_task(p); |
112 | cancel_freezing(p); | 119 | cancel_freezing(p); |
113 | task_unlock(p); | 120 | task_unlock(p); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d3f795f01bbc..0dac75ea4456 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *); | |||
46 | * size will not exceed N bytes, but if that is impossible, it will | 46 | * size will not exceed N bytes, but if that is impossible, it will |
47 | * try to create the smallest image possible. | 47 | * try to create the smallest image possible. |
48 | */ | 48 | */ |
49 | unsigned long image_size = 500 * 1024 * 1024; | 49 | unsigned long image_size; |
50 | |||
51 | void __init hibernate_image_size_init(void) | ||
52 | { | ||
53 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | ||
54 | } | ||
50 | 55 | ||
51 | /* List of PBEs needed for restoring the pages that were allocated before | 56 | /* List of PBEs needed for restoring the pages that were allocated before |
52 | * the suspend and included in the suspend image, but have also been | 57 | * the suspend and included in the suspend image, but have also been |
@@ -979,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
979 | src = kmap_atomic(s_page, KM_USER0); | 984 | src = kmap_atomic(s_page, KM_USER0); |
980 | dst = kmap_atomic(d_page, KM_USER1); | 985 | dst = kmap_atomic(d_page, KM_USER1); |
981 | do_copy_page(dst, src); | 986 | do_copy_page(dst, src); |
982 | kunmap_atomic(src, KM_USER0); | ||
983 | kunmap_atomic(dst, KM_USER1); | 987 | kunmap_atomic(dst, KM_USER1); |
988 | kunmap_atomic(src, KM_USER0); | ||
984 | } else { | 989 | } else { |
985 | if (PageHighMem(d_page)) { | 990 | if (PageHighMem(d_page)) { |
986 | /* Page pointed to by src may contain some kernel | 991 | /* Page pointed to by src may contain some kernel |
@@ -988,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
988 | */ | 993 | */ |
989 | safe_copy_page(buffer, s_page); | 994 | safe_copy_page(buffer, s_page); |
990 | dst = kmap_atomic(d_page, KM_USER0); | 995 | dst = kmap_atomic(d_page, KM_USER0); |
991 | memcpy(dst, buffer, PAGE_SIZE); | 996 | copy_page(dst, buffer); |
992 | kunmap_atomic(dst, KM_USER0); | 997 | kunmap_atomic(dst, KM_USER0); |
993 | } else { | 998 | } else { |
994 | safe_copy_page(page_address(d_page), s_page); | 999 | safe_copy_page(page_address(d_page), s_page); |
@@ -1318,12 +1323,14 @@ int hibernate_preallocate_memory(void) | |||
1318 | 1323 | ||
1319 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1324 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1320 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1325 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; |
1326 | /* Compute the desired number of image pages specified by image_size. */ | ||
1321 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1327 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
1322 | if (size > max_size) | 1328 | if (size > max_size) |
1323 | size = max_size; | 1329 | size = max_size; |
1324 | /* | 1330 | /* |
1325 | * If the maximum is not less than the current number of saveable pages | 1331 | * If the desired number of image pages is at least as large as the |
1326 | * in memory, allocate page frames for the image and we're done. | 1332 | * current number of saveable pages in memory, allocate page frames for |
1333 | * the image and we're done. | ||
1327 | */ | 1334 | */ |
1328 | if (size >= saveable) { | 1335 | if (size >= saveable) { |
1329 | pages = preallocate_image_highmem(save_highmem); | 1336 | pages = preallocate_image_highmem(save_highmem); |
@@ -1680,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1680 | memory_bm_position_reset(&orig_bm); | 1687 | memory_bm_position_reset(&orig_bm); |
1681 | memory_bm_position_reset(&copy_bm); | 1688 | memory_bm_position_reset(&copy_bm); |
1682 | } else if (handle->cur <= nr_meta_pages) { | 1689 | } else if (handle->cur <= nr_meta_pages) { |
1683 | memset(buffer, 0, PAGE_SIZE); | 1690 | clear_page(buffer); |
1684 | pack_pfns(buffer, &orig_bm); | 1691 | pack_pfns(buffer, &orig_bm); |
1685 | } else { | 1692 | } else { |
1686 | struct page *page; | 1693 | struct page *page; |
@@ -1694,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1694 | void *kaddr; | 1701 | void *kaddr; |
1695 | 1702 | ||
1696 | kaddr = kmap_atomic(page, KM_USER0); | 1703 | kaddr = kmap_atomic(page, KM_USER0); |
1697 | memcpy(buffer, kaddr, PAGE_SIZE); | 1704 | copy_page(buffer, kaddr); |
1698 | kunmap_atomic(kaddr, KM_USER0); | 1705 | kunmap_atomic(kaddr, KM_USER0); |
1699 | handle->buffer = buffer; | 1706 | handle->buffer = buffer; |
1700 | } else { | 1707 | } else { |
@@ -1977,7 +1984,7 @@ static void copy_last_highmem_page(void) | |||
1977 | void *dst; | 1984 | void *dst; |
1978 | 1985 | ||
1979 | dst = kmap_atomic(last_highmem_page, KM_USER0); | 1986 | dst = kmap_atomic(last_highmem_page, KM_USER0); |
1980 | memcpy(dst, buffer, PAGE_SIZE); | 1987 | copy_page(dst, buffer); |
1981 | kunmap_atomic(dst, KM_USER0); | 1988 | kunmap_atomic(dst, KM_USER0); |
1982 | last_highmem_page = NULL; | 1989 | last_highmem_page = NULL; |
1983 | } | 1990 | } |
@@ -2263,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
2263 | 2270 | ||
2264 | kaddr1 = kmap_atomic(p1, KM_USER0); | 2271 | kaddr1 = kmap_atomic(p1, KM_USER0); |
2265 | kaddr2 = kmap_atomic(p2, KM_USER1); | 2272 | kaddr2 = kmap_atomic(p2, KM_USER1); |
2266 | memcpy(buf, kaddr1, PAGE_SIZE); | 2273 | copy_page(buf, kaddr1); |
2267 | memcpy(kaddr1, kaddr2, PAGE_SIZE); | 2274 | copy_page(kaddr1, kaddr2); |
2268 | memcpy(kaddr2, buf, PAGE_SIZE); | 2275 | copy_page(kaddr2, buf); |
2269 | kunmap_atomic(kaddr1, KM_USER0); | ||
2270 | kunmap_atomic(kaddr2, KM_USER1); | 2276 | kunmap_atomic(kaddr2, KM_USER1); |
2277 | kunmap_atomic(kaddr1, KM_USER0); | ||
2271 | } | 2278 | } |
2272 | 2279 | ||
2273 | /** | 2280 | /** |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e6a5bdf61a37..a0e4a86ccf94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -24,10 +24,12 @@ | |||
24 | #include <linux/swapops.h> | 24 | #include <linux/swapops.h> |
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/lzo.h> | ||
28 | #include <linux/vmalloc.h> | ||
27 | 29 | ||
28 | #include "power.h" | 30 | #include "power.h" |
29 | 31 | ||
30 | #define SWSUSP_SIG "S1SUSPEND" | 32 | #define HIBERNATE_SIG "LINHIB0001" |
31 | 33 | ||
32 | /* | 34 | /* |
33 | * The swap map is a data structure used for keeping track of each page | 35 | * The swap map is a data structure used for keeping track of each page |
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
193 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || | 195 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || |
194 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { | 196 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { |
195 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 197 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
196 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); | 198 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); |
197 | swsusp_header->image = handle->first_sector; | 199 | swsusp_header->image = handle->first_sector; |
198 | swsusp_header->flags = flags; | 200 | swsusp_header->flags = flags; |
199 | error = hib_bio_write_page(swsusp_resume_block, | 201 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -249,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
249 | if (bio_chain) { | 251 | if (bio_chain) { |
250 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 252 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
251 | if (src) { | 253 | if (src) { |
252 | memcpy(src, buf, PAGE_SIZE); | 254 | copy_page(src, buf); |
253 | } else { | 255 | } else { |
254 | WARN_ON_ONCE(1); | 256 | WARN_ON_ONCE(1); |
255 | bio_chain = NULL; /* Go synchronous */ | 257 | bio_chain = NULL; /* Go synchronous */ |
@@ -323,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
323 | error = write_page(handle->cur, handle->cur_swap, NULL); | 325 | error = write_page(handle->cur, handle->cur_swap, NULL); |
324 | if (error) | 326 | if (error) |
325 | goto out; | 327 | goto out; |
326 | memset(handle->cur, 0, PAGE_SIZE); | 328 | clear_page(handle->cur); |
327 | handle->cur_swap = offset; | 329 | handle->cur_swap = offset; |
328 | handle->k = 0; | 330 | handle->k = 0; |
329 | } | 331 | } |
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
357 | return error; | 359 | return error; |
358 | } | 360 | } |
359 | 361 | ||
362 | /* We need to remember how much compressed data we need to read. */ | ||
363 | #define LZO_HEADER sizeof(size_t) | ||
364 | |||
365 | /* Number of pages/bytes we'll compress at one time. */ | ||
366 | #define LZO_UNC_PAGES 32 | ||
367 | #define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) | ||
368 | |||
369 | /* Number of pages/bytes we need for compressed data (worst case). */ | ||
370 | #define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ | ||
371 | LZO_HEADER, PAGE_SIZE) | ||
372 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) | ||
373 | |||
360 | /** | 374 | /** |
361 | * save_image - save the suspend image data | 375 | * save_image - save the suspend image data |
362 | */ | 376 | */ |
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle, | |||
404 | return ret; | 418 | return ret; |
405 | } | 419 | } |
406 | 420 | ||
421 | |||
422 | /** | ||
423 | * save_image_lzo - Save the suspend image data compressed with LZO. | ||
424 | * @handle: Swap map handle to use for saving the image. | |
425 | * @snapshot: Image to read data from. | ||
426 | * @nr_to_write: Number of pages to save. | ||
427 | */ | ||
428 | static int save_image_lzo(struct swap_map_handle *handle, | ||
429 | struct snapshot_handle *snapshot, | ||
430 | unsigned int nr_to_write) | ||
431 | { | ||
432 | unsigned int m; | ||
433 | int ret = 0; | ||
434 | int nr_pages; | ||
435 | int err2; | ||
436 | struct bio *bio; | ||
437 | struct timeval start; | ||
438 | struct timeval stop; | ||
439 | size_t off, unc_len, cmp_len; | ||
440 | unsigned char *unc, *cmp, *wrk, *page; | ||
441 | |||
442 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
443 | if (!page) { | ||
444 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
445 | return -ENOMEM; | ||
446 | } | ||
447 | |||
448 | wrk = vmalloc(LZO1X_1_MEM_COMPRESS); | ||
449 | if (!wrk) { | ||
450 | printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); | ||
451 | free_page((unsigned long)page); | ||
452 | return -ENOMEM; | ||
453 | } | ||
454 | |||
455 | unc = vmalloc(LZO_UNC_SIZE); | ||
456 | if (!unc) { | ||
457 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
458 | vfree(wrk); | ||
459 | free_page((unsigned long)page); | ||
460 | return -ENOMEM; | ||
461 | } | ||
462 | |||
463 | cmp = vmalloc(LZO_CMP_SIZE); | ||
464 | if (!cmp) { | ||
465 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
466 | vfree(unc); | ||
467 | vfree(wrk); | ||
468 | free_page((unsigned long)page); | ||
469 | return -ENOMEM; | ||
470 | } | ||
471 | |||
472 | printk(KERN_INFO | ||
473 | "PM: Compressing and saving image data (%u pages) ... ", | ||
474 | nr_to_write); | ||
475 | m = nr_to_write / 100; | ||
476 | if (!m) | ||
477 | m = 1; | ||
478 | nr_pages = 0; | ||
479 | bio = NULL; | ||
480 | do_gettimeofday(&start); | ||
481 | for (;;) { | ||
482 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | ||
483 | ret = snapshot_read_next(snapshot); | ||
484 | if (ret < 0) | ||
485 | goto out_finish; | ||
486 | |||
487 | if (!ret) | ||
488 | break; | ||
489 | |||
490 | memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); | ||
491 | |||
492 | if (!(nr_pages % m)) | ||
493 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | ||
494 | nr_pages++; | ||
495 | } | ||
496 | |||
497 | if (!off) | ||
498 | break; | ||
499 | |||
500 | unc_len = off; | ||
501 | ret = lzo1x_1_compress(unc, unc_len, | ||
502 | cmp + LZO_HEADER, &cmp_len, wrk); | ||
503 | if (ret < 0) { | ||
504 | printk(KERN_ERR "PM: LZO compression failed\n"); | ||
505 | break; | ||
506 | } | ||
507 | |||
508 | if (unlikely(!cmp_len || | ||
509 | cmp_len > lzo1x_worst_compress(unc_len))) { | ||
510 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
511 | ret = -1; | ||
512 | break; | ||
513 | } | ||
514 | |||
515 | *(size_t *)cmp = cmp_len; | ||
516 | |||
517 | /* | ||
518 | * Given we are writing one page at a time to disk, we copy | ||
519 | * that much from the buffer, although the last bit will likely | ||
520 | * be smaller than a full page. This is OK - we saved the length | |
521 | * of the compressed data, so any garbage at the end will be | ||
522 | * discarded when we read it. | ||
523 | */ | ||
524 | for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
525 | memcpy(page, cmp + off, PAGE_SIZE); | ||
526 | |||
527 | ret = swap_write_page(handle, page, &bio); | ||
528 | if (ret) | ||
529 | goto out_finish; | ||
530 | } | ||
531 | } | ||
532 | |||
533 | out_finish: | ||
534 | err2 = hib_wait_on_bio_chain(&bio); | ||
535 | do_gettimeofday(&stop); | ||
536 | if (!ret) | ||
537 | ret = err2; | ||
538 | if (!ret) | ||
539 | printk(KERN_CONT "\b\b\b\bdone\n"); | ||
540 | else | ||
541 | printk(KERN_CONT "\n"); | ||
542 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | ||
543 | |||
544 | vfree(cmp); | ||
545 | vfree(unc); | ||
546 | vfree(wrk); | ||
547 | free_page((unsigned long)page); | ||
548 | |||
549 | return ret; | ||
550 | } | ||
551 | |||
407 | /** | 552 | /** |
408 | * enough_swap - Make sure we have enough swap to save the image. | 553 | * enough_swap - Make sure we have enough swap to save the image. |
409 | * | 554 | * |
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle, | |||
411 | * space available from the resume partition. | 556 | * space available from the resume partition. |
412 | */ | 557 | */ |
413 | 558 | ||
414 | static int enough_swap(unsigned int nr_pages) | 559 | static int enough_swap(unsigned int nr_pages, unsigned int flags) |
415 | { | 560 | { |
416 | unsigned int free_swap = count_swap_pages(root_swap, 1); | 561 | unsigned int free_swap = count_swap_pages(root_swap, 1); |
562 | unsigned int required; | ||
417 | 563 | ||
418 | pr_debug("PM: Free swap pages: %u\n", free_swap); | 564 | pr_debug("PM: Free swap pages: %u\n", free_swap); |
419 | return free_swap > nr_pages + PAGES_FOR_IO; | 565 | |
566 | required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? | ||
567 | nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); | ||
568 | return free_swap > required; | ||
420 | } | 569 | } |
421 | 570 | ||
422 | /** | 571 | /** |
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags) | |||
443 | printk(KERN_ERR "PM: Cannot get swap writer\n"); | 592 | printk(KERN_ERR "PM: Cannot get swap writer\n"); |
444 | return error; | 593 | return error; |
445 | } | 594 | } |
446 | if (!enough_swap(pages)) { | 595 | if (!enough_swap(pages, flags)) { |
447 | printk(KERN_ERR "PM: Not enough free swap\n"); | 596 | printk(KERN_ERR "PM: Not enough free swap\n"); |
448 | error = -ENOSPC; | 597 | error = -ENOSPC; |
449 | goto out_finish; | 598 | goto out_finish; |
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags) | |||
458 | } | 607 | } |
459 | header = (struct swsusp_info *)data_of(snapshot); | 608 | header = (struct swsusp_info *)data_of(snapshot); |
460 | error = swap_write_page(&handle, header, NULL); | 609 | error = swap_write_page(&handle, header, NULL); |
461 | if (!error) | 610 | if (!error) { |
462 | error = save_image(&handle, &snapshot, pages - 1); | 611 | error = (flags & SF_NOCOMPRESS_MODE) ? |
612 | save_image(&handle, &snapshot, pages - 1) : | ||
613 | save_image_lzo(&handle, &snapshot, pages - 1); | ||
614 | } | ||
463 | out_finish: | 615 | out_finish: |
464 | error = swap_writer_finish(&handle, flags, error); | 616 | error = swap_writer_finish(&handle, flags, error); |
465 | return error; | 617 | return error; |
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle, | |||
590 | } | 742 | } |
591 | 743 | ||
592 | /** | 744 | /** |
745 | * load_image_lzo - Load compressed image data and decompress them with LZO. | ||
746 | * @handle: Swap map handle to use for loading data. | ||
747 | * @snapshot: Image to copy uncompressed data into. | ||
748 | * @nr_to_read: Number of pages to load. | ||
749 | */ | ||
750 | static int load_image_lzo(struct swap_map_handle *handle, | ||
751 | struct snapshot_handle *snapshot, | ||
752 | unsigned int nr_to_read) | ||
753 | { | ||
754 | unsigned int m; | ||
755 | int error = 0; | ||
756 | struct timeval start; | ||
757 | struct timeval stop; | ||
758 | unsigned nr_pages; | ||
759 | size_t off, unc_len, cmp_len; | ||
760 | unsigned char *unc, *cmp, *page; | ||
761 | |||
762 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
763 | if (!page) { | ||
764 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
765 | return -ENOMEM; | ||
766 | } | ||
767 | |||
768 | unc = vmalloc(LZO_UNC_SIZE); | ||
769 | if (!unc) { | ||
770 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
771 | free_page((unsigned long)page); | ||
772 | return -ENOMEM; | ||
773 | } | ||
774 | |||
775 | cmp = vmalloc(LZO_CMP_SIZE); | ||
776 | if (!cmp) { | ||
777 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
778 | vfree(unc); | ||
779 | free_page((unsigned long)page); | ||
780 | return -ENOMEM; | ||
781 | } | ||
782 | |||
783 | printk(KERN_INFO | ||
784 | "PM: Loading and decompressing image data (%u pages) ... ", | ||
785 | nr_to_read); | ||
786 | m = nr_to_read / 100; | ||
787 | if (!m) | ||
788 | m = 1; | ||
789 | nr_pages = 0; | ||
790 | do_gettimeofday(&start); | ||
791 | |||
792 | error = snapshot_write_next(snapshot); | ||
793 | if (error <= 0) | ||
794 | goto out_finish; | ||
795 | |||
796 | for (;;) { | ||
797 | error = swap_read_page(handle, page, NULL); /* sync */ | ||
798 | if (error) | ||
799 | break; | ||
800 | |||
801 | cmp_len = *(size_t *)page; | ||
802 | if (unlikely(!cmp_len || | ||
803 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | ||
804 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
805 | error = -1; | ||
806 | break; | ||
807 | } | ||
808 | |||
809 | memcpy(cmp, page, PAGE_SIZE); | ||
810 | for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
811 | error = swap_read_page(handle, page, NULL); /* sync */ | ||
812 | if (error) | ||
813 | goto out_finish; | ||
814 | |||
815 | memcpy(cmp + off, page, PAGE_SIZE); | ||
816 | } | ||
817 | |||
818 | unc_len = LZO_UNC_SIZE; | ||
819 | error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, | ||
820 | unc, &unc_len); | ||
821 | if (error < 0) { | ||
822 | printk(KERN_ERR "PM: LZO decompression failed\n"); | ||
823 | break; | ||
824 | } | ||
825 | |||
826 | if (unlikely(!unc_len || | ||
827 | unc_len > LZO_UNC_SIZE || | ||
828 | unc_len & (PAGE_SIZE - 1))) { | ||
829 | printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); | ||
830 | error = -1; | ||
831 | break; | ||
832 | } | ||
833 | |||
834 | for (off = 0; off < unc_len; off += PAGE_SIZE) { | ||
835 | memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); | ||
836 | |||
837 | if (!(nr_pages % m)) | ||
838 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
839 | nr_pages++; | ||
840 | |||
841 | error = snapshot_write_next(snapshot); | ||
842 | if (error <= 0) | ||
843 | goto out_finish; | ||
844 | } | ||
845 | } | ||
846 | |||
847 | out_finish: | ||
848 | do_gettimeofday(&stop); | ||
849 | if (!error) { | ||
850 | printk("\b\b\b\bdone\n"); | ||
851 | snapshot_write_finalize(snapshot); | ||
852 | if (!snapshot_image_loaded(snapshot)) | ||
853 | error = -ENODATA; | ||
854 | } else | ||
855 | printk("\n"); | ||
856 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | ||
857 | |||
858 | vfree(cmp); | ||
859 | vfree(unc); | ||
860 | free_page((unsigned long)page); | ||
861 | |||
862 | return error; | ||
863 | } | ||
864 | |||
865 | /** | ||
593 | * swsusp_read - read the hibernation image. | 866 | * swsusp_read - read the hibernation image. |
594 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 867 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
595 | * be written into this memory location | 868 | * be written into this memory location |
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p) | |||
612 | goto end; | 885 | goto end; |
613 | if (!error) | 886 | if (!error) |
614 | error = swap_read_page(&handle, header, NULL); | 887 | error = swap_read_page(&handle, header, NULL); |
615 | if (!error) | 888 | if (!error) { |
616 | error = load_image(&handle, &snapshot, header->pages - 1); | 889 | error = (*flags_p & SF_NOCOMPRESS_MODE) ? |
890 | load_image(&handle, &snapshot, header->pages - 1) : | ||
891 | load_image_lzo(&handle, &snapshot, header->pages - 1); | ||
892 | } | ||
617 | swap_reader_finish(&handle); | 893 | swap_reader_finish(&handle); |
618 | end: | 894 | end: |
619 | if (!error) | 895 | if (!error) |
@@ -634,13 +910,13 @@ int swsusp_check(void) | |||
634 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 910 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); |
635 | if (!IS_ERR(hib_resume_bdev)) { | 911 | if (!IS_ERR(hib_resume_bdev)) { |
636 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 912 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
637 | memset(swsusp_header, 0, PAGE_SIZE); | 913 | clear_page(swsusp_header); |
638 | error = hib_bio_read_page(swsusp_resume_block, | 914 | error = hib_bio_read_page(swsusp_resume_block, |
639 | swsusp_header, NULL); | 915 | swsusp_header, NULL); |
640 | if (error) | 916 | if (error) |
641 | goto put; | 917 | goto put; |
642 | 918 | ||
643 | if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { | 919 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { |
644 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); | 920 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); |
645 | /* Reset swap signature now */ | 921 | /* Reset swap signature now */ |
646 | error = hib_bio_write_page(swsusp_resume_block, | 922 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -653,13 +929,13 @@ put: | |||
653 | if (error) | 929 | if (error) |
654 | blkdev_put(hib_resume_bdev, FMODE_READ); | 930 | blkdev_put(hib_resume_bdev, FMODE_READ); |
655 | else | 931 | else |
656 | pr_debug("PM: Signature found, resuming\n"); | 932 | pr_debug("PM: Image signature found, resuming\n"); |
657 | } else { | 933 | } else { |
658 | error = PTR_ERR(hib_resume_bdev); | 934 | error = PTR_ERR(hib_resume_bdev); |
659 | } | 935 | } |
660 | 936 | ||
661 | if (error) | 937 | if (error) |
662 | pr_debug("PM: Error %d checking image file\n", error); | 938 | pr_debug("PM: Image not found (code %d)\n", error); |
663 | 939 | ||
664 | return error; | 940 | return error; |
665 | } | 941 | } |
diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..b2ebaee8c377 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); | |||
85 | * provides serialisation for access to the entire console | 85 | * provides serialisation for access to the entire console |
86 | * driver system. | 86 | * driver system. |
87 | */ | 87 | */ |
88 | static DECLARE_MUTEX(console_sem); | 88 | static DEFINE_SEMAPHORE(console_sem); |
89 | struct console *console_drivers; | 89 | struct console *console_drivers; |
90 | EXPORT_SYMBOL_GPL(console_drivers); | 90 | EXPORT_SYMBOL_GPL(console_drivers); |
91 | 91 | ||
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
210 | 210 | ||
211 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 211 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
212 | 212 | ||
213 | static unsigned int boot_delay; /* msecs delay after each printk during bootup */ | 213 | static int boot_delay; /* msecs delay after each printk during bootup */ |
214 | static unsigned long long loops_per_msec; /* based on boot_delay */ | 214 | static unsigned long long loops_per_msec; /* based on boot_delay */ |
215 | 215 | ||
216 | static int __init boot_delay_setup(char *str) | 216 | static int __init boot_delay_setup(char *str) |
@@ -556,7 +556,7 @@ static void zap_locks(void) | |||
556 | /* If a crash is occurring, make sure we can't deadlock */ | 556 | /* If a crash is occurring, make sure we can't deadlock */ |
557 | spin_lock_init(&logbuf_lock); | 557 | spin_lock_init(&logbuf_lock); |
558 | /* And make sure that we print immediately */ | 558 | /* And make sure that we print immediately */ |
559 | init_MUTEX(&console_sem); | 559 | sema_init(&console_sem, 1); |
560 | } | 560 | } |
561 | 561 | ||
562 | #if defined(CONFIG_PRINTK_TIME) | 562 | #if defined(CONFIG_PRINTK_TIME) |
@@ -647,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) | |||
647 | * released but interrupts still disabled. | 647 | * released but interrupts still disabled. |
648 | */ | 648 | */ |
649 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 649 | static int acquire_console_semaphore_for_printk(unsigned int cpu) |
650 | __releases(&logbuf_lock) | ||
650 | { | 651 | { |
651 | int retval = 0; | 652 | int retval = 0; |
652 | 653 | ||
@@ -1511,7 +1512,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1511 | } | 1512 | } |
1512 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1513 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1513 | 1514 | ||
1514 | static const char const *kmsg_reasons[] = { | 1515 | static const char * const kmsg_reasons[] = { |
1515 | [KMSG_DUMP_OOPS] = "oops", | 1516 | [KMSG_DUMP_OOPS] = "oops", |
1516 | [KMSG_DUMP_PANIC] = "panic", | 1517 | [KMSG_DUMP_PANIC] = "panic", |
1517 | [KMSG_DUMP_KEXEC] = "kexec", | 1518 | [KMSG_DUMP_KEXEC] = "kexec", |
diff --git a/kernel/profile.c b/kernel/profile.c index b22a899934cc..66f841b7fbd3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
555 | static const struct file_operations proc_profile_operations = { | 555 | static const struct file_operations proc_profile_operations = { |
556 | .read = read_profile, | 556 | .read = read_profile, |
557 | .write = write_profile, | 557 | .write = write_profile, |
558 | .llseek = default_llseek, | ||
558 | }; | 559 | }; |
559 | 560 | ||
560 | #ifdef CONFIG_SMP | 561 | #ifdef CONFIG_SMP |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a2..99bbaa3e5b0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) | |||
181 | * under ptrace. | 181 | * under ptrace. |
182 | */ | 182 | */ |
183 | retval = -ERESTARTNOINTR; | 183 | retval = -ERESTARTNOINTR; |
184 | if (mutex_lock_interruptible(&task->cred_guard_mutex)) | 184 | if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) |
185 | goto out; | 185 | goto out; |
186 | 186 | ||
187 | task_lock(task); | 187 | task_lock(task); |
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) | |||
208 | unlock_tasklist: | 208 | unlock_tasklist: |
209 | write_unlock_irq(&tasklist_lock); | 209 | write_unlock_irq(&tasklist_lock); |
210 | unlock_creds: | 210 | unlock_creds: |
211 | mutex_unlock(&task->cred_guard_mutex); | 211 | mutex_unlock(&task->signal->cred_guard_mutex); |
212 | out: | 212 | out: |
213 | return retval; | 213 | return retval; |
214 | } | 214 | } |
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
329 | * and reacquire the lock. | 329 | * and reacquire the lock. |
330 | */ | 330 | */ |
331 | void exit_ptrace(struct task_struct *tracer) | 331 | void exit_ptrace(struct task_struct *tracer) |
332 | __releases(&tasklist_lock) | ||
333 | __acquires(&tasklist_lock) | ||
332 | { | 334 | { |
333 | struct task_struct *p, *n; | 335 | struct task_struct *p, *n; |
334 | LIST_HEAD(ptrace_dead); | 336 | LIST_HEAD(ptrace_dead); |
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
402 | return copied; | 404 | return copied; |
403 | } | 405 | } |
404 | 406 | ||
405 | static int ptrace_setoptions(struct task_struct *child, long data) | 407 | static int ptrace_setoptions(struct task_struct *child, unsigned long data) |
406 | { | 408 | { |
407 | child->ptrace &= ~PT_TRACE_MASK; | 409 | child->ptrace &= ~PT_TRACE_MASK; |
408 | 410 | ||
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) | |||
481 | #define is_sysemu_singlestep(request) 0 | 483 | #define is_sysemu_singlestep(request) 0 |
482 | #endif | 484 | #endif |
483 | 485 | ||
484 | static int ptrace_resume(struct task_struct *child, long request, long data) | 486 | static int ptrace_resume(struct task_struct *child, long request, |
487 | unsigned long data) | ||
485 | { | 488 | { |
486 | if (!valid_signal(data)) | 489 | if (!valid_signal(data)) |
487 | return -EIO; | 490 | return -EIO; |
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
558 | #endif | 561 | #endif |
559 | 562 | ||
560 | int ptrace_request(struct task_struct *child, long request, | 563 | int ptrace_request(struct task_struct *child, long request, |
561 | long addr, long data) | 564 | unsigned long addr, unsigned long data) |
562 | { | 565 | { |
563 | int ret = -EIO; | 566 | int ret = -EIO; |
564 | siginfo_t siginfo; | 567 | siginfo_t siginfo; |
568 | void __user *datavp = (void __user *) data; | ||
569 | unsigned long __user *datalp = datavp; | ||
565 | 570 | ||
566 | switch (request) { | 571 | switch (request) { |
567 | case PTRACE_PEEKTEXT: | 572 | case PTRACE_PEEKTEXT: |
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request, | |||
578 | ret = ptrace_setoptions(child, data); | 583 | ret = ptrace_setoptions(child, data); |
579 | break; | 584 | break; |
580 | case PTRACE_GETEVENTMSG: | 585 | case PTRACE_GETEVENTMSG: |
581 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 586 | ret = put_user(child->ptrace_message, datalp); |
582 | break; | 587 | break; |
583 | 588 | ||
584 | case PTRACE_GETSIGINFO: | 589 | case PTRACE_GETSIGINFO: |
585 | ret = ptrace_getsiginfo(child, &siginfo); | 590 | ret = ptrace_getsiginfo(child, &siginfo); |
586 | if (!ret) | 591 | if (!ret) |
587 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | 592 | ret = copy_siginfo_to_user(datavp, &siginfo); |
588 | &siginfo); | ||
589 | break; | 593 | break; |
590 | 594 | ||
591 | case PTRACE_SETSIGINFO: | 595 | case PTRACE_SETSIGINFO: |
592 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, | 596 | if (copy_from_user(&siginfo, datavp, sizeof siginfo)) |
593 | sizeof siginfo)) | ||
594 | ret = -EFAULT; | 597 | ret = -EFAULT; |
595 | else | 598 | else |
596 | ret = ptrace_setsiginfo(child, &siginfo); | 599 | ret = ptrace_setsiginfo(child, &siginfo); |
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
621 | } | 624 | } |
622 | mmput(mm); | 625 | mmput(mm); |
623 | 626 | ||
624 | ret = put_user(tmp, (unsigned long __user *) data); | 627 | ret = put_user(tmp, datalp); |
625 | break; | 628 | break; |
626 | } | 629 | } |
627 | #endif | 630 | #endif |
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
650 | case PTRACE_SETREGSET: | 653 | case PTRACE_SETREGSET: |
651 | { | 654 | { |
652 | struct iovec kiov; | 655 | struct iovec kiov; |
653 | struct iovec __user *uiov = (struct iovec __user *) data; | 656 | struct iovec __user *uiov = datavp; |
654 | 657 | ||
655 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | 658 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) |
656 | return -EFAULT; | 659 | return -EFAULT; |
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
691 | #define arch_ptrace_attach(child) do { } while (0) | 694 | #define arch_ptrace_attach(child) do { } while (0) |
692 | #endif | 695 | #endif |
693 | 696 | ||
694 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | 697 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, |
698 | unsigned long, data) | ||
695 | { | 699 | { |
696 | struct task_struct *child; | 700 | struct task_struct *child; |
697 | long ret; | 701 | long ret; |
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | |||
732 | return ret; | 736 | return ret; |
733 | } | 737 | } |
734 | 738 | ||
735 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | 739 | int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, |
740 | unsigned long data) | ||
736 | { | 741 | { |
737 | unsigned long tmp; | 742 | unsigned long tmp; |
738 | int copied; | 743 | int copied; |
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | |||
743 | return put_user(tmp, (unsigned long __user *)data); | 748 | return put_user(tmp, (unsigned long __user *)data); |
744 | } | 749 | } |
745 | 750 | ||
746 | int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | 751 | int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, |
752 | unsigned long data) | ||
747 | { | 753 | { |
748 | int copied; | 754 | int copied; |
749 | 755 | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) | |||
73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
74 | 74 | ||
75 | /** | 75 | /** |
76 | * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? | 76 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
77 | * | 77 | * |
78 | * Check for bottom half being disabled, which covers both the | 78 | * Check for bottom half being disabled, which covers both the |
79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses | 79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses |
80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | 80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) |
81 | * will show the situation. | 81 | * will show the situation. This is useful for debug checks in functions |
82 | * that require that they be called within an RCU read-side critical | ||
83 | * section. | ||
82 | * | 84 | * |
83 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | 85 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. |
84 | */ | 86 | */ |
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) | |||
86 | { | 88 | { |
87 | if (!debug_lockdep_rcu_enabled()) | 89 | if (!debug_lockdep_rcu_enabled()) |
88 | return 1; | 90 | return 1; |
89 | return in_softirq(); | 91 | return in_softirq() || irqs_disabled(); |
90 | } | 92 | } |
91 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 93 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
92 | 94 | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; | |||
59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
61 | 61 | ||
62 | /* Forward declarations for rcutiny_plugin.h. */ | ||
63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
64 | static void __call_rcu(struct rcu_head *head, | ||
65 | void (*func)(struct rcu_head *rcu), | ||
66 | struct rcu_ctrlblk *rcp); | ||
67 | |||
68 | #include "rcutiny_plugin.h" | ||
69 | |||
62 | #ifdef CONFIG_NO_HZ | 70 | #ifdef CONFIG_NO_HZ |
63 | 71 | ||
64 | static long rcu_dynticks_nesting = 1; | 72 | static long rcu_dynticks_nesting = 1; |
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
140 | rcu_sched_qs(cpu); | 148 | rcu_sched_qs(cpu); |
141 | else if (!in_softirq()) | 149 | else if (!in_softirq()) |
142 | rcu_bh_qs(cpu); | 150 | rcu_bh_qs(cpu); |
151 | rcu_preempt_check_callbacks(); | ||
143 | } | 152 | } |
144 | 153 | ||
145 | /* | 154 | /* |
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | *rcp->donetail = NULL; | 171 | *rcp->donetail = NULL; |
163 | if (rcp->curtail == rcp->donetail) | 172 | if (rcp->curtail == rcp->donetail) |
164 | rcp->curtail = &rcp->rcucblist; | 173 | rcp->curtail = &rcp->rcucblist; |
174 | rcu_preempt_remove_callbacks(rcp); | ||
165 | rcp->donetail = &rcp->rcucblist; | 175 | rcp->donetail = &rcp->rcucblist; |
166 | local_irq_restore(flags); | 176 | local_irq_restore(flags); |
167 | 177 | ||
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
182 | { | 192 | { |
183 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
184 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
195 | rcu_preempt_process_callbacks(); | ||
185 | } | 196 | } |
186 | 197 | ||
187 | /* | 198 | /* |
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, | |||
223 | } | 234 | } |
224 | 235 | ||
225 | /* | 236 | /* |
226 | * Post an RCU callback to be invoked after the end of an RCU grace | 237 | * Post an RCU callback to be invoked after the end of an RCU-sched grace |
227 | * period. But since we have but one CPU, that would be after any | 238 | * period. But since we have but one CPU, that would be after any |
228 | * quiescent state. | 239 | * quiescent state. |
229 | */ | 240 | */ |
230 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 241 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
231 | { | 242 | { |
232 | __call_rcu(head, func, &rcu_sched_ctrlblk); | 243 | __call_rcu(head, func, &rcu_sched_ctrlblk); |
233 | } | 244 | } |
234 | EXPORT_SYMBOL_GPL(call_rcu); | 245 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
235 | 246 | ||
236 | /* | 247 | /* |
237 | * Post an RCU bottom-half callback to be invoked after any subsequent | 248 | * Post an RCU bottom-half callback to be invoked after any subsequent |
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
243 | } | 254 | } |
244 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 255 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
245 | 256 | ||
246 | void rcu_barrier(void) | ||
247 | { | ||
248 | struct rcu_synchronize rcu; | ||
249 | |||
250 | init_rcu_head_on_stack(&rcu.head); | ||
251 | init_completion(&rcu.completion); | ||
252 | /* Will wake me after RCU finished. */ | ||
253 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
254 | /* Wait for it. */ | ||
255 | wait_for_completion(&rcu.completion); | ||
256 | destroy_rcu_head_on_stack(&rcu.head); | ||
257 | } | ||
258 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
259 | |||
260 | void rcu_barrier_bh(void) | 257 | void rcu_barrier_bh(void) |
261 | { | 258 | { |
262 | struct rcu_synchronize rcu; | 259 | struct rcu_synchronize rcu; |
@@ -289,5 +286,3 @@ void __init rcu_init(void) | |||
289 | { | 286 | { |
290 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
291 | } | 288 | } |
292 | |||
293 | #include "rcutiny_plugin.h" | ||
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -17,11 +17,587 @@ | |||
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
19 | * | 19 | * |
20 | * Copyright IBM Corporation, 2009 | 20 | * Copyright (c) 2010 Linaro |
21 | * | 21 | * |
22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
26 | |||
27 | #include <linux/delay.h> | ||
28 | |||
29 | /* Global control variables for preemptible RCU. */ | ||
30 | struct rcu_preempt_ctrlblk { | ||
31 | struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ | ||
32 | struct rcu_head **nexttail; | ||
33 | /* Tasks blocked in a preemptible RCU */ | ||
34 | /* read-side critical section while an */ | ||
35 | /* preemptible-RCU grace period is in */ | ||
36 | /* progress must wait for a later grace */ | ||
37 | /* period. This pointer points to the */ | ||
38 | /* ->next pointer of the last task that */ | ||
39 | /* must wait for a later grace period, or */ | ||
40 | /* to &->rcb.rcucblist if there is no */ | ||
41 | /* such task. */ | ||
42 | struct list_head blkd_tasks; | ||
43 | /* Tasks blocked in RCU read-side critical */ | ||
44 | /* section. Tasks are placed at the head */ | ||
45 | /* of this list and age towards the tail. */ | ||
46 | struct list_head *gp_tasks; | ||
47 | /* Pointer to the first task blocking the */ | ||
48 | /* current grace period, or NULL if there */ | ||
49 | /* is no such task. */ | ||
50 | struct list_head *exp_tasks; | ||
51 | /* Pointer to first task blocking the */ | ||
52 | /* current expedited grace period, or NULL */ | ||
53 | /* if there is no such task. If there */ | ||
54 | /* is no current expedited grace period, */ | ||
55 | /* then there cannot be any such task. */ | ||
56 | u8 gpnum; /* Current grace period. */ | ||
57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | ||
58 | u8 completed; /* Last grace period completed. */ | ||
59 | /* If all three are equal, RCU is idle. */ | ||
60 | }; | ||
61 | |||
62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | ||
63 | .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
64 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
65 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
66 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | ||
67 | }; | ||
68 | |||
69 | static int rcu_preempted_readers_exp(void); | ||
70 | static void rcu_report_exp_done(void); | ||
71 | |||
72 | /* | ||
73 | * Return true if the CPU has not yet responded to the current grace period. | ||
74 | */ | ||
75 | static int rcu_cpu_blocking_cur_gp(void) | ||
76 | { | ||
77 | return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Check for a running RCU reader. Because there is only one CPU, | ||
82 | * there can be but one running RCU reader at a time. ;-) | ||
83 | */ | ||
84 | static int rcu_preempt_running_reader(void) | ||
85 | { | ||
86 | return current->rcu_read_lock_nesting; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Check for preempted RCU readers blocking any grace period. | ||
91 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
92 | */ | ||
93 | static int rcu_preempt_blocked_readers_any(void) | ||
94 | { | ||
95 | return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * Check for preempted RCU readers blocking the current grace period. | ||
100 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
101 | */ | ||
102 | static int rcu_preempt_blocked_readers_cgp(void) | ||
103 | { | ||
104 | return rcu_preempt_ctrlblk.gp_tasks != NULL; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Return true if another preemptible-RCU grace period is needed. | ||
109 | */ | ||
110 | static int rcu_preempt_needs_another_gp(void) | ||
111 | { | ||
112 | return *rcu_preempt_ctrlblk.rcb.curtail != NULL; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Return true if a preemptible-RCU grace period is in progress. | ||
117 | * The caller must disable hardirqs. | ||
118 | */ | ||
119 | static int rcu_preempt_gp_in_progress(void) | ||
120 | { | ||
121 | return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
126 | * that this just means that the task currently running on the CPU is | ||
127 | * in a quiescent state. There might be any number of tasks blocked | ||
128 | * while in an RCU read-side critical section. | ||
129 | * | ||
130 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
131 | * must disable irqs in order to protect the assignment to | ||
132 | * ->rcu_read_unlock_special. | ||
133 | * | ||
134 | * Because this is a single-CPU implementation, the only way a grace | ||
135 | * period can end is if the CPU is in a quiescent state. The reason is | ||
136 | * that a blocked preemptible-RCU reader can exit its critical section | ||
137 | * only if the CPU is running it at the time. Therefore, when the | ||
138 | * last task blocking the current grace period exits its RCU read-side | ||
139 | * critical section, neither the CPU nor blocked tasks will be stopping | ||
140 | * the current grace period. (In contrast, SMP implementations | ||
141 | * might have CPUs running in RCU read-side critical sections that | ||
142 | * block later grace periods -- but this is not possible given only | ||
143 | * one CPU.) | ||
144 | */ | ||
145 | static void rcu_preempt_cpu_qs(void) | ||
146 | { | ||
147 | /* Record both CPU and task as having responded to current GP. */ | ||
148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | ||
149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
150 | |||
151 | /* | ||
152 | * If there is no GP, or if blocked readers are still blocking GP, | ||
153 | * then there is nothing more to do. | ||
154 | */ | ||
155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | ||
156 | return; | ||
157 | |||
158 | /* Advance callbacks. */ | ||
159 | rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; | ||
160 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; | ||
161 | rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; | ||
162 | |||
163 | /* If there are no blocked readers, next GP is done instantly. */ | ||
164 | if (!rcu_preempt_blocked_readers_any()) | ||
165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | ||
166 | |||
167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | ||
168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | ||
169 | raise_softirq(RCU_SOFTIRQ); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Start a new RCU grace period if warranted. Hard irqs must be disabled. | ||
174 | */ | ||
175 | static void rcu_preempt_start_gp(void) | ||
176 | { | ||
177 | if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { | ||
178 | |||
179 | /* Official start of GP. */ | ||
180 | rcu_preempt_ctrlblk.gpnum++; | ||
181 | |||
182 | /* Any blocked RCU readers block new GP. */ | ||
183 | if (rcu_preempt_blocked_readers_any()) | ||
184 | rcu_preempt_ctrlblk.gp_tasks = | ||
185 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
186 | |||
187 | /* If there is no running reader, CPU is done with GP. */ | ||
188 | if (!rcu_preempt_running_reader()) | ||
189 | rcu_preempt_cpu_qs(); | ||
190 | } | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * We have entered the scheduler, and the current task might soon be | ||
195 | * context-switched away from. If this task is in an RCU read-side | ||
196 | * critical section, we will no longer be able to rely on the CPU to | ||
197 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
198 | * If the task started after the current grace period began, as recorded | ||
199 | * by ->gpcpu, we enqueue at the beginning of the list. Otherwise | ||
200 | * before the element referenced by ->gp_tasks (or at the tail if | ||
201 | * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. | ||
202 | * The task will dequeue itself when it exits the outermost enclosing | ||
203 | * RCU read-side critical section. Therefore, the current grace period | ||
204 | * cannot be permitted to complete until the ->gp_tasks pointer becomes | ||
205 | * NULL. | ||
206 | * | ||
207 | * Caller must disable preemption. | ||
208 | */ | ||
209 | void rcu_preempt_note_context_switch(void) | ||
210 | { | ||
211 | struct task_struct *t = current; | ||
212 | unsigned long flags; | ||
213 | |||
214 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | ||
215 | if (rcu_preempt_running_reader() && | ||
216 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
217 | |||
218 | /* Possibly blocking in an RCU read-side critical section. */ | ||
219 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
220 | |||
221 | /* | ||
222 | * If this CPU has already checked in, then this task | ||
223 | * will hold up the next grace period rather than the | ||
224 | * current grace period. Queue the task accordingly. | ||
225 | * If the task is queued for the current grace period | ||
226 | * (i.e., this CPU has not yet passed through a quiescent | ||
227 | * state for the current grace period), then as long | ||
228 | * as that task remains queued, the current grace period | ||
229 | * cannot end. | ||
230 | */ | ||
231 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | ||
232 | if (rcu_cpu_blocking_cur_gp()) | ||
233 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Either we were not in an RCU read-side critical section to | ||
238 | * begin with, or we have now recorded that critical section | ||
239 | * globally. Either way, we can now note a quiescent state | ||
240 | * for this CPU. Again, if we were in an RCU read-side critical | ||
241 | * section, and if that critical section was blocking the current | ||
242 | * grace period, then the fact that the task has been enqueued | ||
243 | * means that current grace period continues to be blocked. | ||
244 | */ | ||
245 | rcu_preempt_cpu_qs(); | ||
246 | local_irq_restore(flags); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
251 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
252 | * if we block. | ||
253 | */ | ||
254 | void __rcu_read_lock(void) | ||
255 | { | ||
256 | current->rcu_read_lock_nesting++; | ||
257 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
258 | } | ||
259 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
260 | |||
261 | /* | ||
262 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
263 | * notify RCU core processing or task having blocked during the RCU | ||
264 | * read-side critical section. | ||
265 | */ | ||
266 | static void rcu_read_unlock_special(struct task_struct *t) | ||
267 | { | ||
268 | int empty; | ||
269 | int empty_exp; | ||
270 | unsigned long flags; | ||
271 | struct list_head *np; | ||
272 | int special; | ||
273 | |||
274 | /* | ||
275 | * NMI handlers cannot block and cannot safely manipulate state. | ||
276 | * They therefore cannot possibly be special, so just leave. | ||
277 | */ | ||
278 | if (in_nmi()) | ||
279 | return; | ||
280 | |||
281 | local_irq_save(flags); | ||
282 | |||
283 | /* | ||
284 | * If RCU core is waiting for this CPU to exit critical section, | ||
285 | * let it know that we have done so. | ||
286 | */ | ||
287 | special = t->rcu_read_unlock_special; | ||
288 | if (special & RCU_READ_UNLOCK_NEED_QS) | ||
289 | rcu_preempt_cpu_qs(); | ||
290 | |||
291 | /* Hardware IRQ handlers cannot block. */ | ||
292 | if (in_irq()) { | ||
293 | local_irq_restore(flags); | ||
294 | return; | ||
295 | } | ||
296 | |||
297 | /* Clean up if blocked during RCU read-side critical section. */ | ||
298 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
299 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
300 | |||
301 | /* | ||
302 | * Remove this task from the ->blkd_tasks list and adjust | ||
303 | * any pointers that might have been referencing it. | ||
304 | */ | ||
305 | empty = !rcu_preempt_blocked_readers_cgp(); | ||
306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | ||
307 | np = t->rcu_node_entry.next; | ||
308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
309 | np = NULL; | ||
310 | list_del(&t->rcu_node_entry); | ||
311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | ||
312 | rcu_preempt_ctrlblk.gp_tasks = np; | ||
313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | ||
314 | rcu_preempt_ctrlblk.exp_tasks = np; | ||
315 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
316 | |||
317 | /* | ||
318 | * If this was the last task on the current list, and if | ||
319 | * we aren't waiting on the CPU, report the quiescent state | ||
320 | * and start a new grace period if needed. | ||
321 | */ | ||
322 | if (!empty && !rcu_preempt_blocked_readers_cgp()) { | ||
323 | rcu_preempt_cpu_qs(); | ||
324 | rcu_preempt_start_gp(); | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * If this was the last task on the expedited lists, | ||
329 | * then we need wake up the waiting task. | ||
330 | */ | ||
331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
332 | rcu_report_exp_done(); | ||
333 | } | ||
334 | local_irq_restore(flags); | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
339 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
340 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
341 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
342 | * in an RCU read-side critical section and other special cases. | ||
343 | */ | ||
344 | void __rcu_read_unlock(void) | ||
345 | { | ||
346 | struct task_struct *t = current; | ||
347 | |||
348 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
349 | --t->rcu_read_lock_nesting; | ||
350 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
351 | if (t->rcu_read_lock_nesting == 0 && | ||
352 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
353 | rcu_read_unlock_special(t); | ||
354 | #ifdef CONFIG_PROVE_LOCKING | ||
355 | WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); | ||
356 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
357 | } | ||
358 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
359 | |||
360 | /* | ||
361 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
362 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | ||
363 | * checked elsewhere. This is called from the scheduling-clock interrupt. | ||
364 | * | ||
365 | * Caller must disable hard irqs. | ||
366 | */ | ||
367 | static void rcu_preempt_check_callbacks(void) | ||
368 | { | ||
369 | struct task_struct *t = current; | ||
370 | |||
371 | if (rcu_preempt_gp_in_progress() && | ||
372 | (!rcu_preempt_running_reader() || | ||
373 | !rcu_cpu_blocking_cur_gp())) | ||
374 | rcu_preempt_cpu_qs(); | ||
375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | ||
376 | rcu_preempt_ctrlblk.rcb.donetail) | ||
377 | raise_softirq(RCU_SOFTIRQ); | ||
378 | if (rcu_preempt_gp_in_progress() && | ||
379 | rcu_cpu_blocking_cur_gp() && | ||
380 | rcu_preempt_running_reader()) | ||
381 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | ||
386 | * update, so this is invoked from __rcu_process_callbacks() to | ||
387 | * handle that case. Of course, it is invoked for all flavors of | ||
388 | * RCU, but RCU callbacks can appear only on one of the lists, and | ||
389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | ||
390 | * is no need for an explicit check. | ||
391 | */ | ||
392 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
393 | { | ||
394 | if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) | ||
395 | rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Process callbacks for preemptible RCU. | ||
400 | */ | ||
401 | static void rcu_preempt_process_callbacks(void) | ||
402 | { | ||
403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
408 | */ | ||
409 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
410 | { | ||
411 | unsigned long flags; | ||
412 | |||
413 | debug_rcu_head_queue(head); | ||
414 | head->func = func; | ||
415 | head->next = NULL; | ||
416 | |||
417 | local_irq_save(flags); | ||
418 | *rcu_preempt_ctrlblk.nexttail = head; | ||
419 | rcu_preempt_ctrlblk.nexttail = &head->next; | ||
420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | ||
421 | local_irq_restore(flags); | ||
422 | } | ||
423 | EXPORT_SYMBOL_GPL(call_rcu); | ||
424 | |||
425 | void rcu_barrier(void) | ||
426 | { | ||
427 | struct rcu_synchronize rcu; | ||
428 | |||
429 | init_rcu_head_on_stack(&rcu.head); | ||
430 | init_completion(&rcu.completion); | ||
431 | /* Will wake me after RCU finished. */ | ||
432 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
433 | /* Wait for it. */ | ||
434 | wait_for_completion(&rcu.completion); | ||
435 | destroy_rcu_head_on_stack(&rcu.head); | ||
436 | } | ||
437 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
438 | |||
439 | /* | ||
440 | * synchronize_rcu - wait until a grace period has elapsed. | ||
441 | * | ||
442 | * Control will return to the caller some time after a full grace | ||
443 | * period has elapsed, in other words after all currently executing RCU | ||
444 | * read-side critical sections have completed. RCU read-side critical | ||
445 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
446 | * and may be nested. | ||
447 | */ | ||
448 | void synchronize_rcu(void) | ||
449 | { | ||
450 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
451 | if (!rcu_scheduler_active) | ||
452 | return; | ||
453 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
454 | |||
455 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
456 | if (!rcu_preempt_blocked_readers_any()) | ||
457 | return; | ||
458 | |||
459 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | ||
460 | rcu_barrier(); | ||
461 | } | ||
462 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
463 | |||
464 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
465 | static unsigned long sync_rcu_preempt_exp_count; | ||
466 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
467 | |||
468 | /* | ||
469 | * Return non-zero if there are any tasks in RCU read-side critical | ||
470 | * sections blocking the current preemptible-RCU expedited grace period. | ||
471 | * If there is no preemptible-RCU expedited grace period currently in | ||
472 | * progress, returns zero unconditionally. | ||
473 | */ | ||
474 | static int rcu_preempted_readers_exp(void) | ||
475 | { | ||
476 | return rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * Report the exit from RCU read-side critical section for the last task | ||
481 | * that queued itself during or before the current expedited preemptible-RCU | ||
482 | * grace period. | ||
483 | */ | ||
484 | static void rcu_report_exp_done(void) | ||
485 | { | ||
486 | wake_up(&sync_rcu_preempt_exp_wq); | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | ||
491 | * is to rely on the fact that there is but one CPU, and that it is | ||
492 | * illegal for a task to invoke synchronize_rcu_expedited() while in a | ||
493 | * preemptible-RCU read-side critical section. Therefore, any such | ||
494 | * critical sections must correspond to blocked tasks, which must therefore | ||
495 | * be on the ->blkd_tasks list. So just record the current head of the | ||
496 | * list in the ->exp_tasks pointer, and wait for all tasks including and | ||
497 | * after the task pointed to by ->exp_tasks to drain. | ||
498 | */ | ||
499 | void synchronize_rcu_expedited(void) | ||
500 | { | ||
501 | unsigned long flags; | ||
502 | struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; | ||
503 | unsigned long snap; | ||
504 | |||
505 | barrier(); /* ensure prior action seen before grace period. */ | ||
506 | |||
507 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
508 | |||
509 | /* | ||
510 | * Acquire lock so that there is only one preemptible RCU grace | ||
511 | * period in flight. Of course, if someone does the expedited | ||
512 | * grace period for us while we are acquiring the lock, just leave. | ||
513 | */ | ||
514 | snap = sync_rcu_preempt_exp_count + 1; | ||
515 | mutex_lock(&sync_rcu_preempt_exp_mutex); | ||
516 | if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) | ||
517 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
518 | |||
519 | local_irq_save(flags); | ||
520 | |||
521 | /* | ||
522 | * All RCU readers have to already be on blkd_tasks because | ||
523 | * we cannot legally be executing in an RCU read-side critical | ||
524 | * section. | ||
525 | */ | ||
526 | |||
527 | /* Snapshot current head of ->blkd_tasks list. */ | ||
528 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | ||
529 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | ||
530 | rpcp->exp_tasks = NULL; | ||
531 | local_irq_restore(flags); | ||
532 | |||
533 | /* Wait for tail of ->blkd_tasks list to drain. */ | ||
534 | if (rcu_preempted_readers_exp()) | ||
535 | wait_event(sync_rcu_preempt_exp_wq, | ||
536 | !rcu_preempted_readers_exp()); | ||
537 | |||
538 | /* Clean up and exit. */ | ||
539 | barrier(); /* ensure expedited GP seen before counter increment. */ | ||
540 | sync_rcu_preempt_exp_count++; | ||
541 | unlock_mb_ret: | ||
542 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
543 | barrier(); /* ensure subsequent action seen after grace period. */ | ||
544 | } | ||
545 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
546 | |||
547 | /* | ||
548 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | ||
549 | */ | ||
550 | int rcu_preempt_needs_cpu(void) | ||
551 | { | ||
552 | if (!rcu_preempt_running_reader()) | ||
553 | rcu_preempt_cpu_qs(); | ||
554 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * Check for a task exiting while in a preemptible-RCU read-side | ||
559 | * critical section, clean up if so. No need to issue warnings, | ||
560 | * as debug_check_no_locks_held() already does this if lockdep | ||
561 | * is enabled. | ||
562 | */ | ||
563 | void exit_rcu(void) | ||
564 | { | ||
565 | struct task_struct *t = current; | ||
566 | |||
567 | if (t->rcu_read_lock_nesting == 0) | ||
568 | return; | ||
569 | t->rcu_read_lock_nesting = 1; | ||
570 | rcu_read_unlock(); | ||
571 | } | ||
572 | |||
573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
574 | |||
575 | /* | ||
576 | * Because preemptible RCU does not exist, it never has any callbacks | ||
577 | * to check. | ||
578 | */ | ||
579 | static void rcu_preempt_check_callbacks(void) | ||
580 | { | ||
581 | } | ||
582 | |||
583 | /* | ||
584 | * Because preemptible RCU does not exist, it never has any callbacks | ||
585 | * to remove. | ||
586 | */ | ||
587 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
588 | { | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * Because preemptible RCU does not exist, it never has any callbacks | ||
593 | * to process. | ||
594 | */ | ||
595 | static void rcu_preempt_process_callbacks(void) | ||
596 | { | ||
597 | } | ||
598 | |||
599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
600 | |||
25 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
26 | 602 | ||
27 | #include <linux/kernel_stat.h> | 603 | #include <linux/kernel_stat.h> |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -120,7 +120,7 @@ struct rcu_torture { | |||
120 | }; | 120 | }; |
121 | 121 | ||
122 | static LIST_HEAD(rcu_torture_freelist); | 122 | static LIST_HEAD(rcu_torture_freelist); |
123 | static struct rcu_torture *rcu_torture_current; | 123 | static struct rcu_torture __rcu *rcu_torture_current; |
124 | static long rcu_torture_current_version; | 124 | static long rcu_torture_current_version; |
125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
126 | static DEFINE_SPINLOCK(rcu_torture_lock); | 126 | static DEFINE_SPINLOCK(rcu_torture_lock); |
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | 153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ |
154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | 154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ |
155 | static int fullstop = FULLSTOP_RMMOD; | 155 | static int fullstop = FULLSTOP_RMMOD; |
156 | DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ | 156 | /* |
157 | /* of kthreads. */ | 157 | * Protect fullstop transitions and spawning of kthreads. |
158 | */ | ||
159 | static DEFINE_MUTEX(fullstop_mutex); | ||
158 | 160 | ||
159 | /* | 161 | /* |
160 | * Detect and respond to a system shutdown. | 162 | * Detect and respond to a system shutdown. |
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) | |||
303 | mdelay(longdelay_ms); | 305 | mdelay(longdelay_ms); |
304 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | 306 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) |
305 | udelay(shortdelay_us); | 307 | udelay(shortdelay_us); |
308 | #ifdef CONFIG_PREEMPT | ||
309 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | ||
310 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | ||
311 | #endif | ||
306 | } | 312 | } |
307 | 313 | ||
308 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | 314 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
536 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | 542 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); |
537 | if (!delay) | 543 | if (!delay) |
538 | schedule_timeout_interruptible(longdelay); | 544 | schedule_timeout_interruptible(longdelay); |
545 | else | ||
546 | rcu_read_delay(rrsp); | ||
539 | } | 547 | } |
540 | 548 | ||
541 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | 549 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) | |||
731 | continue; | 739 | continue; |
732 | rp->rtort_pipe_count = 0; | 740 | rp->rtort_pipe_count = 0; |
733 | udelay(rcu_random(&rand) & 0x3ff); | 741 | udelay(rcu_random(&rand) & 0x3ff); |
734 | old_rp = rcu_torture_current; | 742 | old_rp = rcu_dereference_check(rcu_torture_current, |
743 | current == writer_task); | ||
735 | rp->rtort_mbtest = 1; | 744 | rp->rtort_mbtest = 1; |
736 | rcu_assign_pointer(rcu_torture_current, rp); | 745 | rcu_assign_pointer(rcu_torture_current, rp); |
737 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | 746 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -143,6 +143,11 @@ module_param(blimit, int, 0); | |||
143 | module_param(qhimark, int, 0); | 143 | module_param(qhimark, int, 0); |
144 | module_param(qlowmark, int, 0); | 144 | module_param(qlowmark, int, 0); |
145 | 145 | ||
146 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
147 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
148 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
149 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
150 | |||
146 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 151 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
147 | static int rcu_pending(int cpu); | 152 | static int rcu_pending(int cpu); |
148 | 153 | ||
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
450 | 455 | ||
451 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 456 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
452 | 457 | ||
453 | int rcu_cpu_stall_panicking __read_mostly; | 458 | int rcu_cpu_stall_suppress __read_mostly; |
454 | 459 | ||
455 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 460 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
456 | { | 461 | { |
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
482 | rcu_print_task_stall(rnp); | 487 | rcu_print_task_stall(rnp); |
483 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 488 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
484 | 489 | ||
485 | /* OK, time to rat on our buddy... */ | 490 | /* |
486 | 491 | * OK, time to rat on our buddy... | |
492 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
493 | * RCU CPU stall warnings. | ||
494 | */ | ||
487 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", | 495 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", |
488 | rsp->name); | 496 | rsp->name); |
489 | rcu_for_each_leaf_node(rsp, rnp) { | 497 | rcu_for_each_leaf_node(rsp, rnp) { |
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
512 | unsigned long flags; | 520 | unsigned long flags; |
513 | struct rcu_node *rnp = rcu_get_root(rsp); | 521 | struct rcu_node *rnp = rcu_get_root(rsp); |
514 | 522 | ||
523 | /* | ||
524 | * OK, time to rat on ourselves... | ||
525 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
526 | * RCU CPU stall warnings. | ||
527 | */ | ||
515 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 528 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
516 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 529 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
517 | trigger_all_cpu_backtrace(); | 530 | trigger_all_cpu_backtrace(); |
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
530 | long delta; | 543 | long delta; |
531 | struct rcu_node *rnp; | 544 | struct rcu_node *rnp; |
532 | 545 | ||
533 | if (rcu_cpu_stall_panicking) | 546 | if (rcu_cpu_stall_suppress) |
534 | return; | 547 | return; |
535 | delta = jiffies - rsp->jiffies_stall; | 548 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); |
536 | rnp = rdp->mynode; | 549 | rnp = rdp->mynode; |
537 | if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { | 550 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { |
538 | 551 | ||
539 | /* We haven't checked in, so go dump stack. */ | 552 | /* We haven't checked in, so go dump stack. */ |
540 | print_cpu_stall(rsp); | 553 | print_cpu_stall(rsp); |
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
548 | 561 | ||
549 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 562 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
550 | { | 563 | { |
551 | rcu_cpu_stall_panicking = 1; | 564 | rcu_cpu_stall_suppress = 1; |
552 | return NOTIFY_DONE; | 565 | return NOTIFY_DONE; |
553 | } | 566 | } |
554 | 567 | ||
568 | /** | ||
569 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
570 | * | ||
571 | * Set the stall-warning timeout way off into the future, thus preventing | ||
572 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
573 | * RCU grace periods. | ||
574 | * | ||
575 | * The caller must disable hard irqs. | ||
576 | */ | ||
577 | void rcu_cpu_stall_reset(void) | ||
578 | { | ||
579 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
580 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
581 | rcu_preempt_stall_reset(); | ||
582 | } | ||
583 | |||
555 | static struct notifier_block rcu_panic_block = { | 584 | static struct notifier_block rcu_panic_block = { |
556 | .notifier_call = rcu_panic, | 585 | .notifier_call = rcu_panic, |
557 | }; | 586 | }; |
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
571 | { | 600 | { |
572 | } | 601 | } |
573 | 602 | ||
603 | void rcu_cpu_stall_reset(void) | ||
604 | { | ||
605 | } | ||
606 | |||
574 | static void __init check_cpu_stall_init(void) | 607 | static void __init check_cpu_stall_init(void) |
575 | { | 608 | { |
576 | } | 609 | } |
@@ -712,7 +745,7 @@ static void | |||
712 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 745 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) |
713 | __releases(rcu_get_root(rsp)->lock) | 746 | __releases(rcu_get_root(rsp)->lock) |
714 | { | 747 | { |
715 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 748 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
716 | struct rcu_node *rnp = rcu_get_root(rsp); | 749 | struct rcu_node *rnp = rcu_get_root(rsp); |
717 | 750 | ||
718 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 751 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
960 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) |
961 | { | 994 | { |
962 | int i; | 995 | int i; |
963 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
964 | 997 | ||
965 | if (rdp->nxtlist == NULL) | 998 | if (rdp->nxtlist == NULL) |
966 | return; /* irqs disabled, so comparison is stable. */ | 999 | return; /* irqs disabled, so comparison is stable. */ |
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
971 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
972 | rdp->nxttail[i] = &rdp->nxtlist; | 1005 | rdp->nxttail[i] = &rdp->nxtlist; |
973 | rsp->orphan_qlen += rdp->qlen; | 1006 | rsp->orphan_qlen += rdp->qlen; |
1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
974 | rdp->qlen = 0; | 1008 | rdp->qlen = 0; |
975 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
976 | } | 1010 | } |
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
984 | struct rcu_data *rdp; | 1018 | struct rcu_data *rdp; |
985 | 1019 | ||
986 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
987 | rdp = rsp->rda[smp_processor_id()]; | 1021 | rdp = this_cpu_ptr(rsp->rda); |
988 | if (rsp->orphan_cbs_list == NULL) { | 1022 | if (rsp->orphan_cbs_list == NULL) { |
989 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
990 | return; | 1024 | return; |
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
992 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
993 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | 1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; |
994 | rdp->qlen += rsp->orphan_qlen; | 1028 | rdp->qlen += rsp->orphan_qlen; |
1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
995 | rsp->orphan_cbs_list = NULL; | 1030 | rsp->orphan_cbs_list = NULL; |
996 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
997 | rsp->orphan_qlen = 0; | 1032 | rsp->orphan_qlen = 0; |
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1007 | unsigned long flags; | 1042 | unsigned long flags; |
1008 | unsigned long mask; | 1043 | unsigned long mask; |
1009 | int need_report = 0; | 1044 | int need_report = 0; |
1010 | struct rcu_data *rdp = rsp->rda[cpu]; | 1045 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1011 | struct rcu_node *rnp; | 1046 | struct rcu_node *rnp; |
1012 | 1047 | ||
1013 | /* Exclude any attempts to start a new grace period. */ | 1048 | /* Exclude any attempts to start a new grace period. */ |
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1123 | 1158 | ||
1124 | /* Update count, and requeue any remaining callbacks. */ | 1159 | /* Update count, and requeue any remaining callbacks. */ |
1125 | rdp->qlen -= count; | 1160 | rdp->qlen -= count; |
1161 | rdp->n_cbs_invoked += count; | ||
1126 | if (list != NULL) { | 1162 | if (list != NULL) { |
1127 | *tail = rdp->nxtlist; | 1163 | *tail = rdp->nxtlist; |
1128 | rdp->nxtlist = list; | 1164 | rdp->nxtlist = list; |
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1226 | cpu = rnp->grplo; | 1262 | cpu = rnp->grplo; |
1227 | bit = 1; | 1263 | bit = 1; |
1228 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 1264 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
1229 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1265 | if ((rnp->qsmask & bit) != 0 && |
1266 | f(per_cpu_ptr(rsp->rda, cpu))) | ||
1230 | mask |= bit; | 1267 | mask |= bit; |
1231 | } | 1268 | } |
1232 | if (mask != 0) { | 1269 | if (mask != 0) { |
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1402 | * a quiescent state betweentimes. | 1439 | * a quiescent state betweentimes. |
1403 | */ | 1440 | */ |
1404 | local_irq_save(flags); | 1441 | local_irq_save(flags); |
1405 | rdp = rsp->rda[smp_processor_id()]; | 1442 | rdp = this_cpu_ptr(rsp->rda); |
1406 | rcu_process_gp_end(rsp, rdp); | 1443 | rcu_process_gp_end(rsp, rdp); |
1407 | check_for_new_grace_period(rsp, rdp); | 1444 | check_for_new_grace_period(rsp, rdp); |
1408 | 1445 | ||
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1701 | { | 1738 | { |
1702 | unsigned long flags; | 1739 | unsigned long flags; |
1703 | int i; | 1740 | int i; |
1704 | struct rcu_data *rdp = rsp->rda[cpu]; | 1741 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1705 | struct rcu_node *rnp = rcu_get_root(rsp); | 1742 | struct rcu_node *rnp = rcu_get_root(rsp); |
1706 | 1743 | ||
1707 | /* Set up local state, ensuring consistent view of global state. */ | 1744 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1729 | { | 1766 | { |
1730 | unsigned long flags; | 1767 | unsigned long flags; |
1731 | unsigned long mask; | 1768 | unsigned long mask; |
1732 | struct rcu_data *rdp = rsp->rda[cpu]; | 1769 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1733 | struct rcu_node *rnp = rcu_get_root(rsp); | 1770 | struct rcu_node *rnp = rcu_get_root(rsp); |
1734 | 1771 | ||
1735 | /* Set up local state, ensuring consistent view of global state. */ | 1772 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1865 | /* | 1902 | /* |
1866 | * Helper function for rcu_init() that initializes one rcu_state structure. | 1903 | * Helper function for rcu_init() that initializes one rcu_state structure. |
1867 | */ | 1904 | */ |
1868 | static void __init rcu_init_one(struct rcu_state *rsp) | 1905 | static void __init rcu_init_one(struct rcu_state *rsp, |
1906 | struct rcu_data __percpu *rda) | ||
1869 | { | 1907 | { |
1870 | static char *buf[] = { "rcu_node_level_0", | 1908 | static char *buf[] = { "rcu_node_level_0", |
1871 | "rcu_node_level_1", | 1909 | "rcu_node_level_1", |
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1918 | } | 1956 | } |
1919 | } | 1957 | } |
1920 | 1958 | ||
1959 | rsp->rda = rda; | ||
1921 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 1960 | rnp = rsp->level[NUM_RCU_LVLS - 1]; |
1922 | for_each_possible_cpu(i) { | 1961 | for_each_possible_cpu(i) { |
1923 | while (i > rnp->grphi) | 1962 | while (i > rnp->grphi) |
1924 | rnp++; | 1963 | rnp++; |
1925 | rsp->rda[i]->mynode = rnp; | 1964 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
1926 | rcu_boot_init_percpu_data(i, rsp); | 1965 | rcu_boot_init_percpu_data(i, rsp); |
1927 | } | 1966 | } |
1928 | } | 1967 | } |
1929 | 1968 | ||
1930 | /* | ||
1931 | * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used | ||
1932 | * nowhere else! Assigns leaf node pointers into each CPU's rcu_data | ||
1933 | * structure. | ||
1934 | */ | ||
1935 | #define RCU_INIT_FLAVOR(rsp, rcu_data) \ | ||
1936 | do { \ | ||
1937 | int i; \ | ||
1938 | \ | ||
1939 | for_each_possible_cpu(i) { \ | ||
1940 | (rsp)->rda[i] = &per_cpu(rcu_data, i); \ | ||
1941 | } \ | ||
1942 | rcu_init_one(rsp); \ | ||
1943 | } while (0) | ||
1944 | |||
1945 | void __init rcu_init(void) | 1969 | void __init rcu_init(void) |
1946 | { | 1970 | { |
1947 | int cpu; | 1971 | int cpu; |
1948 | 1972 | ||
1949 | rcu_bootup_announce(); | 1973 | rcu_bootup_announce(); |
1950 | RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); | 1974 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
1951 | RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); | 1975 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
1952 | __rcu_init_preempt(); | 1976 | __rcu_init_preempt(); |
1953 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 1977 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
1954 | 1978 | ||
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -202,6 +202,9 @@ struct rcu_data { | |||
202 | long qlen; /* # of queued callbacks */ | 202 | long qlen; /* # of queued callbacks */ |
203 | long qlen_last_fqs_check; | 203 | long qlen_last_fqs_check; |
204 | /* qlen at last check for QS forcing */ | 204 | /* qlen at last check for QS forcing */ |
205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | ||
207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | ||
205 | unsigned long n_force_qs_snap; | 208 | unsigned long n_force_qs_snap; |
206 | /* did other CPU force QS recently? */ | 209 | /* did other CPU force QS recently? */ |
207 | long blimit; /* Upper limit on a processed batch */ | 210 | long blimit; /* Upper limit on a processed batch */ |
@@ -254,19 +257,23 @@ struct rcu_data { | |||
254 | #define RCU_STALL_DELAY_DELTA 0 | 257 | #define RCU_STALL_DELAY_DELTA 0 |
255 | #endif | 258 | #endif |
256 | 259 | ||
257 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) | 260 | #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ |
261 | RCU_STALL_DELAY_DELTA) | ||
258 | /* for rsp->jiffies_stall */ | 262 | /* for rsp->jiffies_stall */ |
259 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) | 263 | #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) |
260 | /* for rsp->jiffies_stall */ | 264 | /* for rsp->jiffies_stall */ |
261 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 265 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
262 | /* to take at least one */ | 266 | /* to take at least one */ |
263 | /* scheduling clock irq */ | 267 | /* scheduling clock irq */ |
264 | /* before ratting on them. */ | 268 | /* before ratting on them. */ |
265 | 269 | ||
266 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 270 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE |
271 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | ||
272 | #else | ||
273 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | ||
274 | #endif | ||
267 | 275 | ||
268 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | 276 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
269 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
270 | 277 | ||
271 | /* | 278 | /* |
272 | * RCU global state, including node hierarchy. This hierarchy is | 279 | * RCU global state, including node hierarchy. This hierarchy is |
@@ -283,7 +290,7 @@ struct rcu_state { | |||
283 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 290 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ |
284 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 291 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
285 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 292 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ |
286 | struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ | 293 | struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */ |
287 | 294 | ||
288 | /* The following fields are guarded by the root rcu_node's lock. */ | 295 | /* The following fields are guarded by the root rcu_node's lock. */ |
289 | 296 | ||
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
365 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 372 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
366 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 373 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
367 | static void rcu_print_task_stall(struct rcu_node *rnp); | 374 | static void rcu_print_task_stall(struct rcu_node *rnp); |
375 | static void rcu_preempt_stall_reset(void); | ||
368 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 376 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
369 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 377 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
370 | #ifdef CONFIG_HOTPLUG_CPU | 378 | #ifdef CONFIG_HOTPLUG_CPU |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
57 | printk(KERN_INFO | 57 | printk(KERN_INFO |
58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | 58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); |
59 | #endif | 59 | #endif |
60 | #ifndef CONFIG_RCU_CPU_STALL_VERBOSE | 60 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
62 | #endif | 62 | #endif |
63 | #if NUM_RCU_LVL_4 != 0 | 63 | #if NUM_RCU_LVL_4 != 0 |
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
155 | 155 | ||
156 | /* Possibly blocking in an RCU read-side critical section. */ | 156 | /* Possibly blocking in an RCU read-side critical section. */ |
157 | rdp = rcu_preempt_state.rda[cpu]; | 157 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
158 | rnp = rdp->mynode; | 158 | rnp = rdp->mynode; |
159 | raw_spin_lock_irqsave(&rnp->lock, flags); | 159 | raw_spin_lock_irqsave(&rnp->lock, flags); |
160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
201 | */ | 201 | */ |
202 | void __rcu_read_lock(void) | 202 | void __rcu_read_lock(void) |
203 | { | 203 | { |
204 | ACCESS_ONCE(current->rcu_read_lock_nesting)++; | 204 | current->rcu_read_lock_nesting++; |
205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | 205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ |
206 | } | 206 | } |
207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | 207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); |
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void) | |||
344 | struct task_struct *t = current; | 344 | struct task_struct *t = current; |
345 | 345 | ||
346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | 346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ |
347 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 347 | --t->rcu_read_lock_nesting; |
348 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
349 | if (t->rcu_read_lock_nesting == 0 && | ||
348 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 350 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
349 | rcu_read_unlock_special(t); | 351 | rcu_read_unlock_special(t); |
350 | #ifdef CONFIG_PROVE_LOCKING | 352 | #ifdef CONFIG_PROVE_LOCKING |
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
417 | } | 419 | } |
418 | } | 420 | } |
419 | 421 | ||
422 | /* | ||
423 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
424 | * time of the next stall-warning message comfortably far into the | ||
425 | * future. | ||
426 | */ | ||
427 | static void rcu_preempt_stall_reset(void) | ||
428 | { | ||
429 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
430 | } | ||
431 | |||
420 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 432 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
421 | 433 | ||
422 | /* | 434 | /* |
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
546 | * | 558 | * |
547 | * Control will return to the caller some time after a full grace | 559 | * Control will return to the caller some time after a full grace |
548 | * period has elapsed, in other words after all currently executing RCU | 560 | * period has elapsed, in other words after all currently executing RCU |
549 | * read-side critical sections have completed. RCU read-side critical | 561 | * read-side critical sections have completed. Note, however, that |
550 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 562 | * upon return from synchronize_rcu(), the caller might well be executing |
551 | * and may be nested. | 563 | * concurrently with new RCU read-side critical sections that began while |
564 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
565 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
552 | */ | 566 | */ |
553 | void synchronize_rcu(void) | 567 | void synchronize_rcu(void) |
554 | { | 568 | { |
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) | |||
771 | */ | 785 | */ |
772 | static void __init __rcu_init_preempt(void) | 786 | static void __init __rcu_init_preempt(void) |
773 | { | 787 | { |
774 | RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); | 788 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
775 | } | 789 | } |
776 | 790 | ||
777 | /* | 791 | /* |
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
865 | { | 879 | { |
866 | } | 880 | } |
867 | 881 | ||
882 | /* | ||
883 | * Because preemptible RCU does not exist, there is no need to suppress | ||
884 | * its CPU stall warnings. | ||
885 | */ | ||
886 | static void rcu_preempt_stall_reset(void) | ||
887 | { | ||
888 | } | ||
889 | |||
868 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 890 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
869 | 891 | ||
870 | /* | 892 | /* |
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) | |||
919 | } | 941 | } |
920 | 942 | ||
921 | /* | 943 | /* |
922 | * In classic RCU, call_rcu() is just call_rcu_sched(). | ||
923 | */ | ||
924 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
925 | { | ||
926 | call_rcu_sched(head, func); | ||
927 | } | ||
928 | EXPORT_SYMBOL_GPL(call_rcu); | ||
929 | |||
930 | /* | ||
931 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 944 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
932 | * But because preemptable RCU does not exist, map to rcu-sched. | 945 | * But because preemptable RCU does not exist, map to rcu-sched. |
933 | */ | 946 | */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
64 | rdp->dynticks_fqs); | 64 | rdp->dynticks_fqs); |
65 | #endif /* #ifdef CONFIG_NO_HZ */ | 65 | #endif /* #ifdef CONFIG_NO_HZ */ |
66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
67 | seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); | 67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); |
68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | ||
69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
68 | } | 70 | } |
69 | 71 | ||
70 | #define PRINT_RCU_DATA(name, func, m) \ | 72 | #define PRINT_RCU_DATA(name, func, m) \ |
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
119 | rdp->dynticks_fqs); | 121 | rdp->dynticks_fqs); |
120 | #endif /* #ifdef CONFIG_NO_HZ */ | 122 | #endif /* #ifdef CONFIG_NO_HZ */ |
121 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
122 | seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); | 124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); |
125 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
123 | } | 127 | } |
124 | 128 | ||
125 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 129 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
128 | #ifdef CONFIG_NO_HZ | 132 | #ifdef CONFIG_NO_HZ |
129 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 133 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); |
130 | #endif /* #ifdef CONFIG_NO_HZ */ | 134 | #endif /* #ifdef CONFIG_NO_HZ */ |
131 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); | 135 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); |
132 | #ifdef CONFIG_TREE_PREEMPT_RCU | 136 | #ifdef CONFIG_TREE_PREEMPT_RCU |
133 | seq_puts(m, "\"rcu_preempt:\"\n"); | 137 | seq_puts(m, "\"rcu_preempt:\"\n"); |
134 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 138 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | |||
262 | struct rcu_data *rdp; | 266 | struct rcu_data *rdp; |
263 | 267 | ||
264 | for_each_possible_cpu(cpu) { | 268 | for_each_possible_cpu(cpu) { |
265 | rdp = rsp->rda[cpu]; | 269 | rdp = per_cpu_ptr(rsp->rda, cpu); |
266 | if (rdp->beenonline) | 270 | if (rdp->beenonline) |
267 | print_one_rcu_pending(m, rdp); | 271 | print_one_rcu_pending(m, rdp); |
268 | } | 272 | } |
diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..9fad33efd0db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource); | |||
40 | 40 | ||
41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
42 | 42 | ||
43 | /* | ||
44 | * By default, we allocate free space bottom-up. The architecture can request | ||
45 | * top-down by clearing this flag. The user can override the architecture's | ||
46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
47 | * should only be a debugging tool. | ||
48 | */ | ||
49 | int resource_alloc_from_bottom = 1; | ||
50 | |||
51 | static __init int setup_alloc_from_bottom(char *s) | ||
52 | { | ||
53 | printk(KERN_INFO | ||
54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
55 | resource_alloc_from_bottom = 1; | ||
56 | return 0; | ||
57 | } | ||
58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
59 | |||
43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
44 | { | 61 | { |
45 | struct resource *p = v; | 62 | struct resource *p = v; |
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn) | |||
357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
358 | } | 375 | } |
359 | 376 | ||
377 | static resource_size_t simple_align_resource(void *data, | ||
378 | const struct resource *avail, | ||
379 | resource_size_t size, | ||
380 | resource_size_t align) | ||
381 | { | ||
382 | return avail->start; | ||
383 | } | ||
384 | |||
385 | static void resource_clip(struct resource *res, resource_size_t min, | ||
386 | resource_size_t max) | ||
387 | { | ||
388 | if (res->start < min) | ||
389 | res->start = min; | ||
390 | if (res->end > max) | ||
391 | res->end = max; | ||
392 | } | ||
393 | |||
394 | static bool resource_contains(struct resource *res1, struct resource *res2) | ||
395 | { | ||
396 | return res1->start <= res2->start && res1->end >= res2->end; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Find the resource before "child" in the sibling list of "root" children. | ||
401 | */ | ||
402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
403 | { | ||
404 | struct resource *this; | ||
405 | |||
406 | for (this = root->child; this; this = this->sibling) | ||
407 | if (this->sibling == child) | ||
408 | return this; | ||
409 | |||
410 | return NULL; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Find empty slot in the resource tree given range and alignment. | ||
415 | * This version allocates from the end of the root resource first. | ||
416 | */ | ||
417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
418 | resource_size_t size, resource_size_t min, | ||
419 | resource_size_t max, resource_size_t align, | ||
420 | resource_size_t (*alignf)(void *, | ||
421 | const struct resource *, | ||
422 | resource_size_t, | ||
423 | resource_size_t), | ||
424 | void *alignf_data) | ||
425 | { | ||
426 | struct resource *this; | ||
427 | struct resource tmp, avail, alloc; | ||
428 | |||
429 | tmp.start = root->end; | ||
430 | tmp.end = root->end; | ||
431 | |||
432 | this = find_sibling_prev(root, NULL); | ||
433 | for (;;) { | ||
434 | if (this) { | ||
435 | if (this->end < root->end) | ||
436 | tmp.start = this->end + 1; | ||
437 | } else | ||
438 | tmp.start = root->start; | ||
439 | |||
440 | resource_clip(&tmp, min, max); | ||
441 | |||
442 | /* Check for overflow after ALIGN() */ | ||
443 | avail = *new; | ||
444 | avail.start = ALIGN(tmp.start, align); | ||
445 | avail.end = tmp.end; | ||
446 | if (avail.start >= tmp.start) { | ||
447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
448 | alloc.end = alloc.start + size - 1; | ||
449 | if (resource_contains(&avail, &alloc)) { | ||
450 | new->start = alloc.start; | ||
451 | new->end = alloc.end; | ||
452 | return 0; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | if (!this || this->start == root->start) | ||
457 | break; | ||
458 | |||
459 | tmp.end = this->start - 1; | ||
460 | this = find_sibling_prev(root, this); | ||
461 | } | ||
462 | return -EBUSY; | ||
463 | } | ||
464 | |||
360 | /* | 465 | /* |
361 | * Find empty slot in the resource tree given range and alignment. | 466 | * Find empty slot in the resource tree given range and alignment. |
467 | * This version allocates from the beginning of the root resource first. | ||
362 | */ | 468 | */ |
363 | static int find_resource(struct resource *root, struct resource *new, | 469 | static int find_resource(struct resource *root, struct resource *new, |
364 | resource_size_t size, resource_size_t min, | 470 | resource_size_t size, resource_size_t min, |
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new, | |||
370 | void *alignf_data) | 476 | void *alignf_data) |
371 | { | 477 | { |
372 | struct resource *this = root->child; | 478 | struct resource *this = root->child; |
373 | struct resource tmp = *new; | 479 | struct resource tmp = *new, avail, alloc; |
374 | 480 | ||
375 | tmp.start = root->start; | 481 | tmp.start = root->start; |
376 | /* | 482 | /* |
377 | * Skip past an allocated resource that starts at 0, since the assignment | 483 | * Skip past an allocated resource that starts at 0, since the |
378 | * of this->start - 1 to tmp->end below would cause an underflow. | 484 | * assignment of this->start - 1 to tmp->end below would cause an |
485 | * underflow. | ||
379 | */ | 486 | */ |
380 | if (this && this->start == 0) { | 487 | if (this && this->start == 0) { |
381 | tmp.start = this->end + 1; | 488 | tmp.start = this->end + 1; |
382 | this = this->sibling; | 489 | this = this->sibling; |
383 | } | 490 | } |
384 | for(;;) { | 491 | for (;;) { |
385 | if (this) | 492 | if (this) |
386 | tmp.end = this->start - 1; | 493 | tmp.end = this->start - 1; |
387 | else | 494 | else |
388 | tmp.end = root->end; | 495 | tmp.end = root->end; |
389 | if (tmp.start < min) | 496 | |
390 | tmp.start = min; | 497 | resource_clip(&tmp, min, max); |
391 | if (tmp.end > max) | 498 | |
392 | tmp.end = max; | 499 | /* Check for overflow after ALIGN() */ |
393 | tmp.start = ALIGN(tmp.start, align); | 500 | avail = *new; |
394 | if (alignf) | 501 | avail.start = ALIGN(tmp.start, align); |
395 | tmp.start = alignf(alignf_data, &tmp, size, align); | 502 | avail.end = tmp.end; |
396 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 503 | if (avail.start >= tmp.start) { |
397 | new->start = tmp.start; | 504 | alloc.start = alignf(alignf_data, &avail, size, align); |
398 | new->end = tmp.start + size - 1; | 505 | alloc.end = alloc.start + size - 1; |
399 | return 0; | 506 | if (resource_contains(&avail, &alloc)) { |
507 | new->start = alloc.start; | ||
508 | new->end = alloc.end; | ||
509 | return 0; | ||
510 | } | ||
400 | } | 511 | } |
512 | |||
401 | if (!this) | 513 | if (!this) |
402 | break; | 514 | break; |
515 | |||
403 | tmp.start = this->end + 1; | 516 | tmp.start = this->end + 1; |
404 | this = this->sibling; | 517 | this = this->sibling; |
405 | } | 518 | } |
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
428 | { | 541 | { |
429 | int err; | 542 | int err; |
430 | 543 | ||
544 | if (!alignf) | ||
545 | alignf = simple_align_resource; | ||
546 | |||
431 | write_lock(&resource_lock); | 547 | write_lock(&resource_lock); |
432 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | 548 | if (resource_alloc_from_bottom) |
549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
550 | else | ||
551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
433 | if (err >= 0 && __request_resource(root, new)) | 552 | if (err >= 0 && __request_resource(root, new)) |
434 | err = -EBUSY; | 553 | err = -EBUSY; |
435 | write_unlock(&resource_lock); | 554 | write_unlock(&resource_lock); |
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou | |||
453 | 572 | ||
454 | if (first == parent) | 573 | if (first == parent) |
455 | return first; | 574 | return first; |
575 | if (WARN_ON(first == new)) /* duplicated insertion */ | ||
576 | return first; | ||
456 | 577 | ||
457 | if ((first->start > new->start) || (first->end < new->end)) | 578 | if ((first->start > new->start) || (first->end < new->end)) |
458 | break; | 579 | break; |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index a56f629b057a..66cb89bc5ef1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
76 | } | 76 | } |
77 | 77 | ||
78 | if (!lockwakeup && td->bkl == 4) { | 78 | if (!lockwakeup && td->bkl == 4) { |
79 | #ifdef CONFIG_LOCK_KERNEL | ||
79 | unlock_kernel(); | 80 | unlock_kernel(); |
81 | #endif | ||
80 | td->bkl = 0; | 82 | td->bkl = 0; |
81 | } | 83 | } |
82 | return 0; | 84 | return 0; |
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
133 | if (td->bkl) | 135 | if (td->bkl) |
134 | return 0; | 136 | return 0; |
135 | td->bkl = 1; | 137 | td->bkl = 1; |
138 | #ifdef CONFIG_LOCK_KERNEL | ||
136 | lock_kernel(); | 139 | lock_kernel(); |
140 | #endif | ||
137 | td->bkl = 4; | 141 | td->bkl = 4; |
138 | return 0; | 142 | return 0; |
139 | 143 | ||
140 | case RTTEST_UNLOCKBKL: | 144 | case RTTEST_UNLOCKBKL: |
141 | if (td->bkl != 4) | 145 | if (td->bkl != 4) |
142 | break; | 146 | break; |
147 | #ifdef CONFIG_LOCK_KERNEL | ||
143 | unlock_kernel(); | 148 | unlock_kernel(); |
149 | #endif | ||
144 | td->bkl = 0; | 150 | td->bkl = 0; |
145 | return 0; | 151 | return 0; |
146 | 152 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 3fe253e6a6e9..aa14a56f9d03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -3714,7 +3714,7 @@ void scheduler_tick(void) | |||
3714 | curr->sched_class->task_tick(rq, curr, 0); | 3714 | curr->sched_class->task_tick(rq, curr, 0); |
3715 | raw_spin_unlock(&rq->lock); | 3715 | raw_spin_unlock(&rq->lock); |
3716 | 3716 | ||
3717 | perf_event_task_tick(curr); | 3717 | perf_event_task_tick(); |
3718 | 3718 | ||
3719 | #ifdef CONFIG_SMP | 3719 | #ifdef CONFIG_SMP |
3720 | rq->idle_at_tick = idle_cpu(cpu); | 3720 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -4772,7 +4772,7 @@ recheck: | |||
4772 | } | 4772 | } |
4773 | 4773 | ||
4774 | if (user) { | 4774 | if (user) { |
4775 | retval = security_task_setscheduler(p, policy, param); | 4775 | retval = security_task_setscheduler(p); |
4776 | if (retval) | 4776 | if (retval) |
4777 | return retval; | 4777 | return retval; |
4778 | } | 4778 | } |
@@ -5023,7 +5023,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
5023 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5023 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
5024 | goto out_unlock; | 5024 | goto out_unlock; |
5025 | 5025 | ||
5026 | retval = security_task_setscheduler(p, 0, NULL); | 5026 | retval = security_task_setscheduler(p); |
5027 | if (retval) | 5027 | if (retval) |
5028 | goto out_unlock; | 5028 | goto out_unlock; |
5029 | 5029 | ||
@@ -5473,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5473 | idle->se.exec_start = sched_clock(); | 5473 | idle->se.exec_start = sched_clock(); |
5474 | 5474 | ||
5475 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5475 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
5476 | /* | ||
5477 | * We're having a chicken and egg problem, even though we are | ||
5478 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
5479 | * lockdep check in task_group() will fail. | ||
5480 | * | ||
5481 | * Similar case to sched_fork(). / Alternatively we could | ||
5482 | * use task_rq_lock() here and obtain the other rq->lock. | ||
5483 | * | ||
5484 | * Silence PROVE_RCU | ||
5485 | */ | ||
5486 | rcu_read_lock(); | ||
5476 | __set_task_cpu(idle, cpu); | 5487 | __set_task_cpu(idle, cpu); |
5488 | rcu_read_unlock(); | ||
5477 | 5489 | ||
5478 | rq->curr = rq->idle = idle; | 5490 | rq->curr = rq->idle = idle; |
5479 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5491 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3acc2a487c18..f4f6a8326dd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -3793,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p) | |||
3793 | 3793 | ||
3794 | update_rq_clock(rq); | 3794 | update_rq_clock(rq); |
3795 | 3795 | ||
3796 | if (unlikely(task_cpu(p) != this_cpu)) | 3796 | if (unlikely(task_cpu(p) != this_cpu)) { |
3797 | rcu_read_lock(); | ||
3797 | __set_task_cpu(p, this_cpu); | 3798 | __set_task_cpu(p, this_cpu); |
3799 | rcu_read_unlock(); | ||
3800 | } | ||
3798 | 3801 | ||
3799 | update_curr(cfs_rq); | 3802 | update_curr(cfs_rq); |
3800 | 3803 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b7..4e3cff10fdce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p) | |||
1105 | return count; | 1105 | return count; |
1106 | } | 1106 | } |
1107 | 1107 | ||
1108 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) | 1108 | struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
1109 | unsigned long *flags) | ||
1109 | { | 1110 | { |
1110 | struct sighand_struct *sighand; | 1111 | struct sighand_struct *sighand; |
1111 | 1112 | ||
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1617 | * is gone, we keep current->exit_code unless clear_code. | 1618 | * is gone, we keep current->exit_code unless clear_code. |
1618 | */ | 1619 | */ |
1619 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | 1620 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) |
1621 | __releases(&current->sighand->siglock) | ||
1622 | __acquires(&current->sighand->siglock) | ||
1620 | { | 1623 | { |
1621 | if (arch_ptrace_stop_needed(exit_code, info)) { | 1624 | if (arch_ptrace_stop_needed(exit_code, info)) { |
1622 | /* | 1625 | /* |
diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7ef..12ed8b013e2d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); | |||
267 | * | 267 | * |
268 | * Returns 0 on success, else a negative status code. | 268 | * Returns 0 on success, else a negative status code. |
269 | */ | 269 | */ |
270 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 270 | int smp_call_function_single(int cpu, smp_call_func_t func, void *info, |
271 | int wait) | 271 | int wait) |
272 | { | 272 | { |
273 | struct call_single_data d = { | 273 | struct call_single_data d = { |
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
336 | * 3) any other online cpu in @mask | 336 | * 3) any other online cpu in @mask |
337 | */ | 337 | */ |
338 | int smp_call_function_any(const struct cpumask *mask, | 338 | int smp_call_function_any(const struct cpumask *mask, |
339 | void (*func)(void *info), void *info, int wait) | 339 | smp_call_func_t func, void *info, int wait) |
340 | { | 340 | { |
341 | unsigned int cpu; | 341 | unsigned int cpu; |
342 | const struct cpumask *nodemask; | 342 | const struct cpumask *nodemask; |
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
416 | * must be disabled when calling this function. | 416 | * must be disabled when calling this function. |
417 | */ | 417 | */ |
418 | void smp_call_function_many(const struct cpumask *mask, | 418 | void smp_call_function_many(const struct cpumask *mask, |
419 | void (*func)(void *), void *info, bool wait) | 419 | smp_call_func_t func, void *info, bool wait) |
420 | { | 420 | { |
421 | struct call_function_data *data; | 421 | struct call_function_data *data; |
422 | unsigned long flags; | 422 | unsigned long flags; |
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many); | |||
500 | * You must not call this function with disabled interrupts or from a | 500 | * You must not call this function with disabled interrupts or from a |
501 | * hardware interrupt handler or from a bottom half handler. | 501 | * hardware interrupt handler or from a bottom half handler. |
502 | */ | 502 | */ |
503 | int smp_call_function(void (*func)(void *), void *info, int wait) | 503 | int smp_call_function(smp_call_func_t func, void *info, int wait) |
504 | { | 504 | { |
505 | preempt_disable(); | 505 | preempt_disable(); |
506 | smp_call_function_many(cpu_online_mask, func, info, wait); | 506 | smp_call_function_many(cpu_online_mask, func, info, wait); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 79ee8f1fc0e7..18f4be0d5fe0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
67 | * to the pending events, so lets the scheduler to balance | 67 | * to the pending events, so lets the scheduler to balance |
68 | * the softirq load for us. | 68 | * the softirq load for us. |
69 | */ | 69 | */ |
70 | void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); |
@@ -229,18 +229,20 @@ restart: | |||
229 | 229 | ||
230 | do { | 230 | do { |
231 | if (pending & 1) { | 231 | if (pending & 1) { |
232 | unsigned int vec_nr = h - softirq_vec; | ||
232 | int prev_count = preempt_count(); | 233 | int prev_count = preempt_count(); |
233 | kstat_incr_softirqs_this_cpu(h - softirq_vec); | ||
234 | 234 | ||
235 | trace_softirq_entry(h, softirq_vec); | 235 | kstat_incr_softirqs_this_cpu(vec_nr); |
236 | |||
237 | trace_softirq_entry(vec_nr); | ||
236 | h->action(h); | 238 | h->action(h); |
237 | trace_softirq_exit(h, softirq_vec); | 239 | trace_softirq_exit(vec_nr); |
238 | if (unlikely(prev_count != preempt_count())) { | 240 | if (unlikely(prev_count != preempt_count())) { |
239 | printk(KERN_ERR "huh, entered softirq %td %s %p" | 241 | printk(KERN_ERR "huh, entered softirq %u %s %p" |
240 | "with preempt_count %08x," | 242 | "with preempt_count %08x," |
241 | " exited with %08x?\n", h - softirq_vec, | 243 | " exited with %08x?\n", vec_nr, |
242 | softirq_to_name[h - softirq_vec], | 244 | softirq_to_name[vec_nr], h->action, |
243 | h->action, prev_count, preempt_count()); | 245 | prev_count, preempt_count()); |
244 | preempt_count() = prev_count; | 246 | preempt_count() = prev_count; |
245 | } | 247 | } |
246 | 248 | ||
@@ -910,17 +912,14 @@ int __init __weak early_irq_init(void) | |||
910 | return 0; | 912 | return 0; |
911 | } | 913 | } |
912 | 914 | ||
915 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
913 | int __init __weak arch_probe_nr_irqs(void) | 916 | int __init __weak arch_probe_nr_irqs(void) |
914 | { | 917 | { |
915 | return 0; | 918 | return NR_IRQS_LEGACY; |
916 | } | 919 | } |
917 | 920 | ||
918 | int __init __weak arch_early_irq_init(void) | 921 | int __init __weak arch_early_irq_init(void) |
919 | { | 922 | { |
920 | return 0; | 923 | return 0; |
921 | } | 924 | } |
922 | 925 | #endif | |
923 | int __weak arch_init_chip_data(struct irq_desc *desc, int node) | ||
924 | { | ||
925 | return 0; | ||
926 | } | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) | |||
46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | 46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, |
47 | struct lock_class_key *key) | 47 | struct lock_class_key *key) |
48 | { | 48 | { |
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
50 | /* Don't re-initialize a lock while it is held. */ | 49 | /* Don't re-initialize a lock while it is held. */ |
51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | 50 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); |
52 | lockdep_init_map(&sp->dep_map, name, key, 0); | 51 | lockdep_init_map(&sp->dep_map, name, key, 0); |
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | return init_srcu_struct_fields(sp); | 52 | return init_srcu_struct_fields(sp); |
55 | } | 53 | } |
56 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | 54 | EXPORT_SYMBOL_GPL(__init_srcu_struct); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce1..2df820b03beb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -262,7 +262,7 @@ repeat: | |||
262 | cpu_stop_fn_t fn = work->fn; | 262 | cpu_stop_fn_t fn = work->fn; |
263 | void *arg = work->arg; | 263 | void *arg = work->arg; |
264 | struct cpu_stop_done *done = work->done; | 264 | struct cpu_stop_done *done = work->done; |
265 | char ksym_buf[KSYM_NAME_LEN]; | 265 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; |
266 | 266 | ||
267 | __set_current_state(TASK_RUNNING); | 267 | __set_current_state(TASK_RUNNING); |
268 | 268 | ||
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
304 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", | 304 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", |
305 | cpu); | 305 | cpu); |
306 | if (IS_ERR(p)) | 306 | if (IS_ERR(p)) |
307 | return NOTIFY_BAD; | 307 | return notifier_from_errno(PTR_ERR(p)); |
308 | get_task_struct(p); | 308 | get_task_struct(p); |
309 | kthread_bind(p, cpu); | 309 | kthread_bind(p, cpu); |
310 | sched_set_stop_task(cpu, p); | 310 | sched_set_stop_task(cpu, p); |
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) | |||
372 | /* start one for the boot cpu */ | 372 | /* start one for the boot cpu */ |
373 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, | 373 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, |
374 | bcpu); | 374 | bcpu); |
375 | BUG_ON(err == NOTIFY_BAD); | 375 | BUG_ON(err != NOTIFY_OK); |
376 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | 376 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); |
377 | register_cpu_notifier(&cpu_stop_cpu_notifier); | 377 | register_cpu_notifier(&cpu_stop_cpu_notifier); |
378 | 378 | ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bad369ec5403..c782fe9924c7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg); | |||
50 | cond_syscall(sys_recvmsg); | 50 | cond_syscall(sys_recvmsg); |
51 | cond_syscall(sys_recvmmsg); | 51 | cond_syscall(sys_recvmmsg); |
52 | cond_syscall(compat_sys_recvmsg); | 52 | cond_syscall(compat_sys_recvmsg); |
53 | cond_syscall(compat_sys_recv); | ||
53 | cond_syscall(compat_sys_recvfrom); | 54 | cond_syscall(compat_sys_recvfrom); |
54 | cond_syscall(compat_sys_recvmmsg); | 55 | cond_syscall(compat_sys_recvmmsg); |
55 | cond_syscall(sys_socketcall); | 56 | cond_syscall(sys_socketcall); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..c33a1edb799f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -161,8 +161,6 @@ extern int no_unaligned_warning; | |||
161 | extern int unaligned_dump_stack; | 161 | extern int unaligned_dump_stack; |
162 | #endif | 162 | #endif |
163 | 163 | ||
164 | extern struct ratelimit_state printk_ratelimit_state; | ||
165 | |||
166 | #ifdef CONFIG_PROC_SYSCTL | 164 | #ifdef CONFIG_PROC_SYSCTL |
167 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 165 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
168 | void __user *buffer, size_t *lenp, loff_t *ppos); | 166 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -1340,28 +1338,28 @@ static struct ctl_table fs_table[] = { | |||
1340 | .data = &inodes_stat, | 1338 | .data = &inodes_stat, |
1341 | .maxlen = 2*sizeof(int), | 1339 | .maxlen = 2*sizeof(int), |
1342 | .mode = 0444, | 1340 | .mode = 0444, |
1343 | .proc_handler = proc_dointvec, | 1341 | .proc_handler = proc_nr_inodes, |
1344 | }, | 1342 | }, |
1345 | { | 1343 | { |
1346 | .procname = "inode-state", | 1344 | .procname = "inode-state", |
1347 | .data = &inodes_stat, | 1345 | .data = &inodes_stat, |
1348 | .maxlen = 7*sizeof(int), | 1346 | .maxlen = 7*sizeof(int), |
1349 | .mode = 0444, | 1347 | .mode = 0444, |
1350 | .proc_handler = proc_dointvec, | 1348 | .proc_handler = proc_nr_inodes, |
1351 | }, | 1349 | }, |
1352 | { | 1350 | { |
1353 | .procname = "file-nr", | 1351 | .procname = "file-nr", |
1354 | .data = &files_stat, | 1352 | .data = &files_stat, |
1355 | .maxlen = 3*sizeof(int), | 1353 | .maxlen = sizeof(files_stat), |
1356 | .mode = 0444, | 1354 | .mode = 0444, |
1357 | .proc_handler = proc_nr_files, | 1355 | .proc_handler = proc_nr_files, |
1358 | }, | 1356 | }, |
1359 | { | 1357 | { |
1360 | .procname = "file-max", | 1358 | .procname = "file-max", |
1361 | .data = &files_stat.max_files, | 1359 | .data = &files_stat.max_files, |
1362 | .maxlen = sizeof(int), | 1360 | .maxlen = sizeof(files_stat.max_files), |
1363 | .mode = 0644, | 1361 | .mode = 0644, |
1364 | .proc_handler = proc_dointvec, | 1362 | .proc_handler = proc_doulongvec_minmax, |
1365 | }, | 1363 | }, |
1366 | { | 1364 | { |
1367 | .procname = "nr_open", | 1365 | .procname = "nr_open", |
@@ -1377,7 +1375,7 @@ static struct ctl_table fs_table[] = { | |||
1377 | .data = &dentry_stat, | 1375 | .data = &dentry_stat, |
1378 | .maxlen = 6*sizeof(int), | 1376 | .maxlen = 6*sizeof(int), |
1379 | .mode = 0444, | 1377 | .mode = 0444, |
1380 | .proc_handler = proc_dointvec, | 1378 | .proc_handler = proc_nr_dentry, |
1381 | }, | 1379 | }, |
1382 | { | 1380 | { |
1383 | .procname = "overflowuid", | 1381 | .procname = "overflowuid", |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c827..10b90d8a03c4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
143 | if (!table->maxlen) | 143 | if (!table->maxlen) |
144 | set_fail(&fail, table, "No maxlen"); | 144 | set_fail(&fail, table, "No maxlen"); |
145 | } | 145 | } |
146 | if ((table->proc_handler == proc_doulongvec_minmax) || | ||
147 | (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { | ||
148 | if (table->maxlen > sizeof (unsigned long)) { | ||
149 | if (!table->extra1) | ||
150 | set_fail(&fail, table, "No min"); | ||
151 | if (!table->extra2) | ||
152 | set_fail(&fail, table, "No max"); | ||
153 | } | ||
154 | } | ||
155 | #ifdef CONFIG_PROC_SYSCTL | 146 | #ifdef CONFIG_PROC_SYSCTL |
156 | if (table->procname && !table->proc_handler) | 147 | if (table->procname && !table->proc_handler) |
157 | set_fail(&fail, table, "No proc_handler"); | 148 | set_fail(&fail, table, "No proc_handler"); |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792bd..c8231fb15708 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
175 | up_write(&listeners->sem); | 175 | up_write(&listeners->sem); |
176 | } | 176 | } |
177 | 177 | ||
178 | static int fill_pid(pid_t pid, struct task_struct *tsk, | 178 | static void fill_stats(struct task_struct *tsk, struct taskstats *stats) |
179 | struct taskstats *stats) | ||
180 | { | 179 | { |
181 | int rc = 0; | ||
182 | |||
183 | if (!tsk) { | ||
184 | rcu_read_lock(); | ||
185 | tsk = find_task_by_vpid(pid); | ||
186 | if (tsk) | ||
187 | get_task_struct(tsk); | ||
188 | rcu_read_unlock(); | ||
189 | if (!tsk) | ||
190 | return -ESRCH; | ||
191 | } else | ||
192 | get_task_struct(tsk); | ||
193 | |||
194 | memset(stats, 0, sizeof(*stats)); | 180 | memset(stats, 0, sizeof(*stats)); |
195 | /* | 181 | /* |
196 | * Each accounting subsystem adds calls to its functions to | 182 | * Each accounting subsystem adds calls to its functions to |
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
209 | 195 | ||
210 | /* fill in extended acct fields */ | 196 | /* fill in extended acct fields */ |
211 | xacct_add_tsk(stats, tsk); | 197 | xacct_add_tsk(stats, tsk); |
198 | } | ||
212 | 199 | ||
213 | /* Define err: label here if needed */ | 200 | static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) |
214 | put_task_struct(tsk); | 201 | { |
215 | return rc; | 202 | struct task_struct *tsk; |
216 | 203 | ||
204 | rcu_read_lock(); | ||
205 | tsk = find_task_by_vpid(pid); | ||
206 | if (tsk) | ||
207 | get_task_struct(tsk); | ||
208 | rcu_read_unlock(); | ||
209 | if (!tsk) | ||
210 | return -ESRCH; | ||
211 | fill_stats(tsk, stats); | ||
212 | put_task_struct(tsk); | ||
213 | return 0; | ||
217 | } | 214 | } |
218 | 215 | ||
219 | static int fill_tgid(pid_t tgid, struct task_struct *first, | 216 | static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) |
220 | struct taskstats *stats) | ||
221 | { | 217 | { |
222 | struct task_struct *tsk; | 218 | struct task_struct *tsk, *first; |
223 | unsigned long flags; | 219 | unsigned long flags; |
224 | int rc = -ESRCH; | 220 | int rc = -ESRCH; |
225 | 221 | ||
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
228 | * leaders who are already counted with the dead tasks | 224 | * leaders who are already counted with the dead tasks |
229 | */ | 225 | */ |
230 | rcu_read_lock(); | 226 | rcu_read_lock(); |
231 | if (!first) | 227 | first = find_task_by_vpid(tgid); |
232 | first = find_task_by_vpid(tgid); | ||
233 | 228 | ||
234 | if (!first || !lock_task_sighand(first, &flags)) | 229 | if (!first || !lock_task_sighand(first, &flags)) |
235 | goto out; | 230 | goto out; |
@@ -268,7 +263,6 @@ out: | |||
268 | return rc; | 263 | return rc; |
269 | } | 264 | } |
270 | 265 | ||
271 | |||
272 | static void fill_tgid_exit(struct task_struct *tsk) | 266 | static void fill_tgid_exit(struct task_struct *tsk) |
273 | { | 267 | { |
274 | unsigned long flags; | 268 | unsigned long flags; |
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
360 | struct nlattr *na, *ret; | 354 | struct nlattr *na, *ret; |
361 | int aggr; | 355 | int aggr; |
362 | 356 | ||
357 | /* If we don't pad, we end up with alignment on a 4 byte boundary. | ||
358 | * This causes lots of runtime warnings on systems requiring 8 byte | ||
359 | * alignment */ | ||
360 | u32 pids[2] = { pid, 0 }; | ||
361 | int pid_size = ALIGN(sizeof(pid), sizeof(long)); | ||
362 | |||
363 | aggr = (type == TASKSTATS_TYPE_PID) | 363 | aggr = (type == TASKSTATS_TYPE_PID) |
364 | ? TASKSTATS_TYPE_AGGR_PID | 364 | ? TASKSTATS_TYPE_AGGR_PID |
365 | : TASKSTATS_TYPE_AGGR_TGID; | 365 | : TASKSTATS_TYPE_AGGR_TGID; |
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
367 | na = nla_nest_start(skb, aggr); | 367 | na = nla_nest_start(skb, aggr); |
368 | if (!na) | 368 | if (!na) |
369 | goto err; | 369 | goto err; |
370 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | 370 | if (nla_put(skb, type, pid_size, pids) < 0) |
371 | goto err; | 371 | goto err; |
372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
373 | if (!ret) | 373 | if (!ret) |
@@ -424,39 +424,46 @@ err: | |||
424 | return rc; | 424 | return rc; |
425 | } | 425 | } |
426 | 426 | ||
427 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | 427 | static int cmd_attr_register_cpumask(struct genl_info *info) |
428 | { | 428 | { |
429 | int rc; | ||
430 | struct sk_buff *rep_skb; | ||
431 | struct taskstats *stats; | ||
432 | size_t size; | ||
433 | cpumask_var_t mask; | 429 | cpumask_var_t mask; |
430 | int rc; | ||
434 | 431 | ||
435 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 432 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
436 | return -ENOMEM; | 433 | return -ENOMEM; |
437 | |||
438 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); | 434 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); |
439 | if (rc < 0) | 435 | if (rc < 0) |
440 | goto free_return_rc; | 436 | goto out; |
441 | if (rc == 0) { | 437 | rc = add_del_listener(info->snd_pid, mask, REGISTER); |
442 | rc = add_del_listener(info->snd_pid, mask, REGISTER); | 438 | out: |
443 | goto free_return_rc; | 439 | free_cpumask_var(mask); |
444 | } | 440 | return rc; |
441 | } | ||
442 | |||
443 | static int cmd_attr_deregister_cpumask(struct genl_info *info) | ||
444 | { | ||
445 | cpumask_var_t mask; | ||
446 | int rc; | ||
445 | 447 | ||
448 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | ||
449 | return -ENOMEM; | ||
446 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); | 450 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); |
447 | if (rc < 0) | 451 | if (rc < 0) |
448 | goto free_return_rc; | 452 | goto out; |
449 | if (rc == 0) { | 453 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); |
450 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); | 454 | out: |
451 | free_return_rc: | ||
452 | free_cpumask_var(mask); | ||
453 | return rc; | ||
454 | } | ||
455 | free_cpumask_var(mask); | 455 | free_cpumask_var(mask); |
456 | return rc; | ||
457 | } | ||
458 | |||
459 | static int cmd_attr_pid(struct genl_info *info) | ||
460 | { | ||
461 | struct taskstats *stats; | ||
462 | struct sk_buff *rep_skb; | ||
463 | size_t size; | ||
464 | u32 pid; | ||
465 | int rc; | ||
456 | 466 | ||
457 | /* | ||
458 | * Size includes space for nested attributes | ||
459 | */ | ||
460 | size = nla_total_size(sizeof(u32)) + | 467 | size = nla_total_size(sizeof(u32)) + |
461 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 468 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
462 | 469 | ||
@@ -465,33 +472,64 @@ free_return_rc: | |||
465 | return rc; | 472 | return rc; |
466 | 473 | ||
467 | rc = -EINVAL; | 474 | rc = -EINVAL; |
468 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | 475 | pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); |
469 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | 476 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); |
470 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); | 477 | if (!stats) |
471 | if (!stats) | 478 | goto err; |
472 | goto err; | 479 | |
473 | 480 | rc = fill_stats_for_pid(pid, stats); | |
474 | rc = fill_pid(pid, NULL, stats); | 481 | if (rc < 0) |
475 | if (rc < 0) | 482 | goto err; |
476 | goto err; | 483 | return send_reply(rep_skb, info); |
477 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | 484 | err: |
478 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | 485 | nlmsg_free(rep_skb); |
479 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | 486 | return rc; |
480 | if (!stats) | 487 | } |
481 | goto err; | 488 | |
482 | 489 | static int cmd_attr_tgid(struct genl_info *info) | |
483 | rc = fill_tgid(tgid, NULL, stats); | 490 | { |
484 | if (rc < 0) | 491 | struct taskstats *stats; |
485 | goto err; | 492 | struct sk_buff *rep_skb; |
486 | } else | 493 | size_t size; |
494 | u32 tgid; | ||
495 | int rc; | ||
496 | |||
497 | size = nla_total_size(sizeof(u32)) + | ||
498 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
499 | |||
500 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | ||
501 | if (rc < 0) | ||
502 | return rc; | ||
503 | |||
504 | rc = -EINVAL; | ||
505 | tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
506 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
507 | if (!stats) | ||
487 | goto err; | 508 | goto err; |
488 | 509 | ||
510 | rc = fill_stats_for_tgid(tgid, stats); | ||
511 | if (rc < 0) | ||
512 | goto err; | ||
489 | return send_reply(rep_skb, info); | 513 | return send_reply(rep_skb, info); |
490 | err: | 514 | err: |
491 | nlmsg_free(rep_skb); | 515 | nlmsg_free(rep_skb); |
492 | return rc; | 516 | return rc; |
493 | } | 517 | } |
494 | 518 | ||
519 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
520 | { | ||
521 | if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) | ||
522 | return cmd_attr_register_cpumask(info); | ||
523 | else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) | ||
524 | return cmd_attr_deregister_cpumask(info); | ||
525 | else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) | ||
526 | return cmd_attr_pid(info); | ||
527 | else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) | ||
528 | return cmd_attr_tgid(info); | ||
529 | else | ||
530 | return -EINVAL; | ||
531 | } | ||
532 | |||
495 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) | 533 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) |
496 | { | 534 | { |
497 | struct signal_struct *sig = tsk->signal; | 535 | struct signal_struct *sig = tsk->signal; |
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
555 | if (!stats) | 593 | if (!stats) |
556 | goto err; | 594 | goto err; |
557 | 595 | ||
558 | rc = fill_pid(-1, tsk, stats); | 596 | fill_stats(tsk, stats); |
559 | if (rc < 0) | ||
560 | goto err; | ||
561 | 597 | ||
562 | /* | 598 | /* |
563 | * Doesn't matter if tsk is the leader or the last group member leaving | 599 | * Doesn't matter if tsk is the leader or the last group member leaving |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -115,7 +115,9 @@ static int test_kprobes(void) | |||
115 | int ret; | 115 | int ret; |
116 | struct kprobe *kps[2] = {&kp, &kp2}; | 116 | struct kprobe *kps[2] = {&kp, &kp2}; |
117 | 117 | ||
118 | kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 118 | /* addr and flags should be cleard for reusing kprobe. */ |
119 | kp.addr = NULL; | ||
120 | kp.flags = 0; | ||
119 | ret = register_kprobes(kps, 2); | 121 | ret = register_kprobes(kps, 2); |
120 | if (ret < 0) { | 122 | if (ret < 0) { |
121 | printk(KERN_ERR "Kprobe smoke test failed: " | 123 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -210,7 +212,9 @@ static int test_jprobes(void) | |||
210 | int ret; | 212 | int ret; |
211 | struct jprobe *jps[2] = {&jp, &jp2}; | 213 | struct jprobe *jps[2] = {&jp, &jp2}; |
212 | 214 | ||
213 | jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 215 | /* addr and flags should be cleard for reusing kprobe. */ |
216 | jp.kp.addr = NULL; | ||
217 | jp.kp.flags = 0; | ||
214 | ret = register_jprobes(jps, 2); | 218 | ret = register_jprobes(jps, 2); |
215 | if (ret < 0) { | 219 | if (ret < 0) { |
216 | printk(KERN_ERR "Kprobe smoke test failed: " | 220 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -323,7 +327,9 @@ static int test_kretprobes(void) | |||
323 | int ret; | 327 | int ret; |
324 | struct kretprobe *rps[2] = {&rp, &rp2}; | 328 | struct kretprobe *rps[2] = {&rp, &rp2}; |
325 | 329 | ||
326 | rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 330 | /* addr and flags should be cleard for reusing kprobe. */ |
331 | rp.kp.addr = NULL; | ||
332 | rp.kp.flags = 0; | ||
327 | ret = register_kretprobes(rps, 2); | 333 | ret = register_kretprobes(rps, 2); |
328 | if (ret < 0) { | 334 | if (ret < 0) { |
329 | printk(KERN_ERR "Kprobe smoke test failed: " | 335 | printk(KERN_ERR "Kprobe smoke test failed: " |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a80..d2321891538f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) | |||
149 | time_reftime = get_seconds(); | 149 | time_reftime = get_seconds(); |
150 | 150 | ||
151 | offset64 = offset; | 151 | offset64 = offset; |
152 | freq_adj = (offset64 * secs) << | 152 | freq_adj = ntp_update_offset_fll(offset64, secs); |
153 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
154 | 153 | ||
155 | freq_adj += ntp_update_offset_fll(offset64, secs); | 154 | /* |
155 | * Clamp update interval to reduce PLL gain with low | ||
156 | * sampling rate (e.g. intermittent network connection) | ||
157 | * to avoid instability. | ||
158 | */ | ||
159 | if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) | ||
160 | secs = 1 << (SHIFT_PLL + 1 + time_constant); | ||
161 | |||
162 | freq_adj += (offset64 * secs) << | ||
163 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
156 | 164 | ||
157 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); | 165 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); |
158 | 166 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -37,7 +37,7 @@ | |||
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/perf_event.h> | 40 | #include <linux/irq_work.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | 43 | ||
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick) | |||
1279 | run_local_timers(); | 1279 | run_local_timers(); |
1280 | rcu_check_callbacks(cpu, user_tick); | 1280 | rcu_check_callbacks(cpu, user_tick); |
1281 | printk_tick(); | 1281 | printk_tick(); |
1282 | perf_event_do_pending(); | 1282 | #ifdef CONFIG_IRQ_WORK |
1283 | if (in_irq()) | ||
1284 | irq_work_run(); | ||
1285 | #endif | ||
1283 | scheduler_tick(); | 1286 | scheduler_tick(); |
1284 | run_posix_cpu_timers(p); | 1287 | run_posix_cpu_timers(p); |
1285 | } | 1288 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..e04b8bcdef88 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
49 | help | 49 | help |
50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
51 | 51 | ||
52 | config HAVE_C_RECORDMCOUNT | ||
53 | bool | ||
54 | help | ||
55 | C version of recordmcount available? | ||
56 | |||
52 | config TRACER_MAX_TRACE | 57 | config TRACER_MAX_TRACE |
53 | bool | 58 | bool |
54 | 59 | ||
@@ -121,7 +126,7 @@ if FTRACE | |||
121 | config FUNCTION_TRACER | 126 | config FUNCTION_TRACER |
122 | bool "Kernel Function Tracer" | 127 | bool "Kernel Function Tracer" |
123 | depends on HAVE_FUNCTION_TRACER | 128 | depends on HAVE_FUNCTION_TRACER |
124 | select FRAME_POINTER | 129 | select FRAME_POINTER if (!ARM_UNWIND) |
125 | select KALLSYMS | 130 | select KALLSYMS |
126 | select GENERIC_TRACER | 131 | select GENERIC_TRACER |
127 | select CONTEXT_SWITCH_TRACER | 132 | select CONTEXT_SWITCH_TRACER |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc1..bc251ed66724 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/debugfs.h> | 25 | #include <linux/debugfs.h> |
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/time.h> | 26 | #include <linux/time.h> |
28 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
29 | 28 | ||
@@ -326,6 +325,7 @@ static const struct file_operations blk_dropped_fops = { | |||
326 | .owner = THIS_MODULE, | 325 | .owner = THIS_MODULE, |
327 | .open = blk_dropped_open, | 326 | .open = blk_dropped_open, |
328 | .read = blk_dropped_read, | 327 | .read = blk_dropped_read, |
328 | .llseek = default_llseek, | ||
329 | }; | 329 | }; |
330 | 330 | ||
331 | static int blk_msg_open(struct inode *inode, struct file *filp) | 331 | static int blk_msg_open(struct inode *inode, struct file *filp) |
@@ -365,6 +365,7 @@ static const struct file_operations blk_msg_fops = { | |||
365 | .owner = THIS_MODULE, | 365 | .owner = THIS_MODULE, |
366 | .open = blk_msg_open, | 366 | .open = blk_msg_open, |
367 | .write = blk_msg_write, | 367 | .write = blk_msg_write, |
368 | .llseek = noop_llseek, | ||
368 | }; | 369 | }; |
369 | 370 | ||
370 | /* | 371 | /* |
@@ -639,7 +640,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
639 | if (!q) | 640 | if (!q) |
640 | return -ENXIO; | 641 | return -ENXIO; |
641 | 642 | ||
642 | lock_kernel(); | ||
643 | mutex_lock(&bdev->bd_mutex); | 643 | mutex_lock(&bdev->bd_mutex); |
644 | 644 | ||
645 | switch (cmd) { | 645 | switch (cmd) { |
@@ -667,7 +667,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
667 | } | 667 | } |
668 | 668 | ||
669 | mutex_unlock(&bdev->bd_mutex); | 669 | mutex_unlock(&bdev->bd_mutex); |
670 | unlock_kernel(); | ||
671 | return ret; | 670 | return ret; |
672 | } | 671 | } |
673 | 672 | ||
@@ -1652,10 +1651,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, | |||
1652 | struct block_device *bdev; | 1651 | struct block_device *bdev; |
1653 | ssize_t ret = -ENXIO; | 1652 | ssize_t ret = -ENXIO; |
1654 | 1653 | ||
1655 | lock_kernel(); | ||
1656 | bdev = bdget(part_devt(p)); | 1654 | bdev = bdget(part_devt(p)); |
1657 | if (bdev == NULL) | 1655 | if (bdev == NULL) |
1658 | goto out_unlock_kernel; | 1656 | goto out; |
1659 | 1657 | ||
1660 | q = blk_trace_get_queue(bdev); | 1658 | q = blk_trace_get_queue(bdev); |
1661 | if (q == NULL) | 1659 | if (q == NULL) |
@@ -1683,8 +1681,7 @@ out_unlock_bdev: | |||
1683 | mutex_unlock(&bdev->bd_mutex); | 1681 | mutex_unlock(&bdev->bd_mutex); |
1684 | out_bdput: | 1682 | out_bdput: |
1685 | bdput(bdev); | 1683 | bdput(bdev); |
1686 | out_unlock_kernel: | 1684 | out: |
1687 | unlock_kernel(); | ||
1688 | return ret; | 1685 | return ret; |
1689 | } | 1686 | } |
1690 | 1687 | ||
@@ -1714,11 +1711,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1714 | 1711 | ||
1715 | ret = -ENXIO; | 1712 | ret = -ENXIO; |
1716 | 1713 | ||
1717 | lock_kernel(); | ||
1718 | p = dev_to_part(dev); | 1714 | p = dev_to_part(dev); |
1719 | bdev = bdget(part_devt(p)); | 1715 | bdev = bdget(part_devt(p)); |
1720 | if (bdev == NULL) | 1716 | if (bdev == NULL) |
1721 | goto out_unlock_kernel; | 1717 | goto out; |
1722 | 1718 | ||
1723 | q = blk_trace_get_queue(bdev); | 1719 | q = blk_trace_get_queue(bdev); |
1724 | if (q == NULL) | 1720 | if (q == NULL) |
@@ -1753,8 +1749,6 @@ out_unlock_bdev: | |||
1753 | mutex_unlock(&bdev->bd_mutex); | 1749 | mutex_unlock(&bdev->bd_mutex); |
1754 | out_bdput: | 1750 | out_bdput: |
1755 | bdput(bdev); | 1751 | bdput(bdev); |
1756 | out_unlock_kernel: | ||
1757 | unlock_kernel(); | ||
1758 | out: | 1752 | out: |
1759 | return ret ? ret : count; | 1753 | return ret ? ret : count; |
1760 | } | 1754 | } |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..f3dadae83883 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { | |||
800 | .open = tracing_open_generic, | 800 | .open = tracing_open_generic, |
801 | .read = ftrace_profile_read, | 801 | .read = ftrace_profile_read, |
802 | .write = ftrace_profile_write, | 802 | .write = ftrace_profile_write, |
803 | .llseek = default_llseek, | ||
803 | }; | 804 | }; |
804 | 805 | ||
805 | /* used to initialize the real stat files */ | 806 | /* used to initialize the real stat files */ |
@@ -884,10 +885,8 @@ enum { | |||
884 | FTRACE_ENABLE_CALLS = (1 << 0), | 885 | FTRACE_ENABLE_CALLS = (1 << 0), |
885 | FTRACE_DISABLE_CALLS = (1 << 1), | 886 | FTRACE_DISABLE_CALLS = (1 << 1), |
886 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | 887 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), |
887 | FTRACE_ENABLE_MCOUNT = (1 << 3), | 888 | FTRACE_START_FUNC_RET = (1 << 3), |
888 | FTRACE_DISABLE_MCOUNT = (1 << 4), | 889 | FTRACE_STOP_FUNC_RET = (1 << 4), |
889 | FTRACE_START_FUNC_RET = (1 << 5), | ||
890 | FTRACE_STOP_FUNC_RET = (1 << 6), | ||
891 | }; | 890 | }; |
892 | 891 | ||
893 | static int ftrace_filtered; | 892 | static int ftrace_filtered; |
@@ -1226,8 +1225,6 @@ static void ftrace_shutdown(int command) | |||
1226 | 1225 | ||
1227 | static void ftrace_startup_sysctl(void) | 1226 | static void ftrace_startup_sysctl(void) |
1228 | { | 1227 | { |
1229 | int command = FTRACE_ENABLE_MCOUNT; | ||
1230 | |||
1231 | if (unlikely(ftrace_disabled)) | 1228 | if (unlikely(ftrace_disabled)) |
1232 | return; | 1229 | return; |
1233 | 1230 | ||
@@ -1235,23 +1232,17 @@ static void ftrace_startup_sysctl(void) | |||
1235 | saved_ftrace_func = NULL; | 1232 | saved_ftrace_func = NULL; |
1236 | /* ftrace_start_up is true if we want ftrace running */ | 1233 | /* ftrace_start_up is true if we want ftrace running */ |
1237 | if (ftrace_start_up) | 1234 | if (ftrace_start_up) |
1238 | command |= FTRACE_ENABLE_CALLS; | 1235 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); |
1239 | |||
1240 | ftrace_run_update_code(command); | ||
1241 | } | 1236 | } |
1242 | 1237 | ||
1243 | static void ftrace_shutdown_sysctl(void) | 1238 | static void ftrace_shutdown_sysctl(void) |
1244 | { | 1239 | { |
1245 | int command = FTRACE_DISABLE_MCOUNT; | ||
1246 | |||
1247 | if (unlikely(ftrace_disabled)) | 1240 | if (unlikely(ftrace_disabled)) |
1248 | return; | 1241 | return; |
1249 | 1242 | ||
1250 | /* ftrace_start_up is true if ftrace is running */ | 1243 | /* ftrace_start_up is true if ftrace is running */ |
1251 | if (ftrace_start_up) | 1244 | if (ftrace_start_up) |
1252 | command |= FTRACE_DISABLE_CALLS; | 1245 | ftrace_run_update_code(FTRACE_DISABLE_CALLS); |
1253 | |||
1254 | ftrace_run_update_code(command); | ||
1255 | } | 1246 | } |
1256 | 1247 | ||
1257 | static cycle_t ftrace_update_time; | 1248 | static cycle_t ftrace_update_time; |
@@ -1368,24 +1359,29 @@ enum { | |||
1368 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1359 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
1369 | 1360 | ||
1370 | struct ftrace_iterator { | 1361 | struct ftrace_iterator { |
1371 | struct ftrace_page *pg; | 1362 | loff_t pos; |
1372 | int hidx; | 1363 | loff_t func_pos; |
1373 | int idx; | 1364 | struct ftrace_page *pg; |
1374 | unsigned flags; | 1365 | struct dyn_ftrace *func; |
1375 | struct trace_parser parser; | 1366 | struct ftrace_func_probe *probe; |
1367 | struct trace_parser parser; | ||
1368 | int hidx; | ||
1369 | int idx; | ||
1370 | unsigned flags; | ||
1376 | }; | 1371 | }; |
1377 | 1372 | ||
1378 | static void * | 1373 | static void * |
1379 | t_hash_next(struct seq_file *m, void *v, loff_t *pos) | 1374 | t_hash_next(struct seq_file *m, loff_t *pos) |
1380 | { | 1375 | { |
1381 | struct ftrace_iterator *iter = m->private; | 1376 | struct ftrace_iterator *iter = m->private; |
1382 | struct hlist_node *hnd = v; | 1377 | struct hlist_node *hnd = NULL; |
1383 | struct hlist_head *hhd; | 1378 | struct hlist_head *hhd; |
1384 | 1379 | ||
1385 | WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); | ||
1386 | |||
1387 | (*pos)++; | 1380 | (*pos)++; |
1381 | iter->pos = *pos; | ||
1388 | 1382 | ||
1383 | if (iter->probe) | ||
1384 | hnd = &iter->probe->node; | ||
1389 | retry: | 1385 | retry: |
1390 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) | 1386 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) |
1391 | return NULL; | 1387 | return NULL; |
@@ -1408,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) | |||
1408 | } | 1404 | } |
1409 | } | 1405 | } |
1410 | 1406 | ||
1411 | return hnd; | 1407 | if (WARN_ON_ONCE(!hnd)) |
1408 | return NULL; | ||
1409 | |||
1410 | iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); | ||
1411 | |||
1412 | return iter; | ||
1412 | } | 1413 | } |
1413 | 1414 | ||
1414 | static void *t_hash_start(struct seq_file *m, loff_t *pos) | 1415 | static void *t_hash_start(struct seq_file *m, loff_t *pos) |
@@ -1417,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
1417 | void *p = NULL; | 1418 | void *p = NULL; |
1418 | loff_t l; | 1419 | loff_t l; |
1419 | 1420 | ||
1420 | if (!(iter->flags & FTRACE_ITER_HASH)) | 1421 | if (iter->func_pos > *pos) |
1421 | *pos = 0; | 1422 | return NULL; |
1422 | |||
1423 | iter->flags |= FTRACE_ITER_HASH; | ||
1424 | 1423 | ||
1425 | iter->hidx = 0; | 1424 | iter->hidx = 0; |
1426 | for (l = 0; l <= *pos; ) { | 1425 | for (l = 0; l <= (*pos - iter->func_pos); ) { |
1427 | p = t_hash_next(m, p, &l); | 1426 | p = t_hash_next(m, &l); |
1428 | if (!p) | 1427 | if (!p) |
1429 | break; | 1428 | break; |
1430 | } | 1429 | } |
1431 | return p; | 1430 | if (!p) |
1431 | return NULL; | ||
1432 | |||
1433 | /* Only set this if we have an item */ | ||
1434 | iter->flags |= FTRACE_ITER_HASH; | ||
1435 | |||
1436 | return iter; | ||
1432 | } | 1437 | } |
1433 | 1438 | ||
1434 | static int t_hash_show(struct seq_file *m, void *v) | 1439 | static int |
1440 | t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
1435 | { | 1441 | { |
1436 | struct ftrace_func_probe *rec; | 1442 | struct ftrace_func_probe *rec; |
1437 | struct hlist_node *hnd = v; | ||
1438 | 1443 | ||
1439 | rec = hlist_entry(hnd, struct ftrace_func_probe, node); | 1444 | rec = iter->probe; |
1445 | if (WARN_ON_ONCE(!rec)) | ||
1446 | return -EIO; | ||
1440 | 1447 | ||
1441 | if (rec->ops->print) | 1448 | if (rec->ops->print) |
1442 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); | 1449 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); |
@@ -1457,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1457 | struct dyn_ftrace *rec = NULL; | 1464 | struct dyn_ftrace *rec = NULL; |
1458 | 1465 | ||
1459 | if (iter->flags & FTRACE_ITER_HASH) | 1466 | if (iter->flags & FTRACE_ITER_HASH) |
1460 | return t_hash_next(m, v, pos); | 1467 | return t_hash_next(m, pos); |
1461 | 1468 | ||
1462 | (*pos)++; | 1469 | (*pos)++; |
1470 | iter->pos = *pos; | ||
1463 | 1471 | ||
1464 | if (iter->flags & FTRACE_ITER_PRINTALL) | 1472 | if (iter->flags & FTRACE_ITER_PRINTALL) |
1465 | return NULL; | 1473 | return t_hash_start(m, pos); |
1466 | 1474 | ||
1467 | retry: | 1475 | retry: |
1468 | if (iter->idx >= iter->pg->index) { | 1476 | if (iter->idx >= iter->pg->index) { |
@@ -1491,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1491 | } | 1499 | } |
1492 | } | 1500 | } |
1493 | 1501 | ||
1494 | return rec; | 1502 | if (!rec) |
1503 | return t_hash_start(m, pos); | ||
1504 | |||
1505 | iter->func_pos = *pos; | ||
1506 | iter->func = rec; | ||
1507 | |||
1508 | return iter; | ||
1509 | } | ||
1510 | |||
1511 | static void reset_iter_read(struct ftrace_iterator *iter) | ||
1512 | { | ||
1513 | iter->pos = 0; | ||
1514 | iter->func_pos = 0; | ||
1515 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); | ||
1495 | } | 1516 | } |
1496 | 1517 | ||
1497 | static void *t_start(struct seq_file *m, loff_t *pos) | 1518 | static void *t_start(struct seq_file *m, loff_t *pos) |
@@ -1502,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1502 | 1523 | ||
1503 | mutex_lock(&ftrace_lock); | 1524 | mutex_lock(&ftrace_lock); |
1504 | /* | 1525 | /* |
1526 | * If an lseek was done, then reset and start from beginning. | ||
1527 | */ | ||
1528 | if (*pos < iter->pos) | ||
1529 | reset_iter_read(iter); | ||
1530 | |||
1531 | /* | ||
1505 | * For set_ftrace_filter reading, if we have the filter | 1532 | * For set_ftrace_filter reading, if we have the filter |
1506 | * off, we can short cut and just print out that all | 1533 | * off, we can short cut and just print out that all |
1507 | * functions are enabled. | 1534 | * functions are enabled. |
@@ -1518,6 +1545,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1518 | if (iter->flags & FTRACE_ITER_HASH) | 1545 | if (iter->flags & FTRACE_ITER_HASH) |
1519 | return t_hash_start(m, pos); | 1546 | return t_hash_start(m, pos); |
1520 | 1547 | ||
1548 | /* | ||
1549 | * Unfortunately, we need to restart at ftrace_pages_start | ||
1550 | * every time we let go of the ftrace_lock. This is because | ||
1551 | * those pointers can change without the lock. | ||
1552 | */ | ||
1521 | iter->pg = ftrace_pages_start; | 1553 | iter->pg = ftrace_pages_start; |
1522 | iter->idx = 0; | 1554 | iter->idx = 0; |
1523 | for (l = 0; l <= *pos; ) { | 1555 | for (l = 0; l <= *pos; ) { |
@@ -1526,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1526 | break; | 1558 | break; |
1527 | } | 1559 | } |
1528 | 1560 | ||
1529 | if (!p && iter->flags & FTRACE_ITER_FILTER) | 1561 | if (!p) { |
1530 | return t_hash_start(m, pos); | 1562 | if (iter->flags & FTRACE_ITER_FILTER) |
1563 | return t_hash_start(m, pos); | ||
1564 | |||
1565 | return NULL; | ||
1566 | } | ||
1531 | 1567 | ||
1532 | return p; | 1568 | return iter; |
1533 | } | 1569 | } |
1534 | 1570 | ||
1535 | static void t_stop(struct seq_file *m, void *p) | 1571 | static void t_stop(struct seq_file *m, void *p) |
@@ -1540,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p) | |||
1540 | static int t_show(struct seq_file *m, void *v) | 1576 | static int t_show(struct seq_file *m, void *v) |
1541 | { | 1577 | { |
1542 | struct ftrace_iterator *iter = m->private; | 1578 | struct ftrace_iterator *iter = m->private; |
1543 | struct dyn_ftrace *rec = v; | 1579 | struct dyn_ftrace *rec; |
1544 | 1580 | ||
1545 | if (iter->flags & FTRACE_ITER_HASH) | 1581 | if (iter->flags & FTRACE_ITER_HASH) |
1546 | return t_hash_show(m, v); | 1582 | return t_hash_show(m, iter); |
1547 | 1583 | ||
1548 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 1584 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
1549 | seq_printf(m, "#### all functions enabled ####\n"); | 1585 | seq_printf(m, "#### all functions enabled ####\n"); |
1550 | return 0; | 1586 | return 0; |
1551 | } | 1587 | } |
1552 | 1588 | ||
1589 | rec = iter->func; | ||
1590 | |||
1553 | if (!rec) | 1591 | if (!rec) |
1554 | return 0; | 1592 | return 0; |
1555 | 1593 | ||
@@ -1601,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) | |||
1601 | 1639 | ||
1602 | ret = ftrace_avail_open(inode, file); | 1640 | ret = ftrace_avail_open(inode, file); |
1603 | if (!ret) { | 1641 | if (!ret) { |
1604 | m = (struct seq_file *)file->private_data; | 1642 | m = file->private_data; |
1605 | iter = (struct ftrace_iterator *)m->private; | 1643 | iter = m->private; |
1606 | iter->flags = FTRACE_ITER_FAILURES; | 1644 | iter->flags = FTRACE_ITER_FAILURES; |
1607 | } | 1645 | } |
1608 | 1646 | ||
@@ -2418,7 +2456,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
2418 | .open = ftrace_filter_open, | 2456 | .open = ftrace_filter_open, |
2419 | .read = seq_read, | 2457 | .read = seq_read, |
2420 | .write = ftrace_filter_write, | 2458 | .write = ftrace_filter_write, |
2421 | .llseek = no_llseek, | 2459 | .llseek = ftrace_regex_lseek, |
2422 | .release = ftrace_filter_release, | 2460 | .release = ftrace_filter_release, |
2423 | }; | 2461 | }; |
2424 | 2462 | ||
@@ -2632,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = { | |||
2632 | .read = seq_read, | 2670 | .read = seq_read, |
2633 | .write = ftrace_graph_write, | 2671 | .write = ftrace_graph_write, |
2634 | .release = ftrace_graph_release, | 2672 | .release = ftrace_graph_release, |
2673 | .llseek = seq_lseek, | ||
2635 | }; | 2674 | }; |
2636 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 2675 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2637 | 2676 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4e..9ed509a015d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -224,6 +224,9 @@ enum { | |||
224 | RB_LEN_TIME_STAMP = 16, | 224 | RB_LEN_TIME_STAMP = 16, |
225 | }; | 225 | }; |
226 | 226 | ||
227 | #define skip_time_extend(event) \ | ||
228 | ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) | ||
229 | |||
227 | static inline int rb_null_event(struct ring_buffer_event *event) | 230 | static inline int rb_null_event(struct ring_buffer_event *event) |
228 | { | 231 | { |
229 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; | 232 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; |
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) | |||
248 | return length + RB_EVNT_HDR_SIZE; | 251 | return length + RB_EVNT_HDR_SIZE; |
249 | } | 252 | } |
250 | 253 | ||
251 | /* inline for ring buffer fast paths */ | 254 | /* |
252 | static unsigned | 255 | * Return the length of the given event. Will return |
256 | * the length of the time extend if the event is a | ||
257 | * time extend. | ||
258 | */ | ||
259 | static inline unsigned | ||
253 | rb_event_length(struct ring_buffer_event *event) | 260 | rb_event_length(struct ring_buffer_event *event) |
254 | { | 261 | { |
255 | switch (event->type_len) { | 262 | switch (event->type_len) { |
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) | |||
274 | return 0; | 281 | return 0; |
275 | } | 282 | } |
276 | 283 | ||
284 | /* | ||
285 | * Return total length of time extend and data, | ||
286 | * or just the event length for all other events. | ||
287 | */ | ||
288 | static inline unsigned | ||
289 | rb_event_ts_length(struct ring_buffer_event *event) | ||
290 | { | ||
291 | unsigned len = 0; | ||
292 | |||
293 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
294 | /* time extends include the data event after it */ | ||
295 | len = RB_LEN_TIME_EXTEND; | ||
296 | event = skip_time_extend(event); | ||
297 | } | ||
298 | return len + rb_event_length(event); | ||
299 | } | ||
300 | |||
277 | /** | 301 | /** |
278 | * ring_buffer_event_length - return the length of the event | 302 | * ring_buffer_event_length - return the length of the event |
279 | * @event: the event to get the length of | 303 | * @event: the event to get the length of |
304 | * | ||
305 | * Returns the size of the data load of a data event. | ||
306 | * If the event is something other than a data event, it | ||
307 | * returns the size of the event itself. With the exception | ||
308 | * of a TIME EXTEND, where it still returns the size of the | ||
309 | * data load of the data event after it. | ||
280 | */ | 310 | */ |
281 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) | 311 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) |
282 | { | 312 | { |
283 | unsigned length = rb_event_length(event); | 313 | unsigned length; |
314 | |||
315 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
316 | event = skip_time_extend(event); | ||
317 | |||
318 | length = rb_event_length(event); | ||
284 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 319 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
285 | return length; | 320 | return length; |
286 | length -= RB_EVNT_HDR_SIZE; | 321 | length -= RB_EVNT_HDR_SIZE; |
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); | |||
294 | static void * | 329 | static void * |
295 | rb_event_data(struct ring_buffer_event *event) | 330 | rb_event_data(struct ring_buffer_event *event) |
296 | { | 331 | { |
332 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
333 | event = skip_time_extend(event); | ||
297 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); | 334 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); |
298 | /* If length is in len field, then array[0] has the data */ | 335 | /* If length is in len field, then array[0] has the data */ |
299 | if (event->type_len) | 336 | if (event->type_len) |
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta) | |||
404 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ | 441 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ |
405 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) | 442 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) |
406 | 443 | ||
407 | /* Max number of timestamps that can fit on a page */ | ||
408 | #define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) | ||
409 | |||
410 | int ring_buffer_print_page_header(struct trace_seq *s) | 444 | int ring_buffer_print_page_header(struct trace_seq *s) |
411 | { | 445 | { |
412 | struct buffer_data_page field; | 446 | struct buffer_data_page field; |
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1546 | iter->head = 0; | 1580 | iter->head = 0; |
1547 | } | 1581 | } |
1548 | 1582 | ||
1583 | /* Slow path, do not inline */ | ||
1584 | static noinline struct ring_buffer_event * | ||
1585 | rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | ||
1586 | { | ||
1587 | event->type_len = RINGBUF_TYPE_TIME_EXTEND; | ||
1588 | |||
1589 | /* Not the first event on the page? */ | ||
1590 | if (rb_event_index(event)) { | ||
1591 | event->time_delta = delta & TS_MASK; | ||
1592 | event->array[0] = delta >> TS_SHIFT; | ||
1593 | } else { | ||
1594 | /* nope, just zero it */ | ||
1595 | event->time_delta = 0; | ||
1596 | event->array[0] = 0; | ||
1597 | } | ||
1598 | |||
1599 | return skip_time_extend(event); | ||
1600 | } | ||
1601 | |||
1549 | /** | 1602 | /** |
1550 | * ring_buffer_update_event - update event type and data | 1603 | * ring_buffer_update_event - update event type and data |
1551 | * @event: the even to update | 1604 | * @event: the even to update |
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1558 | * data field. | 1611 | * data field. |
1559 | */ | 1612 | */ |
1560 | static void | 1613 | static void |
1561 | rb_update_event(struct ring_buffer_event *event, | 1614 | rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, |
1562 | unsigned type, unsigned length) | 1615 | struct ring_buffer_event *event, unsigned length, |
1616 | int add_timestamp, u64 delta) | ||
1563 | { | 1617 | { |
1564 | event->type_len = type; | 1618 | /* Only a commit updates the timestamp */ |
1565 | 1619 | if (unlikely(!rb_event_is_commit(cpu_buffer, event))) | |
1566 | switch (type) { | 1620 | delta = 0; |
1567 | |||
1568 | case RINGBUF_TYPE_PADDING: | ||
1569 | case RINGBUF_TYPE_TIME_EXTEND: | ||
1570 | case RINGBUF_TYPE_TIME_STAMP: | ||
1571 | break; | ||
1572 | 1621 | ||
1573 | case 0: | 1622 | /* |
1574 | length -= RB_EVNT_HDR_SIZE; | 1623 | * If we need to add a timestamp, then we |
1575 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) | 1624 | * add it to the start of the reserved space. |
1576 | event->array[0] = length; | 1625 | */ |
1577 | else | 1626 | if (unlikely(add_timestamp)) { |
1578 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | 1627 | event = rb_add_time_stamp(event, delta); |
1579 | break; | 1628 | length -= RB_LEN_TIME_EXTEND; |
1580 | default: | 1629 | delta = 0; |
1581 | BUG(); | ||
1582 | } | 1630 | } |
1631 | |||
1632 | event->time_delta = delta; | ||
1633 | length -= RB_EVNT_HDR_SIZE; | ||
1634 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { | ||
1635 | event->type_len = 0; | ||
1636 | event->array[0] = length; | ||
1637 | } else | ||
1638 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | ||
1583 | } | 1639 | } |
1584 | 1640 | ||
1585 | /* | 1641 | /* |
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1823 | local_sub(length, &tail_page->write); | 1879 | local_sub(length, &tail_page->write); |
1824 | } | 1880 | } |
1825 | 1881 | ||
1826 | static struct ring_buffer_event * | 1882 | /* |
1883 | * This is the slow path, force gcc not to inline it. | ||
1884 | */ | ||
1885 | static noinline struct ring_buffer_event * | ||
1827 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | 1886 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, |
1828 | unsigned long length, unsigned long tail, | 1887 | unsigned long length, unsigned long tail, |
1829 | struct buffer_page *tail_page, u64 *ts) | 1888 | struct buffer_page *tail_page, u64 ts) |
1830 | { | 1889 | { |
1831 | struct buffer_page *commit_page = cpu_buffer->commit_page; | 1890 | struct buffer_page *commit_page = cpu_buffer->commit_page; |
1832 | struct ring_buffer *buffer = cpu_buffer->buffer; | 1891 | struct ring_buffer *buffer = cpu_buffer->buffer; |
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1909 | * Nested commits always have zero deltas, so | 1968 | * Nested commits always have zero deltas, so |
1910 | * just reread the time stamp | 1969 | * just reread the time stamp |
1911 | */ | 1970 | */ |
1912 | *ts = rb_time_stamp(buffer); | 1971 | ts = rb_time_stamp(buffer); |
1913 | next_page->page->time_stamp = *ts; | 1972 | next_page->page->time_stamp = ts; |
1914 | } | 1973 | } |
1915 | 1974 | ||
1916 | out_again: | 1975 | out_again: |
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1929 | 1988 | ||
1930 | static struct ring_buffer_event * | 1989 | static struct ring_buffer_event * |
1931 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | 1990 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, |
1932 | unsigned type, unsigned long length, u64 *ts) | 1991 | unsigned long length, u64 ts, |
1992 | u64 delta, int add_timestamp) | ||
1933 | { | 1993 | { |
1934 | struct buffer_page *tail_page; | 1994 | struct buffer_page *tail_page; |
1935 | struct ring_buffer_event *event; | 1995 | struct ring_buffer_event *event; |
1936 | unsigned long tail, write; | 1996 | unsigned long tail, write; |
1937 | 1997 | ||
1998 | /* | ||
1999 | * If the time delta since the last event is too big to | ||
2000 | * hold in the time field of the event, then we append a | ||
2001 | * TIME EXTEND event ahead of the data event. | ||
2002 | */ | ||
2003 | if (unlikely(add_timestamp)) | ||
2004 | length += RB_LEN_TIME_EXTEND; | ||
2005 | |||
1938 | tail_page = cpu_buffer->tail_page; | 2006 | tail_page = cpu_buffer->tail_page; |
1939 | write = local_add_return(length, &tail_page->write); | 2007 | write = local_add_return(length, &tail_page->write); |
1940 | 2008 | ||
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1943 | tail = write - length; | 2011 | tail = write - length; |
1944 | 2012 | ||
1945 | /* See if we shot pass the end of this buffer page */ | 2013 | /* See if we shot pass the end of this buffer page */ |
1946 | if (write > BUF_PAGE_SIZE) | 2014 | if (unlikely(write > BUF_PAGE_SIZE)) |
1947 | return rb_move_tail(cpu_buffer, length, tail, | 2015 | return rb_move_tail(cpu_buffer, length, tail, |
1948 | tail_page, ts); | 2016 | tail_page, ts); |
1949 | 2017 | ||
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1951 | 2019 | ||
1952 | event = __rb_page_index(tail_page, tail); | 2020 | event = __rb_page_index(tail_page, tail); |
1953 | kmemcheck_annotate_bitfield(event, bitfield); | 2021 | kmemcheck_annotate_bitfield(event, bitfield); |
1954 | rb_update_event(event, type, length); | 2022 | rb_update_event(cpu_buffer, event, length, add_timestamp, delta); |
1955 | 2023 | ||
1956 | /* The passed in type is zero for DATA */ | 2024 | local_inc(&tail_page->entries); |
1957 | if (likely(!type)) | ||
1958 | local_inc(&tail_page->entries); | ||
1959 | 2025 | ||
1960 | /* | 2026 | /* |
1961 | * If this is the first commit on the page, then update | 2027 | * If this is the first commit on the page, then update |
1962 | * its timestamp. | 2028 | * its timestamp. |
1963 | */ | 2029 | */ |
1964 | if (!tail) | 2030 | if (!tail) |
1965 | tail_page->page->time_stamp = *ts; | 2031 | tail_page->page->time_stamp = ts; |
1966 | 2032 | ||
1967 | return event; | 2033 | return event; |
1968 | } | 2034 | } |
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
1977 | unsigned long addr; | 2043 | unsigned long addr; |
1978 | 2044 | ||
1979 | new_index = rb_event_index(event); | 2045 | new_index = rb_event_index(event); |
1980 | old_index = new_index + rb_event_length(event); | 2046 | old_index = new_index + rb_event_ts_length(event); |
1981 | addr = (unsigned long)event; | 2047 | addr = (unsigned long)event; |
1982 | addr &= PAGE_MASK; | 2048 | addr &= PAGE_MASK; |
1983 | 2049 | ||
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2003 | return 0; | 2069 | return 0; |
2004 | } | 2070 | } |
2005 | 2071 | ||
2006 | static int | ||
2007 | rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2008 | u64 *ts, u64 *delta) | ||
2009 | { | ||
2010 | struct ring_buffer_event *event; | ||
2011 | int ret; | ||
2012 | |||
2013 | WARN_ONCE(*delta > (1ULL << 59), | ||
2014 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | ||
2015 | (unsigned long long)*delta, | ||
2016 | (unsigned long long)*ts, | ||
2017 | (unsigned long long)cpu_buffer->write_stamp); | ||
2018 | |||
2019 | /* | ||
2020 | * The delta is too big, we to add a | ||
2021 | * new timestamp. | ||
2022 | */ | ||
2023 | event = __rb_reserve_next(cpu_buffer, | ||
2024 | RINGBUF_TYPE_TIME_EXTEND, | ||
2025 | RB_LEN_TIME_EXTEND, | ||
2026 | ts); | ||
2027 | if (!event) | ||
2028 | return -EBUSY; | ||
2029 | |||
2030 | if (PTR_ERR(event) == -EAGAIN) | ||
2031 | return -EAGAIN; | ||
2032 | |||
2033 | /* Only a commited time event can update the write stamp */ | ||
2034 | if (rb_event_is_commit(cpu_buffer, event)) { | ||
2035 | /* | ||
2036 | * If this is the first on the page, then it was | ||
2037 | * updated with the page itself. Try to discard it | ||
2038 | * and if we can't just make it zero. | ||
2039 | */ | ||
2040 | if (rb_event_index(event)) { | ||
2041 | event->time_delta = *delta & TS_MASK; | ||
2042 | event->array[0] = *delta >> TS_SHIFT; | ||
2043 | } else { | ||
2044 | /* try to discard, since we do not need this */ | ||
2045 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2046 | /* nope, just zero it */ | ||
2047 | event->time_delta = 0; | ||
2048 | event->array[0] = 0; | ||
2049 | } | ||
2050 | } | ||
2051 | cpu_buffer->write_stamp = *ts; | ||
2052 | /* let the caller know this was the commit */ | ||
2053 | ret = 1; | ||
2054 | } else { | ||
2055 | /* Try to discard the event */ | ||
2056 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2057 | /* Darn, this is just wasted space */ | ||
2058 | event->time_delta = 0; | ||
2059 | event->array[0] = 0; | ||
2060 | } | ||
2061 | ret = 0; | ||
2062 | } | ||
2063 | |||
2064 | *delta = 0; | ||
2065 | |||
2066 | return ret; | ||
2067 | } | ||
2068 | |||
2069 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2072 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2070 | { | 2073 | { |
2071 | local_inc(&cpu_buffer->committing); | 2074 | local_inc(&cpu_buffer->committing); |
2072 | local_inc(&cpu_buffer->commits); | 2075 | local_inc(&cpu_buffer->commits); |
2073 | } | 2076 | } |
2074 | 2077 | ||
2075 | static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2078 | static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2076 | { | 2079 | { |
2077 | unsigned long commits; | 2080 | unsigned long commits; |
2078 | 2081 | ||
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2110 | unsigned long length) | 2113 | unsigned long length) |
2111 | { | 2114 | { |
2112 | struct ring_buffer_event *event; | 2115 | struct ring_buffer_event *event; |
2113 | u64 ts, delta = 0; | 2116 | u64 ts, delta; |
2114 | int commit = 0; | ||
2115 | int nr_loops = 0; | 2117 | int nr_loops = 0; |
2118 | int add_timestamp; | ||
2119 | u64 diff; | ||
2116 | 2120 | ||
2117 | rb_start_commit(cpu_buffer); | 2121 | rb_start_commit(cpu_buffer); |
2118 | 2122 | ||
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2133 | 2137 | ||
2134 | length = rb_calculate_event_length(length); | 2138 | length = rb_calculate_event_length(length); |
2135 | again: | 2139 | again: |
2140 | add_timestamp = 0; | ||
2141 | delta = 0; | ||
2142 | |||
2136 | /* | 2143 | /* |
2137 | * We allow for interrupts to reenter here and do a trace. | 2144 | * We allow for interrupts to reenter here and do a trace. |
2138 | * If one does, it will cause this original code to loop | 2145 | * If one does, it will cause this original code to loop |
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2146 | goto out_fail; | 2153 | goto out_fail; |
2147 | 2154 | ||
2148 | ts = rb_time_stamp(cpu_buffer->buffer); | 2155 | ts = rb_time_stamp(cpu_buffer->buffer); |
2156 | diff = ts - cpu_buffer->write_stamp; | ||
2149 | 2157 | ||
2150 | /* | 2158 | /* make sure this diff is calculated here */ |
2151 | * Only the first commit can update the timestamp. | 2159 | barrier(); |
2152 | * Yes there is a race here. If an interrupt comes in | ||
2153 | * just after the conditional and it traces too, then it | ||
2154 | * will also check the deltas. More than one timestamp may | ||
2155 | * also be made. But only the entry that did the actual | ||
2156 | * commit will be something other than zero. | ||
2157 | */ | ||
2158 | if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && | ||
2159 | rb_page_write(cpu_buffer->tail_page) == | ||
2160 | rb_commit_index(cpu_buffer))) { | ||
2161 | u64 diff; | ||
2162 | |||
2163 | diff = ts - cpu_buffer->write_stamp; | ||
2164 | |||
2165 | /* make sure this diff is calculated here */ | ||
2166 | barrier(); | ||
2167 | |||
2168 | /* Did the write stamp get updated already? */ | ||
2169 | if (unlikely(ts < cpu_buffer->write_stamp)) | ||
2170 | goto get_event; | ||
2171 | 2160 | ||
2161 | /* Did the write stamp get updated already? */ | ||
2162 | if (likely(ts >= cpu_buffer->write_stamp)) { | ||
2172 | delta = diff; | 2163 | delta = diff; |
2173 | if (unlikely(test_time_stamp(delta))) { | 2164 | if (unlikely(test_time_stamp(delta))) { |
2174 | 2165 | WARN_ONCE(delta > (1ULL << 59), | |
2175 | commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); | 2166 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", |
2176 | if (commit == -EBUSY) | 2167 | (unsigned long long)delta, |
2177 | goto out_fail; | 2168 | (unsigned long long)ts, |
2178 | 2169 | (unsigned long long)cpu_buffer->write_stamp); | |
2179 | if (commit == -EAGAIN) | 2170 | add_timestamp = 1; |
2180 | goto again; | ||
2181 | |||
2182 | RB_WARN_ON(cpu_buffer, commit < 0); | ||
2183 | } | 2171 | } |
2184 | } | 2172 | } |
2185 | 2173 | ||
2186 | get_event: | 2174 | event = __rb_reserve_next(cpu_buffer, length, ts, |
2187 | event = __rb_reserve_next(cpu_buffer, 0, length, &ts); | 2175 | delta, add_timestamp); |
2188 | if (unlikely(PTR_ERR(event) == -EAGAIN)) | 2176 | if (unlikely(PTR_ERR(event) == -EAGAIN)) |
2189 | goto again; | 2177 | goto again; |
2190 | 2178 | ||
2191 | if (!event) | 2179 | if (!event) |
2192 | goto out_fail; | 2180 | goto out_fail; |
2193 | 2181 | ||
2194 | if (!rb_event_is_commit(cpu_buffer, event)) | ||
2195 | delta = 0; | ||
2196 | |||
2197 | event->time_delta = delta; | ||
2198 | |||
2199 | return event; | 2182 | return event; |
2200 | 2183 | ||
2201 | out_fail: | 2184 | out_fail: |
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2207 | 2190 | ||
2208 | #define TRACE_RECURSIVE_DEPTH 16 | 2191 | #define TRACE_RECURSIVE_DEPTH 16 |
2209 | 2192 | ||
2210 | static int trace_recursive_lock(void) | 2193 | /* Keep this code out of the fast path cache */ |
2194 | static noinline void trace_recursive_fail(void) | ||
2211 | { | 2195 | { |
2212 | current->trace_recursion++; | ||
2213 | |||
2214 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | ||
2215 | return 0; | ||
2216 | |||
2217 | /* Disable all tracing before we do anything else */ | 2196 | /* Disable all tracing before we do anything else */ |
2218 | tracing_off_permanent(); | 2197 | tracing_off_permanent(); |
2219 | 2198 | ||
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void) | |||
2225 | in_nmi()); | 2204 | in_nmi()); |
2226 | 2205 | ||
2227 | WARN_ON_ONCE(1); | 2206 | WARN_ON_ONCE(1); |
2207 | } | ||
2208 | |||
2209 | static inline int trace_recursive_lock(void) | ||
2210 | { | ||
2211 | current->trace_recursion++; | ||
2212 | |||
2213 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | ||
2214 | return 0; | ||
2215 | |||
2216 | trace_recursive_fail(); | ||
2217 | |||
2228 | return -1; | 2218 | return -1; |
2229 | } | 2219 | } |
2230 | 2220 | ||
2231 | static void trace_recursive_unlock(void) | 2221 | static inline void trace_recursive_unlock(void) |
2232 | { | 2222 | { |
2233 | WARN_ON_ONCE(!current->trace_recursion); | 2223 | WARN_ON_ONCE(!current->trace_recursion); |
2234 | 2224 | ||
@@ -2308,12 +2298,28 @@ static void | |||
2308 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, | 2298 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, |
2309 | struct ring_buffer_event *event) | 2299 | struct ring_buffer_event *event) |
2310 | { | 2300 | { |
2301 | u64 delta; | ||
2302 | |||
2311 | /* | 2303 | /* |
2312 | * The event first in the commit queue updates the | 2304 | * The event first in the commit queue updates the |
2313 | * time stamp. | 2305 | * time stamp. |
2314 | */ | 2306 | */ |
2315 | if (rb_event_is_commit(cpu_buffer, event)) | 2307 | if (rb_event_is_commit(cpu_buffer, event)) { |
2316 | cpu_buffer->write_stamp += event->time_delta; | 2308 | /* |
2309 | * A commit event that is first on a page | ||
2310 | * updates the write timestamp with the page stamp | ||
2311 | */ | ||
2312 | if (!rb_event_index(event)) | ||
2313 | cpu_buffer->write_stamp = | ||
2314 | cpu_buffer->commit_page->page->time_stamp; | ||
2315 | else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
2316 | delta = event->array[0]; | ||
2317 | delta <<= TS_SHIFT; | ||
2318 | delta += event->time_delta; | ||
2319 | cpu_buffer->write_stamp += delta; | ||
2320 | } else | ||
2321 | cpu_buffer->write_stamp += event->time_delta; | ||
2322 | } | ||
2317 | } | 2323 | } |
2318 | 2324 | ||
2319 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | 2325 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, |
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); | |||
2353 | 2359 | ||
2354 | static inline void rb_event_discard(struct ring_buffer_event *event) | 2360 | static inline void rb_event_discard(struct ring_buffer_event *event) |
2355 | { | 2361 | { |
2362 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
2363 | event = skip_time_extend(event); | ||
2364 | |||
2356 | /* array[0] holds the actual length for the discarded event */ | 2365 | /* array[0] holds the actual length for the discarded event */ |
2357 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; | 2366 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; |
2358 | event->type_len = RINGBUF_TYPE_PADDING; | 2367 | event->type_len = RINGBUF_TYPE_PADDING; |
@@ -2606,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | |||
2606 | } | 2615 | } |
2607 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | 2616 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); |
2608 | 2617 | ||
2618 | /* | ||
2619 | * The total entries in the ring buffer is the running counter | ||
2620 | * of entries entered into the ring buffer, minus the sum of | ||
2621 | * the entries read from the ring buffer and the number of | ||
2622 | * entries that were overwritten. | ||
2623 | */ | ||
2624 | static inline unsigned long | ||
2625 | rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | ||
2626 | { | ||
2627 | return local_read(&cpu_buffer->entries) - | ||
2628 | (local_read(&cpu_buffer->overrun) + cpu_buffer->read); | ||
2629 | } | ||
2630 | |||
2609 | /** | 2631 | /** |
2610 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2632 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
2611 | * @buffer: The ring buffer | 2633 | * @buffer: The ring buffer |
@@ -2614,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | |||
2614 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | 2636 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) |
2615 | { | 2637 | { |
2616 | struct ring_buffer_per_cpu *cpu_buffer; | 2638 | struct ring_buffer_per_cpu *cpu_buffer; |
2617 | unsigned long ret; | ||
2618 | 2639 | ||
2619 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2640 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2620 | return 0; | 2641 | return 0; |
2621 | 2642 | ||
2622 | cpu_buffer = buffer->buffers[cpu]; | 2643 | cpu_buffer = buffer->buffers[cpu]; |
2623 | ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) | ||
2624 | - cpu_buffer->read; | ||
2625 | 2644 | ||
2626 | return ret; | 2645 | return rb_num_of_entries(cpu_buffer); |
2627 | } | 2646 | } |
2628 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 2647 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2629 | 2648 | ||
@@ -2684,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
2684 | /* if you care about this being correct, lock the buffer */ | 2703 | /* if you care about this being correct, lock the buffer */ |
2685 | for_each_buffer_cpu(buffer, cpu) { | 2704 | for_each_buffer_cpu(buffer, cpu) { |
2686 | cpu_buffer = buffer->buffers[cpu]; | 2705 | cpu_buffer = buffer->buffers[cpu]; |
2687 | entries += (local_read(&cpu_buffer->entries) - | 2706 | entries += rb_num_of_entries(cpu_buffer); |
2688 | local_read(&cpu_buffer->overrun)) - cpu_buffer->read; | ||
2689 | } | 2707 | } |
2690 | 2708 | ||
2691 | return entries; | 2709 | return entries; |
@@ -3040,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, | |||
3040 | 3058 | ||
3041 | again: | 3059 | again: |
3042 | /* | 3060 | /* |
3043 | * We repeat when a timestamp is encountered. It is possible | 3061 | * We repeat when a time extend is encountered. |
3044 | * to get multiple timestamps from an interrupt entering just | 3062 | * Since the time extend is always attached to a data event, |
3045 | * as one timestamp is about to be written, or from discarded | 3063 | * we should never loop more than once. |
3046 | * commits. The most that we can have is the number on a single page. | 3064 | * (We never hit the following condition more than twice). |
3047 | */ | 3065 | */ |
3048 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3066 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3049 | return NULL; | 3067 | return NULL; |
3050 | 3068 | ||
3051 | reader = rb_get_reader_page(cpu_buffer); | 3069 | reader = rb_get_reader_page(cpu_buffer); |
@@ -3121,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3121 | return NULL; | 3139 | return NULL; |
3122 | 3140 | ||
3123 | /* | 3141 | /* |
3124 | * We repeat when a timestamp is encountered. | 3142 | * We repeat when a time extend is encountered. |
3125 | * We can get multiple timestamps by nested interrupts or also | 3143 | * Since the time extend is always attached to a data event, |
3126 | * if filtering is on (discarding commits). Since discarding | 3144 | * we should never loop more than once. |
3127 | * commits can be frequent we can get a lot of timestamps. | 3145 | * (We never hit the following condition more than twice). |
3128 | * But we limit them by not adding timestamps if they begin | ||
3129 | * at the start of a page. | ||
3130 | */ | 3146 | */ |
3131 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3147 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3132 | return NULL; | 3148 | return NULL; |
3133 | 3149 | ||
3134 | if (rb_per_cpu_empty(cpu_buffer)) | 3150 | if (rb_per_cpu_empty(cpu_buffer)) |
@@ -3826,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3826 | if (len > (commit - read)) | 3842 | if (len > (commit - read)) |
3827 | len = (commit - read); | 3843 | len = (commit - read); |
3828 | 3844 | ||
3829 | size = rb_event_length(event); | 3845 | /* Always keep the time extend and data together */ |
3846 | size = rb_event_ts_length(event); | ||
3830 | 3847 | ||
3831 | if (len < size) | 3848 | if (len < size) |
3832 | goto out_unlock; | 3849 | goto out_unlock; |
@@ -3848,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3848 | break; | 3865 | break; |
3849 | 3866 | ||
3850 | event = rb_reader_event(cpu_buffer); | 3867 | event = rb_reader_event(cpu_buffer); |
3851 | size = rb_event_length(event); | 3868 | /* Always keep the time extend and data together */ |
3869 | size = rb_event_ts_length(event); | ||
3852 | } while (len > size); | 3870 | } while (len > size); |
3853 | 3871 | ||
3854 | /* update bpage */ | 3872 | /* update bpage */ |
@@ -3965,6 +3983,7 @@ static const struct file_operations rb_simple_fops = { | |||
3965 | .open = tracing_open_generic, | 3983 | .open = tracing_open_generic, |
3966 | .read = rb_simple_read, | 3984 | .read = rb_simple_read, |
3967 | .write = rb_simple_write, | 3985 | .write = rb_simple_write, |
3986 | .llseek = default_llseek, | ||
3968 | }; | 3987 | }; |
3969 | 3988 | ||
3970 | 3989 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..82d9b8106cd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
2196 | 2196 | ||
2197 | static int tracing_release(struct inode *inode, struct file *file) | 2197 | static int tracing_release(struct inode *inode, struct file *file) |
2198 | { | 2198 | { |
2199 | struct seq_file *m = (struct seq_file *)file->private_data; | 2199 | struct seq_file *m = file->private_data; |
2200 | struct trace_iterator *iter; | 2200 | struct trace_iterator *iter; |
2201 | int cpu; | 2201 | int cpu; |
2202 | 2202 | ||
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
3996 | { | 3996 | { |
3997 | struct dentry *d_percpu = tracing_dentry_percpu(); | 3997 | struct dentry *d_percpu = tracing_dentry_percpu(); |
3998 | struct dentry *d_cpu; | 3998 | struct dentry *d_cpu; |
3999 | /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ | 3999 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4000 | char cpu_dir[7]; | ||
4001 | 4000 | ||
4002 | if (cpu > 999 || cpu < 0) | 4001 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4003 | return; | ||
4004 | |||
4005 | sprintf(cpu_dir, "cpu%ld", cpu); | ||
4006 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4002 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4007 | if (!d_cpu) { | 4003 | if (!d_cpu) { |
4008 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 4004 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, | |||
343 | unsigned long ip, | 343 | unsigned long ip, |
344 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
345 | unsigned long flags, int pc); | 345 | unsigned long flags, int pc); |
346 | void trace_graph_function(struct trace_array *tr, | ||
347 | unsigned long ip, | ||
348 | unsigned long parent_ip, | ||
349 | unsigned long flags, int pc); | ||
346 | void trace_default_header(struct seq_file *m); | 350 | void trace_default_header(struct seq_file *m); |
347 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 351 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
348 | int trace_empty(struct trace_iterator *iter); | 352 | int trace_empty(struct trace_iterator *iter); |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 31cc4cb0dbf2..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | 11 | ||
12 | static char *perf_trace_buf[4]; | 12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Force it to be aligned to unsigned long to avoid misaligned accesses | 15 | * Force it to be aligned to unsigned long to avoid misaligned accesses |
@@ -24,7 +24,7 @@ static int total_ref_count; | |||
24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
26 | { | 26 | { |
27 | struct hlist_head *list; | 27 | struct hlist_head __percpu *list; |
28 | int ret = -ENOMEM; | 28 | int ret = -ENOMEM; |
29 | int cpu; | 29 | int cpu; |
30 | 30 | ||
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
42 | tp_event->perf_events = list; | 42 | tp_event->perf_events = list; |
43 | 43 | ||
44 | if (!total_ref_count) { | 44 | if (!total_ref_count) { |
45 | char *buf; | 45 | char __percpu *buf; |
46 | int i; | 46 | int i; |
47 | 47 | ||
48 | for (i = 0; i < 4; i++) { | 48 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
49 | buf = (char *)alloc_percpu(perf_trace_t); | 49 | buf = (char __percpu *)alloc_percpu(perf_trace_t); |
50 | if (!buf) | 50 | if (!buf) |
51 | goto fail; | 51 | goto fail; |
52 | 52 | ||
@@ -65,7 +65,7 @@ fail: | |||
65 | if (!total_ref_count) { | 65 | if (!total_ref_count) { |
66 | int i; | 66 | int i; |
67 | 67 | ||
68 | for (i = 0; i < 4; i++) { | 68 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
69 | free_percpu(perf_trace_buf[i]); | 69 | free_percpu(perf_trace_buf[i]); |
70 | perf_trace_buf[i] = NULL; | 70 | perf_trace_buf[i] = NULL; |
71 | } | 71 | } |
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) | |||
101 | return ret; | 101 | return ret; |
102 | } | 102 | } |
103 | 103 | ||
104 | int perf_trace_enable(struct perf_event *p_event) | 104 | int perf_trace_add(struct perf_event *p_event, int flags) |
105 | { | 105 | { |
106 | struct ftrace_event_call *tp_event = p_event->tp_event; | 106 | struct ftrace_event_call *tp_event = p_event->tp_event; |
107 | struct hlist_head __percpu *pcpu_list; | ||
107 | struct hlist_head *list; | 108 | struct hlist_head *list; |
108 | 109 | ||
109 | list = tp_event->perf_events; | 110 | pcpu_list = tp_event->perf_events; |
110 | if (WARN_ON_ONCE(!list)) | 111 | if (WARN_ON_ONCE(!pcpu_list)) |
111 | return -EINVAL; | 112 | return -EINVAL; |
112 | 113 | ||
113 | list = this_cpu_ptr(list); | 114 | if (!(flags & PERF_EF_START)) |
115 | p_event->hw.state = PERF_HES_STOPPED; | ||
116 | |||
117 | list = this_cpu_ptr(pcpu_list); | ||
114 | hlist_add_head_rcu(&p_event->hlist_entry, list); | 118 | hlist_add_head_rcu(&p_event->hlist_entry, list); |
115 | 119 | ||
116 | return 0; | 120 | return 0; |
117 | } | 121 | } |
118 | 122 | ||
119 | void perf_trace_disable(struct perf_event *p_event) | 123 | void perf_trace_del(struct perf_event *p_event, int flags) |
120 | { | 124 | { |
121 | hlist_del_rcu(&p_event->hlist_entry); | 125 | hlist_del_rcu(&p_event->hlist_entry); |
122 | } | 126 | } |
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
142 | tp_event->perf_events = NULL; | 146 | tp_event->perf_events = NULL; |
143 | 147 | ||
144 | if (!--total_ref_count) { | 148 | if (!--total_ref_count) { |
145 | for (i = 0; i < 4; i++) { | 149 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
146 | free_percpu(perf_trace_buf[i]); | 150 | free_percpu(perf_trace_buf[i]); |
147 | perf_trace_buf[i] = NULL; | 151 | perf_trace_buf[i] = NULL; |
148 | } | 152 | } |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..0725eeab1937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -600,21 +600,29 @@ out: | |||
600 | 600 | ||
601 | enum { | 601 | enum { |
602 | FORMAT_HEADER = 1, | 602 | FORMAT_HEADER = 1, |
603 | FORMAT_PRINTFMT = 2, | 603 | FORMAT_FIELD_SEPERATOR = 2, |
604 | FORMAT_PRINTFMT = 3, | ||
604 | }; | 605 | }; |
605 | 606 | ||
606 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | 607 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) |
607 | { | 608 | { |
608 | struct ftrace_event_call *call = m->private; | 609 | struct ftrace_event_call *call = m->private; |
609 | struct ftrace_event_field *field; | 610 | struct ftrace_event_field *field; |
610 | struct list_head *head; | 611 | struct list_head *common_head = &ftrace_common_fields; |
612 | struct list_head *head = trace_get_fields(call); | ||
611 | 613 | ||
612 | (*pos)++; | 614 | (*pos)++; |
613 | 615 | ||
614 | switch ((unsigned long)v) { | 616 | switch ((unsigned long)v) { |
615 | case FORMAT_HEADER: | 617 | case FORMAT_HEADER: |
616 | head = &ftrace_common_fields; | 618 | if (unlikely(list_empty(common_head))) |
619 | return NULL; | ||
620 | |||
621 | field = list_entry(common_head->prev, | ||
622 | struct ftrace_event_field, link); | ||
623 | return field; | ||
617 | 624 | ||
625 | case FORMAT_FIELD_SEPERATOR: | ||
618 | if (unlikely(list_empty(head))) | 626 | if (unlikely(list_empty(head))) |
619 | return NULL; | 627 | return NULL; |
620 | 628 | ||
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) | |||
626 | return NULL; | 634 | return NULL; |
627 | } | 635 | } |
628 | 636 | ||
629 | head = trace_get_fields(call); | ||
630 | |||
631 | /* | ||
632 | * To separate common fields from event fields, the | ||
633 | * LSB is set on the first event field. Clear it in case. | ||
634 | */ | ||
635 | v = (void *)((unsigned long)v & ~1L); | ||
636 | |||
637 | field = v; | 637 | field = v; |
638 | /* | 638 | if (field->link.prev == common_head) |
639 | * If this is a common field, and at the end of the list, then | 639 | return (void *)FORMAT_FIELD_SEPERATOR; |
640 | * continue with main list. | 640 | else if (field->link.prev == head) |
641 | */ | ||
642 | if (field->link.prev == &ftrace_common_fields) { | ||
643 | if (unlikely(list_empty(head))) | ||
644 | return NULL; | ||
645 | field = list_entry(head->prev, struct ftrace_event_field, link); | ||
646 | /* Set the LSB to notify f_show to print an extra newline */ | ||
647 | field = (struct ftrace_event_field *) | ||
648 | ((unsigned long)field | 1); | ||
649 | return field; | ||
650 | } | ||
651 | |||
652 | /* If we are done tell f_show to print the format */ | ||
653 | if (field->link.prev == head) | ||
654 | return (void *)FORMAT_PRINTFMT; | 641 | return (void *)FORMAT_PRINTFMT; |
655 | 642 | ||
656 | field = list_entry(field->link.prev, struct ftrace_event_field, link); | 643 | field = list_entry(field->link.prev, struct ftrace_event_field, link); |
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) | |||
688 | seq_printf(m, "format:\n"); | 675 | seq_printf(m, "format:\n"); |
689 | return 0; | 676 | return 0; |
690 | 677 | ||
678 | case FORMAT_FIELD_SEPERATOR: | ||
679 | seq_putc(m, '\n'); | ||
680 | return 0; | ||
681 | |||
691 | case FORMAT_PRINTFMT: | 682 | case FORMAT_PRINTFMT: |
692 | seq_printf(m, "\nprint fmt: %s\n", | 683 | seq_printf(m, "\nprint fmt: %s\n", |
693 | call->print_fmt); | 684 | call->print_fmt); |
694 | return 0; | 685 | return 0; |
695 | } | 686 | } |
696 | 687 | ||
697 | /* | ||
698 | * To separate common fields from event fields, the | ||
699 | * LSB is set on the first event field. Clear it and | ||
700 | * print a newline if it is set. | ||
701 | */ | ||
702 | if ((unsigned long)v & 1) { | ||
703 | seq_putc(m, '\n'); | ||
704 | v = (void *)((unsigned long)v & ~1L); | ||
705 | } | ||
706 | |||
707 | field = v; | 688 | field = v; |
708 | 689 | ||
709 | /* | 690 | /* |
@@ -951,6 +932,7 @@ static const struct file_operations ftrace_enable_fops = { | |||
951 | .open = tracing_open_generic, | 932 | .open = tracing_open_generic, |
952 | .read = event_enable_read, | 933 | .read = event_enable_read, |
953 | .write = event_enable_write, | 934 | .write = event_enable_write, |
935 | .llseek = default_llseek, | ||
954 | }; | 936 | }; |
955 | 937 | ||
956 | static const struct file_operations ftrace_event_format_fops = { | 938 | static const struct file_operations ftrace_event_format_fops = { |
@@ -963,29 +945,34 @@ static const struct file_operations ftrace_event_format_fops = { | |||
963 | static const struct file_operations ftrace_event_id_fops = { | 945 | static const struct file_operations ftrace_event_id_fops = { |
964 | .open = tracing_open_generic, | 946 | .open = tracing_open_generic, |
965 | .read = event_id_read, | 947 | .read = event_id_read, |
948 | .llseek = default_llseek, | ||
966 | }; | 949 | }; |
967 | 950 | ||
968 | static const struct file_operations ftrace_event_filter_fops = { | 951 | static const struct file_operations ftrace_event_filter_fops = { |
969 | .open = tracing_open_generic, | 952 | .open = tracing_open_generic, |
970 | .read = event_filter_read, | 953 | .read = event_filter_read, |
971 | .write = event_filter_write, | 954 | .write = event_filter_write, |
955 | .llseek = default_llseek, | ||
972 | }; | 956 | }; |
973 | 957 | ||
974 | static const struct file_operations ftrace_subsystem_filter_fops = { | 958 | static const struct file_operations ftrace_subsystem_filter_fops = { |
975 | .open = tracing_open_generic, | 959 | .open = tracing_open_generic, |
976 | .read = subsystem_filter_read, | 960 | .read = subsystem_filter_read, |
977 | .write = subsystem_filter_write, | 961 | .write = subsystem_filter_write, |
962 | .llseek = default_llseek, | ||
978 | }; | 963 | }; |
979 | 964 | ||
980 | static const struct file_operations ftrace_system_enable_fops = { | 965 | static const struct file_operations ftrace_system_enable_fops = { |
981 | .open = tracing_open_generic, | 966 | .open = tracing_open_generic, |
982 | .read = system_enable_read, | 967 | .read = system_enable_read, |
983 | .write = system_enable_write, | 968 | .write = system_enable_write, |
969 | .llseek = default_llseek, | ||
984 | }; | 970 | }; |
985 | 971 | ||
986 | static const struct file_operations ftrace_show_header_fops = { | 972 | static const struct file_operations ftrace_show_header_fops = { |
987 | .open = tracing_open_generic, | 973 | .open = tracing_open_generic, |
988 | .read = show_header, | 974 | .read = show_header, |
975 | .llseek = default_llseek, | ||
989 | }; | 976 | }; |
990 | 977 | ||
991 | static struct dentry *event_trace_events_dir(void) | 978 | static struct dentry *event_trace_events_dir(void) |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -15,15 +15,19 @@ | |||
15 | #include "trace.h" | 15 | #include "trace.h" |
16 | #include "trace_output.h" | 16 | #include "trace_output.h" |
17 | 17 | ||
18 | /* When set, irq functions will be ignored */ | ||
19 | static int ftrace_graph_skip_irqs; | ||
20 | |||
18 | struct fgraph_cpu_data { | 21 | struct fgraph_cpu_data { |
19 | pid_t last_pid; | 22 | pid_t last_pid; |
20 | int depth; | 23 | int depth; |
24 | int depth_irq; | ||
21 | int ignore; | 25 | int ignore; |
22 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | 26 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; |
23 | }; | 27 | }; |
24 | 28 | ||
25 | struct fgraph_data { | 29 | struct fgraph_data { |
26 | struct fgraph_cpu_data *cpu_data; | 30 | struct fgraph_cpu_data __percpu *cpu_data; |
27 | 31 | ||
28 | /* Place to preserve last processed entry. */ | 32 | /* Place to preserve last processed entry. */ |
29 | struct ftrace_graph_ent_entry ent; | 33 | struct ftrace_graph_ent_entry ent; |
@@ -41,6 +45,7 @@ struct fgraph_data { | |||
41 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 45 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
42 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
43 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
44 | 49 | ||
45 | static struct tracer_opt trace_opts[] = { | 50 | static struct tracer_opt trace_opts[] = { |
46 | /* Display overruns? (for self-debug purpose) */ | 51 | /* Display overruns? (for self-debug purpose) */ |
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { | |||
55 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, | 60 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, |
56 | /* Display absolute time of an entry */ | 61 | /* Display absolute time of an entry */ |
57 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 62 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
63 | /* Display interrupts */ | ||
64 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | ||
58 | { } /* Empty entry */ | 65 | { } /* Empty entry */ |
59 | }; | 66 | }; |
60 | 67 | ||
61 | static struct tracer_flags tracer_flags = { | 68 | static struct tracer_flags tracer_flags = { |
62 | /* Don't display overruns and proc by default */ | 69 | /* Don't display overruns and proc by default */ |
63 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 70 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
64 | TRACE_GRAPH_PRINT_DURATION, | 71 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
65 | .opts = trace_opts | 72 | .opts = trace_opts |
66 | }; | 73 | }; |
67 | 74 | ||
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, | |||
204 | return 1; | 211 | return 1; |
205 | } | 212 | } |
206 | 213 | ||
214 | static inline int ftrace_graph_ignore_irqs(void) | ||
215 | { | ||
216 | if (!ftrace_graph_skip_irqs) | ||
217 | return 0; | ||
218 | |||
219 | return in_irq(); | ||
220 | } | ||
221 | |||
207 | int trace_graph_entry(struct ftrace_graph_ent *trace) | 222 | int trace_graph_entry(struct ftrace_graph_ent *trace) |
208 | { | 223 | { |
209 | struct trace_array *tr = graph_array; | 224 | struct trace_array *tr = graph_array; |
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
218 | return 0; | 233 | return 0; |
219 | 234 | ||
220 | /* trace it when it is-nested-in or is a function enabled. */ | 235 | /* trace it when it is-nested-in or is a function enabled. */ |
221 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | 236 | if (!(trace->depth || ftrace_graph_addr(trace->func)) || |
237 | ftrace_graph_ignore_irqs()) | ||
222 | return 0; | 238 | return 0; |
223 | 239 | ||
224 | local_irq_save(flags); | 240 | local_irq_save(flags); |
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | |||
246 | return trace_graph_entry(trace); | 262 | return trace_graph_entry(trace); |
247 | } | 263 | } |
248 | 264 | ||
265 | static void | ||
266 | __trace_graph_function(struct trace_array *tr, | ||
267 | unsigned long ip, unsigned long flags, int pc) | ||
268 | { | ||
269 | u64 time = trace_clock_local(); | ||
270 | struct ftrace_graph_ent ent = { | ||
271 | .func = ip, | ||
272 | .depth = 0, | ||
273 | }; | ||
274 | struct ftrace_graph_ret ret = { | ||
275 | .func = ip, | ||
276 | .depth = 0, | ||
277 | .calltime = time, | ||
278 | .rettime = time, | ||
279 | }; | ||
280 | |||
281 | __trace_graph_entry(tr, &ent, flags, pc); | ||
282 | __trace_graph_return(tr, &ret, flags, pc); | ||
283 | } | ||
284 | |||
285 | void | ||
286 | trace_graph_function(struct trace_array *tr, | ||
287 | unsigned long ip, unsigned long parent_ip, | ||
288 | unsigned long flags, int pc) | ||
289 | { | ||
290 | __trace_graph_function(tr, ip, flags, pc); | ||
291 | } | ||
292 | |||
249 | void __trace_graph_return(struct trace_array *tr, | 293 | void __trace_graph_return(struct trace_array *tr, |
250 | struct ftrace_graph_ret *trace, | 294 | struct ftrace_graph_ret *trace, |
251 | unsigned long flags, | 295 | unsigned long flags, |
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
649 | 693 | ||
650 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 694 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
651 | if (len < 7) { | 695 | if (len < 7) { |
652 | snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", | 696 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
653 | nsecs_rem); | 697 | |
698 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | ||
654 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 699 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
655 | if (!ret) | 700 | if (!ret) |
656 | return TRACE_TYPE_PARTIAL_LINE; | 701 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
855 | return 0; | 900 | return 0; |
856 | } | 901 | } |
857 | 902 | ||
903 | /* | ||
904 | * Entry check for irq code | ||
905 | * | ||
906 | * returns 1 if | ||
907 | * - we are inside irq code | ||
908 | * - we just entered irq code | ||
909 | * | ||
910 | * returns 0 if | ||
911 | * - funcgraph-irqs option is set | ||
912 | * - we are not inside irq code | ||
913 | */ | ||
914 | static int | ||
915 | check_irq_entry(struct trace_iterator *iter, u32 flags, | ||
916 | unsigned long addr, int depth) | ||
917 | { | ||
918 | int cpu = iter->cpu; | ||
919 | int *depth_irq; | ||
920 | struct fgraph_data *data = iter->private; | ||
921 | |||
922 | /* | ||
923 | * If we are either displaying irqs, or we got called as | ||
924 | * a graph event and private data does not exist, | ||
925 | * then we bypass the irq check. | ||
926 | */ | ||
927 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
928 | (!data)) | ||
929 | return 0; | ||
930 | |||
931 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
932 | |||
933 | /* | ||
934 | * We are inside the irq code | ||
935 | */ | ||
936 | if (*depth_irq >= 0) | ||
937 | return 1; | ||
938 | |||
939 | if ((addr < (unsigned long)__irqentry_text_start) || | ||
940 | (addr >= (unsigned long)__irqentry_text_end)) | ||
941 | return 0; | ||
942 | |||
943 | /* | ||
944 | * We are entering irq code. | ||
945 | */ | ||
946 | *depth_irq = depth; | ||
947 | return 1; | ||
948 | } | ||
949 | |||
950 | /* | ||
951 | * Return check for irq code | ||
952 | * | ||
953 | * returns 1 if | ||
954 | * - we are inside irq code | ||
955 | * - we just left irq code | ||
956 | * | ||
957 | * returns 0 if | ||
958 | * - funcgraph-irqs option is set | ||
959 | * - we are not inside irq code | ||
960 | */ | ||
961 | static int | ||
962 | check_irq_return(struct trace_iterator *iter, u32 flags, int depth) | ||
963 | { | ||
964 | int cpu = iter->cpu; | ||
965 | int *depth_irq; | ||
966 | struct fgraph_data *data = iter->private; | ||
967 | |||
968 | /* | ||
969 | * If we are either displaying irqs, or we got called as | ||
970 | * a graph event and private data does not exist, | ||
971 | * then we bypass the irq check. | ||
972 | */ | ||
973 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
974 | (!data)) | ||
975 | return 0; | ||
976 | |||
977 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
978 | |||
979 | /* | ||
980 | * We are not inside the irq code. | ||
981 | */ | ||
982 | if (*depth_irq == -1) | ||
983 | return 0; | ||
984 | |||
985 | /* | ||
986 | * We are inside the irq code, and this is returning entry. | ||
987 | * Let's not trace it and clear the entry depth, since | ||
988 | * we are out of irq code. | ||
989 | * | ||
990 | * This condition ensures that we 'leave the irq code' once | ||
991 | * we are out of the entry depth. Thus protecting us from | ||
992 | * the RETURN entry loss. | ||
993 | */ | ||
994 | if (*depth_irq >= depth) { | ||
995 | *depth_irq = -1; | ||
996 | return 1; | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * We are inside the irq code, and this is not the entry. | ||
1001 | */ | ||
1002 | return 1; | ||
1003 | } | ||
1004 | |||
858 | static enum print_line_t | 1005 | static enum print_line_t |
859 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | 1006 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, |
860 | struct trace_iterator *iter, u32 flags) | 1007 | struct trace_iterator *iter, u32 flags) |
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
865 | static enum print_line_t ret; | 1012 | static enum print_line_t ret; |
866 | int cpu = iter->cpu; | 1013 | int cpu = iter->cpu; |
867 | 1014 | ||
1015 | if (check_irq_entry(iter, flags, call->func, call->depth)) | ||
1016 | return TRACE_TYPE_HANDLED; | ||
1017 | |||
868 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1018 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) |
869 | return TRACE_TYPE_PARTIAL_LINE; | 1019 | return TRACE_TYPE_PARTIAL_LINE; |
870 | 1020 | ||
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
902 | int ret; | 1052 | int ret; |
903 | int i; | 1053 | int i; |
904 | 1054 | ||
1055 | if (check_irq_return(iter, flags, trace->depth)) | ||
1056 | return TRACE_TYPE_HANDLED; | ||
1057 | |||
905 | if (data) { | 1058 | if (data) { |
906 | struct fgraph_cpu_data *cpu_data; | 1059 | struct fgraph_cpu_data *cpu_data; |
907 | int cpu = iter->cpu; | 1060 | int cpu = iter->cpu; |
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1054 | 1207 | ||
1055 | 1208 | ||
1056 | enum print_line_t | 1209 | enum print_line_t |
1057 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
1058 | { | 1211 | { |
1059 | struct ftrace_graph_ent_entry *field; | 1212 | struct ftrace_graph_ent_entry *field; |
1060 | struct fgraph_data *data = iter->private; | 1213 | struct fgraph_data *data = iter->private; |
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
1117 | static enum print_line_t | 1270 | static enum print_line_t |
1118 | print_graph_function(struct trace_iterator *iter) | 1271 | print_graph_function(struct trace_iterator *iter) |
1119 | { | 1272 | { |
1120 | return print_graph_function_flags(iter, tracer_flags.val); | 1273 | return __print_graph_function_flags(iter, tracer_flags.val); |
1274 | } | ||
1275 | |||
1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
1277 | u32 flags) | ||
1278 | { | ||
1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1281 | else | ||
1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1283 | |||
1284 | return __print_graph_function_flags(iter, flags); | ||
1121 | } | 1285 | } |
1122 | 1286 | ||
1123 | static enum print_line_t | 1287 | static enum print_line_t |
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
1149 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | 1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); |
1150 | } | 1314 | } |
1151 | 1315 | ||
1152 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | 1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
1153 | { | 1317 | { |
1154 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; | 1318 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; |
1155 | 1319 | ||
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) | |||
1190 | print_graph_headers_flags(s, tracer_flags.val); | 1354 | print_graph_headers_flags(s, tracer_flags.val); |
1191 | } | 1355 | } |
1192 | 1356 | ||
1357 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | ||
1358 | { | ||
1359 | struct trace_iterator *iter = s->private; | ||
1360 | |||
1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
1362 | /* print nothing if the buffers are empty */ | ||
1363 | if (trace_empty(iter)) | ||
1364 | return; | ||
1365 | |||
1366 | print_trace_header(s, iter); | ||
1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1368 | } else | ||
1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1370 | |||
1371 | __print_graph_headers_flags(s, flags); | ||
1372 | } | ||
1373 | |||
1193 | void graph_trace_open(struct trace_iterator *iter) | 1374 | void graph_trace_open(struct trace_iterator *iter) |
1194 | { | 1375 | { |
1195 | /* pid and depth on the last trace processed */ | 1376 | /* pid and depth on the last trace processed */ |
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) | |||
1210 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 1391 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
1211 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 1392 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); |
1212 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); | 1393 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); |
1394 | int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
1395 | |||
1213 | *pid = -1; | 1396 | *pid = -1; |
1214 | *depth = 0; | 1397 | *depth = 0; |
1215 | *ignore = 0; | 1398 | *ignore = 0; |
1399 | *depth_irq = -1; | ||
1216 | } | 1400 | } |
1217 | 1401 | ||
1218 | iter->private = data; | 1402 | iter->private = data; |
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) | |||
1235 | } | 1419 | } |
1236 | } | 1420 | } |
1237 | 1421 | ||
1422 | static int func_graph_set_flag(u32 old_flags, u32 bit, int set) | ||
1423 | { | ||
1424 | if (bit == TRACE_GRAPH_PRINT_IRQS) | ||
1425 | ftrace_graph_skip_irqs = !set; | ||
1426 | |||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
1238 | static struct trace_event_functions graph_functions = { | 1430 | static struct trace_event_functions graph_functions = { |
1239 | .trace = print_graph_function_event, | 1431 | .trace = print_graph_function_event, |
1240 | }; | 1432 | }; |
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { | |||
1261 | .print_line = print_graph_function, | 1453 | .print_line = print_graph_function, |
1262 | .print_header = print_graph_headers, | 1454 | .print_header = print_graph_headers, |
1263 | .flags = &tracer_flags, | 1455 | .flags = &tracer_flags, |
1456 | .set_flag = func_graph_set_flag, | ||
1264 | #ifdef CONFIG_FTRACE_SELFTEST | 1457 | #ifdef CONFIG_FTRACE_SELFTEST |
1265 | .selftest = trace_selftest_startup_function_graph, | 1458 | .selftest = trace_selftest_startup_function_graph, |
1266 | #endif | 1459 | #endif |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; | |||
87 | 87 | ||
88 | #ifdef CONFIG_FUNCTION_TRACER | 88 | #ifdef CONFIG_FUNCTION_TRACER |
89 | /* | 89 | /* |
90 | * irqsoff uses its own tracer function to keep the overhead down: | 90 | * Prologue for the preempt and irqs off function tracers. |
91 | * | ||
92 | * Returns 1 if it is OK to continue, and data->disabled is | ||
93 | * incremented. | ||
94 | * 0 if the trace is to be ignored, and data->disabled | ||
95 | * is kept the same. | ||
96 | * | ||
97 | * Note, this function is also used outside this ifdef but | ||
98 | * inside the #ifdef of the function graph tracer below. | ||
99 | * This is OK, since the function graph tracer is | ||
100 | * dependent on the function tracer. | ||
91 | */ | 101 | */ |
92 | static void | 102 | static int func_prolog_dec(struct trace_array *tr, |
93 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 103 | struct trace_array_cpu **data, |
104 | unsigned long *flags) | ||
94 | { | 105 | { |
95 | struct trace_array *tr = irqsoff_trace; | ||
96 | struct trace_array_cpu *data; | ||
97 | unsigned long flags; | ||
98 | long disabled; | 106 | long disabled; |
99 | int cpu; | 107 | int cpu; |
100 | 108 | ||
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
106 | */ | 114 | */ |
107 | cpu = raw_smp_processor_id(); | 115 | cpu = raw_smp_processor_id(); |
108 | if (likely(!per_cpu(tracing_cpu, cpu))) | 116 | if (likely(!per_cpu(tracing_cpu, cpu))) |
109 | return; | 117 | return 0; |
110 | 118 | ||
111 | local_save_flags(flags); | 119 | local_save_flags(*flags); |
112 | /* slight chance to get a false positive on tracing_cpu */ | 120 | /* slight chance to get a false positive on tracing_cpu */ |
113 | if (!irqs_disabled_flags(flags)) | 121 | if (!irqs_disabled_flags(*flags)) |
114 | return; | 122 | return 0; |
115 | 123 | ||
116 | data = tr->data[cpu]; | 124 | *data = tr->data[cpu]; |
117 | disabled = atomic_inc_return(&data->disabled); | 125 | disabled = atomic_inc_return(&(*data)->disabled); |
118 | 126 | ||
119 | if (likely(disabled == 1)) | 127 | if (likely(disabled == 1)) |
120 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | 128 | return 1; |
129 | |||
130 | atomic_dec(&(*data)->disabled); | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * irqsoff uses its own tracer function to keep the overhead down: | ||
137 | */ | ||
138 | static void | ||
139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
140 | { | ||
141 | struct trace_array *tr = irqsoff_trace; | ||
142 | struct trace_array_cpu *data; | ||
143 | unsigned long flags; | ||
144 | |||
145 | if (!func_prolog_dec(tr, &data, &flags)) | ||
146 | return; | ||
147 | |||
148 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | ||
121 | 149 | ||
122 | atomic_dec(&data->disabled); | 150 | atomic_dec(&data->disabled); |
123 | } | 151 | } |
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) | |||
155 | struct trace_array *tr = irqsoff_trace; | 183 | struct trace_array *tr = irqsoff_trace; |
156 | struct trace_array_cpu *data; | 184 | struct trace_array_cpu *data; |
157 | unsigned long flags; | 185 | unsigned long flags; |
158 | long disabled; | ||
159 | int ret; | 186 | int ret; |
160 | int cpu; | ||
161 | int pc; | 187 | int pc; |
162 | 188 | ||
163 | cpu = raw_smp_processor_id(); | 189 | if (!func_prolog_dec(tr, &data, &flags)) |
164 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
165 | return 0; | 190 | return 0; |
166 | 191 | ||
167 | local_save_flags(flags); | 192 | pc = preempt_count(); |
168 | /* slight chance to get a false positive on tracing_cpu */ | 193 | ret = __trace_graph_entry(tr, trace, flags, pc); |
169 | if (!irqs_disabled_flags(flags)) | ||
170 | return 0; | ||
171 | |||
172 | data = tr->data[cpu]; | ||
173 | disabled = atomic_inc_return(&data->disabled); | ||
174 | |||
175 | if (likely(disabled == 1)) { | ||
176 | pc = preempt_count(); | ||
177 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
178 | } else | ||
179 | ret = 0; | ||
180 | |||
181 | atomic_dec(&data->disabled); | 194 | atomic_dec(&data->disabled); |
195 | |||
182 | return ret; | 196 | return ret; |
183 | } | 197 | } |
184 | 198 | ||
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) | |||
187 | struct trace_array *tr = irqsoff_trace; | 201 | struct trace_array *tr = irqsoff_trace; |
188 | struct trace_array_cpu *data; | 202 | struct trace_array_cpu *data; |
189 | unsigned long flags; | 203 | unsigned long flags; |
190 | long disabled; | ||
191 | int cpu; | ||
192 | int pc; | 204 | int pc; |
193 | 205 | ||
194 | cpu = raw_smp_processor_id(); | 206 | if (!func_prolog_dec(tr, &data, &flags)) |
195 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
196 | return; | 207 | return; |
197 | 208 | ||
198 | local_save_flags(flags); | 209 | pc = preempt_count(); |
199 | /* slight chance to get a false positive on tracing_cpu */ | 210 | __trace_graph_return(tr, trace, flags, pc); |
200 | if (!irqs_disabled_flags(flags)) | ||
201 | return; | ||
202 | |||
203 | data = tr->data[cpu]; | ||
204 | disabled = atomic_inc_return(&data->disabled); | ||
205 | |||
206 | if (likely(disabled == 1)) { | ||
207 | pc = preempt_count(); | ||
208 | __trace_graph_return(tr, trace, flags, pc); | ||
209 | } | ||
210 | |||
211 | atomic_dec(&data->disabled); | 211 | atomic_dec(&data->disabled); |
212 | } | 212 | } |
213 | 213 | ||
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
229 | 229 | ||
230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
231 | { | 231 | { |
232 | u32 flags = GRAPH_TRACER_FLAGS; | ||
233 | |||
234 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
235 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
236 | else | ||
237 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
238 | |||
239 | /* | 232 | /* |
240 | * In graph mode call the graph tracer output function, | 233 | * In graph mode call the graph tracer output function, |
241 | * otherwise go with the TRACE_FN event handler | 234 | * otherwise go with the TRACE_FN event handler |
242 | */ | 235 | */ |
243 | if (is_graph()) | 236 | if (is_graph()) |
244 | return print_graph_function_flags(iter, flags); | 237 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); |
245 | 238 | ||
246 | return TRACE_TYPE_UNHANDLED; | 239 | return TRACE_TYPE_UNHANDLED; |
247 | } | 240 | } |
248 | 241 | ||
249 | static void irqsoff_print_header(struct seq_file *s) | 242 | static void irqsoff_print_header(struct seq_file *s) |
250 | { | 243 | { |
251 | if (is_graph()) { | 244 | if (is_graph()) |
252 | struct trace_iterator *iter = s->private; | 245 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); |
253 | u32 flags = GRAPH_TRACER_FLAGS; | 246 | else |
254 | |||
255 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
256 | /* print nothing if the buffers are empty */ | ||
257 | if (trace_empty(iter)) | ||
258 | return; | ||
259 | |||
260 | print_trace_header(s, iter); | ||
261 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
262 | } else | ||
263 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
264 | |||
265 | print_graph_headers_flags(s, flags); | ||
266 | } else | ||
267 | trace_default_header(s); | 247 | trace_default_header(s); |
268 | } | 248 | } |
269 | 249 | ||
270 | static void | 250 | static void |
271 | trace_graph_function(struct trace_array *tr, | ||
272 | unsigned long ip, unsigned long flags, int pc) | ||
273 | { | ||
274 | u64 time = trace_clock_local(); | ||
275 | struct ftrace_graph_ent ent = { | ||
276 | .func = ip, | ||
277 | .depth = 0, | ||
278 | }; | ||
279 | struct ftrace_graph_ret ret = { | ||
280 | .func = ip, | ||
281 | .depth = 0, | ||
282 | .calltime = time, | ||
283 | .rettime = time, | ||
284 | }; | ||
285 | |||
286 | __trace_graph_entry(tr, &ent, flags, pc); | ||
287 | __trace_graph_return(tr, &ret, flags, pc); | ||
288 | } | ||
289 | |||
290 | static void | ||
291 | __trace_function(struct trace_array *tr, | 251 | __trace_function(struct trace_array *tr, |
292 | unsigned long ip, unsigned long parent_ip, | 252 | unsigned long ip, unsigned long parent_ip, |
293 | unsigned long flags, int pc) | 253 | unsigned long flags, int pc) |
294 | { | 254 | { |
295 | if (!is_graph()) | 255 | if (is_graph()) |
256 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
257 | else | ||
296 | trace_function(tr, ip, parent_ip, flags, pc); | 258 | trace_function(tr, ip, parent_ip, flags, pc); |
297 | else { | ||
298 | trace_graph_function(tr, parent_ip, flags, pc); | ||
299 | trace_graph_function(tr, ip, flags, pc); | ||
300 | } | ||
301 | } | 259 | } |
302 | 260 | ||
303 | #else | 261 | #else |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 7b8ecd751d93..3c5c5dfea0b3 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/kdb.h> | 13 | #include <linux/kdb.h> |
14 | #include <linux/ftrace.h> | 14 | #include <linux/ftrace.h> |
15 | 15 | ||
16 | #include "../debug/kdb/kdb_private.h" | ||
17 | #include "trace.h" | 16 | #include "trace.h" |
18 | #include "trace_output.h" | 17 | #include "trace_output.h" |
19 | 18 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 544301d29dee..2dec9bcde8b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
32 | #include <linux/stringify.h> | 32 | #include <linux/stringify.h> |
33 | #include <linux/limits.h> | 33 | #include <linux/limits.h> |
34 | #include <linux/uaccess.h> | ||
35 | #include <asm/bitsperlong.h> | 34 | #include <asm/bitsperlong.h> |
36 | 35 | ||
37 | #include "trace.h" | 36 | #include "trace.h" |
@@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp) | |||
648 | } | 647 | } |
649 | ret = register_probe_event(tp); | 648 | ret = register_probe_event(tp); |
650 | if (ret) { | 649 | if (ret) { |
651 | pr_warning("Faild to register probe event(%d)\n", ret); | 650 | pr_warning("Failed to register probe event(%d)\n", ret); |
652 | goto end; | 651 | goto end; |
653 | } | 652 | } |
654 | 653 | ||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -31,48 +31,98 @@ static int wakeup_rt; | |||
31 | static arch_spinlock_t wakeup_lock = | 31 | static arch_spinlock_t wakeup_lock = |
32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
33 | 33 | ||
34 | static void wakeup_reset(struct trace_array *tr); | ||
34 | static void __wakeup_reset(struct trace_array *tr); | 35 | static void __wakeup_reset(struct trace_array *tr); |
36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); | ||
37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); | ||
35 | 38 | ||
36 | static int save_lat_flag; | 39 | static int save_lat_flag; |
37 | 40 | ||
41 | #define TRACE_DISPLAY_GRAPH 1 | ||
42 | |||
43 | static struct tracer_opt trace_opts[] = { | ||
44 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
45 | /* display latency trace as call graph */ | ||
46 | { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, | ||
47 | #endif | ||
48 | { } /* Empty entry */ | ||
49 | }; | ||
50 | |||
51 | static struct tracer_flags tracer_flags = { | ||
52 | .val = 0, | ||
53 | .opts = trace_opts, | ||
54 | }; | ||
55 | |||
56 | #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) | ||
57 | |||
38 | #ifdef CONFIG_FUNCTION_TRACER | 58 | #ifdef CONFIG_FUNCTION_TRACER |
59 | |||
39 | /* | 60 | /* |
40 | * irqsoff uses its own tracer function to keep the overhead down: | 61 | * Prologue for the wakeup function tracers. |
62 | * | ||
63 | * Returns 1 if it is OK to continue, and preemption | ||
64 | * is disabled and data->disabled is incremented. | ||
65 | * 0 if the trace is to be ignored, and preemption | ||
66 | * is not disabled and data->disabled is | ||
67 | * kept the same. | ||
68 | * | ||
69 | * Note, this function is also used outside this ifdef but | ||
70 | * inside the #ifdef of the function graph tracer below. | ||
71 | * This is OK, since the function graph tracer is | ||
72 | * dependent on the function tracer. | ||
41 | */ | 73 | */ |
42 | static void | 74 | static int |
43 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 75 | func_prolog_preempt_disable(struct trace_array *tr, |
76 | struct trace_array_cpu **data, | ||
77 | int *pc) | ||
44 | { | 78 | { |
45 | struct trace_array *tr = wakeup_trace; | ||
46 | struct trace_array_cpu *data; | ||
47 | unsigned long flags; | ||
48 | long disabled; | 79 | long disabled; |
49 | int cpu; | 80 | int cpu; |
50 | int pc; | ||
51 | 81 | ||
52 | if (likely(!wakeup_task)) | 82 | if (likely(!wakeup_task)) |
53 | return; | 83 | return 0; |
54 | 84 | ||
55 | pc = preempt_count(); | 85 | *pc = preempt_count(); |
56 | preempt_disable_notrace(); | 86 | preempt_disable_notrace(); |
57 | 87 | ||
58 | cpu = raw_smp_processor_id(); | 88 | cpu = raw_smp_processor_id(); |
59 | if (cpu != wakeup_current_cpu) | 89 | if (cpu != wakeup_current_cpu) |
60 | goto out_enable; | 90 | goto out_enable; |
61 | 91 | ||
62 | data = tr->data[cpu]; | 92 | *data = tr->data[cpu]; |
63 | disabled = atomic_inc_return(&data->disabled); | 93 | disabled = atomic_inc_return(&(*data)->disabled); |
64 | if (unlikely(disabled != 1)) | 94 | if (unlikely(disabled != 1)) |
65 | goto out; | 95 | goto out; |
66 | 96 | ||
67 | local_irq_save(flags); | 97 | return 1; |
68 | 98 | ||
69 | trace_function(tr, ip, parent_ip, flags, pc); | 99 | out: |
100 | atomic_dec(&(*data)->disabled); | ||
101 | |||
102 | out_enable: | ||
103 | preempt_enable_notrace(); | ||
104 | return 0; | ||
105 | } | ||
70 | 106 | ||
107 | /* | ||
108 | * wakeup uses its own tracer function to keep the overhead down: | ||
109 | */ | ||
110 | static void | ||
111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
112 | { | ||
113 | struct trace_array *tr = wakeup_trace; | ||
114 | struct trace_array_cpu *data; | ||
115 | unsigned long flags; | ||
116 | int pc; | ||
117 | |||
118 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
119 | return; | ||
120 | |||
121 | local_irq_save(flags); | ||
122 | trace_function(tr, ip, parent_ip, flags, pc); | ||
71 | local_irq_restore(flags); | 123 | local_irq_restore(flags); |
72 | 124 | ||
73 | out: | ||
74 | atomic_dec(&data->disabled); | 125 | atomic_dec(&data->disabled); |
75 | out_enable: | ||
76 | preempt_enable_notrace(); | 126 | preempt_enable_notrace(); |
77 | } | 127 | } |
78 | 128 | ||
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = | |||
82 | }; | 132 | }; |
83 | #endif /* CONFIG_FUNCTION_TRACER */ | 133 | #endif /* CONFIG_FUNCTION_TRACER */ |
84 | 134 | ||
135 | static int start_func_tracer(int graph) | ||
136 | { | ||
137 | int ret; | ||
138 | |||
139 | if (!graph) | ||
140 | ret = register_ftrace_function(&trace_ops); | ||
141 | else | ||
142 | ret = register_ftrace_graph(&wakeup_graph_return, | ||
143 | &wakeup_graph_entry); | ||
144 | |||
145 | if (!ret && tracing_is_enabled()) | ||
146 | tracer_enabled = 1; | ||
147 | else | ||
148 | tracer_enabled = 0; | ||
149 | |||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | static void stop_func_tracer(int graph) | ||
154 | { | ||
155 | tracer_enabled = 0; | ||
156 | |||
157 | if (!graph) | ||
158 | unregister_ftrace_function(&trace_ops); | ||
159 | else | ||
160 | unregister_ftrace_graph(); | ||
161 | } | ||
162 | |||
163 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
164 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
165 | { | ||
166 | |||
167 | if (!(bit & TRACE_DISPLAY_GRAPH)) | ||
168 | return -EINVAL; | ||
169 | |||
170 | if (!(is_graph() ^ set)) | ||
171 | return 0; | ||
172 | |||
173 | stop_func_tracer(!set); | ||
174 | |||
175 | wakeup_reset(wakeup_trace); | ||
176 | tracing_max_latency = 0; | ||
177 | |||
178 | return start_func_tracer(set); | ||
179 | } | ||
180 | |||
181 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
182 | { | ||
183 | struct trace_array *tr = wakeup_trace; | ||
184 | struct trace_array_cpu *data; | ||
185 | unsigned long flags; | ||
186 | int pc, ret = 0; | ||
187 | |||
188 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
189 | return 0; | ||
190 | |||
191 | local_save_flags(flags); | ||
192 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
193 | atomic_dec(&data->disabled); | ||
194 | preempt_enable_notrace(); | ||
195 | |||
196 | return ret; | ||
197 | } | ||
198 | |||
199 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) | ||
200 | { | ||
201 | struct trace_array *tr = wakeup_trace; | ||
202 | struct trace_array_cpu *data; | ||
203 | unsigned long flags; | ||
204 | int pc; | ||
205 | |||
206 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
207 | return; | ||
208 | |||
209 | local_save_flags(flags); | ||
210 | __trace_graph_return(tr, trace, flags, pc); | ||
211 | atomic_dec(&data->disabled); | ||
212 | |||
213 | preempt_enable_notrace(); | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | static void wakeup_trace_open(struct trace_iterator *iter) | ||
218 | { | ||
219 | if (is_graph()) | ||
220 | graph_trace_open(iter); | ||
221 | } | ||
222 | |||
223 | static void wakeup_trace_close(struct trace_iterator *iter) | ||
224 | { | ||
225 | if (iter->private) | ||
226 | graph_trace_close(iter); | ||
227 | } | ||
228 | |||
229 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | ||
230 | |||
231 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
232 | { | ||
233 | /* | ||
234 | * In graph mode call the graph tracer output function, | ||
235 | * otherwise go with the TRACE_FN event handler | ||
236 | */ | ||
237 | if (is_graph()) | ||
238 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); | ||
239 | |||
240 | return TRACE_TYPE_UNHANDLED; | ||
241 | } | ||
242 | |||
243 | static void wakeup_print_header(struct seq_file *s) | ||
244 | { | ||
245 | if (is_graph()) | ||
246 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); | ||
247 | else | ||
248 | trace_default_header(s); | ||
249 | } | ||
250 | |||
251 | static void | ||
252 | __trace_function(struct trace_array *tr, | ||
253 | unsigned long ip, unsigned long parent_ip, | ||
254 | unsigned long flags, int pc) | ||
255 | { | ||
256 | if (is_graph()) | ||
257 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
258 | else | ||
259 | trace_function(tr, ip, parent_ip, flags, pc); | ||
260 | } | ||
261 | #else | ||
262 | #define __trace_function trace_function | ||
263 | |||
264 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
265 | { | ||
266 | return -EINVAL; | ||
267 | } | ||
268 | |||
269 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
270 | { | ||
271 | return -1; | ||
272 | } | ||
273 | |||
274 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
275 | { | ||
276 | return TRACE_TYPE_UNHANDLED; | ||
277 | } | ||
278 | |||
279 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | ||
280 | static void wakeup_print_header(struct seq_file *s) { } | ||
281 | static void wakeup_trace_open(struct trace_iterator *iter) { } | ||
282 | static void wakeup_trace_close(struct trace_iterator *iter) { } | ||
283 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
284 | |||
85 | /* | 285 | /* |
86 | * Should this new latency be reported/recorded? | 286 | * Should this new latency be reported/recorded? |
87 | */ | 287 | */ |
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
152 | /* The task we are waiting for is waking up */ | 352 | /* The task we are waiting for is waking up */ |
153 | data = wakeup_trace->data[wakeup_cpu]; | 353 | data = wakeup_trace->data[wakeup_cpu]; |
154 | 354 | ||
155 | trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); | 355 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); |
156 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); | 356 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); |
157 | 357 | ||
158 | T0 = data->preempt_timestamp; | 358 | T0 = data->preempt_timestamp; |
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
252 | * is not called by an assembly function (where as schedule is) | 452 | * is not called by an assembly function (where as schedule is) |
253 | * it should be safe to use it here. | 453 | * it should be safe to use it here. |
254 | */ | 454 | */ |
255 | trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); | 455 | __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); |
256 | 456 | ||
257 | out_locked: | 457 | out_locked: |
258 | arch_spin_unlock(&wakeup_lock); | 458 | arch_spin_unlock(&wakeup_lock); |
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
303 | */ | 503 | */ |
304 | smp_wmb(); | 504 | smp_wmb(); |
305 | 505 | ||
306 | register_ftrace_function(&trace_ops); | 506 | if (start_func_tracer(is_graph())) |
307 | 507 | printk(KERN_ERR "failed to start wakeup tracer\n"); | |
308 | if (tracing_is_enabled()) | ||
309 | tracer_enabled = 1; | ||
310 | else | ||
311 | tracer_enabled = 0; | ||
312 | 508 | ||
313 | return; | 509 | return; |
314 | fail_deprobe_wake_new: | 510 | fail_deprobe_wake_new: |
@@ -320,7 +516,7 @@ fail_deprobe: | |||
320 | static void stop_wakeup_tracer(struct trace_array *tr) | 516 | static void stop_wakeup_tracer(struct trace_array *tr) |
321 | { | 517 | { |
322 | tracer_enabled = 0; | 518 | tracer_enabled = 0; |
323 | unregister_ftrace_function(&trace_ops); | 519 | stop_func_tracer(is_graph()); |
324 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 520 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
325 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 521 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
326 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 522 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = | |||
379 | .start = wakeup_tracer_start, | 575 | .start = wakeup_tracer_start, |
380 | .stop = wakeup_tracer_stop, | 576 | .stop = wakeup_tracer_stop, |
381 | .print_max = 1, | 577 | .print_max = 1, |
578 | .print_header = wakeup_print_header, | ||
579 | .print_line = wakeup_print_line, | ||
580 | .flags = &tracer_flags, | ||
581 | .set_flag = wakeup_set_flag, | ||
382 | #ifdef CONFIG_FTRACE_SELFTEST | 582 | #ifdef CONFIG_FTRACE_SELFTEST |
383 | .selftest = trace_selftest_startup_wakeup, | 583 | .selftest = trace_selftest_startup_wakeup, |
384 | #endif | 584 | #endif |
585 | .open = wakeup_trace_open, | ||
586 | .close = wakeup_trace_close, | ||
385 | .use_max_tr = 1, | 587 | .use_max_tr = 1, |
386 | }; | 588 | }; |
387 | 589 | ||
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
394 | .stop = wakeup_tracer_stop, | 596 | .stop = wakeup_tracer_stop, |
395 | .wait_pipe = poll_wait_pipe, | 597 | .wait_pipe = poll_wait_pipe, |
396 | .print_max = 1, | 598 | .print_max = 1, |
599 | .print_header = wakeup_print_header, | ||
600 | .print_line = wakeup_print_line, | ||
601 | .flags = &tracer_flags, | ||
602 | .set_flag = wakeup_set_flag, | ||
397 | #ifdef CONFIG_FTRACE_SELFTEST | 603 | #ifdef CONFIG_FTRACE_SELFTEST |
398 | .selftest = trace_selftest_startup_wakeup, | 604 | .selftest = trace_selftest_startup_wakeup, |
399 | #endif | 605 | #endif |
606 | .open = wakeup_trace_open, | ||
607 | .close = wakeup_trace_close, | ||
400 | .use_max_tr = 1, | 608 | .use_max_tr = 1, |
401 | }; | 609 | }; |
402 | 610 | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a6b7e0e0f3eb..4c5dead0c239 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = { | |||
195 | .open = tracing_open_generic, | 195 | .open = tracing_open_generic, |
196 | .read = stack_max_size_read, | 196 | .read = stack_max_size_read, |
197 | .write = stack_max_size_write, | 197 | .write = stack_max_size_write, |
198 | .llseek = default_llseek, | ||
198 | }; | 199 | }; |
199 | 200 | ||
200 | static void * | 201 | static void * |
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c | |||
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) | |||
263 | { | 263 | { |
264 | int ret, cpu; | 264 | int ret, cpu; |
265 | 265 | ||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
266 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | 271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); |
267 | if (ret) | 272 | if (ret) |
268 | goto out; | 273 | goto out; |
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) | |||
279 | if (ret) | 284 | if (ret) |
280 | goto no_creation; | 285 | goto no_creation; |
281 | 286 | ||
282 | for_each_possible_cpu(cpu) { | ||
283 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
284 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
285 | } | ||
286 | |||
287 | return 0; | 287 | return 0; |
288 | 288 | ||
289 | no_creation: | 289 | no_creation: |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | ||
28 | 29 | ||
29 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint __start___tracepoints[]; |
30 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint __stop___tracepoints[]; |
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
263 | * is used. | 264 | * is used. |
264 | */ | 265 | */ |
265 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
266 | elem->state = active; | 267 | if (!elem->state && active) { |
268 | jump_label_enable(&elem->state); | ||
269 | elem->state = active; | ||
270 | } else if (elem->state && !active) { | ||
271 | jump_label_disable(&elem->state); | ||
272 | elem->state = active; | ||
273 | } | ||
267 | } | 274 | } |
268 | 275 | ||
269 | /* | 276 | /* |
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
277 | if (elem->unregfunc && elem->state) | 284 | if (elem->unregfunc && elem->state) |
278 | elem->unregfunc(); | 285 | elem->unregfunc(); |
279 | 286 | ||
280 | elem->state = 0; | 287 | if (elem->state) { |
288 | jump_label_disable(&elem->state); | ||
289 | elem->state = 0; | ||
290 | } | ||
281 | rcu_assign_pointer(elem->funcs, NULL); | 291 | rcu_assign_pointer(elem->funcs, NULL); |
282 | } | 292 | } |
283 | 293 | ||
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf8..24dc60d9fa1f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
63 | stats->ac_ppid = pid_alive(tsk) ? | 63 | stats->ac_ppid = pid_alive(tsk) ? |
64 | rcu_dereference(tsk->real_parent)->tgid : 0; | 64 | rcu_dereference(tsk->real_parent)->tgid : 0; |
65 | rcu_read_unlock(); | 65 | rcu_read_unlock(); |
66 | stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; | 66 | stats->ac_utime = cputime_to_usecs(tsk->utime); |
67 | stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; | 67 | stats->ac_stime = cputime_to_usecs(tsk->stime); |
68 | stats->ac_utimescaled = | 68 | stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); |
69 | cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; | 69 | stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); |
70 | stats->ac_stimescaled = | ||
71 | cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; | ||
72 | stats->ac_minflt = tsk->min_flt; | 70 | stats->ac_minflt = tsk->min_flt; |
73 | stats->ac_majflt = tsk->maj_flt; | 71 | stats->ac_majflt = tsk->maj_flt; |
74 | 72 | ||
diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..2c7d8d5914b1 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
91 | * upon function exit. | 91 | * upon function exit. |
92 | */ | 92 | */ |
93 | static void free_user(struct user_struct *up, unsigned long flags) | 93 | static void free_user(struct user_struct *up, unsigned long flags) |
94 | __releases(&uidhash_lock) | ||
94 | { | 95 | { |
95 | uid_hash_remove(up); | 96 | uid_hash_remove(up); |
96 | spin_unlock_irqrestore(&uidhash_lock, flags); | 97 | spin_unlock_irqrestore(&uidhash_lock, flags); |
diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f35..b0310eb6cc1e 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
92 | } | 92 | } |
93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
94 | 94 | ||
95 | /* | 95 | /** |
96 | * finish_wait - clean up after waiting in a queue | 96 | * finish_wait - clean up after waiting in a queue |
97 | * @q: waitqueue waited on | 97 | * @q: waitqueue waited on |
98 | * @wait: wait descriptor | 98 | * @wait: wait descriptor |
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
127 | } | 127 | } |
128 | EXPORT_SYMBOL(finish_wait); | 128 | EXPORT_SYMBOL(finish_wait); |
129 | 129 | ||
130 | /* | 130 | /** |
131 | * abort_exclusive_wait - abort exclusive waiting in a queue | 131 | * abort_exclusive_wait - abort exclusive waiting in a queue |
132 | * @q: waitqueue waited on | 132 | * @q: waitqueue waited on |
133 | * @wait: wait descriptor | 133 | * @wait: wait descriptor |
134 | * @state: runstate of the waiter to be woken | 134 | * @mode: runstate of the waiter to be woken |
135 | * @key: key to identify a wait bit queue or %NULL | 135 | * @key: key to identify a wait bit queue or %NULL |
136 | * | 136 | * |
137 | * Sets current thread back to running state and removes | 137 | * Sets current thread back to running state and removes |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9c3c52ecc1..bafba687a6d8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | static int __read_mostly did_panic; | ||
47 | static int __initdata no_watchdog; | 46 | static int __initdata no_watchdog; |
48 | 47 | ||
49 | 48 | ||
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts) | |||
187 | return 0; | 186 | return 0; |
188 | } | 187 | } |
189 | 188 | ||
190 | static int | ||
191 | watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
192 | { | ||
193 | did_panic = 1; | ||
194 | |||
195 | return NOTIFY_DONE; | ||
196 | } | ||
197 | |||
198 | static struct notifier_block panic_block = { | ||
199 | .notifier_call = watchdog_panic, | ||
200 | }; | ||
201 | |||
202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 189 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
203 | static struct perf_event_attr wd_hw_attr = { | 190 | static struct perf_event_attr wd_hw_attr = { |
204 | .type = PERF_TYPE_HARDWARE, | 191 | .type = PERF_TYPE_HARDWARE, |
@@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { | |||
209 | }; | 196 | }; |
210 | 197 | ||
211 | /* Callback function for perf event subsystem */ | 198 | /* Callback function for perf event subsystem */ |
212 | void watchdog_overflow_callback(struct perf_event *event, int nmi, | 199 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, |
213 | struct perf_sample_data *data, | 200 | struct perf_sample_data *data, |
214 | struct pt_regs *regs) | 201 | struct pt_regs *regs) |
215 | { | 202 | { |
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu) | |||
371 | /* Try to register using hardware perf events */ | 358 | /* Try to register using hardware perf events */ |
372 | wd_attr = &wd_hw_attr; | 359 | wd_attr = &wd_hw_attr; |
373 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 360 | wd_attr->sample_period = hw_nmi_get_sample_period(); |
374 | event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); | 361 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); |
375 | if (!IS_ERR(event)) { | 362 | if (!IS_ERR(event)) { |
376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 363 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
377 | goto out_save; | 364 | goto out_save; |
378 | } | 365 | } |
379 | 366 | ||
380 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 367 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); |
381 | return -1; | 368 | return PTR_ERR(event); |
382 | 369 | ||
383 | /* success path */ | 370 | /* success path */ |
384 | out_save: | 371 | out_save: |
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu) | |||
422 | static int watchdog_enable(int cpu) | 409 | static int watchdog_enable(int cpu) |
423 | { | 410 | { |
424 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | 411 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); |
412 | int err; | ||
425 | 413 | ||
426 | /* enable the perf event */ | 414 | /* enable the perf event */ |
427 | if (watchdog_nmi_enable(cpu) != 0) | 415 | err = watchdog_nmi_enable(cpu); |
428 | return -1; | 416 | if (err) |
417 | return err; | ||
429 | 418 | ||
430 | /* create the watchdog thread */ | 419 | /* create the watchdog thread */ |
431 | if (!p) { | 420 | if (!p) { |
432 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 421 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); |
433 | if (IS_ERR(p)) { | 422 | if (IS_ERR(p)) { |
434 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 423 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
435 | return -1; | 424 | return PTR_ERR(p); |
436 | } | 425 | } |
437 | kthread_bind(p, cpu); | 426 | kthread_bind(p, cpu); |
438 | per_cpu(watchdog_touch_ts, cpu) = 0; | 427 | per_cpu(watchdog_touch_ts, cpu) = 0; |
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void) | |||
484 | { | 473 | { |
485 | int cpu; | 474 | int cpu; |
486 | 475 | ||
476 | if (no_watchdog) | ||
477 | return; | ||
478 | |||
487 | for_each_online_cpu(cpu) | 479 | for_each_online_cpu(cpu) |
488 | watchdog_disable(cpu); | 480 | watchdog_disable(cpu); |
489 | 481 | ||
@@ -526,17 +518,16 @@ static int __cpuinit | |||
526 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 518 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
527 | { | 519 | { |
528 | int hotcpu = (unsigned long)hcpu; | 520 | int hotcpu = (unsigned long)hcpu; |
521 | int err = 0; | ||
529 | 522 | ||
530 | switch (action) { | 523 | switch (action) { |
531 | case CPU_UP_PREPARE: | 524 | case CPU_UP_PREPARE: |
532 | case CPU_UP_PREPARE_FROZEN: | 525 | case CPU_UP_PREPARE_FROZEN: |
533 | if (watchdog_prepare_cpu(hotcpu)) | 526 | err = watchdog_prepare_cpu(hotcpu); |
534 | return NOTIFY_BAD; | ||
535 | break; | 527 | break; |
536 | case CPU_ONLINE: | 528 | case CPU_ONLINE: |
537 | case CPU_ONLINE_FROZEN: | 529 | case CPU_ONLINE_FROZEN: |
538 | if (watchdog_enable(hotcpu)) | 530 | err = watchdog_enable(hotcpu); |
539 | return NOTIFY_BAD; | ||
540 | break; | 531 | break; |
541 | #ifdef CONFIG_HOTPLUG_CPU | 532 | #ifdef CONFIG_HOTPLUG_CPU |
542 | case CPU_UP_CANCELED: | 533 | case CPU_UP_CANCELED: |
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
549 | break; | 540 | break; |
550 | #endif /* CONFIG_HOTPLUG_CPU */ | 541 | #endif /* CONFIG_HOTPLUG_CPU */ |
551 | } | 542 | } |
552 | return NOTIFY_OK; | 543 | return notifier_from_errno(err); |
553 | } | 544 | } |
554 | 545 | ||
555 | static struct notifier_block __cpuinitdata cpu_nfb = { | 546 | static struct notifier_block __cpuinitdata cpu_nfb = { |
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void) | |||
565 | return 0; | 556 | return 0; |
566 | 557 | ||
567 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 558 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
568 | WARN_ON(err == NOTIFY_BAD); | 559 | WARN_ON(notifier_to_errno(err)); |
569 | 560 | ||
570 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 561 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
571 | register_cpu_notifier(&cpu_nfb); | 562 | register_cpu_notifier(&cpu_nfb); |
572 | 563 | ||
573 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
574 | |||
575 | return 0; | 564 | return 0; |
576 | } | 565 | } |
577 | early_initcall(spawn_watchdog_task); | 566 | early_initcall(spawn_watchdog_task); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f77afd939229..90db1bd1a978 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -42,9 +42,6 @@ | |||
42 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
44 | 44 | ||
45 | #define CREATE_TRACE_POINTS | ||
46 | #include <trace/events/workqueue.h> | ||
47 | |||
48 | #include "workqueue_sched.h" | 45 | #include "workqueue_sched.h" |
49 | 46 | ||
50 | enum { | 47 | enum { |
@@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq); | |||
257 | EXPORT_SYMBOL_GPL(system_nrt_wq); | 254 | EXPORT_SYMBOL_GPL(system_nrt_wq); |
258 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 255 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
259 | 256 | ||
257 | #define CREATE_TRACE_POINTS | ||
258 | #include <trace/events/workqueue.h> | ||
259 | |||
260 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | 260 | #define for_each_busy_worker(worker, i, pos, gcwq) \ |
261 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | 261 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ |
262 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | 262 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) |
@@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | |||
310 | (cpu) < WORK_CPU_NONE; \ | 310 | (cpu) < WORK_CPU_NONE; \ |
311 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | 311 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) |
312 | 312 | ||
313 | #ifdef CONFIG_LOCKDEP | ||
314 | /** | ||
315 | * in_workqueue_context() - in context of specified workqueue? | ||
316 | * @wq: the workqueue of interest | ||
317 | * | ||
318 | * Checks lockdep state to see if the current task is executing from | ||
319 | * within a workqueue item. This function exists only if lockdep is | ||
320 | * enabled. | ||
321 | */ | ||
322 | int in_workqueue_context(struct workqueue_struct *wq) | ||
323 | { | ||
324 | return lock_is_held(&wq->lockdep_map); | ||
325 | } | ||
326 | #endif | ||
327 | |||
328 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 313 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
329 | 314 | ||
330 | static struct debug_obj_descr work_debug_descr; | 315 | static struct debug_obj_descr work_debug_descr; |
@@ -604,7 +589,9 @@ static bool keep_working(struct global_cwq *gcwq) | |||
604 | { | 589 | { |
605 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 590 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); |
606 | 591 | ||
607 | return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; | 592 | return !list_empty(&gcwq->worklist) && |
593 | (atomic_read(nr_running) <= 1 || | ||
594 | gcwq->flags & GCWQ_HIGHPRI_PENDING); | ||
608 | } | 595 | } |
609 | 596 | ||
610 | /* Do we need a new worker? Called from manager. */ | 597 | /* Do we need a new worker? Called from manager. */ |
@@ -997,6 +984,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
997 | 984 | ||
998 | /* gcwq determined, get cwq and queue */ | 985 | /* gcwq determined, get cwq and queue */ |
999 | cwq = get_cwq(gcwq->cpu, wq); | 986 | cwq = get_cwq(gcwq->cpu, wq); |
987 | trace_workqueue_queue_work(cpu, cwq, work); | ||
1000 | 988 | ||
1001 | BUG_ON(!list_empty(&work->entry)); | 989 | BUG_ON(!list_empty(&work->entry)); |
1002 | 990 | ||
@@ -1004,6 +992,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1004 | work_flags = work_color_to_flags(cwq->work_color); | 992 | work_flags = work_color_to_flags(cwq->work_color); |
1005 | 993 | ||
1006 | if (likely(cwq->nr_active < cwq->max_active)) { | 994 | if (likely(cwq->nr_active < cwq->max_active)) { |
995 | trace_workqueue_activate_work(work); | ||
1007 | cwq->nr_active++; | 996 | cwq->nr_active++; |
1008 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | 997 | worklist = gcwq_determine_ins_pos(gcwq, cwq); |
1009 | } else { | 998 | } else { |
@@ -1679,6 +1668,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | |||
1679 | struct work_struct, entry); | 1668 | struct work_struct, entry); |
1680 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | 1669 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); |
1681 | 1670 | ||
1671 | trace_workqueue_activate_work(work); | ||
1682 | move_linked_works(work, pos, NULL); | 1672 | move_linked_works(work, pos, NULL); |
1683 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | 1673 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); |
1684 | cwq->nr_active++; | 1674 | cwq->nr_active++; |
@@ -2074,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
2074 | * checks and call back into the fixup functions where we | 2064 | * checks and call back into the fixup functions where we |
2075 | * might deadlock. | 2065 | * might deadlock. |
2076 | */ | 2066 | */ |
2077 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); | 2067 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
2078 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2068 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
2079 | init_completion(&barr->done); | 2069 | init_completion(&barr->done); |
2080 | 2070 | ||
@@ -2326,27 +2316,17 @@ out_unlock: | |||
2326 | } | 2316 | } |
2327 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2317 | EXPORT_SYMBOL_GPL(flush_workqueue); |
2328 | 2318 | ||
2329 | /** | 2319 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, |
2330 | * flush_work - block until a work_struct's callback has terminated | 2320 | bool wait_executing) |
2331 | * @work: the work which is to be flushed | ||
2332 | * | ||
2333 | * Returns false if @work has already terminated. | ||
2334 | * | ||
2335 | * It is expected that, prior to calling flush_work(), the caller has | ||
2336 | * arranged for the work to not be requeued, otherwise it doesn't make | ||
2337 | * sense to use this function. | ||
2338 | */ | ||
2339 | int flush_work(struct work_struct *work) | ||
2340 | { | 2321 | { |
2341 | struct worker *worker = NULL; | 2322 | struct worker *worker = NULL; |
2342 | struct global_cwq *gcwq; | 2323 | struct global_cwq *gcwq; |
2343 | struct cpu_workqueue_struct *cwq; | 2324 | struct cpu_workqueue_struct *cwq; |
2344 | struct wq_barrier barr; | ||
2345 | 2325 | ||
2346 | might_sleep(); | 2326 | might_sleep(); |
2347 | gcwq = get_work_gcwq(work); | 2327 | gcwq = get_work_gcwq(work); |
2348 | if (!gcwq) | 2328 | if (!gcwq) |
2349 | return 0; | 2329 | return false; |
2350 | 2330 | ||
2351 | spin_lock_irq(&gcwq->lock); | 2331 | spin_lock_irq(&gcwq->lock); |
2352 | if (!list_empty(&work->entry)) { | 2332 | if (!list_empty(&work->entry)) { |
@@ -2359,28 +2339,127 @@ int flush_work(struct work_struct *work) | |||
2359 | cwq = get_work_cwq(work); | 2339 | cwq = get_work_cwq(work); |
2360 | if (unlikely(!cwq || gcwq != cwq->gcwq)) | 2340 | if (unlikely(!cwq || gcwq != cwq->gcwq)) |
2361 | goto already_gone; | 2341 | goto already_gone; |
2362 | } else { | 2342 | } else if (wait_executing) { |
2363 | worker = find_worker_executing_work(gcwq, work); | 2343 | worker = find_worker_executing_work(gcwq, work); |
2364 | if (!worker) | 2344 | if (!worker) |
2365 | goto already_gone; | 2345 | goto already_gone; |
2366 | cwq = worker->current_cwq; | 2346 | cwq = worker->current_cwq; |
2367 | } | 2347 | } else |
2348 | goto already_gone; | ||
2368 | 2349 | ||
2369 | insert_wq_barrier(cwq, &barr, work, worker); | 2350 | insert_wq_barrier(cwq, barr, work, worker); |
2370 | spin_unlock_irq(&gcwq->lock); | 2351 | spin_unlock_irq(&gcwq->lock); |
2371 | 2352 | ||
2372 | lock_map_acquire(&cwq->wq->lockdep_map); | 2353 | lock_map_acquire(&cwq->wq->lockdep_map); |
2373 | lock_map_release(&cwq->wq->lockdep_map); | 2354 | lock_map_release(&cwq->wq->lockdep_map); |
2374 | 2355 | return true; | |
2375 | wait_for_completion(&barr.done); | ||
2376 | destroy_work_on_stack(&barr.work); | ||
2377 | return 1; | ||
2378 | already_gone: | 2356 | already_gone: |
2379 | spin_unlock_irq(&gcwq->lock); | 2357 | spin_unlock_irq(&gcwq->lock); |
2380 | return 0; | 2358 | return false; |
2359 | } | ||
2360 | |||
2361 | /** | ||
2362 | * flush_work - wait for a work to finish executing the last queueing instance | ||
2363 | * @work: the work to flush | ||
2364 | * | ||
2365 | * Wait until @work has finished execution. This function considers | ||
2366 | * only the last queueing instance of @work. If @work has been | ||
2367 | * enqueued across different CPUs on a non-reentrant workqueue or on | ||
2368 | * multiple workqueues, @work might still be executing on return on | ||
2369 | * some of the CPUs from earlier queueing. | ||
2370 | * | ||
2371 | * If @work was queued only on a non-reentrant, ordered or unbound | ||
2372 | * workqueue, @work is guaranteed to be idle on return if it hasn't | ||
2373 | * been requeued since flush started. | ||
2374 | * | ||
2375 | * RETURNS: | ||
2376 | * %true if flush_work() waited for the work to finish execution, | ||
2377 | * %false if it was already idle. | ||
2378 | */ | ||
2379 | bool flush_work(struct work_struct *work) | ||
2380 | { | ||
2381 | struct wq_barrier barr; | ||
2382 | |||
2383 | if (start_flush_work(work, &barr, true)) { | ||
2384 | wait_for_completion(&barr.done); | ||
2385 | destroy_work_on_stack(&barr.work); | ||
2386 | return true; | ||
2387 | } else | ||
2388 | return false; | ||
2381 | } | 2389 | } |
2382 | EXPORT_SYMBOL_GPL(flush_work); | 2390 | EXPORT_SYMBOL_GPL(flush_work); |
2383 | 2391 | ||
2392 | static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | ||
2393 | { | ||
2394 | struct wq_barrier barr; | ||
2395 | struct worker *worker; | ||
2396 | |||
2397 | spin_lock_irq(&gcwq->lock); | ||
2398 | |||
2399 | worker = find_worker_executing_work(gcwq, work); | ||
2400 | if (unlikely(worker)) | ||
2401 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2402 | |||
2403 | spin_unlock_irq(&gcwq->lock); | ||
2404 | |||
2405 | if (unlikely(worker)) { | ||
2406 | wait_for_completion(&barr.done); | ||
2407 | destroy_work_on_stack(&barr.work); | ||
2408 | return true; | ||
2409 | } else | ||
2410 | return false; | ||
2411 | } | ||
2412 | |||
2413 | static bool wait_on_work(struct work_struct *work) | ||
2414 | { | ||
2415 | bool ret = false; | ||
2416 | int cpu; | ||
2417 | |||
2418 | might_sleep(); | ||
2419 | |||
2420 | lock_map_acquire(&work->lockdep_map); | ||
2421 | lock_map_release(&work->lockdep_map); | ||
2422 | |||
2423 | for_each_gcwq_cpu(cpu) | ||
2424 | ret |= wait_on_cpu_work(get_gcwq(cpu), work); | ||
2425 | return ret; | ||
2426 | } | ||
2427 | |||
2428 | /** | ||
2429 | * flush_work_sync - wait until a work has finished execution | ||
2430 | * @work: the work to flush | ||
2431 | * | ||
2432 | * Wait until @work has finished execution. On return, it's | ||
2433 | * guaranteed that all queueing instances of @work which happened | ||
2434 | * before this function is called are finished. In other words, if | ||
2435 | * @work hasn't been requeued since this function was called, @work is | ||
2436 | * guaranteed to be idle on return. | ||
2437 | * | ||
2438 | * RETURNS: | ||
2439 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2440 | * %false if it was already idle. | ||
2441 | */ | ||
2442 | bool flush_work_sync(struct work_struct *work) | ||
2443 | { | ||
2444 | struct wq_barrier barr; | ||
2445 | bool pending, waited; | ||
2446 | |||
2447 | /* we'll wait for executions separately, queue barr only if pending */ | ||
2448 | pending = start_flush_work(work, &barr, false); | ||
2449 | |||
2450 | /* wait for executions to finish */ | ||
2451 | waited = wait_on_work(work); | ||
2452 | |||
2453 | /* wait for the pending one */ | ||
2454 | if (pending) { | ||
2455 | wait_for_completion(&barr.done); | ||
2456 | destroy_work_on_stack(&barr.work); | ||
2457 | } | ||
2458 | |||
2459 | return pending || waited; | ||
2460 | } | ||
2461 | EXPORT_SYMBOL_GPL(flush_work_sync); | ||
2462 | |||
2384 | /* | 2463 | /* |
2385 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, | 2464 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, |
2386 | * so this work can't be re-armed in any way. | 2465 | * so this work can't be re-armed in any way. |
@@ -2423,39 +2502,7 @@ static int try_to_grab_pending(struct work_struct *work) | |||
2423 | return ret; | 2502 | return ret; |
2424 | } | 2503 | } |
2425 | 2504 | ||
2426 | static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | 2505 | static bool __cancel_work_timer(struct work_struct *work, |
2427 | { | ||
2428 | struct wq_barrier barr; | ||
2429 | struct worker *worker; | ||
2430 | |||
2431 | spin_lock_irq(&gcwq->lock); | ||
2432 | |||
2433 | worker = find_worker_executing_work(gcwq, work); | ||
2434 | if (unlikely(worker)) | ||
2435 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2436 | |||
2437 | spin_unlock_irq(&gcwq->lock); | ||
2438 | |||
2439 | if (unlikely(worker)) { | ||
2440 | wait_for_completion(&barr.done); | ||
2441 | destroy_work_on_stack(&barr.work); | ||
2442 | } | ||
2443 | } | ||
2444 | |||
2445 | static void wait_on_work(struct work_struct *work) | ||
2446 | { | ||
2447 | int cpu; | ||
2448 | |||
2449 | might_sleep(); | ||
2450 | |||
2451 | lock_map_acquire(&work->lockdep_map); | ||
2452 | lock_map_release(&work->lockdep_map); | ||
2453 | |||
2454 | for_each_gcwq_cpu(cpu) | ||
2455 | wait_on_cpu_work(get_gcwq(cpu), work); | ||
2456 | } | ||
2457 | |||
2458 | static int __cancel_work_timer(struct work_struct *work, | ||
2459 | struct timer_list* timer) | 2506 | struct timer_list* timer) |
2460 | { | 2507 | { |
2461 | int ret; | 2508 | int ret; |
@@ -2472,42 +2519,81 @@ static int __cancel_work_timer(struct work_struct *work, | |||
2472 | } | 2519 | } |
2473 | 2520 | ||
2474 | /** | 2521 | /** |
2475 | * cancel_work_sync - block until a work_struct's callback has terminated | 2522 | * cancel_work_sync - cancel a work and wait for it to finish |
2476 | * @work: the work which is to be flushed | 2523 | * @work: the work to cancel |
2477 | * | ||
2478 | * Returns true if @work was pending. | ||
2479 | * | 2524 | * |
2480 | * cancel_work_sync() will cancel the work if it is queued. If the work's | 2525 | * Cancel @work and wait for its execution to finish. This function |
2481 | * callback appears to be running, cancel_work_sync() will block until it | 2526 | * can be used even if the work re-queues itself or migrates to |
2482 | * has completed. | 2527 | * another workqueue. On return from this function, @work is |
2528 | * guaranteed to be not pending or executing on any CPU. | ||
2483 | * | 2529 | * |
2484 | * It is possible to use this function if the work re-queues itself. It can | 2530 | * cancel_work_sync(&delayed_work->work) must not be used for |
2485 | * cancel the work even if it migrates to another workqueue, however in that | 2531 | * delayed_work's. Use cancel_delayed_work_sync() instead. |
2486 | * case it only guarantees that work->func() has completed on the last queued | ||
2487 | * workqueue. | ||
2488 | * | 2532 | * |
2489 | * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not | 2533 | * The caller must ensure that the workqueue on which @work was last |
2490 | * pending, otherwise it goes into a busy-wait loop until the timer expires. | ||
2491 | * | ||
2492 | * The caller must ensure that workqueue_struct on which this work was last | ||
2493 | * queued can't be destroyed before this function returns. | 2534 | * queued can't be destroyed before this function returns. |
2535 | * | ||
2536 | * RETURNS: | ||
2537 | * %true if @work was pending, %false otherwise. | ||
2494 | */ | 2538 | */ |
2495 | int cancel_work_sync(struct work_struct *work) | 2539 | bool cancel_work_sync(struct work_struct *work) |
2496 | { | 2540 | { |
2497 | return __cancel_work_timer(work, NULL); | 2541 | return __cancel_work_timer(work, NULL); |
2498 | } | 2542 | } |
2499 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 2543 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
2500 | 2544 | ||
2501 | /** | 2545 | /** |
2502 | * cancel_delayed_work_sync - reliably kill off a delayed work. | 2546 | * flush_delayed_work - wait for a dwork to finish executing the last queueing |
2503 | * @dwork: the delayed work struct | 2547 | * @dwork: the delayed work to flush |
2548 | * | ||
2549 | * Delayed timer is cancelled and the pending work is queued for | ||
2550 | * immediate execution. Like flush_work(), this function only | ||
2551 | * considers the last queueing instance of @dwork. | ||
2552 | * | ||
2553 | * RETURNS: | ||
2554 | * %true if flush_work() waited for the work to finish execution, | ||
2555 | * %false if it was already idle. | ||
2556 | */ | ||
2557 | bool flush_delayed_work(struct delayed_work *dwork) | ||
2558 | { | ||
2559 | if (del_timer_sync(&dwork->timer)) | ||
2560 | __queue_work(raw_smp_processor_id(), | ||
2561 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2562 | return flush_work(&dwork->work); | ||
2563 | } | ||
2564 | EXPORT_SYMBOL(flush_delayed_work); | ||
2565 | |||
2566 | /** | ||
2567 | * flush_delayed_work_sync - wait for a dwork to finish | ||
2568 | * @dwork: the delayed work to flush | ||
2504 | * | 2569 | * |
2505 | * Returns true if @dwork was pending. | 2570 | * Delayed timer is cancelled and the pending work is queued for |
2571 | * execution immediately. Other than timer handling, its behavior | ||
2572 | * is identical to flush_work_sync(). | ||
2506 | * | 2573 | * |
2507 | * It is possible to use this function if @dwork rearms itself via queue_work() | 2574 | * RETURNS: |
2508 | * or queue_delayed_work(). See also the comment for cancel_work_sync(). | 2575 | * %true if flush_work_sync() waited for the work to finish execution, |
2576 | * %false if it was already idle. | ||
2509 | */ | 2577 | */ |
2510 | int cancel_delayed_work_sync(struct delayed_work *dwork) | 2578 | bool flush_delayed_work_sync(struct delayed_work *dwork) |
2579 | { | ||
2580 | if (del_timer_sync(&dwork->timer)) | ||
2581 | __queue_work(raw_smp_processor_id(), | ||
2582 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2583 | return flush_work_sync(&dwork->work); | ||
2584 | } | ||
2585 | EXPORT_SYMBOL(flush_delayed_work_sync); | ||
2586 | |||
2587 | /** | ||
2588 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish | ||
2589 | * @dwork: the delayed work to cancel | ||
2590 | * | ||
2591 | * This is cancel_work_sync() for delayed works. | ||
2592 | * | ||
2593 | * RETURNS: | ||
2594 | * %true if @dwork was pending, %false otherwise. | ||
2595 | */ | ||
2596 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | ||
2511 | { | 2597 | { |
2512 | return __cancel_work_timer(&dwork->work, &dwork->timer); | 2598 | return __cancel_work_timer(&dwork->work, &dwork->timer); |
2513 | } | 2599 | } |
@@ -2559,23 +2645,6 @@ int schedule_delayed_work(struct delayed_work *dwork, | |||
2559 | EXPORT_SYMBOL(schedule_delayed_work); | 2645 | EXPORT_SYMBOL(schedule_delayed_work); |
2560 | 2646 | ||
2561 | /** | 2647 | /** |
2562 | * flush_delayed_work - block until a dwork_struct's callback has terminated | ||
2563 | * @dwork: the delayed work which is to be flushed | ||
2564 | * | ||
2565 | * Any timeout is cancelled, and any pending work is run immediately. | ||
2566 | */ | ||
2567 | void flush_delayed_work(struct delayed_work *dwork) | ||
2568 | { | ||
2569 | if (del_timer_sync(&dwork->timer)) { | ||
2570 | __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, | ||
2571 | &dwork->work); | ||
2572 | put_cpu(); | ||
2573 | } | ||
2574 | flush_work(&dwork->work); | ||
2575 | } | ||
2576 | EXPORT_SYMBOL(flush_delayed_work); | ||
2577 | |||
2578 | /** | ||
2579 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 2648 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
2580 | * @cpu: cpu to use | 2649 | * @cpu: cpu to use |
2581 | * @dwork: job to be done | 2650 | * @dwork: job to be done |
@@ -2592,13 +2661,15 @@ int schedule_delayed_work_on(int cpu, | |||
2592 | EXPORT_SYMBOL(schedule_delayed_work_on); | 2661 | EXPORT_SYMBOL(schedule_delayed_work_on); |
2593 | 2662 | ||
2594 | /** | 2663 | /** |
2595 | * schedule_on_each_cpu - call a function on each online CPU from keventd | 2664 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
2596 | * @func: the function to call | 2665 | * @func: the function to call |
2597 | * | 2666 | * |
2598 | * Returns zero on success. | 2667 | * schedule_on_each_cpu() executes @func on each online CPU using the |
2599 | * Returns -ve errno on failure. | 2668 | * system workqueue and blocks until all CPUs have completed. |
2600 | * | ||
2601 | * schedule_on_each_cpu() is very slow. | 2669 | * schedule_on_each_cpu() is very slow. |
2670 | * | ||
2671 | * RETURNS: | ||
2672 | * 0 on success, -errno on failure. | ||
2602 | */ | 2673 | */ |
2603 | int schedule_on_each_cpu(work_func_t func) | 2674 | int schedule_on_each_cpu(work_func_t func) |
2604 | { | 2675 | { |
@@ -2720,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq) | |||
2720 | } | 2791 | } |
2721 | } | 2792 | } |
2722 | 2793 | ||
2723 | /* just in case, make sure it's actually aligned */ | 2794 | /* just in case, make sure it's actually aligned |
2795 | * - this is affected by PERCPU() alignment in vmlinux.lds.S | ||
2796 | */ | ||
2724 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | 2797 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); |
2725 | return wq->cpu_wq.v ? 0 : -ENOMEM; | 2798 | return wq->cpu_wq.v ? 0 : -ENOMEM; |
2726 | } | 2799 | } |
@@ -2764,6 +2837,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2764 | unsigned int cpu; | 2837 | unsigned int cpu; |
2765 | 2838 | ||
2766 | /* | 2839 | /* |
2840 | * Workqueues which may be used during memory reclaim should | ||
2841 | * have a rescuer to guarantee forward progress. | ||
2842 | */ | ||
2843 | if (flags & WQ_MEM_RECLAIM) | ||
2844 | flags |= WQ_RESCUER; | ||
2845 | |||
2846 | /* | ||
2767 | * Unbound workqueues aren't concurrency managed and should be | 2847 | * Unbound workqueues aren't concurrency managed and should be |
2768 | * dispatched to workers immediately. | 2848 | * dispatched to workers immediately. |
2769 | */ | 2849 | */ |