| author | Jiri Kosina <jkosina@suse.cz> | 2014-11-20 08:42:02 -0500 |
|---|---|---|
| committer | Jiri Kosina <jkosina@suse.cz> | 2014-11-20 08:42:02 -0500 |
| commit | a02001086bbfb4da35d1228bebc2f1b442db455f (patch) | |
| tree | 62ab47936cef06fd08657ca5b6cd1df98c19be57 /kernel | |
| parent | eff264efeeb0898408e8c9df72d8a32621035bed (diff) | |
| parent | fc14f9c1272f62c3e8d01300f52467c0d9af50f9 (diff) | |
Merge Linus' tree to be able to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'kernel')
125 files changed, 9587 insertions, 3061 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf531769..17ea6d4a9a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ | |||
| 86 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 86 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
| 87 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 87 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
| 88 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 88 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
| 89 | obj-$(CONFIG_NET) += bpf/ | 89 | obj-$(CONFIG_BPF) += bpf/ |
| 90 | 90 | ||
| 91 | obj-$(CONFIG_PERF_EVENTS) += events/ | 91 | obj-$(CONFIG_PERF_EVENTS) += events/ |
| 92 | 92 | ||
| @@ -105,7 +105,7 @@ targets += config_data.gz | |||
| 105 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE | 105 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE |
| 106 | $(call if_changed,gzip) | 106 | $(call if_changed,gzip) |
| 107 | 107 | ||
| 108 | filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") | 108 | filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") |
| 109 | targets += config_data.h | 109 | targets += config_data.h |
| 110 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE | 110 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE |
| 111 | $(call filechk,ikconfiggz) | 111 | $(call filechk,ikconfiggz) |
diff --git a/kernel/acct.c b/kernel/acct.c index a1844f14c6d6..33738ef972f3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
| @@ -59,6 +59,7 @@ | |||
| 59 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
| 60 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
| 61 | #include <linux/pid_namespace.h> | 61 | #include <linux/pid_namespace.h> |
| 62 | #include <linux/fs_pin.h> | ||
| 62 | 63 | ||
| 63 | /* | 64 | /* |
| 64 | * These constants control the amount of freespace that suspend and | 65 | * These constants control the amount of freespace that suspend and |
| @@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30}; | |||
| 75 | /* | 76 | /* |
| 76 | * External references and all of the globals. | 77 | * External references and all of the globals. |
| 77 | */ | 78 | */ |
| 78 | static void do_acct_process(struct bsd_acct_struct *acct, | 79 | static void do_acct_process(struct bsd_acct_struct *acct); |
| 79 | struct pid_namespace *ns, struct file *); | ||
| 80 | 80 | ||
| 81 | /* | ||
| 82 | * This structure is used so that all the data protected by lock | ||
| 83 | * can be placed in the same cache line as the lock. This primes | ||
| 84 | * the cache line to have the data after getting the lock. | ||
| 85 | */ | ||
| 86 | struct bsd_acct_struct { | 81 | struct bsd_acct_struct { |
| 82 | struct fs_pin pin; | ||
| 83 | struct mutex lock; | ||
| 87 | int active; | 84 | int active; |
| 88 | unsigned long needcheck; | 85 | unsigned long needcheck; |
| 89 | struct file *file; | 86 | struct file *file; |
| 90 | struct pid_namespace *ns; | 87 | struct pid_namespace *ns; |
| 91 | struct list_head list; | 88 | struct work_struct work; |
| 89 | struct completion done; | ||
| 92 | }; | 90 | }; |
| 93 | 91 | ||
| 94 | static DEFINE_SPINLOCK(acct_lock); | ||
| 95 | static LIST_HEAD(acct_list); | ||
| 96 | |||
| 97 | /* | 92 | /* |
| 98 | * Check the amount of free space and suspend/resume accordingly. | 93 | * Check the amount of free space and suspend/resume accordingly. |
| 99 | */ | 94 | */ |
| 100 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | 95 | static int check_free_space(struct bsd_acct_struct *acct) |
| 101 | { | 96 | { |
| 102 | struct kstatfs sbuf; | 97 | struct kstatfs sbuf; |
| 103 | int res; | 98 | |
| 104 | int act; | 99 | if (time_is_before_jiffies(acct->needcheck)) |
| 105 | u64 resume; | ||
| 106 | u64 suspend; | ||
| 107 | |||
| 108 | spin_lock(&acct_lock); | ||
| 109 | res = acct->active; | ||
| 110 | if (!file || time_is_before_jiffies(acct->needcheck)) | ||
| 111 | goto out; | 100 | goto out; |
| 112 | spin_unlock(&acct_lock); | ||
| 113 | 101 | ||
| 114 | /* May block */ | 102 | /* May block */ |
| 115 | if (vfs_statfs(&file->f_path, &sbuf)) | 103 | if (vfs_statfs(&acct->file->f_path, &sbuf)) |
| 116 | return res; | ||
| 117 | suspend = sbuf.f_blocks * SUSPEND; | ||
| 118 | resume = sbuf.f_blocks * RESUME; | ||
| 119 | |||
| 120 | do_div(suspend, 100); | ||
| 121 | do_div(resume, 100); | ||
| 122 | |||
| 123 | if (sbuf.f_bavail <= suspend) | ||
| 124 | act = -1; | ||
| 125 | else if (sbuf.f_bavail >= resume) | ||
| 126 | act = 1; | ||
| 127 | else | ||
| 128 | act = 0; | ||
| 129 | |||
| 130 | /* | ||
| 131 | * If some joker switched acct->file under us we'ld better be | ||
| 132 | * silent and _not_ touch anything. | ||
| 133 | */ | ||
| 134 | spin_lock(&acct_lock); | ||
| 135 | if (file != acct->file) { | ||
| 136 | if (act) | ||
| 137 | res = act > 0; | ||
| 138 | goto out; | 104 | goto out; |
| 139 | } | ||
| 140 | 105 | ||
| 141 | if (acct->active) { | 106 | if (acct->active) { |
| 142 | if (act < 0) { | 107 | u64 suspend = sbuf.f_blocks * SUSPEND; |
| 108 | do_div(suspend, 100); | ||
| 109 | if (sbuf.f_bavail <= suspend) { | ||
| 143 | acct->active = 0; | 110 | acct->active = 0; |
| 144 | printk(KERN_INFO "Process accounting paused\n"); | 111 | pr_info("Process accounting paused\n"); |
| 145 | } | 112 | } |
| 146 | } else { | 113 | } else { |
| 147 | if (act > 0) { | 114 | u64 resume = sbuf.f_blocks * RESUME; |
| 115 | do_div(resume, 100); | ||
| 116 | if (sbuf.f_bavail >= resume) { | ||
| 148 | acct->active = 1; | 117 | acct->active = 1; |
| 149 | printk(KERN_INFO "Process accounting resumed\n"); | 118 | pr_info("Process accounting resumed\n"); |
| 150 | } | 119 | } |
| 151 | } | 120 | } |
| 152 | 121 | ||
| 153 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; | 122 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
| 154 | res = acct->active; | ||
| 155 | out: | 123 | out: |
| 156 | spin_unlock(&acct_lock); | 124 | return acct->active; |
| 125 | } | ||
| 126 | |||
| 127 | static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) | ||
| 128 | { | ||
| 129 | struct bsd_acct_struct *res; | ||
| 130 | again: | ||
| 131 | smp_rmb(); | ||
| 132 | rcu_read_lock(); | ||
| 133 | res = ACCESS_ONCE(ns->bacct); | ||
| 134 | if (!res) { | ||
| 135 | rcu_read_unlock(); | ||
| 136 | return NULL; | ||
| 137 | } | ||
| 138 | if (!atomic_long_inc_not_zero(&res->pin.count)) { | ||
| 139 | rcu_read_unlock(); | ||
| 140 | cpu_relax(); | ||
| 141 | goto again; | ||
| 142 | } | ||
| 143 | rcu_read_unlock(); | ||
| 144 | mutex_lock(&res->lock); | ||
| 145 | if (!res->ns) { | ||
| 146 | mutex_unlock(&res->lock); | ||
| 147 | pin_put(&res->pin); | ||
| 148 | goto again; | ||
| 149 | } | ||
| 157 | return res; | 150 | return res; |
| 158 | } | 151 | } |
| 159 | 152 | ||
| 160 | /* | 153 | static void close_work(struct work_struct *work) |
| 161 | * Close the old accounting file (if currently open) and then replace | 154 | { |
| 162 | * it with file (if non-NULL). | 155 | struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); |
| 163 | * | 156 | struct file *file = acct->file; |
| 164 | * NOTE: acct_lock MUST be held on entry and exit. | 157 | if (file->f_op->flush) |
| 165 | */ | 158 | file->f_op->flush(file, NULL); |
| 166 | static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | 159 | __fput_sync(file); |
| 167 | struct pid_namespace *ns) | 160 | complete(&acct->done); |
| 161 | } | ||
| 162 | |||
| 163 | static void acct_kill(struct bsd_acct_struct *acct, | ||
| 164 | struct bsd_acct_struct *new) | ||
| 168 | { | 165 | { |
| 169 | struct file *old_acct = NULL; | 166 | if (acct) { |
| 170 | struct pid_namespace *old_ns = NULL; | 167 | struct pid_namespace *ns = acct->ns; |
| 171 | 168 | do_acct_process(acct); | |
| 172 | if (acct->file) { | 169 | INIT_WORK(&acct->work, close_work); |
| 173 | old_acct = acct->file; | 170 | init_completion(&acct->done); |
| 174 | old_ns = acct->ns; | 171 | schedule_work(&acct->work); |
| 175 | acct->active = 0; | 172 | wait_for_completion(&acct->done); |
| 176 | acct->file = NULL; | 173 | pin_remove(&acct->pin); |
| 174 | ns->bacct = new; | ||
| 177 | acct->ns = NULL; | 175 | acct->ns = NULL; |
| 178 | list_del(&acct->list); | 176 | atomic_long_dec(&acct->pin.count); |
| 177 | mutex_unlock(&acct->lock); | ||
| 178 | pin_put(&acct->pin); | ||
| 179 | } | 179 | } |
| 180 | if (file) { | 180 | } |
| 181 | acct->file = file; | 181 | |
| 182 | acct->ns = ns; | 182 | static void acct_pin_kill(struct fs_pin *pin) |
| 183 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; | 183 | { |
| 184 | acct->active = 1; | 184 | struct bsd_acct_struct *acct; |
| 185 | list_add(&acct->list, &acct_list); | 185 | acct = container_of(pin, struct bsd_acct_struct, pin); |
| 186 | } | 186 | mutex_lock(&acct->lock); |
| 187 | if (old_acct) { | 187 | if (!acct->ns) { |
| 188 | mnt_unpin(old_acct->f_path.mnt); | 188 | mutex_unlock(&acct->lock); |
| 189 | spin_unlock(&acct_lock); | 189 | pin_put(pin); |
| 190 | do_acct_process(acct, old_ns, old_acct); | 190 | acct = NULL; |
| 191 | filp_close(old_acct, NULL); | ||
| 192 | spin_lock(&acct_lock); | ||
| 193 | } | 191 | } |
| 192 | acct_kill(acct, NULL); | ||
| 194 | } | 193 | } |
| 195 | 194 | ||
| 196 | static int acct_on(struct filename *pathname) | 195 | static int acct_on(struct filename *pathname) |
| 197 | { | 196 | { |
| 198 | struct file *file; | 197 | struct file *file; |
| 199 | struct vfsmount *mnt; | 198 | struct vfsmount *mnt, *internal; |
| 200 | struct pid_namespace *ns; | 199 | struct pid_namespace *ns = task_active_pid_ns(current); |
| 201 | struct bsd_acct_struct *acct = NULL; | 200 | struct bsd_acct_struct *acct, *old; |
| 201 | int err; | ||
| 202 | |||
| 203 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); | ||
| 204 | if (!acct) | ||
| 205 | return -ENOMEM; | ||
| 202 | 206 | ||
| 203 | /* Difference from BSD - they don't do O_APPEND */ | 207 | /* Difference from BSD - they don't do O_APPEND */ |
| 204 | file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | 208 | file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); |
| 205 | if (IS_ERR(file)) | 209 | if (IS_ERR(file)) { |
| 210 | kfree(acct); | ||
| 206 | return PTR_ERR(file); | 211 | return PTR_ERR(file); |
| 212 | } | ||
| 207 | 213 | ||
| 208 | if (!S_ISREG(file_inode(file)->i_mode)) { | 214 | if (!S_ISREG(file_inode(file)->i_mode)) { |
| 215 | kfree(acct); | ||
| 209 | filp_close(file, NULL); | 216 | filp_close(file, NULL); |
| 210 | return -EACCES; | 217 | return -EACCES; |
| 211 | } | 218 | } |
| 212 | 219 | ||
| 213 | if (!file->f_op->write) { | 220 | if (!file->f_op->write) { |
| 221 | kfree(acct); | ||
| 214 | filp_close(file, NULL); | 222 | filp_close(file, NULL); |
| 215 | return -EIO; | 223 | return -EIO; |
| 216 | } | 224 | } |
| 217 | 225 | internal = mnt_clone_internal(&file->f_path); | |
| 218 | ns = task_active_pid_ns(current); | 226 | if (IS_ERR(internal)) { |
| 219 | if (ns->bacct == NULL) { | 227 | kfree(acct); |
| 220 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); | 228 | filp_close(file, NULL); |
| 221 | if (acct == NULL) { | 229 | return PTR_ERR(internal); |
| 222 | filp_close(file, NULL); | ||
| 223 | return -ENOMEM; | ||
| 224 | } | ||
| 225 | } | 230 | } |
| 226 | 231 | err = mnt_want_write(internal); | |
| 227 | spin_lock(&acct_lock); | 232 | if (err) { |
| 228 | if (ns->bacct == NULL) { | 233 | mntput(internal); |
| 229 | ns->bacct = acct; | 234 | kfree(acct); |
| 230 | acct = NULL; | 235 | filp_close(file, NULL); |
| 236 | return err; | ||
| 231 | } | 237 | } |
| 232 | |||
| 233 | mnt = file->f_path.mnt; | 238 | mnt = file->f_path.mnt; |
| 234 | mnt_pin(mnt); | 239 | file->f_path.mnt = internal; |
| 235 | acct_file_reopen(ns->bacct, file, ns); | 240 | |
| 236 | spin_unlock(&acct_lock); | 241 | atomic_long_set(&acct->pin.count, 1); |
| 237 | 242 | acct->pin.kill = acct_pin_kill; | |
| 238 | mntput(mnt); /* it's pinned, now give up active reference */ | 243 | acct->file = file; |
| 239 | kfree(acct); | 244 | acct->needcheck = jiffies; |
| 240 | 245 | acct->ns = ns; | |
| 246 | mutex_init(&acct->lock); | ||
| 247 | mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ | ||
| 248 | pin_insert(&acct->pin, mnt); | ||
| 249 | |||
| 250 | old = acct_get(ns); | ||
| 251 | if (old) | ||
| 252 | acct_kill(old, acct); | ||
| 253 | else | ||
| 254 | ns->bacct = acct; | ||
| 255 | mutex_unlock(&acct->lock); | ||
| 256 | mnt_drop_write(mnt); | ||
| 257 | mntput(mnt); | ||
| 241 | return 0; | 258 | return 0; |
| 242 | } | 259 | } |
| 243 | 260 | ||
| 261 | static DEFINE_MUTEX(acct_on_mutex); | ||
| 262 | |||
| 244 | /** | 263 | /** |
| 245 | * sys_acct - enable/disable process accounting | 264 | * sys_acct - enable/disable process accounting |
| 246 | * @name: file name for accounting records or NULL to shutdown accounting | 265 | * @name: file name for accounting records or NULL to shutdown accounting |
| @@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
| 261 | 280 | ||
| 262 | if (name) { | 281 | if (name) { |
| 263 | struct filename *tmp = getname(name); | 282 | struct filename *tmp = getname(name); |
| 283 | |||
| 264 | if (IS_ERR(tmp)) | 284 | if (IS_ERR(tmp)) |
| 265 | return PTR_ERR(tmp); | 285 | return PTR_ERR(tmp); |
| 286 | mutex_lock(&acct_on_mutex); | ||
| 266 | error = acct_on(tmp); | 287 | error = acct_on(tmp); |
| 288 | mutex_unlock(&acct_on_mutex); | ||
| 267 | putname(tmp); | 289 | putname(tmp); |
| 268 | } else { | 290 | } else { |
| 269 | struct bsd_acct_struct *acct; | 291 | acct_kill(acct_get(task_active_pid_ns(current)), NULL); |
| 270 | |||
| 271 | acct = task_active_pid_ns(current)->bacct; | ||
| 272 | if (acct == NULL) | ||
| 273 | return 0; | ||
| 274 | |||
| 275 | spin_lock(&acct_lock); | ||
| 276 | acct_file_reopen(acct, NULL, NULL); | ||
| 277 | spin_unlock(&acct_lock); | ||
| 278 | } | 292 | } |
| 279 | 293 | ||
| 280 | return error; | 294 | return error; |
| 281 | } | 295 | } |
| 282 | 296 | ||
| 283 | /** | ||
| 284 | * acct_auto_close - turn off a filesystem's accounting if it is on | ||
| 285 | * @m: vfsmount being shut down | ||
| 286 | * | ||
| 287 | * If the accounting is turned on for a file in the subtree pointed to | ||
| 288 | * to by m, turn accounting off. Done when m is about to die. | ||
| 289 | */ | ||
| 290 | void acct_auto_close_mnt(struct vfsmount *m) | ||
| 291 | { | ||
| 292 | struct bsd_acct_struct *acct; | ||
| 293 | |||
| 294 | spin_lock(&acct_lock); | ||
| 295 | restart: | ||
| 296 | list_for_each_entry(acct, &acct_list, list) | ||
| 297 | if (acct->file && acct->file->f_path.mnt == m) { | ||
| 298 | acct_file_reopen(acct, NULL, NULL); | ||
| 299 | goto restart; | ||
| 300 | } | ||
| 301 | spin_unlock(&acct_lock); | ||
| 302 | } | ||
| 303 | |||
| 304 | /** | ||
| 305 | * acct_auto_close - turn off a filesystem's accounting if it is on | ||
| 306 | * @sb: super block for the filesystem | ||
| 307 | * | ||
| 308 | * If the accounting is turned on for a file in the filesystem pointed | ||
| 309 | * to by sb, turn accounting off. | ||
| 310 | */ | ||
| 311 | void acct_auto_close(struct super_block *sb) | ||
| 312 | { | ||
| 313 | struct bsd_acct_struct *acct; | ||
| 314 | |||
| 315 | spin_lock(&acct_lock); | ||
| 316 | restart: | ||
| 317 | list_for_each_entry(acct, &acct_list, list) | ||
| 318 | if (acct->file && acct->file->f_path.dentry->d_sb == sb) { | ||
| 319 | acct_file_reopen(acct, NULL, NULL); | ||
| 320 | goto restart; | ||
| 321 | } | ||
| 322 | spin_unlock(&acct_lock); | ||
| 323 | } | ||
| 324 | |||
| 325 | void acct_exit_ns(struct pid_namespace *ns) | 297 | void acct_exit_ns(struct pid_namespace *ns) |
| 326 | { | 298 | { |
| 327 | struct bsd_acct_struct *acct = ns->bacct; | 299 | acct_kill(acct_get(ns), NULL); |
| 328 | |||
| 329 | if (acct == NULL) | ||
| 330 | return; | ||
| 331 | |||
| 332 | spin_lock(&acct_lock); | ||
| 333 | if (acct->file != NULL) | ||
| 334 | acct_file_reopen(acct, NULL, NULL); | ||
| 335 | spin_unlock(&acct_lock); | ||
| 336 | |||
| 337 | kfree(acct); | ||
| 338 | } | 300 | } |
| 339 | 301 | ||
| 340 | /* | 302 | /* |
| @@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) | |||
| 376 | return exp; | 338 | return exp; |
| 377 | } | 339 | } |
| 378 | 340 | ||
| 379 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | 341 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 |
| 380 | /* | 342 | /* |
| 381 | * encode an u64 into a comp2_t (24 bits) | 343 | * encode an u64 into a comp2_t (24 bits) |
| 382 | * | 344 | * |
| @@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) | |||
| 389 | #define MANTSIZE2 20 /* 20 bit mantissa. */ | 351 | #define MANTSIZE2 20 /* 20 bit mantissa. */ |
| 390 | #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ | 352 | #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ |
| 391 | #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ | 353 | #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ |
| 392 | #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ | 354 | #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ |
| 393 | 355 | ||
| 394 | static comp2_t encode_comp2_t(u64 value) | 356 | static comp2_t encode_comp2_t(u64 value) |
| 395 | { | 357 | { |
| @@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value) | |||
| 420 | } | 382 | } |
| 421 | #endif | 383 | #endif |
| 422 | 384 | ||
| 423 | #if ACCT_VERSION==3 | 385 | #if ACCT_VERSION == 3 |
| 424 | /* | 386 | /* |
| 425 | * encode an u64 into a 32 bit IEEE float | 387 | * encode an u64 into a 32 bit IEEE float |
| 426 | */ | 388 | */ |
| @@ -429,8 +391,9 @@ static u32 encode_float(u64 value) | |||
| 429 | unsigned exp = 190; | 391 | unsigned exp = 190; |
| 430 | unsigned u; | 392 | unsigned u; |
| 431 | 393 | ||
| 432 | if (value==0) return 0; | 394 | if (value == 0) |
| 433 | while ((s64)value > 0){ | 395 | return 0; |
| 396 | while ((s64)value > 0) { | ||
| 434 | value <<= 1; | 397 | value <<= 1; |
| 435 | exp--; | 398 | exp--; |
| 436 | } | 399 | } |
| @@ -448,116 +411,116 @@ static u32 encode_float(u64 value) | |||
| 448 | * do_exit() or when switching to a different output file. | 411 | * do_exit() or when switching to a different output file. |
| 449 | */ | 412 | */ |
| 450 | 413 | ||
| 451 | /* | 414 | static void fill_ac(acct_t *ac) |
| 452 | * do_acct_process does all actual work. Caller holds the reference to file. | ||
| 453 | */ | ||
| 454 | static void do_acct_process(struct bsd_acct_struct *acct, | ||
| 455 | struct pid_namespace *ns, struct file *file) | ||
| 456 | { | 415 | { |
| 457 | struct pacct_struct *pacct = ¤t->signal->pacct; | 416 | struct pacct_struct *pacct = ¤t->signal->pacct; |
| 458 | acct_t ac; | ||
| 459 | mm_segment_t fs; | ||
| 460 | unsigned long flim; | ||
| 461 | u64 elapsed, run_time; | 417 | u64 elapsed, run_time; |
| 462 | struct tty_struct *tty; | 418 | struct tty_struct *tty; |
| 463 | const struct cred *orig_cred; | ||
| 464 | |||
| 465 | /* Perform file operations on behalf of whoever enabled accounting */ | ||
| 466 | orig_cred = override_creds(file->f_cred); | ||
| 467 | |||
| 468 | /* | ||
| 469 | * First check to see if there is enough free_space to continue | ||
| 470 | * the process accounting system. | ||
| 471 | */ | ||
| 472 | if (!check_free_space(acct, file)) | ||
| 473 | goto out; | ||
| 474 | 419 | ||
| 475 | /* | 420 | /* |
| 476 | * Fill the accounting struct with the needed info as recorded | 421 | * Fill the accounting struct with the needed info as recorded |
| 477 | * by the different kernel functions. | 422 | * by the different kernel functions. |
| 478 | */ | 423 | */ |
| 479 | memset(&ac, 0, sizeof(acct_t)); | 424 | memset(ac, 0, sizeof(acct_t)); |
| 480 | 425 | ||
| 481 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; | 426 | ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; |
| 482 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); | 427 | strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); |
| 483 | 428 | ||
| 484 | /* calculate run_time in nsec*/ | 429 | /* calculate run_time in nsec*/ |
| 485 | run_time = ktime_get_ns(); | 430 | run_time = ktime_get_ns(); |
| 486 | run_time -= current->group_leader->start_time; | 431 | run_time -= current->group_leader->start_time; |
| 487 | /* convert nsec -> AHZ */ | 432 | /* convert nsec -> AHZ */ |
| 488 | elapsed = nsec_to_AHZ(run_time); | 433 | elapsed = nsec_to_AHZ(run_time); |
| 489 | #if ACCT_VERSION==3 | 434 | #if ACCT_VERSION == 3 |
| 490 | ac.ac_etime = encode_float(elapsed); | 435 | ac->ac_etime = encode_float(elapsed); |
| 491 | #else | 436 | #else |
| 492 | ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? | 437 | ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? |
| 493 | (unsigned long) elapsed : (unsigned long) -1l); | 438 | (unsigned long) elapsed : (unsigned long) -1l); |
| 494 | #endif | 439 | #endif |
| 495 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | 440 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 |
| 496 | { | 441 | { |
| 497 | /* new enlarged etime field */ | 442 | /* new enlarged etime field */ |
| 498 | comp2_t etime = encode_comp2_t(elapsed); | 443 | comp2_t etime = encode_comp2_t(elapsed); |
| 499 | ac.ac_etime_hi = etime >> 16; | 444 | |
| 500 | ac.ac_etime_lo = (u16) etime; | 445 | ac->ac_etime_hi = etime >> 16; |
| 446 | ac->ac_etime_lo = (u16) etime; | ||
| 501 | } | 447 | } |
| 502 | #endif | 448 | #endif |
| 503 | do_div(elapsed, AHZ); | 449 | do_div(elapsed, AHZ); |
| 504 | ac.ac_btime = get_seconds() - elapsed; | 450 | ac->ac_btime = get_seconds() - elapsed; |
| 505 | /* we really need to bite the bullet and change layout */ | ||
| 506 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); | ||
| 507 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); | ||
| 508 | #if ACCT_VERSION==2 | 451 | #if ACCT_VERSION==2 |
| 509 | ac.ac_ahz = AHZ; | 452 | ac->ac_ahz = AHZ; |
| 510 | #endif | ||
| 511 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | ||
| 512 | /* backward-compatible 16 bit fields */ | ||
| 513 | ac.ac_uid16 = ac.ac_uid; | ||
| 514 | ac.ac_gid16 = ac.ac_gid; | ||
| 515 | #endif | ||
| 516 | #if ACCT_VERSION==3 | ||
| 517 | ac.ac_pid = task_tgid_nr_ns(current, ns); | ||
| 518 | rcu_read_lock(); | ||
| 519 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); | ||
| 520 | rcu_read_unlock(); | ||
| 521 | #endif | 453 | #endif |
| 522 | 454 | ||
| 523 | spin_lock_irq(¤t->sighand->siglock); | 455 | spin_lock_irq(¤t->sighand->siglock); |
| 524 | tty = current->signal->tty; /* Safe as we hold the siglock */ | 456 | tty = current->signal->tty; /* Safe as we hold the siglock */ |
| 525 | ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | 457 | ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; |
| 526 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | 458 | ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
| 527 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | 459 | ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
| 528 | ac.ac_flag = pacct->ac_flag; | 460 | ac->ac_flag = pacct->ac_flag; |
| 529 | ac.ac_mem = encode_comp_t(pacct->ac_mem); | 461 | ac->ac_mem = encode_comp_t(pacct->ac_mem); |
| 530 | ac.ac_minflt = encode_comp_t(pacct->ac_minflt); | 462 | ac->ac_minflt = encode_comp_t(pacct->ac_minflt); |
| 531 | ac.ac_majflt = encode_comp_t(pacct->ac_majflt); | 463 | ac->ac_majflt = encode_comp_t(pacct->ac_majflt); |
| 532 | ac.ac_exitcode = pacct->ac_exitcode; | 464 | ac->ac_exitcode = pacct->ac_exitcode; |
| 533 | spin_unlock_irq(¤t->sighand->siglock); | 465 | spin_unlock_irq(¤t->sighand->siglock); |
| 534 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ | 466 | } |
| 535 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); | 467 | /* |
| 536 | ac.ac_swaps = encode_comp_t(0); | 468 | * do_acct_process does all actual work. Caller holds the reference to file. |
| 469 | */ | ||
| 470 | static void do_acct_process(struct bsd_acct_struct *acct) | ||
| 471 | { | ||
| 472 | acct_t ac; | ||
| 473 | unsigned long flim; | ||
| 474 | const struct cred *orig_cred; | ||
| 475 | struct file *file = acct->file; | ||
| 537 | 476 | ||
| 538 | /* | 477 | /* |
| 539 | * Get freeze protection. If the fs is frozen, just skip the write | 478 | * Accounting records are not subject to resource limits. |
| 540 | * as we could deadlock the system otherwise. | ||
| 541 | */ | 479 | */ |
| 542 | if (!file_start_write_trylock(file)) | 480 | flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
| 543 | goto out; | 481 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; |
| 482 | /* Perform file operations on behalf of whoever enabled accounting */ | ||
| 483 | orig_cred = override_creds(file->f_cred); | ||
| 484 | |||
| 544 | /* | 485 | /* |
| 545 | * Kernel segment override to datasegment and write it | 486 | * First check to see if there is enough free_space to continue |
| 546 | * to the accounting file. | 487 | * the process accounting system. |
| 547 | */ | 488 | */ |
| 548 | fs = get_fs(); | 489 | if (!check_free_space(acct)) |
| 549 | set_fs(KERNEL_DS); | 490 | goto out; |
| 491 | |||
| 492 | fill_ac(&ac); | ||
| 493 | /* we really need to bite the bullet and change layout */ | ||
| 494 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); | ||
| 495 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); | ||
| 496 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 | ||
| 497 | /* backward-compatible 16 bit fields */ | ||
| 498 | ac.ac_uid16 = ac.ac_uid; | ||
| 499 | ac.ac_gid16 = ac.ac_gid; | ||
| 500 | #endif | ||
| 501 | #if ACCT_VERSION == 3 | ||
| 502 | { | ||
| 503 | struct pid_namespace *ns = acct->ns; | ||
| 504 | |||
| 505 | ac.ac_pid = task_tgid_nr_ns(current, ns); | ||
| 506 | rcu_read_lock(); | ||
| 507 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), | ||
| 508 | ns); | ||
| 509 | rcu_read_unlock(); | ||
| 510 | } | ||
| 511 | #endif | ||
| 550 | /* | 512 | /* |
| 551 | * Accounting records are not subject to resource limits. | 513 | * Get freeze protection. If the fs is frozen, just skip the write |
| 514 | * as we could deadlock the system otherwise. | ||
| 552 | */ | 515 | */ |
| 553 | flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 516 | if (file_start_write_trylock(file)) { |
| 554 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; | 517 | /* it's been opened O_APPEND, so position is irrelevant */ |
| 555 | file->f_op->write(file, (char *)&ac, | 518 | loff_t pos = 0; |
| 556 | sizeof(acct_t), &file->f_pos); | 519 | __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); |
| 557 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | 520 | file_end_write(file); |
| 558 | set_fs(fs); | 521 | } |
| 559 | file_end_write(file); | ||
| 560 | out: | 522 | out: |
| 523 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | ||
| 561 | revert_creds(orig_cred); | 524 | revert_creds(orig_cred); |
| 562 | } | 525 | } |
| 563 | 526 | ||
| @@ -574,6 +537,7 @@ void acct_collect(long exitcode, int group_dead) | |||
| 574 | 537 | ||
| 575 | if (group_dead && current->mm) { | 538 | if (group_dead && current->mm) { |
| 576 | struct vm_area_struct *vma; | 539 | struct vm_area_struct *vma; |
| 540 | |||
| 577 | down_read(¤t->mm->mmap_sem); | 541 | down_read(¤t->mm->mmap_sem); |
| 578 | vma = current->mm->mmap; | 542 | vma = current->mm->mmap; |
| 579 | while (vma) { | 543 | while (vma) { |
| @@ -605,34 +569,20 @@ void acct_collect(long exitcode, int group_dead) | |||
| 605 | spin_unlock_irq(¤t->sighand->siglock); | 569 | spin_unlock_irq(¤t->sighand->siglock); |
| 606 | } | 570 | } |
| 607 | 571 | ||
| 608 | static void acct_process_in_ns(struct pid_namespace *ns) | 572 | static void slow_acct_process(struct pid_namespace *ns) |
| 609 | { | 573 | { |
| 610 | struct file *file = NULL; | 574 | for ( ; ns; ns = ns->parent) { |
| 611 | struct bsd_acct_struct *acct; | 575 | struct bsd_acct_struct *acct = acct_get(ns); |
| 612 | 576 | if (acct) { | |
| 613 | acct = ns->bacct; | 577 | do_acct_process(acct); |
| 614 | /* | 578 | mutex_unlock(&acct->lock); |
| 615 | * accelerate the common fastpath: | 579 | pin_put(&acct->pin); |
| 616 | */ | 580 | } |
| 617 | if (!acct || !acct->file) | ||
| 618 | return; | ||
| 619 | |||
| 620 | spin_lock(&acct_lock); | ||
| 621 | file = acct->file; | ||
| 622 | if (unlikely(!file)) { | ||
| 623 | spin_unlock(&acct_lock); | ||
| 624 | return; | ||
| 625 | } | 581 | } |
| 626 | get_file(file); | ||
| 627 | spin_unlock(&acct_lock); | ||
| 628 | |||
| 629 | do_acct_process(acct, ns, file); | ||
| 630 | fput(file); | ||
| 631 | } | 582 | } |
| 632 | 583 | ||
| 633 | /** | 584 | /** |
| 634 | * acct_process - now just a wrapper around acct_process_in_ns, | 585 | * acct_process |
| 635 | * which in turn is a wrapper around do_acct_process. | ||
| 636 | * | 586 | * |
| 637 | * handles process accounting for an exiting task | 587 | * handles process accounting for an exiting task |
| 638 | */ | 588 | */ |
| @@ -645,6 +595,10 @@ void acct_process(void) | |||
| 645 | * alive and holds its namespace, which in turn holds | 595 | * alive and holds its namespace, which in turn holds |
| 646 | * its parent. | 596 | * its parent. |
| 647 | */ | 597 | */ |
| 648 | for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) | 598 | for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { |
| 649 | acct_process_in_ns(ns); | 599 | if (ns->bacct) |
| 600 | break; | ||
| 601 | } | ||
| 602 | if (unlikely(ns)) | ||
| 603 | slow_acct_process(ns); | ||
| 650 | } | 604 | } |
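
The kernel/acct.c rework above drops the global acct_lock/acct_list in favour of a per-namespace bsd_acct_struct that acct_get() looks up with an "increment the pin count only if it is still non-zero, otherwise retry" pattern. As a rough, hedged illustration of that idiom outside the kernel, here is a minimal userspace sketch using C11 atomics; get_if_live(), acct_get_like(), struct pinned and struct owner are invented names for the example, and the real code's RCU read-side locking and mutex handshake are left out.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pinned {
	atomic_long count;		/* 0 means the object is being torn down */
};

struct owner {
	_Atomic(struct pinned *) current;	/* published pointer, may be swapped */
};

/* Take a reference only if the object is still live ("inc if not zero"). */
static bool get_if_live(struct pinned *p)
{
	long c = atomic_load(&p->count);

	while (c != 0) {
		if (atomic_compare_exchange_weak(&p->count, &c, c + 1))
			return true;	/* reference taken */
	}
	return false;			/* already dying, caller must retry */
}

static struct pinned *acct_get_like(struct owner *o)
{
	for (;;) {
		struct pinned *p = atomic_load(&o->current);

		if (!p)
			return NULL;	/* accounting is off */
		if (get_if_live(p))
			return p;	/* stable reference */
		/* lost the race with teardown; reload and try again */
	}
}

int main(void)
{
	struct pinned obj = { .count = 1 };
	struct owner ns = { .current = &obj };
	struct pinned *ref = acct_get_like(&ns);

	printf("got reference: %s\n", ref ? "yes" : "no");
	return 0;
}
```

The point of the retry loop is that a reader who loses the race with teardown simply reloads the published pointer instead of taking a global lock, which is what lets the patch delete acct_lock and acct_list entirely.
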
diff --git a/kernel/async.c b/kernel/async.c index 61f023ce0228..4c3773c0bf63 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
| @@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
| 115 | 115 | ||
| 116 | /* 1) run (and print duration) */ | 116 | /* 1) run (and print duration) */ |
| 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 118 | printk(KERN_DEBUG "calling %lli_%pF @ %i\n", | 118 | pr_debug("calling %lli_%pF @ %i\n", |
| 119 | (long long)entry->cookie, | 119 | (long long)entry->cookie, |
| 120 | entry->func, task_pid_nr(current)); | 120 | entry->func, task_pid_nr(current)); |
| 121 | calltime = ktime_get(); | 121 | calltime = ktime_get(); |
| @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
| 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 125 | rettime = ktime_get(); | 125 | rettime = ktime_get(); |
| 126 | delta = ktime_sub(rettime, calltime); | 126 | delta = ktime_sub(rettime, calltime); |
| 127 | printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", | 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", |
| 128 | (long long)entry->cookie, | 128 | (long long)entry->cookie, |
| 129 | entry->func, | 129 | entry->func, |
| 130 | (long long)ktime_to_ns(delta) >> 10); | 130 | (long long)ktime_to_ns(delta) >> 10); |
| @@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
| 285 | ktime_t uninitialized_var(starttime), delta, endtime; | 285 | ktime_t uninitialized_var(starttime), delta, endtime; |
| 286 | 286 | ||
| 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 288 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); |
| 289 | starttime = ktime_get(); | 289 | starttime = ktime_get(); |
| 290 | } | 290 | } |
| 291 | 291 | ||
| @@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
| 295 | endtime = ktime_get(); | 295 | endtime = ktime_get(); |
| 296 | delta = ktime_sub(endtime, starttime); | 296 | delta = ktime_sub(endtime, starttime); |
| 297 | 297 | ||
| 298 | printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", | 298 | pr_debug("async_continuing @ %i after %lli usec\n", |
| 299 | task_pid_nr(current), | 299 | task_pid_nr(current), |
| 300 | (long long)ktime_to_ns(delta) >> 10); | 300 | (long long)ktime_to_ns(delta) >> 10); |
| 301 | } | 301 | } |
diff --git a/kernel/audit.c b/kernel/audit.c index ba2ff5a5c600..cebb11db4d34 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
| @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); | |||
| 126 | 126 | ||
| 127 | /* The netlink socket. */ | 127 | /* The netlink socket. */ |
| 128 | static struct sock *audit_sock; | 128 | static struct sock *audit_sock; |
| 129 | int audit_net_id; | 129 | static int audit_net_id; |
| 130 | 130 | ||
| 131 | /* Hash for inode-based rules */ | 131 | /* Hash for inode-based rules */ |
| 132 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; | 132 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; |
| @@ -724,7 +724,7 @@ static int audit_get_feature(struct sk_buff *skb) | |||
| 724 | 724 | ||
| 725 | seq = nlmsg_hdr(skb)->nlmsg_seq; | 725 | seq = nlmsg_hdr(skb)->nlmsg_seq; |
| 726 | 726 | ||
| 727 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); | 727 | audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); |
| 728 | 728 | ||
| 729 | return 0; | 729 | return 0; |
| 730 | } | 730 | } |
| @@ -739,7 +739,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature | |||
| 739 | 739 | ||
| 740 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); | 740 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); |
| 741 | audit_log_task_info(ab, current); | 741 | audit_log_task_info(ab, current); |
| 742 | audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", | 742 | audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", |
| 743 | audit_feature_names[which], !!old_feature, !!new_feature, | 743 | audit_feature_names[which], !!old_feature, !!new_feature, |
| 744 | !!old_lock, !!new_lock, res); | 744 | !!old_lock, !!new_lock, res); |
| 745 | audit_log_end(ab); | 745 | audit_log_end(ab); |
| @@ -750,7 +750,7 @@ static int audit_set_feature(struct sk_buff *skb) | |||
| 750 | struct audit_features *uaf; | 750 | struct audit_features *uaf; |
| 751 | int i; | 751 | int i; |
| 752 | 752 | ||
| 753 | BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); | 753 | BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); |
| 754 | uaf = nlmsg_data(nlmsg_hdr(skb)); | 754 | uaf = nlmsg_data(nlmsg_hdr(skb)); |
| 755 | 755 | ||
| 756 | /* if there is ever a version 2 we should handle that here */ | 756 | /* if there is ever a version 2 we should handle that here */ |
| @@ -1301,19 +1301,9 @@ err: | |||
| 1301 | */ | 1301 | */ |
| 1302 | unsigned int audit_serial(void) | 1302 | unsigned int audit_serial(void) |
| 1303 | { | 1303 | { |
| 1304 | static DEFINE_SPINLOCK(serial_lock); | 1304 | static atomic_t serial = ATOMIC_INIT(0); |
| 1305 | static unsigned int serial = 0; | ||
| 1306 | 1305 | ||
| 1307 | unsigned long flags; | 1306 | return atomic_add_return(1, &serial); |
| 1308 | unsigned int ret; | ||
| 1309 | |||
| 1310 | spin_lock_irqsave(&serial_lock, flags); | ||
| 1311 | do { | ||
| 1312 | ret = ++serial; | ||
| 1313 | } while (unlikely(!ret)); | ||
| 1314 | spin_unlock_irqrestore(&serial_lock, flags); | ||
| 1315 | |||
| 1316 | return ret; | ||
| 1317 | } | 1307 | } |
| 1318 | 1308 | ||
| 1319 | static inline void audit_get_stamp(struct audit_context *ctx, | 1309 | static inline void audit_get_stamp(struct audit_context *ctx, |
| @@ -1681,7 +1671,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | |||
| 1681 | } | 1671 | } |
| 1682 | } | 1672 | } |
| 1683 | 1673 | ||
| 1684 | void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | 1674 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) |
| 1685 | { | 1675 | { |
| 1686 | kernel_cap_t *perm = &name->fcap.permitted; | 1676 | kernel_cap_t *perm = &name->fcap.permitted; |
| 1687 | kernel_cap_t *inh = &name->fcap.inheritable; | 1677 | kernel_cap_t *inh = &name->fcap.inheritable; |
| @@ -1860,7 +1850,7 @@ EXPORT_SYMBOL(audit_log_task_context); | |||
| 1860 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 1850 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
| 1861 | { | 1851 | { |
| 1862 | const struct cred *cred; | 1852 | const struct cred *cred; |
| 1863 | char name[sizeof(tsk->comm)]; | 1853 | char comm[sizeof(tsk->comm)]; |
| 1864 | struct mm_struct *mm = tsk->mm; | 1854 | struct mm_struct *mm = tsk->mm; |
| 1865 | char *tty; | 1855 | char *tty; |
| 1866 | 1856 | ||
| @@ -1894,9 +1884,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
| 1894 | from_kgid(&init_user_ns, cred->fsgid), | 1884 | from_kgid(&init_user_ns, cred->fsgid), |
| 1895 | tty, audit_get_sessionid(tsk)); | 1885 | tty, audit_get_sessionid(tsk)); |
| 1896 | 1886 | ||
| 1897 | get_task_comm(name, tsk); | ||
| 1898 | audit_log_format(ab, " comm="); | 1887 | audit_log_format(ab, " comm="); |
| 1899 | audit_log_untrustedstring(ab, name); | 1888 | audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); |
| 1900 | 1889 | ||
| 1901 | if (mm) { | 1890 | if (mm) { |
| 1902 | down_read(&mm->mmap_sem); | 1891 | down_read(&mm->mmap_sem); |
| @@ -1959,6 +1948,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1959 | } else { | 1948 | } else { |
| 1960 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1949 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
| 1961 | 1950 | ||
| 1951 | nlh->nlmsg_len = ab->skb->len; | ||
| 1962 | kauditd_send_multicast_skb(ab->skb); | 1952 | kauditd_send_multicast_skb(ab->skb); |
| 1963 | 1953 | ||
| 1964 | /* | 1954 | /* |
| @@ -1970,7 +1960,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1970 | * protocol between the kaudit kernel subsystem and the auditd | 1960 | * protocol between the kaudit kernel subsystem and the auditd |
| 1971 | * userspace code. | 1961 | * userspace code. |
| 1972 | */ | 1962 | */ |
| 1973 | nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; | 1963 | nlh->nlmsg_len -= NLMSG_HDRLEN; |
| 1974 | 1964 | ||
| 1975 | if (audit_pid) { | 1965 | if (audit_pid) { |
| 1976 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1966 | skb_queue_tail(&audit_skb_queue, ab->skb); |
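
In the kernel/audit.c hunk above, audit_serial() replaces a spinlock-protected counter with a single atomic_add_return(); note from the removed lines that the old loop skipped 0 on wrap-around, while the plain atomic version does not. Below is a minimal userspace sketch of the same pattern with C11 atomics; next_serial() is an invented name used only for this example.

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint serial;

static unsigned int next_serial(void)
{
	/* atomic_fetch_add() returns the previous value, so adding 1
	 * mirrors the kernel's atomic_add_return() semantics. */
	return atomic_fetch_add(&serial, 1) + 1;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("serial %u\n", next_serial());
	return 0;
}
```
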
diff --git a/kernel/audit.h b/kernel/audit.h index 7bb65730c890..3cdffad5a1d9 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
| @@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name, | |||
| 222 | const struct inode *inode); | 222 | const struct inode *inode); |
| 223 | extern void audit_log_cap(struct audit_buffer *ab, char *prefix, | 223 | extern void audit_log_cap(struct audit_buffer *ab, char *prefix, |
| 224 | kernel_cap_t *cap); | 224 | kernel_cap_t *cap); |
| 225 | extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); | ||
| 226 | extern void audit_log_name(struct audit_context *context, | 225 | extern void audit_log_name(struct audit_context *context, |
| 227 | struct audit_names *n, struct path *path, | 226 | struct audit_names *n, struct path *path, |
| 228 | int record_num, int *call_panic); | 227 | int record_num, int *call_panic); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 135944a7b28a..80f29e015570 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
| @@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) | |||
| 154 | chunk->owners[i].index = i; | 154 | chunk->owners[i].index = i; |
| 155 | } | 155 | } |
| 156 | fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); | 156 | fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); |
| 157 | chunk->mark.mask = FS_IN_IGNORED; | ||
| 157 | return chunk; | 158 | return chunk; |
| 158 | } | 159 | } |
| 159 | 160 | ||
| @@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 449 | return 0; | 450 | return 0; |
| 450 | } | 451 | } |
| 451 | 452 | ||
| 452 | static void audit_log_remove_rule(struct audit_krule *rule) | 453 | static void audit_tree_log_remove_rule(struct audit_krule *rule) |
| 453 | { | 454 | { |
| 454 | struct audit_buffer *ab; | 455 | struct audit_buffer *ab; |
| 455 | 456 | ||
| @@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule) | |||
| 457 | if (unlikely(!ab)) | 458 | if (unlikely(!ab)) |
| 458 | return; | 459 | return; |
| 459 | audit_log_format(ab, "op="); | 460 | audit_log_format(ab, "op="); |
| 460 | audit_log_string(ab, "remove rule"); | 461 | audit_log_string(ab, "remove_rule"); |
| 461 | audit_log_format(ab, " dir="); | 462 | audit_log_format(ab, " dir="); |
| 462 | audit_log_untrustedstring(ab, rule->tree->pathname); | 463 | audit_log_untrustedstring(ab, rule->tree->pathname); |
| 463 | audit_log_key(ab, rule->filterkey); | 464 | audit_log_key(ab, rule->filterkey); |
| @@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree) | |||
| 476 | list_del_init(&rule->rlist); | 477 | list_del_init(&rule->rlist); |
| 477 | if (rule->tree) { | 478 | if (rule->tree) { |
| 478 | /* not a half-baked one */ | 479 | /* not a half-baked one */ |
| 479 | audit_log_remove_rule(rule); | 480 | audit_tree_log_remove_rule(rule); |
| 480 | rule->tree = NULL; | 481 | rule->tree = NULL; |
| 481 | list_del_rcu(&entry->list); | 482 | list_del_rcu(&entry->list); |
| 482 | list_del(&entry->rule.list); | 483 | list_del(&entry->rule.list); |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 70b4554d2fbe..ad9c1682f616 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c | |||
| @@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 314 | &nentry->rule.list); | 314 | &nentry->rule.list); |
| 315 | } | 315 | } |
| 316 | 316 | ||
| 317 | audit_watch_log_rule_change(r, owatch, "updated rules"); | 317 | audit_watch_log_rule_change(r, owatch, "updated_rules"); |
| 318 | 318 | ||
| 319 | call_rcu(&oentry->rcu, audit_free_rule_rcu); | 319 | call_rcu(&oentry->rcu, audit_free_rule_rcu); |
| 320 | } | 320 | } |
| @@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
| 342 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | 342 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { |
| 343 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 343 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
| 344 | e = container_of(r, struct audit_entry, rule); | 344 | e = container_of(r, struct audit_entry, rule); |
| 345 | audit_watch_log_rule_change(r, w, "remove rule"); | 345 | audit_watch_log_rule_change(r, w, "remove_rule"); |
| 346 | list_del(&r->rlist); | 346 | list_del(&r->rlist); |
| 347 | list_del(&r->list); | 347 | list_del(&r->list); |
| 348 | list_del_rcu(&e->list); | 348 | list_del_rcu(&e->list); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..3598e13f2a65 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
| @@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { | |||
| 71 | 71 | ||
| 72 | DEFINE_MUTEX(audit_filter_mutex); | 72 | DEFINE_MUTEX(audit_filter_mutex); |
| 73 | 73 | ||
| 74 | static void audit_free_lsm_field(struct audit_field *f) | ||
| 75 | { | ||
| 76 | switch (f->type) { | ||
| 77 | case AUDIT_SUBJ_USER: | ||
| 78 | case AUDIT_SUBJ_ROLE: | ||
| 79 | case AUDIT_SUBJ_TYPE: | ||
| 80 | case AUDIT_SUBJ_SEN: | ||
| 81 | case AUDIT_SUBJ_CLR: | ||
| 82 | case AUDIT_OBJ_USER: | ||
| 83 | case AUDIT_OBJ_ROLE: | ||
| 84 | case AUDIT_OBJ_TYPE: | ||
| 85 | case AUDIT_OBJ_LEV_LOW: | ||
| 86 | case AUDIT_OBJ_LEV_HIGH: | ||
| 87 | kfree(f->lsm_str); | ||
| 88 | security_audit_rule_free(f->lsm_rule); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
| 74 | static inline void audit_free_rule(struct audit_entry *e) | 92 | static inline void audit_free_rule(struct audit_entry *e) |
| 75 | { | 93 | { |
| 76 | int i; | 94 | int i; |
| @@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
| 80 | if (erule->watch) | 98 | if (erule->watch) |
| 81 | audit_put_watch(erule->watch); | 99 | audit_put_watch(erule->watch); |
| 82 | if (erule->fields) | 100 | if (erule->fields) |
| 83 | for (i = 0; i < erule->field_count; i++) { | 101 | for (i = 0; i < erule->field_count; i++) |
| 84 | struct audit_field *f = &erule->fields[i]; | 102 | audit_free_lsm_field(&erule->fields[i]); |
| 85 | kfree(f->lsm_str); | ||
| 86 | security_audit_rule_free(f->lsm_rule); | ||
| 87 | } | ||
| 88 | kfree(erule->fields); | 103 | kfree(erule->fields); |
| 89 | kfree(erule->filterkey); | 104 | kfree(erule->filterkey); |
| 90 | kfree(e); | 105 | kfree(e); |
| @@ -106,7 +121,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) | |||
| 106 | if (unlikely(!entry)) | 121 | if (unlikely(!entry)) |
| 107 | return NULL; | 122 | return NULL; |
| 108 | 123 | ||
| 109 | fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); | 124 | fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); |
| 110 | if (unlikely(!fields)) { | 125 | if (unlikely(!fields)) { |
| 111 | kfree(entry); | 126 | kfree(entry); |
| 112 | return NULL; | 127 | return NULL; |
| @@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule, | |||
| 148 | struct audit_field *f) | 163 | struct audit_field *f) |
| 149 | { | 164 | { |
| 150 | if (krule->listnr != AUDIT_FILTER_EXIT || | 165 | if (krule->listnr != AUDIT_FILTER_EXIT || |
| 151 | krule->watch || krule->inode_f || krule->tree || | 166 | krule->inode_f || krule->watch || krule->tree || |
| 152 | (f->op != Audit_equal && f->op != Audit_not_equal)) | 167 | (f->op != Audit_equal && f->op != Audit_not_equal)) |
| 153 | return -EINVAL; | 168 | return -EINVAL; |
| 154 | 169 | ||
| @@ -160,7 +175,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; | |||
| 160 | 175 | ||
| 161 | int __init audit_register_class(int class, unsigned *list) | 176 | int __init audit_register_class(int class, unsigned *list) |
| 162 | { | 177 | { |
| 163 | __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); | 178 | __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); |
| 164 | if (!p) | 179 | if (!p) |
| 165 | return -ENOMEM; | 180 | return -ENOMEM; |
| 166 | while (*list != ~0U) { | 181 | while (*list != ~0U) { |
| @@ -422,10 +437,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 422 | 437 | ||
| 423 | f->type = data->fields[i]; | 438 | f->type = data->fields[i]; |
| 424 | f->val = data->values[i]; | 439 | f->val = data->values[i]; |
| 425 | f->uid = INVALID_UID; | ||
| 426 | f->gid = INVALID_GID; | ||
| 427 | f->lsm_str = NULL; | ||
| 428 | f->lsm_rule = NULL; | ||
| 429 | 440 | ||
| 430 | /* Support legacy tests for a valid loginuid */ | 441 | /* Support legacy tests for a valid loginuid */ |
| 431 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { | 442 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { |
| @@ -1053,30 +1064,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, | |||
| 1053 | int err = 0; | 1064 | int err = 0; |
| 1054 | struct audit_entry *entry; | 1065 | struct audit_entry *entry; |
| 1055 | 1066 | ||
| 1067 | entry = audit_data_to_entry(data, datasz); | ||
| 1068 | if (IS_ERR(entry)) | ||
| 1069 | return PTR_ERR(entry); | ||
| 1070 | |||
| 1056 | switch (type) { | 1071 | switch (type) { |
| 1057 | case AUDIT_ADD_RULE: | 1072 | case AUDIT_ADD_RULE: |
| 1058 | entry = audit_data_to_entry(data, datasz); | ||
| 1059 | if (IS_ERR(entry)) | ||
| 1060 | return PTR_ERR(entry); | ||
| 1061 | |||
| 1062 | err = audit_add_rule(entry); | 1073 | err = audit_add_rule(entry); |
| 1063 | audit_log_rule_change("add rule", &entry->rule, !err); | 1074 | audit_log_rule_change("add_rule", &entry->rule, !err); |
| 1064 | if (err) | ||
| 1065 | audit_free_rule(entry); | ||
| 1066 | break; | 1075 | break; |
| 1067 | case AUDIT_DEL_RULE: | 1076 | case AUDIT_DEL_RULE: |
| 1068 | entry = audit_data_to_entry(data, datasz); | ||
| 1069 | if (IS_ERR(entry)) | ||
| 1070 | return PTR_ERR(entry); | ||
| 1071 | |||
| 1072 | err = audit_del_rule(entry); | 1077 | err = audit_del_rule(entry); |
| 1073 | audit_log_rule_change("remove rule", &entry->rule, !err); | 1078 | audit_log_rule_change("remove_rule", &entry->rule, !err); |
| 1074 | audit_free_rule(entry); | ||
| 1075 | break; | 1079 | break; |
| 1076 | default: | 1080 | default: |
| 1077 | return -EINVAL; | 1081 | err = -EINVAL; |
| 1082 | WARN_ON(1); | ||
| 1078 | } | 1083 | } |
| 1079 | 1084 | ||
| 1085 | if (err || type == AUDIT_DEL_RULE) | ||
| 1086 | audit_free_rule(entry); | ||
| 1087 | |||
| 1080 | return err; | 1088 | return err; |
| 1081 | } | 1089 | } |
| 1082 | 1090 | ||
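
The auditfilter.c change above restructures audit_rule_change() so the rule data is converted to an entry once, before the switch, and the entry is freed whenever an error occurred or the operation was AUDIT_DEL_RULE. Here is a hedged userspace sketch of that ownership flow; parse_rule(), add_rule(), del_rule() and struct rule are invented stand-ins for the kernel helpers, not the real API.

```c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct rule { int id; };
enum op { OP_ADD, OP_DEL };

static struct rule *installed;	/* stands in for the kernel's rule lists */

static struct rule *parse_rule(int id)
{
	struct rule *r = malloc(sizeof(*r));

	if (r)
		r->id = id;
	return r;
}

static int add_rule(struct rule *r)
{
	installed = r;			/* on success, ownership moves here */
	return 0;
}

static int del_rule(struct rule *r)
{
	if (!installed || installed->id != r->id)
		return -ENOENT;
	free(installed);		/* drop the installed copy */
	installed = NULL;
	return 0;
}

static int rule_change(enum op op, int id)
{
	struct rule *r = parse_rule(id);	/* parse once, before dispatching */
	int err;

	if (!r)
		return -ENOMEM;

	switch (op) {
	case OP_ADD:
		err = add_rule(r);
		break;
	case OP_DEL:
		err = del_rule(r);
		break;
	default:
		err = -EINVAL;
		break;
	}

	/* free the parsed copy on failure, or always for a delete request */
	if (err || op == OP_DEL)
		free(r);

	printf("op=%d err=%d\n", op, err);
	return err;
}

int main(void)
{
	rule_change(OP_ADD, 1);
	rule_change(OP_DEL, 1);
	return 0;
}
```
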
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7208c1df248d..e420a0c41b5f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -67,6 +67,7 @@ | |||
| 67 | #include <linux/binfmts.h> | 67 | #include <linux/binfmts.h> |
| 68 | #include <linux/highmem.h> | 68 | #include <linux/highmem.h> |
| 69 | #include <linux/syscalls.h> | 69 | #include <linux/syscalls.h> |
| 70 | #include <asm/syscall.h> | ||
| 70 | #include <linux/capability.h> | 71 | #include <linux/capability.h> |
| 71 | #include <linux/fs_struct.h> | 72 | #include <linux/fs_struct.h> |
| 72 | #include <linux/compat.h> | 73 | #include <linux/compat.h> |
| @@ -125,14 +126,6 @@ struct audit_tree_refs { | |||
| 125 | struct audit_chunk *c[31]; | 126 | struct audit_chunk *c[31]; |
| 126 | }; | 127 | }; |
| 127 | 128 | ||
| 128 | static inline int open_arg(int flags, int mask) | ||
| 129 | { | ||
| 130 | int n = ACC_MODE(flags); | ||
| 131 | if (flags & (O_TRUNC | O_CREAT)) | ||
| 132 | n |= AUDIT_PERM_WRITE; | ||
| 133 | return n & mask; | ||
| 134 | } | ||
| 135 | |||
| 136 | static int audit_match_perm(struct audit_context *ctx, int mask) | 129 | static int audit_match_perm(struct audit_context *ctx, int mask) |
| 137 | { | 130 | { |
| 138 | unsigned n; | 131 | unsigned n; |
| @@ -1505,7 +1498,6 @@ void __audit_free(struct task_struct *tsk) | |||
| 1505 | 1498 | ||
| 1506 | /** | 1499 | /** |
| 1507 | * audit_syscall_entry - fill in an audit record at syscall entry | 1500 | * audit_syscall_entry - fill in an audit record at syscall entry |
| 1508 | * @arch: architecture type | ||
| 1509 | * @major: major syscall type (function) | 1501 | * @major: major syscall type (function) |
| 1510 | * @a1: additional syscall register 1 | 1502 | * @a1: additional syscall register 1 |
| 1511 | * @a2: additional syscall register 2 | 1503 | * @a2: additional syscall register 2 |
| @@ -1520,9 +1512,8 @@ void __audit_free(struct task_struct *tsk) | |||
| 1520 | * will only be written if another part of the kernel requests that it | 1512 | * will only be written if another part of the kernel requests that it |
| 1521 | * be written). | 1513 | * be written). |
| 1522 | */ | 1514 | */ |
| 1523 | void __audit_syscall_entry(int arch, int major, | 1515 | void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, |
| 1524 | unsigned long a1, unsigned long a2, | 1516 | unsigned long a3, unsigned long a4) |
| 1525 | unsigned long a3, unsigned long a4) | ||
| 1526 | { | 1517 | { |
| 1527 | struct task_struct *tsk = current; | 1518 | struct task_struct *tsk = current; |
| 1528 | struct audit_context *context = tsk->audit_context; | 1519 | struct audit_context *context = tsk->audit_context; |
| @@ -1536,7 +1527,7 @@ void __audit_syscall_entry(int arch, int major, | |||
| 1536 | if (!audit_enabled) | 1527 | if (!audit_enabled) |
| 1537 | return; | 1528 | return; |
| 1538 | 1529 | ||
| 1539 | context->arch = arch; | 1530 | context->arch = syscall_get_arch(); |
| 1540 | context->major = major; | 1531 | context->major = major; |
| 1541 | context->argv[0] = a1; | 1532 | context->argv[0] = a1; |
| 1542 | context->argv[1] = a2; | 1533 | context->argv[1] = a2; |
| @@ -2433,6 +2424,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2433 | kgid_t gid; | 2424 | kgid_t gid; |
| 2434 | unsigned int sessionid; | 2425 | unsigned int sessionid; |
| 2435 | struct mm_struct *mm = current->mm; | 2426 | struct mm_struct *mm = current->mm; |
| 2427 | char comm[sizeof(current->comm)]; | ||
| 2436 | 2428 | ||
| 2437 | auid = audit_get_loginuid(current); | 2429 | auid = audit_get_loginuid(current); |
| 2438 | sessionid = audit_get_sessionid(current); | 2430 | sessionid = audit_get_sessionid(current); |
| @@ -2445,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2445 | sessionid); | 2437 | sessionid); |
| 2446 | audit_log_task_context(ab); | 2438 | audit_log_task_context(ab); |
| 2447 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); | 2439 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); |
| 2448 | audit_log_untrustedstring(ab, current->comm); | 2440 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); |
| 2449 | if (mm) { | 2441 | if (mm) { |
| 2450 | down_read(&mm->mmap_sem); | 2442 | down_read(&mm->mmap_sem); |
| 2451 | if (mm->exe_file) | 2443 | if (mm->exe_file) |
| @@ -2488,11 +2480,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) | |||
| 2488 | if (unlikely(!ab)) | 2480 | if (unlikely(!ab)) |
| 2489 | return; | 2481 | return; |
| 2490 | audit_log_task(ab); | 2482 | audit_log_task(ab); |
| 2491 | audit_log_format(ab, " sig=%ld", signr); | 2483 | audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", |
| 2492 | audit_log_format(ab, " syscall=%ld", syscall); | 2484 | signr, syscall_get_arch(), syscall, is_compat_task(), |
| 2493 | audit_log_format(ab, " compat=%d", is_compat_task()); | 2485 | KSTK_EIP(current), code); |
| 2494 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
| 2495 | audit_log_format(ab, " code=0x%x", code); | ||
| 2496 | audit_log_end(ab); | 2486 | audit_log_end(ab); |
| 2497 | } | 2487 | } |
| 2498 | 2488 | ||
diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | #include <linux/page-flags.h> | 9 | #include <linux/page-flags.h> |
| 10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
| 11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
| 12 | #include <linux/page_cgroup.h> | ||
| 13 | #include <linux/log2.h> | 12 | #include <linux/log2.h> |
| 14 | #include <linux/spinlock_types.h> | 13 | #include <linux/spinlock_types.h> |
| 15 | 14 | ||
| @@ -18,7 +17,6 @@ void foo(void) | |||
| 18 | /* The enum constants to put into include/generated/bounds.h */ | 17 | /* The enum constants to put into include/generated/bounds.h */ |
| 19 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 18 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
| 20 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 19 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
| 21 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | ||
| 22 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
| 23 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); | 21 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); |
| 24 | #endif | 22 | #endif |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..0daf7f6ae7df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile | |||
| @@ -1 +1,5 @@ | |||
| 1 | obj-y := core.o | 1 | obj-y := core.o |
| 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o | ||
| 3 | ifdef CONFIG_TEST_BPF | ||
| 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | ||
| 5 | endif | ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -20,9 +20,14 @@ | |||
| 20 | * Andi Kleen - Fix a few bad bugs and races. | 20 | * Andi Kleen - Fix a few bad bugs and races. |
| 21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() | 21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() |
| 22 | */ | 22 | */ |
| 23 | |||
| 23 | #include <linux/filter.h> | 24 | #include <linux/filter.h> |
| 24 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
| 26 | #include <linux/vmalloc.h> | ||
| 27 | #include <linux/random.h> | ||
| 28 | #include <linux/moduleloader.h> | ||
| 25 | #include <asm/unaligned.h> | 29 | #include <asm/unaligned.h> |
| 30 | #include <linux/bpf.h> | ||
| 26 | 31 | ||
| 27 | /* Registers */ | 32 | /* Registers */ |
| 28 | #define BPF_R0 regs[BPF_REG_0] | 33 | #define BPF_R0 regs[BPF_REG_0] |
| @@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns | |||
| 63 | return NULL; | 68 | return NULL; |
| 64 | } | 69 | } |
| 65 | 70 | ||
| 71 | struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | ||
| 72 | { | ||
| 73 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | ||
| 74 | gfp_extra_flags; | ||
| 75 | struct bpf_prog_aux *aux; | ||
| 76 | struct bpf_prog *fp; | ||
| 77 | |||
| 78 | size = round_up(size, PAGE_SIZE); | ||
| 79 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); | ||
| 80 | if (fp == NULL) | ||
| 81 | return NULL; | ||
| 82 | |||
| 83 | aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); | ||
| 84 | if (aux == NULL) { | ||
| 85 | vfree(fp); | ||
| 86 | return NULL; | ||
| 87 | } | ||
| 88 | |||
| 89 | fp->pages = size / PAGE_SIZE; | ||
| 90 | fp->aux = aux; | ||
| 91 | |||
| 92 | return fp; | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL_GPL(bpf_prog_alloc); | ||
| 95 | |||
| 96 | struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, | ||
| 97 | gfp_t gfp_extra_flags) | ||
| 98 | { | ||
| 99 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | ||
| 100 | gfp_extra_flags; | ||
| 101 | struct bpf_prog *fp; | ||
| 102 | |||
| 103 | BUG_ON(fp_old == NULL); | ||
| 104 | |||
| 105 | size = round_up(size, PAGE_SIZE); | ||
| 106 | if (size <= fp_old->pages * PAGE_SIZE) | ||
| 107 | return fp_old; | ||
| 108 | |||
| 109 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); | ||
| 110 | if (fp != NULL) { | ||
| 111 | memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); | ||
| 112 | fp->pages = size / PAGE_SIZE; | ||
| 113 | |||
| 114 | /* We keep fp->aux from fp_old around in the new | ||
| 115 | * reallocated structure. | ||
| 116 | */ | ||
| 117 | fp_old->aux = NULL; | ||
| 118 | __bpf_prog_free(fp_old); | ||
| 119 | } | ||
| 120 | |||
| 121 | return fp; | ||
| 122 | } | ||
| 123 | EXPORT_SYMBOL_GPL(bpf_prog_realloc); | ||
| 124 | |||
| 125 | void __bpf_prog_free(struct bpf_prog *fp) | ||
| 126 | { | ||
| 127 | kfree(fp->aux); | ||
| 128 | vfree(fp); | ||
| 129 | } | ||
| 130 | EXPORT_SYMBOL_GPL(__bpf_prog_free); | ||
| 131 | |||
| 132 | #ifdef CONFIG_BPF_JIT | ||
| 133 | struct bpf_binary_header * | ||
| 134 | bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, | ||
| 135 | unsigned int alignment, | ||
| 136 | bpf_jit_fill_hole_t bpf_fill_ill_insns) | ||
| 137 | { | ||
| 138 | struct bpf_binary_header *hdr; | ||
| 139 | unsigned int size, hole, start; | ||
| 140 | |||
| 141 | /* Most BPF filters are really small, but if some of them | ||
| 142 | * fill a page, allow at least 128 extra bytes to insert a | ||
| 143 | * random section of illegal instructions. | ||
| 144 | */ | ||
| 145 | size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); | ||
| 146 | hdr = module_alloc(size); | ||
| 147 | if (hdr == NULL) | ||
| 148 | return NULL; | ||
| 149 | |||
| 150 | /* Fill space with illegal/arch-dep instructions. */ | ||
| 151 | bpf_fill_ill_insns(hdr, size); | ||
| 152 | |||
| 153 | hdr->pages = size / PAGE_SIZE; | ||
| 154 | hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), | ||
| 155 | PAGE_SIZE - sizeof(*hdr)); | ||
| 156 | start = (prandom_u32() % hole) & ~(alignment - 1); | ||
| 157 | |||
| 158 | /* Leave a random number of instructions before BPF code. */ | ||
| 159 | *image_ptr = &hdr->image[start]; | ||
| 160 | |||
| 161 | return hdr; | ||
| 162 | } | ||
| 163 | |||
| 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) | ||
| 165 | { | ||
| 166 | module_free(NULL, hdr); | ||
| 167 | } | ||
| 168 | #endif /* CONFIG_BPF_JIT */ | ||
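The two helpers above are the allocation side of an architecture JIT. A minimal sketch of how a JIT backend might use them follows; jit_emit_insns() and ARCH_TRAP_INSN are hypothetical placeholders, and only bpf_jit_binary_alloc()/bpf_jit_binary_free() come from this file.

    /* Sketch only: intended call pattern for an arch JIT backend.
     * ARCH_TRAP_INSN and jit_emit_insns() are illustrative, not real symbols.
     */
    static void jit_fill_hole(void *area, unsigned int size)
    {
        /* pad the random gap and trailing space with trapping bytes */
        memset(area, ARCH_TRAP_INSN, size);
    }

    static void jit_compile(struct bpf_prog *prog, unsigned int proglen)
    {
        struct bpf_binary_header *hdr;
        u8 *image;

        hdr = bpf_jit_binary_alloc(proglen, &image, 4, jit_fill_hole);
        if (hdr == NULL)
            return;                     /* fall back to the interpreter */

        jit_emit_insns(prog, image);    /* emit native code at 'image' */
        prog->bpf_func = (void *)image;
        prog->jited = true;
    }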
| 169 | |||
| 66 | /* Base function for offset calculation. Needs to go into .text section, | 170 | /* Base function for offset calculation. Needs to go into .text section, |
| 67 | * therefore keeping it non-static as well; will also be used by JITs | 171 | * therefore keeping it non-static as well; will also be used by JITs |
| 68 | * anyway later on, so do not let the compiler omit it. | 172 | * anyway later on, so do not let the compiler omit it. |
| @@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 180 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, | 284 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, |
| 181 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, | 285 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, |
| 182 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, | 286 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, |
| 287 | [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, | ||
| 183 | }; | 288 | }; |
| 184 | void *ptr; | 289 | void *ptr; |
| 185 | int off; | 290 | int off; |
| @@ -239,6 +344,10 @@ select_insn: | |||
| 239 | ALU64_MOV_K: | 344 | ALU64_MOV_K: |
| 240 | DST = IMM; | 345 | DST = IMM; |
| 241 | CONT; | 346 | CONT; |
| 347 | LD_IMM_DW: | ||
| 348 | DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; | ||
| 349 | insn++; | ||
| 350 | CONT; | ||
| 242 | ALU64_ARSH_X: | 351 | ALU64_ARSH_X: |
| 243 | (*(s64 *) &DST) >>= SRC; | 352 | (*(s64 *) &DST) >>= SRC; |
| 244 | CONT; | 353 | CONT; |
| @@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) | |||
| 523 | 632 | ||
| 524 | /* Probe if internal BPF can be JITed */ | 633 | /* Probe if internal BPF can be JITed */ |
| 525 | bpf_int_jit_compile(fp); | 634 | bpf_int_jit_compile(fp); |
| 635 | /* Lock whole bpf_prog as read-only */ | ||
| 636 | bpf_prog_lock_ro(fp); | ||
| 526 | } | 637 | } |
| 527 | EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); | 638 | EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); |
| 528 | 639 | ||
| 529 | /* free internal BPF program */ | 640 | static void bpf_prog_free_deferred(struct work_struct *work) |
| 641 | { | ||
| 642 | struct bpf_prog_aux *aux; | ||
| 643 | |||
| 644 | aux = container_of(work, struct bpf_prog_aux, work); | ||
| 645 | bpf_jit_free(aux->prog); | ||
| 646 | } | ||
| 647 | |||
| 648 | /* Free internal BPF program */ | ||
| 530 | void bpf_prog_free(struct bpf_prog *fp) | 649 | void bpf_prog_free(struct bpf_prog *fp) |
| 531 | { | 650 | { |
| 532 | bpf_jit_free(fp); | 651 | struct bpf_prog_aux *aux = fp->aux; |
| 652 | |||
| 653 | INIT_WORK(&aux->work, bpf_prog_free_deferred); | ||
| 654 | aux->prog = fp; | ||
| 655 | schedule_work(&aux->work); | ||
| 533 | } | 656 | } |
| 534 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); |
| 658 | |||
| 659 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | ||
| 660 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | ||
| 661 | */ | ||
| 662 | int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, | ||
| 663 | int len) | ||
| 664 | { | ||
| 665 | return -EFAULT; | ||
| 666 | } | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..ba61c8c16032 --- /dev/null +++ b/kernel/bpf/syscall.c | |||
| @@ -0,0 +1,606 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/syscalls.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/anon_inodes.h> | ||
| 16 | #include <linux/file.h> | ||
| 17 | #include <linux/license.h> | ||
| 18 | #include <linux/filter.h> | ||
| 19 | |||
| 20 | static LIST_HEAD(bpf_map_types); | ||
| 21 | |||
| 22 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | ||
| 23 | { | ||
| 24 | struct bpf_map_type_list *tl; | ||
| 25 | struct bpf_map *map; | ||
| 26 | |||
| 27 | list_for_each_entry(tl, &bpf_map_types, list_node) { | ||
| 28 | if (tl->type == attr->map_type) { | ||
| 29 | map = tl->ops->map_alloc(attr); | ||
| 30 | if (IS_ERR(map)) | ||
| 31 | return map; | ||
| 32 | map->ops = tl->ops; | ||
| 33 | map->map_type = attr->map_type; | ||
| 34 | return map; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | return ERR_PTR(-EINVAL); | ||
| 38 | } | ||
| 39 | |||
| 40 | /* boot time registration of different map implementations */ | ||
| 41 | void bpf_register_map_type(struct bpf_map_type_list *tl) | ||
| 42 | { | ||
| 43 | list_add(&tl->list_node, &bpf_map_types); | ||
| 44 | } | ||
| 45 | |||
| 46 | /* called from workqueue */ | ||
| 47 | static void bpf_map_free_deferred(struct work_struct *work) | ||
| 48 | { | ||
| 49 | struct bpf_map *map = container_of(work, struct bpf_map, work); | ||
| 50 | |||
| 51 | /* implementation dependent freeing */ | ||
| 52 | map->ops->map_free(map); | ||
| 53 | } | ||
| 54 | |||
| 55 | /* decrement map refcnt and schedule it for freeing via workqueue | ||
| 56 | * (underlying map implementation ops->map_free() might sleep) | ||
| 57 | */ | ||
| 58 | void bpf_map_put(struct bpf_map *map) | ||
| 59 | { | ||
| 60 | if (atomic_dec_and_test(&map->refcnt)) { | ||
| 61 | INIT_WORK(&map->work, bpf_map_free_deferred); | ||
| 62 | schedule_work(&map->work); | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | static int bpf_map_release(struct inode *inode, struct file *filp) | ||
| 67 | { | ||
| 68 | struct bpf_map *map = filp->private_data; | ||
| 69 | |||
| 70 | bpf_map_put(map); | ||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | static const struct file_operations bpf_map_fops = { | ||
| 75 | .release = bpf_map_release, | ||
| 76 | }; | ||
| 77 | |||
| 78 | /* helper macro to check that unused fields 'union bpf_attr' are zero */ | ||
| 79 | #define CHECK_ATTR(CMD) \ | ||
| 80 | memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ | ||
| 81 | sizeof(attr->CMD##_LAST_FIELD), 0, \ | ||
| 82 | sizeof(*attr) - \ | ||
| 83 | offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ | ||
| 84 | sizeof(attr->CMD##_LAST_FIELD)) != NULL | ||
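As a concrete example, with the per-command definition of BPF_MAP_CREATE_LAST_FIELD given just below, CHECK_ATTR(BPF_MAP_CREATE) roughly expands to the test sketched here: it is true (and the command is rejected) whenever any byte of the union past max_entries is non-zero, i.e. user space filled in fields this kernel does not understand.

    /* Approximate expansion of CHECK_ATTR(BPF_MAP_CREATE), for illustration: */
    memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
               sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
               sizeof(attr->max_entries)) != NULL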
| 85 | |||
| 86 | #define BPF_MAP_CREATE_LAST_FIELD max_entries | ||
| 87 | /* called via syscall */ | ||
| 88 | static int map_create(union bpf_attr *attr) | ||
| 89 | { | ||
| 90 | struct bpf_map *map; | ||
| 91 | int err; | ||
| 92 | |||
| 93 | err = CHECK_ATTR(BPF_MAP_CREATE); | ||
| 94 | if (err) | ||
| 95 | return -EINVAL; | ||
| 96 | |||
| 97 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | ||
| 98 | map = find_and_alloc_map(attr); | ||
| 99 | if (IS_ERR(map)) | ||
| 100 | return PTR_ERR(map); | ||
| 101 | |||
| 102 | atomic_set(&map->refcnt, 1); | ||
| 103 | |||
| 104 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | ||
| 105 | |||
| 106 | if (err < 0) | ||
| 107 | /* failed to allocate fd */ | ||
| 108 | goto free_map; | ||
| 109 | |||
| 110 | return err; | ||
| 111 | |||
| 112 | free_map: | ||
| 113 | map->ops->map_free(map); | ||
| 114 | return err; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* if error is returned, fd is released. | ||
| 118 | * On success the caller should complete fd access with a matching fdput() | ||
| 119 | */ | ||
| 120 | struct bpf_map *bpf_map_get(struct fd f) | ||
| 121 | { | ||
| 122 | struct bpf_map *map; | ||
| 123 | |||
| 124 | if (!f.file) | ||
| 125 | return ERR_PTR(-EBADF); | ||
| 126 | |||
| 127 | if (f.file->f_op != &bpf_map_fops) { | ||
| 128 | fdput(f); | ||
| 129 | return ERR_PTR(-EINVAL); | ||
| 130 | } | ||
| 131 | |||
| 132 | map = f.file->private_data; | ||
| 133 | |||
| 134 | return map; | ||
| 135 | } | ||
| 136 | |||
| 137 | /* helper to convert user pointers passed inside __aligned_u64 fields */ | ||
| 138 | static void __user *u64_to_ptr(__u64 val) | ||
| 139 | { | ||
| 140 | return (void __user *) (unsigned long) val; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* last field in 'union bpf_attr' used by this command */ | ||
| 144 | #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value | ||
| 145 | |||
| 146 | static int map_lookup_elem(union bpf_attr *attr) | ||
| 147 | { | ||
| 148 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 149 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 150 | int ufd = attr->map_fd; | ||
| 151 | struct fd f = fdget(ufd); | ||
| 152 | struct bpf_map *map; | ||
| 153 | void *key, *value; | ||
| 154 | int err; | ||
| 155 | |||
| 156 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | map = bpf_map_get(f); | ||
| 160 | if (IS_ERR(map)) | ||
| 161 | return PTR_ERR(map); | ||
| 162 | |||
| 163 | err = -ENOMEM; | ||
| 164 | key = kmalloc(map->key_size, GFP_USER); | ||
| 165 | if (!key) | ||
| 166 | goto err_put; | ||
| 167 | |||
| 168 | err = -EFAULT; | ||
| 169 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 170 | goto free_key; | ||
| 171 | |||
| 172 | err = -ESRCH; | ||
| 173 | rcu_read_lock(); | ||
| 174 | value = map->ops->map_lookup_elem(map, key); | ||
| 175 | if (!value) | ||
| 176 | goto err_unlock; | ||
| 177 | |||
| 178 | err = -EFAULT; | ||
| 179 | if (copy_to_user(uvalue, value, map->value_size) != 0) | ||
| 180 | goto err_unlock; | ||
| 181 | |||
| 182 | err = 0; | ||
| 183 | |||
| 184 | err_unlock: | ||
| 185 | rcu_read_unlock(); | ||
| 186 | free_key: | ||
| 187 | kfree(key); | ||
| 188 | err_put: | ||
| 189 | fdput(f); | ||
| 190 | return err; | ||
| 191 | } | ||
| 192 | |||
| 193 | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD value | ||
| 194 | |||
| 195 | static int map_update_elem(union bpf_attr *attr) | ||
| 196 | { | ||
| 197 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 198 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 199 | int ufd = attr->map_fd; | ||
| 200 | struct fd f = fdget(ufd); | ||
| 201 | struct bpf_map *map; | ||
| 202 | void *key, *value; | ||
| 203 | int err; | ||
| 204 | |||
| 205 | if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) | ||
| 206 | return -EINVAL; | ||
| 207 | |||
| 208 | map = bpf_map_get(f); | ||
| 209 | if (IS_ERR(map)) | ||
| 210 | return PTR_ERR(map); | ||
| 211 | |||
| 212 | err = -ENOMEM; | ||
| 213 | key = kmalloc(map->key_size, GFP_USER); | ||
| 214 | if (!key) | ||
| 215 | goto err_put; | ||
| 216 | |||
| 217 | err = -EFAULT; | ||
| 218 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 219 | goto free_key; | ||
| 220 | |||
| 221 | err = -ENOMEM; | ||
| 222 | value = kmalloc(map->value_size, GFP_USER); | ||
| 223 | if (!value) | ||
| 224 | goto free_key; | ||
| 225 | |||
| 226 | err = -EFAULT; | ||
| 227 | if (copy_from_user(value, uvalue, map->value_size) != 0) | ||
| 228 | goto free_value; | ||
| 229 | |||
| 230 | /* eBPF programs that use maps run under rcu_read_lock(), | ||
| 231 | * therefore all map accessors rely on this fact, so do the same here | ||
| 232 | */ | ||
| 233 | rcu_read_lock(); | ||
| 234 | err = map->ops->map_update_elem(map, key, value); | ||
| 235 | rcu_read_unlock(); | ||
| 236 | |||
| 237 | free_value: | ||
| 238 | kfree(value); | ||
| 239 | free_key: | ||
| 240 | kfree(key); | ||
| 241 | err_put: | ||
| 242 | fdput(f); | ||
| 243 | return err; | ||
| 244 | } | ||
| 245 | |||
| 246 | #define BPF_MAP_DELETE_ELEM_LAST_FIELD key | ||
| 247 | |||
| 248 | static int map_delete_elem(union bpf_attr *attr) | ||
| 249 | { | ||
| 250 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 251 | int ufd = attr->map_fd; | ||
| 252 | struct fd f = fdget(ufd); | ||
| 253 | struct bpf_map *map; | ||
| 254 | void *key; | ||
| 255 | int err; | ||
| 256 | |||
| 257 | if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) | ||
| 258 | return -EINVAL; | ||
| 259 | |||
| 260 | map = bpf_map_get(f); | ||
| 261 | if (IS_ERR(map)) | ||
| 262 | return PTR_ERR(map); | ||
| 263 | |||
| 264 | err = -ENOMEM; | ||
| 265 | key = kmalloc(map->key_size, GFP_USER); | ||
| 266 | if (!key) | ||
| 267 | goto err_put; | ||
| 268 | |||
| 269 | err = -EFAULT; | ||
| 270 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 271 | goto free_key; | ||
| 272 | |||
| 273 | rcu_read_lock(); | ||
| 274 | err = map->ops->map_delete_elem(map, key); | ||
| 275 | rcu_read_unlock(); | ||
| 276 | |||
| 277 | free_key: | ||
| 278 | kfree(key); | ||
| 279 | err_put: | ||
| 280 | fdput(f); | ||
| 281 | return err; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* last field in 'union bpf_attr' used by this command */ | ||
| 285 | #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key | ||
| 286 | |||
| 287 | static int map_get_next_key(union bpf_attr *attr) | ||
| 288 | { | ||
| 289 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 290 | void __user *unext_key = u64_to_ptr(attr->next_key); | ||
| 291 | int ufd = attr->map_fd; | ||
| 292 | struct fd f = fdget(ufd); | ||
| 293 | struct bpf_map *map; | ||
| 294 | void *key, *next_key; | ||
| 295 | int err; | ||
| 296 | |||
| 297 | if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) | ||
| 298 | return -EINVAL; | ||
| 299 | |||
| 300 | map = bpf_map_get(f); | ||
| 301 | if (IS_ERR(map)) | ||
| 302 | return PTR_ERR(map); | ||
| 303 | |||
| 304 | err = -ENOMEM; | ||
| 305 | key = kmalloc(map->key_size, GFP_USER); | ||
| 306 | if (!key) | ||
| 307 | goto err_put; | ||
| 308 | |||
| 309 | err = -EFAULT; | ||
| 310 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 311 | goto free_key; | ||
| 312 | |||
| 313 | err = -ENOMEM; | ||
| 314 | next_key = kmalloc(map->key_size, GFP_USER); | ||
| 315 | if (!next_key) | ||
| 316 | goto free_key; | ||
| 317 | |||
| 318 | rcu_read_lock(); | ||
| 319 | err = map->ops->map_get_next_key(map, key, next_key); | ||
| 320 | rcu_read_unlock(); | ||
| 321 | if (err) | ||
| 322 | goto free_next_key; | ||
| 323 | |||
| 324 | err = -EFAULT; | ||
| 325 | if (copy_to_user(unext_key, next_key, map->key_size) != 0) | ||
| 326 | goto free_next_key; | ||
| 327 | |||
| 328 | err = 0; | ||
| 329 | |||
| 330 | free_next_key: | ||
| 331 | kfree(next_key); | ||
| 332 | free_key: | ||
| 333 | kfree(key); | ||
| 334 | err_put: | ||
| 335 | fdput(f); | ||
| 336 | return err; | ||
| 337 | } | ||
| 338 | |||
| 339 | static LIST_HEAD(bpf_prog_types); | ||
| 340 | |||
| 341 | static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | ||
| 342 | { | ||
| 343 | struct bpf_prog_type_list *tl; | ||
| 344 | |||
| 345 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | ||
| 346 | if (tl->type == type) { | ||
| 347 | prog->aux->ops = tl->ops; | ||
| 348 | prog->aux->prog_type = type; | ||
| 349 | return 0; | ||
| 350 | } | ||
| 351 | } | ||
| 352 | return -EINVAL; | ||
| 353 | } | ||
| 354 | |||
| 355 | void bpf_register_prog_type(struct bpf_prog_type_list *tl) | ||
| 356 | { | ||
| 357 | list_add(&tl->list_node, &bpf_prog_types); | ||
| 358 | } | ||
| 359 | |||
| 360 | /* fixup insn->imm field of bpf_call instructions: | ||
| 361 | * if (insn->imm == BPF_FUNC_map_lookup_elem) | ||
| 362 | * insn->imm = bpf_map_lookup_elem - __bpf_call_base; | ||
| 363 | * else if (insn->imm == BPF_FUNC_map_update_elem) | ||
| 364 | * insn->imm = bpf_map_update_elem - __bpf_call_base; | ||
| 365 | * else ... | ||
| 366 | * | ||
| 367 | * this function is called after the eBPF program has passed verification | ||
| 368 | */ | ||
| 369 | static void fixup_bpf_calls(struct bpf_prog *prog) | ||
| 370 | { | ||
| 371 | const struct bpf_func_proto *fn; | ||
| 372 | int i; | ||
| 373 | |||
| 374 | for (i = 0; i < prog->len; i++) { | ||
| 375 | struct bpf_insn *insn = &prog->insnsi[i]; | ||
| 376 | |||
| 377 | if (insn->code == (BPF_JMP | BPF_CALL)) { | ||
| 378 | /* we reach here when the program has bpf_call instructions | ||
| 379 | * and it passed bpf_check(), which means that | ||
| 380 | * ops->get_func_proto must have been supplied, so check it | ||
| 381 | */ | ||
| 382 | BUG_ON(!prog->aux->ops->get_func_proto); | ||
| 383 | |||
| 384 | fn = prog->aux->ops->get_func_proto(insn->imm); | ||
| 385 | /* all functions that have a prototype and that the verifier | ||
| 386 | * allowed programs to call must be real in-kernel functions | ||
| 387 | */ | ||
| 388 | BUG_ON(!fn->func); | ||
| 389 | insn->imm = fn->func - __bpf_call_base; | ||
| 390 | } | ||
| 391 | } | ||
| 392 | } | ||
| 393 | |||
| 394 | /* drop refcnt on maps used by eBPF program and free auxiliary data */ | ||
| 395 | static void free_used_maps(struct bpf_prog_aux *aux) | ||
| 396 | { | ||
| 397 | int i; | ||
| 398 | |||
| 399 | for (i = 0; i < aux->used_map_cnt; i++) | ||
| 400 | bpf_map_put(aux->used_maps[i]); | ||
| 401 | |||
| 402 | kfree(aux->used_maps); | ||
| 403 | } | ||
| 404 | |||
| 405 | void bpf_prog_put(struct bpf_prog *prog) | ||
| 406 | { | ||
| 407 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | ||
| 408 | free_used_maps(prog->aux); | ||
| 409 | bpf_prog_free(prog); | ||
| 410 | } | ||
| 411 | } | ||
| 412 | |||
| 413 | static int bpf_prog_release(struct inode *inode, struct file *filp) | ||
| 414 | { | ||
| 415 | struct bpf_prog *prog = filp->private_data; | ||
| 416 | |||
| 417 | bpf_prog_put(prog); | ||
| 418 | return 0; | ||
| 419 | } | ||
| 420 | |||
| 421 | static const struct file_operations bpf_prog_fops = { | ||
| 422 | .release = bpf_prog_release, | ||
| 423 | }; | ||
| 424 | |||
| 425 | static struct bpf_prog *get_prog(struct fd f) | ||
| 426 | { | ||
| 427 | struct bpf_prog *prog; | ||
| 428 | |||
| 429 | if (!f.file) | ||
| 430 | return ERR_PTR(-EBADF); | ||
| 431 | |||
| 432 | if (f.file->f_op != &bpf_prog_fops) { | ||
| 433 | fdput(f); | ||
| 434 | return ERR_PTR(-EINVAL); | ||
| 435 | } | ||
| 436 | |||
| 437 | prog = f.file->private_data; | ||
| 438 | |||
| 439 | return prog; | ||
| 440 | } | ||
| 441 | |||
| 442 | /* called by sockets/tracing/seccomp before attaching program to an event | ||
| 443 | * pairs with bpf_prog_put() | ||
| 444 | */ | ||
| 445 | struct bpf_prog *bpf_prog_get(u32 ufd) | ||
| 446 | { | ||
| 447 | struct fd f = fdget(ufd); | ||
| 448 | struct bpf_prog *prog; | ||
| 449 | |||
| 450 | prog = get_prog(f); | ||
| 451 | |||
| 452 | if (IS_ERR(prog)) | ||
| 453 | return prog; | ||
| 454 | |||
| 455 | atomic_inc(&prog->aux->refcnt); | ||
| 456 | fdput(f); | ||
| 457 | return prog; | ||
| 458 | } | ||
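As the comment before bpf_prog_get() notes, in-kernel users (sockets, tracing, seccomp) grab a reference by fd and drop it with bpf_prog_put() when they detach. A hedged sketch of that attach/detach pairing, with struct example_event purely hypothetical:

    /* Sketch: how a subsystem might pair bpf_prog_get() with bpf_prog_put().
     * 'struct example_event' and its 'prog' member are illustrative only.
     */
    static int example_attach_prog(struct example_event *event, u32 ufd)
    {
        struct bpf_prog *prog = bpf_prog_get(ufd);

        if (IS_ERR(prog))
            return PTR_ERR(prog);

        event->prog = prog;         /* keeps the reference taken above */
        return 0;
    }

    static void example_detach_prog(struct example_event *event)
    {
        bpf_prog_put(event->prog);  /* last put drops map refs and frees the prog */
        event->prog = NULL;
    }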
| 459 | |||
| 460 | /* last field in 'union bpf_attr' used by this command */ | ||
| 461 | #define BPF_PROG_LOAD_LAST_FIELD log_buf | ||
| 462 | |||
| 463 | static int bpf_prog_load(union bpf_attr *attr) | ||
| 464 | { | ||
| 465 | enum bpf_prog_type type = attr->prog_type; | ||
| 466 | struct bpf_prog *prog; | ||
| 467 | int err; | ||
| 468 | char license[128]; | ||
| 469 | bool is_gpl; | ||
| 470 | |||
| 471 | if (CHECK_ATTR(BPF_PROG_LOAD)) | ||
| 472 | return -EINVAL; | ||
| 473 | |||
| 474 | /* copy eBPF program license from user space */ | ||
| 475 | if (strncpy_from_user(license, u64_to_ptr(attr->license), | ||
| 476 | sizeof(license) - 1) < 0) | ||
| 477 | return -EFAULT; | ||
| 478 | license[sizeof(license) - 1] = 0; | ||
| 479 | |||
| 480 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | ||
| 481 | is_gpl = license_is_gpl_compatible(license); | ||
| 482 | |||
| 483 | if (attr->insn_cnt >= BPF_MAXINSNS) | ||
| 484 | return -EINVAL; | ||
| 485 | |||
| 486 | /* plain bpf_prog allocation */ | ||
| 487 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | ||
| 488 | if (!prog) | ||
| 489 | return -ENOMEM; | ||
| 490 | |||
| 491 | prog->len = attr->insn_cnt; | ||
| 492 | |||
| 493 | err = -EFAULT; | ||
| 494 | if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), | ||
| 495 | prog->len * sizeof(struct bpf_insn)) != 0) | ||
| 496 | goto free_prog; | ||
| 497 | |||
| 498 | prog->orig_prog = NULL; | ||
| 499 | prog->jited = false; | ||
| 500 | |||
| 501 | atomic_set(&prog->aux->refcnt, 1); | ||
| 502 | prog->aux->is_gpl_compatible = is_gpl; | ||
| 503 | |||
| 504 | /* find program type: socket_filter vs tracing_filter */ | ||
| 505 | err = find_prog_type(type, prog); | ||
| 506 | if (err < 0) | ||
| 507 | goto free_prog; | ||
| 508 | |||
| 509 | /* run eBPF verifier */ | ||
| 510 | err = bpf_check(prog, attr); | ||
| 511 | |||
| 512 | if (err < 0) | ||
| 513 | goto free_used_maps; | ||
| 514 | |||
| 515 | /* fixup BPF_CALL->imm field */ | ||
| 516 | fixup_bpf_calls(prog); | ||
| 517 | |||
| 518 | /* eBPF program is ready to be JITed */ | ||
| 519 | bpf_prog_select_runtime(prog); | ||
| 520 | |||
| 521 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | ||
| 522 | |||
| 523 | if (err < 0) | ||
| 524 | /* failed to allocate fd */ | ||
| 525 | goto free_used_maps; | ||
| 526 | |||
| 527 | return err; | ||
| 528 | |||
| 529 | free_used_maps: | ||
| 530 | free_used_maps(prog->aux); | ||
| 531 | free_prog: | ||
| 532 | bpf_prog_free(prog); | ||
| 533 | return err; | ||
| 534 | } | ||
| 535 | |||
| 536 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | ||
| 537 | { | ||
| 538 | union bpf_attr attr = {}; | ||
| 539 | int err; | ||
| 540 | |||
| 541 | /* the syscall is limited to root temporarily. This restriction will be | ||
| 542 | * lifted when the security audit is clean. Note that eBPF+tracing must have | ||
| 543 | * this restriction, since it may pass kernel data to user space | ||
| 544 | */ | ||
| 545 | if (!capable(CAP_SYS_ADMIN)) | ||
| 546 | return -EPERM; | ||
| 547 | |||
| 548 | if (!access_ok(VERIFY_READ, uattr, 1)) | ||
| 549 | return -EFAULT; | ||
| 550 | |||
| 551 | if (size > PAGE_SIZE) /* silly large */ | ||
| 552 | return -E2BIG; | ||
| 553 | |||
| 554 | /* If we're handed a bigger struct than we know of, | ||
| 555 | * ensure all the unknown bits are 0 - i.e. new | ||
| 556 | * user-space does not rely on any kernel feature | ||
| 557 | * extensions we dont know about yet. | ||
| 558 | */ | ||
| 559 | if (size > sizeof(attr)) { | ||
| 560 | unsigned char __user *addr; | ||
| 561 | unsigned char __user *end; | ||
| 562 | unsigned char val; | ||
| 563 | |||
| 564 | addr = (void __user *)uattr + sizeof(attr); | ||
| 565 | end = (void __user *)uattr + size; | ||
| 566 | |||
| 567 | for (; addr < end; addr++) { | ||
| 568 | err = get_user(val, addr); | ||
| 569 | if (err) | ||
| 570 | return err; | ||
| 571 | if (val) | ||
| 572 | return -E2BIG; | ||
| 573 | } | ||
| 574 | size = sizeof(attr); | ||
| 575 | } | ||
| 576 | |||
| 577 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | ||
| 578 | if (copy_from_user(&attr, uattr, size) != 0) | ||
| 579 | return -EFAULT; | ||
| 580 | |||
| 581 | switch (cmd) { | ||
| 582 | case BPF_MAP_CREATE: | ||
| 583 | err = map_create(&attr); | ||
| 584 | break; | ||
| 585 | case BPF_MAP_LOOKUP_ELEM: | ||
| 586 | err = map_lookup_elem(&attr); | ||
| 587 | break; | ||
| 588 | case BPF_MAP_UPDATE_ELEM: | ||
| 589 | err = map_update_elem(&attr); | ||
| 590 | break; | ||
| 591 | case BPF_MAP_DELETE_ELEM: | ||
| 592 | err = map_delete_elem(&attr); | ||
| 593 | break; | ||
| 594 | case BPF_MAP_GET_NEXT_KEY: | ||
| 595 | err = map_get_next_key(&attr); | ||
| 596 | break; | ||
| 597 | case BPF_PROG_LOAD: | ||
| 598 | err = bpf_prog_load(&attr); | ||
| 599 | break; | ||
| 600 | default: | ||
| 601 | err = -EINVAL; | ||
| 602 | break; | ||
| 603 | } | ||
| 604 | |||
| 605 | return err; | ||
| 606 | } | ||
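There is no libc wrapper for the new syscall at this point, so user space reaches it through syscall(2). Below is a hedged sketch of creating a map with BPF_MAP_CREATE; __NR_bpf and the union bpf_attr layout come from the uapi headers added elsewhere in this series, and the caller needs CAP_SYS_ADMIN as enforced above.

    /* User-space sketch, assuming the uapi definitions of __NR_bpf,
     * union bpf_attr and BPF_MAP_CREATE are in scope.
     */
    #include <linux/bpf.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>

    static int bpf_create_map(enum bpf_map_type map_type, unsigned int key_size,
                              unsigned int value_size, unsigned int max_entries)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));     /* unused fields must be zero */
        attr.map_type    = map_type;
        attr.key_size    = key_size;
        attr.value_size  = value_size;
        attr.max_entries = max_entries;

        /* returns a new map fd on success, -1 with errno set on failure */
        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }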
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c new file mode 100644 index 000000000000..fcaddff4003e --- /dev/null +++ b/kernel/bpf/test_stub.c | |||
| @@ -0,0 +1,116 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/err.h> | ||
| 11 | #include <linux/bpf.h> | ||
| 12 | |||
| 13 | /* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC | ||
| 14 | * to be used by user space verifier testsuite | ||
| 15 | */ | ||
| 16 | struct bpf_context { | ||
| 17 | u64 arg1; | ||
| 18 | u64 arg2; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 22 | { | ||
| 23 | return 0; | ||
| 24 | } | ||
| 25 | |||
| 26 | static struct bpf_func_proto test_funcs[] = { | ||
| 27 | [BPF_FUNC_unspec] = { | ||
| 28 | .func = test_func, | ||
| 29 | .gpl_only = true, | ||
| 30 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 31 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 32 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 33 | }, | ||
| 34 | }; | ||
| 35 | |||
| 36 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | ||
| 37 | { | ||
| 38 | if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) | ||
| 39 | return NULL; | ||
| 40 | return &test_funcs[func_id]; | ||
| 41 | } | ||
| 42 | |||
| 43 | static const struct bpf_context_access { | ||
| 44 | int size; | ||
| 45 | enum bpf_access_type type; | ||
| 46 | } test_ctx_access[] = { | ||
| 47 | [offsetof(struct bpf_context, arg1)] = { | ||
| 48 | FIELD_SIZEOF(struct bpf_context, arg1), | ||
| 49 | BPF_READ | ||
| 50 | }, | ||
| 51 | [offsetof(struct bpf_context, arg2)] = { | ||
| 52 | FIELD_SIZEOF(struct bpf_context, arg2), | ||
| 53 | BPF_READ | ||
| 54 | }, | ||
| 55 | }; | ||
| 56 | |||
| 57 | static bool test_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 58 | { | ||
| 59 | const struct bpf_context_access *access; | ||
| 60 | |||
| 61 | if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) | ||
| 62 | return false; | ||
| 63 | |||
| 64 | access = &test_ctx_access[off]; | ||
| 65 | if (access->size == size && (access->type & type)) | ||
| 66 | return true; | ||
| 67 | |||
| 68 | return false; | ||
| 69 | } | ||
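Assuming the usual enum bpf_access_type values (BPF_READ and BPF_WRITE as distinct bits), the stub only admits aligned, full-width reads of the two context words; a few illustrative outcomes:

    /* Illustrative results for the stub above (arg1 at offset 0, arg2 at 8):
     *   test_is_valid_access(0, 8, BPF_READ)  -> true   (matches the arg1 entry)
     *   test_is_valid_access(0, 4, BPF_READ)  -> false  (size mismatch)
     *   test_is_valid_access(0, 8, BPF_WRITE) -> false  (entry allows BPF_READ only)
     *   test_is_valid_access(4, 8, BPF_READ)  -> false  (no entry at offset 4)
     */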
| 70 | |||
| 71 | static struct bpf_verifier_ops test_ops = { | ||
| 72 | .get_func_proto = test_func_proto, | ||
| 73 | .is_valid_access = test_is_valid_access, | ||
| 74 | }; | ||
| 75 | |||
| 76 | static struct bpf_prog_type_list tl_prog = { | ||
| 77 | .ops = &test_ops, | ||
| 78 | .type = BPF_PROG_TYPE_UNSPEC, | ||
| 79 | }; | ||
| 80 | |||
| 81 | static struct bpf_map *test_map_alloc(union bpf_attr *attr) | ||
| 82 | { | ||
| 83 | struct bpf_map *map; | ||
| 84 | |||
| 85 | map = kzalloc(sizeof(*map), GFP_USER); | ||
| 86 | if (!map) | ||
| 87 | return ERR_PTR(-ENOMEM); | ||
| 88 | |||
| 89 | map->key_size = attr->key_size; | ||
| 90 | map->value_size = attr->value_size; | ||
| 91 | map->max_entries = attr->max_entries; | ||
| 92 | return map; | ||
| 93 | } | ||
| 94 | |||
| 95 | static void test_map_free(struct bpf_map *map) | ||
| 96 | { | ||
| 97 | kfree(map); | ||
| 98 | } | ||
| 99 | |||
| 100 | static struct bpf_map_ops test_map_ops = { | ||
| 101 | .map_alloc = test_map_alloc, | ||
| 102 | .map_free = test_map_free, | ||
| 103 | }; | ||
| 104 | |||
| 105 | static struct bpf_map_type_list tl_map = { | ||
| 106 | .ops = &test_map_ops, | ||
| 107 | .type = BPF_MAP_TYPE_UNSPEC, | ||
| 108 | }; | ||
| 109 | |||
| 110 | static int __init register_test_ops(void) | ||
| 111 | { | ||
| 112 | bpf_register_map_type(&tl_map); | ||
| 113 | bpf_register_prog_type(&tl_prog); | ||
| 114 | return 0; | ||
| 115 | } | ||
| 116 | late_initcall(register_test_ops); | ||
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..9f81818f2941 --- /dev/null +++ b/kernel/bpf/verifier.c | |||
| @@ -0,0 +1,1924 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/kernel.h> | ||
| 13 | #include <linux/types.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/bpf.h> | ||
| 16 | #include <linux/filter.h> | ||
| 17 | #include <net/netlink.h> | ||
| 18 | #include <linux/file.h> | ||
| 19 | #include <linux/vmalloc.h> | ||
| 20 | |||
| 21 | /* bpf_check() is a static code analyzer that walks eBPF program | ||
| 22 | * instruction by instruction and updates register/stack state. | ||
| 23 | * All paths of conditional branches are analyzed until 'bpf_exit' insn. | ||
| 24 | * | ||
| 25 | * The first pass is depth-first-search to check that the program is a DAG. | ||
| 26 | * It rejects the following programs: | ||
| 27 | * - larger than BPF_MAXINSNS insns | ||
| 28 | * - if a loop is present (detected via a back-edge) | ||
| 29 | * - unreachable insns exist (shouldn't be a forest. program = one function) | ||
| 30 | * - out of bounds or malformed jumps | ||
| 31 | * The second pass is all possible path descent from the 1st insn. | ||
| 32 | * Since it's analyzing all paths through the program, the length of the | ||
| 33 | * analysis is limited to 32k insn, which may be hit even if the total number of | ||
| 34 | * insns is less than 4K, but there are too many branches that change stack/regs. | ||
| 35 | * Number of 'branches to be analyzed' is limited to 1k | ||
| 36 | * | ||
| 37 | * On entry to each instruction, each register has a type, and the instruction | ||
| 38 | * changes the types of the registers depending on instruction semantics. | ||
| 39 | * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is | ||
| 40 | * copied to R1. | ||
| 41 | * | ||
| 42 | * All registers are 64-bit. | ||
| 43 | * R0 - return register | ||
| 44 | * R1-R5 argument passing registers | ||
| 45 | * R6-R9 callee saved registers | ||
| 46 | * R10 - frame pointer read-only | ||
| 47 | * | ||
| 48 | * At the start of BPF program the register R1 contains a pointer to bpf_context | ||
| 49 | * and has type PTR_TO_CTX. | ||
| 50 | * | ||
| 51 | * Verifier tracks arithmetic operations on pointers in case: | ||
| 52 | * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), | ||
| 53 | * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), | ||
| 54 | * 1st insn copies R10 (which has FRAME_PTR) type into R1 | ||
| 55 | * and 2nd arithmetic instruction is pattern matched to recognize | ||
| 56 | * that it wants to construct a pointer to some element within stack. | ||
| 57 | * So after 2nd insn, the register R1 has type PTR_TO_STACK | ||
| 58 | * (and -20 constant is saved for further stack bounds checking). | ||
| 59 | * Meaning that this reg is a pointer to stack plus known immediate constant. | ||
| 60 | * | ||
| 61 | * Most of the time the registers have UNKNOWN_VALUE type, which | ||
| 62 | * means the register has some value, but it's not a valid pointer. | ||
| 63 | * (like pointer plus pointer becomes UNKNOWN_VALUE type) | ||
| 64 | * | ||
| 65 | * When verifier sees load or store instructions the type of base register | ||
| 66 | * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer | ||
| 67 | * types recognized by check_mem_access() function. | ||
| 68 | * | ||
| 69 | * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' | ||
| 70 | * and the range of [ptr, ptr + map's value_size) is accessible. | ||
| 71 | * | ||
| 72 | * registers used to pass values to function calls are checked against | ||
| 73 | * function argument constraints. | ||
| 74 | * | ||
| 75 | * ARG_PTR_TO_MAP_KEY is one of such argument constraints. | ||
| 76 | * It means that the register type passed to this function must be | ||
| 77 | * PTR_TO_STACK and it will be used inside the function as | ||
| 78 | * 'pointer to map element key' | ||
| 79 | * | ||
| 80 | * For example the argument constraints for bpf_map_lookup_elem(): | ||
| 81 | * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 82 | * .arg1_type = ARG_CONST_MAP_PTR, | ||
| 83 | * .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 84 | * | ||
| 85 | * ret_type says that this function returns 'pointer to map elem value or null' | ||
| 86 | * function expects 1st argument to be a const pointer to 'struct bpf_map' and | ||
| 87 | * 2nd argument should be a pointer to stack, which will be used inside | ||
| 88 | * the helper function as a pointer to map element key. | ||
| 89 | * | ||
| 90 | * On the kernel side the helper function looks like: | ||
| 91 | * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 92 | * { | ||
| 93 | * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 94 | * void *key = (void *) (unsigned long) r2; | ||
| 95 | * void *value; | ||
| 96 | * | ||
| 97 | * here kernel can access 'key' and 'map' pointers safely, knowing that | ||
| 98 | * [key, key + map->key_size) bytes are valid and were initialized on | ||
| 99 | * the stack of eBPF program. | ||
| 100 | * } | ||
| 101 | * | ||
| 102 | * Corresponding eBPF program may look like: | ||
| 103 | * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR | ||
| 104 | * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK | ||
| 105 | * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP | ||
| 106 | * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), | ||
| 107 | * here verifier looks at prototype of map_lookup_elem() and sees: | ||
| 108 | * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, | ||
| 109 | * Now verifier knows that this map has key of R1->map_ptr->key_size bytes | ||
| 110 | * | ||
| 111 | * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, | ||
| 112 | * Now verifier checks that [R2, R2 + map's key_size) are within stack limits | ||
| 113 | * and were initialized prior to this call. | ||
| 114 | * If it's ok, then verifier allows this BPF_CALL insn and looks at | ||
| 115 | * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets | ||
| 116 | * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function | ||
| 117 | * returns either a pointer to the map value or NULL. | ||
| 118 | * | ||
| 119 | * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' | ||
| 120 | * insn, the register holding that pointer in the true branch changes state to | ||
| 121 | * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false | ||
| 122 | * branch. See check_cond_jmp_op(). | ||
| 123 | * | ||
| 124 | * After the call R0 is set to return type of the function and registers R1-R5 | ||
| 125 | * are set to NOT_INIT to indicate that they are no longer readable. | ||
| 126 | */ | ||
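For completeness, the example program the comment walks through, extended with the NULL check that flips R0 from PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE, could look like the sketch below. The BPF_ST_MEM/BPF_JMP_IMM/BPF_EXIT_INSN macro names are assumed from linux/filter.h, 'map_fd' is a placeholder, and the map is assumed to have key_size == 4 and value_size >= 8.

    /* Sketch of a program the verifier would accept, under the assumptions above. */
    struct bpf_insn prog[] = {
        BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),       /* init 4-byte key on the stack */
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),       /* R2 = FRAME_PTR */
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),      /* R2 = PTR_TO_STACK (fp - 4) */
        BPF_LD_MAP_FD(BPF_REG_1, map_fd),           /* R1 = CONST_PTR_TO_MAP */
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),      /* R0 == NULL? skip the store */
        BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),       /* R0 is PTR_TO_MAP_VALUE here */
        BPF_EXIT_INSN(),
    };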
| 127 | |||
| 128 | /* types of values stored in eBPF registers */ | ||
| 129 | enum bpf_reg_type { | ||
| 130 | NOT_INIT = 0, /* nothing was written into register */ | ||
| 131 | UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ | ||
| 132 | PTR_TO_CTX, /* reg points to bpf_context */ | ||
| 133 | CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ | ||
| 134 | PTR_TO_MAP_VALUE, /* reg points to map element value */ | ||
| 135 | PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ | ||
| 136 | FRAME_PTR, /* reg == frame_pointer */ | ||
| 137 | PTR_TO_STACK, /* reg == frame_pointer + imm */ | ||
| 138 | CONST_IMM, /* constant integer value */ | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct reg_state { | ||
| 142 | enum bpf_reg_type type; | ||
| 143 | union { | ||
| 144 | /* valid when type == CONST_IMM | PTR_TO_STACK */ | ||
| 145 | int imm; | ||
| 146 | |||
| 147 | /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | | ||
| 148 | * PTR_TO_MAP_VALUE_OR_NULL | ||
| 149 | */ | ||
| 150 | struct bpf_map *map_ptr; | ||
| 151 | }; | ||
| 152 | }; | ||
| 153 | |||
| 154 | enum bpf_stack_slot_type { | ||
| 155 | STACK_INVALID, /* nothing was stored in this stack slot */ | ||
| 156 | STACK_SPILL, /* 1st byte of register spilled into stack */ | ||
| 157 | STACK_SPILL_PART, /* other 7 bytes of register spill */ | ||
| 158 | STACK_MISC /* BPF program wrote some data into this slot */ | ||
| 159 | }; | ||
| 160 | |||
| 161 | struct bpf_stack_slot { | ||
| 162 | enum bpf_stack_slot_type stype; | ||
| 163 | struct reg_state reg_st; | ||
| 164 | }; | ||
| 165 | |||
| 166 | /* state of the program: | ||
| 167 | * type of all registers and stack info | ||
| 168 | */ | ||
| 169 | struct verifier_state { | ||
| 170 | struct reg_state regs[MAX_BPF_REG]; | ||
| 171 | struct bpf_stack_slot stack[MAX_BPF_STACK]; | ||
| 172 | }; | ||
| 173 | |||
| 174 | /* linked list of verifier states used to prune search */ | ||
| 175 | struct verifier_state_list { | ||
| 176 | struct verifier_state state; | ||
| 177 | struct verifier_state_list *next; | ||
| 178 | }; | ||
| 179 | |||
| 180 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ | ||
| 181 | struct verifier_stack_elem { | ||
| 182 | /* verifier state is 'st' | ||
| 183 | * before processing instruction 'insn_idx' | ||
| 184 | * and after processing instruction 'prev_insn_idx' | ||
| 185 | */ | ||
| 186 | struct verifier_state st; | ||
| 187 | int insn_idx; | ||
| 188 | int prev_insn_idx; | ||
| 189 | struct verifier_stack_elem *next; | ||
| 190 | }; | ||
| 191 | |||
| 192 | #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ | ||
| 193 | |||
| 194 | /* single container for all structs | ||
| 195 | * one verifier_env per bpf_check() call | ||
| 196 | */ | ||
| 197 | struct verifier_env { | ||
| 198 | struct bpf_prog *prog; /* eBPF program being verified */ | ||
| 199 | struct verifier_stack_elem *head; /* stack of verifier states to be processed */ | ||
| 200 | int stack_size; /* number of states to be processed */ | ||
| 201 | struct verifier_state cur_state; /* current verifier state */ | ||
| 202 | struct verifier_state_list **explored_states; /* search pruning optimization */ | ||
| 203 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ | ||
| 204 | u32 used_map_cnt; /* number of used maps */ | ||
| 205 | }; | ||
| 206 | |||
| 207 | /* verbose verifier prints what it's seeing | ||
| 208 | * bpf_check() is called under lock, so no race to access these global vars | ||
| 209 | */ | ||
| 210 | static u32 log_level, log_size, log_len; | ||
| 211 | static char *log_buf; | ||
| 212 | |||
| 213 | static DEFINE_MUTEX(bpf_verifier_lock); | ||
| 214 | |||
| 215 | /* log_level controls verbosity level of eBPF verifier. | ||
| 216 | * verbose() is used to dump the verification trace to the log, so the user | ||
| 217 | * can figure out what's wrong with the program | ||
| 218 | */ | ||
| 219 | static void verbose(const char *fmt, ...) | ||
| 220 | { | ||
| 221 | va_list args; | ||
| 222 | |||
| 223 | if (log_level == 0 || log_len >= log_size - 1) | ||
| 224 | return; | ||
| 225 | |||
| 226 | va_start(args, fmt); | ||
| 227 | log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); | ||
| 228 | va_end(args); | ||
| 229 | } | ||
| 230 | |||
| 231 | /* string representation of 'enum bpf_reg_type' */ | ||
| 232 | static const char * const reg_type_str[] = { | ||
| 233 | [NOT_INIT] = "?", | ||
| 234 | [UNKNOWN_VALUE] = "inv", | ||
| 235 | [PTR_TO_CTX] = "ctx", | ||
| 236 | [CONST_PTR_TO_MAP] = "map_ptr", | ||
| 237 | [PTR_TO_MAP_VALUE] = "map_value", | ||
| 238 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", | ||
| 239 | [FRAME_PTR] = "fp", | ||
| 240 | [PTR_TO_STACK] = "fp", | ||
| 241 | [CONST_IMM] = "imm", | ||
| 242 | }; | ||
| 243 | |||
| 244 | static void print_verifier_state(struct verifier_env *env) | ||
| 245 | { | ||
| 246 | enum bpf_reg_type t; | ||
| 247 | int i; | ||
| 248 | |||
| 249 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 250 | t = env->cur_state.regs[i].type; | ||
| 251 | if (t == NOT_INIT) | ||
| 252 | continue; | ||
| 253 | verbose(" R%d=%s", i, reg_type_str[t]); | ||
| 254 | if (t == CONST_IMM || t == PTR_TO_STACK) | ||
| 255 | verbose("%d", env->cur_state.regs[i].imm); | ||
| 256 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || | ||
| 257 | t == PTR_TO_MAP_VALUE_OR_NULL) | ||
| 258 | verbose("(ks=%d,vs=%d)", | ||
| 259 | env->cur_state.regs[i].map_ptr->key_size, | ||
| 260 | env->cur_state.regs[i].map_ptr->value_size); | ||
| 261 | } | ||
| 262 | for (i = 0; i < MAX_BPF_STACK; i++) { | ||
| 263 | if (env->cur_state.stack[i].stype == STACK_SPILL) | ||
| 264 | verbose(" fp%d=%s", -MAX_BPF_STACK + i, | ||
| 265 | reg_type_str[env->cur_state.stack[i].reg_st.type]); | ||
| 266 | } | ||
| 267 | verbose("\n"); | ||
| 268 | } | ||
| 269 | |||
| 270 | static const char *const bpf_class_string[] = { | ||
| 271 | [BPF_LD] = "ld", | ||
| 272 | [BPF_LDX] = "ldx", | ||
| 273 | [BPF_ST] = "st", | ||
| 274 | [BPF_STX] = "stx", | ||
| 275 | [BPF_ALU] = "alu", | ||
| 276 | [BPF_JMP] = "jmp", | ||
| 277 | [BPF_RET] = "BUG", | ||
| 278 | [BPF_ALU64] = "alu64", | ||
| 279 | }; | ||
| 280 | |||
| 281 | static const char *const bpf_alu_string[] = { | ||
| 282 | [BPF_ADD >> 4] = "+=", | ||
| 283 | [BPF_SUB >> 4] = "-=", | ||
| 284 | [BPF_MUL >> 4] = "*=", | ||
| 285 | [BPF_DIV >> 4] = "/=", | ||
| 286 | [BPF_OR >> 4] = "|=", | ||
| 287 | [BPF_AND >> 4] = "&=", | ||
| 288 | [BPF_LSH >> 4] = "<<=", | ||
| 289 | [BPF_RSH >> 4] = ">>=", | ||
| 290 | [BPF_NEG >> 4] = "neg", | ||
| 291 | [BPF_MOD >> 4] = "%=", | ||
| 292 | [BPF_XOR >> 4] = "^=", | ||
| 293 | [BPF_MOV >> 4] = "=", | ||
| 294 | [BPF_ARSH >> 4] = "s>>=", | ||
| 295 | [BPF_END >> 4] = "endian", | ||
| 296 | }; | ||
| 297 | |||
| 298 | static const char *const bpf_ldst_string[] = { | ||
| 299 | [BPF_W >> 3] = "u32", | ||
| 300 | [BPF_H >> 3] = "u16", | ||
| 301 | [BPF_B >> 3] = "u8", | ||
| 302 | [BPF_DW >> 3] = "u64", | ||
| 303 | }; | ||
| 304 | |||
| 305 | static const char *const bpf_jmp_string[] = { | ||
| 306 | [BPF_JA >> 4] = "jmp", | ||
| 307 | [BPF_JEQ >> 4] = "==", | ||
| 308 | [BPF_JGT >> 4] = ">", | ||
| 309 | [BPF_JGE >> 4] = ">=", | ||
| 310 | [BPF_JSET >> 4] = "&", | ||
| 311 | [BPF_JNE >> 4] = "!=", | ||
| 312 | [BPF_JSGT >> 4] = "s>", | ||
| 313 | [BPF_JSGE >> 4] = "s>=", | ||
| 314 | [BPF_CALL >> 4] = "call", | ||
| 315 | [BPF_EXIT >> 4] = "exit", | ||
| 316 | }; | ||
| 317 | |||
| 318 | static void print_bpf_insn(struct bpf_insn *insn) | ||
| 319 | { | ||
| 320 | u8 class = BPF_CLASS(insn->code); | ||
| 321 | |||
| 322 | if (class == BPF_ALU || class == BPF_ALU64) { | ||
| 323 | if (BPF_SRC(insn->code) == BPF_X) | ||
| 324 | verbose("(%02x) %sr%d %s %sr%d\n", | ||
| 325 | insn->code, class == BPF_ALU ? "(u32) " : "", | ||
| 326 | insn->dst_reg, | ||
| 327 | bpf_alu_string[BPF_OP(insn->code) >> 4], | ||
| 328 | class == BPF_ALU ? "(u32) " : "", | ||
| 329 | insn->src_reg); | ||
| 330 | else | ||
| 331 | verbose("(%02x) %sr%d %s %s%d\n", | ||
| 332 | insn->code, class == BPF_ALU ? "(u32) " : "", | ||
| 333 | insn->dst_reg, | ||
| 334 | bpf_alu_string[BPF_OP(insn->code) >> 4], | ||
| 335 | class == BPF_ALU ? "(u32) " : "", | ||
| 336 | insn->imm); | ||
| 337 | } else if (class == BPF_STX) { | ||
| 338 | if (BPF_MODE(insn->code) == BPF_MEM) | ||
| 339 | verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", | ||
| 340 | insn->code, | ||
| 341 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 342 | insn->dst_reg, | ||
| 343 | insn->off, insn->src_reg); | ||
| 344 | else if (BPF_MODE(insn->code) == BPF_XADD) | ||
| 345 | verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", | ||
| 346 | insn->code, | ||
| 347 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 348 | insn->dst_reg, insn->off, | ||
| 349 | insn->src_reg); | ||
| 350 | else | ||
| 351 | verbose("BUG_%02x\n", insn->code); | ||
| 352 | } else if (class == BPF_ST) { | ||
| 353 | if (BPF_MODE(insn->code) != BPF_MEM) { | ||
| 354 | verbose("BUG_st_%02x\n", insn->code); | ||
| 355 | return; | ||
| 356 | } | ||
| 357 | verbose("(%02x) *(%s *)(r%d %+d) = %d\n", | ||
| 358 | insn->code, | ||
| 359 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 360 | insn->dst_reg, | ||
| 361 | insn->off, insn->imm); | ||
| 362 | } else if (class == BPF_LDX) { | ||
| 363 | if (BPF_MODE(insn->code) != BPF_MEM) { | ||
| 364 | verbose("BUG_ldx_%02x\n", insn->code); | ||
| 365 | return; | ||
| 366 | } | ||
| 367 | verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", | ||
| 368 | insn->code, insn->dst_reg, | ||
| 369 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 370 | insn->src_reg, insn->off); | ||
| 371 | } else if (class == BPF_LD) { | ||
| 372 | if (BPF_MODE(insn->code) == BPF_ABS) { | ||
| 373 | verbose("(%02x) r0 = *(%s *)skb[%d]\n", | ||
| 374 | insn->code, | ||
| 375 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 376 | insn->imm); | ||
| 377 | } else if (BPF_MODE(insn->code) == BPF_IND) { | ||
| 378 | verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", | ||
| 379 | insn->code, | ||
| 380 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 381 | insn->src_reg, insn->imm); | ||
| 382 | } else if (BPF_MODE(insn->code) == BPF_IMM) { | ||
| 383 | verbose("(%02x) r%d = 0x%x\n", | ||
| 384 | insn->code, insn->dst_reg, insn->imm); | ||
| 385 | } else { | ||
| 386 | verbose("BUG_ld_%02x\n", insn->code); | ||
| 387 | return; | ||
| 388 | } | ||
| 389 | } else if (class == BPF_JMP) { | ||
| 390 | u8 opcode = BPF_OP(insn->code); | ||
| 391 | |||
| 392 | if (opcode == BPF_CALL) { | ||
| 393 | verbose("(%02x) call %d\n", insn->code, insn->imm); | ||
| 394 | } else if (insn->code == (BPF_JMP | BPF_JA)) { | ||
| 395 | verbose("(%02x) goto pc%+d\n", | ||
| 396 | insn->code, insn->off); | ||
| 397 | } else if (insn->code == (BPF_JMP | BPF_EXIT)) { | ||
| 398 | verbose("(%02x) exit\n", insn->code); | ||
| 399 | } else if (BPF_SRC(insn->code) == BPF_X) { | ||
| 400 | verbose("(%02x) if r%d %s r%d goto pc%+d\n", | ||
| 401 | insn->code, insn->dst_reg, | ||
| 402 | bpf_jmp_string[BPF_OP(insn->code) >> 4], | ||
| 403 | insn->src_reg, insn->off); | ||
| 404 | } else { | ||
| 405 | verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", | ||
| 406 | insn->code, insn->dst_reg, | ||
| 407 | bpf_jmp_string[BPF_OP(insn->code) >> 4], | ||
| 408 | insn->imm, insn->off); | ||
| 409 | } | ||
| 410 | } else { | ||
| 411 | verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | ||
| 416 | { | ||
| 417 | struct verifier_stack_elem *elem; | ||
| 418 | int insn_idx; | ||
| 419 | |||
| 420 | if (env->head == NULL) | ||
| 421 | return -1; | ||
| 422 | |||
| 423 | memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); | ||
| 424 | insn_idx = env->head->insn_idx; | ||
| 425 | if (prev_insn_idx) | ||
| 426 | *prev_insn_idx = env->head->prev_insn_idx; | ||
| 427 | elem = env->head->next; | ||
| 428 | kfree(env->head); | ||
| 429 | env->head = elem; | ||
| 430 | env->stack_size--; | ||
| 431 | return insn_idx; | ||
| 432 | } | ||
| 433 | |||
| 434 | static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, | ||
| 435 | int prev_insn_idx) | ||
| 436 | { | ||
| 437 | struct verifier_stack_elem *elem; | ||
| 438 | |||
| 439 | elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); | ||
| 440 | if (!elem) | ||
| 441 | goto err; | ||
| 442 | |||
| 443 | memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); | ||
| 444 | elem->insn_idx = insn_idx; | ||
| 445 | elem->prev_insn_idx = prev_insn_idx; | ||
| 446 | elem->next = env->head; | ||
| 447 | env->head = elem; | ||
| 448 | env->stack_size++; | ||
| 449 | if (env->stack_size > 1024) { | ||
| 450 | verbose("BPF program is too complex\n"); | ||
| 451 | goto err; | ||
| 452 | } | ||
| 453 | return &elem->st; | ||
| 454 | err: | ||
| 455 | /* pop all elements and return */ | ||
| 456 | while (pop_stack(env, NULL) >= 0); | ||
| 457 | return NULL; | ||
| 458 | } | ||
| 459 | |||
| 460 | #define CALLER_SAVED_REGS 6 | ||
| 461 | static const int caller_saved[CALLER_SAVED_REGS] = { | ||
| 462 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | ||
| 463 | }; | ||
| 464 | |||
| 465 | static void init_reg_state(struct reg_state *regs) | ||
| 466 | { | ||
| 467 | int i; | ||
| 468 | |||
| 469 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 470 | regs[i].type = NOT_INIT; | ||
| 471 | regs[i].imm = 0; | ||
| 472 | regs[i].map_ptr = NULL; | ||
| 473 | } | ||
| 474 | |||
| 475 | /* frame pointer */ | ||
| 476 | regs[BPF_REG_FP].type = FRAME_PTR; | ||
| 477 | |||
| 478 | /* 1st arg to a function */ | ||
| 479 | regs[BPF_REG_1].type = PTR_TO_CTX; | ||
| 480 | } | ||
| 481 | |||
| 482 | static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) | ||
| 483 | { | ||
| 484 | BUG_ON(regno >= MAX_BPF_REG); | ||
| 485 | regs[regno].type = UNKNOWN_VALUE; | ||
| 486 | regs[regno].imm = 0; | ||
| 487 | regs[regno].map_ptr = NULL; | ||
| 488 | } | ||
| 489 | |||
| 490 | enum reg_arg_type { | ||
| 491 | SRC_OP, /* register is used as source operand */ | ||
| 492 | DST_OP, /* register is used as destination operand */ | ||
| 493 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | ||
| 494 | }; | ||
| 495 | |||
| 496 | static int check_reg_arg(struct reg_state *regs, u32 regno, | ||
| 497 | enum reg_arg_type t) | ||
| 498 | { | ||
| 499 | if (regno >= MAX_BPF_REG) { | ||
| 500 | verbose("R%d is invalid\n", regno); | ||
| 501 | return -EINVAL; | ||
| 502 | } | ||
| 503 | |||
| 504 | if (t == SRC_OP) { | ||
| 505 | /* check whether register used as source operand can be read */ | ||
| 506 | if (regs[regno].type == NOT_INIT) { | ||
| 507 | verbose("R%d !read_ok\n", regno); | ||
| 508 | return -EACCES; | ||
| 509 | } | ||
| 510 | } else { | ||
| 511 | /* check whether register used as dest operand can be written to */ | ||
| 512 | if (regno == BPF_REG_FP) { | ||
| 513 | verbose("frame pointer is read only\n"); | ||
| 514 | return -EACCES; | ||
| 515 | } | ||
| 516 | if (t == DST_OP) | ||
| 517 | mark_reg_unknown_value(regs, regno); | ||
| 518 | } | ||
| 519 | return 0; | ||
| 520 | } | ||
| 521 | |||
| 522 | static int bpf_size_to_bytes(int bpf_size) | ||
| 523 | { | ||
| 524 | if (bpf_size == BPF_W) | ||
| 525 | return 4; | ||
| 526 | else if (bpf_size == BPF_H) | ||
| 527 | return 2; | ||
| 528 | else if (bpf_size == BPF_B) | ||
| 529 | return 1; | ||
| 530 | else if (bpf_size == BPF_DW) | ||
| 531 | return 8; | ||
| 532 | else | ||
| 533 | return -EINVAL; | ||
| 534 | } | ||
| 535 | |||
| 536 | /* check_stack_read/write functions track spill/fill of registers, | ||
| 537 | * stack boundary and alignment are checked in check_mem_access() | ||
| 538 | */ | ||
| 539 | static int check_stack_write(struct verifier_state *state, int off, int size, | ||
| 540 | int value_regno) | ||
| 541 | { | ||
| 542 | struct bpf_stack_slot *slot; | ||
| 543 | int i; | ||
| 544 | |||
| 545 | if (value_regno >= 0 && | ||
| 546 | (state->regs[value_regno].type == PTR_TO_MAP_VALUE || | ||
| 547 | state->regs[value_regno].type == PTR_TO_STACK || | ||
| 548 | state->regs[value_regno].type == PTR_TO_CTX)) { | ||
| 549 | |||
| 550 | /* register containing pointer is being spilled into stack */ | ||
| 551 | if (size != 8) { | ||
| 552 | verbose("invalid size of register spill\n"); | ||
| 553 | return -EACCES; | ||
| 554 | } | ||
| 555 | |||
| 556 | slot = &state->stack[MAX_BPF_STACK + off]; | ||
| 557 | slot->stype = STACK_SPILL; | ||
| 558 | /* save register state */ | ||
| 559 | slot->reg_st = state->regs[value_regno]; | ||
| 560 | for (i = 1; i < 8; i++) { | ||
| 561 | slot = &state->stack[MAX_BPF_STACK + off + i]; | ||
| 562 | slot->stype = STACK_SPILL_PART; | ||
| 563 | slot->reg_st.type = UNKNOWN_VALUE; | ||
| 564 | slot->reg_st.map_ptr = NULL; | ||
| 565 | } | ||
| 566 | } else { | ||
| 567 | |||
| 568 | /* regular write of data into stack */ | ||
| 569 | for (i = 0; i < size; i++) { | ||
| 570 | slot = &state->stack[MAX_BPF_STACK + off + i]; | ||
| 571 | slot->stype = STACK_MISC; | ||
| 572 | slot->reg_st.type = UNKNOWN_VALUE; | ||
| 573 | slot->reg_st.map_ptr = NULL; | ||
| 574 | } | ||
| 575 | } | ||
| 576 | return 0; | ||
| 577 | } | ||
| 578 | |||
| 579 | static int check_stack_read(struct verifier_state *state, int off, int size, | ||
| 580 | int value_regno) | ||
| 581 | { | ||
| 582 | int i; | ||
| 583 | struct bpf_stack_slot *slot; | ||
| 584 | |||
| 585 | slot = &state->stack[MAX_BPF_STACK + off]; | ||
| 586 | |||
| 587 | if (slot->stype == STACK_SPILL) { | ||
| 588 | if (size != 8) { | ||
| 589 | verbose("invalid size of register spill\n"); | ||
| 590 | return -EACCES; | ||
| 591 | } | ||
| 592 | for (i = 1; i < 8; i++) { | ||
| 593 | if (state->stack[MAX_BPF_STACK + off + i].stype != | ||
| 594 | STACK_SPILL_PART) { | ||
| 595 | verbose("corrupted spill memory\n"); | ||
| 596 | return -EACCES; | ||
| 597 | } | ||
| 598 | } | ||
| 599 | |||
| 600 | if (value_regno >= 0) | ||
| 601 | /* restore register state from stack */ | ||
| 602 | state->regs[value_regno] = slot->reg_st; | ||
| 603 | return 0; | ||
| 604 | } else { | ||
| 605 | for (i = 0; i < size; i++) { | ||
| 606 | if (state->stack[MAX_BPF_STACK + off + i].stype != | ||
| 607 | STACK_MISC) { | ||
| 608 | verbose("invalid read from stack off %d+%d size %d\n", | ||
| 609 | off, i, size); | ||
| 610 | return -EACCES; | ||
| 611 | } | ||
| 612 | } | ||
| 613 | if (value_regno >= 0) | ||
| 614 | /* have read misc data from the stack */ | ||
| 615 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 616 | return 0; | ||
| 617 | } | ||
| 618 | } | ||
| 619 | |||
| 620 | /* check read/write into map element returned by bpf_map_lookup_elem() */ | ||
| 621 | static int check_map_access(struct verifier_env *env, u32 regno, int off, | ||
| 622 | int size) | ||
| 623 | { | ||
| 624 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; | ||
| 625 | |||
| 626 | if (off < 0 || off + size > map->value_size) { | ||
| 627 | verbose("invalid access to map value, value_size=%d off=%d size=%d\n", | ||
| 628 | map->value_size, off, size); | ||
| 629 | return -EACCES; | ||
| 630 | } | ||
| 631 | return 0; | ||
| 632 | } | ||
| 633 | |||
| 634 | /* check access to 'struct bpf_context' fields */ | ||
| 635 | static int check_ctx_access(struct verifier_env *env, int off, int size, | ||
| 636 | enum bpf_access_type t) | ||
| 637 | { | ||
| 638 | if (env->prog->aux->ops->is_valid_access && | ||
| 639 | env->prog->aux->ops->is_valid_access(off, size, t)) | ||
| 640 | return 0; | ||
| 641 | |||
| 642 | verbose("invalid bpf_context access off=%d size=%d\n", off, size); | ||
| 643 | return -EACCES; | ||
| 644 | } | ||
| 645 | |||
| 646 | /* check whether memory at (regno + off) is accessible for t = (read | write) | ||
| 647 | * if t==write, value_regno is a register whose value is stored into memory | ||
| 648 | * if t==read, value_regno is a register which will receive the value from memory | ||
| 649 | * if t==write && value_regno==-1, some unknown value is stored into memory | ||
| 650 | * if t==read && value_regno==-1, don't care what we read from memory | ||
| 651 | */ | ||
| 652 | static int check_mem_access(struct verifier_env *env, u32 regno, int off, | ||
| 653 | int bpf_size, enum bpf_access_type t, | ||
| 654 | int value_regno) | ||
| 655 | { | ||
| 656 | struct verifier_state *state = &env->cur_state; | ||
| 657 | int size, err = 0; | ||
| 658 | |||
| 659 | size = bpf_size_to_bytes(bpf_size); | ||
| 660 | if (size < 0) | ||
| 661 | return size; | ||
| 662 | |||
| 663 | if (off % size != 0) { | ||
| 664 | verbose("misaligned access off %d size %d\n", off, size); | ||
| 665 | return -EACCES; | ||
| 666 | } | ||
| 667 | |||
| 668 | if (state->regs[regno].type == PTR_TO_MAP_VALUE) { | ||
| 669 | err = check_map_access(env, regno, off, size); | ||
| 670 | if (!err && t == BPF_READ && value_regno >= 0) | ||
| 671 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 672 | |||
| 673 | } else if (state->regs[regno].type == PTR_TO_CTX) { | ||
| 674 | err = check_ctx_access(env, off, size, t); | ||
| 675 | if (!err && t == BPF_READ && value_regno >= 0) | ||
| 676 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 677 | |||
| 678 | } else if (state->regs[regno].type == FRAME_PTR) { | ||
| 679 | if (off >= 0 || off < -MAX_BPF_STACK) { | ||
| 680 | verbose("invalid stack off=%d size=%d\n", off, size); | ||
| 681 | return -EACCES; | ||
| 682 | } | ||
| 683 | if (t == BPF_WRITE) | ||
| 684 | err = check_stack_write(state, off, size, value_regno); | ||
| 685 | else | ||
| 686 | err = check_stack_read(state, off, size, value_regno); | ||
| 687 | } else { | ||
| 688 | verbose("R%d invalid mem access '%s'\n", | ||
| 689 | regno, reg_type_str[state->regs[regno].type]); | ||
| 690 | return -EACCES; | ||
| 691 | } | ||
| 692 | return err; | ||
| 693 | } | ||
| 694 | |||
| 695 | static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | ||
| 696 | { | ||
| 697 | struct reg_state *regs = env->cur_state.regs; | ||
| 698 | int err; | ||
| 699 | |||
| 700 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || | ||
| 701 | insn->imm != 0) { | ||
| 702 | verbose("BPF_XADD uses reserved fields\n"); | ||
| 703 | return -EINVAL; | ||
| 704 | } | ||
| 705 | |||
| 706 | /* check src1 operand */ | ||
| 707 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 708 | if (err) | ||
| 709 | return err; | ||
| 710 | |||
| 711 | /* check src2 operand */ | ||
| 712 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 713 | if (err) | ||
| 714 | return err; | ||
| 715 | |||
| 716 | /* check whether atomic_add can read the memory */ | ||
| 717 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 718 | BPF_SIZE(insn->code), BPF_READ, -1); | ||
| 719 | if (err) | ||
| 720 | return err; | ||
| 721 | |||
| 722 | /* check whether atomic_add can write into the same memory */ | ||
| 723 | return check_mem_access(env, insn->dst_reg, insn->off, | ||
| 724 | BPF_SIZE(insn->code), BPF_WRITE, -1); | ||
| 725 | } | ||
| 726 | |||
| 727 | /* when register 'regno' is passed into function that will read 'access_size' | ||
| 728 | * bytes from that pointer, make sure that it's within stack boundary | ||
| 729 | * and all elements of stack are initialized | ||
| 730 | */ | ||
| 731 | static int check_stack_boundary(struct verifier_env *env, | ||
| 732 | int regno, int access_size) | ||
| 733 | { | ||
| 734 | struct verifier_state *state = &env->cur_state; | ||
| 735 | struct reg_state *regs = state->regs; | ||
| 736 | int off, i; | ||
| 737 | |||
| 738 | if (regs[regno].type != PTR_TO_STACK) | ||
| 739 | return -EACCES; | ||
| 740 | |||
| 741 | off = regs[regno].imm; | ||
| 742 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || | ||
| 743 | access_size <= 0) { | ||
| 744 | verbose("invalid stack type R%d off=%d access_size=%d\n", | ||
| 745 | regno, off, access_size); | ||
| 746 | return -EACCES; | ||
| 747 | } | ||
| 748 | |||
| 749 | for (i = 0; i < access_size; i++) { | ||
| 750 | if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { | ||
| 751 | verbose("invalid indirect read from stack off %d+%d size %d\n", | ||
| 752 | off, i, access_size); | ||
| 753 | return -EACCES; | ||
| 754 | } | ||
| 755 | } | ||
| 756 | return 0; | ||
| 757 | } | ||
| 758 | |||
| 759 | static int check_func_arg(struct verifier_env *env, u32 regno, | ||
| 760 | enum bpf_arg_type arg_type, struct bpf_map **mapp) | ||
| 761 | { | ||
| 762 | struct reg_state *reg = env->cur_state.regs + regno; | ||
| 763 | enum bpf_reg_type expected_type; | ||
| 764 | int err = 0; | ||
| 765 | |||
| 766 | if (arg_type == ARG_ANYTHING) | ||
| 767 | return 0; | ||
| 768 | |||
| 769 | if (reg->type == NOT_INIT) { | ||
| 770 | verbose("R%d !read_ok\n", regno); | ||
| 771 | return -EACCES; | ||
| 772 | } | ||
| 773 | |||
| 774 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | ||
| 775 | arg_type == ARG_PTR_TO_MAP_VALUE) { | ||
| 776 | expected_type = PTR_TO_STACK; | ||
| 777 | } else if (arg_type == ARG_CONST_STACK_SIZE) { | ||
| 778 | expected_type = CONST_IMM; | ||
| 779 | } else if (arg_type == ARG_CONST_MAP_PTR) { | ||
| 780 | expected_type = CONST_PTR_TO_MAP; | ||
| 781 | } else { | ||
| 782 | verbose("unsupported arg_type %d\n", arg_type); | ||
| 783 | return -EFAULT; | ||
| 784 | } | ||
| 785 | |||
| 786 | if (reg->type != expected_type) { | ||
| 787 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 788 | reg_type_str[reg->type], reg_type_str[expected_type]); | ||
| 789 | return -EACCES; | ||
| 790 | } | ||
| 791 | |||
| 792 | if (arg_type == ARG_CONST_MAP_PTR) { | ||
| 793 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ | ||
| 794 | *mapp = reg->map_ptr; | ||
| 795 | |||
| 796 | } else if (arg_type == ARG_PTR_TO_MAP_KEY) { | ||
| 797 | /* bpf_map_xxx(..., map_ptr, ..., key) call: | ||
| 798 | * check that [key, key + map->key_size) are within | ||
| 799 | * stack limits and initialized | ||
| 800 | */ | ||
| 801 | if (!*mapp) { | ||
| 802 | /* in function declaration map_ptr must come before | ||
| 803 | * map_key, so that it's verified and known before | ||
| 804 | * we have to check map_key here. Otherwise it means | ||
| 805 | * that kernel subsystem misconfigured verifier | ||
| 806 | */ | ||
| 807 | verbose("invalid map_ptr to access map->key\n"); | ||
| 808 | return -EACCES; | ||
| 809 | } | ||
| 810 | err = check_stack_boundary(env, regno, (*mapp)->key_size); | ||
| 811 | |||
| 812 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { | ||
| 813 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | ||
| 814 | * check [value, value + map->value_size) validity | ||
| 815 | */ | ||
| 816 | if (!*mapp) { | ||
| 817 | /* kernel subsystem misconfigured verifier */ | ||
| 818 | verbose("invalid map_ptr to access map->value\n"); | ||
| 819 | return -EACCES; | ||
| 820 | } | ||
| 821 | err = check_stack_boundary(env, regno, (*mapp)->value_size); | ||
| 822 | |||
| 823 | } else if (arg_type == ARG_CONST_STACK_SIZE) { | ||
| 824 | /* bpf_xxx(..., buf, len) call will access 'len' bytes | ||
| 825 | * from stack pointer 'buf'. Check it | ||
| 826 | * note: regno == len, regno - 1 == buf | ||
| 827 | */ | ||
| 828 | if (regno == 0) { | ||
| 829 | /* kernel subsystem misconfigured verifier */ | ||
| 830 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); | ||
| 831 | return -EACCES; | ||
| 832 | } | ||
| 833 | err = check_stack_boundary(env, regno - 1, reg->imm); | ||
| 834 | } | ||
| 835 | |||
| 836 | return err; | ||
| 837 | } | ||
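For reference, the sketch below shows roughly what a helper prototype consuming these argument types looks like. It is a hypothetical example (the helper name is made up; the struct layout follows include/linux/bpf.h) illustrating the convention noted above: an ARG_CONST_STACK_SIZE argument describes the ARG_PTR_TO_STACK argument that immediately precedes it.

```c
#include <linux/bpf.h>

/* Hypothetical helper prototype: arg1 is a pointer into the program stack,
 * arg2 is the constant number of bytes the helper will read from it, so
 * check_func_arg() validates [buf, buf + len) via check_stack_boundary().
 */
static const struct bpf_func_proto bpf_example_memread_proto = {
	.func		= NULL,	/* the subsystem fills in the real helper */
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
};
```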
| 838 | |||
| 839 | static int check_call(struct verifier_env *env, int func_id) | ||
| 840 | { | ||
| 841 | struct verifier_state *state = &env->cur_state; | ||
| 842 | const struct bpf_func_proto *fn = NULL; | ||
| 843 | struct reg_state *regs = state->regs; | ||
| 844 | struct bpf_map *map = NULL; | ||
| 845 | struct reg_state *reg; | ||
| 846 | int i, err; | ||
| 847 | |||
| 848 | /* find function prototype */ | ||
| 849 | if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { | ||
| 850 | verbose("invalid func %d\n", func_id); | ||
| 851 | return -EINVAL; | ||
| 852 | } | ||
| 853 | |||
| 854 | if (env->prog->aux->ops->get_func_proto) | ||
| 855 | fn = env->prog->aux->ops->get_func_proto(func_id); | ||
| 856 | |||
| 857 | if (!fn) { | ||
| 858 | verbose("unknown func %d\n", func_id); | ||
| 859 | return -EINVAL; | ||
| 860 | } | ||
| 861 | |||
| 862 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | ||
| 863 | if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { | ||
| 864 | verbose("cannot call GPL only function from proprietary program\n"); | ||
| 865 | return -EINVAL; | ||
| 866 | } | ||
| 867 | |||
| 868 | /* check args */ | ||
| 869 | err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); | ||
| 870 | if (err) | ||
| 871 | return err; | ||
| 872 | err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); | ||
| 873 | if (err) | ||
| 874 | return err; | ||
| 875 | err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); | ||
| 876 | if (err) | ||
| 877 | return err; | ||
| 878 | err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); | ||
| 879 | if (err) | ||
| 880 | return err; | ||
| 881 | err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); | ||
| 882 | if (err) | ||
| 883 | return err; | ||
| 884 | |||
| 885 | /* reset caller saved regs */ | ||
| 886 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | ||
| 887 | reg = regs + caller_saved[i]; | ||
| 888 | reg->type = NOT_INIT; | ||
| 889 | reg->imm = 0; | ||
| 890 | } | ||
| 891 | |||
| 892 | /* update return register */ | ||
| 893 | if (fn->ret_type == RET_INTEGER) { | ||
| 894 | regs[BPF_REG_0].type = UNKNOWN_VALUE; | ||
| 895 | } else if (fn->ret_type == RET_VOID) { | ||
| 896 | regs[BPF_REG_0].type = NOT_INIT; | ||
| 897 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | ||
| 898 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | ||
| 899 | /* remember map_ptr, so that check_map_access() | ||
| 900 | * can check 'value_size' boundary of memory access | ||
| 901 | * to map element returned from bpf_map_lookup_elem() | ||
| 902 | */ | ||
| 903 | if (map == NULL) { | ||
| 904 | verbose("kernel subsystem misconfigured verifier\n"); | ||
| 905 | return -EINVAL; | ||
| 906 | } | ||
| 907 | regs[BPF_REG_0].map_ptr = map; | ||
| 908 | } else { | ||
| 909 | verbose("unknown return type %d of func %d\n", | ||
| 910 | fn->ret_type, func_id); | ||
| 911 | return -EINVAL; | ||
| 912 | } | ||
| 913 | return 0; | ||
| 914 | } | ||
| 915 | |||
| 916 | /* check validity of 32-bit and 64-bit arithmetic operations */ | ||
| 917 | static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | ||
| 918 | { | ||
| 919 | u8 opcode = BPF_OP(insn->code); | ||
| 920 | int err; | ||
| 921 | |||
| 922 | if (opcode == BPF_END || opcode == BPF_NEG) { | ||
| 923 | if (opcode == BPF_NEG) { | ||
| 924 | if (BPF_SRC(insn->code) != 0 || | ||
| 925 | insn->src_reg != BPF_REG_0 || | ||
| 926 | insn->off != 0 || insn->imm != 0) { | ||
| 927 | verbose("BPF_NEG uses reserved fields\n"); | ||
| 928 | return -EINVAL; | ||
| 929 | } | ||
| 930 | } else { | ||
| 931 | if (insn->src_reg != BPF_REG_0 || insn->off != 0 || | ||
| 932 | (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { | ||
| 933 | verbose("BPF_END uses reserved fields\n"); | ||
| 934 | return -EINVAL; | ||
| 935 | } | ||
| 936 | } | ||
| 937 | |||
| 938 | /* check src operand */ | ||
| 939 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 940 | if (err) | ||
| 941 | return err; | ||
| 942 | |||
| 943 | /* check dest operand */ | ||
| 944 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 945 | if (err) | ||
| 946 | return err; | ||
| 947 | |||
| 948 | } else if (opcode == BPF_MOV) { | ||
| 949 | |||
| 950 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 951 | if (insn->imm != 0 || insn->off != 0) { | ||
| 952 | verbose("BPF_MOV uses reserved fields\n"); | ||
| 953 | return -EINVAL; | ||
| 954 | } | ||
| 955 | |||
| 956 | /* check src operand */ | ||
| 957 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 958 | if (err) | ||
| 959 | return err; | ||
| 960 | } else { | ||
| 961 | if (insn->src_reg != BPF_REG_0 || insn->off != 0) { | ||
| 962 | verbose("BPF_MOV uses reserved fields\n"); | ||
| 963 | return -EINVAL; | ||
| 964 | } | ||
| 965 | } | ||
| 966 | |||
| 967 | /* check dest operand */ | ||
| 968 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 969 | if (err) | ||
| 970 | return err; | ||
| 971 | |||
| 972 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 973 | if (BPF_CLASS(insn->code) == BPF_ALU64) { | ||
| 974 | /* case: R1 = R2 | ||
| 975 | * copy register state to dest reg | ||
| 976 | */ | ||
| 977 | regs[insn->dst_reg] = regs[insn->src_reg]; | ||
| 978 | } else { | ||
| 979 | regs[insn->dst_reg].type = UNKNOWN_VALUE; | ||
| 980 | regs[insn->dst_reg].map_ptr = NULL; | ||
| 981 | } | ||
| 982 | } else { | ||
| 983 | /* case: R = imm | ||
| 984 | * remember the value we stored into this reg | ||
| 985 | */ | ||
| 986 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 987 | regs[insn->dst_reg].imm = insn->imm; | ||
| 988 | } | ||
| 989 | |||
| 990 | } else if (opcode > BPF_END) { | ||
| 991 | verbose("invalid BPF_ALU opcode %x\n", opcode); | ||
| 992 | return -EINVAL; | ||
| 993 | |||
| 994 | } else { /* all other ALU ops: and, sub, xor, add, ... */ | ||
| 995 | |||
| 996 | bool stack_relative = false; | ||
| 997 | |||
| 998 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 999 | if (insn->imm != 0 || insn->off != 0) { | ||
| 1000 | verbose("BPF_ALU uses reserved fields\n"); | ||
| 1001 | return -EINVAL; | ||
| 1002 | } | ||
| 1003 | /* check src1 operand */ | ||
| 1004 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1005 | if (err) | ||
| 1006 | return err; | ||
| 1007 | } else { | ||
| 1008 | if (insn->src_reg != BPF_REG_0 || insn->off != 0) { | ||
| 1009 | verbose("BPF_ALU uses reserved fields\n"); | ||
| 1010 | return -EINVAL; | ||
| 1011 | } | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | /* check src2 operand */ | ||
| 1015 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1016 | if (err) | ||
| 1017 | return err; | ||
| 1018 | |||
| 1019 | if ((opcode == BPF_MOD || opcode == BPF_DIV) && | ||
| 1020 | BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { | ||
| 1021 | verbose("div by zero\n"); | ||
| 1022 | return -EINVAL; | ||
| 1023 | } | ||
| 1024 | |||
| 1025 | /* pattern match 'bpf_add Rx, imm' instruction */ | ||
| 1026 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | ||
| 1027 | regs[insn->dst_reg].type == FRAME_PTR && | ||
| 1028 | BPF_SRC(insn->code) == BPF_K) | ||
| 1029 | stack_relative = true; | ||
| 1030 | |||
| 1031 | /* check dest operand */ | ||
| 1032 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 1033 | if (err) | ||
| 1034 | return err; | ||
| 1035 | |||
| 1036 | if (stack_relative) { | ||
| 1037 | regs[insn->dst_reg].type = PTR_TO_STACK; | ||
| 1038 | regs[insn->dst_reg].imm = insn->imm; | ||
| 1039 | } | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | return 0; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | static int check_cond_jmp_op(struct verifier_env *env, | ||
| 1046 | struct bpf_insn *insn, int *insn_idx) | ||
| 1047 | { | ||
| 1048 | struct reg_state *regs = env->cur_state.regs; | ||
| 1049 | struct verifier_state *other_branch; | ||
| 1050 | u8 opcode = BPF_OP(insn->code); | ||
| 1051 | int err; | ||
| 1052 | |||
| 1053 | if (opcode > BPF_EXIT) { | ||
| 1054 | verbose("invalid BPF_JMP opcode %x\n", opcode); | ||
| 1055 | return -EINVAL; | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1059 | if (insn->imm != 0) { | ||
| 1060 | verbose("BPF_JMP uses reserved fields\n"); | ||
| 1061 | return -EINVAL; | ||
| 1062 | } | ||
| 1063 | |||
| 1064 | /* check src1 operand */ | ||
| 1065 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1066 | if (err) | ||
| 1067 | return err; | ||
| 1068 | } else { | ||
| 1069 | if (insn->src_reg != BPF_REG_0) { | ||
| 1070 | verbose("BPF_JMP uses reserved fields\n"); | ||
| 1071 | return -EINVAL; | ||
| 1072 | } | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | /* check src2 operand */ | ||
| 1076 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1077 | if (err) | ||
| 1078 | return err; | ||
| 1079 | |||
| 1080 | /* detect if R == 0 where R was initialized to zero earlier */ | ||
| 1081 | if (BPF_SRC(insn->code) == BPF_K && | ||
| 1082 | (opcode == BPF_JEQ || opcode == BPF_JNE) && | ||
| 1083 | regs[insn->dst_reg].type == CONST_IMM && | ||
| 1084 | regs[insn->dst_reg].imm == insn->imm) { | ||
| 1085 | if (opcode == BPF_JEQ) { | ||
| 1086 | /* if (imm == imm) goto pc+off; | ||
| 1087 | * only follow the goto, ignore fall-through | ||
| 1088 | */ | ||
| 1089 | *insn_idx += insn->off; | ||
| 1090 | return 0; | ||
| 1091 | } else { | ||
| 1092 | /* if (imm != imm) goto pc+off; | ||
| 1093 | * only follow fall-through branch, since | ||
| 1094 | * that's where the program will go | ||
| 1095 | */ | ||
| 1096 | return 0; | ||
| 1097 | } | ||
| 1098 | } | ||
| 1099 | |||
| 1100 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); | ||
| 1101 | if (!other_branch) | ||
| 1102 | return -EFAULT; | ||
| 1103 | |||
| 1104 | /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ | ||
| 1105 | if (BPF_SRC(insn->code) == BPF_K && | ||
| 1106 | insn->imm == 0 && (opcode == BPF_JEQ || | ||
| 1107 | opcode == BPF_JNE) && | ||
| 1108 | regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { | ||
| 1109 | if (opcode == BPF_JEQ) { | ||
| 1110 | /* next fallthrough insn can access memory via | ||
| 1111 | * this register | ||
| 1112 | */ | ||
| 1113 | regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; | ||
| 1114 | /* branch target cannot access it, since reg == 0 */ | ||
| 1115 | other_branch->regs[insn->dst_reg].type = CONST_IMM; | ||
| 1116 | other_branch->regs[insn->dst_reg].imm = 0; | ||
| 1117 | } else { | ||
| 1118 | other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; | ||
| 1119 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 1120 | regs[insn->dst_reg].imm = 0; | ||
| 1121 | } | ||
| 1122 | } else if (BPF_SRC(insn->code) == BPF_K && | ||
| 1123 | (opcode == BPF_JEQ || opcode == BPF_JNE)) { | ||
| 1124 | |||
| 1125 | if (opcode == BPF_JEQ) { | ||
| 1126 | /* detect if (R == imm) goto | ||
| 1127 | * and in the target state recognize that R = imm | ||
| 1128 | */ | ||
| 1129 | other_branch->regs[insn->dst_reg].type = CONST_IMM; | ||
| 1130 | other_branch->regs[insn->dst_reg].imm = insn->imm; | ||
| 1131 | } else { | ||
| 1132 | /* detect if (R != imm) goto | ||
| 1133 | * and in the fall-through state recognize that R = imm | ||
| 1134 | */ | ||
| 1135 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 1136 | regs[insn->dst_reg].imm = insn->imm; | ||
| 1137 | } | ||
| 1138 | } | ||
| 1139 | if (log_level) | ||
| 1140 | print_verifier_state(env); | ||
| 1141 | return 0; | ||
| 1142 | } | ||
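The PTR_TO_MAP_VALUE_OR_NULL handling above is what forces the canonical lookup-then-test pattern. Below is a hypothetical program sketch, assuming the BPF_* instruction-building macros from include/linux/filter.h; EXAMPLE_MAP_FD is a placeholder for a file descriptor returned by the BPF_MAP_CREATE command for a map with a 4-byte key and an 8-byte value.

```c
#include <linux/bpf.h>
#include <linux/filter.h>

#define EXAMPLE_MAP_FD 4	/* placeholder map fd, 4-byte key, 8-byte value */

/* Hypothetical: look up a map element and dereference the result only on
 * the branch where the verifier knows it is non-NULL (10 insns total,
 * since BPF_LD_MAP_FD expands to a two-insn pseudo BPF_LD_IMM64).
 */
static const struct bpf_insn lookup_then_test[] = {
	BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),      /* key = 0; fp-4..fp-1 become STACK_MISC */
	BPF_LD_MAP_FD(BPF_REG_1, EXAMPLE_MAP_FD),  /* r1 = map (pseudo BPF_LD_IMM64) */
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),     /* r2 = fp - 4, i.e. PTR_TO_STACK */
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),     /* r0 is PTR_TO_MAP_VALUE_OR_NULL here */
	BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),      /* fall-through: r0 is PTR_TO_MAP_VALUE */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
};
```

On the taken branch of the JEQ the register is downgraded to CONST_IMM 0, so any dereference there would be rejected as an invalid mem access.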
| 1143 | |||
| 1144 | /* return the map pointer stored inside BPF_LD_IMM64 instruction */ | ||
| 1145 | static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) | ||
| 1146 | { | ||
| 1147 | u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; | ||
| 1148 | |||
| 1149 | return (struct bpf_map *) (unsigned long) imm64; | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | /* verify BPF_LD_IMM64 instruction */ | ||
| 1153 | static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | ||
| 1154 | { | ||
| 1155 | struct reg_state *regs = env->cur_state.regs; | ||
| 1156 | int err; | ||
| 1157 | |||
| 1158 | if (BPF_SIZE(insn->code) != BPF_DW) { | ||
| 1159 | verbose("invalid BPF_LD_IMM insn\n"); | ||
| 1160 | return -EINVAL; | ||
| 1161 | } | ||
| 1162 | if (insn->off != 0) { | ||
| 1163 | verbose("BPF_LD_IMM64 uses reserved fields\n"); | ||
| 1164 | return -EINVAL; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 1168 | if (err) | ||
| 1169 | return err; | ||
| 1170 | |||
| 1171 | if (insn->src_reg == 0) | ||
| 1172 | /* generic move 64-bit immediate into a register */ | ||
| 1173 | return 0; | ||
| 1174 | |||
| 1175 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ | ||
| 1176 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); | ||
| 1177 | |||
| 1178 | regs[insn->dst_reg].type = CONST_PTR_TO_MAP; | ||
| 1179 | regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); | ||
| 1180 | return 0; | ||
| 1181 | } | ||
| 1182 | |||
| 1183 | /* non-recursive DFS pseudo code | ||
| 1184 | * 1 procedure DFS-iterative(G,v): | ||
| 1185 | * 2 label v as discovered | ||
| 1186 | * 3 let S be a stack | ||
| 1187 | * 4 S.push(v) | ||
| 1188 | * 5 while S is not empty | ||
| 1189 | * 6 t <- S.pop() | ||
| 1190 | * 7 if t is what we're looking for: | ||
| 1191 | * 8 return t | ||
| 1192 | * 9 for all edges e in G.adjacentEdges(t) do | ||
| 1193 | * 10 if edge e is already labelled | ||
| 1194 | * 11 continue with the next edge | ||
| 1195 | * 12 w <- G.adjacentVertex(t,e) | ||
| 1196 | * 13 if vertex w is not discovered and not explored | ||
| 1197 | * 14 label e as tree-edge | ||
| 1198 | * 15 label w as discovered | ||
| 1199 | * 16 S.push(w) | ||
| 1200 | * 17 continue at 5 | ||
| 1201 | * 18 else if vertex w is discovered | ||
| 1202 | * 19 label e as back-edge | ||
| 1203 | * 20 else | ||
| 1204 | * 21 // vertex w is explored | ||
| 1205 | * 22 label e as forward- or cross-edge | ||
| 1206 | * 23 label t as explored | ||
| 1207 | * 24 S.pop() | ||
| 1208 | * | ||
| 1209 | * convention: | ||
| 1210 | * 0x10 - discovered | ||
| 1211 | * 0x11 - discovered and fall-through edge labelled | ||
| 1212 | * 0x12 - discovered and fall-through and branch edges labelled | ||
| 1213 | * 0x20 - explored | ||
| 1214 | */ | ||
| 1215 | |||
| 1216 | enum { | ||
| 1217 | DISCOVERED = 0x10, | ||
| 1218 | EXPLORED = 0x20, | ||
| 1219 | FALLTHROUGH = 1, | ||
| 1220 | BRANCH = 2, | ||
| 1221 | }; | ||
| 1222 | |||
| 1223 | #define STATE_LIST_MARK ((struct verifier_state_list *) -1L) | ||
| 1224 | |||
| 1225 | static int *insn_stack; /* stack of insns to process */ | ||
| 1226 | static int cur_stack; /* current stack index */ | ||
| 1227 | static int *insn_state; | ||
| 1228 | |||
| 1229 | /* t, w, e - match pseudo-code above: | ||
| 1230 | * t - index of current instruction | ||
| 1231 | * w - next instruction | ||
| 1232 | * e - edge | ||
| 1233 | */ | ||
| 1234 | static int push_insn(int t, int w, int e, struct verifier_env *env) | ||
| 1235 | { | ||
| 1236 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) | ||
| 1237 | return 0; | ||
| 1238 | |||
| 1239 | if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) | ||
| 1240 | return 0; | ||
| 1241 | |||
| 1242 | if (w < 0 || w >= env->prog->len) { | ||
| 1243 | verbose("jump out of range from insn %d to %d\n", t, w); | ||
| 1244 | return -EINVAL; | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | if (e == BRANCH) | ||
| 1248 | /* mark branch target for state pruning */ | ||
| 1249 | env->explored_states[w] = STATE_LIST_MARK; | ||
| 1250 | |||
| 1251 | if (insn_state[w] == 0) { | ||
| 1252 | /* tree-edge */ | ||
| 1253 | insn_state[t] = DISCOVERED | e; | ||
| 1254 | insn_state[w] = DISCOVERED; | ||
| 1255 | if (cur_stack >= env->prog->len) | ||
| 1256 | return -E2BIG; | ||
| 1257 | insn_stack[cur_stack++] = w; | ||
| 1258 | return 1; | ||
| 1259 | } else if ((insn_state[w] & 0xF0) == DISCOVERED) { | ||
| 1260 | verbose("back-edge from insn %d to %d\n", t, w); | ||
| 1261 | return -EINVAL; | ||
| 1262 | } else if (insn_state[w] == EXPLORED) { | ||
| 1263 | /* forward- or cross-edge */ | ||
| 1264 | insn_state[t] = DISCOVERED | e; | ||
| 1265 | } else { | ||
| 1266 | verbose("insn state internal bug\n"); | ||
| 1267 | return -EFAULT; | ||
| 1268 | } | ||
| 1269 | return 0; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | /* non-recursive depth-first-search to detect loops in BPF program | ||
| 1273 | * loop == back-edge in directed graph | ||
| 1274 | */ | ||
| 1275 | static int check_cfg(struct verifier_env *env) | ||
| 1276 | { | ||
| 1277 | struct bpf_insn *insns = env->prog->insnsi; | ||
| 1278 | int insn_cnt = env->prog->len; | ||
| 1279 | int ret = 0; | ||
| 1280 | int i, t; | ||
| 1281 | |||
| 1282 | insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); | ||
| 1283 | if (!insn_state) | ||
| 1284 | return -ENOMEM; | ||
| 1285 | |||
| 1286 | insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); | ||
| 1287 | if (!insn_stack) { | ||
| 1288 | kfree(insn_state); | ||
| 1289 | return -ENOMEM; | ||
| 1290 | } | ||
| 1291 | |||
| 1292 | insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ | ||
| 1293 | insn_stack[0] = 0; /* 0 is the first instruction */ | ||
| 1294 | cur_stack = 1; | ||
| 1295 | |||
| 1296 | peek_stack: | ||
| 1297 | if (cur_stack == 0) | ||
| 1298 | goto check_state; | ||
| 1299 | t = insn_stack[cur_stack - 1]; | ||
| 1300 | |||
| 1301 | if (BPF_CLASS(insns[t].code) == BPF_JMP) { | ||
| 1302 | u8 opcode = BPF_OP(insns[t].code); | ||
| 1303 | |||
| 1304 | if (opcode == BPF_EXIT) { | ||
| 1305 | goto mark_explored; | ||
| 1306 | } else if (opcode == BPF_CALL) { | ||
| 1307 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1308 | if (ret == 1) | ||
| 1309 | goto peek_stack; | ||
| 1310 | else if (ret < 0) | ||
| 1311 | goto err_free; | ||
| 1312 | } else if (opcode == BPF_JA) { | ||
| 1313 | if (BPF_SRC(insns[t].code) != BPF_K) { | ||
| 1314 | ret = -EINVAL; | ||
| 1315 | goto err_free; | ||
| 1316 | } | ||
| 1317 | /* unconditional jump with single edge */ | ||
| 1318 | ret = push_insn(t, t + insns[t].off + 1, | ||
| 1319 | FALLTHROUGH, env); | ||
| 1320 | if (ret == 1) | ||
| 1321 | goto peek_stack; | ||
| 1322 | else if (ret < 0) | ||
| 1323 | goto err_free; | ||
| 1324 | /* tell verifier to check for equivalent states | ||
| 1325 | * after every call and jump | ||
| 1326 | */ | ||
| 1327 | env->explored_states[t + 1] = STATE_LIST_MARK; | ||
| 1328 | } else { | ||
| 1329 | /* conditional jump with two edges */ | ||
| 1330 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1331 | if (ret == 1) | ||
| 1332 | goto peek_stack; | ||
| 1333 | else if (ret < 0) | ||
| 1334 | goto err_free; | ||
| 1335 | |||
| 1336 | ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); | ||
| 1337 | if (ret == 1) | ||
| 1338 | goto peek_stack; | ||
| 1339 | else if (ret < 0) | ||
| 1340 | goto err_free; | ||
| 1341 | } | ||
| 1342 | } else { | ||
| 1343 | /* all other non-branch instructions with single | ||
| 1344 | * fall-through edge | ||
| 1345 | */ | ||
| 1346 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1347 | if (ret == 1) | ||
| 1348 | goto peek_stack; | ||
| 1349 | else if (ret < 0) | ||
| 1350 | goto err_free; | ||
| 1351 | } | ||
| 1352 | |||
| 1353 | mark_explored: | ||
| 1354 | insn_state[t] = EXPLORED; | ||
| 1355 | if (cur_stack-- <= 0) { | ||
| 1356 | verbose("pop stack internal bug\n"); | ||
| 1357 | ret = -EFAULT; | ||
| 1358 | goto err_free; | ||
| 1359 | } | ||
| 1360 | goto peek_stack; | ||
| 1361 | |||
| 1362 | check_state: | ||
| 1363 | for (i = 0; i < insn_cnt; i++) { | ||
| 1364 | if (insn_state[i] != EXPLORED) { | ||
| 1365 | verbose("unreachable insn %d\n", i); | ||
| 1366 | ret = -EINVAL; | ||
| 1367 | goto err_free; | ||
| 1368 | } | ||
| 1369 | } | ||
| 1370 | ret = 0; /* cfg looks good */ | ||
| 1371 | |||
| 1372 | err_free: | ||
| 1373 | kfree(insn_state); | ||
| 1374 | kfree(insn_stack); | ||
| 1375 | return ret; | ||
| 1376 | } | ||
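Because back-edges are rejected outright, even a trivial two-instruction loop fails this pass. A hypothetical example, again assuming the insn macros from include/linux/filter.h:

```c
#include <linux/filter.h>

/* Hypothetical: check_cfg() discovers insn 0, pushes insn 1, then follows
 * the jump back to a still-DISCOVERED insn and fails with
 * "back-edge from insn 1 to 0" before do_check() ever runs.
 */
static const struct bpf_insn tiny_loop[] = {
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_JMP_IMM(BPF_JA, 0, 0, -2),	/* off = -2: target is insn 1 + (-2) + 1 = 0 */
	BPF_EXIT_INSN(),
};
```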
| 1377 | |||
| 1378 | /* compare two verifier states | ||
| 1379 | * | ||
| 1380 | * all states stored in state_list are known to be valid, since | ||
| 1381 | * verifier reached 'bpf_exit' instruction through them | ||
| 1382 | * | ||
| 1383 | * this function is called when verifier exploring different branches of | ||
| 1384 | * execution popped from the state stack. If it sees an old state that has | ||
| 1385 | * more strict register state and more strict stack state then this execution | ||
| 1386 | * branch doesn't need to be explored further, since verifier already | ||
| 1387 | * concluded that more strict state leads to valid finish. | ||
| 1388 | * | ||
| 1389 | * Therefore two states are equivalent if register state is more conservative | ||
| 1390 | * and explored stack state is more conservative than the current one. | ||
| 1391 | * Example: | ||
| 1392 | * explored current | ||
| 1393 | * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) | ||
| 1394 | * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) | ||
| 1395 | * | ||
| 1396 | * In other words if current stack state (one being explored) has more | ||
| 1397 | * valid slots than old one that already passed validation, it means | ||
| 1398 | * the verifier can stop exploring and conclude that current state is valid too | ||
| 1399 | * | ||
| 1400 | * Similarly with registers. If explored state has register type as invalid | ||
| 1401 | * whereas register type in current state is meaningful, it means that | ||
| 1402 | * the current state will reach 'bpf_exit' instruction safely | ||
| 1403 | */ | ||
| 1404 | static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | ||
| 1405 | { | ||
| 1406 | int i; | ||
| 1407 | |||
| 1408 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 1409 | if (memcmp(&old->regs[i], &cur->regs[i], | ||
| 1410 | sizeof(old->regs[0])) != 0) { | ||
| 1411 | if (old->regs[i].type == NOT_INIT || | ||
| 1412 | (old->regs[i].type == UNKNOWN_VALUE && | ||
| 1413 | cur->regs[i].type != NOT_INIT)) | ||
| 1414 | continue; | ||
| 1415 | return false; | ||
| 1416 | } | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | for (i = 0; i < MAX_BPF_STACK; i++) { | ||
| 1420 | if (memcmp(&old->stack[i], &cur->stack[i], | ||
| 1421 | sizeof(old->stack[0])) != 0) { | ||
| 1422 | if (old->stack[i].stype == STACK_INVALID) | ||
| 1423 | continue; | ||
| 1424 | return false; | ||
| 1425 | } | ||
| 1426 | } | ||
| 1427 | return true; | ||
| 1428 | } | ||
| 1429 | |||
| 1430 | static int is_state_visited(struct verifier_env *env, int insn_idx) | ||
| 1431 | { | ||
| 1432 | struct verifier_state_list *new_sl; | ||
| 1433 | struct verifier_state_list *sl; | ||
| 1434 | |||
| 1435 | sl = env->explored_states[insn_idx]; | ||
| 1436 | if (!sl) | ||
| 1437 | /* this 'insn_idx' instruction wasn't marked, so we will not | ||
| 1438 | * be doing state search here | ||
| 1439 | */ | ||
| 1440 | return 0; | ||
| 1441 | |||
| 1442 | while (sl != STATE_LIST_MARK) { | ||
| 1443 | if (states_equal(&sl->state, &env->cur_state)) | ||
| 1444 | /* reached equivalent register/stack state, | ||
| 1445 | * prune the search | ||
| 1446 | */ | ||
| 1447 | return 1; | ||
| 1448 | sl = sl->next; | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | /* there were no equivalent states, remember current one. | ||
| 1452 | * technically the current state is not proven to be safe yet, | ||
| 1453 | * but it will either reach bpf_exit (which means it's safe) or | ||
| 1454 | * it will be rejected. Since there are no loops, we won't be | ||
| 1455 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | ||
| 1456 | */ | ||
| 1457 | new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); | ||
| 1458 | if (!new_sl) | ||
| 1459 | return -ENOMEM; | ||
| 1460 | |||
| 1461 | /* add new state to the head of linked list */ | ||
| 1462 | memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); | ||
| 1463 | new_sl->next = env->explored_states[insn_idx]; | ||
| 1464 | env->explored_states[insn_idx] = new_sl; | ||
| 1465 | return 0; | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | static int do_check(struct verifier_env *env) | ||
| 1469 | { | ||
| 1470 | struct verifier_state *state = &env->cur_state; | ||
| 1471 | struct bpf_insn *insns = env->prog->insnsi; | ||
| 1472 | struct reg_state *regs = state->regs; | ||
| 1473 | int insn_cnt = env->prog->len; | ||
| 1474 | int insn_idx, prev_insn_idx = 0; | ||
| 1475 | int insn_processed = 0; | ||
| 1476 | bool do_print_state = false; | ||
| 1477 | |||
| 1478 | init_reg_state(regs); | ||
| 1479 | insn_idx = 0; | ||
| 1480 | for (;;) { | ||
| 1481 | struct bpf_insn *insn; | ||
| 1482 | u8 class; | ||
| 1483 | int err; | ||
| 1484 | |||
| 1485 | if (insn_idx >= insn_cnt) { | ||
| 1486 | verbose("invalid insn idx %d insn_cnt %d\n", | ||
| 1487 | insn_idx, insn_cnt); | ||
| 1488 | return -EFAULT; | ||
| 1489 | } | ||
| 1490 | |||
| 1491 | insn = &insns[insn_idx]; | ||
| 1492 | class = BPF_CLASS(insn->code); | ||
| 1493 | |||
| 1494 | if (++insn_processed > 32768) { | ||
| 1495 | verbose("BPF program is too large. Proccessed %d insn\n", | ||
| 1496 | insn_processed); | ||
| 1497 | return -E2BIG; | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | err = is_state_visited(env, insn_idx); | ||
| 1501 | if (err < 0) | ||
| 1502 | return err; | ||
| 1503 | if (err == 1) { | ||
| 1504 | /* found equivalent state, can prune the search */ | ||
| 1505 | if (log_level) { | ||
| 1506 | if (do_print_state) | ||
| 1507 | verbose("\nfrom %d to %d: safe\n", | ||
| 1508 | prev_insn_idx, insn_idx); | ||
| 1509 | else | ||
| 1510 | verbose("%d: safe\n", insn_idx); | ||
| 1511 | } | ||
| 1512 | goto process_bpf_exit; | ||
| 1513 | } | ||
| 1514 | |||
| 1515 | if (log_level && do_print_state) { | ||
| 1516 | verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); | ||
| 1517 | print_verifier_state(env); | ||
| 1518 | do_print_state = false; | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | if (log_level) { | ||
| 1522 | verbose("%d: ", insn_idx); | ||
| 1523 | print_bpf_insn(insn); | ||
| 1524 | } | ||
| 1525 | |||
| 1526 | if (class == BPF_ALU || class == BPF_ALU64) { | ||
| 1527 | err = check_alu_op(regs, insn); | ||
| 1528 | if (err) | ||
| 1529 | return err; | ||
| 1530 | |||
| 1531 | } else if (class == BPF_LDX) { | ||
| 1532 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1533 | insn->imm != 0) { | ||
| 1534 | verbose("BPF_LDX uses reserved fields\n"); | ||
| 1535 | return -EINVAL; | ||
| 1536 | } | ||
| 1537 | /* check src operand */ | ||
| 1538 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1539 | if (err) | ||
| 1540 | return err; | ||
| 1541 | |||
| 1542 | err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); | ||
| 1543 | if (err) | ||
| 1544 | return err; | ||
| 1545 | |||
| 1546 | /* check that memory (src_reg + off) is readable, | ||
| 1547 | * the state of dst_reg will be updated by this func | ||
| 1548 | */ | ||
| 1549 | err = check_mem_access(env, insn->src_reg, insn->off, | ||
| 1550 | BPF_SIZE(insn->code), BPF_READ, | ||
| 1551 | insn->dst_reg); | ||
| 1552 | if (err) | ||
| 1553 | return err; | ||
| 1554 | |||
| 1555 | } else if (class == BPF_STX) { | ||
| 1556 | if (BPF_MODE(insn->code) == BPF_XADD) { | ||
| 1557 | err = check_xadd(env, insn); | ||
| 1558 | if (err) | ||
| 1559 | return err; | ||
| 1560 | insn_idx++; | ||
| 1561 | continue; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1565 | insn->imm != 0) { | ||
| 1566 | verbose("BPF_STX uses reserved fields\n"); | ||
| 1567 | return -EINVAL; | ||
| 1568 | } | ||
| 1569 | /* check src1 operand */ | ||
| 1570 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1571 | if (err) | ||
| 1572 | return err; | ||
| 1573 | /* check src2 operand */ | ||
| 1574 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1575 | if (err) | ||
| 1576 | return err; | ||
| 1577 | |||
| 1578 | /* check that memory (dst_reg + off) is writeable */ | ||
| 1579 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 1580 | BPF_SIZE(insn->code), BPF_WRITE, | ||
| 1581 | insn->src_reg); | ||
| 1582 | if (err) | ||
| 1583 | return err; | ||
| 1584 | |||
| 1585 | } else if (class == BPF_ST) { | ||
| 1586 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1587 | insn->src_reg != BPF_REG_0) { | ||
| 1588 | verbose("BPF_ST uses reserved fields\n"); | ||
| 1589 | return -EINVAL; | ||
| 1590 | } | ||
| 1591 | /* check src operand */ | ||
| 1592 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1593 | if (err) | ||
| 1594 | return err; | ||
| 1595 | |||
| 1596 | /* check that memory (dst_reg + off) is writeable */ | ||
| 1597 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 1598 | BPF_SIZE(insn->code), BPF_WRITE, | ||
| 1599 | -1); | ||
| 1600 | if (err) | ||
| 1601 | return err; | ||
| 1602 | |||
| 1603 | } else if (class == BPF_JMP) { | ||
| 1604 | u8 opcode = BPF_OP(insn->code); | ||
| 1605 | |||
| 1606 | if (opcode == BPF_CALL) { | ||
| 1607 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1608 | insn->off != 0 || | ||
| 1609 | insn->src_reg != BPF_REG_0 || | ||
| 1610 | insn->dst_reg != BPF_REG_0) { | ||
| 1611 | verbose("BPF_CALL uses reserved fields\n"); | ||
| 1612 | return -EINVAL; | ||
| 1613 | } | ||
| 1614 | |||
| 1615 | err = check_call(env, insn->imm); | ||
| 1616 | if (err) | ||
| 1617 | return err; | ||
| 1618 | |||
| 1619 | } else if (opcode == BPF_JA) { | ||
| 1620 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1621 | insn->imm != 0 || | ||
| 1622 | insn->src_reg != BPF_REG_0 || | ||
| 1623 | insn->dst_reg != BPF_REG_0) { | ||
| 1624 | verbose("BPF_JA uses reserved fields\n"); | ||
| 1625 | return -EINVAL; | ||
| 1626 | } | ||
| 1627 | |||
| 1628 | insn_idx += insn->off + 1; | ||
| 1629 | continue; | ||
| 1630 | |||
| 1631 | } else if (opcode == BPF_EXIT) { | ||
| 1632 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1633 | insn->imm != 0 || | ||
| 1634 | insn->src_reg != BPF_REG_0 || | ||
| 1635 | insn->dst_reg != BPF_REG_0) { | ||
| 1636 | verbose("BPF_EXIT uses reserved fields\n"); | ||
| 1637 | return -EINVAL; | ||
| 1638 | } | ||
| 1639 | |||
| 1640 | /* eBPF calling convention is such that R0 is used | ||
| 1641 | * to return the value from eBPF program. | ||
| 1642 | * Make sure that it's readable at this time | ||
| 1643 | * of bpf_exit, which means that program wrote | ||
| 1644 | * something into it earlier | ||
| 1645 | */ | ||
| 1646 | err = check_reg_arg(regs, BPF_REG_0, SRC_OP); | ||
| 1647 | if (err) | ||
| 1648 | return err; | ||
| 1649 | |||
| 1650 | process_bpf_exit: | ||
| 1651 | insn_idx = pop_stack(env, &prev_insn_idx); | ||
| 1652 | if (insn_idx < 0) { | ||
| 1653 | break; | ||
| 1654 | } else { | ||
| 1655 | do_print_state = true; | ||
| 1656 | continue; | ||
| 1657 | } | ||
| 1658 | } else { | ||
| 1659 | err = check_cond_jmp_op(env, insn, &insn_idx); | ||
| 1660 | if (err) | ||
| 1661 | return err; | ||
| 1662 | } | ||
| 1663 | } else if (class == BPF_LD) { | ||
| 1664 | u8 mode = BPF_MODE(insn->code); | ||
| 1665 | |||
| 1666 | if (mode == BPF_ABS || mode == BPF_IND) { | ||
| 1667 | verbose("LD_ABS is not supported yet\n"); | ||
| 1668 | return -EINVAL; | ||
| 1669 | } else if (mode == BPF_IMM) { | ||
| 1670 | err = check_ld_imm(env, insn); | ||
| 1671 | if (err) | ||
| 1672 | return err; | ||
| 1673 | |||
| 1674 | insn_idx++; | ||
| 1675 | } else { | ||
| 1676 | verbose("invalid BPF_LD mode\n"); | ||
| 1677 | return -EINVAL; | ||
| 1678 | } | ||
| 1679 | } else { | ||
| 1680 | verbose("unknown insn class %d\n", class); | ||
| 1681 | return -EINVAL; | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | insn_idx++; | ||
| 1685 | } | ||
| 1686 | |||
| 1687 | return 0; | ||
| 1688 | } | ||
| 1689 | |||
| 1690 | /* look for pseudo eBPF instructions that access map FDs and | ||
| 1691 | * replace them with actual map pointers | ||
| 1692 | */ | ||
| 1693 | static int replace_map_fd_with_map_ptr(struct verifier_env *env) | ||
| 1694 | { | ||
| 1695 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1696 | int insn_cnt = env->prog->len; | ||
| 1697 | int i, j; | ||
| 1698 | |||
| 1699 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1700 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | ||
| 1701 | struct bpf_map *map; | ||
| 1702 | struct fd f; | ||
| 1703 | |||
| 1704 | if (i == insn_cnt - 1 || insn[1].code != 0 || | ||
| 1705 | insn[1].dst_reg != 0 || insn[1].src_reg != 0 || | ||
| 1706 | insn[1].off != 0) { | ||
| 1707 | verbose("invalid bpf_ld_imm64 insn\n"); | ||
| 1708 | return -EINVAL; | ||
| 1709 | } | ||
| 1710 | |||
| 1711 | if (insn->src_reg == 0) | ||
| 1712 | /* valid generic load 64-bit imm */ | ||
| 1713 | goto next_insn; | ||
| 1714 | |||
| 1715 | if (insn->src_reg != BPF_PSEUDO_MAP_FD) { | ||
| 1716 | verbose("unrecognized bpf_ld_imm64 insn\n"); | ||
| 1717 | return -EINVAL; | ||
| 1718 | } | ||
| 1719 | |||
| 1720 | f = fdget(insn->imm); | ||
| 1721 | |||
| 1722 | map = bpf_map_get(f); | ||
| 1723 | if (IS_ERR(map)) { | ||
| 1724 | verbose("fd %d is not pointing to valid bpf_map\n", | ||
| 1725 | insn->imm); | ||
| 1726 | fdput(f); | ||
| 1727 | return PTR_ERR(map); | ||
| 1728 | } | ||
| 1729 | |||
| 1730 | /* store map pointer inside BPF_LD_IMM64 instruction */ | ||
| 1731 | insn[0].imm = (u32) (unsigned long) map; | ||
| 1732 | insn[1].imm = ((u64) (unsigned long) map) >> 32; | ||
| 1733 | |||
| 1734 | /* check whether we recorded this map already */ | ||
| 1735 | for (j = 0; j < env->used_map_cnt; j++) | ||
| 1736 | if (env->used_maps[j] == map) { | ||
| 1737 | fdput(f); | ||
| 1738 | goto next_insn; | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | if (env->used_map_cnt >= MAX_USED_MAPS) { | ||
| 1742 | fdput(f); | ||
| 1743 | return -E2BIG; | ||
| 1744 | } | ||
| 1745 | |||
| 1746 | /* remember this map */ | ||
| 1747 | env->used_maps[env->used_map_cnt++] = map; | ||
| 1748 | |||
| 1749 | /* hold the map. If the program is rejected by verifier, | ||
| 1750 | * the map will be released by release_maps() or it | ||
| 1751 | * will be used by the valid program until it's unloaded | ||
| 1752 | * and all maps are released in free_bpf_prog_info() | ||
| 1753 | */ | ||
| 1754 | atomic_inc(&map->refcnt); | ||
| 1755 | |||
| 1756 | fdput(f); | ||
| 1757 | next_insn: | ||
| 1758 | insn++; | ||
| 1759 | i++; | ||
| 1760 | } | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | /* now all pseudo BPF_LD_IMM64 instructions load valid | ||
| 1764 | * 'struct bpf_map *' into a register instead of user map_fd. | ||
| 1765 | * These pointers will be used later by verifier to validate map access. | ||
| 1766 | */ | ||
| 1767 | return 0; | ||
| 1768 | } | ||
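For reference, this is the raw shape of the pseudo instruction pair being rewritten. The helper below is a hypothetical sketch using only the struct bpf_insn layout and BPF_PSEUDO_MAP_FD from include/uapi/linux/bpf.h; the emit function name and its parameters are made up for illustration.

```c
#include <linux/bpf.h>

/* Hypothetical: emit the two-insn pseudo BPF_LD_IMM64 that loads a map
 * reference into 'dst'.  Before the rewrite above, insn[0].imm holds the
 * user-visible map fd; afterwards insn[0].imm and insn[1].imm hold the
 * low and high halves of the kernel's 'struct bpf_map *'.
 */
static void emit_pseudo_map_ld(struct bpf_insn *insn, int dst, int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = dst,
		.src_reg = BPF_PSEUDO_MAP_FD,
		.imm     = map_fd,
	};
	/* second half of the 16-byte insn: everything except imm must be 0 */
	insn[1] = (struct bpf_insn) { .code = 0 };
}
```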
| 1769 | |||
| 1770 | /* drop refcnt of maps used by the rejected program */ | ||
| 1771 | static void release_maps(struct verifier_env *env) | ||
| 1772 | { | ||
| 1773 | int i; | ||
| 1774 | |||
| 1775 | for (i = 0; i < env->used_map_cnt; i++) | ||
| 1776 | bpf_map_put(env->used_maps[i]); | ||
| 1777 | } | ||
| 1778 | |||
| 1779 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ | ||
| 1780 | static void convert_pseudo_ld_imm64(struct verifier_env *env) | ||
| 1781 | { | ||
| 1782 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1783 | int insn_cnt = env->prog->len; | ||
| 1784 | int i; | ||
| 1785 | |||
| 1786 | for (i = 0; i < insn_cnt; i++, insn++) | ||
| 1787 | if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) | ||
| 1788 | insn->src_reg = 0; | ||
| 1789 | } | ||
| 1790 | |||
| 1791 | static void free_states(struct verifier_env *env) | ||
| 1792 | { | ||
| 1793 | struct verifier_state_list *sl, *sln; | ||
| 1794 | int i; | ||
| 1795 | |||
| 1796 | if (!env->explored_states) | ||
| 1797 | return; | ||
| 1798 | |||
| 1799 | for (i = 0; i < env->prog->len; i++) { | ||
| 1800 | sl = env->explored_states[i]; | ||
| 1801 | |||
| 1802 | if (sl) | ||
| 1803 | while (sl != STATE_LIST_MARK) { | ||
| 1804 | sln = sl->next; | ||
| 1805 | kfree(sl); | ||
| 1806 | sl = sln; | ||
| 1807 | } | ||
| 1808 | } | ||
| 1809 | |||
| 1810 | kfree(env->explored_states); | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | ||
| 1814 | { | ||
| 1815 | char __user *log_ubuf = NULL; | ||
| 1816 | struct verifier_env *env; | ||
| 1817 | int ret = -EINVAL; | ||
| 1818 | |||
| 1819 | if (prog->len <= 0 || prog->len > BPF_MAXINSNS) | ||
| 1820 | return -E2BIG; | ||
| 1821 | |||
| 1822 | /* 'struct verifier_env' can be global, but since it's not small, | ||
| 1823 | * allocate/free it every time bpf_check() is called | ||
| 1824 | */ | ||
| 1825 | env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); | ||
| 1826 | if (!env) | ||
| 1827 | return -ENOMEM; | ||
| 1828 | |||
| 1829 | env->prog = prog; | ||
| 1830 | |||
| 1831 | /* grab the mutex to protect few globals used by verifier */ | ||
| 1832 | mutex_lock(&bpf_verifier_lock); | ||
| 1833 | |||
| 1834 | if (attr->log_level || attr->log_buf || attr->log_size) { | ||
| 1835 | /* user requested verbose verifier output | ||
| 1836 | * and supplied buffer to store the verification trace | ||
| 1837 | */ | ||
| 1838 | log_level = attr->log_level; | ||
| 1839 | log_ubuf = (char __user *) (unsigned long) attr->log_buf; | ||
| 1840 | log_size = attr->log_size; | ||
| 1841 | log_len = 0; | ||
| 1842 | |||
| 1843 | ret = -EINVAL; | ||
| 1844 | /* log_* values have to be sane */ | ||
| 1845 | if (log_size < 128 || log_size > UINT_MAX >> 8 || | ||
| 1846 | log_level == 0 || log_ubuf == NULL) | ||
| 1847 | goto free_env; | ||
| 1848 | |||
| 1849 | ret = -ENOMEM; | ||
| 1850 | log_buf = vmalloc(log_size); | ||
| 1851 | if (!log_buf) | ||
| 1852 | goto free_env; | ||
| 1853 | } else { | ||
| 1854 | log_level = 0; | ||
| 1855 | } | ||
| 1856 | |||
| 1857 | ret = replace_map_fd_with_map_ptr(env); | ||
| 1858 | if (ret < 0) | ||
| 1859 | goto skip_full_check; | ||
| 1860 | |||
| 1861 | env->explored_states = kcalloc(prog->len, | ||
| 1862 | sizeof(struct verifier_state_list *), | ||
| 1863 | GFP_USER); | ||
| 1864 | ret = -ENOMEM; | ||
| 1865 | if (!env->explored_states) | ||
| 1866 | goto skip_full_check; | ||
| 1867 | |||
| 1868 | ret = check_cfg(env); | ||
| 1869 | if (ret < 0) | ||
| 1870 | goto skip_full_check; | ||
| 1871 | |||
| 1872 | ret = do_check(env); | ||
| 1873 | |||
| 1874 | skip_full_check: | ||
| 1875 | while (pop_stack(env, NULL) >= 0); | ||
| 1876 | free_states(env); | ||
| 1877 | |||
| 1878 | if (log_level && log_len >= log_size - 1) { | ||
| 1879 | BUG_ON(log_len >= log_size); | ||
| 1880 | /* verifier log exceeded user supplied buffer */ | ||
| 1881 | ret = -ENOSPC; | ||
| 1882 | /* fall through to return what was recorded */ | ||
| 1883 | } | ||
| 1884 | |||
| 1885 | /* copy verifier log back to user space including trailing zero */ | ||
| 1886 | if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { | ||
| 1887 | ret = -EFAULT; | ||
| 1888 | goto free_log_buf; | ||
| 1889 | } | ||
| 1890 | |||
| 1891 | if (ret == 0 && env->used_map_cnt) { | ||
| 1892 | /* if program passed verifier, update used_maps in bpf_prog_info */ | ||
| 1893 | prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | ||
| 1894 | sizeof(env->used_maps[0]), | ||
| 1895 | GFP_KERNEL); | ||
| 1896 | |||
| 1897 | if (!prog->aux->used_maps) { | ||
| 1898 | ret = -ENOMEM; | ||
| 1899 | goto free_log_buf; | ||
| 1900 | } | ||
| 1901 | |||
| 1902 | memcpy(prog->aux->used_maps, env->used_maps, | ||
| 1903 | sizeof(env->used_maps[0]) * env->used_map_cnt); | ||
| 1904 | prog->aux->used_map_cnt = env->used_map_cnt; | ||
| 1905 | |||
| 1906 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | ||
| 1907 | * bpf_ld_imm64 instructions | ||
| 1908 | */ | ||
| 1909 | convert_pseudo_ld_imm64(env); | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | free_log_buf: | ||
| 1913 | if (log_level) | ||
| 1914 | vfree(log_buf); | ||
| 1915 | free_env: | ||
| 1916 | if (!prog->aux->used_maps) | ||
| 1917 | /* if we didn't copy map pointers into bpf_prog_info, release | ||
| 1918 | * them now. Otherwise free_bpf_prog_info() will release them. | ||
| 1919 | */ | ||
| 1920 | release_maps(env); | ||
| 1921 | kfree(env); | ||
| 1922 | mutex_unlock(&bpf_verifier_lock); | ||
| 1923 | return ret; | ||
| 1924 | } | ||
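From user space, the log_* attributes consumed above come straight from the BPF_PROG_LOAD command. The loader below is a hypothetical sketch: the field names follow include/uapi/linux/bpf.h, while __NR_bpf being exposed by the libc headers, the function name, and its parameters are assumptions for illustration only.

```c
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static char verifier_log[65536];

/* Hypothetical loader: hand the verifier a log buffer (at least 128 bytes,
 * per the sanity check in bpf_check()) and print the trace on failure.
 */
static int load_prog(enum bpf_prog_type type, const struct bpf_insn *insns,
		     unsigned int insn_cnt, const char *license)
{
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = type;
	attr.insns     = (__u64) (unsigned long) insns;
	attr.insn_cnt  = insn_cnt;
	attr.license   = (__u64) (unsigned long) license;
	attr.log_buf   = (__u64) (unsigned long) verifier_log;
	attr.log_size  = sizeof(verifier_log);
	attr.log_level = 1;	/* non-zero: ask bpf_check() for a trace */

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "verifier log:\n%s\n", verifier_log);
	return fd;
}
```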
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..136eceadeed1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly; | |||
| 185 | static struct cftype cgroup_dfl_base_files[]; | 185 | static struct cftype cgroup_dfl_base_files[]; |
| 186 | static struct cftype cgroup_legacy_base_files[]; | 186 | static struct cftype cgroup_legacy_base_files[]; |
| 187 | 187 | ||
| 188 | static void cgroup_put(struct cgroup *cgrp); | ||
| 189 | static int rebind_subsystems(struct cgroup_root *dst_root, | 188 | static int rebind_subsystems(struct cgroup_root *dst_root, |
| 190 | unsigned int ss_mask); | 189 | unsigned int ss_mask); |
| 191 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 190 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
| @@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref); | |||
| 195 | static void kill_css(struct cgroup_subsys_state *css); | 194 | static void kill_css(struct cgroup_subsys_state *css); |
| 196 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 195 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
| 197 | bool is_add); | 196 | bool is_add); |
| 198 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | ||
| 199 | 197 | ||
| 200 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | 198 | /* IDR wrappers which synchronize using cgroup_idr_lock */ |
| 201 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | 199 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, |
| @@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) | |||
| 331 | return false; | 329 | return false; |
| 332 | } | 330 | } |
| 333 | 331 | ||
| 334 | static int cgroup_is_releasable(const struct cgroup *cgrp) | ||
| 335 | { | ||
| 336 | const int bits = | ||
| 337 | (1 << CGRP_RELEASABLE) | | ||
| 338 | (1 << CGRP_NOTIFY_ON_RELEASE); | ||
| 339 | return (cgrp->flags & bits) == bits; | ||
| 340 | } | ||
| 341 | |||
| 342 | static int notify_on_release(const struct cgroup *cgrp) | 332 | static int notify_on_release(const struct cgroup *cgrp) |
| 343 | { | 333 | { |
| 344 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 334 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| @@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
| 394 | ; \ | 384 | ; \ |
| 395 | else | 385 | else |
| 396 | 386 | ||
| 397 | /* the list of cgroups eligible for automatic release. Protected by | ||
| 398 | * release_list_lock */ | ||
| 399 | static LIST_HEAD(release_list); | ||
| 400 | static DEFINE_RAW_SPINLOCK(release_list_lock); | ||
| 401 | static void cgroup_release_agent(struct work_struct *work); | 387 | static void cgroup_release_agent(struct work_struct *work); |
| 402 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | ||
| 403 | static void check_for_release(struct cgroup *cgrp); | 388 | static void check_for_release(struct cgroup *cgrp); |
| 404 | 389 | ||
| 405 | /* | 390 | /* |
| @@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
| 498 | return key; | 483 | return key; |
| 499 | } | 484 | } |
| 500 | 485 | ||
| 501 | static void put_css_set_locked(struct css_set *cset, bool taskexit) | 486 | static void put_css_set_locked(struct css_set *cset) |
| 502 | { | 487 | { |
| 503 | struct cgrp_cset_link *link, *tmp_link; | 488 | struct cgrp_cset_link *link, *tmp_link; |
| 504 | struct cgroup_subsys *ss; | 489 | struct cgroup_subsys *ss; |
| @@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 524 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 509 | /* @cgrp can't go away while we're holding css_set_rwsem */ |
| 525 | if (list_empty(&cgrp->cset_links)) { | 510 | if (list_empty(&cgrp->cset_links)) { |
| 526 | cgroup_update_populated(cgrp, false); | 511 | cgroup_update_populated(cgrp, false); |
| 527 | if (notify_on_release(cgrp)) { | 512 | check_for_release(cgrp); |
| 528 | if (taskexit) | ||
| 529 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
| 530 | check_for_release(cgrp); | ||
| 531 | } | ||
| 532 | } | 513 | } |
| 533 | 514 | ||
| 534 | kfree(link); | 515 | kfree(link); |
| @@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 537 | kfree_rcu(cset, rcu_head); | 518 | kfree_rcu(cset, rcu_head); |
| 538 | } | 519 | } |
| 539 | 520 | ||
| 540 | static void put_css_set(struct css_set *cset, bool taskexit) | 521 | static void put_css_set(struct css_set *cset) |
| 541 | { | 522 | { |
| 542 | /* | 523 | /* |
| 543 | * Ensure that the refcount doesn't hit zero while any readers | 524 | * Ensure that the refcount doesn't hit zero while any readers |
| @@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit) | |||
| 548 | return; | 529 | return; |
| 549 | 530 | ||
| 550 | down_write(&css_set_rwsem); | 531 | down_write(&css_set_rwsem); |
| 551 | put_css_set_locked(cset, taskexit); | 532 | put_css_set_locked(cset); |
| 552 | up_write(&css_set_rwsem); | 533 | up_write(&css_set_rwsem); |
| 553 | } | 534 | } |
| 554 | 535 | ||
| @@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
| 969 | * knows that the cgroup won't be removed, as cgroup_rmdir() | 950 | * knows that the cgroup won't be removed, as cgroup_rmdir() |
| 970 | * needs that mutex. | 951 | * needs that mutex. |
| 971 | * | 952 | * |
| 972 | * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't | ||
| 973 | * (usually) take cgroup_mutex. These are the two most performance | ||
| 974 | * critical pieces of code here. The exception occurs on cgroup_exit(), | ||
| 975 | * when a task in a notify_on_release cgroup exits. Then cgroup_mutex | ||
| 976 | * is taken, and if the cgroup count is zero, a usermode call made | ||
| 977 | * to the release agent with the name of the cgroup (path relative to | ||
| 978 | * the root of cgroup file system) as the argument. | ||
| 979 | * | ||
| 980 | * A cgroup can only be deleted if both its 'count' of using tasks | 953 | * A cgroup can only be deleted if both its 'count' of using tasks |
| 981 | * is zero, and its list of 'children' cgroups is empty. Since all | 954 | * is zero, and its list of 'children' cgroups is empty. Since all |
| 982 | * tasks in the system use _some_ cgroup, and since there is always at | 955 | * tasks in the system use _some_ cgroup, and since there is always at |
| @@ -1035,6 +1008,11 @@ static void cgroup_get(struct cgroup *cgrp) | |||
| 1035 | css_get(&cgrp->self); | 1008 | css_get(&cgrp->self); |
| 1036 | } | 1009 | } |
| 1037 | 1010 | ||
| 1011 | static bool cgroup_tryget(struct cgroup *cgrp) | ||
| 1012 | { | ||
| 1013 | return css_tryget(&cgrp->self); | ||
| 1014 | } | ||
| 1015 | |||
| 1038 | static void cgroup_put(struct cgroup *cgrp) | 1016 | static void cgroup_put(struct cgroup *cgrp) |
| 1039 | { | 1017 | { |
| 1040 | css_put(&cgrp->self); | 1018 | css_put(&cgrp->self); |
| @@ -1147,7 +1125,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) | |||
| 1147 | * protection against removal. Ensure @cgrp stays accessible and | 1125 | * protection against removal. Ensure @cgrp stays accessible and |
| 1148 | * break the active_ref protection. | 1126 | * break the active_ref protection. |
| 1149 | */ | 1127 | */ |
| 1150 | cgroup_get(cgrp); | 1128 | if (!cgroup_tryget(cgrp)) |
| 1129 | return NULL; | ||
| 1151 | kernfs_break_active_protection(kn); | 1130 | kernfs_break_active_protection(kn); |
| 1152 | 1131 | ||
| 1153 | mutex_lock(&cgroup_mutex); | 1132 | mutex_lock(&cgroup_mutex); |
| @@ -1581,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1581 | INIT_LIST_HEAD(&cgrp->self.sibling); | 1560 | INIT_LIST_HEAD(&cgrp->self.sibling); |
| 1582 | INIT_LIST_HEAD(&cgrp->self.children); | 1561 | INIT_LIST_HEAD(&cgrp->self.children); |
| 1583 | INIT_LIST_HEAD(&cgrp->cset_links); | 1562 | INIT_LIST_HEAD(&cgrp->cset_links); |
| 1584 | INIT_LIST_HEAD(&cgrp->release_list); | ||
| 1585 | INIT_LIST_HEAD(&cgrp->pidlists); | 1563 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1586 | mutex_init(&cgrp->pidlist_mutex); | 1564 | mutex_init(&cgrp->pidlist_mutex); |
| 1587 | cgrp->self.cgroup = cgrp; | 1565 | cgrp->self.cgroup = cgrp; |
| @@ -1591,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1591 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); | 1569 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); |
| 1592 | 1570 | ||
| 1593 | init_waitqueue_head(&cgrp->offline_waitq); | 1571 | init_waitqueue_head(&cgrp->offline_waitq); |
| 1572 | INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); | ||
| 1594 | } | 1573 | } |
| 1595 | 1574 | ||
| 1596 | static void init_cgroup_root(struct cgroup_root *root, | 1575 | static void init_cgroup_root(struct cgroup_root *root, |
| @@ -1628,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | |||
| 1628 | goto out; | 1607 | goto out; |
| 1629 | root_cgrp->id = ret; | 1608 | root_cgrp->id = ret; |
| 1630 | 1609 | ||
| 1631 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); | 1610 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, |
| 1611 | GFP_KERNEL); | ||
| 1632 | if (ret) | 1612 | if (ret) |
| 1633 | goto out; | 1613 | goto out; |
| 1634 | 1614 | ||
| @@ -2046,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, | |||
| 2046 | * task. As trading it for new_cset is protected by cgroup_mutex, | 2026 | * task. As trading it for new_cset is protected by cgroup_mutex, |
| 2047 | * we're safe to drop it here; it will be freed under RCU. | 2027 | * we're safe to drop it here; it will be freed under RCU. |
| 2048 | */ | 2028 | */ |
| 2049 | set_bit(CGRP_RELEASABLE, &old_cgrp->flags); | 2029 | put_css_set_locked(old_cset); |
| 2050 | put_css_set_locked(old_cset, false); | ||
| 2051 | } | 2030 | } |
| 2052 | 2031 | ||
| 2053 | /** | 2032 | /** |
| @@ -2068,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
| 2068 | cset->mg_src_cgrp = NULL; | 2047 | cset->mg_src_cgrp = NULL; |
| 2069 | cset->mg_dst_cset = NULL; | 2048 | cset->mg_dst_cset = NULL; |
| 2070 | list_del_init(&cset->mg_preload_node); | 2049 | list_del_init(&cset->mg_preload_node); |
| 2071 | put_css_set_locked(cset, false); | 2050 | put_css_set_locked(cset); |
| 2072 | } | 2051 | } |
| 2073 | up_write(&css_set_rwsem); | 2052 | up_write(&css_set_rwsem); |
| 2074 | } | 2053 | } |
| @@ -2162,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 2162 | if (src_cset == dst_cset) { | 2141 | if (src_cset == dst_cset) { |
| 2163 | src_cset->mg_src_cgrp = NULL; | 2142 | src_cset->mg_src_cgrp = NULL; |
| 2164 | list_del_init(&src_cset->mg_preload_node); | 2143 | list_del_init(&src_cset->mg_preload_node); |
| 2165 | put_css_set(src_cset, false); | 2144 | put_css_set(src_cset); |
| 2166 | put_css_set(dst_cset, false); | 2145 | put_css_set(dst_cset); |
| 2167 | continue; | 2146 | continue; |
| 2168 | } | 2147 | } |
| 2169 | 2148 | ||
| @@ -2172,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 2172 | if (list_empty(&dst_cset->mg_preload_node)) | 2151 | if (list_empty(&dst_cset->mg_preload_node)) |
| 2173 | list_add(&dst_cset->mg_preload_node, &csets); | 2152 | list_add(&dst_cset->mg_preload_node, &csets); |
| 2174 | else | 2153 | else |
| 2175 | put_css_set(dst_cset, false); | 2154 | put_css_set(dst_cset); |
| 2176 | } | 2155 | } |
| 2177 | 2156 | ||
| 2178 | list_splice_tail(&csets, preloaded_csets); | 2157 | list_splice_tail(&csets, preloaded_csets); |
| @@ -3271,8 +3250,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 3271 | { | 3250 | { |
| 3272 | struct cftype *cft; | 3251 | struct cftype *cft; |
| 3273 | 3252 | ||
| 3274 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | 3253 | /* |
| 3275 | cft->flags |= __CFTYPE_NOT_ON_DFL; | 3254 | * If legacy_files_on_dfl, we want to show the legacy files on the |
| 3255 | * dfl hierarchy, but only if the target subsystem hasn't been updated | ||
| 3256 | * for the dfl hierarchy yet. | ||
| 3257 | */ | ||
| 3258 | if (!cgroup_legacy_files_on_dfl || | ||
| 3259 | ss->dfl_cftypes != ss->legacy_cftypes) { | ||
| 3260 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | ||
| 3261 | cft->flags |= __CFTYPE_NOT_ON_DFL; | ||
| 3262 | } | ||
| 3263 | |||
| 3276 | return cgroup_add_cftypes(ss, cfts); | 3264 | return cgroup_add_cftypes(ss, cfts); |
| 3277 | } | 3265 | } |
| 3278 | 3266 | ||
| @@ -3970,7 +3958,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3970 | 3958 | ||
| 3971 | l = cgroup_pidlist_find_create(cgrp, type); | 3959 | l = cgroup_pidlist_find_create(cgrp, type); |
| 3972 | if (!l) { | 3960 | if (!l) { |
| 3973 | mutex_unlock(&cgrp->pidlist_mutex); | ||
| 3974 | pidlist_free(array); | 3961 | pidlist_free(array); |
| 3975 | return -ENOMEM; | 3962 | return -ENOMEM; |
| 3976 | } | 3963 | } |
| @@ -4159,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | |||
| 4159 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, | 4146 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
| 4160 | struct cftype *cft, u64 val) | 4147 | struct cftype *cft, u64 val) |
| 4161 | { | 4148 | { |
| 4162 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); | ||
| 4163 | if (val) | 4149 | if (val) |
| 4164 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | 4150 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
| 4165 | else | 4151 | else |
| @@ -4337,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 4337 | /* cgroup free path */ | 4323 | /* cgroup free path */ |
| 4338 | atomic_dec(&cgrp->root->nr_cgrps); | 4324 | atomic_dec(&cgrp->root->nr_cgrps); |
| 4339 | cgroup_pidlist_destroy_all(cgrp); | 4325 | cgroup_pidlist_destroy_all(cgrp); |
| 4326 | cancel_work_sync(&cgrp->release_agent_work); | ||
| 4340 | 4327 | ||
| 4341 | if (cgroup_parent(cgrp)) { | 4328 | if (cgroup_parent(cgrp)) { |
| 4342 | /* | 4329 | /* |
| @@ -4387,6 +4374,15 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 4387 | /* cgroup release path */ | 4374 | /* cgroup release path */ |
| 4388 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 4375 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
| 4389 | cgrp->id = -1; | 4376 | cgrp->id = -1; |
| 4377 | |||
| 4378 | /* | ||
| 4379 | * There are two control paths which try to determine | ||
| 4380 | * cgroup from dentry without going through kernfs - | ||
| 4381 | * cgroupstats_build() and css_tryget_online_from_dir(). | ||
| 4382 | * Those are supported by RCU protecting clearing of | ||
| 4383 | * cgrp->kn->priv backpointer. | ||
| 4384 | */ | ||
| 4385 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); | ||
| 4390 | } | 4386 | } |
| 4391 | 4387 | ||
| 4392 | mutex_unlock(&cgroup_mutex); | 4388 | mutex_unlock(&cgroup_mutex); |
| @@ -4487,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
| 4487 | 4483 | ||
| 4488 | init_and_link_css(css, ss, cgrp); | 4484 | init_and_link_css(css, ss, cgrp); |
| 4489 | 4485 | ||
| 4490 | err = percpu_ref_init(&css->refcnt, css_release); | 4486 | err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); |
| 4491 | if (err) | 4487 | if (err) |
| 4492 | goto err_free_css; | 4488 | goto err_free_css; |
| 4493 | 4489 | ||
| @@ -4543,6 +4539,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 4543 | struct cftype *base_files; | 4539 | struct cftype *base_files; |
| 4544 | int ssid, ret; | 4540 | int ssid, ret; |
| 4545 | 4541 | ||
| 4542 | /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. | ||
| 4543 | */ | ||
| 4544 | if (strchr(name, '\n')) | ||
| 4545 | return -EINVAL; | ||
| 4546 | |||
| 4546 | parent = cgroup_kn_lock_live(parent_kn); | 4547 | parent = cgroup_kn_lock_live(parent_kn); |
| 4547 | if (!parent) | 4548 | if (!parent) |
| 4548 | return -ENODEV; | 4549 | return -ENODEV; |
| @@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 4555 | goto out_unlock; | 4556 | goto out_unlock; |
| 4556 | } | 4557 | } |
| 4557 | 4558 | ||
| 4558 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | 4559 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); |
| 4559 | if (ret) | 4560 | if (ret) |
| 4560 | goto out_free_cgrp; | 4561 | goto out_free_cgrp; |
| 4561 | 4562 | ||
| @@ -4785,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4785 | for_each_css(css, ssid, cgrp) | 4786 | for_each_css(css, ssid, cgrp) |
| 4786 | kill_css(css); | 4787 | kill_css(css); |
| 4787 | 4788 | ||
| 4788 | /* CSS_ONLINE is clear, remove from ->release_list for the last time */ | ||
| 4789 | raw_spin_lock(&release_list_lock); | ||
| 4790 | if (!list_empty(&cgrp->release_list)) | ||
| 4791 | list_del_init(&cgrp->release_list); | ||
| 4792 | raw_spin_unlock(&release_list_lock); | ||
| 4793 | |||
| 4794 | /* | 4789 | /* |
| 4795 | * Remove @cgrp directory along with the base files. @cgrp has an | 4790 | * Remove @cgrp directory along with the base files. @cgrp has an |
| 4796 | * extra ref on its kn. | 4791 | * extra ref on its kn. |
| 4797 | */ | 4792 | */ |
| 4798 | kernfs_remove(cgrp->kn); | 4793 | kernfs_remove(cgrp->kn); |
| 4799 | 4794 | ||
| 4800 | set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); | ||
| 4801 | check_for_release(cgroup_parent(cgrp)); | 4795 | check_for_release(cgroup_parent(cgrp)); |
| 4802 | 4796 | ||
| 4803 | /* put the base reference */ | 4797 | /* put the base reference */ |
| @@ -4814,23 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn) | |||
| 4814 | cgrp = cgroup_kn_lock_live(kn); | 4808 | cgrp = cgroup_kn_lock_live(kn); |
| 4815 | if (!cgrp) | 4809 | if (!cgrp) |
| 4816 | return 0; | 4810 | return 0; |
| 4817 | cgroup_get(cgrp); /* for @kn->priv clearing */ | ||
| 4818 | 4811 | ||
| 4819 | ret = cgroup_destroy_locked(cgrp); | 4812 | ret = cgroup_destroy_locked(cgrp); |
| 4820 | 4813 | ||
| 4821 | cgroup_kn_unlock(kn); | 4814 | cgroup_kn_unlock(kn); |
| 4822 | |||
| 4823 | /* | ||
| 4824 | * There are two control paths which try to determine cgroup from | ||
| 4825 | * dentry without going through kernfs - cgroupstats_build() and | ||
| 4826 | * css_tryget_online_from_dir(). Those are supported by RCU | ||
| 4827 | * protecting clearing of cgrp->kn->priv backpointer, which should | ||
| 4828 | * happen after all files under it have been removed. | ||
| 4829 | */ | ||
| 4830 | if (!ret) | ||
| 4831 | RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); | ||
| 4832 | |||
| 4833 | cgroup_put(cgrp); | ||
| 4834 | return ret; | 4815 | return ret; |
| 4835 | } | 4816 | } |
| 4836 | 4817 | ||
| @@ -5034,12 +5015,9 @@ core_initcall(cgroup_wq_init); | |||
| 5034 | * - Print task's cgroup paths into seq_file, one line for each hierarchy | 5015 | * - Print task's cgroup paths into seq_file, one line for each hierarchy |
| 5035 | * - Used for /proc/<pid>/cgroup. | 5016 | * - Used for /proc/<pid>/cgroup. |
| 5036 | */ | 5017 | */ |
| 5037 | 5018 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |
| 5038 | /* TODO: Use a proper seq_file iterator */ | 5019 | struct pid *pid, struct task_struct *tsk) |
| 5039 | int proc_cgroup_show(struct seq_file *m, void *v) | ||
| 5040 | { | 5020 | { |
| 5041 | struct pid *pid; | ||
| 5042 | struct task_struct *tsk; | ||
| 5043 | char *buf, *path; | 5021 | char *buf, *path; |
| 5044 | int retval; | 5022 | int retval; |
| 5045 | struct cgroup_root *root; | 5023 | struct cgroup_root *root; |
| @@ -5049,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 5049 | if (!buf) | 5027 | if (!buf) |
| 5050 | goto out; | 5028 | goto out; |
| 5051 | 5029 | ||
| 5052 | retval = -ESRCH; | ||
| 5053 | pid = m->private; | ||
| 5054 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
| 5055 | if (!tsk) | ||
| 5056 | goto out_free; | ||
| 5057 | |||
| 5058 | retval = 0; | ||
| 5059 | |||
| 5060 | mutex_lock(&cgroup_mutex); | 5030 | mutex_lock(&cgroup_mutex); |
| 5061 | down_read(&css_set_rwsem); | 5031 | down_read(&css_set_rwsem); |
| 5062 | 5032 | ||
| @@ -5086,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 5086 | seq_putc(m, '\n'); | 5056 | seq_putc(m, '\n'); |
| 5087 | } | 5057 | } |
| 5088 | 5058 | ||
| 5059 | retval = 0; | ||
| 5089 | out_unlock: | 5060 | out_unlock: |
| 5090 | up_read(&css_set_rwsem); | 5061 | up_read(&css_set_rwsem); |
| 5091 | mutex_unlock(&cgroup_mutex); | 5062 | mutex_unlock(&cgroup_mutex); |
| 5092 | put_task_struct(tsk); | ||
| 5093 | out_free: | ||
| 5094 | kfree(buf); | 5063 | kfree(buf); |
| 5095 | out: | 5064 | out: |
| 5096 | return retval; | 5065 | return retval; |
| @@ -5161,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 5161 | int i; | 5130 | int i; |
| 5162 | 5131 | ||
| 5163 | /* | 5132 | /* |
| 5164 | * This may race against cgroup_enable_task_cg_links(). As that | 5133 | * This may race against cgroup_enable_task_cg_lists(). As that |
| 5165 | * function sets use_task_css_set_links before grabbing | 5134 | * function sets use_task_css_set_links before grabbing |
| 5166 | * tasklist_lock and we just went through tasklist_lock to add | 5135 | * tasklist_lock and we just went through tasklist_lock to add |
| 5167 | * @child, it's guaranteed that either we see the set | 5136 | * @child, it's guaranteed that either we see the set |
| @@ -5176,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 5176 | * when implementing operations which need to migrate all tasks of | 5145 | * when implementing operations which need to migrate all tasks of |
| 5177 | * a cgroup to another. | 5146 | * a cgroup to another. |
| 5178 | * | 5147 | * |
| 5179 | * Note that if we lose to cgroup_enable_task_cg_links(), @child | 5148 | * Note that if we lose to cgroup_enable_task_cg_lists(), @child |
| 5180 | * will remain in init_css_set. This is safe because all tasks are | 5149 | * will remain in init_css_set. This is safe because all tasks are |
| 5181 | * in the init_css_set before cg_links is enabled and there's no | 5150 | * in the init_css_set before cg_links is enabled and there's no |
| 5182 | * operation which transfers all tasks out of init_css_set. | 5151 | * operation which transfers all tasks out of init_css_set. |
| @@ -5260,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 5260 | } | 5229 | } |
| 5261 | 5230 | ||
| 5262 | if (put_cset) | 5231 | if (put_cset) |
| 5263 | put_css_set(cset, true); | 5232 | put_css_set(cset); |
| 5264 | } | 5233 | } |
| 5265 | 5234 | ||
| 5266 | static void check_for_release(struct cgroup *cgrp) | 5235 | static void check_for_release(struct cgroup *cgrp) |
| 5267 | { | 5236 | { |
| 5268 | if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && | 5237 | if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && |
| 5269 | !css_has_online_children(&cgrp->self)) { | 5238 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) |
| 5270 | /* | 5239 | schedule_work(&cgrp->release_agent_work); |
| 5271 | * Control Group is currently removeable. If it's not | ||
| 5272 | * already queued for a userspace notification, queue | ||
| 5273 | * it now | ||
| 5274 | */ | ||
| 5275 | int need_schedule_work = 0; | ||
| 5276 | |||
| 5277 | raw_spin_lock(&release_list_lock); | ||
| 5278 | if (!cgroup_is_dead(cgrp) && | ||
| 5279 | list_empty(&cgrp->release_list)) { | ||
| 5280 | list_add(&cgrp->release_list, &release_list); | ||
| 5281 | need_schedule_work = 1; | ||
| 5282 | } | ||
| 5283 | raw_spin_unlock(&release_list_lock); | ||
| 5284 | if (need_schedule_work) | ||
| 5285 | schedule_work(&release_agent_work); | ||
| 5286 | } | ||
| 5287 | } | 5240 | } |
| 5288 | 5241 | ||
| 5289 | /* | 5242 | /* |
| @@ -5311,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp) | |||
| 5311 | */ | 5264 | */ |
| 5312 | static void cgroup_release_agent(struct work_struct *work) | 5265 | static void cgroup_release_agent(struct work_struct *work) |
| 5313 | { | 5266 | { |
| 5314 | BUG_ON(work != &release_agent_work); | 5267 | struct cgroup *cgrp = |
| 5268 | container_of(work, struct cgroup, release_agent_work); | ||
| 5269 | char *pathbuf = NULL, *agentbuf = NULL, *path; | ||
| 5270 | char *argv[3], *envp[3]; | ||
| 5271 | |||
| 5315 | mutex_lock(&cgroup_mutex); | 5272 | mutex_lock(&cgroup_mutex); |
| 5316 | raw_spin_lock(&release_list_lock); | 5273 | |
| 5317 | while (!list_empty(&release_list)) { | 5274 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
| 5318 | char *argv[3], *envp[3]; | 5275 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); |
| 5319 | int i; | 5276 | if (!pathbuf || !agentbuf) |
| 5320 | char *pathbuf = NULL, *agentbuf = NULL, *path; | 5277 | goto out; |
| 5321 | struct cgroup *cgrp = list_entry(release_list.next, | 5278 | |
| 5322 | struct cgroup, | 5279 | path = cgroup_path(cgrp, pathbuf, PATH_MAX); |
| 5323 | release_list); | 5280 | if (!path) |
| 5324 | list_del_init(&cgrp->release_list); | 5281 | goto out; |
| 5325 | raw_spin_unlock(&release_list_lock); | 5282 | |
| 5326 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); | 5283 | argv[0] = agentbuf; |
| 5327 | if (!pathbuf) | 5284 | argv[1] = path; |
| 5328 | goto continue_free; | 5285 | argv[2] = NULL; |
| 5329 | path = cgroup_path(cgrp, pathbuf, PATH_MAX); | 5286 | |
| 5330 | if (!path) | 5287 | /* minimal command environment */ |
| 5331 | goto continue_free; | 5288 | envp[0] = "HOME=/"; |
| 5332 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); | 5289 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
| 5333 | if (!agentbuf) | 5290 | envp[2] = NULL; |
| 5334 | goto continue_free; | 5291 | |
| 5335 | |||
| 5336 | i = 0; | ||
| 5337 | argv[i++] = agentbuf; | ||
| 5338 | argv[i++] = path; | ||
| 5339 | argv[i] = NULL; | ||
| 5340 | |||
| 5341 | i = 0; | ||
| 5342 | /* minimal command environment */ | ||
| 5343 | envp[i++] = "HOME=/"; | ||
| 5344 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 5345 | envp[i] = NULL; | ||
| 5346 | |||
| 5347 | /* Drop the lock while we invoke the usermode helper, | ||
| 5348 | * since the exec could involve hitting disk and hence | ||
| 5349 | * be a slow process */ | ||
| 5350 | mutex_unlock(&cgroup_mutex); | ||
| 5351 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
| 5352 | mutex_lock(&cgroup_mutex); | ||
| 5353 | continue_free: | ||
| 5354 | kfree(pathbuf); | ||
| 5355 | kfree(agentbuf); | ||
| 5356 | raw_spin_lock(&release_list_lock); | ||
| 5357 | } | ||
| 5358 | raw_spin_unlock(&release_list_lock); | ||
| 5359 | mutex_unlock(&cgroup_mutex); | 5292 | mutex_unlock(&cgroup_mutex); |
| 5293 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
| 5294 | goto out_free; | ||
| 5295 | out: | ||
| 5296 | mutex_unlock(&cgroup_mutex); | ||
| 5297 | out_free: | ||
| 5298 | kfree(agentbuf); | ||
| 5299 | kfree(pathbuf); | ||
| 5360 | } | 5300 | } |
| 5361 | 5301 | ||
| 5362 | static int __init cgroup_disable(char *str) | 5302 | static int __init cgroup_disable(char *str) |
| @@ -5416,7 +5356,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
| 5416 | /* | 5356 | /* |
| 5417 | * This path doesn't originate from kernfs and @kn could already | 5357 | * This path doesn't originate from kernfs and @kn could already |
| 5418 | * have been or be removed at any point. @kn->priv is RCU | 5358 | * have been or be removed at any point. @kn->priv is RCU |
| 5419 | * protected for this access. See cgroup_rmdir() for details. | 5359 | * protected for this access. See css_release_work_fn() for details. |
| 5420 | */ | 5360 | */ |
| 5421 | cgrp = rcu_dereference(kn->priv); | 5361 | cgrp = rcu_dereference(kn->priv); |
| 5422 | if (cgrp) | 5362 | if (cgrp) |
| @@ -5544,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
| 5544 | 5484 | ||
| 5545 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | 5485 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
| 5546 | { | 5486 | { |
| 5547 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); | 5487 | return (!cgroup_has_tasks(css->cgroup) && |
| 5488 | !css_has_online_children(&css->cgroup->self)); | ||
| 5548 | } | 5489 | } |
| 5549 | 5490 | ||
| 5550 | static struct cftype debug_files[] = { | 5491 | static struct cftype debug_files[] = { |
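
The cgroup.c changes above drop the global release_list/release_list_lock pair: check_for_release() now just schedules the per-cgroup release_agent_work, and cgroup_release_agent() builds argv/envp for a single cgroup and drops cgroup_mutex before invoking the usermode helper. As a rough userspace sketch of what that helper invocation looks like (the agent path and cgroup path below are made up for illustration, and fork()/execve() stands in for call_usermodehelper()):

```c
/*
 * Userspace sketch of what the reworked cgroup_release_agent() hands to
 * call_usermodehelper(): a two-element argv and a minimal environment.
 * The agent path and cgroup path are invented, and fork()/execve() stands
 * in for the kernel's usermode-helper machinery.
 */
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int run_release_agent(const char *agent, const char *cgrp_path)
{
	char *argv[3] = { (char *)agent, (char *)cgrp_path, NULL };
	char *envp[3] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	pid_t pid = fork();

	if (pid < 0)
		return -1;
	if (pid == 0) {
		execve(agent, argv, envp);
		_exit(127);	/* exec failed */
	}
	/* The kernel uses UMH_WAIT_EXEC; waiting here just avoids a zombie. */
	return waitpid(pid, NULL, 0) < 0 ? -1 : 0;
}

int main(void)
{
	return run_release_agent("/sbin/cgroup-release-agent", "/test/child")
		? EXIT_FAILURE : EXIT_SUCCESS;
}
```

Making the work item per-cgroup also lets a dying cgroup cancel its own pending notification, which is what the cancel_work_sync() added to css_free_work_fn() above is for.
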
diff --git a/kernel/compat.c b/kernel/compat.c index 633394f442f8..ebb3c369d03d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) | |||
| 226 | ret = hrtimer_nanosleep_restart(restart); | 226 | ret = hrtimer_nanosleep_restart(restart); |
| 227 | set_fs(oldfs); | 227 | set_fs(oldfs); |
| 228 | 228 | ||
| 229 | if (ret) { | 229 | if (ret == -ERESTART_RESTARTBLOCK) { |
| 230 | rmtp = restart->nanosleep.compat_rmtp; | 230 | rmtp = restart->nanosleep.compat_rmtp; |
| 231 | 231 | ||
| 232 | if (rmtp && compat_put_timespec(&rmt, rmtp)) | 232 | if (rmtp && compat_put_timespec(&rmt, rmtp)) |
| @@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | |||
| 256 | HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 256 | HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
| 257 | set_fs(oldfs); | 257 | set_fs(oldfs); |
| 258 | 258 | ||
| 259 | if (ret) { | 259 | /* |
| 260 | * hrtimer_nanosleep() can only return 0 or | ||
| 261 | * -ERESTART_RESTARTBLOCK here because: | ||
| 262 | * | ||
| 263 | * - we call it with HRTIMER_MODE_REL and therefore exclude the | ||
| 264 | * -ERESTARTNOHAND return path. | ||
| 265 | * | ||
| 266 | * - we supply the rmtp argument from the task stack (due to | ||
| 267 | * the necessary compat conversion). So the update cannot | ||
| 268 | * fail, which excludes the -EFAULT return path as well. If | ||
| 269 | * it fails nevertheless, we have a bigger problem and won't | ||
| 270 | * reach this place anymore. | ||
| 271 | * | ||
| 272 | * - if the return value is 0, we do not have to update rmtp | ||
| 273 | * because there is no remaining time. | ||
| 274 | * | ||
| 275 | * We check for -ERESTART_RESTARTBLOCK nevertheless, in case the | ||
| 276 | * core implementation decides to return random nonsense. | ||
| 277 | */ | ||
| 278 | if (ret == -ERESTART_RESTARTBLOCK) { | ||
| 260 | struct restart_block *restart | 279 | struct restart_block *restart |
| 261 | = ¤t_thread_info()->restart_block; | 280 | = ¤t_thread_info()->restart_block; |
| 262 | 281 | ||
| @@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | |||
| 266 | if (rmtp && compat_put_timespec(&rmt, rmtp)) | 285 | if (rmtp && compat_put_timespec(&rmt, rmtp)) |
| 267 | return -EFAULT; | 286 | return -EFAULT; |
| 268 | } | 287 | } |
| 269 | |||
| 270 | return ret; | 288 | return ret; |
| 271 | } | 289 | } |
| 272 | 290 | ||
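
The compat_nanosleep changes above tighten the error check: the remaining time is copied back to userspace only when hrtimer_nanosleep() actually returns -ERESTART_RESTARTBLOCK, rather than on any non-zero return. A small userspace parallel of that "report the remainder only when interrupted" rule, using plain POSIX nanosleep() rather than the kernel's restart-block machinery:

```c
/*
 * Userspace parallel of the check above: copy the remaining time back only
 * when the sleep was actually interrupted, not on every non-zero return.
 * This uses plain POSIX nanosleep(), not the kernel's restart-block code.
 */
#include <errno.h>
#include <stdio.h>
#include <time.h>

static int sleep_reportable(struct timespec req, struct timespec *rmt)
{
	struct timespec rem;

	if (nanosleep(&req, &rem) == 0)
		return 0;		/* slept fully, nothing to report */
	if (errno == EINTR && rmt)
		*rmt = rem;		/* interrupted: hand back what is left */
	return -errno;
}

int main(void)
{
	struct timespec rmt = { 0, 0 };
	int ret = sleep_reportable((struct timespec){ .tv_sec = 1 }, &rmt);

	printf("ret=%d remaining=%ld.%09lds\n",
	       ret, (long)rmt.tv_sec, rmt.tv_nsec);
	return 0;
}
```
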
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 000000000000..c2de56ab0fce --- /dev/null +++ b/kernel/configs/tiny.config | |||
| @@ -0,0 +1,4 @@ | |||
| 1 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y | ||
| 2 | CONFIG_KERNEL_XZ=y | ||
| 3 | CONFIG_OPTIMIZE_INLINING=y | ||
| 4 | CONFIG_SLOB=y | ||
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) | |||
| 107 | } | 107 | } |
| 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); |
| 109 | 109 | ||
| 110 | #ifdef CONFIG_PREEMPT | ||
| 111 | /** | ||
| 112 | * preempt_schedule_context - preempt_schedule called by tracing | ||
| 113 | * | ||
| 114 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
| 115 | * recursion and tracing preempt enabling caused by the tracing | ||
| 116 | * infrastructure itself. But as tracing can happen in areas coming | ||
| 117 | * from userspace or just about to enter userspace, a preempt enable | ||
| 118 | * can occur before user_exit() is called. This will cause the scheduler | ||
| 119 | * to be called when the system is still in usermode. | ||
| 120 | * | ||
| 121 | * To prevent this, the preempt_enable_notrace will use this function | ||
| 122 | * instead of preempt_schedule() to exit user context if needed before | ||
| 123 | * calling the scheduler. | ||
| 124 | */ | ||
| 125 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
| 126 | { | ||
| 127 | enum ctx_state prev_ctx; | ||
| 128 | |||
| 129 | if (likely(!preemptible())) | ||
| 130 | return; | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Need to disable preemption in case user_exit() is traced | ||
| 134 | * and the tracer calls preempt_enable_notrace() causing | ||
| 135 | * an infinite recursion. | ||
| 136 | */ | ||
| 137 | preempt_disable_notrace(); | ||
| 138 | prev_ctx = exception_enter(); | ||
| 139 | preempt_enable_no_resched_notrace(); | ||
| 140 | |||
| 141 | preempt_schedule(); | ||
| 142 | |||
| 143 | preempt_disable_notrace(); | ||
| 144 | exception_exit(prev_ctx); | ||
| 145 | preempt_enable_notrace(); | ||
| 146 | } | ||
| 147 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
| 148 | #endif /* CONFIG_PREEMPT */ | ||
| 149 | |||
| 150 | /** | 110 | /** |
| 151 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 111 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
| 152 | * exiting userspace mode and entering the kernel. | 112 | * exiting userspace mode and entering the kernel. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 81e2a388a0f6..90a3d017b90c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -64,6 +64,8 @@ static struct { | |||
| 64 | * an ongoing cpu hotplug operation. | 64 | * an ongoing cpu hotplug operation. |
| 65 | */ | 65 | */ |
| 66 | int refcount; | 66 | int refcount; |
| 67 | /* And allows lockless put_online_cpus(). */ | ||
| 68 | atomic_t puts_pending; | ||
| 67 | 69 | ||
| 68 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 70 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 69 | struct lockdep_map dep_map; | 71 | struct lockdep_map dep_map; |
| @@ -79,6 +81,8 @@ static struct { | |||
| 79 | 81 | ||
| 80 | /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ | 82 | /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ |
| 81 | #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) | 83 | #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) |
| 84 | #define cpuhp_lock_acquire_tryread() \ | ||
| 85 | lock_map_acquire_tryread(&cpu_hotplug.dep_map) | ||
| 82 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) |
| 83 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) |
| 84 | 88 | ||
| @@ -91,15 +95,31 @@ void get_online_cpus(void) | |||
| 91 | mutex_lock(&cpu_hotplug.lock); | 95 | mutex_lock(&cpu_hotplug.lock); |
| 92 | cpu_hotplug.refcount++; | 96 | cpu_hotplug.refcount++; |
| 93 | mutex_unlock(&cpu_hotplug.lock); | 97 | mutex_unlock(&cpu_hotplug.lock); |
| 94 | |||
| 95 | } | 98 | } |
| 96 | EXPORT_SYMBOL_GPL(get_online_cpus); | 99 | EXPORT_SYMBOL_GPL(get_online_cpus); |
| 97 | 100 | ||
| 101 | bool try_get_online_cpus(void) | ||
| 102 | { | ||
| 103 | if (cpu_hotplug.active_writer == current) | ||
| 104 | return true; | ||
| 105 | if (!mutex_trylock(&cpu_hotplug.lock)) | ||
| 106 | return false; | ||
| 107 | cpuhp_lock_acquire_tryread(); | ||
| 108 | cpu_hotplug.refcount++; | ||
| 109 | mutex_unlock(&cpu_hotplug.lock); | ||
| 110 | return true; | ||
| 111 | } | ||
| 112 | EXPORT_SYMBOL_GPL(try_get_online_cpus); | ||
| 113 | |||
| 98 | void put_online_cpus(void) | 114 | void put_online_cpus(void) |
| 99 | { | 115 | { |
| 100 | if (cpu_hotplug.active_writer == current) | 116 | if (cpu_hotplug.active_writer == current) |
| 101 | return; | 117 | return; |
| 102 | mutex_lock(&cpu_hotplug.lock); | 118 | if (!mutex_trylock(&cpu_hotplug.lock)) { |
| 119 | atomic_inc(&cpu_hotplug.puts_pending); | ||
| 120 | cpuhp_lock_release(); | ||
| 121 | return; | ||
| 122 | } | ||
| 103 | 123 | ||
| 104 | if (WARN_ON(!cpu_hotplug.refcount)) | 124 | if (WARN_ON(!cpu_hotplug.refcount)) |
| 105 | cpu_hotplug.refcount++; /* try to fix things up */ | 125 | cpu_hotplug.refcount++; /* try to fix things up */ |
| @@ -141,6 +161,12 @@ void cpu_hotplug_begin(void) | |||
| 141 | cpuhp_lock_acquire(); | 161 | cpuhp_lock_acquire(); |
| 142 | for (;;) { | 162 | for (;;) { |
| 143 | mutex_lock(&cpu_hotplug.lock); | 163 | mutex_lock(&cpu_hotplug.lock); |
| 164 | if (atomic_read(&cpu_hotplug.puts_pending)) { | ||
| 165 | int delta; | ||
| 166 | |||
| 167 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 168 | cpu_hotplug.refcount -= delta; | ||
| 169 | } | ||
| 144 | if (likely(!cpu_hotplug.refcount)) | 170 | if (likely(!cpu_hotplug.refcount)) |
| 145 | break; | 171 | break; |
| 146 | __set_current_state(TASK_UNINTERRUPTIBLE); | 172 | __set_current_state(TASK_UNINTERRUPTIBLE); |
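
The cpu.c hunks above make put_online_cpus() avoid blocking on cpu_hotplug.lock when it is contended: the decrement is parked in the puts_pending atomic, and cpu_hotplug_begin() folds the deferred puts back into refcount once it holds the mutex. A standalone C11 sketch of that deferred-put pattern (all names invented; the real code also handles the active_writer == current case and the lockdep annotations):

```c
/*
 * Standalone C11 sketch of the puts_pending idea above: a reader that cannot
 * take the mutex records its put in an atomic counter, and the writer folds
 * the deferred puts back into the refcount once it does hold the mutex.
 * All names are invented; this is not the kernel implementation.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;			/* protected by lock */
static atomic_int puts_pending;		/* lockless side channel */

static void get_ref(void)
{
	pthread_mutex_lock(&lock);
	refcount++;
	pthread_mutex_unlock(&lock);
}

static void put_ref(void)
{
	if (pthread_mutex_trylock(&lock) != 0) {
		/* Lock is busy: defer the decrement instead of blocking. */
		atomic_fetch_add(&puts_pending, 1);
		return;
	}
	refcount--;
	pthread_mutex_unlock(&lock);
}

static int drain_and_check(void)
{
	int remaining;

	pthread_mutex_lock(&lock);
	/* Fold any deferred puts in before looking at the count. */
	refcount -= atomic_exchange(&puts_pending, 0);
	remaining = refcount;
	pthread_mutex_unlock(&lock);
	/* A real writer would sleep and retry while remaining != 0. */
	return remaining;
}

int main(void)
{
	get_ref();
	put_ref();
	printf("refcount after drain: %d\n", drain_and_check());
	return 0;
}
```

The bookkeeping cost moves from the reader's put path to the writer, which fits a hotplug path that is rare and slow to begin with.
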
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 22874d7cf2c0..1f107c74087b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, | |||
| 365 | struct task_struct *tsk) | 365 | struct task_struct *tsk) |
| 366 | { | 366 | { |
| 367 | if (is_spread_page(cs)) | 367 | if (is_spread_page(cs)) |
| 368 | tsk->flags |= PF_SPREAD_PAGE; | 368 | task_set_spread_page(tsk); |
| 369 | else | 369 | else |
| 370 | tsk->flags &= ~PF_SPREAD_PAGE; | 370 | task_clear_spread_page(tsk); |
| 371 | |||
| 371 | if (is_spread_slab(cs)) | 372 | if (is_spread_slab(cs)) |
| 372 | tsk->flags |= PF_SPREAD_SLAB; | 373 | task_set_spread_slab(tsk); |
| 373 | else | 374 | else |
| 374 | tsk->flags &= ~PF_SPREAD_SLAB; | 375 | task_clear_spread_slab(tsk); |
| 375 | } | 376 | } |
| 376 | 377 | ||
| 377 | /* | 378 | /* |
| @@ -2729,10 +2730,9 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2729 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it | 2730 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
| 2730 | * anyway. | 2731 | * anyway. |
| 2731 | */ | 2732 | */ |
| 2732 | int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2733 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
| 2734 | struct pid *pid, struct task_struct *tsk) | ||
| 2733 | { | 2735 | { |
| 2734 | struct pid *pid; | ||
| 2735 | struct task_struct *tsk; | ||
| 2736 | char *buf, *p; | 2736 | char *buf, *p; |
| 2737 | struct cgroup_subsys_state *css; | 2737 | struct cgroup_subsys_state *css; |
| 2738 | int retval; | 2738 | int retval; |
| @@ -2742,24 +2742,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
| 2742 | if (!buf) | 2742 | if (!buf) |
| 2743 | goto out; | 2743 | goto out; |
| 2744 | 2744 | ||
| 2745 | retval = -ESRCH; | ||
| 2746 | pid = m->private; | ||
| 2747 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
| 2748 | if (!tsk) | ||
| 2749 | goto out_free; | ||
| 2750 | |||
| 2751 | retval = -ENAMETOOLONG; | 2745 | retval = -ENAMETOOLONG; |
| 2752 | rcu_read_lock(); | 2746 | rcu_read_lock(); |
| 2753 | css = task_css(tsk, cpuset_cgrp_id); | 2747 | css = task_css(tsk, cpuset_cgrp_id); |
| 2754 | p = cgroup_path(css->cgroup, buf, PATH_MAX); | 2748 | p = cgroup_path(css->cgroup, buf, PATH_MAX); |
| 2755 | rcu_read_unlock(); | 2749 | rcu_read_unlock(); |
| 2756 | if (!p) | 2750 | if (!p) |
| 2757 | goto out_put_task; | 2751 | goto out_free; |
| 2758 | seq_puts(m, p); | 2752 | seq_puts(m, p); |
| 2759 | seq_putc(m, '\n'); | 2753 | seq_putc(m, '\n'); |
| 2760 | retval = 0; | 2754 | retval = 0; |
| 2761 | out_put_task: | ||
| 2762 | put_task_struct(tsk); | ||
| 2763 | out_free: | 2755 | out_free: |
| 2764 | kfree(buf); | 2756 | kfree(buf); |
| 2765 | out: | 2757 | out: |
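
The cpuset.c hunk above stops poking PF_SPREAD_PAGE/PF_SPREAD_SLAB in tsk->flags directly and goes through task_set/clear_spread_* helpers instead (the helpers' definitions live outside this diff). A minimal userspace sketch of that accessor pattern over an invented flags word, just to show the shape of the API:

```c
/*
 * Minimal userspace sketch of the accessor pattern the cpuset hunk switches
 * to: wrap each flag bit in set/clear/test helpers instead of open-coding
 * tsk->flags |= / &= ~.  The struct and flag values are invented; the real
 * helpers operate on task_struct and are not shown in this diff.
 */
#include <stdbool.h>
#include <stdio.h>

#define TF_SPREAD_PAGE	(1u << 0)
#define TF_SPREAD_SLAB	(1u << 1)

struct task { unsigned int flags; };

static inline void task_set_spread_page(struct task *t)   { t->flags |= TF_SPREAD_PAGE; }
static inline void task_clear_spread_page(struct task *t) { t->flags &= ~TF_SPREAD_PAGE; }
static inline bool task_spread_page(const struct task *t) { return t->flags & TF_SPREAD_PAGE; }

int main(void)
{
	struct task t = { 0 };

	task_set_spread_page(&t);
	printf("spread_page=%d\n", task_spread_page(&t));	/* 1 */
	task_clear_spread_page(&t);
	printf("spread_page=%d\n", task_spread_page(&t));	/* 0 */
	return 0;
}
```
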
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index c766ee54c0b1..b64e238b553b 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c | |||
| @@ -18,6 +18,7 @@ unsigned long saved_max_pfn; | |||
| 18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. | 18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. |
| 19 | */ | 19 | */ |
| 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; |
| 21 | EXPORT_SYMBOL_GPL(elfcorehdr_addr); | ||
| 21 | 22 | ||
| 22 | /* | 23 | /* |
| 23 | * stores the size of elf header of crash image | 24 | * stores the size of elf header of crash image |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 70a504601dc3..b20d544f20c2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
| @@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) | |||
| 52 | 52 | ||
| 53 | bp->bph_length = 1; | 53 | bp->bph_length = 1; |
| 54 | if ((argc + 1) != nextarg) { | 54 | if ((argc + 1) != nextarg) { |
| 55 | if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) | 55 | if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0) |
| 56 | bp->bp_type = BP_ACCESS_WATCHPOINT; | 56 | bp->bp_type = BP_ACCESS_WATCHPOINT; |
| 57 | else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) | 57 | else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) |
| 58 | bp->bp_type = BP_WRITE_WATCHPOINT; | 58 | bp->bp_type = BP_WRITE_WATCHPOINT; |
| 59 | else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) | 59 | else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0) |
| 60 | bp->bp_type = BP_HARDWARE_BREAKPOINT; | 60 | bp->bp_type = BP_HARDWARE_BREAKPOINT; |
| 61 | else | 61 | else |
| 62 | return KDB_ARGCOUNT; | 62 | return KDB_ARGCOUNT; |
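
The kdb change above is a plain rename from the old strnicmp to the standard strncasecmp. One detail worth noting: the length passed is sizeof("datar"), which counts the terminating NUL, so the comparison only matches the keyword exactly (case-insensitively), not as a prefix. A tiny standalone check of that behaviour:

```c
/*
 * Tiny standalone check of the strncasecmp() call pattern above.  Because
 * the kdb code passes sizeof("datar") -- which includes the terminating
 * NUL -- the keyword must match exactly (case-insensitively), not merely
 * as a prefix.
 */
#include <stdio.h>
#include <strings.h>

static int is_keyword(const char *arg, const char *kw, size_t n)
{
	return strncasecmp(arg, kw, n) == 0;
}

int main(void)
{
	printf("%d\n", is_keyword("DataR", "datar", sizeof("datar")));	/* 1 */
	printf("%d\n", is_keyword("datarw", "datar", sizeof("datar")));	/* 0 */
	return 0;
}
```
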
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..d659487254d5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
| @@ -52,7 +52,7 @@ static void release_callchain_buffers(void) | |||
| 52 | struct callchain_cpus_entries *entries; | 52 | struct callchain_cpus_entries *entries; |
| 53 | 53 | ||
| 54 | entries = callchain_cpus_entries; | 54 | entries = callchain_cpus_entries; |
| 55 | rcu_assign_pointer(callchain_cpus_entries, NULL); | 55 | RCU_INIT_POINTER(callchain_cpus_entries, NULL); |
| 56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | 56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); |
| 57 | } | 57 | } |
| 58 | 58 | ||
| @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) | |||
| 137 | int cpu; | 137 | int cpu; |
| 138 | struct callchain_cpus_entries *entries; | 138 | struct callchain_cpus_entries *entries; |
| 139 | 139 | ||
| 140 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | 140 | *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); |
| 141 | if (*rctx == -1) | 141 | if (*rctx == -1) |
| 142 | return NULL; | 142 | return NULL; |
| 143 | 143 | ||
| @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) | |||
| 153 | static void | 153 | static void |
| 154 | put_callchain_entry(int rctx) | 154 | put_callchain_entry(int rctx) |
| 155 | { | 155 | { |
| 156 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | 156 | put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | struct perf_callchain_entry * | 159 | struct perf_callchain_entry * |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..2b02c9fda790 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -41,11 +41,14 @@ | |||
| 41 | #include <linux/cgroup.h> | 41 | #include <linux/cgroup.h> |
| 42 | #include <linux/module.h> | 42 | #include <linux/module.h> |
| 43 | #include <linux/mman.h> | 43 | #include <linux/mman.h> |
| 44 | #include <linux/compat.h> | ||
| 44 | 45 | ||
| 45 | #include "internal.h" | 46 | #include "internal.h" |
| 46 | 47 | ||
| 47 | #include <asm/irq_regs.h> | 48 | #include <asm/irq_regs.h> |
| 48 | 49 | ||
| 50 | static struct workqueue_struct *perf_wq; | ||
| 51 | |||
| 49 | struct remote_function_call { | 52 | struct remote_function_call { |
| 50 | struct task_struct *p; | 53 | struct task_struct *p; |
| 51 | int (*func)(void *info); | 54 | int (*func)(void *info); |
| @@ -119,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | |||
| 119 | return data.ret; | 122 | return data.ret; |
| 120 | } | 123 | } |
| 121 | 124 | ||
| 125 | #define EVENT_OWNER_KERNEL ((void *) -1) | ||
| 126 | |||
| 127 | static bool is_kernel_event(struct perf_event *event) | ||
| 128 | { | ||
| 129 | return event->owner == EVENT_OWNER_KERNEL; | ||
| 130 | } | ||
| 131 | |||
| 122 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 132 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
| 123 | PERF_FLAG_FD_OUTPUT |\ | 133 | PERF_FLAG_FD_OUTPUT |\ |
| 124 | PERF_FLAG_PID_CGROUP |\ | 134 | PERF_FLAG_PID_CGROUP |\ |
| @@ -239,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w) | |||
| 239 | u64 avg_local_sample_len; | 249 | u64 avg_local_sample_len; |
| 240 | u64 local_samples_len; | 250 | u64 local_samples_len; |
| 241 | 251 | ||
| 242 | local_samples_len = __get_cpu_var(running_sample_length); | 252 | local_samples_len = __this_cpu_read(running_sample_length); |
| 243 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 244 | 254 | ||
| 245 | printk_ratelimited(KERN_WARNING | 255 | printk_ratelimited(KERN_WARNING |
| @@ -261,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 261 | return; | 271 | return; |
| 262 | 272 | ||
| 263 | /* decay the counter by 1 average sample */ | 273 | /* decay the counter by 1 average sample */ |
| 264 | local_samples_len = __get_cpu_var(running_sample_length); | 274 | local_samples_len = __this_cpu_read(running_sample_length); |
| 265 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; | 275 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 266 | local_samples_len += sample_len_ns; | 276 | local_samples_len += sample_len_ns; |
| 267 | __get_cpu_var(running_sample_length) = local_samples_len; | 277 | __this_cpu_write(running_sample_length, local_samples_len); |
| 268 | 278 | ||
| 269 | /* | 279 | /* |
| 270 | * note: this will be biased artificially low until we have | 280 | * note: this will be biased artificially low until we have |
| @@ -391,14 +401,9 @@ perf_cgroup_match(struct perf_event *event) | |||
| 391 | event->cgrp->css.cgroup); | 401 | event->cgrp->css.cgroup); |
| 392 | } | 402 | } |
| 393 | 403 | ||
| 394 | static inline void perf_put_cgroup(struct perf_event *event) | ||
| 395 | { | ||
| 396 | css_put(&event->cgrp->css); | ||
| 397 | } | ||
| 398 | |||
| 399 | static inline void perf_detach_cgroup(struct perf_event *event) | 404 | static inline void perf_detach_cgroup(struct perf_event *event) |
| 400 | { | 405 | { |
| 401 | perf_put_cgroup(event); | 406 | css_put(&event->cgrp->css); |
| 402 | event->cgrp = NULL; | 407 | event->cgrp = NULL; |
| 403 | } | 408 | } |
| 404 | 409 | ||
| @@ -877,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list); | |||
| 877 | static void perf_pmu_rotate_start(struct pmu *pmu) | 882 | static void perf_pmu_rotate_start(struct pmu *pmu) |
| 878 | { | 883 | { |
| 879 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 884 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 880 | struct list_head *head = &__get_cpu_var(rotation_list); | 885 | struct list_head *head = this_cpu_ptr(&rotation_list); |
| 881 | 886 | ||
| 882 | WARN_ON(!irqs_disabled()); | 887 | WARN_ON(!irqs_disabled()); |
| 883 | 888 | ||
| @@ -901,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 901 | } | 906 | } |
| 902 | } | 907 | } |
| 903 | 908 | ||
| 904 | static void unclone_ctx(struct perf_event_context *ctx) | 909 | /* |
| 910 | * This must be done under the ctx->lock, such as to serialize against | ||
| 911 | * context_equiv(), therefore we cannot call put_ctx() since that might end up | ||
| 912 | * calling scheduler related locks and ctx->lock nests inside those. | ||
| 913 | */ | ||
| 914 | static __must_check struct perf_event_context * | ||
| 915 | unclone_ctx(struct perf_event_context *ctx) | ||
| 905 | { | 916 | { |
| 906 | if (ctx->parent_ctx) { | 917 | struct perf_event_context *parent_ctx = ctx->parent_ctx; |
| 907 | put_ctx(ctx->parent_ctx); | 918 | |
| 919 | lockdep_assert_held(&ctx->lock); | ||
| 920 | |||
| 921 | if (parent_ctx) | ||
| 908 | ctx->parent_ctx = NULL; | 922 | ctx->parent_ctx = NULL; |
| 909 | } | ||
| 910 | ctx->generation++; | 923 | ctx->generation++; |
| 924 | |||
| 925 | return parent_ctx; | ||
| 911 | } | 926 | } |
| 912 | 927 | ||
| 913 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 928 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
| @@ -1374,6 +1389,45 @@ out: | |||
| 1374 | perf_event__header_size(tmp); | 1389 | perf_event__header_size(tmp); |
| 1375 | } | 1390 | } |
| 1376 | 1391 | ||
| 1392 | /* | ||
| 1393 | * User event without the task. | ||
| 1394 | */ | ||
| 1395 | static bool is_orphaned_event(struct perf_event *event) | ||
| 1396 | { | ||
| 1397 | return event && !is_kernel_event(event) && !event->owner; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | /* | ||
| 1401 | * Event has a parent but parent's task finished and it's | ||
| 1402 | * alive only because of children holding refference. | ||
| 1403 | */ | ||
| 1404 | static bool is_orphaned_child(struct perf_event *event) | ||
| 1405 | { | ||
| 1406 | return is_orphaned_event(event->parent); | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | static void orphans_remove_work(struct work_struct *work); | ||
| 1410 | |||
| 1411 | static void schedule_orphans_remove(struct perf_event_context *ctx) | ||
| 1412 | { | ||
| 1413 | if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) | ||
| 1414 | return; | ||
| 1415 | |||
| 1416 | if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { | ||
| 1417 | get_ctx(ctx); | ||
| 1418 | ctx->orphans_remove_sched = true; | ||
| 1419 | } | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | static int __init perf_workqueue_init(void) | ||
| 1423 | { | ||
| 1424 | perf_wq = create_singlethread_workqueue("perf"); | ||
| 1425 | WARN(!perf_wq, "failed to create perf workqueue\n"); | ||
| 1426 | return perf_wq ? 0 : -1; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | core_initcall(perf_workqueue_init); | ||
| 1430 | |||
| 1377 | static inline int | 1431 | static inline int |
| 1378 | event_filter_match(struct perf_event *event) | 1432 | event_filter_match(struct perf_event *event) |
| 1379 | { | 1433 | { |
| @@ -1423,6 +1477,9 @@ event_sched_out(struct perf_event *event, | |||
| 1423 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1477 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
| 1424 | cpuctx->exclusive = 0; | 1478 | cpuctx->exclusive = 0; |
| 1425 | 1479 | ||
| 1480 | if (is_orphaned_child(event)) | ||
| 1481 | schedule_orphans_remove(ctx); | ||
| 1482 | |||
| 1426 | perf_pmu_enable(event->pmu); | 1483 | perf_pmu_enable(event->pmu); |
| 1427 | } | 1484 | } |
| 1428 | 1485 | ||
| @@ -1523,6 +1580,11 @@ retry: | |||
| 1523 | */ | 1580 | */ |
| 1524 | if (ctx->is_active) { | 1581 | if (ctx->is_active) { |
| 1525 | raw_spin_unlock_irq(&ctx->lock); | 1582 | raw_spin_unlock_irq(&ctx->lock); |
| 1583 | /* | ||
| 1584 | * Reload the task pointer, it might have been changed by | ||
| 1585 | * a concurrent perf_event_context_sched_out(). | ||
| 1586 | */ | ||
| 1587 | task = ctx->task; | ||
| 1526 | goto retry; | 1588 | goto retry; |
| 1527 | } | 1589 | } |
| 1528 | 1590 | ||
| @@ -1725,6 +1787,9 @@ event_sched_in(struct perf_event *event, | |||
| 1725 | if (event->attr.exclusive) | 1787 | if (event->attr.exclusive) |
| 1726 | cpuctx->exclusive = 1; | 1788 | cpuctx->exclusive = 1; |
| 1727 | 1789 | ||
| 1790 | if (is_orphaned_child(event)) | ||
| 1791 | schedule_orphans_remove(ctx); | ||
| 1792 | |||
| 1728 | out: | 1793 | out: |
| 1729 | perf_pmu_enable(event->pmu); | 1794 | perf_pmu_enable(event->pmu); |
| 1730 | 1795 | ||
| @@ -1966,6 +2031,11 @@ retry: | |||
| 1966 | */ | 2031 | */ |
| 1967 | if (ctx->is_active) { | 2032 | if (ctx->is_active) { |
| 1968 | raw_spin_unlock_irq(&ctx->lock); | 2033 | raw_spin_unlock_irq(&ctx->lock); |
| 2034 | /* | ||
| 2035 | * Reload the task pointer, it might have been changed by | ||
| 2036 | * a concurrent perf_event_context_sched_out(). | ||
| 2037 | */ | ||
| 2038 | task = ctx->task; | ||
| 1969 | goto retry; | 2039 | goto retry; |
| 1970 | } | 2040 | } |
| 1971 | 2041 | ||
| @@ -2199,6 +2269,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2199 | static int context_equiv(struct perf_event_context *ctx1, | 2269 | static int context_equiv(struct perf_event_context *ctx1, |
| 2200 | struct perf_event_context *ctx2) | 2270 | struct perf_event_context *ctx2) |
| 2201 | { | 2271 | { |
| 2272 | lockdep_assert_held(&ctx1->lock); | ||
| 2273 | lockdep_assert_held(&ctx2->lock); | ||
| 2274 | |||
| 2202 | /* Pinning disables the swap optimization */ | 2275 | /* Pinning disables the swap optimization */ |
| 2203 | if (ctx1->pin_count || ctx2->pin_count) | 2276 | if (ctx1->pin_count || ctx2->pin_count) |
| 2204 | return 0; | 2277 | return 0; |
| @@ -2320,7 +2393,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2320 | next_parent = rcu_dereference(next_ctx->parent_ctx); | 2393 | next_parent = rcu_dereference(next_ctx->parent_ctx); |
| 2321 | 2394 | ||
| 2322 | /* If neither context have a parent context; they cannot be clones. */ | 2395 | /* If neither context have a parent context; they cannot be clones. */ |
| 2323 | if (!parent || !next_parent) | 2396 | if (!parent && !next_parent) |
| 2324 | goto unlock; | 2397 | goto unlock; |
| 2325 | 2398 | ||
| 2326 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | 2399 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { |
| @@ -2389,7 +2462,7 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 2389 | * to check if we have to switch out PMU state. | 2462 | * to check if we have to switch out PMU state. |
| 2390 | * cgroup event are system-wide mode only | 2463 | * cgroup event are system-wide mode only |
| 2391 | */ | 2464 | */ |
| 2392 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2465 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
| 2393 | perf_cgroup_sched_out(task, next); | 2466 | perf_cgroup_sched_out(task, next); |
| 2394 | } | 2467 | } |
| 2395 | 2468 | ||
| @@ -2632,11 +2705,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
| 2632 | * to check if we have to switch in PMU state. | 2705 | * to check if we have to switch in PMU state. |
| 2633 | * cgroup event are system-wide mode only | 2706 | * cgroup event are system-wide mode only |
| 2634 | */ | 2707 | */ |
| 2635 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2708 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
| 2636 | perf_cgroup_sched_in(prev, task); | 2709 | perf_cgroup_sched_in(prev, task); |
| 2637 | 2710 | ||
| 2638 | /* check for system-wide branch_stack events */ | 2711 | /* check for system-wide branch_stack events */ |
| 2639 | if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) | 2712 | if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) |
| 2640 | perf_branch_stack_sched_in(prev, task); | 2713 | perf_branch_stack_sched_in(prev, task); |
| 2641 | } | 2714 | } |
| 2642 | 2715 | ||
| @@ -2891,7 +2964,7 @@ bool perf_event_can_stop_tick(void) | |||
| 2891 | 2964 | ||
| 2892 | void perf_event_task_tick(void) | 2965 | void perf_event_task_tick(void) |
| 2893 | { | 2966 | { |
| 2894 | struct list_head *head = &__get_cpu_var(rotation_list); | 2967 | struct list_head *head = this_cpu_ptr(&rotation_list); |
| 2895 | struct perf_cpu_context *cpuctx, *tmp; | 2968 | struct perf_cpu_context *cpuctx, *tmp; |
| 2896 | struct perf_event_context *ctx; | 2969 | struct perf_event_context *ctx; |
| 2897 | int throttled; | 2970 | int throttled; |
| @@ -2932,6 +3005,7 @@ static int event_enable_on_exec(struct perf_event *event, | |||
| 2932 | */ | 3005 | */ |
| 2933 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) | 3006 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
| 2934 | { | 3007 | { |
| 3008 | struct perf_event_context *clone_ctx = NULL; | ||
| 2935 | struct perf_event *event; | 3009 | struct perf_event *event; |
| 2936 | unsigned long flags; | 3010 | unsigned long flags; |
| 2937 | int enabled = 0; | 3011 | int enabled = 0; |
| @@ -2963,7 +3037,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 2963 | * Unclone this context if we enabled any event. | 3037 | * Unclone this context if we enabled any event. |
| 2964 | */ | 3038 | */ |
| 2965 | if (enabled) | 3039 | if (enabled) |
| 2966 | unclone_ctx(ctx); | 3040 | clone_ctx = unclone_ctx(ctx); |
| 2967 | 3041 | ||
| 2968 | raw_spin_unlock(&ctx->lock); | 3042 | raw_spin_unlock(&ctx->lock); |
| 2969 | 3043 | ||
| @@ -2973,6 +3047,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 2973 | perf_event_context_sched_in(ctx, ctx->task); | 3047 | perf_event_context_sched_in(ctx, ctx->task); |
| 2974 | out: | 3048 | out: |
| 2975 | local_irq_restore(flags); | 3049 | local_irq_restore(flags); |
| 3050 | |||
| 3051 | if (clone_ctx) | ||
| 3052 | put_ctx(clone_ctx); | ||
| 2976 | } | 3053 | } |
| 2977 | 3054 | ||
| 2978 | void perf_event_exec(void) | 3055 | void perf_event_exec(void) |
| @@ -3067,6 +3144,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
| 3067 | INIT_LIST_HEAD(&ctx->flexible_groups); | 3144 | INIT_LIST_HEAD(&ctx->flexible_groups); |
| 3068 | INIT_LIST_HEAD(&ctx->event_list); | 3145 | INIT_LIST_HEAD(&ctx->event_list); |
| 3069 | atomic_set(&ctx->refcount, 1); | 3146 | atomic_set(&ctx->refcount, 1); |
| 3147 | INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); | ||
| 3070 | } | 3148 | } |
| 3071 | 3149 | ||
| 3072 | static struct perf_event_context * | 3150 | static struct perf_event_context * |
| @@ -3124,7 +3202,7 @@ errout: | |||
| 3124 | static struct perf_event_context * | 3202 | static struct perf_event_context * |
| 3125 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 3203 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
| 3126 | { | 3204 | { |
| 3127 | struct perf_event_context *ctx; | 3205 | struct perf_event_context *ctx, *clone_ctx = NULL; |
| 3128 | struct perf_cpu_context *cpuctx; | 3206 | struct perf_cpu_context *cpuctx; |
| 3129 | unsigned long flags; | 3207 | unsigned long flags; |
| 3130 | int ctxn, err; | 3208 | int ctxn, err; |
| @@ -3158,9 +3236,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 3158 | retry: | 3236 | retry: |
| 3159 | ctx = perf_lock_task_context(task, ctxn, &flags); | 3237 | ctx = perf_lock_task_context(task, ctxn, &flags); |
| 3160 | if (ctx) { | 3238 | if (ctx) { |
| 3161 | unclone_ctx(ctx); | 3239 | clone_ctx = unclone_ctx(ctx); |
| 3162 | ++ctx->pin_count; | 3240 | ++ctx->pin_count; |
| 3163 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3241 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 3242 | |||
| 3243 | if (clone_ctx) | ||
| 3244 | put_ctx(clone_ctx); | ||
| 3164 | } else { | 3245 | } else { |
| 3165 | ctx = alloc_perf_context(pmu, task); | 3246 | ctx = alloc_perf_context(pmu, task); |
| 3166 | err = -ENOMEM; | 3247 | err = -ENOMEM; |
| @@ -3312,16 +3393,12 @@ static void free_event(struct perf_event *event) | |||
| 3312 | } | 3393 | } |
| 3313 | 3394 | ||
| 3314 | /* | 3395 | /* |
| 3315 | * Called when the last reference to the file is gone. | 3396 | * Remove user event from the owner task. |
| 3316 | */ | 3397 | */ |
| 3317 | static void put_event(struct perf_event *event) | 3398 | static void perf_remove_from_owner(struct perf_event *event) |
| 3318 | { | 3399 | { |
| 3319 | struct perf_event_context *ctx = event->ctx; | ||
| 3320 | struct task_struct *owner; | 3400 | struct task_struct *owner; |
| 3321 | 3401 | ||
| 3322 | if (!atomic_long_dec_and_test(&event->refcount)) | ||
| 3323 | return; | ||
| 3324 | |||
| 3325 | rcu_read_lock(); | 3402 | rcu_read_lock(); |
| 3326 | owner = ACCESS_ONCE(event->owner); | 3403 | owner = ACCESS_ONCE(event->owner); |
| 3327 | /* | 3404 | /* |
| @@ -3354,6 +3431,20 @@ static void put_event(struct perf_event *event) | |||
| 3354 | mutex_unlock(&owner->perf_event_mutex); | 3431 | mutex_unlock(&owner->perf_event_mutex); |
| 3355 | put_task_struct(owner); | 3432 | put_task_struct(owner); |
| 3356 | } | 3433 | } |
| 3434 | } | ||
| 3435 | |||
| 3436 | /* | ||
| 3437 | * Called when the last reference to the file is gone. | ||
| 3438 | */ | ||
| 3439 | static void put_event(struct perf_event *event) | ||
| 3440 | { | ||
| 3441 | struct perf_event_context *ctx = event->ctx; | ||
| 3442 | |||
| 3443 | if (!atomic_long_dec_and_test(&event->refcount)) | ||
| 3444 | return; | ||
| 3445 | |||
| 3446 | if (!is_kernel_event(event)) | ||
| 3447 | perf_remove_from_owner(event); | ||
| 3357 | 3448 | ||
| 3358 | WARN_ON_ONCE(ctx->parent_ctx); | 3449 | WARN_ON_ONCE(ctx->parent_ctx); |
| 3359 | /* | 3450 | /* |
| @@ -3388,6 +3479,42 @@ static int perf_release(struct inode *inode, struct file *file) | |||
| 3388 | return 0; | 3479 | return 0; |
| 3389 | } | 3480 | } |
| 3390 | 3481 | ||
| 3482 | /* | ||
| 3483 | * Remove all orphaned events from the context. | ||
| 3484 | */ | ||
| 3485 | static void orphans_remove_work(struct work_struct *work) | ||
| 3486 | { | ||
| 3487 | struct perf_event_context *ctx; | ||
| 3488 | struct perf_event *event, *tmp; | ||
| 3489 | |||
| 3490 | ctx = container_of(work, struct perf_event_context, | ||
| 3491 | orphans_remove.work); | ||
| 3492 | |||
| 3493 | mutex_lock(&ctx->mutex); | ||
| 3494 | list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { | ||
| 3495 | struct perf_event *parent_event = event->parent; | ||
| 3496 | |||
| 3497 | if (!is_orphaned_child(event)) | ||
| 3498 | continue; | ||
| 3499 | |||
| 3500 | perf_remove_from_context(event, true); | ||
| 3501 | |||
| 3502 | mutex_lock(&parent_event->child_mutex); | ||
| 3503 | list_del_init(&event->child_list); | ||
| 3504 | mutex_unlock(&parent_event->child_mutex); | ||
| 3505 | |||
| 3506 | free_event(event); | ||
| 3507 | put_event(parent_event); | ||
| 3508 | } | ||
| 3509 | |||
| 3510 | raw_spin_lock_irq(&ctx->lock); | ||
| 3511 | ctx->orphans_remove_sched = false; | ||
| 3512 | raw_spin_unlock_irq(&ctx->lock); | ||
| 3513 | mutex_unlock(&ctx->mutex); | ||
| 3514 | |||
| 3515 | put_ctx(ctx); | ||
| 3516 | } | ||
| 3517 | |||
| 3391 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 3518 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
| 3392 | { | 3519 | { |
| 3393 | struct perf_event *child; | 3520 | struct perf_event *child; |
| @@ -3485,6 +3612,19 @@ static int perf_event_read_one(struct perf_event *event, | |||
| 3485 | return n * sizeof(u64); | 3612 | return n * sizeof(u64); |
| 3486 | } | 3613 | } |
| 3487 | 3614 | ||
| 3615 | static bool is_event_hup(struct perf_event *event) | ||
| 3616 | { | ||
| 3617 | bool no_children; | ||
| 3618 | |||
| 3619 | if (event->state != PERF_EVENT_STATE_EXIT) | ||
| 3620 | return false; | ||
| 3621 | |||
| 3622 | mutex_lock(&event->child_mutex); | ||
| 3623 | no_children = list_empty(&event->child_list); | ||
| 3624 | mutex_unlock(&event->child_mutex); | ||
| 3625 | return no_children; | ||
| 3626 | } | ||
| 3627 | |||
| 3488 | /* | 3628 | /* |
| 3489 | * Read the performance event - simple non blocking version for now | 3629 | * Read the performance event - simple non blocking version for now |
| 3490 | */ | 3630 | */ |
| @@ -3526,7 +3666,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3526 | { | 3666 | { |
| 3527 | struct perf_event *event = file->private_data; | 3667 | struct perf_event *event = file->private_data; |
| 3528 | struct ring_buffer *rb; | 3668 | struct ring_buffer *rb; |
| 3529 | unsigned int events = POLL_HUP; | 3669 | unsigned int events = POLLHUP; |
| 3670 | |||
| 3671 | poll_wait(file, &event->waitq, wait); | ||
| 3672 | |||
| 3673 | if (is_event_hup(event)) | ||
| 3674 | return events; | ||
| 3530 | 3675 | ||
| 3531 | /* | 3676 | /* |
| 3532 | * Pin the event->rb by taking event->mmap_mutex; otherwise | 3677 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
| @@ -3537,9 +3682,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3537 | if (rb) | 3682 | if (rb) |
| 3538 | events = atomic_xchg(&rb->poll, 0); | 3683 | events = atomic_xchg(&rb->poll, 0); |
| 3539 | mutex_unlock(&event->mmap_mutex); | 3684 | mutex_unlock(&event->mmap_mutex); |
| 3540 | |||
| 3541 | poll_wait(file, &event->waitq, wait); | ||
| 3542 | |||
| 3543 | return events; | 3685 | return events; |
| 3544 | } | 3686 | } |
| 3545 | 3687 | ||
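The two hunks above move poll_wait() ahead of the ring-buffer check and report POLLHUP once an exited event has no child events left. A minimal userspace sketch of how a consumer could react to that; this is hypothetical tool code, not part of the patch, and the fd is assumed to come from perf_event_open(2):

#include <poll.h>
#include <stdio.h>

static void demo_drain_perf_fd(int fd)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        while (poll(&pfd, 1, -1) >= 0) {
                if (pfd.revents & POLLIN) {
                        /* consume samples from the mmap'ed ring buffer here */
                }
                if (pfd.revents & POLLHUP) {
                        /* monitored task exited and no child events remain */
                        printf("perf fd hung up, stopping\n");
                        break;
                }
        }
}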
| @@ -3717,6 +3859,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 3717 | return 0; | 3859 | return 0; |
| 3718 | } | 3860 | } |
| 3719 | 3861 | ||
| 3862 | #ifdef CONFIG_COMPAT | ||
| 3863 | static long perf_compat_ioctl(struct file *file, unsigned int cmd, | ||
| 3864 | unsigned long arg) | ||
| 3865 | { | ||
| 3866 | switch (_IOC_NR(cmd)) { | ||
| 3867 | case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): | ||
| 3868 | case _IOC_NR(PERF_EVENT_IOC_ID): | ||
| 3869 | /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */ | ||
| 3870 | if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { | ||
| 3871 | cmd &= ~IOCSIZE_MASK; | ||
| 3872 | cmd |= sizeof(void *) << IOCSIZE_SHIFT; | ||
| 3873 | } | ||
| 3874 | break; | ||
| 3875 | } | ||
| 3876 | return perf_ioctl(file, cmd, arg); | ||
| 3877 | } | ||
| 3878 | #else | ||
| 3879 | # define perf_compat_ioctl NULL | ||
| 3880 | #endif | ||
| 3881 | |||
| 3720 | int perf_event_task_enable(void) | 3882 | int perf_event_task_enable(void) |
| 3721 | { | 3883 | { |
| 3722 | struct perf_event *event; | 3884 | struct perf_event *event; |
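The new perf_compat_ioctl() only rewrites the size field of the command word, because _IOW() bakes sizeof(the argument type) into the encoding and a 32-bit caller passes a 4-byte pointer. A small userspace illustration of that fixup; the DEMO_* requests are stand-ins defined here, not the real perf ioctl numbers:

#include <stdio.h>
#include <stdint.h>
#include <linux/ioctl.h>

#define DEMO_IOC_SET_PTR_32 _IOW('$', 6, uint32_t)      /* 32-bit ABI: pointer is 4 bytes */
#define DEMO_IOC_SET_PTR_64 _IOW('$', 6, uint64_t)      /* 64-bit ABI: pointer is 8 bytes */

int main(void)
{
        unsigned int cmd = DEMO_IOC_SET_PTR_32;

        printf("32-bit encoding: %#x (payload size %u)\n", cmd, _IOC_SIZE(cmd));

        /* Same fixup as the compat handler: keep dir/type/nr, widen the size. */
        if (_IOC_SIZE(cmd) == sizeof(uint32_t)) {
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(uint64_t) << IOCSIZE_SHIFT;
        }

        printf("fixed up:        %#x, equals 64-bit encoding: %s\n",
               cmd, cmd == DEMO_IOC_SET_PTR_64 ? "yes" : "no");
        return 0;
}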
| @@ -4222,7 +4384,7 @@ static const struct file_operations perf_fops = { | |||
| 4222 | .read = perf_read, | 4384 | .read = perf_read, |
| 4223 | .poll = perf_poll, | 4385 | .poll = perf_poll, |
| 4224 | .unlocked_ioctl = perf_ioctl, | 4386 | .unlocked_ioctl = perf_ioctl, |
| 4225 | .compat_ioctl = perf_ioctl, | 4387 | .compat_ioctl = perf_compat_ioctl, |
| 4226 | .mmap = perf_mmap, | 4388 | .mmap = perf_mmap, |
| 4227 | .fasync = perf_fasync, | 4389 | .fasync = perf_fasync, |
| 4228 | }; | 4390 | }; |
| @@ -5671,7 +5833,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 5671 | struct perf_sample_data *data, | 5833 | struct perf_sample_data *data, |
| 5672 | struct pt_regs *regs) | 5834 | struct pt_regs *regs) |
| 5673 | { | 5835 | { |
| 5674 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5836 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5675 | struct perf_event *event; | 5837 | struct perf_event *event; |
| 5676 | struct hlist_head *head; | 5838 | struct hlist_head *head; |
| 5677 | 5839 | ||
| @@ -5690,7 +5852,7 @@ end: | |||
| 5690 | 5852 | ||
| 5691 | int perf_swevent_get_recursion_context(void) | 5853 | int perf_swevent_get_recursion_context(void) |
| 5692 | { | 5854 | { |
| 5693 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5855 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5694 | 5856 | ||
| 5695 | return get_recursion_context(swhash->recursion); | 5857 | return get_recursion_context(swhash->recursion); |
| 5696 | } | 5858 | } |
| @@ -5698,7 +5860,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | |||
| 5698 | 5860 | ||
| 5699 | inline void perf_swevent_put_recursion_context(int rctx) | 5861 | inline void perf_swevent_put_recursion_context(int rctx) |
| 5700 | { | 5862 | { |
| 5701 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5863 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5702 | 5864 | ||
| 5703 | put_recursion_context(swhash->recursion, rctx); | 5865 | put_recursion_context(swhash->recursion, rctx); |
| 5704 | } | 5866 | } |
| @@ -5727,7 +5889,7 @@ static void perf_swevent_read(struct perf_event *event) | |||
| 5727 | 5889 | ||
| 5728 | static int perf_swevent_add(struct perf_event *event, int flags) | 5890 | static int perf_swevent_add(struct perf_event *event, int flags) |
| 5729 | { | 5891 | { |
| 5730 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5892 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5731 | struct hw_perf_event *hwc = &event->hw; | 5893 | struct hw_perf_event *hwc = &event->hw; |
| 5732 | struct hlist_head *head; | 5894 | struct hlist_head *head; |
| 5733 | 5895 | ||
| @@ -5783,7 +5945,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) | |||
| 5783 | if (!hlist) | 5945 | if (!hlist) |
| 5784 | return; | 5946 | return; |
| 5785 | 5947 | ||
| 5786 | rcu_assign_pointer(swhash->swevent_hlist, NULL); | 5948 | RCU_INIT_POINTER(swhash->swevent_hlist, NULL); |
| 5787 | kfree_rcu(hlist, rcu_head); | 5949 | kfree_rcu(hlist, rcu_head); |
| 5788 | } | 5950 | } |
| 5789 | 5951 | ||
| @@ -5909,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 5909 | return 0; | 6071 | return 0; |
| 5910 | } | 6072 | } |
| 5911 | 6073 | ||
| 5912 | static int perf_swevent_event_idx(struct perf_event *event) | ||
| 5913 | { | ||
| 5914 | return 0; | ||
| 5915 | } | ||
| 5916 | |||
| 5917 | static struct pmu perf_swevent = { | 6074 | static struct pmu perf_swevent = { |
| 5918 | .task_ctx_nr = perf_sw_context, | 6075 | .task_ctx_nr = perf_sw_context, |
| 5919 | 6076 | ||
| @@ -5923,8 +6080,6 @@ static struct pmu perf_swevent = { | |||
| 5923 | .start = perf_swevent_start, | 6080 | .start = perf_swevent_start, |
| 5924 | .stop = perf_swevent_stop, | 6081 | .stop = perf_swevent_stop, |
| 5925 | .read = perf_swevent_read, | 6082 | .read = perf_swevent_read, |
| 5926 | |||
| 5927 | .event_idx = perf_swevent_event_idx, | ||
| 5928 | }; | 6083 | }; |
| 5929 | 6084 | ||
| 5930 | #ifdef CONFIG_EVENT_TRACING | 6085 | #ifdef CONFIG_EVENT_TRACING |
| @@ -6042,8 +6197,6 @@ static struct pmu perf_tracepoint = { | |||
| 6042 | .start = perf_swevent_start, | 6197 | .start = perf_swevent_start, |
| 6043 | .stop = perf_swevent_stop, | 6198 | .stop = perf_swevent_stop, |
| 6044 | .read = perf_swevent_read, | 6199 | .read = perf_swevent_read, |
| 6045 | |||
| 6046 | .event_idx = perf_swevent_event_idx, | ||
| 6047 | }; | 6200 | }; |
| 6048 | 6201 | ||
| 6049 | static inline void perf_tp_register(void) | 6202 | static inline void perf_tp_register(void) |
| @@ -6269,8 +6422,6 @@ static struct pmu perf_cpu_clock = { | |||
| 6269 | .start = cpu_clock_event_start, | 6422 | .start = cpu_clock_event_start, |
| 6270 | .stop = cpu_clock_event_stop, | 6423 | .stop = cpu_clock_event_stop, |
| 6271 | .read = cpu_clock_event_read, | 6424 | .read = cpu_clock_event_read, |
| 6272 | |||
| 6273 | .event_idx = perf_swevent_event_idx, | ||
| 6274 | }; | 6425 | }; |
| 6275 | 6426 | ||
| 6276 | /* | 6427 | /* |
| @@ -6349,8 +6500,6 @@ static struct pmu perf_task_clock = { | |||
| 6349 | .start = task_clock_event_start, | 6500 | .start = task_clock_event_start, |
| 6350 | .stop = task_clock_event_stop, | 6501 | .stop = task_clock_event_stop, |
| 6351 | .read = task_clock_event_read, | 6502 | .read = task_clock_event_read, |
| 6352 | |||
| 6353 | .event_idx = perf_swevent_event_idx, | ||
| 6354 | }; | 6503 | }; |
| 6355 | 6504 | ||
| 6356 | static void perf_pmu_nop_void(struct pmu *pmu) | 6505 | static void perf_pmu_nop_void(struct pmu *pmu) |
| @@ -6380,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) | |||
| 6380 | 6529 | ||
| 6381 | static int perf_event_idx_default(struct perf_event *event) | 6530 | static int perf_event_idx_default(struct perf_event *event) |
| 6382 | { | 6531 | { |
| 6383 | return event->hw.idx + 1; | 6532 | return 0; |
| 6384 | } | 6533 | } |
| 6385 | 6534 | ||
| 6386 | /* | 6535 | /* |
| @@ -7366,6 +7515,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7366 | goto err; | 7515 | goto err; |
| 7367 | } | 7516 | } |
| 7368 | 7517 | ||
| 7518 | /* Mark owner so we could distinguish it from user events. */ | ||
| 7519 | event->owner = EVENT_OWNER_KERNEL; | ||
| 7520 | |||
| 7369 | account_event(event); | 7521 | account_event(event); |
| 7370 | 7522 | ||
| 7371 | ctx = find_get_context(event->pmu, task, cpu); | 7523 | ctx = find_get_context(event->pmu, task, cpu); |
| @@ -7453,6 +7605,12 @@ static void sync_child_event(struct perf_event *child_event, | |||
| 7453 | mutex_unlock(&parent_event->child_mutex); | 7605 | mutex_unlock(&parent_event->child_mutex); |
| 7454 | 7606 | ||
| 7455 | /* | 7607 | /* |
| 7608 | * Make sure user/parent get notified that we just | ||
| 7609 | * lost one event. | ||
| 7610 | */ | ||
| 7611 | perf_event_wakeup(parent_event); | ||
| 7612 | |||
| 7613 | /* | ||
| 7456 | * Release the parent event, if this was the last | 7614 | * Release the parent event, if this was the last |
| 7457 | * reference to it. | 7615 | * reference to it. |
| 7458 | */ | 7616 | */ |
| @@ -7486,13 +7644,16 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 7486 | if (child_event->parent) { | 7644 | if (child_event->parent) { |
| 7487 | sync_child_event(child_event, child); | 7645 | sync_child_event(child_event, child); |
| 7488 | free_event(child_event); | 7646 | free_event(child_event); |
| 7647 | } else { | ||
| 7648 | child_event->state = PERF_EVENT_STATE_EXIT; | ||
| 7649 | perf_event_wakeup(child_event); | ||
| 7489 | } | 7650 | } |
| 7490 | } | 7651 | } |
| 7491 | 7652 | ||
| 7492 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7653 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
| 7493 | { | 7654 | { |
| 7494 | struct perf_event *child_event, *next; | 7655 | struct perf_event *child_event, *next; |
| 7495 | struct perf_event_context *child_ctx, *parent_ctx; | 7656 | struct perf_event_context *child_ctx, *clone_ctx = NULL; |
| 7496 | unsigned long flags; | 7657 | unsigned long flags; |
| 7497 | 7658 | ||
| 7498 | if (likely(!child->perf_event_ctxp[ctxn])) { | 7659 | if (likely(!child->perf_event_ctxp[ctxn])) { |
| @@ -7519,28 +7680,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 7519 | child->perf_event_ctxp[ctxn] = NULL; | 7680 | child->perf_event_ctxp[ctxn] = NULL; |
| 7520 | 7681 | ||
| 7521 | /* | 7682 | /* |
| 7522 | * In order to avoid freeing: child_ctx->parent_ctx->task | ||
| 7523 | * under perf_event_context::lock, grab another reference. | ||
| 7524 | */ | ||
| 7525 | parent_ctx = child_ctx->parent_ctx; | ||
| 7526 | if (parent_ctx) | ||
| 7527 | get_ctx(parent_ctx); | ||
| 7528 | |||
| 7529 | /* | ||
| 7530 | * If this context is a clone; unclone it so it can't get | 7683 | * If this context is a clone; unclone it so it can't get |
| 7531 | * swapped to another process while we're removing all | 7684 | * swapped to another process while we're removing all |
| 7532 | * the events from it. | 7685 | * the events from it. |
| 7533 | */ | 7686 | */ |
| 7534 | unclone_ctx(child_ctx); | 7687 | clone_ctx = unclone_ctx(child_ctx); |
| 7535 | update_context_time(child_ctx); | 7688 | update_context_time(child_ctx); |
| 7536 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 7689 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
| 7537 | 7690 | ||
| 7538 | /* | 7691 | if (clone_ctx) |
| 7539 | * Now that we no longer hold perf_event_context::lock, drop | 7692 | put_ctx(clone_ctx); |
| 7540 | * our extra child_ctx->parent_ctx reference. | ||
| 7541 | */ | ||
| 7542 | if (parent_ctx) | ||
| 7543 | put_ctx(parent_ctx); | ||
| 7544 | 7693 | ||
| 7545 | /* | 7694 | /* |
| 7546 | * Report the task dead after unscheduling the events so that we | 7695 | * Report the task dead after unscheduling the events so that we |
| @@ -7669,6 +7818,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 7669 | struct perf_event *group_leader, | 7818 | struct perf_event *group_leader, |
| 7670 | struct perf_event_context *child_ctx) | 7819 | struct perf_event_context *child_ctx) |
| 7671 | { | 7820 | { |
| 7821 | enum perf_event_active_state parent_state = parent_event->state; | ||
| 7672 | struct perf_event *child_event; | 7822 | struct perf_event *child_event; |
| 7673 | unsigned long flags; | 7823 | unsigned long flags; |
| 7674 | 7824 | ||
| @@ -7689,7 +7839,8 @@ inherit_event(struct perf_event *parent_event, | |||
| 7689 | if (IS_ERR(child_event)) | 7839 | if (IS_ERR(child_event)) |
| 7690 | return child_event; | 7840 | return child_event; |
| 7691 | 7841 | ||
| 7692 | if (!atomic_long_inc_not_zero(&parent_event->refcount)) { | 7842 | if (is_orphaned_event(parent_event) || |
| 7843 | !atomic_long_inc_not_zero(&parent_event->refcount)) { | ||
| 7693 | free_event(child_event); | 7844 | free_event(child_event); |
| 7694 | return NULL; | 7845 | return NULL; |
| 7695 | } | 7846 | } |
| @@ -7701,7 +7852,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 7701 | * not its attr.disabled bit. We hold the parent's mutex, | 7852 | * not its attr.disabled bit. We hold the parent's mutex, |
| 7702 | * so we won't race with perf_event_{en, dis}able_family. | 7853 | * so we won't race with perf_event_{en, dis}able_family. |
| 7703 | */ | 7854 | */ |
| 7704 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | 7855 | if (parent_state >= PERF_EVENT_STATE_INACTIVE) |
| 7705 | child_event->state = PERF_EVENT_STATE_INACTIVE; | 7856 | child_event->state = PERF_EVENT_STATE_INACTIVE; |
| 7706 | else | 7857 | else |
| 7707 | child_event->state = PERF_EVENT_STATE_OFF; | 7858 | child_event->state = PERF_EVENT_STATE_OFF; |
| @@ -7917,8 +8068,10 @@ int perf_event_init_task(struct task_struct *child) | |||
| 7917 | 8068 | ||
| 7918 | for_each_task_context_nr(ctxn) { | 8069 | for_each_task_context_nr(ctxn) { |
| 7919 | ret = perf_event_init_context(child, ctxn); | 8070 | ret = perf_event_init_context(child, ctxn); |
| 7920 | if (ret) | 8071 | if (ret) { |
| 8072 | perf_event_free_task(child); | ||
| 7921 | return ret; | 8073 | return ret; |
| 8074 | } | ||
| 7922 | } | 8075 | } |
| 7923 | 8076 | ||
| 7924 | return 0; | 8077 | return 0; |
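Several hunks above replace &__get_cpu_var(var) with this_cpu_ptr(&var); both return a pointer to the current CPU's instance of a per-cpu variable, and the older accessor was being phased out tree-wide at the time. A kernel-context sketch of the new form; demo_stat is a made-up per-cpu variable, not something in the patch, and callers are assumed to run with preemption disabled, as the swevent paths do:

#include <linux/percpu.h>

struct demo_stat {
        unsigned long hits;
};

static DEFINE_PER_CPU(struct demo_stat, demo_stat);

static void demo_count_hit(void)
{
        struct demo_stat *st = this_cpu_ptr(&demo_stat);   /* was: &__get_cpu_var(demo_stat) */

        st->hits++;     /* for a single field, this_cpu_inc(demo_stat.hits) would also do */
}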
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) | |||
| 605 | bp->hw.state = PERF_HES_STOPPED; | 605 | bp->hw.state = PERF_HES_STOPPED; |
| 606 | } | 606 | } |
| 607 | 607 | ||
| 608 | static int hw_breakpoint_event_idx(struct perf_event *bp) | ||
| 609 | { | ||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | static struct pmu perf_breakpoint = { | 608 | static struct pmu perf_breakpoint = { |
| 614 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | 609 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ |
| 615 | 610 | ||
| @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { | |||
| 619 | .start = hw_breakpoint_start, | 614 | .start = hw_breakpoint_start, |
| 620 | .stop = hw_breakpoint_stop, | 615 | .stop = hw_breakpoint_stop, |
| 621 | .read = hw_breakpoint_pmu_read, | 616 | .read = hw_breakpoint_pmu_read, |
| 622 | |||
| 623 | .event_idx = hw_breakpoint_event_idx, | ||
| 624 | }; | 617 | }; |
| 625 | 618 | ||
| 626 | int __init init_hw_breakpoint(void) | 619 | int __init init_hw_breakpoint(void) |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 167 | /* For mmu_notifiers */ | 167 | /* For mmu_notifiers */ |
| 168 | const unsigned long mmun_start = addr; | 168 | const unsigned long mmun_start = addr; |
| 169 | const unsigned long mmun_end = addr + PAGE_SIZE; | 169 | const unsigned long mmun_end = addr + PAGE_SIZE; |
| 170 | struct mem_cgroup *memcg; | ||
| 171 | |||
| 172 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); | ||
| 173 | if (err) | ||
| 174 | return err; | ||
| 170 | 175 | ||
| 171 | /* For try_to_free_swap() and munlock_vma_page() below */ | 176 | /* For try_to_free_swap() and munlock_vma_page() below */ |
| 172 | lock_page(page); | 177 | lock_page(page); |
| @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 179 | 184 | ||
| 180 | get_page(kpage); | 185 | get_page(kpage); |
| 181 | page_add_new_anon_rmap(kpage, vma, addr); | 186 | page_add_new_anon_rmap(kpage, vma, addr); |
| 187 | mem_cgroup_commit_charge(kpage, memcg, false); | ||
| 188 | lru_cache_add_active_or_unevictable(kpage, vma); | ||
| 182 | 189 | ||
| 183 | if (!PageAnon(page)) { | 190 | if (!PageAnon(page)) { |
| 184 | dec_mm_counter(mm, MM_FILEPAGES); | 191 | dec_mm_counter(mm, MM_FILEPAGES); |
| @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 200 | 207 | ||
| 201 | err = 0; | 208 | err = 0; |
| 202 | unlock: | 209 | unlock: |
| 210 | mem_cgroup_cancel_charge(kpage, memcg); | ||
| 203 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 211 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 204 | unlock_page(page); | 212 | unlock_page(page); |
| 205 | return err; | 213 | return err; |
| @@ -315,18 +323,11 @@ retry: | |||
| 315 | if (!new_page) | 323 | if (!new_page) |
| 316 | goto put_old; | 324 | goto put_old; |
| 317 | 325 | ||
| 318 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) | ||
| 319 | goto put_new; | ||
| 320 | |||
| 321 | __SetPageUptodate(new_page); | 326 | __SetPageUptodate(new_page); |
| 322 | copy_highpage(new_page, old_page); | 327 | copy_highpage(new_page, old_page); |
| 323 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | 328 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
| 324 | 329 | ||
| 325 | ret = __replace_page(vma, vaddr, old_page, new_page); | 330 | ret = __replace_page(vma, vaddr, old_page, new_page); |
| 326 | if (ret) | ||
| 327 | mem_cgroup_uncharge_page(new_page); | ||
| 328 | |||
| 329 | put_new: | ||
| 330 | page_cache_release(new_page); | 331 | page_cache_release(new_page); |
| 331 | put_old: | 332 | put_old: |
| 332 | put_page(old_page); | 333 | put_page(old_page); |
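The uprobes hunks switch __replace_page() to the reworked memcg charge API from this merge: mem_cgroup_try_charge() reserves the charge up front, mem_cgroup_commit_charge() finalizes it once the page is mapped, and mem_cgroup_cancel_charge() hands the reservation back otherwise. A kernel-context sketch of that protocol; demo_install_anon_page() and its locking comments are illustrative, not code from the patch:

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/swap.h>

static int demo_install_anon_page(struct vm_area_struct *vma,
                                  unsigned long addr, struct page *new_page)
{
        struct mem_cgroup *memcg;
        int err;

        /* 1. Reserve the charge before taking any page-table locks. */
        err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg);
        if (err)
                return err;

        /*
         * 2. Install the pte under the usual locks.  If that fails, give
         *    the reservation back instead of committing it:
         *
         *        mem_cgroup_cancel_charge(new_page, memcg);
         *        return -EAGAIN;
         */

        /* 3. Once the rmap exists, commit the charge and add to the LRU. */
        page_add_new_anon_rmap(new_page, vma, addr);
        mem_cgroup_commit_charge(new_page, memcg, false);
        lru_cache_add_active_or_unevictable(new_page, vma);

        return 0;
}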
diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..5d30019ff953 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -59,7 +59,7 @@ | |||
| 59 | #include <asm/pgtable.h> | 59 | #include <asm/pgtable.h> |
| 60 | #include <asm/mmu_context.h> | 60 | #include <asm/mmu_context.h> |
| 61 | 61 | ||
| 62 | static void exit_mm(struct task_struct * tsk); | 62 | static void exit_mm(struct task_struct *tsk); |
| 63 | 63 | ||
| 64 | static void __unhash_process(struct task_struct *p, bool group_dead) | 64 | static void __unhash_process(struct task_struct *p, bool group_dead) |
| 65 | { | 65 | { |
| @@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 115 | 115 | ||
| 116 | if (tsk == sig->curr_target) | 116 | if (tsk == sig->curr_target) |
| 117 | sig->curr_target = next_thread(tsk); | 117 | sig->curr_target = next_thread(tsk); |
| 118 | /* | ||
| 119 | * Accumulate here the counters for all threads but the | ||
| 120 | * group leader as they die, so they can be added into | ||
| 121 | * the process-wide totals when those are taken. | ||
| 122 | * The group leader stays around as a zombie as long | ||
| 123 | * as there are other threads. When it gets reaped, | ||
| 124 | * the exit.c code will add its counts into these totals. | ||
| 125 | * We won't ever get here for the group leader, since it | ||
| 126 | * will have been the last reference on the signal_struct. | ||
| 127 | */ | ||
| 128 | task_cputime(tsk, &utime, &stime); | ||
| 129 | sig->utime += utime; | ||
| 130 | sig->stime += stime; | ||
| 131 | sig->gtime += task_gtime(tsk); | ||
| 132 | sig->min_flt += tsk->min_flt; | ||
| 133 | sig->maj_flt += tsk->maj_flt; | ||
| 134 | sig->nvcsw += tsk->nvcsw; | ||
| 135 | sig->nivcsw += tsk->nivcsw; | ||
| 136 | sig->inblock += task_io_get_inblock(tsk); | ||
| 137 | sig->oublock += task_io_get_oublock(tsk); | ||
| 138 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 139 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 140 | } | 118 | } |
| 141 | 119 | ||
| 120 | /* | ||
| 121 | * Accumulate here the counters for all threads but the group leader | ||
| 122 | * as they die, so they can be added into the process-wide totals | ||
| 123 | * when those are taken. The group leader stays around as a zombie as | ||
| 124 | * long as there are other threads. When it gets reaped, the exit.c | ||
| 125 | * code will add its counts into these totals. We won't ever get here | ||
| 126 | * for the group leader, since it will have been the last reference on | ||
| 127 | * the signal_struct. | ||
| 128 | */ | ||
| 129 | task_cputime(tsk, &utime, &stime); | ||
| 130 | write_seqlock(&sig->stats_lock); | ||
| 131 | sig->utime += utime; | ||
| 132 | sig->stime += stime; | ||
| 133 | sig->gtime += task_gtime(tsk); | ||
| 134 | sig->min_flt += tsk->min_flt; | ||
| 135 | sig->maj_flt += tsk->maj_flt; | ||
| 136 | sig->nvcsw += tsk->nvcsw; | ||
| 137 | sig->nivcsw += tsk->nivcsw; | ||
| 138 | sig->inblock += task_io_get_inblock(tsk); | ||
| 139 | sig->oublock += task_io_get_oublock(tsk); | ||
| 140 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 141 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 142 | sig->nr_threads--; | 142 | sig->nr_threads--; |
| 143 | __unhash_process(tsk, group_dead); | 143 | __unhash_process(tsk, group_dead); |
| 144 | write_sequnlock(&sig->stats_lock); | ||
| 144 | 145 | ||
| 145 | /* | 146 | /* |
| 146 | * Do this under ->siglock, we can race with another thread | 147 | * Do this under ->siglock, we can race with another thread |
| @@ -151,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 151 | spin_unlock(&sighand->siglock); | 152 | spin_unlock(&sighand->siglock); |
| 152 | 153 | ||
| 153 | __cleanup_sighand(sighand); | 154 | __cleanup_sighand(sighand); |
| 154 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 155 | clear_tsk_thread_flag(tsk, TIF_SIGPENDING); |
| 155 | if (group_dead) { | 156 | if (group_dead) { |
| 156 | flush_sigqueue(&sig->shared_pending); | 157 | flush_sigqueue(&sig->shared_pending); |
| 157 | tty_kref_put(tty); | 158 | tty_kref_put(tty); |
| @@ -168,7 +169,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
| 168 | } | 169 | } |
| 169 | 170 | ||
| 170 | 171 | ||
| 171 | void release_task(struct task_struct * p) | 172 | void release_task(struct task_struct *p) |
| 172 | { | 173 | { |
| 173 | struct task_struct *leader; | 174 | struct task_struct *leader; |
| 174 | int zap_leader; | 175 | int zap_leader; |
| @@ -192,7 +193,8 @@ repeat: | |||
| 192 | */ | 193 | */ |
| 193 | zap_leader = 0; | 194 | zap_leader = 0; |
| 194 | leader = p->group_leader; | 195 | leader = p->group_leader; |
| 195 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 196 | if (leader != p && thread_group_empty(leader) |
| 197 | && leader->exit_state == EXIT_ZOMBIE) { | ||
| 196 | /* | 198 | /* |
| 197 | * If we were the last child thread and the leader has | 199 | * If we were the last child thread and the leader has |
| 198 | * exited already, and the leader's parent ignores SIGCHLD, | 200 | * exited already, and the leader's parent ignores SIGCHLD, |
| @@ -241,7 +243,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) | |||
| 241 | * | 243 | * |
| 242 | * "I ask you, have you ever known what it is to be an orphan?" | 244 | * "I ask you, have you ever known what it is to be an orphan?" |
| 243 | */ | 245 | */ |
| 244 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) | 246 | static int will_become_orphaned_pgrp(struct pid *pgrp, |
| 247 | struct task_struct *ignored_task) | ||
| 245 | { | 248 | { |
| 246 | struct task_struct *p; | 249 | struct task_struct *p; |
| 247 | 250 | ||
| @@ -294,9 +297,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
| 294 | struct task_struct *ignored_task = tsk; | 297 | struct task_struct *ignored_task = tsk; |
| 295 | 298 | ||
| 296 | if (!parent) | 299 | if (!parent) |
| 297 | /* exit: our father is in a different pgrp than | 300 | /* exit: our father is in a different pgrp than |
| 298 | * we are and we were the only connection outside. | 301 | * we are and we were the only connection outside. |
| 299 | */ | 302 | */ |
| 300 | parent = tsk->real_parent; | 303 | parent = tsk->real_parent; |
| 301 | else | 304 | else |
| 302 | /* reparent: our child is in a different pgrp than | 305 | /* reparent: our child is in a different pgrp than |
| @@ -405,7 +408,7 @@ assign_new_owner: | |||
| 405 | * Turn us into a lazy TLB process if we | 408 | * Turn us into a lazy TLB process if we |
| 406 | * aren't already.. | 409 | * aren't already.. |
| 407 | */ | 410 | */ |
| 408 | static void exit_mm(struct task_struct * tsk) | 411 | static void exit_mm(struct task_struct *tsk) |
| 409 | { | 412 | { |
| 410 | struct mm_struct *mm = tsk->mm; | 413 | struct mm_struct *mm = tsk->mm; |
| 411 | struct core_state *core_state; | 414 | struct core_state *core_state; |
| @@ -425,6 +428,7 @@ static void exit_mm(struct task_struct * tsk) | |||
| 425 | core_state = mm->core_state; | 428 | core_state = mm->core_state; |
| 426 | if (core_state) { | 429 | if (core_state) { |
| 427 | struct core_thread self; | 430 | struct core_thread self; |
| 431 | |||
| 428 | up_read(&mm->mmap_sem); | 432 | up_read(&mm->mmap_sem); |
| 429 | 433 | ||
| 430 | self.task = tsk; | 434 | self.task = tsk; |
| @@ -455,6 +459,7 @@ static void exit_mm(struct task_struct * tsk) | |||
| 455 | task_unlock(tsk); | 459 | task_unlock(tsk); |
| 456 | mm_update_next_owner(mm); | 460 | mm_update_next_owner(mm); |
| 457 | mmput(mm); | 461 | mmput(mm); |
| 462 | clear_thread_flag(TIF_MEMDIE); | ||
| 458 | } | 463 | } |
| 459 | 464 | ||
| 460 | /* | 465 | /* |
| @@ -565,6 +570,7 @@ static void forget_original_parent(struct task_struct *father) | |||
| 565 | 570 | ||
| 566 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 571 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
| 567 | struct task_struct *t = p; | 572 | struct task_struct *t = p; |
| 573 | |||
| 568 | do { | 574 | do { |
| 569 | t->real_parent = reaper; | 575 | t->real_parent = reaper; |
| 570 | if (t->parent == father) { | 576 | if (t->parent == father) { |
| @@ -598,7 +604,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 598 | /* | 604 | /* |
| 599 | * This does two things: | 605 | * This does two things: |
| 600 | * | 606 | * |
| 601 | * A. Make init inherit all the child processes | 607 | * A. Make init inherit all the child processes |
| 602 | * B. Check to see if any process groups have become orphaned | 608 | * B. Check to see if any process groups have become orphaned |
| 603 | * as a result of our exiting, and if they have any stopped | 609 | * as a result of our exiting, and if they have any stopped |
| 604 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 610 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
| @@ -648,9 +654,8 @@ static void check_stack_usage(void) | |||
| 648 | 654 | ||
| 649 | spin_lock(&low_water_lock); | 655 | spin_lock(&low_water_lock); |
| 650 | if (free < lowest_to_date) { | 656 | if (free < lowest_to_date) { |
| 651 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " | 657 | pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", |
| 652 | "%lu bytes left\n", | 658 | current->comm, task_pid_nr(current), free); |
| 653 | current->comm, task_pid_nr(current), free); | ||
| 654 | lowest_to_date = free; | 659 | lowest_to_date = free; |
| 655 | } | 660 | } |
| 656 | spin_unlock(&low_water_lock); | 661 | spin_unlock(&low_water_lock); |
| @@ -663,6 +668,7 @@ void do_exit(long code) | |||
| 663 | { | 668 | { |
| 664 | struct task_struct *tsk = current; | 669 | struct task_struct *tsk = current; |
| 665 | int group_dead; | 670 | int group_dead; |
| 671 | TASKS_RCU(int tasks_rcu_i); | ||
| 666 | 672 | ||
| 667 | profile_task_exit(tsk); | 673 | profile_task_exit(tsk); |
| 668 | 674 | ||
| @@ -691,8 +697,7 @@ void do_exit(long code) | |||
| 691 | * leave this task alone and wait for reboot. | 697 | * leave this task alone and wait for reboot. |
| 692 | */ | 698 | */ |
| 693 | if (unlikely(tsk->flags & PF_EXITING)) { | 699 | if (unlikely(tsk->flags & PF_EXITING)) { |
| 694 | printk(KERN_ALERT | 700 | pr_alert("Fixing recursive fault but reboot is needed!\n"); |
| 695 | "Fixing recursive fault but reboot is needed!\n"); | ||
| 696 | /* | 701 | /* |
| 697 | * We can do this unlocked here. The futex code uses | 702 | * We can do this unlocked here. The futex code uses |
| 698 | * this flag just to verify whether the pi state | 703 | * this flag just to verify whether the pi state |
| @@ -716,9 +721,9 @@ void do_exit(long code) | |||
| 716 | raw_spin_unlock_wait(&tsk->pi_lock); | 721 | raw_spin_unlock_wait(&tsk->pi_lock); |
| 717 | 722 | ||
| 718 | if (unlikely(in_atomic())) | 723 | if (unlikely(in_atomic())) |
| 719 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 724 | pr_info("note: %s[%d] exited with preempt_count %d\n", |
| 720 | current->comm, task_pid_nr(current), | 725 | current->comm, task_pid_nr(current), |
| 721 | preempt_count()); | 726 | preempt_count()); |
| 722 | 727 | ||
| 723 | acct_update_integrals(tsk); | 728 | acct_update_integrals(tsk); |
| 724 | /* sync mm's RSS info before statistics gathering */ | 729 | /* sync mm's RSS info before statistics gathering */ |
| @@ -772,6 +777,7 @@ void do_exit(long code) | |||
| 772 | */ | 777 | */ |
| 773 | flush_ptrace_hw_breakpoint(tsk); | 778 | flush_ptrace_hw_breakpoint(tsk); |
| 774 | 779 | ||
| 780 | TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); | ||
| 775 | exit_notify(tsk, group_dead); | 781 | exit_notify(tsk, group_dead); |
| 776 | proc_exit_connector(tsk); | 782 | proc_exit_connector(tsk); |
| 777 | #ifdef CONFIG_NUMA | 783 | #ifdef CONFIG_NUMA |
| @@ -811,6 +817,7 @@ void do_exit(long code) | |||
| 811 | if (tsk->nr_dirtied) | 817 | if (tsk->nr_dirtied) |
| 812 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | 818 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); |
| 813 | exit_rcu(); | 819 | exit_rcu(); |
| 820 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); | ||
| 814 | 821 | ||
| 815 | /* | 822 | /* |
| 816 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | 823 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed |
| @@ -836,7 +843,6 @@ void do_exit(long code) | |||
| 836 | for (;;) | 843 | for (;;) |
| 837 | cpu_relax(); /* For when BUG is null */ | 844 | cpu_relax(); /* For when BUG is null */ |
| 838 | } | 845 | } |
| 839 | |||
| 840 | EXPORT_SYMBOL_GPL(do_exit); | 846 | EXPORT_SYMBOL_GPL(do_exit); |
| 841 | 847 | ||
| 842 | void complete_and_exit(struct completion *comp, long code) | 848 | void complete_and_exit(struct completion *comp, long code) |
| @@ -846,7 +852,6 @@ void complete_and_exit(struct completion *comp, long code) | |||
| 846 | 852 | ||
| 847 | do_exit(code); | 853 | do_exit(code); |
| 848 | } | 854 | } |
| 849 | |||
| 850 | EXPORT_SYMBOL(complete_and_exit); | 855 | EXPORT_SYMBOL(complete_and_exit); |
| 851 | 856 | ||
| 852 | SYSCALL_DEFINE1(exit, int, error_code) | 857 | SYSCALL_DEFINE1(exit, int, error_code) |
| @@ -869,6 +874,7 @@ do_group_exit(int exit_code) | |||
| 869 | exit_code = sig->group_exit_code; | 874 | exit_code = sig->group_exit_code; |
| 870 | else if (!thread_group_empty(current)) { | 875 | else if (!thread_group_empty(current)) { |
| 871 | struct sighand_struct *const sighand = current->sighand; | 876 | struct sighand_struct *const sighand = current->sighand; |
| 877 | |||
| 872 | spin_lock_irq(&sighand->siglock); | 878 | spin_lock_irq(&sighand->siglock); |
| 873 | if (signal_group_exit(sig)) | 879 | if (signal_group_exit(sig)) |
| 874 | /* Another thread got here before we took the lock. */ | 880 | /* Another thread got here before we took the lock. */ |
| @@ -1033,14 +1039,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1033 | * as other threads in the parent group can be right | 1039 | * as other threads in the parent group can be right |
| 1034 | * here reaping other children at the same time. | 1040 | * here reaping other children at the same time. |
| 1035 | * | 1041 | * |
| 1036 | * We use thread_group_cputime_adjusted() to get times for the thread | 1042 | * We use thread_group_cputime_adjusted() to get times for |
| 1037 | * group, which consolidates times for all threads in the | 1043 | * the thread group, which consolidates times for all threads |
| 1038 | * group including the group leader. | 1044 | * in the group including the group leader. |
| 1039 | */ | 1045 | */ |
| 1040 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | 1046 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
| 1041 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1047 | spin_lock_irq(&p->real_parent->sighand->siglock); |
| 1042 | psig = p->real_parent->signal; | 1048 | psig = p->real_parent->signal; |
| 1043 | sig = p->signal; | 1049 | sig = p->signal; |
| 1050 | write_seqlock(&psig->stats_lock); | ||
| 1044 | psig->cutime += tgutime + sig->cutime; | 1051 | psig->cutime += tgutime + sig->cutime; |
| 1045 | psig->cstime += tgstime + sig->cstime; | 1052 | psig->cstime += tgstime + sig->cstime; |
| 1046 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | 1053 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
| @@ -1063,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1063 | psig->cmaxrss = maxrss; | 1070 | psig->cmaxrss = maxrss; |
| 1064 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1071 | task_io_accounting_add(&psig->ioac, &p->ioac); |
| 1065 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1072 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
| 1073 | write_sequnlock(&psig->stats_lock); | ||
| 1066 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1074 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
| 1067 | } | 1075 | } |
| 1068 | 1076 | ||
| @@ -1417,6 +1425,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) | |||
| 1417 | 1425 | ||
| 1418 | list_for_each_entry(p, &tsk->children, sibling) { | 1426 | list_for_each_entry(p, &tsk->children, sibling) { |
| 1419 | int ret = wait_consider_task(wo, 0, p); | 1427 | int ret = wait_consider_task(wo, 0, p); |
| 1428 | |||
| 1420 | if (ret) | 1429 | if (ret) |
| 1421 | return ret; | 1430 | return ret; |
| 1422 | } | 1431 | } |
| @@ -1430,6 +1439,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) | |||
| 1430 | 1439 | ||
| 1431 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { | 1440 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { |
| 1432 | int ret = wait_consider_task(wo, 1, p); | 1441 | int ret = wait_consider_task(wo, 1, p); |
| 1442 | |||
| 1433 | if (ret) | 1443 | if (ret) |
| 1434 | return ret; | 1444 | return ret; |
| 1435 | } | 1445 | } |
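The exit.c hunks wrap the per-process accounting updates in sig->stats_lock, the seqlock that the fork.c hunk below initializes with seqlock_init(). A kernel-context sketch of a matching read side; demo_read_exit_times() is illustrative, and the in-tree readers such as thread_group_cputime() use the same begin/retry idea, some via the _or_lock variants:

#include <linux/sched.h>
#include <linux/seqlock.h>

static void demo_read_exit_times(struct signal_struct *sig,
                                 cputime_t *utime, cputime_t *stime)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&sig->stats_lock);  /* snapshot the sequence count */
                *utime = sig->utime;
                *stime = sig->stime;
        } while (read_seqretry(&sig->stats_lock, seq)); /* retry if a writer slipped in */
}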
diff --git a/kernel/fork.c b/kernel/fork.c index fbd3497b221f..9b7d746d6d62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, | |||
| 294 | return 0; | 294 | return 0; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | void set_task_stack_end_magic(struct task_struct *tsk) | ||
| 298 | { | ||
| 299 | unsigned long *stackend; | ||
| 300 | |||
| 301 | stackend = end_of_stack(tsk); | ||
| 302 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 303 | } | ||
| 304 | |||
| 297 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 305 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
| 298 | { | 306 | { |
| 299 | struct task_struct *tsk; | 307 | struct task_struct *tsk; |
| 300 | struct thread_info *ti; | 308 | struct thread_info *ti; |
| 301 | unsigned long *stackend; | ||
| 302 | int node = tsk_fork_get_node(orig); | 309 | int node = tsk_fork_get_node(orig); |
| 303 | int err; | 310 | int err; |
| 304 | 311 | ||
| @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 328 | setup_thread_stack(tsk, orig); | 335 | setup_thread_stack(tsk, orig); |
| 329 | clear_user_return_notifier(tsk); | 336 | clear_user_return_notifier(tsk); |
| 330 | clear_tsk_need_resched(tsk); | 337 | clear_tsk_need_resched(tsk); |
| 331 | stackend = end_of_stack(tsk); | 338 | set_task_stack_end_magic(tsk); |
| 332 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 333 | 339 | ||
| 334 | #ifdef CONFIG_CC_STACKPROTECTOR | 340 | #ifdef CONFIG_CC_STACKPROTECTOR |
| 335 | tsk->stack_canary = get_random_int(); | 341 | tsk->stack_canary = get_random_int(); |
| @@ -374,12 +380,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 374 | */ | 380 | */ |
| 375 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | 381 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
| 376 | 382 | ||
| 377 | mm->locked_vm = 0; | 383 | mm->total_vm = oldmm->total_vm; |
| 378 | mm->mmap = NULL; | 384 | mm->shared_vm = oldmm->shared_vm; |
| 379 | mm->vmacache_seqnum = 0; | 385 | mm->exec_vm = oldmm->exec_vm; |
| 380 | mm->map_count = 0; | 386 | mm->stack_vm = oldmm->stack_vm; |
| 381 | cpumask_clear(mm_cpumask(mm)); | 387 | |
| 382 | mm->mm_rb = RB_ROOT; | ||
| 383 | rb_link = &mm->mm_rb.rb_node; | 388 | rb_link = &mm->mm_rb.rb_node; |
| 384 | rb_parent = NULL; | 389 | rb_parent = NULL; |
| 385 | pprev = &mm->mmap; | 390 | pprev = &mm->mmap; |
| @@ -430,7 +435,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 430 | atomic_dec(&inode->i_writecount); | 435 | atomic_dec(&inode->i_writecount); |
| 431 | mutex_lock(&mapping->i_mmap_mutex); | 436 | mutex_lock(&mapping->i_mmap_mutex); |
| 432 | if (tmp->vm_flags & VM_SHARED) | 437 | if (tmp->vm_flags & VM_SHARED) |
| 433 | mapping->i_mmap_writable++; | 438 | atomic_inc(&mapping->i_mmap_writable); |
| 434 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
| 435 | /* insert tmp into the share list, just after mpnt */ | 440 | /* insert tmp into the share list, just after mpnt */ |
| 436 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) | 441 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) |
| @@ -536,19 +541,37 @@ static void mm_init_aio(struct mm_struct *mm) | |||
| 536 | #endif | 541 | #endif |
| 537 | } | 542 | } |
| 538 | 543 | ||
| 544 | static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | ||
| 545 | { | ||
| 546 | #ifdef CONFIG_MEMCG | ||
| 547 | mm->owner = p; | ||
| 548 | #endif | ||
| 549 | } | ||
| 550 | |||
| 539 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | 551 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) |
| 540 | { | 552 | { |
| 553 | mm->mmap = NULL; | ||
| 554 | mm->mm_rb = RB_ROOT; | ||
| 555 | mm->vmacache_seqnum = 0; | ||
| 541 | atomic_set(&mm->mm_users, 1); | 556 | atomic_set(&mm->mm_users, 1); |
| 542 | atomic_set(&mm->mm_count, 1); | 557 | atomic_set(&mm->mm_count, 1); |
| 543 | init_rwsem(&mm->mmap_sem); | 558 | init_rwsem(&mm->mmap_sem); |
| 544 | INIT_LIST_HEAD(&mm->mmlist); | 559 | INIT_LIST_HEAD(&mm->mmlist); |
| 545 | mm->core_state = NULL; | 560 | mm->core_state = NULL; |
| 546 | atomic_long_set(&mm->nr_ptes, 0); | 561 | atomic_long_set(&mm->nr_ptes, 0); |
| 562 | mm->map_count = 0; | ||
| 563 | mm->locked_vm = 0; | ||
| 564 | mm->pinned_vm = 0; | ||
| 547 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); | 565 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
| 548 | spin_lock_init(&mm->page_table_lock); | 566 | spin_lock_init(&mm->page_table_lock); |
| 567 | mm_init_cpumask(mm); | ||
| 549 | mm_init_aio(mm); | 568 | mm_init_aio(mm); |
| 550 | mm_init_owner(mm, p); | 569 | mm_init_owner(mm, p); |
| 570 | mmu_notifier_mm_init(mm); | ||
| 551 | clear_tlb_flush_pending(mm); | 571 | clear_tlb_flush_pending(mm); |
| 572 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
| 573 | mm->pmd_huge_pte = NULL; | ||
| 574 | #endif | ||
| 552 | 575 | ||
| 553 | if (current->mm) { | 576 | if (current->mm) { |
| 554 | mm->flags = current->mm->flags & MMF_INIT_MASK; | 577 | mm->flags = current->mm->flags & MMF_INIT_MASK; |
| @@ -558,11 +581,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
| 558 | mm->def_flags = 0; | 581 | mm->def_flags = 0; |
| 559 | } | 582 | } |
| 560 | 583 | ||
| 561 | if (likely(!mm_alloc_pgd(mm))) { | 584 | if (mm_alloc_pgd(mm)) |
| 562 | mmu_notifier_mm_init(mm); | 585 | goto fail_nopgd; |
| 563 | return mm; | ||
| 564 | } | ||
| 565 | 586 | ||
| 587 | if (init_new_context(p, mm)) | ||
| 588 | goto fail_nocontext; | ||
| 589 | |||
| 590 | return mm; | ||
| 591 | |||
| 592 | fail_nocontext: | ||
| 593 | mm_free_pgd(mm); | ||
| 594 | fail_nopgd: | ||
| 566 | free_mm(mm); | 595 | free_mm(mm); |
| 567 | return NULL; | 596 | return NULL; |
| 568 | } | 597 | } |
| @@ -578,9 +607,8 @@ static void check_mm(struct mm_struct *mm) | |||
| 578 | printk(KERN_ALERT "BUG: Bad rss-counter state " | 607 | printk(KERN_ALERT "BUG: Bad rss-counter state " |
| 579 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 608 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
| 580 | } | 609 | } |
| 581 | |||
| 582 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 610 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
| 583 | VM_BUG_ON(mm->pmd_huge_pte); | 611 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
| 584 | #endif | 612 | #endif |
| 585 | } | 613 | } |
| 586 | 614 | ||
| @@ -596,7 +624,6 @@ struct mm_struct *mm_alloc(void) | |||
| 596 | return NULL; | 624 | return NULL; |
| 597 | 625 | ||
| 598 | memset(mm, 0, sizeof(*mm)); | 626 | memset(mm, 0, sizeof(*mm)); |
| 599 | mm_init_cpumask(mm); | ||
| 600 | return mm_init(mm, current); | 627 | return mm_init(mm, current); |
| 601 | } | 628 | } |
| 602 | 629 | ||
| @@ -828,17 +855,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 828 | goto fail_nomem; | 855 | goto fail_nomem; |
| 829 | 856 | ||
| 830 | memcpy(mm, oldmm, sizeof(*mm)); | 857 | memcpy(mm, oldmm, sizeof(*mm)); |
| 831 | mm_init_cpumask(mm); | ||
| 832 | 858 | ||
| 833 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
| 834 | mm->pmd_huge_pte = NULL; | ||
| 835 | #endif | ||
| 836 | if (!mm_init(mm, tsk)) | 859 | if (!mm_init(mm, tsk)) |
| 837 | goto fail_nomem; | 860 | goto fail_nomem; |
| 838 | 861 | ||
| 839 | if (init_new_context(tsk, mm)) | ||
| 840 | goto fail_nocontext; | ||
| 841 | |||
| 842 | dup_mm_exe_file(oldmm, mm); | 862 | dup_mm_exe_file(oldmm, mm); |
| 843 | 863 | ||
| 844 | err = dup_mmap(mm, oldmm); | 864 | err = dup_mmap(mm, oldmm); |
| @@ -860,15 +880,6 @@ free_pt: | |||
| 860 | 880 | ||
| 861 | fail_nomem: | 881 | fail_nomem: |
| 862 | return NULL; | 882 | return NULL; |
| 863 | |||
| 864 | fail_nocontext: | ||
| 865 | /* | ||
| 866 | * If init_new_context() failed, we cannot use mmput() to free the mm | ||
| 867 | * because it calls destroy_context() | ||
| 868 | */ | ||
| 869 | mm_free_pgd(mm); | ||
| 870 | free_mm(mm); | ||
| 871 | return NULL; | ||
| 872 | } | 883 | } |
| 873 | 884 | ||
| 874 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | 885 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) |
| @@ -1062,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1062 | sig->curr_target = tsk; | 1073 | sig->curr_target = tsk; |
| 1063 | init_sigpending(&sig->shared_pending); | 1074 | init_sigpending(&sig->shared_pending); |
| 1064 | INIT_LIST_HEAD(&sig->posix_timers); | 1075 | INIT_LIST_HEAD(&sig->posix_timers); |
| 1076 | seqlock_init(&sig->stats_lock); | ||
| 1065 | 1077 | ||
| 1066 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1078 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1067 | sig->real_timer.function = it_real_fn; | 1079 | sig->real_timer.function = it_real_fn; |
| @@ -1099,7 +1111,7 @@ static void copy_seccomp(struct task_struct *p) | |||
| 1099 | * needed because this new task is not yet running and cannot | 1111 | * needed because this new task is not yet running and cannot |
| 1100 | * be racing exec. | 1112 | * be racing exec. |
| 1101 | */ | 1113 | */ |
| 1102 | BUG_ON(!spin_is_locked(¤t->sighand->siglock)); | 1114 | assert_spin_locked(¤t->sighand->siglock); |
| 1103 | 1115 | ||
| 1104 | /* Ref-count the new filter user, and assign it. */ | 1116 | /* Ref-count the new filter user, and assign it. */ |
| 1105 | get_seccomp_filter(current); | 1117 | get_seccomp_filter(current); |
| @@ -1140,13 +1152,6 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
| 1140 | #endif | 1152 | #endif |
| 1141 | } | 1153 | } |
| 1142 | 1154 | ||
| 1143 | #ifdef CONFIG_MEMCG | ||
| 1144 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | ||
| 1145 | { | ||
| 1146 | mm->owner = p; | ||
| 1147 | } | ||
| 1148 | #endif /* CONFIG_MEMCG */ | ||
| 1149 | |||
| 1150 | /* | 1155 | /* |
| 1151 | * Initialize POSIX timer handling for a single task. | 1156 | * Initialize POSIX timer handling for a single task. |
| 1152 | */ | 1157 | */ |
| @@ -1346,10 +1351,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1346 | #ifdef CONFIG_DEBUG_MUTEXES | 1351 | #ifdef CONFIG_DEBUG_MUTEXES |
| 1347 | p->blocked_on = NULL; /* not blocked yet */ | 1352 | p->blocked_on = NULL; /* not blocked yet */ |
| 1348 | #endif | 1353 | #endif |
| 1349 | #ifdef CONFIG_MEMCG | ||
| 1350 | p->memcg_batch.do_batch = 0; | ||
| 1351 | p->memcg_batch.memcg = NULL; | ||
| 1352 | #endif | ||
| 1353 | #ifdef CONFIG_BCACHE | 1354 | #ifdef CONFIG_BCACHE |
| 1354 | p->sequential_io = 0; | 1355 | p->sequential_io = 0; |
| 1355 | p->sequential_io_avg = 0; | 1356 | p->sequential_io_avg = 0; |
| @@ -1365,8 +1366,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1365 | goto bad_fork_cleanup_policy; | 1366 | goto bad_fork_cleanup_policy; |
| 1366 | retval = audit_alloc(p); | 1367 | retval = audit_alloc(p); |
| 1367 | if (retval) | 1368 | if (retval) |
| 1368 | goto bad_fork_cleanup_policy; | 1369 | goto bad_fork_cleanup_perf; |
| 1369 | /* copy all the process information */ | 1370 | /* copy all the process information */ |
| 1371 | shm_init_task(p); | ||
| 1370 | retval = copy_semundo(clone_flags, p); | 1372 | retval = copy_semundo(clone_flags, p); |
| 1371 | if (retval) | 1373 | if (retval) |
| 1372 | goto bad_fork_cleanup_audit; | 1374 | goto bad_fork_cleanup_audit; |
| @@ -1570,8 +1572,9 @@ bad_fork_cleanup_semundo: | |||
| 1570 | exit_sem(p); | 1572 | exit_sem(p); |
| 1571 | bad_fork_cleanup_audit: | 1573 | bad_fork_cleanup_audit: |
| 1572 | audit_free(p); | 1574 | audit_free(p); |
| 1573 | bad_fork_cleanup_policy: | 1575 | bad_fork_cleanup_perf: |
| 1574 | perf_event_free_task(p); | 1576 | perf_event_free_task(p); |
| 1577 | bad_fork_cleanup_policy: | ||
| 1575 | #ifdef CONFIG_NUMA | 1578 | #ifdef CONFIG_NUMA |
| 1576 | mpol_put(p->mempolicy); | 1579 | mpol_put(p->mempolicy); |
| 1577 | bad_fork_cleanup_threadgroup_lock: | 1580 | bad_fork_cleanup_threadgroup_lock: |
| @@ -1918,6 +1921,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1918 | */ | 1921 | */ |
| 1919 | exit_sem(current); | 1922 | exit_sem(current); |
| 1920 | } | 1923 | } |
| 1924 | if (unshare_flags & CLONE_NEWIPC) { | ||
| 1925 | /* Orphan segments in old ns (see sem above). */ | ||
| 1926 | exit_shm(current); | ||
| 1927 | shm_init_task(current); | ||
| 1928 | } | ||
| 1921 | 1929 | ||
| 1922 | if (new_nsproxy) | 1930 | if (new_nsproxy) |
| 1923 | switch_task_namespaces(current, new_nsproxy); | 1931 | switch_task_namespaces(current, new_nsproxy); |
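mm_init() now absorbs init_new_context() and unwinds with reverse-order goto labels, which is why the separate fail_nocontext path in dup_mm() could be deleted. A standalone C sketch of that idiom; the demo_* names are placeholders, not kernel interfaces:

#include <stdlib.h>

struct demo_ctx {
        void *pgd;      /* stands in for the page-global directory */
        void *asid;     /* stands in for the arch mmu context */
};

static struct demo_ctx *demo_ctx_create(void)
{
        struct demo_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                return NULL;

        ctx->pgd = malloc(64);          /* plays the role of mm_alloc_pgd() */
        if (!ctx->pgd)
                goto fail_nopgd;

        ctx->asid = malloc(16);         /* plays the role of init_new_context() */
        if (!ctx->asid)
                goto fail_nocontext;

        return ctx;

fail_nocontext:
        free(ctx->pgd);                 /* undo only the steps that succeeded */
fail_nopgd:
        free(ctx);
        return NULL;
}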
diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) | |||
| 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) | 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
| 43 | return false; | 43 | return false; |
| 44 | 44 | ||
| 45 | if (test_thread_flag(TIF_MEMDIE)) | ||
| 46 | return false; | ||
| 47 | |||
| 45 | if (pm_nosig_freezing || cgroup_freezing(p)) | 48 | if (pm_nosig_freezing || cgroup_freezing(p)) |
| 46 | return true; | 49 | return true; |
| 47 | 50 | ||
| @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) | |||
| 147 | { | 150 | { |
| 148 | unsigned long flags; | 151 | unsigned long flags; |
| 149 | 152 | ||
| 150 | /* | ||
| 151 | * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to | ||
| 152 | * be visible to @p as waking up implies wmb. Waking up inside | ||
| 153 | * freezer_lock also prevents wakeups from leaking outside | ||
| 154 | * refrigerator. | ||
| 155 | */ | ||
| 156 | spin_lock_irqsave(&freezer_lock, flags); | 153 | spin_lock_irqsave(&freezer_lock, flags); |
| 157 | if (frozen(p)) | 154 | if (frozen(p)) |
| 158 | wake_up_process(p); | 155 | wake_up_process(p); |
diff --git a/kernel/futex.c b/kernel/futex.c index d3a9d946d0b7..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -143,9 +143,8 @@ | |||
| 143 | * | 143 | * |
| 144 | * Where (A) orders the waiters increment and the futex value read through | 144 | * Where (A) orders the waiters increment and the futex value read through |
| 145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write | 145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write |
| 146 | * to futex and the waiters read -- this is done by the barriers in | 146 | * to futex and the waiters read -- this is done by the barriers for both |
| 147 | * get_futex_key_refs(), through either ihold or atomic_inc, depending on the | 147 | * shared and private futexes in get_futex_key_refs(). |
| 148 | * futex type. | ||
| 149 | * | 148 | * |
| 150 | * This yields the following case (where X:=waiters, Y:=futex): | 149 | * This yields the following case (where X:=waiters, Y:=futex): |
| 151 | * | 150 | * |
| @@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key) | |||
| 343 | case FUT_OFF_MMSHARED: | 342 | case FUT_OFF_MMSHARED: |
| 344 | futex_get_mm(key); /* implies MB (B) */ | 343 | futex_get_mm(key); /* implies MB (B) */ |
| 345 | break; | 344 | break; |
| 345 | default: | ||
| 346 | /* | ||
| 347 | * Private futexes do not hold reference on an inode or | ||
| 348 | * mm, therefore the only purpose of calling get_futex_key_refs | ||
| 349 | * is because we need the barrier for the lockless waiter check. | ||
| 350 | */ | ||
| 351 | smp_mb(); /* explicit MB (B) */ | ||
| 346 | } | 352 | } |
| 347 | } | 353 | } |
| 348 | 354 | ||
| 349 | /* | 355 | /* |
| 350 | * Drop a reference to the resource addressed by a key. | 356 | * Drop a reference to the resource addressed by a key. |
| 351 | * The hash bucket spinlock must not be held. | 357 | * The hash bucket spinlock must not be held. This is |
| 358 | * a no-op for private futexes, see comment in the get | ||
| 359 | * counterpart. | ||
| 352 | */ | 360 | */ |
| 353 | static void drop_futex_key_refs(union futex_key *key) | 361 | static void drop_futex_key_refs(union futex_key *key) |
| 354 | { | 362 | { |
| @@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) | |||
| 639 | return pi_state; | 647 | return pi_state; |
| 640 | } | 648 | } |
| 641 | 649 | ||
| 650 | /* | ||
| 651 | * Must be called with the hb lock held. | ||
| 652 | */ | ||
| 642 | static void free_pi_state(struct futex_pi_state *pi_state) | 653 | static void free_pi_state(struct futex_pi_state *pi_state) |
| 643 | { | 654 | { |
| 655 | if (!pi_state) | ||
| 656 | return; | ||
| 657 | |||
| 644 | if (!atomic_dec_and_test(&pi_state->refcount)) | 658 | if (!atomic_dec_and_test(&pi_state->refcount)) |
| 645 | return; | 659 | return; |
| 646 | 660 | ||
| @@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
| 1519 | } | 1533 | } |
| 1520 | 1534 | ||
| 1521 | retry: | 1535 | retry: |
| 1522 | if (pi_state != NULL) { | ||
| 1523 | /* | ||
| 1524 | * We will have to lookup the pi_state again, so free this one | ||
| 1525 | * to keep the accounting correct. | ||
| 1526 | */ | ||
| 1527 | free_pi_state(pi_state); | ||
| 1528 | pi_state = NULL; | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); | 1536 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
| 1532 | if (unlikely(ret != 0)) | 1537 | if (unlikely(ret != 0)) |
| 1533 | goto out; | 1538 | goto out; |
| @@ -1617,6 +1622,8 @@ retry_private: | |||
| 1617 | case 0: | 1622 | case 0: |
| 1618 | break; | 1623 | break; |
| 1619 | case -EFAULT: | 1624 | case -EFAULT: |
| 1625 | free_pi_state(pi_state); | ||
| 1626 | pi_state = NULL; | ||
| 1620 | double_unlock_hb(hb1, hb2); | 1627 | double_unlock_hb(hb1, hb2); |
| 1621 | hb_waiters_dec(hb2); | 1628 | hb_waiters_dec(hb2); |
| 1622 | put_futex_key(&key2); | 1629 | put_futex_key(&key2); |
| @@ -1632,6 +1639,8 @@ retry_private: | |||
| 1632 | * exit to complete. | 1639 | * exit to complete. |
| 1633 | * - The user space value changed. | 1640 | * - The user space value changed. |
| 1634 | */ | 1641 | */ |
| 1642 | free_pi_state(pi_state); | ||
| 1643 | pi_state = NULL; | ||
| 1635 | double_unlock_hb(hb1, hb2); | 1644 | double_unlock_hb(hb1, hb2); |
| 1636 | hb_waiters_dec(hb2); | 1645 | hb_waiters_dec(hb2); |
| 1637 | put_futex_key(&key2); | 1646 | put_futex_key(&key2); |
| @@ -1708,6 +1717,7 @@ retry_private: | |||
| 1708 | } | 1717 | } |
| 1709 | 1718 | ||
| 1710 | out_unlock: | 1719 | out_unlock: |
| 1720 | free_pi_state(pi_state); | ||
| 1711 | double_unlock_hb(hb1, hb2); | 1721 | double_unlock_hb(hb1, hb2); |
| 1712 | hb_waiters_dec(hb2); | 1722 | hb_waiters_dec(hb2); |
| 1713 | 1723 | ||
| @@ -1725,8 +1735,6 @@ out_put_keys: | |||
| 1725 | out_put_key1: | 1735 | out_put_key1: |
| 1726 | put_futex_key(&key1); | 1736 | put_futex_key(&key1); |
| 1727 | out: | 1737 | out: |
| 1728 | if (pi_state != NULL) | ||
| 1729 | free_pi_state(pi_state); | ||
| 1730 | return ret ? ret : task_count; | 1738 | return ret ? ret : task_count; |
| 1731 | } | 1739 | } |
| 1732 | 1740 | ||
| @@ -2592,6 +2600,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2592 | * shared futexes. We need to compare the keys: | 2600 | * shared futexes. We need to compare the keys: |
| 2593 | */ | 2601 | */ |
| 2594 | if (match_futex(&q.key, &key2)) { | 2602 | if (match_futex(&q.key, &key2)) { |
| 2603 | queue_unlock(hb); | ||
| 2595 | ret = -EINVAL; | 2604 | ret = -EINVAL; |
| 2596 | goto out_put_keys; | 2605 | goto out_put_keys; |
| 2597 | } | 2606 | } |
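Editor's note: the requeue hunk above drops the scattered `if (pi_state != NULL)` checks because free_pi_state() now tolerates a NULL argument, so every error and exit path can release unconditionally at `out_unlock`. Below is a minimal userspace sketch of that NULL-tolerant release pattern; the `pi_obj` type and `pi_put()` helper are invented for illustration and are not the kernel API.

```c
#include <stdio.h>
#include <stdlib.h>

struct pi_obj {
	int refcount;
};

/* Tolerate NULL so every exit path can call this unconditionally. */
static void pi_put(struct pi_obj *p)
{
	if (!p)
		return;
	if (--p->refcount > 0)
		return;
	free(p);
}

static int do_requeue(int fail_early)
{
	struct pi_obj *state = NULL;
	int ret = 0;

	if (fail_early) {
		ret = -1;
		goto out;		/* state is still NULL here */
	}
	state = calloc(1, sizeof(*state));
	if (!state)
		return -1;
	state->refcount = 1;
	/* ... the real requeue work would go here ... */
out:
	pi_put(state);			/* no NULL check at the exit label */
	return ret;
}

int main(void)
{
	printf("%d %d\n", do_requeue(1), do_requeue(0));
	return 0;
}
```

Making the release helper NULL-safe is what allows the single cleanup call at the common exit label instead of per-path conditionals.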
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8ac4399..3b7408759bdf 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -35,7 +35,7 @@ config GCOV_KERNEL | |||
| 35 | config GCOV_PROFILE_ALL | 35 | config GCOV_PROFILE_ALL |
| 36 | bool "Profile entire Kernel" | 36 | bool "Profile entire Kernel" |
| 37 | depends on GCOV_KERNEL | 37 | depends on GCOV_KERNEL |
| 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE | 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 |
| 39 | default n | 39 | default n |
| 40 | ---help--- | 40 | ---help--- |
| 41 | This option activates profiling for the entire kernel. | 41 | This option activates profiling for the entire kernel. |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
| @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) | |||
| 784 | 784 | ||
| 785 | err_remove: | 785 | err_remove: |
| 786 | pr_err("init failed\n"); | 786 | pr_err("init failed\n"); |
| 787 | if (root_node.dentry) | 787 | debugfs_remove(root_node.dentry); |
| 788 | debugfs_remove(root_node.dentry); | ||
| 789 | 788 | ||
| 790 | return rc; | 789 | return rc; |
| 791 | } | 790 | } |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..225086b2652e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP | |||
| 55 | config IRQ_DOMAIN | 55 | config IRQ_DOMAIN |
| 56 | bool | 56 | bool |
| 57 | 57 | ||
| 58 | config HANDLE_DOMAIN_IRQ | ||
| 59 | bool | ||
| 60 | |||
| 58 | config IRQ_DOMAIN_DEBUG | 61 | config IRQ_DOMAIN_DEBUG |
| 59 | bool "Expose hardware/virtual IRQ mapping via debugfs" | 62 | bool "Expose hardware/virtual IRQ mapping via debugfs" |
| 60 | depends on IRQ_DOMAIN && DEBUG_FS | 63 | depends on IRQ_DOMAIN && DEBUG_FS |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..e5202f00cabc 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -342,6 +342,31 @@ static bool irq_check_poll(struct irq_desc *desc) | |||
| 342 | return irq_wait_for_poll(desc); | 342 | return irq_wait_for_poll(desc); |
| 343 | } | 343 | } |
| 344 | 344 | ||
| 345 | static bool irq_may_run(struct irq_desc *desc) | ||
| 346 | { | ||
| 347 | unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; | ||
| 348 | |||
| 349 | /* | ||
| 350 | * If the interrupt is not in progress and is not an armed | ||
| 351 | * wakeup interrupt, proceed. | ||
| 352 | */ | ||
| 353 | if (!irqd_has_set(&desc->irq_data, mask)) | ||
| 354 | return true; | ||
| 355 | |||
| 356 | /* | ||
| 357 | * If the interrupt is an armed wakeup source, mark it pending | ||
| 358 | * and suspended, disable it and notify the pm core about the | ||
| 359 | * event. | ||
| 360 | */ | ||
| 361 | if (irq_pm_check_wakeup(desc)) | ||
| 362 | return false; | ||
| 363 | |||
| 364 | /* | ||
| 365 | * Handle a potential concurrent poll on a different core. | ||
| 366 | */ | ||
| 367 | return irq_check_poll(desc); | ||
| 368 | } | ||
| 369 | |||
| 345 | /** | 370 | /** |
| 346 | * handle_simple_irq - Simple and software-decoded IRQs. | 371 | * handle_simple_irq - Simple and software-decoded IRQs. |
| 347 | * @irq: the interrupt number | 372 | * @irq: the interrupt number |
| @@ -359,9 +384,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
| 359 | { | 384 | { |
| 360 | raw_spin_lock(&desc->lock); | 385 | raw_spin_lock(&desc->lock); |
| 361 | 386 | ||
| 362 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 387 | if (!irq_may_run(desc)) |
| 363 | if (!irq_check_poll(desc)) | 388 | goto out_unlock; |
| 364 | goto out_unlock; | ||
| 365 | 389 | ||
| 366 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 390 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 367 | kstat_incr_irqs_this_cpu(irq, desc); | 391 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -412,9 +436,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
| 412 | raw_spin_lock(&desc->lock); | 436 | raw_spin_lock(&desc->lock); |
| 413 | mask_ack_irq(desc); | 437 | mask_ack_irq(desc); |
| 414 | 438 | ||
| 415 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 439 | if (!irq_may_run(desc)) |
| 416 | if (!irq_check_poll(desc)) | 440 | goto out_unlock; |
| 417 | goto out_unlock; | ||
| 418 | 441 | ||
| 419 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 442 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 420 | kstat_incr_irqs_this_cpu(irq, desc); | 443 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -485,9 +508,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
| 485 | 508 | ||
| 486 | raw_spin_lock(&desc->lock); | 509 | raw_spin_lock(&desc->lock); |
| 487 | 510 | ||
| 488 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 511 | if (!irq_may_run(desc)) |
| 489 | if (!irq_check_poll(desc)) | 512 | goto out; |
| 490 | goto out; | ||
| 491 | 513 | ||
| 492 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 514 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 493 | kstat_incr_irqs_this_cpu(irq, desc); | 515 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -517,6 +539,7 @@ out: | |||
| 517 | chip->irq_eoi(&desc->irq_data); | 539 | chip->irq_eoi(&desc->irq_data); |
| 518 | raw_spin_unlock(&desc->lock); | 540 | raw_spin_unlock(&desc->lock); |
| 519 | } | 541 | } |
| 542 | EXPORT_SYMBOL_GPL(handle_fasteoi_irq); | ||
| 520 | 543 | ||
| 521 | /** | 544 | /** |
| 522 | * handle_edge_irq - edge type IRQ handler | 545 | * handle_edge_irq - edge type IRQ handler |
| @@ -540,19 +563,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
| 540 | raw_spin_lock(&desc->lock); | 563 | raw_spin_lock(&desc->lock); |
| 541 | 564 | ||
| 542 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 565 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 566 | |||
| 567 | if (!irq_may_run(desc)) { | ||
| 568 | desc->istate |= IRQS_PENDING; | ||
| 569 | mask_ack_irq(desc); | ||
| 570 | goto out_unlock; | ||
| 571 | } | ||
| 572 | |||
| 543 | /* | 573 | /* |
| 544 | * If we're currently running this IRQ, or it's disabled, | 574 | * If it's disabled or no action is available then mask it and get |
| 545 | * we shouldn't process the IRQ. Mark it pending, handle | 575 | * out of here. |
| 546 | * the necessary masking and go out | ||
| 547 | */ | 576 | */ |
| 548 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | 577 | if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { |
| 549 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | 578 | desc->istate |= IRQS_PENDING; |
| 550 | if (!irq_check_poll(desc)) { | 579 | mask_ack_irq(desc); |
| 551 | desc->istate |= IRQS_PENDING; | 580 | goto out_unlock; |
| 552 | mask_ack_irq(desc); | ||
| 553 | goto out_unlock; | ||
| 554 | } | ||
| 555 | } | 581 | } |
| 582 | |||
| 556 | kstat_incr_irqs_this_cpu(irq, desc); | 583 | kstat_incr_irqs_this_cpu(irq, desc); |
| 557 | 584 | ||
| 558 | /* Start handling the irq */ | 585 | /* Start handling the irq */ |
| @@ -601,18 +628,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) | |||
| 601 | raw_spin_lock(&desc->lock); | 628 | raw_spin_lock(&desc->lock); |
| 602 | 629 | ||
| 603 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 630 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 631 | |||
| 632 | if (!irq_may_run(desc)) { | ||
| 633 | desc->istate |= IRQS_PENDING; | ||
| 634 | goto out_eoi; | ||
| 635 | } | ||
| 636 | |||
| 604 | /* | 637 | /* |
| 605 | * If we're currently running this IRQ, or it's disabled, | 638 | * If it's disabled or no action is available then mask it and get |
| 606 | * we shouldn't process the IRQ. Mark it pending, handle | 639 | * out of here. |
| 607 | * the necessary masking and go out | ||
| 608 | */ | 640 | */ |
| 609 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | 641 | if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { |
| 610 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | 642 | desc->istate |= IRQS_PENDING; |
| 611 | if (!irq_check_poll(desc)) { | 643 | goto out_eoi; |
| 612 | desc->istate |= IRQS_PENDING; | ||
| 613 | goto out_eoi; | ||
| 614 | } | ||
| 615 | } | 644 | } |
| 645 | |||
| 616 | kstat_incr_irqs_this_cpu(irq, desc); | 646 | kstat_incr_irqs_this_cpu(irq, desc); |
| 617 | 647 | ||
| 618 | do { | 648 | do { |
| @@ -669,7 +699,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) | |||
| 669 | { | 699 | { |
| 670 | struct irq_chip *chip = irq_desc_get_chip(desc); | 700 | struct irq_chip *chip = irq_desc_get_chip(desc); |
| 671 | struct irqaction *action = desc->action; | 701 | struct irqaction *action = desc->action; |
| 672 | void *dev_id = __this_cpu_ptr(action->percpu_dev_id); | 702 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); |
| 673 | irqreturn_t res; | 703 | irqreturn_t res; |
| 674 | 704 | ||
| 675 | kstat_incr_irqs_this_cpu(irq, desc); | 705 | kstat_incr_irqs_this_cpu(irq, desc); |
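Editor's note: each flow handler above previously open-coded the "in progress, maybe poll" test; the new irq_may_run() folds the IRQD_IRQ_INPROGRESS and IRQD_WAKEUP_ARMED checks into one helper shared by all of them. Here is a small, self-contained sketch of the same consolidation; the flag names and stand-in helpers are made up, not the real irqd_* accessors.

```c
#include <stdbool.h>
#include <stdio.h>

#define ST_INPROGRESS	0x1
#define ST_WAKEUP_ARMED	0x2

struct desc {
	unsigned int state;
	bool wakeup_pending;	/* set when an armed wakeup fires */
};

/* Stand-in for irq_pm_check_wakeup(): disarm, mark pending, abort run. */
static bool check_wakeup(struct desc *d)
{
	if (d->state & ST_WAKEUP_ARMED) {
		d->state &= ~ST_WAKEUP_ARMED;
		d->wakeup_pending = true;
		return true;
	}
	return false;
}

/* Stand-in for irq_check_poll(): pretend polling never succeeds here. */
static bool check_poll(struct desc *d)
{
	(void)d;
	return false;
}

/* One helper replaces the duplicated nested checks in every handler. */
static bool may_run(struct desc *d)
{
	if (!(d->state & (ST_INPROGRESS | ST_WAKEUP_ARMED)))
		return true;
	if (check_wakeup(d))
		return false;
	return check_poll(d);
}

int main(void)
{
	struct desc idle = { 0, false };
	struct desc armed = { ST_WAKEUP_ARMED, false };
	bool run_idle = may_run(&idle);
	bool run_armed = may_run(&armed);

	printf("idle: %d, armed: %d (wakeup_pending=%d)\n",
	       run_idle, run_armed, armed.wakeup_pending);
	return 0;
}
```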
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 099ea2e0eb88..4332d766619d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -63,8 +63,8 @@ enum { | |||
| 63 | 63 | ||
| 64 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 64 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
| 65 | unsigned long flags); | 65 | unsigned long flags); |
| 66 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 66 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq); |
| 67 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 67 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq); |
| 68 | 68 | ||
| 69 | extern int irq_startup(struct irq_desc *desc, bool resend); | 69 | extern int irq_startup(struct irq_desc *desc, bool resend); |
| 70 | extern void irq_shutdown(struct irq_desc *desc); | 70 | extern void irq_shutdown(struct irq_desc *desc); |
| @@ -194,3 +194,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d | |||
| 194 | __this_cpu_inc(*desc->kstat_irqs); | 194 | __this_cpu_inc(*desc->kstat_irqs); |
| 195 | __this_cpu_inc(kstat.irqs_sum); | 195 | __this_cpu_inc(kstat.irqs_sum); |
| 196 | } | 196 | } |
| 197 | |||
| 198 | #ifdef CONFIG_PM_SLEEP | ||
| 199 | bool irq_pm_check_wakeup(struct irq_desc *desc); | ||
| 200 | void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); | ||
| 201 | void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); | ||
| 202 | #else | ||
| 203 | static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } | ||
| 204 | static inline void | ||
| 205 | irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } | ||
| 206 | static inline void | ||
| 207 | irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } | ||
| 208 | #endif | ||
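Editor's note: the new declarations follow the usual kernel idiom of real prototypes under the config option and empty `static inline` stubs otherwise, so callers such as __setup_irq() never need their own #ifdefs. A compile-time sketch of the pattern, using a hypothetical FEATURE_PM macro and built with the macro undefined so the stubs are used:

```c
#include <stdio.h>

struct desc { int depth; };

#ifdef FEATURE_PM
/* Real implementations would live in a separate, conditionally built file. */
int pm_check_wakeup(struct desc *d);
void pm_install(struct desc *d);
#else
/* Stubs compile away; call sites stay unconditional. */
static inline int pm_check_wakeup(struct desc *d) { (void)d; return 0; }
static inline void pm_install(struct desc *d) { (void)d; }
#endif

int main(void)
{
	struct desc d = { 0 };

	pm_install(&d);			/* no #ifdef needed at the call site */
	printf("wakeup: %d\n", pm_check_wakeup(&d));
	return 0;
}
```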
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..a1782f88f0af 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
| 15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
| 16 | #include <linux/bitmap.h> | 16 | #include <linux/bitmap.h> |
| 17 | #include <linux/irqdomain.h> | ||
| 17 | 18 | ||
| 18 | #include "internals.h" | 19 | #include "internals.h" |
| 19 | 20 | ||
| @@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq) | |||
| 336 | } | 337 | } |
| 337 | EXPORT_SYMBOL_GPL(generic_handle_irq); | 338 | EXPORT_SYMBOL_GPL(generic_handle_irq); |
| 338 | 339 | ||
| 340 | #ifdef CONFIG_HANDLE_DOMAIN_IRQ | ||
| 341 | /** | ||
| 342 | * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain | ||
| 343 | * @domain: The domain where to perform the lookup | ||
| 344 | * @hwirq: The HW irq number to convert to a logical one | ||
| 345 | * @lookup: Whether to perform the domain lookup or not | ||
| 346 | * @regs: Register file coming from the low-level handling code | ||
| 347 | * | ||
| 348 | * Returns: 0 on success, or -EINVAL if conversion has failed | ||
| 349 | */ | ||
| 350 | int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, | ||
| 351 | bool lookup, struct pt_regs *regs) | ||
| 352 | { | ||
| 353 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
| 354 | unsigned int irq = hwirq; | ||
| 355 | int ret = 0; | ||
| 356 | |||
| 357 | irq_enter(); | ||
| 358 | |||
| 359 | #ifdef CONFIG_IRQ_DOMAIN | ||
| 360 | if (lookup) | ||
| 361 | irq = irq_find_mapping(domain, hwirq); | ||
| 362 | #endif | ||
| 363 | |||
| 364 | /* | ||
| 365 | * Some hardware gives randomly wrong interrupts. Rather | ||
| 366 | * than crashing, do something sensible. | ||
| 367 | */ | ||
| 368 | if (unlikely(!irq || irq >= nr_irqs)) { | ||
| 369 | ack_bad_irq(irq); | ||
| 370 | ret = -EINVAL; | ||
| 371 | } else { | ||
| 372 | generic_handle_irq(irq); | ||
| 373 | } | ||
| 374 | |||
| 375 | irq_exit(); | ||
| 376 | set_irq_regs(old_regs); | ||
| 377 | return ret; | ||
| 378 | } | ||
| 379 | #endif | ||
| 380 | |||
| 339 | /* Dynamic interrupt handling */ | 381 | /* Dynamic interrupt handling */ |
| 340 | 382 | ||
| 341 | /** | 383 | /** |
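Editor's note: __handle_domain_irq() above wraps the common sequence of entering IRQ context, translating a hardware irq to a Linux irq via irq_find_mapping(), range-checking the result, and either dispatching or calling ack_bad_irq(). A userspace sketch of that translate-validate-dispatch shape follows; the lookup table and handlers are invented for illustration.

```c
#include <stdio.h>

#define NR_IRQS 8

/* Hypothetical hwirq -> logical irq map, standing in for irq_find_mapping(). */
static const unsigned int domain_map[NR_IRQS] = { 0, 3, 4, 0, 0, 0, 0, 7 };

static void handle_irq(unsigned int irq)
{
	printf("dispatch irq %u\n", irq);
}

static void ack_bad(unsigned int irq)
{
	fprintf(stderr, "spurious hwirq -> irq %u\n", irq);
}

static int handle_domain_irq(unsigned int hwirq, int lookup)
{
	unsigned int irq = hwirq;

	/* irq_enter() equivalent would go here */
	if (lookup && hwirq < NR_IRQS)
		irq = domain_map[hwirq];

	if (!irq || irq >= NR_IRQS) {
		ack_bad(irq);
		return -1;		/* -EINVAL in the kernel */
	}
	handle_irq(irq);
	/* irq_exit() equivalent would go here */
	return 0;
}

int main(void)
{
	handle_domain_irq(1, 1);	/* mapped */
	handle_domain_irq(5, 1);	/* unmapped -> bad irq */
	return 0;
}
```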
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61bf06a..0a9104b4608b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
| 382 | } | 382 | } |
| 383 | #endif | 383 | #endif |
| 384 | 384 | ||
| 385 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | 385 | void __disable_irq(struct irq_desc *desc, unsigned int irq) |
| 386 | { | 386 | { |
| 387 | if (suspend) { | ||
| 388 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) | ||
| 389 | return; | ||
| 390 | desc->istate |= IRQS_SUSPENDED; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (!desc->depth++) | 387 | if (!desc->depth++) |
| 394 | irq_disable(desc); | 388 | irq_disable(desc); |
| 395 | } | 389 | } |
| @@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq) | |||
| 401 | 395 | ||
| 402 | if (!desc) | 396 | if (!desc) |
| 403 | return -EINVAL; | 397 | return -EINVAL; |
| 404 | __disable_irq(desc, irq, false); | 398 | __disable_irq(desc, irq); |
| 405 | irq_put_desc_busunlock(desc, flags); | 399 | irq_put_desc_busunlock(desc, flags); |
| 406 | return 0; | 400 | return 0; |
| 407 | } | 401 | } |
| @@ -442,20 +436,8 @@ void disable_irq(unsigned int irq) | |||
| 442 | } | 436 | } |
| 443 | EXPORT_SYMBOL(disable_irq); | 437 | EXPORT_SYMBOL(disable_irq); |
| 444 | 438 | ||
| 445 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | 439 | void __enable_irq(struct irq_desc *desc, unsigned int irq) |
| 446 | { | 440 | { |
| 447 | if (resume) { | ||
| 448 | if (!(desc->istate & IRQS_SUSPENDED)) { | ||
| 449 | if (!desc->action) | ||
| 450 | return; | ||
| 451 | if (!(desc->action->flags & IRQF_FORCE_RESUME)) | ||
| 452 | return; | ||
| 453 | /* Pretend that it got disabled ! */ | ||
| 454 | desc->depth++; | ||
| 455 | } | ||
| 456 | desc->istate &= ~IRQS_SUSPENDED; | ||
| 457 | } | ||
| 458 | |||
| 459 | switch (desc->depth) { | 441 | switch (desc->depth) { |
| 460 | case 0: | 442 | case 0: |
| 461 | err_out: | 443 | err_out: |
| @@ -497,7 +479,7 @@ void enable_irq(unsigned int irq) | |||
| 497 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | 479 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) |
| 498 | goto out; | 480 | goto out; |
| 499 | 481 | ||
| 500 | __enable_irq(desc, irq, false); | 482 | __enable_irq(desc, irq); |
| 501 | out: | 483 | out: |
| 502 | irq_put_desc_busunlock(desc, flags); | 484 | irq_put_desc_busunlock(desc, flags); |
| 503 | } | 485 | } |
| @@ -1218,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1218 | new->irq = irq; | 1200 | new->irq = irq; |
| 1219 | *old_ptr = new; | 1201 | *old_ptr = new; |
| 1220 | 1202 | ||
| 1203 | irq_pm_install_action(desc, new); | ||
| 1204 | |||
| 1221 | /* Reset broken irq detection when installing new handler */ | 1205 | /* Reset broken irq detection when installing new handler */ |
| 1222 | desc->irq_count = 0; | 1206 | desc->irq_count = 0; |
| 1223 | desc->irqs_unhandled = 0; | 1207 | desc->irqs_unhandled = 0; |
| @@ -1228,7 +1212,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1228 | */ | 1212 | */ |
| 1229 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { | 1213 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
| 1230 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; | 1214 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
| 1231 | __enable_irq(desc, irq, false); | 1215 | __enable_irq(desc, irq); |
| 1232 | } | 1216 | } |
| 1233 | 1217 | ||
| 1234 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1218 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| @@ -1336,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1336 | /* Found it - now remove it from the list of entries: */ | 1320 | /* Found it - now remove it from the list of entries: */ |
| 1337 | *action_ptr = action->next; | 1321 | *action_ptr = action->next; |
| 1338 | 1322 | ||
| 1323 | irq_pm_remove_action(desc, action); | ||
| 1324 | |||
| 1339 | /* If this was the last handler, shut down the IRQ line: */ | 1325 | /* If this was the last handler, shut down the IRQ line: */ |
| 1340 | if (!desc->action) { | 1326 | if (!desc->action) { |
| 1341 | irq_shutdown(desc); | 1327 | irq_shutdown(desc); |
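Editor's note: __setup_irq() and __free_irq() now call irq_pm_install_action() and irq_pm_remove_action(), keeping per-descriptor nr_actions, no_suspend_depth and force_resume_depth counters up to date, so the suspend path can decide per line without walking the action chain. A sketch of that bookkeeping with simplified types (the flag values and helper names are invented):

```c
#include <stdio.h>

#define F_NO_SUSPEND	0x1
#define F_FORCE_RESUME	0x2

struct desc {
	int nr_actions;
	int no_suspend_depth;
	int force_resume_depth;
};

static void install_action(struct desc *d, unsigned int flags)
{
	d->nr_actions++;
	if (flags & F_NO_SUSPEND)
		d->no_suspend_depth++;
	if (flags & F_FORCE_RESUME)
		d->force_resume_depth++;
}

static void remove_action(struct desc *d, unsigned int flags)
{
	d->nr_actions--;
	if (flags & F_NO_SUSPEND)
		d->no_suspend_depth--;
	if (flags & F_FORCE_RESUME)
		d->force_resume_depth--;
}

/* O(1) decision at suspend time: skip the line if any action opted out. */
static int should_suspend(const struct desc *d)
{
	return d->nr_actions && !d->no_suspend_depth;
}

int main(void)
{
	struct desc d = { 0, 0, 0 };

	install_action(&d, 0);
	install_action(&d, F_NO_SUSPEND);	/* shared line, one opt-out */
	printf("suspend? %d\n", should_suspend(&d));
	remove_action(&d, F_NO_SUSPEND);
	printf("suspend? %d\n", should_suspend(&d));
	return 0;
}
```

The kernel version additionally warns when the counters disagree across a shared line, i.e. when only some actions on one irq set IRQF_NO_SUSPEND or IRQF_FORCE_RESUME.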
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index abcd6ca86cb7..3ca532592704 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
| @@ -9,17 +9,105 @@ | |||
| 9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
| 10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | #include <linux/suspend.h> | ||
| 12 | #include <linux/syscore_ops.h> | 13 | #include <linux/syscore_ops.h> |
| 13 | 14 | ||
| 14 | #include "internals.h" | 15 | #include "internals.h" |
| 15 | 16 | ||
| 17 | bool irq_pm_check_wakeup(struct irq_desc *desc) | ||
| 18 | { | ||
| 19 | if (irqd_is_wakeup_armed(&desc->irq_data)) { | ||
| 20 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 21 | desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; | ||
| 22 | desc->depth++; | ||
| 23 | irq_disable(desc); | ||
| 24 | pm_system_wakeup(); | ||
| 25 | return true; | ||
| 26 | } | ||
| 27 | return false; | ||
| 28 | } | ||
| 29 | |||
| 30 | /* | ||
| 31 | * Called from __setup_irq() with desc->lock held after @action has | ||
| 32 | * been installed in the action chain. | ||
| 33 | */ | ||
| 34 | void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) | ||
| 35 | { | ||
| 36 | desc->nr_actions++; | ||
| 37 | |||
| 38 | if (action->flags & IRQF_FORCE_RESUME) | ||
| 39 | desc->force_resume_depth++; | ||
| 40 | |||
| 41 | WARN_ON_ONCE(desc->force_resume_depth && | ||
| 42 | desc->force_resume_depth != desc->nr_actions); | ||
| 43 | |||
| 44 | if (action->flags & IRQF_NO_SUSPEND) | ||
| 45 | desc->no_suspend_depth++; | ||
| 46 | |||
| 47 | WARN_ON_ONCE(desc->no_suspend_depth && | ||
| 48 | desc->no_suspend_depth != desc->nr_actions); | ||
| 49 | } | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Called from __free_irq() with desc->lock held after @action has | ||
| 53 | * been removed from the action chain. | ||
| 54 | */ | ||
| 55 | void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) | ||
| 56 | { | ||
| 57 | desc->nr_actions--; | ||
| 58 | |||
| 59 | if (action->flags & IRQF_FORCE_RESUME) | ||
| 60 | desc->force_resume_depth--; | ||
| 61 | |||
| 62 | if (action->flags & IRQF_NO_SUSPEND) | ||
| 63 | desc->no_suspend_depth--; | ||
| 64 | } | ||
| 65 | |||
| 66 | static bool suspend_device_irq(struct irq_desc *desc, int irq) | ||
| 67 | { | ||
| 68 | if (!desc->action || desc->no_suspend_depth) | ||
| 69 | return false; | ||
| 70 | |||
| 71 | if (irqd_is_wakeup_set(&desc->irq_data)) { | ||
| 72 | irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 73 | /* | ||
| 74 | * We return true here to force the caller to issue | ||
| 75 | * synchronize_irq(). We need to make sure that the | ||
| 76 | * IRQD_WAKEUP_ARMED is visible before we return from | ||
| 77 | * suspend_device_irqs(). | ||
| 78 | */ | ||
| 79 | return true; | ||
| 80 | } | ||
| 81 | |||
| 82 | desc->istate |= IRQS_SUSPENDED; | ||
| 83 | __disable_irq(desc, irq); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Hardware which has no wakeup source configuration facility | ||
| 87 | * requires that the non wakeup interrupts are masked at the | ||
| 88 | * chip level. The chip implementation indicates that with | ||
| 89 | * IRQCHIP_MASK_ON_SUSPEND. | ||
| 90 | */ | ||
| 91 | if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
| 92 | mask_irq(desc); | ||
| 93 | return true; | ||
| 94 | } | ||
| 95 | |||
| 16 | /** | 96 | /** |
| 17 | * suspend_device_irqs - disable all currently enabled interrupt lines | 97 | * suspend_device_irqs - disable all currently enabled interrupt lines |
| 18 | * | 98 | * |
| 19 | * During system-wide suspend or hibernation device drivers need to be prevented | 99 | * During system-wide suspend or hibernation device drivers need to be |
| 20 | * from receiving interrupts and this function is provided for this purpose. | 100 | * prevented from receiving interrupts and this function is provided |
| 21 | * It marks all interrupt lines in use, except for the timer ones, as disabled | 101 | * for this purpose. |
| 22 | * and sets the IRQS_SUSPENDED flag for each of them. | 102 | * |
| 103 | * So we disable all interrupts and mark them IRQS_SUSPENDED except | ||
| 104 | * for those which are unused, those which are marked as not | ||
| 105 | * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND | ||
| 106 | * set and those which are marked as active wakeup sources. | ||
| 107 | * | ||
| 108 | * The active wakeup sources are handled by the flow handler entry | ||
| 109 | * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the | ||
| 110 | * interrupt and notifies the pm core about the wakeup. | ||
| 23 | */ | 111 | */ |
| 24 | void suspend_device_irqs(void) | 112 | void suspend_device_irqs(void) |
| 25 | { | 113 | { |
| @@ -28,18 +116,36 @@ void suspend_device_irqs(void) | |||
| 28 | 116 | ||
| 29 | for_each_irq_desc(irq, desc) { | 117 | for_each_irq_desc(irq, desc) { |
| 30 | unsigned long flags; | 118 | unsigned long flags; |
| 119 | bool sync; | ||
| 31 | 120 | ||
| 32 | raw_spin_lock_irqsave(&desc->lock, flags); | 121 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 33 | __disable_irq(desc, irq, true); | 122 | sync = suspend_device_irq(desc, irq); |
| 34 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 123 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 35 | } | ||
| 36 | 124 | ||
| 37 | for_each_irq_desc(irq, desc) | 125 | if (sync) |
| 38 | if (desc->istate & IRQS_SUSPENDED) | ||
| 39 | synchronize_irq(irq); | 126 | synchronize_irq(irq); |
| 127 | } | ||
| 40 | } | 128 | } |
| 41 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 129 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
| 42 | 130 | ||
| 131 | static void resume_irq(struct irq_desc *desc, int irq) | ||
| 132 | { | ||
| 133 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 134 | |||
| 135 | if (desc->istate & IRQS_SUSPENDED) | ||
| 136 | goto resume; | ||
| 137 | |||
| 138 | /* Force resume the interrupt? */ | ||
| 139 | if (!desc->force_resume_depth) | ||
| 140 | return; | ||
| 141 | |||
| 142 | /* Pretend that it got disabled ! */ | ||
| 143 | desc->depth++; | ||
| 144 | resume: | ||
| 145 | desc->istate &= ~IRQS_SUSPENDED; | ||
| 146 | __enable_irq(desc, irq); | ||
| 147 | } | ||
| 148 | |||
| 43 | static void resume_irqs(bool want_early) | 149 | static void resume_irqs(bool want_early) |
| 44 | { | 150 | { |
| 45 | struct irq_desc *desc; | 151 | struct irq_desc *desc; |
| @@ -54,7 +160,7 @@ static void resume_irqs(bool want_early) | |||
| 54 | continue; | 160 | continue; |
| 55 | 161 | ||
| 56 | raw_spin_lock_irqsave(&desc->lock, flags); | 162 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 57 | __enable_irq(desc, irq, true); | 163 | resume_irq(desc, irq); |
| 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 164 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 59 | } | 165 | } |
| 60 | } | 166 | } |
| @@ -93,38 +199,3 @@ void resume_device_irqs(void) | |||
| 93 | resume_irqs(false); | 199 | resume_irqs(false); |
| 94 | } | 200 | } |
| 95 | EXPORT_SYMBOL_GPL(resume_device_irqs); | 201 | EXPORT_SYMBOL_GPL(resume_device_irqs); |
| 96 | |||
| 97 | /** | ||
| 98 | * check_wakeup_irqs - check if any wake-up interrupts are pending | ||
| 99 | */ | ||
| 100 | int check_wakeup_irqs(void) | ||
| 101 | { | ||
| 102 | struct irq_desc *desc; | ||
| 103 | int irq; | ||
| 104 | |||
| 105 | for_each_irq_desc(irq, desc) { | ||
| 106 | /* | ||
| 107 | * Only interrupts which are marked as wakeup source | ||
| 108 | * and have not been disabled before the suspend check | ||
| 109 | * can abort suspend. | ||
| 110 | */ | ||
| 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { | ||
| 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) | ||
| 113 | return -EBUSY; | ||
| 114 | continue; | ||
| 115 | } | ||
| 116 | /* | ||
| 117 | * Check the non wakeup interrupts whether they need | ||
| 118 | * to be masked before finally going into suspend | ||
| 119 | * state. That's for hardware which has no wakeup | ||
| 120 | * source configuration facility. The chip | ||
| 121 | * implementation indicates that with | ||
| 122 | * IRQCHIP_MASK_ON_SUSPEND. | ||
| 123 | */ | ||
| 124 | if (desc->istate & IRQS_SUSPENDED && | ||
| 125 | irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
| 126 | mask_irq(desc); | ||
| 127 | } | ||
| 128 | |||
| 129 | return 0; | ||
| 130 | } | ||
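Editor's note: suspend_device_irq() above replaces the old per-flag special cases with a three-way decision: leave unused or IRQF_NO_SUSPEND lines alone, arm configured wakeup sources with IRQD_WAKEUP_ARMED instead of disabling them, and disable (plus optionally mask) everything else; irq_pm_check_wakeup() then converts an armed wakeup that fires into a pending, suspended interrupt and calls pm_system_wakeup(). A compact sketch of just the decision, with invented field names standing in for the real flag accessors:

```c
#include <stdio.h>

enum action { SKIP, ARM_WAKEUP, DISABLE };

struct desc {
	int has_action;
	int no_suspend_depth;	/* IRQF_NO_SUSPEND accounting */
	int wakeup_enabled;	/* irqd_is_wakeup_set() stand-in */
};

static enum action suspend_decision(const struct desc *d)
{
	if (!d->has_action || d->no_suspend_depth)
		return SKIP;		/* leave the line alone */
	if (d->wakeup_enabled)
		return ARM_WAKEUP;	/* flow handler catches it later */
	return DISABLE;			/* plain device interrupt */
}

int main(void)
{
	const struct desc unused  = { 0, 0, 0 };
	const struct desc no_susp = { 1, 1, 0 };
	const struct desc wake    = { 1, 0, 1 };
	const struct desc plain   = { 1, 0, 0 };

	printf("%d %d %d %d\n",
	       suspend_decision(&unused), suspend_decision(&no_susp),
	       suspend_decision(&wake), suspend_decision(&plain));
	return 0;
}
```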
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..3ab9048483fa 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work) | |||
| 95 | 95 | ||
| 96 | /* If the work is "lazy", handle it from next tick if any */ | 96 | /* If the work is "lazy", handle it from next tick if any */ |
| 97 | if (work->flags & IRQ_WORK_LAZY) { | 97 | if (work->flags & IRQ_WORK_LAZY) { |
| 98 | if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && | 98 | if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && |
| 99 | tick_nohz_tick_stopped()) | 99 | tick_nohz_tick_stopped()) |
| 100 | arch_irq_work_raise(); | 100 | arch_irq_work_raise(); |
| 101 | } else { | 101 | } else { |
| 102 | if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) | 102 | if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) |
| 103 | arch_irq_work_raise(); | 103 | arch_irq_work_raise(); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| @@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void) | |||
| 113 | { | 113 | { |
| 114 | struct llist_head *raised, *lazy; | 114 | struct llist_head *raised, *lazy; |
| 115 | 115 | ||
| 116 | raised = &__get_cpu_var(raised_list); | 116 | raised = this_cpu_ptr(&raised_list); |
| 117 | lazy = &__get_cpu_var(lazy_list); | 117 | lazy = this_cpu_ptr(&lazy_list); |
| 118 | if (llist_empty(raised) && llist_empty(lazy)) | 118 | |
| 119 | return false; | 119 | if (llist_empty(raised) || arch_irq_work_has_interrupt()) |
| 120 | if (llist_empty(lazy)) | ||
| 121 | return false; | ||
| 120 | 122 | ||
| 121 | /* All work should have been flushed before going offline */ | 123 | /* All work should have been flushed before going offline */ |
| 122 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 124 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
| @@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list) | |||
| 166 | */ | 168 | */ |
| 167 | void irq_work_run(void) | 169 | void irq_work_run(void) |
| 168 | { | 170 | { |
| 169 | irq_work_run_list(&__get_cpu_var(raised_list)); | 171 | irq_work_run_list(this_cpu_ptr(&raised_list)); |
| 170 | irq_work_run_list(&__get_cpu_var(lazy_list)); | 172 | irq_work_run_list(this_cpu_ptr(&lazy_list)); |
| 171 | } | 173 | } |
| 172 | EXPORT_SYMBOL_GPL(irq_work_run); | 174 | EXPORT_SYMBOL_GPL(irq_work_run); |
| 173 | 175 | ||
| 176 | void irq_work_tick(void) | ||
| 177 | { | ||
| 178 | struct llist_head *raised = &__get_cpu_var(raised_list); | ||
| 179 | |||
| 180 | if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) | ||
| 181 | irq_work_run_list(raised); | ||
| 182 | irq_work_run_list(&__get_cpu_var(lazy_list)); | ||
| 183 | } | ||
| 184 | |||
| 174 | /* | 185 | /* |
| 175 | * Synchronize against the irq_work @entry, ensures the entry is not | 186 | * Synchronize against the irq_work @entry, ensures the entry is not |
| 176 | * currently in use. | 187 | * currently in use. |
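Editor's note: besides the __get_cpu_var() to this_cpu_ptr() conversion, the irq_work hunk adds irq_work_tick(), so architectures whose arch_irq_work_has_interrupt() reports no dedicated IRQ-work interrupt drain the raised list from the timer tick instead. A userspace sketch of that queue/drain split, using plain singly linked lists rather than per-CPU llists:

```c
#include <stdbool.h>
#include <stdio.h>

struct work {
	struct work *next;
	void (*func)(void);
};

static struct work *raised_list, *lazy_list;
static bool has_irq_work_interrupt;	/* arch capability stand-in */

static void queue_work(struct work *w, bool lazy)
{
	struct work **list = lazy ? &lazy_list : &raised_list;

	w->next = *list;
	*list = w;
	if (!lazy && has_irq_work_interrupt)
		puts("raise IPI");	/* arch_irq_work_raise() stand-in */
}

static void run_list(struct work **list)
{
	while (*list) {
		struct work *w = *list;

		*list = w->next;
		w->func();
	}
}

/* Called from the timer tick: pick up raised work if no IPI exists. */
static void tick(void)
{
	if (raised_list && !has_irq_work_interrupt)
		run_list(&raised_list);
	run_list(&lazy_list);
}

static void hello(void) { puts("work ran"); }

int main(void)
{
	struct work w = { NULL, hello };

	queue_work(&w, false);
	tick();			/* drained here because there is no IPI */
	return 0;
}
```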
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..5c5987f10819 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
| 364 | address += symbol_offset; | 364 | address += symbol_offset; |
| 365 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); | 365 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); |
| 366 | if (!name) | 366 | if (!name) |
| 367 | return sprintf(buffer, "0x%lx", address); | 367 | return sprintf(buffer, "0x%lx", address - symbol_offset); |
| 368 | 368 | ||
| 369 | if (name != buffer) | 369 | if (name != buffer) |
| 370 | strcpy(buffer, name); | 370 | strcpy(buffer, name); |
| @@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file) | |||
| 565 | * using get_symbol_offset for every symbol. | 565 | * using get_symbol_offset for every symbol. |
| 566 | */ | 566 | */ |
| 567 | struct kallsym_iter *iter; | 567 | struct kallsym_iter *iter; |
| 568 | int ret; | 568 | iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); |
| 569 | |||
| 570 | iter = kmalloc(sizeof(*iter), GFP_KERNEL); | ||
| 571 | if (!iter) | 569 | if (!iter) |
| 572 | return -ENOMEM; | 570 | return -ENOMEM; |
| 573 | reset_iter(iter, 0); | 571 | reset_iter(iter, 0); |
| 574 | 572 | ||
| 575 | ret = seq_open(file, &kallsyms_op); | 573 | return 0; |
| 576 | if (ret == 0) | ||
| 577 | ((struct seq_file *)file->private_data)->private = iter; | ||
| 578 | else | ||
| 579 | kfree(iter); | ||
| 580 | return ret; | ||
| 581 | } | 574 | } |
| 582 | 575 | ||
| 583 | #ifdef CONFIG_KGDB_KDB | 576 | #ifdef CONFIG_KGDB_KDB |
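Editor's note: the kallsyms_open() change collapses the kmalloc + seq_open + error-unwind dance into a single __seq_open_private() call, which allocates the private iterator and attaches it in one step. A userspace sketch of the same "open with private data" helper follows; the handle type and function names are invented for illustration.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct handle {
	void *private;
};

/* Allocate the handle plus zeroed private data, or nothing at all. */
static struct handle *open_private(size_t priv_size)
{
	struct handle *h = malloc(sizeof(*h));

	if (!h)
		return NULL;
	h->private = calloc(1, priv_size);
	if (!h->private) {
		free(h);		/* roll back on partial failure */
		return NULL;
	}
	return h;
}

struct iter { long pos; char name[32]; };

int main(void)
{
	struct handle *h = open_private(sizeof(struct iter));

	if (!h)
		return 1;
	strcpy(((struct iter *)h->private)->name, "start");
	printf("%s\n", ((struct iter *)h->private)->name);
	free(h->private);
	free(h);
	return 0;
}
```

Because the helper either fully succeeds or leaves nothing allocated, each caller shrinks to one call plus a NULL check, which is exactly the shape of the new kallsyms_open().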
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0fe61c3..0aa69ea1d8fd 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
| @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) | |||
| 44 | */ | 44 | */ |
| 45 | static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) | 45 | static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) |
| 46 | { | 46 | { |
| 47 | long ret; | 47 | long t1, t2; |
| 48 | 48 | ||
| 49 | ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); | 49 | t1 = kptr_obfuscate((long)v1, type); |
| 50 | t2 = kptr_obfuscate((long)v2, type); | ||
| 50 | 51 | ||
| 51 | return (ret < 0) | ((ret > 0) << 1); | 52 | return (t1 < t2) | ((t1 > t2) << 1); |
| 52 | } | 53 | } |
| 53 | 54 | ||
| 54 | /* The caller must have pinned the task */ | 55 | /* The caller must have pinned the task */ |
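Editor's note: the kcmp_ptr() fix avoids encoding the ordering through `t1 - t2`, since that subtraction can overflow for widely separated long values and flip the sign; comparing with `<` and `>` keeps the same 0/1/2 result encoding. A standalone demonstration of the safe encoding:

```c
#include <limits.h>
#include <stdio.h>

/*
 * Encode an ordering as 0 (equal), 1 (t1 < t2) or 2 (t1 > t2) without
 * computing t1 - t2, which can overflow for arbitrary long values
 * (e.g. LONG_MAX - LONG_MIN does not fit in a long).
 */
static int cmp_long(long t1, long t2)
{
	return (t1 < t2) | ((t1 > t2) << 1);
}

int main(void)
{
	printf("%d %d %d\n",
	       cmp_long(5, 5),			/* 0 */
	       cmp_long(LONG_MIN, LONG_MAX),	/* 1: far apart, still right */
	       cmp_long(LONG_MAX, LONG_MIN));	/* 2 */
	return 0;
}
```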
diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..2abf9f6e9a61 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -6,6 +6,8 @@ | |||
| 6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #define pr_fmt(fmt) "kexec: " fmt | ||
| 10 | |||
| 9 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
| 10 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 11 | #include <linux/file.h> | 13 | #include <linux/file.h> |
| @@ -40,6 +42,9 @@ | |||
| 40 | #include <asm/io.h> | 42 | #include <asm/io.h> |
| 41 | #include <asm/sections.h> | 43 | #include <asm/sections.h> |
| 42 | 44 | ||
| 45 | #include <crypto/hash.h> | ||
| 46 | #include <crypto/sha.h> | ||
| 47 | |||
| 43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 48 | /* Per cpu memory for storing cpu states in case of system crash. */ |
| 44 | note_buf_t __percpu *crash_notes; | 49 | note_buf_t __percpu *crash_notes; |
| 45 | 50 | ||
| @@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | |||
| 52 | /* Flag to indicate we are going to kexec a new kernel */ | 57 | /* Flag to indicate we are going to kexec a new kernel */ |
| 53 | bool kexec_in_progress = false; | 58 | bool kexec_in_progress = false; |
| 54 | 59 | ||
| 60 | /* | ||
| 61 | * Declare these symbols weak so that if the architecture provides a purgatory, | ||
| 62 | * they will be overridden. | ||
| 63 | */ | ||
| 64 | char __weak kexec_purgatory[0]; | ||
| 65 | size_t __weak kexec_purgatory_size = 0; | ||
| 66 | |||
| 67 | #ifdef CONFIG_KEXEC_FILE | ||
| 68 | static int kexec_calculate_store_digests(struct kimage *image); | ||
| 69 | #endif | ||
| 70 | |||
| 55 | /* Location of the reserved area for the crash kernel */ | 71 | /* Location of the reserved area for the crash kernel */ |
| 56 | struct resource crashk_res = { | 72 | struct resource crashk_res = { |
| 57 | .name = "Crash kernel", | 73 | .name = "Crash kernel", |
| @@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
| 125 | gfp_t gfp_mask, | 141 | gfp_t gfp_mask, |
| 126 | unsigned long dest); | 142 | unsigned long dest); |
| 127 | 143 | ||
| 128 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 144 | static int copy_user_segment_list(struct kimage *image, |
| 129 | unsigned long nr_segments, | 145 | unsigned long nr_segments, |
| 130 | struct kexec_segment __user *segments) | 146 | struct kexec_segment __user *segments) |
| 131 | { | 147 | { |
| 148 | int ret; | ||
| 132 | size_t segment_bytes; | 149 | size_t segment_bytes; |
| 133 | struct kimage *image; | ||
| 134 | unsigned long i; | ||
| 135 | int result; | ||
| 136 | |||
| 137 | /* Allocate a controlling structure */ | ||
| 138 | result = -ENOMEM; | ||
| 139 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
| 140 | if (!image) | ||
| 141 | goto out; | ||
| 142 | |||
| 143 | image->head = 0; | ||
| 144 | image->entry = &image->head; | ||
| 145 | image->last_entry = &image->head; | ||
| 146 | image->control_page = ~0; /* By default this does not apply */ | ||
| 147 | image->start = entry; | ||
| 148 | image->type = KEXEC_TYPE_DEFAULT; | ||
| 149 | |||
| 150 | /* Initialize the list of control pages */ | ||
| 151 | INIT_LIST_HEAD(&image->control_pages); | ||
| 152 | |||
| 153 | /* Initialize the list of destination pages */ | ||
| 154 | INIT_LIST_HEAD(&image->dest_pages); | ||
| 155 | |||
| 156 | /* Initialize the list of unusable pages */ | ||
| 157 | INIT_LIST_HEAD(&image->unuseable_pages); | ||
| 158 | 150 | ||
| 159 | /* Read in the segments */ | 151 | /* Read in the segments */ |
| 160 | image->nr_segments = nr_segments; | 152 | image->nr_segments = nr_segments; |
| 161 | segment_bytes = nr_segments * sizeof(*segments); | 153 | segment_bytes = nr_segments * sizeof(*segments); |
| 162 | result = copy_from_user(image->segment, segments, segment_bytes); | 154 | ret = copy_from_user(image->segment, segments, segment_bytes); |
| 163 | if (result) { | 155 | if (ret) |
| 164 | result = -EFAULT; | 156 | ret = -EFAULT; |
| 165 | goto out; | 157 | |
| 166 | } | 158 | return ret; |
| 159 | } | ||
| 160 | |||
| 161 | static int sanity_check_segment_list(struct kimage *image) | ||
| 162 | { | ||
| 163 | int result, i; | ||
| 164 | unsigned long nr_segments = image->nr_segments; | ||
| 167 | 165 | ||
| 168 | /* | 166 | /* |
| 169 | * Verify we have good destination addresses. The caller is | 167 | * Verify we have good destination addresses. The caller is |
| @@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 185 | mstart = image->segment[i].mem; | 183 | mstart = image->segment[i].mem; |
| 186 | mend = mstart + image->segment[i].memsz; | 184 | mend = mstart + image->segment[i].memsz; |
| 187 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | 185 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) |
| 188 | goto out; | 186 | return result; |
| 189 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | 187 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) |
| 190 | goto out; | 188 | return result; |
| 191 | } | 189 | } |
| 192 | 190 | ||
| 193 | /* Verify our destination addresses do not overlap. | 191 | /* Verify our destination addresses do not overlap. |
| @@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 208 | pend = pstart + image->segment[j].memsz; | 206 | pend = pstart + image->segment[j].memsz; |
| 209 | /* Do the segments overlap ? */ | 207 | /* Do the segments overlap ? */ |
| 210 | if ((mend > pstart) && (mstart < pend)) | 208 | if ((mend > pstart) && (mstart < pend)) |
| 211 | goto out; | 209 | return result; |
| 212 | } | 210 | } |
| 213 | } | 211 | } |
| 214 | 212 | ||
| @@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 220 | result = -EINVAL; | 218 | result = -EINVAL; |
| 221 | for (i = 0; i < nr_segments; i++) { | 219 | for (i = 0; i < nr_segments; i++) { |
| 222 | if (image->segment[i].bufsz > image->segment[i].memsz) | 220 | if (image->segment[i].bufsz > image->segment[i].memsz) |
| 223 | goto out; | 221 | return result; |
| 224 | } | 222 | } |
| 225 | 223 | ||
| 226 | result = 0; | 224 | /* |
| 227 | out: | 225 | * Verify we have good destination addresses. Normally |
| 228 | if (result == 0) | 226 | * the caller is responsible for making certain we don't |
| 229 | *rimage = image; | 227 | * attempt to load the new image into invalid or reserved |
| 230 | else | 228 | * areas of RAM. But crash kernels are preloaded into a |
| 231 | kfree(image); | 229 | * reserved area of ram. We must ensure the addresses |
| 230 | * are in the reserved area otherwise preloading the | ||
| 231 | * kernel could corrupt things. | ||
| 232 | */ | ||
| 232 | 233 | ||
| 233 | return result; | 234 | if (image->type == KEXEC_TYPE_CRASH) { |
| 235 | result = -EADDRNOTAVAIL; | ||
| 236 | for (i = 0; i < nr_segments; i++) { | ||
| 237 | unsigned long mstart, mend; | ||
| 238 | |||
| 239 | mstart = image->segment[i].mem; | ||
| 240 | mend = mstart + image->segment[i].memsz - 1; | ||
| 241 | /* Ensure we are within the crash kernel limits */ | ||
| 242 | if ((mstart < crashk_res.start) || | ||
| 243 | (mend > crashk_res.end)) | ||
| 244 | return result; | ||
| 245 | } | ||
| 246 | } | ||
| 247 | |||
| 248 | return 0; | ||
| 249 | } | ||
| 250 | |||
| 251 | static struct kimage *do_kimage_alloc_init(void) | ||
| 252 | { | ||
| 253 | struct kimage *image; | ||
| 234 | 254 | ||
| 255 | /* Allocate a controlling structure */ | ||
| 256 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
| 257 | if (!image) | ||
| 258 | return NULL; | ||
| 259 | |||
| 260 | image->head = 0; | ||
| 261 | image->entry = &image->head; | ||
| 262 | image->last_entry = &image->head; | ||
| 263 | image->control_page = ~0; /* By default this does not apply */ | ||
| 264 | image->type = KEXEC_TYPE_DEFAULT; | ||
| 265 | |||
| 266 | /* Initialize the list of control pages */ | ||
| 267 | INIT_LIST_HEAD(&image->control_pages); | ||
| 268 | |||
| 269 | /* Initialize the list of destination pages */ | ||
| 270 | INIT_LIST_HEAD(&image->dest_pages); | ||
| 271 | |||
| 272 | /* Initialize the list of unusable pages */ | ||
| 273 | INIT_LIST_HEAD(&image->unusable_pages); | ||
| 274 | |||
| 275 | return image; | ||
| 235 | } | 276 | } |
| 236 | 277 | ||
| 237 | static void kimage_free_page_list(struct list_head *list); | 278 | static void kimage_free_page_list(struct list_head *list); |
| 238 | 279 | ||
| 239 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | 280 | static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, |
| 240 | unsigned long nr_segments, | 281 | unsigned long nr_segments, |
| 241 | struct kexec_segment __user *segments) | 282 | struct kexec_segment __user *segments, |
| 283 | unsigned long flags) | ||
| 242 | { | 284 | { |
| 243 | int result; | 285 | int ret; |
| 244 | struct kimage *image; | 286 | struct kimage *image; |
| 287 | bool kexec_on_panic = flags & KEXEC_ON_CRASH; | ||
| 288 | |||
| 289 | if (kexec_on_panic) { | ||
| 290 | /* Verify we have a valid entry point */ | ||
| 291 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) | ||
| 292 | return -EADDRNOTAVAIL; | ||
| 293 | } | ||
| 245 | 294 | ||
| 246 | /* Allocate and initialize a controlling structure */ | 295 | /* Allocate and initialize a controlling structure */ |
| 247 | image = NULL; | 296 | image = do_kimage_alloc_init(); |
| 248 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | 297 | if (!image) |
| 249 | if (result) | 298 | return -ENOMEM; |
| 250 | goto out; | 299 | |
| 300 | image->start = entry; | ||
| 301 | |||
| 302 | ret = copy_user_segment_list(image, nr_segments, segments); | ||
| 303 | if (ret) | ||
| 304 | goto out_free_image; | ||
| 305 | |||
| 306 | ret = sanity_check_segment_list(image); | ||
| 307 | if (ret) | ||
| 308 | goto out_free_image; | ||
| 309 | |||
| 310 | /* Enable the special crash kernel control page allocation policy. */ | ||
| 311 | if (kexec_on_panic) { | ||
| 312 | image->control_page = crashk_res.start; | ||
| 313 | image->type = KEXEC_TYPE_CRASH; | ||
| 314 | } | ||
| 251 | 315 | ||
| 252 | /* | 316 | /* |
| 253 | * Find a location for the control code buffer, and add it | 317 | * Find a location for the control code buffer, and add it |
| 254 | * the vector of segments so that its pages will also be | 318 | * the vector of segments so that its pages will also be |
| 255 | * counted as destination pages. | 319 | * counted as destination pages. |
| 256 | */ | 320 | */ |
| 257 | result = -ENOMEM; | 321 | ret = -ENOMEM; |
| 258 | image->control_code_page = kimage_alloc_control_pages(image, | 322 | image->control_code_page = kimage_alloc_control_pages(image, |
| 259 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 323 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 260 | if (!image->control_code_page) { | 324 | if (!image->control_code_page) { |
| 261 | pr_err("Could not allocate control_code_buffer\n"); | 325 | pr_err("Could not allocate control_code_buffer\n"); |
| 262 | goto out_free; | 326 | goto out_free_image; |
| 263 | } | 327 | } |
| 264 | 328 | ||
| 265 | image->swap_page = kimage_alloc_control_pages(image, 0); | 329 | if (!kexec_on_panic) { |
| 266 | if (!image->swap_page) { | 330 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 267 | pr_err("Could not allocate swap buffer\n"); | 331 | if (!image->swap_page) { |
| 268 | goto out_free; | 332 | pr_err("Could not allocate swap buffer\n"); |
| 333 | goto out_free_control_pages; | ||
| 334 | } | ||
| 269 | } | 335 | } |
| 270 | 336 | ||
| 271 | *rimage = image; | 337 | *rimage = image; |
| 272 | return 0; | 338 | return 0; |
| 273 | 339 | out_free_control_pages: | |
| 274 | out_free: | ||
| 275 | kimage_free_page_list(&image->control_pages); | 340 | kimage_free_page_list(&image->control_pages); |
| 341 | out_free_image: | ||
| 276 | kfree(image); | 342 | kfree(image); |
| 277 | out: | 343 | return ret; |
| 278 | return result; | ||
| 279 | } | 344 | } |
| 280 | 345 | ||
| 281 | static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | 346 | #ifdef CONFIG_KEXEC_FILE |
| 282 | unsigned long nr_segments, | 347 | static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) |
| 283 | struct kexec_segment __user *segments) | ||
| 284 | { | 348 | { |
| 285 | int result; | 349 | struct fd f = fdget(fd); |
| 286 | struct kimage *image; | 350 | int ret; |
| 287 | unsigned long i; | 351 | struct kstat stat; |
| 352 | loff_t pos; | ||
| 353 | ssize_t bytes = 0; | ||
| 288 | 354 | ||
| 289 | image = NULL; | 355 | if (!f.file) |
| 290 | /* Verify we have a valid entry point */ | 356 | return -EBADF; |
| 291 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) { | 357 | |
| 292 | result = -EADDRNOTAVAIL; | 358 | ret = vfs_getattr(&f.file->f_path, &stat); |
| 359 | if (ret) | ||
| 360 | goto out; | ||
| 361 | |||
| 362 | if (stat.size > INT_MAX) { | ||
| 363 | ret = -EFBIG; | ||
| 293 | goto out; | 364 | goto out; |
| 294 | } | 365 | } |
| 295 | 366 | ||
| 296 | /* Allocate and initialize a controlling structure */ | 367 | /* Don't hand 0 to vmalloc, it whines. */ |
| 297 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | 368 | if (stat.size == 0) { |
| 298 | if (result) | 369 | ret = -EINVAL; |
| 299 | goto out; | 370 | goto out; |
| 371 | } | ||
| 300 | 372 | ||
| 301 | /* Enable the special crash kernel control page | 373 | *buf = vmalloc(stat.size); |
| 302 | * allocation policy. | 374 | if (!*buf) { |
| 303 | */ | 375 | ret = -ENOMEM; |
| 304 | image->control_page = crashk_res.start; | 376 | goto out; |
| 305 | image->type = KEXEC_TYPE_CRASH; | 377 | } |
| 306 | 378 | ||
| 307 | /* | 379 | pos = 0; |
| 308 | * Verify we have good destination addresses. Normally | 380 | while (pos < stat.size) { |
| 309 | * the caller is responsible for making certain we don't | 381 | bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, |
| 310 | * attempt to load the new image into invalid or reserved | 382 | stat.size - pos); |
| 311 | * areas of RAM. But crash kernels are preloaded into a | 383 | if (bytes < 0) { |
| 312 | * reserved area of ram. We must ensure the addresses | 384 | vfree(*buf); |
| 313 | * are in the reserved area otherwise preloading the | 385 | ret = bytes; |
| 314 | * kernel could corrupt things. | 386 | goto out; |
| 315 | */ | 387 | } |
| 316 | result = -EADDRNOTAVAIL; | ||
| 317 | for (i = 0; i < nr_segments; i++) { | ||
| 318 | unsigned long mstart, mend; | ||
| 319 | 388 | ||
| 320 | mstart = image->segment[i].mem; | 389 | if (bytes == 0) |
| 321 | mend = mstart + image->segment[i].memsz - 1; | 390 | break; |
| 322 | /* Ensure we are within the crash kernel limits */ | 391 | pos += bytes; |
| 323 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | ||
| 324 | goto out_free; | ||
| 325 | } | 392 | } |
| 326 | 393 | ||
| 394 | if (pos != stat.size) { | ||
| 395 | ret = -EBADF; | ||
| 396 | vfree(*buf); | ||
| 397 | goto out; | ||
| 398 | } | ||
| 399 | |||
| 400 | *buf_len = pos; | ||
| 401 | out: | ||
| 402 | fdput(f); | ||
| 403 | return ret; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* Architectures can provide this probe function */ | ||
| 407 | int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, | ||
| 408 | unsigned long buf_len) | ||
| 409 | { | ||
| 410 | return -ENOEXEC; | ||
| 411 | } | ||
| 412 | |||
| 413 | void * __weak arch_kexec_kernel_image_load(struct kimage *image) | ||
| 414 | { | ||
| 415 | return ERR_PTR(-ENOEXEC); | ||
| 416 | } | ||
| 417 | |||
| 418 | void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) | ||
| 419 | { | ||
| 420 | } | ||
| 421 | |||
| 422 | int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, | ||
| 423 | unsigned long buf_len) | ||
| 424 | { | ||
| 425 | return -EKEYREJECTED; | ||
| 426 | } | ||
| 427 | |||
| 428 | /* Apply relocations of type RELA */ | ||
| 429 | int __weak | ||
| 430 | arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
| 431 | unsigned int relsec) | ||
| 432 | { | ||
| 433 | pr_err("RELA relocation unsupported.\n"); | ||
| 434 | return -ENOEXEC; | ||
| 435 | } | ||
| 436 | |||
| 437 | /* Apply relocations of type REL */ | ||
| 438 | int __weak | ||
| 439 | arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
| 440 | unsigned int relsec) | ||
| 441 | { | ||
| 442 | pr_err("REL relocation unsupported.\n"); | ||
| 443 | return -ENOEXEC; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * Free up memory used by the kernel, initrd, and command line. This is temporary | ||
| 448 | * memory allocation which is not needed any more after these buffers have | ||
| 449 | * been loaded into separate segments and have been copied elsewhere. | ||
| 450 | */ | ||
| 451 | static void kimage_file_post_load_cleanup(struct kimage *image) | ||
| 452 | { | ||
| 453 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 454 | |||
| 455 | vfree(image->kernel_buf); | ||
| 456 | image->kernel_buf = NULL; | ||
| 457 | |||
| 458 | vfree(image->initrd_buf); | ||
| 459 | image->initrd_buf = NULL; | ||
| 460 | |||
| 461 | kfree(image->cmdline_buf); | ||
| 462 | image->cmdline_buf = NULL; | ||
| 463 | |||
| 464 | vfree(pi->purgatory_buf); | ||
| 465 | pi->purgatory_buf = NULL; | ||
| 466 | |||
| 467 | vfree(pi->sechdrs); | ||
| 468 | pi->sechdrs = NULL; | ||
| 469 | |||
| 470 | /* See if architecture has anything to cleanup post load */ | ||
| 471 | arch_kimage_file_post_load_cleanup(image); | ||
| 472 | |||
| 327 | /* | 473 | /* |
| 328 | * Find a location for the control code buffer, and add | 474 | * Above call should have called into bootloader to free up |
| 329 | * the vector of segments so that its pages will also be | 475 | * any data stored in kimage->image_loader_data. It should |
| 330 | * counted as destination pages. | 476 | * be ok now to free it up. |
| 331 | */ | 477 | */ |
| 332 | result = -ENOMEM; | 478 | kfree(image->image_loader_data); |
| 479 | image->image_loader_data = NULL; | ||
| 480 | } | ||
| 481 | |||
| 482 | /* | ||
| 483 | * In file mode the list of segments is prepared by the kernel. Copy relevant | ||
| 484 | * data from user space, do error checking, and prepare the segment list. | ||
| 485 | */ | ||
| 486 | static int | ||
| 487 | kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | ||
| 488 | const char __user *cmdline_ptr, | ||
| 489 | unsigned long cmdline_len, unsigned flags) | ||
| 490 | { | ||
| 491 | int ret = 0; | ||
| 492 | void *ldata; | ||
| 493 | |||
| 494 | ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, | ||
| 495 | &image->kernel_buf_len); | ||
| 496 | if (ret) | ||
| 497 | return ret; | ||
| 498 | |||
| 499 | /* Call arch image probe handlers */ | ||
| 500 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, | ||
| 501 | image->kernel_buf_len); | ||
| 502 | |||
| 503 | if (ret) | ||
| 504 | goto out; | ||
| 505 | |||
| 506 | #ifdef CONFIG_KEXEC_VERIFY_SIG | ||
| 507 | ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, | ||
| 508 | image->kernel_buf_len); | ||
| 509 | if (ret) { | ||
| 510 | pr_debug("kernel signature verification failed.\n"); | ||
| 511 | goto out; | ||
| 512 | } | ||
| 513 | pr_debug("kernel signature verification successful.\n"); | ||
| 514 | #endif | ||
| 515 | /* It is possible that no initramfs is being loaded */ | ||
| 516 | if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { | ||
| 517 | ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, | ||
| 518 | &image->initrd_buf_len); | ||
| 519 | if (ret) | ||
| 520 | goto out; | ||
| 521 | } | ||
| 522 | |||
| 523 | if (cmdline_len) { | ||
| 524 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | ||
| 525 | if (!image->cmdline_buf) { | ||
| 526 | ret = -ENOMEM; | ||
| 527 | goto out; | ||
| 528 | } | ||
| 529 | |||
| 530 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
| 531 | cmdline_len); | ||
| 532 | if (ret) { | ||
| 533 | ret = -EFAULT; | ||
| 534 | goto out; | ||
| 535 | } | ||
| 536 | |||
| 537 | image->cmdline_buf_len = cmdline_len; | ||
| 538 | |||
| 539 | /* command line should be a string with last byte null */ | ||
| 540 | if (image->cmdline_buf[cmdline_len - 1] != '\0') { | ||
| 541 | ret = -EINVAL; | ||
| 542 | goto out; | ||
| 543 | } | ||
| 544 | } | ||
| 545 | |||
| 546 | /* Call arch image load handlers */ | ||
| 547 | ldata = arch_kexec_kernel_image_load(image); | ||
| 548 | |||
| 549 | if (IS_ERR(ldata)) { | ||
| 550 | ret = PTR_ERR(ldata); | ||
| 551 | goto out; | ||
| 552 | } | ||
| 553 | |||
| 554 | image->image_loader_data = ldata; | ||
| 555 | out: | ||
| 556 | /* In case of error, free up all allocated memory in this function */ | ||
| 557 | if (ret) | ||
| 558 | kimage_file_post_load_cleanup(image); | ||
| 559 | return ret; | ||
| 560 | } | ||
| 561 | |||
| 562 | static int | ||
| 563 | kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | ||
| 564 | int initrd_fd, const char __user *cmdline_ptr, | ||
| 565 | unsigned long cmdline_len, unsigned long flags) | ||
| 566 | { | ||
| 567 | int ret; | ||
| 568 | struct kimage *image; | ||
| 569 | bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; | ||
| 570 | |||
| 571 | image = do_kimage_alloc_init(); | ||
| 572 | if (!image) | ||
| 573 | return -ENOMEM; | ||
| 574 | |||
| 575 | image->file_mode = 1; | ||
| 576 | |||
| 577 | if (kexec_on_panic) { | ||
| 578 | /* Enable special crash kernel control page alloc policy. */ | ||
| 579 | image->control_page = crashk_res.start; | ||
| 580 | image->type = KEXEC_TYPE_CRASH; | ||
| 581 | } | ||
| 582 | |||
| 583 | ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, | ||
| 584 | cmdline_ptr, cmdline_len, flags); | ||
| 585 | if (ret) | ||
| 586 | goto out_free_image; | ||
| 587 | |||
| 588 | ret = sanity_check_segment_list(image); | ||
| 589 | if (ret) | ||
| 590 | goto out_free_post_load_bufs; | ||
| 591 | |||
| 592 | ret = -ENOMEM; | ||
| 333 | image->control_code_page = kimage_alloc_control_pages(image, | 593 | image->control_code_page = kimage_alloc_control_pages(image, |
| 334 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 594 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 335 | if (!image->control_code_page) { | 595 | if (!image->control_code_page) { |
| 336 | pr_err("Could not allocate control_code_buffer\n"); | 596 | pr_err("Could not allocate control_code_buffer\n"); |
| 337 | goto out_free; | 597 | goto out_free_post_load_bufs; |
| 598 | } | ||
| 599 | |||
| 600 | if (!kexec_on_panic) { | ||
| 601 | image->swap_page = kimage_alloc_control_pages(image, 0); | ||
| 602 | if (!image->swap_page) { | ||
| 603 | pr_err("Could not allocate swap buffer\n"); | ||
| 604 | goto out_free_control_pages; | ||
| 605 | } | ||
| 338 | } | 606 | } |
| 339 | 607 | ||
| 340 | *rimage = image; | 608 | *rimage = image; |
| 341 | return 0; | 609 | return 0; |
| 342 | 610 | out_free_control_pages: | |
| 343 | out_free: | 611 | kimage_free_page_list(&image->control_pages); |
| 612 | out_free_post_load_bufs: | ||
| 613 | kimage_file_post_load_cleanup(image); | ||
| 614 | out_free_image: | ||
| 344 | kfree(image); | 615 | kfree(image); |
| 345 | out: | 616 | return ret; |
| 346 | return result; | ||
| 347 | } | 617 | } |
| 618 | #else /* CONFIG_KEXEC_FILE */ | ||
| 619 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } | ||
| 620 | #endif /* CONFIG_KEXEC_FILE */ | ||
| 348 | 621 | ||
| 349 | static int kimage_is_destination_range(struct kimage *image, | 622 | static int kimage_is_destination_range(struct kimage *image, |
| 350 | unsigned long start, | 623 | unsigned long start, |
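Editor's note: copy_file_from_fd() above stats the file, allocates a buffer of that size, and loops over kernel_read() until the whole file is in memory, treating a short total as an error. Below is a userspace analogue using fstat(2) and read(2) in place of the kernel's vfs_getattr()/kernel_read(), with the same checks for zero-size, oversized and truncated files; it is a sketch of the read loop, not the kernel code.

```c
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

static int copy_file_from_fd(int fd, void **buf, size_t *buf_len)
{
	struct stat st;
	char *p;
	off_t pos = 0;

	if (fstat(fd, &st) < 0)
		return -errno;
	if (st.st_size == 0 || st.st_size > INT_MAX)
		return -EINVAL;

	p = malloc(st.st_size);
	if (!p)
		return -ENOMEM;

	while (pos < st.st_size) {
		ssize_t n = read(fd, p + pos, st.st_size - pos);

		if (n < 0) {
			free(p);
			return -errno;
		}
		if (n == 0)
			break;			/* unexpected EOF */
		pos += n;
	}
	if (pos != st.st_size) {		/* file shrank or ended early */
		free(p);
		return -EBADF;
	}
	*buf = p;
	*buf_len = pos;
	return 0;
}

int main(int argc, char **argv)
{
	void *buf;
	size_t len;
	int fd, ret;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	ret = copy_file_from_fd(fd, &buf, &len);
	if (ret == 0) {
		printf("read %zu bytes\n", len);
		free(buf);
	} else {
		printf("error %d\n", ret);
	}
	close(fd);
	return 0;
}
```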
| @@ -609,7 +882,7 @@ static void kimage_free_extra_pages(struct kimage *image) | |||
| 609 | kimage_free_page_list(&image->dest_pages); | 882 | kimage_free_page_list(&image->dest_pages); |
| 610 | 883 | ||
| 611 | /* Walk through and free any unusable pages I have cached */ | 884 | /* Walk through and free any unusable pages I have cached */ |
| 612 | kimage_free_page_list(&image->unuseable_pages); | 885 | kimage_free_page_list(&image->unusable_pages); |
| 613 | 886 | ||
| 614 | } | 887 | } |
| 615 | static void kimage_terminate(struct kimage *image) | 888 | static void kimage_terminate(struct kimage *image) |
| @@ -663,6 +936,14 @@ static void kimage_free(struct kimage *image) | |||
| 663 | 936 | ||
| 664 | /* Free the kexec control pages... */ | 937 | /* Free the kexec control pages... */ |
| 665 | kimage_free_page_list(&image->control_pages); | 938 | kimage_free_page_list(&image->control_pages); |
| 939 | |||
| 940 | /* | ||
| 941 | * Free up any temporary buffers allocated. This path is hit if | ||
| 942 | * an error occurred well after buffer allocation. | ||
| 943 | */ | ||
| 944 | if (image->file_mode) | ||
| 945 | kimage_file_post_load_cleanup(image); | ||
| 946 | |||
| 666 | kfree(image); | 947 | kfree(image); |
| 667 | } | 948 | } |
| 668 | 949 | ||
| @@ -732,7 +1013,7 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
| 732 | /* If the page cannot be used file it away */ | 1013 | /* If the page cannot be used file it away */ |
| 733 | if (page_to_pfn(page) > | 1014 | if (page_to_pfn(page) > |
| 734 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | 1015 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { |
| 735 | list_add(&page->lru, &image->unuseable_pages); | 1016 | list_add(&page->lru, &image->unusable_pages); |
| 736 | continue; | 1017 | continue; |
| 737 | } | 1018 | } |
| 738 | addr = page_to_pfn(page) << PAGE_SHIFT; | 1019 | addr = page_to_pfn(page) << PAGE_SHIFT; |
| @@ -791,10 +1072,14 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
| 791 | unsigned long maddr; | 1072 | unsigned long maddr; |
| 792 | size_t ubytes, mbytes; | 1073 | size_t ubytes, mbytes; |
| 793 | int result; | 1074 | int result; |
| 794 | unsigned char __user *buf; | 1075 | unsigned char __user *buf = NULL; |
| 1076 | unsigned char *kbuf = NULL; | ||
| 795 | 1077 | ||
| 796 | result = 0; | 1078 | result = 0; |
| 797 | buf = segment->buf; | 1079 | if (image->file_mode) |
| 1080 | kbuf = segment->kbuf; | ||
| 1081 | else | ||
| 1082 | buf = segment->buf; | ||
| 798 | ubytes = segment->bufsz; | 1083 | ubytes = segment->bufsz; |
| 799 | mbytes = segment->memsz; | 1084 | mbytes = segment->memsz; |
| 800 | maddr = segment->mem; | 1085 | maddr = segment->mem; |
| @@ -826,7 +1111,11 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
| 826 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | 1111 | PAGE_SIZE - (maddr & ~PAGE_MASK)); |
| 827 | uchunk = min(ubytes, mchunk); | 1112 | uchunk = min(ubytes, mchunk); |
| 828 | 1113 | ||
| 829 | result = copy_from_user(ptr, buf, uchunk); | 1114 | /* For file based kexec, source pages are in kernel memory */ |
| 1115 | if (image->file_mode) | ||
| 1116 | memcpy(ptr, kbuf, uchunk); | ||
| 1117 | else | ||
| 1118 | result = copy_from_user(ptr, buf, uchunk); | ||
| 830 | kunmap(page); | 1119 | kunmap(page); |
| 831 | if (result) { | 1120 | if (result) { |
| 832 | result = -EFAULT; | 1121 | result = -EFAULT; |
| @@ -834,7 +1123,10 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
| 834 | } | 1123 | } |
| 835 | ubytes -= uchunk; | 1124 | ubytes -= uchunk; |
| 836 | maddr += mchunk; | 1125 | maddr += mchunk; |
| 837 | buf += mchunk; | 1126 | if (image->file_mode) |
| 1127 | kbuf += mchunk; | ||
| 1128 | else | ||
| 1129 | buf += mchunk; | ||
| 838 | mbytes -= mchunk; | 1130 | mbytes -= mchunk; |
| 839 | } | 1131 | } |
| 840 | out: | 1132 | out: |
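Both load paths above copy a segment one page-bounded chunk at a time: mchunk is capped at the bytes remaining in the destination page, so a single copy never crosses a page boundary. A small user-space sketch of that chunking arithmetic, with made-up addresses and sizes:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long maddr = 0x100ff0;	/* destination, deliberately unaligned */
	unsigned long mbytes = 10000;	/* bytes left to copy */

	while (mbytes) {
		/* Never cross a page boundary within a single copy. */
		unsigned long mchunk = MIN(mbytes,
					   PAGE_SIZE - (maddr & ~PAGE_MASK));

		printf("copy %4lu bytes to %#lx\n", mchunk, maddr);
		maddr += mchunk;
		mbytes -= mchunk;
	}
	return 0;
}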
| @@ -851,10 +1143,14 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
| 851 | unsigned long maddr; | 1143 | unsigned long maddr; |
| 852 | size_t ubytes, mbytes; | 1144 | size_t ubytes, mbytes; |
| 853 | int result; | 1145 | int result; |
| 854 | unsigned char __user *buf; | 1146 | unsigned char __user *buf = NULL; |
| 1147 | unsigned char *kbuf = NULL; | ||
| 855 | 1148 | ||
| 856 | result = 0; | 1149 | result = 0; |
| 857 | buf = segment->buf; | 1150 | if (image->file_mode) |
| 1151 | kbuf = segment->kbuf; | ||
| 1152 | else | ||
| 1153 | buf = segment->buf; | ||
| 858 | ubytes = segment->bufsz; | 1154 | ubytes = segment->bufsz; |
| 859 | mbytes = segment->memsz; | 1155 | mbytes = segment->memsz; |
| 860 | maddr = segment->mem; | 1156 | maddr = segment->mem; |
| @@ -877,7 +1173,12 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
| 877 | /* Zero the trailing part of the page */ | 1173 | /* Zero the trailing part of the page */ |
| 878 | memset(ptr + uchunk, 0, mchunk - uchunk); | 1174 | memset(ptr + uchunk, 0, mchunk - uchunk); |
| 879 | } | 1175 | } |
| 880 | result = copy_from_user(ptr, buf, uchunk); | 1176 | |
| 1177 | /* For file based kexec, source pages are in kernel memory */ | ||
| 1178 | if (image->file_mode) | ||
| 1179 | memcpy(ptr, kbuf, uchunk); | ||
| 1180 | else | ||
| 1181 | result = copy_from_user(ptr, buf, uchunk); | ||
| 881 | kexec_flush_icache_page(page); | 1182 | kexec_flush_icache_page(page); |
| 882 | kunmap(page); | 1183 | kunmap(page); |
| 883 | if (result) { | 1184 | if (result) { |
| @@ -886,7 +1187,10 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
| 886 | } | 1187 | } |
| 887 | ubytes -= uchunk; | 1188 | ubytes -= uchunk; |
| 888 | maddr += mchunk; | 1189 | maddr += mchunk; |
| 889 | buf += mchunk; | 1190 | if (image->file_mode) |
| 1191 | kbuf += mchunk; | ||
| 1192 | else | ||
| 1193 | buf += mchunk; | ||
| 890 | mbytes -= mchunk; | 1194 | mbytes -= mchunk; |
| 891 | } | 1195 | } |
| 892 | out: | 1196 | out: |
| @@ -986,16 +1290,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
| 986 | 1290 | ||
| 987 | /* Loading another kernel to reboot into */ | 1291 | /* Loading another kernel to reboot into */ |
| 988 | if ((flags & KEXEC_ON_CRASH) == 0) | 1292 | if ((flags & KEXEC_ON_CRASH) == 0) |
| 989 | result = kimage_normal_alloc(&image, entry, | 1293 | result = kimage_alloc_init(&image, entry, nr_segments, |
| 990 | nr_segments, segments); | 1294 | segments, flags); |
| 991 | /* Loading another kernel to switch to if this one crashes */ | 1295 | /* Loading another kernel to switch to if this one crashes */ |
| 992 | else if (flags & KEXEC_ON_CRASH) { | 1296 | else if (flags & KEXEC_ON_CRASH) { |
| 993 | /* Free any current crash dump kernel before | 1297 | /* Free any current crash dump kernel before |
| 994 | * we corrupt it. | 1298 | * we corrupt it. |
| 995 | */ | 1299 | */ |
| 996 | kimage_free(xchg(&kexec_crash_image, NULL)); | 1300 | kimage_free(xchg(&kexec_crash_image, NULL)); |
| 997 | result = kimage_crash_alloc(&image, entry, | 1301 | result = kimage_alloc_init(&image, entry, nr_segments, |
| 998 | nr_segments, segments); | 1302 | segments, flags); |
| 999 | crash_map_reserved_pages(); | 1303 | crash_map_reserved_pages(); |
| 1000 | } | 1304 | } |
| 1001 | if (result) | 1305 | if (result) |
| @@ -1077,6 +1381,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
| 1077 | } | 1381 | } |
| 1078 | #endif | 1382 | #endif |
| 1079 | 1383 | ||
| 1384 | #ifdef CONFIG_KEXEC_FILE | ||
| 1385 | SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | ||
| 1386 | unsigned long, cmdline_len, const char __user *, cmdline_ptr, | ||
| 1387 | unsigned long, flags) | ||
| 1388 | { | ||
| 1389 | int ret = 0, i; | ||
| 1390 | struct kimage **dest_image, *image; | ||
| 1391 | |||
| 1392 | /* We only trust the superuser with rebooting the system. */ | ||
| 1393 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | ||
| 1394 | return -EPERM; | ||
| 1395 | |||
| 1396 | /* Make sure we have a legal set of flags */ | ||
| 1397 | if (flags != (flags & KEXEC_FILE_FLAGS)) | ||
| 1398 | return -EINVAL; | ||
| 1399 | |||
| 1400 | image = NULL; | ||
| 1401 | |||
| 1402 | if (!mutex_trylock(&kexec_mutex)) | ||
| 1403 | return -EBUSY; | ||
| 1404 | |||
| 1405 | dest_image = &kexec_image; | ||
| 1406 | if (flags & KEXEC_FILE_ON_CRASH) | ||
| 1407 | dest_image = &kexec_crash_image; | ||
| 1408 | |||
| 1409 | if (flags & KEXEC_FILE_UNLOAD) | ||
| 1410 | goto exchange; | ||
| 1411 | |||
| 1412 | /* | ||
| 1413 | * In the crash case, the new kernel is loaded into the reserved region, | ||
| 1414 | * the same memory where an old crash kernel may already be loaded. Free | ||
| 1415 | * any current crash dump kernel before we corrupt it. | ||
| 1416 | */ | ||
| 1417 | if (flags & KEXEC_FILE_ON_CRASH) | ||
| 1418 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
| 1419 | |||
| 1420 | ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, | ||
| 1421 | cmdline_len, flags); | ||
| 1422 | if (ret) | ||
| 1423 | goto out; | ||
| 1424 | |||
| 1425 | ret = machine_kexec_prepare(image); | ||
| 1426 | if (ret) | ||
| 1427 | goto out; | ||
| 1428 | |||
| 1429 | ret = kexec_calculate_store_digests(image); | ||
| 1430 | if (ret) | ||
| 1431 | goto out; | ||
| 1432 | |||
| 1433 | for (i = 0; i < image->nr_segments; i++) { | ||
| 1434 | struct kexec_segment *ksegment; | ||
| 1435 | |||
| 1436 | ksegment = &image->segment[i]; | ||
| 1437 | pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", | ||
| 1438 | i, ksegment->buf, ksegment->bufsz, ksegment->mem, | ||
| 1439 | ksegment->memsz); | ||
| 1440 | |||
| 1441 | ret = kimage_load_segment(image, &image->segment[i]); | ||
| 1442 | if (ret) | ||
| 1443 | goto out; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | kimage_terminate(image); | ||
| 1447 | |||
| 1448 | /* | ||
| 1449 | * Free up any temporary buffers that are no longer needed once | ||
| 1450 | * the image has been loaded. | ||
| 1451 | */ | ||
| 1452 | kimage_file_post_load_cleanup(image); | ||
| 1453 | exchange: | ||
| 1454 | image = xchg(dest_image, image); | ||
| 1455 | out: | ||
| 1456 | mutex_unlock(&kexec_mutex); | ||
| 1457 | kimage_free(image); | ||
| 1458 | return ret; | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | #endif /* CONFIG_KEXEC_FILE */ | ||
| 1462 | |||
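For reference, a hedged user-space sketch of how the new syscall could be invoked, assuming a kernel built with CONFIG_KEXEC_FILE and an architecture that defines __NR_kexec_file_load; the file paths and command line below are placeholders, not values taken from this patch. Because the images are passed as file descriptors, the kernel reads them itself rather than trusting pre-built user segments.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
#ifdef __NR_kexec_file_load
	/* Paths and command line are placeholders, not taken from this patch. */
	const char *cmdline = "console=ttyS0 root=/dev/sda1";
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}
	/* cmdline_len must count the trailing NUL. */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
#else
	fprintf(stderr, "__NR_kexec_file_load is not defined on this arch\n");
	return 1;
#endif
}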
| 1080 | void crash_kexec(struct pt_regs *regs) | 1463 | void crash_kexec(struct pt_regs *regs) |
| 1081 | { | 1464 | { |
| 1082 | /* Take the kexec_mutex here to prevent sys_kexec_load | 1465 | /* Take the kexec_mutex here to prevent sys_kexec_load |
| @@ -1376,7 +1759,6 @@ static __initdata char *suffix_tbl[] = { | |||
| 1376 | */ | 1759 | */ |
| 1377 | static int __init parse_crashkernel_suffix(char *cmdline, | 1760 | static int __init parse_crashkernel_suffix(char *cmdline, |
| 1378 | unsigned long long *crash_size, | 1761 | unsigned long long *crash_size, |
| 1379 | unsigned long long *crash_base, | ||
| 1380 | const char *suffix) | 1762 | const char *suffix) |
| 1381 | { | 1763 | { |
| 1382 | char *cur = cmdline; | 1764 | char *cur = cmdline; |
| @@ -1465,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline, | |||
| 1465 | 1847 | ||
| 1466 | if (suffix) | 1848 | if (suffix) |
| 1467 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | 1849 | return parse_crashkernel_suffix(ck_cmdline, crash_size, |
| 1468 | crash_base, suffix); | 1850 | suffix); |
| 1469 | /* | 1851 | /* |
| 1470 | * if the commandline contains a ':', then that's the extended | 1852 | * if the commandline contains a ':', then that's the extended |
| 1471 | * syntax -- if not, it must be the classic syntax | 1853 | * syntax -- if not, it must be the classic syntax |
| @@ -1632,6 +2014,672 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1632 | 2014 | ||
| 1633 | subsys_initcall(crash_save_vmcoreinfo_init); | 2015 | subsys_initcall(crash_save_vmcoreinfo_init); |
| 1634 | 2016 | ||
| 2017 | #ifdef CONFIG_KEXEC_FILE | ||
| 2018 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | ||
| 2019 | struct kexec_buf *kbuf) | ||
| 2020 | { | ||
| 2021 | struct kimage *image = kbuf->image; | ||
| 2022 | unsigned long temp_start, temp_end; | ||
| 2023 | |||
| 2024 | temp_end = min(end, kbuf->buf_max); | ||
| 2025 | temp_start = temp_end - kbuf->memsz; | ||
| 2026 | |||
| 2027 | do { | ||
| 2028 | /* align down start */ | ||
| 2029 | temp_start = temp_start & (~(kbuf->buf_align - 1)); | ||
| 2030 | |||
| 2031 | if (temp_start < start || temp_start < kbuf->buf_min) | ||
| 2032 | return 0; | ||
| 2033 | |||
| 2034 | temp_end = temp_start + kbuf->memsz - 1; | ||
| 2035 | |||
| 2036 | /* | ||
| 2037 | * Make sure this does not conflict with any of the existing | ||
| 2038 | * segments. | ||
| 2039 | */ | ||
| 2040 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
| 2041 | temp_start = temp_start - PAGE_SIZE; | ||
| 2042 | continue; | ||
| 2043 | } | ||
| 2044 | |||
| 2045 | /* We found a suitable memory range */ | ||
| 2046 | break; | ||
| 2047 | } while (1); | ||
| 2048 | |||
| 2049 | /* If we are here, we found a suitable memory range */ | ||
| 2050 | kbuf->mem = temp_start; | ||
| 2051 | |||
| 2052 | /* Success, stop navigating through remaining System RAM ranges */ | ||
| 2053 | return 1; | ||
| 2054 | } | ||
| 2055 | |||
| 2056 | static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, | ||
| 2057 | struct kexec_buf *kbuf) | ||
| 2058 | { | ||
| 2059 | struct kimage *image = kbuf->image; | ||
| 2060 | unsigned long temp_start, temp_end; | ||
| 2061 | |||
| 2062 | temp_start = max(start, kbuf->buf_min); | ||
| 2063 | |||
| 2064 | do { | ||
| 2065 | temp_start = ALIGN(temp_start, kbuf->buf_align); | ||
| 2066 | temp_end = temp_start + kbuf->memsz - 1; | ||
| 2067 | |||
| 2068 | if (temp_end > end || temp_end > kbuf->buf_max) | ||
| 2069 | return 0; | ||
| 2070 | /* | ||
| 2071 | * Make sure this does not conflict with any of existing | ||
| 2072 | * segments | ||
| 2073 | */ | ||
| 2074 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
| 2075 | temp_start = temp_start + PAGE_SIZE; | ||
| 2076 | continue; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | /* We found a suitable memory range */ | ||
| 2080 | break; | ||
| 2081 | } while (1); | ||
| 2082 | |||
| 2083 | /* If we are here, we found a suitable memory range */ | ||
| 2084 | kbuf->mem = temp_start; | ||
| 2085 | |||
| 2086 | /* Success, stop navigating through remaining System RAM ranges */ | ||
| 2087 | return 1; | ||
| 2088 | } | ||
| 2089 | |||
| 2090 | static int locate_mem_hole_callback(u64 start, u64 end, void *arg) | ||
| 2091 | { | ||
| 2092 | struct kexec_buf *kbuf = (struct kexec_buf *)arg; | ||
| 2093 | unsigned long sz = end - start + 1; | ||
| 2094 | |||
| 2095 | /* Returning 0 moves the walk on to the next memory range */ | ||
| 2096 | if (sz < kbuf->memsz) | ||
| 2097 | return 0; | ||
| 2098 | |||
| 2099 | if (end < kbuf->buf_min || start > kbuf->buf_max) | ||
| 2100 | return 0; | ||
| 2101 | |||
| 2102 | /* | ||
| 2103 | * Allocate memory top-down within the RAM range if requested, | ||
| 2104 | * otherwise bottom-up. | ||
| 2105 | */ | ||
| 2106 | if (kbuf->top_down) | ||
| 2107 | return locate_mem_hole_top_down(start, end, kbuf); | ||
| 2108 | return locate_mem_hole_bottom_up(start, end, kbuf); | ||
| 2109 | } | ||
| 2110 | |||
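The two helpers above slide a candidate window down (or up) by one page whenever it overlaps an already-placed segment, and stop at the first window that fits. A self-contained user-space model of the top-down variant, with a made-up busy-range list standing in for kimage_is_destination_range() (illustration only):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

struct range { unsigned long start, end; };

/* Pretend this range is already claimed by an existing segment. */
static const struct range busy[] = { { 0x200000, 0x2fffff } };

static bool conflicts(unsigned long start, unsigned long end)
{
	for (unsigned int i = 0; i < sizeof(busy) / sizeof(busy[0]); i++)
		if (start <= busy[i].end && end >= busy[i].start)
			return true;
	return false;
}

/* Returns the chosen start address, or 0 if no hole fits. */
static unsigned long place_top_down(unsigned long start, unsigned long end,
				    unsigned long memsz, unsigned long align)
{
	unsigned long t_start = end - memsz, t_end;

	do {
		t_start &= ~(align - 1);		/* align down */
		if (t_start < start)
			return 0;
		t_end = t_start + memsz - 1;
		if (conflicts(t_start, t_end)) {
			t_start -= PAGE_SIZE;		/* slide down, retry */
			continue;
		}
		return t_start;
	} while (1);
}

int main(void)
{
	printf("placed at %#lx\n",
	       place_top_down(0x100000, 0x3fffff, 0x40000, PAGE_SIZE));
	return 0;
}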
| 2111 | /* | ||
| 2112 | * Helper function for placing a buffer in a kexec segment. This assumes | ||
| 2113 | * that kexec_mutex is held. | ||
| 2114 | */ | ||
| 2115 | int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, | ||
| 2116 | unsigned long memsz, unsigned long buf_align, | ||
| 2117 | unsigned long buf_min, unsigned long buf_max, | ||
| 2118 | bool top_down, unsigned long *load_addr) | ||
| 2119 | { | ||
| 2120 | |||
| 2121 | struct kexec_segment *ksegment; | ||
| 2122 | struct kexec_buf buf, *kbuf; | ||
| 2123 | int ret; | ||
| 2124 | |||
| 2125 | /* Currently adding segment this way is allowed only in file mode */ | ||
| 2126 | if (!image->file_mode) | ||
| 2127 | return -EINVAL; | ||
| 2128 | |||
| 2129 | if (image->nr_segments >= KEXEC_SEGMENT_MAX) | ||
| 2130 | return -EINVAL; | ||
| 2131 | |||
| 2132 | /* | ||
| 2133 | * Make sure we are not trying to add a buffer after allocating | ||
| 2134 | * control pages. All segments need to be placed before any | ||
| 2135 | * control pages are allocated, since the control page allocation | ||
| 2136 | * logic walks the list of segments to make sure there are no | ||
| 2137 | * destination overlaps. | ||
| 2138 | */ | ||
| 2139 | if (!list_empty(&image->control_pages)) { | ||
| 2140 | WARN_ON(1); | ||
| 2141 | return -EINVAL; | ||
| 2142 | } | ||
| 2143 | |||
| 2144 | memset(&buf, 0, sizeof(struct kexec_buf)); | ||
| 2145 | kbuf = &buf; | ||
| 2146 | kbuf->image = image; | ||
| 2147 | kbuf->buffer = buffer; | ||
| 2148 | kbuf->bufsz = bufsz; | ||
| 2149 | |||
| 2150 | kbuf->memsz = ALIGN(memsz, PAGE_SIZE); | ||
| 2151 | kbuf->buf_align = max(buf_align, PAGE_SIZE); | ||
| 2152 | kbuf->buf_min = buf_min; | ||
| 2153 | kbuf->buf_max = buf_max; | ||
| 2154 | kbuf->top_down = top_down; | ||
| 2155 | |||
| 2156 | /* Walk the RAM ranges and allocate a suitable range for the buffer */ | ||
| 2157 | if (image->type == KEXEC_TYPE_CRASH) | ||
| 2158 | ret = walk_iomem_res("Crash kernel", | ||
| 2159 | IORESOURCE_MEM | IORESOURCE_BUSY, | ||
| 2160 | crashk_res.start, crashk_res.end, kbuf, | ||
| 2161 | locate_mem_hole_callback); | ||
| 2162 | else | ||
| 2163 | ret = walk_system_ram_res(0, -1, kbuf, | ||
| 2164 | locate_mem_hole_callback); | ||
| 2165 | if (ret != 1) { | ||
| 2166 | /* A suitable memory range could not be found for buffer */ | ||
| 2167 | return -EADDRNOTAVAIL; | ||
| 2168 | } | ||
| 2169 | |||
| 2170 | /* Found a suitable memory range */ | ||
| 2171 | ksegment = &image->segment[image->nr_segments]; | ||
| 2172 | ksegment->kbuf = kbuf->buffer; | ||
| 2173 | ksegment->bufsz = kbuf->bufsz; | ||
| 2174 | ksegment->mem = kbuf->mem; | ||
| 2175 | ksegment->memsz = kbuf->memsz; | ||
| 2176 | image->nr_segments++; | ||
| 2177 | *load_addr = ksegment->mem; | ||
| 2178 | return 0; | ||
| 2179 | } | ||
| 2180 | |||
| 2181 | /* Calculate and store the digest of segments */ | ||
| 2182 | static int kexec_calculate_store_digests(struct kimage *image) | ||
| 2183 | { | ||
| 2184 | struct crypto_shash *tfm; | ||
| 2185 | struct shash_desc *desc; | ||
| 2186 | int ret = 0, i, j, zero_buf_sz, sha_region_sz; | ||
| 2187 | size_t desc_size, nullsz; | ||
| 2188 | char *digest; | ||
| 2189 | void *zero_buf; | ||
| 2190 | struct kexec_sha_region *sha_regions; | ||
| 2191 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2192 | |||
| 2193 | zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); | ||
| 2194 | zero_buf_sz = PAGE_SIZE; | ||
| 2195 | |||
| 2196 | tfm = crypto_alloc_shash("sha256", 0, 0); | ||
| 2197 | if (IS_ERR(tfm)) { | ||
| 2198 | ret = PTR_ERR(tfm); | ||
| 2199 | goto out; | ||
| 2200 | } | ||
| 2201 | |||
| 2202 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
| 2203 | desc = kzalloc(desc_size, GFP_KERNEL); | ||
| 2204 | if (!desc) { | ||
| 2205 | ret = -ENOMEM; | ||
| 2206 | goto out_free_tfm; | ||
| 2207 | } | ||
| 2208 | |||
| 2209 | sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); | ||
| 2210 | sha_regions = vzalloc(sha_region_sz); | ||
| 2211 | if (!sha_regions) | ||
| 2212 | goto out_free_desc; | ||
| 2213 | |||
| 2214 | desc->tfm = tfm; | ||
| 2215 | desc->flags = 0; | ||
| 2216 | |||
| 2217 | ret = crypto_shash_init(desc); | ||
| 2218 | if (ret < 0) | ||
| 2219 | goto out_free_sha_regions; | ||
| 2220 | |||
| 2221 | digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); | ||
| 2222 | if (!digest) { | ||
| 2223 | ret = -ENOMEM; | ||
| 2224 | goto out_free_sha_regions; | ||
| 2225 | } | ||
| 2226 | |||
| 2227 | for (j = i = 0; i < image->nr_segments; i++) { | ||
| 2228 | struct kexec_segment *ksegment; | ||
| 2229 | |||
| 2230 | ksegment = &image->segment[i]; | ||
| 2231 | /* | ||
| 2232 | * Skip purgatory as it will be modified once we put digest | ||
| 2233 | * info in purgatory. | ||
| 2234 | */ | ||
| 2235 | if (ksegment->kbuf == pi->purgatory_buf) | ||
| 2236 | continue; | ||
| 2237 | |||
| 2238 | ret = crypto_shash_update(desc, ksegment->kbuf, | ||
| 2239 | ksegment->bufsz); | ||
| 2240 | if (ret) | ||
| 2241 | break; | ||
| 2242 | |||
| 2243 | /* | ||
| 2244 | * Assume rest of the buffer is filled with zero and | ||
| 2245 | * update digest accordingly. | ||
| 2246 | */ | ||
| 2247 | nullsz = ksegment->memsz - ksegment->bufsz; | ||
| 2248 | while (nullsz) { | ||
| 2249 | unsigned long bytes = nullsz; | ||
| 2250 | |||
| 2251 | if (bytes > zero_buf_sz) | ||
| 2252 | bytes = zero_buf_sz; | ||
| 2253 | ret = crypto_shash_update(desc, zero_buf, bytes); | ||
| 2254 | if (ret) | ||
| 2255 | break; | ||
| 2256 | nullsz -= bytes; | ||
| 2257 | } | ||
| 2258 | |||
| 2259 | if (ret) | ||
| 2260 | break; | ||
| 2261 | |||
| 2262 | sha_regions[j].start = ksegment->mem; | ||
| 2263 | sha_regions[j].len = ksegment->memsz; | ||
| 2264 | j++; | ||
| 2265 | } | ||
| 2266 | |||
| 2267 | if (!ret) { | ||
| 2268 | ret = crypto_shash_final(desc, digest); | ||
| 2269 | if (ret) | ||
| 2270 | goto out_free_digest; | ||
| 2271 | ret = kexec_purgatory_get_set_symbol(image, "sha_regions", | ||
| 2272 | sha_regions, sha_region_sz, 0); | ||
| 2273 | if (ret) | ||
| 2274 | goto out_free_digest; | ||
| 2275 | |||
| 2276 | ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", | ||
| 2277 | digest, SHA256_DIGEST_SIZE, 0); | ||
| 2278 | if (ret) | ||
| 2279 | goto out_free_digest; | ||
| 2280 | } | ||
| 2281 | |||
| 2282 | out_free_digest: | ||
| 2283 | kfree(digest); | ||
| 2284 | out_free_sha_regions: | ||
| 2285 | vfree(sha_regions); | ||
| 2286 | out_free_desc: | ||
| 2287 | kfree(desc); | ||
| 2288 | out_free_tfm: | ||
| 2289 | crypto_free_shash(tfm); | ||
| 2290 | out: | ||
| 2291 | return ret; | ||
| 2292 | } | ||
| 2293 | |||
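The digest above covers each segment's data followed by enough zero bytes to reach memsz, hashed a page at a time, so the checksum matches the fully loaded (zero-padded) segment. A user-space model of that padding scheme using OpenSSL's legacy SHA256_* helpers, purely for illustration; the kernel code uses the crypto shash API instead, and this sketch needs -lcrypto to build:

#include <openssl/sha.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static void digest_segment(const unsigned char *buf, size_t bufsz,
			   size_t memsz, unsigned char out[SHA256_DIGEST_LENGTH])
{
	static const unsigned char zero_page[PAGE_SIZE];	/* all zeroes */
	SHA256_CTX ctx;
	size_t nullsz = memsz - bufsz;

	SHA256_Init(&ctx);
	SHA256_Update(&ctx, buf, bufsz);
	/* Hash the zero padding up to memsz, one page at a time. */
	while (nullsz) {
		size_t bytes = nullsz > PAGE_SIZE ? PAGE_SIZE : nullsz;

		SHA256_Update(&ctx, zero_page, bytes);
		nullsz -= bytes;
	}
	SHA256_Final(out, &ctx);
}

int main(void)
{
	unsigned char md[SHA256_DIGEST_LENGTH];
	const unsigned char data[] = "segment payload";

	digest_segment(data, sizeof(data), 2 * PAGE_SIZE, md);
	for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	printf("\n");
	return 0;
}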
| 2294 | /* Actually load purgatory. A lot of this code is taken from kexec-tools. */ | ||
| 2295 | static int __kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
| 2296 | unsigned long max, int top_down) | ||
| 2297 | { | ||
| 2298 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2299 | unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; | ||
| 2300 | unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; | ||
| 2301 | unsigned char *buf_addr, *src; | ||
| 2302 | int i, ret = 0, entry_sidx = -1; | ||
| 2303 | const Elf_Shdr *sechdrs_c; | ||
| 2304 | Elf_Shdr *sechdrs = NULL; | ||
| 2305 | void *purgatory_buf = NULL; | ||
| 2306 | |||
| 2307 | /* | ||
| 2308 | * sechdrs_c points to the section headers in purgatory and is read | ||
| 2309 | * only. No modifications allowed. | ||
| 2310 | */ | ||
| 2311 | sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; | ||
| 2312 | |||
| 2313 | /* | ||
| 2314 | * We cannot modify sechdrs_c[] or its fields; it is read only. | ||
| 2315 | * Make a local copy in which we can store some temporary data and | ||
| 2316 | * free it at the end. We need to modify the ->sh_addr and | ||
| 2317 | * ->sh_offset fields to keep track of the permanent and temporary | ||
| 2318 | * locations of sections. | ||
| 2319 | */ | ||
| 2320 | sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
| 2321 | if (!sechdrs) | ||
| 2322 | return -ENOMEM; | ||
| 2323 | |||
| 2324 | memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
| 2325 | |||
| 2326 | /* | ||
| 2327 | * We end up with multiple copies of the sections. The first copy is | ||
| 2328 | * the one embedded in the kernel in a read-only section. Some of | ||
| 2329 | * these sections will be copied to a temporary buffer and relocated, | ||
| 2330 | * and those sections are finally copied to their destination at | ||
| 2331 | * segment load time. | ||
| 2332 | * | ||
| 2333 | * Use ->sh_offset to reflect section address in memory. It will | ||
| 2334 | * point to original read only copy if section is not allocatable. | ||
| 2335 | * Otherwise it will point to temporary copy which will be relocated. | ||
| 2336 | * | ||
| 2337 | * Use ->sh_addr to contain final address of the section where it | ||
| 2338 | * will go during execution time. | ||
| 2339 | */ | ||
| 2340 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
| 2341 | if (sechdrs[i].sh_type == SHT_NOBITS) | ||
| 2342 | continue; | ||
| 2343 | |||
| 2344 | sechdrs[i].sh_offset = (unsigned long)pi->ehdr + | ||
| 2345 | sechdrs[i].sh_offset; | ||
| 2346 | } | ||
| 2347 | |||
| 2348 | /* | ||
| 2349 | * Identify entry point section and make entry relative to section | ||
| 2350 | * start. | ||
| 2351 | */ | ||
| 2352 | entry = pi->ehdr->e_entry; | ||
| 2353 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
| 2354 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
| 2355 | continue; | ||
| 2356 | |||
| 2357 | if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) | ||
| 2358 | continue; | ||
| 2359 | |||
| 2360 | /* Make entry section relative */ | ||
| 2361 | if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && | ||
| 2362 | ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > | ||
| 2363 | pi->ehdr->e_entry)) { | ||
| 2364 | entry_sidx = i; | ||
| 2365 | entry -= sechdrs[i].sh_addr; | ||
| 2366 | break; | ||
| 2367 | } | ||
| 2368 | } | ||
| 2369 | |||
| 2370 | /* Determine how much memory is needed to load relocatable object. */ | ||
| 2371 | buf_align = 1; | ||
| 2372 | bss_align = 1; | ||
| 2373 | buf_sz = 0; | ||
| 2374 | bss_sz = 0; | ||
| 2375 | |||
| 2376 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
| 2377 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
| 2378 | continue; | ||
| 2379 | |||
| 2380 | align = sechdrs[i].sh_addralign; | ||
| 2381 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
| 2382 | if (buf_align < align) | ||
| 2383 | buf_align = align; | ||
| 2384 | buf_sz = ALIGN(buf_sz, align); | ||
| 2385 | buf_sz += sechdrs[i].sh_size; | ||
| 2386 | } else { | ||
| 2387 | /* bss section */ | ||
| 2388 | if (bss_align < align) | ||
| 2389 | bss_align = align; | ||
| 2390 | bss_sz = ALIGN(bss_sz, align); | ||
| 2391 | bss_sz += sechdrs[i].sh_size; | ||
| 2392 | } | ||
| 2393 | } | ||
| 2394 | |||
| 2395 | /* Determine the bss padding required to align bss properly */ | ||
| 2396 | bss_pad = 0; | ||
| 2397 | if (buf_sz & (bss_align - 1)) | ||
| 2398 | bss_pad = bss_align - (buf_sz & (bss_align - 1)); | ||
| 2399 | |||
| 2400 | memsz = buf_sz + bss_pad + bss_sz; | ||
| 2401 | |||
| 2402 | /* Allocate buffer for purgatory */ | ||
| 2403 | purgatory_buf = vzalloc(buf_sz); | ||
| 2404 | if (!purgatory_buf) { | ||
| 2405 | ret = -ENOMEM; | ||
| 2406 | goto out; | ||
| 2407 | } | ||
| 2408 | |||
| 2409 | if (buf_align < bss_align) | ||
| 2410 | buf_align = bss_align; | ||
| 2411 | |||
| 2412 | /* Add buffer to segment list */ | ||
| 2413 | ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, | ||
| 2414 | buf_align, min, max, top_down, | ||
| 2415 | &pi->purgatory_load_addr); | ||
| 2416 | if (ret) | ||
| 2417 | goto out; | ||
| 2418 | |||
| 2419 | /* Load SHF_ALLOC sections */ | ||
| 2420 | buf_addr = purgatory_buf; | ||
| 2421 | load_addr = curr_load_addr = pi->purgatory_load_addr; | ||
| 2422 | bss_addr = load_addr + buf_sz + bss_pad; | ||
| 2423 | |||
| 2424 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
| 2425 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
| 2426 | continue; | ||
| 2427 | |||
| 2428 | align = sechdrs[i].sh_addralign; | ||
| 2429 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
| 2430 | curr_load_addr = ALIGN(curr_load_addr, align); | ||
| 2431 | offset = curr_load_addr - load_addr; | ||
| 2432 | /* We already modified ->sh_offset to keep the source address */ | ||
| 2433 | src = (char *) sechdrs[i].sh_offset; | ||
| 2434 | memcpy(buf_addr + offset, src, sechdrs[i].sh_size); | ||
| 2435 | |||
| 2436 | /* Store load address and source address of section */ | ||
| 2437 | sechdrs[i].sh_addr = curr_load_addr; | ||
| 2438 | |||
| 2439 | /* | ||
| 2440 | * This section got copied to temporary buffer. Update | ||
| 2441 | * ->sh_offset accordingly. | ||
| 2442 | */ | ||
| 2443 | sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); | ||
| 2444 | |||
| 2445 | /* Advance to the next address */ | ||
| 2446 | curr_load_addr += sechdrs[i].sh_size; | ||
| 2447 | } else { | ||
| 2448 | bss_addr = ALIGN(bss_addr, align); | ||
| 2449 | sechdrs[i].sh_addr = bss_addr; | ||
| 2450 | bss_addr += sechdrs[i].sh_size; | ||
| 2451 | } | ||
| 2452 | } | ||
| 2453 | |||
| 2454 | /* Update entry point based on load address of text section */ | ||
| 2455 | if (entry_sidx >= 0) | ||
| 2456 | entry += sechdrs[entry_sidx].sh_addr; | ||
| 2457 | |||
| 2458 | /* Make kernel jump to purgatory after shutdown */ | ||
| 2459 | image->start = entry; | ||
| 2460 | |||
| 2461 | /* Used later to get/set symbol values */ | ||
| 2462 | pi->sechdrs = sechdrs; | ||
| 2463 | |||
| 2464 | /* | ||
| 2465 | * Used later to identify which segment holds purgatory and exclude | ||
| 2466 | * it from checksumming. | ||
| 2467 | */ | ||
| 2468 | pi->purgatory_buf = purgatory_buf; | ||
| 2469 | return ret; | ||
| 2470 | out: | ||
| 2471 | vfree(sechdrs); | ||
| 2472 | vfree(purgatory_buf); | ||
| 2473 | return ret; | ||
| 2474 | } | ||
| 2475 | |||
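The sizing pass above keeps PROGBITS and NOBITS (bss) totals separate, tracks the largest alignment seen in each group, and then pads the data area so that bss starts suitably aligned. A small stand-alone sketch of that bookkeeping with invented section sizes and alignments:

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* (size, align, is_bss) triples standing in for SHF_ALLOC sections */
	struct { unsigned long size, align; int is_bss; } sec[] = {
		{ 0x1a0, 16, 0 },	/* .text */
		{ 0x40,   8, 0 },	/* .data */
		{ 0x100, 64, 1 },	/* .bss  */
	};
	unsigned long buf_sz = 0, bss_sz = 0, buf_align = 1, bss_align = 1;

	for (unsigned int i = 0; i < sizeof(sec) / sizeof(sec[0]); i++) {
		if (!sec[i].is_bss) {
			if (buf_align < sec[i].align)
				buf_align = sec[i].align;
			buf_sz = ALIGN(buf_sz, sec[i].align) + sec[i].size;
		} else {
			if (bss_align < sec[i].align)
				bss_align = sec[i].align;
			bss_sz = ALIGN(bss_sz, sec[i].align) + sec[i].size;
		}
	}

	/* Pad the data area so bss begins at a bss_align boundary. */
	unsigned long bss_pad = (buf_sz & (bss_align - 1)) ?
				bss_align - (buf_sz & (bss_align - 1)) : 0;

	printf("buf_sz=%#lx bss_pad=%#lx bss_sz=%#lx memsz=%#lx\n",
	       buf_sz, bss_pad, bss_sz, buf_sz + bss_pad + bss_sz);
	return 0;
}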
| 2476 | static int kexec_apply_relocations(struct kimage *image) | ||
| 2477 | { | ||
| 2478 | int i, ret; | ||
| 2479 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2480 | Elf_Shdr *sechdrs = pi->sechdrs; | ||
| 2481 | |||
| 2482 | /* Apply relocations */ | ||
| 2483 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
| 2484 | Elf_Shdr *section, *symtab; | ||
| 2485 | |||
| 2486 | if (sechdrs[i].sh_type != SHT_RELA && | ||
| 2487 | sechdrs[i].sh_type != SHT_REL) | ||
| 2488 | continue; | ||
| 2489 | |||
| 2490 | /* | ||
| 2491 | * For sections of type SHT_RELA/SHT_REL, | ||
| 2492 | * ->sh_link contains the section header index of the | ||
| 2493 | * associated symbol table, and ->sh_info contains the | ||
| 2494 | * index of the section to which the relocations apply. | ||
| 2495 | */ | ||
| 2496 | if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || | ||
| 2497 | sechdrs[i].sh_link >= pi->ehdr->e_shnum) | ||
| 2498 | return -ENOEXEC; | ||
| 2499 | |||
| 2500 | section = &sechdrs[sechdrs[i].sh_info]; | ||
| 2501 | symtab = &sechdrs[sechdrs[i].sh_link]; | ||
| 2502 | |||
| 2503 | if (!(section->sh_flags & SHF_ALLOC)) | ||
| 2504 | continue; | ||
| 2505 | |||
| 2506 | /* | ||
| 2507 | * symtab->sh_link contains the section header index of the | ||
| 2508 | * associated string table. | ||
| 2509 | */ | ||
| 2510 | if (symtab->sh_link >= pi->ehdr->e_shnum) | ||
| 2511 | /* Invalid section number? */ | ||
| 2512 | continue; | ||
| 2513 | |||
| 2514 | /* | ||
| 2515 | * The respective architecture needs to provide support for applying | ||
| 2516 | * relocations of type SHT_RELA/SHT_REL. | ||
| 2517 | */ | ||
| 2518 | if (sechdrs[i].sh_type == SHT_RELA) | ||
| 2519 | ret = arch_kexec_apply_relocations_add(pi->ehdr, | ||
| 2520 | sechdrs, i); | ||
| 2521 | else if (sechdrs[i].sh_type == SHT_REL) | ||
| 2522 | ret = arch_kexec_apply_relocations(pi->ehdr, | ||
| 2523 | sechdrs, i); | ||
| 2524 | if (ret) | ||
| 2525 | return ret; | ||
| 2526 | } | ||
| 2527 | |||
| 2528 | return 0; | ||
| 2529 | } | ||
| 2530 | |||
| 2531 | /* Load relocatable purgatory object and relocate it appropriately */ | ||
| 2532 | int kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
| 2533 | unsigned long max, int top_down, | ||
| 2534 | unsigned long *load_addr) | ||
| 2535 | { | ||
| 2536 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2537 | int ret; | ||
| 2538 | |||
| 2539 | if (kexec_purgatory_size <= 0) | ||
| 2540 | return -EINVAL; | ||
| 2541 | |||
| 2542 | if (kexec_purgatory_size < sizeof(Elf_Ehdr)) | ||
| 2543 | return -ENOEXEC; | ||
| 2544 | |||
| 2545 | pi->ehdr = (Elf_Ehdr *)kexec_purgatory; | ||
| 2546 | |||
| 2547 | if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
| 2548 | || pi->ehdr->e_type != ET_REL | ||
| 2549 | || !elf_check_arch(pi->ehdr) | ||
| 2550 | || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) | ||
| 2551 | return -ENOEXEC; | ||
| 2552 | |||
| 2553 | if (pi->ehdr->e_shoff >= kexec_purgatory_size | ||
| 2554 | || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > | ||
| 2555 | kexec_purgatory_size - pi->ehdr->e_shoff)) | ||
| 2556 | return -ENOEXEC; | ||
| 2557 | |||
| 2558 | ret = __kexec_load_purgatory(image, min, max, top_down); | ||
| 2559 | if (ret) | ||
| 2560 | return ret; | ||
| 2561 | |||
| 2562 | ret = kexec_apply_relocations(image); | ||
| 2563 | if (ret) | ||
| 2564 | goto out; | ||
| 2565 | |||
| 2566 | *load_addr = pi->purgatory_load_addr; | ||
| 2567 | return 0; | ||
| 2568 | out: | ||
| 2569 | vfree(pi->sechdrs); | ||
| 2570 | vfree(pi->purgatory_buf); | ||
| 2571 | return ret; | ||
| 2572 | } | ||
| 2573 | |||
| 2574 | static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, | ||
| 2575 | const char *name) | ||
| 2576 | { | ||
| 2577 | Elf_Sym *syms; | ||
| 2578 | Elf_Shdr *sechdrs; | ||
| 2579 | Elf_Ehdr *ehdr; | ||
| 2580 | int i, k; | ||
| 2581 | const char *strtab; | ||
| 2582 | |||
| 2583 | if (!pi->sechdrs || !pi->ehdr) | ||
| 2584 | return NULL; | ||
| 2585 | |||
| 2586 | sechdrs = pi->sechdrs; | ||
| 2587 | ehdr = pi->ehdr; | ||
| 2588 | |||
| 2589 | for (i = 0; i < ehdr->e_shnum; i++) { | ||
| 2590 | if (sechdrs[i].sh_type != SHT_SYMTAB) | ||
| 2591 | continue; | ||
| 2592 | |||
| 2593 | if (sechdrs[i].sh_link >= ehdr->e_shnum) | ||
| 2594 | /* Invalid strtab section number */ | ||
| 2595 | continue; | ||
| 2596 | strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; | ||
| 2597 | syms = (Elf_Sym *)sechdrs[i].sh_offset; | ||
| 2598 | |||
| 2599 | /* Go through symbols for a match */ | ||
| 2600 | for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { | ||
| 2601 | if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) | ||
| 2602 | continue; | ||
| 2603 | |||
| 2604 | if (strcmp(strtab + syms[k].st_name, name) != 0) | ||
| 2605 | continue; | ||
| 2606 | |||
| 2607 | if (syms[k].st_shndx == SHN_UNDEF || | ||
| 2608 | syms[k].st_shndx >= ehdr->e_shnum) { | ||
| 2609 | pr_debug("Symbol: %s has bad section index %d.\n", | ||
| 2610 | name, syms[k].st_shndx); | ||
| 2611 | return NULL; | ||
| 2612 | } | ||
| 2613 | |||
| 2614 | /* Found the symbol we are looking for */ | ||
| 2615 | return &syms[k]; | ||
| 2616 | } | ||
| 2617 | } | ||
| 2618 | |||
| 2619 | return NULL; | ||
| 2620 | } | ||
| 2621 | |||
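The lookup above is the standard ELF walk: find a SHT_SYMTAB section, follow its sh_link to the string table, and scan for a global symbol with a matching name. A user-space version of the same walk over an mmap()ed 64-bit relocatable object (minimal error handling; illustration only):

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static Elf64_Sym *find_global_symbol(Elf64_Ehdr *ehdr, const char *name)
{
	Elf64_Shdr *sechdrs = (Elf64_Shdr *)((char *)ehdr + ehdr->e_shoff);

	for (int i = 0; i < ehdr->e_shnum; i++) {
		const char *strtab;
		Elf64_Sym *syms;
		size_t nsyms, k;

		if (sechdrs[i].sh_type != SHT_SYMTAB)
			continue;

		/* sh_link of a symtab names its string table section. */
		strtab = (char *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
		syms = (Elf64_Sym *)((char *)ehdr + sechdrs[i].sh_offset);
		nsyms = sechdrs[i].sh_size / sizeof(Elf64_Sym);

		for (k = 0; k < nsyms; k++)
			if (ELF64_ST_BIND(syms[k].st_info) == STB_GLOBAL &&
			    strcmp(strtab + syms[k].st_name, name) == 0)
				return &syms[k];
	}
	return NULL;
}

int main(int argc, char **argv)
{
	struct stat st;
	Elf64_Sym *sym;
	void *map;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <object.o> <symbol>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	sym = find_global_symbol(map, argv[2]);
	if (sym)
		printf("%s: section %u, value %#lx, size %lu\n", argv[2],
		       (unsigned int)sym->st_shndx,
		       (unsigned long)sym->st_value,
		       (unsigned long)sym->st_size);
	else
		printf("%s: not found\n", argv[2]);
	return 0;
}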
| 2622 | void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) | ||
| 2623 | { | ||
| 2624 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2625 | Elf_Sym *sym; | ||
| 2626 | Elf_Shdr *sechdr; | ||
| 2627 | |||
| 2628 | sym = kexec_purgatory_find_symbol(pi, name); | ||
| 2629 | if (!sym) | ||
| 2630 | return ERR_PTR(-EINVAL); | ||
| 2631 | |||
| 2632 | sechdr = &pi->sechdrs[sym->st_shndx]; | ||
| 2633 | |||
| 2634 | /* | ||
| 2635 | * Returns the address where the symbol will finally be loaded after | ||
| 2636 | * kexec_load_segment() | ||
| 2637 | */ | ||
| 2638 | return (void *)(sechdr->sh_addr + sym->st_value); | ||
| 2639 | } | ||
| 2640 | |||
| 2641 | /* | ||
| 2642 | * Get or set the value of a symbol. If "get_value" is true, the symbol | ||
| 2643 | * value is returned in buf; otherwise the symbol value is set from buf. | ||
| 2644 | */ | ||
| 2645 | int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, | ||
| 2646 | void *buf, unsigned int size, bool get_value) | ||
| 2647 | { | ||
| 2648 | Elf_Sym *sym; | ||
| 2649 | Elf_Shdr *sechdrs; | ||
| 2650 | struct purgatory_info *pi = &image->purgatory_info; | ||
| 2651 | char *sym_buf; | ||
| 2652 | |||
| 2653 | sym = kexec_purgatory_find_symbol(pi, name); | ||
| 2654 | if (!sym) | ||
| 2655 | return -EINVAL; | ||
| 2656 | |||
| 2657 | if (sym->st_size != size) { | ||
| 2658 | pr_err("symbol %s size mismatch: expected %lu actual %u\n", | ||
| 2659 | name, (unsigned long)sym->st_size, size); | ||
| 2660 | return -EINVAL; | ||
| 2661 | } | ||
| 2662 | |||
| 2663 | sechdrs = pi->sechdrs; | ||
| 2664 | |||
| 2665 | if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { | ||
| 2666 | pr_err("symbol %s is in a bss section. Cannot %s\n", name, | ||
| 2667 | get_value ? "get" : "set"); | ||
| 2668 | return -EINVAL; | ||
| 2669 | } | ||
| 2670 | |||
| 2671 | sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + | ||
| 2672 | sym->st_value; | ||
| 2673 | |||
| 2674 | if (get_value) | ||
| 2675 | memcpy((void *)buf, sym_buf, size); | ||
| 2676 | else | ||
| 2677 | memcpy((void *)sym_buf, buf, size); | ||
| 2678 | |||
| 2679 | return 0; | ||
| 2680 | } | ||
| 2681 | #endif /* CONFIG_KEXEC_FILE */ | ||
| 2682 | |||
| 1635 | /* | 2683 | /* |
| 1636 | * Move into place and start executing a preloaded standalone | 2684 | * Move into place and start executing a preloaded standalone |
| 1637 | * executable. If nothing was preloaded return an error. | 2685 | * executable. If nothing was preloaded return an error. |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..80f7a6d00519 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 196 | EXPORT_SYMBOL(__request_module); | 196 | EXPORT_SYMBOL(__request_module); |
| 197 | #endif /* CONFIG_MODULES */ | 197 | #endif /* CONFIG_MODULES */ |
| 198 | 198 | ||
| 199 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
| 200 | { | ||
| 201 | if (info->cleanup) | ||
| 202 | (*info->cleanup)(info); | ||
| 203 | kfree(info); | ||
| 204 | } | ||
| 205 | |||
| 206 | static void umh_complete(struct subprocess_info *sub_info) | ||
| 207 | { | ||
| 208 | struct completion *comp = xchg(&sub_info->complete, NULL); | ||
| 209 | /* | ||
| 210 | * See call_usermodehelper_exec(). If xchg() returns NULL | ||
| 211 | * we own sub_info; the UMH_KILLABLE caller has gone away | ||
| 212 | * or the caller used UMH_NO_WAIT. | ||
| 213 | */ | ||
| 214 | if (comp) | ||
| 215 | complete(comp); | ||
| 216 | else | ||
| 217 | call_usermodehelper_freeinfo(sub_info); | ||
| 218 | } | ||
| 219 | |||
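The xchg() in umh_complete() is an ownership handoff: whichever side swaps the completion pointer to NULL first gets to signal the waiter, and the side that reads back NULL knows the other party is gone and must free sub_info itself. A user-space model of the same handoff using C11 atomics (illustration only; not the kernel primitives):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sub_info {
	_Atomic(void *) complete;	/* NULL once one side has claimed it */
	int retval;
};

static void helper_done(struct sub_info *s)
{
	void *comp = atomic_exchange(&s->complete, NULL);

	if (comp)
		printf("waiter still present, signal completion %p\n", comp);
	else
		free(s);	/* waiter gone (or never waited): we own it */
}

int main(void)
{
	struct sub_info *s = malloc(sizeof(*s));
	int done_marker;	/* stands in for a struct completion */

	if (!s)
		return 1;
	atomic_init(&s->complete, &done_marker);
	helper_done(s);		/* first exchange wins: waiter gets signalled */
	helper_done(s);		/* second exchange sees NULL and frees s */
	return 0;
}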
| 199 | /* | 220 | /* |
| 200 | * This is the task which runs the usermode application | 221 | * This is the task which runs the usermode application |
| 201 | */ | 222 | */ |
| 202 | static int ____call_usermodehelper(void *data) | 223 | static int ____call_usermodehelper(void *data) |
| 203 | { | 224 | { |
| 204 | struct subprocess_info *sub_info = data; | 225 | struct subprocess_info *sub_info = data; |
| 226 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
| 205 | struct cred *new; | 227 | struct cred *new; |
| 206 | int retval; | 228 | int retval; |
| 207 | 229 | ||
| @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) | |||
| 221 | retval = -ENOMEM; | 243 | retval = -ENOMEM; |
| 222 | new = prepare_kernel_cred(current); | 244 | new = prepare_kernel_cred(current); |
| 223 | if (!new) | 245 | if (!new) |
| 224 | goto fail; | 246 | goto out; |
| 225 | 247 | ||
| 226 | spin_lock(&umh_sysctl_lock); | 248 | spin_lock(&umh_sysctl_lock); |
| 227 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); | 249 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); |
| @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) | |||
| 233 | retval = sub_info->init(sub_info, new); | 255 | retval = sub_info->init(sub_info, new); |
| 234 | if (retval) { | 256 | if (retval) { |
| 235 | abort_creds(new); | 257 | abort_creds(new); |
| 236 | goto fail; | 258 | goto out; |
| 237 | } | 259 | } |
| 238 | } | 260 | } |
| 239 | 261 | ||
| @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) | |||
| 242 | retval = do_execve(getname_kernel(sub_info->path), | 264 | retval = do_execve(getname_kernel(sub_info->path), |
| 243 | (const char __user *const __user *)sub_info->argv, | 265 | (const char __user *const __user *)sub_info->argv, |
| 244 | (const char __user *const __user *)sub_info->envp); | 266 | (const char __user *const __user *)sub_info->envp); |
| 267 | out: | ||
| 268 | sub_info->retval = retval; | ||
| 269 | /* wait_for_helper() will call umh_complete() if UMH_WAIT_PROC. */ | ||
| 270 | if (wait != UMH_WAIT_PROC) | ||
| 271 | umh_complete(sub_info); | ||
| 245 | if (!retval) | 272 | if (!retval) |
| 246 | return 0; | 273 | return 0; |
| 247 | |||
| 248 | /* Exec failed? */ | ||
| 249 | fail: | ||
| 250 | sub_info->retval = retval; | ||
| 251 | do_exit(0); | 274 | do_exit(0); |
| 252 | } | 275 | } |
| 253 | 276 | ||
| @@ -258,26 +281,6 @@ static int call_helper(void *data) | |||
| 258 | return ____call_usermodehelper(data); | 281 | return ____call_usermodehelper(data); |
| 259 | } | 282 | } |
| 260 | 283 | ||
| 261 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
| 262 | { | ||
| 263 | if (info->cleanup) | ||
| 264 | (*info->cleanup)(info); | ||
| 265 | kfree(info); | ||
| 266 | } | ||
| 267 | |||
| 268 | static void umh_complete(struct subprocess_info *sub_info) | ||
| 269 | { | ||
| 270 | struct completion *comp = xchg(&sub_info->complete, NULL); | ||
| 271 | /* | ||
| 272 | * See call_usermodehelper_exec(). If xchg() returns NULL | ||
| 273 | * we own sub_info, the UMH_KILLABLE caller has gone away. | ||
| 274 | */ | ||
| 275 | if (comp) | ||
| 276 | complete(comp); | ||
| 277 | else | ||
| 278 | call_usermodehelper_freeinfo(sub_info); | ||
| 279 | } | ||
| 280 | |||
| 281 | /* Keventd can't block, but this (a child) can. */ | 284 | /* Keventd can't block, but this (a child) can. */ |
| 282 | static int wait_for_helper(void *data) | 285 | static int wait_for_helper(void *data) |
| 283 | { | 286 | { |
| @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) | |||
| 336 | kmod_thread_locker = NULL; | 339 | kmod_thread_locker = NULL; |
| 337 | } | 340 | } |
| 338 | 341 | ||
| 339 | switch (wait) { | 342 | if (pid < 0) { |
| 340 | case UMH_NO_WAIT: | 343 | sub_info->retval = pid; |
| 341 | call_usermodehelper_freeinfo(sub_info); | ||
| 342 | break; | ||
| 343 | |||
| 344 | case UMH_WAIT_PROC: | ||
| 345 | if (pid > 0) | ||
| 346 | break; | ||
| 347 | /* FALLTHROUGH */ | ||
| 348 | case UMH_WAIT_EXEC: | ||
| 349 | if (pid < 0) | ||
| 350 | sub_info->retval = pid; | ||
| 351 | umh_complete(sub_info); | 344 | umh_complete(sub_info); |
| 352 | } | 345 | } |
| 353 | } | 346 | } |
| @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
| 588 | goto out; | 581 | goto out; |
| 589 | } | 582 | } |
| 590 | 583 | ||
| 591 | sub_info->complete = &done; | 584 | /* |
| 585 | * Set the completion pointer only if there is a waiter. | ||
| 586 | * This makes it possible to use umh_complete to free | ||
| 587 | * the data structure in case of UMH_NO_WAIT. | ||
| 588 | */ | ||
| 589 | sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; | ||
| 592 | sub_info->wait = wait; | 590 | sub_info->wait = wait; |
| 593 | 591 | ||
| 594 | queue_work(khelper_wq, &sub_info->work); | 592 | queue_work(khelper_wq, &sub_info->work); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..3995f546d0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | |||
| 1778 | unsigned long hash, flags = 0; | 1778 | unsigned long hash, flags = 0; |
| 1779 | struct kretprobe_instance *ri; | 1779 | struct kretprobe_instance *ri; |
| 1780 | 1780 | ||
| 1781 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 1781 | /* |
| 1782 | * To avoid deadlocks, prohibit return probing in NMI contexts, | ||
| 1783 | * just skip the probe and increase the (inexact) 'nmissed' | ||
| 1784 | * statistical counter, so that the user is informed that | ||
| 1785 | * something happened: | ||
| 1786 | */ | ||
| 1787 | if (unlikely(in_nmi())) { | ||
| 1788 | rp->nmissed++; | ||
| 1789 | return 0; | ||
| 1790 | } | ||
| 1791 | |||
| 1792 | /* TODO: consider to only swap the RA after the last pre_handler fired */ | ||
| 1782 | hash = hash_ptr(current, KPROBE_HASH_BITS); | 1793 | hash = hash_ptr(current, KPROBE_HASH_BITS); |
| 1783 | raw_spin_lock_irqsave(&rp->lock, flags); | 1794 | raw_spin_lock_irqsave(&rp->lock, flags); |
| 1784 | if (!hlist_empty(&rp->free_instances)) { | 1795 | if (!hlist_empty(&rp->free_instances)) { |
diff --git a/kernel/kthread.c b/kernel/kthread.c index ef483220e855..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
| 369 | { | 369 | { |
| 370 | struct task_struct *p; | 370 | struct task_struct *p; |
| 371 | 371 | ||
| 372 | p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, | 372 | p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, |
| 373 | cpu); | 373 | cpu); |
| 374 | if (IS_ERR(p)) | 374 | if (IS_ERR(p)) |
| 375 | return p; | 375 | return p; |
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0955b885d0dc..ec8cce259779 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
| @@ -20,30 +20,20 @@ | |||
| 20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> |
| 21 | * Based on kernel/rcu/torture.c. | 21 | * Based on kernel/rcu/torture.c. |
| 22 | */ | 22 | */ |
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| 25 | #include <linux/init.h> | ||
| 26 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 27 | #include <linux/kthread.h> | 25 | #include <linux/kthread.h> |
| 28 | #include <linux/err.h> | ||
| 29 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
| 27 | #include <linux/rwlock.h> | ||
| 28 | #include <linux/mutex.h> | ||
| 29 | #include <linux/rwsem.h> | ||
| 30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
| 31 | #include <linux/interrupt.h> | 31 | #include <linux/interrupt.h> |
| 32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
| 33 | #include <linux/atomic.h> | 33 | #include <linux/atomic.h> |
| 34 | #include <linux/bitops.h> | ||
| 35 | #include <linux/completion.h> | ||
| 36 | #include <linux/moduleparam.h> | 34 | #include <linux/moduleparam.h> |
| 37 | #include <linux/percpu.h> | ||
| 38 | #include <linux/notifier.h> | ||
| 39 | #include <linux/reboot.h> | ||
| 40 | #include <linux/freezer.h> | ||
| 41 | #include <linux/cpu.h> | ||
| 42 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
| 43 | #include <linux/stat.h> | ||
| 44 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
| 45 | #include <linux/trace_clock.h> | ||
| 46 | #include <asm/byteorder.h> | ||
| 47 | #include <linux/torture.h> | 37 | #include <linux/torture.h> |
| 48 | 38 | ||
| 49 | MODULE_LICENSE("GPL"); | 39 | MODULE_LICENSE("GPL"); |
| @@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); | |||
| 51 | 41 | ||
| 52 | torture_param(int, nwriters_stress, -1, | 42 | torture_param(int, nwriters_stress, -1, |
| 53 | "Number of write-locking stress-test threads"); | 43 | "Number of write-locking stress-test threads"); |
| 44 | torture_param(int, nreaders_stress, -1, | ||
| 45 | "Number of read-locking stress-test threads"); | ||
| 54 | torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); | 46 | torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); |
| 55 | torture_param(int, onoff_interval, 0, | 47 | torture_param(int, onoff_interval, 0, |
| 56 | "Time between CPU hotplugs (s), 0=disable"); | 48 | "Time between CPU hotplugs (s), 0=disable"); |
| @@ -66,30 +58,28 @@ torture_param(bool, verbose, true, | |||
| 66 | static char *torture_type = "spin_lock"; | 58 | static char *torture_type = "spin_lock"; |
| 67 | module_param(torture_type, charp, 0444); | 59 | module_param(torture_type, charp, 0444); |
| 68 | MODULE_PARM_DESC(torture_type, | 60 | MODULE_PARM_DESC(torture_type, |
| 69 | "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); | 61 | "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); |
| 70 | |||
| 71 | static atomic_t n_lock_torture_errors; | ||
| 72 | 62 | ||
| 73 | static struct task_struct *stats_task; | 63 | static struct task_struct *stats_task; |
| 74 | static struct task_struct **writer_tasks; | 64 | static struct task_struct **writer_tasks; |
| 65 | static struct task_struct **reader_tasks; | ||
| 75 | 66 | ||
| 76 | static int nrealwriters_stress; | ||
| 77 | static bool lock_is_write_held; | 67 | static bool lock_is_write_held; |
| 68 | static bool lock_is_read_held; | ||
| 78 | 69 | ||
| 79 | struct lock_writer_stress_stats { | 70 | struct lock_stress_stats { |
| 80 | long n_write_lock_fail; | 71 | long n_lock_fail; |
| 81 | long n_write_lock_acquired; | 72 | long n_lock_acquired; |
| 82 | }; | 73 | }; |
| 83 | static struct lock_writer_stress_stats *lwsa; | ||
| 84 | 74 | ||
| 85 | #if defined(MODULE) | 75 | #if defined(MODULE) |
| 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 | 76 | #define LOCKTORTURE_RUNNABLE_INIT 1 |
| 87 | #else | 77 | #else |
| 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 | 78 | #define LOCKTORTURE_RUNNABLE_INIT 0 |
| 89 | #endif | 79 | #endif |
| 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; | 80 | int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; |
| 91 | module_param(locktorture_runnable, int, 0444); | 81 | module_param(torture_runnable, int, 0444); |
| 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); | 82 | MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); |
| 93 | 83 | ||
| 94 | /* Forward reference. */ | 84 | /* Forward reference. */ |
| 95 | static void lock_torture_cleanup(void); | 85 | static void lock_torture_cleanup(void); |
| @@ -102,12 +92,25 @@ struct lock_torture_ops { | |||
| 102 | int (*writelock)(void); | 92 | int (*writelock)(void); |
| 103 | void (*write_delay)(struct torture_random_state *trsp); | 93 | void (*write_delay)(struct torture_random_state *trsp); |
| 104 | void (*writeunlock)(void); | 94 | void (*writeunlock)(void); |
| 95 | int (*readlock)(void); | ||
| 96 | void (*read_delay)(struct torture_random_state *trsp); | ||
| 97 | void (*readunlock)(void); | ||
| 105 | unsigned long flags; | 98 | unsigned long flags; |
| 106 | const char *name; | 99 | const char *name; |
| 107 | }; | 100 | }; |
| 108 | 101 | ||
| 109 | static struct lock_torture_ops *cur_ops; | 102 | struct lock_torture_cxt { |
| 110 | 103 | int nrealwriters_stress; | |
| 104 | int nrealreaders_stress; | ||
| 105 | bool debug_lock; | ||
| 106 | atomic_t n_lock_torture_errors; | ||
| 107 | struct lock_torture_ops *cur_ops; | ||
| 108 | struct lock_stress_stats *lwsa; /* writer statistics */ | ||
| 109 | struct lock_stress_stats *lrsa; /* reader statistics */ | ||
| 110 | }; | ||
| 111 | static struct lock_torture_cxt cxt = { 0, 0, false, | ||
| 112 | ATOMIC_INIT(0), | ||
| 113 | NULL, NULL}; | ||
| 111 | /* | 114 | /* |
| 112 | * Definitions for lock torture testing. | 115 | * Definitions for lock torture testing. |
| 113 | */ | 116 | */ |
| @@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) | |||
| 123 | 126 | ||
| 124 | /* We want a long delay occasionally to force massive contention. */ | 127 | /* We want a long delay occasionally to force massive contention. */ |
| 125 | if (!(torture_random(trsp) % | 128 | if (!(torture_random(trsp) % |
| 126 | (nrealwriters_stress * 2000 * longdelay_us))) | 129 | (cxt.nrealwriters_stress * 2000 * longdelay_us))) |
| 127 | mdelay(longdelay_us); | 130 | mdelay(longdelay_us); |
| 128 | #ifdef CONFIG_PREEMPT | 131 | #ifdef CONFIG_PREEMPT |
| 129 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | 132 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) |
| 130 | preempt_schedule(); /* Allow test to be preempted. */ | 133 | preempt_schedule(); /* Allow test to be preempted. */ |
| 131 | #endif | 134 | #endif |
| 132 | } | 135 | } |
| @@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = { | |||
| 140 | .writelock = torture_lock_busted_write_lock, | 143 | .writelock = torture_lock_busted_write_lock, |
| 141 | .write_delay = torture_lock_busted_write_delay, | 144 | .write_delay = torture_lock_busted_write_delay, |
| 142 | .writeunlock = torture_lock_busted_write_unlock, | 145 | .writeunlock = torture_lock_busted_write_unlock, |
| 146 | .readlock = NULL, | ||
| 147 | .read_delay = NULL, | ||
| 148 | .readunlock = NULL, | ||
| 143 | .name = "lock_busted" | 149 | .name = "lock_busted" |
| 144 | }; | 150 | }; |
| 145 | 151 | ||
| @@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) | |||
| 160 | * we want a long delay occasionally to force massive contention. | 166 | * we want a long delay occasionally to force massive contention. |
| 161 | */ | 167 | */ |
| 162 | if (!(torture_random(trsp) % | 168 | if (!(torture_random(trsp) % |
| 163 | (nrealwriters_stress * 2000 * longdelay_us))) | 169 | (cxt.nrealwriters_stress * 2000 * longdelay_us))) |
| 164 | mdelay(longdelay_us); | 170 | mdelay(longdelay_us); |
| 165 | if (!(torture_random(trsp) % | 171 | if (!(torture_random(trsp) % |
| 166 | (nrealwriters_stress * 2 * shortdelay_us))) | 172 | (cxt.nrealwriters_stress * 2 * shortdelay_us))) |
| 167 | udelay(shortdelay_us); | 173 | udelay(shortdelay_us); |
| 168 | #ifdef CONFIG_PREEMPT | 174 | #ifdef CONFIG_PREEMPT |
| 169 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | 175 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) |
| 170 | preempt_schedule(); /* Allow test to be preempted. */ | 176 | preempt_schedule(); /* Allow test to be preempted. */ |
| 171 | #endif | 177 | #endif |
| 172 | } | 178 | } |
| @@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = { | |||
| 180 | .writelock = torture_spin_lock_write_lock, | 186 | .writelock = torture_spin_lock_write_lock, |
| 181 | .write_delay = torture_spin_lock_write_delay, | 187 | .write_delay = torture_spin_lock_write_delay, |
| 182 | .writeunlock = torture_spin_lock_write_unlock, | 188 | .writeunlock = torture_spin_lock_write_unlock, |
| 189 | .readlock = NULL, | ||
| 190 | .read_delay = NULL, | ||
| 191 | .readunlock = NULL, | ||
| 183 | .name = "spin_lock" | 192 | .name = "spin_lock" |
| 184 | }; | 193 | }; |
| 185 | 194 | ||
| 186 | static int torture_spin_lock_write_lock_irq(void) | 195 | static int torture_spin_lock_write_lock_irq(void) |
| 187 | __acquires(torture_spinlock_irq) | 196 | __acquires(torture_spinlock) |
| 188 | { | 197 | { |
| 189 | unsigned long flags; | 198 | unsigned long flags; |
| 190 | 199 | ||
| 191 | spin_lock_irqsave(&torture_spinlock, flags); | 200 | spin_lock_irqsave(&torture_spinlock, flags); |
| 192 | cur_ops->flags = flags; | 201 | cxt.cur_ops->flags = flags; |
| 193 | return 0; | 202 | return 0; |
| 194 | } | 203 | } |
| 195 | 204 | ||
| 196 | static void torture_lock_spin_write_unlock_irq(void) | 205 | static void torture_lock_spin_write_unlock_irq(void) |
| 197 | __releases(torture_spinlock) | 206 | __releases(torture_spinlock) |
| 198 | { | 207 | { |
| 199 | spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); | 208 | spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); |
| 200 | } | 209 | } |
| 201 | 210 | ||
| 202 | static struct lock_torture_ops spin_lock_irq_ops = { | 211 | static struct lock_torture_ops spin_lock_irq_ops = { |
| 203 | .writelock = torture_spin_lock_write_lock_irq, | 212 | .writelock = torture_spin_lock_write_lock_irq, |
| 204 | .write_delay = torture_spin_lock_write_delay, | 213 | .write_delay = torture_spin_lock_write_delay, |
| 205 | .writeunlock = torture_lock_spin_write_unlock_irq, | 214 | .writeunlock = torture_lock_spin_write_unlock_irq, |
| 215 | .readlock = NULL, | ||
| 216 | .read_delay = NULL, | ||
| 217 | .readunlock = NULL, | ||
| 206 | .name = "spin_lock_irq" | 218 | .name = "spin_lock_irq" |
| 207 | }; | 219 | }; |
| 208 | 220 | ||
| 221 | static DEFINE_RWLOCK(torture_rwlock); | ||
| 222 | |||
| 223 | static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) | ||
| 224 | { | ||
| 225 | write_lock(&torture_rwlock); | ||
| 226 | return 0; | ||
| 227 | } | ||
| 228 | |||
| 229 | static void torture_rwlock_write_delay(struct torture_random_state *trsp) | ||
| 230 | { | ||
| 231 | const unsigned long shortdelay_us = 2; | ||
| 232 | const unsigned long longdelay_ms = 100; | ||
| 233 | |||
| 234 | /* We want a short delay mostly to emulate likely code, and | ||
| 235 | * we want a long delay occasionally to force massive contention. | ||
| 236 | */ | ||
| 237 | if (!(torture_random(trsp) % | ||
| 238 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 239 | mdelay(longdelay_ms); | ||
| 240 | else | ||
| 241 | udelay(shortdelay_us); | ||
| 242 | } | ||
| 243 | |||
| 244 | static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) | ||
| 245 | { | ||
| 246 | write_unlock(&torture_rwlock); | ||
| 247 | } | ||
| 248 | |||
| 249 | static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) | ||
| 250 | { | ||
| 251 | read_lock(&torture_rwlock); | ||
| 252 | return 0; | ||
| 253 | } | ||
| 254 | |||
| 255 | static void torture_rwlock_read_delay(struct torture_random_state *trsp) | ||
| 256 | { | ||
| 257 | const unsigned long shortdelay_us = 10; | ||
| 258 | const unsigned long longdelay_ms = 100; | ||
| 259 | |||
| 260 | /* We want a short delay mostly to emulate likely code, and | ||
| 261 | * we want a long delay occasionally to force massive contention. | ||
| 262 | */ | ||
| 263 | if (!(torture_random(trsp) % | ||
| 264 | (cxt.nrealreaders_stress * 2000 * longdelay_ms))) | ||
| 265 | mdelay(longdelay_ms); | ||
| 266 | else | ||
| 267 | udelay(shortdelay_us); | ||
| 268 | } | ||
| 269 | |||
| 270 | static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) | ||
| 271 | { | ||
| 272 | read_unlock(&torture_rwlock); | ||
| 273 | } | ||
| 274 | |||
| 275 | static struct lock_torture_ops rw_lock_ops = { | ||
| 276 | .writelock = torture_rwlock_write_lock, | ||
| 277 | .write_delay = torture_rwlock_write_delay, | ||
| 278 | .writeunlock = torture_rwlock_write_unlock, | ||
| 279 | .readlock = torture_rwlock_read_lock, | ||
| 280 | .read_delay = torture_rwlock_read_delay, | ||
| 281 | .readunlock = torture_rwlock_read_unlock, | ||
| 282 | .name = "rw_lock" | ||
| 283 | }; | ||
| 284 | |||
| 285 | static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) | ||
| 286 | { | ||
| 287 | unsigned long flags; | ||
| 288 | |||
| 289 | write_lock_irqsave(&torture_rwlock, flags); | ||
| 290 | cxt.cur_ops->flags = flags; | ||
| 291 | return 0; | ||
| 292 | } | ||
| 293 | |||
| 294 | static void torture_rwlock_write_unlock_irq(void) | ||
| 295 | __releases(torture_rwlock) | ||
| 296 | { | ||
| 297 | write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); | ||
| 298 | } | ||
| 299 | |||
| 300 | static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) | ||
| 301 | { | ||
| 302 | unsigned long flags; | ||
| 303 | |||
| 304 | read_lock_irqsave(&torture_rwlock, flags); | ||
| 305 | cxt.cur_ops->flags = flags; | ||
| 306 | return 0; | ||
| 307 | } | ||
| 308 | |||
| 309 | static void torture_rwlock_read_unlock_irq(void) | ||
| 310 | __releases(torture_rwlock) | ||
| 311 | { | ||
| 312 | read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); | ||
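(Note that the read-side unlock must use the read_unlock variant; pairing read_lock_irqsave() with a write-side unlock would corrupt the rwlock state.)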
| 313 | } | ||
| 314 | |||
| 315 | static struct lock_torture_ops rw_lock_irq_ops = { | ||
| 316 | .writelock = torture_rwlock_write_lock_irq, | ||
| 317 | .write_delay = torture_rwlock_write_delay, | ||
| 318 | .writeunlock = torture_rwlock_write_unlock_irq, | ||
| 319 | .readlock = torture_rwlock_read_lock_irq, | ||
| 320 | .read_delay = torture_rwlock_read_delay, | ||
| 321 | .readunlock = torture_rwlock_read_unlock_irq, | ||
| 322 | .name = "rw_lock_irq" | ||
| 323 | }; | ||
| 324 | |||
| 325 | static DEFINE_MUTEX(torture_mutex); | ||
| 326 | |||
| 327 | static int torture_mutex_lock(void) __acquires(torture_mutex) | ||
| 328 | { | ||
| 329 | mutex_lock(&torture_mutex); | ||
| 330 | return 0; | ||
| 331 | } | ||
| 332 | |||
| 333 | static void torture_mutex_delay(struct torture_random_state *trsp) | ||
| 334 | { | ||
| 335 | const unsigned long longdelay_ms = 100; | ||
| 336 | |||
| 337 | /* We want a long delay occasionally to force massive contention. */ | ||
| 338 | if (!(torture_random(trsp) % | ||
| 339 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 340 | mdelay(longdelay_ms * 5); | ||
| 341 | else | ||
| 342 | mdelay(longdelay_ms / 5); | ||
| 343 | #ifdef CONFIG_PREEMPT | ||
| 344 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) | ||
| 345 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 346 | #endif | ||
| 347 | } | ||
| 348 | |||
| 349 | static void torture_mutex_unlock(void) __releases(torture_mutex) | ||
| 350 | { | ||
| 351 | mutex_unlock(&torture_mutex); | ||
| 352 | } | ||
| 353 | |||
| 354 | static struct lock_torture_ops mutex_lock_ops = { | ||
| 355 | .writelock = torture_mutex_lock, | ||
| 356 | .write_delay = torture_mutex_delay, | ||
| 357 | .writeunlock = torture_mutex_unlock, | ||
| 358 | .readlock = NULL, | ||
| 359 | .read_delay = NULL, | ||
| 360 | .readunlock = NULL, | ||
| 361 | .name = "mutex_lock" | ||
| 362 | }; | ||
| 363 | |||
| 364 | static DECLARE_RWSEM(torture_rwsem); | ||
| 365 | static int torture_rwsem_down_write(void) __acquires(torture_rwsem) | ||
| 366 | { | ||
| 367 | down_write(&torture_rwsem); | ||
| 368 | return 0; | ||
| 369 | } | ||
| 370 | |||
| 371 | static void torture_rwsem_write_delay(struct torture_random_state *trsp) | ||
| 372 | { | ||
| 373 | const unsigned long longdelay_ms = 100; | ||
| 374 | |||
| 375 | /* We want a long delay occasionally to force massive contention. */ | ||
| 376 | if (!(torture_random(trsp) % | ||
| 377 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 378 | mdelay(longdelay_ms * 10); | ||
| 379 | else | ||
| 380 | mdelay(longdelay_ms / 10); | ||
| 381 | #ifdef CONFIG_PREEMPT | ||
| 382 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) | ||
| 383 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 384 | #endif | ||
| 385 | } | ||
| 386 | |||
| 387 | static void torture_rwsem_up_write(void) __releases(torture_rwsem) | ||
| 388 | { | ||
| 389 | up_write(&torture_rwsem); | ||
| 390 | } | ||
| 391 | |||
| 392 | static int torture_rwsem_down_read(void) __acquires(torture_rwsem) | ||
| 393 | { | ||
| 394 | down_read(&torture_rwsem); | ||
| 395 | return 0; | ||
| 396 | } | ||
| 397 | |||
| 398 | static void torture_rwsem_read_delay(struct torture_random_state *trsp) | ||
| 399 | { | ||
| 400 | const unsigned long longdelay_ms = 100; | ||
| 401 | |||
| 402 | /* We want a long delay occasionally to force massive contention. */ | ||
| 403 | if (!(torture_random(trsp) % | ||
| 404 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 405 | mdelay(longdelay_ms * 2); | ||
| 406 | else | ||
| 407 | mdelay(longdelay_ms / 2); | ||
| 408 | #ifdef CONFIG_PREEMPT | ||
| 409 | if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) | ||
| 410 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 411 | #endif | ||
| 412 | } | ||
| 413 | |||
| 414 | static void torture_rwsem_up_read(void) __releases(torture_rwsem) | ||
| 415 | { | ||
| 416 | up_read(&torture_rwsem); | ||
| 417 | } | ||
| 418 | |||
| 419 | static struct lock_torture_ops rwsem_lock_ops = { | ||
| 420 | .writelock = torture_rwsem_down_write, | ||
| 421 | .write_delay = torture_rwsem_write_delay, | ||
| 422 | .writeunlock = torture_rwsem_up_write, | ||
| 423 | .readlock = torture_rwsem_down_read, | ||
| 424 | .read_delay = torture_rwsem_read_delay, | ||
| 425 | .readunlock = torture_rwsem_up_read, | ||
| 426 | .name = "rwsem_lock" | ||
| 427 | }; | ||
| 428 | |||
| 209 | /* | 429 | /* |
| 210 | * Lock torture writer kthread. Repeatedly acquires and releases | 430 | * Lock torture writer kthread. Repeatedly acquires and releases |
| 211 | * the lock, checking for duplicate acquisitions. | 431 | * the lock, checking for duplicate acquisitions. |
| 212 | */ | 432 | */ |
| 213 | static int lock_torture_writer(void *arg) | 433 | static int lock_torture_writer(void *arg) |
| 214 | { | 434 | { |
| 215 | struct lock_writer_stress_stats *lwsp = arg; | 435 | struct lock_stress_stats *lwsp = arg; |
| 216 | static DEFINE_TORTURE_RANDOM(rand); | 436 | static DEFINE_TORTURE_RANDOM(rand); |
| 217 | 437 | ||
| 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); | 438 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); |
| @@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg) | |||
| 221 | do { | 441 | do { |
| 222 | if ((torture_random(&rand) & 0xfffff) == 0) | 442 | if ((torture_random(&rand) & 0xfffff) == 0) |
| 223 | schedule_timeout_uninterruptible(1); | 443 | schedule_timeout_uninterruptible(1); |
| 224 | cur_ops->writelock(); | 444 | |
| 445 | cxt.cur_ops->writelock(); | ||
| 225 | if (WARN_ON_ONCE(lock_is_write_held)) | 446 | if (WARN_ON_ONCE(lock_is_write_held)) |
| 226 | lwsp->n_write_lock_fail++; | 447 | lwsp->n_lock_fail++; |
| 227 | lock_is_write_held = 1; | 448 | lock_is_write_held = 1; |
| 228 | lwsp->n_write_lock_acquired++; | 449 | if (WARN_ON_ONCE(lock_is_read_held)) |
| 229 | cur_ops->write_delay(&rand); | 450 | lwsp->n_lock_fail++; /* rare, but... */ |
| 451 | |||
| 452 | lwsp->n_lock_acquired++; | ||
| 453 | cxt.cur_ops->write_delay(&rand); | ||
| 230 | lock_is_write_held = 0; | 454 | lock_is_write_held = 0; |
| 231 | cur_ops->writeunlock(); | 455 | cxt.cur_ops->writeunlock(); |
| 456 | |||
| 232 | stutter_wait("lock_torture_writer"); | 457 | stutter_wait("lock_torture_writer"); |
| 233 | } while (!torture_must_stop()); | 458 | } while (!torture_must_stop()); |
| 234 | torture_kthread_stopping("lock_torture_writer"); | 459 | torture_kthread_stopping("lock_torture_writer"); |
| @@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg) | |||
| 236 | } | 461 | } |
| 237 | 462 | ||
| 238 | /* | 463 | /* |
| 464 | * Lock torture reader kthread. Repeatedly acquires and releases | ||
| 465 | * the reader lock. | ||
| 466 | */ | ||
| 467 | static int lock_torture_reader(void *arg) | ||
| 468 | { | ||
| 469 | struct lock_stress_stats *lrsp = arg; | ||
| 470 | static DEFINE_TORTURE_RANDOM(rand); | ||
| 471 | |||
| 472 | VERBOSE_TOROUT_STRING("lock_torture_reader task started"); | ||
| 473 | set_user_nice(current, MAX_NICE); | ||
| 474 | |||
| 475 | do { | ||
| 476 | if ((torture_random(&rand) & 0xfffff) == 0) | ||
| 477 | schedule_timeout_uninterruptible(1); | ||
| 478 | |||
| 479 | cxt.cur_ops->readlock(); | ||
| 480 | lock_is_read_held = 1; | ||
| 481 | if (WARN_ON_ONCE(lock_is_write_held)) | ||
| 482 | lrsp->n_lock_fail++; /* rare, but... */ | ||
| 483 | |||
| 484 | lrsp->n_lock_acquired++; | ||
| 485 | cxt.cur_ops->read_delay(&rand); | ||
| 486 | lock_is_read_held = 0; | ||
| 487 | cxt.cur_ops->readunlock(); | ||
| 488 | |||
| 489 | stutter_wait("lock_torture_reader"); | ||
| 490 | } while (!torture_must_stop()); | ||
| 491 | torture_kthread_stopping("lock_torture_reader"); | ||
| 492 | return 0; | ||
| 493 | } | ||
| 494 | |||
| 495 | /* | ||
| 239 | * Create a lock-torture-statistics message in the specified buffer. | 496 |
| 240 | */ | 497 | */ |
| 241 | static void lock_torture_printk(char *page) | 498 | static void __torture_print_stats(char *page, |
| 499 | struct lock_stress_stats *statp, bool write) | ||
| 242 | { | 500 | { |
| 243 | bool fail = 0; | 501 | bool fail = 0; |
| 244 | int i; | 502 | int i, n_stress; |
| 245 | long max = 0; | 503 | long max = 0; |
| 246 | long min = lwsa[0].n_write_lock_acquired; | 504 | long min = statp[0].n_lock_acquired; |
| 247 | long long sum = 0; | 505 | long long sum = 0; |
| 248 | 506 | ||
| 249 | for (i = 0; i < nrealwriters_stress; i++) { | 507 | n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; |
| 250 | if (lwsa[i].n_write_lock_fail) | 508 | for (i = 0; i < n_stress; i++) { |
| 509 | if (statp[i].n_lock_fail) | ||
| 251 | fail = true; | 510 | fail = true; |
| 252 | sum += lwsa[i].n_write_lock_acquired; | 511 | sum += statp[i].n_lock_acquired; |
| 253 | if (max < lwsa[i].n_write_lock_acquired) | 512 | if (max < statp[i].n_lock_acquired) |
| 254 | max = lwsa[i].n_write_lock_acquired; | 513 | max = statp[i].n_lock_acquired; |
| 255 | if (min > lwsa[i].n_write_lock_acquired) | 514 | if (min > statp[i].n_lock_acquired) |
| 256 | min = lwsa[i].n_write_lock_acquired; | 515 | min = statp[i].n_lock_acquired; |
| 257 | } | 516 | } |
| 258 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); | ||
| 259 | page += sprintf(page, | 517 | page += sprintf(page, |
| 260 | "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", | 518 | "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", |
| 519 | write ? "Writes" : "Reads ", | ||
| 261 | sum, max, min, max / 2 > min ? "???" : "", | 520 | sum, max, min, max / 2 > min ? "???" : "", |
| 262 | fail, fail ? "!!!" : ""); | 521 | fail, fail ? "!!!" : ""); |
| 263 | if (fail) | 522 | if (fail) |
| 264 | atomic_inc(&n_lock_torture_errors); | 523 | atomic_inc(&cxt.n_lock_torture_errors); |
| 265 | } | 524 | } |
| 266 | 525 | ||
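To make the format string above concrete, a run might log a line such as (values invented for illustration): "Writes:  Total: 482133  Max/Min: 62110/58012  Fail: 0". The "???" marker is appended whenever max / 2 > min across the per-thread statistics, any non-zero per-thread failure count appends "!!!", and the latter also increments cxt.n_lock_torture_errors, which later selects the "End of test: FAILURE" banner in lock_torture_cleanup().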
| 267 | /* | 526 | /* |
| @@ -274,18 +533,35 @@ static void lock_torture_printk(char *page) | |||
| 274 | */ | 533 | */ |
| 275 | static void lock_torture_stats_print(void) | 534 | static void lock_torture_stats_print(void) |
| 276 | { | 535 | { |
| 277 | int size = nrealwriters_stress * 200 + 8192; | 536 | int size = cxt.nrealwriters_stress * 200 + 8192; |
| 278 | char *buf; | 537 | char *buf; |
| 279 | 538 | ||
| 539 | if (cxt.cur_ops->readlock) | ||
| 540 | size += cxt.nrealreaders_stress * 200 + 8192; | ||
| 541 | |||
| 280 | buf = kmalloc(size, GFP_KERNEL); | 542 | buf = kmalloc(size, GFP_KERNEL); |
| 281 | if (!buf) { | 543 | if (!buf) { |
| 282 | pr_err("lock_torture_stats_print: Out of memory, need: %d", | 544 | pr_err("lock_torture_stats_print: Out of memory, need: %d", |
| 283 | size); | 545 | size); |
| 284 | return; | 546 | return; |
| 285 | } | 547 | } |
| 286 | lock_torture_printk(buf); | 548 | |
| 549 | __torture_print_stats(buf, cxt.lwsa, true); | ||
| 287 | pr_alert("%s", buf); | 550 | pr_alert("%s", buf); |
| 288 | kfree(buf); | 551 | kfree(buf); |
| 552 | |||
| 553 | if (cxt.cur_ops->readlock) { | ||
| 554 | buf = kmalloc(size, GFP_KERNEL); | ||
| 555 | if (!buf) { | ||
| 556 | pr_err("lock_torture_stats_print: Out of memory, need: %d", | ||
| 557 | size); | ||
| 558 | return; | ||
| 559 | } | ||
| 560 | |||
| 561 | __torture_print_stats(buf, cxt.lrsa, false); | ||
| 562 | pr_alert("%s", buf); | ||
| 563 | kfree(buf); | ||
| 564 | } | ||
| 289 | } | 565 | } |
| 290 | 566 | ||
| 291 | /* | 567 | /* |
| @@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, | |||
| 312 | const char *tag) | 588 | const char *tag) |
| 313 | { | 589 | { |
| 314 | pr_alert("%s" TORTURE_FLAG | 590 | pr_alert("%s" TORTURE_FLAG |
| 315 | "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", | 591 | "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", |
| 316 | torture_type, tag, nrealwriters_stress, stat_interval, verbose, | 592 | torture_type, tag, cxt.debug_lock ? " [debug]": "", |
| 317 | shuffle_interval, stutter, shutdown_secs, | 593 | cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, |
| 594 | verbose, shuffle_interval, stutter, shutdown_secs, | ||
| 318 | onoff_interval, onoff_holdoff); | 595 | onoff_interval, onoff_holdoff); |
| 319 | } | 596 | } |
| 320 | 597 | ||
| @@ -322,46 +599,59 @@ static void lock_torture_cleanup(void) | |||
| 322 | { | 599 | { |
| 323 | int i; | 600 | int i; |
| 324 | 601 | ||
| 325 | if (torture_cleanup()) | 602 | if (torture_cleanup_begin()) |
| 326 | return; | 603 | return; |
| 327 | 604 | ||
| 328 | if (writer_tasks) { | 605 | if (writer_tasks) { |
| 329 | for (i = 0; i < nrealwriters_stress; i++) | 606 | for (i = 0; i < cxt.nrealwriters_stress; i++) |
| 330 | torture_stop_kthread(lock_torture_writer, | 607 | torture_stop_kthread(lock_torture_writer, |
| 331 | writer_tasks[i]); | 608 | writer_tasks[i]); |
| 332 | kfree(writer_tasks); | 609 | kfree(writer_tasks); |
| 333 | writer_tasks = NULL; | 610 | writer_tasks = NULL; |
| 334 | } | 611 | } |
| 335 | 612 | ||
| 613 | if (reader_tasks) { | ||
| 614 | for (i = 0; i < cxt.nrealreaders_stress; i++) | ||
| 615 | torture_stop_kthread(lock_torture_reader, | ||
| 616 | reader_tasks[i]); | ||
| 617 | kfree(reader_tasks); | ||
| 618 | reader_tasks = NULL; | ||
| 619 | } | ||
| 620 | |||
| 336 | torture_stop_kthread(lock_torture_stats, stats_task); | 621 | torture_stop_kthread(lock_torture_stats, stats_task); |
| 337 | lock_torture_stats_print(); /* -After- the stats thread is stopped! */ | 622 | lock_torture_stats_print(); /* -After- the stats thread is stopped! */ |
| 338 | 623 | ||
| 339 | if (atomic_read(&n_lock_torture_errors)) | 624 | if (atomic_read(&cxt.n_lock_torture_errors)) |
| 340 | lock_torture_print_module_parms(cur_ops, | 625 | lock_torture_print_module_parms(cxt.cur_ops, |
| 341 | "End of test: FAILURE"); | 626 | "End of test: FAILURE"); |
| 342 | else if (torture_onoff_failures()) | 627 | else if (torture_onoff_failures()) |
| 343 | lock_torture_print_module_parms(cur_ops, | 628 | lock_torture_print_module_parms(cxt.cur_ops, |
| 344 | "End of test: LOCK_HOTPLUG"); | 629 | "End of test: LOCK_HOTPLUG"); |
| 345 | else | 630 | else |
| 346 | lock_torture_print_module_parms(cur_ops, | 631 | lock_torture_print_module_parms(cxt.cur_ops, |
| 347 | "End of test: SUCCESS"); | 632 | "End of test: SUCCESS"); |
| 633 | torture_cleanup_end(); | ||
| 348 | } | 634 | } |
| 349 | 635 | ||
| 350 | static int __init lock_torture_init(void) | 636 | static int __init lock_torture_init(void) |
| 351 | { | 637 | { |
| 352 | int i; | 638 | int i, j; |
| 353 | int firsterr = 0; | 639 | int firsterr = 0; |
| 354 | static struct lock_torture_ops *torture_ops[] = { | 640 | static struct lock_torture_ops *torture_ops[] = { |
| 355 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, | 641 | &lock_busted_ops, |
| 642 | &spin_lock_ops, &spin_lock_irq_ops, | ||
| 643 | &rw_lock_ops, &rw_lock_irq_ops, | ||
| 644 | &mutex_lock_ops, | ||
| 645 | &rwsem_lock_ops, | ||
| 356 | }; | 646 | }; |
| 357 | 647 | ||
| 358 | if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) | 648 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) |
| 359 | return -EBUSY; | 649 | return -EBUSY; |
| 360 | 650 | ||
| 361 | /* Process args and tell the world that the torturer is on the job. */ | 651 | /* Process args and tell the world that the torturer is on the job. */ |
| 362 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 652 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
| 363 | cur_ops = torture_ops[i]; | 653 | cxt.cur_ops = torture_ops[i]; |
| 364 | if (strcmp(torture_type, cur_ops->name) == 0) | 654 | if (strcmp(torture_type, cxt.cur_ops->name) == 0) |
| 365 | break; | 655 | break; |
| 366 | } | 656 | } |
| 367 | if (i == ARRAY_SIZE(torture_ops)) { | 657 | if (i == ARRAY_SIZE(torture_ops)) { |
| @@ -374,31 +664,69 @@ static int __init lock_torture_init(void) | |||
| 374 | torture_init_end(); | 664 | torture_init_end(); |
| 375 | return -EINVAL; | 665 | return -EINVAL; |
| 376 | } | 666 | } |
| 377 | if (cur_ops->init) | 667 | if (cxt.cur_ops->init) |
| 378 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 668 | cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
| 379 | 669 | ||
| 380 | if (nwriters_stress >= 0) | 670 | if (nwriters_stress >= 0) |
| 381 | nrealwriters_stress = nwriters_stress; | 671 | cxt.nrealwriters_stress = nwriters_stress; |
| 382 | else | 672 | else |
| 383 | nrealwriters_stress = 2 * num_online_cpus(); | 673 | cxt.nrealwriters_stress = 2 * num_online_cpus(); |
| 384 | lock_torture_print_module_parms(cur_ops, "Start of test"); | 674 | |
| 675 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 676 | if (strncmp(torture_type, "mutex", 5) == 0) | ||
| 677 | cxt.debug_lock = true; | ||
| 678 | #endif | ||
| 679 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
| 680 | if ((strncmp(torture_type, "spin", 4) == 0) || | ||
| 681 | (strncmp(torture_type, "rw_lock", 7) == 0)) | ||
| 682 | cxt.debug_lock = true; | ||
| 683 | #endif | ||
| 385 | 684 | ||
| 386 | /* Initialize the statistics so that each run gets its own numbers. */ | 685 | /* Initialize the statistics so that each run gets its own numbers. */ |
| 387 | 686 | ||
| 388 | lock_is_write_held = 0; | 687 | lock_is_write_held = 0; |
| 389 | lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); | 688 | cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); |
| 390 | if (lwsa == NULL) { | 689 | if (cxt.lwsa == NULL) { |
| 391 | VERBOSE_TOROUT_STRING("lwsa: Out of memory"); | 690 | VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); |
| 392 | firsterr = -ENOMEM; | 691 | firsterr = -ENOMEM; |
| 393 | goto unwind; | 692 | goto unwind; |
| 394 | } | 693 | } |
| 395 | for (i = 0; i < nrealwriters_stress; i++) { | 694 | for (i = 0; i < cxt.nrealwriters_stress; i++) { |
| 396 | lwsa[i].n_write_lock_fail = 0; | 695 | cxt.lwsa[i].n_lock_fail = 0; |
| 397 | lwsa[i].n_write_lock_acquired = 0; | 696 | cxt.lwsa[i].n_lock_acquired = 0; |
| 398 | } | 697 | } |
| 399 | 698 | ||
| 400 | /* Start up the kthreads. */ | 699 | if (cxt.cur_ops->readlock) { |
| 700 | if (nreaders_stress >= 0) | ||
| 701 | cxt.nrealreaders_stress = nreaders_stress; | ||
| 702 | else { | ||
| 703 | /* | ||
| 704 | * By default, distribute readers and writers evenly, while still | ||
| 705 | * running the same total number of threads as the writer-only | ||
| 706 | * locks do by default. | ||
| 707 | */ | ||
| 708 | if (nwriters_stress < 0) /* user doesn't care */ | ||
| 709 | cxt.nrealwriters_stress = num_online_cpus(); | ||
| 710 | cxt.nrealreaders_stress = cxt.nrealwriters_stress; | ||
| 711 | } | ||
| 712 | |||
| 713 | lock_is_read_held = 0; | ||
| 714 | cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); | ||
| 715 | if (cxt.lrsa == NULL) { | ||
| 716 | VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); | ||
| 717 | firsterr = -ENOMEM; | ||
| 718 | kfree(cxt.lwsa); | ||
| 719 | goto unwind; | ||
| 720 | } | ||
| 721 | |||
| 722 | for (i = 0; i < cxt.nrealreaders_stress; i++) { | ||
| 723 | cxt.lrsa[i].n_lock_fail = 0; | ||
| 724 | cxt.lrsa[i].n_lock_acquired = 0; | ||
| 725 | } | ||
| 726 | } | ||
| 727 | lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); | ||
| 401 | 728 | ||
| 729 | /* Prepare torture context. */ | ||
| 402 | if (onoff_interval > 0) { | 730 | if (onoff_interval > 0) { |
| 403 | firsterr = torture_onoff_init(onoff_holdoff * HZ, | 731 | firsterr = torture_onoff_init(onoff_holdoff * HZ, |
| 404 | onoff_interval * HZ); | 732 | onoff_interval * HZ); |
| @@ -422,18 +750,51 @@ static int __init lock_torture_init(void) | |||
| 422 | goto unwind; | 750 | goto unwind; |
| 423 | } | 751 | } |
| 424 | 752 | ||
| 425 | writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), | 753 | writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), |
| 426 | GFP_KERNEL); | 754 | GFP_KERNEL); |
| 427 | if (writer_tasks == NULL) { | 755 | if (writer_tasks == NULL) { |
| 428 | VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); | 756 | VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); |
| 429 | firsterr = -ENOMEM; | 757 | firsterr = -ENOMEM; |
| 430 | goto unwind; | 758 | goto unwind; |
| 431 | } | 759 | } |
| 432 | for (i = 0; i < nrealwriters_stress; i++) { | 760 | |
| 433 | firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], | 761 | if (cxt.cur_ops->readlock) { |
| 762 | reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), | ||
| 763 | GFP_KERNEL); | ||
| 764 | if (reader_tasks == NULL) { | ||
| 765 | VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); | ||
| 766 | firsterr = -ENOMEM; | ||
| 767 | goto unwind; | ||
| 768 | } | ||
| 769 | } | ||
| 770 | |||
| 771 | /* | ||
| 772 | * Create the kthreads and start torturing (oh, those poor little locks). | ||
| 773 | * | ||
| 774 | * TODO: Note that we interleave writers with readers, giving writers a | ||
| 775 | * slight advantage, by creating their kthreads first. This can be modified | ||
| 776 | * for very specific needs, or even let the user choose the policy, if | ||
| 777 | * ever wanted. | ||
| 778 | */ | ||
| 779 | for (i = 0, j = 0; i < cxt.nrealwriters_stress || | ||
| 780 | j < cxt.nrealreaders_stress; i++, j++) { | ||
| 781 | if (i >= cxt.nrealwriters_stress) | ||
| 782 | goto create_reader; | ||
| 783 | |||
| 784 | /* Create writer. */ | ||
| 785 | firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], | ||
| 434 | writer_tasks[i]); | 786 | writer_tasks[i]); |
| 435 | if (firsterr) | 787 | if (firsterr) |
| 436 | goto unwind; | 788 | goto unwind; |
| 789 | |||
| 790 | create_reader: | ||
| 791 | if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) | ||
| 792 | continue; | ||
| 793 | /* Create reader. */ | ||
| 794 | firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], | ||
| 795 | reader_tasks[j]); | ||
| 796 | if (firsterr) | ||
| 797 | goto unwind; | ||
| 437 | } | 798 | } |
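As a worked example of the interleaving above: with two writers and five readers (invented counts), the loop creates W0, R0, W1, R1, and once i reaches nrealwriters_stress every remaining pass jumps straight to the create_reader label for R2, R3 and R4. Each writer is therefore spawned just ahead of the reader with the same index, which is exactly the slight writer advantage the TODO comment mentions.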
| 438 | if (stat_interval > 0) { | 799 | if (stat_interval > 0) { |
| 439 | firsterr = torture_create_kthread(lock_torture_stats, NULL, | 800 | firsterr = torture_create_kthread(lock_torture_stats, NULL, |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 23e89c5930e9..4d60986fcbee 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -56,9 +56,6 @@ do { \ | |||
| 56 | * If the lock has already been acquired, then this will proceed to spin | 56 | * If the lock has already been acquired, then this will proceed to spin |
| 57 | * on this node->locked until the previous lock holder sets the node->locked | 57 | * on this node->locked until the previous lock holder sets the node->locked |
| 58 | * in mcs_spin_unlock(). | 58 | * in mcs_spin_unlock(). |
| 59 | * | ||
| 60 | * We don't inline mcs_spin_lock() so that perf can correctly account for the | ||
| 61 | * time spent in this lock function. | ||
| 62 | */ | 59 | */ |
| 63 | static inline | 60 | static inline |
| 64 | void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 61 | void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ae712b25e492..dadbf88c22c4 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale | 15 | * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale |
| 16 | * and Sven Dietrich. | 16 | * and Sven Dietrich. |
| 17 | * | 17 | * |
| 18 | * Also see Documentation/mutex-design.txt. | 18 | * Also see Documentation/locking/mutex-design.txt. |
| 19 | */ | 19 | */ |
| 20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
| 21 | #include <linux/ww_mutex.h> | 21 | #include <linux/ww_mutex.h> |
| @@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock) | |||
| 106 | EXPORT_SYMBOL(mutex_lock); | 106 | EXPORT_SYMBOL(mutex_lock); |
| 107 | #endif | 107 | #endif |
| 108 | 108 | ||
| 109 | static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | ||
| 110 | struct ww_acquire_ctx *ww_ctx) | ||
| 111 | { | ||
| 112 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 113 | /* | ||
| 114 | * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | ||
| 115 | * but released with a normal mutex_unlock in this call. | ||
| 116 | * | ||
| 117 | * This should never happen, always use ww_mutex_unlock. | ||
| 118 | */ | ||
| 119 | DEBUG_LOCKS_WARN_ON(ww->ctx); | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Not quite done after calling ww_acquire_done() ? | ||
| 123 | */ | ||
| 124 | DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | ||
| 125 | |||
| 126 | if (ww_ctx->contending_lock) { | ||
| 127 | /* | ||
| 128 | * After -EDEADLK you tried to | ||
| 129 | * acquire a different ww_mutex? Bad! | ||
| 130 | */ | ||
| 131 | DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | ||
| 132 | |||
| 133 | /* | ||
| 134 | * You called ww_mutex_lock after receiving -EDEADLK, | ||
| 135 | * but 'forgot' to unlock everything else first? | ||
| 136 | */ | ||
| 137 | DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | ||
| 138 | ww_ctx->contending_lock = NULL; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * Naughty, using a different class will lead to undefined behavior! | ||
| 143 | */ | ||
| 144 | DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | ||
| 145 | #endif | ||
| 146 | ww_ctx->acquired++; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * after acquiring lock with fastpath or when we lost out in contested | ||
| 151 | * slowpath, set ctx and wake up any waiters so they can recheck. | ||
| 152 | * | ||
| 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | ||
| 154 | * as the fastpath and opportunistic spinning are disabled in that case. | ||
| 155 | */ | ||
| 156 | static __always_inline void | ||
| 157 | ww_mutex_set_context_fastpath(struct ww_mutex *lock, | ||
| 158 | struct ww_acquire_ctx *ctx) | ||
| 159 | { | ||
| 160 | unsigned long flags; | ||
| 161 | struct mutex_waiter *cur; | ||
| 162 | |||
| 163 | ww_mutex_lock_acquired(lock, ctx); | ||
| 164 | |||
| 165 | lock->ctx = ctx; | ||
| 166 | |||
| 167 | /* | ||
| 168 | * The lock->ctx update should be visible on all cores before | ||
| 169 | * the atomic read is done, otherwise contended waiters might be | ||
| 170 | * missed. The contended waiters will either see ww_ctx == NULL | ||
| 171 | * and keep spinning, or it will acquire wait_lock, add itself | ||
| 172 | * to waiter list and sleep. | ||
| 173 | */ | ||
| 174 | smp_mb(); /* ^^^ */ | ||
| 175 | |||
| 176 | /* | ||
| 177 | * Check if lock is contended, if not there is nobody to wake up | ||
| 178 | */ | ||
| 179 | if (likely(atomic_read(&lock->base.count) == 0)) | ||
| 180 | return; | ||
| 181 | |||
| 182 | /* | ||
| 183 | * Uh oh, we raced in fastpath, wake up everyone in this case, | ||
| 184 | * so they can see the new lock->ctx. | ||
| 185 | */ | ||
| 186 | spin_lock_mutex(&lock->base.wait_lock, flags); | ||
| 187 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 188 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 189 | wake_up_process(cur->task); | ||
| 190 | } | ||
| 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); | ||
| 192 | } | ||
| 193 | |||
| 194 | |||
| 109 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 195 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 110 | /* | 196 | /* |
| 111 | * In order to avoid a stampede of mutex spinners from acquiring the mutex | 197 | * In order to avoid a stampede of mutex spinners from acquiring the mutex |
| @@ -180,6 +266,129 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 180 | */ | 266 | */ |
| 181 | return retval; | 267 | return retval; |
| 182 | } | 268 | } |
| 269 | |||
| 270 | /* | ||
| 271 | * Atomically try to take the lock when it is available | ||
| 272 | */ | ||
| 273 | static inline bool mutex_try_to_acquire(struct mutex *lock) | ||
| 274 | { | ||
| 275 | return !mutex_is_locked(lock) && | ||
| 276 | (atomic_cmpxchg(&lock->count, 1, 0) == 1); | ||
| 277 | } | ||
| 278 | |||
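mutex_try_to_acquire() relies on the conventional counter encoding of this mutex implementation: 1 means unlocked, 0 means locked with no waiters, and a negative value signals possible waiters. atomic_cmpxchg() returns the value it actually observed, so the comparison against 1 succeeds only when the lock was seen unlocked and flipped to 0 in the same atomic step; if another CPU won the race, the helper returns false and the optimistic-spin loop below simply retries. The cheap mutex_is_locked() test in front filters out most doomed cmpxchg() attempts, the same check-before-cmpxchg idea the rwsem change later in this diff applies.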
| 279 | /* | ||
| 280 | * Optimistic spinning. | ||
| 281 | * | ||
| 282 | * We try to spin for acquisition when we find that the lock owner | ||
| 283 | * is currently running on a (different) CPU and while we don't | ||
| 284 | * need to reschedule. The rationale is that if the lock owner is | ||
| 285 | * running, it is likely to release the lock soon. | ||
| 286 | * | ||
| 287 | * Since this needs the lock owner, and this mutex implementation | ||
| 288 | * doesn't track the owner atomically in the lock field, we need to | ||
| 289 | * track it non-atomically. | ||
| 290 | * | ||
| 291 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock | ||
| 292 | * to serialize everything. | ||
| 293 | * | ||
| 294 | * The mutex spinners are queued up using MCS lock so that only one | ||
| 295 | * spinner can compete for the mutex. However, if mutex spinning isn't | ||
| 296 | * going to happen, there is no point in going through the lock/unlock | ||
| 297 | * overhead. | ||
| 298 | * | ||
| 299 | * Returns true when the lock was taken, otherwise false, indicating | ||
| 300 | * that we need to jump to the slowpath and sleep. | ||
| 301 | */ | ||
| 302 | static bool mutex_optimistic_spin(struct mutex *lock, | ||
| 303 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) | ||
| 304 | { | ||
| 305 | struct task_struct *task = current; | ||
| 306 | |||
| 307 | if (!mutex_can_spin_on_owner(lock)) | ||
| 308 | goto done; | ||
| 309 | |||
| 310 | if (!osq_lock(&lock->osq)) | ||
| 311 | goto done; | ||
| 312 | |||
| 313 | while (true) { | ||
| 314 | struct task_struct *owner; | ||
| 315 | |||
| 316 | if (use_ww_ctx && ww_ctx->acquired > 0) { | ||
| 317 | struct ww_mutex *ww; | ||
| 318 | |||
| 319 | ww = container_of(lock, struct ww_mutex, base); | ||
| 320 | /* | ||
| 321 | * If ww->ctx is set the contents are undefined, only | ||
| 322 | * by acquiring wait_lock there is a guarantee that | ||
| 323 | * they are not invalid when reading. | ||
| 324 | * | ||
| 325 | * As such, when deadlock detection needs to be | ||
| 326 | * performed the optimistic spinning cannot be done. | ||
| 327 | */ | ||
| 328 | if (ACCESS_ONCE(ww->ctx)) | ||
| 329 | break; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * If there's an owner, wait for it to either | ||
| 334 | * release the lock or go to sleep. | ||
| 335 | */ | ||
| 336 | owner = ACCESS_ONCE(lock->owner); | ||
| 337 | if (owner && !mutex_spin_on_owner(lock, owner)) | ||
| 338 | break; | ||
| 339 | |||
| 340 | /* Try to acquire the mutex if it is unlocked. */ | ||
| 341 | if (mutex_try_to_acquire(lock)) { | ||
| 342 | lock_acquired(&lock->dep_map, ip); | ||
| 343 | |||
| 344 | if (use_ww_ctx) { | ||
| 345 | struct ww_mutex *ww; | ||
| 346 | ww = container_of(lock, struct ww_mutex, base); | ||
| 347 | |||
| 348 | ww_mutex_set_context_fastpath(ww, ww_ctx); | ||
| 349 | } | ||
| 350 | |||
| 351 | mutex_set_owner(lock); | ||
| 352 | osq_unlock(&lock->osq); | ||
| 353 | return true; | ||
| 354 | } | ||
| 355 | |||
| 356 | /* | ||
| 357 | * When there's no owner, we might have preempted between the | ||
| 358 | * owner acquiring the lock and setting the owner field. If | ||
| 359 | * we're an RT task that will live-lock because we won't let | ||
| 360 | * the owner complete. | ||
| 361 | */ | ||
| 362 | if (!owner && (need_resched() || rt_task(task))) | ||
| 363 | break; | ||
| 364 | |||
| 365 | /* | ||
| 366 | * The cpu_relax() call is a compiler barrier which forces | ||
| 367 | * everything in this loop to be re-loaded. We don't need | ||
| 368 | * memory barriers as we'll eventually observe the right | ||
| 369 | * values at the cost of a few extra spins. | ||
| 370 | */ | ||
| 371 | cpu_relax_lowlatency(); | ||
| 372 | } | ||
| 373 | |||
| 374 | osq_unlock(&lock->osq); | ||
| 375 | done: | ||
| 376 | /* | ||
| 377 | * If we fell out of the spin path because of need_resched(), | ||
| 378 | * reschedule now, before we try-lock the mutex. This avoids getting | ||
| 379 | * scheduled out right after we obtained the mutex. | ||
| 380 | */ | ||
| 381 | if (need_resched()) | ||
| 382 | schedule_preempt_disabled(); | ||
| 383 | |||
| 384 | return false; | ||
| 385 | } | ||
| 386 | #else | ||
| 387 | static bool mutex_optimistic_spin(struct mutex *lock, | ||
| 388 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) | ||
| 389 | { | ||
| 390 | return false; | ||
| 391 | } | ||
| 183 | #endif | 392 | #endif |
| 184 | 393 | ||
| 185 | __visible __used noinline | 394 | __visible __used noinline |
| @@ -277,91 +486,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 277 | return 0; | 486 | return 0; |
| 278 | } | 487 | } |
| 279 | 488 | ||
| 280 | static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | ||
| 281 | struct ww_acquire_ctx *ww_ctx) | ||
| 282 | { | ||
| 283 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 284 | /* | ||
| 285 | * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | ||
| 286 | * but released with a normal mutex_unlock in this call. | ||
| 287 | * | ||
| 288 | * This should never happen, always use ww_mutex_unlock. | ||
| 289 | */ | ||
| 290 | DEBUG_LOCKS_WARN_ON(ww->ctx); | ||
| 291 | |||
| 292 | /* | ||
| 293 | * Not quite done after calling ww_acquire_done() ? | ||
| 294 | */ | ||
| 295 | DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | ||
| 296 | |||
| 297 | if (ww_ctx->contending_lock) { | ||
| 298 | /* | ||
| 299 | * After -EDEADLK you tried to | ||
| 300 | * acquire a different ww_mutex? Bad! | ||
| 301 | */ | ||
| 302 | DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | ||
| 303 | |||
| 304 | /* | ||
| 305 | * You called ww_mutex_lock after receiving -EDEADLK, | ||
| 306 | * but 'forgot' to unlock everything else first? | ||
| 307 | */ | ||
| 308 | DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | ||
| 309 | ww_ctx->contending_lock = NULL; | ||
| 310 | } | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Naughty, using a different class will lead to undefined behavior! | ||
| 314 | */ | ||
| 315 | DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | ||
| 316 | #endif | ||
| 317 | ww_ctx->acquired++; | ||
| 318 | } | ||
| 319 | |||
| 320 | /* | ||
| 321 | * after acquiring lock with fastpath or when we lost out in contested | ||
| 322 | * slowpath, set ctx and wake up any waiters so they can recheck. | ||
| 323 | * | ||
| 324 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | ||
| 325 | * as the fastpath and opportunistic spinning are disabled in that case. | ||
| 326 | */ | ||
| 327 | static __always_inline void | ||
| 328 | ww_mutex_set_context_fastpath(struct ww_mutex *lock, | ||
| 329 | struct ww_acquire_ctx *ctx) | ||
| 330 | { | ||
| 331 | unsigned long flags; | ||
| 332 | struct mutex_waiter *cur; | ||
| 333 | |||
| 334 | ww_mutex_lock_acquired(lock, ctx); | ||
| 335 | |||
| 336 | lock->ctx = ctx; | ||
| 337 | |||
| 338 | /* | ||
| 339 | * The lock->ctx update should be visible on all cores before | ||
| 340 | * the atomic read is done, otherwise contended waiters might be | ||
| 341 | * missed. The contended waiters will either see ww_ctx == NULL | ||
| 342 | * and keep spinning, or it will acquire wait_lock, add itself | ||
| 343 | * to waiter list and sleep. | ||
| 344 | */ | ||
| 345 | smp_mb(); /* ^^^ */ | ||
| 346 | |||
| 347 | /* | ||
| 348 | * Check if lock is contended, if not there is nobody to wake up | ||
| 349 | */ | ||
| 350 | if (likely(atomic_read(&lock->base.count) == 0)) | ||
| 351 | return; | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Uh oh, we raced in fastpath, wake up everyone in this case, | ||
| 355 | * so they can see the new lock->ctx. | ||
| 356 | */ | ||
| 357 | spin_lock_mutex(&lock->base.wait_lock, flags); | ||
| 358 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 359 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 360 | wake_up_process(cur->task); | ||
| 361 | } | ||
| 362 | spin_unlock_mutex(&lock->base.wait_lock, flags); | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | 489 | /* |
| 366 | * Lock a mutex (possibly interruptible), slowpath: | 490 | * Lock a mutex (possibly interruptible), slowpath: |
| 367 | */ | 491 | */ |
| @@ -378,104 +502,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 378 | preempt_disable(); | 502 | preempt_disable(); |
| 379 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); | 503 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
| 380 | 504 | ||
| 381 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 505 | if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { |
| 382 | /* | 506 | /* got the lock, yay! */ |
| 383 | * Optimistic spinning. | 507 | preempt_enable(); |
| 384 | * | 508 | return 0; |
| 385 | * We try to spin for acquisition when we find that the lock owner | ||
| 386 | * is currently running on a (different) CPU and while we don't | ||
| 387 | * need to reschedule. The rationale is that if the lock owner is | ||
| 388 | * running, it is likely to release the lock soon. | ||
| 389 | * | ||
| 390 | * Since this needs the lock owner, and this mutex implementation | ||
| 391 | * doesn't track the owner atomically in the lock field, we need to | ||
| 392 | * track it non-atomically. | ||
| 393 | * | ||
| 394 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock | ||
| 395 | * to serialize everything. | ||
| 396 | * | ||
| 397 | * The mutex spinners are queued up using MCS lock so that only one | ||
| 398 | * spinner can compete for the mutex. However, if mutex spinning isn't | ||
| 399 | * going to happen, there is no point in going through the lock/unlock | ||
| 400 | * overhead. | ||
| 401 | */ | ||
| 402 | if (!mutex_can_spin_on_owner(lock)) | ||
| 403 | goto slowpath; | ||
| 404 | |||
| 405 | if (!osq_lock(&lock->osq)) | ||
| 406 | goto slowpath; | ||
| 407 | |||
| 408 | for (;;) { | ||
| 409 | struct task_struct *owner; | ||
| 410 | |||
| 411 | if (use_ww_ctx && ww_ctx->acquired > 0) { | ||
| 412 | struct ww_mutex *ww; | ||
| 413 | |||
| 414 | ww = container_of(lock, struct ww_mutex, base); | ||
| 415 | /* | ||
| 416 | * If ww->ctx is set the contents are undefined, only | ||
| 417 | * by acquiring wait_lock there is a guarantee that | ||
| 418 | * they are not invalid when reading. | ||
| 419 | * | ||
| 420 | * As such, when deadlock detection needs to be | ||
| 421 | * performed the optimistic spinning cannot be done. | ||
| 422 | */ | ||
| 423 | if (ACCESS_ONCE(ww->ctx)) | ||
| 424 | break; | ||
| 425 | } | ||
| 426 | |||
| 427 | /* | ||
| 428 | * If there's an owner, wait for it to either | ||
| 429 | * release the lock or go to sleep. | ||
| 430 | */ | ||
| 431 | owner = ACCESS_ONCE(lock->owner); | ||
| 432 | if (owner && !mutex_spin_on_owner(lock, owner)) | ||
| 433 | break; | ||
| 434 | |||
| 435 | /* Try to acquire the mutex if it is unlocked. */ | ||
| 436 | if (!mutex_is_locked(lock) && | ||
| 437 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | ||
| 438 | lock_acquired(&lock->dep_map, ip); | ||
| 439 | if (use_ww_ctx) { | ||
| 440 | struct ww_mutex *ww; | ||
| 441 | ww = container_of(lock, struct ww_mutex, base); | ||
| 442 | |||
| 443 | ww_mutex_set_context_fastpath(ww, ww_ctx); | ||
| 444 | } | ||
| 445 | |||
| 446 | mutex_set_owner(lock); | ||
| 447 | osq_unlock(&lock->osq); | ||
| 448 | preempt_enable(); | ||
| 449 | return 0; | ||
| 450 | } | ||
| 451 | |||
| 452 | /* | ||
| 453 | * When there's no owner, we might have preempted between the | ||
| 454 | * owner acquiring the lock and setting the owner field. If | ||
| 455 | * we're an RT task that will live-lock because we won't let | ||
| 456 | * the owner complete. | ||
| 457 | */ | ||
| 458 | if (!owner && (need_resched() || rt_task(task))) | ||
| 459 | break; | ||
| 460 | |||
| 461 | /* | ||
| 462 | * The cpu_relax() call is a compiler barrier which forces | ||
| 463 | * everything in this loop to be re-loaded. We don't need | ||
| 464 | * memory barriers as we'll eventually observe the right | ||
| 465 | * values at the cost of a few extra spins. | ||
| 466 | */ | ||
| 467 | cpu_relax_lowlatency(); | ||
| 468 | } | 509 | } |
| 469 | osq_unlock(&lock->osq); | 510 | |
| 470 | slowpath: | ||
| 471 | /* | ||
| 472 | * If we fell out of the spin path because of need_resched(), | ||
| 473 | * reschedule now, before we try-lock the mutex. This avoids getting | ||
| 474 | * scheduled out right after we obtained the mutex. | ||
| 475 | */ | ||
| 476 | if (need_resched()) | ||
| 477 | schedule_preempt_disabled(); | ||
| 478 | #endif | ||
| 479 | spin_lock_mutex(&lock->wait_lock, flags); | 511 | spin_lock_mutex(&lock->wait_lock, flags); |
| 480 | 512 | ||
| 481 | /* | 513 | /* |
| @@ -679,15 +711,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); | |||
| 679 | * Release the lock, slowpath: | 711 | * Release the lock, slowpath: |
| 680 | */ | 712 | */ |
| 681 | static inline void | 713 | static inline void |
| 682 | __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | 714 | __mutex_unlock_common_slowpath(struct mutex *lock, int nested) |
| 683 | { | 715 | { |
| 684 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 685 | unsigned long flags; | 716 | unsigned long flags; |
| 686 | 717 | ||
| 687 | /* | 718 | /* |
| 688 | * some architectures leave the lock unlocked in the fastpath failure | 719 | * As a performance measurement, release the lock before doing other |
| 720 | * wakeup related duties to follow. This allows other tasks to acquire | ||
| 721 | * the lock sooner, while still handling cleanups in past unlock calls. | ||
| 722 | * This can be done as we do not enforce strict equivalence between the | ||
| 723 | * mutex counter and wait_list. | ||
| 724 | * | ||
| 725 | * | ||
| 726 | * Some architectures leave the lock unlocked in the fastpath failure | ||
| 689 | * case, others need to leave it locked. In the later case we have to | 727 | * case, others need to leave it locked. In the later case we have to |
| 690 | * unlock it here | 728 | * unlock it here - as the lock counter is currently 0 or negative. |
| 691 | */ | 729 | */ |
| 692 | if (__mutex_slowpath_needs_to_unlock()) | 730 | if (__mutex_slowpath_needs_to_unlock()) |
| 693 | atomic_set(&lock->count, 1); | 731 | atomic_set(&lock->count, 1); |
| @@ -716,7 +754,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | |||
| 716 | __visible void | 754 | __visible void |
| 717 | __mutex_unlock_slowpath(atomic_t *lock_count) | 755 | __mutex_unlock_slowpath(atomic_t *lock_count) |
| 718 | { | 756 | { |
| 719 | __mutex_unlock_common_slowpath(lock_count, 1); | 757 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 758 | |||
| 759 | __mutex_unlock_common_slowpath(lock, 1); | ||
| 720 | } | 760 | } |
| 721 | 761 | ||
| 722 | #ifndef CONFIG_DEBUG_LOCK_ALLOC | 762 | #ifndef CONFIG_DEBUG_LOCK_ALLOC |
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
| 17 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
| 18 | 18 | ||
| 19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
| 21 | { | 21 | { |
| 22 | lock->owner = current; | 22 | lock->owner = current; |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | 8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt |
| 9 | * Copyright (C) 2006 Esben Nielsen | 9 | * Copyright (C) 2006 Esben Nielsen |
| 10 | * | 10 | * |
| 11 | * See Documentation/rt-mutex-design.txt for details. | 11 | * See Documentation/locking/rt-mutex-design.txt for details. |
| 12 | */ | 12 | */ |
| 13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
| 14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d6203faf2eb1..7628c3fc37ca 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 246 | 246 | ||
| 247 | return sem; | 247 | return sem; |
| 248 | } | 248 | } |
| 249 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
| 249 | 250 | ||
| 250 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | 251 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) |
| 251 | { | 252 | { |
| 252 | if (!(count & RWSEM_ACTIVE_MASK)) { | 253 | /* |
| 253 | /* try acquiring the write lock */ | 254 | * Try acquiring the write lock. Check count first in order |
| 254 | if (sem->count == RWSEM_WAITING_BIAS && | 255 | * to reduce unnecessary expensive cmpxchg() operations. |
| 255 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, | 256 | */ |
| 256 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 257 | if (count == RWSEM_WAITING_BIAS && |
| 257 | if (!list_is_singular(&sem->wait_list)) | 258 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, |
| 258 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 259 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { |
| 259 | return true; | 260 | if (!list_is_singular(&sem->wait_list)) |
| 260 | } | 261 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); |
| 262 | return true; | ||
| 261 | } | 263 | } |
| 264 | |||
| 262 | return false; | 265 | return false; |
| 263 | } | 266 | } |
| 264 | 267 | ||
| @@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
| 465 | 468 | ||
| 466 | return sem; | 469 | return sem; |
| 467 | } | 470 | } |
| 471 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
| 468 | 472 | ||
| 469 | /* | 473 | /* |
| 470 | * handle waking up a waiter on the semaphore | 474 | * handle waking up a waiter on the semaphore |
| @@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
| 485 | 489 | ||
| 486 | return sem; | 490 | return sem; |
| 487 | } | 491 | } |
| 492 | EXPORT_SYMBOL(rwsem_wake); | ||
| 488 | 493 | ||
| 489 | /* | 494 | /* |
| 490 | * downgrade a write lock into a read lock | 495 | * downgrade a write lock into a read lock |
| @@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | |||
| 506 | 511 | ||
| 507 | return sem; | 512 | return sem; |
| 508 | } | 513 | } |
| 509 | |||
| 510 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
| 511 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
| 512 | EXPORT_SYMBOL(rwsem_wake); | ||
| 513 | EXPORT_SYMBOL(rwsem_downgrade_wake); | 514 | EXPORT_SYMBOL(rwsem_downgrade_wake); |
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..b8120abe594b 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c | |||
| @@ -36,7 +36,7 @@ | |||
| 36 | static noinline void __down(struct semaphore *sem); | 36 | static noinline void __down(struct semaphore *sem); |
| 37 | static noinline int __down_interruptible(struct semaphore *sem); | 37 | static noinline int __down_interruptible(struct semaphore *sem); |
| 38 | static noinline int __down_killable(struct semaphore *sem); | 38 | static noinline int __down_killable(struct semaphore *sem); |
| 39 | static noinline int __down_timeout(struct semaphore *sem, long jiffies); | 39 | static noinline int __down_timeout(struct semaphore *sem, long timeout); |
| 40 | static noinline void __up(struct semaphore *sem); | 40 | static noinline void __up(struct semaphore *sem); |
| 41 | 41 | ||
| 42 | /** | 42 | /** |
| @@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock); | |||
| 145 | /** | 145 | /** |
| 146 | * down_timeout - acquire the semaphore within a specified time | 146 | * down_timeout - acquire the semaphore within a specified time |
| 147 | * @sem: the semaphore to be acquired | 147 | * @sem: the semaphore to be acquired |
| 148 | * @jiffies: how long to wait before failing | 148 | * @timeout: how long to wait before failing |
| 149 | * | 149 | * |
| 150 | * Attempts to acquire the semaphore. If no more tasks are allowed to | 150 | * Attempts to acquire the semaphore. If no more tasks are allowed to |
| 151 | * acquire the semaphore, calling this function will put the task to sleep. | 151 | * acquire the semaphore, calling this function will put the task to sleep. |
| 152 | * If the semaphore is not released within the specified number of jiffies, | 152 | * If the semaphore is not released within the specified number of jiffies, |
| 153 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. | 153 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. |
| 154 | */ | 154 | */ |
| 155 | int down_timeout(struct semaphore *sem, long jiffies) | 155 | int down_timeout(struct semaphore *sem, long timeout) |
| 156 | { | 156 | { |
| 157 | unsigned long flags; | 157 | unsigned long flags; |
| 158 | int result = 0; | 158 | int result = 0; |
| @@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies) | |||
| 161 | if (likely(sem->count > 0)) | 161 | if (likely(sem->count > 0)) |
| 162 | sem->count--; | 162 | sem->count--; |
| 163 | else | 163 | else |
| 164 | result = __down_timeout(sem, jiffies); | 164 | result = __down_timeout(sem, timeout); |
| 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); | 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
| 166 | 166 | ||
| 167 | return result; | 167 | return result; |
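The rename from 'jiffies' to 'timeout' does not change behaviour: the argument is still a relative timeout in jiffies, and the new name presumably just stops the parameter from shadowing the kernel's global jiffies counter. A hedged caller sketch, where my_sem and my_wait_for_resource() are invented for illustration:

    #include <linux/semaphore.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>

    static DEFINE_SEMAPHORE(my_sem);        /* example semaphore, count = 1 */

    static int my_wait_for_resource(void)   /* hypothetical caller */
    {
            /* still a jiffies-based timeout; only the parameter name changed */
            if (down_timeout(&my_sem, msecs_to_jiffies(500)) == -ETIME)
                    return -EBUSY;          /* gave up after 500 ms */
            /* ... touch the protected resource ... */
            up(&my_sem);
            return 0;
    }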
| @@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem) | |||
| 248 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); | 248 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); |
| 249 | } | 249 | } |
| 250 | 250 | ||
| 251 | static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) | 251 | static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) |
| 252 | { | 252 | { |
| 253 | return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); | 253 | return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); |
| 254 | } | 254 | } |
| 255 | 255 | ||
| 256 | static noinline void __sched __up(struct semaphore *sem) | 256 | static noinline void __sched __up(struct semaphore *sem) |
diff --git a/kernel/module.c b/kernel/module.c index ae79ce615cb9..88cec1ddb1e3 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val, | |||
| 135 | } | 135 | } |
| 136 | 136 | ||
| 137 | static const struct kernel_param_ops param_ops_bool_enable_only = { | 137 | static const struct kernel_param_ops param_ops_bool_enable_only = { |
| 138 | .flags = KERNEL_PARAM_FL_NOARG, | 138 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 139 | .set = param_set_bool_enable_only, | 139 | .set = param_set_bool_enable_only, |
| 140 | .get = param_get_bool, | 140 | .get = param_get_bool, |
| 141 | }; | 141 | }; |
| @@ -1842,7 +1842,9 @@ static void free_module(struct module *mod) | |||
| 1842 | 1842 | ||
| 1843 | /* We leave it in list to prevent duplicate loads, but make sure | 1843 | /* We leave it in list to prevent duplicate loads, but make sure |
| 1844 | * that no one uses it while it's being deconstructed. */ | 1844 | * that no one uses it while it's being deconstructed. */ |
| 1845 | mutex_lock(&module_mutex); | ||
| 1845 | mod->state = MODULE_STATE_UNFORMED; | 1846 | mod->state = MODULE_STATE_UNFORMED; |
| 1847 | mutex_unlock(&module_mutex); | ||
| 1846 | 1848 | ||
| 1847 | /* Remove dynamic debug info */ | 1849 | /* Remove dynamic debug info */ |
| 1848 | ddebug_remove_module(mod->name); | 1850 | ddebug_remove_module(mod->name); |
| @@ -3304,6 +3306,11 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3304 | mutex_lock(&module_mutex); | 3306 | mutex_lock(&module_mutex); |
| 3305 | module_bug_cleanup(mod); | 3307 | module_bug_cleanup(mod); |
| 3306 | mutex_unlock(&module_mutex); | 3308 | mutex_unlock(&module_mutex); |
| 3309 | |||
| 3310 | /* we can't deallocate the module until we clear memory protection */ | ||
| 3311 | unset_module_init_ro_nx(mod); | ||
| 3312 | unset_module_core_ro_nx(mod); | ||
| 3313 | |||
| 3307 | ddebug_cleanup: | 3314 | ddebug_cleanup: |
| 3308 | dynamic_debug_remove(info->debug); | 3315 | dynamic_debug_remove(info->debug); |
| 3309 | synchronize_sched(); | 3316 | synchronize_sched(); |
| @@ -3381,7 +3388,9 @@ static inline int within(unsigned long addr, void *start, unsigned long size) | |||
| 3381 | */ | 3388 | */ |
| 3382 | static inline int is_arm_mapping_symbol(const char *str) | 3389 | static inline int is_arm_mapping_symbol(const char *str) |
| 3383 | { | 3390 | { |
| 3384 | return str[0] == '$' && strchr("atd", str[1]) | 3391 | if (str[0] == '.' && str[1] == 'L') |
| 3392 | return true; | ||
| 3393 | return str[0] == '$' && strchr("axtd", str[1]) | ||
| 3385 | && (str[2] == '\0' || str[2] == '.'); | 3394 | && (str[2] == '\0' || str[2] == '.'); |
| 3386 | } | 3395 | } |
| 3387 | 3396 | ||
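With this change the filter hides two further classes of linker noise when resolving addresses: compiler-generated local labels such as ".Ltmp3" (anything beginning with ".L"), and the '$x' mapping symbol alongside the existing '$a'/'$t'/'$d', '$x' presumably being added for arm64, whose toolchains emit it to mark A64 code regions. Symbols like "$d.12" or ".Lframe0" are therefore skipped, while ordinary symbols such as "do_exit" are still reported.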
| @@ -3444,8 +3453,7 @@ const char *module_address_lookup(unsigned long addr, | |||
| 3444 | list_for_each_entry_rcu(mod, &modules, list) { | 3453 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3445 | if (mod->state == MODULE_STATE_UNFORMED) | 3454 | if (mod->state == MODULE_STATE_UNFORMED) |
| 3446 | continue; | 3455 | continue; |
| 3447 | if (within_module_init(addr, mod) || | 3456 | if (within_module(addr, mod)) { |
| 3448 | within_module_core(addr, mod)) { | ||
| 3449 | if (modname) | 3457 | if (modname) |
| 3450 | *modname = mod->name; | 3458 | *modname = mod->name; |
| 3451 | ret = get_ksymbol(mod, addr, size, offset); | 3459 | ret = get_ksymbol(mod, addr, size, offset); |
| @@ -3469,8 +3477,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
| 3469 | list_for_each_entry_rcu(mod, &modules, list) { | 3477 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3470 | if (mod->state == MODULE_STATE_UNFORMED) | 3478 | if (mod->state == MODULE_STATE_UNFORMED) |
| 3471 | continue; | 3479 | continue; |
| 3472 | if (within_module_init(addr, mod) || | 3480 | if (within_module(addr, mod)) { |
| 3473 | within_module_core(addr, mod)) { | ||
| 3474 | const char *sym; | 3481 | const char *sym; |
| 3475 | 3482 | ||
| 3476 | sym = get_ksymbol(mod, addr, NULL, NULL); | 3483 | sym = get_ksymbol(mod, addr, NULL, NULL); |
| @@ -3495,8 +3502,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
| 3495 | list_for_each_entry_rcu(mod, &modules, list) { | 3502 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3496 | if (mod->state == MODULE_STATE_UNFORMED) | 3503 | if (mod->state == MODULE_STATE_UNFORMED) |
| 3497 | continue; | 3504 | continue; |
| 3498 | if (within_module_init(addr, mod) || | 3505 | if (within_module(addr, mod)) { |
| 3499 | within_module_core(addr, mod)) { | ||
| 3500 | const char *sym; | 3506 | const char *sym; |
| 3501 | 3507 | ||
| 3502 | sym = get_ksymbol(mod, addr, size, offset); | 3508 | sym = get_ksymbol(mod, addr, size, offset); |
| @@ -3760,8 +3766,7 @@ struct module *__module_address(unsigned long addr) | |||
| 3760 | list_for_each_entry_rcu(mod, &modules, list) { | 3766 | list_for_each_entry_rcu(mod, &modules, list) { |
| 3761 | if (mod->state == MODULE_STATE_UNFORMED) | 3767 | if (mod->state == MODULE_STATE_UNFORMED) |
| 3762 | continue; | 3768 | continue; |
| 3763 | if (within_module_core(addr, mod) | 3769 | if (within_module(addr, mod)) |
| 3764 | || within_module_init(addr, mod)) | ||
| 3765 | return mod; | 3770 | return mod; |
| 3766 | } | 3771 | } |
| 3767 | return NULL; | 3772 | return NULL; |
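
The kernel/module.c hunks above collapse the paired `within_module_init()`/`within_module_core()` checks into a single `within_module()` call at every lookup site, take `module_mutex` around the `MODULE_STATE_UNFORMED` transition in `free_module()`, and clear RO/NX protections before deallocating a module whose load failed. A minimal userspace sketch of the address-range consolidation follows; the struct layout and helper names (`fake_module`, `within_fake_module`) are invented for illustration and are not the kernel's definitions.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_module {
	const char *name;
	uintptr_t core_base, core_size;	/* text/data that stays resident */
	uintptr_t init_base, init_size;	/* freed after initialization */
};

static bool within_region(uintptr_t addr, uintptr_t base, uintptr_t size)
{
	return addr >= base && addr < base + size;
}

/* One call replaces the old "init region || core region" pair at each caller. */
static bool within_fake_module(uintptr_t addr, const struct fake_module *mod)
{
	return within_region(addr, mod->init_base, mod->init_size) ||
	       within_region(addr, mod->core_base, mod->core_size);
}

int main(void)
{
	struct fake_module m = {
		.name = "demo", .core_base = 0x1000, .core_size = 0x800,
		.init_base = 0x4000, .init_size = 0x100,
	};

	printf("0x1200 in %s: %d\n", m.name, within_fake_module(0x1200, &m));
	printf("0x4050 in %s: %d\n", m.name, within_fake_module(0x4050, &m));
	printf("0x9000 in %s: %d\n", m.name, within_fake_module(0x9000, &m));
	return 0;
}
```

The benefit is the same in the sketch as in the diff: callers no longer need to know that a module body lives in two separately allocated regions.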
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..ef42d0ab3115 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) | |||
| 204 | 204 | ||
| 205 | might_sleep(); | 205 | might_sleep(); |
| 206 | 206 | ||
| 207 | task_lock(p); | ||
| 207 | ns = p->nsproxy; | 208 | ns = p->nsproxy; |
| 209 | p->nsproxy = new; | ||
| 210 | task_unlock(p); | ||
| 208 | 211 | ||
| 209 | rcu_assign_pointer(p->nsproxy, new); | 212 | if (ns && atomic_dec_and_test(&ns->count)) |
| 210 | |||
| 211 | if (ns && atomic_dec_and_test(&ns->count)) { | ||
| 212 | /* | ||
| 213 | * wait for others to get what they want from this nsproxy. | ||
| 214 | * | ||
| 215 | * cannot release this nsproxy via the call_rcu() since | ||
| 216 | * put_mnt_ns() will want to sleep | ||
| 217 | */ | ||
| 218 | synchronize_rcu(); | ||
| 219 | free_nsproxy(ns); | 213 | free_nsproxy(ns); |
| 220 | } | ||
| 221 | } | 214 | } |
| 222 | 215 | ||
| 223 | void exit_task_namespaces(struct task_struct *p) | 216 | void exit_task_namespaces(struct task_struct *p) |
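
The nsproxy.c hunk swaps the task's `nsproxy` pointer under `task_lock()` and drops the reference immediately, instead of publishing it with `rcu_assign_pointer()` and waiting in `synchronize_rcu()` before freeing. A compact userspace analog of that "swap under a lock, free on last reference" pattern is sketched below; the `box`/`shared_state` names are made up for the example and the sketch does not model the kernel's RCU semantics.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_state {
	atomic_int count;	/* reference count */
	int value;
};

struct box {
	pthread_mutex_t lock;
	struct shared_state *state;
};

static struct shared_state *state_new(int value)
{
	struct shared_state *s = malloc(sizeof(*s));

	atomic_init(&s->count, 1);
	s->value = value;
	return s;
}

/* Publish @new under the lock, then release our reference to the old state. */
static void switch_state(struct box *b, struct shared_state *new)
{
	struct shared_state *old;

	pthread_mutex_lock(&b->lock);
	old = b->state;
	b->state = new;
	pthread_mutex_unlock(&b->lock);

	if (old && atomic_fetch_sub(&old->count, 1) == 1)
		free(old);	/* last reference gone: safe to free */
}

int main(void)
{
	struct box b = { .lock = PTHREAD_MUTEX_INITIALIZER, .state = state_new(1) };

	switch_state(&b, state_new(2));
	printf("current value: %d\n", b.state->value);
	switch_state(&b, NULL);
	return 0;
}
```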
diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..cf80672b7924 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { | |||
| 224 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, | 224 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, |
| 225 | { TAINT_OOT_MODULE, 'O', ' ' }, | 225 | { TAINT_OOT_MODULE, 'O', ' ' }, |
| 226 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, | 226 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, |
| 227 | { TAINT_SOFTLOCKUP, 'L', ' ' }, | ||
| 227 | }; | 228 | }; |
| 228 | 229 | ||
| 229 | /** | 230 | /** |
| @@ -243,6 +244,7 @@ static const struct tnt tnts[] = { | |||
| 243 | * 'I' - Working around severe firmware bug. | 244 | * 'I' - Working around severe firmware bug. |
| 244 | * 'O' - Out-of-tree module has been loaded. | 245 | * 'O' - Out-of-tree module has been loaded. |
| 245 | * 'E' - Unsigned module has been loaded. | 246 | * 'E' - Unsigned module has been loaded. |
| 247 | * 'L' - A soft lockup has previously occurred. | ||
| 246 | * | 248 | * |
| 247 | * The string is overwritten by the next call to print_tainted(). | 249 | * The string is overwritten by the next call to print_tainted(). |
| 248 | */ | 250 | */ |
diff --git a/kernel/params.c b/kernel/params.c index 34f527023794..db97b791390f 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
| 20 | #include <linux/errno.h> | 20 | #include <linux/errno.h> |
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
| 22 | #include <linux/moduleparam.h> | ||
| 22 | #include <linux/device.h> | 23 | #include <linux/device.h> |
| 23 | #include <linux/err.h> | 24 | #include <linux/err.h> |
| 24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| @@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b) | |||
| 83 | return parameqn(a, b, strlen(a)+1); | 84 | return parameqn(a, b, strlen(a)+1); |
| 84 | } | 85 | } |
| 85 | 86 | ||
| 87 | static void param_check_unsafe(const struct kernel_param *kp) | ||
| 88 | { | ||
| 89 | if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { | ||
| 90 | pr_warn("Setting dangerous option %s - tainting kernel\n", | ||
| 91 | kp->name); | ||
| 92 | add_taint(TAINT_USER, LOCKDEP_STILL_OK); | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 86 | static int parse_one(char *param, | 96 | static int parse_one(char *param, |
| 87 | char *val, | 97 | char *val, |
| 88 | const char *doing, | 98 | const char *doing, |
| @@ -104,11 +114,12 @@ static int parse_one(char *param, | |||
| 104 | return 0; | 114 | return 0; |
| 105 | /* No one handled NULL, so do it here. */ | 115 | /* No one handled NULL, so do it here. */ |
| 106 | if (!val && | 116 | if (!val && |
| 107 | !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) | 117 | !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) |
| 108 | return -EINVAL; | 118 | return -EINVAL; |
| 109 | pr_debug("handling %s with %p\n", param, | 119 | pr_debug("handling %s with %p\n", param, |
| 110 | params[i].ops->set); | 120 | params[i].ops->set); |
| 111 | mutex_lock(¶m_lock); | 121 | mutex_lock(¶m_lock); |
| 122 | param_check_unsafe(¶ms[i]); | ||
| 112 | err = params[i].ops->set(val, ¶ms[i]); | 123 | err = params[i].ops->set(val, ¶ms[i]); |
| 113 | mutex_unlock(¶m_lock); | 124 | mutex_unlock(¶m_lock); |
| 114 | return err; | 125 | return err; |
| @@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) | |||
| 318 | EXPORT_SYMBOL(param_get_bool); | 329 | EXPORT_SYMBOL(param_get_bool); |
| 319 | 330 | ||
| 320 | struct kernel_param_ops param_ops_bool = { | 331 | struct kernel_param_ops param_ops_bool = { |
| 321 | .flags = KERNEL_PARAM_FL_NOARG, | 332 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 322 | .set = param_set_bool, | 333 | .set = param_set_bool, |
| 323 | .get = param_get_bool, | 334 | .get = param_get_bool, |
| 324 | }; | 335 | }; |
| @@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) | |||
| 369 | EXPORT_SYMBOL(param_set_bint); | 380 | EXPORT_SYMBOL(param_set_bint); |
| 370 | 381 | ||
| 371 | struct kernel_param_ops param_ops_bint = { | 382 | struct kernel_param_ops param_ops_bint = { |
| 372 | .flags = KERNEL_PARAM_FL_NOARG, | 383 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 373 | .set = param_set_bint, | 384 | .set = param_set_bint, |
| 374 | .get = param_get_int, | 385 | .get = param_get_int, |
| 375 | }; | 386 | }; |
| @@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string); | |||
| 503 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) | 514 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
| 504 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) | 515 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) |
| 505 | 516 | ||
| 506 | extern struct kernel_param __start___param[], __stop___param[]; | ||
| 507 | |||
| 508 | struct param_attribute | 517 | struct param_attribute |
| 509 | { | 518 | { |
| 510 | struct module_attribute mattr; | 519 | struct module_attribute mattr; |
| @@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
| 552 | return -EPERM; | 561 | return -EPERM; |
| 553 | 562 | ||
| 554 | mutex_lock(¶m_lock); | 563 | mutex_lock(¶m_lock); |
| 564 | param_check_unsafe(attribute->param); | ||
| 555 | err = attribute->param->ops->set(buf, attribute->param); | 565 | err = attribute->param->ops->set(buf, attribute->param); |
| 556 | mutex_unlock(¶m_lock); | 566 | mutex_unlock(¶m_lock); |
| 557 | if (!err) | 567 | if (!err) |
| @@ -763,7 +773,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
| 763 | } | 773 | } |
| 764 | 774 | ||
| 765 | static void __init kernel_add_sysfs_param(const char *name, | 775 | static void __init kernel_add_sysfs_param(const char *name, |
| 766 | struct kernel_param *kparam, | 776 | const struct kernel_param *kparam, |
| 767 | unsigned int name_skip) | 777 | unsigned int name_skip) |
| 768 | { | 778 | { |
| 769 | struct module_kobject *mk; | 779 | struct module_kobject *mk; |
| @@ -798,7 +808,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
| 798 | */ | 808 | */ |
| 799 | static void __init param_sysfs_builtin(void) | 809 | static void __init param_sysfs_builtin(void) |
| 800 | { | 810 | { |
| 801 | struct kernel_param *kp; | 811 | const struct kernel_param *kp; |
| 802 | unsigned int name_len; | 812 | unsigned int name_len; |
| 803 | char modname[MODULE_NAME_LEN]; | 813 | char modname[MODULE_NAME_LEN]; |
| 804 | 814 | ||
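
Besides renaming the ops-level flag to `KERNEL_PARAM_OPS_FL_NOARG`, the params.c hunks add `param_check_unsafe()`, which warns and taints the kernel before setting a parameter marked `KERNEL_PARAM_FL_UNSAFE`. The userspace sketch below shows the same "warn loudly, then still apply" gate; the flag value, struct layout, and function names are invented for illustration.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PARAM_FL_UNSAFE (1 << 0)

struct demo_param {
	const char *name;
	unsigned int flags;
	int value;
};

static void check_unsafe(const struct demo_param *p)
{
	if (p->flags & PARAM_FL_UNSAFE)
		fprintf(stderr, "warning: setting dangerous option %s\n", p->name);
}

static bool set_param(struct demo_param *params, size_t n,
		      const char *name, int value)
{
	for (size_t i = 0; i < n; i++) {
		if (strcmp(params[i].name, name) != 0)
			continue;
		check_unsafe(&params[i]);	/* warn before applying */
		params[i].value = value;
		return true;
	}
	return false;
}

int main(void)
{
	struct demo_param params[] = {
		{ "debug_level", 0, 0 },
		{ "skip_checks", PARAM_FL_UNSAFE, 0 },
	};

	set_param(params, 2, "debug_level", 3);
	set_param(params, 2, "skip_checks", 1);
	printf("skip_checks=%d\n", params[1].value);
	return 0;
}
```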
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e4e4121fa327..bbef57f5bdfd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME | |||
| 302 | def_bool y | 302 | def_bool y |
| 303 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | 303 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS |
| 304 | 304 | ||
| 305 | config PM_GENERIC_DOMAINS_OF | ||
| 306 | def_bool y | ||
| 307 | depends on PM_GENERIC_DOMAINS && OF | ||
| 308 | |||
| 305 | config CPU_PM | 309 | config CPU_PM |
| 306 | bool | 310 | bool |
| 307 | depends on SUSPEND || CPU_IDLE | 311 | depends on SUSPEND || CPU_IDLE |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..1f35a3478f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) | |||
| 502 | error = dpm_suspend_start(PMSG_QUIESCE); | 502 | error = dpm_suspend_start(PMSG_QUIESCE); |
| 503 | if (!error) { | 503 | if (!error) { |
| 504 | error = resume_target_kernel(platform_mode); | 504 | error = resume_target_kernel(platform_mode); |
| 505 | dpm_resume_end(PMSG_RECOVER); | 505 | /* |
| 506 | * The above should either succeed and jump to the new kernel, | ||
| 507 | * or return with an error. Otherwise things are just | ||
| 508 | * undefined, so let's be paranoid. | ||
| 509 | */ | ||
| 510 | BUG_ON(!error); | ||
| 506 | } | 511 | } |
| 512 | dpm_resume_end(PMSG_RECOVER); | ||
| 507 | pm_restore_gfp_mask(); | 513 | pm_restore_gfp_mask(); |
| 508 | resume_console(); | 514 | resume_console(); |
| 509 | pm_restore_console(); | 515 | pm_restore_console(); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d49dcac2537..2df883a9d3cb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, | |||
| 179 | 179 | ||
| 180 | #ifdef CONFIG_SUSPEND | 180 | #ifdef CONFIG_SUSPEND |
| 181 | /* kernel/power/suspend.c */ | 181 | /* kernel/power/suspend.c */ |
| 182 | extern const char *pm_labels[]; | ||
| 182 | extern const char *pm_states[]; | 183 | extern const char *pm_states[]; |
| 183 | 184 | ||
| 184 | extern int suspend_devices_and_enter(suspend_state_t state); | 185 | extern int suspend_devices_and_enter(suspend_state_t state); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 46 | while (true) { | 46 | while (true) { |
| 47 | todo = 0; | 47 | todo = 0; |
| 48 | read_lock(&tasklist_lock); | 48 | read_lock(&tasklist_lock); |
| 49 | do_each_thread(g, p) { | 49 | for_each_process_thread(g, p) { |
| 50 | if (p == current || !freeze_task(p)) | 50 | if (p == current || !freeze_task(p)) |
| 51 | continue; | 51 | continue; |
| 52 | 52 | ||
| 53 | if (!freezer_should_skip(p)) | 53 | if (!freezer_should_skip(p)) |
| 54 | todo++; | 54 | todo++; |
| 55 | } while_each_thread(g, p); | 55 | } |
| 56 | read_unlock(&tasklist_lock); | 56 | read_unlock(&tasklist_lock); |
| 57 | 57 | ||
| 58 | if (!user_only) { | 58 | if (!user_only) { |
| @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 93 | 93 | ||
| 94 | if (!wakeup) { | 94 | if (!wakeup) { |
| 95 | read_lock(&tasklist_lock); | 95 | read_lock(&tasklist_lock); |
| 96 | do_each_thread(g, p) { | 96 | for_each_process_thread(g, p) { |
| 97 | if (p != current && !freezer_should_skip(p) | 97 | if (p != current && !freezer_should_skip(p) |
| 98 | && freezing(p) && !frozen(p)) | 98 | && freezing(p) && !frozen(p)) |
| 99 | sched_show_task(p); | 99 | sched_show_task(p); |
| 100 | } while_each_thread(g, p); | 100 | } |
| 101 | read_unlock(&tasklist_lock); | 101 | read_unlock(&tasklist_lock); |
| 102 | } | 102 | } |
| 103 | } else { | 103 | } else { |
| @@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 108 | return todo ? -EBUSY : 0; | 108 | return todo ? -EBUSY : 0; |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | static bool __check_frozen_processes(void) | ||
| 112 | { | ||
| 113 | struct task_struct *g, *p; | ||
| 114 | |||
| 115 | for_each_process_thread(g, p) | ||
| 116 | if (p != current && !freezer_should_skip(p) && !frozen(p)) | ||
| 117 | return false; | ||
| 118 | |||
| 119 | return true; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Returns true if all freezable tasks (except for current) are frozen already | ||
| 124 | */ | ||
| 125 | static bool check_frozen_processes(void) | ||
| 126 | { | ||
| 127 | bool ret; | ||
| 128 | |||
| 129 | read_lock(&tasklist_lock); | ||
| 130 | ret = __check_frozen_processes(); | ||
| 131 | read_unlock(&tasklist_lock); | ||
| 132 | return ret; | ||
| 133 | } | ||
| 134 | |||
| 111 | /** | 135 | /** |
| 112 | * freeze_processes - Signal user space processes to enter the refrigerator. | 136 | * freeze_processes - Signal user space processes to enter the refrigerator. |
| 113 | * The current thread will not be frozen. The same process that calls | 137 | * The current thread will not be frozen. The same process that calls |
| @@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 118 | int freeze_processes(void) | 142 | int freeze_processes(void) |
| 119 | { | 143 | { |
| 120 | int error; | 144 | int error; |
| 145 | int oom_kills_saved; | ||
| 121 | 146 | ||
| 122 | error = __usermodehelper_disable(UMH_FREEZING); | 147 | error = __usermodehelper_disable(UMH_FREEZING); |
| 123 | if (error) | 148 | if (error) |
| @@ -129,13 +154,28 @@ int freeze_processes(void) | |||
| 129 | if (!pm_freezing) | 154 | if (!pm_freezing) |
| 130 | atomic_inc(&system_freezing_cnt); | 155 | atomic_inc(&system_freezing_cnt); |
| 131 | 156 | ||
| 157 | pm_wakeup_clear(); | ||
| 132 | printk("Freezing user space processes ... "); | 158 | printk("Freezing user space processes ... "); |
| 133 | pm_freezing = true; | 159 | pm_freezing = true; |
| 160 | oom_kills_saved = oom_kills_count(); | ||
| 134 | error = try_to_freeze_tasks(true); | 161 | error = try_to_freeze_tasks(true); |
| 135 | if (!error) { | 162 | if (!error) { |
| 136 | printk("done."); | ||
| 137 | __usermodehelper_set_disable_depth(UMH_DISABLED); | 163 | __usermodehelper_set_disable_depth(UMH_DISABLED); |
| 138 | oom_killer_disable(); | 164 | oom_killer_disable(); |
| 165 | |||
| 166 | /* | ||
| 167 | * There might have been an OOM kill while we were | ||
| 168 | * freezing tasks and the killed task might be still | ||
| 169 | * on the way out so we have to double check for race. | ||
| 170 | */ | ||
| 171 | if (oom_kills_count() != oom_kills_saved && | ||
| 172 | !check_frozen_processes()) { | ||
| 173 | __usermodehelper_set_disable_depth(UMH_ENABLED); | ||
| 174 | printk("OOM in progress."); | ||
| 175 | error = -EBUSY; | ||
| 176 | } else { | ||
| 177 | printk("done."); | ||
| 178 | } | ||
| 139 | } | 179 | } |
| 140 | printk("\n"); | 180 | printk("\n"); |
| 141 | BUG_ON(in_atomic()); | 181 | BUG_ON(in_atomic()); |
| @@ -190,11 +230,11 @@ void thaw_processes(void) | |||
| 190 | thaw_workqueues(); | 230 | thaw_workqueues(); |
| 191 | 231 | ||
| 192 | read_lock(&tasklist_lock); | 232 | read_lock(&tasklist_lock); |
| 193 | do_each_thread(g, p) { | 233 | for_each_process_thread(g, p) { |
| 194 | /* No other threads should have PF_SUSPEND_TASK set */ | 234 | /* No other threads should have PF_SUSPEND_TASK set */ |
| 195 | WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); | 235 | WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); |
| 196 | __thaw_task(p); | 236 | __thaw_task(p); |
| 197 | } while_each_thread(g, p); | 237 | } |
| 198 | read_unlock(&tasklist_lock); | 238 | read_unlock(&tasklist_lock); |
| 199 | 239 | ||
| 200 | WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); | 240 | WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); |
| @@ -217,10 +257,10 @@ void thaw_kernel_threads(void) | |||
| 217 | thaw_workqueues(); | 257 | thaw_workqueues(); |
| 218 | 258 | ||
| 219 | read_lock(&tasklist_lock); | 259 | read_lock(&tasklist_lock); |
| 220 | do_each_thread(g, p) { | 260 | for_each_process_thread(g, p) { |
| 221 | if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) | 261 | if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) |
| 222 | __thaw_task(p); | 262 | __thaw_task(p); |
| 223 | } while_each_thread(g, p); | 263 | } |
| 224 | read_unlock(&tasklist_lock); | 264 | read_unlock(&tasklist_lock); |
| 225 | 265 | ||
| 226 | schedule(); | 266 | schedule(); |
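
Alongside the mechanical `do_each_thread()` to `for_each_process_thread()` conversion, process.c gains a race guard in `freeze_processes()`: it snapshots `oom_kills_count()` before freezing and, if the counter moved, re-verifies that everything is actually frozen. The sketch below shows that counter-snapshot recheck pattern in isolation; the event source and names are illustrative only, and the slow loop is elided.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int oom_kills;		/* bumped by an imaginary OOM killer */

static int oom_kills_count(void)
{
	return atomic_load(&oom_kills);
}

static bool everything_frozen(void)
{
	/* stand-in for walking the task list and checking frozen() */
	return true;
}

static int freeze_all(void)
{
	int saved = oom_kills_count();

	/* ... the slow "freeze every task" loop would run here ... */

	if (oom_kills_count() != saved && !everything_frozen()) {
		fprintf(stderr, "OOM kill raced with freezing, retry\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("freeze_all() = %d\n", freeze_all());
	return 0;
}
```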
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { | |||
| 105 | }; | 105 | }; |
| 106 | 106 | ||
| 107 | 107 | ||
| 108 | static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); | ||
| 109 | static struct pm_qos_constraints memory_bw_constraints = { | ||
| 110 | .list = PLIST_HEAD_INIT(memory_bw_constraints.list), | ||
| 111 | .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 112 | .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 113 | .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 114 | .type = PM_QOS_SUM, | ||
| 115 | .notifiers = &memory_bandwidth_notifier, | ||
| 116 | }; | ||
| 117 | static struct pm_qos_object memory_bandwidth_pm_qos = { | ||
| 118 | .constraints = &memory_bw_constraints, | ||
| 119 | .name = "memory_bandwidth", | ||
| 120 | }; | ||
| 121 | |||
| 122 | |||
| 108 | static struct pm_qos_object *pm_qos_array[] = { | 123 | static struct pm_qos_object *pm_qos_array[] = { |
| 109 | &null_pm_qos, | 124 | &null_pm_qos, |
| 110 | &cpu_dma_pm_qos, | 125 | &cpu_dma_pm_qos, |
| 111 | &network_lat_pm_qos, | 126 | &network_lat_pm_qos, |
| 112 | &network_throughput_pm_qos | 127 | &network_throughput_pm_qos, |
| 128 | &memory_bandwidth_pm_qos, | ||
| 113 | }; | 129 | }; |
| 114 | 130 | ||
| 115 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 131 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
| @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { | |||
| 130 | /* unlocked internal variant */ | 146 | /* unlocked internal variant */ |
| 131 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) | 147 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) |
| 132 | { | 148 | { |
| 149 | struct plist_node *node; | ||
| 150 | int total_value = 0; | ||
| 151 | |||
| 133 | if (plist_head_empty(&c->list)) | 152 | if (plist_head_empty(&c->list)) |
| 134 | return c->no_constraint_value; | 153 | return c->no_constraint_value; |
| 135 | 154 | ||
| @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) | |||
| 140 | case PM_QOS_MAX: | 159 | case PM_QOS_MAX: |
| 141 | return plist_last(&c->list)->prio; | 160 | return plist_last(&c->list)->prio; |
| 142 | 161 | ||
| 162 | case PM_QOS_SUM: | ||
| 163 | plist_for_each(node, &c->list) | ||
| 164 | total_value += node->prio; | ||
| 165 | |||
| 166 | return total_value; | ||
| 167 | |||
| 143 | default: | 168 | default: |
| 144 | /* runtime check for not using enum */ | 169 | /* runtime check for not using enum */ |
| 145 | BUG(); | 170 | BUG(); |
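
The new `memory_bandwidth` PM QoS class introduces `PM_QOS_SUM`, where `pm_qos_get_value()` adds up every request instead of taking the smallest or largest one. Below is a plain-array userspace sketch of the three aggregation modes; the enum, constants, and sample numbers are illustrative, not the kernel's.

```c
#include <stdio.h>

enum agg { AGG_MIN, AGG_MAX, AGG_SUM };

static int aggregate(const int *req, int n, enum agg type, int no_constraint)
{
	if (n == 0)
		return no_constraint;	/* nothing registered: default value */

	int result = (type == AGG_SUM) ? 0 : req[0];

	for (int i = 0; i < n; i++) {
		switch (type) {
		case AGG_MIN: if (req[i] < result) result = req[i]; break;
		case AGG_MAX: if (req[i] > result) result = req[i]; break;
		case AGG_SUM: result += req[i]; break;
		}
	}
	return result;
}

int main(void)
{
	int bandwidth_requests[] = { 100, 250, 50 };	/* e.g. MB/s per client */

	printf("min = %d\n", aggregate(bandwidth_requests, 3, AGG_MIN, 0));
	printf("max = %d\n", aggregate(bandwidth_requests, 3, AGG_MAX, 0));
	printf("sum = %d\n", aggregate(bandwidth_requests, 3, AGG_SUM, 0));
	return 0;
}
```

Summing fits a bandwidth-style constraint because concurrent consumers add load, whereas latency-style constraints want the tightest single bound.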
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4fc5c32422b3..791a61892bb5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
| 954 | } | 954 | } |
| 955 | } | 955 | } |
| 956 | 956 | ||
| 957 | static bool is_nosave_page(unsigned long pfn) | ||
| 958 | { | ||
| 959 | struct nosave_region *region; | ||
| 960 | |||
| 961 | list_for_each_entry(region, &nosave_regions, list) { | ||
| 962 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
| 963 | pr_err("PM: %#010llx in e820 nosave region: " | ||
| 964 | "[mem %#010llx-%#010llx]\n", | ||
| 965 | (unsigned long long) pfn << PAGE_SHIFT, | ||
| 966 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
| 967 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
| 968 | - 1); | ||
| 969 | return true; | ||
| 970 | } | ||
| 971 | } | ||
| 972 | |||
| 973 | return false; | ||
| 974 | } | ||
| 975 | |||
| 957 | /** | 976 | /** |
| 958 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 977 | * create_basic_memory_bitmaps - create bitmaps needed for marking page |
| 959 | * frames that should not be saved and free page frames. The pointers | 978 | * frames that should not be saved and free page frames. The pointers |
| @@ -1324,6 +1343,9 @@ void swsusp_free(void) | |||
| 1324 | { | 1343 | { |
| 1325 | unsigned long fb_pfn, fr_pfn; | 1344 | unsigned long fb_pfn, fr_pfn; |
| 1326 | 1345 | ||
| 1346 | if (!forbidden_pages_map || !free_pages_map) | ||
| 1347 | goto out; | ||
| 1348 | |||
| 1327 | memory_bm_position_reset(forbidden_pages_map); | 1349 | memory_bm_position_reset(forbidden_pages_map); |
| 1328 | memory_bm_position_reset(free_pages_map); | 1350 | memory_bm_position_reset(free_pages_map); |
| 1329 | 1351 | ||
| @@ -1351,6 +1373,7 @@ loop: | |||
| 1351 | goto loop; | 1373 | goto loop; |
| 1352 | } | 1374 | } |
| 1353 | 1375 | ||
| 1376 | out: | ||
| 1354 | nr_copy_pages = 0; | 1377 | nr_copy_pages = 0; |
| 1355 | nr_meta_pages = 0; | 1378 | nr_meta_pages = 0; |
| 1356 | restore_pblist = NULL; | 1379 | restore_pblist = NULL; |
| @@ -2015,7 +2038,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
| 2015 | do { | 2038 | do { |
| 2016 | pfn = memory_bm_next_pfn(bm); | 2039 | pfn = memory_bm_next_pfn(bm); |
| 2017 | if (likely(pfn != BM_END_OF_MAP)) { | 2040 | if (likely(pfn != BM_END_OF_MAP)) { |
| 2018 | if (likely(pfn_valid(pfn))) | 2041 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) |
| 2019 | swsusp_set_page_free(pfn_to_page(pfn)); | 2042 | swsusp_set_page_free(pfn_to_page(pfn)); |
| 2020 | else | 2043 | else |
| 2021 | return -EFAULT; | 2044 | return -EFAULT; |
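
`is_nosave_page()` above walks the registered nosave regions and rejects any page frame number that falls inside one, so `mark_unsafe_pages()` no longer trusts `pfn_valid()` alone. A userspace sketch of the same half-open range check follows; the region values and a PAGE_SHIFT of 12 are assumptions for the example.

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct nosave_region {
	unsigned long start_pfn;
	unsigned long end_pfn;	/* exclusive */
};

static const struct nosave_region regions[] = {
	{ 0x0009f, 0x000a0 },
	{ 0x7f000, 0x80000 },
};

static bool is_nosave_pfn(unsigned long pfn)
{
	for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
		if (pfn >= regions[i].start_pfn && pfn < regions[i].end_pfn) {
			printf("pfn %#lx (addr %#lx) is in a nosave region\n",
			       pfn, pfn << PAGE_SHIFT);
			return true;
		}
	}
	return false;
}

int main(void)
{
	printf("%d\n", is_nosave_pfn(0x0009f));
	printf("%d\n", is_nosave_pfn(0x12345));
	return 0;
}
```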
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6dadb25cb0d8..c347e3ce3a55 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -31,7 +31,7 @@ | |||
| 31 | 31 | ||
| 32 | #include "power.h" | 32 | #include "power.h" |
| 33 | 33 | ||
| 34 | static const char *pm_labels[] = { "mem", "standby", "freeze", }; | 34 | const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; |
| 35 | const char *pm_states[PM_SUSPEND_MAX]; | 35 | const char *pm_states[PM_SUSPEND_MAX]; |
| 36 | 36 | ||
| 37 | static const struct platform_suspend_ops *suspend_ops; | 37 | static const struct platform_suspend_ops *suspend_ops; |
| @@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state) | |||
| 146 | 146 | ||
| 147 | static int platform_suspend_prepare_late(suspend_state_t state) | 147 | static int platform_suspend_prepare_late(suspend_state_t state) |
| 148 | { | 148 | { |
| 149 | return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? | ||
| 150 | freeze_ops->prepare() : 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | static int platform_suspend_prepare_noirq(suspend_state_t state) | ||
| 154 | { | ||
| 149 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? | 155 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? |
| 150 | suspend_ops->prepare_late() : 0; | 156 | suspend_ops->prepare_late() : 0; |
| 151 | } | 157 | } |
| 152 | 158 | ||
| 153 | static void platform_suspend_wake(suspend_state_t state) | 159 | static void platform_resume_noirq(suspend_state_t state) |
| 154 | { | 160 | { |
| 155 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) | 161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) |
| 156 | suspend_ops->wake(); | 162 | suspend_ops->wake(); |
| 157 | } | 163 | } |
| 158 | 164 | ||
| 159 | static void platform_suspend_finish(suspend_state_t state) | 165 | static void platform_resume_early(suspend_state_t state) |
| 166 | { | ||
| 167 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) | ||
| 168 | freeze_ops->restore(); | ||
| 169 | } | ||
| 170 | |||
| 171 | static void platform_resume_finish(suspend_state_t state) | ||
| 160 | { | 172 | { |
| 161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) | 173 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) |
| 162 | suspend_ops->finish(); | 174 | suspend_ops->finish(); |
| @@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state) | |||
| 172 | return 0; | 184 | return 0; |
| 173 | } | 185 | } |
| 174 | 186 | ||
| 175 | static void platform_suspend_end(suspend_state_t state) | 187 | static void platform_resume_end(suspend_state_t state) |
| 176 | { | 188 | { |
| 177 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | 189 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
| 178 | freeze_ops->end(); | 190 | freeze_ops->end(); |
| @@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state) | |||
| 180 | suspend_ops->end(); | 192 | suspend_ops->end(); |
| 181 | } | 193 | } |
| 182 | 194 | ||
| 183 | static void platform_suspend_recover(suspend_state_t state) | 195 | static void platform_recover(suspend_state_t state) |
| 184 | { | 196 | { |
| 185 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) | 197 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) |
| 186 | suspend_ops->recover(); | 198 | suspend_ops->recover(); |
| @@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 265 | if (error) | 277 | if (error) |
| 266 | goto Platform_finish; | 278 | goto Platform_finish; |
| 267 | 279 | ||
| 268 | error = dpm_suspend_end(PMSG_SUSPEND); | 280 | error = dpm_suspend_late(PMSG_SUSPEND); |
| 269 | if (error) { | 281 | if (error) { |
| 270 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 282 | printk(KERN_ERR "PM: late suspend of devices failed\n"); |
| 271 | goto Platform_finish; | 283 | goto Platform_finish; |
| 272 | } | 284 | } |
| 273 | error = platform_suspend_prepare_late(state); | 285 | error = platform_suspend_prepare_late(state); |
| 274 | if (error) | 286 | if (error) |
| 287 | goto Devices_early_resume; | ||
| 288 | |||
| 289 | error = dpm_suspend_noirq(PMSG_SUSPEND); | ||
| 290 | if (error) { | ||
| 291 | printk(KERN_ERR "PM: noirq suspend of devices failed\n"); | ||
| 292 | goto Platform_early_resume; | ||
| 293 | } | ||
| 294 | error = platform_suspend_prepare_noirq(state); | ||
| 295 | if (error) | ||
| 275 | goto Platform_wake; | 296 | goto Platform_wake; |
| 276 | 297 | ||
| 277 | if (suspend_test(TEST_PLATFORM)) | 298 | if (suspend_test(TEST_PLATFORM)) |
| @@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 318 | enable_nonboot_cpus(); | 339 | enable_nonboot_cpus(); |
| 319 | 340 | ||
| 320 | Platform_wake: | 341 | Platform_wake: |
| 321 | platform_suspend_wake(state); | 342 | platform_resume_noirq(state); |
| 322 | dpm_resume_start(PMSG_RESUME); | 343 | dpm_resume_noirq(PMSG_RESUME); |
| 344 | |||
| 345 | Platform_early_resume: | ||
| 346 | platform_resume_early(state); | ||
| 347 | |||
| 348 | Devices_early_resume: | ||
| 349 | dpm_resume_early(PMSG_RESUME); | ||
| 323 | 350 | ||
| 324 | Platform_finish: | 351 | Platform_finish: |
| 325 | platform_suspend_finish(state); | 352 | platform_resume_finish(state); |
| 326 | return error; | 353 | return error; |
| 327 | } | 354 | } |
| 328 | 355 | ||
| @@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 361 | suspend_test_start(); | 388 | suspend_test_start(); |
| 362 | dpm_resume_end(PMSG_RESUME); | 389 | dpm_resume_end(PMSG_RESUME); |
| 363 | suspend_test_finish("resume devices"); | 390 | suspend_test_finish("resume devices"); |
| 391 | trace_suspend_resume(TPS("resume_console"), state, true); | ||
| 364 | resume_console(); | 392 | resume_console(); |
| 393 | trace_suspend_resume(TPS("resume_console"), state, false); | ||
| 365 | 394 | ||
| 366 | Close: | 395 | Close: |
| 367 | platform_suspend_end(state); | 396 | platform_resume_end(state); |
| 368 | return error; | 397 | return error; |
| 369 | 398 | ||
| 370 | Recover_platform: | 399 | Recover_platform: |
| 371 | platform_suspend_recover(state); | 400 | platform_recover(state); |
| 372 | goto Resume_devices; | 401 | goto Resume_devices; |
| 373 | } | 402 | } |
| 374 | 403 | ||
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 2f524928b6aa..084452e34a12 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #define TEST_SUSPEND_SECONDS 10 | 22 | #define TEST_SUSPEND_SECONDS 10 |
| 23 | 23 | ||
| 24 | static unsigned long suspend_test_start_time; | 24 | static unsigned long suspend_test_start_time; |
| 25 | static u32 test_repeat_count_max = 1; | ||
| 26 | static u32 test_repeat_count_current; | ||
| 25 | 27 | ||
| 26 | void suspend_test_start(void) | 28 | void suspend_test_start(void) |
| 27 | { | 29 | { |
| @@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 74 | int status; | 76 | int status; |
| 75 | 77 | ||
| 76 | /* this may fail if the RTC hasn't been initialized */ | 78 | /* this may fail if the RTC hasn't been initialized */ |
| 79 | repeat: | ||
| 77 | status = rtc_read_time(rtc, &alm.time); | 80 | status = rtc_read_time(rtc, &alm.time); |
| 78 | if (status < 0) { | 81 | if (status < 0) { |
| 79 | printk(err_readtime, dev_name(&rtc->dev), status); | 82 | printk(err_readtime, dev_name(&rtc->dev), status); |
| @@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 100 | if (state == PM_SUSPEND_STANDBY) { | 103 | if (state == PM_SUSPEND_STANDBY) { |
| 101 | printk(info_test, pm_states[state]); | 104 | printk(info_test, pm_states[state]); |
| 102 | status = pm_suspend(state); | 105 | status = pm_suspend(state); |
| 106 | if (status < 0) | ||
| 107 | state = PM_SUSPEND_FREEZE; | ||
| 103 | } | 108 | } |
| 109 | if (state == PM_SUSPEND_FREEZE) { | ||
| 110 | printk(info_test, pm_states[state]); | ||
| 111 | status = pm_suspend(state); | ||
| 112 | } | ||
| 113 | |||
| 104 | if (status < 0) | 114 | if (status < 0) |
| 105 | printk(err_suspend, status); | 115 | printk(err_suspend, status); |
| 106 | 116 | ||
| 117 | test_repeat_count_current++; | ||
| 118 | if (test_repeat_count_current < test_repeat_count_max) | ||
| 119 | goto repeat; | ||
| 120 | |||
| 107 | /* Some platforms can't detect that the alarm triggered the | 121 | /* Some platforms can't detect that the alarm triggered the |
| 108 | * wakeup, or (accordingly) disable it after it afterwards. | 122 | * wakeup, or (accordingly) disable it after it afterwards. |
| 109 | * It's supposed to give oneshot behavior; cope. | 123 | * It's supposed to give oneshot behavior; cope. |
| @@ -129,24 +143,36 @@ static int __init has_wakealarm(struct device *dev, const void *data) | |||
| 129 | * at startup time. They're normally disabled, for faster boot and because | 143 | * at startup time. They're normally disabled, for faster boot and because |
| 130 | * we can't know which states really work on this particular system. | 144 | * we can't know which states really work on this particular system. |
| 131 | */ | 145 | */ |
| 132 | static suspend_state_t test_state __initdata = PM_SUSPEND_ON; | 146 | static const char *test_state_label __initdata; |
| 133 | 147 | ||
| 134 | static char warn_bad_state[] __initdata = | 148 | static char warn_bad_state[] __initdata = |
| 135 | KERN_WARNING "PM: can't test '%s' suspend state\n"; | 149 | KERN_WARNING "PM: can't test '%s' suspend state\n"; |
| 136 | 150 | ||
| 137 | static int __init setup_test_suspend(char *value) | 151 | static int __init setup_test_suspend(char *value) |
| 138 | { | 152 | { |
| 139 | suspend_state_t i; | 153 | int i; |
| 154 | char *repeat; | ||
| 155 | char *suspend_type; | ||
| 140 | 156 | ||
| 141 | /* "=mem" ==> "mem" */ | 157 | /* example : "=mem[,N]" ==> "mem[,N]" */ |
| 142 | value++; | 158 | value++; |
| 143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | 159 | suspend_type = strsep(&value, ","); |
| 144 | if (!strcmp(pm_states[i], value)) { | 160 | if (!suspend_type) |
| 145 | test_state = i; | 161 | return 0; |
| 162 | |||
| 163 | repeat = strsep(&value, ","); | ||
| 164 | if (repeat) { | ||
| 165 | if (kstrtou32(repeat, 0, &test_repeat_count_max)) | ||
| 166 | return 0; | ||
| 167 | } | ||
| 168 | |||
| 169 | for (i = 0; pm_labels[i]; i++) | ||
| 170 | if (!strcmp(pm_labels[i], suspend_type)) { | ||
| 171 | test_state_label = pm_labels[i]; | ||
| 146 | return 0; | 172 | return 0; |
| 147 | } | 173 | } |
| 148 | 174 | ||
| 149 | printk(warn_bad_state, value); | 175 | printk(warn_bad_state, suspend_type); |
| 150 | return 0; | 176 | return 0; |
| 151 | } | 177 | } |
| 152 | __setup("test_suspend", setup_test_suspend); | 178 | __setup("test_suspend", setup_test_suspend); |
| @@ -158,13 +184,21 @@ static int __init test_suspend(void) | |||
| 158 | 184 | ||
| 159 | struct rtc_device *rtc = NULL; | 185 | struct rtc_device *rtc = NULL; |
| 160 | struct device *dev; | 186 | struct device *dev; |
| 187 | suspend_state_t test_state; | ||
| 161 | 188 | ||
| 162 | /* PM is initialized by now; is that state testable? */ | 189 | /* PM is initialized by now; is that state testable? */ |
| 163 | if (test_state == PM_SUSPEND_ON) | 190 | if (!test_state_label) |
| 164 | goto done; | 191 | return 0; |
| 165 | if (!pm_states[test_state]) { | 192 | |
| 166 | printk(warn_bad_state, pm_states[test_state]); | 193 | for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { |
| 167 | goto done; | 194 | const char *state_label = pm_states[test_state]; |
| 195 | |||
| 196 | if (state_label && !strcmp(test_state_label, state_label)) | ||
| 197 | break; | ||
| 198 | } | ||
| 199 | if (test_state == PM_SUSPEND_MAX) { | ||
| 200 | printk(warn_bad_state, test_state_label); | ||
| 201 | return 0; | ||
| 168 | } | 202 | } |
| 169 | 203 | ||
| 170 | /* RTCs have initialized by now too ... can we use one? */ | 204 | /* RTCs have initialized by now too ... can we use one? */ |
| @@ -173,13 +207,12 @@ static int __init test_suspend(void) | |||
| 173 | rtc = rtc_class_open(dev_name(dev)); | 207 | rtc = rtc_class_open(dev_name(dev)); |
| 174 | if (!rtc) { | 208 | if (!rtc) { |
| 175 | printk(warn_no_rtc); | 209 | printk(warn_no_rtc); |
| 176 | goto done; | 210 | return 0; |
| 177 | } | 211 | } |
| 178 | 212 | ||
| 179 | /* go for it */ | 213 | /* go for it */ |
| 180 | test_wakealarm(rtc, test_state); | 214 | test_wakealarm(rtc, test_state); |
| 181 | rtc_class_close(rtc); | 215 | rtc_class_close(rtc); |
| 182 | done: | ||
| 183 | return 0; | 216 | return 0; |
| 184 | } | 217 | } |
| 185 | late_initcall(test_suspend); | 218 | late_initcall(test_suspend); |
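
`setup_test_suspend()` now accepts `test_suspend=<state>[,<repeats>]`, splitting the value with `strsep()`, parsing the optional repeat count, and matching the state against `pm_labels[]`. The userspace sketch below mirrors that parsing; error handling is simplified and the label list is a stand-in for `pm_labels[]`.

```c
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *labels[] = { "mem", "standby", "freeze", NULL };

static int parse_test_suspend(char *value, const char **label, unsigned *repeats)
{
	char *type = strsep(&value, ",");
	char *count = strsep(&value, ",");

	if (!type)
		return -1;
	*repeats = count ? (unsigned)strtoul(count, NULL, 0) : 1;

	for (int i = 0; labels[i]; i++) {
		if (strcmp(labels[i], type) == 0) {
			*label = labels[i];
			return 0;
		}
	}
	return -1;	/* unknown suspend state */
}

int main(void)
{
	char arg[] = "mem,3";	/* strsep() needs a writable buffer */
	const char *label;
	unsigned repeats;

	if (parse_test_suspend(arg, &label, &repeats) == 0)
		printf("state=%s repeats=%u\n", label, repeats);
	return 0;
}
```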
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..ced2b84b1cb7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/poll.h> | 45 | #include <linux/poll.h> |
| 46 | #include <linux/irq_work.h> | 46 | #include <linux/irq_work.h> |
| 47 | #include <linux/utsname.h> | 47 | #include <linux/utsname.h> |
| 48 | #include <linux/ctype.h> | ||
| 48 | 49 | ||
| 49 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
| 50 | 51 | ||
| @@ -56,7 +57,7 @@ | |||
| 56 | 57 | ||
| 57 | int console_printk[4] = { | 58 | int console_printk[4] = { |
| 58 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ | 59 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ |
| 59 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 60 | MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ |
| 60 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ | 61 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ |
| 61 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ | 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
| 62 | }; | 63 | }; |
| @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) | |||
| 113 | * This is used for debugging the mess that is the VT code by | 114 | * This is used for debugging the mess that is the VT code by |
| 114 | * keeping track if we have the console semaphore held. It's | 115 | * keeping track if we have the console semaphore held. It's |
| 115 | * definitely not the perfect debug tool (we don't know if _WE_ | 116 | * definitely not the perfect debug tool (we don't know if _WE_ |
| 116 | * hold it are racing, but it helps tracking those weird code | 117 | * hold it and are racing, but it helps tracking those weird code |
| 117 | * path in the console code where we end up in places I want | 118 | * paths in the console code where we end up in places I want |
| 118 | * locked without the console sempahore held | 119 | * locked without the console sempahore held). |
| 119 | */ | 120 | */ |
| 120 | static int console_locked, console_suspended; | 121 | static int console_locked, console_suspended; |
| 121 | 122 | ||
| @@ -146,8 +147,8 @@ static int console_may_schedule; | |||
| 146 | * the overall length of the record. | 147 | * the overall length of the record. |
| 147 | * | 148 | * |
| 148 | * The heads to the first and last entry in the buffer, as well as the | 149 | * The heads to the first and last entry in the buffer, as well as the |
| 149 | * sequence numbers of these both entries are maintained when messages | 150 | * sequence numbers of these entries are maintained when messages are |
| 150 | * are stored.. | 151 | * stored. |
| 151 | * | 152 | * |
| 152 | * If the heads indicate available messages, the length in the header | 153 | * If the heads indicate available messages, the length in the header |
| 153 | * tells the start next message. A length == 0 for the next message | 154 | * tells the start next message. A length == 0 for the next message |
| @@ -257,7 +258,7 @@ static u64 clear_seq; | |||
| 257 | static u32 clear_idx; | 258 | static u32 clear_idx; |
| 258 | 259 | ||
| 259 | #define PREFIX_MAX 32 | 260 | #define PREFIX_MAX 32 |
| 260 | #define LOG_LINE_MAX 1024 - PREFIX_MAX | 261 | #define LOG_LINE_MAX (1024 - PREFIX_MAX) |
| 261 | 262 | ||
| 262 | /* record buffer */ | 263 | /* record buffer */ |
| 263 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 264 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
| @@ -270,6 +271,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | |||
| 270 | static char *log_buf = __log_buf; | 271 | static char *log_buf = __log_buf; |
| 271 | static u32 log_buf_len = __LOG_BUF_LEN; | 272 | static u32 log_buf_len = __LOG_BUF_LEN; |
| 272 | 273 | ||
| 274 | /* Return log buffer address */ | ||
| 275 | char *log_buf_addr_get(void) | ||
| 276 | { | ||
| 277 | return log_buf; | ||
| 278 | } | ||
| 279 | |||
| 280 | /* Return log buffer size */ | ||
| 281 | u32 log_buf_len_get(void) | ||
| 282 | { | ||
| 283 | return log_buf_len; | ||
| 284 | } | ||
| 285 | |||
| 273 | /* human readable text of the record */ | 286 | /* human readable text of the record */ |
| 274 | static char *log_text(const struct printk_log *msg) | 287 | static char *log_text(const struct printk_log *msg) |
| 275 | { | 288 | { |
| @@ -344,7 +357,7 @@ static int log_make_free_space(u32 msg_size) | |||
| 344 | while (log_first_seq < log_next_seq) { | 357 | while (log_first_seq < log_next_seq) { |
| 345 | if (logbuf_has_space(msg_size, false)) | 358 | if (logbuf_has_space(msg_size, false)) |
| 346 | return 0; | 359 | return 0; |
| 347 | /* drop old messages until we have enough continuous space */ | 360 | /* drop old messages until we have enough contiguous space */ |
| 348 | log_first_idx = log_next(log_first_idx); | 361 | log_first_idx = log_next(log_first_idx); |
| 349 | log_first_seq++; | 362 | log_first_seq++; |
| 350 | } | 363 | } |
| @@ -453,11 +466,7 @@ static int log_store(int facility, int level, | |||
| 453 | return msg->text_len; | 466 | return msg->text_len; |
| 454 | } | 467 | } |
| 455 | 468 | ||
| 456 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 469 | int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); |
| 457 | int dmesg_restrict = 1; | ||
| 458 | #else | ||
| 459 | int dmesg_restrict; | ||
| 460 | #endif | ||
| 461 | 470 | ||
| 462 | static int syslog_action_restricted(int type) | 471 | static int syslog_action_restricted(int type) |
| 463 | { | 472 | { |
| @@ -509,14 +518,13 @@ struct devkmsg_user { | |||
| 509 | char buf[8192]; | 518 | char buf[8192]; |
| 510 | }; | 519 | }; |
| 511 | 520 | ||
| 512 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | 521 | static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) |
| 513 | unsigned long count, loff_t pos) | ||
| 514 | { | 522 | { |
| 515 | char *buf, *line; | 523 | char *buf, *line; |
| 516 | int i; | 524 | int i; |
| 517 | int level = default_message_loglevel; | 525 | int level = default_message_loglevel; |
| 518 | int facility = 1; /* LOG_USER */ | 526 | int facility = 1; /* LOG_USER */ |
| 519 | size_t len = iov_length(iv, count); | 527 | size_t len = iocb->ki_nbytes; |
| 520 | ssize_t ret = len; | 528 | ssize_t ret = len; |
| 521 | 529 | ||
| 522 | if (len > LOG_LINE_MAX) | 530 | if (len > LOG_LINE_MAX) |
| @@ -525,13 +533,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | |||
| 525 | if (buf == NULL) | 533 | if (buf == NULL) |
| 526 | return -ENOMEM; | 534 | return -ENOMEM; |
| 527 | 535 | ||
| 528 | line = buf; | 536 | buf[len] = '\0'; |
| 529 | for (i = 0; i < count; i++) { | 537 | if (copy_from_iter(buf, len, from) != len) { |
| 530 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { | 538 | kfree(buf); |
| 531 | ret = -EFAULT; | 539 | return -EFAULT; |
| 532 | goto out; | ||
| 533 | } | ||
| 534 | line += iv[i].iov_len; | ||
| 535 | } | 540 | } |
| 536 | 541 | ||
| 537 | /* | 542 | /* |
| @@ -557,10 +562,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | |||
| 557 | line = endp; | 562 | line = endp; |
| 558 | } | 563 | } |
| 559 | } | 564 | } |
| 560 | line[len] = '\0'; | ||
| 561 | 565 | ||
| 562 | printk_emit(facility, level, NULL, 0, "%s", line); | 566 | printk_emit(facility, level, NULL, 0, "%s", line); |
| 563 | out: | ||
| 564 | kfree(buf); | 567 | kfree(buf); |
| 565 | return ret; | 568 | return ret; |
| 566 | } | 569 | } |
| @@ -792,7 +795,7 @@ static int devkmsg_release(struct inode *inode, struct file *file) | |||
| 792 | const struct file_operations kmsg_fops = { | 795 | const struct file_operations kmsg_fops = { |
| 793 | .open = devkmsg_open, | 796 | .open = devkmsg_open, |
| 794 | .read = devkmsg_read, | 797 | .read = devkmsg_read, |
| 795 | .aio_write = devkmsg_writev, | 798 | .write_iter = devkmsg_write, |
| 796 | .llseek = devkmsg_llseek, | 799 | .llseek = devkmsg_llseek, |
| 797 | .poll = devkmsg_poll, | 800 | .poll = devkmsg_poll, |
| 798 | .release = devkmsg_release, | 801 | .release = devkmsg_release, |
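
The devkmsg hunks replace the hand-rolled loop over iovec segments in the old `aio_write` path with a single `copy_from_iter()` of the whole request, NUL-terminating the buffer up front, and register the handler via `.write_iter`. As a rough userspace analog, the sketch below flattens a `struct iovec` array into one NUL-terminated buffer in the same spirit; the buffer contents and sizes are arbitrary and nothing here uses kernel APIs.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

static char *flatten_iov(const struct iovec *iov, int count, size_t *out_len)
{
	size_t len = 0;

	for (int i = 0; i < count; i++)
		len += iov[i].iov_len;

	char *buf = malloc(len + 1);
	if (!buf)
		return NULL;

	buf[len] = '\0';	/* terminate before use */
	for (size_t off = 0, i = 0; i < (size_t)count; off += iov[i].iov_len, i++)
		memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);

	if (out_len)
		*out_len = len;
	return buf;
}

int main(void)
{
	struct iovec iov[] = {
		{ .iov_base = "<6>", .iov_len = 3 },
		{ .iov_base = "hello from userspace", .iov_len = 20 },
	};
	size_t len;
	char *line = flatten_iov(iov, 2, &len);

	printf("%zu bytes: %s\n", len, line);
	free(line);
	return 0;
}
```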
| @@ -828,34 +831,80 @@ void log_buf_kexec_setup(void) | |||
| 828 | /* requested log_buf_len from kernel cmdline */ | 831 | /* requested log_buf_len from kernel cmdline */ |
| 829 | static unsigned long __initdata new_log_buf_len; | 832 | static unsigned long __initdata new_log_buf_len; |
| 830 | 833 | ||
| 831 | /* save requested log_buf_len since it's too early to process it */ | 834 | /* we practice scaling the ring buffer by powers of 2 */ |
| 832 | static int __init log_buf_len_setup(char *str) | 835 | static void __init log_buf_len_update(unsigned size) |
| 833 | { | 836 | { |
| 834 | unsigned size = memparse(str, &str); | ||
| 835 | |||
| 836 | if (size) | 837 | if (size) |
| 837 | size = roundup_pow_of_two(size); | 838 | size = roundup_pow_of_two(size); |
| 838 | if (size > log_buf_len) | 839 | if (size > log_buf_len) |
| 839 | new_log_buf_len = size; | 840 | new_log_buf_len = size; |
| 841 | } | ||
| 842 | |||
| 843 | /* save requested log_buf_len since it's too early to process it */ | ||
| 844 | static int __init log_buf_len_setup(char *str) | ||
| 845 | { | ||
| 846 | unsigned size = memparse(str, &str); | ||
| 847 | |||
| 848 | log_buf_len_update(size); | ||
| 840 | 849 | ||
| 841 | return 0; | 850 | return 0; |
| 842 | } | 851 | } |
| 843 | early_param("log_buf_len", log_buf_len_setup); | 852 | early_param("log_buf_len", log_buf_len_setup); |
| 844 | 853 | ||
| 854 | #ifdef CONFIG_SMP | ||
| 855 | #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) | ||
| 856 | |||
| 857 | static void __init log_buf_add_cpu(void) | ||
| 858 | { | ||
| 859 | unsigned int cpu_extra; | ||
| 860 | |||
| 861 | /* | ||
| 862 | * archs should set up cpu_possible_bits properly with | ||
| 863 | * set_cpu_possible() after setup_arch() but just in | ||
| 864 | * case lets ensure this is valid. | ||
| 865 | */ | ||
| 866 | if (num_possible_cpus() == 1) | ||
| 867 | return; | ||
| 868 | |||
| 869 | cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; | ||
| 870 | |||
| 871 | /* by default this will only continue through for large > 64 CPUs */ | ||
| 872 | if (cpu_extra <= __LOG_BUF_LEN / 2) | ||
| 873 | return; | ||
| 874 | |||
| 875 | pr_info("log_buf_len individual max cpu contribution: %d bytes\n", | ||
| 876 | __LOG_CPU_MAX_BUF_LEN); | ||
| 877 | pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", | ||
| 878 | cpu_extra); | ||
| 879 | pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); | ||
| 880 | |||
| 881 | log_buf_len_update(cpu_extra + __LOG_BUF_LEN); | ||
| 882 | } | ||
| 883 | #else /* !CONFIG_SMP */ | ||
| 884 | static inline void log_buf_add_cpu(void) {} | ||
| 885 | #endif /* CONFIG_SMP */ | ||
| 886 | |||
| 845 | void __init setup_log_buf(int early) | 887 | void __init setup_log_buf(int early) |
| 846 | { | 888 | { |
| 847 | unsigned long flags; | 889 | unsigned long flags; |
| 848 | char *new_log_buf; | 890 | char *new_log_buf; |
| 849 | int free; | 891 | int free; |
| 850 | 892 | ||
| 893 | if (log_buf != __log_buf) | ||
| 894 | return; | ||
| 895 | |||
| 896 | if (!early && !new_log_buf_len) | ||
| 897 | log_buf_add_cpu(); | ||
| 898 | |||
| 851 | if (!new_log_buf_len) | 899 | if (!new_log_buf_len) |
| 852 | return; | 900 | return; |
| 853 | 901 | ||
| 854 | if (early) { | 902 | if (early) { |
| 855 | new_log_buf = | 903 | new_log_buf = |
| 856 | memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); | 904 | memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); |
| 857 | } else { | 905 | } else { |
| 858 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); | 906 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, |
| 907 | LOG_ALIGN); | ||
| 859 | } | 908 | } |
| 860 | 909 | ||
| 861 | if (unlikely(!new_log_buf)) { | 910 | if (unlikely(!new_log_buf)) { |
| @@ -872,7 +921,7 @@ void __init setup_log_buf(int early) | |||
| 872 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | 921 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); |
| 873 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 922 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 874 | 923 | ||
| 875 | pr_info("log_buf_len: %d\n", log_buf_len); | 924 | pr_info("log_buf_len: %d bytes\n", log_buf_len); |
| 876 | pr_info("early log buf free: %d(%d%%)\n", | 925 | pr_info("early log buf free: %d(%d%%)\n", |
| 877 | free, (free * 100) / __LOG_BUF_LEN); | 926 | free, (free * 100) / __LOG_BUF_LEN); |
| 878 | } | 927 | } |
| @@ -881,7 +930,7 @@ static bool __read_mostly ignore_loglevel; | |||
| 881 | 930 | ||
| 882 | static int __init ignore_loglevel_setup(char *str) | 931 | static int __init ignore_loglevel_setup(char *str) |
| 883 | { | 932 | { |
| 884 | ignore_loglevel = 1; | 933 | ignore_loglevel = true; |
| 885 | pr_info("debug: ignoring loglevel setting.\n"); | 934 | pr_info("debug: ignoring loglevel setting.\n"); |
| 886 | 935 | ||
| 887 | return 0; | 936 | return 0; |
| @@ -947,11 +996,7 @@ static inline void boot_delay_msec(int level) | |||
| 947 | } | 996 | } |
| 948 | #endif | 997 | #endif |
| 949 | 998 | ||
| 950 | #if defined(CONFIG_PRINTK_TIME) | 999 | static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); |
| 951 | static bool printk_time = 1; | ||
| 952 | #else | ||
| 953 | static bool printk_time; | ||
| 954 | #endif | ||
| 955 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 1000 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
| 956 | 1001 | ||
| 957 | static size_t print_time(u64 ts, char *buf) | 1002 | static size_t print_time(u64 ts, char *buf) |
| @@ -1310,7 +1355,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1310 | * for pending data, not the size; return the count of | 1355 | * for pending data, not the size; return the count of |
| 1311 | * records, not the length. | 1356 | * records, not the length. |
| 1312 | */ | 1357 | */ |
| 1313 | error = log_next_idx - syslog_idx; | 1358 | error = log_next_seq - syslog_seq; |
| 1314 | } else { | 1359 | } else { |
| 1315 | u64 seq = syslog_seq; | 1360 | u64 seq = syslog_seq; |
| 1316 | u32 idx = syslog_idx; | 1361 | u32 idx = syslog_idx; |
| @@ -1416,10 +1461,9 @@ static int have_callable_console(void) | |||
| 1416 | /* | 1461 | /* |
| 1417 | * Can we actually use the console at this time on this cpu? | 1462 | * Can we actually use the console at this time on this cpu? |
| 1418 | * | 1463 | * |
| 1419 | * Console drivers may assume that per-cpu resources have | 1464 | * Console drivers may assume that per-cpu resources have been allocated. So |
| 1420 | * been allocated. So unless they're explicitly marked as | 1465 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't |
| 1421 | * being able to cope (CON_ANYTIME) don't call them until | 1466 | * call them until this CPU is officially up. |
| 1422 | * this CPU is officially up. | ||
| 1423 | */ | 1467 | */ |
| 1424 | static inline int can_use_console(unsigned int cpu) | 1468 | static inline int can_use_console(unsigned int cpu) |
| 1425 | { | 1469 | { |
| @@ -1432,8 +1476,10 @@ static inline int can_use_console(unsigned int cpu) | |||
| 1432 | * console_lock held, and 'console_locked' set) if it | 1476 | * console_lock held, and 'console_locked' set) if it |
| 1433 | * is successful, false otherwise. | 1477 | * is successful, false otherwise. |
| 1434 | */ | 1478 | */ |
| 1435 | static int console_trylock_for_printk(unsigned int cpu) | 1479 | static int console_trylock_for_printk(void) |
| 1436 | { | 1480 | { |
| 1481 | unsigned int cpu = smp_processor_id(); | ||
| 1482 | |||
| 1437 | if (!console_trylock()) | 1483 | if (!console_trylock()) |
| 1438 | return 0; | 1484 | return 0; |
| 1439 | /* | 1485 | /* |
| @@ -1476,7 +1522,7 @@ static struct cont { | |||
| 1476 | struct task_struct *owner; /* task of first print*/ | 1522 | struct task_struct *owner; /* task of first print*/ |
| 1477 | u64 ts_nsec; /* time of first print */ | 1523 | u64 ts_nsec; /* time of first print */ |
| 1478 | u8 level; /* log level of first message */ | 1524 | u8 level; /* log level of first message */ |
| 1479 | u8 facility; /* log level of first message */ | 1525 | u8 facility; /* log facility of first message */ |
| 1480 | enum log_flags flags; /* prefix, newline flags */ | 1526 | enum log_flags flags; /* prefix, newline flags */ |
| 1481 | bool flushed:1; /* buffer sealed and committed */ | 1527 | bool flushed:1; /* buffer sealed and committed */ |
| 1482 | } cont; | 1528 | } cont; |
| @@ -1608,7 +1654,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1608 | */ | 1654 | */ |
| 1609 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1655 | if (!oops_in_progress && !lockdep_recursing(current)) { |
| 1610 | recursion_bug = 1; | 1656 | recursion_bug = 1; |
| 1611 | goto out_restore_irqs; | 1657 | local_irq_restore(flags); |
| 1658 | return 0; | ||
| 1612 | } | 1659 | } |
| 1613 | zap_locks(); | 1660 | zap_locks(); |
| 1614 | } | 1661 | } |
| @@ -1617,27 +1664,22 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1617 | raw_spin_lock(&logbuf_lock); | 1664 | raw_spin_lock(&logbuf_lock); |
| 1618 | logbuf_cpu = this_cpu; | 1665 | logbuf_cpu = this_cpu; |
| 1619 | 1666 | ||
| 1620 | if (recursion_bug) { | 1667 | if (unlikely(recursion_bug)) { |
| 1621 | static const char recursion_msg[] = | 1668 | static const char recursion_msg[] = |
| 1622 | "BUG: recent printk recursion!"; | 1669 | "BUG: recent printk recursion!"; |
| 1623 | 1670 | ||
| 1624 | recursion_bug = 0; | 1671 | recursion_bug = 0; |
| 1625 | text_len = strlen(recursion_msg); | ||
| 1626 | /* emit KERN_CRIT message */ | 1672 | /* emit KERN_CRIT message */ |
| 1627 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | 1673 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
| 1628 | NULL, 0, recursion_msg, text_len); | 1674 | NULL, 0, recursion_msg, |
| 1675 | strlen(recursion_msg)); | ||
| 1629 | } | 1676 | } |
| 1630 | 1677 | ||
| 1631 | /* | 1678 | /* |
| 1632 | * The printf needs to come first; we need the syslog | 1679 | * The printf needs to come first; we need the syslog |
| 1633 | * prefix which might be passed-in as a parameter. | 1680 | * prefix which might be passed-in as a parameter. |
| 1634 | */ | 1681 | */ |
| 1635 | if (in_sched) | 1682 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); |
| 1636 | text_len = scnprintf(text, sizeof(textbuf), | ||
| 1637 | KERN_WARNING "[sched_delayed] "); | ||
| 1638 | |||
| 1639 | text_len += vscnprintf(text + text_len, | ||
| 1640 | sizeof(textbuf) - text_len, fmt, args); | ||
| 1641 | 1683 | ||
| 1642 | /* mark and strip a trailing newline */ | 1684 | /* mark and strip a trailing newline */ |
| 1643 | if (text_len && text[text_len-1] == '\n') { | 1685 | if (text_len && text[text_len-1] == '\n') { |
| @@ -1716,21 +1758,30 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1716 | 1758 | ||
| 1717 | logbuf_cpu = UINT_MAX; | 1759 | logbuf_cpu = UINT_MAX; |
| 1718 | raw_spin_unlock(&logbuf_lock); | 1760 | raw_spin_unlock(&logbuf_lock); |
| 1761 | lockdep_on(); | ||
| 1762 | local_irq_restore(flags); | ||
| 1719 | 1763 | ||
| 1720 | /* If called from the scheduler, we can not call up(). */ | 1764 | /* If called from the scheduler, we can not call up(). */ |
| 1721 | if (!in_sched) { | 1765 | if (!in_sched) { |
| 1766 | lockdep_off(); | ||
| 1767 | /* | ||
| 1768 | * Disable preemption to avoid being preempted while holding | ||
| 1769 | * console_sem which would prevent anyone from printing to | ||
| 1770 | * console | ||
| 1771 | */ | ||
| 1772 | preempt_disable(); | ||
| 1773 | |||
| 1722 | /* | 1774 | /* |
| 1723 | * Try to acquire and then immediately release the console | 1775 | * Try to acquire and then immediately release the console |
| 1724 | * semaphore. The release will print out buffers and wake up | 1776 | * semaphore. The release will print out buffers and wake up |
| 1725 | * /dev/kmsg and syslog() users. | 1777 | * /dev/kmsg and syslog() users. |
| 1726 | */ | 1778 | */ |
| 1727 | if (console_trylock_for_printk(this_cpu)) | 1779 | if (console_trylock_for_printk()) |
| 1728 | console_unlock(); | 1780 | console_unlock(); |
| 1781 | preempt_enable(); | ||
| 1782 | lockdep_on(); | ||
| 1729 | } | 1783 | } |
| 1730 | 1784 | ||
| 1731 | lockdep_on(); | ||
| 1732 | out_restore_irqs: | ||
| 1733 | local_irq_restore(flags); | ||
| 1734 | return printed_len; | 1785 | return printed_len; |
| 1735 | } | 1786 | } |
| 1736 | EXPORT_SYMBOL(vprintk_emit); | 1787 | EXPORT_SYMBOL(vprintk_emit); |
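
The hunk above moves the irq restore out of the console path and wraps console_trylock_for_printk() in preempt_disable()/preempt_enable(), so whichever context wins the trylock drains the buffer without being preempted while holding console_sem. A rough userspace sketch of that try-then-flush handoff is below; pthread_mutex_trylock() merely stands in for console_trylock_for_printk() and flush_and_unlock() for console_unlock(), and preemption control has no userspace analogue here.

    #include <pthread.h>
    #include <stdio.h>

    /*
     * Userspace sketch, not kernel code: the winner of the trylock is
     * responsible for draining pending records before unlocking.
     */
    static pthread_mutex_t console_sem = PTHREAD_MUTEX_INITIALIZER;
    static int pending_records = 3;

    static void flush_and_unlock(void)
    {
            while (pending_records > 0)
                    printf("flushing record %d\n", pending_records--);
            pthread_mutex_unlock(&console_sem);
    }

    int main(void)
    {
            if (pthread_mutex_trylock(&console_sem) == 0)
                    flush_and_unlock();     /* we won; we drain the buffer */
            else
                    printf("console busy; current owner flushes for us\n");
            return 0;
    }

In the kernel the loser simply returns: its message has already been stored under logbuf_lock, so the current console owner will print it.
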
| @@ -1802,7 +1853,7 @@ EXPORT_SYMBOL(printk); | |||
| 1802 | 1853 | ||
| 1803 | #define LOG_LINE_MAX 0 | 1854 | #define LOG_LINE_MAX 0 |
| 1804 | #define PREFIX_MAX 0 | 1855 | #define PREFIX_MAX 0 |
| 1805 | #define LOG_LINE_MAX 0 | 1856 | |
| 1806 | static u64 syslog_seq; | 1857 | static u64 syslog_seq; |
| 1807 | static u32 syslog_idx; | 1858 | static u32 syslog_idx; |
| 1808 | static u64 console_seq; | 1859 | static u64 console_seq; |
| @@ -1881,11 +1932,12 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
| 1881 | return 0; | 1932 | return 0; |
| 1882 | } | 1933 | } |
| 1883 | /* | 1934 | /* |
| 1884 | * Set up a list of consoles. Called from init/main.c | 1935 | * Set up a console. Called via do_early_param() in init/main.c |
| 1936 | * for each "console=" parameter in the boot command line. | ||
| 1885 | */ | 1937 | */ |
| 1886 | static int __init console_setup(char *str) | 1938 | static int __init console_setup(char *str) |
| 1887 | { | 1939 | { |
| 1888 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ | 1940 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ |
| 1889 | char *s, *options, *brl_options = NULL; | 1941 | char *s, *options, *brl_options = NULL; |
| 1890 | int idx; | 1942 | int idx; |
| 1891 | 1943 | ||
| @@ -1902,7 +1954,8 @@ static int __init console_setup(char *str) | |||
| 1902 | strncpy(buf, str, sizeof(buf) - 1); | 1954 | strncpy(buf, str, sizeof(buf) - 1); |
| 1903 | } | 1955 | } |
| 1904 | buf[sizeof(buf) - 1] = 0; | 1956 | buf[sizeof(buf) - 1] = 0; |
| 1905 | if ((options = strchr(str, ',')) != NULL) | 1957 | options = strchr(str, ','); |
| 1958 | if (options) | ||
| 1906 | *(options++) = 0; | 1959 | *(options++) = 0; |
| 1907 | #ifdef __sparc__ | 1960 | #ifdef __sparc__ |
| 1908 | if (!strcmp(str, "ttya")) | 1961 | if (!strcmp(str, "ttya")) |
| @@ -1911,7 +1964,7 @@ static int __init console_setup(char *str) | |||
| 1911 | strcpy(buf, "ttyS1"); | 1964 | strcpy(buf, "ttyS1"); |
| 1912 | #endif | 1965 | #endif |
| 1913 | for (s = buf; *s; s++) | 1966 | for (s = buf; *s; s++) |
| 1914 | if ((*s >= '0' && *s <= '9') || *s == ',') | 1967 | if (isdigit(*s) || *s == ',') |
| 1915 | break; | 1968 | break; |
| 1916 | idx = simple_strtoul(s, NULL, 10); | 1969 | idx = simple_strtoul(s, NULL, 10); |
| 1917 | *s = 0; | 1970 | *s = 0; |
| @@ -1950,7 +2003,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
| 1950 | i++, c++) | 2003 | i++, c++) |
| 1951 | if (strcmp(c->name, name) == 0 && c->index == idx) { | 2004 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
| 1952 | strlcpy(c->name, name_new, sizeof(c->name)); | 2005 | strlcpy(c->name, name_new, sizeof(c->name)); |
| 1953 | c->name[sizeof(c->name) - 1] = 0; | ||
| 1954 | c->options = options; | 2006 | c->options = options; |
| 1955 | c->index = idx_new; | 2007 | c->index = idx_new; |
| 1956 | return i; | 2008 | return i; |
| @@ -1959,12 +2011,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
| 1959 | return -1; | 2011 | return -1; |
| 1960 | } | 2012 | } |
| 1961 | 2013 | ||
| 1962 | bool console_suspend_enabled = 1; | 2014 | bool console_suspend_enabled = true; |
| 1963 | EXPORT_SYMBOL(console_suspend_enabled); | 2015 | EXPORT_SYMBOL(console_suspend_enabled); |
| 1964 | 2016 | ||
| 1965 | static int __init console_suspend_disable(char *str) | 2017 | static int __init console_suspend_disable(char *str) |
| 1966 | { | 2018 | { |
| 1967 | console_suspend_enabled = 0; | 2019 | console_suspend_enabled = false; |
| 1968 | return 1; | 2020 | return 1; |
| 1969 | } | 2021 | } |
| 1970 | __setup("no_console_suspend", console_suspend_disable); | 2022 | __setup("no_console_suspend", console_suspend_disable); |
| @@ -2045,8 +2097,8 @@ EXPORT_SYMBOL(console_lock); | |||
| 2045 | /** | 2097 | /** |
| 2046 | * console_trylock - try to lock the console system for exclusive use. | 2098 | * console_trylock - try to lock the console system for exclusive use. |
| 2047 | * | 2099 | * |
| 2048 | * Tried to acquire a lock which guarantees that the caller has | 2100 | * Try to acquire a lock which guarantees that the caller has exclusive |
| 2049 | * exclusive access to the console system and the console_drivers list. | 2101 | * access to the console system and the console_drivers list. |
| 2050 | * | 2102 | * |
| 2051 | * returns 1 on success, and 0 on failure to acquire the lock. | 2103 | * returns 1 on success, and 0 on failure to acquire the lock. |
| 2052 | */ | 2104 | */ |
| @@ -2570,7 +2622,7 @@ void wake_up_klogd(void) | |||
| 2570 | preempt_disable(); | 2622 | preempt_disable(); |
| 2571 | if (waitqueue_active(&log_wait)) { | 2623 | if (waitqueue_active(&log_wait)) { |
| 2572 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 2624 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
| 2573 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2625 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2574 | } | 2626 | } |
| 2575 | preempt_enable(); | 2627 | preempt_enable(); |
| 2576 | } | 2628 | } |
| @@ -2586,7 +2638,7 @@ int printk_deferred(const char *fmt, ...) | |||
| 2586 | va_end(args); | 2638 | va_end(args); |
| 2587 | 2639 | ||
| 2588 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2640 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
| 2589 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2641 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2590 | preempt_enable(); | 2642 | preempt_enable(); |
| 2591 | 2643 | ||
| 2592 | return r; | 2644 | return r; |
| @@ -2618,14 +2670,13 @@ EXPORT_SYMBOL(__printk_ratelimit); | |||
| 2618 | bool printk_timed_ratelimit(unsigned long *caller_jiffies, | 2670 | bool printk_timed_ratelimit(unsigned long *caller_jiffies, |
| 2619 | unsigned int interval_msecs) | 2671 | unsigned int interval_msecs) |
| 2620 | { | 2672 | { |
| 2621 | if (*caller_jiffies == 0 | 2673 | unsigned long elapsed = jiffies - *caller_jiffies; |
| 2622 | || !time_in_range(jiffies, *caller_jiffies, | 2674 | |
| 2623 | *caller_jiffies | 2675 | if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) |
| 2624 | + msecs_to_jiffies(interval_msecs))) { | 2676 | return false; |
| 2625 | *caller_jiffies = jiffies; | 2677 | |
| 2626 | return true; | 2678 | *caller_jiffies = jiffies; |
| 2627 | } | 2679 | return true; |
| 2628 | return false; | ||
| 2629 | } | 2680 | } |
| 2630 | EXPORT_SYMBOL(printk_timed_ratelimit); | 2681 | EXPORT_SYMBOL(printk_timed_ratelimit); |
| 2631 | 2682 | ||
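
The printk_timed_ratelimit() rewrite above replaces the time_in_range() test with a plain elapsed-time comparison: allow the print and refresh the caller's timestamp whenever the stored stamp is zero or the interval has expired. A minimal userspace model of the same logic follows; monotonic_ms() is a stand-in for jiffies and msecs_to_jiffies(), not a kernel API.

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Stand-in for jiffies: a monotonic millisecond counter. */
    static unsigned long monotonic_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (unsigned long)ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
    }

    /* Same shape as the reworked helper: at most one message per interval. */
    static bool timed_ratelimit(unsigned long *caller_ms, unsigned long interval_ms)
    {
            unsigned long elapsed = monotonic_ms() - *caller_ms;

            if (*caller_ms && elapsed <= interval_ms)
                    return false;

            *caller_ms = monotonic_ms();
            return true;
    }

    int main(void)
    {
            static unsigned long last;
            int i;

            for (i = 0; i < 3; i++)
                    printf("attempt %d: %s\n", i,
                           timed_ratelimit(&last, 1000) ? "printed" : "suppressed");
            return 0;
    }

The first call always prints because the stored timestamp starts at zero, matching the behaviour of the rewritten kernel helper.
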
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..240fa9094f83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -49,11 +49,19 @@ | |||
| 49 | #include <linux/trace_clock.h> | 49 | #include <linux/trace_clock.h> |
| 50 | #include <asm/byteorder.h> | 50 | #include <asm/byteorder.h> |
| 51 | #include <linux/torture.h> | 51 | #include <linux/torture.h> |
| 52 | #include <linux/vmalloc.h> | ||
| 52 | 53 | ||
| 53 | MODULE_LICENSE("GPL"); | 54 | MODULE_LICENSE("GPL"); |
| 54 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); | 55 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); |
| 55 | 56 | ||
| 56 | 57 | ||
| 58 | torture_param(int, cbflood_inter_holdoff, HZ, | ||
| 59 | "Holdoff between floods (jiffies)"); | ||
| 60 | torture_param(int, cbflood_intra_holdoff, 1, | ||
| 61 | "Holdoff between bursts (jiffies)"); | ||
| 62 | torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); | ||
| 63 | torture_param(int, cbflood_n_per_burst, 20000, | ||
| 64 | "# callbacks per burst in flood"); | ||
| 57 | torture_param(int, fqs_duration, 0, | 65 | torture_param(int, fqs_duration, 0, |
| 58 | "Duration of fqs bursts (us), 0 to disable"); | 66 | "Duration of fqs bursts (us), 0 to disable"); |
| 59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); | 67 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); |
| @@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444); | |||
| 96 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); | 104 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
| 97 | 105 | ||
| 98 | static int nrealreaders; | 106 | static int nrealreaders; |
| 107 | static int ncbflooders; | ||
| 99 | static struct task_struct *writer_task; | 108 | static struct task_struct *writer_task; |
| 100 | static struct task_struct **fakewriter_tasks; | 109 | static struct task_struct **fakewriter_tasks; |
| 101 | static struct task_struct **reader_tasks; | 110 | static struct task_struct **reader_tasks; |
| 102 | static struct task_struct *stats_task; | 111 | static struct task_struct *stats_task; |
| 112 | static struct task_struct **cbflood_task; | ||
| 103 | static struct task_struct *fqs_task; | 113 | static struct task_struct *fqs_task; |
| 104 | static struct task_struct *boost_tasks[NR_CPUS]; | 114 | static struct task_struct *boost_tasks[NR_CPUS]; |
| 105 | static struct task_struct *stall_task; | 115 | static struct task_struct *stall_task; |
| @@ -138,6 +148,7 @@ static long n_rcu_torture_boosts; | |||
| 138 | static long n_rcu_torture_timers; | 148 | static long n_rcu_torture_timers; |
| 139 | static long n_barrier_attempts; | 149 | static long n_barrier_attempts; |
| 140 | static long n_barrier_successes; | 150 | static long n_barrier_successes; |
| 151 | static atomic_long_t n_cbfloods; | ||
| 141 | static struct list_head rcu_torture_removed; | 152 | static struct list_head rcu_torture_removed; |
| 142 | 153 | ||
| 143 | static int rcu_torture_writer_state; | 154 | static int rcu_torture_writer_state; |
| @@ -157,9 +168,9 @@ static int rcu_torture_writer_state; | |||
| 157 | #else | 168 | #else |
| 158 | #define RCUTORTURE_RUNNABLE_INIT 0 | 169 | #define RCUTORTURE_RUNNABLE_INIT 0 |
| 159 | #endif | 170 | #endif |
| 160 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 171 | static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; |
| 161 | module_param(rcutorture_runnable, int, 0444); | 172 | module_param(torture_runnable, int, 0444); |
| 162 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | 173 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); |
| 163 | 174 | ||
| 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 175 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
| 165 | #define rcu_can_boost() 1 | 176 | #define rcu_can_boost() 1 |
| @@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void) | |||
| 182 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 193 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
| 183 | 194 | ||
| 184 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 195 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
| 185 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 196 | static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
| 186 | /* and boost task create/destroy. */ | 197 | /* and boost task create/destroy. */ |
| 187 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | 198 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ |
| 188 | static bool barrier_phase; /* Test phase. */ | 199 | static bool barrier_phase; /* Test phase. */ |
| @@ -242,7 +253,7 @@ struct rcu_torture_ops { | |||
| 242 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 253 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 243 | void (*cb_barrier)(void); | 254 | void (*cb_barrier)(void); |
| 244 | void (*fqs)(void); | 255 | void (*fqs)(void); |
| 245 | void (*stats)(char *page); | 256 | void (*stats)(void); |
| 246 | int irq_capable; | 257 | int irq_capable; |
| 247 | int can_boost; | 258 | int can_boost; |
| 248 | const char *name; | 259 | const char *name; |
| @@ -525,21 +536,21 @@ static void srcu_torture_barrier(void) | |||
| 525 | srcu_barrier(&srcu_ctl); | 536 | srcu_barrier(&srcu_ctl); |
| 526 | } | 537 | } |
| 527 | 538 | ||
| 528 | static void srcu_torture_stats(char *page) | 539 | static void srcu_torture_stats(void) |
| 529 | { | 540 | { |
| 530 | int cpu; | 541 | int cpu; |
| 531 | int idx = srcu_ctl.completed & 0x1; | 542 | int idx = srcu_ctl.completed & 0x1; |
| 532 | 543 | ||
| 533 | page += sprintf(page, "%s%s per-CPU(idx=%d):", | 544 | pr_alert("%s%s per-CPU(idx=%d):", |
| 534 | torture_type, TORTURE_FLAG, idx); | 545 | torture_type, TORTURE_FLAG, idx); |
| 535 | for_each_possible_cpu(cpu) { | 546 | for_each_possible_cpu(cpu) { |
| 536 | long c0, c1; | 547 | long c0, c1; |
| 537 | 548 | ||
| 538 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; | 549 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; |
| 539 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; | 550 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; |
| 540 | page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); | 551 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); |
| 541 | } | 552 | } |
| 542 | sprintf(page, "\n"); | 553 | pr_cont("\n"); |
| 543 | } | 554 | } |
| 544 | 555 | ||
| 545 | static void srcu_torture_synchronize_expedited(void) | 556 | static void srcu_torture_synchronize_expedited(void) |
| @@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = { | |||
| 601 | .name = "sched" | 612 | .name = "sched" |
| 602 | }; | 613 | }; |
| 603 | 614 | ||
| 615 | #ifdef CONFIG_TASKS_RCU | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Definitions for RCU-tasks torture testing. | ||
| 619 | */ | ||
| 620 | |||
| 621 | static int tasks_torture_read_lock(void) | ||
| 622 | { | ||
| 623 | return 0; | ||
| 624 | } | ||
| 625 | |||
| 626 | static void tasks_torture_read_unlock(int idx) | ||
| 627 | { | ||
| 628 | } | ||
| 629 | |||
| 630 | static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) | ||
| 631 | { | ||
| 632 | call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); | ||
| 633 | } | ||
| 634 | |||
| 635 | static struct rcu_torture_ops tasks_ops = { | ||
| 636 | .ttype = RCU_TASKS_FLAVOR, | ||
| 637 | .init = rcu_sync_torture_init, | ||
| 638 | .readlock = tasks_torture_read_lock, | ||
| 639 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 640 | .readunlock = tasks_torture_read_unlock, | ||
| 641 | .completed = rcu_no_completed, | ||
| 642 | .deferred_free = rcu_tasks_torture_deferred_free, | ||
| 643 | .sync = synchronize_rcu_tasks, | ||
| 644 | .exp_sync = synchronize_rcu_tasks, | ||
| 645 | .call = call_rcu_tasks, | ||
| 646 | .cb_barrier = rcu_barrier_tasks, | ||
| 647 | .fqs = NULL, | ||
| 648 | .stats = NULL, | ||
| 649 | .irq_capable = 1, | ||
| 650 | .name = "tasks" | ||
| 651 | }; | ||
| 652 | |||
| 653 | #define RCUTORTURE_TASKS_OPS &tasks_ops, | ||
| 654 | |||
| 655 | #else /* #ifdef CONFIG_TASKS_RCU */ | ||
| 656 | |||
| 657 | #define RCUTORTURE_TASKS_OPS | ||
| 658 | |||
| 659 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ | ||
| 660 | |||
| 604 | /* | 661 | /* |
| 605 | * RCU torture priority-boost testing. Runs one real-time thread per | 662 | * RCU torture priority-boost testing. Runs one real-time thread per |
| 606 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 663 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
| @@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg) | |||
| 667 | } | 724 | } |
| 668 | call_rcu_time = jiffies; | 725 | call_rcu_time = jiffies; |
| 669 | } | 726 | } |
| 670 | cond_resched(); | 727 | cond_resched_rcu_qs(); |
| 671 | stutter_wait("rcu_torture_boost"); | 728 | stutter_wait("rcu_torture_boost"); |
| 672 | if (torture_must_stop()) | 729 | if (torture_must_stop()) |
| 673 | goto checkwait; | 730 | goto checkwait; |
| @@ -707,6 +764,58 @@ checkwait: stutter_wait("rcu_torture_boost"); | |||
| 707 | return 0; | 764 | return 0; |
| 708 | } | 765 | } |
| 709 | 766 | ||
| 767 | static void rcu_torture_cbflood_cb(struct rcu_head *rhp) | ||
| 768 | { | ||
| 769 | } | ||
| 770 | |||
| 771 | /* | ||
| 772 | * RCU torture callback-flood kthread. Repeatedly induces bursts of calls | ||
| 773 | * to call_rcu() or analogous, increasing the probability of occurrence | ||
| 774 | * of callback-overflow corner cases. | ||
| 775 | */ | ||
| 776 | static int | ||
| 777 | rcu_torture_cbflood(void *arg) | ||
| 778 | { | ||
| 779 | int err = 1; | ||
| 780 | int i; | ||
| 781 | int j; | ||
| 782 | struct rcu_head *rhp; | ||
| 783 | |||
| 784 | if (cbflood_n_per_burst > 0 && | ||
| 785 | cbflood_inter_holdoff > 0 && | ||
| 786 | cbflood_intra_holdoff > 0 && | ||
| 787 | cur_ops->call && | ||
| 788 | cur_ops->cb_barrier) { | ||
| 789 | rhp = vmalloc(sizeof(*rhp) * | ||
| 790 | cbflood_n_burst * cbflood_n_per_burst); | ||
| 791 | err = !rhp; | ||
| 792 | } | ||
| 793 | if (err) { | ||
| 794 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); | ||
| 795 | while (!torture_must_stop()) | ||
| 796 | schedule_timeout_interruptible(HZ); | ||
| 797 | return 0; | ||
| 798 | } | ||
| 799 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); | ||
| 800 | do { | ||
| 801 | schedule_timeout_interruptible(cbflood_inter_holdoff); | ||
| 802 | atomic_long_inc(&n_cbfloods); | ||
| 803 | WARN_ON(signal_pending(current)); | ||
| 804 | for (i = 0; i < cbflood_n_burst; i++) { | ||
| 805 | for (j = 0; j < cbflood_n_per_burst; j++) { | ||
| 806 | cur_ops->call(&rhp[i * cbflood_n_per_burst + j], | ||
| 807 | rcu_torture_cbflood_cb); | ||
| 808 | } | ||
| 809 | schedule_timeout_interruptible(cbflood_intra_holdoff); | ||
| 810 | WARN_ON(signal_pending(current)); | ||
| 811 | } | ||
| 812 | cur_ops->cb_barrier(); | ||
| 813 | stutter_wait("rcu_torture_cbflood"); | ||
| 814 | } while (!torture_must_stop()); | ||
| 815 | torture_kthread_stopping("rcu_torture_cbflood"); | ||
| 816 | return 0; | ||
| 817 | } | ||
| 818 | |||
| 710 | /* | 819 | /* |
| 711 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 820 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
| 712 | * bursts of calls to force_quiescent_state(), increasing the probability | 821 | * bursts of calls to force_quiescent_state(), increasing the probability |
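
The new rcu_torture_cbflood() kthread above posts cbflood_n_burst bursts of cbflood_n_per_burst callbacks, sleeps for the intra-burst holdoff between bursts, and then waits for a callback barrier before stuttering. A stripped-down userspace model of just the burst loop is sketched below; enqueue_cb() stands in for cur_ops->call(), usleep() for schedule_timeout_interruptible(), the barrier and stutter steps are omitted, and the parameter values are arbitrary.

    #include <stdio.h>
    #include <unistd.h>

    static int cbflood_n_burst = 3;
    static int cbflood_n_per_burst = 5;
    static int cbflood_intra_holdoff_ms = 10;
    static long n_enqueued;

    /* Stand-in for cur_ops->call(&rhp[slot], rcu_torture_cbflood_cb). */
    static void enqueue_cb(int slot)
    {
            (void)slot;
            n_enqueued++;
    }

    int main(void)
    {
            int i, j;

            for (i = 0; i < cbflood_n_burst; i++) {
                    for (j = 0; j < cbflood_n_per_burst; j++)
                            enqueue_cb(i * cbflood_n_per_burst + j);
                    usleep(cbflood_intra_holdoff_ms * 1000);  /* intra-burst holdoff */
            }
            printf("flood complete: %ld callbacks posted in %d bursts\n",
                   n_enqueued, cbflood_n_burst);
            return 0;
    }
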
| @@ -1019,7 +1128,7 @@ rcu_torture_reader(void *arg) | |||
| 1019 | __this_cpu_inc(rcu_torture_batch[completed]); | 1128 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 1020 | preempt_enable(); | 1129 | preempt_enable(); |
| 1021 | cur_ops->readunlock(idx); | 1130 | cur_ops->readunlock(idx); |
| 1022 | cond_resched(); | 1131 | cond_resched_rcu_qs(); |
| 1023 | stutter_wait("rcu_torture_reader"); | 1132 | stutter_wait("rcu_torture_reader"); |
| 1024 | } while (!torture_must_stop()); | 1133 | } while (!torture_must_stop()); |
| 1025 | if (irqreader && cur_ops->irq_capable) { | 1134 | if (irqreader && cur_ops->irq_capable) { |
| @@ -1031,10 +1140,15 @@ rcu_torture_reader(void *arg) | |||
| 1031 | } | 1140 | } |
| 1032 | 1141 | ||
| 1033 | /* | 1142 | /* |
| 1034 | * Create an RCU-torture statistics message in the specified buffer. | 1143 | * Print torture statistics. Caller must ensure that there is only |
| 1144 | * one call to this function at a given time!!! This is normally | ||
| 1145 | * accomplished by relying on the module system to only have one copy | ||
| 1146 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 1147 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 1148 | * thread is not running). | ||
| 1035 | */ | 1149 | */ |
| 1036 | static void | 1150 | static void |
| 1037 | rcu_torture_printk(char *page) | 1151 | rcu_torture_stats_print(void) |
| 1038 | { | 1152 | { |
| 1039 | int cpu; | 1153 | int cpu; |
| 1040 | int i; | 1154 | int i; |
| @@ -1052,55 +1166,61 @@ rcu_torture_printk(char *page) | |||
| 1052 | if (pipesummary[i] != 0) | 1166 | if (pipesummary[i] != 0) |
| 1053 | break; | 1167 | break; |
| 1054 | } | 1168 | } |
| 1055 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); | 1169 | |
| 1056 | page += sprintf(page, | 1170 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); |
| 1057 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", | 1171 | pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
| 1058 | rcu_torture_current, | 1172 | rcu_torture_current, |
| 1059 | rcu_torture_current_version, | 1173 | rcu_torture_current_version, |
| 1060 | list_empty(&rcu_torture_freelist), | 1174 | list_empty(&rcu_torture_freelist), |
| 1061 | atomic_read(&n_rcu_torture_alloc), | 1175 | atomic_read(&n_rcu_torture_alloc), |
| 1062 | atomic_read(&n_rcu_torture_alloc_fail), | 1176 | atomic_read(&n_rcu_torture_alloc_fail), |
| 1063 | atomic_read(&n_rcu_torture_free)); | 1177 | atomic_read(&n_rcu_torture_free)); |
| 1064 | page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", | 1178 | pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", |
| 1065 | atomic_read(&n_rcu_torture_mberror), | 1179 | atomic_read(&n_rcu_torture_mberror), |
| 1066 | n_rcu_torture_boost_ktrerror, | 1180 | n_rcu_torture_boost_ktrerror, |
| 1067 | n_rcu_torture_boost_rterror); | 1181 | n_rcu_torture_boost_rterror); |
| 1068 | page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", | 1182 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", |
| 1069 | n_rcu_torture_boost_failure, | 1183 | n_rcu_torture_boost_failure, |
| 1070 | n_rcu_torture_boosts, | 1184 | n_rcu_torture_boosts, |
| 1071 | n_rcu_torture_timers); | 1185 | n_rcu_torture_timers); |
| 1072 | page = torture_onoff_stats(page); | 1186 | torture_onoff_stats(); |
| 1073 | page += sprintf(page, "barrier: %ld/%ld:%ld", | 1187 | pr_cont("barrier: %ld/%ld:%ld ", |
| 1074 | n_barrier_successes, | 1188 | n_barrier_successes, |
| 1075 | n_barrier_attempts, | 1189 | n_barrier_attempts, |
| 1076 | n_rcu_torture_barrier_error); | 1190 | n_rcu_torture_barrier_error); |
| 1077 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1191 | pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); |
| 1192 | |||
| 1193 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1078 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1194 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
| 1079 | n_rcu_torture_barrier_error != 0 || | 1195 | n_rcu_torture_barrier_error != 0 || |
| 1080 | n_rcu_torture_boost_ktrerror != 0 || | 1196 | n_rcu_torture_boost_ktrerror != 0 || |
| 1081 | n_rcu_torture_boost_rterror != 0 || | 1197 | n_rcu_torture_boost_rterror != 0 || |
| 1082 | n_rcu_torture_boost_failure != 0 || | 1198 | n_rcu_torture_boost_failure != 0 || |
| 1083 | i > 1) { | 1199 | i > 1) { |
| 1084 | page += sprintf(page, "!!! "); | 1200 | pr_cont("%s", "!!! "); |
| 1085 | atomic_inc(&n_rcu_torture_error); | 1201 | atomic_inc(&n_rcu_torture_error); |
| 1086 | WARN_ON_ONCE(1); | 1202 | WARN_ON_ONCE(1); |
| 1087 | } | 1203 | } |
| 1088 | page += sprintf(page, "Reader Pipe: "); | 1204 | pr_cont("Reader Pipe: "); |
| 1089 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1205 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 1090 | page += sprintf(page, " %ld", pipesummary[i]); | 1206 | pr_cont(" %ld", pipesummary[i]); |
| 1091 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1207 | pr_cont("\n"); |
| 1092 | page += sprintf(page, "Reader Batch: "); | 1208 | |
| 1209 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1210 | pr_cont("Reader Batch: "); | ||
| 1093 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1211 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 1094 | page += sprintf(page, " %ld", batchsummary[i]); | 1212 | pr_cont(" %ld", batchsummary[i]); |
| 1095 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1213 | pr_cont("\n"); |
| 1096 | page += sprintf(page, "Free-Block Circulation: "); | 1214 | |
| 1215 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1216 | pr_cont("Free-Block Circulation: "); | ||
| 1097 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1217 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| 1098 | page += sprintf(page, " %d", | 1218 | pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); |
| 1099 | atomic_read(&rcu_torture_wcount[i])); | ||
| 1100 | } | 1219 | } |
| 1101 | page += sprintf(page, "\n"); | 1220 | pr_cont("\n"); |
| 1221 | |||
| 1102 | if (cur_ops->stats) | 1222 | if (cur_ops->stats) |
| 1103 | cur_ops->stats(page); | 1223 | cur_ops->stats(); |
| 1104 | if (rtcv_snap == rcu_torture_current_version && | 1224 | if (rtcv_snap == rcu_torture_current_version && |
| 1105 | rcu_torture_current != NULL) { | 1225 | rcu_torture_current != NULL) { |
| 1106 | int __maybe_unused flags; | 1226 | int __maybe_unused flags; |
| @@ -1109,10 +1229,9 @@ rcu_torture_printk(char *page) | |||
| 1109 | 1229 | ||
| 1110 | rcutorture_get_gp_data(cur_ops->ttype, | 1230 | rcutorture_get_gp_data(cur_ops->ttype, |
| 1111 | &flags, &gpnum, &completed); | 1231 | &flags, &gpnum, &completed); |
| 1112 | page += sprintf(page, | 1232 | pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", |
| 1113 | "??? Writer stall state %d g%lu c%lu f%#x\n", | 1233 | rcu_torture_writer_state, |
| 1114 | rcu_torture_writer_state, | 1234 | gpnum, completed, flags); |
| 1115 | gpnum, completed, flags); | ||
| 1116 | show_rcu_gp_kthreads(); | 1235 | show_rcu_gp_kthreads(); |
| 1117 | rcutorture_trace_dump(); | 1236 | rcutorture_trace_dump(); |
| 1118 | } | 1237 | } |
| @@ -1120,30 +1239,6 @@ rcu_torture_printk(char *page) | |||
| 1120 | } | 1239 | } |
| 1121 | 1240 | ||
| 1122 | /* | 1241 | /* |
| 1123 | * Print torture statistics. Caller must ensure that there is only | ||
| 1124 | * one call to this function at a given time!!! This is normally | ||
| 1125 | * accomplished by relying on the module system to only have one copy | ||
| 1126 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 1127 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 1128 | * thread is not running). | ||
| 1129 | */ | ||
| 1130 | static void | ||
| 1131 | rcu_torture_stats_print(void) | ||
| 1132 | { | ||
| 1133 | int size = nr_cpu_ids * 200 + 8192; | ||
| 1134 | char *buf; | ||
| 1135 | |||
| 1136 | buf = kmalloc(size, GFP_KERNEL); | ||
| 1137 | if (!buf) { | ||
| 1138 | pr_err("rcu-torture: Out of memory, need: %d", size); | ||
| 1139 | return; | ||
| 1140 | } | ||
| 1141 | rcu_torture_printk(buf); | ||
| 1142 | pr_alert("%s", buf); | ||
| 1143 | kfree(buf); | ||
| 1144 | } | ||
| 1145 | |||
| 1146 | /* | ||
| 1147 | * Periodically prints torture statistics, if periodic statistics printing | 1242 | * Periodically prints torture statistics, if periodic statistics printing |
| 1148 | * was specified via the stat_interval module parameter. | 1243 | * was specified via the stat_interval module parameter. |
| 1149 | */ | 1244 | */ |
| @@ -1295,7 +1390,8 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
| 1295 | if (atomic_dec_and_test(&barrier_cbs_count)) | 1390 | if (atomic_dec_and_test(&barrier_cbs_count)) |
| 1296 | wake_up(&barrier_wq); | 1391 | wake_up(&barrier_wq); |
| 1297 | } while (!torture_must_stop()); | 1392 | } while (!torture_must_stop()); |
| 1298 | cur_ops->cb_barrier(); | 1393 | if (cur_ops->cb_barrier != NULL) |
| 1394 | cur_ops->cb_barrier(); | ||
| 1299 | destroy_rcu_head_on_stack(&rcu); | 1395 | destroy_rcu_head_on_stack(&rcu); |
| 1300 | torture_kthread_stopping("rcu_torture_barrier_cbs"); | 1396 | torture_kthread_stopping("rcu_torture_barrier_cbs"); |
| 1301 | return 0; | 1397 | return 0; |
| @@ -1418,7 +1514,7 @@ rcu_torture_cleanup(void) | |||
| 1418 | int i; | 1514 | int i; |
| 1419 | 1515 | ||
| 1420 | rcutorture_record_test_transition(); | 1516 | rcutorture_record_test_transition(); |
| 1421 | if (torture_cleanup()) { | 1517 | if (torture_cleanup_begin()) { |
| 1422 | if (cur_ops->cb_barrier != NULL) | 1518 | if (cur_ops->cb_barrier != NULL) |
| 1423 | cur_ops->cb_barrier(); | 1519 | cur_ops->cb_barrier(); |
| 1424 | return; | 1520 | return; |
| @@ -1447,6 +1543,8 @@ rcu_torture_cleanup(void) | |||
| 1447 | 1543 | ||
| 1448 | torture_stop_kthread(rcu_torture_stats, stats_task); | 1544 | torture_stop_kthread(rcu_torture_stats, stats_task); |
| 1449 | torture_stop_kthread(rcu_torture_fqs, fqs_task); | 1545 | torture_stop_kthread(rcu_torture_fqs, fqs_task); |
| 1546 | for (i = 0; i < ncbflooders; i++) | ||
| 1547 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); | ||
| 1450 | if ((test_boost == 1 && cur_ops->can_boost) || | 1548 | if ((test_boost == 1 && cur_ops->can_boost) || |
| 1451 | test_boost == 2) { | 1549 | test_boost == 2) { |
| 1452 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1550 | unregister_cpu_notifier(&rcutorture_cpu_nb); |
| @@ -1468,6 +1566,7 @@ rcu_torture_cleanup(void) | |||
| 1468 | "End of test: RCU_HOTPLUG"); | 1566 | "End of test: RCU_HOTPLUG"); |
| 1469 | else | 1567 | else |
| 1470 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1568 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
| 1569 | torture_cleanup_end(); | ||
| 1471 | } | 1570 | } |
| 1472 | 1571 | ||
| 1473 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 1572 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
| @@ -1534,9 +1633,10 @@ rcu_torture_init(void) | |||
| 1534 | int firsterr = 0; | 1633 | int firsterr = 0; |
| 1535 | static struct rcu_torture_ops *torture_ops[] = { | 1634 | static struct rcu_torture_ops *torture_ops[] = { |
| 1536 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, | 1635 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, |
| 1636 | RCUTORTURE_TASKS_OPS | ||
| 1537 | }; | 1637 | }; |
| 1538 | 1638 | ||
| 1539 | if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) | 1639 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) |
| 1540 | return -EBUSY; | 1640 | return -EBUSY; |
| 1541 | 1641 | ||
| 1542 | /* Process args and tell the world that the torturer is on the job. */ | 1642 | /* Process args and tell the world that the torturer is on the job. */ |
| @@ -1693,6 +1793,24 @@ rcu_torture_init(void) | |||
| 1693 | goto unwind; | 1793 | goto unwind; |
| 1694 | if (object_debug) | 1794 | if (object_debug) |
| 1695 | rcu_test_debug_objects(); | 1795 | rcu_test_debug_objects(); |
| 1796 | if (cbflood_n_burst > 0) { | ||
| 1797 | /* Create the cbflood threads */ | ||
| 1798 | ncbflooders = (num_online_cpus() + 3) / 4; | ||
| 1799 | cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), | ||
| 1800 | GFP_KERNEL); | ||
| 1801 | if (!cbflood_task) { | ||
| 1802 | VERBOSE_TOROUT_ERRSTRING("out of memory"); | ||
| 1803 | firsterr = -ENOMEM; | ||
| 1804 | goto unwind; | ||
| 1805 | } | ||
| 1806 | for (i = 0; i < ncbflooders; i++) { | ||
| 1807 | firsterr = torture_create_kthread(rcu_torture_cbflood, | ||
| 1808 | NULL, | ||
| 1809 | cbflood_task[i]); | ||
| 1810 | if (firsterr) | ||
| 1811 | goto unwind; | ||
| 1812 | } | ||
| 1813 | } | ||
| 1696 | rcutorture_record_test_transition(); | 1814 | rcutorture_record_test_transition(); |
| 1697 | torture_init_end(); | 1815 | torture_init_end(); |
| 1698 | return 0; | 1816 | return 0; |
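
The RCU-tasks support above follows rcutorture's usual pattern: each flavor supplies an ops structure, the RCUTORTURE_TASKS_OPS macro contributes a table entry only when CONFIG_TASKS_RCU is set, and hooks such as .stats or .fqs may be left NULL and are skipped by the callers. A small userspace model of that ops-table idiom is below; the struct layout and the lookup helper are simplifications for illustration, not the kernel's definitions.

    #include <stdio.h>
    #include <string.h>

    /* Simplified flavor descriptor; the real rcu_torture_ops has many more hooks. */
    struct flavor_ops {
            const char *name;
            void (*stats)(void);            /* may be NULL, as for tasks_ops */
    };

    static void rcu_stats(void)
    {
            printf("rcu: per-flavor statistics\n");
    }

    #define CONFIG_TASKS_DEMO 1
    #ifdef CONFIG_TASKS_DEMO
    # define TASKS_OPS { "tasks", NULL },   /* compiled in: add a table entry */
    #else
    # define TASKS_OPS                      /* compiled out: expands to nothing */
    #endif

    static struct flavor_ops flavor_table[] = {
            { "rcu", rcu_stats },
            TASKS_OPS
    };

    static struct flavor_ops *find_flavor(const char *type)
    {
            size_t i;

            for (i = 0; i < sizeof(flavor_table) / sizeof(flavor_table[0]); i++)
                    if (!strcmp(flavor_table[i].name, type))
                            return &flavor_table[i];
            return NULL;
    }

    int main(void)
    {
            struct flavor_ops *ops = find_flavor("tasks");

            if (!ops)
                    printf("flavor not built in\n");
            else if (ops->stats)
                    ops->stats();
            else
                    printf("%s: no stats hook, skipped\n", ops->name);
            return 0;
    }

Leaving hooks unset is why the patch also adds NULL checks such as the one before cur_ops->cb_barrier() in rcu_torture_barrier_cbs().
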
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..c0623fc47125 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | |||
| 51 | 51 | ||
| 52 | #include "tiny_plugin.h" | 52 | #include "tiny_plugin.h" |
| 53 | 53 | ||
| 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ |
| 55 | static void rcu_idle_enter_common(long long newval) | 55 | static void rcu_idle_enter_common(long long newval) |
| 56 | { | 56 | { |
| 57 | if (newval) { | 57 | if (newval) { |
| @@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval) | |||
| 62 | } | 62 | } |
| 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), | 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), |
| 64 | rcu_dynticks_nesting, newval)); | 64 | rcu_dynticks_nesting, newval)); |
| 65 | if (!is_idle_task(current)) { | 65 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { |
| 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 67 | 67 | ||
| 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), | 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), |
| @@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval) | |||
| 72 | current->pid, current->comm, | 72 | current->pid, current->comm, |
| 73 | idle->pid, idle->comm); /* must be idle task! */ | 73 | idle->pid, idle->comm); /* must be idle task! */ |
| 74 | } | 74 | } |
| 75 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 75 | rcu_sched_qs(); /* implies rcu_bh_inc() */ |
| 76 | barrier(); | 76 | barrier(); |
| 77 | rcu_dynticks_nesting = newval; | 77 | rcu_dynticks_nesting = newval; |
| 78 | } | 78 | } |
| @@ -114,7 +114,7 @@ void rcu_irq_exit(void) | |||
| 114 | } | 114 | } |
| 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); |
| 116 | 116 | ||
| 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ |
| 118 | static void rcu_idle_exit_common(long long oldval) | 118 | static void rcu_idle_exit_common(long long oldval) |
| 119 | { | 119 | { |
| 120 | if (oldval) { | 120 | if (oldval) { |
| @@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval) | |||
| 123 | return; | 123 | return; |
| 124 | } | 124 | } |
| 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); | 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); |
| 126 | if (!is_idle_task(current)) { | 126 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { |
| 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 128 | 128 | ||
| 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), | 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), |
| @@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | |||
| 217 | * are at it, given that any rcu quiescent state is also an rcu_bh | 217 | * are at it, given that any rcu quiescent state is also an rcu_bh |
| 218 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 218 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
| 219 | */ | 219 | */ |
| 220 | void rcu_sched_qs(int cpu) | 220 | void rcu_sched_qs(void) |
| 221 | { | 221 | { |
| 222 | unsigned long flags; | 222 | unsigned long flags; |
| 223 | 223 | ||
| @@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu) | |||
| 231 | /* | 231 | /* |
| 232 | * Record an rcu_bh quiescent state. | 232 | * Record an rcu_bh quiescent state. |
| 233 | */ | 233 | */ |
| 234 | void rcu_bh_qs(int cpu) | 234 | void rcu_bh_qs(void) |
| 235 | { | 235 | { |
| 236 | unsigned long flags; | 236 | unsigned long flags; |
| 237 | 237 | ||
| @@ -251,9 +251,11 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 251 | { | 251 | { |
| 252 | RCU_TRACE(check_cpu_stalls()); | 252 | RCU_TRACE(check_cpu_stalls()); |
| 253 | if (user || rcu_is_cpu_rrupt_from_idle()) | 253 | if (user || rcu_is_cpu_rrupt_from_idle()) |
| 254 | rcu_sched_qs(cpu); | 254 | rcu_sched_qs(); |
| 255 | else if (!in_softirq()) | 255 | else if (!in_softirq()) |
| 256 | rcu_bh_qs(cpu); | 256 | rcu_bh_qs(); |
| 257 | if (user) | ||
| 258 | rcu_note_voluntary_context_switch(current); | ||
| 257 | } | 259 | } |
| 258 | 260 | ||
| 259 | /* | 261 | /* |
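
The tiny.c changes above gate the not-an-idle-task diagnostics on IS_ENABLED(CONFIG_RCU_TRACE), so the branch compiles away when tracing is off while still being seen by the compiler. The sketch below is a simplified re-derivation of that idiom written from scratch, not a copy of include/linux/kconfig.h, so treat the helper macros as an illustration only.

    #include <stdio.h>

    /*
     * A config option is either defined to 1 or not defined at all; the
     * macros below turn that into a compile-time 0/1 usable in an if ().
     */
    #define CONFIG_DEMO_TRACE 1             /* comment out to model "=n" */

    #define __ARG_PLACEHOLDER_1 0,
    #define __take_second_arg(ignored, val, ...) val
    #define ___is_defined(val) __take_second_arg(__ARG_PLACEHOLDER_##val 1, 0)
    #define __is_defined(x) ___is_defined(x)
    #define IS_ENABLED(option) __is_defined(option)

    int main(void)
    {
            if (IS_ENABLED(CONFIG_DEMO_TRACE))
                    printf("trace branch compiled in\n");
            else
                    printf("trace branch is dead code, but still type-checked\n");
            return 0;
    }

Unlike #ifdef, the disabled branch is still parsed, so bit-rot in rarely enabled tracing code is caught at every build.
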
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..9815447d22e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
| 79 | * the tracing userspace tools to be able to decipher the string | 79 | * the tracing userspace tools to be able to decipher the string |
| 80 | * address to the matching string. | 80 | * address to the matching string. |
| 81 | */ | 81 | */ |
| 82 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 82 | #ifdef CONFIG_TRACING |
| 83 | # define DEFINE_RCU_TPS(sname) \ | ||
| 83 | static char sname##_varname[] = #sname; \ | 84 | static char sname##_varname[] = #sname; \ |
| 84 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | 85 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; |
| 86 | # define RCU_STATE_NAME(sname) sname##_varname | ||
| 87 | #else | ||
| 88 | # define DEFINE_RCU_TPS(sname) | ||
| 89 | # define RCU_STATE_NAME(sname) __stringify(sname) | ||
| 90 | #endif | ||
| 91 | |||
| 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
| 93 | DEFINE_RCU_TPS(sname) \ | ||
| 85 | struct rcu_state sname##_state = { \ | 94 | struct rcu_state sname##_state = { \ |
| 86 | .level = { &sname##_state.node[0] }, \ | 95 | .level = { &sname##_state.node[0] }, \ |
| 87 | .call = cr, \ | 96 | .call = cr, \ |
| @@ -93,7 +102,7 @@ struct rcu_state sname##_state = { \ | |||
| 93 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 102 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
| 94 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
| 95 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 104 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
| 96 | .name = sname##_varname, \ | 105 | .name = RCU_STATE_NAME(sname), \ |
| 97 | .abbr = sabbr, \ | 106 | .abbr = sabbr, \ |
| 98 | }; \ | 107 | }; \ |
| 99 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | 108 | DEFINE_PER_CPU(struct rcu_data, sname##_data) |
| @@ -188,22 +197,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
| 188 | * one since the start of the grace period, this just sets a flag. | 197 | * one since the start of the grace period, this just sets a flag. |
| 189 | * The caller must have disabled preemption. | 198 | * The caller must have disabled preemption. |
| 190 | */ | 199 | */ |
| 191 | void rcu_sched_qs(int cpu) | 200 | void rcu_sched_qs(void) |
| 192 | { | 201 | { |
| 193 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 202 | if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { |
| 194 | 203 | trace_rcu_grace_period(TPS("rcu_sched"), | |
| 195 | if (rdp->passed_quiesce == 0) | 204 | __this_cpu_read(rcu_sched_data.gpnum), |
| 196 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); | 205 | TPS("cpuqs")); |
| 197 | rdp->passed_quiesce = 1; | 206 | __this_cpu_write(rcu_sched_data.passed_quiesce, 1); |
| 207 | } | ||
| 198 | } | 208 | } |
| 199 | 209 | ||
| 200 | void rcu_bh_qs(int cpu) | 210 | void rcu_bh_qs(void) |
| 201 | { | 211 | { |
| 202 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 212 | if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { |
| 203 | 213 | trace_rcu_grace_period(TPS("rcu_bh"), | |
| 204 | if (rdp->passed_quiesce == 0) | 214 | __this_cpu_read(rcu_bh_data.gpnum), |
| 205 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); | 215 | TPS("cpuqs")); |
| 206 | rdp->passed_quiesce = 1; | 216 | __this_cpu_write(rcu_bh_data.passed_quiesce, 1); |
| 217 | } | ||
| 207 | } | 218 | } |
| 208 | 219 | ||
| 209 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | 220 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); |
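
In the hunk above, rcu_sched_qs() and rcu_bh_qs() drop their cpu argument and operate on the running CPU's rcu_data through __this_cpu_read()/__this_cpu_write(), tracing only the first quiescent state of each grace period. A thread-local userspace analogue of that check-then-set pattern follows; the __thread variable stands in for the per-CPU passed_quiesce field and the printf for the tracepoint.

    #include <stdio.h>

    /* __thread thread-local storage stands in for a per-CPU variable. */
    static __thread int passed_quiesce;

    static void sched_qs(void)
    {
            if (!passed_quiesce) {
                    printf("first quiescent state of this grace period\n");
                    passed_quiesce = 1;     /* later calls are cheap no-ops */
            }
    }

    int main(void)
    {
            sched_qs();
            sched_qs();     /* silent: flag already set */
            return 0;
    }
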
| @@ -278,7 +289,7 @@ static void rcu_momentary_dyntick_idle(void) | |||
| 278 | void rcu_note_context_switch(int cpu) | 289 | void rcu_note_context_switch(int cpu) |
| 279 | { | 290 | { |
| 280 | trace_rcu_utilization(TPS("Start context switch")); | 291 | trace_rcu_utilization(TPS("Start context switch")); |
| 281 | rcu_sched_qs(cpu); | 292 | rcu_sched_qs(); |
| 282 | rcu_preempt_note_context_switch(cpu); | 293 | rcu_preempt_note_context_switch(cpu); |
| 283 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 294 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) |
| 284 | rcu_momentary_dyntick_idle(); | 295 | rcu_momentary_dyntick_idle(); |
| @@ -526,6 +537,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 526 | atomic_inc(&rdtp->dynticks); | 537 | atomic_inc(&rdtp->dynticks); |
| 527 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ | 538 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ |
| 528 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 539 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 540 | rcu_dynticks_task_enter(); | ||
| 529 | 541 | ||
| 530 | /* | 542 | /* |
| 531 | * It is illegal to enter an extended quiescent state while | 543 | * It is illegal to enter an extended quiescent state while |
| @@ -642,6 +654,7 @@ void rcu_irq_exit(void) | |||
| 642 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | 654 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, |
| 643 | int user) | 655 | int user) |
| 644 | { | 656 | { |
| 657 | rcu_dynticks_task_exit(); | ||
| 645 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ | 658 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ |
| 646 | atomic_inc(&rdtp->dynticks); | 659 | atomic_inc(&rdtp->dynticks); |
| 647 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 660 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
| @@ -819,7 +832,7 @@ bool notrace __rcu_is_watching(void) | |||
| 819 | */ | 832 | */ |
| 820 | bool notrace rcu_is_watching(void) | 833 | bool notrace rcu_is_watching(void) |
| 821 | { | 834 | { |
| 822 | int ret; | 835 | bool ret; |
| 823 | 836 | ||
| 824 | preempt_disable(); | 837 | preempt_disable(); |
| 825 | ret = __rcu_is_watching(); | 838 | ret = __rcu_is_watching(); |
| @@ -1647,7 +1660,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1647 | rnp->level, rnp->grplo, | 1660 | rnp->level, rnp->grplo, |
| 1648 | rnp->grphi, rnp->qsmask); | 1661 | rnp->grphi, rnp->qsmask); |
| 1649 | raw_spin_unlock_irq(&rnp->lock); | 1662 | raw_spin_unlock_irq(&rnp->lock); |
| 1650 | cond_resched(); | 1663 | cond_resched_rcu_qs(); |
| 1651 | } | 1664 | } |
| 1652 | 1665 | ||
| 1653 | mutex_unlock(&rsp->onoff_mutex); | 1666 | mutex_unlock(&rsp->onoff_mutex); |
| @@ -1668,7 +1681,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1668 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1681 | if (fqs_state == RCU_SAVE_DYNTICK) { |
| 1669 | /* Collect dyntick-idle snapshots. */ | 1682 | /* Collect dyntick-idle snapshots. */ |
| 1670 | if (is_sysidle_rcu_state(rsp)) { | 1683 | if (is_sysidle_rcu_state(rsp)) { |
| 1671 | isidle = 1; | 1684 | isidle = true; |
| 1672 | maxj = jiffies - ULONG_MAX / 4; | 1685 | maxj = jiffies - ULONG_MAX / 4; |
| 1673 | } | 1686 | } |
| 1674 | force_qs_rnp(rsp, dyntick_save_progress_counter, | 1687 | force_qs_rnp(rsp, dyntick_save_progress_counter, |
| @@ -1677,14 +1690,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1677 | fqs_state = RCU_FORCE_QS; | 1690 | fqs_state = RCU_FORCE_QS; |
| 1678 | } else { | 1691 | } else { |
| 1679 | /* Handle dyntick-idle and offline CPUs. */ | 1692 | /* Handle dyntick-idle and offline CPUs. */ |
| 1680 | isidle = 0; | 1693 | isidle = false; |
| 1681 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 1694 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); |
| 1682 | } | 1695 | } |
| 1683 | /* Clear flag to prevent immediate re-entry. */ | 1696 | /* Clear flag to prevent immediate re-entry. */ |
| 1684 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1697 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 1685 | raw_spin_lock_irq(&rnp->lock); | 1698 | raw_spin_lock_irq(&rnp->lock); |
| 1686 | smp_mb__after_unlock_lock(); | 1699 | smp_mb__after_unlock_lock(); |
| 1687 | ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; | 1700 | ACCESS_ONCE(rsp->gp_flags) = |
| 1701 | ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; | ||
| 1688 | raw_spin_unlock_irq(&rnp->lock); | 1702 | raw_spin_unlock_irq(&rnp->lock); |
| 1689 | } | 1703 | } |
| 1690 | return fqs_state; | 1704 | return fqs_state; |
| @@ -1736,7 +1750,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1736 | /* smp_mb() provided by prior unlock-lock pair. */ | 1750 | /* smp_mb() provided by prior unlock-lock pair. */ |
| 1737 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1751 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 1738 | raw_spin_unlock_irq(&rnp->lock); | 1752 | raw_spin_unlock_irq(&rnp->lock); |
| 1739 | cond_resched(); | 1753 | cond_resched_rcu_qs(); |
| 1740 | } | 1754 | } |
| 1741 | rnp = rcu_get_root(rsp); | 1755 | rnp = rcu_get_root(rsp); |
| 1742 | raw_spin_lock_irq(&rnp->lock); | 1756 | raw_spin_lock_irq(&rnp->lock); |
| @@ -1785,8 +1799,8 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1785 | /* Locking provides needed memory barrier. */ | 1799 | /* Locking provides needed memory barrier. */ |
| 1786 | if (rcu_gp_init(rsp)) | 1800 | if (rcu_gp_init(rsp)) |
| 1787 | break; | 1801 | break; |
| 1788 | cond_resched(); | 1802 | cond_resched_rcu_qs(); |
| 1789 | flush_signals(current); | 1803 | WARN_ON(signal_pending(current)); |
| 1790 | trace_rcu_grace_period(rsp->name, | 1804 | trace_rcu_grace_period(rsp->name, |
| 1791 | ACCESS_ONCE(rsp->gpnum), | 1805 | ACCESS_ONCE(rsp->gpnum), |
| 1792 | TPS("reqwaitsig")); | 1806 | TPS("reqwaitsig")); |
| @@ -1828,11 +1842,11 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1828 | trace_rcu_grace_period(rsp->name, | 1842 | trace_rcu_grace_period(rsp->name, |
| 1829 | ACCESS_ONCE(rsp->gpnum), | 1843 | ACCESS_ONCE(rsp->gpnum), |
| 1830 | TPS("fqsend")); | 1844 | TPS("fqsend")); |
| 1831 | cond_resched(); | 1845 | cond_resched_rcu_qs(); |
| 1832 | } else { | 1846 | } else { |
| 1833 | /* Deal with stray signal. */ | 1847 | /* Deal with stray signal. */ |
| 1834 | cond_resched(); | 1848 | cond_resched_rcu_qs(); |
| 1835 | flush_signals(current); | 1849 | WARN_ON(signal_pending(current)); |
| 1836 | trace_rcu_grace_period(rsp->name, | 1850 | trace_rcu_grace_period(rsp->name, |
| 1837 | ACCESS_ONCE(rsp->gpnum), | 1851 | ACCESS_ONCE(rsp->gpnum), |
| 1838 | TPS("fqswaitsig")); | 1852 | TPS("fqswaitsig")); |
| @@ -1928,7 +1942,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 1928 | { | 1942 | { |
| 1929 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 1943 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 1930 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 1944 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
| 1931 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 1945 | rcu_gp_kthread_wake(rsp); |
| 1932 | } | 1946 | } |
| 1933 | 1947 | ||
| 1934 | /* | 1948 | /* |
| @@ -2210,8 +2224,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2210 | /* Adjust any no-longer-needed kthreads. */ | 2224 | /* Adjust any no-longer-needed kthreads. */ |
| 2211 | rcu_boost_kthread_setaffinity(rnp, -1); | 2225 | rcu_boost_kthread_setaffinity(rnp, -1); |
| 2212 | 2226 | ||
| 2213 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | ||
| 2214 | |||
| 2215 | /* Exclude any attempts to start a new grace period. */ | 2227 | /* Exclude any attempts to start a new grace period. */ |
| 2216 | mutex_lock(&rsp->onoff_mutex); | 2228 | mutex_lock(&rsp->onoff_mutex); |
| 2217 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | 2229 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); |
| @@ -2393,8 +2405,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2393 | * at least not while the corresponding CPU is online. | 2405 | * at least not while the corresponding CPU is online. |
| 2394 | */ | 2406 | */ |
| 2395 | 2407 | ||
| 2396 | rcu_sched_qs(cpu); | 2408 | rcu_sched_qs(); |
| 2397 | rcu_bh_qs(cpu); | 2409 | rcu_bh_qs(); |
| 2398 | 2410 | ||
| 2399 | } else if (!in_softirq()) { | 2411 | } else if (!in_softirq()) { |
| 2400 | 2412 | ||
| @@ -2405,11 +2417,13 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2405 | * critical section, so note it. | 2417 | * critical section, so note it. |
| 2406 | */ | 2418 | */ |
| 2407 | 2419 | ||
| 2408 | rcu_bh_qs(cpu); | 2420 | rcu_bh_qs(); |
| 2409 | } | 2421 | } |
| 2410 | rcu_preempt_check_callbacks(cpu); | 2422 | rcu_preempt_check_callbacks(cpu); |
| 2411 | if (rcu_pending(cpu)) | 2423 | if (rcu_pending(cpu)) |
| 2412 | invoke_rcu_core(); | 2424 | invoke_rcu_core(); |
| 2425 | if (user) | ||
| 2426 | rcu_note_voluntary_context_switch(current); | ||
| 2413 | trace_rcu_utilization(TPS("End scheduler-tick")); | 2427 | trace_rcu_utilization(TPS("End scheduler-tick")); |
| 2414 | } | 2428 | } |
| 2415 | 2429 | ||
| @@ -2432,7 +2446,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2432 | struct rcu_node *rnp; | 2446 | struct rcu_node *rnp; |
| 2433 | 2447 | ||
| 2434 | rcu_for_each_leaf_node(rsp, rnp) { | 2448 | rcu_for_each_leaf_node(rsp, rnp) { |
| 2435 | cond_resched(); | 2449 | cond_resched_rcu_qs(); |
| 2436 | mask = 0; | 2450 | mask = 0; |
| 2437 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2451 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2438 | smp_mb__after_unlock_lock(); | 2452 | smp_mb__after_unlock_lock(); |
| @@ -2449,7 +2463,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2449 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2463 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
| 2450 | if ((rnp->qsmask & bit) != 0) { | 2464 | if ((rnp->qsmask & bit) != 0) { |
| 2451 | if ((rnp->qsmaskinit & bit) != 0) | 2465 | if ((rnp->qsmaskinit & bit) != 0) |
| 2452 | *isidle = 0; | 2466 | *isidle = false; |
| 2453 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2467 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) |
| 2454 | mask |= bit; | 2468 | mask |= bit; |
| 2455 | } | 2469 | } |
| @@ -2505,9 +2519,10 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2505 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2519 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2506 | return; /* Someone beat us to it. */ | 2520 | return; /* Someone beat us to it. */ |
| 2507 | } | 2521 | } |
| 2508 | ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; | 2522 | ACCESS_ONCE(rsp->gp_flags) = |
| 2523 | ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; | ||
| 2509 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2524 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2510 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 2525 | rcu_gp_kthread_wake(rsp); |
| 2511 | } | 2526 | } |
| 2512 | 2527 | ||
| 2513 | /* | 2528 | /* |
| @@ -2925,11 +2940,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
| 2925 | * restructure your code to batch your updates, and then use a single | 2940 | * restructure your code to batch your updates, and then use a single |
| 2926 | * synchronize_sched() instead. | 2941 | * synchronize_sched() instead. |
| 2927 | * | 2942 | * |
| 2928 | * Note that it is illegal to call this function while holding any lock | ||
| 2929 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 2930 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 2931 | * these restriction will result in deadlock. | ||
| 2932 | * | ||
| 2933 | * This implementation can be thought of as an application of ticket | 2943 | * This implementation can be thought of as an application of ticket |
| 2934 | * locking to RCU, with sync_sched_expedited_started and | 2944 | * locking to RCU, with sync_sched_expedited_started and |
| 2935 | * sync_sched_expedited_done taking on the roles of the halves | 2945 | * sync_sched_expedited_done taking on the roles of the halves |
| @@ -2979,7 +2989,12 @@ void synchronize_sched_expedited(void) | |||
| 2979 | */ | 2989 | */ |
| 2980 | snap = atomic_long_inc_return(&rsp->expedited_start); | 2990 | snap = atomic_long_inc_return(&rsp->expedited_start); |
| 2981 | firstsnap = snap; | 2991 | firstsnap = snap; |
| 2982 | get_online_cpus(); | 2992 | if (!try_get_online_cpus()) { |
| 2993 | /* CPU hotplug operation in flight, fall back to normal GP. */ | ||
| 2994 | wait_rcu_gp(call_rcu_sched); | ||
| 2995 | atomic_long_inc(&rsp->expedited_normal); | ||
| 2996 | return; | ||
| 2997 | } | ||
| 2983 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2998 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
| 2984 | 2999 | ||
| 2985 | /* | 3000 | /* |
| @@ -3026,7 +3041,12 @@ void synchronize_sched_expedited(void) | |||
| 3026 | * and they started after our first try, so their grace | 3041 | * and they started after our first try, so their grace |
| 3027 | * period works for us. | 3042 | * period works for us. |
| 3028 | */ | 3043 | */ |
| 3029 | get_online_cpus(); | 3044 | if (!try_get_online_cpus()) { |
| 3045 | /* CPU hotplug operation in flight, use normal GP. */ | ||
| 3046 | wait_rcu_gp(call_rcu_sched); | ||
| 3047 | atomic_long_inc(&rsp->expedited_normal); | ||
| 3048 | return; | ||
| 3049 | } | ||
| 3030 | snap = atomic_long_read(&rsp->expedited_start); | 3050 | snap = atomic_long_read(&rsp->expedited_start); |
| 3031 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 3051 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
| 3032 | } | 3052 | } |
| @@ -3279,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 3279 | continue; | 3299 | continue; |
| 3280 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3300 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 3281 | if (rcu_is_nocb_cpu(cpu)) { | 3301 | if (rcu_is_nocb_cpu(cpu)) { |
| 3282 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 3302 | if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { |
| 3283 | rsp->n_barrier_done); | 3303 | _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, |
| 3284 | atomic_inc(&rsp->barrier_cpu_count); | 3304 | rsp->n_barrier_done); |
| 3285 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, | 3305 | } else { |
| 3286 | rsp, cpu, 0); | 3306 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
| 3307 | rsp->n_barrier_done); | ||
| 3308 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 3309 | __call_rcu(&rdp->barrier_head, | ||
| 3310 | rcu_barrier_callback, rsp, cpu, 0); | ||
| 3311 | } | ||
| 3287 | } else if (ACCESS_ONCE(rdp->qlen)) { | 3312 | } else if (ACCESS_ONCE(rdp->qlen)) { |
| 3288 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3313 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
| 3289 | rsp->n_barrier_done); | 3314 | rsp->n_barrier_done); |
| @@ -3442,6 +3467,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3442 | case CPU_UP_PREPARE_FROZEN: | 3467 | case CPU_UP_PREPARE_FROZEN: |
| 3443 | rcu_prepare_cpu(cpu); | 3468 | rcu_prepare_cpu(cpu); |
| 3444 | rcu_prepare_kthreads(cpu); | 3469 | rcu_prepare_kthreads(cpu); |
| 3470 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 3445 | break; | 3471 | break; |
| 3446 | case CPU_ONLINE: | 3472 | case CPU_ONLINE: |
| 3447 | case CPU_DOWN_FAILED: | 3473 | case CPU_DOWN_FAILED: |
| @@ -3489,7 +3515,7 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3489 | } | 3515 | } |
| 3490 | 3516 | ||
| 3491 | /* | 3517 | /* |
| 3492 | * Spawn the kthread that handles this RCU flavor's grace periods. | 3518 | * Spawn the kthreads that handle each RCU flavor's grace periods. |
| 3493 | */ | 3519 | */ |
| 3494 | static int __init rcu_spawn_gp_kthread(void) | 3520 | static int __init rcu_spawn_gp_kthread(void) |
| 3495 | { | 3521 | { |
| @@ -3498,6 +3524,7 @@ static int __init rcu_spawn_gp_kthread(void) | |||
| 3498 | struct rcu_state *rsp; | 3524 | struct rcu_state *rsp; |
| 3499 | struct task_struct *t; | 3525 | struct task_struct *t; |
| 3500 | 3526 | ||
| 3527 | rcu_scheduler_fully_active = 1; | ||
| 3501 | for_each_rcu_flavor(rsp) { | 3528 | for_each_rcu_flavor(rsp) { |
| 3502 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); | 3529 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); |
| 3503 | BUG_ON(IS_ERR(t)); | 3530 | BUG_ON(IS_ERR(t)); |
| @@ -3505,8 +3532,9 @@ static int __init rcu_spawn_gp_kthread(void) | |||
| 3505 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3532 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3506 | rsp->gp_kthread = t; | 3533 | rsp->gp_kthread = t; |
| 3507 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3534 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3508 | rcu_spawn_nocb_kthreads(rsp); | ||
| 3509 | } | 3535 | } |
| 3536 | rcu_spawn_nocb_kthreads(); | ||
| 3537 | rcu_spawn_boost_kthreads(); | ||
| 3510 | return 0; | 3538 | return 0; |
| 3511 | } | 3539 | } |
| 3512 | early_initcall(rcu_spawn_gp_kthread); | 3540 | early_initcall(rcu_spawn_gp_kthread); |
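The rcu_spawn_gp_kthread() changes above pull kthread creation into one place: the initcall now sets rcu_scheduler_fully_active itself, spawns one grace-period kthread per flavor, and then calls rcu_spawn_nocb_kthreads() and rcu_spawn_boost_kthreads() (the separate early_initcall()s that used to do those last two jobs are removed further down in tree_plugin.h). A rough userspace sketch of the spawn-everything-from-one-init shape, with pthreads standing in for kthreads and invented flavor names:

```c
#include <pthread.h>
#include <stdio.h>

static void *gp_worker(void *name)
{
        printf("grace-period thread for %s running\n", (const char *)name);
        return NULL;
}

int main(void)
{
        char *flavors[] = { "flavor_a", "flavor_b", "flavor_c" };
        pthread_t tids[3];
        int i;

        /* One worker per flavor, all spawned from this single init point. */
        for (i = 0; i < 3; i++)
                pthread_create(&tids[i], NULL, gp_worker, flavors[i]);

        /* The kernel version goes on to spawn nocb and boost kthreads here. */

        for (i = 0; i < 3; i++)
                pthread_join(tids[i], NULL);
        return 0;
}
```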
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71e64c718f75..bbdc45d8d74f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -350,7 +350,7 @@ struct rcu_data { | |||
| 350 | int nocb_p_count_lazy; /* (approximate). */ | 350 | int nocb_p_count_lazy; /* (approximate). */ |
| 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
| 352 | struct task_struct *nocb_kthread; | 352 | struct task_struct *nocb_kthread; |
| 353 | bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 353 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
| 354 | 354 | ||
| 355 | /* The following fields are used by the leader, hence own cacheline. */ | 355 | /* The following fields are used by the leader, hence own cacheline. */ |
| 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; |
| @@ -358,7 +358,7 @@ struct rcu_data { | |||
| 358 | struct rcu_head **nocb_gp_tail; | 358 | struct rcu_head **nocb_gp_tail; |
| 359 | long nocb_gp_count; | 359 | long nocb_gp_count; |
| 360 | long nocb_gp_count_lazy; | 360 | long nocb_gp_count_lazy; |
| 361 | bool nocb_leader_wake; /* Is the nocb leader thread awake? */ | 361 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ |
| 362 | struct rcu_data *nocb_next_follower; | 362 | struct rcu_data *nocb_next_follower; |
| 363 | /* Next follower in wakeup chain. */ | 363 | /* Next follower in wakeup chain. */ |
| 364 | 364 | ||
| @@ -383,6 +383,11 @@ struct rcu_data { | |||
| 383 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 383 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
| 384 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 384 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
| 385 | 385 | ||
| 386 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ | ||
| 387 | #define RCU_NOGP_WAKE_NOT 0 | ||
| 388 | #define RCU_NOGP_WAKE 1 | ||
| 389 | #define RCU_NOGP_WAKE_FORCE 2 | ||
| 390 | |||
| 386 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) | 391 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) |
| 387 | /* For jiffies_till_first_fqs and */ | 392 | /* For jiffies_till_first_fqs and */ |
| 388 | /* and jiffies_till_next_fqs. */ | 393 | /* and jiffies_till_next_fqs. */ |
| @@ -572,6 +577,7 @@ static void rcu_preempt_do_callbacks(void); | |||
| 572 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 577 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
| 573 | struct rcu_node *rnp); | 578 | struct rcu_node *rnp); |
| 574 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 579 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 580 | static void __init rcu_spawn_boost_kthreads(void); | ||
| 575 | static void rcu_prepare_kthreads(int cpu); | 581 | static void rcu_prepare_kthreads(int cpu); |
| 576 | static void rcu_cleanup_after_idle(int cpu); | 582 | static void rcu_cleanup_after_idle(int cpu); |
| 577 | static void rcu_prepare_for_idle(int cpu); | 583 | static void rcu_prepare_for_idle(int cpu); |
| @@ -581,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
| 581 | static void print_cpu_stall_info_end(void); | 587 | static void print_cpu_stall_info_end(void); |
| 582 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 588 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
| 583 | static void increment_cpu_stall_ticks(void); | 589 | static void increment_cpu_stall_ticks(void); |
| 590 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); | ||
| 584 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 591 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
| 585 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 592 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
| 586 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 593 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
| @@ -589,10 +596,14 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 589 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 596 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
| 590 | struct rcu_data *rdp, | 597 | struct rcu_data *rdp, |
| 591 | unsigned long flags); | 598 | unsigned long flags); |
| 592 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | 599 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); |
| 593 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | 600 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); |
| 594 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 601 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
| 595 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 602 | static void rcu_spawn_all_nocb_kthreads(int cpu); |
| 603 | static void __init rcu_spawn_nocb_kthreads(void); | ||
| 604 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 605 | static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); | ||
| 606 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 596 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); | 607 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
| 597 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 608 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
| 598 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | 609 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); |
| @@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | |||
| 605 | static void rcu_bind_gp_kthread(void); | 616 | static void rcu_bind_gp_kthread(void); |
| 606 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | 617 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); |
| 607 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | 618 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); |
| 619 | static void rcu_dynticks_task_enter(void); | ||
| 620 | static void rcu_dynticks_task_exit(void); | ||
| 608 | 621 | ||
| 609 | #endif /* #ifndef RCU_TREE_NONCORE */ | 622 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 610 | 623 | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 00dc411e9676..c1d7f27bd38f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -85,33 +85,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 85 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 85 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
| 86 | if (nr_cpu_ids != NR_CPUS) | 86 | if (nr_cpu_ids != NR_CPUS) |
| 87 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 87 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
| 88 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 89 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | ||
| 90 | if (!have_rcu_nocb_mask) { | ||
| 91 | zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); | ||
| 92 | have_rcu_nocb_mask = true; | ||
| 93 | } | ||
| 94 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 95 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 96 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 97 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 98 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 99 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 100 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 101 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 102 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 103 | if (have_rcu_nocb_mask) { | ||
| 104 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 105 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 106 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 107 | rcu_nocb_mask); | ||
| 108 | } | ||
| 109 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
| 110 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
| 111 | if (rcu_nocb_poll) | ||
| 112 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
| 113 | } | ||
| 114 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 115 | } | 88 | } |
| 116 | 89 | ||
| 117 | #ifdef CONFIG_TREE_PREEMPT_RCU | 90 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| @@ -134,7 +107,7 @@ static void __init rcu_bootup_announce(void) | |||
| 134 | * Return the number of RCU-preempt batches processed thus far | 107 | * Return the number of RCU-preempt batches processed thus far |
| 135 | * for debug and statistics. | 108 | * for debug and statistics. |
| 136 | */ | 109 | */ |
| 137 | long rcu_batches_completed_preempt(void) | 110 | static long rcu_batches_completed_preempt(void) |
| 138 | { | 111 | { |
| 139 | return rcu_preempt_state.completed; | 112 | return rcu_preempt_state.completed; |
| 140 | } | 113 | } |
| @@ -155,18 +128,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
| 155 | * not in a quiescent state. There might be any number of tasks blocked | 128 | * not in a quiescent state. There might be any number of tasks blocked |
| 156 | * while in an RCU read-side critical section. | 129 | * while in an RCU read-side critical section. |
| 157 | * | 130 | * |
| 158 | * Unlike the other rcu_*_qs() functions, callers to this function | 131 | * As with the other rcu_*_qs() functions, callers to this function |
| 159 | * must disable irqs in order to protect the assignment to | 132 | * must disable preemption. |
| 160 | * ->rcu_read_unlock_special. | 133 | */ |
| 161 | */ | 134 | static void rcu_preempt_qs(void) |
| 162 | static void rcu_preempt_qs(int cpu) | 135 | { |
| 163 | { | 136 | if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { |
| 164 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 137 | trace_rcu_grace_period(TPS("rcu_preempt"), |
| 165 | 138 | __this_cpu_read(rcu_preempt_data.gpnum), | |
| 166 | if (rdp->passed_quiesce == 0) | 139 | TPS("cpuqs")); |
| 167 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); | 140 | __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); |
| 168 | rdp->passed_quiesce = 1; | 141 | barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ |
| 169 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 142 | current->rcu_read_unlock_special.b.need_qs = false; |
| 143 | } | ||
| 170 | } | 144 | } |
| 171 | 145 | ||
| 172 | /* | 146 | /* |
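The rewritten rcu_preempt_qs() above drops its cpu argument, uses __this_cpu_read()/__this_cpu_write() on rcu_preempt_data, and per the updated comment only requires preemption to be disabled rather than irqs; it also skips all the work when passed_quiesce is already set. A very small userspace analogue, with a thread-local structure standing in for the per-CPU data (all identifiers invented):

```c
#include <stdbool.h>
#include <stdio.h>

/* Thread-local stand-in for this CPU's rcu_preempt_data (invented fields). */
static _Thread_local struct {
        bool passed_quiesce;    /* already reported for the current GP? */
        bool need_qs;           /* the GP machinery wants a QS from us */
} qs_state = { .need_qs = true };

static void report_qs(void)
{
        /* Only do the work once, like the !passed_quiesce test above. */
        if (!qs_state.passed_quiesce) {
                qs_state.passed_quiesce = true;
                qs_state.need_qs = false;
                puts("quiescent state recorded");
        }
}

int main(void)
{
        report_qs();    /* records the quiescent state */
        report_qs();    /* no-op until the flag is reset for the next GP */
        return 0;
}
```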
| @@ -190,14 +164,14 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 190 | struct rcu_node *rnp; | 164 | struct rcu_node *rnp; |
| 191 | 165 | ||
| 192 | if (t->rcu_read_lock_nesting > 0 && | 166 | if (t->rcu_read_lock_nesting > 0 && |
| 193 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 167 | !t->rcu_read_unlock_special.b.blocked) { |
| 194 | 168 | ||
| 195 | /* Possibly blocking in an RCU read-side critical section. */ | 169 | /* Possibly blocking in an RCU read-side critical section. */ |
| 196 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 170 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
| 197 | rnp = rdp->mynode; | 171 | rnp = rdp->mynode; |
| 198 | raw_spin_lock_irqsave(&rnp->lock, flags); | 172 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 199 | smp_mb__after_unlock_lock(); | 173 | smp_mb__after_unlock_lock(); |
| 200 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 174 | t->rcu_read_unlock_special.b.blocked = true; |
| 201 | t->rcu_blocked_node = rnp; | 175 | t->rcu_blocked_node = rnp; |
| 202 | 176 | ||
| 203 | /* | 177 | /* |
| @@ -239,7 +213,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 239 | : rnp->gpnum + 1); | 213 | : rnp->gpnum + 1); |
| 240 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 214 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 241 | } else if (t->rcu_read_lock_nesting < 0 && | 215 | } else if (t->rcu_read_lock_nesting < 0 && |
| 242 | t->rcu_read_unlock_special) { | 216 | t->rcu_read_unlock_special.s) { |
| 243 | 217 | ||
| 244 | /* | 218 | /* |
| 245 | * Complete exit from RCU read-side critical section on | 219 | * Complete exit from RCU read-side critical section on |
| @@ -257,9 +231,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 257 | * grace period, then the fact that the task has been enqueued | 231 | * grace period, then the fact that the task has been enqueued |
| 258 | * means that we continue to block the current grace period. | 232 | * means that we continue to block the current grace period. |
| 259 | */ | 233 | */ |
| 260 | local_irq_save(flags); | 234 | rcu_preempt_qs(); |
| 261 | rcu_preempt_qs(cpu); | ||
| 262 | local_irq_restore(flags); | ||
| 263 | } | 235 | } |
| 264 | 236 | ||
| 265 | /* | 237 | /* |
| @@ -340,7 +312,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 340 | bool drop_boost_mutex = false; | 312 | bool drop_boost_mutex = false; |
| 341 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 313 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 342 | struct rcu_node *rnp; | 314 | struct rcu_node *rnp; |
| 343 | int special; | 315 | union rcu_special special; |
| 344 | 316 | ||
| 345 | /* NMI handlers cannot block and cannot safely manipulate state. */ | 317 | /* NMI handlers cannot block and cannot safely manipulate state. */ |
| 346 | if (in_nmi()) | 318 | if (in_nmi()) |
| @@ -350,12 +322,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 350 | 322 | ||
| 351 | /* | 323 | /* |
| 352 | * If RCU core is waiting for this CPU to exit critical section, | 324 | * If RCU core is waiting for this CPU to exit critical section, |
| 353 | * let it know that we have done so. | 325 | * let it know that we have done so. Because irqs are disabled, |
| 326 | * t->rcu_read_unlock_special cannot change. | ||
| 354 | */ | 327 | */ |
| 355 | special = t->rcu_read_unlock_special; | 328 | special = t->rcu_read_unlock_special; |
| 356 | if (special & RCU_READ_UNLOCK_NEED_QS) { | 329 | if (special.b.need_qs) { |
| 357 | rcu_preempt_qs(smp_processor_id()); | 330 | rcu_preempt_qs(); |
| 358 | if (!t->rcu_read_unlock_special) { | 331 | if (!t->rcu_read_unlock_special.s) { |
| 359 | local_irq_restore(flags); | 332 | local_irq_restore(flags); |
| 360 | return; | 333 | return; |
| 361 | } | 334 | } |
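Throughout this hunk t->rcu_read_unlock_special is no longer a plain int tested with bit masks: individual flags are set and cleared through named members (special.b.need_qs, special.b.blocked), while "is anything set at all?" stays a single load of the aggregate member (special.s). A self-contained sketch of that union idiom follows; the field names and widths are illustrative rather than the kernel's actual layout, and the trick assumes the flag struct and the aggregate member occupy the same bytes.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

union special_flags {
        struct {
                uint8_t blocked;        /* blocked inside a read-side section */
                uint8_t need_qs;        /* core is waiting for a quiescent state */
        } b;                            /* named access to individual flags */
        uint16_t s;                     /* aggregate view: nonzero iff any flag set */
};

int main(void)
{
        union special_flags f = { .s = 0 };

        /* The two views must cover the same bytes for .s to be meaningful. */
        static_assert(sizeof(f.b) == sizeof(f.s), "flag views must match");

        f.b.need_qs = 1;                                   /* set one flag by name */
        printf("any flag set? %s\n", f.s ? "yes" : "no");  /* test them all at once */

        f.b.need_qs = 0;
        printf("any flag set? %s\n", f.s ? "yes" : "no");
        return 0;
}
```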
| @@ -368,8 +341,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 368 | } | 341 | } |
| 369 | 342 | ||
| 370 | /* Clean up if blocked during RCU read-side critical section. */ | 343 | /* Clean up if blocked during RCU read-side critical section. */ |
| 371 | if (special & RCU_READ_UNLOCK_BLOCKED) { | 344 | if (special.b.blocked) { |
| 372 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | 345 | t->rcu_read_unlock_special.b.blocked = false; |
| 373 | 346 | ||
| 374 | /* | 347 | /* |
| 375 | * Remove this task from the list it blocked on. The | 348 | * Remove this task from the list it blocked on. The |
| @@ -653,12 +626,13 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 653 | struct task_struct *t = current; | 626 | struct task_struct *t = current; |
| 654 | 627 | ||
| 655 | if (t->rcu_read_lock_nesting == 0) { | 628 | if (t->rcu_read_lock_nesting == 0) { |
| 656 | rcu_preempt_qs(cpu); | 629 | rcu_preempt_qs(); |
| 657 | return; | 630 | return; |
| 658 | } | 631 | } |
| 659 | if (t->rcu_read_lock_nesting > 0 && | 632 | if (t->rcu_read_lock_nesting > 0 && |
| 660 | per_cpu(rcu_preempt_data, cpu).qs_pending) | 633 | per_cpu(rcu_preempt_data, cpu).qs_pending && |
| 661 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 634 | !per_cpu(rcu_preempt_data, cpu).passed_quiesce) |
| 635 | t->rcu_read_unlock_special.b.need_qs = true; | ||
| 662 | } | 636 | } |
| 663 | 637 | ||
| 664 | #ifdef CONFIG_RCU_BOOST | 638 | #ifdef CONFIG_RCU_BOOST |
| @@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 819 | * In fact, if you are using synchronize_rcu_expedited() in a loop, | 793 | * In fact, if you are using synchronize_rcu_expedited() in a loop, |
| 820 | * please restructure your code to batch your updates, and then use a | 794 | * please restructure your code to batch your updates, and then use a |
| 821 | * single synchronize_rcu() instead. | 795 | * single synchronize_rcu() instead. |
| 822 | * | ||
| 823 | * Note that it is illegal to call this function while holding any lock | ||
| 824 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 825 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 826 | * these restriction will result in deadlock. | ||
| 827 | */ | 796 | */ |
| 828 | void synchronize_rcu_expedited(void) | 797 | void synchronize_rcu_expedited(void) |
| 829 | { | 798 | { |
| @@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void) | |||
| 845 | * being boosted. This simplifies the process of moving tasks | 814 | * being boosted. This simplifies the process of moving tasks |
| 846 | * from leaf to root rcu_node structures. | 815 | * from leaf to root rcu_node structures. |
| 847 | */ | 816 | */ |
| 848 | get_online_cpus(); | 817 | if (!try_get_online_cpus()) { |
| 818 | /* CPU-hotplug operation in flight, fall back to normal GP. */ | ||
| 819 | wait_rcu_gp(call_rcu); | ||
| 820 | return; | ||
| 821 | } | ||
| 849 | 822 | ||
| 850 | /* | 823 | /* |
| 851 | * Acquire lock, falling back to synchronize_rcu() if too many | 824 | * Acquire lock, falling back to synchronize_rcu() if too many |
| @@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void) | |||
| 897 | 870 | ||
| 898 | /* Clean up and exit. */ | 871 | /* Clean up and exit. */ |
| 899 | smp_mb(); /* ensure expedited GP seen before counter increment. */ | 872 | smp_mb(); /* ensure expedited GP seen before counter increment. */ |
| 900 | ACCESS_ONCE(sync_rcu_preempt_exp_count)++; | 873 | ACCESS_ONCE(sync_rcu_preempt_exp_count) = |
| 874 | sync_rcu_preempt_exp_count + 1; | ||
| 901 | unlock_mb_ret: | 875 | unlock_mb_ret: |
| 902 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | 876 | mutex_unlock(&sync_rcu_preempt_exp_mutex); |
| 903 | mb_ret: | 877 | mb_ret: |
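The change just above rewrites ACCESS_ONCE(sync_rcu_preempt_exp_count)++ as ACCESS_ONCE(sync_rcu_preempt_exp_count) = sync_rcu_preempt_exp_count + 1, spelling the operation out as a plain read followed by one volatile store rather than an increment applied through the volatile cast. A tiny standalone illustration of the same shape; ONCE_LONG below is a stand-in macro, not the kernel's ACCESS_ONCE.

```c
#include <stdio.h>

/* Stand-in for ACCESS_ONCE() on a long: force a volatile access to x. */
#define ONCE_LONG(x) (*(volatile long *)&(x))

static long exp_count;

int main(void)
{
        /* Plain read, add one, then exactly one volatile store. */
        ONCE_LONG(exp_count) = exp_count + 1;

        /* The form being removed would have been: ONCE_LONG(exp_count)++; */
        printf("exp_count = %ld\n", exp_count);
        return 0;
}
```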
| @@ -941,7 +915,7 @@ void exit_rcu(void) | |||
| 941 | return; | 915 | return; |
| 942 | t->rcu_read_lock_nesting = 1; | 916 | t->rcu_read_lock_nesting = 1; |
| 943 | barrier(); | 917 | barrier(); |
| 944 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | 918 | t->rcu_read_unlock_special.b.blocked = true; |
| 945 | __rcu_read_unlock(); | 919 | __rcu_read_unlock(); |
| 946 | } | 920 | } |
| 947 | 921 | ||
| @@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = { | |||
| 1462 | }; | 1436 | }; |
| 1463 | 1437 | ||
| 1464 | /* | 1438 | /* |
| 1465 | * Spawn all kthreads -- called as soon as the scheduler is running. | 1439 | * Spawn boost kthreads -- called as soon as the scheduler is running. |
| 1466 | */ | 1440 | */ |
| 1467 | static int __init rcu_spawn_kthreads(void) | 1441 | static void __init rcu_spawn_boost_kthreads(void) |
| 1468 | { | 1442 | { |
| 1469 | struct rcu_node *rnp; | 1443 | struct rcu_node *rnp; |
| 1470 | int cpu; | 1444 | int cpu; |
| 1471 | 1445 | ||
| 1472 | rcu_scheduler_fully_active = 1; | ||
| 1473 | for_each_possible_cpu(cpu) | 1446 | for_each_possible_cpu(cpu) |
| 1474 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1447 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1475 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1448 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| @@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void) | |||
| 1479 | rcu_for_each_leaf_node(rcu_state_p, rnp) | 1452 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
| 1480 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | 1453 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1481 | } | 1454 | } |
| 1482 | return 0; | ||
| 1483 | } | 1455 | } |
| 1484 | early_initcall(rcu_spawn_kthreads); | ||
| 1485 | 1456 | ||
| 1486 | static void rcu_prepare_kthreads(int cpu) | 1457 | static void rcu_prepare_kthreads(int cpu) |
| 1487 | { | 1458 | { |
| @@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1519 | { | 1490 | { |
| 1520 | } | 1491 | } |
| 1521 | 1492 | ||
| 1522 | static int __init rcu_scheduler_really_started(void) | 1493 | static void __init rcu_spawn_boost_kthreads(void) |
| 1523 | { | 1494 | { |
| 1524 | rcu_scheduler_fully_active = 1; | ||
| 1525 | return 0; | ||
| 1526 | } | 1495 | } |
| 1527 | early_initcall(rcu_scheduler_really_started); | ||
| 1528 | 1496 | ||
| 1529 | static void rcu_prepare_kthreads(int cpu) | 1497 | static void rcu_prepare_kthreads(int cpu) |
| 1530 | { | 1498 | { |
| @@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1625 | 1593 | ||
| 1626 | /* Exit early if we advanced recently. */ | 1594 | /* Exit early if we advanced recently. */ |
| 1627 | if (jiffies == rdtp->last_advance_all) | 1595 | if (jiffies == rdtp->last_advance_all) |
| 1628 | return 0; | 1596 | return false; |
| 1629 | rdtp->last_advance_all = jiffies; | 1597 | rdtp->last_advance_all = jiffies; |
| 1630 | 1598 | ||
| 1631 | for_each_rcu_flavor(rsp) { | 1599 | for_each_rcu_flavor(rsp) { |
| @@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self, | |||
| 1848 | get_online_cpus(); | 1816 | get_online_cpus(); |
| 1849 | for_each_online_cpu(cpu) { | 1817 | for_each_online_cpu(cpu) { |
| 1850 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | 1818 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); |
| 1851 | cond_resched(); | 1819 | cond_resched_rcu_qs(); |
| 1852 | } | 1820 | } |
| 1853 | put_online_cpus(); | 1821 | put_online_cpus(); |
| 1854 | 1822 | ||
| @@ -2074,14 +2042,41 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 2074 | 2042 | ||
| 2075 | if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) | 2043 | if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) |
| 2076 | return; | 2044 | return; |
| 2077 | if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { | 2045 | if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { |
| 2078 | /* Prior xchg orders against prior callback enqueue. */ | 2046 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ |
| 2079 | ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; | 2047 | ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; |
| 2080 | wake_up(&rdp_leader->nocb_wq); | 2048 | wake_up(&rdp_leader->nocb_wq); |
| 2081 | } | 2049 | } |
| 2082 | } | 2050 | } |
| 2083 | 2051 | ||
| 2084 | /* | 2052 | /* |
| 2053 | * Does the specified CPU need an RCU callback for the specified flavor | ||
| 2054 | * of rcu_barrier()? | ||
| 2055 | */ | ||
| 2056 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | ||
| 2057 | { | ||
| 2058 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2059 | struct rcu_head *rhp; | ||
| 2060 | |||
| 2061 | /* No-CBs CPUs might have callbacks on any of three lists. */ | ||
| 2062 | rhp = ACCESS_ONCE(rdp->nocb_head); | ||
| 2063 | if (!rhp) | ||
| 2064 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); | ||
| 2065 | if (!rhp) | ||
| 2066 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | ||
| 2067 | |||
| 2068 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | ||
| 2069 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { | ||
| 2070 | /* RCU callback enqueued before CPU first came online??? */ | ||
| 2071 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | ||
| 2072 | cpu, rhp->func); | ||
| 2073 | WARN_ON_ONCE(1); | ||
| 2074 | } | ||
| 2075 | |||
| 2076 | return !!rhp; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | /* | ||
| 2085 | * Enqueue the specified string of rcu_head structures onto the specified | 2080 | * Enqueue the specified string of rcu_head structures onto the specified |
| 2086 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | 2081 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the |
| 2087 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | 2082 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy |
| @@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2104 | ACCESS_ONCE(*old_rhpp) = rhp; | 2099 | ACCESS_ONCE(*old_rhpp) = rhp; |
| 2105 | atomic_long_add(rhcount, &rdp->nocb_q_count); | 2100 | atomic_long_add(rhcount, &rdp->nocb_q_count); |
| 2106 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | 2101 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); |
| 2102 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ | ||
| 2107 | 2103 | ||
| 2108 | /* If we are not being polled and there is a kthread, awaken it ... */ | 2104 | /* If we are not being polled and there is a kthread, awaken it ... */ |
| 2109 | t = ACCESS_ONCE(rdp->nocb_kthread); | 2105 | t = ACCESS_ONCE(rdp->nocb_kthread); |
| @@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2120 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2116 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2121 | TPS("WakeEmpty")); | 2117 | TPS("WakeEmpty")); |
| 2122 | } else { | 2118 | } else { |
| 2123 | rdp->nocb_defer_wakeup = true; | 2119 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; |
| 2124 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2120 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2125 | TPS("WakeEmptyIsDeferred")); | 2121 | TPS("WakeEmptyIsDeferred")); |
| 2126 | } | 2122 | } |
| 2127 | rdp->qlen_last_fqs_check = 0; | 2123 | rdp->qlen_last_fqs_check = 0; |
| 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2124 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
| 2129 | /* ... or if many callbacks queued. */ | 2125 | /* ... or if many callbacks queued. */ |
| 2130 | wake_nocb_leader(rdp, true); | 2126 | if (!irqs_disabled_flags(flags)) { |
| 2127 | wake_nocb_leader(rdp, true); | ||
| 2128 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2129 | TPS("WakeOvf")); | ||
| 2130 | } else { | ||
| 2131 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; | ||
| 2132 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2133 | TPS("WakeOvfIsDeferred")); | ||
| 2134 | } | ||
| 2131 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2135 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
| 2132 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
| 2133 | } else { | 2136 | } else { |
| 2134 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | 2137 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); |
| 2135 | } | 2138 | } |
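The enqueue path above no longer wakes the rcuo leader when it cannot do so safely (interrupts disabled): it records how strong the eventual wakeup should be, RCU_NOGP_WAKE for the empty-queue case and RCU_NOGP_WAKE_FORCE for the overflow case, and a later do_nocb_deferred_wakeup() (further down in this file) performs the wake with that recorded strength. The sketch below shows the record-now-act-later idea in isolation; it keeps the strongest pending request, which is a simplification of the kernel's per-branch assignments, and all names are invented.

```c
#include <stdio.h>

/* Wakeup strength, mirroring the RCU_NOGP_WAKE* values (names invented here). */
enum defer_wake { WAKE_NOT = 0, WAKE = 1, WAKE_FORCE = 2 };

static enum defer_wake deferred = WAKE_NOT;

/* Called from a context that must not wake anyone right now. */
static void defer_wakeup(enum defer_wake level)
{
        if (level > deferred)           /* remember the strongest request */
                deferred = level;
}

/* Called later, from a context where waking is safe. */
static void do_deferred_wakeup(void)
{
        enum defer_wake level = deferred;

        if (level == WAKE_NOT)
                return;
        deferred = WAKE_NOT;
        printf("waking leader%s\n", level == WAKE_FORCE ? " (forced)" : "");
}

int main(void)
{
        defer_wakeup(WAKE);             /* queue went non-empty with irqs off */
        defer_wakeup(WAKE_FORCE);       /* then it overflowed */
        do_deferred_wakeup();           /* acts once, at the stronger level */
        do_deferred_wakeup();           /* nothing left to do */
        return 0;
}
```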
| @@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2150 | { | 2153 | { |
| 2151 | 2154 | ||
| 2152 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2155 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
| 2153 | return 0; | 2156 | return false; |
| 2154 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); | 2157 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); |
| 2155 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2158 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
| 2156 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2159 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
| @@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2161 | trace_rcu_callback(rdp->rsp->name, rhp, | 2164 | trace_rcu_callback(rdp->rsp->name, rhp, |
| 2162 | -atomic_long_read(&rdp->nocb_q_count_lazy), | 2165 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2163 | -atomic_long_read(&rdp->nocb_q_count)); | 2166 | -atomic_long_read(&rdp->nocb_q_count)); |
| 2164 | return 1; | 2167 | |
| 2168 | /* | ||
| 2169 | * If called from an extended quiescent state with interrupts | ||
| 2170 | * disabled, invoke the RCU core in order to allow the idle-entry | ||
| 2171 | * deferred-wakeup check to function. | ||
| 2172 | */ | ||
| 2173 | if (irqs_disabled_flags(flags) && | ||
| 2174 | !rcu_is_watching() && | ||
| 2175 | cpu_online(smp_processor_id())) | ||
| 2176 | invoke_rcu_core(); | ||
| 2177 | |||
| 2178 | return true; | ||
| 2165 | } | 2179 | } |
| 2166 | 2180 | ||
| 2167 | /* | 2181 | /* |
| @@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
| 2177 | 2191 | ||
| 2178 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 2192 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
| 2179 | if (!rcu_is_nocb_cpu(smp_processor_id())) | 2193 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
| 2180 | return 0; | 2194 | return false; |
| 2181 | rsp->qlen = 0; | 2195 | rsp->qlen = 0; |
| 2182 | rsp->qlen_lazy = 0; | 2196 | rsp->qlen_lazy = 0; |
| 2183 | 2197 | ||
| @@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
| 2196 | rsp->orphan_nxtlist = NULL; | 2210 | rsp->orphan_nxtlist = NULL; |
| 2197 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | 2211 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; |
| 2198 | } | 2212 | } |
| 2199 | return 1; | 2213 | return true; |
| 2200 | } | 2214 | } |
| 2201 | 2215 | ||
| 2202 | /* | 2216 | /* |
| @@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2229 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); | 2243 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); |
| 2230 | if (likely(d)) | 2244 | if (likely(d)) |
| 2231 | break; | 2245 | break; |
| 2232 | flush_signals(current); | 2246 | WARN_ON(signal_pending(current)); |
| 2233 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); | 2247 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
| 2234 | } | 2248 | } |
| 2235 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); | 2249 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
| @@ -2253,7 +2267,7 @@ wait_again: | |||
| 2253 | if (!rcu_nocb_poll) { | 2267 | if (!rcu_nocb_poll) { |
| 2254 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); | 2268 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); |
| 2255 | wait_event_interruptible(my_rdp->nocb_wq, | 2269 | wait_event_interruptible(my_rdp->nocb_wq, |
| 2256 | ACCESS_ONCE(my_rdp->nocb_leader_wake)); | 2270 | !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); |
| 2257 | /* Memory barrier handled by smp_mb() calls below and repoll. */ | 2271 | /* Memory barrier handled by smp_mb() calls below and repoll. */ |
| 2258 | } else if (firsttime) { | 2272 | } else if (firsttime) { |
| 2259 | firsttime = false; /* Don't drown trace log with "Poll"! */ | 2273 | firsttime = false; /* Don't drown trace log with "Poll"! */ |
| @@ -2288,16 +2302,16 @@ wait_again: | |||
| 2288 | if (!rcu_nocb_poll) | 2302 | if (!rcu_nocb_poll) |
| 2289 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, | 2303 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, |
| 2290 | "WokeEmpty"); | 2304 | "WokeEmpty"); |
| 2291 | flush_signals(current); | 2305 | WARN_ON(signal_pending(current)); |
| 2292 | schedule_timeout_interruptible(1); | 2306 | schedule_timeout_interruptible(1); |
| 2293 | 2307 | ||
| 2294 | /* Rescan in case we were a victim of memory ordering. */ | 2308 | /* Rescan in case we were a victim of memory ordering. */ |
| 2295 | my_rdp->nocb_leader_wake = false; | 2309 | my_rdp->nocb_leader_sleep = true; |
| 2296 | smp_mb(); /* Ensure _wake false before scan. */ | 2310 | smp_mb(); /* Ensure _sleep true before scan. */ |
| 2297 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) | 2311 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) |
| 2298 | if (ACCESS_ONCE(rdp->nocb_head)) { | 2312 | if (ACCESS_ONCE(rdp->nocb_head)) { |
| 2299 | /* Found CB, so short-circuit next wait. */ | 2313 | /* Found CB, so short-circuit next wait. */ |
| 2300 | my_rdp->nocb_leader_wake = true; | 2314 | my_rdp->nocb_leader_sleep = false; |
| 2301 | break; | 2315 | break; |
| 2302 | } | 2316 | } |
| 2303 | goto wait_again; | 2317 | goto wait_again; |
| @@ -2307,17 +2321,17 @@ wait_again: | |||
| 2307 | rcu_nocb_wait_gp(my_rdp); | 2321 | rcu_nocb_wait_gp(my_rdp); |
| 2308 | 2322 | ||
| 2309 | /* | 2323 | /* |
| 2310 | * We left ->nocb_leader_wake set to reduce cache thrashing. | 2324 | * We left ->nocb_leader_sleep unset to reduce cache thrashing. |
| 2311 | * We clear it now, but recheck for new callbacks while | 2325 | * We set it now, but recheck for new callbacks while |
| 2312 | * traversing our follower list. | 2326 | * traversing our follower list. |
| 2313 | */ | 2327 | */ |
| 2314 | my_rdp->nocb_leader_wake = false; | 2328 | my_rdp->nocb_leader_sleep = true; |
| 2315 | smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ | 2329 | smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ |
| 2316 | 2330 | ||
| 2317 | /* Each pass through the following loop wakes a follower, if needed. */ | 2331 | /* Each pass through the following loop wakes a follower, if needed. */ |
| 2318 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | 2332 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { |
| 2319 | if (ACCESS_ONCE(rdp->nocb_head)) | 2333 | if (ACCESS_ONCE(rdp->nocb_head)) |
| 2320 | my_rdp->nocb_leader_wake = true; /* No need to wait. */ | 2334 | my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ |
| 2321 | if (!rdp->nocb_gp_head) | 2335 | if (!rdp->nocb_gp_head) |
| 2322 | continue; /* No CBs, so no need to wake follower. */ | 2336 | continue; /* No CBs, so no need to wake follower. */ |
| 2323 | 2337 | ||
| @@ -2327,6 +2341,7 @@ wait_again: | |||
| 2327 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); | 2341 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); |
| 2328 | atomic_long_add(rdp->nocb_gp_count_lazy, | 2342 | atomic_long_add(rdp->nocb_gp_count_lazy, |
| 2329 | &rdp->nocb_follower_count_lazy); | 2343 | &rdp->nocb_follower_count_lazy); |
| 2344 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ | ||
| 2330 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | 2345 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { |
| 2331 | /* | 2346 | /* |
| 2332 | * List was empty, wake up the follower. | 2347 | * List was empty, wake up the follower. |
| @@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) | |||
| 2367 | if (!rcu_nocb_poll) | 2382 | if (!rcu_nocb_poll) |
| 2368 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2383 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2369 | "WokeEmpty"); | 2384 | "WokeEmpty"); |
| 2370 | flush_signals(current); | 2385 | WARN_ON(signal_pending(current)); |
| 2371 | schedule_timeout_interruptible(1); | 2386 | schedule_timeout_interruptible(1); |
| 2372 | } | 2387 | } |
| 2373 | } | 2388 | } |
| @@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2428 | list = next; | 2443 | list = next; |
| 2429 | } | 2444 | } |
| 2430 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2445 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
| 2431 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | 2446 | ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; |
| 2432 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | 2447 | ACCESS_ONCE(rdp->nocb_p_count_lazy) = |
| 2448 | rdp->nocb_p_count_lazy - cl; | ||
| 2433 | rdp->n_nocbs_invoked += c; | 2449 | rdp->n_nocbs_invoked += c; |
| 2434 | } | 2450 | } |
| 2435 | return 0; | 2451 | return 0; |
| 2436 | } | 2452 | } |
| 2437 | 2453 | ||
| 2438 | /* Is a deferred wakeup of rcu_nocb_kthread() required? */ | 2454 | /* Is a deferred wakeup of rcu_nocb_kthread() required? */ |
| 2439 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | 2455 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) |
| 2440 | { | 2456 | { |
| 2441 | return ACCESS_ONCE(rdp->nocb_defer_wakeup); | 2457 | return ACCESS_ONCE(rdp->nocb_defer_wakeup); |
| 2442 | } | 2458 | } |
| @@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | |||
| 2444 | /* Do a deferred wakeup of rcu_nocb_kthread(). */ | 2460 | /* Do a deferred wakeup of rcu_nocb_kthread(). */ |
| 2445 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | 2461 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) |
| 2446 | { | 2462 | { |
| 2463 | int ndw; | ||
| 2464 | |||
| 2447 | if (!rcu_nocb_need_deferred_wakeup(rdp)) | 2465 | if (!rcu_nocb_need_deferred_wakeup(rdp)) |
| 2448 | return; | 2466 | return; |
| 2449 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; | 2467 | ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); |
| 2450 | wake_nocb_leader(rdp, false); | 2468 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; |
| 2451 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); | 2469 | wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); |
| 2470 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); | ||
| 2471 | } | ||
| 2472 | |||
| 2473 | void __init rcu_init_nohz(void) | ||
| 2474 | { | ||
| 2475 | int cpu; | ||
| 2476 | bool need_rcu_nocb_mask = true; | ||
| 2477 | struct rcu_state *rsp; | ||
| 2478 | |||
| 2479 | #ifdef CONFIG_RCU_NOCB_CPU_NONE | ||
| 2480 | need_rcu_nocb_mask = false; | ||
| 2481 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 2482 | |||
| 2483 | #if defined(CONFIG_NO_HZ_FULL) | ||
| 2484 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) | ||
| 2485 | need_rcu_nocb_mask = true; | ||
| 2486 | #endif /* #if defined(CONFIG_NO_HZ_FULL) */ | ||
| 2487 | |||
| 2488 | if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { | ||
| 2489 | if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { | ||
| 2490 | pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); | ||
| 2491 | return; | ||
| 2492 | } | ||
| 2493 | have_rcu_nocb_mask = true; | ||
| 2494 | } | ||
| 2495 | if (!have_rcu_nocb_mask) | ||
| 2496 | return; | ||
| 2497 | |||
| 2498 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 2499 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 2500 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 2501 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 2502 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 2503 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 2504 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 2505 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 2506 | #if defined(CONFIG_NO_HZ_FULL) | ||
| 2507 | if (tick_nohz_full_running) | ||
| 2508 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | ||
| 2509 | #endif /* #if defined(CONFIG_NO_HZ_FULL) */ | ||
| 2510 | |||
| 2511 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 2512 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 2513 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 2514 | rcu_nocb_mask); | ||
| 2515 | } | ||
| 2516 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
| 2517 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
| 2518 | if (rcu_nocb_poll) | ||
| 2519 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
| 2520 | |||
| 2521 | for_each_rcu_flavor(rsp) { | ||
| 2522 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
| 2523 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2524 | |||
| 2525 | /* | ||
| 2526 | * If there are early callbacks, they will need | ||
| 2527 | * to be moved to the nocb lists. | ||
| 2528 | */ | ||
| 2529 | WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != | ||
| 2530 | &rdp->nxtlist && | ||
| 2531 | rdp->nxttail[RCU_NEXT_TAIL] != NULL); | ||
| 2532 | init_nocb_callback_list(rdp); | ||
| 2533 | } | ||
| 2534 | rcu_organize_nocb_kthreads(rsp); | ||
| 2535 | } | ||
| 2452 | } | 2536 | } |
| 2453 | 2537 | ||
| 2454 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | 2538 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ |
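The new rcu_init_nohz() above centralizes construction of rcu_nocb_mask: allocate it only if some option actually needs it, OR in the CPUs named by the CONFIG_RCU_NOCB_CPU_* options and the nohz_full set, then trim the result to CPUs that can exist, printing a note if the rcu_nocbs= parameter named nonexistent ones. Stripped of the cpumask API, the arithmetic reduces to the standalone sketch below, which uses a 64-bit word as its "cpumask"; every value in it is made up for the example.

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int nr_cpus = 4;                                /* CPUs that exist */
        uint64_t possible = (1ull << nr_cpus) - 1;      /* bits 0..3 */
        uint64_t from_bootparam = 0x13;                 /* "offload CPUs 0,1,4"; 4 is bogus */
        uint64_t from_nohz_full = 0x06;                 /* "nohz_full CPUs 1-2" */
        uint64_t nocb_mask;

        /* Union of everything that asked for callback offloading. */
        nocb_mask = from_bootparam | from_nohz_full;

        /* Trim to CPUs that can exist, as the cpumask_and() call does. */
        if (nocb_mask & ~possible) {
                puts("note: request named nonexistent CPUs, trimming");
                nocb_mask &= possible;
        }

        printf("offloading callbacks for CPU mask 0x%llx\n",
               (unsigned long long)nocb_mask);
        return 0;
}
```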
| @@ -2459,15 +2543,85 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | |||
| 2459 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | 2543 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; |
| 2460 | } | 2544 | } |
| 2461 | 2545 | ||
| 2546 | /* | ||
| 2547 | * If the specified CPU is a no-CBs CPU that does not already have its | ||
| 2548 | * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are | ||
| 2549 | * brought online out of order, this can require re-organizing the | ||
| 2550 | * leader-follower relationships. | ||
| 2551 | */ | ||
| 2552 | static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) | ||
| 2553 | { | ||
| 2554 | struct rcu_data *rdp; | ||
| 2555 | struct rcu_data *rdp_last; | ||
| 2556 | struct rcu_data *rdp_old_leader; | ||
| 2557 | struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); | ||
| 2558 | struct task_struct *t; | ||
| 2559 | |||
| 2560 | /* | ||
| 2561 | * If this isn't a no-CBs CPU or if it already has an rcuo kthread, | ||
| 2562 | * then nothing to do. | ||
| 2563 | */ | ||
| 2564 | if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) | ||
| 2565 | return; | ||
| 2566 | |||
| 2567 | /* If we didn't spawn the leader first, reorganize! */ | ||
| 2568 | rdp_old_leader = rdp_spawn->nocb_leader; | ||
| 2569 | if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { | ||
| 2570 | rdp_last = NULL; | ||
| 2571 | rdp = rdp_old_leader; | ||
| 2572 | do { | ||
| 2573 | rdp->nocb_leader = rdp_spawn; | ||
| 2574 | if (rdp_last && rdp != rdp_spawn) | ||
| 2575 | rdp_last->nocb_next_follower = rdp; | ||
| 2576 | rdp_last = rdp; | ||
| 2577 | rdp = rdp->nocb_next_follower; | ||
| 2578 | rdp_last->nocb_next_follower = NULL; | ||
| 2579 | } while (rdp); | ||
| 2580 | rdp_spawn->nocb_next_follower = rdp_old_leader; | ||
| 2581 | } | ||
| 2582 | |||
| 2583 | /* Spawn the kthread for this CPU and RCU flavor. */ | ||
| 2584 | t = kthread_run(rcu_nocb_kthread, rdp_spawn, | ||
| 2585 | "rcuo%c/%d", rsp->abbr, cpu); | ||
| 2586 | BUG_ON(IS_ERR(t)); | ||
| 2587 | ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; | ||
| 2588 | } | ||
| 2589 | |||
| 2590 | /* | ||
| 2591 | * If the specified CPU is a no-CBs CPU that does not already have its | ||
| 2592 | * rcuo kthreads, spawn them. | ||
| 2593 | */ | ||
| 2594 | static void rcu_spawn_all_nocb_kthreads(int cpu) | ||
| 2595 | { | ||
| 2596 | struct rcu_state *rsp; | ||
| 2597 | |||
| 2598 | if (rcu_scheduler_fully_active) | ||
| 2599 | for_each_rcu_flavor(rsp) | ||
| 2600 | rcu_spawn_one_nocb_kthread(rsp, cpu); | ||
| 2601 | } | ||
| 2602 | |||
| 2603 | /* | ||
| 2604 | * Once the scheduler is running, spawn rcuo kthreads for all online | ||
| 2605 | * no-CBs CPUs. This assumes that the early_initcall()s happen before | ||
| 2606 | * non-boot CPUs come online -- if this changes, we will need to add | ||
| 2607 | * some mutual exclusion. | ||
| 2608 | */ | ||
| 2609 | static void __init rcu_spawn_nocb_kthreads(void) | ||
| 2610 | { | ||
| 2611 | int cpu; | ||
| 2612 | |||
| 2613 | for_each_online_cpu(cpu) | ||
| 2614 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 2615 | } | ||
| 2616 | |||
| 2462 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ | 2617 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ |
| 2463 | static int rcu_nocb_leader_stride = -1; | 2618 | static int rcu_nocb_leader_stride = -1; |
| 2464 | module_param(rcu_nocb_leader_stride, int, 0444); | 2619 | module_param(rcu_nocb_leader_stride, int, 0444); |
| 2465 | 2620 | ||
| 2466 | /* | 2621 | /* |
| 2467 | * Create a kthread for each RCU flavor for each no-CBs CPU. | 2621 | * Initialize leader-follower relationships for all no-CBs CPUs. |
| 2468 | * Also initialize leader-follower relationships. | ||
| 2469 | */ | 2623 | */ |
| 2470 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2624 | static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) |
| 2471 | { | 2625 | { |
| 2472 | int cpu; | 2626 | int cpu; |
| 2473 | int ls = rcu_nocb_leader_stride; | 2627 | int ls = rcu_nocb_leader_stride; |
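rcu_spawn_one_nocb_kthread(), added in the previous hunk, copes with CPUs coming online out of order: if the designated leader has no kthread yet, the CPU being spawned takes over as leader and the old leader's follower chain is re-linked underneath it. The pointer manipulation is easier to follow in the standalone singly-linked-list sketch below; the structure and function names are invented, and the re-linking is a simplified version of the kernel loop, not a copy of it.

```c
#include <stddef.h>
#include <stdio.h>

struct cb_cpu {
        int id;
        struct cb_cpu *leader;          /* who wakes this CPU's kthread */
        struct cb_cpu *next_follower;   /* next CPU in the leader's chain */
};

/*
 * 'spawn' is coming up before its designated leader has a kthread: make
 * 'spawn' the new leader and hang the old leader's chain underneath it.
 */
static void adopt_followers(struct cb_cpu *spawn, struct cb_cpu *old_leader)
{
        struct cb_cpu *p;

        for (p = old_leader; p; p = p->next_follower) {
                p->leader = spawn;                      /* re-parent every node */
                if (p->next_follower == spawn)          /* drop 'spawn' from the old chain */
                        p->next_follower = spawn->next_follower;
        }
        spawn->next_follower = old_leader;              /* old chain becomes the followers */
        spawn->leader = spawn;                          /* a leader leads itself */
}

int main(void)
{
        struct cb_cpu c0 = { .id = 0 }, c1 = { .id = 1 }, c2 = { .id = 2 };
        struct cb_cpu *p;

        /* Planned layout: c0 leads c1 and c2, but c2 comes online first. */
        c0.leader = &c0; c0.next_follower = &c1;
        c1.leader = &c0; c1.next_follower = &c2;
        c2.leader = &c0; c2.next_follower = NULL;

        adopt_followers(&c2, &c0);

        for (p = &c2; p; p = p->next_follower)
                printf("cpu %d, leader %d\n", p->id, p->leader->id);
        return 0;
}
```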
| @@ -2475,14 +2629,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
| 2475 | struct rcu_data *rdp; | 2629 | struct rcu_data *rdp; |
| 2476 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ | 2630 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ |
| 2477 | struct rcu_data *rdp_prev = NULL; | 2631 | struct rcu_data *rdp_prev = NULL; |
| 2478 | struct task_struct *t; | ||
| 2479 | 2632 | ||
| 2480 | if (rcu_nocb_mask == NULL) | 2633 | if (!have_rcu_nocb_mask) |
| 2481 | return; | 2634 | return; |
| 2482 | #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) | ||
| 2483 | if (tick_nohz_full_running) | ||
| 2484 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | ||
| 2485 | #endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ | ||
| 2486 | if (ls == -1) { | 2635 | if (ls == -1) { |
| 2487 | ls = int_sqrt(nr_cpu_ids); | 2636 | ls = int_sqrt(nr_cpu_ids); |
| 2488 | rcu_nocb_leader_stride = ls; | 2637 | rcu_nocb_leader_stride = ls; |
| @@ -2505,27 +2654,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
| 2505 | rdp_prev->nocb_next_follower = rdp; | 2654 | rdp_prev->nocb_next_follower = rdp; |
| 2506 | } | 2655 | } |
| 2507 | rdp_prev = rdp; | 2656 | rdp_prev = rdp; |
| 2508 | |||
| 2509 | /* Spawn the kthread for this CPU. */ | ||
| 2510 | t = kthread_run(rcu_nocb_kthread, rdp, | ||
| 2511 | "rcuo%c/%d", rsp->abbr, cpu); | ||
| 2512 | BUG_ON(IS_ERR(t)); | ||
| 2513 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
| 2514 | } | 2657 | } |
| 2515 | } | 2658 | } |
| 2516 | 2659 | ||
| 2517 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | 2660 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ |
| 2518 | static bool init_nocb_callback_list(struct rcu_data *rdp) | 2661 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
| 2519 | { | 2662 | { |
| 2520 | if (rcu_nocb_mask == NULL || | 2663 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
| 2521 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
| 2522 | return false; | 2664 | return false; |
| 2665 | |||
| 2523 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2666 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
| 2524 | return true; | 2667 | return true; |
| 2525 | } | 2668 | } |
| 2526 | 2669 | ||
| 2527 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2670 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 2528 | 2671 | ||
| 2672 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | ||
| 2673 | { | ||
| 2674 | WARN_ON_ONCE(1); /* Should be dead code. */ | ||
| 2675 | return false; | ||
| 2676 | } | ||
| 2677 | |||
| 2529 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2678 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
| 2530 | { | 2679 | { |
| 2531 | } | 2680 | } |
| @@ -2541,21 +2690,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
| 2541 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2690 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
| 2542 | bool lazy, unsigned long flags) | 2691 | bool lazy, unsigned long flags) |
| 2543 | { | 2692 | { |
| 2544 | return 0; | 2693 | return false; |
| 2545 | } | 2694 | } |
| 2546 | 2695 | ||
| 2547 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 2696 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
| 2548 | struct rcu_data *rdp, | 2697 | struct rcu_data *rdp, |
| 2549 | unsigned long flags) | 2698 | unsigned long flags) |
| 2550 | { | 2699 | { |
| 2551 | return 0; | 2700 | return false; |
| 2552 | } | 2701 | } |
| 2553 | 2702 | ||
| 2554 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2703 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
| 2555 | { | 2704 | { |
| 2556 | } | 2705 | } |
| 2557 | 2706 | ||
| 2558 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | 2707 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) |
| 2559 | { | 2708 | { |
| 2560 | return false; | 2709 | return false; |
| 2561 | } | 2710 | } |
| @@ -2564,7 +2713,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | |||
| 2564 | { | 2713 | { |
| 2565 | } | 2714 | } |
| 2566 | 2715 | ||
| 2567 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2716 | static void rcu_spawn_all_nocb_kthreads(int cpu) |
| 2717 | { | ||
| 2718 | } | ||
| 2719 | |||
| 2720 | static void __init rcu_spawn_nocb_kthreads(void) | ||
| 2568 | { | 2721 | { |
| 2569 | } | 2722 | } |
| 2570 | 2723 | ||
| @@ -2595,16 +2748,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) | |||
| 2595 | 2748 | ||
| 2596 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 2749 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
| 2597 | 2750 | ||
| 2598 | /* | ||
| 2599 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
| 2600 | * most active flavor of RCU. | ||
| 2601 | */ | ||
| 2602 | #ifdef CONFIG_PREEMPT_RCU | ||
| 2603 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
| 2604 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2605 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
| 2606 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2607 | |||
| 2608 | static int full_sysidle_state; /* Current system-idle state. */ | 2751 | static int full_sysidle_state; /* Current system-idle state. */ |
| 2609 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | 2752 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ |
| 2610 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | 2753 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ |
| @@ -2622,6 +2765,10 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | |||
| 2622 | { | 2765 | { |
| 2623 | unsigned long j; | 2766 | unsigned long j; |
| 2624 | 2767 | ||
| 2768 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2769 | if (!tick_nohz_full_enabled()) | ||
| 2770 | return; | ||
| 2771 | |||
| 2625 | /* Adjust nesting, check for fully idle. */ | 2772 | /* Adjust nesting, check for fully idle. */ |
| 2626 | if (irq) { | 2773 | if (irq) { |
| 2627 | rdtp->dynticks_idle_nesting--; | 2774 | rdtp->dynticks_idle_nesting--; |
| @@ -2687,6 +2834,10 @@ void rcu_sysidle_force_exit(void) | |||
| 2687 | */ | 2834 | */ |
| 2688 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | 2835 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) |
| 2689 | { | 2836 | { |
| 2837 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2838 | if (!tick_nohz_full_enabled()) | ||
| 2839 | return; | ||
| 2840 | |||
| 2690 | /* Adjust nesting, check for already non-idle. */ | 2841 | /* Adjust nesting, check for already non-idle. */ |
| 2691 | if (irq) { | 2842 | if (irq) { |
| 2692 | rdtp->dynticks_idle_nesting++; | 2843 | rdtp->dynticks_idle_nesting++; |
| @@ -2741,12 +2892,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2741 | unsigned long j; | 2892 | unsigned long j; |
| 2742 | struct rcu_dynticks *rdtp = rdp->dynticks; | 2893 | struct rcu_dynticks *rdtp = rdp->dynticks; |
| 2743 | 2894 | ||
| 2895 | /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ | ||
| 2896 | if (!tick_nohz_full_enabled()) | ||
| 2897 | return; | ||
| 2898 | |||
| 2744 | /* | 2899 | /* |
| 2745 | * If some other CPU has already reported non-idle, if this is | 2900 | * If some other CPU has already reported non-idle, if this is |
| 2746 | * not the flavor of RCU that tracks sysidle state, or if this | 2901 | * not the flavor of RCU that tracks sysidle state, or if this |
| 2747 | * is an offline or the timekeeping CPU, nothing to do. | 2902 | * is an offline or the timekeeping CPU, nothing to do. |
| 2748 | */ | 2903 | */ |
| 2749 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | 2904 | if (!*isidle || rdp->rsp != rcu_state_p || |
| 2750 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 2905 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) |
| 2751 | return; | 2906 | return; |
| 2752 | if (rcu_gp_in_progress(rdp->rsp)) | 2907 | if (rcu_gp_in_progress(rdp->rsp)) |
| @@ -2772,7 +2927,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2772 | */ | 2927 | */ |
| 2773 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | 2928 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) |
| 2774 | { | 2929 | { |
| 2775 | return rsp == rcu_sysidle_state; | 2930 | return rsp == rcu_state_p; |
| 2776 | } | 2931 | } |
| 2777 | 2932 | ||
| 2778 | /* | 2933 | /* |
| @@ -2850,7 +3005,7 @@ static void rcu_sysidle_cancel(void) | |||
| 2850 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | 3005 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, |
| 2851 | unsigned long maxj, bool gpkt) | 3006 | unsigned long maxj, bool gpkt) |
| 2852 | { | 3007 | { |
| 2853 | if (rsp != rcu_sysidle_state) | 3008 | if (rsp != rcu_state_p) |
| 2854 | return; /* Wrong flavor, ignore. */ | 3009 | return; /* Wrong flavor, ignore. */ |
| 2855 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | 3010 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) |
| 2856 | return; /* Running state machine from timekeeping CPU. */ | 3011 | return; /* Running state machine from timekeeping CPU. */ |
| @@ -2867,6 +3022,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | |||
| 2867 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | 3022 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, |
| 2868 | unsigned long maxj) | 3023 | unsigned long maxj) |
| 2869 | { | 3024 | { |
| 3025 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 3026 | if (!tick_nohz_full_enabled()) | ||
| 3027 | return; | ||
| 3028 | |||
| 2870 | rcu_sysidle_report(rsp, isidle, maxj, true); | 3029 | rcu_sysidle_report(rsp, isidle, maxj, true); |
| 2871 | } | 3030 | } |
| 2872 | 3031 | ||
| @@ -2893,7 +3052,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) | |||
| 2893 | 3052 | ||
| 2894 | /* | 3053 | /* |
| 2895 | * Check to see if the system is fully idle, other than the timekeeping CPU. | 3054 | * Check to see if the system is fully idle, other than the timekeeping CPU. |
| 2896 | * The caller must have disabled interrupts. | 3055 | * The caller must have disabled interrupts. This is not intended to be |
| 3056 | * called unless tick_nohz_full_enabled(). | ||
| 2897 | */ | 3057 | */ |
| 2898 | bool rcu_sys_is_idle(void) | 3058 | bool rcu_sys_is_idle(void) |
| 2899 | { | 3059 | { |
| @@ -2919,13 +3079,12 @@ bool rcu_sys_is_idle(void) | |||
| 2919 | 3079 | ||
| 2920 | /* Scan all the CPUs looking for nonidle CPUs. */ | 3080 | /* Scan all the CPUs looking for nonidle CPUs. */ |
| 2921 | for_each_possible_cpu(cpu) { | 3081 | for_each_possible_cpu(cpu) { |
| 2922 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | 3082 | rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
| 2923 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | 3083 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); |
| 2924 | if (!isidle) | 3084 | if (!isidle) |
| 2925 | break; | 3085 | break; |
| 2926 | } | 3086 | } |
| 2927 | rcu_sysidle_report(rcu_sysidle_state, | 3087 | rcu_sysidle_report(rcu_state_p, isidle, maxj, false); |
| 2928 | isidle, maxj, false); | ||
| 2929 | oldrss = rss; | 3088 | oldrss = rss; |
| 2930 | rss = ACCESS_ONCE(full_sysidle_state); | 3089 | rss = ACCESS_ONCE(full_sysidle_state); |
| 2931 | } | 3090 | } |
| @@ -2952,7 +3111,7 @@ bool rcu_sys_is_idle(void) | |||
| 2952 | * provided by the memory allocator. | 3111 | * provided by the memory allocator. |
| 2953 | */ | 3112 | */ |
| 2954 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | 3113 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && |
| 2955 | !rcu_gp_in_progress(rcu_sysidle_state) && | 3114 | !rcu_gp_in_progress(rcu_state_p) && |
| 2956 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | 3115 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) |
| 2957 | call_rcu(&rsh.rh, rcu_sysidle_cb); | 3116 | call_rcu(&rsh.rh, rcu_sysidle_cb); |
| 2958 | return false; | 3117 | return false; |
| @@ -3036,3 +3195,19 @@ static void rcu_bind_gp_kthread(void) | |||
| 3036 | housekeeping_affine(current); | 3195 | housekeeping_affine(current); |
| 3037 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3196 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 3038 | } | 3197 | } |
| 3198 | |||
| 3199 | /* Record the current task on dyntick-idle entry. */ | ||
| 3200 | static void rcu_dynticks_task_enter(void) | ||
| 3201 | { | ||
| 3202 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) | ||
| 3203 | ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); | ||
| 3204 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ | ||
| 3205 | } | ||
| 3206 | |||
| 3207 | /* Record no current task on dyntick-idle exit. */ | ||
| 3208 | static void rcu_dynticks_task_exit(void) | ||
| 3209 | { | ||
| 3210 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) | ||
| 3211 | ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; | ||
| 3212 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ | ||
| 3213 | } | ||
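The two helpers above only publish state: rcu_dynticks_task_enter() records, in current->rcu_tasks_idle_cpu, the CPU on which the task is entering dyntick-idle (or nohz_full userspace), and rcu_dynticks_task_exit() clears it, so the RCU-tasks holdout scan added in update.c below can treat such tasks as quiescent. A minimal sketch of the expected call pattern from the extended-quiescent-state entry/exit path (the example_ names are assumptions; the real callers live elsewhere in this patch):

static void example_eqs_enter(void)
{
	rcu_dynticks_task_enter();	/* publish: this task is now idle here */
	/* ... dynticks counter manipulation, tick shutdown, etc. ... */
}

static void example_eqs_exit(void)
{
	/* ... tick restart, dynticks counter manipulation, etc. ... */
	rcu_dynticks_task_exit();	/* no longer idle from RCU-tasks' view */
}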
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..3ef8ba58694e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -47,6 +47,8 @@ | |||
| 47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
| 48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 50 | #include <linux/kthread.h> | ||
| 51 | #include <linux/tick.h> | ||
| 50 | 52 | ||
| 51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
| 52 | 54 | ||
| @@ -91,7 +93,7 @@ void __rcu_read_unlock(void) | |||
| 91 | barrier(); /* critical section before exit code. */ | 93 | barrier(); /* critical section before exit code. */ |
| 92 | t->rcu_read_lock_nesting = INT_MIN; | 94 | t->rcu_read_lock_nesting = INT_MIN; |
| 93 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 95 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
| 94 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 96 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) |
| 95 | rcu_read_unlock_special(t); | 97 | rcu_read_unlock_special(t); |
| 96 | barrier(); /* ->rcu_read_unlock_special load before assign */ | 98 | barrier(); /* ->rcu_read_unlock_special load before assign */ |
| 97 | t->rcu_read_lock_nesting = 0; | 99 | t->rcu_read_lock_nesting = 0; |
| @@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void) | |||
| 137 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 139 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
| 138 | 140 | ||
| 139 | /** | 141 | /** |
| 142 | * rcu_read_lock_held() - might we be in RCU read-side critical section? | ||
| 143 | * | ||
| 144 | * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU | ||
| 145 | * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, | ||
| 146 | * this assumes we are in an RCU read-side critical section unless it can | ||
| 147 | * prove otherwise. This is useful for debug checks in functions that | ||
| 148 | * require that they be called within an RCU read-side critical section. | ||
| 149 | * | ||
| 150 | * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot | ||
| 151 | * and while lockdep is disabled. | ||
| 152 | * | ||
| 153 | * Note that rcu_read_lock() and the matching rcu_read_unlock() must | ||
| 154 | * occur in the same context, for example, it is illegal to invoke | ||
| 155 | * rcu_read_unlock() in process context if the matching rcu_read_lock() | ||
| 156 | * was invoked from within an irq handler. | ||
| 157 | * | ||
| 158 | * Note that rcu_read_lock() is disallowed if the CPU is either idle or | ||
| 159 | * offline from an RCU perspective, so check for those as well. | ||
| 160 | */ | ||
| 161 | int rcu_read_lock_held(void) | ||
| 162 | { | ||
| 163 | if (!debug_lockdep_rcu_enabled()) | ||
| 164 | return 1; | ||
| 165 | if (!rcu_is_watching()) | ||
| 166 | return 0; | ||
| 167 | if (!rcu_lockdep_current_cpu_online()) | ||
| 168 | return 0; | ||
| 169 | return lock_is_held(&rcu_lock_map); | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(rcu_read_lock_held); | ||
| 172 | |||
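The lockdep-backed rcu_read_lock_held() is usually consumed through rcu_dereference_check(), which splats when the supplied condition is false. A minimal sketch, assuming a hypothetical RCU-protected configuration pointer my_cfg_ptr:

struct my_cfg {
	int threshold;
};
static struct my_cfg __rcu *my_cfg_ptr;

/* Callers must hold rcu_read_lock(); lockdep verifies it. */
static int my_cfg_threshold(void)
{
	struct my_cfg *cfg;

	cfg = rcu_dereference_check(my_cfg_ptr, rcu_read_lock_held());
	return cfg ? cfg->threshold : 0;
}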
| 173 | /** | ||
| 140 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? | 174 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
| 141 | * | 175 | * |
| 142 | * Check for bottom half being disabled, which covers both the | 176 | * Check for bottom half being disabled, which covers both the |
| @@ -347,3 +381,312 @@ static int __init check_cpu_stall_init(void) | |||
| 347 | early_initcall(check_cpu_stall_init); | 381 | early_initcall(check_cpu_stall_init); |
| 348 | 382 | ||
| 349 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 383 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
| 384 | |||
| 385 | #ifdef CONFIG_TASKS_RCU | ||
| 386 | |||
| 387 | /* | ||
| 388 | * Simple variant of RCU whose quiescent states are voluntary context switch, | ||
| 389 | * user-space execution, and idle. As such, grace periods can take one good | ||
| 390 | * long time. There are no read-side primitives similar to rcu_read_lock() | ||
| 391 | * and rcu_read_unlock() because this implementation is intended to get | ||
| 392 | * the system into a safe state for some of the manipulations involved in | ||
| 393 | * tracing and the like. Finally, this implementation does not support | ||
| 394 | * high call_rcu_tasks() rates from multiple CPUs. If this is required, | ||
| 395 | * per-CPU callback lists will be needed. | ||
| 396 | */ | ||
| 397 | |||
| 398 | /* Global list of callbacks and associated lock. */ | ||
| 399 | static struct rcu_head *rcu_tasks_cbs_head; | ||
| 400 | static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; | ||
| 401 | static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); | ||
| 402 | static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); | ||
| 403 | |||
| 404 | /* Track exiting tasks in order to allow them to be waited for. */ | ||
| 405 | DEFINE_SRCU(tasks_rcu_exit_srcu); | ||
| 406 | |||
| 407 | /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ | ||
| 408 | static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; | ||
| 409 | module_param(rcu_task_stall_timeout, int, 0644); | ||
| 410 | |||
| 411 | static void rcu_spawn_tasks_kthread(void); | ||
| 412 | |||
| 413 | /* | ||
| 414 | * Post an RCU-tasks callback. First call must be from process context | ||
| 415 | * after the scheduler if fully operational. | ||
| 416 | */ | ||
| 417 | void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) | ||
| 418 | { | ||
| 419 | unsigned long flags; | ||
| 420 | bool needwake; | ||
| 421 | |||
| 422 | rhp->next = NULL; | ||
| 423 | rhp->func = func; | ||
| 424 | raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); | ||
| 425 | needwake = !rcu_tasks_cbs_head; | ||
| 426 | *rcu_tasks_cbs_tail = rhp; | ||
| 427 | rcu_tasks_cbs_tail = &rhp->next; | ||
| 428 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | ||
| 429 | if (needwake) { | ||
| 430 | rcu_spawn_tasks_kthread(); | ||
| 431 | wake_up(&rcu_tasks_cbs_wq); | ||
| 432 | } | ||
| 433 | } | ||
| 434 | EXPORT_SYMBOL_GPL(call_rcu_tasks); | ||
| 435 | |||
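As with call_rcu(), the callback embeds a struct rcu_head in the object being retired. A hedged sketch of the kind of user the comment has in mind, a hypothetical trampoline descriptor that must not be freed while a preempted task might still be executing inside it:

struct my_tramp {
	void *text;			/* trampoline text tasks may sit in */
	struct rcu_head rh;
};

static void my_tramp_free_cb(struct rcu_head *rhp)
{
	struct my_tramp *tp = container_of(rhp, struct my_tramp, rh);

	/* Every task has since passed through a voluntary context
	 * switch, userspace, or idle, so nobody is inside tp->text. */
	kfree(tp->text);
	kfree(tp);
}

/* Called after the trampoline has been unlinked from all call sites. */
static void my_tramp_release(struct my_tramp *tp)
{
	call_rcu_tasks(&tp->rh, my_tramp_free_cb);
}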
| 436 | /** | ||
| 437 | * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. | ||
| 438 | * | ||
| 439 | * Control will return to the caller some time after a full rcu-tasks | ||
| 440 | * grace period has elapsed, in other words after all currently | ||
| 441 | * executing rcu-tasks read-side critical sections have elapsed. These | ||
| 442 | * read-side critical sections are delimited by calls to schedule(), | ||
| 443 | * cond_resched_rcu_qs(), idle execution, userspace execution, calls | ||
| 444 | * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). | ||
| 445 | * | ||
| 446 | * This is a very specialized primitive, intended only for a few uses in | ||
| 447 | * tracing and other situations requiring manipulation of function | ||
| 448 | * preambles and profiling hooks. The synchronize_rcu_tasks() function | ||
| 449 | * is not (yet) intended for heavy use from multiple CPUs. | ||
| 450 | * | ||
| 451 | * Note that this guarantee implies further memory-ordering guarantees. | ||
| 452 | * On systems with more than one CPU, when synchronize_rcu_tasks() returns, | ||
| 453 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
| 454 | * end of its last RCU-tasks read-side critical section whose beginning | ||
| 455 | * preceded the call to synchronize_rcu_tasks(). In addition, each CPU | ||
| 456 | * having an RCU-tasks read-side critical section that extends beyond | ||
| 457 | * the return from synchronize_rcu_tasks() is guaranteed to have executed | ||
| 458 | * a full memory barrier after the beginning of synchronize_rcu_tasks() | ||
| 459 | * and before the beginning of that RCU-tasks read-side critical section. | ||
| 460 | * Note that these guarantees include CPUs that are offline, idle, or | ||
| 461 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
| 462 | * | ||
| 463 | * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned | ||
| 464 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
| 465 | * to have executed a full memory barrier during the execution of | ||
| 466 | * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU | ||
| 467 | * (but again only if the system has more than one CPU). | ||
| 468 | */ | ||
| 469 | void synchronize_rcu_tasks(void) | ||
| 470 | { | ||
| 471 | /* Complain if the scheduler has not started. */ | ||
| 472 | rcu_lockdep_assert(rcu_scheduler_active, | ||
| 473 | "synchronize_rcu_tasks called too soon"); | ||
| 474 | |||
| 475 | /* Wait for the grace period. */ | ||
| 476 | wait_rcu_gp(call_rcu_tasks); | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); | ||
| 479 | |||
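When the caller can sleep, the synchronous form keeps teardown linear. A hedged sketch of the tracing-style use the comment describes, reusing the hypothetical my_tramp descriptor from the call_rcu_tasks() sketch above (example_unpatch_call_sites() is an assumed helper):

static void example_remove_trampoline(struct my_tramp *tp)
{
	example_unpatch_call_sites(tp);	/* no new tasks can enter the text */
	synchronize_rcu_tasks();	/* preempted stragglers have left it */
	kfree(tp->text);
	kfree(tp);
}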
| 480 | /** | ||
| 481 | * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. | ||
| 482 | * | ||
| 483 | * Although the current implementation is guaranteed to wait, it is not | ||
| 484 | * obligated to, for example, if there are no pending callbacks. | ||
| 485 | */ | ||
| 486 | void rcu_barrier_tasks(void) | ||
| 487 | { | ||
| 488 | /* There is only one callback queue, so this is easy. ;-) */ | ||
| 489 | synchronize_rcu_tasks(); | ||
| 490 | } | ||
| 491 | EXPORT_SYMBOL_GPL(rcu_barrier_tasks); | ||
| 492 | |||
| 493 | /* See if tasks are still holding out, complain if so. */ | ||
| 494 | static void check_holdout_task(struct task_struct *t, | ||
| 495 | bool needreport, bool *firstreport) | ||
| 496 | { | ||
| 497 | int cpu; | ||
| 498 | |||
| 499 | if (!ACCESS_ONCE(t->rcu_tasks_holdout) || | ||
| 500 | t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || | ||
| 501 | !ACCESS_ONCE(t->on_rq) || | ||
| 502 | (IS_ENABLED(CONFIG_NO_HZ_FULL) && | ||
| 503 | !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { | ||
| 504 | ACCESS_ONCE(t->rcu_tasks_holdout) = false; | ||
| 505 | list_del_init(&t->rcu_tasks_holdout_list); | ||
| 506 | put_task_struct(t); | ||
| 507 | return; | ||
| 508 | } | ||
| 509 | if (!needreport) | ||
| 510 | return; | ||
| 511 | if (*firstreport) { | ||
| 512 | pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); | ||
| 513 | *firstreport = false; | ||
| 514 | } | ||
| 515 | cpu = task_cpu(t); | ||
| 516 | pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", | ||
| 517 | t, ".I"[is_idle_task(t)], | ||
| 518 | "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], | ||
| 519 | t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, | ||
| 520 | t->rcu_tasks_idle_cpu, cpu); | ||
| 521 | sched_show_task(t); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ | ||
| 525 | static int __noreturn rcu_tasks_kthread(void *arg) | ||
| 526 | { | ||
| 527 | unsigned long flags; | ||
| 528 | struct task_struct *g, *t; | ||
| 529 | unsigned long lastreport; | ||
| 530 | struct rcu_head *list; | ||
| 531 | struct rcu_head *next; | ||
| 532 | LIST_HEAD(rcu_tasks_holdouts); | ||
| 533 | |||
| 534 | /* FIXME: Add housekeeping affinity. */ | ||
| 535 | |||
| 536 | /* | ||
| 537 | * Each pass through the following loop makes one check for | ||
| 538 | * newly arrived callbacks, and, if there are some, waits for | ||
| 539 | * one RCU-tasks grace period and then invokes the callbacks. | ||
| 540 | * This loop is terminated by the system going down. ;-) | ||
| 541 | */ | ||
| 542 | for (;;) { | ||
| 543 | |||
| 544 | /* Pick up any new callbacks. */ | ||
| 545 | raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); | ||
| 546 | list = rcu_tasks_cbs_head; | ||
| 547 | rcu_tasks_cbs_head = NULL; | ||
| 548 | rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; | ||
| 549 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | ||
| 550 | |||
| 551 | /* If there were none, wait a bit and start over. */ | ||
| 552 | if (!list) { | ||
| 553 | wait_event_interruptible(rcu_tasks_cbs_wq, | ||
| 554 | rcu_tasks_cbs_head); | ||
| 555 | if (!rcu_tasks_cbs_head) { | ||
| 556 | WARN_ON(signal_pending(current)); | ||
| 557 | schedule_timeout_interruptible(HZ/10); | ||
| 558 | } | ||
| 559 | continue; | ||
| 560 | } | ||
| 561 | |||
| 562 | /* | ||
| 563 | * Wait for all pre-existing t->on_rq and t->nvcsw | ||
| 564 | * transitions to complete. Invoking synchronize_sched() | ||
| 565 | * suffices because all these transitions occur with | ||
| 566 | * interrupts disabled. Without this synchronize_sched(), | ||
| 567 | * a read-side critical section that started before the | ||
| 568 | * grace period might be incorrectly seen as having started | ||
| 569 | * after the grace period. | ||
| 570 | * | ||
| 571 | * This synchronize_sched() also dispenses with the | ||
| 572 | * need for a memory barrier on the first store to | ||
| 573 | * ->rcu_tasks_holdout, as it forces the store to happen | ||
| 574 | * after the beginning of the grace period. | ||
| 575 | */ | ||
| 576 | synchronize_sched(); | ||
| 577 | |||
| 578 | /* | ||
| 579 | * There were callbacks, so we need to wait for an | ||
| 580 | * RCU-tasks grace period. Start off by scanning | ||
| 581 | * the task list for tasks that are not already | ||
| 582 | * voluntarily blocked. Mark these tasks and make | ||
| 583 | * a list of them in rcu_tasks_holdouts. | ||
| 584 | */ | ||
| 585 | rcu_read_lock(); | ||
| 586 | for_each_process_thread(g, t) { | ||
| 587 | if (t != current && ACCESS_ONCE(t->on_rq) && | ||
| 588 | !is_idle_task(t)) { | ||
| 589 | get_task_struct(t); | ||
| 590 | t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); | ||
| 591 | ACCESS_ONCE(t->rcu_tasks_holdout) = true; | ||
| 592 | list_add(&t->rcu_tasks_holdout_list, | ||
| 593 | &rcu_tasks_holdouts); | ||
| 594 | } | ||
| 595 | } | ||
| 596 | rcu_read_unlock(); | ||
| 597 | |||
| 598 | /* | ||
| 599 | * Wait for tasks that are in the process of exiting. | ||
| 600 | * This does only part of the job, ensuring that all | ||
| 601 | * tasks that were previously exiting reach the point | ||
| 602 | * where they have disabled preemption, allowing the | ||
| 603 | * later synchronize_sched() to finish the job. | ||
| 604 | */ | ||
| 605 | synchronize_srcu(&tasks_rcu_exit_srcu); | ||
| 606 | |||
| 607 | /* | ||
| 608 | * Each pass through the following loop scans the list | ||
| 609 | * of holdout tasks, removing any that are no longer | ||
| 610 | * holdouts. When the list is empty, we are done. | ||
| 611 | */ | ||
| 612 | lastreport = jiffies; | ||
| 613 | while (!list_empty(&rcu_tasks_holdouts)) { | ||
| 614 | bool firstreport; | ||
| 615 | bool needreport; | ||
| 616 | int rtst; | ||
| 617 | struct task_struct *t1; | ||
| 618 | |||
| 619 | schedule_timeout_interruptible(HZ); | ||
| 620 | rtst = ACCESS_ONCE(rcu_task_stall_timeout); | ||
| 621 | needreport = rtst > 0 && | ||
| 622 | time_after(jiffies, lastreport + rtst); | ||
| 623 | if (needreport) | ||
| 624 | lastreport = jiffies; | ||
| 625 | firstreport = true; | ||
| 626 | WARN_ON(signal_pending(current)); | ||
| 627 | list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, | ||
| 628 | rcu_tasks_holdout_list) { | ||
| 629 | check_holdout_task(t, needreport, &firstreport); | ||
| 630 | cond_resched(); | ||
| 631 | } | ||
| 632 | } | ||
| 633 | |||
| 634 | /* | ||
| 635 | * Because ->on_rq and ->nvcsw are not guaranteed | ||
| 636 | * to have full memory barriers prior to them in the | ||
| 637 | * schedule() path, memory reordering on other CPUs could | ||
| 638 | * cause their RCU-tasks read-side critical sections to | ||
| 639 | * extend past the end of the grace period. However, | ||
| 640 | * because these ->nvcsw updates are carried out with | ||
| 641 | * interrupts disabled, we can use synchronize_sched() | ||
| 642 | * to force the needed ordering on all such CPUs. | ||
| 643 | * | ||
| 644 | * This synchronize_sched() also confines all | ||
| 645 | * ->rcu_tasks_holdout accesses to be within the grace | ||
| 646 | * period, avoiding the need for memory barriers for | ||
| 647 | * ->rcu_tasks_holdout accesses. | ||
| 648 | * | ||
| 649 | * In addition, this synchronize_sched() waits for exiting | ||
| 650 | * tasks to complete their final preempt_disable() region | ||
| 651 | * of execution, cleaning up after the synchronize_srcu() | ||
| 652 | * above. | ||
| 653 | */ | ||
| 654 | synchronize_sched(); | ||
| 655 | |||
| 656 | /* Invoke the callbacks. */ | ||
| 657 | while (list) { | ||
| 658 | next = list->next; | ||
| 659 | local_bh_disable(); | ||
| 660 | list->func(list); | ||
| 661 | local_bh_enable(); | ||
| 662 | list = next; | ||
| 663 | cond_resched(); | ||
| 664 | } | ||
| 665 | schedule_timeout_uninterruptible(HZ/10); | ||
| 666 | } | ||
| 667 | } | ||
| 668 | |||
| 669 | /* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ | ||
| 670 | static void rcu_spawn_tasks_kthread(void) | ||
| 671 | { | ||
| 672 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); | ||
| 673 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
| 674 | struct task_struct *t; | ||
| 675 | |||
| 676 | if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { | ||
| 677 | smp_mb(); /* Ensure caller sees full kthread. */ | ||
| 678 | return; | ||
| 679 | } | ||
| 680 | mutex_lock(&rcu_tasks_kthread_mutex); | ||
| 681 | if (rcu_tasks_kthread_ptr) { | ||
| 682 | mutex_unlock(&rcu_tasks_kthread_mutex); | ||
| 683 | return; | ||
| 684 | } | ||
| 685 | t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); | ||
| 686 | BUG_ON(IS_ERR(t)); | ||
| 687 | smp_mb(); /* Ensure others see full kthread. */ | ||
| 688 | ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; | ||
| 689 | mutex_unlock(&rcu_tasks_kthread_mutex); | ||
| 690 | } | ||
| 691 | |||
| 692 | #endif /* #ifdef CONFIG_TASKS_RCU */ | ||
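Since the callback list is global, rcu_barrier_tasks() can simply piggyback on a full grace period; the classic caller is module unload, which must not return while callbacks can still reference module text. A hedged sketch of that pattern (module and helper names are assumptions):

static void __exit my_tracer_exit(void)
{
	example_unregister_all_probes();  /* assumed: stop posting callbacks */
	rcu_barrier_tasks();		  /* flush queued my_tramp_free_cb()s */
	/* Nothing can reference this module's code or data any more. */
}
module_exit(my_tracer_exit);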
diff --git a/kernel/reboot.c b/kernel/reboot.c index a3a9e240fcdb..5925f5ae8dff 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb) | |||
| 104 | } | 104 | } |
| 105 | EXPORT_SYMBOL(unregister_reboot_notifier); | 105 | EXPORT_SYMBOL(unregister_reboot_notifier); |
| 106 | 106 | ||
| 107 | /* | ||
| 108 | * Notifier list for kernel code which wants to be called | ||
| 109 | * to restart the system. | ||
| 110 | */ | ||
| 111 | static ATOMIC_NOTIFIER_HEAD(restart_handler_list); | ||
| 112 | |||
| 113 | /** | ||
| 114 | * register_restart_handler - Register function to be called to reset | ||
| 115 | * the system | ||
| 116 | * @nb: Info about handler function to be called | ||
| 117 | * @nb->priority: Handler priority. Handlers should follow the | ||
| 118 | * following guidelines for setting priorities. | ||
| 119 | * 0: Restart handler of last resort, | ||
| 120 | * with limited restart capabilities | ||
| 121 | * 128: Default restart handler; use if no other | ||
| 122 | * restart handler is expected to be available, | ||
| 123 | * and/or if restart functionality is | ||
| 124 | * sufficient to restart the entire system | ||
| 125 | * 255: Highest priority restart handler, will | ||
| 126 | * preempt all other restart handlers | ||
| 127 | * | ||
| 128 | * Registers a function with code to be called to restart the | ||
| 129 | * system. | ||
| 130 | * | ||
| 131 | * Registered functions will be called from machine_restart as the last | ||
| 132 | * step of the restart sequence (if the architecture-specific | ||
| 133 | * machine_restart function calls do_kernel_restart - see below | ||
| 134 | * for details). | ||
| 135 | * Registered functions are expected to restart the system immediately. | ||
| 136 | * If more than one function is registered, the restart handler priority | ||
| 137 | * selects which function will be called first. | ||
| 138 | * | ||
| 139 | * Restart handlers are expected to be registered from non-architecture | ||
| 140 | * code, typically from drivers. A typical use case would be a system | ||
| 141 | * where restart functionality is provided through a watchdog. Multiple | ||
| 142 | * restart handlers may exist; for example, one restart handler might | ||
| 143 | * restart the entire system, while another only restarts the CPU. | ||
| 144 | * In such cases, the restart handler which only restarts part of the | ||
| 145 | * hardware is expected to register with low priority to ensure that | ||
| 146 | * it only runs if no other means to restart the system is available. | ||
| 147 | * | ||
| 148 | * Currently always returns zero, as atomic_notifier_chain_register() | ||
| 149 | * always returns zero. | ||
| 150 | */ | ||
| 151 | int register_restart_handler(struct notifier_block *nb) | ||
| 152 | { | ||
| 153 | return atomic_notifier_chain_register(&restart_handler_list, nb); | ||
| 154 | } | ||
| 155 | EXPORT_SYMBOL(register_restart_handler); | ||
| 156 | |||
| 157 | /** | ||
| 158 | * unregister_restart_handler - Unregister previously registered | ||
| 159 | * restart handler | ||
| 160 | * @nb: Hook to be unregistered | ||
| 161 | * | ||
| 162 | * Unregisters a previously registered restart handler function. | ||
| 163 | * | ||
| 164 | * Returns zero on success, or %-ENOENT on failure. | ||
| 165 | */ | ||
| 166 | int unregister_restart_handler(struct notifier_block *nb) | ||
| 167 | { | ||
| 168 | return atomic_notifier_chain_unregister(&restart_handler_list, nb); | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL(unregister_restart_handler); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * do_kernel_restart - Execute kernel restart handler call chain | ||
| 174 | * | ||
| 175 | * Calls functions registered with register_restart_handler. | ||
| 176 | * | ||
| 177 | * Expected to be called from machine_restart as the last step of the restart | ||
| 178 | * sequence. | ||
| 179 | * | ||
| 180 | * Restarts the system immediately if a restart handler function has been | ||
| 181 | * registered. Otherwise does nothing. | ||
| 182 | */ | ||
| 183 | void do_kernel_restart(char *cmd) | ||
| 184 | { | ||
| 185 | atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); | ||
| 186 | } | ||
| 187 | |||
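A restart handler is a plain atomic-notifier callback: the unsigned long argument carries reboot_mode and the void pointer carries the command string passed to do_kernel_restart(). A minimal sketch of a watchdog-style handler at the default priority (the example_ device helpers are assumptions):

static int example_wdt_restart(struct notifier_block *nb,
			       unsigned long mode, void *cmd)
{
	example_wdt_force_reset();	/* assumed: arm watchdog with minimal timeout */
	mdelay(1000);			/* give the reset time to take effect */
	return NOTIFY_DONE;
}

static struct notifier_block example_wdt_restart_nb = {
	.notifier_call	= example_wdt_restart,
	.priority	= 128,		/* default: can restart the whole system */
};

static int example_wdt_probe(struct platform_device *pdev)
{
	/* ... map registers, set up the watchdog ... */
	return register_restart_handler(&example_wdt_restart_nb);
}

The matching remove() path would call unregister_restart_handler(&example_wdt_restart_nb).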
| 107 | void migrate_to_reboot_cpu(void) | 188 | void migrate_to_reboot_cpu(void) |
| 108 | { | 189 | { |
| 109 | /* The boot cpu is always logical cpu 0 */ | 190 | /* The boot cpu is always logical cpu 0 */ |
diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..0bcebffc4e77 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); | |||
| 59 | static struct resource *bootmem_resource_free; | 59 | static struct resource *bootmem_resource_free; |
| 60 | static DEFINE_SPINLOCK(bootmem_resource_lock); | 60 | static DEFINE_SPINLOCK(bootmem_resource_lock); |
| 61 | 61 | ||
| 62 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 62 | static struct resource *next_resource(struct resource *p, bool sibling_only) |
| 63 | { | 63 | { |
| 64 | struct resource *p = v; | 64 | /* Caller wants to traverse through siblings only */ |
| 65 | (*pos)++; | 65 | if (sibling_only) |
| 66 | return p->sibling; | ||
| 67 | |||
| 66 | if (p->child) | 68 | if (p->child) |
| 67 | return p->child; | 69 | return p->child; |
| 68 | while (!p->sibling && p->parent) | 70 | while (!p->sibling && p->parent) |
| @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 70 | return p->sibling; | 72 | return p->sibling; |
| 71 | } | 73 | } |
| 72 | 74 | ||
| 75 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 76 | { | ||
| 77 | struct resource *p = v; | ||
| 78 | (*pos)++; | ||
| 79 | return (void *)next_resource(p, false); | ||
| 80 | } | ||
| 81 | |||
| 73 | #ifdef CONFIG_PROC_FS | 82 | #ifdef CONFIG_PROC_FS |
| 74 | 83 | ||
| 75 | enum { MAX_IORES_LEVEL = 5 }; | 84 | enum { MAX_IORES_LEVEL = 5 }; |
| @@ -322,16 +331,19 @@ int release_resource(struct resource *old) | |||
| 322 | 331 | ||
| 323 | EXPORT_SYMBOL(release_resource); | 332 | EXPORT_SYMBOL(release_resource); |
| 324 | 333 | ||
| 325 | #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) | ||
| 326 | /* | 334 | /* |
| 327 | * Finds the lowest memory resource that exists within [res->start.res->end) | 335 | * Finds the lowest iomem resource that exists within [res->start.res->end) |
| 328 | * the caller must specify res->start, res->end, res->flags and "name". | 336 | * the caller must specify res->start, res->end, res->flags and "name". |
| 329 | * If found, returns 0, res is overwritten, if not found, returns -1. | 337 | * If found, returns 0, res is overwritten, if not found, returns -1. |
| 338 | * This walks through the whole tree, not just the first-level children, | ||
| 339 | * unless first_level_children_only is true. | ||
| 330 | */ | 340 | */ |
| 331 | static int find_next_system_ram(struct resource *res, char *name) | 341 | static int find_next_iomem_res(struct resource *res, char *name, |
| 342 | bool first_level_children_only) | ||
| 332 | { | 343 | { |
| 333 | resource_size_t start, end; | 344 | resource_size_t start, end; |
| 334 | struct resource *p; | 345 | struct resource *p; |
| 346 | bool sibling_only = false; | ||
| 335 | 347 | ||
| 336 | BUG_ON(!res); | 348 | BUG_ON(!res); |
| 337 | 349 | ||
| @@ -339,9 +351,12 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
| 339 | end = res->end; | 351 | end = res->end; |
| 340 | BUG_ON(start >= end); | 352 | BUG_ON(start >= end); |
| 341 | 353 | ||
| 354 | if (first_level_children_only) | ||
| 355 | sibling_only = true; | ||
| 356 | |||
| 342 | read_lock(&resource_lock); | 357 | read_lock(&resource_lock); |
| 343 | for (p = iomem_resource.child; p ; p = p->sibling) { | 358 | |
| 344 | /* system ram is just marked as IORESOURCE_MEM */ | 359 | for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { |
| 345 | if (p->flags != res->flags) | 360 | if (p->flags != res->flags) |
| 346 | continue; | 361 | continue; |
| 347 | if (name && strcmp(p->name, name)) | 362 | if (name && strcmp(p->name, name)) |
| @@ -353,6 +368,7 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
| 353 | if ((p->end >= start) && (p->start < end)) | 368 | if ((p->end >= start) && (p->start < end)) |
| 354 | break; | 369 | break; |
| 355 | } | 370 | } |
| 371 | |||
| 356 | read_unlock(&resource_lock); | 372 | read_unlock(&resource_lock); |
| 357 | if (!p) | 373 | if (!p) |
| 358 | return -1; | 374 | return -1; |
| @@ -365,6 +381,70 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
| 365 | } | 381 | } |
| 366 | 382 | ||
| 367 | /* | 383 | /* |
| 384 | * Walks through iomem resources and calls func() with matching resource | ||
| 385 | * ranges. This walks through the whole tree, not just the first-level children. | ||
| 386 | * All the memory ranges which overlap start,end and also match flags and | ||
| 387 | * name are valid candidates. | ||
| 388 | * | ||
| 389 | * @name: name of resource | ||
| 390 | * @flags: resource flags | ||
| 391 | * @start: start addr | ||
| 392 | * @end: end addr | ||
| 393 | */ | ||
| 394 | int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, | ||
| 395 | void *arg, int (*func)(u64, u64, void *)) | ||
| 396 | { | ||
| 397 | struct resource res; | ||
| 398 | u64 orig_end; | ||
| 399 | int ret = -1; | ||
| 400 | |||
| 401 | res.start = start; | ||
| 402 | res.end = end; | ||
| 403 | res.flags = flags; | ||
| 404 | orig_end = res.end; | ||
| 405 | while ((res.start < res.end) && | ||
| 406 | (!find_next_iomem_res(&res, name, false))) { | ||
| 407 | ret = (*func)(res.start, res.end, arg); | ||
| 408 | if (ret) | ||
| 409 | break; | ||
| 410 | res.start = res.end + 1; | ||
| 411 | res.end = orig_end; | ||
| 412 | } | ||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | ||
| 417 | * This function calls callback against all memory range of "System RAM" | ||
| 418 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. | ||
| 419 | * Now, this function is only for "System RAM". It deals with full | ||
| 420 | * ranges rather than pfns; if resources are not pfn-aligned, converting | ||
| 421 | * to pfns can truncate ranges. | ||
| 422 | */ | ||
| 423 | int walk_system_ram_res(u64 start, u64 end, void *arg, | ||
| 424 | int (*func)(u64, u64, void *)) | ||
| 425 | { | ||
| 426 | struct resource res; | ||
| 427 | u64 orig_end; | ||
| 428 | int ret = -1; | ||
| 429 | |||
| 430 | res.start = start; | ||
| 431 | res.end = end; | ||
| 432 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
| 433 | orig_end = res.end; | ||
| 434 | while ((res.start < res.end) && | ||
| 435 | (!find_next_iomem_res(&res, "System RAM", true))) { | ||
| 436 | ret = (*func)(res.start, res.end, arg); | ||
| 437 | if (ret) | ||
| 438 | break; | ||
| 439 | res.start = res.end + 1; | ||
| 440 | res.end = orig_end; | ||
| 441 | } | ||
| 442 | return ret; | ||
| 443 | } | ||
| 444 | |||
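Both walkers hand each matching range to the callback as inclusive physical addresses [start, end] plus the opaque arg, and stop as soon as the callback returns nonzero. A minimal sketch that totals System RAM with the new helper (the function names are illustrative):

static int example_count_ram(u64 start, u64 end, void *arg)
{
	u64 *total = arg;

	*total += end - start + 1;	/* ranges are inclusive */
	return 0;			/* nonzero would abort the walk */
}

static u64 example_total_system_ram(void)
{
	u64 total = 0;

	walk_system_ram_res(0, ULLONG_MAX, &total, example_count_ram);
	return total;
}

walk_iomem_res() is the same loop with caller-supplied name and flags, and it descends into child resources rather than stopping at the first level.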
| 445 | #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) | ||
| 446 | |||
| 447 | /* | ||
| 368 | * This function calls callback against all memory range of "System RAM" | 448 | * This function calls callback against all memory range of "System RAM" |
| 369 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. | 449 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. |
| 370 | * Now, this function is only for "System RAM". | 450 | * Now, this function is only for "System RAM". |
| @@ -382,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 382 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 462 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
| 383 | orig_end = res.end; | 463 | orig_end = res.end; |
| 384 | while ((res.start < res.end) && | 464 | while ((res.start < res.end) && |
| 385 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 465 | (find_next_iomem_res(&res, "System RAM", true) >= 0)) { |
| 386 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; | 466 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 387 | end_pfn = (res.end + 1) >> PAGE_SHIFT; | 467 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
| 388 | if (end_pfn > pfn) | 468 | if (end_pfn > pfn) |
| @@ -411,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn) | |||
| 411 | } | 491 | } |
| 412 | EXPORT_SYMBOL_GPL(page_is_ram); | 492 | EXPORT_SYMBOL_GPL(page_is_ram); |
| 413 | 493 | ||
| 494 | /* | ||
| 495 | * Search for a resource entry that fully contains the specified region. | ||
| 496 | * If found, return 1 if it is RAM, 0 if not. | ||
| 497 | * If not found, or if the region is not fully contained, return -1. | ||
| 498 | * | ||
| 499 | * Used by the ioremap functions to ensure the user is not remapping RAM; it is | ||
| 500 | * a vast speedup over walking through the resource table page by page. | ||
| 501 | */ | ||
| 502 | int region_is_ram(resource_size_t start, unsigned long size) | ||
| 503 | { | ||
| 504 | struct resource *p; | ||
| 505 | resource_size_t end = start + size - 1; | ||
| 506 | int flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
| 507 | const char *name = "System RAM"; | ||
| 508 | int ret = -1; | ||
| 509 | |||
| 510 | read_lock(&resource_lock); | ||
| 511 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
| 512 | if (end < p->start) | ||
| 513 | continue; | ||
| 514 | |||
| 515 | if (p->start <= start && end <= p->end) { | ||
| 516 | /* resource fully contains region */ | ||
| 517 | if ((p->flags != flags) || strcmp(p->name, name)) | ||
| 518 | ret = 0; | ||
| 519 | else | ||
| 520 | ret = 1; | ||
| 521 | break; | ||
| 522 | } | ||
| 523 | if (p->end < start) | ||
| 524 | break; /* not found */ | ||
| 525 | } | ||
| 526 | read_unlock(&resource_lock); | ||
| 527 | return ret; | ||
| 528 | } | ||
| 529 | |||
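The tri-state return is the point: 1 means the region lies entirely inside a busy "System RAM" resource, 0 means it lies entirely inside some other resource, and -1 means no single resource contains it. A hedged sketch of the ioremap-style guard the comment alludes to (the wrapper name is an assumption):

static void __iomem *example_ioremap_checked(resource_size_t phys,
					     unsigned long size)
{
	/* Refuse only when the region is known to be RAM; -1 (unknown)
	 * would fall back to slower page-by-page checks elsewhere. */
	if (region_is_ram(phys, size) == 1) {
		pr_warn("refusing to ioremap RAM at %pa\n", &phys);
		return NULL;
	}
	return ioremap(phys, size);
}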
| 414 | void __weak arch_remove_reservations(struct resource *avail) | 530 | void __weak arch_remove_reservations(struct resource *avail) |
| 415 | { | 531 | { |
| 416 | } | 532 | } |
| @@ -1165,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent, | |||
| 1165 | /* | 1281 | /* |
| 1166 | * Managed region resource | 1282 | * Managed region resource |
| 1167 | */ | 1283 | */ |
| 1284 | static void devm_resource_release(struct device *dev, void *ptr) | ||
| 1285 | { | ||
| 1286 | struct resource **r = ptr; | ||
| 1287 | |||
| 1288 | release_resource(*r); | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | /** | ||
| 1292 | * devm_request_resource() - request and reserve an I/O or memory resource | ||
| 1293 | * @dev: device for which to request the resource | ||
| 1294 | * @root: root of the resource tree from which to request the resource | ||
| 1295 | * @new: descriptor of the resource to request | ||
| 1296 | * | ||
| 1297 | * This is a device-managed version of request_resource(). There is usually | ||
| 1298 | * no need to release resources requested by this function explicitly since | ||
| 1299 | * that will be taken care of when the device is unbound from its driver. | ||
| 1300 | * If for some reason the resource needs to be released explicitly, because | ||
| 1301 | * of ordering issues for example, drivers must call devm_release_resource() | ||
| 1302 | * rather than the regular release_resource(). | ||
| 1303 | * | ||
| 1304 | * When a conflict is detected between any existing resources and the newly | ||
| 1305 | * requested resource, an error message will be printed. | ||
| 1306 | * | ||
| 1307 | * Returns 0 on success or a negative error code on failure. | ||
| 1308 | */ | ||
| 1309 | int devm_request_resource(struct device *dev, struct resource *root, | ||
| 1310 | struct resource *new) | ||
| 1311 | { | ||
| 1312 | struct resource *conflict, **ptr; | ||
| 1313 | |||
| 1314 | ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); | ||
| 1315 | if (!ptr) | ||
| 1316 | return -ENOMEM; | ||
| 1317 | |||
| 1318 | *ptr = new; | ||
| 1319 | |||
| 1320 | conflict = request_resource_conflict(root, new); | ||
| 1321 | if (conflict) { | ||
| 1322 | dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", | ||
| 1323 | new, conflict->name, conflict); | ||
| 1324 | devres_free(ptr); | ||
| 1325 | return -EBUSY; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | devres_add(dev, ptr); | ||
| 1329 | return 0; | ||
| 1330 | } | ||
| 1331 | EXPORT_SYMBOL(devm_request_resource); | ||
| 1332 | |||
| 1333 | static int devm_resource_match(struct device *dev, void *res, void *data) | ||
| 1334 | { | ||
| 1335 | struct resource **ptr = res; | ||
| 1336 | |||
| 1337 | return *ptr == data; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | /** | ||
| 1341 | * devm_release_resource() - release a previously requested resource | ||
| 1342 | * @dev: device for which to release the resource | ||
| 1343 | * @new: descriptor of the resource to release | ||
| 1344 | * | ||
| 1345 | * Releases a resource previously requested using devm_request_resource(). | ||
| 1346 | */ | ||
| 1347 | void devm_release_resource(struct device *dev, struct resource *new) | ||
| 1348 | { | ||
| 1349 | WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, | ||
| 1350 | new)); | ||
| 1351 | } | ||
| 1352 | EXPORT_SYMBOL(devm_release_resource); | ||
| 1353 | |||
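In a driver, the devres hook means the request is the only explicit step; the region is released automatically when the device is unbound. A minimal probe() sketch (device name and addresses are hypothetical):

static struct resource example_regs = {
	.start	= 0x10000000,
	.end	= 0x10000fff,
	.name	= "example-regs",
	.flags	= IORESOURCE_MEM,
};

static int example_probe(struct platform_device *pdev)
{
	int err;

	err = devm_request_resource(&pdev->dev, &iomem_resource, &example_regs);
	if (err)
		return err;	/* the helper already logged the conflict */

	/* ... ioremap and use the region; no explicit release needed ... */
	return 0;
}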
| 1168 | struct region_devres { | 1354 | struct region_devres { |
| 1169 | struct resource *parent; | 1355 | struct resource *parent; |
| 1170 | resource_size_t start; | 1356 | resource_size_t start; |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
| @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
| 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) |
| 149 | goto out; | 149 | goto out; |
| 150 | 150 | ||
| 151 | t = p; | 151 | for_each_thread(p, t) |
| 152 | do { | ||
| 153 | sched_move_task(t); | 152 | sched_move_task(t); |
| 154 | } while_each_thread(p, t); | ||
| 155 | |||
| 156 | out: | 153 | out: |
| 157 | unlock_task_sighand(p, &flags); | 154 | unlock_task_sighand(p, &flags); |
| 158 | autogroup_kref_put(prev); | 155 | autogroup_kref_put(prev); |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c27e4f8f4879 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | |||
| 134 | 134 | ||
| 135 | static inline struct sched_clock_data *this_scd(void) | 135 | static inline struct sched_clock_data *this_scd(void) |
| 136 | { | 136 | { |
| 137 | return &__get_cpu_var(sched_clock_data); | 137 | return this_cpu_ptr(&sched_clock_data); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | static inline struct sched_clock_data *cpu_sdc(int cpu) | 140 | static inline struct sched_clock_data *cpu_sdc(int cpu) |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..240157c13ddc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -90,22 +90,6 @@ | |||
| 90 | #define CREATE_TRACE_POINTS | 90 | #define CREATE_TRACE_POINTS |
| 91 | #include <trace/events/sched.h> | 91 | #include <trace/events/sched.h> |
| 92 | 92 | ||
| 93 | #ifdef smp_mb__before_atomic | ||
| 94 | void __smp_mb__before_atomic(void) | ||
| 95 | { | ||
| 96 | smp_mb__before_atomic(); | ||
| 97 | } | ||
| 98 | EXPORT_SYMBOL(__smp_mb__before_atomic); | ||
| 99 | #endif | ||
| 100 | |||
| 101 | #ifdef smp_mb__after_atomic | ||
| 102 | void __smp_mb__after_atomic(void) | ||
| 103 | { | ||
| 104 | smp_mb__after_atomic(); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL(__smp_mb__after_atomic); | ||
| 107 | #endif | ||
| 108 | |||
| 109 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | 93 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
| 110 | { | 94 | { |
| 111 | unsigned long delta; | 95 | unsigned long delta; |
| @@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
| 333 | for (;;) { | 317 | for (;;) { |
| 334 | rq = task_rq(p); | 318 | rq = task_rq(p); |
| 335 | raw_spin_lock(&rq->lock); | 319 | raw_spin_lock(&rq->lock); |
| 336 | if (likely(rq == task_rq(p))) | 320 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 337 | return rq; | 321 | return rq; |
| 338 | raw_spin_unlock(&rq->lock); | 322 | raw_spin_unlock(&rq->lock); |
| 323 | |||
| 324 | while (unlikely(task_on_rq_migrating(p))) | ||
| 325 | cpu_relax(); | ||
| 339 | } | 326 | } |
| 340 | } | 327 | } |
| 341 | 328 | ||
| @@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 352 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | 339 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
| 353 | rq = task_rq(p); | 340 | rq = task_rq(p); |
| 354 | raw_spin_lock(&rq->lock); | 341 | raw_spin_lock(&rq->lock); |
| 355 | if (likely(rq == task_rq(p))) | 342 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 356 | return rq; | 343 | return rq; |
| 357 | raw_spin_unlock(&rq->lock); | 344 | raw_spin_unlock(&rq->lock); |
| 358 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 345 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
| 346 | |||
| 347 | while (unlikely(task_on_rq_migrating(p))) | ||
| 348 | cpu_relax(); | ||
| 359 | } | 349 | } |
| 360 | } | 350 | } |
| 361 | 351 | ||
| @@ -449,7 +439,15 @@ static void __hrtick_start(void *arg) | |||
| 449 | void hrtick_start(struct rq *rq, u64 delay) | 439 | void hrtick_start(struct rq *rq, u64 delay) |
| 450 | { | 440 | { |
| 451 | struct hrtimer *timer = &rq->hrtick_timer; | 441 | struct hrtimer *timer = &rq->hrtick_timer; |
| 452 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 442 | ktime_t time; |
| 443 | s64 delta; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * Don't schedule slices shorter than 10000ns, that just | ||
| 447 | * doesn't make sense and can cause timer DoS. | ||
| 448 | */ | ||
| 449 | delta = max_t(s64, delay, 10000LL); | ||
| 450 | time = ktime_add_ns(timer->base->get_time(), delta); | ||
| 453 | 451 | ||
| 454 | hrtimer_set_expires(timer, time); | 452 | hrtimer_set_expires(timer, time); |
| 455 | 453 | ||
| @@ -1043,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 1043 | * A queue event has occurred, and we're going to schedule. In | 1041 | * A queue event has occurred, and we're going to schedule. In |
| 1044 | * this case, we can save a useless back to back clock update. | 1042 | * this case, we can save a useless back to back clock update. |
| 1045 | */ | 1043 | */ |
| 1046 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | 1044 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
| 1047 | rq->skip_clock_update = 1; | 1045 | rq->skip_clock_update = 1; |
| 1048 | } | 1046 | } |
| 1049 | 1047 | ||
| @@ -1088,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1088 | 1086 | ||
| 1089 | static void __migrate_swap_task(struct task_struct *p, int cpu) | 1087 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
| 1090 | { | 1088 | { |
| 1091 | if (p->on_rq) { | 1089 | if (task_on_rq_queued(p)) { |
| 1092 | struct rq *src_rq, *dst_rq; | 1090 | struct rq *src_rq, *dst_rq; |
| 1093 | 1091 | ||
| 1094 | src_rq = task_rq(p); | 1092 | src_rq = task_rq(p); |
| @@ -1214,7 +1212,7 @@ static int migration_cpu_stop(void *data); | |||
| 1214 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) | 1212 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
| 1215 | { | 1213 | { |
| 1216 | unsigned long flags; | 1214 | unsigned long flags; |
| 1217 | int running, on_rq; | 1215 | int running, queued; |
| 1218 | unsigned long ncsw; | 1216 | unsigned long ncsw; |
| 1219 | struct rq *rq; | 1217 | struct rq *rq; |
| 1220 | 1218 | ||
| @@ -1252,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1252 | rq = task_rq_lock(p, &flags); | 1250 | rq = task_rq_lock(p, &flags); |
| 1253 | trace_sched_wait_task(p); | 1251 | trace_sched_wait_task(p); |
| 1254 | running = task_running(rq, p); | 1252 | running = task_running(rq, p); |
| 1255 | on_rq = p->on_rq; | 1253 | queued = task_on_rq_queued(p); |
| 1256 | ncsw = 0; | 1254 | ncsw = 0; |
| 1257 | if (!match_state || p->state == match_state) | 1255 | if (!match_state || p->state == match_state) |
| 1258 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 1256 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| @@ -1284,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1284 | * running right now), it's preempted, and we should | 1282 | * running right now), it's preempted, and we should |
| 1285 | * yield - it could be a while. | 1283 | * yield - it could be a while. |
| 1286 | */ | 1284 | */ |
| 1287 | if (unlikely(on_rq)) { | 1285 | if (unlikely(queued)) { |
| 1288 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); | 1286 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
| 1289 | 1287 | ||
| 1290 | set_current_state(TASK_UNINTERRUPTIBLE); | 1288 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -1478,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
| 1478 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1476 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
| 1479 | { | 1477 | { |
| 1480 | activate_task(rq, p, en_flags); | 1478 | activate_task(rq, p, en_flags); |
| 1481 | p->on_rq = 1; | 1479 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 1482 | 1480 | ||
| 1483 | /* if a worker is waking up, notify workqueue */ | 1481 | /* if a worker is waking up, notify workqueue */ |
| 1484 | if (p->flags & PF_WQ_WORKER) | 1482 | if (p->flags & PF_WQ_WORKER) |
| @@ -1537,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
| 1537 | int ret = 0; | 1535 | int ret = 0; |
| 1538 | 1536 | ||
| 1539 | rq = __task_rq_lock(p); | 1537 | rq = __task_rq_lock(p); |
| 1540 | if (p->on_rq) { | 1538 | if (task_on_rq_queued(p)) { |
| 1541 | /* check_preempt_curr() may use rq clock */ | 1539 | /* check_preempt_curr() may use rq clock */ |
| 1542 | update_rq_clock(rq); | 1540 | update_rq_clock(rq); |
| 1543 | ttwu_do_wakeup(rq, p, wake_flags); | 1541 | ttwu_do_wakeup(rq, p, wake_flags); |
| @@ -1620,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
| 1620 | } | 1618 | } |
| 1621 | } | 1619 | } |
| 1622 | 1620 | ||
| 1621 | void wake_up_if_idle(int cpu) | ||
| 1622 | { | ||
| 1623 | struct rq *rq = cpu_rq(cpu); | ||
| 1624 | unsigned long flags; | ||
| 1625 | |||
| 1626 | if (!is_idle_task(rq->curr)) | ||
| 1627 | return; | ||
| 1628 | |||
| 1629 | if (set_nr_if_polling(rq->idle)) { | ||
| 1630 | trace_sched_wake_idle_without_ipi(cpu); | ||
| 1631 | } else { | ||
| 1632 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 1633 | if (is_idle_task(rq->curr)) | ||
| 1634 | smp_send_reschedule(cpu); | ||
| 1635 | /* Else cpu is not in idle, do nothing here */ | ||
| 1636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1637 | } | ||
| 1638 | } | ||
| 1639 | |||
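wake_up_if_idle() either sets the need-resched flag through the polling state (skipping the IPI when the idle loop is poll-waiting) or sends a reschedule IPI, and does nothing if the CPU is already running a real task. A hedged sketch of a caller nudging idle CPUs so they re-evaluate some changed state (illustrative, not the in-tree user):

static void example_kick_idle_cpus(const struct cpumask *mask)
{
	int cpu;

	preempt_disable();
	for_each_cpu(cpu, mask) {
		if (cpu == smp_processor_id())
			continue;	/* this CPU is obviously not idle */
		wake_up_if_idle(cpu);
	}
	preempt_enable();
}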
| 1623 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1640 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| 1624 | { | 1641 | { |
| 1625 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1642 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
| @@ -1742,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
| 1742 | if (!(p->state & TASK_NORMAL)) | 1759 | if (!(p->state & TASK_NORMAL)) |
| 1743 | goto out; | 1760 | goto out; |
| 1744 | 1761 | ||
| 1745 | if (!p->on_rq) | 1762 | if (!task_on_rq_queued(p)) |
| 1746 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 1763 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 1747 | 1764 | ||
| 1748 | ttwu_do_wakeup(rq, p, 0); | 1765 | ttwu_do_wakeup(rq, p, 0); |
| @@ -1776,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1776 | } | 1793 | } |
| 1777 | 1794 | ||
| 1778 | /* | 1795 | /* |
| 1796 | * This function clears the sched_dl_entity static params. | ||
| 1797 | */ | ||
| 1798 | void __dl_clear_params(struct task_struct *p) | ||
| 1799 | { | ||
| 1800 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 1801 | |||
| 1802 | dl_se->dl_runtime = 0; | ||
| 1803 | dl_se->dl_deadline = 0; | ||
| 1804 | dl_se->dl_period = 0; | ||
| 1805 | dl_se->flags = 0; | ||
| 1806 | dl_se->dl_bw = 0; | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | /* | ||
| 1779 | * Perform scheduler related setup for a newly forked process p. | 1810 | * Perform scheduler related setup for a newly forked process p. |
| 1780 | * p is forked by current. | 1811 | * p is forked by current. |
| 1781 | * | 1812 | * |
| @@ -1799,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1799 | 1830 | ||
| 1800 | RB_CLEAR_NODE(&p->dl.rb_node); | 1831 | RB_CLEAR_NODE(&p->dl.rb_node); |
| 1801 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1832 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1802 | p->dl.dl_runtime = p->dl.runtime = 0; | 1833 | __dl_clear_params(p); |
| 1803 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
| 1804 | p->dl.dl_period = 0; | ||
| 1805 | p->dl.flags = 0; | ||
| 1806 | 1834 | ||
| 1807 | INIT_LIST_HEAD(&p->rt.run_list); | 1835 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1808 | 1836 | ||
| @@ -1977,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
| 1977 | #ifdef CONFIG_SMP | 2005 | #ifdef CONFIG_SMP |
| 1978 | inline struct dl_bw *dl_bw_of(int i) | 2006 | inline struct dl_bw *dl_bw_of(int i) |
| 1979 | { | 2007 | { |
| 2008 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2009 | "sched RCU must be held"); | ||
| 1980 | return &cpu_rq(i)->rd->dl_bw; | 2010 | return &cpu_rq(i)->rd->dl_bw; |
| 1981 | } | 2011 | } |
| 1982 | 2012 | ||
| @@ -1985,6 +2015,8 @@ static inline int dl_bw_cpus(int i) | |||
| 1985 | struct root_domain *rd = cpu_rq(i)->rd; | 2015 | struct root_domain *rd = cpu_rq(i)->rd; |
| 1986 | int cpus = 0; | 2016 | int cpus = 0; |
| 1987 | 2017 | ||
| 2018 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2019 | "sched RCU must be held"); | ||
| 1988 | for_each_cpu_and(i, rd->span, cpu_active_mask) | 2020 | for_each_cpu_and(i, rd->span, cpu_active_mask) |
| 1989 | cpus++; | 2021 | cpus++; |
| 1990 | 2022 | ||
| @@ -2095,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2095 | init_task_runnable_average(p); | 2127 | init_task_runnable_average(p); |
| 2096 | rq = __task_rq_lock(p); | 2128 | rq = __task_rq_lock(p); |
| 2097 | activate_task(rq, p, 0); | 2129 | activate_task(rq, p, 0); |
| 2098 | p->on_rq = 1; | 2130 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 2099 | trace_sched_wakeup_new(p, true); | 2131 | trace_sched_wakeup_new(p, true); |
| 2100 | check_preempt_curr(rq, p, WF_FORK); | 2132 | check_preempt_curr(rq, p, WF_FORK); |
| 2101 | #ifdef CONFIG_SMP | 2133 | #ifdef CONFIG_SMP |
| @@ -2287,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
| 2287 | */ | 2319 | */ |
| 2288 | post_schedule(rq); | 2320 | post_schedule(rq); |
| 2289 | 2321 | ||
| 2290 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2291 | /* In this case, finish_task_switch does not reenable preemption */ | ||
| 2292 | preempt_enable(); | ||
| 2293 | #endif | ||
| 2294 | if (current->set_child_tid) | 2322 | if (current->set_child_tid) |
| 2295 | put_user(task_pid_vnr(current), current->set_child_tid); | 2323 | put_user(task_pid_vnr(current), current->set_child_tid); |
| 2296 | } | 2324 | } |
| @@ -2333,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2333 | * of the scheduler it's an obvious special-case), so we | 2361 | * of the scheduler it's an obvious special-case), so we |
| 2334 | * do an early lockdep release here: | 2362 | * do an early lockdep release here: |
| 2335 | */ | 2363 | */ |
| 2336 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2337 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2364 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
| 2338 | #endif | ||
| 2339 | 2365 | ||
| 2340 | context_tracking_task_switch(prev, next); | 2366 | context_tracking_task_switch(prev, next); |
| 2341 | /* Here we just switch the register state and the stack. */ | 2367 | /* Here we just switch the register state and the stack. */ |
| @@ -2366,6 +2392,18 @@ unsigned long nr_running(void) | |||
| 2366 | return sum; | 2392 | return sum; |
| 2367 | } | 2393 | } |
| 2368 | 2394 | ||
| 2395 | /* | ||
| 2396 | * Check if only the current task is running on the cpu. | ||
| 2397 | */ | ||
| 2398 | bool single_task_running(void) | ||
| 2399 | { | ||
| 2400 | if (cpu_rq(smp_processor_id())->nr_running == 1) | ||
| 2401 | return true; | ||
| 2402 | else | ||
| 2403 | return false; | ||
| 2404 | } | ||
| 2405 | EXPORT_SYMBOL(single_task_running); | ||
| 2406 | |||
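single_task_running() samples the local runqueue without locking, so it is a hint valid only for the instant it is read; the intended use is deciding whether busy-polling would steal time from other runnable tasks. A hedged sketch (the polling loop is illustrative, not an in-tree caller):

/* Poll a completion flag only while nothing else wants this CPU. */
static bool example_poll_for_done(volatile bool *done, unsigned int max_loops)
{
	while (max_loops--) {
		if (*done)
			return true;
		if (!single_task_running())
			break;		/* another task is runnable: stop polling */
		cpu_relax();
	}
	return *done;
}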
| 2369 | unsigned long long nr_context_switches(void) | 2407 | unsigned long long nr_context_switches(void) |
| 2370 | { | 2408 | { |
| 2371 | int i; | 2409 | int i; |
| @@ -2393,6 +2431,13 @@ unsigned long nr_iowait_cpu(int cpu) | |||
| 2393 | return atomic_read(&this->nr_iowait); | 2431 | return atomic_read(&this->nr_iowait); |
| 2394 | } | 2432 | } |
| 2395 | 2433 | ||
| 2434 | void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) | ||
| 2435 | { | ||
| 2436 | struct rq *this = this_rq(); | ||
| 2437 | *nr_waiters = atomic_read(&this->nr_iowait); | ||
| 2438 | *load = this->cpu_load[0]; | ||
| 2439 | } | ||
| 2440 | |||
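get_iowait_load() exposes the local runqueue's nr_iowait count together with its latest cpu_load[0] sample; a cpuidle-style governor can use the pair to bias toward shallower idle states when I/O completions are expected soon. A hedged sketch (the decision rule is an assumption, not the real governor logic):

static bool example_expect_early_wakeup(void)
{
	unsigned long nr_iowaiters, cpu_load;

	get_iowait_load(&nr_iowaiters, &cpu_load);
	/* Tasks blocked in io_schedule() tend to wake quickly, and a
	 * recently busy CPU is likely to be needed again soon. */
	return nr_iowaiters > 0 || cpu_load > 0;
}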
| 2396 | #ifdef CONFIG_SMP | 2441 | #ifdef CONFIG_SMP |
| 2397 | 2442 | ||
| 2398 | /* | 2443 | /* |
| @@ -2444,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
| 2444 | * project cycles that may never be accounted to this | 2489 | * project cycles that may never be accounted to this |
| 2445 | * thread, breaking clock_gettime(). | 2490 | * thread, breaking clock_gettime(). |
| 2446 | */ | 2491 | */ |
| 2447 | if (task_current(rq, p) && p->on_rq) { | 2492 | if (task_current(rq, p) && task_on_rq_queued(p)) { |
| 2448 | update_rq_clock(rq); | 2493 | update_rq_clock(rq); |
| 2449 | ns = rq_clock_task(rq) - p->se.exec_start; | 2494 | ns = rq_clock_task(rq) - p->se.exec_start; |
| 2450 | if ((s64)ns < 0) | 2495 | if ((s64)ns < 0) |
| @@ -2490,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2490 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 2535 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
| 2491 | * been accounted, so we're correct here as well. | 2536 | * been accounted, so we're correct here as well. |
| 2492 | */ | 2537 | */ |
| 2493 | if (!p->on_cpu || !p->on_rq) | 2538 | if (!p->on_cpu || !task_on_rq_queued(p)) |
| 2494 | return p->se.sum_exec_runtime; | 2539 | return p->se.sum_exec_runtime; |
| 2495 | #endif | 2540 | #endif |
| 2496 | 2541 | ||
| @@ -2653,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 2653 | */ | 2698 | */ |
| 2654 | static inline void schedule_debug(struct task_struct *prev) | 2699 | static inline void schedule_debug(struct task_struct *prev) |
| 2655 | { | 2700 | { |
| 2701 | #ifdef CONFIG_SCHED_STACK_END_CHECK | ||
| 2702 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | ||
| 2703 | #endif | ||
| 2656 | /* | 2704 | /* |
| 2657 | * Test if we are atomic. Since do_exit() needs to call into | 2705 | * Test if we are atomic. Since do_exit() needs to call into |
| 2658 | * schedule() atomically, we ignore that path. Otherwise whine | 2706 | * schedule() atomically, we ignore that path. Otherwise whine |
| @@ -2794,7 +2842,7 @@ need_resched: | |||
| 2794 | switch_count = &prev->nvcsw; | 2842 | switch_count = &prev->nvcsw; |
| 2795 | } | 2843 | } |
| 2796 | 2844 | ||
| 2797 | if (prev->on_rq || rq->skip_clock_update < 0) | 2845 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) |
| 2798 | update_rq_clock(rq); | 2846 | update_rq_clock(rq); |
| 2799 | 2847 | ||
| 2800 | next = pick_next_task(rq, prev); | 2848 | next = pick_next_task(rq, prev); |
| @@ -2903,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) | |||
| 2903 | } | 2951 | } |
| 2904 | NOKPROBE_SYMBOL(preempt_schedule); | 2952 | NOKPROBE_SYMBOL(preempt_schedule); |
| 2905 | EXPORT_SYMBOL(preempt_schedule); | 2953 | EXPORT_SYMBOL(preempt_schedule); |
| 2954 | |||
| 2955 | #ifdef CONFIG_CONTEXT_TRACKING | ||
| 2956 | /** | ||
| 2957 | * preempt_schedule_context - preempt_schedule called by tracing | ||
| 2958 | * | ||
| 2959 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
| 2960 | * recursion and tracing preempt enabling caused by the tracing | ||
| 2961 | * infrastructure itself. But as tracing can happen in areas coming | ||
| 2962 | * from userspace or just about to enter userspace, a preempt enable | ||
| 2963 | * can occur before user_exit() is called. This will cause the scheduler | ||
| 2964 | * to be called when the system is still in usermode. | ||
| 2965 | * | ||
| 2966 | * To prevent this, the preempt_enable_notrace will use this function | ||
| 2967 | * instead of preempt_schedule() to exit user context if needed before | ||
| 2968 | * calling the scheduler. | ||
| 2969 | */ | ||
| 2970 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
| 2971 | { | ||
| 2972 | enum ctx_state prev_ctx; | ||
| 2973 | |||
| 2974 | if (likely(!preemptible())) | ||
| 2975 | return; | ||
| 2976 | |||
| 2977 | do { | ||
| 2978 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2979 | /* | ||
| 2980 | * Needs preempt disabled in case user_exit() is traced | ||
| 2981 | * and the tracer calls preempt_enable_notrace() causing | ||
| 2982 | * an infinite recursion. | ||
| 2983 | */ | ||
| 2984 | prev_ctx = exception_enter(); | ||
| 2985 | __schedule(); | ||
| 2986 | exception_exit(prev_ctx); | ||
| 2987 | |||
| 2988 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2989 | barrier(); | ||
| 2990 | } while (need_resched()); | ||
| 2991 | } | ||
| 2992 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
| 2993 | #endif /* CONFIG_CONTEXT_TRACKING */ | ||
| 2994 | |||
| 2906 | #endif /* CONFIG_PREEMPT */ | 2995 | #endif /* CONFIG_PREEMPT */ |
| 2907 | 2996 | ||
| 2908 | /* | 2997 | /* |
| @@ -2959,7 +3048,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 2959 | */ | 3048 | */ |
| 2960 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3049 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 2961 | { | 3050 | { |
| 2962 | int oldprio, on_rq, running, enqueue_flag = 0; | 3051 | int oldprio, queued, running, enqueue_flag = 0; |
| 2963 | struct rq *rq; | 3052 | struct rq *rq; |
| 2964 | const struct sched_class *prev_class; | 3053 | const struct sched_class *prev_class; |
| 2965 | 3054 | ||
| @@ -2988,12 +3077,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 2988 | trace_sched_pi_setprio(p, prio); | 3077 | trace_sched_pi_setprio(p, prio); |
| 2989 | oldprio = p->prio; | 3078 | oldprio = p->prio; |
| 2990 | prev_class = p->sched_class; | 3079 | prev_class = p->sched_class; |
| 2991 | on_rq = p->on_rq; | 3080 | queued = task_on_rq_queued(p); |
| 2992 | running = task_current(rq, p); | 3081 | running = task_current(rq, p); |
| 2993 | if (on_rq) | 3082 | if (queued) |
| 2994 | dequeue_task(rq, p, 0); | 3083 | dequeue_task(rq, p, 0); |
| 2995 | if (running) | 3084 | if (running) |
| 2996 | p->sched_class->put_prev_task(rq, p); | 3085 | put_prev_task(rq, p); |
| 2997 | 3086 | ||
| 2998 | /* | 3087 | /* |
| 2999 | * Boosting conditions are: | 3088 | * Boosting conditions are: |
| @@ -3030,7 +3119,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3030 | 3119 | ||
| 3031 | if (running) | 3120 | if (running) |
| 3032 | p->sched_class->set_curr_task(rq); | 3121 | p->sched_class->set_curr_task(rq); |
| 3033 | if (on_rq) | 3122 | if (queued) |
| 3034 | enqueue_task(rq, p, enqueue_flag); | 3123 | enqueue_task(rq, p, enqueue_flag); |
| 3035 | 3124 | ||
| 3036 | check_class_changed(rq, p, prev_class, oldprio); | 3125 | check_class_changed(rq, p, prev_class, oldprio); |
| @@ -3041,7 +3130,7 @@ out_unlock: | |||
| 3041 | 3130 | ||
| 3042 | void set_user_nice(struct task_struct *p, long nice) | 3131 | void set_user_nice(struct task_struct *p, long nice) |
| 3043 | { | 3132 | { |
| 3044 | int old_prio, delta, on_rq; | 3133 | int old_prio, delta, queued; |
| 3045 | unsigned long flags; | 3134 | unsigned long flags; |
| 3046 | struct rq *rq; | 3135 | struct rq *rq; |
| 3047 | 3136 | ||
| @@ -3062,8 +3151,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3062 | p->static_prio = NICE_TO_PRIO(nice); | 3151 | p->static_prio = NICE_TO_PRIO(nice); |
| 3063 | goto out_unlock; | 3152 | goto out_unlock; |
| 3064 | } | 3153 | } |
| 3065 | on_rq = p->on_rq; | 3154 | queued = task_on_rq_queued(p); |
| 3066 | if (on_rq) | 3155 | if (queued) |
| 3067 | dequeue_task(rq, p, 0); | 3156 | dequeue_task(rq, p, 0); |
| 3068 | 3157 | ||
| 3069 | p->static_prio = NICE_TO_PRIO(nice); | 3158 | p->static_prio = NICE_TO_PRIO(nice); |
| @@ -3072,7 +3161,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3072 | p->prio = effective_prio(p); | 3161 | p->prio = effective_prio(p); |
| 3073 | delta = p->prio - old_prio; | 3162 | delta = p->prio - old_prio; |
| 3074 | 3163 | ||
| 3075 | if (on_rq) { | 3164 | if (queued) { |
| 3076 | enqueue_task(rq, p, 0); | 3165 | enqueue_task(rq, p, 0); |
| 3077 | /* | 3166 | /* |
| 3078 | * If the task increased its priority or is running and | 3167 | * If the task increased its priority or is running and |
| @@ -3344,7 +3433,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
| 3344 | { | 3433 | { |
| 3345 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : | 3434 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : |
| 3346 | MAX_RT_PRIO - 1 - attr->sched_priority; | 3435 | MAX_RT_PRIO - 1 - attr->sched_priority; |
| 3347 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3436 | int retval, oldprio, oldpolicy = -1, queued, running; |
| 3348 | int policy = attr->sched_policy; | 3437 | int policy = attr->sched_policy; |
| 3349 | unsigned long flags; | 3438 | unsigned long flags; |
| 3350 | const struct sched_class *prev_class; | 3439 | const struct sched_class *prev_class; |
| @@ -3541,19 +3630,19 @@ change: | |||
| 3541 | return 0; | 3630 | return 0; |
| 3542 | } | 3631 | } |
| 3543 | 3632 | ||
| 3544 | on_rq = p->on_rq; | 3633 | queued = task_on_rq_queued(p); |
| 3545 | running = task_current(rq, p); | 3634 | running = task_current(rq, p); |
| 3546 | if (on_rq) | 3635 | if (queued) |
| 3547 | dequeue_task(rq, p, 0); | 3636 | dequeue_task(rq, p, 0); |
| 3548 | if (running) | 3637 | if (running) |
| 3549 | p->sched_class->put_prev_task(rq, p); | 3638 | put_prev_task(rq, p); |
| 3550 | 3639 | ||
| 3551 | prev_class = p->sched_class; | 3640 | prev_class = p->sched_class; |
| 3552 | __setscheduler(rq, p, attr); | 3641 | __setscheduler(rq, p, attr); |
| 3553 | 3642 | ||
| 3554 | if (running) | 3643 | if (running) |
| 3555 | p->sched_class->set_curr_task(rq); | 3644 | p->sched_class->set_curr_task(rq); |
| 3556 | if (on_rq) { | 3645 | if (queued) { |
| 3557 | /* | 3646 | /* |
| 3558 | * We enqueue to tail when the priority of a task is | 3647 | * We enqueue to tail when the priority of a task is |
| 3559 | * increased (user space view). | 3648 | * increased (user space view). |
| @@ -3977,14 +4066,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3977 | rcu_read_lock(); | 4066 | rcu_read_lock(); |
| 3978 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | 4067 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { |
| 3979 | rcu_read_unlock(); | 4068 | rcu_read_unlock(); |
| 3980 | goto out_unlock; | 4069 | goto out_free_new_mask; |
| 3981 | } | 4070 | } |
| 3982 | rcu_read_unlock(); | 4071 | rcu_read_unlock(); |
| 3983 | } | 4072 | } |
| 3984 | 4073 | ||
| 3985 | retval = security_task_setscheduler(p); | 4074 | retval = security_task_setscheduler(p); |
| 3986 | if (retval) | 4075 | if (retval) |
| 3987 | goto out_unlock; | 4076 | goto out_free_new_mask; |
| 3988 | 4077 | ||
| 3989 | 4078 | ||
| 3990 | cpuset_cpus_allowed(p, cpus_allowed); | 4079 | cpuset_cpus_allowed(p, cpus_allowed); |
| @@ -3997,13 +4086,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3997 | * root_domain. | 4086 | * root_domain. |
| 3998 | */ | 4087 | */ |
| 3999 | #ifdef CONFIG_SMP | 4088 | #ifdef CONFIG_SMP |
| 4000 | if (task_has_dl_policy(p)) { | 4089 | if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { |
| 4001 | const struct cpumask *span = task_rq(p)->rd->span; | 4090 | rcu_read_lock(); |
| 4002 | 4091 | if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { | |
| 4003 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
| 4004 | retval = -EBUSY; | 4092 | retval = -EBUSY; |
| 4005 | goto out_unlock; | 4093 | rcu_read_unlock(); |
| 4094 | goto out_free_new_mask; | ||
| 4006 | } | 4095 | } |
| 4096 | rcu_read_unlock(); | ||
| 4007 | } | 4097 | } |
| 4008 | #endif | 4098 | #endif |
| 4009 | again: | 4099 | again: |
| @@ -4021,7 +4111,7 @@ again: | |||
| 4021 | goto again; | 4111 | goto again; |
| 4022 | } | 4112 | } |
| 4023 | } | 4113 | } |
| 4024 | out_unlock: | 4114 | out_free_new_mask: |
| 4025 | free_cpumask_var(new_mask); | 4115 | free_cpumask_var(new_mask); |
| 4026 | out_free_cpus_allowed: | 4116 | out_free_cpus_allowed: |
| 4027 | free_cpumask_var(cpus_allowed); | 4117 | free_cpumask_var(cpus_allowed); |
| @@ -4505,7 +4595,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4505 | " task PC stack pid father\n"); | 4595 | " task PC stack pid father\n"); |
| 4506 | #endif | 4596 | #endif |
| 4507 | rcu_read_lock(); | 4597 | rcu_read_lock(); |
| 4508 | do_each_thread(g, p) { | 4598 | for_each_process_thread(g, p) { |
| 4509 | /* | 4599 | /* |
| 4510 | * reset the NMI-timeout, listing all files on a slow | 4600 | * reset the NMI-timeout, listing all files on a slow |
| 4511 | * console might take a lot of time: | 4601 | * console might take a lot of time: |
| @@ -4513,7 +4603,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4513 | touch_nmi_watchdog(); | 4603 | touch_nmi_watchdog(); |
| 4514 | if (!state_filter || (p->state & state_filter)) | 4604 | if (!state_filter || (p->state & state_filter)) |
| 4515 | sched_show_task(p); | 4605 | sched_show_task(p); |
| 4516 | } while_each_thread(g, p); | 4606 | } |
| 4517 | 4607 | ||
| 4518 | touch_all_softlockup_watchdogs(); | 4608 | touch_all_softlockup_watchdogs(); |
| 4519 | 4609 | ||
| @@ -4568,7 +4658,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4568 | rcu_read_unlock(); | 4658 | rcu_read_unlock(); |
| 4569 | 4659 | ||
| 4570 | rq->curr = rq->idle = idle; | 4660 | rq->curr = rq->idle = idle; |
| 4571 | idle->on_rq = 1; | 4661 | idle->on_rq = TASK_ON_RQ_QUEUED; |
| 4572 | #if defined(CONFIG_SMP) | 4662 | #if defined(CONFIG_SMP) |
| 4573 | idle->on_cpu = 1; | 4663 | idle->on_cpu = 1; |
| 4574 | #endif | 4664 | #endif |
| @@ -4589,6 +4679,33 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4589 | } | 4679 | } |
| 4590 | 4680 | ||
| 4591 | #ifdef CONFIG_SMP | 4681 | #ifdef CONFIG_SMP |
| 4682 | /* | ||
| 4683 | * move_queued_task - move a queued task to new rq. | ||
| 4684 | * | ||
| 4685 | * Returns (locked) new rq. Old rq's lock is released. | ||
| 4686 | */ | ||
| 4687 | static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | ||
| 4688 | { | ||
| 4689 | struct rq *rq = task_rq(p); | ||
| 4690 | |||
| 4691 | lockdep_assert_held(&rq->lock); | ||
| 4692 | |||
| 4693 | dequeue_task(rq, p, 0); | ||
| 4694 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 4695 | set_task_cpu(p, new_cpu); | ||
| 4696 | raw_spin_unlock(&rq->lock); | ||
| 4697 | |||
| 4698 | rq = cpu_rq(new_cpu); | ||
| 4699 | |||
| 4700 | raw_spin_lock(&rq->lock); | ||
| 4701 | BUG_ON(task_cpu(p) != new_cpu); | ||
| 4702 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 4703 | enqueue_task(rq, p, 0); | ||
| 4704 | check_preempt_curr(rq, p, 0); | ||
| 4705 | |||
| 4706 | return rq; | ||
| 4707 | } | ||
| 4708 | |||
| 4592 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4709 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 4593 | { | 4710 | { |
| 4594 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4711 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
| @@ -4645,14 +4762,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 4645 | goto out; | 4762 | goto out; |
| 4646 | 4763 | ||
| 4647 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 4764 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
| 4648 | if (p->on_rq) { | 4765 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
| 4649 | struct migration_arg arg = { p, dest_cpu }; | 4766 | struct migration_arg arg = { p, dest_cpu }; |
| 4650 | /* Need help from migration thread: drop lock and wait. */ | 4767 | /* Need help from migration thread: drop lock and wait. */ |
| 4651 | task_rq_unlock(rq, p, &flags); | 4768 | task_rq_unlock(rq, p, &flags); |
| 4652 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 4769 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
| 4653 | tlb_migrate_finish(p->mm); | 4770 | tlb_migrate_finish(p->mm); |
| 4654 | return 0; | 4771 | return 0; |
| 4655 | } | 4772 | } else if (task_on_rq_queued(p)) |
| 4773 | rq = move_queued_task(p, dest_cpu); | ||
| 4656 | out: | 4774 | out: |
| 4657 | task_rq_unlock(rq, p, &flags); | 4775 | task_rq_unlock(rq, p, &flags); |
| 4658 | 4776 | ||
| @@ -4673,20 +4791,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
| 4673 | */ | 4791 | */ |
| 4674 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4792 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4675 | { | 4793 | { |
| 4676 | struct rq *rq_dest, *rq_src; | 4794 | struct rq *rq; |
| 4677 | int ret = 0; | 4795 | int ret = 0; |
| 4678 | 4796 | ||
| 4679 | if (unlikely(!cpu_active(dest_cpu))) | 4797 | if (unlikely(!cpu_active(dest_cpu))) |
| 4680 | return ret; | 4798 | return ret; |
| 4681 | 4799 | ||
| 4682 | rq_src = cpu_rq(src_cpu); | 4800 | rq = cpu_rq(src_cpu); |
| 4683 | rq_dest = cpu_rq(dest_cpu); | ||
| 4684 | 4801 | ||
| 4685 | raw_spin_lock(&p->pi_lock); | 4802 | raw_spin_lock(&p->pi_lock); |
| 4686 | double_rq_lock(rq_src, rq_dest); | 4803 | raw_spin_lock(&rq->lock); |
| 4687 | /* Already moved. */ | 4804 | /* Already moved. */ |
| 4688 | if (task_cpu(p) != src_cpu) | 4805 | if (task_cpu(p) != src_cpu) |
| 4689 | goto done; | 4806 | goto done; |
| 4807 | |||
| 4690 | /* Affinity changed (again). */ | 4808 | /* Affinity changed (again). */ |
| 4691 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 4809 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
| 4692 | goto fail; | 4810 | goto fail; |
| @@ -4695,16 +4813,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4695 | * If we're not on a rq, the next wake-up will ensure we're | 4813 | * If we're not on a rq, the next wake-up will ensure we're |
| 4696 | * placed properly. | 4814 | * placed properly. |
| 4697 | */ | 4815 | */ |
| 4698 | if (p->on_rq) { | 4816 | if (task_on_rq_queued(p)) |
| 4699 | dequeue_task(rq_src, p, 0); | 4817 | rq = move_queued_task(p, dest_cpu); |
| 4700 | set_task_cpu(p, dest_cpu); | ||
| 4701 | enqueue_task(rq_dest, p, 0); | ||
| 4702 | check_preempt_curr(rq_dest, p, 0); | ||
| 4703 | } | ||
| 4704 | done: | 4818 | done: |
| 4705 | ret = 1; | 4819 | ret = 1; |
| 4706 | fail: | 4820 | fail: |
| 4707 | double_rq_unlock(rq_src, rq_dest); | 4821 | raw_spin_unlock(&rq->lock); |
| 4708 | raw_spin_unlock(&p->pi_lock); | 4822 | raw_spin_unlock(&p->pi_lock); |
| 4709 | return ret; | 4823 | return ret; |
| 4710 | } | 4824 | } |
| @@ -4736,22 +4850,22 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
| 4736 | { | 4850 | { |
| 4737 | struct rq *rq; | 4851 | struct rq *rq; |
| 4738 | unsigned long flags; | 4852 | unsigned long flags; |
| 4739 | bool on_rq, running; | 4853 | bool queued, running; |
| 4740 | 4854 | ||
| 4741 | rq = task_rq_lock(p, &flags); | 4855 | rq = task_rq_lock(p, &flags); |
| 4742 | on_rq = p->on_rq; | 4856 | queued = task_on_rq_queued(p); |
| 4743 | running = task_current(rq, p); | 4857 | running = task_current(rq, p); |
| 4744 | 4858 | ||
| 4745 | if (on_rq) | 4859 | if (queued) |
| 4746 | dequeue_task(rq, p, 0); | 4860 | dequeue_task(rq, p, 0); |
| 4747 | if (running) | 4861 | if (running) |
| 4748 | p->sched_class->put_prev_task(rq, p); | 4862 | put_prev_task(rq, p); |
| 4749 | 4863 | ||
| 4750 | p->numa_preferred_nid = nid; | 4864 | p->numa_preferred_nid = nid; |
| 4751 | 4865 | ||
| 4752 | if (running) | 4866 | if (running) |
| 4753 | p->sched_class->set_curr_task(rq); | 4867 | p->sched_class->set_curr_task(rq); |
| 4754 | if (on_rq) | 4868 | if (queued) |
| 4755 | enqueue_task(rq, p, 0); | 4869 | enqueue_task(rq, p, 0); |
| 4756 | task_rq_unlock(rq, p, &flags); | 4870 | task_rq_unlock(rq, p, &flags); |
| 4757 | } | 4871 | } |
| @@ -4771,6 +4885,12 @@ static int migration_cpu_stop(void *data) | |||
| 4771 | * be on another cpu but it doesn't matter. | 4885 | * be on another cpu but it doesn't matter. |
| 4772 | */ | 4886 | */ |
| 4773 | local_irq_disable(); | 4887 | local_irq_disable(); |
| 4888 | /* | ||
| 4889 | * We need to explicitly wake pending tasks before running | ||
| 4890 | * __migrate_task() such that we will not miss enforcing cpus_allowed | ||
| 4891 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | ||
| 4892 | */ | ||
| 4893 | sched_ttwu_pending(); | ||
| 4774 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | 4894 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); |
| 4775 | local_irq_enable(); | 4895 | local_irq_enable(); |
| 4776 | return 0; | 4896 | return 0; |
| @@ -5181,6 +5301,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5181 | { | 5301 | { |
| 5182 | unsigned long flags; | 5302 | unsigned long flags; |
| 5183 | long cpu = (long)hcpu; | 5303 | long cpu = (long)hcpu; |
| 5304 | struct dl_bw *dl_b; | ||
| 5184 | 5305 | ||
| 5185 | switch (action & ~CPU_TASKS_FROZEN) { | 5306 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5186 | case CPU_DOWN_PREPARE: | 5307 | case CPU_DOWN_PREPARE: |
| @@ -5188,15 +5309,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5188 | 5309 | ||
| 5189 | /* explicitly allow suspend */ | 5310 | /* explicitly allow suspend */ |
| 5190 | if (!(action & CPU_TASKS_FROZEN)) { | 5311 | if (!(action & CPU_TASKS_FROZEN)) { |
| 5191 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 5192 | bool overflow; | 5312 | bool overflow; |
| 5193 | int cpus; | 5313 | int cpus; |
| 5194 | 5314 | ||
| 5315 | rcu_read_lock_sched(); | ||
| 5316 | dl_b = dl_bw_of(cpu); | ||
| 5317 | |||
| 5195 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 5318 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 5196 | cpus = dl_bw_cpus(cpu); | 5319 | cpus = dl_bw_cpus(cpu); |
| 5197 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 5320 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
| 5198 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 5321 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 5199 | 5322 | ||
| 5323 | rcu_read_unlock_sched(); | ||
| 5324 | |||
| 5200 | if (overflow) | 5325 | if (overflow) |
| 5201 | return notifier_from_errno(-EBUSY); | 5326 | return notifier_from_errno(-EBUSY); |
| 5202 | } | 5327 | } |
| @@ -5739,7 +5864,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5739 | const struct cpumask *span = sched_domain_span(sd); | 5864 | const struct cpumask *span = sched_domain_span(sd); |
| 5740 | struct cpumask *covered = sched_domains_tmpmask; | 5865 | struct cpumask *covered = sched_domains_tmpmask; |
| 5741 | struct sd_data *sdd = sd->private; | 5866 | struct sd_data *sdd = sd->private; |
| 5742 | struct sched_domain *child; | 5867 | struct sched_domain *sibling; |
| 5743 | int i; | 5868 | int i; |
| 5744 | 5869 | ||
| 5745 | cpumask_clear(covered); | 5870 | cpumask_clear(covered); |
| @@ -5750,10 +5875,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5750 | if (cpumask_test_cpu(i, covered)) | 5875 | if (cpumask_test_cpu(i, covered)) |
| 5751 | continue; | 5876 | continue; |
| 5752 | 5877 | ||
| 5753 | child = *per_cpu_ptr(sdd->sd, i); | 5878 | sibling = *per_cpu_ptr(sdd->sd, i); |
| 5754 | 5879 | ||
| 5755 | /* See the comment near build_group_mask(). */ | 5880 | /* See the comment near build_group_mask(). */ |
| 5756 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | 5881 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) |
| 5757 | continue; | 5882 | continue; |
| 5758 | 5883 | ||
| 5759 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5884 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| @@ -5763,10 +5888,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5763 | goto fail; | 5888 | goto fail; |
| 5764 | 5889 | ||
| 5765 | sg_span = sched_group_cpus(sg); | 5890 | sg_span = sched_group_cpus(sg); |
| 5766 | if (child->child) { | 5891 | if (sibling->child) |
| 5767 | child = child->child; | 5892 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); |
| 5768 | cpumask_copy(sg_span, sched_domain_span(child)); | 5893 | else |
| 5769 | } else | ||
| 5770 | cpumask_set_cpu(i, sg_span); | 5894 | cpumask_set_cpu(i, sg_span); |
| 5771 | 5895 | ||
| 5772 | cpumask_or(covered, covered, sg_span); | 5896 | cpumask_or(covered, covered, sg_span); |
| @@ -7117,13 +7241,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
| 7117 | .sched_policy = SCHED_NORMAL, | 7241 | .sched_policy = SCHED_NORMAL, |
| 7118 | }; | 7242 | }; |
| 7119 | int old_prio = p->prio; | 7243 | int old_prio = p->prio; |
| 7120 | int on_rq; | 7244 | int queued; |
| 7121 | 7245 | ||
| 7122 | on_rq = p->on_rq; | 7246 | queued = task_on_rq_queued(p); |
| 7123 | if (on_rq) | 7247 | if (queued) |
| 7124 | dequeue_task(rq, p, 0); | 7248 | dequeue_task(rq, p, 0); |
| 7125 | __setscheduler(rq, p, &attr); | 7249 | __setscheduler(rq, p, &attr); |
| 7126 | if (on_rq) { | 7250 | if (queued) { |
| 7127 | enqueue_task(rq, p, 0); | 7251 | enqueue_task(rq, p, 0); |
| 7128 | resched_curr(rq); | 7252 | resched_curr(rq); |
| 7129 | } | 7253 | } |
| @@ -7137,12 +7261,12 @@ void normalize_rt_tasks(void) | |||
| 7137 | unsigned long flags; | 7261 | unsigned long flags; |
| 7138 | struct rq *rq; | 7262 | struct rq *rq; |
| 7139 | 7263 | ||
| 7140 | read_lock_irqsave(&tasklist_lock, flags); | 7264 | read_lock(&tasklist_lock); |
| 7141 | do_each_thread(g, p) { | 7265 | for_each_process_thread(g, p) { |
| 7142 | /* | 7266 | /* |
| 7143 | * Only normalize user tasks: | 7267 | * Only normalize user tasks: |
| 7144 | */ | 7268 | */ |
| 7145 | if (!p->mm) | 7269 | if (p->flags & PF_KTHREAD) |
| 7146 | continue; | 7270 | continue; |
| 7147 | 7271 | ||
| 7148 | p->se.exec_start = 0; | 7272 | p->se.exec_start = 0; |
| @@ -7157,21 +7281,16 @@ void normalize_rt_tasks(void) | |||
| 7157 | * Renice negative nice level userspace | 7281 | * Renice negative nice level userspace |
| 7158 | * tasks back to 0: | 7282 | * tasks back to 0: |
| 7159 | */ | 7283 | */ |
| 7160 | if (task_nice(p) < 0 && p->mm) | 7284 | if (task_nice(p) < 0) |
| 7161 | set_user_nice(p, 0); | 7285 | set_user_nice(p, 0); |
| 7162 | continue; | 7286 | continue; |
| 7163 | } | 7287 | } |
| 7164 | 7288 | ||
| 7165 | raw_spin_lock(&p->pi_lock); | 7289 | rq = task_rq_lock(p, &flags); |
| 7166 | rq = __task_rq_lock(p); | ||
| 7167 | |||
| 7168 | normalize_task(rq, p); | 7290 | normalize_task(rq, p); |
| 7169 | 7291 | task_rq_unlock(rq, p, &flags); | |
| 7170 | __task_rq_unlock(rq); | 7292 | } |
| 7171 | raw_spin_unlock(&p->pi_lock); | 7293 | read_unlock(&tasklist_lock); |
| 7172 | } while_each_thread(g, p); | ||
| 7173 | |||
| 7174 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 7175 | } | 7294 | } |
| 7176 | 7295 | ||
| 7177 | #endif /* CONFIG_MAGIC_SYSRQ */ | 7296 | #endif /* CONFIG_MAGIC_SYSRQ */ |
| @@ -7311,19 +7430,19 @@ void sched_offline_group(struct task_group *tg) | |||
| 7311 | void sched_move_task(struct task_struct *tsk) | 7430 | void sched_move_task(struct task_struct *tsk) |
| 7312 | { | 7431 | { |
| 7313 | struct task_group *tg; | 7432 | struct task_group *tg; |
| 7314 | int on_rq, running; | 7433 | int queued, running; |
| 7315 | unsigned long flags; | 7434 | unsigned long flags; |
| 7316 | struct rq *rq; | 7435 | struct rq *rq; |
| 7317 | 7436 | ||
| 7318 | rq = task_rq_lock(tsk, &flags); | 7437 | rq = task_rq_lock(tsk, &flags); |
| 7319 | 7438 | ||
| 7320 | running = task_current(rq, tsk); | 7439 | running = task_current(rq, tsk); |
| 7321 | on_rq = tsk->on_rq; | 7440 | queued = task_on_rq_queued(tsk); |
| 7322 | 7441 | ||
| 7323 | if (on_rq) | 7442 | if (queued) |
| 7324 | dequeue_task(rq, tsk, 0); | 7443 | dequeue_task(rq, tsk, 0); |
| 7325 | if (unlikely(running)) | 7444 | if (unlikely(running)) |
| 7326 | tsk->sched_class->put_prev_task(rq, tsk); | 7445 | put_prev_task(rq, tsk); |
| 7327 | 7446 | ||
| 7328 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, | 7447 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, |
| 7329 | lockdep_is_held(&tsk->sighand->siglock)), | 7448 | lockdep_is_held(&tsk->sighand->siglock)), |
| @@ -7333,14 +7452,14 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7333 | 7452 | ||
| 7334 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7453 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7335 | if (tsk->sched_class->task_move_group) | 7454 | if (tsk->sched_class->task_move_group) |
| 7336 | tsk->sched_class->task_move_group(tsk, on_rq); | 7455 | tsk->sched_class->task_move_group(tsk, queued); |
| 7337 | else | 7456 | else |
| 7338 | #endif | 7457 | #endif |
| 7339 | set_task_rq(tsk, task_cpu(tsk)); | 7458 | set_task_rq(tsk, task_cpu(tsk)); |
| 7340 | 7459 | ||
| 7341 | if (unlikely(running)) | 7460 | if (unlikely(running)) |
| 7342 | tsk->sched_class->set_curr_task(rq); | 7461 | tsk->sched_class->set_curr_task(rq); |
| 7343 | if (on_rq) | 7462 | if (queued) |
| 7344 | enqueue_task(rq, tsk, 0); | 7463 | enqueue_task(rq, tsk, 0); |
| 7345 | 7464 | ||
| 7346 | task_rq_unlock(rq, tsk, &flags); | 7465 | task_rq_unlock(rq, tsk, &flags); |
| @@ -7358,10 +7477,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7358 | { | 7477 | { |
| 7359 | struct task_struct *g, *p; | 7478 | struct task_struct *g, *p; |
| 7360 | 7479 | ||
| 7361 | do_each_thread(g, p) { | 7480 | for_each_process_thread(g, p) { |
| 7362 | if (rt_task(p) && task_rq(p)->rt.tg == tg) | 7481 | if (rt_task(p) && task_group(p) == tg) |
| 7363 | return 1; | 7482 | return 1; |
| 7364 | } while_each_thread(g, p); | 7483 | } |
| 7365 | 7484 | ||
| 7366 | return 0; | 7485 | return 0; |
| 7367 | } | 7486 | } |
| @@ -7570,6 +7689,7 @@ static int sched_dl_global_constraints(void) | |||
| 7570 | u64 runtime = global_rt_runtime(); | 7689 | u64 runtime = global_rt_runtime(); |
| 7571 | u64 period = global_rt_period(); | 7690 | u64 period = global_rt_period(); |
| 7572 | u64 new_bw = to_ratio(period, runtime); | 7691 | u64 new_bw = to_ratio(period, runtime); |
| 7692 | struct dl_bw *dl_b; | ||
| 7573 | int cpu, ret = 0; | 7693 | int cpu, ret = 0; |
| 7574 | unsigned long flags; | 7694 | unsigned long flags; |
| 7575 | 7695 | ||
| @@ -7583,13 +7703,16 @@ static int sched_dl_global_constraints(void) | |||
| 7583 | * solutions is welcome! | 7703 | * solutions is welcome! |
| 7584 | */ | 7704 | */ |
| 7585 | for_each_possible_cpu(cpu) { | 7705 | for_each_possible_cpu(cpu) { |
| 7586 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7706 | rcu_read_lock_sched(); |
| 7707 | dl_b = dl_bw_of(cpu); | ||
| 7587 | 7708 | ||
| 7588 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7709 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7589 | if (new_bw < dl_b->total_bw) | 7710 | if (new_bw < dl_b->total_bw) |
| 7590 | ret = -EBUSY; | 7711 | ret = -EBUSY; |
| 7591 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7712 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7592 | 7713 | ||
| 7714 | rcu_read_unlock_sched(); | ||
| 7715 | |||
| 7593 | if (ret) | 7716 | if (ret) |
| 7594 | break; | 7717 | break; |
| 7595 | } | 7718 | } |
| @@ -7600,6 +7723,7 @@ static int sched_dl_global_constraints(void) | |||
| 7600 | static void sched_dl_do_global(void) | 7723 | static void sched_dl_do_global(void) |
| 7601 | { | 7724 | { |
| 7602 | u64 new_bw = -1; | 7725 | u64 new_bw = -1; |
| 7726 | struct dl_bw *dl_b; | ||
| 7603 | int cpu; | 7727 | int cpu; |
| 7604 | unsigned long flags; | 7728 | unsigned long flags; |
| 7605 | 7729 | ||
| @@ -7613,11 +7737,14 @@ static void sched_dl_do_global(void) | |||
| 7613 | * FIXME: As above... | 7737 | * FIXME: As above... |
| 7614 | */ | 7738 | */ |
| 7615 | for_each_possible_cpu(cpu) { | 7739 | for_each_possible_cpu(cpu) { |
| 7616 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7740 | rcu_read_lock_sched(); |
| 7741 | dl_b = dl_bw_of(cpu); | ||
| 7617 | 7742 | ||
| 7618 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7743 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7619 | dl_b->bw = new_bw; | 7744 | dl_b->bw = new_bw; |
| 7620 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7745 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7746 | |||
| 7747 | rcu_read_unlock_sched(); | ||
| 7621 | } | 7748 | } |
| 7622 | } | 7749 | } |
| 7623 | 7750 | ||
| @@ -7747,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 7747 | sched_offline_group(tg); | 7874 | sched_offline_group(tg); |
| 7748 | } | 7875 | } |
| 7749 | 7876 | ||
| 7877 | static void cpu_cgroup_fork(struct task_struct *task) | ||
| 7878 | { | ||
| 7879 | sched_move_task(task); | ||
| 7880 | } | ||
| 7881 | |||
| 7750 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, | 7882 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
| 7751 | struct cgroup_taskset *tset) | 7883 | struct cgroup_taskset *tset) |
| 7752 | { | 7884 | { |
| @@ -7998,7 +8130,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 7998 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; | 8130 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
| 7999 | 8131 | ||
| 8000 | quota = normalize_cfs_quota(tg, d); | 8132 | quota = normalize_cfs_quota(tg, d); |
| 8001 | parent_quota = parent_b->hierarchal_quota; | 8133 | parent_quota = parent_b->hierarchical_quota; |
| 8002 | 8134 | ||
| 8003 | /* | 8135 | /* |
| 8004 | * ensure max(child_quota) <= parent_quota, inherit when no | 8136 | * ensure max(child_quota) <= parent_quota, inherit when no |
| @@ -8009,7 +8141,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 8009 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | 8141 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) |
| 8010 | return -EINVAL; | 8142 | return -EINVAL; |
| 8011 | } | 8143 | } |
| 8012 | cfs_b->hierarchal_quota = quota; | 8144 | cfs_b->hierarchical_quota = quota; |
| 8013 | 8145 | ||
| 8014 | return 0; | 8146 | return 0; |
| 8015 | } | 8147 | } |
| @@ -8119,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |||
| 8119 | .css_free = cpu_cgroup_css_free, | 8251 | .css_free = cpu_cgroup_css_free, |
| 8120 | .css_online = cpu_cgroup_css_online, | 8252 | .css_online = cpu_cgroup_css_online, |
| 8121 | .css_offline = cpu_cgroup_css_offline, | 8253 | .css_offline = cpu_cgroup_css_offline, |
| 8254 | .fork = cpu_cgroup_fork, | ||
| 8122 | .can_attach = cpu_cgroup_can_attach, | 8255 | .can_attach = cpu_cgroup_can_attach, |
| 8123 | .attach = cpu_cgroup_attach, | 8256 | .attach = cpu_cgroup_attach, |
| 8124 | .exit = cpu_cgroup_exit, | 8257 | .exit = cpu_cgroup_exit, |
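The kernel/sched/core.c hunks above share one theme: the boolean `p->on_rq` becomes a tri-state read through `task_on_rq_queued()`, and the new `move_queued_task()` marks a task `TASK_ON_RQ_MIGRATING` while it has been taken off one runqueue but not yet put on another. The standalone sketch below only models that state machine; the `toy_*` names and structures are invented for illustration and are not kernel API.

```c
#include <stdio.h>

/* Models the tri-state ->on_rq values introduced by the hunks above. */
enum toy_on_rq_state {
	TOY_NOT_ON_RQ       = 0,
	TOY_ON_RQ_QUEUED    = 1,
	TOY_ON_RQ_MIGRATING = 2,
};

struct toy_task {
	enum toy_on_rq_state on_rq;
	int cpu;
};

/* Readers now ask "is it queued?" instead of testing a boolean. */
static int toy_task_on_rq_queued(const struct toy_task *p)
{
	return p->on_rq == TOY_ON_RQ_QUEUED;
}

/*
 * Models move_queued_task(): while the task is between runqueues it is
 * neither absent nor queued, so readers such as the task_sched_runtime()
 * hunk can tell the difference.
 */
static void toy_move_queued_task(struct toy_task *p, int new_cpu)
{
	p->on_rq = TOY_ON_RQ_MIGRATING;	/* dequeue_task() on the old rq */
	p->cpu = new_cpu;		/* set_task_cpu()               */
	p->on_rq = TOY_ON_RQ_QUEUED;	/* enqueue_task() on the new rq */
}

int main(void)
{
	struct toy_task p = { .on_rq = TOY_ON_RQ_QUEUED, .cpu = 0 };

	toy_move_queued_task(&p, 3);
	printf("queued=%d cpu=%d\n", toy_task_on_rq_queued(&p), p.cpu);
	return 0;
}
```

The rt_mutex_setprio(), set_user_nice(), __sched_setscheduler(), sched_setnuma() and sched_move_task() hunks keep their usual dequeue/put_prev_task ... set_curr_task/enqueue bracket and only change how the "queued" flag is read.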
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
| 108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
| 109 | 109 | ||
| 110 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { |
| 111 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
| 112 | later_mask, cpu_active_mask)) { | ||
| 113 | best_cpu = cpumask_any(later_mask); | 111 | best_cpu = cpumask_any(later_mask); |
| 114 | goto out; | 112 | goto out; |
| 115 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
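The cpudeadline.c hunk pairs with the find_later_rq() change further down: the caller now narrows `later_mask` to the root-domain span, the active CPUs and the task's affinity before calling `cpudl_find()`, which is left with a single intersection against the free-CPU set. A toy model of that split using plain bitmasks; all names and values here are made up for illustration.

```c
#include <stdio.h>

typedef unsigned long toy_cpumask;	/* one bit per CPU, CPUs 0..63 */

/* What the reworked find_later_rq() does before calling cpudl_find(). */
static toy_cpumask toy_build_later_mask(toy_cpumask rd_span,
					toy_cpumask cpu_active,
					toy_cpumask cpus_allowed)
{
	return rd_span & cpu_active & cpus_allowed;
}

/* What is left for cpudl_find(): intersect with the free-CPU set. */
static int toy_cpudl_find(toy_cpumask later_mask, toy_cpumask free_cpus)
{
	toy_cpumask m = later_mask & free_cpus;

	return m ? __builtin_ctzl(m) : -1;	/* pick "any" CPU in the mask */
}

int main(void)
{
	/* CPUs 0-3 in the domain, 1-3 active, task allowed on 2-3, CPU 3 free. */
	toy_cpumask later = toy_build_later_mask(0x0f, 0x0e, 0x0c);

	printf("best_cpu = %d\n", toy_cpudl_find(later, 0x08));
	return 0;
}
```

Doing the topology and affinity filtering once in the caller keeps the heap lookup in cpudl_find() to a single mask operation.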
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 288 | struct signal_struct *sig = tsk->signal; | 288 | struct signal_struct *sig = tsk->signal; |
| 289 | cputime_t utime, stime; | 289 | cputime_t utime, stime; |
| 290 | struct task_struct *t; | 290 | struct task_struct *t; |
| 291 | 291 | unsigned int seq, nextseq; | |
| 292 | times->utime = sig->utime; | 292 | unsigned long flags; |
| 293 | times->stime = sig->stime; | ||
| 294 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
| 295 | 293 | ||
| 296 | rcu_read_lock(); | 294 | rcu_read_lock(); |
| 297 | /* make sure we can trust tsk->thread_group list */ | 295 | /* Attempt a lockless read on the first round. */ |
| 298 | if (!likely(pid_alive(tsk))) | 296 | nextseq = 0; |
| 299 | goto out; | ||
| 300 | |||
| 301 | t = tsk; | ||
| 302 | do { | 297 | do { |
| 303 | task_cputime(t, &utime, &stime); | 298 | seq = nextseq; |
| 304 | times->utime += utime; | 299 | flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); |
| 305 | times->stime += stime; | 300 | times->utime = sig->utime; |
| 306 | times->sum_exec_runtime += task_sched_runtime(t); | 301 | times->stime = sig->stime; |
| 307 | } while_each_thread(tsk, t); | 302 | times->sum_exec_runtime = sig->sum_sched_runtime; |
| 308 | out: | 303 | |
| 304 | for_each_thread(tsk, t) { | ||
| 305 | task_cputime(t, &utime, &stime); | ||
| 306 | times->utime += utime; | ||
| 307 | times->stime += stime; | ||
| 308 | times->sum_exec_runtime += task_sched_runtime(t); | ||
| 309 | } | ||
| 310 | /* If lockless access failed, take the lock. */ | ||
| 311 | nextseq = 1; | ||
| 312 | } while (need_seqretry(&sig->stats_lock, seq)); | ||
| 313 | done_seqretry_irqrestore(&sig->stats_lock, seq, flags); | ||
| 309 | rcu_read_unlock(); | 314 | rcu_read_unlock(); |
| 310 | } | 315 | } |
| 311 | 316 | ||
| @@ -550,6 +555,23 @@ drop_precision: | |||
| 550 | } | 555 | } |
| 551 | 556 | ||
| 552 | /* | 557 | /* |
| 558 | * Atomically advance counter to the new value. Interrupts, vcpu | ||
| 559 | * scheduling, and scaling inaccuracies can cause cputime_advance | ||
| 560 | * to be occasionally called with a new value smaller than counter. | ||
| 561 | * Let's enforce atomicity. | ||
| 562 | * | ||
| 563 | * Normally a caller will only go through this loop once, or not | ||
| 564 | * at all in case a previous caller updated counter the same jiffy. | ||
| 565 | */ | ||
| 566 | static void cputime_advance(cputime_t *counter, cputime_t new) | ||
| 567 | { | ||
| 568 | cputime_t old; | ||
| 569 | |||
| 570 | while (new > (old = ACCESS_ONCE(*counter))) | ||
| 571 | cmpxchg_cputime(counter, old, new); | ||
| 572 | } | ||
| 573 | |||
| 574 | /* | ||
| 553 | * Adjust tick based cputime random precision against scheduler | 575 | * Adjust tick based cputime random precision against scheduler |
| 554 | * runtime accounting. | 576 | * runtime accounting. |
| 555 | */ | 577 | */ |
| @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 594 | utime = rtime - stime; | 616 | utime = rtime - stime; |
| 595 | } | 617 | } |
| 596 | 618 | ||
| 597 | /* | 619 | cputime_advance(&prev->stime, stime); |
| 598 | * If the tick based count grows faster than the scheduler one, | 620 | cputime_advance(&prev->utime, utime); |
| 599 | * the result of the scaling may go backward. | ||
| 600 | * Let's enforce monotonicity. | ||
| 601 | */ | ||
| 602 | prev->stime = max(prev->stime, stime); | ||
| 603 | prev->utime = max(prev->utime, utime); | ||
| 604 | 621 | ||
| 605 | out: | 622 | out: |
| 606 | *ut = prev->utime; | 623 | *ut = prev->utime; |
| @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 617 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 634 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
| 618 | } | 635 | } |
| 619 | 636 | ||
| 620 | /* | ||
| 621 | * Must be called with siglock held. | ||
| 622 | */ | ||
| 623 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 637 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 624 | { | 638 | { |
| 625 | struct task_cputime cputime; | 639 | struct task_cputime cputime; |
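The new `cputime_advance()` above enforces monotonicity with a compare-and-swap loop instead of the old `max()` assignments, so concurrent updaters can never move a counter backwards. Below is a minimal userspace model using C11 atomics; `counter_advance` and the variables are illustrative names, not kernel symbols.

```c
#include <stdatomic.h>
#include <stdio.h>

/* Advance *counter to new, but never move it backwards. */
static void counter_advance(_Atomic unsigned long long *counter,
			    unsigned long long new)
{
	unsigned long long old = atomic_load(counter);

	/* Retry only while we would still move the counter forward. */
	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;	/* a failed CAS reloads old with the current value */
}

int main(void)
{
	_Atomic unsigned long long stime = 100;

	counter_advance(&stime, 150);	/* advances */
	counter_advance(&stime, 120);	/* ignored: would go backwards */
	printf("stime = %llu\n", (unsigned long long)atomic_load(&stime));
	return 0;
}
```

The loop mirrors the kernel version: it keeps retrying only while the proposed value would still move the counter forward, so a racing update to a larger value simply wins and this caller drops out.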
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..5285332392d5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -518,21 +518,29 @@ again: | |||
| 518 | } | 518 | } |
| 519 | 519 | ||
| 520 | /* | 520 | /* |
| 521 | * We need to take care of a possible races here. In fact, the | 521 | * We need to take care of several possible races here: |
| 522 | * task might have changed its scheduling policy to something | 522 | * |
| 523 | * different from SCHED_DEADLINE or changed its reservation | 523 | * - the task might have changed its scheduling policy |
| 524 | * parameters (through sched_setattr()). | 524 | * to something different than SCHED_DEADLINE |
| 525 | * - the task might have changed its reservation parameters | ||
| 526 | * (through sched_setattr()) | ||
| 527 | * - the task might have been boosted by someone else and | ||
| 528 | * might be in the boosting/deboosting path | ||
| 529 | * | ||
| 530 | * In all these cases we bail out, as the task is already | ||
| 531 | * in the runqueue or is going to be enqueued back anyway. | ||
| 525 | */ | 532 | */ |
| 526 | if (!dl_task(p) || dl_se->dl_new) | 533 | if (!dl_task(p) || dl_se->dl_new || |
| 534 | dl_se->dl_boosted || !dl_se->dl_throttled) | ||
| 527 | goto unlock; | 535 | goto unlock; |
| 528 | 536 | ||
| 529 | sched_clock_tick(); | 537 | sched_clock_tick(); |
| 530 | update_rq_clock(rq); | 538 | update_rq_clock(rq); |
| 531 | dl_se->dl_throttled = 0; | 539 | dl_se->dl_throttled = 0; |
| 532 | dl_se->dl_yielded = 0; | 540 | dl_se->dl_yielded = 0; |
| 533 | if (p->on_rq) { | 541 | if (task_on_rq_queued(p)) { |
| 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 542 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
| 535 | if (task_has_dl_policy(rq->curr)) | 543 | if (dl_task(rq->curr)) |
| 536 | check_preempt_curr_dl(rq, p, 0); | 544 | check_preempt_curr_dl(rq, p, 0); |
| 537 | else | 545 | else |
| 538 | resched_curr(rq); | 546 | resched_curr(rq); |
| @@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 847 | * smaller than our one... OTW we keep our runtime and | 855 | * smaller than our one... OTW we keep our runtime and |
| 848 | * deadline. | 856 | * deadline. |
| 849 | */ | 857 | */ |
| 850 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | 858 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { |
| 851 | pi_se = &pi_task->dl; | 859 | pi_se = &pi_task->dl; |
| 860 | } else if (!dl_prio(p->normal_prio)) { | ||
| 861 | /* | ||
| 862 | * Special case in which we have a !SCHED_DEADLINE task | ||
| 863 | * that is going to be deboosted, but exceeds its | ||
| 864 | * runtime while doing so. No point in replenishing | ||
| 865 | * it, as it's going to return to its original | ||
| 866 | * scheduling class after this. | ||
| 867 | */ | ||
| 868 | BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); | ||
| 869 | return; | ||
| 870 | } | ||
| 852 | 871 | ||
| 853 | /* | 872 | /* |
| 854 | * If p is throttled, we do nothing. In fact, if it exhausted | 873 | * If p is throttled, we do nothing. In fact, if it exhausted |
| @@ -997,10 +1016,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
| 997 | #ifdef CONFIG_SCHED_HRTICK | 1016 | #ifdef CONFIG_SCHED_HRTICK |
| 998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | 1017 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) |
| 999 | { | 1018 | { |
| 1000 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | 1019 | hrtick_start(rq, p->dl.runtime); |
| 1001 | |||
| 1002 | if (delta > 10000) | ||
| 1003 | hrtick_start(rq, p->dl.runtime); | ||
| 1004 | } | 1020 | } |
| 1005 | #endif | 1021 | #endif |
| 1006 | 1022 | ||
| @@ -1030,7 +1046,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1030 | * means a stop task can slip in, in which case we need to | 1046 | * means a stop task can slip in, in which case we need to |
| 1031 | * re-start task selection. | 1047 | * re-start task selection. |
| 1032 | */ | 1048 | */ |
| 1033 | if (rq->stop && rq->stop->on_rq) | 1049 | if (rq->stop && task_on_rq_queued(rq->stop)) |
| 1034 | return RETRY_TASK; | 1050 | return RETRY_TASK; |
| 1035 | } | 1051 | } |
| 1036 | 1052 | ||
| @@ -1124,10 +1140,8 @@ static void set_curr_task_dl(struct rq *rq) | |||
| 1124 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1140 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1125 | { | 1141 | { |
| 1126 | if (!task_running(rq, p) && | 1142 | if (!task_running(rq, p) && |
| 1127 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1143 | cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 1128 | (p->nr_cpus_allowed > 1)) | ||
| 1129 | return 1; | 1144 | return 1; |
| 1130 | |||
| 1131 | return 0; | 1145 | return 0; |
| 1132 | } | 1146 | } |
| 1133 | 1147 | ||
| @@ -1158,7 +1172,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | |||
| 1158 | static int find_later_rq(struct task_struct *task) | 1172 | static int find_later_rq(struct task_struct *task) |
| 1159 | { | 1173 | { |
| 1160 | struct sched_domain *sd; | 1174 | struct sched_domain *sd; |
| 1161 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | 1175 | struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); |
| 1162 | int this_cpu = smp_processor_id(); | 1176 | int this_cpu = smp_processor_id(); |
| 1163 | int best_cpu, cpu = task_cpu(task); | 1177 | int best_cpu, cpu = task_cpu(task); |
| 1164 | 1178 | ||
| @@ -1169,6 +1183,13 @@ static int find_later_rq(struct task_struct *task) | |||
| 1169 | if (task->nr_cpus_allowed == 1) | 1183 | if (task->nr_cpus_allowed == 1) |
| 1170 | return -1; | 1184 | return -1; |
| 1171 | 1185 | ||
| 1186 | /* | ||
| 1187 | * We have to consider system topology and task affinity | ||
| 1188 | * first, then we can look for a suitable cpu. | ||
| 1189 | */ | ||
| 1190 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
| 1191 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
| 1192 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
| 1172 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1193 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
| 1173 | task, later_mask); | 1194 | task, later_mask); |
| 1174 | if (best_cpu == -1) | 1195 | if (best_cpu == -1) |
| @@ -1257,7 +1278,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
| 1257 | if (unlikely(task_rq(task) != rq || | 1278 | if (unlikely(task_rq(task) != rq || |
| 1258 | !cpumask_test_cpu(later_rq->cpu, | 1279 | !cpumask_test_cpu(later_rq->cpu, |
| 1259 | &task->cpus_allowed) || | 1280 | &task->cpus_allowed) || |
| 1260 | task_running(rq, task) || !task->on_rq)) { | 1281 | task_running(rq, task) || |
| 1282 | !task_on_rq_queued(task))) { | ||
| 1261 | double_unlock_balance(rq, later_rq); | 1283 | double_unlock_balance(rq, later_rq); |
| 1262 | later_rq = NULL; | 1284 | later_rq = NULL; |
| 1263 | break; | 1285 | break; |
| @@ -1296,7 +1318,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | |||
| 1296 | BUG_ON(task_current(rq, p)); | 1318 | BUG_ON(task_current(rq, p)); |
| 1297 | BUG_ON(p->nr_cpus_allowed <= 1); | 1319 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1298 | 1320 | ||
| 1299 | BUG_ON(!p->on_rq); | 1321 | BUG_ON(!task_on_rq_queued(p)); |
| 1300 | BUG_ON(!dl_task(p)); | 1322 | BUG_ON(!dl_task(p)); |
| 1301 | 1323 | ||
| 1302 | return p; | 1324 | return p; |
| @@ -1443,7 +1465,7 @@ static int pull_dl_task(struct rq *this_rq) | |||
| 1443 | dl_time_before(p->dl.deadline, | 1465 | dl_time_before(p->dl.deadline, |
| 1444 | this_rq->dl.earliest_dl.curr))) { | 1466 | this_rq->dl.earliest_dl.curr))) { |
| 1445 | WARN_ON(p == src_rq->curr); | 1467 | WARN_ON(p == src_rq->curr); |
| 1446 | WARN_ON(!p->on_rq); | 1468 | WARN_ON(!task_on_rq_queued(p)); |
| 1447 | 1469 | ||
| 1448 | /* | 1470 | /* |
| 1449 | * Then we pull iff p has actually an earlier | 1471 | * Then we pull iff p has actually an earlier |
| @@ -1569,6 +1591,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1569 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1591 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) |
| 1570 | hrtimer_try_to_cancel(&p->dl.dl_timer); | 1592 | hrtimer_try_to_cancel(&p->dl.dl_timer); |
| 1571 | 1593 | ||
| 1594 | __dl_clear_params(p); | ||
| 1595 | |||
| 1572 | #ifdef CONFIG_SMP | 1596 | #ifdef CONFIG_SMP |
| 1573 | /* | 1597 | /* |
| 1574 | * Since this might be the only -deadline task on the rq, | 1598 | * Since this might be the only -deadline task on the rq, |
| @@ -1596,14 +1620,18 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1596 | if (unlikely(p->dl.dl_throttled)) | 1620 | if (unlikely(p->dl.dl_throttled)) |
| 1597 | return; | 1621 | return; |
| 1598 | 1622 | ||
| 1599 | if (p->on_rq && rq->curr != p) { | 1623 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1600 | #ifdef CONFIG_SMP | 1624 | #ifdef CONFIG_SMP |
| 1601 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1625 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) |
| 1602 | /* Only reschedule if pushing failed */ | 1626 | /* Only reschedule if pushing failed */ |
| 1603 | check_resched = 0; | 1627 | check_resched = 0; |
| 1604 | #endif /* CONFIG_SMP */ | 1628 | #endif /* CONFIG_SMP */ |
| 1605 | if (check_resched && task_has_dl_policy(rq->curr)) | 1629 | if (check_resched) { |
| 1606 | check_preempt_curr_dl(rq, p, 0); | 1630 | if (dl_task(rq->curr)) |
| 1631 | check_preempt_curr_dl(rq, p, 0); | ||
| 1632 | else | ||
| 1633 | resched_curr(rq); | ||
| 1634 | } | ||
| 1607 | } | 1635 | } |
| 1608 | } | 1636 | } |
| 1609 | 1637 | ||
| @@ -1614,7 +1642,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1614 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | 1642 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, |
| 1615 | int oldprio) | 1643 | int oldprio) |
| 1616 | { | 1644 | { |
| 1617 | if (p->on_rq || rq->curr == p) { | 1645 | if (task_on_rq_queued(p) || rq->curr == p) { |
| 1618 | #ifdef CONFIG_SMP | 1646 | #ifdef CONFIG_SMP |
| 1619 | /* | 1647 | /* |
| 1620 | * This might be too much, but unfortunately | 1648 | * This might be too much, but unfortunately |
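Several deadline.c hunks above compare clock values with `dl_time_before()`, for example in the pull_dl_task() warning. As far as I recall, that helper is a wrap-safe "earlier than" test done through a signed difference; the sketch below shows the idea under that assumption, and the authoritative definition lives in include/linux/sched/deadline.h.

```c
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a is earlier than b" for monotonically increasing u64 clocks. */
static int toy_dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap  = UINT64_MAX - 5;	/* clock just before wrapping */
	uint64_t after_wrap = 10;		/* clock just after wrapping  */

	printf("%d\n", toy_dl_time_before(100, 200));		   /* 1 */
	printf("%d\n", toy_dl_time_before(near_wrap, after_wrap)); /* 1 */
	return 0;
}
```

A plain `a < b` would get the second case wrong once the clock wraps, which is why the signed-difference form is used for deadline ordering.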
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..ce33780d8f20 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
| 151 | { | 151 | { |
| 152 | struct task_struct *g, *p; | 152 | struct task_struct *g, *p; |
| 153 | unsigned long flags; | ||
| 154 | 153 | ||
| 155 | SEQ_printf(m, | 154 | SEQ_printf(m, |
| 156 | "\nrunnable tasks:\n" | 155 | "\nrunnable tasks:\n" |
| @@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 159 | "------------------------------------------------------" | 158 | "------------------------------------------------------" |
| 160 | "----------------------------------------------------\n"); | 159 | "----------------------------------------------------\n"); |
| 161 | 160 | ||
| 162 | read_lock_irqsave(&tasklist_lock, flags); | 161 | rcu_read_lock(); |
| 163 | 162 | for_each_process_thread(g, p) { | |
| 164 | do_each_thread(g, p) { | ||
| 165 | if (task_cpu(p) != rq_cpu) | 163 | if (task_cpu(p) != rq_cpu) |
| 166 | continue; | 164 | continue; |
| 167 | 165 | ||
| 168 | print_task(m, rq, p); | 166 | print_task(m, rq, p); |
| 169 | } while_each_thread(g, p); | 167 | } |
| 170 | 168 | rcu_read_unlock(); | |
| 171 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 172 | } | 169 | } |
| 173 | 170 | ||
| 174 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 171 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
| @@ -333,9 +330,7 @@ do { \ | |||
| 333 | print_cfs_stats(m, cpu); | 330 | print_cfs_stats(m, cpu); |
| 334 | print_rt_stats(m, cpu); | 331 | print_rt_stats(m, cpu); |
| 335 | 332 | ||
| 336 | rcu_read_lock(); | ||
| 337 | print_rq(m, rq, cpu); | 333 | print_rq(m, rq, cpu); |
| 338 | rcu_read_unlock(); | ||
| 339 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 334 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
| 340 | SEQ_printf(m, "\n"); | 335 | SEQ_printf(m, "\n"); |
| 341 | } | 336 | } |
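The debug.c hunks, like several core.c ones above, replace the `do_each_thread()`/`while_each_thread()` pair with `rcu_read_lock()` plus `for_each_process_thread()`, which is a plain nested loop over every process and then every thread of that process. The toy below only mimics that iteration shape; the data structures are invented and have nothing to do with the kernel's task lists.

```c
#include <stdio.h>

struct toy_thread { int tid; };
struct toy_process {
	int nr_threads;
	struct toy_thread threads[4];
};

/* Outer loop over processes, inner loop over each process's threads. */
#define toy_for_each_process_thread(procs, nproc, g, p)			\
	for (g = (procs); g < (procs) + (nproc); g++)			\
		for (p = g->threads; p < g->threads + g->nr_threads; p++)

int main(void)
{
	struct toy_process procs[2] = {
		{ .nr_threads = 2, .threads = { { 1 }, { 2 } } },
		{ .nr_threads = 1, .threads = { { 3 } } },
	};
	struct toy_process *g;
	struct toy_thread *p;

	toy_for_each_process_thread(procs, 2, g, p)
		printf("tid %d\n", p->tid);
	return 0;
}
```

Because the new iterator is an ordinary for loop rather than a do/while pair, the callers above can drop the matching `while_each_thread()` line and use a normal closing brace.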
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..34baa60f8a7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
| 24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
| 26 | #include <linux/cpuidle.h> | ||
| 26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 27 | #include <linux/profile.h> | 28 | #include <linux/profile.h> |
| 28 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
| @@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 665 | } | 666 | } |
| 666 | 667 | ||
| 667 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
| 669 | static int select_idle_sibling(struct task_struct *p, int cpu); | ||
| 668 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 669 | 671 | ||
| 670 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| @@ -826,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) | |||
| 826 | 828 | ||
| 827 | static unsigned int task_scan_min(struct task_struct *p) | 829 | static unsigned int task_scan_min(struct task_struct *p) |
| 828 | { | 830 | { |
| 831 | unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); | ||
| 829 | unsigned int scan, floor; | 832 | unsigned int scan, floor; |
| 830 | unsigned int windows = 1; | 833 | unsigned int windows = 1; |
| 831 | 834 | ||
| 832 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | 835 | if (scan_size < MAX_SCAN_WINDOW) |
| 833 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | 836 | windows = MAX_SCAN_WINDOW / scan_size; |
| 834 | floor = 1000 / windows; | 837 | floor = 1000 / windows; |
| 835 | 838 | ||
| 836 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | 839 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); |
| @@ -1038,7 +1041,8 @@ struct numa_stats { | |||
| 1038 | */ | 1041 | */ |
| 1039 | static void update_numa_stats(struct numa_stats *ns, int nid) | 1042 | static void update_numa_stats(struct numa_stats *ns, int nid) |
| 1040 | { | 1043 | { |
| 1041 | int cpu, cpus = 0; | 1044 | int smt, cpu, cpus = 0; |
| 1045 | unsigned long capacity; | ||
| 1042 | 1046 | ||
| 1043 | memset(ns, 0, sizeof(*ns)); | 1047 | memset(ns, 0, sizeof(*ns)); |
| 1044 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1048 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
| @@ -1062,8 +1066,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
| 1062 | if (!cpus) | 1066 | if (!cpus) |
| 1063 | return; | 1067 | return; |
| 1064 | 1068 | ||
| 1065 | ns->task_capacity = | 1069 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ |
| 1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1070 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); |
| 1071 | capacity = cpus / smt; /* cores */ | ||
| 1072 | |||
| 1073 | ns->task_capacity = min_t(unsigned, capacity, | ||
| 1074 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); | ||
| 1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1075 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
| 1068 | } | 1076 | } |
| 1069 | 1077 | ||
| @@ -1157,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1157 | long moveimp = imp; | 1165 | long moveimp = imp; |
| 1158 | 1166 | ||
| 1159 | rcu_read_lock(); | 1167 | rcu_read_lock(); |
| 1160 | cur = ACCESS_ONCE(dst_rq->curr); | 1168 | |
| 1161 | if (cur->pid == 0) /* idle */ | 1169 | raw_spin_lock_irq(&dst_rq->lock); |
| 1170 | cur = dst_rq->curr; | ||
| 1171 | /* | ||
| 1172 | * No need to move the exiting task, and this ensures that ->curr | ||
| 1173 | * wasn't reaped and thus get_task_struct() in task_numa_assign() | ||
| 1174 | * is safe under RCU read lock. | ||
| 1175 | * Note that rcu_read_lock() itself can't protect from the final | ||
| 1176 | * put_task_struct() after the last schedule(). | ||
| 1177 | */ | ||
| 1178 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
| 1162 | cur = NULL; | 1179 | cur = NULL; |
| 1180 | raw_spin_unlock_irq(&dst_rq->lock); | ||
| 1163 | 1181 | ||
| 1164 | /* | 1182 | /* |
| 1165 | * "imp" is the fault differential for the source task between the | 1183 | * "imp" is the fault differential for the source task between the |
| @@ -1206,7 +1224,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1206 | 1224 | ||
| 1207 | if (!cur) { | 1225 | if (!cur) { |
| 1208 | /* Is there capacity at our destination? */ | 1226 | /* Is there capacity at our destination? */ |
| 1209 | if (env->src_stats.has_free_capacity && | 1227 | if (env->src_stats.nr_running <= env->src_stats.task_capacity && |
| 1210 | !env->dst_stats.has_free_capacity) | 1228 | !env->dst_stats.has_free_capacity) |
| 1211 | goto unlock; | 1229 | goto unlock; |
| 1212 | 1230 | ||
| @@ -1252,6 +1270,13 @@ balance: | |||
| 1252 | if (load_too_imbalanced(src_load, dst_load, env)) | 1270 | if (load_too_imbalanced(src_load, dst_load, env)) |
| 1253 | goto unlock; | 1271 | goto unlock; |
| 1254 | 1272 | ||
| 1273 | /* | ||
| 1274 | * One idle CPU per node is evaluated for a task numa move. | ||
| 1275 | * Call select_idle_sibling to maybe find a better one. | ||
| 1276 | */ | ||
| 1277 | if (!cur) | ||
| 1278 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | ||
| 1279 | |||
| 1255 | assign: | 1280 | assign: |
| 1256 | task_numa_assign(env, cur, imp); | 1281 | task_numa_assign(env, cur, imp); |
| 1257 | unlock: | 1282 | unlock: |
| @@ -1506,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1506 | * scanning faster if shared accesses dominate as it may | 1531 | * scanning faster if shared accesses dominate as it may |
| 1507 | * simply bounce migrations uselessly | 1532 | * simply bounce migrations uselessly |
| 1508 | */ | 1533 | */ |
| 1509 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1534 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); |
| 1510 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1535 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
| 1511 | } | 1536 | } |
| 1512 | 1537 | ||
| @@ -1775,7 +1800,7 @@ void task_numa_free(struct task_struct *p) | |||
| 1775 | list_del(&p->numa_entry); | 1800 | list_del(&p->numa_entry); |
| 1776 | grp->nr_tasks--; | 1801 | grp->nr_tasks--; |
| 1777 | spin_unlock_irqrestore(&grp->lock, flags); | 1802 | spin_unlock_irqrestore(&grp->lock, flags); |
| 1778 | rcu_assign_pointer(p->numa_group, NULL); | 1803 | RCU_INIT_POINTER(p->numa_group, NULL); |
| 1779 | put_numa_group(grp); | 1804 | put_numa_group(grp); |
| 1780 | } | 1805 | } |
| 1781 | 1806 | ||
| @@ -1804,10 +1829,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1804 | if (!p->mm) | 1829 | if (!p->mm) |
| 1805 | return; | 1830 | return; |
| 1806 | 1831 | ||
| 1807 | /* Do not worry about placement if exiting */ | ||
| 1808 | if (p->state == TASK_DEAD) | ||
| 1809 | return; | ||
| 1810 | |||
| 1811 | /* Allocate buffer to track faults on a per-node basis */ | 1832 | /* Allocate buffer to track faults on a per-node basis */ |
| 1812 | if (unlikely(!p->numa_faults_memory)) { | 1833 | if (unlikely(!p->numa_faults_memory)) { |
| 1813 | int size = sizeof(*p->numa_faults_memory) * | 1834 | int size = sizeof(*p->numa_faults_memory) * |
| @@ -1946,7 +1967,7 @@ void task_numa_work(struct callback_head *work) | |||
| 1946 | vma = mm->mmap; | 1967 | vma = mm->mmap; |
| 1947 | } | 1968 | } |
| 1948 | for (; vma; vma = vma->vm_next) { | 1969 | for (; vma; vma = vma->vm_next) { |
| 1949 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) | 1970 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) |
| 1950 | continue; | 1971 | continue; |
| 1951 | 1972 | ||
| 1952 | /* | 1973 | /* |
| @@ -2211,8 +2232,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
| 2211 | 2232 | ||
| 2212 | /* | 2233 | /* |
| 2213 | * As y^PERIOD = 1/2, we can combine | 2234 | * As y^PERIOD = 1/2, we can combine |
| 2214 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | 2235 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) |
| 2215 | * With a look-up table which covers k^n (n<PERIOD) | 2236 | * With a look-up table which covers y^n (n<PERIOD) |
| 2216 | * | 2237 | * |
| 2217 | * To achieve constant time decay_load. | 2238 | * To achieve constant time decay_load. |
| 2218 | */ | 2239 | */ |
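The corrected comment describes how the constant-time decay is built: whole half-life periods are handled by a shift, and the remaining n % PERIOD steps come from a small lookup table. A userspace model of that idea, using doubles and an assumed 32-step half-life instead of the kernel's fixed-point inverse table:

```c
/* Userspace model of the constant-time decay described above.
 * Assumptions: PERIOD = 32 (one half-life per 32 steps) and a
 * double-precision lookup table; the kernel uses fixed-point
 * inverse multipliers instead, so this is only an illustration. */
#include <math.h>
#include <stdio.h>

#define PERIOD 32

static double ytab[PERIOD];		/* ytab[k] = y^k for k < PERIOD */

static void init_ytab(void)
{
	double y = pow(0.5, 1.0 / PERIOD);	/* y^PERIOD == 1/2 */
	double v = 1.0;

	for (int k = 0; k < PERIOD; k++) {
		ytab[k] = v;
		v *= y;
	}
}

/* decay val by y^n in O(1): halve once per full PERIOD, then
 * apply the remainder from the lookup table. */
static double decay_load(double val, unsigned int n)
{
	val = ldexp(val, -(int)(n / PERIOD));	/* val / 2^(n/PERIOD) */
	return val * ytab[n % PERIOD];
}

int main(void)
{
	init_ytab();
	printf("%f\n", decay_load(1024.0, 32));	/* ~512: one half-life */
	printf("%f\n", decay_load(1024.0, 48));	/* between 256 and 512 */
	return 0;
}
```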
| @@ -2377,6 +2398,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
| 2377 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 2398 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
| 2378 | tg_contrib -= cfs_rq->tg_load_contrib; | 2399 | tg_contrib -= cfs_rq->tg_load_contrib; |
| 2379 | 2400 | ||
| 2401 | if (!tg_contrib) | ||
| 2402 | return; | ||
| 2403 | |||
| 2380 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 2404 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
| 2381 | atomic_long_add(tg_contrib, &tg->load_avg); | 2405 | atomic_long_add(tg_contrib, &tg->load_avg); |
| 2382 | cfs_rq->tg_load_contrib += tg_contrib; | 2406 | cfs_rq->tg_load_contrib += tg_contrib; |
| @@ -3892,14 +3916,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 3892 | resched_curr(rq); | 3916 | resched_curr(rq); |
| 3893 | return; | 3917 | return; |
| 3894 | } | 3918 | } |
| 3895 | |||
| 3896 | /* | ||
| 3897 | * Don't schedule slices shorter than 10000ns, that just | ||
| 3898 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 3899 | */ | ||
| 3900 | if (rq->curr != p) | ||
| 3901 | delta = max_t(s64, 10000LL, delta); | ||
| 3902 | |||
| 3903 | hrtick_start(rq, delta); | 3919 | hrtick_start(rq, delta); |
| 3904 | } | 3920 | } |
| 3905 | } | 3921 | } |
| @@ -4087,7 +4103,7 @@ static unsigned long capacity_of(int cpu) | |||
| 4087 | static unsigned long cpu_avg_load_per_task(int cpu) | 4103 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 4088 | { | 4104 | { |
| 4089 | struct rq *rq = cpu_rq(cpu); | 4105 | struct rq *rq = cpu_rq(cpu); |
| 4090 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 4106 | unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); |
| 4091 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4107 | unsigned long load_avg = rq->cfs.runnable_load_avg; |
| 4092 | 4108 | ||
| 4093 | if (nr_running) | 4109 | if (nr_running) |
| @@ -4276,8 +4292,8 @@ static int wake_wide(struct task_struct *p) | |||
| 4276 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 4292 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
| 4277 | { | 4293 | { |
| 4278 | s64 this_load, load; | 4294 | s64 this_load, load; |
| 4295 | s64 this_eff_load, prev_eff_load; | ||
| 4279 | int idx, this_cpu, prev_cpu; | 4296 | int idx, this_cpu, prev_cpu; |
| 4280 | unsigned long tl_per_task; | ||
| 4281 | struct task_group *tg; | 4297 | struct task_group *tg; |
| 4282 | unsigned long weight; | 4298 | unsigned long weight; |
| 4283 | int balanced; | 4299 | int balanced; |
| @@ -4320,47 +4336,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 4320 | * Otherwise check if either cpus are near enough in load to allow this | 4336 | * Otherwise check if either cpus are near enough in load to allow this |
| 4321 | * task to be woken on this_cpu. | 4337 | * task to be woken on this_cpu. |
| 4322 | */ | 4338 | */ |
| 4323 | if (this_load > 0) { | 4339 | this_eff_load = 100; |
| 4324 | s64 this_eff_load, prev_eff_load; | 4340 | this_eff_load *= capacity_of(prev_cpu); |
| 4341 | |||
| 4342 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 4343 | prev_eff_load *= capacity_of(this_cpu); | ||
| 4325 | 4344 | ||
| 4326 | this_eff_load = 100; | 4345 | if (this_load > 0) { |
| 4327 | this_eff_load *= capacity_of(prev_cpu); | ||
| 4328 | this_eff_load *= this_load + | 4346 | this_eff_load *= this_load + |
| 4329 | effective_load(tg, this_cpu, weight, weight); | 4347 | effective_load(tg, this_cpu, weight, weight); |
| 4330 | 4348 | ||
| 4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 4332 | prev_eff_load *= capacity_of(this_cpu); | ||
| 4333 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4349 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
| 4350 | } | ||
| 4334 | 4351 | ||
| 4335 | balanced = this_eff_load <= prev_eff_load; | 4352 | balanced = this_eff_load <= prev_eff_load; |
| 4336 | } else | ||
| 4337 | balanced = true; | ||
| 4338 | |||
| 4339 | /* | ||
| 4340 | * If the currently running task will sleep within | ||
| 4341 | * a reasonable amount of time then attract this newly | ||
| 4342 | * woken task: | ||
| 4343 | */ | ||
| 4344 | if (sync && balanced) | ||
| 4345 | return 1; | ||
| 4346 | 4353 | ||
| 4347 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 4354 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); |
| 4348 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 4349 | 4355 | ||
| 4350 | if (balanced || | 4356 | if (!balanced) |
| 4351 | (this_load <= load && | 4357 | return 0; |
| 4352 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | ||
| 4353 | /* | ||
| 4354 | * This domain has SD_WAKE_AFFINE and | ||
| 4355 | * p is cache cold in this domain, and | ||
| 4356 | * there is no bad imbalance. | ||
| 4357 | */ | ||
| 4358 | schedstat_inc(sd, ttwu_move_affine); | ||
| 4359 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | ||
| 4360 | 4358 | ||
| 4361 | return 1; | 4359 | schedstat_inc(sd, ttwu_move_affine); |
| 4362 | } | 4360 | schedstat_inc(p, se.statistics.nr_wakeups_affine); |
| 4363 | return 0; | 4361 | |
| 4362 | return 1; | ||
| 4364 | } | 4363 | } |
| 4365 | 4364 | ||
| 4366 | /* | 4365 | /* |
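In the rewritten wake_affine(), both effective loads start from their capacity and imbalance factors and are scaled by the load terms only when this_load is positive, so the old sync/balanced shortcut and the load-per-task heuristic disappear. A simplified userspace model of the comparison follows; the effective_load() contributions are folded into the plain load numbers and the imbalance_pct of 125 is an assumed example value:

```c
/* Simplified model of the capacity-weighted wake_affine comparison
 * above; loads already include the would-be effective_load() terms,
 * and the names here are illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct cpu_view {
	long long load;		/* load seen on the CPU */
	unsigned long capacity;	/* relative compute capacity */
};

static bool wake_affine_balanced(const struct cpu_view *this_cpu,
				 const struct cpu_view *prev_cpu,
				 unsigned int imbalance_pct)
{
	long long this_eff_load = 100;
	long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	/* each side is weighted by the *other* CPU's capacity */
	this_eff_load *= prev_cpu->capacity;
	prev_eff_load *= this_cpu->capacity;

	if (this_cpu->load > 0) {
		this_eff_load *= this_cpu->load;
		prev_eff_load *= prev_cpu->load;
	}

	/* pulling the task here is fine if this CPU does not end up
	 * relatively busier than where the task came from */
	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	struct cpu_view waker_cpu = { .load = 300, .capacity = 1024 };
	struct cpu_view prev_cpu  = { .load = 500, .capacity = 1024 };

	printf("balanced=%d\n",
	       wake_affine_balanced(&waker_cpu, &prev_cpu, 125));
	return 0;
}
```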
| @@ -4428,20 +4427,46 @@ static int | |||
| 4428 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 4427 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
| 4429 | { | 4428 | { |
| 4430 | unsigned long load, min_load = ULONG_MAX; | 4429 | unsigned long load, min_load = ULONG_MAX; |
| 4431 | int idlest = -1; | 4430 | unsigned int min_exit_latency = UINT_MAX; |
| 4431 | u64 latest_idle_timestamp = 0; | ||
| 4432 | int least_loaded_cpu = this_cpu; | ||
| 4433 | int shallowest_idle_cpu = -1; | ||
| 4432 | int i; | 4434 | int i; |
| 4433 | 4435 | ||
| 4434 | /* Traverse only the allowed CPUs */ | 4436 | /* Traverse only the allowed CPUs */ |
| 4435 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 4437 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 4436 | load = weighted_cpuload(i); | 4438 | if (idle_cpu(i)) { |
| 4437 | 4439 | struct rq *rq = cpu_rq(i); | |
| 4438 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4440 | struct cpuidle_state *idle = idle_get_state(rq); |
| 4439 | min_load = load; | 4441 | if (idle && idle->exit_latency < min_exit_latency) { |
| 4440 | idlest = i; | 4442 | /* |
| 4443 | * We give priority to a CPU whose idle state | ||
| 4444 | * has the smallest exit latency irrespective | ||
| 4445 | * of any idle timestamp. | ||
| 4446 | */ | ||
| 4447 | min_exit_latency = idle->exit_latency; | ||
| 4448 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4449 | shallowest_idle_cpu = i; | ||
| 4450 | } else if ((!idle || idle->exit_latency == min_exit_latency) && | ||
| 4451 | rq->idle_stamp > latest_idle_timestamp) { | ||
| 4452 | /* | ||
| 4453 | * If equal or no active idle state, then | ||
| 4454 | * the most recently idled CPU might have | ||
| 4455 | * a warmer cache. | ||
| 4456 | */ | ||
| 4457 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4458 | shallowest_idle_cpu = i; | ||
| 4459 | } | ||
| 4460 | } else { | ||
| 4461 | load = weighted_cpuload(i); | ||
| 4462 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
| 4463 | min_load = load; | ||
| 4464 | least_loaded_cpu = i; | ||
| 4465 | } | ||
| 4441 | } | 4466 | } |
| 4442 | } | 4467 | } |
| 4443 | 4468 | ||
| 4444 | return idlest; | 4469 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
| 4445 | } | 4470 | } |
| 4446 | 4471 | ||
| 4447 | /* | 4472 | /* |
| @@ -4513,11 +4538,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4513 | if (p->nr_cpus_allowed == 1) | 4538 | if (p->nr_cpus_allowed == 1) |
| 4514 | return prev_cpu; | 4539 | return prev_cpu; |
| 4515 | 4540 | ||
| 4516 | if (sd_flag & SD_BALANCE_WAKE) { | 4541 | if (sd_flag & SD_BALANCE_WAKE) |
| 4517 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 4542 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
| 4518 | want_affine = 1; | ||
| 4519 | new_cpu = prev_cpu; | ||
| 4520 | } | ||
| 4521 | 4543 | ||
| 4522 | rcu_read_lock(); | 4544 | rcu_read_lock(); |
| 4523 | for_each_domain(cpu, tmp) { | 4545 | for_each_domain(cpu, tmp) { |
| @@ -4704,7 +4726,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 4704 | return; | 4726 | return; |
| 4705 | 4727 | ||
| 4706 | /* | 4728 | /* |
| 4707 | * This is possible from callers such as move_task(), in which we | 4729 | * This is possible from callers such as attach_tasks(), in which we |
| 4708 | * unconditionally check_prempt_curr() after an enqueue (which may have | 4730 | * unconditionally check_prempt_curr() after an enqueue (which may have |
| 4709 | * lead to a throttle). This both saves work and prevents false | 4731 | * lead to a throttle). This both saves work and prevents false |
| 4710 | * next-buddy nomination below. | 4732 | * next-buddy nomination below. |
| @@ -5112,27 +5134,18 @@ struct lb_env { | |||
| 5112 | unsigned int loop_max; | 5134 | unsigned int loop_max; |
| 5113 | 5135 | ||
| 5114 | enum fbq_type fbq_type; | 5136 | enum fbq_type fbq_type; |
| 5137 | struct list_head tasks; | ||
| 5115 | }; | 5138 | }; |
| 5116 | 5139 | ||
| 5117 | /* | 5140 | /* |
| 5118 | * move_task - move a task from one runqueue to another runqueue. | ||
| 5119 | * Both runqueues must be locked. | ||
| 5120 | */ | ||
| 5121 | static void move_task(struct task_struct *p, struct lb_env *env) | ||
| 5122 | { | ||
| 5123 | deactivate_task(env->src_rq, p, 0); | ||
| 5124 | set_task_cpu(p, env->dst_cpu); | ||
| 5125 | activate_task(env->dst_rq, p, 0); | ||
| 5126 | check_preempt_curr(env->dst_rq, p, 0); | ||
| 5127 | } | ||
| 5128 | |||
| 5129 | /* | ||
| 5130 | * Is this task likely cache-hot: | 5141 | * Is this task likely cache-hot: |
| 5131 | */ | 5142 | */ |
| 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) | 5143 | static int task_hot(struct task_struct *p, struct lb_env *env) |
| 5133 | { | 5144 | { |
| 5134 | s64 delta; | 5145 | s64 delta; |
| 5135 | 5146 | ||
| 5147 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5148 | |||
| 5136 | if (p->sched_class != &fair_sched_class) | 5149 | if (p->sched_class != &fair_sched_class) |
| 5137 | return 0; | 5150 | return 0; |
| 5138 | 5151 | ||
| @@ -5252,6 +5265,9 @@ static | |||
| 5252 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 5265 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
| 5253 | { | 5266 | { |
| 5254 | int tsk_cache_hot = 0; | 5267 | int tsk_cache_hot = 0; |
| 5268 | |||
| 5269 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5270 | |||
| 5255 | /* | 5271 | /* |
| 5256 | * We do not migrate tasks that are: | 5272 | * We do not migrate tasks that are: |
| 5257 | * 1) throttled_lb_pair, or | 5273 | * 1) throttled_lb_pair, or |
| @@ -5310,24 +5326,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5310 | if (!tsk_cache_hot) | 5326 | if (!tsk_cache_hot) |
| 5311 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5327 | tsk_cache_hot = migrate_degrades_locality(p, env); |
| 5312 | 5328 | ||
| 5313 | if (migrate_improves_locality(p, env)) { | 5329 | if (migrate_improves_locality(p, env) || !tsk_cache_hot || |
| 5314 | #ifdef CONFIG_SCHEDSTATS | 5330 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 5315 | if (tsk_cache_hot) { | ||
| 5316 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 5317 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 5318 | } | ||
| 5319 | #endif | ||
| 5320 | return 1; | ||
| 5321 | } | ||
| 5322 | |||
| 5323 | if (!tsk_cache_hot || | ||
| 5324 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | ||
| 5325 | |||
| 5326 | if (tsk_cache_hot) { | 5331 | if (tsk_cache_hot) { |
| 5327 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 5332 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
| 5328 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 5333 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
| 5329 | } | 5334 | } |
| 5330 | |||
| 5331 | return 1; | 5335 | return 1; |
| 5332 | } | 5336 | } |
| 5333 | 5337 | ||
| @@ -5336,47 +5340,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5336 | } | 5340 | } |
| 5337 | 5341 | ||
| 5338 | /* | 5342 | /* |
| 5339 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 5343 | * detach_task() -- detach the task for the migration specified in env |
| 5344 | */ | ||
| 5345 | static void detach_task(struct task_struct *p, struct lb_env *env) | ||
| 5346 | { | ||
| 5347 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5348 | |||
| 5349 | deactivate_task(env->src_rq, p, 0); | ||
| 5350 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 5351 | set_task_cpu(p, env->dst_cpu); | ||
| 5352 | } | ||
| 5353 | |||
| 5354 | /* | ||
| 5355 | * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as | ||
| 5340 | * part of active balancing operations within "domain". | 5356 | * part of active balancing operations within "domain". |
| 5341 | * Returns 1 if successful and 0 otherwise. | ||
| 5342 | * | 5357 | * |
| 5343 | * Called with both runqueues locked. | 5358 | * Returns a task if successful and NULL otherwise. |
| 5344 | */ | 5359 | */ |
| 5345 | static int move_one_task(struct lb_env *env) | 5360 | static struct task_struct *detach_one_task(struct lb_env *env) |
| 5346 | { | 5361 | { |
| 5347 | struct task_struct *p, *n; | 5362 | struct task_struct *p, *n; |
| 5348 | 5363 | ||
| 5364 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5365 | |||
| 5349 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 5366 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
| 5350 | if (!can_migrate_task(p, env)) | 5367 | if (!can_migrate_task(p, env)) |
| 5351 | continue; | 5368 | continue; |
| 5352 | 5369 | ||
| 5353 | move_task(p, env); | 5370 | detach_task(p, env); |
| 5371 | |||
| 5354 | /* | 5372 | /* |
| 5355 | * Right now, this is only the second place move_task() | 5373 | * Right now, this is only the second place where |
| 5356 | * is called, so we can safely collect move_task() | 5374 | * lb_gained[env->idle] is updated (other is detach_tasks) |
| 5357 | * stats here rather than inside move_task(). | 5375 | * so we can safely collect stats here rather than |
| 5376 | * inside detach_tasks(). | ||
| 5358 | */ | 5377 | */ |
| 5359 | schedstat_inc(env->sd, lb_gained[env->idle]); | 5378 | schedstat_inc(env->sd, lb_gained[env->idle]); |
| 5360 | return 1; | 5379 | return p; |
| 5361 | } | 5380 | } |
| 5362 | return 0; | 5381 | return NULL; |
| 5363 | } | 5382 | } |
| 5364 | 5383 | ||
| 5365 | static const unsigned int sched_nr_migrate_break = 32; | 5384 | static const unsigned int sched_nr_migrate_break = 32; |
| 5366 | 5385 | ||
| 5367 | /* | 5386 | /* |
| 5368 | * move_tasks tries to move up to imbalance weighted load from busiest to | 5387 | * detach_tasks() -- tries to detach up to imbalance weighted load from |
| 5369 | * this_rq, as part of a balancing operation within domain "sd". | 5388 | * busiest_rq, as part of a balancing operation within domain "sd". |
| 5370 | * Returns 1 if successful and 0 otherwise. | ||
| 5371 | * | 5389 | * |
| 5372 | * Called with both runqueues locked. | 5390 | * Returns number of detached tasks if successful and 0 otherwise. |
| 5373 | */ | 5391 | */ |
| 5374 | static int move_tasks(struct lb_env *env) | 5392 | static int detach_tasks(struct lb_env *env) |
| 5375 | { | 5393 | { |
| 5376 | struct list_head *tasks = &env->src_rq->cfs_tasks; | 5394 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
| 5377 | struct task_struct *p; | 5395 | struct task_struct *p; |
| 5378 | unsigned long load; | 5396 | unsigned long load; |
| 5379 | int pulled = 0; | 5397 | int detached = 0; |
| 5398 | |||
| 5399 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5380 | 5400 | ||
| 5381 | if (env->imbalance <= 0) | 5401 | if (env->imbalance <= 0) |
| 5382 | return 0; | 5402 | return 0; |
| @@ -5407,14 +5427,16 @@ static int move_tasks(struct lb_env *env) | |||
| 5407 | if ((load / 2) > env->imbalance) | 5427 | if ((load / 2) > env->imbalance) |
| 5408 | goto next; | 5428 | goto next; |
| 5409 | 5429 | ||
| 5410 | move_task(p, env); | 5430 | detach_task(p, env); |
| 5411 | pulled++; | 5431 | list_add(&p->se.group_node, &env->tasks); |
| 5432 | |||
| 5433 | detached++; | ||
| 5412 | env->imbalance -= load; | 5434 | env->imbalance -= load; |
| 5413 | 5435 | ||
| 5414 | #ifdef CONFIG_PREEMPT | 5436 | #ifdef CONFIG_PREEMPT |
| 5415 | /* | 5437 | /* |
| 5416 | * NEWIDLE balancing is a source of latency, so preemptible | 5438 | * NEWIDLE balancing is a source of latency, so preemptible |
| 5417 | * kernels will stop after the first task is pulled to minimize | 5439 | * kernels will stop after the first task is detached to minimize |
| 5418 | * the critical section. | 5440 | * the critical section. |
| 5419 | */ | 5441 | */ |
| 5420 | if (env->idle == CPU_NEWLY_IDLE) | 5442 | if (env->idle == CPU_NEWLY_IDLE) |
| @@ -5434,13 +5456,58 @@ next: | |||
| 5434 | } | 5456 | } |
| 5435 | 5457 | ||
| 5436 | /* | 5458 | /* |
| 5437 | * Right now, this is one of only two places move_task() is called, | 5459 | * Right now, this is one of only two places we collect this stat |
| 5438 | * so we can safely collect move_task() stats here rather than | 5460 | * so we can safely collect detach_one_task() stats here rather |
| 5439 | * inside move_task(). | 5461 | * than inside detach_one_task(). |
| 5440 | */ | 5462 | */ |
| 5441 | schedstat_add(env->sd, lb_gained[env->idle], pulled); | 5463 | schedstat_add(env->sd, lb_gained[env->idle], detached); |
| 5464 | |||
| 5465 | return detached; | ||
| 5466 | } | ||
| 5467 | |||
| 5468 | /* | ||
| 5469 | * attach_task() -- attach the task detached by detach_task() to its new rq. | ||
| 5470 | */ | ||
| 5471 | static void attach_task(struct rq *rq, struct task_struct *p) | ||
| 5472 | { | ||
| 5473 | lockdep_assert_held(&rq->lock); | ||
| 5474 | |||
| 5475 | BUG_ON(task_rq(p) != rq); | ||
| 5476 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 5477 | activate_task(rq, p, 0); | ||
| 5478 | check_preempt_curr(rq, p, 0); | ||
| 5479 | } | ||
| 5480 | |||
| 5481 | /* | ||
| 5482 | * attach_one_task() -- attaches the task returned from detach_one_task() to | ||
| 5483 | * its new rq. | ||
| 5484 | */ | ||
| 5485 | static void attach_one_task(struct rq *rq, struct task_struct *p) | ||
| 5486 | { | ||
| 5487 | raw_spin_lock(&rq->lock); | ||
| 5488 | attach_task(rq, p); | ||
| 5489 | raw_spin_unlock(&rq->lock); | ||
| 5490 | } | ||
| 5491 | |||
| 5492 | /* | ||
| 5493 | * attach_tasks() -- attaches all tasks detached by detach_tasks() to their | ||
| 5494 | * new rq. | ||
| 5495 | */ | ||
| 5496 | static void attach_tasks(struct lb_env *env) | ||
| 5497 | { | ||
| 5498 | struct list_head *tasks = &env->tasks; | ||
| 5499 | struct task_struct *p; | ||
| 5500 | |||
| 5501 | raw_spin_lock(&env->dst_rq->lock); | ||
| 5502 | |||
| 5503 | while (!list_empty(tasks)) { | ||
| 5504 | p = list_first_entry(tasks, struct task_struct, se.group_node); | ||
| 5505 | list_del_init(&p->se.group_node); | ||
| 5506 | |||
| 5507 | attach_task(env->dst_rq, p); | ||
| 5508 | } | ||
| 5442 | 5509 | ||
| 5443 | return pulled; | 5510 | raw_spin_unlock(&env->dst_rq->lock); |
| 5444 | } | 5511 | } |
| 5445 | 5512 | ||
| 5446 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5513 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -5559,6 +5626,13 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 5559 | #endif | 5626 | #endif |
| 5560 | 5627 | ||
| 5561 | /********** Helpers for find_busiest_group ************************/ | 5628 | /********** Helpers for find_busiest_group ************************/ |
| 5629 | |||
| 5630 | enum group_type { | ||
| 5631 | group_other = 0, | ||
| 5632 | group_imbalanced, | ||
| 5633 | group_overloaded, | ||
| 5634 | }; | ||
| 5635 | |||
| 5562 | /* | 5636 | /* |
| 5563 | * sg_lb_stats - stats of a sched_group required for load_balancing | 5637 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 5564 | */ | 5638 | */ |
| @@ -5572,7 +5646,7 @@ struct sg_lb_stats { | |||
| 5572 | unsigned int group_capacity_factor; | 5646 | unsigned int group_capacity_factor; |
| 5573 | unsigned int idle_cpus; | 5647 | unsigned int idle_cpus; |
| 5574 | unsigned int group_weight; | 5648 | unsigned int group_weight; |
| 5575 | int group_imb; /* Is there an imbalance in the group ? */ | 5649 | enum group_type group_type; |
| 5576 | int group_has_free_capacity; | 5650 | int group_has_free_capacity; |
| 5577 | #ifdef CONFIG_NUMA_BALANCING | 5651 | #ifdef CONFIG_NUMA_BALANCING |
| 5578 | unsigned int nr_numa_running; | 5652 | unsigned int nr_numa_running; |
| @@ -5610,6 +5684,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 5610 | .total_capacity = 0UL, | 5684 | .total_capacity = 0UL, |
| 5611 | .busiest_stat = { | 5685 | .busiest_stat = { |
| 5612 | .avg_load = 0UL, | 5686 | .avg_load = 0UL, |
| 5687 | .sum_nr_running = 0, | ||
| 5688 | .group_type = group_other, | ||
| 5613 | }, | 5689 | }, |
| 5614 | }; | 5690 | }; |
| 5615 | } | 5691 | } |
| @@ -5652,19 +5728,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
| 5652 | return default_scale_capacity(sd, cpu); | 5728 | return default_scale_capacity(sd, cpu); |
| 5653 | } | 5729 | } |
| 5654 | 5730 | ||
| 5655 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5731 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5656 | { | 5732 | { |
| 5657 | unsigned long weight = sd->span_weight; | 5733 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
| 5658 | unsigned long smt_gain = sd->smt_gain; | 5734 | return sd->smt_gain / sd->span_weight; |
| 5659 | |||
| 5660 | smt_gain /= weight; | ||
| 5661 | 5735 | ||
| 5662 | return smt_gain; | 5736 | return SCHED_CAPACITY_SCALE; |
| 5663 | } | 5737 | } |
| 5664 | 5738 | ||
| 5665 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5739 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5666 | { | 5740 | { |
| 5667 | return default_scale_smt_capacity(sd, cpu); | 5741 | return default_scale_cpu_capacity(sd, cpu); |
| 5668 | } | 5742 | } |
| 5669 | 5743 | ||
| 5670 | static unsigned long scale_rt_capacity(int cpu) | 5744 | static unsigned long scale_rt_capacity(int cpu) |
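After the rename, the SMT special case lives inside the capacity helper itself: domains that share CPU capacity split smt_gain across their siblings, while everything else reports full scale. A quick numeric sketch under assumed constants (smt_gain of 1178, SCHED_CAPACITY_SCALE of 1024):

```c
/* Sketch of default_scale_cpu_capacity() above with assumed values:
 * two SMT siblings split an smt_gain of 1178, a non-SMT CPU reports
 * the full SCHED_CAPACITY_SCALE of 1024. */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

static unsigned long scale_cpu_capacity(bool share_cpucapacity,
					unsigned long span_weight,
					unsigned long smt_gain)
{
	if (share_cpucapacity && span_weight > 1)
		return smt_gain / span_weight;	/* e.g. 1178 / 2 = 589 */

	return SCHED_CAPACITY_SCALE;
}

int main(void)
{
	printf("SMT sibling: %lu\n", scale_cpu_capacity(true, 2, 1178));
	printf("full core:   %lu\n", scale_cpu_capacity(false, 1, 1178));
	return 0;
}
```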
| @@ -5703,18 +5777,15 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5703 | 5777 | ||
| 5704 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 5778 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5705 | { | 5779 | { |
| 5706 | unsigned long weight = sd->span_weight; | ||
| 5707 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 5780 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
| 5708 | struct sched_group *sdg = sd->groups; | 5781 | struct sched_group *sdg = sd->groups; |
| 5709 | 5782 | ||
| 5710 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { | 5783 | if (sched_feat(ARCH_CAPACITY)) |
| 5711 | if (sched_feat(ARCH_CAPACITY)) | 5784 | capacity *= arch_scale_cpu_capacity(sd, cpu); |
| 5712 | capacity *= arch_scale_smt_capacity(sd, cpu); | 5785 | else |
| 5713 | else | 5786 | capacity *= default_scale_cpu_capacity(sd, cpu); |
| 5714 | capacity *= default_scale_smt_capacity(sd, cpu); | ||
| 5715 | 5787 | ||
| 5716 | capacity >>= SCHED_CAPACITY_SHIFT; | 5788 | capacity >>= SCHED_CAPACITY_SHIFT; |
| 5717 | } | ||
| 5718 | 5789 | ||
| 5719 | sdg->sgc->capacity_orig = capacity; | 5790 | sdg->sgc->capacity_orig = capacity; |
| 5720 | 5791 | ||
| @@ -5891,6 +5962,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
| 5891 | return capacity_factor; | 5962 | return capacity_factor; |
| 5892 | } | 5963 | } |
| 5893 | 5964 | ||
| 5965 | static enum group_type | ||
| 5966 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | ||
| 5967 | { | ||
| 5968 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | ||
| 5969 | return group_overloaded; | ||
| 5970 | |||
| 5971 | if (sg_imbalanced(group)) | ||
| 5972 | return group_imbalanced; | ||
| 5973 | |||
| 5974 | return group_other; | ||
| 5975 | } | ||
| 5976 | |||
| 5894 | /** | 5977 | /** |
| 5895 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 5978 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 5896 | * @env: The load balancing environment. | 5979 | * @env: The load balancing environment. |
| @@ -5920,7 +6003,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5920 | load = source_load(i, load_idx); | 6003 | load = source_load(i, load_idx); |
| 5921 | 6004 | ||
| 5922 | sgs->group_load += load; | 6005 | sgs->group_load += load; |
| 5923 | sgs->sum_nr_running += rq->nr_running; | 6006 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 5924 | 6007 | ||
| 5925 | if (rq->nr_running > 1) | 6008 | if (rq->nr_running > 1) |
| 5926 | *overload = true; | 6009 | *overload = true; |
| @@ -5942,9 +6025,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5942 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6025 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 5943 | 6026 | ||
| 5944 | sgs->group_weight = group->group_weight; | 6027 | sgs->group_weight = group->group_weight; |
| 5945 | |||
| 5946 | sgs->group_imb = sg_imbalanced(group); | ||
| 5947 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | 6028 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
| 6029 | sgs->group_type = group_classify(group, sgs); | ||
| 5948 | 6030 | ||
| 5949 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6031 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
| 5950 | sgs->group_has_free_capacity = 1; | 6032 | sgs->group_has_free_capacity = 1; |
| @@ -5968,13 +6050,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5968 | struct sched_group *sg, | 6050 | struct sched_group *sg, |
| 5969 | struct sg_lb_stats *sgs) | 6051 | struct sg_lb_stats *sgs) |
| 5970 | { | 6052 | { |
| 5971 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 6053 | struct sg_lb_stats *busiest = &sds->busiest_stat; |
| 5972 | return false; | ||
| 5973 | 6054 | ||
| 5974 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6055 | if (sgs->group_type > busiest->group_type) |
| 5975 | return true; | 6056 | return true; |
| 5976 | 6057 | ||
| 5977 | if (sgs->group_imb) | 6058 | if (sgs->group_type < busiest->group_type) |
| 6059 | return false; | ||
| 6060 | |||
| 6061 | if (sgs->avg_load <= busiest->avg_load) | ||
| 6062 | return false; | ||
| 6063 | |||
| 6064 | /* This is the busiest node in its class. */ | ||
| 6065 | if (!(env->sd->flags & SD_ASYM_PACKING)) | ||
| 5978 | return true; | 6066 | return true; |
| 5979 | 6067 | ||
| 5980 | /* | 6068 | /* |
| @@ -5982,8 +6070,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5982 | * numbered CPUs in the group, therefore mark all groups | 6070 | * numbered CPUs in the group, therefore mark all groups |
| 5983 | * higher than ourself as busy. | 6071 | * higher than ourself as busy. |
| 5984 | */ | 6072 | */ |
| 5985 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 6073 | if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { |
| 5986 | env->dst_cpu < group_first_cpu(sg)) { | ||
| 5987 | if (!sds->busiest) | 6074 | if (!sds->busiest) |
| 5988 | return true; | 6075 | return true; |
| 5989 | 6076 | ||
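Because the new enum orders group types by severity, update_sd_pick_busiest() can rank candidate groups with two integer comparisons before it ever looks at average load. A small sketch of that ordering; the struct here is a stand-in for sg_lb_stats, not the kernel type:

```c
/* Severity-ordered comparison used above, modeled in plain C. */
#include <stdbool.h>
#include <stdio.h>

enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct group_stats {
	enum group_type type;
	unsigned long avg_load;
};

static bool pick_busiest(const struct group_stats *sgs,
			 const struct group_stats *busiest)
{
	if (sgs->type > busiest->type)		/* worse class always wins */
		return true;
	if (sgs->type < busiest->type)
		return false;

	return sgs->avg_load > busiest->avg_load; /* same class: compare load */
}

int main(void)
{
	struct group_stats imb  = { group_imbalanced, 100 };
	struct group_stats over = { group_overloaded, 50 };

	/* an overloaded group beats an imbalanced one despite lower load */
	printf("%d\n", pick_busiest(&over, &imb));
	return 0;
}
```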
| @@ -6228,7 +6315,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6228 | local = &sds->local_stat; | 6315 | local = &sds->local_stat; |
| 6229 | busiest = &sds->busiest_stat; | 6316 | busiest = &sds->busiest_stat; |
| 6230 | 6317 | ||
| 6231 | if (busiest->group_imb) { | 6318 | if (busiest->group_type == group_imbalanced) { |
| 6232 | /* | 6319 | /* |
| 6233 | * In the group_imb case we cannot rely on group-wide averages | 6320 | * In the group_imb case we cannot rely on group-wide averages |
| 6234 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 6321 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
| @@ -6248,12 +6335,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6248 | return fix_small_imbalance(env, sds); | 6335 | return fix_small_imbalance(env, sds); |
| 6249 | } | 6336 | } |
| 6250 | 6337 | ||
| 6251 | if (!busiest->group_imb) { | 6338 | /* |
| 6252 | /* | 6339 | * If there aren't any idle cpus, avoid creating some. |
| 6253 | * Don't want to pull so many tasks that a group would go idle. | 6340 | */ |
| 6254 | * Except of course for the group_imb case, since then we might | 6341 | if (busiest->group_type == group_overloaded && |
| 6255 | * have to drop below capacity to reach cpu-load equilibrium. | 6342 | local->group_type == group_overloaded) { |
| 6256 | */ | ||
| 6257 | load_above_capacity = | 6343 | load_above_capacity = |
| 6258 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6344 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
| 6259 | 6345 | ||
| @@ -6337,7 +6423,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6337 | * work because they assume all things are equal, which typically | 6423 | * work because they assume all things are equal, which typically |
| 6338 | * isn't true due to cpus_allowed constraints and the like. | 6424 | * isn't true due to cpus_allowed constraints and the like. |
| 6339 | */ | 6425 | */ |
| 6340 | if (busiest->group_imb) | 6426 | if (busiest->group_type == group_imbalanced) |
| 6341 | goto force_balance; | 6427 | goto force_balance; |
| 6342 | 6428 | ||
| 6343 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6429 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| @@ -6346,7 +6432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6346 | goto force_balance; | 6432 | goto force_balance; |
| 6347 | 6433 | ||
| 6348 | /* | 6434 | /* |
| 6349 | * If the local group is more busy than the selected busiest group | 6435 | * If the local group is busier than the selected busiest group |
| 6350 | * don't try and pull any tasks. | 6436 | * don't try and pull any tasks. |
| 6351 | */ | 6437 | */ |
| 6352 | if (local->avg_load >= busiest->avg_load) | 6438 | if (local->avg_load >= busiest->avg_load) |
| @@ -6361,13 +6447,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6361 | 6447 | ||
| 6362 | if (env->idle == CPU_IDLE) { | 6448 | if (env->idle == CPU_IDLE) { |
| 6363 | /* | 6449 | /* |
| 6364 | * This cpu is idle. If the busiest group load doesn't | 6450 | * This cpu is idle. If the busiest group is not overloaded |
| 6365 | * have more tasks than the number of available cpu's and | 6451 | * and there is no imbalance between this and busiest group |
| 6366 | * there is no imbalance between this and busiest group | 6452 | * wrt idle cpus, it is balanced. The imbalance becomes |
| 6367 | * wrt to idle cpu's, it is balanced. | 6453 | * significant if the diff is greater than 1; otherwise we |
| 6454 | * might end up just moving the imbalance to another group | ||
| 6368 | */ | 6455 | */ |
| 6369 | if ((local->idle_cpus < busiest->idle_cpus) && | 6456 | if ((busiest->group_type != group_overloaded) && |
| 6370 | busiest->sum_nr_running <= busiest->group_weight) | 6457 | (local->idle_cpus <= (busiest->idle_cpus + 1))) |
| 6371 | goto out_balanced; | 6458 | goto out_balanced; |
| 6372 | } else { | 6459 | } else { |
| 6373 | /* | 6460 | /* |
| @@ -6539,7 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 6539 | struct sched_group *group; | 6626 | struct sched_group *group; |
| 6540 | struct rq *busiest; | 6627 | struct rq *busiest; |
| 6541 | unsigned long flags; | 6628 | unsigned long flags; |
| 6542 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); | 6629 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); |
| 6543 | 6630 | ||
| 6544 | struct lb_env env = { | 6631 | struct lb_env env = { |
| 6545 | .sd = sd, | 6632 | .sd = sd, |
| @@ -6550,6 +6637,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 6550 | .loop_break = sched_nr_migrate_break, | 6637 | .loop_break = sched_nr_migrate_break, |
| 6551 | .cpus = cpus, | 6638 | .cpus = cpus, |
| 6552 | .fbq_type = all, | 6639 | .fbq_type = all, |
| 6640 | .tasks = LIST_HEAD_INIT(env.tasks), | ||
| 6553 | }; | 6641 | }; |
| 6554 | 6642 | ||
| 6555 | /* | 6643 | /* |
| @@ -6599,23 +6687,30 @@ redo: | |||
| 6599 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6687 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 6600 | 6688 | ||
| 6601 | more_balance: | 6689 | more_balance: |
| 6602 | local_irq_save(flags); | 6690 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 6603 | double_rq_lock(env.dst_rq, busiest); | ||
| 6604 | 6691 | ||
| 6605 | /* | 6692 | /* |
| 6606 | * cur_ld_moved - load moved in current iteration | 6693 | * cur_ld_moved - load moved in current iteration |
| 6607 | * ld_moved - cumulative load moved across iterations | 6694 | * ld_moved - cumulative load moved across iterations |
| 6608 | */ | 6695 | */ |
| 6609 | cur_ld_moved = move_tasks(&env); | 6696 | cur_ld_moved = detach_tasks(&env); |
| 6610 | ld_moved += cur_ld_moved; | ||
| 6611 | double_rq_unlock(env.dst_rq, busiest); | ||
| 6612 | local_irq_restore(flags); | ||
| 6613 | 6697 | ||
| 6614 | /* | 6698 | /* |
| 6615 | * some other cpu did the load balance for us. | 6699 | * We've detached some tasks from busiest_rq. Every |
| 6700 | * task is marked "TASK_ON_RQ_MIGRATING", so we can safely | ||
| 6701 | * unlock busiest->lock and be sure that nobody can | ||
| 6702 | * manipulate the tasks in parallel. | ||
| 6703 | * See task_rq_lock() family for the details. | ||
| 6616 | */ | 6704 | */ |
| 6617 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 6705 | |
| 6618 | resched_cpu(env.dst_cpu); | 6706 | raw_spin_unlock(&busiest->lock); |
| 6707 | |||
| 6708 | if (cur_ld_moved) { | ||
| 6709 | attach_tasks(&env); | ||
| 6710 | ld_moved += cur_ld_moved; | ||
| 6711 | } | ||
| 6712 | |||
| 6713 | local_irq_restore(flags); | ||
| 6619 | 6714 | ||
| 6620 | if (env.flags & LBF_NEED_BREAK) { | 6715 | if (env.flags & LBF_NEED_BREAK) { |
| 6621 | env.flags &= ~LBF_NEED_BREAK; | 6716 | env.flags &= ~LBF_NEED_BREAK; |
| @@ -6665,10 +6760,8 @@ more_balance: | |||
| 6665 | if (sd_parent) { | 6760 | if (sd_parent) { |
| 6666 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | 6761 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
| 6667 | 6762 | ||
| 6668 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6763 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) |
| 6669 | *group_imbalance = 1; | 6764 | *group_imbalance = 1; |
| 6670 | } else if (*group_imbalance) | ||
| 6671 | *group_imbalance = 0; | ||
| 6672 | } | 6765 | } |
| 6673 | 6766 | ||
| 6674 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6767 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| @@ -6679,7 +6772,7 @@ more_balance: | |||
| 6679 | env.loop_break = sched_nr_migrate_break; | 6772 | env.loop_break = sched_nr_migrate_break; |
| 6680 | goto redo; | 6773 | goto redo; |
| 6681 | } | 6774 | } |
| 6682 | goto out_balanced; | 6775 | goto out_all_pinned; |
| 6683 | } | 6776 | } |
| 6684 | } | 6777 | } |
| 6685 | 6778 | ||
| @@ -6744,7 +6837,7 @@ more_balance: | |||
| 6744 | * If we've begun active balancing, start to back off. This | 6837 | * If we've begun active balancing, start to back off. This |
| 6745 | * case may not be covered by the all_pinned logic if there | 6838 | * case may not be covered by the all_pinned logic if there |
| 6746 | * is only 1 task on the busy runqueue (because we don't call | 6839 | * is only 1 task on the busy runqueue (because we don't call |
| 6747 | * move_tasks). | 6840 | * detach_tasks). |
| 6748 | */ | 6841 | */ |
| 6749 | if (sd->balance_interval < sd->max_interval) | 6842 | if (sd->balance_interval < sd->max_interval) |
| 6750 | sd->balance_interval *= 2; | 6843 | sd->balance_interval *= 2; |
| @@ -6753,6 +6846,23 @@ more_balance: | |||
| 6753 | goto out; | 6846 | goto out; |
| 6754 | 6847 | ||
| 6755 | out_balanced: | 6848 | out_balanced: |
| 6849 | /* | ||
| 6850 | * We reach balance although we may have faced some affinity | ||
| 6851 | * constraints. Clear the imbalance flag if it was set. | ||
| 6852 | */ | ||
| 6853 | if (sd_parent) { | ||
| 6854 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | ||
| 6855 | |||
| 6856 | if (*group_imbalance) | ||
| 6857 | *group_imbalance = 0; | ||
| 6858 | } | ||
| 6859 | |||
| 6860 | out_all_pinned: | ||
| 6861 | /* | ||
| 6862 | * We reach balance because all tasks are pinned at this level so | ||
| 6863 | * we can't migrate them. Leave the imbalance flag set so the parent level | ||
| 6864 | * can try to migrate them. | ||
| 6865 | */ | ||
| 6756 | schedstat_inc(sd, lb_balanced[idle]); | 6866 | schedstat_inc(sd, lb_balanced[idle]); |
| 6757 | 6867 | ||
| 6758 | sd->nr_balance_failed = 0; | 6868 | sd->nr_balance_failed = 0; |
| @@ -6914,6 +7024,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6914 | int target_cpu = busiest_rq->push_cpu; | 7024 | int target_cpu = busiest_rq->push_cpu; |
| 6915 | struct rq *target_rq = cpu_rq(target_cpu); | 7025 | struct rq *target_rq = cpu_rq(target_cpu); |
| 6916 | struct sched_domain *sd; | 7026 | struct sched_domain *sd; |
| 7027 | struct task_struct *p = NULL; | ||
| 6917 | 7028 | ||
| 6918 | raw_spin_lock_irq(&busiest_rq->lock); | 7029 | raw_spin_lock_irq(&busiest_rq->lock); |
| 6919 | 7030 | ||
| @@ -6933,9 +7044,6 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6933 | */ | 7044 | */ |
| 6934 | BUG_ON(busiest_rq == target_rq); | 7045 | BUG_ON(busiest_rq == target_rq); |
| 6935 | 7046 | ||
| 6936 | /* move a task from busiest_rq to target_rq */ | ||
| 6937 | double_lock_balance(busiest_rq, target_rq); | ||
| 6938 | |||
| 6939 | /* Search for an sd spanning us and the target CPU. */ | 7047 | /* Search for an sd spanning us and the target CPU. */ |
| 6940 | rcu_read_lock(); | 7048 | rcu_read_lock(); |
| 6941 | for_each_domain(target_cpu, sd) { | 7049 | for_each_domain(target_cpu, sd) { |
| @@ -6956,16 +7064,22 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6956 | 7064 | ||
| 6957 | schedstat_inc(sd, alb_count); | 7065 | schedstat_inc(sd, alb_count); |
| 6958 | 7066 | ||
| 6959 | if (move_one_task(&env)) | 7067 | p = detach_one_task(&env); |
| 7068 | if (p) | ||
| 6960 | schedstat_inc(sd, alb_pushed); | 7069 | schedstat_inc(sd, alb_pushed); |
| 6961 | else | 7070 | else |
| 6962 | schedstat_inc(sd, alb_failed); | 7071 | schedstat_inc(sd, alb_failed); |
| 6963 | } | 7072 | } |
| 6964 | rcu_read_unlock(); | 7073 | rcu_read_unlock(); |
| 6965 | double_unlock_balance(busiest_rq, target_rq); | ||
| 6966 | out_unlock: | 7074 | out_unlock: |
| 6967 | busiest_rq->active_balance = 0; | 7075 | busiest_rq->active_balance = 0; |
| 6968 | raw_spin_unlock_irq(&busiest_rq->lock); | 7076 | raw_spin_unlock(&busiest_rq->lock); |
| 7077 | |||
| 7078 | if (p) | ||
| 7079 | attach_one_task(target_rq, p); | ||
| 7080 | |||
| 7081 | local_irq_enable(); | ||
| 7082 | |||
| 6969 | return 0; | 7083 | return 0; |
| 6970 | } | 7084 | } |
| 6971 | 7085 | ||
| @@ -7465,7 +7579,7 @@ static void task_fork_fair(struct task_struct *p) | |||
| 7465 | static void | 7579 | static void |
| 7466 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | 7580 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
| 7467 | { | 7581 | { |
| 7468 | if (!p->se.on_rq) | 7582 | if (!task_on_rq_queued(p)) |
| 7469 | return; | 7583 | return; |
| 7470 | 7584 | ||
| 7471 | /* | 7585 | /* |
| @@ -7490,11 +7604,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7490 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7604 | * switched back to the fair class the enqueue_entity(.flags=0) will |
| 7491 | * do the right thing. | 7605 | * do the right thing. |
| 7492 | * | 7606 | * |
| 7493 | * If it's on_rq, then the dequeue_entity(.flags=0) will already | 7607 | * If it's queued, then the dequeue_entity(.flags=0) will already |
| 7494 | * have normalized the vruntime, if it's !on_rq, then only when | 7608 | * have normalized the vruntime, if it's !queued, then only when |
| 7495 | * the task is sleeping will it still have non-normalized vruntime. | 7609 | * the task is sleeping will it still have non-normalized vruntime. |
| 7496 | */ | 7610 | */ |
| 7497 | if (!p->on_rq && p->state != TASK_RUNNING) { | 7611 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { |
| 7498 | /* | 7612 | /* |
| 7499 | * Fix up our vruntime so that the current sleep doesn't | 7613 | * Fix up our vruntime so that the current sleep doesn't |
| 7500 | * cause 'unlimited' sleep bonus. | 7614 | * cause 'unlimited' sleep bonus. |
| @@ -7521,15 +7635,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7521 | */ | 7635 | */ |
| 7522 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7636 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
| 7523 | { | 7637 | { |
| 7524 | struct sched_entity *se = &p->se; | ||
| 7525 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7638 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7639 | struct sched_entity *se = &p->se; | ||
| 7526 | /* | 7640 | /* |
| 7527 | * Since the real-depth could have been changed (only FAIR | 7641 | * Since the real-depth could have been changed (only FAIR |
| 7528 | * class maintain depth value), reset depth properly. | 7642 | * class maintain depth value), reset depth properly. |
| 7529 | */ | 7643 | */ |
| 7530 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7644 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7531 | #endif | 7645 | #endif |
| 7532 | if (!se->on_rq) | 7646 | if (!task_on_rq_queued(p)) |
| 7533 | return; | 7647 | return; |
| 7534 | 7648 | ||
| 7535 | /* | 7649 | /* |
| @@ -7575,7 +7689,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 7575 | } | 7689 | } |
| 7576 | 7690 | ||
| 7577 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7691 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7578 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7692 | static void task_move_group_fair(struct task_struct *p, int queued) |
| 7579 | { | 7693 | { |
| 7580 | struct sched_entity *se = &p->se; | 7694 | struct sched_entity *se = &p->se; |
| 7581 | struct cfs_rq *cfs_rq; | 7695 | struct cfs_rq *cfs_rq; |
| @@ -7594,7 +7708,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7594 | * fair sleeper stuff for the first placement, but who cares. | 7708 | * fair sleeper stuff for the first placement, but who cares. |
| 7595 | */ | 7709 | */ |
| 7596 | /* | 7710 | /* |
| 7597 | * When !on_rq, vruntime of the task has usually NOT been normalized. | 7711 | * When !queued, vruntime of the task has usually NOT been normalized. |
| 7598 | * But there are some cases where it has already been normalized: | 7712 | * But there are some cases where it has already been normalized: |
| 7599 | * | 7713 | * |
| 7600 | * - Moving a forked child which is waiting for being woken up by | 7714 | * - Moving a forked child which is waiting for being woken up by |
| @@ -7605,14 +7719,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7605 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7719 | * To prevent boost or penalty in the new cfs_rq caused by delta |
| 7606 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7720 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
| 7607 | */ | 7721 | */ |
| 7608 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | 7722 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
| 7609 | on_rq = 1; | 7723 | queued = 1; |
| 7610 | 7724 | ||
| 7611 | if (!on_rq) | 7725 | if (!queued) |
| 7612 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | 7726 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
| 7613 | set_task_rq(p, task_cpu(p)); | 7727 | set_task_rq(p, task_cpu(p)); |
| 7614 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7728 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7615 | if (!on_rq) { | 7729 | if (!queued) { |
| 7616 | cfs_rq = cfs_rq_of(se); | 7730 | cfs_rq = cfs_rq_of(se); |
| 7617 | se->vruntime += cfs_rq->min_vruntime; | 7731 | se->vruntime += cfs_rq->min_vruntime; |
| 7618 | #ifdef CONFIG_SMP | 7732 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -147,6 +147,9 @@ use_default: | |||
| 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
| 148 | goto use_default; | 148 | goto use_default; |
| 149 | 149 | ||
| 150 | /* Take note of the planned idle state. */ | ||
| 151 | idle_set_state(this_rq(), &drv->states[next_state]); | ||
| 152 | |||
| 150 | /* | 153 | /* |
| 151 | * Enter the idle state previously returned by the governor decision. | 154 | * Enter the idle state previously returned by the governor decision. |
| 152 | * This function will block until an interrupt occurs and will take | 155 | * This function will block until an interrupt occurs and will take |
| @@ -154,6 +157,9 @@ use_default: | |||
| 154 | */ | 157 | */ |
| 155 | entered_state = cpuidle_enter(drv, dev, next_state); | 158 | entered_state = cpuidle_enter(drv, dev, next_state); |
| 156 | 159 | ||
| 160 | /* The cpu is no longer idle or about to enter idle. */ | ||
| 161 | idle_set_state(this_rq(), NULL); | ||
| 162 | |||
| 157 | if (broadcast) | 163 | if (broadcast) |
| 158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 164 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
| 159 | 165 | ||
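These two hunks bracket the actual idle entry: the chosen cpuidle state is published on the runqueue before cpuidle_enter() blocks and cleared as soon as the CPU runs again, which is what allows the find_idlest_cpu() change earlier in this series to read an idle CPU's exit latency. A minimal sketch of that publish/clear bracket with illustrative types, not the kernel API:

```c
/* Publish/clear bracket around idle entry, modeled in plain C; the
 * types and the do_idle() callback are assumptions for illustration. */
#include <stddef.h>
#include <stdio.h>

struct idle_state { unsigned int exit_latency; };

struct cpu_runqueue {
	struct idle_state *idle_state;	/* NULL while the CPU is busy */
};

static int enter_idle(struct cpu_runqueue *rq, struct idle_state *state,
		      int (*do_idle)(struct idle_state *))
{
	int entered;

	rq->idle_state = state;		/* take note of the planned state */
	entered = do_idle(state);	/* blocks until a wakeup arrives */
	rq->idle_state = NULL;		/* no longer idle, or about to be */

	return entered;
}

/* stand-in for cpuidle_enter(): pretend we entered the state */
static int fake_idle(struct idle_state *state)
{
	printf("idling with exit latency %u\n", state->exit_latency);
	return 0;
}

int main(void)
{
	struct cpu_runqueue rq = { NULL };
	struct idle_state shallow = { .exit_latency = 5 };

	enter_idle(&rq, &shallow, fake_idle);
	return 0;
}
```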
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c | |||
| @@ -8,13 +8,6 @@ | |||
| 8 | 8 | ||
| 9 | #include "sched.h" | 9 | #include "sched.h" |
| 10 | 10 | ||
| 11 | unsigned long this_cpu_load(void) | ||
| 12 | { | ||
| 13 | struct rq *this = this_rq(); | ||
| 14 | return this->cpu_load[0]; | ||
| 15 | } | ||
| 16 | |||
| 17 | |||
| 18 | /* | 11 | /* |
| 19 | * Global load-average calculations | 12 | * Global load-average calculations |
| 20 | * | 13 | * |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..d024e6ce30ba 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1448 | * means a dl or stop task can slip in, in which case we need | 1448 | * means a dl or stop task can slip in, in which case we need |
| 1449 | * to re-start task selection. | 1449 | * to re-start task selection. |
| 1450 | */ | 1450 | */ |
| 1451 | if (unlikely((rq->stop && rq->stop->on_rq) || | 1451 | if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || |
| 1452 | rq->dl.dl_nr_running)) | 1452 | rq->dl.dl_nr_running)) |
| 1453 | return RETRY_TASK; | 1453 | return RETRY_TASK; |
| 1454 | } | 1454 | } |
| @@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1468 | p = _pick_next_task_rt(rq); | 1468 | p = _pick_next_task_rt(rq); |
| 1469 | 1469 | ||
| 1470 | /* The running task is never eligible for pushing */ | 1470 | /* The running task is never eligible for pushing */ |
| 1471 | if (p) | 1471 | dequeue_pushable_task(rq, p); |
| 1472 | dequeue_pushable_task(rq, p); | ||
| 1473 | 1472 | ||
| 1474 | set_post_schedule(rq); | 1473 | set_post_schedule(rq); |
| 1475 | 1474 | ||
| @@ -1526,7 +1525,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); | |||
| 1526 | static int find_lowest_rq(struct task_struct *task) | 1525 | static int find_lowest_rq(struct task_struct *task) |
| 1527 | { | 1526 | { |
| 1528 | struct sched_domain *sd; | 1527 | struct sched_domain *sd; |
| 1529 | struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); | 1528 | struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); |
| 1530 | int this_cpu = smp_processor_id(); | 1529 | int this_cpu = smp_processor_id(); |
| 1531 | int cpu = task_cpu(task); | 1530 | int cpu = task_cpu(task); |
| 1532 | 1531 | ||
| @@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1624 | !cpumask_test_cpu(lowest_rq->cpu, | 1623 | !cpumask_test_cpu(lowest_rq->cpu, |
| 1625 | tsk_cpus_allowed(task)) || | 1624 | tsk_cpus_allowed(task)) || |
| 1626 | task_running(rq, task) || | 1625 | task_running(rq, task) || |
| 1627 | !task->on_rq)) { | 1626 | !task_on_rq_queued(task))) { |
| 1628 | 1627 | ||
| 1629 | double_unlock_balance(rq, lowest_rq); | 1628 | double_unlock_balance(rq, lowest_rq); |
| 1630 | lowest_rq = NULL; | 1629 | lowest_rq = NULL; |
| @@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1658 | BUG_ON(task_current(rq, p)); | 1657 | BUG_ON(task_current(rq, p)); |
| 1659 | BUG_ON(p->nr_cpus_allowed <= 1); | 1658 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1660 | 1659 | ||
| 1661 | BUG_ON(!p->on_rq); | 1660 | BUG_ON(!task_on_rq_queued(p)); |
| 1662 | BUG_ON(!rt_task(p)); | 1661 | BUG_ON(!rt_task(p)); |
| 1663 | 1662 | ||
| 1664 | return p; | 1663 | return p; |
| @@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1809 | */ | 1808 | */ |
| 1810 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1809 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
| 1811 | WARN_ON(p == src_rq->curr); | 1810 | WARN_ON(p == src_rq->curr); |
| 1812 | WARN_ON(!p->on_rq); | 1811 | WARN_ON(!task_on_rq_queued(p)); |
| 1813 | 1812 | ||
| 1814 | /* | 1813 | /* |
| 1815 | * There's a chance that p is higher in priority | 1814 | * There's a chance that p is higher in priority |
| @@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1870 | 1869 | ||
| 1871 | BUG_ON(!rt_task(p)); | 1870 | BUG_ON(!rt_task(p)); |
| 1872 | 1871 | ||
| 1873 | if (!p->on_rq) | 1872 | if (!task_on_rq_queued(p)) |
| 1874 | return; | 1873 | return; |
| 1875 | 1874 | ||
| 1876 | weight = cpumask_weight(new_mask); | 1875 | weight = cpumask_weight(new_mask); |
| @@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1936 | * we may need to handle the pulling of RT tasks | 1935 | * we may need to handle the pulling of RT tasks |
| 1937 | * now. | 1936 | * now. |
| 1938 | */ | 1937 | */ |
| 1939 | if (!p->on_rq || rq->rt.rt_nr_running) | 1938 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
| 1940 | return; | 1939 | return; |
| 1941 | 1940 | ||
| 1942 | if (pull_rt_task(rq)) | 1941 | if (pull_rt_task(rq)) |
| @@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1970 | * If that current running task is also an RT task | 1969 | * If that current running task is also an RT task |
| 1971 | * then see if we can move to another run queue. | 1970 | * then see if we can move to another run queue. |
| 1972 | */ | 1971 | */ |
| 1973 | if (p->on_rq && rq->curr != p) { | 1972 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1974 | #ifdef CONFIG_SMP | 1973 | #ifdef CONFIG_SMP |
| 1975 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && | 1974 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
| 1976 | /* Don't resched if we changed runqueues */ | 1975 | /* Don't resched if we changed runqueues */ |
| @@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1989 | static void | 1988 | static void |
| 1990 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1989 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 1991 | { | 1990 | { |
| 1992 | if (!p->on_rq) | 1991 | if (!task_on_rq_queued(p)) |
| 1993 | return; | 1992 | return; |
| 1994 | 1993 | ||
| 1995 | if (rq->curr == p) { | 1994 | if (rq->curr == p) { |
| @@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 2073 | for_each_sched_rt_entity(rt_se) { | 2072 | for_each_sched_rt_entity(rt_se) { |
| 2074 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 2073 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2075 | requeue_task_rt(rq, p, 0); | 2074 | requeue_task_rt(rq, p, 0); |
| 2076 | set_tsk_need_resched(p); | 2075 | resched_curr(rq); |
| 2077 | return; | 2076 | return; |
| 2078 | } | 2077 | } |
| 2079 | } | 2078 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..24156c8434d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -14,6 +14,11 @@ | |||
| 14 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
| 15 | 15 | ||
| 16 | struct rq; | 16 | struct rq; |
| 17 | struct cpuidle_state; | ||
| 18 | |||
| 19 | /* task_struct::on_rq states: */ | ||
| 20 | #define TASK_ON_RQ_QUEUED 1 | ||
| 21 | #define TASK_ON_RQ_MIGRATING 2 | ||
| 17 | 22 | ||
| 18 | extern __read_mostly int scheduler_running; | 23 | extern __read_mostly int scheduler_running; |
| 19 | 24 | ||
| @@ -126,6 +131,9 @@ struct rt_bandwidth { | |||
| 126 | u64 rt_runtime; | 131 | u64 rt_runtime; |
| 127 | struct hrtimer rt_period_timer; | 132 | struct hrtimer rt_period_timer; |
| 128 | }; | 133 | }; |
| 134 | |||
| 135 | void __dl_clear_params(struct task_struct *p); | ||
| 136 | |||
| 129 | /* | 137 | /* |
| 130 | * To keep the bandwidth of -deadline tasks and groups under control | 138 | * To keep the bandwidth of -deadline tasks and groups under control |
| 131 | * we need some place where: | 139 | * we need some place where: |
| @@ -184,7 +192,7 @@ struct cfs_bandwidth { | |||
| 184 | raw_spinlock_t lock; | 192 | raw_spinlock_t lock; |
| 185 | ktime_t period; | 193 | ktime_t period; |
| 186 | u64 quota, runtime; | 194 | u64 quota, runtime; |
| 187 | s64 hierarchal_quota; | 195 | s64 hierarchical_quota; |
| 188 | u64 runtime_expires; | 196 | u64 runtime_expires; |
| 189 | 197 | ||
| 190 | int idle, timer_active; | 198 | int idle, timer_active; |
| @@ -636,6 +644,11 @@ struct rq { | |||
| 636 | #ifdef CONFIG_SMP | 644 | #ifdef CONFIG_SMP |
| 637 | struct llist_head wake_list; | 645 | struct llist_head wake_list; |
| 638 | #endif | 646 | #endif |
| 647 | |||
| 648 | #ifdef CONFIG_CPU_IDLE | ||
| 649 | /* Must be inspected within a rcu lock section */ | ||
| 650 | struct cpuidle_state *idle_state; | ||
| 651 | #endif | ||
| 639 | }; | 652 | }; |
| 640 | 653 | ||
| 641 | static inline int cpu_of(struct rq *rq) | 654 | static inline int cpu_of(struct rq *rq) |
| @@ -647,13 +660,13 @@ static inline int cpu_of(struct rq *rq) | |||
| 647 | #endif | 660 | #endif |
| 648 | } | 661 | } |
| 649 | 662 | ||
| 650 | DECLARE_PER_CPU(struct rq, runqueues); | 663 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 651 | 664 | ||
| 652 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 665 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 653 | #define this_rq() (&__get_cpu_var(runqueues)) | 666 | #define this_rq() this_cpu_ptr(&runqueues) |
| 654 | #define task_rq(p) cpu_rq(task_cpu(p)) | 667 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 655 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 668 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 656 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | 669 | #define raw_rq() raw_cpu_ptr(&runqueues) |
| 657 | 670 | ||
| 658 | static inline u64 rq_clock(struct rq *rq) | 671 | static inline u64 rq_clock(struct rq *rq) |
| 659 | { | 672 | { |
| @@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 942 | #endif | 955 | #endif |
| 943 | } | 956 | } |
| 944 | 957 | ||
| 958 | static inline int task_on_rq_queued(struct task_struct *p) | ||
| 959 | { | ||
| 960 | return p->on_rq == TASK_ON_RQ_QUEUED; | ||
| 961 | } | ||
| 962 | |||
| 963 | static inline int task_on_rq_migrating(struct task_struct *p) | ||
| 964 | { | ||
| 965 | return p->on_rq == TASK_ON_RQ_MIGRATING; | ||
| 966 | } | ||
| 945 | 967 | ||
| 946 | #ifndef prepare_arch_switch | 968 | #ifndef prepare_arch_switch |
| 947 | # define prepare_arch_switch(next) do { } while (0) | 969 | # define prepare_arch_switch(next) do { } while (0) |
| @@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 953 | # define finish_arch_post_lock_switch() do { } while (0) | 975 | # define finish_arch_post_lock_switch() do { } while (0) |
| 954 | #endif | 976 | #endif |
| 955 | 977 | ||
| 956 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 957 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 978 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 958 | { | 979 | { |
| 959 | #ifdef CONFIG_SMP | 980 | #ifdef CONFIG_SMP |
| @@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 991 | raw_spin_unlock_irq(&rq->lock); | 1012 | raw_spin_unlock_irq(&rq->lock); |
| 992 | } | 1013 | } |
| 993 | 1014 | ||
| 994 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 995 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
| 996 | { | ||
| 997 | #ifdef CONFIG_SMP | ||
| 998 | /* | ||
| 999 | * We can optimise this out completely for !SMP, because the | ||
| 1000 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 1001 | * here. | ||
| 1002 | */ | ||
| 1003 | next->on_cpu = 1; | ||
| 1004 | #endif | ||
| 1005 | raw_spin_unlock(&rq->lock); | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
| 1009 | { | ||
| 1010 | #ifdef CONFIG_SMP | ||
| 1011 | /* | ||
| 1012 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 1013 | * We must ensure this doesn't happen until the switch is completely | ||
| 1014 | * finished. | ||
| 1015 | */ | ||
| 1016 | smp_wmb(); | ||
| 1017 | prev->on_cpu = 0; | ||
| 1018 | #endif | ||
| 1019 | local_irq_enable(); | ||
| 1020 | } | ||
| 1021 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 1022 | |||
| 1023 | /* | 1015 | /* |
| 1024 | * wake flags | 1016 | * wake flags |
| 1025 | */ | 1017 | */ |
| @@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { } | |||
| 1180 | 1172 | ||
| 1181 | #endif | 1173 | #endif |
| 1182 | 1174 | ||
| 1175 | #ifdef CONFIG_CPU_IDLE | ||
| 1176 | static inline void idle_set_state(struct rq *rq, | ||
| 1177 | struct cpuidle_state *idle_state) | ||
| 1178 | { | ||
| 1179 | rq->idle_state = idle_state; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1183 | { | ||
| 1184 | WARN_ON(!rcu_read_lock_held()); | ||
| 1185 | return rq->idle_state; | ||
| 1186 | } | ||
| 1187 | #else | ||
| 1188 | static inline void idle_set_state(struct rq *rq, | ||
| 1189 | struct cpuidle_state *idle_state) | ||
| 1190 | { | ||
| 1191 | } | ||
| 1192 | |||
| 1193 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1194 | { | ||
| 1195 | return NULL; | ||
| 1196 | } | ||
| 1197 | #endif | ||
| 1198 | |||
| 1183 | extern void sysrq_sched_debug_show(void); | 1199 | extern void sysrq_sched_debug_show(void); |
| 1184 | extern void sched_init_granularity(void); | 1200 | extern void sched_init_granularity(void); |
| 1185 | extern void update_max_interval(void); | 1201 | extern void update_max_interval(void); |
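Two additions in the sched.h hunk above are worth calling out: `task_struct::on_rq` becomes a tri-state (`TASK_ON_RQ_QUEUED` / `TASK_ON_RQ_MIGRATING`), so callers test it through `task_on_rq_queued()` / `task_on_rq_migrating()` rather than truth-testing the field, and each runqueue now publishes the cpuidle state it is sitting in via `idle_set_state()` / `idle_get_state()`, with the getter insisting on an RCU read-side section. A hedged sketch of the intended producer/consumer pairing; the function names are illustrative, and the snippet assumes it is built inside kernel/sched/ so that `struct rq` and the helpers above are visible:

```c
/* Producer side (idle entry/exit), sketched: publish the cpuidle state the
 * CPU is about to enter and clear it again on wakeup. */
static void example_enter_idle(struct cpuidle_driver *drv, int index)
{
	struct rq *rq = this_rq();

	idle_set_state(rq, &drv->states[index]);
	/* ... actually enter the idle state here ... */
	idle_set_state(rq, NULL);
}

/* Consumer side, sketched: peek at a remote CPU's published idle state.
 * idle_get_state() WARNs unless called under rcu_read_lock(). */
static unsigned int example_idle_exit_latency(int cpu)
{
	struct cpuidle_state *state;
	unsigned int latency = 0;

	rcu_read_lock();
	state = idle_get_state(cpu_rq(cpu));
	if (state)
		latency = state->exit_latency;
	rcu_read_unlock();

	return latency;
}
```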
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..67426e529f59 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
| 28 | { | 28 | { |
| 29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
| 30 | 30 | ||
| 31 | if (!stop || !stop->on_rq) | 31 | if (!stop || !task_on_rq_queued(stop)) |
| 32 | return NULL; | 32 | return NULL; |
| 33 | 33 | ||
| 34 | put_prev_task(rq, prev); | 34 | put_prev_task(rq, prev); |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 15cab1a4f84e..5a62915f47a8 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, | |||
| 343 | } | 343 | } |
| 344 | EXPORT_SYMBOL(out_of_line_wait_on_bit); | 344 | EXPORT_SYMBOL(out_of_line_wait_on_bit); |
| 345 | 345 | ||
| 346 | int __sched out_of_line_wait_on_bit_timeout( | ||
| 347 | void *word, int bit, wait_bit_action_f *action, | ||
| 348 | unsigned mode, unsigned long timeout) | ||
| 349 | { | ||
| 350 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
| 351 | DEFINE_WAIT_BIT(wait, word, bit); | ||
| 352 | |||
| 353 | wait.key.timeout = jiffies + timeout; | ||
| 354 | return __wait_on_bit(wq, &wait, action, mode); | ||
| 355 | } | ||
| 356 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | ||
| 357 | |||
| 346 | int __sched | 358 | int __sched |
| 347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 359 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
| 348 | wait_bit_action_f *action, unsigned mode) | 360 | wait_bit_action_f *action, unsigned mode) |
| @@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word) | |||
| 520 | return 0; | 532 | return 0; |
| 521 | } | 533 | } |
| 522 | EXPORT_SYMBOL(bit_wait_io); | 534 | EXPORT_SYMBOL(bit_wait_io); |
| 535 | |||
| 536 | __sched int bit_wait_timeout(struct wait_bit_key *word) | ||
| 537 | { | ||
| 538 | unsigned long now = ACCESS_ONCE(jiffies); | ||
| 539 | if (signal_pending_state(current->state, current)) | ||
| 540 | return 1; | ||
| 541 | if (time_after_eq(now, word->timeout)) | ||
| 542 | return -EAGAIN; | ||
| 543 | schedule_timeout(word->timeout - now); | ||
| 544 | return 0; | ||
| 545 | } | ||
| 546 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | ||
| 547 | |||
| 548 | __sched int bit_wait_io_timeout(struct wait_bit_key *word) | ||
| 549 | { | ||
| 550 | unsigned long now = ACCESS_ONCE(jiffies); | ||
| 551 | if (signal_pending_state(current->state, current)) | ||
| 552 | return 1; | ||
| 553 | if (time_after_eq(now, word->timeout)) | ||
| 554 | return -EAGAIN; | ||
| 555 | io_schedule_timeout(word->timeout - now); | ||
| 556 | return 0; | ||
| 557 | } | ||
| 558 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | ||
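The wait.c hunk adds a timed flavour of the bit-wait machinery: the caller records a jiffies deadline in `wait_bit_key::timeout`, and the new `bit_wait_timeout()` / `bit_wait_io_timeout()` actions give up with -EAGAIN once that deadline passes. A minimal usage sketch built only from the symbols exported above; the flags word and bit number are invented for illustration:

```c
/* Sketch: sleep until bit 0 of @flags is cleared, but give up after 5s.
 * With TASK_UNINTERRUPTIBLE this returns 0 once the bit is clear, or
 * -EAGAIN if the deadline passes first (bit_wait_timeout's signal path
 * only triggers for interruptible/killable modes). */
static int example_wait_for_flag(unsigned long *flags)
{
	return out_of_line_wait_on_bit_timeout(flags, 0, bit_wait_timeout,
					       TASK_UNINTERRUPTIBLE, 5 * HZ);
}
```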
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 25b0043f4755..4ef9687ac115 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -21,10 +21,11 @@ | |||
| 21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
| 22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
| 23 | 23 | ||
| 24 | /* #define SECCOMP_DEBUG 1 */ | 24 | #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER |
| 25 | #include <asm/syscall.h> | ||
| 26 | #endif | ||
| 25 | 27 | ||
| 26 | #ifdef CONFIG_SECCOMP_FILTER | 28 | #ifdef CONFIG_SECCOMP_FILTER |
| 27 | #include <asm/syscall.h> | ||
| 28 | #include <linux/filter.h> | 29 | #include <linux/filter.h> |
| 29 | #include <linux/pid.h> | 30 | #include <linux/pid.h> |
| 30 | #include <linux/ptrace.h> | 31 | #include <linux/ptrace.h> |
| @@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
| 172 | * | 173 | * |
| 173 | * Returns valid seccomp BPF response codes. | 174 | * Returns valid seccomp BPF response codes. |
| 174 | */ | 175 | */ |
| 175 | static u32 seccomp_run_filters(int syscall) | 176 | static u32 seccomp_run_filters(struct seccomp_data *sd) |
| 176 | { | 177 | { |
| 177 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); | 178 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); |
| 178 | struct seccomp_data sd; | 179 | struct seccomp_data sd_local; |
| 179 | u32 ret = SECCOMP_RET_ALLOW; | 180 | u32 ret = SECCOMP_RET_ALLOW; |
| 180 | 181 | ||
| 181 | /* Ensure unexpected behavior doesn't result in failing open. */ | 182 | /* Ensure unexpected behavior doesn't result in failing open. */ |
| @@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall) | |||
| 185 | /* Make sure cross-thread synced filter points somewhere sane. */ | 186 | /* Make sure cross-thread synced filter points somewhere sane. */ |
| 186 | smp_read_barrier_depends(); | 187 | smp_read_barrier_depends(); |
| 187 | 188 | ||
| 188 | populate_seccomp_data(&sd); | 189 | if (!sd) { |
| 190 | populate_seccomp_data(&sd_local); | ||
| 191 | sd = &sd_local; | ||
| 192 | } | ||
| 189 | 193 | ||
| 190 | /* | 194 | /* |
| 191 | * All filters in the list are evaluated and the lowest BPF return | 195 | * All filters in the list are evaluated and the lowest BPF return |
| 192 | * value always takes priority (ignoring the DATA). | 196 | * value always takes priority (ignoring the DATA). |
| 193 | */ | 197 | */ |
| 194 | for (; f; f = f->prev) { | 198 | for (; f; f = f->prev) { |
| 195 | u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); | 199 | u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); |
| 196 | 200 | ||
| 197 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | 201 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) |
| 198 | ret = cur_ret; | 202 | ret = cur_ret; |
| @@ -203,7 +207,7 @@ static u32 seccomp_run_filters(int syscall) | |||
| 203 | 207 | ||
| 204 | static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) | 208 | static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) |
| 205 | { | 209 | { |
| 206 | BUG_ON(!spin_is_locked(¤t->sighand->siglock)); | 210 | assert_spin_locked(¤t->sighand->siglock); |
| 207 | 211 | ||
| 208 | if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) | 212 | if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) |
| 209 | return false; | 213 | return false; |
| @@ -214,7 +218,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) | |||
| 214 | static inline void seccomp_assign_mode(struct task_struct *task, | 218 | static inline void seccomp_assign_mode(struct task_struct *task, |
| 215 | unsigned long seccomp_mode) | 219 | unsigned long seccomp_mode) |
| 216 | { | 220 | { |
| 217 | BUG_ON(!spin_is_locked(&task->sighand->siglock)); | 221 | assert_spin_locked(&task->sighand->siglock); |
| 218 | 222 | ||
| 219 | task->seccomp.mode = seccomp_mode; | 223 | task->seccomp.mode = seccomp_mode; |
| 220 | /* | 224 | /* |
| @@ -253,7 +257,7 @@ static inline pid_t seccomp_can_sync_threads(void) | |||
| 253 | struct task_struct *thread, *caller; | 257 | struct task_struct *thread, *caller; |
| 254 | 258 | ||
| 255 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); | 259 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); |
| 256 | BUG_ON(!spin_is_locked(¤t->sighand->siglock)); | 260 | assert_spin_locked(¤t->sighand->siglock); |
| 257 | 261 | ||
| 258 | /* Validate all threads being eligible for synchronization. */ | 262 | /* Validate all threads being eligible for synchronization. */ |
| 259 | caller = current; | 263 | caller = current; |
| @@ -294,7 +298,7 @@ static inline void seccomp_sync_threads(void) | |||
| 294 | struct task_struct *thread, *caller; | 298 | struct task_struct *thread, *caller; |
| 295 | 299 | ||
| 296 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); | 300 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); |
| 297 | BUG_ON(!spin_is_locked(¤t->sighand->siglock)); | 301 | assert_spin_locked(¤t->sighand->siglock); |
| 298 | 302 | ||
| 299 | /* Synchronize all threads. */ | 303 | /* Synchronize all threads. */ |
| 300 | caller = current; | 304 | caller = current; |
| @@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
| 395 | if (!filter) | 399 | if (!filter) |
| 396 | goto free_prog; | 400 | goto free_prog; |
| 397 | 401 | ||
| 398 | filter->prog = kzalloc(bpf_prog_size(new_len), | 402 | filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); |
| 399 | GFP_KERNEL|__GFP_NOWARN); | ||
| 400 | if (!filter->prog) | 403 | if (!filter->prog) |
| 401 | goto free_filter; | 404 | goto free_filter; |
| 402 | 405 | ||
| 403 | ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); | 406 | ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); |
| 404 | if (ret) | 407 | if (ret) |
| 405 | goto free_filter_prog; | 408 | goto free_filter_prog; |
| 406 | kfree(fp); | ||
| 407 | 409 | ||
| 410 | kfree(fp); | ||
| 408 | atomic_set(&filter->usage, 1); | 411 | atomic_set(&filter->usage, 1); |
| 409 | filter->prog->len = new_len; | 412 | filter->prog->len = new_len; |
| 410 | 413 | ||
| @@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
| 413 | return filter; | 416 | return filter; |
| 414 | 417 | ||
| 415 | free_filter_prog: | 418 | free_filter_prog: |
| 416 | kfree(filter->prog); | 419 | __bpf_prog_free(filter->prog); |
| 417 | free_filter: | 420 | free_filter: |
| 418 | kfree(filter); | 421 | kfree(filter); |
| 419 | free_prog: | 422 | free_prog: |
| @@ -464,7 +467,7 @@ static long seccomp_attach_filter(unsigned int flags, | |||
| 464 | unsigned long total_insns; | 467 | unsigned long total_insns; |
| 465 | struct seccomp_filter *walker; | 468 | struct seccomp_filter *walker; |
| 466 | 469 | ||
| 467 | BUG_ON(!spin_is_locked(¤t->sighand->siglock)); | 470 | assert_spin_locked(¤t->sighand->siglock); |
| 468 | 471 | ||
| 469 | /* Validate resulting filter length. */ | 472 | /* Validate resulting filter length. */ |
| 470 | total_insns = filter->prog->len; | 473 | total_insns = filter->prog->len; |
| @@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = { | |||
| 564 | }; | 567 | }; |
| 565 | #endif | 568 | #endif |
| 566 | 569 | ||
| 567 | int __secure_computing(int this_syscall) | 570 | static void __secure_computing_strict(int this_syscall) |
| 571 | { | ||
| 572 | int *syscall_whitelist = mode1_syscalls; | ||
| 573 | #ifdef CONFIG_COMPAT | ||
| 574 | if (is_compat_task()) | ||
| 575 | syscall_whitelist = mode1_syscalls_32; | ||
| 576 | #endif | ||
| 577 | do { | ||
| 578 | if (*syscall_whitelist == this_syscall) | ||
| 579 | return; | ||
| 580 | } while (*++syscall_whitelist); | ||
| 581 | |||
| 582 | #ifdef SECCOMP_DEBUG | ||
| 583 | dump_stack(); | ||
| 584 | #endif | ||
| 585 | audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); | ||
| 586 | do_exit(SIGKILL); | ||
| 587 | } | ||
| 588 | |||
| 589 | #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER | ||
| 590 | void secure_computing_strict(int this_syscall) | ||
| 591 | { | ||
| 592 | int mode = current->seccomp.mode; | ||
| 593 | |||
| 594 | if (mode == 0) | ||
| 595 | return; | ||
| 596 | else if (mode == SECCOMP_MODE_STRICT) | ||
| 597 | __secure_computing_strict(this_syscall); | ||
| 598 | else | ||
| 599 | BUG(); | ||
| 600 | } | ||
| 601 | #else | ||
| 602 | int __secure_computing(void) | ||
| 603 | { | ||
| 604 | u32 phase1_result = seccomp_phase1(NULL); | ||
| 605 | |||
| 606 | if (likely(phase1_result == SECCOMP_PHASE1_OK)) | ||
| 607 | return 0; | ||
| 608 | else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) | ||
| 609 | return -1; | ||
| 610 | else | ||
| 611 | return seccomp_phase2(phase1_result); | ||
| 612 | } | ||
| 613 | |||
| 614 | #ifdef CONFIG_SECCOMP_FILTER | ||
| 615 | static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | ||
| 568 | { | 616 | { |
| 569 | int exit_sig = 0; | 617 | u32 filter_ret, action; |
| 570 | int *syscall; | 618 | int data; |
| 571 | u32 ret; | ||
| 572 | 619 | ||
| 573 | /* | 620 | /* |
| 574 | * Make sure that any changes to mode from another thread have | 621 | * Make sure that any changes to mode from another thread have |
| @@ -576,85 +623,127 @@ int __secure_computing(int this_syscall) | |||
| 576 | */ | 623 | */ |
| 577 | rmb(); | 624 | rmb(); |
| 578 | 625 | ||
| 579 | switch (current->seccomp.mode) { | 626 | filter_ret = seccomp_run_filters(sd); |
| 580 | case SECCOMP_MODE_STRICT: | 627 | data = filter_ret & SECCOMP_RET_DATA; |
| 581 | syscall = mode1_syscalls; | 628 | action = filter_ret & SECCOMP_RET_ACTION; |
| 582 | #ifdef CONFIG_COMPAT | 629 | |
| 583 | if (is_compat_task()) | 630 | switch (action) { |
| 584 | syscall = mode1_syscalls_32; | 631 | case SECCOMP_RET_ERRNO: |
| 632 | /* Set the low-order 16-bits as a errno. */ | ||
| 633 | syscall_set_return_value(current, task_pt_regs(current), | ||
| 634 | -data, 0); | ||
| 635 | goto skip; | ||
| 636 | |||
| 637 | case SECCOMP_RET_TRAP: | ||
| 638 | /* Show the handler the original registers. */ | ||
| 639 | syscall_rollback(current, task_pt_regs(current)); | ||
| 640 | /* Let the filter pass back 16 bits of data. */ | ||
| 641 | seccomp_send_sigsys(this_syscall, data); | ||
| 642 | goto skip; | ||
| 643 | |||
| 644 | case SECCOMP_RET_TRACE: | ||
| 645 | return filter_ret; /* Save the rest for phase 2. */ | ||
| 646 | |||
| 647 | case SECCOMP_RET_ALLOW: | ||
| 648 | return SECCOMP_PHASE1_OK; | ||
| 649 | |||
| 650 | case SECCOMP_RET_KILL: | ||
| 651 | default: | ||
| 652 | audit_seccomp(this_syscall, SIGSYS, action); | ||
| 653 | do_exit(SIGSYS); | ||
| 654 | } | ||
| 655 | |||
| 656 | unreachable(); | ||
| 657 | |||
| 658 | skip: | ||
| 659 | audit_seccomp(this_syscall, 0, action); | ||
| 660 | return SECCOMP_PHASE1_SKIP; | ||
| 661 | } | ||
| 585 | #endif | 662 | #endif |
| 586 | do { | 663 | |
| 587 | if (*syscall == this_syscall) | 664 | /** |
| 588 | return 0; | 665 | * seccomp_phase1() - run fast path seccomp checks on the current syscall |
| 589 | } while (*++syscall); | 666 | * @arg sd: The seccomp_data or NULL |
| 590 | exit_sig = SIGKILL; | 667 | * |
| 591 | ret = SECCOMP_RET_KILL; | 668 | * This only reads pt_regs via the syscall_xyz helpers. The only change |
| 592 | break; | 669 | * it will make to pt_regs is via syscall_set_return_value, and it will |
| 670 | * only do that if it returns SECCOMP_PHASE1_SKIP. | ||
| 671 | * | ||
| 672 | * If sd is provided, it will not read pt_regs at all. | ||
| 673 | * | ||
| 674 | * It may also call do_exit or force a signal; these actions must be | ||
| 675 | * safe. | ||
| 676 | * | ||
| 677 | * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should | ||
| 678 | * be processed normally. | ||
| 679 | * | ||
| 680 | * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be | ||
| 681 | * invoked. In this case, seccomp_phase1 will have set the return value | ||
| 682 | * using syscall_set_return_value. | ||
| 683 | * | ||
| 684 | * If it returns anything else, then the return value should be passed | ||
| 685 | * to seccomp_phase2 from a context in which ptrace hooks are safe. | ||
| 686 | */ | ||
| 687 | u32 seccomp_phase1(struct seccomp_data *sd) | ||
| 688 | { | ||
| 689 | int mode = current->seccomp.mode; | ||
| 690 | int this_syscall = sd ? sd->nr : | ||
| 691 | syscall_get_nr(current, task_pt_regs(current)); | ||
| 692 | |||
| 693 | switch (mode) { | ||
| 694 | case SECCOMP_MODE_STRICT: | ||
| 695 | __secure_computing_strict(this_syscall); /* may call do_exit */ | ||
| 696 | return SECCOMP_PHASE1_OK; | ||
| 593 | #ifdef CONFIG_SECCOMP_FILTER | 697 | #ifdef CONFIG_SECCOMP_FILTER |
| 594 | case SECCOMP_MODE_FILTER: { | 698 | case SECCOMP_MODE_FILTER: |
| 595 | int data; | 699 | return __seccomp_phase1_filter(this_syscall, sd); |
| 596 | struct pt_regs *regs = task_pt_regs(current); | ||
| 597 | ret = seccomp_run_filters(this_syscall); | ||
| 598 | data = ret & SECCOMP_RET_DATA; | ||
| 599 | ret &= SECCOMP_RET_ACTION; | ||
| 600 | switch (ret) { | ||
| 601 | case SECCOMP_RET_ERRNO: | ||
| 602 | /* Set the low-order 16-bits as a errno. */ | ||
| 603 | syscall_set_return_value(current, regs, | ||
| 604 | -data, 0); | ||
| 605 | goto skip; | ||
| 606 | case SECCOMP_RET_TRAP: | ||
| 607 | /* Show the handler the original registers. */ | ||
| 608 | syscall_rollback(current, regs); | ||
| 609 | /* Let the filter pass back 16 bits of data. */ | ||
| 610 | seccomp_send_sigsys(this_syscall, data); | ||
| 611 | goto skip; | ||
| 612 | case SECCOMP_RET_TRACE: | ||
| 613 | /* Skip these calls if there is no tracer. */ | ||
| 614 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 615 | syscall_set_return_value(current, regs, | ||
| 616 | -ENOSYS, 0); | ||
| 617 | goto skip; | ||
| 618 | } | ||
| 619 | /* Allow the BPF to provide the event message */ | ||
| 620 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 621 | /* | ||
| 622 | * The delivery of a fatal signal during event | ||
| 623 | * notification may silently skip tracer notification. | ||
| 624 | * Terminating the task now avoids executing a system | ||
| 625 | * call that may not be intended. | ||
| 626 | */ | ||
| 627 | if (fatal_signal_pending(current)) | ||
| 628 | break; | ||
| 629 | if (syscall_get_nr(current, regs) < 0) | ||
| 630 | goto skip; /* Explicit request to skip. */ | ||
| 631 | |||
| 632 | return 0; | ||
| 633 | case SECCOMP_RET_ALLOW: | ||
| 634 | return 0; | ||
| 635 | case SECCOMP_RET_KILL: | ||
| 636 | default: | ||
| 637 | break; | ||
| 638 | } | ||
| 639 | exit_sig = SIGSYS; | ||
| 640 | break; | ||
| 641 | } | ||
| 642 | #endif | 700 | #endif |
| 643 | default: | 701 | default: |
| 644 | BUG(); | 702 | BUG(); |
| 645 | } | 703 | } |
| 704 | } | ||
| 646 | 705 | ||
| 647 | #ifdef SECCOMP_DEBUG | 706 | /** |
| 648 | dump_stack(); | 707 | * seccomp_phase2() - finish slow path seccomp work for the current syscall |
| 649 | #endif | 708 | * @phase1_result: The return value from seccomp_phase1() |
| 650 | audit_seccomp(this_syscall, exit_sig, ret); | 709 | * |
| 651 | do_exit(exit_sig); | 710 | * This must be called from a context in which ptrace hooks can be used. |
| 652 | #ifdef CONFIG_SECCOMP_FILTER | 711 | * |
| 653 | skip: | 712 | * Returns 0 if the syscall should be processed or -1 to skip the syscall. |
| 654 | audit_seccomp(this_syscall, exit_sig, ret); | 713 | */ |
| 655 | #endif | 714 | int seccomp_phase2(u32 phase1_result) |
| 656 | return -1; | 715 | { |
| 716 | struct pt_regs *regs = task_pt_regs(current); | ||
| 717 | u32 action = phase1_result & SECCOMP_RET_ACTION; | ||
| 718 | int data = phase1_result & SECCOMP_RET_DATA; | ||
| 719 | |||
| 720 | BUG_ON(action != SECCOMP_RET_TRACE); | ||
| 721 | |||
| 722 | audit_seccomp(syscall_get_nr(current, regs), 0, action); | ||
| 723 | |||
| 724 | /* Skip these calls if there is no tracer. */ | ||
| 725 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 726 | syscall_set_return_value(current, regs, | ||
| 727 | -ENOSYS, 0); | ||
| 728 | return -1; | ||
| 729 | } | ||
| 730 | |||
| 731 | /* Allow the BPF to provide the event message */ | ||
| 732 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 733 | /* | ||
| 734 | * The delivery of a fatal signal during event | ||
| 735 | * notification may silently skip tracer notification. | ||
| 736 | * Terminating the task now avoids executing a system | ||
| 737 | * call that may not be intended. | ||
| 738 | */ | ||
| 739 | if (fatal_signal_pending(current)) | ||
| 740 | do_exit(SIGSYS); | ||
| 741 | if (syscall_get_nr(current, regs) < 0) | ||
| 742 | return -1; /* Explicit request to skip. */ | ||
| 743 | |||
| 744 | return 0; | ||
| 657 | } | 745 | } |
| 746 | #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ | ||
| 658 | 747 | ||
| 659 | long prctl_get_seccomp(void) | 748 | long prctl_get_seccomp(void) |
| 660 | { | 749 | { |
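The seccomp rework splits `__secure_computing()` into `seccomp_phase1()`, a fast path that handles every action except the ptrace `SECCOMP_RET_TRACE` case and that can be fed a pre-populated `struct seccomp_data` so it never has to touch `pt_regs`, and `seccomp_phase2()`, which performs the tracer notification and must run where ptrace hooks are safe. The "lowest action value wins" rule in `seccomp_run_filters()` is unchanged. The self-contained userspace snippet below, using only the UAPI `SECCOMP_RET_*` constants, shows how a filter return value splits into action and data and how two stacked filter results combine:

```c
/* Standalone illustration of the SECCOMP_RET_ACTION/SECCOMP_RET_DATA split
 * and the "lowest action value takes priority" rule used by
 * seccomp_run_filters() above.  Compile with: cc -o demo demo.c */
#include <stdio.h>
#include <linux/seccomp.h>

static unsigned int combine(unsigned int a, unsigned int b)
{
	/* Smaller action value = more restrictive; keep the stricter one. */
	return ((a & SECCOMP_RET_ACTION) < (b & SECCOMP_RET_ACTION)) ? a : b;
}

int main(void)
{
	unsigned int f1 = SECCOMP_RET_ERRNO | 13;	/* errno 13 = EACCES */
	unsigned int f2 = SECCOMP_RET_ALLOW;
	unsigned int ret = combine(f1, f2);

	printf("action=%#x data=%u\n",
	       ret & SECCOMP_RET_ACTION, ret & SECCOMP_RET_DATA);
	/* prints action=0x50000 data=13: the ERRNO filter wins over ALLOW */
	return 0;
}
```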
diff --git a/kernel/signal.c b/kernel/signal.c index 40b76e351e64..8f0876f9f6dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info) | |||
| 2170 | return signr; | 2170 | return signr; |
| 2171 | } | 2171 | } |
| 2172 | 2172 | ||
| 2173 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | 2173 | int get_signal(struct ksignal *ksig) |
| 2174 | struct pt_regs *regs, void *cookie) | ||
| 2175 | { | 2174 | { |
| 2176 | struct sighand_struct *sighand = current->sighand; | 2175 | struct sighand_struct *sighand = current->sighand; |
| 2177 | struct signal_struct *signal = current->signal; | 2176 | struct signal_struct *signal = current->signal; |
| @@ -2241,13 +2240,13 @@ relock: | |||
| 2241 | goto relock; | 2240 | goto relock; |
| 2242 | } | 2241 | } |
| 2243 | 2242 | ||
| 2244 | signr = dequeue_signal(current, ¤t->blocked, info); | 2243 | signr = dequeue_signal(current, ¤t->blocked, &ksig->info); |
| 2245 | 2244 | ||
| 2246 | if (!signr) | 2245 | if (!signr) |
| 2247 | break; /* will return 0 */ | 2246 | break; /* will return 0 */ |
| 2248 | 2247 | ||
| 2249 | if (unlikely(current->ptrace) && signr != SIGKILL) { | 2248 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
| 2250 | signr = ptrace_signal(signr, info); | 2249 | signr = ptrace_signal(signr, &ksig->info); |
| 2251 | if (!signr) | 2250 | if (!signr) |
| 2252 | continue; | 2251 | continue; |
| 2253 | } | 2252 | } |
| @@ -2255,13 +2254,13 @@ relock: | |||
| 2255 | ka = &sighand->action[signr-1]; | 2254 | ka = &sighand->action[signr-1]; |
| 2256 | 2255 | ||
| 2257 | /* Trace actually delivered signals. */ | 2256 | /* Trace actually delivered signals. */ |
| 2258 | trace_signal_deliver(signr, info, ka); | 2257 | trace_signal_deliver(signr, &ksig->info, ka); |
| 2259 | 2258 | ||
| 2260 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ | 2259 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ |
| 2261 | continue; | 2260 | continue; |
| 2262 | if (ka->sa.sa_handler != SIG_DFL) { | 2261 | if (ka->sa.sa_handler != SIG_DFL) { |
| 2263 | /* Run the handler. */ | 2262 | /* Run the handler. */ |
| 2264 | *return_ka = *ka; | 2263 | ksig->ka = *ka; |
| 2265 | 2264 | ||
| 2266 | if (ka->sa.sa_flags & SA_ONESHOT) | 2265 | if (ka->sa.sa_flags & SA_ONESHOT) |
| 2267 | ka->sa.sa_handler = SIG_DFL; | 2266 | ka->sa.sa_handler = SIG_DFL; |
| @@ -2311,7 +2310,7 @@ relock: | |||
| 2311 | spin_lock_irq(&sighand->siglock); | 2310 | spin_lock_irq(&sighand->siglock); |
| 2312 | } | 2311 | } |
| 2313 | 2312 | ||
| 2314 | if (likely(do_signal_stop(info->si_signo))) { | 2313 | if (likely(do_signal_stop(ksig->info.si_signo))) { |
| 2315 | /* It released the siglock. */ | 2314 | /* It released the siglock. */ |
| 2316 | goto relock; | 2315 | goto relock; |
| 2317 | } | 2316 | } |
| @@ -2332,7 +2331,7 @@ relock: | |||
| 2332 | 2331 | ||
| 2333 | if (sig_kernel_coredump(signr)) { | 2332 | if (sig_kernel_coredump(signr)) { |
| 2334 | if (print_fatal_signals) | 2333 | if (print_fatal_signals) |
| 2335 | print_fatal_signal(info->si_signo); | 2334 | print_fatal_signal(ksig->info.si_signo); |
| 2336 | proc_coredump_connector(current); | 2335 | proc_coredump_connector(current); |
| 2337 | /* | 2336 | /* |
| 2338 | * If it was able to dump core, this kills all | 2337 | * If it was able to dump core, this kills all |
| @@ -2342,34 +2341,32 @@ relock: | |||
| 2342 | * first and our do_group_exit call below will use | 2341 | * first and our do_group_exit call below will use |
| 2343 | * that value and ignore the one we pass it. | 2342 | * that value and ignore the one we pass it. |
| 2344 | */ | 2343 | */ |
| 2345 | do_coredump(info); | 2344 | do_coredump(&ksig->info); |
| 2346 | } | 2345 | } |
| 2347 | 2346 | ||
| 2348 | /* | 2347 | /* |
| 2349 | * Death signals, no core dump. | 2348 | * Death signals, no core dump. |
| 2350 | */ | 2349 | */ |
| 2351 | do_group_exit(info->si_signo); | 2350 | do_group_exit(ksig->info.si_signo); |
| 2352 | /* NOTREACHED */ | 2351 | /* NOTREACHED */ |
| 2353 | } | 2352 | } |
| 2354 | spin_unlock_irq(&sighand->siglock); | 2353 | spin_unlock_irq(&sighand->siglock); |
| 2355 | return signr; | 2354 | |
| 2355 | ksig->sig = signr; | ||
| 2356 | return ksig->sig > 0; | ||
| 2356 | } | 2357 | } |
| 2357 | 2358 | ||
| 2358 | /** | 2359 | /** |
| 2359 | * signal_delivered - | 2360 | * signal_delivered - |
| 2360 | * @sig: number of signal being delivered | 2361 | * @ksig: kernel signal struct |
| 2361 | * @info: siginfo_t of signal being delivered | ||
| 2362 | * @ka: sigaction setting that chose the handler | ||
| 2363 | * @regs: user register state | ||
| 2364 | * @stepping: nonzero if debugger single-step or block-step in use | 2362 | * @stepping: nonzero if debugger single-step or block-step in use |
| 2365 | * | 2363 | * |
| 2366 | * This function should be called when a signal has successfully been | 2364 | * This function should be called when a signal has successfully been |
| 2367 | * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask | 2365 | * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask |
| 2368 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER | 2366 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER |
| 2369 | * is set in @ka->sa.sa_flags. Tracing is notified. | 2367 | * is set in @ksig->ka.sa.sa_flags. Tracing is notified. |
| 2370 | */ | 2368 | */ |
| 2371 | void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | 2369 | static void signal_delivered(struct ksignal *ksig, int stepping) |
| 2372 | struct pt_regs *regs, int stepping) | ||
| 2373 | { | 2370 | { |
| 2374 | sigset_t blocked; | 2371 | sigset_t blocked; |
| 2375 | 2372 | ||
| @@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | |||
| 2379 | simply clear the restore sigmask flag. */ | 2376 | simply clear the restore sigmask flag. */ |
| 2380 | clear_restore_sigmask(); | 2377 | clear_restore_sigmask(); |
| 2381 | 2378 | ||
| 2382 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | 2379 | sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); |
| 2383 | if (!(ka->sa.sa_flags & SA_NODEFER)) | 2380 | if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) |
| 2384 | sigaddset(&blocked, sig); | 2381 | sigaddset(&blocked, ksig->sig); |
| 2385 | set_current_blocked(&blocked); | 2382 | set_current_blocked(&blocked); |
| 2386 | tracehook_signal_handler(sig, info, ka, regs, stepping); | 2383 | tracehook_signal_handler(stepping); |
| 2387 | } | 2384 | } |
| 2388 | 2385 | ||
| 2389 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | 2386 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) |
| @@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | |||
| 2391 | if (failed) | 2388 | if (failed) |
| 2392 | force_sigsegv(ksig->sig, current); | 2389 | force_sigsegv(ksig->sig, current); |
| 2393 | else | 2390 | else |
| 2394 | signal_delivered(ksig->sig, &ksig->info, &ksig->ka, | 2391 | signal_delivered(ksig, stepping); |
| 2395 | signal_pt_regs(), stepping); | ||
| 2396 | } | 2392 | } |
| 2397 | 2393 | ||
| 2398 | /* | 2394 | /* |
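In signal.c, `get_signal_to_deliver()` becomes `get_signal(struct ksignal *)`: the selected signal number, its siginfo and the chosen k_sigaction now travel together in one `struct ksignal`, and the post-delivery bookkeeping is reached only through `signal_setup_done()`, since `signal_delivered()` is now static. A hedged sketch of the arch-side consumer; `setup_rt_frame()` stands in for whatever frame-setup helper the architecture provides and is not defined by this patch:

```c
/* Sketch of an architecture's signal-delivery loop after this change.
 * setup_rt_frame() is a placeholder for the arch-specific helper that
 * builds the user-space signal frame. */
static void example_do_signal(struct pt_regs *regs)
{
	struct ksignal ksig;

	if (get_signal(&ksig)) {
		/* A handler was selected: ksig.sig, ksig.info and ksig.ka
		 * have all been filled in by get_signal(). */
		int failed = setup_rt_frame(&ksig, regs);

		signal_setup_done(failed, &ksig, 0 /* not single-stepping */);
		return;
	}

	/* No handler to run: put the saved sigmask back, if any. */
	restore_saved_sigmask();
}
```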
diff --git a/kernel/smp.c b/kernel/smp.c index 487653b5844f..f38a1e692259 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
| 14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/sched.h> | ||
| 16 | 17 | ||
| 17 | #include "smpboot.h" | 18 | #include "smpboot.h" |
| 18 | 19 | ||
| @@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, | |||
| 164 | if (!csd) { | 165 | if (!csd) { |
| 165 | csd = &csd_stack; | 166 | csd = &csd_stack; |
| 166 | if (!wait) | 167 | if (!wait) |
| 167 | csd = &__get_cpu_var(csd_data); | 168 | csd = this_cpu_ptr(&csd_data); |
| 168 | } | 169 | } |
| 169 | 170 | ||
| 170 | csd_lock(csd); | 171 | csd_lock(csd); |
| @@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
| 229 | 230 | ||
| 230 | WARN_ON(!irqs_disabled()); | 231 | WARN_ON(!irqs_disabled()); |
| 231 | 232 | ||
| 232 | head = &__get_cpu_var(call_single_queue); | 233 | head = this_cpu_ptr(&call_single_queue); |
| 233 | entry = llist_del_all(head); | 234 | entry = llist_del_all(head); |
| 234 | entry = llist_reverse_order(entry); | 235 | entry = llist_reverse_order(entry); |
| 235 | 236 | ||
| @@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 419 | return; | 420 | return; |
| 420 | } | 421 | } |
| 421 | 422 | ||
| 422 | cfd = &__get_cpu_var(cfd_data); | 423 | cfd = this_cpu_ptr(&cfd_data); |
| 423 | 424 | ||
| 424 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); | 425 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); |
| 425 | cpumask_clear_cpu(this_cpu, cfd->cpumask); | 426 | cpumask_clear_cpu(this_cpu, cfd->cpumask); |
| @@ -670,7 +671,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
| 670 | if (cond_func(cpu, info)) { | 671 | if (cond_func(cpu, info)) { |
| 671 | ret = smp_call_function_single(cpu, func, | 672 | ret = smp_call_function_single(cpu, func, |
| 672 | info, wait); | 673 | info, wait); |
| 673 | WARN_ON_ONCE(!ret); | 674 | WARN_ON_ONCE(ret); |
| 674 | } | 675 | } |
| 675 | preempt_enable(); | 676 | preempt_enable(); |
| 676 | } | 677 | } |
| @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) | |||
| 699 | smp_call_function(do_nothing, NULL, 1); | 700 | smp_call_function(do_nothing, NULL, 1); |
| 700 | } | 701 | } |
| 701 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | 702 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); |
| 703 | |||
| 704 | /** | ||
| 705 | * wake_up_all_idle_cpus - break all cpus out of idle | ||
| 706 | * wake_up_all_idle_cpus tries to break all CPUs out of idle state, | ||
| 707 | * including CPUs that are polling while idle; CPUs that are not idle | ||
| 708 | * are left alone. | ||
| 709 | */ | ||
| 710 | void wake_up_all_idle_cpus(void) | ||
| 711 | { | ||
| 712 | int cpu; | ||
| 713 | |||
| 714 | preempt_disable(); | ||
| 715 | for_each_online_cpu(cpu) { | ||
| 716 | if (cpu == smp_processor_id()) | ||
| 717 | continue; | ||
| 718 | |||
| 719 | wake_up_if_idle(cpu); | ||
| 720 | } | ||
| 721 | preempt_enable(); | ||
| 722 | } | ||
| 723 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | ||
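Besides the `__get_cpu_var()` to `this_cpu_ptr()` conversions, smp.c gains `wake_up_all_idle_cpus()`, which uses `wake_up_if_idle()` to nudge every other online CPU out of idle (including polling idle) so it re-evaluates its idle state; busy CPUs are left alone. The `on_each_cpu_cond()` hunk also fixes an inverted warning: `smp_call_function_single()` returns 0 on success, so the WARN should fire on a non-zero return, not on zero. A hedged sketch of the kind of caller the new helper is meant for, for example when an idle-latency constraint that cpuidle consults has just been tightened:

```c
/* Sketch: after tightening a wakeup-latency constraint that cpuidle
 * consults, kick every idle CPU so it re-selects an idle state that
 * honours the new limit.  How the constraint is stored is outside the
 * scope of this patch and left as a comment. */
static void example_latency_constraint_changed(void)
{
	/* ... publish the new, tighter latency limit here ... */

	wake_up_all_idle_cpus();	/* idle CPUs wake up and re-enter cpuidle */
}
```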
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..0699add19164 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -278,7 +278,7 @@ restart: | |||
| 278 | pending >>= softirq_bit; | 278 | pending >>= softirq_bit; |
| 279 | } | 279 | } |
| 280 | 280 | ||
| 281 | rcu_bh_qs(smp_processor_id()); | 281 | rcu_bh_qs(); |
| 282 | local_irq_disable(); | 282 | local_irq_disable(); |
| 283 | 283 | ||
| 284 | pending = local_softirq_pending(); | 284 | pending = local_softirq_pending(); |
| @@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a) | |||
| 485 | local_irq_disable(); | 485 | local_irq_disable(); |
| 486 | list = __this_cpu_read(tasklet_vec.head); | 486 | list = __this_cpu_read(tasklet_vec.head); |
| 487 | __this_cpu_write(tasklet_vec.head, NULL); | 487 | __this_cpu_write(tasklet_vec.head, NULL); |
| 488 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); | 488 | __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); |
| 489 | local_irq_enable(); | 489 | local_irq_enable(); |
| 490 | 490 | ||
| 491 | while (list) { | 491 | while (list) { |
| @@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 521 | local_irq_disable(); | 521 | local_irq_disable(); |
| 522 | list = __this_cpu_read(tasklet_hi_vec.head); | 522 | list = __this_cpu_read(tasklet_hi_vec.head); |
| 523 | __this_cpu_write(tasklet_hi_vec.head, NULL); | 523 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
| 524 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); | 524 | __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); |
| 525 | local_irq_enable(); | 525 | local_irq_enable(); |
| 526 | 526 | ||
| 527 | while (list) { | 527 | while (list) { |
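The softirq.c changes are part of the same tree-wide per-CPU accessor cleanup: taking the address of a per-CPU variable is now spelled `this_cpu_ptr(&var)` (or `raw_cpu_ptr(&var)` where the preemption check must be skipped) instead of `&__get_cpu_var(var)`. A minimal sketch of the conversion on an invented per-CPU llist, mirroring the tasklet drain pattern above; the real callers run with interrupts disabled:

```c
/* Illustration of the accessor conversion; my_vec is a made-up example. */
static DEFINE_PER_CPU(struct llist_head, my_vec);

static void example_drain(void)
{
	/* old spelling:  head = &__get_cpu_var(my_vec);   */
	struct llist_head *head = this_cpu_ptr(&my_vec);
	struct llist_node *entry = llist_del_all(head);

	entry = llist_reverse_order(entry);
	/* ... process entries ... */
}
```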
diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..1eaa2f0b0246 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -62,28 +62,28 @@ | |||
| 62 | #include <asm/unistd.h> | 62 | #include <asm/unistd.h> |
| 63 | 63 | ||
| 64 | #ifndef SET_UNALIGN_CTL | 64 | #ifndef SET_UNALIGN_CTL |
| 65 | # define SET_UNALIGN_CTL(a,b) (-EINVAL) | 65 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) |
| 66 | #endif | 66 | #endif |
| 67 | #ifndef GET_UNALIGN_CTL | 67 | #ifndef GET_UNALIGN_CTL |
| 68 | # define GET_UNALIGN_CTL(a,b) (-EINVAL) | 68 | # define GET_UNALIGN_CTL(a, b) (-EINVAL) |
| 69 | #endif | 69 | #endif |
| 70 | #ifndef SET_FPEMU_CTL | 70 | #ifndef SET_FPEMU_CTL |
| 71 | # define SET_FPEMU_CTL(a,b) (-EINVAL) | 71 | # define SET_FPEMU_CTL(a, b) (-EINVAL) |
| 72 | #endif | 72 | #endif |
| 73 | #ifndef GET_FPEMU_CTL | 73 | #ifndef GET_FPEMU_CTL |
| 74 | # define GET_FPEMU_CTL(a,b) (-EINVAL) | 74 | # define GET_FPEMU_CTL(a, b) (-EINVAL) |
| 75 | #endif | 75 | #endif |
| 76 | #ifndef SET_FPEXC_CTL | 76 | #ifndef SET_FPEXC_CTL |
| 77 | # define SET_FPEXC_CTL(a,b) (-EINVAL) | 77 | # define SET_FPEXC_CTL(a, b) (-EINVAL) |
| 78 | #endif | 78 | #endif |
| 79 | #ifndef GET_FPEXC_CTL | 79 | #ifndef GET_FPEXC_CTL |
| 80 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | 80 | # define GET_FPEXC_CTL(a, b) (-EINVAL) |
| 81 | #endif | 81 | #endif |
| 82 | #ifndef GET_ENDIAN | 82 | #ifndef GET_ENDIAN |
| 83 | # define GET_ENDIAN(a,b) (-EINVAL) | 83 | # define GET_ENDIAN(a, b) (-EINVAL) |
| 84 | #endif | 84 | #endif |
| 85 | #ifndef SET_ENDIAN | 85 | #ifndef SET_ENDIAN |
| 86 | # define SET_ENDIAN(a,b) (-EINVAL) | 86 | # define SET_ENDIAN(a, b) (-EINVAL) |
| 87 | #endif | 87 | #endif |
| 88 | #ifndef GET_TSC_CTL | 88 | #ifndef GET_TSC_CTL |
| 89 | # define GET_TSC_CTL(a) (-EINVAL) | 89 | # define GET_TSC_CTL(a) (-EINVAL) |
| @@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
| 182 | rcu_read_lock(); | 182 | rcu_read_lock(); |
| 183 | read_lock(&tasklist_lock); | 183 | read_lock(&tasklist_lock); |
| 184 | switch (which) { | 184 | switch (which) { |
| 185 | case PRIO_PROCESS: | 185 | case PRIO_PROCESS: |
| 186 | if (who) | 186 | if (who) |
| 187 | p = find_task_by_vpid(who); | 187 | p = find_task_by_vpid(who); |
| 188 | else | 188 | else |
| 189 | p = current; | 189 | p = current; |
| 190 | if (p) | 190 | if (p) |
| 191 | error = set_one_prio(p, niceval, error); | 191 | error = set_one_prio(p, niceval, error); |
| 192 | break; | 192 | break; |
| 193 | case PRIO_PGRP: | 193 | case PRIO_PGRP: |
| 194 | if (who) | 194 | if (who) |
| 195 | pgrp = find_vpid(who); | 195 | pgrp = find_vpid(who); |
| 196 | else | 196 | else |
| 197 | pgrp = task_pgrp(current); | 197 | pgrp = task_pgrp(current); |
| 198 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 198 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
| 199 | error = set_one_prio(p, niceval, error); | 199 | error = set_one_prio(p, niceval, error); |
| 200 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 200 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
| 201 | break; | 201 | break; |
| 202 | case PRIO_USER: | 202 | case PRIO_USER: |
| 203 | uid = make_kuid(cred->user_ns, who); | 203 | uid = make_kuid(cred->user_ns, who); |
| 204 | user = cred->user; | 204 | user = cred->user; |
| 205 | if (!who) | 205 | if (!who) |
| 206 | uid = cred->uid; | 206 | uid = cred->uid; |
| 207 | else if (!uid_eq(uid, cred->uid) && | 207 | else if (!uid_eq(uid, cred->uid)) { |
| 208 | !(user = find_user(uid))) | 208 | user = find_user(uid); |
| 209 | if (!user) | ||
| 209 | goto out_unlock; /* No processes for this user */ | 210 | goto out_unlock; /* No processes for this user */ |
| 210 | 211 | } | |
| 211 | do_each_thread(g, p) { | 212 | do_each_thread(g, p) { |
| 212 | if (uid_eq(task_uid(p), uid)) | 213 | if (uid_eq(task_uid(p), uid)) |
| 213 | error = set_one_prio(p, niceval, error); | 214 | error = set_one_prio(p, niceval, error); |
| 214 | } while_each_thread(g, p); | 215 | } while_each_thread(g, p); |
| 215 | if (!uid_eq(uid, cred->uid)) | 216 | if (!uid_eq(uid, cred->uid)) |
| 216 | free_uid(user); /* For find_user() */ | 217 | free_uid(user); /* For find_user() */ |
| 217 | break; | 218 | break; |
| 218 | } | 219 | } |
| 219 | out_unlock: | 220 | out_unlock: |
| 220 | read_unlock(&tasklist_lock); | 221 | read_unlock(&tasklist_lock); |
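In the setpriority() PRIO_USER branch above, the assignment hidden inside the condition (`!(user = find_user(uid))`) is unfolded into an explicit lookup and test; getpriority() below gets the same treatment. What the rewrite makes explicit is the reference-counting rule: only a successful `find_user()` adds a reference, and only that reference is later dropped with `free_uid()` — `cred->user` is used without taking one. A hedged sketch of the discipline; the function name and error code are illustrative:

```c
/* Sketch of the find_user()/free_uid() discipline made explicit above:
 * only a successful find_user() lookup adds a reference, and only that
 * reference must be dropped again. */
static int example_for_each_user_task(kuid_t uid, const struct cred *cred)
{
	struct user_struct *user = cred->user;

	if (!uid_eq(uid, cred->uid)) {
		user = find_user(uid);
		if (!user)
			return -ESRCH;		/* no processes for this user */
	}

	/* ... walk tasks owned by @user here ... */

	if (!uid_eq(uid, cred->uid))
		free_uid(user);			/* balance find_user() */
	return 0;
}
```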
| @@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 244 | rcu_read_lock(); | 245 | rcu_read_lock(); |
| 245 | read_lock(&tasklist_lock); | 246 | read_lock(&tasklist_lock); |
| 246 | switch (which) { | 247 | switch (which) { |
| 247 | case PRIO_PROCESS: | 248 | case PRIO_PROCESS: |
| 248 | if (who) | 249 | if (who) |
| 249 | p = find_task_by_vpid(who); | 250 | p = find_task_by_vpid(who); |
| 250 | else | 251 | else |
| 251 | p = current; | 252 | p = current; |
| 252 | if (p) { | 253 | if (p) { |
| 254 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 255 | if (niceval > retval) | ||
| 256 | retval = niceval; | ||
| 257 | } | ||
| 258 | break; | ||
| 259 | case PRIO_PGRP: | ||
| 260 | if (who) | ||
| 261 | pgrp = find_vpid(who); | ||
| 262 | else | ||
| 263 | pgrp = task_pgrp(current); | ||
| 264 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
| 265 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 266 | if (niceval > retval) | ||
| 267 | retval = niceval; | ||
| 268 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
| 269 | break; | ||
| 270 | case PRIO_USER: | ||
| 271 | uid = make_kuid(cred->user_ns, who); | ||
| 272 | user = cred->user; | ||
| 273 | if (!who) | ||
| 274 | uid = cred->uid; | ||
| 275 | else if (!uid_eq(uid, cred->uid)) { | ||
| 276 | user = find_user(uid); | ||
| 277 | if (!user) | ||
| 278 | goto out_unlock; /* No processes for this user */ | ||
| 279 | } | ||
| 280 | do_each_thread(g, p) { | ||
| 281 | if (uid_eq(task_uid(p), uid)) { | ||
| 253 | niceval = nice_to_rlimit(task_nice(p)); | 282 | niceval = nice_to_rlimit(task_nice(p)); |
| 254 | if (niceval > retval) | 283 | if (niceval > retval) |
| 255 | retval = niceval; | 284 | retval = niceval; |
| 256 | } | 285 | } |
| 257 | break; | 286 | } while_each_thread(g, p); |
| 258 | case PRIO_PGRP: | 287 | if (!uid_eq(uid, cred->uid)) |
| 259 | if (who) | 288 | free_uid(user); /* for find_user() */ |
| 260 | pgrp = find_vpid(who); | 289 | break; |
| 261 | else | ||
| 262 | pgrp = task_pgrp(current); | ||
| 263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
| 264 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 265 | if (niceval > retval) | ||
| 266 | retval = niceval; | ||
| 267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
| 268 | break; | ||
| 269 | case PRIO_USER: | ||
| 270 | uid = make_kuid(cred->user_ns, who); | ||
| 271 | user = cred->user; | ||
| 272 | if (!who) | ||
| 273 | uid = cred->uid; | ||
| 274 | else if (!uid_eq(uid, cred->uid) && | ||
| 275 | !(user = find_user(uid))) | ||
| 276 | goto out_unlock; /* No processes for this user */ | ||
| 277 | |||
| 278 | do_each_thread(g, p) { | ||
| 279 | if (uid_eq(task_uid(p), uid)) { | ||
| 280 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 281 | if (niceval > retval) | ||
| 282 | retval = niceval; | ||
| 283 | } | ||
| 284 | } while_each_thread(g, p); | ||
| 285 | if (!uid_eq(uid, cred->uid)) | ||
| 286 | free_uid(user); /* for find_user() */ | ||
| 287 | break; | ||
| 288 | } | 290 | } |
| 289 | out_unlock: | 291 | out_unlock: |
| 290 | read_unlock(&tasklist_lock); | 292 | read_unlock(&tasklist_lock); |
| @@ -306,7 +308,7 @@ out_unlock: | |||
| 306 | * | 308 | * |
| 307 | * The general idea is that a program which uses just setregid() will be | 309 | * The general idea is that a program which uses just setregid() will be |
| 308 | * 100% compatible with BSD. A program which uses just setgid() will be | 310 | * 100% compatible with BSD. A program which uses just setgid() will be |
| 309 | * 100% compatible with POSIX with saved IDs. | 311 | * 100% compatible with POSIX with saved IDs. |
| 310 | * | 312 | * |
| 311 | * SMP: There are not races, the GIDs are checked only by filesystem | 313 | * SMP: There are not races, the GIDs are checked only by filesystem |
| 312 | * operations (as far as semantic preservation is concerned). | 314 | * operations (as far as semantic preservation is concerned). |
| @@ -364,7 +366,7 @@ error: | |||
| 364 | } | 366 | } |
| 365 | 367 | ||
| 366 | /* | 368 | /* |
| 367 | * setgid() is implemented like SysV w/ SAVED_IDS | 369 | * setgid() is implemented like SysV w/ SAVED_IDS |
| 368 | * | 370 | * |
| 369 | * SMP: Same implicit races as above. | 371 | * SMP: Same implicit races as above. |
| 370 | */ | 372 | */ |
| @@ -442,7 +444,7 @@ static int set_user(struct cred *new) | |||
| 442 | * | 444 | * |
| 443 | * The general idea is that a program which uses just setreuid() will be | 445 | * The general idea is that a program which uses just setreuid() will be |
| 444 | * 100% compatible with BSD. A program which uses just setuid() will be | 446 | * 100% compatible with BSD. A program which uses just setuid() will be |
| 445 | * 100% compatible with POSIX with saved IDs. | 447 | * 100% compatible with POSIX with saved IDs. |
| 446 | */ | 448 | */ |
| 447 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 449 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
| 448 | { | 450 | { |
| @@ -503,17 +505,17 @@ error: | |||
| 503 | abort_creds(new); | 505 | abort_creds(new); |
| 504 | return retval; | 506 | return retval; |
| 505 | } | 507 | } |
| 506 | 508 | ||
| 507 | /* | 509 | /* |
| 508 | * setuid() is implemented like SysV with SAVED_IDS | 510 | * setuid() is implemented like SysV with SAVED_IDS |
| 509 | * | 511 | * |
| 510 | * Note that SAVED_ID's is deficient in that a setuid root program | 512 | * Note that SAVED_ID's is deficient in that a setuid root program |
| 511 | * like sendmail, for example, cannot set its uid to be a normal | 513 | * like sendmail, for example, cannot set its uid to be a normal |
| 512 | * user and then switch back, because if you're root, setuid() sets | 514 | * user and then switch back, because if you're root, setuid() sets |
| 513 | * the saved uid too. If you don't like this, blame the bright people | 515 | * the saved uid too. If you don't like this, blame the bright people |
| 514 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() | 516 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() |
| 515 | * will allow a root program to temporarily drop privileges and be able to | 517 | * will allow a root program to temporarily drop privileges and be able to |
| 516 | * regain them by swapping the real and effective uid. | 518 | * regain them by swapping the real and effective uid. |
| 517 | */ | 519 | */ |
| 518 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 520 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
| 519 | { | 521 | { |
| @@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ | |||
| 637 | euid = from_kuid_munged(cred->user_ns, cred->euid); | 639 | euid = from_kuid_munged(cred->user_ns, cred->euid); |
| 638 | suid = from_kuid_munged(cred->user_ns, cred->suid); | 640 | suid = from_kuid_munged(cred->user_ns, cred->suid); |
| 639 | 641 | ||
| 640 | if (!(retval = put_user(ruid, ruidp)) && | 642 | retval = put_user(ruid, ruidp); |
| 641 | !(retval = put_user(euid, euidp))) | 643 | if (!retval) { |
| 642 | retval = put_user(suid, suidp); | 644 | retval = put_user(euid, euidp); |
| 643 | 645 | if (!retval) | |
| 646 | return put_user(suid, suidp); | ||
| 647 | } | ||
| 644 | return retval; | 648 | return retval; |
| 645 | } | 649 | } |
| 646 | 650 | ||
| @@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ | |||
| 709 | egid = from_kgid_munged(cred->user_ns, cred->egid); | 713 | egid = from_kgid_munged(cred->user_ns, cred->egid); |
| 710 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | 714 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); |
| 711 | 715 | ||
| 712 | if (!(retval = put_user(rgid, rgidp)) && | 716 | retval = put_user(rgid, rgidp); |
| 713 | !(retval = put_user(egid, egidp))) | 717 | if (!retval) { |
| 714 | retval = put_user(sgid, sgidp); | 718 | retval = put_user(egid, egidp); |
| 719 | if (!retval) | ||
| 720 | retval = put_user(sgid, sgidp); | ||
| 721 | } | ||
| 715 | 722 | ||
| 716 | return retval; | 723 | return retval; |
| 717 | } | 724 | } |
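Both `getresuid()` and `getresgid()` above drop the chained `!(retval = put_user(...)) && ...` construction in favour of one statement per copy with an explicit check; behaviour is unchanged, since `put_user()` returns 0 on success and -EFAULT on a faulting pointer. A before/after sketch of the idiom with placeholder values and pointers, none of which come from the patch:

```c
/* Illustration of the cleanup pattern; a, b, c and the user pointers are
 * placeholders, not symbols from this patch. */

/* Before: assignments buried inside the condition. */
static int copy_triple_old(int a, int b, int c,
			   int __user *ap, int __user *bp, int __user *cp)
{
	int retval;

	if (!(retval = put_user(a, ap)) && !(retval = put_user(b, bp)))
		retval = put_user(c, cp);
	return retval;
}

/* After: one statement per step, stopping at the first fault. */
static int copy_triple_new(int a, int b, int c,
			   int __user *ap, int __user *bp, int __user *cp)
{
	int retval;

	retval = put_user(a, ap);
	if (retval)
		return retval;
	retval = put_user(b, bp);
	if (retval)
		return retval;
	return put_user(c, cp);
}
```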
| @@ -862,11 +869,9 @@ void do_sys_times(struct tms *tms) | |||
| 862 | { | 869 | { |
| 863 | cputime_t tgutime, tgstime, cutime, cstime; | 870 | cputime_t tgutime, tgstime, cutime, cstime; |
| 864 | 871 | ||
| 865 | spin_lock_irq(¤t->sighand->siglock); | ||
| 866 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 872 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
| 867 | cutime = current->signal->cutime; | 873 | cutime = current->signal->cutime; |
| 868 | cstime = current->signal->cstime; | 874 | cstime = current->signal->cstime; |
| 869 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 870 | tms->tms_utime = cputime_to_clock_t(tgutime); | 875 | tms->tms_utime = cputime_to_clock_t(tgutime); |
| 871 | tms->tms_stime = cputime_to_clock_t(tgstime); | 876 | tms->tms_stime = cputime_to_clock_t(tgstime); |
| 872 | tms->tms_cutime = cputime_to_clock_t(cutime); | 877 | tms->tms_cutime = cputime_to_clock_t(cutime); |
| @@ -1284,7 +1289,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
| 1284 | /* | 1289 | /* |
| 1285 | * Back compatibility for getrlimit. Needed for some apps. | 1290 | * Back compatibility for getrlimit. Needed for some apps. |
| 1286 | */ | 1291 | */ |
| 1287 | |||
| 1288 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | 1292 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, |
| 1289 | struct rlimit __user *, rlim) | 1293 | struct rlimit __user *, rlim) |
| 1290 | { | 1294 | { |
| @@ -1299,7 +1303,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
| 1299 | x.rlim_cur = 0x7FFFFFFF; | 1303 | x.rlim_cur = 0x7FFFFFFF; |
| 1300 | if (x.rlim_max > 0x7FFFFFFF) | 1304 | if (x.rlim_max > 0x7FFFFFFF) |
| 1301 | x.rlim_max = 0x7FFFFFFF; | 1305 | x.rlim_max = 0x7FFFFFFF; |
| 1302 | return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; | 1306 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; |
| 1303 | } | 1307 | } |
| 1304 | 1308 | ||
| 1305 | #endif | 1309 | #endif |
| @@ -1527,7 +1531,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1527 | cputime_t tgutime, tgstime, utime, stime; | 1531 | cputime_t tgutime, tgstime, utime, stime; |
| 1528 | unsigned long maxrss = 0; | 1532 | unsigned long maxrss = 0; |
| 1529 | 1533 | ||
| 1530 | memset((char *) r, 0, sizeof *r); | 1534 | memset((char *)r, 0, sizeof (*r)); |
| 1531 | utime = stime = 0; | 1535 | utime = stime = 0; |
| 1532 | 1536 | ||
| 1533 | if (who == RUSAGE_THREAD) { | 1537 | if (who == RUSAGE_THREAD) { |
| @@ -1541,41 +1545,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1541 | return; | 1545 | return; |
| 1542 | 1546 | ||
| 1543 | switch (who) { | 1547 | switch (who) { |
| 1544 | case RUSAGE_BOTH: | 1548 | case RUSAGE_BOTH: |
| 1545 | case RUSAGE_CHILDREN: | 1549 | case RUSAGE_CHILDREN: |
| 1546 | utime = p->signal->cutime; | 1550 | utime = p->signal->cutime; |
| 1547 | stime = p->signal->cstime; | 1551 | stime = p->signal->cstime; |
| 1548 | r->ru_nvcsw = p->signal->cnvcsw; | 1552 | r->ru_nvcsw = p->signal->cnvcsw; |
| 1549 | r->ru_nivcsw = p->signal->cnivcsw; | 1553 | r->ru_nivcsw = p->signal->cnivcsw; |
| 1550 | r->ru_minflt = p->signal->cmin_flt; | 1554 | r->ru_minflt = p->signal->cmin_flt; |
| 1551 | r->ru_majflt = p->signal->cmaj_flt; | 1555 | r->ru_majflt = p->signal->cmaj_flt; |
| 1552 | r->ru_inblock = p->signal->cinblock; | 1556 | r->ru_inblock = p->signal->cinblock; |
| 1553 | r->ru_oublock = p->signal->coublock; | 1557 | r->ru_oublock = p->signal->coublock; |
| 1554 | maxrss = p->signal->cmaxrss; | 1558 | maxrss = p->signal->cmaxrss; |
| 1555 | 1559 | ||
| 1556 | if (who == RUSAGE_CHILDREN) | 1560 | if (who == RUSAGE_CHILDREN) |
| 1557 | break; | ||
| 1558 | |||
| 1559 | case RUSAGE_SELF: | ||
| 1560 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | ||
| 1561 | utime += tgutime; | ||
| 1562 | stime += tgstime; | ||
| 1563 | r->ru_nvcsw += p->signal->nvcsw; | ||
| 1564 | r->ru_nivcsw += p->signal->nivcsw; | ||
| 1565 | r->ru_minflt += p->signal->min_flt; | ||
| 1566 | r->ru_majflt += p->signal->maj_flt; | ||
| 1567 | r->ru_inblock += p->signal->inblock; | ||
| 1568 | r->ru_oublock += p->signal->oublock; | ||
| 1569 | if (maxrss < p->signal->maxrss) | ||
| 1570 | maxrss = p->signal->maxrss; | ||
| 1571 | t = p; | ||
| 1572 | do { | ||
| 1573 | accumulate_thread_rusage(t, r); | ||
| 1574 | } while_each_thread(p, t); | ||
| 1575 | break; | 1561 | break; |
| 1576 | 1562 | ||
| 1577 | default: | 1563 | case RUSAGE_SELF: |
| 1578 | BUG(); | 1564 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
| 1565 | utime += tgutime; | ||
| 1566 | stime += tgstime; | ||
| 1567 | r->ru_nvcsw += p->signal->nvcsw; | ||
| 1568 | r->ru_nivcsw += p->signal->nivcsw; | ||
| 1569 | r->ru_minflt += p->signal->min_flt; | ||
| 1570 | r->ru_majflt += p->signal->maj_flt; | ||
| 1571 | r->ru_inblock += p->signal->inblock; | ||
| 1572 | r->ru_oublock += p->signal->oublock; | ||
| 1573 | if (maxrss < p->signal->maxrss) | ||
| 1574 | maxrss = p->signal->maxrss; | ||
| 1575 | t = p; | ||
| 1576 | do { | ||
| 1577 | accumulate_thread_rusage(t, r); | ||
| 1578 | } while_each_thread(p, t); | ||
| 1579 | break; | ||
| 1580 | |||
| 1581 | default: | ||
| 1582 | BUG(); | ||
| 1579 | } | 1583 | } |
| 1580 | unlock_task_sighand(p, &flags); | 1584 | unlock_task_sighand(p, &flags); |
| 1581 | 1585 | ||
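The k_getrusage() hunk above is a re-indentation, but it is worth spelling out the fall-through it preserves: RUSAGE_CHILDREN stops after the reaped children's totals, while RUSAGE_BOTH falls through into the RUSAGE_SELF accumulation and adds the caller's own counters on top. A standalone illustration with locally defined constants that mirror the kernel's values:

```c
/* Standalone illustration of the RUSAGE_* accumulation order kept by the
 * re-indented switch above.  The EX_ constants are local stand-ins. */
#include <stdio.h>

#define EX_RUSAGE_CHILDREN (-1)
#define EX_RUSAGE_BOTH     (-2)
#define EX_RUSAGE_SELF     0

static long example_minflt(int who, long child_minflt, long self_minflt)
{
	long total = 0;

	switch (who) {
	case EX_RUSAGE_BOTH:
	case EX_RUSAGE_CHILDREN:
		total += child_minflt;
		if (who == EX_RUSAGE_CHILDREN)
			break;
		/* fall through: BOTH also counts the caller itself */
	case EX_RUSAGE_SELF:
		total += self_minflt;
		break;
	}
	return total;
}

int main(void)
{
	printf("children=%ld both=%ld self=%ld\n",
	       example_minflt(EX_RUSAGE_CHILDREN, 10, 3),
	       example_minflt(EX_RUSAGE_BOTH, 10, 3),
	       example_minflt(EX_RUSAGE_SELF, 10, 3));
	/* prints children=10 both=13 self=3 */
	return 0;
}
```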
| @@ -1585,6 +1589,7 @@ out: | |||
| 1585 | 1589 | ||
| 1586 | if (who != RUSAGE_CHILDREN) { | 1590 | if (who != RUSAGE_CHILDREN) { |
| 1587 | struct mm_struct *mm = get_task_mm(p); | 1591 | struct mm_struct *mm = get_task_mm(p); |
| 1592 | |||
| 1588 | if (mm) { | 1593 | if (mm) { |
| 1589 | setmax_mm_hiwater_rss(&maxrss, mm); | 1594 | setmax_mm_hiwater_rss(&maxrss, mm); |
| 1590 | mmput(mm); | 1595 | mmput(mm); |
| @@ -1596,6 +1601,7 @@ out: | |||
| 1596 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1601 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
| 1597 | { | 1602 | { |
| 1598 | struct rusage r; | 1603 | struct rusage r; |
| 1604 | |||
| 1599 | k_getrusage(p, who, &r); | 1605 | k_getrusage(p, who, &r); |
| 1600 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | 1606 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; |
| 1601 | } | 1607 | } |
| @@ -1628,12 +1634,14 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1628 | return mask; | 1634 | return mask; |
| 1629 | } | 1635 | } |
| 1630 | 1636 | ||
| 1631 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1637 | static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) |
| 1632 | { | 1638 | { |
| 1633 | struct fd exe; | 1639 | struct fd exe; |
| 1634 | struct inode *inode; | 1640 | struct inode *inode; |
| 1635 | int err; | 1641 | int err; |
| 1636 | 1642 | ||
| 1643 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 1644 | |||
| 1637 | exe = fdget(fd); | 1645 | exe = fdget(fd); |
| 1638 | if (!exe.file) | 1646 | if (!exe.file) |
| 1639 | return -EBADF; | 1647 | return -EBADF; |
| @@ -1654,8 +1662,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1654 | if (err) | 1662 | if (err) |
| 1655 | goto exit; | 1663 | goto exit; |
| 1656 | 1664 | ||
| 1657 | down_write(&mm->mmap_sem); | ||
| 1658 | |||
| 1659 | /* | 1665 | /* |
| 1660 | * Forbid mm->exe_file change if old file still mapped. | 1666 | * Forbid mm->exe_file change if old file still mapped. |
| 1661 | */ | 1667 | */ |
| @@ -1667,7 +1673,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1667 | if (vma->vm_file && | 1673 | if (vma->vm_file && |
| 1668 | path_equal(&vma->vm_file->f_path, | 1674 | path_equal(&vma->vm_file->f_path, |
| 1669 | &mm->exe_file->f_path)) | 1675 | &mm->exe_file->f_path)) |
| 1670 | goto exit_unlock; | 1676 | goto exit; |
| 1671 | } | 1677 | } |
| 1672 | 1678 | ||
| 1673 | /* | 1679 | /* |
| @@ -1678,34 +1684,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1678 | */ | 1684 | */ |
| 1679 | err = -EPERM; | 1685 | err = -EPERM; |
| 1680 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1686 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
| 1681 | goto exit_unlock; | 1687 | goto exit; |
| 1682 | 1688 | ||
| 1683 | err = 0; | 1689 | err = 0; |
| 1684 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ | 1690 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ |
| 1685 | exit_unlock: | ||
| 1686 | up_write(&mm->mmap_sem); | ||
| 1687 | |||
| 1688 | exit: | 1691 | exit: |
| 1689 | fdput(exe); | 1692 | fdput(exe); |
| 1690 | return err; | 1693 | return err; |
| 1691 | } | 1694 | } |
| 1692 | 1695 | ||
| 1696 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
| 1697 | /* | ||
| 1698 | * WARNING: we don't require any capability here so be very careful | ||
| 1699 | * about what userspace is allowed to modify. | ||
| 1700 | */ | ||
| 1701 | static int validate_prctl_map(struct prctl_mm_map *prctl_map) | ||
| 1702 | { | ||
| 1703 | unsigned long mmap_max_addr = TASK_SIZE; | ||
| 1704 | struct mm_struct *mm = current->mm; | ||
| 1705 | int error = -EINVAL, i; | ||
| 1706 | |||
| 1707 | static const unsigned char offsets[] = { | ||
| 1708 | offsetof(struct prctl_mm_map, start_code), | ||
| 1709 | offsetof(struct prctl_mm_map, end_code), | ||
| 1710 | offsetof(struct prctl_mm_map, start_data), | ||
| 1711 | offsetof(struct prctl_mm_map, end_data), | ||
| 1712 | offsetof(struct prctl_mm_map, start_brk), | ||
| 1713 | offsetof(struct prctl_mm_map, brk), | ||
| 1714 | offsetof(struct prctl_mm_map, start_stack), | ||
| 1715 | offsetof(struct prctl_mm_map, arg_start), | ||
| 1716 | offsetof(struct prctl_mm_map, arg_end), | ||
| 1717 | offsetof(struct prctl_mm_map, env_start), | ||
| 1718 | offsetof(struct prctl_mm_map, env_end), | ||
| 1719 | }; | ||
| 1720 | |||
| 1721 | /* | ||
| 1722 | * Make sure the members do not point somewhere outside | ||
| 1723 | * of the allowed address space. | ||
| 1724 | */ | ||
| 1725 | for (i = 0; i < ARRAY_SIZE(offsets); i++) { | ||
| 1726 | u64 val = *(u64 *)((char *)prctl_map + offsets[i]); | ||
| 1727 | |||
| 1728 | if ((unsigned long)val >= mmap_max_addr || | ||
| 1729 | (unsigned long)val < mmap_min_addr) | ||
| 1730 | goto out; | ||
| 1731 | } | ||
| 1732 | |||
| 1733 | /* | ||
| 1734 | * Make sure the pairs are ordered. | ||
| 1735 | */ | ||
| 1736 | #define __prctl_check_order(__m1, __op, __m2) \ | ||
| 1737 | ((unsigned long)prctl_map->__m1 __op \ | ||
| 1738 | (unsigned long)prctl_map->__m2) ? 0 : -EINVAL | ||
| 1739 | error = __prctl_check_order(start_code, <, end_code); | ||
| 1740 | error |= __prctl_check_order(start_data, <, end_data); | ||
| 1741 | error |= __prctl_check_order(start_brk, <=, brk); | ||
| 1742 | error |= __prctl_check_order(arg_start, <=, arg_end); | ||
| 1743 | error |= __prctl_check_order(env_start, <=, env_end); | ||
| 1744 | if (error) | ||
| 1745 | goto out; | ||
| 1746 | #undef __prctl_check_order | ||
| 1747 | |||
| 1748 | error = -EINVAL; | ||
| 1749 | |||
| 1750 | /* | ||
| 1751 | * @brk should be after @end_data in traditional maps. | ||
| 1752 | */ | ||
| 1753 | if (prctl_map->start_brk <= prctl_map->end_data || | ||
| 1754 | prctl_map->brk <= prctl_map->end_data) | ||
| 1755 | goto out; | ||
| 1756 | |||
| 1757 | /* | ||
| 1758 | * Nor should we allow overriding the limits if they are set. | ||
| 1759 | */ | ||
| 1760 | if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, | ||
| 1761 | prctl_map->start_brk, prctl_map->end_data, | ||
| 1762 | prctl_map->start_data)) | ||
| 1763 | goto out; | ||
| 1764 | |||
| 1765 | /* | ||
| 1766 | * Someone is trying to cheat the auxv vector. | ||
| 1767 | */ | ||
| 1768 | if (prctl_map->auxv_size) { | ||
| 1769 | if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) | ||
| 1770 | goto out; | ||
| 1771 | } | ||
| 1772 | |||
| 1773 | /* | ||
| 1774 | * Finally, make sure the caller has the right to | ||
| 1775 | * change the /proc/pid/exe link: only local root should | ||
| 1776 | * be allowed to. | ||
| 1777 | */ | ||
| 1778 | if (prctl_map->exe_fd != (u32)-1) { | ||
| 1779 | struct user_namespace *ns = current_user_ns(); | ||
| 1780 | const struct cred *cred = current_cred(); | ||
| 1781 | |||
| 1782 | if (!uid_eq(cred->uid, make_kuid(ns, 0)) || | ||
| 1783 | !gid_eq(cred->gid, make_kgid(ns, 0))) | ||
| 1784 | goto out; | ||
| 1785 | } | ||
| 1786 | |||
| 1787 | error = 0; | ||
| 1788 | out: | ||
| 1789 | return error; | ||
| 1790 | } | ||
| 1791 | |||
| 1792 | static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) | ||
| 1793 | { | ||
| 1794 | struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; | ||
| 1795 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
| 1796 | struct mm_struct *mm = current->mm; | ||
| 1797 | int error; | ||
| 1798 | |||
| 1799 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
| 1800 | BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); | ||
| 1801 | |||
| 1802 | if (opt == PR_SET_MM_MAP_SIZE) | ||
| 1803 | return put_user((unsigned int)sizeof(prctl_map), | ||
| 1804 | (unsigned int __user *)addr); | ||
| 1805 | |||
| 1806 | if (data_size != sizeof(prctl_map)) | ||
| 1807 | return -EINVAL; | ||
| 1808 | |||
| 1809 | if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) | ||
| 1810 | return -EFAULT; | ||
| 1811 | |||
| 1812 | error = validate_prctl_map(&prctl_map); | ||
| 1813 | if (error) | ||
| 1814 | return error; | ||
| 1815 | |||
| 1816 | if (prctl_map.auxv_size) { | ||
| 1817 | memset(user_auxv, 0, sizeof(user_auxv)); | ||
| 1818 | if (copy_from_user(user_auxv, | ||
| 1819 | (const void __user *)prctl_map.auxv, | ||
| 1820 | prctl_map.auxv_size)) | ||
| 1821 | return -EFAULT; | ||
| 1822 | |||
| 1823 | /* The last entry must be AT_NULL, as the specification requires */ | ||
| 1824 | user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; | ||
| 1825 | user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; | ||
| 1826 | } | ||
| 1827 | |||
| 1828 | down_write(&mm->mmap_sem); | ||
| 1829 | if (prctl_map.exe_fd != (u32)-1) | ||
| 1830 | error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); | ||
| 1831 | downgrade_write(&mm->mmap_sem); | ||
| 1832 | if (error) | ||
| 1833 | goto out; | ||
| 1834 | |||
| 1835 | /* | ||
| 1836 | * We don't validate whether these members point to | ||
| 1837 | * real, currently mapped VMAs: the application may already have | ||
| 1838 | * unmapped the corresponding VMAs, and the kernel mostly uses these | ||
| 1839 | * members for statistics output in procfs, except for | ||
| 1840 | * | ||
| 1841 | * - @start_brk/@brk, which are used in do_brk, but the kernel looks | ||
| 1842 | * up the VMAs when updating these members, so a bogus value written | ||
| 1843 | * here makes the kernel complain about the userspace program but | ||
| 1844 | * won't lead to any problem in the kernel itself | ||
| 1845 | */ | ||
| 1846 | |||
| 1847 | mm->start_code = prctl_map.start_code; | ||
| 1848 | mm->end_code = prctl_map.end_code; | ||
| 1849 | mm->start_data = prctl_map.start_data; | ||
| 1850 | mm->end_data = prctl_map.end_data; | ||
| 1851 | mm->start_brk = prctl_map.start_brk; | ||
| 1852 | mm->brk = prctl_map.brk; | ||
| 1853 | mm->start_stack = prctl_map.start_stack; | ||
| 1854 | mm->arg_start = prctl_map.arg_start; | ||
| 1855 | mm->arg_end = prctl_map.arg_end; | ||
| 1856 | mm->env_start = prctl_map.env_start; | ||
| 1857 | mm->env_end = prctl_map.env_end; | ||
| 1858 | |||
| 1859 | /* | ||
| 1860 | * Note this update of @saved_auxv is lockless, so | ||
| 1861 | * if someone reads this member in procfs while we're | ||
| 1862 | * updating it, they may see partially updated results. This is | ||
| 1863 | * a known and acceptable trade-off: we leave it as is rather | ||
| 1864 | * than introduce additional locks here and make the kernel | ||
| 1865 | * more complex. | ||
| 1866 | */ | ||
| 1867 | if (prctl_map.auxv_size) | ||
| 1868 | memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); | ||
| 1869 | |||
| 1870 | error = 0; | ||
| 1871 | out: | ||
| 1872 | up_read(&mm->mmap_sem); | ||
| 1873 | return error; | ||
| 1874 | } | ||
| 1875 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
| 1876 | |||
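The PR_SET_MM_MAP path added above lets a checkpoint/restore tool install a complete memory-layout description in a single prctl() call rather than one PR_SET_MM_* call per field. Below is a minimal userspace sketch, assuming the struct prctl_mm_map and the PR_SET_MM_MAP/PR_SET_MM_MAP_SIZE option values this series adds to the uapi <linux/prctl.h> are visible through the libc headers; the saved layout itself is a placeholder.

```c
#include <stdio.h>
#include <sys/prctl.h>	/* pulls in <linux/prctl.h> on Linux */

/* Hypothetical restore helper; struct prctl_mm_map and the option values
 * are the ones this patch adds to the uapi header, and the caller is
 * assumed to have filled *saved from a previous checkpoint.
 */
int restore_mm_layout(const struct prctl_mm_map *saved)
{
	unsigned int size = 0;

	/* Probe the structure size the running kernel expects. */
	if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0))
		return -1;
	if (size != sizeof(*saved)) {
		fprintf(stderr, "prctl_mm_map size mismatch: %u\n", size);
		return -1;
	}

	/* One call installs start_code/end_code/.../exe_fd in one shot; the
	 * kernel applies the ordering and rlimit checks from
	 * validate_prctl_map() before touching the mm.
	 */
	return prctl(PR_SET_MM, PR_SET_MM_MAP,
		     (unsigned long)saved, sizeof(*saved), 0);
}
```

The PR_SET_MM_MAP_SIZE probe mirrors the put_user() of sizeof(prctl_map) in prctl_set_mm_map(), so the tool can detect a structure-size mismatch before attempting the real call.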
| 1693 | static int prctl_set_mm(int opt, unsigned long addr, | 1877 | static int prctl_set_mm(int opt, unsigned long addr, |
| 1694 | unsigned long arg4, unsigned long arg5) | 1878 | unsigned long arg4, unsigned long arg5) |
| 1695 | { | 1879 | { |
| 1696 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
| 1697 | struct mm_struct *mm = current->mm; | 1880 | struct mm_struct *mm = current->mm; |
| 1698 | struct vm_area_struct *vma; | 1881 | struct vm_area_struct *vma; |
| 1699 | int error; | 1882 | int error; |
| 1700 | 1883 | ||
| 1701 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) | 1884 | if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && |
| 1885 | opt != PR_SET_MM_MAP && | ||
| 1886 | opt != PR_SET_MM_MAP_SIZE))) | ||
| 1702 | return -EINVAL; | 1887 | return -EINVAL; |
| 1703 | 1888 | ||
| 1889 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
| 1890 | if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) | ||
| 1891 | return prctl_set_mm_map(opt, (const void __user *)addr, arg4); | ||
| 1892 | #endif | ||
| 1893 | |||
| 1704 | if (!capable(CAP_SYS_RESOURCE)) | 1894 | if (!capable(CAP_SYS_RESOURCE)) |
| 1705 | return -EPERM; | 1895 | return -EPERM; |
| 1706 | 1896 | ||
| 1707 | if (opt == PR_SET_MM_EXE_FILE) | 1897 | if (opt == PR_SET_MM_EXE_FILE) { |
| 1708 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | 1898 | down_write(&mm->mmap_sem); |
| 1899 | error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); | ||
| 1900 | up_write(&mm->mmap_sem); | ||
| 1901 | return error; | ||
| 1902 | } | ||
| 1709 | 1903 | ||
| 1710 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | 1904 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
| 1711 | return -EINVAL; | 1905 | return -EINVAL; |
| @@ -1733,9 +1927,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1733 | if (addr <= mm->end_data) | 1927 | if (addr <= mm->end_data) |
| 1734 | goto out; | 1928 | goto out; |
| 1735 | 1929 | ||
| 1736 | if (rlim < RLIM_INFINITY && | 1930 | if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, |
| 1737 | (mm->brk - addr) + | 1931 | mm->end_data, mm->start_data)) |
| 1738 | (mm->end_data - mm->start_data) > rlim) | ||
| 1739 | goto out; | 1932 | goto out; |
| 1740 | 1933 | ||
| 1741 | mm->start_brk = addr; | 1934 | mm->start_brk = addr; |
| @@ -1745,9 +1938,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1745 | if (addr <= mm->end_data) | 1938 | if (addr <= mm->end_data) |
| 1746 | goto out; | 1939 | goto out; |
| 1747 | 1940 | ||
| 1748 | if (rlim < RLIM_INFINITY && | 1941 | if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, |
| 1749 | (addr - mm->start_brk) + | 1942 | mm->end_data, mm->start_data)) |
| 1750 | (mm->end_data - mm->start_data) > rlim) | ||
| 1751 | goto out; | 1943 | goto out; |
| 1752 | 1944 | ||
| 1753 | mm->brk = addr; | 1945 | mm->brk = addr; |
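Both hunks above drop the open-coded RLIMIT_DATA arithmetic in favour of the new check_data_rlimit() helper. Reconstructed from the removed lines (so a sketch, not a quote of the helper in include/linux/mm.h), the check being centralized is roughly:

```c
#include <errno.h>
#include <sys/resource.h>

/* Sketch of the RLIMIT_DATA check that prctl_set_mm() now delegates to
 * check_data_rlimit(): the prospective brk span plus the existing data
 * segment must fit under the limit unless the limit is infinite.
 * Reconstructed from the removed open-coded logic above.
 */
int data_rlimit_exceeded(unsigned long rlim, unsigned long new_brk,
			 unsigned long start_brk, unsigned long end_data,
			 unsigned long start_data)
{
	if (rlim < RLIM_INFINITY &&
	    (new_brk - start_brk) + (end_data - start_data) > rlim)
		return -ENOSPC;	/* assumed error code; illustrative only */
	return 0;
}
```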
| @@ -2023,6 +2215,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
| 2023 | { | 2215 | { |
| 2024 | int err = 0; | 2216 | int err = 0; |
| 2025 | int cpu = raw_smp_processor_id(); | 2217 | int cpu = raw_smp_processor_id(); |
| 2218 | |||
| 2026 | if (cpup) | 2219 | if (cpup) |
| 2027 | err |= put_user(cpu, cpup); | 2220 | err |= put_user(cpu, cpup); |
| 2028 | if (nodep) | 2221 | if (nodep) |
| @@ -2135,7 +2328,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) | |||
| 2135 | /* Check to see if any memory value is too large for 32-bit and scale | 2328 | /* Check to see if any memory value is too large for 32-bit and scale |
| 2136 | * down if needed | 2329 | * down if needed |
| 2137 | */ | 2330 | */ |
| 2138 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | 2331 | if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { |
| 2139 | int bitcount = 0; | 2332 | int bitcount = 0; |
| 2140 | 2333 | ||
| 2141 | while (s.mem_unit < PAGE_SIZE) { | 2334 | while (s.mem_unit < PAGE_SIZE) { |
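The compat_sys_sysinfo() hunk replaces a raw right shift by 32 with upper_32_bits(). The helper exists because shifting a value by its full width is undefined in C, so the macro splits the shift in two; a sketch of the idea, modeled on the <linux/kernel.h> definition rather than quoted from it:

```c
/* Sketch of the upper_32_bits()/lower_32_bits() idea: two 16-bit shifts
 * stay well-defined even if the operand happens to be only 32 bits wide,
 * whereas a single ">> 32" would be undefined behaviour in that case.
 */
#define upper_32_bits(n)	((unsigned int)(((n) >> 16) >> 16))
#define lower_32_bits(n)	((unsigned int)(n))
```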
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a2105914..02aa4185b17e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); | |||
| 25 | cond_syscall(sys_swapoff); | 25 | cond_syscall(sys_swapoff); |
| 26 | cond_syscall(sys_kexec_load); | 26 | cond_syscall(sys_kexec_load); |
| 27 | cond_syscall(compat_sys_kexec_load); | 27 | cond_syscall(compat_sys_kexec_load); |
| 28 | cond_syscall(sys_kexec_file_load); | ||
| 28 | cond_syscall(sys_init_module); | 29 | cond_syscall(sys_init_module); |
| 29 | cond_syscall(sys_finit_module); | 30 | cond_syscall(sys_finit_module); |
| 30 | cond_syscall(sys_delete_module); | 31 | cond_syscall(sys_delete_module); |
| @@ -155,6 +156,9 @@ cond_syscall(sys_process_vm_writev); | |||
| 155 | cond_syscall(compat_sys_process_vm_readv); | 156 | cond_syscall(compat_sys_process_vm_readv); |
| 156 | cond_syscall(compat_sys_process_vm_writev); | 157 | cond_syscall(compat_sys_process_vm_writev); |
| 157 | cond_syscall(sys_uselib); | 158 | cond_syscall(sys_uselib); |
| 159 | cond_syscall(sys_fadvise64); | ||
| 160 | cond_syscall(sys_fadvise64_64); | ||
| 161 | cond_syscall(sys_madvise); | ||
| 158 | 162 | ||
| 159 | /* arch-specific weak syscall entries */ | 163 | /* arch-specific weak syscall entries */ |
| 160 | cond_syscall(sys_pciconfig_read); | 164 | cond_syscall(sys_pciconfig_read); |
| @@ -197,6 +201,7 @@ cond_syscall(compat_sys_timerfd_settime); | |||
| 197 | cond_syscall(compat_sys_timerfd_gettime); | 201 | cond_syscall(compat_sys_timerfd_gettime); |
| 198 | cond_syscall(sys_eventfd); | 202 | cond_syscall(sys_eventfd); |
| 199 | cond_syscall(sys_eventfd2); | 203 | cond_syscall(sys_eventfd2); |
| 204 | cond_syscall(sys_memfd_create); | ||
| 200 | 205 | ||
| 201 | /* performance counters: */ | 206 | /* performance counters: */ |
| 202 | cond_syscall(sys_perf_event_open); | 207 | cond_syscall(sys_perf_event_open); |
| @@ -216,3 +221,6 @@ cond_syscall(sys_kcmp); | |||
| 216 | 221 | ||
| 217 | /* operate on Secure Computing state */ | 222 | /* operate on Secure Computing state */ |
| 218 | cond_syscall(sys_seccomp); | 223 | cond_syscall(sys_seccomp); |
| 224 | |||
| 225 | /* access BPF programs and maps */ | ||
| 226 | cond_syscall(sys_bpf); | ||
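The cond_syscall() entries above cover syscalls that may be configured out (sys_bpf, sys_memfd_create, sys_kexec_file_load, and so on). Conceptually, each entry supplies a weak fallback that resolves to sys_ni_syscall() and returns -ENOSYS when the real implementation is not built in. The kernel's actual macro is an assembler .weak/.set alias; the GCC attribute form below is only an illustration of the linking semantics:

```c
#include <errno.h>

/* Weak -ENOSYS stub that any real syscall implementation overrides. */
long sys_ni_syscall(void)
{
	return -ENOSYS;
}

/* Illustration of what a cond_syscall(sys_bpf) entry achieves: if no
 * strong sys_bpf symbol is linked in (CONFIG_BPF_SYSCALL=n), the call
 * table slot falls back to the stub above.
 */
long sys_bpf(void) __attribute__((weak, alias("sys_ni_syscall")));
```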
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..15f2511a1b7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { | |||
| 387 | .data = &sysctl_numa_balancing_scan_size, | 387 | .data = &sysctl_numa_balancing_scan_size, |
| 388 | .maxlen = sizeof(unsigned int), | 388 | .maxlen = sizeof(unsigned int), |
| 389 | .mode = 0644, | 389 | .mode = 0644, |
| 390 | .proc_handler = proc_dointvec, | 390 | .proc_handler = proc_dointvec_minmax, |
| 391 | .extra1 = &one, | ||
| 391 | }, | 392 | }, |
| 392 | { | 393 | { |
| 393 | .procname = "numa_balancing", | 394 | .procname = "numa_balancing", |
| @@ -1055,15 +1056,6 @@ static struct ctl_table kern_table[] = { | |||
| 1055 | .child = key_sysctls, | 1056 | .child = key_sysctls, |
| 1056 | }, | 1057 | }, |
| 1057 | #endif | 1058 | #endif |
| 1058 | #ifdef CONFIG_RCU_TORTURE_TEST | ||
| 1059 | { | ||
| 1060 | .procname = "rcutorture_runnable", | ||
| 1061 | .data = &rcutorture_runnable, | ||
| 1062 | .maxlen = sizeof(int), | ||
| 1063 | .mode = 0644, | ||
| 1064 | .proc_handler = proc_dointvec, | ||
| 1065 | }, | ||
| 1066 | #endif | ||
| 1067 | #ifdef CONFIG_PERF_EVENTS | 1059 | #ifdef CONFIG_PERF_EVENTS |
| 1068 | /* | 1060 | /* |
| 1069 | * User-space scripts rely on the existence of this file | 1061 | * User-space scripts rely on the existence of this file |
| @@ -1240,8 +1232,7 @@ static struct ctl_table vm_table[] = { | |||
| 1240 | .maxlen = sizeof(unsigned long), | 1232 | .maxlen = sizeof(unsigned long), |
| 1241 | .mode = 0644, | 1233 | .mode = 0644, |
| 1242 | .proc_handler = hugetlb_sysctl_handler, | 1234 | .proc_handler = hugetlb_sysctl_handler, |
| 1243 | .extra1 = (void *)&hugetlb_zero, | 1235 | .extra1 = &zero, |
| 1244 | .extra2 = (void *)&hugetlb_infinity, | ||
| 1245 | }, | 1236 | }, |
| 1246 | #ifdef CONFIG_NUMA | 1237 | #ifdef CONFIG_NUMA |
| 1247 | { | 1238 | { |
| @@ -1250,8 +1241,7 @@ static struct ctl_table vm_table[] = { | |||
| 1250 | .maxlen = sizeof(unsigned long), | 1241 | .maxlen = sizeof(unsigned long), |
| 1251 | .mode = 0644, | 1242 | .mode = 0644, |
| 1252 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, | 1243 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, |
| 1253 | .extra1 = (void *)&hugetlb_zero, | 1244 | .extra1 = &zero, |
| 1254 | .extra2 = (void *)&hugetlb_infinity, | ||
| 1255 | }, | 1245 | }, |
| 1256 | #endif | 1246 | #endif |
| 1257 | { | 1247 | { |
| @@ -1274,8 +1264,7 @@ static struct ctl_table vm_table[] = { | |||
| 1274 | .maxlen = sizeof(unsigned long), | 1264 | .maxlen = sizeof(unsigned long), |
| 1275 | .mode = 0644, | 1265 | .mode = 0644, |
| 1276 | .proc_handler = hugetlb_overcommit_handler, | 1266 | .proc_handler = hugetlb_overcommit_handler, |
| 1277 | .extra1 = (void *)&hugetlb_zero, | 1267 | .extra1 = &zero, |
| 1278 | .extra2 = (void *)&hugetlb_infinity, | ||
| 1279 | }, | 1268 | }, |
| 1280 | #endif | 1269 | #endif |
| 1281 | { | 1270 | { |
| @@ -1463,13 +1452,6 @@ static struct ctl_table vm_table[] = { | |||
| 1463 | .extra2 = &one, | 1452 | .extra2 = &one, |
| 1464 | }, | 1453 | }, |
| 1465 | #endif | 1454 | #endif |
| 1466 | { | ||
| 1467 | .procname = "scan_unevictable_pages", | ||
| 1468 | .data = &scan_unevictable_pages, | ||
| 1469 | .maxlen = sizeof(scan_unevictable_pages), | ||
| 1470 | .mode = 0644, | ||
| 1471 | .proc_handler = scan_unevictable_handler, | ||
| 1472 | }, | ||
| 1473 | #ifdef CONFIG_MEMORY_FAILURE | 1455 | #ifdef CONFIG_MEMORY_FAILURE |
| 1474 | { | 1456 | { |
| 1475 | .procname = "memory_failure_early_kill", | 1457 | .procname = "memory_failure_early_kill", |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e4ba9a5a5ccb..9a4f750a2963 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { | |||
| 390 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, | 390 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, |
| 391 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, | 391 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, |
| 392 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, | 392 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, |
| 393 | { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, | ||
| 394 | { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, | 393 | { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, |
| 395 | { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, | 394 | { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, |
| 396 | { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, | 395 | { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..b312fcc73024 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 638 | fill_tgid_exit(tsk); | 638 | fill_tgid_exit(tsk); |
| 639 | } | 639 | } |
| 640 | 640 | ||
| 641 | listeners = __this_cpu_ptr(&listener_array); | 641 | listeners = raw_cpu_ptr(&listener_array); |
| 642 | if (list_empty(&listeners->list)) | 642 | if (list_empty(&listeners->list)) |
| 643 | return; | 643 | return; |
| 644 | 644 | ||
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
| @@ -14,6 +14,8 @@ | |||
| 14 | * the GNU General Public License for more details. | 14 | * the GNU General Public License for more details. |
| 15 | */ | 15 | */ |
| 16 | 16 | ||
| 17 | #define pr_fmt(fmt) "Kprobe smoke test: " fmt | ||
| 18 | |||
| 17 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
| 18 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
| 19 | #include <linux/random.h> | 21 | #include <linux/random.h> |
| @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 41 | { | 43 | { |
| 42 | if (preh_val != (rand1 / div_factor)) { | 44 | if (preh_val != (rand1 / div_factor)) { |
| 43 | handler_errors++; | 45 | handler_errors++; |
| 44 | printk(KERN_ERR "Kprobe smoke test failed: " | 46 | pr_err("incorrect value in post_handler\n"); |
| 45 | "incorrect value in post_handler\n"); | ||
| 46 | } | 47 | } |
| 47 | posth_val = preh_val + div_factor; | 48 | posth_val = preh_val + div_factor; |
| 48 | } | 49 | } |
| @@ -59,8 +60,7 @@ static int test_kprobe(void) | |||
| 59 | 60 | ||
| 60 | ret = register_kprobe(&kp); | 61 | ret = register_kprobe(&kp); |
| 61 | if (ret < 0) { | 62 | if (ret < 0) { |
| 62 | printk(KERN_ERR "Kprobe smoke test failed: " | 63 | pr_err("register_kprobe returned %d\n", ret); |
| 63 | "register_kprobe returned %d\n", ret); | ||
| 64 | return ret; | 64 | return ret; |
| 65 | } | 65 | } |
| 66 | 66 | ||
| @@ -68,14 +68,12 @@ static int test_kprobe(void) | |||
| 68 | unregister_kprobe(&kp); | 68 | unregister_kprobe(&kp); |
| 69 | 69 | ||
| 70 | if (preh_val == 0) { | 70 | if (preh_val == 0) { |
| 71 | printk(KERN_ERR "Kprobe smoke test failed: " | 71 | pr_err("kprobe pre_handler not called\n"); |
| 72 | "kprobe pre_handler not called\n"); | ||
| 73 | handler_errors++; | 72 | handler_errors++; |
| 74 | } | 73 | } |
| 75 | 74 | ||
| 76 | if (posth_val == 0) { | 75 | if (posth_val == 0) { |
| 77 | printk(KERN_ERR "Kprobe smoke test failed: " | 76 | pr_err("kprobe post_handler not called\n"); |
| 78 | "kprobe post_handler not called\n"); | ||
| 79 | handler_errors++; | 77 | handler_errors++; |
| 80 | } | 78 | } |
| 81 | 79 | ||
| @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, | |||
| 98 | { | 96 | { |
| 99 | if (preh_val != (rand1 / div_factor) + 1) { | 97 | if (preh_val != (rand1 / div_factor) + 1) { |
| 100 | handler_errors++; | 98 | handler_errors++; |
| 101 | printk(KERN_ERR "Kprobe smoke test failed: " | 99 | pr_err("incorrect value in post_handler2\n"); |
| 102 | "incorrect value in post_handler2\n"); | ||
| 103 | } | 100 | } |
| 104 | posth_val = preh_val + div_factor; | 101 | posth_val = preh_val + div_factor; |
| 105 | } | 102 | } |
| @@ -120,8 +117,7 @@ static int test_kprobes(void) | |||
| 120 | kp.flags = 0; | 117 | kp.flags = 0; |
| 121 | ret = register_kprobes(kps, 2); | 118 | ret = register_kprobes(kps, 2); |
| 122 | if (ret < 0) { | 119 | if (ret < 0) { |
| 123 | printk(KERN_ERR "Kprobe smoke test failed: " | 120 | pr_err("register_kprobes returned %d\n", ret); |
| 124 | "register_kprobes returned %d\n", ret); | ||
| 125 | return ret; | 121 | return ret; |
| 126 | } | 122 | } |
| 127 | 123 | ||
| @@ -130,14 +126,12 @@ static int test_kprobes(void) | |||
| 130 | ret = target(rand1); | 126 | ret = target(rand1); |
| 131 | 127 | ||
| 132 | if (preh_val == 0) { | 128 | if (preh_val == 0) { |
| 133 | printk(KERN_ERR "Kprobe smoke test failed: " | 129 | pr_err("kprobe pre_handler not called\n"); |
| 134 | "kprobe pre_handler not called\n"); | ||
| 135 | handler_errors++; | 130 | handler_errors++; |
| 136 | } | 131 | } |
| 137 | 132 | ||
| 138 | if (posth_val == 0) { | 133 | if (posth_val == 0) { |
| 139 | printk(KERN_ERR "Kprobe smoke test failed: " | 134 | pr_err("kprobe post_handler not called\n"); |
| 140 | "kprobe post_handler not called\n"); | ||
| 141 | handler_errors++; | 135 | handler_errors++; |
| 142 | } | 136 | } |
| 143 | 137 | ||
| @@ -146,14 +140,12 @@ static int test_kprobes(void) | |||
| 146 | ret = target2(rand1); | 140 | ret = target2(rand1); |
| 147 | 141 | ||
| 148 | if (preh_val == 0) { | 142 | if (preh_val == 0) { |
| 149 | printk(KERN_ERR "Kprobe smoke test failed: " | 143 | pr_err("kprobe pre_handler2 not called\n"); |
| 150 | "kprobe pre_handler2 not called\n"); | ||
| 151 | handler_errors++; | 144 | handler_errors++; |
| 152 | } | 145 | } |
| 153 | 146 | ||
| 154 | if (posth_val == 0) { | 147 | if (posth_val == 0) { |
| 155 | printk(KERN_ERR "Kprobe smoke test failed: " | 148 | pr_err("kprobe post_handler2 not called\n"); |
| 156 | "kprobe post_handler2 not called\n"); | ||
| 157 | handler_errors++; | 149 | handler_errors++; |
| 158 | } | 150 | } |
| 159 | 151 | ||
| @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) | |||
| 166 | { | 158 | { |
| 167 | if (value != rand1) { | 159 | if (value != rand1) { |
| 168 | handler_errors++; | 160 | handler_errors++; |
| 169 | printk(KERN_ERR "Kprobe smoke test failed: " | 161 | pr_err("incorrect value in jprobe handler\n"); |
| 170 | "incorrect value in jprobe handler\n"); | ||
| 171 | } | 162 | } |
| 172 | 163 | ||
| 173 | jph_val = rand1; | 164 | jph_val = rand1; |
| @@ -186,16 +177,14 @@ static int test_jprobe(void) | |||
| 186 | 177 | ||
| 187 | ret = register_jprobe(&jp); | 178 | ret = register_jprobe(&jp); |
| 188 | if (ret < 0) { | 179 | if (ret < 0) { |
| 189 | printk(KERN_ERR "Kprobe smoke test failed: " | 180 | pr_err("register_jprobe returned %d\n", ret); |
| 190 | "register_jprobe returned %d\n", ret); | ||
| 191 | return ret; | 181 | return ret; |
| 192 | } | 182 | } |
| 193 | 183 | ||
| 194 | ret = target(rand1); | 184 | ret = target(rand1); |
| 195 | unregister_jprobe(&jp); | 185 | unregister_jprobe(&jp); |
| 196 | if (jph_val == 0) { | 186 | if (jph_val == 0) { |
| 197 | printk(KERN_ERR "Kprobe smoke test failed: " | 187 | pr_err("jprobe handler not called\n"); |
| 198 | "jprobe handler not called\n"); | ||
| 199 | handler_errors++; | 188 | handler_errors++; |
| 200 | } | 189 | } |
| 201 | 190 | ||
| @@ -217,24 +206,21 @@ static int test_jprobes(void) | |||
| 217 | jp.kp.flags = 0; | 206 | jp.kp.flags = 0; |
| 218 | ret = register_jprobes(jps, 2); | 207 | ret = register_jprobes(jps, 2); |
| 219 | if (ret < 0) { | 208 | if (ret < 0) { |
| 220 | printk(KERN_ERR "Kprobe smoke test failed: " | 209 | pr_err("register_jprobes returned %d\n", ret); |
| 221 | "register_jprobes returned %d\n", ret); | ||
| 222 | return ret; | 210 | return ret; |
| 223 | } | 211 | } |
| 224 | 212 | ||
| 225 | jph_val = 0; | 213 | jph_val = 0; |
| 226 | ret = target(rand1); | 214 | ret = target(rand1); |
| 227 | if (jph_val == 0) { | 215 | if (jph_val == 0) { |
| 228 | printk(KERN_ERR "Kprobe smoke test failed: " | 216 | pr_err("jprobe handler not called\n"); |
| 229 | "jprobe handler not called\n"); | ||
| 230 | handler_errors++; | 217 | handler_errors++; |
| 231 | } | 218 | } |
| 232 | 219 | ||
| 233 | jph_val = 0; | 220 | jph_val = 0; |
| 234 | ret = target2(rand1); | 221 | ret = target2(rand1); |
| 235 | if (jph_val == 0) { | 222 | if (jph_val == 0) { |
| 236 | printk(KERN_ERR "Kprobe smoke test failed: " | 223 | pr_err("jprobe handler2 not called\n"); |
| 237 | "jprobe handler2 not called\n"); | ||
| 238 | handler_errors++; | 224 | handler_errors++; |
| 239 | } | 225 | } |
| 240 | unregister_jprobes(jps, 2); | 226 | unregister_jprobes(jps, 2); |
| @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
| 256 | 242 | ||
| 257 | if (ret != (rand1 / div_factor)) { | 243 | if (ret != (rand1 / div_factor)) { |
| 258 | handler_errors++; | 244 | handler_errors++; |
| 259 | printk(KERN_ERR "Kprobe smoke test failed: " | 245 | pr_err("incorrect value in kretprobe handler\n"); |
| 260 | "incorrect value in kretprobe handler\n"); | ||
| 261 | } | 246 | } |
| 262 | if (krph_val == 0) { | 247 | if (krph_val == 0) { |
| 263 | handler_errors++; | 248 | handler_errors++; |
| 264 | printk(KERN_ERR "Kprobe smoke test failed: " | 249 | pr_err("call to kretprobe entry handler failed\n"); |
| 265 | "call to kretprobe entry handler failed\n"); | ||
| 266 | } | 250 | } |
| 267 | 251 | ||
| 268 | krph_val = rand1; | 252 | krph_val = rand1; |
| @@ -281,16 +265,14 @@ static int test_kretprobe(void) | |||
| 281 | 265 | ||
| 282 | ret = register_kretprobe(&rp); | 266 | ret = register_kretprobe(&rp); |
| 283 | if (ret < 0) { | 267 | if (ret < 0) { |
| 284 | printk(KERN_ERR "Kprobe smoke test failed: " | 268 | pr_err("register_kretprobe returned %d\n", ret); |
| 285 | "register_kretprobe returned %d\n", ret); | ||
| 286 | return ret; | 269 | return ret; |
| 287 | } | 270 | } |
| 288 | 271 | ||
| 289 | ret = target(rand1); | 272 | ret = target(rand1); |
| 290 | unregister_kretprobe(&rp); | 273 | unregister_kretprobe(&rp); |
| 291 | if (krph_val != rand1) { | 274 | if (krph_val != rand1) { |
| 292 | printk(KERN_ERR "Kprobe smoke test failed: " | 275 | pr_err("kretprobe handler not called\n"); |
| 293 | "kretprobe handler not called\n"); | ||
| 294 | handler_errors++; | 276 | handler_errors++; |
| 295 | } | 277 | } |
| 296 | 278 | ||
| @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
| 303 | 285 | ||
| 304 | if (ret != (rand1 / div_factor) + 1) { | 286 | if (ret != (rand1 / div_factor) + 1) { |
| 305 | handler_errors++; | 287 | handler_errors++; |
| 306 | printk(KERN_ERR "Kprobe smoke test failed: " | 288 | pr_err("incorrect value in kretprobe handler2\n"); |
| 307 | "incorrect value in kretprobe handler2\n"); | ||
| 308 | } | 289 | } |
| 309 | if (krph_val == 0) { | 290 | if (krph_val == 0) { |
| 310 | handler_errors++; | 291 | handler_errors++; |
| 311 | printk(KERN_ERR "Kprobe smoke test failed: " | 292 | pr_err("call to kretprobe entry handler failed\n"); |
| 312 | "call to kretprobe entry handler failed\n"); | ||
| 313 | } | 293 | } |
| 314 | 294 | ||
| 315 | krph_val = rand1; | 295 | krph_val = rand1; |
| @@ -332,24 +312,21 @@ static int test_kretprobes(void) | |||
| 332 | rp.kp.flags = 0; | 312 | rp.kp.flags = 0; |
| 333 | ret = register_kretprobes(rps, 2); | 313 | ret = register_kretprobes(rps, 2); |
| 334 | if (ret < 0) { | 314 | if (ret < 0) { |
| 335 | printk(KERN_ERR "Kprobe smoke test failed: " | 315 | pr_err("register_kretprobe returned %d\n", ret); |
| 336 | "register_kretprobe returned %d\n", ret); | ||
| 337 | return ret; | 316 | return ret; |
| 338 | } | 317 | } |
| 339 | 318 | ||
| 340 | krph_val = 0; | 319 | krph_val = 0; |
| 341 | ret = target(rand1); | 320 | ret = target(rand1); |
| 342 | if (krph_val != rand1) { | 321 | if (krph_val != rand1) { |
| 343 | printk(KERN_ERR "Kprobe smoke test failed: " | 322 | pr_err("kretprobe handler not called\n"); |
| 344 | "kretprobe handler not called\n"); | ||
| 345 | handler_errors++; | 323 | handler_errors++; |
| 346 | } | 324 | } |
| 347 | 325 | ||
| 348 | krph_val = 0; | 326 | krph_val = 0; |
| 349 | ret = target2(rand1); | 327 | ret = target2(rand1); |
| 350 | if (krph_val != rand1) { | 328 | if (krph_val != rand1) { |
| 351 | printk(KERN_ERR "Kprobe smoke test failed: " | 329 | pr_err("kretprobe handler2 not called\n"); |
| 352 | "kretprobe handler2 not called\n"); | ||
| 353 | handler_errors++; | 330 | handler_errors++; |
| 354 | } | 331 | } |
| 355 | unregister_kretprobes(rps, 2); | 332 | unregister_kretprobes(rps, 2); |
| @@ -368,7 +345,7 @@ int init_test_probes(void) | |||
| 368 | rand1 = prandom_u32(); | 345 | rand1 = prandom_u32(); |
| 369 | } while (rand1 <= div_factor); | 346 | } while (rand1 <= div_factor); |
| 370 | 347 | ||
| 371 | printk(KERN_INFO "Kprobe smoke test started\n"); | 348 | pr_info("started\n"); |
| 372 | num_tests++; | 349 | num_tests++; |
| 373 | ret = test_kprobe(); | 350 | ret = test_kprobe(); |
| 374 | if (ret < 0) | 351 | if (ret < 0) |
| @@ -402,13 +379,11 @@ int init_test_probes(void) | |||
| 402 | #endif /* CONFIG_KRETPROBES */ | 379 | #endif /* CONFIG_KRETPROBES */ |
| 403 | 380 | ||
| 404 | if (errors) | 381 | if (errors) |
| 405 | printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " | 382 | pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); |
| 406 | "%d tests failed\n", errors, num_tests); | ||
| 407 | else if (handler_errors) | 383 | else if (handler_errors) |
| 408 | printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " | 384 | pr_err("BUG: %d error(s) running handlers\n", handler_errors); |
| 409 | "running handlers\n", handler_errors); | ||
| 410 | else | 385 | else |
| 411 | printk(KERN_INFO "Kprobe smoke test passed successfully\n"); | 386 | pr_info("passed successfully\n"); |
| 412 | 387 | ||
| 413 | return 0; | 388 | return 0; |
| 414 | } | 389 | } |
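The test_kprobes.c cleanup is mechanical once the pr_fmt() convention is in place: defining pr_fmt before the includes makes every pr_err()/pr_info() call in the file prepend the same prefix. The expansions below paraphrase include/linux/printk.h and are shown only to make the transformation visible:

```c
/* pr_fmt() must be defined before any header that uses it is included. */
#define pr_fmt(fmt) "Kprobe smoke test: " fmt

/* Paraphrase of the printk.h wrappers the cleanup relies on. */
#define pr_err(fmt, ...)  printk(KERN_ERR  pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

/* So pr_err("register_kprobe returned %d\n", ret) logs
 * "Kprobe smoke test: register_kprobe returned <ret>" at KERN_ERR,
 * without repeating the prefix at every call site.
 */
```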
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4aec4a457431..a7077d3ae52f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) | |||
| 464 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | 464 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, |
| 465 | ktime_t now) | 465 | ktime_t now) |
| 466 | { | 466 | { |
| 467 | unsigned long flags; | ||
| 467 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | 468 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, |
| 468 | it.alarm.alarmtimer); | 469 | it.alarm.alarmtimer); |
| 469 | if (posix_timer_event(ptr, 0) != 0) | 470 | enum alarmtimer_restart result = ALARMTIMER_NORESTART; |
| 470 | ptr->it_overrun++; | 471 | |
| 472 | spin_lock_irqsave(&ptr->it_lock, flags); | ||
| 473 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { | ||
| 474 | if (posix_timer_event(ptr, 0) != 0) | ||
| 475 | ptr->it_overrun++; | ||
| 476 | } | ||
| 471 | 477 | ||
| 472 | /* Re-add periodic timers */ | 478 | /* Re-add periodic timers */ |
| 473 | if (ptr->it.alarm.interval.tv64) { | 479 | if (ptr->it.alarm.interval.tv64) { |
| 474 | ptr->it_overrun += alarm_forward(alarm, now, | 480 | ptr->it_overrun += alarm_forward(alarm, now, |
| 475 | ptr->it.alarm.interval); | 481 | ptr->it.alarm.interval); |
| 476 | return ALARMTIMER_RESTART; | 482 | result = ALARMTIMER_RESTART; |
| 477 | } | 483 | } |
| 478 | return ALARMTIMER_NORESTART; | 484 | spin_unlock_irqrestore(&ptr->it_lock, flags); |
| 485 | |||
| 486 | return result; | ||
| 479 | } | 487 | } |
| 480 | 488 | ||
| 481 | /** | 489 | /** |
| @@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
| 541 | * @new_timer: k_itimer pointer | 549 | * @new_timer: k_itimer pointer |
| 542 | * @cur_setting: itimerspec data to fill | 550 | * @cur_setting: itimerspec data to fill |
| 543 | * | 551 | * |
| 544 | * Copies the itimerspec data out from the k_itimer | 552 | * Copies out the current itimerspec data |
| 545 | */ | 553 | */ |
| 546 | static void alarm_timer_get(struct k_itimer *timr, | 554 | static void alarm_timer_get(struct k_itimer *timr, |
| 547 | struct itimerspec *cur_setting) | 555 | struct itimerspec *cur_setting) |
| 548 | { | 556 | { |
| 549 | memset(cur_setting, 0, sizeof(struct itimerspec)); | 557 | ktime_t relative_expiry_time = |
| 558 | alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); | ||
| 559 | |||
| 560 | if (ktime_to_ns(relative_expiry_time) > 0) { | ||
| 561 | cur_setting->it_value = ktime_to_timespec(relative_expiry_time); | ||
| 562 | } else { | ||
| 563 | cur_setting->it_value.tv_sec = 0; | ||
| 564 | cur_setting->it_value.tv_nsec = 0; | ||
| 565 | } | ||
| 550 | 566 | ||
| 551 | cur_setting->it_interval = | 567 | cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); |
| 552 | ktime_to_timespec(timr->it.alarm.interval); | ||
| 553 | cur_setting->it_value = | ||
| 554 | ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); | ||
| 555 | return; | ||
| 556 | } | 568 | } |
| 557 | 569 | ||
| 558 | /** | 570 | /** |
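Both alarmtimer hunks are userspace-visible: alarm_handle_timer() no longer queues a signal for SIGEV_NONE timers, and alarm_timer_get() now reports the time remaining instead of the raw absolute expiry. A hedged userspace sketch of the resulting behaviour follows; it assumes the libc headers expose CLOCK_REALTIME_ALARM, and creating an alarm-class timer normally requires CAP_WAKE_ALARM.

```c
/* Illustrative sketch only; minimal error handling, link with -lrt on
 * older glibc.
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	struct sigevent sev;
	struct itimerspec its = { .it_value.tv_sec = 30 };
	struct itimerspec cur;
	timer_t tid;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_NONE;	/* expiry delivers no signal */

	if (timer_create(CLOCK_REALTIME_ALARM, &sev, &tid))
		return 1;
	if (timer_settime(tid, 0, &its, NULL))
		return 1;

	/* With the alarm_timer_get() change, it_value counts down from
	 * roughly 30s instead of exposing the internal absolute expiry.
	 */
	if (timer_gettime(tid, &cur) == 0)
		printf("remaining: %ld.%09ld s\n",
		       (long)cur.it_value.tv_sec, cur.it_value.tv_nsec);

	timer_delete(tid);
	return 0;
}
```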
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, | |||
| 72 | * Also omit the add if it would overflow the u64 boundary. | 72 | * Also omit the add if it would overflow the u64 boundary. |
| 73 | */ | 73 | */ |
| 74 | if ((~0ULL - clc > rnd) && | 74 | if ((~0ULL - clc > rnd) && |
| 75 | (!ismax || evt->mult <= (1U << evt->shift))) | 75 | (!ismax || evt->mult <= (1ULL << evt->shift))) |
| 76 | clc += rnd; | 76 | clc += rnd; |
| 77 | 77 | ||
| 78 | do_div(clc, evt->mult); | 78 | do_div(clc, evt->mult); |
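The one-character clockevents change matters when evt->shift is as large as 32: `1U << 32` is undefined behaviour on a 32-bit type (and often evaluates to 1 in practice), which can make the mult-versus-bound comparison take the wrong branch, while `1ULL << 32` yields the intended 2^32. The standalone snippet below only demonstrates that difference; the values are made up:

```c
#include <stdio.h>

int main(void)
{
	unsigned int mult  = 0x80000000u;	/* stand-in for evt->mult  */
	unsigned int shift = 32;		/* stand-in for evt->shift */

	/* Deliberately reproduces the old, undefined expression: on x86 the
	 * shift count is masked and the "bound" collapses to 1.
	 */
	unsigned long long old_bound = 1U << shift;
	unsigned long long new_bound = 1ULL << shift;	/* well-defined 2^32 */

	printf("old bound=%llu new bound=%llu, mult <= bound? %d vs %d\n",
	       old_bound, new_bound, mult <= old_bound, mult <= new_bound);
	return 0;
}
```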
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..37e50aadd471 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
| 558 | static int hrtimer_reprogram(struct hrtimer *timer, | 558 | static int hrtimer_reprogram(struct hrtimer *timer, |
| 559 | struct hrtimer_clock_base *base) | 559 | struct hrtimer_clock_base *base) |
| 560 | { | 560 | { |
| 561 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 561 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 562 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 562 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
| 563 | int res; | 563 | int res; |
| 564 | 564 | ||
| @@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | |||
| 629 | */ | 629 | */ |
| 630 | static void retrigger_next_event(void *arg) | 630 | static void retrigger_next_event(void *arg) |
| 631 | { | 631 | { |
| 632 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 632 | struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); |
| 633 | 633 | ||
| 634 | if (!hrtimer_hres_active()) | 634 | if (!hrtimer_hres_active()) |
| 635 | return; | 635 | return; |
| @@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
| 903 | */ | 903 | */ |
| 904 | debug_deactivate(timer); | 904 | debug_deactivate(timer); |
| 905 | timer_stats_hrtimer_clear_start_info(timer); | 905 | timer_stats_hrtimer_clear_start_info(timer); |
| 906 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 906 | reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); |
| 907 | /* | 907 | /* |
| 908 | * We must preserve the CALLBACK state flag here, | 908 | * We must preserve the CALLBACK state flag here, |
| 909 | * otherwise we could move the timer base in | 909 | * otherwise we could move the timer base in |
| @@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 963 | * on dynticks target. | 963 | * on dynticks target. |
| 964 | */ | 964 | */ |
| 965 | wake_up_nohz_cpu(new_base->cpu_base->cpu); | 965 | wake_up_nohz_cpu(new_base->cpu_base->cpu); |
| 966 | } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && | 966 | } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && |
| 967 | hrtimer_reprogram(timer, new_base)) { | 967 | hrtimer_reprogram(timer, new_base)) { |
| 968 | /* | 968 | /* |
| 969 | * Only allow reprogramming if the new base is on this CPU. | 969 | * Only allow reprogramming if the new base is on this CPU. |
| @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | |||
| 1103 | */ | 1103 | */ |
| 1104 | ktime_t hrtimer_get_next_event(void) | 1104 | ktime_t hrtimer_get_next_event(void) |
| 1105 | { | 1105 | { |
| 1106 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1106 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; | 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; |
| 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; |
| 1109 | unsigned long flags; | 1109 | unsigned long flags; |
| @@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1144 | 1144 | ||
| 1145 | memset(timer, 0, sizeof(struct hrtimer)); | 1145 | memset(timer, 0, sizeof(struct hrtimer)); |
| 1146 | 1146 | ||
| 1147 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1147 | cpu_base = raw_cpu_ptr(&hrtimer_bases); |
| 1148 | 1148 | ||
| 1149 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) | 1149 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
| 1150 | clock_id = CLOCK_MONOTONIC; | 1150 | clock_id = CLOCK_MONOTONIC; |
| @@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
| 1187 | struct hrtimer_cpu_base *cpu_base; | 1187 | struct hrtimer_cpu_base *cpu_base; |
| 1188 | int base = hrtimer_clockid_to_base(which_clock); | 1188 | int base = hrtimer_clockid_to_base(which_clock); |
| 1189 | 1189 | ||
| 1190 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1190 | cpu_base = raw_cpu_ptr(&hrtimer_bases); |
| 1191 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); | 1191 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); |
| 1192 | 1192 | ||
| 1193 | return 0; | 1193 | return 0; |
| @@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
| 1242 | */ | 1242 | */ |
| 1243 | void hrtimer_interrupt(struct clock_event_device *dev) | 1243 | void hrtimer_interrupt(struct clock_event_device *dev) |
| 1244 | { | 1244 | { |
| 1245 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1245 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1246 | ktime_t expires_next, now, entry_time, delta; | 1246 | ktime_t expires_next, now, entry_time, delta; |
| 1247 | int i, retries = 0; | 1247 | int i, retries = 0; |
| 1248 | 1248 | ||
| @@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void) | |||
| 1376 | if (!hrtimer_hres_active()) | 1376 | if (!hrtimer_hres_active()) |
| 1377 | return; | 1377 | return; |
| 1378 | 1378 | ||
| 1379 | td = &__get_cpu_var(tick_cpu_device); | 1379 | td = this_cpu_ptr(&tick_cpu_device); |
| 1380 | if (td && td->evtdev) | 1380 | if (td && td->evtdev) |
| 1381 | hrtimer_interrupt(td->evtdev); | 1381 | hrtimer_interrupt(td->evtdev); |
| 1382 | } | 1382 | } |
| @@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void) | |||
| 1440 | void hrtimer_run_queues(void) | 1440 | void hrtimer_run_queues(void) |
| 1441 | { | 1441 | { |
| 1442 | struct timerqueue_node *node; | 1442 | struct timerqueue_node *node; |
| 1443 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1443 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1444 | struct hrtimer_clock_base *base; | 1444 | struct hrtimer_clock_base *base; |
| 1445 | int index, gettime = 1; | 1445 | int index, gettime = 1; |
| 1446 | 1446 | ||
| @@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu) | |||
| 1679 | 1679 | ||
| 1680 | local_irq_disable(); | 1680 | local_irq_disable(); |
| 1681 | old_base = &per_cpu(hrtimer_bases, scpu); | 1681 | old_base = &per_cpu(hrtimer_bases, scpu); |
| 1682 | new_base = &__get_cpu_var(hrtimer_bases); | 1682 | new_base = this_cpu_ptr(&hrtimer_bases); |
| 1683 | /* | 1683 | /* |
| 1684 | * The caller is globally serialized and nobody else | 1684 | * The caller is globally serialized and nobody else |
| 1685 | * takes two locks at once, deadlock is not possible. | 1685 | * takes two locks at once, deadlock is not possible. |
| @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
| 1776 | */ | 1776 | */ |
| 1777 | if (!expires) { | 1777 | if (!expires) { |
| 1778 | schedule(); | 1778 | schedule(); |
| 1779 | __set_current_state(TASK_RUNNING); | ||
| 1780 | return -EINTR; | 1779 | return -EINTR; |
| 1781 | } | 1780 | } |
| 1782 | 1781 | ||
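The hrtimer hunks, like the taskstats and tick changes elsewhere in this diff, swap the deprecated __get_cpu_var()/__raw_get_cpu_var() accessors for this_cpu_ptr()/raw_cpu_ptr(). Both spellings reach the same per-CPU object; the sketch below is written as if it sat inside kernel/time/hrtimer.c (where hrtimer_bases is visible) and paraphrases the percpu accessors rather than quoting them:

```c
#include <linux/percpu.h>
#include <linux/hrtimer.h>

/* Kernel-context sketch of the accessor conversion; not compilable
 * outside hrtimer.c because hrtimer_bases is file-local there.
 */
static void percpu_accessor_sketch(void)
{
	struct hrtimer_cpu_base *old_style, *new_style;

	/* Deprecated: fetch this CPU's lvalue, then take its address. */
	old_style = &__get_cpu_var(hrtimer_bases);

	/* Preferred: ask for the pointer directly. raw_cpu_ptr() is the
	 * variant without preemption checks, for callers that already
	 * run with preemption or interrupts disabled.
	 */
	new_style = this_cpu_ptr(&hrtimer_bases);

	(void)old_style;
	(void)new_style;
}
```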
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
| 272 | if (same_thread_group(tsk, current)) | 272 | if (same_thread_group(tsk, current)) |
| 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); |
| 274 | } else { | 274 | } else { |
| 275 | unsigned long flags; | ||
| 276 | struct sighand_struct *sighand; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * while_each_thread() is not yet entirely RCU safe, | ||
| 280 | * keep locking the group while sampling process | ||
| 281 | * clock for now. | ||
| 282 | */ | ||
| 283 | sighand = lock_task_sighand(tsk, &flags); | ||
| 284 | if (!sighand) | ||
| 285 | return err; | ||
| 286 | |||
| 287 | if (tsk == current || thread_group_leader(tsk)) | 275 | if (tsk == current || thread_group_leader(tsk)) |
| 288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | 276 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); |
| 289 | |||
| 290 | unlock_task_sighand(tsk, &flags); | ||
| 291 | } | 277 | } |
| 292 | 278 | ||
| 293 | if (!err) | 279 | if (!err) |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 636 | goto out; | 636 | goto out; |
| 637 | } | 637 | } |
| 638 | } else { | 638 | } else { |
| 639 | memset(&event.sigev_value, 0, sizeof(event.sigev_value)); | ||
| 639 | event.sigev_notify = SIGEV_SIGNAL; | 640 | event.sigev_notify = SIGEV_SIGNAL; |
| 640 | event.sigev_signo = SIGALRM; | 641 | event.sigev_signo = SIGALRM; |
| 641 | event.sigev_value.sival_int = new_timer->it_id; | 642 | event.sigev_value.sival_int = new_timer->it_id; |
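The single memset() added to timer_create() closes an information leak: sigev_value is a union of an int and a pointer, so assigning only sival_int leaves the remaining bytes of the on-stack sigevent uninitialized, and those bytes can reach userspace when the timer signal is later queued. A small userspace illustration of the union's shape on a typical 64-bit build:

```c
#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	union sigval v;

	v.sival_int = 42;	/* defines only the low 4 bytes on 64-bit */

	printf("sizeof(union sigval) = %zu, sizeof(sival_int) = %zu\n",
	       sizeof(v), sizeof(v.sival_int));

	/* Typical 64-bit output: 8 vs 4. The other 4 bytes stay
	 * indeterminate unless the union is zeroed first, which is what
	 * the added memset() in timer_create() guarantees:
	 */
	memset(&v, 0, sizeof(v));
	v.sival_int = 42;
	return 0;
}
```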
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..066f0ec05e48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
| 554 | void tick_check_oneshot_broadcast_this_cpu(void) | 554 | void tick_check_oneshot_broadcast_this_cpu(void) |
| 555 | { | 555 | { |
| 556 | if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { | 556 | if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { |
| 557 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 557 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 558 | 558 | ||
| 559 | /* | 559 | /* |
| 560 | * We might be in the middle of switching over from | 560 | * We might be in the middle of switching over from |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..7efeedf53ebd 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, | |||
| 224 | 224 | ||
| 225 | void tick_install_replacement(struct clock_event_device *newdev) | 225 | void tick_install_replacement(struct clock_event_device *newdev) |
| 226 | { | 226 | { |
| 227 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 227 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 228 | int cpu = smp_processor_id(); | 228 | int cpu = smp_processor_id(); |
| 229 | 229 | ||
| 230 | clockevents_exchange_device(td->evtdev, newdev); | 230 | clockevents_exchange_device(td->evtdev, newdev); |
| @@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup) | |||
| 374 | 374 | ||
| 375 | void tick_suspend(void) | 375 | void tick_suspend(void) |
| 376 | { | 376 | { |
| 377 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 378 | 378 | ||
| 379 | clockevents_shutdown(td->evtdev); | 379 | clockevents_shutdown(td->evtdev); |
| 380 | } | 380 | } |
| 381 | 381 | ||
| 382 | void tick_resume(void) | 382 | void tick_resume(void) |
| 383 | { | 383 | { |
| 384 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 385 | int broadcast = tick_resume_broadcast(); | 385 | int broadcast = tick_resume_broadcast(); |
| 386 | 386 | ||
| 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); |
| @@ -400,4 +400,5 @@ void tick_resume(void) | |||
| 400 | void __init tick_init(void) | 400 | void __init tick_init(void) |
| 401 | { | 401 | { |
| 402 | tick_broadcast_init(); | 402 | tick_broadcast_init(); |
| 403 | tick_nohz_init(); | ||
| 403 | } | 404 | } |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } | |||
| 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } |
| 100 | #endif /* !TICK_ONESHOT */ | 100 | #endif /* !TICK_ONESHOT */ |
| 101 | 101 | ||
| 102 | /* NO_HZ_FULL internal */ | ||
| 103 | #ifdef CONFIG_NO_HZ_FULL | ||
| 104 | extern void tick_nohz_init(void); | ||
| 105 | # else | ||
| 106 | static inline void tick_nohz_init(void) { } | ||
| 107 | #endif | ||
| 108 | |||
| 102 | /* | 109 | /* |
| 103 | * Broadcasting support | 110 | * Broadcasting support |
| 104 | */ | 111 | */ |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..7ce740e78e1b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
| 59 | */ | 59 | */ |
| 60 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | 60 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) |
| 61 | { | 61 | { |
| 62 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 62 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 63 | struct clock_event_device *dev = td->evtdev; | 63 | struct clock_event_device *dev = td->evtdev; |
| 64 | 64 | ||
| 65 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | 65 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..7b5741fc4110 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
| 205 | */ | 205 | */ |
| 206 | void __tick_nohz_full_check(void) | 206 | void __tick_nohz_full_check(void) |
| 207 | { | 207 | { |
| 208 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 208 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 209 | 209 | ||
| 210 | if (tick_nohz_full_cpu(smp_processor_id())) { | 210 | if (tick_nohz_full_cpu(smp_processor_id())) { |
| 211 | if (ts->tick_stopped && !is_idle_task(current)) { | 211 | if (ts->tick_stopped && !is_idle_task(current)) { |
| @@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | |||
| 225 | }; | 225 | }; |
| 226 | 226 | ||
| 227 | /* | 227 | /* |
| 228 | * Kick this CPU if it's full dynticks in order to force it to | ||
| 229 | * re-evaluate its dependency on the tick and restart it if necessary. | ||
| 230 | * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), | ||
| 231 | * is NMI safe. | ||
| 232 | */ | ||
| 233 | void tick_nohz_full_kick(void) | ||
| 234 | { | ||
| 235 | if (!tick_nohz_full_cpu(smp_processor_id())) | ||
| 236 | return; | ||
| 237 | |||
| 238 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | ||
| 239 | } | ||
| 240 | |||
| 241 | /* | ||
| 228 | * Kick the CPU if it's full dynticks in order to force it to | 242 | * Kick the CPU if it's full dynticks in order to force it to |
| 229 | * re-evaluate its dependency on the tick and restart it if necessary. | 243 | * re-evaluate its dependency on the tick and restart it if necessary. |
| 230 | */ | 244 | */ |
| @@ -281,22 +295,12 @@ out: | |||
| 281 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 295 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
| 282 | static int __init tick_nohz_full_setup(char *str) | 296 | static int __init tick_nohz_full_setup(char *str) |
| 283 | { | 297 | { |
| 284 | int cpu; | ||
| 285 | |||
| 286 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); | 298 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
| 287 | alloc_bootmem_cpumask_var(&housekeeping_mask); | ||
| 288 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { | 299 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
| 289 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 300 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
| 301 | free_bootmem_cpumask_var(tick_nohz_full_mask); | ||
| 290 | return 1; | 302 | return 1; |
| 291 | } | 303 | } |
| 292 | |||
| 293 | cpu = smp_processor_id(); | ||
| 294 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | ||
| 295 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
| 296 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | ||
| 297 | } | ||
| 298 | cpumask_andnot(housekeeping_mask, | ||
| 299 | cpu_possible_mask, tick_nohz_full_mask); | ||
| 300 | tick_nohz_full_running = true; | 304 | tick_nohz_full_running = true; |
| 301 | 305 | ||
| 302 | return 1; | 306 | return 1; |
| @@ -335,18 +339,11 @@ static int tick_nohz_init_all(void) | |||
| 335 | 339 | ||
| 336 | #ifdef CONFIG_NO_HZ_FULL_ALL | 340 | #ifdef CONFIG_NO_HZ_FULL_ALL |
| 337 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { | 341 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
| 338 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 342 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); |
| 339 | return err; | ||
| 340 | } | ||
| 341 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
| 342 | pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
| 343 | return err; | 343 | return err; |
| 344 | } | 344 | } |
| 345 | err = 0; | 345 | err = 0; |
| 346 | cpumask_setall(tick_nohz_full_mask); | 346 | cpumask_setall(tick_nohz_full_mask); |
| 347 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); | ||
| 348 | cpumask_clear(housekeeping_mask); | ||
| 349 | cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | ||
| 350 | tick_nohz_full_running = true; | 347 | tick_nohz_full_running = true; |
| 351 | #endif | 348 | #endif |
| 352 | return err; | 349 | return err; |
| @@ -361,6 +358,37 @@ void __init tick_nohz_init(void) | |||
| 361 | return; | 358 | return; |
| 362 | } | 359 | } |
| 363 | 360 | ||
| 361 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
| 362 | WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
| 363 | cpumask_clear(tick_nohz_full_mask); | ||
| 364 | tick_nohz_full_running = false; | ||
| 365 | return; | ||
| 366 | } | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Full dynticks uses irq work to drive tick rescheduling from safe | ||
| 370 | * locking contexts. But then we need irq work to raise its own | ||
| 371 | * interrupts to avoid a circular dependency on the tick. | ||
| 372 | */ | ||
| 373 | if (!arch_irq_work_has_interrupt()) { | ||
| 374 | pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " | ||
| 375 | "support irq work self-IPIs\n"); | ||
| 376 | cpumask_clear(tick_nohz_full_mask); | ||
| 377 | cpumask_copy(housekeeping_mask, cpu_possible_mask); | ||
| 378 | tick_nohz_full_running = false; | ||
| 379 | return; | ||
| 380 | } | ||
| 381 | |||
| 382 | cpu = smp_processor_id(); | ||
| 383 | |||
| 384 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | ||
| 385 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
| 386 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | ||
| 387 | } | ||
| 388 | |||
| 389 | cpumask_andnot(housekeeping_mask, | ||
| 390 | cpu_possible_mask, tick_nohz_full_mask); | ||
| 391 | |||
| 364 | for_each_cpu(cpu, tick_nohz_full_mask) | 392 | for_each_cpu(cpu, tick_nohz_full_mask) |
| 365 | context_tracking_cpu_set(cpu); | 393 | context_tracking_cpu_set(cpu); |
| 366 | 394 | ||
| @@ -545,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 545 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 573 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
| 546 | ktime_t last_update, expires, ret = { .tv64 = 0 }; | 574 | ktime_t last_update, expires, ret = { .tv64 = 0 }; |
| 547 | unsigned long rcu_delta_jiffies; | 575 | unsigned long rcu_delta_jiffies; |
| 548 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 576 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
| 549 | u64 time_delta; | 577 | u64 time_delta; |
| 550 | 578 | ||
| 551 | time_delta = timekeeping_max_deferment(); | 579 | time_delta = timekeeping_max_deferment(); |
| @@ -558,7 +586,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 558 | } while (read_seqretry(&jiffies_lock, seq)); | 586 | } while (read_seqretry(&jiffies_lock, seq)); |
| 559 | 587 | ||
| 560 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || | 588 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || |
| 561 | arch_needs_cpu(cpu) || irq_work_needs_cpu()) { | 589 | arch_needs_cpu() || irq_work_needs_cpu()) { |
| 562 | next_jiffies = last_jiffies + 1; | 590 | next_jiffies = last_jiffies + 1; |
| 563 | delta_jiffies = 1; | 591 | delta_jiffies = 1; |
| 564 | } else { | 592 | } else { |
| @@ -813,7 +841,7 @@ void tick_nohz_idle_enter(void) | |||
| 813 | 841 | ||
| 814 | local_irq_disable(); | 842 | local_irq_disable(); |
| 815 | 843 | ||
| 816 | ts = &__get_cpu_var(tick_cpu_sched); | 844 | ts = this_cpu_ptr(&tick_cpu_sched); |
| 817 | ts->inidle = 1; | 845 | ts->inidle = 1; |
| 818 | __tick_nohz_idle_enter(ts); | 846 | __tick_nohz_idle_enter(ts); |
| 819 | 847 | ||
| @@ -831,7 +859,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | |||
| 831 | */ | 859 | */ |
| 832 | void tick_nohz_irq_exit(void) | 860 | void tick_nohz_irq_exit(void) |
| 833 | { | 861 | { |
| 834 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 862 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 835 | 863 | ||
| 836 | if (ts->inidle) | 864 | if (ts->inidle) |
| 837 | __tick_nohz_idle_enter(ts); | 865 | __tick_nohz_idle_enter(ts); |
| @@ -846,7 +874,7 @@ void tick_nohz_irq_exit(void) | |||
| 846 | */ | 874 | */ |
| 847 | ktime_t tick_nohz_get_sleep_length(void) | 875 | ktime_t tick_nohz_get_sleep_length(void) |
| 848 | { | 876 | { |
| 849 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 877 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 850 | 878 | ||
| 851 | return ts->sleep_length; | 879 | return ts->sleep_length; |
| 852 | } | 880 | } |
| @@ -924,7 +952,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | |||
| 924 | */ | 952 | */ |
| 925 | void tick_nohz_idle_exit(void) | 953 | void tick_nohz_idle_exit(void) |
| 926 | { | 954 | { |
| 927 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 955 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 928 | ktime_t now; | 956 | ktime_t now; |
| 929 | 957 | ||
| 930 | local_irq_disable(); | 958 | local_irq_disable(); |
| @@ -959,7 +987,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | |||
| 959 | */ | 987 | */ |
| 960 | static void tick_nohz_handler(struct clock_event_device *dev) | 988 | static void tick_nohz_handler(struct clock_event_device *dev) |
| 961 | { | 989 | { |
| 962 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 990 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 963 | struct pt_regs *regs = get_irq_regs(); | 991 | struct pt_regs *regs = get_irq_regs(); |
| 964 | ktime_t now = ktime_get(); | 992 | ktime_t now = ktime_get(); |
| 965 | 993 | ||
| @@ -968,6 +996,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
| 968 | tick_sched_do_timer(now); | 996 | tick_sched_do_timer(now); |
| 969 | tick_sched_handle(ts, regs); | 997 | tick_sched_handle(ts, regs); |
| 970 | 998 | ||
| 999 | /* No need to reprogram if we are running tickless */ | ||
| 1000 | if (unlikely(ts->tick_stopped)) | ||
| 1001 | return; | ||
| 1002 | |||
| 971 | while (tick_nohz_reprogram(ts, now)) { | 1003 | while (tick_nohz_reprogram(ts, now)) { |
| 972 | now = ktime_get(); | 1004 | now = ktime_get(); |
| 973 | tick_do_update_jiffies64(now); | 1005 | tick_do_update_jiffies64(now); |
| @@ -979,7 +1011,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
| 979 | */ | 1011 | */ |
| 980 | static void tick_nohz_switch_to_nohz(void) | 1012 | static void tick_nohz_switch_to_nohz(void) |
| 981 | { | 1013 | { |
| 982 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1014 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 983 | ktime_t next; | 1015 | ktime_t next; |
| 984 | 1016 | ||
| 985 | if (!tick_nohz_enabled) | 1017 | if (!tick_nohz_enabled) |
| @@ -1041,7 +1073,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) | |||
| 1041 | 1073 | ||
| 1042 | static inline void tick_nohz_irq_enter(void) | 1074 | static inline void tick_nohz_irq_enter(void) |
| 1043 | { | 1075 | { |
| 1044 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1076 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1045 | ktime_t now; | 1077 | ktime_t now; |
| 1046 | 1078 | ||
| 1047 | if (!ts->idle_active && !ts->tick_stopped) | 1079 | if (!ts->idle_active && !ts->tick_stopped) |
| @@ -1095,6 +1127,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 1095 | if (regs) | 1127 | if (regs) |
| 1096 | tick_sched_handle(ts, regs); | 1128 | tick_sched_handle(ts, regs); |
| 1097 | 1129 | ||
| 1130 | /* No need to reprogram if we are in idle or full dynticks mode */ | ||
| 1131 | if (unlikely(ts->tick_stopped)) | ||
| 1132 | return HRTIMER_NORESTART; | ||
| 1133 | |||
| 1098 | hrtimer_forward(timer, now, tick_period); | 1134 | hrtimer_forward(timer, now, tick_period); |
| 1099 | 1135 | ||
| 1100 | return HRTIMER_RESTART; | 1136 | return HRTIMER_RESTART; |
| @@ -1115,7 +1151,7 @@ early_param("skew_tick", skew_tick); | |||
| 1115 | */ | 1151 | */ |
| 1116 | void tick_setup_sched_timer(void) | 1152 | void tick_setup_sched_timer(void) |
| 1117 | { | 1153 | { |
| 1118 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1154 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1119 | ktime_t now = ktime_get(); | 1155 | ktime_t now = ktime_get(); |
| 1120 | 1156 | ||
| 1121 | /* | 1157 | /* |
| @@ -1184,7 +1220,7 @@ void tick_clock_notify(void) | |||
| 1184 | */ | 1220 | */ |
| 1185 | void tick_oneshot_notify(void) | 1221 | void tick_oneshot_notify(void) |
| 1186 | { | 1222 | { |
| 1187 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1223 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1188 | 1224 | ||
| 1189 | set_bit(0, &ts->check_clocks); | 1225 | set_bit(0, &ts->check_clocks); |
| 1190 | } | 1226 | } |
| @@ -1199,7 +1235,7 @@ void tick_oneshot_notify(void) | |||
| 1199 | */ | 1235 | */ |
| 1200 | int tick_check_oneshot_change(int allow_nohz) | 1236 | int tick_check_oneshot_change(int allow_nohz) |
| 1201 | { | 1237 | { |
| 1202 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1238 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1203 | 1239 | ||
| 1204 | if (!test_and_clear_bit(0, &ts->check_clocks)) | 1240 | if (!test_and_clear_bit(0, &ts->check_clocks)) |
| 1205 | return 0; | 1241 | return 0; |
diff --git a/kernel/time/time.c b/kernel/time/time.c index f0294ba14634..a9ae20fb0b11 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); | |||
| 559 | * that a remainder subtract here would not do the right thing as the | 559 | * that a remainder subtract here would not do the right thing as the |
| 560 | * resolution values don't fall on second boundries. I.e. the line: | 560 | * resolution values don't fall on second boundries. I.e. the line: |
| 561 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. | 561 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. |
| 562 | * Note that due to the small error in the multiplier here, this | ||
| 563 | * rounding is incorrect for sufficiently large values of tv_nsec, but | ||
| 564 | * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're | ||
| 565 | * OK. | ||
| 562 | * | 566 | * |
| 563 | * Rather, we just shift the bits off the right. | 567 | * Rather, we just shift the bits off the right. |
| 564 | * | 568 | * |
| 565 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec | 569 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec |
| 566 | * value to a scaled second value. | 570 | * value to a scaled second value. |
| 567 | */ | 571 | */ |
| 568 | unsigned long | 572 | static unsigned long |
| 569 | timespec_to_jiffies(const struct timespec *value) | 573 | __timespec_to_jiffies(unsigned long sec, long nsec) |
| 570 | { | 574 | { |
| 571 | unsigned long sec = value->tv_sec; | 575 | nsec = nsec + TICK_NSEC - 1; |
| 572 | long nsec = value->tv_nsec + TICK_NSEC - 1; | ||
| 573 | 576 | ||
| 574 | if (sec >= MAX_SEC_IN_JIFFIES){ | 577 | if (sec >= MAX_SEC_IN_JIFFIES){ |
| 575 | sec = MAX_SEC_IN_JIFFIES; | 578 | sec = MAX_SEC_IN_JIFFIES; |
| @@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) | |||
| 580 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | 583 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; |
| 581 | 584 | ||
| 582 | } | 585 | } |
| 586 | |||
| 587 | unsigned long | ||
| 588 | timespec_to_jiffies(const struct timespec *value) | ||
| 589 | { | ||
| 590 | return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); | ||
| 591 | } | ||
| 592 | |||
| 583 | EXPORT_SYMBOL(timespec_to_jiffies); | 593 | EXPORT_SYMBOL(timespec_to_jiffies); |
| 584 | 594 | ||
| 585 | void | 595 | void |
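The reworked helper keeps the existing rounding rule: add TICK_NSEC - 1 to the nanosecond part so any partial tick counts as a full jiffy, then clamp and scale. A runnable approximation of that behaviour, assuming HZ=1000 for illustration and ignoring the SEC_JIFFIE_SC/NSEC_JIFFIE_SC scaled arithmetic and the MAX_SEC_IN_JIFFIES clamp used by the real code:

```c
/* Rough userspace model of __timespec_to_jiffies() rounding. */
#include <stdio.h>

#define HZ            1000UL
#define NSEC_PER_SEC  1000000000UL
#define TICK_NSEC     (NSEC_PER_SEC / HZ)   /* 1 ms per tick in this model */

static unsigned long model_timespec_to_jiffies(unsigned long sec, long nsec)
{
	/* Adding TICK_NSEC - 1 rounds any partial tick up to a full one. */
	nsec = nsec + TICK_NSEC - 1;
	return sec * HZ + nsec / TICK_NSEC;
}

int main(void)
{
	/* 1 ns still costs a whole jiffy; exactly one tick costs one. */
	printf("%lu\n", model_timespec_to_jiffies(0, 1));          /* 1    */
	printf("%lu\n", model_timespec_to_jiffies(0, TICK_NSEC));  /* 1    */
	printf("%lu\n", model_timespec_to_jiffies(2, 500000));     /* 2001 */
	return 0;
}
```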
| @@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | |||
| 596 | } | 606 | } |
| 597 | EXPORT_SYMBOL(jiffies_to_timespec); | 607 | EXPORT_SYMBOL(jiffies_to_timespec); |
| 598 | 608 | ||
| 599 | /* Same for "timeval" | 609 | /* |
| 600 | * | 610 | * We could use a similar algorithm to timespec_to_jiffies (with a |
| 601 | * Well, almost. The problem here is that the real system resolution is | 611 | * different multiplier for usec instead of nsec). But this has a |
| 602 | * in nanoseconds and the value being converted is in micro seconds. | 612 | * problem with rounding: we can't exactly add TICK_NSEC - 1 to the |
| 603 | * Also for some machines (those that use HZ = 1024, in-particular), | 613 | * usec value, since it's not necessarily integral. |
| 604 | * there is a LARGE error in the tick size in microseconds. | 614 | * |
| 605 | 615 | * We could instead round in the intermediate scaled representation | |
| 606 | * The solution we use is to do the rounding AFTER we convert the | 616 | * (i.e. in units of 1/2^(large scale) jiffies) but that's also |
| 607 | * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. | 617 | * perilous: the scaling introduces a small positive error, which |
| 608 | * Instruction wise, this should cost only an additional add with carry | 618 | * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 |
| 609 | * instruction above the way it was done above. | 619 | * units to the intermediate before shifting) leads to accidental |
| 620 | * overflow and overestimates. | ||
| 621 | * | ||
| 622 | * At the cost of one additional multiplication by a constant, just | ||
| 623 | * use the timespec implementation. | ||
| 610 | */ | 624 | */ |
| 611 | unsigned long | 625 | unsigned long |
| 612 | timeval_to_jiffies(const struct timeval *value) | 626 | timeval_to_jiffies(const struct timeval *value) |
| 613 | { | 627 | { |
| 614 | unsigned long sec = value->tv_sec; | 628 | return __timespec_to_jiffies(value->tv_sec, |
| 615 | long usec = value->tv_usec; | 629 | value->tv_usec * NSEC_PER_USEC); |
| 616 | |||
| 617 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
| 618 | sec = MAX_SEC_IN_JIFFIES; | ||
| 619 | usec = 0; | ||
| 620 | } | ||
| 621 | return (((u64)sec * SEC_CONVERSION) + | ||
| 622 | (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> | ||
| 623 | (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
| 624 | } | 630 | } |
| 625 | EXPORT_SYMBOL(timeval_to_jiffies); | 631 | EXPORT_SYMBOL(timeval_to_jiffies); |
| 626 | 632 | ||
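As the new comment explains, timeval_to_jiffies() now multiplies the microsecond value by NSEC_PER_USEC, which is exact, and reuses the timespec path instead of keeping a separate USEC_ROUND variant whose intermediate rounding could overflow. Continuing the assumptions of the previous sketch (HZ=1000, simplified scaling):

```c
/* Userspace model of the new timeval path: usec -> nsec, then reuse
 * the timespec rounding.  Constants are illustrative only. */
#include <stdio.h>

#define HZ            1000UL
#define NSEC_PER_SEC  1000000000UL
#define NSEC_PER_USEC 1000UL
#define TICK_NSEC     (NSEC_PER_SEC / HZ)

static unsigned long model_timespec_to_jiffies(unsigned long sec, long nsec)
{
	nsec = nsec + TICK_NSEC - 1;
	return sec * HZ + nsec / TICK_NSEC;
}

static unsigned long model_timeval_to_jiffies(unsigned long sec, long usec)
{
	/* One extra multiply, but no separate usec rounding path to keep correct. */
	return model_timespec_to_jiffies(sec, usec * NSEC_PER_USEC);
}

int main(void)
{
	printf("%lu\n", model_timeval_to_jiffies(0, 1));     /* 1    */
	printf("%lu\n", model_timeval_to_jiffies(1, 1500));  /* 1002 */
	return 0;
}
```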
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f36b02838a47..ec1791fae965 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | |||
| 338 | 338 | ||
| 339 | static inline void update_vsyscall(struct timekeeper *tk) | 339 | static inline void update_vsyscall(struct timekeeper *tk) |
| 340 | { | 340 | { |
| 341 | struct timespec xt; | 341 | struct timespec xt, wm; |
| 342 | 342 | ||
| 343 | xt = timespec64_to_timespec(tk_xtime(tk)); | 343 | xt = timespec64_to_timespec(tk_xtime(tk)); |
| 344 | update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, | 344 | wm = timespec64_to_timespec(tk->wall_to_monotonic); |
| 345 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | ||
| 345 | tk->tkr.cycle_last); | 346 | tk->tkr.cycle_last); |
| 346 | } | 347 | } |
| 347 | 348 | ||
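The timekeeper stores wall_to_monotonic as a timespec64, while update_vsyscall_old() still takes a struct timespec, so the hunk converts it first. A minimal model of what such a conversion does, with illustrative struct layouts rather than the kernel definitions:

```c
/* Sketch of a timespec64 -> timespec narrowing; layouts are stand-ins. */
#include <stdio.h>
#include <stdint.h>

struct my_timespec64 { int64_t tv_sec; long tv_nsec; };
struct my_timespec   { long    tv_sec; long tv_nsec; };  /* 32-bit tv_sec on 32-bit ABIs */

static struct my_timespec my_timespec64_to_timespec(struct my_timespec64 ts64)
{
	struct my_timespec ts;

	ts.tv_sec  = (long)ts64.tv_sec;   /* may truncate on 32-bit; inherent to the legacy interface */
	ts.tv_nsec = ts64.tv_nsec;
	return ts;
}

int main(void)
{
	struct my_timespec64 wm64 = { .tv_sec = 1416000000, .tv_nsec = 500 };
	struct my_timespec wm = my_timespec64_to_timespec(wm64);

	printf("%ld.%09ld\n", wm.tv_sec, wm.tv_nsec);
	return 0;
}
```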
| @@ -441,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 441 | tk->ntp_error = 0; | 442 | tk->ntp_error = 0; |
| 442 | ntp_clear(); | 443 | ntp_clear(); |
| 443 | } | 444 | } |
| 444 | update_vsyscall(tk); | ||
| 445 | update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); | ||
| 446 | 445 | ||
| 447 | tk_update_ktime_data(tk); | 446 | tk_update_ktime_data(tk); |
| 448 | 447 | ||
| 448 | update_vsyscall(tk); | ||
| 449 | update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); | ||
| 450 | |||
| 449 | if (action & TK_MIRROR) | 451 | if (action & TK_MIRROR) |
| 450 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 452 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
| 451 | sizeof(tk_core.timekeeper)); | 453 | sizeof(tk_core.timekeeper)); |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..3260ffdb368f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer) | |||
| 655 | static void do_init_timer(struct timer_list *timer, unsigned int flags, | 655 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
| 656 | const char *name, struct lock_class_key *key) | 656 | const char *name, struct lock_class_key *key) |
| 657 | { | 657 | { |
| 658 | struct tvec_base *base = __raw_get_cpu_var(tvec_bases); | 658 | struct tvec_base *base = raw_cpu_read(tvec_bases); |
| 659 | 659 | ||
| 660 | timer->entry.next = NULL; | 660 | timer->entry.next = NULL; |
| 661 | timer->base = (void *)((unsigned long)base | flags); | 661 | timer->base = (void *)((unsigned long)base | flags); |
| @@ -1385,7 +1385,7 @@ void update_process_times(int user_tick) | |||
| 1385 | rcu_check_callbacks(cpu, user_tick); | 1385 | rcu_check_callbacks(cpu, user_tick); |
| 1386 | #ifdef CONFIG_IRQ_WORK | 1386 | #ifdef CONFIG_IRQ_WORK |
| 1387 | if (in_irq()) | 1387 | if (in_irq()) |
| 1388 | irq_work_run(); | 1388 | irq_work_tick(); |
| 1389 | #endif | 1389 | #endif |
| 1390 | scheduler_tick(); | 1390 | scheduler_tick(); |
| 1391 | run_posix_cpu_timers(p); | 1391 | run_posix_cpu_timers(p); |
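The __get_cpu_var() conversions that recur through the tick-sched and timer hunks follow a few mechanical patterns. The lines below are collected from the hunks above for orientation only; they are kernel idiom, not a standalone program:

```c
/* Taking the address of a per-CPU variable: */
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);      /* was &__get_cpu_var(tick_cpu_sched) */

/* Reading a field of a per-CPU variable: */
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
                                 /* was __get_cpu_var(tick_cpu_device).evtdev */

/* Reading without preemption checks (init paths): */
struct tvec_base *base = raw_cpu_read(tvec_bases);           /* was __raw_get_cpu_var(tvec_bases) */
```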
diff --git a/kernel/torture.c b/kernel/torture.c index d600af21f022..dd70993c266c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); | |||
| 211 | /* | 211 | /* |
| 212 | * Print online/offline testing statistics. | 212 | * Print online/offline testing statistics. |
| 213 | */ | 213 | */ |
| 214 | char *torture_onoff_stats(char *page) | 214 | void torture_onoff_stats(void) |
| 215 | { | 215 | { |
| 216 | #ifdef CONFIG_HOTPLUG_CPU | 216 | #ifdef CONFIG_HOTPLUG_CPU |
| 217 | page += sprintf(page, | 217 | pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", |
| 218 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", | 218 | n_online_successes, n_online_attempts, |
| 219 | n_online_successes, n_online_attempts, | 219 | n_offline_successes, n_offline_attempts, |
| 220 | n_offline_successes, n_offline_attempts, | 220 | min_online, max_online, |
| 221 | min_online, max_online, | 221 | min_offline, max_offline, |
| 222 | min_offline, max_offline, | 222 | sum_online, sum_offline, HZ); |
| 223 | sum_online, sum_offline, HZ); | ||
| 224 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 223 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 225 | return page; | ||
| 226 | } | 224 | } |
| 227 | EXPORT_SYMBOL_GPL(torture_onoff_stats); | 225 | EXPORT_SYMBOL_GPL(torture_onoff_stats); |
| 228 | 226 | ||
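With the new void signature, torture_onoff_stats() prints its counters directly with pr_cont() rather than filling a caller-supplied buffer, so the caller is expected to have already opened the log line. A hedged sketch of a caller, with a hypothetical function name and message text:

```c
/* Hypothetical stats printer in a torture test module. */
static void example_print_stats(void)
{
	pr_alert("example-torture: ");	/* start the line, no trailing newline */
	torture_onoff_stats();		/* appends "onoff: ..." via pr_cont()  */
	pr_cont("end of stats\n");	/* caller terminates the line          */
}
```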
| @@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); | |||
| 635 | * | 633 | * |
| 636 | * This must be called before the caller starts shutting down its own | 634 | * This must be called before the caller starts shutting down its own |
| 637 | * kthreads. | 635 | * kthreads. |
| 636 | * | ||
| 637 | * Both torture_cleanup_begin() and torture_cleanup_end() must be paired, | ||
| 638 | * in order to correctly perform the cleanup. They are separated because | ||
| 639 | * threads may still need to reference torture_type, which is therefore | ||
| 640 | * nullified only after all other relevant calls have completed. | ||
| 638 | */ | 641 | */ |
| 639 | bool torture_cleanup(void) | 642 | bool torture_cleanup_begin(void) |
| 640 | { | 643 | { |
| 641 | mutex_lock(&fullstop_mutex); | 644 | mutex_lock(&fullstop_mutex); |
| 642 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 645 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
| @@ -651,12 +654,17 @@ bool torture_cleanup(void) | |||
| 651 | torture_shuffle_cleanup(); | 654 | torture_shuffle_cleanup(); |
| 652 | torture_stutter_cleanup(); | 655 | torture_stutter_cleanup(); |
| 653 | torture_onoff_cleanup(); | 656 | torture_onoff_cleanup(); |
| 657 | return false; | ||
| 658 | } | ||
| 659 | EXPORT_SYMBOL_GPL(torture_cleanup_begin); | ||
| 660 | |||
| 661 | void torture_cleanup_end(void) | ||
| 662 | { | ||
| 654 | mutex_lock(&fullstop_mutex); | 663 | mutex_lock(&fullstop_mutex); |
| 655 | torture_type = NULL; | 664 | torture_type = NULL; |
| 656 | mutex_unlock(&fullstop_mutex); | 665 | mutex_unlock(&fullstop_mutex); |
| 657 | return false; | ||
| 658 | } | 666 | } |
| 659 | EXPORT_SYMBOL_GPL(torture_cleanup); | 667 | EXPORT_SYMBOL_GPL(torture_cleanup_end); |
| 660 | 668 | ||
| 661 | /* | 669 | /* |
| 662 | * Is it time for the current torture test to stop? | 670 | * Is it time for the current torture test to stop? |
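The comment above requires torture_cleanup_begin() and torture_cleanup_end() to bracket the caller's own kthread shutdown, since torture_type is nullified only at the end. A hedged sketch of a module exit path under that assumption; the teardown in the middle is a placeholder, not a real torture API call:

```c
/* Hypothetical exit path for a torture test module. */
static void example_torture_cleanup(void)
{
	if (torture_cleanup_begin())
		return;			/* a shutdown was already forced elsewhere */

	/* stop module-specific kthreads and free per-test state here;
	 * these may still read torture_type */

	torture_cleanup_end();		/* nullifies torture_type last */
}
```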
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1654b12c891a..31c90fec4158 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -65,15 +65,21 @@ | |||
| 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) |
| 66 | 66 | ||
| 67 | #ifdef CONFIG_DYNAMIC_FTRACE | 67 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 68 | #define INIT_REGEX_LOCK(opsname) \ | 68 | #define INIT_OPS_HASH(opsname) \ |
| 69 | .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), | 69 | .func_hash = &opsname.local_hash, \ |
| 70 | .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), | ||
| 71 | #define ASSIGN_OPS_HASH(opsname, val) \ | ||
| 72 | .func_hash = val, \ | ||
| 73 | .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), | ||
| 70 | #else | 74 | #else |
| 71 | #define INIT_REGEX_LOCK(opsname) | 75 | #define INIT_OPS_HASH(opsname) |
| 76 | #define ASSIGN_OPS_HASH(opsname, val) | ||
| 72 | #endif | 77 | #endif |
| 73 | 78 | ||
| 74 | static struct ftrace_ops ftrace_list_end __read_mostly = { | 79 | static struct ftrace_ops ftrace_list_end __read_mostly = { |
| 75 | .func = ftrace_stub, | 80 | .func = ftrace_stub, |
| 76 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, | 81 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, |
| 82 | INIT_OPS_HASH(ftrace_list_end) | ||
| 77 | }; | 83 | }; |
| 78 | 84 | ||
| 79 | /* ftrace_enabled is a method to turn ftrace on or off */ | 85 | /* ftrace_enabled is a method to turn ftrace on or off */ |
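With regex_lock and the filter/notrace hashes moved into local_hash, a statically defined ftrace_ops now initializes func_hash through the INIT_OPS_HASH() macro, as ftrace_list_end and ftrace_profile_ops do in these hunks. A sketch with a hypothetical ops and callback; dynamically registered ops instead pick this up in ftrace_ops_init():

```c
/* Hypothetical static ops using the new hash initializer. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *regs)
{
	/* tracing callback body */
}

static struct ftrace_ops my_ops __read_mostly = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
	INIT_OPS_HASH(my_ops)
};
```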
| @@ -107,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | |||
| 107 | static struct ftrace_ops global_ops; | 113 | static struct ftrace_ops global_ops; |
| 108 | static struct ftrace_ops control_ops; | 114 | static struct ftrace_ops control_ops; |
| 109 | 115 | ||
| 116 | static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | ||
| 117 | struct ftrace_ops *op, struct pt_regs *regs); | ||
| 118 | |||
| 110 | #if ARCH_SUPPORTS_FTRACE_OPS | 119 | #if ARCH_SUPPORTS_FTRACE_OPS |
| 111 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | 120 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
| 112 | struct ftrace_ops *op, struct pt_regs *regs); | 121 | struct ftrace_ops *op, struct pt_regs *regs); |
| @@ -140,7 +149,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) | |||
| 140 | { | 149 | { |
| 141 | #ifdef CONFIG_DYNAMIC_FTRACE | 150 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 142 | if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { | 151 | if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { |
| 143 | mutex_init(&ops->regex_lock); | 152 | mutex_init(&ops->local_hash.regex_lock); |
| 153 | ops->func_hash = &ops->local_hash; | ||
| 144 | ops->flags |= FTRACE_OPS_FL_INITIALIZED; | 154 | ops->flags |= FTRACE_OPS_FL_INITIALIZED; |
| 145 | } | 155 | } |
| 146 | #endif | 156 | #endif |
| @@ -244,18 +254,24 @@ static void update_ftrace_function(void) | |||
| 244 | ftrace_func_t func; | 254 | ftrace_func_t func; |
| 245 | 255 | ||
| 246 | /* | 256 | /* |
| 257 | * Prepare the ftrace_ops that the arch callback will use. | ||
| 258 | * If there's only one ftrace_ops registered, the ftrace_ops_list | ||
| 259 | * will point to the ops we want. | ||
| 260 | */ | ||
| 261 | set_function_trace_op = ftrace_ops_list; | ||
| 262 | |||
| 263 | /* If there's no ftrace_ops registered, just call the stub function */ | ||
| 264 | if (ftrace_ops_list == &ftrace_list_end) { | ||
| 265 | func = ftrace_stub; | ||
| 266 | |||
| 267 | /* | ||
| 247 | * If we are at the end of the list and this ops is | 268 | * If we are at the end of the list and this ops is |
| 248 | * recursion safe and not dynamic and the arch supports passing ops, | 269 | * recursion safe and not dynamic and the arch supports passing ops, |
| 249 | * then have the mcount trampoline call the function directly. | 270 | * then have the mcount trampoline call the function directly. |
| 250 | */ | 271 | */ |
| 251 | if (ftrace_ops_list == &ftrace_list_end || | 272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { |
| 252 | (ftrace_ops_list->next == &ftrace_list_end && | 273 | func = ftrace_ops_get_func(ftrace_ops_list); |
| 253 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && | 274 | |
| 254 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | ||
| 255 | !FTRACE_FORCE_LIST_FUNC)) { | ||
| 256 | /* Set the ftrace_ops that the arch callback uses */ | ||
| 257 | set_function_trace_op = ftrace_ops_list; | ||
| 258 | func = ftrace_ops_list->func; | ||
| 259 | } else { | 275 | } else { |
| 260 | /* Just use the default ftrace_ops */ | 276 | /* Just use the default ftrace_ops */ |
| 261 | set_function_trace_op = &ftrace_list_end; | 277 | set_function_trace_op = &ftrace_list_end; |
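The restructured update_ftrace_function() now makes a three-way choice: no registered ops selects ftrace_stub, a single ops can be called directly (the real code applies further checks via ftrace_ops_get_func()), and anything else falls back to the list function. A simplified, runnable model of just that selection, with stand-in types rather than kernel definitions:

```c
/* Userspace model of the single/none/many decision above. */
#include <stdio.h>
#include <stddef.h>

struct ops { const char *name; struct ops *next; };

static struct ops list_end = { "list_end", NULL };   /* models ftrace_list_end */

static const char *pick_func(struct ops *ops_list)
{
	if (ops_list == &list_end)
		return "ftrace_stub";            /* nothing registered  */
	else if (ops_list->next == &list_end)
		return ops_list->name;           /* single ops: direct  */
	else
		return "ftrace_ops_list_func";   /* multiple: iterate   */
}

int main(void)
{
	struct ops b = { "ops_b", &list_end };
	struct ops a = { "ops_a", &b };

	printf("%s\n", pick_func(&list_end)); /* ftrace_stub          */
	printf("%s\n", pick_func(&b));        /* ops_b                */
	printf("%s\n", pick_func(&a));        /* ftrace_ops_list_func */
	return 0;
}
```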
| @@ -899,7 +915,7 @@ static void unregister_ftrace_profiler(void) | |||
| 899 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { | 915 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
| 900 | .func = function_profile_call, | 916 | .func = function_profile_call, |
| 901 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, | 917 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
| 902 | INIT_REGEX_LOCK(ftrace_profile_ops) | 918 | INIT_OPS_HASH(ftrace_profile_ops) |
| 903 | }; | 919 | }; |
| 904 | 920 | ||
| 905 | static int register_ftrace_profiler(void) | 921 | static int register_ftrace_profiler(void) |
| @@ -1041,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; | |||
| 1041 | 1057 | ||
| 1042 | static struct ftrace_ops *removed_ops; | 1058 | static struct ftrace_ops *removed_ops; |
| 1043 | 1059 | ||
| 1060 | /* | ||
| 1061 | * Set when doing a global update, like enabling all recs or disabling them. | ||
| 1062 | * It is not set when just updating a single ftrace_ops. | ||
| 1063 | */ | ||
| 1064 | static bool update_all_ops; | ||
| 1065 | |||
| 1044 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD | 1066 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD |
| 1045 | # error Dynamic ftrace depends on MCOUNT_RECORD | 1067 | # error Dynamic ftrace depends on MCOUNT_RECORD |
| 1046 | #endif | 1068 | #endif |
| @@ -1081,11 +1103,12 @@ static const struct ftrace_hash empty_hash = { | |||
| 1081 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) | 1103 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) |
| 1082 | 1104 | ||
| 1083 | static struct ftrace_ops global_ops = { | 1105 | static struct ftrace_ops global_ops = { |
| 1084 | .func = ftrace_stub, | 1106 | .func = ftrace_stub, |
| 1085 | .notrace_hash = EMPTY_HASH, | 1107 | .local_hash.notrace_hash = EMPTY_HASH, |
| 1086 | .filter_hash = EMPTY_HASH, | 1108 | .local_hash.filter_hash = EMPTY_HASH, |
| 1087 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, | 1109 | INIT_OPS_HASH(global_ops) |
| 1088 | INIT_REGEX_LOCK(global_ops) | 1110 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | |
| 1111 | FTRACE_OPS_FL_INITIALIZED, | ||
| 1089 | }; | 1112 | }; |
| 1090 | 1113 | ||
| 1091 | struct ftrace_page { | 1114 | struct ftrace_page { |
| @@ -1226,8 +1249,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | |||
| 1226 | void ftrace_free_filter(struct ftrace_ops *ops) | 1249 | void ftrace_free_filter(struct ftrace_ops *ops) |
| 1227 | { | 1250 | { |
| 1228 | ftrace_ops_init(ops); | 1251 | ftrace_ops_init(ops); |
| 1229 | free_ftrace_hash(ops->filter_hash); | 1252 | free_ftrace_hash(ops->func_hash->filter_hash); |
| 1230 | free_ftrace_hash(ops->notrace_hash); | 1253 | free_ftrace_hash(ops->func_hash->notrace_hash); |
| 1231 | } | 1254 | } |
| 1232 | 1255 | ||
| 1233 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | 1256 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) |
| @@ -1288,9 +1311,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
| 1288 | } | 1311 | } |
| 1289 | 1312 | ||
| 1290 | static void | 1313 | static void |
| 1291 | ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); | 1314 | ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); |
| 1292 | static void | 1315 | static void |
| 1293 | ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); | 1316 | ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); |
| 1294 | 1317 | ||
| 1295 | static int | 1318 | static int |
| 1296 | ftrace_hash_move(struct ftrace_ops *ops, int enable, | 1319 | ftrace_hash_move(struct ftrace_ops *ops, int enable, |
| @@ -1299,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1299 | struct ftrace_func_entry *entry; | 1322 | struct ftrace_func_entry *entry; |
| 1300 | struct hlist_node *tn; | 1323 | struct hlist_node *tn; |
| 1301 | struct hlist_head *hhd; | 1324 | struct hlist_head *hhd; |
| 1302 | struct ftrace_hash *old_hash; | ||
| 1303 | struct ftrace_hash *new_hash; | 1325 | struct ftrace_hash *new_hash; |
| 1304 | int size = src->count; | 1326 | int size = src->count; |
| 1305 | int bits = 0; | 1327 | int bits = 0; |
| @@ -1342,17 +1364,30 @@ update: | |||
| 1342 | * Remove the current set, update the hash and add | 1364 | * Remove the current set, update the hash and add |
| 1343 | * them back. | 1365 | * them back. |
| 1344 | */ | 1366 | */ |
| 1345 | ftrace_hash_rec_disable(ops, enable); | 1367 | ftrace_hash_rec_disable_modify(ops, enable); |
| 1346 | 1368 | ||
| 1347 | old_hash = *dst; | ||
| 1348 | rcu_assign_pointer(*dst, new_hash); | 1369 | rcu_assign_pointer(*dst, new_hash); |
| 1349 | free_ftrace_hash_rcu(old_hash); | ||
| 1350 | 1370 | ||
| 1351 | ftrace_hash_rec_enable(ops, enable); | 1371 | ftrace_hash_rec_enable_modify(ops, enable); |
| 1352 | 1372 | ||
| 1353 | return 0; | 1373 | return 0; |
| 1354 | } | 1374 | } |
| 1355 | 1375 | ||
| 1376 | static bool hash_contains_ip(unsigned long ip, | ||
| 1377 | struct ftrace_ops_hash *hash) | ||
| 1378 | { | ||
| 1379 | /* | ||
| 1380 | * The function record is a match if it exists in the filter | ||
| 1381 | * hash and not in the notrace hash. Note, an empty hash is | ||
| 1382 | * considered a match for the filter hash, but an empty | ||
| 1383 | * notrace hash is considered not in the notrace hash. | ||
| 1384 | */ | ||
| 1385 | return (ftrace_hash_empty(hash->filter_hash) || | ||
| 1386 | ftrace_lookup_ip(hash->filter_hash, ip)) && | ||
| 1387 | (ftrace_hash_empty(hash->notrace_hash) || | ||
| 1388 | !ftrace_lookup_ip(hash->notrace_hash, ip)); | ||
| 1389 | } | ||
| 1390 | |||
| 1356 | /* | 1391 | /* |
| 1357 | * Test the hashes for this ops to see if we want to call | 1392 | * Test the hashes for this ops to see if we want to call |
| 1358 | * the ops->func or not. | 1393 | * the ops->func or not. |
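hash_contains_ip() centralizes the filter/notrace test that several call sites previously open-coded: an empty filter hash matches every ip, while an empty notrace hash excludes none. A small userspace model of that predicate, using plain arrays in place of ftrace hashes:

```c
/* Userspace model of the hash_contains_ip() semantics added above. */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

static bool lookup(const unsigned long *hash, size_t n, unsigned long ip)
{
	for (size_t i = 0; i < n; i++)
		if (hash[i] == ip)
			return true;
	return false;
}

static bool contains_ip(unsigned long ip,
			const unsigned long *filter, size_t nfilter,
			const unsigned long *notrace, size_t nnotrace)
{
	/* empty filter matches everything; empty notrace excludes nothing */
	return (nfilter == 0 || lookup(filter, nfilter, ip)) &&
	       (nnotrace == 0 || !lookup(notrace, nnotrace, ip));
}

int main(void)
{
	unsigned long filter[]  = { 0x1000 };
	unsigned long notrace[] = { 0x2000 };

	printf("%d\n", contains_ip(0x1000, filter, 1, notrace, 1)); /* 1 */
	printf("%d\n", contains_ip(0x2000, NULL, 0, notrace, 1));   /* 0 */
	printf("%d\n", contains_ip(0x3000, NULL, 0, NULL, 0));      /* 1 */
	return 0;
}
```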
| @@ -1368,8 +1403,7 @@ update: | |||
| 1368 | static int | 1403 | static int |
| 1369 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | 1404 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) |
| 1370 | { | 1405 | { |
| 1371 | struct ftrace_hash *filter_hash; | 1406 | struct ftrace_ops_hash hash; |
| 1372 | struct ftrace_hash *notrace_hash; | ||
| 1373 | int ret; | 1407 | int ret; |
| 1374 | 1408 | ||
| 1375 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS | 1409 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
| @@ -1382,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 1382 | return 0; | 1416 | return 0; |
| 1383 | #endif | 1417 | #endif |
| 1384 | 1418 | ||
| 1385 | filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); | 1419 | hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); |
| 1386 | notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); | 1420 | hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); |
| 1387 | 1421 | ||
| 1388 | if ((ftrace_hash_empty(filter_hash) || | 1422 | if (hash_contains_ip(ip, &hash)) |
| 1389 | ftrace_lookup_ip(filter_hash, ip)) && | ||
| 1390 | (ftrace_hash_empty(notrace_hash) || | ||
| 1391 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
| 1392 | ret = 1; | 1423 | ret = 1; |
| 1393 | else | 1424 | else |
| 1394 | ret = 0; | 1425 | ret = 0; |
| @@ -1500,33 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) | |||
| 1500 | return keep_regs; | 1531 | return keep_regs; |
| 1501 | } | 1532 | } |
| 1502 | 1533 | ||
| 1503 | static void ftrace_remove_tramp(struct ftrace_ops *ops, | ||
| 1504 | struct dyn_ftrace *rec) | ||
| 1505 | { | ||
| 1506 | struct ftrace_func_entry *entry; | ||
| 1507 | |||
| 1508 | entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); | ||
| 1509 | if (!entry) | ||
| 1510 | return; | ||
| 1511 | |||
| 1512 | /* | ||
| 1513 | * The tramp_hash entry will be removed at time | ||
| 1514 | * of update. | ||
| 1515 | */ | ||
| 1516 | ops->nr_trampolines--; | ||
| 1517 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | static void ftrace_clear_tramps(struct dyn_ftrace *rec) | ||
| 1521 | { | ||
| 1522 | struct ftrace_ops *op; | ||
| 1523 | |||
| 1524 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1525 | if (op->nr_trampolines) | ||
| 1526 | ftrace_remove_tramp(op, rec); | ||
| 1527 | } while_for_each_ftrace_op(op); | ||
| 1528 | } | ||
| 1529 | |||
| 1530 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1534 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
| 1531 | int filter_hash, | 1535 | int filter_hash, |
| 1532 | bool inc) | 1536 | bool inc) |
| @@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1554 | * gets inversed. | 1558 | * gets inversed. |
| 1555 | */ | 1559 | */ |
| 1556 | if (filter_hash) { | 1560 | if (filter_hash) { |
| 1557 | hash = ops->filter_hash; | 1561 | hash = ops->func_hash->filter_hash; |
| 1558 | other_hash = ops->notrace_hash; | 1562 | other_hash = ops->func_hash->notrace_hash; |
| 1559 | if (ftrace_hash_empty(hash)) | 1563 | if (ftrace_hash_empty(hash)) |
| 1560 | all = 1; | 1564 | all = 1; |
| 1561 | } else { | 1565 | } else { |
| 1562 | inc = !inc; | 1566 | inc = !inc; |
| 1563 | hash = ops->notrace_hash; | 1567 | hash = ops->func_hash->notrace_hash; |
| 1564 | other_hash = ops->filter_hash; | 1568 | other_hash = ops->func_hash->filter_hash; |
| 1565 | /* | 1569 | /* |
| 1566 | * If the notrace hash has no items, | 1570 | * If the notrace hash has no items, |
| 1567 | * then there's nothing to do. | 1571 | * then there's nothing to do. |
| @@ -1615,22 +1619,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1615 | * function, and the ops has a trampoline registered | 1619 | * function, and the ops has a trampoline registered |
| 1616 | * for it, then we can call it directly. | 1620 | * for it, then we can call it directly. |
| 1617 | */ | 1621 | */ |
| 1618 | if (ftrace_rec_count(rec) == 1 && ops->trampoline) { | 1622 | if (ftrace_rec_count(rec) == 1 && ops->trampoline) |
| 1619 | rec->flags |= FTRACE_FL_TRAMP; | 1623 | rec->flags |= FTRACE_FL_TRAMP; |
| 1620 | ops->nr_trampolines++; | 1624 | else |
| 1621 | } else { | ||
| 1622 | /* | 1625 | /* |
| 1623 | * If we are adding another function callback | 1626 | * If we are adding another function callback |
| 1624 | * to this function, and the previous had a | 1627 | * to this function, and the previous had a |
| 1625 | * trampoline used, then we need to go back to | 1628 | * custom trampoline in use, then we need to go |
| 1626 | * the default trampoline. | 1629 | * back to the default trampoline. |
| 1627 | */ | 1630 | */ |
| 1628 | rec->flags &= ~FTRACE_FL_TRAMP; | 1631 | rec->flags &= ~FTRACE_FL_TRAMP; |
| 1629 | 1632 | ||
| 1630 | /* remove trampolines from any ops for this rec */ | ||
| 1631 | ftrace_clear_tramps(rec); | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | /* | 1633 | /* |
| 1635 | * If any ops wants regs saved for this function | 1634 | * If any ops wants regs saved for this function |
| 1636 | * then all ops will get saved regs. | 1635 | * then all ops will get saved regs. |
| @@ -1642,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1642 | return; | 1641 | return; |
| 1643 | rec->flags--; | 1642 | rec->flags--; |
| 1644 | 1643 | ||
| 1645 | if (ops->trampoline && !ftrace_rec_count(rec)) | ||
| 1646 | ftrace_remove_tramp(ops, rec); | ||
| 1647 | |||
| 1648 | /* | 1644 | /* |
| 1649 | * If the rec had REGS enabled and the ops that is | 1645 | * If the rec had REGS enabled and the ops that is |
| 1650 | * being removed had REGS set, then see if there is | 1646 | * being removed had REGS set, then see if there is |
| @@ -1659,6 +1655,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1659 | } | 1655 | } |
| 1660 | 1656 | ||
| 1661 | /* | 1657 | /* |
| 1658 | * If the rec had TRAMP enabled, then it needs to | ||
| 1659 | * be cleared. As TRAMP can only be enabled iff | ||
| 1660 | * there is only a single ops attached to it. | ||
| 1661 | * In other words, always disable it on decrementing. | ||
| 1662 | * In the future, we may set it if rec count is | ||
| 1663 | * decremented to one, and the ops that is left | ||
| 1664 | * has a trampoline. | ||
| 1665 | */ | ||
| 1666 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
| 1667 | |||
| 1668 | /* | ||
| 1662 | * flags will be cleared in ftrace_check_record() | 1669 | * flags will be cleared in ftrace_check_record() |
| 1663 | * if rec count is zero. | 1670 | * if rec count is zero. |
| 1664 | */ | 1671 | */ |
| @@ -1682,6 +1689,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
| 1682 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1689 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
| 1683 | } | 1690 | } |
| 1684 | 1691 | ||
| 1692 | static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, | ||
| 1693 | int filter_hash, int inc) | ||
| 1694 | { | ||
| 1695 | struct ftrace_ops *op; | ||
| 1696 | |||
| 1697 | __ftrace_hash_rec_update(ops, filter_hash, inc); | ||
| 1698 | |||
| 1699 | if (ops->func_hash != &global_ops.local_hash) | ||
| 1700 | return; | ||
| 1701 | |||
| 1702 | /* | ||
| 1703 | * If the ops shares the global_ops hash, then we need to update | ||
| 1704 | * all ops that are enabled and use this hash. | ||
| 1705 | */ | ||
| 1706 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1707 | /* Already done */ | ||
| 1708 | if (op == ops) | ||
| 1709 | continue; | ||
| 1710 | if (op->func_hash == &global_ops.local_hash) | ||
| 1711 | __ftrace_hash_rec_update(op, filter_hash, inc); | ||
| 1712 | } while_for_each_ftrace_op(op); | ||
| 1713 | } | ||
| 1714 | |||
| 1715 | static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, | ||
| 1716 | int filter_hash) | ||
| 1717 | { | ||
| 1718 | ftrace_hash_rec_update_modify(ops, filter_hash, 0); | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, | ||
| 1722 | int filter_hash) | ||
| 1723 | { | ||
| 1724 | ftrace_hash_rec_update_modify(ops, filter_hash, 1); | ||
| 1725 | } | ||
| 1726 | |||
| 1685 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1727 | static void print_ip_ins(const char *fmt, unsigned char *p) |
| 1686 | { | 1728 | { |
| 1687 | int i; | 1729 | int i; |
| @@ -1842,21 +1884,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
| 1842 | } | 1884 | } |
| 1843 | 1885 | ||
| 1844 | static struct ftrace_ops * | 1886 | static struct ftrace_ops * |
| 1887 | ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) | ||
| 1888 | { | ||
| 1889 | struct ftrace_ops *op; | ||
| 1890 | unsigned long ip = rec->ip; | ||
| 1891 | |||
| 1892 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1893 | |||
| 1894 | if (!op->trampoline) | ||
| 1895 | continue; | ||
| 1896 | |||
| 1897 | if (hash_contains_ip(ip, op->func_hash)) | ||
| 1898 | return op; | ||
| 1899 | } while_for_each_ftrace_op(op); | ||
| 1900 | |||
| 1901 | return NULL; | ||
| 1902 | } | ||
| 1903 | |||
| 1904 | static struct ftrace_ops * | ||
| 1845 | ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) | 1905 | ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) |
| 1846 | { | 1906 | { |
| 1847 | struct ftrace_ops *op; | 1907 | struct ftrace_ops *op; |
| 1908 | unsigned long ip = rec->ip; | ||
| 1848 | 1909 | ||
| 1849 | /* Removed ops need to be tested first */ | 1910 | /* |
| 1850 | if (removed_ops && removed_ops->tramp_hash) { | 1911 | * Need to check removed ops first. |
| 1851 | if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) | 1912 | * If they are being removed, and this rec has a tramp, |
| 1913 | * and this rec is in the ops list, then it would be the | ||
| 1914 | * one with the tramp. | ||
| 1915 | */ | ||
| 1916 | if (removed_ops) { | ||
| 1917 | if (hash_contains_ip(ip, &removed_ops->old_hash)) | ||
| 1852 | return removed_ops; | 1918 | return removed_ops; |
| 1853 | } | 1919 | } |
| 1854 | 1920 | ||
| 1921 | /* | ||
| 1922 | * Need to find the current trampoline for a rec. | ||
| 1923 | * Now, a trampoline is only attached to a rec if there | ||
| 1924 | * was a single 'ops' attached to it. But this can be called | ||
| 1925 | * when we are adding another op to the rec or removing the | ||
| 1926 | * current one. Thus, if the op is being added, we can | ||
| 1927 | * ignore it because it hasn't attached itself to the rec | ||
| 1928 | * yet. | ||
| 1929 | * | ||
| 1930 | * If an ops is being modified (hooking to different functions) | ||
| 1931 | * then we don't care about the new functions that are being | ||
| 1932 | * added, just the old ones (that are probably being removed). | ||
| 1933 | * | ||
| 1934 | * If we are adding an ops to a function that already is using | ||
| 1935 | * a trampoline, it needs to be removed (trampolines are only | ||
| 1936 | * for single ops connected), then an ops that is not being | ||
| 1937 | * modified also needs to be checked. | ||
| 1938 | */ | ||
| 1855 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 1939 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 1856 | if (!op->tramp_hash) | 1940 | |
| 1941 | if (!op->trampoline) | ||
| 1857 | continue; | 1942 | continue; |
| 1858 | 1943 | ||
| 1859 | if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) | 1944 | /* |
| 1945 | * If the ops is being added, it hasn't gotten to | ||
| 1946 | * the point to be removed from this tree yet. | ||
| 1947 | */ | ||
| 1948 | if (op->flags & FTRACE_OPS_FL_ADDING) | ||
| 1949 | continue; | ||
| 1950 | |||
| 1951 | |||
| 1952 | /* | ||
| 1953 | * If the ops is being modified and is in the old | ||
| 1954 | * hash, then it is probably being removed from this | ||
| 1955 | * function. | ||
| 1956 | */ | ||
| 1957 | if ((op->flags & FTRACE_OPS_FL_MODIFYING) && | ||
| 1958 | hash_contains_ip(ip, &op->old_hash)) | ||
| 1959 | return op; | ||
| 1960 | /* | ||
| 1961 | * If the ops is not being added or modified, and it's | ||
| 1962 | * in its normal filter hash, then this must be the one | ||
| 1963 | * we want! | ||
| 1964 | */ | ||
| 1965 | if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && | ||
| 1966 | hash_contains_ip(ip, op->func_hash)) | ||
| 1860 | return op; | 1967 | return op; |
| 1861 | 1968 | ||
| 1862 | } while_for_each_ftrace_op(op); | 1969 | } while_for_each_ftrace_op(op); |
| @@ -1868,10 +1975,11 @@ static struct ftrace_ops * | |||
| 1868 | ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) | 1975 | ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) |
| 1869 | { | 1976 | { |
| 1870 | struct ftrace_ops *op; | 1977 | struct ftrace_ops *op; |
| 1978 | unsigned long ip = rec->ip; | ||
| 1871 | 1979 | ||
| 1872 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 1980 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 1873 | /* pass rec in as regs to have non-NULL val */ | 1981 | /* pass rec in as regs to have non-NULL val */ |
| 1874 | if (ftrace_ops_test(op, rec->ip, rec)) | 1982 | if (hash_contains_ip(ip, op->func_hash)) |
| 1875 | return op; | 1983 | return op; |
| 1876 | } while_for_each_ftrace_op(op); | 1984 | } while_for_each_ftrace_op(op); |
| 1877 | 1985 | ||
| @@ -1896,8 +2004,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) | |||
| 1896 | if (rec->flags & FTRACE_FL_TRAMP) { | 2004 | if (rec->flags & FTRACE_FL_TRAMP) { |
| 1897 | ops = ftrace_find_tramp_ops_new(rec); | 2005 | ops = ftrace_find_tramp_ops_new(rec); |
| 1898 | if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { | 2006 | if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { |
| 1899 | pr_warning("Bad trampoline accounting at: %p (%pS)\n", | 2007 | pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", |
| 1900 | (void *)rec->ip, (void *)rec->ip); | 2008 | (void *)rec->ip, (void *)rec->ip, rec->flags); |
| 1901 | /* Ftrace is shutting down, return anything */ | 2009 | /* Ftrace is shutting down, return anything */ |
| 1902 | return (unsigned long)FTRACE_ADDR; | 2010 | return (unsigned long)FTRACE_ADDR; |
| 1903 | } | 2011 | } |
| @@ -1964,7 +2072,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
| 1964 | return ftrace_make_call(rec, ftrace_addr); | 2072 | return ftrace_make_call(rec, ftrace_addr); |
| 1965 | 2073 | ||
| 1966 | case FTRACE_UPDATE_MAKE_NOP: | 2074 | case FTRACE_UPDATE_MAKE_NOP: |
| 1967 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 2075 | return ftrace_make_nop(NULL, rec, ftrace_old_addr); |
| 1968 | 2076 | ||
| 1969 | case FTRACE_UPDATE_MODIFY_CALL: | 2077 | case FTRACE_UPDATE_MODIFY_CALL: |
| 1970 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | 2078 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); |
| @@ -2178,89 +2286,6 @@ void __weak arch_ftrace_update_code(int command) | |||
| 2178 | ftrace_run_stop_machine(command); | 2286 | ftrace_run_stop_machine(command); |
| 2179 | } | 2287 | } |
| 2180 | 2288 | ||
| 2181 | static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) | ||
| 2182 | { | ||
| 2183 | struct ftrace_page *pg; | ||
| 2184 | struct dyn_ftrace *rec; | ||
| 2185 | int size, bits; | ||
| 2186 | int ret; | ||
| 2187 | |||
| 2188 | size = ops->nr_trampolines; | ||
| 2189 | bits = 0; | ||
| 2190 | /* | ||
| 2191 | * Make the hash size about 1/2 the # found | ||
| 2192 | */ | ||
| 2193 | for (size /= 2; size; size >>= 1) | ||
| 2194 | bits++; | ||
| 2195 | |||
| 2196 | ops->tramp_hash = alloc_ftrace_hash(bits); | ||
| 2197 | /* | ||
| 2198 | * TODO: a failed allocation is going to screw up | ||
| 2199 | * the accounting of what needs to be modified | ||
| 2200 | * and not. For now, we kill ftrace if we fail | ||
| 2201 | * to allocate here. But there are ways around this, | ||
| 2202 | * but that will take a little more work. | ||
| 2203 | */ | ||
| 2204 | if (!ops->tramp_hash) | ||
| 2205 | return -ENOMEM; | ||
| 2206 | |||
| 2207 | do_for_each_ftrace_rec(pg, rec) { | ||
| 2208 | if (ftrace_rec_count(rec) == 1 && | ||
| 2209 | ftrace_ops_test(ops, rec->ip, rec)) { | ||
| 2210 | |||
| 2211 | /* | ||
| 2212 | * If another ops adds to a rec, the rec will | ||
| 2213 | * lose its trampoline and never get it back | ||
| 2214 | * until all ops are off of it. | ||
| 2215 | */ | ||
| 2216 | if (!(rec->flags & FTRACE_FL_TRAMP)) | ||
| 2217 | continue; | ||
| 2218 | |||
| 2219 | /* This record had better have a trampoline */ | ||
| 2220 | if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) | ||
| 2221 | return -1; | ||
| 2222 | |||
| 2223 | ret = add_hash_entry(ops->tramp_hash, rec->ip); | ||
| 2224 | if (ret < 0) | ||
| 2225 | return ret; | ||
| 2226 | } | ||
| 2227 | } while_for_each_ftrace_rec(); | ||
| 2228 | |||
| 2229 | /* The number of recs in the hash must match nr_trampolines */ | ||
| 2230 | FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); | ||
| 2231 | |||
| 2232 | return 0; | ||
| 2233 | } | ||
| 2234 | |||
| 2235 | static int ftrace_save_tramp_hashes(void) | ||
| 2236 | { | ||
| 2237 | struct ftrace_ops *op; | ||
| 2238 | int ret; | ||
| 2239 | |||
| 2240 | /* | ||
| 2241 | * Now that any trampoline is being used, we need to save the | ||
| 2242 | * hashes for the ops that have them. This allows the mapping | ||
| 2243 | * back from the record to the ops that has the trampoline to | ||
| 2244 | * know what code is being replaced. Modifying code must always | ||
| 2245 | * verify what it is changing. | ||
| 2246 | */ | ||
| 2247 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 2248 | |||
| 2249 | /* The tramp_hash is recreated each time. */ | ||
| 2250 | free_ftrace_hash(op->tramp_hash); | ||
| 2251 | op->tramp_hash = NULL; | ||
| 2252 | |||
| 2253 | if (op->nr_trampolines) { | ||
| 2254 | ret = ftrace_save_ops_tramp_hash(op); | ||
| 2255 | if (ret) | ||
| 2256 | return ret; | ||
| 2257 | } | ||
| 2258 | |||
| 2259 | } while_for_each_ftrace_op(op); | ||
| 2260 | |||
| 2261 | return 0; | ||
| 2262 | } | ||
| 2263 | |||
| 2264 | static void ftrace_run_update_code(int command) | 2289 | static void ftrace_run_update_code(int command) |
| 2265 | { | 2290 | { |
| 2266 | int ret; | 2291 | int ret; |
| @@ -2280,9 +2305,16 @@ static void ftrace_run_update_code(int command) | |||
| 2280 | 2305 | ||
| 2281 | ret = ftrace_arch_code_modify_post_process(); | 2306 | ret = ftrace_arch_code_modify_post_process(); |
| 2282 | FTRACE_WARN_ON(ret); | 2307 | FTRACE_WARN_ON(ret); |
| 2308 | } | ||
| 2283 | 2309 | ||
| 2284 | ret = ftrace_save_tramp_hashes(); | 2310 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, |
| 2285 | FTRACE_WARN_ON(ret); | 2311 | struct ftrace_hash *old_hash) |
| 2312 | { | ||
| 2313 | ops->flags |= FTRACE_OPS_FL_MODIFYING; | ||
| 2314 | ops->old_hash.filter_hash = old_hash; | ||
| 2315 | ftrace_run_update_code(command); | ||
| 2316 | ops->old_hash.filter_hash = NULL; | ||
| 2317 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; | ||
| 2286 | } | 2318 | } |
| 2287 | 2319 | ||
| 2288 | static ftrace_func_t saved_ftrace_func; | 2320 | static ftrace_func_t saved_ftrace_func; |
| @@ -2306,6 +2338,13 @@ static void ftrace_startup_enable(int command) | |||
| 2306 | ftrace_run_update_code(command); | 2338 | ftrace_run_update_code(command); |
| 2307 | } | 2339 | } |
| 2308 | 2340 | ||
| 2341 | static void ftrace_startup_all(int command) | ||
| 2342 | { | ||
| 2343 | update_all_ops = true; | ||
| 2344 | ftrace_startup_enable(command); | ||
| 2345 | update_all_ops = false; | ||
| 2346 | } | ||
| 2347 | |||
| 2309 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2348 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
| 2310 | { | 2349 | { |
| 2311 | int ret; | 2350 | int ret; |
| @@ -2320,12 +2359,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
| 2320 | ftrace_start_up++; | 2359 | ftrace_start_up++; |
| 2321 | command |= FTRACE_UPDATE_CALLS; | 2360 | command |= FTRACE_UPDATE_CALLS; |
| 2322 | 2361 | ||
| 2323 | ops->flags |= FTRACE_OPS_FL_ENABLED; | 2362 | /* |
| 2363 | * Note that ftrace probes uses this to start up | ||
| 2364 | * and modify functions it will probe. But we still | ||
| 2365 | * set the ADDING flag for modification, as probes | ||
| 2366 | * do not have trampolines. If they add them in the | ||
| 2367 | * future, then the probes will need to distinguish | ||
| 2368 | * between adding and updating probes. | ||
| 2369 | */ | ||
| 2370 | ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; | ||
| 2324 | 2371 | ||
| 2325 | ftrace_hash_rec_enable(ops, 1); | 2372 | ftrace_hash_rec_enable(ops, 1); |
| 2326 | 2373 | ||
| 2327 | ftrace_startup_enable(command); | 2374 | ftrace_startup_enable(command); |
| 2328 | 2375 | ||
| 2376 | ops->flags &= ~FTRACE_OPS_FL_ADDING; | ||
| 2377 | |||
| 2329 | return 0; | 2378 | return 0; |
| 2330 | } | 2379 | } |
| 2331 | 2380 | ||
| @@ -2375,11 +2424,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2375 | * If the ops uses a trampoline, then it needs to be | 2424 | * If the ops uses a trampoline, then it needs to be |
| 2376 | * tested first on update. | 2425 | * tested first on update. |
| 2377 | */ | 2426 | */ |
| 2427 | ops->flags |= FTRACE_OPS_FL_REMOVING; | ||
| 2378 | removed_ops = ops; | 2428 | removed_ops = ops; |
| 2379 | 2429 | ||
| 2430 | /* The trampoline logic checks the old hashes */ | ||
| 2431 | ops->old_hash.filter_hash = ops->func_hash->filter_hash; | ||
| 2432 | ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; | ||
| 2433 | |||
| 2380 | ftrace_run_update_code(command); | 2434 | ftrace_run_update_code(command); |
| 2381 | 2435 | ||
| 2436 | /* | ||
| 2437 | * If there's no more ops registered with ftrace, run a | ||
| 2438 | * sanity check to make sure all rec flags are cleared. | ||
| 2439 | */ | ||
| 2440 | if (ftrace_ops_list == &ftrace_list_end) { | ||
| 2441 | struct ftrace_page *pg; | ||
| 2442 | struct dyn_ftrace *rec; | ||
| 2443 | |||
| 2444 | do_for_each_ftrace_rec(pg, rec) { | ||
| 2445 | if (FTRACE_WARN_ON_ONCE(rec->flags)) | ||
| 2446 | pr_warn(" %pS flags:%lx\n", | ||
| 2447 | (void *)rec->ip, rec->flags); | ||
| 2448 | } while_for_each_ftrace_rec(); | ||
| 2449 | } | ||
| 2450 | |||
| 2451 | ops->old_hash.filter_hash = NULL; | ||
| 2452 | ops->old_hash.notrace_hash = NULL; | ||
| 2453 | |||
| 2382 | removed_ops = NULL; | 2454 | removed_ops = NULL; |
| 2455 | ops->flags &= ~FTRACE_OPS_FL_REMOVING; | ||
| 2383 | 2456 | ||
| 2384 | /* | 2457 | /* |
| 2385 | * Dynamic ops may be freed, we must make sure that all | 2458 | * Dynamic ops may be freed, we must make sure that all |
| @@ -2436,8 +2509,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) | |||
| 2436 | * Filter_hash being empty will default to trace module. | 2509 | * Filter_hash being empty will default to trace module. |
| 2437 | * But notrace hash requires a test of individual module functions. | 2510 | * But notrace hash requires a test of individual module functions. |
| 2438 | */ | 2511 | */ |
| 2439 | return ftrace_hash_empty(ops->filter_hash) && | 2512 | return ftrace_hash_empty(ops->func_hash->filter_hash) && |
| 2440 | ftrace_hash_empty(ops->notrace_hash); | 2513 | ftrace_hash_empty(ops->func_hash->notrace_hash); |
| 2441 | } | 2514 | } |
| 2442 | 2515 | ||
| 2443 | /* | 2516 | /* |
| @@ -2459,12 +2532,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) | |||
| 2459 | return 0; | 2532 | return 0; |
| 2460 | 2533 | ||
| 2461 | /* The function must be in the filter */ | 2534 | /* The function must be in the filter */ |
| 2462 | if (!ftrace_hash_empty(ops->filter_hash) && | 2535 | if (!ftrace_hash_empty(ops->func_hash->filter_hash) && |
| 2463 | !ftrace_lookup_ip(ops->filter_hash, rec->ip)) | 2536 | !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) |
| 2464 | return 0; | 2537 | return 0; |
| 2465 | 2538 | ||
| 2466 | /* If in notrace hash, we ignore it too */ | 2539 | /* If in notrace hash, we ignore it too */ |
| 2467 | if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) | 2540 | if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) |
| 2468 | return 0; | 2541 | return 0; |
| 2469 | 2542 | ||
| 2470 | return 1; | 2543 | return 1; |
| @@ -2785,10 +2858,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 2785 | } else { | 2858 | } else { |
| 2786 | rec = &iter->pg->records[iter->idx++]; | 2859 | rec = &iter->pg->records[iter->idx++]; |
| 2787 | if (((iter->flags & FTRACE_ITER_FILTER) && | 2860 | if (((iter->flags & FTRACE_ITER_FILTER) && |
| 2788 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || | 2861 | !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || |
| 2789 | 2862 | ||
| 2790 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 2863 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
| 2791 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || | 2864 | !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || |
| 2792 | 2865 | ||
| 2793 | ((iter->flags & FTRACE_ITER_ENABLED) && | 2866 | ((iter->flags & FTRACE_ITER_ENABLED) && |
| 2794 | !(rec->flags & FTRACE_FL_ENABLED))) { | 2867 | !(rec->flags & FTRACE_FL_ENABLED))) { |
| @@ -2837,9 +2910,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 2837 | * functions are enabled. | 2910 | * functions are enabled. |
| 2838 | */ | 2911 | */ |
| 2839 | if ((iter->flags & FTRACE_ITER_FILTER && | 2912 | if ((iter->flags & FTRACE_ITER_FILTER && |
| 2840 | ftrace_hash_empty(ops->filter_hash)) || | 2913 | ftrace_hash_empty(ops->func_hash->filter_hash)) || |
| 2841 | (iter->flags & FTRACE_ITER_NOTRACE && | 2914 | (iter->flags & FTRACE_ITER_NOTRACE && |
| 2842 | ftrace_hash_empty(ops->notrace_hash))) { | 2915 | ftrace_hash_empty(ops->func_hash->notrace_hash))) { |
| 2843 | if (*pos > 0) | 2916 | if (*pos > 0) |
| 2844 | return t_hash_start(m, pos); | 2917 | return t_hash_start(m, pos); |
| 2845 | iter->flags |= FTRACE_ITER_PRINTALL; | 2918 | iter->flags |= FTRACE_ITER_PRINTALL; |
| @@ -2904,8 +2977,8 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2904 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | 2977 | if (rec->flags & FTRACE_FL_TRAMP_EN) { |
| 2905 | struct ftrace_ops *ops; | 2978 | struct ftrace_ops *ops; |
| 2906 | 2979 | ||
| 2907 | ops = ftrace_find_tramp_ops_curr(rec); | 2980 | ops = ftrace_find_tramp_ops_any(rec); |
| 2908 | if (ops && ops->trampoline) | 2981 | if (ops) |
| 2909 | seq_printf(m, "\ttramp: %pS", | 2982 | seq_printf(m, "\ttramp: %pS", |
| 2910 | (void *)ops->trampoline); | 2983 | (void *)ops->trampoline); |
| 2911 | else | 2984 | else |
| @@ -3001,12 +3074,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3001 | iter->ops = ops; | 3074 | iter->ops = ops; |
| 3002 | iter->flags = flag; | 3075 | iter->flags = flag; |
| 3003 | 3076 | ||
| 3004 | mutex_lock(&ops->regex_lock); | 3077 | mutex_lock(&ops->func_hash->regex_lock); |
| 3005 | 3078 | ||
| 3006 | if (flag & FTRACE_ITER_NOTRACE) | 3079 | if (flag & FTRACE_ITER_NOTRACE) |
| 3007 | hash = ops->notrace_hash; | 3080 | hash = ops->func_hash->notrace_hash; |
| 3008 | else | 3081 | else |
| 3009 | hash = ops->filter_hash; | 3082 | hash = ops->func_hash->filter_hash; |
| 3010 | 3083 | ||
| 3011 | if (file->f_mode & FMODE_WRITE) { | 3084 | if (file->f_mode & FMODE_WRITE) { |
| 3012 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; | 3085 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; |
| @@ -3041,7 +3114,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3041 | file->private_data = iter; | 3114 | file->private_data = iter; |
| 3042 | 3115 | ||
| 3043 | out_unlock: | 3116 | out_unlock: |
| 3044 | mutex_unlock(&ops->regex_lock); | 3117 | mutex_unlock(&ops->func_hash->regex_lock); |
| 3045 | 3118 | ||
| 3046 | return ret; | 3119 | return ret; |
| 3047 | } | 3120 | } |
| @@ -3279,12 +3352,12 @@ static struct ftrace_ops trace_probe_ops __read_mostly = | |||
| 3279 | { | 3352 | { |
| 3280 | .func = function_trace_probe_call, | 3353 | .func = function_trace_probe_call, |
| 3281 | .flags = FTRACE_OPS_FL_INITIALIZED, | 3354 | .flags = FTRACE_OPS_FL_INITIALIZED, |
| 3282 | INIT_REGEX_LOCK(trace_probe_ops) | 3355 | INIT_OPS_HASH(trace_probe_ops) |
| 3283 | }; | 3356 | }; |
| 3284 | 3357 | ||
| 3285 | static int ftrace_probe_registered; | 3358 | static int ftrace_probe_registered; |
| 3286 | 3359 | ||
| 3287 | static void __enable_ftrace_function_probe(void) | 3360 | static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) |
| 3288 | { | 3361 | { |
| 3289 | int ret; | 3362 | int ret; |
| 3290 | int i; | 3363 | int i; |
| @@ -3292,7 +3365,8 @@ static void __enable_ftrace_function_probe(void) | |||
| 3292 | if (ftrace_probe_registered) { | 3365 | if (ftrace_probe_registered) { |
| 3293 | /* still need to update the function call sites */ | 3366 | /* still need to update the function call sites */ |
| 3294 | if (ftrace_enabled) | 3367 | if (ftrace_enabled) |
| 3295 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | 3368 | ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, |
| 3369 | old_hash); | ||
| 3296 | return; | 3370 | return; |
| 3297 | } | 3371 | } |
| 3298 | 3372 | ||
| @@ -3342,7 +3416,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3342 | void *data) | 3416 | void *data) |
| 3343 | { | 3417 | { |
| 3344 | struct ftrace_func_probe *entry; | 3418 | struct ftrace_func_probe *entry; |
| 3345 | struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; | 3419 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3420 | struct ftrace_hash *old_hash = *orig_hash; | ||
| 3346 | struct ftrace_hash *hash; | 3421 | struct ftrace_hash *hash; |
| 3347 | struct ftrace_page *pg; | 3422 | struct ftrace_page *pg; |
| 3348 | struct dyn_ftrace *rec; | 3423 | struct dyn_ftrace *rec; |
| @@ -3359,9 +3434,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3359 | if (WARN_ON(not)) | 3434 | if (WARN_ON(not)) |
| 3360 | return -EINVAL; | 3435 | return -EINVAL; |
| 3361 | 3436 | ||
| 3362 | mutex_lock(&trace_probe_ops.regex_lock); | 3437 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); |
| 3363 | 3438 | ||
| 3364 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 3439 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); |
| 3365 | if (!hash) { | 3440 | if (!hash) { |
| 3366 | count = -ENOMEM; | 3441 | count = -ENOMEM; |
| 3367 | goto out; | 3442 | goto out; |
| @@ -3420,15 +3495,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3420 | } while_for_each_ftrace_rec(); | 3495 | } while_for_each_ftrace_rec(); |
| 3421 | 3496 | ||
| 3422 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3497 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3423 | if (ret < 0) | ||
| 3424 | count = ret; | ||
| 3425 | 3498 | ||
| 3426 | __enable_ftrace_function_probe(); | 3499 | __enable_ftrace_function_probe(old_hash); |
| 3500 | |||
| 3501 | if (!ret) | ||
| 3502 | free_ftrace_hash_rcu(old_hash); | ||
| 3503 | else | ||
| 3504 | count = ret; | ||
| 3427 | 3505 | ||
| 3428 | out_unlock: | 3506 | out_unlock: |
| 3429 | mutex_unlock(&ftrace_lock); | 3507 | mutex_unlock(&ftrace_lock); |
| 3430 | out: | 3508 | out: |
| 3431 | mutex_unlock(&trace_probe_ops.regex_lock); | 3509 | mutex_unlock(&trace_probe_ops.func_hash->regex_lock); |
| 3432 | free_ftrace_hash(hash); | 3510 | free_ftrace_hash(hash); |
| 3433 | 3511 | ||
| 3434 | return count; | 3512 | return count; |
| @@ -3446,7 +3524,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3446 | struct ftrace_func_entry *rec_entry; | 3524 | struct ftrace_func_entry *rec_entry; |
| 3447 | struct ftrace_func_probe *entry; | 3525 | struct ftrace_func_probe *entry; |
| 3448 | struct ftrace_func_probe *p; | 3526 | struct ftrace_func_probe *p; |
| 3449 | struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; | 3527 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3528 | struct ftrace_hash *old_hash = *orig_hash; | ||
| 3450 | struct list_head free_list; | 3529 | struct list_head free_list; |
| 3451 | struct ftrace_hash *hash; | 3530 | struct ftrace_hash *hash; |
| 3452 | struct hlist_node *tmp; | 3531 | struct hlist_node *tmp; |
| @@ -3454,6 +3533,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3454 | int type = MATCH_FULL; | 3533 | int type = MATCH_FULL; |
| 3455 | int i, len = 0; | 3534 | int i, len = 0; |
| 3456 | char *search; | 3535 | char *search; |
| 3536 | int ret; | ||
| 3457 | 3537 | ||
| 3458 | if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) | 3538 | if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) |
| 3459 | glob = NULL; | 3539 | glob = NULL; |
| @@ -3468,7 +3548,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3468 | return; | 3548 | return; |
| 3469 | } | 3549 | } |
| 3470 | 3550 | ||
| 3471 | mutex_lock(&trace_probe_ops.regex_lock); | 3551 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); |
| 3472 | 3552 | ||
| 3473 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 3553 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); |
| 3474 | if (!hash) | 3554 | if (!hash) |
| @@ -3512,8 +3592,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3512 | * Remove after the disable is called. Otherwise, if the last | 3592 | * Remove after the disable is called. Otherwise, if the last |
| 3513 | * probe is removed, a null hash means *all enabled*. | 3593 | * probe is removed, a null hash means *all enabled*. |
| 3514 | */ | 3594 | */ |
| 3515 | ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3595 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3516 | synchronize_sched(); | 3596 | synchronize_sched(); |
| 3597 | if (!ret) | ||
| 3598 | free_ftrace_hash_rcu(old_hash); | ||
| 3599 | |||
| 3517 | list_for_each_entry_safe(entry, p, &free_list, free_list) { | 3600 | list_for_each_entry_safe(entry, p, &free_list, free_list) { |
| 3518 | list_del(&entry->free_list); | 3601 | list_del(&entry->free_list); |
| 3519 | ftrace_free_entry(entry); | 3602 | ftrace_free_entry(entry); |
| @@ -3521,7 +3604,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3521 | mutex_unlock(&ftrace_lock); | 3604 | mutex_unlock(&ftrace_lock); |
| 3522 | 3605 | ||
| 3523 | out_unlock: | 3606 | out_unlock: |
| 3524 | mutex_unlock(&trace_probe_ops.regex_lock); | 3607 | mutex_unlock(&trace_probe_ops.func_hash->regex_lock); |
| 3525 | free_ftrace_hash(hash); | 3608 | free_ftrace_hash(hash); |
| 3526 | } | 3609 | } |
| 3527 | 3610 | ||
| @@ -3700,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
| 3700 | return add_hash_entry(hash, ip); | 3783 | return add_hash_entry(hash, ip); |
| 3701 | } | 3784 | } |
| 3702 | 3785 | ||
| 3703 | static void ftrace_ops_update_code(struct ftrace_ops *ops) | 3786 | static void ftrace_ops_update_code(struct ftrace_ops *ops, |
| 3787 | struct ftrace_hash *old_hash) | ||
| 3704 | { | 3788 | { |
| 3705 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) | 3789 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) |
| 3706 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | 3790 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); |
| 3707 | } | 3791 | } |
| 3708 | 3792 | ||
| 3709 | static int | 3793 | static int |
| @@ -3711,18 +3795,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3711 | unsigned long ip, int remove, int reset, int enable) | 3795 | unsigned long ip, int remove, int reset, int enable) |
| 3712 | { | 3796 | { |
| 3713 | struct ftrace_hash **orig_hash; | 3797 | struct ftrace_hash **orig_hash; |
| 3798 | struct ftrace_hash *old_hash; | ||
| 3714 | struct ftrace_hash *hash; | 3799 | struct ftrace_hash *hash; |
| 3715 | int ret; | 3800 | int ret; |
| 3716 | 3801 | ||
| 3717 | if (unlikely(ftrace_disabled)) | 3802 | if (unlikely(ftrace_disabled)) |
| 3718 | return -ENODEV; | 3803 | return -ENODEV; |
| 3719 | 3804 | ||
| 3720 | mutex_lock(&ops->regex_lock); | 3805 | mutex_lock(&ops->func_hash->regex_lock); |
| 3721 | 3806 | ||
| 3722 | if (enable) | 3807 | if (enable) |
| 3723 | orig_hash = &ops->filter_hash; | 3808 | orig_hash = &ops->func_hash->filter_hash; |
| 3724 | else | 3809 | else |
| 3725 | orig_hash = &ops->notrace_hash; | 3810 | orig_hash = &ops->func_hash->notrace_hash; |
| 3726 | 3811 | ||
| 3727 | if (reset) | 3812 | if (reset) |
| 3728 | hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); | 3813 | hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); |
| @@ -3745,14 +3830,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3745 | } | 3830 | } |
| 3746 | 3831 | ||
| 3747 | mutex_lock(&ftrace_lock); | 3832 | mutex_lock(&ftrace_lock); |
| 3833 | old_hash = *orig_hash; | ||
| 3748 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3834 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
| 3749 | if (!ret) | 3835 | if (!ret) { |
| 3750 | ftrace_ops_update_code(ops); | 3836 | ftrace_ops_update_code(ops, old_hash); |
| 3751 | 3837 | free_ftrace_hash_rcu(old_hash); | |
| 3838 | } | ||
| 3752 | mutex_unlock(&ftrace_lock); | 3839 | mutex_unlock(&ftrace_lock); |
| 3753 | 3840 | ||
| 3754 | out_regex_unlock: | 3841 | out_regex_unlock: |
| 3755 | mutex_unlock(&ops->regex_lock); | 3842 | mutex_unlock(&ops->func_hash->regex_lock); |
| 3756 | 3843 | ||
| 3757 | free_ftrace_hash(hash); | 3844 | free_ftrace_hash(hash); |
| 3758 | return ret; | 3845 | return ret; |
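Usage-wise this mirrors what ftrace_set_hash() now does for both the filter and notrace hashes; the point of passing old_hash down into the code-update step is that trampolines may still be dispatching through it until the update completes.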
| @@ -3957,6 +4044,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 3957 | struct seq_file *m = (struct seq_file *)file->private_data; | 4044 | struct seq_file *m = (struct seq_file *)file->private_data; |
| 3958 | struct ftrace_iterator *iter; | 4045 | struct ftrace_iterator *iter; |
| 3959 | struct ftrace_hash **orig_hash; | 4046 | struct ftrace_hash **orig_hash; |
| 4047 | struct ftrace_hash *old_hash; | ||
| 3960 | struct trace_parser *parser; | 4048 | struct trace_parser *parser; |
| 3961 | int filter_hash; | 4049 | int filter_hash; |
| 3962 | int ret; | 4050 | int ret; |
| @@ -3975,26 +4063,28 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 3975 | 4063 | ||
| 3976 | trace_parser_put(parser); | 4064 | trace_parser_put(parser); |
| 3977 | 4065 | ||
| 3978 | mutex_lock(&iter->ops->regex_lock); | 4066 | mutex_lock(&iter->ops->func_hash->regex_lock); |
| 3979 | 4067 | ||
| 3980 | if (file->f_mode & FMODE_WRITE) { | 4068 | if (file->f_mode & FMODE_WRITE) { |
| 3981 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | 4069 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); |
| 3982 | 4070 | ||
| 3983 | if (filter_hash) | 4071 | if (filter_hash) |
| 3984 | orig_hash = &iter->ops->filter_hash; | 4072 | orig_hash = &iter->ops->func_hash->filter_hash; |
| 3985 | else | 4073 | else |
| 3986 | orig_hash = &iter->ops->notrace_hash; | 4074 | orig_hash = &iter->ops->func_hash->notrace_hash; |
| 3987 | 4075 | ||
| 3988 | mutex_lock(&ftrace_lock); | 4076 | mutex_lock(&ftrace_lock); |
| 4077 | old_hash = *orig_hash; | ||
| 3989 | ret = ftrace_hash_move(iter->ops, filter_hash, | 4078 | ret = ftrace_hash_move(iter->ops, filter_hash, |
| 3990 | orig_hash, iter->hash); | 4079 | orig_hash, iter->hash); |
| 3991 | if (!ret) | 4080 | if (!ret) { |
| 3992 | ftrace_ops_update_code(iter->ops); | 4081 | ftrace_ops_update_code(iter->ops, old_hash); |
| 3993 | 4082 | free_ftrace_hash_rcu(old_hash); | |
| 4083 | } | ||
| 3994 | mutex_unlock(&ftrace_lock); | 4084 | mutex_unlock(&ftrace_lock); |
| 3995 | } | 4085 | } |
| 3996 | 4086 | ||
| 3997 | mutex_unlock(&iter->ops->regex_lock); | 4087 | mutex_unlock(&iter->ops->func_hash->regex_lock); |
| 3998 | free_ftrace_hash(iter->hash); | 4088 | free_ftrace_hash(iter->hash); |
| 3999 | kfree(iter); | 4089 | kfree(iter); |
| 4000 | 4090 | ||
| @@ -4611,7 +4701,6 @@ void __init ftrace_init(void) | |||
| 4611 | static struct ftrace_ops global_ops = { | 4701 | static struct ftrace_ops global_ops = { |
| 4612 | .func = ftrace_stub, | 4702 | .func = ftrace_stub, |
| 4613 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, | 4703 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
| 4614 | INIT_REGEX_LOCK(global_ops) | ||
| 4615 | }; | 4704 | }; |
| 4616 | 4705 | ||
| 4617 | static int __init ftrace_nodyn_init(void) | 4706 | static int __init ftrace_nodyn_init(void) |
| @@ -4623,6 +4712,7 @@ core_initcall(ftrace_nodyn_init); | |||
| 4623 | 4712 | ||
| 4624 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4713 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
| 4625 | static inline void ftrace_startup_enable(int command) { } | 4714 | static inline void ftrace_startup_enable(int command) { } |
| 4715 | static inline void ftrace_startup_all(int command) { } | ||
| 4626 | /* Keep as macros so we do not need to define the commands */ | 4716 | /* Keep as macros so we do not need to define the commands */ |
| 4627 | # define ftrace_startup(ops, command) \ | 4717 | # define ftrace_startup(ops, command) \ |
| 4628 | ({ \ | 4718 | ({ \ |
| @@ -4713,7 +4803,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
| 4713 | static struct ftrace_ops control_ops = { | 4803 | static struct ftrace_ops control_ops = { |
| 4714 | .func = ftrace_ops_control_func, | 4804 | .func = ftrace_ops_control_func, |
| 4715 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, | 4805 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, |
| 4716 | INIT_REGEX_LOCK(control_ops) | 4806 | INIT_OPS_HASH(control_ops) |
| 4717 | }; | 4807 | }; |
| 4718 | 4808 | ||
| 4719 | static inline void | 4809 | static inline void |
| @@ -4772,6 +4862,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) | |||
| 4772 | } | 4862 | } |
| 4773 | #endif | 4863 | #endif |
| 4774 | 4864 | ||
| 4865 | /* | ||
| 4866 | * If there's only one function registered but it does not support | ||
| 4867 | * recursion, this function will be called by the mcount trampoline. | ||
| 4868 | * This function will handle recursion protection. | ||
| 4869 | */ | ||
| 4870 | static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | ||
| 4871 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 4872 | { | ||
| 4873 | int bit; | ||
| 4874 | |||
| 4875 | bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); | ||
| 4876 | if (bit < 0) | ||
| 4877 | return; | ||
| 4878 | |||
| 4879 | op->func(ip, parent_ip, op, regs); | ||
| 4880 | |||
| 4881 | trace_clear_recursion(bit); | ||
| 4882 | } | ||
| 4883 | |||
| 4884 | /** | ||
| 4885 | * ftrace_ops_get_func - get the function a trampoline should call | ||
| 4886 | * @ops: the ops to get the function for | ||
| 4887 | * | ||
| 4888 | * Normally the mcount trampoline will call the ops->func, but there | ||
| 4889 | * are times that it should not. For example, if the ops does not | ||
| 4890 | * have its own recursion protection, then it should call the | ||
| 4891 | * ftrace_ops_recurs_func() instead. | ||
| 4892 | * | ||
| 4893 | * Returns the function that the trampoline should call for @ops. | ||
| 4894 | */ | ||
| 4895 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | ||
| 4896 | { | ||
| 4897 | /* | ||
| 4898 | * If this is a dynamic ops or we force list func, | ||
| 4899 | * then it needs to call the list anyway. | ||
| 4900 | */ | ||
| 4901 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 4902 | return ftrace_ops_list_func; | ||
| 4903 | |||
| 4904 | /* | ||
| 4905 | * If the func handles its own recursion, call it directly. | ||
| 4906 | * Otherwise call the recursion protected function that | ||
| 4907 | * will call the ftrace ops function. | ||
| 4908 | */ | ||
| 4909 | if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) | ||
| 4910 | return ftrace_ops_recurs_func; | ||
| 4911 | |||
| 4912 | return ops->func; | ||
| 4913 | } | ||
| 4914 | |||
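Note: ftrace_ops_get_func() above decides which pointer an arch trampoline is wired to: the shared list walker for dynamic ops (or when FTRACE_FORCE_LIST_FUNC is set), the recursion-protected wrapper for ops not marked RECURSION_SAFE, and ops->func directly otherwise. A hedged sketch of an ops that would take the wrapper path; the callback name and body are hypothetical:

    /* Hypothetical callback without its own recursion protection. */
    static void my_callback(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* tracing work; may itself call traced functions */
    }

    static struct ftrace_ops my_ops = {
            .func = my_callback,
            /*
             * No FTRACE_OPS_FL_RECURSION_SAFE here: unless the ops ends up
             * flagged dynamic (which forces the list function instead),
             * ftrace_ops_get_func(&my_ops) returns ftrace_ops_recurs_func(),
             * which sets the per-context recursion bit before calling
             * my_callback().
             */
    };

    /* register_ftrace_function(&my_ops) would then hook it up as usual. */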
| 4775 | static void clear_ftrace_swapper(void) | 4915 | static void clear_ftrace_swapper(void) |
| 4776 | { | 4916 | { |
| 4777 | struct task_struct *p; | 4917 | struct task_struct *p; |
| @@ -4872,7 +5012,8 @@ static int ftrace_pid_add(int p) | |||
| 4872 | set_ftrace_pid_task(pid); | 5012 | set_ftrace_pid_task(pid); |
| 4873 | 5013 | ||
| 4874 | ftrace_update_pid_func(); | 5014 | ftrace_update_pid_func(); |
| 4875 | ftrace_startup_enable(0); | 5015 | |
| 5016 | ftrace_startup_all(0); | ||
| 4876 | 5017 | ||
| 4877 | mutex_unlock(&ftrace_lock); | 5018 | mutex_unlock(&ftrace_lock); |
| 4878 | return 0; | 5019 | return 0; |
| @@ -4901,7 +5042,7 @@ static void ftrace_pid_reset(void) | |||
| 4901 | } | 5042 | } |
| 4902 | 5043 | ||
| 4903 | ftrace_update_pid_func(); | 5044 | ftrace_update_pid_func(); |
| 4904 | ftrace_startup_enable(0); | 5045 | ftrace_startup_all(0); |
| 4905 | 5046 | ||
| 4906 | mutex_unlock(&ftrace_lock); | 5047 | mutex_unlock(&ftrace_lock); |
| 4907 | } | 5048 | } |
| @@ -5145,6 +5286,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
| 5145 | 5286 | ||
| 5146 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 5287 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 5147 | 5288 | ||
| 5289 | static struct ftrace_ops graph_ops = { | ||
| 5290 | .func = ftrace_stub, | ||
| 5291 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | | ||
| 5292 | FTRACE_OPS_FL_INITIALIZED | | ||
| 5293 | FTRACE_OPS_FL_STUB, | ||
| 5294 | #ifdef FTRACE_GRAPH_TRAMP_ADDR | ||
| 5295 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, | ||
| 5296 | #endif | ||
| 5297 | ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) | ||
| 5298 | }; | ||
| 5299 | |||
| 5148 | static int ftrace_graph_active; | 5300 | static int ftrace_graph_active; |
| 5149 | 5301 | ||
| 5150 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) | 5302 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) |
| @@ -5307,12 +5459,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) | |||
| 5307 | */ | 5459 | */ |
| 5308 | static void update_function_graph_func(void) | 5460 | static void update_function_graph_func(void) |
| 5309 | { | 5461 | { |
| 5310 | if (ftrace_ops_list == &ftrace_list_end || | 5462 | struct ftrace_ops *op; |
| 5311 | (ftrace_ops_list == &global_ops && | 5463 | bool do_test = false; |
| 5312 | global_ops.next == &ftrace_list_end)) | 5464 | |
| 5313 | ftrace_graph_entry = __ftrace_graph_entry; | 5465 | /* |
| 5314 | else | 5466 | * The graph and global ops share the same set of functions |
| 5467 | * to test. If any other ops is on the list, then | ||
| 5468 | * the graph tracing needs to test if its the function | ||
| 5469 | * it should call. | ||
| 5470 | */ | ||
| 5471 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 5472 | if (op != &global_ops && op != &graph_ops && | ||
| 5473 | op != &ftrace_list_end) { | ||
| 5474 | do_test = true; | ||
| 5475 | /* in double loop, break out with goto */ | ||
| 5476 | goto out; | ||
| 5477 | } | ||
| 5478 | } while_for_each_ftrace_op(op); | ||
| 5479 | out: | ||
| 5480 | if (do_test) | ||
| 5315 | ftrace_graph_entry = ftrace_graph_entry_test; | 5481 | ftrace_graph_entry = ftrace_graph_entry_test; |
| 5482 | else | ||
| 5483 | ftrace_graph_entry = __ftrace_graph_entry; | ||
| 5316 | } | 5484 | } |
| 5317 | 5485 | ||
| 5318 | static struct notifier_block ftrace_suspend_notifier = { | 5486 | static struct notifier_block ftrace_suspend_notifier = { |
| @@ -5353,16 +5521,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 5353 | ftrace_graph_entry = ftrace_graph_entry_test; | 5521 | ftrace_graph_entry = ftrace_graph_entry_test; |
| 5354 | update_function_graph_func(); | 5522 | update_function_graph_func(); |
| 5355 | 5523 | ||
| 5356 | /* Function graph doesn't use the .func field of global_ops */ | 5524 | ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); |
| 5357 | global_ops.flags |= FTRACE_OPS_FL_STUB; | ||
| 5358 | |||
| 5359 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 5360 | /* Optimize function graph calling (if implemented by arch) */ | ||
| 5361 | if (FTRACE_GRAPH_TRAMP_ADDR != 0) | ||
| 5362 | global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; | ||
| 5363 | #endif | ||
| 5364 | |||
| 5365 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | ||
| 5366 | 5525 | ||
| 5367 | out: | 5526 | out: |
| 5368 | mutex_unlock(&ftrace_lock); | 5527 | mutex_unlock(&ftrace_lock); |
| @@ -5380,12 +5539,7 @@ void unregister_ftrace_graph(void) | |||
| 5380 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5539 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
| 5381 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5540 | ftrace_graph_entry = ftrace_graph_entry_stub; |
| 5382 | __ftrace_graph_entry = ftrace_graph_entry_stub; | 5541 | __ftrace_graph_entry = ftrace_graph_entry_stub; |
| 5383 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); | 5542 | ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); |
| 5384 | global_ops.flags &= ~FTRACE_OPS_FL_STUB; | ||
| 5385 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 5386 | if (FTRACE_GRAPH_TRAMP_ADDR != 0) | ||
| 5387 | global_ops.trampoline = 0; | ||
| 5388 | #endif | ||
| 5389 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5543 | unregister_pm_notifier(&ftrace_suspend_notifier); |
| 5390 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5544 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
| 5391 | 5545 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 925f629658d6..a56e07c8d15b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work) | |||
| 538 | * ring_buffer_wait - wait for input to the ring buffer | 538 | * ring_buffer_wait - wait for input to the ring buffer |
| 539 | * @buffer: buffer to wait on | 539 | * @buffer: buffer to wait on |
| 540 | * @cpu: the cpu buffer to wait on | 540 | * @cpu: the cpu buffer to wait on |
| 541 | * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS | ||
| 541 | * | 542 | * |
| 542 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon | 543 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon |
| 543 | * as data is added to any of the @buffer's cpu buffers. Otherwise | 544 | * as data is added to any of the @buffer's cpu buffers. Otherwise |
| 544 | * it will wait for data to be added to a specific cpu buffer. | 545 | * it will wait for data to be added to a specific cpu buffer. |
| 545 | */ | 546 | */ |
| 546 | int ring_buffer_wait(struct ring_buffer *buffer, int cpu) | 547 | int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) |
| 547 | { | 548 | { |
| 548 | struct ring_buffer_per_cpu *cpu_buffer; | 549 | struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); |
| 549 | DEFINE_WAIT(wait); | 550 | DEFINE_WAIT(wait); |
| 550 | struct rb_irq_work *work; | 551 | struct rb_irq_work *work; |
| 552 | int ret = 0; | ||
| 551 | 553 | ||
| 552 | /* | 554 | /* |
| 553 | * Depending on what the caller is waiting for, either any | 555 | * Depending on what the caller is waiting for, either any |
| @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) | |||
| 564 | } | 566 | } |
| 565 | 567 | ||
| 566 | 568 | ||
| 567 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | 569 | while (true) { |
| 570 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | ||
| 568 | 571 | ||
| 569 | /* | 572 | /* |
| 570 | * The events can happen in critical sections where | 573 | * The events can happen in critical sections where |
| 571 | * checking a work queue can cause deadlocks. | 574 | * checking a work queue can cause deadlocks. |
| 572 | * After adding a task to the queue, this flag is set | 575 | * After adding a task to the queue, this flag is set |
| 573 | * only to notify events to try to wake up the queue | 576 | * only to notify events to try to wake up the queue |
| 574 | * using irq_work. | 577 | * using irq_work. |
| 575 | * | 578 | * |
| 576 | * We don't clear it even if the buffer is no longer | 579 | * We don't clear it even if the buffer is no longer |
| 577 | * empty. The flag only causes the next event to run | 580 | * empty. The flag only causes the next event to run |
| 578 | * irq_work to do the work queue wake up. The worse | 581 | * irq_work to do the work queue wake up. The worse |
| 579 | * that can happen if we race with !trace_empty() is that | 582 | * that can happen if we race with !trace_empty() is that |
| 580 | * an event will cause an irq_work to try to wake up | 583 | * an event will cause an irq_work to try to wake up |
| 581 | * an empty queue. | 584 | * an empty queue. |
| 582 | * | 585 | * |
| 583 | * There's no reason to protect this flag either, as | 586 | * There's no reason to protect this flag either, as |
| 584 | * the work queue and irq_work logic will do the necessary | 587 | * the work queue and irq_work logic will do the necessary |
| 585 | * synchronization for the wake ups. The only thing | 588 | * synchronization for the wake ups. The only thing |
| 586 | * that is necessary is that the wake up happens after | 589 | * that is necessary is that the wake up happens after |
| 587 | * a task has been queued. It's OK for spurious wake ups. | 590 | * a task has been queued. It's OK for spurious wake ups. |
| 588 | */ | 591 | */ |
| 589 | work->waiters_pending = true; | 592 | work->waiters_pending = true; |
| 593 | |||
| 594 | if (signal_pending(current)) { | ||
| 595 | ret = -EINTR; | ||
| 596 | break; | ||
| 597 | } | ||
| 598 | |||
| 599 | if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) | ||
| 600 | break; | ||
| 601 | |||
| 602 | if (cpu != RING_BUFFER_ALL_CPUS && | ||
| 603 | !ring_buffer_empty_cpu(buffer, cpu)) { | ||
| 604 | unsigned long flags; | ||
| 605 | bool pagebusy; | ||
| 606 | |||
| 607 | if (!full) | ||
| 608 | break; | ||
| 609 | |||
| 610 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
| 611 | pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; | ||
| 612 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
| 613 | |||
| 614 | if (!pagebusy) | ||
| 615 | break; | ||
| 616 | } | ||
| 590 | 617 | ||
| 591 | if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || | ||
| 592 | (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) | ||
| 593 | schedule(); | 618 | schedule(); |
| 619 | } | ||
| 594 | 620 | ||
| 595 | finish_wait(&work->waiters, &wait); | 621 | finish_wait(&work->waiters, &wait); |
| 596 | return 0; | 622 | |
| 623 | return ret; | ||
| 597 | } | 624 | } |
| 598 | 625 | ||
| 599 | /** | 626 | /** |
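Note: ring_buffer_wait() now loops re-checking its wake-up condition, returns -EINTR when a signal is pending, and takes a third argument selecting whether a per-cpu waiter should wake on any data or only once a full page is available. A hedged sketch of a caller under the new signature; 'buffer' is assumed to be a valid struct ring_buffer pointer:

    /*
     * Illustrative caller.  Passing full=true only matters for a specific
     * cpu; with RING_BUFFER_ALL_CPUS the task wakes as soon as any cpu
     * buffer has data.
     */
    static int example_wait_for_page(struct ring_buffer *buffer, int cpu)
    {
            int ret = ring_buffer_wait(buffer, cpu, true);

            if (ret == -EINTR)      /* a pending signal now aborts the wait */
                    return ret;
            return 0;
    }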
| @@ -626,8 +653,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, | |||
| 626 | work = &cpu_buffer->irq_work; | 653 | work = &cpu_buffer->irq_work; |
| 627 | } | 654 | } |
| 628 | 655 | ||
| 629 | work->waiters_pending = true; | ||
| 630 | poll_wait(filp, &work->waiters, poll_table); | 656 | poll_wait(filp, &work->waiters, poll_table); |
| 657 | work->waiters_pending = true; | ||
| 658 | /* | ||
| 659 | * There's a tight race between setting the waiters_pending and | ||
| 660 | * checking if the ring buffer is empty. Once the waiters_pending bit | ||
| 661 | * is set, the next event will wake the task up, but we can get stuck | ||
| 662 | * if there's only a single event in. | ||
| 663 | * | ||
| 664 | * FIXME: Ideally, we need a memory barrier on the writer side as well, | ||
| 665 | * but adding a memory barrier to all events will cause too much of a | ||
| 666 | * performance hit in the fast path. We only need a memory barrier when | ||
| 667 | * the buffer goes from empty to having content. But as this race is | ||
| 668 | * extremely small, and it's not a problem if another event comes in, we | ||
| 669 | * will fix it later. | ||
| 670 | */ | ||
| 671 | smp_mb(); | ||
| 631 | 672 | ||
| 632 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || | 673 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || |
| 633 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) | 674 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) |
| @@ -1968,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | |||
| 1968 | 2009 | ||
| 1969 | /** | 2010 | /** |
| 1970 | * rb_update_event - update event type and data | 2011 | * rb_update_event - update event type and data |
| 1971 | * @event: the even to update | 2012 | * @event: the event to update |
| 1972 | * @type: the type of event | 2013 | * @type: the type of event |
| 1973 | * @length: the size of the event field in the ring buffer | 2014 | * @length: the size of the event field in the ring buffer |
| 1974 | * | 2015 | * |
| @@ -3341,21 +3382,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) | |||
| 3341 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3382 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
| 3342 | 3383 | ||
| 3343 | /* Iterator usage is expected to have record disabled */ | 3384 | /* Iterator usage is expected to have record disabled */ |
| 3344 | if (list_empty(&cpu_buffer->reader_page->list)) { | 3385 | iter->head_page = cpu_buffer->reader_page; |
| 3345 | iter->head_page = rb_set_head_page(cpu_buffer); | 3386 | iter->head = cpu_buffer->reader_page->read; |
| 3346 | if (unlikely(!iter->head_page)) | 3387 | |
| 3347 | return; | 3388 | iter->cache_reader_page = iter->head_page; |
| 3348 | iter->head = iter->head_page->read; | 3389 | iter->cache_read = cpu_buffer->read; |
| 3349 | } else { | 3390 | |
| 3350 | iter->head_page = cpu_buffer->reader_page; | ||
| 3351 | iter->head = cpu_buffer->reader_page->read; | ||
| 3352 | } | ||
| 3353 | if (iter->head) | 3391 | if (iter->head) |
| 3354 | iter->read_stamp = cpu_buffer->read_stamp; | 3392 | iter->read_stamp = cpu_buffer->read_stamp; |
| 3355 | else | 3393 | else |
| 3356 | iter->read_stamp = iter->head_page->page->time_stamp; | 3394 | iter->read_stamp = iter->head_page->page->time_stamp; |
| 3357 | iter->cache_reader_page = cpu_buffer->reader_page; | ||
| 3358 | iter->cache_read = cpu_buffer->read; | ||
| 3359 | } | 3395 | } |
| 3360 | 3396 | ||
| 3361 | /** | 3397 | /** |
| @@ -3748,12 +3784,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
| 3748 | return NULL; | 3784 | return NULL; |
| 3749 | 3785 | ||
| 3750 | /* | 3786 | /* |
| 3751 | * We repeat when a time extend is encountered. | 3787 | * We repeat when a time extend is encountered or we hit |
| 3752 | * Since the time extend is always attached to a data event, | 3788 | * the end of the page. Since the time extend is always attached |
| 3753 | * we should never loop more than once. | 3789 | * to a data event, we should never loop more than three times. |
| 3754 | * (We never hit the following condition more than twice). | 3790 | * Once for going to next page, once on time extend, and |
| 3791 | * finally once to get the event. | ||
| 3792 | * (We never hit the following condition more than thrice). | ||
| 3755 | */ | 3793 | */ |
| 3756 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) | 3794 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) |
| 3757 | return NULL; | 3795 | return NULL; |
| 3758 | 3796 | ||
| 3759 | if (rb_per_cpu_empty(cpu_buffer)) | 3797 | if (rb_per_cpu_empty(cpu_buffer)) |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) | |||
| 205 | break; | 205 | break; |
| 206 | 206 | ||
| 207 | schedule(); | 207 | schedule(); |
| 208 | __set_current_state(TASK_RUNNING); | ||
| 209 | } | 208 | } |
| 210 | reader_finish = 0; | 209 | reader_finish = 0; |
| 211 | complete(&read_done); | 210 | complete(&read_done); |
| @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) | |||
| 379 | break; | 378 | break; |
| 380 | 379 | ||
| 381 | schedule(); | 380 | schedule(); |
| 382 | __set_current_state(TASK_RUNNING); | ||
| 383 | } | 381 | } |
| 384 | __set_current_state(TASK_RUNNING); | 382 | __set_current_state(TASK_RUNNING); |
| 385 | 383 | ||
| @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) | |||
| 407 | trace_printk("Sleeping for 10 secs\n"); | 405 | trace_printk("Sleeping for 10 secs\n"); |
| 408 | set_current_state(TASK_INTERRUPTIBLE); | 406 | set_current_state(TASK_INTERRUPTIBLE); |
| 409 | schedule_timeout(HZ * SLEEP_TIME); | 407 | schedule_timeout(HZ * SLEEP_TIME); |
| 410 | __set_current_state(TASK_RUNNING); | ||
| 411 | } | 408 | } |
| 412 | 409 | ||
| 413 | if (kill_test) | 410 | if (kill_test) |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3ef80c8914c..0fa2d2070bd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1076 | } | 1076 | } |
| 1077 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 1077 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
| 1078 | 1078 | ||
| 1079 | static int wait_on_pipe(struct trace_iterator *iter) | 1079 | static int wait_on_pipe(struct trace_iterator *iter, bool full) |
| 1080 | { | 1080 | { |
| 1081 | /* Iterators are static, they should be filled or empty */ | 1081 | /* Iterators are static, they should be filled or empty */ |
| 1082 | if (trace_buffer_iter(iter, iter->cpu_file)) | 1082 | if (trace_buffer_iter(iter, iter->cpu_file)) |
| 1083 | return 0; | 1083 | return 0; |
| 1084 | 1084 | ||
| 1085 | return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); | 1085 | return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, |
| 1086 | full); | ||
| 1086 | } | 1087 | } |
| 1087 | 1088 | ||
| 1088 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1089 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
| @@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 4434 | 4435 | ||
| 4435 | mutex_unlock(&iter->mutex); | 4436 | mutex_unlock(&iter->mutex); |
| 4436 | 4437 | ||
| 4437 | ret = wait_on_pipe(iter); | 4438 | ret = wait_on_pipe(iter, false); |
| 4438 | 4439 | ||
| 4439 | mutex_lock(&iter->mutex); | 4440 | mutex_lock(&iter->mutex); |
| 4440 | 4441 | ||
| 4441 | if (ret) | 4442 | if (ret) |
| 4442 | return ret; | 4443 | return ret; |
| 4443 | |||
| 4444 | if (signal_pending(current)) | ||
| 4445 | return -EINTR; | ||
| 4446 | } | 4444 | } |
| 4447 | 4445 | ||
| 4448 | return 1; | 4446 | return 1; |
| @@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5372 | goto out_unlock; | 5370 | goto out_unlock; |
| 5373 | } | 5371 | } |
| 5374 | mutex_unlock(&trace_types_lock); | 5372 | mutex_unlock(&trace_types_lock); |
| 5375 | ret = wait_on_pipe(iter); | 5373 | ret = wait_on_pipe(iter, false); |
| 5376 | mutex_lock(&trace_types_lock); | 5374 | mutex_lock(&trace_types_lock); |
| 5377 | if (ret) { | 5375 | if (ret) { |
| 5378 | size = ret; | 5376 | size = ret; |
| 5379 | goto out_unlock; | 5377 | goto out_unlock; |
| 5380 | } | 5378 | } |
| 5381 | if (signal_pending(current)) { | ||
| 5382 | size = -EINTR; | ||
| 5383 | goto out_unlock; | ||
| 5384 | } | ||
| 5385 | goto again; | 5379 | goto again; |
| 5386 | } | 5380 | } |
| 5387 | size = 0; | 5381 | size = 0; |
| @@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5500 | }; | 5494 | }; |
| 5501 | struct buffer_ref *ref; | 5495 | struct buffer_ref *ref; |
| 5502 | int entries, size, i; | 5496 | int entries, size, i; |
| 5503 | ssize_t ret; | 5497 | ssize_t ret = 0; |
| 5504 | 5498 | ||
| 5505 | mutex_lock(&trace_types_lock); | 5499 | mutex_lock(&trace_types_lock); |
| 5506 | 5500 | ||
| @@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5538 | int r; | 5532 | int r; |
| 5539 | 5533 | ||
| 5540 | ref = kzalloc(sizeof(*ref), GFP_KERNEL); | 5534 | ref = kzalloc(sizeof(*ref), GFP_KERNEL); |
| 5541 | if (!ref) | 5535 | if (!ref) { |
| 5536 | ret = -ENOMEM; | ||
| 5542 | break; | 5537 | break; |
| 5538 | } | ||
| 5543 | 5539 | ||
| 5544 | ref->ref = 1; | 5540 | ref->ref = 1; |
| 5545 | ref->buffer = iter->trace_buffer->buffer; | 5541 | ref->buffer = iter->trace_buffer->buffer; |
| 5546 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); | 5542 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); |
| 5547 | if (!ref->page) { | 5543 | if (!ref->page) { |
| 5544 | ret = -ENOMEM; | ||
| 5548 | kfree(ref); | 5545 | kfree(ref); |
| 5549 | break; | 5546 | break; |
| 5550 | } | 5547 | } |
| @@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5582 | 5579 | ||
| 5583 | /* did we read anything? */ | 5580 | /* did we read anything? */ |
| 5584 | if (!spd.nr_pages) { | 5581 | if (!spd.nr_pages) { |
| 5582 | if (ret) | ||
| 5583 | goto out; | ||
| 5584 | |||
| 5585 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { | 5585 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { |
| 5586 | ret = -EAGAIN; | 5586 | ret = -EAGAIN; |
| 5587 | goto out; | 5587 | goto out; |
| 5588 | } | 5588 | } |
| 5589 | mutex_unlock(&trace_types_lock); | 5589 | mutex_unlock(&trace_types_lock); |
| 5590 | ret = wait_on_pipe(iter); | 5590 | ret = wait_on_pipe(iter, true); |
| 5591 | mutex_lock(&trace_types_lock); | 5591 | mutex_lock(&trace_types_lock); |
| 5592 | if (ret) | 5592 | if (ret) |
| 5593 | goto out; | 5593 | goto out; |
| 5594 | if (signal_pending(current)) { | 5594 | |
| 5595 | ret = -EINTR; | ||
| 5596 | goto out; | ||
| 5597 | } | ||
| 5598 | goto again; | 5595 | goto again; |
| 5599 | } | 5596 | } |
| 5600 | 5597 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ef06ce7e9cf8..0cc51edde3a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -2513,8 +2513,11 @@ static __init int event_test_thread(void *unused) | |||
| 2513 | kfree(test_malloc); | 2513 | kfree(test_malloc); |
| 2514 | 2514 | ||
| 2515 | set_current_state(TASK_INTERRUPTIBLE); | 2515 | set_current_state(TASK_INTERRUPTIBLE); |
| 2516 | while (!kthread_should_stop()) | 2516 | while (!kthread_should_stop()) { |
| 2517 | schedule(); | 2517 | schedule(); |
| 2518 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 2519 | } | ||
| 2520 | __set_current_state(TASK_RUNNING); | ||
| 2518 | 2521 | ||
| 2519 | return 0; | 2522 | return 0; |
| 2520 | } | 2523 | } |
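Note: the event test thread above (and the selftest threads converted further below) now uses the canonical kthread sleep loop: set TASK_INTERRUPTIBLE before testing the stop condition on every iteration, and restore TASK_RUNNING when the loop exits, so a wake-up racing with the check is never lost. A minimal sketch of the idiom; the function name is hypothetical:

    static int example_thread(void *unused)
    {
            set_current_state(TASK_INTERRUPTIBLE);
            while (!kthread_should_stop()) {
                    schedule();
                    /* re-arm the state before re-checking the condition */
                    set_current_state(TASK_INTERRUPTIBLE);
            }
            __set_current_state(TASK_RUNNING);
            return 0;
    }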
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..b0f86ea77881 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 382 | 382 | ||
| 383 | /* check the trace buffer */ | 383 | /* check the trace buffer */ |
| 384 | ret = trace_test_buffer(&tr->trace_buffer, &count); | 384 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
| 385 | |||
| 386 | ftrace_enabled = 1; | ||
| 385 | tracing_start(); | 387 | tracing_start(); |
| 386 | 388 | ||
| 387 | /* we should only have one item */ | 389 | /* we should only have one item */ |
| @@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
| 679 | 681 | ||
| 680 | /* check the trace buffer */ | 682 | /* check the trace buffer */ |
| 681 | ret = trace_test_buffer(&tr->trace_buffer, &count); | 683 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
| 684 | |||
| 685 | ftrace_enabled = 1; | ||
| 682 | trace->reset(tr); | 686 | trace->reset(tr); |
| 683 | tracing_start(); | 687 | tracing_start(); |
| 684 | 688 | ||
| @@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
| 1025 | #endif | 1029 | #endif |
| 1026 | 1030 | ||
| 1027 | #ifdef CONFIG_SCHED_TRACER | 1031 | #ifdef CONFIG_SCHED_TRACER |
| 1032 | |||
| 1033 | struct wakeup_test_data { | ||
| 1034 | struct completion is_ready; | ||
| 1035 | int go; | ||
| 1036 | }; | ||
| 1037 | |||
| 1028 | static int trace_wakeup_test_thread(void *data) | 1038 | static int trace_wakeup_test_thread(void *data) |
| 1029 | { | 1039 | { |
| 1030 | /* Make this a -deadline thread */ | 1040 | /* Make this a -deadline thread */ |
| @@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data) | |||
| 1034 | .sched_deadline = 10000000ULL, | 1044 | .sched_deadline = 10000000ULL, |
| 1035 | .sched_period = 10000000ULL | 1045 | .sched_period = 10000000ULL |
| 1036 | }; | 1046 | }; |
| 1037 | struct completion *x = data; | 1047 | struct wakeup_test_data *x = data; |
| 1038 | 1048 | ||
| 1039 | sched_setattr(current, &attr); | 1049 | sched_setattr(current, &attr); |
| 1040 | 1050 | ||
| 1041 | /* Make it know we have a new prio */ | 1051 | /* Make it know we have a new prio */ |
| 1042 | complete(x); | 1052 | complete(&x->is_ready); |
| 1043 | 1053 | ||
| 1044 | /* now go to sleep and let the test wake us up */ | 1054 | /* now go to sleep and let the test wake us up */ |
| 1045 | set_current_state(TASK_INTERRUPTIBLE); | 1055 | set_current_state(TASK_INTERRUPTIBLE); |
| 1046 | schedule(); | 1056 | while (!x->go) { |
| 1057 | schedule(); | ||
| 1058 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1059 | } | ||
| 1047 | 1060 | ||
| 1048 | complete(x); | 1061 | complete(&x->is_ready); |
| 1062 | |||
| 1063 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1049 | 1064 | ||
| 1050 | /* we are awake, now wait to disappear */ | 1065 | /* we are awake, now wait to disappear */ |
| 1051 | while (!kthread_should_stop()) { | 1066 | while (!kthread_should_stop()) { |
| 1052 | /* | 1067 | schedule(); |
| 1053 | * This will likely be the system top priority | 1068 | set_current_state(TASK_INTERRUPTIBLE); |
| 1054 | * task, do short sleeps to let others run. | ||
| 1055 | */ | ||
| 1056 | msleep(100); | ||
| 1057 | } | 1069 | } |
| 1058 | 1070 | ||
| 1071 | __set_current_state(TASK_RUNNING); | ||
| 1072 | |||
| 1059 | return 0; | 1073 | return 0; |
| 1060 | } | 1074 | } |
| 1061 | |||
| 1062 | int | 1075 | int |
| 1063 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | 1076 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) |
| 1064 | { | 1077 | { |
| 1065 | unsigned long save_max = tr->max_latency; | 1078 | unsigned long save_max = tr->max_latency; |
| 1066 | struct task_struct *p; | 1079 | struct task_struct *p; |
| 1067 | struct completion is_ready; | 1080 | struct wakeup_test_data data; |
| 1068 | unsigned long count; | 1081 | unsigned long count; |
| 1069 | int ret; | 1082 | int ret; |
| 1070 | 1083 | ||
| 1071 | init_completion(&is_ready); | 1084 | memset(&data, 0, sizeof(data)); |
| 1085 | |||
| 1086 | init_completion(&data.is_ready); | ||
| 1072 | 1087 | ||
| 1073 | /* create a -deadline thread */ | 1088 | /* create a -deadline thread */ |
| 1074 | p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); | 1089 | p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); |
| 1075 | if (IS_ERR(p)) { | 1090 | if (IS_ERR(p)) { |
| 1076 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); | 1091 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); |
| 1077 | return -1; | 1092 | return -1; |
| 1078 | } | 1093 | } |
| 1079 | 1094 | ||
| 1080 | /* make sure the thread is running at -deadline policy */ | 1095 | /* make sure the thread is running at -deadline policy */ |
| 1081 | wait_for_completion(&is_ready); | 1096 | wait_for_completion(&data.is_ready); |
| 1082 | 1097 | ||
| 1083 | /* start the tracing */ | 1098 | /* start the tracing */ |
| 1084 | ret = tracer_init(trace, tr); | 1099 | ret = tracer_init(trace, tr); |
| @@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1099 | msleep(100); | 1114 | msleep(100); |
| 1100 | } | 1115 | } |
| 1101 | 1116 | ||
| 1102 | init_completion(&is_ready); | 1117 | init_completion(&data.is_ready); |
| 1118 | |||
| 1119 | data.go = 1; | ||
| 1120 | /* memory barrier is in the wake_up_process() */ | ||
| 1103 | 1121 | ||
| 1104 | wake_up_process(p); | 1122 | wake_up_process(p); |
| 1105 | 1123 | ||
| 1106 | /* Wait for the task to wake up */ | 1124 | /* Wait for the task to wake up */ |
| 1107 | wait_for_completion(&is_ready); | 1125 | wait_for_completion(&data.is_ready); |
| 1108 | 1126 | ||
| 1109 | /* stop the tracing. */ | 1127 | /* stop the tracing. */ |
| 1110 | tracing_stop(); | 1128 | tracing_stop(); |
| 1111 | /* check both trace buffers */ | 1129 | /* check both trace buffers */ |
| 1112 | ret = trace_test_buffer(&tr->trace_buffer, NULL); | 1130 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
| 1113 | printk("ret = %d\n", ret); | ||
| 1114 | if (!ret) | 1131 | if (!ret) |
| 1115 | ret = trace_test_buffer(&tr->max_buffer, &count); | 1132 | ret = trace_test_buffer(&tr->max_buffer, &count); |
| 1116 | 1133 | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
| 16 | #include <linux/magic.h> | ||
| 17 | 16 | ||
| 18 | #include <asm/setup.h> | 17 | #include <asm/setup.h> |
| 19 | 18 | ||
| @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
| 171 | i++; | 170 | i++; |
| 172 | } | 171 | } |
| 173 | 172 | ||
| 174 | if ((current != &init_task && | 173 | if (task_stack_end_corrupted(current)) { |
| 175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { | ||
| 176 | print_max_stack(); | 174 | print_max_stack(); |
| 177 | BUG(); | 175 | BUG(); |
| 178 | } | 176 | } |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 759d5e004517..29228c4d5696 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
| 313 | int size; | 313 | int size; |
| 314 | 314 | ||
| 315 | syscall_nr = trace_get_syscall_nr(current, regs); | 315 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 316 | if (syscall_nr < 0) | 316 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 317 | return; | 317 | return; |
| 318 | 318 | ||
| 319 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ | 319 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ |
| @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
| 360 | int syscall_nr; | 360 | int syscall_nr; |
| 361 | 361 | ||
| 362 | syscall_nr = trace_get_syscall_nr(current, regs); | 362 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 363 | if (syscall_nr < 0) | 363 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 364 | return; | 364 | return; |
| 365 | 365 | ||
| 366 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ | 366 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ |
| @@ -425,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, | |||
| 425 | return; | 425 | return; |
| 426 | mutex_lock(&syscall_trace_lock); | 426 | mutex_lock(&syscall_trace_lock); |
| 427 | tr->sys_refcount_enter--; | 427 | tr->sys_refcount_enter--; |
| 428 | rcu_assign_pointer(tr->enter_syscall_files[num], NULL); | 428 | RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); |
| 429 | if (!tr->sys_refcount_enter) | 429 | if (!tr->sys_refcount_enter) |
| 430 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); | 430 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); |
| 431 | mutex_unlock(&syscall_trace_lock); | 431 | mutex_unlock(&syscall_trace_lock); |
| @@ -463,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
| 463 | return; | 463 | return; |
| 464 | mutex_lock(&syscall_trace_lock); | 464 | mutex_lock(&syscall_trace_lock); |
| 465 | tr->sys_refcount_exit--; | 465 | tr->sys_refcount_exit--; |
| 466 | rcu_assign_pointer(tr->exit_syscall_files[num], NULL); | 466 | RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); |
| 467 | if (!tr->sys_refcount_exit) | 467 | if (!tr->sys_refcount_exit) |
| 468 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); | 468 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); |
| 469 | mutex_unlock(&syscall_trace_lock); | 469 | mutex_unlock(&syscall_trace_lock); |
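Note: the two hunks above replace rcu_assign_pointer(p, NULL) with RCU_INIT_POINTER(p, NULL). When the new value is NULL there is nothing a reader could dereference, so the publish ordering that rcu_assign_pointer() provides buys nothing; RCU_INIT_POINTER() documents that and skips the extra cost. A hedged sketch of the two cases (the registration side shown here is not part of this diff and is assumed to follow the usual pattern):

    /* Publishing a live object: readers may dereference it immediately,
     * so the ordering in rcu_assign_pointer() is required. */
    rcu_assign_pointer(tr->enter_syscall_files[num], file);

    /* Clearing the slot: NULL carries nothing to order against,
     * so RCU_INIT_POINTER() is sufficient (and cheaper). */
    RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);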
| @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 567 | int size; | 567 | int size; |
| 568 | 568 | ||
| 569 | syscall_nr = trace_get_syscall_nr(current, regs); | 569 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 570 | if (syscall_nr < 0) | 570 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 571 | return; | 571 | return; |
| 572 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) | 572 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
| 573 | return; | 573 | return; |
| @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 641 | int size; | 641 | int size; |
| 642 | 642 | ||
| 643 | syscall_nr = trace_get_syscall_nr(current, regs); | 643 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 644 | if (syscall_nr < 0) | 644 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 645 | return; | 645 | return; |
| 646 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) | 646 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
| 647 | return; | 647 | return; |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
| @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); | |||
| 14 | void user_return_notifier_register(struct user_return_notifier *urn) | 14 | void user_return_notifier_register(struct user_return_notifier *urn) |
| 15 | { | 15 | { |
| 16 | set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); | 16 | set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); |
| 17 | hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); | 17 | hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); |
| 18 | } | 18 | } |
| 19 | EXPORT_SYMBOL_GPL(user_return_notifier_register); | 19 | EXPORT_SYMBOL_GPL(user_return_notifier_register); |
| 20 | 20 | ||
| @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); | |||
| 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) | 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) |
| 26 | { | 26 | { |
| 27 | hlist_del(&urn->link); | 27 | hlist_del(&urn->link); |
| 28 | if (hlist_empty(&__get_cpu_var(return_notifier_list))) | 28 | if (hlist_empty(this_cpu_ptr(&return_notifier_list))) |
| 29 | clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); | 29 | clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); |
| 30 | } | 30 | } |
| 31 | EXPORT_SYMBOL_GPL(user_return_notifier_unregister); | 31 | EXPORT_SYMBOL_GPL(user_return_notifier_unregister); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) | |||
| 526 | return; | 526 | return; |
| 527 | } | 527 | } |
| 528 | 528 | ||
| 529 | struct seq_operations proc_uid_seq_operations = { | 529 | const struct seq_operations proc_uid_seq_operations = { |
| 530 | .start = uid_m_start, | 530 | .start = uid_m_start, |
| 531 | .stop = m_stop, | 531 | .stop = m_stop, |
| 532 | .next = m_next, | 532 | .next = m_next, |
| 533 | .show = uid_m_show, | 533 | .show = uid_m_show, |
| 534 | }; | 534 | }; |
| 535 | 535 | ||
| 536 | struct seq_operations proc_gid_seq_operations = { | 536 | const struct seq_operations proc_gid_seq_operations = { |
| 537 | .start = gid_m_start, | 537 | .start = gid_m_start, |
| 538 | .stop = m_stop, | 538 | .stop = m_stop, |
| 539 | .next = m_next, | 539 | .next = m_next, |
| 540 | .show = gid_m_show, | 540 | .show = gid_m_show, |
| 541 | }; | 541 | }; |
| 542 | 542 | ||
| 543 | struct seq_operations proc_projid_seq_operations = { | 543 | const struct seq_operations proc_projid_seq_operations = { |
| 544 | .start = projid_m_start, | 544 | .start = projid_m_start, |
| 545 | .stop = m_stop, | 545 | .stop = m_stop, |
| 546 | .next = m_next, | 546 | .next = m_next, |
diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..883aaaa7de8a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) | |||
| 93 | struct uts_namespace *ns = NULL; | 93 | struct uts_namespace *ns = NULL; |
| 94 | struct nsproxy *nsproxy; | 94 | struct nsproxy *nsproxy; |
| 95 | 95 | ||
| 96 | rcu_read_lock(); | 96 | task_lock(task); |
| 97 | nsproxy = task_nsproxy(task); | 97 | nsproxy = task->nsproxy; |
| 98 | if (nsproxy) { | 98 | if (nsproxy) { |
| 99 | ns = nsproxy->uts_ns; | 99 | ns = nsproxy->uts_ns; |
| 100 | get_uts_ns(ns); | 100 | get_uts_ns(ns); |
| 101 | } | 101 | } |
| 102 | rcu_read_unlock(); | 102 | task_unlock(task); |
| 103 | 103 | ||
| 104 | return ns; | 104 | return ns; |
| 105 | } | 105 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..70bf11815f84 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -15,11 +15,6 @@ | |||
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/nmi.h> | 16 | #include <linux/nmi.h> |
| 17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
| 18 | #include <linux/delay.h> | ||
| 19 | #include <linux/freezer.h> | ||
| 20 | #include <linux/kthread.h> | ||
| 21 | #include <linux/lockdep.h> | ||
| 22 | #include <linux/notifier.h> | ||
| 23 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 24 | #include <linux/sysctl.h> | 19 | #include <linux/sysctl.h> |
| 25 | #include <linux/smpboot.h> | 20 | #include <linux/smpboot.h> |
| @@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); | |||
| 47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 42 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
| 48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 43 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
| 49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 44 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
| 45 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | ||
| 50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 46 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 51 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 47 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
| 52 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 48 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
| @@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn; | |||
| 63 | static int hardlockup_panic = | 59 | static int hardlockup_panic = |
| 64 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
| 65 | 61 | ||
| 62 | static bool hardlockup_detector_enabled = true; | ||
| 63 | /* | ||
| 64 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 65 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 66 | * cases this function can be called to disable hard lockup detection. This | ||
| 67 | * function should only be executed once by the boot processor before the | ||
| 68 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 69 | * possible to override this in hardlockup_panic_setup(). | ||
| 70 | */ | ||
| 71 | void watchdog_enable_hardlockup_detector(bool val) | ||
| 72 | { | ||
| 73 | hardlockup_detector_enabled = val; | ||
| 74 | } | ||
| 75 | |||
| 76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 77 | { | ||
| 78 | return hardlockup_detector_enabled; | ||
| 79 | } | ||
| 80 | |||
| 66 | static int __init hardlockup_panic_setup(char *str) | 81 | static int __init hardlockup_panic_setup(char *str) |
| 67 | { | 82 | { |
| 68 | if (!strncmp(str, "panic", 5)) | 83 | if (!strncmp(str, "panic", 5)) |
| @@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 71 | hardlockup_panic = 0; | 86 | hardlockup_panic = 0; |
| 72 | else if (!strncmp(str, "0", 1)) | 87 | else if (!strncmp(str, "0", 1)) |
| 73 | watchdog_user_enabled = 0; | 88 | watchdog_user_enabled = 0; |
| 89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | ||
| 90 | /* | ||
| 91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
| 92 | * has the same effect. | ||
| 93 | */ | ||
| 94 | watchdog_user_enabled = 1; | ||
| 95 | watchdog_enable_hardlockup_detector(true); | ||
| 96 | } | ||
| 74 | return 1; | 97 | return 1; |
| 75 | } | 98 | } |
| 76 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 99 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
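The new hardlockup_detector_enabled flag and its accessors let early boot code default hard lockup detection to off (for example on a hypervisor guest), while the extra '1'/'2' cases make the legacy nmi_watchdog=1 and nmi_watchdog=2 values turn the watchdog back on and explicitly re-enable the hard lockup detector. A rough standalone C model of that parsing follows; it reuses the patch's names for readability but carries none of the kernel plumbing.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool hardlockup_detector_enabled = true;
static int  hardlockup_panic;
static int  watchdog_user_enabled = 1;

static void watchdog_enable_hardlockup_detector(bool val)
{
    hardlockup_detector_enabled = val;
}

/* Models how "nmi_watchdog=<str>" is interpreted after the patch. */
static int hardlockup_panic_setup(const char *str)
{
    if (!strncmp(str, "panic", 5))
        hardlockup_panic = 1;
    else if (!strncmp(str, "nopanic", 7))
        hardlockup_panic = 0;
    else if (!strncmp(str, "0", 1))
        watchdog_user_enabled = 0;
    else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
        /* legacy values: both mean "on", and they force the hard lockup
         * detector back on even if it was defaulted to off earlier */
        watchdog_user_enabled = 1;
        watchdog_enable_hardlockup_detector(true);
    }
    return 1;
}

int main(void)
{
    hardlockup_panic_setup("2");
    printf("enabled=%d hardlockup=%d panic=%d\n",
           watchdog_user_enabled, hardlockup_detector_enabled,
           hardlockup_panic);
    return 0;
}
```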
| @@ -185,7 +208,7 @@ void touch_nmi_watchdog(void) | |||
| 185 | * case we shouldn't have to worry about the watchdog | 208 | * case we shouldn't have to worry about the watchdog |
| 186 | * going off. | 209 | * going off. |
| 187 | */ | 210 | */ |
| 188 | __raw_get_cpu_var(watchdog_nmi_touch) = true; | 211 | raw_cpu_write(watchdog_nmi_touch, true); |
| 189 | touch_softlockup_watchdog(); | 212 | touch_softlockup_watchdog(); |
| 190 | } | 213 | } |
| 191 | EXPORT_SYMBOL(touch_nmi_watchdog); | 214 | EXPORT_SYMBOL(touch_nmi_watchdog); |
| @@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); | |||
| 194 | 217 | ||
| 195 | void touch_softlockup_watchdog_sync(void) | 218 | void touch_softlockup_watchdog_sync(void) |
| 196 | { | 219 | { |
| 197 | __raw_get_cpu_var(softlockup_touch_sync) = true; | 220 | __this_cpu_write(softlockup_touch_sync, true); |
| 198 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 221 | __this_cpu_write(watchdog_touch_ts, 0); |
| 199 | } | 222 | } |
| 200 | 223 | ||
| 201 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 224 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
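These accessor hunks are part of the tree-wide move away from the removed __raw_get_cpu_var() lvalue macro to raw_cpu_write()/__this_cpu_write()/raw_cpu_ptr(): the value to store is passed as an argument instead of being assigned through an lvalue. A toy userspace sketch of just that calling-convention change, with a thread-local variable standing in for a per-CPU one; the my_* macros are invented for illustration and do not model the real accessors' preemption semantics.

```c
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool watchdog_nmi_touch;   /* "per-CPU" flag */

#define my_raw_cpu_write(var, val)  ((var) = (val))   /* write this CPU's copy   */
#define my_raw_cpu_ptr(var)         (&(var))          /* address of this copy    */

int main(void)
{
    /* old idiom:  __raw_get_cpu_var(watchdog_nmi_touch) = true;  */
    /* new idiom:                                                  */
    my_raw_cpu_write(watchdog_nmi_touch, true);
    printf("%d\n", *my_raw_cpu_ptr(watchdog_nmi_touch));
    return 0;
}
```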
| @@ -260,9 +283,11 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
| 260 | return; | 283 | return; |
| 261 | 284 | ||
| 262 | if (hardlockup_panic) | 285 | if (hardlockup_panic) |
| 263 | panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 286 | panic("Watchdog detected hard LOCKUP on cpu %d", |
| 287 | this_cpu); | ||
| 264 | else | 288 | else |
| 265 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 289 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", |
| 290 | this_cpu); | ||
| 266 | 291 | ||
| 267 | __this_cpu_write(hard_watchdog_warn, true); | 292 | __this_cpu_write(hard_watchdog_warn, true); |
| 268 | return; | 293 | return; |
| @@ -331,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 331 | return HRTIMER_RESTART; | 356 | return HRTIMER_RESTART; |
| 332 | 357 | ||
| 333 | /* only warn once */ | 358 | /* only warn once */ |
| 334 | if (__this_cpu_read(soft_watchdog_warn) == true) | 359 | if (__this_cpu_read(soft_watchdog_warn) == true) { |
| 360 | /* | ||
| 361 | * When multiple processes are causing softlockups the | ||
| 362 | * softlockup detector only warns on the first one | ||
| 363 | * because the code relies on a full quiet cycle to | ||
| 364 | * re-arm. The second process prevents the quiet cycle | ||
| 365 | * and never gets reported. Use task pointers to detect | ||
| 366 | * this. | ||
| 367 | */ | ||
| 368 | if (__this_cpu_read(softlockup_task_ptr_saved) != | ||
| 369 | current) { | ||
| 370 | __this_cpu_write(soft_watchdog_warn, false); | ||
| 371 | __touch_watchdog(); | ||
| 372 | } | ||
| 335 | return HRTIMER_RESTART; | 373 | return HRTIMER_RESTART; |
| 374 | } | ||
| 336 | 375 | ||
| 337 | if (softlockup_all_cpu_backtrace) { | 376 | if (softlockup_all_cpu_backtrace) { |
| 338 | /* Prevent multiple soft-lockup reports if one cpu is already | 377 | /* Prevent multiple soft-lockup reports if one cpu is already |
| @@ -345,9 +384,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 345 | } | 384 | } |
| 346 | } | 385 | } |
| 347 | 386 | ||
| 348 | printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 387 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
| 349 | smp_processor_id(), duration, | 388 | smp_processor_id(), duration, |
| 350 | current->comm, task_pid_nr(current)); | 389 | current->comm, task_pid_nr(current)); |
| 390 | __this_cpu_write(softlockup_task_ptr_saved, current); | ||
| 351 | print_modules(); | 391 | print_modules(); |
| 352 | print_irqtrace_events(current); | 392 | print_irqtrace_events(current); |
| 353 | if (regs) | 393 | if (regs) |
| @@ -366,6 +406,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 366 | smp_mb__after_atomic(); | 406 | smp_mb__after_atomic(); |
| 367 | } | 407 | } |
| 368 | 408 | ||
| 409 | add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); | ||
| 369 | if (softlockup_panic) | 410 | if (softlockup_panic) |
| 370 | panic("softlockup: hung tasks"); | 411 | panic("softlockup: hung tasks"); |
| 371 | __this_cpu_write(soft_watchdog_warn, true); | 412 | __this_cpu_write(soft_watchdog_warn, true); |
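The new softlockup_task_ptr_saved bookkeeping re-arms the warn-once flag when the task currently monopolising the CPU is not the one last reported, so consecutive offenders each produce a report instead of hiding behind the first one; the same hunk also taints the kernel with TAINT_SOFTLOCKUP. A small self-contained C model of just that warn/re-arm decision (task pointers replaced by integer IDs, all names illustrative only):

```c
#include <stdbool.h>
#include <stdio.h>

static bool soft_watchdog_warn;
static int  softlockup_task_saved = -1;   /* stands in for the saved task_struct * */

/* Returns true when a softlockup report should be printed for this offender. */
static bool softlockup_should_warn(int current_task)
{
    if (soft_watchdog_warn) {
        if (softlockup_task_saved != current_task) {
            /* a different task is now stuck: forget the old warning so the
             * new offender gets reported on the next timer pass (the kernel
             * also re-touches the watchdog timestamp here) */
            soft_watchdog_warn = false;
        }
        return false;                     /* only warn once per offender */
    }

    softlockup_task_saved = current_task; /* remember who we reported */
    soft_watchdog_warn = true;
    return true;
}

int main(void)
{
    printf("%d", softlockup_should_warn(10));   /* 1: first offender reported   */
    printf("%d", softlockup_should_warn(10));   /* 0: same task, already warned */
    printf("%d", softlockup_should_warn(20));   /* 0: new task re-arms the flag */
    printf("%d\n", softlockup_should_warn(20)); /* 1: new offender reported     */
    return 0;
}
```

Run as-is, the model prints 1001: the second offender is picked up one pass after it displaces the first, which is exactly the behaviour the comment in the hunk describes.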
| @@ -384,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) | |||
| 384 | 425 | ||
| 385 | static void watchdog_enable(unsigned int cpu) | 426 | static void watchdog_enable(unsigned int cpu) |
| 386 | { | 427 | { |
| 387 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 428 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 388 | 429 | ||
| 389 | /* kick off the timer for the hardlockup detector */ | 430 | /* kick off the timer for the hardlockup detector */ |
| 390 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 431 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| @@ -404,7 +445,7 @@ static void watchdog_enable(unsigned int cpu) | |||
| 404 | 445 | ||
| 405 | static void watchdog_disable(unsigned int cpu) | 446 | static void watchdog_disable(unsigned int cpu) |
| 406 | { | 447 | { |
| 407 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 448 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 408 | 449 | ||
| 409 | watchdog_set_prio(SCHED_NORMAL, 0); | 450 | watchdog_set_prio(SCHED_NORMAL, 0); |
| 410 | hrtimer_cancel(hrtimer); | 451 | hrtimer_cancel(hrtimer); |
| @@ -451,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 451 | struct perf_event_attr *wd_attr; | 492 | struct perf_event_attr *wd_attr; |
| 452 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
| 453 | 494 | ||
| 495 | /* | ||
| 496 | * Some kernels need to default hard lockup detection to | ||
| 497 | * 'disabled', for example a guest on a hypervisor. | ||
| 498 | */ | ||
| 499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
| 500 | event = ERR_PTR(-ENOENT); | ||
| 501 | goto handle_err; | ||
| 502 | } | ||
| 503 | |||
| 454 | /* is it already setup and enabled? */ | 504 | /* is it already setup and enabled? */ |
| 455 | if (event && event->state > PERF_EVENT_STATE_OFF) | 505 | if (event && event->state > PERF_EVENT_STATE_OFF) |
| 456 | goto out; | 506 | goto out; |
| @@ -465,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 465 | /* Try to register using hardware perf events */ | 515 | /* Try to register using hardware perf events */ |
| 466 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
| 467 | 517 | ||
| 518 | handle_err: | ||
| 468 | /* save cpu0 error for future comparision */ | 519 | /* save cpu0 error for future comparision */ |
| 469 | if (cpu == 0 && IS_ERR(event)) | 520 | if (cpu == 0 && IS_ERR(event)) |
| 470 | cpu0_err = PTR_ERR(event); | 521 | cpu0_err = PTR_ERR(event); |
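With the two hunks above, watchdog_nmi_enable() no longer tries to create a perf counter when hard lockup detection has been defaulted off; it fabricates an -ENOENT "event" and jumps to the existing error handling, so the CPU is reported exactly as if the hardware event were unavailable. A condensed userspace sketch of that control flow, with toy stand-ins for ERR_PTR()/PTR_ERR()/IS_ERR() and for the perf call (nothing below is the real function):

```c
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool hardlockup_detector_enabled = true;

/* crude userspace stand-ins for the kernel's error-pointer helpers */
#define ERR_PTR(err)  ((void *)(intptr_t)(err))
#define PTR_ERR(ptr)  ((long)(intptr_t)(ptr))
#define IS_ERR(ptr)   ((intptr_t)(ptr) < 0)

/* placeholder for perf_event_create_kernel_counter(...) */
static void *create_hw_counter(void)
{
    static int dummy;
    return &dummy;
}

static long watchdog_nmi_enable_model(int cpu)
{
    void *event;

    if (!hardlockup_detector_enabled) {
        /* detector defaulted off: behave as if the hw event did not exist */
        event = ERR_PTR(-ENOENT);
        goto handle_err;
    }

    event = create_hw_counter();

handle_err:
    if (IS_ERR(event)) {
        printf("watchdog disabled (cpu%d): error %ld\n", cpu, PTR_ERR(event));
        return PTR_ERR(event);
    }
    printf("watchdog enabled (cpu%d)\n", cpu);
    return 0;
}

int main(void)
{
    hardlockup_detector_enabled = false;   /* e.g. running as a guest */
    return watchdog_nmi_enable_model(0) ? 1 : 0;
}
```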
| @@ -484,7 +535,7 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 484 | if (PTR_ERR(event) == -EOPNOTSUPP) | 535 | if (PTR_ERR(event) == -EOPNOTSUPP) |
| 485 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | 536 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); |
| 486 | else if (PTR_ERR(event) == -ENOENT) | 537 | else if (PTR_ERR(event) == -ENOENT) |
| 487 | pr_warning("disabled (cpu%i): hardware events not enabled\n", | 538 | pr_warn("disabled (cpu%i): hardware events not enabled\n", |
| 488 | cpu); | 539 | cpu); |
| 489 | else | 540 | else |
| 490 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 541 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |
| @@ -511,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
| 511 | /* should be in cleanup, but blocks oprofile */ | 562 | /* should be in cleanup, but blocks oprofile */ |
| 512 | perf_event_release_kernel(event); | 563 | perf_event_release_kernel(event); |
| 513 | } | 564 | } |
| 514 | return; | 565 | if (cpu == 0) { |
| 566 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
| 567 | cpu0_err = 0; | ||
| 568 | } | ||
| 515 | } | 569 | } |
| 516 | #else | 570 | #else |
| 517 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 571 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
| @@ -531,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
| 531 | 585 | ||
| 532 | static void restart_watchdog_hrtimer(void *info) | 586 | static void restart_watchdog_hrtimer(void *info) |
| 533 | { | 587 | { |
| 534 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 588 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 535 | int ret; | 589 | int ret; |
| 536 | 590 | ||
| 537 | /* | 591 | /* |
| @@ -607,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 607 | void __user *buffer, size_t *lenp, loff_t *ppos) | 661 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 608 | { | 662 | { |
| 609 | int err, old_thresh, old_enabled; | 663 | int err, old_thresh, old_enabled; |
| 664 | bool old_hardlockup; | ||
| 610 | static DEFINE_MUTEX(watchdog_proc_mutex); | 665 | static DEFINE_MUTEX(watchdog_proc_mutex); |
| 611 | 666 | ||
| 612 | mutex_lock(&watchdog_proc_mutex); | 667 | mutex_lock(&watchdog_proc_mutex); |
| 613 | old_thresh = ACCESS_ONCE(watchdog_thresh); | 668 | old_thresh = ACCESS_ONCE(watchdog_thresh); |
| 614 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | 669 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); |
| 670 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
| 615 | 671 | ||
| 616 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 672 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 617 | if (err || !write) | 673 | if (err || !write) |
| @@ -623,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 623 | * disabled. The 'watchdog_running' variable check in | 679 | * disabled. The 'watchdog_running' variable check in |
| 624 | * watchdog_*_all_cpus() function takes care of this. | 680 | * watchdog_*_all_cpus() function takes care of this. |
| 625 | */ | 681 | */ |
| 626 | if (watchdog_user_enabled && watchdog_thresh) | 682 | if (watchdog_user_enabled && watchdog_thresh) { |
| 683 | /* | ||
| 684 | * Prevent a change in watchdog_thresh accidentally overriding | ||
| 685 | * the enablement of the hardlockup detector. | ||
| 686 | */ | ||
| 687 | if (watchdog_user_enabled != old_enabled) | ||
| 688 | watchdog_enable_hardlockup_detector(true); | ||
| 627 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | 689 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); |
| 628 | else | 690 | } else |
| 629 | watchdog_disable_all_cpus(); | 691 | watchdog_disable_all_cpus(); |
| 630 | 692 | ||
| 631 | /* Restore old values on failure */ | 693 | /* Restore old values on failure */ |
| 632 | if (err) { | 694 | if (err) { |
| 633 | watchdog_thresh = old_thresh; | 695 | watchdog_thresh = old_thresh; |
| 634 | watchdog_user_enabled = old_enabled; | 696 | watchdog_user_enabled = old_enabled; |
| 697 | watchdog_enable_hardlockup_detector(old_hardlockup); | ||
| 635 | } | 698 | } |
| 636 | out: | 699 | out: |
| 637 | mutex_unlock(&watchdog_proc_mutex); | 700 | mutex_unlock(&watchdog_proc_mutex); |
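proc_dowatchdog() now snapshots the hard lockup enablement together with the old threshold and enable flag, re-enables the hard lockup detector only when the watchdog itself is being switched from off to on (so a threshold-only write cannot override an earlier disable), and rolls all three values back if re-enabling fails. A compact C model of that save/apply/rollback sequence, using plain globals in place of the sysctl machinery; the helper calls in comments mark where the real kernel functions would run.

```c
#include <stdbool.h>
#include <stdio.h>

static int  watchdog_thresh = 10;
static int  watchdog_user_enabled = 1;
static bool hardlockup_detector_enabled = true;

static int apply_new_settings(int new_enabled, int new_thresh)
{
    int  old_thresh     = watchdog_thresh;
    int  old_enabled    = watchdog_user_enabled;
    bool old_hardlockup = hardlockup_detector_enabled;
    int  err = 0;

    watchdog_user_enabled = new_enabled;   /* stands in for proc_dointvec_minmax() */
    watchdog_thresh       = new_thresh;

    if (watchdog_user_enabled && watchdog_thresh) {
        /* only a change of the enable flag re-enables the hard lockup
         * detector; changing the threshold alone must not override it */
        if (watchdog_user_enabled != old_enabled)
            hardlockup_detector_enabled = true;
        err = 0;   /* would be watchdog_enable_all_cpus(old_thresh != watchdog_thresh) */
    }
    /* else: watchdog_disable_all_cpus() */

    if (err) {                             /* roll everything back on failure */
        watchdog_thresh             = old_thresh;
        watchdog_user_enabled       = old_enabled;
        hardlockup_detector_enabled = old_hardlockup;
    }
    return err;
}

int main(void)
{
    hardlockup_detector_enabled = false;   /* e.g. defaulted off on a guest */
    apply_new_settings(1, 20);             /* threshold-only write, enable unchanged */
    printf("hardlockup=%d\n", hardlockup_detector_enabled);   /* still 0 */
    return 0;
}
```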
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5dbe22aa3efd..09b685daee3d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -2043,9 +2043,10 @@ __acquires(&pool->lock) | |||
| 2043 | * kernels, where a requeueing work item waiting for something to | 2043 | * kernels, where a requeueing work item waiting for something to |
| 2044 | * happen could deadlock with stop_machine as such work item could | 2044 | * happen could deadlock with stop_machine as such work item could |
| 2045 | * indefinitely requeue itself while all other CPUs are trapped in | 2045 | * indefinitely requeue itself while all other CPUs are trapped in |
| 2046 | * stop_machine. | 2046 | * stop_machine. At the same time, report a quiescent RCU state so |
| 2047 | * the same condition doesn't freeze RCU. | ||
| 2047 | */ | 2048 | */ |
| 2048 | cond_resched(); | 2049 | cond_resched_rcu_qs(); |
| 2049 | 2050 | ||
| 2050 | spin_lock_irq(&pool->lock); | 2051 | spin_lock_irq(&pool->lock); |
| 2051 | 2052 | ||
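In process_one_work(), the bare cond_resched() becomes cond_resched_rcu_qs(), so a self-requeueing work item that never otherwise sleeps still reports an RCU quiescent state instead of stalling grace periods. The following userspace analogy (hand-rolled counters standing in for RCU; nothing here is kernel API) shows why the explicit report matters: the waiter only makes progress because the worker announces its quiescent points between items.

```c
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

static atomic_ulong qs_count;   /* bumped each time the worker reports quiescence */
static atomic_bool  stop_flag;

/* analogue of cond_resched_rcu_qs(): yield *and* report a quiescent state */
static void yield_and_report_qs(void)
{
    atomic_fetch_add(&qs_count, 1);
    sched_yield();
}

static void *requeueing_worker(void *arg)
{
    (void)arg;
    while (!atomic_load(&stop_flag)) {
        /* ... process one (self-requeueing) work item ... */
        yield_and_report_qs();   /* a bare sched_yield() here would never
                                    let wait_for_grace_period() return */
    }
    return NULL;
}

/* a "grace period" here just waits until the worker reports at least once */
static void wait_for_grace_period(void)
{
    unsigned long seen = atomic_load(&qs_count);

    while (atomic_load(&qs_count) == seen)
        sched_yield();
}

int main(void)
{
    pthread_t worker;

    pthread_create(&worker, NULL, requeueing_worker, NULL);
    wait_for_grace_period();
    printf("grace period completed\n");

    atomic_store(&stop_flag, true);
    pthread_join(worker, NULL);
    return 0;
}
```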
