Diffstat (limited to 'kernel')
131 files changed, 6586 insertions, 3107 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba33..85cbfb31e73e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o | |||
| 107 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 107 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
| 108 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 108 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
| 109 | obj-$(CONFIG_PADATA) += padata.o | 109 | obj-$(CONFIG_PADATA) += padata.o |
| 110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
| 110 | 111 | ||
| 111 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 112 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
| 112 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 113 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/audit.c b/kernel/audit.c index e4956244ae50..939500317066 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
| @@ -74,6 +74,8 @@ static int audit_initialized; | |||
| 74 | int audit_enabled; | 74 | int audit_enabled; |
| 75 | int audit_ever_enabled; | 75 | int audit_ever_enabled; |
| 76 | 76 | ||
| 77 | EXPORT_SYMBOL_GPL(audit_enabled); | ||
| 78 | |||
| 77 | /* Default state when kernel boots without any parameters. */ | 79 | /* Default state when kernel boots without any parameters. */ |
| 78 | static int audit_default; | 80 | static int audit_default; |
| 79 | 81 | ||
| @@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 671 | 673 | ||
| 672 | pid = NETLINK_CREDS(skb)->pid; | 674 | pid = NETLINK_CREDS(skb)->pid; |
| 673 | uid = NETLINK_CREDS(skb)->uid; | 675 | uid = NETLINK_CREDS(skb)->uid; |
| 674 | loginuid = NETLINK_CB(skb).loginuid; | 676 | loginuid = audit_get_loginuid(current); |
| 675 | sessionid = NETLINK_CB(skb).sessionid; | 677 | sessionid = audit_get_sessionid(current); |
| 676 | sid = NETLINK_CB(skb).sid; | 678 | security_task_getsecid(current, &sid); |
| 677 | seq = nlh->nlmsg_seq; | 679 | seq = nlh->nlmsg_seq; |
| 678 | data = NLMSG_DATA(nlh); | 680 | data = NLMSG_DATA(nlh); |
| 679 | 681 | ||
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea170c8..e99dda04b126 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
| @@ -607,7 +607,7 @@ void audit_trim_trees(void) | |||
| 607 | spin_lock(&hash_lock); | 607 | spin_lock(&hash_lock); |
| 608 | list_for_each_entry(node, &tree->chunks, list) { | 608 | list_for_each_entry(node, &tree->chunks, list) { |
| 609 | struct audit_chunk *chunk = find_chunk(node); | 609 | struct audit_chunk *chunk = find_chunk(node); |
| 610 | /* this could be NULL if the watch is dieing else where... */ | 610 | /* this could be NULL if the watch is dying else where... */ |
| 611 | struct inode *inode = chunk->mark.i.inode; | 611 | struct inode *inode = chunk->mark.i.inode; |
| 612 | node->index |= 1U<<31; | 612 | node->index |= 1U<<31; |
| 613 | if (iterate_mounts(compare_root, inode, root_mnt)) | 613 | if (iterate_mounts(compare_root, inode, root_mnt)) |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c7866460..e683869365d9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c | |||
| @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) | |||
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | /* Initialize a parent watch entry. */ | 146 | /* Initialize a parent watch entry. */ |
| 147 | static struct audit_parent *audit_init_parent(struct nameidata *ndp) | 147 | static struct audit_parent *audit_init_parent(struct path *path) |
| 148 | { | 148 | { |
| 149 | struct inode *inode = ndp->path.dentry->d_inode; | 149 | struct inode *inode = path->dentry->d_inode; |
| 150 | struct audit_parent *parent; | 150 | struct audit_parent *parent; |
| 151 | int ret; | 151 | int ret; |
| 152 | 152 | ||
| @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
| 353 | } | 353 | } |
| 354 | 354 | ||
| 355 | /* Get path information necessary for adding watches. */ | 355 | /* Get path information necessary for adding watches. */ |
| 356 | static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) | 356 | static int audit_get_nd(struct audit_watch *watch, struct path *parent) |
| 357 | { | 357 | { |
| 358 | struct nameidata *ndparent, *ndwatch; | 358 | struct nameidata nd; |
| 359 | struct dentry *d; | ||
| 359 | int err; | 360 | int err; |
| 360 | 361 | ||
| 361 | ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); | 362 | err = kern_path_parent(watch->path, &nd); |
| 362 | if (unlikely(!ndparent)) | 363 | if (err) |
| 363 | return -ENOMEM; | 364 | return err; |
| 364 | 365 | ||
| 365 | ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); | 366 | if (nd.last_type != LAST_NORM) { |
| 366 | if (unlikely(!ndwatch)) { | 367 | path_put(&nd.path); |
| 367 | kfree(ndparent); | 368 | return -EINVAL; |
| 368 | return -ENOMEM; | ||
| 369 | } | 369 | } |
| 370 | 370 | ||
| 371 | err = path_lookup(path, LOOKUP_PARENT, ndparent); | 371 | mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); |
| 372 | if (err) { | 372 | d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); |
| 373 | kfree(ndparent); | 373 | if (IS_ERR(d)) { |
| 374 | kfree(ndwatch); | 374 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 375 | return err; | 375 | path_put(&nd.path); |
| 376 | return PTR_ERR(d); | ||
| 376 | } | 377 | } |
| 377 | 378 | if (d->d_inode) { | |
| 378 | err = path_lookup(path, 0, ndwatch); | 379 | /* update watch filter fields */ |
| 379 | if (err) { | 380 | watch->dev = d->d_inode->i_sb->s_dev; |
| 380 | kfree(ndwatch); | 381 | watch->ino = d->d_inode->i_ino; |
| 381 | ndwatch = NULL; | ||
| 382 | } | 382 | } |
| 383 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | ||
| 383 | 384 | ||
| 384 | *ndp = ndparent; | 385 | *parent = nd.path; |
| 385 | *ndw = ndwatch; | 386 | dput(d); |
| 386 | |||
| 387 | return 0; | 387 | return 0; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | /* Release resources used for watch path information. */ | ||
| 391 | static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) | ||
| 392 | { | ||
| 393 | if (ndp) { | ||
| 394 | path_put(&ndp->path); | ||
| 395 | kfree(ndp); | ||
| 396 | } | ||
| 397 | if (ndw) { | ||
| 398 | path_put(&ndw->path); | ||
| 399 | kfree(ndw); | ||
| 400 | } | ||
| 401 | } | ||
| 402 | |||
| 403 | /* Associate the given rule with an existing parent. | 390 | /* Associate the given rule with an existing parent. |
| 404 | * Caller must hold audit_filter_mutex. */ | 391 | * Caller must hold audit_filter_mutex. */ |
| 405 | static void audit_add_to_parent(struct audit_krule *krule, | 392 | static void audit_add_to_parent(struct audit_krule *krule, |
| @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) | |||
| 440 | { | 427 | { |
| 441 | struct audit_watch *watch = krule->watch; | 428 | struct audit_watch *watch = krule->watch; |
| 442 | struct audit_parent *parent; | 429 | struct audit_parent *parent; |
| 443 | struct nameidata *ndp = NULL, *ndw = NULL; | 430 | struct path parent_path; |
| 444 | int h, ret = 0; | 431 | int h, ret = 0; |
| 445 | 432 | ||
| 446 | mutex_unlock(&audit_filter_mutex); | 433 | mutex_unlock(&audit_filter_mutex); |
| 447 | 434 | ||
| 448 | /* Avoid calling path_lookup under audit_filter_mutex. */ | 435 | /* Avoid calling path_lookup under audit_filter_mutex. */ |
| 449 | ret = audit_get_nd(watch->path, &ndp, &ndw); | 436 | ret = audit_get_nd(watch, &parent_path); |
| 450 | if (ret) { | ||
| 451 | /* caller expects mutex locked */ | ||
| 452 | mutex_lock(&audit_filter_mutex); | ||
| 453 | goto error; | ||
| 454 | } | ||
| 455 | 437 | ||
| 438 | /* caller expects mutex locked */ | ||
| 456 | mutex_lock(&audit_filter_mutex); | 439 | mutex_lock(&audit_filter_mutex); |
| 457 | 440 | ||
| 458 | /* update watch filter fields */ | 441 | if (ret) |
| 459 | if (ndw) { | 442 | return ret; |
| 460 | watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; | ||
| 461 | watch->ino = ndw->path.dentry->d_inode->i_ino; | ||
| 462 | } | ||
| 463 | 443 | ||
| 464 | /* either find an old parent or attach a new one */ | 444 | /* either find an old parent or attach a new one */ |
| 465 | parent = audit_find_parent(ndp->path.dentry->d_inode); | 445 | parent = audit_find_parent(parent_path.dentry->d_inode); |
| 466 | if (!parent) { | 446 | if (!parent) { |
| 467 | parent = audit_init_parent(ndp); | 447 | parent = audit_init_parent(&parent_path); |
| 468 | if (IS_ERR(parent)) { | 448 | if (IS_ERR(parent)) { |
| 469 | ret = PTR_ERR(parent); | 449 | ret = PTR_ERR(parent); |
| 470 | goto error; | 450 | goto error; |
| @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) | |||
| 479 | h = audit_hash_ino((u32)watch->ino); | 459 | h = audit_hash_ino((u32)watch->ino); |
| 480 | *list = &audit_inode_hash[h]; | 460 | *list = &audit_inode_hash[h]; |
| 481 | error: | 461 | error: |
| 482 | audit_put_nd(ndp, ndw); /* NULL args OK */ | 462 | path_put(&parent_path); |
| 483 | return ret; | 463 | return ret; |
| 484 | |||
| 485 | } | 464 | } |
| 486 | 465 | ||
| 487 | void audit_remove_watch_rule(struct audit_krule *krule) | 466 | void audit_remove_watch_rule(struct audit_krule *krule) |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index add2819af71b..f8277c80d678 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
| @@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
| 1238 | for (i = 0; i < rule->field_count; i++) { | 1238 | for (i = 0; i < rule->field_count; i++) { |
| 1239 | struct audit_field *f = &rule->fields[i]; | 1239 | struct audit_field *f = &rule->fields[i]; |
| 1240 | int result = 0; | 1240 | int result = 0; |
| 1241 | u32 sid; | ||
| 1241 | 1242 | ||
| 1242 | switch (f->type) { | 1243 | switch (f->type) { |
| 1243 | case AUDIT_PID: | 1244 | case AUDIT_PID: |
| @@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
| 1250 | result = audit_comparator(cb->creds.gid, f->op, f->val); | 1251 | result = audit_comparator(cb->creds.gid, f->op, f->val); |
| 1251 | break; | 1252 | break; |
| 1252 | case AUDIT_LOGINUID: | 1253 | case AUDIT_LOGINUID: |
| 1253 | result = audit_comparator(cb->loginuid, f->op, f->val); | 1254 | result = audit_comparator(audit_get_loginuid(current), |
| 1255 | f->op, f->val); | ||
| 1254 | break; | 1256 | break; |
| 1255 | case AUDIT_SUBJ_USER: | 1257 | case AUDIT_SUBJ_USER: |
| 1256 | case AUDIT_SUBJ_ROLE: | 1258 | case AUDIT_SUBJ_ROLE: |
| 1257 | case AUDIT_SUBJ_TYPE: | 1259 | case AUDIT_SUBJ_TYPE: |
| 1258 | case AUDIT_SUBJ_SEN: | 1260 | case AUDIT_SUBJ_SEN: |
| 1259 | case AUDIT_SUBJ_CLR: | 1261 | case AUDIT_SUBJ_CLR: |
| 1260 | if (f->lsm_rule) | 1262 | if (f->lsm_rule) { |
| 1261 | result = security_audit_rule_match(cb->sid, | 1263 | security_task_getsecid(current, &sid); |
| 1264 | result = security_audit_rule_match(sid, | ||
| 1262 | f->type, | 1265 | f->type, |
| 1263 | f->op, | 1266 | f->op, |
| 1264 | f->lsm_rule, | 1267 | f->lsm_rule, |
| 1265 | NULL); | 1268 | NULL); |
| 1269 | } | ||
| 1266 | break; | 1270 | break; |
| 1267 | } | 1271 | } |
| 1268 | 1272 | ||
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a0318c2ed..b33513a08beb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
| 1011 | /* | 1011 | /* |
| 1012 | * to_send and len_sent accounting are very loose estimates. We aren't | 1012 | * to_send and len_sent accounting are very loose estimates. We aren't |
| 1013 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being | 1013 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being |
| 1014 | * within about 500 bytes (next page boundry) | 1014 | * within about 500 bytes (next page boundary) |
| 1015 | * | 1015 | * |
| 1016 | * why snprintf? an int is up to 12 digits long. if we just assumed when | 1016 | * why snprintf? an int is up to 12 digits long. if we just assumed when |
| 1017 | * logging that a[%d]= was going to be 16 characters long we would be wasting | 1017 | * logging that a[%d]= was going to be 16 characters long we would be wasting |
diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f26c136..0c9b862292b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c | |||
| @@ -9,11 +9,13 @@ | |||
| 9 | #include <linux/page-flags.h> | 9 | #include <linux/page-flags.h> |
| 10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
| 11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
| 12 | #include <linux/page_cgroup.h> | ||
| 12 | 13 | ||
| 13 | void foo(void) | 14 | void foo(void) |
| 14 | { | 15 | { |
| 15 | /* The enum constants to put into include/generated/bounds.h */ | 16 | /* The enum constants to put into include/generated/bounds.h */ |
| 16 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
| 17 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
| 19 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | ||
| 18 | /* End of constants */ | 20 | /* End of constants */ |
| 19 | } | 21 | } |
diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f132c8..bf0c734d0c12 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/security.h> | 14 | #include <linux/security.h> |
| 15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
| 16 | #include <linux/pid_namespace.h> | 16 | #include <linux/pid_namespace.h> |
| 17 | #include <linux/user_namespace.h> | ||
| 17 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
| 18 | 19 | ||
| 19 | /* | 20 | /* |
| @@ -290,6 +291,60 @@ error: | |||
| 290 | } | 291 | } |
| 291 | 292 | ||
| 292 | /** | 293 | /** |
| 294 | * has_capability - Does a task have a capability in init_user_ns | ||
| 295 | * @t: The task in question | ||
| 296 | * @cap: The capability to be tested for | ||
| 297 | * | ||
| 298 | * Return true if the specified task has the given superior capability | ||
| 299 | * currently in effect to the initial user namespace, false if not. | ||
| 300 | * | ||
| 301 | * Note that this does not set PF_SUPERPRIV on the task. | ||
| 302 | */ | ||
| 303 | bool has_capability(struct task_struct *t, int cap) | ||
| 304 | { | ||
| 305 | int ret = security_real_capable(t, &init_user_ns, cap); | ||
| 306 | |||
| 307 | return (ret == 0); | ||
| 308 | } | ||
| 309 | |||
| 310 | /** | ||
| 311 | * has_capability - Does a task have a capability in a specific user ns | ||
| 312 | * @t: The task in question | ||
| 313 | * @ns: target user namespace | ||
| 314 | * @cap: The capability to be tested for | ||
| 315 | * | ||
| 316 | * Return true if the specified task has the given superior capability | ||
| 317 | * currently in effect to the specified user namespace, false if not. | ||
| 318 | * | ||
| 319 | * Note that this does not set PF_SUPERPRIV on the task. | ||
| 320 | */ | ||
| 321 | bool has_ns_capability(struct task_struct *t, | ||
| 322 | struct user_namespace *ns, int cap) | ||
| 323 | { | ||
| 324 | int ret = security_real_capable(t, ns, cap); | ||
| 325 | |||
| 326 | return (ret == 0); | ||
| 327 | } | ||
| 328 | |||
| 329 | /** | ||
| 330 | * has_capability_noaudit - Does a task have a capability (unaudited) | ||
| 331 | * @t: The task in question | ||
| 332 | * @cap: The capability to be tested for | ||
| 333 | * | ||
| 334 | * Return true if the specified task has the given superior capability | ||
| 335 | * currently in effect to init_user_ns, false if not. Don't write an | ||
| 336 | * audit message for the check. | ||
| 337 | * | ||
| 338 | * Note that this does not set PF_SUPERPRIV on the task. | ||
| 339 | */ | ||
| 340 | bool has_capability_noaudit(struct task_struct *t, int cap) | ||
| 341 | { | ||
| 342 | int ret = security_real_capable_noaudit(t, &init_user_ns, cap); | ||
| 343 | |||
| 344 | return (ret == 0); | ||
| 345 | } | ||
| 346 | |||
| 347 | /** | ||
| 293 | * capable - Determine if the current task has a superior capability in effect | 348 | * capable - Determine if the current task has a superior capability in effect |
| 294 | * @cap: The capability to be tested for | 349 | * @cap: The capability to be tested for |
| 295 | * | 350 | * |
| @@ -299,17 +354,48 @@ error: | |||
| 299 | * This sets PF_SUPERPRIV on the task if the capability is available on the | 354 | * This sets PF_SUPERPRIV on the task if the capability is available on the |
| 300 | * assumption that it's about to be used. | 355 | * assumption that it's about to be used. |
| 301 | */ | 356 | */ |
| 302 | int capable(int cap) | 357 | bool capable(int cap) |
| 358 | { | ||
| 359 | return ns_capable(&init_user_ns, cap); | ||
| 360 | } | ||
| 361 | EXPORT_SYMBOL(capable); | ||
| 362 | |||
| 363 | /** | ||
| 364 | * ns_capable - Determine if the current task has a superior capability in effect | ||
| 365 | * @ns: The usernamespace we want the capability in | ||
| 366 | * @cap: The capability to be tested for | ||
| 367 | * | ||
| 368 | * Return true if the current task has the given superior capability currently | ||
| 369 | * available for use, false if not. | ||
| 370 | * | ||
| 371 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 372 | * assumption that it's about to be used. | ||
| 373 | */ | ||
| 374 | bool ns_capable(struct user_namespace *ns, int cap) | ||
| 303 | { | 375 | { |
| 304 | if (unlikely(!cap_valid(cap))) { | 376 | if (unlikely(!cap_valid(cap))) { |
| 305 | printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); | 377 | printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); |
| 306 | BUG(); | 378 | BUG(); |
| 307 | } | 379 | } |
| 308 | 380 | ||
| 309 | if (security_capable(current_cred(), cap) == 0) { | 381 | if (security_capable(ns, current_cred(), cap) == 0) { |
| 310 | current->flags |= PF_SUPERPRIV; | 382 | current->flags |= PF_SUPERPRIV; |
| 311 | return 1; | 383 | return true; |
| 312 | } | 384 | } |
| 313 | return 0; | 385 | return false; |
| 314 | } | 386 | } |
| 315 | EXPORT_SYMBOL(capable); | 387 | EXPORT_SYMBOL(ns_capable); |
| 388 | |||
| 389 | /** | ||
| 390 | * task_ns_capable - Determine whether current task has a superior | ||
| 391 | * capability targeted at a specific task's user namespace. | ||
| 392 | * @t: The task whose user namespace is targeted. | ||
| 393 | * @cap: The capability in question. | ||
| 394 | * | ||
| 395 | * Return true if it does, false otherwise. | ||
| 396 | */ | ||
| 397 | bool task_ns_capable(struct task_struct *t, int cap) | ||
| 398 | { | ||
| 399 | return ns_capable(task_cred_xxx(t, user)->user_ns, cap); | ||
| 400 | } | ||
| 401 | EXPORT_SYMBOL(task_ns_capable); | ||
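For orientation, a minimal sketch (not part of this commit) of how a caller might use the new namespace-aware checks; the function name and the choice of CAP_SYS_ADMIN are illustrative assumptions:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/errno.h>

/* Hypothetical caller: permit the operation only if the current task holds
 * CAP_SYS_ADMIN in its own user namespace.  ns_capable() also sets
 * PF_SUPERPRIV when the check succeeds. */
static int example_privileged_op(void)
{
	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;
	/* ... privileged work ... */
	return 0;
}

has_ns_capability(t, ns, cap) covers the same test for a task other than current and, per the comments above, does not set PF_SUPERPRIV.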
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..25c7eb52de1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -157,7 +157,7 @@ struct css_id { | |||
| 157 | }; | 157 | }; |
| 158 | 158 | ||
| 159 | /* | 159 | /* |
| 160 | * cgroup_event represents events which userspace want to recieve. | 160 | * cgroup_event represents events which userspace want to receive. |
| 161 | */ | 161 | */ |
| 162 | struct cgroup_event { | 162 | struct cgroup_event { |
| 163 | /* | 163 | /* |
| @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1813 | 1813 | ||
| 1814 | /* Update the css_set linked lists if we're using them */ | 1814 | /* Update the css_set linked lists if we're using them */ |
| 1815 | write_lock(&css_set_lock); | 1815 | write_lock(&css_set_lock); |
| 1816 | if (!list_empty(&tsk->cg_list)) { | 1816 | if (!list_empty(&tsk->cg_list)) |
| 1817 | list_del(&tsk->cg_list); | 1817 | list_move(&tsk->cg_list, &newcg->tasks); |
| 1818 | list_add(&tsk->cg_list, &newcg->tasks); | ||
| 1819 | } | ||
| 1820 | write_unlock(&css_set_lock); | 1818 | write_unlock(&css_set_lock); |
| 1821 | 1819 | ||
| 1822 | for_each_subsys(root, ss) { | 1820 | for_each_subsys(root, ss) { |
| @@ -3655,12 +3653,12 @@ again: | |||
| 3655 | spin_lock(&release_list_lock); | 3653 | spin_lock(&release_list_lock); |
| 3656 | set_bit(CGRP_REMOVED, &cgrp->flags); | 3654 | set_bit(CGRP_REMOVED, &cgrp->flags); |
| 3657 | if (!list_empty(&cgrp->release_list)) | 3655 | if (!list_empty(&cgrp->release_list)) |
| 3658 | list_del(&cgrp->release_list); | 3656 | list_del_init(&cgrp->release_list); |
| 3659 | spin_unlock(&release_list_lock); | 3657 | spin_unlock(&release_list_lock); |
| 3660 | 3658 | ||
| 3661 | cgroup_lock_hierarchy(cgrp->root); | 3659 | cgroup_lock_hierarchy(cgrp->root); |
| 3662 | /* delete this cgroup from parent->children */ | 3660 | /* delete this cgroup from parent->children */ |
| 3663 | list_del(&cgrp->sibling); | 3661 | list_del_init(&cgrp->sibling); |
| 3664 | cgroup_unlock_hierarchy(cgrp->root); | 3662 | cgroup_unlock_hierarchy(cgrp->root); |
| 3665 | 3663 | ||
| 3666 | d = dget(cgrp->dentry); | 3664 | d = dget(cgrp->dentry); |
| @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 3879 | subsys[ss->subsys_id] = NULL; | 3877 | subsys[ss->subsys_id] = NULL; |
| 3880 | 3878 | ||
| 3881 | /* remove subsystem from rootnode's list of subsystems */ | 3879 | /* remove subsystem from rootnode's list of subsystems */ |
| 3882 | list_del(&ss->sibling); | 3880 | list_del_init(&ss->sibling); |
| 3883 | 3881 | ||
| 3884 | /* | 3882 | /* |
| 3885 | * disentangle the css from all css_sets attached to the dummytop. as | 3883 | * disentangle the css from all css_sets attached to the dummytop. as |
| @@ -4230,20 +4228,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 4230 | */ | 4228 | */ |
| 4231 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 4229 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
| 4232 | { | 4230 | { |
| 4233 | int i; | ||
| 4234 | struct css_set *cg; | 4231 | struct css_set *cg; |
| 4235 | 4232 | int i; | |
| 4236 | if (run_callbacks && need_forkexit_callback) { | ||
| 4237 | /* | ||
| 4238 | * modular subsystems can't use callbacks, so no need to lock | ||
| 4239 | * the subsys array | ||
| 4240 | */ | ||
| 4241 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4242 | struct cgroup_subsys *ss = subsys[i]; | ||
| 4243 | if (ss->exit) | ||
| 4244 | ss->exit(ss, tsk); | ||
| 4245 | } | ||
| 4246 | } | ||
| 4247 | 4233 | ||
| 4248 | /* | 4234 | /* |
| 4249 | * Unlink from the css_set task list if necessary. | 4235 | * Unlink from the css_set task list if necessary. |
| @@ -4253,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4253 | if (!list_empty(&tsk->cg_list)) { | 4239 | if (!list_empty(&tsk->cg_list)) { |
| 4254 | write_lock(&css_set_lock); | 4240 | write_lock(&css_set_lock); |
| 4255 | if (!list_empty(&tsk->cg_list)) | 4241 | if (!list_empty(&tsk->cg_list)) |
| 4256 | list_del(&tsk->cg_list); | 4242 | list_del_init(&tsk->cg_list); |
| 4257 | write_unlock(&css_set_lock); | 4243 | write_unlock(&css_set_lock); |
| 4258 | } | 4244 | } |
| 4259 | 4245 | ||
| @@ -4261,7 +4247,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4261 | task_lock(tsk); | 4247 | task_lock(tsk); |
| 4262 | cg = tsk->cgroups; | 4248 | cg = tsk->cgroups; |
| 4263 | tsk->cgroups = &init_css_set; | 4249 | tsk->cgroups = &init_css_set; |
| 4250 | |||
| 4251 | if (run_callbacks && need_forkexit_callback) { | ||
| 4252 | /* | ||
| 4253 | * modular subsystems can't use callbacks, so no need to lock | ||
| 4254 | * the subsys array | ||
| 4255 | */ | ||
| 4256 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4257 | struct cgroup_subsys *ss = subsys[i]; | ||
| 4258 | if (ss->exit) { | ||
| 4259 | struct cgroup *old_cgrp = | ||
| 4260 | rcu_dereference_raw(cg->subsys[i])->cgroup; | ||
| 4261 | struct cgroup *cgrp = task_cgroup(tsk, i); | ||
| 4262 | ss->exit(ss, cgrp, old_cgrp, tsk); | ||
| 4263 | } | ||
| 4264 | } | ||
| 4265 | } | ||
| 4264 | task_unlock(tsk); | 4266 | task_unlock(tsk); |
| 4267 | |||
| 4265 | if (cg) | 4268 | if (cg) |
| 4266 | put_css_set_taskexit(cg); | 4269 | put_css_set_taskexit(cg); |
| 4267 | } | 4270 | } |
| @@ -4813,6 +4816,29 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
| 4813 | return ret; | 4816 | return ret; |
| 4814 | } | 4817 | } |
| 4815 | 4818 | ||
| 4819 | /* | ||
| 4820 | * get corresponding css from file open on cgroupfs directory | ||
| 4821 | */ | ||
| 4822 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | ||
| 4823 | { | ||
| 4824 | struct cgroup *cgrp; | ||
| 4825 | struct inode *inode; | ||
| 4826 | struct cgroup_subsys_state *css; | ||
| 4827 | |||
| 4828 | inode = f->f_dentry->d_inode; | ||
| 4829 | /* check in cgroup filesystem dir */ | ||
| 4830 | if (inode->i_op != &cgroup_dir_inode_operations) | ||
| 4831 | return ERR_PTR(-EBADF); | ||
| 4832 | |||
| 4833 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | ||
| 4834 | return ERR_PTR(-EINVAL); | ||
| 4835 | |||
| 4836 | /* get cgroup */ | ||
| 4837 | cgrp = __d_cgrp(f->f_dentry); | ||
| 4838 | css = cgrp->subsys[id]; | ||
| 4839 | return css ? css : ERR_PTR(-ENOENT); | ||
| 4840 | } | ||
| 4841 | |||
| 4816 | #ifdef CONFIG_CGROUP_DEBUG | 4842 | #ifdef CONFIG_CGROUP_DEBUG |
| 4817 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 4843 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, |
| 4818 | struct cgroup *cont) | 4844 | struct cgroup *cont) |
diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..38b1d2c1cbe8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, | |||
| 52 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; | 52 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) | ||
| 56 | { | ||
| 57 | memset(txc, 0, sizeof(struct timex)); | ||
| 58 | |||
| 59 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | ||
| 60 | __get_user(txc->modes, &utp->modes) || | ||
| 61 | __get_user(txc->offset, &utp->offset) || | ||
| 62 | __get_user(txc->freq, &utp->freq) || | ||
| 63 | __get_user(txc->maxerror, &utp->maxerror) || | ||
| 64 | __get_user(txc->esterror, &utp->esterror) || | ||
| 65 | __get_user(txc->status, &utp->status) || | ||
| 66 | __get_user(txc->constant, &utp->constant) || | ||
| 67 | __get_user(txc->precision, &utp->precision) || | ||
| 68 | __get_user(txc->tolerance, &utp->tolerance) || | ||
| 69 | __get_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
| 70 | __get_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
| 71 | __get_user(txc->tick, &utp->tick) || | ||
| 72 | __get_user(txc->ppsfreq, &utp->ppsfreq) || | ||
| 73 | __get_user(txc->jitter, &utp->jitter) || | ||
| 74 | __get_user(txc->shift, &utp->shift) || | ||
| 75 | __get_user(txc->stabil, &utp->stabil) || | ||
| 76 | __get_user(txc->jitcnt, &utp->jitcnt) || | ||
| 77 | __get_user(txc->calcnt, &utp->calcnt) || | ||
| 78 | __get_user(txc->errcnt, &utp->errcnt) || | ||
| 79 | __get_user(txc->stbcnt, &utp->stbcnt)) | ||
| 80 | return -EFAULT; | ||
| 81 | |||
| 82 | return 0; | ||
| 83 | } | ||
| 84 | |||
| 85 | static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) | ||
| 86 | { | ||
| 87 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | ||
| 88 | __put_user(txc->modes, &utp->modes) || | ||
| 89 | __put_user(txc->offset, &utp->offset) || | ||
| 90 | __put_user(txc->freq, &utp->freq) || | ||
| 91 | __put_user(txc->maxerror, &utp->maxerror) || | ||
| 92 | __put_user(txc->esterror, &utp->esterror) || | ||
| 93 | __put_user(txc->status, &utp->status) || | ||
| 94 | __put_user(txc->constant, &utp->constant) || | ||
| 95 | __put_user(txc->precision, &utp->precision) || | ||
| 96 | __put_user(txc->tolerance, &utp->tolerance) || | ||
| 97 | __put_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
| 98 | __put_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
| 99 | __put_user(txc->tick, &utp->tick) || | ||
| 100 | __put_user(txc->ppsfreq, &utp->ppsfreq) || | ||
| 101 | __put_user(txc->jitter, &utp->jitter) || | ||
| 102 | __put_user(txc->shift, &utp->shift) || | ||
| 103 | __put_user(txc->stabil, &utp->stabil) || | ||
| 104 | __put_user(txc->jitcnt, &utp->jitcnt) || | ||
| 105 | __put_user(txc->calcnt, &utp->calcnt) || | ||
| 106 | __put_user(txc->errcnt, &utp->errcnt) || | ||
| 107 | __put_user(txc->stbcnt, &utp->stbcnt) || | ||
| 108 | __put_user(txc->tai, &utp->tai)) | ||
| 109 | return -EFAULT; | ||
| 110 | return 0; | ||
| 111 | } | ||
| 112 | |||
| 55 | asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, | 113 | asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, |
| 56 | struct timezone __user *tz) | 114 | struct timezone __user *tz) |
| 57 | { | 115 | { |
| @@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, | |||
| 617 | return err; | 675 | return err; |
| 618 | } | 676 | } |
| 619 | 677 | ||
| 678 | long compat_sys_clock_adjtime(clockid_t which_clock, | ||
| 679 | struct compat_timex __user *utp) | ||
| 680 | { | ||
| 681 | struct timex txc; | ||
| 682 | mm_segment_t oldfs; | ||
| 683 | int err, ret; | ||
| 684 | |||
| 685 | err = compat_get_timex(&txc, utp); | ||
| 686 | if (err) | ||
| 687 | return err; | ||
| 688 | |||
| 689 | oldfs = get_fs(); | ||
| 690 | set_fs(KERNEL_DS); | ||
| 691 | ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); | ||
| 692 | set_fs(oldfs); | ||
| 693 | |||
| 694 | err = compat_put_timex(utp, &txc); | ||
| 695 | if (err) | ||
| 696 | return err; | ||
| 697 | |||
| 698 | return ret; | ||
| 699 | } | ||
| 700 | |||
| 620 | long compat_sys_clock_getres(clockid_t which_clock, | 701 | long compat_sys_clock_getres(clockid_t which_clock, |
| 621 | struct compat_timespec __user *tp) | 702 | struct compat_timespec __user *tp) |
| 622 | { | 703 | { |
| @@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
| 951 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | 1032 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) |
| 952 | { | 1033 | { |
| 953 | struct timex txc; | 1034 | struct timex txc; |
| 954 | int ret; | 1035 | int err, ret; |
| 955 | |||
| 956 | memset(&txc, 0, sizeof(struct timex)); | ||
| 957 | 1036 | ||
| 958 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | 1037 | err = compat_get_timex(&txc, utp); |
| 959 | __get_user(txc.modes, &utp->modes) || | 1038 | if (err) |
| 960 | __get_user(txc.offset, &utp->offset) || | 1039 | return err; |
| 961 | __get_user(txc.freq, &utp->freq) || | ||
| 962 | __get_user(txc.maxerror, &utp->maxerror) || | ||
| 963 | __get_user(txc.esterror, &utp->esterror) || | ||
| 964 | __get_user(txc.status, &utp->status) || | ||
| 965 | __get_user(txc.constant, &utp->constant) || | ||
| 966 | __get_user(txc.precision, &utp->precision) || | ||
| 967 | __get_user(txc.tolerance, &utp->tolerance) || | ||
| 968 | __get_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
| 969 | __get_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
| 970 | __get_user(txc.tick, &utp->tick) || | ||
| 971 | __get_user(txc.ppsfreq, &utp->ppsfreq) || | ||
| 972 | __get_user(txc.jitter, &utp->jitter) || | ||
| 973 | __get_user(txc.shift, &utp->shift) || | ||
| 974 | __get_user(txc.stabil, &utp->stabil) || | ||
| 975 | __get_user(txc.jitcnt, &utp->jitcnt) || | ||
| 976 | __get_user(txc.calcnt, &utp->calcnt) || | ||
| 977 | __get_user(txc.errcnt, &utp->errcnt) || | ||
| 978 | __get_user(txc.stbcnt, &utp->stbcnt)) | ||
| 979 | return -EFAULT; | ||
| 980 | 1040 | ||
| 981 | ret = do_adjtimex(&txc); | 1041 | ret = do_adjtimex(&txc); |
| 982 | 1042 | ||
| 983 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | 1043 | err = compat_put_timex(utp, &txc); |
| 984 | __put_user(txc.modes, &utp->modes) || | 1044 | if (err) |
| 985 | __put_user(txc.offset, &utp->offset) || | 1045 | return err; |
| 986 | __put_user(txc.freq, &utp->freq) || | ||
| 987 | __put_user(txc.maxerror, &utp->maxerror) || | ||
| 988 | __put_user(txc.esterror, &utp->esterror) || | ||
| 989 | __put_user(txc.status, &utp->status) || | ||
| 990 | __put_user(txc.constant, &utp->constant) || | ||
| 991 | __put_user(txc.precision, &utp->precision) || | ||
| 992 | __put_user(txc.tolerance, &utp->tolerance) || | ||
| 993 | __put_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
| 994 | __put_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
| 995 | __put_user(txc.tick, &utp->tick) || | ||
| 996 | __put_user(txc.ppsfreq, &utp->ppsfreq) || | ||
| 997 | __put_user(txc.jitter, &utp->jitter) || | ||
| 998 | __put_user(txc.shift, &utp->shift) || | ||
| 999 | __put_user(txc.stabil, &utp->stabil) || | ||
| 1000 | __put_user(txc.jitcnt, &utp->jitcnt) || | ||
| 1001 | __put_user(txc.calcnt, &utp->calcnt) || | ||
| 1002 | __put_user(txc.errcnt, &utp->errcnt) || | ||
| 1003 | __put_user(txc.stbcnt, &utp->stbcnt) || | ||
| 1004 | __put_user(txc.tai, &utp->tai)) | ||
| 1005 | ret = -EFAULT; | ||
| 1006 | 1046 | ||
| 1007 | return ret; | 1047 | return ret; |
| 1008 | } | 1048 | } |
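From userspace, the new compat_sys_clock_adjtime() entry point backs an ordinary clock_adjtime(2) call made by a 32-bit process. A rough illustration; the glibc wrapper and the query-only usage are assumptions of this sketch, not something the patch adds:

#include <stdio.h>
#include <sys/timex.h>
#include <time.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes == 0: read-only query */
	int state = clock_adjtime(CLOCK_REALTIME, &tx);

	if (state < 0)
		perror("clock_adjtime");
	else
		printf("offset=%ld freq=%ld state=%d\n",
		       tx.offset, tx.freq, state);
	return 0;
}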
diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc5556140..12b7458f23b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) | |||
| 126 | #else /* #if CONFIG_HOTPLUG_CPU */ | 126 | #else /* #if CONFIG_HOTPLUG_CPU */ |
| 127 | static void cpu_hotplug_begin(void) {} | 127 | static void cpu_hotplug_begin(void) {} |
| 128 | static void cpu_hotplug_done(void) {} | 128 | static void cpu_hotplug_done(void) {} |
| 129 | #endif /* #esle #if CONFIG_HOTPLUG_CPU */ | 129 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ |
| 130 | 130 | ||
| 131 | /* Need to know about CPUs going up/down? */ | 131 | /* Need to know about CPUs going up/down? */ |
| 132 | int __ref register_cpu_notifier(struct notifier_block *nb) | 132 | int __ref register_cpu_notifier(struct notifier_block *nb) |
| @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) | |||
| 160 | { | 160 | { |
| 161 | BUG_ON(cpu_notify(val, v)); | 161 | BUG_ON(cpu_notify(val, v)); |
| 162 | } | 162 | } |
| 163 | |||
| 164 | EXPORT_SYMBOL(register_cpu_notifier); | 163 | EXPORT_SYMBOL(register_cpu_notifier); |
| 165 | 164 | ||
| 166 | void __ref unregister_cpu_notifier(struct notifier_block *nb) | 165 | void __ref unregister_cpu_notifier(struct notifier_block *nb) |
| @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) | |||
| 205 | return err; | 204 | return err; |
| 206 | 205 | ||
| 207 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 206 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
| 208 | |||
| 209 | return 0; | 207 | return 0; |
| 210 | } | 208 | } |
| 211 | 209 | ||
| @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 227 | return -EINVAL; | 225 | return -EINVAL; |
| 228 | 226 | ||
| 229 | cpu_hotplug_begin(); | 227 | cpu_hotplug_begin(); |
| 228 | |||
| 230 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); | 229 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); |
| 231 | if (err) { | 230 | if (err) { |
| 232 | nr_calls--; | 231 | nr_calls--; |
| @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 304 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
| 305 | if (ret) { | 304 | if (ret) { |
| 306 | nr_calls--; | 305 | nr_calls--; |
| 307 | printk("%s: attempt to bring up CPU %u failed\n", | 306 | printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", |
| 308 | __func__, cpu); | 307 | __func__, cpu); |
| 309 | goto out_notify; | 308 | goto out_notify; |
| 310 | } | 309 | } |
| @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) | |||
| 450 | if (cpumask_empty(frozen_cpus)) | 449 | if (cpumask_empty(frozen_cpus)) |
| 451 | goto out; | 450 | goto out; |
| 452 | 451 | ||
| 453 | printk("Enabling non-boot CPUs ...\n"); | 452 | printk(KERN_INFO "Enabling non-boot CPUs ...\n"); |
| 454 | 453 | ||
| 455 | arch_enable_nonboot_cpus_begin(); | 454 | arch_enable_nonboot_cpus_begin(); |
| 456 | 455 | ||
| 457 | for_each_cpu(cpu, frozen_cpus) { | 456 | for_each_cpu(cpu, frozen_cpus) { |
| 458 | error = _cpu_up(cpu, 1); | 457 | error = _cpu_up(cpu, 1); |
| 459 | if (!error) { | 458 | if (!error) { |
| 460 | printk("CPU%d is up\n", cpu); | 459 | printk(KERN_INFO "CPU%d is up\n", cpu); |
| 461 | continue; | 460 | continue; |
| 462 | } | 461 | } |
| 463 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); | 462 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); |
| @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) | |||
| 509 | */ | 508 | */ |
| 510 | 509 | ||
| 511 | /* cpu_bit_bitmap[0] is empty - so we can back into it */ | 510 | /* cpu_bit_bitmap[0] is empty - so we can back into it */ |
| 512 | #define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) | 511 | #define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) |
| 513 | #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) | 512 | #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) |
| 514 | #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) | 513 | #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) |
| 515 | #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) | 514 | #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4349935c2ad8..33eee16addb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
| 1015 | struct cpuset *cs; | 1015 | struct cpuset *cs; |
| 1016 | int migrate; | 1016 | int migrate; |
| 1017 | const nodemask_t *oldmem = scan->data; | 1017 | const nodemask_t *oldmem = scan->data; |
| 1018 | NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); | 1018 | static nodemask_t newmems; /* protected by cgroup_mutex */ |
| 1019 | |||
| 1020 | if (!newmems) | ||
| 1021 | return; | ||
| 1022 | 1019 | ||
| 1023 | cs = cgroup_cs(scan->cg); | 1020 | cs = cgroup_cs(scan->cg); |
| 1024 | guarantee_online_mems(cs, newmems); | 1021 | guarantee_online_mems(cs, &newmems); |
| 1025 | |||
| 1026 | cpuset_change_task_nodemask(p, newmems); | ||
| 1027 | 1022 | ||
| 1028 | NODEMASK_FREE(newmems); | 1023 | cpuset_change_task_nodemask(p, &newmems); |
| 1029 | 1024 | ||
| 1030 | mm = get_task_mm(p); | 1025 | mm = get_task_mm(p); |
| 1031 | if (!mm) | 1026 | if (!mm) |
| @@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
| 1438 | struct mm_struct *mm; | 1433 | struct mm_struct *mm; |
| 1439 | struct cpuset *cs = cgroup_cs(cont); | 1434 | struct cpuset *cs = cgroup_cs(cont); |
| 1440 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1435 | struct cpuset *oldcs = cgroup_cs(oldcont); |
| 1441 | NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); | 1436 | static nodemask_t to; /* protected by cgroup_mutex */ |
| 1442 | NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); | ||
| 1443 | |||
| 1444 | if (from == NULL || to == NULL) | ||
| 1445 | goto alloc_fail; | ||
| 1446 | 1437 | ||
| 1447 | if (cs == &top_cpuset) { | 1438 | if (cs == &top_cpuset) { |
| 1448 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1439 | cpumask_copy(cpus_attach, cpu_possible_mask); |
| 1449 | } else { | 1440 | } else { |
| 1450 | guarantee_online_cpus(cs, cpus_attach); | 1441 | guarantee_online_cpus(cs, cpus_attach); |
| 1451 | } | 1442 | } |
| 1452 | guarantee_online_mems(cs, to); | 1443 | guarantee_online_mems(cs, &to); |
| 1453 | 1444 | ||
| 1454 | /* do per-task migration stuff possibly for each in the threadgroup */ | 1445 | /* do per-task migration stuff possibly for each in the threadgroup */ |
| 1455 | cpuset_attach_task(tsk, to, cs); | 1446 | cpuset_attach_task(tsk, &to, cs); |
| 1456 | if (threadgroup) { | 1447 | if (threadgroup) { |
| 1457 | struct task_struct *c; | 1448 | struct task_struct *c; |
| 1458 | rcu_read_lock(); | 1449 | rcu_read_lock(); |
| 1459 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 1450 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { |
| 1460 | cpuset_attach_task(c, to, cs); | 1451 | cpuset_attach_task(c, &to, cs); |
| 1461 | } | 1452 | } |
| 1462 | rcu_read_unlock(); | 1453 | rcu_read_unlock(); |
| 1463 | } | 1454 | } |
| 1464 | 1455 | ||
| 1465 | /* change mm; only needs to be done once even if threadgroup */ | 1456 | /* change mm; only needs to be done once even if threadgroup */ |
| 1466 | *from = oldcs->mems_allowed; | 1457 | to = cs->mems_allowed; |
| 1467 | *to = cs->mems_allowed; | ||
| 1468 | mm = get_task_mm(tsk); | 1458 | mm = get_task_mm(tsk); |
| 1469 | if (mm) { | 1459 | if (mm) { |
| 1470 | mpol_rebind_mm(mm, to); | 1460 | mpol_rebind_mm(mm, &to); |
| 1471 | if (is_memory_migrate(cs)) | 1461 | if (is_memory_migrate(cs)) |
| 1472 | cpuset_migrate_mm(mm, from, to); | 1462 | cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); |
| 1473 | mmput(mm); | 1463 | mmput(mm); |
| 1474 | } | 1464 | } |
| 1475 | |||
| 1476 | alloc_fail: | ||
| 1477 | NODEMASK_FREE(from); | ||
| 1478 | NODEMASK_FREE(to); | ||
| 1479 | } | 1465 | } |
| 1480 | 1466 | ||
| 1481 | /* The various types of files and directories in a cpuset file system */ | 1467 | /* The various types of files and directories in a cpuset file system */ |
| @@ -1575,8 +1561,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
| 1575 | return -ENODEV; | 1561 | return -ENODEV; |
| 1576 | 1562 | ||
| 1577 | trialcs = alloc_trial_cpuset(cs); | 1563 | trialcs = alloc_trial_cpuset(cs); |
| 1578 | if (!trialcs) | 1564 | if (!trialcs) { |
| 1579 | return -ENOMEM; | 1565 | retval = -ENOMEM; |
| 1566 | goto out; | ||
| 1567 | } | ||
| 1580 | 1568 | ||
| 1581 | switch (cft->private) { | 1569 | switch (cft->private) { |
| 1582 | case FILE_CPULIST: | 1570 | case FILE_CPULIST: |
| @@ -1591,6 +1579,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
| 1591 | } | 1579 | } |
| 1592 | 1580 | ||
| 1593 | free_trial_cpuset(trialcs); | 1581 | free_trial_cpuset(trialcs); |
| 1582 | out: | ||
| 1594 | cgroup_unlock(); | 1583 | cgroup_unlock(); |
| 1595 | return retval; | 1584 | return retval; |
| 1596 | } | 1585 | } |
| @@ -1607,34 +1596,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
| 1607 | * across a page fault. | 1596 | * across a page fault. |
| 1608 | */ | 1597 | */ |
| 1609 | 1598 | ||
| 1610 | static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | 1599 | static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) |
| 1611 | { | 1600 | { |
| 1612 | int ret; | 1601 | size_t count; |
| 1613 | 1602 | ||
| 1614 | mutex_lock(&callback_mutex); | 1603 | mutex_lock(&callback_mutex); |
| 1615 | ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); | 1604 | count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); |
| 1616 | mutex_unlock(&callback_mutex); | 1605 | mutex_unlock(&callback_mutex); |
| 1617 | 1606 | ||
| 1618 | return ret; | 1607 | return count; |
| 1619 | } | 1608 | } |
| 1620 | 1609 | ||
| 1621 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | 1610 | static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) |
| 1622 | { | 1611 | { |
| 1623 | NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); | 1612 | size_t count; |
| 1624 | int retval; | ||
| 1625 | |||
| 1626 | if (mask == NULL) | ||
| 1627 | return -ENOMEM; | ||
| 1628 | 1613 | ||
| 1629 | mutex_lock(&callback_mutex); | 1614 | mutex_lock(&callback_mutex); |
| 1630 | *mask = cs->mems_allowed; | 1615 | count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); |
| 1631 | mutex_unlock(&callback_mutex); | 1616 | mutex_unlock(&callback_mutex); |
| 1632 | 1617 | ||
| 1633 | retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); | 1618 | return count; |
| 1634 | |||
| 1635 | NODEMASK_FREE(mask); | ||
| 1636 | |||
| 1637 | return retval; | ||
| 1638 | } | 1619 | } |
| 1639 | 1620 | ||
| 1640 | static ssize_t cpuset_common_file_read(struct cgroup *cont, | 1621 | static ssize_t cpuset_common_file_read(struct cgroup *cont, |
| @@ -1859,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
| 1859 | cs = cgroup_cs(cgroup); | 1840 | cs = cgroup_cs(cgroup); |
| 1860 | parent_cs = cgroup_cs(parent); | 1841 | parent_cs = cgroup_cs(parent); |
| 1861 | 1842 | ||
| 1843 | mutex_lock(&callback_mutex); | ||
| 1862 | cs->mems_allowed = parent_cs->mems_allowed; | 1844 | cs->mems_allowed = parent_cs->mems_allowed; |
| 1863 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | 1845 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); |
| 1846 | mutex_unlock(&callback_mutex); | ||
| 1864 | return; | 1847 | return; |
| 1865 | } | 1848 | } |
| 1866 | 1849 | ||
| @@ -2063,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
| 2063 | struct cpuset *cp; /* scans cpusets being updated */ | 2046 | struct cpuset *cp; /* scans cpusets being updated */ |
| 2064 | struct cpuset *child; /* scans child cpusets of cp */ | 2047 | struct cpuset *child; /* scans child cpusets of cp */ |
| 2065 | struct cgroup *cont; | 2048 | struct cgroup *cont; |
| 2066 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); | 2049 | static nodemask_t oldmems; /* protected by cgroup_mutex */ |
| 2067 | |||
| 2068 | if (oldmems == NULL) | ||
| 2069 | return; | ||
| 2070 | 2050 | ||
| 2071 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2051 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
| 2072 | 2052 | ||
| @@ -2083,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
| 2083 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 2063 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
| 2084 | continue; | 2064 | continue; |
| 2085 | 2065 | ||
| 2086 | *oldmems = cp->mems_allowed; | 2066 | oldmems = cp->mems_allowed; |
| 2087 | 2067 | ||
| 2088 | /* Remove offline cpus and mems from this cpuset. */ | 2068 | /* Remove offline cpus and mems from this cpuset. */ |
| 2089 | mutex_lock(&callback_mutex); | 2069 | mutex_lock(&callback_mutex); |
| @@ -2099,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
| 2099 | remove_tasks_in_empty_cpuset(cp); | 2079 | remove_tasks_in_empty_cpuset(cp); |
| 2100 | else { | 2080 | else { |
| 2101 | update_tasks_cpumask(cp, NULL); | 2081 | update_tasks_cpumask(cp, NULL); |
| 2102 | update_tasks_nodemask(cp, oldmems, NULL); | 2082 | update_tasks_nodemask(cp, &oldmems, NULL); |
| 2103 | } | 2083 | } |
| 2104 | } | 2084 | } |
| 2105 | NODEMASK_FREE(oldmems); | ||
| 2106 | } | 2085 | } |
| 2107 | 2086 | ||
| 2108 | /* | 2087 | /* |
| @@ -2144,19 +2123,16 @@ void cpuset_update_active_cpus(void) | |||
| 2144 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2123 | static int cpuset_track_online_nodes(struct notifier_block *self, |
| 2145 | unsigned long action, void *arg) | 2124 | unsigned long action, void *arg) |
| 2146 | { | 2125 | { |
| 2147 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); | 2126 | static nodemask_t oldmems; /* protected by cgroup_mutex */ |
| 2148 | |||
| 2149 | if (oldmems == NULL) | ||
| 2150 | return NOTIFY_DONE; | ||
| 2151 | 2127 | ||
| 2152 | cgroup_lock(); | 2128 | cgroup_lock(); |
| 2153 | switch (action) { | 2129 | switch (action) { |
| 2154 | case MEM_ONLINE: | 2130 | case MEM_ONLINE: |
| 2155 | *oldmems = top_cpuset.mems_allowed; | 2131 | oldmems = top_cpuset.mems_allowed; |
| 2156 | mutex_lock(&callback_mutex); | 2132 | mutex_lock(&callback_mutex); |
| 2157 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2133 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
| 2158 | mutex_unlock(&callback_mutex); | 2134 | mutex_unlock(&callback_mutex); |
| 2159 | update_tasks_nodemask(&top_cpuset, oldmems, NULL); | 2135 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
| 2160 | break; | 2136 | break; |
| 2161 | case MEM_OFFLINE: | 2137 | case MEM_OFFLINE: |
| 2162 | /* | 2138 | /* |
| @@ -2170,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
| 2170 | } | 2146 | } |
| 2171 | cgroup_unlock(); | 2147 | cgroup_unlock(); |
| 2172 | 2148 | ||
| 2173 | NODEMASK_FREE(oldmems); | ||
| 2174 | return NOTIFY_OK; | 2149 | return NOTIFY_OK; |
| 2175 | } | 2150 | } |
| 2176 | #endif | 2151 | #endif |
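(Context, not in the patch text: NODEMASK_ALLOC() was used here because nodemask_t can be too large for the stack on large MAX_NUMNODES configurations, but an allocation failure made these paths silently skip the nodemask update. Presumably the function-local static buffers, serialized by cgroup_mutex as the new comments note, keep the stack small while removing those failure paths.)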
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..5f85690285d4 --- /dev/null +++ b/kernel/crash_dump.c | |||
| @@ -0,0 +1,34 @@ | |||
| 1 | #include <linux/kernel.h> | ||
| 2 | #include <linux/crash_dump.h> | ||
| 3 | #include <linux/init.h> | ||
| 4 | #include <linux/errno.h> | ||
| 5 | #include <linux/module.h> | ||
| 6 | |||
| 7 | /* | ||
| 8 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
| 9 | * to know the amount of memory that the previous kernel used. | ||
| 10 | */ | ||
| 11 | unsigned long saved_max_pfn; | ||
| 12 | |||
| 13 | /* | ||
| 14 | * stores the physical address of elf header of crash image | ||
| 15 | * | ||
| 16 | * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by | ||
| 17 | * is_kdump_kernel() to determine if we are booting after a panic. Hence put | ||
| 18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. | ||
| 19 | */ | ||
| 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
| 21 | |||
| 22 | /* | ||
| 23 | * elfcorehdr= specifies the location of elf core header stored by the crashed | ||
| 24 | * kernel. This option will be passed by kexec loader to the capture kernel. | ||
| 25 | */ | ||
| 26 | static int __init setup_elfcorehdr(char *arg) | ||
| 27 | { | ||
| 28 | char *end; | ||
| 29 | if (!arg) | ||
| 30 | return -EINVAL; | ||
| 31 | elfcorehdr_addr = memparse(arg, &end); | ||
| 32 | return end > arg ? 0 : -EINVAL; | ||
| 33 | } | ||
| 34 | early_param("elfcorehdr", setup_elfcorehdr); | ||
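For illustration only (the address is invented): a kdump capture kernel is typically booted with a command-line fragment such as

    elfcorehdr=0x2f000000

which setup_elfcorehdr() parses with memparse(), so K/M/G size suffixes are also accepted; the end > arg test makes an empty or non-numeric value return -EINVAL instead of silently succeeding.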
diff --git a/kernel/cred.c b/kernel/cred.c index 3a9d6dd53a6c..5557b55048df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; | |||
| 35 | static struct thread_group_cred init_tgcred = { | 35 | static struct thread_group_cred init_tgcred = { |
| 36 | .usage = ATOMIC_INIT(2), | 36 | .usage = ATOMIC_INIT(2), |
| 37 | .tgid = 0, | 37 | .tgid = 0, |
| 38 | .lock = SPIN_LOCK_UNLOCKED, | 38 | .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), |
| 39 | }; | 39 | }; |
| 40 | #endif | 40 | #endif |
| 41 | 41 | ||
| @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) | |||
| 741 | } | 741 | } |
| 742 | EXPORT_SYMBOL(set_create_files_as); | 742 | EXPORT_SYMBOL(set_create_files_as); |
| 743 | 743 | ||
| 744 | struct user_namespace *current_user_ns(void) | ||
| 745 | { | ||
| 746 | return _current_user_ns(); | ||
| 747 | } | ||
| 748 | EXPORT_SYMBOL(current_user_ns); | ||
| 749 | |||
| 744 | #ifdef CONFIG_DEBUG_CREDENTIALS | 750 | #ifdef CONFIG_DEBUG_CREDENTIALS |
| 745 | 751 | ||
| 746 | bool creds_are_invalid(const struct cred *cred) | 752 | bool creds_are_invalid(const struct cred *cred) |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a11f6d9..bad6786dee88 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -538,7 +538,7 @@ return_normal: | |||
| 538 | 538 | ||
| 539 | /* | 539 | /* |
| 540 | * For single stepping, try to only enter on the processor | 540 | * For single stepping, try to only enter on the processor |
| 541 | * that was single stepping. To gaurd against a deadlock, the | 541 | * that was single stepping. To guard against a deadlock, the |
| 542 | * kernel will only try for the value of sstep_tries before | 542 | * kernel will only try for the value of sstep_tries before |
| 543 | * giving up and continuing on. | 543 | * giving up and continuing on. |
| 544 | */ | 544 | */ |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe7..a11db956dd62 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
| @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) | |||
| 1093 | put_packet(remcom_out_buffer); | 1093 | put_packet(remcom_out_buffer); |
| 1094 | return 0; | 1094 | return 0; |
| 1095 | } | 1095 | } |
| 1096 | |||
| 1097 | /** | ||
| 1098 | * gdbstub_exit - Send an exit message to GDB | ||
| 1099 | * @status: The exit code to report. | ||
| 1100 | */ | ||
| 1101 | void gdbstub_exit(int status) | ||
| 1102 | { | ||
| 1103 | unsigned char checksum, ch, buffer[3]; | ||
| 1104 | int loop; | ||
| 1105 | |||
| 1106 | buffer[0] = 'W'; | ||
| 1107 | buffer[1] = hex_asc_hi(status); | ||
| 1108 | buffer[2] = hex_asc_lo(status); | ||
| 1109 | |||
| 1110 | dbg_io_ops->write_char('$'); | ||
| 1111 | checksum = 0; | ||
| 1112 | |||
| 1113 | for (loop = 0; loop < 3; loop++) { | ||
| 1114 | ch = buffer[loop]; | ||
| 1115 | checksum += ch; | ||
| 1116 | dbg_io_ops->write_char(ch); | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | dbg_io_ops->write_char('#'); | ||
| 1120 | dbg_io_ops->write_char(hex_asc_hi(checksum)); | ||
| 1121 | dbg_io_ops->write_char(hex_asc_lo(checksum)); | ||
| 1122 | |||
| 1123 | /* make sure the output is flushed, lest the bootloader clobber it */ | ||
| 1124 | dbg_io_ops->flush(); | ||
| 1125 | } | ||
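To make the wire format concrete, a worked example (exit status chosen arbitrarily): the GDB remote serial protocol frames a packet as $<payload>#<checksum>, where the checksum is the low byte of the sum of the payload characters, printed as two lowercase hex digits. For gdbstub_exit(0) the payload is "W00" and the sum is 'W' + '0' + '0' = 0x57 + 0x30 + 0x30 = 0xb7, so the bytes written are:

    $W00#b7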
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e29caa3..be14779bcef6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; | |||
| 78 | static kdbtab_t *kdb_commands; | 78 | static kdbtab_t *kdb_commands; |
| 79 | #define KDB_BASE_CMD_MAX 50 | 79 | #define KDB_BASE_CMD_MAX 50 |
| 80 | static int kdb_max_commands = KDB_BASE_CMD_MAX; | 80 | static int kdb_max_commands = KDB_BASE_CMD_MAX; |
| 81 | static kdbtab_t kdb_base_commands[50]; | 81 | static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; |
| 82 | #define for_each_kdbcmd(cmd, num) \ | 82 | #define for_each_kdbcmd(cmd, num) \ |
| 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ | 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ |
| 84 | num < kdb_max_commands; \ | 84 | num < kdb_max_commands; \ |
| @@ -441,9 +441,9 @@ static int kdb_check_regs(void) | |||
| 441 | * symbol name, and offset to the caller. | 441 | * symbol name, and offset to the caller. |
| 442 | * | 442 | * |
| 443 | * The argument may consist of a numeric value (decimal or | 443 | * The argument may consist of a numeric value (decimal or |
| 444 | * hexidecimal), a symbol name, a register name (preceeded by the | 444 | * hexidecimal), a symbol name, a register name (preceded by the |
| 445 | * percent sign), an environment variable with a numeric value | 445 | * percent sign), an environment variable with a numeric value |
| 446 | * (preceeded by a dollar sign) or a simple arithmetic expression | 446 | * (preceded by a dollar sign) or a simple arithmetic expression |
| 447 | * consisting of a symbol name, +/-, and a numeric constant value | 447 | * consisting of a symbol name, +/-, and a numeric constant value |
| 448 | * (offset). | 448 | * (offset). |
| 449 | * Parameters: | 449 | * Parameters: |
| @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) | |||
| 1335 | * error The hardware-defined error code | 1335 | * error The hardware-defined error code |
| 1336 | * reason2 kdb's current reason code. | 1336 | * reason2 kdb's current reason code. |
| 1337 | * Initially error but can change | 1337 | * Initially error but can change |
| 1338 | * acording to kdb state. | 1338 | * according to kdb state. |
| 1339 | * db_result Result code from break or debug point. | 1339 | * db_result Result code from break or debug point. |
| 1340 | * regs The exception frame at time of fault/breakpoint. | 1340 | * regs The exception frame at time of fault/breakpoint. |
| 1341 | * should always be valid. | 1341 | * should always be valid. |
| @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) | |||
| 2892 | "Send a signal to a process", 0, KDB_REPEAT_NONE); | 2892 | "Send a signal to a process", 0, KDB_REPEAT_NONE); |
| 2893 | kdb_register_repeat("summary", kdb_summary, "", | 2893 | kdb_register_repeat("summary", kdb_summary, "", |
| 2894 | "Summarize the system", 4, KDB_REPEAT_NONE); | 2894 | "Summarize the system", 4, KDB_REPEAT_NONE); |
| 2895 | kdb_register_repeat("per_cpu", kdb_per_cpu, "", | 2895 | kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", |
| 2896 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); | 2896 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); |
| 2897 | kdb_register_repeat("grephelp", kdb_grep_help, "", | 2897 | kdb_register_repeat("grephelp", kdb_grep_help, "", |
| 2898 | "Display help on | grep", 0, KDB_REPEAT_NONE); | 2898 | "Display help on | grep", 0, KDB_REPEAT_NONE); |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485dcb050..5532dd37aa86 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
| @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) | |||
| 545 | * Mask for process state. | 545 | * Mask for process state. |
| 546 | * Notes: | 546 | * Notes: |
| 547 | * The mask folds data from several sources into a single long value, so | 547 | * The mask folds data from several sources into a single long value, so |
| 548 | * be carefull not to overlap the bits. TASK_* bits are in the LSB, | 548 | * be careful not to overlap the bits. TASK_* bits are in the LSB, |
| 549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there | 549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there |
| 550 | * is no overlap between TASK_* and EXIT_* but that may not always be | 550 | * is no overlap between TASK_* and EXIT_* but that may not always be |
| 551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in | 551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in |
diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..8dd874181542 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 841 | /* Let father know we died | 841 | /* Let father know we died |
| 842 | * | 842 | * |
| 843 | * Thread signals are configurable, but you aren't going to use | 843 | * Thread signals are configurable, but you aren't going to use |
| 844 | * that to send signals to arbitary processes. | 844 | * that to send signals to arbitrary processes. |
| 845 | * That stops right now. | 845 | * That stops right now. |
| 846 | * | 846 | * |
| 847 | * If the parent exec id doesn't match the exec id we saved | 847 | * If the parent exec id doesn't match the exec id we saved |
| @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) | |||
| 908 | profile_task_exit(tsk); | 908 | profile_task_exit(tsk); |
| 909 | 909 | ||
| 910 | WARN_ON(atomic_read(&tsk->fs_excl)); | 910 | WARN_ON(atomic_read(&tsk->fs_excl)); |
| 911 | WARN_ON(blk_needs_flush_plug(tsk)); | ||
| 911 | 912 | ||
| 912 | if (unlikely(in_interrupt())) | 913 | if (unlikely(in_interrupt())) |
| 913 | panic("Aiee, killing interrupt handler!"); | 914 | panic("Aiee, killing interrupt handler!"); |
| @@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code) | |||
| 1015 | /* | 1016 | /* |
| 1016 | * FIXME: do that only when needed, using sched_exit tracepoint | 1017 | * FIXME: do that only when needed, using sched_exit tracepoint |
| 1017 | */ | 1018 | */ |
| 1018 | flush_ptrace_hw_breakpoint(tsk); | 1019 | ptrace_put_breakpoints(tsk); |
| 1019 | 1020 | ||
| 1020 | exit_notify(tsk, group_dead); | 1021 | exit_notify(tsk, group_dead); |
| 1021 | #ifdef CONFIG_NUMA | 1022 | #ifdef CONFIG_NUMA |
diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..e7548dee636b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include <linux/tracehook.h> | 40 | #include <linux/tracehook.h> |
| 41 | #include <linux/futex.h> | 41 | #include <linux/futex.h> |
| 42 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
| 43 | #include <linux/kthread.h> | ||
| 43 | #include <linux/task_io_accounting_ops.h> | 44 | #include <linux/task_io_accounting_ops.h> |
| 44 | #include <linux/rcupdate.h> | 45 | #include <linux/rcupdate.h> |
| 45 | #include <linux/ptrace.h> | 46 | #include <linux/ptrace.h> |
| @@ -109,20 +110,25 @@ int nr_processes(void) | |||
| 109 | } | 110 | } |
| 110 | 111 | ||
| 111 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 112 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
| 112 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | 113 | # define alloc_task_struct_node(node) \ |
| 113 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | 114 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) |
| 115 | # define free_task_struct(tsk) \ | ||
| 116 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
| 114 | static struct kmem_cache *task_struct_cachep; | 117 | static struct kmem_cache *task_struct_cachep; |
| 115 | #endif | 118 | #endif |
| 116 | 119 | ||
| 117 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 120 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR |
| 118 | static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) | 121 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
| 122 | int node) | ||
| 119 | { | 123 | { |
| 120 | #ifdef CONFIG_DEBUG_STACK_USAGE | 124 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| 121 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 125 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; |
| 122 | #else | 126 | #else |
| 123 | gfp_t mask = GFP_KERNEL; | 127 | gfp_t mask = GFP_KERNEL; |
| 124 | #endif | 128 | #endif |
| 125 | return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); | 129 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); |
| 130 | |||
| 131 | return page ? page_address(page) : NULL; | ||
| 126 | } | 132 | } |
| 127 | 133 | ||
| 128 | static inline void free_thread_info(struct thread_info *ti) | 134 | static inline void free_thread_info(struct thread_info *ti) |
| @@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
| 193 | if (!profile_handoff_task(tsk)) | 199 | if (!profile_handoff_task(tsk)) |
| 194 | free_task(tsk); | 200 | free_task(tsk); |
| 195 | } | 201 | } |
| 202 | EXPORT_SYMBOL_GPL(__put_task_struct); | ||
| 196 | 203 | ||
| 197 | /* | 204 | /* |
| 198 | * macro override instead of weak attribute alias, to workaround | 205 | * macro override instead of weak attribute alias, to workaround |
| @@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 248 | struct task_struct *tsk; | 255 | struct task_struct *tsk; |
| 249 | struct thread_info *ti; | 256 | struct thread_info *ti; |
| 250 | unsigned long *stackend; | 257 | unsigned long *stackend; |
| 251 | 258 | int node = tsk_fork_get_node(orig); | |
| 252 | int err; | 259 | int err; |
| 253 | 260 | ||
| 254 | prepare_to_copy(orig); | 261 | prepare_to_copy(orig); |
| 255 | 262 | ||
| 256 | tsk = alloc_task_struct(); | 263 | tsk = alloc_task_struct_node(node); |
| 257 | if (!tsk) | 264 | if (!tsk) |
| 258 | return NULL; | 265 | return NULL; |
| 259 | 266 | ||
| 260 | ti = alloc_thread_info(tsk); | 267 | ti = alloc_thread_info_node(tsk, node); |
| 261 | if (!ti) { | 268 | if (!ti) { |
| 262 | free_task_struct(tsk); | 269 | free_task_struct(tsk); |
| 263 | return NULL; | 270 | return NULL; |
| @@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1180 | pid = alloc_pid(p->nsproxy->pid_ns); | 1187 | pid = alloc_pid(p->nsproxy->pid_ns); |
| 1181 | if (!pid) | 1188 | if (!pid) |
| 1182 | goto bad_fork_cleanup_io; | 1189 | goto bad_fork_cleanup_io; |
| 1183 | |||
| 1184 | if (clone_flags & CLONE_NEWPID) { | ||
| 1185 | retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); | ||
| 1186 | if (retval < 0) | ||
| 1187 | goto bad_fork_free_pid; | ||
| 1188 | } | ||
| 1189 | } | 1190 | } |
| 1190 | 1191 | ||
| 1191 | p->pid = pid_nr(pid); | 1192 | p->pid = pid_nr(pid); |
| @@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1204 | * Clear TID on mm_release()? | 1205 | * Clear TID on mm_release()? |
| 1205 | */ | 1206 | */ |
| 1206 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1207 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; |
| 1208 | #ifdef CONFIG_BLOCK | ||
| 1209 | p->plug = NULL; | ||
| 1210 | #endif | ||
| 1207 | #ifdef CONFIG_FUTEX | 1211 | #ifdef CONFIG_FUTEX |
| 1208 | p->robust_list = NULL; | 1212 | p->robust_list = NULL; |
| 1209 | #ifdef CONFIG_COMPAT | 1213 | #ifdef CONFIG_COMPAT |
| @@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1289 | tracehook_finish_clone(p, clone_flags, trace); | 1293 | tracehook_finish_clone(p, clone_flags, trace); |
| 1290 | 1294 | ||
| 1291 | if (thread_group_leader(p)) { | 1295 | if (thread_group_leader(p)) { |
| 1292 | if (clone_flags & CLONE_NEWPID) | 1296 | if (is_child_reaper(pid)) |
| 1293 | p->nsproxy->pid_ns->child_reaper = p; | 1297 | p->nsproxy->pid_ns->child_reaper = p; |
| 1294 | 1298 | ||
| 1295 | p->signal->leader_pid = pid; | 1299 | p->signal->leader_pid = pid; |
| @@ -1512,38 +1516,24 @@ void __init proc_caches_init(void) | |||
| 1512 | } | 1516 | } |
| 1513 | 1517 | ||
| 1514 | /* | 1518 | /* |
| 1515 | * Check constraints on flags passed to the unshare system call and | 1519 | * Check constraints on flags passed to the unshare system call. |
| 1516 | * force unsharing of additional process context as appropriate. | ||
| 1517 | */ | 1520 | */ |
| 1518 | static void check_unshare_flags(unsigned long *flags_ptr) | 1521 | static int check_unshare_flags(unsigned long unshare_flags) |
| 1519 | { | 1522 | { |
| 1523 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
| 1524 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
| 1525 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
| 1526 | return -EINVAL; | ||
| 1520 | /* | 1527 | /* |
| 1521 | * If unsharing a thread from a thread group, must also | 1528 | * Not implemented, but pretend it works if there is nothing to |
| 1522 | * unshare vm. | 1529 | * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND |
| 1523 | */ | 1530 | * needs to unshare vm. |
| 1524 | if (*flags_ptr & CLONE_THREAD) | ||
| 1525 | *flags_ptr |= CLONE_VM; | ||
| 1526 | |||
| 1527 | /* | ||
| 1528 | * If unsharing vm, must also unshare signal handlers. | ||
| 1529 | */ | ||
| 1530 | if (*flags_ptr & CLONE_VM) | ||
| 1531 | *flags_ptr |= CLONE_SIGHAND; | ||
| 1532 | |||
| 1533 | /* | ||
| 1534 | * If unsharing namespace, must also unshare filesystem information. | ||
| 1535 | */ | 1531 | */ |
| 1536 | if (*flags_ptr & CLONE_NEWNS) | 1532 | if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { |
| 1537 | *flags_ptr |= CLONE_FS; | 1533 | /* FIXME: get_task_mm() increments ->mm_users */ |
| 1538 | } | 1534 | if (atomic_read(¤t->mm->mm_users) > 1) |
| 1539 | 1535 | return -EINVAL; | |
| 1540 | /* | 1536 | } |
| 1541 | * Unsharing of tasks created with CLONE_THREAD is not supported yet | ||
| 1542 | */ | ||
| 1543 | static int unshare_thread(unsigned long unshare_flags) | ||
| 1544 | { | ||
| 1545 | if (unshare_flags & CLONE_THREAD) | ||
| 1546 | return -EINVAL; | ||
| 1547 | 1537 | ||
| 1548 | return 0; | 1538 | return 0; |
| 1549 | } | 1539 | } |
| @@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
| 1570 | } | 1560 | } |
| 1571 | 1561 | ||
| 1572 | /* | 1562 | /* |
| 1573 | * Unsharing of sighand is not supported yet | ||
| 1574 | */ | ||
| 1575 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | ||
| 1576 | { | ||
| 1577 | struct sighand_struct *sigh = current->sighand; | ||
| 1578 | |||
| 1579 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) | ||
| 1580 | return -EINVAL; | ||
| 1581 | else | ||
| 1582 | return 0; | ||
| 1583 | } | ||
| 1584 | |||
| 1585 | /* | ||
| 1586 | * Unshare vm if it is being shared | ||
| 1587 | */ | ||
| 1588 | static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) | ||
| 1589 | { | ||
| 1590 | struct mm_struct *mm = current->mm; | ||
| 1591 | |||
| 1592 | if ((unshare_flags & CLONE_VM) && | ||
| 1593 | (mm && atomic_read(&mm->mm_users) > 1)) { | ||
| 1594 | return -EINVAL; | ||
| 1595 | } | ||
| 1596 | |||
| 1597 | return 0; | ||
| 1598 | } | ||
| 1599 | |||
| 1600 | /* | ||
| 1601 | * Unshare file descriptor table if it is being shared | 1563 | * Unshare file descriptor table if it is being shared |
| 1602 | */ | 1564 | */ |
| 1603 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) | 1565 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
| @@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
| 1625 | */ | 1587 | */ |
| 1626 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | 1588 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) |
| 1627 | { | 1589 | { |
| 1628 | int err = 0; | ||
| 1629 | struct fs_struct *fs, *new_fs = NULL; | 1590 | struct fs_struct *fs, *new_fs = NULL; |
| 1630 | struct sighand_struct *new_sigh = NULL; | ||
| 1631 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | ||
| 1632 | struct files_struct *fd, *new_fd = NULL; | 1591 | struct files_struct *fd, *new_fd = NULL; |
| 1633 | struct nsproxy *new_nsproxy = NULL; | 1592 | struct nsproxy *new_nsproxy = NULL; |
| 1634 | int do_sysvsem = 0; | 1593 | int do_sysvsem = 0; |
| 1594 | int err; | ||
| 1635 | 1595 | ||
| 1636 | check_unshare_flags(&unshare_flags); | 1596 | err = check_unshare_flags(unshare_flags); |
| 1637 | 1597 | if (err) | |
| 1638 | /* Return -EINVAL for all unsupported flags */ | ||
| 1639 | err = -EINVAL; | ||
| 1640 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
| 1641 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
| 1642 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
| 1643 | goto bad_unshare_out; | 1598 | goto bad_unshare_out; |
| 1644 | 1599 | ||
| 1645 | /* | 1600 | /* |
| 1601 | * If unsharing namespace, must also unshare filesystem information. | ||
| 1602 | */ | ||
| 1603 | if (unshare_flags & CLONE_NEWNS) | ||
| 1604 | unshare_flags |= CLONE_FS; | ||
| 1605 | /* | ||
| 1646 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1606 | * CLONE_NEWIPC must also detach from the undolist: after switching |
| 1647 | * to a new ipc namespace, the semaphore arrays from the old | 1607 | * to a new ipc namespace, the semaphore arrays from the old |
| 1648 | * namespace are unreachable. | 1608 | * namespace are unreachable. |
| 1649 | */ | 1609 | */ |
| 1650 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1610 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
| 1651 | do_sysvsem = 1; | 1611 | do_sysvsem = 1; |
| 1652 | if ((err = unshare_thread(unshare_flags))) | ||
| 1653 | goto bad_unshare_out; | ||
| 1654 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1612 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
| 1655 | goto bad_unshare_cleanup_thread; | 1613 | goto bad_unshare_out; |
| 1656 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | ||
| 1657 | goto bad_unshare_cleanup_fs; | ||
| 1658 | if ((err = unshare_vm(unshare_flags, &new_mm))) | ||
| 1659 | goto bad_unshare_cleanup_sigh; | ||
| 1660 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1614 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
| 1661 | goto bad_unshare_cleanup_vm; | 1615 | goto bad_unshare_cleanup_fs; |
| 1662 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1616 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
| 1663 | new_fs))) | 1617 | new_fs))) |
| 1664 | goto bad_unshare_cleanup_fd; | 1618 | goto bad_unshare_cleanup_fd; |
| 1665 | 1619 | ||
| 1666 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { | 1620 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |
| 1667 | if (do_sysvsem) { | 1621 | if (do_sysvsem) { |
| 1668 | /* | 1622 | /* |
| 1669 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1623 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
| @@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1689 | spin_unlock(&fs->lock); | 1643 | spin_unlock(&fs->lock); |
| 1690 | } | 1644 | } |
| 1691 | 1645 | ||
| 1692 | if (new_mm) { | ||
| 1693 | mm = current->mm; | ||
| 1694 | active_mm = current->active_mm; | ||
| 1695 | current->mm = new_mm; | ||
| 1696 | current->active_mm = new_mm; | ||
| 1697 | if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | ||
| 1698 | atomic_dec(&mm->oom_disable_count); | ||
| 1699 | atomic_inc(&new_mm->oom_disable_count); | ||
| 1700 | } | ||
| 1701 | activate_mm(active_mm, new_mm); | ||
| 1702 | new_mm = mm; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | if (new_fd) { | 1646 | if (new_fd) { |
| 1706 | fd = current->files; | 1647 | fd = current->files; |
| 1707 | current->files = new_fd; | 1648 | current->files = new_fd; |
| @@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd: | |||
| 1718 | if (new_fd) | 1659 | if (new_fd) |
| 1719 | put_files_struct(new_fd); | 1660 | put_files_struct(new_fd); |
| 1720 | 1661 | ||
| 1721 | bad_unshare_cleanup_vm: | ||
| 1722 | if (new_mm) | ||
| 1723 | mmput(new_mm); | ||
| 1724 | |||
| 1725 | bad_unshare_cleanup_sigh: | ||
| 1726 | if (new_sigh) | ||
| 1727 | if (atomic_dec_and_test(&new_sigh->count)) | ||
| 1728 | kmem_cache_free(sighand_cachep, new_sigh); | ||
| 1729 | |||
| 1730 | bad_unshare_cleanup_fs: | 1662 | bad_unshare_cleanup_fs: |
| 1731 | if (new_fs) | 1663 | if (new_fs) |
| 1732 | free_fs_struct(new_fs); | 1664 | free_fs_struct(new_fs); |
| 1733 | 1665 | ||
| 1734 | bad_unshare_cleanup_thread: | ||
| 1735 | bad_unshare_out: | 1666 | bad_unshare_out: |
| 1736 | return err; | 1667 | return err; |
| 1737 | } | 1668 | } |
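The sys_unshare() rework above folds the old unshare_thread/unshare_sighand/unshare_vm helpers into a single check_unshare_flags() that rejects unsupported flag combinations up front, and moves the "CLONE_NEWNS implies CLONE_FS" adjustment into the syscall itself. A small user-space sketch of exercising that validation is below; the flag choices are illustrative, and whether a given combination succeeds also depends on privileges and kernel version.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	/* Unsharing the fs_struct and the file descriptor table needs no
	 * special privilege, so this is expected to succeed. */
	if (unshare(CLONE_FS | CLONE_FILES) == -1)
		printf("unshare(CLONE_FS|CLONE_FILES): %s\n", strerror(errno));
	else
		printf("unshare(CLONE_FS|CLONE_FILES): ok\n");

	/* CLONE_VFORK is not in the mask accepted by check_unshare_flags(),
	 * so the call is rejected with EINVAL before any work is done. */
	if (unshare(CLONE_VFORK) == -1)
		printf("unshare(CLONE_VFORK): %s\n", strerror(errno));
	return 0;
}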
diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..fe28dc282eae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | |||
| 381 | return NULL; | 381 | return NULL; |
| 382 | } | 382 | } |
| 383 | 383 | ||
| 384 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 384 | static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, |
| 385 | u32 uval, u32 newval) | ||
| 385 | { | 386 | { |
| 386 | u32 curval; | 387 | int ret; |
| 387 | 388 | ||
| 388 | pagefault_disable(); | 389 | pagefault_disable(); |
| 389 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 390 | ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); |
| 390 | pagefault_enable(); | 391 | pagefault_enable(); |
| 391 | 392 | ||
| 392 | return curval; | 393 | return ret; |
| 393 | } | 394 | } |
| 394 | 395 | ||
| 395 | static int get_futex_value_locked(u32 *dest, u32 __user *from) | 396 | static int get_futex_value_locked(u32 *dest, u32 __user *from) |
| @@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
| 674 | struct task_struct *task, int set_waiters) | 675 | struct task_struct *task, int set_waiters) |
| 675 | { | 676 | { |
| 676 | int lock_taken, ret, ownerdied = 0; | 677 | int lock_taken, ret, ownerdied = 0; |
| 677 | u32 uval, newval, curval; | 678 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
| 678 | 679 | ||
| 679 | retry: | 680 | retry: |
| 680 | ret = lock_taken = 0; | 681 | ret = lock_taken = 0; |
| @@ -684,19 +685,17 @@ retry: | |||
| 684 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | 685 | * (by doing a 0 -> TID atomic cmpxchg), while holding all |
| 685 | * the locks. It will most likely not succeed. | 686 | * the locks. It will most likely not succeed. |
| 686 | */ | 687 | */ |
| 687 | newval = task_pid_vnr(task); | 688 | newval = vpid; |
| 688 | if (set_waiters) | 689 | if (set_waiters) |
| 689 | newval |= FUTEX_WAITERS; | 690 | newval |= FUTEX_WAITERS; |
| 690 | 691 | ||
| 691 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | 692 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) |
| 692 | |||
| 693 | if (unlikely(curval == -EFAULT)) | ||
| 694 | return -EFAULT; | 693 | return -EFAULT; |
| 695 | 694 | ||
| 696 | /* | 695 | /* |
| 697 | * Detect deadlocks. | 696 | * Detect deadlocks. |
| 698 | */ | 697 | */ |
| 699 | if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) | 698 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) |
| 700 | return -EDEADLK; | 699 | return -EDEADLK; |
| 701 | 700 | ||
| 702 | /* | 701 | /* |
| @@ -723,14 +722,12 @@ retry: | |||
| 723 | */ | 722 | */ |
| 724 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | 723 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { |
| 725 | /* Keep the OWNER_DIED bit */ | 724 | /* Keep the OWNER_DIED bit */ |
| 726 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | 725 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
| 727 | ownerdied = 0; | 726 | ownerdied = 0; |
| 728 | lock_taken = 1; | 727 | lock_taken = 1; |
| 729 | } | 728 | } |
| 730 | 729 | ||
| 731 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 730 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
| 732 | |||
| 733 | if (unlikely(curval == -EFAULT)) | ||
| 734 | return -EFAULT; | 731 | return -EFAULT; |
| 735 | if (unlikely(curval != uval)) | 732 | if (unlikely(curval != uval)) |
| 736 | goto retry; | 733 | goto retry; |
| @@ -775,6 +772,24 @@ retry: | |||
| 775 | return ret; | 772 | return ret; |
| 776 | } | 773 | } |
| 777 | 774 | ||
| 775 | /** | ||
| 776 | * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket | ||
| 777 | * @q: The futex_q to unqueue | ||
| 778 | * | ||
| 779 | * The q->lock_ptr must not be NULL and must be held by the caller. | ||
| 780 | */ | ||
| 781 | static void __unqueue_futex(struct futex_q *q) | ||
| 782 | { | ||
| 783 | struct futex_hash_bucket *hb; | ||
| 784 | |||
| 785 | if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) | ||
| 786 | || WARN_ON(plist_node_empty(&q->list))) | ||
| 787 | return; | ||
| 788 | |||
| 789 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); | ||
| 790 | plist_del(&q->list, &hb->chain); | ||
| 791 | } | ||
| 792 | |||
| 778 | /* | 793 | /* |
| 779 | * The hash bucket lock must be held when this is called. | 794 | * The hash bucket lock must be held when this is called. |
| 780 | * Afterwards, the futex_q must not be accessed. | 795 | * Afterwards, the futex_q must not be accessed. |
| @@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q) | |||
| 792 | */ | 807 | */ |
| 793 | get_task_struct(p); | 808 | get_task_struct(p); |
| 794 | 809 | ||
| 795 | plist_del(&q->list, &q->list.plist); | 810 | __unqueue_futex(q); |
| 796 | /* | 811 | /* |
| 797 | * The waiting task can free the futex_q as soon as | 812 | * The waiting task can free the futex_q as soon as |
| 798 | * q->lock_ptr = NULL is written, without taking any locks. A | 813 | * q->lock_ptr = NULL is written, without taking any locks. A |
| @@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 843 | 858 | ||
| 844 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 859 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
| 845 | 860 | ||
| 846 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 861 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
| 847 | |||
| 848 | if (curval == -EFAULT) | ||
| 849 | ret = -EFAULT; | 862 | ret = -EFAULT; |
| 850 | else if (curval != uval) | 863 | else if (curval != uval) |
| 851 | ret = -EINVAL; | 864 | ret = -EINVAL; |
| @@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
| 880 | * There is no waiter, so we unlock the futex. The owner died | 893 | * There is no waiter, so we unlock the futex. The owner died |
| 881 | * bit has not to be preserved here. We are the owner: | 894 | * bit has not to be preserved here. We are the owner: |
| 882 | */ | 895 | */ |
| 883 | oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); | 896 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) |
| 884 | 897 | return -EFAULT; | |
| 885 | if (oldval == -EFAULT) | ||
| 886 | return oldval; | ||
| 887 | if (oldval != uval) | 898 | if (oldval != uval) |
| 888 | return -EAGAIN; | 899 | return -EAGAIN; |
| 889 | 900 | ||
| @@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | |||
| 1071 | plist_del(&q->list, &hb1->chain); | 1082 | plist_del(&q->list, &hb1->chain); |
| 1072 | plist_add(&q->list, &hb2->chain); | 1083 | plist_add(&q->list, &hb2->chain); |
| 1073 | q->lock_ptr = &hb2->lock; | 1084 | q->lock_ptr = &hb2->lock; |
| 1074 | #ifdef CONFIG_DEBUG_PI_LIST | ||
| 1075 | q->list.plist.spinlock = &hb2->lock; | ||
| 1076 | #endif | ||
| 1077 | } | 1085 | } |
| 1078 | get_futex_key_refs(key2); | 1086 | get_futex_key_refs(key2); |
| 1079 | q->key = *key2; | 1087 | q->key = *key2; |
| @@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
| 1100 | get_futex_key_refs(key); | 1108 | get_futex_key_refs(key); |
| 1101 | q->key = *key; | 1109 | q->key = *key; |
| 1102 | 1110 | ||
| 1103 | WARN_ON(plist_node_empty(&q->list)); | 1111 | __unqueue_futex(q); |
| 1104 | plist_del(&q->list, &q->list.plist); | ||
| 1105 | 1112 | ||
| 1106 | WARN_ON(!q->rt_waiter); | 1113 | WARN_ON(!q->rt_waiter); |
| 1107 | q->rt_waiter = NULL; | 1114 | q->rt_waiter = NULL; |
| 1108 | 1115 | ||
| 1109 | q->lock_ptr = &hb->lock; | 1116 | q->lock_ptr = &hb->lock; |
| 1110 | #ifdef CONFIG_DEBUG_PI_LIST | ||
| 1111 | q->list.plist.spinlock = &hb->lock; | ||
| 1112 | #endif | ||
| 1113 | 1117 | ||
| 1114 | wake_up_state(q->task, TASK_NORMAL); | 1118 | wake_up_state(q->task, TASK_NORMAL); |
| 1115 | } | 1119 | } |
| @@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
| 1457 | prio = min(current->normal_prio, MAX_RT_PRIO); | 1461 | prio = min(current->normal_prio, MAX_RT_PRIO); |
| 1458 | 1462 | ||
| 1459 | plist_node_init(&q->list, prio); | 1463 | plist_node_init(&q->list, prio); |
| 1460 | #ifdef CONFIG_DEBUG_PI_LIST | ||
| 1461 | q->list.plist.spinlock = &hb->lock; | ||
| 1462 | #endif | ||
| 1463 | plist_add(&q->list, &hb->chain); | 1464 | plist_add(&q->list, &hb->chain); |
| 1464 | q->task = current; | 1465 | q->task = current; |
| 1465 | spin_unlock(&hb->lock); | 1466 | spin_unlock(&hb->lock); |
| @@ -1504,8 +1505,7 @@ retry: | |||
| 1504 | spin_unlock(lock_ptr); | 1505 | spin_unlock(lock_ptr); |
| 1505 | goto retry; | 1506 | goto retry; |
| 1506 | } | 1507 | } |
| 1507 | WARN_ON(plist_node_empty(&q->list)); | 1508 | __unqueue_futex(q); |
| 1508 | plist_del(&q->list, &q->list.plist); | ||
| 1509 | 1509 | ||
| 1510 | BUG_ON(q->pi_state); | 1510 | BUG_ON(q->pi_state); |
| 1511 | 1511 | ||
| @@ -1525,8 +1525,7 @@ retry: | |||
| 1525 | static void unqueue_me_pi(struct futex_q *q) | 1525 | static void unqueue_me_pi(struct futex_q *q) |
| 1526 | __releases(q->lock_ptr) | 1526 | __releases(q->lock_ptr) |
| 1527 | { | 1527 | { |
| 1528 | WARN_ON(plist_node_empty(&q->list)); | 1528 | __unqueue_futex(q); |
| 1529 | plist_del(&q->list, &q->list.plist); | ||
| 1530 | 1529 | ||
| 1531 | BUG_ON(!q->pi_state); | 1530 | BUG_ON(!q->pi_state); |
| 1532 | free_pi_state(q->pi_state); | 1531 | free_pi_state(q->pi_state); |
| @@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
| 1556 | 1555 | ||
| 1557 | /* | 1556 | /* |
| 1558 | * We are here either because we stole the rtmutex from the | 1557 | * We are here either because we stole the rtmutex from the |
| 1559 | * pending owner or we are the pending owner which failed to | 1558 | * previous highest priority waiter or we are the highest priority |
| 1560 | * get the rtmutex. We have to replace the pending owner TID | 1559 | * waiter but failed to get the rtmutex the first time. |
| 1561 | * in the user space variable. This must be atomic as we have | 1560 | * We have to replace the newowner TID in the user space variable. |
| 1562 | * to preserve the owner died bit here. | 1561 | * This must be atomic as we have to preserve the owner died bit here. |
| 1563 | * | 1562 | * |
| 1564 | * Note: We write the user space value _before_ changing the pi_state | 1563 | * Note: We write the user space value _before_ changing the pi_state |
| 1565 | * because we can fault here. Imagine swapped out pages or a fork | 1564 | * because we can fault here. Imagine swapped out pages or a fork |
| @@ -1578,9 +1577,7 @@ retry: | |||
| 1578 | while (1) { | 1577 | while (1) { |
| 1579 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 1578 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
| 1580 | 1579 | ||
| 1581 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 1580 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
| 1582 | |||
| 1583 | if (curval == -EFAULT) | ||
| 1584 | goto handle_fault; | 1581 | goto handle_fault; |
| 1585 | if (curval == uval) | 1582 | if (curval == uval) |
| 1586 | break; | 1583 | break; |
| @@ -1608,8 +1605,8 @@ retry: | |||
| 1608 | 1605 | ||
| 1609 | /* | 1606 | /* |
| 1610 | * To handle the page fault we need to drop the hash bucket | 1607 | * To handle the page fault we need to drop the hash bucket |
| 1611 | * lock here. That gives the other task (either the pending | 1608 | * lock here. That gives the other task (either the highest priority |
| 1612 | * owner itself or the task which stole the rtmutex) the | 1609 | * waiter itself or the task which stole the rtmutex) the |
| 1613 | * chance to try the fixup of the pi_state. So once we are | 1610 | * chance to try the fixup of the pi_state. So once we are |
| 1614 | * back from handling the fault we need to check the pi_state | 1611 | * back from handling the fault we need to check the pi_state |
| 1615 | * after reacquiring the hash bucket lock and before trying to | 1612 | * after reacquiring the hash bucket lock and before trying to |
| @@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
| 1685 | /* | 1682 | /* |
| 1686 | * pi_state is incorrect, some other task did a lock steal and | 1683 | * pi_state is incorrect, some other task did a lock steal and |
| 1687 | * we returned due to timeout or signal without taking the | 1684 | * we returned due to timeout or signal without taking the |
| 1688 | * rt_mutex. Too late. We can access the rt_mutex_owner without | 1685 | * rt_mutex. Too late. |
| 1689 | * locking, as the other task is now blocked on the hash bucket | ||
| 1690 | * lock. Fix the state up. | ||
| 1691 | */ | 1686 | */ |
| 1687 | raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); | ||
| 1692 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | 1688 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); |
| 1689 | if (!owner) | ||
| 1690 | owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); | ||
| 1691 | raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); | ||
| 1693 | ret = fixup_pi_state_owner(uaddr, q, owner); | 1692 | ret = fixup_pi_state_owner(uaddr, q, owner); |
| 1694 | goto out; | 1693 | goto out; |
| 1695 | } | 1694 | } |
| 1696 | 1695 | ||
| 1697 | /* | 1696 | /* |
| 1698 | * Paranoia check. If we did not take the lock, then we should not be | 1697 | * Paranoia check. If we did not take the lock, then we should not be |
| 1699 | * the owner, nor the pending owner, of the rt_mutex. | 1698 | * the owner of the rt_mutex. |
| 1700 | */ | 1699 | */ |
| 1701 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | 1700 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) |
| 1702 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | 1701 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " |
| @@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | |||
| 1781 | * | 1780 | * |
| 1782 | * The basic logical guarantee of a futex is that it blocks ONLY | 1781 | * The basic logical guarantee of a futex is that it blocks ONLY |
| 1783 | * if cond(var) is known to be true at the time of blocking, for | 1782 | * if cond(var) is known to be true at the time of blocking, for |
| 1784 | * any cond. If we queued after testing *uaddr, that would open | 1783 | * any cond. If we locked the hash-bucket after testing *uaddr, that |
| 1785 | * a race condition where we could block indefinitely with | 1784 | * would open a race condition where we could block indefinitely with |
| 1786 | * cond(var) false, which would violate the guarantee. | 1785 | * cond(var) false, which would violate the guarantee. |
| 1787 | * | 1786 | * |
| 1788 | * A consequence is that futex_wait() can return zero and absorb | 1787 | * On the other hand, we insert q and release the hash-bucket only |
| 1789 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1788 | * after testing *uaddr. This guarantees that futex_wait() will NOT |
| 1790 | * rare, but normal. | 1789 | * absorb a wakeup if *uaddr does not match the desired values |
| 1790 | * while the syscall executes. | ||
| 1791 | */ | 1791 | */ |
| 1792 | retry: | 1792 | retry: |
| 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); | 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); |
| @@ -1886,7 +1886,7 @@ retry: | |||
| 1886 | restart->futex.val = val; | 1886 | restart->futex.val = val; |
| 1887 | restart->futex.time = abs_time->tv64; | 1887 | restart->futex.time = abs_time->tv64; |
| 1888 | restart->futex.bitset = bitset; | 1888 | restart->futex.bitset = bitset; |
| 1889 | restart->futex.flags = flags; | 1889 | restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; |
| 1890 | 1890 | ||
| 1891 | ret = -ERESTART_RESTARTBLOCK; | 1891 | ret = -ERESTART_RESTARTBLOCK; |
| 1892 | 1892 | ||
| @@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) | |||
| 2046 | { | 2046 | { |
| 2047 | struct futex_hash_bucket *hb; | 2047 | struct futex_hash_bucket *hb; |
| 2048 | struct futex_q *this, *next; | 2048 | struct futex_q *this, *next; |
| 2049 | u32 uval; | ||
| 2050 | struct plist_head *head; | 2049 | struct plist_head *head; |
| 2051 | union futex_key key = FUTEX_KEY_INIT; | 2050 | union futex_key key = FUTEX_KEY_INIT; |
| 2051 | u32 uval, vpid = task_pid_vnr(current); | ||
| 2052 | int ret; | 2052 | int ret; |
| 2053 | 2053 | ||
| 2054 | retry: | 2054 | retry: |
| @@ -2057,7 +2057,7 @@ retry: | |||
| 2057 | /* | 2057 | /* |
| 2058 | * We release only a lock we actually own: | 2058 | * We release only a lock we actually own: |
| 2059 | */ | 2059 | */ |
| 2060 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2060 | if ((uval & FUTEX_TID_MASK) != vpid) |
| 2061 | return -EPERM; | 2061 | return -EPERM; |
| 2062 | 2062 | ||
| 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
| @@ -2072,17 +2072,14 @@ retry: | |||
| 2072 | * again. If it succeeds then we can return without waking | 2072 | * again. If it succeeds then we can return without waking |
| 2073 | * anyone else up: | 2073 | * anyone else up: |
| 2074 | */ | 2074 | */ |
| 2075 | if (!(uval & FUTEX_OWNER_DIED)) | 2075 | if (!(uval & FUTEX_OWNER_DIED) && |
| 2076 | uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); | 2076 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
| 2077 | |||
| 2078 | |||
| 2079 | if (unlikely(uval == -EFAULT)) | ||
| 2080 | goto pi_faulted; | 2077 | goto pi_faulted; |
| 2081 | /* | 2078 | /* |
| 2082 | * Rare case: we managed to release the lock atomically, | 2079 | * Rare case: we managed to release the lock atomically, |
| 2083 | * no need to wake anyone else up: | 2080 | * no need to wake anyone else up: |
| 2084 | */ | 2081 | */ |
| 2085 | if (unlikely(uval == task_pid_vnr(current))) | 2082 | if (unlikely(uval == vpid)) |
| 2086 | goto out_unlock; | 2083 | goto out_unlock; |
| 2087 | 2084 | ||
| 2088 | /* | 2085 | /* |
| @@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
| 2167 | * We were woken prior to requeue by a timeout or a signal. | 2164 | * We were woken prior to requeue by a timeout or a signal. |
| 2168 | * Unqueue the futex_q and determine which it was. | 2165 | * Unqueue the futex_q and determine which it was. |
| 2169 | */ | 2166 | */ |
| 2170 | plist_del(&q->list, &q->list.plist); | 2167 | plist_del(&q->list, &hb->chain); |
| 2171 | 2168 | ||
| 2172 | /* Handle spurious wakeups gracefully */ | 2169 | /* Handle spurious wakeups gracefully */ |
| 2173 | ret = -EWOULDBLOCK; | 2170 | ret = -EWOULDBLOCK; |
| @@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, | |||
| 2421 | goto err_unlock; | 2418 | goto err_unlock; |
| 2422 | ret = -EPERM; | 2419 | ret = -EPERM; |
| 2423 | pcred = __task_cred(p); | 2420 | pcred = __task_cred(p); |
| 2421 | /* If victim is in different user_ns, then uids are not | ||
| 2422 | comparable, so we must have CAP_SYS_PTRACE */ | ||
| 2423 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
| 2424 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
| 2425 | goto err_unlock; | ||
| 2426 | goto ok; | ||
| 2427 | } | ||
| 2428 | /* If victim is in same user_ns, then uids are comparable */ | ||
| 2424 | if (cred->euid != pcred->euid && | 2429 | if (cred->euid != pcred->euid && |
| 2425 | cred->euid != pcred->uid && | 2430 | cred->euid != pcred->uid && |
| 2426 | !capable(CAP_SYS_PTRACE)) | 2431 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) |
| 2427 | goto err_unlock; | 2432 | goto err_unlock; |
| 2433 | ok: | ||
| 2428 | head = p->robust_list; | 2434 | head = p->robust_list; |
| 2429 | rcu_read_unlock(); | 2435 | rcu_read_unlock(); |
| 2430 | } | 2436 | } |
| @@ -2463,11 +2469,20 @@ retry: | |||
| 2463 | * userspace. | 2469 | * userspace. |
| 2464 | */ | 2470 | */ |
| 2465 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 2471 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
| 2466 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); | 2472 | /* |
| 2467 | 2473 | * We are not holding a lock here, but we want to have | |
| 2468 | if (nval == -EFAULT) | 2474 | * the pagefault_disable/enable() protection because |
| 2469 | return -1; | 2475 | * we want to handle the fault gracefully. If the |
| 2470 | 2476 | * access fails we try to fault in the futex with R/W | |
| 2477 | * verification via get_user_pages. get_user() above | ||
| 2478 | * does not guarantee R/W access. If that fails we | ||
| 2479 | * give up and leave the futex locked. | ||
| 2480 | */ | ||
| 2481 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { | ||
| 2482 | if (fault_in_user_writeable(uaddr)) | ||
| 2483 | return -1; | ||
| 2484 | goto retry; | ||
| 2485 | } | ||
| 2471 | if (nval != uval) | 2486 | if (nval != uval) |
| 2472 | goto retry; | 2487 | goto retry; |
| 2473 | 2488 | ||
| @@ -2678,8 +2693,7 @@ static int __init futex_init(void) | |||
| 2678 | * implementation, the non-functional ones will return | 2693 | * implementation, the non-functional ones will return |
| 2679 | * -ENOSYS. | 2694 | * -ENOSYS. |
| 2680 | */ | 2695 | */ |
| 2681 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2696 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) |
| 2682 | if (curval == -EFAULT) | ||
| 2683 | futex_cmpxchg_enabled = 1; | 2697 | futex_cmpxchg_enabled = 1; |
| 2684 | 2698 | ||
| 2685 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2699 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
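The recurring change in the futex.c hunks is the calling convention of cmpxchg_futex_value_locked(): instead of overloading the returned futex value with -EFAULT, the fault status now comes back as the return code and the value actually observed is written through a pointer, so callers check the return value for faults and compare *curval against the expected value for contention. A user-space sketch of the same "separate the error from the value" pattern using C11 atomics follows; the names are made up for illustration and no fault can occur in this toy version.

#include <stdatomic.h>
#include <stdio.h>

/*
 * Same shape as the reworked cmpxchg_futex_value_locked(): the return
 * value reports only whether the access worked (the real helper returns
 * -EFAULT on a faulting user access), while the value observed at *addr
 * is stored through curval.  Whether the exchange happened is decided by
 * the caller comparing *curval with the expected old value.
 */
static int cmpxchg_value(unsigned int *curval, _Atomic unsigned int *addr,
			 unsigned int oldval, unsigned int newval)
{
	unsigned int expected = oldval;

	atomic_compare_exchange_strong(addr, &expected, newval);
	*curval = expected;		/* value seen at *addr */
	return 0;			/* no fault possible in user space */
}

int main(void)
{
	_Atomic unsigned int futex_word = 0;
	unsigned int curval;

	if (cmpxchg_value(&curval, &futex_word, 0, 1234))
		return 1;		/* the -EFAULT path in the kernel */

	if (curval != 0)
		printf("word was contended, observed %u\n", curval);
	else
		printf("acquired, word is now %u\n", atomic_load(&futex_word));
	return 0;
}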
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac75e5b..5f9e689dc8f0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
| 153 | goto err_unlock; | 153 | goto err_unlock; |
| 154 | ret = -EPERM; | 154 | ret = -EPERM; |
| 155 | pcred = __task_cred(p); | 155 | pcred = __task_cred(p); |
| 156 | /* If victim is in different user_ns, then uids are not | ||
| 157 | comparable, so we must have CAP_SYS_PTRACE */ | ||
| 158 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
| 159 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
| 160 | goto err_unlock; | ||
| 161 | goto ok; | ||
| 162 | } | ||
| 163 | /* If victim is in same user_ns, then uids are comparable */ | ||
| 156 | if (cred->euid != pcred->euid && | 164 | if (cred->euid != pcred->euid && |
| 157 | cred->euid != pcred->uid && | 165 | cred->euid != pcred->uid && |
| 158 | !capable(CAP_SYS_PTRACE)) | 166 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) |
| 159 | goto err_unlock; | 167 | goto err_unlock; |
| 168 | ok: | ||
| 160 | head = p->compat_robust_list; | 169 | head = p->compat_robust_list; |
| 161 | rcu_read_unlock(); | 170 | rcu_read_unlock(); |
| 162 | } | 171 | } |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 70a298d6da71..b8cadf70b1fb 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -34,7 +34,7 @@ config GCOV_KERNEL | |||
| 34 | config GCOV_PROFILE_ALL | 34 | config GCOV_PROFILE_ALL |
| 35 | bool "Profile entire Kernel" | 35 | bool "Profile entire Kernel" |
| 36 | depends on GCOV_KERNEL | 36 | depends on GCOV_KERNEL |
| 37 | depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE | 37 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE |
| 38 | default n | 38 | default n |
| 39 | ---help--- | 39 | ---help--- |
| 40 | This options activates profiling for the entire kernel. | 40 | This options activates profiling for the entire kernel. |
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d517..e97ca59e2520 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile | |||
| @@ -1,3 +1,3 @@ | |||
| 1 | EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' | 1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' |
| 2 | 2 | ||
| 3 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o | 3 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o |
diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf4..1cc476d52dd3 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
| 233 | struct group_info *group_info; | 233 | struct group_info *group_info; |
| 234 | int retval; | 234 | int retval; |
| 235 | 235 | ||
| 236 | if (!capable(CAP_SETGID)) | 236 | if (!nsown_capable(CAP_SETGID)) |
| 237 | return -EPERM; | 237 | return -EPERM; |
| 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 239 | return -EINVAL; | 239 | return -EINVAL; |
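The setgroups() change swaps capable(CAP_SETGID) for nsown_capable(CAP_SETGID), so the capability is checked against the caller's own user namespace. A quick unprivileged sketch of the EPERM path that check guards (the group value is arbitrary):

#define _GNU_SOURCE
#include <grp.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	gid_t groups[1] = { 0 };

	/* Without CAP_SETGID in the caller's user namespace, the kernel
	 * refuses to change the supplementary group list. */
	if (setgroups(1, groups) == -1)
		printf("setgroups: %s\n", strerror(errno));
	else
		printf("setgroups succeeded (CAP_SETGID present)\n");
	return 0;
}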
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..87fdb3f8db14 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -53,11 +53,10 @@ | |||
| 53 | /* | 53 | /* |
| 54 | * The timer bases: | 54 | * The timer bases: |
| 55 | * | 55 | * |
| 56 | * Note: If we want to add new timer bases, we have to skip the two | 56 | * There are more clockids then hrtimer bases. Thus, we index |
| 57 | * clock ids captured by the cpu-timers. We do this by holding empty | 57 | * into the timer bases by the hrtimer_base_type enum. When trying |
| 58 | * entries rather than doing math adjustment of the clock ids. | 58 | * to reach a base using a clockid, hrtimer_clockid_to_base() |
| 59 | * This ensures that we capture erroneous accesses to these clock ids | 59 | * is used to convert from clockid to the proper hrtimer_base_type. |
| 60 | * rather than moving them into the range of valid clock id's. | ||
| 61 | */ | 60 | */ |
| 62 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | 61 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = |
| 63 | { | 62 | { |
| @@ -74,30 +73,43 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
| 74 | .get_time = &ktime_get, | 73 | .get_time = &ktime_get, |
| 75 | .resolution = KTIME_LOW_RES, | 74 | .resolution = KTIME_LOW_RES, |
| 76 | }, | 75 | }, |
| 76 | { | ||
| 77 | .index = CLOCK_BOOTTIME, | ||
| 78 | .get_time = &ktime_get_boottime, | ||
| 79 | .resolution = KTIME_LOW_RES, | ||
| 80 | }, | ||
| 77 | } | 81 | } |
| 78 | }; | 82 | }; |
| 79 | 83 | ||
| 84 | static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { | ||
| 85 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | ||
| 86 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | ||
| 87 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | ||
| 88 | }; | ||
| 89 | |||
| 90 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) | ||
| 91 | { | ||
| 92 | return hrtimer_clock_to_base_table[clock_id]; | ||
| 93 | } | ||
| 94 | |||
| 95 | |||
| 80 | /* | 96 | /* |
| 81 | * Get the coarse grained time at the softirq based on xtime and | 97 | * Get the coarse grained time at the softirq based on xtime and |
| 82 | * wall_to_monotonic. | 98 | * wall_to_monotonic. |
| 83 | */ | 99 | */ |
| 84 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | 100 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) |
| 85 | { | 101 | { |
| 86 | ktime_t xtim, tomono; | 102 | ktime_t xtim, mono, boot; |
| 87 | struct timespec xts, tom; | 103 | struct timespec xts, tom, slp; |
| 88 | unsigned long seq; | ||
| 89 | 104 | ||
| 90 | do { | 105 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); |
| 91 | seq = read_seqbegin(&xtime_lock); | ||
| 92 | xts = __current_kernel_time(); | ||
| 93 | tom = __get_wall_to_monotonic(); | ||
| 94 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 95 | 106 | ||
| 96 | xtim = timespec_to_ktime(xts); | 107 | xtim = timespec_to_ktime(xts); |
| 97 | tomono = timespec_to_ktime(tom); | 108 | mono = ktime_add(xtim, timespec_to_ktime(tom)); |
| 98 | base->clock_base[CLOCK_REALTIME].softirq_time = xtim; | 109 | boot = ktime_add(mono, timespec_to_ktime(slp)); |
| 99 | base->clock_base[CLOCK_MONOTONIC].softirq_time = | 110 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
| 100 | ktime_add(xtim, tomono); | 111 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
| 112 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; | ||
| 101 | } | 113 | } |
| 102 | 114 | ||
| 103 | /* | 115 | /* |
| @@ -184,10 +196,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | |||
| 184 | struct hrtimer_cpu_base *new_cpu_base; | 196 | struct hrtimer_cpu_base *new_cpu_base; |
| 185 | int this_cpu = smp_processor_id(); | 197 | int this_cpu = smp_processor_id(); |
| 186 | int cpu = hrtimer_get_target(this_cpu, pinned); | 198 | int cpu = hrtimer_get_target(this_cpu, pinned); |
| 199 | int basenum = hrtimer_clockid_to_base(base->index); | ||
| 187 | 200 | ||
| 188 | again: | 201 | again: |
| 189 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); | 202 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); |
| 190 | new_base = &new_cpu_base->clock_base[base->index]; | 203 | new_base = &new_cpu_base->clock_base[basenum]; |
| 191 | 204 | ||
| 192 | if (base != new_base) { | 205 | if (base != new_base) { |
| 193 | /* | 206 | /* |
| @@ -334,6 +347,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); | |||
| 334 | 347 | ||
| 335 | static struct debug_obj_descr hrtimer_debug_descr; | 348 | static struct debug_obj_descr hrtimer_debug_descr; |
| 336 | 349 | ||
| 350 | static void *hrtimer_debug_hint(void *addr) | ||
| 351 | { | ||
| 352 | return ((struct hrtimer *) addr)->function; | ||
| 353 | } | ||
| 354 | |||
| 337 | /* | 355 | /* |
| 338 | * fixup_init is called when: | 356 | * fixup_init is called when: |
| 339 | * - an active object is initialized | 357 | * - an active object is initialized |
| @@ -393,6 +411,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) | |||
| 393 | 411 | ||
| 394 | static struct debug_obj_descr hrtimer_debug_descr = { | 412 | static struct debug_obj_descr hrtimer_debug_descr = { |
| 395 | .name = "hrtimer", | 413 | .name = "hrtimer", |
| 414 | .debug_hint = hrtimer_debug_hint, | ||
| 396 | .fixup_init = hrtimer_fixup_init, | 415 | .fixup_init = hrtimer_fixup_init, |
| 397 | .fixup_activate = hrtimer_fixup_activate, | 416 | .fixup_activate = hrtimer_fixup_activate, |
| 398 | .fixup_free = hrtimer_fixup_free, | 417 | .fixup_free = hrtimer_fixup_free, |
| @@ -611,24 +630,23 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
| 611 | static void retrigger_next_event(void *arg) | 630 | static void retrigger_next_event(void *arg) |
| 612 | { | 631 | { |
| 613 | struct hrtimer_cpu_base *base; | 632 | struct hrtimer_cpu_base *base; |
| 614 | struct timespec realtime_offset, wtm; | 633 | struct timespec realtime_offset, wtm, sleep; |
| 615 | unsigned long seq; | ||
| 616 | 634 | ||
| 617 | if (!hrtimer_hres_active()) | 635 | if (!hrtimer_hres_active()) |
| 618 | return; | 636 | return; |
| 619 | 637 | ||
| 620 | do { | 638 | get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, |
| 621 | seq = read_seqbegin(&xtime_lock); | 639 | &sleep); |
| 622 | wtm = __get_wall_to_monotonic(); | ||
| 623 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 624 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | 640 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); |
| 625 | 641 | ||
| 626 | base = &__get_cpu_var(hrtimer_bases); | 642 | base = &__get_cpu_var(hrtimer_bases); |
| 627 | 643 | ||
| 628 | /* Adjust CLOCK_REALTIME offset */ | 644 | /* Adjust CLOCK_REALTIME offset */ |
| 629 | raw_spin_lock(&base->lock); | 645 | raw_spin_lock(&base->lock); |
| 630 | base->clock_base[CLOCK_REALTIME].offset = | 646 | base->clock_base[HRTIMER_BASE_REALTIME].offset = |
| 631 | timespec_to_ktime(realtime_offset); | 647 | timespec_to_ktime(realtime_offset); |
| 648 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
| 649 | timespec_to_ktime(sleep); | ||
| 632 | 650 | ||
| 633 | hrtimer_force_reprogram(base, 0); | 651 | hrtimer_force_reprogram(base, 0); |
| 634 | raw_spin_unlock(&base->lock); | 652 | raw_spin_unlock(&base->lock); |
| @@ -673,14 +691,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
| 673 | } | 691 | } |
| 674 | 692 | ||
| 675 | /* | 693 | /* |
| 676 | * Initialize the high resolution related parts of a hrtimer | ||
| 677 | */ | ||
| 678 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | ||
| 679 | { | ||
| 680 | } | ||
| 681 | |||
| 682 | |||
| 683 | /* | ||
| 684 | * When High resolution timers are active, try to reprogram. Note, that in case | 694 | * When High resolution timers are active, try to reprogram. Note, that in case |
| 685 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | 695 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry |
| 686 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | 696 | * check happens. The timer gets enqueued into the rbtree. The reprogramming |
| @@ -725,8 +735,9 @@ static int hrtimer_switch_to_hres(void) | |||
| 725 | return 0; | 735 | return 0; |
| 726 | } | 736 | } |
| 727 | base->hres_active = 1; | 737 | base->hres_active = 1; |
| 728 | base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; | 738 | base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; |
| 729 | base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; | 739 | base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; |
| 740 | base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; | ||
| 730 | 741 | ||
| 731 | tick_setup_sched_timer(); | 742 | tick_setup_sched_timer(); |
| 732 | 743 | ||
| @@ -750,7 +761,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 750 | return 0; | 761 | return 0; |
| 751 | } | 762 | } |
| 752 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 763 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
| 753 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | ||
| 754 | 764 | ||
| 755 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 765 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
| 756 | 766 | ||
| @@ -1121,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1121 | enum hrtimer_mode mode) | 1131 | enum hrtimer_mode mode) |
| 1122 | { | 1132 | { |
| 1123 | struct hrtimer_cpu_base *cpu_base; | 1133 | struct hrtimer_cpu_base *cpu_base; |
| 1134 | int base; | ||
| 1124 | 1135 | ||
| 1125 | memset(timer, 0, sizeof(struct hrtimer)); | 1136 | memset(timer, 0, sizeof(struct hrtimer)); |
| 1126 | 1137 | ||
| @@ -1129,8 +1140,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1129 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) | 1140 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
| 1130 | clock_id = CLOCK_MONOTONIC; | 1141 | clock_id = CLOCK_MONOTONIC; |
| 1131 | 1142 | ||
| 1132 | timer->base = &cpu_base->clock_base[clock_id]; | 1143 | base = hrtimer_clockid_to_base(clock_id); |
| 1133 | hrtimer_init_timer_hres(timer); | 1144 | timer->base = &cpu_base->clock_base[base]; |
| 1134 | timerqueue_init(&timer->node); | 1145 | timerqueue_init(&timer->node); |
| 1135 | 1146 | ||
| 1136 | #ifdef CONFIG_TIMER_STATS | 1147 | #ifdef CONFIG_TIMER_STATS |
| @@ -1165,9 +1176,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); | |||
| 1165 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | 1176 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) |
| 1166 | { | 1177 | { |
| 1167 | struct hrtimer_cpu_base *cpu_base; | 1178 | struct hrtimer_cpu_base *cpu_base; |
| 1179 | int base = hrtimer_clockid_to_base(which_clock); | ||
| 1168 | 1180 | ||
| 1169 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1181 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
| 1170 | *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); | 1182 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); |
| 1171 | 1183 | ||
| 1172 | return 0; | 1184 | return 0; |
| 1173 | } | 1185 | } |
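The hrtimer hunks above add a third clock base for CLOCK_BOOTTIME and route clockids to bases through hrtimer_clockid_to_base() instead of indexing by clockid directly. From user space the new clock is reachable through the usual posix-clock calls; a minimal sketch is below. The fallback #define assumes the uapi value 7 for CLOCK_BOOTTIME on older headers, and older glibc needs -lrt for clock_gettime().

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* uapi value; older libcs may not define it */
#endif

int main(void)
{
	struct timespec mono, boot;

	if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
	    clock_gettime(CLOCK_BOOTTIME, &boot)) {
		perror("clock_gettime");
		return 1;
	}
	/* BOOTTIME advances across suspend while MONOTONIC does not, so the
	 * difference approximates the total time spent suspended. */
	printf("monotonic %ld.%09ld  boottime %ld.%09ld\n",
	       (long)mono.tv_sec, mono.tv_nsec,
	       (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}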
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686d..c574f9a12c48 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | # Select this to activate the generic irq options below | ||
| 1 | config HAVE_GENERIC_HARDIRQS | 2 | config HAVE_GENERIC_HARDIRQS |
| 2 | def_bool n | 3 | bool |
| 3 | 4 | ||
| 4 | if HAVE_GENERIC_HARDIRQS | 5 | if HAVE_GENERIC_HARDIRQS |
| 5 | menu "IRQ subsystem" | 6 | menu "IRQ subsystem" |
| @@ -9,28 +10,47 @@ menu "IRQ subsystem" | |||
| 9 | config GENERIC_HARDIRQS | 10 | config GENERIC_HARDIRQS |
| 10 | def_bool y | 11 | def_bool y |
| 11 | 12 | ||
| 12 | # Select this to disable the deprecated stuff | ||
| 13 | config GENERIC_HARDIRQS_NO_DEPRECATED | ||
| 14 | def_bool n | ||
| 15 | |||
| 16 | # Options selectable by the architecture code | 13 | # Options selectable by the architecture code |
| 14 | |||
| 15 | # Make sparse irq Kconfig switch below available | ||
| 17 | config HAVE_SPARSE_IRQ | 16 | config HAVE_SPARSE_IRQ |
| 18 | def_bool n | 17 | bool |
| 19 | 18 | ||
| 19 | # Enable the generic irq autoprobe mechanism | ||
| 20 | config GENERIC_IRQ_PROBE | 20 | config GENERIC_IRQ_PROBE |
| 21 | def_bool n | 21 | bool |
| 22 | |||
| 23 | # Use the generic /proc/interrupts implementation | ||
| 24 | config GENERIC_IRQ_SHOW | ||
| 25 | bool | ||
| 26 | |||
| 27 | # Print level/edge extra information | ||
| 28 | config GENERIC_IRQ_SHOW_LEVEL | ||
| 29 | bool | ||
| 22 | 30 | ||
| 31 | # Support for delayed migration from interrupt context | ||
| 23 | config GENERIC_PENDING_IRQ | 32 | config GENERIC_PENDING_IRQ |
| 24 | def_bool n | 33 | bool |
| 25 | 34 | ||
| 35 | # Alpha specific irq affinity mechanism | ||
| 26 | config AUTO_IRQ_AFFINITY | 36 | config AUTO_IRQ_AFFINITY |
| 27 | def_bool n | 37 | bool |
| 28 | |||
| 29 | config IRQ_PER_CPU | ||
| 30 | def_bool n | ||
| 31 | 38 | ||
| 39 | # Tasklet based software resend for pending interrupts on enable_irq() | ||
| 32 | config HARDIRQS_SW_RESEND | 40 | config HARDIRQS_SW_RESEND |
| 33 | def_bool n | 41 | bool |
| 42 | |||
| 43 | # Preflow handler support for fasteoi (sparc64) | ||
| 44 | config IRQ_PREFLOW_FASTEOI | ||
| 45 | bool | ||
| 46 | |||
| 47 | # Edge style eoi based handler (cell) | ||
| 48 | config IRQ_EDGE_EOI_HANDLER | ||
| 49 | bool | ||
| 50 | |||
| 51 | # Support forced irq threading | ||
| 52 | config IRQ_FORCED_THREADING | ||
| 53 | bool | ||
| 34 | 54 | ||
| 35 | config SPARSE_IRQ | 55 | config SPARSE_IRQ |
| 36 | bool "Support sparse irq numbering" | 56 | bool "Support sparse irq numbering" |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..342d8f44e401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
| @@ -17,7 +17,7 @@ | |||
| 17 | /* | 17 | /* |
| 18 | * Autodetection depends on the fact that any interrupt that | 18 | * Autodetection depends on the fact that any interrupt that |
| 19 | * comes in on to an unassigned handler will get stuck with | 19 | * comes in on to an unassigned handler will get stuck with |
| 20 | * "IRQ_WAITING" cleared and the interrupt disabled. | 20 | * "IRQS_WAITING" cleared and the interrupt disabled. |
| 21 | */ | 21 | */ |
| 22 | static DEFINE_MUTEX(probing_active); | 22 | static DEFINE_MUTEX(probing_active); |
| 23 | 23 | ||
| @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) | |||
| 32 | { | 32 | { |
| 33 | struct irq_desc *desc; | 33 | struct irq_desc *desc; |
| 34 | unsigned long mask = 0; | 34 | unsigned long mask = 0; |
| 35 | unsigned int status; | ||
| 36 | int i; | 35 | int i; |
| 37 | 36 | ||
| 38 | /* | 37 | /* |
| @@ -46,13 +45,7 @@ unsigned long probe_irq_on(void) | |||
| 46 | */ | 45 | */ |
| 47 | for_each_irq_desc_reverse(i, desc) { | 46 | for_each_irq_desc_reverse(i, desc) { |
| 48 | raw_spin_lock_irq(&desc->lock); | 47 | raw_spin_lock_irq(&desc->lock); |
| 49 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 48 | if (!desc->action && irq_settings_can_probe(desc)) { |
| 50 | /* | ||
| 51 | * An old-style architecture might still have | ||
| 52 | * the handle_bad_irq handler there: | ||
| 53 | */ | ||
| 54 | compat_irq_chip_set_default_handler(desc); | ||
| 55 | |||
| 56 | /* | 49 | /* |
| 57 | * Some chips need to know about probing in | 50 | * Some chips need to know about probing in |
| 58 | * progress: | 51 | * progress: |
| @@ -60,7 +53,7 @@ unsigned long probe_irq_on(void) | |||
| 60 | if (desc->irq_data.chip->irq_set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
| 61 | desc->irq_data.chip->irq_set_type(&desc->irq_data, | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
| 62 | IRQ_TYPE_PROBE); | 55 | IRQ_TYPE_PROBE); |
| 63 | desc->irq_data.chip->irq_startup(&desc->irq_data); | 56 | irq_startup(desc); |
| 64 | } | 57 | } |
| 65 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
| 66 | } | 59 | } |
| @@ -75,10 +68,10 @@ unsigned long probe_irq_on(void) | |||
| 75 | */ | 68 | */ |
| 76 | for_each_irq_desc_reverse(i, desc) { | 69 | for_each_irq_desc_reverse(i, desc) { |
| 77 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
| 78 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
| 79 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
| 80 | if (desc->irq_data.chip->irq_startup(&desc->irq_data)) | 73 | if (irq_startup(desc)) |
| 81 | desc->status |= IRQ_PENDING; | 74 | desc->istate |= IRQS_PENDING; |
| 82 | } | 75 | } |
| 83 | raw_spin_unlock_irq(&desc->lock); | 76 | raw_spin_unlock_irq(&desc->lock); |
| 84 | } | 77 | } |
| @@ -93,13 +86,12 @@ unsigned long probe_irq_on(void) | |||
| 93 | */ | 86 | */ |
| 94 | for_each_irq_desc(i, desc) { | 87 | for_each_irq_desc(i, desc) { |
| 95 | raw_spin_lock_irq(&desc->lock); | 88 | raw_spin_lock_irq(&desc->lock); |
| 96 | status = desc->status; | ||
| 97 | 89 | ||
| 98 | if (status & IRQ_AUTODETECT) { | 90 | if (desc->istate & IRQS_AUTODETECT) { |
| 99 | /* It triggered already - consider it spurious. */ | 91 | /* It triggered already - consider it spurious. */ |
| 100 | if (!(status & IRQ_WAITING)) { | 92 | if (!(desc->istate & IRQS_WAITING)) { |
| 101 | desc->status = status & ~IRQ_AUTODETECT; | 93 | desc->istate &= ~IRQS_AUTODETECT; |
| 102 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 94 | irq_shutdown(desc); |
| 103 | } else | 95 | } else |
| 104 | if (i < 32) | 96 | if (i < 32) |
| 105 | mask |= 1 << i; | 97 | mask |= 1 << i; |
| @@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on); | |||
| 125 | */ | 117 | */ |
| 126 | unsigned int probe_irq_mask(unsigned long val) | 118 | unsigned int probe_irq_mask(unsigned long val) |
| 127 | { | 119 | { |
| 128 | unsigned int status, mask = 0; | 120 | unsigned int mask = 0; |
| 129 | struct irq_desc *desc; | 121 | struct irq_desc *desc; |
| 130 | int i; | 122 | int i; |
| 131 | 123 | ||
| 132 | for_each_irq_desc(i, desc) { | 124 | for_each_irq_desc(i, desc) { |
| 133 | raw_spin_lock_irq(&desc->lock); | 125 | raw_spin_lock_irq(&desc->lock); |
| 134 | status = desc->status; | 126 | if (desc->istate & IRQS_AUTODETECT) { |
| 135 | 127 | if (i < 16 && !(desc->istate & IRQS_WAITING)) | |
| 136 | if (status & IRQ_AUTODETECT) { | ||
| 137 | if (i < 16 && !(status & IRQ_WAITING)) | ||
| 138 | mask |= 1 << i; | 128 | mask |= 1 << i; |
| 139 | 129 | ||
| 140 | desc->status = status & ~IRQ_AUTODETECT; | 130 | desc->istate &= ~IRQS_AUTODETECT; |
| 141 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 131 | irq_shutdown(desc); |
| 142 | } | 132 | } |
| 143 | raw_spin_unlock_irq(&desc->lock); | 133 | raw_spin_unlock_irq(&desc->lock); |
| 144 | } | 134 | } |
| @@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val) | |||
| 169 | { | 159 | { |
| 170 | int i, irq_found = 0, nr_of_irqs = 0; | 160 | int i, irq_found = 0, nr_of_irqs = 0; |
| 171 | struct irq_desc *desc; | 161 | struct irq_desc *desc; |
| 172 | unsigned int status; | ||
| 173 | 162 | ||
| 174 | for_each_irq_desc(i, desc) { | 163 | for_each_irq_desc(i, desc) { |
| 175 | raw_spin_lock_irq(&desc->lock); | 164 | raw_spin_lock_irq(&desc->lock); |
| 176 | status = desc->status; | ||
| 177 | 165 | ||
| 178 | if (status & IRQ_AUTODETECT) { | 166 | if (desc->istate & IRQS_AUTODETECT) { |
| 179 | if (!(status & IRQ_WAITING)) { | 167 | if (!(desc->istate & IRQS_WAITING)) { |
| 180 | if (!nr_of_irqs) | 168 | if (!nr_of_irqs) |
| 181 | irq_found = i; | 169 | irq_found = i; |
| 182 | nr_of_irqs++; | 170 | nr_of_irqs++; |
| 183 | } | 171 | } |
| 184 | desc->status = status & ~IRQ_AUTODETECT; | 172 | desc->istate &= ~IRQS_AUTODETECT; |
| 185 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 173 | irq_shutdown(desc); |
| 186 | } | 174 | } |
| 187 | raw_spin_unlock_irq(&desc->lock); | 175 | raw_spin_unlock_irq(&desc->lock); |
| 188 | } | 176 | } |
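The autoprobe conversion above only changes the internal bookkeeping: the IRQS_AUTODETECT/IRQS_WAITING/IRQS_PENDING bits now live in desc->istate, and the chip callbacks are reached through the new irq_startup()/irq_shutdown() helpers. The driver-facing probing API is unchanged; for reference, the usual calling sequence is roughly the sketch below, where my_device_trigger_irq() is a hypothetical stand-in for whatever makes the hardware raise its interrupt.

        unsigned long mask;
        int irq;

        mask = probe_irq_on();
        my_device_trigger_irq(dev);     /* hypothetical: device-specific trigger */
        udelay(100);                    /* give the interrupt time to arrive */
        irq = probe_irq_off(mask);      /* >0: probed irq, 0: none, <0: several fired */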
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..4af1e2b244cb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -19,140 +19,115 @@ | |||
| 19 | #include "internals.h" | 19 | #include "internals.h" |
| 20 | 20 | ||
| 21 | /** | 21 | /** |
| 22 | * set_irq_chip - set the irq chip for an irq | 22 | * irq_set_chip - set the irq chip for an irq |
| 23 | * @irq: irq number | 23 | * @irq: irq number |
| 24 | * @chip: pointer to irq chip description structure | 24 | * @chip: pointer to irq chip description structure |
| 25 | */ | 25 | */ |
| 26 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | 26 | int irq_set_chip(unsigned int irq, struct irq_chip *chip) |
| 27 | { | 27 | { |
| 28 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 29 | unsigned long flags; | 28 | unsigned long flags; |
| 29 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 30 | 30 | ||
| 31 | if (!desc) { | 31 | if (!desc) |
| 32 | WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
| 33 | return -EINVAL; | 32 | return -EINVAL; |
| 34 | } | ||
| 35 | 33 | ||
| 36 | if (!chip) | 34 | if (!chip) |
| 37 | chip = &no_irq_chip; | 35 | chip = &no_irq_chip; |
| 38 | 36 | ||
| 39 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 40 | irq_chip_set_defaults(chip); | ||
| 41 | desc->irq_data.chip = chip; | 37 | desc->irq_data.chip = chip; |
| 42 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 38 | irq_put_desc_unlock(desc, flags); |
| 43 | 39 | /* | |
| 40 | * For !CONFIG_SPARSE_IRQ make the irq show up in | ||
| 41 | * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is | ||
| 42 | * already marked, and this call is harmless. | ||
| 43 | */ | ||
| 44 | irq_reserve_irq(irq); | ||
| 44 | return 0; | 45 | return 0; |
| 45 | } | 46 | } |
| 46 | EXPORT_SYMBOL(set_irq_chip); | 47 | EXPORT_SYMBOL(irq_set_chip); |
| 47 | 48 | ||
| 48 | /** | 49 | /** |
| 49 | * set_irq_type - set the irq trigger type for an irq | 50 | * irq_set_type - set the irq trigger type for an irq |
| 50 | * @irq: irq number | 51 | * @irq: irq number |
| 51 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h | 52 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h |
| 52 | */ | 53 | */ |
| 53 | int set_irq_type(unsigned int irq, unsigned int type) | 54 | int irq_set_irq_type(unsigned int irq, unsigned int type) |
| 54 | { | 55 | { |
| 55 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 56 | unsigned long flags; | 56 | unsigned long flags; |
| 57 | int ret = -ENXIO; | 57 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); |
| 58 | int ret = 0; | ||
| 58 | 59 | ||
| 59 | if (!desc) { | 60 | if (!desc) |
| 60 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | 61 | return -EINVAL; |
| 61 | return -ENODEV; | ||
| 62 | } | ||
| 63 | 62 | ||
| 64 | type &= IRQ_TYPE_SENSE_MASK; | 63 | type &= IRQ_TYPE_SENSE_MASK; |
| 65 | if (type == IRQ_TYPE_NONE) | 64 | if (type != IRQ_TYPE_NONE) |
| 66 | return 0; | 65 | ret = __irq_set_trigger(desc, irq, type); |
| 67 | 66 | irq_put_desc_busunlock(desc, flags); | |
| 68 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 69 | ret = __irq_set_trigger(desc, irq, type); | ||
| 70 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 71 | return ret; | 67 | return ret; |
| 72 | } | 68 | } |
| 73 | EXPORT_SYMBOL(set_irq_type); | 69 | EXPORT_SYMBOL(irq_set_irq_type); |
| 74 | 70 | ||
| 75 | /** | 71 | /** |
| 76 | * set_irq_data - set irq type data for an irq | 72 | * irq_set_handler_data - set irq handler data for an irq |
| 77 | * @irq: Interrupt number | 73 | * @irq: Interrupt number |
| 78 | * @data: Pointer to interrupt specific data | 74 | * @data: Pointer to interrupt specific data |
| 79 | * | 75 | * |
| 80 | * Set the hardware irq controller data for an irq | 76 | * Set the hardware irq controller data for an irq |
| 81 | */ | 77 | */ |
| 82 | int set_irq_data(unsigned int irq, void *data) | 78 | int irq_set_handler_data(unsigned int irq, void *data) |
| 83 | { | 79 | { |
| 84 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 85 | unsigned long flags; | 80 | unsigned long flags; |
| 81 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 86 | 82 | ||
| 87 | if (!desc) { | 83 | if (!desc) |
| 88 | printk(KERN_ERR | ||
| 89 | "Trying to install controller data for IRQ%d\n", irq); | ||
| 90 | return -EINVAL; | 84 | return -EINVAL; |
| 91 | } | ||
| 92 | |||
| 93 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 94 | desc->irq_data.handler_data = data; | 85 | desc->irq_data.handler_data = data; |
| 95 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 86 | irq_put_desc_unlock(desc, flags); |
| 96 | return 0; | 87 | return 0; |
| 97 | } | 88 | } |
| 98 | EXPORT_SYMBOL(set_irq_data); | 89 | EXPORT_SYMBOL(irq_set_handler_data); |
| 99 | 90 | ||
| 100 | /** | 91 | /** |
| 101 | * set_irq_msi - set MSI descriptor data for an irq | 92 | * irq_set_msi_desc - set MSI descriptor data for an irq |
| 102 | * @irq: Interrupt number | 93 | * @irq: Interrupt number |
| 103 | * @entry: Pointer to MSI descriptor data | 94 | * @entry: Pointer to MSI descriptor data |
| 104 | * | 95 | * |
| 105 | * Set the MSI descriptor entry for an irq | 96 | * Set the MSI descriptor entry for an irq |
| 106 | */ | 97 | */ |
| 107 | int set_irq_msi(unsigned int irq, struct msi_desc *entry) | 98 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) |
| 108 | { | 99 | { |
| 109 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 110 | unsigned long flags; | 100 | unsigned long flags; |
| 101 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 111 | 102 | ||
| 112 | if (!desc) { | 103 | if (!desc) |
| 113 | printk(KERN_ERR | ||
| 114 | "Trying to install msi data for IRQ%d\n", irq); | ||
| 115 | return -EINVAL; | 104 | return -EINVAL; |
| 116 | } | ||
| 117 | |||
| 118 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 119 | desc->irq_data.msi_desc = entry; | 105 | desc->irq_data.msi_desc = entry; |
| 120 | if (entry) | 106 | if (entry) |
| 121 | entry->irq = irq; | 107 | entry->irq = irq; |
| 122 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 108 | irq_put_desc_unlock(desc, flags); |
| 123 | return 0; | 109 | return 0; |
| 124 | } | 110 | } |
| 125 | 111 | ||
| 126 | /** | 112 | /** |
| 127 | * set_irq_chip_data - set irq chip data for an irq | 113 | * irq_set_chip_data - set irq chip data for an irq |
| 128 | * @irq: Interrupt number | 114 | * @irq: Interrupt number |
| 129 | * @data: Pointer to chip specific data | 115 | * @data: Pointer to chip specific data |
| 130 | * | 116 | * |
| 131 | * Set the hardware irq chip data for an irq | 117 | * Set the hardware irq chip data for an irq |
| 132 | */ | 118 | */ |
| 133 | int set_irq_chip_data(unsigned int irq, void *data) | 119 | int irq_set_chip_data(unsigned int irq, void *data) |
| 134 | { | 120 | { |
| 135 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 136 | unsigned long flags; | 121 | unsigned long flags; |
| 122 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 137 | 123 | ||
| 138 | if (!desc) { | 124 | if (!desc) |
| 139 | printk(KERN_ERR | ||
| 140 | "Trying to install chip data for IRQ%d\n", irq); | ||
| 141 | return -EINVAL; | ||
| 142 | } | ||
| 143 | |||
| 144 | if (!desc->irq_data.chip) { | ||
| 145 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
| 146 | return -EINVAL; | 125 | return -EINVAL; |
| 147 | } | ||
| 148 | |||
| 149 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 150 | desc->irq_data.chip_data = data; | 126 | desc->irq_data.chip_data = data; |
| 151 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 127 | irq_put_desc_unlock(desc, flags); |
| 152 | |||
| 153 | return 0; | 128 | return 0; |
| 154 | } | 129 | } |
| 155 | EXPORT_SYMBOL(set_irq_chip_data); | 130 | EXPORT_SYMBOL(irq_set_chip_data); |
| 156 | 131 | ||
| 157 | struct irq_data *irq_get_irq_data(unsigned int irq) | 132 | struct irq_data *irq_get_irq_data(unsigned int irq) |
| 158 | { | 133 | { |
| @@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq) | |||
| 162 | } | 137 | } |
| 163 | EXPORT_SYMBOL_GPL(irq_get_irq_data); | 138 | EXPORT_SYMBOL_GPL(irq_get_irq_data); |
| 164 | 139 | ||
| 165 | /** | 140 | static void irq_state_clr_disabled(struct irq_desc *desc) |
| 166 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq | ||
| 167 | * | ||
| 168 | * @irq: Interrupt number | ||
| 169 | * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag | ||
| 170 | * | ||
| 171 | * The IRQ_NESTED_THREAD flag indicates that on | ||
| 172 | * request_threaded_irq() no separate interrupt thread should be | ||
| 173 | * created for the irq as the handler are called nested in the | ||
| 174 | * context of a demultiplexing interrupt handler thread. | ||
| 175 | */ | ||
| 176 | void set_irq_nested_thread(unsigned int irq, int nest) | ||
| 177 | { | ||
| 178 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 179 | unsigned long flags; | ||
| 180 | |||
| 181 | if (!desc) | ||
| 182 | return; | ||
| 183 | |||
| 184 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 185 | if (nest) | ||
| 186 | desc->status |= IRQ_NESTED_THREAD; | ||
| 187 | else | ||
| 188 | desc->status &= ~IRQ_NESTED_THREAD; | ||
| 189 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 190 | } | ||
| 191 | EXPORT_SYMBOL_GPL(set_irq_nested_thread); | ||
| 192 | |||
| 193 | /* | ||
| 194 | * default enable function | ||
| 195 | */ | ||
| 196 | static void default_enable(struct irq_data *data) | ||
| 197 | { | 141 | { |
| 198 | struct irq_desc *desc = irq_data_to_desc(data); | 142 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); |
| 199 | |||
| 200 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
| 201 | desc->status &= ~IRQ_MASKED; | ||
| 202 | } | 143 | } |
| 203 | 144 | ||
| 204 | /* | 145 | static void irq_state_set_disabled(struct irq_desc *desc) |
| 205 | * default disable function | ||
| 206 | */ | ||
| 207 | static void default_disable(struct irq_data *data) | ||
| 208 | { | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 212 | * default startup function | ||
| 213 | */ | ||
| 214 | static unsigned int default_startup(struct irq_data *data) | ||
| 215 | { | 146 | { |
| 216 | struct irq_desc *desc = irq_data_to_desc(data); | 147 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); |
| 217 | |||
| 218 | desc->irq_data.chip->irq_enable(data); | ||
| 219 | return 0; | ||
| 220 | } | 148 | } |
| 221 | 149 | ||
| 222 | /* | 150 | static void irq_state_clr_masked(struct irq_desc *desc) |
| 223 | * default shutdown function | ||
| 224 | */ | ||
| 225 | static void default_shutdown(struct irq_data *data) | ||
| 226 | { | 151 | { |
| 227 | struct irq_desc *desc = irq_data_to_desc(data); | 152 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); |
| 228 | |||
| 229 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
| 230 | desc->status |= IRQ_MASKED; | ||
| 231 | } | 153 | } |
| 232 | 154 | ||
| 233 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | 155 | static void irq_state_set_masked(struct irq_desc *desc) |
| 234 | /* Temporary migration helpers */ | ||
| 235 | static void compat_irq_mask(struct irq_data *data) | ||
| 236 | { | 156 | { |
| 237 | data->chip->mask(data->irq); | 157 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); |
| 238 | } | 158 | } |
| 239 | 159 | ||
| 240 | static void compat_irq_unmask(struct irq_data *data) | 160 | int irq_startup(struct irq_desc *desc) |
| 241 | { | 161 | { |
| 242 | data->chip->unmask(data->irq); | 162 | irq_state_clr_disabled(desc); |
| 243 | } | 163 | desc->depth = 0; |
| 244 | 164 | ||
| 245 | static void compat_irq_ack(struct irq_data *data) | 165 | if (desc->irq_data.chip->irq_startup) { |
| 246 | { | 166 | int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); |
| 247 | data->chip->ack(data->irq); | 167 | irq_state_clr_masked(desc); |
| 248 | } | 168 | return ret; |
| 249 | 169 | } | |
| 250 | static void compat_irq_mask_ack(struct irq_data *data) | ||
| 251 | { | ||
| 252 | data->chip->mask_ack(data->irq); | ||
| 253 | } | ||
| 254 | |||
| 255 | static void compat_irq_eoi(struct irq_data *data) | ||
| 256 | { | ||
| 257 | data->chip->eoi(data->irq); | ||
| 258 | } | ||
| 259 | |||
| 260 | static void compat_irq_enable(struct irq_data *data) | ||
| 261 | { | ||
| 262 | data->chip->enable(data->irq); | ||
| 263 | } | ||
| 264 | |||
| 265 | static void compat_irq_disable(struct irq_data *data) | ||
| 266 | { | ||
| 267 | data->chip->disable(data->irq); | ||
| 268 | } | ||
| 269 | |||
| 270 | static void compat_irq_shutdown(struct irq_data *data) | ||
| 271 | { | ||
| 272 | data->chip->shutdown(data->irq); | ||
| 273 | } | ||
| 274 | |||
| 275 | static unsigned int compat_irq_startup(struct irq_data *data) | ||
| 276 | { | ||
| 277 | return data->chip->startup(data->irq); | ||
| 278 | } | ||
| 279 | |||
| 280 | static int compat_irq_set_affinity(struct irq_data *data, | ||
| 281 | const struct cpumask *dest, bool force) | ||
| 282 | { | ||
| 283 | return data->chip->set_affinity(data->irq, dest); | ||
| 284 | } | ||
| 285 | |||
| 286 | static int compat_irq_set_type(struct irq_data *data, unsigned int type) | ||
| 287 | { | ||
| 288 | return data->chip->set_type(data->irq, type); | ||
| 289 | } | ||
| 290 | |||
| 291 | static int compat_irq_set_wake(struct irq_data *data, unsigned int on) | ||
| 292 | { | ||
| 293 | return data->chip->set_wake(data->irq, on); | ||
| 294 | } | ||
| 295 | 170 | ||
| 296 | static int compat_irq_retrigger(struct irq_data *data) | 171 | irq_enable(desc); |
| 297 | { | 172 | return 0; |
| 298 | return data->chip->retrigger(data->irq); | ||
| 299 | } | 173 | } |
| 300 | 174 | ||
| 301 | static void compat_bus_lock(struct irq_data *data) | 175 | void irq_shutdown(struct irq_desc *desc) |
| 302 | { | 176 | { |
| 303 | data->chip->bus_lock(data->irq); | 177 | irq_state_set_disabled(desc); |
| 178 | desc->depth = 1; | ||
| 179 | if (desc->irq_data.chip->irq_shutdown) | ||
| 180 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | ||
| 181 | if (desc->irq_data.chip->irq_disable) | ||
| 182 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
| 183 | else | ||
| 184 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
| 185 | irq_state_set_masked(desc); | ||
| 304 | } | 186 | } |
| 305 | 187 | ||
| 306 | static void compat_bus_sync_unlock(struct irq_data *data) | 188 | void irq_enable(struct irq_desc *desc) |
| 307 | { | 189 | { |
| 308 | data->chip->bus_sync_unlock(data->irq); | 190 | irq_state_clr_disabled(desc); |
| 191 | if (desc->irq_data.chip->irq_enable) | ||
| 192 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
| 193 | else | ||
| 194 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
| 195 | irq_state_clr_masked(desc); | ||
| 309 | } | 196 | } |
| 310 | #endif | ||
| 311 | 197 | ||
| 312 | /* | 198 | void irq_disable(struct irq_desc *desc) |
| 313 | * Fixup enable/disable function pointers | ||
| 314 | */ | ||
| 315 | void irq_chip_set_defaults(struct irq_chip *chip) | ||
| 316 | { | 199 | { |
| 317 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | 200 | irq_state_set_disabled(desc); |
| 318 | /* | 201 | if (desc->irq_data.chip->irq_disable) { |
| 319 | * Compat fixup functions need to be before we set the | 202 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
| 320 | * defaults for enable/disable/startup/shutdown | 203 | irq_state_set_masked(desc); |
| 321 | */ | 204 | } |
| 322 | if (chip->enable) | ||
| 323 | chip->irq_enable = compat_irq_enable; | ||
| 324 | if (chip->disable) | ||
| 325 | chip->irq_disable = compat_irq_disable; | ||
| 326 | if (chip->shutdown) | ||
| 327 | chip->irq_shutdown = compat_irq_shutdown; | ||
| 328 | if (chip->startup) | ||
| 329 | chip->irq_startup = compat_irq_startup; | ||
| 330 | #endif | ||
| 331 | /* | ||
| 332 | * The real defaults | ||
| 333 | */ | ||
| 334 | if (!chip->irq_enable) | ||
| 335 | chip->irq_enable = default_enable; | ||
| 336 | if (!chip->irq_disable) | ||
| 337 | chip->irq_disable = default_disable; | ||
| 338 | if (!chip->irq_startup) | ||
| 339 | chip->irq_startup = default_startup; | ||
| 340 | /* | ||
| 341 | * We use chip->irq_disable, when the user provided its own. When | ||
| 342 | * we have default_disable set for chip->irq_disable, then we need | ||
| 343 | * to use default_shutdown, otherwise the irq line is not | ||
| 344 | * disabled on free_irq(): | ||
| 345 | */ | ||
| 346 | if (!chip->irq_shutdown) | ||
| 347 | chip->irq_shutdown = chip->irq_disable != default_disable ? | ||
| 348 | chip->irq_disable : default_shutdown; | ||
| 349 | |||
| 350 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
| 351 | if (!chip->end) | ||
| 352 | chip->end = dummy_irq_chip.end; | ||
| 353 | |||
| 354 | /* | ||
| 355 | * Now fix up the remaining compat handlers | ||
| 356 | */ | ||
| 357 | if (chip->bus_lock) | ||
| 358 | chip->irq_bus_lock = compat_bus_lock; | ||
| 359 | if (chip->bus_sync_unlock) | ||
| 360 | chip->irq_bus_sync_unlock = compat_bus_sync_unlock; | ||
| 361 | if (chip->mask) | ||
| 362 | chip->irq_mask = compat_irq_mask; | ||
| 363 | if (chip->unmask) | ||
| 364 | chip->irq_unmask = compat_irq_unmask; | ||
| 365 | if (chip->ack) | ||
| 366 | chip->irq_ack = compat_irq_ack; | ||
| 367 | if (chip->mask_ack) | ||
| 368 | chip->irq_mask_ack = compat_irq_mask_ack; | ||
| 369 | if (chip->eoi) | ||
| 370 | chip->irq_eoi = compat_irq_eoi; | ||
| 371 | if (chip->set_affinity) | ||
| 372 | chip->irq_set_affinity = compat_irq_set_affinity; | ||
| 373 | if (chip->set_type) | ||
| 374 | chip->irq_set_type = compat_irq_set_type; | ||
| 375 | if (chip->set_wake) | ||
| 376 | chip->irq_set_wake = compat_irq_set_wake; | ||
| 377 | if (chip->retrigger) | ||
| 378 | chip->irq_retrigger = compat_irq_retrigger; | ||
| 379 | #endif | ||
| 380 | } | 205 | } |
| 381 | 206 | ||
| 382 | static inline void mask_ack_irq(struct irq_desc *desc) | 207 | static inline void mask_ack_irq(struct irq_desc *desc) |
| @@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc) | |||
| 388 | if (desc->irq_data.chip->irq_ack) | 213 | if (desc->irq_data.chip->irq_ack) |
| 389 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 214 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
| 390 | } | 215 | } |
| 391 | desc->status |= IRQ_MASKED; | 216 | irq_state_set_masked(desc); |
| 392 | } | 217 | } |
| 393 | 218 | ||
| 394 | static inline void mask_irq(struct irq_desc *desc) | 219 | void mask_irq(struct irq_desc *desc) |
| 395 | { | 220 | { |
| 396 | if (desc->irq_data.chip->irq_mask) { | 221 | if (desc->irq_data.chip->irq_mask) { |
| 397 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 222 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
| 398 | desc->status |= IRQ_MASKED; | 223 | irq_state_set_masked(desc); |
| 399 | } | 224 | } |
| 400 | } | 225 | } |
| 401 | 226 | ||
| 402 | static inline void unmask_irq(struct irq_desc *desc) | 227 | void unmask_irq(struct irq_desc *desc) |
| 403 | { | 228 | { |
| 404 | if (desc->irq_data.chip->irq_unmask) { | 229 | if (desc->irq_data.chip->irq_unmask) { |
| 405 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 230 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
| 406 | desc->status &= ~IRQ_MASKED; | 231 | irq_state_clr_masked(desc); |
| 407 | } | 232 | } |
| 408 | } | 233 | } |
| 409 | 234 | ||
| @@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq) | |||
| 428 | kstat_incr_irqs_this_cpu(irq, desc); | 253 | kstat_incr_irqs_this_cpu(irq, desc); |
| 429 | 254 | ||
| 430 | action = desc->action; | 255 | action = desc->action; |
| 431 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | 256 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) |
| 432 | goto out_unlock; | 257 | goto out_unlock; |
| 433 | 258 | ||
| 434 | desc->status |= IRQ_INPROGRESS; | 259 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
| 435 | raw_spin_unlock_irq(&desc->lock); | 260 | raw_spin_unlock_irq(&desc->lock); |
| 436 | 261 | ||
| 437 | action_ret = action->thread_fn(action->irq, action->dev_id); | 262 | action_ret = action->thread_fn(action->irq, action->dev_id); |
| @@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq) | |||
| 439 | note_interrupt(irq, desc, action_ret); | 264 | note_interrupt(irq, desc, action_ret); |
| 440 | 265 | ||
| 441 | raw_spin_lock_irq(&desc->lock); | 266 | raw_spin_lock_irq(&desc->lock); |
| 442 | desc->status &= ~IRQ_INPROGRESS; | 267 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
| 443 | 268 | ||
| 444 | out_unlock: | 269 | out_unlock: |
| 445 | raw_spin_unlock_irq(&desc->lock); | 270 | raw_spin_unlock_irq(&desc->lock); |
| 446 | } | 271 | } |
| 447 | EXPORT_SYMBOL_GPL(handle_nested_irq); | 272 | EXPORT_SYMBOL_GPL(handle_nested_irq); |
| 448 | 273 | ||
| 274 | static bool irq_check_poll(struct irq_desc *desc) | ||
| 275 | { | ||
| 276 | if (!(desc->istate & IRQS_POLL_INPROGRESS)) | ||
| 277 | return false; | ||
| 278 | return irq_wait_for_poll(desc); | ||
| 279 | } | ||
| 280 | |||
| 449 | /** | 281 | /** |
| 450 | * handle_simple_irq - Simple and software-decoded IRQs. | 282 | * handle_simple_irq - Simple and software-decoded IRQs. |
| 451 | * @irq: the interrupt number | 283 | * @irq: the interrupt number |
| @@ -461,29 +293,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); | |||
| 461 | void | 293 | void |
| 462 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) | 294 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) |
| 463 | { | 295 | { |
| 464 | struct irqaction *action; | ||
| 465 | irqreturn_t action_ret; | ||
| 466 | |||
| 467 | raw_spin_lock(&desc->lock); | 296 | raw_spin_lock(&desc->lock); |
| 468 | 297 | ||
| 469 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 298 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
| 470 | goto out_unlock; | 299 | if (!irq_check_poll(desc)) |
| 471 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 300 | goto out_unlock; |
| 301 | |||
| 302 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
| 472 | kstat_incr_irqs_this_cpu(irq, desc); | 303 | kstat_incr_irqs_this_cpu(irq, desc); |
| 473 | 304 | ||
| 474 | action = desc->action; | 305 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) |
| 475 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
| 476 | goto out_unlock; | 306 | goto out_unlock; |
| 477 | 307 | ||
| 478 | desc->status |= IRQ_INPROGRESS; | 308 | handle_irq_event(desc); |
| 479 | raw_spin_unlock(&desc->lock); | ||
| 480 | 309 | ||
| 481 | action_ret = handle_IRQ_event(irq, action); | ||
| 482 | if (!noirqdebug) | ||
| 483 | note_interrupt(irq, desc, action_ret); | ||
| 484 | |||
| 485 | raw_spin_lock(&desc->lock); | ||
| 486 | desc->status &= ~IRQ_INPROGRESS; | ||
| 487 | out_unlock: | 310 | out_unlock: |
| 488 | raw_spin_unlock(&desc->lock); | 311 | raw_spin_unlock(&desc->lock); |
| 489 | } | 312 | } |
| @@ -501,42 +324,42 @@ out_unlock: | |||
| 501 | void | 324 | void |
| 502 | handle_level_irq(unsigned int irq, struct irq_desc *desc) | 325 | handle_level_irq(unsigned int irq, struct irq_desc *desc) |
| 503 | { | 326 | { |
| 504 | struct irqaction *action; | ||
| 505 | irqreturn_t action_ret; | ||
| 506 | |||
| 507 | raw_spin_lock(&desc->lock); | 327 | raw_spin_lock(&desc->lock); |
| 508 | mask_ack_irq(desc); | 328 | mask_ack_irq(desc); |
| 509 | 329 | ||
| 510 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 330 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
| 511 | goto out_unlock; | 331 | if (!irq_check_poll(desc)) |
| 512 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 332 | goto out_unlock; |
| 333 | |||
| 334 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
| 513 | kstat_incr_irqs_this_cpu(irq, desc); | 335 | kstat_incr_irqs_this_cpu(irq, desc); |
| 514 | 336 | ||
| 515 | /* | 337 | /* |
| 516 | * If its disabled or no action available | 338 | * If its disabled or no action available |
| 517 | * keep it masked and get out of here | 339 | * keep it masked and get out of here |
| 518 | */ | 340 | */ |
| 519 | action = desc->action; | 341 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) |
| 520 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
| 521 | goto out_unlock; | 342 | goto out_unlock; |
| 522 | 343 | ||
| 523 | desc->status |= IRQ_INPROGRESS; | 344 | handle_irq_event(desc); |
| 524 | raw_spin_unlock(&desc->lock); | ||
| 525 | |||
| 526 | action_ret = handle_IRQ_event(irq, action); | ||
| 527 | if (!noirqdebug) | ||
| 528 | note_interrupt(irq, desc, action_ret); | ||
| 529 | |||
| 530 | raw_spin_lock(&desc->lock); | ||
| 531 | desc->status &= ~IRQ_INPROGRESS; | ||
| 532 | 345 | ||
| 533 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) | 346 | if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) |
| 534 | unmask_irq(desc); | 347 | unmask_irq(desc); |
| 535 | out_unlock: | 348 | out_unlock: |
| 536 | raw_spin_unlock(&desc->lock); | 349 | raw_spin_unlock(&desc->lock); |
| 537 | } | 350 | } |
| 538 | EXPORT_SYMBOL_GPL(handle_level_irq); | 351 | EXPORT_SYMBOL_GPL(handle_level_irq); |
| 539 | 352 | ||
| 353 | #ifdef CONFIG_IRQ_PREFLOW_FASTEOI | ||
| 354 | static inline void preflow_handler(struct irq_desc *desc) | ||
| 355 | { | ||
| 356 | if (desc->preflow_handler) | ||
| 357 | desc->preflow_handler(&desc->irq_data); | ||
| 358 | } | ||
| 359 | #else | ||
| 360 | static inline void preflow_handler(struct irq_desc *desc) { } | ||
| 361 | #endif | ||
| 362 | |||
| 540 | /** | 363 | /** |
| 541 | * handle_fasteoi_irq - irq handler for transparent controllers | 364 | * handle_fasteoi_irq - irq handler for transparent controllers |
| 542 | * @irq: the interrupt number | 365 | * @irq: the interrupt number |
| @@ -550,42 +373,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq); | |||
| 550 | void | 373 | void |
| 551 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | 374 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) |
| 552 | { | 375 | { |
| 553 | struct irqaction *action; | ||
| 554 | irqreturn_t action_ret; | ||
| 555 | |||
| 556 | raw_spin_lock(&desc->lock); | 376 | raw_spin_lock(&desc->lock); |
| 557 | 377 | ||
| 558 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 378 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
| 559 | goto out; | 379 | if (!irq_check_poll(desc)) |
| 380 | goto out; | ||
| 560 | 381 | ||
| 561 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 382 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 562 | kstat_incr_irqs_this_cpu(irq, desc); | 383 | kstat_incr_irqs_this_cpu(irq, desc); |
| 563 | 384 | ||
| 564 | /* | 385 | /* |
| 565 | * If its disabled or no action available | 386 | * If its disabled or no action available |
| 566 | * then mask it and get out of here: | 387 | * then mask it and get out of here: |
| 567 | */ | 388 | */ |
| 568 | action = desc->action; | 389 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
| 569 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 390 | desc->istate |= IRQS_PENDING; |
| 570 | desc->status |= IRQ_PENDING; | ||
| 571 | mask_irq(desc); | 391 | mask_irq(desc); |
| 572 | goto out; | 392 | goto out; |
| 573 | } | 393 | } |
| 574 | 394 | ||
| 575 | desc->status |= IRQ_INPROGRESS; | 395 | if (desc->istate & IRQS_ONESHOT) |
| 576 | desc->status &= ~IRQ_PENDING; | 396 | mask_irq(desc); |
| 577 | raw_spin_unlock(&desc->lock); | ||
| 578 | 397 | ||
| 579 | action_ret = handle_IRQ_event(irq, action); | 398 | preflow_handler(desc); |
| 580 | if (!noirqdebug) | 399 | handle_irq_event(desc); |
| 581 | note_interrupt(irq, desc, action_ret); | ||
| 582 | 400 | ||
| 583 | raw_spin_lock(&desc->lock); | 401 | out_eoi: |
| 584 | desc->status &= ~IRQ_INPROGRESS; | ||
| 585 | out: | ||
| 586 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 402 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
| 587 | 403 | out_unlock: | |
| 588 | raw_spin_unlock(&desc->lock); | 404 | raw_spin_unlock(&desc->lock); |
| 405 | return; | ||
| 406 | out: | ||
| 407 | if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) | ||
| 408 | goto out_eoi; | ||
| 409 | goto out_unlock; | ||
| 589 | } | 410 | } |
| 590 | 411 | ||
| 591 | /** | 412 | /** |
| @@ -594,7 +415,7 @@ out: | |||
| 594 | * @desc: the interrupt description structure for this irq | 415 | * @desc: the interrupt description structure for this irq |
| 595 | * | 416 | * |
| 596 | * Interrupt occures on the falling and/or rising edge of a hardware | 417 | * Interrupt occures on the falling and/or rising edge of a hardware |
| 597 | * signal. The occurence is latched into the irq controller hardware | 418 | * signal. The occurrence is latched into the irq controller hardware |
| 598 | * and must be acked in order to be reenabled. After the ack another | 419 | * and must be acked in order to be reenabled. After the ack another |
| 599 | * interrupt can happen on the same source even before the first one | 420 | * interrupt can happen on the same source even before the first one |
| 600 | * is handled by the associated event handler. If this happens it | 421 | * is handled by the associated event handler. If this happens it |
| @@ -609,32 +430,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
| 609 | { | 430 | { |
| 610 | raw_spin_lock(&desc->lock); | 431 | raw_spin_lock(&desc->lock); |
| 611 | 432 | ||
| 612 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 433 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 613 | |||
| 614 | /* | 434 | /* |
| 615 | * If we're currently running this IRQ, or its disabled, | 435 | * If we're currently running this IRQ, or its disabled, |
| 616 | * we shouldn't process the IRQ. Mark it pending, handle | 436 | * we shouldn't process the IRQ. Mark it pending, handle |
| 617 | * the necessary masking and go out | 437 | * the necessary masking and go out |
| 618 | */ | 438 | */ |
| 619 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | 439 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || |
| 620 | !desc->action)) { | 440 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { |
| 621 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 441 | if (!irq_check_poll(desc)) { |
| 622 | mask_ack_irq(desc); | 442 | desc->istate |= IRQS_PENDING; |
| 623 | goto out_unlock; | 443 | mask_ack_irq(desc); |
| 444 | goto out_unlock; | ||
| 445 | } | ||
| 624 | } | 446 | } |
| 625 | kstat_incr_irqs_this_cpu(irq, desc); | 447 | kstat_incr_irqs_this_cpu(irq, desc); |
| 626 | 448 | ||
| 627 | /* Start handling the irq */ | 449 | /* Start handling the irq */ |
| 628 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 450 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
| 629 | 451 | ||
| 630 | /* Mark the IRQ currently in progress.*/ | ||
| 631 | desc->status |= IRQ_INPROGRESS; | ||
| 632 | |||
| 633 | do { | 452 | do { |
| 634 | struct irqaction *action = desc->action; | 453 | if (unlikely(!desc->action)) { |
| 635 | irqreturn_t action_ret; | ||
| 636 | |||
| 637 | if (unlikely(!action)) { | ||
| 638 | mask_irq(desc); | 454 | mask_irq(desc); |
| 639 | goto out_unlock; | 455 | goto out_unlock; |
| 640 | } | 456 | } |
| @@ -644,26 +460,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
| 644 | * one, we could have masked the irq. | 460 | * one, we could have masked the irq. |
| 645 | * Renable it, if it was not disabled in meantime. | 461 | * Renable it, if it was not disabled in meantime. |
| 646 | */ | 462 | */ |
| 647 | if (unlikely((desc->status & | 463 | if (unlikely(desc->istate & IRQS_PENDING)) { |
| 648 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 464 | if (!irqd_irq_disabled(&desc->irq_data) && |
| 649 | (IRQ_PENDING | IRQ_MASKED))) { | 465 | irqd_irq_masked(&desc->irq_data)) |
| 650 | unmask_irq(desc); | 466 | unmask_irq(desc); |
| 651 | } | 467 | } |
| 652 | 468 | ||
| 653 | desc->status &= ~IRQ_PENDING; | 469 | handle_irq_event(desc); |
| 654 | raw_spin_unlock(&desc->lock); | ||
| 655 | action_ret = handle_IRQ_event(irq, action); | ||
| 656 | if (!noirqdebug) | ||
| 657 | note_interrupt(irq, desc, action_ret); | ||
| 658 | raw_spin_lock(&desc->lock); | ||
| 659 | 470 | ||
| 660 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | 471 | } while ((desc->istate & IRQS_PENDING) && |
| 472 | !irqd_irq_disabled(&desc->irq_data)); | ||
| 661 | 473 | ||
| 662 | desc->status &= ~IRQ_INPROGRESS; | ||
| 663 | out_unlock: | 474 | out_unlock: |
| 664 | raw_spin_unlock(&desc->lock); | 475 | raw_spin_unlock(&desc->lock); |
| 665 | } | 476 | } |
| 666 | 477 | ||
| 478 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | ||
| 479 | /** | ||
| 480 | * handle_edge_eoi_irq - edge eoi type IRQ handler | ||
| 481 | * @irq: the interrupt number | ||
| 482 | * @desc: the interrupt description structure for this irq | ||
| 483 | * | ||
| 484 | * Similar as the above handle_edge_irq, but using eoi and w/o the | ||
| 485 | * mask/unmask logic. | ||
| 486 | */ | ||
| 487 | void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) | ||
| 488 | { | ||
| 489 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
| 490 | |||
| 491 | raw_spin_lock(&desc->lock); | ||
| 492 | |||
| 493 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
| 494 | /* | ||
| 495 | * If we're currently running this IRQ, or its disabled, | ||
| 496 | * we shouldn't process the IRQ. Mark it pending, handle | ||
| 497 | * the necessary masking and go out | ||
| 498 | */ | ||
| 499 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | ||
| 500 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | ||
| 501 | if (!irq_check_poll(desc)) { | ||
| 502 | desc->istate |= IRQS_PENDING; | ||
| 503 | goto out_eoi; | ||
| 504 | } | ||
| 505 | } | ||
| 506 | kstat_incr_irqs_this_cpu(irq, desc); | ||
| 507 | |||
| 508 | do { | ||
| 509 | if (unlikely(!desc->action)) | ||
| 510 | goto out_eoi; | ||
| 511 | |||
| 512 | handle_irq_event(desc); | ||
| 513 | |||
| 514 | } while ((desc->istate & IRQS_PENDING) && | ||
| 515 | !irqd_irq_disabled(&desc->irq_data)); | ||
| 516 | |||
| 517 | out_eoi: | ||
| 518 | chip->irq_eoi(&desc->irq_data); | ||
| 519 | raw_spin_unlock(&desc->lock); | ||
| 520 | } | ||
| 521 | #endif | ||
| 522 | |||
| 667 | /** | 523 | /** |
| 668 | * handle_percpu_irq - Per CPU local irq handler | 524 | * handle_percpu_irq - Per CPU local irq handler |
| 669 | * @irq: the interrupt number | 525 | * @irq: the interrupt number |
| @@ -674,103 +530,145 @@ out_unlock: | |||
| 674 | void | 530 | void |
| 675 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | 531 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) |
| 676 | { | 532 | { |
| 677 | irqreturn_t action_ret; | 533 | struct irq_chip *chip = irq_desc_get_chip(desc); |
| 678 | 534 | ||
| 679 | kstat_incr_irqs_this_cpu(irq, desc); | 535 | kstat_incr_irqs_this_cpu(irq, desc); |
| 680 | 536 | ||
| 681 | if (desc->irq_data.chip->irq_ack) | 537 | if (chip->irq_ack) |
| 682 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 538 | chip->irq_ack(&desc->irq_data); |
| 683 | 539 | ||
| 684 | action_ret = handle_IRQ_event(irq, desc->action); | 540 | handle_irq_event_percpu(desc, desc->action); |
| 685 | if (!noirqdebug) | ||
| 686 | note_interrupt(irq, desc, action_ret); | ||
| 687 | 541 | ||
| 688 | if (desc->irq_data.chip->irq_eoi) | 542 | if (chip->irq_eoi) |
| 689 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 543 | chip->irq_eoi(&desc->irq_data); |
| 690 | } | 544 | } |
| 691 | 545 | ||
| 692 | void | 546 | void |
| 693 | __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | 547 | __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, |
| 694 | const char *name) | 548 | const char *name) |
| 695 | { | 549 | { |
| 696 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 697 | unsigned long flags; | 550 | unsigned long flags; |
| 551 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
| 698 | 552 | ||
| 699 | if (!desc) { | 553 | if (!desc) |
| 700 | printk(KERN_ERR | ||
| 701 | "Trying to install type control for IRQ%d\n", irq); | ||
| 702 | return; | 554 | return; |
| 703 | } | ||
| 704 | 555 | ||
| 705 | if (!handle) | 556 | if (!handle) { |
| 706 | handle = handle_bad_irq; | 557 | handle = handle_bad_irq; |
| 707 | else if (desc->irq_data.chip == &no_irq_chip) { | 558 | } else { |
| 708 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 559 | if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) |
| 709 | "for IRQ%d\n", is_chained ? "chained " : "", irq); | 560 | goto out; |
| 710 | /* | ||
| 711 | * Some ARM implementations install a handler for really dumb | ||
| 712 | * interrupt hardware without setting an irq_chip. This worked | ||
| 713 | * with the ARM no_irq_chip but the check in setup_irq would | ||
| 714 | * prevent us to setup the interrupt at all. Switch it to | ||
| 715 | * dummy_irq_chip for easy transition. | ||
| 716 | */ | ||
| 717 | desc->irq_data.chip = &dummy_irq_chip; | ||
| 718 | } | 561 | } |
| 719 | 562 | ||
| 720 | chip_bus_lock(desc); | ||
| 721 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 722 | |||
| 723 | /* Uninstall? */ | 563 | /* Uninstall? */ |
| 724 | if (handle == handle_bad_irq) { | 564 | if (handle == handle_bad_irq) { |
| 725 | if (desc->irq_data.chip != &no_irq_chip) | 565 | if (desc->irq_data.chip != &no_irq_chip) |
| 726 | mask_ack_irq(desc); | 566 | mask_ack_irq(desc); |
| 727 | desc->status |= IRQ_DISABLED; | 567 | irq_state_set_disabled(desc); |
| 728 | desc->depth = 1; | 568 | desc->depth = 1; |
| 729 | } | 569 | } |
| 730 | desc->handle_irq = handle; | 570 | desc->handle_irq = handle; |
| 731 | desc->name = name; | 571 | desc->name = name; |
| 732 | 572 | ||
| 733 | if (handle != handle_bad_irq && is_chained) { | 573 | if (handle != handle_bad_irq && is_chained) { |
| 734 | desc->status &= ~IRQ_DISABLED; | 574 | irq_settings_set_noprobe(desc); |
| 735 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | 575 | irq_settings_set_norequest(desc); |
| 736 | desc->depth = 0; | 576 | irq_startup(desc); |
| 737 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
| 738 | } | 577 | } |
| 739 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 578 | out: |
| 740 | chip_bus_sync_unlock(desc); | 579 | irq_put_desc_busunlock(desc, flags); |
| 741 | } | ||
| 742 | EXPORT_SYMBOL_GPL(__set_irq_handler); | ||
| 743 | |||
| 744 | void | ||
| 745 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | ||
| 746 | irq_flow_handler_t handle) | ||
| 747 | { | ||
| 748 | set_irq_chip(irq, chip); | ||
| 749 | __set_irq_handler(irq, handle, 0, NULL); | ||
| 750 | } | 580 | } |
| 581 | EXPORT_SYMBOL_GPL(__irq_set_handler); | ||
| 751 | 582 | ||
| 752 | void | 583 | void |
| 753 | set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | 584 | irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, |
| 754 | irq_flow_handler_t handle, const char *name) | 585 | irq_flow_handler_t handle, const char *name) |
| 755 | { | 586 | { |
| 756 | set_irq_chip(irq, chip); | 587 | irq_set_chip(irq, chip); |
| 757 | __set_irq_handler(irq, handle, 0, name); | 588 | __irq_set_handler(irq, handle, 0, name); |
| 758 | } | 589 | } |
| 759 | 590 | ||
| 760 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | 591 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
| 761 | { | 592 | { |
| 762 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 763 | unsigned long flags; | 593 | unsigned long flags; |
| 594 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 764 | 595 | ||
| 765 | if (!desc) | 596 | if (!desc) |
| 766 | return; | 597 | return; |
| 598 | irq_settings_clr_and_set(desc, clr, set); | ||
| 599 | |||
| 600 | irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | | ||
| 601 | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); | ||
| 602 | if (irq_settings_has_no_balance_set(desc)) | ||
| 603 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
| 604 | if (irq_settings_is_per_cpu(desc)) | ||
| 605 | irqd_set(&desc->irq_data, IRQD_PER_CPU); | ||
| 606 | if (irq_settings_can_move_pcntxt(desc)) | ||
| 607 | irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); | ||
| 608 | if (irq_settings_is_level(desc)) | ||
| 609 | irqd_set(&desc->irq_data, IRQD_LEVEL); | ||
| 610 | |||
| 611 | irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); | ||
| 612 | |||
| 613 | irq_put_desc_unlock(desc, flags); | ||
| 614 | } | ||
| 615 | |||
| 616 | /** | ||
| 617 | * irq_cpu_online - Invoke all irq_cpu_online functions. | ||
| 618 | * | ||
| 619 | * Iterate through all irqs and invoke the chip.irq_cpu_online() | ||
| 620 | * for each. | ||
| 621 | */ | ||
| 622 | void irq_cpu_online(void) | ||
| 623 | { | ||
| 624 | struct irq_desc *desc; | ||
| 625 | struct irq_chip *chip; | ||
| 626 | unsigned long flags; | ||
| 627 | unsigned int irq; | ||
| 628 | |||
| 629 | for_each_active_irq(irq) { | ||
| 630 | desc = irq_to_desc(irq); | ||
| 631 | if (!desc) | ||
| 632 | continue; | ||
| 767 | 633 | ||
| 768 | /* Sanitize flags */ | 634 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 769 | set &= IRQF_MODIFY_MASK; | ||
| 770 | clr &= IRQF_MODIFY_MASK; | ||
| 771 | 635 | ||
| 772 | raw_spin_lock_irqsave(&desc->lock, flags); | 636 | chip = irq_data_get_irq_chip(&desc->irq_data); |
| 773 | desc->status &= ~clr; | 637 | if (chip && chip->irq_cpu_online && |
| 774 | desc->status |= set; | 638 | (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || |
| 775 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 639 | !irqd_irq_disabled(&desc->irq_data))) |
| 640 | chip->irq_cpu_online(&desc->irq_data); | ||
| 641 | |||
| 642 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 643 | } | ||
| 644 | } | ||
| 645 | |||
| 646 | /** | ||
| 647 | * irq_cpu_offline - Invoke all irq_cpu_offline functions. | ||
| 648 | * | ||
| 649 | * Iterate through all irqs and invoke the chip.irq_cpu_offline() | ||
| 650 | * for each. | ||
| 651 | */ | ||
| 652 | void irq_cpu_offline(void) | ||
| 653 | { | ||
| 654 | struct irq_desc *desc; | ||
| 655 | struct irq_chip *chip; | ||
| 656 | unsigned long flags; | ||
| 657 | unsigned int irq; | ||
| 658 | |||
| 659 | for_each_active_irq(irq) { | ||
| 660 | desc = irq_to_desc(irq); | ||
| 661 | if (!desc) | ||
| 662 | continue; | ||
| 663 | |||
| 664 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 665 | |||
| 666 | chip = irq_data_get_irq_chip(&desc->irq_data); | ||
| 667 | if (chip && chip->irq_cpu_offline && | ||
| 668 | (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || | ||
| 669 | !irqd_irq_disabled(&desc->irq_data))) | ||
| 670 | chip->irq_cpu_offline(&desc->irq_data); | ||
| 671 | |||
| 672 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 673 | } | ||
| 776 | } | 674 | } |
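Taken together, the chip.c changes rename the public setters to the irq_set_*() form, drop the deprecated compat chip methods, move the masked/disabled state into irq_data flags, and add the irq_cpu_online()/irq_cpu_offline() iterators. A short sketch of the renamed calls as an irqchip driver would issue them; MY_IRQ, my_chip and my_priv are placeholders, not names from the patch.

        irq_set_chip_data(MY_IRQ, my_priv);
        irq_set_chip_and_handler_name(MY_IRQ, &my_chip, handle_level_irq, "level");
        irq_set_irq_type(MY_IRQ, IRQ_TYPE_LEVEL_HIGH);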
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..306cba37e9a5 --- /dev/null +++ b/kernel/irq/debug.h | |||
| @@ -0,0 +1,44 @@ | |||
| 1 | /* | ||
| 2 | * Debugging printout: | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/kallsyms.h> | ||
| 6 | |||
| 7 | #define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) | ||
| 8 | #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) | ||
| 9 | /* FIXME */ | ||
| 10 | #define PD(f) do { } while (0) | ||
| 11 | |||
| 12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 13 | { | ||
| 14 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
| 15 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
| 16 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
| 17 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
| 18 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | ||
| 19 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | ||
| 20 | printk("->action(): %p\n", desc->action); | ||
| 21 | if (desc->action) { | ||
| 22 | printk("->action->handler(): %p, ", desc->action->handler); | ||
| 23 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
| 24 | } | ||
| 25 | |||
| 26 | P(IRQ_LEVEL); | ||
| 27 | P(IRQ_PER_CPU); | ||
| 28 | P(IRQ_NOPROBE); | ||
| 29 | P(IRQ_NOREQUEST); | ||
| 30 | P(IRQ_NOAUTOEN); | ||
| 31 | |||
| 32 | PS(IRQS_AUTODETECT); | ||
| 33 | PS(IRQS_REPLAY); | ||
| 34 | PS(IRQS_WAITING); | ||
| 35 | PS(IRQS_PENDING); | ||
| 36 | |||
| 37 | PD(IRQS_INPROGRESS); | ||
| 38 | PD(IRQS_DISABLED); | ||
| 39 | PD(IRQS_MASKED); | ||
| 40 | } | ||
| 41 | |||
| 42 | #undef P | ||
| 43 | #undef PS | ||
| 44 | #undef PD | ||
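debug.h is a private helper for the files under kernel/irq/: P() reports the public IRQ_* settings (now read through status_use_accessors), PS() reports the internal IRQS_* state, and PD() stays a stub until the remaining bits are converted. A hypothetical call site, with irq standing in for whichever interrupt is being inspected:

        struct irq_desc *desc = irq_to_desc(irq);

        if (desc)
                print_irq_desc(irq, desc);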
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc5474947e..b5fcd96c7102 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c | |||
| @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) | |||
| 31 | return 0; | 31 | return 0; |
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
| 35 | static void compat_noop(unsigned int irq) { } | ||
| 36 | #define END_INIT .end = compat_noop | ||
| 37 | #else | ||
| 38 | #define END_INIT | ||
| 39 | #endif | ||
| 40 | |||
| 41 | /* | 34 | /* |
| 42 | * Generic no controller implementation | 35 | * Generic no controller implementation |
| 43 | */ | 36 | */ |
| @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { | |||
| 48 | .irq_enable = noop, | 41 | .irq_enable = noop, |
| 49 | .irq_disable = noop, | 42 | .irq_disable = noop, |
| 50 | .irq_ack = ack_bad, | 43 | .irq_ack = ack_bad, |
| 51 | END_INIT | ||
| 52 | }; | 44 | }; |
| 53 | 45 | ||
| 54 | /* | 46 | /* |
| @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { | |||
| 64 | .irq_ack = noop, | 56 | .irq_ack = noop, |
| 65 | .irq_mask = noop, | 57 | .irq_mask = noop, |
| 66 | .irq_unmask = noop, | 58 | .irq_unmask = noop, |
| 67 | END_INIT | ||
| 68 | }; | 59 | }; |
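With the deprecated __do_IRQ path gone, the compat .end initializer (END_INIT) can be dropped from both no-op chips. dummy_irq_chip remains the usual choice for interrupts that are demultiplexed in software and only need harmless no-op chip callbacks; a hedged sketch of that pattern, where child_irq and my_priv are placeholders:

        irq_set_chip_and_handler_name(child_irq, &dummy_irq_chip,
                                      handle_simple_irq, "demux");
        irq_set_handler_data(child_irq, my_priv);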
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..90cb55f6d7eb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) | |||
| 51 | "but no thread function available.", irq, action->name); | 51 | "but no thread function available.", irq, action->name); |
| 52 | } | 52 | } |
| 53 | 53 | ||
| 54 | /** | 54 | static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) |
| 55 | * handle_IRQ_event - irq action chain handler | 55 | { |
| 56 | * @irq: the interrupt number | 56 | /* |
| 57 | * @action: the interrupt action chain for this irq | 57 | * Wake up the handler thread for this action. In case the |
| 58 | * | 58 | * thread crashed and was killed we just pretend that we |
| 59 | * Handles the action chain of an irq event | 59 | * handled the interrupt. The hardirq handler has disabled the |
| 60 | */ | 60 | * device interrupt, so no irq storm is lurking. If the |
| 61 | irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | 61 | * RUNTHREAD bit is already set, nothing to do. |
| 62 | */ | ||
| 63 | if (test_bit(IRQTF_DIED, &action->thread_flags) || | ||
| 64 | test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
| 65 | return; | ||
| 66 | |||
| 67 | /* | ||
| 68 | * It's safe to OR the mask lockless here. We have only two | ||
| 69 | * places which write to threads_oneshot: This code and the | ||
| 70 | * irq thread. | ||
| 71 | * | ||
| 72 | * This code is the hard irq context and can never run on two | ||
| 73 | * cpus in parallel. If it ever does we have more serious | ||
| 74 | * problems than this bitmask. | ||
| 75 | * | ||
| 76 | * The irq threads of this irq which clear their "running" bit | ||
| 77 | * in threads_oneshot are serialized via desc->lock against | ||
| 78 | * each other and they are serialized against this code by | ||
| 79 | * IRQS_INPROGRESS. | ||
| 80 | * | ||
| 81 | * Hard irq handler: | ||
| 82 | * | ||
| 83 | * spin_lock(desc->lock); | ||
| 84 | * desc->state |= IRQS_INPROGRESS; | ||
| 85 | * spin_unlock(desc->lock); | ||
| 86 | * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
| 87 | * desc->threads_oneshot |= mask; | ||
| 88 | * spin_lock(desc->lock); | ||
| 89 | * desc->state &= ~IRQS_INPROGRESS; | ||
| 90 | * spin_unlock(desc->lock); | ||
| 91 | * | ||
| 92 | * irq thread: | ||
| 93 | * | ||
| 94 | * again: | ||
| 95 | * spin_lock(desc->lock); | ||
| 96 | * if (desc->state & IRQS_INPROGRESS) { | ||
| 97 | * spin_unlock(desc->lock); | ||
| 98 | * while(desc->state & IRQS_INPROGRESS) | ||
| 99 | * cpu_relax(); | ||
| 100 | * goto again; | ||
| 101 | * } | ||
| 102 | * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
| 103 | * desc->threads_oneshot &= ~mask; | ||
| 104 | * spin_unlock(desc->lock); | ||
| 105 | * | ||
| 106 | * So either the thread waits for us to clear IRQS_INPROGRESS | ||
| 107 | * or we are waiting in the flow handler for desc->lock to be | ||
| 108 | * released before we reach this point. The thread also checks | ||
| 109 | * IRQTF_RUNTHREAD under desc->lock. If set it leaves | ||
| 110 | * threads_oneshot untouched and runs the thread another time. | ||
| 111 | */ | ||
| 112 | desc->threads_oneshot |= action->thread_mask; | ||
| 113 | wake_up_process(action->thread); | ||
| 114 | } | ||
| 115 | |||
| 116 | irqreturn_t | ||
| 117 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | ||
| 62 | { | 118 | { |
| 63 | irqreturn_t ret, retval = IRQ_NONE; | 119 | irqreturn_t retval = IRQ_NONE; |
| 64 | unsigned int status = 0; | 120 | unsigned int random = 0, irq = desc->irq_data.irq; |
| 65 | 121 | ||
| 66 | do { | 122 | do { |
| 123 | irqreturn_t res; | ||
| 124 | |||
| 67 | trace_irq_handler_entry(irq, action); | 125 | trace_irq_handler_entry(irq, action); |
| 68 | ret = action->handler(irq, action->dev_id); | 126 | res = action->handler(irq, action->dev_id); |
| 69 | trace_irq_handler_exit(irq, action, ret); | 127 | trace_irq_handler_exit(irq, action, res); |
| 128 | |||
| 129 | if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", | ||
| 130 | irq, action->handler)) | ||
| 131 | local_irq_disable(); | ||
| 70 | 132 | ||
| 71 | switch (ret) { | 133 | switch (res) { |
| 72 | case IRQ_WAKE_THREAD: | 134 | case IRQ_WAKE_THREAD: |
| 73 | /* | 135 | /* |
| 74 | * Set result to handled so the spurious check | 136 | * Set result to handled so the spurious check |
| 75 | * does not trigger. | 137 | * does not trigger. |
| 76 | */ | 138 | */ |
| 77 | ret = IRQ_HANDLED; | 139 | res = IRQ_HANDLED; |
| 78 | 140 | ||
| 79 | /* | 141 | /* |
| 80 | * Catch drivers which return WAKE_THREAD but | 142 | * Catch drivers which return WAKE_THREAD but |
| @@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | |||
| 85 | break; | 147 | break; |
| 86 | } | 148 | } |
| 87 | 149 | ||
| 88 | /* | 150 | irq_wake_thread(desc, action); |
| 89 | * Wake up the handler thread for this | ||
| 90 | * action. In case the thread crashed and was | ||
| 91 | * killed we just pretend that we handled the | ||
| 92 | * interrupt. The hardirq handler above has | ||
| 93 | * disabled the device interrupt, so no irq | ||
| 94 | * storm is lurking. | ||
| 95 | */ | ||
| 96 | if (likely(!test_bit(IRQTF_DIED, | ||
| 97 | &action->thread_flags))) { | ||
| 98 | set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
| 99 | wake_up_process(action->thread); | ||
| 100 | } | ||
| 101 | 151 | ||
| 102 | /* Fall through to add to randomness */ | 152 | /* Fall through to add to randomness */ |
| 103 | case IRQ_HANDLED: | 153 | case IRQ_HANDLED: |
| 104 | status |= action->flags; | 154 | random |= action->flags; |
| 105 | break; | 155 | break; |
| 106 | 156 | ||
| 107 | default: | 157 | default: |
| 108 | break; | 158 | break; |
| 109 | } | 159 | } |
| 110 | 160 | ||
| 111 | retval |= ret; | 161 | retval |= res; |
| 112 | action = action->next; | 162 | action = action->next; |
| 113 | } while (action); | 163 | } while (action); |
| 114 | 164 | ||
| 115 | if (status & IRQF_SAMPLE_RANDOM) | 165 | if (random & IRQF_SAMPLE_RANDOM) |
| 116 | add_interrupt_randomness(irq); | 166 | add_interrupt_randomness(irq); |
| 117 | local_irq_disable(); | ||
| 118 | 167 | ||
| 168 | if (!noirqdebug) | ||
| 169 | note_interrupt(irq, desc, retval); | ||
| 119 | return retval; | 170 | return retval; |
| 120 | } | 171 | } |
| 172 | |||
| 173 | irqreturn_t handle_irq_event(struct irq_desc *desc) | ||
| 174 | { | ||
| 175 | struct irqaction *action = desc->action; | ||
| 176 | irqreturn_t ret; | ||
| 177 | |||
| 178 | desc->istate &= ~IRQS_PENDING; | ||
| 179 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
| 180 | raw_spin_unlock(&desc->lock); | ||
| 181 | |||
| 182 | ret = handle_irq_event_percpu(desc, action); | ||
| 183 | |||
| 184 | raw_spin_lock(&desc->lock); | ||
| 185 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
| 186 | return ret; | ||
| 187 | } | ||
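Illustration (not part of the patch; the demo_* names are made up, request_threaded_irq() and IRQF_ONESHOT are the existing driver API): a driver-side primary/threaded handler pair that exercises the IRQ_WAKE_THREAD path which handle_irq_event_percpu() above routes through irq_wake_thread().

        #include <linux/interrupt.h>

        /* Hypothetical driver code, for illustration only. */

        /* Primary handler: hard irq context, interrupts disabled. */
        static irqreturn_t demo_hardirq(int irq, void *dev_id)
        {
                /* Ack/mask the device here, then defer the real work. */
                return IRQ_WAKE_THREAD; /* routed through irq_wake_thread() above */
        }

        /* Threaded handler: runs from irq_thread() in process context. */
        static irqreturn_t demo_threadfn(int irq, void *dev_id)
        {
                /* Sleeping bus accesses are fine here. */
                return IRQ_HANDLED;
        }

        static int demo_request(unsigned int irq, void *dev)
        {
                /* IRQF_ONESHOT keeps the line masked until the thread finishes,
                 * which irq_finalize_oneshot() in manage.c later unwinds. */
                return request_threaded_irq(irq, demo_hardirq, demo_threadfn,
                                            IRQF_ONESHOT, "demo", dev);
        }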
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7e085a..6546431447d7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -1,27 +1,87 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * IRQ subsystem internal functions and variables: | 2 | * IRQ subsystem internal functions and variables: |
| 3 | * | ||
| 4 | * Do not ever include this file from anything else than | ||
| 5 | * kernel/irq/. Do not even think about using any information outside | ||
| 6 | * of this file for your non core code. | ||
| 3 | */ | 7 | */ |
| 4 | #include <linux/irqdesc.h> | 8 | #include <linux/irqdesc.h> |
| 5 | 9 | ||
| 10 | #ifdef CONFIG_SPARSE_IRQ | ||
| 11 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | ||
| 12 | #else | ||
| 13 | # define IRQ_BITMAP_BITS NR_IRQS | ||
| 14 | #endif | ||
| 15 | |||
| 16 | #define istate core_internal_state__do_not_mess_with_it | ||
| 17 | |||
| 6 | extern int noirqdebug; | 18 | extern int noirqdebug; |
| 7 | 19 | ||
| 8 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | 20 | /* |
| 21 | * Bits used by threaded handlers: | ||
| 22 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run | ||
| 23 | * IRQTF_DIED - handler thread died | ||
| 24 | * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed | ||
| 25 | * IRQTF_AFFINITY - irq thread is requested to adjust affinity | ||
| 26 | * IRQTF_FORCED_THREAD - irq action is force threaded | ||
| 27 | */ | ||
| 28 | enum { | ||
| 29 | IRQTF_RUNTHREAD, | ||
| 30 | IRQTF_DIED, | ||
| 31 | IRQTF_WARNED, | ||
| 32 | IRQTF_AFFINITY, | ||
| 33 | IRQTF_FORCED_THREAD, | ||
| 34 | }; | ||
| 9 | 35 | ||
| 10 | /* Set default functions for irq_chip structures: */ | 36 | /* |
| 11 | extern void irq_chip_set_defaults(struct irq_chip *chip); | 37 | * Bit masks for desc->state |
| 38 | * | ||
| 39 | * IRQS_AUTODETECT - autodetection in progress | ||
| 40 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt | ||
| 41 | * detection | ||
| 42 | * IRQS_POLL_INPROGRESS - polling in progress | ||
| 43 | * IRQS_ONESHOT - irq is not unmasked in primary handler | ||
| 44 | * IRQS_REPLAY - irq is replayed | ||
| 45 | * IRQS_WAITING - irq is waiting | ||
| 46 | * IRQS_PENDING - irq is pending and replayed later | ||
| 47 | * IRQS_SUSPENDED - irq is suspended | ||
| 48 | */ | ||
| 49 | enum { | ||
| 50 | IRQS_AUTODETECT = 0x00000001, | ||
| 51 | IRQS_SPURIOUS_DISABLED = 0x00000002, | ||
| 52 | IRQS_POLL_INPROGRESS = 0x00000008, | ||
| 53 | IRQS_ONESHOT = 0x00000020, | ||
| 54 | IRQS_REPLAY = 0x00000040, | ||
| 55 | IRQS_WAITING = 0x00000080, | ||
| 56 | IRQS_PENDING = 0x00000200, | ||
| 57 | IRQS_SUSPENDED = 0x00000800, | ||
| 58 | }; | ||
| 59 | |||
| 60 | #include "debug.h" | ||
| 61 | #include "settings.h" | ||
| 12 | 62 | ||
| 13 | /* Set default handler: */ | 63 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) |
| 14 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | ||
| 15 | 64 | ||
| 16 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 65 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
| 17 | unsigned long flags); | 66 | unsigned long flags); |
| 18 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 67 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
| 19 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 68 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
| 20 | 69 | ||
| 70 | extern int irq_startup(struct irq_desc *desc); | ||
| 71 | extern void irq_shutdown(struct irq_desc *desc); | ||
| 72 | extern void irq_enable(struct irq_desc *desc); | ||
| 73 | extern void irq_disable(struct irq_desc *desc); | ||
| 74 | extern void mask_irq(struct irq_desc *desc); | ||
| 75 | extern void unmask_irq(struct irq_desc *desc); | ||
| 76 | |||
| 21 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 77 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
| 22 | 78 | ||
| 79 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); | ||
| 80 | irqreturn_t handle_irq_event(struct irq_desc *desc); | ||
| 81 | |||
| 23 | /* Resending of interrupts :*/ | 82 | /* Resending of interrupts :*/ |
| 24 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); | 83 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); |
| 84 | bool irq_wait_for_poll(struct irq_desc *desc); | ||
| 25 | 85 | ||
| 26 | #ifdef CONFIG_PROC_FS | 86 | #ifdef CONFIG_PROC_FS |
| 27 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); | 87 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); |
| @@ -37,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
| 37 | struct irqaction *action) { } | 97 | struct irqaction *action) { } |
| 38 | #endif | 98 | #endif |
| 39 | 99 | ||
| 40 | extern int irq_select_affinity_usr(unsigned int irq); | 100 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); |
| 41 | 101 | ||
| 42 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
| 43 | 103 | ||
| 44 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
| 45 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) | ||
| 46 | { | ||
| 47 | if (desc->irq_data.chip && desc->irq_data.chip->end) | ||
| 48 | desc->irq_data.chip->end(irq); | ||
| 49 | } | ||
| 50 | #else | ||
| 51 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } | ||
| 52 | #endif | ||
| 53 | |||
| 54 | /* Inline functions for support of irq chips on slow busses */ | 104 | /* Inline functions for support of irq chips on slow busses */ |
| 55 | static inline void chip_bus_lock(struct irq_desc *desc) | 105 | static inline void chip_bus_lock(struct irq_desc *desc) |
| 56 | { | 106 | { |
| @@ -64,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) | |||
| 64 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); | 114 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); |
| 65 | } | 115 | } |
| 66 | 116 | ||
| 117 | struct irq_desc * | ||
| 118 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); | ||
| 119 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); | ||
| 120 | |||
| 121 | static inline struct irq_desc * | ||
| 122 | irq_get_desc_buslock(unsigned int irq, unsigned long *flags) | ||
| 123 | { | ||
| 124 | return __irq_get_desc_lock(irq, flags, true); | ||
| 125 | } | ||
| 126 | |||
| 127 | static inline void | ||
| 128 | irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) | ||
| 129 | { | ||
| 130 | __irq_put_desc_unlock(desc, flags, true); | ||
| 131 | } | ||
| 132 | |||
| 133 | static inline struct irq_desc * | ||
| 134 | irq_get_desc_lock(unsigned int irq, unsigned long *flags) | ||
| 135 | { | ||
| 136 | return __irq_get_desc_lock(irq, flags, false); | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline void | ||
| 140 | irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) | ||
| 141 | { | ||
| 142 | __irq_put_desc_unlock(desc, flags, false); | ||
| 143 | } | ||
| 144 | |||
| 67 | /* | 145 | /* |
| 68 | * Debugging printout: | 146 | * Manipulation functions for irq_data.state |
| 69 | */ | 147 | */ |
| 148 | static inline void irqd_set_move_pending(struct irq_data *d) | ||
| 149 | { | ||
| 150 | d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; | ||
| 151 | } | ||
| 70 | 152 | ||
| 71 | #include <linux/kallsyms.h> | 153 | static inline void irqd_clr_move_pending(struct irq_data *d) |
| 72 | 154 | { | |
| 73 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | 155 | d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; |
| 156 | } | ||
| 74 | 157 | ||
| 75 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 158 | static inline void irqd_clear(struct irq_data *d, unsigned int mask) |
| 76 | { | 159 | { |
| 77 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | 160 | d->state_use_accessors &= ~mask; |
| 78 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
| 79 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
| 80 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
| 81 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | ||
| 82 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | ||
| 83 | printk("->action(): %p\n", desc->action); | ||
| 84 | if (desc->action) { | ||
| 85 | printk("->action->handler(): %p, ", desc->action->handler); | ||
| 86 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
| 87 | } | ||
| 88 | |||
| 89 | P(IRQ_INPROGRESS); | ||
| 90 | P(IRQ_DISABLED); | ||
| 91 | P(IRQ_PENDING); | ||
| 92 | P(IRQ_REPLAY); | ||
| 93 | P(IRQ_AUTODETECT); | ||
| 94 | P(IRQ_WAITING); | ||
| 95 | P(IRQ_LEVEL); | ||
| 96 | P(IRQ_MASKED); | ||
| 97 | #ifdef CONFIG_IRQ_PER_CPU | ||
| 98 | P(IRQ_PER_CPU); | ||
| 99 | #endif | ||
| 100 | P(IRQ_NOPROBE); | ||
| 101 | P(IRQ_NOREQUEST); | ||
| 102 | P(IRQ_NOAUTOEN); | ||
| 103 | } | 161 | } |
| 104 | 162 | ||
| 105 | #undef P | 163 | static inline void irqd_set(struct irq_data *d, unsigned int mask) |
| 164 | { | ||
| 165 | d->state_use_accessors |= mask; | ||
| 166 | } | ||
| 106 | 167 | ||
| 168 | static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | ||
| 169 | { | ||
| 170 | return d->state_use_accessors & mask; | ||
| 171 | } | ||
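Illustration (not part of the patch; demo_set_wakeup_state() is a hypothetical helper that would only be legal inside kernel/irq/, per the warning at the top of this header): the irqd_set()/irqd_clear()/irqd_has_set() accessors defined above are meant to be used on irq_data.state under desc->lock, as the manage.c hunks below do for IRQD_WAKEUP_STATE and IRQD_AFFINITY_SET.

        /* Hypothetical kernel/irq-internal helper, shown only to illustrate the
         * accessor contract: IRQD_* bits are flipped while holding desc->lock. */
        static void demo_set_wakeup_state(struct irq_desc *desc, bool on)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&desc->lock, flags);
                if (on)
                        irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
                else
                        irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }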
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..2c039c9b9383 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | |||
| 79 | desc->irq_data.chip_data = NULL; | 79 | desc->irq_data.chip_data = NULL; |
| 80 | desc->irq_data.handler_data = NULL; | 80 | desc->irq_data.handler_data = NULL; |
| 81 | desc->irq_data.msi_desc = NULL; | 81 | desc->irq_data.msi_desc = NULL; |
| 82 | desc->status = IRQ_DEFAULT_INIT_FLAGS; | 82 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); |
| 83 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | ||
| 83 | desc->handle_irq = handle_bad_irq; | 84 | desc->handle_irq = handle_bad_irq; |
| 84 | desc->depth = 1; | 85 | desc->depth = 1; |
| 85 | desc->irq_count = 0; | 86 | desc->irq_count = 0; |
| @@ -94,7 +95,7 @@ int nr_irqs = NR_IRQS; | |||
| 94 | EXPORT_SYMBOL_GPL(nr_irqs); | 95 | EXPORT_SYMBOL_GPL(nr_irqs); |
| 95 | 96 | ||
| 96 | static DEFINE_MUTEX(sparse_irq_lock); | 97 | static DEFINE_MUTEX(sparse_irq_lock); |
| 97 | static DECLARE_BITMAP(allocated_irqs, NR_IRQS); | 98 | static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); |
| 98 | 99 | ||
| 99 | #ifdef CONFIG_SPARSE_IRQ | 100 | #ifdef CONFIG_SPARSE_IRQ |
| 100 | 101 | ||
| @@ -197,13 +198,12 @@ err: | |||
| 197 | return -ENOMEM; | 198 | return -ENOMEM; |
| 198 | } | 199 | } |
| 199 | 200 | ||
| 200 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | 201 | static int irq_expand_nr_irqs(unsigned int nr) |
| 201 | { | 202 | { |
| 202 | int res = irq_alloc_descs(irq, irq, 1, node); | 203 | if (nr > IRQ_BITMAP_BITS) |
| 203 | 204 | return -ENOMEM; | |
| 204 | if (res == -EEXIST || res == irq) | 205 | nr_irqs = nr; |
| 205 | return irq_to_desc(irq); | 206 | return 0; |
| 206 | return NULL; | ||
| 207 | } | 207 | } |
| 208 | 208 | ||
| 209 | int __init early_irq_init(void) | 209 | int __init early_irq_init(void) |
| @@ -217,6 +217,15 @@ int __init early_irq_init(void) | |||
| 217 | initcnt = arch_probe_nr_irqs(); | 217 | initcnt = arch_probe_nr_irqs(); |
| 218 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | 218 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); |
| 219 | 219 | ||
| 220 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) | ||
| 221 | nr_irqs = IRQ_BITMAP_BITS; | ||
| 222 | |||
| 223 | if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) | ||
| 224 | initcnt = IRQ_BITMAP_BITS; | ||
| 225 | |||
| 226 | if (initcnt > nr_irqs) | ||
| 227 | nr_irqs = initcnt; | ||
| 228 | |||
| 220 | for (i = 0; i < initcnt; i++) { | 229 | for (i = 0; i < initcnt; i++) { |
| 221 | desc = alloc_desc(i, node); | 230 | desc = alloc_desc(i, node); |
| 222 | set_bit(i, allocated_irqs); | 231 | set_bit(i, allocated_irqs); |
| @@ -229,7 +238,6 @@ int __init early_irq_init(void) | |||
| 229 | 238 | ||
| 230 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | 239 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { |
| 231 | [0 ... NR_IRQS-1] = { | 240 | [0 ... NR_IRQS-1] = { |
| 232 | .status = IRQ_DEFAULT_INIT_FLAGS, | ||
| 233 | .handle_irq = handle_bad_irq, | 241 | .handle_irq = handle_bad_irq, |
| 234 | .depth = 1, | 242 | .depth = 1, |
| 235 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | 243 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), |
| @@ -251,8 +259,8 @@ int __init early_irq_init(void) | |||
| 251 | for (i = 0; i < count; i++) { | 259 | for (i = 0; i < count; i++) { |
| 252 | desc[i].irq_data.irq = i; | 260 | desc[i].irq_data.irq = i; |
| 253 | desc[i].irq_data.chip = &no_irq_chip; | 261 | desc[i].irq_data.chip = &no_irq_chip; |
| 254 | /* TODO : do this allocation on-demand ... */ | ||
| 255 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | 262 | desc[i].kstat_irqs = alloc_percpu(unsigned int); |
| 263 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | ||
| 256 | alloc_masks(desc + i, GFP_KERNEL, node); | 264 | alloc_masks(desc + i, GFP_KERNEL, node); |
| 257 | desc_smp_init(desc + i, node); | 265 | desc_smp_init(desc + i, node); |
| 258 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 266 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| @@ -265,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
| 265 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | 273 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; |
| 266 | } | 274 | } |
| 267 | 275 | ||
| 268 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) | ||
| 269 | { | ||
| 270 | return irq_to_desc(irq); | ||
| 271 | } | ||
| 272 | |||
| 273 | static void free_desc(unsigned int irq) | 276 | static void free_desc(unsigned int irq) |
| 274 | { | 277 | { |
| 275 | dynamic_irq_cleanup(irq); | 278 | dynamic_irq_cleanup(irq); |
| @@ -277,24 +280,14 @@ static void free_desc(unsigned int irq) | |||
| 277 | 280 | ||
| 278 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | 281 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) |
| 279 | { | 282 | { |
| 280 | #if defined(CONFIG_KSTAT_IRQS_ONDEMAND) | ||
| 281 | struct irq_desc *desc; | ||
| 282 | unsigned int i; | ||
| 283 | |||
| 284 | for (i = 0; i < cnt; i++) { | ||
| 285 | desc = irq_to_desc(start + i); | ||
| 286 | if (desc && !desc->kstat_irqs) { | ||
| 287 | unsigned int __percpu *stats = alloc_percpu(unsigned int); | ||
| 288 | |||
| 289 | if (!stats) | ||
| 290 | return -1; | ||
| 291 | if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) | ||
| 292 | free_percpu(stats); | ||
| 293 | } | ||
| 294 | } | ||
| 295 | #endif | ||
| 296 | return start; | 283 | return start; |
| 297 | } | 284 | } |
| 285 | |||
| 286 | static int irq_expand_nr_irqs(unsigned int nr) | ||
| 287 | { | ||
| 288 | return -ENOMEM; | ||
| 289 | } | ||
| 290 | |||
| 298 | #endif /* !CONFIG_SPARSE_IRQ */ | 291 | #endif /* !CONFIG_SPARSE_IRQ */ |
| 299 | 292 | ||
| 300 | /* Dynamic interrupt handling */ | 293 | /* Dynamic interrupt handling */ |
| @@ -338,14 +331,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | |||
| 338 | 331 | ||
| 339 | mutex_lock(&sparse_irq_lock); | 332 | mutex_lock(&sparse_irq_lock); |
| 340 | 333 | ||
| 341 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | 334 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, |
| 335 | from, cnt, 0); | ||
| 342 | ret = -EEXIST; | 336 | ret = -EEXIST; |
| 343 | if (irq >= 0 && start != irq) | 337 | if (irq >= 0 && start != irq) |
| 344 | goto err; | 338 | goto err; |
| 345 | 339 | ||
| 346 | ret = -ENOMEM; | 340 | if (start + cnt > nr_irqs) { |
| 347 | if (start >= nr_irqs) | 341 | ret = irq_expand_nr_irqs(start + cnt); |
| 348 | goto err; | 342 | if (ret) |
| 343 | goto err; | ||
| 344 | } | ||
| 349 | 345 | ||
| 350 | bitmap_set(allocated_irqs, start, cnt); | 346 | bitmap_set(allocated_irqs, start, cnt); |
| 351 | mutex_unlock(&sparse_irq_lock); | 347 | mutex_unlock(&sparse_irq_lock); |
| @@ -392,6 +388,26 @@ unsigned int irq_get_next_irq(unsigned int offset) | |||
| 392 | return find_next_bit(allocated_irqs, nr_irqs, offset); | 388 | return find_next_bit(allocated_irqs, nr_irqs, offset); |
| 393 | } | 389 | } |
| 394 | 390 | ||
| 391 | struct irq_desc * | ||
| 392 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) | ||
| 393 | { | ||
| 394 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 395 | |||
| 396 | if (desc) { | ||
| 397 | if (bus) | ||
| 398 | chip_bus_lock(desc); | ||
| 399 | raw_spin_lock_irqsave(&desc->lock, *flags); | ||
| 400 | } | ||
| 401 | return desc; | ||
| 402 | } | ||
| 403 | |||
| 404 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) | ||
| 405 | { | ||
| 406 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 407 | if (bus) | ||
| 408 | chip_bus_sync_unlock(desc); | ||
| 409 | } | ||
| 410 | |||
| 395 | /** | 411 | /** |
| 396 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 412 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq |
| 397 | * @irq: irq number to initialize | 413 | * @irq: irq number to initialize |
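Illustration (not part of the patch; demo_poke_irq() is a made-up helper): the __irq_get_desc_lock()/__irq_put_desc_unlock() pair added above is consumed through the irq_get_desc_buslock()/irq_put_desc_busunlock() wrappers, exactly the pattern the manage.c hunks below switch enable_irq() and irq_set_irq_wake() to.

        /* Hypothetical core helper, mirroring how enable_irq() uses the
         * lookup-and-lock wrappers in the manage.c changes below. */
        static int demo_poke_irq(unsigned int irq)
        {
                unsigned long flags;
                struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

                if (!desc)
                        return -EINVAL;
                /* desc->lock held and chip bus lock taken: safe to touch desc. */
                irq_put_desc_busunlock(desc, flags);
                return 0;
        }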
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..07c1611f3899 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -17,6 +17,17 @@ | |||
| 17 | 17 | ||
| 18 | #include "internals.h" | 18 | #include "internals.h" |
| 19 | 19 | ||
| 20 | #ifdef CONFIG_IRQ_FORCED_THREADING | ||
| 21 | __read_mostly bool force_irqthreads; | ||
| 22 | |||
| 23 | static int __init setup_forced_irqthreads(char *arg) | ||
| 24 | { | ||
| 25 | force_irqthreads = true; | ||
| 26 | return 0; | ||
| 27 | } | ||
| 28 | early_param("threadirqs", setup_forced_irqthreads); | ||
| 29 | #endif | ||
| 30 | |||
| 20 | /** | 31 | /** |
| 21 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 32 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
| 22 | * @irq: interrupt number to wait for | 33 | * @irq: interrupt number to wait for |
| @@ -30,7 +41,7 @@ | |||
| 30 | void synchronize_irq(unsigned int irq) | 41 | void synchronize_irq(unsigned int irq) |
| 31 | { | 42 | { |
| 32 | struct irq_desc *desc = irq_to_desc(irq); | 43 | struct irq_desc *desc = irq_to_desc(irq); |
| 33 | unsigned int status; | 44 | bool inprogress; |
| 34 | 45 | ||
| 35 | if (!desc) | 46 | if (!desc) |
| 36 | return; | 47 | return; |
| @@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) | |||
| 42 | * Wait until we're out of the critical section. This might | 53 | * Wait until we're out of the critical section. This might |
| 43 | * give the wrong answer due to the lack of memory barriers. | 54 | * give the wrong answer due to the lack of memory barriers. |
| 44 | */ | 55 | */ |
| 45 | while (desc->status & IRQ_INPROGRESS) | 56 | while (irqd_irq_inprogress(&desc->irq_data)) |
| 46 | cpu_relax(); | 57 | cpu_relax(); |
| 47 | 58 | ||
| 48 | /* Ok, that indicated we're done: double-check carefully. */ | 59 | /* Ok, that indicated we're done: double-check carefully. */ |
| 49 | raw_spin_lock_irqsave(&desc->lock, flags); | 60 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 50 | status = desc->status; | 61 | inprogress = irqd_irq_inprogress(&desc->irq_data); |
| 51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 62 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 52 | 63 | ||
| 53 | /* Oops, that failed? */ | 64 | /* Oops, that failed? */ |
| 54 | } while (status & IRQ_INPROGRESS); | 65 | } while (inprogress); |
| 55 | 66 | ||
| 56 | /* | 67 | /* |
| 57 | * We made sure that no hardirq handler is running. Now verify | 68 | * We made sure that no hardirq handler is running. Now verify |
| @@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) | |||
| 73 | { | 84 | { |
| 74 | struct irq_desc *desc = irq_to_desc(irq); | 85 | struct irq_desc *desc = irq_to_desc(irq); |
| 75 | 86 | ||
| 76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || | 87 | if (!desc || !irqd_can_balance(&desc->irq_data) || |
| 77 | !desc->irq_data.chip->irq_set_affinity) | 88 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) |
| 78 | return 0; | 89 | return 0; |
| 79 | 90 | ||
| 80 | return 1; | 91 | return 1; |
| @@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
| 100 | } | 111 | } |
| 101 | } | 112 | } |
| 102 | 113 | ||
| 114 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
| 115 | static inline bool irq_can_move_pcntxt(struct irq_data *data) | ||
| 116 | { | ||
| 117 | return irqd_can_move_in_process_context(data); | ||
| 118 | } | ||
| 119 | static inline bool irq_move_pending(struct irq_data *data) | ||
| 120 | { | ||
| 121 | return irqd_is_setaffinity_pending(data); | ||
| 122 | } | ||
| 123 | static inline void | ||
| 124 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
| 125 | { | ||
| 126 | cpumask_copy(desc->pending_mask, mask); | ||
| 127 | } | ||
| 128 | static inline void | ||
| 129 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
| 130 | { | ||
| 131 | cpumask_copy(mask, desc->pending_mask); | ||
| 132 | } | ||
| 133 | #else | ||
| 134 | static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } | ||
| 135 | static inline bool irq_move_pending(struct irq_data *data) { return false; } | ||
| 136 | static inline void | ||
| 137 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } | ||
| 138 | static inline void | ||
| 139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | ||
| 140 | #endif | ||
| 141 | |||
| 142 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | ||
| 143 | { | ||
| 144 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
| 145 | struct irq_desc *desc = irq_data_to_desc(data); | ||
| 146 | int ret = 0; | ||
| 147 | |||
| 148 | if (!chip || !chip->irq_set_affinity) | ||
| 149 | return -EINVAL; | ||
| 150 | |||
| 151 | if (irq_can_move_pcntxt(data)) { | ||
| 152 | ret = chip->irq_set_affinity(data, mask, false); | ||
| 153 | switch (ret) { | ||
| 154 | case IRQ_SET_MASK_OK: | ||
| 155 | cpumask_copy(data->affinity, mask); | ||
| 156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 157 | irq_set_thread_affinity(desc); | ||
| 158 | ret = 0; | ||
| 159 | } | ||
| 160 | } else { | ||
| 161 | irqd_set_move_pending(data); | ||
| 162 | irq_copy_pending(desc, mask); | ||
| 163 | } | ||
| 164 | |||
| 165 | if (desc->affinity_notify) { | ||
| 166 | kref_get(&desc->affinity_notify->kref); | ||
| 167 | schedule_work(&desc->affinity_notify->work); | ||
| 168 | } | ||
| 169 | irqd_set(data, IRQD_AFFINITY_SET); | ||
| 170 | |||
| 171 | return ret; | ||
| 172 | } | ||
| 173 | |||
| 103 | /** | 174 | /** |
| 104 | * irq_set_affinity - Set the irq affinity of a given irq | 175 | * irq_set_affinity - Set the irq affinity of a given irq |
| 105 | * @irq: Interrupt to set affinity | 176 | * @irq: Interrupt to set affinity |
| 106 | * @cpumask: cpumask | 177 | * @mask: cpumask |
| 107 | * | 178 | * |
| 108 | */ | 179 | */ |
| 109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | 180 | int irq_set_affinity(unsigned int irq, const struct cpumask *mask) |
| 110 | { | 181 | { |
| 111 | struct irq_desc *desc = irq_to_desc(irq); | 182 | struct irq_desc *desc = irq_to_desc(irq); |
| 112 | struct irq_chip *chip = desc->irq_data.chip; | ||
| 113 | unsigned long flags; | 183 | unsigned long flags; |
| 184 | int ret; | ||
| 114 | 185 | ||
| 115 | if (!chip->irq_set_affinity) | 186 | if (!desc) |
| 116 | return -EINVAL; | 187 | return -EINVAL; |
| 117 | 188 | ||
| 118 | raw_spin_lock_irqsave(&desc->lock, flags); | 189 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 119 | 190 | ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); | |
| 120 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
| 121 | if (desc->status & IRQ_MOVE_PCNTXT) { | ||
| 122 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { | ||
| 123 | cpumask_copy(desc->irq_data.affinity, cpumask); | ||
| 124 | irq_set_thread_affinity(desc); | ||
| 125 | } | ||
| 126 | } | ||
| 127 | else { | ||
| 128 | desc->status |= IRQ_MOVE_PENDING; | ||
| 129 | cpumask_copy(desc->pending_mask, cpumask); | ||
| 130 | } | ||
| 131 | #else | ||
| 132 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { | ||
| 133 | cpumask_copy(desc->irq_data.affinity, cpumask); | ||
| 134 | irq_set_thread_affinity(desc); | ||
| 135 | } | ||
| 136 | #endif | ||
| 137 | desc->status |= IRQ_AFFINITY_SET; | ||
| 138 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 191 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 139 | return 0; | 192 | return ret; |
| 140 | } | 193 | } |
| 141 | 194 | ||
| 142 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | 195 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) |
| 143 | { | 196 | { |
| 197 | unsigned long flags; | ||
| 198 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 199 | |||
| 200 | if (!desc) | ||
| 201 | return -EINVAL; | ||
| 202 | desc->affinity_hint = m; | ||
| 203 | irq_put_desc_unlock(desc, flags); | ||
| 204 | return 0; | ||
| 205 | } | ||
| 206 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | ||
| 207 | |||
| 208 | static void irq_affinity_notify(struct work_struct *work) | ||
| 209 | { | ||
| 210 | struct irq_affinity_notify *notify = | ||
| 211 | container_of(work, struct irq_affinity_notify, work); | ||
| 212 | struct irq_desc *desc = irq_to_desc(notify->irq); | ||
| 213 | cpumask_var_t cpumask; | ||
| 214 | unsigned long flags; | ||
| 215 | |||
| 216 | if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) | ||
| 217 | goto out; | ||
| 218 | |||
| 219 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 220 | if (irq_move_pending(&desc->irq_data)) | ||
| 221 | irq_get_pending(cpumask, desc); | ||
| 222 | else | ||
| 223 | cpumask_copy(cpumask, desc->irq_data.affinity); | ||
| 224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 225 | |||
| 226 | notify->notify(notify, cpumask); | ||
| 227 | |||
| 228 | free_cpumask_var(cpumask); | ||
| 229 | out: | ||
| 230 | kref_put(¬ify->kref, notify->release); | ||
| 231 | } | ||
| 232 | |||
| 233 | /** | ||
| 234 | * irq_set_affinity_notifier - control notification of IRQ affinity changes | ||
| 235 | * @irq: Interrupt for which to enable/disable notification | ||
| 236 | * @notify: Context for notification, or %NULL to disable | ||
| 237 | * notification. Function pointers must be initialised; | ||
| 238 | * the other fields will be initialised by this function. | ||
| 239 | * | ||
| 240 | * Must be called in process context. Notification may only be enabled | ||
| 241 | * after the IRQ is allocated and must be disabled before the IRQ is | ||
| 242 | * freed using free_irq(). | ||
| 243 | */ | ||
| 244 | int | ||
| 245 | irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) | ||
| 246 | { | ||
| 144 | struct irq_desc *desc = irq_to_desc(irq); | 247 | struct irq_desc *desc = irq_to_desc(irq); |
| 248 | struct irq_affinity_notify *old_notify; | ||
| 145 | unsigned long flags; | 249 | unsigned long flags; |
| 146 | 250 | ||
| 251 | /* The release function is promised process context */ | ||
| 252 | might_sleep(); | ||
| 253 | |||
| 147 | if (!desc) | 254 | if (!desc) |
| 148 | return -EINVAL; | 255 | return -EINVAL; |
| 149 | 256 | ||
| 257 | /* Complete initialisation of *notify */ | ||
| 258 | if (notify) { | ||
| 259 | notify->irq = irq; | ||
| 260 | kref_init(¬ify->kref); | ||
| 261 | INIT_WORK(¬ify->work, irq_affinity_notify); | ||
| 262 | } | ||
| 263 | |||
| 150 | raw_spin_lock_irqsave(&desc->lock, flags); | 264 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 151 | desc->affinity_hint = m; | 265 | old_notify = desc->affinity_notify; |
| 266 | desc->affinity_notify = notify; | ||
| 152 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 267 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 153 | 268 | ||
| 269 | if (old_notify) | ||
| 270 | kref_put(&old_notify->kref, old_notify->release); | ||
| 271 | |||
| 154 | return 0; | 272 | return 0; |
| 155 | } | 273 | } |
| 156 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | 274 | EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); |
| 157 | 275 | ||
| 158 | #ifndef CONFIG_AUTO_IRQ_AFFINITY | 276 | #ifndef CONFIG_AUTO_IRQ_AFFINITY |
| 159 | /* | 277 | /* |
| 160 | * Generic version of the affinity autoselector. | 278 | * Generic version of the affinity autoselector. |
| 161 | */ | 279 | */ |
| 162 | static int setup_affinity(unsigned int irq, struct irq_desc *desc) | 280 | static int |
| 281 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
| 163 | { | 282 | { |
| 283 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
| 284 | struct cpumask *set = irq_default_affinity; | ||
| 285 | int ret; | ||
| 286 | |||
| 287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | ||
| 164 | if (!irq_can_set_affinity(irq)) | 288 | if (!irq_can_set_affinity(irq)) |
| 165 | return 0; | 289 | return 0; |
| 166 | 290 | ||
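Illustration (not part of the patch; the demo_* names are hypothetical): a consumer of the irq_set_affinity_notifier() interface added above registers a notify/release pair and gets called back in process context whenever the affinity changes.

        #include <linux/interrupt.h>

        static void demo_affinity_changed(struct irq_affinity_notify *notify,
                                          const cpumask_t *mask)
        {
                /* Re-steer per-CPU resources to the CPUs in @mask. */
        }

        static void demo_affinity_release(struct kref *ref)
        {
                /* Last reference dropped; nothing dynamic to free in this sketch. */
        }

        static struct irq_affinity_notify demo_notify = {
                .notify  = demo_affinity_changed,
                .release = demo_affinity_release,
        };

        /* Register after request_irq():   irq_set_affinity_notifier(irq, &demo_notify);
         * Unregister before free_irq():   irq_set_affinity_notifier(irq, NULL); */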
| @@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
| 168 | * Preserve an userspace affinity setup, but make sure that | 292 | * Preserve an userspace affinity setup, but make sure that |
| 169 | * one of the targets is online. | 293 | * one of the targets is online. |
| 170 | */ | 294 | */ |
| 171 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { | 295 | if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { |
| 172 | if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) | 296 | if (cpumask_intersects(desc->irq_data.affinity, |
| 173 | < nr_cpu_ids) | 297 | cpu_online_mask)) |
| 174 | goto set_affinity; | 298 | set = desc->irq_data.affinity; |
| 175 | else | 299 | else |
| 176 | desc->status &= ~IRQ_AFFINITY_SET; | 300 | irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); |
| 177 | } | 301 | } |
| 178 | 302 | ||
| 179 | cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); | 303 | cpumask_and(mask, cpu_online_mask, set); |
| 180 | set_affinity: | 304 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); |
| 181 | desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); | 305 | switch (ret) { |
| 182 | 306 | case IRQ_SET_MASK_OK: | |
| 307 | cpumask_copy(desc->irq_data.affinity, mask); | ||
| 308 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 309 | irq_set_thread_affinity(desc); | ||
| 310 | } | ||
| 183 | return 0; | 311 | return 0; |
| 184 | } | 312 | } |
| 185 | #else | 313 | #else |
| 186 | static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | 314 | static inline int |
| 315 | setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) | ||
| 187 | { | 316 | { |
| 188 | return irq_select_affinity(irq); | 317 | return irq_select_affinity(irq); |
| 189 | } | 318 | } |
| @@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | |||
| 192 | /* | 321 | /* |
| 193 | * Called when affinity is set via /proc/irq | 322 | * Called when affinity is set via /proc/irq |
| 194 | */ | 323 | */ |
| 195 | int irq_select_affinity_usr(unsigned int irq) | 324 | int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) |
| 196 | { | 325 | { |
| 197 | struct irq_desc *desc = irq_to_desc(irq); | 326 | struct irq_desc *desc = irq_to_desc(irq); |
| 198 | unsigned long flags; | 327 | unsigned long flags; |
| 199 | int ret; | 328 | int ret; |
| 200 | 329 | ||
| 201 | raw_spin_lock_irqsave(&desc->lock, flags); | 330 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 202 | ret = setup_affinity(irq, desc); | 331 | ret = setup_affinity(irq, desc, mask); |
| 203 | if (!ret) | ||
| 204 | irq_set_thread_affinity(desc); | ||
| 205 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 332 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 206 | |||
| 207 | return ret; | 333 | return ret; |
| 208 | } | 334 | } |
| 209 | 335 | ||
| 210 | #else | 336 | #else |
| 211 | static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | 337 | static inline int |
| 338 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
| 212 | { | 339 | { |
| 213 | return 0; | 340 | return 0; |
| 214 | } | 341 | } |
| @@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
| 219 | if (suspend) { | 346 | if (suspend) { |
| 220 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) | 347 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
| 221 | return; | 348 | return; |
| 222 | desc->status |= IRQ_SUSPENDED; | 349 | desc->istate |= IRQS_SUSPENDED; |
| 223 | } | 350 | } |
| 224 | 351 | ||
| 225 | if (!desc->depth++) { | 352 | if (!desc->depth++) |
| 226 | desc->status |= IRQ_DISABLED; | 353 | irq_disable(desc); |
| 227 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 354 | } |
| 228 | } | 355 | |
| 356 | static int __disable_irq_nosync(unsigned int irq) | ||
| 357 | { | ||
| 358 | unsigned long flags; | ||
| 359 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
| 360 | |||
| 361 | if (!desc) | ||
| 362 | return -EINVAL; | ||
| 363 | __disable_irq(desc, irq, false); | ||
| 364 | irq_put_desc_busunlock(desc, flags); | ||
| 365 | return 0; | ||
| 229 | } | 366 | } |
| 230 | 367 | ||
| 231 | /** | 368 | /** |
| @@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
| 241 | */ | 378 | */ |
| 242 | void disable_irq_nosync(unsigned int irq) | 379 | void disable_irq_nosync(unsigned int irq) |
| 243 | { | 380 | { |
| 244 | struct irq_desc *desc = irq_to_desc(irq); | 381 | __disable_irq_nosync(irq); |
| 245 | unsigned long flags; | ||
| 246 | |||
| 247 | if (!desc) | ||
| 248 | return; | ||
| 249 | |||
| 250 | chip_bus_lock(desc); | ||
| 251 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 252 | __disable_irq(desc, irq, false); | ||
| 253 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 254 | chip_bus_sync_unlock(desc); | ||
| 255 | } | 382 | } |
| 256 | EXPORT_SYMBOL(disable_irq_nosync); | 383 | EXPORT_SYMBOL(disable_irq_nosync); |
| 257 | 384 | ||
| @@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
| 269 | */ | 396 | */ |
| 270 | void disable_irq(unsigned int irq) | 397 | void disable_irq(unsigned int irq) |
| 271 | { | 398 | { |
| 272 | struct irq_desc *desc = irq_to_desc(irq); | 399 | if (!__disable_irq_nosync(irq)) |
| 273 | |||
| 274 | if (!desc) | ||
| 275 | return; | ||
| 276 | |||
| 277 | disable_irq_nosync(irq); | ||
| 278 | if (desc->action) | ||
| 279 | synchronize_irq(irq); | 400 | synchronize_irq(irq); |
| 280 | } | 401 | } |
| 281 | EXPORT_SYMBOL(disable_irq); | 402 | EXPORT_SYMBOL(disable_irq); |
| 282 | 403 | ||
| 283 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | 404 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) |
| 284 | { | 405 | { |
| 285 | if (resume) | 406 | if (resume) { |
| 286 | desc->status &= ~IRQ_SUSPENDED; | 407 | if (!(desc->istate & IRQS_SUSPENDED)) { |
| 408 | if (!desc->action) | ||
| 409 | return; | ||
| 410 | if (!(desc->action->flags & IRQF_FORCE_RESUME)) | ||
| 411 | return; | ||
| 412 | /* Pretend that it got disabled ! */ | ||
| 413 | desc->depth++; | ||
| 414 | } | ||
| 415 | desc->istate &= ~IRQS_SUSPENDED; | ||
| 416 | } | ||
| 287 | 417 | ||
| 288 | switch (desc->depth) { | 418 | switch (desc->depth) { |
| 289 | case 0: | 419 | case 0: |
| @@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
| 291 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | 421 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); |
| 292 | break; | 422 | break; |
| 293 | case 1: { | 423 | case 1: { |
| 294 | unsigned int status = desc->status & ~IRQ_DISABLED; | 424 | if (desc->istate & IRQS_SUSPENDED) |
| 295 | |||
| 296 | if (desc->status & IRQ_SUSPENDED) | ||
| 297 | goto err_out; | 425 | goto err_out; |
| 298 | /* Prevent probing on this irq: */ | 426 | /* Prevent probing on this irq: */ |
| 299 | desc->status = status | IRQ_NOPROBE; | 427 | irq_settings_set_noprobe(desc); |
| 428 | irq_enable(desc); | ||
| 300 | check_irq_resend(desc, irq); | 429 | check_irq_resend(desc, irq); |
| 301 | /* fall-through */ | 430 | /* fall-through */ |
| 302 | } | 431 | } |
| @@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
| 318 | */ | 447 | */ |
| 319 | void enable_irq(unsigned int irq) | 448 | void enable_irq(unsigned int irq) |
| 320 | { | 449 | { |
| 321 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 322 | unsigned long flags; | 450 | unsigned long flags; |
| 451 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
| 323 | 452 | ||
| 324 | if (!desc) | 453 | if (!desc) |
| 325 | return; | 454 | return; |
| 455 | if (WARN(!desc->irq_data.chip, | ||
| 456 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | ||
| 457 | goto out; | ||
| 326 | 458 | ||
| 327 | if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, | ||
| 328 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | ||
| 329 | return; | ||
| 330 | |||
| 331 | chip_bus_lock(desc); | ||
| 332 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 333 | __enable_irq(desc, irq, false); | 459 | __enable_irq(desc, irq, false); |
| 334 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 460 | out: |
| 335 | chip_bus_sync_unlock(desc); | 461 | irq_put_desc_busunlock(desc, flags); |
| 336 | } | 462 | } |
| 337 | EXPORT_SYMBOL(enable_irq); | 463 | EXPORT_SYMBOL(enable_irq); |
| 338 | 464 | ||
| @@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
| 348 | } | 474 | } |
| 349 | 475 | ||
| 350 | /** | 476 | /** |
| 351 | * set_irq_wake - control irq power management wakeup | 477 | * irq_set_irq_wake - control irq power management wakeup |
| 352 | * @irq: interrupt to control | 478 | * @irq: interrupt to control |
| 353 | * @on: enable/disable power management wakeup | 479 | * @on: enable/disable power management wakeup |
| 354 | * | 480 | * |
| @@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
| 359 | * Wakeup mode lets this IRQ wake the system from sleep | 485 | * Wakeup mode lets this IRQ wake the system from sleep |
| 360 | * states like "suspend to RAM". | 486 | * states like "suspend to RAM". |
| 361 | */ | 487 | */ |
| 362 | int set_irq_wake(unsigned int irq, unsigned int on) | 488 | int irq_set_irq_wake(unsigned int irq, unsigned int on) |
| 363 | { | 489 | { |
| 364 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 365 | unsigned long flags; | 490 | unsigned long flags; |
| 491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
| 366 | int ret = 0; | 492 | int ret = 0; |
| 367 | 493 | ||
| 368 | /* wakeup-capable irqs can be shared between drivers that | 494 | /* wakeup-capable irqs can be shared between drivers that |
| 369 | * don't need to have the same sleep mode behaviors. | 495 | * don't need to have the same sleep mode behaviors. |
| 370 | */ | 496 | */ |
| 371 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 372 | if (on) { | 497 | if (on) { |
| 373 | if (desc->wake_depth++ == 0) { | 498 | if (desc->wake_depth++ == 0) { |
| 374 | ret = set_irq_wake_real(irq, on); | 499 | ret = set_irq_wake_real(irq, on); |
| 375 | if (ret) | 500 | if (ret) |
| 376 | desc->wake_depth = 0; | 501 | desc->wake_depth = 0; |
| 377 | else | 502 | else |
| 378 | desc->status |= IRQ_WAKEUP; | 503 | irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); |
| 379 | } | 504 | } |
| 380 | } else { | 505 | } else { |
| 381 | if (desc->wake_depth == 0) { | 506 | if (desc->wake_depth == 0) { |
| @@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) | |||
| 385 | if (ret) | 510 | if (ret) |
| 386 | desc->wake_depth = 1; | 511 | desc->wake_depth = 1; |
| 387 | else | 512 | else |
| 388 | desc->status &= ~IRQ_WAKEUP; | 513 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); |
| 389 | } | 514 | } |
| 390 | } | 515 | } |
| 391 | 516 | irq_put_desc_busunlock(desc, flags); | |
| 392 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 393 | return ret; | 517 | return ret; |
| 394 | } | 518 | } |
| 395 | EXPORT_SYMBOL(set_irq_wake); | 519 | EXPORT_SYMBOL(irq_set_irq_wake); |
| 396 | 520 | ||
| 397 | /* | 521 | /* |
| 398 | * Internal function that tells the architecture code whether a | 522 | * Internal function that tells the architecture code whether a |
| @@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake); | |||
| 401 | */ | 525 | */ |
| 402 | int can_request_irq(unsigned int irq, unsigned long irqflags) | 526 | int can_request_irq(unsigned int irq, unsigned long irqflags) |
| 403 | { | 527 | { |
| 404 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 405 | struct irqaction *action; | ||
| 406 | unsigned long flags; | 528 | unsigned long flags; |
| 529 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
| 530 | int canrequest = 0; | ||
| 407 | 531 | ||
| 408 | if (!desc) | 532 | if (!desc) |
| 409 | return 0; | 533 | return 0; |
| 410 | 534 | ||
| 411 | if (desc->status & IRQ_NOREQUEST) | 535 | if (irq_settings_can_request(desc)) { |
| 412 | return 0; | 536 | if (desc->action) |
| 413 | 537 | if (irqflags & desc->action->flags & IRQF_SHARED) | |
| 414 | raw_spin_lock_irqsave(&desc->lock, flags); | 538 | canrequest =1; |
| 415 | action = desc->action; | 539 | } |
| 416 | if (action) | 540 | irq_put_desc_unlock(desc, flags); |
| 417 | if (irqflags & action->flags & IRQF_SHARED) | 541 | return canrequest; |
| 418 | action = NULL; | ||
| 419 | |||
| 420 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 421 | |||
| 422 | return !action; | ||
| 423 | } | ||
| 424 | |||
| 425 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
| 426 | { | ||
| 427 | /* | ||
| 428 | * If the architecture still has not overridden | ||
| 429 | * the flow handler then zap the default. This | ||
| 430 | * should catch incorrect flow-type setting. | ||
| 431 | */ | ||
| 432 | if (desc->handle_irq == &handle_bad_irq) | ||
| 433 | desc->handle_irq = NULL; | ||
| 434 | } | 542 | } |
| 435 | 543 | ||
| 436 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 544 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
| 437 | unsigned long flags) | 545 | unsigned long flags) |
| 438 | { | 546 | { |
| 439 | int ret; | ||
| 440 | struct irq_chip *chip = desc->irq_data.chip; | 547 | struct irq_chip *chip = desc->irq_data.chip; |
| 548 | int ret, unmask = 0; | ||
| 441 | 549 | ||
| 442 | if (!chip || !chip->irq_set_type) { | 550 | if (!chip || !chip->irq_set_type) { |
| 443 | /* | 551 | /* |
| @@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
| 449 | return 0; | 557 | return 0; |
| 450 | } | 558 | } |
| 451 | 559 | ||
| 560 | flags &= IRQ_TYPE_SENSE_MASK; | ||
| 561 | |||
| 562 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | ||
| 563 | if (!irqd_irq_masked(&desc->irq_data)) | ||
| 564 | mask_irq(desc); | ||
| 565 | if (!irqd_irq_disabled(&desc->irq_data)) | ||
| 566 | unmask = 1; | ||
| 567 | } | ||
| 568 | |||
| 452 | /* caller masked out all except trigger mode flags */ | 569 | /* caller masked out all except trigger mode flags */ |
| 453 | ret = chip->irq_set_type(&desc->irq_data, flags); | 570 | ret = chip->irq_set_type(&desc->irq_data, flags); |
| 454 | 571 | ||
| 455 | if (ret) | 572 | switch (ret) { |
| 573 | case IRQ_SET_MASK_OK: | ||
| 574 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); | ||
| 575 | irqd_set(&desc->irq_data, flags); | ||
| 576 | |||
| 577 | case IRQ_SET_MASK_OK_NOCOPY: | ||
| 578 | flags = irqd_get_trigger_type(&desc->irq_data); | ||
| 579 | irq_settings_set_trigger_mask(desc, flags); | ||
| 580 | irqd_clear(&desc->irq_data, IRQD_LEVEL); | ||
| 581 | irq_settings_clr_level(desc); | ||
| 582 | if (flags & IRQ_TYPE_LEVEL_MASK) { | ||
| 583 | irq_settings_set_level(desc); | ||
| 584 | irqd_set(&desc->irq_data, IRQD_LEVEL); | ||
| 585 | } | ||
| 586 | |||
| 587 | ret = 0; | ||
| 588 | break; | ||
| 589 | default: | ||
| 456 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 590 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", |
| 457 | flags, irq, chip->irq_set_type); | 591 | flags, irq, chip->irq_set_type); |
| 458 | else { | ||
| 459 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) | ||
| 460 | flags |= IRQ_LEVEL; | ||
| 461 | /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ | ||
| 462 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); | ||
| 463 | desc->status |= flags; | ||
| 464 | |||
| 465 | if (chip != desc->irq_data.chip) | ||
| 466 | irq_chip_set_defaults(desc->irq_data.chip); | ||
| 467 | } | 592 | } |
| 468 | 593 | if (unmask) | |
| 594 | unmask_irq(desc); | ||
| 469 | return ret; | 595 | return ret; |
| 470 | } | 596 | } |
| 471 | 597 | ||
| @@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
| 509 | * handler finished. unmask if the interrupt has not been disabled and | 635 | * handler finished. unmask if the interrupt has not been disabled and |
| 510 | * is marked MASKED. | 636 | * is marked MASKED. |
| 511 | */ | 637 | */ |
| 512 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 638 | static void irq_finalize_oneshot(struct irq_desc *desc, |
| 639 | struct irqaction *action, bool force) | ||
| 513 | { | 640 | { |
| 641 | if (!(desc->istate & IRQS_ONESHOT)) | ||
| 642 | return; | ||
| 514 | again: | 643 | again: |
| 515 | chip_bus_lock(desc); | 644 | chip_bus_lock(desc); |
| 516 | raw_spin_lock_irq(&desc->lock); | 645 | raw_spin_lock_irq(&desc->lock); |
| @@ -522,26 +651,42 @@ again: | |||
| 522 | * The thread is faster done than the hard interrupt handler | 651 | * The thread is faster done than the hard interrupt handler |
| 523 | * on the other CPU. If we unmask the irq line then the | 652 | * on the other CPU. If we unmask the irq line then the |
| 524 | * interrupt can come in again and masks the line, leaves due | 653 | * interrupt can come in again and masks the line, leaves due |
| 525 | * to IRQ_INPROGRESS and the irq line is masked forever. | 654 | * to IRQS_INPROGRESS and the irq line is masked forever. |
| 655 | * | ||
| 656 | * This also serializes the state of shared oneshot handlers | ||
| 657 | * versus "desc->threads_oneshot |= action->thread_mask;" in | ||
| 658 | * irq_wake_thread(). See the comment there which explains the | ||
| 659 | * serialization. | ||
| 526 | */ | 660 | */ |
| 527 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | 661 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { |
| 528 | raw_spin_unlock_irq(&desc->lock); | 662 | raw_spin_unlock_irq(&desc->lock); |
| 529 | chip_bus_sync_unlock(desc); | 663 | chip_bus_sync_unlock(desc); |
| 530 | cpu_relax(); | 664 | cpu_relax(); |
| 531 | goto again; | 665 | goto again; |
| 532 | } | 666 | } |
| 533 | 667 | ||
| 534 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 668 | /* |
| 535 | desc->status &= ~IRQ_MASKED; | 669 | * Now check again, whether the thread should run. Otherwise |
| 536 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 670 | * we would clear the threads_oneshot bit of this thread which |
| 537 | } | 671 | * was just set. |
| 672 | */ | ||
| 673 | if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
| 674 | goto out_unlock; | ||
| 675 | |||
| 676 | desc->threads_oneshot &= ~action->thread_mask; | ||
| 677 | |||
| 678 | if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && | ||
| 679 | irqd_irq_masked(&desc->irq_data)) | ||
| 680 | unmask_irq(desc); | ||
| 681 | |||
| 682 | out_unlock: | ||
| 538 | raw_spin_unlock_irq(&desc->lock); | 683 | raw_spin_unlock_irq(&desc->lock); |
| 539 | chip_bus_sync_unlock(desc); | 684 | chip_bus_sync_unlock(desc); |
| 540 | } | 685 | } |
| 541 | 686 | ||
| 542 | #ifdef CONFIG_SMP | 687 | #ifdef CONFIG_SMP |
| 543 | /* | 688 | /* |
| 544 | * Check whether we need to change the affinity of the interrupt thread. | 689 | * Check whether we need to change the affinity of the interrupt thread. |
| 545 | */ | 690 | */ |
| 546 | static void | 691 | static void |
| 547 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 692 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
| @@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
| 573 | #endif | 718 | #endif |
| 574 | 719 | ||
| 575 | /* | 720 | /* |
| 721 | * Interrupts which are not explicitly requested as threaded | ||
| 722 | * interrupts rely on the implicit bh/preempt disable of the hard irq | ||
| 723 | * context. So we need to disable bh here to avoid deadlocks and other | ||
| 724 | * side effects. | ||
| 725 | */ | ||
| 726 | static void | ||
| 727 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | ||
| 728 | { | ||
| 729 | local_bh_disable(); | ||
| 730 | action->thread_fn(action->irq, action->dev_id); | ||
| 731 | irq_finalize_oneshot(desc, action, false); | ||
| 732 | local_bh_enable(); | ||
| 733 | } | ||
| 734 | |||
| 735 | /* | ||
| 736 | * Interrupts explicitly requested as threaded interrupts want to be | ||
| 737 | * preemptible - many of them need to sleep and wait for slow busses to | ||
| 738 | * complete. | ||
| 739 | */ | ||
| 740 | static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) | ||
| 741 | { | ||
| 742 | action->thread_fn(action->irq, action->dev_id); | ||
| 743 | irq_finalize_oneshot(desc, action, false); | ||
| 744 | } | ||
| 745 | |||
| 746 | /* | ||
| 576 | * Interrupt handler thread | 747 | * Interrupt handler thread |
| 577 | */ | 748 | */ |
| 578 | static int irq_thread(void *data) | 749 | static int irq_thread(void *data) |
| @@ -582,7 +753,14 @@ static int irq_thread(void *data) | |||
| 582 | }; | 753 | }; |
| 583 | struct irqaction *action = data; | 754 | struct irqaction *action = data; |
| 584 | struct irq_desc *desc = irq_to_desc(action->irq); | 755 | struct irq_desc *desc = irq_to_desc(action->irq); |
| 585 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 756 | void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); |
| 757 | int wake; | ||
| 758 | |||
| 759 | if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, | ||
| 760 | &action->thread_flags)) | ||
| 761 | handler_fn = irq_forced_thread_fn; | ||
| 762 | else | ||
| 763 | handler_fn = irq_thread_fn; | ||
| 586 | 764 | ||
| 587 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 765 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
| 588 | current->irqaction = action; | 766 | current->irqaction = action; |
| @@ -594,23 +772,19 @@ static int irq_thread(void *data) | |||
| 594 | atomic_inc(&desc->threads_active); | 772 | atomic_inc(&desc->threads_active); |
| 595 | 773 | ||
| 596 | raw_spin_lock_irq(&desc->lock); | 774 | raw_spin_lock_irq(&desc->lock); |
| 597 | if (unlikely(desc->status & IRQ_DISABLED)) { | 775 | if (unlikely(irqd_irq_disabled(&desc->irq_data))) { |
| 598 | /* | 776 | /* |
| 599 | * CHECKME: We might need a dedicated | 777 | * CHECKME: We might need a dedicated |
| 600 | * IRQ_THREAD_PENDING flag here, which | 778 | * IRQ_THREAD_PENDING flag here, which |
| 601 | * retriggers the thread in check_irq_resend() | 779 | * retriggers the thread in check_irq_resend() |
| 602 | * but AFAICT IRQ_PENDING should be fine as it | 780 | * but AFAICT IRQS_PENDING should be fine as it |
| 603 | * retriggers the interrupt itself --- tglx | 781 | * retriggers the interrupt itself --- tglx |
| 604 | */ | 782 | */ |
| 605 | desc->status |= IRQ_PENDING; | 783 | desc->istate |= IRQS_PENDING; |
| 606 | raw_spin_unlock_irq(&desc->lock); | 784 | raw_spin_unlock_irq(&desc->lock); |
| 607 | } else { | 785 | } else { |
| 608 | raw_spin_unlock_irq(&desc->lock); | 786 | raw_spin_unlock_irq(&desc->lock); |
| 609 | 787 | handler_fn(desc, action); | |
| 610 | action->thread_fn(action->irq, action->dev_id); | ||
| 611 | |||
| 612 | if (oneshot) | ||
| 613 | irq_finalize_oneshot(action->irq, desc); | ||
| 614 | } | 788 | } |
| 615 | 789 | ||
| 616 | wake = atomic_dec_and_test(&desc->threads_active); | 790 | wake = atomic_dec_and_test(&desc->threads_active); |
| @@ -619,6 +793,9 @@ static int irq_thread(void *data) | |||
| 619 | wake_up(&desc->wait_for_threads); | 793 | wake_up(&desc->wait_for_threads); |
| 620 | } | 794 | } |
| 621 | 795 | ||
| 796 | /* Prevent a stale desc->threads_oneshot */ | ||
| 797 | irq_finalize_oneshot(desc, action, true); | ||
| 798 | |||
| 622 | /* | 799 | /* |
| 623 | * Clear irqaction. Otherwise exit_irq_thread() would make | 800 | * Clear irqaction. Otherwise exit_irq_thread() would make |
| 624 | * fuzz about an active irq thread going into nirvana. | 801 | * fuzz about an active irq thread going into nirvana. |
| @@ -633,6 +810,7 @@ static int irq_thread(void *data) | |||
| 633 | void exit_irq_thread(void) | 810 | void exit_irq_thread(void) |
| 634 | { | 811 | { |
| 635 | struct task_struct *tsk = current; | 812 | struct task_struct *tsk = current; |
| 813 | struct irq_desc *desc; | ||
| 636 | 814 | ||
| 637 | if (!tsk->irqaction) | 815 | if (!tsk->irqaction) |
| 638 | return; | 816 | return; |
| @@ -641,6 +819,14 @@ void exit_irq_thread(void) | |||
| 641 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 819 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
| 642 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); | 820 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); |
| 643 | 821 | ||
| 822 | desc = irq_to_desc(tsk->irqaction->irq); | ||
| 823 | |||
| 824 | /* | ||
| 825 | * Prevent a stale desc->threads_oneshot. Must be called | ||
| 826 | * before setting the IRQTF_DIED flag. | ||
| 827 | */ | ||
| 828 | irq_finalize_oneshot(desc, tsk->irqaction, true); | ||
| 829 | |||
| 644 | /* | 830 | /* |
| 645 | * Set the THREAD DIED flag to prevent further wakeups of the | 831 | * Set the THREAD DIED flag to prevent further wakeups of the |
| 646 | * soon to be gone threaded handler. | 832 | * soon to be gone threaded handler. |
| @@ -648,6 +834,22 @@ void exit_irq_thread(void) | |||
| 648 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); | 834 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); |
| 649 | } | 835 | } |
| 650 | 836 | ||
| 837 | static void irq_setup_forced_threading(struct irqaction *new) | ||
| 838 | { | ||
| 839 | if (!force_irqthreads) | ||
| 840 | return; | ||
| 841 | if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) | ||
| 842 | return; | ||
| 843 | |||
| 844 | new->flags |= IRQF_ONESHOT; | ||
| 845 | |||
| 846 | if (!new->thread_fn) { | ||
| 847 | set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); | ||
| 848 | new->thread_fn = new->handler; | ||
| 849 | new->handler = irq_default_primary_handler; | ||
| 850 | } | ||
| 851 | } | ||
| 852 | |||
| 651 | /* | 853 | /* |
| 652 | * Internal function to register an irqaction - typically used to | 854 | * Internal function to register an irqaction - typically used to |
| 653 | * allocate special interrupts that are part of the architecture. | 855 | * allocate special interrupts that are part of the architecture. |
| @@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 657 | { | 859 | { |
| 658 | struct irqaction *old, **old_ptr; | 860 | struct irqaction *old, **old_ptr; |
| 659 | const char *old_name = NULL; | 861 | const char *old_name = NULL; |
| 660 | unsigned long flags; | 862 | unsigned long flags, thread_mask = 0; |
| 661 | int nested, shared = 0; | 863 | int ret, nested, shared = 0; |
| 662 | int ret; | 864 | cpumask_var_t mask; |
| 663 | 865 | ||
| 664 | if (!desc) | 866 | if (!desc) |
| 665 | return -EINVAL; | 867 | return -EINVAL; |
| @@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 683 | rand_initialize_irq(irq); | 885 | rand_initialize_irq(irq); |
| 684 | } | 886 | } |
| 685 | 887 | ||
| 686 | /* Oneshot interrupts are not allowed with shared */ | ||
| 687 | if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) | ||
| 688 | return -EINVAL; | ||
| 689 | |||
| 690 | /* | 888 | /* |
| 691 | * Check whether the interrupt nests into another interrupt | 889 | * Check whether the interrupt nests into another interrupt |
| 692 | * thread. | 890 | * thread. |
| 693 | */ | 891 | */ |
| 694 | nested = desc->status & IRQ_NESTED_THREAD; | 892 | nested = irq_settings_is_nested_thread(desc); |
| 695 | if (nested) { | 893 | if (nested) { |
| 696 | if (!new->thread_fn) | 894 | if (!new->thread_fn) |
| 697 | return -EINVAL; | 895 | return -EINVAL; |
| @@ -701,6 +899,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 701 | * dummy function which warns when called. | 899 | * dummy function which warns when called. |
| 702 | */ | 900 | */ |
| 703 | new->handler = irq_nested_primary_handler; | 901 | new->handler = irq_nested_primary_handler; |
| 902 | } else { | ||
| 903 | irq_setup_forced_threading(new); | ||
| 704 | } | 904 | } |
| 705 | 905 | ||
| 706 | /* | 906 | /* |
| @@ -724,6 +924,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 724 | new->thread = t; | 924 | new->thread = t; |
| 725 | } | 925 | } |
| 726 | 926 | ||
| 927 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | ||
| 928 | ret = -ENOMEM; | ||
| 929 | goto out_thread; | ||
| 930 | } | ||
| 931 | |||
| 727 | /* | 932 | /* |
| 728 | * The following block of code has to be executed atomically | 933 | * The following block of code has to be executed atomically |
| 729 | */ | 934 | */ |
| @@ -735,32 +940,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 735 | * Can't share interrupts unless both agree to and are | 940 | * Can't share interrupts unless both agree to and are |
| 736 | * the same type (level, edge, polarity). So both flag | 941 | * the same type (level, edge, polarity). So both flag |
| 737 | * fields must have IRQF_SHARED set and the bits which | 942 | * fields must have IRQF_SHARED set and the bits which |
| 738 | * set the trigger type must match. | 943 | * set the trigger type must match. Also all must |
| 944 | * agree on ONESHOT. | ||
| 739 | */ | 945 | */ |
| 740 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 946 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
| 741 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { | 947 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
| 948 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | ||
| 742 | old_name = old->name; | 949 | old_name = old->name; |
| 743 | goto mismatch; | 950 | goto mismatch; |
| 744 | } | 951 | } |
| 745 | 952 | ||
| 746 | #if defined(CONFIG_IRQ_PER_CPU) | ||
| 747 | /* All handlers must agree on per-cpuness */ | 953 | /* All handlers must agree on per-cpuness */ |
| 748 | if ((old->flags & IRQF_PERCPU) != | 954 | if ((old->flags & IRQF_PERCPU) != |
| 749 | (new->flags & IRQF_PERCPU)) | 955 | (new->flags & IRQF_PERCPU)) |
| 750 | goto mismatch; | 956 | goto mismatch; |
| 751 | #endif | ||
| 752 | 957 | ||
| 753 | /* add new interrupt at end of irq queue */ | 958 | /* add new interrupt at end of irq queue */ |
| 754 | do { | 959 | do { |
| 960 | thread_mask |= old->thread_mask; | ||
| 755 | old_ptr = &old->next; | 961 | old_ptr = &old->next; |
| 756 | old = *old_ptr; | 962 | old = *old_ptr; |
| 757 | } while (old); | 963 | } while (old); |
| 758 | shared = 1; | 964 | shared = 1; |
| 759 | } | 965 | } |
| 760 | 966 | ||
| 761 | if (!shared) { | 967 | /* |
| 762 | irq_chip_set_defaults(desc->irq_data.chip); | 968 | * Setup the thread mask for this irqaction. Unlikely to have |
| 969 | * 32 resp 64 irqs sharing one line, but who knows. | ||
| 970 | */ | ||
| 971 | if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { | ||
| 972 | ret = -EBUSY; | ||
| 973 | goto out_mask; | ||
| 974 | } | ||
| 975 | new->thread_mask = 1 << ffz(thread_mask); | ||
| 763 | 976 | ||
| 977 | if (!shared) { | ||
| 764 | init_waitqueue_head(&desc->wait_for_threads); | 978 | init_waitqueue_head(&desc->wait_for_threads); |
| 765 | 979 | ||
| 766 | /* Setup the type (level, edge polarity) if configured: */ | 980 | /* Setup the type (level, edge polarity) if configured: */ |
| @@ -769,42 +983,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 769 | new->flags & IRQF_TRIGGER_MASK); | 983 | new->flags & IRQF_TRIGGER_MASK); |
| 770 | 984 | ||
| 771 | if (ret) | 985 | if (ret) |
| 772 | goto out_thread; | 986 | goto out_mask; |
| 773 | } else | 987 | } |
| 774 | compat_irq_chip_set_default_handler(desc); | 988 | |
| 775 | #if defined(CONFIG_IRQ_PER_CPU) | 989 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ |
| 776 | if (new->flags & IRQF_PERCPU) | 990 | IRQS_ONESHOT | IRQS_WAITING); |
| 777 | desc->status |= IRQ_PER_CPU; | 991 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
| 778 | #endif | ||
| 779 | 992 | ||
| 780 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | | 993 | if (new->flags & IRQF_PERCPU) { |
| 781 | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); | 994 | irqd_set(&desc->irq_data, IRQD_PER_CPU); |
| 995 | irq_settings_set_per_cpu(desc); | ||
| 996 | } | ||
| 782 | 997 | ||
| 783 | if (new->flags & IRQF_ONESHOT) | 998 | if (new->flags & IRQF_ONESHOT) |
| 784 | desc->status |= IRQ_ONESHOT; | 999 | desc->istate |= IRQS_ONESHOT; |
| 785 | 1000 | ||
| 786 | if (!(desc->status & IRQ_NOAUTOEN)) { | 1001 | if (irq_settings_can_autoenable(desc)) |
| 787 | desc->depth = 0; | 1002 | irq_startup(desc); |
| 788 | desc->status &= ~IRQ_DISABLED; | 1003 | else |
| 789 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
| 790 | } else | ||
| 791 | /* Undo nested disables: */ | 1004 | /* Undo nested disables: */ |
| 792 | desc->depth = 1; | 1005 | desc->depth = 1; |
| 793 | 1006 | ||
| 794 | /* Exclude IRQ from balancing if requested */ | 1007 | /* Exclude IRQ from balancing if requested */ |
| 795 | if (new->flags & IRQF_NOBALANCING) | 1008 | if (new->flags & IRQF_NOBALANCING) { |
| 796 | desc->status |= IRQ_NO_BALANCING; | 1009 | irq_settings_set_no_balancing(desc); |
| 1010 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
| 1011 | } | ||
| 797 | 1012 | ||
| 798 | /* Set default affinity mask once everything is setup */ | 1013 | /* Set default affinity mask once everything is setup */ |
| 799 | setup_affinity(irq, desc); | 1014 | setup_affinity(irq, desc, mask); |
| 800 | 1015 | ||
| 801 | } else if ((new->flags & IRQF_TRIGGER_MASK) | 1016 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
| 802 | && (new->flags & IRQF_TRIGGER_MASK) | 1017 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
| 803 | != (desc->status & IRQ_TYPE_SENSE_MASK)) { | 1018 | unsigned int omsk = irq_settings_get_trigger_mask(desc); |
| 804 | /* hope the handler works with the actual trigger mode... */ | 1019 | |
| 805 | pr_warning("IRQ %d uses trigger mode %d; requested %d\n", | 1020 | if (nmsk != omsk) |
| 806 | irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), | 1021 | /* hope the handler works with current trigger mode */ |
| 807 | (int)(new->flags & IRQF_TRIGGER_MASK)); | 1022 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", |
| 1023 | irq, nmsk, omsk); | ||
| 808 | } | 1024 | } |
| 809 | 1025 | ||
| 810 | new->irq = irq; | 1026 | new->irq = irq; |
| @@ -818,8 +1034,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 818 | * Check whether we disabled the irq via the spurious handler | 1034 | * Check whether we disabled the irq via the spurious handler |
| 819 | * before. Reenable it and give it another chance. | 1035 | * before. Reenable it and give it another chance. |
| 820 | */ | 1036 | */ |
| 821 | if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { | 1037 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
| 822 | desc->status &= ~IRQ_SPURIOUS_DISABLED; | 1038 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
| 823 | __enable_irq(desc, irq, false); | 1039 | __enable_irq(desc, irq, false); |
| 824 | } | 1040 | } |
| 825 | 1041 | ||
| @@ -835,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 835 | register_irq_proc(irq, desc); | 1051 | register_irq_proc(irq, desc); |
| 836 | new->dir = NULL; | 1052 | new->dir = NULL; |
| 837 | register_handler_proc(irq, new); | 1053 | register_handler_proc(irq, new); |
| 1054 | free_cpumask_var(mask); | ||
| 838 | 1055 | ||
| 839 | return 0; | 1056 | return 0; |
| 840 | 1057 | ||
| @@ -849,8 +1066,11 @@ mismatch: | |||
| 849 | #endif | 1066 | #endif |
| 850 | ret = -EBUSY; | 1067 | ret = -EBUSY; |
| 851 | 1068 | ||
| 852 | out_thread: | 1069 | out_mask: |
| 853 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1070 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1071 | free_cpumask_var(mask); | ||
| 1072 | |||
| 1073 | out_thread: | ||
| 854 | if (new->thread) { | 1074 | if (new->thread) { |
| 855 | struct task_struct *t = new->thread; | 1075 | struct task_struct *t = new->thread; |
| 856 | 1076 | ||
| @@ -871,9 +1091,14 @@ out_thread: | |||
| 871 | */ | 1091 | */ |
| 872 | int setup_irq(unsigned int irq, struct irqaction *act) | 1092 | int setup_irq(unsigned int irq, struct irqaction *act) |
| 873 | { | 1093 | { |
| 1094 | int retval; | ||
| 874 | struct irq_desc *desc = irq_to_desc(irq); | 1095 | struct irq_desc *desc = irq_to_desc(irq); |
| 875 | 1096 | ||
| 876 | return __setup_irq(irq, desc, act); | 1097 | chip_bus_lock(desc); |
| 1098 | retval = __setup_irq(irq, desc, act); | ||
| 1099 | chip_bus_sync_unlock(desc); | ||
| 1100 | |||
| 1101 | return retval; | ||
| 877 | } | 1102 | } |
| 878 | EXPORT_SYMBOL_GPL(setup_irq); | 1103 | EXPORT_SYMBOL_GPL(setup_irq); |
| 879 | 1104 | ||
| @@ -924,13 +1149,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 924 | #endif | 1149 | #endif |
| 925 | 1150 | ||
| 926 | /* If this was the last handler, shut down the IRQ line: */ | 1151 | /* If this was the last handler, shut down the IRQ line: */ |
| 927 | if (!desc->action) { | 1152 | if (!desc->action) |
| 928 | desc->status |= IRQ_DISABLED; | 1153 | irq_shutdown(desc); |
| 929 | if (desc->irq_data.chip->irq_shutdown) | ||
| 930 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | ||
| 931 | else | ||
| 932 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
| 933 | } | ||
| 934 | 1154 | ||
| 935 | #ifdef CONFIG_SMP | 1155 | #ifdef CONFIG_SMP |
| 936 | /* make sure affinity_hint is cleaned up */ | 1156 | /* make sure affinity_hint is cleaned up */ |
| @@ -1004,6 +1224,11 @@ void free_irq(unsigned int irq, void *dev_id) | |||
| 1004 | if (!desc) | 1224 | if (!desc) |
| 1005 | return; | 1225 | return; |
| 1006 | 1226 | ||
| 1227 | #ifdef CONFIG_SMP | ||
| 1228 | if (WARN_ON(desc->affinity_notify)) | ||
| 1229 | desc->affinity_notify = NULL; | ||
| 1230 | #endif | ||
| 1231 | |||
| 1007 | chip_bus_lock(desc); | 1232 | chip_bus_lock(desc); |
| 1008 | kfree(__free_irq(irq, dev_id)); | 1233 | kfree(__free_irq(irq, dev_id)); |
| 1009 | chip_bus_sync_unlock(desc); | 1234 | chip_bus_sync_unlock(desc); |
| @@ -1074,7 +1299,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1074 | if (!desc) | 1299 | if (!desc) |
| 1075 | return -EINVAL; | 1300 | return -EINVAL; |
| 1076 | 1301 | ||
| 1077 | if (desc->status & IRQ_NOREQUEST) | 1302 | if (!irq_settings_can_request(desc)) |
| 1078 | return -EINVAL; | 1303 | return -EINVAL; |
| 1079 | 1304 | ||
| 1080 | if (!handler) { | 1305 | if (!handler) { |
| @@ -1100,7 +1325,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1100 | if (retval) | 1325 | if (retval) |
| 1101 | kfree(action); | 1326 | kfree(action); |
| 1102 | 1327 | ||
| 1103 | #ifdef CONFIG_DEBUG_SHIRQ | 1328 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME |
| 1104 | if (!retval && (irqflags & IRQF_SHARED)) { | 1329 | if (!retval && (irqflags & IRQF_SHARED)) { |
| 1105 | /* | 1330 | /* |
| 1106 | * It's a shared IRQ -- the driver ought to be prepared for it | 1331 | * It's a shared IRQ -- the driver ought to be prepared for it |
| @@ -1149,7 +1374,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, | |||
| 1149 | if (!desc) | 1374 | if (!desc) |
| 1150 | return -EINVAL; | 1375 | return -EINVAL; |
| 1151 | 1376 | ||
| 1152 | if (desc->status & IRQ_NESTED_THREAD) { | 1377 | if (irq_settings_is_nested_thread(desc)) { |
| 1153 | ret = request_threaded_irq(irq, NULL, handler, | 1378 | ret = request_threaded_irq(irq, NULL, handler, |
| 1154 | flags, name, dev_id); | 1379 | flags, name, dev_id); |
| 1155 | return !ret ? IRQC_IS_NESTED : ret; | 1380 | return !ret ? IRQC_IS_NESTED : ret; |
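The __setup_irq() rework above drops the blanket "oneshot plus shared is invalid" check and instead requires every action on a shared line to agree on IRQF_ONESHOT, giving each one its own bit in thread_mask. A minimal driver-side sketch of the pattern this serves — a quick hard-irq check plus a threaded handler on a shared, oneshot line — follows; the my_dev structure, its register layout and all names are hypothetical, not part of the patch.

#include <linux/interrupt.h>
#include <linux/io.h>

struct my_dev {
	void __iomem *regs;	/* hypothetical status register */
	int irq;
};

static irqreturn_t my_quick_check(int irq, void *dev_id)
{
	struct my_dev *d = dev_id;

	if (!readl(d->regs))		/* not our interrupt on the shared line */
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;		/* defer the real work to the irq thread */
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/*
	 * Runs in process context; with IRQF_ONESHOT the line stays masked
	 * until we return, so sleeping bus accesses are fine here.
	 */
	return IRQ_HANDLED;
}

static int my_request(struct my_dev *d)
{
	return request_threaded_irq(d->irq, my_quick_check, my_thread_fn,
				    IRQF_SHARED | IRQF_ONESHOT,
				    "my-dev", d);
}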
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff04..47420908fba0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
| @@ -4,23 +4,23 @@ | |||
| 4 | 4 | ||
| 5 | #include "internals.h" | 5 | #include "internals.h" |
| 6 | 6 | ||
| 7 | void move_masked_irq(int irq) | 7 | void irq_move_masked_irq(struct irq_data *idata) |
| 8 | { | 8 | { |
| 9 | struct irq_desc *desc = irq_to_desc(irq); | 9 | struct irq_desc *desc = irq_data_to_desc(idata); |
| 10 | struct irq_chip *chip = desc->irq_data.chip; | 10 | struct irq_chip *chip = idata->chip; |
| 11 | 11 | ||
| 12 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 12 | if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) |
| 13 | return; | 13 | return; |
| 14 | 14 | ||
| 15 | /* | 15 | /* |
| 16 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. | 16 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. |
| 17 | */ | 17 | */ |
| 18 | if (CHECK_IRQ_PER_CPU(desc->status)) { | 18 | if (!irqd_can_balance(&desc->irq_data)) { |
| 19 | WARN_ON(1); | 19 | WARN_ON(1); |
| 20 | return; | 20 | return; |
| 21 | } | 21 | } |
| 22 | 22 | ||
| 23 | desc->status &= ~IRQ_MOVE_PENDING; | 23 | irqd_clr_move_pending(&desc->irq_data); |
| 24 | 24 | ||
| 25 | if (unlikely(cpumask_empty(desc->pending_mask))) | 25 | if (unlikely(cpumask_empty(desc->pending_mask))) |
| 26 | return; | 26 | return; |
| @@ -35,7 +35,7 @@ void move_masked_irq(int irq) | |||
| 35 | * do the disable, re-program, enable sequence. | 35 | * do the disable, re-program, enable sequence. |
| 36 | * This is *not* particularly important for level triggered | 36 | * This is *not* particularly important for level triggered |
| 37 | * but in a edge trigger case, we might be setting rte | 37 | * but in a edge trigger case, we might be setting rte |
| 38 | * when an active trigger is comming in. This could | 38 | * when an active trigger is coming in. This could |
| 39 | * cause some ioapics to mal-function. | 39 | * cause some ioapics to mal-function. |
| 40 | * Being paranoid i guess! | 40 | * Being paranoid i guess! |
| 41 | * | 41 | * |
| @@ -53,15 +53,14 @@ void move_masked_irq(int irq) | |||
| 53 | cpumask_clear(desc->pending_mask); | 53 | cpumask_clear(desc->pending_mask); |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | void move_native_irq(int irq) | 56 | void irq_move_irq(struct irq_data *idata) |
| 57 | { | 57 | { |
| 58 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 59 | bool masked; | 58 | bool masked; |
| 60 | 59 | ||
| 61 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 60 | if (likely(!irqd_is_setaffinity_pending(idata))) |
| 62 | return; | 61 | return; |
| 63 | 62 | ||
| 64 | if (unlikely(desc->status & IRQ_DISABLED)) | 63 | if (unlikely(irqd_irq_disabled(idata))) |
| 65 | return; | 64 | return; |
| 66 | 65 | ||
| 67 | /* | 66 | /* |
| @@ -69,10 +68,10 @@ void move_native_irq(int irq) | |||
| 69 | * threaded interrupt with ONESHOT set, we can end up with an | 68 | * threaded interrupt with ONESHOT set, we can end up with an |
| 70 | * interrupt storm. | 69 | * interrupt storm. |
| 71 | */ | 70 | */ |
| 72 | masked = desc->status & IRQ_MASKED; | 71 | masked = irqd_irq_masked(idata); |
| 73 | if (!masked) | 72 | if (!masked) |
| 74 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 73 | idata->chip->irq_mask(idata); |
| 75 | move_masked_irq(irq); | 74 | irq_move_masked_irq(idata); |
| 76 | if (!masked) | 75 | if (!masked) |
| 77 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 76 | idata->chip->irq_unmask(idata); |
| 78 | } | 77 | } |
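The migration helpers now take the irq_data the caller already holds instead of an irq number. A hedged sketch of a hypothetical arch-level chip ack hook using the new entry point, assuming CONFIG_GENERIC_PENDING_IRQ and that the hardware acknowledge itself lives elsewhere:

#include <linux/irq.h>

static void my_chip_ack(struct irq_data *data)
{
	/* hardware-specific acknowledge would go here (omitted) */

	/*
	 * Apply any pending affinity change while the source is quiet;
	 * the irq_data is passed directly, no irq_to_desc() lookup needed.
	 */
	irq_move_irq(data);
}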
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | * During system-wide suspend or hibernation device drivers need to be prevented | 18 | * During system-wide suspend or hibernation device drivers need to be prevented |
| 19 | * from receiving interrupts and this function is provided for this purpose. | 19 | * from receiving interrupts and this function is provided for this purpose. |
| 20 | * It marks all interrupt lines in use, except for the timer ones, as disabled | 20 | * It marks all interrupt lines in use, except for the timer ones, as disabled |
| 21 | * and sets the IRQ_SUSPENDED flag for each of them. | 21 | * and sets the IRQS_SUSPENDED flag for each of them. |
| 22 | */ | 22 | */ |
| 23 | void suspend_device_irqs(void) | 23 | void suspend_device_irqs(void) |
| 24 | { | 24 | { |
| @@ -34,7 +34,7 @@ void suspend_device_irqs(void) | |||
| 34 | } | 34 | } |
| 35 | 35 | ||
| 36 | for_each_irq_desc(irq, desc) | 36 | for_each_irq_desc(irq, desc) |
| 37 | if (desc->status & IRQ_SUSPENDED) | 37 | if (desc->istate & IRQS_SUSPENDED) |
| 38 | synchronize_irq(irq); | 38 | synchronize_irq(irq); |
| 39 | } | 39 | } |
| 40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
| @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); | |||
| 43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | 43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() |
| 44 | * | 44 | * |
| 45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | 45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that |
| 46 | * have the IRQ_SUSPENDED flag set. | 46 | * have the IRQS_SUSPENDED flag set. |
| 47 | */ | 47 | */ |
| 48 | void resume_device_irqs(void) | 48 | void resume_device_irqs(void) |
| 49 | { | 49 | { |
| @@ -53,9 +53,6 @@ void resume_device_irqs(void) | |||
| 53 | for_each_irq_desc(irq, desc) { | 53 | for_each_irq_desc(irq, desc) { |
| 54 | unsigned long flags; | 54 | unsigned long flags; |
| 55 | 55 | ||
| 56 | if (!(desc->status & IRQ_SUSPENDED)) | ||
| 57 | continue; | ||
| 58 | |||
| 59 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 60 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
| 61 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| @@ -71,9 +68,24 @@ int check_wakeup_irqs(void) | |||
| 71 | struct irq_desc *desc; | 68 | struct irq_desc *desc; |
| 72 | int irq; | 69 | int irq; |
| 73 | 70 | ||
| 74 | for_each_irq_desc(irq, desc) | 71 | for_each_irq_desc(irq, desc) { |
| 75 | if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) | 72 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
| 76 | return -EBUSY; | 73 | if (desc->istate & IRQS_PENDING) |
| 74 | return -EBUSY; | ||
| 75 | continue; | ||
| 76 | } | ||
| 77 | /* | ||
| 78 | * Check the non wakeup interrupts whether they need | ||
| 79 | * to be masked before finally going into suspend | ||
| 80 | * state. That's for hardware which has no wakeup | ||
| 81 | * source configuration facility. The chip | ||
| 82 | * implementation indicates that with | ||
| 83 | * IRQCHIP_MASK_ON_SUSPEND. | ||
| 84 | */ | ||
| 85 | if (desc->istate & IRQS_SUSPENDED && | ||
| 86 | irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
| 87 | mask_irq(desc); | ||
| 88 | } | ||
| 77 | 89 | ||
| 78 | return 0; | 90 | return 0; |
| 79 | } | 91 | } |
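check_wakeup_irqs() now masks suspended, non-wakeup lines when the chip advertises IRQCHIP_MASK_ON_SUSPEND. A sketch of how a chip driver opts in — the PMIC chip and its mask helpers are invented for illustration:

#include <linux/irq.h>

static void my_pmic_irq_mask(struct irq_data *d)
{
	/* set the bit in the device's interrupt-mask register (omitted) */
}

static void my_pmic_irq_unmask(struct irq_data *d)
{
	/* clear the bit in the device's interrupt-mask register (omitted) */
}

static struct irq_chip my_pmic_irq_chip = {
	.name		= "my-pmic",
	.irq_mask	= my_pmic_irq_mask,
	.irq_unmask	= my_pmic_irq_unmask,
	/* no wakeup-source registers: let the core mask us before suspend */
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};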
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..834899f2500f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/proc_fs.h> | 11 | #include <linux/proc_fs.h> |
| 12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
| 13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
| 14 | #include <linux/kernel_stat.h> | ||
| 14 | 15 | ||
| 15 | #include "internals.h" | 16 | #include "internals.h" |
| 16 | 17 | ||
| @@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) | |||
| 24 | const struct cpumask *mask = desc->irq_data.affinity; | 25 | const struct cpumask *mask = desc->irq_data.affinity; |
| 25 | 26 | ||
| 26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 27 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 27 | if (desc->status & IRQ_MOVE_PENDING) | 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
| 28 | mask = desc->pending_mask; | 29 | mask = desc->pending_mask; |
| 29 | #endif | 30 | #endif |
| 30 | seq_cpumask(m, mask); | 31 | seq_cpumask(m, mask); |
| @@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
| 65 | cpumask_var_t new_value; | 66 | cpumask_var_t new_value; |
| 66 | int err; | 67 | int err; |
| 67 | 68 | ||
| 68 | if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || | 69 | if (!irq_can_set_affinity(irq) || no_irq_affinity) |
| 69 | irq_balancing_disabled(irq)) | ||
| 70 | return -EIO; | 70 | return -EIO; |
| 71 | 71 | ||
| 72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
| @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
| 89 | if (!cpumask_intersects(new_value, cpu_online_mask)) { | 89 | if (!cpumask_intersects(new_value, cpu_online_mask)) { |
| 90 | /* Special case for empty set - allow the architecture | 90 | /* Special case for empty set - allow the architecture |
| 91 | code to set default SMP affinity. */ | 91 | code to set default SMP affinity. */ |
| 92 | err = irq_select_affinity_usr(irq) ? -EINVAL : count; | 92 | err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; |
| 93 | } else { | 93 | } else { |
| 94 | irq_set_affinity(irq, new_value); | 94 | irq_set_affinity(irq, new_value); |
| 95 | err = count; | 95 | err = count; |
| @@ -357,3 +357,83 @@ void init_irq_proc(void) | |||
| 357 | } | 357 | } |
| 358 | } | 358 | } |
| 359 | 359 | ||
| 360 | #ifdef CONFIG_GENERIC_IRQ_SHOW | ||
| 361 | |||
| 362 | int __weak arch_show_interrupts(struct seq_file *p, int prec) | ||
| 363 | { | ||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | |||
| 367 | #ifndef ACTUAL_NR_IRQS | ||
| 368 | # define ACTUAL_NR_IRQS nr_irqs | ||
| 369 | #endif | ||
| 370 | |||
| 371 | int show_interrupts(struct seq_file *p, void *v) | ||
| 372 | { | ||
| 373 | static int prec; | ||
| 374 | |||
| 375 | unsigned long flags, any_count = 0; | ||
| 376 | int i = *(loff_t *) v, j; | ||
| 377 | struct irqaction *action; | ||
| 378 | struct irq_desc *desc; | ||
| 379 | |||
| 380 | if (i > ACTUAL_NR_IRQS) | ||
| 381 | return 0; | ||
| 382 | |||
| 383 | if (i == ACTUAL_NR_IRQS) | ||
| 384 | return arch_show_interrupts(p, prec); | ||
| 385 | |||
| 386 | /* print header and calculate the width of the first column */ | ||
| 387 | if (i == 0) { | ||
| 388 | for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) | ||
| 389 | j *= 10; | ||
| 390 | |||
| 391 | seq_printf(p, "%*s", prec + 8, ""); | ||
| 392 | for_each_online_cpu(j) | ||
| 393 | seq_printf(p, "CPU%-8d", j); | ||
| 394 | seq_putc(p, '\n'); | ||
| 395 | } | ||
| 396 | |||
| 397 | desc = irq_to_desc(i); | ||
| 398 | if (!desc) | ||
| 399 | return 0; | ||
| 400 | |||
| 401 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 402 | for_each_online_cpu(j) | ||
| 403 | any_count |= kstat_irqs_cpu(i, j); | ||
| 404 | action = desc->action; | ||
| 405 | if (!action && !any_count) | ||
| 406 | goto out; | ||
| 407 | |||
| 408 | seq_printf(p, "%*d: ", prec, i); | ||
| 409 | for_each_online_cpu(j) | ||
| 410 | seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); | ||
| 411 | |||
| 412 | if (desc->irq_data.chip) { | ||
| 413 | if (desc->irq_data.chip->irq_print_chip) | ||
| 414 | desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); | ||
| 415 | else if (desc->irq_data.chip->name) | ||
| 416 | seq_printf(p, " %8s", desc->irq_data.chip->name); | ||
| 417 | else | ||
| 418 | seq_printf(p, " %8s", "-"); | ||
| 419 | } else { | ||
| 420 | seq_printf(p, " %8s", "None"); | ||
| 421 | } | ||
| 422 | #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL | ||
| 423 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); | ||
| 424 | #endif | ||
| 425 | if (desc->name) | ||
| 426 | seq_printf(p, "-%-8s", desc->name); | ||
| 427 | |||
| 428 | if (action) { | ||
| 429 | seq_printf(p, " %s", action->name); | ||
| 430 | while ((action = action->next) != NULL) | ||
| 431 | seq_printf(p, ", %s", action->name); | ||
| 432 | } | ||
| 433 | |||
| 434 | seq_putc(p, '\n'); | ||
| 435 | out: | ||
| 436 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 437 | return 0; | ||
| 438 | } | ||
| 439 | #endif | ||
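With CONFIG_GENERIC_IRQ_SHOW the generic show_interrupts() above produces /proc/interrupts, and an architecture only supplies its extra rows by overriding the __weak arch_show_interrupts() hook. A sketch under that assumption; the per-cpu NMI counter is made up for the example:

#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, my_nmi_count);	/* hypothetical statistic */

int arch_show_interrupts(struct seq_file *p, int prec)
{
	int j;

	seq_printf(p, "%*s: ", prec, "NMI");
	for_each_online_cpu(j)
		seq_printf(p, "%10u ", per_cpu(my_nmi_count, j));
	seq_printf(p, "  Non-maskable interrupts\n");
	return 0;
}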
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a929aa..14dd5761e8c9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
| 24 | 24 | ||
| 25 | /* Bitmap to handle software resend of interrupts: */ | 25 | /* Bitmap to handle software resend of interrupts: */ |
| 26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | 26 | static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
| 29 | * Run software resends of IRQ's | 29 | * Run software resends of IRQ's |
| @@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | |||
| 55 | */ | 55 | */ |
| 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) |
| 57 | { | 57 | { |
| 58 | unsigned int status = desc->status; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Make sure the interrupt is enabled, before resending it: | ||
| 62 | */ | ||
| 63 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
| 64 | |||
| 65 | /* | 58 | /* |
| 66 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
| 67 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
| 68 | * active. | 61 | * active. |
| 69 | */ | 62 | */ |
| 70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 63 | if (irq_settings_is_level(desc)) |
| 71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 64 | return; |
| 65 | if (desc->istate & IRQS_REPLAY) | ||
| 66 | return; | ||
| 67 | if (desc->istate & IRQS_PENDING) { | ||
| 68 | desc->istate &= ~IRQS_PENDING; | ||
| 69 | desc->istate |= IRQS_REPLAY; | ||
| 72 | 70 | ||
| 73 | if (!desc->irq_data.chip->irq_retrigger || | 71 | if (!desc->irq_data.chip->irq_retrigger || |
| 74 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 72 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..0d91730b6330 --- /dev/null +++ b/kernel/irq/settings.h | |||
| @@ -0,0 +1,125 @@ | |||
| 1 | /* | ||
| 2 | * Internal header to deal with irq_desc->status which will be renamed | ||
| 3 | * to irq_desc->settings. | ||
| 4 | */ | ||
| 5 | enum { | ||
| 6 | _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, | ||
| 7 | _IRQ_PER_CPU = IRQ_PER_CPU, | ||
| 8 | _IRQ_LEVEL = IRQ_LEVEL, | ||
| 9 | _IRQ_NOPROBE = IRQ_NOPROBE, | ||
| 10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | ||
| 11 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | ||
| 12 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | ||
| 13 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | ||
| 14 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | ||
| 15 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | ||
| 16 | }; | ||
| 17 | |||
| 18 | #define IRQ_PER_CPU GOT_YOU_MORON | ||
| 19 | #define IRQ_NO_BALANCING GOT_YOU_MORON | ||
| 20 | #define IRQ_LEVEL GOT_YOU_MORON | ||
| 21 | #define IRQ_NOPROBE GOT_YOU_MORON | ||
| 22 | #define IRQ_NOREQUEST GOT_YOU_MORON | ||
| 23 | #define IRQ_NOAUTOEN GOT_YOU_MORON | ||
| 24 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | ||
| 25 | #undef IRQF_MODIFY_MASK | ||
| 26 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | ||
| 27 | |||
| 28 | static inline void | ||
| 29 | irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) | ||
| 30 | { | ||
| 31 | desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); | ||
| 32 | desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); | ||
| 33 | } | ||
| 34 | |||
| 35 | static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) | ||
| 36 | { | ||
| 37 | return desc->status_use_accessors & _IRQ_PER_CPU; | ||
| 38 | } | ||
| 39 | |||
| 40 | static inline void irq_settings_set_per_cpu(struct irq_desc *desc) | ||
| 41 | { | ||
| 42 | desc->status_use_accessors |= _IRQ_PER_CPU; | ||
| 43 | } | ||
| 44 | |||
| 45 | static inline void irq_settings_set_no_balancing(struct irq_desc *desc) | ||
| 46 | { | ||
| 47 | desc->status_use_accessors |= _IRQ_NO_BALANCING; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) | ||
| 51 | { | ||
| 52 | return desc->status_use_accessors & _IRQ_NO_BALANCING; | ||
| 53 | } | ||
| 54 | |||
| 55 | static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) | ||
| 56 | { | ||
| 57 | return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; | ||
| 58 | } | ||
| 59 | |||
| 60 | static inline void | ||
| 61 | irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) | ||
| 62 | { | ||
| 63 | desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; | ||
| 64 | desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline bool irq_settings_is_level(struct irq_desc *desc) | ||
| 68 | { | ||
| 69 | return desc->status_use_accessors & _IRQ_LEVEL; | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline void irq_settings_clr_level(struct irq_desc *desc) | ||
| 73 | { | ||
| 74 | desc->status_use_accessors &= ~_IRQ_LEVEL; | ||
| 75 | } | ||
| 76 | |||
| 77 | static inline void irq_settings_set_level(struct irq_desc *desc) | ||
| 78 | { | ||
| 79 | desc->status_use_accessors |= _IRQ_LEVEL; | ||
| 80 | } | ||
| 81 | |||
| 82 | static inline bool irq_settings_can_request(struct irq_desc *desc) | ||
| 83 | { | ||
| 84 | return !(desc->status_use_accessors & _IRQ_NOREQUEST); | ||
| 85 | } | ||
| 86 | |||
| 87 | static inline void irq_settings_clr_norequest(struct irq_desc *desc) | ||
| 88 | { | ||
| 89 | desc->status_use_accessors &= ~_IRQ_NOREQUEST; | ||
| 90 | } | ||
| 91 | |||
| 92 | static inline void irq_settings_set_norequest(struct irq_desc *desc) | ||
| 93 | { | ||
| 94 | desc->status_use_accessors |= _IRQ_NOREQUEST; | ||
| 95 | } | ||
| 96 | |||
| 97 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | ||
| 98 | { | ||
| 99 | return !(desc->status_use_accessors & _IRQ_NOPROBE); | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline void irq_settings_clr_noprobe(struct irq_desc *desc) | ||
| 103 | { | ||
| 104 | desc->status_use_accessors &= ~_IRQ_NOPROBE; | ||
| 105 | } | ||
| 106 | |||
| 107 | static inline void irq_settings_set_noprobe(struct irq_desc *desc) | ||
| 108 | { | ||
| 109 | desc->status_use_accessors |= _IRQ_NOPROBE; | ||
| 110 | } | ||
| 111 | |||
| 112 | static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) | ||
| 113 | { | ||
| 114 | return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; | ||
| 115 | } | ||
| 116 | |||
| 117 | static inline bool irq_settings_can_autoenable(struct irq_desc *desc) | ||
| 118 | { | ||
| 119 | return !(desc->status_use_accessors & _IRQ_NOAUTOEN); | ||
| 120 | } | ||
| 121 | |||
| 122 | static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) | ||
| 123 | { | ||
| 124 | return desc->status_use_accessors & _IRQ_NESTED_THREAD; | ||
| 125 | } | ||
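An illustration (not part of the patch) of what the GOT_YOU_MORON poisoning above buys inside kernel/irq/: any leftover direct test of the old status bits no longer compiles, so everything is forced through the accessors.

/* only meaningful in a file that includes kernel/irq/settings.h */
static bool my_is_per_cpu(struct irq_desc *desc)
{
	/* desc->status_use_accessors & IRQ_PER_CPU   <-- now a build error */
	return irq_settings_is_per_cpu(desc);	     /* the accessor instead */
}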
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..dfbd550401b2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -21,70 +21,93 @@ static int irqfixup __read_mostly; | |||
| 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) | 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) |
| 22 | static void poll_spurious_irqs(unsigned long dummy); | 22 | static void poll_spurious_irqs(unsigned long dummy); |
| 23 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); | 23 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); |
| 24 | static int irq_poll_cpu; | ||
| 25 | static atomic_t irq_poll_active; | ||
| 26 | |||
| 27 | /* | ||
| 28 | * We wait here for a poller to finish. | ||
| 29 | * | ||
| 30 | * If the poll runs on this CPU, then we yell loudly and return | ||
| 31 | * false. That will leave the interrupt line disabled in the worst | ||
| 32 | * case, but it should never happen. | ||
| 33 | * | ||
| 34 | * We wait until the poller is done and then recheck disabled and | ||
| 35 | * action (about to be disabled). Only if it's still active, we return | ||
| 36 | * true and let the handler run. | ||
| 37 | */ | ||
| 38 | bool irq_wait_for_poll(struct irq_desc *desc) | ||
| 39 | { | ||
| 40 | if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), | ||
| 41 | "irq poll in progress on cpu %d for irq %d\n", | ||
| 42 | smp_processor_id(), desc->irq_data.irq)) | ||
| 43 | return false; | ||
| 44 | |||
| 45 | #ifdef CONFIG_SMP | ||
| 46 | do { | ||
| 47 | raw_spin_unlock(&desc->lock); | ||
| 48 | while (irqd_irq_inprogress(&desc->irq_data)) | ||
| 49 | cpu_relax(); | ||
| 50 | raw_spin_lock(&desc->lock); | ||
| 51 | } while (irqd_irq_inprogress(&desc->irq_data)); | ||
| 52 | /* Might have been disabled in meantime */ | ||
| 53 | return !irqd_irq_disabled(&desc->irq_data) && desc->action; | ||
| 54 | #else | ||
| 55 | return false; | ||
| 56 | #endif | ||
| 57 | } | ||
| 58 | |||
| 24 | 59 | ||
| 25 | /* | 60 | /* |
| 26 | * Recovery handler for misrouted interrupts. | 61 | * Recovery handler for misrouted interrupts. |
| 27 | */ | 62 | */ |
| 28 | static int try_one_irq(int irq, struct irq_desc *desc) | 63 | static int try_one_irq(int irq, struct irq_desc *desc, bool force) |
| 29 | { | 64 | { |
| 65 | irqreturn_t ret = IRQ_NONE; | ||
| 30 | struct irqaction *action; | 66 | struct irqaction *action; |
| 31 | int ok = 0, work = 0; | ||
| 32 | 67 | ||
| 33 | raw_spin_lock(&desc->lock); | 68 | raw_spin_lock(&desc->lock); |
| 34 | /* Already running on another processor */ | ||
| 35 | if (desc->status & IRQ_INPROGRESS) { | ||
| 36 | /* | ||
| 37 | * Already running: If it is shared get the other | ||
| 38 | * CPU to go looking for our mystery interrupt too | ||
| 39 | */ | ||
| 40 | if (desc->action && (desc->action->flags & IRQF_SHARED)) | ||
| 41 | desc->status |= IRQ_PENDING; | ||
| 42 | raw_spin_unlock(&desc->lock); | ||
| 43 | return ok; | ||
| 44 | } | ||
| 45 | /* Honour the normal IRQ locking */ | ||
| 46 | desc->status |= IRQ_INPROGRESS; | ||
| 47 | action = desc->action; | ||
| 48 | raw_spin_unlock(&desc->lock); | ||
| 49 | 69 | ||
| 50 | while (action) { | 70 | /* PER_CPU and nested thread interrupts are never polled */ |
| 51 | /* Only shared IRQ handlers are safe to call */ | 71 | if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) |
| 52 | if (action->flags & IRQF_SHARED) { | 72 | goto out; |
| 53 | if (action->handler(irq, action->dev_id) == | ||
| 54 | IRQ_HANDLED) | ||
| 55 | ok = 1; | ||
| 56 | } | ||
| 57 | action = action->next; | ||
| 58 | } | ||
| 59 | local_irq_disable(); | ||
| 60 | /* Now clean up the flags */ | ||
| 61 | raw_spin_lock(&desc->lock); | ||
| 62 | action = desc->action; | ||
| 63 | 73 | ||
| 64 | /* | 74 | /* |
| 65 | * While we were looking for a fixup someone queued a real | 75 | * Do not poll disabled interrupts unless the spurious |
| 66 | * IRQ clashing with our walk: | 76 | * disabled poller asks explicitely. |
| 67 | */ | 77 | */ |
| 68 | while ((desc->status & IRQ_PENDING) && action) { | 78 | if (irqd_irq_disabled(&desc->irq_data) && !force) |
| 79 | goto out; | ||
| 80 | |||
| 81 | /* | ||
| 82 | * All handlers must agree on IRQF_SHARED, so we test just the | ||
| 83 | * first. Check for action->next as well. | ||
| 84 | */ | ||
| 85 | action = desc->action; | ||
| 86 | if (!action || !(action->flags & IRQF_SHARED) || | ||
| 87 | (action->flags & __IRQF_TIMER) || !action->next) | ||
| 88 | goto out; | ||
| 89 | |||
| 90 | /* Already running on another processor */ | ||
| 91 | if (irqd_irq_inprogress(&desc->irq_data)) { | ||
| 69 | /* | 92 | /* |
| 70 | * Perform real IRQ processing for the IRQ we deferred | 93 | * Already running: If it is shared get the other |
| 94 | * CPU to go looking for our mystery interrupt too | ||
| 71 | */ | 95 | */ |
| 72 | work = 1; | 96 | desc->istate |= IRQS_PENDING; |
| 73 | raw_spin_unlock(&desc->lock); | 97 | goto out; |
| 74 | handle_IRQ_event(irq, action); | ||
| 75 | raw_spin_lock(&desc->lock); | ||
| 76 | desc->status &= ~IRQ_PENDING; | ||
| 77 | } | 98 | } |
| 78 | desc->status &= ~IRQ_INPROGRESS; | ||
| 79 | /* | ||
| 80 | * If we did actual work for the real IRQ line we must let the | ||
| 81 | * IRQ controller clean up too | ||
| 82 | */ | ||
| 83 | if (work) | ||
| 84 | irq_end(irq, desc); | ||
| 85 | raw_spin_unlock(&desc->lock); | ||
| 86 | 99 | ||
| 87 | return ok; | 100 | /* Mark it poll in progress */ |
| 101 | desc->istate |= IRQS_POLL_INPROGRESS; | ||
| 102 | do { | ||
| 103 | if (handle_irq_event(desc) == IRQ_HANDLED) | ||
| 104 | ret = IRQ_HANDLED; | ||
| 105 | action = desc->action; | ||
| 106 | } while ((desc->istate & IRQS_PENDING) && action); | ||
| 107 | desc->istate &= ~IRQS_POLL_INPROGRESS; | ||
| 108 | out: | ||
| 109 | raw_spin_unlock(&desc->lock); | ||
| 110 | return ret == IRQ_HANDLED; | ||
| 88 | } | 111 | } |
| 89 | 112 | ||
| 90 | static int misrouted_irq(int irq) | 113 | static int misrouted_irq(int irq) |
| @@ -92,6 +115,11 @@ static int misrouted_irq(int irq) | |||
| 92 | struct irq_desc *desc; | 115 | struct irq_desc *desc; |
| 93 | int i, ok = 0; | 116 | int i, ok = 0; |
| 94 | 117 | ||
| 118 | if (atomic_inc_return(&irq_poll_active) == 1) | ||
| 119 | goto out; | ||
| 120 | |||
| 121 | irq_poll_cpu = smp_processor_id(); | ||
| 122 | |||
| 95 | for_each_irq_desc(i, desc) { | 123 | for_each_irq_desc(i, desc) { |
| 96 | if (!i) | 124 | if (!i) |
| 97 | continue; | 125 | continue; |
| @@ -99,9 +127,11 @@ static int misrouted_irq(int irq) | |||
| 99 | if (i == irq) /* Already tried */ | 127 | if (i == irq) /* Already tried */ |
| 100 | continue; | 128 | continue; |
| 101 | 129 | ||
| 102 | if (try_one_irq(i, desc)) | 130 | if (try_one_irq(i, desc, false)) |
| 103 | ok = 1; | 131 | ok = 1; |
| 104 | } | 132 | } |
| 133 | out: | ||
| 134 | atomic_dec(&irq_poll_active); | ||
| 105 | /* So the caller can adjust the irq error counts */ | 135 | /* So the caller can adjust the irq error counts */ |
| 106 | return ok; | 136 | return ok; |
| 107 | } | 137 | } |
| @@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
| 111 | struct irq_desc *desc; | 141 | struct irq_desc *desc; |
| 112 | int i; | 142 | int i; |
| 113 | 143 | ||
| 144 | if (atomic_inc_return(&irq_poll_active) != 1) | ||
| 145 | goto out; | ||
| 146 | irq_poll_cpu = smp_processor_id(); | ||
| 147 | |||
| 114 | for_each_irq_desc(i, desc) { | 148 | for_each_irq_desc(i, desc) { |
| 115 | unsigned int status; | 149 | unsigned int state; |
| 116 | 150 | ||
| 117 | if (!i) | 151 | if (!i) |
| 118 | continue; | 152 | continue; |
| 119 | 153 | ||
| 120 | /* Racy but it doesn't matter */ | 154 | /* Racy but it doesn't matter */ |
| 121 | status = desc->status; | 155 | state = desc->istate; |
| 122 | barrier(); | 156 | barrier(); |
| 123 | if (!(status & IRQ_SPURIOUS_DISABLED)) | 157 | if (!(state & IRQS_SPURIOUS_DISABLED)) |
| 124 | continue; | 158 | continue; |
| 125 | 159 | ||
| 126 | local_irq_disable(); | 160 | local_irq_disable(); |
| 127 | try_one_irq(i, desc); | 161 | try_one_irq(i, desc, true); |
| 128 | local_irq_enable(); | 162 | local_irq_enable(); |
| 129 | } | 163 | } |
| 130 | 164 | out: | |
| 165 | atomic_dec(&irq_poll_active); | ||
| 131 | mod_timer(&poll_spurious_irq_timer, | 166 | mod_timer(&poll_spurious_irq_timer, |
| 132 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
| 133 | } | 168 | } |
| @@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
| 139 | * | 174 | * |
| 140 | * (The other 100-of-100,000 interrupts may have been a correctly | 175 | * (The other 100-of-100,000 interrupts may have been a correctly |
| 141 | * functioning device sharing an IRQ with the failing one) | 176 | * functioning device sharing an IRQ with the failing one) |
| 142 | * | ||
| 143 | * Called under desc->lock | ||
| 144 | */ | 177 | */ |
| 145 | |||
| 146 | static void | 178 | static void |
| 147 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, | 179 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
| 148 | irqreturn_t action_ret) | 180 | irqreturn_t action_ret) |
| 149 | { | 181 | { |
| 150 | struct irqaction *action; | 182 | struct irqaction *action; |
| 183 | unsigned long flags; | ||
| 151 | 184 | ||
| 152 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | 185 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { |
| 153 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 186 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
| @@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
| 159 | dump_stack(); | 192 | dump_stack(); |
| 160 | printk(KERN_ERR "handlers:\n"); | 193 | printk(KERN_ERR "handlers:\n"); |
| 161 | 194 | ||
| 195 | /* | ||
| 196 | * We need to take desc->lock here. note_interrupt() is called | ||
| 197 | * w/o desc->lock held, but IRQ_PROGRESS set. We might race | ||
| 198 | * with something else removing an action. It's ok to take | ||
| 199 | * desc->lock here. See synchronize_irq(). | ||
| 200 | */ | ||
| 201 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 162 | action = desc->action; | 202 | action = desc->action; |
| 163 | while (action) { | 203 | while (action) { |
| 164 | printk(KERN_ERR "[<%p>]", action->handler); | 204 | printk(KERN_ERR "[<%p>]", action->handler); |
| @@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
| 167 | printk("\n"); | 207 | printk("\n"); |
| 168 | action = action->next; | 208 | action = action->next; |
| 169 | } | 209 | } |
| 210 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 170 | } | 211 | } |
| 171 | 212 | ||
| 172 | static void | 213 | static void |
| @@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
| 218 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 259 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
| 219 | irqreturn_t action_ret) | 260 | irqreturn_t action_ret) |
| 220 | { | 261 | { |
| 262 | if (desc->istate & IRQS_POLL_INPROGRESS) | ||
| 263 | return; | ||
| 264 | |||
| 221 | if (unlikely(action_ret != IRQ_HANDLED)) { | 265 | if (unlikely(action_ret != IRQ_HANDLED)) { |
| 222 | /* | 266 | /* |
| 223 | * If we are seeing only the odd spurious IRQ caused by | 267 | * If we are seeing only the odd spurious IRQ caused by |
| @@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 254 | * Now kill the IRQ | 298 | * Now kill the IRQ |
| 255 | */ | 299 | */ |
| 256 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 300 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
| 257 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; | 301 | desc->istate |= IRQS_SPURIOUS_DISABLED; |
| 258 | desc->depth++; | 302 | desc->depth++; |
| 259 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 303 | irq_disable(desc); |
| 260 | 304 | ||
| 261 | mod_timer(&poll_spurious_irq_timer, | 305 | mod_timer(&poll_spurious_irq_timer, |
| 262 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 306 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..079f1d39a8b8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) | |||
| 64 | if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || | 64 | if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || |
| 65 | arch_is_kernel_text(addr)) | 65 | arch_is_kernel_text(addr)) |
| 66 | return 1; | 66 | return 1; |
| 67 | return in_gate_area_no_task(addr); | 67 | return in_gate_area_no_mm(addr); |
| 68 | } | 68 | } |
| 69 | 69 | ||
| 70 | static inline int is_kernel(unsigned long addr) | 70 | static inline int is_kernel(unsigned long addr) |
| 71 | { | 71 | { |
| 72 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) | 72 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) |
| 73 | return 1; | 73 | return 1; |
| 74 | return in_gate_area_no_task(addr); | 74 | return in_gate_area_no_mm(addr); |
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | static int is_ksym_addr(unsigned long addr) | 77 | static int is_ksym_addr(unsigned long addr) |
| @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
| 342 | } | 342 | } |
| 343 | 343 | ||
| 344 | /* Look up a kernel symbol and return it in a text buffer. */ | 344 | /* Look up a kernel symbol and return it in a text buffer. */ |
| 345 | int sprint_symbol(char *buffer, unsigned long address) | 345 | static int __sprint_symbol(char *buffer, unsigned long address, |
| 346 | int symbol_offset) | ||
| 346 | { | 347 | { |
| 347 | char *modname; | 348 | char *modname; |
| 348 | const char *name; | 349 | const char *name; |
| 349 | unsigned long offset, size; | 350 | unsigned long offset, size; |
| 350 | int len; | 351 | int len; |
| 351 | 352 | ||
| 353 | address += symbol_offset; | ||
| 352 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); | 354 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); |
| 353 | if (!name) | 355 | if (!name) |
| 354 | return sprintf(buffer, "0x%lx", address); | 356 | return sprintf(buffer, "0x%lx", address); |
| @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) | |||
| 357 | strcpy(buffer, name); | 359 | strcpy(buffer, name); |
| 358 | len = strlen(buffer); | 360 | len = strlen(buffer); |
| 359 | buffer += len; | 361 | buffer += len; |
| 362 | offset -= symbol_offset; | ||
| 360 | 363 | ||
| 361 | if (modname) | 364 | if (modname) |
| 362 | len += sprintf(buffer, "+%#lx/%#lx [%s]", | 365 | len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); |
| 363 | offset, size, modname); | ||
| 364 | else | 366 | else |
| 365 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); | 367 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); |
| 366 | 368 | ||
| 367 | return len; | 369 | return len; |
| 368 | } | 370 | } |
| 371 | |||
| 372 | /** | ||
| 373 | * sprint_symbol - Look up a kernel symbol and return it in a text buffer | ||
| 374 | * @buffer: buffer to be stored | ||
| 375 | * @address: address to lookup | ||
| 376 | * | ||
| 377 | * This function looks up a kernel symbol with @address and stores its name, | ||
| 378 | * offset, size and module name to @buffer if possible. If no symbol was found, | ||
| 379 | * just saves its @address as is. | ||
| 380 | * | ||
| 381 | * This function returns the number of bytes stored in @buffer. | ||
| 382 | */ | ||
| 383 | int sprint_symbol(char *buffer, unsigned long address) | ||
| 384 | { | ||
| 385 | return __sprint_symbol(buffer, address, 0); | ||
| 386 | } | ||
| 387 | |||
| 369 | EXPORT_SYMBOL_GPL(sprint_symbol); | 388 | EXPORT_SYMBOL_GPL(sprint_symbol); |
| 370 | 389 | ||
| 390 | /** | ||
| 391 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer | ||
| 392 | * @buffer: buffer to be stored | ||
| 393 | * @address: address to lookup | ||
| 394 | * | ||
| 395 | * This function is for stack backtrace and does the same thing as | ||
| 396 | * sprint_symbol() but with modified/decreased @address. If there is a | ||
| 397 | * tail-call to the function marked "noreturn", gcc optimized out code after | ||
| 398 | * the call so that the stack-saved return address could point outside of the | ||
| 399 | * caller. This function ensures that kallsyms will find the original caller | ||
| 400 | * by decreasing @address. | ||
| 401 | * | ||
| 402 | * This function returns the number of bytes stored in @buffer. | ||
| 403 | */ | ||
| 404 | int sprint_backtrace(char *buffer, unsigned long address) | ||
| 405 | { | ||
| 406 | return __sprint_symbol(buffer, address, -1); | ||
| 407 | } | ||
| 408 | |||
| 371 | /* Look up a kernel symbol and print it to the kernel messages. */ | 409 | /* Look up a kernel symbol and print it to the kernel messages. */ |
| 372 | void __print_symbol(const char *fmt, unsigned long address) | 410 | void __print_symbol(const char *fmt, unsigned long address) |
| 373 | { | 411 | { |
| @@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p) | |||
| 477 | */ | 515 | */ |
| 478 | type = iter->exported ? toupper(iter->type) : | 516 | type = iter->exported ? toupper(iter->type) : |
| 479 | tolower(iter->type); | 517 | tolower(iter->type); |
| 480 | seq_printf(m, "%0*lx %c %s\t[%s]\n", | 518 | seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, |
| 481 | (int)(2 * sizeof(void *)), | 519 | type, iter->name, iter->module_name); |
| 482 | iter->value, type, iter->name, iter->module_name); | ||
| 483 | } else | 520 | } else |
| 484 | seq_printf(m, "%0*lx %c %s\n", | 521 | seq_printf(m, "%pK %c %s\n", (void *)iter->value, |
| 485 | (int)(2 * sizeof(void *)), | 522 | iter->type, iter->name); |
| 486 | iter->value, iter->type, iter->name); | ||
| 487 | return 0; | 523 | return 0; |
| 488 | } | 524 | } |
| 489 | 525 | ||
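The new sprint_backtrace() differs from sprint_symbol() only in biasing the lookup address by -1, so a saved return address that follows a tail call into a noreturn function still resolves to its real caller. A sketch of the kind of built-in stack-trace printer it is meant for; the function and its flag are illustrative, not taken from the patch:

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void print_trace_entry(unsigned long addr, bool is_return_addr)
{
	char sym[KSYM_SYMBOL_LEN];

	if (is_return_addr)
		sprint_backtrace(sym, addr);	/* looks up addr - 1 internally */
	else
		sprint_symbol(sym, addr);
	printk(KERN_INFO " [<%p>] %s\n", (void *)addr, sym);
}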
diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..87b77de03dd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
| 34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 35 | #include <linux/kmsg_dump.h> | 35 | #include <linux/kmsg_dump.h> |
| 36 | #include <linux/syscore_ops.h> | ||
| 36 | 37 | ||
| 37 | #include <asm/page.h> | 38 | #include <asm/page.h> |
| 38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
| @@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 144 | /* Initialize the list of destination pages */ | 145 | /* Initialize the list of destination pages */ |
| 145 | INIT_LIST_HEAD(&image->dest_pages); | 146 | INIT_LIST_HEAD(&image->dest_pages); |
| 146 | 147 | ||
| 147 | /* Initialize the list of unuseable pages */ | 148 | /* Initialize the list of unusable pages */ |
| 148 | INIT_LIST_HEAD(&image->unuseable_pages); | 149 | INIT_LIST_HEAD(&image->unuseable_pages); |
| 149 | 150 | ||
| 150 | /* Read in the segments */ | 151 | /* Read in the segments */ |
| @@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | |||
| 454 | /* Deal with the destination pages I have inadvertently allocated. | 455 | /* Deal with the destination pages I have inadvertently allocated. |
| 455 | * | 456 | * |
| 456 | * Ideally I would convert multi-page allocations into single | 457 | * Ideally I would convert multi-page allocations into single |
| 457 | * page allocations, and add everyting to image->dest_pages. | 458 | * page allocations, and add everything to image->dest_pages. |
| 458 | * | 459 | * |
| 459 | * For now it is simpler to just free the pages. | 460 | * For now it is simpler to just free the pages. |
| 460 | */ | 461 | */ |
| @@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) | |||
| 602 | /* Walk through and free any extra destination pages I may have */ | 603 | /* Walk through and free any extra destination pages I may have */ |
| 603 | kimage_free_page_list(&image->dest_pages); | 604 | kimage_free_page_list(&image->dest_pages); |
| 604 | 605 | ||
| 605 | /* Walk through and free any unuseable pages I have cached */ | 606 | /* Walk through and free any unusable pages I have cached */ |
| 606 | kimage_free_page_list(&image->unuseable_pages); | 607 | kimage_free_page_list(&image->unuseable_pages); |
| 607 | 608 | ||
| 608 | } | 609 | } |
| @@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void) | |||
| 1099 | return size; | 1100 | return size; |
| 1100 | } | 1101 | } |
| 1101 | 1102 | ||
| 1102 | static void free_reserved_phys_range(unsigned long begin, unsigned long end) | 1103 | void __weak crash_free_reserved_phys_range(unsigned long begin, |
| 1104 | unsigned long end) | ||
| 1103 | { | 1105 | { |
| 1104 | unsigned long addr; | 1106 | unsigned long addr; |
| 1105 | 1107 | ||
| @@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) | |||
| 1135 | start = roundup(start, PAGE_SIZE); | 1137 | start = roundup(start, PAGE_SIZE); |
| 1136 | end = roundup(start + new_size, PAGE_SIZE); | 1138 | end = roundup(start + new_size, PAGE_SIZE); |
| 1137 | 1139 | ||
| 1138 | free_reserved_phys_range(end, crashk_res.end); | 1140 | crash_free_reserved_phys_range(end, crashk_res.end); |
| 1139 | 1141 | ||
| 1140 | if ((start == end) && (crashk_res.parent != NULL)) | 1142 | if ((start == end) && (crashk_res.parent != NULL)) |
| 1141 | release_resource(&crashk_res); | 1143 | release_resource(&crashk_res); |
| @@ -1531,6 +1533,11 @@ int kernel_kexec(void) | |||
| 1531 | local_irq_disable(); | 1533 | local_irq_disable(); |
| 1532 | /* Suspend system devices */ | 1534 | /* Suspend system devices */ |
| 1533 | error = sysdev_suspend(PMSG_FREEZE); | 1535 | error = sysdev_suspend(PMSG_FREEZE); |
| 1536 | if (!error) { | ||
| 1537 | error = syscore_suspend(); | ||
| 1538 | if (error) | ||
| 1539 | sysdev_resume(); | ||
| 1540 | } | ||
| 1534 | if (error) | 1541 | if (error) |
| 1535 | goto Enable_irqs; | 1542 | goto Enable_irqs; |
| 1536 | } else | 1543 | } else |
| @@ -1545,6 +1552,7 @@ int kernel_kexec(void) | |||
| 1545 | 1552 | ||
| 1546 | #ifdef CONFIG_KEXEC_JUMP | 1553 | #ifdef CONFIG_KEXEC_JUMP |
| 1547 | if (kexec_image->preserve_context) { | 1554 | if (kexec_image->preserve_context) { |
| 1555 | syscore_resume(); | ||
| 1548 | sysdev_resume(); | 1556 | sysdev_resume(); |
| 1549 | Enable_irqs: | 1557 | Enable_irqs: |
| 1550 | local_irq_enable(); | 1558 | local_irq_enable(); |
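The kexec.c hunks above do two things: crash_free_reserved_phys_range() becomes a __weak default that an architecture can override with its own strong symbol, and kernel_kexec() now calls syscore_suspend() only after sysdev_suspend() succeeds, undoing the sysdev step if the syscore step fails. A minimal sketch of that error-handling shape, assuming the 2.6.38-era <linux/sysdev.h> and <linux/syscore_ops.h> interfaces; the helper name is made up and this is not the real kernel_kexec() body:

#include <linux/pm.h>
#include <linux/sysdev.h>
#include <linux/syscore_ops.h>

static int example_suspend_devices(void)	/* hypothetical helper */
{
	int error;

	error = sysdev_suspend(PMSG_FREEZE);
	if (error)
		return error;

	error = syscore_suspend();
	if (error)
		sysdev_resume();	/* roll the sysdev step back on failure */

	return error;
}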
diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a3..3b34d2732bce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -27,6 +27,7 @@ struct kthread_create_info | |||
| 27 | /* Information passed to kthread() from kthreadd. */ | 27 | /* Information passed to kthread() from kthreadd. */ |
| 28 | int (*threadfn)(void *data); | 28 | int (*threadfn)(void *data); |
| 29 | void *data; | 29 | void *data; |
| 30 | int node; | ||
| 30 | 31 | ||
| 31 | /* Result passed back to kthread_create() from kthreadd. */ | 32 | /* Result passed back to kthread_create() from kthreadd. */ |
| 32 | struct task_struct *result; | 33 | struct task_struct *result; |
| @@ -98,10 +99,23 @@ static int kthread(void *_create) | |||
| 98 | do_exit(ret); | 99 | do_exit(ret); |
| 99 | } | 100 | } |
| 100 | 101 | ||
| 102 | /* called from do_fork() to get node information for the task about to be created */ | ||
| 103 | int tsk_fork_get_node(struct task_struct *tsk) | ||
| 104 | { | ||
| 105 | #ifdef CONFIG_NUMA | ||
| 106 | if (tsk == kthreadd_task) | ||
| 107 | return tsk->pref_node_fork; | ||
| 108 | #endif | ||
| 109 | return numa_node_id(); | ||
| 110 | } | ||
| 111 | |||
| 101 | static void create_kthread(struct kthread_create_info *create) | 112 | static void create_kthread(struct kthread_create_info *create) |
| 102 | { | 113 | { |
| 103 | int pid; | 114 | int pid; |
| 104 | 115 | ||
| 116 | #ifdef CONFIG_NUMA | ||
| 117 | current->pref_node_fork = create->node; | ||
| 118 | #endif | ||
| 105 | /* We want our own signal handler (we take no signals by default). */ | 119 | /* We want our own signal handler (we take no signals by default). */ |
| 106 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); | 120 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); |
| 107 | if (pid < 0) { | 121 | if (pid < 0) { |
| @@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 111 | } | 125 | } |
| 112 | 126 | ||
| 113 | /** | 127 | /** |
| 114 | * kthread_create - create a kthread. | 128 | * kthread_create_on_node - create a kthread. |
| 115 | * @threadfn: the function to run until signal_pending(current). | 129 | * @threadfn: the function to run until signal_pending(current). |
| 116 | * @data: data ptr for @threadfn. | 130 | * @data: data ptr for @threadfn. |
| 131 | * @node: memory node number. | ||
| 117 | * @namefmt: printf-style name for the thread. | 132 | * @namefmt: printf-style name for the thread. |
| 118 | * | 133 | * |
| 119 | * Description: This helper function creates and names a kernel | 134 | * Description: This helper function creates and names a kernel |
| 120 | * thread. The thread will be stopped: use wake_up_process() to start | 135 | * thread. The thread will be stopped: use wake_up_process() to start |
| 121 | * it. See also kthread_run(). | 136 | * it. See also kthread_run(). |
| 122 | * | 137 | * |
| 138 | * If thread is going to be bound on a particular cpu, give its node | ||
| 139 | * in @node, to get NUMA affinity for kthread stack, or else give -1. | ||
| 123 | * When woken, the thread will run @threadfn() with @data as its | 140 | * When woken, the thread will run @threadfn() with @data as its |
| 124 | * argument. @threadfn() can either call do_exit() directly if it is a | 141 | * argument. @threadfn() can either call do_exit() directly if it is a |
| 125 | * standalone thread for which noone will call kthread_stop(), or | 142 | * standalone thread for which no one will call kthread_stop(), or |
| 126 | * return when 'kthread_should_stop()' is true (which means | 143 | * return when 'kthread_should_stop()' is true (which means |
| 127 | * kthread_stop() has been called). The return value should be zero | 144 | * kthread_stop() has been called). The return value should be zero |
| 128 | * or a negative error number; it will be passed to kthread_stop(). | 145 | * or a negative error number; it will be passed to kthread_stop(). |
| 129 | * | 146 | * |
| 130 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 147 | * Returns a task_struct or ERR_PTR(-ENOMEM). |
| 131 | */ | 148 | */ |
| 132 | struct task_struct *kthread_create(int (*threadfn)(void *data), | 149 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
| 133 | void *data, | 150 | void *data, |
| 134 | const char namefmt[], | 151 | int node, |
| 135 | ...) | 152 | const char namefmt[], |
| 153 | ...) | ||
| 136 | { | 154 | { |
| 137 | struct kthread_create_info create; | 155 | struct kthread_create_info create; |
| 138 | 156 | ||
| 139 | create.threadfn = threadfn; | 157 | create.threadfn = threadfn; |
| 140 | create.data = data; | 158 | create.data = data; |
| 159 | create.node = node; | ||
| 141 | init_completion(&create.done); | 160 | init_completion(&create.done); |
| 142 | 161 | ||
| 143 | spin_lock(&kthread_create_lock); | 162 | spin_lock(&kthread_create_lock); |
| @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 164 | } | 183 | } |
| 165 | return create.result; | 184 | return create.result; |
| 166 | } | 185 | } |
| 167 | EXPORT_SYMBOL(kthread_create); | 186 | EXPORT_SYMBOL(kthread_create_on_node); |
| 168 | 187 | ||
| 169 | /** | 188 | /** |
| 170 | * kthread_bind - bind a just-created kthread to a cpu. | 189 | * kthread_bind - bind a just-created kthread to a cpu. |
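A hedged usage sketch for the renamed interface: a caller that is about to bind its thread to a CPU passes that CPU's memory node so the kthread stack gets NUMA-local pages, while -1 keeps the old node-agnostic behaviour. The thread body, the "worker/%d" name and start_worker_on() are illustrative, not kernel API:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int my_thread_fn(void *data)		/* hypothetical thread body */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_worker_on(int cpu)
{
	struct task_struct *tsk;

	/* bound to @cpu below, so ask for a stack on that CPU's node */
	tsk = kthread_create_on_node(my_thread_fn, NULL,
				     cpu_to_node(cpu), "worker/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);
	wake_up_process(tsk);
	return tsk;
}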
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35e528d..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, | |||
| 153 | } | 153 | } |
| 154 | 154 | ||
| 155 | /** | 155 | /** |
| 156 | * __account_scheduler_latency - record an occured latency | 156 | * __account_scheduler_latency - record an occurred latency |
| 157 | * @tsk - the task struct of the task hitting the latency | 157 | * @tsk - the task struct of the task hitting the latency |
| 158 | * @usecs - the duration of the latency in microseconds | 158 | * @usecs - the duration of the latency in microseconds |
| 159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible | 159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..53a68956f131 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
| 2309 | if (unlikely(curr->hardirqs_enabled)) { | 2309 | if (unlikely(curr->hardirqs_enabled)) { |
| 2310 | /* | 2310 | /* |
| 2311 | * Neither irq nor preemption are disabled here | 2311 | * Neither irq nor preemption are disabled here |
| 2312 | * so this is racy by nature but loosing one hit | 2312 | * so this is racy by nature but losing one hit |
| 2313 | * in a stat is not a big deal. | 2313 | * in a stat is not a big deal. |
| 2314 | */ | 2314 | */ |
| 2315 | __debug_atomic_inc(redundant_hardirqs_on); | 2315 | __debug_atomic_inc(redundant_hardirqs_on); |
| @@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 2620 | if (!graph_lock()) | 2620 | if (!graph_lock()) |
| 2621 | return 0; | 2621 | return 0; |
| 2622 | /* | 2622 | /* |
| 2623 | * Make sure we didnt race: | 2623 | * Make sure we didn't race: |
| 2624 | */ | 2624 | */ |
| 2625 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { | 2625 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { |
| 2626 | graph_unlock(); | 2626 | graph_unlock(); |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2fc4b36..71edd2f60c02 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
| @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
| 225 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, | 225 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, |
| 226 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, | 226 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, |
| 227 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, | 227 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, |
| 228 | sum_forward_deps = 0, factor = 0; | 228 | sum_forward_deps = 0; |
| 229 | 229 | ||
| 230 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | 230 | list_for_each_entry(class, &all_lock_classes, lock_entry) { |
| 231 | 231 | ||
| @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
| 283 | nr_hardirq_unsafe * nr_hardirq_safe + | 283 | nr_hardirq_unsafe * nr_hardirq_safe + |
| 284 | nr_list_entries); | 284 | nr_list_entries); |
| 285 | 285 | ||
| 286 | /* | ||
| 287 | * Estimated factor between direct and indirect | ||
| 288 | * dependencies: | ||
| 289 | */ | ||
| 290 | if (nr_list_entries) | ||
| 291 | factor = sum_forward_deps / nr_list_entries; | ||
| 292 | |||
| 293 | #ifdef CONFIG_PROVE_LOCKING | 286 | #ifdef CONFIG_PROVE_LOCKING |
| 294 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | 287 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", |
| 295 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | 288 | nr_lock_chains, MAX_LOCKDEP_CHAINS); |
diff --git a/kernel/module.c b/kernel/module.c index efa290ea94bf..d5938a5c19c4 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
| 809 | wait_for_zero_refcount(mod); | 809 | wait_for_zero_refcount(mod); |
| 810 | 810 | ||
| 811 | mutex_unlock(&module_mutex); | 811 | mutex_unlock(&module_mutex); |
| 812 | /* Final destruction now noone is using it. */ | 812 | /* Final destruction now no one is using it. */ |
| 813 | if (mod->exit != NULL) | 813 | if (mod->exit != NULL) |
| 814 | mod->exit(); | 814 | mod->exit(); |
| 815 | blocking_notifier_call_chain(&module_notify_list, | 815 | blocking_notifier_call_chain(&module_notify_list, |
| @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, | |||
| 1168 | { | 1168 | { |
| 1169 | struct module_sect_attr *sattr = | 1169 | struct module_sect_attr *sattr = |
| 1170 | container_of(mattr, struct module_sect_attr, mattr); | 1170 | container_of(mattr, struct module_sect_attr, mattr); |
| 1171 | return sprintf(buf, "0x%lx\n", sattr->address); | 1171 | return sprintf(buf, "0x%pK\n", (void *)sattr->address); |
| 1172 | } | 1172 | } |
| 1173 | 1173 | ||
| 1174 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | 1174 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) |
| @@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod, | |||
| 2777 | mod->state = MODULE_STATE_COMING; | 2777 | mod->state = MODULE_STATE_COMING; |
| 2778 | 2778 | ||
| 2779 | /* Now sew it into the lists so we can get lockdep and oops | 2779 | /* Now sew it into the lists so we can get lockdep and oops |
| 2780 | * info during argument parsing. Noone should access us, since | 2780 | * info during argument parsing. No one should access us, since |
| 2781 | * strong_try_module_get() will fail. | 2781 | * strong_try_module_get() will fail. |
| 2782 | * lockdep/oops can run asynchronous, so use the RCU list insertion | 2782 | * lockdep/oops can run asynchronous, so use the RCU list insertion |
| 2783 | * function to insert in a way safe to concurrent readers. | 2783 | * function to insert in a way safe to concurrent readers. |
| @@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod, | |||
| 2971 | else | 2971 | else |
| 2972 | nextval = (unsigned long)mod->module_core+mod->core_text_size; | 2972 | nextval = (unsigned long)mod->module_core+mod->core_text_size; |
| 2973 | 2973 | ||
| 2974 | /* Scan for closest preceeding symbol, and next symbol. (ELF | 2974 | /* Scan for closest preceding symbol, and next symbol. (ELF |
| 2975 | starts real symbols at 1). */ | 2975 | starts real symbols at 1). */ |
| 2976 | for (i = 1; i < mod->num_symtab; i++) { | 2976 | for (i = 1; i < mod->num_symtab; i++) { |
| 2977 | if (mod->symtab[i].st_shndx == SHN_UNDEF) | 2977 | if (mod->symtab[i].st_shndx == SHN_UNDEF) |
| @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3224 | mod->state == MODULE_STATE_COMING ? "Loading": | 3224 | mod->state == MODULE_STATE_COMING ? "Loading": |
| 3225 | "Live"); | 3225 | "Live"); |
| 3226 | /* Used by oprofile and other similar tools. */ | 3226 | /* Used by oprofile and other similar tools. */ |
| 3227 | seq_printf(m, " 0x%p", mod->module_core); | 3227 | seq_printf(m, " 0x%pK", mod->module_core); |
| 3228 | 3228 | ||
| 3229 | /* Taints info */ | 3229 | /* Taints info */ |
| 3230 | if (mod->taints) | 3230 | if (mod->taints) |
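Both %pK conversions above follow the kptr_restrict convention: the address prints normally for privileged readers and is censored (printed as zeros) otherwise, instead of always leaking the raw kernel pointer. A minimal sketch of the same pattern for some other attribute; the function and its caller are hypothetical:

#include <linux/kernel.h>

static ssize_t show_example_addr(char *buf, unsigned long addr)
{
	/* %pK output is filtered according to the kptr_restrict sysctl */
	return sprintf(buf, "0x%pK\n", (void *)addr);
}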
diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb28ecf..c4195fa98900 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 245 | } | 245 | } |
| 246 | __set_task_state(task, state); | 246 | __set_task_state(task, state); |
| 247 | 247 | ||
| 248 | /* didnt get the lock, go to sleep: */ | 248 | /* didn't get the lock, go to sleep: */ |
| 249 | spin_unlock_mutex(&lock->wait_lock, flags); | 249 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 250 | preempt_enable_no_resched(); | 250 | preempt_enable_no_resched(); |
| 251 | schedule(); | 251 | schedule(); |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26d..a05d191ffdd9 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
| 69 | goto out_ns; | 69 | goto out_ns; |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); | 72 | new_nsp->uts_ns = copy_utsname(flags, tsk); |
| 73 | if (IS_ERR(new_nsp->uts_ns)) { | 73 | if (IS_ERR(new_nsp->uts_ns)) { |
| 74 | err = PTR_ERR(new_nsp->uts_ns); | 74 | err = PTR_ERR(new_nsp->uts_ns); |
| 75 | goto out_uts; | 75 | goto out_uts; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); | 78 | new_nsp->ipc_ns = copy_ipcs(flags, tsk); |
| 79 | if (IS_ERR(new_nsp->ipc_ns)) { | 79 | if (IS_ERR(new_nsp->ipc_ns)) { |
| 80 | err = PTR_ERR(new_nsp->ipc_ns); | 80 | err = PTR_ERR(new_nsp->ipc_ns); |
| 81 | goto out_ipc; | 81 | goto out_ipc; |
diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
| 262 | /* | 262 | /* |
| 263 | * This cpu has to do the parallel processing of the next | 263 | * This cpu has to do the parallel processing of the next |
| 264 | * object. It's waiting in the cpu's parallelization queue, | 264 | * object. It's waiting in the cpu's parallelization queue, |
| 265 | * so exit imediately. | 265 | * so exit immediately. |
| 266 | */ | 266 | */ |
| 267 | if (PTR_ERR(padata) == -ENODATA) { | 267 | if (PTR_ERR(padata) == -ENODATA) { |
| 268 | del_timer(&pd->timer); | 268 | del_timer(&pd->timer); |
| @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
| 284 | /* | 284 | /* |
| 285 | * The next object that needs serialization might have arrived to | 285 | * The next object that needs serialization might have arrived to |
| 286 | * the reorder queues in the meantime, we will be called again | 286 | * the reorder queues in the meantime, we will be called again |
| 287 | * from the timer function if noone else cares for it. | 287 | * from the timer function if no one else cares for it. |
| 288 | */ | 288 | */ |
| 289 | if (atomic_read(&pd->reorder_objects) | 289 | if (atomic_read(&pd->reorder_objects) |
| 290 | && !(pinst->flags & PADATA_RESET)) | 290 | && !(pinst->flags & PADATA_RESET)) |
| @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) | |||
| 515 | put_online_cpus(); | 515 | put_online_cpus(); |
| 516 | } | 516 | } |
| 517 | 517 | ||
| 518 | /* Replace the internal control stucture with a new one. */ | 518 | /* Replace the internal control structure with a new one. */ |
| 519 | static void padata_replace(struct padata_instance *pinst, | 519 | static void padata_replace(struct padata_instance *pinst, |
| 520 | struct parallel_data *pd_new) | 520 | struct parallel_data *pd_new) |
| 521 | { | 521 | { |
| @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | |||
| 768 | } | 768 | } |
| 769 | 769 | ||
| 770 | /** | 770 | /** |
| 771 | * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) | 771 | * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) |
| 772 | * padata cpumasks. | 772 | * padata cpumasks. |
| 773 | * | 773 | * |
| 774 | * @pinst: padata instance | 774 | * @pinst: padata instance |
diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
| 433 | 433 | ||
| 434 | core_param(panic, panic_timeout, int, 0644); | 434 | core_param(panic, panic_timeout, int, 0644); |
| 435 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 435 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
| 436 | |||
| 437 | static int __init oops_setup(char *s) | ||
| 438 | { | ||
| 439 | if (!s) | ||
| 440 | return -EINVAL; | ||
| 441 | if (!strcmp(s, "panic")) | ||
| 442 | panic_on_oops = 1; | ||
| 443 | return 0; | ||
| 444 | } | ||
| 445 | early_param("oops", oops_setup); | ||
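As a usage note (exact bootloader syntax varies): with the early_param() handler above, booting with oops=panic on the kernel command line enables panic-on-oops during early parameter parsing, before any initcalls run. It is equivalent in effect to setting the existing kernel.panic_on_oops sysctl to 1 (writable later via /proc/sys/kernel/panic_on_oops); the boot parameter simply sets it earlier.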
diff --git a/kernel/params.c b/kernel/params.c index 0da1411222b9..7ab388a48a2e 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -95,7 +95,7 @@ static int parse_one(char *param, | |||
| 95 | /* Find parameter */ | 95 | /* Find parameter */ |
| 96 | for (i = 0; i < num_params; i++) { | 96 | for (i = 0; i < num_params; i++) { |
| 97 | if (parameq(param, params[i].name)) { | 97 | if (parameq(param, params[i].name)) { |
| 98 | /* Noone handled NULL, so do it here. */ | 98 | /* No one handled NULL, so do it here. */ |
| 99 | if (!val && params[i].ops->set != param_set_bool) | 99 | if (!val && params[i].ops->set != param_set_bool) |
| 100 | return -EINVAL; | 100 | return -EINVAL; |
| 101 | DEBUGP("They are equal! Calling %p\n", | 101 | DEBUGP("They are equal! Calling %p\n", |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 999835b6112b..8e81a9860a0d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -38,13 +38,96 @@ | |||
| 38 | 38 | ||
| 39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
| 40 | 40 | ||
| 41 | struct remote_function_call { | ||
| 42 | struct task_struct *p; | ||
| 43 | int (*func)(void *info); | ||
| 44 | void *info; | ||
| 45 | int ret; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static void remote_function(void *data) | ||
| 49 | { | ||
| 50 | struct remote_function_call *tfc = data; | ||
| 51 | struct task_struct *p = tfc->p; | ||
| 52 | |||
| 53 | if (p) { | ||
| 54 | tfc->ret = -EAGAIN; | ||
| 55 | if (task_cpu(p) != smp_processor_id() || !task_curr(p)) | ||
| 56 | return; | ||
| 57 | } | ||
| 58 | |||
| 59 | tfc->ret = tfc->func(tfc->info); | ||
| 60 | } | ||
| 61 | |||
| 62 | /** | ||
| 63 | * task_function_call - call a function on the cpu on which a task runs | ||
| 64 | * @p: the task to evaluate | ||
| 65 | * @func: the function to be called | ||
| 66 | * @info: the function call argument | ||
| 67 | * | ||
| 68 | * Calls the function @func when the task is currently running. This might | ||
| 69 | * be on the current CPU, which just calls the function directly | ||
| 70 | * | ||
| 71 | * returns: @func return value, or | ||
| 72 | * -ESRCH - when the process isn't running | ||
| 73 | * -EAGAIN - when the process moved away | ||
| 74 | */ | ||
| 75 | static int | ||
| 76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | ||
| 77 | { | ||
| 78 | struct remote_function_call data = { | ||
| 79 | .p = p, | ||
| 80 | .func = func, | ||
| 81 | .info = info, | ||
| 82 | .ret = -ESRCH, /* No such (running) process */ | ||
| 83 | }; | ||
| 84 | |||
| 85 | if (task_curr(p)) | ||
| 86 | smp_call_function_single(task_cpu(p), remote_function, &data, 1); | ||
| 87 | |||
| 88 | return data.ret; | ||
| 89 | } | ||
| 90 | |||
| 91 | /** | ||
| 92 | * cpu_function_call - call a function on the cpu | ||
| 93 | * @func: the function to be called | ||
| 94 | * @info: the function call argument | ||
| 95 | * | ||
| 96 | * Calls the function @func on the remote cpu. | ||
| 97 | * | ||
| 98 | * returns: @func return value or -ENXIO when the cpu is offline | ||
| 99 | */ | ||
| 100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | ||
| 101 | { | ||
| 102 | struct remote_function_call data = { | ||
| 103 | .p = NULL, | ||
| 104 | .func = func, | ||
| 105 | .info = info, | ||
| 106 | .ret = -ENXIO, /* No such CPU */ | ||
| 107 | }; | ||
| 108 | |||
| 109 | smp_call_function_single(cpu, remote_function, &data, 1); | ||
| 110 | |||
| 111 | return data.ret; | ||
| 112 | } | ||
| 113 | |||
| 114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
| 115 | PERF_FLAG_FD_OUTPUT |\ | ||
| 116 | PERF_FLAG_PID_CGROUP) | ||
| 117 | |||
| 41 | enum event_type_t { | 118 | enum event_type_t { |
| 42 | EVENT_FLEXIBLE = 0x1, | 119 | EVENT_FLEXIBLE = 0x1, |
| 43 | EVENT_PINNED = 0x2, | 120 | EVENT_PINNED = 0x2, |
| 44 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | 121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, |
| 45 | }; | 122 | }; |
| 46 | 123 | ||
| 47 | atomic_t perf_task_events __read_mostly; | 124 | /* |
| 125 | * perf_sched_events : >0 events exist | ||
| 126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
| 127 | */ | ||
| 128 | atomic_t perf_sched_events __read_mostly; | ||
| 129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
| 130 | |||
| 48 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
| 49 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
| 50 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
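task_function_call() and cpu_function_call() wrap smp_call_function_single(): remote_function() re-checks on arrival that the task is still current on that CPU and reports -EAGAIN if it moved, while -ESRCH means the task was not running at all. A hedged sketch of the caller pattern the later hunks adopt; the names are illustrative, and the sketch assumes it lives in the same file as the static helpers above:

static int do_remote_work(void *info)
{
	/* runs in IPI context on the CPU where the task is current */
	return 0;
}

static void poke_task(struct task_struct *p, void *info)
{
	if (!task_function_call(p, do_remote_work, info))
		return;		/* ran remotely while p was on a CPU */

	/*
	 * -ESRCH or -EAGAIN: p is not (or no longer) running.  Fall
	 * back to doing the work locally under the relevant lock, as
	 * perf_remove_from_context() does further down.
	 */
}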
| @@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu; | |||
| 62 | */ | 145 | */ |
| 63 | int sysctl_perf_event_paranoid __read_mostly = 1; | 146 | int sysctl_perf_event_paranoid __read_mostly = 1; |
| 64 | 147 | ||
| 65 | int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | 148 | /* Minimum for 512 kiB + 1 user control page */ |
| 149 | int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ | ||
| 66 | 150 | ||
| 67 | /* | 151 | /* |
| 68 | * max perf event sample rate | 152 | * max perf event sample rate |
| 69 | */ | 153 | */ |
| 70 | int sysctl_perf_event_sample_rate __read_mostly = 100000; | 154 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
| 155 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
| 156 | static int max_samples_per_tick __read_mostly = | ||
| 157 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
| 158 | |||
| 159 | int perf_proc_update_handler(struct ctl_table *table, int write, | ||
| 160 | void __user *buffer, size_t *lenp, | ||
| 161 | loff_t *ppos) | ||
| 162 | { | ||
| 163 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 164 | |||
| 165 | if (ret || !write) | ||
| 166 | return ret; | ||
| 167 | |||
| 168 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | ||
| 169 | |||
| 170 | return 0; | ||
| 171 | } | ||
| 71 | 172 | ||
| 72 | static atomic64_t perf_event_id; | 173 | static atomic64_t perf_event_id; |
| 73 | 174 | ||
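perf_proc_update_handler() recomputes max_samples_per_tick whenever the sample-rate sysctl is written. A hedged sketch of how such a handler is typically wired into a ctl_table; the real entry lives in kernel/sysctl.c and this table name is made up:

#include <linux/sysctl.h>

static struct ctl_table example_perf_table[] = {
	{
		.procname	= "perf_event_max_sample_rate",
		.data		= &sysctl_perf_event_sample_rate,
		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
		.mode		= 0644,
		.proc_handler	= perf_proc_update_handler,
	},
	{ }	/* sentinel */
};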
| @@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
| 75 | enum event_type_t event_type); | 176 | enum event_type_t event_type); |
| 76 | 177 | ||
| 77 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 178 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
| 78 | enum event_type_t event_type); | 179 | enum event_type_t event_type, |
| 180 | struct task_struct *task); | ||
| 181 | |||
| 182 | static void update_context_time(struct perf_event_context *ctx); | ||
| 183 | static u64 perf_event_time(struct perf_event *event); | ||
| 79 | 184 | ||
| 80 | void __weak perf_event_print_debug(void) { } | 185 | void __weak perf_event_print_debug(void) { } |
| 81 | 186 | ||
| @@ -89,6 +194,361 @@ static inline u64 perf_clock(void) | |||
| 89 | return local_clock(); | 194 | return local_clock(); |
| 90 | } | 195 | } |
| 91 | 196 | ||
| 197 | static inline struct perf_cpu_context * | ||
| 198 | __get_cpu_context(struct perf_event_context *ctx) | ||
| 199 | { | ||
| 200 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
| 201 | } | ||
| 202 | |||
| 203 | #ifdef CONFIG_CGROUP_PERF | ||
| 204 | |||
| 205 | /* | ||
| 206 | * Must ensure cgroup is pinned (css_get) before calling | ||
| 207 | * this function. In other words, we cannot call this function | ||
| 208 | * if there is no cgroup event for the current CPU context. | ||
| 209 | */ | ||
| 210 | static inline struct perf_cgroup * | ||
| 211 | perf_cgroup_from_task(struct task_struct *task) | ||
| 212 | { | ||
| 213 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
| 214 | struct perf_cgroup, css); | ||
| 215 | } | ||
| 216 | |||
| 217 | static inline bool | ||
| 218 | perf_cgroup_match(struct perf_event *event) | ||
| 219 | { | ||
| 220 | struct perf_event_context *ctx = event->ctx; | ||
| 221 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 222 | |||
| 223 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
| 224 | } | ||
| 225 | |||
| 226 | static inline void perf_get_cgroup(struct perf_event *event) | ||
| 227 | { | ||
| 228 | css_get(&event->cgrp->css); | ||
| 229 | } | ||
| 230 | |||
| 231 | static inline void perf_put_cgroup(struct perf_event *event) | ||
| 232 | { | ||
| 233 | css_put(&event->cgrp->css); | ||
| 234 | } | ||
| 235 | |||
| 236 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
| 237 | { | ||
| 238 | perf_put_cgroup(event); | ||
| 239 | event->cgrp = NULL; | ||
| 240 | } | ||
| 241 | |||
| 242 | static inline int is_cgroup_event(struct perf_event *event) | ||
| 243 | { | ||
| 244 | return event->cgrp != NULL; | ||
| 245 | } | ||
| 246 | |||
| 247 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
| 248 | { | ||
| 249 | struct perf_cgroup_info *t; | ||
| 250 | |||
| 251 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
| 252 | return t->time; | ||
| 253 | } | ||
| 254 | |||
| 255 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
| 256 | { | ||
| 257 | struct perf_cgroup_info *info; | ||
| 258 | u64 now; | ||
| 259 | |||
| 260 | now = perf_clock(); | ||
| 261 | |||
| 262 | info = this_cpu_ptr(cgrp->info); | ||
| 263 | |||
| 264 | info->time += now - info->timestamp; | ||
| 265 | info->timestamp = now; | ||
| 266 | } | ||
| 267 | |||
| 268 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
| 269 | { | ||
| 270 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
| 271 | if (cgrp_out) | ||
| 272 | __update_cgrp_time(cgrp_out); | ||
| 273 | } | ||
| 274 | |||
| 275 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
| 276 | { | ||
| 277 | struct perf_cgroup *cgrp; | ||
| 278 | |||
| 279 | /* | ||
| 280 | * ensure we access cgroup data only when needed and | ||
| 281 | * when we know the cgroup is pinned (css_get) | ||
| 282 | */ | ||
| 283 | if (!is_cgroup_event(event)) | ||
| 284 | return; | ||
| 285 | |||
| 286 | cgrp = perf_cgroup_from_task(current); | ||
| 287 | /* | ||
| 288 | * Do not update time when cgroup is not active | ||
| 289 | */ | ||
| 290 | if (cgrp == event->cgrp) | ||
| 291 | __update_cgrp_time(event->cgrp); | ||
| 292 | } | ||
| 293 | |||
| 294 | static inline void | ||
| 295 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
| 296 | struct perf_event_context *ctx) | ||
| 297 | { | ||
| 298 | struct perf_cgroup *cgrp; | ||
| 299 | struct perf_cgroup_info *info; | ||
| 300 | |||
| 301 | /* | ||
| 302 | * ctx->lock held by caller | ||
| 303 | * ensure we do not access cgroup data | ||
| 304 | * unless we have the cgroup pinned (css_get) | ||
| 305 | */ | ||
| 306 | if (!task || !ctx->nr_cgroups) | ||
| 307 | return; | ||
| 308 | |||
| 309 | cgrp = perf_cgroup_from_task(task); | ||
| 310 | info = this_cpu_ptr(cgrp->info); | ||
| 311 | info->timestamp = ctx->timestamp; | ||
| 312 | } | ||
| 313 | |||
| 314 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
| 315 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
| 316 | |||
| 317 | /* | ||
| 318 | * reschedule events based on the cgroup constraint of task. | ||
| 319 | * | ||
| 320 | * mode SWOUT : schedule out everything | ||
| 321 | * mode SWIN : schedule in based on cgroup for next | ||
| 322 | */ | ||
| 323 | void perf_cgroup_switch(struct task_struct *task, int mode) | ||
| 324 | { | ||
| 325 | struct perf_cpu_context *cpuctx; | ||
| 326 | struct pmu *pmu; | ||
| 327 | unsigned long flags; | ||
| 328 | |||
| 329 | /* | ||
| 330 | * disable interrupts to avoid getting nr_cgroup | ||
| 331 | * changes via __perf_event_disable(). Also | ||
| 332 | * avoids preemption. | ||
| 333 | */ | ||
| 334 | local_irq_save(flags); | ||
| 335 | |||
| 336 | /* | ||
| 337 | * we reschedule only in the presence of cgroup | ||
| 338 | * constrained events. | ||
| 339 | */ | ||
| 340 | rcu_read_lock(); | ||
| 341 | |||
| 342 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 343 | |||
| 344 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 345 | |||
| 346 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
| 347 | |||
| 348 | /* | ||
| 349 | * perf_cgroup_events says at least one | ||
| 350 | * context on this CPU has cgroup events. | ||
| 351 | * | ||
| 352 | * ctx->nr_cgroups reports the number of cgroup | ||
| 353 | * events for a context. | ||
| 354 | */ | ||
| 355 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
| 356 | |||
| 357 | if (mode & PERF_CGROUP_SWOUT) { | ||
| 358 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
| 359 | /* | ||
| 360 | * must not be done before ctxswout due | ||
| 361 | * to event_filter_match() in event_sched_out() | ||
| 362 | */ | ||
| 363 | cpuctx->cgrp = NULL; | ||
| 364 | } | ||
| 365 | |||
| 366 | if (mode & PERF_CGROUP_SWIN) { | ||
| 367 | WARN_ON_ONCE(cpuctx->cgrp); | ||
| 368 | /* set cgrp before ctxsw in to | ||
| 369 | * allow event_filter_match() to not | ||
| 370 | * have to pass task around | ||
| 371 | */ | ||
| 372 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
| 373 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
| 374 | } | ||
| 375 | } | ||
| 376 | |||
| 377 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
| 378 | } | ||
| 379 | |||
| 380 | rcu_read_unlock(); | ||
| 381 | |||
| 382 | local_irq_restore(flags); | ||
| 383 | } | ||
| 384 | |||
| 385 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
| 386 | { | ||
| 387 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
| 388 | } | ||
| 389 | |||
| 390 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
| 391 | { | ||
| 392 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
| 393 | } | ||
| 394 | |||
| 395 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
| 396 | struct perf_event_attr *attr, | ||
| 397 | struct perf_event *group_leader) | ||
| 398 | { | ||
| 399 | struct perf_cgroup *cgrp; | ||
| 400 | struct cgroup_subsys_state *css; | ||
| 401 | struct file *file; | ||
| 402 | int ret = 0, fput_needed; | ||
| 403 | |||
| 404 | file = fget_light(fd, &fput_needed); | ||
| 405 | if (!file) | ||
| 406 | return -EBADF; | ||
| 407 | |||
| 408 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
| 409 | if (IS_ERR(css)) { | ||
| 410 | ret = PTR_ERR(css); | ||
| 411 | goto out; | ||
| 412 | } | ||
| 413 | |||
| 414 | cgrp = container_of(css, struct perf_cgroup, css); | ||
| 415 | event->cgrp = cgrp; | ||
| 416 | |||
| 417 | /* must be done before we fput() the file */ | ||
| 418 | perf_get_cgroup(event); | ||
| 419 | |||
| 420 | /* | ||
| 421 | * all events in a group must monitor | ||
| 422 | * the same cgroup because a task belongs | ||
| 423 | * to only one perf cgroup at a time | ||
| 424 | */ | ||
| 425 | if (group_leader && group_leader->cgrp != cgrp) { | ||
| 426 | perf_detach_cgroup(event); | ||
| 427 | ret = -EINVAL; | ||
| 428 | } | ||
| 429 | out: | ||
| 430 | fput_light(file, fput_needed); | ||
| 431 | return ret; | ||
| 432 | } | ||
| 433 | |||
| 434 | static inline void | ||
| 435 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
| 436 | { | ||
| 437 | struct perf_cgroup_info *t; | ||
| 438 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
| 439 | event->shadow_ctx_time = now - t->timestamp; | ||
| 440 | } | ||
| 441 | |||
| 442 | static inline void | ||
| 443 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
| 444 | { | ||
| 445 | /* | ||
| 446 | * when the current task's perf cgroup does not match | ||
| 447 | * the event's, we need to remember to call the | ||
| 448 | * perf_mark_enable() function the first time a task with | ||
| 449 | * a matching perf cgroup is scheduled in. | ||
| 450 | */ | ||
| 451 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
| 452 | event->cgrp_defer_enabled = 1; | ||
| 453 | } | ||
| 454 | |||
| 455 | static inline void | ||
| 456 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
| 457 | struct perf_event_context *ctx) | ||
| 458 | { | ||
| 459 | struct perf_event *sub; | ||
| 460 | u64 tstamp = perf_event_time(event); | ||
| 461 | |||
| 462 | if (!event->cgrp_defer_enabled) | ||
| 463 | return; | ||
| 464 | |||
| 465 | event->cgrp_defer_enabled = 0; | ||
| 466 | |||
| 467 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
| 468 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
| 469 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
| 470 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
| 471 | sub->cgrp_defer_enabled = 0; | ||
| 472 | } | ||
| 473 | } | ||
| 474 | } | ||
| 475 | #else /* !CONFIG_CGROUP_PERF */ | ||
| 476 | |||
| 477 | static inline bool | ||
| 478 | perf_cgroup_match(struct perf_event *event) | ||
| 479 | { | ||
| 480 | return true; | ||
| 481 | } | ||
| 482 | |||
| 483 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
| 484 | {} | ||
| 485 | |||
| 486 | static inline int is_cgroup_event(struct perf_event *event) | ||
| 487 | { | ||
| 488 | return 0; | ||
| 489 | } | ||
| 490 | |||
| 491 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
| 492 | { | ||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | |||
| 496 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
| 497 | { | ||
| 498 | } | ||
| 499 | |||
| 500 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
| 501 | { | ||
| 502 | } | ||
| 503 | |||
| 504 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
| 505 | { | ||
| 506 | } | ||
| 507 | |||
| 508 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
| 509 | { | ||
| 510 | } | ||
| 511 | |||
| 512 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
| 513 | struct perf_event_attr *attr, | ||
| 514 | struct perf_event *group_leader) | ||
| 515 | { | ||
| 516 | return -EINVAL; | ||
| 517 | } | ||
| 518 | |||
| 519 | static inline void | ||
| 520 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
| 521 | struct perf_event_context *ctx) | ||
| 522 | { | ||
| 523 | } | ||
| 524 | |||
| 525 | void | ||
| 526 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
| 527 | { | ||
| 528 | } | ||
| 529 | |||
| 530 | static inline void | ||
| 531 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
| 532 | { | ||
| 533 | } | ||
| 534 | |||
| 535 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
| 536 | { | ||
| 537 | return 0; | ||
| 538 | } | ||
| 539 | |||
| 540 | static inline void | ||
| 541 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
| 542 | { | ||
| 543 | } | ||
| 544 | |||
| 545 | static inline void | ||
| 546 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
| 547 | struct perf_event_context *ctx) | ||
| 548 | { | ||
| 549 | } | ||
| 550 | #endif | ||
| 551 | |||
| 92 | void perf_pmu_disable(struct pmu *pmu) | 552 | void perf_pmu_disable(struct pmu *pmu) |
| 93 | { | 553 | { |
| 94 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 554 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
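The CONFIG_CGROUP_PERF block above, together with the no-op stubs in its #else branch, backs the new PERF_FLAG_PID_CGROUP mode: userspace passes a file descriptor for a directory in the perf_event cgroup hierarchy where a pid would normally go, and the event counts only while tasks of that cgroup run on the chosen CPU (cgroup events are per-CPU, so cpu should name a real CPU). A hedged userspace sketch, assuming the updated <linux/perf_event.h> exports PERF_FLAG_PID_CGROUP; path handling and the counter choice are illustrative:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_cgroup_cycles(const char *cgrp_dir, int cpu)
{
	struct perf_event_attr attr;
	int cgrp_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	cgrp_fd = open(cgrp_dir, O_RDONLY);	/* a perf_event cgroup directory */
	if (cgrp_fd < 0)
		return -1;

	/* in cgroup mode the "pid" argument carries the cgroup fd */
	return (int)syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
			    -1 /* group_fd */, PERF_FLAG_PID_CGROUP);
}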
| @@ -254,7 +714,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
| 254 | raw_spin_lock_irqsave(&ctx->lock, flags); | 714 | raw_spin_lock_irqsave(&ctx->lock, flags); |
| 255 | --ctx->pin_count; | 715 | --ctx->pin_count; |
| 256 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 716 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 257 | put_ctx(ctx); | ||
| 258 | } | 717 | } |
| 259 | 718 | ||
| 260 | /* | 719 | /* |
| @@ -271,6 +730,10 @@ static void update_context_time(struct perf_event_context *ctx) | |||
| 271 | static u64 perf_event_time(struct perf_event *event) | 730 | static u64 perf_event_time(struct perf_event *event) |
| 272 | { | 731 | { |
| 273 | struct perf_event_context *ctx = event->ctx; | 732 | struct perf_event_context *ctx = event->ctx; |
| 733 | |||
| 734 | if (is_cgroup_event(event)) | ||
| 735 | return perf_cgroup_event_time(event); | ||
| 736 | |||
| 274 | return ctx ? ctx->time : 0; | 737 | return ctx ? ctx->time : 0; |
| 275 | } | 738 | } |
| 276 | 739 | ||
| @@ -285,9 +748,20 @@ static void update_event_times(struct perf_event *event) | |||
| 285 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 748 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
| 286 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 749 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
| 287 | return; | 750 | return; |
| 288 | 751 | /* | |
| 289 | if (ctx->is_active) | 752 | * in cgroup mode, time_enabled represents |
| 753 | * the time the event was enabled AND active | ||
| 754 | * tasks were in the monitored cgroup. This is | ||
| 755 | * independent of the activity of the context as | ||
| 756 | * there may be a mix of cgroup and non-cgroup events. | ||
| 757 | * | ||
| 758 | * That is why we treat cgroup events differently | ||
| 759 | * here. | ||
| 760 | */ | ||
| 761 | if (is_cgroup_event(event)) | ||
| 290 | run_end = perf_event_time(event); | 762 | run_end = perf_event_time(event); |
| 763 | else if (ctx->is_active) | ||
| 764 | run_end = ctx->time; | ||
| 291 | else | 765 | else |
| 292 | run_end = event->tstamp_stopped; | 766 | run_end = event->tstamp_stopped; |
| 293 | 767 | ||
| @@ -299,6 +773,7 @@ static void update_event_times(struct perf_event *event) | |||
| 299 | run_end = perf_event_time(event); | 773 | run_end = perf_event_time(event); |
| 300 | 774 | ||
| 301 | event->total_time_running = run_end - event->tstamp_running; | 775 | event->total_time_running = run_end - event->tstamp_running; |
| 776 | |||
| 302 | } | 777 | } |
| 303 | 778 | ||
| 304 | /* | 779 | /* |
| @@ -347,6 +822,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 347 | list_add_tail(&event->group_entry, list); | 822 | list_add_tail(&event->group_entry, list); |
| 348 | } | 823 | } |
| 349 | 824 | ||
| 825 | if (is_cgroup_event(event)) | ||
| 826 | ctx->nr_cgroups++; | ||
| 827 | |||
| 350 | list_add_rcu(&event->event_entry, &ctx->event_list); | 828 | list_add_rcu(&event->event_entry, &ctx->event_list); |
| 351 | if (!ctx->nr_events) | 829 | if (!ctx->nr_events) |
| 352 | perf_pmu_rotate_start(ctx->pmu); | 830 | perf_pmu_rotate_start(ctx->pmu); |
| @@ -465,6 +943,7 @@ static void perf_group_attach(struct perf_event *event) | |||
| 465 | static void | 943 | static void |
| 466 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) | 944 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) |
| 467 | { | 945 | { |
| 946 | struct perf_cpu_context *cpuctx; | ||
| 468 | /* | 947 | /* |
| 469 | * We can have double detach due to exit/hot-unplug + close. | 948 | * We can have double detach due to exit/hot-unplug + close. |
| 470 | */ | 949 | */ |
| @@ -473,6 +952,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 473 | 952 | ||
| 474 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 953 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
| 475 | 954 | ||
| 955 | if (is_cgroup_event(event)) { | ||
| 956 | ctx->nr_cgroups--; | ||
| 957 | cpuctx = __get_cpu_context(ctx); | ||
| 958 | /* | ||
| 959 | * if there are no more cgroup events | ||
| 960 | * then clear cgrp to avoid stale pointer | ||
| 961 | * in update_cgrp_time_from_cpuctx() | ||
| 962 | */ | ||
| 963 | if (!ctx->nr_cgroups) | ||
| 964 | cpuctx->cgrp = NULL; | ||
| 965 | } | ||
| 966 | |||
| 476 | ctx->nr_events--; | 967 | ctx->nr_events--; |
| 477 | if (event->attr.inherit_stat) | 968 | if (event->attr.inherit_stat) |
| 478 | ctx->nr_stat--; | 969 | ctx->nr_stat--; |
| @@ -544,7 +1035,8 @@ out: | |||
| 544 | static inline int | 1035 | static inline int |
| 545 | event_filter_match(struct perf_event *event) | 1036 | event_filter_match(struct perf_event *event) |
| 546 | { | 1037 | { |
| 547 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 1038 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
| 1039 | && perf_cgroup_match(event); | ||
| 548 | } | 1040 | } |
| 549 | 1041 | ||
| 550 | static void | 1042 | static void |
| @@ -562,7 +1054,7 @@ event_sched_out(struct perf_event *event, | |||
| 562 | */ | 1054 | */ |
| 563 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1055 | if (event->state == PERF_EVENT_STATE_INACTIVE |
| 564 | && !event_filter_match(event)) { | 1056 | && !event_filter_match(event)) { |
| 565 | delta = ctx->time - event->tstamp_stopped; | 1057 | delta = tstamp - event->tstamp_stopped; |
| 566 | event->tstamp_running += delta; | 1058 | event->tstamp_running += delta; |
| 567 | event->tstamp_stopped = tstamp; | 1059 | event->tstamp_stopped = tstamp; |
| 568 | } | 1060 | } |
| @@ -606,47 +1098,30 @@ group_sched_out(struct perf_event *group_event, | |||
| 606 | cpuctx->exclusive = 0; | 1098 | cpuctx->exclusive = 0; |
| 607 | } | 1099 | } |
| 608 | 1100 | ||
| 609 | static inline struct perf_cpu_context * | ||
| 610 | __get_cpu_context(struct perf_event_context *ctx) | ||
| 611 | { | ||
| 612 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
| 613 | } | ||
| 614 | |||
| 615 | /* | 1101 | /* |
| 616 | * Cross CPU call to remove a performance event | 1102 | * Cross CPU call to remove a performance event |
| 617 | * | 1103 | * |
| 618 | * We disable the event on the hardware level first. After that we | 1104 | * We disable the event on the hardware level first. After that we |
| 619 | * remove it from the context list. | 1105 | * remove it from the context list. |
| 620 | */ | 1106 | */ |
| 621 | static void __perf_event_remove_from_context(void *info) | 1107 | static int __perf_remove_from_context(void *info) |
| 622 | { | 1108 | { |
| 623 | struct perf_event *event = info; | 1109 | struct perf_event *event = info; |
| 624 | struct perf_event_context *ctx = event->ctx; | 1110 | struct perf_event_context *ctx = event->ctx; |
| 625 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1111 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 626 | 1112 | ||
| 627 | /* | ||
| 628 | * If this is a task context, we need to check whether it is | ||
| 629 | * the current task context of this cpu. If not it has been | ||
| 630 | * scheduled out before the smp call arrived. | ||
| 631 | */ | ||
| 632 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
| 633 | return; | ||
| 634 | |||
| 635 | raw_spin_lock(&ctx->lock); | 1113 | raw_spin_lock(&ctx->lock); |
| 636 | |||
| 637 | event_sched_out(event, cpuctx, ctx); | 1114 | event_sched_out(event, cpuctx, ctx); |
| 638 | |||
| 639 | list_del_event(event, ctx); | 1115 | list_del_event(event, ctx); |
| 640 | |||
| 641 | raw_spin_unlock(&ctx->lock); | 1116 | raw_spin_unlock(&ctx->lock); |
| 1117 | |||
| 1118 | return 0; | ||
| 642 | } | 1119 | } |
| 643 | 1120 | ||
| 644 | 1121 | ||
| 645 | /* | 1122 | /* |
| 646 | * Remove the event from a task's (or a CPU's) list of events. | 1123 | * Remove the event from a task's (or a CPU's) list of events. |
| 647 | * | 1124 | * |
| 648 | * Must be called with ctx->mutex held. | ||
| 649 | * | ||
| 650 | * CPU events are removed with a smp call. For task events we only | 1125 | * CPU events are removed with a smp call. For task events we only |
| 651 | * call when the task is on a CPU. | 1126 | * call when the task is on a CPU. |
| 652 | * | 1127 | * |
| @@ -657,49 +1132,48 @@ static void __perf_event_remove_from_context(void *info) | |||
| 657 | * When called from perf_event_exit_task, it's OK because the | 1132 | * When called from perf_event_exit_task, it's OK because the |
| 658 | * context has been detached from its task. | 1133 | * context has been detached from its task. |
| 659 | */ | 1134 | */ |
| 660 | static void perf_event_remove_from_context(struct perf_event *event) | 1135 | static void perf_remove_from_context(struct perf_event *event) |
| 661 | { | 1136 | { |
| 662 | struct perf_event_context *ctx = event->ctx; | 1137 | struct perf_event_context *ctx = event->ctx; |
| 663 | struct task_struct *task = ctx->task; | 1138 | struct task_struct *task = ctx->task; |
| 664 | 1139 | ||
| 1140 | lockdep_assert_held(&ctx->mutex); | ||
| 1141 | |||
| 665 | if (!task) { | 1142 | if (!task) { |
| 666 | /* | 1143 | /* |
| 667 | * Per cpu events are removed via an smp call and | 1144 | * Per cpu events are removed via an smp call and |
| 668 | * the removal is always successful. | 1145 | * the removal is always successful. |
| 669 | */ | 1146 | */ |
| 670 | smp_call_function_single(event->cpu, | 1147 | cpu_function_call(event->cpu, __perf_remove_from_context, event); |
| 671 | __perf_event_remove_from_context, | ||
| 672 | event, 1); | ||
| 673 | return; | 1148 | return; |
| 674 | } | 1149 | } |
| 675 | 1150 | ||
| 676 | retry: | 1151 | retry: |
| 677 | task_oncpu_function_call(task, __perf_event_remove_from_context, | 1152 | if (!task_function_call(task, __perf_remove_from_context, event)) |
| 678 | event); | 1153 | return; |
| 679 | 1154 | ||
| 680 | raw_spin_lock_irq(&ctx->lock); | 1155 | raw_spin_lock_irq(&ctx->lock); |
| 681 | /* | 1156 | /* |
| 682 | * If the context is active we need to retry the smp call. | 1157 | * If we failed to find a running task, but find the context active now |
| 1158 | * that we've acquired the ctx->lock, retry. | ||
| 683 | */ | 1159 | */ |
| 684 | if (ctx->nr_active && !list_empty(&event->group_entry)) { | 1160 | if (ctx->is_active) { |
| 685 | raw_spin_unlock_irq(&ctx->lock); | 1161 | raw_spin_unlock_irq(&ctx->lock); |
| 686 | goto retry; | 1162 | goto retry; |
| 687 | } | 1163 | } |
| 688 | 1164 | ||
| 689 | /* | 1165 | /* |
| 690 | * The lock prevents that this context is scheduled in so we | 1166 | * Since the task isn't running, it's safe to remove the event, us |
| 691 | * can remove the event safely, if the call above did not | 1167 | * holding the ctx->lock ensures the task won't get scheduled in. |
| 692 | * succeed. | ||
| 693 | */ | 1168 | */ |
| 694 | if (!list_empty(&event->group_entry)) | 1169 | list_del_event(event, ctx); |
| 695 | list_del_event(event, ctx); | ||
| 696 | raw_spin_unlock_irq(&ctx->lock); | 1170 | raw_spin_unlock_irq(&ctx->lock); |
| 697 | } | 1171 | } |
| 698 | 1172 | ||
| 699 | /* | 1173 | /* |
| 700 | * Cross CPU call to disable a performance event | 1174 | * Cross CPU call to disable a performance event |
| 701 | */ | 1175 | */ |
| 702 | static void __perf_event_disable(void *info) | 1176 | static int __perf_event_disable(void *info) |
| 703 | { | 1177 | { |
| 704 | struct perf_event *event = info; | 1178 | struct perf_event *event = info; |
| 705 | struct perf_event_context *ctx = event->ctx; | 1179 | struct perf_event_context *ctx = event->ctx; |
| @@ -708,9 +1182,12 @@ static void __perf_event_disable(void *info) | |||
| 708 | /* | 1182 | /* |
| 709 | * If this is a per-task event, need to check whether this | 1183 | * If this is a per-task event, need to check whether this |
| 710 | * event's task is the current task on this cpu. | 1184 | * event's task is the current task on this cpu. |
| 1185 | * | ||
| 1186 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
| 1187 | * flipping contexts around. | ||
| 711 | */ | 1188 | */ |
| 712 | if (ctx->task && cpuctx->task_ctx != ctx) | 1189 | if (ctx->task && cpuctx->task_ctx != ctx) |
| 713 | return; | 1190 | return -EINVAL; |
| 714 | 1191 | ||
| 715 | raw_spin_lock(&ctx->lock); | 1192 | raw_spin_lock(&ctx->lock); |
| 716 | 1193 | ||
| @@ -720,6 +1197,7 @@ static void __perf_event_disable(void *info) | |||
| 720 | */ | 1197 | */ |
| 721 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1198 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
| 722 | update_context_time(ctx); | 1199 | update_context_time(ctx); |
| 1200 | update_cgrp_time_from_event(event); | ||
| 723 | update_group_times(event); | 1201 | update_group_times(event); |
| 724 | if (event == event->group_leader) | 1202 | if (event == event->group_leader) |
| 725 | group_sched_out(event, cpuctx, ctx); | 1203 | group_sched_out(event, cpuctx, ctx); |
| @@ -729,6 +1207,8 @@ static void __perf_event_disable(void *info) | |||
| 729 | } | 1207 | } |
| 730 | 1208 | ||
| 731 | raw_spin_unlock(&ctx->lock); | 1209 | raw_spin_unlock(&ctx->lock); |
| 1210 | |||
| 1211 | return 0; | ||
| 732 | } | 1212 | } |
| 733 | 1213 | ||
| 734 | /* | 1214 | /* |
| @@ -753,13 +1233,13 @@ void perf_event_disable(struct perf_event *event) | |||
| 753 | /* | 1233 | /* |
| 754 | * Disable the event on the cpu that it's on | 1234 | * Disable the event on the cpu that it's on |
| 755 | */ | 1235 | */ |
| 756 | smp_call_function_single(event->cpu, __perf_event_disable, | 1236 | cpu_function_call(event->cpu, __perf_event_disable, event); |
| 757 | event, 1); | ||
| 758 | return; | 1237 | return; |
| 759 | } | 1238 | } |
| 760 | 1239 | ||
| 761 | retry: | 1240 | retry: |
| 762 | task_oncpu_function_call(task, __perf_event_disable, event); | 1241 | if (!task_function_call(task, __perf_event_disable, event)) |
| 1242 | return; | ||
| 763 | 1243 | ||
| 764 | raw_spin_lock_irq(&ctx->lock); | 1244 | raw_spin_lock_irq(&ctx->lock); |
| 765 | /* | 1245 | /* |
| @@ -767,6 +1247,11 @@ retry: | |||
| 767 | */ | 1247 | */ |
| 768 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 1248 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
| 769 | raw_spin_unlock_irq(&ctx->lock); | 1249 | raw_spin_unlock_irq(&ctx->lock); |
| 1250 | /* | ||
| 1251 | * Reload the task pointer, it might have been changed by | ||
| 1252 | * a concurrent perf_event_context_sched_out(). | ||
| 1253 | */ | ||
| 1254 | task = ctx->task; | ||
| 770 | goto retry; | 1255 | goto retry; |
| 771 | } | 1256 | } |
| 772 | 1257 | ||
| @@ -778,10 +1263,48 @@ retry: | |||
| 778 | update_group_times(event); | 1263 | update_group_times(event); |
| 779 | event->state = PERF_EVENT_STATE_OFF; | 1264 | event->state = PERF_EVENT_STATE_OFF; |
| 780 | } | 1265 | } |
| 781 | |||
| 782 | raw_spin_unlock_irq(&ctx->lock); | 1266 | raw_spin_unlock_irq(&ctx->lock); |
| 783 | } | 1267 | } |
| 784 | 1268 | ||
| 1269 | static void perf_set_shadow_time(struct perf_event *event, | ||
| 1270 | struct perf_event_context *ctx, | ||
| 1271 | u64 tstamp) | ||
| 1272 | { | ||
| 1273 | /* | ||
| 1274 | * use the correct time source for the time snapshot | ||
| 1275 | * | ||
| 1276 | * We could get by without this by leveraging the | ||
| 1277 | * fact that to get to this function, the caller | ||
| 1278 | * has most likely already called update_context_time() | ||
| 1279 | * and update_cgrp_time_xx() and thus both timestamp | ||
| 1280 | * are identical (or very close). Given that tstamp is, | ||
| 1281 | * already adjusted for cgroup, we could say that: | ||
| 1282 | * tstamp - ctx->timestamp | ||
| 1283 | * is equivalent to | ||
| 1284 | * tstamp - cgrp->timestamp. | ||
| 1285 | * | ||
| 1286 | * Then, in perf_output_read(), the calculation would | ||
| 1287 | * work with no changes because: | ||
| 1288 | * - event is guaranteed scheduled in | ||
| 1289 | * - no scheduled out in between | ||
| 1290 | * - thus the timestamp would be the same | ||
| 1291 | * | ||
| 1292 | * But this is a bit hairy. | ||
| 1293 | * | ||
| 1294 | * So instead, we have an explicit cgroup call to remain | ||
| 1296 | * within the time source all along. We believe it | ||
| 1296 | * is cleaner and simpler to understand. | ||
| 1297 | */ | ||
| 1298 | if (is_cgroup_event(event)) | ||
| 1299 | perf_cgroup_set_shadow_time(event, tstamp); | ||
| 1300 | else | ||
| 1301 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
| 1302 | } | ||
| 1303 | |||
| 1304 | #define MAX_INTERRUPTS (~0ULL) | ||
| 1305 | |||
| 1306 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
| 1307 | |||
| 785 | static int | 1308 | static int |
| 786 | event_sched_in(struct perf_event *event, | 1309 | event_sched_in(struct perf_event *event, |
| 787 | struct perf_cpu_context *cpuctx, | 1310 | struct perf_cpu_context *cpuctx, |
| @@ -794,6 +1317,17 @@ event_sched_in(struct perf_event *event, | |||
| 794 | 1317 | ||
| 795 | event->state = PERF_EVENT_STATE_ACTIVE; | 1318 | event->state = PERF_EVENT_STATE_ACTIVE; |
| 796 | event->oncpu = smp_processor_id(); | 1319 | event->oncpu = smp_processor_id(); |
| 1320 | |||
| 1321 | /* | ||
| 1322 | * Unthrottle events, since we scheduled we might have missed several | ||
| 1323 | * ticks already, also for a heavily scheduling task there is little | ||
| 1324 | * guarantee it'll get a tick in a timely manner. | ||
| 1325 | */ | ||
| 1326 | if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { | ||
| 1327 | perf_log_throttle(event, 1); | ||
| 1328 | event->hw.interrupts = 0; | ||
| 1329 | } | ||
| 1330 | |||
| 797 | /* | 1331 | /* |
| 798 | * The new state must be visible before we turn it on in the hardware: | 1332 | * The new state must be visible before we turn it on in the hardware: |
| 799 | */ | 1333 | */ |
| @@ -807,7 +1341,7 @@ event_sched_in(struct perf_event *event, | |||
| 807 | 1341 | ||
| 808 | event->tstamp_running += tstamp - event->tstamp_stopped; | 1342 | event->tstamp_running += tstamp - event->tstamp_stopped; |
| 809 | 1343 | ||
| 810 | event->shadow_ctx_time = tstamp - ctx->timestamp; | 1344 | perf_set_shadow_time(event, ctx, tstamp); |
| 811 | 1345 | ||
| 812 | if (!is_software_event(event)) | 1346 | if (!is_software_event(event)) |
| 813 | cpuctx->active_oncpu++; | 1347 | cpuctx->active_oncpu++; |
| @@ -928,12 +1462,15 @@ static void add_event_to_ctx(struct perf_event *event, | |||
| 928 | event->tstamp_stopped = tstamp; | 1462 | event->tstamp_stopped = tstamp; |
| 929 | } | 1463 | } |
| 930 | 1464 | ||
| 1465 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | ||
| 1466 | struct task_struct *tsk); | ||
| 1467 | |||
| 931 | /* | 1468 | /* |
| 932 | * Cross CPU call to install and enable a performance event | 1469 | * Cross CPU call to install and enable a performance event |
| 933 | * | 1470 | * |
| 934 | * Must be called with ctx->mutex held | 1471 | * Must be called with ctx->mutex held |
| 935 | */ | 1472 | */ |
| 936 | static void __perf_install_in_context(void *info) | 1473 | static int __perf_install_in_context(void *info) |
| 937 | { | 1474 | { |
| 938 | struct perf_event *event = info; | 1475 | struct perf_event *event = info; |
| 939 | struct perf_event_context *ctx = event->ctx; | 1476 | struct perf_event_context *ctx = event->ctx; |
| @@ -942,21 +1479,22 @@ static void __perf_install_in_context(void *info) | |||
| 942 | int err; | 1479 | int err; |
| 943 | 1480 | ||
| 944 | /* | 1481 | /* |
| 945 | * If this is a task context, we need to check whether it is | 1482 | * In case we're installing a new context to an already running task, |
| 946 | * the current task context of this cpu. If not it has been | 1483 | * this could also happen before perf_event_task_sched_in() on architectures |
| 947 | * scheduled out before the smp call arrived. | 1484 | * which do context switches with IRQs enabled. |
| 948 | * Or possibly this is the right context but it isn't | ||
| 949 | * on this cpu because it had no events. | ||
| 950 | */ | 1485 | */ |
| 951 | if (ctx->task && cpuctx->task_ctx != ctx) { | 1486 | if (ctx->task && !cpuctx->task_ctx) |
| 952 | if (cpuctx->task_ctx || ctx->task != current) | 1487 | perf_event_context_sched_in(ctx, ctx->task); |
| 953 | return; | ||
| 954 | cpuctx->task_ctx = ctx; | ||
| 955 | } | ||
| 956 | 1488 | ||
| 957 | raw_spin_lock(&ctx->lock); | 1489 | raw_spin_lock(&ctx->lock); |
| 958 | ctx->is_active = 1; | 1490 | ctx->is_active = 1; |
| 959 | update_context_time(ctx); | 1491 | update_context_time(ctx); |
| 1492 | /* | ||
| 1493 | * update cgrp time only if current cgrp | ||
| 1494 | * matches event->cgrp. Must be done before | ||
| 1495 | * calling add_event_to_ctx() | ||
| 1496 | */ | ||
| 1497 | update_cgrp_time_from_event(event); | ||
| 960 | 1498 | ||
| 961 | add_event_to_ctx(event, ctx); | 1499 | add_event_to_ctx(event, ctx); |
| 962 | 1500 | ||
| @@ -997,6 +1535,8 @@ static void __perf_install_in_context(void *info) | |||
| 997 | 1535 | ||
| 998 | unlock: | 1536 | unlock: |
| 999 | raw_spin_unlock(&ctx->lock); | 1537 | raw_spin_unlock(&ctx->lock); |
| 1538 | |||
| 1539 | return 0; | ||
| 1000 | } | 1540 | } |
| 1001 | 1541 | ||
| 1002 | /* | 1542 | /* |
| @@ -1008,8 +1548,6 @@ unlock: | |||
| 1008 | * If the event is attached to a task which is on a CPU we use a smp | 1548 | * If the event is attached to a task which is on a CPU we use a smp |
| 1009 | * call to enable it in the task context. The task might have been | 1549 | * call to enable it in the task context. The task might have been |
| 1010 | * scheduled away, but we check this in the smp call again. | 1550 | * scheduled away, but we check this in the smp call again. |
| 1011 | * | ||
| 1012 | * Must be called with ctx->mutex held. | ||
| 1013 | */ | 1551 | */ |
| 1014 | static void | 1552 | static void |
| 1015 | perf_install_in_context(struct perf_event_context *ctx, | 1553 | perf_install_in_context(struct perf_event_context *ctx, |
| @@ -1018,6 +1556,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
| 1018 | { | 1556 | { |
| 1019 | struct task_struct *task = ctx->task; | 1557 | struct task_struct *task = ctx->task; |
| 1020 | 1558 | ||
| 1559 | lockdep_assert_held(&ctx->mutex); | ||
| 1560 | |||
| 1021 | event->ctx = ctx; | 1561 | event->ctx = ctx; |
| 1022 | 1562 | ||
| 1023 | if (!task) { | 1563 | if (!task) { |
| @@ -1025,31 +1565,29 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
| 1025 | * Per cpu events are installed via an smp call and | 1565 | * Per cpu events are installed via an smp call and |
| 1026 | * the install is always successful. | 1566 | * the install is always successful. |
| 1027 | */ | 1567 | */ |
| 1028 | smp_call_function_single(cpu, __perf_install_in_context, | 1568 | cpu_function_call(cpu, __perf_install_in_context, event); |
| 1029 | event, 1); | ||
| 1030 | return; | 1569 | return; |
| 1031 | } | 1570 | } |
| 1032 | 1571 | ||
| 1033 | retry: | 1572 | retry: |
| 1034 | task_oncpu_function_call(task, __perf_install_in_context, | 1573 | if (!task_function_call(task, __perf_install_in_context, event)) |
| 1035 | event); | 1574 | return; |
| 1036 | 1575 | ||
| 1037 | raw_spin_lock_irq(&ctx->lock); | 1576 | raw_spin_lock_irq(&ctx->lock); |
| 1038 | /* | 1577 | /* |
| 1039 | * we need to retry the smp call. | 1578 | * If we failed to find a running task, but find the context active now |
| 1579 | * that we've acquired the ctx->lock, retry. | ||
| 1040 | */ | 1580 | */ |
| 1041 | if (ctx->is_active && list_empty(&event->group_entry)) { | 1581 | if (ctx->is_active) { |
| 1042 | raw_spin_unlock_irq(&ctx->lock); | 1582 | raw_spin_unlock_irq(&ctx->lock); |
| 1043 | goto retry; | 1583 | goto retry; |
| 1044 | } | 1584 | } |
| 1045 | 1585 | ||
| 1046 | /* | 1586 | /* |
| 1047 | * The lock prevents that this context is scheduled in so we | 1587 | * Since the task isn't running, it's safe to add the event; holding |
| 1048 | * can add the event safely, if it the call above did not | 1588 | * the ctx->lock ensures the task won't get scheduled in. |
| 1049 | * succeed. | ||
| 1050 | */ | 1589 | */ |
| 1051 | if (list_empty(&event->group_entry)) | 1590 | add_event_to_ctx(event, ctx); |
| 1052 | add_event_to_ctx(event, ctx); | ||
| 1053 | raw_spin_unlock_irq(&ctx->lock); | 1591 | raw_spin_unlock_irq(&ctx->lock); |
| 1054 | } | 1592 | } |
| 1055 | 1593 | ||
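
perf_install_in_context() above now goes through task_function_call(): the event is installed from a cross-CPU call while the task is running, and only added directly under ctx->lock when it is not, retrying if the context turns out to be active after all. A single-threaded user-space model of that control flow (all names hypothetical):

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool is_active; int nr_events; };

/* Pretend cross-CPU call: only succeeds while the context is active. */
static bool task_function_call_sim(struct ctx *c)
{
    if (!c->is_active)
        return false;          /* task not running, call not delivered */
    c->nr_events++;            /* __perf_install_in_context() equivalent */
    return true;
}

static void install_in_context(struct ctx *c)
{
    for (;;) {
        if (task_function_call_sim(c))
            return;            /* installed from the "IPI" */
        /* ctx->lock would be taken here ... */
        if (c->is_active)
            continue;          /* raced with sched-in, retry the call */
        c->nr_events++;        /* task not running: add directly */
        return;
    }
}

int main(void)
{
    struct ctx idle = { .is_active = false };
    struct ctx busy = { .is_active = true };

    install_in_context(&idle);
    install_in_context(&busy);
    printf("idle=%d busy=%d\n", idle.nr_events, busy.nr_events);
    return 0;
}
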
| @@ -1078,7 +1616,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
| 1078 | /* | 1616 | /* |
| 1079 | * Cross CPU call to enable a performance event | 1617 | * Cross CPU call to enable a performance event |
| 1080 | */ | 1618 | */ |
| 1081 | static void __perf_event_enable(void *info) | 1619 | static int __perf_event_enable(void *info) |
| 1082 | { | 1620 | { |
| 1083 | struct perf_event *event = info; | 1621 | struct perf_event *event = info; |
| 1084 | struct perf_event_context *ctx = event->ctx; | 1622 | struct perf_event_context *ctx = event->ctx; |
| @@ -1086,26 +1624,27 @@ static void __perf_event_enable(void *info) | |||
| 1086 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1624 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 1087 | int err; | 1625 | int err; |
| 1088 | 1626 | ||
| 1089 | /* | 1627 | if (WARN_ON_ONCE(!ctx->is_active)) |
| 1090 | * If this is a per-task event, need to check whether this | 1628 | return -EINVAL; |
| 1091 | * event's task is the current task on this cpu. | ||
| 1092 | */ | ||
| 1093 | if (ctx->task && cpuctx->task_ctx != ctx) { | ||
| 1094 | if (cpuctx->task_ctx || ctx->task != current) | ||
| 1095 | return; | ||
| 1096 | cpuctx->task_ctx = ctx; | ||
| 1097 | } | ||
| 1098 | 1629 | ||
| 1099 | raw_spin_lock(&ctx->lock); | 1630 | raw_spin_lock(&ctx->lock); |
| 1100 | ctx->is_active = 1; | ||
| 1101 | update_context_time(ctx); | 1631 | update_context_time(ctx); |
| 1102 | 1632 | ||
| 1103 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1633 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
| 1104 | goto unlock; | 1634 | goto unlock; |
| 1635 | |||
| 1636 | /* | ||
| 1637 | * set current task's cgroup time reference point | ||
| 1638 | */ | ||
| 1639 | perf_cgroup_set_timestamp(current, ctx); | ||
| 1640 | |||
| 1105 | __perf_event_mark_enabled(event, ctx); | 1641 | __perf_event_mark_enabled(event, ctx); |
| 1106 | 1642 | ||
| 1107 | if (!event_filter_match(event)) | 1643 | if (!event_filter_match(event)) { |
| 1644 | if (is_cgroup_event(event)) | ||
| 1645 | perf_cgroup_defer_enabled(event); | ||
| 1108 | goto unlock; | 1646 | goto unlock; |
| 1647 | } | ||
| 1109 | 1648 | ||
| 1110 | /* | 1649 | /* |
| 1111 | * If the event is in a group and isn't the group leader, | 1650 | * If the event is in a group and isn't the group leader, |
| @@ -1138,6 +1677,8 @@ static void __perf_event_enable(void *info) | |||
| 1138 | 1677 | ||
| 1139 | unlock: | 1678 | unlock: |
| 1140 | raw_spin_unlock(&ctx->lock); | 1679 | raw_spin_unlock(&ctx->lock); |
| 1680 | |||
| 1681 | return 0; | ||
| 1141 | } | 1682 | } |
| 1142 | 1683 | ||
| 1143 | /* | 1684 | /* |
| @@ -1158,8 +1699,7 @@ void perf_event_enable(struct perf_event *event) | |||
| 1158 | /* | 1699 | /* |
| 1159 | * Enable the event on the cpu that it's on | 1700 | * Enable the event on the cpu that it's on |
| 1160 | */ | 1701 | */ |
| 1161 | smp_call_function_single(event->cpu, __perf_event_enable, | 1702 | cpu_function_call(event->cpu, __perf_event_enable, event); |
| 1162 | event, 1); | ||
| 1163 | return; | 1703 | return; |
| 1164 | } | 1704 | } |
| 1165 | 1705 | ||
| @@ -1178,8 +1718,15 @@ void perf_event_enable(struct perf_event *event) | |||
| 1178 | event->state = PERF_EVENT_STATE_OFF; | 1718 | event->state = PERF_EVENT_STATE_OFF; |
| 1179 | 1719 | ||
| 1180 | retry: | 1720 | retry: |
| 1721 | if (!ctx->is_active) { | ||
| 1722 | __perf_event_mark_enabled(event, ctx); | ||
| 1723 | goto out; | ||
| 1724 | } | ||
| 1725 | |||
| 1181 | raw_spin_unlock_irq(&ctx->lock); | 1726 | raw_spin_unlock_irq(&ctx->lock); |
| 1182 | task_oncpu_function_call(task, __perf_event_enable, event); | 1727 | |
| 1728 | if (!task_function_call(task, __perf_event_enable, event)) | ||
| 1729 | return; | ||
| 1183 | 1730 | ||
| 1184 | raw_spin_lock_irq(&ctx->lock); | 1731 | raw_spin_lock_irq(&ctx->lock); |
| 1185 | 1732 | ||
| @@ -1187,15 +1734,14 @@ retry: | |||
| 1187 | * If the context is active and the event is still off, | 1734 | * If the context is active and the event is still off, |
| 1188 | * we need to retry the cross-call. | 1735 | * we need to retry the cross-call. |
| 1189 | */ | 1736 | */ |
| 1190 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) | 1737 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { |
| 1738 | /* | ||
| 1739 | * task could have been flipped by a concurrent | ||
| 1740 | * perf_event_context_sched_out() | ||
| 1741 | */ | ||
| 1742 | task = ctx->task; | ||
| 1191 | goto retry; | 1743 | goto retry; |
| 1192 | 1744 | } | |
| 1193 | /* | ||
| 1194 | * Since we have the lock this context can't be scheduled | ||
| 1195 | * in, so we can change the state safely. | ||
| 1196 | */ | ||
| 1197 | if (event->state == PERF_EVENT_STATE_OFF) | ||
| 1198 | __perf_event_mark_enabled(event, ctx); | ||
| 1199 | 1745 | ||
| 1200 | out: | 1746 | out: |
| 1201 | raw_spin_unlock_irq(&ctx->lock); | 1747 | raw_spin_unlock_irq(&ctx->lock); |
| @@ -1227,6 +1773,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 1227 | if (likely(!ctx->nr_events)) | 1773 | if (likely(!ctx->nr_events)) |
| 1228 | goto out; | 1774 | goto out; |
| 1229 | update_context_time(ctx); | 1775 | update_context_time(ctx); |
| 1776 | update_cgrp_time_from_cpuctx(cpuctx); | ||
| 1230 | 1777 | ||
| 1231 | if (!ctx->nr_active) | 1778 | if (!ctx->nr_active) |
| 1232 | goto out; | 1779 | goto out; |
| @@ -1339,8 +1886,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
| 1339 | } | 1886 | } |
| 1340 | } | 1887 | } |
| 1341 | 1888 | ||
| 1342 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, | 1889 | static void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
| 1343 | struct task_struct *next) | 1890 | struct task_struct *next) |
| 1344 | { | 1891 | { |
| 1345 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 1892 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
| 1346 | struct perf_event_context *next_ctx; | 1893 | struct perf_event_context *next_ctx; |
| @@ -1416,6 +1963,14 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 1416 | 1963 | ||
| 1417 | for_each_task_context_nr(ctxn) | 1964 | for_each_task_context_nr(ctxn) |
| 1418 | perf_event_context_sched_out(task, ctxn, next); | 1965 | perf_event_context_sched_out(task, ctxn, next); |
| 1966 | |||
| 1967 | /* | ||
| 1968 | * if cgroup events exist on this CPU, then we need | ||
| 1969 | * to check if we have to switch out PMU state. | ||
| 1970 | * cgroup events are in system-wide mode only | ||
| 1971 | */ | ||
| 1972 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
| 1973 | perf_cgroup_sched_out(task); | ||
| 1419 | } | 1974 | } |
| 1420 | 1975 | ||
| 1421 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1976 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
| @@ -1454,6 +2009,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
| 1454 | if (!event_filter_match(event)) | 2009 | if (!event_filter_match(event)) |
| 1455 | continue; | 2010 | continue; |
| 1456 | 2011 | ||
| 2012 | /* may need to reset tstamp_enabled */ | ||
| 2013 | if (is_cgroup_event(event)) | ||
| 2014 | perf_cgroup_mark_enabled(event, ctx); | ||
| 2015 | |||
| 1457 | if (group_can_go_on(event, cpuctx, 1)) | 2016 | if (group_can_go_on(event, cpuctx, 1)) |
| 1458 | group_sched_in(event, cpuctx, ctx); | 2017 | group_sched_in(event, cpuctx, ctx); |
| 1459 | 2018 | ||
| @@ -1486,6 +2045,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
| 1486 | if (!event_filter_match(event)) | 2045 | if (!event_filter_match(event)) |
| 1487 | continue; | 2046 | continue; |
| 1488 | 2047 | ||
| 2048 | /* may need to reset tstamp_enabled */ | ||
| 2049 | if (is_cgroup_event(event)) | ||
| 2050 | perf_cgroup_mark_enabled(event, ctx); | ||
| 2051 | |||
| 1489 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 2052 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
| 1490 | if (group_sched_in(event, cpuctx, ctx)) | 2053 | if (group_sched_in(event, cpuctx, ctx)) |
| 1491 | can_add_hw = 0; | 2054 | can_add_hw = 0; |
| @@ -1496,15 +2059,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
| 1496 | static void | 2059 | static void |
| 1497 | ctx_sched_in(struct perf_event_context *ctx, | 2060 | ctx_sched_in(struct perf_event_context *ctx, |
| 1498 | struct perf_cpu_context *cpuctx, | 2061 | struct perf_cpu_context *cpuctx, |
| 1499 | enum event_type_t event_type) | 2062 | enum event_type_t event_type, |
| 2063 | struct task_struct *task) | ||
| 1500 | { | 2064 | { |
| 2065 | u64 now; | ||
| 2066 | |||
| 1501 | raw_spin_lock(&ctx->lock); | 2067 | raw_spin_lock(&ctx->lock); |
| 1502 | ctx->is_active = 1; | 2068 | ctx->is_active = 1; |
| 1503 | if (likely(!ctx->nr_events)) | 2069 | if (likely(!ctx->nr_events)) |
| 1504 | goto out; | 2070 | goto out; |
| 1505 | 2071 | ||
| 1506 | ctx->timestamp = perf_clock(); | 2072 | now = perf_clock(); |
| 1507 | 2073 | ctx->timestamp = now; | |
| 2074 | perf_cgroup_set_timestamp(task, ctx); | ||
| 1508 | /* | 2075 | /* |
| 1509 | * First go through the list and put on any pinned groups | 2076 | * First go through the list and put on any pinned groups |
| 1510 | * in order to give them the best chance of going on. | 2077 | * in order to give them the best chance of going on. |
| @@ -1521,11 +2088,12 @@ out: | |||
| 1521 | } | 2088 | } |
| 1522 | 2089 | ||
| 1523 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2090 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
| 1524 | enum event_type_t event_type) | 2091 | enum event_type_t event_type, |
| 2092 | struct task_struct *task) | ||
| 1525 | { | 2093 | { |
| 1526 | struct perf_event_context *ctx = &cpuctx->ctx; | 2094 | struct perf_event_context *ctx = &cpuctx->ctx; |
| 1527 | 2095 | ||
| 1528 | ctx_sched_in(ctx, cpuctx, event_type); | 2096 | ctx_sched_in(ctx, cpuctx, event_type, task); |
| 1529 | } | 2097 | } |
| 1530 | 2098 | ||
| 1531 | static void task_ctx_sched_in(struct perf_event_context *ctx, | 2099 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
| @@ -1533,15 +2101,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, | |||
| 1533 | { | 2101 | { |
| 1534 | struct perf_cpu_context *cpuctx; | 2102 | struct perf_cpu_context *cpuctx; |
| 1535 | 2103 | ||
| 1536 | cpuctx = __get_cpu_context(ctx); | 2104 | cpuctx = __get_cpu_context(ctx); |
| 1537 | if (cpuctx->task_ctx == ctx) | 2105 | if (cpuctx->task_ctx == ctx) |
| 1538 | return; | 2106 | return; |
| 1539 | 2107 | ||
| 1540 | ctx_sched_in(ctx, cpuctx, event_type); | 2108 | ctx_sched_in(ctx, cpuctx, event_type, NULL); |
| 1541 | cpuctx->task_ctx = ctx; | 2109 | cpuctx->task_ctx = ctx; |
| 1542 | } | 2110 | } |
| 1543 | 2111 | ||
| 1544 | void perf_event_context_sched_in(struct perf_event_context *ctx) | 2112 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
| 2113 | struct task_struct *task) | ||
| 1545 | { | 2114 | { |
| 1546 | struct perf_cpu_context *cpuctx; | 2115 | struct perf_cpu_context *cpuctx; |
| 1547 | 2116 | ||
| @@ -1557,9 +2126,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) | |||
| 1557 | */ | 2126 | */ |
| 1558 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2127 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 1559 | 2128 | ||
| 1560 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2129 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
| 1561 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2130 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
| 1562 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2131 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
| 1563 | 2132 | ||
| 1564 | cpuctx->task_ctx = ctx; | 2133 | cpuctx->task_ctx = ctx; |
| 1565 | 2134 | ||
| @@ -1592,14 +2161,17 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
| 1592 | if (likely(!ctx)) | 2161 | if (likely(!ctx)) |
| 1593 | continue; | 2162 | continue; |
| 1594 | 2163 | ||
| 1595 | perf_event_context_sched_in(ctx); | 2164 | perf_event_context_sched_in(ctx, task); |
| 1596 | } | 2165 | } |
| 2166 | /* | ||
| 2167 | * if cgroup events exist on this CPU, then we need | ||
| 2168 | * to check if we have to switch in PMU state. | ||
| 2169 | * cgroup event are system-wide mode only | ||
| 2170 | */ | ||
| 2171 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
| 2172 | perf_cgroup_sched_in(task); | ||
| 1597 | } | 2173 | } |
| 1598 | 2174 | ||
| 1599 | #define MAX_INTERRUPTS (~0ULL) | ||
| 1600 | |||
| 1601 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
| 1602 | |||
| 1603 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2175 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
| 1604 | { | 2176 | { |
| 1605 | u64 frequency = event->attr.sample_freq; | 2177 | u64 frequency = event->attr.sample_freq; |
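
The __perf_event_task_sched_out()/__perf_event_task_sched_in() hunks above only do the cgroup switch work when the per-CPU perf_cgroup_events count says this CPU actually monitors a cgroup (backed by the perf_sched_events jump label). A simplified user-space model of that gating, with hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4
static atomic_int cgroup_events[NR_CPUS];

static void event_created(int cpu) { atomic_fetch_add(&cgroup_events[cpu], 1); }
static void event_freed(int cpu)   { atomic_fetch_sub(&cgroup_events[cpu], 1); }

static void task_sched_in(int cpu)
{
    /* Only pay the cgroup-switch cost when this CPU monitors a cgroup. */
    if (atomic_load(&cgroup_events[cpu]))
        printf("cpu%d: switch in cgroup PMU state\n", cpu);
}

int main(void)
{
    task_sched_in(0);        /* no cgroup events: nothing to do */
    event_created(0);
    task_sched_in(0);        /* now the cgroup state gets switched in */
    event_freed(0);
    return 0;
}
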
| @@ -1627,7 +2199,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | |||
| 1627 | * Reduce accuracy by one bit such that @a and @b converge | 2199 | * Reduce accuracy by one bit such that @a and @b converge |
| 1628 | * to a similar magnitude. | 2200 | * to a similar magnitude. |
| 1629 | */ | 2201 | */ |
| 1630 | #define REDUCE_FLS(a, b) \ | 2202 | #define REDUCE_FLS(a, b) \ |
| 1631 | do { \ | 2203 | do { \ |
| 1632 | if (a##_fls > b##_fls) { \ | 2204 | if (a##_fls > b##_fls) { \ |
| 1633 | a >>= 1; \ | 2205 | a >>= 1; \ |
| @@ -1797,7 +2369,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 1797 | if (ctx) | 2369 | if (ctx) |
| 1798 | rotate_ctx(ctx); | 2370 | rotate_ctx(ctx); |
| 1799 | 2371 | ||
| 1800 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2372 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
| 1801 | if (ctx) | 2373 | if (ctx) |
| 1802 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | 2374 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
| 1803 | 2375 | ||
| @@ -1852,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 1852 | if (!ctx || !ctx->nr_events) | 2424 | if (!ctx || !ctx->nr_events) |
| 1853 | goto out; | 2425 | goto out; |
| 1854 | 2426 | ||
| 2427 | /* | ||
| 2428 | * We must switch out cgroup events to avoid a conflict | ||
| 2429 | * when invoking perf_task_event_sched_in() later on | ||
| 2430 | * in this function. Otherwise we end up trying to | ||
| 2431 | * switch in cgroup events which are already scheduled | ||
| 2432 | * in. | ||
| 2433 | */ | ||
| 2434 | perf_cgroup_sched_out(current); | ||
| 1855 | task_ctx_sched_out(ctx, EVENT_ALL); | 2435 | task_ctx_sched_out(ctx, EVENT_ALL); |
| 1856 | 2436 | ||
| 1857 | raw_spin_lock(&ctx->lock); | 2437 | raw_spin_lock(&ctx->lock); |
| @@ -1876,7 +2456,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 1876 | 2456 | ||
| 1877 | raw_spin_unlock(&ctx->lock); | 2457 | raw_spin_unlock(&ctx->lock); |
| 1878 | 2458 | ||
| 1879 | perf_event_context_sched_in(ctx); | 2459 | /* |
| 2460 | * Also switches in cgroup events, if any: | ||
| 2461 | */ | ||
| 2462 | perf_event_context_sched_in(ctx, ctx->task); | ||
| 1880 | out: | 2463 | out: |
| 1881 | local_irq_restore(flags); | 2464 | local_irq_restore(flags); |
| 1882 | } | 2465 | } |
| @@ -1901,8 +2484,10 @@ static void __perf_event_read(void *info) | |||
| 1901 | return; | 2484 | return; |
| 1902 | 2485 | ||
| 1903 | raw_spin_lock(&ctx->lock); | 2486 | raw_spin_lock(&ctx->lock); |
| 1904 | if (ctx->is_active) | 2487 | if (ctx->is_active) { |
| 1905 | update_context_time(ctx); | 2488 | update_context_time(ctx); |
| 2489 | update_cgrp_time_from_event(event); | ||
| 2490 | } | ||
| 1906 | update_event_times(event); | 2491 | update_event_times(event); |
| 1907 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2492 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
| 1908 | event->pmu->read(event); | 2493 | event->pmu->read(event); |
| @@ -1933,8 +2518,10 @@ static u64 perf_event_read(struct perf_event *event) | |||
| 1933 | * (e.g., thread is blocked), in that case | 2518 | * (e.g., thread is blocked), in that case |
| 1934 | * we cannot update context time | 2519 | * we cannot update context time |
| 1935 | */ | 2520 | */ |
| 1936 | if (ctx->is_active) | 2521 | if (ctx->is_active) { |
| 1937 | update_context_time(ctx); | 2522 | update_context_time(ctx); |
| 2523 | update_cgrp_time_from_event(event); | ||
| 2524 | } | ||
| 1938 | update_event_times(event); | 2525 | update_event_times(event); |
| 1939 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2526 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 1940 | } | 2527 | } |
| @@ -2213,6 +2800,9 @@ errout: | |||
| 2213 | 2800 | ||
| 2214 | } | 2801 | } |
| 2215 | 2802 | ||
| 2803 | /* | ||
| 2804 | * Returns a matching context with refcount and pincount. | ||
| 2805 | */ | ||
| 2216 | static struct perf_event_context * | 2806 | static struct perf_event_context * |
| 2217 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 2807 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
| 2218 | { | 2808 | { |
| @@ -2237,6 +2827,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 2237 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 2827 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| 2238 | ctx = &cpuctx->ctx; | 2828 | ctx = &cpuctx->ctx; |
| 2239 | get_ctx(ctx); | 2829 | get_ctx(ctx); |
| 2830 | ++ctx->pin_count; | ||
| 2240 | 2831 | ||
| 2241 | return ctx; | 2832 | return ctx; |
| 2242 | } | 2833 | } |
| @@ -2250,6 +2841,7 @@ retry: | |||
| 2250 | ctx = perf_lock_task_context(task, ctxn, &flags); | 2841 | ctx = perf_lock_task_context(task, ctxn, &flags); |
| 2251 | if (ctx) { | 2842 | if (ctx) { |
| 2252 | unclone_ctx(ctx); | 2843 | unclone_ctx(ctx); |
| 2844 | ++ctx->pin_count; | ||
| 2253 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2845 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 2254 | } | 2846 | } |
| 2255 | 2847 | ||
| @@ -2271,8 +2863,10 @@ retry: | |||
| 2271 | err = -ESRCH; | 2863 | err = -ESRCH; |
| 2272 | else if (task->perf_event_ctxp[ctxn]) | 2864 | else if (task->perf_event_ctxp[ctxn]) |
| 2273 | err = -EAGAIN; | 2865 | err = -EAGAIN; |
| 2274 | else | 2866 | else { |
| 2867 | ++ctx->pin_count; | ||
| 2275 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | 2868 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); |
| 2869 | } | ||
| 2276 | mutex_unlock(&task->perf_event_mutex); | 2870 | mutex_unlock(&task->perf_event_mutex); |
| 2277 | 2871 | ||
| 2278 | if (unlikely(err)) { | 2872 | if (unlikely(err)) { |
| @@ -2312,7 +2906,7 @@ static void free_event(struct perf_event *event) | |||
| 2312 | 2906 | ||
| 2313 | if (!event->parent) { | 2907 | if (!event->parent) { |
| 2314 | if (event->attach_state & PERF_ATTACH_TASK) | 2908 | if (event->attach_state & PERF_ATTACH_TASK) |
| 2315 | jump_label_dec(&perf_task_events); | 2909 | jump_label_dec(&perf_sched_events); |
| 2316 | if (event->attr.mmap || event->attr.mmap_data) | 2910 | if (event->attr.mmap || event->attr.mmap_data) |
| 2317 | atomic_dec(&nr_mmap_events); | 2911 | atomic_dec(&nr_mmap_events); |
| 2318 | if (event->attr.comm) | 2912 | if (event->attr.comm) |
| @@ -2321,6 +2915,10 @@ static void free_event(struct perf_event *event) | |||
| 2321 | atomic_dec(&nr_task_events); | 2915 | atomic_dec(&nr_task_events); |
| 2322 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 2916 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
| 2323 | put_callchain_buffers(); | 2917 | put_callchain_buffers(); |
| 2918 | if (is_cgroup_event(event)) { | ||
| 2919 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
| 2920 | jump_label_dec(&perf_sched_events); | ||
| 2921 | } | ||
| 2324 | } | 2922 | } |
| 2325 | 2923 | ||
| 2326 | if (event->buffer) { | 2924 | if (event->buffer) { |
| @@ -2328,6 +2926,9 @@ static void free_event(struct perf_event *event) | |||
| 2328 | event->buffer = NULL; | 2926 | event->buffer = NULL; |
| 2329 | } | 2927 | } |
| 2330 | 2928 | ||
| 2929 | if (is_cgroup_event(event)) | ||
| 2930 | perf_detach_cgroup(event); | ||
| 2931 | |||
| 2331 | if (event->destroy) | 2932 | if (event->destroy) |
| 2332 | event->destroy(event); | 2933 | event->destroy(event); |
| 2333 | 2934 | ||
| @@ -4395,26 +4996,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 4395 | if (unlikely(!is_sampling_event(event))) | 4996 | if (unlikely(!is_sampling_event(event))) |
| 4396 | return 0; | 4997 | return 0; |
| 4397 | 4998 | ||
| 4398 | if (!throttle) { | 4999 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { |
| 4399 | hwc->interrupts++; | 5000 | if (throttle) { |
| 4400 | } else { | 5001 | hwc->interrupts = MAX_INTERRUPTS; |
| 4401 | if (hwc->interrupts != MAX_INTERRUPTS) { | 5002 | perf_log_throttle(event, 0); |
| 4402 | hwc->interrupts++; | ||
| 4403 | if (HZ * hwc->interrupts > | ||
| 4404 | (u64)sysctl_perf_event_sample_rate) { | ||
| 4405 | hwc->interrupts = MAX_INTERRUPTS; | ||
| 4406 | perf_log_throttle(event, 0); | ||
| 4407 | ret = 1; | ||
| 4408 | } | ||
| 4409 | } else { | ||
| 4410 | /* | ||
| 4411 | * Keep re-disabling events even though on the previous | ||
| 4412 | * pass we disabled it - just in case we raced with a | ||
| 4413 | * sched-in and the event got enabled again: | ||
| 4414 | */ | ||
| 4415 | ret = 1; | 5003 | ret = 1; |
| 4416 | } | 5004 | } |
| 4417 | } | 5005 | } else |
| 5006 | hwc->interrupts++; | ||
| 4418 | 5007 | ||
| 4419 | if (event->attr.freq) { | 5008 | if (event->attr.freq) { |
| 4420 | u64 now = perf_clock(); | 5009 | u64 now = perf_clock(); |
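
The rewritten __perf_event_overflow() path above throttles an event once hwc->interrupts reaches max_samples_per_tick, and the event_sched_in() hunk earlier in this patch unthrottles it on the next schedule-in. A stand-alone model of that counter logic; max_samples_per_tick is set to an arbitrary 4 here:

#include <stdint.h>
#include <stdio.h>

#define MAX_INTERRUPTS (~0ULL)
static const uint64_t max_samples_per_tick = 4;

struct hw { uint64_t interrupts; };

/* Returns 1 when the event must be throttled (stop sampling for now). */
static int overflow(struct hw *hwc, int throttle)
{
    if (hwc->interrupts >= max_samples_per_tick) {
        if (throttle) {
            hwc->interrupts = MAX_INTERRUPTS;
            return 1;
        }
    } else {
        hwc->interrupts++;
    }
    return 0;
}

/* What event_sched_in() above does on the next schedule-in: reset the count. */
static void unthrottle(struct hw *hwc)
{
    if (hwc->interrupts == MAX_INTERRUPTS)
        hwc->interrupts = 0;
}

int main(void)
{
    struct hw hwc = { 0 };

    for (int i = 0; i < 6; i++)
        printf("overflow %d -> throttled=%d\n", i, overflow(&hwc, 1));
    unthrottle(&hwc);
    printf("after unthrottle: interrupts=%llu\n",
           (unsigned long long)hwc.interrupts);
    return 0;
}
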
| @@ -4556,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event, | |||
| 4556 | struct pt_regs *regs) | 5145 | struct pt_regs *regs) |
| 4557 | { | 5146 | { |
| 4558 | if (event->hw.state & PERF_HES_STOPPED) | 5147 | if (event->hw.state & PERF_HES_STOPPED) |
| 4559 | return 0; | 5148 | return 1; |
| 4560 | 5149 | ||
| 4561 | if (regs) { | 5150 | if (regs) { |
| 4562 | if (event->attr.exclude_user && user_mode(regs)) | 5151 | if (event->attr.exclude_user && user_mode(regs)) |
| @@ -4912,6 +5501,8 @@ static int perf_tp_event_match(struct perf_event *event, | |||
| 4912 | struct perf_sample_data *data, | 5501 | struct perf_sample_data *data, |
| 4913 | struct pt_regs *regs) | 5502 | struct pt_regs *regs) |
| 4914 | { | 5503 | { |
| 5504 | if (event->hw.state & PERF_HES_STOPPED) | ||
| 5505 | return 0; | ||
| 4915 | /* | 5506 | /* |
| 4916 | * All tracepoints are from kernel-space. | 5507 | * All tracepoints are from kernel-space. |
| 4917 | */ | 5508 | */ |
| @@ -5051,6 +5642,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
| 5051 | u64 period; | 5642 | u64 period; |
| 5052 | 5643 | ||
| 5053 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | 5644 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
| 5645 | |||
| 5646 | if (event->state != PERF_EVENT_STATE_ACTIVE) | ||
| 5647 | return HRTIMER_NORESTART; | ||
| 5648 | |||
| 5054 | event->pmu->read(event); | 5649 | event->pmu->read(event); |
| 5055 | 5650 | ||
| 5056 | perf_sample_data_init(&data, 0); | 5651 | perf_sample_data_init(&data, 0); |
| @@ -5077,9 +5672,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) | |||
| 5077 | if (!is_sampling_event(event)) | 5672 | if (!is_sampling_event(event)) |
| 5078 | return; | 5673 | return; |
| 5079 | 5674 | ||
| 5080 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 5081 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
| 5082 | |||
| 5083 | period = local64_read(&hwc->period_left); | 5675 | period = local64_read(&hwc->period_left); |
| 5084 | if (period) { | 5676 | if (period) { |
| 5085 | if (period < 0) | 5677 | if (period < 0) |
| @@ -5106,6 +5698,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) | |||
| 5106 | } | 5698 | } |
| 5107 | } | 5699 | } |
| 5108 | 5700 | ||
| 5701 | static void perf_swevent_init_hrtimer(struct perf_event *event) | ||
| 5702 | { | ||
| 5703 | struct hw_perf_event *hwc = &event->hw; | ||
| 5704 | |||
| 5705 | if (!is_sampling_event(event)) | ||
| 5706 | return; | ||
| 5707 | |||
| 5708 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 5709 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
| 5710 | |||
| 5711 | /* | ||
| 5712 | * Since hrtimers have a fixed rate, we can do a static freq->period | ||
| 5713 | * mapping and avoid the whole period adjust feedback stuff. | ||
| 5714 | */ | ||
| 5715 | if (event->attr.freq) { | ||
| 5716 | long freq = event->attr.sample_freq; | ||
| 5717 | |||
| 5718 | event->attr.sample_period = NSEC_PER_SEC / freq; | ||
| 5719 | hwc->sample_period = event->attr.sample_period; | ||
| 5720 | local64_set(&hwc->period_left, hwc->sample_period); | ||
| 5721 | event->attr.freq = 0; | ||
| 5722 | } | ||
| 5723 | } | ||
| 5724 | |||
| 5109 | /* | 5725 | /* |
| 5110 | * Software event: cpu wall time clock | 5726 | * Software event: cpu wall time clock |
| 5111 | */ | 5727 | */ |
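
perf_swevent_init_hrtimer() above turns a requested sampling frequency into a fixed period once at init time (sample_period = NSEC_PER_SEC / freq), so the hrtimer path can skip the adaptive period logic. A quick user-space check of that mapping for a few assumed frequencies:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    uint64_t freqs[] = { 1, 100, 1000, 4000 };

    for (unsigned int i = 0; i < sizeof(freqs) / sizeof(freqs[0]); i++) {
        uint64_t period = NSEC_PER_SEC / freqs[i];   /* ns between samples */

        printf("%6llu Hz -> period %10llu ns\n",
               (unsigned long long)freqs[i], (unsigned long long)period);
    }
    return 0;
}
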
| @@ -5158,6 +5774,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
| 5158 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | 5774 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) |
| 5159 | return -ENOENT; | 5775 | return -ENOENT; |
| 5160 | 5776 | ||
| 5777 | perf_swevent_init_hrtimer(event); | ||
| 5778 | |||
| 5161 | return 0; | 5779 | return 0; |
| 5162 | } | 5780 | } |
| 5163 | 5781 | ||
| @@ -5213,16 +5831,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) | |||
| 5213 | 5831 | ||
| 5214 | static void task_clock_event_read(struct perf_event *event) | 5832 | static void task_clock_event_read(struct perf_event *event) |
| 5215 | { | 5833 | { |
| 5216 | u64 time; | 5834 | u64 now = perf_clock(); |
| 5217 | 5835 | u64 delta = now - event->ctx->timestamp; | |
| 5218 | if (!in_nmi()) { | 5836 | u64 time = event->ctx->time + delta; |
| 5219 | update_context_time(event->ctx); | ||
| 5220 | time = event->ctx->time; | ||
| 5221 | } else { | ||
| 5222 | u64 now = perf_clock(); | ||
| 5223 | u64 delta = now - event->ctx->timestamp; | ||
| 5224 | time = event->ctx->time + delta; | ||
| 5225 | } | ||
| 5226 | 5837 | ||
| 5227 | task_clock_event_update(event, time); | 5838 | task_clock_event_update(event, time); |
| 5228 | } | 5839 | } |
| @@ -5235,6 +5846,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
| 5235 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | 5846 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) |
| 5236 | return -ENOENT; | 5847 | return -ENOENT; |
| 5237 | 5848 | ||
| 5849 | perf_swevent_init_hrtimer(event); | ||
| 5850 | |||
| 5238 | return 0; | 5851 | return 0; |
| 5239 | } | 5852 | } |
| 5240 | 5853 | ||
| @@ -5506,17 +6119,22 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 5506 | { | 6119 | { |
| 5507 | struct pmu *pmu = NULL; | 6120 | struct pmu *pmu = NULL; |
| 5508 | int idx; | 6121 | int idx; |
| 6122 | int ret; | ||
| 5509 | 6123 | ||
| 5510 | idx = srcu_read_lock(&pmus_srcu); | 6124 | idx = srcu_read_lock(&pmus_srcu); |
| 5511 | 6125 | ||
| 5512 | rcu_read_lock(); | 6126 | rcu_read_lock(); |
| 5513 | pmu = idr_find(&pmu_idr, event->attr.type); | 6127 | pmu = idr_find(&pmu_idr, event->attr.type); |
| 5514 | rcu_read_unlock(); | 6128 | rcu_read_unlock(); |
| 5515 | if (pmu) | 6129 | if (pmu) { |
| 6130 | ret = pmu->event_init(event); | ||
| 6131 | if (ret) | ||
| 6132 | pmu = ERR_PTR(ret); | ||
| 5516 | goto unlock; | 6133 | goto unlock; |
| 6134 | } | ||
| 5517 | 6135 | ||
| 5518 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6136 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 5519 | int ret = pmu->event_init(event); | 6137 | ret = pmu->event_init(event); |
| 5520 | if (!ret) | 6138 | if (!ret) |
| 5521 | goto unlock; | 6139 | goto unlock; |
| 5522 | 6140 | ||
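
perf_init_event() above now consults the type->PMU idr first and treats that PMU's event_init() result as final, falling back to probing the registered-PMU list only for types with no direct mapping. A rough user-space model of that lookup order, using two hypothetical PMUs:

#include <errno.h>
#include <stdio.h>

struct pmu {
    int type;
    int (*event_init)(int requested_type);
};

static int cpu_init(int t) { return t == 0 ? 0 : -ENOENT; }
static int sw_init(int t)  { return t == 1 ? 0 : -ENOENT; }

static struct pmu pmus[] = { { 0, cpu_init }, { 1, sw_init } };

static struct pmu *init_event(int type)
{
    /* "idr" stage: exact match on type, and its init result is final */
    for (unsigned int i = 0; i < 2; i++) {
        if (pmus[i].type == type)
            return pmus[i].event_init(type) ? NULL : &pmus[i];
    }
    /* fallback: let each PMU inspect the event until one accepts it */
    for (unsigned int i = 0; i < 2; i++) {
        if (!pmus[i].event_init(type))
            return &pmus[i];
    }
    return NULL;
}

int main(void)
{
    printf("type 1 -> %s\n", init_event(1) ? "found" : "none");
    printf("type 7 -> %s\n", init_event(7) ? "found" : "none");
    return 0;
}
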
| @@ -5642,7 +6260,7 @@ done: | |||
| 5642 | 6260 | ||
| 5643 | if (!event->parent) { | 6261 | if (!event->parent) { |
| 5644 | if (event->attach_state & PERF_ATTACH_TASK) | 6262 | if (event->attach_state & PERF_ATTACH_TASK) |
| 5645 | jump_label_inc(&perf_task_events); | 6263 | jump_label_inc(&perf_sched_events); |
| 5646 | if (event->attr.mmap || event->attr.mmap_data) | 6264 | if (event->attr.mmap || event->attr.mmap_data) |
| 5647 | atomic_inc(&nr_mmap_events); | 6265 | atomic_inc(&nr_mmap_events); |
| 5648 | if (event->attr.comm) | 6266 | if (event->attr.comm) |
| @@ -5817,7 +6435,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5817 | int err; | 6435 | int err; |
| 5818 | 6436 | ||
| 5819 | /* for future expandability... */ | 6437 | /* for future expandability... */ |
| 5820 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6438 | if (flags & ~PERF_FLAG_ALL) |
| 5821 | return -EINVAL; | 6439 | return -EINVAL; |
| 5822 | 6440 | ||
| 5823 | err = perf_copy_attr(attr_uptr, &attr); | 6441 | err = perf_copy_attr(attr_uptr, &attr); |
| @@ -5834,6 +6452,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5834 | return -EINVAL; | 6452 | return -EINVAL; |
| 5835 | } | 6453 | } |
| 5836 | 6454 | ||
| 6455 | /* | ||
| 6456 | * In cgroup mode, the pid argument is used to pass the fd | ||
| 6457 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
| 6458 | * designates the cpu on which to monitor threads from that | ||
| 6459 | * cgroup. | ||
| 6460 | */ | ||
| 6461 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
| 6462 | return -EINVAL; | ||
| 6463 | |||
| 5837 | event_fd = get_unused_fd_flags(O_RDWR); | 6464 | event_fd = get_unused_fd_flags(O_RDWR); |
| 5838 | if (event_fd < 0) | 6465 | if (event_fd < 0) |
| 5839 | return event_fd; | 6466 | return event_fd; |
| @@ -5851,7 +6478,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5851 | group_leader = NULL; | 6478 | group_leader = NULL; |
| 5852 | } | 6479 | } |
| 5853 | 6480 | ||
| 5854 | if (pid != -1) { | 6481 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { |
| 5855 | task = find_lively_task_by_vpid(pid); | 6482 | task = find_lively_task_by_vpid(pid); |
| 5856 | if (IS_ERR(task)) { | 6483 | if (IS_ERR(task)) { |
| 5857 | err = PTR_ERR(task); | 6484 | err = PTR_ERR(task); |
| @@ -5865,6 +6492,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5865 | goto err_task; | 6492 | goto err_task; |
| 5866 | } | 6493 | } |
| 5867 | 6494 | ||
| 6495 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
| 6496 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
| 6497 | if (err) | ||
| 6498 | goto err_alloc; | ||
| 6499 | /* | ||
| 6500 | * one more event: | ||
| 6501 | * - that has cgroup constraint on event->cpu | ||
| 6502 | * - that may need work on context switch | ||
| 6503 | */ | ||
| 6504 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
| 6505 | jump_label_inc(&perf_sched_events); | ||
| 6506 | } | ||
| 6507 | |||
| 5868 | /* | 6508 | /* |
| 5869 | * Special case software events and allow them to be part of | 6509 | * Special case software events and allow them to be part of |
| 5870 | * any hardware group. | 6510 | * any hardware group. |
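
In the PERF_FLAG_PID_CGROUP mode validated above, user space passes a file descriptor open on a cgroup directory as the pid argument and must name a concrete cpu. A sketch of such a caller follows; the cgroup mount point, the chosen event and the fallback #define are assumptions that depend on the system and headers in use:

#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP (1U << 2)
#endif

int main(void)
{
    struct perf_event_attr attr;
    int cgrp_fd, ev_fd;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_HARDWARE;
    attr.config = PERF_COUNT_HW_CPU_CYCLES;

    /* Hypothetical path; must be a directory in the perf_event hierarchy. */
    cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
    if (cgrp_fd < 0) {
        perror("open cgroup");
        return 1;
    }

    /* pid = cgroup fd, cpu = 0: count cycles for that cgroup on CPU 0. */
    ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
                    PERF_FLAG_PID_CGROUP);
    if (ev_fd < 0)
        perror("perf_event_open");
    else
        close(ev_fd);

    close(cgrp_fd);
    return 0;
}
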
| @@ -5903,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5903 | goto err_alloc; | 6543 | goto err_alloc; |
| 5904 | } | 6544 | } |
| 5905 | 6545 | ||
| 6546 | if (task) { | ||
| 6547 | put_task_struct(task); | ||
| 6548 | task = NULL; | ||
| 6549 | } | ||
| 6550 | |||
| 5906 | /* | 6551 | /* |
| 5907 | * Look up the group leader (we will attach this event to it): | 6552 | * Look up the group leader (we will attach this event to it): |
| 5908 | */ | 6553 | */ |
| @@ -5950,10 +6595,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5950 | struct perf_event_context *gctx = group_leader->ctx; | 6595 | struct perf_event_context *gctx = group_leader->ctx; |
| 5951 | 6596 | ||
| 5952 | mutex_lock(&gctx->mutex); | 6597 | mutex_lock(&gctx->mutex); |
| 5953 | perf_event_remove_from_context(group_leader); | 6598 | perf_remove_from_context(group_leader); |
| 5954 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6599 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 5955 | group_entry) { | 6600 | group_entry) { |
| 5956 | perf_event_remove_from_context(sibling); | 6601 | perf_remove_from_context(sibling); |
| 5957 | put_ctx(gctx); | 6602 | put_ctx(gctx); |
| 5958 | } | 6603 | } |
| 5959 | mutex_unlock(&gctx->mutex); | 6604 | mutex_unlock(&gctx->mutex); |
| @@ -5976,6 +6621,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5976 | 6621 | ||
| 5977 | perf_install_in_context(ctx, event, cpu); | 6622 | perf_install_in_context(ctx, event, cpu); |
| 5978 | ++ctx->generation; | 6623 | ++ctx->generation; |
| 6624 | perf_unpin_context(ctx); | ||
| 5979 | mutex_unlock(&ctx->mutex); | 6625 | mutex_unlock(&ctx->mutex); |
| 5980 | 6626 | ||
| 5981 | event->owner = current; | 6627 | event->owner = current; |
| @@ -6001,6 +6647,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6001 | return event_fd; | 6647 | return event_fd; |
| 6002 | 6648 | ||
| 6003 | err_context: | 6649 | err_context: |
| 6650 | perf_unpin_context(ctx); | ||
| 6004 | put_ctx(ctx); | 6651 | put_ctx(ctx); |
| 6005 | err_alloc: | 6652 | err_alloc: |
| 6006 | free_event(event); | 6653 | free_event(event); |
| @@ -6051,6 +6698,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 6051 | mutex_lock(&ctx->mutex); | 6698 | mutex_lock(&ctx->mutex); |
| 6052 | perf_install_in_context(ctx, event, cpu); | 6699 | perf_install_in_context(ctx, event, cpu); |
| 6053 | ++ctx->generation; | 6700 | ++ctx->generation; |
| 6701 | perf_unpin_context(ctx); | ||
| 6054 | mutex_unlock(&ctx->mutex); | 6702 | mutex_unlock(&ctx->mutex); |
| 6055 | 6703 | ||
| 6056 | return event; | 6704 | return event; |
| @@ -6102,17 +6750,20 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 6102 | struct perf_event_context *child_ctx, | 6750 | struct perf_event_context *child_ctx, |
| 6103 | struct task_struct *child) | 6751 | struct task_struct *child) |
| 6104 | { | 6752 | { |
| 6105 | struct perf_event *parent_event; | 6753 | if (child_event->parent) { |
| 6754 | raw_spin_lock_irq(&child_ctx->lock); | ||
| 6755 | perf_group_detach(child_event); | ||
| 6756 | raw_spin_unlock_irq(&child_ctx->lock); | ||
| 6757 | } | ||
| 6106 | 6758 | ||
| 6107 | perf_event_remove_from_context(child_event); | 6759 | perf_remove_from_context(child_event); |
| 6108 | 6760 | ||
| 6109 | parent_event = child_event->parent; | ||
| 6110 | /* | 6761 | /* |
| 6111 | * It can happen that parent exits first, and has events | 6762 | * It can happen that the parent exits first, and has events |
| 6112 | * that are still around due to the child reference. These | 6763 | * that are still around due to the child reference. These |
| 6113 | * events need to be zapped - but otherwise linger. | 6764 | * events need to be zapped. |
| 6114 | */ | 6765 | */ |
| 6115 | if (parent_event) { | 6766 | if (child_event->parent) { |
| 6116 | sync_child_event(child_event, child); | 6767 | sync_child_event(child_event, child); |
| 6117 | free_event(child_event); | 6768 | free_event(child_event); |
| 6118 | } | 6769 | } |
| @@ -6411,7 +7062,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
| 6411 | return 0; | 7062 | return 0; |
| 6412 | } | 7063 | } |
| 6413 | 7064 | ||
| 6414 | child_ctx = child->perf_event_ctxp[ctxn]; | 7065 | child_ctx = child->perf_event_ctxp[ctxn]; |
| 6415 | if (!child_ctx) { | 7066 | if (!child_ctx) { |
| 6416 | /* | 7067 | /* |
| 6417 | * This is executed from the parent task context, so | 7068 | * This is executed from the parent task context, so |
| @@ -6526,6 +7177,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6526 | mutex_unlock(&parent_ctx->mutex); | 7177 | mutex_unlock(&parent_ctx->mutex); |
| 6527 | 7178 | ||
| 6528 | perf_unpin_context(parent_ctx); | 7179 | perf_unpin_context(parent_ctx); |
| 7180 | put_ctx(parent_ctx); | ||
| 6529 | 7181 | ||
| 6530 | return ret; | 7182 | return ret; |
| 6531 | } | 7183 | } |
| @@ -6595,9 +7247,9 @@ static void __perf_event_exit_context(void *__info) | |||
| 6595 | perf_pmu_rotate_stop(ctx->pmu); | 7247 | perf_pmu_rotate_stop(ctx->pmu); |
| 6596 | 7248 | ||
| 6597 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7249 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
| 6598 | __perf_event_remove_from_context(event); | 7250 | __perf_remove_from_context(event); |
| 6599 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 7251 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
| 6600 | __perf_event_remove_from_context(event); | 7252 | __perf_remove_from_context(event); |
| 6601 | } | 7253 | } |
| 6602 | 7254 | ||
| 6603 | static void perf_event_exit_cpu_context(int cpu) | 7255 | static void perf_event_exit_cpu_context(int cpu) |
| @@ -6721,3 +7373,83 @@ unlock: | |||
| 6721 | return ret; | 7373 | return ret; |
| 6722 | } | 7374 | } |
| 6723 | device_initcall(perf_event_sysfs_init); | 7375 | device_initcall(perf_event_sysfs_init); |
| 7376 | |||
| 7377 | #ifdef CONFIG_CGROUP_PERF | ||
| 7378 | static struct cgroup_subsys_state *perf_cgroup_create( | ||
| 7379 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
| 7380 | { | ||
| 7381 | struct perf_cgroup *jc; | ||
| 7382 | |||
| 7383 | jc = kzalloc(sizeof(*jc), GFP_KERNEL); | ||
| 7384 | if (!jc) | ||
| 7385 | return ERR_PTR(-ENOMEM); | ||
| 7386 | |||
| 7387 | jc->info = alloc_percpu(struct perf_cgroup_info); | ||
| 7388 | if (!jc->info) { | ||
| 7389 | kfree(jc); | ||
| 7390 | return ERR_PTR(-ENOMEM); | ||
| 7391 | } | ||
| 7392 | |||
| 7393 | return &jc->css; | ||
| 7394 | } | ||
| 7395 | |||
| 7396 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | ||
| 7397 | struct cgroup *cont) | ||
| 7398 | { | ||
| 7399 | struct perf_cgroup *jc; | ||
| 7400 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | ||
| 7401 | struct perf_cgroup, css); | ||
| 7402 | free_percpu(jc->info); | ||
| 7403 | kfree(jc); | ||
| 7404 | } | ||
| 7405 | |||
| 7406 | static int __perf_cgroup_move(void *info) | ||
| 7407 | { | ||
| 7408 | struct task_struct *task = info; | ||
| 7409 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); | ||
| 7410 | return 0; | ||
| 7411 | } | ||
| 7412 | |||
| 7413 | static void perf_cgroup_move(struct task_struct *task) | ||
| 7414 | { | ||
| 7415 | task_function_call(task, __perf_cgroup_move, task); | ||
| 7416 | } | ||
| 7417 | |||
| 7418 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
| 7419 | struct cgroup *old_cgrp, struct task_struct *task, | ||
| 7420 | bool threadgroup) | ||
| 7421 | { | ||
| 7422 | perf_cgroup_move(task); | ||
| 7423 | if (threadgroup) { | ||
| 7424 | struct task_struct *c; | ||
| 7425 | rcu_read_lock(); | ||
| 7426 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
| 7427 | perf_cgroup_move(c); | ||
| 7428 | } | ||
| 7429 | rcu_read_unlock(); | ||
| 7430 | } | ||
| 7431 | } | ||
| 7432 | |||
| 7433 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
| 7434 | struct cgroup *old_cgrp, struct task_struct *task) | ||
| 7435 | { | ||
| 7436 | /* | ||
| 7437 | * cgroup_exit() is called in the copy_process() failure path. | ||
| 7438 | * Ignore this case since the task hasn't run yet; this avoids | ||
| 7439 | * trying to poke a half-freed task state from generic code. | ||
| 7440 | */ | ||
| 7441 | if (!(task->flags & PF_EXITING)) | ||
| 7442 | return; | ||
| 7443 | |||
| 7444 | perf_cgroup_move(task); | ||
| 7445 | } | ||
| 7446 | |||
| 7447 | struct cgroup_subsys perf_subsys = { | ||
| 7448 | .name = "perf_event", | ||
| 7449 | .subsys_id = perf_subsys_id, | ||
| 7450 | .create = perf_cgroup_create, | ||
| 7451 | .destroy = perf_cgroup_destroy, | ||
| 7452 | .exit = perf_cgroup_exit, | ||
| 7453 | .attach = perf_cgroup_attach, | ||
| 7454 | }; | ||
| 7455 | #endif /* CONFIG_CGROUP_PERF */ | ||
diff --git a/kernel/pid.c b/kernel/pid.c index 39b65b69584f..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
| 217 | return -1; | 217 | return -1; |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | int next_pidmap(struct pid_namespace *pid_ns, int last) | 220 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) |
| 221 | { | 221 | { |
| 222 | int offset; | 222 | int offset; |
| 223 | struct pidmap *map, *end; | 223 | struct pidmap *map, *end; |
| 224 | 224 | ||
| 225 | if (last >= PID_MAX_LIMIT) | ||
| 226 | return -1; | ||
| 227 | |||
| 225 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 228 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
| 226 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; | 229 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
| 227 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; | 230 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
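
The new check in next_pidmap() above rejects a last value at or beyond PID_MAX_LIMIT before it is turned into a pidmap index. The same guard over a toy bitmap, with hypothetical sizes:

#include <stdio.h>

#define ENTRIES        4
#define BITS_PER_ENTRY 8
#define MAX_ID         (ENTRIES * BITS_PER_ENTRY)

static unsigned char map[ENTRIES];

static int next_set(unsigned int last)
{
    if (last >= MAX_ID)                 /* the new range check */
        return -1;

    for (unsigned int id = last + 1; id < MAX_ID; id++) {
        if (map[id / BITS_PER_ENTRY] & (1u << (id % BITS_PER_ENTRY)))
            return (int)id;
    }
    return -1;
}

int main(void)
{
    map[2] |= 1u << 3;                  /* mark id 19 as used */
    printf("next after 5: %d\n", next_set(5));
    printf("next after 4000000000u: %d\n", next_set(4000000000u));
    return 0;
}
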
| @@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) | |||
| 435 | rcu_read_unlock(); | 438 | rcu_read_unlock(); |
| 436 | return pid; | 439 | return pid; |
| 437 | } | 440 | } |
| 441 | EXPORT_SYMBOL_GPL(get_task_pid); | ||
| 438 | 442 | ||
| 439 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) | 443 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) |
| 440 | { | 444 | { |
| @@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) | |||
| 446 | rcu_read_unlock(); | 450 | rcu_read_unlock(); |
| 447 | return result; | 451 | return result; |
| 448 | } | 452 | } |
| 453 | EXPORT_SYMBOL_GPL(get_pid_task); | ||
| 449 | 454 | ||
| 450 | struct pid *find_get_pid(pid_t nr) | 455 | struct pid *find_get_pid(pid_t nr) |
| 451 | { | 456 | { |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0b..e9c9adc84ca6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
| 15 | #include <linux/acct.h> | 15 | #include <linux/acct.h> |
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/proc_fs.h> | ||
| 17 | 18 | ||
| 18 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 19 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
| 19 | 20 | ||
| @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
| 72 | { | 73 | { |
| 73 | struct pid_namespace *ns; | 74 | struct pid_namespace *ns; |
| 74 | unsigned int level = parent_pid_ns->level + 1; | 75 | unsigned int level = parent_pid_ns->level + 1; |
| 75 | int i; | 76 | int i, err = -ENOMEM; |
| 76 | 77 | ||
| 77 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 78 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
| 78 | if (ns == NULL) | 79 | if (ns == NULL) |
| @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
| 96 | for (i = 1; i < PIDMAP_ENTRIES; i++) | 97 | for (i = 1; i < PIDMAP_ENTRIES; i++) |
| 97 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 98 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
| 98 | 99 | ||
| 100 | err = pid_ns_prepare_proc(ns); | ||
| 101 | if (err) | ||
| 102 | goto out_put_parent_pid_ns; | ||
| 103 | |||
| 99 | return ns; | 104 | return ns; |
| 100 | 105 | ||
| 106 | out_put_parent_pid_ns: | ||
| 107 | put_pid_ns(parent_pid_ns); | ||
| 101 | out_free_map: | 108 | out_free_map: |
| 102 | kfree(ns->pidmap[0].page); | 109 | kfree(ns->pidmap[0].page); |
| 103 | out_free: | 110 | out_free: |
| 104 | kmem_cache_free(pid_ns_cachep, ns); | 111 | kmem_cache_free(pid_ns_cachep, ns); |
| 105 | out: | 112 | out: |
| 106 | return ERR_PTR(-ENOMEM); | 113 | return ERR_PTR(err); |
| 107 | } | 114 | } |
| 108 | 115 | ||
| 109 | static void destroy_pid_namespace(struct pid_namespace *ns) | 116 | static void destroy_pid_namespace(struct pid_namespace *ns) |
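
create_pid_namespace() above gains an extra failure step (pid_ns_prepare_proc) and a matching out_put_parent_pid_ns label that drops the parent reference before falling through to the existing cleanups. A generic sketch of that staged goto-unwind pattern, with hypothetical resources standing in for the pidmap page and the cache object:

#include <stdio.h>
#include <stdlib.h>

struct thing { void *a, *b; };

/* Build up resources in order; on failure, unwind in reverse order. */
static struct thing *thing_create(int fail_late)
{
    struct thing *t = calloc(1, sizeof(*t));
    int err = -12;                      /* -ENOMEM by default */

    if (!t)
        goto out;
    t->a = malloc(16);
    if (!t->a)
        goto out_free_thing;
    t->b = malloc(16);
    if (!t->b)
        goto out_free_a;
    if (fail_late)                      /* e.g. a later prepare step failing */
        goto out_free_b;

    return t;                           /* fully constructed */

out_free_b:
    free(t->b);
out_free_a:
    free(t->a);
out_free_thing:
    free(t);
out:
    fprintf(stderr, "thing_create failed: %d\n", err);
    return NULL;
}

int main(void)
{
    struct thing *t = thing_create(0);

    if (t) {
        free(t->b);
        free(t->a);
        free(t);
    }
    return 0;
}
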
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f846821..0da058bff8eb 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
| @@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = { | |||
| 103 | 103 | ||
| 104 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 104 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
| 105 | size_t count, loff_t *f_pos); | 105 | size_t count, loff_t *f_pos); |
| 106 | static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | ||
| 107 | size_t count, loff_t *f_pos); | ||
| 106 | static int pm_qos_power_open(struct inode *inode, struct file *filp); | 108 | static int pm_qos_power_open(struct inode *inode, struct file *filp); |
| 107 | static int pm_qos_power_release(struct inode *inode, struct file *filp); | 109 | static int pm_qos_power_release(struct inode *inode, struct file *filp); |
| 108 | 110 | ||
| 109 | static const struct file_operations pm_qos_power_fops = { | 111 | static const struct file_operations pm_qos_power_fops = { |
| 110 | .write = pm_qos_power_write, | 112 | .write = pm_qos_power_write, |
| 113 | .read = pm_qos_power_read, | ||
| 111 | .open = pm_qos_power_open, | 114 | .open = pm_qos_power_open, |
| 112 | .release = pm_qos_power_release, | 115 | .release = pm_qos_power_release, |
| 113 | .llseek = noop_llseek, | 116 | .llseek = noop_llseek, |
| @@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) | |||
| 376 | } | 379 | } |
| 377 | 380 | ||
| 378 | 381 | ||
| 382 | static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | ||
| 383 | size_t count, loff_t *f_pos) | ||
| 384 | { | ||
| 385 | s32 value; | ||
| 386 | unsigned long flags; | ||
| 387 | struct pm_qos_object *o; | ||
| 388 | struct pm_qos_request_list *pm_qos_req = filp->private_data; | ||
| 389 | |||
| 390 | if (!pm_qos_req) | ||
| 391 | return -EINVAL; | ||
| 392 | if (!pm_qos_request_active(pm_qos_req)) | ||
| 393 | return -EINVAL; | ||
| 394 | |||
| 395 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | ||
| 396 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
| 397 | value = pm_qos_get_value(o); | ||
| 398 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
| 399 | |||
| 400 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); | ||
| 401 | } | ||
| 402 | |||
| 379 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 403 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
| 380 | size_t count, loff_t *f_pos) | 404 | size_t count, loff_t *f_pos) |
| 381 | { | 405 | { |
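
With the read() handler added above, a process that holds a pm_qos request open can read back the current aggregate target as a binary s32. A sketch of such a reader; the device name follows the usual pm_qos naming and is an assumption here:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int32_t value;
    int fd = open("/dev/cpu_dma_latency", O_RDWR);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* The read fop returns the aggregate target as a raw 32-bit value. */
    if (read(fd, &value, sizeof(value)) == sizeof(value))
        printf("current cpu_dma_latency target: %d us\n", value);

    close(fd);               /* dropping the fd also drops our request */
    return 0;
}
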
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..0791b13df7bf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) | |||
| 176 | return p->utime; | 176 | return p->utime; |
| 177 | } | 177 | } |
| 178 | 178 | ||
| 179 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | 179 | static int |
| 180 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
| 180 | { | 181 | { |
| 181 | int error = check_clock(which_clock); | 182 | int error = check_clock(which_clock); |
| 182 | if (!error) { | 183 | if (!error) { |
| @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
| 194 | return error; | 195 | return error; |
| 195 | } | 196 | } |
| 196 | 197 | ||
| 197 | int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | 198 | static int |
| 199 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | ||
| 198 | { | 200 | { |
| 199 | /* | 201 | /* |
| 200 | * You can never reset a CPU clock, but we check for other errors | 202 | * You can never reset a CPU clock, but we check for other errors |
| @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
| 317 | } | 319 | } |
| 318 | 320 | ||
| 319 | 321 | ||
| 320 | int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | 322 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
| 321 | { | 323 | { |
| 322 | const pid_t pid = CPUCLOCK_PID(which_clock); | 324 | const pid_t pid = CPUCLOCK_PID(which_clock); |
| 323 | int error = -EINVAL; | 325 | int error = -EINVAL; |
| @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
| 379 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the | 381 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the |
| 380 | * new timer already all-zeros initialized. | 382 | * new timer already all-zeros initialized. |
| 381 | */ | 383 | */ |
| 382 | int posix_cpu_timer_create(struct k_itimer *new_timer) | 384 | static int posix_cpu_timer_create(struct k_itimer *new_timer) |
| 383 | { | 385 | { |
| 384 | int ret = 0; | 386 | int ret = 0; |
| 385 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); | 387 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); |
| @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 425 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 427 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
| 426 | * and try again. (This happens when the timer is in the middle of firing.) | 428 | * and try again. (This happens when the timer is in the middle of firing.) |
| 427 | */ | 429 | */ |
| 428 | int posix_cpu_timer_del(struct k_itimer *timer) | 430 | static int posix_cpu_timer_del(struct k_itimer *timer) |
| 429 | { | 431 | { |
| 430 | struct task_struct *p = timer->it.cpu.task; | 432 | struct task_struct *p = timer->it.cpu.task; |
| 431 | int ret = 0; | 433 | int ret = 0; |
| @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
| 665 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 667 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
| 666 | * and try again. (This happens when the timer is in the middle of firing.) | 668 | * and try again. (This happens when the timer is in the middle of firing.) |
| 667 | */ | 669 | */ |
| 668 | int posix_cpu_timer_set(struct k_itimer *timer, int flags, | 670 | static int posix_cpu_timer_set(struct k_itimer *timer, int flags, |
| 669 | struct itimerspec *new, struct itimerspec *old) | 671 | struct itimerspec *new, struct itimerspec *old) |
| 670 | { | 672 | { |
| 671 | struct task_struct *p = timer->it.cpu.task; | 673 | struct task_struct *p = timer->it.cpu.task; |
| 672 | union cpu_time_count old_expires, new_expires, old_incr, val; | 674 | union cpu_time_count old_expires, new_expires, old_incr, val; |
| @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
| 820 | return ret; | 822 | return ret; |
| 821 | } | 823 | } |
| 822 | 824 | ||
| 823 | void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 825 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) |
| 824 | { | 826 | { |
| 825 | union cpu_time_count now; | 827 | union cpu_time_count now; |
| 826 | struct task_struct *p = timer->it.cpu.task; | 828 | struct task_struct *p = timer->it.cpu.task; |
| @@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
| 1345 | 1347 | ||
| 1346 | /* | 1348 | /* |
| 1347 | * Now that all the timers on our list have the firing flag, | 1349 | * Now that all the timers on our list have the firing flag, |
| 1348 | * noone will touch their list entries but us. We'll take | 1350 | * no one will touch their list entries but us. We'll take |
| 1349 | * each timer's lock before clearing its firing flag, so no | 1351 | * each timer's lock before clearing its firing flag, so no |
| 1350 | * timer call will interfere. | 1352 | * timer call will interfere. |
| 1351 | */ | 1353 | */ |
| @@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1481 | return error; | 1483 | return error; |
| 1482 | } | 1484 | } |
| 1483 | 1485 | ||
| 1484 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1486 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); |
| 1485 | struct timespec *rqtp, struct timespec __user *rmtp) | 1487 | |
| 1488 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | ||
| 1489 | struct timespec *rqtp, struct timespec __user *rmtp) | ||
| 1486 | { | 1490 | { |
| 1487 | struct restart_block *restart_block = | 1491 | struct restart_block *restart_block = |
| 1488 | &current_thread_info()->restart_block; | 1492 | &current_thread_info()->restart_block; |
| 1489 | struct itimerspec it; | 1493 | struct itimerspec it; |
| 1490 | int error; | 1494 | int error; |
| 1491 | 1495 | ||
| @@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1501 | 1505 | ||
| 1502 | if (error == -ERESTART_RESTARTBLOCK) { | 1506 | if (error == -ERESTART_RESTARTBLOCK) { |
| 1503 | 1507 | ||
| 1504 | if (flags & TIMER_ABSTIME) | 1508 | if (flags & TIMER_ABSTIME) |
| 1505 | return -ERESTARTNOHAND; | 1509 | return -ERESTARTNOHAND; |
| 1506 | /* | 1510 | /* |
| 1507 | * Report back to the user the time still remaining. | 1511 | * Report back to the user the time still remaining. |
| 1508 | */ | 1512 | */ |
| 1509 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1513 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
| 1510 | return -EFAULT; | 1514 | return -EFAULT; |
| 1511 | 1515 | ||
| 1512 | restart_block->fn = posix_cpu_nsleep_restart; | 1516 | restart_block->fn = posix_cpu_nsleep_restart; |
| 1513 | restart_block->arg0 = which_clock; | 1517 | restart_block->nanosleep.index = which_clock; |
| 1514 | restart_block->arg1 = (unsigned long) rmtp; | 1518 | restart_block->nanosleep.rmtp = rmtp; |
| 1515 | restart_block->arg2 = rqtp->tv_sec; | 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); |
| 1516 | restart_block->arg3 = rqtp->tv_nsec; | ||
| 1517 | } | 1520 | } |
| 1518 | return error; | 1521 | return error; |
| 1519 | } | 1522 | } |
| 1520 | 1523 | ||
| 1521 | long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
| 1522 | { | 1525 | { |
| 1523 | clockid_t which_clock = restart_block->arg0; | 1526 | clockid_t which_clock = restart_block->nanosleep.index; |
| 1524 | struct timespec __user *rmtp; | ||
| 1525 | struct timespec t; | 1527 | struct timespec t; |
| 1526 | struct itimerspec it; | 1528 | struct itimerspec it; |
| 1527 | int error; | 1529 | int error; |
| 1528 | 1530 | ||
| 1529 | rmtp = (struct timespec __user *) restart_block->arg1; | 1531 | t = ns_to_timespec(restart_block->nanosleep.expires); |
| 1530 | t.tv_sec = restart_block->arg2; | ||
| 1531 | t.tv_nsec = restart_block->arg3; | ||
| 1532 | 1532 | ||
| 1533 | restart_block->fn = do_no_restart_syscall; | ||
| 1534 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); | 1533 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); |
| 1535 | 1534 | ||
| 1536 | if (error == -ERESTART_RESTARTBLOCK) { | 1535 | if (error == -ERESTART_RESTARTBLOCK) { |
| 1536 | struct timespec __user *rmtp = restart_block->nanosleep.rmtp; | ||
| 1537 | /* | 1537 | /* |
| 1538 | * Report back to the user the time still remaining. | 1538 | * Report back to the user the time still remaining. |
| 1539 | */ | 1539 | */ |
| 1540 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1540 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
| 1541 | return -EFAULT; | 1541 | return -EFAULT; |
| 1542 | 1542 | ||
| 1543 | restart_block->fn = posix_cpu_nsleep_restart; | 1543 | restart_block->nanosleep.expires = timespec_to_ns(&t); |
| 1544 | restart_block->arg0 = which_clock; | ||
| 1545 | restart_block->arg1 = (unsigned long) rmtp; | ||
| 1546 | restart_block->arg2 = t.tv_sec; | ||
| 1547 | restart_block->arg3 = t.tv_nsec; | ||
| 1548 | } | 1544 | } |
| 1549 | return error; | 1545 | return error; |
| 1550 | 1546 | ||
| 1551 | } | 1547 | } |
| 1552 | 1548 | ||
| 1553 | |||
| 1554 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1549 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
| 1555 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1550 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
| 1556 | 1551 | ||
| @@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
| 1594 | timer->it_clock = THREAD_CLOCK; | 1589 | timer->it_clock = THREAD_CLOCK; |
| 1595 | return posix_cpu_timer_create(timer); | 1590 | return posix_cpu_timer_create(timer); |
| 1596 | } | 1591 | } |
| 1597 | static int thread_cpu_nsleep(const clockid_t which_clock, int flags, | 1592 | |
| 1598 | struct timespec *rqtp, struct timespec __user *rmtp) | 1593 | struct k_clock clock_posix_cpu = { |
| 1599 | { | 1594 | .clock_getres = posix_cpu_clock_getres, |
| 1600 | return -EINVAL; | 1595 | .clock_set = posix_cpu_clock_set, |
| 1601 | } | 1596 | .clock_get = posix_cpu_clock_get, |
| 1602 | static long thread_cpu_nsleep_restart(struct restart_block *restart_block) | 1597 | .timer_create = posix_cpu_timer_create, |
| 1603 | { | 1598 | .nsleep = posix_cpu_nsleep, |
| 1604 | return -EINVAL; | 1599 | .nsleep_restart = posix_cpu_nsleep_restart, |
| 1605 | } | 1600 | .timer_set = posix_cpu_timer_set, |
| 1601 | .timer_del = posix_cpu_timer_del, | ||
| 1602 | .timer_get = posix_cpu_timer_get, | ||
| 1603 | }; | ||
| 1606 | 1604 | ||
| 1607 | static __init int init_posix_cpu_timers(void) | 1605 | static __init int init_posix_cpu_timers(void) |
| 1608 | { | 1606 | { |
| 1609 | struct k_clock process = { | 1607 | struct k_clock process = { |
| 1610 | .clock_getres = process_cpu_clock_getres, | 1608 | .clock_getres = process_cpu_clock_getres, |
| 1611 | .clock_get = process_cpu_clock_get, | 1609 | .clock_get = process_cpu_clock_get, |
| 1612 | .clock_set = do_posix_clock_nosettime, | 1610 | .timer_create = process_cpu_timer_create, |
| 1613 | .timer_create = process_cpu_timer_create, | 1611 | .nsleep = process_cpu_nsleep, |
| 1614 | .nsleep = process_cpu_nsleep, | 1612 | .nsleep_restart = process_cpu_nsleep_restart, |
| 1615 | .nsleep_restart = process_cpu_nsleep_restart, | ||
| 1616 | }; | 1613 | }; |
| 1617 | struct k_clock thread = { | 1614 | struct k_clock thread = { |
| 1618 | .clock_getres = thread_cpu_clock_getres, | 1615 | .clock_getres = thread_cpu_clock_getres, |
| 1619 | .clock_get = thread_cpu_clock_get, | 1616 | .clock_get = thread_cpu_clock_get, |
| 1620 | .clock_set = do_posix_clock_nosettime, | 1617 | .timer_create = thread_cpu_timer_create, |
| 1621 | .timer_create = thread_cpu_timer_create, | ||
| 1622 | .nsleep = thread_cpu_nsleep, | ||
| 1623 | .nsleep_restart = thread_cpu_nsleep_restart, | ||
| 1624 | }; | 1618 | }; |
| 1625 | struct timespec ts; | 1619 | struct timespec ts; |
| 1626 | 1620 | ||
| 1627 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | 1621 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); |
| 1628 | register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | 1622 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); |
| 1629 | 1623 | ||
| 1630 | cputime_to_timespec(cputime_one_jiffy, &ts); | 1624 | cputime_to_timespec(cputime_one_jiffy, &ts); |
| 1631 | onecputick = ts.tv_nsec; | 1625 | onecputick = ts.tv_nsec; |
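
The posix-cpu-timers changes above fold the per-function hooks into a single struct k_clock clock_posix_cpu table and replace the old arg0..arg3 restart bookkeeping with one nanosecond value in restart_block->nanosleep.expires. A minimal userspace sketch of that packing (illustrative helpers only; the kernel uses timespec_to_ns()/ns_to_timespec()):

/*
 * Round-trip a timespec through a single 64-bit nanosecond value, the
 * representation now kept in restart_block->nanosleep.expires.
 * ts_to_ns()/ns_to_ts() are illustrative stand-ins, not kernel code.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

static int64_t ts_to_ns(const struct timespec *ts)
{
	return (int64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}

static struct timespec ns_to_ts(int64_t ns)
{
	struct timespec ts = {
		.tv_sec  = ns / NSEC_PER_SEC,
		.tv_nsec = ns % NSEC_PER_SEC,
	};
	return ts;
}

int main(void)
{
	struct timespec req = { .tv_sec = 12, .tv_nsec = 345678901 };
	int64_t packed = ts_to_ns(&req);	/* what lands in nanosleep.expires */
	struct timespec back = ns_to_ts(packed);

	printf("packed=%lld ns, back=%ld.%09ld s\n",
	       (long long)packed, (long)back.tv_sec, back.tv_nsec);
	return 0;
}
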
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..e5498d7405c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/init.h> | 41 | #include <linux/init.h> |
| 42 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
| 43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
| 44 | #include <linux/posix-clock.h> | ||
| 44 | #include <linux/posix-timers.h> | 45 | #include <linux/posix-timers.h> |
| 45 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
| 46 | #include <linux/wait.h> | 47 | #include <linux/wait.h> |
| @@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); | |||
| 81 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" | 82 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" |
| 82 | #endif | 83 | #endif |
| 83 | 84 | ||
| 85 | /* | ||
| 86 | * parisc wants ENOTSUP instead of EOPNOTSUPP | ||
| 87 | */ | ||
| 88 | #ifndef ENOTSUP | ||
| 89 | # define ENANOSLEEP_NOTSUP EOPNOTSUPP | ||
| 90 | #else | ||
| 91 | # define ENANOSLEEP_NOTSUP ENOTSUP | ||
| 92 | #endif | ||
| 84 | 93 | ||
| 85 | /* | 94 | /* |
| 86 | * The timer ID is turned into a timer address by idr_find(). | 95 | * The timer ID is turned into a timer address by idr_find(). |
| @@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); | |||
| 94 | /* | 103 | /* |
| 95 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us | 104 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us |
| 96 | * to implement others. This structure defines the various | 105 | * to implement others. This structure defines the various |
| 97 | * clocks and allows the possibility of adding others. We | 106 | * clocks. |
| 98 | * provide an interface to add clocks to the table and expect | ||
| 99 | * the "arch" code to add at least one clock that is high | ||
| 100 | * resolution. Here we define the standard CLOCK_REALTIME as a | ||
| 101 | * 1/HZ resolution clock. | ||
| 102 | * | 107 | * |
| 103 | * RESOLUTION: Clock resolution is used to round up timer and interval | 108 | * RESOLUTION: Clock resolution is used to round up timer and interval |
| 104 | * times, NOT to report clock times, which are reported with as | 109 | * times, NOT to report clock times, which are reported with as |
| @@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); | |||
| 108 | * necessary code is written. The standard says we should say | 113 | * necessary code is written. The standard says we should say |
| 109 | * something about this issue in the documentation... | 114 | * something about this issue in the documentation... |
| 110 | * | 115 | * |
| 111 | * FUNCTIONS: The CLOCKs structure defines possible functions to handle | 116 | * FUNCTIONS: The CLOCKs structure defines possible functions to |
| 112 | * various clock functions. For clocks that use the standard | 117 | * handle various clock functions. |
| 113 | * system timer code these entries should be NULL. This will | ||
| 114 | * allow dispatch without the overhead of indirect function | ||
| 115 | * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) | ||
| 116 | * must supply functions here, even if the function just returns | ||
| 117 | * ENOSYS. The standard POSIX timer management code assumes the | ||
| 118 | * following: 1.) The k_itimer struct (sched.h) is used for the | ||
| 119 | * timer. 2.) The list, it_lock, it_clock, it_id and it_pid | ||
| 120 | * fields are not modified by timer code. | ||
| 121 | * | 118 | * |
| 122 | * At this time all functions EXCEPT clock_nanosleep can be | 119 | * The standard POSIX timer management code assumes the |
| 123 | * redirected by the CLOCKS structure. Clock_nanosleep is in | 120 | * following: 1.) The k_itimer struct (sched.h) is used for |
| 124 | * there, but the code ignores it. | 121 | * the timer. 2.) The list, it_lock, it_clock, it_id and |
| 122 | * it_pid fields are not modified by timer code. | ||
| 125 | * | 123 | * |
| 126 | * Permissions: It is assumed that the clock_settime() function defined | 124 | * Permissions: It is assumed that the clock_settime() function defined |
| 127 | * for each clock will take care of permission checks. Some | 125 | * for each clock will take care of permission checks. Some |
| @@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; | |||
| 138 | */ | 136 | */ |
| 139 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, | 137 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, |
| 140 | struct timespec __user *rmtp); | 138 | struct timespec __user *rmtp); |
| 139 | static int common_timer_create(struct k_itimer *new_timer); | ||
| 141 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | 140 | static void common_timer_get(struct k_itimer *, struct itimerspec *); |
| 142 | static int common_timer_set(struct k_itimer *, int, | 141 | static int common_timer_set(struct k_itimer *, int, |
| 143 | struct itimerspec *, struct itimerspec *); | 142 | struct itimerspec *, struct itimerspec *); |
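
The forward declaration added here prepares for the next hunk, which drops the CLOCK_DISPATCH() macro in favour of direct calls through a per-clock k_clock table, where an unset method simply means "not supported". A standalone toy model of that lookup-and-check pattern (simplified types and illustrative names, not the kernel structures):

/*
 * Toy model of clockid -> ops-table dispatch with per-method NULL checks,
 * the shape that replaces CLOCK_DISPATCH() in this series.
 */
#include <stdio.h>
#include <errno.h>

struct ops {
	int (*clock_get)(int id);
	int (*timer_create)(int id);
};

static int demo_get(int id)    { printf("get clock %d\n", id); return 0; }
static int demo_create(int id) { printf("create timer on clock %d\n", id); return 0; }

static struct ops clocks[2] = {
	[0] = { .clock_get = demo_get, .timer_create = demo_create },
	[1] = { .clock_get = demo_get },	/* no timer_create, like MONOTONIC_RAW */
};

static struct ops *to_ops(int id)
{
	if (id < 0 || id >= 2 || !clocks[id].clock_get)
		return NULL;			/* unknown clock id */
	return &clocks[id];
}

int main(void)
{
	struct ops *kc = to_ops(1);

	if (!kc)
		return EINVAL;
	if (!kc->timer_create) {
		fprintf(stderr, "timer_create unsupported (EOPNOTSUPP)\n");
		return EOPNOTSUPP;
	}
	return kc->timer_create(1);
}
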
| @@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | |||
| 158 | spin_unlock_irqrestore(&timr->it_lock, flags); | 157 | spin_unlock_irqrestore(&timr->it_lock, flags); |
| 159 | } | 158 | } |
| 160 | 159 | ||
| 161 | /* | 160 | /* Get clock_realtime */ |
| 162 | * Call the k_clock hook function if non-null, or the default function. | 161 | static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) |
| 163 | */ | ||
| 164 | #define CLOCK_DISPATCH(clock, call, arglist) \ | ||
| 165 | ((clock) < 0 ? posix_cpu_##call arglist : \ | ||
| 166 | (posix_clocks[clock].call != NULL \ | ||
| 167 | ? (*posix_clocks[clock].call) arglist : common_##call arglist)) | ||
| 168 | |||
| 169 | /* | ||
| 170 | * Default clock hook functions when the struct k_clock passed | ||
| 171 | * to register_posix_clock leaves a function pointer null. | ||
| 172 | * | ||
| 173 | * The function common_CALL is the default implementation for | ||
| 174 | * the function pointer CALL in struct k_clock. | ||
| 175 | */ | ||
| 176 | |||
| 177 | static inline int common_clock_getres(const clockid_t which_clock, | ||
| 178 | struct timespec *tp) | ||
| 179 | { | ||
| 180 | tp->tv_sec = 0; | ||
| 181 | tp->tv_nsec = posix_clocks[which_clock].res; | ||
| 182 | return 0; | ||
| 183 | } | ||
| 184 | |||
| 185 | /* | ||
| 186 | * Get real time for posix timers | ||
| 187 | */ | ||
| 188 | static int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
| 189 | { | 162 | { |
| 190 | ktime_get_real_ts(tp); | 163 | ktime_get_real_ts(tp); |
| 191 | return 0; | 164 | return 0; |
| 192 | } | 165 | } |
| 193 | 166 | ||
| 194 | static inline int common_clock_set(const clockid_t which_clock, | 167 | /* Set clock_realtime */ |
| 195 | struct timespec *tp) | 168 | static int posix_clock_realtime_set(const clockid_t which_clock, |
| 169 | const struct timespec *tp) | ||
| 196 | { | 170 | { |
| 197 | return do_sys_settimeofday(tp, NULL); | 171 | return do_sys_settimeofday(tp, NULL); |
| 198 | } | 172 | } |
| 199 | 173 | ||
| 200 | static int common_timer_create(struct k_itimer *new_timer) | 174 | static int posix_clock_realtime_adj(const clockid_t which_clock, |
| 201 | { | 175 | struct timex *t) |
| 202 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
| 203 | return 0; | ||
| 204 | } | ||
| 205 | |||
| 206 | static int no_timer_create(struct k_itimer *new_timer) | ||
| 207 | { | ||
| 208 | return -EOPNOTSUPP; | ||
| 209 | } | ||
| 210 | |||
| 211 | static int no_nsleep(const clockid_t which_clock, int flags, | ||
| 212 | struct timespec *tsave, struct timespec __user *rmtp) | ||
| 213 | { | ||
| 214 | return -EOPNOTSUPP; | ||
| 215 | } | ||
| 216 | |||
| 217 | /* | ||
| 218 | * Return nonzero if we know a priori this clockid_t value is bogus. | ||
| 219 | */ | ||
| 220 | static inline int invalid_clockid(const clockid_t which_clock) | ||
| 221 | { | 176 | { |
| 222 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | 177 | return do_adjtimex(t); |
| 223 | return 0; | ||
| 224 | if ((unsigned) which_clock >= MAX_CLOCKS) | ||
| 225 | return 1; | ||
| 226 | if (posix_clocks[which_clock].clock_getres != NULL) | ||
| 227 | return 0; | ||
| 228 | if (posix_clocks[which_clock].res != 0) | ||
| 229 | return 0; | ||
| 230 | return 1; | ||
| 231 | } | 178 | } |
| 232 | 179 | ||
| 233 | /* | 180 | /* |
| @@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | |||
| 240 | } | 187 | } |
| 241 | 188 | ||
| 242 | /* | 189 | /* |
| 243 | * Get monotonic time for posix timers | 190 | * Get monotonic-raw time for posix timers |
| 244 | */ | 191 | */ |
| 245 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) | 192 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) |
| 246 | { | 193 | { |
| @@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp | |||
| 267 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 214 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
| 268 | return 0; | 215 | return 0; |
| 269 | } | 216 | } |
| 217 | |||
| 218 | static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) | ||
| 219 | { | ||
| 220 | get_monotonic_boottime(tp); | ||
| 221 | return 0; | ||
| 222 | } | ||
| 223 | |||
| 224 | |||
| 270 | /* | 225 | /* |
| 271 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 226 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
| 272 | */ | 227 | */ |
| 273 | static __init int init_posix_timers(void) | 228 | static __init int init_posix_timers(void) |
| 274 | { | 229 | { |
| 275 | struct k_clock clock_realtime = { | 230 | struct k_clock clock_realtime = { |
| 276 | .clock_getres = hrtimer_get_res, | 231 | .clock_getres = hrtimer_get_res, |
| 232 | .clock_get = posix_clock_realtime_get, | ||
| 233 | .clock_set = posix_clock_realtime_set, | ||
| 234 | .clock_adj = posix_clock_realtime_adj, | ||
| 235 | .nsleep = common_nsleep, | ||
| 236 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 237 | .timer_create = common_timer_create, | ||
| 238 | .timer_set = common_timer_set, | ||
| 239 | .timer_get = common_timer_get, | ||
| 240 | .timer_del = common_timer_del, | ||
| 277 | }; | 241 | }; |
| 278 | struct k_clock clock_monotonic = { | 242 | struct k_clock clock_monotonic = { |
| 279 | .clock_getres = hrtimer_get_res, | 243 | .clock_getres = hrtimer_get_res, |
| 280 | .clock_get = posix_ktime_get_ts, | 244 | .clock_get = posix_ktime_get_ts, |
| 281 | .clock_set = do_posix_clock_nosettime, | 245 | .nsleep = common_nsleep, |
| 246 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 247 | .timer_create = common_timer_create, | ||
| 248 | .timer_set = common_timer_set, | ||
| 249 | .timer_get = common_timer_get, | ||
| 250 | .timer_del = common_timer_del, | ||
| 282 | }; | 251 | }; |
| 283 | struct k_clock clock_monotonic_raw = { | 252 | struct k_clock clock_monotonic_raw = { |
| 284 | .clock_getres = hrtimer_get_res, | 253 | .clock_getres = hrtimer_get_res, |
| 285 | .clock_get = posix_get_monotonic_raw, | 254 | .clock_get = posix_get_monotonic_raw, |
| 286 | .clock_set = do_posix_clock_nosettime, | ||
| 287 | .timer_create = no_timer_create, | ||
| 288 | .nsleep = no_nsleep, | ||
| 289 | }; | 255 | }; |
| 290 | struct k_clock clock_realtime_coarse = { | 256 | struct k_clock clock_realtime_coarse = { |
| 291 | .clock_getres = posix_get_coarse_res, | 257 | .clock_getres = posix_get_coarse_res, |
| 292 | .clock_get = posix_get_realtime_coarse, | 258 | .clock_get = posix_get_realtime_coarse, |
| 293 | .clock_set = do_posix_clock_nosettime, | ||
| 294 | .timer_create = no_timer_create, | ||
| 295 | .nsleep = no_nsleep, | ||
| 296 | }; | 259 | }; |
| 297 | struct k_clock clock_monotonic_coarse = { | 260 | struct k_clock clock_monotonic_coarse = { |
| 298 | .clock_getres = posix_get_coarse_res, | 261 | .clock_getres = posix_get_coarse_res, |
| 299 | .clock_get = posix_get_monotonic_coarse, | 262 | .clock_get = posix_get_monotonic_coarse, |
| 300 | .clock_set = do_posix_clock_nosettime, | 263 | }; |
| 301 | .timer_create = no_timer_create, | 264 | struct k_clock clock_boottime = { |
| 302 | .nsleep = no_nsleep, | 265 | .clock_getres = hrtimer_get_res, |
| 266 | .clock_get = posix_get_boottime, | ||
| 267 | .nsleep = common_nsleep, | ||
| 268 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 269 | .timer_create = common_timer_create, | ||
| 270 | .timer_set = common_timer_set, | ||
| 271 | .timer_get = common_timer_get, | ||
| 272 | .timer_del = common_timer_del, | ||
| 303 | }; | 273 | }; |
| 304 | 274 | ||
| 305 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | 275 | posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); |
| 306 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | 276 | posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); |
| 307 | register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); | 277 | posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); |
| 308 | register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); | 278 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); |
| 309 | register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); | 279 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); |
| 280 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); | ||
| 310 | 281 | ||
| 311 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 282 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
| 312 | sizeof (struct k_itimer), 0, SLAB_PANIC, | 283 | sizeof (struct k_itimer), 0, SLAB_PANIC, |
| @@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) | |||
| 342 | * restarted (i.e. we have flagged this in the sys_private entry of the | 313 | * restarted (i.e. we have flagged this in the sys_private entry of the |
| 343 | * info block). | 314 | * info block). |
| 344 | * | 315 | * |
| 345 | * To protect aginst the timer going away while the interrupt is queued, | 316 | * To protect against the timer going away while the interrupt is queued, |
| 346 | * we require that the it_requeue_pending flag be set. | 317 | * we require that the it_requeue_pending flag be set. |
| 347 | */ | 318 | */ |
| 348 | void do_schedule_next_timer(struct siginfo *info) | 319 | void do_schedule_next_timer(struct siginfo *info) |
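
The init_posix_timers() hunk above now fills in complete k_clock tables and registers the new CLOCK_BOOTTIME clock, which behaves like CLOCK_MONOTONIC but keeps counting across suspend. A quick userspace check (link with -lrt on older glibc; the fallback define is only for headers that predate the constant):

/*
 * Compare CLOCK_MONOTONIC with the new CLOCK_BOOTTIME; the difference is
 * roughly the time the machine has spent suspended.
 */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7		/* established clockid value */
#endif

int main(void)
{
	struct timespec mono, boot;

	if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
	    clock_gettime(CLOCK_BOOTTIME, &boot)) {
		perror("clock_gettime");	/* EINVAL on kernels without it */
		return 1;
	}
	printf("monotonic: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %ld.%09ld\n", (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}
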
| @@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) | |||
| 482 | return task_pid(rtn); | 453 | return task_pid(rtn); |
| 483 | } | 454 | } |
| 484 | 455 | ||
| 485 | void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) | 456 | void posix_timers_register_clock(const clockid_t clock_id, |
| 457 | struct k_clock *new_clock) | ||
| 486 | { | 458 | { |
| 487 | if ((unsigned) clock_id >= MAX_CLOCKS) { | 459 | if ((unsigned) clock_id >= MAX_CLOCKS) { |
| 488 | printk("POSIX clock register failed for clock_id %d\n", | 460 | printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", |
| 461 | clock_id); | ||
| 462 | return; | ||
| 463 | } | ||
| 464 | |||
| 465 | if (!new_clock->clock_get) { | ||
| 466 | printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", | ||
| 467 | clock_id); | ||
| 468 | return; | ||
| 469 | } | ||
| 470 | if (!new_clock->clock_getres) { | ||
| 471 | printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", | ||
| 489 | clock_id); | 472 | clock_id); |
| 490 | return; | 473 | return; |
| 491 | } | 474 | } |
| 492 | 475 | ||
| 493 | posix_clocks[clock_id] = *new_clock; | 476 | posix_clocks[clock_id] = *new_clock; |
| 494 | } | 477 | } |
| 495 | EXPORT_SYMBOL_GPL(register_posix_clock); | 478 | EXPORT_SYMBOL_GPL(posix_timers_register_clock); |
| 496 | 479 | ||
| 497 | static struct k_itimer * alloc_posix_timer(void) | 480 | static struct k_itimer * alloc_posix_timer(void) |
| 498 | { | 481 | { |
| @@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
| 523 | kmem_cache_free(posix_timers_cache, tmr); | 506 | kmem_cache_free(posix_timers_cache, tmr); |
| 524 | } | 507 | } |
| 525 | 508 | ||
| 509 | static struct k_clock *clockid_to_kclock(const clockid_t id) | ||
| 510 | { | ||
| 511 | if (id < 0) | ||
| 512 | return (id & CLOCKFD_MASK) == CLOCKFD ? | ||
| 513 | &clock_posix_dynamic : &clock_posix_cpu; | ||
| 514 | |||
| 515 | if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) | ||
| 516 | return NULL; | ||
| 517 | return &posix_clocks[id]; | ||
| 518 | } | ||
| 519 | |||
| 520 | static int common_timer_create(struct k_itimer *new_timer) | ||
| 521 | { | ||
| 522 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
| 523 | return 0; | ||
| 524 | } | ||
| 525 | |||
| 526 | /* Create a POSIX.1b interval timer. */ | 526 | /* Create a POSIX.1b interval timer. */ |
| 527 | 527 | ||
| 528 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | 528 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, |
| 529 | struct sigevent __user *, timer_event_spec, | 529 | struct sigevent __user *, timer_event_spec, |
| 530 | timer_t __user *, created_timer_id) | 530 | timer_t __user *, created_timer_id) |
| 531 | { | 531 | { |
| 532 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 532 | struct k_itimer *new_timer; | 533 | struct k_itimer *new_timer; |
| 533 | int error, new_timer_id; | 534 | int error, new_timer_id; |
| 534 | sigevent_t event; | 535 | sigevent_t event; |
| 535 | int it_id_set = IT_ID_NOT_SET; | 536 | int it_id_set = IT_ID_NOT_SET; |
| 536 | 537 | ||
| 537 | if (invalid_clockid(which_clock)) | 538 | if (!kc) |
| 538 | return -EINVAL; | 539 | return -EINVAL; |
| 540 | if (!kc->timer_create) | ||
| 541 | return -EOPNOTSUPP; | ||
| 539 | 542 | ||
| 540 | new_timer = alloc_posix_timer(); | 543 | new_timer = alloc_posix_timer(); |
| 541 | if (unlikely(!new_timer)) | 544 | if (unlikely(!new_timer)) |
| @@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 597 | goto out; | 600 | goto out; |
| 598 | } | 601 | } |
| 599 | 602 | ||
| 600 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 603 | error = kc->timer_create(new_timer); |
| 601 | if (error) | 604 | if (error) |
| 602 | goto out; | 605 | goto out; |
| 603 | 606 | ||
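
With clockid_to_kclock() in place, timer_create() now rejects unknown clock ids with EINVAL and clocks that lack a timer_create method with EOPNOTSUPP, instead of going through invalid_clockid() plus per-clock stub functions. A userspace probe of that behaviour (link with -lrt; the errno values are what this code returns, other kernel versions may differ):

/*
 * Try to create timers on a clock with timer support (CLOCK_MONOTONIC)
 * and one without (CLOCK_MONOTONIC_RAW) and report the result.
 */
#include <stdio.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <time.h>

static void try_clock(clockid_t clk, const char *name)
{
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	timer_t id;

	if (timer_create(clk, &sev, &id) == 0) {
		printf("%s: timer created\n", name);
		timer_delete(id);
	} else {
		printf("%s: %s\n", name, strerror(errno));
	}
}

int main(void)
{
	try_clock(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
	try_clock(CLOCK_MONOTONIC_RAW, "CLOCK_MONOTONIC_RAW");
	return 0;
}
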
| @@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 607 | spin_unlock_irq(&current->sighand->siglock); | 610 | spin_unlock_irq(&current->sighand->siglock); |
| 608 | 611 | ||
| 609 | return 0; | 612 | return 0; |
| 610 | /* | 613 | /* |
| 611 | * In the case of the timer belonging to another task, after | 614 | * In the case of the timer belonging to another task, after |
| 612 | * the task is unlocked, the timer is owned by the other task | 615 | * the task is unlocked, the timer is owned by the other task |
| 613 | * and may cease to exist at any time. Don't use or modify | 616 | * and may cease to exist at any time. Don't use or modify |
| @@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | |||
| 709 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 712 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
| 710 | struct itimerspec __user *, setting) | 713 | struct itimerspec __user *, setting) |
| 711 | { | 714 | { |
| 712 | struct k_itimer *timr; | ||
| 713 | struct itimerspec cur_setting; | 715 | struct itimerspec cur_setting; |
| 716 | struct k_itimer *timr; | ||
| 717 | struct k_clock *kc; | ||
| 714 | unsigned long flags; | 718 | unsigned long flags; |
| 719 | int ret = 0; | ||
| 715 | 720 | ||
| 716 | timr = lock_timer(timer_id, &flags); | 721 | timr = lock_timer(timer_id, &flags); |
| 717 | if (!timr) | 722 | if (!timr) |
| 718 | return -EINVAL; | 723 | return -EINVAL; |
| 719 | 724 | ||
| 720 | CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); | 725 | kc = clockid_to_kclock(timr->it_clock); |
| 726 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) | ||
| 727 | ret = -EINVAL; | ||
| 728 | else | ||
| 729 | kc->timer_get(timr, &cur_setting); | ||
| 721 | 730 | ||
| 722 | unlock_timer(timr, flags); | 731 | unlock_timer(timr, flags); |
| 723 | 732 | ||
| 724 | if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | 733 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) |
| 725 | return -EFAULT; | 734 | return -EFAULT; |
| 726 | 735 | ||
| 727 | return 0; | 736 | return ret; |
| 728 | } | 737 | } |
| 729 | 738 | ||
| 730 | /* | 739 | /* |
| @@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
| 813 | int error = 0; | 822 | int error = 0; |
| 814 | unsigned long flag; | 823 | unsigned long flag; |
| 815 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | 824 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; |
| 825 | struct k_clock *kc; | ||
| 816 | 826 | ||
| 817 | if (!new_setting) | 827 | if (!new_setting) |
| 818 | return -EINVAL; | 828 | return -EINVAL; |
| @@ -828,8 +838,11 @@ retry: | |||
| 828 | if (!timr) | 838 | if (!timr) |
| 829 | return -EINVAL; | 839 | return -EINVAL; |
| 830 | 840 | ||
| 831 | error = CLOCK_DISPATCH(timr->it_clock, timer_set, | 841 | kc = clockid_to_kclock(timr->it_clock); |
| 832 | (timr, flags, &new_spec, rtn)); | 842 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
| 843 | error = -EINVAL; | ||
| 844 | else | ||
| 845 | error = kc->timer_set(timr, flags, &new_spec, rtn); | ||
| 833 | 846 | ||
| 834 | unlock_timer(timr, flag); | 847 | unlock_timer(timr, flag); |
| 835 | if (error == TIMER_RETRY) { | 848 | if (error == TIMER_RETRY) { |
| @@ -844,7 +857,7 @@ retry: | |||
| 844 | return error; | 857 | return error; |
| 845 | } | 858 | } |
| 846 | 859 | ||
| 847 | static inline int common_timer_del(struct k_itimer *timer) | 860 | static int common_timer_del(struct k_itimer *timer) |
| 848 | { | 861 | { |
| 849 | timer->it.real.interval.tv64 = 0; | 862 | timer->it.real.interval.tv64 = 0; |
| 850 | 863 | ||
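
The timer_gettime() and timer_settime() hunks above swap CLOCK_DISPATCH() for calls through kc->timer_get()/kc->timer_set(), with WARN_ON_ONCE() guarding clocks that should never reach this path. The userspace contract is unchanged; a minimal arm-and-read-back example (link with -lrt):

/*
 * Create a CLOCK_MONOTONIC timer, arm it, and read the remaining time
 * back with timer_gettime().
 */
#include <stdio.h>
#include <signal.h>
#include <time.h>

int main(void)
{
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec its = {
		.it_value    = { .tv_sec = 5 },	/* first expiry in 5s */
		.it_interval = { .tv_sec = 1 },	/* then every second */
	};
	struct itimerspec cur;
	timer_t id;

	if (timer_create(CLOCK_MONOTONIC, &sev, &id) ||
	    timer_settime(id, 0, &its, NULL) ||
	    timer_gettime(id, &cur)) {
		perror("posix timer");
		return 1;
	}
	printf("expires in %ld.%09ld s\n",
	       (long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
	timer_delete(id);
	return 0;
}
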
| @@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer) | |||
| 855 | 868 | ||
| 856 | static inline int timer_delete_hook(struct k_itimer *timer) | 869 | static inline int timer_delete_hook(struct k_itimer *timer) |
| 857 | { | 870 | { |
| 858 | return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); | 871 | struct k_clock *kc = clockid_to_kclock(timer->it_clock); |
| 872 | |||
| 873 | if (WARN_ON_ONCE(!kc || !kc->timer_del)) | ||
| 874 | return -EINVAL; | ||
| 875 | return kc->timer_del(timer); | ||
| 859 | } | 876 | } |
| 860 | 877 | ||
| 861 | /* Delete a POSIX.1b interval timer. */ | 878 | /* Delete a POSIX.1b interval timer. */ |
| @@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig) | |||
| 927 | } | 944 | } |
| 928 | } | 945 | } |
| 929 | 946 | ||
| 930 | /* Not available / possible... functions */ | ||
| 931 | int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) | ||
| 932 | { | ||
| 933 | return -EINVAL; | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); | ||
| 936 | |||
| 937 | int do_posix_clock_nonanosleep(const clockid_t clock, int flags, | ||
| 938 | struct timespec *t, struct timespec __user *r) | ||
| 939 | { | ||
| 940 | #ifndef ENOTSUP | ||
| 941 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | ||
| 942 | #else /* parisc does define it separately. */ | ||
| 943 | return -ENOTSUP; | ||
| 944 | #endif | ||
| 945 | } | ||
| 946 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | ||
| 947 | |||
| 948 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 947 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
| 949 | const struct timespec __user *, tp) | 948 | const struct timespec __user *, tp) |
| 950 | { | 949 | { |
| 950 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 951 | struct timespec new_tp; | 951 | struct timespec new_tp; |
| 952 | 952 | ||
| 953 | if (invalid_clockid(which_clock)) | 953 | if (!kc || !kc->clock_set) |
| 954 | return -EINVAL; | 954 | return -EINVAL; |
| 955 | |||
| 955 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 956 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) |
| 956 | return -EFAULT; | 957 | return -EFAULT; |
| 957 | 958 | ||
| 958 | return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); | 959 | return kc->clock_set(which_clock, &new_tp); |
| 959 | } | 960 | } |
| 960 | 961 | ||
| 961 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 962 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
| 962 | struct timespec __user *,tp) | 963 | struct timespec __user *,tp) |
| 963 | { | 964 | { |
| 965 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 964 | struct timespec kernel_tp; | 966 | struct timespec kernel_tp; |
| 965 | int error; | 967 | int error; |
| 966 | 968 | ||
| 967 | if (invalid_clockid(which_clock)) | 969 | if (!kc) |
| 968 | return -EINVAL; | 970 | return -EINVAL; |
| 969 | error = CLOCK_DISPATCH(which_clock, clock_get, | 971 | |
| 970 | (which_clock, &kernel_tp)); | 972 | error = kc->clock_get(which_clock, &kernel_tp); |
| 973 | |||
| 971 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 974 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) |
| 972 | error = -EFAULT; | 975 | error = -EFAULT; |
| 973 | 976 | ||
| 974 | return error; | 977 | return error; |
| 978 | } | ||
| 979 | |||
| 980 | SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | ||
| 981 | struct timex __user *, utx) | ||
| 982 | { | ||
| 983 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 984 | struct timex ktx; | ||
| 985 | int err; | ||
| 986 | |||
| 987 | if (!kc) | ||
| 988 | return -EINVAL; | ||
| 989 | if (!kc->clock_adj) | ||
| 990 | return -EOPNOTSUPP; | ||
| 991 | |||
| 992 | if (copy_from_user(&ktx, utx, sizeof(ktx))) | ||
| 993 | return -EFAULT; | ||
| 994 | |||
| 995 | err = kc->clock_adj(which_clock, &ktx); | ||
| 996 | |||
| 997 | if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) | ||
| 998 | return -EFAULT; | ||
| 975 | 999 | ||
| 1000 | return err; | ||
| 976 | } | 1001 | } |
| 977 | 1002 | ||
| 978 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | 1003 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, |
| 979 | struct timespec __user *, tp) | 1004 | struct timespec __user *, tp) |
| 980 | { | 1005 | { |
| 1006 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 981 | struct timespec rtn_tp; | 1007 | struct timespec rtn_tp; |
| 982 | int error; | 1008 | int error; |
| 983 | 1009 | ||
| 984 | if (invalid_clockid(which_clock)) | 1010 | if (!kc) |
| 985 | return -EINVAL; | 1011 | return -EINVAL; |
| 986 | 1012 | ||
| 987 | error = CLOCK_DISPATCH(which_clock, clock_getres, | 1013 | error = kc->clock_getres(which_clock, &rtn_tp); |
| 988 | (which_clock, &rtn_tp)); | ||
| 989 | 1014 | ||
| 990 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { | 1015 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) |
| 991 | error = -EFAULT; | 1016 | error = -EFAULT; |
| 992 | } | ||
| 993 | 1017 | ||
| 994 | return error; | 1018 | return error; |
| 995 | } | 1019 | } |
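
The new sys_clock_adjtime() above gives each clock an optional clock_adj() method: CLOCK_REALTIME routes to do_adjtimex(), dynamic clocks registered through the new posix-clock layer can supply their own, and anything else returns EOPNOTSUPP. A userspace sketch, assuming the libc has no wrapper yet and the headers expose __NR_clock_adjtime (query-only call, no adjustment made):

/*
 * Query CLOCK_REALTIME through the new clock_adjtime() syscall with
 * modes = 0, i.e. report state without changing anything.
 */
#include <stdio.h>
#include <string.h>
#include <sys/timex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

int main(void)
{
#ifdef __NR_clock_adjtime
	struct timex tx;
	long ret;

	memset(&tx, 0, sizeof(tx));
	tx.modes = 0;				/* no changes, query only */

	ret = syscall(__NR_clock_adjtime, CLOCK_REALTIME, &tx);
	if (ret < 0) {
		perror("clock_adjtime");
		return 1;
	}
	printf("state=%ld freq=%ld offset=%ld\n", ret, (long)tx.freq, (long)tx.offset);
#else
	fprintf(stderr, "__NR_clock_adjtime not defined in these headers\n");
#endif
	return 0;
}
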
| @@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 1009 | const struct timespec __user *, rqtp, | 1033 | const struct timespec __user *, rqtp, |
| 1010 | struct timespec __user *, rmtp) | 1034 | struct timespec __user *, rmtp) |
| 1011 | { | 1035 | { |
| 1036 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1012 | struct timespec t; | 1037 | struct timespec t; |
| 1013 | 1038 | ||
| 1014 | if (invalid_clockid(which_clock)) | 1039 | if (!kc) |
| 1015 | return -EINVAL; | 1040 | return -EINVAL; |
| 1041 | if (!kc->nsleep) | ||
| 1042 | return -ENANOSLEEP_NOTSUP; | ||
| 1016 | 1043 | ||
| 1017 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1044 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
| 1018 | return -EFAULT; | 1045 | return -EFAULT; |
| @@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 1020 | if (!timespec_valid(&t)) | 1047 | if (!timespec_valid(&t)) |
| 1021 | return -EINVAL; | 1048 | return -EINVAL; |
| 1022 | 1049 | ||
| 1023 | return CLOCK_DISPATCH(which_clock, nsleep, | 1050 | return kc->nsleep(which_clock, flags, &t, rmtp); |
| 1024 | (which_clock, flags, &t, rmtp)); | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | /* | ||
| 1028 | * nanosleep_restart for monotonic and realtime clocks | ||
| 1029 | */ | ||
| 1030 | static int common_nsleep_restart(struct restart_block *restart_block) | ||
| 1031 | { | ||
| 1032 | return hrtimer_nanosleep_restart(restart_block); | ||
| 1033 | } | 1051 | } |
| 1034 | 1052 | ||
| 1035 | /* | 1053 | /* |
| 1036 | * This will restart clock_nanosleep. This is required only by | 1054 | * This will restart clock_nanosleep. This is required only by |
| 1037 | * compat_clock_nanosleep_restart for now. | 1055 | * compat_clock_nanosleep_restart for now. |
| 1038 | */ | 1056 | */ |
| 1039 | long | 1057 | long clock_nanosleep_restart(struct restart_block *restart_block) |
| 1040 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
| 1041 | { | 1058 | { |
| 1042 | clockid_t which_clock = restart_block->arg0; | 1059 | clockid_t which_clock = restart_block->nanosleep.index; |
| 1060 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1061 | |||
| 1062 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | ||
| 1063 | return -EINVAL; | ||
| 1043 | 1064 | ||
| 1044 | return CLOCK_DISPATCH(which_clock, nsleep_restart, | 1065 | return kc->nsleep_restart(restart_block); |
| 1045 | (restart_block)); | ||
| 1046 | } | 1066 | } |
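
clock_nanosleep() now looks the k_clock up once, returns the ENANOSLEEP_NOTSUP value defined at the top of the file when the clock has no nsleep method, and clock_nanosleep_restart() simply forwards to the clock's nsleep_restart handler. From userspace nothing changes; a typical absolute-deadline sleep follows (note that clock_nanosleep() returns the error number directly instead of setting errno; link with -lrt on older glibc):

/*
 * Sleep until an absolute CLOCK_MONOTONIC deadline, retrying if a signal
 * interrupts the sleep; with TIMER_ABSTIME the deadline stays valid.
 */
#include <errno.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec deadline;
	int err;

	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += 2;			/* now + 2 seconds */

	do {
		err = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
				      &deadline, NULL);
	} while (err == EINTR);

	if (err)
		fprintf(stderr, "clock_nanosleep: %d\n", err);
	return err ? 1 : 0;
}
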
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 265729966ece..6de9a8fc3417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -1,125 +1,12 @@ | |||
| 1 | config PM | ||
| 2 | bool "Power Management support" | ||
| 3 | depends on !IA64_HP_SIM | ||
| 4 | ---help--- | ||
| 5 | "Power Management" means that parts of your computer are shut | ||
| 6 | off or put into a power conserving "sleep" mode if they are not | ||
| 7 | being used. There are two competing standards for doing this: APM | ||
| 8 | and ACPI. If you want to use either one, say Y here and then also | ||
| 9 | to the requisite support below. | ||
| 10 | |||
| 11 | Power Management is most important for battery powered laptop | ||
| 12 | computers; if you have a laptop, check out the Linux Laptop home | ||
| 13 | page on the WWW at <http://www.linux-on-laptops.com/> or | ||
| 14 | Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> | ||
| 15 | and the Battery Powered Linux mini-HOWTO, available from | ||
| 16 | <http://www.tldp.org/docs.html#howto>. | ||
| 17 | |||
| 18 | Note that, even if you say N here, Linux on the x86 architecture | ||
| 19 | will issue the hlt instruction if nothing is to be done, thereby | ||
| 20 | sending the processor to sleep and saving power. | ||
| 21 | |||
| 22 | config PM_DEBUG | ||
| 23 | bool "Power Management Debug Support" | ||
| 24 | depends on PM | ||
| 25 | ---help--- | ||
| 26 | This option enables various debugging support in the Power Management | ||
| 27 | code. This is helpful when debugging and reporting PM bugs, like | ||
| 28 | suspend support. | ||
| 29 | |||
| 30 | config PM_ADVANCED_DEBUG | ||
| 31 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
| 32 | depends on PM_DEBUG | ||
| 33 | default n | ||
| 34 | ---help--- | ||
| 35 | Add extra sysfs attributes allowing one to access some Power Management | ||
| 36 | fields of device objects from user space. If you are not a kernel | ||
| 37 | developer interested in debugging/testing Power Management, say "no". | ||
| 38 | |||
| 39 | config PM_VERBOSE | ||
| 40 | bool "Verbose Power Management debugging" | ||
| 41 | depends on PM_DEBUG | ||
| 42 | default n | ||
| 43 | ---help--- | ||
| 44 | This option enables verbose messages from the Power Management code. | ||
| 45 | |||
| 46 | config CAN_PM_TRACE | ||
| 47 | def_bool y | ||
| 48 | depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL | ||
| 49 | |||
| 50 | config PM_TRACE | ||
| 51 | bool | ||
| 52 | help | ||
| 53 | This enables code to save the last PM event point across | ||
| 54 | reboot. The architecture needs to support this, x86 for | ||
| 55 | example does by saving things in the RTC, see below. | ||
| 56 | |||
| 57 | The architecture specific code must provide the extern | ||
| 58 | functions from <linux/resume-trace.h> as well as the | ||
| 59 | <asm/resume-trace.h> header with a TRACE_RESUME() macro. | ||
| 60 | |||
| 61 | The way the information is presented is architecture- | ||
| 62 | dependent, x86 will print the information during a | ||
| 63 | late_initcall. | ||
| 64 | |||
| 65 | config PM_TRACE_RTC | ||
| 66 | bool "Suspend/resume event tracing" | ||
| 67 | depends on CAN_PM_TRACE | ||
| 68 | depends on X86 | ||
| 69 | select PM_TRACE | ||
| 70 | default n | ||
| 71 | ---help--- | ||
| 72 | This enables some cheesy code to save the last PM event point in the | ||
| 73 | RTC across reboots, so that you can debug a machine that just hangs | ||
| 74 | during suspend (or more commonly, during resume). | ||
| 75 | |||
| 76 | To use this debugging feature you should attempt to suspend the | ||
| 77 | machine, reboot it and then run | ||
| 78 | |||
| 79 | dmesg -s 1000000 | grep 'hash matches' | ||
| 80 | |||
| 81 | CAUTION: this option will cause your machine's real-time clock to be | ||
| 82 | set to an invalid time after a resume. | ||
| 83 | |||
| 84 | config PM_SLEEP_SMP | ||
| 85 | bool | ||
| 86 | depends on SMP | ||
| 87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | ||
| 88 | depends on PM_SLEEP | ||
| 89 | select HOTPLUG | ||
| 90 | select HOTPLUG_CPU | ||
| 91 | default y | ||
| 92 | |||
| 93 | config PM_SLEEP | ||
| 94 | bool | ||
| 95 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | ||
| 96 | default y | ||
| 97 | |||
| 98 | config PM_SLEEP_ADVANCED_DEBUG | ||
| 99 | bool | ||
| 100 | depends on PM_ADVANCED_DEBUG | ||
| 101 | default n | ||
| 102 | |||
| 103 | config SUSPEND | 1 | config SUSPEND |
| 104 | bool "Suspend to RAM and standby" | 2 | bool "Suspend to RAM and standby" |
| 105 | depends on PM && ARCH_SUSPEND_POSSIBLE | 3 | depends on ARCH_SUSPEND_POSSIBLE |
| 106 | default y | 4 | default y |
| 107 | ---help--- | 5 | ---help--- |
| 108 | Allow the system to enter sleep states in which main memory is | 6 | Allow the system to enter sleep states in which main memory is |
| 109 | powered and thus its contents are preserved, such as the | 7 | powered and thus its contents are preserved, such as the |
| 110 | suspend-to-RAM state (e.g. the ACPI S3 state). | 8 | suspend-to-RAM state (e.g. the ACPI S3 state). |
| 111 | 9 | ||
| 112 | config PM_TEST_SUSPEND | ||
| 113 | bool "Test suspend/resume and wakealarm during bootup" | ||
| 114 | depends on SUSPEND && PM_DEBUG && RTC_CLASS=y | ||
| 115 | ---help--- | ||
| 116 | This option will let you suspend your machine during bootup, and | ||
| 117 | make it wake up a few seconds later using an RTC wakeup alarm. | ||
| 118 | Enable this with a kernel parameter like "test_suspend=mem". | ||
| 119 | |||
| 120 | You probably want to have your system's RTC driver statically | ||
| 121 | linked, ensuring that it's available when this test runs. | ||
| 122 | |||
| 123 | config SUSPEND_FREEZER | 10 | config SUSPEND_FREEZER |
| 124 | bool "Enable freezer for suspend to RAM/standby" \ | 11 | bool "Enable freezer for suspend to RAM/standby" \ |
| 125 | if ARCH_WANTS_FREEZER_CONTROL || BROKEN | 12 | if ARCH_WANTS_FREEZER_CONTROL || BROKEN |
| @@ -131,9 +18,13 @@ config SUSPEND_FREEZER | |||
| 131 | 18 | ||
| 132 | Turning OFF this setting is NOT recommended! If in doubt, say Y. | 19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
| 133 | 20 | ||
| 21 | config HIBERNATE_CALLBACKS | ||
| 22 | bool | ||
| 23 | |||
| 134 | config HIBERNATION | 24 | config HIBERNATION |
| 135 | bool "Hibernation (aka 'suspend to disk')" | 25 | bool "Hibernation (aka 'suspend to disk')" |
| 136 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 26 | depends on SWAP && ARCH_HIBERNATION_POSSIBLE |
| 27 | select HIBERNATE_CALLBACKS | ||
| 137 | select LZO_COMPRESS | 28 | select LZO_COMPRESS |
| 138 | select LZO_DECOMPRESS | 29 | select LZO_DECOMPRESS |
| 139 | ---help--- | 30 | ---help--- |
| @@ -196,6 +87,106 @@ config PM_STD_PARTITION | |||
| 196 | suspended image to. It will simply pick the first available swap | 87 | suspended image to. It will simply pick the first available swap |
| 197 | device. | 88 | device. |
| 198 | 89 | ||
| 90 | config PM_SLEEP | ||
| 91 | def_bool y | ||
| 92 | depends on SUSPEND || HIBERNATE_CALLBACKS | ||
| 93 | |||
| 94 | config PM_SLEEP_SMP | ||
| 95 | def_bool y | ||
| 96 | depends on SMP | ||
| 97 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | ||
| 98 | depends on PM_SLEEP | ||
| 99 | select HOTPLUG | ||
| 100 | select HOTPLUG_CPU | ||
| 101 | |||
| 102 | config PM_RUNTIME | ||
| 103 | bool "Run-time PM core functionality" | ||
| 104 | depends on !IA64_HP_SIM | ||
| 105 | ---help--- | ||
| 106 | Enable functionality allowing I/O devices to be put into energy-saving | ||
| 107 | (low power) states at run time (or autosuspended) after a specified | ||
| 108 | period of inactivity and woken up in response to a hardware-generated | ||
| 109 | wake-up event or a driver's request. | ||
| 110 | |||
| 111 | Hardware support is generally required for this functionality to work | ||
| 112 | and the bus type drivers of the buses the devices are on are | ||
| 113 | responsible for the actual handling of the autosuspend requests and | ||
| 114 | wake-up events. | ||
| 115 | |||
| 116 | config PM | ||
| 117 | def_bool y | ||
| 118 | depends on PM_SLEEP || PM_RUNTIME | ||
| 119 | |||
| 120 | config PM_DEBUG | ||
| 121 | bool "Power Management Debug Support" | ||
| 122 | depends on PM | ||
| 123 | ---help--- | ||
| 124 | This option enables various debugging support in the Power Management | ||
| 125 | code. This is helpful when debugging and reporting PM bugs, like | ||
| 126 | suspend support. | ||
| 127 | |||
| 128 | config PM_VERBOSE | ||
| 129 | bool "Verbose Power Management debugging" | ||
| 130 | depends on PM_DEBUG | ||
| 131 | ---help--- | ||
| 132 | This option enables verbose messages from the Power Management code. | ||
| 133 | |||
| 134 | config PM_ADVANCED_DEBUG | ||
| 135 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
| 136 | depends on PM_DEBUG | ||
| 137 | ---help--- | ||
| 138 | Add extra sysfs attributes allowing one to access some Power Management | ||
| 139 | fields of device objects from user space. If you are not a kernel | ||
| 140 | developer interested in debugging/testing Power Management, say "no". | ||
| 141 | |||
| 142 | config PM_TEST_SUSPEND | ||
| 143 | bool "Test suspend/resume and wakealarm during bootup" | ||
| 144 | depends on SUSPEND && PM_DEBUG && RTC_CLASS=y | ||
| 145 | ---help--- | ||
| 146 | This option will let you suspend your machine during bootup, and | ||
| 147 | make it wake up a few seconds later using an RTC wakeup alarm. | ||
| 148 | Enable this with a kernel parameter like "test_suspend=mem". | ||
| 149 | |||
| 150 | You probably want to have your system's RTC driver statically | ||
| 151 | linked, ensuring that it's available when this test runs. | ||
| 152 | |||
| 153 | config CAN_PM_TRACE | ||
| 154 | def_bool y | ||
| 155 | depends on PM_DEBUG && PM_SLEEP | ||
| 156 | |||
| 157 | config PM_TRACE | ||
| 158 | bool | ||
| 159 | help | ||
| 160 | This enables code to save the last PM event point across | ||
| 161 | reboot. The architecture needs to support this, x86 for | ||
| 162 | example does by saving things in the RTC, see below. | ||
| 163 | |||
| 164 | The architecture specific code must provide the extern | ||
| 165 | functions from <linux/resume-trace.h> as well as the | ||
| 166 | <asm/resume-trace.h> header with a TRACE_RESUME() macro. | ||
| 167 | |||
| 168 | The way the information is presented is architecture- | ||
| 169 | dependent, x86 will print the information during a | ||
| 170 | late_initcall. | ||
| 171 | |||
| 172 | config PM_TRACE_RTC | ||
| 173 | bool "Suspend/resume event tracing" | ||
| 174 | depends on CAN_PM_TRACE | ||
| 175 | depends on X86 | ||
| 176 | select PM_TRACE | ||
| 177 | ---help--- | ||
| 178 | This enables some cheesy code to save the last PM event point in the | ||
| 179 | RTC across reboots, so that you can debug a machine that just hangs | ||
| 180 | during suspend (or more commonly, during resume). | ||
| 181 | |||
| 182 | To use this debugging feature you should attempt to suspend the | ||
| 183 | machine, reboot it and then run | ||
| 184 | |||
| 185 | dmesg -s 1000000 | grep 'hash matches' | ||
| 186 | |||
| 187 | CAUTION: this option will cause your machine's real-time clock to be | ||
| 188 | set to an invalid time after a resume. | ||
| 189 | |||
| 199 | config APM_EMULATION | 190 | config APM_EMULATION |
| 200 | tristate "Advanced Power Management Emulation" | 191 | tristate "Advanced Power Management Emulation" |
| 201 | depends on PM && SYS_SUPPORTS_APM_EMULATION | 192 | depends on PM && SYS_SUPPORTS_APM_EMULATION |
| @@ -222,31 +213,11 @@ config APM_EMULATION | |||
| 222 | anything, try disabling/enabling this option (or disabling/enabling | 213 | anything, try disabling/enabling this option (or disabling/enabling |
| 223 | APM in your BIOS). | 214 | APM in your BIOS). |
| 224 | 215 | ||
| 225 | config PM_RUNTIME | ||
| 226 | bool "Run-time PM core functionality" | ||
| 227 | depends on PM | ||
| 228 | ---help--- | ||
| 229 | Enable functionality allowing I/O devices to be put into energy-saving | ||
| 230 | (low power) states at run time (or autosuspended) after a specified | ||
| 231 | period of inactivity and woken up in response to a hardware-generated | ||
| 232 | wake-up event or a driver's request. | ||
| 233 | |||
| 234 | Hardware support is generally required for this functionality to work | ||
| 235 | and the bus type drivers of the buses the devices are on are | ||
| 236 | responsible for the actual handling of the autosuspend requests and | ||
| 237 | wake-up events. | ||
| 238 | |||
| 239 | config PM_OPS | ||
| 240 | bool | ||
| 241 | depends on PM_SLEEP || PM_RUNTIME | ||
| 242 | default y | ||
| 243 | |||
| 244 | config ARCH_HAS_OPP | 216 | config ARCH_HAS_OPP |
| 245 | bool | 217 | bool |
| 246 | 218 | ||
| 247 | config PM_OPP | 219 | config PM_OPP |
| 248 | bool "Operating Performance Point (OPP) Layer library" | 220 | bool "Operating Performance Point (OPP) Layer library" |
| 249 | depends on PM | ||
| 250 | depends on ARCH_HAS_OPP | 221 | depends on ARCH_HAS_OPP |
| 251 | ---help--- | 222 | ---help--- |
| 252 | SOCs have a standard set of tuples consisting of frequency and | 223 | SOCs have a standard set of tuples consisting of frequency and |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18b53e3..c5ebc6a90643 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 1 | |
| 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | ||
| 2 | 3 | ||
| 3 | obj-$(CONFIG_PM) += main.o | 4 | obj-$(CONFIG_PM) += main.o |
| 4 | obj-$(CONFIG_PM_SLEEP) += console.o | 5 | obj-$(CONFIG_PM_SLEEP) += console.o |
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | static int submit(int rw, struct block_device *bdev, sector_t sector, | 28 | static int submit(int rw, struct block_device *bdev, sector_t sector, |
| 29 | struct page *page, struct bio **bio_chain) | 29 | struct page *page, struct bio **bio_chain) |
| 30 | { | 30 | { |
| 31 | const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; | 31 | const int bio_rw = rw | REQ_SYNC; |
| 32 | struct bio *bio; | 32 | struct bio *bio; |
| 33 | 33 | ||
| 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1832bd264219..50aae660174d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
| 24 | #include <linux/freezer.h> | 24 | #include <linux/freezer.h> |
| 25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
| 26 | #include <linux/syscore_ops.h> | ||
| 26 | #include <scsi/scsi_scan.h> | 27 | #include <scsi/scsi_scan.h> |
| 27 | #include <asm/suspend.h> | 28 | #include <asm/suspend.h> |
| 28 | 29 | ||
| @@ -272,6 +273,11 @@ static int create_image(int platform_mode) | |||
| 272 | local_irq_disable(); | 273 | local_irq_disable(); |
| 273 | 274 | ||
| 274 | error = sysdev_suspend(PMSG_FREEZE); | 275 | error = sysdev_suspend(PMSG_FREEZE); |
| 276 | if (!error) { | ||
| 277 | error = syscore_suspend(); | ||
| 278 | if (error) | ||
| 279 | sysdev_resume(); | ||
| 280 | } | ||
| 275 | if (error) { | 281 | if (error) { |
| 276 | printk(KERN_ERR "PM: Some system devices failed to power down, " | 282 | printk(KERN_ERR "PM: Some system devices failed to power down, " |
| 277 | "aborting hibernation\n"); | 283 | "aborting hibernation\n"); |
| @@ -295,6 +301,7 @@ static int create_image(int platform_mode) | |||
| 295 | } | 301 | } |
| 296 | 302 | ||
| 297 | Power_up: | 303 | Power_up: |
| 304 | syscore_resume(); | ||
| 298 | sysdev_resume(); | 305 | sysdev_resume(); |
| 299 | /* NOTE: dpm_resume_noirq() is just a resume() for devices | 306 | /* NOTE: dpm_resume_noirq() is just a resume() for devices |
| 300 | * that suspended with irqs off ... no overall powerup. | 307 | * that suspended with irqs off ... no overall powerup. |
| @@ -403,6 +410,11 @@ static int resume_target_kernel(bool platform_mode) | |||
| 403 | local_irq_disable(); | 410 | local_irq_disable(); |
| 404 | 411 | ||
| 405 | error = sysdev_suspend(PMSG_QUIESCE); | 412 | error = sysdev_suspend(PMSG_QUIESCE); |
| 413 | if (!error) { | ||
| 414 | error = syscore_suspend(); | ||
| 415 | if (error) | ||
| 416 | sysdev_resume(); | ||
| 417 | } | ||
| 406 | if (error) | 418 | if (error) |
| 407 | goto Enable_irqs; | 419 | goto Enable_irqs; |
| 408 | 420 | ||
| @@ -429,6 +441,7 @@ static int resume_target_kernel(bool platform_mode) | |||
| 429 | restore_processor_state(); | 441 | restore_processor_state(); |
| 430 | touch_softlockup_watchdog(); | 442 | touch_softlockup_watchdog(); |
| 431 | 443 | ||
| 444 | syscore_resume(); | ||
| 432 | sysdev_resume(); | 445 | sysdev_resume(); |
| 433 | 446 | ||
| 434 | Enable_irqs: | 447 | Enable_irqs: |
| @@ -516,6 +529,7 @@ int hibernation_platform_enter(void) | |||
| 516 | 529 | ||
| 517 | local_irq_disable(); | 530 | local_irq_disable(); |
| 518 | sysdev_suspend(PMSG_HIBERNATE); | 531 | sysdev_suspend(PMSG_HIBERNATE); |
| 532 | syscore_suspend(); | ||
| 519 | if (pm_wakeup_pending()) { | 533 | if (pm_wakeup_pending()) { |
| 520 | error = -EAGAIN; | 534 | error = -EAGAIN; |
| 521 | goto Power_up; | 535 | goto Power_up; |
| @@ -526,6 +540,7 @@ int hibernation_platform_enter(void) | |||
| 526 | while (1); | 540 | while (1); |
| 527 | 541 | ||
| 528 | Power_up: | 542 | Power_up: |
| 543 | syscore_resume(); | ||
| 529 | sysdev_resume(); | 544 | sysdev_resume(); |
| 530 | local_irq_enable(); | 545 | local_irq_enable(); |
| 531 | enable_nonboot_cpus(); | 546 | enable_nonboot_cpus(); |
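
Both create_image() and resume_target_kernel() now call syscore_suspend() right after sysdev_suspend() and undo it with sysdev_resume() if it fails, so syscore callbacks run at the very last stage, on one CPU with interrupts disabled. A hedged, illustrative sketch of a syscore user (not part of this patch; the example_* names are made up):

/*
 * Illustrative kernel-side sketch: register callbacks that run in the
 * syscore phase exercised by the hibernation paths above.
 */
#include <linux/syscore_ops.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int example_syscore_suspend(void)
{
	pr_debug("example: last-stage suspend\n");
	return 0;			/* non-zero aborts and triggers rollback */
}

static void example_syscore_resume(void)
{
	pr_debug("example: first-stage resume\n");
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_syscore_suspend,
	.resume  = example_syscore_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}
core_initcall(example_init);
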
diff --git a/kernel/power/main.c b/kernel/power/main.c index 701853042c28..de9aef8742f4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -17,9 +17,6 @@ | |||
| 17 | 17 | ||
| 18 | DEFINE_MUTEX(pm_mutex); | 18 | DEFINE_MUTEX(pm_mutex); |
| 19 | 19 | ||
| 20 | unsigned int pm_flags; | ||
| 21 | EXPORT_SYMBOL(pm_flags); | ||
| 22 | |||
| 23 | #ifdef CONFIG_PM_SLEEP | 20 | #ifdef CONFIG_PM_SLEEP |
| 24 | 21 | ||
| 25 | /* Routines for PM-transition notifications */ | 22 | /* Routines for PM-transition notifications */ |
| @@ -227,7 +224,7 @@ power_attr(state); | |||
| 227 | * writing to 'state'. It first should read from 'wakeup_count' and store | 224 | * writing to 'state'. It first should read from 'wakeup_count' and store |
| 228 | * the read value. Then, after carrying out its own preparations for the system | 225 | * the read value. Then, after carrying out its own preparations for the system |
| 229 | * transition to a sleep state, it should write the stored value to | 226 | * transition to a sleep state, it should write the stored value to |
| 230 | * 'wakeup_count'. If that fails, at least one wakeup event has occured since | 227 | * 'wakeup_count'. If that fails, at least one wakeup event has occurred since |
| 231 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it | 228 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it |
| 232 | * is allowed to write to 'state', but the transition will be aborted if there | 229 | * is allowed to write to 'state', but the transition will be aborted if there |
| 233 | * are any wakeup events detected after 'wakeup_count' was written to. | 230 | * are any wakeup events detected after 'wakeup_count' was written to. |
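The wakeup_count comment above describes a userspace handshake: read /sys/power/wakeup_count, do your own suspend preparation, write the value back, and only then write to /sys/power/state; a failed write-back means a wakeup event arrived in between. A rough sketch of that sequence (error handling trimmed, and the "mem" target is only an example):

    #include <stdio.h>

    int main(void)
    {
            char count[64];
            FILE *f = fopen("/sys/power/wakeup_count", "r");

            if (!f || !fgets(count, sizeof(count), f))
                    return 1;
            fclose(f);

            /* ... carry out userspace suspend preparations here ... */

            f = fopen("/sys/power/wakeup_count", "w");
            if (!f || fputs(count, f) < 0 || fclose(f) != 0)
                    return 1;       /* a wakeup event occurred: abort */

            f = fopen("/sys/power/state", "w");
            if (!f || fputs("mem\n", f) < 0 || fclose(f) != 0)
                    return 1;
            return 0;
    }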
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 64db648ff911..ca0aacc24874 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *); | |||
| 42 | 42 | ||
| 43 | /* | 43 | /* |
| 44 | * Preferred image size in bytes (tunable via /sys/power/image_size). | 44 | * Preferred image size in bytes (tunable via /sys/power/image_size). |
| 45 | * When it is set to N, swsusp will do its best to ensure the image | 45 | * When it is set to N, the image creating code will do its best to |
| 46 | * size will not exceed N bytes, but if that is impossible, it will | 46 | * ensure the image size will not exceed N bytes, but if that is |
| 47 | * try to create the smallest image possible. | 47 | * impossible, it will try to create the smallest image possible. |
| 48 | */ | 48 | */ |
| 49 | unsigned long image_size; | 49 | unsigned long image_size; |
| 50 | 50 | ||
| 51 | void __init hibernate_image_size_init(void) | 51 | void __init hibernate_image_size_init(void) |
| 52 | { | 52 | { |
| 53 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | 53 | image_size = (totalram_pages / 3) * PAGE_SIZE; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | /* List of PBEs needed for restoring the pages that were allocated before | 56 | /* List of PBEs needed for restoring the pages that were allocated before |
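For a sense of scale (assuming 4 KiB pages and 4 GiB of RAM, i.e. totalram_pages = 1048576): the old default preferred image size was ((1048576 * 2) / 5) * 4096, roughly 1.6 GiB, while the new default is (1048576 / 3) * 4096, roughly 1.33 GiB, so hibernation now aims for a somewhat smaller image out of the box; writing /sys/power/image_size still overrides either value.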
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index de6f86bfa303..8935369d503a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
| 25 | #include <linux/syscore_ops.h> | ||
| 25 | #include <trace/events/power.h> | 26 | #include <trace/events/power.h> |
| 26 | 27 | ||
| 27 | #include "power.h" | 28 | #include "power.h" |
| @@ -164,10 +165,16 @@ static int suspend_enter(suspend_state_t state) | |||
| 164 | 165 | ||
| 165 | error = sysdev_suspend(PMSG_SUSPEND); | 166 | error = sysdev_suspend(PMSG_SUSPEND); |
| 166 | if (!error) { | 167 | if (!error) { |
| 168 | error = syscore_suspend(); | ||
| 169 | if (error) | ||
| 170 | sysdev_resume(); | ||
| 171 | } | ||
| 172 | if (!error) { | ||
| 167 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { | 173 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
| 168 | error = suspend_ops->enter(state); | 174 | error = suspend_ops->enter(state); |
| 169 | events_check_enabled = false; | 175 | events_check_enabled = false; |
| 170 | } | 176 | } |
| 177 | syscore_resume(); | ||
| 171 | sysdev_resume(); | 178 | sysdev_resume(); |
| 172 | } | 179 | } |
| 173 | 180 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 36231525e22f..da8ca817eae3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
| 53 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 53 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
| 54 | 54 | ||
| 55 | /* printk's without a loglevel use this.. */ | 55 | /* printk's without a loglevel use this.. */ |
| 56 | #define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ | 56 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
| 57 | 57 | ||
| 58 | /* We show everything that is MORE important than this.. */ | 58 | /* We show everything that is MORE important than this.. */ |
| 59 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | 59 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ |
| @@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol | |||
| 113 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | 113 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ |
| 114 | 114 | ||
| 115 | /* | 115 | /* |
| 116 | * If exclusive_console is non-NULL then only this console is to be printed to. | ||
| 117 | */ | ||
| 118 | static struct console *exclusive_console; | ||
| 119 | |||
| 120 | /* | ||
| 116 | * Array of consoles built from command line options (console=) | 121 | * Array of consoles built from command line options (console=) |
| 117 | */ | 122 | */ |
| 118 | struct console_cmdline | 123 | struct console_cmdline |
| @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) | |||
| 476 | struct console *con; | 481 | struct console *con; |
| 477 | 482 | ||
| 478 | for_each_console(con) { | 483 | for_each_console(con) { |
| 484 | if (exclusive_console && con != exclusive_console) | ||
| 485 | continue; | ||
| 479 | if ((con->flags & CON_ENABLED) && con->write && | 486 | if ((con->flags & CON_ENABLED) && con->write && |
| 480 | (cpu_online(smp_processor_id()) || | 487 | (cpu_online(smp_processor_id()) || |
| 481 | (con->flags & CON_ANYTIME))) | 488 | (con->flags & CON_ANYTIME))) |
| @@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start, | |||
| 515 | } | 522 | } |
| 516 | 523 | ||
| 517 | /* | 524 | /* |
| 525 | * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit number; | ||
| 526 | * the lower 3 bits are the log level, the rest is the log facility. In case | ||
| 527 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
| 528 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
| 529 | * to extract the correct log level for in-kernel processing, and not mangle | ||
| 530 | * the original value. | ||
| 531 | * | ||
| 532 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
| 533 | * passed, it will be filled in with the log level without a possible facility | ||
| 534 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
| 535 | * and returned. If no valid header is found, 0 is returned and the passed | ||
| 536 | * variables are not touched. | ||
| 537 | */ | ||
| 538 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
| 539 | { | ||
| 540 | unsigned int lev = 0; | ||
| 541 | char sp = '\0'; | ||
| 542 | size_t len; | ||
| 543 | |||
| 544 | if (p[0] != '<' || !p[1]) | ||
| 545 | return 0; | ||
| 546 | if (p[2] == '>') { | ||
| 547 | /* usual single digit level number or special char */ | ||
| 548 | switch (p[1]) { | ||
| 549 | case '0' ... '7': | ||
| 550 | lev = p[1] - '0'; | ||
| 551 | break; | ||
| 552 | case 'c': /* KERN_CONT */ | ||
| 553 | case 'd': /* KERN_DEFAULT */ | ||
| 554 | sp = p[1]; | ||
| 555 | break; | ||
| 556 | default: | ||
| 557 | return 0; | ||
| 558 | } | ||
| 559 | len = 3; | ||
| 560 | } else { | ||
| 561 | /* multi digit including the level and facility number */ | ||
| 562 | char *endp = NULL; | ||
| 563 | |||
| 564 | if (p[1] < '0' || p[1] > '9') | ||
| 565 | return 0; | ||
| 566 | |||
| 567 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
| 568 | if (endp == NULL || endp[0] != '>') | ||
| 569 | return 0; | ||
| 570 | len = (endp + 1) - p; | ||
| 571 | } | ||
| 572 | |||
| 573 | /* do not accept special char if not asked for */ | ||
| 574 | if (sp && !special) | ||
| 575 | return 0; | ||
| 576 | |||
| 577 | if (special) { | ||
| 578 | *special = sp; | ||
| 579 | /* return special char, do not touch level */ | ||
| 580 | if (sp) | ||
| 581 | return len; | ||
| 582 | } | ||
| 583 | |||
| 584 | if (level) | ||
| 585 | *level = lev; | ||
| 586 | return len; | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 518 | * Call the console drivers, asking them to write out | 590 | * Call the console drivers, asking them to write out |
| 519 | * log_buf[start] to log_buf[end - 1]. | 591 | * log_buf[start] to log_buf[end - 1]. |
| 520 | * The console_lock must be held. | 592 | * The console_lock must be held. |
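To make the multi-digit branch of log_prefix() concrete: a syslog-style header such as "<13>" carries facility 1 (user) and severity 5 (notice), and only the low three bits are kept as the in-kernel log level. A standalone userspace sketch of the same extraction (not the kernel function itself; the facility split is shown only for illustration, printk preserves the original prefix rather than consuming the facility):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            const char *msg = "<13>user message";
            char *end;
            unsigned long val = strtoul(msg + 1, &end, 10);

            if (msg[0] == '<' && end != msg + 1 && *end == '>') {
                    printf("prefix length: %ld\n", (long)((end + 1) - msg)); /* 4 */
                    printf("log level    : %lu\n", val & 7);                 /* 5 */
                    printf("facility     : %lu\n", val >> 3);                /* 1 */
            }
            return 0;
    }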
| @@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end) | |||
| 529 | cur_index = start; | 601 | cur_index = start; |
| 530 | start_print = start; | 602 | start_print = start; |
| 531 | while (cur_index != end) { | 603 | while (cur_index != end) { |
| 532 | if (msg_level < 0 && ((end - cur_index) > 2) && | 604 | if (msg_level < 0 && ((end - cur_index) > 2)) { |
| 533 | LOG_BUF(cur_index + 0) == '<' && | 605 | /* strip log prefix */ |
| 534 | LOG_BUF(cur_index + 1) >= '0' && | 606 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); |
| 535 | LOG_BUF(cur_index + 1) <= '7' && | ||
| 536 | LOG_BUF(cur_index + 2) == '>') { | ||
| 537 | msg_level = LOG_BUF(cur_index + 1) - '0'; | ||
| 538 | cur_index += 3; | ||
| 539 | start_print = cur_index; | 607 | start_print = cur_index; |
| 540 | } | 608 | } |
| 541 | while (cur_index != end) { | 609 | while (cur_index != end) { |
| @@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 733 | unsigned long flags; | 801 | unsigned long flags; |
| 734 | int this_cpu; | 802 | int this_cpu; |
| 735 | char *p; | 803 | char *p; |
| 804 | size_t plen; | ||
| 805 | char special; | ||
| 736 | 806 | ||
| 737 | boot_delay_msec(); | 807 | boot_delay_msec(); |
| 738 | printk_delay(); | 808 | printk_delay(); |
| @@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 773 | printed_len += vscnprintf(printk_buf + printed_len, | 843 | printed_len += vscnprintf(printk_buf + printed_len, |
| 774 | sizeof(printk_buf) - printed_len, fmt, args); | 844 | sizeof(printk_buf) - printed_len, fmt, args); |
| 775 | 845 | ||
| 776 | |||
| 777 | p = printk_buf; | 846 | p = printk_buf; |
| 778 | 847 | ||
| 779 | /* Do we have a loglevel in the string? */ | 848 | /* Read log level and handle special printk prefix */ |
| 780 | if (p[0] == '<') { | 849 | plen = log_prefix(p, ¤t_log_level, &special); |
| 781 | unsigned char c = p[1]; | 850 | if (plen) { |
| 782 | if (c && p[2] == '>') { | 851 | p += plen; |
| 783 | switch (c) { | 852 | |
| 784 | case '0' ... '7': /* loglevel */ | 853 | switch (special) { |
| 785 | current_log_level = c - '0'; | 854 | case 'c': /* Strip <c> KERN_CONT, continue line */ |
| 786 | /* Fallthrough - make sure we're on a new line */ | 855 | plen = 0; |
| 787 | case 'd': /* KERN_DEFAULT */ | 856 | break; |
| 788 | if (!new_text_line) { | 857 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ |
| 789 | emit_log_char('\n'); | 858 | plen = 0; |
| 790 | new_text_line = 1; | 859 | default: |
| 791 | } | 860 | if (!new_text_line) { |
| 792 | /* Fallthrough - skip the loglevel */ | 861 | emit_log_char('\n'); |
| 793 | case 'c': /* KERN_CONT */ | 862 | new_text_line = 1; |
| 794 | p += 3; | ||
| 795 | break; | ||
| 796 | } | 863 | } |
| 797 | } | 864 | } |
| 798 | } | 865 | } |
| 799 | 866 | ||
| 800 | /* | 867 | /* |
| 801 | * Copy the output into log_buf. If the caller didn't provide | 868 | * Copy the output into log_buf. If the caller didn't provide |
| 802 | * appropriate log level tags, we insert them here | 869 | * the appropriate log prefix, we insert it here |
| 803 | */ | 870 | */ |
| 804 | for ( ; *p; p++) { | 871 | for (; *p; p++) { |
| 805 | if (new_text_line) { | 872 | if (new_text_line) { |
| 806 | /* Always output the token */ | ||
| 807 | emit_log_char('<'); | ||
| 808 | emit_log_char(current_log_level + '0'); | ||
| 809 | emit_log_char('>'); | ||
| 810 | printed_len += 3; | ||
| 811 | new_text_line = 0; | 873 | new_text_line = 0; |
| 812 | 874 | ||
| 875 | if (plen) { | ||
| 876 | /* Copy original log prefix */ | ||
| 877 | int i; | ||
| 878 | |||
| 879 | for (i = 0; i < plen; i++) | ||
| 880 | emit_log_char(printk_buf[i]); | ||
| 881 | printed_len += plen; | ||
| 882 | } else { | ||
| 883 | /* Add log prefix */ | ||
| 884 | emit_log_char('<'); | ||
| 885 | emit_log_char(current_log_level + '0'); | ||
| 886 | emit_log_char('>'); | ||
| 887 | printed_len += 3; | ||
| 888 | } | ||
| 889 | |||
| 813 | if (printk_time) { | 890 | if (printk_time) { |
| 814 | /* Follow the token with the time */ | 891 | /* Add the current time stamp */ |
| 815 | char tbuf[50], *tp; | 892 | char tbuf[50], *tp; |
| 816 | unsigned tlen; | 893 | unsigned tlen; |
| 817 | unsigned long long t; | 894 | unsigned long long t; |
| @@ -1160,6 +1237,11 @@ void console_unlock(void) | |||
| 1160 | local_irq_restore(flags); | 1237 | local_irq_restore(flags); |
| 1161 | } | 1238 | } |
| 1162 | console_locked = 0; | 1239 | console_locked = 0; |
| 1240 | |||
| 1241 | /* Release the exclusive_console once it is used */ | ||
| 1242 | if (unlikely(exclusive_console)) | ||
| 1243 | exclusive_console = NULL; | ||
| 1244 | |||
| 1163 | up(&console_sem); | 1245 | up(&console_sem); |
| 1164 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1246 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 1165 | if (wake_klogd) | 1247 | if (wake_klogd) |
| @@ -1246,6 +1328,18 @@ void console_start(struct console *console) | |||
| 1246 | } | 1328 | } |
| 1247 | EXPORT_SYMBOL(console_start); | 1329 | EXPORT_SYMBOL(console_start); |
| 1248 | 1330 | ||
| 1331 | static int __read_mostly keep_bootcon; | ||
| 1332 | |||
| 1333 | static int __init keep_bootcon_setup(char *str) | ||
| 1334 | { | ||
| 1335 | keep_bootcon = 1; | ||
| 1336 | printk(KERN_INFO "debug: skip boot console de-registration.\n"); | ||
| 1337 | |||
| 1338 | return 0; | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | early_param("keep_bootcon", keep_bootcon_setup); | ||
| 1342 | |||
| 1249 | /* | 1343 | /* |
| 1250 | * The console driver calls this routine during kernel initialization | 1344 | * The console driver calls this routine during kernel initialization |
| 1251 | * to register the console printing procedure with printk() and to | 1345 | * to register the console printing procedure with printk() and to |
| @@ -1382,6 +1476,12 @@ void register_console(struct console *newcon) | |||
| 1382 | spin_lock_irqsave(&logbuf_lock, flags); | 1476 | spin_lock_irqsave(&logbuf_lock, flags); |
| 1383 | con_start = log_start; | 1477 | con_start = log_start; |
| 1384 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1478 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 1479 | /* | ||
| 1480 | * We're about to replay the log buffer. Only do this to the | ||
| 1481 | * just-registered console to avoid excessive message spam to | ||
| 1482 | * the already-registered consoles. | ||
| 1483 | */ | ||
| 1484 | exclusive_console = newcon; | ||
| 1385 | } | 1485 | } |
| 1386 | console_unlock(); | 1486 | console_unlock(); |
| 1387 | console_sysfs_notify(); | 1487 | console_sysfs_notify(); |
| @@ -1393,7 +1493,9 @@ void register_console(struct console *newcon) | |||
| 1393 | * users know there might be something in the kernel's log buffer that | 1493 | * users know there might be something in the kernel's log buffer that |
| 1394 | * went to the bootconsole (that they do not see on the real console) | 1494 | * went to the bootconsole (that they do not see on the real console) |
| 1395 | */ | 1495 | */ |
| 1396 | if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { | 1496 | if (bcon && |
| 1497 | ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && | ||
| 1498 | !keep_bootcon) { | ||
| 1397 | /* we need to iterate through twice, to make sure we print | 1499 | /* we need to iterate through twice, to make sure we print |
| 1398 | * everything out, before we unregister the console(s) | 1500 | * everything out, before we unregister the console(s) |
| 1399 | */ | 1501 | */ |
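The keep_bootcon changes above add an early boot parameter that skips the automatic unregistration of boot consoles once a real console comes up, which helps when the real console driver itself misbehaves during or after registration. It is used purely from the kernel command line; an illustrative example (the console= and earlyprintk= values are placeholders for whatever the machine actually uses):

    console=ttyS0,115200 earlyprintk=serial,ttyS0,115200 keep_bootcon

The exclusive_console addition in the same file covers the opposite corner case: when a newly registered console replays the existing log buffer, only that console receives the replay, so consoles that were already registered are not spammed with messages they have printed before.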
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1708b1e2972d..dc7ab65f3b36 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
| 23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
| 24 | #include <linux/regset.h> | 24 | #include <linux/regset.h> |
| 25 | #include <linux/hw_breakpoint.h> | ||
| 25 | 26 | ||
| 26 | 27 | ||
| 27 | /* | 28 | /* |
| @@ -134,21 +135,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
| 134 | return 0; | 135 | return 0; |
| 135 | rcu_read_lock(); | 136 | rcu_read_lock(); |
| 136 | tcred = __task_cred(task); | 137 | tcred = __task_cred(task); |
| 137 | if ((cred->uid != tcred->euid || | 138 | if (cred->user->user_ns == tcred->user->user_ns && |
| 138 | cred->uid != tcred->suid || | 139 | (cred->uid == tcred->euid && |
| 139 | cred->uid != tcred->uid || | 140 | cred->uid == tcred->suid && |
| 140 | cred->gid != tcred->egid || | 141 | cred->uid == tcred->uid && |
| 141 | cred->gid != tcred->sgid || | 142 | cred->gid == tcred->egid && |
| 142 | cred->gid != tcred->gid) && | 143 | cred->gid == tcred->sgid && |
| 143 | !capable(CAP_SYS_PTRACE)) { | 144 | cred->gid == tcred->gid)) |
| 144 | rcu_read_unlock(); | 145 | goto ok; |
| 145 | return -EPERM; | 146 | if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) |
| 146 | } | 147 | goto ok; |
| 148 | rcu_read_unlock(); | ||
| 149 | return -EPERM; | ||
| 150 | ok: | ||
| 147 | rcu_read_unlock(); | 151 | rcu_read_unlock(); |
| 148 | smp_rmb(); | 152 | smp_rmb(); |
| 149 | if (task->mm) | 153 | if (task->mm) |
| 150 | dumpable = get_dumpable(task->mm); | 154 | dumpable = get_dumpable(task->mm); |
| 151 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | 155 | if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) |
| 152 | return -EPERM; | 156 | return -EPERM; |
| 153 | 157 | ||
| 154 | return security_ptrace_access_check(task, mode); | 158 | return security_ptrace_access_check(task, mode); |
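Restated, the rewritten check above lets a tracer through when it shares the target's user namespace and its uid/gid match all three of the target's real, effective and saved ids; anything else requires CAP_SYS_PTRACE in the target's user namespace. A compact sketch of that rule (illustrative types and names, not the kernel's cred structures):

    #include <stdbool.h>

    struct cred_view {
            unsigned int uid, euid, suid;
            unsigned int gid, egid, sgid;
            const void *user_ns;
    };

    static bool may_access(const struct cred_view *tracer,
                           const struct cred_view *target,
                           bool cap_sys_ptrace_in_target_ns)
    {
            if (tracer->user_ns == target->user_ns &&
                tracer->uid == target->euid &&
                tracer->uid == target->suid &&
                tracer->uid == target->uid  &&
                tracer->gid == target->egid &&
                tracer->gid == target->sgid &&
                tracer->gid == target->gid)
                    return true;            /* full credential match */

            return cap_sys_ptrace_in_target_ns;
    }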
| @@ -163,7 +167,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
| 163 | return !err; | 167 | return !err; |
| 164 | } | 168 | } |
| 165 | 169 | ||
| 166 | int ptrace_attach(struct task_struct *task) | 170 | static int ptrace_attach(struct task_struct *task) |
| 167 | { | 171 | { |
| 168 | int retval; | 172 | int retval; |
| 169 | 173 | ||
| @@ -198,7 +202,7 @@ int ptrace_attach(struct task_struct *task) | |||
| 198 | goto unlock_tasklist; | 202 | goto unlock_tasklist; |
| 199 | 203 | ||
| 200 | task->ptrace = PT_PTRACED; | 204 | task->ptrace = PT_PTRACED; |
| 201 | if (capable(CAP_SYS_PTRACE)) | 205 | if (task_ns_capable(task, CAP_SYS_PTRACE)) |
| 202 | task->ptrace |= PT_PTRACE_CAP; | 206 | task->ptrace |= PT_PTRACE_CAP; |
| 203 | 207 | ||
| 204 | __ptrace_link(task, current); | 208 | __ptrace_link(task, current); |
| @@ -219,7 +223,7 @@ out: | |||
| 219 | * Performs checks and sets PT_PTRACED. | 223 | * Performs checks and sets PT_PTRACED. |
| 220 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | 224 | * Should be used by all ptrace implementations for PTRACE_TRACEME. |
| 221 | */ | 225 | */ |
| 222 | int ptrace_traceme(void) | 226 | static int ptrace_traceme(void) |
| 223 | { | 227 | { |
| 224 | int ret = -EPERM; | 228 | int ret = -EPERM; |
| 225 | 229 | ||
| @@ -293,7 +297,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | |||
| 293 | return false; | 297 | return false; |
| 294 | } | 298 | } |
| 295 | 299 | ||
| 296 | int ptrace_detach(struct task_struct *child, unsigned int data) | 300 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
| 297 | { | 301 | { |
| 298 | bool dead = false; | 302 | bool dead = false; |
| 299 | 303 | ||
| @@ -876,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
| 876 | return ret; | 880 | return ret; |
| 877 | } | 881 | } |
| 878 | #endif /* CONFIG_COMPAT */ | 882 | #endif /* CONFIG_COMPAT */ |
| 883 | |||
| 884 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
| 885 | int ptrace_get_breakpoints(struct task_struct *tsk) | ||
| 886 | { | ||
| 887 | if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) | ||
| 888 | return 0; | ||
| 889 | |||
| 890 | return -1; | ||
| 891 | } | ||
| 892 | |||
| 893 | void ptrace_put_breakpoints(struct task_struct *tsk) | ||
| 894 | { | ||
| 895 | if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) | ||
| 896 | flush_ptrace_hw_breakpoint(tsk); | ||
| 897 | } | ||
| 898 | #endif /* CONFIG_HAVE_HW_BREAKPOINT */ | ||
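ptrace_get_breakpoints()/ptrace_put_breakpoints() above are a classic "get only while alive, tear down on the last put" pair built on atomic_inc_not_zero() and atomic_dec_and_test(). A userspace C11 stand-in for the same pattern (stub names, not kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int bp_refcnt = 1;        /* 1 = held by the task itself */

    static int get_breakpoints(void)
    {
            int old = atomic_load(&bp_refcnt);

            /* atomic_inc_not_zero(): refuse once the count has hit zero */
            while (old != 0)
                    if (atomic_compare_exchange_weak(&bp_refcnt, &old, old + 1))
                            return 0;
            return -1;
    }

    static void put_breakpoints(void)
    {
            /* atomic_dec_and_test(): only the final put runs the teardown */
            if (atomic_fetch_sub(&bp_refcnt, 1) == 1)
                    puts("flush_ptrace_hw_breakpoint()");
    }

    int main(void)
    {
            if (get_breakpoints() == 0)
                    put_breakpoints();      /* back to 1, nothing flushed */
            put_breakpoints();              /* final reference: flush now */
            return 0;
    }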
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a976d1..f3240e987928 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
| 214 | * Ensure that queued callbacks are all executed. | 214 | * Ensure that queued callbacks are all executed. |
| 215 | * If we detect that we are nested in a RCU read-side critical | 215 | * If we detect that we are nested in a RCU read-side critical |
| 216 | * section, we should simply fail, otherwise we would deadlock. | 216 | * section, we should simply fail, otherwise we would deadlock. |
| 217 | * Note that the machinery to reliably determine whether | ||
| 218 | * or not we are in an RCU read-side critical section | ||
| 219 | * exists only in the preemptible RCU implementations | ||
| 220 | * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why | ||
| 221 | * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. | ||
| 217 | */ | 222 | */ |
| 218 | #ifndef CONFIG_PREEMPT | ||
| 219 | WARN_ON(1); | ||
| 220 | return 0; | ||
| 221 | #else | ||
| 222 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 223 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
| 223 | irqs_disabled()) { | 224 | irqs_disabled()) { |
| 224 | WARN_ON(1); | 225 | WARN_ON(1); |
| @@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
| 229 | rcu_barrier_bh(); | 230 | rcu_barrier_bh(); |
| 230 | debug_object_free(head, &rcuhead_debug_descr); | 231 | debug_object_free(head, &rcuhead_debug_descr); |
| 231 | return 1; | 232 | return 1; |
| 232 | #endif | ||
| 233 | default: | 233 | default: |
| 234 | return 0; | 234 | return 0; |
| 235 | } | 235 | } |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abaea962a..3cb8e362e883 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -852,7 +852,7 @@ void exit_rcu(void) | |||
| 852 | if (t->rcu_read_lock_nesting == 0) | 852 | if (t->rcu_read_lock_nesting == 0) |
| 853 | return; | 853 | return; |
| 854 | t->rcu_read_lock_nesting = 1; | 854 | t->rcu_read_lock_nesting = 1; |
| 855 | rcu_read_unlock(); | 855 | __rcu_read_unlock(); |
| 856 | } | 856 | } |
| 857 | 857 | ||
| 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f97ff26..c224da41890c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -47,7 +47,6 @@ | |||
| 47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
| 48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
| 49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
| 50 | #include <linux/sched.h> | ||
| 51 | 50 | ||
| 52 | MODULE_LICENSE("GPL"); | 51 | MODULE_LICENSE("GPL"); |
| 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768b..34683efa2cce 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
| @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, | |||
| 126 | pos, buf, s - buf); | 126 | pos, buf, s - buf); |
| 127 | } | 127 | } |
| 128 | 128 | ||
| 129 | #if BITS_PER_LONG == 32 | ||
| 130 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 131 | { | ||
| 132 | unsigned long flags; | ||
| 133 | u64 ret; | ||
| 134 | |||
| 135 | spin_lock_irqsave(&counter->lock, flags); | ||
| 136 | ret = *res_counter_member(counter, member); | ||
| 137 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 138 | |||
| 139 | return ret; | ||
| 140 | } | ||
| 141 | #else | ||
| 129 | u64 res_counter_read_u64(struct res_counter *counter, int member) | 142 | u64 res_counter_read_u64(struct res_counter *counter, int member) |
| 130 | { | 143 | { |
| 131 | return *res_counter_member(counter, member); | 144 | return *res_counter_member(counter, member); |
| 132 | } | 145 | } |
| 146 | #endif | ||
| 133 | 147 | ||
| 134 | int res_counter_memparse_write_strategy(const char *buf, | 148 | int res_counter_memparse_write_strategy(const char *buf, |
| 135 | unsigned long long *res) | 149 | unsigned long long *res) |
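The res_counter split above exists because a u64 load is not atomic on 32-bit: it is performed as two 32-bit loads, so a reader racing with an update can observe half-old, half-new values. Taking the counter's lock around the read closes that window; on 64-bit the plain load remains. A sketch of the locked variant, with a pthread mutex standing in for the kernel spinlock and an illustrative layout:

    #include <pthread.h>
    #include <stdint.h>

    struct counter {
            pthread_mutex_t lock;   /* stands in for counter->lock */
            uint64_t usage;         /* updated by writers under the lock */
    };

    /* 32-bit safe read: both halves of the 64-bit value come from the
     * same consistent state because writers hold the same lock. */
    static uint64_t counter_read_u64(struct counter *c)
    {
            uint64_t val;

            pthread_mutex_lock(&c->lock);
            val = c->usage;
            pthread_mutex_unlock(&c->lock);
            return val;
    }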
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
| @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | |||
| 215 | put_pid(waiter->deadlock_task_pid); | 215 | put_pid(waiter->deadlock_task_pid); |
| 216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | 216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); |
| 217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | 217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); |
| 218 | TRACE_WARN_ON(waiter->task); | ||
| 219 | memset(waiter, 0x22, sizeof(*waiter)); | 218 | memset(waiter, 0x22, sizeof(*waiter)); |
| 220 | } | 219 | } |
| 221 | 220 | ||
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef1..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | #include <linux/kthread.h> | 9 | #include <linux/kthread.h> |
| 10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
| 12 | #include <linux/smp_lock.h> | ||
| 13 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
| 14 | #include <linux/sysdev.h> | 13 | #include <linux/sysdev.h> |
| 15 | #include <linux/timer.h> | 14 | #include <linux/timer.h> |
| @@ -27,7 +26,6 @@ struct test_thread_data { | |||
| 27 | int opcode; | 26 | int opcode; |
| 28 | int opdata; | 27 | int opdata; |
| 29 | int mutexes[MAX_RT_TEST_MUTEXES]; | 28 | int mutexes[MAX_RT_TEST_MUTEXES]; |
| 30 | int bkl; | ||
| 31 | int event; | 29 | int event; |
| 32 | struct sys_device sysdev; | 30 | struct sys_device sysdev; |
| 33 | }; | 31 | }; |
| @@ -46,9 +44,8 @@ enum test_opcodes { | |||
| 46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | 44 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ |
| 47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | 45 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ |
| 48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | 46 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ |
| 49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | 47 | /* 9, 10 - reserved for BKL commemoration */ |
| 50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | 48 | RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ |
| 51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
| 52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | 49 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ |
| 53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | 50 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ |
| 54 | }; | 51 | }; |
| @@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
| 74 | td->mutexes[i] = 0; | 71 | td->mutexes[i] = 0; |
| 75 | } | 72 | } |
| 76 | } | 73 | } |
| 77 | |||
| 78 | if (!lockwakeup && td->bkl == 4) { | ||
| 79 | #ifdef CONFIG_LOCK_KERNEL | ||
| 80 | unlock_kernel(); | ||
| 81 | #endif | ||
| 82 | td->bkl = 0; | ||
| 83 | } | ||
| 84 | return 0; | 74 | return 0; |
| 85 | 75 | ||
| 86 | case RTTEST_RESETEVENT: | 76 | case RTTEST_RESETEVENT: |
| @@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
| 131 | td->mutexes[id] = 0; | 121 | td->mutexes[id] = 0; |
| 132 | return 0; | 122 | return 0; |
| 133 | 123 | ||
| 134 | case RTTEST_LOCKBKL: | ||
| 135 | if (td->bkl) | ||
| 136 | return 0; | ||
| 137 | td->bkl = 1; | ||
| 138 | #ifdef CONFIG_LOCK_KERNEL | ||
| 139 | lock_kernel(); | ||
| 140 | #endif | ||
| 141 | td->bkl = 4; | ||
| 142 | return 0; | ||
| 143 | |||
| 144 | case RTTEST_UNLOCKBKL: | ||
| 145 | if (td->bkl != 4) | ||
| 146 | break; | ||
| 147 | #ifdef CONFIG_LOCK_KERNEL | ||
| 148 | unlock_kernel(); | ||
| 149 | #endif | ||
| 150 | td->bkl = 0; | ||
| 151 | return 0; | ||
| 152 | |||
| 153 | default: | 124 | default: |
| 154 | break; | 125 | break; |
| 155 | } | 126 | } |
| @@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
| 196 | td->event = atomic_add_return(1, &rttest_event); | 167 | td->event = atomic_add_return(1, &rttest_event); |
| 197 | break; | 168 | break; |
| 198 | 169 | ||
| 199 | case RTTEST_LOCKBKL: | ||
| 200 | default: | 170 | default: |
| 201 | break; | 171 | break; |
| 202 | } | 172 | } |
| @@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
| 229 | td->event = atomic_add_return(1, &rttest_event); | 199 | td->event = atomic_add_return(1, &rttest_event); |
| 230 | return; | 200 | return; |
| 231 | 201 | ||
| 232 | case RTTEST_LOCKBKL: | ||
| 233 | return; | ||
| 234 | default: | 202 | default: |
| 235 | return; | 203 | return; |
| 236 | } | 204 | } |
| @@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
| 380 | spin_lock(&rttest_lock); | 348 | spin_lock(&rttest_lock); |
| 381 | 349 | ||
| 382 | curr += sprintf(curr, | 350 | curr += sprintf(curr, |
| 383 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | 351 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", |
| 384 | td->opcode, td->event, tsk->state, | 352 | td->opcode, td->event, tsk->state, |
| 385 | (MAX_RT_PRIO - 1) - tsk->prio, | 353 | (MAX_RT_PRIO - 1) - tsk->prio, |
| 386 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | 354 | (MAX_RT_PRIO - 1) - tsk->normal_prio, |
| 387 | tsk->pi_blocked_on, td->bkl); | 355 | tsk->pi_blocked_on); |
| 388 | 356 | ||
| 389 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | 357 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) |
| 390 | curr += sprintf(curr, "%d", td->mutexes[i]); | 358 | curr += sprintf(curr, "%d", td->mutexes[i]); |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
| @@ -20,41 +20,34 @@ | |||
| 20 | /* | 20 | /* |
| 21 | * lock->owner state tracking: | 21 | * lock->owner state tracking: |
| 22 | * | 22 | * |
| 23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | 23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 |
| 24 | * are used to keep track of the "owner is pending" and "lock has | 24 | * is used to keep track of the "lock has waiters" state. |
| 25 | * waiters" state. | ||
| 26 | * | 25 | * |
| 27 | * owner bit1 bit0 | 26 | * owner bit0 |
| 28 | * NULL 0 0 lock is free (fast acquire possible) | 27 | * NULL 0 lock is free (fast acquire possible) |
| 29 | * NULL 0 1 invalid state | 28 | * NULL 1 lock is free and has waiters and the top waiter |
| 30 | * NULL 1 0 Transitional State* | 29 | * is going to take the lock* |
| 31 | * NULL 1 1 invalid state | 30 | * taskpointer 0 lock is held (fast release possible) |
| 32 | * taskpointer 0 0 lock is held (fast release possible) | 31 | * taskpointer 1 lock is held and has waiters** |
| 33 | * taskpointer 0 1 task is pending owner | ||
| 34 | * taskpointer 1 0 lock is held and has waiters | ||
| 35 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
| 36 | * | ||
| 37 | * Pending ownership is assigned to the top (highest priority) | ||
| 38 | * waiter of the lock, when the lock is released. The thread is woken | ||
| 39 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
| 40 | * cleared) a competing higher priority thread can steal the lock | ||
| 41 | * which puts the woken up thread back on the waiters list. | ||
| 42 | * | 32 | * |
| 43 | * The fast atomic compare exchange based acquire and release is only | 33 | * The fast atomic compare exchange based acquire and release is only |
| 44 | * possible when bit 0 and 1 of lock->owner are 0. | 34 | * possible when bit 0 of lock->owner is 0. |
| 35 | * | ||
| 36 | * (*) It also can be a transitional state when grabbing the lock | ||
| 37 | * while ->wait_lock is held. To prevent any fast path cmpxchg to the lock, | ||
| 38 | * we need to set bit 0 before looking at the lock, and the owner may be | ||
| 39 | * NULL in this small window, hence this can be a transitional state. | ||
| 45 | * | 40 | * |
| 46 | * (*) There's a small time where the owner can be NULL and the | 41 | * (**) There is a small time when bit 0 is set but there are no |
| 47 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | 42 | * waiters. This can happen when grabbing the lock in the slow path. |
| 48 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | 43 | * To prevent a cmpxchg of the owner releasing the lock, we need to |
| 49 | * bit before looking at the lock, hence the reason this is a transitional | 44 | * set this bit before looking at the lock. |
| 50 | * state. | ||
| 51 | */ | 45 | */ |
| 52 | 46 | ||
| 53 | static void | 47 | static void |
| 54 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | 48 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) |
| 55 | unsigned long mask) | ||
| 56 | { | 49 | { |
| 57 | unsigned long val = (unsigned long)owner | mask; | 50 | unsigned long val = (unsigned long)owner; |
| 58 | 51 | ||
| 59 | if (rt_mutex_has_waiters(lock)) | 52 | if (rt_mutex_has_waiters(lock)) |
| 60 | val |= RT_MUTEX_HAS_WAITERS; | 53 | val |= RT_MUTEX_HAS_WAITERS; |
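The owner-state table above works because task_struct pointers are word-aligned, so bit 0 of lock->owner is always free to carry the "has waiters" flag, and rt_mutex_owner() simply masks it off again. A tiny standalone illustration of that pointer tagging (the names are made up for the sketch):

    #include <stdint.h>
    #include <stdio.h>

    #define HAS_WAITERS 1UL

    struct task { int dummy; };             /* aligned, so bit 0 is unused */

    static uintptr_t pack_owner(const struct task *owner, int has_waiters)
    {
            return (uintptr_t)owner | (has_waiters ? HAS_WAITERS : 0);
    }

    static const struct task *unpack_owner(uintptr_t word)
    {
            return (const struct task *)(word & ~HAS_WAITERS);
    }

    int main(void)
    {
            static struct task t;
            uintptr_t word = pack_owner(&t, 1);

            printf("owner matches: %d, has waiters: %lu\n",
                   unpack_owner(word) == &t,
                   (unsigned long)(word & HAS_WAITERS));
            return 0;
    }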
| @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 203 | * reached or the state of the chain has changed while we | 196 | * reached or the state of the chain has changed while we |
| 204 | * dropped the locks. | 197 | * dropped the locks. |
| 205 | */ | 198 | */ |
| 206 | if (!waiter || !waiter->task) | 199 | if (!waiter) |
| 207 | goto out_unlock_pi; | 200 | goto out_unlock_pi; |
| 208 | 201 | ||
| 209 | /* | 202 | /* |
| 210 | * Check the orig_waiter state. After we dropped the locks, | 203 | * Check the orig_waiter state. After we dropped the locks, |
| 211 | * the previous owner of the lock might have released the lock | 204 | * the previous owner of the lock might have released the lock. |
| 212 | * and made us the pending owner: | ||
| 213 | */ | 205 | */ |
| 214 | if (orig_waiter && !orig_waiter->task) | 206 | if (orig_waiter && !rt_mutex_owner(orig_lock)) |
| 215 | goto out_unlock_pi; | 207 | goto out_unlock_pi; |
| 216 | 208 | ||
| 217 | /* | 209 | /* |
| @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 254 | 246 | ||
| 255 | /* Release the task */ | 247 | /* Release the task */ |
| 256 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 248 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| 249 | if (!rt_mutex_owner(lock)) { | ||
| 250 | /* | ||
| 251 | * If the requeue above changed the top waiter, then we need | ||
| 252 | * to wake the new top waiter up to try to get the lock. | ||
| 253 | */ | ||
| 254 | |||
| 255 | if (top_waiter != rt_mutex_top_waiter(lock)) | ||
| 256 | wake_up_process(rt_mutex_top_waiter(lock)->task); | ||
| 257 | raw_spin_unlock(&lock->wait_lock); | ||
| 258 | goto out_put_task; | ||
| 259 | } | ||
| 257 | put_task_struct(task); | 260 | put_task_struct(task); |
| 258 | 261 | ||
| 259 | /* Grab the next task */ | 262 | /* Grab the next task */ |
| @@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 296 | } | 299 | } |
| 297 | 300 | ||
| 298 | /* | 301 | /* |
| 299 | * Optimization: check if we can steal the lock from the | ||
| 300 | * assigned pending owner [which might not have taken the | ||
| 301 | * lock yet]: | ||
| 302 | */ | ||
| 303 | static inline int try_to_steal_lock(struct rt_mutex *lock, | ||
| 304 | struct task_struct *task) | ||
| 305 | { | ||
| 306 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
| 307 | struct rt_mutex_waiter *next; | ||
| 308 | unsigned long flags; | ||
| 309 | |||
| 310 | if (!rt_mutex_owner_pending(lock)) | ||
| 311 | return 0; | ||
| 312 | |||
| 313 | if (pendowner == task) | ||
| 314 | return 1; | ||
| 315 | |||
| 316 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
| 317 | if (task->prio >= pendowner->prio) { | ||
| 318 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 319 | return 0; | ||
| 320 | } | ||
| 321 | |||
| 322 | /* | ||
| 323 | * Check if a waiter is enqueued on the pending owners | ||
| 324 | * pi_waiters list. Remove it and readjust pending owners | ||
| 325 | * priority. | ||
| 326 | */ | ||
| 327 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
| 328 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 329 | return 1; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* No chain handling, pending owner is not blocked on anything: */ | ||
| 333 | next = rt_mutex_top_waiter(lock); | ||
| 334 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
| 335 | __rt_mutex_adjust_prio(pendowner); | ||
| 336 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 337 | |||
| 338 | /* | ||
| 339 | * We are going to steal the lock and a waiter was | ||
| 340 | * enqueued on the pending owners pi_waiters queue. So | ||
| 341 | * we have to enqueue this waiter into | ||
| 342 | * task->pi_waiters list. This covers the case, | ||
| 343 | * where task is boosted because it holds another | ||
| 344 | * lock and gets unboosted because the booster is | ||
| 345 | * interrupted, so we would delay a waiter with higher | ||
| 346 | * priority as task->normal_prio. | ||
| 347 | * | ||
| 348 | * Note: in the rare case of a SCHED_OTHER task changing | ||
| 349 | * its priority and thus stealing the lock, next->task | ||
| 350 | * might be task: | ||
| 351 | */ | ||
| 352 | if (likely(next->task != task)) { | ||
| 353 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
| 354 | plist_add(&next->pi_list_entry, &task->pi_waiters); | ||
| 355 | __rt_mutex_adjust_prio(task); | ||
| 356 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 357 | } | ||
| 358 | return 1; | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Try to take an rt-mutex | 302 | * Try to take an rt-mutex |
| 363 | * | 303 | * |
| 364 | * This fails | ||
| 365 | * - when the lock has a real owner | ||
| 366 | * - when a different pending owner exists and has higher priority than current | ||
| 367 | * | ||
| 368 | * Must be called with lock->wait_lock held. | 304 | * Must be called with lock->wait_lock held. |
| 305 | * | ||
| 306 | * @lock: the lock to be acquired. | ||
| 307 | * @task: the task which wants to acquire the lock | ||
| 308 | * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) | ||
| 369 | */ | 309 | */ |
| 370 | static int try_to_take_rt_mutex(struct rt_mutex *lock) | 310 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
| 311 | struct rt_mutex_waiter *waiter) | ||
| 371 | { | 312 | { |
| 372 | /* | 313 | /* |
| 373 | * We have to be careful here if the atomic speedups are | 314 | * We have to be careful here if the atomic speedups are |
| @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
| 390 | */ | 331 | */ |
| 391 | mark_rt_mutex_waiters(lock); | 332 | mark_rt_mutex_waiters(lock); |
| 392 | 333 | ||
| 393 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) | 334 | if (rt_mutex_owner(lock)) |
| 394 | return 0; | 335 | return 0; |
| 395 | 336 | ||
| 337 | /* | ||
| 338 | * The task will get the lock if one of these conditions holds: | ||
| 339 | * 1) there is no waiter | ||
| 340 | * 2) the task has a higher priority than the top waiter | ||
| 341 | * 3) the task is the top waiter | ||
| 342 | */ | ||
| 343 | if (rt_mutex_has_waiters(lock)) { | ||
| 344 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | ||
| 345 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | ||
| 346 | return 0; | ||
| 347 | } | ||
| 348 | } | ||
| 349 | |||
| 350 | if (waiter || rt_mutex_has_waiters(lock)) { | ||
| 351 | unsigned long flags; | ||
| 352 | struct rt_mutex_waiter *top; | ||
| 353 | |||
| 354 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
| 355 | |||
| 356 | /* remove the queued waiter. */ | ||
| 357 | if (waiter) { | ||
| 358 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
| 359 | task->pi_blocked_on = NULL; | ||
| 360 | } | ||
| 361 | |||
| 362 | /* | ||
| 363 | * We have to enqueue the top waiter(if it exists) into | ||
| 364 | * task->pi_waiters list. | ||
| 365 | */ | ||
| 366 | if (rt_mutex_has_waiters(lock)) { | ||
| 367 | top = rt_mutex_top_waiter(lock); | ||
| 368 | top->pi_list_entry.prio = top->list_entry.prio; | ||
| 369 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
| 370 | } | ||
| 371 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 372 | } | ||
| 373 | |||
| 396 | /* We got the lock. */ | 374 | /* We got the lock. */ |
| 397 | debug_rt_mutex_lock(lock); | 375 | debug_rt_mutex_lock(lock); |
| 398 | 376 | ||
| 399 | rt_mutex_set_owner(lock, current, 0); | 377 | rt_mutex_set_owner(lock, task); |
| 400 | 378 | ||
| 401 | rt_mutex_deadlock_account_lock(lock, current); | 379 | rt_mutex_deadlock_account_lock(lock, task); |
| 402 | 380 | ||
| 403 | return 1; | 381 | return 1; |
| 404 | } | 382 | } |
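Stripped of the pi-list bookkeeping, the acquisition rule the new try_to_take_rt_mutex() implements is: fail if the lock has a real owner; otherwise take it if there are no waiters, if the caller is the queued top waiter, or if the caller's priority beats the top waiter's. A decision-only sketch of that rule (illustrative helper, not the kernel function):

    /* 1 = the caller may take the lock, 0 = it must block/queue. */
    static int can_take(int has_owner, int has_waiters,
                        int caller_is_top_waiter, int prio_beats_top_waiter)
    {
            if (has_owner)
                    return 0;
            if (!has_waiters)
                    return 1;
            return caller_is_top_waiter || prio_beats_top_waiter;
    }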
| @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 436 | 414 | ||
| 437 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 415 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| 438 | 416 | ||
| 417 | if (!owner) | ||
| 418 | return 0; | ||
| 419 | |||
| 439 | if (waiter == rt_mutex_top_waiter(lock)) { | 420 | if (waiter == rt_mutex_top_waiter(lock)) { |
| 440 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 421 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
| 441 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 422 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); |
| @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 472 | /* | 453 | /* |
| 473 | * Wake up the next waiter on the lock. | 454 | * Wake up the next waiter on the lock. |
| 474 | * | 455 | * |
| 475 | * Remove the top waiter from the current tasks waiter list and from | 456 | * Remove the top waiter from the current tasks waiter list and wake it up. |
| 476 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
| 477 | * | 457 | * |
| 478 | * Called with lock->wait_lock held. | 458 | * Called with lock->wait_lock held. |
| 479 | */ | 459 | */ |
| 480 | static void wakeup_next_waiter(struct rt_mutex *lock) | 460 | static void wakeup_next_waiter(struct rt_mutex *lock) |
| 481 | { | 461 | { |
| 482 | struct rt_mutex_waiter *waiter; | 462 | struct rt_mutex_waiter *waiter; |
| 483 | struct task_struct *pendowner; | ||
| 484 | unsigned long flags; | 463 | unsigned long flags; |
| 485 | 464 | ||
| 486 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 465 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
| 487 | 466 | ||
| 488 | waiter = rt_mutex_top_waiter(lock); | 467 | waiter = rt_mutex_top_waiter(lock); |
| 489 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
| 490 | 468 | ||
| 491 | /* | 469 | /* |
| 492 | * Remove it from current->pi_waiters. We do not adjust a | 470 | * Remove it from current->pi_waiters. We do not adjust a |
| @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
| 495 | * lock->wait_lock. | 473 | * lock->wait_lock. |
| 496 | */ | 474 | */ |
| 497 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | 475 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); |
| 498 | pendowner = waiter->task; | ||
| 499 | waiter->task = NULL; | ||
| 500 | 476 | ||
| 501 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | 477 | rt_mutex_set_owner(lock, NULL); |
| 502 | 478 | ||
| 503 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 479 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
| 504 | 480 | ||
| 505 | /* | 481 | wake_up_process(waiter->task); |
| 506 | * Clear the pi_blocked_on variable and enqueue a possible | ||
| 507 | * waiter into the pi_waiters list of the pending owner. This | ||
| 508 | * prevents that in case the pending owner gets unboosted a | ||
| 509 | * waiter with higher priority than pending-owner->normal_prio | ||
| 510 | * is blocked on the unboosted (pending) owner. | ||
| 511 | */ | ||
| 512 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
| 513 | |||
| 514 | WARN_ON(!pendowner->pi_blocked_on); | ||
| 515 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
| 516 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
| 517 | |||
| 518 | pendowner->pi_blocked_on = NULL; | ||
| 519 | |||
| 520 | if (rt_mutex_has_waiters(lock)) { | ||
| 521 | struct rt_mutex_waiter *next; | ||
| 522 | |||
| 523 | next = rt_mutex_top_waiter(lock); | ||
| 524 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
| 525 | } | ||
| 526 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 527 | |||
| 528 | wake_up_process(pendowner); | ||
| 529 | } | 482 | } |
| 530 | 483 | ||
| 531 | /* | 484 | /* |
| 532 | * Remove a waiter from a lock | 485 | * Remove a waiter from a lock and give up |
| 533 | * | 486 | * |
| 534 | * Must be called with lock->wait_lock held | 487 | * Must be called with lock->wait_lock held and |
| 488 | * have just failed to try_to_take_rt_mutex(). | ||
| 535 | */ | 489 | */ |
| 536 | static void remove_waiter(struct rt_mutex *lock, | 490 | static void remove_waiter(struct rt_mutex *lock, |
| 537 | struct rt_mutex_waiter *waiter) | 491 | struct rt_mutex_waiter *waiter) |
| @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
| 543 | 497 | ||
| 544 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 498 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
| 545 | plist_del(&waiter->list_entry, &lock->wait_list); | 499 | plist_del(&waiter->list_entry, &lock->wait_list); |
| 546 | waiter->task = NULL; | ||
| 547 | current->pi_blocked_on = NULL; | 500 | current->pi_blocked_on = NULL; |
| 548 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 501 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
| 549 | 502 | ||
| 550 | if (first && owner != current) { | 503 | if (!owner) |
| 504 | return; | ||
| 505 | |||
| 506 | if (first) { | ||
| 551 | 507 | ||
| 552 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 508 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
| 553 | 509 | ||
| @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
| 614 | * or TASK_UNINTERRUPTIBLE) | 570 | * or TASK_UNINTERRUPTIBLE) |
| 615 | * @timeout: the pre-initialized and started timer, or NULL for none | 571 | * @timeout: the pre-initialized and started timer, or NULL for none |
| 616 | * @waiter: the pre-initialized rt_mutex_waiter | 572 | * @waiter: the pre-initialized rt_mutex_waiter |
| 617 | * @detect_deadlock: passed to task_blocks_on_rt_mutex | ||
| 618 | * | 573 | * |
| 619 | * lock->wait_lock must be held by the caller. | 574 | * lock->wait_lock must be held by the caller. |
| 620 | */ | 575 | */ |
| 621 | static int __sched | 576 | static int __sched |
| 622 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, | 577 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
| 623 | struct hrtimer_sleeper *timeout, | 578 | struct hrtimer_sleeper *timeout, |
| 624 | struct rt_mutex_waiter *waiter, | 579 | struct rt_mutex_waiter *waiter) |
| 625 | int detect_deadlock) | ||
| 626 | { | 580 | { |
| 627 | int ret = 0; | 581 | int ret = 0; |
| 628 | 582 | ||
| 629 | for (;;) { | 583 | for (;;) { |
| 630 | /* Try to acquire the lock: */ | 584 | /* Try to acquire the lock: */ |
| 631 | if (try_to_take_rt_mutex(lock)) | 585 | if (try_to_take_rt_mutex(lock, current, waiter)) |
| 632 | break; | 586 | break; |
| 633 | 587 | ||
| 634 | /* | 588 | /* |
| @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 645 | break; | 599 | break; |
| 646 | } | 600 | } |
| 647 | 601 | ||
| 648 | /* | ||
| 649 | * waiter->task is NULL the first time we come here and | ||
| 650 | * when we have been woken up by the previous owner | ||
| 651 | * but the lock got stolen by a higher prio task. | ||
| 652 | */ | ||
| 653 | if (!waiter->task) { | ||
| 654 | ret = task_blocks_on_rt_mutex(lock, waiter, current, | ||
| 655 | detect_deadlock); | ||
| 656 | /* | ||
| 657 | * If we got woken up by the owner then start loop | ||
| 658 | * all over without going into schedule to try | ||
| 659 | * to get the lock now: | ||
| 660 | */ | ||
| 661 | if (unlikely(!waiter->task)) { | ||
| 662 | /* | ||
| 663 | * Reset the return value. We might | ||
| 664 | * have returned with -EDEADLK and the | ||
| 665 | * owner released the lock while we | ||
| 666 | * were walking the pi chain. | ||
| 667 | */ | ||
| 668 | ret = 0; | ||
| 669 | continue; | ||
| 670 | } | ||
| 671 | if (unlikely(ret)) | ||
| 672 | break; | ||
| 673 | } | ||
| 674 | |||
| 675 | raw_spin_unlock(&lock->wait_lock); | 602 | raw_spin_unlock(&lock->wait_lock); |
| 676 | 603 | ||
| 677 | debug_rt_mutex_print_deadlock(waiter); | 604 | debug_rt_mutex_print_deadlock(waiter); |
| 678 | 605 | ||
| 679 | if (waiter->task) | 606 | schedule_rt_mutex(lock); |
| 680 | schedule_rt_mutex(lock); | ||
| 681 | 607 | ||
| 682 | raw_spin_lock(&lock->wait_lock); | 608 | raw_spin_lock(&lock->wait_lock); |
| 683 | set_current_state(state); | 609 | set_current_state(state); |
| @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 698 | int ret = 0; | 624 | int ret = 0; |
| 699 | 625 | ||
| 700 | debug_rt_mutex_init_waiter(&waiter); | 626 | debug_rt_mutex_init_waiter(&waiter); |
| 701 | waiter.task = NULL; | ||
| 702 | 627 | ||
| 703 | raw_spin_lock(&lock->wait_lock); | 628 | raw_spin_lock(&lock->wait_lock); |
| 704 | 629 | ||
| 705 | /* Try to acquire the lock again: */ | 630 | /* Try to acquire the lock again: */ |
| 706 | if (try_to_take_rt_mutex(lock)) { | 631 | if (try_to_take_rt_mutex(lock, current, NULL)) { |
| 707 | raw_spin_unlock(&lock->wait_lock); | 632 | raw_spin_unlock(&lock->wait_lock); |
| 708 | return 0; | 633 | return 0; |
| 709 | } | 634 | } |
| @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 717 | timeout->task = NULL; | 642 | timeout->task = NULL; |
| 718 | } | 643 | } |
| 719 | 644 | ||
| 720 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, | 645 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); |
| 721 | detect_deadlock); | 646 | |
| 647 | if (likely(!ret)) | ||
| 648 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | ||
| 722 | 649 | ||
| 723 | set_current_state(TASK_RUNNING); | 650 | set_current_state(TASK_RUNNING); |
| 724 | 651 | ||
| 725 | if (unlikely(waiter.task)) | 652 | if (unlikely(ret)) |
| 726 | remove_waiter(lock, &waiter); | 653 | remove_waiter(lock, &waiter); |
| 727 | 654 | ||
| 728 | /* | 655 | /* |
| @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 737 | if (unlikely(timeout)) | 664 | if (unlikely(timeout)) |
| 738 | hrtimer_cancel(&timeout->timer); | 665 | hrtimer_cancel(&timeout->timer); |
| 739 | 666 | ||
| 740 | /* | ||
| 741 | * Readjust priority, when we did not get the lock. We might | ||
| 742 | * have been the pending owner and boosted. Since we did not | ||
| 743 | * take the lock, the PI boost has to go. | ||
| 744 | */ | ||
| 745 | if (unlikely(ret)) | ||
| 746 | rt_mutex_adjust_prio(current); | ||
| 747 | |||
| 748 | debug_rt_mutex_free_waiter(&waiter); | 667 | debug_rt_mutex_free_waiter(&waiter); |
| 749 | 668 | ||
| 750 | return ret; | 669 | return ret; |
| @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) | |||
| 762 | 681 | ||
| 763 | if (likely(rt_mutex_owner(lock) != current)) { | 682 | if (likely(rt_mutex_owner(lock) != current)) { |
| 764 | 683 | ||
| 765 | ret = try_to_take_rt_mutex(lock); | 684 | ret = try_to_take_rt_mutex(lock, current, NULL); |
| 766 | /* | 685 | /* |
| 767 | * try_to_take_rt_mutex() sets the lock waiters | 686 | * try_to_take_rt_mutex() sets the lock waiters |
| 768 | * bit unconditionally. Clean this up. | 687 | * bit unconditionally. Clean this up. |
| @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
| 992 | { | 911 | { |
| 993 | __rt_mutex_init(lock, NULL); | 912 | __rt_mutex_init(lock, NULL); |
| 994 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | 913 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
| 995 | rt_mutex_set_owner(lock, proxy_owner, 0); | 914 | rt_mutex_set_owner(lock, proxy_owner); |
| 996 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | 915 | rt_mutex_deadlock_account_lock(lock, proxy_owner); |
| 997 | } | 916 | } |
| 998 | 917 | ||
| @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
| 1008 | struct task_struct *proxy_owner) | 927 | struct task_struct *proxy_owner) |
| 1009 | { | 928 | { |
| 1010 | debug_rt_mutex_proxy_unlock(lock); | 929 | debug_rt_mutex_proxy_unlock(lock); |
| 1011 | rt_mutex_set_owner(lock, NULL, 0); | 930 | rt_mutex_set_owner(lock, NULL); |
| 1012 | rt_mutex_deadlock_account_unlock(proxy_owner); | 931 | rt_mutex_deadlock_account_unlock(proxy_owner); |
| 1013 | } | 932 | } |
| 1014 | 933 | ||
| @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
| 1034 | 953 | ||
| 1035 | raw_spin_lock(&lock->wait_lock); | 954 | raw_spin_lock(&lock->wait_lock); |
| 1036 | 955 | ||
| 1037 | mark_rt_mutex_waiters(lock); | 956 | if (try_to_take_rt_mutex(lock, task, NULL)) { |
| 1038 | |||
| 1039 | if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { | ||
| 1040 | /* We got the lock for task. */ | ||
| 1041 | debug_rt_mutex_lock(lock); | ||
| 1042 | rt_mutex_set_owner(lock, task, 0); | ||
| 1043 | raw_spin_unlock(&lock->wait_lock); | 957 | raw_spin_unlock(&lock->wait_lock); |
| 1044 | rt_mutex_deadlock_account_lock(lock, task); | ||
| 1045 | return 1; | 958 | return 1; |
| 1046 | } | 959 | } |
| 1047 | 960 | ||
| 1048 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | 961 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); |
| 1049 | 962 | ||
| 1050 | if (ret && !waiter->task) { | 963 | if (ret && !rt_mutex_owner(lock)) { |
| 1051 | /* | 964 | /* |
| 1052 | * Reset the return value. We might have | 965 | * Reset the return value. We might have |
| 1053 | * returned with -EDEADLK and the owner | 966 | * returned with -EDEADLK and the owner |
| @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
| 1056 | */ | 969 | */ |
| 1057 | ret = 0; | 970 | ret = 0; |
| 1058 | } | 971 | } |
| 972 | |||
| 973 | if (unlikely(ret)) | ||
| 974 | remove_waiter(lock, waiter); | ||
| 975 | |||
| 1059 | raw_spin_unlock(&lock->wait_lock); | 976 | raw_spin_unlock(&lock->wait_lock); |
| 1060 | 977 | ||
| 1061 | debug_rt_mutex_print_deadlock(waiter); | 978 | debug_rt_mutex_print_deadlock(waiter); |
| @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
| 1110 | 1027 | ||
| 1111 | set_current_state(TASK_INTERRUPTIBLE); | 1028 | set_current_state(TASK_INTERRUPTIBLE); |
| 1112 | 1029 | ||
| 1113 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, | 1030 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
| 1114 | detect_deadlock); | ||
| 1115 | 1031 | ||
| 1116 | set_current_state(TASK_RUNNING); | 1032 | set_current_state(TASK_RUNNING); |
| 1117 | 1033 | ||
| 1118 | if (unlikely(waiter->task)) | 1034 | if (unlikely(ret)) |
| 1119 | remove_waiter(lock, waiter); | 1035 | remove_waiter(lock, waiter); |
| 1120 | 1036 | ||
| 1121 | /* | 1037 | /* |
| @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
| 1126 | 1042 | ||
| 1127 | raw_spin_unlock(&lock->wait_lock); | 1043 | raw_spin_unlock(&lock->wait_lock); |
| 1128 | 1044 | ||
| 1129 | /* | ||
| 1130 | * Readjust priority, when we did not get the lock. We might have been | ||
| 1131 | * the pending owner and boosted. Since we did not take the lock, the | ||
| 1132 | * PI boost has to go. | ||
| 1133 | */ | ||
| 1134 | if (unlikely(ret)) | ||
| 1135 | rt_mutex_adjust_prio(current); | ||
| 1136 | |||
| 1137 | return ret; | 1045 | return ret; |
| 1138 | } | 1046 | } |
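The hunks above simplify the proxy-lock path: rt_mutex_start_proxy_lock() now leans on try_to_take_rt_mutex() instead of open-coding the waiter marking and lock stealing, and it removes the waiter itself when blocking fails, so rt_mutex_finish_proxy_lock() can key its cleanup off the return value rather than waiter->task. The userspace toy below is a minimal sketch of that control flow; every type and helper in it is an illustrative stand-in, not the kernel's rt_mutex implementation.

/* Toy model of the flow above: try to take the lock for the target
 * task; if that fails, enqueue a waiter, and undo the waiter again
 * when enqueueing reported an error.  Illustrative only. */
#include <stdio.h>
#include <stddef.h>

struct toy_task { const char *name; };

struct toy_lock {
        struct toy_task *owner;
        struct toy_task *waiter;        /* one waiter slot is enough here */
};

static int toy_try_take(struct toy_lock *l, struct toy_task *t)
{
        if (l->owner)
                return 0;
        l->owner = t;                   /* cf. try_to_take_rt_mutex() */
        return 1;
}

static int toy_block_on(struct toy_lock *l, struct toy_task *t)
{
        l->waiter = t;                  /* cf. task_blocks_on_rt_mutex() */
        if (l->owner == t)
                return -1;              /* simplistic deadlock detection */
        return 0;
}

static int toy_start_proxy_lock(struct toy_lock *l, struct toy_task *t)
{
        int ret;

        if (toy_try_take(l, t))
                return 1;               /* lock acquired on behalf of t */

        ret = toy_block_on(l, t);
        if (ret && !l->owner)
                ret = 0;                /* owner vanished: caller should retry */
        if (ret)
                l->waiter = NULL;       /* failed: remove the waiter again */

        return ret;                     /* 1: got lock, 0: queued, <0: error */
}

int main(void)
{
        struct toy_task a = { "A" }, b = { "B" };
        struct toy_lock l = { NULL, NULL };

        printf("A -> %d\n", toy_start_proxy_lock(&l, &a));     /* 1 */
        printf("B -> %d\n", toy_start_proxy_lock(&l, &b));     /* 0 */
        return 0;
}

Note how failure is detected from the return value alone, which is the same pattern both hunks adopt with their "if (unlikely(ret)) remove_waiter(lock, waiter);" lines.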
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
| @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) | |||
| 91 | /* | 91 | /* |
| 92 | * lock->owner state tracking: | 92 | * lock->owner state tracking: |
| 93 | */ | 93 | */ |
| 94 | #define RT_MUTEX_OWNER_PENDING 1UL | 94 | #define RT_MUTEX_HAS_WAITERS 1UL |
| 95 | #define RT_MUTEX_HAS_WAITERS 2UL | 95 | #define RT_MUTEX_OWNER_MASKALL 1UL |
| 96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
| 97 | 96 | ||
| 98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | 97 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) |
| 99 | { | 98 | { |
| @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | |||
| 101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | 100 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); |
| 102 | } | 101 | } |
| 103 | 102 | ||
| 104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
| 105 | { | ||
| 106 | return (struct task_struct *) | ||
| 107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
| 108 | } | ||
| 109 | |||
| 110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
| 111 | { | ||
| 112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
| 113 | } | ||
| 114 | |||
| 115 | /* | 103 | /* |
| 116 | * PI-futex support (proxy locking functions, etc.): | 104 | * PI-futex support (proxy locking functions, etc.): |
| 117 | */ | 105 | */ |
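With the pending-owner state gone, lock->owner carries just the owning task pointer with a single low bit, RT_MUTEX_HAS_WAITERS, folded in, which is why RT_MUTEX_OWNER_MASKALL shrinks to 1UL and rt_mutex_real_owner()/rt_mutex_owner_pending() disappear. Below is a small userspace sketch of that pointer-plus-flag encoding; the names are illustrative, and it assumes task structures are at least 2-byte aligned so bit 0 is free, which is what makes the trick safe.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_HAS_WAITERS   1UL           /* mirrors RT_MUTEX_HAS_WAITERS */
#define TOY_OWNER_MASKALL 1UL           /* mirrors RT_MUTEX_OWNER_MASKALL */

struct toy_task { int pid; };

struct toy_lock { uintptr_t owner; };   /* task pointer with bit 0 as flag */

static void toy_set_owner(struct toy_lock *l, struct toy_task *t, int waiters)
{
        l->owner = (uintptr_t)t | (waiters ? TOY_HAS_WAITERS : 0);
}

static struct toy_task *toy_owner(struct toy_lock *l)
{
        /* same masking as rt_mutex_owner() in the hunk above */
        return (struct toy_task *)(l->owner & ~TOY_OWNER_MASKALL);
}

static int toy_has_waiters(struct toy_lock *l)
{
        return (l->owner & TOY_HAS_WAITERS) != 0;
}

int main(void)
{
        static struct toy_task t = { .pid = 42 };
        struct toy_lock l;

        assert(((uintptr_t)&t & 1) == 0);       /* alignment frees bit 0 */

        toy_set_owner(&l, &t, 1);
        printf("owner pid %d, waiters %d\n",
               toy_owner(&l)->pid, toy_has_waiters(&l));
        return 0;
}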
diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..312f8b95c2d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -32,7 +32,6 @@ | |||
| 32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
| 33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
| 34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
| 35 | #include <linux/smp_lock.h> | ||
| 36 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
| 37 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
| 38 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
| @@ -324,7 +323,7 @@ struct cfs_rq { | |||
| 324 | * 'curr' points to currently running entity on this cfs_rq. | 323 | * 'curr' points to currently running entity on this cfs_rq. |
| 325 | * It is set to NULL otherwise (i.e when none are currently running). | 324 | * It is set to NULL otherwise (i.e when none are currently running). |
| 326 | */ | 325 | */ |
| 327 | struct sched_entity *curr, *next, *last; | 326 | struct sched_entity *curr, *next, *last, *skip; |
| 328 | 327 | ||
| 329 | unsigned int nr_spread_over; | 328 | unsigned int nr_spread_over; |
| 330 | 329 | ||
| @@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 606 | struct task_group *tg; | 605 | struct task_group *tg; |
| 607 | struct cgroup_subsys_state *css; | 606 | struct cgroup_subsys_state *css; |
| 608 | 607 | ||
| 609 | if (p->flags & PF_EXITING) | ||
| 610 | return &root_task_group; | ||
| 611 | |||
| 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 608 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
| 613 | lockdep_is_held(&task_rq(p)->lock)); | 609 | lockdep_is_held(&task_rq(p)->lock)); |
| 614 | tg = container_of(css, struct task_group, css); | 610 | tg = container_of(css, struct task_group, css); |
| @@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq) | |||
| 664 | #endif | 660 | #endif |
| 665 | 661 | ||
| 666 | /** | 662 | /** |
| 667 | * runqueue_is_locked | 663 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
| 668 | * @cpu: the processor in question. | 664 | * @cpu: the processor in question. |
| 669 | * | 665 | * |
| 670 | * Returns true if the current cpu runqueue is locked. | ||
| 671 | * This interface allows printk to be called with the runqueue lock | 666 | * This interface allows printk to be called with the runqueue lock |
| 672 | * held and know whether or not it is OK to wake up the klogd. | 667 | * held and know whether or not it is OK to wake up the klogd. |
| 673 | */ | 668 | */ |
| @@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
| 1686 | __release(rq2->lock); | 1681 | __release(rq2->lock); |
| 1687 | } | 1682 | } |
| 1688 | 1683 | ||
| 1684 | #else /* CONFIG_SMP */ | ||
| 1685 | |||
| 1686 | /* | ||
| 1687 | * double_rq_lock - safely lock two runqueues | ||
| 1688 | * | ||
| 1689 | * Note this does not disable interrupts like task_rq_lock; | ||
| 1690 | * you need to do so manually before calling. | ||
| 1691 | */ | ||
| 1692 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
| 1693 | __acquires(rq1->lock) | ||
| 1694 | __acquires(rq2->lock) | ||
| 1695 | { | ||
| 1696 | BUG_ON(!irqs_disabled()); | ||
| 1697 | BUG_ON(rq1 != rq2); | ||
| 1698 | raw_spin_lock(&rq1->lock); | ||
| 1699 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
| 1700 | } | ||
| 1701 | |||
| 1702 | /* | ||
| 1703 | * double_rq_unlock - safely unlock two runqueues | ||
| 1704 | * | ||
| 1705 | * Note this does not restore interrupts like task_rq_unlock; | ||
| 1706 | * you need to do so manually after calling. | ||
| 1707 | */ | ||
| 1708 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
| 1709 | __releases(rq1->lock) | ||
| 1710 | __releases(rq2->lock) | ||
| 1711 | { | ||
| 1712 | BUG_ON(rq1 != rq2); | ||
| 1713 | raw_spin_unlock(&rq1->lock); | ||
| 1714 | __release(rq2->lock); | ||
| 1715 | } | ||
| 1716 | |||
| 1689 | #endif | 1717 | #endif |
| 1690 | 1718 | ||
| 1691 | static void calc_load_account_idle(struct rq *this_rq); | 1719 | static void calc_load_account_idle(struct rq *this_rq); |
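The new UP stubs keep double_rq_lock()/double_rq_unlock() usable from common code (yield_to(), later in this patch, is the consumer); on UP both arguments are the same runqueue, so only one raw spinlock is taken and the second acquisition exists purely to keep the sparse annotations balanced. The caller-side pattern, disable interrupts, lock both runqueues, then re-check that the task has not migrated, is sketched below with pthread mutexes standing in for runqueue locks; this illustrates the locking discipline only and is not scheduler code.

/* Illustration of "lock both runqueues, then revalidate": after taking
 * the locks, re-check that the task is still on the runqueue we locked
 * and retry if it migrated in between.  Toy structures, not scheduler
 * code; pthread mutexes stand in for rq->lock. */
#include <pthread.h>
#include <stdio.h>

struct toy_rq { pthread_mutex_t lock; };

struct toy_task { struct toy_rq *rq; }; /* runqueue the task lives on */

static void toy_double_lock(struct toy_rq *a, struct toy_rq *b)
{
        if (a == b) {                   /* the UP case: one lock only */
                pthread_mutex_lock(&a->lock);
                return;
        }
        if (a < b) {                    /* stable order avoids ABBA deadlock */
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void toy_double_unlock(struct toy_rq *a, struct toy_rq *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

/* Lock this_rq plus the runqueue p is on, as yield_to() does. */
static struct toy_rq *toy_lock_task_rq(struct toy_rq *this_rq, struct toy_task *p)
{
        struct toy_rq *p_rq;

again:
        p_rq = p->rq;
        toy_double_lock(this_rq, p_rq);
        if (p->rq != p_rq) {            /* task migrated: drop both and retry */
                toy_double_unlock(this_rq, p_rq);
                goto again;
        }
        return p_rq;
}

int main(void)
{
        struct toy_rq rq0 = { PTHREAD_MUTEX_INITIALIZER };
        struct toy_rq rq1 = { PTHREAD_MUTEX_INITIALIZER };
        struct toy_task p = { &rq1 };
        struct toy_rq *p_rq = toy_lock_task_rq(&rq0, &p);

        printf("locked this_rq and p's rq (%s)\n",
               p_rq == &rq1 ? "rq1" : "rq0");
        toy_double_unlock(&rq0, p_rq);
        return 0;
}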
| @@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1880 | */ | 1908 | */ |
| 1881 | if (hardirq_count()) | 1909 | if (hardirq_count()) |
| 1882 | __this_cpu_add(cpu_hardirq_time, delta); | 1910 | __this_cpu_add(cpu_hardirq_time, delta); |
| 1883 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1911 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
| 1884 | __this_cpu_add(cpu_softirq_time, delta); | 1912 | __this_cpu_add(cpu_softirq_time, delta); |
| 1885 | 1913 | ||
| 1886 | irq_time_write_end(); | 1914 | irq_time_write_end(); |
| @@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 1920 | sched_rt_avg_update(rq, irq_delta); | 1948 | sched_rt_avg_update(rq, irq_delta); |
| 1921 | } | 1949 | } |
| 1922 | 1950 | ||
| 1951 | static int irqtime_account_hi_update(void) | ||
| 1952 | { | ||
| 1953 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
| 1954 | unsigned long flags; | ||
| 1955 | u64 latest_ns; | ||
| 1956 | int ret = 0; | ||
| 1957 | |||
| 1958 | local_irq_save(flags); | ||
| 1959 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
| 1960 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
| 1961 | ret = 1; | ||
| 1962 | local_irq_restore(flags); | ||
| 1963 | return ret; | ||
| 1964 | } | ||
| 1965 | |||
| 1966 | static int irqtime_account_si_update(void) | ||
| 1967 | { | ||
| 1968 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
| 1969 | unsigned long flags; | ||
| 1970 | u64 latest_ns; | ||
| 1971 | int ret = 0; | ||
| 1972 | |||
| 1973 | local_irq_save(flags); | ||
| 1974 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
| 1975 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
| 1976 | ret = 1; | ||
| 1977 | local_irq_restore(flags); | ||
| 1978 | return ret; | ||
| 1979 | } | ||
| 1980 | |||
| 1923 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1981 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 1924 | 1982 | ||
| 1983 | #define sched_clock_irqtime (0) | ||
| 1984 | |||
| 1925 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1985 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 1926 | { | 1986 | { |
| 1927 | rq->clock_task += delta; | 1987 | rq->clock_task += delta; |
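account_system_vtime() above accumulates raw hardirq/softirq nanoseconds into per-cpu counters, while the new irqtime_account_hi_update()/irqtime_account_si_update() helpers only answer one question: has that running total pulled ahead of what cpustat already accounts for? A self-contained sketch of that accumulate-versus-accounted check follows; plain u64 globals stand in for the per-cpu variables and the cputime conversions, and the 1 ms charge assumes HZ=1000 purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for this CPU's counters. */
static uint64_t hardirq_time_ns;        /* accumulated by the vtime hook */
static uint64_t accounted_irq_ns;       /* what cpustat->irq already holds */

/* Irq-entry/exit side, cf. account_system_vtime() above. */
static void toy_account_hardirq(uint64_t delta_ns)
{
        hardirq_time_ns += delta_ns;
}

/* Tick side: is there newly accumulated hardirq time to fold in? */
static int toy_hi_update(void)
{
        return hardirq_time_ns > accounted_irq_ns;
}

int main(void)
{
        toy_account_hardirq(700000);            /* 0.7 ms spent in hardirqs */

        if (toy_hi_update()) {
                accounted_irq_ns += 1000000;    /* one jiffy at HZ=1000 */
                printf("tick charged to irq time\n");
        }
        return 0;
}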
| @@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p) | |||
| 2025 | 2085 | ||
| 2026 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2086 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
| 2027 | const struct sched_class *prev_class, | 2087 | const struct sched_class *prev_class, |
| 2028 | int oldprio, int running) | 2088 | int oldprio) |
| 2029 | { | 2089 | { |
| 2030 | if (prev_class != p->sched_class) { | 2090 | if (prev_class != p->sched_class) { |
| 2031 | if (prev_class->switched_from) | 2091 | if (prev_class->switched_from) |
| 2032 | prev_class->switched_from(rq, p, running); | 2092 | prev_class->switched_from(rq, p); |
| 2033 | p->sched_class->switched_to(rq, p, running); | 2093 | p->sched_class->switched_to(rq, p); |
| 2034 | } else | 2094 | } else if (oldprio != p->prio) |
| 2035 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2095 | p->sched_class->prio_changed(rq, p, oldprio); |
| 2036 | } | 2096 | } |
| 2037 | 2097 | ||
| 2038 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 2098 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
| @@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 2224 | * yield - it could be a while. | 2284 | * yield - it could be a while. |
| 2225 | */ | 2285 | */ |
| 2226 | if (unlikely(on_rq)) { | 2286 | if (unlikely(on_rq)) { |
| 2227 | schedule_timeout_uninterruptible(1); | 2287 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
| 2288 | |||
| 2289 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 2290 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
| 2228 | continue; | 2291 | continue; |
| 2229 | } | 2292 | } |
| 2230 | 2293 | ||
| @@ -2246,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 2246 | * Cause a process which is running on another CPU to enter | 2309 | * Cause a process which is running on another CPU to enter |
| 2247 | * kernel-mode, without any delay. (to get signals handled.) | 2310 | * kernel-mode, without any delay. (to get signals handled.) |
| 2248 | * | 2311 | * |
| 2249 | * NOTE: this function doesnt have to take the runqueue lock, | 2312 | * NOTE: this function doesn't have to take the runqueue lock, |
| 2250 | * because all it wants to ensure is that the remote task enters | 2313 | * because all it wants to ensure is that the remote task enters |
| 2251 | * the kernel. If the IPI races and the task has been migrated | 2314 | * the kernel. If the IPI races and the task has been migrated |
| 2252 | * to another CPU then no harm is done and the purpose has been | 2315 | * to another CPU then no harm is done and the purpose has been |
| @@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p) | |||
| 2265 | EXPORT_SYMBOL_GPL(kick_process); | 2328 | EXPORT_SYMBOL_GPL(kick_process); |
| 2266 | #endif /* CONFIG_SMP */ | 2329 | #endif /* CONFIG_SMP */ |
| 2267 | 2330 | ||
| 2268 | /** | ||
| 2269 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
| 2270 | * @p: the task to evaluate | ||
| 2271 | * @func: the function to be called | ||
| 2272 | * @info: the function call argument | ||
| 2273 | * | ||
| 2274 | * Calls the function @func when the task is currently running. This might | ||
| 2275 | * be on the current CPU, which just calls the function directly | ||
| 2276 | */ | ||
| 2277 | void task_oncpu_function_call(struct task_struct *p, | ||
| 2278 | void (*func) (void *info), void *info) | ||
| 2279 | { | ||
| 2280 | int cpu; | ||
| 2281 | |||
| 2282 | preempt_disable(); | ||
| 2283 | cpu = task_cpu(p); | ||
| 2284 | if (task_curr(p)) | ||
| 2285 | smp_call_function_single(cpu, func, info, 1); | ||
| 2286 | preempt_enable(); | ||
| 2287 | } | ||
| 2288 | |||
| 2289 | #ifdef CONFIG_SMP | 2331 | #ifdef CONFIG_SMP |
| 2290 | /* | 2332 | /* |
| 2291 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2333 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. |
| @@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p) | |||
| 2566 | p->se.sum_exec_runtime = 0; | 2608 | p->se.sum_exec_runtime = 0; |
| 2567 | p->se.prev_sum_exec_runtime = 0; | 2609 | p->se.prev_sum_exec_runtime = 0; |
| 2568 | p->se.nr_migrations = 0; | 2610 | p->se.nr_migrations = 0; |
| 2611 | p->se.vruntime = 0; | ||
| 2569 | 2612 | ||
| 2570 | #ifdef CONFIG_SCHEDSTATS | 2613 | #ifdef CONFIG_SCHEDSTATS |
| 2571 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2614 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
| @@ -2776,9 +2819,12 @@ static inline void | |||
| 2776 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2819 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
| 2777 | struct task_struct *next) | 2820 | struct task_struct *next) |
| 2778 | { | 2821 | { |
| 2822 | sched_info_switch(prev, next); | ||
| 2823 | perf_event_task_sched_out(prev, next); | ||
| 2779 | fire_sched_out_preempt_notifiers(prev, next); | 2824 | fire_sched_out_preempt_notifiers(prev, next); |
| 2780 | prepare_lock_switch(rq, next); | 2825 | prepare_lock_switch(rq, next); |
| 2781 | prepare_arch_switch(next); | 2826 | prepare_arch_switch(next); |
| 2827 | trace_sched_switch(prev, next); | ||
| 2782 | } | 2828 | } |
| 2783 | 2829 | ||
| 2784 | /** | 2830 | /** |
| @@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2911 | struct mm_struct *mm, *oldmm; | 2957 | struct mm_struct *mm, *oldmm; |
| 2912 | 2958 | ||
| 2913 | prepare_task_switch(rq, prev, next); | 2959 | prepare_task_switch(rq, prev, next); |
| 2914 | trace_sched_switch(prev, next); | 2960 | |
| 2915 | mm = next->mm; | 2961 | mm = next->mm; |
| 2916 | oldmm = prev->active_mm; | 2962 | oldmm = prev->active_mm; |
| 2917 | /* | 2963 | /* |
| @@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
| 3568 | } | 3614 | } |
| 3569 | 3615 | ||
| 3570 | /* | 3616 | /* |
| 3617 | * Account system cpu time to a process and desired cpustat field | ||
| 3618 | * @p: the process that the cpu time gets accounted to | ||
| 3619 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 3620 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 3621 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
| 3622 | */ | ||
| 3623 | static inline | ||
| 3624 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
| 3625 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
| 3626 | { | ||
| 3627 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
| 3628 | |||
| 3629 | /* Add system time to process. */ | ||
| 3630 | p->stime = cputime_add(p->stime, cputime); | ||
| 3631 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
| 3632 | account_group_system_time(p, cputime); | ||
| 3633 | |||
| 3634 | /* Add system time to cpustat. */ | ||
| 3635 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
| 3636 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
| 3637 | |||
| 3638 | /* Account for system time used */ | ||
| 3639 | acct_update_integrals(p); | ||
| 3640 | } | ||
| 3641 | |||
| 3642 | /* | ||
| 3571 | * Account system cpu time to a process. | 3643 | * Account system cpu time to a process. |
| 3572 | * @p: the process that the cpu time gets accounted to | 3644 | * @p: the process that the cpu time gets accounted to |
| 3573 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3645 | * @hardirq_offset: the offset to subtract from hardirq_count() |
| @@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3578 | cputime_t cputime, cputime_t cputime_scaled) | 3650 | cputime_t cputime, cputime_t cputime_scaled) |
| 3579 | { | 3651 | { |
| 3580 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3652 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 3581 | cputime64_t tmp; | 3653 | cputime64_t *target_cputime64; |
| 3582 | 3654 | ||
| 3583 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3655 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
| 3584 | account_guest_time(p, cputime, cputime_scaled); | 3656 | account_guest_time(p, cputime, cputime_scaled); |
| 3585 | return; | 3657 | return; |
| 3586 | } | 3658 | } |
| 3587 | 3659 | ||
| 3588 | /* Add system time to process. */ | ||
| 3589 | p->stime = cputime_add(p->stime, cputime); | ||
| 3590 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
| 3591 | account_group_system_time(p, cputime); | ||
| 3592 | |||
| 3593 | /* Add system time to cpustat. */ | ||
| 3594 | tmp = cputime_to_cputime64(cputime); | ||
| 3595 | if (hardirq_count() - hardirq_offset) | 3660 | if (hardirq_count() - hardirq_offset) |
| 3596 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3661 | target_cputime64 = &cpustat->irq; |
| 3597 | else if (in_serving_softirq()) | 3662 | else if (in_serving_softirq()) |
| 3598 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3663 | target_cputime64 = &cpustat->softirq; |
| 3599 | else | 3664 | else |
| 3600 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3665 | target_cputime64 = &cpustat->system; |
| 3601 | |||
| 3602 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
| 3603 | 3666 | ||
| 3604 | /* Account for system time used */ | 3667 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
| 3605 | acct_update_integrals(p); | ||
| 3606 | } | 3668 | } |
| 3607 | 3669 | ||
| 3608 | /* | 3670 | /* |
| 3609 | * Account for involuntary wait time. | 3671 | * Account for involuntary wait time. |
| 3610 | * @steal: the cpu time spent in involuntary wait | 3672 | * @cputime: the cpu time spent in involuntary wait |
| 3611 | */ | 3673 | */ |
| 3612 | void account_steal_time(cputime_t cputime) | 3674 | void account_steal_time(cputime_t cputime) |
| 3613 | { | 3675 | { |
| @@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime) | |||
| 3635 | 3697 | ||
| 3636 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3698 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
| 3637 | 3699 | ||
| 3700 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 3701 | /* | ||
| 3702 | * Account a tick to a process and cpustat | ||
| 3703 | * @p: the process that the cpu time gets accounted to | ||
| 3704 | * @user_tick: whether the tick is from userspace | ||
| 3705 | * @rq: the pointer to rq | ||
| 3706 | * | ||
| 3707 | * Tick demultiplexing follows the order | ||
| 3708 | * - pending hardirq update | ||
| 3709 | * - pending softirq update | ||
| 3710 | * - user_time | ||
| 3711 | * - idle_time | ||
| 3712 | * - system time | ||
| 3713 | * - check for guest_time | ||
| 3714 | * - else account as system_time | ||
| 3715 | * | ||
| 3716 | * The check for hardirq is done for both system and user time, as no | ||
| 3717 | * timer goes off while we are in a hardirq and hence we may never get | ||
| 3718 | * an opportunity to account it solely as system time. | ||
| 3719 | * p->stime and friends are only updated on system time, not on irq or | ||
| 3720 | * softirq time, as those no longer count in task exec_runtime. | ||
| 3721 | */ | ||
| 3722 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3723 | struct rq *rq) | ||
| 3724 | { | ||
| 3725 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 3726 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
| 3727 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
| 3728 | |||
| 3729 | if (irqtime_account_hi_update()) { | ||
| 3730 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
| 3731 | } else if (irqtime_account_si_update()) { | ||
| 3732 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
| 3733 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 3734 | /* | ||
| 3735 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
| 3736 | * So, we have to handle it separately here. | ||
| 3737 | * Also, p->stime needs to be updated for ksoftirqd. | ||
| 3738 | */ | ||
| 3739 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3740 | &cpustat->softirq); | ||
| 3741 | } else if (user_tick) { | ||
| 3742 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3743 | } else if (p == rq->idle) { | ||
| 3744 | account_idle_time(cputime_one_jiffy); | ||
| 3745 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
| 3746 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3747 | } else { | ||
| 3748 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3749 | &cpustat->system); | ||
| 3750 | } | ||
| 3751 | } | ||
| 3752 | |||
| 3753 | static void irqtime_account_idle_ticks(int ticks) | ||
| 3754 | { | ||
| 3755 | int i; | ||
| 3756 | struct rq *rq = this_rq(); | ||
| 3757 | |||
| 3758 | for (i = 0; i < ticks; i++) | ||
| 3759 | irqtime_account_process_tick(current, 0, rq); | ||
| 3760 | } | ||
| 3761 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3762 | static void irqtime_account_idle_ticks(int ticks) {} | ||
| 3763 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3764 | struct rq *rq) {} | ||
| 3765 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3766 | |||
| 3638 | /* | 3767 | /* |
| 3639 | * Account a single tick of cpu time. | 3768 | * Account a single tick of cpu time. |
| 3640 | * @p: the process that the cpu time gets accounted to | 3769 | * @p: the process that the cpu time gets accounted to |
| @@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 3645 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3774 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
| 3646 | struct rq *rq = this_rq(); | 3775 | struct rq *rq = this_rq(); |
| 3647 | 3776 | ||
| 3777 | if (sched_clock_irqtime) { | ||
| 3778 | irqtime_account_process_tick(p, user_tick, rq); | ||
| 3779 | return; | ||
| 3780 | } | ||
| 3781 | |||
| 3648 | if (user_tick) | 3782 | if (user_tick) |
| 3649 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3783 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
| 3650 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 3784 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
| @@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks) | |||
| 3670 | */ | 3804 | */ |
| 3671 | void account_idle_ticks(unsigned long ticks) | 3805 | void account_idle_ticks(unsigned long ticks) |
| 3672 | { | 3806 | { |
| 3807 | |||
| 3808 | if (sched_clock_irqtime) { | ||
| 3809 | irqtime_account_idle_ticks(ticks); | ||
| 3810 | return; | ||
| 3811 | } | ||
| 3812 | |||
| 3673 | account_idle_time(jiffies_to_cputime(ticks)); | 3813 | account_idle_time(jiffies_to_cputime(ticks)); |
| 3674 | } | 3814 | } |
| 3675 | 3815 | ||
| @@ -3945,9 +4085,6 @@ need_resched: | |||
| 3945 | rcu_note_context_switch(cpu); | 4085 | rcu_note_context_switch(cpu); |
| 3946 | prev = rq->curr; | 4086 | prev = rq->curr; |
| 3947 | 4087 | ||
| 3948 | release_kernel_lock(prev); | ||
| 3949 | need_resched_nonpreemptible: | ||
| 3950 | |||
| 3951 | schedule_debug(prev); | 4088 | schedule_debug(prev); |
| 3952 | 4089 | ||
| 3953 | if (sched_feat(HRTICK)) | 4090 | if (sched_feat(HRTICK)) |
| @@ -3974,6 +4111,16 @@ need_resched_nonpreemptible: | |||
| 3974 | try_to_wake_up_local(to_wakeup); | 4111 | try_to_wake_up_local(to_wakeup); |
| 3975 | } | 4112 | } |
| 3976 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4113 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
| 4114 | |||
| 4115 | /* | ||
| 4116 | * If we are going to sleep and we have plugged IO queued, make | ||
| 4117 | * sure to submit it to avoid deadlocks. | ||
| 4118 | */ | ||
| 4119 | if (blk_needs_flush_plug(prev)) { | ||
| 4120 | raw_spin_unlock(&rq->lock); | ||
| 4121 | blk_schedule_flush_plug(prev); | ||
| 4122 | raw_spin_lock(&rq->lock); | ||
| 4123 | } | ||
| 3977 | } | 4124 | } |
| 3978 | switch_count = &prev->nvcsw; | 4125 | switch_count = &prev->nvcsw; |
| 3979 | } | 4126 | } |
| @@ -3989,9 +4136,6 @@ need_resched_nonpreemptible: | |||
| 3989 | rq->skip_clock_update = 0; | 4136 | rq->skip_clock_update = 0; |
| 3990 | 4137 | ||
| 3991 | if (likely(prev != next)) { | 4138 | if (likely(prev != next)) { |
| 3992 | sched_info_switch(prev, next); | ||
| 3993 | perf_event_task_sched_out(prev, next); | ||
| 3994 | |||
| 3995 | rq->nr_switches++; | 4139 | rq->nr_switches++; |
| 3996 | rq->curr = next; | 4140 | rq->curr = next; |
| 3997 | ++*switch_count; | 4141 | ++*switch_count; |
| @@ -4010,9 +4154,6 @@ need_resched_nonpreemptible: | |||
| 4010 | 4154 | ||
| 4011 | post_schedule(rq); | 4155 | post_schedule(rq); |
| 4012 | 4156 | ||
| 4013 | if (unlikely(reacquire_kernel_lock(prev))) | ||
| 4014 | goto need_resched_nonpreemptible; | ||
| 4015 | |||
| 4016 | preempt_enable_no_resched(); | 4157 | preempt_enable_no_resched(); |
| 4017 | if (need_resched()) | 4158 | if (need_resched()) |
| 4018 | goto need_resched; | 4159 | goto need_resched; |
| @@ -4213,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
| 4213 | { | 4354 | { |
| 4214 | __wake_up_common(q, mode, 1, 0, key); | 4355 | __wake_up_common(q, mode, 1, 0, key); |
| 4215 | } | 4356 | } |
| 4357 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
| 4216 | 4358 | ||
| 4217 | /** | 4359 | /** |
| 4218 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4360 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
| @@ -4570,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4570 | 4712 | ||
| 4571 | if (running) | 4713 | if (running) |
| 4572 | p->sched_class->set_curr_task(rq); | 4714 | p->sched_class->set_curr_task(rq); |
| 4573 | if (on_rq) { | 4715 | if (on_rq) |
| 4574 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4716 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
| 4575 | 4717 | ||
| 4576 | check_class_changed(rq, p, prev_class, oldprio, running); | 4718 | check_class_changed(rq, p, prev_class, oldprio); |
| 4577 | } | ||
| 4578 | task_rq_unlock(rq, &flags); | 4719 | task_rq_unlock(rq, &flags); |
| 4579 | } | 4720 | } |
| 4580 | 4721 | ||
| @@ -4761,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p) | |||
| 4761 | 4902 | ||
| 4762 | rcu_read_lock(); | 4903 | rcu_read_lock(); |
| 4763 | pcred = __task_cred(p); | 4904 | pcred = __task_cred(p); |
| 4764 | match = (cred->euid == pcred->euid || | 4905 | if (cred->user->user_ns == pcred->user->user_ns) |
| 4765 | cred->euid == pcred->uid); | 4906 | match = (cred->euid == pcred->euid || |
| 4907 | cred->euid == pcred->uid); | ||
| 4908 | else | ||
| 4909 | match = false; | ||
| 4766 | rcu_read_unlock(); | 4910 | rcu_read_unlock(); |
| 4767 | return match; | 4911 | return match; |
| 4768 | } | 4912 | } |
| @@ -4822,12 +4966,15 @@ recheck: | |||
| 4822 | param->sched_priority > rlim_rtprio) | 4966 | param->sched_priority > rlim_rtprio) |
| 4823 | return -EPERM; | 4967 | return -EPERM; |
| 4824 | } | 4968 | } |
| 4969 | |||
| 4825 | /* | 4970 | /* |
| 4826 | * Like positive nice levels, dont allow tasks to | 4971 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
| 4827 | * move out of SCHED_IDLE either: | 4972 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
| 4828 | */ | 4973 | */ |
| 4829 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 4974 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
| 4830 | return -EPERM; | 4975 | if (!can_nice(p, TASK_NICE(p))) |
| 4976 | return -EPERM; | ||
| 4977 | } | ||
| 4831 | 4978 | ||
| 4832 | /* can't change other user's priorities */ | 4979 | /* can't change other user's priorities */ |
| 4833 | if (!check_same_owner(p)) | 4980 | if (!check_same_owner(p)) |
| @@ -4850,7 +4997,7 @@ recheck: | |||
| 4850 | */ | 4997 | */ |
| 4851 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 4998 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 4852 | /* | 4999 | /* |
| 4853 | * To be able to change p->policy safely, the apropriate | 5000 | * To be able to change p->policy safely, the appropriate |
| 4854 | * runqueue lock must be held. | 5001 | * runqueue lock must be held. |
| 4855 | */ | 5002 | */ |
| 4856 | rq = __task_rq_lock(p); | 5003 | rq = __task_rq_lock(p); |
| @@ -4864,6 +5011,17 @@ recheck: | |||
| 4864 | return -EINVAL; | 5011 | return -EINVAL; |
| 4865 | } | 5012 | } |
| 4866 | 5013 | ||
| 5014 | /* | ||
| 5015 | * If not changing anything there's no need to proceed further: | ||
| 5016 | */ | ||
| 5017 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | ||
| 5018 | param->sched_priority == p->rt_priority))) { | ||
| 5019 | |||
| 5020 | __task_rq_unlock(rq); | ||
| 5021 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 5022 | return 0; | ||
| 5023 | } | ||
| 5024 | |||
| 4867 | #ifdef CONFIG_RT_GROUP_SCHED | 5025 | #ifdef CONFIG_RT_GROUP_SCHED |
| 4868 | if (user) { | 5026 | if (user) { |
| 4869 | /* | 5027 | /* |
| @@ -4902,11 +5060,10 @@ recheck: | |||
| 4902 | 5060 | ||
| 4903 | if (running) | 5061 | if (running) |
| 4904 | p->sched_class->set_curr_task(rq); | 5062 | p->sched_class->set_curr_task(rq); |
| 4905 | if (on_rq) { | 5063 | if (on_rq) |
| 4906 | activate_task(rq, p, 0); | 5064 | activate_task(rq, p, 0); |
| 4907 | 5065 | ||
| 4908 | check_class_changed(rq, p, prev_class, oldprio, running); | 5066 | check_class_changed(rq, p, prev_class, oldprio); |
| 4909 | } | ||
| 4910 | __task_rq_unlock(rq); | 5067 | __task_rq_unlock(rq); |
| 4911 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 4912 | 5069 | ||
| @@ -5088,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 5088 | goto out_free_cpus_allowed; | 5245 | goto out_free_cpus_allowed; |
| 5089 | } | 5246 | } |
| 5090 | retval = -EPERM; | 5247 | retval = -EPERM; |
| 5091 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5248 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
| 5092 | goto out_unlock; | 5249 | goto out_unlock; |
| 5093 | 5250 | ||
| 5094 | retval = security_task_setscheduler(p); | 5251 | retval = security_task_setscheduler(p); |
| @@ -5323,6 +5480,67 @@ void __sched yield(void) | |||
| 5323 | } | 5480 | } |
| 5324 | EXPORT_SYMBOL(yield); | 5481 | EXPORT_SYMBOL(yield); |
| 5325 | 5482 | ||
| 5483 | /** | ||
| 5484 | * yield_to - yield the current processor to another thread in | ||
| 5485 | * your thread group, or accelerate that thread toward the | ||
| 5486 | * processor it's on. | ||
| 5487 | * @p: target task | ||
| 5488 | * @preempt: whether task preemption is allowed or not | ||
| 5489 | * | ||
| 5490 | * It's the caller's job to ensure that the target task struct | ||
| 5491 | * can't go away on us before we can do any checks. | ||
| 5492 | * | ||
| 5493 | * Returns true if we indeed boosted the target task. | ||
| 5494 | */ | ||
| 5495 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
| 5496 | { | ||
| 5497 | struct task_struct *curr = current; | ||
| 5498 | struct rq *rq, *p_rq; | ||
| 5499 | unsigned long flags; | ||
| 5500 | bool yielded = 0; | ||
| 5501 | |||
| 5502 | local_irq_save(flags); | ||
| 5503 | rq = this_rq(); | ||
| 5504 | |||
| 5505 | again: | ||
| 5506 | p_rq = task_rq(p); | ||
| 5507 | double_rq_lock(rq, p_rq); | ||
| 5508 | while (task_rq(p) != p_rq) { | ||
| 5509 | double_rq_unlock(rq, p_rq); | ||
| 5510 | goto again; | ||
| 5511 | } | ||
| 5512 | |||
| 5513 | if (!curr->sched_class->yield_to_task) | ||
| 5514 | goto out; | ||
| 5515 | |||
| 5516 | if (curr->sched_class != p->sched_class) | ||
| 5517 | goto out; | ||
| 5518 | |||
| 5519 | if (task_running(p_rq, p) || p->state) | ||
| 5520 | goto out; | ||
| 5521 | |||
| 5522 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
| 5523 | if (yielded) { | ||
| 5524 | schedstat_inc(rq, yld_count); | ||
| 5525 | /* | ||
| 5526 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
| 5527 | * fairness. | ||
| 5528 | */ | ||
| 5529 | if (preempt && rq != p_rq) | ||
| 5530 | resched_task(p_rq->curr); | ||
| 5531 | } | ||
| 5532 | |||
| 5533 | out: | ||
| 5534 | double_rq_unlock(rq, p_rq); | ||
| 5535 | local_irq_restore(flags); | ||
| 5536 | |||
| 5537 | if (yielded) | ||
| 5538 | schedule(); | ||
| 5539 | |||
| 5540 | return yielded; | ||
| 5541 | } | ||
| 5542 | EXPORT_SYMBOL_GPL(yield_to); | ||
| 5543 | |||
| 5326 | /* | 5544 | /* |
| 5327 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5545 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
| 5328 | * that process accounting knows that this is a task in IO wait state. | 5546 | * that process accounting knows that this is a task in IO wait state. |
| @@ -5333,6 +5551,7 @@ void __sched io_schedule(void) | |||
| 5333 | 5551 | ||
| 5334 | delayacct_blkio_start(); | 5552 | delayacct_blkio_start(); |
| 5335 | atomic_inc(&rq->nr_iowait); | 5553 | atomic_inc(&rq->nr_iowait); |
| 5554 | blk_flush_plug(current); | ||
| 5336 | current->in_iowait = 1; | 5555 | current->in_iowait = 1; |
| 5337 | schedule(); | 5556 | schedule(); |
| 5338 | current->in_iowait = 0; | 5557 | current->in_iowait = 0; |
| @@ -5348,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout) | |||
| 5348 | 5567 | ||
| 5349 | delayacct_blkio_start(); | 5568 | delayacct_blkio_start(); |
| 5350 | atomic_inc(&rq->nr_iowait); | 5569 | atomic_inc(&rq->nr_iowait); |
| 5570 | blk_flush_plug(current); | ||
| 5351 | current->in_iowait = 1; | 5571 | current->in_iowait = 1; |
| 5352 | ret = schedule_timeout(timeout); | 5572 | ret = schedule_timeout(timeout); |
| 5353 | current->in_iowait = 0; | 5573 | current->in_iowait = 0; |
| @@ -5496,7 +5716,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 5496 | do_each_thread(g, p) { | 5716 | do_each_thread(g, p) { |
| 5497 | /* | 5717 | /* |
| 5498 | * reset the NMI-timeout, listing all files on a slow | 5718 | * reset the NMI-timeout, listing all files on a slow |
| 5499 | * console might take alot of time: | 5719 | * console might take a lot of time: |
| 5500 | */ | 5720 | */ |
| 5501 | touch_nmi_watchdog(); | 5721 | touch_nmi_watchdog(); |
| 5502 | if (!state_filter || (p->state & state_filter)) | 5722 | if (!state_filter || (p->state & state_filter)) |
| @@ -5571,7 +5791,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5571 | * The idle tasks have their own, simple scheduling class: | 5791 | * The idle tasks have their own, simple scheduling class: |
| 5572 | */ | 5792 | */ |
| 5573 | idle->sched_class = &idle_sched_class; | 5793 | idle->sched_class = &idle_sched_class; |
| 5574 | ftrace_graph_init_task(idle); | 5794 | ftrace_graph_init_idle_task(idle, cpu); |
| 5575 | } | 5795 | } |
| 5576 | 5796 | ||
| 5577 | /* | 5797 | /* |
| @@ -6111,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6111 | break; | 6331 | break; |
| 6112 | #endif | 6332 | #endif |
| 6113 | } | 6333 | } |
| 6334 | |||
| 6335 | update_max_interval(); | ||
| 6336 | |||
| 6114 | return NOTIFY_OK; | 6337 | return NOTIFY_OK; |
| 6115 | } | 6338 | } |
| 6116 | 6339 | ||
| @@ -7796,6 +8019,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
| 7796 | INIT_LIST_HEAD(&cfs_rq->tasks); | 8019 | INIT_LIST_HEAD(&cfs_rq->tasks); |
| 7797 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8020 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7798 | cfs_rq->rq = rq; | 8021 | cfs_rq->rq = rq; |
| 8022 | /* allow initial update_cfs_load() to truncate */ | ||
| 8023 | #ifdef CONFIG_SMP | ||
| 8024 | cfs_rq->load_stamp = 1; | ||
| 8025 | #endif | ||
| 7799 | #endif | 8026 | #endif |
| 7800 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8027 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
| 7801 | } | 8028 | } |
| @@ -8074,7 +8301,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
| 8074 | { | 8301 | { |
| 8075 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8302 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
| 8076 | 8303 | ||
| 8077 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 8304 | return (nested == preempt_offset); |
| 8078 | } | 8305 | } |
| 8079 | 8306 | ||
| 8080 | void __might_sleep(const char *file, int line, int preempt_offset) | 8307 | void __might_sleep(const char *file, int line, int preempt_offset) |
| @@ -8109,6 +8336,8 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 8109 | #ifdef CONFIG_MAGIC_SYSRQ | 8336 | #ifdef CONFIG_MAGIC_SYSRQ |
| 8110 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8337 | static void normalize_task(struct rq *rq, struct task_struct *p) |
| 8111 | { | 8338 | { |
| 8339 | const struct sched_class *prev_class = p->sched_class; | ||
| 8340 | int old_prio = p->prio; | ||
| 8112 | int on_rq; | 8341 | int on_rq; |
| 8113 | 8342 | ||
| 8114 | on_rq = p->se.on_rq; | 8343 | on_rq = p->se.on_rq; |
| @@ -8119,6 +8348,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
| 8119 | activate_task(rq, p, 0); | 8348 | activate_task(rq, p, 0); |
| 8120 | resched_task(rq->curr); | 8349 | resched_task(rq->curr); |
| 8121 | } | 8350 | } |
| 8351 | |||
| 8352 | check_class_changed(rq, p, prev_class, old_prio); | ||
| 8122 | } | 8353 | } |
| 8123 | 8354 | ||
| 8124 | void normalize_rt_tasks(void) | 8355 | void normalize_rt_tasks(void) |
| @@ -8234,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8234 | { | 8465 | { |
| 8235 | struct cfs_rq *cfs_rq; | 8466 | struct cfs_rq *cfs_rq; |
| 8236 | struct sched_entity *se; | 8467 | struct sched_entity *se; |
| 8237 | struct rq *rq; | ||
| 8238 | int i; | 8468 | int i; |
| 8239 | 8469 | ||
| 8240 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8470 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
| @@ -8247,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8247 | tg->shares = NICE_0_LOAD; | 8477 | tg->shares = NICE_0_LOAD; |
| 8248 | 8478 | ||
| 8249 | for_each_possible_cpu(i) { | 8479 | for_each_possible_cpu(i) { |
| 8250 | rq = cpu_rq(i); | ||
| 8251 | |||
| 8252 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8480 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
| 8253 | GFP_KERNEL, cpu_to_node(i)); | 8481 | GFP_KERNEL, cpu_to_node(i)); |
| 8254 | if (!cfs_rq) | 8482 | if (!cfs_rq) |
| @@ -8510,7 +8738,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 8510 | /* Propagate contribution to hierarchy */ | 8738 | /* Propagate contribution to hierarchy */ |
| 8511 | raw_spin_lock_irqsave(&rq->lock, flags); | 8739 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 8512 | for_each_sched_entity(se) | 8740 | for_each_sched_entity(se) |
| 8513 | update_cfs_shares(group_cfs_rq(se), 0); | 8741 | update_cfs_shares(group_cfs_rq(se)); |
| 8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8742 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 8515 | } | 8743 | } |
| 8516 | 8744 | ||
| @@ -8884,7 +9112,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 8884 | } | 9112 | } |
| 8885 | 9113 | ||
| 8886 | static void | 9114 | static void |
| 8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | 9115 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
| 9116 | struct cgroup *old_cgrp, struct task_struct *task) | ||
| 8888 | { | 9117 | { |
| 8889 | /* | 9118 | /* |
| 8890 | * cgroup_exit() is called in the copy_process() failure path. | 9119 | * cgroup_exit() is called in the copy_process() failure path. |
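yield_to(), added above and exported GPL-only, lets a module hand the remainder of its slice to a specific thread (virtualization being the motivating user, to boost a vcpu that holds a contended lock). The sketch below shows the caller's side of the contract stated in its kernel-doc: keep the task_struct pinned, then call yield_to(task, preempt). find_candidate_task() and the surrounding policy are hypothetical; only the yield_to() call and the pin/unpin helpers are real interfaces, and the declaration of yield_to() is assumed to be made available via linux/sched.h alongside this change.

/*
 * Caller-side sketch for yield_to().  find_candidate_task() is
 * hypothetical; pinning via get_task_struct()/put_task_struct() and the
 * yield_to(task, preempt) call reflect the contract documented above.
 */
#include <linux/sched.h>
#include <linux/rcupdate.h>

extern struct task_struct *find_candidate_task(void);  /* hypothetical */

static bool boost_candidate(void)
{
        struct task_struct *task;
        bool yielded = false;

        rcu_read_lock();
        task = find_candidate_task();   /* e.g. the presumed lock holder */
        if (task)
                get_task_struct(task);  /* keep it from going away */
        rcu_read_unlock();

        if (!task)
                return false;

        yielded = yield_to(task, false);        /* don't force preemption */
        put_task_struct(task);

        return yielded;
}

A false return simply means the target could not be boosted: it was already running, not runnable, or in a different scheduling class, matching the early-out paths in the function above.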
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb656283157..429242f3c484 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c | |||
| @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; | |||
| 12 | static void __init autogroup_init(struct task_struct *init_task) | 12 | static void __init autogroup_init(struct task_struct *init_task) |
| 13 | { | 13 | { |
| 14 | autogroup_default.tg = &root_task_group; | 14 | autogroup_default.tg = &root_task_group; |
| 15 | root_task_group.autogroup = &autogroup_default; | ||
| 16 | kref_init(&autogroup_default.kref); | 15 | kref_init(&autogroup_default.kref); |
| 17 | init_rwsem(&autogroup_default.lock); | 16 | init_rwsem(&autogroup_default.lock); |
| 18 | init_task->signal->autogroup = &autogroup_default; | 17 | init_task->signal->autogroup = &autogroup_default; |
| @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
| 130 | 129 | ||
| 131 | static inline bool task_group_is_autogroup(struct task_group *tg) | 130 | static inline bool task_group_is_autogroup(struct task_group *tg) |
| 132 | { | 131 | { |
| 133 | return tg != &root_task_group && tg->autogroup; | 132 | return !!tg->autogroup; |
| 134 | } | 133 | } |
| 135 | 134 | ||
| 136 | static inline struct task_group * | 135 | static inline struct task_group * |
| @@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
| 161 | 160 | ||
| 162 | p->signal->autogroup = autogroup_kref_get(ag); | 161 | p->signal->autogroup = autogroup_kref_get(ag); |
| 163 | 162 | ||
| 163 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | ||
| 164 | goto out; | ||
| 165 | |||
| 164 | t = p; | 166 | t = p; |
| 165 | do { | 167 | do { |
| 166 | sched_move_task(t); | 168 | sched_move_task(t); |
| 167 | } while_each_thread(p, t); | 169 | } while_each_thread(p, t); |
| 168 | 170 | ||
| 171 | out: | ||
| 169 | unlock_task_sighand(p, &flags); | 172 | unlock_task_sighand(p, &flags); |
| 170 | autogroup_kref_put(prev); | 173 | autogroup_kref_put(prev); |
| 171 | } | 174 | } |
| @@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) | |||
| 176 | struct autogroup *ag = autogroup_create(); | 179 | struct autogroup *ag = autogroup_create(); |
| 177 | 180 | ||
| 178 | autogroup_move_group(p, ag); | 181 | autogroup_move_group(p, ag); |
| 179 | /* drop extra refrence added by autogroup_create() */ | 182 | /* drop extra reference added by autogroup_create() */ |
| 180 | autogroup_kref_put(ag); | 183 | autogroup_kref_put(ag); |
| 181 | } | 184 | } |
| 182 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 185 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
| @@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
| 247 | { | 250 | { |
| 248 | struct autogroup *ag = autogroup_task_get(p); | 251 | struct autogroup *ag = autogroup_task_get(p); |
| 249 | 252 | ||
| 253 | if (!task_group_is_autogroup(ag->tg)) | ||
| 254 | goto out; | ||
| 255 | |||
| 250 | down_read(&ag->lock); | 256 | down_read(&ag->lock); |
| 251 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | 257 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); |
| 252 | up_read(&ag->lock); | 258 | up_read(&ag->lock); |
| 253 | 259 | ||
| 260 | out: | ||
| 254 | autogroup_kref_put(ag); | 261 | autogroup_kref_put(ag); |
| 255 | } | 262 | } |
| 256 | #endif /* CONFIG_PROC_FS */ | 263 | #endif /* CONFIG_PROC_FS */ |
| @@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
| 258 | #ifdef CONFIG_SCHED_DEBUG | 265 | #ifdef CONFIG_SCHED_DEBUG |
| 259 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
| 260 | { | 267 | { |
| 261 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | 268 | if (!task_group_is_autogroup(tg)) |
| 262 | |||
| 263 | if (!enabled || !tg->autogroup) | ||
| 264 | return 0; | 269 | return 0; |
| 265 | 270 | ||
| 266 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 271 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
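The autogroup changes keep the existing kref discipline: autogroup_create() hands back an extra reference, sched_autogroup_create_attach() drops it once the group is attached, and every temporary user goes through autogroup_task_get()/autogroup_kref_put(). The toy below restates that ownership pattern with a plain counter instead of struct kref, purely to make the reference flow explicit; nothing in it is the kernel's implementation.

#include <stdio.h>
#include <stdlib.h>

/* Toy restatement of the autogroup reference flow: creation hands back
 * an extra reference, attaching consumes it, every other user takes and
 * drops its own.  A plain counter stands in for struct kref. */
struct toy_autogroup {
        int refs;
        long id;
};

static struct toy_autogroup *toy_autogroup_create(long id)
{
        struct toy_autogroup *ag = malloc(sizeof(*ag));

        if (!ag)
                abort();
        ag->refs = 1;           /* the "extra" reference the creator holds */
        ag->id = id;
        return ag;
}

static struct toy_autogroup *toy_get(struct toy_autogroup *ag)
{
        ag->refs++;
        return ag;
}

static void toy_put(struct toy_autogroup *ag)
{
        if (--ag->refs == 0) {
                printf("autogroup-%ld destroyed\n", ag->id);
                free(ag);
        }
}

int main(void)
{
        struct toy_autogroup *ag = toy_autogroup_create(1);
        struct toy_autogroup *attached = toy_get(ag);   /* p->signal->autogroup */

        toy_put(ag);            /* drop the creator's extra reference */
        toy_put(attached);      /* when the signal struct finally lets go */
        return 0;
}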
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5dad..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h | |||
| @@ -1,6 +1,11 @@ | |||
| 1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 2 | 2 | ||
| 3 | struct autogroup { | 3 | struct autogroup { |
| 4 | /* | ||
| 5 | * The reference count does not track how many threads are | ||
| 6 | * currently attached to this autogroup; it counts how many | ||
| 7 | * tasks could still use this autogroup. | ||
| 8 | */ | ||
| 4 | struct kref kref; | 9 | struct kref kref; |
| 5 | struct task_group *tg; | 10 | struct task_group *tg; |
| 6 | struct rw_semaphore lock; | 11 | struct rw_semaphore lock; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd075..7bacd83a4158 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 179 | 179 | ||
| 180 | raw_spin_lock_irqsave(&rq->lock, flags); | 180 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 181 | if (cfs_rq->rb_leftmost) | 181 | if (cfs_rq->rb_leftmost) |
| 182 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | 182 | MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; |
| 183 | last = __pick_last_entity(cfs_rq); | 183 | last = __pick_last_entity(cfs_rq); |
| 184 | if (last) | 184 | if (last) |
| 185 | max_vruntime = last->vruntime; | 185 | max_vruntime = last->vruntime; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..6fa833ab2cb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | 22 | ||
| 23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
| 24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 25 | #include <linux/cpumask.h> | ||
| 25 | 26 | ||
| 26 | /* | 27 | /* |
| 27 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
| @@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8; | |||
| 69 | unsigned int sysctl_sched_child_runs_first __read_mostly; | 70 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
| 70 | 71 | ||
| 71 | /* | 72 | /* |
| 72 | * sys_sched_yield() compat mode | ||
| 73 | * | ||
| 74 | * This option switches the agressive yield implementation of the | ||
| 75 | * old scheduler back on. | ||
| 76 | */ | ||
| 77 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
| 78 | |||
| 79 | /* | ||
| 80 | * SCHED_OTHER wake-up granularity. | 73 | * SCHED_OTHER wake-up granularity. |
| 81 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 74 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 82 | * | 75 | * |
| @@ -419,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 419 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 412 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
| 420 | } | 413 | } |
| 421 | 414 | ||
| 422 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 415 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
| 423 | { | 416 | { |
| 424 | struct rb_node *left = cfs_rq->rb_leftmost; | 417 | struct rb_node *left = cfs_rq->rb_leftmost; |
| 425 | 418 | ||
| @@ -429,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 429 | return rb_entry(left, struct sched_entity, run_node); | 422 | return rb_entry(left, struct sched_entity, run_node); |
| 430 | } | 423 | } |
| 431 | 424 | ||
| 425 | static struct sched_entity *__pick_next_entity(struct sched_entity *se) | ||
| 426 | { | ||
| 427 | struct rb_node *next = rb_next(&se->run_node); | ||
| 428 | |||
| 429 | if (!next) | ||
| 430 | return NULL; | ||
| 431 | |||
| 432 | return rb_entry(next, struct sched_entity, run_node); | ||
| 433 | } | ||
| 434 | |||
| 435 | #ifdef CONFIG_SCHED_DEBUG | ||
| 432 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 436 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
| 433 | { | 437 | { |
| 434 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 438 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
| @@ -443,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
| 443 | * Scheduling class statistics methods: | 447 | * Scheduling class statistics methods: |
| 444 | */ | 448 | */ |
| 445 | 449 | ||
| 446 | #ifdef CONFIG_SCHED_DEBUG | ||
| 447 | int sched_proc_update_handler(struct ctl_table *table, int write, | 450 | int sched_proc_update_handler(struct ctl_table *table, int write, |
| 448 | void __user *buffer, size_t *lenp, | 451 | void __user *buffer, size_t *lenp, |
| 449 | loff_t *ppos) | 452 | loff_t *ppos) |
| @@ -540,7 +543,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 540 | } | 543 | } |
| 541 | 544 | ||
| 542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | 545 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); |
| 543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | 546 | static void update_cfs_shares(struct cfs_rq *cfs_rq); |
| 544 | 547 | ||
| 545 | /* | 548 | /* |
| 546 | * Update the current task's runtime statistics. Skip current tasks that | 549 | * Update the current task's runtime statistics. Skip current tasks that |
| @@ -733,6 +736,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
| 733 | now - cfs_rq->load_last > 4 * period) { | 736 | now - cfs_rq->load_last > 4 * period) { |
| 734 | cfs_rq->load_period = 0; | 737 | cfs_rq->load_period = 0; |
| 735 | cfs_rq->load_avg = 0; | 738 | cfs_rq->load_avg = 0; |
| 739 | delta = period - 1; | ||
| 736 | } | 740 | } |
| 737 | 741 | ||
| 738 | cfs_rq->load_stamp = now; | 742 | cfs_rq->load_stamp = now; |
| @@ -763,16 +767,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
| 763 | list_del_leaf_cfs_rq(cfs_rq); | 767 | list_del_leaf_cfs_rq(cfs_rq); |
| 764 | } | 768 | } |
| 765 | 769 | ||
| 766 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | 770 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
| 767 | long weight_delta) | ||
| 768 | { | 771 | { |
| 769 | long load_weight, load, shares; | 772 | long load_weight, load, shares; |
| 770 | 773 | ||
| 771 | load = cfs_rq->load.weight + weight_delta; | 774 | load = cfs_rq->load.weight; |
| 772 | 775 | ||
| 773 | load_weight = atomic_read(&tg->load_weight); | 776 | load_weight = atomic_read(&tg->load_weight); |
| 774 | load_weight -= cfs_rq->load_contribution; | ||
| 775 | load_weight += load; | 777 | load_weight += load; |
| 778 | load_weight -= cfs_rq->load_contribution; | ||
| 776 | 779 | ||
| 777 | shares = (tg->shares * load); | 780 | shares = (tg->shares * load); |
| 778 | if (load_weight) | 781 | if (load_weight) |
| @@ -790,7 +793,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | |||
| 790 | { | 793 | { |
| 791 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | 794 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { |
| 792 | update_cfs_load(cfs_rq, 0); | 795 | update_cfs_load(cfs_rq, 0); |
| 793 | update_cfs_shares(cfs_rq, 0); | 796 | update_cfs_shares(cfs_rq); |
| 794 | } | 797 | } |
| 795 | } | 798 | } |
| 796 | # else /* CONFIG_SMP */ | 799 | # else /* CONFIG_SMP */ |
| @@ -798,8 +801,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
| 798 | { | 801 | { |
| 799 | } | 802 | } |
| 800 | 803 | ||
| 801 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | 804 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
| 802 | long weight_delta) | ||
| 803 | { | 805 | { |
| 804 | return tg->shares; | 806 | return tg->shares; |
| 805 | } | 807 | } |
| @@ -824,7 +826,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
| 824 | account_entity_enqueue(cfs_rq, se); | 826 | account_entity_enqueue(cfs_rq, se); |
| 825 | } | 827 | } |
| 826 | 828 | ||
| 827 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 829 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
| 828 | { | 830 | { |
| 829 | struct task_group *tg; | 831 | struct task_group *tg; |
| 830 | struct sched_entity *se; | 832 | struct sched_entity *se; |
| @@ -838,7 +840,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | |||
| 838 | if (likely(se->load.weight == tg->shares)) | 840 | if (likely(se->load.weight == tg->shares)) |
| 839 | return; | 841 | return; |
| 840 | #endif | 842 | #endif |
| 841 | shares = calc_cfs_shares(cfs_rq, tg, weight_delta); | 843 | shares = calc_cfs_shares(cfs_rq, tg); |
| 842 | 844 | ||
| 843 | reweight_entity(cfs_rq_of(se), se, shares); | 845 | reweight_entity(cfs_rq_of(se), se, shares); |
| 844 | } | 846 | } |
| @@ -847,7 +849,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
| 847 | { | 849 | { |
| 848 | } | 850 | } |
| 849 | 851 | ||
| 850 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 852 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
| 851 | { | 853 | { |
| 852 | } | 854 | } |
| 853 | 855 | ||
| @@ -978,8 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 978 | */ | 980 | */ |
| 979 | update_curr(cfs_rq); | 981 | update_curr(cfs_rq); |
| 980 | update_cfs_load(cfs_rq, 0); | 982 | update_cfs_load(cfs_rq, 0); |
| 981 | update_cfs_shares(cfs_rq, se->load.weight); | ||
| 982 | account_entity_enqueue(cfs_rq, se); | 983 | account_entity_enqueue(cfs_rq, se); |
| 984 | update_cfs_shares(cfs_rq); | ||
| 983 | 985 | ||
| 984 | if (flags & ENQUEUE_WAKEUP) { | 986 | if (flags & ENQUEUE_WAKEUP) { |
| 985 | place_entity(cfs_rq, se, 0); | 987 | place_entity(cfs_rq, se, 0); |
| @@ -996,19 +998,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 996 | list_add_leaf_cfs_rq(cfs_rq); | 998 | list_add_leaf_cfs_rq(cfs_rq); |
| 997 | } | 999 | } |
| 998 | 1000 | ||
| 999 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1001 | static void __clear_buddies_last(struct sched_entity *se) |
| 1000 | { | 1002 | { |
| 1001 | if (!se || cfs_rq->last == se) | 1003 | for_each_sched_entity(se) { |
| 1002 | cfs_rq->last = NULL; | 1004 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 1005 | if (cfs_rq->last == se) | ||
| 1006 | cfs_rq->last = NULL; | ||
| 1007 | else | ||
| 1008 | break; | ||
| 1009 | } | ||
| 1010 | } | ||
| 1003 | 1011 | ||
| 1004 | if (!se || cfs_rq->next == se) | 1012 | static void __clear_buddies_next(struct sched_entity *se) |
| 1005 | cfs_rq->next = NULL; | 1013 | { |
| 1014 | for_each_sched_entity(se) { | ||
| 1015 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 1016 | if (cfs_rq->next == se) | ||
| 1017 | cfs_rq->next = NULL; | ||
| 1018 | else | ||
| 1019 | break; | ||
| 1020 | } | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static void __clear_buddies_skip(struct sched_entity *se) | ||
| 1024 | { | ||
| 1025 | for_each_sched_entity(se) { | ||
| 1026 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 1027 | if (cfs_rq->skip == se) | ||
| 1028 | cfs_rq->skip = NULL; | ||
| 1029 | else | ||
| 1030 | break; | ||
| 1031 | } | ||
| 1006 | } | 1032 | } |
| 1007 | 1033 | ||
| 1008 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1034 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 1009 | { | 1035 | { |
| 1010 | for_each_sched_entity(se) | 1036 | if (cfs_rq->last == se) |
| 1011 | __clear_buddies(cfs_rq_of(se), se); | 1037 | __clear_buddies_last(se); |
| 1038 | |||
| 1039 | if (cfs_rq->next == se) | ||
| 1040 | __clear_buddies_next(se); | ||
| 1041 | |||
| 1042 | if (cfs_rq->skip == se) | ||
| 1043 | __clear_buddies_skip(se); | ||
| 1012 | } | 1044 | } |
| 1013 | 1045 | ||
| 1014 | static void | 1046 | static void |
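Splitting __clear_buddies() into one walker per buddy type, as in the hunk above, lets each walk climb the task-group hierarchy independently and stop at the first level whose buddy pointer is not this entity; since the set_*_buddy() helpers mark every level from the task upward, a mismatch at one level means nothing above it needs clearing. A contrived userspace model of that early-stopping walk (not part of the patch; the fixed-size array stands in for for_each_sched_entity() and the scenario is artificial):

#include <stdio.h>

#define LEVELS 3

struct entity { int id; };
struct cfs { struct entity *last; };

/* Clear the 'last' buddy upward, stopping at the first level that points elsewhere. */
static void clear_last(struct cfs q[LEVELS], struct entity *path[LEVELS])
{
        for (int i = 0; i < LEVELS; i++) {
                if (q[i].last == path[i])
                        q[i].last = NULL;
                else
                        break;  /* stop: this level is not pointing at us */
        }
}

int main(void)
{
        struct entity task = { 1 }, grp = { 2 }, root = { 3 }, other = { 4 };
        struct entity *path[LEVELS] = { &task, &grp, &root };
        struct cfs q[LEVELS] = { { &task }, { &other }, { &root } };

        clear_last(q, path);
        /* level 0 is cleared; level 1 belongs to 'other', so the walk stops there */
        printf("%d %d %d\n", q[0].last != NULL, q[1].last != NULL, q[2].last != NULL);
        return 0;
}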
| @@ -1041,7 +1073,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 1041 | update_cfs_load(cfs_rq, 0); | 1073 | update_cfs_load(cfs_rq, 0); |
| 1042 | account_entity_dequeue(cfs_rq, se); | 1074 | account_entity_dequeue(cfs_rq, se); |
| 1043 | update_min_vruntime(cfs_rq); | 1075 | update_min_vruntime(cfs_rq); |
| 1044 | update_cfs_shares(cfs_rq, 0); | 1076 | update_cfs_shares(cfs_rq); |
| 1045 | 1077 | ||
| 1046 | /* | 1078 | /* |
| 1047 | * Normalize the entity after updating the min_vruntime because the | 1079 | * Normalize the entity after updating the min_vruntime because the |
| @@ -1084,7 +1116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 1084 | return; | 1116 | return; |
| 1085 | 1117 | ||
| 1086 | if (cfs_rq->nr_running > 1) { | 1118 | if (cfs_rq->nr_running > 1) { |
| 1087 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1119 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
| 1088 | s64 delta = curr->vruntime - se->vruntime; | 1120 | s64 delta = curr->vruntime - se->vruntime; |
| 1089 | 1121 | ||
| 1090 | if (delta < 0) | 1122 | if (delta < 0) |
| @@ -1128,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1128 | static int | 1160 | static int |
| 1129 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 1161 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
| 1130 | 1162 | ||
| 1163 | /* | ||
| 1164 | * Pick the next process, keeping these things in mind, in this order: | ||
| 1165 | * 1) keep things fair between processes/task groups | ||
| 1166 | * 2) pick the "next" process, since someone really wants that to run | ||
| 1167 | * 3) pick the "last" process, for cache locality | ||
| 1168 | * 4) do not run the "skip" process, if something else is available | ||
| 1169 | */ | ||
| 1131 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 1170 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
| 1132 | { | 1171 | { |
| 1133 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1172 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
| 1134 | struct sched_entity *left = se; | 1173 | struct sched_entity *left = se; |
| 1135 | 1174 | ||
| 1136 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 1175 | /* |
| 1137 | se = cfs_rq->next; | 1176 | * Avoid running the skip buddy, if running something else can |
| 1177 | * be done without getting too unfair. | ||
| 1178 | */ | ||
| 1179 | if (cfs_rq->skip == se) { | ||
| 1180 | struct sched_entity *second = __pick_next_entity(se); | ||
| 1181 | if (second && wakeup_preempt_entity(second, left) < 1) | ||
| 1182 | se = second; | ||
| 1183 | } | ||
| 1138 | 1184 | ||
| 1139 | /* | 1185 | /* |
| 1140 | * Prefer last buddy, try to return the CPU to a preempted task. | 1186 | * Prefer last buddy, try to return the CPU to a preempted task. |
| @@ -1142,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 1142 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 1188 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
| 1143 | se = cfs_rq->last; | 1189 | se = cfs_rq->last; |
| 1144 | 1190 | ||
| 1191 | /* | ||
| 1192 | * Someone really wants this to run. If it's not unfair, run it. | ||
| 1193 | */ | ||
| 1194 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | ||
| 1195 | se = cfs_rq->next; | ||
| 1196 | |||
| 1145 | clear_buddies(cfs_rq, se); | 1197 | clear_buddies(cfs_rq, se); |
| 1146 | 1198 | ||
| 1147 | return se; | 1199 | return se; |
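The reordered checks above give the buddies a clear precedence: start from the leftmost (fairest) entity, sidestep it if it is the skip buddy and the runner-up is not too unfair, then let the last buddy and finally the next buddy override the choice. A small userspace sketch of that precedence, not the kernel code: the struct layout, the not_too_unfair() helper and its threshold are stand-ins for the rbtree and wakeup_preempt_entity().

#include <stdio.h>

struct entity { const char *name; long long vruntime; };

struct runqueue {
        struct entity *leftmost;        /* smallest vruntime in the tree */
        struct entity *second;          /* runner-up in the tree */
        struct entity *skip, *last, *next;
};

/* stand-in for wakeup_preempt_entity(): candidate is "not too unfair" vs leftmost */
static int not_too_unfair(struct entity *cand, struct entity *left)
{
        return cand && (cand->vruntime - left->vruntime) < 1000000;     /* threshold is illustrative */
}

static struct entity *pick(struct runqueue *rq)
{
        struct entity *left = rq->leftmost;
        struct entity *se = left;

        /* 4) avoid the skip buddy if the runner-up is close enough */
        if (rq->skip == se && not_too_unfair(rq->second, left))
                se = rq->second;

        /* 3) prefer the last buddy, for cache locality */
        if (rq->last && not_too_unfair(rq->last, left))
                se = rq->last;

        /* 2) the next buddy overrides everything else that is still fair */
        if (rq->next && not_too_unfair(rq->next, left))
                se = rq->next;

        return se;
}

int main(void)
{
        struct entity a = { "A", 100 }, b = { "B", 500 }, c = { "C", 900 };
        struct runqueue rq = { .leftmost = &a, .second = &b,
                               .skip = &a, .last = &b, .next = &c };

        /* A just yielded (skip), B ran last, someone handed the CPU to C (next): C wins */
        printf("picked %s\n", pick(&rq)->name);
        return 0;
}

Because the next-buddy test runs last, point 2) of the comment block wins over 3) and 4), which is what lets the new yield_to path further down steer the next pick.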
| @@ -1282,7 +1334,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1282 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1334 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 1283 | 1335 | ||
| 1284 | update_cfs_load(cfs_rq, 0); | 1336 | update_cfs_load(cfs_rq, 0); |
| 1285 | update_cfs_shares(cfs_rq, 0); | 1337 | update_cfs_shares(cfs_rq); |
| 1286 | } | 1338 | } |
| 1287 | 1339 | ||
| 1288 | hrtick_update(rq); | 1340 | hrtick_update(rq); |
| @@ -1312,58 +1364,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1312 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1364 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 1313 | 1365 | ||
| 1314 | update_cfs_load(cfs_rq, 0); | 1366 | update_cfs_load(cfs_rq, 0); |
| 1315 | update_cfs_shares(cfs_rq, 0); | 1367 | update_cfs_shares(cfs_rq); |
| 1316 | } | 1368 | } |
| 1317 | 1369 | ||
| 1318 | hrtick_update(rq); | 1370 | hrtick_update(rq); |
| 1319 | } | 1371 | } |
| 1320 | 1372 | ||
| 1321 | /* | ||
| 1322 | * sched_yield() support is very simple - we dequeue and enqueue. | ||
| 1323 | * | ||
| 1324 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
| 1325 | */ | ||
| 1326 | static void yield_task_fair(struct rq *rq) | ||
| 1327 | { | ||
| 1328 | struct task_struct *curr = rq->curr; | ||
| 1329 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
| 1330 | struct sched_entity *rightmost, *se = &curr->se; | ||
| 1331 | |||
| 1332 | /* | ||
| 1333 | * Are we the only task in the tree? | ||
| 1334 | */ | ||
| 1335 | if (unlikely(cfs_rq->nr_running == 1)) | ||
| 1336 | return; | ||
| 1337 | |||
| 1338 | clear_buddies(cfs_rq, se); | ||
| 1339 | |||
| 1340 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | ||
| 1341 | update_rq_clock(rq); | ||
| 1342 | /* | ||
| 1343 | * Update run-time statistics of the 'current'. | ||
| 1344 | */ | ||
| 1345 | update_curr(cfs_rq); | ||
| 1346 | |||
| 1347 | return; | ||
| 1348 | } | ||
| 1349 | /* | ||
| 1350 | * Find the rightmost entry in the rbtree: | ||
| 1351 | */ | ||
| 1352 | rightmost = __pick_last_entity(cfs_rq); | ||
| 1353 | /* | ||
| 1354 | * Already in the rightmost position? | ||
| 1355 | */ | ||
| 1356 | if (unlikely(!rightmost || entity_before(rightmost, se))) | ||
| 1357 | return; | ||
| 1358 | |||
| 1359 | /* | ||
| 1360 | * Minimally necessary key value to be last in the tree: | ||
| 1361 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
| 1362 | * 'current' within the tree based on its new key value. | ||
| 1363 | */ | ||
| 1364 | se->vruntime = rightmost->vruntime + 1; | ||
| 1365 | } | ||
| 1366 | |||
| 1367 | #ifdef CONFIG_SMP | 1373 | #ifdef CONFIG_SMP |
| 1368 | 1374 | ||
| 1369 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1375 | static void task_waking_fair(struct rq *rq, struct task_struct *p) |
| @@ -1834,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se) | |||
| 1834 | } | 1840 | } |
| 1835 | } | 1841 | } |
| 1836 | 1842 | ||
| 1843 | static void set_skip_buddy(struct sched_entity *se) | ||
| 1844 | { | ||
| 1845 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | ||
| 1846 | for_each_sched_entity(se) | ||
| 1847 | cfs_rq_of(se)->skip = se; | ||
| 1848 | } | ||
| 1849 | } | ||
| 1850 | |||
| 1837 | /* | 1851 | /* |
| 1838 | * Preempt the current task with a newly woken task if needed: | 1852 | * Preempt the current task with a newly woken task if needed: |
| 1839 | */ | 1853 | */ |
| @@ -1857,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1857 | if (test_tsk_need_resched(curr)) | 1871 | if (test_tsk_need_resched(curr)) |
| 1858 | return; | 1872 | return; |
| 1859 | 1873 | ||
| 1874 | /* Idle tasks are by definition preempted by non-idle tasks. */ | ||
| 1875 | if (unlikely(curr->policy == SCHED_IDLE) && | ||
| 1876 | likely(p->policy != SCHED_IDLE)) | ||
| 1877 | goto preempt; | ||
| 1878 | |||
| 1860 | /* | 1879 | /* |
| 1861 | * Batch and idle tasks do not preempt (their preemption is driven by | 1880 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
| 1862 | * the tick): | 1881 | * is driven by the tick): |
| 1863 | */ | 1882 | */ |
| 1864 | if (unlikely(p->policy != SCHED_NORMAL)) | 1883 | if (unlikely(p->policy != SCHED_NORMAL)) |
| 1865 | return; | 1884 | return; |
| 1866 | 1885 | ||
| 1867 | /* Idle tasks are by definition preempted by everybody. */ | ||
| 1868 | if (unlikely(curr->policy == SCHED_IDLE)) | ||
| 1869 | goto preempt; | ||
| 1870 | 1886 | ||
| 1871 | if (!sched_feat(WAKEUP_PREEMPT)) | 1887 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1872 | return; | 1888 | return; |
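Moving the idle-vs-non-idle test ahead of the SCHED_NORMAL bail-out changes exactly one case: a SCHED_BATCH wakeup can now preempt a SCHED_IDLE current task, while batch and idle wakers still never preempt normal tasks. A toy decision table covering only the policy checks (the real function then continues into the vruntime-based wakeup_preempt_entity() test; the enum here is local to the sketch, not taken from the kernel headers):

#include <stdio.h>

enum policy { POL_NORMAL, POL_BATCH, POL_IDLE };        /* models SCHED_NORMAL/BATCH/IDLE */

static int wakeup_preempts(enum policy curr, enum policy waker)
{
        /* idle tasks are preempted by any non-idle task */
        if (curr == POL_IDLE && waker != POL_IDLE)
                return 1;
        /* batch and idle wakers never preempt non-idle tasks; they wait for the tick */
        if (waker != POL_NORMAL)
                return 0;
        /* normal vs normal: left to the vruntime test, assumed to preempt here */
        return 1;
}

int main(void)
{
        printf("batch waking an idle current:  %d\n", wakeup_preempts(POL_IDLE, POL_BATCH));    /* 1 */
        printf("batch waking a normal current: %d\n", wakeup_preempts(POL_NORMAL, POL_BATCH));  /* 0 */
        return 0;
}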
| @@ -1932,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1932 | } | 1948 | } |
| 1933 | } | 1949 | } |
| 1934 | 1950 | ||
| 1951 | /* | ||
| 1952 | * sched_yield() is very simple | ||
| 1953 | * | ||
| 1954 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | ||
| 1955 | */ | ||
| 1956 | static void yield_task_fair(struct rq *rq) | ||
| 1957 | { | ||
| 1958 | struct task_struct *curr = rq->curr; | ||
| 1959 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
| 1960 | struct sched_entity *se = &curr->se; | ||
| 1961 | |||
| 1962 | /* | ||
| 1963 | * Are we the only task in the tree? | ||
| 1964 | */ | ||
| 1965 | if (unlikely(rq->nr_running == 1)) | ||
| 1966 | return; | ||
| 1967 | |||
| 1968 | clear_buddies(cfs_rq, se); | ||
| 1969 | |||
| 1970 | if (curr->policy != SCHED_BATCH) { | ||
| 1971 | update_rq_clock(rq); | ||
| 1972 | /* | ||
| 1973 | * Update run-time statistics of the 'current'. | ||
| 1974 | */ | ||
| 1975 | update_curr(cfs_rq); | ||
| 1976 | } | ||
| 1977 | |||
| 1978 | set_skip_buddy(se); | ||
| 1979 | } | ||
| 1980 | |||
| 1981 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | ||
| 1982 | { | ||
| 1983 | struct sched_entity *se = &p->se; | ||
| 1984 | |||
| 1985 | if (!se->on_rq) | ||
| 1986 | return false; | ||
| 1987 | |||
| 1988 | /* Tell the scheduler that we'd really like pse to run next. */ | ||
| 1989 | set_next_buddy(se); | ||
| 1990 | |||
| 1991 | yield_task_fair(rq); | ||
| 1992 | |||
| 1993 | return true; | ||
| 1994 | } | ||
| 1995 | |||
| 1935 | #ifdef CONFIG_SMP | 1996 | #ifdef CONFIG_SMP |
| 1936 | /************************************************** | 1997 | /************************************************** |
| 1937 | * Fair scheduling class load-balancing methods: | 1998 | * Fair scheduling class load-balancing methods: |
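With the hunks above, yield_task_fair() no longer requeues the task to the rightmost position; it only marks the caller as the skip buddy and lets pick_next_entity() route around it, and yield_to_task_fair() additionally marks the target as the next buddy before yielding. A compressed userspace model of just that buddy bookkeeping (single runqueue level; the rbtree, rq locking and the cross-CPU yield_to() plumbing in sched.c are left out):

#include <stdio.h>

struct entity { const char *name; };
struct runqueue { struct entity *last, *next, *skip; };

/* only buddy pointers that reference this entity are cleared */
static void clear_buddies_for(struct runqueue *rq, struct entity *se)
{
        if (rq->last == se)
                rq->last = NULL;
        if (rq->next == se)
                rq->next = NULL;
        if (rq->skip == se)
                rq->skip = NULL;
}

/* yield: drop our own buddy marks, then flag ourselves as the one to skip */
static void yield_fair(struct runqueue *rq, struct entity *curr)
{
        clear_buddies_for(rq, curr);
        rq->skip = curr;
}

/* yield_to: flag the target as next buddy, then behave like a plain yield */
static void yield_to_fair(struct runqueue *rq, struct entity *curr, struct entity *target)
{
        rq->next = target;
        yield_fair(rq, curr);
}

int main(void)
{
        struct entity a = { "A" }, b = { "B" };
        struct runqueue rq = { 0 };

        yield_to_fair(&rq, &a, &b);
        printf("skip=%s next=%s\n", rq.skip->name, rq.next->name);     /* skip=A next=B */
        return 0;
}

Combined with the pick-order sketch earlier, the yielded-to task is chosen next as long as doing so stays within the fairness bound.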
| @@ -2043,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2043 | enum cpu_idle_type idle, int *all_pinned, | 2104 | enum cpu_idle_type idle, int *all_pinned, |
| 2044 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2105 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) |
| 2045 | { | 2106 | { |
| 2046 | int loops = 0, pulled = 0, pinned = 0; | 2107 | int loops = 0, pulled = 0; |
| 2047 | long rem_load_move = max_load_move; | 2108 | long rem_load_move = max_load_move; |
| 2048 | struct task_struct *p, *n; | 2109 | struct task_struct *p, *n; |
| 2049 | 2110 | ||
| 2050 | if (max_load_move == 0) | 2111 | if (max_load_move == 0) |
| 2051 | goto out; | 2112 | goto out; |
| 2052 | 2113 | ||
| 2053 | pinned = 1; | ||
| 2054 | |||
| 2055 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 2114 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
| 2056 | if (loops++ > sysctl_sched_nr_migrate) | 2115 | if (loops++ > sysctl_sched_nr_migrate) |
| 2057 | break; | 2116 | break; |
| 2058 | 2117 | ||
| 2059 | if ((p->se.load.weight >> 1) > rem_load_move || | 2118 | if ((p->se.load.weight >> 1) > rem_load_move || |
| 2060 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | 2119 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
| 2120 | all_pinned)) | ||
| 2061 | continue; | 2121 | continue; |
| 2062 | 2122 | ||
| 2063 | pull_task(busiest, p, this_rq, this_cpu); | 2123 | pull_task(busiest, p, this_rq, this_cpu); |
| @@ -2092,9 +2152,6 @@ out: | |||
| 2092 | */ | 2152 | */ |
| 2093 | schedstat_add(sd, lb_gained[idle], pulled); | 2153 | schedstat_add(sd, lb_gained[idle], pulled); |
| 2094 | 2154 | ||
| 2095 | if (all_pinned) | ||
| 2096 | *all_pinned = pinned; | ||
| 2097 | |||
| 2098 | return max_load_move - rem_load_move; | 2155 | return max_load_move - rem_load_move; |
| 2099 | } | 2156 | } |
| 2100 | 2157 | ||
| @@ -2123,7 +2180,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) | |||
| 2123 | * We need to update shares after updating tg->load_weight in | 2180 | * We need to update shares after updating tg->load_weight in |
| 2124 | * order to adjust the weight of groups with long running tasks. | 2181 | * order to adjust the weight of groups with long running tasks. |
| 2125 | */ | 2182 | */ |
| 2126 | update_cfs_shares(cfs_rq, 0); | 2183 | update_cfs_shares(cfs_rq); |
| 2127 | 2184 | ||
| 2128 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 2185 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 2129 | 2186 | ||
| @@ -2610,7 +2667,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 2610 | * @this_cpu: Cpu for which load balance is currently performed. | 2667 | * @this_cpu: Cpu for which load balance is currently performed. |
| 2611 | * @idle: Idle status of this_cpu | 2668 | * @idle: Idle status of this_cpu |
| 2612 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 2669 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
| 2613 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 2614 | * @local_group: Does group contain this_cpu. | 2670 | * @local_group: Does group contain this_cpu. |
| 2615 | * @cpus: Set of cpus considered for load balancing. | 2671 | * @cpus: Set of cpus considered for load balancing. |
| 2616 | * @balance: Should we balance. | 2672 | * @balance: Should we balance. |
| @@ -2618,7 +2674,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 2618 | */ | 2674 | */ |
| 2619 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 2675 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
| 2620 | struct sched_group *group, int this_cpu, | 2676 | struct sched_group *group, int this_cpu, |
| 2621 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 2677 | enum cpu_idle_type idle, int load_idx, |
| 2622 | int local_group, const struct cpumask *cpus, | 2678 | int local_group, const struct cpumask *cpus, |
| 2623 | int *balance, struct sg_lb_stats *sgs) | 2679 | int *balance, struct sg_lb_stats *sgs) |
| 2624 | { | 2680 | { |
| @@ -2638,9 +2694,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2638 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2694 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
| 2639 | struct rq *rq = cpu_rq(i); | 2695 | struct rq *rq = cpu_rq(i); |
| 2640 | 2696 | ||
| 2641 | if (*sd_idle && rq->nr_running) | ||
| 2642 | *sd_idle = 0; | ||
| 2643 | |||
| 2644 | /* Bias balancing toward cpus of our domain */ | 2697 | /* Bias balancing toward cpus of our domain */ |
| 2645 | if (local_group) { | 2698 | if (local_group) { |
| 2646 | if (idle_cpu(i) && !first_idle_cpu) { | 2699 | if (idle_cpu(i) && !first_idle_cpu) { |
| @@ -2685,7 +2738,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2685 | 2738 | ||
| 2686 | /* | 2739 | /* |
| 2687 | * Consider the group unbalanced when the imbalance is larger | 2740 | * Consider the group unbalanced when the imbalance is larger |
| 2688 | * than the average weight of two tasks. | 2741 | * than the average weight of a task. |
| 2689 | * | 2742 | * |
| 2690 | * APZ: with cgroup the avg task weight can vary wildly and | 2743 | * APZ: with cgroup the avg task weight can vary wildly and |
| 2691 | * might not be a suitable number - should we keep a | 2744 | * might not be a suitable number - should we keep a |
| @@ -2695,7 +2748,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2695 | if (sgs->sum_nr_running) | 2748 | if (sgs->sum_nr_running) |
| 2696 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2749 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 2697 | 2750 | ||
| 2698 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) | 2751 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
| 2699 | sgs->group_imb = 1; | 2752 | sgs->group_imb = 1; |
| 2700 | 2753 | ||
| 2701 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2754 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
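The hunk above also relaxes the group-imbalance heuristic from "spread larger than twice the average task weight" to "spread of at least one average task weight". A quick numeric check, with made-up load figures:

#include <stdio.h>

int main(void)
{
        unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
        unsigned long sum_weighted_load = 8192, sum_nr_running = 4;
        unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;   /* 2048 */

        /* old test: the spread must exceed two tasks' worth of weight */
        int imb_old = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;
        /* new test: a spread of one task's weight already counts as imbalanced */
        int imb_new = (max_cpu_load - min_cpu_load) >= avg_load_per_task;

        printf("old=%d new=%d\n", imb_old, imb_new);    /* old=0 new=1 */
        return 0;
}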
| @@ -2755,15 +2808,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
| 2755 | * @sd: sched_domain whose statistics are to be updated. | 2808 | * @sd: sched_domain whose statistics are to be updated. |
| 2756 | * @this_cpu: Cpu for which load balance is currently performed. | 2809 | * @this_cpu: Cpu for which load balance is currently performed. |
| 2757 | * @idle: Idle status of this_cpu | 2810 | * @idle: Idle status of this_cpu |
| 2758 | * @sd_idle: Idle status of the sched_domain containing sg. | ||
| 2759 | * @cpus: Set of cpus considered for load balancing. | 2811 | * @cpus: Set of cpus considered for load balancing. |
| 2760 | * @balance: Should we balance. | 2812 | * @balance: Should we balance. |
| 2761 | * @sds: variable to hold the statistics for this sched_domain. | 2813 | * @sds: variable to hold the statistics for this sched_domain. |
| 2762 | */ | 2814 | */ |
| 2763 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 2815 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, |
| 2764 | enum cpu_idle_type idle, int *sd_idle, | 2816 | enum cpu_idle_type idle, const struct cpumask *cpus, |
| 2765 | const struct cpumask *cpus, int *balance, | 2817 | int *balance, struct sd_lb_stats *sds) |
| 2766 | struct sd_lb_stats *sds) | ||
| 2767 | { | 2818 | { |
| 2768 | struct sched_domain *child = sd->child; | 2819 | struct sched_domain *child = sd->child; |
| 2769 | struct sched_group *sg = sd->groups; | 2820 | struct sched_group *sg = sd->groups; |
| @@ -2781,7 +2832,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2781 | 2832 | ||
| 2782 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 2833 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
| 2783 | memset(&sgs, 0, sizeof(sgs)); | 2834 | memset(&sgs, 0, sizeof(sgs)); |
| 2784 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, | 2835 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, |
| 2785 | local_group, cpus, balance, &sgs); | 2836 | local_group, cpus, balance, &sgs); |
| 2786 | 2837 | ||
| 2787 | if (local_group && !(*balance)) | 2838 | if (local_group && !(*balance)) |
| @@ -3007,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3007 | 3058 | ||
| 3008 | /* | 3059 | /* |
| 3009 | * if *imbalance is less than the average load per runnable task | 3060 | * if *imbalance is less than the average load per runnable task |
| 3010 | * there is no gaurantee that any tasks will be moved so we'll have | 3061 | * there is no guarantee that any tasks will be moved so we'll have |
| 3011 | * a think about bumping its value to force at least one task to be | 3062 | * a think about bumping its value to force at least one task to be |
| 3012 | * moved | 3063 | * moved |
| 3013 | */ | 3064 | */ |
| @@ -3033,7 +3084,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3033 | * @imbalance: Variable which stores amount of weighted load which should | 3084 | * @imbalance: Variable which stores amount of weighted load which should |
| 3034 | * be moved to restore balance/put a group to idle. | 3085 | * be moved to restore balance/put a group to idle. |
| 3035 | * @idle: The idle status of this_cpu. | 3086 | * @idle: The idle status of this_cpu. |
| 3036 | * @sd_idle: The idleness of sd | ||
| 3037 | * @cpus: The set of CPUs under consideration for load-balancing. | 3087 | * @cpus: The set of CPUs under consideration for load-balancing. |
| 3038 | * @balance: Pointer to a variable indicating if this_cpu | 3088 | * @balance: Pointer to a variable indicating if this_cpu |
| 3039 | * is the appropriate cpu to perform load balancing at this_level. | 3089 | * is the appropriate cpu to perform load balancing at this_level. |
| @@ -3046,7 +3096,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3046 | static struct sched_group * | 3096 | static struct sched_group * |
| 3047 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3097 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 3048 | unsigned long *imbalance, enum cpu_idle_type idle, | 3098 | unsigned long *imbalance, enum cpu_idle_type idle, |
| 3049 | int *sd_idle, const struct cpumask *cpus, int *balance) | 3099 | const struct cpumask *cpus, int *balance) |
| 3050 | { | 3100 | { |
| 3051 | struct sd_lb_stats sds; | 3101 | struct sd_lb_stats sds; |
| 3052 | 3102 | ||
| @@ -3056,22 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 3056 | * Compute the various statistics relevant for load balancing at | 3106 | * Compute the various statistics relevant for load balancing at |
| 3057 | * this level. | 3107 | * this level. |
| 3058 | */ | 3108 | */ |
| 3059 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | 3109 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); |
| 3060 | balance, &sds); | 3110 | |
| 3061 | 3111 | /* | |
| 3062 | /* Cases where imbalance does not exist from POV of this_cpu */ | 3112 | * this_cpu is not the appropriate cpu to perform load balancing at |
| 3063 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | 3113 | * this level. |
| 3064 | * at this level. | ||
| 3065 | * 2) There is no busy sibling group to pull from. | ||
| 3066 | * 3) This group is the busiest group. | ||
| 3067 | * 4) This group is more busy than the avg busieness at this | ||
| 3068 | * sched_domain. | ||
| 3069 | * 5) The imbalance is within the specified limit. | ||
| 3070 | * | ||
| 3071 | * Note: when doing newidle balance, if the local group has excess | ||
| 3072 | * capacity (i.e. nr_running < group_capacity) and the busiest group | ||
| 3073 | * does not have any capacity, we force a load balance to pull tasks | ||
| 3074 | * to the local group. In this case, we skip past checks 3, 4 and 5. | ||
| 3075 | */ | 3114 | */ |
| 3076 | if (!(*balance)) | 3115 | if (!(*balance)) |
| 3077 | goto ret; | 3116 | goto ret; |
| @@ -3080,41 +3119,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 3080 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 3119 | check_asym_packing(sd, &sds, this_cpu, imbalance)) |
| 3081 | return sds.busiest; | 3120 | return sds.busiest; |
| 3082 | 3121 | ||
| 3122 | /* There is no busy sibling group to pull tasks from */ | ||
| 3083 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3123 | if (!sds.busiest || sds.busiest_nr_running == 0) |
| 3084 | goto out_balanced; | 3124 | goto out_balanced; |
| 3085 | 3125 | ||
| 3086 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 3126 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; |
| 3127 | |||
| 3128 | /* | ||
| 3129 | * If the busiest group is imbalanced the below checks don't | ||
| 3130 | * work because they assumes all things are equal, which typically | ||
| 3131 | * isn't true due to cpus_allowed constraints and the like. | ||
| 3132 | */ | ||
| 3133 | if (sds.group_imb) | ||
| 3134 | goto force_balance; | ||
| 3135 | |||
| 3136 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
| 3087 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 3137 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
| 3088 | !sds.busiest_has_capacity) | 3138 | !sds.busiest_has_capacity) |
| 3089 | goto force_balance; | 3139 | goto force_balance; |
| 3090 | 3140 | ||
| 3141 | /* | ||
| 3142 | * If the local group is more busy than the selected busiest group | ||
| 3143 | * don't try and pull any tasks. | ||
| 3144 | */ | ||
| 3091 | if (sds.this_load >= sds.max_load) | 3145 | if (sds.this_load >= sds.max_load) |
| 3092 | goto out_balanced; | 3146 | goto out_balanced; |
| 3093 | 3147 | ||
| 3094 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3148 | /* |
| 3095 | 3149 | * Don't pull any tasks if this group is already above the domain | |
| 3150 | * average load. | ||
| 3151 | */ | ||
| 3096 | if (sds.this_load >= sds.avg_load) | 3152 | if (sds.this_load >= sds.avg_load) |
| 3097 | goto out_balanced; | 3153 | goto out_balanced; |
| 3098 | 3154 | ||
| 3099 | /* | 3155 | if (idle == CPU_IDLE) { |
| 3100 | * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. | ||
| 3101 | * And to check for busy balance use !idle_cpu instead of | ||
| 3102 | * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE | ||
| 3103 | * even when they are idle. | ||
| 3104 | */ | ||
| 3105 | if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { | ||
| 3106 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
| 3107 | goto out_balanced; | ||
| 3108 | } else { | ||
| 3109 | /* | 3156 | /* |
| 3110 | * This cpu is idle. If the busiest group load doesn't | 3157 | * This cpu is idle. If the busiest group load doesn't |
| 3111 | * have more tasks than the number of available cpu's and | 3158 | * have more tasks than the number of available cpu's and |
| 3112 | * there is no imbalance between this and busiest group | 3159 | * there is no imbalance between this and busiest group |
| 3113 | * wrt to idle cpu's, it is balanced. | 3160 | * wrt to idle cpu's, it is balanced. |
| 3114 | */ | 3161 | */ |
| 3115 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 3162 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && |
| 3116 | sds.busiest_nr_running <= sds.busiest_group_weight) | 3163 | sds.busiest_nr_running <= sds.busiest_group_weight) |
| 3117 | goto out_balanced; | 3164 | goto out_balanced; |
| 3165 | } else { | ||
| 3166 | /* | ||
| 3167 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | ||
| 3168 | * imbalance_pct to be conservative. | ||
| 3169 | */ | ||
| 3170 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
| 3171 | goto out_balanced; | ||
| 3118 | } | 3172 | } |
| 3119 | 3173 | ||
| 3120 | force_balance: | 3174 | force_balance: |
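After this rewrite the bail-outs in find_busiest_group() run in a fixed order: no busiest group means nothing to do, an internally imbalanced busiest group forces balancing, a newly-idle CPU with spare local capacity forces balancing, then come the local-load-vs-busiest and local-load-vs-domain-average checks, and finally an idle-dependent threshold (idle-CPU counts for CPU_IDLE, imbalance_pct otherwise). A condensed userspace model of that ordering; the check_asym_packing() step is left out, the field names are shortened, and every value below is illustrative:

#include <stdio.h>

struct stats {
        int busiest_exists, busiest_nr_running, group_imb;
        int newly_idle, this_has_capacity, busiest_has_capacity, cpu_idle;
        unsigned long this_load, max_load, avg_load;
        int this_idle_cpus, busiest_idle_cpus, busiest_group_weight;
        unsigned int imbalance_pct;
};

/* 1 = go ahead and balance, 0 = leave things alone */
static int should_balance(const struct stats *s)
{
        if (!s->busiest_exists || s->busiest_nr_running == 0)
                return 0;
        if (s->group_imb)                       /* cpus_allowed etc. skew the averages */
                return 1;
        if (s->newly_idle && s->this_has_capacity && !s->busiest_has_capacity)
                return 1;
        if (s->this_load >= s->max_load)        /* the local group is the busier one */
                return 0;
        if (s->this_load >= s->avg_load)        /* already above the domain average */
                return 0;
        if (s->cpu_idle) {
                if (s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
                    s->busiest_nr_running <= s->busiest_group_weight)
                        return 0;
        } else if (100 * s->max_load <= s->imbalance_pct * s->this_load) {
                return 0;
        }
        return 1;
}

int main(void)
{
        struct stats s = {
                .busiest_exists = 1, .busiest_nr_running = 3,
                .this_load = 1024, .max_load = 4096, .avg_load = 2048,
                .imbalance_pct = 125,
        };

        printf("balance? %d\n", should_balance(&s));    /* 1: pull from the busy group */
        return 0;
}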
| @@ -3193,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
| 3193 | /* Working cpumask for load_balance and load_balance_newidle. */ | 3247 | /* Working cpumask for load_balance and load_balance_newidle. */ |
| 3194 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 3248 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
| 3195 | 3249 | ||
| 3196 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | 3250 | static int need_active_balance(struct sched_domain *sd, int idle, |
| 3197 | int busiest_cpu, int this_cpu) | 3251 | int busiest_cpu, int this_cpu) |
| 3198 | { | 3252 | { |
| 3199 | if (idle == CPU_NEWLY_IDLE) { | 3253 | if (idle == CPU_NEWLY_IDLE) { |
| @@ -3225,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | |||
| 3225 | * move_tasks() will succeed. ld_moved will be true and this | 3279 | * move_tasks() will succeed. ld_moved will be true and this |
| 3226 | * active balance code will not be triggered. | 3280 | * active balance code will not be triggered. |
| 3227 | */ | 3281 | */ |
| 3228 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 3229 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3230 | return 0; | ||
| 3231 | |||
| 3232 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | 3282 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) |
| 3233 | return 0; | 3283 | return 0; |
| 3234 | } | 3284 | } |
| @@ -3246,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 3246 | struct sched_domain *sd, enum cpu_idle_type idle, | 3296 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 3247 | int *balance) | 3297 | int *balance) |
| 3248 | { | 3298 | { |
| 3249 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3299 | int ld_moved, all_pinned = 0, active_balance = 0; |
| 3250 | struct sched_group *group; | 3300 | struct sched_group *group; |
| 3251 | unsigned long imbalance; | 3301 | unsigned long imbalance; |
| 3252 | struct rq *busiest; | 3302 | struct rq *busiest; |
| @@ -3255,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 3255 | 3305 | ||
| 3256 | cpumask_copy(cpus, cpu_active_mask); | 3306 | cpumask_copy(cpus, cpu_active_mask); |
| 3257 | 3307 | ||
| 3258 | /* | ||
| 3259 | * When power savings policy is enabled for the parent domain, idle | ||
| 3260 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
| 3261 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
| 3262 | * portraying it as CPU_NOT_IDLE. | ||
| 3263 | */ | ||
| 3264 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
| 3265 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3266 | sd_idle = 1; | ||
| 3267 | |||
| 3268 | schedstat_inc(sd, lb_count[idle]); | 3308 | schedstat_inc(sd, lb_count[idle]); |
| 3269 | 3309 | ||
| 3270 | redo: | 3310 | redo: |
| 3271 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3311 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, |
| 3272 | cpus, balance); | 3312 | cpus, balance); |
| 3273 | 3313 | ||
| 3274 | if (*balance == 0) | 3314 | if (*balance == 0) |
| @@ -3297,6 +3337,7 @@ redo: | |||
| 3297 | * still unbalanced. ld_moved simply stays zero, so it is | 3337 | * still unbalanced. ld_moved simply stays zero, so it is |
| 3298 | * correctly treated as an imbalance. | 3338 | * correctly treated as an imbalance. |
| 3299 | */ | 3339 | */ |
| 3340 | all_pinned = 1; | ||
| 3300 | local_irq_save(flags); | 3341 | local_irq_save(flags); |
| 3301 | double_rq_lock(this_rq, busiest); | 3342 | double_rq_lock(this_rq, busiest); |
| 3302 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3343 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
| @@ -3330,8 +3371,7 @@ redo: | |||
| 3330 | if (idle != CPU_NEWLY_IDLE) | 3371 | if (idle != CPU_NEWLY_IDLE) |
| 3331 | sd->nr_balance_failed++; | 3372 | sd->nr_balance_failed++; |
| 3332 | 3373 | ||
| 3333 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3374 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { |
| 3334 | this_cpu)) { | ||
| 3335 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3375 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 3336 | 3376 | ||
| 3337 | /* don't kick the active_load_balance_cpu_stop, | 3377 | /* don't kick the active_load_balance_cpu_stop, |
| @@ -3386,10 +3426,6 @@ redo: | |||
| 3386 | sd->balance_interval *= 2; | 3426 | sd->balance_interval *= 2; |
| 3387 | } | 3427 | } |
| 3388 | 3428 | ||
| 3389 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 3390 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3391 | ld_moved = -1; | ||
| 3392 | |||
| 3393 | goto out; | 3429 | goto out; |
| 3394 | 3430 | ||
| 3395 | out_balanced: | 3431 | out_balanced: |
| @@ -3403,11 +3439,7 @@ out_one_pinned: | |||
| 3403 | (sd->balance_interval < sd->max_interval)) | 3439 | (sd->balance_interval < sd->max_interval)) |
| 3404 | sd->balance_interval *= 2; | 3440 | sd->balance_interval *= 2; |
| 3405 | 3441 | ||
| 3406 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3442 | ld_moved = 0; |
| 3407 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3408 | ld_moved = -1; | ||
| 3409 | else | ||
| 3410 | ld_moved = 0; | ||
| 3411 | out: | 3443 | out: |
| 3412 | return ld_moved; | 3444 | return ld_moved; |
| 3413 | } | 3445 | } |
| @@ -3786,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick) | |||
| 3786 | 3818 | ||
| 3787 | static DEFINE_SPINLOCK(balancing); | 3819 | static DEFINE_SPINLOCK(balancing); |
| 3788 | 3820 | ||
| 3821 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
| 3822 | |||
| 3823 | /* | ||
| 3824 | * Scale the max load_balance interval with the number of CPUs in the system. | ||
| 3825 | * This trades load-balance latency on larger machines for less cross talk. | ||
| 3826 | */ | ||
| 3827 | static void update_max_interval(void) | ||
| 3828 | { | ||
| 3829 | max_load_balance_interval = HZ*num_online_cpus()/10; | ||
| 3830 | } | ||
| 3831 | |||
| 3789 | /* | 3832 | /* |
| 3790 | * It checks each scheduling domain to see if it is due to be balanced, | 3833 | * It checks each scheduling domain to see if it is due to be balanced, |
| 3791 | * and initiates a balancing operation if so. | 3834 | * and initiates a balancing operation if so. |
| @@ -3815,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3815 | 3858 | ||
| 3816 | /* scale ms to jiffies */ | 3859 | /* scale ms to jiffies */ |
| 3817 | interval = msecs_to_jiffies(interval); | 3860 | interval = msecs_to_jiffies(interval); |
| 3818 | if (unlikely(!interval)) | 3861 | interval = clamp(interval, 1UL, max_load_balance_interval); |
| 3819 | interval = 1; | ||
| 3820 | if (interval > HZ*NR_CPUS/10) | ||
| 3821 | interval = HZ*NR_CPUS/10; | ||
| 3822 | 3862 | ||
| 3823 | need_serialize = sd->flags & SD_SERIALIZE; | 3863 | need_serialize = sd->flags & SD_SERIALIZE; |
| 3824 | 3864 | ||
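With max_load_balance_interval in place, the per-domain balance interval is simply clamped to the range [1 jiffy, HZ*num_online_cpus()/10] instead of the old NR_CPUS-based cap, so the upper bound follows the CPUs actually online. A quick userspace calculation of the resulting bounds; HZ, the CPU count and the clampul() helper are stand-ins for the kernel's HZ, num_online_cpus() and clamp():

#include <stdio.h>

#define HZ 1000UL       /* example configuration */

static unsigned long clampul(unsigned long val, unsigned long lo, unsigned long hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

int main(void)
{
        unsigned long online_cpus = 4;
        unsigned long max_interval = HZ * online_cpus / 10;     /* 400 jiffies, i.e. 400ms at HZ=1000 */

        /* a 0-jiffy request is raised to 1, a 10-second request is capped at the max */
        printf("%lu %lu %lu\n",
               clampul(0, 1, max_interval),             /* 1   */
               clampul(64, 1, max_interval),            /* 64  */
               clampul(10 * HZ, 1, max_interval));      /* 400 */
        return 0;
}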
| @@ -3831,8 +3871,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3831 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3871 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
| 3832 | /* | 3872 | /* |
| 3833 | * We've pulled tasks over so either we're no | 3873 | * We've pulled tasks over so either we're no |
| 3834 | * longer idle, or one of our SMT siblings is | 3874 | * longer idle. |
| 3835 | * not idle. | ||
| 3836 | */ | 3875 | */ |
| 3837 | idle = CPU_NOT_IDLE; | 3876 | idle = CPU_NOT_IDLE; |
| 3838 | } | 3877 | } |
| @@ -4079,33 +4118,62 @@ static void task_fork_fair(struct task_struct *p) | |||
| 4079 | * Priority of the task has changed. Check to see if we preempt | 4118 | * Priority of the task has changed. Check to see if we preempt |
| 4080 | * the current task. | 4119 | * the current task. |
| 4081 | */ | 4120 | */ |
| 4082 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | 4121 | static void |
| 4083 | int oldprio, int running) | 4122 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
| 4084 | { | 4123 | { |
| 4124 | if (!p->se.on_rq) | ||
| 4125 | return; | ||
| 4126 | |||
| 4085 | /* | 4127 | /* |
| 4086 | * Reschedule if we are currently running on this runqueue and | 4128 | * Reschedule if we are currently running on this runqueue and |
| 4087 | * our priority decreased, or if we are not currently running on | 4129 | * our priority decreased, or if we are not currently running on |
| 4088 | * this runqueue and our priority is higher than the current's | 4130 | * this runqueue and our priority is higher than the current's |
| 4089 | */ | 4131 | */ |
| 4090 | if (running) { | 4132 | if (rq->curr == p) { |
| 4091 | if (p->prio > oldprio) | 4133 | if (p->prio > oldprio) |
| 4092 | resched_task(rq->curr); | 4134 | resched_task(rq->curr); |
| 4093 | } else | 4135 | } else |
| 4094 | check_preempt_curr(rq, p, 0); | 4136 | check_preempt_curr(rq, p, 0); |
| 4095 | } | 4137 | } |
| 4096 | 4138 | ||
| 4139 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
| 4140 | { | ||
| 4141 | struct sched_entity *se = &p->se; | ||
| 4142 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 4143 | |||
| 4144 | /* | ||
| 4145 | * Ensure the task's vruntime is normalized, so that when it's | ||
| 4146 | * switched back to the fair class the enqueue_entity(.flags=0) will | ||
| 4147 | * do the right thing. | ||
| 4148 | * | ||
| 4149 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | ||
| 4150 | * have normalized the vruntime, if it was !on_rq, then only when | ||
| 4151 | * the task is sleeping will it still have non-normalized vruntime. | ||
| 4152 | */ | ||
| 4153 | if (!se->on_rq && p->state != TASK_RUNNING) { | ||
| 4154 | /* | ||
| 4155 | * Fix up our vruntime so that the current sleep doesn't | ||
| 4156 | * cause 'unlimited' sleep bonus. | ||
| 4157 | */ | ||
| 4158 | place_entity(cfs_rq, se, 0); | ||
| 4159 | se->vruntime -= cfs_rq->min_vruntime; | ||
| 4160 | } | ||
| 4161 | } | ||
| 4162 | |||
| 4097 | /* | 4163 | /* |
| 4098 | * We switched to the sched_fair class. | 4164 | * We switched to the sched_fair class. |
| 4099 | */ | 4165 | */ |
| 4100 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | 4166 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
| 4101 | int running) | ||
| 4102 | { | 4167 | { |
| 4168 | if (!p->se.on_rq) | ||
| 4169 | return; | ||
| 4170 | |||
| 4103 | /* | 4171 | /* |
| 4104 | * We were most likely switched from sched_rt, so | 4172 | * We were most likely switched from sched_rt, so |
| 4105 | * kick off the schedule if running, otherwise just see | 4173 | * kick off the schedule if running, otherwise just see |
| 4106 | * if we can still preempt the current task. | 4174 | * if we can still preempt the current task. |
| 4107 | */ | 4175 | */ |
| 4108 | if (running) | 4176 | if (rq->curr == p) |
| 4109 | resched_task(rq->curr); | 4177 | resched_task(rq->curr); |
| 4110 | else | 4178 | else |
| 4111 | check_preempt_curr(rq, p, 0); | 4179 | check_preempt_curr(rq, p, 0); |
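The new switched_from_fair() keeps a sleeping task's vruntime relative (normalized against min_vruntime) so that a later enqueue_entity() can add back whatever min_vruntime has grown to in the meantime, avoiding an unbounded sleep bonus. A toy model of that normalize-on-leave / renormalize-on-enqueue pairing; place_entity() is ignored and the numbers are arbitrary:

#include <stdio.h>

struct cfs { unsigned long long min_vruntime; };
struct task { unsigned long long vruntime; int on_rq; };

/* leaving the class while asleep: store vruntime relative to min_vruntime */
static void switched_from(struct cfs *q, struct task *t)
{
        if (!t->on_rq)
                t->vruntime -= q->min_vruntime;
}

/* coming back: make it absolute again against the *current* min_vruntime */
static void enqueue(struct cfs *q, struct task *t)
{
        t->vruntime += q->min_vruntime;
        t->on_rq = 1;
}

int main(void)
{
        struct cfs q = { .min_vruntime = 1000 };
        struct task t = { .vruntime = 1200, .on_rq = 0 };

        switched_from(&q, &t);          /* stored as +200 relative */
        q.min_vruntime = 5000;          /* time passes while the task runs in another class */
        enqueue(&q, &t);                /* back to absolute: 5200, no huge bonus or penalty */

        printf("%llu\n", t.vruntime);
        return 0;
}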
| @@ -4171,6 +4239,7 @@ static const struct sched_class fair_sched_class = { | |||
| 4171 | .enqueue_task = enqueue_task_fair, | 4239 | .enqueue_task = enqueue_task_fair, |
| 4172 | .dequeue_task = dequeue_task_fair, | 4240 | .dequeue_task = dequeue_task_fair, |
| 4173 | .yield_task = yield_task_fair, | 4241 | .yield_task = yield_task_fair, |
| 4242 | .yield_to_task = yield_to_task_fair, | ||
| 4174 | 4243 | ||
| 4175 | .check_preempt_curr = check_preempt_wakeup, | 4244 | .check_preempt_curr = check_preempt_wakeup, |
| 4176 | 4245 | ||
| @@ -4191,6 +4260,7 @@ static const struct sched_class fair_sched_class = { | |||
| 4191 | .task_fork = task_fork_fair, | 4260 | .task_fork = task_fork_fair, |
| 4192 | 4261 | ||
| 4193 | .prio_changed = prio_changed_fair, | 4262 | .prio_changed = prio_changed_fair, |
| 4263 | .switched_from = switched_from_fair, | ||
| 4194 | .switched_to = switched_to_fair, | 4264 | .switched_to = switched_to_fair, |
| 4195 | 4265 | ||
| 4196 | .get_rr_interval = get_rr_interval_fair, | 4266 | .get_rr_interval = get_rr_interval_fair, |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..a776a6396427 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) | |||
| 52 | { | 52 | { |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | 55 | static void switched_to_idle(struct rq *rq, struct task_struct *p) |
| 56 | int running) | ||
| 57 | { | 56 | { |
| 58 | /* Can this actually happen?? */ | 57 | BUG(); |
| 59 | if (running) | ||
| 60 | resched_task(rq->curr); | ||
| 61 | else | ||
| 62 | check_preempt_curr(rq, p, 0); | ||
| 63 | } | 58 | } |
| 64 | 59 | ||
| 65 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 60 | static void |
| 66 | int oldprio, int running) | 61 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) |
| 67 | { | 62 | { |
| 68 | /* This can happen for hot plug CPUS */ | 63 | BUG(); |
| 69 | |||
| 70 | /* | ||
| 71 | * Reschedule if we are currently running on this runqueue and | ||
| 72 | * our priority decreased, or if we are not currently running on | ||
| 73 | * this runqueue and our priority is higher than the current's | ||
| 74 | */ | ||
| 75 | if (running) { | ||
| 76 | if (p->prio > oldprio) | ||
| 77 | resched_task(rq->curr); | ||
| 78 | } else | ||
| 79 | check_preempt_curr(rq, p, 0); | ||
| 80 | } | 64 | } |
| 81 | 65 | ||
| 82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 66 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
| @@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = { | |||
| 110 | 94 | ||
| 111 | .prio_changed = prio_changed_idle, | 95 | .prio_changed = prio_changed_idle, |
| 112 | .switched_to = switched_to_idle, | 96 | .switched_to = switched_to_idle, |
| 113 | |||
| 114 | /* no .task_new for idle tasks */ | ||
| 115 | }; | 97 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad6267714c84..e7cebdc65f82 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
| 210 | 210 | ||
| 211 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 211 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
| 212 | { | 212 | { |
| 213 | int this_cpu = smp_processor_id(); | ||
| 214 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 213 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
| 215 | struct sched_rt_entity *rt_se; | 214 | struct sched_rt_entity *rt_se; |
| 216 | 215 | ||
| 217 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 216 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); |
| 217 | |||
| 218 | rt_se = rt_rq->tg->rt_se[cpu]; | ||
| 218 | 219 | ||
| 219 | if (rt_rq->rt_nr_running) { | 220 | if (rt_rq->rt_nr_running) { |
| 220 | if (rt_se && !on_rt_rq(rt_se)) | 221 | if (rt_se && !on_rt_rq(rt_se)) |
| @@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
| 226 | 227 | ||
| 227 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 228 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
| 228 | { | 229 | { |
| 229 | int this_cpu = smp_processor_id(); | ||
| 230 | struct sched_rt_entity *rt_se; | 230 | struct sched_rt_entity *rt_se; |
| 231 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); | ||
| 231 | 232 | ||
| 232 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 233 | rt_se = rt_rq->tg->rt_se[cpu]; |
| 233 | 234 | ||
| 234 | if (rt_se && on_rt_rq(rt_se)) | 235 | if (rt_se && on_rt_rq(rt_se)) |
| 235 | dequeue_rt_entity(rt_se); | 236 | dequeue_rt_entity(rt_se); |
| @@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 565 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
| 566 | idle = 0; | 567 | idle = 0; |
| 567 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 568 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 568 | } else if (rt_rq->rt_nr_running) | 569 | } else if (rt_rq->rt_nr_running) { |
| 569 | idle = 0; | 570 | idle = 0; |
| 571 | if (!rt_rq_throttled(rt_rq)) | ||
| 572 | enqueue = 1; | ||
| 573 | } | ||
| 570 | 574 | ||
| 571 | if (enqueue) | 575 | if (enqueue) |
| 572 | sched_rt_rq_enqueue(rt_rq); | 576 | sched_rt_rq_enqueue(rt_rq); |
| @@ -1374,7 +1378,7 @@ retry: | |||
| 1374 | task = pick_next_pushable_task(rq); | 1378 | task = pick_next_pushable_task(rq); |
| 1375 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1379 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
| 1376 | /* | 1380 | /* |
| 1377 | * If we get here, the task hasnt moved at all, but | 1381 | * If we get here, the task hasn't moved at all, but |
| 1378 | * it has failed to push. We will not try again, | 1382 | * it has failed to push. We will not try again, |
| 1379 | * since the other cpus will pull from us when they | 1383 | * since the other cpus will pull from us when they |
| 1380 | * are ready. | 1384 | * are ready. |
| @@ -1484,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1484 | /* | 1488 | /* |
| 1485 | * We continue with the search, just in | 1489 | * We continue with the search, just in |
| 1486 | * case there's an even higher prio task | 1490 | * case there's an even higher prio task |
| 1487 | * in another runqueue. (low likelyhood | 1491 | * in another runqueue. (low likelihood |
| 1488 | * but possible) | 1492 | * but possible) |
| 1489 | */ | 1493 | */ |
| 1490 | } | 1494 | } |
| @@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq) | |||
| 1595 | * When switch from the rt queue, we bring ourselves to a position | 1599 | * When switch from the rt queue, we bring ourselves to a position |
| 1596 | * that we might want to pull RT tasks from other runqueues. | 1600 | * that we might want to pull RT tasks from other runqueues. |
| 1597 | */ | 1601 | */ |
| 1598 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | 1602 | static void switched_from_rt(struct rq *rq, struct task_struct *p) |
| 1599 | int running) | ||
| 1600 | { | 1603 | { |
| 1601 | /* | 1604 | /* |
| 1602 | * If there are other RT tasks then we will reschedule | 1605 | * If there are other RT tasks then we will reschedule |
| @@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, | |||
| 1605 | * we may need to handle the pulling of RT tasks | 1608 | * we may need to handle the pulling of RT tasks |
| 1606 | * now. | 1609 | * now. |
| 1607 | */ | 1610 | */ |
| 1608 | if (!rq->rt.rt_nr_running) | 1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) |
| 1609 | pull_rt_task(rq); | 1612 | pull_rt_task(rq); |
| 1610 | } | 1613 | } |
| 1611 | 1614 | ||
| @@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void) | |||
| 1624 | * with RT tasks. In this case we try to push them off to | 1627 | * with RT tasks. In this case we try to push them off to |
| 1625 | * other runqueues. | 1628 | * other runqueues. |
| 1626 | */ | 1629 | */ |
| 1627 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | 1630 | static void switched_to_rt(struct rq *rq, struct task_struct *p) |
| 1628 | int running) | ||
| 1629 | { | 1631 | { |
| 1630 | int check_resched = 1; | 1632 | int check_resched = 1; |
| 1631 | 1633 | ||
| @@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
| 1636 | * If that current running task is also an RT task | 1638 | * If that current running task is also an RT task |
| 1637 | * then see if we can move to another run queue. | 1639 | * then see if we can move to another run queue. |
| 1638 | */ | 1640 | */ |
| 1639 | if (!running) { | 1641 | if (p->se.on_rq && rq->curr != p) { |
| 1640 | #ifdef CONFIG_SMP | 1642 | #ifdef CONFIG_SMP |
| 1641 | if (rq->rt.overloaded && push_rt_task(rq) && | 1643 | if (rq->rt.overloaded && push_rt_task(rq) && |
| 1642 | /* Don't resched if we changed runqueues */ | 1644 | /* Don't resched if we changed runqueues */ |
| @@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
| 1652 | * Priority of the task has changed. This may cause | 1654 | * Priority of the task has changed. This may cause |
| 1653 | * us to initiate a push or pull. | 1655 | * us to initiate a push or pull. |
| 1654 | */ | 1656 | */ |
| 1655 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | 1657 | static void |
| 1656 | int oldprio, int running) | 1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 1657 | { | 1659 | { |
| 1658 | if (running) { | 1660 | if (!p->se.on_rq) |
| 1661 | return; | ||
| 1662 | |||
| 1663 | if (rq->curr == p) { | ||
| 1659 | #ifdef CONFIG_SMP | 1664 | #ifdef CONFIG_SMP |
| 1660 | /* | 1665 | /* |
| 1661 | * If our priority decreases while running, we | 1666 | * If our priority decreases while running, we |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..1ba2bd40fdac 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
| @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) | |||
| 59 | { | 59 | { |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | static void switched_to_stop(struct rq *rq, struct task_struct *p, | 62 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
| 63 | int running) | ||
| 64 | { | 63 | { |
| 65 | BUG(); /* it's impossible to change to this class */ | 64 | BUG(); /* it's impossible to change to this class */ |
| 66 | } | 65 | } |
| 67 | 66 | ||
| 68 | static void prio_changed_stop(struct rq *rq, struct task_struct *p, | 67 | static void |
| 69 | int oldprio, int running) | 68 | prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) |
| 70 | { | 69 | { |
| 71 | BUG(); /* how!?, what priority? */ | 70 | BUG(); /* how!?, what priority? */ |
| 72 | } | 71 | } |
| @@ -103,6 +102,4 @@ static const struct sched_class stop_sched_class = { | |||
| 103 | 102 | ||
| 104 | .prio_changed = prio_changed_stop, | 103 | .prio_changed = prio_changed_stop, |
| 105 | .switched_to = switched_to_stop, | 104 | .switched_to = switched_to_stop, |
| 106 | |||
| 107 | /* no .task_new for stop tasks */ | ||
| 108 | }; | 105 | }; |
diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdce..7165af5f1b11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) | |||
| 226 | /* | 226 | /* |
| 227 | * allocate a new signal queue record | 227 | * allocate a new signal queue record |
| 228 | * - this may be called without locks if and only if t == current, otherwise an | 228 | * - this may be called without locks if and only if t == current, otherwise an |
| 229 | * appopriate lock must be held to stop the target task from exiting | 229 | * appropriate lock must be held to stop the target task from exiting |
| 230 | */ | 230 | */ |
| 231 | static struct sigqueue * | 231 | static struct sigqueue * |
| 232 | __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) | 232 | __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) |
| @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
| 375 | return !tracehook_consider_fatal_signal(tsk, sig); | 375 | return !tracehook_consider_fatal_signal(tsk, sig); |
| 376 | } | 376 | } |
| 377 | 377 | ||
| 378 | 378 | /* | |
| 379 | /* Notify the system that a driver wants to block all signals for this | 379 | * Notify the system that a driver wants to block all signals for this |
| 380 | * process, and wants to be notified if any signals at all were to be | 380 | * process, and wants to be notified if any signals at all were to be |
| 381 | * sent/acted upon. If the notifier routine returns non-zero, then the | 381 | * sent/acted upon. If the notifier routine returns non-zero, then the |
| 382 | * signal will be acted upon after all. If the notifier routine returns 0, | 382 | * signal will be acted upon after all. If the notifier routine returns 0, |
| 383 | * then the signal will be blocked. Only one block per process is | 383 | * then the signal will be blocked. Only one block per process is |
| 384 | * allowed. priv is a pointer to private data that the notifier routine | 384 | * allowed. priv is a pointer to private data that the notifier routine |
| 385 | * can use to determine if the signal should be blocked or not. */ | 385 | * can use to determine if the signal should be blocked or not. |
| 386 | 386 | */ | |
| 387 | void | 387 | void |
| 388 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) | 388 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) |
| 389 | { | 389 | { |
| @@ -434,9 +434,10 @@ still_pending: | |||
| 434 | copy_siginfo(info, &first->info); | 434 | copy_siginfo(info, &first->info); |
| 435 | __sigqueue_free(first); | 435 | __sigqueue_free(first); |
| 436 | } else { | 436 | } else { |
| 437 | /* Ok, it wasn't in the queue. This must be | 437 | /* |
| 438 | a fast-pathed signal or we must have been | 438 | * Ok, it wasn't in the queue. This must be |
| 439 | out of queue space. So zero out the info. | 439 | * a fast-pathed signal or we must have been |
| 440 | * out of queue space. So zero out the info. | ||
| 440 | */ | 441 | */ |
| 441 | info->si_signo = sig; | 442 | info->si_signo = sig; |
| 442 | info->si_errno = 0; | 443 | info->si_errno = 0; |
| @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
| 468 | } | 469 | } |
| 469 | 470 | ||
| 470 | /* | 471 | /* |
| 471 | * Dequeue a signal and return the element to the caller, which is | 472 | * Dequeue a signal and return the element to the caller, which is |
| 472 | * expected to free it. | 473 | * expected to free it. |
| 473 | * | 474 | * |
| 474 | * All callers have to hold the siglock. | 475 | * All callers have to hold the siglock. |
| @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 490 | * itimers are process shared and we restart periodic | 491 | * itimers are process shared and we restart periodic |
| 491 | * itimers in the signal delivery path to prevent DoS | 492 | * itimers in the signal delivery path to prevent DoS |
| 492 | * attacks in the high resolution timer case. This is | 493 | * attacks in the high resolution timer case. This is |
| 493 | * compliant with the old way of self restarting | 494 | * compliant with the old way of self-restarting |
| 494 | * itimers, as the SIGALRM is a legacy signal and only | 495 | * itimers, as the SIGALRM is a legacy signal and only |
| 495 | * queued once. Changing the restart behaviour to | 496 | * queued once. Changing the restart behaviour to |
| 496 | * restart the timer in the signal dequeue path is | 497 | * restart the timer in the signal dequeue path is |
| @@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info) | |||
| 636 | } | 637 | } |
| 637 | 638 | ||
| 638 | /* | 639 | /* |
| 640 | * called with RCU read lock from check_kill_permission() | ||
| 641 | */ | ||
| 642 | static int kill_ok_by_cred(struct task_struct *t) | ||
| 643 | { | ||
| 644 | const struct cred *cred = current_cred(); | ||
| 645 | const struct cred *tcred = __task_cred(t); | ||
| 646 | |||
| 647 | if (cred->user->user_ns == tcred->user->user_ns && | ||
| 648 | (cred->euid == tcred->suid || | ||
| 649 | cred->euid == tcred->uid || | ||
| 650 | cred->uid == tcred->suid || | ||
| 651 | cred->uid == tcred->uid)) | ||
| 652 | return 1; | ||
| 653 | |||
| 654 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | ||
| 655 | return 1; | ||
| 656 | |||
| 657 | return 0; | ||
| 658 | } | ||
| 659 | |||
| 660 | /* | ||
| 639 | * Bad permissions for sending the signal | 661 | * Bad permissions for sending the signal |
| 640 | * - the caller must hold the RCU read lock | 662 | * - the caller must hold the RCU read lock |
| 641 | */ | 663 | */ |
| 642 | static int check_kill_permission(int sig, struct siginfo *info, | 664 | static int check_kill_permission(int sig, struct siginfo *info, |
| 643 | struct task_struct *t) | 665 | struct task_struct *t) |
| 644 | { | 666 | { |
| 645 | const struct cred *cred, *tcred; | ||
| 646 | struct pid *sid; | 667 | struct pid *sid; |
| 647 | int error; | 668 | int error; |
| 648 | 669 | ||
| @@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 656 | if (error) | 677 | if (error) |
| 657 | return error; | 678 | return error; |
| 658 | 679 | ||
| 659 | cred = current_cred(); | ||
| 660 | tcred = __task_cred(t); | ||
| 661 | if (!same_thread_group(current, t) && | 680 | if (!same_thread_group(current, t) && |
| 662 | (cred->euid ^ tcred->suid) && | 681 | !kill_ok_by_cred(t)) { |
| 663 | (cred->euid ^ tcred->uid) && | ||
| 664 | (cred->uid ^ tcred->suid) && | ||
| 665 | (cred->uid ^ tcred->uid) && | ||
| 666 | !capable(CAP_KILL)) { | ||
| 667 | switch (sig) { | 682 | switch (sig) { |
| 668 | case SIGCONT: | 683 | case SIGCONT: |
| 669 | sid = task_session(t); | 684 | sid = task_session(t); |
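kill_ok_by_cred() replaces the old chained-XOR credential test with explicit uid/euid/suid comparisons plus an ns_capable(CAP_KILL) check in the target's user namespace; the XOR form expressed the same "no id matches and no capability" rule, just less readably. A plain-uid userspace model of the rule (no user namespaces, capability check stubbed out, all values illustrative):

#include <stdio.h>

struct cred { unsigned int uid, euid, suid; };

static int has_cap_kill(void)
{
        return 0;       /* stand-in for the ns_capable(CAP_KILL) check */
}

static int kill_ok_by_cred(const struct cred *c, const struct cred *t)
{
        if (c->euid == t->suid || c->euid == t->uid ||
            c->uid  == t->suid || c->uid  == t->uid)
                return 1;
        return has_cap_kill();
}

int main(void)
{
        struct cred me   = { .uid = 1000, .euid = 1000, .suid = 1000 };
        struct cred mine = { .uid = 1000, .euid = 1000, .suid = 1000 };
        struct cred root = { .uid = 0,    .euid = 0,    .suid = 0 };

        printf("own process: %d, root's process: %d\n",
               kill_ok_by_cred(&me, &mine),     /* 1 */
               kill_ok_by_cred(&me, &root));    /* 0 without CAP_KILL */
        return 0;
}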
| @@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 909 | if (info == SEND_SIG_FORCED) | 924 | if (info == SEND_SIG_FORCED) |
| 910 | goto out_set; | 925 | goto out_set; |
| 911 | 926 | ||
| 912 | /* Real-time signals must be queued if sent by sigqueue, or | 927 | /* |
| 913 | some other real-time mechanism. It is implementation | 928 | * Real-time signals must be queued if sent by sigqueue, or |
| 914 | defined whether kill() does so. We attempt to do so, on | 929 | * some other real-time mechanism. It is implementation |
| 915 | the principle of least surprise, but since kill is not | 930 | * defined whether kill() does so. We attempt to do so, on |
| 916 | allowed to fail with EAGAIN when low on memory we just | 931 | * the principle of least surprise, but since kill is not |
| 917 | make sure at least one signal gets delivered and don't | 932 | * allowed to fail with EAGAIN when low on memory we just |
| 918 | pass on the info struct. */ | 933 | * make sure at least one signal gets delivered and don't |
| 919 | 934 | * pass on the info struct. | |
| 935 | */ | ||
| 920 | if (sig < SIGRTMIN) | 936 | if (sig < SIGRTMIN) |
| 921 | override_rlimit = (is_si_special(info) || info->si_code >= 0); | 937 | override_rlimit = (is_si_special(info) || info->si_code >= 0); |
| 922 | else | 938 | else |
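The __send_signal() hunk above only reflows the comment into kernel style, but the behaviour it documents is worth illustrating: legacy signals may coalesce while blocked, real-time signals are queued per send. A hypothetical userspace demo (not part of the patch), assuming Linux's choice of queueing RT signals sent with kill():

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static volatile sig_atomic_t hits;

    static void handler(int sig)
    {
            (void)sig;
            hits++;
    }

    int main(void)
    {
            struct sigaction sa;
            sigset_t block, old;
            int i;

            sa.sa_handler = handler;
            sa.sa_flags = 0;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGUSR1, &sa, NULL);
            sigaction(SIGRTMIN, &sa, NULL);

            sigemptyset(&block);
            sigaddset(&block, SIGUSR1);
            sigaddset(&block, SIGRTMIN);
            sigprocmask(SIG_BLOCK, &block, &old);

            for (i = 0; i < 3; i++) {
                    kill(getpid(), SIGUSR1);   /* legacy: coalesces to one pending instance */
                    kill(getpid(), SIGRTMIN);  /* real-time: each send is queued */
            }

            sigprocmask(SIG_SETMASK, &old, NULL);   /* handlers run on unblock */
            printf("handled %d signals (expect 4 on Linux: 1 + 3)\n", (int)hits);
            return 0;
    }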
| @@ -1187,8 +1203,7 @@ retry: | |||
| 1187 | return error; | 1203 | return error; |
| 1188 | } | 1204 | } |
| 1189 | 1205 | ||
| 1190 | int | 1206 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
| 1191 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | ||
| 1192 | { | 1207 | { |
| 1193 | int error; | 1208 | int error; |
| 1194 | rcu_read_lock(); | 1209 | rcu_read_lock(); |
| @@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) | |||
| 1285 | * These are for backward compatibility with the rest of the kernel source. | 1300 | * These are for backward compatibility with the rest of the kernel source. |
| 1286 | */ | 1301 | */ |
| 1287 | 1302 | ||
| 1288 | int | 1303 | int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
| 1289 | send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
| 1290 | { | 1304 | { |
| 1291 | /* | 1305 | /* |
| 1292 | * Make sure legacy kernel users don't send in bad values | 1306 | * Make sure legacy kernel users don't send in bad values |
| @@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); | |||
| 1354 | * These functions support sending signals using preallocated sigqueue | 1368 | * These functions support sending signals using preallocated sigqueue |
| 1355 | * structures. This is needed "because realtime applications cannot | 1369 | * structures. This is needed "because realtime applications cannot |
| 1356 | * afford to lose notifications of asynchronous events, like timer | 1370 | * afford to lose notifications of asynchronous events, like timer |
| 1357 | * expirations or I/O completions". In the case of Posix Timers | 1371 | * expirations or I/O completions". In the case of POSIX Timers |
| 1358 | * we allocate the sigqueue structure from the timer_create. If this | 1372 | * we allocate the sigqueue structure from the timer_create. If this |
| 1359 | * allocation fails we are able to report the failure to the application | 1373 | * allocation fails we are able to report the failure to the application |
| 1360 | * with an EAGAIN error. | 1374 | * with an EAGAIN error. |
| @@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
| 1539 | info.si_signo = SIGCHLD; | 1553 | info.si_signo = SIGCHLD; |
| 1540 | info.si_errno = 0; | 1554 | info.si_errno = 0; |
| 1541 | /* | 1555 | /* |
| 1542 | * see comment in do_notify_parent() abot the following 3 lines | 1556 | * see comment in do_notify_parent() about the following 4 lines |
| 1543 | */ | 1557 | */ |
| 1544 | rcu_read_lock(); | 1558 | rcu_read_lock(); |
| 1545 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1559 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
| @@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void) | |||
| 1597 | } | 1611 | } |
| 1598 | 1612 | ||
| 1599 | /* | 1613 | /* |
| 1600 | * Return nonzero if there is a SIGKILL that should be waking us up. | 1614 | * Return non-zero if there is a SIGKILL that should be waking us up. |
| 1601 | * Called with the siglock held. | 1615 | * Called with the siglock held. |
| 1602 | */ | 1616 | */ |
| 1603 | static int sigkill_pending(struct task_struct *tsk) | 1617 | static int sigkill_pending(struct task_struct *tsk) |
| @@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code) | |||
| 1721 | /* | 1735 | /* |
| 1722 | * This performs the stopping for SIGSTOP and other stop signals. | 1736 | * This performs the stopping for SIGSTOP and other stop signals. |
| 1723 | * We have to stop all threads in the thread group. | 1737 | * We have to stop all threads in the thread group. |
| 1724 | * Returns nonzero if we've actually stopped and released the siglock. | 1738 | * Returns non-zero if we've actually stopped and released the siglock. |
| 1725 | * Returns zero if we didn't stop and still hold the siglock. | 1739 | * Returns zero if we didn't stop and still hold the siglock. |
| 1726 | */ | 1740 | */ |
| 1727 | static int do_signal_stop(int signr) | 1741 | static int do_signal_stop(int signr) |
| @@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
| 1809 | 1823 | ||
| 1810 | current->exit_code = 0; | 1824 | current->exit_code = 0; |
| 1811 | 1825 | ||
| 1812 | /* Update the siginfo structure if the signal has | 1826 | /* |
| 1813 | changed. If the debugger wanted something | 1827 | * Update the siginfo structure if the signal has |
| 1814 | specific in the siginfo structure then it should | 1828 | * changed. If the debugger wanted something |
| 1815 | have updated *info via PTRACE_SETSIGINFO. */ | 1829 | * specific in the siginfo structure then it should |
| 1830 | * have updated *info via PTRACE_SETSIGINFO. | ||
| 1831 | */ | ||
| 1816 | if (signr != info->si_signo) { | 1832 | if (signr != info->si_signo) { |
| 1817 | info->si_signo = signr; | 1833 | info->si_signo = signr; |
| 1818 | info->si_errno = 0; | 1834 | info->si_errno = 0; |
| @@ -1871,7 +1887,7 @@ relock: | |||
| 1871 | for (;;) { | 1887 | for (;;) { |
| 1872 | struct k_sigaction *ka; | 1888 | struct k_sigaction *ka; |
| 1873 | /* | 1889 | /* |
| 1874 | * Tracing can induce an artifical signal and choose sigaction. | 1890 | * Tracing can induce an artificial signal and choose sigaction. |
| 1875 | * The return value in @signr determines the default action, | 1891 | * The return value in @signr determines the default action, |
| 1876 | * but @info->si_signo is the signal number we will report. | 1892 | * but @info->si_signo is the signal number we will report. |
| 1877 | */ | 1893 | */ |
| @@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) | |||
| 2020 | if (!signal_pending(tsk)) | 2036 | if (!signal_pending(tsk)) |
| 2021 | goto out; | 2037 | goto out; |
| 2022 | 2038 | ||
| 2023 | /* It could be that __group_complete_signal() choose us to | 2039 | /* |
| 2040 | * It could be that __group_complete_signal() choose us to | ||
| 2024 | * notify about group-wide signal. Another thread should be | 2041 | * notify about group-wide signal. Another thread should be |
| 2025 | * woken now to take the signal since we will not. | 2042 | * woken now to take the signal since we will not. |
| 2026 | */ | 2043 | */ |
| @@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); | |||
| 2058 | * System call entry points. | 2075 | * System call entry points. |
| 2059 | */ | 2076 | */ |
| 2060 | 2077 | ||
| 2078 | /** | ||
| 2079 | * sys_restart_syscall - restart a system call | ||
| 2080 | */ | ||
| 2061 | SYSCALL_DEFINE0(restart_syscall) | 2081 | SYSCALL_DEFINE0(restart_syscall) |
| 2062 | { | 2082 | { |
| 2063 | struct restart_block *restart = ¤t_thread_info()->restart_block; | 2083 | struct restart_block *restart = ¤t_thread_info()->restart_block; |
| @@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
| 2111 | return error; | 2131 | return error; |
| 2112 | } | 2132 | } |
| 2113 | 2133 | ||
| 2134 | /** | ||
| 2135 | * sys_rt_sigprocmask - change the list of currently blocked signals | ||
| 2136 | * @how: whether to add, remove, or set signals | ||
| 2137 | * @set: stores pending signals | ||
| 2138 | * @oset: previous value of signal mask if non-null | ||
| 2139 | * @sigsetsize: size of sigset_t type | ||
| 2140 | */ | ||
| 2114 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, | 2141 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, |
| 2115 | sigset_t __user *, oset, size_t, sigsetsize) | 2142 | sigset_t __user *, oset, size_t, sigsetsize) |
| 2116 | { | 2143 | { |
| @@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) | |||
| 2169 | 2196 | ||
| 2170 | out: | 2197 | out: |
| 2171 | return error; | 2198 | return error; |
| 2172 | } | 2199 | } |
| 2173 | 2200 | ||
| 2201 | /** | ||
| 2202 | * sys_rt_sigpending - examine a pending signal that has been raised | ||
| 2203 | * while blocked | ||
| 2204 | * @set: stores pending signals | ||
| 2205 | * @sigsetsize: size of sigset_t type or larger | ||
| 2206 | */ | ||
| 2174 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) | 2207 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) |
| 2175 | { | 2208 | { |
| 2176 | return do_sigpending(set, sigsetsize); | 2209 | return do_sigpending(set, sigsetsize); |
| @@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
| 2219 | err |= __put_user(from->si_trapno, &to->si_trapno); | 2252 | err |= __put_user(from->si_trapno, &to->si_trapno); |
| 2220 | #endif | 2253 | #endif |
| 2221 | #ifdef BUS_MCEERR_AO | 2254 | #ifdef BUS_MCEERR_AO |
| 2222 | /* | 2255 | /* |
| 2223 | * Other callers might not initialize the si_lsb field, | 2256 | * Other callers might not initialize the si_lsb field, |
| 2224 | * so check explicitely for the right codes here. | 2257 | * so check explicitly for the right codes here. |
| 2225 | */ | 2258 | */ |
| 2226 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2259 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) |
| 2227 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2260 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
| @@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
| 2250 | 2283 | ||
| 2251 | #endif | 2284 | #endif |
| 2252 | 2285 | ||
| 2286 | /** | ||
| 2287 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified | ||
| 2288 | * in @uthese | ||
| 2289 | * @uthese: queued signals to wait for | ||
| 2290 | * @uinfo: if non-null, the signal's siginfo is returned here | ||
| 2291 | * @uts: upper bound on process time suspension | ||
| 2292 | * @sigsetsize: size of sigset_t type | ||
| 2293 | */ | ||
| 2253 | SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | 2294 | SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, |
| 2254 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, | 2295 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, |
| 2255 | size_t, sigsetsize) | 2296 | size_t, sigsetsize) |
| @@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2266 | 2307 | ||
| 2267 | if (copy_from_user(&these, uthese, sizeof(these))) | 2308 | if (copy_from_user(&these, uthese, sizeof(these))) |
| 2268 | return -EFAULT; | 2309 | return -EFAULT; |
| 2269 | 2310 | ||
| 2270 | /* | 2311 | /* |
| 2271 | * Invert the set of allowed signals to get those we | 2312 | * Invert the set of allowed signals to get those we |
| 2272 | * want to block. | 2313 | * want to block. |
| @@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2291 | + (ts.tv_sec || ts.tv_nsec)); | 2332 | + (ts.tv_sec || ts.tv_nsec)); |
| 2292 | 2333 | ||
| 2293 | if (timeout) { | 2334 | if (timeout) { |
| 2294 | /* None ready -- temporarily unblock those we're | 2335 | /* |
| 2336 | * None ready -- temporarily unblock those we're | ||
| 2295 | * interested while we are sleeping in so that we'll | 2337 | * interested while we are sleeping in so that we'll |
| 2296 | * be awakened when they arrive. */ | 2338 | * be awakened when they arrive. |
| 2339 | */ | ||
| 2297 | current->real_blocked = current->blocked; | 2340 | current->real_blocked = current->blocked; |
| 2298 | sigandsets(¤t->blocked, ¤t->blocked, &these); | 2341 | sigandsets(¤t->blocked, ¤t->blocked, &these); |
| 2299 | recalc_sigpending(); | 2342 | recalc_sigpending(); |
| @@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2325 | return ret; | 2368 | return ret; |
| 2326 | } | 2369 | } |
| 2327 | 2370 | ||
| 2371 | /** | ||
| 2372 | * sys_kill - send a signal to a process | ||
| 2373 | * @pid: the PID of the process | ||
| 2374 | * @sig: signal to be sent | ||
| 2375 | */ | ||
| 2328 | SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | 2376 | SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) |
| 2329 | { | 2377 | { |
| 2330 | struct siginfo info; | 2378 | struct siginfo info; |
| @@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) | |||
| 2400 | return do_tkill(tgid, pid, sig); | 2448 | return do_tkill(tgid, pid, sig); |
| 2401 | } | 2449 | } |
| 2402 | 2450 | ||
| 2403 | /* | 2451 | /** |
| 2452 | * sys_tkill - send signal to one specific task | ||
| 2453 | * @pid: the PID of the task | ||
| 2454 | * @sig: signal to be sent | ||
| 2455 | * | ||
| 2404 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2456 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
| 2405 | */ | 2457 | */ |
| 2406 | SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | 2458 | SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) |
| @@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | |||
| 2412 | return do_tkill(0, pid, sig); | 2464 | return do_tkill(0, pid, sig); |
| 2413 | } | 2465 | } |
| 2414 | 2466 | ||
| 2467 | /** | ||
| 2468 | * sys_rt_sigqueueinfo - send signal information to a signal | ||
| 2469 | * @pid: the PID of the thread | ||
| 2470 | * @sig: signal to be sent | ||
| 2471 | * @uinfo: signal info to be sent | ||
| 2472 | */ | ||
| 2415 | SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | 2473 | SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, |
| 2416 | siginfo_t __user *, uinfo) | 2474 | siginfo_t __user *, uinfo) |
| 2417 | { | 2475 | { |
| @@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | |||
| 2421 | return -EFAULT; | 2479 | return -EFAULT; |
| 2422 | 2480 | ||
| 2423 | /* Not even root can pretend to send signals from the kernel. | 2481 | /* Not even root can pretend to send signals from the kernel. |
| 2424 | Nor can they impersonate a kill(), which adds source info. */ | 2482 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
| 2425 | if (info.si_code >= 0) | 2483 | */ |
| 2484 | if (info.si_code >= 0 || info.si_code == SI_TKILL) { | ||
| 2485 | /* We used to allow any < 0 si_code */ | ||
| 2486 | WARN_ON_ONCE(info.si_code < 0); | ||
| 2426 | return -EPERM; | 2487 | return -EPERM; |
| 2488 | } | ||
| 2427 | info.si_signo = sig; | 2489 | info.si_signo = sig; |
| 2428 | 2490 | ||
| 2429 | /* POSIX.1b doesn't mention process groups. */ | 2491 | /* POSIX.1b doesn't mention process groups. */ |
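This hunk (and the matching one in do_rt_tgsigqueueinfo() just below) closes a spoofing hole: userspace could previously pass si_code == SI_TKILL and have a queued signal look as if it came from tkill()/tgkill(). A hypothetical userspace probe, not part of the patch, showing the new failure mode:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef SI_TKILL
    #define SI_TKILL (-6)
    #endif

    int main(void)
    {
            siginfo_t info;

            memset(&info, 0, sizeof(info));
            info.si_signo = SIGUSR1;
            info.si_code  = SI_TKILL;      /* pretend this came from tgkill() */
            info.si_pid   = getpid();
            info.si_uid   = getuid();

            /* After this change the kernel rejects the forged si_code. */
            if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0)
                    printf("rt_sigqueueinfo: %s (EPERM expected)\n",
                           strerror(errno));
            return 0;
    }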
| @@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | |||
| 2437 | return -EINVAL; | 2499 | return -EINVAL; |
| 2438 | 2500 | ||
| 2439 | /* Not even root can pretend to send signals from the kernel. | 2501 | /* Not even root can pretend to send signals from the kernel. |
| 2440 | Nor can they impersonate a kill(), which adds source info. */ | 2502 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
| 2441 | if (info->si_code >= 0) | 2503 | */ |
| 2504 | if (info->si_code >= 0 || info->si_code == SI_TKILL) { | ||
| 2505 | /* We used to allow any < 0 si_code */ | ||
| 2506 | WARN_ON_ONCE(info->si_code < 0); | ||
| 2442 | return -EPERM; | 2507 | return -EPERM; |
| 2508 | } | ||
| 2443 | info->si_signo = sig; | 2509 | info->si_signo = sig; |
| 2444 | 2510 | ||
| 2445 | return do_send_specific(tgid, pid, sig, info); | 2511 | return do_send_specific(tgid, pid, sig, info); |
| @@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
| 2531 | 2597 | ||
| 2532 | error = -EINVAL; | 2598 | error = -EINVAL; |
| 2533 | /* | 2599 | /* |
| 2534 | * | 2600 | * Note - this code used to test ss_flags incorrectly: |
| 2535 | * Note - this code used to test ss_flags incorrectly | ||
| 2536 | * old code may have been written using ss_flags==0 | 2601 | * old code may have been written using ss_flags==0 |
| 2537 | * to mean ss_flags==SS_ONSTACK (as this was the only | 2602 | * to mean ss_flags==SS_ONSTACK (as this was the only |
| 2538 | * way that worked) - this fix preserves that older | 2603 | * way that worked) - this fix preserves that older |
| 2539 | * mechanism | 2604 | * mechanism. |
| 2540 | */ | 2605 | */ |
| 2541 | if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) | 2606 | if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) |
| 2542 | goto out; | 2607 | goto out; |
| @@ -2570,6 +2635,10 @@ out: | |||
| 2570 | 2635 | ||
| 2571 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 2636 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
| 2572 | 2637 | ||
| 2638 | /** | ||
| 2639 | * sys_sigpending - examine pending signals | ||
| 2640 | * @set: where mask of pending signal is returned | ||
| 2641 | */ | ||
| 2573 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 2642 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) |
| 2574 | { | 2643 | { |
| 2575 | return do_sigpending(set, sizeof(*set)); | 2644 | return do_sigpending(set, sizeof(*set)); |
| @@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
| 2578 | #endif | 2647 | #endif |
| 2579 | 2648 | ||
| 2580 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 2649 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
| 2581 | /* Some platforms have their own version with special arguments others | 2650 | /** |
| 2582 | support only sys_rt_sigprocmask. */ | 2651 | * sys_sigprocmask - examine and change blocked signals |
| 2652 | * @how: whether to add, remove, or set signals | ||
| 2653 | * @set: signals to add or remove (if non-null) | ||
| 2654 | * @oset: previous value of signal mask if non-null | ||
| 2655 | * | ||
| 2656 | * Some platforms have their own version with special arguments; | ||
| 2657 | * others support only sys_rt_sigprocmask. | ||
| 2658 | */ | ||
| 2583 | 2659 | ||
| 2584 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, | 2660 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, |
| 2585 | old_sigset_t __user *, oset) | 2661 | old_sigset_t __user *, oset) |
| @@ -2632,6 +2708,13 @@ out: | |||
| 2632 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 2708 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
| 2633 | 2709 | ||
| 2634 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION | 2710 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION |
| 2711 | /** | ||
| 2712 | * sys_rt_sigaction - alter an action taken by a process | ||
| 2713 | * @sig: signal to be sent | ||
| 2714 | * @act: new sigaction | ||
| 2715 | * @oact: used to save the previous sigaction | ||
| 2716 | * @sigsetsize: size of sigset_t type | ||
| 2717 | */ | ||
| 2635 | SYSCALL_DEFINE4(rt_sigaction, int, sig, | 2718 | SYSCALL_DEFINE4(rt_sigaction, int, sig, |
| 2636 | const struct sigaction __user *, act, | 2719 | const struct sigaction __user *, act, |
| 2637 | struct sigaction __user *, oact, | 2720 | struct sigaction __user *, oact, |
| @@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause) | |||
| 2718 | #endif | 2801 | #endif |
| 2719 | 2802 | ||
| 2720 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 2803 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
| 2804 | /** | ||
| 2805 | * sys_rt_sigsuspend - replace the signal mask for a value with the | ||
| 2806 | * @unewset value until a signal is received | ||
| 2807 | * @unewset: new signal mask value | ||
| 2808 | * @sigsetsize: size of sigset_t type | ||
| 2809 | */ | ||
| 2721 | SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | 2810 | SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) |
| 2722 | { | 2811 | { |
| 2723 | sigset_t newset; | 2812 | sigset_t newset; |
diff --git a/kernel/smp.c b/kernel/smp.c index 9910744f0856..73a195193558 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void) | |||
| 194 | */ | 194 | */ |
| 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
| 196 | int refs; | 196 | int refs; |
| 197 | void (*func) (void *info); | 197 | smp_call_func_t func; |
| 198 | 198 | ||
| 199 | /* | 199 | /* |
| 200 | * Since we walk the list without any locks, we might | 200 | * Since we walk the list without any locks, we might |
| @@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void) | |||
| 214 | if (atomic_read(&data->refs) == 0) | 214 | if (atomic_read(&data->refs) == 0) |
| 215 | continue; | 215 | continue; |
| 216 | 216 | ||
| 217 | func = data->csd.func; /* for later warn */ | 217 | func = data->csd.func; /* save for later warn */ |
| 218 | data->csd.func(data->csd.info); | 218 | func(data->csd.info); |
| 219 | 219 | ||
| 220 | /* | 220 | /* |
| 221 | * If the cpu mask is not still set then it enabled interrupts, | 221 | * If the cpu mask is not still set then func enabled |
| 222 | * we took another smp interrupt, and executed the function | 222 | * interrupts (BUG), and this cpu took another smp call |
| 223 | * twice on this cpu. In theory that copy decremented refs. | 223 | * function interrupt and executed func(info) twice |
| 224 | * on this cpu. That nested execution decremented refs. | ||
| 224 | */ | 225 | */ |
| 225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | 226 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { |
| 226 | WARN(1, "%pS enabled interrupts and double executed\n", | 227 | WARN(1, "%pf enabled interrupts and double executed\n", func); |
| 227 | func); | ||
| 228 | continue; | 228 | continue; |
| 229 | } | 229 | } |
| 230 | 230 | ||
| @@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 450 | { | 450 | { |
| 451 | struct call_function_data *data; | 451 | struct call_function_data *data; |
| 452 | unsigned long flags; | 452 | unsigned long flags; |
| 453 | int cpu, next_cpu, this_cpu = smp_processor_id(); | 453 | int refs, cpu, next_cpu, this_cpu = smp_processor_id(); |
| 454 | 454 | ||
| 455 | /* | 455 | /* |
| 456 | * Can deadlock when called with interrupts disabled. | 456 | * Can deadlock when called with interrupts disabled. |
| @@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
| 462 | && !oops_in_progress && !early_boot_irqs_disabled); | 462 | && !oops_in_progress && !early_boot_irqs_disabled); |
| 463 | 463 | ||
| 464 | /* So, what's a CPU they want? Ignoring this one. */ | 464 | /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ |
| 465 | cpu = cpumask_first_and(mask, cpu_online_mask); | 465 | cpu = cpumask_first_and(mask, cpu_online_mask); |
| 466 | if (cpu == this_cpu) | 466 | if (cpu == this_cpu) |
| 467 | cpu = cpumask_next_and(cpu, mask, cpu_online_mask); | 467 | cpu = cpumask_next_and(cpu, mask, cpu_online_mask); |
| @@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 483 | 483 | ||
| 484 | data = &__get_cpu_var(cfd_data); | 484 | data = &__get_cpu_var(cfd_data); |
| 485 | csd_lock(&data->csd); | 485 | csd_lock(&data->csd); |
| 486 | |||
| 487 | /* This BUG_ON verifies our reuse assertions and can be removed */ | ||
| 486 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | 488 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); |
| 487 | 489 | ||
| 490 | /* | ||
| 491 | * The global call function queue list add and delete are protected | ||
| 492 | * by a lock, but the list is traversed without any lock, relying | ||
| 493 | * on the rcu list add and delete to allow safe concurrent traversal. | ||
| 494 | * We reuse the call function data without waiting for any grace | ||
| 495 | * period after some other cpu removes it from the global queue. | ||
| 496 | * This means a cpu might find our data block as it is being | ||
| 497 | * filled out. | ||
| 498 | * | ||
| 499 | * We hold off the interrupt handler on the other cpu by | ||
| 500 | * ordering our writes to the cpu mask vs our setting of the | ||
| 501 | * refs counter. We assert only the cpu owning the data block | ||
| 502 | * will set a bit in cpumask, and each bit will only be cleared | ||
| 503 | * by the subject cpu. Each cpu must first find its bit is | ||
| 504 | * set and then check that refs is set indicating the element is | ||
| 505 | * ready to be processed, otherwise it must skip the entry. | ||
| 506 | * | ||
| 507 | * On the previous iteration refs was set to 0 by another cpu. | ||
| 508 | * To avoid the use of transitivity, set the counter to 0 here | ||
| 509 | * so the wmb will pair with the rmb in the interrupt handler. | ||
| 510 | */ | ||
| 511 | atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ | ||
| 512 | |||
| 488 | data->csd.func = func; | 513 | data->csd.func = func; |
| 489 | data->csd.info = info; | 514 | data->csd.info = info; |
| 490 | cpumask_and(data->cpumask, mask, cpu_online_mask); | ||
| 491 | cpumask_clear_cpu(this_cpu, data->cpumask); | ||
| 492 | 515 | ||
| 493 | /* | 516 | /* Ensure 0 refs is visible before mask. Also orders func and info */ |
| 494 | * To ensure the interrupt handler gets an complete view | ||
| 495 | * we order the cpumask and refs writes and order the read | ||
| 496 | * of them in the interrupt handler. In addition we may | ||
| 497 | * only clear our own cpu bit from the mask. | ||
| 498 | */ | ||
| 499 | smp_wmb(); | 517 | smp_wmb(); |
| 500 | 518 | ||
| 501 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 519 | /* We rely on the "and" being processed before the store */ |
| 520 | cpumask_and(data->cpumask, mask, cpu_online_mask); | ||
| 521 | cpumask_clear_cpu(this_cpu, data->cpumask); | ||
| 522 | refs = cpumask_weight(data->cpumask); | ||
| 523 | |||
| 524 | /* Some callers race with other cpus changing the passed mask */ | ||
| 525 | if (unlikely(!refs)) { | ||
| 526 | csd_unlock(&data->csd); | ||
| 527 | return; | ||
| 528 | } | ||
| 502 | 529 | ||
| 503 | raw_spin_lock_irqsave(&call_function.lock, flags); | 530 | raw_spin_lock_irqsave(&call_function.lock, flags); |
| 504 | /* | 531 | /* |
| @@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 507 | * will not miss any other list entries: | 534 | * will not miss any other list entries: |
| 508 | */ | 535 | */ |
| 509 | list_add_rcu(&data->csd.list, &call_function.queue); | 536 | list_add_rcu(&data->csd.list, &call_function.queue); |
| 537 | /* | ||
| 538 | * We rely on the wmb() in list_add_rcu to complete our writes | ||
| 539 | * to the cpumask before this write to refs, which indicates | ||
| 540 | * data is on the list and is ready to be processed. | ||
| 541 | */ | ||
| 542 | atomic_set(&data->refs, refs); | ||
| 510 | raw_spin_unlock_irqrestore(&call_function.lock, flags); | 543 | raw_spin_unlock_irqrestore(&call_function.lock, flags); |
| 511 | 544 | ||
| 512 | /* | 545 | /* |
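The rewritten smp_call_function_many() above is essentially a lock-free publish/consume protocol: refs is first reset to 0, the callback, info and cpumask are filled in, and refs is only raised to the final count after list_add_rcu(), whose write barrier guarantees the earlier writes are visible first; the interrupt handler pairs this with its read-side barriers and skips entries whose refs is still 0. A compressed, userspace-flavoured sketch of the same ordering using C11 atomics rather than the kernel's smp_wmb()/smp_rmb() — illustrative only, not the kernel's implementation:

    #include <stdatomic.h>

    struct call_block {
            int payload;            /* stands in for func/info/cpumask */
            atomic_int refs;        /* 0 = not ready, >0 = ready to process */
    };

    /* Writer: fill the block, then publish it by raising refs last. */
    static void publish(struct call_block *b, int payload, int nr_consumers)
    {
            atomic_store_explicit(&b->refs, 0, memory_order_relaxed);
            b->payload = payload;
            /* release: payload is visible before refs becomes non-zero */
            atomic_store_explicit(&b->refs, nr_consumers, memory_order_release);
    }

    /* Reader (the IPI handler's role): skip entries that are not ready yet. */
    static int consume(struct call_block *b)
    {
            if (atomic_load_explicit(&b->refs, memory_order_acquire) == 0)
                    return -1;      /* being refilled by its owner -- skip */
            return b->payload;      /* safe: ordered after the refs check */
    }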
| @@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void) | |||
| 571 | } | 604 | } |
| 572 | #endif /* USE_GENERIC_SMP_HELPERS */ | 605 | #endif /* USE_GENERIC_SMP_HELPERS */ |
| 573 | 606 | ||
| 607 | /* Setup configured maximum number of CPUs to activate */ | ||
| 608 | unsigned int setup_max_cpus = NR_CPUS; | ||
| 609 | EXPORT_SYMBOL(setup_max_cpus); | ||
| 610 | |||
| 611 | |||
| 612 | /* | ||
| 613 | * Setup routine for controlling SMP activation | ||
| 614 | * | ||
| 615 | * Command-line option of "nosmp" or "maxcpus=0" will disable SMP | ||
| 616 | * activation entirely (the MPS table probe still happens, though). | ||
| 617 | * | ||
| 618 | * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer | ||
| 619 | * greater than 0, limits the maximum number of CPUs activated in | ||
| 620 | * SMP mode to <NUM>. | ||
| 621 | */ | ||
| 622 | |||
| 623 | void __weak arch_disable_smp_support(void) { } | ||
| 624 | |||
| 625 | static int __init nosmp(char *str) | ||
| 626 | { | ||
| 627 | setup_max_cpus = 0; | ||
| 628 | arch_disable_smp_support(); | ||
| 629 | |||
| 630 | return 0; | ||
| 631 | } | ||
| 632 | |||
| 633 | early_param("nosmp", nosmp); | ||
| 634 | |||
| 635 | /* this is hard limit */ | ||
| 636 | static int __init nrcpus(char *str) | ||
| 637 | { | ||
| 638 | int nr_cpus; | ||
| 639 | |||
| 640 | get_option(&str, &nr_cpus); | ||
| 641 | if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) | ||
| 642 | nr_cpu_ids = nr_cpus; | ||
| 643 | |||
| 644 | return 0; | ||
| 645 | } | ||
| 646 | |||
| 647 | early_param("nr_cpus", nrcpus); | ||
| 648 | |||
| 649 | static int __init maxcpus(char *str) | ||
| 650 | { | ||
| 651 | get_option(&str, &setup_max_cpus); | ||
| 652 | if (setup_max_cpus == 0) | ||
| 653 | arch_disable_smp_support(); | ||
| 654 | |||
| 655 | return 0; | ||
| 656 | } | ||
| 657 | |||
| 658 | early_param("maxcpus", maxcpus); | ||
| 659 | |||
| 660 | /* Setup number of possible processor ids */ | ||
| 661 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
| 662 | EXPORT_SYMBOL(nr_cpu_ids); | ||
| 663 | |||
| 664 | /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ | ||
| 665 | void __init setup_nr_cpu_ids(void) | ||
| 666 | { | ||
| 667 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; | ||
| 668 | } | ||
| 669 | |||
| 670 | /* Called by boot processor to activate the rest. */ | ||
| 671 | void __init smp_init(void) | ||
| 672 | { | ||
| 673 | unsigned int cpu; | ||
| 674 | |||
| 675 | /* FIXME: This should be done in userspace --RR */ | ||
| 676 | for_each_present_cpu(cpu) { | ||
| 677 | if (num_online_cpus() >= setup_max_cpus) | ||
| 678 | break; | ||
| 679 | if (!cpu_online(cpu)) | ||
| 680 | cpu_up(cpu); | ||
| 681 | } | ||
| 682 | |||
| 683 | /* Any cleanup work */ | ||
| 684 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | ||
| 685 | smp_cpus_done(setup_max_cpus); | ||
| 686 | } | ||
| 687 | |||
| 574 | /* | 688 | /* |
| 575 | * Call a function on all processors. May be used during early boot while | 689 | * Call a function on all processors. May be used during early boot while |
| 576 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | 690 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead |
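The large added block above is generic SMP boot plumbing (setup_max_cpus, nr_cpu_ids, smp_init() and the nosmp/maxcpus/nr_cpus early parameters) that previously lived in init/main.c; its behaviour is unchanged, only its home moves into kernel/smp.c. For reference, the options it parses appear on the kernel command line like this (the numeric values are examples only):

    nosmp          # disable SMP activation entirely
    maxcpus=2      # bring up at most 2 CPUs at boot
    nr_cpus=4      # hard-cap the number of possible CPU ids at 4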
diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..174f976c2874 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); | |||
| 54 | 54 | ||
| 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; | 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
| 56 | 56 | ||
| 57 | static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
| 58 | 58 | ||
| 59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
| 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
| @@ -311,9 +311,21 @@ void irq_enter(void) | |||
| 311 | } | 311 | } |
| 312 | 312 | ||
| 313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
| 314 | # define invoke_softirq() __do_softirq() | 314 | static inline void invoke_softirq(void) |
| 315 | { | ||
| 316 | if (!force_irqthreads) | ||
| 317 | __do_softirq(); | ||
| 318 | else | ||
| 319 | wakeup_softirqd(); | ||
| 320 | } | ||
| 315 | #else | 321 | #else |
| 316 | # define invoke_softirq() do_softirq() | 322 | static inline void invoke_softirq(void) |
| 323 | { | ||
| 324 | if (!force_irqthreads) | ||
| 325 | do_softirq(); | ||
| 326 | else | ||
| 327 | wakeup_softirqd(); | ||
| 328 | } | ||
| 317 | #endif | 329 | #endif |
| 318 | 330 | ||
| 319 | /* | 331 | /* |
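invoke_softirq() now honours force_irqthreads: with forced interrupt threading enabled (the genirq "threadirqs" boot option introduced elsewhere in this series), pending softirqs are handed to ksoftirqd via wakeup_softirqd() instead of running on the hard-irq return path, which together with the run_ksoftirqd() change further down keeps that work in a schedulable context.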
| @@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) | |||
| 555 | /** | 567 | /** |
| 556 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | 568 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks |
| 557 | * @ttimer: tasklet_hrtimer which is initialized | 569 | * @ttimer: tasklet_hrtimer which is initialized |
| 558 | * @function: hrtimer callback funtion which gets called from softirq context | 570 | * @function: hrtimer callback function which gets called from softirq context |
| 559 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | 571 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) |
| 560 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | 572 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) |
| 561 | */ | 573 | */ |
| @@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
| 721 | { | 733 | { |
| 722 | set_current_state(TASK_INTERRUPTIBLE); | 734 | set_current_state(TASK_INTERRUPTIBLE); |
| 723 | 735 | ||
| 724 | current->flags |= PF_KSOFTIRQD; | ||
| 725 | while (!kthread_should_stop()) { | 736 | while (!kthread_should_stop()) { |
| 726 | preempt_disable(); | 737 | preempt_disable(); |
| 727 | if (!local_softirq_pending()) { | 738 | if (!local_softirq_pending()) { |
| @@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
| 738 | don't process */ | 749 | don't process */ |
| 739 | if (cpu_is_offline((long)__bind_cpu)) | 750 | if (cpu_is_offline((long)__bind_cpu)) |
| 740 | goto wait_to_die; | 751 | goto wait_to_die; |
| 741 | do_softirq(); | 752 | local_irq_disable(); |
| 753 | if (local_softirq_pending()) | ||
| 754 | __do_softirq(); | ||
| 755 | local_irq_enable(); | ||
| 742 | preempt_enable_no_resched(); | 756 | preempt_enable_no_resched(); |
| 743 | cond_resched(); | 757 | cond_resched(); |
| 744 | preempt_disable(); | 758 | preempt_disable(); |
| @@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
| 831 | switch (action) { | 845 | switch (action) { |
| 832 | case CPU_UP_PREPARE: | 846 | case CPU_UP_PREPARE: |
| 833 | case CPU_UP_PREPARE_FROZEN: | 847 | case CPU_UP_PREPARE_FROZEN: |
| 834 | p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); | 848 | p = kthread_create_on_node(run_ksoftirqd, |
| 849 | hcpu, | ||
| 850 | cpu_to_node(hotcpu), | ||
| 851 | "ksoftirqd/%d", hotcpu); | ||
| 835 | if (IS_ERR(p)) { | 852 | if (IS_ERR(p)) { |
| 836 | printk("ksoftirqd for %i failed\n", hotcpu); | 853 | printk("ksoftirqd for %i failed\n", hotcpu); |
| 837 | return notifier_from_errno(PTR_ERR(p)); | 854 | return notifier_from_errno(PTR_ERR(p)); |
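ksoftirqd (and the cpu_stopper thread just below) are now created with kthread_create_on_node() so that the thread's stack can be allocated on the memory node of the CPU it will serve. A minimal kernel-style sketch of the same pattern; the function and thread names are illustrative, not from this patch:

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/topology.h>

    static int example_thread_fn(void *data)
    {
            /* per-cpu worker body would go here */
            return 0;
    }

    static struct task_struct *spawn_per_cpu_worker(int cpu)
    {
            struct task_struct *p;

            /* allocate the thread's stack on the CPU's own memory node */
            p = kthread_create_on_node(example_thread_fn, NULL,
                                       cpu_to_node(cpu), "example/%d", cpu);
            if (!IS_ERR(p))
                    kthread_bind(p, cpu);   /* pin it, as ksoftirqd is pinned */
            return p;
    }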
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03beb..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
| 301 | case CPU_UP_PREPARE: | 301 | case CPU_UP_PREPARE: |
| 302 | BUG_ON(stopper->thread || stopper->enabled || | 302 | BUG_ON(stopper->thread || stopper->enabled || |
| 303 | !list_empty(&stopper->works)); | 303 | !list_empty(&stopper->works)); |
| 304 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", | 304 | p = kthread_create_on_node(cpu_stopper_thread, |
| 305 | cpu); | 305 | stopper, |
| 306 | cpu_to_node(cpu), | ||
| 307 | "migration/%d", cpu); | ||
| 306 | if (IS_ERR(p)) | 308 | if (IS_ERR(p)) |
| 307 | return notifier_from_errno(PTR_ERR(p)); | 309 | return notifier_from_errno(PTR_ERR(p)); |
| 308 | get_task_struct(p); | 310 | get_task_struct(p); |
diff --git a/kernel/sys.c b/kernel/sys.c index 18da702ec813..af468edf096a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
| 38 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
| 39 | #include <linux/gfp.h> | 39 | #include <linux/gfp.h> |
| 40 | #include <linux/syscore_ops.h> | ||
| 40 | 41 | ||
| 41 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
| 42 | #include <linux/syscalls.h> | 43 | #include <linux/syscalls.h> |
| @@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid); | |||
| 119 | void (*pm_power_off_prepare)(void); | 120 | void (*pm_power_off_prepare)(void); |
| 120 | 121 | ||
| 121 | /* | 122 | /* |
| 123 | * Returns true if current's euid is same as p's uid or euid, | ||
| 124 | * or has CAP_SYS_NICE to p's user_ns. | ||
| 125 | * | ||
| 126 | * Called with rcu_read_lock, creds are safe | ||
| 127 | */ | ||
| 128 | static bool set_one_prio_perm(struct task_struct *p) | ||
| 129 | { | ||
| 130 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | ||
| 131 | |||
| 132 | if (pcred->user->user_ns == cred->user->user_ns && | ||
| 133 | (pcred->uid == cred->euid || | ||
| 134 | pcred->euid == cred->euid)) | ||
| 135 | return true; | ||
| 136 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | ||
| 137 | return true; | ||
| 138 | return false; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 122 | * set the priority of a task | 142 | * set the priority of a task |
| 123 | * - the caller must hold the RCU read lock | 143 | * - the caller must hold the RCU read lock |
| 124 | */ | 144 | */ |
| 125 | static int set_one_prio(struct task_struct *p, int niceval, int error) | 145 | static int set_one_prio(struct task_struct *p, int niceval, int error) |
| 126 | { | 146 | { |
| 127 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | ||
| 128 | int no_nice; | 147 | int no_nice; |
| 129 | 148 | ||
| 130 | if (pcred->uid != cred->euid && | 149 | if (!set_one_prio_perm(p)) { |
| 131 | pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { | ||
| 132 | error = -EPERM; | 150 | error = -EPERM; |
| 133 | goto out; | 151 | goto out; |
| 134 | } | 152 | } |
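set_one_prio_perm() follows the same refactoring pattern used throughout this file: the uid comparison only counts when both tasks share a user namespace, and the capability fallback becomes ns_capable()/nsown_capable() against the target's namespace rather than a global capable(). A standalone sketch of the general shape of these checks; the names and types are illustrative, not kernel API:

    #include <stdbool.h>

    /* 1) same user namespace and a matching id            -> allow
     * 2) the relevant capability in the *target's* ns     -> allow
     * 3) otherwise                                         -> deny */
    static bool may_act_on(unsigned long my_ns, unsigned int my_euid,
                           unsigned long target_ns, unsigned int target_uid,
                           bool capable_in_target_ns)
    {
            if (my_ns == target_ns && my_euid == target_uid)
                    return true;
            return capable_in_target_ns;
    }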
| @@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd) | |||
| 298 | system_state = SYSTEM_RESTART; | 316 | system_state = SYSTEM_RESTART; |
| 299 | device_shutdown(); | 317 | device_shutdown(); |
| 300 | sysdev_shutdown(); | 318 | sysdev_shutdown(); |
| 319 | syscore_shutdown(); | ||
| 301 | } | 320 | } |
| 302 | 321 | ||
| 303 | /** | 322 | /** |
| @@ -336,6 +355,7 @@ void kernel_halt(void) | |||
| 336 | { | 355 | { |
| 337 | kernel_shutdown_prepare(SYSTEM_HALT); | 356 | kernel_shutdown_prepare(SYSTEM_HALT); |
| 338 | sysdev_shutdown(); | 357 | sysdev_shutdown(); |
| 358 | syscore_shutdown(); | ||
| 339 | printk(KERN_EMERG "System halted.\n"); | 359 | printk(KERN_EMERG "System halted.\n"); |
| 340 | kmsg_dump(KMSG_DUMP_HALT); | 360 | kmsg_dump(KMSG_DUMP_HALT); |
| 341 | machine_halt(); | 361 | machine_halt(); |
| @@ -355,6 +375,7 @@ void kernel_power_off(void) | |||
| 355 | pm_power_off_prepare(); | 375 | pm_power_off_prepare(); |
| 356 | disable_nonboot_cpus(); | 376 | disable_nonboot_cpus(); |
| 357 | sysdev_shutdown(); | 377 | sysdev_shutdown(); |
| 378 | syscore_shutdown(); | ||
| 358 | printk(KERN_EMERG "Power down.\n"); | 379 | printk(KERN_EMERG "Power down.\n"); |
| 359 | kmsg_dump(KMSG_DUMP_POWEROFF); | 380 | kmsg_dump(KMSG_DUMP_POWEROFF); |
| 360 | machine_power_off(); | 381 | machine_power_off(); |
| @@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
| 502 | if (rgid != (gid_t) -1) { | 523 | if (rgid != (gid_t) -1) { |
| 503 | if (old->gid == rgid || | 524 | if (old->gid == rgid || |
| 504 | old->egid == rgid || | 525 | old->egid == rgid || |
| 505 | capable(CAP_SETGID)) | 526 | nsown_capable(CAP_SETGID)) |
| 506 | new->gid = rgid; | 527 | new->gid = rgid; |
| 507 | else | 528 | else |
| 508 | goto error; | 529 | goto error; |
| @@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
| 511 | if (old->gid == egid || | 532 | if (old->gid == egid || |
| 512 | old->egid == egid || | 533 | old->egid == egid || |
| 513 | old->sgid == egid || | 534 | old->sgid == egid || |
| 514 | capable(CAP_SETGID)) | 535 | nsown_capable(CAP_SETGID)) |
| 515 | new->egid = egid; | 536 | new->egid = egid; |
| 516 | else | 537 | else |
| 517 | goto error; | 538 | goto error; |
| @@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
| 546 | old = current_cred(); | 567 | old = current_cred(); |
| 547 | 568 | ||
| 548 | retval = -EPERM; | 569 | retval = -EPERM; |
| 549 | if (capable(CAP_SETGID)) | 570 | if (nsown_capable(CAP_SETGID)) |
| 550 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 571 | new->gid = new->egid = new->sgid = new->fsgid = gid; |
| 551 | else if (gid == old->gid || gid == old->sgid) | 572 | else if (gid == old->gid || gid == old->sgid) |
| 552 | new->egid = new->fsgid = gid; | 573 | new->egid = new->fsgid = gid; |
| @@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
| 613 | new->uid = ruid; | 634 | new->uid = ruid; |
| 614 | if (old->uid != ruid && | 635 | if (old->uid != ruid && |
| 615 | old->euid != ruid && | 636 | old->euid != ruid && |
| 616 | !capable(CAP_SETUID)) | 637 | !nsown_capable(CAP_SETUID)) |
| 617 | goto error; | 638 | goto error; |
| 618 | } | 639 | } |
| 619 | 640 | ||
| @@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
| 622 | if (old->uid != euid && | 643 | if (old->uid != euid && |
| 623 | old->euid != euid && | 644 | old->euid != euid && |
| 624 | old->suid != euid && | 645 | old->suid != euid && |
| 625 | !capable(CAP_SETUID)) | 646 | !nsown_capable(CAP_SETUID)) |
| 626 | goto error; | 647 | goto error; |
| 627 | } | 648 | } |
| 628 | 649 | ||
| @@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
| 670 | old = current_cred(); | 691 | old = current_cred(); |
| 671 | 692 | ||
| 672 | retval = -EPERM; | 693 | retval = -EPERM; |
| 673 | if (capable(CAP_SETUID)) { | 694 | if (nsown_capable(CAP_SETUID)) { |
| 674 | new->suid = new->uid = uid; | 695 | new->suid = new->uid = uid; |
| 675 | if (uid != old->uid) { | 696 | if (uid != old->uid) { |
| 676 | retval = set_user(new); | 697 | retval = set_user(new); |
| @@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
| 712 | old = current_cred(); | 733 | old = current_cred(); |
| 713 | 734 | ||
| 714 | retval = -EPERM; | 735 | retval = -EPERM; |
| 715 | if (!capable(CAP_SETUID)) { | 736 | if (!nsown_capable(CAP_SETUID)) { |
| 716 | if (ruid != (uid_t) -1 && ruid != old->uid && | 737 | if (ruid != (uid_t) -1 && ruid != old->uid && |
| 717 | ruid != old->euid && ruid != old->suid) | 738 | ruid != old->euid && ruid != old->suid) |
| 718 | goto error; | 739 | goto error; |
| @@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
| 776 | old = current_cred(); | 797 | old = current_cred(); |
| 777 | 798 | ||
| 778 | retval = -EPERM; | 799 | retval = -EPERM; |
| 779 | if (!capable(CAP_SETGID)) { | 800 | if (!nsown_capable(CAP_SETGID)) { |
| 780 | if (rgid != (gid_t) -1 && rgid != old->gid && | 801 | if (rgid != (gid_t) -1 && rgid != old->gid && |
| 781 | rgid != old->egid && rgid != old->sgid) | 802 | rgid != old->egid && rgid != old->sgid) |
| 782 | goto error; | 803 | goto error; |
| @@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
| 836 | 857 | ||
| 837 | if (uid == old->uid || uid == old->euid || | 858 | if (uid == old->uid || uid == old->euid || |
| 838 | uid == old->suid || uid == old->fsuid || | 859 | uid == old->suid || uid == old->fsuid || |
| 839 | capable(CAP_SETUID)) { | 860 | nsown_capable(CAP_SETUID)) { |
| 840 | if (uid != old_fsuid) { | 861 | if (uid != old_fsuid) { |
| 841 | new->fsuid = uid; | 862 | new->fsuid = uid; |
| 842 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 863 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
| @@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
| 869 | 890 | ||
| 870 | if (gid == old->gid || gid == old->egid || | 891 | if (gid == old->gid || gid == old->egid || |
| 871 | gid == old->sgid || gid == old->fsgid || | 892 | gid == old->sgid || gid == old->fsgid || |
| 872 | capable(CAP_SETGID)) { | 893 | nsown_capable(CAP_SETGID)) { |
| 873 | if (gid != old_fsgid) { | 894 | if (gid != old_fsgid) { |
| 874 | new->fsgid = gid; | 895 | new->fsgid = gid; |
| 875 | goto change_okay; | 896 | goto change_okay; |
| @@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
| 1177 | int errno; | 1198 | int errno; |
| 1178 | char tmp[__NEW_UTS_LEN]; | 1199 | char tmp[__NEW_UTS_LEN]; |
| 1179 | 1200 | ||
| 1180 | if (!capable(CAP_SYS_ADMIN)) | 1201 | if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) |
| 1181 | return -EPERM; | 1202 | return -EPERM; |
| 1203 | |||
| 1182 | if (len < 0 || len > __NEW_UTS_LEN) | 1204 | if (len < 0 || len > __NEW_UTS_LEN) |
| 1183 | return -EINVAL; | 1205 | return -EINVAL; |
| 1184 | down_write(&uts_sem); | 1206 | down_write(&uts_sem); |
| @@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
| 1226 | int errno; | 1248 | int errno; |
| 1227 | char tmp[__NEW_UTS_LEN]; | 1249 | char tmp[__NEW_UTS_LEN]; |
| 1228 | 1250 | ||
| 1229 | if (!capable(CAP_SYS_ADMIN)) | 1251 | if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) |
| 1230 | return -EPERM; | 1252 | return -EPERM; |
| 1231 | if (len < 0 || len > __NEW_UTS_LEN) | 1253 | if (len < 0 || len > __NEW_UTS_LEN) |
| 1232 | return -EINVAL; | 1254 | return -EINVAL; |
| @@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, | |||
| 1341 | rlim = tsk->signal->rlim + resource; | 1363 | rlim = tsk->signal->rlim + resource; |
| 1342 | task_lock(tsk->group_leader); | 1364 | task_lock(tsk->group_leader); |
| 1343 | if (new_rlim) { | 1365 | if (new_rlim) { |
| 1366 | /* Keep the capable check against init_user_ns until | ||
| 1367 | cgroups can contain all limits */ | ||
| 1344 | if (new_rlim->rlim_max > rlim->rlim_max && | 1368 | if (new_rlim->rlim_max > rlim->rlim_max && |
| 1345 | !capable(CAP_SYS_RESOURCE)) | 1369 | !capable(CAP_SYS_RESOURCE)) |
| 1346 | retval = -EPERM; | 1370 | retval = -EPERM; |
| @@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) | |||
| 1384 | { | 1408 | { |
| 1385 | const struct cred *cred = current_cred(), *tcred; | 1409 | const struct cred *cred = current_cred(), *tcred; |
| 1386 | 1410 | ||
| 1387 | tcred = __task_cred(task); | 1411 | if (current == task) |
| 1388 | if (current != task && | 1412 | return 0; |
| 1389 | (cred->uid != tcred->euid || | ||
| 1390 | cred->uid != tcred->suid || | ||
| 1391 | cred->uid != tcred->uid || | ||
| 1392 | cred->gid != tcred->egid || | ||
| 1393 | cred->gid != tcred->sgid || | ||
| 1394 | cred->gid != tcred->gid) && | ||
| 1395 | !capable(CAP_SYS_RESOURCE)) { | ||
| 1396 | return -EPERM; | ||
| 1397 | } | ||
| 1398 | 1413 | ||
| 1399 | return 0; | 1414 | tcred = __task_cred(task); |
| 1415 | if (cred->user->user_ns == tcred->user->user_ns && | ||
| 1416 | (cred->uid == tcred->euid && | ||
| 1417 | cred->uid == tcred->suid && | ||
| 1418 | cred->uid == tcred->uid && | ||
| 1419 | cred->gid == tcred->egid && | ||
| 1420 | cred->gid == tcred->sgid && | ||
| 1421 | cred->gid == tcred->gid)) | ||
| 1422 | return 0; | ||
| 1423 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | ||
| 1424 | return 0; | ||
| 1425 | |||
| 1426 | return -EPERM; | ||
| 1400 | } | 1427 | } |
| 1401 | 1428 | ||
| 1402 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | 1429 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, |
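check_prlimit_permission() is rewritten from a negated inequality chain into the same namespace-aware form: a full uid/gid match within one user namespace, or CAP_SYS_RESOURCE in the target's namespace. A hypothetical userspace probe, not part of the patch, that runs into this check when inspecting another process's limits:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/resource.h>
    #include <sys/types.h>

    int main(int argc, char **argv)
    {
            pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : 1;   /* e.g. init */
            struct rlimit old;

            if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) == 0)
                    printf("pid %d RLIMIT_NOFILE: cur=%llu max=%llu\n", (int)pid,
                           (unsigned long long)old.rlim_cur,
                           (unsigned long long)old.rlim_max);
            else
                    perror("prlimit");   /* EPERM when the credential check fails */
            return 0;
    }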
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..25cc41cd8f33 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open); | |||
| 186 | /* fanotify! */ | 186 | /* fanotify! */ |
| 187 | cond_syscall(sys_fanotify_init); | 187 | cond_syscall(sys_fanotify_init); |
| 188 | cond_syscall(sys_fanotify_mark); | 188 | cond_syscall(sys_fanotify_mark); |
| 189 | |||
| 190 | /* open by handle */ | ||
| 191 | cond_syscall(sys_name_to_handle_at); | ||
| 192 | cond_syscall(sys_open_by_handle_at); | ||
| 193 | cond_syscall(compat_sys_open_by_handle_at); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..c0bb32414b17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -117,6 +117,7 @@ static int neg_one = -1; | |||
| 117 | static int zero; | 117 | static int zero; |
| 118 | static int __maybe_unused one = 1; | 118 | static int __maybe_unused one = 1; |
| 119 | static int __maybe_unused two = 2; | 119 | static int __maybe_unused two = 2; |
| 120 | static int __maybe_unused three = 3; | ||
| 120 | static unsigned long one_ul = 1; | 121 | static unsigned long one_ul = 1; |
| 121 | static int one_hundred = 100; | 122 | static int one_hundred = 100; |
| 122 | #ifdef CONFIG_PRINTK | 123 | #ifdef CONFIG_PRINTK |
| @@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, | |||
| 169 | void __user *buffer, size_t *lenp, loff_t *ppos); | 170 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 170 | #endif | 171 | #endif |
| 171 | 172 | ||
| 173 | #ifdef CONFIG_PRINTK | ||
| 174 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | ||
| 175 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
| 176 | #endif | ||
| 177 | |||
| 172 | #ifdef CONFIG_MAGIC_SYSRQ | 178 | #ifdef CONFIG_MAGIC_SYSRQ |
| 173 | /* Note: sysrq code uses it's own private copy */ | 179 | /* Note: sysrq code uses it's own private copy */ |
| 174 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | 180 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; |
| @@ -194,9 +200,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, | |||
| 194 | static struct ctl_table root_table[]; | 200 | static struct ctl_table root_table[]; |
| 195 | static struct ctl_table_root sysctl_table_root; | 201 | static struct ctl_table_root sysctl_table_root; |
| 196 | static struct ctl_table_header root_table_header = { | 202 | static struct ctl_table_header root_table_header = { |
| 197 | .count = 1, | 203 | {{.count = 1, |
| 198 | .ctl_table = root_table, | 204 | .ctl_table = root_table, |
| 199 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), | 205 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, |
| 200 | .root = &sysctl_table_root, | 206 | .root = &sysctl_table_root, |
| 201 | .set = &sysctl_table_root.default_set, | 207 | .set = &sysctl_table_root.default_set, |
| 202 | }; | 208 | }; |
| @@ -361,20 +367,13 @@ static struct ctl_table kern_table[] = { | |||
| 361 | .mode = 0644, | 367 | .mode = 0644, |
| 362 | .proc_handler = sched_rt_handler, | 368 | .proc_handler = sched_rt_handler, |
| 363 | }, | 369 | }, |
| 364 | { | ||
| 365 | .procname = "sched_compat_yield", | ||
| 366 | .data = &sysctl_sched_compat_yield, | ||
| 367 | .maxlen = sizeof(unsigned int), | ||
| 368 | .mode = 0644, | ||
| 369 | .proc_handler = proc_dointvec, | ||
| 370 | }, | ||
| 371 | #ifdef CONFIG_SCHED_AUTOGROUP | 370 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 372 | { | 371 | { |
| 373 | .procname = "sched_autogroup_enabled", | 372 | .procname = "sched_autogroup_enabled", |
| 374 | .data = &sysctl_sched_autogroup_enabled, | 373 | .data = &sysctl_sched_autogroup_enabled, |
| 375 | .maxlen = sizeof(unsigned int), | 374 | .maxlen = sizeof(unsigned int), |
| 376 | .mode = 0644, | 375 | .mode = 0644, |
| 377 | .proc_handler = proc_dointvec, | 376 | .proc_handler = proc_dointvec_minmax, |
| 378 | .extra1 = &zero, | 377 | .extra1 = &zero, |
| 379 | .extra2 = &one, | 378 | .extra2 = &one, |
| 380 | }, | 379 | }, |
| @@ -713,7 +712,7 @@ static struct ctl_table kern_table[] = { | |||
| 713 | .data = &kptr_restrict, | 712 | .data = &kptr_restrict, |
| 714 | .maxlen = sizeof(int), | 713 | .maxlen = sizeof(int), |
| 715 | .mode = 0644, | 714 | .mode = 0644, |
| 716 | .proc_handler = proc_dointvec_minmax, | 715 | .proc_handler = proc_dmesg_restrict, |
| 717 | .extra1 = &zero, | 716 | .extra1 = &zero, |
| 718 | .extra2 = &two, | 717 | .extra2 = &two, |
| 719 | }, | 718 | }, |
| @@ -948,7 +947,7 @@ static struct ctl_table kern_table[] = { | |||
| 948 | .data = &sysctl_perf_event_sample_rate, | 947 | .data = &sysctl_perf_event_sample_rate, |
| 949 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 948 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
| 950 | .mode = 0644, | 949 | .mode = 0644, |
| 951 | .proc_handler = proc_dointvec, | 950 | .proc_handler = perf_proc_update_handler, |
| 952 | }, | 951 | }, |
| 953 | #endif | 952 | #endif |
| 954 | #ifdef CONFIG_KMEMCHECK | 953 | #ifdef CONFIG_KMEMCHECK |
| @@ -978,14 +977,18 @@ static struct ctl_table vm_table[] = { | |||
| 978 | .data = &sysctl_overcommit_memory, | 977 | .data = &sysctl_overcommit_memory, |
| 979 | .maxlen = sizeof(sysctl_overcommit_memory), | 978 | .maxlen = sizeof(sysctl_overcommit_memory), |
| 980 | .mode = 0644, | 979 | .mode = 0644, |
| 981 | .proc_handler = proc_dointvec, | 980 | .proc_handler = proc_dointvec_minmax, |
| 981 | .extra1 = &zero, | ||
| 982 | .extra2 = &two, | ||
| 982 | }, | 983 | }, |
| 983 | { | 984 | { |
| 984 | .procname = "panic_on_oom", | 985 | .procname = "panic_on_oom", |
| 985 | .data = &sysctl_panic_on_oom, | 986 | .data = &sysctl_panic_on_oom, |
| 986 | .maxlen = sizeof(sysctl_panic_on_oom), | 987 | .maxlen = sizeof(sysctl_panic_on_oom), |
| 987 | .mode = 0644, | 988 | .mode = 0644, |
| 988 | .proc_handler = proc_dointvec, | 989 | .proc_handler = proc_dointvec_minmax, |
| 990 | .extra1 = &zero, | ||
| 991 | .extra2 = &two, | ||
| 989 | }, | 992 | }, |
| 990 | { | 993 | { |
| 991 | .procname = "oom_kill_allocating_task", | 994 | .procname = "oom_kill_allocating_task", |
| @@ -1013,7 +1016,8 @@ static struct ctl_table vm_table[] = { | |||
| 1013 | .data = &page_cluster, | 1016 | .data = &page_cluster, |
| 1014 | .maxlen = sizeof(int), | 1017 | .maxlen = sizeof(int), |
| 1015 | .mode = 0644, | 1018 | .mode = 0644, |
| 1016 | .proc_handler = proc_dointvec, | 1019 | .proc_handler = proc_dointvec_minmax, |
| 1020 | .extra1 = &zero, | ||
| 1017 | }, | 1021 | }, |
| 1018 | { | 1022 | { |
| 1019 | .procname = "dirty_background_ratio", | 1023 | .procname = "dirty_background_ratio", |
| @@ -1061,7 +1065,8 @@ static struct ctl_table vm_table[] = { | |||
| 1061 | .data = &dirty_expire_interval, | 1065 | .data = &dirty_expire_interval, |
| 1062 | .maxlen = sizeof(dirty_expire_interval), | 1066 | .maxlen = sizeof(dirty_expire_interval), |
| 1063 | .mode = 0644, | 1067 | .mode = 0644, |
| 1064 | .proc_handler = proc_dointvec, | 1068 | .proc_handler = proc_dointvec_minmax, |
| 1069 | .extra1 = &zero, | ||
| 1065 | }, | 1070 | }, |
| 1066 | { | 1071 | { |
| 1067 | .procname = "nr_pdflush_threads", | 1072 | .procname = "nr_pdflush_threads", |
| @@ -1137,6 +1142,8 @@ static struct ctl_table vm_table[] = { | |||
| 1137 | .maxlen = sizeof(int), | 1142 | .maxlen = sizeof(int), |
| 1138 | .mode = 0644, | 1143 | .mode = 0644, |
| 1139 | .proc_handler = drop_caches_sysctl_handler, | 1144 | .proc_handler = drop_caches_sysctl_handler, |
| 1145 | .extra1 = &one, | ||
| 1146 | .extra2 = &three, | ||
| 1140 | }, | 1147 | }, |
| 1141 | #ifdef CONFIG_COMPACTION | 1148 | #ifdef CONFIG_COMPACTION |
| 1142 | { | 1149 | { |
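Several vm_ and kern_ sysctls above switch from proc_dointvec to proc_dointvec_minmax with extra1/extra2 bounds (overcommit_memory and panic_on_oom to [0,2], drop_caches to [1,3], page_cluster and dirty_expire_interval to non-negative), so out-of-range writes are rejected instead of silently stored. A generic ctl_table entry showing the pattern; the knob name and bounds below are made up for illustration:

    #include <linux/sysctl.h>

    static int example_min = 0;
    static int example_max = 3;
    static int example_value;

    static struct ctl_table example_table[] = {
            {
                    .procname     = "example_knob",      /* hypothetical */
                    .data         = &example_value,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_dointvec_minmax,
                    .extra1       = &example_min,        /* inclusive lower bound */
                    .extra2       = &example_max,        /* inclusive upper bound */
            },
            { }
    };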
| @@ -1567,11 +1574,16 @@ void sysctl_head_get(struct ctl_table_header *head) | |||
| 1567 | spin_unlock(&sysctl_lock); | 1574 | spin_unlock(&sysctl_lock); |
| 1568 | } | 1575 | } |
| 1569 | 1576 | ||
| 1577 | static void free_head(struct rcu_head *rcu) | ||
| 1578 | { | ||
| 1579 | kfree(container_of(rcu, struct ctl_table_header, rcu)); | ||
| 1580 | } | ||
| 1581 | |||
| 1570 | void sysctl_head_put(struct ctl_table_header *head) | 1582 | void sysctl_head_put(struct ctl_table_header *head) |
| 1571 | { | 1583 | { |
| 1572 | spin_lock(&sysctl_lock); | 1584 | spin_lock(&sysctl_lock); |
| 1573 | if (!--head->count) | 1585 | if (!--head->count) |
| 1574 | kfree(head); | 1586 | call_rcu(&head->rcu, free_head); |
| 1575 | spin_unlock(&sysctl_lock); | 1587 | spin_unlock(&sysctl_lock); |
| 1576 | } | 1588 | } |
| 1577 | 1589 | ||
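sysctl_head_put() (and unregister_sysctl_table() below) now free ctl_table_header objects through call_rcu() instead of an immediate kfree(), so lockless readers still holding an RCU reference cannot trip over a freed header. The generic shape of this deferred-free idiom, in kernel style — simplified, not this file's code:

    #include <linux/kernel.h>
    #include <linux/rculist.h>
    #include <linux/slab.h>

    struct obj {
            struct list_head list;
            struct rcu_head  rcu;
    };

    static void obj_free_rcu(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct obj, rcu));
    }

    static void obj_release(struct obj *o)
    {
            list_del_rcu(&o->list);           /* unlink; RCU readers may still see it */
            call_rcu(&o->rcu, obj_free_rcu);  /* actual kfree() after a grace period */
    }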
| @@ -1685,13 +1697,8 @@ static int test_perm(int mode, int op) | |||
| 1685 | 1697 | ||
| 1686 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) | 1698 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) |
| 1687 | { | 1699 | { |
| 1688 | int error; | ||
| 1689 | int mode; | 1700 | int mode; |
| 1690 | 1701 | ||
| 1691 | error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); | ||
| 1692 | if (error) | ||
| 1693 | return error; | ||
| 1694 | |||
| 1695 | if (root->permissions) | 1702 | if (root->permissions) |
| 1696 | mode = root->permissions(root, current->nsproxy, table); | 1703 | mode = root->permissions(root, current->nsproxy, table); |
| 1697 | else | 1704 | else |
| @@ -1948,10 +1955,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
| 1948 | start_unregistering(header); | 1955 | start_unregistering(header); |
| 1949 | if (!--header->parent->count) { | 1956 | if (!--header->parent->count) { |
| 1950 | WARN_ON(1); | 1957 | WARN_ON(1); |
| 1951 | kfree(header->parent); | 1958 | call_rcu(&header->parent->rcu, free_head); |
| 1952 | } | 1959 | } |
| 1953 | if (!--header->count) | 1960 | if (!--header->count) |
| 1954 | kfree(header); | 1961 | call_rcu(&header->rcu, free_head); |
| 1955 | spin_unlock(&sysctl_lock); | 1962 | spin_unlock(&sysctl_lock); |
| 1956 | } | 1963 | } |
| 1957 | 1964 | ||
| @@ -2392,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, | |||
| 2392 | return err; | 2399 | return err; |
| 2393 | } | 2400 | } |
| 2394 | 2401 | ||
| 2402 | #ifdef CONFIG_PRINTK | ||
| 2403 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | ||
| 2404 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2405 | { | ||
| 2406 | if (write && !capable(CAP_SYS_ADMIN)) | ||
| 2407 | return -EPERM; | ||
| 2408 | |||
| 2409 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 2410 | } | ||
| 2411 | #endif | ||
| 2412 | |||
| 2395 | struct do_proc_dointvec_minmax_conv_param { | 2413 | struct do_proc_dointvec_minmax_conv_param { |
| 2396 | int *min; | 2414 | int *min; |
| 2397 | int *max; | 2415 | int *max; |
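Several vm_table entries above move from proc_dointvec to proc_dointvec_minmax and gain extra1/extra2 bounds (drop_caches, for example, is now clamped to the range 1..3), and proc_dmesg_restrict adds a CAP_SYS_ADMIN check on writes. A minimal sketch of a bounded entry of this kind, with hypothetical names for the variable and table (registered with register_sysctl_table() in the usual way):

/* Hypothetical sysctl entry clamped to [0, 2] via extra1/extra2. */
static int example_knob;
static int example_min;		/* 0 */
static int example_max = 2;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,	/* writes below this return -EINVAL */
		.extra2		= &example_max,	/* writes above this return -EINVAL */
	},
	{ }
};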
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9a..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1321 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1321 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
| 1322 | { | 1322 | { |
| 1323 | const struct bin_table *table = NULL; | 1323 | const struct bin_table *table = NULL; |
| 1324 | struct nameidata nd; | ||
| 1325 | struct vfsmount *mnt; | 1324 | struct vfsmount *mnt; |
| 1326 | struct file *file; | 1325 | struct file *file; |
| 1327 | ssize_t result; | 1326 | ssize_t result; |
| 1328 | char *pathname; | 1327 | char *pathname; |
| 1329 | int flags; | 1328 | int flags; |
| 1330 | int acc_mode; | ||
| 1331 | 1329 | ||
| 1332 | pathname = sysctl_getname(name, nlen, &table); | 1330 | pathname = sysctl_getname(name, nlen, &table); |
| 1333 | result = PTR_ERR(pathname); | 1331 | result = PTR_ERR(pathname); |
| @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1337 | /* How should the sysctl be accessed? */ | 1335 | /* How should the sysctl be accessed? */ |
| 1338 | if (oldval && oldlen && newval && newlen) { | 1336 | if (oldval && oldlen && newval && newlen) { |
| 1339 | flags = O_RDWR; | 1337 | flags = O_RDWR; |
| 1340 | acc_mode = MAY_READ | MAY_WRITE; | ||
| 1341 | } else if (newval && newlen) { | 1338 | } else if (newval && newlen) { |
| 1342 | flags = O_WRONLY; | 1339 | flags = O_WRONLY; |
| 1343 | acc_mode = MAY_WRITE; | ||
| 1344 | } else if (oldval && oldlen) { | 1340 | } else if (oldval && oldlen) { |
| 1345 | flags = O_RDONLY; | 1341 | flags = O_RDONLY; |
| 1346 | acc_mode = MAY_READ; | ||
| 1347 | } else { | 1342 | } else { |
| 1348 | result = 0; | 1343 | result = 0; |
| 1349 | goto out_putname; | 1344 | goto out_putname; |
| 1350 | } | 1345 | } |
| 1351 | 1346 | ||
| 1352 | mnt = current->nsproxy->pid_ns->proc_mnt; | 1347 | mnt = current->nsproxy->pid_ns->proc_mnt; |
| 1353 | result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); | 1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); |
| 1354 | if (result) | ||
| 1355 | goto out_putname; | ||
| 1356 | |||
| 1357 | result = may_open(&nd.path, acc_mode, flags); | ||
| 1358 | if (result) | ||
| 1359 | goto out_putpath; | ||
| 1360 | |||
| 1361 | file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); | ||
| 1362 | result = PTR_ERR(file); | 1349 | result = PTR_ERR(file); |
| 1363 | if (IS_ERR(file)) | 1350 | if (IS_ERR(file)) |
| 1364 | goto out_putname; | 1351 | goto out_putname; |
| @@ -1370,10 +1357,6 @@ out_putname: | |||
| 1370 | putname(pathname); | 1357 | putname(pathname); |
| 1371 | out: | 1358 | out: |
| 1372 | return result; | 1359 | return result; |
| 1373 | |||
| 1374 | out_putpath: | ||
| 1375 | path_put(&nd.path); | ||
| 1376 | goto out_putname; | ||
| 1377 | } | 1360 | } |
| 1378 | 1361 | ||
| 1379 | 1362 | ||
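binary_sysctl() above collapses the open-coded vfs_path_lookup() + may_open() + dentry_open() sequence into a single file_open_root() call, which does the lookup and permission check relative to a given root dentry and mount. A hedged sketch of that usage, as a hypothetical helper with error handling trimmed:

/*
 * Hypothetical helper mirroring the pattern above: open a path relative
 * to a procfs mount with file_open_root() and read from it. Sketch only,
 * not intended to build stand-alone.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>

static ssize_t read_proc_relative(struct vfsmount *mnt, const char *path,
				  char __user *buf, size_t len)
{
	struct file *file;
	loff_t pos = 0;
	ssize_t ret;

	file = file_open_root(mnt->mnt_root, mnt, path, O_RDONLY);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ret = vfs_read(file, buf, len, &pos);
	fput(file);
	return ret;
}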
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c4..4e4932a7b360 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
| @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
| 111 | const char *fail = NULL; | 111 | const char *fail = NULL; |
| 112 | 112 | ||
| 113 | if (table->parent) { | 113 | if (table->parent) { |
| 114 | if (table->procname && !table->parent->procname) | 114 | if (!table->parent->procname) |
| 115 | set_fail(&fail, table, "Parent without procname"); | 115 | set_fail(&fail, table, "Parent without procname"); |
| 116 | } | 116 | } |
| 117 | if (!table->procname) | ||
| 118 | set_fail(&fail, table, "No procname"); | ||
| 119 | if (table->child) { | 117 | if (table->child) { |
| 120 | if (table->data) | 118 | if (table->data) |
| 121 | set_fail(&fail, table, "Directory with data?"); | 119 | set_fail(&fail, table, "Directory with data?"); |
| @@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
| 144 | set_fail(&fail, table, "No maxlen"); | 142 | set_fail(&fail, table, "No maxlen"); |
| 145 | } | 143 | } |
| 146 | #ifdef CONFIG_PROC_SYSCTL | 144 | #ifdef CONFIG_PROC_SYSCTL |
| 147 | if (table->procname && !table->proc_handler) | 145 | if (!table->proc_handler) |
| 148 | set_fail(&fail, table, "No proc_handler"); | 146 | set_fail(&fail, table, "No proc_handler"); |
| 149 | #endif | 147 | #endif |
| 150 | #if 0 | ||
| 151 | if (!table->procname && table->proc_handler) | ||
| 152 | set_fail(&fail, table, "proc_handler without procname"); | ||
| 153 | #endif | ||
| 154 | sysctl_check_leaf(namespaces, table, &fail); | 148 | sysctl_check_leaf(namespaces, table, &fail); |
| 155 | } | 149 | } |
| 156 | if (table->mode > 0777) | 150 | if (table->mode > 0777) |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b9d58d..9ffea360a778 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -685,7 +685,7 @@ static int __init taskstats_init(void) | |||
| 685 | goto err_cgroup_ops; | 685 | goto err_cgroup_ops; |
| 686 | 686 | ||
| 687 | family_registered = 1; | 687 | family_registered = 1; |
| 688 | printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); | 688 | pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); |
| 689 | return 0; | 689 | return 0; |
| 690 | err_cgroup_ops: | 690 | err_cgroup_ops: |
| 691 | genl_unregister_ops(&family, &taskstats_ops); | 691 | genl_unregister_ops(&family, &taskstats_ops); |
diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..8e8dc6d705c9 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -150,7 +150,7 @@ static inline void warp_clock(void) | |||
| 150 | * various programs will get confused when the clock gets warped. | 150 | * various programs will get confused when the clock gets warped. |
| 151 | */ | 151 | */ |
| 152 | 152 | ||
| 153 | int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | 153 | int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) |
| 154 | { | 154 | { |
| 155 | static int firsttime = 1; | 155 | static int firsttime = 1; |
| 156 | int error = 0; | 156 | int error = 0; |
| @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) | |||
| 645 | } | 645 | } |
| 646 | 646 | ||
| 647 | /** | 647 | /** |
| 648 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 648 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 |
| 649 | * | 649 | * |
| 650 | * @n: nsecs in u64 | 650 | * @n: nsecs in u64 |
| 651 | * | 651 | * |
| @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) | |||
| 657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | 657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) |
| 658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | 658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years |
| 659 | */ | 659 | */ |
| 660 | unsigned long nsecs_to_jiffies(u64 n) | 660 | u64 nsecs_to_jiffies64(u64 n) |
| 661 | { | 661 | { |
| 662 | #if (NSEC_PER_SEC % HZ) == 0 | 662 | #if (NSEC_PER_SEC % HZ) == 0 |
| 663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ | 663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ |
| @@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) | |||
| 674 | #endif | 674 | #endif |
| 675 | } | 675 | } |
| 676 | 676 | ||
| 677 | #if (BITS_PER_LONG < 64) | 677 | /** |
| 678 | u64 get_jiffies_64(void) | 678 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies |
| 679 | * | ||
| 680 | * @n: nsecs in u64 | ||
| 681 | * | ||
| 682 | * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. | ||
| 683 | * And this doesn't return MAX_JIFFY_OFFSET since this function is designed | ||
| 684 | * for scheduler, not for use in device drivers to calculate timeout value. | ||
| 685 | * | ||
| 686 | * note: | ||
| 687 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | ||
| 688 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | ||
| 689 | */ | ||
| 690 | unsigned long nsecs_to_jiffies(u64 n) | ||
| 679 | { | 691 | { |
| 680 | unsigned long seq; | 692 | return (unsigned long)nsecs_to_jiffies64(n); |
| 681 | u64 ret; | ||
| 682 | |||
| 683 | do { | ||
| 684 | seq = read_seqbegin(&xtime_lock); | ||
| 685 | ret = jiffies_64; | ||
| 686 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 687 | return ret; | ||
| 688 | } | 693 | } |
| 689 | EXPORT_SYMBOL(get_jiffies_64); | ||
| 690 | #endif | ||
| 691 | |||
| 692 | EXPORT_SYMBOL(jiffies); | ||
| 693 | 694 | ||
| 694 | /* | 695 | /* |
| 695 | * Add two timespec values and do a safety check for overflow. | 696 | * Add two timespec values and do a safety check for overflow. |
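nsecs_to_jiffies64() above keeps the three HZ cases from the old nsecs_to_jiffies(); in the common case where NSEC_PER_SEC is a multiple of HZ it is just a 64-bit divide by (NSEC_PER_SEC / HZ). A small userspace check of that arithmetic, assuming HZ=1000 so one jiffy is 1,000,000 ns:

/* Userspace sketch of the common-case branch of nsecs_to_jiffies64(),
 * assuming HZ = 1000 so NSEC_PER_SEC % HZ == 0. */
#include <stdio.h>
#include <stdint.h>

#define HZ		1000ULL
#define NSEC_PER_SEC	1000000000ULL

static uint64_t demo_nsecs_to_jiffies64(uint64_t n)
{
	return n / (NSEC_PER_SEC / HZ);	/* div_u64() in the kernel */
}

int main(void)
{
	/* 2.5 ms -> 2 jiffies, 1 s -> 1000 jiffies at HZ=1000 */
	printf("%llu\n", (unsigned long long)demo_nsecs_to_jiffies64(2500000));
	printf("%llu\n", (unsigned long long)demo_nsecs_to_jiffies64(NSEC_PER_SEC));
	return 0;
}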
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..b0425991e9ac 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
| 2 | obj-y += timeconv.o posix-clock.o | ||
| 2 | 3 | ||
| 3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
| 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..0d74b9ba90c8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
| 19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
| 20 | #include <linux/sysdev.h> | 20 | #include <linux/sysdev.h> |
| 21 | #include <linux/tick.h> | ||
| 22 | 21 | ||
| 23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
| 24 | 23 | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
| @@ -22,8 +22,11 @@ | |||
| 22 | ************************************************************************/ | 22 | ************************************************************************/ |
| 23 | #include <linux/clocksource.h> | 23 | #include <linux/clocksource.h> |
| 24 | #include <linux/jiffies.h> | 24 | #include <linux/jiffies.h> |
| 25 | #include <linux/module.h> | ||
| 25 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 26 | 27 | ||
| 28 | #include "tick-internal.h" | ||
| 29 | |||
| 27 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
| 28 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
| 29 | * all systems. It has the same coarse resolution as | 32 | * all systems. It has the same coarse resolution as |
| @@ -31,7 +34,7 @@ | |||
| 31 | * inaccuracies caused by missed or lost timer | 34 | * inaccuracies caused by missed or lost timer |
| 32 | * interrupts and the inability for the timer | 35 | * interrupts and the inability for the timer |
| 33 | * interrupt hardware to accuratly tick at the | 36 | * interrupt hardware to accuratly tick at the |
| 34 | * requested HZ value. It is also not reccomended | 37 | * requested HZ value. It is also not recommended |
| 35 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
| 36 | */ | 39 | */ |
| 37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | 40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) |
| @@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { | |||
| 64 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
| 65 | }; | 68 | }; |
| 66 | 69 | ||
| 70 | #if (BITS_PER_LONG < 64) | ||
| 71 | u64 get_jiffies_64(void) | ||
| 72 | { | ||
| 73 | unsigned long seq; | ||
| 74 | u64 ret; | ||
| 75 | |||
| 76 | do { | ||
| 77 | seq = read_seqbegin(&xtime_lock); | ||
| 78 | ret = jiffies_64; | ||
| 79 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 80 | return ret; | ||
| 81 | } | ||
| 82 | EXPORT_SYMBOL(get_jiffies_64); | ||
| 83 | #endif | ||
| 84 | |||
| 85 | EXPORT_SYMBOL(jiffies); | ||
| 86 | |||
| 67 | static int __init init_jiffies_clocksource(void) | 87 | static int __init init_jiffies_clocksource(void) |
| 68 | { | 88 | { |
| 69 | return clocksource_register(&clocksource_jiffies); | 89 | return clocksource_register(&clocksource_jiffies); |
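The get_jiffies_64() loop moved into jiffies.c above is the read side of a seqlock: 32-bit CPUs retry the read whenever a writer bumped the sequence, so they never see a torn 64-bit value. A generic sketch of both halves of that pattern, with a hypothetical counter rather than the real xtime_lock users:

/* Generic seqlock pattern behind get_jiffies_64(); hypothetical counter. */
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(counter_lock);
static u64 counter64;

/* Writer: updates are serialized and bump the sequence number. */
static void counter_add(u64 ticks)
{
	write_seqlock(&counter_lock);
	counter64 += ticks;
	write_sequnlock(&counter_lock);
}

/* Reader: retry if a writer ran concurrently. */
static u64 counter_read(void)
{
	unsigned long seq;
	u64 val;

	do {
		seq = read_seqbegin(&counter_lock);
		val = counter64;
	} while (read_seqretry(&counter_lock, seq));

	return val;
}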
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa921..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | 18 | ||
| 19 | #include "tick-internal.h" | ||
| 20 | |||
| 19 | /* | 21 | /* |
| 20 | * NTP timekeeping variables: | 22 | * NTP timekeeping variables: |
| 21 | */ | 23 | */ |
| @@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc) | |||
| 646 | hrtimer_cancel(&leap_timer); | 648 | hrtimer_cancel(&leap_timer); |
| 647 | } | 649 | } |
| 648 | 650 | ||
| 651 | if (txc->modes & ADJ_SETOFFSET) { | ||
| 652 | struct timespec delta; | ||
| 653 | delta.tv_sec = txc->time.tv_sec; | ||
| 654 | delta.tv_nsec = txc->time.tv_usec; | ||
| 655 | if (!capable(CAP_SYS_TIME)) | ||
| 656 | return -EPERM; | ||
| 657 | if (!(txc->modes & ADJ_NANO)) | ||
| 658 | delta.tv_nsec *= 1000; | ||
| 659 | result = timekeeping_inject_offset(&delta); | ||
| 660 | if (result) | ||
| 661 | return result; | ||
| 662 | } | ||
| 663 | |||
| 649 | getnstimeofday(&ts); | 664 | getnstimeofday(&ts); |
| 650 | 665 | ||
| 651 | write_seqlock_irq(&xtime_lock); | 666 | write_seqlock_irq(&xtime_lock); |
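The new ADJ_SETOFFSET branch above lets adjtimex() apply a one-shot offset to the clock via timekeeping_inject_offset(): CAP_SYS_TIME is required, and txc->time is taken as microseconds unless ADJ_NANO is also set. A userspace sketch (must run as root; the fallback define assumes the value added to <linux/timex.h> by this series, in case the installed headers predate it):

/* Userspace sketch: step the system clock forward by 500 ms with the
 * new ADJ_SETOFFSET mode. Requires CAP_SYS_TIME. */
#include <stdio.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
#define ADJ_SETOFFSET 0x0100	/* assumed to match the kernel header */
#endif

int main(void)
{
	struct timex tx = { 0 };

	tx.modes = ADJ_SETOFFSET;	/* without ADJ_NANO, time.tv_usec is in us */
	tx.time.tv_sec = 0;
	tx.time.tv_usec = 500000;	/* +500 ms */

	if (adjtimex(&tx) < 0) {
		perror("adjtimex");
		return 1;
	}
	return 0;
}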
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..c340ca658f37 --- /dev/null +++ b/kernel/time/posix-clock.c | |||
| @@ -0,0 +1,445 @@ | |||
| 1 | /* | ||
| 2 | * posix-clock.c - support for dynamic clock devices | ||
| 3 | * | ||
| 4 | * Copyright (C) 2010 OMICRON electronics GmbH | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 19 | */ | ||
| 20 | #include <linux/device.h> | ||
| 21 | #include <linux/file.h> | ||
| 22 | #include <linux/posix-clock.h> | ||
| 23 | #include <linux/slab.h> | ||
| 24 | #include <linux/syscalls.h> | ||
| 25 | #include <linux/uaccess.h> | ||
| 26 | |||
| 27 | static void delete_clock(struct kref *kref); | ||
| 28 | |||
| 29 | /* | ||
| 30 | * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. | ||
| 31 | */ | ||
| 32 | static struct posix_clock *get_posix_clock(struct file *fp) | ||
| 33 | { | ||
| 34 | struct posix_clock *clk = fp->private_data; | ||
| 35 | |||
| 36 | down_read(&clk->rwsem); | ||
| 37 | |||
| 38 | if (!clk->zombie) | ||
| 39 | return clk; | ||
| 40 | |||
| 41 | up_read(&clk->rwsem); | ||
| 42 | |||
| 43 | return NULL; | ||
| 44 | } | ||
| 45 | |||
| 46 | static void put_posix_clock(struct posix_clock *clk) | ||
| 47 | { | ||
| 48 | up_read(&clk->rwsem); | ||
| 49 | } | ||
| 50 | |||
| 51 | static ssize_t posix_clock_read(struct file *fp, char __user *buf, | ||
| 52 | size_t count, loff_t *ppos) | ||
| 53 | { | ||
| 54 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 55 | int err = -EINVAL; | ||
| 56 | |||
| 57 | if (!clk) | ||
| 58 | return -ENODEV; | ||
| 59 | |||
| 60 | if (clk->ops.read) | ||
| 61 | err = clk->ops.read(clk, fp->f_flags, buf, count); | ||
| 62 | |||
| 63 | put_posix_clock(clk); | ||
| 64 | |||
| 65 | return err; | ||
| 66 | } | ||
| 67 | |||
| 68 | static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) | ||
| 69 | { | ||
| 70 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 71 | int result = 0; | ||
| 72 | |||
| 73 | if (!clk) | ||
| 74 | return -ENODEV; | ||
| 75 | |||
| 76 | if (clk->ops.poll) | ||
| 77 | result = clk->ops.poll(clk, fp, wait); | ||
| 78 | |||
| 79 | put_posix_clock(clk); | ||
| 80 | |||
| 81 | return result; | ||
| 82 | } | ||
| 83 | |||
| 84 | static int posix_clock_fasync(int fd, struct file *fp, int on) | ||
| 85 | { | ||
| 86 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 87 | int err = 0; | ||
| 88 | |||
| 89 | if (!clk) | ||
| 90 | return -ENODEV; | ||
| 91 | |||
| 92 | if (clk->ops.fasync) | ||
| 93 | err = clk->ops.fasync(clk, fd, fp, on); | ||
| 94 | |||
| 95 | put_posix_clock(clk); | ||
| 96 | |||
| 97 | return err; | ||
| 98 | } | ||
| 99 | |||
| 100 | static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) | ||
| 101 | { | ||
| 102 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 103 | int err = -ENODEV; | ||
| 104 | |||
| 105 | if (!clk) | ||
| 106 | return -ENODEV; | ||
| 107 | |||
| 108 | if (clk->ops.mmap) | ||
| 109 | err = clk->ops.mmap(clk, vma); | ||
| 110 | |||
| 111 | put_posix_clock(clk); | ||
| 112 | |||
| 113 | return err; | ||
| 114 | } | ||
| 115 | |||
| 116 | static long posix_clock_ioctl(struct file *fp, | ||
| 117 | unsigned int cmd, unsigned long arg) | ||
| 118 | { | ||
| 119 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 120 | int err = -ENOTTY; | ||
| 121 | |||
| 122 | if (!clk) | ||
| 123 | return -ENODEV; | ||
| 124 | |||
| 125 | if (clk->ops.ioctl) | ||
| 126 | err = clk->ops.ioctl(clk, cmd, arg); | ||
| 127 | |||
| 128 | put_posix_clock(clk); | ||
| 129 | |||
| 130 | return err; | ||
| 131 | } | ||
| 132 | |||
| 133 | #ifdef CONFIG_COMPAT | ||
| 134 | static long posix_clock_compat_ioctl(struct file *fp, | ||
| 135 | unsigned int cmd, unsigned long arg) | ||
| 136 | { | ||
| 137 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 138 | int err = -ENOTTY; | ||
| 139 | |||
| 140 | if (!clk) | ||
| 141 | return -ENODEV; | ||
| 142 | |||
| 143 | if (clk->ops.ioctl) | ||
| 144 | err = clk->ops.ioctl(clk, cmd, arg); | ||
| 145 | |||
| 146 | put_posix_clock(clk); | ||
| 147 | |||
| 148 | return err; | ||
| 149 | } | ||
| 150 | #endif | ||
| 151 | |||
| 152 | static int posix_clock_open(struct inode *inode, struct file *fp) | ||
| 153 | { | ||
| 154 | int err; | ||
| 155 | struct posix_clock *clk = | ||
| 156 | container_of(inode->i_cdev, struct posix_clock, cdev); | ||
| 157 | |||
| 158 | down_read(&clk->rwsem); | ||
| 159 | |||
| 160 | if (clk->zombie) { | ||
| 161 | err = -ENODEV; | ||
| 162 | goto out; | ||
| 163 | } | ||
| 164 | if (clk->ops.open) | ||
| 165 | err = clk->ops.open(clk, fp->f_mode); | ||
| 166 | else | ||
| 167 | err = 0; | ||
| 168 | |||
| 169 | if (!err) { | ||
| 170 | kref_get(&clk->kref); | ||
| 171 | fp->private_data = clk; | ||
| 172 | } | ||
| 173 | out: | ||
| 174 | up_read(&clk->rwsem); | ||
| 175 | return err; | ||
| 176 | } | ||
| 177 | |||
| 178 | static int posix_clock_release(struct inode *inode, struct file *fp) | ||
| 179 | { | ||
| 180 | struct posix_clock *clk = fp->private_data; | ||
| 181 | int err = 0; | ||
| 182 | |||
| 183 | if (clk->ops.release) | ||
| 184 | err = clk->ops.release(clk); | ||
| 185 | |||
| 186 | kref_put(&clk->kref, delete_clock); | ||
| 187 | |||
| 188 | fp->private_data = NULL; | ||
| 189 | |||
| 190 | return err; | ||
| 191 | } | ||
| 192 | |||
| 193 | static const struct file_operations posix_clock_file_operations = { | ||
| 194 | .owner = THIS_MODULE, | ||
| 195 | .llseek = no_llseek, | ||
| 196 | .read = posix_clock_read, | ||
| 197 | .poll = posix_clock_poll, | ||
| 198 | .unlocked_ioctl = posix_clock_ioctl, | ||
| 199 | .open = posix_clock_open, | ||
| 200 | .release = posix_clock_release, | ||
| 201 | .fasync = posix_clock_fasync, | ||
| 202 | .mmap = posix_clock_mmap, | ||
| 203 | #ifdef CONFIG_COMPAT | ||
| 204 | .compat_ioctl = posix_clock_compat_ioctl, | ||
| 205 | #endif | ||
| 206 | }; | ||
| 207 | |||
| 208 | int posix_clock_register(struct posix_clock *clk, dev_t devid) | ||
| 209 | { | ||
| 210 | int err; | ||
| 211 | |||
| 212 | kref_init(&clk->kref); | ||
| 213 | init_rwsem(&clk->rwsem); | ||
| 214 | |||
| 215 | cdev_init(&clk->cdev, &posix_clock_file_operations); | ||
| 216 | clk->cdev.owner = clk->ops.owner; | ||
| 217 | err = cdev_add(&clk->cdev, devid, 1); | ||
| 218 | |||
| 219 | return err; | ||
| 220 | } | ||
| 221 | EXPORT_SYMBOL_GPL(posix_clock_register); | ||
| 222 | |||
| 223 | static void delete_clock(struct kref *kref) | ||
| 224 | { | ||
| 225 | struct posix_clock *clk = container_of(kref, struct posix_clock, kref); | ||
| 226 | |||
| 227 | if (clk->release) | ||
| 228 | clk->release(clk); | ||
| 229 | } | ||
| 230 | |||
| 231 | void posix_clock_unregister(struct posix_clock *clk) | ||
| 232 | { | ||
| 233 | cdev_del(&clk->cdev); | ||
| 234 | |||
| 235 | down_write(&clk->rwsem); | ||
| 236 | clk->zombie = true; | ||
| 237 | up_write(&clk->rwsem); | ||
| 238 | |||
| 239 | kref_put(&clk->kref, delete_clock); | ||
| 240 | } | ||
| 241 | EXPORT_SYMBOL_GPL(posix_clock_unregister); | ||
| 242 | |||
| 243 | struct posix_clock_desc { | ||
| 244 | struct file *fp; | ||
| 245 | struct posix_clock *clk; | ||
| 246 | }; | ||
| 247 | |||
| 248 | static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) | ||
| 249 | { | ||
| 250 | struct file *fp = fget(CLOCKID_TO_FD(id)); | ||
| 251 | int err = -EINVAL; | ||
| 252 | |||
| 253 | if (!fp) | ||
| 254 | return err; | ||
| 255 | |||
| 256 | if (fp->f_op->open != posix_clock_open || !fp->private_data) | ||
| 257 | goto out; | ||
| 258 | |||
| 259 | cd->fp = fp; | ||
| 260 | cd->clk = get_posix_clock(fp); | ||
| 261 | |||
| 262 | err = cd->clk ? 0 : -ENODEV; | ||
| 263 | out: | ||
| 264 | if (err) | ||
| 265 | fput(fp); | ||
| 266 | return err; | ||
| 267 | } | ||
| 268 | |||
| 269 | static void put_clock_desc(struct posix_clock_desc *cd) | ||
| 270 | { | ||
| 271 | put_posix_clock(cd->clk); | ||
| 272 | fput(cd->fp); | ||
| 273 | } | ||
| 274 | |||
| 275 | static int pc_clock_adjtime(clockid_t id, struct timex *tx) | ||
| 276 | { | ||
| 277 | struct posix_clock_desc cd; | ||
| 278 | int err; | ||
| 279 | |||
| 280 | err = get_clock_desc(id, &cd); | ||
| 281 | if (err) | ||
| 282 | return err; | ||
| 283 | |||
| 284 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
| 285 | err = -EACCES; | ||
| 286 | goto out; | ||
| 287 | } | ||
| 288 | |||
| 289 | if (cd.clk->ops.clock_adjtime) | ||
| 290 | err = cd.clk->ops.clock_adjtime(cd.clk, tx); | ||
| 291 | else | ||
| 292 | err = -EOPNOTSUPP; | ||
| 293 | out: | ||
| 294 | put_clock_desc(&cd); | ||
| 295 | |||
| 296 | return err; | ||
| 297 | } | ||
| 298 | |||
| 299 | static int pc_clock_gettime(clockid_t id, struct timespec *ts) | ||
| 300 | { | ||
| 301 | struct posix_clock_desc cd; | ||
| 302 | int err; | ||
| 303 | |||
| 304 | err = get_clock_desc(id, &cd); | ||
| 305 | if (err) | ||
| 306 | return err; | ||
| 307 | |||
| 308 | if (cd.clk->ops.clock_gettime) | ||
| 309 | err = cd.clk->ops.clock_gettime(cd.clk, ts); | ||
| 310 | else | ||
| 311 | err = -EOPNOTSUPP; | ||
| 312 | |||
| 313 | put_clock_desc(&cd); | ||
| 314 | |||
| 315 | return err; | ||
| 316 | } | ||
| 317 | |||
| 318 | static int pc_clock_getres(clockid_t id, struct timespec *ts) | ||
| 319 | { | ||
| 320 | struct posix_clock_desc cd; | ||
| 321 | int err; | ||
| 322 | |||
| 323 | err = get_clock_desc(id, &cd); | ||
| 324 | if (err) | ||
| 325 | return err; | ||
| 326 | |||
| 327 | if (cd.clk->ops.clock_getres) | ||
| 328 | err = cd.clk->ops.clock_getres(cd.clk, ts); | ||
| 329 | else | ||
| 330 | err = -EOPNOTSUPP; | ||
| 331 | |||
| 332 | put_clock_desc(&cd); | ||
| 333 | |||
| 334 | return err; | ||
| 335 | } | ||
| 336 | |||
| 337 | static int pc_clock_settime(clockid_t id, const struct timespec *ts) | ||
| 338 | { | ||
| 339 | struct posix_clock_desc cd; | ||
| 340 | int err; | ||
| 341 | |||
| 342 | err = get_clock_desc(id, &cd); | ||
| 343 | if (err) | ||
| 344 | return err; | ||
| 345 | |||
| 346 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
| 347 | err = -EACCES; | ||
| 348 | goto out; | ||
| 349 | } | ||
| 350 | |||
| 351 | if (cd.clk->ops.clock_settime) | ||
| 352 | err = cd.clk->ops.clock_settime(cd.clk, ts); | ||
| 353 | else | ||
| 354 | err = -EOPNOTSUPP; | ||
| 355 | out: | ||
| 356 | put_clock_desc(&cd); | ||
| 357 | |||
| 358 | return err; | ||
| 359 | } | ||
| 360 | |||
| 361 | static int pc_timer_create(struct k_itimer *kit) | ||
| 362 | { | ||
| 363 | clockid_t id = kit->it_clock; | ||
| 364 | struct posix_clock_desc cd; | ||
| 365 | int err; | ||
| 366 | |||
| 367 | err = get_clock_desc(id, &cd); | ||
| 368 | if (err) | ||
| 369 | return err; | ||
| 370 | |||
| 371 | if (cd.clk->ops.timer_create) | ||
| 372 | err = cd.clk->ops.timer_create(cd.clk, kit); | ||
| 373 | else | ||
| 374 | err = -EOPNOTSUPP; | ||
| 375 | |||
| 376 | put_clock_desc(&cd); | ||
| 377 | |||
| 378 | return err; | ||
| 379 | } | ||
| 380 | |||
| 381 | static int pc_timer_delete(struct k_itimer *kit) | ||
| 382 | { | ||
| 383 | clockid_t id = kit->it_clock; | ||
| 384 | struct posix_clock_desc cd; | ||
| 385 | int err; | ||
| 386 | |||
| 387 | err = get_clock_desc(id, &cd); | ||
| 388 | if (err) | ||
| 389 | return err; | ||
| 390 | |||
| 391 | if (cd.clk->ops.timer_delete) | ||
| 392 | err = cd.clk->ops.timer_delete(cd.clk, kit); | ||
| 393 | else | ||
| 394 | err = -EOPNOTSUPP; | ||
| 395 | |||
| 396 | put_clock_desc(&cd); | ||
| 397 | |||
| 398 | return err; | ||
| 399 | } | ||
| 400 | |||
| 401 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) | ||
| 402 | { | ||
| 403 | clockid_t id = kit->it_clock; | ||
| 404 | struct posix_clock_desc cd; | ||
| 405 | |||
| 406 | if (get_clock_desc(id, &cd)) | ||
| 407 | return; | ||
| 408 | |||
| 409 | if (cd.clk->ops.timer_gettime) | ||
| 410 | cd.clk->ops.timer_gettime(cd.clk, kit, ts); | ||
| 411 | |||
| 412 | put_clock_desc(&cd); | ||
| 413 | } | ||
| 414 | |||
| 415 | static int pc_timer_settime(struct k_itimer *kit, int flags, | ||
| 416 | struct itimerspec *ts, struct itimerspec *old) | ||
| 417 | { | ||
| 418 | clockid_t id = kit->it_clock; | ||
| 419 | struct posix_clock_desc cd; | ||
| 420 | int err; | ||
| 421 | |||
| 422 | err = get_clock_desc(id, &cd); | ||
| 423 | if (err) | ||
| 424 | return err; | ||
| 425 | |||
| 426 | if (cd.clk->ops.timer_settime) | ||
| 427 | err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); | ||
| 428 | else | ||
| 429 | err = -EOPNOTSUPP; | ||
| 430 | |||
| 431 | put_clock_desc(&cd); | ||
| 432 | |||
| 433 | return err; | ||
| 434 | } | ||
| 435 | |||
| 436 | struct k_clock clock_posix_dynamic = { | ||
| 437 | .clock_getres = pc_clock_getres, | ||
| 438 | .clock_set = pc_clock_settime, | ||
| 439 | .clock_get = pc_clock_gettime, | ||
| 440 | .clock_adj = pc_clock_adjtime, | ||
| 441 | .timer_create = pc_timer_create, | ||
| 442 | .timer_set = pc_timer_settime, | ||
| 443 | .timer_del = pc_timer_delete, | ||
| 444 | .timer_get = pc_timer_gettime, | ||
| 445 | }; | ||
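posix-clock.c above gives drivers a way to expose a dynamic clock as a character device: the driver embeds a struct posix_clock, fills in posix_clock_operations, and calls posix_clock_register() with a dev_t. A hedged, minimal provider sketch (hypothetical "myclk" driver, error unwinding and module exit trimmed); userspace would then open the corresponding /dev node and convert the file descriptor into a clockid with the FD_TO_CLOCKID() counterpart of the CLOCKID_TO_FD() macro used in get_clock_desc():

/* Minimal sketch of a dynamic posix clock provider (hypothetical driver). */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static struct posix_clock myclk;
static dev_t myclk_devid;

static int myclk_gettime(struct posix_clock *pc, struct timespec *ts)
{
	getnstimeofday(ts);	/* a real driver would read its own hardware */
	return 0;
}

static int myclk_getres(struct posix_clock *pc, struct timespec *ts)
{
	ts->tv_sec = 0;
	ts->tv_nsec = 1;
	return 0;
}

static struct posix_clock_operations myclk_ops = {
	.owner		= THIS_MODULE,
	.clock_gettime	= myclk_gettime,
	.clock_getres	= myclk_getres,
};

static int __init myclk_init(void)
{
	int err = alloc_chrdev_region(&myclk_devid, 0, 1, "myclk");

	if (err)
		return err;

	myclk.ops = myclk_ops;
	return posix_clock_register(&myclk, myclk_devid);
}
module_init(myclk_init);
MODULE_LICENSE("GPL");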
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..da800ffa810c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/tick.h> | ||
| 22 | 21 | ||
| 23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
| 24 | 23 | ||
| @@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void) | |||
| 600 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; | 599 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; |
| 601 | } | 600 | } |
| 602 | 601 | ||
| 602 | /* | ||
| 603 | * Check whether the broadcast device supports oneshot. | ||
| 604 | */ | ||
| 605 | bool tick_broadcast_oneshot_available(void) | ||
| 606 | { | ||
| 607 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
| 608 | |||
| 609 | return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; | ||
| 610 | } | ||
| 611 | |||
| 603 | #endif | 612 | #endif |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..119528de8235 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/tick.h> | ||
| 22 | 21 | ||
| 23 | #include <asm/irq_regs.h> | 22 | #include <asm/irq_regs.h> |
| 24 | 23 | ||
| @@ -51,7 +50,11 @@ int tick_is_oneshot_available(void) | |||
| 51 | { | 50 | { |
| 52 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 51 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
| 53 | 52 | ||
| 54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | 53 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) |
| 54 | return 0; | ||
| 55 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
| 56 | return 1; | ||
| 57 | return tick_broadcast_oneshot_available(); | ||
| 55 | } | 58 | } |
| 56 | 59 | ||
| 57 | /* | 60 | /* |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..1009b06d6f89 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -1,6 +1,10 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * tick internal variable and functions used by low/high res code | 2 | * tick internal variable and functions used by low/high res code |
| 3 | */ | 3 | */ |
| 4 | #include <linux/hrtimer.h> | ||
| 5 | #include <linux/tick.h> | ||
| 6 | |||
| 7 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
| 4 | 8 | ||
| 5 | #define TICK_DO_TIMER_NONE -1 | 9 | #define TICK_DO_TIMER_NONE -1 |
| 6 | #define TICK_DO_TIMER_BOOT -2 | 10 | #define TICK_DO_TIMER_BOOT -2 |
| @@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | |||
| 36 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 40 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
| 37 | extern int tick_broadcast_oneshot_active(void); | 41 | extern int tick_broadcast_oneshot_active(void); |
| 38 | extern void tick_check_oneshot_broadcast(int cpu); | 42 | extern void tick_check_oneshot_broadcast(int cpu); |
| 43 | bool tick_broadcast_oneshot_available(void); | ||
| 39 | # else /* BROADCAST */ | 44 | # else /* BROADCAST */ |
| 40 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 45 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
| 41 | { | 46 | { |
| @@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } | |||
| 46 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 51 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } |
| 47 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 52 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
| 48 | static inline void tick_check_oneshot_broadcast(int cpu) { } | 53 | static inline void tick_check_oneshot_broadcast(int cpu) { } |
| 54 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | ||
| 49 | # endif /* !BROADCAST */ | 55 | # endif /* !BROADCAST */ |
| 50 | 56 | ||
| 51 | #else /* !ONESHOT */ | 57 | #else /* !ONESHOT */ |
| @@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
| 76 | return 0; | 82 | return 0; |
| 77 | } | 83 | } |
| 78 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 84 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
| 85 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
| 79 | #endif /* !TICK_ONESHOT */ | 86 | #endif /* !TICK_ONESHOT */ |
| 80 | 87 | ||
| 81 | /* | 88 | /* |
| @@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
| 132 | { | 139 | { |
| 133 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | 140 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); |
| 134 | } | 141 | } |
| 142 | |||
| 143 | #endif | ||
| 144 | |||
| 145 | extern void do_timer(unsigned long ticks); | ||
| 146 | extern seqlock_t xtime_lock; | ||
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908b..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/tick.h> | ||
| 22 | 21 | ||
| 23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
| 24 | 23 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea2433471..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -19,7 +19,6 @@ | |||
| 19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
| 20 | #include <linux/profile.h> | 20 | #include <linux/profile.h> |
| 21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
| 22 | #include <linux/tick.h> | ||
| 23 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 24 | 23 | ||
| 25 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902c..8ad5d576755e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/sysdev.h> | 17 | #include <linux/syscore_ops.h> |
| 18 | #include <linux/clocksource.h> | 18 | #include <linux/clocksource.h> |
| 19 | #include <linux/jiffies.h> | 19 | #include <linux/jiffies.h> |
| 20 | #include <linux/time.h> | 20 | #include <linux/time.h> |
| @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
| 353 | * | 353 | * |
| 354 | * Sets the time of day to the new time and update NTP and notify hrtimers | 354 | * Sets the time of day to the new time and update NTP and notify hrtimers |
| 355 | */ | 355 | */ |
| 356 | int do_settimeofday(struct timespec *tv) | 356 | int do_settimeofday(const struct timespec *tv) |
| 357 | { | 357 | { |
| 358 | struct timespec ts_delta; | 358 | struct timespec ts_delta; |
| 359 | unsigned long flags; | 359 | unsigned long flags; |
| @@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv) | |||
| 387 | 387 | ||
| 388 | EXPORT_SYMBOL(do_settimeofday); | 388 | EXPORT_SYMBOL(do_settimeofday); |
| 389 | 389 | ||
| 390 | |||
| 391 | /** | ||
| 392 | * timekeeping_inject_offset - Adds or subtracts from the current time. | ||
| 393 | * @tv: pointer to the timespec variable containing the offset | ||
| 394 | * | ||
| 395 | * Adds or subtracts an offset value from the current time. | ||
| 396 | */ | ||
| 397 | int timekeeping_inject_offset(struct timespec *ts) | ||
| 398 | { | ||
| 399 | unsigned long flags; | ||
| 400 | |||
| 401 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | ||
| 402 | return -EINVAL; | ||
| 403 | |||
| 404 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 405 | |||
| 406 | timekeeping_forward_now(); | ||
| 407 | |||
| 408 | xtime = timespec_add(xtime, *ts); | ||
| 409 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); | ||
| 410 | |||
| 411 | timekeeper.ntp_error = 0; | ||
| 412 | ntp_clear(); | ||
| 413 | |||
| 414 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
| 415 | timekeeper.mult); | ||
| 416 | |||
| 417 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 418 | |||
| 419 | /* signal hrtimers about time change */ | ||
| 420 | clock_was_set(); | ||
| 421 | |||
| 422 | return 0; | ||
| 423 | } | ||
| 424 | EXPORT_SYMBOL(timekeeping_inject_offset); | ||
| 425 | |||
| 390 | /** | 426 | /** |
| 391 | * change_clocksource - Swaps clocksources if a new one is available | 427 | * change_clocksource - Swaps clocksources if a new one is available |
| 392 | * | 428 | * |
| @@ -561,13 +597,12 @@ static struct timespec timekeeping_suspend_time; | |||
| 561 | 597 | ||
| 562 | /** | 598 | /** |
| 563 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 599 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
| 564 | * @dev: unused | ||
| 565 | * | 600 | * |
| 566 | * This is for the generic clocksource timekeeping. | 601 | * This is for the generic clocksource timekeeping. |
| 567 | * xtime/wall_to_monotonic/jiffies/etc are | 602 | * xtime/wall_to_monotonic/jiffies/etc are |
| 568 | * still managed by arch specific suspend/resume code. | 603 | * still managed by arch specific suspend/resume code. |
| 569 | */ | 604 | */ |
| 570 | static int timekeeping_resume(struct sys_device *dev) | 605 | static void timekeeping_resume(void) |
| 571 | { | 606 | { |
| 572 | unsigned long flags; | 607 | unsigned long flags; |
| 573 | struct timespec ts; | 608 | struct timespec ts; |
| @@ -596,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) | |||
| 596 | 631 | ||
| 597 | /* Resume hrtimers */ | 632 | /* Resume hrtimers */ |
| 598 | hres_timers_resume(); | 633 | hres_timers_resume(); |
| 599 | |||
| 600 | return 0; | ||
| 601 | } | 634 | } |
| 602 | 635 | ||
| 603 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | 636 | static int timekeeping_suspend(void) |
| 604 | { | 637 | { |
| 605 | unsigned long flags; | 638 | unsigned long flags; |
| 606 | 639 | ||
| @@ -618,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
| 618 | } | 651 | } |
| 619 | 652 | ||
| 620 | /* sysfs resume/suspend bits for timekeeping */ | 653 | /* sysfs resume/suspend bits for timekeeping */ |
| 621 | static struct sysdev_class timekeeping_sysclass = { | 654 | static struct syscore_ops timekeeping_syscore_ops = { |
| 622 | .name = "timekeeping", | ||
| 623 | .resume = timekeeping_resume, | 655 | .resume = timekeeping_resume, |
| 624 | .suspend = timekeeping_suspend, | 656 | .suspend = timekeeping_suspend, |
| 625 | }; | 657 | }; |
| 626 | 658 | ||
| 627 | static struct sys_device device_timer = { | 659 | static int __init timekeeping_init_ops(void) |
| 628 | .id = 0, | ||
| 629 | .cls = &timekeeping_sysclass, | ||
| 630 | }; | ||
| 631 | |||
| 632 | static int __init timekeeping_init_device(void) | ||
| 633 | { | 660 | { |
| 634 | int error = sysdev_class_register(&timekeeping_sysclass); | 661 | register_syscore_ops(&timekeeping_syscore_ops); |
| 635 | if (!error) | 662 | return 0; |
| 636 | error = sysdev_register(&device_timer); | ||
| 637 | return error; | ||
| 638 | } | 663 | } |
| 639 | 664 | ||
| 640 | device_initcall(timekeeping_init_device); | 665 | device_initcall(timekeeping_init_ops); |
| 641 | 666 | ||
| 642 | /* | 667 | /* |
| 643 | * If the error is already larger, we look ahead even further | 668 | * If the error is already larger, we look ahead even further |
| @@ -779,7 +804,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
| 779 | * | 804 | * |
| 780 | * Called from the timer interrupt, must hold a write on xtime_lock. | 805 | * Called from the timer interrupt, must hold a write on xtime_lock. |
| 781 | */ | 806 | */ |
| 782 | void update_wall_time(void) | 807 | static void update_wall_time(void) |
| 783 | { | 808 | { |
| 784 | struct clocksource *clock; | 809 | struct clocksource *clock; |
| 785 | cycle_t offset; | 810 | cycle_t offset; |
| @@ -871,7 +896,7 @@ void update_wall_time(void) | |||
| 871 | * getboottime - Return the real time of system boot. | 896 | * getboottime - Return the real time of system boot. |
| 872 | * @ts: pointer to the timespec to be set | 897 | * @ts: pointer to the timespec to be set |
| 873 | * | 898 | * |
| 874 | * Returns the time of day in a timespec. | 899 | * Returns the wall-time of boot in a timespec. |
| 875 | * | 900 | * |
| 876 | * This is based on the wall_to_monotonic offset and the total suspend | 901 | * This is based on the wall_to_monotonic offset and the total suspend |
| 877 | * time. Calls to settimeofday will affect the value returned (which | 902 | * time. Calls to settimeofday will affect the value returned (which |
| @@ -889,6 +914,55 @@ void getboottime(struct timespec *ts) | |||
| 889 | } | 914 | } |
| 890 | EXPORT_SYMBOL_GPL(getboottime); | 915 | EXPORT_SYMBOL_GPL(getboottime); |
| 891 | 916 | ||
| 917 | |||
| 918 | /** | ||
| 919 | * get_monotonic_boottime - Returns monotonic time since boot | ||
| 920 | * @ts: pointer to the timespec to be set | ||
| 921 | * | ||
| 922 | * Returns the monotonic time since boot in a timespec. | ||
| 923 | * | ||
| 924 | * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also | ||
| 925 | * includes the time spent in suspend. | ||
| 926 | */ | ||
| 927 | void get_monotonic_boottime(struct timespec *ts) | ||
| 928 | { | ||
| 929 | struct timespec tomono, sleep; | ||
| 930 | unsigned int seq; | ||
| 931 | s64 nsecs; | ||
| 932 | |||
| 933 | WARN_ON(timekeeping_suspended); | ||
| 934 | |||
| 935 | do { | ||
| 936 | seq = read_seqbegin(&xtime_lock); | ||
| 937 | *ts = xtime; | ||
| 938 | tomono = wall_to_monotonic; | ||
| 939 | sleep = total_sleep_time; | ||
| 940 | nsecs = timekeeping_get_ns(); | ||
| 941 | |||
| 942 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 943 | |||
| 944 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | ||
| 945 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | ||
| 946 | } | ||
| 947 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | ||
| 948 | |||
| 949 | /** | ||
| 950 | * ktime_get_boottime - Returns monotonic time since boot in a ktime | ||
| 951 | * | ||
| 952 | * Returns the monotonic time since boot in a ktime | ||
| 953 | * | ||
| 954 | * This is similar to CLOCK_MONTONIC/ktime_get, but also | ||
| 955 | * includes the time spent in suspend. | ||
| 956 | */ | ||
| 957 | ktime_t ktime_get_boottime(void) | ||
| 958 | { | ||
| 959 | struct timespec ts; | ||
| 960 | |||
| 961 | get_monotonic_boottime(&ts); | ||
| 962 | return timespec_to_ktime(ts); | ||
| 963 | } | ||
| 964 | EXPORT_SYMBOL_GPL(ktime_get_boottime); | ||
| 965 | |||
| 892 | /** | 966 | /** |
| 893 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | 967 | * monotonic_to_bootbased - Convert the monotonic time to boot based. |
| 894 | * @ts: pointer to the timespec to be converted | 968 | * @ts: pointer to the timespec to be converted |
| @@ -910,11 +984,6 @@ struct timespec __current_kernel_time(void) | |||
| 910 | return xtime; | 984 | return xtime; |
| 911 | } | 985 | } |
| 912 | 986 | ||
| 913 | struct timespec __get_wall_to_monotonic(void) | ||
| 914 | { | ||
| 915 | return wall_to_monotonic; | ||
| 916 | } | ||
| 917 | |||
| 918 | struct timespec current_kernel_time(void) | 987 | struct timespec current_kernel_time(void) |
| 919 | { | 988 | { |
| 920 | struct timespec now; | 989 | struct timespec now; |
| @@ -946,3 +1015,48 @@ struct timespec get_monotonic_coarse(void) | |||
| 946 | now.tv_nsec + mono.tv_nsec); | 1015 | now.tv_nsec + mono.tv_nsec); |
| 947 | return now; | 1016 | return now; |
| 948 | } | 1017 | } |
| 1018 | |||
| 1019 | /* | ||
| 1020 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
| 1021 | * without sampling the sequence number in xtime_lock. | ||
| 1022 | * jiffies is defined in the linker script... | ||
| 1023 | */ | ||
| 1024 | void do_timer(unsigned long ticks) | ||
| 1025 | { | ||
| 1026 | jiffies_64 += ticks; | ||
| 1027 | update_wall_time(); | ||
| 1028 | calc_global_load(ticks); | ||
| 1029 | } | ||
| 1030 | |||
| 1031 | /** | ||
| 1032 | * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, | ||
| 1033 | * and sleep offsets. | ||
| 1034 | * @xtim: pointer to timespec to be set with xtime | ||
| 1035 | * @wtom: pointer to timespec to be set with wall_to_monotonic | ||
| 1036 | * @sleep: pointer to timespec to be set with time in suspend | ||
| 1037 | */ | ||
| 1038 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | ||
| 1039 | struct timespec *wtom, struct timespec *sleep) | ||
| 1040 | { | ||
| 1041 | unsigned long seq; | ||
| 1042 | |||
| 1043 | do { | ||
| 1044 | seq = read_seqbegin(&xtime_lock); | ||
| 1045 | *xtim = xtime; | ||
| 1046 | *wtom = wall_to_monotonic; | ||
| 1047 | *sleep = total_sleep_time; | ||
| 1048 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | /** | ||
| 1052 | * xtime_update() - advances the timekeeping infrastructure | ||
| 1053 | * @ticks: number of ticks, that have elapsed since the last call. | ||
| 1054 | * | ||
| 1055 | * Must be called with interrupts disabled. | ||
| 1056 | */ | ||
| 1057 | void xtime_update(unsigned long ticks) | ||
| 1058 | { | ||
| 1059 | write_seqlock(&xtime_lock); | ||
| 1060 | do_timer(ticks); | ||
| 1061 | write_sequnlock(&xtime_lock); | ||
| 1062 | } | ||
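get_monotonic_boottime()/ktime_get_boottime() above report monotonic time that also counts time spent in suspend. On kernels of this series and later, the same quantity is exposed to userspace as the CLOCK_BOOTTIME posix clock, so its offset from CLOCK_MONOTONIC approximates accumulated suspend time. A userspace sketch (assumes the libc headers define CLOCK_BOOTTIME; older glibc may also need -lrt for clock_gettime):

/* Userspace sketch: compare CLOCK_MONOTONIC with CLOCK_BOOTTIME.
 * The fallback value 7 is assumed to match the kernel's clock id. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	printf("monotonic: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %ld.%09ld\n", (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}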
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
| @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
| 236 | unsigned int timer_flag) | 236 | unsigned int timer_flag) |
| 237 | { | 237 | { |
| 238 | /* | 238 | /* |
| 239 | * It doesnt matter which lock we take: | 239 | * It doesn't matter which lock we take: |
| 240 | */ | 240 | */ |
| 241 | raw_spinlock_t *lock; | 241 | raw_spinlock_t *lock; |
| 242 | struct entry *entry, input; | 242 | struct entry *entry, input; |
diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..fd6198692b57 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} | |||
| 404 | 404 | ||
| 405 | static struct debug_obj_descr timer_debug_descr; | 405 | static struct debug_obj_descr timer_debug_descr; |
| 406 | 406 | ||
| 407 | static void *timer_debug_hint(void *addr) | ||
| 408 | { | ||
| 409 | return ((struct timer_list *) addr)->function; | ||
| 410 | } | ||
| 411 | |||
| 407 | /* | 412 | /* |
| 408 | * fixup_init is called when: | 413 | * fixup_init is called when: |
| 409 | * - an active object is initialized | 414 | * - an active object is initialized |
| @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
| 477 | 482 | ||
| 478 | static struct debug_obj_descr timer_debug_descr = { | 483 | static struct debug_obj_descr timer_debug_descr = { |
| 479 | .name = "timer_list", | 484 | .name = "timer_list", |
| 485 | .debug_hint = timer_debug_hint, | ||
| 480 | .fixup_init = timer_fixup_init, | 486 | .fixup_init = timer_fixup_init, |
| 481 | .fixup_activate = timer_fixup_activate, | 487 | .fixup_activate = timer_fixup_activate, |
| 482 | .fixup_free = timer_fixup_free, | 488 | .fixup_free = timer_fixup_free, |
| @@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
| 964 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 970 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
| 965 | * not running on any CPU. | 971 | * not running on any CPU. |
| 966 | * | 972 | * |
| 973 | * Note: You must not hold locks that are held in interrupt context | ||
| 974 | * while calling this function. Even if the lock has nothing to do | ||
| 975 | * with the timer in question. Here's why: | ||
| 976 | * | ||
| 977 | * CPU0 CPU1 | ||
| 978 | * ---- ---- | ||
| 979 | * <SOFTIRQ> | ||
| 980 | * call_timer_fn(); | ||
| 981 | * base->running_timer = mytimer; | ||
| 982 | * spin_lock_irq(somelock); | ||
| 983 | * <IRQ> | ||
| 984 | * spin_lock(somelock); | ||
| 985 | * del_timer_sync(mytimer); | ||
| 986 | * while (base->running_timer == mytimer); | ||
| 987 | * | ||
| 988 | * Now del_timer_sync() will never return and never release somelock. | ||
| 989 | * The interrupt on the other CPU is waiting to grab somelock but | ||
| 990 | * it has interrupted the softirq that CPU0 is waiting to finish. | ||
| 991 | * | ||
| 967 | * The function returns whether it has deactivated a pending timer or not. | 992 | * The function returns whether it has deactivated a pending timer or not. |
| 968 | */ | 993 | */ |
| 969 | int del_timer_sync(struct timer_list *timer) | 994 | int del_timer_sync(struct timer_list *timer) |
| @@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer) | |||
| 971 | #ifdef CONFIG_LOCKDEP | 996 | #ifdef CONFIG_LOCKDEP |
| 972 | unsigned long flags; | 997 | unsigned long flags; |
| 973 | 998 | ||
| 999 | /* | ||
| 1000 | * If lockdep gives a backtrace here, please reference | ||
| 1001 | * the synchronization rules above. | ||
| 1002 | */ | ||
| 974 | local_irq_save(flags); | 1003 | local_irq_save(flags); |
| 975 | lock_map_acquire(&timer->lockdep_map); | 1004 | lock_map_acquire(&timer->lockdep_map); |
| 976 | lock_map_release(&timer->lockdep_map); | 1005 | lock_map_release(&timer->lockdep_map); |
| @@ -1295,19 +1324,6 @@ void run_local_timers(void) | |||
| 1295 | raise_softirq(TIMER_SOFTIRQ); | 1324 | raise_softirq(TIMER_SOFTIRQ); |
| 1296 | } | 1325 | } |
| 1297 | 1326 | ||
| 1298 | /* | ||
| 1299 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
| 1300 | * without sampling the sequence number in xtime_lock. | ||
| 1301 | * jiffies is defined in the linker script... | ||
| 1302 | */ | ||
| 1303 | |||
| 1304 | void do_timer(unsigned long ticks) | ||
| 1305 | { | ||
| 1306 | jiffies_64 += ticks; | ||
| 1307 | update_wall_time(); | ||
| 1308 | calc_global_load(ticks); | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | #ifdef __ARCH_WANT_SYS_ALARM | 1327 | #ifdef __ARCH_WANT_SYS_ALARM |
| 1312 | 1328 | ||
| 1313 | /* | 1329 | /* |
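The new kerneldoc above spells out why del_timer_sync() must not be called while holding a lock that is also taken from interrupt context. A short sketch of the unsafe and safe orderings, with hypothetical lock and timer names:

/* Hypothetical driver teardown illustrating the rule documented above. */
#include <linux/spinlock.h>
#include <linux/timer.h>

static DEFINE_SPINLOCK(mydev_lock);	/* also taken from mydev's IRQ handler */
static struct timer_list mydev_timer;

static void mydev_shutdown_bad(void)
{
	spin_lock_irq(&mydev_lock);
	del_timer_sync(&mydev_timer);	/* BAD: can deadlock as in the CPU0/CPU1 trace */
	spin_unlock_irq(&mydev_lock);
}

static void mydev_shutdown_good(void)
{
	spin_lock_irq(&mydev_lock);
	/* ... mark the device as going away so the timer won't be re-armed ... */
	spin_unlock_irq(&mydev_lock);

	del_timer_sync(&mydev_timer);	/* OK: no interrupt-context lock held */
}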
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 14674dce77a6..2ad39e556cb4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -141,7 +141,7 @@ if FTRACE | |||
| 141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
| 142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
| 143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
| 144 | select FRAME_POINTER if !ARM_UNWIND && !S390 | 144 | select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE |
| 145 | select KALLSYMS | 145 | select KALLSYMS |
| 146 | select GENERIC_TRACER | 146 | select GENERIC_TRACER |
| 147 | select CONTEXT_SWITCH_TRACER | 147 | select CONTEXT_SWITCH_TRACER |
| @@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
| 275 | This tracer profiles all the the likely and unlikely macros | 275 | This tracer profiles all the the likely and unlikely macros |
| 276 | in the kernel. It will display the results in: | 276 | in the kernel. It will display the results in: |
| 277 | 277 | ||
| 278 | /sys/kernel/debug/tracing/profile_annotated_branch | 278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
| 279 | 279 | ||
| 280 | Note: this will add a significant overhead; only turn this | 280 | Note: this will add a significant overhead; only turn this |
| 281 | on if you need to profile the system's use of these macros. | 281 | on if you need to profile the system's use of these macros. |
| @@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES | |||
| 288 | taken in the kernel is recorded whether it hit or miss. | 288 | taken in the kernel is recorded whether it hit or miss. |
| 289 | The results will be displayed in: | 289 | The results will be displayed in: |
| 290 | 290 | ||
| 291 | /sys/kernel/debug/tracing/profile_branch | 291 | /sys/kernel/debug/tracing/trace_stat/branch_all |
| 292 | 292 | ||
| 293 | This option also enables the likely/unlikely profiler. | 293 | This option also enables the likely/unlikely profiler. |
| 294 | 294 | ||
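The Kconfig help text above now points at the trace_stat/ paths where the branch profiler actually publishes its results. A small userspace sketch that dumps the annotated-branch statistics from the corrected path (assumes debugfs is mounted at /sys/kernel/debug and the profiler is enabled):

/* Userspace sketch: dump the annotated-branch profile. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/trace_stat/branch_annotated";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}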
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d95721f33702..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) | |||
| 703 | * | 703 | * |
| 704 | **/ | 704 | **/ |
| 705 | static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | 705 | static void blk_add_trace_rq(struct request_queue *q, struct request *rq, |
| 706 | u32 what) | 706 | u32 what) |
| 707 | { | 707 | { |
| 708 | struct blk_trace *bt = q->blk_trace; | 708 | struct blk_trace *bt = q->blk_trace; |
| 709 | int rw = rq->cmd_flags & 0x03; | ||
| 710 | 709 | ||
| 711 | if (likely(!bt)) | 710 | if (likely(!bt)) |
| 712 | return; | 711 | return; |
| 713 | 712 | ||
| 714 | if (rq->cmd_flags & REQ_DISCARD) | ||
| 715 | rw |= REQ_DISCARD; | ||
| 716 | |||
| 717 | if (rq->cmd_flags & REQ_SECURE) | ||
| 718 | rw |= REQ_SECURE; | ||
| 719 | |||
| 720 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | 713 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
| 721 | what |= BLK_TC_ACT(BLK_TC_PC); | 714 | what |= BLK_TC_ACT(BLK_TC_PC); |
| 722 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, | 715 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, |
| 723 | what, rq->errors, rq->cmd_len, rq->cmd); | 716 | what, rq->errors, rq->cmd_len, rq->cmd); |
| 724 | } else { | 717 | } else { |
| 725 | what |= BLK_TC_ACT(BLK_TC_FS); | 718 | what |= BLK_TC_ACT(BLK_TC_FS); |
| 726 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, | 719 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
| 727 | what, rq->errors, 0, NULL); | 720 | rq->cmd_flags, what, rq->errors, 0, NULL); |
| 728 | } | 721 | } |
| 729 | } | 722 | } |
| 730 | 723 | ||
| @@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) | |||
| 857 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); | 850 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); |
| 858 | } | 851 | } |
| 859 | 852 | ||
| 860 | static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) | 853 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
| 854 | unsigned int depth, bool explicit) | ||
| 861 | { | 855 | { |
| 862 | struct blk_trace *bt = q->blk_trace; | 856 | struct blk_trace *bt = q->blk_trace; |
| 863 | 857 | ||
| 864 | if (bt) { | 858 | if (bt) { |
| 865 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | 859 | __be64 rpdu = cpu_to_be64(depth); |
| 866 | __be64 rpdu = cpu_to_be64(pdu); | 860 | u32 what; |
| 867 | 861 | ||
| 868 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, | 862 | if (explicit) |
| 869 | sizeof(rpdu), &rpdu); | 863 | what = BLK_TA_UNPLUG_IO; |
| 870 | } | 864 | else |
| 871 | } | 865 | what = BLK_TA_UNPLUG_TIMER; |
| 872 | |||
| 873 | static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) | ||
| 874 | { | ||
| 875 | struct blk_trace *bt = q->blk_trace; | ||
| 876 | |||
| 877 | if (bt) { | ||
| 878 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | ||
| 879 | __be64 rpdu = cpu_to_be64(pdu); | ||
| 880 | 866 | ||
| 881 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, | 867 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
| 882 | sizeof(rpdu), &rpdu); | ||
| 883 | } | 868 | } |
| 884 | } | 869 | } |
| 885 | 870 | ||
| @@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void) | |||
| 1022 | WARN_ON(ret); | 1007 | WARN_ON(ret); |
| 1023 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); | 1008 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); |
| 1024 | WARN_ON(ret); | 1009 | WARN_ON(ret); |
| 1025 | ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1010 | ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); |
| 1026 | WARN_ON(ret); | ||
| 1027 | ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | ||
| 1028 | WARN_ON(ret); | 1011 | WARN_ON(ret); |
| 1029 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1012 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
| 1030 | WARN_ON(ret); | 1013 | WARN_ON(ret); |
| @@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void) | |||
| 1039 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1022 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
| 1040 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); | 1023 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
| 1041 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1024 | unregister_trace_block_split(blk_add_trace_split, NULL); |
| 1042 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1025 | unregister_trace_block_unplug(blk_add_trace_unplug, NULL); |
| 1043 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | ||
| 1044 | unregister_trace_block_plug(blk_add_trace_plug, NULL); | 1026 | unregister_trace_block_plug(blk_add_trace_plug, NULL); |
| 1045 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); | 1027 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); |
| 1046 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); | 1028 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); |
| @@ -1827,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
| 1827 | rwbs[i] = '\0'; | 1809 | rwbs[i] = '\0'; |
| 1828 | } | 1810 | } |
| 1829 | 1811 | ||
| 1830 | void blk_fill_rwbs_rq(char *rwbs, struct request *rq) | ||
| 1831 | { | ||
| 1832 | int rw = rq->cmd_flags & 0x03; | ||
| 1833 | int bytes; | ||
| 1834 | |||
| 1835 | if (rq->cmd_flags & REQ_DISCARD) | ||
| 1836 | rw |= REQ_DISCARD; | ||
| 1837 | |||
| 1838 | if (rq->cmd_flags & REQ_SECURE) | ||
| 1839 | rw |= REQ_SECURE; | ||
| 1840 | |||
| 1841 | bytes = blk_rq_bytes(rq); | ||
| 1842 | |||
| 1843 | blk_fill_rwbs(rwbs, rw, bytes); | ||
| 1844 | } | ||
| 1845 | |||
| 1846 | #endif /* CONFIG_EVENT_TRACING */ | 1812 | #endif /* CONFIG_EVENT_TRACING */ |
| 1847 | 1813 | ||
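
The merged unplug handler above emits the queue depth as a single big-endian 64-bit PDU via cpu_to_be64(), so the value in the trace stream is architecture independent. Below is a minimal userspace sketch of that round trip, with glibc's htobe64()/be64toh() playing the role of the kernel helpers; it is an illustration, not a description of the blktrace wire format.

/*
 * Userspace sketch of the unplug PDU handling: store the depth in fixed
 * big-endian form, decode it the way a consumer such as blkparse would.
 */
#define _DEFAULT_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <endian.h>

int main(void)
{
	unsigned int depth = 42;		/* requests still queued */
	uint64_t pdu = htobe64(depth);		/* what the trace entry carries */

	printf("decoded depth: %llu\n",
	       (unsigned long long)be64toh(pdu));
	return 0;
}
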
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..ee24fa1935ac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod) | |||
| 1268 | p->flags = 0L; | 1268 | p->flags = 0L; |
| 1269 | 1269 | ||
| 1270 | /* | 1270 | /* |
| 1271 | * Do the initial record convertion from mcount jump | 1271 | * Do the initial record conversion from mcount jump |
| 1272 | * to the NOP instructions. | 1272 | * to the NOP instructions. |
| 1273 | */ | 1273 | */ |
| 1274 | if (!ftrace_code_disable(mod, p)) { | 1274 | if (!ftrace_code_disable(mod, p)) { |
| @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1467 | return t_hash_next(m, pos); | 1467 | return t_hash_next(m, pos); |
| 1468 | 1468 | ||
| 1469 | (*pos)++; | 1469 | (*pos)++; |
| 1470 | iter->pos = *pos; | 1470 | iter->pos = iter->func_pos = *pos; |
| 1471 | 1471 | ||
| 1472 | if (iter->flags & FTRACE_ITER_PRINTALL) | 1472 | if (iter->flags & FTRACE_ITER_PRINTALL) |
| 1473 | return t_hash_start(m, pos); | 1473 | return t_hash_start(m, pos); |
| @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1502 | if (!rec) | 1502 | if (!rec) |
| 1503 | return t_hash_start(m, pos); | 1503 | return t_hash_start(m, pos); |
| 1504 | 1504 | ||
| 1505 | iter->func_pos = *pos; | ||
| 1506 | iter->func = rec; | 1505 | iter->func = rec; |
| 1507 | 1506 | ||
| 1508 | return iter; | 1507 | return iter; |
| @@ -3328,7 +3327,7 @@ static int start_graph_tracing(void) | |||
| 3328 | /* The cpu_boot init_task->ret_stack will never be freed */ | 3327 | /* The cpu_boot init_task->ret_stack will never be freed */ |
| 3329 | for_each_online_cpu(cpu) { | 3328 | for_each_online_cpu(cpu) { |
| 3330 | if (!idle_task(cpu)->ret_stack) | 3329 | if (!idle_task(cpu)->ret_stack) |
| 3331 | ftrace_graph_init_task(idle_task(cpu)); | 3330 | ftrace_graph_init_idle_task(idle_task(cpu), cpu); |
| 3332 | } | 3331 | } |
| 3333 | 3332 | ||
| 3334 | do { | 3333 | do { |
| @@ -3418,6 +3417,49 @@ void unregister_ftrace_graph(void) | |||
| 3418 | mutex_unlock(&ftrace_lock); | 3417 | mutex_unlock(&ftrace_lock); |
| 3419 | } | 3418 | } |
| 3420 | 3419 | ||
| 3420 | static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); | ||
| 3421 | |||
| 3422 | static void | ||
| 3423 | graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) | ||
| 3424 | { | ||
| 3425 | atomic_set(&t->tracing_graph_pause, 0); | ||
| 3426 | atomic_set(&t->trace_overrun, 0); | ||
| 3427 | t->ftrace_timestamp = 0; | ||
| 3428 | /* make curr_ret_stack visible before we add the ret_stack */ | ||
| 3429 | smp_wmb(); | ||
| 3430 | t->ret_stack = ret_stack; | ||
| 3431 | } | ||
| 3432 | |||
| 3433 | /* | ||
| 3434 | * Allocate a return stack for the idle task. May be the first | ||
| 3435 | * time through, or it may be done by CPU hotplug online. | ||
| 3436 | */ | ||
| 3437 | void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) | ||
| 3438 | { | ||
| 3439 | t->curr_ret_stack = -1; | ||
| 3440 | /* | ||
| 3441 | * The idle task has no parent; it either has its own | ||
| 3442 | * stack or no stack at all. | ||
| 3443 | */ | ||
| 3444 | if (t->ret_stack) | ||
| 3445 | WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); | ||
| 3446 | |||
| 3447 | if (ftrace_graph_active) { | ||
| 3448 | struct ftrace_ret_stack *ret_stack; | ||
| 3449 | |||
| 3450 | ret_stack = per_cpu(idle_ret_stack, cpu); | ||
| 3451 | if (!ret_stack) { | ||
| 3452 | ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH | ||
| 3453 | * sizeof(struct ftrace_ret_stack), | ||
| 3454 | GFP_KERNEL); | ||
| 3455 | if (!ret_stack) | ||
| 3456 | return; | ||
| 3457 | per_cpu(idle_ret_stack, cpu) = ret_stack; | ||
| 3458 | } | ||
| 3459 | graph_init_task(t, ret_stack); | ||
| 3460 | } | ||
| 3461 | } | ||
| 3462 | |||
| 3421 | /* Allocate a return stack for newly created task */ | 3463 | /* Allocate a return stack for newly created task */ |
| 3422 | void ftrace_graph_init_task(struct task_struct *t) | 3464 | void ftrace_graph_init_task(struct task_struct *t) |
| 3423 | { | 3465 | { |
| @@ -3433,12 +3475,7 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
| 3433 | GFP_KERNEL); | 3475 | GFP_KERNEL); |
| 3434 | if (!ret_stack) | 3476 | if (!ret_stack) |
| 3435 | return; | 3477 | return; |
| 3436 | atomic_set(&t->tracing_graph_pause, 0); | 3478 | graph_init_task(t, ret_stack); |
| 3437 | atomic_set(&t->trace_overrun, 0); | ||
| 3438 | t->ftrace_timestamp = 0; | ||
| 3439 | /* make curr_ret_stack visable before we add the ret_stack */ | ||
| 3440 | smp_wmb(); | ||
| 3441 | t->ret_stack = ret_stack; | ||
| 3442 | } | 3479 | } |
| 3443 | } | 3480 | } |
| 3444 | 3481 | ||
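
graph_init_task() above initializes the per-task fields first and only then publishes t->ret_stack, with an smp_wmb() in between, so a concurrent reader that observes the pointer also observes the initialized fields. The self-contained userspace sketch below shows the same publish/consume ordering, using C11 release/acquire atomics as a stand-in for the kernel barriers; the types are illustrative, not the ftrace ones.

/*
 * Publish pattern sketch: fully initialize the object, then make the
 * pointer visible with a release store; the acquire load on the reader
 * side pairs with it (the analogue of smp_wmb()/smp_rmb()).
 * Build with: cc -pthread publish.c
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ret_stack { int depth; };

static struct ret_stack *stack_storage;
static _Atomic(struct ret_stack *) published;	/* plays the role of t->ret_stack */

static void *writer(void *arg)
{
	(void)arg;
	stack_storage = malloc(sizeof(*stack_storage));
	stack_storage->depth = 0;			/* init first ... */
	atomic_store_explicit(&published, stack_storage,
			      memory_order_release);	/* ... then publish */
	return NULL;
}

static void *reader(void *arg)
{
	struct ret_stack *rs;

	(void)arg;
	/* Spin until the pointer is visible; acquire pairs with release. */
	while (!(rs = atomic_load_explicit(&published, memory_order_acquire)))
		;
	printf("reader sees depth=%d\n", rs->depth);	/* guaranteed initialized */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	free(stack_storage);
	return 0;
}
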
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..0ef7b4b2a1f7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -5,7 +5,6 @@ | |||
| 5 | */ | 5 | */ |
| 6 | #include <linux/ring_buffer.h> | 6 | #include <linux/ring_buffer.h> |
| 7 | #include <linux/trace_clock.h> | 7 | #include <linux/trace_clock.h> |
| 8 | #include <linux/ftrace_irq.h> | ||
| 9 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
| 10 | #include <linux/debugfs.h> | 9 | #include <linux/debugfs.h> |
| 11 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
| @@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list) | |||
| 669 | * the reader page). But if the next page is a header page, | 668 | * the reader page). But if the next page is a header page, |
| 670 | * its flags will be non zero. | 669 | * its flags will be non zero. |
| 671 | */ | 670 | */ |
| 672 | static int inline | 671 | static inline int |
| 673 | rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, | 672 | rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, |
| 674 | struct buffer_page *page, struct list_head *list) | 673 | struct buffer_page *page, struct list_head *list) |
| 675 | { | 674 | { |
| @@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
| 1429 | } | 1428 | } |
| 1430 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1429 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
| 1431 | 1430 | ||
| 1431 | void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) | ||
| 1432 | { | ||
| 1433 | mutex_lock(&buffer->mutex); | ||
| 1434 | if (val) | ||
| 1435 | buffer->flags |= RB_FL_OVERWRITE; | ||
| 1436 | else | ||
| 1437 | buffer->flags &= ~RB_FL_OVERWRITE; | ||
| 1438 | mutex_unlock(&buffer->mutex); | ||
| 1439 | } | ||
| 1440 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); | ||
| 1441 | |||
| 1432 | static inline void * | 1442 | static inline void * |
| 1433 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) | 1443 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) |
| 1434 | { | 1444 | { |
| @@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) | |||
| 1468 | return local_read(&bpage->entries) & RB_WRITE_MASK; | 1478 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
| 1469 | } | 1479 | } |
| 1470 | 1480 | ||
| 1471 | /* Size is determined by what has been commited */ | 1481 | /* Size is determined by what has been committed */ |
| 1472 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1482 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
| 1473 | { | 1483 | { |
| 1474 | return rb_page_commit(bpage); | 1484 | return rb_page_commit(bpage); |
| @@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
| 2162 | if (likely(ts >= cpu_buffer->write_stamp)) { | 2172 | if (likely(ts >= cpu_buffer->write_stamp)) { |
| 2163 | delta = diff; | 2173 | delta = diff; |
| 2164 | if (unlikely(test_time_stamp(delta))) { | 2174 | if (unlikely(test_time_stamp(delta))) { |
| 2175 | int local_clock_stable = 1; | ||
| 2176 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
| 2177 | local_clock_stable = sched_clock_stable; | ||
| 2178 | #endif | ||
| 2165 | WARN_ONCE(delta > (1ULL << 59), | 2179 | WARN_ONCE(delta > (1ULL << 59), |
| 2166 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | 2180 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
| 2167 | (unsigned long long)delta, | 2181 | (unsigned long long)delta, |
| 2168 | (unsigned long long)ts, | 2182 | (unsigned long long)ts, |
| 2169 | (unsigned long long)cpu_buffer->write_stamp); | 2183 | (unsigned long long)cpu_buffer->write_stamp, |
| 2184 | local_clock_stable ? "" : | ||
| 2185 | "If you just came from a suspend/resume,\n" | ||
| 2186 | "please switch to the trace global clock:\n" | ||
| 2187 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
| 2170 | add_timestamp = 1; | 2188 | add_timestamp = 1; |
| 2171 | } | 2189 | } |
| 2172 | } | 2190 | } |
| @@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
| 2914 | /* | 2932 | /* |
| 2915 | * cpu_buffer->pages just needs to point to the buffer, it | 2933 | * cpu_buffer->pages just needs to point to the buffer, it |
| 2916 | * has no specific buffer page to point to. Lets move it out | 2934 | * has no specific buffer page to point to. Lets move it out |
| 2917 | * of our way so we don't accidently swap it. | 2935 | * of our way so we don't accidentally swap it. |
| 2918 | */ | 2936 | */ |
| 2919 | cpu_buffer->pages = reader->list.prev; | 2937 | cpu_buffer->pages = reader->list.prev; |
| 2920 | 2938 | ||
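
ring_buffer_change_overwrite() above follows the usual pattern of flipping a single feature bit in a flags word while holding the structure's mutex, so concurrent users never see a half-updated word. A minimal standalone sketch of that pattern; the flag value and the buffer type here are illustrative, not the kernel's.

#include <pthread.h>
#include <stdio.h>

#define FL_OVERWRITE (1u << 1)		/* stand-in for RB_FL_OVERWRITE */

struct demo_buffer {
	pthread_mutex_t mutex;
	unsigned int flags;
};

static void change_overwrite(struct demo_buffer *buffer, int val)
{
	pthread_mutex_lock(&buffer->mutex);
	if (val)
		buffer->flags |= FL_OVERWRITE;
	else
		buffer->flags &= ~FL_OVERWRITE;
	pthread_mutex_unlock(&buffer->mutex);
}

int main(void)
{
	struct demo_buffer rb = { PTHREAD_MUTEX_INITIALIZER, 0 };

	change_overwrite(&rb, 1);
	printf("overwrite on:  flags=%#x\n", rb.flags);
	change_overwrite(&rb, 0);
	printf("overwrite off: flags=%#x\n", rb.flags);
	return 0;
}
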
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..1cb49be7c7fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -41,8 +41,6 @@ | |||
| 41 | #include "trace.h" | 41 | #include "trace.h" |
| 42 | #include "trace_output.h" | 42 | #include "trace_output.h" |
| 43 | 43 | ||
| 44 | #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) | ||
| 45 | |||
| 46 | /* | 44 | /* |
| 47 | * On boot up, the ring buffer is set to the minimum size, so that | 45 | * On boot up, the ring buffer is set to the minimum size, so that |
| 48 | * we do not waste memory on systems that are not using tracing. | 46 | * we do not waste memory on systems that are not using tracing. |
| @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
| 340 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
| 341 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
| 342 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
| 343 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; |
| 344 | 342 | ||
| 345 | static int trace_stop_count; | 343 | static int trace_stop_count; |
| 346 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
| @@ -425,6 +423,7 @@ static const char *trace_options[] = { | |||
| 425 | "sleep-time", | 423 | "sleep-time", |
| 426 | "graph-time", | 424 | "graph-time", |
| 427 | "record-cmd", | 425 | "record-cmd", |
| 426 | "overwrite", | ||
| 428 | NULL | 427 | NULL |
| 429 | }; | 428 | }; |
| 430 | 429 | ||
| @@ -780,6 +779,11 @@ __acquires(kernel_lock) | |||
| 780 | tracing_reset_online_cpus(tr); | 779 | tracing_reset_online_cpus(tr); |
| 781 | 780 | ||
| 782 | current_trace = type; | 781 | current_trace = type; |
| 782 | |||
| 783 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
| 784 | if (ring_buffer_expanded && type->use_max_tr) | ||
| 785 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | ||
| 786 | |||
| 783 | /* the test is responsible for initializing and enabling */ | 787 | /* the test is responsible for initializing and enabling */ |
| 784 | pr_info("Testing tracer %s: ", type->name); | 788 | pr_info("Testing tracer %s: ", type->name); |
| 785 | ret = type->selftest(type, tr); | 789 | ret = type->selftest(type, tr); |
| @@ -792,6 +796,10 @@ __acquires(kernel_lock) | |||
| 792 | /* Only reset on passing, to avoid touching corrupted buffers */ | 796 | /* Only reset on passing, to avoid touching corrupted buffers */ |
| 793 | tracing_reset_online_cpus(tr); | 797 | tracing_reset_online_cpus(tr); |
| 794 | 798 | ||
| 799 | /* Shrink the max buffer again */ | ||
| 800 | if (ring_buffer_expanded && type->use_max_tr) | ||
| 801 | ring_buffer_resize(max_tr.buffer, 1); | ||
| 802 | |||
| 795 | printk(KERN_CONT "PASSED\n"); | 803 | printk(KERN_CONT "PASSED\n"); |
| 796 | } | 804 | } |
| 797 | #endif | 805 | #endif |
| @@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
| 1102 | 1110 | ||
| 1103 | entry->preempt_count = pc & 0xff; | 1111 | entry->preempt_count = pc & 0xff; |
| 1104 | entry->pid = (tsk) ? tsk->pid : 0; | 1112 | entry->pid = (tsk) ? tsk->pid : 0; |
| 1105 | entry->lock_depth = (tsk) ? tsk->lock_depth : 0; | 1113 | entry->padding = 0; |
| 1106 | entry->flags = | 1114 | entry->flags = |
| 1107 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1115 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
| 1108 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1116 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
| @@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m) | |||
| 1749 | seq_puts(m, "# | / _----=> need-resched \n"); | 1757 | seq_puts(m, "# | / _----=> need-resched \n"); |
| 1750 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 1758 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); |
| 1751 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 1759 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); |
| 1752 | seq_puts(m, "# |||| /_--=> lock-depth \n"); | 1760 | seq_puts(m, "# |||| / delay \n"); |
| 1753 | seq_puts(m, "# |||||/ delay \n"); | 1761 | seq_puts(m, "# cmd pid ||||| time | caller \n"); |
| 1754 | seq_puts(m, "# cmd pid |||||| time | caller \n"); | 1762 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
| 1755 | seq_puts(m, "# \\ / |||||| \\ | / \n"); | ||
| 1756 | } | 1763 | } |
| 1757 | 1764 | ||
| 1758 | static void print_func_help_header(struct seq_file *m) | 1765 | static void print_func_help_header(struct seq_file *m) |
| @@ -2529,6 +2536,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
| 2529 | 2536 | ||
| 2530 | if (mask == TRACE_ITER_RECORD_CMD) | 2537 | if (mask == TRACE_ITER_RECORD_CMD) |
| 2531 | trace_event_enable_cmd_record(enabled); | 2538 | trace_event_enable_cmd_record(enabled); |
| 2539 | |||
| 2540 | if (mask == TRACE_ITER_OVERWRITE) | ||
| 2541 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | ||
| 2532 | } | 2542 | } |
| 2533 | 2543 | ||
| 2534 | static ssize_t | 2544 | static ssize_t |
| @@ -2710,6 +2720,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
| 2710 | 2720 | ||
| 2711 | mutex_lock(&trace_types_lock); | 2721 | mutex_lock(&trace_types_lock); |
| 2712 | if (tracer_enabled ^ val) { | 2722 | if (tracer_enabled ^ val) { |
| 2723 | |||
| 2724 | /* Only need to warn if this is used to change the state */ | ||
| 2725 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
| 2726 | |||
| 2713 | if (val) { | 2727 | if (val) { |
| 2714 | tracer_enabled = 1; | 2728 | tracer_enabled = 1; |
| 2715 | if (current_trace->start) | 2729 | if (current_trace->start) |
| @@ -3226,7 +3240,7 @@ waitagain: | |||
| 3226 | trace_seq_init(&iter->seq); | 3240 | trace_seq_init(&iter->seq); |
| 3227 | 3241 | ||
| 3228 | /* | 3242 | /* |
| 3229 | * If there was nothing to send to user, inspite of consuming trace | 3243 | * If there was nothing to send to user, in spite of consuming trace |
| 3230 | * entries, go back to wait for more entries. | 3244 | * entries, go back to wait for more entries. |
| 3231 | */ | 3245 | */ |
| 3232 | if (sret == -EBUSY) | 3246 | if (sret == -EBUSY) |
| @@ -4551,9 +4565,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
| 4551 | __init static int tracer_alloc_buffers(void) | 4565 | __init static int tracer_alloc_buffers(void) |
| 4552 | { | 4566 | { |
| 4553 | int ring_buf_size; | 4567 | int ring_buf_size; |
| 4568 | enum ring_buffer_flags rb_flags; | ||
| 4554 | int i; | 4569 | int i; |
| 4555 | int ret = -ENOMEM; | 4570 | int ret = -ENOMEM; |
| 4556 | 4571 | ||
| 4572 | |||
| 4557 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 4573 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
| 4558 | goto out; | 4574 | goto out; |
| 4559 | 4575 | ||
| @@ -4566,12 +4582,13 @@ __init static int tracer_alloc_buffers(void) | |||
| 4566 | else | 4582 | else |
| 4567 | ring_buf_size = 1; | 4583 | ring_buf_size = 1; |
| 4568 | 4584 | ||
| 4585 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
| 4586 | |||
| 4569 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 4587 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
| 4570 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 4588 | cpumask_copy(tracing_cpumask, cpu_all_mask); |
| 4571 | 4589 | ||
| 4572 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 4590 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
| 4573 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, | 4591 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); |
| 4574 | TRACE_BUFFER_FLAGS); | ||
| 4575 | if (!global_trace.buffer) { | 4592 | if (!global_trace.buffer) { |
| 4576 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 4593 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
| 4577 | WARN_ON(1); | 4594 | WARN_ON(1); |
| @@ -4581,7 +4598,7 @@ __init static int tracer_alloc_buffers(void) | |||
| 4581 | 4598 | ||
| 4582 | 4599 | ||
| 4583 | #ifdef CONFIG_TRACER_MAX_TRACE | 4600 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 4584 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); | 4601 | max_tr.buffer = ring_buffer_alloc(1, rb_flags); |
| 4585 | if (!max_tr.buffer) { | 4602 | if (!max_tr.buffer) { |
| 4586 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4603 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
| 4587 | WARN_ON(1); | 4604 | WARN_ON(1); |
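
The new "overwrite" string is appended to trace_options[] in the same position as TRACE_ITER_OVERWRITE (0x200000 == 1 << 21) because an option name at index i is looked up as flag bit 1 << i. The sketch below shows that name-to-mask mapping with a truncated stand-in table; the FIRST_BIT value is an assumption made only so the two listed entries line up with the enum bits shown in the diff.

#include <stdio.h>
#include <string.h>

static const char *demo_options[] = {
	"record-cmd",		/* earlier options elided for brevity */
	"overwrite",
	NULL
};

/* Assume the first listed entry corresponds to bit 20, as in the enum. */
#define FIRST_BIT 20

static unsigned long option_mask(const char *name)
{
	for (int i = 0; demo_options[i]; i++)
		if (!strcmp(demo_options[i], name))
			return 1UL << (FIRST_BIT + i);
	return 0;
}

int main(void)
{
	/* Prints 0x200000, matching TRACE_ITER_OVERWRITE. */
	printf("overwrite mask = %#lx\n", option_mask("overwrite"));
	return 0;
}
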
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -272,8 +272,8 @@ struct tracer { | |||
| 272 | /* If you handled the flag setting, return 0 */ | 272 | /* If you handled the flag setting, return 0 */ |
| 273 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 273 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
| 274 | struct tracer *next; | 274 | struct tracer *next; |
| 275 | int print_max; | ||
| 276 | struct tracer_flags *flags; | 275 | struct tracer_flags *flags; |
| 276 | int print_max; | ||
| 277 | int use_max_tr; | 277 | int use_max_tr; |
| 278 | }; | 278 | }; |
| 279 | 279 | ||
| @@ -606,6 +606,7 @@ enum trace_iterator_flags { | |||
| 606 | TRACE_ITER_SLEEP_TIME = 0x40000, | 606 | TRACE_ITER_SLEEP_TIME = 0x40000, |
| 607 | TRACE_ITER_GRAPH_TIME = 0x80000, | 607 | TRACE_ITER_GRAPH_TIME = 0x80000, |
| 608 | TRACE_ITER_RECORD_CMD = 0x100000, | 608 | TRACE_ITER_RECORD_CMD = 0x100000, |
| 609 | TRACE_ITER_OVERWRITE = 0x200000, | ||
| 609 | }; | 610 | }; |
| 610 | 611 | ||
| 611 | /* | 612 | /* |
| @@ -661,8 +662,10 @@ struct ftrace_event_field { | |||
| 661 | }; | 662 | }; |
| 662 | 663 | ||
| 663 | struct event_filter { | 664 | struct event_filter { |
| 664 | int n_preds; | 665 | int n_preds; /* Number assigned */ |
| 665 | struct filter_pred **preds; | 666 | int a_preds; /* allocated */ |
| 667 | struct filter_pred *preds; | ||
| 668 | struct filter_pred *root; | ||
| 666 | char *filter_string; | 669 | char *filter_string; |
| 667 | }; | 670 | }; |
| 668 | 671 | ||
| @@ -674,11 +677,23 @@ struct event_subsystem { | |||
| 674 | int nr_events; | 677 | int nr_events; |
| 675 | }; | 678 | }; |
| 676 | 679 | ||
| 680 | #define FILTER_PRED_INVALID ((unsigned short)-1) | ||
| 681 | #define FILTER_PRED_IS_RIGHT (1 << 15) | ||
| 682 | #define FILTER_PRED_FOLD (1 << 15) | ||
| 683 | |||
| 684 | /* | ||
| 685 | * The max preds is the size of unsigned short with | ||
| 686 | * two flags at the MSBs. One bit is used for both the IS_RIGHT | ||
| 687 | * and FOLD flags. The other is reserved. | ||
| 688 | * | ||
| 689 | * 2^14 preds is way more than enough. | ||
| 690 | */ | ||
| 691 | #define MAX_FILTER_PRED 16384 | ||
| 692 | |||
| 677 | struct filter_pred; | 693 | struct filter_pred; |
| 678 | struct regex; | 694 | struct regex; |
| 679 | 695 | ||
| 680 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, | 696 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); |
| 681 | int val1, int val2); | ||
| 682 | 697 | ||
| 683 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); | 698 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); |
| 684 | 699 | ||
| @@ -700,11 +715,23 @@ struct filter_pred { | |||
| 700 | filter_pred_fn_t fn; | 715 | filter_pred_fn_t fn; |
| 701 | u64 val; | 716 | u64 val; |
| 702 | struct regex regex; | 717 | struct regex regex; |
| 703 | char *field_name; | 718 | /* |
| 719 | * Leaf nodes use field_name, ops is used by AND and OR | ||
| 720 | * nodes. The field_name is always freed when freeing a pred. | ||
| 721 | * We can overload field_name for ops and have it freed | ||
| 722 | * as well. | ||
| 723 | */ | ||
| 724 | union { | ||
| 725 | char *field_name; | ||
| 726 | unsigned short *ops; | ||
| 727 | }; | ||
| 704 | int offset; | 728 | int offset; |
| 705 | int not; | 729 | int not; |
| 706 | int op; | 730 | int op; |
| 707 | int pop_n; | 731 | unsigned short index; |
| 732 | unsigned short parent; | ||
| 733 | unsigned short left; | ||
| 734 | unsigned short right; | ||
| 708 | }; | 735 | }; |
| 709 | 736 | ||
| 710 | extern struct list_head ftrace_common_fields; | 737 | extern struct list_head ftrace_common_fields; |
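
The new index/parent/left/right fields above pack the whole predicate tree into array positions: the low 14 bits of an unsigned short hold the index, bit 15 doubles as IS_RIGHT (in a parent reference) or FOLD (in a node's own index), and the remaining bit is reserved, hence the 2^14 limit. A tiny standalone sketch of that encoding; the values mirror the defines in the diff, the helpers are illustrative.

#include <stdio.h>

#define FILTER_PRED_INVALID  ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define MAX_FILTER_PRED      16384

static unsigned short make_parent(unsigned short idx, int is_right)
{
	return is_right ? (idx | FILTER_PRED_IS_RIGHT) : idx;
}

int main(void)
{
	unsigned short parent = make_parent(5, 1);	/* right child of pred 5 */

	printf("parent index : %u\n", parent & ~FILTER_PRED_IS_RIGHT);
	printf("came from    : %s\n",
	       (parent & FILTER_PRED_IS_RIGHT) ? "right" : "left");
	printf("max preds    : %d (2^14)\n", MAX_FILTER_PRED);
	return 0;
}
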
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
| @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) | |||
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | /* | 48 | /* |
| 49 | * trace_clock(): 'inbetween' trace clock. Not completely serialized, | 49 | * trace_clock(): 'between' trace clock. Not completely serialized, |
| 50 | * but not completely incorrect when crossing CPUs either. | 50 | * but not completely incorrect when crossing CPUs either. |
| 51 | * | 51 | * |
| 52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of | 52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | * in the structure. | 27 | * in the structure. |
| 28 | * | 28 | * |
| 29 | * * for structures within structures, the format of the internal | 29 | * * for structures within structures, the format of the internal |
| 30 | * structure is layed out. This allows the internal structure | 30 | * structure is laid out. This allows the internal structure |
| 31 | * to be deciphered for the format file. Although these macros | 31 | * to be deciphered for the format file. Although these macros |
| 32 | * may become out of sync with the internal structure, they | 32 | * may become out of sync with the internal structure, they |
| 33 | * will create a compile error if it happens. Since the | 33 | * will create a compile error if it happens. Since the |
| @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | |||
| 109 | */ | 109 | */ |
| 110 | #define FTRACE_CTX_FIELDS \ | 110 | #define FTRACE_CTX_FIELDS \ |
| 111 | __field( unsigned int, prev_pid ) \ | 111 | __field( unsigned int, prev_pid ) \ |
| 112 | __field( unsigned int, next_pid ) \ | ||
| 113 | __field( unsigned int, next_cpu ) \ | ||
| 112 | __field( unsigned char, prev_prio ) \ | 114 | __field( unsigned char, prev_prio ) \ |
| 113 | __field( unsigned char, prev_state ) \ | 115 | __field( unsigned char, prev_state ) \ |
| 114 | __field( unsigned int, next_pid ) \ | ||
| 115 | __field( unsigned char, next_prio ) \ | 116 | __field( unsigned char, next_prio ) \ |
| 116 | __field( unsigned char, next_state ) \ | 117 | __field( unsigned char, next_state ) |
| 117 | __field( unsigned int, next_cpu ) | ||
| 118 | 118 | ||
| 119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, | 119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, |
| 120 | 120 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..2fe110341359 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -116,7 +116,7 @@ static int trace_define_common_fields(void) | |||
| 116 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
| 117 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
| 118 | __common_field(int, pid); | 118 | __common_field(int, pid); |
| 119 | __common_field(int, lock_depth); | 119 | __common_field(int, padding); |
| 120 | 120 | ||
| 121 | return ret; | 121 | return ret; |
| 122 | } | 122 | } |
| @@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
| 326 | { | 326 | { |
| 327 | return __ftrace_set_clr_event(NULL, system, event, set); | 327 | return __ftrace_set_clr_event(NULL, system, event, set); |
| 328 | } | 328 | } |
| 329 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | ||
| 329 | 330 | ||
| 330 | /* 128 should be much more than enough */ | 331 | /* 128 should be much more than enough */ |
| 331 | #define EVENT_BUF_SIZE 127 | 332 | #define EVENT_BUF_SIZE 127 |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..8008ddcfbf20 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -123,9 +123,13 @@ struct filter_parse_state { | |||
| 123 | } operand; | 123 | } operand; |
| 124 | }; | 124 | }; |
| 125 | 125 | ||
| 126 | struct pred_stack { | ||
| 127 | struct filter_pred **preds; | ||
| 128 | int index; | ||
| 129 | }; | ||
| 130 | |||
| 126 | #define DEFINE_COMPARISON_PRED(type) \ | 131 | #define DEFINE_COMPARISON_PRED(type) \ |
| 127 | static int filter_pred_##type(struct filter_pred *pred, void *event, \ | 132 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
| 128 | int val1, int val2) \ | ||
| 129 | { \ | 133 | { \ |
| 130 | type *addr = (type *)(event + pred->offset); \ | 134 | type *addr = (type *)(event + pred->offset); \ |
| 131 | type val = (type)pred->val; \ | 135 | type val = (type)pred->val; \ |
| @@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ | |||
| 152 | } | 156 | } |
| 153 | 157 | ||
| 154 | #define DEFINE_EQUALITY_PRED(size) \ | 158 | #define DEFINE_EQUALITY_PRED(size) \ |
| 155 | static int filter_pred_##size(struct filter_pred *pred, void *event, \ | 159 | static int filter_pred_##size(struct filter_pred *pred, void *event) \ |
| 156 | int val1, int val2) \ | ||
| 157 | { \ | 160 | { \ |
| 158 | u##size *addr = (u##size *)(event + pred->offset); \ | 161 | u##size *addr = (u##size *)(event + pred->offset); \ |
| 159 | u##size val = (u##size)pred->val; \ | 162 | u##size val = (u##size)pred->val; \ |
| @@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); | |||
| 178 | DEFINE_EQUALITY_PRED(16); | 181 | DEFINE_EQUALITY_PRED(16); |
| 179 | DEFINE_EQUALITY_PRED(8); | 182 | DEFINE_EQUALITY_PRED(8); |
| 180 | 183 | ||
| 181 | static int filter_pred_and(struct filter_pred *pred __attribute((unused)), | ||
| 182 | void *event __attribute((unused)), | ||
| 183 | int val1, int val2) | ||
| 184 | { | ||
| 185 | return val1 && val2; | ||
| 186 | } | ||
| 187 | |||
| 188 | static int filter_pred_or(struct filter_pred *pred __attribute((unused)), | ||
| 189 | void *event __attribute((unused)), | ||
| 190 | int val1, int val2) | ||
| 191 | { | ||
| 192 | return val1 || val2; | ||
| 193 | } | ||
| 194 | |||
| 195 | /* Filter predicate for fixed sized arrays of characters */ | 184 | /* Filter predicate for fixed sized arrays of characters */ |
| 196 | static int filter_pred_string(struct filter_pred *pred, void *event, | 185 | static int filter_pred_string(struct filter_pred *pred, void *event) |
| 197 | int val1, int val2) | ||
| 198 | { | 186 | { |
| 199 | char *addr = (char *)(event + pred->offset); | 187 | char *addr = (char *)(event + pred->offset); |
| 200 | int cmp, match; | 188 | int cmp, match; |
| @@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, | |||
| 207 | } | 195 | } |
| 208 | 196 | ||
| 209 | /* Filter predicate for char * pointers */ | 197 | /* Filter predicate for char * pointers */ |
| 210 | static int filter_pred_pchar(struct filter_pred *pred, void *event, | 198 | static int filter_pred_pchar(struct filter_pred *pred, void *event) |
| 211 | int val1, int val2) | ||
| 212 | { | 199 | { |
| 213 | char **addr = (char **)(event + pred->offset); | 200 | char **addr = (char **)(event + pred->offset); |
| 214 | int cmp, match; | 201 | int cmp, match; |
| @@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, | |||
| 231 | * and add it to the address of the entry, and at last we have | 218 | * and add it to the address of the entry, and at last we have |
| 232 | * the address of the string. | 219 | * the address of the string. |
| 233 | */ | 220 | */ |
| 234 | static int filter_pred_strloc(struct filter_pred *pred, void *event, | 221 | static int filter_pred_strloc(struct filter_pred *pred, void *event) |
| 235 | int val1, int val2) | ||
| 236 | { | 222 | { |
| 237 | u32 str_item = *(u32 *)(event + pred->offset); | 223 | u32 str_item = *(u32 *)(event + pred->offset); |
| 238 | int str_loc = str_item & 0xffff; | 224 | int str_loc = str_item & 0xffff; |
| @@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, | |||
| 247 | return match; | 233 | return match; |
| 248 | } | 234 | } |
| 249 | 235 | ||
| 250 | static int filter_pred_none(struct filter_pred *pred, void *event, | 236 | static int filter_pred_none(struct filter_pred *pred, void *event) |
| 251 | int val1, int val2) | ||
| 252 | { | 237 | { |
| 253 | return 0; | 238 | return 0; |
| 254 | } | 239 | } |
| @@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) | |||
| 377 | pred->not ^= not; | 362 | pred->not ^= not; |
| 378 | } | 363 | } |
| 379 | 364 | ||
| 365 | enum move_type { | ||
| 366 | MOVE_DOWN, | ||
| 367 | MOVE_UP_FROM_LEFT, | ||
| 368 | MOVE_UP_FROM_RIGHT | ||
| 369 | }; | ||
| 370 | |||
| 371 | static struct filter_pred * | ||
| 372 | get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | ||
| 373 | int index, enum move_type *move) | ||
| 374 | { | ||
| 375 | if (pred->parent & FILTER_PRED_IS_RIGHT) | ||
| 376 | *move = MOVE_UP_FROM_RIGHT; | ||
| 377 | else | ||
| 378 | *move = MOVE_UP_FROM_LEFT; | ||
| 379 | pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; | ||
| 380 | |||
| 381 | return pred; | ||
| 382 | } | ||
| 383 | |||
| 384 | /* | ||
| 385 | * A series of AND or ORs were found together. Instead of | ||
| 386 | * climbing up and down the tree branches, an array of the | ||
| 387 | * ops were made in order of checks. We can just move across | ||
| 388 | * the array and short circuit if needed. | ||
| 389 | */ | ||
| 390 | static int process_ops(struct filter_pred *preds, | ||
| 391 | struct filter_pred *op, void *rec) | ||
| 392 | { | ||
| 393 | struct filter_pred *pred; | ||
| 394 | int match = 0; | ||
| 395 | int type; | ||
| 396 | int i; | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Micro-optimization: We set type to true if op | ||
| 400 | * is an OR and false otherwise (AND). Then we | ||
| 401 | * just need to test if the match is equal to | ||
| 402 | * the type, and if it is, we can short circuit the | ||
| 403 | * rest of the checks: | ||
| 404 | * | ||
| 405 | * if ((match && op->op == OP_OR) || | ||
| 406 | * (!match && op->op == OP_AND)) | ||
| 407 | * return match; | ||
| 408 | */ | ||
| 409 | type = op->op == OP_OR; | ||
| 410 | |||
| 411 | for (i = 0; i < op->val; i++) { | ||
| 412 | pred = &preds[op->ops[i]]; | ||
| 413 | match = pred->fn(pred, rec); | ||
| 414 | if (!!match == type) | ||
| 415 | return match; | ||
| 416 | } | ||
| 417 | return match; | ||
| 418 | } | ||
| 419 | |||
| 380 | /* return 1 if event matches, 0 otherwise (discard) */ | 420 | /* return 1 if event matches, 0 otherwise (discard) */ |
| 381 | int filter_match_preds(struct event_filter *filter, void *rec) | 421 | int filter_match_preds(struct event_filter *filter, void *rec) |
| 382 | { | 422 | { |
| 383 | int match, top = 0, val1 = 0, val2 = 0; | 423 | int match = -1; |
| 384 | int stack[MAX_FILTER_PRED]; | 424 | enum move_type move = MOVE_DOWN; |
| 425 | struct filter_pred *preds; | ||
| 385 | struct filter_pred *pred; | 426 | struct filter_pred *pred; |
| 386 | int i; | 427 | struct filter_pred *root; |
| 428 | int n_preds; | ||
| 429 | int done = 0; | ||
| 430 | |||
| 431 | /* no filter is considered a match */ | ||
| 432 | if (!filter) | ||
| 433 | return 1; | ||
| 434 | |||
| 435 | n_preds = filter->n_preds; | ||
| 436 | |||
| 437 | if (!n_preds) | ||
| 438 | return 1; | ||
| 439 | |||
| 440 | /* | ||
| 441 | * n_preds, root and filter->preds are protected with preemption disabled. | ||
| 442 | */ | ||
| 443 | preds = rcu_dereference_sched(filter->preds); | ||
| 444 | root = rcu_dereference_sched(filter->root); | ||
| 445 | if (!root) | ||
| 446 | return 1; | ||
| 447 | |||
| 448 | pred = root; | ||
| 387 | 449 | ||
| 388 | for (i = 0; i < filter->n_preds; i++) { | 450 | /* match is currently meaningless */ |
| 389 | pred = filter->preds[i]; | 451 | match = -1; |
| 390 | if (!pred->pop_n) { | 452 | |
| 391 | match = pred->fn(pred, rec, val1, val2); | 453 | do { |
| 392 | stack[top++] = match; | 454 | switch (move) { |
| 455 | case MOVE_DOWN: | ||
| 456 | /* only AND and OR have children */ | ||
| 457 | if (pred->left != FILTER_PRED_INVALID) { | ||
| 458 | /* If ops is set, then it was folded. */ | ||
| 459 | if (!pred->ops) { | ||
| 460 | /* keep going to down the left side */ | ||
| 461 | pred = &preds[pred->left]; | ||
| 462 | continue; | ||
| 463 | } | ||
| 464 | /* We can treat folded ops as a leaf node */ | ||
| 465 | match = process_ops(preds, pred, rec); | ||
| 466 | } else | ||
| 467 | match = pred->fn(pred, rec); | ||
| 468 | /* If this pred is the only pred */ | ||
| 469 | if (pred == root) | ||
| 470 | break; | ||
| 471 | pred = get_pred_parent(pred, preds, | ||
| 472 | pred->parent, &move); | ||
| 473 | continue; | ||
| 474 | case MOVE_UP_FROM_LEFT: | ||
| 475 | /* | ||
| 476 | * Check for short circuits. | ||
| 477 | * | ||
| 478 | * Optimization: !!match == (pred->op == OP_OR) | ||
| 479 | * is the same as: | ||
| 480 | * if ((match && pred->op == OP_OR) || | ||
| 481 | * (!match && pred->op == OP_AND)) | ||
| 482 | */ | ||
| 483 | if (!!match == (pred->op == OP_OR)) { | ||
| 484 | if (pred == root) | ||
| 485 | break; | ||
| 486 | pred = get_pred_parent(pred, preds, | ||
| 487 | pred->parent, &move); | ||
| 488 | continue; | ||
| 489 | } | ||
| 490 | /* now go down the right side of the tree. */ | ||
| 491 | pred = &preds[pred->right]; | ||
| 492 | move = MOVE_DOWN; | ||
| 493 | continue; | ||
| 494 | case MOVE_UP_FROM_RIGHT: | ||
| 495 | /* We finished this equation. */ | ||
| 496 | if (pred == root) | ||
| 497 | break; | ||
| 498 | pred = get_pred_parent(pred, preds, | ||
| 499 | pred->parent, &move); | ||
| 393 | continue; | 500 | continue; |
| 394 | } | 501 | } |
| 395 | if (pred->pop_n > top) { | 502 | done = 1; |
| 396 | WARN_ON_ONCE(1); | 503 | } while (!done); |
| 397 | return 0; | ||
| 398 | } | ||
| 399 | val1 = stack[--top]; | ||
| 400 | val2 = stack[--top]; | ||
| 401 | match = pred->fn(pred, rec, val1, val2); | ||
| 402 | stack[top++] = match; | ||
| 403 | } | ||
| 404 | 504 | ||
| 405 | return stack[--top]; | 505 | return match; |
| 406 | } | 506 | } |
| 407 | EXPORT_SYMBOL_GPL(filter_match_preds); | 507 | EXPORT_SYMBOL_GPL(filter_match_preds); |
| 408 | 508 | ||
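
filter_match_preds() above replaces the old fixed-size evaluation stack with an iterative walk over the pred array: the parent index plus the IS_RIGHT bit is enough to climb back up, and the walk short-circuits as soon as an AND sees a false result or an OR sees a true one. Below is a self-contained sketch of that walk on a tiny hard-coded tree for (a && b) || c; the types and helpers are simplified stand-ins for the kernel's, and the leaf "predicate" is just a stored value.

#include <stdio.h>

#define INVALID  ((unsigned short)-1)
#define IS_RIGHT (1u << 15)

enum { OP_OR, OP_AND, OP_LEAF };
enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct pred {
	int op;				/* OP_AND / OP_OR / OP_LEAF */
	int value;			/* leaf result for this "event" */
	unsigned short left, right, parent, index;
};

/* Tree for (a && b) || c:  0:OR  1:AND  2:a=0  3:b=1  4:c=1 */
static struct pred preds[] = {
	[0] = { OP_OR,   0, 1, 4, 0, 0 },
	[1] = { OP_AND,  0, 2, 3, 0, 1 },			 /* left child of 0  */
	[2] = { OP_LEAF, 0, INVALID, INVALID, 1, 2 },		 /* left child of 1  */
	[3] = { OP_LEAF, 1, INVALID, INVALID, 1 | IS_RIGHT, 3 },/* right child of 1 */
	[4] = { OP_LEAF, 1, INVALID, INVALID, 0 | IS_RIGHT, 4 },/* right child of 0 */
};

static struct pred *parent_of(struct pred *p, enum move *move)
{
	*move = (p->parent & IS_RIGHT) ? MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
	return &preds[p->parent & ~IS_RIGHT];
}

static int match_preds(struct pred *root)
{
	struct pred *pred = root;
	enum move move = MOVE_DOWN;
	int match = -1;

	for (;;) {
		switch (move) {
		case MOVE_DOWN:
			if (pred->left != INVALID) {	/* AND/OR: keep descending left */
				pred = &preds[pred->left];
				continue;
			}
			match = pred->value;		/* leaf: evaluate */
			if (pred == root)
				return match;
			pred = parent_of(pred, &move);
			continue;
		case MOVE_UP_FROM_LEFT:
			/* Short circuit: true OR, or false AND, is decided already. */
			if (!!match == (pred->op == OP_OR)) {
				if (pred == root)
					return match;
				pred = parent_of(pred, &move);
				continue;
			}
			pred = &preds[pred->right];	/* otherwise evaluate the right side */
			move = MOVE_DOWN;
			continue;
		case MOVE_UP_FROM_RIGHT:
			if (pred == root)
				return match;
			pred = parent_of(pred, &move);
			continue;
		}
	}
}

int main(void)
{
	printf("(0 && 1) || 1 evaluates to %d\n", match_preds(&preds[0]));
	return 0;
}
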
| @@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) | |||
| 414 | 514 | ||
| 415 | static void remove_filter_string(struct event_filter *filter) | 515 | static void remove_filter_string(struct event_filter *filter) |
| 416 | { | 516 | { |
| 517 | if (!filter) | ||
| 518 | return; | ||
| 519 | |||
| 417 | kfree(filter->filter_string); | 520 | kfree(filter->filter_string); |
| 418 | filter->filter_string = NULL; | 521 | filter->filter_string = NULL; |
| 419 | } | 522 | } |
| @@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
| 473 | 576 | ||
| 474 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 577 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) |
| 475 | { | 578 | { |
| 476 | struct event_filter *filter = call->filter; | 579 | struct event_filter *filter; |
| 477 | 580 | ||
| 478 | mutex_lock(&event_mutex); | 581 | mutex_lock(&event_mutex); |
| 582 | filter = call->filter; | ||
| 479 | if (filter && filter->filter_string) | 583 | if (filter && filter->filter_string) |
| 480 | trace_seq_printf(s, "%s\n", filter->filter_string); | 584 | trace_seq_printf(s, "%s\n", filter->filter_string); |
| 481 | else | 585 | else |
| @@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | |||
| 486 | void print_subsystem_event_filter(struct event_subsystem *system, | 590 | void print_subsystem_event_filter(struct event_subsystem *system, |
| 487 | struct trace_seq *s) | 591 | struct trace_seq *s) |
| 488 | { | 592 | { |
| 489 | struct event_filter *filter = system->filter; | 593 | struct event_filter *filter; |
| 490 | 594 | ||
| 491 | mutex_lock(&event_mutex); | 595 | mutex_lock(&event_mutex); |
| 596 | filter = system->filter; | ||
| 492 | if (filter && filter->filter_string) | 597 | if (filter && filter->filter_string) |
| 493 | trace_seq_printf(s, "%s\n", filter->filter_string); | 598 | trace_seq_printf(s, "%s\n", filter->filter_string); |
| 494 | else | 599 | else |
| @@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) | |||
| 539 | pred->regex.len = 0; | 644 | pred->regex.len = 0; |
| 540 | } | 645 | } |
| 541 | 646 | ||
| 542 | static int filter_set_pred(struct filter_pred *dest, | 647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
| 648 | { | ||
| 649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | ||
| 650 | if (!stack->preds) | ||
| 651 | return -ENOMEM; | ||
| 652 | stack->index = n_preds; | ||
| 653 | return 0; | ||
| 654 | } | ||
| 655 | |||
| 656 | static void __free_pred_stack(struct pred_stack *stack) | ||
| 657 | { | ||
| 658 | kfree(stack->preds); | ||
| 659 | stack->index = 0; | ||
| 660 | } | ||
| 661 | |||
| 662 | static int __push_pred_stack(struct pred_stack *stack, | ||
| 663 | struct filter_pred *pred) | ||
| 664 | { | ||
| 665 | int index = stack->index; | ||
| 666 | |||
| 667 | if (WARN_ON(index == 0)) | ||
| 668 | return -ENOSPC; | ||
| 669 | |||
| 670 | stack->preds[--index] = pred; | ||
| 671 | stack->index = index; | ||
| 672 | return 0; | ||
| 673 | } | ||
| 674 | |||
| 675 | static struct filter_pred * | ||
| 676 | __pop_pred_stack(struct pred_stack *stack) | ||
| 677 | { | ||
| 678 | struct filter_pred *pred; | ||
| 679 | int index = stack->index; | ||
| 680 | |||
| 681 | pred = stack->preds[index++]; | ||
| 682 | if (!pred) | ||
| 683 | return NULL; | ||
| 684 | |||
| 685 | stack->index = index; | ||
| 686 | return pred; | ||
| 687 | } | ||
| 688 | |||
| 689 | static int filter_set_pred(struct event_filter *filter, | ||
| 690 | int idx, | ||
| 691 | struct pred_stack *stack, | ||
| 543 | struct filter_pred *src, | 692 | struct filter_pred *src, |
| 544 | filter_pred_fn_t fn) | 693 | filter_pred_fn_t fn) |
| 545 | { | 694 | { |
| 695 | struct filter_pred *dest = &filter->preds[idx]; | ||
| 696 | struct filter_pred *left; | ||
| 697 | struct filter_pred *right; | ||
| 698 | |||
| 546 | *dest = *src; | 699 | *dest = *src; |
| 547 | if (src->field_name) { | 700 | if (src->field_name) { |
| 548 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | 701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); |
| @@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, | |||
| 550 | return -ENOMEM; | 703 | return -ENOMEM; |
| 551 | } | 704 | } |
| 552 | dest->fn = fn; | 705 | dest->fn = fn; |
| 706 | dest->index = idx; | ||
| 553 | 707 | ||
| 554 | return 0; | 708 | if (dest->op == OP_OR || dest->op == OP_AND) { |
| 709 | right = __pop_pred_stack(stack); | ||
| 710 | left = __pop_pred_stack(stack); | ||
| 711 | if (!left || !right) | ||
| 712 | return -EINVAL; | ||
| 713 | /* | ||
| 714 | * If both children can be folded | ||
| 715 | * and they are the same op as this op or a leaf, | ||
| 716 | * then this op can be folded. | ||
| 717 | */ | ||
| 718 | if (left->index & FILTER_PRED_FOLD && | ||
| 719 | (left->op == dest->op || | ||
| 720 | left->left == FILTER_PRED_INVALID) && | ||
| 721 | right->index & FILTER_PRED_FOLD && | ||
| 722 | (right->op == dest->op || | ||
| 723 | right->left == FILTER_PRED_INVALID)) | ||
| 724 | dest->index |= FILTER_PRED_FOLD; | ||
| 725 | |||
| 726 | dest->left = left->index & ~FILTER_PRED_FOLD; | ||
| 727 | dest->right = right->index & ~FILTER_PRED_FOLD; | ||
| 728 | left->parent = dest->index & ~FILTER_PRED_FOLD; | ||
| 729 | right->parent = dest->index | FILTER_PRED_IS_RIGHT; | ||
| 730 | } else { | ||
| 731 | /* | ||
| 732 | * Make dest->left invalid to be used as a quick | ||
| 733 | * way to know this is a leaf node. | ||
| 734 | */ | ||
| 735 | dest->left = FILTER_PRED_INVALID; | ||
| 736 | |||
| 737 | /* All leafs allow folding the parent ops. */ | ||
| 738 | dest->index |= FILTER_PRED_FOLD; | ||
| 739 | } | ||
| 740 | |||
| 741 | return __push_pred_stack(stack, dest); | ||
| 555 | } | 742 | } |
| 556 | 743 | ||
| 557 | static void filter_disable_preds(struct ftrace_event_call *call) | 744 | static void __free_preds(struct event_filter *filter) |
| 558 | { | 745 | { |
| 559 | struct event_filter *filter = call->filter; | ||
| 560 | int i; | 746 | int i; |
| 561 | 747 | ||
| 562 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | 748 | if (filter->preds) { |
| 749 | for (i = 0; i < filter->a_preds; i++) | ||
| 750 | kfree(filter->preds[i].field_name); | ||
| 751 | kfree(filter->preds); | ||
| 752 | filter->preds = NULL; | ||
| 753 | } | ||
| 754 | filter->a_preds = 0; | ||
| 563 | filter->n_preds = 0; | 755 | filter->n_preds = 0; |
| 564 | |||
| 565 | for (i = 0; i < MAX_FILTER_PRED; i++) | ||
| 566 | filter->preds[i]->fn = filter_pred_none; | ||
| 567 | } | 756 | } |
| 568 | 757 | ||
| 569 | static void __free_preds(struct event_filter *filter) | 758 | static void filter_disable(struct ftrace_event_call *call) |
| 570 | { | 759 | { |
| 571 | int i; | 760 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
| 761 | } | ||
| 572 | 762 | ||
| 763 | static void __free_filter(struct event_filter *filter) | ||
| 764 | { | ||
| 573 | if (!filter) | 765 | if (!filter) |
| 574 | return; | 766 | return; |
| 575 | 767 | ||
| 576 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 768 | __free_preds(filter); |
| 577 | if (filter->preds[i]) | ||
| 578 | filter_free_pred(filter->preds[i]); | ||
| 579 | } | ||
| 580 | kfree(filter->preds); | ||
| 581 | kfree(filter->filter_string); | 769 | kfree(filter->filter_string); |
| 582 | kfree(filter); | 770 | kfree(filter); |
| 583 | } | 771 | } |
| 584 | 772 | ||
| 773 | /* | ||
| 774 | * Called when destroying the ftrace_event_call. | ||
| 775 | * The call is being freed, so we do not need to worry about | ||
| 776 | * the call being currently used. This is for module code removing | ||
| 777 | * the tracepoints from within it. | ||
| 778 | */ | ||
| 585 | void destroy_preds(struct ftrace_event_call *call) | 779 | void destroy_preds(struct ftrace_event_call *call) |
| 586 | { | 780 | { |
| 587 | __free_preds(call->filter); | 781 | __free_filter(call->filter); |
| 588 | call->filter = NULL; | 782 | call->filter = NULL; |
| 589 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
| 590 | } | 783 | } |
| 591 | 784 | ||
| 592 | static struct event_filter *__alloc_preds(void) | 785 | static struct event_filter *__alloc_filter(void) |
| 593 | { | 786 | { |
| 594 | struct event_filter *filter; | 787 | struct event_filter *filter; |
| 788 | |||
| 789 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | ||
| 790 | return filter; | ||
| 791 | } | ||
| 792 | |||
| 793 | static int __alloc_preds(struct event_filter *filter, int n_preds) | ||
| 794 | { | ||
| 595 | struct filter_pred *pred; | 795 | struct filter_pred *pred; |
| 596 | int i; | 796 | int i; |
| 597 | 797 | ||
| 598 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | 798 | if (filter->preds) |
| 599 | if (!filter) | 799 | __free_preds(filter); |
| 600 | return ERR_PTR(-ENOMEM); | ||
| 601 | 800 | ||
| 602 | filter->n_preds = 0; | 801 | filter->preds = |
| 802 | kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); | ||
| 603 | 803 | ||
| 604 | filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); | ||
| 605 | if (!filter->preds) | 804 | if (!filter->preds) |
| 606 | goto oom; | 805 | return -ENOMEM; |
| 607 | 806 | ||
| 608 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 807 | filter->a_preds = n_preds; |
| 609 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 808 | filter->n_preds = 0; |
| 610 | if (!pred) | 809 | |
| 611 | goto oom; | 810 | for (i = 0; i < n_preds; i++) { |
| 811 | pred = &filter->preds[i]; | ||
| 612 | pred->fn = filter_pred_none; | 812 | pred->fn = filter_pred_none; |
| 613 | filter->preds[i] = pred; | ||
| 614 | } | 813 | } |
| 615 | 814 | ||
| 616 | return filter; | ||
| 617 | |||
| 618 | oom: | ||
| 619 | __free_preds(filter); | ||
| 620 | return ERR_PTR(-ENOMEM); | ||
| 621 | } | ||
| 622 | |||
| 623 | static int init_preds(struct ftrace_event_call *call) | ||
| 624 | { | ||
| 625 | if (call->filter) | ||
| 626 | return 0; | ||
| 627 | |||
| 628 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
| 629 | call->filter = __alloc_preds(); | ||
| 630 | if (IS_ERR(call->filter)) | ||
| 631 | return PTR_ERR(call->filter); | ||
| 632 | |||
| 633 | return 0; | 815 | return 0; |
| 634 | } | 816 | } |
| 635 | 817 | ||
| 636 | static int init_subsystem_preds(struct event_subsystem *system) | 818 | static void filter_free_subsystem_preds(struct event_subsystem *system) |
| 637 | { | 819 | { |
| 638 | struct ftrace_event_call *call; | 820 | struct ftrace_event_call *call; |
| 639 | int err; | ||
| 640 | 821 | ||
| 641 | list_for_each_entry(call, &ftrace_events, list) { | 822 | list_for_each_entry(call, &ftrace_events, list) { |
| 642 | if (strcmp(call->class->system, system->name) != 0) | 823 | if (strcmp(call->class->system, system->name) != 0) |
| 643 | continue; | 824 | continue; |
| 644 | 825 | ||
| 645 | err = init_preds(call); | 826 | filter_disable(call); |
| 646 | if (err) | 827 | remove_filter_string(call->filter); |
| 647 | return err; | ||
| 648 | } | 828 | } |
| 649 | |||
| 650 | return 0; | ||
| 651 | } | 829 | } |
| 652 | 830 | ||
| 653 | static void filter_free_subsystem_preds(struct event_subsystem *system) | 831 | static void filter_free_subsystem_filters(struct event_subsystem *system) |
| 654 | { | 832 | { |
| 655 | struct ftrace_event_call *call; | 833 | struct ftrace_event_call *call; |
| 656 | 834 | ||
| 657 | list_for_each_entry(call, &ftrace_events, list) { | 835 | list_for_each_entry(call, &ftrace_events, list) { |
| 658 | if (strcmp(call->class->system, system->name) != 0) | 836 | if (strcmp(call->class->system, system->name) != 0) |
| 659 | continue; | 837 | continue; |
| 660 | 838 | __free_filter(call->filter); | |
| 661 | filter_disable_preds(call); | 839 | call->filter = NULL; |
| 662 | remove_filter_string(call->filter); | ||
| 663 | } | 840 | } |
| 664 | } | 841 | } |
| 665 | 842 | ||
| @@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, | |||
| 667 | struct ftrace_event_call *call, | 844 | struct ftrace_event_call *call, |
| 668 | struct event_filter *filter, | 845 | struct event_filter *filter, |
| 669 | struct filter_pred *pred, | 846 | struct filter_pred *pred, |
| 847 | struct pred_stack *stack, | ||
| 670 | filter_pred_fn_t fn) | 848 | filter_pred_fn_t fn) |
| 671 | { | 849 | { |
| 672 | int idx, err; | 850 | int idx, err; |
| 673 | 851 | ||
| 674 | if (filter->n_preds == MAX_FILTER_PRED) { | 852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
| 675 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
| 676 | return -ENOSPC; | 854 | return -ENOSPC; |
| 677 | } | 855 | } |
| 678 | 856 | ||
| 679 | idx = filter->n_preds; | 857 | idx = filter->n_preds; |
| 680 | filter_clear_pred(filter->preds[idx]); | 858 | filter_clear_pred(&filter->preds[idx]); |
| 681 | err = filter_set_pred(filter->preds[idx], pred, fn); | 859 | err = filter_set_pred(filter, idx, stack, pred, fn); |
| 682 | if (err) | 860 | if (err) |
| 683 | return err; | 861 | return err; |
| 684 | 862 | ||
| @@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
| 763 | struct ftrace_event_call *call, | 941 | struct ftrace_event_call *call, |
| 764 | struct event_filter *filter, | 942 | struct event_filter *filter, |
| 765 | struct filter_pred *pred, | 943 | struct filter_pred *pred, |
| 944 | struct pred_stack *stack, | ||
| 766 | bool dry_run) | 945 | bool dry_run) |
| 767 | { | 946 | { |
| 768 | struct ftrace_event_field *field; | 947 | struct ftrace_event_field *field; |
| @@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
| 770 | unsigned long long val; | 949 | unsigned long long val; |
| 771 | int ret; | 950 | int ret; |
| 772 | 951 | ||
| 773 | pred->fn = filter_pred_none; | 952 | fn = pred->fn = filter_pred_none; |
| 774 | 953 | ||
| 775 | if (pred->op == OP_AND) { | 954 | if (pred->op == OP_AND) |
| 776 | pred->pop_n = 2; | ||
| 777 | fn = filter_pred_and; | ||
| 778 | goto add_pred_fn; | 955 | goto add_pred_fn; |
| 779 | } else if (pred->op == OP_OR) { | 956 | else if (pred->op == OP_OR) |
| 780 | pred->pop_n = 2; | ||
| 781 | fn = filter_pred_or; | ||
| 782 | goto add_pred_fn; | 957 | goto add_pred_fn; |
| 783 | } | ||
| 784 | 958 | ||
| 785 | field = find_event_field(call, pred->field_name); | 959 | field = find_event_field(call, pred->field_name); |
| 786 | if (!field) { | 960 | if (!field) { |
| @@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
| 829 | 1003 | ||
| 830 | add_pred_fn: | 1004 | add_pred_fn: |
| 831 | if (!dry_run) | 1005 | if (!dry_run) |
| 832 | return filter_add_pred_fn(ps, call, filter, pred, fn); | 1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); |
| 833 | return 0; | 1007 | return 0; |
| 834 | } | 1008 | } |
| 835 | 1009 | ||
| @@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) | |||
| 1187 | return 0; | 1361 | return 0; |
| 1188 | } | 1362 | } |
| 1189 | 1363 | ||
| 1364 | static int count_preds(struct filter_parse_state *ps) | ||
| 1365 | { | ||
| 1366 | struct postfix_elt *elt; | ||
| 1367 | int n_preds = 0; | ||
| 1368 | |||
| 1369 | list_for_each_entry(elt, &ps->postfix, list) { | ||
| 1370 | if (elt->op == OP_NONE) | ||
| 1371 | continue; | ||
| 1372 | n_preds++; | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | return n_preds; | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | /* | ||
| 1379 | * The tree is walked when filtering an event. If the tree is not correctly | ||
| 1380 | * built, it may cause an infinite loop. Check here that the tree does | ||
| 1381 | * indeed terminate. | ||
| 1382 | */ | ||
| 1383 | static int check_pred_tree(struct event_filter *filter, | ||
| 1384 | struct filter_pred *root) | ||
| 1385 | { | ||
| 1386 | struct filter_pred *preds; | ||
| 1387 | struct filter_pred *pred; | ||
| 1388 | enum move_type move = MOVE_DOWN; | ||
| 1389 | int count = 0; | ||
| 1390 | int done = 0; | ||
| 1391 | int max; | ||
| 1392 | |||
| 1393 | /* | ||
| 1394 | * The max that we can hit a node is three times. | ||
| 1395 | * Once going down, once coming up from left, and | ||
| 1396 | * once coming up from right. This is more than enough | ||
| 1397 | * since leafs are only hit a single time. | ||
| 1398 | */ | ||
| 1399 | max = 3 * filter->n_preds; | ||
| 1400 | |||
| 1401 | preds = filter->preds; | ||
| 1402 | if (!preds) | ||
| 1403 | return -EINVAL; | ||
| 1404 | pred = root; | ||
| 1405 | |||
| 1406 | do { | ||
| 1407 | if (WARN_ON(count++ > max)) | ||
| 1408 | return -EINVAL; | ||
| 1409 | |||
| 1410 | switch (move) { | ||
| 1411 | case MOVE_DOWN: | ||
| 1412 | if (pred->left != FILTER_PRED_INVALID) { | ||
| 1413 | pred = &preds[pred->left]; | ||
| 1414 | continue; | ||
| 1415 | } | ||
| 1416 | /* A leaf at the root is just a leaf in the tree */ | ||
| 1417 | if (pred == root) | ||
| 1418 | break; | ||
| 1419 | pred = get_pred_parent(pred, preds, | ||
| 1420 | pred->parent, &move); | ||
| 1421 | continue; | ||
| 1422 | case MOVE_UP_FROM_LEFT: | ||
| 1423 | pred = &preds[pred->right]; | ||
| 1424 | move = MOVE_DOWN; | ||
| 1425 | continue; | ||
| 1426 | case MOVE_UP_FROM_RIGHT: | ||
| 1427 | if (pred == root) | ||
| 1428 | break; | ||
| 1429 | pred = get_pred_parent(pred, preds, | ||
| 1430 | pred->parent, &move); | ||
| 1431 | continue; | ||
| 1432 | } | ||
| 1433 | done = 1; | ||
| 1434 | } while (!done); | ||
| 1435 | |||
| 1436 | /* We are fine. */ | ||
| 1437 | return 0; | ||
| 1438 | } | ||
| 1439 | |||
| 1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | ||
| 1441 | { | ||
| 1442 | struct filter_pred *pred; | ||
| 1443 | enum move_type move = MOVE_DOWN; | ||
| 1444 | int count = 0; | ||
| 1445 | int done = 0; | ||
| 1446 | |||
| 1447 | pred = root; | ||
| 1448 | |||
| 1449 | do { | ||
| 1450 | switch (move) { | ||
| 1451 | case MOVE_DOWN: | ||
| 1452 | if (pred->left != FILTER_PRED_INVALID) { | ||
| 1453 | pred = &preds[pred->left]; | ||
| 1454 | continue; | ||
| 1455 | } | ||
| 1456 | /* A leaf at the root is just a leaf in the tree */ | ||
| 1457 | if (pred == root) | ||
| 1458 | return 1; | ||
| 1459 | count++; | ||
| 1460 | pred = get_pred_parent(pred, preds, | ||
| 1461 | pred->parent, &move); | ||
| 1462 | continue; | ||
| 1463 | case MOVE_UP_FROM_LEFT: | ||
| 1464 | pred = &preds[pred->right]; | ||
| 1465 | move = MOVE_DOWN; | ||
| 1466 | continue; | ||
| 1467 | case MOVE_UP_FROM_RIGHT: | ||
| 1468 | if (pred == root) | ||
| 1469 | break; | ||
| 1470 | pred = get_pred_parent(pred, preds, | ||
| 1471 | pred->parent, &move); | ||
| 1472 | continue; | ||
| 1473 | } | ||
| 1474 | done = 1; | ||
| 1475 | } while (!done); | ||
| 1476 | |||
| 1477 | return count; | ||
| 1478 | } | ||
| 1479 | |||
| 1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | ||
| 1481 | { | ||
| 1482 | struct filter_pred *pred; | ||
| 1483 | enum move_type move = MOVE_DOWN; | ||
| 1484 | int count = 0; | ||
| 1485 | int children; | ||
| 1486 | int done = 0; | ||
| 1487 | |||
| 1488 | /* No need to keep the fold flag */ | ||
| 1489 | root->index &= ~FILTER_PRED_FOLD; | ||
| 1490 | |||
| 1491 | /* If the root is a leaf then do nothing */ | ||
| 1492 | if (root->left == FILTER_PRED_INVALID) | ||
| 1493 | return 0; | ||
| 1494 | |||
| 1495 | /* count the children */ | ||
| 1496 | children = count_leafs(preds, &preds[root->left]); | ||
| 1497 | children += count_leafs(preds, &preds[root->right]); | ||
| 1498 | |||
| 1499 | root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); | ||
| 1500 | if (!root->ops) | ||
| 1501 | return -ENOMEM; | ||
| 1502 | |||
| 1503 | root->val = children; | ||
| 1504 | |||
| 1505 | pred = root; | ||
| 1506 | do { | ||
| 1507 | switch (move) { | ||
| 1508 | case MOVE_DOWN: | ||
| 1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
| 1510 | pred = &preds[pred->left]; | ||
| 1511 | continue; | ||
| 1512 | } | ||
| 1513 | if (WARN_ON(count == children)) | ||
| 1514 | return -EINVAL; | ||
| 1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
| 1516 | root->ops[count++] = pred->index; | ||
| 1517 | pred = get_pred_parent(pred, preds, | ||
| 1518 | pred->parent, &move); | ||
| 1519 | continue; | ||
| 1520 | case MOVE_UP_FROM_LEFT: | ||
| 1521 | pred = &preds[pred->right]; | ||
| 1522 | move = MOVE_DOWN; | ||
| 1523 | continue; | ||
| 1524 | case MOVE_UP_FROM_RIGHT: | ||
| 1525 | if (pred == root) | ||
| 1526 | break; | ||
| 1527 | pred = get_pred_parent(pred, preds, | ||
| 1528 | pred->parent, &move); | ||
| 1529 | continue; | ||
| 1530 | } | ||
| 1531 | done = 1; | ||
| 1532 | } while (!done); | ||
| 1533 | |||
| 1534 | return 0; | ||
| 1535 | } | ||
| 1536 | |||
| 1537 | /* | ||
| 1538 | * To optimize the processing of the ops, if we have several "ors" or | ||
| 1539 | * "ands" together, we can put them in an array and process them all | ||
| 1540 | * together, speeding up the filter logic. | ||
| 1541 | */ | ||
| 1542 | static int fold_pred_tree(struct event_filter *filter, | ||
| 1543 | struct filter_pred *root) | ||
| 1544 | { | ||
| 1545 | struct filter_pred *preds; | ||
| 1546 | struct filter_pred *pred; | ||
| 1547 | enum move_type move = MOVE_DOWN; | ||
| 1548 | int done = 0; | ||
| 1549 | int err; | ||
| 1550 | |||
| 1551 | preds = filter->preds; | ||
| 1552 | if (!preds) | ||
| 1553 | return -EINVAL; | ||
| 1554 | pred = root; | ||
| 1555 | |||
| 1556 | do { | ||
| 1557 | switch (move) { | ||
| 1558 | case MOVE_DOWN: | ||
| 1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
| 1560 | err = fold_pred(preds, pred); | ||
| 1561 | if (err) | ||
| 1562 | return err; | ||
| 1563 | /* Folded nodes are like leaves */ | ||
| 1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
| 1565 | pred = &preds[pred->left]; | ||
| 1566 | continue; | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | /* A leaf at the root is just a leaf in the tree */ | ||
| 1570 | if (pred == root) | ||
| 1571 | break; | ||
| 1572 | pred = get_pred_parent(pred, preds, | ||
| 1573 | pred->parent, &move); | ||
| 1574 | continue; | ||
| 1575 | case MOVE_UP_FROM_LEFT: | ||
| 1576 | pred = &preds[pred->right]; | ||
| 1577 | move = MOVE_DOWN; | ||
| 1578 | continue; | ||
| 1579 | case MOVE_UP_FROM_RIGHT: | ||
| 1580 | if (pred == root) | ||
| 1581 | break; | ||
| 1582 | pred = get_pred_parent(pred, preds, | ||
| 1583 | pred->parent, &move); | ||
| 1584 | continue; | ||
| 1585 | } | ||
| 1586 | done = 1; | ||
| 1587 | } while (!done); | ||
| 1588 | |||
| 1589 | return 0; | ||
| 1590 | } | ||
| 1591 | |||
| 1190 | static int replace_preds(struct ftrace_event_call *call, | 1592 | static int replace_preds(struct ftrace_event_call *call, |
| 1191 | struct event_filter *filter, | 1593 | struct event_filter *filter, |
| 1192 | struct filter_parse_state *ps, | 1594 | struct filter_parse_state *ps, |
| @@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, | |||
| 1195 | { | 1597 | { |
| 1196 | char *operand1 = NULL, *operand2 = NULL; | 1598 | char *operand1 = NULL, *operand2 = NULL; |
| 1197 | struct filter_pred *pred; | 1599 | struct filter_pred *pred; |
| 1600 | struct filter_pred *root; | ||
| 1198 | struct postfix_elt *elt; | 1601 | struct postfix_elt *elt; |
| 1602 | struct pred_stack stack = { }; /* init to NULL */ | ||
| 1199 | int err; | 1603 | int err; |
| 1200 | int n_preds = 0; | 1604 | int n_preds = 0; |
| 1201 | 1605 | ||
| 1606 | n_preds = count_preds(ps); | ||
| 1607 | if (n_preds >= MAX_FILTER_PRED) { | ||
| 1608 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | ||
| 1609 | return -ENOSPC; | ||
| 1610 | } | ||
| 1611 | |||
| 1202 | err = check_preds(ps); | 1612 | err = check_preds(ps); |
| 1203 | if (err) | 1613 | if (err) |
| 1204 | return err; | 1614 | return err; |
| 1205 | 1615 | ||
| 1616 | if (!dry_run) { | ||
| 1617 | err = __alloc_pred_stack(&stack, n_preds); | ||
| 1618 | if (err) | ||
| 1619 | return err; | ||
| 1620 | err = __alloc_preds(filter, n_preds); | ||
| 1621 | if (err) | ||
| 1622 | goto fail; | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | n_preds = 0; | ||
| 1206 | list_for_each_entry(elt, &ps->postfix, list) { | 1626 | list_for_each_entry(elt, &ps->postfix, list) { |
| 1207 | if (elt->op == OP_NONE) { | 1627 | if (elt->op == OP_NONE) { |
| 1208 | if (!operand1) | 1628 | if (!operand1) |
| @@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, | |||
| 1211 | operand2 = elt->operand; | 1631 | operand2 = elt->operand; |
| 1212 | else { | 1632 | else { |
| 1213 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); | 1633 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); |
| 1214 | return -EINVAL; | 1634 | err = -EINVAL; |
| 1635 | goto fail; | ||
| 1215 | } | 1636 | } |
| 1216 | continue; | 1637 | continue; |
| 1217 | } | 1638 | } |
| 1218 | 1639 | ||
| 1219 | if (n_preds++ == MAX_FILTER_PRED) { | 1640 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
| 1220 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1641 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
| 1221 | return -ENOSPC; | 1642 | err = -ENOSPC; |
| 1643 | goto fail; | ||
| 1222 | } | 1644 | } |
| 1223 | 1645 | ||
| 1224 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1646 | if (elt->op == OP_AND || elt->op == OP_OR) { |
| @@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, | |||
| 1228 | 1650 | ||
| 1229 | if (!operand1 || !operand2) { | 1651 | if (!operand1 || !operand2) { |
| 1230 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | 1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); |
| 1231 | return -EINVAL; | 1653 | err = -EINVAL; |
| 1654 | goto fail; | ||
| 1232 | } | 1655 | } |
| 1233 | 1656 | ||
| 1234 | pred = create_pred(elt->op, operand1, operand2); | 1657 | pred = create_pred(elt->op, operand1, operand2); |
| 1235 | add_pred: | 1658 | add_pred: |
| 1236 | if (!pred) | 1659 | if (!pred) { |
| 1237 | return -ENOMEM; | 1660 | err = -ENOMEM; |
| 1238 | err = filter_add_pred(ps, call, filter, pred, dry_run); | 1661 | goto fail; |
| 1662 | } | ||
| 1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
| 1239 | filter_free_pred(pred); | 1664 | filter_free_pred(pred); |
| 1240 | if (err) | 1665 | if (err) |
| 1241 | return err; | 1666 | goto fail; |
| 1242 | 1667 | ||
| 1243 | operand1 = operand2 = NULL; | 1668 | operand1 = operand2 = NULL; |
| 1244 | } | 1669 | } |
| 1245 | 1670 | ||
| 1246 | return 0; | 1671 | if (!dry_run) { |
| 1672 | /* We should have one item left on the stack */ | ||
| 1673 | pred = __pop_pred_stack(&stack); | ||
| 1674 | if (!pred) | ||
| 1675 | return -EINVAL; | ||
| 1676 | /* This item is where we start from in matching */ | ||
| 1677 | root = pred; | ||
| 1678 | /* Make sure the stack is empty */ | ||
| 1679 | pred = __pop_pred_stack(&stack); | ||
| 1680 | if (WARN_ON(pred)) { | ||
| 1681 | err = -EINVAL; | ||
| 1682 | filter->root = NULL; | ||
| 1683 | goto fail; | ||
| 1684 | } | ||
| 1685 | err = check_pred_tree(filter, root); | ||
| 1686 | if (err) | ||
| 1687 | goto fail; | ||
| 1688 | |||
| 1689 | /* Optimize the tree */ | ||
| 1690 | err = fold_pred_tree(filter, root); | ||
| 1691 | if (err) | ||
| 1692 | goto fail; | ||
| 1693 | |||
| 1694 | /* We don't set root until we know it works */ | ||
| 1695 | barrier(); | ||
| 1696 | filter->root = root; | ||
| 1697 | } | ||
| 1698 | |||
| 1699 | err = 0; | ||
| 1700 | fail: | ||
| 1701 | __free_pred_stack(&stack); | ||
| 1702 | return err; | ||
| 1247 | } | 1703 | } |
| 1248 | 1704 | ||
| 1705 | struct filter_list { | ||
| 1706 | struct list_head list; | ||
| 1707 | struct event_filter *filter; | ||
| 1708 | }; | ||
| 1709 | |||
| 1249 | static int replace_system_preds(struct event_subsystem *system, | 1710 | static int replace_system_preds(struct event_subsystem *system, |
| 1250 | struct filter_parse_state *ps, | 1711 | struct filter_parse_state *ps, |
| 1251 | char *filter_string) | 1712 | char *filter_string) |
| 1252 | { | 1713 | { |
| 1253 | struct ftrace_event_call *call; | 1714 | struct ftrace_event_call *call; |
| 1715 | struct filter_list *filter_item; | ||
| 1716 | struct filter_list *tmp; | ||
| 1717 | LIST_HEAD(filter_list); | ||
| 1254 | bool fail = true; | 1718 | bool fail = true; |
| 1255 | int err; | 1719 | int err; |
| 1256 | 1720 | ||
| 1257 | list_for_each_entry(call, &ftrace_events, list) { | 1721 | list_for_each_entry(call, &ftrace_events, list) { |
| 1258 | struct event_filter *filter = call->filter; | ||
| 1259 | 1722 | ||
| 1260 | if (strcmp(call->class->system, system->name) != 0) | 1723 | if (strcmp(call->class->system, system->name) != 0) |
| 1261 | continue; | 1724 | continue; |
| 1262 | 1725 | ||
| 1263 | /* try to see if the filter can be applied */ | 1726 | /* |
| 1264 | err = replace_preds(call, filter, ps, filter_string, true); | 1727 | * Try to see if the filter can be applied |
| 1728 | * (filter arg is ignored on dry_run) | ||
| 1729 | */ | ||
| 1730 | err = replace_preds(call, NULL, ps, filter_string, true); | ||
| 1265 | if (err) | 1731 | if (err) |
| 1732 | goto fail; | ||
| 1733 | } | ||
| 1734 | |||
| 1735 | list_for_each_entry(call, &ftrace_events, list) { | ||
| 1736 | struct event_filter *filter; | ||
| 1737 | |||
| 1738 | if (strcmp(call->class->system, system->name) != 0) | ||
| 1266 | continue; | 1739 | continue; |
| 1267 | 1740 | ||
| 1268 | /* really apply the filter */ | 1741 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
| 1269 | filter_disable_preds(call); | 1742 | if (!filter_item) |
| 1270 | err = replace_preds(call, filter, ps, filter_string, false); | 1743 | goto fail_mem; |
| 1744 | |||
| 1745 | list_add_tail(&filter_item->list, &filter_list); | ||
| 1746 | |||
| 1747 | filter_item->filter = __alloc_filter(); | ||
| 1748 | if (!filter_item->filter) | ||
| 1749 | goto fail_mem; | ||
| 1750 | filter = filter_item->filter; | ||
| 1751 | |||
| 1752 | /* Can only fail on no memory */ | ||
| 1753 | err = replace_filter_string(filter, filter_string); | ||
| 1271 | if (err) | 1754 | if (err) |
| 1272 | filter_disable_preds(call); | 1755 | goto fail_mem; |
| 1273 | else { | 1756 | |
| 1757 | err = replace_preds(call, filter, ps, filter_string, false); | ||
| 1758 | if (err) { | ||
| 1759 | filter_disable(call); | ||
| 1760 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
| 1761 | append_filter_err(ps, filter); | ||
| 1762 | } else | ||
| 1274 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1763 | call->flags |= TRACE_EVENT_FL_FILTERED; |
| 1275 | replace_filter_string(filter, filter_string); | 1764 | /* |
| 1765 | * Regardless of whether this returned an error, we still | ||
| 1766 | * replace the filter for the call. | ||
| 1767 | */ | ||
| 1768 | filter = call->filter; | ||
| 1769 | call->filter = filter_item->filter; | ||
| 1770 | filter_item->filter = filter; | ||
| 1771 | |||
| 1277 | fail = false; | 1772 | fail = false; |
| 1278 | } | 1773 | } |
| 1279 | 1774 | ||
| 1280 | if (fail) { | 1775 | if (fail) |
| 1281 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1776 | goto fail; |
| 1282 | return -EINVAL; | 1777 | |
| 1778 | /* | ||
| 1779 | * The calls can still be using the old filters. | ||
| 1780 | * Do a synchronize_sched() to ensure all calls are | ||
| 1781 | * done with them before we free them. | ||
| 1782 | */ | ||
| 1783 | synchronize_sched(); | ||
| 1784 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
| 1785 | __free_filter(filter_item->filter); | ||
| 1786 | list_del(&filter_item->list); | ||
| 1787 | kfree(filter_item); | ||
| 1283 | } | 1788 | } |
| 1284 | return 0; | 1789 | return 0; |
| 1790 | fail: | ||
| 1791 | /* No call succeeded */ | ||
| 1792 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
| 1793 | list_del(&filter_item->list); | ||
| 1794 | kfree(filter_item); | ||
| 1795 | } | ||
| 1796 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
| 1797 | return -EINVAL; | ||
| 1798 | fail_mem: | ||
| 1799 | /* If any call succeeded, we still need to sync */ | ||
| 1800 | if (!fail) | ||
| 1801 | synchronize_sched(); | ||
| 1802 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
| 1803 | __free_filter(filter_item->filter); | ||
| 1804 | list_del(&filter_item->list); | ||
| 1805 | kfree(filter_item); | ||
| 1806 | } | ||
| 1807 | return -ENOMEM; | ||
| 1285 | } | 1808 | } |
| 1286 | 1809 | ||
| 1287 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1810 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
| 1288 | { | 1811 | { |
| 1289 | int err; | ||
| 1290 | struct filter_parse_state *ps; | 1812 | struct filter_parse_state *ps; |
| 1813 | struct event_filter *filter; | ||
| 1814 | struct event_filter *tmp; | ||
| 1815 | int err = 0; | ||
| 1291 | 1816 | ||
| 1292 | mutex_lock(&event_mutex); | 1817 | mutex_lock(&event_mutex); |
| 1293 | 1818 | ||
| 1294 | err = init_preds(call); | ||
| 1295 | if (err) | ||
| 1296 | goto out_unlock; | ||
| 1297 | |||
| 1298 | if (!strcmp(strstrip(filter_string), "0")) { | 1819 | if (!strcmp(strstrip(filter_string), "0")) { |
| 1299 | filter_disable_preds(call); | 1820 | filter_disable(call); |
| 1300 | remove_filter_string(call->filter); | 1821 | filter = call->filter; |
| 1822 | if (!filter) | ||
| 1823 | goto out_unlock; | ||
| 1824 | call->filter = NULL; | ||
| 1825 | /* Make sure the filter is not being used */ | ||
| 1826 | synchronize_sched(); | ||
| 1827 | __free_filter(filter); | ||
| 1301 | goto out_unlock; | 1828 | goto out_unlock; |
| 1302 | } | 1829 | } |
| 1303 | 1830 | ||
| @@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
| 1306 | if (!ps) | 1833 | if (!ps) |
| 1307 | goto out_unlock; | 1834 | goto out_unlock; |
| 1308 | 1835 | ||
| 1309 | filter_disable_preds(call); | 1836 | filter = __alloc_filter(); |
| 1310 | replace_filter_string(call->filter, filter_string); | 1837 | if (!filter) { |
| 1838 | kfree(ps); | ||
| 1839 | goto out_unlock; | ||
| 1840 | } | ||
| 1841 | |||
| 1842 | replace_filter_string(filter, filter_string); | ||
| 1311 | 1843 | ||
| 1312 | parse_init(ps, filter_ops, filter_string); | 1844 | parse_init(ps, filter_ops, filter_string); |
| 1313 | err = filter_parse(ps); | 1845 | err = filter_parse(ps); |
| 1314 | if (err) { | 1846 | if (err) { |
| 1315 | append_filter_err(ps, call->filter); | 1847 | append_filter_err(ps, filter); |
| 1316 | goto out; | 1848 | goto out; |
| 1317 | } | 1849 | } |
| 1318 | 1850 | ||
| 1319 | err = replace_preds(call, call->filter, ps, filter_string, false); | 1851 | err = replace_preds(call, filter, ps, filter_string, false); |
| 1320 | if (err) | 1852 | if (err) { |
| 1321 | append_filter_err(ps, call->filter); | 1853 | filter_disable(call); |
| 1322 | else | 1854 | append_filter_err(ps, filter); |
| 1855 | } else | ||
| 1323 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1856 | call->flags |= TRACE_EVENT_FL_FILTERED; |
| 1324 | out: | 1857 | out: |
| 1858 | /* | ||
| 1859 | * Always swap the call filter with the new filter | ||
| 1860 | * even if there was an error. If there was an error | ||
| 1861 | * in the filter, we disable the filter and show the error | ||
| 1862 | * string | ||
| 1863 | */ | ||
| 1864 | tmp = call->filter; | ||
| 1865 | call->filter = filter; | ||
| 1866 | if (tmp) { | ||
| 1867 | /* Make sure the call is done with the filter */ | ||
| 1868 | synchronize_sched(); | ||
| 1869 | __free_filter(tmp); | ||
| 1870 | } | ||
| 1325 | filter_opstack_clear(ps); | 1871 | filter_opstack_clear(ps); |
| 1326 | postfix_clear(ps); | 1872 | postfix_clear(ps); |
| 1327 | kfree(ps); | 1873 | kfree(ps); |
| @@ -1334,18 +1880,21 @@ out_unlock: | |||
| 1334 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1880 | int apply_subsystem_event_filter(struct event_subsystem *system, |
| 1335 | char *filter_string) | 1881 | char *filter_string) |
| 1336 | { | 1882 | { |
| 1337 | int err; | ||
| 1338 | struct filter_parse_state *ps; | 1883 | struct filter_parse_state *ps; |
| 1884 | struct event_filter *filter; | ||
| 1885 | int err = 0; | ||
| 1339 | 1886 | ||
| 1340 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
| 1341 | 1888 | ||
| 1342 | err = init_subsystem_preds(system); | ||
| 1343 | if (err) | ||
| 1344 | goto out_unlock; | ||
| 1345 | |||
| 1346 | if (!strcmp(strstrip(filter_string), "0")) { | 1889 | if (!strcmp(strstrip(filter_string), "0")) { |
| 1347 | filter_free_subsystem_preds(system); | 1890 | filter_free_subsystem_preds(system); |
| 1348 | remove_filter_string(system->filter); | 1891 | remove_filter_string(system->filter); |
| 1892 | filter = system->filter; | ||
| 1893 | system->filter = NULL; | ||
| 1894 | /* Ensure all filters are no longer used */ | ||
| 1895 | synchronize_sched(); | ||
| 1896 | filter_free_subsystem_filters(system); | ||
| 1897 | __free_filter(filter); | ||
| 1349 | goto out_unlock; | 1898 | goto out_unlock; |
| 1350 | } | 1899 | } |
| 1351 | 1900 | ||
| @@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
| 1354 | if (!ps) | 1903 | if (!ps) |
| 1355 | goto out_unlock; | 1904 | goto out_unlock; |
| 1356 | 1905 | ||
| 1357 | replace_filter_string(system->filter, filter_string); | 1906 | filter = __alloc_filter(); |
| 1907 | if (!filter) | ||
| 1908 | goto out; | ||
| 1909 | |||
| 1910 | replace_filter_string(filter, filter_string); | ||
| 1911 | /* | ||
| 1912 | * No event actually uses the system filter, so | ||
| 1913 | * we can free it without synchronize_sched(). | ||
| 1914 | */ | ||
| 1915 | __free_filter(system->filter); | ||
| 1916 | system->filter = filter; | ||
| 1358 | 1917 | ||
| 1359 | parse_init(ps, filter_ops, filter_string); | 1918 | parse_init(ps, filter_ops, filter_string); |
| 1360 | err = filter_parse(ps); | 1919 | err = filter_parse(ps); |
| @@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) | |||
| 1384 | struct event_filter *filter = event->filter; | 1943 | struct event_filter *filter = event->filter; |
| 1385 | 1944 | ||
| 1386 | event->filter = NULL; | 1945 | event->filter = NULL; |
| 1387 | __free_preds(filter); | 1946 | __free_filter(filter); |
| 1388 | } | 1947 | } |
| 1389 | 1948 | ||
| 1390 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, | 1949 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, |
| @@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
| 1410 | if (event->filter) | 1969 | if (event->filter) |
| 1411 | goto out_unlock; | 1970 | goto out_unlock; |
| 1412 | 1971 | ||
| 1413 | filter = __alloc_preds(); | 1972 | filter = __alloc_filter(); |
| 1414 | if (IS_ERR(filter)) { | 1973 | if (!filter) { |
| 1415 | err = PTR_ERR(filter); | 1974 | err = PTR_ERR(filter); |
| 1416 | goto out_unlock; | 1975 | goto out_unlock; |
| 1417 | } | 1976 | } |
| @@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
| 1419 | err = -ENOMEM; | 1978 | err = -ENOMEM; |
| 1420 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | 1979 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); |
| 1421 | if (!ps) | 1980 | if (!ps) |
| 1422 | goto free_preds; | 1981 | goto free_filter; |
| 1423 | 1982 | ||
| 1424 | parse_init(ps, filter_ops, filter_str); | 1983 | parse_init(ps, filter_ops, filter_str); |
| 1425 | err = filter_parse(ps); | 1984 | err = filter_parse(ps); |
| @@ -1435,9 +1994,9 @@ free_ps: | |||
| 1435 | postfix_clear(ps); | 1994 | postfix_clear(ps); |
| 1436 | kfree(ps); | 1995 | kfree(ps); |
| 1437 | 1996 | ||
| 1438 | free_preds: | 1997 | free_filter: |
| 1439 | if (err) | 1998 | if (err) |
| 1440 | __free_preds(filter); | 1999 | __free_filter(filter); |
| 1441 | 2000 | ||
| 1442 | out_unlock: | 2001 | out_unlock: |
| 1443 | mutex_unlock(&event_mutex); | 2002 | mutex_unlock(&event_mutex); |
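
The filter rework above keeps predicates in a flat array and traverses the resulting AND/OR tree iteratively, using child/parent indexes and a three-state move variable rather than recursion, which is what lets check_pred_tree() bound a malformed tree and fold_pred_tree() flatten runs of identical operators into ops[] arrays. A minimal user-space sketch of that walk, assuming the same three-state scheme; all names below (struct pred, count_leaves, is_right) are illustrative, not the kernel's symbols:

    #include <stdio.h>

    #define PRED_INVALID (-1)

    enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

    struct pred {
            int left;       /* index of left child, PRED_INVALID for a leaf */
            int right;      /* index of right child */
            int parent;     /* index of parent node */
            int is_right;   /* nonzero if this node is its parent's right child */
    };

    /* Count the leaves below (and including) root without recursion. */
    static int count_leaves(const struct pred *preds, int root)
    {
            enum move move = MOVE_DOWN;
            int cur = root;
            int count = 0;

            for (;;) {
                    switch (move) {
                    case MOVE_DOWN:
                            if (preds[cur].left != PRED_INVALID) {
                                    cur = preds[cur].left;
                                    continue;
                            }
                            count++;                /* reached a leaf */
                            if (cur == root)
                                    return count;
                            move = preds[cur].is_right ?
                                    MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
                            cur = preds[cur].parent;
                            continue;
                    case MOVE_UP_FROM_LEFT:
                            cur = preds[cur].right; /* left side done, visit right */
                            move = MOVE_DOWN;
                            continue;
                    case MOVE_UP_FROM_RIGHT:
                            if (cur == root)
                                    return count;
                            move = preds[cur].is_right ?
                                    MOVE_UP_FROM_RIGHT : MOVE_UP_FROM_LEFT;
                            cur = preds[cur].parent;
                            continue;
                    }
            }
    }

    int main(void)
    {
            /* (a && b) || c: index 0 is the OR root, 1 the AND node, 2-4 leaves */
            const struct pred preds[] = {
                    { 1, 2, PRED_INVALID, 0 },                      /* 0: OR            */
                    { 3, 4, 0, 0 },                                 /* 1: AND, left of 0 */
                    { PRED_INVALID, PRED_INVALID, 0, 1 },           /* 2: c, right of 0  */
                    { PRED_INVALID, PRED_INVALID, 1, 0 },           /* 3: a, left of 1   */
                    { PRED_INVALID, PRED_INVALID, 1, 1 },           /* 4: b, right of 1  */
            };

            printf("leaves: %d\n", count_leaves(preds, 0));         /* prints "leaves: 3" */
            return 0;
    }

Because every path either descends to an unvisited child or returns to the parent with the direction recorded, each node is visited at most three times, which is the bound check_pred_tree() enforces above.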
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225c..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
| 905 | * | 905 | * |
| 906 | * returns 1 if | 906 | * returns 1 if |
| 907 | * - we are inside irq code | 907 | * - we are inside irq code |
| 908 | * - we just extered irq code | 908 | * - we just entered irq code |
| 909 | * | 909 | * |
| 910 | * returns 0 if | 910 | * returns 0 if |
| 911 | * - funcgraph-interrupts option is set | 911 | * - funcgraph-interrupts option is set |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e12d98..a4969b47afc1 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { | |||
| 80 | * skip the latency if the sequence has changed - some other section | 80 | * skip the latency if the sequence has changed - some other section |
| 81 | * did a maximum and could disturb our measurement with serial console | 81 | * did a maximum and could disturb our measurement with serial console |
| 82 | * printouts, etc. Truly coinciding maximum latencies should be rare | 82 | * printouts, etc. Truly coinciding maximum latencies should be rare |
| 83 | * and what happens together happens separately as well, so this doesnt | 83 | * and what happens together happens separately as well, so this doesn't |
| 84 | * decrease the validity of the maximum found: | 84 | * decrease the validity of the maximum found: |
| 85 | */ | 85 | */ |
| 86 | static __cacheline_aligned_in_smp unsigned long max_sequence; | 86 | static __cacheline_aligned_in_smp unsigned long max_sequence; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..35d55a386145 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
| 353 | kfree(data); | 353 | kfree(data); |
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | /* Bitfield fetch function */ | ||
| 357 | struct bitfield_fetch_param { | ||
| 358 | struct fetch_param orig; | ||
| 359 | unsigned char hi_shift; | ||
| 360 | unsigned char low_shift; | ||
| 361 | }; | ||
| 362 | |||
| 363 | #define DEFINE_FETCH_bitfield(type) \ | ||
| 364 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
| 365 | void *data, void *dest) \ | ||
| 366 | { \ | ||
| 367 | struct bitfield_fetch_param *bprm = data; \ | ||
| 368 | type buf = 0; \ | ||
| 369 | call_fetch(&bprm->orig, regs, &buf); \ | ||
| 370 | if (buf) { \ | ||
| 371 | buf <<= bprm->hi_shift; \ | ||
| 372 | buf >>= bprm->low_shift; \ | ||
| 373 | } \ | ||
| 374 | *(type *)dest = buf; \ | ||
| 375 | } | ||
| 376 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
| 377 | #define fetch_bitfield_string NULL | ||
| 378 | #define fetch_bitfield_string_size NULL | ||
| 379 | |||
| 380 | static __kprobes void | ||
| 381 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
| 382 | { | ||
| 383 | /* | ||
| 384 | * Don't check the bitfield itself, because this must be the | ||
| 385 | * last fetch function. | ||
| 386 | */ | ||
| 387 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
| 388 | free_deref_fetch_param(data->orig.data); | ||
| 389 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
| 390 | free_symbol_cache(data->orig.data); | ||
| 391 | kfree(data); | ||
| 392 | } | ||
| 356 | /* Default (unsigned long) fetch type */ | 393 | /* Default (unsigned long) fetch type */ |
| 357 | #define __DEFAULT_FETCH_TYPE(t) u##t | 394 | #define __DEFAULT_FETCH_TYPE(t) u##t |
| 358 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 395 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
| @@ -367,6 +404,7 @@ enum { | |||
| 367 | FETCH_MTD_memory, | 404 | FETCH_MTD_memory, |
| 368 | FETCH_MTD_symbol, | 405 | FETCH_MTD_symbol, |
| 369 | FETCH_MTD_deref, | 406 | FETCH_MTD_deref, |
| 407 | FETCH_MTD_bitfield, | ||
| 370 | FETCH_MTD_END, | 408 | FETCH_MTD_END, |
| 371 | }; | 409 | }; |
| 372 | 410 | ||
| @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ | |||
| 387 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 425 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
| 388 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 426 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
| 389 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 427 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
| 428 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
| 390 | } \ | 429 | } \ |
| 391 | } | 430 | } |
| 392 | 431 | ||
| @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
| 430 | if (!type) | 469 | if (!type) |
| 431 | type = DEFAULT_FETCH_TYPE_STR; | 470 | type = DEFAULT_FETCH_TYPE_STR; |
| 432 | 471 | ||
| 472 | /* Special case: bitfield */ | ||
| 473 | if (*type == 'b') { | ||
| 474 | unsigned long bs; | ||
| 475 | type = strchr(type, '/'); | ||
| 476 | if (!type) | ||
| 477 | goto fail; | ||
| 478 | type++; | ||
| 479 | if (strict_strtoul(type, 0, &bs)) | ||
| 480 | goto fail; | ||
| 481 | switch (bs) { | ||
| 482 | case 8: | ||
| 483 | return find_fetch_type("u8"); | ||
| 484 | case 16: | ||
| 485 | return find_fetch_type("u16"); | ||
| 486 | case 32: | ||
| 487 | return find_fetch_type("u32"); | ||
| 488 | case 64: | ||
| 489 | return find_fetch_type("u64"); | ||
| 490 | default: | ||
| 491 | goto fail; | ||
| 492 | } | ||
| 493 | } | ||
| 494 | |||
| 433 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | 495 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) |
| 434 | if (strcmp(type, fetch_type_table[i].name) == 0) | 496 | if (strcmp(type, fetch_type_table[i].name) == 0) |
| 435 | return &fetch_type_table[i]; | 497 | return &fetch_type_table[i]; |
| 498 | fail: | ||
| 436 | return NULL; | 499 | return NULL; |
| 437 | } | 500 | } |
| 438 | 501 | ||
| @@ -586,7 +649,9 @@ error: | |||
| 586 | 649 | ||
| 587 | static void free_probe_arg(struct probe_arg *arg) | 650 | static void free_probe_arg(struct probe_arg *arg) |
| 588 | { | 651 | { |
| 589 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | 652 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
| 653 | free_bitfield_fetch_param(arg->fetch.data); | ||
| 654 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
| 590 | free_deref_fetch_param(arg->fetch.data); | 655 | free_deref_fetch_param(arg->fetch.data); |
| 591 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | 656 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
| 592 | free_symbol_cache(arg->fetch.data); | 657 | free_symbol_cache(arg->fetch.data); |
| @@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 767 | } | 832 | } |
| 768 | break; | 833 | break; |
| 769 | case '+': /* deref memory */ | 834 | case '+': /* deref memory */ |
| 835 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
| 770 | case '-': | 836 | case '-': |
| 771 | tmp = strchr(arg, '('); | 837 | tmp = strchr(arg, '('); |
| 772 | if (!tmp) | 838 | if (!tmp) |
| 773 | break; | 839 | break; |
| 774 | *tmp = '\0'; | 840 | *tmp = '\0'; |
| 775 | ret = strict_strtol(arg + 1, 0, &offset); | 841 | ret = strict_strtol(arg, 0, &offset); |
| 776 | if (ret) | 842 | if (ret) |
| 777 | break; | 843 | break; |
| 778 | if (arg[0] == '-') | ||
| 779 | offset = -offset; | ||
| 780 | arg = tmp + 1; | 844 | arg = tmp + 1; |
| 781 | tmp = strrchr(arg, ')'); | 845 | tmp = strrchr(arg, ')'); |
| 782 | if (tmp) { | 846 | if (tmp) { |
| @@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 807 | return ret; | 871 | return ret; |
| 808 | } | 872 | } |
| 809 | 873 | ||
| 874 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
| 875 | |||
| 876 | /* Bitfield type needs to be parsed into a fetch function */ | ||
| 877 | static int __parse_bitfield_probe_arg(const char *bf, | ||
| 878 | const struct fetch_type *t, | ||
| 879 | struct fetch_param *f) | ||
| 880 | { | ||
| 881 | struct bitfield_fetch_param *bprm; | ||
| 882 | unsigned long bw, bo; | ||
| 883 | char *tail; | ||
| 884 | |||
| 885 | if (*bf != 'b') | ||
| 886 | return 0; | ||
| 887 | |||
| 888 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
| 889 | if (!bprm) | ||
| 890 | return -ENOMEM; | ||
| 891 | bprm->orig = *f; | ||
| 892 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
| 893 | f->data = (void *)bprm; | ||
| 894 | |||
| 895 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
| 896 | if (bw == 0 || *tail != '@') | ||
| 897 | return -EINVAL; | ||
| 898 | |||
| 899 | bf = tail + 1; | ||
| 900 | bo = simple_strtoul(bf, &tail, 0); | ||
| 901 | if (tail == bf || *tail != '/') | ||
| 902 | return -EINVAL; | ||
| 903 | |||
| 904 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
| 905 | bprm->low_shift = bprm->hi_shift + bo; | ||
| 906 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
| 907 | } | ||
| 908 | |||
| 810 | /* String length checking wrapper */ | 909 | /* String length checking wrapper */ |
| 811 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | 910 | static int parse_probe_arg(char *arg, struct trace_probe *tp, |
| 812 | struct probe_arg *parg, int is_return) | 911 | struct probe_arg *parg, int is_return) |
| @@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
| 836 | parg->offset = tp->size; | 935 | parg->offset = tp->size; |
| 837 | tp->size += parg->type->size; | 936 | tp->size += parg->type->size; |
| 838 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 937 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
| 938 | if (ret >= 0 && t != NULL) | ||
| 939 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
| 839 | if (ret >= 0) { | 940 | if (ret >= 0) { |
| 840 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | 941 | parg->fetch_size.fn = get_fetch_size_function(parg->type, |
| 841 | parg->fetch.fn); | 942 | parg->fetch.fn); |
| @@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf) | |||
| 1130 | return ret; | 1231 | return ret; |
| 1131 | } | 1232 | } |
| 1132 | 1233 | ||
| 1133 | #define WRITE_BUFSIZE 128 | 1234 | #define WRITE_BUFSIZE 4096 |
| 1134 | 1235 | ||
| 1135 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 1236 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
| 1136 | size_t count, loff_t *ppos) | 1237 | size_t count, loff_t *ppos) |
| @@ -1738,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
| 1738 | kfree(tp->call.print_fmt); | 1839 | kfree(tp->call.print_fmt); |
| 1739 | } | 1840 | } |
| 1740 | 1841 | ||
| 1741 | /* Make a debugfs interface for controling probe points */ | 1842 | /* Make a debugfs interface for controlling probe points */ |
| 1742 | static __init int init_kprobe_trace(void) | 1843 | static __init int init_kprobe_trace(void) |
| 1743 | { | 1844 | { |
| 1744 | struct dentry *d_tracer; | 1845 | struct dentry *d_tracer; |
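
The bitfield fetch added above isolates the field with a shift pair: hi_shift discards the bits above the field and low_shift then discards the bits below it, exactly as computed in __parse_bitfield_probe_arg(). A small stand-alone illustration of that arithmetic for a 32-bit container; the function name and the "b4@3/32" example are mine, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t extract_bitfield32(uint32_t val, unsigned int bw, unsigned int bo)
    {
            unsigned int hi_shift  = 32 - (bw + bo);  /* drop the bits above the field */
            unsigned int low_shift = hi_shift + bo;   /* then drop the bits below it   */

            val <<= hi_shift;
            val >>= low_shift;
            return val;
    }

    int main(void)
    {
            /* "b4@3/32" applied to 0x000000f8: bits 3..6 -> 0xf */
            printf("0x%x\n", (unsigned int)extract_bitfield32(0xf8, 4, 3));
            return 0;
    }

With width 4 and offset 3 in a 32-bit container, hi_shift is 25 and low_shift is 28, so 0xf8 becomes 0xf0000000 after the left shift and 0xf after the right shift.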
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | |||
| 529 | * @entry: The trace entry field from the ring buffer | 529 | * @entry: The trace entry field from the ring buffer |
| 530 | * | 530 | * |
| 531 | * Prints the generic fields of irqs off, in hard or softirq, preempt | 531 | * Prints the generic fields of irqs off, in hard or softirq, preempt |
| 532 | * count and lock depth. | 532 | * count. |
| 533 | */ | 533 | */ |
| 534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | 534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
| 535 | { | 535 | { |
| 536 | int hardirq, softirq; | 536 | char hardsoft_irq; |
| 537 | char need_resched; | ||
| 538 | char irqs_off; | ||
| 539 | int hardirq; | ||
| 540 | int softirq; | ||
| 537 | int ret; | 541 | int ret; |
| 538 | 542 | ||
| 539 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 543 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
| 540 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 544 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
| 541 | 545 | ||
| 546 | irqs_off = | ||
| 547 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | ||
| 548 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | ||
| 549 | '.'; | ||
| 550 | need_resched = | ||
| 551 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | ||
| 552 | hardsoft_irq = | ||
| 553 | (hardirq && softirq) ? 'H' : | ||
| 554 | hardirq ? 'h' : | ||
| 555 | softirq ? 's' : | ||
| 556 | '.'; | ||
| 557 | |||
| 542 | if (!trace_seq_printf(s, "%c%c%c", | 558 | if (!trace_seq_printf(s, "%c%c%c", |
| 543 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 559 | irqs_off, need_resched, hardsoft_irq)) |
| 544 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? | ||
| 545 | 'X' : '.', | ||
| 546 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? | ||
| 547 | 'N' : '.', | ||
| 548 | (hardirq && softirq) ? 'H' : | ||
| 549 | hardirq ? 'h' : softirq ? 's' : '.')) | ||
| 550 | return 0; | 560 | return 0; |
| 551 | 561 | ||
| 552 | if (entry->preempt_count) | 562 | if (entry->preempt_count) |
| @@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 554 | else | 564 | else |
| 555 | ret = trace_seq_putc(s, '.'); | 565 | ret = trace_seq_putc(s, '.'); |
| 556 | 566 | ||
| 557 | if (!ret) | 567 | return ret; |
| 558 | return 0; | ||
| 559 | |||
| 560 | if (entry->lock_depth < 0) | ||
| 561 | return trace_seq_putc(s, '.'); | ||
| 562 | |||
| 563 | return trace_seq_printf(s, "%d", entry->lock_depth); | ||
| 564 | } | 568 | } |
| 565 | 569 | ||
| 566 | static int | 570 | static int |
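
The trace_print_lat_fmt() rework above only names the three latency columns before printing them: 'd'/'X'/'.' for irqs-off state, 'N'/'.' for need-resched, and 'H'/'h'/'s'/'.' for hard/soft interrupt context, with the old lock-depth column dropped. A compact sketch of the same mapping; the flag values below are invented for the demo and are not the kernel's TRACE_FLAG_* definitions:

    #include <stdio.h>

    /* Illustrative flag bits only. */
    enum {
            FLAG_IRQS_OFF     = 0x01,
            FLAG_IRQS_NOSUP   = 0x02,
            FLAG_NEED_RESCHED = 0x04,
            FLAG_HARDIRQ      = 0x08,
            FLAG_SOFTIRQ      = 0x10,
    };

    static void lat_fmt(char out[4], unsigned int flags)
    {
            int hardirq = flags & FLAG_HARDIRQ;
            int softirq = flags & FLAG_SOFTIRQ;

            out[0] = (flags & FLAG_IRQS_OFF)     ? 'd' :
                     (flags & FLAG_IRQS_NOSUP)   ? 'X' : '.';
            out[1] = (flags & FLAG_NEED_RESCHED) ? 'N' : '.';
            out[2] = (hardirq && softirq) ? 'H' :
                     hardirq ? 'h' : softirq ? 's' : '.';
            out[3] = '\0';
    }

    int main(void)
    {
            char buf[4];

            lat_fmt(buf, FLAG_IRQS_OFF | FLAG_NEED_RESCHED | FLAG_SOFTIRQ);
            printf("%s\n", buf);    /* prints "dNs" */
            return 0;
    }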
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) | |||
| 247 | ctx_trace = tr; | 247 | ctx_trace = tr; |
| 248 | } | 248 | } |
| 249 | 249 | ||
| 250 | static void stop_sched_trace(struct trace_array *tr) | ||
| 251 | { | ||
| 252 | tracing_stop_sched_switch_record(); | ||
| 253 | } | ||
| 254 | |||
| 255 | static int sched_switch_trace_init(struct trace_array *tr) | ||
| 256 | { | ||
| 257 | ctx_trace = tr; | ||
| 258 | tracing_reset_online_cpus(tr); | ||
| 259 | tracing_start_sched_switch_record(); | ||
| 260 | return 0; | ||
| 261 | } | ||
| 262 | |||
| 263 | static void sched_switch_trace_reset(struct trace_array *tr) | ||
| 264 | { | ||
| 265 | if (sched_ref) | ||
| 266 | stop_sched_trace(tr); | ||
| 267 | } | ||
| 268 | |||
| 269 | static void sched_switch_trace_start(struct trace_array *tr) | ||
| 270 | { | ||
| 271 | sched_stopped = 0; | ||
| 272 | } | ||
| 273 | |||
| 274 | static void sched_switch_trace_stop(struct trace_array *tr) | ||
| 275 | { | ||
| 276 | sched_stopped = 1; | ||
| 277 | } | ||
| 278 | |||
| 279 | static struct tracer sched_switch_trace __read_mostly = | ||
| 280 | { | ||
| 281 | .name = "sched_switch", | ||
| 282 | .init = sched_switch_trace_init, | ||
| 283 | .reset = sched_switch_trace_reset, | ||
| 284 | .start = sched_switch_trace_start, | ||
| 285 | .stop = sched_switch_trace_stop, | ||
| 286 | .wait_pipe = poll_wait_pipe, | ||
| 287 | #ifdef CONFIG_FTRACE_SELFTEST | ||
| 288 | .selftest = trace_selftest_startup_sched_switch, | ||
| 289 | #endif | ||
| 290 | }; | ||
| 291 | |||
| 292 | __init static int init_sched_switch_trace(void) | ||
| 293 | { | ||
| 294 | return register_tracer(&sched_switch_trace); | ||
| 295 | } | ||
| 296 | device_initcall(init_sched_switch_trace); | ||
| 297 | |||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; | |||
| 60 | 60 | ||
| 61 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
| 62 | 62 | ||
| 63 | #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME | ||
| 64 | static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) | ||
| 65 | { | ||
| 66 | /* | ||
| 67 | * Only compare after the "sys" prefix. Archs that use | ||
| 68 | * syscall wrappers may have syscall symbol aliases prefixed | ||
| 69 | * with "SyS" instead of "sys", leading to an unwanted | ||
| 70 | * mismatch. | ||
| 71 | */ | ||
| 72 | return !strcmp(sym + 3, name + 3); | ||
| 73 | } | ||
| 74 | #endif | ||
| 75 | |||
| 63 | static __init struct syscall_metadata * | 76 | static __init struct syscall_metadata * |
| 64 | find_syscall_meta(unsigned long syscall) | 77 | find_syscall_meta(unsigned long syscall) |
| 65 | { | 78 | { |
| @@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall) | |||
| 72 | stop = __stop_syscalls_metadata; | 85 | stop = __stop_syscalls_metadata; |
| 73 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 86 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
| 74 | 87 | ||
| 88 | if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) | ||
| 89 | return NULL; | ||
| 90 | |||
| 75 | for ( ; start < stop; start++) { | 91 | for ( ; start < stop; start++) { |
| 76 | /* | 92 | if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) |
| 77 | * Only compare after the "sys" prefix. Archs that use | ||
| 78 | * syscall wrappers may have syscalls symbols aliases prefixed | ||
| 79 | * with "SyS" instead of "sys", leading to an unwanted | ||
| 80 | * mismatch. | ||
| 81 | */ | ||
| 82 | if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) | ||
| 83 | return *start; | 93 | return *start; |
| 84 | } | 94 | } |
| 85 | return NULL; | 95 | return NULL; |
| @@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
| 359 | int num; | 369 | int num; |
| 360 | 370 | ||
| 361 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 371 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
| 362 | if (num < 0 || num >= NR_syscalls) | 372 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
| 363 | return -ENOSYS; | 373 | return -ENOSYS; |
| 364 | mutex_lock(&syscall_trace_lock); | 374 | mutex_lock(&syscall_trace_lock); |
| 365 | if (!sys_refcount_enter) | 375 | if (!sys_refcount_enter) |
| @@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) | |||
| 377 | int num; | 387 | int num; |
| 378 | 388 | ||
| 379 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 389 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
| 380 | if (num < 0 || num >= NR_syscalls) | 390 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
| 381 | return; | 391 | return; |
| 382 | mutex_lock(&syscall_trace_lock); | 392 | mutex_lock(&syscall_trace_lock); |
| 383 | sys_refcount_enter--; | 393 | sys_refcount_enter--; |
| @@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
| 393 | int num; | 403 | int num; |
| 394 | 404 | ||
| 395 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 405 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
| 396 | if (num < 0 || num >= NR_syscalls) | 406 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
| 397 | return -ENOSYS; | 407 | return -ENOSYS; |
| 398 | mutex_lock(&syscall_trace_lock); | 408 | mutex_lock(&syscall_trace_lock); |
| 399 | if (!sys_refcount_exit) | 409 | if (!sys_refcount_exit) |
| @@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
| 411 | int num; | 421 | int num; |
| 412 | 422 | ||
| 413 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 423 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
| 414 | if (num < 0 || num >= NR_syscalls) | 424 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
| 415 | return; | 425 | return; |
| 416 | mutex_lock(&syscall_trace_lock); | 426 | mutex_lock(&syscall_trace_lock); |
| 417 | sys_refcount_exit--; | 427 | sys_refcount_exit--; |
| @@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
| 424 | int init_syscall_trace(struct ftrace_event_call *call) | 434 | int init_syscall_trace(struct ftrace_event_call *call) |
| 425 | { | 435 | { |
| 426 | int id; | 436 | int id; |
| 437 | int num; | ||
| 438 | |||
| 439 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | ||
| 440 | if (num < 0 || num >= NR_syscalls) { | ||
| 441 | pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", | ||
| 442 | ((struct syscall_metadata *)call->data)->name); | ||
| 443 | return -ENOSYS; | ||
| 444 | } | ||
| 427 | 445 | ||
| 428 | if (set_syscall_print_fmt(call) < 0) | 446 | if (set_syscall_print_fmt(call) < 0) |
| 429 | return -ENOMEM; | 447 | return -ENOMEM; |
| @@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
| 438 | return id; | 456 | return id; |
| 439 | } | 457 | } |
| 440 | 458 | ||
| 441 | unsigned long __init arch_syscall_addr(int nr) | 459 | unsigned long __init __weak arch_syscall_addr(int nr) |
| 442 | { | 460 | { |
| 443 | return (unsigned long)sys_call_table[nr]; | 461 | return (unsigned long)sys_call_table[nr]; |
| 444 | } | 462 | } |
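
The default arch_syscall_match_sym_name() introduced above skips the first three characters on both sides, so a wrapper alias such as "SyS_open" still matches "sys_open". A trivial stand-alone version of that comparison, assuming (as the kernel code does) that both strings carry the three-byte prefix:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static bool match_sym_name(const char *sym, const char *name)
    {
            /* Compare only past the 3-byte "sys"/"SyS" prefix. */
            return !strcmp(sym + 3, name + 3);
    }

    int main(void)
    {
            printf("%d\n", match_sym_name("SyS_open", "sys_open"));  /* 1 */
            printf("%d\n", match_sym_name("sys_read", "sys_open"));  /* 0 */
            return 0;
    }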
diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d87..51c6e89e8619 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
| 189 | struct group_info *group_info; | 189 | struct group_info *group_info; |
| 190 | int retval; | 190 | int retval; |
| 191 | 191 | ||
| 192 | if (!capable(CAP_SETGID)) | 192 | if (!nsown_capable(CAP_SETGID)) |
| 193 | return -EPERM; | 193 | return -EPERM; |
| 194 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 194 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 195 | return -EINVAL; | 195 | return -EINVAL; |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
| @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); | |||
| 20 | 20 | ||
| 21 | /* | 21 | /* |
| 22 | * Removes a registered user return notifier. Must be called from atomic | 22 | * Removes a registered user return notifier. Must be called from atomic |
| 23 | * context, and from the same cpu registration occured in. | 23 | * context, and from the same cpu registration occurred in. |
| 24 | */ | 24 | */ |
| 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) | 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) |
| 26 | { | 26 | { |
diff --git a/kernel/user.c b/kernel/user.c index 5c598ca781df..9e03e9c1df8d 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -17,9 +17,13 @@ | |||
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
| 19 | 19 | ||
| 20 | /* | ||
| 21 | * userns count is 1 for root user, 1 for init_uts_ns, | ||
| 22 | * and 1 for... ? | ||
| 23 | */ | ||
| 20 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
| 21 | .kref = { | 25 | .kref = { |
| 22 | .refcount = ATOMIC_INIT(2), | 26 | .refcount = ATOMIC_INIT(3), |
| 23 | }, | 27 | }, |
| 24 | .creator = &root_user, | 28 | .creator = &root_user, |
| 25 | }; | 29 | }; |
| @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; | |||
| 47 | */ | 51 | */ |
| 48 | static DEFINE_SPINLOCK(uidhash_lock); | 52 | static DEFINE_SPINLOCK(uidhash_lock); |
| 49 | 53 | ||
| 50 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ | 54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ |
| 51 | struct user_struct root_user = { | 55 | struct user_struct root_user = { |
| 52 | .__count = ATOMIC_INIT(2), | 56 | .__count = ATOMIC_INIT(2), |
| 53 | .processes = ATOMIC_INIT(1), | 57 | .processes = ATOMIC_INIT(1), |
diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea52..44646179eaba 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
| 15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/user_namespace.h> | ||
| 17 | 18 | ||
| 18 | static struct uts_namespace *create_uts_ns(void) | 19 | static struct uts_namespace *create_uts_ns(void) |
| 19 | { | 20 | { |
| @@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) | |||
| 30 | * @old_ns: namespace to clone | 31 | * @old_ns: namespace to clone |
| 31 | * Return NULL on error (failure to kmalloc), new ns otherwise | 32 | * Return NULL on error (failure to kmalloc), new ns otherwise |
| 32 | */ | 33 | */ |
| 33 | static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | 34 | static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, |
| 35 | struct uts_namespace *old_ns) | ||
| 34 | { | 36 | { |
| 35 | struct uts_namespace *ns; | 37 | struct uts_namespace *ns; |
| 36 | 38 | ||
| @@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
| 40 | 42 | ||
| 41 | down_read(&uts_sem); | 43 | down_read(&uts_sem); |
| 42 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 44 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
| 45 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | ||
| 43 | up_read(&uts_sem); | 46 | up_read(&uts_sem); |
| 44 | return ns; | 47 | return ns; |
| 45 | } | 48 | } |
| @@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
| 50 | * utsname of this process won't be seen by parent, and vice | 53 | * utsname of this process won't be seen by parent, and vice |
| 51 | * versa. | 54 | * versa. |
| 52 | */ | 55 | */ |
| 53 | struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) | 56 | struct uts_namespace *copy_utsname(unsigned long flags, |
| 57 | struct task_struct *tsk) | ||
| 54 | { | 58 | { |
| 59 | struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; | ||
| 55 | struct uts_namespace *new_ns; | 60 | struct uts_namespace *new_ns; |
| 56 | 61 | ||
| 57 | BUG_ON(!old_ns); | 62 | BUG_ON(!old_ns); |
| @@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol | |||
| 60 | if (!(flags & CLONE_NEWUTS)) | 65 | if (!(flags & CLONE_NEWUTS)) |
| 61 | return old_ns; | 66 | return old_ns; |
| 62 | 67 | ||
| 63 | new_ns = clone_uts_ns(old_ns); | 68 | new_ns = clone_uts_ns(tsk, old_ns); |
| 64 | 69 | ||
| 65 | put_uts_ns(old_ns); | 70 | put_uts_ns(old_ns); |
| 66 | return new_ns; | 71 | return new_ns; |
| @@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref) | |||
| 71 | struct uts_namespace *ns; | 76 | struct uts_namespace *ns; |
| 72 | 77 | ||
| 73 | ns = container_of(kref, struct uts_namespace, kref); | 78 | ns = container_of(kref, struct uts_namespace, kref); |
| 79 | put_user_ns(ns->user_ns); | ||
| 74 | kfree(ns); | 80 | kfree(ns); |
| 75 | } | 81 | } |
diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb6cc1e..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
| @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); | |||
| 142 | * woken up through the queue. | 142 | * woken up through the queue. |
| 143 | * | 143 | * |
| 144 | * This prevents waiter starvation where an exclusive waiter | 144 | * This prevents waiter starvation where an exclusive waiter |
| 145 | * aborts and is woken up concurrently and noone wakes up | 145 | * aborts and is woken up concurrently and no one wakes up |
| 146 | * the next waiter. | 146 | * the next waiter. |
| 147 | */ | 147 | */ |
| 148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | 148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c57..14733d4d156b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | |||
| 48 | * Should we panic when a soft-lockup or hard-lockup occurs: | 48 | * Should we panic when a soft-lockup or hard-lockup occurs: |
| 49 | */ | 49 | */ |
| 50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 51 | static int hardlockup_panic; | 51 | static int hardlockup_panic = |
| 52 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 52 | 53 | ||
| 53 | static int __init hardlockup_panic_setup(char *str) | 54 | static int __init hardlockup_panic_setup(char *str) |
| 54 | { | 55 | { |
| 55 | if (!strncmp(str, "panic", 5)) | 56 | if (!strncmp(str, "panic", 5)) |
| 56 | hardlockup_panic = 1; | 57 | hardlockup_panic = 1; |
| 58 | else if (!strncmp(str, "nopanic", 7)) | ||
| 59 | hardlockup_panic = 0; | ||
| 57 | else if (!strncmp(str, "0", 1)) | 60 | else if (!strncmp(str, "0", 1)) |
| 58 | watchdog_enabled = 0; | 61 | watchdog_enabled = 0; |
| 59 | return 1; | 62 | return 1; |
| @@ -415,19 +418,25 @@ static int watchdog_prepare_cpu(int cpu) | |||
| 415 | static int watchdog_enable(int cpu) | 418 | static int watchdog_enable(int cpu) |
| 416 | { | 419 | { |
| 417 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | 420 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); |
| 418 | int err; | 421 | int err = 0; |
| 419 | 422 | ||
| 420 | /* enable the perf event */ | 423 | /* enable the perf event */ |
| 421 | err = watchdog_nmi_enable(cpu); | 424 | err = watchdog_nmi_enable(cpu); |
| 422 | if (err) | 425 | |
| 423 | return err; | 426 | /* Regardless of err above, fall through and start softlockup */ |
| 424 | 427 | ||
| 425 | /* create the watchdog thread */ | 428 | /* create the watchdog thread */ |
| 426 | if (!p) { | 429 | if (!p) { |
| 427 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 430 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); |
| 428 | if (IS_ERR(p)) { | 431 | if (IS_ERR(p)) { |
| 429 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 432 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
| 430 | return PTR_ERR(p); | 433 | if (!err) { |
| 434 | /* if hardlockup hasn't already set this */ | ||
| 435 | err = PTR_ERR(p); | ||
| 436 | /* and disable the perf event */ | ||
| 437 | watchdog_nmi_disable(cpu); | ||
| 438 | } | ||
| 439 | goto out; | ||
| 431 | } | 440 | } |
| 432 | kthread_bind(p, cpu); | 441 | kthread_bind(p, cpu); |
| 433 | per_cpu(watchdog_touch_ts, cpu) = 0; | 442 | per_cpu(watchdog_touch_ts, cpu) = 0; |
| @@ -435,7 +444,8 @@ static int watchdog_enable(int cpu) | |||
| 435 | wake_up_process(p); | 444 | wake_up_process(p); |
| 436 | } | 445 | } |
| 437 | 446 | ||
| 438 | return 0; | 447 | out: |
| 448 | return err; | ||
| 439 | } | 449 | } |
| 440 | 450 | ||
| 441 | static void watchdog_disable(int cpu) | 451 | static void watchdog_disable(int cpu) |
| @@ -547,7 +557,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 547 | break; | 557 | break; |
| 548 | #endif /* CONFIG_HOTPLUG_CPU */ | 558 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 549 | } | 559 | } |
| 550 | return notifier_from_errno(err); | 560 | |
| 561 | /* | ||
| 562 | * hardlockup and softlockup are not important enough | ||
| 563 | * to block cpu bring up. Just always succeed and | ||
| 564 | * rely on printk output to flag problems. | ||
| 565 | */ | ||
| 566 | return NOTIFY_OK; | ||
| 551 | } | 567 | } |
| 552 | 568 | ||
| 553 | static struct notifier_block __cpuinitdata cpu_nfb = { | 569 | static struct notifier_block __cpuinitdata cpu_nfb = { |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578ad..e3378e8d3a5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly; | |||
| 251 | struct workqueue_struct *system_long_wq __read_mostly; | 251 | struct workqueue_struct *system_long_wq __read_mostly; |
| 252 | struct workqueue_struct *system_nrt_wq __read_mostly; | 252 | struct workqueue_struct *system_nrt_wq __read_mostly; |
| 253 | struct workqueue_struct *system_unbound_wq __read_mostly; | 253 | struct workqueue_struct *system_unbound_wq __read_mostly; |
| 254 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
| 254 | EXPORT_SYMBOL_GPL(system_wq); | 255 | EXPORT_SYMBOL_GPL(system_wq); |
| 255 | EXPORT_SYMBOL_GPL(system_long_wq); | 256 | EXPORT_SYMBOL_GPL(system_long_wq); |
| 256 | EXPORT_SYMBOL_GPL(system_nrt_wq); | 257 | EXPORT_SYMBOL_GPL(system_nrt_wq); |
| 257 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 258 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
| 259 | EXPORT_SYMBOL_GPL(system_freezable_wq); | ||
| 258 | 260 | ||
| 259 | #define CREATE_TRACE_POINTS | 261 | #define CREATE_TRACE_POINTS |
| 260 | #include <trace/events/workqueue.h> | 262 | #include <trace/events/workqueue.h> |
| @@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | |||
| 316 | 318 | ||
| 317 | static struct debug_obj_descr work_debug_descr; | 319 | static struct debug_obj_descr work_debug_descr; |
| 318 | 320 | ||
| 321 | static void *work_debug_hint(void *addr) | ||
| 322 | { | ||
| 323 | return ((struct work_struct *) addr)->func; | ||
| 324 | } | ||
| 325 | |||
| 319 | /* | 326 | /* |
| 320 | * fixup_init is called when: | 327 | * fixup_init is called when: |
| 321 | * - an active object is initialized | 328 | * - an active object is initialized |
| @@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) | |||
| 387 | 394 | ||
| 388 | static struct debug_obj_descr work_debug_descr = { | 395 | static struct debug_obj_descr work_debug_descr = { |
| 389 | .name = "work_struct", | 396 | .name = "work_struct", |
| 397 | .debug_hint = work_debug_hint, | ||
| 390 | .fixup_init = work_fixup_init, | 398 | .fixup_init = work_fixup_init, |
| 391 | .fixup_activate = work_fixup_activate, | 399 | .fixup_activate = work_fixup_activate, |
| 392 | .fixup_free = work_fixup_free, | 400 | .fixup_free = work_fixup_free, |
| @@ -1283,8 +1291,14 @@ __acquires(&gcwq->lock) | |||
| 1283 | return true; | 1291 | return true; |
| 1284 | spin_unlock_irq(&gcwq->lock); | 1292 | spin_unlock_irq(&gcwq->lock); |
| 1285 | 1293 | ||
| 1286 | /* CPU has come up inbetween, retry migration */ | 1294 | /* |
| 1295 | * We've raced with CPU hot[un]plug. Give it a breather | ||
| 1296 | * and retry migration. cond_resched() is required here; | ||
| 1297 | * otherwise, we might deadlock against cpu_stop trying to | ||
| 1298 | * bring down the CPU on non-preemptive kernel. | ||
| 1299 | */ | ||
| 1287 | cpu_relax(); | 1300 | cpu_relax(); |
| 1301 | cond_resched(); | ||
| 1288 | } | 1302 | } |
| 1289 | } | 1303 | } |
| 1290 | 1304 | ||
| @@ -1358,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | |||
| 1358 | worker->id = id; | 1372 | worker->id = id; |
| 1359 | 1373 | ||
| 1360 | if (!on_unbound_cpu) | 1374 | if (!on_unbound_cpu) |
| 1361 | worker->task = kthread_create(worker_thread, worker, | 1375 | worker->task = kthread_create_on_node(worker_thread, |
| 1362 | "kworker/%u:%d", gcwq->cpu, id); | 1376 | worker, |
| 1377 | cpu_to_node(gcwq->cpu), | ||
| 1378 | "kworker/%u:%d", gcwq->cpu, id); | ||
| 1363 | else | 1379 | else |
| 1364 | worker->task = kthread_create(worker_thread, worker, | 1380 | worker->task = kthread_create(worker_thread, worker, |
| 1365 | "kworker/u:%d", id); | 1381 | "kworker/u:%d", id); |
| @@ -3775,8 +3791,10 @@ static int __init init_workqueues(void) | |||
| 3775 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | 3791 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); |
| 3776 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3792 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
| 3777 | WQ_UNBOUND_MAX_ACTIVE); | 3793 | WQ_UNBOUND_MAX_ACTIVE); |
| 3794 | system_freezable_wq = alloc_workqueue("events_freezable", | ||
| 3795 | WQ_FREEZABLE, 0); | ||
| 3778 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || | 3796 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || |
| 3779 | !system_unbound_wq); | 3797 | !system_unbound_wq || !system_freezable_wq); |
| 3780 | return 0; | 3798 | return 0; |
| 3781 | } | 3799 | } |
| 3782 | early_initcall(init_workqueues); | 3800 | early_initcall(init_workqueues); |
