author		Frederic Weisbecker <fweisbec@gmail.com>	2010-05-12 17:19:01 -0400
committer	Frederic Weisbecker <fweisbec@gmail.com>	2010-05-12 17:20:33 -0400
commit		a9aa1d02de36b450990b0e25a88fc2ff1c3e6b94 (patch)
tree		1f9d19f1642d263e65906a916a48be9339accc73 /kernel
parent		5671a10e2bc7f99d9157c6044faf8be2ef302361 (diff)
parent		b57f95a38233a2e73b679bea4a5453a1cc2a1cc9 (diff)
Merge commit 'v2.6.34-rc7' into perf/nmi
Merge reason: catch up with latest softlockup detector changes.
Diffstat (limited to 'kernel')
125 files changed, 7261 insertions, 4172 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 8a5abe53ebad..d5c30060ac14 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o range.o
+obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -91,6 +92,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
@@ -101,6 +105,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
 }
 
 /**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
-	memset(pacct, 0, sizeof(struct pacct_struct));
-	pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-
-/**
  * acct_collect - collect accounting information into pacct_struct
  * @exitcode: task exit code
  * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/kthread.h>
 
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	skb_get(skb);
 	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
 	if (err < 0) {
-		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
 		audit_log_lost("auditd dissapeared\n");
 		audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 
 struct audit_tree;
 struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 	return 0;
 }
 
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+	return mnt->mnt_root->d_inode == arg;
+}
+
 void audit_trim_trees(void)
 {
 	struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
 		struct path path;
 		struct vfsmount *root_mnt;
 		struct node *node;
-		struct list_head list;
 		int err;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
 		if (!root_mnt)
 			goto skip_it;
 
-		list_add_tail(&list, &root_mnt->mnt_list);
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct audit_chunk *chunk = find_chunk(node);
-			struct inode *inode = chunk->watch.inode;
-			struct vfsmount *mnt;
+			struct inode *inode = find_chunk(node)->watch.inode;
 			node->index |= 1U<<31;
-			list_for_each_entry(mnt, &list, mnt_list) {
-				if (mnt->mnt_root->d_inode == inode) {
-					node->index &= ~(1U<<31);
-					break;
-				}
-			}
+			if (iterate_mounts(compare_root, inode, root_mnt))
+				node->index &= ~(1U<<31);
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
 		put_tree(tree);
-		list_del_init(&list);
 		drop_collected_mounts(root_mnt);
 skip_it:
 		mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
-		    struct path *path)
-{
-	if (mnt != path->mnt) {
-		for (;;) {
-			if (mnt->mnt_parent == mnt)
-				return 0;
-			if (mnt->mnt_parent == path->mnt)
-				break;
-			mnt = mnt->mnt_parent;
-		}
-		dentry = mnt->mnt_mountpoint;
-	}
-	return is_subdir(dentry, path->dentry);
-}
-
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 {
 
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+	return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
 /* called with audit_filter_mutex */
 int audit_add_tree_rule(struct audit_krule *rule)
 {
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
-	struct vfsmount *mnt, *p;
-	struct list_head list;
+	struct vfsmount *mnt;
 	int err;
 
 	list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		err = -ENOMEM;
 		goto Err;
 	}
-	list_add_tail(&list, &mnt->mnt_list);
 
 	get_tree(tree);
-	list_for_each_entry(p, &list, mnt_list) {
-		err = tag_chunk(p->mnt_root->d_inode, tree);
-		if (err)
-			break;
-	}
-
-	list_del(&list);
+	err = iterate_mounts(tag_mount, tree, mnt);
 	drop_collected_mounts(mnt);
 
 	if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
 {
 	struct list_head cursor, barrier;
 	int failed = 0;
-	struct path path;
+	struct path path1, path2;
 	struct vfsmount *tagged;
-	struct list_head list;
-	struct vfsmount *mnt;
-	struct dentry *dentry;
 	int err;
 
-	err = kern_path(new, 0, &path);
+	err = kern_path(new, 0, &path2);
 	if (err)
 		return err;
-	tagged = collect_mounts(&path);
-	path_put(&path);
+	tagged = collect_mounts(&path2);
+	path_put(&path2);
 	if (!tagged)
 		return -ENOMEM;
 
-	err = kern_path(old, 0, &path);
+	err = kern_path(old, 0, &path1);
 	if (err) {
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(path.mnt);
-	dentry = dget(path.dentry);
-	path_put(&path);
-
-	list_add_tail(&list, &tagged->mnt_list);
 
 	mutex_lock(&audit_filter_mutex);
 	list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
 
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
-		struct vfsmount *p;
+		int good_one = 0;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
 		get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
 		list_add(&cursor, &tree->list);
 		mutex_unlock(&audit_filter_mutex);
 
-		err = kern_path(tree->pathname, 0, &path);
-		if (err) {
-			put_tree(tree);
-			mutex_lock(&audit_filter_mutex);
-			continue;
+		err = kern_path(tree->pathname, 0, &path2);
+		if (!err) {
+			good_one = path_is_under(&path1, &path2);
+			path_put(&path2);
 		}
 
-		spin_lock(&vfsmount_lock);
-		if (!is_under(mnt, dentry, &path)) {
-			spin_unlock(&vfsmount_lock);
-			path_put(&path);
+		if (!good_one) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
-		spin_unlock(&vfsmount_lock);
-		path_put(&path);
-
-		list_for_each_entry(p, &list, mnt_list) {
-			failed = tag_chunk(p->mnt_root->d_inode, tree);
-			if (failed)
-				break;
-		}
 
+		failed = iterate_mounts(tag_mount, tree, tagged);
 		if (failed) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
 	}
 	list_del(&barrier);
 	list_del(&cursor);
-	list_del(&list);
 	mutex_unlock(&audit_filter_mutex);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path1);
 	drop_collected_mounts(tagged);
 	return failed;
 }
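The audit_tree.c hunks above all make the same conversion: open-coded walks over a collected mount set via mnt_list are replaced by the new iterate_mounts() helper plus a small callback (compare_root(), tag_mount()). A minimal sketch of the callback contract this assumes, per the 2.6.34-era iterate_mounts(f, arg, root) helper: 'arg' is an opaque cookie passed through from the caller, and a non-zero return aborts the walk and is propagated back. The callback name below is hypothetical:

	/* Same shape as compare_root()/tag_mount() above. */
	static int mount_has_root(struct vfsmount *mnt, void *arg)
	{
		struct inode *inode = arg;	/* caller-supplied cookie */

		/* non-zero return value stops the walk early */
		return mnt->mnt_root->d_inode == inode;
	}

	/* usage, mirroring audit_trim_trees():
	 *	if (iterate_mounts(mount_has_root, inode, root_mnt))
	 *		... some collected mount is rooted at inode ...
	 */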
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/security.h>
 #include "audit.h"
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
 #include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
 {
 	if (context->name_count >= AUDIT_NAMES) {
 		if (inode)
-			printk(KERN_DEBUG "name_count maxed, losing inode data: "
+			printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
 			       "dev=%02x:%02x, inode=%lu\n",
 			       MAJOR(inode->i_sb->s_dev),
 			       MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 
 /**
  * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
  * @dentry: dentry being audited
  * @parent: inode of dentry parent
  *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
  * must be hooked prior, in order to capture the target inode during
  * unsuccessful attempts.
  */
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
+void __audit_inode_child(const struct dentry *dentry,
 			 const struct inode *parent)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
 	const char *found_parent = NULL, *found_child = NULL;
 	const struct inode *inode = dentry->d_inode;
+	const char *dname = dentry->d_name.name;
 	int dirlen = 0;
 
 	if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 
 	if (inode)
 		handle_one(inode);
-	/* determine matching parent */
-	if (!dname)
-		goto add_names;
 
 	/* parent is more likely, look for it first */
 	for (idx = 0; idx < context->name_count; idx++) {
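Since __audit_inode_child() now takes dname from the dentry itself, callers stop passing the name separately, and the NULL-dname special case ("goto add_names") goes away with it. A hedged sketch of the call-site change; the audit_inode_child() wrapper shape is assumed here, since the fs-side hook updates are outside this kernel/-only diffstat:

	/* before: name passed alongside the dentry, could be NULL */
	audit_inode_child(dentry->d_name.name, dentry, dir);

	/* after: one argument fewer; the name is always dentry->d_name.name */
	audit_inode_child(dentry, dir);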
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 
 		target = find_task_by_vpid(pid);
 		if (!target)
@@ -143,7 +143,7 @@
 		else
 			ret = security_capget(target, pEp, pIp, pPp);
 
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
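The capability.c change swaps the reader side of the global tasklist_lock for an RCU read-side critical section: find_task_by_vpid() looks up the RCU-protected pid hash, and security_capget() only reads the target's credentials, so pinning the task with rcu_read_lock() is sufficient and avoids contending the tasklist_lock. Condensed, the resulting pattern is:

	rcu_read_lock();			/* keeps the task valid over the lookup */
	target = find_task_by_vpid(pid);	/* RCU-safe pid-hash lookup */
	if (!target)
		ret = -ESRCH;
	else
		ret = security_capget(target, pEp, pIp, pPp);
	rcu_read_unlock();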
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3a53c771e503 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
  * Based originally on the cpuset system, extracted by Paul Menage
  * Copyright (C) 2006 Google, Inc
  *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
  * Copyright notices from the original cpuset code:
  * --------------------------------------------------
  * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -51,15 +56,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 
 #include <asm/atomic.h>
 
 static DEFINE_MUTEX(cgroup_mutex);
 
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
@@ -146,6 +157,35 @@ struct css_id {
 	unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
+/*
+ * cgroup_event represents events which userspace want to recieve.
+ */
+struct cgroup_event {
+	/*
+	 * Cgroup which the event belongs to.
+	 */
+	struct cgroup *cgrp;
+	/*
+	 * Control file which the event associated.
+	 */
+	struct cftype *cft;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * All fields below needed to unregister event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	struct work_struct remove;
+};
 
 /* The list of hierarchy roots */
 
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+#ifdef CONFIG_PROVE_LOCKING
+int cgroup_lock_is_held(void)
+{
+	return lockdep_is_held(&cgroup_mutex);
+}
+#else /* #ifdef CONFIG_PROVE_LOCKING */
+int cgroup_lock_is_held(void)
+{
+	return mutex_is_locked(&cgroup_mutex);
+}
+#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
+
+EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
171 | { | 225 | { |
@@ -235,7 +289,8 @@ struct cg_cgroup_link { | |||
235 | static struct css_set init_css_set; | 289 | static struct css_set init_css_set; |
236 | static struct cg_cgroup_link init_css_set_link; | 290 | static struct cg_cgroup_link init_css_set_link; |
237 | 291 | ||
238 | static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); | 292 | static int cgroup_init_idr(struct cgroup_subsys *ss, |
293 | struct cgroup_subsys_state *css); | ||
239 | 294 | ||
240 | /* css_set_lock protects the list of css_set objects, and the | 295 | /* css_set_lock protects the list of css_set objects, and the |
241 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 296 | * chain of tasks off each css_set. Nests outside task->alloc_lock |
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set( | |||
433 | struct hlist_node *node; | 488 | struct hlist_node *node; |
434 | struct css_set *cg; | 489 | struct css_set *cg; |
435 | 490 | ||
436 | /* Built the set of subsystem state objects that we want to | 491 | /* |
437 | * see in the new css_set */ | 492 | * Build the set of subsystem state objects that we want to see in the |
493 | * new css_set. while subsystems can change globally, the entries here | ||
494 | * won't change, so no need for locking. | ||
495 | */ | ||
438 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 496 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
439 | if (root->subsys_bits & (1UL << i)) { | 497 | if (root->subsys_bits & (1UL << i)) { |
440 | /* Subsystem is in this hierarchy. So we want | 498 | /* Subsystem is in this hierarchy. So we want |
@@ -681,6 +739,7 @@ void cgroup_lock(void) | |||
681 | { | 739 | { |
682 | mutex_lock(&cgroup_mutex); | 740 | mutex_lock(&cgroup_mutex); |
683 | } | 741 | } |
742 | EXPORT_SYMBOL_GPL(cgroup_lock); | ||
684 | 743 | ||
685 | /** | 744 | /** |
686 | * cgroup_unlock - release lock on cgroup changes | 745 | * cgroup_unlock - release lock on cgroup changes |
@@ -691,6 +750,7 @@ void cgroup_unlock(void) | |||
691 | { | 750 | { |
692 | mutex_unlock(&cgroup_mutex); | 751 | mutex_unlock(&cgroup_mutex); |
693 | } | 752 | } |
753 | EXPORT_SYMBOL_GPL(cgroup_unlock); | ||
694 | 754 | ||
695 | /* | 755 | /* |
696 | * A couple of forward declarations required, due to cyclic reference loop: | 756 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
742 | if (ret) | 802 | if (ret) |
743 | break; | 803 | break; |
744 | } | 804 | } |
805 | |||
745 | return ret; | 806 | return ret; |
746 | } | 807 | } |
747 | 808 | ||
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | |||
869 | css_put(css); | 930 | css_put(css); |
870 | } | 931 | } |
871 | 932 | ||
872 | 933 | /* | |
934 | * Call with cgroup_mutex held. Drops reference counts on modules, including | ||
935 | * any duplicate ones that parse_cgroupfs_options took. If this function | ||
936 | * returns an error, no reference counts are touched. | ||
937 | */ | ||
873 | static int rebind_subsystems(struct cgroupfs_root *root, | 938 | static int rebind_subsystems(struct cgroupfs_root *root, |
874 | unsigned long final_bits) | 939 | unsigned long final_bits) |
875 | { | 940 | { |
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
877 | struct cgroup *cgrp = &root->top_cgroup; | 942 | struct cgroup *cgrp = &root->top_cgroup; |
878 | int i; | 943 | int i; |
879 | 944 | ||
945 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | ||
946 | |||
880 | removed_bits = root->actual_subsys_bits & ~final_bits; | 947 | removed_bits = root->actual_subsys_bits & ~final_bits; |
881 | added_bits = final_bits & ~root->actual_subsys_bits; | 948 | added_bits = final_bits & ~root->actual_subsys_bits; |
882 | /* Check that any added subsystems are currently free */ | 949 | /* Check that any added subsystems are currently free */ |
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
885 | struct cgroup_subsys *ss = subsys[i]; | 952 | struct cgroup_subsys *ss = subsys[i]; |
886 | if (!(bit & added_bits)) | 953 | if (!(bit & added_bits)) |
887 | continue; | 954 | continue; |
955 | /* | ||
956 | * Nobody should tell us to do a subsys that doesn't exist: | ||
957 | * parse_cgroupfs_options should catch that case and refcounts | ||
958 | * ensure that subsystems won't disappear once selected. | ||
959 | */ | ||
960 | BUG_ON(ss == NULL); | ||
888 | if (ss->root != &rootnode) { | 961 | if (ss->root != &rootnode) { |
889 | /* Subsystem isn't free */ | 962 | /* Subsystem isn't free */ |
890 | return -EBUSY; | 963 | return -EBUSY; |
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
904 | unsigned long bit = 1UL << i; | 977 | unsigned long bit = 1UL << i; |
905 | if (bit & added_bits) { | 978 | if (bit & added_bits) { |
906 | /* We're binding this subsystem to this hierarchy */ | 979 | /* We're binding this subsystem to this hierarchy */ |
980 | BUG_ON(ss == NULL); | ||
907 | BUG_ON(cgrp->subsys[i]); | 981 | BUG_ON(cgrp->subsys[i]); |
908 | BUG_ON(!dummytop->subsys[i]); | 982 | BUG_ON(!dummytop->subsys[i]); |
909 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); | 983 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); |
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
915 | if (ss->bind) | 989 | if (ss->bind) |
916 | ss->bind(ss, cgrp); | 990 | ss->bind(ss, cgrp); |
917 | mutex_unlock(&ss->hierarchy_mutex); | 991 | mutex_unlock(&ss->hierarchy_mutex); |
992 | /* refcount was already taken, and we're keeping it */ | ||
918 | } else if (bit & removed_bits) { | 993 | } else if (bit & removed_bits) { |
919 | /* We're removing this subsystem */ | 994 | /* We're removing this subsystem */ |
995 | BUG_ON(ss == NULL); | ||
920 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 996 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
921 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 997 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); |
922 | mutex_lock(&ss->hierarchy_mutex); | 998 | mutex_lock(&ss->hierarchy_mutex); |
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
927 | subsys[i]->root = &rootnode; | 1003 | subsys[i]->root = &rootnode; |
928 | list_move(&ss->sibling, &rootnode.subsys_list); | 1004 | list_move(&ss->sibling, &rootnode.subsys_list); |
929 | mutex_unlock(&ss->hierarchy_mutex); | 1005 | mutex_unlock(&ss->hierarchy_mutex); |
1006 | /* subsystem is now free - drop reference on module */ | ||
1007 | module_put(ss->module); | ||
930 | } else if (bit & final_bits) { | 1008 | } else if (bit & final_bits) { |
931 | /* Subsystem state should already exist */ | 1009 | /* Subsystem state should already exist */ |
1010 | BUG_ON(ss == NULL); | ||
932 | BUG_ON(!cgrp->subsys[i]); | 1011 | BUG_ON(!cgrp->subsys[i]); |
1012 | /* | ||
1013 | * a refcount was taken, but we already had one, so | ||
1014 | * drop the extra reference. | ||
1015 | */ | ||
1016 | module_put(ss->module); | ||
1017 | #ifdef CONFIG_MODULE_UNLOAD | ||
1018 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
1019 | #endif | ||
933 | } else { | 1020 | } else { |
934 | /* Subsystem state shouldn't exist */ | 1021 | /* Subsystem state shouldn't exist */ |
935 | BUG_ON(cgrp->subsys[i]); | 1022 | BUG_ON(cgrp->subsys[i]); |
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
 
 };
 
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
-				     struct cgroup_sb_opts *opts)
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
 	unsigned long mask = (unsigned long)-1;
+	int i;
+	bool module_pin_failed = false;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
 		return -EINVAL;
 	if (!strcmp(token, "all")) {
 		/* Add all non-disabled subsystems */
-		int i;
 		opts->subsys_bits = 0;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
+			if (ss == NULL)
+				continue;
 			if (!ss->disabled)
 				opts->subsys_bits |= 1ul << i;
 		}
@@ -1011,7 +1106,6 @@
 			if (!opts->release_agent)
 				return -ENOMEM;
 		} else if (!strncmp(token, "name=", 5)) {
-			int i;
 			const char *name = token + 5;
 			/* Can't specify an empty name */
 			if (!strlen(name))
@@ -1035,9 +1129,10 @@
 				return -ENOMEM;
 		} else {
 			struct cgroup_subsys *ss;
-			int i;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!strcmp(token, ss->name)) {
 					if (!ss->disabled)
 						set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@
 	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
+	/*
+	 * Grab references on all the modules we'll need, so the subsystems
+	 * don't dance around before rebind_subsystems attaches them. This may
+	 * take duplicate reference counts on a subsystem that's already used,
+	 * but rebind_subsystems handles this case.
+	 */
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & opts->subsys_bits))
+			continue;
+		if (!try_module_get(subsys[i]->module)) {
+			module_pin_failed = true;
+			break;
+		}
+	}
+	if (module_pin_failed) {
+		/*
+		 * oops, one of the modules was going away. this means that we
+		 * raced with a module_delete call, and to the user this is
+		 * essentially a "subsystem doesn't exist" case.
+		 */
+		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+			/* drop refcounts only on the ones we took */
+			unsigned long bit = 1UL << i;
+
+			if (!(bit & opts->subsys_bits))
+				continue;
+			module_put(subsys[i]->module);
+		}
+		return -ENOENT;
+	}
+
 	return 0;
 }
 
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+	int i;
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & subsys_bits))
+			continue;
+		module_put(subsys[i]->module);
+	}
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
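The refcount discipline this hunk introduces: a successful parse_cgroupfs_options() pins one module reference per requested modular subsystem, rebind_subsystems() then either keeps or drops each pin, and any caller path that bails out before (or after a failed) rebind must call drop_parsed_module_refcounts() itself. A condensed sketch of that caller contract, not a literal excerpt, matching the cgroup_get_sb() and cgroup_remount() hunks that follow:

	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);	/* pins modules on success */
	mutex_unlock(&cgroup_mutex);
	if (ret)
		return ret;			/* error path: nothing was pinned */

	ret = rebind_subsystems(root, opts.subsys_bits);
	if (ret)
		/* rebind didn't take ownership, so hand the pins back */
		drop_parsed_module_refcounts(opts.subsys_bits);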
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	/* Don't allow flags to change at remount */
-	if (opts.flags != root->flags) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Don't allow name to change at remount */
-	if (opts.name && strcmp(opts.name, root->name)) {
+	/* Don't allow flags or name to change at remount */
+	if (opts.flags != root->flags ||
+	    (opts.name && strcmp(opts.name, root->name))) {
 		ret = -EINVAL;
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
-	if (ret)
+	if (ret) {
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
+	}
 
 	/* (re)populate subsystem files */
 	cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	INIT_LIST_HEAD(&cgrp->event_list);
+	spin_lock_init(&cgrp->event_list_lock);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
+	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
+	mutex_unlock(&cgroup_mutex);
 	if (ret)
 		goto out_err;
 
@@ -1302,7 +1444,7 @@
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 	opts.new_root = new_root;
 
@@ -1311,7 +1453,7 @@
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 
 	root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@
 			free_cg_links(&tmp_cg_links);
 			goto drop_new_super;
 		}
+		/*
+		 * There must be no failure case after here, since rebinding
+		 * takes care of subsystems' refcounts, which are explicitly
+		 * dropped in the failure exit path.
+		 */
 
 		/* EBUSY should be the only error here */
 		BUG_ON(ret);
@@ -1405,6 +1552,8 @@
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+		/* no subsys rebinding, so refcounts don't change */
+		drop_parsed_module_refcounts(opts.subsys_bits);
 	}
 
 	simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@
 
 drop_new_super:
 	deactivate_locked_super(sb);
+drop_modules:
+	drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
-	struct dentry *dentry = rcu_dereference(cgrp->dentry);
+	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+						      rcu_read_lock_held() ||
+						      cgroup_lock_is_held());
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	*--start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
+
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
-		memcpy(start, cgrp->dentry->d_name.name, len);
+		memcpy(start, dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
-		dentry = rcu_dereference(cgrp->dentry);
+
+		dentry = rcu_dereference_check(cgrp->dentry,
+					       rcu_read_lock_held() ||
+					       cgroup_lock_is_held());
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	memmove(buf, start, buf + buflen - start);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct css_set *cg;
 	struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk, false);
-			if (retval)
-				return retval;
+			if (retval) {
+				/*
+				 * Remember on which subsystem the can_attach()
+				 * failed, so that we only call cancel_attach()
+				 * against the subsystems whose can_attach()
+				 * succeeded. (See below)
+				 */
+				failed_ss = ss;
+				goto out;
+			}
 		}
 	}
 
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 */
 	newcg = find_css_set(cg, cgrp);
 	put_css_set(cg);
-	if (!newcg)
-		return -ENOMEM;
+	if (!newcg) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		put_css_set(newcg);
-		return -ESRCH;
+		retval = -ESRCH;
+		goto out;
 	}
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * is no longer empty.
 	 */
 	cgroup_wakeup_rmdir_waiter(cgrp);
-	return 0;
+out:
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss)
+				/*
+				 * This subsystem was the one that failed the
+				 * can_attach() check earlier, so we don't need
+				 * to call cancel_attach() against it or any
+				 * remaining subsystems.
+				 */
+				break;
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, tsk, false);
+		}
+	}
+	return retval;
 }
 
 /*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
 	}
 	return true;
 }
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
+/*
+ * Check if a file is a control file
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+		return ERR_PTR(-EINVAL);
+	return __d_cft(file->f_dentry);
+}
+
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			      struct super_block *sb)
 {
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 		error = PTR_ERR(dentry);
 	return error;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_file);
 
 int cgroup_add_files(struct cgroup *cgrp,
 			struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_files);
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	struct pid_namespace *ns = current->nsproxy->pid_ns;
+
 	/*
 	 * We can't drop the pidlist_mutex before taking the l->mutex in case
 	 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@
 	mutex_lock(&cgrp->pidlist_mutex);
 	list_for_each_entry(l, &cgrp->pidlists, links) {
 		if (l->key.type == type && l->key.ns == ns) {
-			/* found a matching list - drop the extra refcount */
-			put_pid_ns(ns);
 			/* make sure l doesn't vanish out from under us */
 			down_write(&l->mutex);
 			mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@
 	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
 	if (!l) {
 		mutex_unlock(&cgrp->pidlist_mutex);
-		put_pid_ns(ns);
 		return l;
 	}
 	init_rwsem(&l->mutex);
 	down_write(&l->mutex);
 	l->key.type = type;
-	l->key.ns = ns;
+	l->key.ns = get_pid_ns(ns);
 	l->use_count = 0; /* don't increment here */
 	l->list = NULL;
 	l->owner = cgrp;
@@ -2789,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, | |||
2789 | } | 2984 | } |
2790 | 2985 | ||
2791 | /* | 2986 | /* |
2987 | * Unregister event and free resources. | ||
2988 | * | ||
2989 | * Gets called from workqueue. | ||
2990 | */ | ||
2991 | static void cgroup_event_remove(struct work_struct *work) | ||
2992 | { | ||
2993 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
2994 | remove); | ||
2995 | struct cgroup *cgrp = event->cgrp; | ||
2996 | |||
2997 | /* TODO: check return code */ | ||
2998 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | ||
2999 | |||
3000 | eventfd_ctx_put(event->eventfd); | ||
3001 | kfree(event); | ||
3002 | dput(cgrp->dentry); | ||
3003 | } | ||
3004 | |||
3005 | /* | ||
3006 | * Gets called on POLLHUP on eventfd when user closes it. | ||
3007 | * | ||
3008 | * Called with wqh->lock held and interrupts disabled. | ||
3009 | */ | ||
3010 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
3011 | int sync, void *key) | ||
3012 | { | ||
3013 | struct cgroup_event *event = container_of(wait, | ||
3014 | struct cgroup_event, wait); | ||
3015 | struct cgroup *cgrp = event->cgrp; | ||
3016 | unsigned long flags = (unsigned long)key; | ||
3017 | |||
3018 | if (flags & POLLHUP) { | ||
3019 | remove_wait_queue_locked(event->wqh, &event->wait); | ||
3020 | spin_lock(&cgrp->event_list_lock); | ||
3021 | list_del(&event->list); | ||
3022 | spin_unlock(&cgrp->event_list_lock); | ||
3023 | /* | ||
3024 | * We are in atomic context, but cgroup_event_remove() may | ||
3025 | * sleep, so we have to call it in workqueue. | ||
3026 | */ | ||
3027 | schedule_work(&event->remove); | ||
3028 | } | ||
3029 | |||
3030 | return 0; | ||
3031 | } | ||
3032 | |||
3033 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
3034 | wait_queue_head_t *wqh, poll_table *pt) | ||
3035 | { | ||
3036 | struct cgroup_event *event = container_of(pt, | ||
3037 | struct cgroup_event, pt); | ||
3038 | |||
3039 | event->wqh = wqh; | ||
3040 | add_wait_queue(wqh, &event->wait); | ||
3041 | } | ||
3042 | |||
3043 | /* | ||
3044 | * Parse input and register new cgroup event handler. | ||
3045 | * | ||
3046 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
3047 | * Interpretation of args is defined by control file implementation. | ||
3048 | */ | ||
3049 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | ||
3050 | const char *buffer) | ||
3051 | { | ||
3052 | struct cgroup_event *event = NULL; | ||
3053 | unsigned int efd, cfd; | ||
3054 | struct file *efile = NULL; | ||
3055 | struct file *cfile = NULL; | ||
3056 | char *endp; | ||
3057 | int ret; | ||
3058 | |||
3059 | efd = simple_strtoul(buffer, &endp, 10); | ||
3060 | if (*endp != ' ') | ||
3061 | return -EINVAL; | ||
3062 | buffer = endp + 1; | ||
3063 | |||
3064 | cfd = simple_strtoul(buffer, &endp, 10); | ||
3065 | if ((*endp != ' ') && (*endp != '\0')) | ||
3066 | return -EINVAL; | ||
3067 | buffer = endp + 1; | ||
3068 | |||
3069 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
3070 | if (!event) | ||
3071 | return -ENOMEM; | ||
3072 | event->cgrp = cgrp; | ||
3073 | INIT_LIST_HEAD(&event->list); | ||
3074 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
3075 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
3076 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
3077 | |||
3078 | efile = eventfd_fget(efd); | ||
3079 | if (IS_ERR(efile)) { | ||
3080 | ret = PTR_ERR(efile); | ||
3081 | goto fail; | ||
3082 | } | ||
3083 | |||
3084 | event->eventfd = eventfd_ctx_fileget(efile); | ||
3085 | if (IS_ERR(event->eventfd)) { | ||
3086 | ret = PTR_ERR(event->eventfd); | ||
3087 | goto fail; | ||
3088 | } | ||
3089 | |||
3090 | cfile = fget(cfd); | ||
3091 | if (!cfile) { | ||
3092 | ret = -EBADF; | ||
3093 | goto fail; | ||
3094 | } | ||
3095 | |||
3096 | /* the process needs read permission on the control file */ | ||
3097 | ret = file_permission(cfile, MAY_READ); | ||
3098 | if (ret < 0) | ||
3099 | goto fail; | ||
3100 | |||
3101 | event->cft = __file_cft(cfile); | ||
3102 | if (IS_ERR(event->cft)) { | ||
3103 | ret = PTR_ERR(event->cft); | ||
3104 | goto fail; | ||
3105 | } | ||
3106 | |||
3107 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
3108 | ret = -EINVAL; | ||
3109 | goto fail; | ||
3110 | } | ||
3111 | |||
3112 | ret = event->cft->register_event(cgrp, event->cft, | ||
3113 | event->eventfd, buffer); | ||
3114 | if (ret) | ||
3115 | goto fail; | ||
3116 | |||
3117 | if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { | ||
3118 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | ||
3119 | ret = 0; | ||
3120 | goto fail; | ||
3121 | } | ||
3122 | |||
3123 | /* | ||
3124 | * Events should be removed after rmdir of the cgroup directory, but before | ||
3125 | * destroying subsystem state objects. Let's take a reference to the cgroup | ||
3126 | * directory dentry to do that. | ||
3127 | */ | ||
3128 | dget(cgrp->dentry); | ||
3129 | |||
3130 | spin_lock(&cgrp->event_list_lock); | ||
3131 | list_add(&event->list, &cgrp->event_list); | ||
3132 | spin_unlock(&cgrp->event_list_lock); | ||
3133 | |||
3134 | fput(cfile); | ||
3135 | fput(efile); | ||
3136 | |||
3137 | return 0; | ||
3138 | |||
3139 | fail: | ||
3140 | if (cfile) | ||
3141 | fput(cfile); | ||
3142 | |||
3143 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | ||
3144 | eventfd_ctx_put(event->eventfd); | ||
3145 | |||
3146 | if (!IS_ERR_OR_NULL(efile)) | ||
3147 | fput(efile); | ||
3148 | |||
3149 | kfree(event); | ||
3150 | |||
3151 | return ret; | ||
3152 | } | ||
3153 | |||
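For reference, a hypothetical userspace client of this interface; the cgroup mount point and the control-file name below are assumptions made for the example (and error handling is omitted), but the "<event_fd> <control_fd> <args>" write format is exactly what the parser above expects:

	#include <sys/eventfd.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[64];
		uint64_t count;
		int efd = eventfd(0, 0);
		int cfd = open("/cgroups/test/some.control_file", O_RDONLY);
		int wfd = open("/cgroups/test/cgroup.event_control", O_WRONLY);

		/* "<event_fd> <control_fd> <args>"; this example passes no args */
		snprintf(buf, sizeof(buf), "%d %d", efd, cfd);
		write(wfd, buf, strlen(buf));

		read(efd, &count, sizeof(count));	/* blocks until notified */
		printf("received %llu notification(s)\n",
		       (unsigned long long)count);
		return 0;
	}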
3154 | /* | ||
2792 | * for the common functions, 'private' gives the type of file | 3155 | * for the common functions, 'private' gives the type of file |
2793 | */ | 3156 | */ |
2794 | /* for hysterical raisins, we can't put this on the older files */ | 3157 | /* for hysterical raisins, we can't put this on the older files */ |
@@ -2813,6 +3176,11 @@ static struct cftype files[] = { | |||
2813 | .read_u64 = cgroup_read_notify_on_release, | 3176 | .read_u64 = cgroup_read_notify_on_release, |
2814 | .write_u64 = cgroup_write_notify_on_release, | 3177 | .write_u64 = cgroup_write_notify_on_release, |
2815 | }, | 3178 | }, |
3179 | { | ||
3180 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | ||
3181 | .write_string = cgroup_write_event_control, | ||
3182 | .mode = S_IWUGO, | ||
3183 | }, | ||
2816 | }; | 3184 | }; |
2817 | 3185 | ||
2818 | static struct cftype cft_release_agent = { | 3186 | static struct cftype cft_release_agent = { |
@@ -2877,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | |||
2877 | /* We need to take each hierarchy_mutex in a consistent order */ | 3245 | /* We need to take each hierarchy_mutex in a consistent order */ |
2878 | int i; | 3246 | int i; |
2879 | 3247 | ||
3248 | /* | ||
3249 | * No need to worry about a race with rebind_subsystems messing up the | ||
3250 | * locking order, since both parties are under cgroup_mutex. | ||
3251 | */ | ||
2880 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3252 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
2881 | struct cgroup_subsys *ss = subsys[i]; | 3253 | struct cgroup_subsys *ss = subsys[i]; |
3254 | if (ss == NULL) | ||
3255 | continue; | ||
2882 | if (ss->root == root) | 3256 | if (ss->root == root) |
2883 | mutex_lock(&ss->hierarchy_mutex); | 3257 | mutex_lock(&ss->hierarchy_mutex); |
2884 | } | 3258 | } |
@@ -2890,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | |||
2890 | 3264 | ||
2891 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3265 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
2892 | struct cgroup_subsys *ss = subsys[i]; | 3266 | struct cgroup_subsys *ss = subsys[i]; |
3267 | if (ss == NULL) | ||
3268 | continue; | ||
2893 | if (ss->root == root) | 3269 | if (ss->root == root) |
2894 | mutex_unlock(&ss->hierarchy_mutex); | 3270 | mutex_unlock(&ss->hierarchy_mutex); |
2895 | } | 3271 | } |
@@ -2936,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
2936 | 3312 | ||
2937 | for_each_subsys(root, ss) { | 3313 | for_each_subsys(root, ss) { |
2938 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3314 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
3315 | |||
2939 | if (IS_ERR(css)) { | 3316 | if (IS_ERR(css)) { |
2940 | err = PTR_ERR(css); | 3317 | err = PTR_ERR(css); |
2941 | goto err_destroy; | 3318 | goto err_destroy; |
2942 | } | 3319 | } |
2943 | init_cgroup_css(css, ss, cgrp); | 3320 | init_cgroup_css(css, ss, cgrp); |
2944 | if (ss->use_id) | 3321 | if (ss->use_id) { |
2945 | if (alloc_css_id(ss, parent, cgrp)) | 3322 | err = alloc_css_id(ss, parent, cgrp); |
3323 | if (err) | ||
2946 | goto err_destroy; | 3324 | goto err_destroy; |
3325 | } | ||
2947 | /* At error, ->destroy() callback has to free assigned ID. */ | 3326 | /* At error, ->destroy() callback has to free assigned ID. */ |
2948 | } | 3327 | } |
2949 | 3328 | ||
@@ -3010,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3010 | * synchronization other than RCU, and the subsystem linked | 3389 | * synchronization other than RCU, and the subsystem linked |
3011 | * list isn't RCU-safe */ | 3390 | * list isn't RCU-safe */ |
3012 | int i; | 3391 | int i; |
3392 | /* | ||
3393 | * We won't need to lock the subsys array, because the subsystems | ||
3394 | * we're concerned about aren't going anywhere since our cgroup root | ||
3395 | * has a reference on them. | ||
3396 | */ | ||
3013 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3397 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3014 | struct cgroup_subsys *ss = subsys[i]; | 3398 | struct cgroup_subsys *ss = subsys[i]; |
3015 | struct cgroup_subsys_state *css; | 3399 | struct cgroup_subsys_state *css; |
3016 | /* Skip subsystems not in this hierarchy */ | 3400 | /* Skip subsystems not present or not in this hierarchy */ |
3017 | if (ss->root != cgrp->root) | 3401 | if (ss == NULL || ss->root != cgrp->root) |
3018 | continue; | 3402 | continue; |
3019 | css = cgrp->subsys[ss->subsys_id]; | 3403 | css = cgrp->subsys[ss->subsys_id]; |
3020 | /* When called from check_for_release() it's possible | 3404 | /* When called from check_for_release() it's possible |
@@ -3088,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
3088 | struct dentry *d; | 3472 | struct dentry *d; |
3089 | struct cgroup *parent; | 3473 | struct cgroup *parent; |
3090 | DEFINE_WAIT(wait); | 3474 | DEFINE_WAIT(wait); |
3475 | struct cgroup_event *event, *tmp; | ||
3091 | int ret; | 3476 | int ret; |
3092 | 3477 | ||
3093 | /* the vfs holds both inode->i_mutex already */ | 3478 | /* the vfs holds both inode->i_mutex already */ |
@@ -3171,6 +3556,20 @@ again: | |||
3171 | set_bit(CGRP_RELEASABLE, &parent->flags); | 3556 | set_bit(CGRP_RELEASABLE, &parent->flags); |
3172 | check_for_release(parent); | 3557 | check_for_release(parent); |
3173 | 3558 | ||
3559 | /* | ||
3560 | * Unregister events and notify userspace. | ||
3561 | * Notify userspace about cgroup removal only after rmdir of the cgroup | ||
3562 | * directory, to avoid a race between userspace and kernel space. | ||
3563 | */ | ||
3564 | spin_lock(&cgrp->event_list_lock); | ||
3565 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
3566 | list_del(&event->list); | ||
3567 | remove_wait_queue(event->wqh, &event->wait); | ||
3568 | eventfd_signal(event->eventfd, 1); | ||
3569 | schedule_work(&event->remove); | ||
3570 | } | ||
3571 | spin_unlock(&cgrp->event_list_lock); | ||
3572 | |||
3174 | mutex_unlock(&cgroup_mutex); | 3573 | mutex_unlock(&cgroup_mutex); |
3175 | return 0; | 3574 | return 0; |
3176 | } | 3575 | } |
@@ -3205,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
3205 | mutex_init(&ss->hierarchy_mutex); | 3604 | mutex_init(&ss->hierarchy_mutex); |
3206 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | 3605 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); |
3207 | ss->active = 1; | 3606 | ss->active = 1; |
3607 | |||
3608 | /* this function shouldn't be used with modular subsystems, since they | ||
3609 | * need to register a subsys_id, among other things */ | ||
3610 | BUG_ON(ss->module); | ||
3208 | } | 3611 | } |
3209 | 3612 | ||
3210 | /** | 3613 | /** |
3614 | * cgroup_load_subsys: load and register a modular subsystem at runtime | ||
3615 | * @ss: the subsystem to load | ||
3616 | * | ||
3617 | * This function should be called in a modular subsystem's initcall. If the | ||
3618 | * subsystem is built as a module, it will be assigned a new subsys_id and set | ||
3619 | * up for use. If the subsystem is built-in anyway, work is delegated to the | ||
3620 | * simpler cgroup_init_subsys. | ||
3621 | */ | ||
3622 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | ||
3623 | { | ||
3624 | int i; | ||
3625 | struct cgroup_subsys_state *css; | ||
3626 | |||
3627 | /* check name and function validity */ | ||
3628 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | ||
3629 | ss->create == NULL || ss->destroy == NULL) | ||
3630 | return -EINVAL; | ||
3631 | |||
3632 | /* | ||
3633 | * we don't support callbacks in modular subsystems. this check is | ||
3634 | * before the ss->module check for consistency; a subsystem that could | ||
3635 | * be a module should still have no callbacks even if the user isn't | ||
3636 | * compiling it as one. | ||
3637 | */ | ||
3638 | if (ss->fork || ss->exit) | ||
3639 | return -EINVAL; | ||
3640 | |||
3641 | /* | ||
3642 | * an optionally modular subsystem is built-in: we want to do nothing, | ||
3643 | * since cgroup_init_subsys will have already taken care of it. | ||
3644 | */ | ||
3645 | if (ss->module == NULL) { | ||
3646 | /* a few sanity checks */ | ||
3647 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); | ||
3648 | BUG_ON(subsys[ss->subsys_id] != ss); | ||
3649 | return 0; | ||
3650 | } | ||
3651 | |||
3652 | /* | ||
3653 | * need to register a subsys id before anything else - for example, | ||
3654 | * init_cgroup_css needs it. | ||
3655 | */ | ||
3656 | mutex_lock(&cgroup_mutex); | ||
3657 | /* find the first empty slot in the array */ | ||
3658 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
3659 | if (subsys[i] == NULL) | ||
3660 | break; | ||
3661 | } | ||
3662 | if (i == CGROUP_SUBSYS_COUNT) { | ||
3663 | /* maximum number of subsystems already registered! */ | ||
3664 | mutex_unlock(&cgroup_mutex); | ||
3665 | return -EBUSY; | ||
3666 | } | ||
3667 | /* assign ourselves the subsys_id */ | ||
3668 | ss->subsys_id = i; | ||
3669 | subsys[i] = ss; | ||
3670 | |||
3671 | /* | ||
3672 | * no ss->create seems to need anything important in the ss struct, so | ||
3673 | * this can happen first (i.e. before the rootnode attachment). | ||
3674 | */ | ||
3675 | css = ss->create(ss, dummytop); | ||
3676 | if (IS_ERR(css)) { | ||
3677 | /* failure case - need to deassign the subsys[] slot. */ | ||
3678 | subsys[i] = NULL; | ||
3679 | mutex_unlock(&cgroup_mutex); | ||
3680 | return PTR_ERR(css); | ||
3681 | } | ||
3682 | |||
3683 | list_add(&ss->sibling, &rootnode.subsys_list); | ||
3684 | ss->root = &rootnode; | ||
3685 | |||
3686 | /* our new subsystem will be attached to the dummy hierarchy. */ | ||
3687 | init_cgroup_css(css, ss, dummytop); | ||
3688 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | ||
3689 | if (ss->use_id) { | ||
3690 | int ret = cgroup_init_idr(ss, css); | ||
3691 | if (ret) { | ||
3692 | dummytop->subsys[ss->subsys_id] = NULL; | ||
3693 | ss->destroy(ss, dummytop); | ||
3694 | subsys[i] = NULL; | ||
3695 | mutex_unlock(&cgroup_mutex); | ||
3696 | return ret; | ||
3697 | } | ||
3698 | } | ||
3699 | |||
3700 | /* | ||
3701 | * Now we need to entangle the css into the existing css_sets. unlike | ||
3702 | * in cgroup_init_subsys, there are now multiple css_sets, so each one | ||
3703 | * will need a new pointer to it; done by iterating the css_set_table. | ||
3704 | * furthermore, modifying the existing css_sets will corrupt the hash | ||
3705 | * table state, so each changed css_set will need its hash recomputed. | ||
3706 | * this is all done under the css_set_lock. | ||
3707 | */ | ||
3708 | write_lock(&css_set_lock); | ||
3709 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | ||
3710 | struct css_set *cg; | ||
3711 | struct hlist_node *node, *tmp; | ||
3712 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | ||
3713 | |||
3714 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | ||
3715 | /* skip entries that we already rehashed */ | ||
3716 | if (cg->subsys[ss->subsys_id]) | ||
3717 | continue; | ||
3718 | /* remove existing entry */ | ||
3719 | hlist_del(&cg->hlist); | ||
3720 | /* set new value */ | ||
3721 | cg->subsys[ss->subsys_id] = css; | ||
3722 | /* recompute hash and restore entry */ | ||
3723 | new_bucket = css_set_hash(cg->subsys); | ||
3724 | hlist_add_head(&cg->hlist, new_bucket); | ||
3725 | } | ||
3726 | } | ||
3727 | write_unlock(&css_set_lock); | ||
3728 | |||
3729 | mutex_init(&ss->hierarchy_mutex); | ||
3730 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | ||
3731 | ss->active = 1; | ||
3732 | |||
3733 | /* success! */ | ||
3734 | mutex_unlock(&cgroup_mutex); | ||
3735 | return 0; | ||
3736 | } | ||
3737 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | ||
3738 | |||
3739 | /** | ||
3740 | * cgroup_unload_subsys: unload a modular subsystem | ||
3741 | * @ss: the subsystem to unload | ||
3742 | * | ||
3743 | * This function should be called in a modular subsystem's exitcall. When this | ||
3744 | * function is invoked, the refcount on the subsystem's module will be 0, so | ||
3745 | * the subsystem will not be attached to any hierarchy. | ||
3746 | */ | ||
3747 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | ||
3748 | { | ||
3749 | struct cg_cgroup_link *link; | ||
3750 | struct hlist_head *hhead; | ||
3751 | |||
3752 | BUG_ON(ss->module == NULL); | ||
3753 | |||
3754 | /* | ||
3755 | * we shouldn't be called if the subsystem is in use, and the use of | ||
3756 | * try_module_get in parse_cgroupfs_options should ensure that it | ||
3757 | * doesn't start being used while we're killing it off. | ||
3758 | */ | ||
3759 | BUG_ON(ss->root != &rootnode); | ||
3760 | |||
3761 | mutex_lock(&cgroup_mutex); | ||
3762 | /* deassign the subsys_id */ | ||
3763 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | ||
3764 | subsys[ss->subsys_id] = NULL; | ||
3765 | |||
3766 | /* remove subsystem from rootnode's list of subsystems */ | ||
3767 | list_del(&ss->sibling); | ||
3768 | |||
3769 | /* | ||
3770 | * disentangle the css from all css_sets attached to the dummytop. as | ||
3771 | * in loading, we need to pay our respects to the hashtable gods. | ||
3772 | */ | ||
3773 | write_lock(&css_set_lock); | ||
3774 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | ||
3775 | struct css_set *cg = link->cg; | ||
3776 | |||
3777 | hlist_del(&cg->hlist); | ||
3778 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
3779 | cg->subsys[ss->subsys_id] = NULL; | ||
3780 | hhead = css_set_hash(cg->subsys); | ||
3781 | hlist_add_head(&cg->hlist, hhead); | ||
3782 | } | ||
3783 | write_unlock(&css_set_lock); | ||
3784 | |||
3785 | /* | ||
3786 | * remove subsystem's css from the dummytop and free it - need to free | ||
3787 | * before marking as null because ss->destroy needs the cgrp->subsys | ||
3788 | * pointer to find its state. note that this also takes care of | ||
3789 | * freeing the css_id. | ||
3790 | */ | ||
3791 | ss->destroy(ss, dummytop); | ||
3792 | dummytop->subsys[ss->subsys_id] = NULL; | ||
3793 | |||
3794 | mutex_unlock(&cgroup_mutex); | ||
3795 | } | ||
3796 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | ||
3797 | |||
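Taken together, cgroup_load_subsys() and cgroup_unload_subsys() give a modular subsystem a conventional init/exit shape. A sketch, assuming a subsystem struct named example_subsys is defined elsewhere:

	#include <linux/module.h>
	#include <linux/cgroup.h>

	/* example_subsys is an assumed, illustrative struct cgroup_subsys */
	static int __init example_cgroup_init(void)
	{
		return cgroup_load_subsys(&example_subsys);
	}

	static void __exit example_cgroup_exit(void)
	{
		cgroup_unload_subsys(&example_subsys);
	}

	module_init(example_cgroup_init);
	module_exit(example_cgroup_exit);
	MODULE_LICENSE("GPL");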
3798 | /** | ||
3211 | * cgroup_init_early - cgroup initialization at system boot | 3799 | * cgroup_init_early - cgroup initialization at system boot |
3212 | * | 3800 | * |
3213 | * Initialize cgroups at system boot, and initialize any | 3801 | * Initialize cgroups at system boot, and initialize any |
@@ -3235,7 +3823,8 @@ int __init cgroup_init_early(void) | |||
3235 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | 3823 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) |
3236 | INIT_HLIST_HEAD(&css_set_table[i]); | 3824 | INIT_HLIST_HEAD(&css_set_table[i]); |
3237 | 3825 | ||
3238 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3826 | /* at bootup time, we don't worry about modular subsystems */ |
3827 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
3239 | struct cgroup_subsys *ss = subsys[i]; | 3828 | struct cgroup_subsys *ss = subsys[i]; |
3240 | 3829 | ||
3241 | BUG_ON(!ss->name); | 3830 | BUG_ON(!ss->name); |
@@ -3270,12 +3859,13 @@ int __init cgroup_init(void) | |||
3270 | if (err) | 3859 | if (err) |
3271 | return err; | 3860 | return err; |
3272 | 3861 | ||
3273 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3862 | /* at bootup time, we don't worry about modular subsystems */ |
3863 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
3274 | struct cgroup_subsys *ss = subsys[i]; | 3864 | struct cgroup_subsys *ss = subsys[i]; |
3275 | if (!ss->early_init) | 3865 | if (!ss->early_init) |
3276 | cgroup_init_subsys(ss); | 3866 | cgroup_init_subsys(ss); |
3277 | if (ss->use_id) | 3867 | if (ss->use_id) |
3278 | cgroup_subsys_init_idr(ss); | 3868 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); |
3279 | } | 3869 | } |
3280 | 3870 | ||
3281 | /* Add init_css_set to the hash table */ | 3871 | /* Add init_css_set to the hash table */ |
@@ -3379,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
3379 | int i; | 3969 | int i; |
3380 | 3970 | ||
3381 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | 3971 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
3972 | /* | ||
3973 | * ideally we don't want subsystems moving around while we do this. | ||
3974 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of | ||
3975 | * subsys/hierarchy state. | ||
3976 | */ | ||
3382 | mutex_lock(&cgroup_mutex); | 3977 | mutex_lock(&cgroup_mutex); |
3383 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3978 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3384 | struct cgroup_subsys *ss = subsys[i]; | 3979 | struct cgroup_subsys *ss = subsys[i]; |
3980 | if (ss == NULL) | ||
3981 | continue; | ||
3385 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 3982 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
3386 | ss->name, ss->root->hierarchy_id, | 3983 | ss->name, ss->root->hierarchy_id, |
3387 | ss->root->number_of_cgroups, !ss->disabled); | 3984 | ss->root->number_of_cgroups, !ss->disabled); |
@@ -3439,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
3439 | { | 4036 | { |
3440 | if (need_forkexit_callback) { | 4037 | if (need_forkexit_callback) { |
3441 | int i; | 4038 | int i; |
3442 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4039 | /* |
4040 | * forkexit callbacks are only supported for builtin | ||
4041 | * subsystems, and the builtin section of the subsys array is | ||
4042 | * immutable, so we don't need to lock the subsys array here. | ||
4043 | */ | ||
4044 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
3443 | struct cgroup_subsys *ss = subsys[i]; | 4045 | struct cgroup_subsys *ss = subsys[i]; |
3444 | if (ss->fork) | 4046 | if (ss->fork) |
3445 | ss->fork(ss, child); | 4047 | ss->fork(ss, child); |
@@ -3508,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
3508 | struct css_set *cg; | 4110 | struct css_set *cg; |
3509 | 4111 | ||
3510 | if (run_callbacks && need_forkexit_callback) { | 4112 | if (run_callbacks && need_forkexit_callback) { |
3511 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4113 | /* |
4114 | * modular subsystems can't use callbacks, so no need to lock | ||
4115 | * the subsys array | ||
4116 | */ | ||
4117 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
3512 | struct cgroup_subsys *ss = subsys[i]; | 4118 | struct cgroup_subsys *ss = subsys[i]; |
3513 | if (ss->exit) | 4119 | if (ss->exit) |
3514 | ss->exit(ss, tsk); | 4120 | ss->exit(ss, tsk); |
@@ -3702,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp) | |||
3702 | } | 4308 | } |
3703 | } | 4309 | } |
3704 | 4310 | ||
3705 | void __css_put(struct cgroup_subsys_state *css) | 4311 | /* Caller must verify that the css is not for root cgroup */ |
4312 | void __css_put(struct cgroup_subsys_state *css, int count) | ||
3706 | { | 4313 | { |
3707 | struct cgroup *cgrp = css->cgroup; | 4314 | struct cgroup *cgrp = css->cgroup; |
3708 | int val; | 4315 | int val; |
3709 | rcu_read_lock(); | 4316 | rcu_read_lock(); |
3710 | val = atomic_dec_return(&css->refcnt); | 4317 | val = atomic_sub_return(count, &css->refcnt); |
3711 | if (val == 1) { | 4318 | if (val == 1) { |
3712 | if (notify_on_release(cgrp)) { | 4319 | if (notify_on_release(cgrp)) { |
3713 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4320 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
@@ -3718,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3718 | rcu_read_unlock(); | 4325 | rcu_read_unlock(); |
3719 | WARN_ON_ONCE(val < 1); | 4326 | WARN_ON_ONCE(val < 1); |
3720 | } | 4327 | } |
4328 | EXPORT_SYMBOL_GPL(__css_put); | ||
3721 | 4329 | ||
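Since __css_put() now takes a count, a caller holding several references can drop them with one atomic operation instead of looping. A hypothetical helper, not from the patch; as the comment above notes, the caller must still ensure the css is not for the root cgroup:

	/* illustrative: drop n references taken earlier in a batch */
	static inline void css_put_many(struct cgroup_subsys_state *css, int n)
	{
		__css_put(css, n);	/* one atomic_sub_return() instead of n decrements */
	}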
3722 | /* | 4330 | /* |
3723 | * Notify userspace when a cgroup is released, by running the | 4331 | * Notify userspace when a cgroup is released, by running the |
@@ -3799,8 +4407,11 @@ static int __init cgroup_disable(char *str) | |||
3799 | while ((token = strsep(&str, ",")) != NULL) { | 4407 | while ((token = strsep(&str, ",")) != NULL) { |
3800 | if (!*token) | 4408 | if (!*token) |
3801 | continue; | 4409 | continue; |
3802 | 4410 | /* | |
3803 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4411 | * cgroup_disable, being at boot time, can't know about module |
4412 | * subsystems, so we don't worry about them. | ||
4413 | */ | ||
4414 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
3804 | struct cgroup_subsys *ss = subsys[i]; | 4415 | struct cgroup_subsys *ss = subsys[i]; |
3805 | 4416 | ||
3806 | if (!strcmp(token, ss->name)) { | 4417 | if (!strcmp(token, ss->name)) { |
@@ -3830,6 +4441,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
3830 | return cssid->id; | 4441 | return cssid->id; |
3831 | return 0; | 4442 | return 0; |
3832 | } | 4443 | } |
4444 | EXPORT_SYMBOL_GPL(css_id); | ||
3833 | 4445 | ||
3834 | unsigned short css_depth(struct cgroup_subsys_state *css) | 4446 | unsigned short css_depth(struct cgroup_subsys_state *css) |
3835 | { | 4447 | { |
@@ -3839,6 +4451,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
3839 | return cssid->depth; | 4451 | return cssid->depth; |
3840 | return 0; | 4452 | return 0; |
3841 | } | 4453 | } |
4454 | EXPORT_SYMBOL_GPL(css_depth); | ||
3842 | 4455 | ||
3843 | bool css_is_ancestor(struct cgroup_subsys_state *child, | 4456 | bool css_is_ancestor(struct cgroup_subsys_state *child, |
3844 | const struct cgroup_subsys_state *root) | 4457 | const struct cgroup_subsys_state *root) |
@@ -3875,6 +4488,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
3875 | spin_unlock(&ss->id_lock); | 4488 | spin_unlock(&ss->id_lock); |
3876 | call_rcu(&id->rcu_head, __free_css_id_cb); | 4489 | call_rcu(&id->rcu_head, __free_css_id_cb); |
3877 | } | 4490 | } |
4491 | EXPORT_SYMBOL_GPL(free_css_id); | ||
3878 | 4492 | ||
3879 | /* | 4493 | /* |
3880 | * This is called by init or create(). Then, calls to this function are | 4494 | * This is called by init or create(). Then, calls to this function are |
@@ -3924,15 +4538,14 @@ err_out: | |||
3924 | 4538 | ||
3925 | } | 4539 | } |
3926 | 4540 | ||
3927 | static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) | 4541 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, |
4542 | struct cgroup_subsys_state *rootcss) | ||
3928 | { | 4543 | { |
3929 | struct css_id *newid; | 4544 | struct css_id *newid; |
3930 | struct cgroup_subsys_state *rootcss; | ||
3931 | 4545 | ||
3932 | spin_lock_init(&ss->id_lock); | 4546 | spin_lock_init(&ss->id_lock); |
3933 | idr_init(&ss->idr); | 4547 | idr_init(&ss->idr); |
3934 | 4548 | ||
3935 | rootcss = init_css_set.subsys[ss->subsys_id]; | ||
3936 | newid = get_new_cssid(ss, 0); | 4549 | newid = get_new_cssid(ss, 0); |
3937 | if (IS_ERR(newid)) | 4550 | if (IS_ERR(newid)) |
3938 | return PTR_ERR(newid); | 4551 | return PTR_ERR(newid); |
@@ -3948,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | |||
3948 | { | 4561 | { |
3949 | int subsys_id, i, depth = 0; | 4562 | int subsys_id, i, depth = 0; |
3950 | struct cgroup_subsys_state *parent_css, *child_css; | 4563 | struct cgroup_subsys_state *parent_css, *child_css; |
3951 | struct css_id *child_id, *parent_id = NULL; | 4564 | struct css_id *child_id, *parent_id; |
3952 | 4565 | ||
3953 | subsys_id = ss->subsys_id; | 4566 | subsys_id = ss->subsys_id; |
3954 | parent_css = parent->subsys[subsys_id]; | 4567 | parent_css = parent->subsys[subsys_id]; |
3955 | child_css = child->subsys[subsys_id]; | 4568 | child_css = child->subsys[subsys_id]; |
3956 | depth = css_depth(parent_css) + 1; | ||
3957 | parent_id = parent_css->id; | 4569 | parent_id = parent_css->id; |
4570 | depth = parent_id->depth; | ||
3958 | 4571 | ||
3959 | child_id = get_new_cssid(ss, depth); | 4572 | child_id = get_new_cssid(ss, depth); |
3960 | if (IS_ERR(child_id)) | 4573 | if (IS_ERR(child_id)) |
@@ -3992,6 +4605,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
3992 | 4605 | ||
3993 | return rcu_dereference(cssid->css); | 4606 | return rcu_dereference(cssid->css); |
3994 | } | 4607 | } |
4608 | EXPORT_SYMBOL_GPL(css_lookup); | ||
3995 | 4609 | ||
3996 | /** | 4610 | /** |
3997 | * css_get_next - lookup next cgroup under specified hierarchy. | 4611 | * css_get_next - lookup next cgroup under specified hierarchy. |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6aab40..e5c0244962b0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -15,6 +15,7 @@ | |||
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/slab.h> | ||
18 | #include <linux/cgroup.h> | 19 | #include <linux/cgroup.h> |
19 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
20 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
47 | struct freezer, css); | 48 | struct freezer, css); |
48 | } | 49 | } |
49 | 50 | ||
50 | int cgroup_frozen(struct task_struct *task) | 51 | int cgroup_freezing_or_frozen(struct task_struct *task) |
51 | { | 52 | { |
52 | struct freezer *freezer; | 53 | struct freezer *freezer; |
53 | enum freezer_state state; | 54 | enum freezer_state state; |
54 | 55 | ||
55 | task_lock(task); | 56 | task_lock(task); |
56 | freezer = task_freezer(task); | 57 | freezer = task_freezer(task); |
57 | state = freezer->state; | 58 | if (!freezer->css.cgroup->parent) |
59 | state = CGROUP_THAWED; /* root cgroup can't be frozen */ | ||
60 | else | ||
61 | state = freezer->state; | ||
58 | task_unlock(task); | 62 | task_unlock(task); |
59 | 63 | ||
60 | return state == CGROUP_FROZEN; | 64 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); |
61 | } | 65 | } |
62 | 66 | ||
63 | /* | 67 | /* |
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
201 | * No lock is needed, since the task isn't on tasklist yet, | 205 | * No lock is needed, since the task isn't on tasklist yet, |
202 | * so it can't be moved to another cgroup, which means the | 206 | * so it can't be moved to another cgroup, which means the |
203 | * freezer won't be removed and will be valid during this | 207 | * freezer won't be removed and will be valid during this |
204 | * function call. | 208 | * function call. Nevertheless, use an RCU read-side critical
209 | * section to suppress RCU lockdep false positives. | ||
205 | */ | 210 | */ |
211 | rcu_read_lock(); | ||
206 | freezer = task_freezer(task); | 212 | freezer = task_freezer(task); |
213 | rcu_read_unlock(); | ||
207 | 214 | ||
208 | /* | 215 | /* |
209 | * The root cgroup is non-freezable, so we can skip the | 216 | * The root cgroup is non-freezable, so we can skip the |
diff --git a/kernel/compat.c b/kernel/compat.c index f6c204f07ea6..7f40e9275fd9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/posix-timers.h> | 25 | #include <linux/posix-timers.h> |
26 | #include <linux/times.h> | 26 | #include <linux/times.h> |
27 | #include <linux/ptrace.h> | 27 | #include <linux/ptrace.h> |
28 | #include <linux/gfp.h> | ||
28 | 29 | ||
29 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
30 | 31 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee940..25bba73b1be3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | ||
17 | 18 | ||
18 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
19 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 20 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
@@ -151,13 +152,13 @@ static inline void check_for_tasks(int cpu) | |||
151 | 152 | ||
152 | write_lock_irq(&tasklist_lock); | 153 | write_lock_irq(&tasklist_lock); |
153 | for_each_process(p) { | 154 | for_each_process(p) { |
154 | if (task_cpu(p) == cpu && | 155 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
155 | (!cputime_eq(p->utime, cputime_zero) || | 156 | (!cputime_eq(p->utime, cputime_zero) || |
156 | !cputime_eq(p->stime, cputime_zero))) | 157 | !cputime_eq(p->stime, cputime_zero))) |
157 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ | 158 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " |
158 | (state = %ld, flags = %x) \n", | 159 | "(state = %ld, flags = %x)\n", |
159 | p->comm, task_pid_nr(p), cpu, | 160 | p->comm, task_pid_nr(p), cpu, |
160 | p->state, p->flags); | 161 | p->state, p->flags); |
161 | } | 162 | } |
162 | write_unlock_irq(&tasklist_lock); | 163 | write_unlock_irq(&tasklist_lock); |
163 | } | 164 | } |
@@ -338,7 +339,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
338 | if (!cpu_possible(cpu)) { | 339 | if (!cpu_possible(cpu)) { |
339 | printk(KERN_ERR "can't online cpu %d because it is not " | 340 | printk(KERN_ERR "can't online cpu %d because it is not " |
340 | "configured as may-hotadd at boot time\n", cpu); | 341 | "configured as may-hotadd at boot time\n", cpu); |
341 | #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) | 342 | #if defined(CONFIG_IA64) |
342 | printk(KERN_ERR "please check additional_cpus= boot " | 343 | printk(KERN_ERR "please check additional_cpus= boot " |
343 | "parameter\n"); | 344 | "parameter\n"); |
344 | #endif | 345 | #endif |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..d10946748ec2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
920 | * call to guarantee_online_mems(), as we know no one is changing | 920 | * call to guarantee_online_mems(), as we know no one is changing |
921 | * our task's cpuset. | 921 | * our task's cpuset. |
922 | * | 922 | * |
923 | * Hold callback_mutex around the two modifications of our tasks | ||
924 | * mems_allowed to synchronize with cpuset_mems_allowed(). | ||
925 | * | ||
926 | * While the mm_struct we are migrating is typically from some | 923 | * While the mm_struct we are migrating is typically from some |
927 | * other task, the task_struct mems_allowed that we are hacking | 924 | * other task, the task_struct mems_allowed that we are hacking |
928 | * is for our current task, which must allocate new pages for that | 925 | * is for our current task, which must allocate new pages for that |
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
973 | struct cpuset *cs; | 970 | struct cpuset *cs; |
974 | int migrate; | 971 | int migrate; |
975 | const nodemask_t *oldmem = scan->data; | 972 | const nodemask_t *oldmem = scan->data; |
976 | nodemask_t newmems; | 973 | NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); |
974 | |||
975 | if (!newmems) | ||
976 | return; | ||
977 | 977 | ||
978 | cs = cgroup_cs(scan->cg); | 978 | cs = cgroup_cs(scan->cg); |
979 | guarantee_online_mems(cs, &newmems); | 979 | guarantee_online_mems(cs, newmems); |
980 | 980 | ||
981 | task_lock(p); | 981 | task_lock(p); |
982 | cpuset_change_task_nodemask(p, &newmems); | 982 | cpuset_change_task_nodemask(p, newmems); |
983 | task_unlock(p); | 983 | task_unlock(p); |
984 | 984 | ||
985 | NODEMASK_FREE(newmems); | ||
986 | |||
985 | mm = get_task_mm(p); | 987 | mm = get_task_mm(p); |
986 | if (!mm) | 988 | if (!mm) |
987 | return; | 989 | return; |
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1051 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | 1053 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
1052 | const char *buf) | 1054 | const char *buf) |
1053 | { | 1055 | { |
1054 | nodemask_t oldmem; | 1056 | NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); |
1055 | int retval; | 1057 | int retval; |
1056 | struct ptr_heap heap; | 1058 | struct ptr_heap heap; |
1057 | 1059 | ||
1060 | if (!oldmem) | ||
1061 | return -ENOMEM; | ||
1062 | |||
1058 | /* | 1063 | /* |
1059 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | 1064 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; |
1060 | * it's read-only | 1065 | * it's read-only |
1061 | */ | 1066 | */ |
1062 | if (cs == &top_cpuset) | 1067 | if (cs == &top_cpuset) { |
1063 | return -EACCES; | 1068 | retval = -EACCES; |
1069 | goto done; | ||
1070 | } | ||
1064 | 1071 | ||
1065 | /* | 1072 | /* |
1066 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | 1073 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. |
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1076 | goto done; | 1083 | goto done; |
1077 | 1084 | ||
1078 | if (!nodes_subset(trialcs->mems_allowed, | 1085 | if (!nodes_subset(trialcs->mems_allowed, |
1079 | node_states[N_HIGH_MEMORY])) | 1086 | node_states[N_HIGH_MEMORY])) { |
1080 | return -EINVAL; | 1087 | retval = -EINVAL; |
1088 | goto done; | ||
1089 | } | ||
1081 | } | 1090 | } |
1082 | oldmem = cs->mems_allowed; | 1091 | *oldmem = cs->mems_allowed; |
1083 | if (nodes_equal(oldmem, trialcs->mems_allowed)) { | 1092 | if (nodes_equal(*oldmem, trialcs->mems_allowed)) { |
1084 | retval = 0; /* Too easy - nothing to do */ | 1093 | retval = 0; /* Too easy - nothing to do */ |
1085 | goto done; | 1094 | goto done; |
1086 | } | 1095 | } |
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1096 | cs->mems_allowed = trialcs->mems_allowed; | 1105 | cs->mems_allowed = trialcs->mems_allowed; |
1097 | mutex_unlock(&callback_mutex); | 1106 | mutex_unlock(&callback_mutex); |
1098 | 1107 | ||
1099 | update_tasks_nodemask(cs, &oldmem, &heap); | 1108 | update_tasks_nodemask(cs, oldmem, &heap); |
1100 | 1109 | ||
1101 | heap_free(&heap); | 1110 | heap_free(&heap); |
1102 | done: | 1111 | done: |
1112 | NODEMASK_FREE(oldmem); | ||
1103 | return retval; | 1113 | return retval; |
1104 | } | 1114 | } |
1105 | 1115 | ||
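The cpuset.c hunks in this commit all follow the same shape: nodemask_t can be large on big-NR_NODES configurations, so stack variables are replaced by NODEMASK_ALLOC()/NODEMASK_FREE(), which in turn introduces an allocation-failure branch and pointer dereferences. A condensed sketch of the recurring pattern, with an illustrative function name:

	static int example_with_nodemask(struct cpuset *cs)
	{
		/* declares "mask" as a pointer, heap-backed on large-NR_NODES configs */
		NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
		int retval = 0;

		if (mask == NULL)
			return -ENOMEM;		/* new failure mode vs. a stack variable */

		*mask = cs->mems_allowed;	/* note the dereference */
		/* ... work with *mask ... */

		NODEMASK_FREE(mask);		/* no-op when the mask lived on the stack */
		return retval;
	}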
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1384 | struct cgroup *oldcont, struct task_struct *tsk, | 1394 | struct cgroup *oldcont, struct task_struct *tsk, |
1385 | bool threadgroup) | 1395 | bool threadgroup) |
1386 | { | 1396 | { |
1387 | nodemask_t from, to; | ||
1388 | struct mm_struct *mm; | 1397 | struct mm_struct *mm; |
1389 | struct cpuset *cs = cgroup_cs(cont); | 1398 | struct cpuset *cs = cgroup_cs(cont); |
1390 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1399 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1400 | NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); | ||
1401 | NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); | ||
1402 | |||
1403 | if (from == NULL || to == NULL) | ||
1404 | goto alloc_fail; | ||
1391 | 1405 | ||
1392 | if (cs == &top_cpuset) { | 1406 | if (cs == &top_cpuset) { |
1393 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1407 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1394 | to = node_possible_map; | ||
1395 | } else { | 1408 | } else { |
1396 | guarantee_online_cpus(cs, cpus_attach); | 1409 | guarantee_online_cpus(cs, cpus_attach); |
1397 | guarantee_online_mems(cs, &to); | ||
1398 | } | 1410 | } |
1411 | guarantee_online_mems(cs, to); | ||
1399 | 1412 | ||
1400 | /* do per-task migration stuff possibly for each in the threadgroup */ | 1413 | /* do per-task migration stuff possibly for each in the threadgroup */ |
1401 | cpuset_attach_task(tsk, &to, cs); | 1414 | cpuset_attach_task(tsk, to, cs); |
1402 | if (threadgroup) { | 1415 | if (threadgroup) { |
1403 | struct task_struct *c; | 1416 | struct task_struct *c; |
1404 | rcu_read_lock(); | 1417 | rcu_read_lock(); |
1405 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 1418 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { |
1406 | cpuset_attach_task(c, &to, cs); | 1419 | cpuset_attach_task(c, to, cs); |
1407 | } | 1420 | } |
1408 | rcu_read_unlock(); | 1421 | rcu_read_unlock(); |
1409 | } | 1422 | } |
1410 | 1423 | ||
1411 | /* change mm; only needs to be done once even if threadgroup */ | 1424 | /* change mm; only needs to be done once even if threadgroup */ |
1412 | from = oldcs->mems_allowed; | 1425 | *from = oldcs->mems_allowed; |
1413 | to = cs->mems_allowed; | 1426 | *to = cs->mems_allowed; |
1414 | mm = get_task_mm(tsk); | 1427 | mm = get_task_mm(tsk); |
1415 | if (mm) { | 1428 | if (mm) { |
1416 | mpol_rebind_mm(mm, &to); | 1429 | mpol_rebind_mm(mm, to); |
1417 | if (is_memory_migrate(cs)) | 1430 | if (is_memory_migrate(cs)) |
1418 | cpuset_migrate_mm(mm, &from, &to); | 1431 | cpuset_migrate_mm(mm, from, to); |
1419 | mmput(mm); | 1432 | mmput(mm); |
1420 | } | 1433 | } |
1434 | |||
1435 | alloc_fail: | ||
1436 | NODEMASK_FREE(from); | ||
1437 | NODEMASK_FREE(to); | ||
1421 | } | 1438 | } |
1422 | 1439 | ||
1423 | /* The various types of files and directories in a cpuset file system */ | 1440 | /* The various types of files and directories in a cpuset file system */ |
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
1562 | 1579 | ||
1563 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | 1580 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) |
1564 | { | 1581 | { |
1565 | nodemask_t mask; | 1582 | NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); |
1583 | int retval; | ||
1584 | |||
1585 | if (mask == NULL) | ||
1586 | return -ENOMEM; | ||
1566 | 1587 | ||
1567 | mutex_lock(&callback_mutex); | 1588 | mutex_lock(&callback_mutex); |
1568 | mask = cs->mems_allowed; | 1589 | *mask = cs->mems_allowed; |
1569 | mutex_unlock(&callback_mutex); | 1590 | mutex_unlock(&callback_mutex); |
1570 | 1591 | ||
1571 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1592 | retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); |
1593 | |||
1594 | NODEMASK_FREE(mask); | ||
1595 | |||
1596 | return retval; | ||
1572 | } | 1597 | } |
1573 | 1598 | ||
1574 | static ssize_t cpuset_common_file_read(struct cgroup *cont, | 1599 | static ssize_t cpuset_common_file_read(struct cgroup *cont, |
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
1997 | struct cpuset *cp; /* scans cpusets being updated */ | 2022 | struct cpuset *cp; /* scans cpusets being updated */ |
1998 | struct cpuset *child; /* scans child cpusets of cp */ | 2023 | struct cpuset *child; /* scans child cpusets of cp */ |
1999 | struct cgroup *cont; | 2024 | struct cgroup *cont; |
2000 | nodemask_t oldmems; | 2025 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); |
2026 | |||
2027 | if (oldmems == NULL) | ||
2028 | return; | ||
2001 | 2029 | ||
2002 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2030 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
2003 | 2031 | ||
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2014 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 2042 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
2015 | continue; | 2043 | continue; |
2016 | 2044 | ||
2017 | oldmems = cp->mems_allowed; | 2045 | *oldmems = cp->mems_allowed; |
2018 | 2046 | ||
2019 | /* Remove offline cpus and mems from this cpuset. */ | 2047 | /* Remove offline cpus and mems from this cpuset. */ |
2020 | mutex_lock(&callback_mutex); | 2048 | mutex_lock(&callback_mutex); |
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2030 | remove_tasks_in_empty_cpuset(cp); | 2058 | remove_tasks_in_empty_cpuset(cp); |
2031 | else { | 2059 | else { |
2032 | update_tasks_cpumask(cp, NULL); | 2060 | update_tasks_cpumask(cp, NULL); |
2033 | update_tasks_nodemask(cp, &oldmems, NULL); | 2061 | update_tasks_nodemask(cp, oldmems, NULL); |
2034 | } | 2062 | } |
2035 | } | 2063 | } |
2064 | NODEMASK_FREE(oldmems); | ||
2036 | } | 2065 | } |
2037 | 2066 | ||
2038 | /* | 2067 | /* |
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
2090 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2119 | static int cpuset_track_online_nodes(struct notifier_block *self, |
2091 | unsigned long action, void *arg) | 2120 | unsigned long action, void *arg) |
2092 | { | 2121 | { |
2122 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); | ||
2123 | |||
2124 | if (oldmems == NULL) | ||
2125 | return NOTIFY_DONE; | ||
2126 | |||
2093 | cgroup_lock(); | 2127 | cgroup_lock(); |
2094 | switch (action) { | 2128 | switch (action) { |
2095 | case MEM_ONLINE: | 2129 | case MEM_ONLINE: |
2096 | case MEM_OFFLINE: | 2130 | *oldmems = top_cpuset.mems_allowed; |
2097 | mutex_lock(&callback_mutex); | 2131 | mutex_lock(&callback_mutex); |
2098 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2132 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2099 | mutex_unlock(&callback_mutex); | 2133 | mutex_unlock(&callback_mutex); |
2100 | if (action == MEM_OFFLINE) | 2134 | update_tasks_nodemask(&top_cpuset, oldmems, NULL); |
2101 | scan_for_empty_cpusets(&top_cpuset); | 2135 | break; |
2136 | case MEM_OFFLINE: | ||
2137 | /* | ||
2138 | * needn't update top_cpuset.mems_allowed explicitly because | ||
2139 | * scan_for_empty_cpusets() will update it. | ||
2140 | */ | ||
2141 | scan_for_empty_cpusets(&top_cpuset); | ||
2102 | break; | 2142 | break; |
2103 | default: | 2143 | default: |
2104 | break; | 2144 | break; |
2105 | } | 2145 | } |
2106 | cgroup_unlock(); | 2146 | cgroup_unlock(); |
2147 | |||
2148 | NODEMASK_FREE(oldmems); | ||
2107 | return NOTIFY_OK; | 2149 | return NOTIFY_OK; |
2108 | } | 2150 | } |
2109 | #endif | 2151 | #endif |
diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b0..62af1816c235 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -10,6 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/cred.h> | 12 | #include <linux/cred.h> |
13 | #include <linux/slab.h> | ||
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/key.h> | 15 | #include <linux/key.h> |
15 | #include <linux/keyctl.h> | 16 | #include <linux/keyctl.h> |
@@ -224,7 +225,7 @@ struct cred *cred_alloc_blank(void) | |||
224 | #ifdef CONFIG_KEYS | 225 | #ifdef CONFIG_KEYS |
225 | new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); | 226 | new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); |
226 | if (!new->tgcred) { | 227 | if (!new->tgcred) { |
227 | kfree(new); | 228 | kmem_cache_free(cred_jar, new); |
228 | return NULL; | 229 | return NULL; |
229 | } | 230 | } |
230 | atomic_set(&new->tgcred->usage, 1); | 231 | atomic_set(&new->tgcred->usage, 1); |
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void) | |||
364 | 365 | ||
365 | new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); | 366 | new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); |
366 | if (!new) | 367 | if (!new) |
367 | return NULL; | 368 | goto free_tgcred; |
368 | 369 | ||
369 | kdebug("prepare_usermodehelper_creds() alloc %p", new); | 370 | kdebug("prepare_usermodehelper_creds() alloc %p", new); |
370 | 371 | ||
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void) | |||
398 | error: | 399 | error: |
399 | put_cred(new); | 400 | put_cred(new); |
400 | return NULL; | 401 | return NULL; |
402 | |||
403 | free_tgcred: | ||
404 | #ifdef CONFIG_KEYS | ||
405 | kfree(tgcred); | ||
406 | #endif | ||
407 | return NULL; | ||
401 | } | 408 | } |
402 | 409 | ||
403 | /* | 410 | /* |
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred) | |||
786 | { | 793 | { |
787 | if (cred->magic != CRED_MAGIC) | 794 | if (cred->magic != CRED_MAGIC) |
788 | return true; | 795 | return true; |
789 | if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) | ||
790 | return true; | ||
791 | #ifdef CONFIG_SECURITY_SELINUX | 796 | #ifdef CONFIG_SECURITY_SELINUX |
792 | if (selinux_is_enabled()) { | 797 | if (selinux_is_enabled()) { |
793 | if ((unsigned long) cred->security < PAGE_SIZE) | 798 | if ((unsigned long) cred->security < PAGE_SIZE) |
diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 000000000000..31aa9332ef3f --- /dev/null +++ b/kernel/early_res.c | |||
@@ -0,0 +1,584 @@ | |||
1 | /* | ||
2 | * early_res: could be used to replace bootmem | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/bootmem.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/early_res.h> | ||
10 | |||
11 | /* | ||
12 | * Early reserved memory areas. | ||
13 | */ | ||
14 | /* | ||
15 | * need to make sure this one is big enough before | ||
16 | * find_fw_memmap_area can be used | ||
17 | */ | ||
18 | #define MAX_EARLY_RES_X 32 | ||
19 | |||
20 | struct early_res { | ||
21 | u64 start, end; | ||
22 | char name[15]; | ||
23 | char overlap_ok; | ||
24 | }; | ||
25 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
26 | |||
27 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
28 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
29 | static int early_res_count __initdata; | ||
30 | |||
31 | static int __init find_overlapped_early(u64 start, u64 end) | ||
32 | { | ||
33 | int i; | ||
34 | struct early_res *r; | ||
35 | |||
36 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
37 | r = &early_res[i]; | ||
38 | if (end > r->start && start < r->end) | ||
39 | break; | ||
40 | } | ||
41 | |||
42 | return i; | ||
43 | } | ||
44 | |||
45 | /* | ||
46 | * Drop the i-th range from the early reservation map, | ||
47 | * by copying any higher ranges down one over it, and | ||
48 | * clearing what had been the last slot. | ||
49 | */ | ||
50 | static void __init drop_range(int i) | ||
51 | { | ||
52 | int j; | ||
53 | |||
54 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
55 | ; | ||
56 | |||
57 | memmove(&early_res[i], &early_res[i + 1], | ||
58 | (j - 1 - i) * sizeof(struct early_res)); | ||
59 | |||
60 | early_res[j - 1].end = 0; | ||
61 | early_res_count--; | ||
62 | } | ||
63 | |||
64 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
65 | { | ||
66 | u64 common_start, common_end; | ||
67 | u64 old_start, old_end; | ||
68 | |||
69 | old_start = early_res[i].start; | ||
70 | old_end = early_res[i].end; | ||
71 | common_start = max(old_start, start); | ||
72 | common_end = min(old_end, end); | ||
73 | |||
74 | /* no overlap ? */ | ||
75 | if (common_start >= common_end) | ||
76 | return; | ||
77 | |||
78 | if (old_start < common_start) { | ||
79 | /* make head segment */ | ||
80 | early_res[i].end = common_start; | ||
81 | if (old_end > common_end) { | ||
82 | char name[15]; | ||
83 | |||
84 | /* | ||
85 | * Save a local copy of the name, since the | ||
86 | * early_res array could get resized inside | ||
87 | * reserve_early_without_check() -> | ||
88 | * __check_and_double_early_res(), which would | ||
89 | * make the current name pointer invalid. | ||
90 | */ | ||
91 | strncpy(name, early_res[i].name, | ||
92 | sizeof(early_res[i].name) - 1); | ||
93 | /* add another entry for the leftover tail */ | ||
94 | reserve_early_without_check(common_end, old_end, name); | ||
95 | } | ||
96 | return; | ||
97 | } else { | ||
98 | if (old_end > common_end) { | ||
99 | /* reuse the entry for the leftover tail */ | ||
100 | early_res[i].start = common_end; | ||
101 | return; | ||
102 | } | ||
103 | /* all covered */ | ||
104 | drop_range(i); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Split any existing ranges that: | ||
110 | * 1) are marked 'overlap_ok', and | ||
111 | * 2) overlap with the stated range [start, end) | ||
112 | * into whatever portion (if any) of the existing range is entirely | ||
113 | * below or entirely above the stated range. Drop the portion | ||
114 | * of the existing range that overlaps with the stated range, | ||
115 | * which will allow the caller of this routine to then add that | ||
116 | * stated range without conflicting with any existing range. | ||
117 | */ | ||
118 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
119 | { | ||
120 | int i; | ||
121 | struct early_res *r; | ||
122 | u64 lower_start, lower_end; | ||
123 | u64 upper_start, upper_end; | ||
124 | char name[15]; | ||
125 | |||
126 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
127 | r = &early_res[i]; | ||
128 | |||
129 | /* Continue past non-overlapping ranges */ | ||
130 | if (end <= r->start || start >= r->end) | ||
131 | continue; | ||
132 | |||
133 | /* | ||
134 | * Leave non-ok overlaps as is; let caller | ||
135 | * panic "Overlapping early reservations" | ||
136 | * when it hits this overlap. | ||
137 | */ | ||
138 | if (!r->overlap_ok) | ||
139 | return; | ||
140 | |||
141 | /* | ||
142 | * We have an ok overlap. We will drop it from the early | ||
143 | * reservation map, and add back in any non-overlapping | ||
144 | * portions (lower or upper) as separate, overlap_ok, | ||
145 | * non-overlapping ranges. | ||
146 | */ | ||
147 | |||
148 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
149 | strncpy(name, r->name, sizeof(name) - 1); | ||
150 | |||
151 | lower_start = lower_end = 0; | ||
152 | upper_start = upper_end = 0; | ||
153 | if (r->start < start) { | ||
154 | lower_start = r->start; | ||
155 | lower_end = start; | ||
156 | } | ||
157 | if (r->end > end) { | ||
158 | upper_start = end; | ||
159 | upper_end = r->end; | ||
160 | } | ||
161 | |||
162 | /* 2. Drop the original ok overlapping range */ | ||
163 | drop_range(i); | ||
164 | |||
165 | i--; /* resume for-loop on copied down entry */ | ||
166 | |||
167 | /* 3. Add back in any non-overlapping ranges. */ | ||
168 | if (lower_end) | ||
169 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
170 | if (upper_end) | ||
171 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
172 | } | ||
173 | } | ||
174 | |||
175 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
176 | int overlap_ok) | ||
177 | { | ||
178 | int i; | ||
179 | struct early_res *r; | ||
180 | |||
181 | i = find_overlapped_early(start, end); | ||
182 | if (i >= max_early_res) | ||
183 | panic("Too many early reservations"); | ||
184 | r = &early_res[i]; | ||
185 | if (r->end) | ||
186 | panic("Overlapping early reservations " | ||
187 | "%llx-%llx %s to %llx-%llx %s\n", | ||
188 | start, end - 1, name ? name : "", r->start, | ||
189 | r->end - 1, r->name); | ||
190 | r->start = start; | ||
191 | r->end = end; | ||
192 | r->overlap_ok = overlap_ok; | ||
193 | if (name) | ||
194 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
195 | early_res_count++; | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * A few early reservations come here. | ||
200 | * | ||
201 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
202 | * is ok for these reservations to overlap an earlier reservation. | ||
203 | * Rather it means that it is ok for subsequent reservations to | ||
204 | * overlap this one. | ||
205 | * | ||
206 | * Use this entry point to reserve early ranges when you are doing | ||
207 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
208 | * just in case, and don't mind a subsequent overlapping reservation | ||
209 | * that is known to be needed. | ||
210 | * | ||
211 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
212 | * It would be needed if we had two colliding 'overlap_ok' | ||
213 | * reservations, so that the second such would not panic on the | ||
214 | * overlap with the first. We don't have any such as of this | ||
215 | * writing, but might as well tolerate such if it happens in | ||
216 | * the future. | ||
217 | */ | ||
218 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
219 | { | ||
220 | drop_overlaps_that_are_ok(start, end); | ||
221 | __reserve_early(start, end, name, 1); | ||
222 | } | ||
223 | |||
224 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
225 | { | ||
226 | u64 start, end, size, mem; | ||
227 | struct early_res *new; | ||
228 | |||
229 | /* do we have enough slots left ? */ | ||
230 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
231 | return; | ||
232 | |||
233 | /* double it */ | ||
234 | mem = -1ULL; | ||
235 | size = sizeof(struct early_res) * max_early_res * 2; | ||
236 | if (early_res == early_res_x) | ||
237 | start = 0; | ||
238 | else | ||
239 | start = early_res[0].end; | ||
240 | end = ex_start; | ||
241 | if (start + size < end) | ||
242 | mem = find_fw_memmap_area(start, end, size, | ||
243 | sizeof(struct early_res)); | ||
244 | if (mem == -1ULL) { | ||
245 | start = ex_end; | ||
246 | end = get_max_mapped(); | ||
247 | if (start + size < end) | ||
248 | mem = find_fw_memmap_area(start, end, size, | ||
249 | sizeof(struct early_res)); | ||
250 | } | ||
251 | if (mem == -1ULL) | ||
252 | panic("can not find more space for early_res array"); | ||
253 | |||
254 | new = __va(mem); | ||
255 | /* save the first slot for the array's own reservation */ | ||
256 | new[0].start = mem; | ||
257 | new[0].end = mem + size; | ||
258 | new[0].overlap_ok = 0; | ||
259 | /* copy old to new */ | ||
260 | if (early_res == early_res_x) { | ||
261 | memcpy(&new[1], &early_res[0], | ||
262 | sizeof(struct early_res) * max_early_res); | ||
263 | memset(&new[max_early_res+1], 0, | ||
264 | sizeof(struct early_res) * (max_early_res - 1)); | ||
265 | early_res_count++; | ||
266 | } else { | ||
267 | memcpy(&new[1], &early_res[1], | ||
268 | sizeof(struct early_res) * (max_early_res - 1)); | ||
269 | memset(&new[max_early_res], 0, | ||
270 | sizeof(struct early_res) * max_early_res); | ||
271 | } | ||
272 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
273 | early_res = new; | ||
274 | max_early_res *= 2; | ||
275 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
276 | max_early_res, mem, mem + size - 1); | ||
277 | } | ||
278 | |||
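To make the doubling threshold concrete: with the initial MAX_EARLY_RES_X of 32, the early-return test (max_early_res - early_res_count) > max(max_early_res/8, 2) evaluates as (32 - count) > 4, so the first doubling happens once 28 entries are in use and only 4 slots remain, which leaves room for the reservation that records the relocated array itself.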
279 | /* | ||
280 | * Most early reservations come here. | ||
281 | * | ||
282 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
283 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
284 | * range without risk of panic'ing on an overlapping overlap_ok | ||
285 | * early reservation. | ||
286 | */ | ||
287 | void __init reserve_early(u64 start, u64 end, char *name) | ||
288 | { | ||
289 | if (start >= end) | ||
290 | return; | ||
291 | |||
292 | __check_and_double_early_res(start, end); | ||
293 | |||
294 | drop_overlaps_that_are_ok(start, end); | ||
295 | __reserve_early(start, end, name, 0); | ||
296 | } | ||
297 | |||
298 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
299 | { | ||
300 | struct early_res *r; | ||
301 | |||
302 | if (start >= end) | ||
303 | return; | ||
304 | |||
305 | __check_and_double_early_res(start, end); | ||
306 | |||
307 | r = &early_res[early_res_count]; | ||
308 | |||
309 | r->start = start; | ||
310 | r->end = end; | ||
311 | r->overlap_ok = 0; | ||
312 | if (name) | ||
313 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
314 | early_res_count++; | ||
315 | } | ||
316 | |||
317 | void __init free_early(u64 start, u64 end) | ||
318 | { | ||
319 | struct early_res *r; | ||
320 | int i; | ||
321 | |||
322 | i = find_overlapped_early(start, end); | ||
323 | r = &early_res[i]; | ||
324 | if (i >= max_early_res || r->end != end || r->start != start) | ||
325 | panic("free_early on not reserved area: %llx-%llx!", | ||
326 | start, end - 1); | ||
327 | |||
328 | drop_range(i); | ||
329 | } | ||
330 | |||
331 | void __init free_early_partial(u64 start, u64 end) | ||
332 | { | ||
333 | struct early_res *r; | ||
334 | int i; | ||
335 | |||
336 | if (start == end) | ||
337 | return; | ||
338 | |||
339 | if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) | ||
340 | return; | ||
341 | |||
342 | try_next: | ||
343 | i = find_overlapped_early(start, end); | ||
344 | if (i >= max_early_res) | ||
345 | return; | ||
346 | |||
347 | r = &early_res[i]; | ||
348 | /* freed region fully inside this reservation? */ | ||
349 | if (r->end >= end && r->start <= start) { | ||
350 | drop_range_partial(i, start, end); | ||
351 | return; | ||
352 | } | ||
353 | |||
354 | drop_range_partial(i, start, end); | ||
355 | goto try_next; | ||
356 | } | ||
357 | |||
358 | #ifdef CONFIG_NO_BOOTMEM | ||
359 | static void __init subtract_early_res(struct range *range, int az) | ||
360 | { | ||
361 | int i, count; | ||
362 | u64 final_start, final_end; | ||
363 | int idx = 0; | ||
364 | |||
365 | count = 0; | ||
366 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
367 | count++; | ||
368 | |||
369 | /* need to skip the first one? */ | ||
370 | if (early_res != early_res_x) | ||
371 | idx = 1; | ||
372 | |||
373 | #define DEBUG_PRINT_EARLY_RES 1 | ||
374 | |||
375 | #if DEBUG_PRINT_EARLY_RES | ||
376 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
377 | #endif | ||
378 | for (i = idx; i < count; i++) { | ||
379 | struct early_res *r = &early_res[i]; | ||
380 | #if DEBUG_PRINT_EARLY_RES | ||
381 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
382 | r->start, r->end, r->name); | ||
383 | #endif | ||
384 | final_start = PFN_DOWN(r->start); | ||
385 | final_end = PFN_UP(r->end); | ||
386 | if (final_start >= final_end) | ||
387 | continue; | ||
388 | subtract_range(range, az, final_start, final_end); | ||
389 | } | ||
390 | |||
391 | } | ||
392 | |||
393 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
394 | { | ||
395 | int i, count; | ||
396 | u64 start = 0, end; | ||
397 | u64 size; | ||
398 | u64 mem; | ||
399 | struct range *range; | ||
400 | int nr_range; | ||
401 | |||
402 | count = 0; | ||
403 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
404 | count++; | ||
405 | |||
406 | count *= 2; | ||
407 | |||
408 | size = sizeof(struct range) * count; | ||
409 | end = get_max_mapped(); | ||
410 | #ifdef MAX_DMA32_PFN | ||
411 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
412 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
413 | #endif | ||
414 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
415 | if (mem == -1ULL) | ||
416 | panic("cannot find more space for range free"); | ||
417 | |||
418 | range = __va(mem); | ||
419 | /* build the initial range array from early_node_map[] and early_res */ | ||
420 | memset(range, 0, size); | ||
421 | nr_range = 0; | ||
422 | |||
423 | /* walk early_node_map to find usable ranges for this node */ | ||
424 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
425 | #ifdef CONFIG_X86_32 | ||
426 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
427 | #endif | ||
428 | subtract_early_res(range, count); | ||
429 | nr_range = clean_sort_range(range, count); | ||
430 | |||
431 | /* need to clear it? */ | ||
432 | if (nodeid == MAX_NUMNODES) { | ||
433 | memset(&early_res[0], 0, | ||
434 | sizeof(struct early_res) * max_early_res); | ||
435 | early_res = NULL; | ||
436 | max_early_res = 0; | ||
437 | } | ||
438 | |||
439 | *rangep = range; | ||
440 | return nr_range; | ||
441 | } | ||
442 | #else | ||
443 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
444 | { | ||
445 | int i, count; | ||
446 | u64 final_start, final_end; | ||
447 | int idx = 0; | ||
448 | |||
449 | count = 0; | ||
450 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
451 | count++; | ||
452 | |||
453 | /* need to skip the first one? */ | ||
454 | if (early_res != early_res_x) | ||
455 | idx = 1; | ||
456 | |||
457 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
458 | count - idx, max_early_res, start, end); | ||
459 | for (i = idx; i < count; i++) { | ||
460 | struct early_res *r = &early_res[i]; | ||
461 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
462 | r->start, r->end, r->name); | ||
463 | final_start = max(start, r->start); | ||
464 | final_end = min(end, r->end); | ||
465 | if (final_start >= final_end) { | ||
466 | printk(KERN_CONT "\n"); | ||
467 | continue; | ||
468 | } | ||
469 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
470 | final_start, final_end); | ||
471 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
472 | BOOTMEM_DEFAULT); | ||
473 | } | ||
474 | /* clear them */ | ||
475 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
476 | early_res = NULL; | ||
477 | max_early_res = 0; | ||
478 | early_res_count = 0; | ||
479 | } | ||
480 | #endif | ||
481 | |||
482 | /* Check for already reserved areas */ | ||
483 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
484 | { | ||
485 | int i; | ||
486 | u64 addr = *addrp; | ||
487 | int changed = 0; | ||
488 | struct early_res *r; | ||
489 | again: | ||
490 | i = find_overlapped_early(addr, addr + size); | ||
491 | r = &early_res[i]; | ||
492 | if (i < max_early_res && r->end) { | ||
493 | *addrp = addr = round_up(r->end, align); | ||
494 | changed = 1; | ||
495 | goto again; | ||
496 | } | ||
497 | return changed; | ||
498 | } | ||
499 | |||
500 | /* Check for already reserved areas */ | ||
501 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
502 | { | ||
503 | int i; | ||
504 | u64 addr = *addrp, last; | ||
505 | u64 size = *sizep; | ||
506 | int changed = 0; | ||
507 | again: | ||
508 | last = addr + size; | ||
509 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
510 | struct early_res *r = &early_res[i]; | ||
511 | if (last > r->start && addr < r->start) { | ||
512 | size = r->start - addr; | ||
513 | changed = 1; | ||
514 | goto again; | ||
515 | } | ||
516 | if (last > r->end && addr < r->end) { | ||
517 | addr = round_up(r->end, align); | ||
518 | size = last - addr; | ||
519 | changed = 1; | ||
520 | goto again; | ||
521 | } | ||
522 | if (last <= r->end && addr >= r->start) { | ||
523 | (*sizep)++; | ||
524 | return 0; | ||
525 | } | ||
526 | } | ||
527 | if (changed) { | ||
528 | *addrp = addr; | ||
529 | *sizep = size; | ||
530 | } | ||
531 | return changed; | ||
532 | } | ||
533 | |||
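bad_addr_size() walks every reservation and repairs the candidate block in one of two ways: if a reservation starts inside the block, the block's tail is trimmed off; if the block starts inside a reservation, the block is moved just past it. A runnable model of those two rules for a single reservation, assuming a power-of-two alignment and omitting the fully-contained case the kernel handles separately (all names here are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Adjust the block [*addr, *addr + *size) around one reserved
	 * range [r_start, r_end); returns 1 if anything changed. */
	static int adjust(uint64_t *addr, uint64_t *size,
			  uint64_t r_start, uint64_t r_end, uint64_t align)
	{
		uint64_t last = *addr + *size;

		if (last > r_start && *addr < r_start) {
			*size = r_start - *addr;	/* reservation cuts off the tail */
			return 1;
		}
		if (last > r_end && *addr < r_end) {
			*addr = (r_end + align - 1) & ~(align - 1);
			*size = last - *addr;		/* skip past, keep the old end */
			return 1;
		}
		return 0;				/* no overlap */
	}

	int main(void)
	{
		uint64_t addr = 0x1000, size = 0x4000;

		adjust(&addr, &size, 0x3000, 0x3800, 0x1000);
		printf("candidate now [%#llx, %#llx)\n",
		       (unsigned long long)addr,
		       (unsigned long long)(addr + size));
		return 0;
	}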
534 | /* | ||
535 | * Find a free area with specified alignment in a specific range. | ||
536 | * The area between start and end must lie within an active range | ||
537 | * from early_node_map, so it is known to be usable RAM. | ||
538 | */ | ||
539 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
540 | u64 size, u64 align) | ||
541 | { | ||
542 | u64 addr, last; | ||
543 | |||
544 | addr = round_up(ei_start, align); | ||
545 | if (addr < start) | ||
546 | addr = round_up(start, align); | ||
547 | if (addr >= ei_last) | ||
548 | goto out; | ||
549 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
550 | ; | ||
551 | last = addr + size; | ||
552 | if (last > ei_last) | ||
553 | goto out; | ||
554 | if (last > end) | ||
555 | goto out; | ||
556 | |||
557 | return addr; | ||
558 | |||
559 | out: | ||
560 | return -1ULL; | ||
561 | } | ||
562 | |||
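The placement arithmetic in find_early_area() is: align the window start up, clamp to the caller's [start, end) limits, then verify the block still fits before scanning for reservations. A stand-alone sketch of just that arithmetic, with the bad_addr() scan left out (helper names are mine; align is assumed to be a power of two):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t round_up_u64(uint64_t x, uint64_t align)
	{
		return (x + align - 1) & ~(align - 1);
	}

	/* Returns an aligned address inside both [ei_start, ei_last) and
	 * [start, end), or -1ULL if the block cannot fit, as in the kernel. */
	static uint64_t place(uint64_t ei_start, uint64_t ei_last,
			      uint64_t start, uint64_t end,
			      uint64_t size, uint64_t align)
	{
		uint64_t addr = round_up_u64(ei_start, align);

		if (addr < start)
			addr = round_up_u64(start, align);
		if (addr >= ei_last || addr + size > ei_last || addr + size > end)
			return (uint64_t)-1;
		return addr;
	}

	int main(void)
	{
		printf("%#llx\n", (unsigned long long)
		       place(0x1234, 0x10000, 0x2000, 0x10000, 0x1000, 0x1000));
		return 0;
	}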
563 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
564 | u64 *sizep, u64 align) | ||
565 | { | ||
566 | u64 addr, last; | ||
567 | |||
568 | addr = round_up(ei_start, align); | ||
569 | if (addr < start) | ||
570 | addr = round_up(start, align); | ||
571 | if (addr >= ei_last) | ||
572 | goto out; | ||
573 | *sizep = ei_last - addr; | ||
574 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
575 | ; | ||
576 | last = addr + *sizep; | ||
577 | if (last > ei_last) | ||
578 | goto out; | ||
579 | |||
580 | return addr; | ||
581 | |||
582 | out: | ||
583 | return -1ULL; | ||
584 | } | ||
diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 000000000000..ff915efef66d --- /dev/null +++ b/kernel/elfcore.c | |||
@@ -0,0 +1,28 @@ | |||
1 | #include <linux/elf.h> | ||
2 | #include <linux/fs.h> | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | #include <asm/elf.h> | ||
6 | |||
7 | |||
8 | Elf_Half __weak elf_core_extra_phdrs(void) | ||
9 | { | ||
10 | return 0; | ||
11 | } | ||
12 | |||
13 | int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, | ||
14 | unsigned long limit) | ||
15 | { | ||
16 | return 1; | ||
17 | } | ||
18 | |||
19 | int __weak elf_core_write_extra_data(struct file *file, size_t *size, | ||
20 | unsigned long limit) | ||
21 | { | ||
22 | return 1; | ||
23 | } | ||
24 | |||
25 | size_t __weak elf_core_extra_data_size(void) | ||
26 | { | ||
27 | return 0; | ||
28 | } | ||
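The new kernel/elfcore.c works through weak linkage: each stub is a default that vanishes at link time if an architecture supplies a strong definition of the same symbol. A self-contained userspace illustration of the mechanism (the symbol name is made up):

	#include <stdio.h>

	/* Weak default, analogous to the elf_core_* stubs above: any strong
	 * definition of extra_phdrs() elsewhere overrides this at link time. */
	__attribute__((weak)) int extra_phdrs(void)
	{
		return 0;
	}

	/* A second translation unit could provide the override, e.g.:
	 *     int extra_phdrs(void) { return 2; }
	 * with no changes needed in this file. */

	int main(void)
	{
		printf("extra phdrs: %d\n", extra_phdrs());
		return 0;
	}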
diff --git a/kernel/exit.c b/kernel/exit.c index 546774a31a66..7f2683a10ac4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
85 | BUG_ON(!sig); | 85 | BUG_ON(!sig); |
86 | BUG_ON(!atomic_read(&sig->count)); | 86 | BUG_ON(!atomic_read(&sig->count)); |
87 | 87 | ||
88 | sighand = rcu_dereference(tsk->sighand); | 88 | sighand = rcu_dereference_check(tsk->sighand, |
89 | rcu_read_lock_held() || | ||
90 | lockdep_tasklist_lock_is_held()); | ||
89 | spin_lock(&sighand->siglock); | 91 | spin_lock(&sighand->siglock); |
90 | 92 | ||
91 | posix_cpu_timers_exit(tsk); | 93 | posix_cpu_timers_exit(tsk); |
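rcu_dereference_check() is the lockdep-aware form of rcu_dereference(): the second argument is a boolean lockdep expression listing every condition that legitimizes the dereference, and CONFIG_PROVE_RCU warns only when none of them hold. The pattern, in kernel context (a sketch, not a complete file):

	/* Legal either inside an RCU read-side critical section or
	 * while holding tasklist_lock; lockdep checks both. */
	struct sighand_struct *sighand;

	sighand = rcu_dereference_check(tsk->sighand,
					rcu_read_lock_held() ||
					lockdep_tasklist_lock_is_held());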
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p) | |||
170 | repeat: | 172 | repeat: |
171 | tracehook_prepare_release_task(p); | 173 | tracehook_prepare_release_task(p); |
172 | /* don't need to get the RCU readlock here - the process is dead and | 174 | /* don't need to get the RCU readlock here - the process is dead and |
173 | * can't be modifying its own credentials */ | 175 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
176 | rcu_read_lock(); | ||
174 | atomic_dec(&__task_cred(p)->user->processes); | 177 | atomic_dec(&__task_cred(p)->user->processes); |
178 | rcu_read_unlock(); | ||
175 | 179 | ||
176 | proc_flush_task(p); | 180 | proc_flush_task(p); |
177 | 181 | ||
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files) | |||
473 | /* | 477 | /* |
474 | * It is safe to dereference the fd table without RCU or | 478 | * It is safe to dereference the fd table without RCU or |
475 | * ->file_lock because this is the last reference to the | 479 | * ->file_lock because this is the last reference to the |
476 | * files structure. | 480 | * files structure. But use RCU to shut RCU-lockdep up. |
477 | */ | 481 | */ |
482 | rcu_read_lock(); | ||
478 | fdt = files_fdtable(files); | 483 | fdt = files_fdtable(files); |
484 | rcu_read_unlock(); | ||
479 | for (;;) { | 485 | for (;;) { |
480 | unsigned long set; | 486 | unsigned long set; |
481 | i = j * __NFDBITS; | 487 | i = j * __NFDBITS; |
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files) | |||
521 | * at the end of the RCU grace period. Otherwise, | 527 | * at the end of the RCU grace period. Otherwise, |
522 | * you can free files immediately. | 528 | * you can free files immediately. |
523 | */ | 529 | */ |
530 | rcu_read_lock(); | ||
524 | fdt = files_fdtable(files); | 531 | fdt = files_fdtable(files); |
525 | if (fdt != &files->fdtab) | 532 | if (fdt != &files->fdtab) |
526 | kmem_cache_free(files_cachep, files); | 533 | kmem_cache_free(files_cachep, files); |
527 | free_fdtable(fdt); | 534 | free_fdtable(fdt); |
535 | rcu_read_unlock(); | ||
528 | } | 536 | } |
529 | } | 537 | } |
530 | 538 | ||
@@ -944,7 +952,9 @@ NORET_TYPE void do_exit(long code) | |||
944 | preempt_count()); | 952 | preempt_count()); |
945 | 953 | ||
946 | acct_update_integrals(tsk); | 954 | acct_update_integrals(tsk); |
947 | 955 | /* sync mm's RSS info before statistics gathering */ | |
956 | if (tsk->mm) | ||
957 | sync_mm_rss(tsk, tsk->mm); | ||
948 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 958 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
949 | if (group_dead) { | 959 | if (group_dead) { |
950 | hrtimer_cancel(&tsk->signal->real_timer); | 960 | hrtimer_cancel(&tsk->signal->real_timer); |
@@ -1180,7 +1190,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1180 | 1190 | ||
1181 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1191 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
1182 | int exit_code = p->exit_code; | 1192 | int exit_code = p->exit_code; |
1183 | int why, status; | 1193 | int why; |
1184 | 1194 | ||
1185 | get_task_struct(p); | 1195 | get_task_struct(p); |
1186 | read_unlock(&tasklist_lock); | 1196 | read_unlock(&tasklist_lock); |
diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc2..44b0791b0a2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; | |||
87 | 87 | ||
88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | 88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
89 | 89 | ||
90 | #ifdef CONFIG_PROVE_RCU | ||
91 | int lockdep_tasklist_lock_is_held(void) | ||
92 | { | ||
93 | return lockdep_is_held(&tasklist_lock); | ||
94 | } | ||
95 | EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); | ||
96 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
97 | |||
90 | int nr_processes(void) | 98 | int nr_processes(void) |
91 | { | 99 | { |
92 | int cpu; | 100 | int cpu; |
@@ -328,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
328 | if (!tmp) | 336 | if (!tmp) |
329 | goto fail_nomem; | 337 | goto fail_nomem; |
330 | *tmp = *mpnt; | 338 | *tmp = *mpnt; |
339 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
331 | pol = mpol_dup(vma_policy(mpnt)); | 340 | pol = mpol_dup(vma_policy(mpnt)); |
332 | retval = PTR_ERR(pol); | 341 | retval = PTR_ERR(pol); |
333 | if (IS_ERR(pol)) | 342 | if (IS_ERR(pol)) |
334 | goto fail_nomem_policy; | 343 | goto fail_nomem_policy; |
335 | vma_set_policy(tmp, pol); | 344 | vma_set_policy(tmp, pol); |
345 | if (anon_vma_fork(tmp, mpnt)) | ||
346 | goto fail_nomem_anon_vma_fork; | ||
336 | tmp->vm_flags &= ~VM_LOCKED; | 347 | tmp->vm_flags &= ~VM_LOCKED; |
337 | tmp->vm_mm = mm; | 348 | tmp->vm_mm = mm; |
338 | tmp->vm_next = NULL; | 349 | tmp->vm_next = NULL; |
339 | anon_vma_link(tmp); | ||
340 | file = tmp->vm_file; | 350 | file = tmp->vm_file; |
341 | if (file) { | 351 | if (file) { |
342 | struct inode *inode = file->f_path.dentry->d_inode; | 352 | struct inode *inode = file->f_path.dentry->d_inode; |
@@ -391,6 +401,8 @@ out: | |||
391 | flush_tlb_mm(oldmm); | 401 | flush_tlb_mm(oldmm); |
392 | up_write(&oldmm->mmap_sem); | 402 | up_write(&oldmm->mmap_sem); |
393 | return retval; | 403 | return retval; |
404 | fail_nomem_anon_vma_fork: | ||
405 | mpol_put(pol); | ||
394 | fail_nomem_policy: | 406 | fail_nomem_policy: |
395 | kmem_cache_free(vm_area_cachep, tmp); | 407 | kmem_cache_free(vm_area_cachep, tmp); |
396 | fail_nomem: | 408 | fail_nomem: |
@@ -454,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
454 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; | 466 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; |
455 | mm->core_state = NULL; | 467 | mm->core_state = NULL; |
456 | mm->nr_ptes = 0; | 468 | mm->nr_ptes = 0; |
457 | set_mm_counter(mm, file_rss, 0); | 469 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
458 | set_mm_counter(mm, anon_rss, 0); | ||
459 | spin_lock_init(&mm->page_table_lock); | 470 | spin_lock_init(&mm->page_table_lock); |
460 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 471 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
461 | mm->cached_hole_size = ~0UL; | 472 | mm->cached_hole_size = ~0UL; |
@@ -824,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
824 | */ | 835 | */ |
825 | static void posix_cpu_timers_init_group(struct signal_struct *sig) | 836 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
826 | { | 837 | { |
838 | unsigned long cpu_limit; | ||
839 | |||
827 | /* Thread group counters. */ | 840 | /* Thread group counters. */ |
828 | thread_group_cputime_init(sig); | 841 | thread_group_cputime_init(sig); |
829 | 842 | ||
830 | /* Expiration times and increments. */ | 843 | cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
831 | sig->it[CPUCLOCK_PROF].expires = cputime_zero; | 844 | if (cpu_limit != RLIM_INFINITY) { |
832 | sig->it[CPUCLOCK_PROF].incr = cputime_zero; | 845 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); |
833 | sig->it[CPUCLOCK_VIRT].expires = cputime_zero; | ||
834 | sig->it[CPUCLOCK_VIRT].incr = cputime_zero; | ||
835 | |||
836 | /* Cached expiration times. */ | ||
837 | sig->cputime_expires.prof_exp = cputime_zero; | ||
838 | sig->cputime_expires.virt_exp = cputime_zero; | ||
839 | sig->cputime_expires.sched_exp = 0; | ||
840 | |||
841 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | ||
842 | sig->cputime_expires.prof_exp = | ||
843 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | ||
844 | sig->cputimer.running = 1; | 846 | sig->cputimer.running = 1; |
845 | } | 847 | } |
846 | 848 | ||
@@ -857,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
857 | if (clone_flags & CLONE_THREAD) | 859 | if (clone_flags & CLONE_THREAD) |
858 | return 0; | 860 | return 0; |
859 | 861 | ||
860 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 862 | sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); |
861 | tsk->signal = sig; | 863 | tsk->signal = sig; |
862 | if (!sig) | 864 | if (!sig) |
863 | return -ENOMEM; | 865 | return -ENOMEM; |
@@ -865,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
865 | atomic_set(&sig->count, 1); | 867 | atomic_set(&sig->count, 1); |
866 | atomic_set(&sig->live, 1); | 868 | atomic_set(&sig->live, 1); |
867 | init_waitqueue_head(&sig->wait_chldexit); | 869 | init_waitqueue_head(&sig->wait_chldexit); |
868 | sig->flags = 0; | ||
869 | if (clone_flags & CLONE_NEWPID) | 870 | if (clone_flags & CLONE_NEWPID) |
870 | sig->flags |= SIGNAL_UNKILLABLE; | 871 | sig->flags |= SIGNAL_UNKILLABLE; |
871 | sig->group_exit_code = 0; | ||
872 | sig->group_exit_task = NULL; | ||
873 | sig->group_stop_count = 0; | ||
874 | sig->curr_target = tsk; | 872 | sig->curr_target = tsk; |
875 | init_sigpending(&sig->shared_pending); | 873 | init_sigpending(&sig->shared_pending); |
876 | INIT_LIST_HEAD(&sig->posix_timers); | 874 | INIT_LIST_HEAD(&sig->posix_timers); |
877 | 875 | ||
878 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 876 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
879 | sig->it_real_incr.tv64 = 0; | ||
880 | sig->real_timer.function = it_real_fn; | 877 | sig->real_timer.function = it_real_fn; |
881 | 878 | ||
882 | sig->leader = 0; /* session leadership doesn't inherit */ | ||
883 | sig->tty_old_pgrp = NULL; | ||
884 | sig->tty = NULL; | ||
885 | |||
886 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | ||
887 | sig->gtime = cputime_zero; | ||
888 | sig->cgtime = cputime_zero; | ||
889 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
890 | sig->prev_utime = sig->prev_stime = cputime_zero; | ||
891 | #endif | ||
892 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | ||
893 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | ||
894 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | ||
895 | sig->maxrss = sig->cmaxrss = 0; | ||
896 | task_io_accounting_init(&sig->ioac); | ||
897 | sig->sum_sched_runtime = 0; | ||
898 | taskstats_tgid_init(sig); | ||
899 | |||
900 | task_lock(current->group_leader); | 879 | task_lock(current->group_leader); |
901 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); | 880 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); |
902 | task_unlock(current->group_leader); | 881 | task_unlock(current->group_leader); |
903 | 882 | ||
904 | posix_cpu_timers_init_group(sig); | 883 | posix_cpu_timers_init_group(sig); |
905 | 884 | ||
906 | acct_init_pacct(&sig->pacct); | ||
907 | |||
908 | tty_audit_fork(sig); | 885 | tty_audit_fork(sig); |
909 | 886 | ||
910 | sig->oom_adj = current->signal->oom_adj; | 887 | sig->oom_adj = current->signal->oom_adj; |
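The switch to kmem_cache_zalloc() is what makes the long run of deleted zero-initializations above safe to drop: the allocation comes back already cleared. The helper is essentially shorthand for passing __GFP_ZERO (sketch, kernel context):

	/* equivalent allocation: cache object returned pre-zeroed */
	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL | __GFP_ZERO);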
@@ -1033,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1033 | #endif | 1010 | #endif |
1034 | retval = -EAGAIN; | 1011 | retval = -EAGAIN; |
1035 | if (atomic_read(&p->real_cred->user->processes) >= | 1012 | if (atomic_read(&p->real_cred->user->processes) >= |
1036 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 1013 | task_rlimit(p, RLIMIT_NPROC)) { |
1037 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | 1014 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && |
1038 | p->real_cred->user != INIT_USER) | 1015 | p->real_cred->user != INIT_USER) |
1039 | goto bad_fork_free; | 1016 | goto bad_fork_free; |
@@ -1075,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1075 | p->prev_utime = cputime_zero; | 1052 | p->prev_utime = cputime_zero; |
1076 | p->prev_stime = cputime_zero; | 1053 | p->prev_stime = cputime_zero; |
1077 | #endif | 1054 | #endif |
1055 | #if defined(SPLIT_RSS_COUNTING) | ||
1056 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | ||
1057 | #endif | ||
1078 | 1058 | ||
1079 | p->default_timer_slack_ns = current->timer_slack_ns; | 1059 | p->default_timer_slack_ns = current->timer_slack_ns; |
1080 | 1060 | ||
@@ -1241,21 +1221,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1241 | /* Need tasklist lock for parent etc handling! */ | 1221 | /* Need tasklist lock for parent etc handling! */ |
1242 | write_lock_irq(&tasklist_lock); | 1222 | write_lock_irq(&tasklist_lock); |
1243 | 1223 | ||
1244 | /* | ||
1245 | * The task hasn't been attached yet, so its cpus_allowed mask will | ||
1246 | * not be changed, nor will its assigned CPU. | ||
1247 | * | ||
1248 | * The cpus_allowed mask of the parent may have changed after it was | ||
1249 | * copied first time - so re-copy it here, then check the child's CPU | ||
1250 | * to ensure it is on a valid CPU (and if not, just force it back to | ||
1251 | * parent's CPU). This avoids a lot of nasty races. | ||
1252 | */ | ||
1253 | p->cpus_allowed = current->cpus_allowed; | ||
1254 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
1255 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | ||
1256 | !cpu_online(task_cpu(p)))) | ||
1257 | set_task_cpu(p, smp_processor_id()); | ||
1258 | |||
1259 | /* CLONE_PARENT re-uses the old parent */ | 1224 | /* CLONE_PARENT re-uses the old parent */ |
1260 | if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { | 1225 | if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { |
1261 | p->real_parent = current->real_parent; | 1226 | p->real_parent = current->real_parent; |
diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9d..e7a35f1039e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
530 | return -EINVAL; | 530 | return -EINVAL; |
531 | 531 | ||
532 | WARN_ON(!atomic_read(&pi_state->refcount)); | 532 | WARN_ON(!atomic_read(&pi_state->refcount)); |
533 | WARN_ON(pid && pi_state->owner && | 533 | |
534 | pi_state->owner->pid != pid); | 534 | /* |
535 | * When pi_state->owner is NULL then the owner died | ||
536 | * and another waiter is on the fly. pi_state->owner | ||
537 | * is fixed up by the task which acquires | ||
538 | * pi_state->rt_mutex. | ||
539 | * | ||
540 | * We do not check for pid == 0 which can happen when | ||
541 | * the owner died and robust_list_exit() cleared the | ||
542 | * TID. | ||
543 | */ | ||
544 | if (pid && pi_state->owner) { | ||
545 | /* | ||
546 | * Bail out if user space manipulated the | ||
547 | * futex value. | ||
548 | */ | ||
549 | if (pid != task_pid_vnr(pi_state->owner)) | ||
550 | return -EINVAL; | ||
551 | } | ||
535 | 552 | ||
536 | atomic_inc(&pi_state->refcount); | 553 | atomic_inc(&pi_state->refcount); |
537 | *ps = pi_state; | 554 | *ps = pi_state; |
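Condensed, the decision the new check makes is (kernel-context restatement of the logic above):

	/*
	 *   pid == 0                -> robust-list exit cleared the TID: accept
	 *   pi_state->owner == NULL -> owner died, fixup pending: accept
	 *   otherwise a mismatch    -> user space corrupted the futex: fail
	 */
	if (pid && pi_state->owner && pid != task_pid_vnr(pi_state->owner))
		return -EINVAL;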
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
758 | if (!pi_state) | 775 | if (!pi_state) |
759 | return -EINVAL; | 776 | return -EINVAL; |
760 | 777 | ||
778 | /* | ||
779 | * If current does not own the pi_state then the futex is | ||
780 | * inconsistent and user space fiddled with the futex value. | ||
781 | */ | ||
782 | if (pi_state->owner != current) | ||
783 | return -EINVAL; | ||
784 | |||
761 | raw_spin_lock(&pi_state->pi_mutex.wait_lock); | 785 | raw_spin_lock(&pi_state->pi_mutex.wait_lock); |
762 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 786 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
763 | 787 | ||
@@ -1971,7 +1995,7 @@ retry_private: | |||
1971 | /* Unqueue and drop the lock */ | 1995 | /* Unqueue and drop the lock */ |
1972 | unqueue_me_pi(&q); | 1996 | unqueue_me_pi(&q); |
1973 | 1997 | ||
1974 | goto out; | 1998 | goto out_put_key; |
1975 | 1999 | ||
1976 | out_unlock_put_key: | 2000 | out_unlock_put_key: |
1977 | queue_unlock(&q, hb); | 2001 | queue_unlock(&q, hb); |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 235716556bf1..d49afb2395e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
146 | struct task_struct *p; | 146 | struct task_struct *p; |
147 | 147 | ||
148 | ret = -ESRCH; | 148 | ret = -ESRCH; |
149 | read_lock(&tasklist_lock); | 149 | rcu_read_lock(); |
150 | p = find_task_by_vpid(pid); | 150 | p = find_task_by_vpid(pid); |
151 | if (!p) | 151 | if (!p) |
152 | goto err_unlock; | 152 | goto err_unlock; |
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
157 | !capable(CAP_SYS_PTRACE)) | 157 | !capable(CAP_SYS_PTRACE)) |
158 | goto err_unlock; | 158 | goto err_unlock; |
159 | head = p->compat_robust_list; | 159 | head = p->compat_robust_list; |
160 | read_unlock(&tasklist_lock); | 160 | rcu_read_unlock(); |
161 | } | 161 | } |
162 | 162 | ||
163 | if (put_user(sizeof(*head), len_ptr)) | 163 | if (put_user(sizeof(*head), len_ptr)) |
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
165 | return put_user(ptr_to_compat(head), head_ptr); | 165 | return put_user(ptr_to_compat(head), head_ptr); |
166 | 166 | ||
167 | err_unlock: | 167 | err_unlock: |
168 | read_unlock(&tasklist_lock); | 168 | rcu_read_unlock(); |
169 | 169 | ||
170 | return ret; | 170 | return ret; |
171 | } | 171 | } |
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..03808ed342a6 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) | |||
243 | * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) | 243 | * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) |
244 | * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM | 244 | * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM |
245 | */ | 245 | */ |
246 | int reserve_bp_slot(struct perf_event *bp) | 246 | static int __reserve_bp_slot(struct perf_event *bp) |
247 | { | 247 | { |
248 | struct bp_busy_slots slots = {0}; | 248 | struct bp_busy_slots slots = {0}; |
249 | int ret = 0; | ||
250 | |||
251 | mutex_lock(&nr_bp_mutex); | ||
252 | 249 | ||
253 | fetch_bp_busy_slots(&slots, bp); | 250 | fetch_bp_busy_slots(&slots, bp); |
254 | 251 | ||
255 | /* Flexible counters need to keep at least one slot */ | 252 | /* Flexible counters need to keep at least one slot */ |
256 | if (slots.pinned + (!!slots.flexible) == HBP_NUM) { | 253 | if (slots.pinned + (!!slots.flexible) == HBP_NUM) |
257 | ret = -ENOSPC; | 254 | return -ENOSPC; |
258 | goto end; | ||
259 | } | ||
260 | 255 | ||
261 | toggle_bp_slot(bp, true); | 256 | toggle_bp_slot(bp, true); |
262 | 257 | ||
263 | end: | 258 | return 0; |
259 | } | ||
260 | |||
261 | int reserve_bp_slot(struct perf_event *bp) | ||
262 | { | ||
263 | int ret; | ||
264 | |||
265 | mutex_lock(&nr_bp_mutex); | ||
266 | |||
267 | ret = __reserve_bp_slot(bp); | ||
268 | |||
264 | mutex_unlock(&nr_bp_mutex); | 269 | mutex_unlock(&nr_bp_mutex); |
265 | 270 | ||
266 | return ret; | 271 | return ret; |
267 | } | 272 | } |
268 | 273 | ||
274 | static void __release_bp_slot(struct perf_event *bp) | ||
275 | { | ||
276 | toggle_bp_slot(bp, false); | ||
277 | } | ||
278 | |||
269 | void release_bp_slot(struct perf_event *bp) | 279 | void release_bp_slot(struct perf_event *bp) |
270 | { | 280 | { |
271 | mutex_lock(&nr_bp_mutex); | 281 | mutex_lock(&nr_bp_mutex); |
272 | 282 | ||
273 | toggle_bp_slot(bp, false); | 283 | __release_bp_slot(bp); |
274 | 284 | ||
275 | mutex_unlock(&nr_bp_mutex); | 285 | mutex_unlock(&nr_bp_mutex); |
276 | } | 286 | } |
277 | 287 | ||
288 | /* | ||
289 | * Allow the kernel debugger to reserve breakpoint slots without | ||
290 | * taking a lock, using the dbg_* variants of the reserve and | ||
291 | * release functions. | ||
292 | */ | ||
293 | int dbg_reserve_bp_slot(struct perf_event *bp) | ||
294 | { | ||
295 | if (mutex_is_locked(&nr_bp_mutex)) | ||
296 | return -1; | ||
297 | |||
298 | return __reserve_bp_slot(bp); | ||
299 | } | ||
300 | |||
301 | int dbg_release_bp_slot(struct perf_event *bp) | ||
302 | { | ||
303 | if (mutex_is_locked(&nr_bp_mutex)) | ||
304 | return -1; | ||
305 | |||
306 | __release_bp_slot(bp); | ||
307 | |||
308 | return 0; | ||
309 | } | ||
278 | 310 | ||
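The dbg_* entry points exist because the kernel debugger runs with the machine stopped and must not sleep on nr_bp_mutex; a held mutex is treated as failure rather than something to wait for. The general try-else-bail shape (mutex and worker names here are hypothetical):

	int dbg_do_update(void)
	{
		if (mutex_is_locked(&update_mutex))	/* waiting would deadlock */
			return -1;

		/* safe only because nothing else can take the mutex
		 * while the machine is stopped */
		return __do_update();
	}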
279 | int register_perf_hw_breakpoint(struct perf_event *bp) | 311 | int register_perf_hw_breakpoint(struct perf_event *bp) |
280 | { | 312 | { |
@@ -328,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | |||
328 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) | 360 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) |
329 | { | 361 | { |
330 | u64 old_addr = bp->attr.bp_addr; | 362 | u64 old_addr = bp->attr.bp_addr; |
363 | u64 old_len = bp->attr.bp_len; | ||
331 | int old_type = bp->attr.bp_type; | 364 | int old_type = bp->attr.bp_type; |
332 | int old_len = bp->attr.bp_len; | ||
333 | int err = 0; | 365 | int err = 0; |
334 | 366 | ||
335 | perf_event_disable(bp); | 367 | perf_event_disable(bp); |
@@ -381,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); | |||
381 | * | 413 | * |
382 | * @return a set of per_cpu pointers to perf events | 414 | * @return a set of per_cpu pointers to perf events |
383 | */ | 415 | */ |
384 | struct perf_event ** | 416 | struct perf_event * __percpu * |
385 | register_wide_hw_breakpoint(struct perf_event_attr *attr, | 417 | register_wide_hw_breakpoint(struct perf_event_attr *attr, |
386 | perf_overflow_handler_t triggered) | 418 | perf_overflow_handler_t triggered) |
387 | { | 419 | { |
388 | struct perf_event **cpu_events, **pevent, *bp; | 420 | struct perf_event * __percpu *cpu_events, **pevent, *bp; |
389 | long err; | 421 | long err; |
390 | int cpu; | 422 | int cpu; |
391 | 423 | ||
392 | cpu_events = alloc_percpu(typeof(*cpu_events)); | 424 | cpu_events = alloc_percpu(typeof(*cpu_events)); |
393 | if (!cpu_events) | 425 | if (!cpu_events) |
394 | return ERR_PTR(-ENOMEM); | 426 | return (void __percpu __force *)ERR_PTR(-ENOMEM); |
395 | 427 | ||
396 | get_online_cpus(); | 428 | get_online_cpus(); |
397 | for_each_online_cpu(cpu) { | 429 | for_each_online_cpu(cpu) { |
@@ -419,7 +451,7 @@ fail: | |||
419 | put_online_cpus(); | 451 | put_online_cpus(); |
420 | 452 | ||
421 | free_percpu(cpu_events); | 453 | free_percpu(cpu_events); |
422 | return ERR_PTR(err); | 454 | return (void __percpu __force *)ERR_PTR(err); |
423 | } | 455 | } |
424 | EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); | 456 | EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); |
425 | 457 | ||
@@ -427,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); | |||
427 | * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel | 459 | * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel |
428 | * @cpu_events: the per cpu set of events to unregister | 460 | * @cpu_events: the per cpu set of events to unregister |
429 | */ | 461 | */ |
430 | void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) | 462 | void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) |
431 | { | 463 | { |
432 | int cpu; | 464 | int cpu; |
433 | struct perf_event **pevent; | 465 | struct perf_event **pevent; |
@@ -457,5 +489,4 @@ struct pmu perf_ops_bp = { | |||
457 | .enable = arch_install_hw_breakpoint, | 489 | .enable = arch_install_hw_breakpoint, |
458 | .disable = arch_uninstall_hw_breakpoint, | 490 | .disable = arch_uninstall_hw_breakpoint, |
459 | .read = hw_breakpoint_pmu_read, | 491 | .read = hw_breakpoint_pmu_read, |
460 | .unthrottle = hw_breakpoint_pmu_unthrottle | ||
461 | }; | 492 | }; |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..b7091d5ca2f8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -18,11 +18,7 @@ | |||
18 | 18 | ||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | /** | 21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) |
22 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
23 | * @irq: irq number to initialize | ||
24 | */ | ||
25 | void dynamic_irq_init(unsigned int irq) | ||
26 | { | 22 | { |
27 | struct irq_desc *desc; | 23 | struct irq_desc *desc; |
28 | unsigned long flags; | 24 | unsigned long flags; |
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) | |||
41 | desc->depth = 1; | 37 | desc->depth = 1; |
42 | desc->msi_desc = NULL; | 38 | desc->msi_desc = NULL; |
43 | desc->handler_data = NULL; | 39 | desc->handler_data = NULL; |
44 | desc->chip_data = NULL; | 40 | if (!keep_chip_data) |
41 | desc->chip_data = NULL; | ||
45 | desc->action = NULL; | 42 | desc->action = NULL; |
46 | desc->irq_count = 0; | 43 | desc->irq_count = 0; |
47 | desc->irqs_unhandled = 0; | 44 | desc->irqs_unhandled = 0; |
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) | |||
55 | } | 52 | } |
56 | 53 | ||
57 | /** | 54 | /** |
58 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 55 | * dynamic_irq_init - initialize a dynamically allocated irq |
59 | * @irq: irq number to initialize | 56 | * @irq: irq number to initialize |
60 | */ | 57 | */ |
61 | void dynamic_irq_cleanup(unsigned int irq) | 58 | void dynamic_irq_init(unsigned int irq) |
59 | { | ||
60 | dynamic_irq_init_x(irq, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
65 | * @irq: irq number to initialize | ||
66 | * | ||
67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
68 | */ | ||
69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
70 | { | ||
71 | dynamic_irq_init_x(irq, true); | ||
72 | } | ||
73 | |||
74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
62 | { | 75 | { |
63 | struct irq_desc *desc = irq_to_desc(irq); | 76 | struct irq_desc *desc = irq_to_desc(irq); |
64 | unsigned long flags; | 77 | unsigned long flags; |
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
77 | } | 90 | } |
78 | desc->msi_desc = NULL; | 91 | desc->msi_desc = NULL; |
79 | desc->handler_data = NULL; | 92 | desc->handler_data = NULL; |
80 | desc->chip_data = NULL; | 93 | if (!keep_chip_data) |
94 | desc->chip_data = NULL; | ||
81 | desc->handle_irq = handle_bad_irq; | 95 | desc->handle_irq = handle_bad_irq; |
82 | desc->chip = &no_irq_chip; | 96 | desc->chip = &no_irq_chip; |
83 | desc->name = NULL; | 97 | desc->name = NULL; |
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
85 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 99 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
86 | } | 100 | } |
87 | 101 | ||
102 | /** | ||
103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
104 | * @irq: irq number to clean up | ||
105 | */ | ||
106 | void dynamic_irq_cleanup(unsigned int irq) | ||
107 | { | ||
108 | dynamic_irq_cleanup_x(irq, false); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
113 | * @irq: irq number to clean up | ||
114 | * | ||
115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
116 | */ | ||
117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
118 | { | ||
119 | dynamic_irq_cleanup_x(irq, true); | ||
120 | } | ||
121 | |||
88 | 122 | ||
89 | /** | 123 | /** |
90 | * set_irq_chip - set the irq chip for an irq | 124 | * set_irq_chip - set the irq chip for an irq |
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) | |||
325 | if (desc->chip->ack) | 359 | if (desc->chip->ack) |
326 | desc->chip->ack(irq); | 360 | desc->chip->ack(irq); |
327 | } | 361 | } |
362 | desc->status |= IRQ_MASKED; | ||
363 | } | ||
364 | |||
365 | static inline void mask_irq(struct irq_desc *desc, int irq) | ||
366 | { | ||
367 | if (desc->chip->mask) { | ||
368 | desc->chip->mask(irq); | ||
369 | desc->status |= IRQ_MASKED; | ||
370 | } | ||
371 | } | ||
372 | |||
373 | static inline void unmask_irq(struct irq_desc *desc, int irq) | ||
374 | { | ||
375 | if (desc->chip->unmask) { | ||
376 | desc->chip->unmask(irq); | ||
377 | desc->status &= ~IRQ_MASKED; | ||
378 | } | ||
328 | } | 379 | } |
329 | 380 | ||
330 | /* | 381 | /* |
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
450 | raw_spin_lock(&desc->lock); | 501 | raw_spin_lock(&desc->lock); |
451 | desc->status &= ~IRQ_INPROGRESS; | 502 | desc->status &= ~IRQ_INPROGRESS; |
452 | 503 | ||
453 | if (unlikely(desc->status & IRQ_ONESHOT)) | 504 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) |
454 | desc->status |= IRQ_MASKED; | 505 | unmask_irq(desc, irq); |
455 | else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) | ||
456 | desc->chip->unmask(irq); | ||
457 | out_unlock: | 506 | out_unlock: |
458 | raw_spin_unlock(&desc->lock); | 507 | raw_spin_unlock(&desc->lock); |
459 | } | 508 | } |
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
490 | action = desc->action; | 539 | action = desc->action; |
491 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 540 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
492 | desc->status |= IRQ_PENDING; | 541 | desc->status |= IRQ_PENDING; |
493 | if (desc->chip->mask) | 542 | mask_irq(desc, irq); |
494 | desc->chip->mask(irq); | ||
495 | goto out; | 543 | goto out; |
496 | } | 544 | } |
497 | 545 | ||
@@ -520,7 +568,7 @@ out: | |||
520 | * signal. The occurrence is latched into the irq controller hardware | 568 | * signal. The occurrence is latched into the irq controller hardware
521 | * and must be acked in order to be reenabled. After the ack another | 569 | * and must be acked in order to be reenabled. After the ack another |
522 | * interrupt can happen on the same source even before the first one | 570 | * interrupt can happen on the same source even before the first one |
523 | * is handled by the assosiacted event handler. If this happens it | 571 | * is handled by the associated event handler. If this happens it |
524 | * might be necessary to disable (mask) the interrupt depending on the | 572 | * might be necessary to disable (mask) the interrupt depending on the |
525 | * controller hardware. This requires reenabling the interrupt inside | 573 | * controller hardware. This requires reenabling the interrupt inside
526 | * the loop which handles the interrupts that have arrived while | 574 | * the loop which handles the interrupts that have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
559 | irqreturn_t action_ret; | 607 | irqreturn_t action_ret; |
560 | 608 | ||
561 | if (unlikely(!action)) { | 609 | if (unlikely(!action)) { |
562 | desc->chip->mask(irq); | 610 | mask_irq(desc, irq); |
563 | goto out_unlock; | 611 | goto out_unlock; |
564 | } | 612 | } |
565 | 613 | ||
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
571 | if (unlikely((desc->status & | 619 | if (unlikely((desc->status & |
572 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 620 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == |
573 | (IRQ_PENDING | IRQ_MASKED))) { | 621 | (IRQ_PENDING | IRQ_MASKED))) { |
574 | desc->chip->unmask(irq); | 622 | unmask_irq(desc, irq); |
575 | desc->status &= ~IRQ_MASKED; | ||
576 | } | 623 | } |
577 | 624 | ||
578 | desc->status &= ~IRQ_PENDING; | 625 | desc->status &= ~IRQ_PENDING; |
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
682 | __set_irq_handler(irq, handle, 0, name); | 729 | __set_irq_handler(irq, handle, 0, name); |
683 | } | 730 | } |
684 | 731 | ||
685 | void __init set_irq_noprobe(unsigned int irq) | 732 | void set_irq_noprobe(unsigned int irq) |
686 | { | 733 | { |
687 | struct irq_desc *desc = irq_to_desc(irq); | 734 | struct irq_desc *desc = irq_to_desc(irq); |
688 | unsigned long flags; | 735 | unsigned long flags; |
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) | |||
697 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 744 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
698 | } | 745 | } |
699 | 746 | ||
700 | void __init set_irq_probe(unsigned int irq) | 747 | void set_irq_probe(unsigned int irq) |
701 | { | 748 | { |
702 | struct irq_desc *desc = irq_to_desc(irq); | 749 | struct irq_desc *desc = irq_to_desc(irq); |
703 | unsigned long flags; | 750 | unsigned long flags; |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cba..1ef4ffcdfa55 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) | |||
42 | * automatically freed on driver detach. | 42 | * automatically freed on driver detach. |
43 | * | 43 | * |
44 | * If an IRQ allocated with this function needs to be freed | 44 | * If an IRQ allocated with this function needs to be freed |
45 | * separately, dev_free_irq() must be used. | 45 | * separately, devm_free_irq() must be used. |
46 | */ | 46 | */ |
47 | int devm_request_threaded_irq(struct device *dev, unsigned int irq, | 47 | int devm_request_threaded_irq(struct device *dev, unsigned int irq, |
48 | irq_handler_t handler, irq_handler_t thread_fn, | 48 | irq_handler_t handler, irq_handler_t thread_fn, |
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); | |||
81 | * Except for the extra @dev argument, this function takes the | 81 | * Except for the extra @dev argument, this function takes the |
82 | * same arguments and performs the same function as free_irq(). | 82 | * same arguments and performs the same function as free_irq(). |
83 | * This function instead of free_irq() should be used to manually | 83 | * This function instead of free_irq() should be used to manually |
84 | * free IRQs allocated with dev_request_irq(). | 84 | * free IRQs allocated with devm_request_irq(). |
85 | */ | 85 | */ |
86 | void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) | 86 | void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) |
87 | { | 87 | { |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
20 | #include <linux/rculist.h> | 20 | #include <linux/rculist.h> |
21 | #include <linux/hash.h> | 21 | #include <linux/hash.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/radix-tree.h> |
23 | #include <trace/events/irq.h> | 23 | #include <trace/events/irq.h> |
24 | 24 | ||
25 | #include "internals.h" | 25 | #include "internals.h" |
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | |||
87 | { | 87 | { |
88 | void *ptr; | 88 | void *ptr; |
89 | 89 | ||
90 | if (slab_is_available()) | 90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), |
91 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | 91 | GFP_ATOMIC, node); |
92 | GFP_ATOMIC, node); | ||
93 | else | ||
94 | ptr = alloc_bootmem_node(NODE_DATA(node), | ||
95 | nr * sizeof(*desc->kstat_irqs)); | ||
96 | 92 | ||
97 | /* | 93 | /* |
98 | * don't overwrite if we cannot get a new one | 94 | * don't overwrite if we cannot get a new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | |||
132 | */ | 128 | */ |
133 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | 129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); |
134 | 130 | ||
135 | struct irq_desc **irq_desc_ptrs __read_mostly; | 131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); |
132 | |||
133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
134 | { | ||
135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
136 | } | ||
137 | |||
138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
139 | { | ||
140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
141 | } | ||
142 | |||
143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
144 | { | ||
145 | void **ptr; | ||
146 | |||
147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
148 | if (ptr) | ||
149 | radix_tree_replace_slot(ptr, desc); | ||
150 | } | ||
136 | 151 | ||
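Replacing the flat irq_desc_ptrs[] array with a radix tree means memory is spent only on IRQ numbers that actually have descriptors, which pays off once nr_irqs is large and sparsely populated. The whole API surface the patch needs is small (kernel-context sketch; tree, desc and new_desc are placeholder names):

	static RADIX_TREE(tree, GFP_ATOMIC);		/* empty, ready to use */

	radix_tree_insert(&tree, irq, desc);		/* set_irq_desc()     */
	desc = radix_tree_lookup(&tree, irq);		/* irq_to_desc()      */
	slot = radix_tree_lookup_slot(&tree, irq);	/* replace_irq_desc() */
	if (slot)
		radix_tree_replace_slot(slot, new_desc);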
137 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | 152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { |
138 | [0 ... NR_IRQS_LEGACY-1] = { | 153 | [0 ... NR_IRQS_LEGACY-1] = { |
@@ -164,9 +179,6 @@ int __init early_irq_init(void) | |||
164 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | 179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); |
165 | node = first_online_node; | 180 | node = first_online_node; |
166 | 181 | ||
167 | /* allocate irq_desc_ptrs array based on nr_irqs */ | ||
168 | irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); | ||
169 | |||
170 | /* allocate based on nr_cpu_ids */ | 182 | /* allocate based on nr_cpu_ids */ |
171 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | 183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * |
172 | sizeof(int), GFP_NOWAIT, node); | 184 | sizeof(int), GFP_NOWAIT, node); |
@@ -180,23 +192,12 @@ int __init early_irq_init(void) | |||
180 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
181 | alloc_desc_masks(&desc[i], node, true); | 193 | alloc_desc_masks(&desc[i], node, true); |
182 | init_desc_masks(&desc[i]); | 194 | init_desc_masks(&desc[i]); |
183 | irq_desc_ptrs[i] = desc + i; | 195 | set_irq_desc(i, &desc[i]); |
184 | } | 196 | } |
185 | 197 | ||
186 | for (i = legacy_count; i < nr_irqs; i++) | ||
187 | irq_desc_ptrs[i] = NULL; | ||
188 | |||
189 | return arch_early_irq_init(); | 198 | return arch_early_irq_init(); |
190 | } | 199 | } |
191 | 200 | ||
192 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
193 | { | ||
194 | if (irq_desc_ptrs && irq < nr_irqs) | ||
195 | return irq_desc_ptrs[irq]; | ||
196 | |||
197 | return NULL; | ||
198 | } | ||
199 | |||
200 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | 201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) |
201 | { | 202 | { |
202 | struct irq_desc *desc; | 203 | struct irq_desc *desc; |
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
208 | return NULL; | 209 | return NULL; |
209 | } | 210 | } |
210 | 211 | ||
211 | desc = irq_desc_ptrs[irq]; | 212 | desc = irq_to_desc(irq); |
212 | if (desc) | 213 | if (desc) |
213 | return desc; | 214 | return desc; |
214 | 215 | ||
215 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
216 | 217 | ||
217 | /* We have to check it to avoid races with another CPU */ | 218 | /* We have to check it to avoid races with another CPU */ |
218 | desc = irq_desc_ptrs[irq]; | 219 | desc = irq_to_desc(irq); |
219 | if (desc) | 220 | if (desc) |
220 | goto out_unlock; | 221 | goto out_unlock; |
221 | 222 | ||
222 | if (slab_is_available()) | 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
224 | else | ||
225 | desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); | ||
226 | 224 | ||
227 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | 225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); |
228 | if (!desc) { | 226 | if (!desc) { |
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
231 | } | 229 | } |
232 | init_one_irq_desc(irq, desc, node); | 230 | init_one_irq_desc(irq, desc, node); |
233 | 231 | ||
234 | irq_desc_ptrs[irq] = desc; | 232 | set_irq_desc(irq, desc); |
235 | 233 | ||
236 | out_unlock: | 234 | out_unlock: |
237 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); | |||
21 | extern raw_spinlock_t sparse_irq_lock; | 21 | extern raw_spinlock_t sparse_irq_lock; |
22 | 22 | ||
23 | #ifdef CONFIG_SPARSE_IRQ | 23 | #ifdef CONFIG_SPARSE_IRQ |
24 | /* irq_desc_ptrs allocated at boot time */ | 24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); |
25 | extern struct irq_desc **irq_desc_ptrs; | ||
26 | #else | ||
27 | /* irq_desc_ptrs is a fixed size array */ | ||
28 | extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; | ||
29 | #endif | 25 | #endif |
30 | 26 | ||
31 | #ifdef CONFIG_PROC_FS | 27 | #ifdef CONFIG_PROC_FS |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078ca60c7..704e488730a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
382 | { | 382 | { |
383 | struct irq_desc *desc = irq_to_desc(irq); | 383 | struct irq_desc *desc = irq_to_desc(irq); |
384 | struct irqaction *action; | 384 | struct irqaction *action; |
385 | unsigned long flags; | ||
385 | 386 | ||
386 | if (!desc) | 387 | if (!desc) |
387 | return 0; | 388 | return 0; |
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
389 | if (desc->status & IRQ_NOREQUEST) | 390 | if (desc->status & IRQ_NOREQUEST) |
390 | return 0; | 391 | return 0; |
391 | 392 | ||
393 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
392 | action = desc->action; | 394 | action = desc->action; |
393 | if (action) | 395 | if (action) |
394 | if (irqflags & action->flags & IRQF_SHARED) | 396 | if (irqflags & action->flags & IRQF_SHARED) |
395 | action = NULL; | 397 | action = NULL; |
396 | 398 | ||
399 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
400 | |||
397 | return !action; | 401 | return !action; |
398 | } | 402 | } |
399 | 403 | ||
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
483 | */ | 487 | */ |
484 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 488 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) |
485 | { | 489 | { |
490 | again: | ||
486 | chip_bus_lock(irq, desc); | 491 | chip_bus_lock(irq, desc); |
487 | raw_spin_lock_irq(&desc->lock); | 492 | raw_spin_lock_irq(&desc->lock); |
493 | |||
494 | /* | ||
495 | * Implausible though it may be, we need to protect ourselves | ||
496 | * against the following scenario: | ||
497 | * | ||
498 | * The thread can finish before the hard interrupt handler on the | ||
499 | * other CPU does. If we unmask the irq line now, the interrupt | ||
500 | * can come in again, mask the line, and then bail out because of | ||
501 | * IRQ_INPROGRESS, leaving the line masked forever. | ||
502 | */ | ||
503 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | ||
504 | raw_spin_unlock_irq(&desc->lock); | ||
505 | chip_bus_sync_unlock(irq, desc); | ||
506 | cpu_relax(); | ||
507 | goto again; | ||
508 | } | ||
509 | |||
488 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 510 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { |
489 | desc->status &= ~IRQ_MASKED; | 511 | desc->status &= ~IRQ_MASKED; |
490 | desc->chip->unmask(irq); | 512 | desc->chip->unmask(irq); |
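The loop added to irq_finalize_oneshot() is the classic drop-the-locks-and-retry pattern: instead of unmasking a line whose hard handler is still running on another CPU, back all the way out, let that CPU finish, and start over. Schematically (lock and predicate names are placeholders):

	again:
		lock();
		if (handler_still_running()) {	/* IRQ_INPROGRESS in the patch */
			unlock();
			cpu_relax();		/* let the other CPU progress */
			goto again;
		}
		/* ... now safe to clear IRQ_MASKED and unmask ... */
		unlock();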
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
735 | if (new->flags & IRQF_ONESHOT) | 757 | if (new->flags & IRQF_ONESHOT) |
736 | desc->status |= IRQ_ONESHOT; | 758 | desc->status |= IRQ_ONESHOT; |
737 | 759 | ||
760 | /* | ||
761 | * Force MSI interrupts to run with interrupts | ||
762 | * disabled. Multi-vector cards can cause stack | ||
763 | * overflows due to nested interrupts when enough of | ||
764 | * them are directed to a core and fire at the same | ||
765 | * time. | ||
766 | */ | ||
767 | if (desc->msi_desc) | ||
768 | new->flags |= IRQF_DISABLED; | ||
769 | |||
738 | if (!(desc->status & IRQ_NOAUTOEN)) { | 770 | if (!(desc->status & IRQ_NOAUTOEN)) { |
739 | desc->depth = 0; | 771 | desc->depth = 0; |
740 | desc->status &= ~IRQ_DISABLED; | 772 | desc->status &= ~IRQ_DISABLED; |
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..65d3845665ac 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c | |||
@@ -6,6 +6,7 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/irq.h> | 8 | #include <linux/irq.h> |
9 | #include <linux/slab.h> | ||
9 | #include <linux/module.h> | 10 | #include <linux/module.h> |
10 | #include <linux/random.h> | 11 | #include <linux/random.h> |
11 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 71 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
71 | 72 | ||
72 | /* We have to check it to avoid races with another CPU */ | 73 | /* We have to check it to avoid races with another CPU */ |
73 | desc = irq_desc_ptrs[irq]; | 74 | desc = irq_to_desc(irq); |
74 | 75 | ||
75 | if (desc && old_desc != desc) | 76 | if (desc && old_desc != desc) |
76 | goto out_unlock; | 77 | goto out_unlock; |
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
90 | goto out_unlock; | 91 | goto out_unlock; |
91 | } | 92 | } |
92 | 93 | ||
93 | irq_desc_ptrs[irq] = desc; | 94 | replace_irq_desc(irq, desc); |
94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 95 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
95 | 96 | ||
96 | /* free the old one */ | 97 | /* free the old one */ |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50eccc79c0..7a6eb04ef6b5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -7,6 +7,7 @@ | |||
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/gfp.h> | ||
10 | #include <linux/proc_fs.h> | 11 | #include <linux/proc_fs.h> |
11 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
12 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8e5288a8a355..13aff293f4de 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/sched.h> /* for cond_resched */ | 21 | #include <linux/sched.h> /* for cond_resched */ |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/ctype.h> | 23 | #include <linux/ctype.h> |
24 | #include <linux/slab.h> | ||
24 | 25 | ||
25 | #include <asm/sections.h> | 26 | #include <asm/sections.h> |
26 | 27 | ||
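These one-line #include additions (slab.h in numa_migrate.c and kallsyms.c, gfp.h in irq/proc.c) are fallout from the slab.h dependency cleanup landing around 2.6.34: files must include the headers they actually use instead of inheriting them transitively through percpu.h. The rule, in miniature:

	#include <linux/slab.h>	/* kmalloc()/kfree(): no longer pulled in indirectly */
	#include <linux/gfp.h>	/* GFP_KERNEL and friends */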
diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb73155..87ebe8adc474 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -41,7 +41,7 @@ | |||
41 | #include <asm/sections.h> | 41 | #include <asm/sections.h> |
42 | 42 | ||
43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 43 | /* Per cpu memory for storing cpu states in case of system crash. */ |
44 | note_buf_t* crash_notes; | 44 | note_buf_t __percpu *crash_notes; |
45 | 45 | ||
46 | /* vmcoreinfo stuff */ | 46 | /* vmcoreinfo stuff */ |
47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; |
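__percpu is a sparse annotation: crash_notes does not hold a normally dereferenceable address but a per-cpu offset, and annotating it lets sparse flag any direct dereference. A minimal usage sketch; only the declaration appears in the hunk, so the allocation and accessor below are illustrative assumptions:

	#include <linux/percpu.h>

	note_buf_t __percpu *crash_notes;

	static int crash_notes_init(void)
	{
		crash_notes = alloc_percpu(note_buf_t);
		return crash_notes ? 0 : -ENOMEM;
	}

	static note_buf_t *crash_notes_for_cpu(int cpu)
	{
		/* per_cpu_ptr() turns the per-cpu offset into a real pointer. */
		return per_cpu_ptr(crash_notes, cpu);
	}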
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 32c5c15d750d..35edbe22e9a9 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) | |||
80 | 80 | ||
81 | buffer = kmalloc(size, gfp_mask); | 81 | buffer = kmalloc(size, gfp_mask); |
82 | if (!buffer) { | 82 | if (!buffer) { |
83 | _kfifo_init(fifo, 0, 0); | 83 | _kfifo_init(fifo, NULL, 0); |
84 | return -ENOMEM; | 84 | return -ENOMEM; |
85 | } | 85 | } |
86 | 86 | ||
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); | |||
97 | void kfifo_free(struct kfifo *fifo) | 97 | void kfifo_free(struct kfifo *fifo) |
98 | { | 98 | { |
99 | kfree(fifo->buffer); | 99 | kfree(fifo->buffer); |
100 | _kfifo_init(fifo, NULL, 0); | ||
100 | } | 101 | } |
101 | EXPORT_SYMBOL(kfifo_free); | 102 | EXPORT_SYMBOL(kfifo_free); |
102 | 103 | ||
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n); | |||
349 | * @fifo: the fifo to be used. | 350 | * @fifo: the fifo to be used. |
350 | * @from: pointer to the data to be added. | 351 | * @from: pointer to the data to be added. |
351 | * @len: the length of the data to be added. | 352 | * @len: the length of the data to be added. |
353 | * @total: the actual returned data length. | ||
352 | * | 354 | * |
353 | * This function copies at most @len bytes from @from into the | 355 | * This function copies at most @len bytes from @from into the |
354 | * FIFO, depending on available space, and returns -EFAULT/0. | 356 | * FIFO, depending on available space, and returns -EFAULT/0. |
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n); | |||
399 | * @fifo: the fifo to be used. | 401 | * @fifo: the fifo to be used. |
400 | * @to: where the data must be copied. | 402 | * @to: where the data must be copied. |
401 | * @len: the size of the destination buffer. | 403 | * @len: the size of the destination buffer. |
402 | @ @lenout: pointer to output variable with copied data | 404 | * @lenout: pointer to output variable with copied data |
403 | * | 405 | * |
404 | * This function copies at most @len bytes from the FIFO into the | 406 | * This function copies at most @len bytes from the FIFO into the |
405 | * @to buffer and returns 0 or -EFAULT. | 407 | * @to buffer and returns 0 or -EFAULT. |
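Two small hardening changes in kfifo.c: a failed kfifo_alloc() and kfifo_free() now both leave the fifo re-initialized with a NULL buffer, so stale-pointer reuse or a double free becomes benign rather than a crash. A usage sketch against the 2.6.33/34 kfifo API (module boilerplate omitted; kfifo_in() is assumed from that API, not from this hunk):

	#include <linux/kfifo.h>

	static struct kfifo fifo;

	static int fifo_example(void)
	{
		int ret = kfifo_alloc(&fifo, PAGE_SIZE, GFP_KERNEL);
		if (ret)
			return ret;		/* fifo.buffer is NULL, not garbage */

		kfifo_in(&fifo, "abc", 3);
		kfifo_free(&fifo);		/* buffer freed, fifo zeroed again */
		return 0;
	}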
diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..11f3515ca83f 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c | |||
@@ -69,9 +69,16 @@ struct kgdb_state { | |||
69 | struct pt_regs *linux_regs; | 69 | struct pt_regs *linux_regs; |
70 | }; | 70 | }; |
71 | 71 | ||
72 | /* Exception state values */ | ||
73 | #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ | ||
74 | #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ | ||
75 | #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ | ||
76 | #define DCPU_SSTEP 0x8 /* CPU is single stepping */ | ||
77 | |||
72 | static struct debuggerinfo_struct { | 78 | static struct debuggerinfo_struct { |
73 | void *debuggerinfo; | 79 | void *debuggerinfo; |
74 | struct task_struct *task; | 80 | struct task_struct *task; |
81 | int exception_state; | ||
75 | } kgdb_info[NR_CPUS]; | 82 | } kgdb_info[NR_CPUS]; |
76 | 83 | ||
77 | /** | 84 | /** |
@@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count) | |||
391 | 398 | ||
392 | /* | 399 | /* |
393 | * Copy the binary array pointed to by buf into mem. Fix $, #, and | 400 | * Copy the binary array pointed to by buf into mem. Fix $, #, and |
394 | * 0x7d escaped with 0x7d. Return a pointer to the character after | 401 | * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. |
395 | * the last byte written. | 402 | * The input buf is overwritten with the result to write to mem. |
396 | */ | 403 | */ |
397 | static int kgdb_ebin2mem(char *buf, char *mem, int count) | 404 | static int kgdb_ebin2mem(char *buf, char *mem, int count) |
398 | { | 405 | { |
399 | int err = 0; | 406 | int size = 0; |
400 | char c; | 407 | char *c = buf; |
401 | 408 | ||
402 | while (count-- > 0) { | 409 | while (count-- > 0) { |
403 | c = *buf++; | 410 | c[size] = *buf++; |
404 | if (c == 0x7d) | 411 | if (c[size] == 0x7d) |
405 | c = *buf++ ^ 0x20; | 412 | c[size] = *buf++ ^ 0x20; |
406 | 413 | size++; | |
407 | err = probe_kernel_write(mem, &c, 1); | ||
408 | if (err) | ||
409 | break; | ||
410 | |||
411 | mem++; | ||
412 | } | 414 | } |
413 | 415 | ||
414 | return err; | 416 | return probe_kernel_write(mem, c, size); |
415 | } | 417 | } |
416 | 418 | ||
417 | /* | 419 | /* |
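The kgdb_ebin2mem() rewrite decodes the gdb binary escape sequence ($, # and 0x7d are transmitted as 0x7d followed by the byte XOR 0x20) in place and then issues one probe_kernel_write(), instead of a fault-handled write per byte. The in-place trick is safe because the read cursor always stays at or ahead of the write cursor. A standalone demonstration of the decode step:

	#include <stdio.h>

	/* Same loop shape as the new kgdb_ebin2mem(); count is the number
	 * of decoded bytes to produce. */
	static int ebin_decode(char *buf, int count)
	{
		int size = 0;
		char *c = buf;

		while (count-- > 0) {
			c[size] = *buf++;
			if (c[size] == 0x7d)
				c[size] = *buf++ ^ 0x20;
			size++;
		}
		return size;
	}

	int main(void)
	{
		char msg[] = { 'A', 0x7d, 0x5d, 'B' };	/* 0x7d 0x5d encodes 0x7d */
		int n = ebin_decode(msg, 3);

		printf("%d bytes: %02x %02x %02x\n", n, msg[0], msg[1], msg[2]);
		return 0;	/* prints: 3 bytes: 41 7d 42 */
	}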
@@ -563,46 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) | |||
563 | } | 565 | } |
564 | 566 | ||
565 | /* | 567 | /* |
566 | * CPU debug state control: | ||
567 | */ | ||
568 | |||
569 | #ifdef CONFIG_SMP | ||
570 | static void kgdb_wait(struct pt_regs *regs) | ||
571 | { | ||
572 | unsigned long flags; | ||
573 | int cpu; | ||
574 | |||
575 | local_irq_save(flags); | ||
576 | cpu = raw_smp_processor_id(); | ||
577 | kgdb_info[cpu].debuggerinfo = regs; | ||
578 | kgdb_info[cpu].task = current; | ||
579 | /* | ||
580 | * Make sure the above info reaches the primary CPU before | ||
581 | * our cpu_in_kgdb[] flag setting does: | ||
582 | */ | ||
583 | smp_wmb(); | ||
584 | atomic_set(&cpu_in_kgdb[cpu], 1); | ||
585 | |||
586 | /* Wait till primary CPU is done with debugging */ | ||
587 | while (atomic_read(&passive_cpu_wait[cpu])) | ||
588 | cpu_relax(); | ||
589 | |||
590 | kgdb_info[cpu].debuggerinfo = NULL; | ||
591 | kgdb_info[cpu].task = NULL; | ||
592 | |||
593 | /* fix up hardware debug registers on local cpu */ | ||
594 | if (arch_kgdb_ops.correct_hw_break) | ||
595 | arch_kgdb_ops.correct_hw_break(); | ||
596 | |||
597 | /* Signal the primary CPU that we are done: */ | ||
598 | atomic_set(&cpu_in_kgdb[cpu], 0); | ||
599 | touch_softlockup_watchdog(); | ||
600 | clocksource_touch_watchdog(); | ||
601 | local_irq_restore(flags); | ||
602 | } | ||
603 | #endif | ||
604 | |||
605 | /* | ||
606 | * Some architectures need cache flushes when we set/clear a | 568 | * Some architectures need cache flushes when we set/clear a |
607 | * breakpoint: | 569 | * breakpoint: |
608 | */ | 570 | */ |
@@ -1397,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
1397 | return 1; | 1359 | return 1; |
1398 | } | 1360 | } |
1399 | 1361 | ||
1400 | /* | 1362 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) |
1401 | * kgdb_handle_exception() - main entry point from a kernel exception | ||
1402 | * | ||
1403 | * Locking hierarchy: | ||
1404 | * interface locks, if any (begin_session) | ||
1405 | * kgdb lock (kgdb_active) | ||
1406 | */ | ||
1407 | int | ||
1408 | kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | ||
1409 | { | 1363 | { |
1410 | struct kgdb_state kgdb_var; | ||
1411 | struct kgdb_state *ks = &kgdb_var; | ||
1412 | unsigned long flags; | 1364 | unsigned long flags; |
1413 | int sstep_tries = 100; | 1365 | int sstep_tries = 100; |
1414 | int error = 0; | 1366 | int error = 0; |
1415 | int i, cpu; | 1367 | int i, cpu; |
1416 | 1368 | int trace_on = 0; | |
1417 | ks->cpu = raw_smp_processor_id(); | ||
1418 | ks->ex_vector = evector; | ||
1419 | ks->signo = signo; | ||
1420 | ks->ex_vector = evector; | ||
1421 | ks->err_code = ecode; | ||
1422 | ks->kgdb_usethreadid = 0; | ||
1423 | ks->linux_regs = regs; | ||
1424 | |||
1425 | if (kgdb_reenter_check(ks)) | ||
1426 | return 0; /* Ouch, double exception ! */ | ||
1427 | |||
1428 | acquirelock: | 1369 | acquirelock: |
1429 | /* | 1370 | /* |
1430 | * Interrupts will be restored by the 'trap return' code, except when | 1371 | * Interrupts will be restored by the 'trap return' code, except when |
@@ -1432,13 +1373,43 @@ acquirelock: | |||
1432 | */ | 1373 | */ |
1433 | local_irq_save(flags); | 1374 | local_irq_save(flags); |
1434 | 1375 | ||
1435 | cpu = raw_smp_processor_id(); | 1376 | cpu = ks->cpu; |
1377 | kgdb_info[cpu].debuggerinfo = regs; | ||
1378 | kgdb_info[cpu].task = current; | ||
1379 | /* | ||
1380 | * Make sure the above info reaches the primary CPU before | ||
1381 | * our cpu_in_kgdb[] flag setting does: | ||
1382 | */ | ||
1383 | atomic_inc(&cpu_in_kgdb[cpu]); | ||
1436 | 1384 | ||
1437 | /* | 1385 | /* |
1438 | * Acquire the kgdb_active lock: | 1386 | * CPU will loop if it is a slave or request to become a kgdb |
1387 | * master cpu and acquire the kgdb_active lock: | ||
1439 | */ | 1388 | */ |
1440 | while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) | 1389 | while (1) { |
1390 | if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { | ||
1391 | if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) | ||
1392 | break; | ||
1393 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { | ||
1394 | if (!atomic_read(&passive_cpu_wait[cpu])) | ||
1395 | goto return_normal; | ||
1396 | } else { | ||
1397 | return_normal: | ||
1398 | /* Return to normal operation by executing any | ||
1399 | * hw breakpoint fixup. | ||
1400 | */ | ||
1401 | if (arch_kgdb_ops.correct_hw_break) | ||
1402 | arch_kgdb_ops.correct_hw_break(); | ||
1403 | if (trace_on) | ||
1404 | tracing_on(); | ||
1405 | atomic_dec(&cpu_in_kgdb[cpu]); | ||
1406 | touch_softlockup_watchdog_sync(); | ||
1407 | clocksource_touch_watchdog(); | ||
1408 | local_irq_restore(flags); | ||
1409 | return 0; | ||
1410 | } | ||
1441 | cpu_relax(); | 1411 | cpu_relax(); |
1412 | } | ||
1442 | 1413 | ||
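One subtlety in the loop above is worth spelling out: atomic_cmpxchg() returns the old value, so on the iteration where a would-be master actually wins the race it sees -1, not its own cpu number, and only breaks out on the following pass, when the compare fails against kgdb_active == cpu. A userspace model of that behaviour, with a GCC __sync builtin standing in for atomic_cmpxchg():

	#include <stdio.h>

	static int kgdb_active = -1;

	int main(void)
	{
		int cpu = 3, spins = 0;

		while (1) {
			spins++;
			/* returns the OLD value, like atomic_cmpxchg() */
			if (__sync_val_compare_and_swap(&kgdb_active, -1, cpu) == cpu)
				break;
		}
		printf("became master after %d iterations\n", spins);	/* prints 2 */
		return 0;
	}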
1443 | /* | 1414 | /* |
1444 | * For single stepping, try to only enter on the processor | 1415 | * For single stepping, try to only enter on the processor |
@@ -1450,7 +1421,7 @@ acquirelock: | |||
1450 | (kgdb_info[cpu].task && | 1421 | (kgdb_info[cpu].task && |
1451 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { | 1422 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { |
1452 | atomic_set(&kgdb_active, -1); | 1423 | atomic_set(&kgdb_active, -1); |
1453 | touch_softlockup_watchdog(); | 1424 | touch_softlockup_watchdog_sync(); |
1454 | clocksource_touch_watchdog(); | 1425 | clocksource_touch_watchdog(); |
1455 | local_irq_restore(flags); | 1426 | local_irq_restore(flags); |
1456 | 1427 | ||
@@ -1472,9 +1443,6 @@ acquirelock: | |||
1472 | if (kgdb_io_ops->pre_exception) | 1443 | if (kgdb_io_ops->pre_exception) |
1473 | kgdb_io_ops->pre_exception(); | 1444 | kgdb_io_ops->pre_exception(); |
1474 | 1445 | ||
1475 | kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; | ||
1476 | kgdb_info[ks->cpu].task = current; | ||
1477 | |||
1478 | kgdb_disable_hw_debug(ks->linux_regs); | 1446 | kgdb_disable_hw_debug(ks->linux_regs); |
1479 | 1447 | ||
1480 | /* | 1448 | /* |
@@ -1483,15 +1451,9 @@ acquirelock: | |||
1483 | */ | 1451 | */ |
1484 | if (!kgdb_single_step) { | 1452 | if (!kgdb_single_step) { |
1485 | for (i = 0; i < NR_CPUS; i++) | 1453 | for (i = 0; i < NR_CPUS; i++) |
1486 | atomic_set(&passive_cpu_wait[i], 1); | 1454 | atomic_inc(&passive_cpu_wait[i]); |
1487 | } | 1455 | } |
1488 | 1456 | ||
1489 | /* | ||
1490 | * spin_lock code is good enough as a barrier so we don't | ||
1491 | * need one here: | ||
1492 | */ | ||
1493 | atomic_set(&cpu_in_kgdb[ks->cpu], 1); | ||
1494 | |||
1495 | #ifdef CONFIG_SMP | 1457 | #ifdef CONFIG_SMP |
1496 | /* Signal the other CPUs to enter kgdb_wait() */ | 1458 | /* Signal the other CPUs to enter kgdb_wait() */ |
1497 | if ((!kgdb_single_step) && kgdb_do_roundup) | 1459 | if ((!kgdb_single_step) && kgdb_do_roundup) |
@@ -1515,6 +1477,9 @@ acquirelock: | |||
1515 | kgdb_single_step = 0; | 1477 | kgdb_single_step = 0; |
1516 | kgdb_contthread = current; | 1478 | kgdb_contthread = current; |
1517 | exception_level = 0; | 1479 | exception_level = 0; |
1480 | trace_on = tracing_is_on(); | ||
1481 | if (trace_on) | ||
1482 | tracing_off(); | ||
1518 | 1483 | ||
1519 | /* Talk to debugger with gdbserial protocol */ | 1484 | /* Talk to debugger with gdbserial protocol */ |
1520 | error = gdb_serial_stub(ks); | 1485 | error = gdb_serial_stub(ks); |
@@ -1523,13 +1488,11 @@ acquirelock: | |||
1523 | if (kgdb_io_ops->post_exception) | 1488 | if (kgdb_io_ops->post_exception) |
1524 | kgdb_io_ops->post_exception(); | 1489 | kgdb_io_ops->post_exception(); |
1525 | 1490 | ||
1526 | kgdb_info[ks->cpu].debuggerinfo = NULL; | 1491 | atomic_dec(&cpu_in_kgdb[ks->cpu]); |
1527 | kgdb_info[ks->cpu].task = NULL; | ||
1528 | atomic_set(&cpu_in_kgdb[ks->cpu], 0); | ||
1529 | 1492 | ||
1530 | if (!kgdb_single_step) { | 1493 | if (!kgdb_single_step) { |
1531 | for (i = NR_CPUS-1; i >= 0; i--) | 1494 | for (i = NR_CPUS-1; i >= 0; i--) |
1532 | atomic_set(&passive_cpu_wait[i], 0); | 1495 | atomic_dec(&passive_cpu_wait[i]); |
1533 | /* | 1496 | /* |
1534 | * Wait till all the CPUs have quit | 1497 | * Wait till all the CPUs have quit |
1535 | * from the debugger. | 1498 | * from the debugger. |
@@ -1548,22 +1511,63 @@ kgdb_restore: | |||
1548 | else | 1511 | else |
1549 | kgdb_sstep_pid = 0; | 1512 | kgdb_sstep_pid = 0; |
1550 | } | 1513 | } |
1514 | if (trace_on) | ||
1515 | tracing_on(); | ||
1551 | /* Free kgdb_active */ | 1516 | /* Free kgdb_active */ |
1552 | atomic_set(&kgdb_active, -1); | 1517 | atomic_set(&kgdb_active, -1); |
1553 | touch_softlockup_watchdog(); | 1518 | touch_softlockup_watchdog_sync(); |
1554 | clocksource_touch_watchdog(); | 1519 | clocksource_touch_watchdog(); |
1555 | local_irq_restore(flags); | 1520 | local_irq_restore(flags); |
1556 | 1521 | ||
1557 | return error; | 1522 | return error; |
1558 | } | 1523 | } |
1559 | 1524 | ||
1525 | /* | ||
1526 | * kgdb_handle_exception() - main entry point from a kernel exception | ||
1527 | * | ||
1528 | * Locking hierarchy: | ||
1529 | * interface locks, if any (begin_session) | ||
1530 | * kgdb lock (kgdb_active) | ||
1531 | */ | ||
1532 | int | ||
1533 | kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | ||
1534 | { | ||
1535 | struct kgdb_state kgdb_var; | ||
1536 | struct kgdb_state *ks = &kgdb_var; | ||
1537 | int ret; | ||
1538 | |||
1539 | ks->cpu = raw_smp_processor_id(); | ||
1540 | ks->ex_vector = evector; | ||
1541 | ks->signo = signo; | ||
1542 | ks->ex_vector = evector; | ||
1543 | ks->err_code = ecode; | ||
1544 | ks->kgdb_usethreadid = 0; | ||
1545 | ks->linux_regs = regs; | ||
1546 | |||
1547 | if (kgdb_reenter_check(ks)) | ||
1548 | return 0; /* Ouch, double exception ! */ | ||
1549 | kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; | ||
1550 | ret = kgdb_cpu_enter(ks, regs); | ||
1551 | kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; | ||
1552 | return ret; | ||
1553 | } | ||
1554 | |||
1560 | int kgdb_nmicallback(int cpu, void *regs) | 1555 | int kgdb_nmicallback(int cpu, void *regs) |
1561 | { | 1556 | { |
1562 | #ifdef CONFIG_SMP | 1557 | #ifdef CONFIG_SMP |
1558 | struct kgdb_state kgdb_var; | ||
1559 | struct kgdb_state *ks = &kgdb_var; | ||
1560 | |||
1561 | memset(ks, 0, sizeof(struct kgdb_state)); | ||
1562 | ks->cpu = cpu; | ||
1563 | ks->linux_regs = regs; | ||
1564 | |||
1563 | if (!atomic_read(&cpu_in_kgdb[cpu]) && | 1565 | if (!atomic_read(&cpu_in_kgdb[cpu]) && |
1564 | atomic_read(&kgdb_active) != cpu && | 1566 | atomic_read(&kgdb_active) != -1 && |
1565 | atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { | 1567 | atomic_read(&kgdb_active) != cpu) { |
1566 | kgdb_wait((struct pt_regs *)regs); | 1568 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; |
1569 | kgdb_cpu_enter(ks, regs); | ||
1570 | kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; | ||
1567 | return 0; | 1571 | return 0; |
1568 | } | 1572 | } |
1569 | #endif | 1573 | #endif |
@@ -1739,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | |||
1739 | */ | 1743 | */ |
1740 | void kgdb_breakpoint(void) | 1744 | void kgdb_breakpoint(void) |
1741 | { | 1745 | { |
1742 | atomic_set(&kgdb_setting_breakpoint, 1); | 1746 | atomic_inc(&kgdb_setting_breakpoint); |
1743 | wmb(); /* Sync point before breakpoint */ | 1747 | wmb(); /* Sync point before breakpoint */ |
1744 | arch_kgdb_breakpoint(); | 1748 | arch_kgdb_breakpoint(); |
1745 | wmb(); /* Sync point after breakpoint */ | 1749 | wmb(); /* Sync point after breakpoint */ |
1746 | atomic_set(&kgdb_setting_breakpoint, 0); | 1750 | atomic_dec(&kgdb_setting_breakpoint); |
1747 | } | 1751 | } |
1748 | EXPORT_SYMBOL_GPL(kgdb_breakpoint); | 1752 | EXPORT_SYMBOL_GPL(kgdb_breakpoint); |
1749 | 1753 | ||
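Switching kgdb_setting_breakpoint (and cpu_in_kgdb/passive_cpu_wait above) from atomic_set(0/1) to atomic_inc()/atomic_dec() turns a boolean into a counter, so nested or concurrent users no longer clear each other's state: the flag only drops to zero when the last participant leaves. In miniature, with C11 atomics standing in for the kernel's atomic_t:

	#include <stdatomic.h>
	#include <assert.h>

	int main(void)
	{
		atomic_int flag = 0;

		atomic_fetch_add(&flag, 1);		/* outer caller enters  */
		atomic_fetch_add(&flag, 1);		/* nested caller enters */
		atomic_fetch_sub(&flag, 1);		/* nested caller leaves */
		assert(atomic_load(&flag) != 0);	/* still marked busy    */
		atomic_fetch_sub(&flag, 1);		/* outer caller leaves  */
		assert(atomic_load(&flag) == 0);
		return 0;
	}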
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716d..0ed46f3e51e9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -42,9 +42,11 @@ | |||
42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/debugfs.h> | 44 | #include <linux/debugfs.h> |
45 | #include <linux/sysctl.h> | ||
45 | #include <linux/kdebug.h> | 46 | #include <linux/kdebug.h> |
46 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
47 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
49 | #include <linux/cpu.h> | ||
48 | 50 | ||
49 | #include <asm-generic/sections.h> | 51 | #include <asm-generic/sections.h> |
50 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
105 | * stepping on the instruction on a vmalloced/kmalloced/data page | 107 | * stepping on the instruction on a vmalloced/kmalloced/data page |
106 | * is a recipe for disaster | 108 | * is a recipe for disaster |
107 | */ | 109 | */ |
108 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | ||
109 | |||
110 | struct kprobe_insn_page { | 110 | struct kprobe_insn_page { |
111 | struct list_head list; | 111 | struct list_head list; |
112 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 112 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
113 | char slot_used[INSNS_PER_PAGE]; | ||
114 | int nused; | 113 | int nused; |
115 | int ngarbage; | 114 | int ngarbage; |
115 | char slot_used[]; | ||
116 | }; | ||
117 | |||
118 | #define KPROBE_INSN_PAGE_SIZE(slots) \ | ||
119 | (offsetof(struct kprobe_insn_page, slot_used) + \ | ||
120 | (sizeof(char) * (slots))) | ||
121 | |||
122 | struct kprobe_insn_cache { | ||
123 | struct list_head pages; /* list of kprobe_insn_page */ | ||
124 | size_t insn_size; /* size of instruction slot */ | ||
125 | int nr_garbage; | ||
116 | }; | 126 | }; |
117 | 127 | ||
128 | static int slots_per_page(struct kprobe_insn_cache *c) | ||
129 | { | ||
130 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | ||
131 | } | ||
132 | |||
118 | enum kprobe_slot_state { | 133 | enum kprobe_slot_state { |
119 | SLOT_CLEAN = 0, | 134 | SLOT_CLEAN = 0, |
120 | SLOT_DIRTY = 1, | 135 | SLOT_DIRTY = 1, |
121 | SLOT_USED = 2, | 136 | SLOT_USED = 2, |
122 | }; | 137 | }; |
123 | 138 | ||
124 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ | 139 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ |
125 | static LIST_HEAD(kprobe_insn_pages); | 140 | static struct kprobe_insn_cache kprobe_insn_slots = { |
126 | static int kprobe_garbage_slots; | 141 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
127 | static int collect_garbage_slots(void); | 142 | .insn_size = MAX_INSN_SIZE, |
143 | .nr_garbage = 0, | ||
144 | }; | ||
145 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | ||
128 | 146 | ||
129 | /** | 147 | /** |
130 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 148 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
131 | * We allocate an executable page if there's no room on existing ones. | 149 | * We allocate an executable page if there's no room on existing ones. |
132 | */ | 150 | */ |
133 | static kprobe_opcode_t __kprobes *__get_insn_slot(void) | 151 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
134 | { | 152 | { |
135 | struct kprobe_insn_page *kip; | 153 | struct kprobe_insn_page *kip; |
136 | 154 | ||
137 | retry: | 155 | retry: |
138 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 156 | list_for_each_entry(kip, &c->pages, list) { |
139 | if (kip->nused < INSNS_PER_PAGE) { | 157 | if (kip->nused < slots_per_page(c)) { |
140 | int i; | 158 | int i; |
141 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 159 | for (i = 0; i < slots_per_page(c); i++) { |
142 | if (kip->slot_used[i] == SLOT_CLEAN) { | 160 | if (kip->slot_used[i] == SLOT_CLEAN) { |
143 | kip->slot_used[i] = SLOT_USED; | 161 | kip->slot_used[i] = SLOT_USED; |
144 | kip->nused++; | 162 | kip->nused++; |
145 | return kip->insns + (i * MAX_INSN_SIZE); | 163 | return kip->insns + (i * c->insn_size); |
146 | } | 164 | } |
147 | } | 165 | } |
148 | /* Surprise! No unused slots. Fix kip->nused. */ | 166 | /* kip->nused is broken. Fix it. */ |
149 | kip->nused = INSNS_PER_PAGE; | 167 | kip->nused = slots_per_page(c); |
168 | WARN_ON(1); | ||
150 | } | 169 | } |
151 | } | 170 | } |
152 | 171 | ||
153 | /* If there are any garbage slots, collect them and try again. */ | 172 | /* If there are any garbage slots, collect them and try again. */ |
154 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | 173 | if (c->nr_garbage && collect_garbage_slots(c) == 0) |
155 | goto retry; | 174 | goto retry; |
156 | } | 175 | |
157 | /* All out of space. Need to allocate a new page. Use slot 0. */ | 176 | /* All out of space. Need to allocate a new page. */ |
158 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 177 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
159 | if (!kip) | 178 | if (!kip) |
160 | return NULL; | 179 | return NULL; |
161 | 180 | ||
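The slot_used[] array becomes a flexible array member sized per cache, with KPROBE_INSN_PAGE_SIZE() computing the allocation size via offsetof(), so one cache type can serve both regular probe slots and the larger optprobe slots introduced below. A standalone demonstration of the sizing idiom (the list_head and insns fields are dropped here to keep it self-contained):

	#include <stddef.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdio.h>

	struct kprobe_insn_page {
		int nused;
		int ngarbage;
		char slot_used[];		/* flexible array member */
	};

	#define KPROBE_INSN_PAGE_SIZE(slots) \
		(offsetof(struct kprobe_insn_page, slot_used) + \
		 sizeof(char) * (slots))

	int main(void)
	{
		int slots = 64;			/* stands in for slots_per_page(c) */
		struct kprobe_insn_page *kip = malloc(KPROBE_INSN_PAGE_SIZE(slots));

		if (!kip)
			return 1;
		memset(kip->slot_used, 0, slots);	/* SLOT_CLEAN */
		printf("allocated %zu bytes for %d slots\n",
		       KPROBE_INSN_PAGE_SIZE(slots), slots);
		free(kip);
		return 0;
	}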
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) | |||
170 | return NULL; | 189 | return NULL; |
171 | } | 190 | } |
172 | INIT_LIST_HEAD(&kip->list); | 191 | INIT_LIST_HEAD(&kip->list); |
173 | list_add(&kip->list, &kprobe_insn_pages); | 192 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
174 | memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); | ||
175 | kip->slot_used[0] = SLOT_USED; | 193 | kip->slot_used[0] = SLOT_USED; |
176 | kip->nused = 1; | 194 | kip->nused = 1; |
177 | kip->ngarbage = 0; | 195 | kip->ngarbage = 0; |
196 | list_add(&kip->list, &c->pages); | ||
178 | return kip->insns; | 197 | return kip->insns; |
179 | } | 198 | } |
180 | 199 | ||
200 | |||
181 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 201 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
182 | { | 202 | { |
183 | kprobe_opcode_t *ret; | 203 | kprobe_opcode_t *ret = NULL; |
204 | |||
184 | mutex_lock(&kprobe_insn_mutex); | 205 | mutex_lock(&kprobe_insn_mutex); |
185 | ret = __get_insn_slot(); | 206 | ret = __get_insn_slot(&kprobe_insn_slots); |
186 | mutex_unlock(&kprobe_insn_mutex); | 207 | mutex_unlock(&kprobe_insn_mutex); |
208 | |||
187 | return ret; | 209 | return ret; |
188 | } | 210 | } |
189 | 211 | ||
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
199 | * so as not to have to set it up again the | 221 | * so as not to have to set it up again the |
200 | * next time somebody inserts a probe. | 222 | * next time somebody inserts a probe. |
201 | */ | 223 | */ |
202 | if (!list_is_singular(&kprobe_insn_pages)) { | 224 | if (!list_is_singular(&kip->list)) { |
203 | list_del(&kip->list); | 225 | list_del(&kip->list); |
204 | module_free(NULL, kip->insns); | 226 | module_free(NULL, kip->insns); |
205 | kfree(kip); | 227 | kfree(kip); |
@@ -209,51 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
209 | return 0; | 231 | return 0; |
210 | } | 232 | } |
211 | 233 | ||
212 | static int __kprobes collect_garbage_slots(void) | 234 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) |
213 | { | 235 | { |
214 | struct kprobe_insn_page *kip, *next; | 236 | struct kprobe_insn_page *kip, *next; |
215 | 237 | ||
216 | /* Ensure no-one is interrupted on the garbages */ | 238 | /* Ensure no-one is interrupted on the garbages */ |
217 | synchronize_sched(); | 239 | synchronize_sched(); |
218 | 240 | ||
219 | list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { | 241 | list_for_each_entry_safe(kip, next, &c->pages, list) { |
220 | int i; | 242 | int i; |
221 | if (kip->ngarbage == 0) | 243 | if (kip->ngarbage == 0) |
222 | continue; | 244 | continue; |
223 | kip->ngarbage = 0; /* we will collect all garbage slots */ | 245 | kip->ngarbage = 0; /* we will collect all garbage slots */ |
224 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 246 | for (i = 0; i < slots_per_page(c); i++) { |
225 | if (kip->slot_used[i] == SLOT_DIRTY && | 247 | if (kip->slot_used[i] == SLOT_DIRTY && |
226 | collect_one_slot(kip, i)) | 248 | collect_one_slot(kip, i)) |
227 | break; | 249 | break; |
228 | } | 250 | } |
229 | } | 251 | } |
230 | kprobe_garbage_slots = 0; | 252 | c->nr_garbage = 0; |
231 | return 0; | 253 | return 0; |
232 | } | 254 | } |
233 | 255 | ||
234 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | 256 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
257 | kprobe_opcode_t *slot, int dirty) | ||
235 | { | 258 | { |
236 | struct kprobe_insn_page *kip; | 259 | struct kprobe_insn_page *kip; |
237 | 260 | ||
238 | mutex_lock(&kprobe_insn_mutex); | 261 | list_for_each_entry(kip, &c->pages, list) { |
239 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 262 | long idx = ((long)slot - (long)kip->insns) / |
240 | if (kip->insns <= slot && | 263 | (c->insn_size * sizeof(kprobe_opcode_t)); |
241 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 264 | if (idx >= 0 && idx < slots_per_page(c)) { |
242 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | 265 | WARN_ON(kip->slot_used[idx] != SLOT_USED); |
243 | if (dirty) { | 266 | if (dirty) { |
244 | kip->slot_used[i] = SLOT_DIRTY; | 267 | kip->slot_used[idx] = SLOT_DIRTY; |
245 | kip->ngarbage++; | 268 | kip->ngarbage++; |
269 | if (++c->nr_garbage > slots_per_page(c)) | ||
270 | collect_garbage_slots(c); | ||
246 | } else | 271 | } else |
247 | collect_one_slot(kip, i); | 272 | collect_one_slot(kip, idx); |
248 | break; | 273 | return; |
249 | } | 274 | } |
250 | } | 275 | } |
276 | /* Could not free this slot. */ | ||
277 | WARN_ON(1); | ||
278 | } | ||
251 | 279 | ||
252 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | 280 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) |
253 | collect_garbage_slots(); | 281 | { |
254 | 282 | mutex_lock(&kprobe_insn_mutex); | |
283 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
255 | mutex_unlock(&kprobe_insn_mutex); | 284 | mutex_unlock(&kprobe_insn_mutex); |
256 | } | 285 | } |
286 | #ifdef CONFIG_OPTPROBES | ||
287 | /* For optimized_kprobe buffer */ | ||
288 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | ||
289 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | ||
290 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | ||
291 | /* .insn_size is initialized later */ | ||
292 | .nr_garbage = 0, | ||
293 | }; | ||
294 | /* Get a slot for optimized_kprobe buffer */ | ||
295 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
296 | { | ||
297 | kprobe_opcode_t *ret = NULL; | ||
298 | |||
299 | mutex_lock(&kprobe_optinsn_mutex); | ||
300 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
301 | mutex_unlock(&kprobe_optinsn_mutex); | ||
302 | |||
303 | return ret; | ||
304 | } | ||
305 | |||
306 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
307 | { | ||
308 | mutex_lock(&kprobe_optinsn_mutex); | ||
309 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
310 | mutex_unlock(&kprobe_optinsn_mutex); | ||
311 | } | ||
312 | #endif | ||
257 | #endif | 313 | #endif |
258 | 314 | ||
259 | /* We have preemption disabled.. so it is safe to use __ versions */ | 315 | /* We have preemption disabled.. so it is safe to use __ versions */ |
@@ -284,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) | |||
284 | if (p->addr == addr) | 340 | if (p->addr == addr) |
285 | return p; | 341 | return p; |
286 | } | 342 | } |
343 | |||
344 | return NULL; | ||
345 | } | ||
346 | |||
347 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); | ||
348 | |||
349 | /* Return true if the kprobe is an aggregator */ | ||
350 | static inline int kprobe_aggrprobe(struct kprobe *p) | ||
351 | { | ||
352 | return p->pre_handler == aggr_pre_handler; | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * Keep all fields in the kprobe consistent | ||
357 | */ | ||
358 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
359 | { | ||
360 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
361 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
362 | } | ||
363 | |||
364 | #ifdef CONFIG_OPTPROBES | ||
365 | /* NOTE: change this value only with kprobe_mutex held */ | ||
366 | static bool kprobes_allow_optimization; | ||
367 | |||
368 | /* | ||
369 | * Call all pre_handler on the list, but ignores its return value. | ||
370 | * This must be called from arch-dep optimized caller. | ||
371 | */ | ||
372 | void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
373 | { | ||
374 | struct kprobe *kp; | ||
375 | |||
376 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
377 | if (kp->pre_handler && likely(!kprobe_disabled(kp))) { | ||
378 | set_kprobe_instance(kp); | ||
379 | kp->pre_handler(kp, regs); | ||
380 | } | ||
381 | reset_kprobe_instance(); | ||
382 | } | ||
383 | } | ||
384 | |||
385 | /* Return true(!0) if the kprobe is ready for optimization. */ | ||
386 | static inline int kprobe_optready(struct kprobe *p) | ||
387 | { | ||
388 | struct optimized_kprobe *op; | ||
389 | |||
390 | if (kprobe_aggrprobe(p)) { | ||
391 | op = container_of(p, struct optimized_kprobe, kp); | ||
392 | return arch_prepared_optinsn(&op->optinsn); | ||
393 | } | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Return an optimized kprobe whose optimizing code replaces | ||
400 | * instructions including addr (excluding the breakpoint itself). | ||
401 | */ | ||
402 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | ||
403 | { | ||
404 | int i; | ||
405 | struct kprobe *p = NULL; | ||
406 | struct optimized_kprobe *op; | ||
407 | |||
408 | /* Don't check i == 0, since that is a breakpoint case. */ | ||
409 | for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) | ||
410 | p = get_kprobe((void *)(addr - i)); | ||
411 | |||
412 | if (p && kprobe_optready(p)) { | ||
413 | op = container_of(p, struct optimized_kprobe, kp); | ||
414 | if (arch_within_optimized_kprobe(op, addr)) | ||
415 | return p; | ||
416 | } | ||
417 | |||
287 | return NULL; | 418 | return NULL; |
288 | } | 419 | } |
289 | 420 | ||
421 | /* Optimization staging list, protected by kprobe_mutex */ | ||
422 | static LIST_HEAD(optimizing_list); | ||
423 | |||
424 | static void kprobe_optimizer(struct work_struct *work); | ||
425 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | ||
426 | #define OPTIMIZE_DELAY 5 | ||
427 | |||
428 | /* Kprobe jump optimizer */ | ||
429 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
430 | { | ||
431 | struct optimized_kprobe *op, *tmp; | ||
432 | |||
433 | /* Lock modules while optimizing kprobes */ | ||
434 | mutex_lock(&module_mutex); | ||
435 | mutex_lock(&kprobe_mutex); | ||
436 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
437 | goto end; | ||
438 | |||
439 | /* | ||
440 | * Wait for a quiescence period to ensure all running interrupts | ||
441 | * are done. Because an optprobe may modify multiple instructions, | ||
442 | * there is a chance that the Nth instruction is interrupted. In that | ||
443 | * case, the running interrupt can return into the 2nd-Nth byte of | ||
444 | * the jump instruction. This wait avoids that. | ||
445 | */ | ||
446 | synchronize_sched(); | ||
447 | |||
448 | /* | ||
449 | * Optimization/unoptimization refers to online_cpus via | ||
450 | * stop_machine(), while cpu-hotplug modifies online_cpus; at the | ||
451 | * same time, text_mutex is held both in cpu-hotplug and here. | ||
452 | * This combination can deadlock (cpu-hotplug tries to lock | ||
453 | * text_mutex, but stop_machine() cannot proceed because | ||
454 | * online_cpus has changed). | ||
455 | * To avoid this deadlock, call get_online_cpus() to keep | ||
456 | * cpu-hotplug from running outside of the text_mutex section. | ||
457 | */ | ||
458 | get_online_cpus(); | ||
459 | mutex_lock(&text_mutex); | ||
460 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | ||
461 | WARN_ON(kprobe_disabled(&op->kp)); | ||
462 | if (arch_optimize_kprobe(op) < 0) | ||
463 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
464 | list_del_init(&op->list); | ||
465 | } | ||
466 | mutex_unlock(&text_mutex); | ||
467 | put_online_cpus(); | ||
468 | end: | ||
469 | mutex_unlock(&kprobe_mutex); | ||
470 | mutex_unlock(&module_mutex); | ||
471 | } | ||
472 | |||
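kprobe_optimizer() is the consumer side of a classic batching idiom: optimize_kprobe() (below) queues entries on optimizing_list and arms a single delayed work item OPTIMIZE_DELAY jiffies out, so a burst of registrations collapses into one synchronize_sched() and one pass under text_mutex. The producer/consumer skeleton, reduced to its moving parts (a sketch under those assumptions, not the kernel code; locking elided):

	#include <linux/workqueue.h>
	#include <linux/list.h>

	static LIST_HEAD(pending);
	static void drain(struct work_struct *work);
	static DECLARE_DELAYED_WORK(drain_work, drain);

	static void queue_one(struct list_head *item)
	{
		list_add(item, &pending);	/* caller holds the list mutex */
		if (!delayed_work_pending(&drain_work))
			schedule_delayed_work(&drain_work, 5);	/* batch window */
	}

	static void drain(struct work_struct *work)
	{
		struct list_head *pos, *n;

		list_for_each_safe(pos, n, &pending)
			list_del_init(pos);	/* process each queued item once */
	}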
473 | /* Optimize kprobe if p is ready to be optimized */ | ||
474 | static __kprobes void optimize_kprobe(struct kprobe *p) | ||
475 | { | ||
476 | struct optimized_kprobe *op; | ||
477 | |||
478 | /* Check if the kprobe is disabled or not ready for optimization. */ | ||
479 | if (!kprobe_optready(p) || !kprobes_allow_optimization || | ||
480 | (kprobe_disabled(p) || kprobes_all_disarmed)) | ||
481 | return; | ||
482 | |||
483 | /* Both of break_handler and post_handler are not supported. */ | ||
484 | if (p->break_handler || p->post_handler) | ||
485 | return; | ||
486 | |||
487 | op = container_of(p, struct optimized_kprobe, kp); | ||
488 | |||
489 | /* Check that no other kprobes sit on the optimized instructions */ | ||
490 | if (arch_check_optimized_kprobe(op) < 0) | ||
491 | return; | ||
492 | |||
493 | /* Check if it is already optimized. */ | ||
494 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | ||
495 | return; | ||
496 | |||
497 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | ||
498 | list_add(&op->list, &optimizing_list); | ||
499 | if (!delayed_work_pending(&optimizing_work)) | ||
500 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
501 | } | ||
502 | |||
503 | /* Unoptimize a kprobe if p is optimized */ | ||
504 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | ||
505 | { | ||
506 | struct optimized_kprobe *op; | ||
507 | |||
508 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | ||
509 | op = container_of(p, struct optimized_kprobe, kp); | ||
510 | if (!list_empty(&op->list)) | ||
511 | /* Dequeue from the optimization queue */ | ||
512 | list_del_init(&op->list); | ||
513 | else | ||
514 | /* Replace jump with break */ | ||
515 | arch_unoptimize_kprobe(op); | ||
516 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | /* Remove optimized instructions */ | ||
521 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | ||
522 | { | ||
523 | struct optimized_kprobe *op; | ||
524 | |||
525 | op = container_of(p, struct optimized_kprobe, kp); | ||
526 | if (!list_empty(&op->list)) { | ||
527 | /* Dequeue from the optimization queue */ | ||
528 | list_del_init(&op->list); | ||
529 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
530 | } | ||
531 | /* Don't unoptimize, because the target code will be freed. */ | ||
532 | arch_remove_optimized_kprobe(op); | ||
533 | } | ||
534 | |||
535 | /* Try to prepare optimized instructions */ | ||
536 | static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | ||
537 | { | ||
538 | struct optimized_kprobe *op; | ||
539 | |||
540 | op = container_of(p, struct optimized_kprobe, kp); | ||
541 | arch_prepare_optimized_kprobe(op); | ||
542 | } | ||
543 | |||
544 | /* Free optimized instructions and optimized_kprobe */ | ||
545 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
546 | { | ||
547 | struct optimized_kprobe *op; | ||
548 | |||
549 | op = container_of(p, struct optimized_kprobe, kp); | ||
550 | arch_remove_optimized_kprobe(op); | ||
551 | kfree(op); | ||
552 | } | ||
553 | |||
554 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | ||
555 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
556 | { | ||
557 | struct optimized_kprobe *op; | ||
558 | |||
559 | op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); | ||
560 | if (!op) | ||
561 | return NULL; | ||
562 | |||
563 | INIT_LIST_HEAD(&op->list); | ||
564 | op->kp.addr = p->addr; | ||
565 | arch_prepare_optimized_kprobe(op); | ||
566 | |||
567 | return &op->kp; | ||
568 | } | ||
569 | |||
570 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); | ||
571 | |||
572 | /* | ||
573 | * Prepare an optimized_kprobe and optimize it | ||
574 | * NOTE: p must be a normal registered kprobe | ||
575 | */ | ||
576 | static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | ||
577 | { | ||
578 | struct kprobe *ap; | ||
579 | struct optimized_kprobe *op; | ||
580 | |||
581 | ap = alloc_aggr_kprobe(p); | ||
582 | if (!ap) | ||
583 | return; | ||
584 | |||
585 | op = container_of(ap, struct optimized_kprobe, kp); | ||
586 | if (!arch_prepared_optinsn(&op->optinsn)) { | ||
587 | /* If failed to setup optimizing, fallback to kprobe */ | ||
588 | free_aggr_kprobe(ap); | ||
589 | return; | ||
590 | } | ||
591 | |||
592 | init_aggr_kprobe(ap, p); | ||
593 | optimize_kprobe(ap); | ||
594 | } | ||
595 | |||
596 | #ifdef CONFIG_SYSCTL | ||
597 | static void __kprobes optimize_all_kprobes(void) | ||
598 | { | ||
599 | struct hlist_head *head; | ||
600 | struct hlist_node *node; | ||
601 | struct kprobe *p; | ||
602 | unsigned int i; | ||
603 | |||
604 | /* If optimization is already allowed, just return */ | ||
605 | if (kprobes_allow_optimization) | ||
606 | return; | ||
607 | |||
608 | kprobes_allow_optimization = true; | ||
609 | mutex_lock(&text_mutex); | ||
610 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
611 | head = &kprobe_table[i]; | ||
612 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
613 | if (!kprobe_disabled(p)) | ||
614 | optimize_kprobe(p); | ||
615 | } | ||
616 | mutex_unlock(&text_mutex); | ||
617 | printk(KERN_INFO "Kprobes globally optimized\n"); | ||
618 | } | ||
619 | |||
620 | static void __kprobes unoptimize_all_kprobes(void) | ||
621 | { | ||
622 | struct hlist_head *head; | ||
623 | struct hlist_node *node; | ||
624 | struct kprobe *p; | ||
625 | unsigned int i; | ||
626 | |||
627 | /* If optimization is already prohibited, just return */ | ||
628 | if (!kprobes_allow_optimization) | ||
629 | return; | ||
630 | |||
631 | kprobes_allow_optimization = false; | ||
632 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
633 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
634 | mutex_lock(&text_mutex); | ||
635 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
636 | head = &kprobe_table[i]; | ||
637 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
638 | if (!kprobe_disabled(p)) | ||
639 | unoptimize_kprobe(p); | ||
640 | } | ||
641 | } | ||
642 | |||
643 | mutex_unlock(&text_mutex); | ||
644 | put_online_cpus(); | ||
645 | /* Allow all currently running kprobes to complete */ | ||
646 | synchronize_sched(); | ||
647 | } | ||
648 | |||
649 | int sysctl_kprobes_optimization; | ||
650 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | ||
651 | void __user *buffer, size_t *length, | ||
652 | loff_t *ppos) | ||
653 | { | ||
654 | int ret; | ||
655 | |||
656 | mutex_lock(&kprobe_mutex); | ||
657 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; | ||
658 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
659 | |||
660 | if (sysctl_kprobes_optimization) | ||
661 | optimize_all_kprobes(); | ||
662 | else | ||
663 | unoptimize_all_kprobes(); | ||
664 | mutex_unlock(&kprobe_mutex); | ||
665 | |||
666 | return ret; | ||
667 | } | ||
668 | #endif /* CONFIG_SYSCTL */ | ||
669 | |||
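The handler above backs a runtime switch; in the merged tree it is wired up as debug.kprobes-optimization (the sysctl table registration is outside this hunk, so the exact path comes from the surrounding series rather than these lines). Toggling it from userspace looks like:

	# cat /proc/sys/debug/kprobes-optimization
	1
	# echo 0 > /proc/sys/debug/kprobes-optimization	# runs unoptimize_all_kprobes()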
670 | static void __kprobes __arm_kprobe(struct kprobe *p) | ||
671 | { | ||
672 | struct kprobe *old_p; | ||
673 | |||
674 | /* Check collision with other optimized kprobes */ | ||
675 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
676 | if (unlikely(old_p)) | ||
677 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | ||
678 | |||
679 | arch_arm_kprobe(p); | ||
680 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | ||
681 | } | ||
682 | |||
683 | static void __kprobes __disarm_kprobe(struct kprobe *p) | ||
684 | { | ||
685 | struct kprobe *old_p; | ||
686 | |||
687 | unoptimize_kprobe(p); /* Try to unoptimize */ | ||
688 | arch_disarm_kprobe(p); | ||
689 | |||
690 | /* If another kprobe was blocked, optimize it. */ | ||
691 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
692 | if (unlikely(old_p)) | ||
693 | optimize_kprobe(old_p); | ||
694 | } | ||
695 | |||
696 | #else /* !CONFIG_OPTPROBES */ | ||
697 | |||
698 | #define optimize_kprobe(p) do {} while (0) | ||
699 | #define unoptimize_kprobe(p) do {} while (0) | ||
700 | #define kill_optimized_kprobe(p) do {} while (0) | ||
701 | #define prepare_optimized_kprobe(p) do {} while (0) | ||
702 | #define try_to_optimize_kprobe(p) do {} while (0) | ||
703 | #define __arm_kprobe(p) arch_arm_kprobe(p) | ||
704 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | ||
705 | |||
706 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
707 | { | ||
708 | kfree(p); | ||
709 | } | ||
710 | |||
711 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
712 | { | ||
713 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); | ||
714 | } | ||
715 | #endif /* CONFIG_OPTPROBES */ | ||
716 | |||
290 | /* Arm a kprobe with text_mutex */ | 717 | /* Arm a kprobe with text_mutex */ |
291 | static void __kprobes arm_kprobe(struct kprobe *kp) | 718 | static void __kprobes arm_kprobe(struct kprobe *kp) |
292 | { | 719 | { |
720 | /* | ||
721 | * Here, since __arm_kprobe() doesn't use stop_machine(), | ||
722 | * this doesn't cause deadlock on text_mutex. So, we don't | ||
723 | * need get_online_cpus(). | ||
724 | */ | ||
293 | mutex_lock(&text_mutex); | 725 | mutex_lock(&text_mutex); |
294 | arch_arm_kprobe(kp); | 726 | __arm_kprobe(kp); |
295 | mutex_unlock(&text_mutex); | 727 | mutex_unlock(&text_mutex); |
296 | } | 728 | } |
297 | 729 | ||
298 | /* Disarm a kprobe with text_mutex */ | 730 | /* Disarm a kprobe with text_mutex */ |
299 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 731 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
300 | { | 732 | { |
733 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
301 | mutex_lock(&text_mutex); | 734 | mutex_lock(&text_mutex); |
302 | arch_disarm_kprobe(kp); | 735 | __disarm_kprobe(kp); |
303 | mutex_unlock(&text_mutex); | 736 | mutex_unlock(&text_mutex); |
737 | put_online_cpus(); | ||
304 | } | 738 | } |
305 | 739 | ||
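Note the asymmetry the comments call out: arm_kprobe() may take text_mutex directly because __arm_kprobe() never uses stop_machine(), while disarm_kprobe() (and register_kprobe() below) must bracket text_mutex with get_online_cpus()/put_online_cpus() to honor the deadlock avoidance described in kprobe_optimizer(). The ordering, as a rule of thumb:

	get_online_cpus();		/* cpu-hotplug read lock: always first */
	mutex_lock(&text_mutex);	/* then the text-patching lock        */
	/* ... modify kernel text ... */
	mutex_unlock(&text_mutex);
	put_online_cpus();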
306 | /* | 740 | /* |
@@ -369,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
369 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 803 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) |
370 | { | 804 | { |
371 | struct kprobe *kp; | 805 | struct kprobe *kp; |
372 | if (p->pre_handler != aggr_pre_handler) { | 806 | if (!kprobe_aggrprobe(p)) { |
373 | p->nmissed++; | 807 | p->nmissed++; |
374 | } else { | 808 | } else { |
375 | list_for_each_entry_rcu(kp, &p->list, list) | 809 | list_for_each_entry_rcu(kp, &p->list, list) |
@@ -493,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
493 | } | 927 | } |
494 | 928 | ||
495 | /* | 929 | /* |
496 | * Keep all fields in the kprobe consistent | ||
497 | */ | ||
498 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
499 | { | ||
500 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
501 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Add the new probe to ap->list. Fail if this is the | 930 | * Add the new probe to ap->list. Fail if this is the |
506 | * second jprobe at the address - two jprobes can't coexist | 931 | * second jprobe at the address - two jprobes can't coexist |
507 | */ | 932 | */ |
508 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | 933 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) |
509 | { | 934 | { |
510 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 935 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
936 | |||
937 | if (p->break_handler || p->post_handler) | ||
938 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | ||
939 | |||
511 | if (p->break_handler) { | 940 | if (p->break_handler) { |
512 | if (ap->break_handler) | 941 | if (ap->break_handler) |
513 | return -EEXIST; | 942 | return -EEXIST; |
@@ -522,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
522 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 951 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
523 | if (!kprobes_all_disarmed) | 952 | if (!kprobes_all_disarmed) |
524 | /* Arm the breakpoint again. */ | 953 | /* Arm the breakpoint again. */ |
525 | arm_kprobe(ap); | 954 | __arm_kprobe(ap); |
526 | } | 955 | } |
527 | return 0; | 956 | return 0; |
528 | } | 957 | } |
@@ -531,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
531 | * Fill in the required fields of the "manager kprobe". Replace the | 960 | * Fill in the required fields of the "manager kprobe". Replace the |
532 | * earlier kprobe in the hlist with the manager kprobe | 961 | * earlier kprobe in the hlist with the manager kprobe |
533 | */ | 962 | */ |
534 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 963 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
535 | { | 964 | { |
965 | /* Copy p's insn slot to ap */ | ||
536 | copy_kprobe(p, ap); | 966 | copy_kprobe(p, ap); |
537 | flush_insn_slot(ap); | 967 | flush_insn_slot(ap); |
538 | ap->addr = p->addr; | 968 | ap->addr = p->addr; |
539 | ap->flags = p->flags; | 969 | ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; |
540 | ap->pre_handler = aggr_pre_handler; | 970 | ap->pre_handler = aggr_pre_handler; |
541 | ap->fault_handler = aggr_fault_handler; | 971 | ap->fault_handler = aggr_fault_handler; |
542 | /* We don't care about kprobes that are gone. */ | 972 | /* We don't care about kprobes that are gone. */ |
@@ -546,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
546 | ap->break_handler = aggr_break_handler; | 976 | ap->break_handler = aggr_break_handler; |
547 | 977 | ||
548 | INIT_LIST_HEAD(&ap->list); | 978 | INIT_LIST_HEAD(&ap->list); |
549 | list_add_rcu(&p->list, &ap->list); | 979 | INIT_HLIST_NODE(&ap->hlist); |
550 | 980 | ||
981 | list_add_rcu(&p->list, &ap->list); | ||
551 | hlist_replace_rcu(&p->hlist, &ap->hlist); | 982 | hlist_replace_rcu(&p->hlist, &ap->hlist); |
552 | } | 983 | } |
553 | 984 | ||
@@ -561,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
561 | int ret = 0; | 992 | int ret = 0; |
562 | struct kprobe *ap = old_p; | 993 | struct kprobe *ap = old_p; |
563 | 994 | ||
564 | if (old_p->pre_handler != aggr_pre_handler) { | 995 | if (!kprobe_aggrprobe(old_p)) { |
565 | /* If old_p is not an aggr_probe, create new aggr_kprobe. */ | 996 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ |
566 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 997 | ap = alloc_aggr_kprobe(old_p); |
567 | if (!ap) | 998 | if (!ap) |
568 | return -ENOMEM; | 999 | return -ENOMEM; |
569 | add_aggr_kprobe(ap, old_p); | 1000 | init_aggr_kprobe(ap, old_p); |
570 | } | 1001 | } |
571 | 1002 | ||
572 | if (kprobe_gone(ap)) { | 1003 | if (kprobe_gone(ap)) { |
@@ -585,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
585 | */ | 1016 | */ |
586 | return ret; | 1017 | return ret; |
587 | 1018 | ||
1019 | /* Prepare optimized instructions if possible. */ | ||
1020 | prepare_optimized_kprobe(ap); | ||
1021 | |||
588 | /* | 1022 | /* |
589 | * Clear gone flag to prevent allocating new slot again, and | 1023 | * Clear gone flag to prevent allocating new slot again, and |
590 | * set disabled flag because it is not armed yet. | 1024 | * set disabled flag because it is not armed yet. |
@@ -593,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
593 | | KPROBE_FLAG_DISABLED; | 1027 | | KPROBE_FLAG_DISABLED; |
594 | } | 1028 | } |
595 | 1029 | ||
1030 | /* Copy ap's insn slot to p */ | ||
596 | copy_kprobe(ap, p); | 1031 | copy_kprobe(ap, p); |
597 | return add_new_kprobe(ap, p); | 1032 | return add_new_kprobe(ap, p); |
598 | } | 1033 | } |
@@ -743,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
743 | p->nmissed = 0; | 1178 | p->nmissed = 0; |
744 | INIT_LIST_HEAD(&p->list); | 1179 | INIT_LIST_HEAD(&p->list); |
745 | mutex_lock(&kprobe_mutex); | 1180 | mutex_lock(&kprobe_mutex); |
1181 | |||
1182 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | ||
1183 | mutex_lock(&text_mutex); | ||
1184 | |||
746 | old_p = get_kprobe(p->addr); | 1185 | old_p = get_kprobe(p->addr); |
747 | if (old_p) { | 1186 | if (old_p) { |
1187 | /* Since this may unoptimize old_p, locking text_mutex. */ | ||
748 | ret = register_aggr_kprobe(old_p, p); | 1188 | ret = register_aggr_kprobe(old_p, p); |
749 | goto out; | 1189 | goto out; |
750 | } | 1190 | } |
751 | 1191 | ||
752 | mutex_lock(&text_mutex); | ||
753 | ret = arch_prepare_kprobe(p); | 1192 | ret = arch_prepare_kprobe(p); |
754 | if (ret) | 1193 | if (ret) |
755 | goto out_unlock_text; | 1194 | goto out; |
756 | 1195 | ||
757 | INIT_HLIST_NODE(&p->hlist); | 1196 | INIT_HLIST_NODE(&p->hlist); |
758 | hlist_add_head_rcu(&p->hlist, | 1197 | hlist_add_head_rcu(&p->hlist, |
759 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1198 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
760 | 1199 | ||
761 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1200 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
762 | arch_arm_kprobe(p); | 1201 | __arm_kprobe(p); |
1202 | |||
1203 | /* Try to optimize kprobe */ | ||
1204 | try_to_optimize_kprobe(p); | ||
763 | 1205 | ||
764 | out_unlock_text: | ||
765 | mutex_unlock(&text_mutex); | ||
766 | out: | 1206 | out: |
1207 | mutex_unlock(&text_mutex); | ||
1208 | put_online_cpus(); | ||
767 | mutex_unlock(&kprobe_mutex); | 1209 | mutex_unlock(&kprobe_mutex); |
768 | 1210 | ||
769 | if (probed_mod) | 1211 | if (probed_mod) |
@@ -785,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
785 | return -EINVAL; | 1227 | return -EINVAL; |
786 | 1228 | ||
787 | if (old_p == p || | 1229 | if (old_p == p || |
788 | (old_p->pre_handler == aggr_pre_handler && | 1230 | (kprobe_aggrprobe(old_p) && |
789 | list_is_singular(&old_p->list))) { | 1231 | list_is_singular(&old_p->list))) { |
790 | /* | 1232 | /* |
791 | * Only probe on the hash list. Disarm only if kprobes are | 1233 | * Only probe on the hash list. Disarm only if kprobes are |
@@ -793,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
793 | * already have been removed. We save on flushing icache. | 1235 | * already have been removed. We save on flushing icache. |
794 | */ | 1236 | */ |
795 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1237 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) |
796 | disarm_kprobe(p); | 1238 | disarm_kprobe(old_p); |
797 | hlist_del_rcu(&old_p->hlist); | 1239 | hlist_del_rcu(&old_p->hlist); |
798 | } else { | 1240 | } else { |
799 | if (p->break_handler && !kprobe_gone(p)) | 1241 | if (p->break_handler && !kprobe_gone(p)) |
@@ -809,8 +1251,13 @@ noclean: | |||
809 | list_del_rcu(&p->list); | 1251 | list_del_rcu(&p->list); |
810 | if (!kprobe_disabled(old_p)) { | 1252 | if (!kprobe_disabled(old_p)) { |
811 | try_to_disable_aggr_kprobe(old_p); | 1253 | try_to_disable_aggr_kprobe(old_p); |
812 | if (!kprobes_all_disarmed && kprobe_disabled(old_p)) | 1254 | if (!kprobes_all_disarmed) { |
813 | disarm_kprobe(old_p); | 1255 | if (kprobe_disabled(old_p)) |
1256 | disarm_kprobe(old_p); | ||
1257 | else | ||
1258 | /* Try to optimize this probe again */ | ||
1259 | optimize_kprobe(old_p); | ||
1260 | } | ||
814 | } | 1261 | } |
815 | } | 1262 | } |
816 | return 0; | 1263 | return 0; |
@@ -827,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | |||
827 | old_p = list_entry(p->list.next, struct kprobe, list); | 1274 | old_p = list_entry(p->list.next, struct kprobe, list); |
828 | list_del(&p->list); | 1275 | list_del(&p->list); |
829 | arch_remove_kprobe(old_p); | 1276 | arch_remove_kprobe(old_p); |
830 | kfree(old_p); | 1277 | free_aggr_kprobe(old_p); |
831 | } | 1278 | } |
832 | } | 1279 | } |
833 | 1280 | ||
@@ -1123,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1123 | struct kprobe *kp; | 1570 | struct kprobe *kp; |
1124 | 1571 | ||
1125 | p->flags |= KPROBE_FLAG_GONE; | 1572 | p->flags |= KPROBE_FLAG_GONE; |
1126 | if (p->pre_handler == aggr_pre_handler) { | 1573 | if (kprobe_aggrprobe(p)) { |
1127 | /* | 1574 | /* |
1128 | * If this is an aggr_kprobe, we have to list all the | 1575 | * If this is an aggr_kprobe, we have to list all the |
1129 | * chained probes and mark them GONE. | 1576 | * chained probes and mark them GONE. |
@@ -1132,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1132 | kp->flags |= KPROBE_FLAG_GONE; | 1579 | kp->flags |= KPROBE_FLAG_GONE; |
1133 | p->post_handler = NULL; | 1580 | p->post_handler = NULL; |
1134 | p->break_handler = NULL; | 1581 | p->break_handler = NULL; |
1582 | kill_optimized_kprobe(p); | ||
1135 | } | 1583 | } |
1136 | /* | 1584 | /* |
1137 | * Here, we can remove insn_slot safely, because no thread calls | 1585 | * Here, we can remove insn_slot safely, because no thread calls |
@@ -1241,6 +1689,15 @@ static int __init init_kprobes(void) | |||
1241 | } | 1689 | } |
1242 | } | 1690 | } |
1243 | 1691 | ||
1692 | #if defined(CONFIG_OPTPROBES) | ||
1693 | #if defined(__ARCH_WANT_KPROBES_INSN_SLOT) | ||
1694 | /* Init kprobe_optinsn_slots */ | ||
1695 | kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; | ||
1696 | #endif | ||
1697 | /* By default, kprobes can be optimized */ | ||
1698 | kprobes_allow_optimization = true; | ||
1699 | #endif | ||
1700 | |||
1244 | /* By default, kprobes are armed */ | 1701 | /* By default, kprobes are armed */ |
1245 | kprobes_all_disarmed = false; | 1702 | kprobes_all_disarmed = false; |
1246 | 1703 | ||
@@ -1259,7 +1716,7 @@ static int __init init_kprobes(void) | |||
1259 | 1716 | ||
1260 | #ifdef CONFIG_DEBUG_FS | 1717 | #ifdef CONFIG_DEBUG_FS |
1261 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 1718 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, |
1262 | const char *sym, int offset,char *modname) | 1719 | const char *sym, int offset, char *modname, struct kprobe *pp) |
1263 | { | 1720 | { |
1264 | char *kprobe_type; | 1721 | char *kprobe_type; |
1265 | 1722 | ||
@@ -1269,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
1269 | kprobe_type = "j"; | 1726 | kprobe_type = "j"; |
1270 | else | 1727 | else |
1271 | kprobe_type = "k"; | 1728 | kprobe_type = "k"; |
1729 | |||
1272 | if (sym) | 1730 | if (sym) |
1273 | seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", | 1731 | seq_printf(pi, "%p %s %s+0x%x %s ", |
1274 | p->addr, kprobe_type, sym, offset, | 1732 | p->addr, kprobe_type, sym, offset, |
1275 | (modname ? modname : " "), | 1733 | (modname ? modname : " ")); |
1276 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
1277 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | ||
1278 | "[DISABLED]" : "")); | ||
1279 | else | 1734 | else |
1280 | seq_printf(pi, "%p %s %p %s%s\n", | 1735 | seq_printf(pi, "%p %s %p ", |
1281 | p->addr, kprobe_type, p->addr, | 1736 | p->addr, kprobe_type, p->addr); |
1282 | (kprobe_gone(p) ? "[GONE]" : ""), | 1737 | |
1283 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | 1738 | if (!pp) |
1284 | "[DISABLED]" : "")); | 1739 | pp = p; |
1740 | seq_printf(pi, "%s%s%s\n", | ||
1741 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
1742 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | ||
1743 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | ||
1285 | } | 1744 | } |
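With the extra pp argument, a probe chained under an aggregator now reports the aggregator's optimization state instead of only its own flags. Reading the seq_printf() format strings above, a line in the debugfs list would look roughly like this (address and symbol are made up for illustration; the module column is blank for core-kernel symbols):

    c04f1ab0  k  do_fork+0x0    [OPTIMIZED]

The [GONE] and [DISABLED] markers still come from the individual kprobe p, while [OPTIMIZED] is taken from the shared aggr_kprobe pp.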
1286 | 1745 | ||
1287 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 1746 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
@@ -1317,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
1317 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1776 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1318 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 1777 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
1319 | &offset, &modname, namebuf); | 1778 | &offset, &modname, namebuf); |
1320 | if (p->pre_handler == aggr_pre_handler) { | 1779 | if (kprobe_aggrprobe(p)) { |
1321 | list_for_each_entry_rcu(kp, &p->list, list) | 1780 | list_for_each_entry_rcu(kp, &p->list, list) |
1322 | report_probe(pi, kp, sym, offset, modname); | 1781 | report_probe(pi, kp, sym, offset, modname, p); |
1323 | } else | 1782 | } else |
1324 | report_probe(pi, p, sym, offset, modname); | 1783 | report_probe(pi, p, sym, offset, modname, NULL); |
1325 | } | 1784 | } |
1326 | preempt_enable(); | 1785 | preempt_enable(); |
1327 | return 0; | 1786 | return 0; |
@@ -1399,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) | |||
1399 | goto out; | 1858 | goto out; |
1400 | } | 1859 | } |
1401 | 1860 | ||
1402 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1403 | arm_kprobe(p); | ||
1404 | |||
1405 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
1406 | if (p != kp) | 1861 | if (p != kp) |
1407 | kp->flags &= ~KPROBE_FLAG_DISABLED; | 1862 | kp->flags &= ~KPROBE_FLAG_DISABLED; |
1863 | |||
1864 | if (!kprobes_all_disarmed && kprobe_disabled(p)) { | ||
1865 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
1866 | arm_kprobe(p); | ||
1867 | } | ||
1408 | out: | 1868 | out: |
1409 | mutex_unlock(&kprobe_mutex); | 1869 | mutex_unlock(&kprobe_mutex); |
1410 | return ret; | 1870 | return ret; |
@@ -1424,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void) | |||
1424 | if (!kprobes_all_disarmed) | 1884 | if (!kprobes_all_disarmed) |
1425 | goto already_enabled; | 1885 | goto already_enabled; |
1426 | 1886 | ||
1887 | /* Arming kprobes doesn't optimize the kprobe itself */ | ||

1427 | mutex_lock(&text_mutex); | 1888 | mutex_lock(&text_mutex); |
1428 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1889 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1429 | head = &kprobe_table[i]; | 1890 | head = &kprobe_table[i]; |
1430 | hlist_for_each_entry_rcu(p, node, head, hlist) | 1891 | hlist_for_each_entry_rcu(p, node, head, hlist) |
1431 | if (!kprobe_disabled(p)) | 1892 | if (!kprobe_disabled(p)) |
1432 | arch_arm_kprobe(p); | 1893 | __arm_kprobe(p); |
1433 | } | 1894 | } |
1434 | mutex_unlock(&text_mutex); | 1895 | mutex_unlock(&text_mutex); |
1435 | 1896 | ||
@@ -1456,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void) | |||
1456 | 1917 | ||
1457 | kprobes_all_disarmed = true; | 1918 | kprobes_all_disarmed = true; |
1458 | printk(KERN_INFO "Kprobes globally disabled\n"); | 1919 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1920 | |||
1921 | /* | ||
1922 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1923 | * because disarming may also unoptimize kprobes. | ||
1924 | */ | ||
1925 | get_online_cpus(); | ||
1459 | mutex_lock(&text_mutex); | 1926 | mutex_lock(&text_mutex); |
1460 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1927 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1461 | head = &kprobe_table[i]; | 1928 | head = &kprobe_table[i]; |
1462 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1929 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1463 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 1930 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1464 | arch_disarm_kprobe(p); | 1931 | __disarm_kprobe(p); |
1465 | } | 1932 | } |
1466 | } | 1933 | } |
1467 | 1934 | ||
1468 | mutex_unlock(&text_mutex); | 1935 | mutex_unlock(&text_mutex); |
1936 | put_online_cpus(); | ||
1469 | mutex_unlock(&kprobe_mutex); | 1937 | mutex_unlock(&kprobe_mutex); |
1470 | /* Allow all currently running kprobes to complete */ | 1938 | /* Allow all currently running kprobes to complete */ |
1471 | synchronize_sched(); | 1939 | synchronize_sched(); |
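The get_online_cpus()/put_online_cpus() pair added around text_mutex pins down a lock ordering: disarming may unoptimize a probe, unoptimizing rewrites kernel text, and text patching is done under the cpu-hotplug read lock. A minimal sketch of the convention this hunk establishes (the deadlock it avoids is the reverse nesting):

    get_online_cpus();              /* outer: cpu-hotplug read side */
    mutex_lock(&text_mutex);        /* inner: serializes text patching */
    /* __disarm_kprobe() may unoptimize, i.e. patch kernel text */
    mutex_unlock(&text_mutex);
    put_online_cpus();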
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..21fe3c426948 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, | |||
33 | } | 33 | } |
34 | KERNEL_ATTR_RO(uevent_seqnum); | 34 | KERNEL_ATTR_RO(uevent_seqnum); |
35 | 35 | ||
36 | /* uevent helper program, used during early boo */ | 36 | /* uevent helper program, used during early boot */ |
37 | static ssize_t uevent_helper_show(struct kobject *kobj, | 37 | static ssize_t uevent_helper_show(struct kobject *kobj, |
38 | struct kobj_attribute *attr, char *buf) | 38 | struct kobj_attribute *attr, char *buf) |
39 | { | 39 | { |
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void) | |||
197 | goto group_exit; | 197 | goto group_exit; |
198 | } | 198 | } |
199 | 199 | ||
200 | /* create the /sys/kernel/uids/ directory */ | ||
201 | error = uids_sysfs_init(); | ||
202 | if (error) | ||
203 | goto notes_exit; | ||
204 | |||
205 | return 0; | 200 | return 0; |
206 | 201 | ||
207 | notes_exit: | ||
208 | if (notes_size > 0) | ||
209 | sysfs_remove_bin_file(kernel_kobj, ¬es_attr); | ||
210 | group_exit: | 202 | group_exit: |
211 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | 203 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); |
212 | kset_exit: | 204 | kset_exit: |
diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e0..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
101 | * | 101 | * |
102 | * Description: This helper function creates and names a kernel | 102 | * Description: This helper function creates and names a kernel |
103 | * thread. The thread will be stopped: use wake_up_process() to start | 103 | * thread. The thread will be stopped: use wake_up_process() to start |
104 | * it. See also kthread_run(), kthread_create_on_cpu(). | 104 | * it. See also kthread_run(). |
105 | * | 105 | * |
106 | * When woken, the thread will run @threadfn() with @data as its | 106 | * When woken, the thread will run @threadfn() with @data as its |
107 | * argument. @threadfn() can either call do_exit() directly if it is a | 107 | * argument. @threadfn() can either call do_exit() directly if it is a |
@@ -219,7 +219,7 @@ int kthreadd(void *unused) | |||
219 | set_task_comm(tsk, "kthreadd"); | 219 | set_task_comm(tsk, "kthreadd"); |
220 | ignore_signals(tsk); | 220 | ignore_signals(tsk); |
221 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 221 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
222 | set_mems_allowed(node_possible_map); | 222 | set_mems_allowed(node_states[N_HIGH_MEMORY]); |
223 | 223 | ||
224 | current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; | 224 | current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; |
225 | 225 | ||
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c0c914..877fb306d415 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -56,7 +56,6 @@ | |||
56 | #include <linux/module.h> | 56 | #include <linux/module.h> |
57 | #include <linux/sched.h> | 57 | #include <linux/sched.h> |
58 | #include <linux/list.h> | 58 | #include <linux/list.h> |
59 | #include <linux/slab.h> | ||
60 | #include <linux/stacktrace.h> | 59 | #include <linux/stacktrace.h> |
61 | 60 | ||
62 | static DEFINE_SPINLOCK(latency_lock); | 61 | static DEFINE_SPINLOCK(latency_lock); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe49..2594e1ce41cb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/ftrace.h> | 43 | #include <linux/ftrace.h> |
44 | #include <linux/stringify.h> | 44 | #include <linux/stringify.h> |
45 | #include <linux/bitops.h> | 45 | #include <linux/bitops.h> |
46 | #include <linux/gfp.h> | ||
46 | 47 | ||
47 | #include <asm/sections.h> | 48 | #include <asm/sections.h> |
48 | 49 | ||
@@ -582,9 +583,6 @@ static int static_obj(void *obj) | |||
582 | unsigned long start = (unsigned long) &_stext, | 583 | unsigned long start = (unsigned long) &_stext, |
583 | end = (unsigned long) &_end, | 584 | end = (unsigned long) &_end, |
584 | addr = (unsigned long) obj; | 585 | addr = (unsigned long) obj; |
585 | #ifdef CONFIG_SMP | ||
586 | int i; | ||
587 | #endif | ||
588 | 586 | ||
589 | /* | 587 | /* |
590 | * static variable? | 588 | * static variable? |
@@ -595,24 +593,16 @@ static int static_obj(void *obj) | |||
595 | if (arch_is_kernel_data(addr)) | 593 | if (arch_is_kernel_data(addr)) |
596 | return 1; | 594 | return 1; |
597 | 595 | ||
598 | #ifdef CONFIG_SMP | ||
599 | /* | 596 | /* |
600 | * percpu var? | 597 | * in-kernel percpu var? |
601 | */ | 598 | */ |
602 | for_each_possible_cpu(i) { | 599 | if (is_kernel_percpu_address(addr)) |
603 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | 600 | return 1; |
604 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM | ||
605 | + per_cpu_offset(i); | ||
606 | |||
607 | if ((addr >= start) && (addr < end)) | ||
608 | return 1; | ||
609 | } | ||
610 | #endif | ||
611 | 601 | ||
612 | /* | 602 | /* |
613 | * module var? | 603 | * module static or percpu var? |
614 | */ | 604 | */ |
615 | return is_module_address(addr); | 605 | return is_module_address(addr) || is_module_percpu_address(addr); |
616 | } | 606 | } |
617 | 607 | ||
618 | /* | 608 | /* |
@@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
2147 | return ret; | 2137 | return ret; |
2148 | 2138 | ||
2149 | return print_irq_inversion_bug(curr, &root, target_entry, | 2139 | return print_irq_inversion_bug(curr, &root, target_entry, |
2150 | this, 1, irqclass); | 2140 | this, 0, irqclass); |
2151 | } | 2141 | } |
2152 | 2142 | ||
2153 | void print_irqtrace_events(struct task_struct *curr) | 2143 | void print_irqtrace_events(struct task_struct *curr) |
@@ -3211,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3211 | { | 3201 | { |
3212 | unsigned long flags; | 3202 | unsigned long flags; |
3213 | 3203 | ||
3214 | trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); | ||
3215 | |||
3216 | if (unlikely(current->lockdep_recursion)) | 3204 | if (unlikely(current->lockdep_recursion)) |
3217 | return; | 3205 | return; |
3218 | 3206 | ||
@@ -3220,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3220 | check_flags(flags); | 3208 | check_flags(flags); |
3221 | 3209 | ||
3222 | current->lockdep_recursion = 1; | 3210 | current->lockdep_recursion = 1; |
3211 | trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); | ||
3223 | __lock_acquire(lock, subclass, trylock, read, check, | 3212 | __lock_acquire(lock, subclass, trylock, read, check, |
3224 | irqs_disabled_flags(flags), nest_lock, ip, 0); | 3213 | irqs_disabled_flags(flags), nest_lock, ip, 0); |
3225 | current->lockdep_recursion = 0; | 3214 | current->lockdep_recursion = 0; |
@@ -3232,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested, | |||
3232 | { | 3221 | { |
3233 | unsigned long flags; | 3222 | unsigned long flags; |
3234 | 3223 | ||
3235 | trace_lock_release(lock, nested, ip); | ||
3236 | |||
3237 | if (unlikely(current->lockdep_recursion)) | 3224 | if (unlikely(current->lockdep_recursion)) |
3238 | return; | 3225 | return; |
3239 | 3226 | ||
3240 | raw_local_irq_save(flags); | 3227 | raw_local_irq_save(flags); |
3241 | check_flags(flags); | 3228 | check_flags(flags); |
3242 | current->lockdep_recursion = 1; | 3229 | current->lockdep_recursion = 1; |
3230 | trace_lock_release(lock, nested, ip); | ||
3243 | __lock_release(lock, nested, ip); | 3231 | __lock_release(lock, nested, ip); |
3244 | current->lockdep_recursion = 0; | 3232 | current->lockdep_recursion = 0; |
3245 | raw_local_irq_restore(flags); | 3233 | raw_local_irq_restore(flags); |
@@ -3413,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
3413 | { | 3401 | { |
3414 | unsigned long flags; | 3402 | unsigned long flags; |
3415 | 3403 | ||
3416 | trace_lock_contended(lock, ip); | ||
3417 | |||
3418 | if (unlikely(!lock_stat)) | 3404 | if (unlikely(!lock_stat)) |
3419 | return; | 3405 | return; |
3420 | 3406 | ||
@@ -3424,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
3424 | raw_local_irq_save(flags); | 3410 | raw_local_irq_save(flags); |
3425 | check_flags(flags); | 3411 | check_flags(flags); |
3426 | current->lockdep_recursion = 1; | 3412 | current->lockdep_recursion = 1; |
3413 | trace_lock_contended(lock, ip); | ||
3427 | __lock_contended(lock, ip); | 3414 | __lock_contended(lock, ip); |
3428 | current->lockdep_recursion = 0; | 3415 | current->lockdep_recursion = 0; |
3429 | raw_local_irq_restore(flags); | 3416 | raw_local_irq_restore(flags); |
@@ -3809,3 +3796,22 @@ void lockdep_sys_exit(void) | |||
3809 | lockdep_print_held_locks(curr); | 3796 | lockdep_print_held_locks(curr); |
3810 | } | 3797 | } |
3811 | } | 3798 | } |
3799 | |||
3800 | void lockdep_rcu_dereference(const char *file, const int line) | ||
3801 | { | ||
3802 | struct task_struct *curr = current; | ||
3803 | |||
3804 | if (!debug_locks_off()) | ||
3805 | return; | ||
3806 | printk("\n===================================================\n"); | ||
3807 | printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); | ||
3808 | printk( "---------------------------------------------------\n"); | ||
3809 | printk("%s:%d invoked rcu_dereference_check() without protection!\n", | ||
3810 | file, line); | ||
3811 | printk("\nother info that might help us debug this:\n\n"); | ||
3812 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | ||
3813 | lockdep_print_held_locks(curr); | ||
3814 | printk("\nstack backtrace:\n"); | ||
3815 | dump_stack(); | ||
3816 | } | ||
3817 | EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); | ||
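lockdep_rcu_dereference() is the reporting half of RCU proving: rcu_dereference_check() calls it when the caller is neither in an RCU read-side critical section nor satisfies its lockdep condition. A hedged sketch of a caller, assuming the rcu_dereference_check()/lockdep_is_held() helpers of this kernel series (struct and lock names invented):

    static DEFINE_SPINLOCK(gp_lock);
    static struct foo *gp;   /* RCU-protected, also stable under gp_lock */

    static struct foo *get_foo(void)
    {
            /* Legal under rcu_read_lock() or with gp_lock held; any
             * other context fails the check and prints the splat above. */
            return rcu_dereference_check(gp, lockdep_is_held(&gp_lock));
    }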
diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..1016b75b026a 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module); | |||
370 | 370 | ||
371 | #ifdef CONFIG_SMP | 371 | #ifdef CONFIG_SMP |
372 | 372 | ||
373 | static void *percpu_modalloc(unsigned long size, unsigned long align, | 373 | static inline void __percpu *mod_percpu(struct module *mod) |
374 | const char *name) | ||
375 | { | 374 | { |
376 | void *ptr; | 375 | return mod->percpu; |
376 | } | ||
377 | 377 | ||
378 | static int percpu_modalloc(struct module *mod, | ||
379 | unsigned long size, unsigned long align) | ||
380 | { | ||
378 | if (align > PAGE_SIZE) { | 381 | if (align > PAGE_SIZE) { |
379 | printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", | 382 | printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", |
380 | name, align, PAGE_SIZE); | 383 | mod->name, align, PAGE_SIZE); |
381 | align = PAGE_SIZE; | 384 | align = PAGE_SIZE; |
382 | } | 385 | } |
383 | 386 | ||
384 | ptr = __alloc_reserved_percpu(size, align); | 387 | mod->percpu = __alloc_reserved_percpu(size, align); |
385 | if (!ptr) | 388 | if (!mod->percpu) { |
386 | printk(KERN_WARNING | 389 | printk(KERN_WARNING |
387 | "Could not allocate %lu bytes percpu data\n", size); | 390 | "Could not allocate %lu bytes percpu data\n", size); |
388 | return ptr; | 391 | return -ENOMEM; |
392 | } | ||
393 | mod->percpu_size = size; | ||
394 | return 0; | ||
389 | } | 395 | } |
390 | 396 | ||
391 | static void percpu_modfree(void *freeme) | 397 | static void percpu_modfree(struct module *mod) |
392 | { | 398 | { |
393 | free_percpu(freeme); | 399 | free_percpu(mod->percpu); |
394 | } | 400 | } |
395 | 401 | ||
396 | static unsigned int find_pcpusec(Elf_Ehdr *hdr, | 402 | static unsigned int find_pcpusec(Elf_Ehdr *hdr, |
@@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, | |||
400 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); | 406 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); |
401 | } | 407 | } |
402 | 408 | ||
403 | static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) | 409 | static void percpu_modcopy(struct module *mod, |
410 | const void *from, unsigned long size) | ||
404 | { | 411 | { |
405 | int cpu; | 412 | int cpu; |
406 | 413 | ||
407 | for_each_possible_cpu(cpu) | 414 | for_each_possible_cpu(cpu) |
408 | memcpy(pcpudest + per_cpu_offset(cpu), from, size); | 415 | memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); |
416 | } | ||
417 | |||
418 | /** | ||
419 | * is_module_percpu_address - test whether address is from module static percpu | ||
420 | * @addr: address to test | ||
421 | * | ||
422 | * Test whether @addr belongs to module static percpu area. | ||
423 | * | ||
424 | * RETURNS: | ||
425 | * %true if @addr is from module static percpu area | ||
426 | */ | ||
427 | bool is_module_percpu_address(unsigned long addr) | ||
428 | { | ||
429 | struct module *mod; | ||
430 | unsigned int cpu; | ||
431 | |||
432 | preempt_disable(); | ||
433 | |||
434 | list_for_each_entry_rcu(mod, &modules, list) { | ||
435 | if (!mod->percpu_size) | ||
436 | continue; | ||
437 | for_each_possible_cpu(cpu) { | ||
438 | void *start = per_cpu_ptr(mod->percpu, cpu); | ||
439 | |||
440 | if ((void *)addr >= start && | ||
441 | (void *)addr < start + mod->percpu_size) { | ||
442 | preempt_enable(); | ||
443 | return true; | ||
444 | } | ||
445 | } | ||
446 | } | ||
447 | |||
448 | preempt_enable(); | ||
449 | return false; | ||
409 | } | 450 | } |
410 | 451 | ||
411 | #else /* ... !CONFIG_SMP */ | 452 | #else /* ... !CONFIG_SMP */ |
412 | 453 | ||
413 | static inline void *percpu_modalloc(unsigned long size, unsigned long align, | 454 | static inline void __percpu *mod_percpu(struct module *mod) |
414 | const char *name) | ||
415 | { | 455 | { |
416 | return NULL; | 456 | return NULL; |
417 | } | 457 | } |
418 | static inline void percpu_modfree(void *pcpuptr) | 458 | static inline int percpu_modalloc(struct module *mod, |
459 | unsigned long size, unsigned long align) | ||
460 | { | ||
461 | return -ENOMEM; | ||
462 | } | ||
463 | static inline void percpu_modfree(struct module *mod) | ||
419 | { | 464 | { |
420 | BUG(); | ||
421 | } | 465 | } |
422 | static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, | 466 | static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, |
423 | Elf_Shdr *sechdrs, | 467 | Elf_Shdr *sechdrs, |
@@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, | |||
425 | { | 469 | { |
426 | return 0; | 470 | return 0; |
427 | } | 471 | } |
428 | static inline void percpu_modcopy(void *pcpudst, const void *src, | 472 | static inline void percpu_modcopy(struct module *mod, |
429 | unsigned long size) | 473 | const void *from, unsigned long size) |
430 | { | 474 | { |
431 | /* pcpusec should be 0, and size of that section should be 0. */ | 475 | /* pcpusec should be 0, and size of that section should be 0. */ |
432 | BUG_ON(size != 0); | 476 | BUG_ON(size != 0); |
433 | } | 477 | } |
478 | bool is_module_percpu_address(unsigned long addr) | ||
479 | { | ||
480 | return false; | ||
481 | } | ||
434 | 482 | ||
435 | #endif /* CONFIG_SMP */ | 483 | #endif /* CONFIG_SMP */ |
436 | 484 | ||
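is_module_percpu_address() lets address-auditing code recognize module static percpu storage, which previously looked like dynamic memory. The lockdep.c hunk earlier in this commit is the intended consumer; a condensed sketch of that use (helper name invented):

    /* Does addr point at storage that lives as long as its code does? */
    static int addr_is_static(unsigned long addr)
    {
            return is_kernel_percpu_address(addr) ||
                   is_module_address(addr) ||
                   is_module_percpu_address(addr);
    }

The preempt_disable() in the SMP version keeps the RCU-walked module list stable for the duration of the scan.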
@@ -473,10 +521,13 @@ static void module_unload_init(struct module *mod) | |||
473 | int cpu; | 521 | int cpu; |
474 | 522 | ||
475 | INIT_LIST_HEAD(&mod->modules_which_use_me); | 523 | INIT_LIST_HEAD(&mod->modules_which_use_me); |
476 | for_each_possible_cpu(cpu) | 524 | for_each_possible_cpu(cpu) { |
477 | local_set(__module_ref_addr(mod, cpu), 0); | 525 | per_cpu_ptr(mod->refptr, cpu)->incs = 0; |
526 | per_cpu_ptr(mod->refptr, cpu)->decs = 0; | ||
527 | } | ||
528 | |||
478 | /* Hold reference count during initialization. */ | 529 | /* Hold reference count during initialization. */ |
479 | local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); | 530 | __this_cpu_write(mod->refptr->incs, 1); |
480 | /* Backwards compatibility macros put refcount during init. */ | 531 | /* Backwards compatibility macros put refcount during init. */ |
481 | mod->waiter = current; | 532 | mod->waiter = current; |
482 | } | 533 | } |
@@ -615,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced) | |||
615 | 666 | ||
616 | unsigned int module_refcount(struct module *mod) | 667 | unsigned int module_refcount(struct module *mod) |
617 | { | 668 | { |
618 | unsigned int total = 0; | 669 | unsigned int incs = 0, decs = 0; |
619 | int cpu; | 670 | int cpu; |
620 | 671 | ||
621 | for_each_possible_cpu(cpu) | 672 | for_each_possible_cpu(cpu) |
622 | total += local_read(__module_ref_addr(mod, cpu)); | 673 | decs += per_cpu_ptr(mod->refptr, cpu)->decs; |
623 | return total; | 674 | /* |
675 | * ensure the incs are added up after the decs. | ||
676 | * module_put ensures incs are visible before decs with smp_wmb. | ||
677 | * | ||
678 | * This 2-count scheme avoids the situation where the refcount | ||
679 | * for CPU0 is read, then CPU0 increments the module refcount, | ||
680 | * then CPU1 drops that refcount, then the refcount for CPU1 is | ||
681 | * read. We would record a decrement but not its corresponding | ||
682 | * increment so we would see a low count (disaster). | ||
683 | * | ||
684 | * Rare situation? But module_refcount can be preempted, and we | ||
685 | * might be tallying up 4096+ CPUs. So it is not impossible. | ||
686 | */ | ||
687 | smp_rmb(); | ||
688 | for_each_possible_cpu(cpu) | ||
689 | incs += per_cpu_ptr(mod->refptr, cpu)->incs; | ||
690 | return incs - decs; | ||
624 | } | 691 | } |
625 | EXPORT_SYMBOL(module_refcount); | 692 | EXPORT_SYMBOL(module_refcount); |
626 | 693 | ||
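The incs/decs split replaces one local_t per cpu. Summing all decs first, then an smp_rmb(), then all incs guarantees that any decrement included in the total has its matching increment visible too, so a racing reader can only over-estimate the count, never see the disastrously low value the comment describes. A self-contained sketch of the scheme, with hypothetical names:

    struct ref2 { unsigned int incs, decs; };

    static void ref_get(struct ref2 __percpu *r)
    {
            preempt_disable();
            __this_cpu_inc(r->incs);
            preempt_enable();
    }

    static void ref_put(struct ref2 __percpu *r)
    {
            preempt_disable();
            smp_wmb();      /* this cpu's incs visible before its decs */
            __this_cpu_inc(r->decs);
            preempt_enable();
    }

    static unsigned int ref_read(struct ref2 __percpu *r)
    {
            unsigned int incs = 0, decs = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    decs += per_cpu_ptr(r, cpu)->decs;
            smp_rmb();      /* sum decs strictly before incs */
            for_each_possible_cpu(cpu)
                    incs += per_cpu_ptr(r, cpu)->incs;
            return incs - decs;
    }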
@@ -796,14 +863,16 @@ static struct module_attribute refcnt = { | |||
796 | void module_put(struct module *module) | 863 | void module_put(struct module *module) |
797 | { | 864 | { |
798 | if (module) { | 865 | if (module) { |
799 | unsigned int cpu = get_cpu(); | 866 | preempt_disable(); |
800 | local_dec(__module_ref_addr(module, cpu)); | 867 | smp_wmb(); /* see comment in module_refcount */ |
868 | __this_cpu_inc(module->refptr->decs); | ||
869 | |||
801 | trace_module_put(module, _RET_IP_, | 870 | trace_module_put(module, _RET_IP_, |
802 | local_read(__module_ref_addr(module, cpu))); | 871 | __this_cpu_read(module->refptr->decs)); |
803 | /* Maybe they're waiting for us to drop reference? */ | 872 | /* Maybe they're waiting for us to drop reference? */ |
804 | if (unlikely(!module_is_live(module))) | 873 | if (unlikely(!module_is_live(module))) |
805 | wake_up_process(module->waiter); | 874 | wake_up_process(module->waiter); |
806 | put_cpu(); | 875 | preempt_enable(); |
807 | } | 876 | } |
808 | } | 877 | } |
809 | EXPORT_SYMBOL(module_put); | 878 | EXPORT_SYMBOL(module_put); |
@@ -1083,6 +1152,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1083 | if (sattr->name == NULL) | 1152 | if (sattr->name == NULL) |
1084 | goto out; | 1153 | goto out; |
1085 | sect_attrs->nsections++; | 1154 | sect_attrs->nsections++; |
1155 | sysfs_attr_init(&sattr->mattr.attr); | ||
1086 | sattr->mattr.show = module_sect_show; | 1156 | sattr->mattr.show = module_sect_show; |
1087 | sattr->mattr.store = NULL; | 1157 | sattr->mattr.store = NULL; |
1088 | sattr->mattr.attr.name = sattr->name; | 1158 | sattr->mattr.attr.name = sattr->name; |
@@ -1178,6 +1248,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1178 | if (sect_empty(&sechdrs[i])) | 1248 | if (sect_empty(&sechdrs[i])) |
1179 | continue; | 1249 | continue; |
1180 | if (sechdrs[i].sh_type == SHT_NOTE) { | 1250 | if (sechdrs[i].sh_type == SHT_NOTE) { |
1251 | sysfs_bin_attr_init(nattr); | ||
1181 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; | 1252 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; |
1182 | nattr->attr.mode = S_IRUGO; | 1253 | nattr->attr.mode = S_IRUGO; |
1183 | nattr->size = sechdrs[i].sh_size; | 1254 | nattr->size = sechdrs[i].sh_size; |
@@ -1250,6 +1321,7 @@ int module_add_modinfo_attrs(struct module *mod) | |||
1250 | if (!attr->test || | 1321 | if (!attr->test || |
1251 | (attr->test && attr->test(mod))) { | 1322 | (attr->test && attr->test(mod))) { |
1252 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1323 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
1324 | sysfs_attr_init(&temp_attr->attr); | ||
1253 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1325 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); |
1254 | ++temp_attr; | 1326 | ++temp_attr; |
1255 | } | 1327 | } |
@@ -1395,11 +1467,10 @@ static void free_module(struct module *mod) | |||
1395 | /* This may be NULL, but that's OK */ | 1467 | /* This may be NULL, but that's OK */ |
1396 | module_free(mod, mod->module_init); | 1468 | module_free(mod, mod->module_init); |
1397 | kfree(mod->args); | 1469 | kfree(mod->args); |
1398 | if (mod->percpu) | 1470 | percpu_modfree(mod); |
1399 | percpu_modfree(mod->percpu); | 1471 | #if defined(CONFIG_MODULE_UNLOAD) |
1400 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | ||
1401 | if (mod->refptr) | 1472 | if (mod->refptr) |
1402 | percpu_modfree(mod->refptr); | 1473 | free_percpu(mod->refptr); |
1403 | #endif | 1474 | #endif |
1404 | /* Free lock-classes: */ | 1475 | /* Free lock-classes: */ |
1405 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1476 | lockdep_free_key_range(mod->module_core, mod->core_size); |
@@ -1515,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1515 | default: | 1586 | default: |
1516 | /* Divert to percpu allocation if a percpu var. */ | 1587 | /* Divert to percpu allocation if a percpu var. */ |
1517 | if (sym[i].st_shndx == pcpuindex) | 1588 | if (sym[i].st_shndx == pcpuindex) |
1518 | secbase = (unsigned long)mod->percpu; | 1589 | secbase = (unsigned long)mod_percpu(mod); |
1519 | else | 1590 | else |
1520 | secbase = sechdrs[sym[i].st_shndx].sh_addr; | 1591 | secbase = sechdrs[sym[i].st_shndx].sh_addr; |
1521 | sym[i].st_value += secbase; | 1592 | sym[i].st_value += secbase; |
@@ -1949,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod, | |||
1949 | unsigned int modindex, versindex, infoindex, pcpuindex; | 2020 | unsigned int modindex, versindex, infoindex, pcpuindex; |
1950 | struct module *mod; | 2021 | struct module *mod; |
1951 | long err = 0; | 2022 | long err = 0; |
1952 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 2023 | void *ptr = NULL; /* Stops spurious gcc warning */ |
1953 | unsigned long symoffs, stroffs, *strmap; | 2024 | unsigned long symoffs, stroffs, *strmap; |
1954 | 2025 | ||
1955 | mm_segment_t old_fs; | 2026 | mm_segment_t old_fs; |
@@ -2089,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod, | |||
2089 | 2160 | ||
2090 | if (pcpuindex) { | 2161 | if (pcpuindex) { |
2091 | /* We have a special allocation for this section. */ | 2162 | /* We have a special allocation for this section. */ |
2092 | percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, | 2163 | err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, |
2093 | sechdrs[pcpuindex].sh_addralign, | 2164 | sechdrs[pcpuindex].sh_addralign); |
2094 | mod->name); | 2165 | if (err) |
2095 | if (!percpu) { | ||
2096 | err = -ENOMEM; | ||
2097 | goto free_mod; | 2166 | goto free_mod; |
2098 | } | ||
2099 | sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2167 | sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
2100 | mod->percpu = percpu; | ||
2101 | } | 2168 | } |
2102 | 2169 | ||
2103 | /* Determine total sizes, and put offsets in sh_entsize. For now | 2170 | /* Determine total sizes, and put offsets in sh_entsize. For now |
@@ -2162,9 +2229,8 @@ static noinline struct module *load_module(void __user *umod, | |||
2162 | mod = (void *)sechdrs[modindex].sh_addr; | 2229 | mod = (void *)sechdrs[modindex].sh_addr; |
2163 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | 2230 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); |
2164 | 2231 | ||
2165 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2232 | #if defined(CONFIG_MODULE_UNLOAD) |
2166 | mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), | 2233 | mod->refptr = alloc_percpu(struct module_ref); |
2167 | mod->name); | ||
2168 | if (!mod->refptr) { | 2234 | if (!mod->refptr) { |
2169 | err = -ENOMEM; | 2235 | err = -ENOMEM; |
2170 | goto free_init; | 2236 | goto free_init; |
@@ -2313,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod, | |||
2313 | sort_extable(mod->extable, mod->extable + mod->num_exentries); | 2379 | sort_extable(mod->extable, mod->extable + mod->num_exentries); |
2314 | 2380 | ||
2315 | /* Finally, copy percpu area over. */ | 2381 | /* Finally, copy percpu area over. */ |
2316 | percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, | 2382 | percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, |
2317 | sechdrs[pcpuindex].sh_size); | 2383 | sechdrs[pcpuindex].sh_size); |
2318 | 2384 | ||
2319 | add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, | 2385 | add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, |
@@ -2396,8 +2462,8 @@ static noinline struct module *load_module(void __user *umod, | |||
2396 | kobject_put(&mod->mkobj.kobj); | 2462 | kobject_put(&mod->mkobj.kobj); |
2397 | free_unload: | 2463 | free_unload: |
2398 | module_unload_free(mod); | 2464 | module_unload_free(mod); |
2399 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2465 | #if defined(CONFIG_MODULE_UNLOAD) |
2400 | percpu_modfree(mod->refptr); | 2466 | free_percpu(mod->refptr); |
2401 | free_init: | 2467 | free_init: |
2402 | #endif | 2468 | #endif |
2403 | module_free(mod, mod->module_init); | 2469 | module_free(mod, mod->module_init); |
@@ -2405,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod, | |||
2405 | module_free(mod, mod->module_core); | 2471 | module_free(mod, mod->module_core); |
2406 | /* mod will be freed with core. Don't access it beyond this line! */ | 2472 | /* mod will be freed with core. Don't access it beyond this line! */ |
2407 | free_percpu: | 2473 | free_percpu: |
2408 | if (percpu) | 2474 | percpu_modfree(mod); |
2409 | percpu_modfree(percpu); | ||
2410 | free_mod: | 2475 | free_mod: |
2411 | kfree(args); | 2476 | kfree(args); |
2412 | kfree(strmap); | 2477 | kfree(strmap); |
diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7643eb..2488ba7eb568 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
78 | int ret = NOTIFY_DONE; | 78 | int ret = NOTIFY_DONE; |
79 | struct notifier_block *nb, *next_nb; | 79 | struct notifier_block *nb, *next_nb; |
80 | 80 | ||
81 | nb = rcu_dereference(*nl); | 81 | nb = rcu_dereference_raw(*nl); |
82 | 82 | ||
83 | while (nb && nr_to_call) { | 83 | while (nb && nr_to_call) { |
84 | next_nb = rcu_dereference(nb->next); | 84 | next_nb = rcu_dereference_raw(nb->next); |
85 | 85 | ||
86 | #ifdef CONFIG_DEBUG_NOTIFIERS | 86 | #ifdef CONFIG_DEBUG_NOTIFIERS |
87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { | 87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { |
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, | |||
309 | * racy then it does not matter what the result of the test | 309 | * racy then it does not matter what the result of the test |
310 | * is, we re-check the list after having taken the lock anyway: | 310 | * is, we re-check the list after having taken the lock anyway: |
311 | */ | 311 | */ |
312 | if (rcu_dereference(nh->head)) { | 312 | if (rcu_dereference_raw(nh->head)) { |
313 | down_read(&nh->rwsem); | 313 | down_read(&nh->rwsem); |
314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, | 314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, |
315 | nr_calls); | 315 | nr_calls); |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b2..f74e6c00e26d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -13,6 +13,7 @@ | |||
13 | * Pavel Emelianov <xemul@openvz.org> | 13 | * Pavel Emelianov <xemul@openvz.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/slab.h> | ||
16 | #include <linux/module.h> | 17 | #include <linux/module.h> |
17 | #include <linux/nsproxy.h> | 18 | #include <linux/nsproxy.h> |
18 | #include <linux/init_task.h> | 19 | #include <linux/init_task.h> |
@@ -24,7 +25,18 @@ | |||
24 | 25 | ||
25 | static struct kmem_cache *nsproxy_cachep; | 26 | static struct kmem_cache *nsproxy_cachep; |
26 | 27 | ||
27 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 28 | struct nsproxy init_nsproxy = { |
29 | .count = ATOMIC_INIT(1), | ||
30 | .uts_ns = &init_uts_ns, | ||
31 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) | ||
32 | .ipc_ns = &init_ipc_ns, | ||
33 | #endif | ||
34 | .mnt_ns = NULL, | ||
35 | .pid_ns = &init_pid_ns, | ||
36 | #ifdef CONFIG_NET | ||
37 | .net_ns = &init_net, | ||
38 | #endif | ||
39 | }; | ||
28 | 40 | ||
29 | static inline struct nsproxy *create_nsproxy(void) | 41 | static inline struct nsproxy *create_nsproxy(void) |
30 | { | 42 | { |
diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..fd03513c7327 --- /dev/null +++ b/kernel/padata.c | |||
@@ -0,0 +1,697 @@ | |||
1 | /* | ||
2 | * padata.c - generic interface to process data streams in parallel | ||
3 | * | ||
4 | * Copyright (C) 2008, 2009 secunet Security Networks AG | ||
5 | * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms and conditions of the GNU General Public License, | ||
9 | * version 2, as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License along with | ||
17 | * this program; if not, write to the Free Software Foundation, Inc., | ||
18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/err.h> | ||
24 | #include <linux/cpu.h> | ||
25 | #include <linux/padata.h> | ||
26 | #include <linux/mutex.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/rcupdate.h> | ||
30 | |||
31 | #define MAX_SEQ_NR (INT_MAX - NR_CPUS) | ||
32 | #define MAX_OBJ_NUM (10000 * NR_CPUS) | ||
33 | |||
34 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | ||
35 | { | ||
36 | int cpu, target_cpu; | ||
37 | |||
38 | target_cpu = cpumask_first(pd->cpumask); | ||
39 | for (cpu = 0; cpu < cpu_index; cpu++) | ||
40 | target_cpu = cpumask_next(target_cpu, pd->cpumask); | ||
41 | |||
42 | return target_cpu; | ||
43 | } | ||
44 | |||
45 | static int padata_cpu_hash(struct padata_priv *padata) | ||
46 | { | ||
47 | int cpu_index; | ||
48 | struct parallel_data *pd; | ||
49 | |||
50 | pd = padata->pd; | ||
51 | |||
52 | /* | ||
53 | * Hash the sequence numbers to the cpus by taking | ||
54 | * seq_nr mod. number of cpus in use. | ||
55 | */ | ||
56 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); | ||
57 | |||
58 | return padata_index_to_cpu(pd, cpu_index); | ||
59 | } | ||
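To make the round-robin concrete, a short trace under an assumed configuration (cpumask and sequence numbers are hypothetical):

    /*
     * pd->cpumask = {1, 4, 7}, so cpumask_weight() == 3:
     *
     *   seq_nr 0 -> cpu_index 0 -> cpu 1
     *   seq_nr 1 -> cpu_index 1 -> cpu 4
     *   seq_nr 2 -> cpu_index 2 -> cpu 7
     *   seq_nr 3 -> cpu_index 0 -> cpu 1   (wraps around)
     */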
60 | |||
61 | static void padata_parallel_worker(struct work_struct *work) | ||
62 | { | ||
63 | struct padata_queue *queue; | ||
64 | struct parallel_data *pd; | ||
65 | struct padata_instance *pinst; | ||
66 | LIST_HEAD(local_list); | ||
67 | |||
68 | local_bh_disable(); | ||
69 | queue = container_of(work, struct padata_queue, pwork); | ||
70 | pd = queue->pd; | ||
71 | pinst = pd->pinst; | ||
72 | |||
73 | spin_lock(&queue->parallel.lock); | ||
74 | list_replace_init(&queue->parallel.list, &local_list); | ||
75 | spin_unlock(&queue->parallel.lock); | ||
76 | |||
77 | while (!list_empty(&local_list)) { | ||
78 | struct padata_priv *padata; | ||
79 | |||
80 | padata = list_entry(local_list.next, | ||
81 | struct padata_priv, list); | ||
82 | |||
83 | list_del_init(&padata->list); | ||
84 | |||
85 | padata->parallel(padata); | ||
86 | } | ||
87 | |||
88 | local_bh_enable(); | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * padata_do_parallel - padata parallelization function | ||
93 | * | ||
94 | * @pinst: padata instance | ||
95 | * @padata: object to be parallelized | ||
96 | * @cb_cpu: cpu the serialization callback function will run on, | ||
97 | * must be in the cpumask of padata. | ||
98 | * | ||
99 | * The parallelization callback function will run with BHs off. | ||
100 | * Note: Every object which is parallelized by padata_do_parallel | ||
101 | * must be seen by padata_do_serial. | ||
102 | */ | ||
103 | int padata_do_parallel(struct padata_instance *pinst, | ||
104 | struct padata_priv *padata, int cb_cpu) | ||
105 | { | ||
106 | int target_cpu, err; | ||
107 | struct padata_queue *queue; | ||
108 | struct parallel_data *pd; | ||
109 | |||
110 | rcu_read_lock_bh(); | ||
111 | |||
112 | pd = rcu_dereference(pinst->pd); | ||
113 | |||
114 | err = 0; | ||
115 | if (!(pinst->flags & PADATA_INIT)) | ||
116 | goto out; | ||
117 | |||
118 | err = -EBUSY; | ||
119 | if ((pinst->flags & PADATA_RESET)) | ||
120 | goto out; | ||
121 | |||
122 | if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) | ||
123 | goto out; | ||
124 | |||
125 | err = -EINVAL; | ||
126 | if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) | ||
127 | goto out; | ||
128 | |||
129 | err = -EINPROGRESS; | ||
130 | atomic_inc(&pd->refcnt); | ||
131 | padata->pd = pd; | ||
132 | padata->cb_cpu = cb_cpu; | ||
133 | |||
134 | if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) | ||
135 | atomic_set(&pd->seq_nr, -1); | ||
136 | |||
137 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); | ||
138 | |||
139 | target_cpu = padata_cpu_hash(padata); | ||
140 | queue = per_cpu_ptr(pd->queue, target_cpu); | ||
141 | |||
142 | spin_lock(&queue->parallel.lock); | ||
143 | list_add_tail(&padata->list, &queue->parallel.list); | ||
144 | spin_unlock(&queue->parallel.lock); | ||
145 | |||
146 | queue_work_on(target_cpu, pinst->wq, &queue->pwork); | ||
147 | |||
148 | out: | ||
149 | rcu_read_unlock_bh(); | ||
150 | |||
151 | return err; | ||
152 | } | ||
153 | EXPORT_SYMBOL(padata_do_parallel); | ||
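A user embeds struct padata_priv in its own job object and fills in the two callbacks before submitting; -EINPROGRESS is the expected success return. A minimal hedged sketch (my_job and its callbacks are invented for illustration, error handling elided):

    struct my_job {
            struct padata_priv padata;      /* must be embedded */
            /* ... job payload ... */
    };

    static void my_parallel(struct padata_priv *padata)
    {
            struct my_job *job = container_of(padata, struct my_job, padata);

            /* heavy work; runs with BHs off on some cpu in the mask */
            padata_do_serial(padata);       /* hand back for ordering */
    }

    static void my_serial(struct padata_priv *padata)
    {
            /* runs on cb_cpu, in the original submission order */
    }

    static int my_submit(struct padata_instance *pinst,
                         struct my_job *job, int cb_cpu)
    {
            job->padata.parallel = my_parallel;
            job->padata.serial   = my_serial;
            return padata_do_parallel(pinst, &job->padata, cb_cpu);
    }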
154 | |||
155 | static struct padata_priv *padata_get_next(struct parallel_data *pd) | ||
156 | { | ||
157 | int cpu, num_cpus, empty, calc_seq_nr; | ||
158 | int seq_nr, next_nr, overrun, next_overrun; | ||
159 | struct padata_queue *queue, *next_queue; | ||
160 | struct padata_priv *padata; | ||
161 | struct padata_list *reorder; | ||
162 | |||
163 | empty = 0; | ||
164 | next_nr = -1; | ||
165 | next_overrun = 0; | ||
166 | next_queue = NULL; | ||
167 | |||
168 | num_cpus = cpumask_weight(pd->cpumask); | ||
169 | |||
170 | for_each_cpu(cpu, pd->cpumask) { | ||
171 | queue = per_cpu_ptr(pd->queue, cpu); | ||
172 | reorder = &queue->reorder; | ||
173 | |||
174 | /* | ||
175 | * Calculate the seq_nr of the object that should be | ||
176 | * next in this queue. | ||
177 | */ | ||
178 | overrun = 0; | ||
179 | calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) | ||
180 | + queue->cpu_index; | ||
181 | |||
182 | if (unlikely(calc_seq_nr > pd->max_seq_nr)) { | ||
183 | calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; | ||
184 | overrun = 1; | ||
185 | } | ||
186 | |||
187 | if (!list_empty(&reorder->list)) { | ||
188 | padata = list_entry(reorder->list.next, | ||
189 | struct padata_priv, list); | ||
190 | |||
191 | seq_nr = padata->seq_nr; | ||
192 | BUG_ON(calc_seq_nr != seq_nr); | ||
193 | } else { | ||
194 | seq_nr = calc_seq_nr; | ||
195 | empty++; | ||
196 | } | ||
197 | |||
198 | if (next_nr < 0 || seq_nr < next_nr | ||
199 | || (next_overrun && !overrun)) { | ||
200 | next_nr = seq_nr; | ||
201 | next_overrun = overrun; | ||
202 | next_queue = queue; | ||
203 | } | ||
204 | } | ||
205 | |||
206 | padata = NULL; | ||
207 | |||
208 | if (empty == num_cpus) | ||
209 | goto out; | ||
210 | |||
211 | reorder = &next_queue->reorder; | ||
212 | |||
213 | if (!list_empty(&reorder->list)) { | ||
214 | padata = list_entry(reorder->list.next, | ||
215 | struct padata_priv, list); | ||
216 | |||
217 | if (unlikely(next_overrun)) { | ||
218 | for_each_cpu(cpu, pd->cpumask) { | ||
219 | queue = per_cpu_ptr(pd->queue, cpu); | ||
220 | atomic_set(&queue->num_obj, 0); | ||
221 | } | ||
222 | } | ||
223 | |||
224 | spin_lock(&reorder->lock); | ||
225 | list_del_init(&padata->list); | ||
226 | atomic_dec(&pd->reorder_objects); | ||
227 | spin_unlock(&reorder->lock); | ||
228 | |||
229 | atomic_inc(&next_queue->num_obj); | ||
230 | |||
231 | goto out; | ||
232 | } | ||
233 | |||
234 | if (next_nr % num_cpus == next_queue->cpu_index) { | ||
235 | padata = ERR_PTR(-ENODATA); | ||
236 | goto out; | ||
237 | } | ||
238 | |||
239 | padata = ERR_PTR(-EINPROGRESS); | ||
240 | out: | ||
241 | return padata; | ||
242 | } | ||
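padata_get_next() multiplexes four outcomes through its return value, and padata_reorder() below depends on exactly this contract. Summarized from the code (the phrasing is an interpretation, not kernel documentation):

    /*
     *   valid pointer          next in-sequence object; move it to the
     *                          serialization queue of its cb_cpu
     *   NULL                   every reorder queue was empty
     *   ERR_PTR(-ENODATA)      the next seq_nr will be produced by the
     *                          selected queue's own cpu; stop here, its
     *                          padata_do_serial() re-triggers the reorder
     *   ERR_PTR(-EINPROGRESS)  the next object is still in flight on
     *                          another cpu; break out and retry later
     */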
243 | |||
244 | static void padata_reorder(struct parallel_data *pd) | ||
245 | { | ||
246 | struct padata_priv *padata; | ||
247 | struct padata_queue *queue; | ||
248 | struct padata_instance *pinst = pd->pinst; | ||
249 | |||
250 | try_again: | ||
251 | if (!spin_trylock_bh(&pd->lock)) | ||
252 | goto out; | ||
253 | |||
254 | while (1) { | ||
255 | padata = padata_get_next(pd); | ||
256 | |||
257 | if (!padata || PTR_ERR(padata) == -EINPROGRESS) | ||
258 | break; | ||
259 | |||
260 | if (PTR_ERR(padata) == -ENODATA) { | ||
261 | spin_unlock_bh(&pd->lock); | ||
262 | goto out; | ||
263 | } | ||
264 | |||
265 | queue = per_cpu_ptr(pd->queue, padata->cb_cpu); | ||
266 | |||
267 | spin_lock(&queue->serial.lock); | ||
268 | list_add_tail(&padata->list, &queue->serial.list); | ||
269 | spin_unlock(&queue->serial.lock); | ||
270 | |||
271 | queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); | ||
272 | } | ||
273 | |||
274 | spin_unlock_bh(&pd->lock); | ||
275 | |||
276 | if (atomic_read(&pd->reorder_objects)) | ||
277 | goto try_again; | ||
278 | |||
279 | out: | ||
280 | return; | ||
281 | } | ||
282 | |||
283 | static void padata_serial_worker(struct work_struct *work) | ||
284 | { | ||
285 | struct padata_queue *queue; | ||
286 | struct parallel_data *pd; | ||
287 | LIST_HEAD(local_list); | ||
288 | |||
289 | local_bh_disable(); | ||
290 | queue = container_of(work, struct padata_queue, swork); | ||
291 | pd = queue->pd; | ||
292 | |||
293 | spin_lock(&queue->serial.lock); | ||
294 | list_replace_init(&queue->serial.list, &local_list); | ||
295 | spin_unlock(&queue->serial.lock); | ||
296 | |||
297 | while (!list_empty(&local_list)) { | ||
298 | struct padata_priv *padata; | ||
299 | |||
300 | padata = list_entry(local_list.next, | ||
301 | struct padata_priv, list); | ||
302 | |||
303 | list_del_init(&padata->list); | ||
304 | |||
305 | padata->serial(padata); | ||
306 | atomic_dec(&pd->refcnt); | ||
307 | } | ||
308 | local_bh_enable(); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * padata_do_serial - padata serialization function | ||
313 | * | ||
314 | * @padata: object to be serialized. | ||
315 | * | ||
316 | * padata_do_serial must be called for every parallelized object. | ||
317 | * The serialization callback function will run with BHs off. | ||
318 | */ | ||
319 | void padata_do_serial(struct padata_priv *padata) | ||
320 | { | ||
321 | int cpu; | ||
322 | struct padata_queue *queue; | ||
323 | struct parallel_data *pd; | ||
324 | |||
325 | pd = padata->pd; | ||
326 | |||
327 | cpu = get_cpu(); | ||
328 | queue = per_cpu_ptr(pd->queue, cpu); | ||
329 | |||
330 | spin_lock(&queue->reorder.lock); | ||
331 | atomic_inc(&pd->reorder_objects); | ||
332 | list_add_tail(&padata->list, &queue->reorder.list); | ||
333 | spin_unlock(&queue->reorder.lock); | ||
334 | |||
335 | put_cpu(); | ||
336 | |||
337 | padata_reorder(pd); | ||
338 | } | ||
339 | EXPORT_SYMBOL(padata_do_serial); | ||
340 | |||
341 | static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | ||
342 | const struct cpumask *cpumask) | ||
343 | { | ||
344 | int cpu, cpu_index, num_cpus; | ||
345 | struct padata_queue *queue; | ||
346 | struct parallel_data *pd; | ||
347 | |||
348 | cpu_index = 0; | ||
349 | |||
350 | pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); | ||
351 | if (!pd) | ||
352 | goto err; | ||
353 | |||
354 | pd->queue = alloc_percpu(struct padata_queue); | ||
355 | if (!pd->queue) | ||
356 | goto err_free_pd; | ||
357 | |||
358 | if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) | ||
359 | goto err_free_queue; | ||
360 | |||
361 | for_each_possible_cpu(cpu) { | ||
362 | queue = per_cpu_ptr(pd->queue, cpu); | ||
363 | |||
364 | queue->pd = pd; | ||
365 | |||
366 | if (cpumask_test_cpu(cpu, cpumask) | ||
367 | && cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
368 | queue->cpu_index = cpu_index; | ||
369 | cpu_index++; | ||
370 | } else | ||
371 | queue->cpu_index = -1; | ||
372 | |||
373 | INIT_LIST_HEAD(&queue->reorder.list); | ||
374 | INIT_LIST_HEAD(&queue->parallel.list); | ||
375 | INIT_LIST_HEAD(&queue->serial.list); | ||
376 | spin_lock_init(&queue->reorder.lock); | ||
377 | spin_lock_init(&queue->parallel.lock); | ||
378 | spin_lock_init(&queue->serial.lock); | ||
379 | |||
380 | INIT_WORK(&queue->pwork, padata_parallel_worker); | ||
381 | INIT_WORK(&queue->swork, padata_serial_worker); | ||
382 | atomic_set(&queue->num_obj, 0); | ||
383 | } | ||
384 | |||
385 | cpumask_and(pd->cpumask, cpumask, cpu_active_mask); | ||
386 | |||
387 | num_cpus = cpumask_weight(pd->cpumask); | ||
388 | pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; | ||
389 | |||
390 | atomic_set(&pd->seq_nr, -1); | ||
391 | atomic_set(&pd->reorder_objects, 0); | ||
392 | atomic_set(&pd->refcnt, 0); | ||
393 | pd->pinst = pinst; | ||
394 | spin_lock_init(&pd->lock); | ||
395 | |||
396 | return pd; | ||
397 | |||
398 | err_free_queue: | ||
399 | free_percpu(pd->queue); | ||
400 | err_free_pd: | ||
401 | kfree(pd); | ||
402 | err: | ||
403 | return NULL; | ||
404 | } | ||
405 | |||
406 | static void padata_free_pd(struct parallel_data *pd) | ||
407 | { | ||
408 | free_cpumask_var(pd->cpumask); | ||
409 | free_percpu(pd->queue); | ||
410 | kfree(pd); | ||
411 | } | ||
412 | |||
413 | static void padata_replace(struct padata_instance *pinst, | ||
414 | struct parallel_data *pd_new) | ||
415 | { | ||
416 | struct parallel_data *pd_old = pinst->pd; | ||
417 | |||
418 | pinst->flags |= PADATA_RESET; | ||
419 | |||
420 | rcu_assign_pointer(pinst->pd, pd_new); | ||
421 | |||
422 | synchronize_rcu(); | ||
423 | |||
424 | while (atomic_read(&pd_old->refcnt) != 0) | ||
425 | yield(); | ||
426 | |||
427 | flush_workqueue(pinst->wq); | ||
428 | |||
429 | padata_free_pd(pd_old); | ||
430 | |||
431 | pinst->flags &= ~PADATA_RESET; | ||
432 | } | ||
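padata_replace() is a textbook RCU publish-and-drain swap; the shape of the pattern, annotated:

    rcu_assign_pointer(pinst->pd, pd_new);   /* publish the new state   */
    synchronize_rcu();                       /* old readers are gone    */
    while (atomic_read(&pd_old->refcnt))     /* drain in-flight objects */
            yield();
    padata_free_pd(pd_old);                  /* now safe to tear down   */

The PADATA_RESET flag raised around the swap makes padata_do_parallel() refuse new work with -EBUSY until the replacement is complete.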
433 | |||
434 | /* | ||
435 | * padata_set_cpumask - set the cpumask that padata should use | ||
436 | * | ||
437 | * @pinst: padata instance | ||
438 | * @cpumask: the cpumask to use | ||
439 | */ | ||
440 | int padata_set_cpumask(struct padata_instance *pinst, | ||
441 | cpumask_var_t cpumask) | ||
442 | { | ||
443 | struct parallel_data *pd; | ||
444 | int err = 0; | ||
445 | |||
446 | might_sleep(); | ||
447 | |||
448 | mutex_lock(&pinst->lock); | ||
449 | |||
450 | pd = padata_alloc_pd(pinst, cpumask); | ||
451 | if (!pd) { | ||
452 | err = -ENOMEM; | ||
453 | goto out; | ||
454 | } | ||
455 | |||
456 | cpumask_copy(pinst->cpumask, cpumask); | ||
457 | |||
458 | padata_replace(pinst, pd); | ||
459 | |||
460 | out: | ||
461 | mutex_unlock(&pinst->lock); | ||
462 | |||
463 | return err; | ||
464 | } | ||
465 | EXPORT_SYMBOL(padata_set_cpumask); | ||
466 | |||
467 | static int __padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
468 | { | ||
469 | struct parallel_data *pd; | ||
470 | |||
471 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
472 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
473 | if (!pd) | ||
474 | return -ENOMEM; | ||
475 | |||
476 | padata_replace(pinst, pd); | ||
477 | } | ||
478 | |||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * padata_add_cpu - add a cpu to the padata cpumask | ||
484 | * | ||
485 | * @pinst: padata instance | ||
486 | * @cpu: cpu to add | ||
487 | */ | ||
488 | int padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
489 | { | ||
490 | int err; | ||
491 | |||
492 | might_sleep(); | ||
493 | |||
494 | mutex_lock(&pinst->lock); | ||
495 | |||
496 | cpumask_set_cpu(cpu, pinst->cpumask); | ||
497 | err = __padata_add_cpu(pinst, cpu); | ||
498 | |||
499 | mutex_unlock(&pinst->lock); | ||
500 | |||
501 | return err; | ||
502 | } | ||
503 | EXPORT_SYMBOL(padata_add_cpu); | ||
504 | |||
505 | static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
506 | { | ||
507 | struct parallel_data *pd; | ||
508 | |||
509 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { | ||
510 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
511 | if (!pd) | ||
512 | return -ENOMEM; | ||
513 | |||
514 | padata_replace(pinst, pd); | ||
515 | } | ||
516 | |||
517 | return 0; | ||
518 | } | ||
519 | |||
520 | /* | ||
521 | * padata_remove_cpu - remove a cpu from the padata cpumask | ||
522 | * | ||
523 | * @pinst: padata instance | ||
524 | * @cpu: cpu to remove | ||
525 | */ | ||
526 | int padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
527 | { | ||
528 | int err; | ||
529 | |||
530 | might_sleep(); | ||
531 | |||
532 | mutex_lock(&pinst->lock); | ||
533 | |||
534 | cpumask_clear_cpu(cpu, pinst->cpumask); | ||
535 | err = __padata_remove_cpu(pinst, cpu); | ||
536 | |||
537 | mutex_unlock(&pinst->lock); | ||
538 | |||
539 | return err; | ||
540 | } | ||
541 | EXPORT_SYMBOL(padata_remove_cpu); | ||
542 | |||
543 | /* | ||
544 | * padata_start - start the parallel processing | ||
545 | * | ||
546 | * @pinst: padata instance to start | ||
547 | */ | ||
548 | void padata_start(struct padata_instance *pinst) | ||
549 | { | ||
550 | might_sleep(); | ||
551 | |||
552 | mutex_lock(&pinst->lock); | ||
553 | pinst->flags |= PADATA_INIT; | ||
554 | mutex_unlock(&pinst->lock); | ||
555 | } | ||
556 | EXPORT_SYMBOL(padata_start); | ||
557 | |||
558 | /* | ||
559 | * padata_stop - stop the parallel processing | ||
560 | * | ||
561 | * @pinst: padata instance to stop | ||
562 | */ | ||
563 | void padata_stop(struct padata_instance *pinst) | ||
564 | { | ||
565 | might_sleep(); | ||
566 | |||
567 | mutex_lock(&pinst->lock); | ||
568 | pinst->flags &= ~PADATA_INIT; | ||
569 | mutex_unlock(&pinst->lock); | ||
570 | } | ||
571 | EXPORT_SYMBOL(padata_stop); | ||
572 | |||
573 | static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, | ||
574 | unsigned long action, void *hcpu) | ||
575 | { | ||
576 | int err; | ||
577 | struct padata_instance *pinst; | ||
578 | int cpu = (unsigned long)hcpu; | ||
579 | |||
580 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | ||
581 | |||
582 | switch (action) { | ||
583 | case CPU_ONLINE: | ||
584 | case CPU_ONLINE_FROZEN: | ||
585 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
586 | break; | ||
587 | mutex_lock(&pinst->lock); | ||
588 | err = __padata_add_cpu(pinst, cpu); | ||
589 | mutex_unlock(&pinst->lock); | ||
590 | if (err) | ||
591 | return NOTIFY_BAD; | ||
592 | break; | ||
593 | |||
594 | case CPU_DOWN_PREPARE: | ||
595 | case CPU_DOWN_PREPARE_FROZEN: | ||
596 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
597 | break; | ||
598 | mutex_lock(&pinst->lock); | ||
599 | err = __padata_remove_cpu(pinst, cpu); | ||
600 | mutex_unlock(&pinst->lock); | ||
601 | if (err) | ||
602 | return NOTIFY_BAD; | ||
603 | break; | ||
604 | |||
605 | case CPU_UP_CANCELED: | ||
606 | case CPU_UP_CANCELED_FROZEN: | ||
607 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
608 | break; | ||
609 | mutex_lock(&pinst->lock); | ||
610 | __padata_remove_cpu(pinst, cpu); | ||
611 | mutex_unlock(&pinst->lock); | ||
612 | |||
613 | case CPU_DOWN_FAILED: | ||
614 | case CPU_DOWN_FAILED_FROZEN: | ||
615 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
616 | break; | ||
617 | mutex_lock(&pinst->lock); | ||
618 | __padata_add_cpu(pinst, cpu); | ||
619 | mutex_unlock(&pinst->lock); | ||
620 | } | ||
621 | |||
622 | return NOTIFY_OK; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * padata_alloc - allocate and initialize a padata instance | ||
627 | * | ||
628 | * @cpumask: cpumask that padata uses for parallelization | ||
629 | * @wq: workqueue to use for the allocated padata instance | ||
630 | */ | ||
631 | struct padata_instance *padata_alloc(const struct cpumask *cpumask, | ||
632 | struct workqueue_struct *wq) | ||
633 | { | ||
634 | int err; | ||
635 | struct padata_instance *pinst; | ||
636 | struct parallel_data *pd; | ||
637 | |||
638 | pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); | ||
639 | if (!pinst) | ||
640 | goto err; | ||
641 | |||
642 | pd = padata_alloc_pd(pinst, cpumask); | ||
643 | if (!pd) | ||
644 | goto err_free_inst; | ||
645 | |||
646 | if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) | ||
647 | goto err_free_pd; | ||
648 | |||
649 | rcu_assign_pointer(pinst->pd, pd); | ||
650 | |||
651 | pinst->wq = wq; | ||
652 | |||
653 | cpumask_copy(pinst->cpumask, cpumask); | ||
654 | |||
655 | pinst->flags = 0; | ||
656 | |||
657 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
658 | pinst->cpu_notifier.priority = 0; | ||
659 | err = register_hotcpu_notifier(&pinst->cpu_notifier); | ||
660 | if (err) | ||
661 | goto err_free_cpumask; | ||
662 | |||
663 | mutex_init(&pinst->lock); | ||
664 | |||
665 | return pinst; | ||
666 | |||
667 | err_free_cpumask: | ||
668 | free_cpumask_var(pinst->cpumask); | ||
669 | err_free_pd: | ||
670 | padata_free_pd(pd); | ||
671 | err_free_inst: | ||
672 | kfree(pinst); | ||
673 | err: | ||
674 | return NULL; | ||
675 | } | ||
676 | EXPORT_SYMBOL(padata_alloc); | ||
677 | |||
678 | /* | ||
679 | * padata_free - free a padata instance | ||
680 | * | ||
681 | * @pinst: padata instance to free | ||
682 | */ | ||
683 | void padata_free(struct padata_instance *pinst) | ||
684 | { | ||
685 | padata_stop(pinst); | ||
686 | |||
687 | synchronize_rcu(); | ||
688 | |||
689 | while (atomic_read(&pinst->pd->refcnt) != 0) | ||
690 | yield(); | ||
691 | |||
692 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | ||
693 | padata_free_pd(pinst->pd); | ||
694 | free_cpumask_var(pinst->cpumask); | ||
695 | kfree(pinst); | ||
696 | } | ||
697 | EXPORT_SYMBOL(padata_free); | ||
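Putting the pieces together, the intended lifecycle of an instance, sketched under the assumption of a dedicated workqueue (names invented, error handling elided):

    struct workqueue_struct *wq;
    struct padata_instance *pinst;

    wq = create_workqueue("my_padata");            /* hypothetical name */
    pinst = padata_alloc(cpu_possible_mask, wq);   /* NULL on failure */

    padata_start(pinst);           /* allow padata_do_parallel() traffic */
    /* ... submit jobs, serialize completions ... */
    padata_stop(pinst);            /* refuse new work */
    padata_free(pinst);            /* drain and release the instance */
    destroy_workqueue(wq);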
diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b8..13d966b4c14a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | |||
36 | 36 | ||
37 | EXPORT_SYMBOL(panic_notifier_list); | 37 | EXPORT_SYMBOL(panic_notifier_list); |
38 | 38 | ||
39 | static long no_blink(long time) | ||
40 | { | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | /* Returns how long it waited in ms */ | 39 | /* Returns how long it waited in ms */ |
45 | long (*panic_blink)(long time); | 40 | long (*panic_blink)(long time); |
46 | EXPORT_SYMBOL(panic_blink); | 41 | EXPORT_SYMBOL(panic_blink); |
47 | 42 | ||
43 | static void panic_blink_one_second(void) | ||
44 | { | ||
45 | static long i = 0, end; | ||
46 | |||
47 | if (panic_blink) { | ||
48 | end = i + MSEC_PER_SEC; | ||
49 | |||
50 | while (i < end) { | ||
51 | i += panic_blink(i); | ||
52 | mdelay(1); | ||
53 | i++; | ||
54 | } | ||
55 | } else { | ||
56 | /* | ||
57 | * When running under a hypervisor, a small mdelay may get | ||
58 | * rounded up to the hypervisor timeslice. For example, with a | ||
59 | * 1ms tick in a 10ms hypervisor timeslice, an mdelay(1) loop | ||
60 | * might be inflated by 10x. | ||
61 | * |||
62 | * If we have nothing to blink, spin in 1-second calls to | ||
63 | * mdelay to avoid this. | ||
64 | */ | ||
65 | mdelay(MSEC_PER_SEC); | ||
66 | } | ||
67 | } | ||
68 | |||
48 | /** | 69 | /** |
49 | * panic - halt the system | 70 | * panic - halt the system |
50 | * @fmt: The text string to print | 71 | * @fmt: The text string to print |
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
95 | 116 | ||
96 | bust_spinlocks(0); | 117 | bust_spinlocks(0); |
97 | 118 | ||
98 | if (!panic_blink) | ||
99 | panic_blink = no_blink; | ||
100 | |||
101 | if (panic_timeout > 0) { | 119 | if (panic_timeout > 0) { |
102 | /* | 120 | /* |
103 | * Delay timeout seconds before rebooting the machine. | 121 | * Delay timeout seconds before rebooting the machine. |
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
105 | */ | 123 | */ |
106 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); | 124 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); |
107 | 125 | ||
108 | for (i = 0; i < panic_timeout*1000; ) { | 126 | for (i = 0; i < panic_timeout; i++) { |
109 | touch_nmi_watchdog(); | 127 | touch_nmi_watchdog(); |
110 | i += panic_blink(i); | 128 | panic_blink_one_second(); |
111 | mdelay(1); | ||
112 | i++; | ||
113 | } | 129 | } |
114 | /* | 130 | /* |
115 | * This will not be a clean reboot, with everything | 131 | * This will not be a clean reboot, with everything |
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
135 | } | 151 | } |
136 | #endif | 152 | #endif |
137 | local_irq_enable(); | 153 | local_irq_enable(); |
138 | for (i = 0; ; ) { | 154 | while (1) { |
139 | touch_softlockup_watchdog(); | 155 | touch_softlockup_watchdog(); |
140 | i += panic_blink(i); | 156 | panic_blink_one_second(); |
141 | mdelay(1); | ||
142 | i++; | ||
143 | } | 157 | } |
144 | } | 158 | } |
145 | 159 | ||
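The new panic_blink_one_second() helper only changes the loop bookkeeping in panic(); the driver-side contract for panic_blink is untouched: the hook receives a monotonically increasing millisecond count and returns how many additional milliseconds it consumed. A sketch of a hook honoring that contract, where toggle_my_led() is a hypothetical hardware accessor:

```c
static void toggle_my_led(int on);	/* stand-in for real hardware access */

static long my_panic_blink(long time)
{
	/* 2 Hz blink: on for 250 ms, off for 250 ms */
	toggle_my_led((time % 500) < 250);
	return 0;	/* no extra delay beyond the caller's mdelay(1) */
}

/* in driver init: */
/* panic_blink = my_panic_blink; */
```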
diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..0b30ecd53a52 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/err.h> | 24 | #include <linux/err.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
27 | #include <linux/string.h> | ||
28 | 27 | ||
29 | #if 0 | 28 | #if 0 |
30 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) | |||
402 | } | 401 | } |
403 | 402 | ||
404 | /* sysfs output in /sys/module/XYZ/parameters/ */ | 403 | /* sysfs output in /sys/module/XYZ/parameters/ */
405 | #define to_module_attr(n) container_of(n, struct module_attribute, attr); | 404 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
406 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); | 405 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) |
407 | 406 | ||
408 | extern struct kernel_param __start___param[], __stop___param[]; | 407 | extern struct kernel_param __start___param[], __stop___param[]; |
409 | 408 | ||
@@ -421,7 +420,7 @@ struct module_param_attrs | |||
421 | }; | 420 | }; |
422 | 421 | ||
423 | #ifdef CONFIG_SYSFS | 422 | #ifdef CONFIG_SYSFS |
424 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr); | 423 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) |
425 | 424 | ||
426 | static ssize_t param_attr_show(struct module_attribute *mattr, | 425 | static ssize_t param_attr_show(struct module_attribute *mattr, |
427 | struct module *mod, char *buf) | 426 | struct module *mod, char *buf) |
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
517 | new->grp.attrs = attrs; | 516 | new->grp.attrs = attrs; |
518 | 517 | ||
519 | /* Tack new one on the end. */ | 518 | /* Tack new one on the end. */ |
519 | sysfs_attr_init(&new->attrs[num].mattr.attr); | ||
520 | new->attrs[num].param = kp; | 520 | new->attrs[num].param = kp; |
521 | new->attrs[num].mattr.show = param_attr_show; | 521 | new->attrs[num].mattr.show = param_attr_show; |
522 | new->attrs[num].mattr.store = param_attr_store; | 522 | new->attrs[num].mattr.store = param_attr_store; |
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
723 | return ret; | 723 | return ret; |
724 | } | 724 | } |
725 | 725 | ||
726 | static struct sysfs_ops module_sysfs_ops = { | 726 | static const struct sysfs_ops module_sysfs_ops = { |
727 | .show = module_attr_show, | 727 | .show = module_attr_show, |
728 | .store = module_attr_store, | 728 | .store = module_attr_store, |
729 | }; | 729 | }; |
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
737 | return 0; | 737 | return 0; |
738 | } | 738 | } |
739 | 739 | ||
740 | static struct kset_uevent_ops module_uevent_ops = { | 740 | static const struct kset_uevent_ops module_uevent_ops = { |
741 | .filter = uevent_filter, | 741 | .filter = uevent_filter, |
742 | }; | 742 | }; |
743 | 743 | ||
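The three container_of() wrapper fixes in params.c above all remove the same defect: a stray trailing semicolon inside a macro body. The pitfall, spelled out:

```c
/* Old, broken shape (note the ';' inside the macro body): */
#define to_module_attr_bad(n) container_of(n, struct module_attribute, attr);

/*
 * Statement-position uses happened to compile, because the extra ';'
 * merely became an empty statement:
 *
 *	struct module_attribute *a = to_module_attr_bad(n);
 *
 * but any expression-position use breaks, e.g. as a function argument:
 *
 *	attribute->show(to_module_attr_bad(n), mod, buf);
 *	// expands to ...show(container_of(...);, mod, buf) -- syntax error
 *
 * Dropping the semicolon makes the macro a plain expression again:
 */
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
```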
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 087025fe3ba1..3d1552d3c12b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
16 | #include <linux/file.h> | 16 | #include <linux/file.h> |
17 | #include <linux/poll.h> | 17 | #include <linux/poll.h> |
18 | #include <linux/slab.h> | ||
18 | #include <linux/sysfs.h> | 19 | #include <linux/sysfs.h> |
19 | #include <linux/dcache.h> | 20 | #include <linux/dcache.h> |
20 | #include <linux/percpu.h> | 21 | #include <linux/percpu.h> |
@@ -56,21 +57,6 @@ static atomic_t nr_task_events __read_mostly; | |||
56 | */ | 57 | */ |
57 | int sysctl_perf_event_paranoid __read_mostly = 1; | 58 | int sysctl_perf_event_paranoid __read_mostly = 1; |
58 | 59 | ||
59 | static inline bool perf_paranoid_tracepoint_raw(void) | ||
60 | { | ||
61 | return sysctl_perf_event_paranoid > -1; | ||
62 | } | ||
63 | |||
64 | static inline bool perf_paranoid_cpu(void) | ||
65 | { | ||
66 | return sysctl_perf_event_paranoid > 0; | ||
67 | } | ||
68 | |||
69 | static inline bool perf_paranoid_kernel(void) | ||
70 | { | ||
71 | return sysctl_perf_event_paranoid > 1; | ||
72 | } | ||
73 | |||
74 | int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | 60 | int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ |
75 | 61 | ||
76 | /* | 62 | /* |
@@ -96,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | |||
96 | void __weak hw_perf_disable(void) { barrier(); } | 82 | void __weak hw_perf_disable(void) { barrier(); } |
97 | void __weak hw_perf_enable(void) { barrier(); } | 83 | void __weak hw_perf_enable(void) { barrier(); } |
98 | 84 | ||
99 | void __weak hw_perf_event_setup(int cpu) { barrier(); } | ||
100 | void __weak hw_perf_event_setup_online(int cpu) { barrier(); } | ||
101 | |||
102 | int __weak | 85 | int __weak |
103 | hw_perf_group_sched_in(struct perf_event *group_leader, | 86 | hw_perf_group_sched_in(struct perf_event *group_leader, |
104 | struct perf_cpu_context *cpuctx, | 87 | struct perf_cpu_context *cpuctx, |
105 | struct perf_event_context *ctx, int cpu) | 88 | struct perf_event_context *ctx) |
106 | { | 89 | { |
107 | return 0; | 90 | return 0; |
108 | } | 91 | } |
@@ -111,25 +94,15 @@ void __weak perf_event_print_debug(void) { } | |||
111 | 94 | ||
112 | static DEFINE_PER_CPU(int, perf_disable_count); | 95 | static DEFINE_PER_CPU(int, perf_disable_count); |
113 | 96 | ||
114 | void __perf_disable(void) | ||
115 | { | ||
116 | __get_cpu_var(perf_disable_count)++; | ||
117 | } | ||
118 | |||
119 | bool __perf_enable(void) | ||
120 | { | ||
121 | return !--__get_cpu_var(perf_disable_count); | ||
122 | } | ||
123 | |||
124 | void perf_disable(void) | 97 | void perf_disable(void) |
125 | { | 98 | { |
126 | __perf_disable(); | 99 | if (!__get_cpu_var(perf_disable_count)++) |
127 | hw_perf_disable(); | 100 | hw_perf_disable(); |
128 | } | 101 | } |
129 | 102 | ||
130 | void perf_enable(void) | 103 | void perf_enable(void) |
131 | { | 104 | { |
132 | if (__perf_enable()) | 105 | if (!--__get_cpu_var(perf_disable_count)) |
133 | hw_perf_enable(); | 106 | hw_perf_enable(); |
134 | } | 107 | } |
135 | 108 | ||
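The hunk above folds __perf_disable()/__perf_enable() into their only callers. The underlying idiom is a per-CPU nesting counter where only the outermost call touches the PMU; shown generically, with hw_disable()/hw_enable() standing in for hw_perf_disable()/hw_perf_enable():

```c
static void hw_disable(void);	/* stand-ins for the real hooks */
static void hw_enable(void);

static DEFINE_PER_CPU(int, nest_count);

static void nested_disable(void)
{
	/* only the 0 -> 1 transition disables the hardware */
	if (!__get_cpu_var(nest_count)++)
		hw_disable();
}

static void nested_enable(void)
{
	/* only the 1 -> 0 transition re-enables it, so the pair nests */
	if (!--__get_cpu_var(nest_count))
		hw_enable();
}
```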
@@ -248,7 +221,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
248 | 221 | ||
249 | static inline u64 perf_clock(void) | 222 | static inline u64 perf_clock(void) |
250 | { | 223 | { |
251 | return cpu_clock(smp_processor_id()); | 224 | return cpu_clock(raw_smp_processor_id()); |
252 | } | 225 | } |
253 | 226 | ||
254 | /* | 227 | /* |
@@ -632,14 +605,13 @@ void perf_event_disable(struct perf_event *event) | |||
632 | static int | 605 | static int |
633 | event_sched_in(struct perf_event *event, | 606 | event_sched_in(struct perf_event *event, |
634 | struct perf_cpu_context *cpuctx, | 607 | struct perf_cpu_context *cpuctx, |
635 | struct perf_event_context *ctx, | 608 | struct perf_event_context *ctx) |
636 | int cpu) | ||
637 | { | 609 | { |
638 | if (event->state <= PERF_EVENT_STATE_OFF) | 610 | if (event->state <= PERF_EVENT_STATE_OFF) |
639 | return 0; | 611 | return 0; |
640 | 612 | ||
641 | event->state = PERF_EVENT_STATE_ACTIVE; | 613 | event->state = PERF_EVENT_STATE_ACTIVE; |
642 | event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ | 614 | event->oncpu = smp_processor_id(); |
643 | /* | 615 | /* |
644 | * The new state must be visible before we turn it on in the hardware: | 616 | * The new state must be visible before we turn it on in the hardware: |
645 | */ | 617 | */ |
@@ -666,8 +638,7 @@ event_sched_in(struct perf_event *event, | |||
666 | static int | 638 | static int |
667 | group_sched_in(struct perf_event *group_event, | 639 | group_sched_in(struct perf_event *group_event, |
668 | struct perf_cpu_context *cpuctx, | 640 | struct perf_cpu_context *cpuctx, |
669 | struct perf_event_context *ctx, | 641 | struct perf_event_context *ctx) |
670 | int cpu) | ||
671 | { | 642 | { |
672 | struct perf_event *event, *partial_group; | 643 | struct perf_event *event, *partial_group; |
673 | int ret; | 644 | int ret; |
@@ -675,18 +646,18 @@ group_sched_in(struct perf_event *group_event, | |||
675 | if (group_event->state == PERF_EVENT_STATE_OFF) | 646 | if (group_event->state == PERF_EVENT_STATE_OFF) |
676 | return 0; | 647 | return 0; |
677 | 648 | ||
678 | ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); | 649 | ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); |
679 | if (ret) | 650 | if (ret) |
680 | return ret < 0 ? ret : 0; | 651 | return ret < 0 ? ret : 0; |
681 | 652 | ||
682 | if (event_sched_in(group_event, cpuctx, ctx, cpu)) | 653 | if (event_sched_in(group_event, cpuctx, ctx)) |
683 | return -EAGAIN; | 654 | return -EAGAIN; |
684 | 655 | ||
685 | /* | 656 | /* |
686 | * Schedule in siblings as one group (if any): | 657 | * Schedule in siblings as one group (if any): |
687 | */ | 658 | */ |
688 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 659 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
689 | if (event_sched_in(event, cpuctx, ctx, cpu)) { | 660 | if (event_sched_in(event, cpuctx, ctx)) { |
690 | partial_group = event; | 661 | partial_group = event; |
691 | goto group_error; | 662 | goto group_error; |
692 | } | 663 | } |
@@ -760,7 +731,6 @@ static void __perf_install_in_context(void *info) | |||
760 | struct perf_event *event = info; | 731 | struct perf_event *event = info; |
761 | struct perf_event_context *ctx = event->ctx; | 732 | struct perf_event_context *ctx = event->ctx; |
762 | struct perf_event *leader = event->group_leader; | 733 | struct perf_event *leader = event->group_leader; |
763 | int cpu = smp_processor_id(); | ||
764 | int err; | 734 | int err; |
765 | 735 | ||
766 | /* | 736 | /* |
@@ -807,7 +777,7 @@ static void __perf_install_in_context(void *info) | |||
807 | if (!group_can_go_on(event, cpuctx, 1)) | 777 | if (!group_can_go_on(event, cpuctx, 1)) |
808 | err = -EEXIST; | 778 | err = -EEXIST; |
809 | else | 779 | else |
810 | err = event_sched_in(event, cpuctx, ctx, cpu); | 780 | err = event_sched_in(event, cpuctx, ctx); |
811 | 781 | ||
812 | if (err) { | 782 | if (err) { |
813 | /* | 783 | /* |
@@ -949,11 +919,9 @@ static void __perf_event_enable(void *info) | |||
949 | } else { | 919 | } else { |
950 | perf_disable(); | 920 | perf_disable(); |
951 | if (event == leader) | 921 | if (event == leader) |
952 | err = group_sched_in(event, cpuctx, ctx, | 922 | err = group_sched_in(event, cpuctx, ctx); |
953 | smp_processor_id()); | ||
954 | else | 923 | else |
955 | err = event_sched_in(event, cpuctx, ctx, | 924 | err = event_sched_in(event, cpuctx, ctx); |
956 | smp_processor_id()); | ||
957 | perf_enable(); | 925 | perf_enable(); |
958 | } | 926 | } |
959 | 927 | ||
@@ -1197,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1197 | struct perf_event_context *ctx = task->perf_event_ctxp; | 1165 | struct perf_event_context *ctx = task->perf_event_ctxp; |
1198 | struct perf_event_context *next_ctx; | 1166 | struct perf_event_context *next_ctx; |
1199 | struct perf_event_context *parent; | 1167 | struct perf_event_context *parent; |
1200 | struct pt_regs *regs; | ||
1201 | int do_switch = 1; | 1168 | int do_switch = 1; |
1202 | 1169 | ||
1203 | regs = task_pt_regs(task); | 1170 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); |
1204 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); | ||
1205 | 1171 | ||
1206 | if (likely(!ctx || !cpuctx->task_ctx)) | 1172 | if (likely(!ctx || !cpuctx->task_ctx)) |
1207 | return; | 1173 | return; |
@@ -1280,19 +1246,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
1280 | 1246 | ||
1281 | static void | 1247 | static void |
1282 | ctx_pinned_sched_in(struct perf_event_context *ctx, | 1248 | ctx_pinned_sched_in(struct perf_event_context *ctx, |
1283 | struct perf_cpu_context *cpuctx, | 1249 | struct perf_cpu_context *cpuctx) |
1284 | int cpu) | ||
1285 | { | 1250 | { |
1286 | struct perf_event *event; | 1251 | struct perf_event *event; |
1287 | 1252 | ||
1288 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 1253 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1289 | if (event->state <= PERF_EVENT_STATE_OFF) | 1254 | if (event->state <= PERF_EVENT_STATE_OFF) |
1290 | continue; | 1255 | continue; |
1291 | if (event->cpu != -1 && event->cpu != cpu) | 1256 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1292 | continue; | 1257 | continue; |
1293 | 1258 | ||
1294 | if (group_can_go_on(event, cpuctx, 1)) | 1259 | if (group_can_go_on(event, cpuctx, 1)) |
1295 | group_sched_in(event, cpuctx, ctx, cpu); | 1260 | group_sched_in(event, cpuctx, ctx); |
1296 | 1261 | ||
1297 | /* | 1262 | /* |
1298 | * If this pinned group hasn't been scheduled, | 1263 | * If this pinned group hasn't been scheduled, |
@@ -1307,8 +1272,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1307 | 1272 | ||
1308 | static void | 1273 | static void |
1309 | ctx_flexible_sched_in(struct perf_event_context *ctx, | 1274 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
1310 | struct perf_cpu_context *cpuctx, | 1275 | struct perf_cpu_context *cpuctx) |
1311 | int cpu) | ||
1312 | { | 1276 | { |
1313 | struct perf_event *event; | 1277 | struct perf_event *event; |
1314 | int can_add_hw = 1; | 1278 | int can_add_hw = 1; |
@@ -1321,11 +1285,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1321 | * Listen to the 'cpu' scheduling filter constraint | 1285 | * Listen to the 'cpu' scheduling filter constraint |
1322 | * of events: | 1286 | * of events: |
1323 | */ | 1287 | */ |
1324 | if (event->cpu != -1 && event->cpu != cpu) | 1288 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1325 | continue; | 1289 | continue; |
1326 | 1290 | ||
1327 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1291 | if (group_can_go_on(event, cpuctx, can_add_hw)) |
1328 | if (group_sched_in(event, cpuctx, ctx, cpu)) | 1292 | if (group_sched_in(event, cpuctx, ctx)) |
1329 | can_add_hw = 0; | 1293 | can_add_hw = 0; |
1330 | } | 1294 | } |
1331 | } | 1295 | } |
@@ -1335,8 +1299,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1335 | struct perf_cpu_context *cpuctx, | 1299 | struct perf_cpu_context *cpuctx, |
1336 | enum event_type_t event_type) | 1300 | enum event_type_t event_type) |
1337 | { | 1301 | { |
1338 | int cpu = smp_processor_id(); | ||
1339 | |||
1340 | raw_spin_lock(&ctx->lock); | 1302 | raw_spin_lock(&ctx->lock); |
1341 | ctx->is_active = 1; | 1303 | ctx->is_active = 1; |
1342 | if (likely(!ctx->nr_events)) | 1304 | if (likely(!ctx->nr_events)) |
@@ -1351,11 +1313,11 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1351 | * in order to give them the best chance of going on. | 1313 | * in order to give them the best chance of going on. |
1352 | */ | 1314 | */ |
1353 | if (event_type & EVENT_PINNED) | 1315 | if (event_type & EVENT_PINNED) |
1354 | ctx_pinned_sched_in(ctx, cpuctx, cpu); | 1316 | ctx_pinned_sched_in(ctx, cpuctx); |
1355 | 1317 | ||
1356 | /* Then walk through the lower prio flexible groups */ | 1318 | /* Then walk through the lower prio flexible groups */ |
1357 | if (event_type & EVENT_FLEXIBLE) | 1319 | if (event_type & EVENT_FLEXIBLE) |
1358 | ctx_flexible_sched_in(ctx, cpuctx, cpu); | 1320 | ctx_flexible_sched_in(ctx, cpuctx); |
1359 | 1321 | ||
1360 | perf_enable(); | 1322 | perf_enable(); |
1361 | out: | 1323 | out: |
@@ -1493,6 +1455,22 @@ do { \ | |||
1493 | return div64_u64(dividend, divisor); | 1455 | return div64_u64(dividend, divisor); |
1494 | } | 1456 | } |
1495 | 1457 | ||
1458 | static void perf_event_stop(struct perf_event *event) | ||
1459 | { | ||
1460 | if (!event->pmu->stop) | ||
1461 | return event->pmu->disable(event); | ||
1462 | |||
1463 | return event->pmu->stop(event); | ||
1464 | } | ||
1465 | |||
1466 | static int perf_event_start(struct perf_event *event) | ||
1467 | { | ||
1468 | if (!event->pmu->start) | ||
1469 | return event->pmu->enable(event); | ||
1470 | |||
1471 | return event->pmu->start(event); | ||
1472 | } | ||
1473 | |||
1496 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1474 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1497 | { | 1475 | { |
1498 | struct hw_perf_event *hwc = &event->hw; | 1476 | struct hw_perf_event *hwc = &event->hw; |
@@ -1513,9 +1491,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1513 | 1491 | ||
1514 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { | 1492 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { |
1515 | perf_disable(); | 1493 | perf_disable(); |
1516 | event->pmu->disable(event); | 1494 | perf_event_stop(event); |
1517 | atomic64_set(&hwc->period_left, 0); | 1495 | atomic64_set(&hwc->period_left, 0); |
1518 | event->pmu->enable(event); | 1496 | perf_event_start(event); |
1519 | perf_enable(); | 1497 | perf_enable(); |
1520 | } | 1498 | } |
1521 | } | 1499 | } |
@@ -1545,12 +1523,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1545 | */ | 1523 | */ |
1546 | if (interrupts == MAX_INTERRUPTS) { | 1524 | if (interrupts == MAX_INTERRUPTS) { |
1547 | perf_log_throttle(event, 1); | 1525 | perf_log_throttle(event, 1); |
1526 | perf_disable(); | ||
1548 | event->pmu->unthrottle(event); | 1527 | event->pmu->unthrottle(event); |
1528 | perf_enable(); | ||
1549 | } | 1529 | } |
1550 | 1530 | ||
1551 | if (!event->attr.freq || !event->attr.sample_freq) | 1531 | if (!event->attr.freq || !event->attr.sample_freq) |
1552 | continue; | 1532 | continue; |
1553 | 1533 | ||
1534 | perf_disable(); | ||
1554 | event->pmu->read(event); | 1535 | event->pmu->read(event); |
1555 | now = atomic64_read(&event->count); | 1536 | now = atomic64_read(&event->count); |
1556 | delta = now - hwc->freq_count_stamp; | 1537 | delta = now - hwc->freq_count_stamp; |
@@ -1558,6 +1539,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1558 | 1539 | ||
1559 | if (delta > 0) | 1540 | if (delta > 0) |
1560 | perf_adjust_period(event, TICK_NSEC, delta); | 1541 | perf_adjust_period(event, TICK_NSEC, delta); |
1542 | perf_enable(); | ||
1561 | } | 1543 | } |
1562 | raw_spin_unlock(&ctx->lock); | 1544 | raw_spin_unlock(&ctx->lock); |
1563 | } | 1545 | } |
@@ -1567,9 +1549,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1567 | */ | 1549 | */ |
1568 | static void rotate_ctx(struct perf_event_context *ctx) | 1550 | static void rotate_ctx(struct perf_event_context *ctx) |
1569 | { | 1551 | { |
1570 | if (!ctx->nr_events) | ||
1571 | return; | ||
1572 | |||
1573 | raw_spin_lock(&ctx->lock); | 1552 | raw_spin_lock(&ctx->lock); |
1574 | 1553 | ||
1575 | /* Rotate the first entry last of non-pinned groups */ | 1554 | /* Rotate the first entry last of non-pinned groups */ |
@@ -1582,19 +1561,28 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1582 | { | 1561 | { |
1583 | struct perf_cpu_context *cpuctx; | 1562 | struct perf_cpu_context *cpuctx; |
1584 | struct perf_event_context *ctx; | 1563 | struct perf_event_context *ctx; |
1564 | int rotate = 0; | ||
1585 | 1565 | ||
1586 | if (!atomic_read(&nr_events)) | 1566 | if (!atomic_read(&nr_events)) |
1587 | return; | 1567 | return; |
1588 | 1568 | ||
1589 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1569 | cpuctx = &__get_cpu_var(perf_cpu_context); |
1590 | ctx = curr->perf_event_ctxp; | 1570 | if (cpuctx->ctx.nr_events && |
1571 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | ||
1572 | rotate = 1; | ||
1591 | 1573 | ||
1592 | perf_disable(); | 1574 | ctx = curr->perf_event_ctxp; |
1575 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | ||
1576 | rotate = 1; | ||
1593 | 1577 | ||
1594 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1578 | perf_ctx_adjust_freq(&cpuctx->ctx); |
1595 | if (ctx) | 1579 | if (ctx) |
1596 | perf_ctx_adjust_freq(ctx); | 1580 | perf_ctx_adjust_freq(ctx); |
1597 | 1581 | ||
1582 | if (!rotate) | ||
1583 | return; | ||
1584 | |||
1585 | perf_disable(); | ||
1598 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1586 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1599 | if (ctx) | 1587 | if (ctx) |
1600 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1588 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1606,7 +1594,6 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1606 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1594 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1607 | if (ctx) | 1595 | if (ctx) |
1608 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1596 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); |
1609 | |||
1610 | perf_enable(); | 1597 | perf_enable(); |
1611 | } | 1598 | } |
1612 | 1599 | ||
@@ -2602,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2602 | if (user_locked > user_lock_limit) | 2589 | if (user_locked > user_lock_limit) |
2603 | extra = user_locked - user_lock_limit; | 2590 | extra = user_locked - user_lock_limit; |
2604 | 2591 | ||
2605 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2592 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2606 | lock_limit >>= PAGE_SHIFT; | 2593 | lock_limit >>= PAGE_SHIFT; |
2607 | locked = vma->vm_mm->locked_vm + extra; | 2594 | locked = vma->vm_mm->locked_vm + extra; |
2608 | 2595 | ||
@@ -2798,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
2798 | return NULL; | 2785 | return NULL; |
2799 | } | 2786 | } |
2800 | 2787 | ||
2788 | __weak | ||
2789 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
2790 | { | ||
2791 | } | ||
2792 | |||
2793 | |||
2801 | /* | 2794 | /* |
2802 | * Output | 2795 | * Output |
2803 | */ | 2796 | */ |
@@ -3383,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event, | |||
3383 | struct perf_task_event *task_event) | 3376 | struct perf_task_event *task_event) |
3384 | { | 3377 | { |
3385 | struct perf_output_handle handle; | 3378 | struct perf_output_handle handle; |
3386 | int size; | ||
3387 | struct task_struct *task = task_event->task; | 3379 | struct task_struct *task = task_event->task; |
3388 | int ret; | 3380 | unsigned long flags; |
3381 | int size, ret; | ||
3382 | |||
3383 | /* | ||
3384 | * If this CPU attempts to acquire an rq lock held by a CPU spinning | ||
3385 | * in perf_output_lock() from interrupt context, it's game over. | ||
3386 | */ | ||
3387 | local_irq_save(flags); | ||
3389 | 3388 | ||
3390 | size = task_event->event_id.header.size; | 3389 | size = task_event->event_id.header.size; |
3391 | ret = perf_output_begin(&handle, event, size, 0, 0); | 3390 | ret = perf_output_begin(&handle, event, size, 0, 0); |
3392 | 3391 | ||
3393 | if (ret) | 3392 | if (ret) { |
3393 | local_irq_restore(flags); | ||
3394 | return; | 3394 | return; |
3395 | } | ||
3395 | 3396 | ||
3396 | task_event->event_id.pid = perf_event_pid(event, task); | 3397 | task_event->event_id.pid = perf_event_pid(event, task); |
3397 | task_event->event_id.ppid = perf_event_pid(event, current); | 3398 | task_event->event_id.ppid = perf_event_pid(event, current); |
@@ -3399,16 +3400,15 @@ static void perf_event_task_output(struct perf_event *event, | |||
3399 | task_event->event_id.tid = perf_event_tid(event, task); | 3400 | task_event->event_id.tid = perf_event_tid(event, task); |
3400 | task_event->event_id.ptid = perf_event_tid(event, current); | 3401 | task_event->event_id.ptid = perf_event_tid(event, current); |
3401 | 3402 | ||
3402 | task_event->event_id.time = perf_clock(); | ||
3403 | |||
3404 | perf_output_put(&handle, task_event->event_id); | 3403 | perf_output_put(&handle, task_event->event_id); |
3405 | 3404 | ||
3406 | perf_output_end(&handle); | 3405 | perf_output_end(&handle); |
3406 | local_irq_restore(flags); | ||
3407 | } | 3407 | } |
3408 | 3408 | ||
3409 | static int perf_event_task_match(struct perf_event *event) | 3409 | static int perf_event_task_match(struct perf_event *event) |
3410 | { | 3410 | { |
3411 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3411 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3412 | return 0; | 3412 | return 0; |
3413 | 3413 | ||
3414 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3414 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -3440,7 +3440,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
3440 | cpuctx = &get_cpu_var(perf_cpu_context); | 3440 | cpuctx = &get_cpu_var(perf_cpu_context); |
3441 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3441 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3442 | if (!ctx) | 3442 | if (!ctx) |
3443 | ctx = rcu_dereference(task_event->task->perf_event_ctxp); | 3443 | ctx = rcu_dereference(current->perf_event_ctxp); |
3444 | if (ctx) | 3444 | if (ctx) |
3445 | perf_event_task_ctx(ctx, task_event); | 3445 | perf_event_task_ctx(ctx, task_event); |
3446 | put_cpu_var(perf_cpu_context); | 3446 | put_cpu_var(perf_cpu_context); |
@@ -3471,6 +3471,7 @@ static void perf_event_task(struct task_struct *task, | |||
3471 | /* .ppid */ | 3471 | /* .ppid */ |
3472 | /* .tid */ | 3472 | /* .tid */ |
3473 | /* .ptid */ | 3473 | /* .ptid */ |
3474 | .time = perf_clock(), | ||
3474 | }, | 3475 | }, |
3475 | }; | 3476 | }; |
3476 | 3477 | ||
@@ -3520,7 +3521,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3520 | 3521 | ||
3521 | static int perf_event_comm_match(struct perf_event *event) | 3522 | static int perf_event_comm_match(struct perf_event *event) |
3522 | { | 3523 | { |
3523 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3524 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3524 | return 0; | 3525 | return 0; |
3525 | 3526 | ||
3526 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3527 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -3640,7 +3641,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
3640 | static int perf_event_mmap_match(struct perf_event *event, | 3641 | static int perf_event_mmap_match(struct perf_event *event, |
3641 | struct perf_mmap_event *mmap_event) | 3642 | struct perf_mmap_event *mmap_event) |
3642 | { | 3643 | { |
3643 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3644 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3644 | return 0; | 3645 | return 0; |
3645 | 3646 | ||
3646 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3647 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -3749,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) | |||
3749 | /* .tid */ | 3750 | /* .tid */ |
3750 | .start = vma->vm_start, | 3751 | .start = vma->vm_start, |
3751 | .len = vma->vm_end - vma->vm_start, | 3752 | .len = vma->vm_end - vma->vm_start, |
3752 | .pgoff = vma->vm_pgoff, | 3753 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
3753 | }, | 3754 | }, |
3754 | }; | 3755 | }; |
3755 | 3756 | ||
@@ -4116,8 +4117,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, | |||
4116 | if (rctx < 0) | 4117 | if (rctx < 0) |
4117 | return; | 4118 | return; |
4118 | 4119 | ||
4119 | data.addr = addr; | 4120 | perf_sample_data_init(&data, addr); |
4120 | data.raw = NULL; | ||
4121 | 4121 | ||
4122 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); | 4122 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); |
4123 | 4123 | ||
@@ -4162,11 +4162,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
4162 | struct perf_event *event; | 4162 | struct perf_event *event; |
4163 | u64 period; | 4163 | u64 period; |
4164 | 4164 | ||
4165 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | 4165 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4166 | event->pmu->read(event); | 4166 | event->pmu->read(event); |
4167 | 4167 | ||
4168 | data.addr = 0; | 4168 | perf_sample_data_init(&data, 0); |
4169 | data.raw = NULL; | ||
4170 | data.period = event->hw.last_period; | 4169 | data.period = event->hw.last_period; |
4171 | regs = get_irq_regs(); | 4170 | regs = get_irq_regs(); |
4172 | /* | 4171 | /* |
@@ -4328,26 +4327,20 @@ static const struct pmu perf_ops_task_clock = { | |||
4328 | #ifdef CONFIG_EVENT_TRACING | 4327 | #ifdef CONFIG_EVENT_TRACING |
4329 | 4328 | ||
4330 | void perf_tp_event(int event_id, u64 addr, u64 count, void *record, | 4329 | void perf_tp_event(int event_id, u64 addr, u64 count, void *record, |
4331 | int entry_size) | 4330 | int entry_size, struct pt_regs *regs) |
4332 | { | 4331 | { |
4332 | struct perf_sample_data data; | ||
4333 | struct perf_raw_record raw = { | 4333 | struct perf_raw_record raw = { |
4334 | .size = entry_size, | 4334 | .size = entry_size, |
4335 | .data = record, | 4335 | .data = record, |
4336 | }; | 4336 | }; |
4337 | 4337 | ||
4338 | struct perf_sample_data data = { | 4338 | perf_sample_data_init(&data, addr); |
4339 | .addr = addr, | 4339 | data.raw = &raw; |
4340 | .raw = &raw, | ||
4341 | }; | ||
4342 | |||
4343 | struct pt_regs *regs = get_irq_regs(); | ||
4344 | |||
4345 | if (!regs) | ||
4346 | regs = task_pt_regs(current); | ||
4347 | 4340 | ||
4348 | /* Trace events already protected against recursion */ | 4341 | /* Trace events already protected against recursion */ |
4349 | do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, | 4342 | do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, |
4350 | &data, regs); | 4343 | &data, regs); |
4351 | } | 4344 | } |
4352 | EXPORT_SYMBOL_GPL(perf_tp_event); | 4345 | EXPORT_SYMBOL_GPL(perf_tp_event); |
4353 | 4346 | ||
@@ -4363,7 +4356,7 @@ static int perf_tp_event_match(struct perf_event *event, | |||
4363 | 4356 | ||
4364 | static void tp_perf_event_destroy(struct perf_event *event) | 4357 | static void tp_perf_event_destroy(struct perf_event *event) |
4365 | { | 4358 | { |
4366 | ftrace_profile_disable(event->attr.config); | 4359 | perf_trace_disable(event->attr.config); |
4367 | } | 4360 | } |
4368 | 4361 | ||
4369 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4362 | static const struct pmu *tp_perf_event_init(struct perf_event *event) |
@@ -4377,7 +4370,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
4377 | !capable(CAP_SYS_ADMIN)) | 4370 | !capable(CAP_SYS_ADMIN)) |
4378 | return ERR_PTR(-EPERM); | 4371 | return ERR_PTR(-EPERM); |
4379 | 4372 | ||
4380 | if (ftrace_profile_enable(event->attr.config)) | 4373 | if (perf_trace_enable(event->attr.config)) |
4381 | return NULL; | 4374 | return NULL; |
4382 | 4375 | ||
4383 | event->destroy = tp_perf_event_destroy; | 4376 | event->destroy = tp_perf_event_destroy; |
@@ -4456,8 +4449,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
4456 | struct perf_sample_data sample; | 4449 | struct perf_sample_data sample; |
4457 | struct pt_regs *regs = data; | 4450 | struct pt_regs *regs = data; |
4458 | 4451 | ||
4459 | sample.raw = NULL; | 4452 | perf_sample_data_init(&sample, bp->attr.bp_addr); |
4460 | sample.addr = bp->attr.bp_addr; | ||
4461 | 4453 | ||
4462 | if (!perf_exclude_event(bp, regs)) | 4454 | if (!perf_exclude_event(bp, regs)) |
4463 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4455 | perf_swevent_add(bp, 1, 1, &sample, regs); |
@@ -4720,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
4720 | if (attr->type >= PERF_TYPE_MAX) | 4712 | if (attr->type >= PERF_TYPE_MAX) |
4721 | return -EINVAL; | 4713 | return -EINVAL; |
4722 | 4714 | ||
4723 | if (attr->__reserved_1 || attr->__reserved_2) | 4715 | if (attr->__reserved_1) |
4724 | return -EINVAL; | 4716 | return -EINVAL; |
4725 | 4717 | ||
4726 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) | 4718 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) |
@@ -4905,7 +4897,7 @@ err_fput_free_put_context: | |||
4905 | 4897 | ||
4906 | err_free_put_context: | 4898 | err_free_put_context: |
4907 | if (err < 0) | 4899 | if (err < 0) |
4908 | kfree(event); | 4900 | free_event(event); |
4909 | 4901 | ||
4910 | err_put_context: | 4902 | err_put_context: |
4911 | if (err < 0) | 4903 | if (err < 0) |
@@ -5385,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child) | |||
5385 | return ret; | 5377 | return ret; |
5386 | } | 5378 | } |
5387 | 5379 | ||
5380 | static void __init perf_event_init_all_cpus(void) | ||
5381 | { | ||
5382 | int cpu; | ||
5383 | struct perf_cpu_context *cpuctx; | ||
5384 | |||
5385 | for_each_possible_cpu(cpu) { | ||
5386 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5387 | __perf_event_init_context(&cpuctx->ctx, NULL); | ||
5388 | } | ||
5389 | } | ||
5390 | |||
5388 | static void __cpuinit perf_event_init_cpu(int cpu) | 5391 | static void __cpuinit perf_event_init_cpu(int cpu) |
5389 | { | 5392 | { |
5390 | struct perf_cpu_context *cpuctx; | 5393 | struct perf_cpu_context *cpuctx; |
5391 | 5394 | ||
5392 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 5395 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
5393 | __perf_event_init_context(&cpuctx->ctx, NULL); | ||
5394 | 5396 | ||
5395 | spin_lock(&perf_resource_lock); | 5397 | spin_lock(&perf_resource_lock); |
5396 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | 5398 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; |
5397 | spin_unlock(&perf_resource_lock); | 5399 | spin_unlock(&perf_resource_lock); |
5398 | |||
5399 | hw_perf_event_setup(cpu); | ||
5400 | } | 5400 | } |
5401 | 5401 | ||
5402 | #ifdef CONFIG_HOTPLUG_CPU | 5402 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5436,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5436 | perf_event_init_cpu(cpu); | 5436 | perf_event_init_cpu(cpu); |
5437 | break; | 5437 | break; |
5438 | 5438 | ||
5439 | case CPU_ONLINE: | ||
5440 | case CPU_ONLINE_FROZEN: | ||
5441 | hw_perf_event_setup_online(cpu); | ||
5442 | break; | ||
5443 | |||
5444 | case CPU_DOWN_PREPARE: | 5439 | case CPU_DOWN_PREPARE: |
5445 | case CPU_DOWN_PREPARE_FROZEN: | 5440 | case CPU_DOWN_PREPARE_FROZEN: |
5446 | perf_event_exit_cpu(cpu); | 5441 | perf_event_exit_cpu(cpu); |
@@ -5463,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { | |||
5463 | 5458 | ||
5464 | void __init perf_event_init(void) | 5459 | void __init perf_event_init(void) |
5465 | { | 5460 | { |
5461 | perf_event_init_all_cpus(); | ||
5466 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 5462 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, |
5467 | (void *)(long)smp_processor_id()); | 5463 | (void *)(long)smp_processor_id()); |
5468 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 5464 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, |
@@ -5470,13 +5466,16 @@ void __init perf_event_init(void) | |||
5470 | register_cpu_notifier(&perf_cpu_nb); | 5466 | register_cpu_notifier(&perf_cpu_nb); |
5471 | } | 5467 | } |
5472 | 5468 | ||
5473 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | 5469 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, |
5470 | struct sysdev_class_attribute *attr, | ||
5471 | char *buf) | ||
5474 | { | 5472 | { |
5475 | return sprintf(buf, "%d\n", perf_reserved_percpu); | 5473 | return sprintf(buf, "%d\n", perf_reserved_percpu); |
5476 | } | 5474 | } |
5477 | 5475 | ||
5478 | static ssize_t | 5476 | static ssize_t |
5479 | perf_set_reserve_percpu(struct sysdev_class *class, | 5477 | perf_set_reserve_percpu(struct sysdev_class *class, |
5478 | struct sysdev_class_attribute *attr, | ||
5480 | const char *buf, | 5479 | const char *buf, |
5481 | size_t count) | 5480 | size_t count) |
5482 | { | 5481 | { |
@@ -5505,13 +5504,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, | |||
5505 | return count; | 5504 | return count; |
5506 | } | 5505 | } |
5507 | 5506 | ||
5508 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | 5507 | static ssize_t perf_show_overcommit(struct sysdev_class *class, |
5508 | struct sysdev_class_attribute *attr, | ||
5509 | char *buf) | ||
5509 | { | 5510 | { |
5510 | return sprintf(buf, "%d\n", perf_overcommit); | 5511 | return sprintf(buf, "%d\n", perf_overcommit); |
5511 | } | 5512 | } |
5512 | 5513 | ||
5513 | static ssize_t | 5514 | static ssize_t |
5514 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | 5515 | perf_set_overcommit(struct sysdev_class *class, |
5516 | struct sysdev_class_attribute *attr, | ||
5517 | const char *buf, size_t count) | ||
5515 | { | 5518 | { |
5516 | unsigned long val; | 5519 | unsigned long val; |
5517 | int err; | 5520 | int err; |
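Several hunks in this file replace the repeated open-coded pair "data.addr = addr; data.raw = NULL;" with perf_sample_data_init(). A minimal sketch of what such a helper does; the real one lives in include/linux/perf_event.h, outside this diff:

```c
static inline void sample_data_init_sketch(struct perf_sample_data *data,
					   u64 addr)
{
	data->addr = addr;
	data->raw = NULL;	/* must never be left stale from the stack */
}
```

Centralizing the initialization is what lets later patches add fields to struct perf_sample_data without auditing every call site again.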
diff --git a/kernel/pid.c b/kernel/pid.c index 2e17c9c92cbe..aebb30d9c233 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
367 | struct task_struct *result = NULL; | 367 | struct task_struct *result = NULL; |
368 | if (pid) { | 368 | if (pid) { |
369 | struct hlist_node *first; | 369 | struct hlist_node *first; |
370 | first = rcu_dereference(pid->tasks[type].first); | 370 | first = rcu_dereference_check(pid->tasks[type].first, |
371 | rcu_read_lock_held() || | ||
372 | lockdep_tasklist_lock_is_held()); | ||
371 | if (first) | 373 | if (first) |
372 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 374 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
373 | } | 375 | } |
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
376 | EXPORT_SYMBOL(pid_task); | 378 | EXPORT_SYMBOL(pid_task); |
377 | 379 | ||
378 | /* | 380 | /* |
379 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 381 | * Must be called under rcu_read_lock(). |
380 | */ | 382 | */ |
381 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 383 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
382 | { | 384 | { |
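The pid.c hunk above teaches lockdep the two legal calling contexts for pid_task(): inside an RCU read-side section, or with tasklist_lock held. A caller-side sketch of the RCU variant, where the task must be pinned before the read-side section ends:

```c
/* Sketch only; grab_task() is an illustrative name. */
static struct task_struct *grab_task(struct pid *pid)
{
	struct task_struct *task;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	if (task)
		get_task_struct(task);	/* pin it before leaving RCU */
	rcu_read_unlock();

	return task;	/* caller drops it with put_task_struct() */
}
```

With CONFIG_PROVE_RCU enabled, a pid_task() caller holding neither rcu_read_lock() nor tasklist_lock now triggers a lockdep splat instead of silently racing.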
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b0436..a5aff94e1f0b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/acct.h> | 15 | #include <linux/acct.h> |
16 | #include <linux/slab.h> | ||
16 | 17 | ||
17 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 18 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
18 | 19 | ||
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
161 | rcu_read_lock(); | 162 | rcu_read_lock(); |
162 | 163 | ||
163 | /* | 164 | /* |
164 | * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring | 165 | * Any nested-container's init processes won't ignore the |
165 | * any nested-container's init processes don't ignore the | 166 | * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). |
166 | * signal | ||
167 | */ | 167 | */ |
168 | task = pid_task(find_vpid(nr), PIDTYPE_PID); | 168 | task = pid_task(find_vpid(nr), PIDTYPE_PID); |
169 | if (task) | 169 | if (task) |
170 | force_sig(SIGKILL, task); | 170 | send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); |
171 | 171 | ||
172 | rcu_read_unlock(); | 172 | rcu_read_unlock(); |
173 | 173 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..bc7704b3a443 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
982 | int maxfire; | 982 | int maxfire; |
983 | struct list_head *timers = tsk->cpu_timers; | 983 | struct list_head *timers = tsk->cpu_timers; |
984 | struct signal_struct *const sig = tsk->signal; | 984 | struct signal_struct *const sig = tsk->signal; |
985 | unsigned long soft; | ||
985 | 986 | ||
986 | maxfire = 20; | 987 | maxfire = 20; |
987 | tsk->cputime_expires.prof_exp = cputime_zero; | 988 | tsk->cputime_expires.prof_exp = cputime_zero; |
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1030 | /* | 1031 | /* |
1031 | * Check for the special case thread timers. | 1032 | * Check for the special case thread timers. |
1032 | */ | 1033 | */ |
1033 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | 1034 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); |
1034 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | 1035 | if (soft != RLIM_INFINITY) { |
1035 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | 1036 | unsigned long hard = |
1037 | ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); | ||
1036 | 1038 | ||
1037 | if (hard != RLIM_INFINITY && | 1039 | if (hard != RLIM_INFINITY && |
1038 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | 1040 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { |
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1043 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1045 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
1044 | return; | 1046 | return; |
1045 | } | 1047 | } |
1046 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | 1048 | if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { |
1047 | /* | 1049 | /* |
1048 | * At the soft limit, send a SIGXCPU every second. | 1050 | * At the soft limit, send a SIGXCPU every second. |
1049 | */ | 1051 | */ |
1050 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | 1052 | if (soft < hard) { |
1051 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | 1053 | soft += USEC_PER_SEC; |
1052 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | 1054 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
1053 | USEC_PER_SEC; | ||
1054 | } | 1055 | } |
1055 | printk(KERN_INFO | 1056 | printk(KERN_INFO |
1056 | "RT Watchdog Timeout: %s[%d]\n", | 1057 | "RT Watchdog Timeout: %s[%d]\n", |
@@ -1060,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1060 | } | 1061 | } |
1061 | } | 1062 | } |
1062 | 1063 | ||
1063 | static void stop_process_timers(struct task_struct *tsk) | 1064 | static void stop_process_timers(struct signal_struct *sig) |
1064 | { | 1065 | { |
1065 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 1066 | struct thread_group_cputimer *cputimer = &sig->cputimer; |
1066 | unsigned long flags; | 1067 | unsigned long flags; |
1067 | 1068 | ||
1068 | if (!cputimer->running) | 1069 | if (!cputimer->running) |
@@ -1071,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk) | |||
1071 | spin_lock_irqsave(&cputimer->lock, flags); | 1072 | spin_lock_irqsave(&cputimer->lock, flags); |
1072 | cputimer->running = 0; | 1073 | cputimer->running = 0; |
1073 | spin_unlock_irqrestore(&cputimer->lock, flags); | 1074 | spin_unlock_irqrestore(&cputimer->lock, flags); |
1075 | |||
1076 | sig->cputime_expires.prof_exp = cputime_zero; | ||
1077 | sig->cputime_expires.virt_exp = cputime_zero; | ||
1078 | sig->cputime_expires.sched_exp = 0; | ||
1074 | } | 1079 | } |
1075 | 1080 | ||
1076 | static u32 onecputick; | 1081 | static u32 onecputick; |
@@ -1121,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1121 | unsigned long long sum_sched_runtime, sched_expires; | 1126 | unsigned long long sum_sched_runtime, sched_expires; |
1122 | struct list_head *timers = sig->cpu_timers; | 1127 | struct list_head *timers = sig->cpu_timers; |
1123 | struct task_cputime cputime; | 1128 | struct task_cputime cputime; |
1129 | unsigned long soft; | ||
1124 | 1130 | ||
1125 | /* | 1131 | /* |
1126 | * Don't sample the current process CPU clocks if there are no timers. | 1132 | * Don't sample the current process CPU clocks if there are no timers. |
@@ -1131,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1131 | list_empty(&timers[CPUCLOCK_VIRT]) && | 1137 | list_empty(&timers[CPUCLOCK_VIRT]) && |
1132 | cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && | 1138 | cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && |
1133 | list_empty(&timers[CPUCLOCK_SCHED])) { | 1139 | list_empty(&timers[CPUCLOCK_SCHED])) { |
1134 | stop_process_timers(tsk); | 1140 | stop_process_timers(sig); |
1135 | return; | 1141 | return; |
1136 | } | 1142 | } |
1137 | 1143 | ||
@@ -1193,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1193 | SIGPROF); | 1199 | SIGPROF); |
1194 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, | 1200 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, |
1195 | SIGVTALRM); | 1201 | SIGVTALRM); |
1196 | 1202 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | |
1197 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | 1203 | if (soft != RLIM_INFINITY) { |
1198 | unsigned long psecs = cputime_to_secs(ptime); | 1204 | unsigned long psecs = cputime_to_secs(ptime); |
1205 | unsigned long hard = | ||
1206 | ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); | ||
1199 | cputime_t x; | 1207 | cputime_t x; |
1200 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { | 1208 | if (psecs >= hard) { |
1201 | /* | 1209 | /* |
1202 | * At the hard limit, we just die. | 1210 | * At the hard limit, we just die. |
1203 | * No need to calculate anything else now. | 1211 | * No need to calculate anything else now. |
@@ -1205,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk, | |||
1205 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1213 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
1206 | return; | 1214 | return; |
1207 | } | 1215 | } |
1208 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { | 1216 | if (psecs >= soft) { |
1209 | /* | 1217 | /* |
1210 | * At the soft limit, send a SIGXCPU every second. | 1218 | * At the soft limit, send a SIGXCPU every second. |
1211 | */ | 1219 | */ |
1212 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 1220 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
1213 | if (sig->rlim[RLIMIT_CPU].rlim_cur | 1221 | if (soft < hard) { |
1214 | < sig->rlim[RLIMIT_CPU].rlim_max) { | 1222 | soft++; |
1215 | sig->rlim[RLIMIT_CPU].rlim_cur++; | 1223 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; |
1216 | } | 1224 | } |
1217 | } | 1225 | } |
1218 | x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 1226 | x = secs_to_cputime(soft); |
1219 | if (cputime_eq(prof_expires, cputime_zero) || | 1227 | if (cputime_eq(prof_expires, cputime_zero) || |
1220 | cputime_lt(x, prof_expires)) { | 1228 | cputime_lt(x, prof_expires)) { |
1221 | prof_expires = x; | 1229 | prof_expires = x; |
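Both rlimit hunks above adopt the same pattern: snapshot the soft and hard limits once with ACCESS_ONCE() and compare against the local copies, since setrlimit() may rewrite them concurrently. Reduced to its essentials (the return-code convention here is illustrative, not from the patch):

```c
/* 'sig' and 'psecs' correspond to the locals in check_process_timers() */
static int cpu_limit_state(struct signal_struct *sig, unsigned long psecs)
{
	unsigned long soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
	unsigned long hard = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);

	if (soft == RLIM_INFINITY)
		return 0;
	if (hard != RLIM_INFINITY && psecs >= hard)
		return 2;	/* hard limit hit: caller sends SIGKILL */
	if (psecs >= soft)
		return 1;	/* soft limit hit: caller sends SIGXCPU */
	return 0;
}
```

Reading the limit once also keeps the subsequent soft-limit bump (`soft += USEC_PER_SEC` / `soft++`) consistent with the value that was actually tested.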
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 495440779ce3..00d1fda58ab6 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, | |||
256 | return 0; | 256 | return 0; |
257 | } | 257 | } |
258 | 258 | ||
259 | int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) | 259 | static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) |
260 | { | 260 | { |
261 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 261 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
262 | return 0; | 262 | return 0; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb2..5c36ea9d55d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -27,6 +27,15 @@ config PM_DEBUG | |||
27 | code. This is helpful when debugging and reporting PM bugs, like | 27 | code. This is helpful when debugging and reporting PM bugs, like |
28 | suspend support. | 28 | suspend support. |
29 | 29 | ||
30 | config PM_ADVANCED_DEBUG | ||
31 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
32 | depends on PM_DEBUG | ||
33 | default n | ||
34 | ---help--- | ||
35 | Add extra sysfs attributes allowing one to access some Power Management | ||
36 | fields of device objects from user space. If you are not a kernel | ||
37 | developer interested in debugging/testing Power Management, say "no". | ||
38 | |||
30 | config PM_VERBOSE | 39 | config PM_VERBOSE |
31 | bool "Verbose Power Management debugging" | 40 | bool "Verbose Power Management debugging" |
32 | depends on PM_DEBUG | 41 | depends on PM_DEBUG |
@@ -85,6 +94,11 @@ config PM_SLEEP | |||
85 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | 94 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE |
86 | default y | 95 | default y |
87 | 96 | ||
97 | config PM_SLEEP_ADVANCED_DEBUG | ||
98 | bool | ||
99 | depends on PM_ADVANCED_DEBUG | ||
100 | default n | ||
101 | |||
88 | config SUSPEND | 102 | config SUSPEND |
89 | bool "Suspend to RAM and standby" | 103 | bool "Suspend to RAM and standby" |
90 | depends on PM && ARCH_SUSPEND_POSSIBLE | 104 | depends on PM && ARCH_SUSPEND_POSSIBLE |
@@ -222,3 +236,8 @@ config PM_RUNTIME | |||
222 | and the bus type drivers of the buses the devices are on are | 236 | and the bus type drivers of the buses the devices are on are |
223 | responsible for the actual handling of the autosuspend requests and | 237 | responsible for the actual handling of the autosuspend requests and |
224 | wake-up events. | 238 | wake-up events. |
239 | |||
240 | config PM_OPS | ||
241 | bool | ||
242 | depends on PM_SLEEP || PM_RUNTIME | ||
243 | default y | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..aa9e916da4d5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/console.h> | 22 | #include <linux/console.h> |
23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
24 | #include <linux/freezer.h> | 24 | #include <linux/freezer.h> |
25 | #include <linux/gfp.h> | ||
25 | #include <scsi/scsi_scan.h> | 26 | #include <scsi/scsi_scan.h> |
26 | #include <asm/suspend.h> | 27 | #include <asm/suspend.h> |
27 | 28 | ||
@@ -323,6 +324,7 @@ static int create_image(int platform_mode) | |||
323 | int hibernation_snapshot(int platform_mode) | 324 | int hibernation_snapshot(int platform_mode) |
324 | { | 325 | { |
325 | int error; | 326 | int error; |
327 | gfp_t saved_mask; | ||
326 | 328 | ||
327 | error = platform_begin(platform_mode); | 329 | error = platform_begin(platform_mode); |
328 | if (error) | 330 | if (error) |
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode) | |||
334 | goto Close; | 336 | goto Close; |
335 | 337 | ||
336 | suspend_console(); | 338 | suspend_console(); |
339 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
337 | error = dpm_suspend_start(PMSG_FREEZE); | 340 | error = dpm_suspend_start(PMSG_FREEZE); |
338 | if (error) | 341 | if (error) |
339 | goto Recover_platform; | 342 | goto Recover_platform; |
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode) | |||
351 | 354 | ||
352 | dpm_resume_end(in_suspend ? | 355 | dpm_resume_end(in_suspend ? |
353 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 356 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
357 | set_gfp_allowed_mask(saved_mask); | ||
354 | resume_console(); | 358 | resume_console(); |
355 | Close: | 359 | Close: |
356 | platform_end(platform_mode); | 360 | platform_end(platform_mode); |
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode) | |||
445 | int hibernation_restore(int platform_mode) | 449 | int hibernation_restore(int platform_mode) |
446 | { | 450 | { |
447 | int error; | 451 | int error; |
452 | gfp_t saved_mask; | ||
448 | 453 | ||
449 | pm_prepare_console(); | 454 | pm_prepare_console(); |
450 | suspend_console(); | 455 | suspend_console(); |
456 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
451 | error = dpm_suspend_start(PMSG_QUIESCE); | 457 | error = dpm_suspend_start(PMSG_QUIESCE); |
452 | if (!error) { | 458 | if (!error) { |
453 | error = resume_target_kernel(platform_mode); | 459 | error = resume_target_kernel(platform_mode); |
454 | dpm_resume_end(PMSG_RECOVER); | 460 | dpm_resume_end(PMSG_RECOVER); |
455 | } | 461 | } |
462 | set_gfp_allowed_mask(saved_mask); | ||
456 | resume_console(); | 463 | resume_console(); |
457 | pm_restore_console(); | 464 | pm_restore_console(); |
458 | return error; | 465 | return error; |
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode) | |||
466 | int hibernation_platform_enter(void) | 473 | int hibernation_platform_enter(void) |
467 | { | 474 | { |
468 | int error; | 475 | int error; |
476 | gfp_t saved_mask; | ||
469 | 477 | ||
470 | if (!hibernation_ops) | 478 | if (!hibernation_ops) |
471 | return -ENOSYS; | 479 | return -ENOSYS; |
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void) | |||
481 | 489 | ||
482 | entering_platform_hibernation = true; | 490 | entering_platform_hibernation = true; |
483 | suspend_console(); | 491 | suspend_console(); |
492 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
484 | error = dpm_suspend_start(PMSG_HIBERNATE); | 493 | error = dpm_suspend_start(PMSG_HIBERNATE); |
485 | if (error) { | 494 | if (error) { |
486 | if (hibernation_ops->recover) | 495 | if (hibernation_ops->recover) |
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void) | |||
518 | Resume_devices: | 527 | Resume_devices: |
519 | entering_platform_hibernation = false; | 528 | entering_platform_hibernation = false; |
520 | dpm_resume_end(PMSG_RESTORE); | 529 | dpm_resume_end(PMSG_RESTORE); |
530 | set_gfp_allowed_mask(saved_mask); | ||
521 | resume_console(); | 531 | resume_console(); |
522 | 532 | ||
523 | Close: | 533 | Close: |
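All three hibernation entry points in this file gain the same bracket around their device-quiescent window: mask __GFP_IO and __GFP_FS so no allocation tries to reclaim memory through suspended block devices or filesystems, then restore the mask afterwards. In isolation, assuming clear_gfp_allowed_mask() returns the previous mask (as its use above implies):

```c
static int with_io_forbidden(void)
{
	gfp_t saved_mask;
	int error;

	/* GFP_IOFS is __GFP_IO | __GFP_FS */
	saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
	error = dpm_suspend_start(PMSG_FREEZE);
	if (!error) {
		/* ... devices are suspended here ... */
		dpm_resume_end(PMSG_THAW);
	}
	set_gfp_allowed_mask(saved_mask);

	return error;
}
```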
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c index 39ac698ef836..fdcad9ed5a7b 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/hibernate_nvs.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/list.h> | 11 | #include <linux/list.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/slab.h> | ||
13 | #include <linux/suspend.h> | 14 | #include <linux/suspend.h> |
14 | 15 | ||
15 | /* | 16 | /* |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c7139053..b58800b21fc0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) | |||
44 | == NOTIFY_BAD) ? -EINVAL : 0; | 44 | == NOTIFY_BAD) ? -EINVAL : 0; |
45 | } | 45 | } |
46 | 46 | ||
47 | /* If set, devices may be suspended and resumed asynchronously. */ | ||
48 | int pm_async_enabled = 1; | ||
49 | |||
50 | static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
51 | char *buf) | ||
52 | { | ||
53 | return sprintf(buf, "%d\n", pm_async_enabled); | ||
54 | } | ||
55 | |||
56 | static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
57 | const char *buf, size_t n) | ||
58 | { | ||
59 | unsigned long val; | ||
60 | |||
61 | if (strict_strtoul(buf, 10, &val)) | ||
62 | return -EINVAL; | ||
63 | |||
64 | if (val > 1) | ||
65 | return -EINVAL; | ||
66 | |||
67 | pm_async_enabled = val; | ||
68 | return n; | ||
69 | } | ||
70 | |||
71 | power_attr(pm_async); | ||
72 | |||
47 | #ifdef CONFIG_PM_DEBUG | 73 | #ifdef CONFIG_PM_DEBUG |
48 | int pm_test_level = TEST_NONE; | 74 | int pm_test_level = TEST_NONE; |
49 | 75 | ||
@@ -208,9 +234,12 @@ static struct attribute * g[] = { | |||
208 | #ifdef CONFIG_PM_TRACE | 234 | #ifdef CONFIG_PM_TRACE |
209 | &pm_trace_attr.attr, | 235 | &pm_trace_attr.attr, |
210 | #endif | 236 | #endif |
211 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) | 237 | #ifdef CONFIG_PM_SLEEP |
238 | &pm_async_attr.attr, | ||
239 | #ifdef CONFIG_PM_DEBUG | ||
212 | &pm_test_attr.attr, | 240 | &pm_test_attr.attr, |
213 | #endif | 241 | #endif |
242 | #endif | ||
214 | NULL, | 243 | NULL, |
215 | }; | 244 | }; |
216 | 245 | ||
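
pm_async_show()/pm_async_store() implement a boolean knob gating the asynchronous device suspend/resume noted in the comment; with the attribute wired into the group above it should surface as /sys/power/pm_async (assuming the group is the one registered on the standard power kobject). A hypothetical userspace sketch for flipping it; anything other than "0" or "1" earns -EINVAL from pm_async_store():

    #include <fcntl.h>
    #include <unistd.h>

    /* Returns 0 on success, -1 on failure. */
    static int set_pm_async(int enable)
    {
            int fd = open("/sys/power/pm_async", O_WRONLY);
            char c = enable ? '1' : '0';
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, &c, 1);
            close(fd);
            return n == 1 ? 0 : -1;
    }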
diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bdcf366..71ae29052ab6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only) | |||
88 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 88 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " |
89 | "(%d tasks refusing to freeze):\n", | 89 | "(%d tasks refusing to freeze):\n", |
90 | elapsed_csecs / 100, elapsed_csecs % 100, todo); | 90 | elapsed_csecs / 100, elapsed_csecs % 100, todo); |
91 | show_state(); | ||
92 | read_lock(&tasklist_lock); | 91 | read_lock(&tasklist_lock); |
93 | do_each_thread(g, p) { | 92 | do_each_thread(g, p) { |
94 | task_lock(p); | 93 | task_lock(p); |
95 | if (freezing(p) && !freezer_should_skip(p)) | 94 | if (freezing(p) && !freezer_should_skip(p)) |
96 | printk(KERN_ERR " %s\n", p->comm); | 95 | sched_show_task(p); |
97 | cancel_freezing(p); | 96 | cancel_freezing(p); |
98 | task_unlock(p); | 97 | task_unlock(p); |
99 | } while_each_thread(g, p); | 98 | } while_each_thread(g, p); |
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only) | |||
145 | if (nosig_only && should_send_signal(p)) | 144 | if (nosig_only && should_send_signal(p)) |
146 | continue; | 145 | continue; |
147 | 146 | ||
148 | if (cgroup_frozen(p)) | 147 | if (cgroup_freezing_or_frozen(p)) |
149 | continue; | 148 | continue; |
150 | 149 | ||
151 | thaw_process(p); | 150 | thaw_process(p); |
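
Two behavioral tweaks here: on a freeze failure, sched_show_task() now prints a backtrace only for each task that refused to freeze, rather than dumping system-wide state with show_state(); and thaw_tasks() now also skips tasks whose cgroup is still in the middle of freezing, not just those already fully frozen.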
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e4330..be861c26dda7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/console.h> | 26 | #include <linux/console.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/list.h> | 28 | #include <linux/list.h> |
29 | #include <linux/slab.h> | ||
29 | 30 | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/mmu_context.h> | 32 | #include <asm/mmu_context.h> |
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void) | |||
1181 | 1182 | ||
1182 | memory_bm_position_reset(©_bm); | 1183 | memory_bm_position_reset(©_bm); |
1183 | 1184 | ||
1184 | while (to_free_normal > 0 && to_free_highmem > 0) { | 1185 | while (to_free_normal > 0 || to_free_highmem > 0) { |
1185 | unsigned long pfn = memory_bm_next_pfn(©_bm); | 1186 | unsigned long pfn = memory_bm_next_pfn(©_bm); |
1186 | struct page *page = pfn_to_page(pfn); | 1187 | struct page *page = pfn_to_page(pfn); |
1187 | 1188 | ||
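
The operator change in free_unnecessary_pages() is a real fix: with '&&' the loop stopped as soon as either the normal or the highmem target reached zero, leaving excess pages of the other kind allocated; '||' keeps it scanning until both targets are met.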
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void) | |||
1500 | { | 1501 | { |
1501 | unsigned int nr_pages, nr_highmem; | 1502 | unsigned int nr_pages, nr_highmem; |
1502 | 1503 | ||
1503 | printk(KERN_INFO "PM: Creating hibernation image: \n"); | 1504 | printk(KERN_INFO "PM: Creating hibernation image:\n"); |
1504 | 1505 | ||
1505 | drain_local_pages(NULL); | 1506 | drain_local_pages(NULL); |
1506 | nr_pages = count_data_pages(); | 1507 | nr_pages = count_data_pages(); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..56e7dbb8b996 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/console.h> | 15 | #include <linux/console.h> |
16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/gfp.h> | ||
18 | 19 | ||
19 | #include "power.h" | 20 | #include "power.h" |
20 | 21 | ||
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state) | |||
189 | int suspend_devices_and_enter(suspend_state_t state) | 190 | int suspend_devices_and_enter(suspend_state_t state) |
190 | { | 191 | { |
191 | int error; | 192 | int error; |
193 | gfp_t saved_mask; | ||
192 | 194 | ||
193 | if (!suspend_ops) | 195 | if (!suspend_ops) |
194 | return -ENOSYS; | 196 | return -ENOSYS; |
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
199 | goto Close; | 201 | goto Close; |
200 | } | 202 | } |
201 | suspend_console(); | 203 | suspend_console(); |
204 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
202 | suspend_test_start(); | 205 | suspend_test_start(); |
203 | error = dpm_suspend_start(PMSG_SUSPEND); | 206 | error = dpm_suspend_start(PMSG_SUSPEND); |
204 | if (error) { | 207 | if (error) { |
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
215 | suspend_test_start(); | 218 | suspend_test_start(); |
216 | dpm_resume_end(PMSG_RESUME); | 219 | dpm_resume_end(PMSG_RESUME); |
217 | suspend_test_finish("resume devices"); | 220 | suspend_test_finish("resume devices"); |
221 | set_gfp_allowed_mask(saved_mask); | ||
218 | resume_console(); | 222 | resume_console(); |
219 | Close: | 223 | Close: |
220 | if (suspend_ops->end) | 224 | if (suspend_ops->end) |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9d..66824d71983a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/swapops.h> | 24 | #include <linux/swapops.h> |
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/slab.h> | ||
26 | 27 | ||
27 | #include "power.h" | 28 | #include "power.h" |
28 | 29 | ||
@@ -657,10 +658,6 @@ int swsusp_read(unsigned int *flags_p) | |||
657 | struct swsusp_info *header; | 658 | struct swsusp_info *header; |
658 | 659 | ||
659 | *flags_p = swsusp_header->flags; | 660 | *flags_p = swsusp_header->flags; |
660 | if (IS_ERR(resume_bdev)) { | ||
661 | pr_debug("PM: Image device not initialised\n"); | ||
662 | return PTR_ERR(resume_bdev); | ||
663 | } | ||
664 | 661 | ||
665 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 662 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
666 | error = snapshot_write_next(&snapshot, PAGE_SIZE); | 663 | error = snapshot_write_next(&snapshot, PAGE_SIZE); |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd1893..000000000000 --- a/kernel/power/swsusp.c +++ /dev/null | |||
@@ -1,58 +0,0 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/swsusp.c | ||
3 | * | ||
4 | * This file provides code to write suspend image to swap and read it back. | ||
5 | * | ||
6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | ||
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | ||
8 | * | ||
9 | * This file is released under the GPLv2. | ||
10 | * | ||
11 | * I'd like to thank the following people for their work: | ||
12 | * | ||
13 | * Pavel Machek <pavel@ucw.cz>: | ||
14 | * Modifications, defectiveness pointing, being with me at the very beginning, | ||
15 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | ||
16 | * | ||
17 | * Steve Doddi <dirk@loth.demon.co.uk>: | ||
18 | * Support the possibility of hardware state restoring. | ||
19 | * | ||
20 | * Raph <grey.havens@earthling.net>: | ||
21 | * Support for preserving states of network devices and virtual console | ||
22 | * (including X and svgatextmode) | ||
23 | * | ||
24 | * Kurt Garloff <garloff@suse.de>: | ||
25 | * Straightened the critical function in order to prevent compilers from | ||
26 | * playing tricks with local variables. | ||
27 | * | ||
28 | * Andreas Mohr <a.mohr@mailto.de> | ||
29 | * | ||
30 | * Alex Badea <vampire@go.ro>: | ||
31 | * Fixed runaway init | ||
32 | * | ||
33 | * Rafael J. Wysocki <rjw@sisk.pl> | ||
34 | * Reworked the freeing of memory and the handling of swap | ||
35 | * | ||
36 | * More state savers are welcome. Especially for the scsi layer... | ||
37 | * | ||
38 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | ||
39 | */ | ||
40 | |||
41 | #include <linux/mm.h> | ||
42 | #include <linux/suspend.h> | ||
43 | #include <linux/spinlock.h> | ||
44 | #include <linux/kernel.h> | ||
45 | #include <linux/major.h> | ||
46 | #include <linux/swap.h> | ||
47 | #include <linux/pm.h> | ||
48 | #include <linux/swapops.h> | ||
49 | #include <linux/bootmem.h> | ||
50 | #include <linux/syscalls.h> | ||
51 | #include <linux/highmem.h> | ||
52 | #include <linux/time.h> | ||
53 | #include <linux/rbtree.h> | ||
54 | #include <linux/io.h> | ||
55 | |||
56 | #include "power.h" | ||
57 | |||
58 | int in_suspend __nosavedata = 0; | ||
diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f0..a8c96212bc1b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
195 | return res; | 195 | return res; |
196 | } | 196 | } |
197 | 197 | ||
198 | static void snapshot_deprecated_ioctl(unsigned int cmd) | ||
199 | { | ||
200 | if (printk_ratelimit()) | ||
201 | printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " | ||
202 | "be removed soon, update your suspend-to-disk " | ||
203 | "utilities\n", | ||
204 | __builtin_return_address(0), cmd); | ||
205 | } | ||
206 | |||
198 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, | 207 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, |
199 | unsigned long arg) | 208 | unsigned long arg) |
200 | { | 209 | { |
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
246 | data->frozen = 0; | 255 | data->frozen = 0; |
247 | break; | 256 | break; |
248 | 257 | ||
249 | case SNAPSHOT_CREATE_IMAGE: | ||
250 | case SNAPSHOT_ATOMIC_SNAPSHOT: | 258 | case SNAPSHOT_ATOMIC_SNAPSHOT: |
259 | snapshot_deprecated_ioctl(cmd); | ||
260 | case SNAPSHOT_CREATE_IMAGE: | ||
251 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 261 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
252 | error = -EPERM; | 262 | error = -EPERM; |
253 | break; | 263 | break; |
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
275 | data->ready = 0; | 285 | data->ready = 0; |
276 | break; | 286 | break; |
277 | 287 | ||
278 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
279 | case SNAPSHOT_SET_IMAGE_SIZE: | 288 | case SNAPSHOT_SET_IMAGE_SIZE: |
289 | snapshot_deprecated_ioctl(cmd); | ||
290 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
280 | image_size = arg; | 291 | image_size = arg; |
281 | break; | 292 | break; |
282 | 293 | ||
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
290 | error = put_user(size, (loff_t __user *)arg); | 301 | error = put_user(size, (loff_t __user *)arg); |
291 | break; | 302 | break; |
292 | 303 | ||
293 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
294 | case SNAPSHOT_AVAIL_SWAP: | 304 | case SNAPSHOT_AVAIL_SWAP: |
305 | snapshot_deprecated_ioctl(cmd); | ||
306 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
295 | size = count_swap_pages(data->swap, 1); | 307 | size = count_swap_pages(data->swap, 1); |
296 | size <<= PAGE_SHIFT; | 308 | size <<= PAGE_SHIFT; |
297 | error = put_user(size, (loff_t __user *)arg); | 309 | error = put_user(size, (loff_t __user *)arg); |
298 | break; | 310 | break; |
299 | 311 | ||
300 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
301 | case SNAPSHOT_GET_SWAP_PAGE: | 312 | case SNAPSHOT_GET_SWAP_PAGE: |
313 | snapshot_deprecated_ioctl(cmd); | ||
314 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
302 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 315 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
303 | error = -ENODEV; | 316 | error = -ENODEV; |
304 | break; | 317 | break; |
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
321 | break; | 334 | break; |
322 | 335 | ||
323 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ | 336 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ |
337 | snapshot_deprecated_ioctl(cmd); | ||
324 | if (!swsusp_swap_in_use()) { | 338 | if (!swsusp_swap_in_use()) { |
325 | /* | 339 | /* |
326 | * User space encodes device types as two-byte values, | 340 | * User space encodes device types as two-byte values, |
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
362 | break; | 376 | break; |
363 | 377 | ||
364 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | 378 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ |
379 | snapshot_deprecated_ioctl(cmd); | ||
365 | error = -EINVAL; | 380 | error = -EINVAL; |
366 | 381 | ||
367 | switch (arg) { | 382 | switch (arg) { |
@@ -405,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
405 | * User space encodes device types as two-byte values, | 420 | * User space encodes device types as two-byte values, |
406 | * so we need to recode them | 421 | * so we need to recode them |
407 | */ | 422 | */ |
408 | swdev = old_decode_dev(swap_area.dev); | 423 | swdev = new_decode_dev(swap_area.dev); |
409 | if (swdev) { | 424 | if (swdev) { |
410 | offset = swap_area.offset; | 425 | offset = swap_area.offset; |
411 | data->swap = swap_type_of(swdev, offset, NULL); | 426 | data->swap = swap_type_of(swdev, offset, NULL); |
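
Two things happen in the user.c hunks: deprecated ioctl names now emit a rate-limited notice, and the case labels are swapped so each old name falls through into its replacement's handler. The dispatch shape, reduced to one pair (do_create() stands in for the shared handler body and is hypothetical):

    switch (cmd) {
    case SNAPSHOT_ATOMIC_SNAPSHOT:      /* deprecated alias: warn, then... */
            snapshot_deprecated_ioctl(cmd);
            /* fall through */
    case SNAPSHOT_CREATE_IMAGE:         /* ...run the current handler */
            error = do_create(data);
            break;
    }

The final hunk is an unrelated correction: SNAPSHOT_SET_SWAP_AREA now decodes the userspace-supplied device number with new_decode_dev() instead of old_decode_dev().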
diff --git a/kernel/printk.c b/kernel/printk.c index 1751c456b71f..75077ad0b537 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/kexec.h> | 35 | #include <linux/kexec.h> |
36 | #include <linux/ratelimit.h> | 36 | #include <linux/ratelimit.h> |
37 | #include <linux/kmsg_dump.h> | 37 | #include <linux/kmsg_dump.h> |
38 | #include <linux/syslog.h> | ||
38 | 39 | ||
39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
40 | 41 | ||
@@ -69,8 +70,6 @@ int console_printk[4] = { | |||
69 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 70 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
70 | }; | 71 | }; |
71 | 72 | ||
72 | static int saved_console_loglevel = -1; | ||
73 | |||
74 | /* | 73 | /* |
75 | * Low level drivers may need that to know if they can schedule in | 74 | * Low level drivers may need that to know if they can schedule in |
76 | * their unblank() callback or not. So let's export it. | 75 | * their unblank() callback or not. So let's export it. |
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN]; | |||
145 | static char *log_buf = __log_buf; | 144 | static char *log_buf = __log_buf; |
146 | static int log_buf_len = __LOG_BUF_LEN; | 145 | static int log_buf_len = __LOG_BUF_LEN; |
147 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 146 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ |
147 | static int saved_console_loglevel = -1; | ||
148 | 148 | ||
149 | #ifdef CONFIG_KEXEC | 149 | #ifdef CONFIG_KEXEC |
150 | /* | 150 | /* |
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void) | |||
258 | } | 258 | } |
259 | #endif | 259 | #endif |
260 | 260 | ||
261 | /* | 261 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
262 | * Commands to do_syslog: | ||
263 | * | ||
264 | * 0 -- Close the log. Currently a NOP. | ||
265 | * 1 -- Open the log. Currently a NOP. | ||
266 | * 2 -- Read from the log. | ||
267 | * 3 -- Read all messages remaining in the ring buffer. | ||
268 | * 4 -- Read and clear all messages remaining in the ring buffer | ||
269 | * 5 -- Clear ring buffer. | ||
270 | * 6 -- Disable printk's to console | ||
271 | * 7 -- Enable printk's to console | ||
272 | * 8 -- Set level of messages printed to console | ||
273 | * 9 -- Return number of unread characters in the log buffer | ||
274 | * 10 -- Return size of the log buffer | ||
275 | */ | ||
276 | int do_syslog(int type, char __user *buf, int len) | ||
277 | { | 262 | { |
278 | unsigned i, j, limit, count; | 263 | unsigned i, j, limit, count; |
279 | int do_clear = 0; | 264 | int do_clear = 0; |
280 | char c; | 265 | char c; |
281 | int error = 0; | 266 | int error = 0; |
282 | 267 | ||
283 | error = security_syslog(type); | 268 | error = security_syslog(type, from_file); |
284 | if (error) | 269 | if (error) |
285 | return error; | 270 | return error; |
286 | 271 | ||
287 | switch (type) { | 272 | switch (type) { |
288 | case 0: /* Close log */ | 273 | case SYSLOG_ACTION_CLOSE: /* Close log */ |
289 | break; | 274 | break; |
290 | case 1: /* Open log */ | 275 | case SYSLOG_ACTION_OPEN: /* Open log */ |
291 | break; | 276 | break; |
292 | case 2: /* Read from log */ | 277 | case SYSLOG_ACTION_READ: /* Read from log */ |
293 | error = -EINVAL; | 278 | error = -EINVAL; |
294 | if (!buf || len < 0) | 279 | if (!buf || len < 0) |
295 | goto out; | 280 | goto out; |
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
320 | if (!error) | 305 | if (!error) |
321 | error = i; | 306 | error = i; |
322 | break; | 307 | break; |
323 | case 4: /* Read/clear last kernel messages */ | 308 | /* Read/clear last kernel messages */ |
309 | case SYSLOG_ACTION_READ_CLEAR: | ||
324 | do_clear = 1; | 310 | do_clear = 1; |
325 | /* FALL THRU */ | 311 | /* FALL THRU */ |
326 | case 3: /* Read last kernel messages */ | 312 | /* Read last kernel messages */ |
313 | case SYSLOG_ACTION_READ_ALL: | ||
327 | error = -EINVAL; | 314 | error = -EINVAL; |
328 | if (!buf || len < 0) | 315 | if (!buf || len < 0) |
329 | goto out; | 316 | goto out; |
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len) | |||
376 | } | 363 | } |
377 | } | 364 | } |
378 | break; | 365 | break; |
379 | case 5: /* Clear ring buffer */ | 366 | /* Clear ring buffer */ |
367 | case SYSLOG_ACTION_CLEAR: | ||
380 | logged_chars = 0; | 368 | logged_chars = 0; |
381 | break; | 369 | break; |
382 | case 6: /* Disable logging to console */ | 370 | /* Disable logging to console */ |
371 | case SYSLOG_ACTION_CONSOLE_OFF: | ||
383 | if (saved_console_loglevel == -1) | 372 | if (saved_console_loglevel == -1) |
384 | saved_console_loglevel = console_loglevel; | 373 | saved_console_loglevel = console_loglevel; |
385 | console_loglevel = minimum_console_loglevel; | 374 | console_loglevel = minimum_console_loglevel; |
386 | break; | 375 | break; |
387 | case 7: /* Enable logging to console */ | 376 | /* Enable logging to console */ |
377 | case SYSLOG_ACTION_CONSOLE_ON: | ||
388 | if (saved_console_loglevel != -1) { | 378 | if (saved_console_loglevel != -1) { |
389 | console_loglevel = saved_console_loglevel; | 379 | console_loglevel = saved_console_loglevel; |
390 | saved_console_loglevel = -1; | 380 | saved_console_loglevel = -1; |
391 | } | 381 | } |
392 | break; | 382 | break; |
393 | case 8: /* Set level of messages printed to console */ | 383 | /* Set level of messages printed to console */ |
384 | case SYSLOG_ACTION_CONSOLE_LEVEL: | ||
394 | error = -EINVAL; | 385 | error = -EINVAL; |
395 | if (len < 1 || len > 8) | 386 | if (len < 1 || len > 8) |
396 | goto out; | 387 | goto out; |
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
401 | saved_console_loglevel = -1; | 392 | saved_console_loglevel = -1; |
402 | error = 0; | 393 | error = 0; |
403 | break; | 394 | break; |
404 | case 9: /* Number of chars in the log buffer */ | 395 | /* Number of chars in the log buffer */ |
396 | case SYSLOG_ACTION_SIZE_UNREAD: | ||
405 | error = log_end - log_start; | 397 | error = log_end - log_start; |
406 | break; | 398 | break; |
407 | case 10: /* Size of the log buffer */ | 399 | /* Size of the log buffer */ |
400 | case SYSLOG_ACTION_SIZE_BUFFER: | ||
408 | error = log_buf_len; | 401 | error = log_buf_len; |
409 | break; | 402 | break; |
410 | default: | 403 | default: |
@@ -417,7 +410,7 @@ out: | |||
417 | 410 | ||
418 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | 411 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) |
419 | { | 412 | { |
420 | return do_syslog(type, buf, len); | 413 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
421 | } | 414 | } |
422 | 415 | ||
423 | /* | 416 | /* |
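
The numeric switch labels in do_syslog() move to named constants; the values are unchanged, so the syslog(2) ABI is unaffected. For reference, the mapping in <linux/syslog.h> matches the comment block that was removed:

    #define SYSLOG_ACTION_CLOSE          0
    #define SYSLOG_ACTION_OPEN           1
    #define SYSLOG_ACTION_READ           2
    #define SYSLOG_ACTION_READ_ALL       3
    #define SYSLOG_ACTION_READ_CLEAR     4
    #define SYSLOG_ACTION_CLEAR          5
    #define SYSLOG_ACTION_CONSOLE_OFF    6
    #define SYSLOG_ACTION_CONSOLE_ON     7
    #define SYSLOG_ACTION_CONSOLE_LEVEL  8
    #define SYSLOG_ACTION_SIZE_UNREAD    9
    #define SYSLOG_ACTION_SIZE_BUFFER   10

The new from_file argument lets security_syslog() distinguish reads through /proc/kmsg (SYSLOG_FROM_FILE) from direct syscalls (SYSLOG_FROM_CALL).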
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 23bd09cd042e..42ad8ae729a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/regset.h> | ||
25 | 26 | ||
26 | 27 | ||
27 | /* | 28 | /* |
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data) | |||
511 | return 0; | 512 | return 0; |
512 | } | 513 | } |
513 | 514 | ||
515 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
516 | |||
517 | static const struct user_regset * | ||
518 | find_regset(const struct user_regset_view *view, unsigned int type) | ||
519 | { | ||
520 | const struct user_regset *regset; | ||
521 | int n; | ||
522 | |||
523 | for (n = 0; n < view->n; ++n) { | ||
524 | regset = view->regsets + n; | ||
525 | if (regset->core_note_type == type) | ||
526 | return regset; | ||
527 | } | ||
528 | |||
529 | return NULL; | ||
530 | } | ||
531 | |||
532 | static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | ||
533 | struct iovec *kiov) | ||
534 | { | ||
535 | const struct user_regset_view *view = task_user_regset_view(task); | ||
536 | const struct user_regset *regset = find_regset(view, type); | ||
537 | int regset_no; | ||
538 | |||
539 | if (!regset || (kiov->iov_len % regset->size) != 0) | ||
540 | return -EINVAL; | ||
541 | |||
542 | regset_no = regset - view->regsets; | ||
543 | kiov->iov_len = min(kiov->iov_len, | ||
544 | (__kernel_size_t) (regset->n * regset->size)); | ||
545 | |||
546 | if (req == PTRACE_GETREGSET) | ||
547 | return copy_regset_to_user(task, view, regset_no, 0, | ||
548 | kiov->iov_len, kiov->iov_base); | ||
549 | else | ||
550 | return copy_regset_from_user(task, view, regset_no, 0, | ||
551 | kiov->iov_len, kiov->iov_base); | ||
552 | } | ||
553 | |||
554 | #endif | ||
555 | |||
514 | int ptrace_request(struct task_struct *child, long request, | 556 | int ptrace_request(struct task_struct *child, long request, |
515 | long addr, long data) | 557 | long addr, long data) |
516 | { | 558 | { |
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
573 | return 0; | 615 | return 0; |
574 | return ptrace_resume(child, request, SIGKILL); | 616 | return ptrace_resume(child, request, SIGKILL); |
575 | 617 | ||
618 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
619 | case PTRACE_GETREGSET: | ||
620 | case PTRACE_SETREGSET: | ||
621 | { | ||
622 | struct iovec kiov; | ||
623 | struct iovec __user *uiov = (struct iovec __user *) data; | ||
624 | |||
625 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
626 | return -EFAULT; | ||
627 | |||
628 | if (__get_user(kiov.iov_base, &uiov->iov_base) || | ||
629 | __get_user(kiov.iov_len, &uiov->iov_len)) | ||
630 | return -EFAULT; | ||
631 | |||
632 | ret = ptrace_regset(child, request, addr, &kiov); | ||
633 | if (!ret) | ||
634 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
635 | break; | ||
636 | } | ||
637 | #endif | ||
576 | default: | 638 | default: |
577 | break; | 639 | break; |
578 | } | 640 | } |
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
711 | else | 773 | else |
712 | ret = ptrace_setsiginfo(child, &siginfo); | 774 | ret = ptrace_setsiginfo(child, &siginfo); |
713 | break; | 775 | break; |
776 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
777 | case PTRACE_GETREGSET: | ||
778 | case PTRACE_SETREGSET: | ||
779 | { | ||
780 | struct iovec kiov; | ||
781 | struct compat_iovec __user *uiov = | ||
782 | (struct compat_iovec __user *) datap; | ||
783 | compat_uptr_t ptr; | ||
784 | compat_size_t len; | ||
785 | |||
786 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
787 | return -EFAULT; | ||
788 | |||
789 | if (__get_user(ptr, &uiov->iov_base) || | ||
790 | __get_user(len, &uiov->iov_len)) | ||
791 | return -EFAULT; | ||
792 | |||
793 | kiov.iov_base = compat_ptr(ptr); | ||
794 | kiov.iov_len = len; | ||
795 | |||
796 | ret = ptrace_regset(child, request, addr, &kiov); | ||
797 | if (!ret) | ||
798 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
799 | break; | ||
800 | } | ||
801 | #endif | ||
714 | 802 | ||
715 | default: | 803 | default: |
716 | ret = ptrace_request(child, request, addr, data); | 804 | ret = ptrace_request(child, request, addr, data); |
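
PTRACE_GETREGSET/PTRACE_SETREGSET give one arch-generic way to transfer any register set: addr carries an ELF note type selecting the regset, data points at an iovec, and the kernel clamps iov_len and writes it back so the tracer learns how much was actually transferred. A hypothetical tracer-side sketch for the general-purpose registers:

    #include <elf.h>            /* NT_PRSTATUS */
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>
    #include <sys/user.h>       /* struct user_regs_struct */

    static long read_gp_regs(pid_t pid, struct user_regs_struct *regs)
    {
            struct iovec iov = {
                    .iov_base = regs,
                    .iov_len  = sizeof(*regs),
            };

            /* On success, iov.iov_len holds the number of bytes filled in. */
            return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
    }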
diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 000000000000..74e2e6114927 --- /dev/null +++ b/kernel/range.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* | ||
2 | * Range add and subtract | ||
3 | */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/sort.h> | ||
7 | |||
8 | #include <linux/range.h> | ||
9 | |||
10 | #ifndef ARRAY_SIZE | ||
11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
12 | #endif | ||
13 | |||
14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | ||
15 | { | ||
16 | if (start >= end) | ||
17 | return nr_range; | ||
18 | |||
19 | /* Out of slots: */ | ||
20 | if (nr_range >= az) | ||
21 | return nr_range; | ||
22 | |||
23 | range[nr_range].start = start; | ||
24 | range[nr_range].end = end; | ||
25 | |||
26 | nr_range++; | ||
27 | |||
28 | return nr_range; | ||
29 | } | ||
30 | |||
31 | int add_range_with_merge(struct range *range, int az, int nr_range, | ||
32 | u64 start, u64 end) | ||
33 | { | ||
34 | int i; | ||
35 | |||
36 | if (start >= end) | ||
37 | return nr_range; | ||
38 | |||
39 | /* Try to merge it with old one: */ | ||
40 | for (i = 0; i < nr_range; i++) { | ||
41 | u64 final_start, final_end; | ||
42 | u64 common_start, common_end; | ||
43 | |||
44 | if (!range[i].end) | ||
45 | continue; | ||
46 | |||
47 | common_start = max(range[i].start, start); | ||
48 | common_end = min(range[i].end, end); | ||
49 | if (common_start > common_end) | ||
50 | continue; | ||
51 | |||
52 | final_start = min(range[i].start, start); | ||
53 | final_end = max(range[i].end, end); | ||
54 | |||
55 | range[i].start = final_start; | ||
56 | range[i].end = final_end; | ||
57 | return nr_range; | ||
58 | } | ||
59 | |||
60 | /* Need to add it: */ | ||
61 | return add_range(range, az, nr_range, start, end); | ||
62 | } | ||
63 | |||
64 | void subtract_range(struct range *range, int az, u64 start, u64 end) | ||
65 | { | ||
66 | int i, j; | ||
67 | |||
68 | if (start >= end) | ||
69 | return; | ||
70 | |||
71 | for (j = 0; j < az; j++) { | ||
72 | if (!range[j].end) | ||
73 | continue; | ||
74 | |||
75 | if (start <= range[j].start && end >= range[j].end) { | ||
76 | range[j].start = 0; | ||
77 | range[j].end = 0; | ||
78 | continue; | ||
79 | } | ||
80 | |||
81 | if (start <= range[j].start && end < range[j].end && | ||
82 | range[j].start < end) { | ||
83 | range[j].start = end; | ||
84 | continue; | ||
85 | } | ||
86 | |||
87 | |||
88 | if (start > range[j].start && end >= range[j].end && | ||
89 | range[j].end > start) { | ||
90 | range[j].end = start; | ||
91 | continue; | ||
92 | } | ||
93 | |||
94 | if (start > range[j].start && end < range[j].end) { | ||
95 | /* Find the new spare: */ | ||
96 | for (i = 0; i < az; i++) { | ||
97 | if (range[i].end == 0) | ||
98 | break; | ||
99 | } | ||
100 | if (i < az) { | ||
101 | range[i].end = range[j].end; | ||
102 | range[i].start = end; | ||
103 | } else { | ||
104 | printk(KERN_ERR "run of slot in ranges\n"); | ||
105 | } | ||
106 | range[j].end = start; | ||
107 | continue; | ||
108 | } | ||
109 | } | ||
110 | } | ||
111 | |||
112 | static int cmp_range(const void *x1, const void *x2) | ||
113 | { | ||
114 | const struct range *r1 = x1; | ||
115 | const struct range *r2 = x2; | ||
116 | s64 start1, start2; | ||
117 | |||
118 | start1 = r1->start; | ||
119 | start2 = r2->start; | ||
120 | |||
121 | return start1 - start2; | ||
122 | } | ||
123 | |||
124 | int clean_sort_range(struct range *range, int az) | ||
125 | { | ||
126 | int i, j, k = az - 1, nr_range = 0; | ||
127 | |||
128 | for (i = 0; i < k; i++) { | ||
129 | if (range[i].end) | ||
130 | continue; | ||
131 | for (j = k; j > i; j--) { | ||
132 | if (range[j].end) { | ||
133 | k = j; | ||
134 | break; | ||
135 | } | ||
136 | } | ||
137 | if (j == i) | ||
138 | break; | ||
139 | range[i].start = range[k].start; | ||
140 | range[i].end = range[k].end; | ||
141 | range[k].start = 0; | ||
142 | range[k].end = 0; | ||
143 | k--; | ||
144 | } | ||
145 | /* count it */ | ||
146 | for (i = 0; i < az; i++) { | ||
147 | if (!range[i].end) { | ||
148 | nr_range = i; | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | |||
153 | /* sort them */ | ||
154 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
155 | |||
156 | return nr_range; | ||
157 | } | ||
158 | |||
159 | void sort_range(struct range *range, int nr_range) | ||
160 | { | ||
161 | /* sort them */ | ||
162 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
163 | } | ||
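
Illustrative use of the new helpers; the values are made up, and a slot with end == 0 means "empty" throughout this API:

    #include <linux/kernel.h>   /* ARRAY_SIZE */
    #include <linux/range.h>

    static int range_demo(void)
    {
            struct range ranges[8] = {};
            int nr = 0;

            nr = add_range_with_merge(ranges, ARRAY_SIZE(ranges), nr, 0x000, 0x400);
            nr = add_range_with_merge(ranges, ARRAY_SIZE(ranges), nr, 0x400, 0x800);
            /* the two abut, so nr stays 1 and ranges[0] is [0x000, 0x800) */

            subtract_range(ranges, ARRAY_SIZE(ranges), 0x100, 0x200);
            /* splits ranges[0] into [0x000, 0x100) plus a spare [0x200, 0x800) */

            return clean_sort_range(ranges, ARRAY_SIZE(ranges));
            /* empty slots compacted away, sorted by start; returns 2 */
    }

Note that cmp_range() truncates an s64 difference to int on return, so starts more than 2^31 apart can compare incorrectly.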
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 9b7fd4723878..49d808e833b0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -44,14 +44,73 @@ | |||
44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/kernel_stat.h> | ||
48 | #include <linux/hardirq.h> | ||
47 | 49 | ||
48 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 50 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
49 | static struct lock_class_key rcu_lock_key; | 51 | static struct lock_class_key rcu_lock_key; |
50 | struct lockdep_map rcu_lock_map = | 52 | struct lockdep_map rcu_lock_map = |
51 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | 53 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); |
52 | EXPORT_SYMBOL_GPL(rcu_lock_map); | 54 | EXPORT_SYMBOL_GPL(rcu_lock_map); |
55 | |||
56 | static struct lock_class_key rcu_bh_lock_key; | ||
57 | struct lockdep_map rcu_bh_lock_map = | ||
58 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); | ||
59 | EXPORT_SYMBOL_GPL(rcu_bh_lock_map); | ||
60 | |||
61 | static struct lock_class_key rcu_sched_lock_key; | ||
62 | struct lockdep_map rcu_sched_lock_map = | ||
63 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | ||
64 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | ||
53 | #endif | 65 | #endif |
54 | 66 | ||
67 | int rcu_scheduler_active __read_mostly; | ||
68 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
69 | |||
70 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
71 | |||
72 | int debug_lockdep_rcu_enabled(void) | ||
73 | { | ||
74 | return rcu_scheduler_active && debug_locks && | ||
75 | current->lockdep_recursion == 0; | ||
76 | } | ||
77 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | ||
78 | |||
79 | /** | ||
80 | * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? | ||
81 | * | ||
82 | * Check for bottom half being disabled, which covers both the | ||
83 | * CONFIG_PROVE_RCU and non-CONFIG_PROVE_RCU cases. Note that if someone uses | ||
84 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | ||
85 | * will show the situation. | ||
86 | * | ||
87 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | ||
88 | */ | ||
89 | int rcu_read_lock_bh_held(void) | ||
90 | { | ||
91 | if (!debug_lockdep_rcu_enabled()) | ||
92 | return 1; | ||
93 | return in_softirq(); | ||
94 | } | ||
95 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | ||
96 | |||
97 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
98 | |||
99 | /* | ||
100 | * This function is invoked towards the end of the scheduler's initialization | ||
101 | * process. Before this is called, the idle task might contain | ||
102 | * RCU read-side critical sections (during which time, this idle | ||
103 | * task is booting the system). After this function is called, the | ||
104 | * idle tasks are prohibited from containing RCU read-side critical | ||
105 | * sections. | ||
106 | */ | ||
107 | void rcu_scheduler_starting(void) | ||
108 | { | ||
109 | WARN_ON(num_online_cpus() != 1); | ||
110 | WARN_ON(nr_context_switches() > 0); | ||
111 | rcu_scheduler_active = 1; | ||
112 | } | ||
113 | |||
55 | /* | 114 | /* |
56 | * Awaken the corresponding synchronize_rcu() instance now that a | 115 | * Awaken the corresponding synchronize_rcu() instance now that a |
57 | * grace period has elapsed. | 116 | * grace period has elapsed. |
@@ -63,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head) | |||
63 | rcu = container_of(head, struct rcu_synchronize, head); | 122 | rcu = container_of(head, struct rcu_synchronize, head); |
64 | complete(&rcu->completion); | 123 | complete(&rcu->completion); |
65 | } | 124 | } |
125 | |||
126 | #ifdef CONFIG_PROVE_RCU | ||
127 | /* | ||
128 | * wrapper function to avoid #include problems. | ||
129 | */ | ||
130 | int rcu_my_thread_group_empty(void) | ||
131 | { | ||
132 | return thread_group_empty(current); | ||
133 | } | ||
134 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | ||
135 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
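
These hooks back CONFIG_PROVE_RCU: a dereference can now assert the conditions under which it is legal, and lockdep complains when none of them hold. A minimal sketch of the consumer side (gp, struct foo and use_foo() are hypothetical):

    struct foo *p;

    rcu_read_lock_bh();
    /* legal because we hold an RCU-bh read-side critical section */
    p = rcu_dereference_check(gp, rcu_read_lock_bh_held());
    if (p)
            use_foo(p);
    rcu_read_unlock_bh();

debug_lockdep_rcu_enabled() gates these checks until the scheduler is running, which avoids false positives from the boot-time idle task; rcu_scheduler_starting() marks that transition.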
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..58df55bf83ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | |||
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | ||
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | ||
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
64 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
65 | 68 | ||
66 | module_param(nreaders, int, 0444); | 69 | module_param(nreaders, int, 0444); |
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444); | |||
79 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | 82 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); |
80 | module_param(irqreader, int, 0444); | 83 | module_param(irqreader, int, 0444); |
81 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | 84 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); |
85 | module_param(fqs_duration, int, 0444); | ||
86 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | ||
87 | module_param(fqs_holdoff, int, 0444); | ||
88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | ||
89 | module_param(fqs_stutter, int, 0444); | ||
90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | ||
82 | module_param(torture_type, charp, 0444); | 91 | module_param(torture_type, charp, 0444); |
83 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
84 | 93 | ||
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; | |||
99 | static struct task_struct *stats_task; | 108 | static struct task_struct *stats_task; |
100 | static struct task_struct *shuffler_task; | 109 | static struct task_struct *shuffler_task; |
101 | static struct task_struct *stutter_task; | 110 | static struct task_struct *stutter_task; |
111 | static struct task_struct *fqs_task; | ||
102 | 112 | ||
103 | #define RCU_TORTURE_PIPE_LEN 10 | 113 | #define RCU_TORTURE_PIPE_LEN 10 |
104 | 114 | ||
@@ -263,6 +273,7 @@ struct rcu_torture_ops { | |||
263 | void (*deferred_free)(struct rcu_torture *p); | 273 | void (*deferred_free)(struct rcu_torture *p); |
264 | void (*sync)(void); | 274 | void (*sync)(void); |
265 | void (*cb_barrier)(void); | 275 | void (*cb_barrier)(void); |
276 | void (*fqs)(void); | ||
266 | int (*stats)(char *page); | 277 | int (*stats)(char *page); |
267 | int irq_capable; | 278 | int irq_capable; |
268 | char *name; | 279 | char *name; |
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
347 | .deferred_free = rcu_torture_deferred_free, | 358 | .deferred_free = rcu_torture_deferred_free, |
348 | .sync = synchronize_rcu, | 359 | .sync = synchronize_rcu, |
349 | .cb_barrier = rcu_barrier, | 360 | .cb_barrier = rcu_barrier, |
361 | .fqs = rcu_force_quiescent_state, | ||
350 | .stats = NULL, | 362 | .stats = NULL, |
351 | .irq_capable = 1, | 363 | .irq_capable = 1, |
352 | .name = "rcu" | 364 | .name = "rcu" |
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
388 | .deferred_free = rcu_sync_torture_deferred_free, | 400 | .deferred_free = rcu_sync_torture_deferred_free, |
389 | .sync = synchronize_rcu, | 401 | .sync = synchronize_rcu, |
390 | .cb_barrier = NULL, | 402 | .cb_barrier = NULL, |
403 | .fqs = rcu_force_quiescent_state, | ||
391 | .stats = NULL, | 404 | .stats = NULL, |
392 | .irq_capable = 1, | 405 | .irq_capable = 1, |
393 | .name = "rcu_sync" | 406 | .name = "rcu_sync" |
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
403 | .deferred_free = rcu_sync_torture_deferred_free, | 416 | .deferred_free = rcu_sync_torture_deferred_free, |
404 | .sync = synchronize_rcu_expedited, | 417 | .sync = synchronize_rcu_expedited, |
405 | .cb_barrier = NULL, | 418 | .cb_barrier = NULL, |
419 | .fqs = rcu_force_quiescent_state, | ||
406 | .stats = NULL, | 420 | .stats = NULL, |
407 | .irq_capable = 1, | 421 | .irq_capable = 1, |
408 | .name = "rcu_expedited" | 422 | .name = "rcu_expedited" |
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
465 | .deferred_free = rcu_bh_torture_deferred_free, | 479 | .deferred_free = rcu_bh_torture_deferred_free, |
466 | .sync = rcu_bh_torture_synchronize, | 480 | .sync = rcu_bh_torture_synchronize, |
467 | .cb_barrier = rcu_barrier_bh, | 481 | .cb_barrier = rcu_barrier_bh, |
482 | .fqs = rcu_bh_force_quiescent_state, | ||
468 | .stats = NULL, | 483 | .stats = NULL, |
469 | .irq_capable = 1, | 484 | .irq_capable = 1, |
470 | .name = "rcu_bh" | 485 | .name = "rcu_bh" |
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
480 | .deferred_free = rcu_sync_torture_deferred_free, | 495 | .deferred_free = rcu_sync_torture_deferred_free, |
481 | .sync = rcu_bh_torture_synchronize, | 496 | .sync = rcu_bh_torture_synchronize, |
482 | .cb_barrier = NULL, | 497 | .cb_barrier = NULL, |
498 | .fqs = rcu_bh_force_quiescent_state, | ||
483 | .stats = NULL, | 499 | .stats = NULL, |
484 | .irq_capable = 1, | 500 | .irq_capable = 1, |
485 | .name = "rcu_bh_sync" | 501 | .name = "rcu_bh_sync" |
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { | |||
621 | .deferred_free = rcu_sched_torture_deferred_free, | 637 | .deferred_free = rcu_sched_torture_deferred_free, |
622 | .sync = sched_torture_synchronize, | 638 | .sync = sched_torture_synchronize, |
623 | .cb_barrier = rcu_barrier_sched, | 639 | .cb_barrier = rcu_barrier_sched, |
640 | .fqs = rcu_sched_force_quiescent_state, | ||
624 | .stats = NULL, | 641 | .stats = NULL, |
625 | .irq_capable = 1, | 642 | .irq_capable = 1, |
626 | .name = "sched" | 643 | .name = "sched" |
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
636 | .deferred_free = rcu_sync_torture_deferred_free, | 653 | .deferred_free = rcu_sync_torture_deferred_free, |
637 | .sync = sched_torture_synchronize, | 654 | .sync = sched_torture_synchronize, |
638 | .cb_barrier = NULL, | 655 | .cb_barrier = NULL, |
656 | .fqs = rcu_sched_force_quiescent_state, | ||
639 | .stats = NULL, | 657 | .stats = NULL, |
640 | .name = "sched_sync" | 658 | .name = "sched_sync" |
641 | }; | 659 | }; |
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
650 | .deferred_free = rcu_sync_torture_deferred_free, | 668 | .deferred_free = rcu_sync_torture_deferred_free, |
651 | .sync = synchronize_sched_expedited, | 669 | .sync = synchronize_sched_expedited, |
652 | .cb_barrier = NULL, | 670 | .cb_barrier = NULL, |
671 | .fqs = rcu_sched_force_quiescent_state, | ||
653 | .stats = rcu_expedited_torture_stats, | 672 | .stats = rcu_expedited_torture_stats, |
654 | .irq_capable = 1, | 673 | .irq_capable = 1, |
655 | .name = "sched_expedited" | 674 | .name = "sched_expedited" |
656 | }; | 675 | }; |
657 | 676 | ||
658 | /* | 677 | /* |
678 | * RCU torture force-quiescent-state kthread. Repeatedly induces | ||
679 | * bursts of calls to force_quiescent_state(), increasing the probability | ||
680 | * of occurrence of some important types of race conditions. | ||
681 | */ | ||
682 | static int | ||
683 | rcu_torture_fqs(void *arg) | ||
684 | { | ||
685 | unsigned long fqs_resume_time; | ||
686 | int fqs_burst_remaining; | ||
687 | |||
688 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | ||
689 | do { | ||
690 | fqs_resume_time = jiffies + fqs_stutter * HZ; | ||
691 | while (jiffies - fqs_resume_time > LONG_MAX) { | ||
692 | schedule_timeout_interruptible(1); | ||
693 | } | ||
694 | fqs_burst_remaining = fqs_duration; | ||
695 | while (fqs_burst_remaining > 0) { | ||
696 | cur_ops->fqs(); | ||
697 | udelay(fqs_holdoff); | ||
698 | fqs_burst_remaining -= fqs_holdoff; | ||
699 | } | ||
700 | rcu_stutter_wait("rcu_torture_fqs"); | ||
701 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
702 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); | ||
703 | rcutorture_shutdown_absorb("rcu_torture_fqs"); | ||
704 | while (!kthread_should_stop()) | ||
705 | schedule_timeout_uninterruptible(1); | ||
706 | return 0; | ||
707 | } | ||
708 | |||
709 | /* | ||
659 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 710 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
660 | * for that pointed to by rcu_torture_current, freeing the old structure | 711 | * for that pointed to by rcu_torture_current, freeing the old structure |
661 | * after a series of grace periods (the "pipeline"). | 712 | * after a series of grace periods (the "pipeline"). |
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused) | |||
745 | 796 | ||
746 | idx = cur_ops->readlock(); | 797 | idx = cur_ops->readlock(); |
747 | completed = cur_ops->completed(); | 798 | completed = cur_ops->completed(); |
748 | p = rcu_dereference(rcu_torture_current); | 799 | p = rcu_dereference_check(rcu_torture_current, |
800 | rcu_read_lock_held() || | ||
801 | rcu_read_lock_bh_held() || | ||
802 | rcu_read_lock_sched_held() || | ||
803 | srcu_read_lock_held(&srcu_ctl)); | ||
749 | if (p == NULL) { | 804 | if (p == NULL) { |
750 | /* Leave because rcu_torture_writer is not yet underway */ | 805 | /* Leave because rcu_torture_writer is not yet underway */ |
751 | cur_ops->readunlock(idx); | 806 | cur_ops->readunlock(idx); |
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused) | |||
763 | /* Should not happen, but... */ | 818 | /* Should not happen, but... */ |
764 | pipe_count = RCU_TORTURE_PIPE_LEN; | 819 | pipe_count = RCU_TORTURE_PIPE_LEN; |
765 | } | 820 | } |
766 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 821 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
767 | completed = cur_ops->completed() - completed; | 822 | completed = cur_ops->completed() - completed; |
768 | if (completed > RCU_TORTURE_PIPE_LEN) { | 823 | if (completed > RCU_TORTURE_PIPE_LEN) { |
769 | /* Should not happen, but... */ | 824 | /* Should not happen, but... */ |
770 | completed = RCU_TORTURE_PIPE_LEN; | 825 | completed = RCU_TORTURE_PIPE_LEN; |
771 | } | 826 | } |
772 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 827 | __this_cpu_inc(rcu_torture_batch[completed]); |
773 | preempt_enable(); | 828 | preempt_enable(); |
774 | cur_ops->readunlock(idx); | 829 | cur_ops->readunlock(idx); |
775 | } | 830 | } |
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg) | |||
798 | do { | 853 | do { |
799 | if (irqreader && cur_ops->irq_capable) { | 854 | if (irqreader && cur_ops->irq_capable) { |
800 | if (!timer_pending(&t)) | 855 | if (!timer_pending(&t)) |
801 | mod_timer(&t, 1); | 856 | mod_timer(&t, jiffies + 1); |
802 | } | 857 | } |
803 | idx = cur_ops->readlock(); | 858 | idx = cur_ops->readlock(); |
804 | completed = cur_ops->completed(); | 859 | completed = cur_ops->completed(); |
805 | p = rcu_dereference(rcu_torture_current); | 860 | p = rcu_dereference_check(rcu_torture_current, |
861 | rcu_read_lock_held() || | ||
862 | rcu_read_lock_bh_held() || | ||
863 | rcu_read_lock_sched_held() || | ||
864 | srcu_read_lock_held(&srcu_ctl)); | ||
806 | if (p == NULL) { | 865 | if (p == NULL) { |
807 | /* Wait for rcu_torture_writer to get underway */ | 866 | /* Wait for rcu_torture_writer to get underway */ |
808 | cur_ops->readunlock(idx); | 867 | cur_ops->readunlock(idx); |
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg) | |||
818 | /* Should not happen, but... */ | 877 | /* Should not happen, but... */ |
819 | pipe_count = RCU_TORTURE_PIPE_LEN; | 878 | pipe_count = RCU_TORTURE_PIPE_LEN; |
820 | } | 879 | } |
821 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 880 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
822 | completed = cur_ops->completed() - completed; | 881 | completed = cur_ops->completed() - completed; |
823 | if (completed > RCU_TORTURE_PIPE_LEN) { | 882 | if (completed > RCU_TORTURE_PIPE_LEN) { |
824 | /* Should not happen, but... */ | 883 | /* Should not happen, but... */ |
825 | completed = RCU_TORTURE_PIPE_LEN; | 884 | completed = RCU_TORTURE_PIPE_LEN; |
826 | } | 885 | } |
827 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 886 | __this_cpu_inc(rcu_torture_batch[completed]); |
828 | preempt_enable(); | 887 | preempt_enable(); |
829 | cur_ops->readunlock(idx); | 888 | cur_ops->readunlock(idx); |
830 | schedule(); | 889 | schedule(); |
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag) | |||
1030 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1089 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1031 | "--- %s: nreaders=%d nfakewriters=%d " | 1090 | "--- %s: nreaders=%d nfakewriters=%d " |
1032 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1091 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1033 | "shuffle_interval=%d stutter=%d irqreader=%d\n", | 1092 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1093 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | ||
1034 | torture_type, tag, nrealreaders, nfakewriters, | 1094 | torture_type, tag, nrealreaders, nfakewriters, |
1035 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1095 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1036 | stutter, irqreader); | 1096 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); |
1037 | } | 1097 | } |
1038 | 1098 | ||
1039 | static struct notifier_block rcutorture_nb = { | 1099 | static struct notifier_block rcutorture_nb = { |
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void) | |||
1109 | } | 1169 | } |
1110 | stats_task = NULL; | 1170 | stats_task = NULL; |
1111 | 1171 | ||
1172 | if (fqs_task) { | ||
1173 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); | ||
1174 | kthread_stop(fqs_task); | ||
1175 | } | ||
1176 | fqs_task = NULL; | ||
1177 | |||
1112 | /* Wait for all RCU callbacks to fire. */ | 1178 | /* Wait for all RCU callbacks to fire. */ |
1113 | 1179 | ||
1114 | if (cur_ops->cb_barrier != NULL) | 1180 | if (cur_ops->cb_barrier != NULL) |
@@ -1154,6 +1220,11 @@ rcu_torture_init(void) | |||
1154 | mutex_unlock(&fullstop_mutex); | 1220 | mutex_unlock(&fullstop_mutex); |
1155 | return -EINVAL; | 1221 | return -EINVAL; |
1156 | } | 1222 | } |
1223 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | ||
1224 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | ||
1225 | "fqs_duration, fqs disabled.\n"); | ||
1226 | fqs_duration = 0; | ||
1227 | } | ||
1157 | if (cur_ops->init) | 1228 | if (cur_ops->init) |
1158 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1229 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
1159 | 1230 | ||
@@ -1282,6 +1353,19 @@ rcu_torture_init(void) | |||
1282 | goto unwind; | 1353 | goto unwind; |
1283 | } | 1354 | } |
1284 | } | 1355 | } |
1356 | if (fqs_duration < 0) | ||
1357 | fqs_duration = 0; | ||
1358 | if (fqs_duration) { | ||
1359 | /* Create the fqs thread */ | ||
1360 | fqs_task = kthread_run(rcu_torture_fqs, NULL, | ||
1361 | "rcu_torture_fqs"); | ||
1362 | if (IS_ERR(fqs_task)) { | ||
1363 | firsterr = PTR_ERR(fqs_task); | ||
1364 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); | ||
1365 | fqs_task = NULL; | ||
1366 | goto unwind; | ||
1367 | } | ||
1368 | } | ||
1285 | register_reboot_notifier(&rcutorture_nb); | 1369 | register_reboot_notifier(&rcutorture_nb); |
1286 | mutex_unlock(&fullstop_mutex); | 1370 | mutex_unlock(&fullstop_mutex); |
1287 | return 0; | 1371 | return 0; |
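
The three new torture knobs cooperate: every fqs_stutter seconds the kthread fires a burst lasting roughly fqs_duration microseconds, calling cur_ops->fqs() and then spinning for fqs_holdoff microseconds between calls. Loading the module with, say, "modprobe rcutorture fqs_duration=100 fqs_holdoff=10 fqs_stutter=5" (illustrative values) would invoke force_quiescent_state() ten times per burst, one burst every five seconds.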
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae9598f798..3ec8160fc75f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -46,7 +46,6 @@ | |||
46 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
49 | #include <linux/kernel_stat.h> | ||
50 | 49 | ||
51 | #include "rcutree.h" | 50 | #include "rcutree.h" |
52 | 51 | ||
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
66 | .signaled = RCU_GP_IDLE, \ | 65 | .signaled = RCU_GP_IDLE, \ |
67 | .gpnum = -300, \ | 66 | .gpnum = -300, \ |
68 | .completed = -300, \ | 67 | .completed = -300, \ |
69 | .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ | 68 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ |
70 | .orphan_cbs_list = NULL, \ | 69 | .orphan_cbs_list = NULL, \ |
71 | .orphan_cbs_tail = &name.orphan_cbs_list, \ | 70 | .orphan_cbs_tail = &name.orphan_cbs_list, \ |
72 | .orphan_qlen = 0, \ | 71 | .orphan_qlen = 0, \ |
73 | .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ | 72 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ |
74 | .n_force_qs = 0, \ | 73 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 74 | .n_force_qs_ngp = 0, \ |
76 | } | 75 | } |
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
81 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 80 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
82 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 81 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
83 | 82 | ||
84 | static int rcu_scheduler_active __read_mostly; | ||
85 | |||
86 | |||
87 | /* | 83 | /* |
88 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 84 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
89 | * permit this function to be invoked without holding the root rcu_node | 85 | * permit this function to be invoked without holding the root rcu_node |
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void) | |||
157 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | 153 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
158 | 154 | ||
159 | /* | 155 | /* |
156 | * Force a quiescent state for RCU BH. | ||
157 | */ | ||
158 | void rcu_bh_force_quiescent_state(void) | ||
159 | { | ||
160 | force_quiescent_state(&rcu_bh_state, 0); | ||
161 | } | ||
162 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | ||
163 | |||
164 | /* | ||
165 | * Force a quiescent state for RCU-sched. | ||
166 | */ | ||
167 | void rcu_sched_force_quiescent_state(void) | ||
168 | { | ||
169 | force_quiescent_state(&rcu_sched_state, 0); | ||
170 | } | ||
171 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
172 | |||
173 | /* | ||
160 | * Does the CPU have callbacks ready to be invoked? | 174 | * Does the CPU have callbacks ready to be invoked? |
161 | */ | 175 | */ |
162 | static int | 176 | static int |
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
439 | 453 | ||
440 | /* Only let one CPU complain about others per time interval. */ | 454 | /* Only let one CPU complain about others per time interval. */ |
441 | 455 | ||
442 | spin_lock_irqsave(&rnp->lock, flags); | 456 | raw_spin_lock_irqsave(&rnp->lock, flags); |
443 | delta = jiffies - rsp->jiffies_stall; | 457 | delta = jiffies - rsp->jiffies_stall; |
444 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 458 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
445 | spin_unlock_irqrestore(&rnp->lock, flags); | 459 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
446 | return; | 460 | return; |
447 | } | 461 | } |
448 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 462 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
452 | * due to CPU offlining. | 466 | * due to CPU offlining. |
453 | */ | 467 | */ |
454 | rcu_print_task_stall(rnp); | 468 | rcu_print_task_stall(rnp); |
455 | spin_unlock_irqrestore(&rnp->lock, flags); | 469 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
456 | 470 | ||
457 | /* OK, time to rat on our buddy... */ | 471 | /* OK, time to rat on our buddy... */ |
458 | 472 | ||
459 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); | 473 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); |
460 | rcu_for_each_leaf_node(rsp, rnp) { | 474 | rcu_for_each_leaf_node(rsp, rnp) { |
475 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
461 | rcu_print_task_stall(rnp); | 476 | rcu_print_task_stall(rnp); |
477 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
462 | if (rnp->qsmask == 0) | 478 | if (rnp->qsmask == 0) |
463 | continue; | 479 | continue; |
464 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 480 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
469 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 485 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
470 | trigger_all_cpu_backtrace(); | 486 | trigger_all_cpu_backtrace(); |
471 | 487 | ||
488 | /* If so configured, complain about tasks blocking the grace period. */ | ||
489 | |||
490 | rcu_print_detail_task_stall(rsp); | ||
491 | |||
472 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 492 | force_quiescent_state(rsp, 0); /* Kick them all. */ |
473 | } | 493 | } |
474 | 494 | ||
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
481 | smp_processor_id(), jiffies - rsp->gp_start); | 501 | smp_processor_id(), jiffies - rsp->gp_start); |
482 | trigger_all_cpu_backtrace(); | 502 | trigger_all_cpu_backtrace(); |
483 | 503 | ||
484 | spin_lock_irqsave(&rnp->lock, flags); | 504 | raw_spin_lock_irqsave(&rnp->lock, flags); |
485 | if ((long)(jiffies - rsp->jiffies_stall) >= 0) | 505 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
486 | rsp->jiffies_stall = | 506 | rsp->jiffies_stall = |
487 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 507 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
488 | spin_unlock_irqrestore(&rnp->lock, flags); | 508 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
489 | 509 | ||
490 | set_need_resched(); /* kick ourselves to get things going. */ | 510 | set_need_resched(); /* kick ourselves to get things going. */ |
491 | } | 511 | } |
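The stall-warning paths now compare jiffies values with ULONG_CMP_GE() instead of casting the unsigned difference to long. The two idioms compute the same modular comparison, but the cast leans on implementation-defined unsigned-to-signed conversion, while the macro states the wrap-safe intent in pure unsigned arithmetic and matches the grace-period counters becoming unsigned long later in this patch. A standalone demonstration; the macro is copied from kernel/rcutree.h below, the sample values are illustrative:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long stall_deadline = ULONG_MAX - 5;	/* set just before wrap */
	unsigned long now = 10;				/* jiffies has wrapped  */

	/* (now) - (deadline) is 16 in modular arithmetic, well under
	 * ULONG_MAX / 2, so the deadline reads as reached despite the wrap. */
	printf("%d\n", ULONG_CMP_GE(now, stall_deadline));	/* prints 1 */
	return 0;
}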
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) | |||
545 | local_irq_save(flags); | 565 | local_irq_save(flags); |
546 | rnp = rdp->mynode; | 566 | rnp = rdp->mynode; |
547 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ | 567 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ |
548 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 568 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
549 | local_irq_restore(flags); | 569 | local_irq_restore(flags); |
550 | return; | 570 | return; |
551 | } | 571 | } |
552 | __note_new_gpnum(rsp, rnp, rdp); | 572 | __note_new_gpnum(rsp, rnp, rdp); |
553 | spin_unlock_irqrestore(&rnp->lock, flags); | 573 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
554 | } | 574 | } |
555 | 575 | ||
556 | /* | 576 | /* |
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) | |||
609 | local_irq_save(flags); | 629 | local_irq_save(flags); |
610 | rnp = rdp->mynode; | 630 | rnp = rdp->mynode; |
611 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ | 631 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ |
612 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 632 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
613 | local_irq_restore(flags); | 633 | local_irq_restore(flags); |
614 | return; | 634 | return; |
615 | } | 635 | } |
616 | __rcu_process_gp_end(rsp, rnp, rdp); | 636 | __rcu_process_gp_end(rsp, rnp, rdp); |
617 | spin_unlock_irqrestore(&rnp->lock, flags); | 637 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
618 | } | 638 | } |
619 | 639 | ||
620 | /* | 640 | /* |
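note_new_gpnum() and rcu_process_gp_end() share a fast path: re-check the lockless snapshot first, then use raw_spin_trylock() rather than spinning with interrupts disabled; on contention they simply restore irqs and let a later invocation redo the work. A standalone model of the trylock-or-defer idiom, with a pthread mutex standing in for the node lock (all names are illustrative; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static int node_gpnum, local_gpnum;

static void note_new_gpnum_model(void)
{
	if (local_gpnum == node_gpnum ||		/* lockless pre-check */
	    pthread_mutex_trylock(&node_lock) != 0)	/* contended: defer   */
		return;					/* a later pass wins  */
	local_gpnum = node_gpnum;			/* locked slow path   */
	pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	node_gpnum = 1;
	note_new_gpnum_model();
	printf("local_gpnum = %d\n", local_gpnum);	/* prints 1 */
	return 0;
}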
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
659 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 679 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; |
660 | struct rcu_node *rnp = rcu_get_root(rsp); | 680 | struct rcu_node *rnp = rcu_get_root(rsp); |
661 | 681 | ||
662 | if (!cpu_needs_another_gp(rsp, rdp)) { | 682 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
683 | if (cpu_needs_another_gp(rsp, rdp)) | ||
684 | rsp->fqs_need_gp = 1; | ||
663 | if (rnp->completed == rsp->completed) { | 685 | if (rnp->completed == rsp->completed) { |
664 | spin_unlock_irqrestore(&rnp->lock, flags); | 686 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
665 | return; | 687 | return; |
666 | } | 688 | } |
667 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 689 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
668 | 690 | ||
669 | /* | 691 | /* |
670 | * Propagate new ->completed value to rcu_node structures | 692 | * Propagate new ->completed value to rcu_node structures |
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
672 | * of the next grace period to process their callbacks. | 694 | * of the next grace period to process their callbacks. |
673 | */ | 695 | */ |
674 | rcu_for_each_node_breadth_first(rsp, rnp) { | 696 | rcu_for_each_node_breadth_first(rsp, rnp) { |
675 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 697 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
676 | rnp->completed = rsp->completed; | 698 | rnp->completed = rsp->completed; |
677 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 699 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
678 | } | 700 | } |
679 | local_irq_restore(flags); | 701 | local_irq_restore(flags); |
680 | return; | 702 | return; |
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
695 | rnp->completed = rsp->completed; | 717 | rnp->completed = rsp->completed; |
696 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 718 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
697 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 719 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
698 | spin_unlock_irqrestore(&rnp->lock, flags); | 720 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
699 | return; | 721 | return; |
700 | } | 722 | } |
701 | 723 | ||
702 | spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 724 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ |
703 | 725 | ||
704 | 726 | ||
705 | /* Exclude any concurrent CPU-hotplug operations. */ | 727 | /* Exclude any concurrent CPU-hotplug operations. */ |
706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 728 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
707 | 729 | ||
708 | /* | 730 | /* |
709 | * Set the quiescent-state-needed bits in all the rcu_node | 731 | * Set the quiescent-state-needed bits in all the rcu_node |
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
723 | * irqs disabled. | 745 | * irqs disabled. |
724 | */ | 746 | */ |
725 | rcu_for_each_node_breadth_first(rsp, rnp) { | 747 | rcu_for_each_node_breadth_first(rsp, rnp) { |
726 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 748 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
727 | rcu_preempt_check_blocked_tasks(rnp); | 749 | rcu_preempt_check_blocked_tasks(rnp); |
728 | rnp->qsmask = rnp->qsmaskinit; | 750 | rnp->qsmask = rnp->qsmaskinit; |
729 | rnp->gpnum = rsp->gpnum; | 751 | rnp->gpnum = rsp->gpnum; |
730 | rnp->completed = rsp->completed; | 752 | rnp->completed = rsp->completed; |
731 | if (rnp == rdp->mynode) | 753 | if (rnp == rdp->mynode) |
732 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 754 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
733 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 755 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
734 | } | 756 | } |
735 | 757 | ||
736 | rnp = rcu_get_root(rsp); | 758 | rnp = rcu_get_root(rsp); |
737 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 759 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
738 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 760 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
739 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 761 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
740 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 762 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
741 | } | 763 | } |
742 | 764 | ||
743 | /* | 765 | /* |
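Every ->lock, ->onofflock and ->fqslock acquisition in this file moves from the spin_lock*() family to raw_spin_lock*(). The distinction matters on PREEMPT_RT, where spinlock_t becomes a sleeping lock: the grace-period core runs with interrupts disabled, so it must stay on raw_spinlock_t, which keeps spinning in every configuration. A minimal kernel-side sketch of the converted pattern (assumes a kernel build; the struct and functions are illustrative):

#include <linux/spinlock.h>

struct demo_node {
	raw_spinlock_t lock;		/* was: spinlock_t lock; */
	unsigned long qsmask;
};

static void demo_node_init(struct demo_node *dnp)
{
	raw_spin_lock_init(&dnp->lock);	/* was: spin_lock_init() */
	dnp->qsmask = 0;
}

static void demo_clear_mask(struct demo_node *dnp, unsigned long mask)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&dnp->lock, flags);
	dnp->qsmask &= ~mask;		/* never sleeps, even on PREEMPT_RT */
	raw_spin_unlock_irqrestore(&dnp->lock, flags);
}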
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
776 | if (!(rnp->qsmask & mask)) { | 798 | if (!(rnp->qsmask & mask)) { |
777 | 799 | ||
778 | /* Our bit has already been cleared, so done. */ | 800 | /* Our bit has already been cleared, so done. */ |
779 | spin_unlock_irqrestore(&rnp->lock, flags); | 801 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
780 | return; | 802 | return; |
781 | } | 803 | } |
782 | rnp->qsmask &= ~mask; | 804 | rnp->qsmask &= ~mask; |
783 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 805 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
784 | 806 | ||
785 | /* Other bits still set at this level, so done. */ | 807 | /* Other bits still set at this level, so done. */ |
786 | spin_unlock_irqrestore(&rnp->lock, flags); | 808 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
787 | return; | 809 | return; |
788 | } | 810 | } |
789 | mask = rnp->grpmask; | 811 | mask = rnp->grpmask; |
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
793 | 815 | ||
794 | break; | 816 | break; |
795 | } | 817 | } |
796 | spin_unlock_irqrestore(&rnp->lock, flags); | 818 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
797 | rnp_c = rnp; | 819 | rnp_c = rnp; |
798 | rnp = rnp->parent; | 820 | rnp = rnp->parent; |
799 | spin_lock_irqsave(&rnp->lock, flags); | 821 | raw_spin_lock_irqsave(&rnp->lock, flags); |
800 | WARN_ON_ONCE(rnp_c->qsmask); | 822 | WARN_ON_ONCE(rnp_c->qsmask); |
801 | } | 823 | } |
802 | 824 | ||
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
825 | struct rcu_node *rnp; | 847 | struct rcu_node *rnp; |
826 | 848 | ||
827 | rnp = rdp->mynode; | 849 | rnp = rdp->mynode; |
828 | spin_lock_irqsave(&rnp->lock, flags); | 850 | raw_spin_lock_irqsave(&rnp->lock, flags); |
829 | if (lastcomp != rnp->completed) { | 851 | if (lastcomp != rnp->completed) { |
830 | 852 | ||
831 | /* | 853 | /* |
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
837 | * race occurred. | 859 | * race occurred. |
838 | */ | 860 | */ |
839 | rdp->passed_quiesc = 0; /* try again later! */ | 861 | rdp->passed_quiesc = 0; /* try again later! */ |
840 | spin_unlock_irqrestore(&rnp->lock, flags); | 862 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
841 | return; | 863 | return; |
842 | } | 864 | } |
843 | mask = rdp->grpmask; | 865 | mask = rdp->grpmask; |
844 | if ((rnp->qsmask & mask) == 0) { | 866 | if ((rnp->qsmask & mask) == 0) { |
845 | spin_unlock_irqrestore(&rnp->lock, flags); | 867 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
846 | } else { | 868 | } else { |
847 | rdp->qs_pending = 0; | 869 | rdp->qs_pending = 0; |
848 | 870 | ||
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
906 | 928 | ||
907 | if (rdp->nxtlist == NULL) | 929 | if (rdp->nxtlist == NULL) |
908 | return; /* irqs disabled, so comparison is stable. */ | 930 | return; /* irqs disabled, so comparison is stable. */ |
909 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 931 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
910 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 932 | *rsp->orphan_cbs_tail = rdp->nxtlist; |
911 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 933 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; |
912 | rdp->nxtlist = NULL; | 934 | rdp->nxtlist = NULL; |
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
914 | rdp->nxttail[i] = &rdp->nxtlist; | 936 | rdp->nxttail[i] = &rdp->nxtlist; |
915 | rsp->orphan_qlen += rdp->qlen; | 937 | rsp->orphan_qlen += rdp->qlen; |
916 | rdp->qlen = 0; | 938 | rdp->qlen = 0; |
917 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 939 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
918 | } | 940 | } |
919 | 941 | ||
920 | /* | 942 | /* |
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
925 | unsigned long flags; | 947 | unsigned long flags; |
926 | struct rcu_data *rdp; | 948 | struct rcu_data *rdp; |
927 | 949 | ||
928 | spin_lock_irqsave(&rsp->onofflock, flags); | 950 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
929 | rdp = rsp->rda[smp_processor_id()]; | 951 | rdp = rsp->rda[smp_processor_id()]; |
930 | if (rsp->orphan_cbs_list == NULL) { | 952 | if (rsp->orphan_cbs_list == NULL) { |
931 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 953 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
932 | return; | 954 | return; |
933 | } | 955 | } |
934 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 956 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
937 | rsp->orphan_cbs_list = NULL; | 959 | rsp->orphan_cbs_list = NULL; |
938 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 960 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
939 | rsp->orphan_qlen = 0; | 961 | rsp->orphan_qlen = 0; |
940 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 962 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
941 | } | 963 | } |
942 | 964 | ||
943 | /* | 965 | /* |
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
953 | struct rcu_node *rnp; | 975 | struct rcu_node *rnp; |
954 | 976 | ||
955 | /* Exclude any attempts to start a new grace period. */ | 977 | /* Exclude any attempts to start a new grace period. */ |
956 | spin_lock_irqsave(&rsp->onofflock, flags); | 978 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
957 | 979 | ||
958 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 980 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
959 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ | 981 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ |
960 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 982 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
961 | do { | 983 | do { |
962 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 984 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
963 | rnp->qsmaskinit &= ~mask; | 985 | rnp->qsmaskinit &= ~mask; |
964 | if (rnp->qsmaskinit != 0) { | 986 | if (rnp->qsmaskinit != 0) { |
965 | if (rnp != rdp->mynode) | 987 | if (rnp != rdp->mynode) |
966 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 988 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
967 | break; | 989 | break; |
968 | } | 990 | } |
969 | if (rnp == rdp->mynode) | 991 | if (rnp == rdp->mynode) |
970 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 992 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
971 | else | 993 | else |
972 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 994 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
973 | mask = rnp->grpmask; | 995 | mask = rnp->grpmask; |
974 | rnp = rnp->parent; | 996 | rnp = rnp->parent; |
975 | } while (rnp != NULL); | 997 | } while (rnp != NULL); |
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
980 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1002 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock |
981 | * held leads to deadlock. | 1003 | * held leads to deadlock. |
982 | */ | 1004 | */ |
983 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1005 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
984 | rnp = rdp->mynode; | 1006 | rnp = rdp->mynode; |
985 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1007 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
986 | rcu_report_unblock_qs_rnp(rnp, flags); | 1008 | rcu_report_unblock_qs_rnp(rnp, flags); |
987 | else | 1009 | else |
988 | spin_unlock_irqrestore(&rnp->lock, flags); | 1010 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
989 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1011 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
990 | rcu_report_exp_rnp(rsp, rnp); | 1012 | rcu_report_exp_rnp(rsp, rnp); |
991 | 1013 | ||
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user) | |||
1144 | /* | 1166 | /* |
1145 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1167 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1146 | * have not yet encountered a quiescent state, using the function specified. | 1168 | * have not yet encountered a quiescent state, using the function specified. |
1147 | * Returns 1 if the current grace period ends while scanning (possibly | 1169 | * The caller must have suppressed start of new grace periods. |
1148 | * because we made it end). | ||
1149 | */ | 1170 | */ |
1150 | static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | 1171 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
1151 | int (*f)(struct rcu_data *)) | ||
1152 | { | 1172 | { |
1153 | unsigned long bit; | 1173 | unsigned long bit; |
1154 | int cpu; | 1174 | int cpu; |
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1158 | 1178 | ||
1159 | rcu_for_each_leaf_node(rsp, rnp) { | 1179 | rcu_for_each_leaf_node(rsp, rnp) { |
1160 | mask = 0; | 1180 | mask = 0; |
1161 | spin_lock_irqsave(&rnp->lock, flags); | 1181 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1162 | if (rnp->completed != lastcomp) { | 1182 | if (!rcu_gp_in_progress(rsp)) { |
1163 | spin_unlock_irqrestore(&rnp->lock, flags); | 1183 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1164 | return 1; | 1184 | return; |
1165 | } | 1185 | } |
1166 | if (rnp->qsmask == 0) { | 1186 | if (rnp->qsmask == 0) { |
1167 | spin_unlock_irqrestore(&rnp->lock, flags); | 1187 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1168 | continue; | 1188 | continue; |
1169 | } | 1189 | } |
1170 | cpu = rnp->grplo; | 1190 | cpu = rnp->grplo; |
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1173 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1193 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) |
1174 | mask |= bit; | 1194 | mask |= bit; |
1175 | } | 1195 | } |
1176 | if (mask != 0 && rnp->completed == lastcomp) { | 1196 | if (mask != 0) { |
1177 | 1197 | ||
1178 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 1198 | /* rcu_report_qs_rnp() releases rnp->lock. */ |
1179 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 1199 | rcu_report_qs_rnp(mask, rsp, rnp, flags); |
1180 | continue; | 1200 | continue; |
1181 | } | 1201 | } |
1182 | spin_unlock_irqrestore(&rnp->lock, flags); | 1202 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1183 | } | 1203 | } |
1184 | return 0; | ||
1185 | } | 1204 | } |
1186 | 1205 | ||
1187 | /* | 1206 | /* |
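rcu_process_dyntick() becomes force_qs_rnp(): the lastcomp snapshot and the int return disappear because the sole caller, force_quiescent_state(), now blocks new grace periods itself by setting fqs_active, so the scan only needs to bail if no grace period is in progress at all. The per-CPU quiescent-state predicate is still injected as a function pointer. A standalone model of the scan shape (types and the predicate are illustrative):

#include <stdio.h>

struct cpu_data { int dyntick_idle; };

static int saw_quiescent(struct cpu_data *cd)
{
	return cd->dyntick_idle;	/* idle CPUs count as quiescent */
}

static void force_qs_model(struct cpu_data *cpus, int ncpus,
			   int (*f)(struct cpu_data *))
{
	unsigned long mask = 0;
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++)
		if (f(&cpus[cpu]))
			mask |= 1UL << cpu;
	printf("quiescent mask: %#lx\n", mask);	/* reported up the tree */
}

int main(void)
{
	struct cpu_data cpus[4] = { {1}, {0}, {1}, {0} };

	force_qs_model(cpus, 4, saw_quiescent);	/* prints 0x5 */
	return 0;
}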
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
1191 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | 1210 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) |
1192 | { | 1211 | { |
1193 | unsigned long flags; | 1212 | unsigned long flags; |
1194 | long lastcomp; | ||
1195 | struct rcu_node *rnp = rcu_get_root(rsp); | 1213 | struct rcu_node *rnp = rcu_get_root(rsp); |
1196 | u8 signaled; | ||
1197 | u8 forcenow; | ||
1198 | 1214 | ||
1199 | if (!rcu_gp_in_progress(rsp)) | 1215 | if (!rcu_gp_in_progress(rsp)) |
1200 | return; /* No grace period in progress, nothing to force. */ | 1216 | return; /* No grace period in progress, nothing to force. */ |
1201 | if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1217 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { |
1202 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1218 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ |
1203 | return; /* Someone else is already on the job. */ | 1219 | return; /* Someone else is already on the job. */ |
1204 | } | 1220 | } |
1205 | if (relaxed && | 1221 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) |
1206 | (long)(rsp->jiffies_force_qs - jiffies) >= 0) | 1222 | goto unlock_fqs_ret; /* no emergency and done recently. */ |
1207 | goto unlock_ret; /* no emergency and done recently. */ | ||
1208 | rsp->n_force_qs++; | 1223 | rsp->n_force_qs++; |
1209 | spin_lock(&rnp->lock); | 1224 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1210 | lastcomp = rsp->gpnum - 1; | ||
1211 | signaled = rsp->signaled; | ||
1212 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 1225 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
1213 | if (!rcu_gp_in_progress(rsp)) { | 1226 | if (!rcu_gp_in_progress(rsp)) {
1214 | rsp->n_force_qs_ngp++; | 1227 | rsp->n_force_qs_ngp++; |
1215 | spin_unlock(&rnp->lock); | 1228 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1216 | goto unlock_ret; /* no GP in progress, time updated. */ | 1229 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
1217 | } | 1230 | } |
1218 | spin_unlock(&rnp->lock); | 1231 | rsp->fqs_active = 1; |
1219 | switch (signaled) { | 1232 | switch (rsp->signaled) { |
1220 | case RCU_GP_IDLE: | 1233 | case RCU_GP_IDLE: |
1221 | case RCU_GP_INIT: | 1234 | case RCU_GP_INIT: |
1222 | 1235 | ||
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1224 | 1237 | ||
1225 | case RCU_SAVE_DYNTICK: | 1238 | case RCU_SAVE_DYNTICK: |
1226 | 1239 | ||
1240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1227 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) | 1241 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) |
1228 | break; /* So gcc recognizes the dead code. */ | 1242 | break; /* So gcc recognizes the dead code. */ |
1229 | 1243 | ||
1230 | /* Record dyntick-idle state. */ | 1244 | /* Record dyntick-idle state. */ |
1231 | if (rcu_process_dyntick(rsp, lastcomp, | 1245 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
1232 | dyntick_save_progress_counter)) | 1246 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1233 | goto unlock_ret; | 1247 | if (rcu_gp_in_progress(rsp)) |
1234 | /* fall into next case. */ | ||
1235 | |||
1236 | case RCU_SAVE_COMPLETED: | ||
1237 | |||
1238 | /* Update state, record completion counter. */ | ||
1239 | forcenow = 0; | ||
1240 | spin_lock(&rnp->lock); | ||
1241 | if (lastcomp + 1 == rsp->gpnum && | ||
1242 | lastcomp == rsp->completed && | ||
1243 | rsp->signaled == signaled) { | ||
1244 | rsp->signaled = RCU_FORCE_QS; | 1248 | rsp->signaled = RCU_FORCE_QS; |
1245 | rsp->completed_fqs = lastcomp; | 1249 | break; |
1246 | forcenow = signaled == RCU_SAVE_COMPLETED; | ||
1247 | } | ||
1248 | spin_unlock(&rnp->lock); | ||
1249 | if (!forcenow) | ||
1250 | break; | ||
1251 | /* fall into next case. */ | ||
1252 | 1250 | ||
1253 | case RCU_FORCE_QS: | 1251 | case RCU_FORCE_QS: |
1254 | 1252 | ||
1255 | /* Check dyntick-idle state, send IPI to laggarts. */ | 1253 | /* Check dyntick-idle state, send IPI to laggarts. */ |
1256 | if (rcu_process_dyntick(rsp, rsp->completed_fqs, | 1254 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1257 | rcu_implicit_dynticks_qs)) | 1255 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); |
1258 | goto unlock_ret; | ||
1259 | 1256 | ||
1260 | /* Leave state in case more forcing is required. */ | 1257 | /* Leave state in case more forcing is required. */ |
1261 | 1258 | ||
1259 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
1262 | break; | 1260 | break; |
1263 | } | 1261 | } |
1264 | unlock_ret: | 1262 | rsp->fqs_active = 0; |
1265 | spin_unlock_irqrestore(&rsp->fqslock, flags); | 1263 | if (rsp->fqs_need_gp) { |
1264 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | ||
1265 | rsp->fqs_need_gp = 0; | ||
1266 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | ||
1267 | return; | ||
1268 | } | ||
1269 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
1270 | unlock_fqs_ret: | ||
1271 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | ||
1266 | } | 1272 | } |
1267 | 1273 | ||
1268 | #else /* #ifdef CONFIG_SMP */ | 1274 | #else /* #ifdef CONFIG_SMP */ |
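The rewritten force_quiescent_state() holds the root rnp->lock across the whole state machine, marks fqs_active while it works (dropping the root lock only around the force_qs_rnp() scans), and on exit honors fqs_need_gp: if rcu_start_gp() was held off in the meantime, the deferred grace period is started here, with rnp->lock handed to rcu_start_gp() to release. A standalone, locking-elided model of the two-flag handshake (names are illustrative; the explicit flag writes in main() stand in for concurrent interleaving):

#include <stdio.h>

static int fqs_active, fqs_need_gp, gp_started;

static void start_gp_model(void)
{
	if (fqs_active) {		/* scanner owns the state machine */
		fqs_need_gp = 1;	/* ask it to start the GP for us  */
		return;
	}
	gp_started = 1;
}

static void force_qs_model(void)
{
	fqs_active = 1;
	/* ... scan leaves for quiescent states here ... */
	fqs_active = 0;
	if (fqs_need_gp) {		/* replay the deferred request */
		fqs_need_gp = 0;
		start_gp_model();
	}
}

int main(void)
{
	fqs_active = 1;			/* a scan is "in flight"      */
	start_gp_model();		/* deferred: sets fqs_need_gp */
	fqs_active = 0;
	force_qs_model();		/* starts the deferred GP     */
	printf("gp_started = %d\n", gp_started);	/* prints 1 */
	return 0;
}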
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1290 | * If an RCU GP has gone long enough, go check for dyntick | 1296 | * If an RCU GP has gone long enough, go check for dyntick |
1291 | * idle CPUs and, if needed, send resched IPIs. | 1297 | * idle CPUs and, if needed, send resched IPIs. |
1292 | */ | 1298 | */ |
1293 | if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1299 | if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1294 | force_quiescent_state(rsp, 1); | 1300 | force_quiescent_state(rsp, 1); |
1295 | 1301 | ||
1296 | /* | 1302 | /* |
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1304 | 1310 | ||
1305 | /* Does this CPU require a not-yet-started grace period? */ | 1311 | /* Does this CPU require a not-yet-started grace period? */ |
1306 | if (cpu_needs_another_gp(rsp, rdp)) { | 1312 | if (cpu_needs_another_gp(rsp, rdp)) { |
1307 | spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); | 1313 | raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); |
1308 | rcu_start_gp(rsp, flags); /* releases above lock */ | 1314 | rcu_start_gp(rsp, flags); /* releases above lock */ |
1309 | } | 1315 | } |
1310 | 1316 | ||
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
1335 | * grace-period manipulations above. | 1341 | * grace-period manipulations above. |
1336 | */ | 1342 | */ |
1337 | smp_mb(); /* See above block comment. */ | 1343 | smp_mb(); /* See above block comment. */ |
1344 | |||
1345 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | ||
1346 | rcu_needs_cpu_flush(); | ||
1338 | } | 1347 | } |
1339 | 1348 | ||
1340 | static void | 1349 | static void |
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1369 | unsigned long nestflag; | 1378 | unsigned long nestflag; |
1370 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 1379 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
1371 | 1380 | ||
1372 | spin_lock_irqsave(&rnp_root->lock, nestflag); | 1381 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); |
1373 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | 1382 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ |
1374 | } | 1383 | } |
1375 | 1384 | ||
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1387 | force_quiescent_state(rsp, 0); | 1396 | force_quiescent_state(rsp, 0); |
1388 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1397 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1389 | rdp->qlen_last_fqs_check = rdp->qlen; | 1398 | rdp->qlen_last_fqs_check = rdp->qlen; |
1390 | } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1399 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1391 | force_quiescent_state(rsp, 1); | 1400 | force_quiescent_state(rsp, 1); |
1392 | local_irq_restore(flags); | 1401 | local_irq_restore(flags); |
1393 | } | 1402 | } |
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1520 | 1529 | ||
1521 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ | 1530 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ |
1522 | if (rcu_gp_in_progress(rsp) && | 1531 | if (rcu_gp_in_progress(rsp) && |
1523 | ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { | 1532 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { |
1524 | rdp->n_rp_need_fqs++; | 1533 | rdp->n_rp_need_fqs++; |
1525 | return 1; | 1534 | return 1; |
1526 | } | 1535 | } |
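Both call sites read rsp->jiffies_force_qs outside any lock, so the load goes through ACCESS_ONCE(), a volatile cast that forces the compiler to emit a single load rather than possibly re-fetching the value between the comparison and its use. A standalone demo with the classic macro definition (uses the gcc/clang typeof extension; the driver values are illustrative):

#include <limits.h>
#include <stdio.h>

#define ACCESS_ONCE(x)		(*(volatile typeof(x) *)&(x))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

static unsigned long jiffies_force_qs;	/* updated from other contexts */

int main(void)
{
	unsigned long jiffies = 1000;

	jiffies_force_qs = 900;		/* deadline is already behind us */
	if (ULONG_CMP_LT(ACCESS_ONCE(jiffies_force_qs), jiffies))
		printf("deadline passed: force a quiescent state\n");
	return 0;
}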
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu) | |||
1545 | /* | 1554 | /* |
1546 | * Check to see if any future RCU-related work will need to be done | 1555 | * Check to see if any future RCU-related work will need to be done |
1547 | * by the current CPU, even if none need be done immediately, returning | 1556 | * by the current CPU, even if none need be done immediately, returning |
1548 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1557 | * 1 if so. |
1549 | * an exported member of the RCU API. | ||
1550 | */ | 1558 | */ |
1551 | int rcu_needs_cpu(int cpu) | 1559 | static int rcu_needs_cpu_quick_check(int cpu) |
1552 | { | 1560 | { |
1553 | /* RCU callbacks either ready or pending? */ | 1561 | /* RCU callbacks either ready or pending? */ |
1554 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1562 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu) | |||
1556 | rcu_preempt_needs_cpu(cpu); | 1564 | rcu_preempt_needs_cpu(cpu); |
1557 | } | 1565 | } |
1558 | 1566 | ||
1559 | /* | ||
1560 | * This function is invoked towards the end of the scheduler's initialization | ||
1561 | * process. Before this is called, the idle task might contain | ||
1562 | * RCU read-side critical sections (during which time, this idle | ||
1563 | * task is booting the system). After this function is called, the | ||
1564 | * idle tasks are prohibited from containing RCU read-side critical | ||
1565 | * sections. | ||
1566 | */ | ||
1567 | void rcu_scheduler_starting(void) | ||
1568 | { | ||
1569 | WARN_ON(num_online_cpus() != 1); | ||
1570 | WARN_ON(nr_context_switches() > 0); | ||
1571 | rcu_scheduler_active = 1; | ||
1572 | } | ||
1573 | |||
1574 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 1567 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
1575 | static atomic_t rcu_barrier_cpu_count; | 1568 | static atomic_t rcu_barrier_cpu_count; |
1576 | static DEFINE_MUTEX(rcu_barrier_mutex); | 1569 | static DEFINE_MUTEX(rcu_barrier_mutex); |
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1659 | struct rcu_node *rnp = rcu_get_root(rsp); | 1652 | struct rcu_node *rnp = rcu_get_root(rsp); |
1660 | 1653 | ||
1661 | /* Set up local state, ensuring consistent view of global state. */ | 1654 | /* Set up local state, ensuring consistent view of global state. */ |
1662 | spin_lock_irqsave(&rnp->lock, flags); | 1655 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1663 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 1656 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
1664 | rdp->nxtlist = NULL; | 1657 | rdp->nxtlist = NULL; |
1665 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1658 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1669 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 1662 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1670 | #endif /* #ifdef CONFIG_NO_HZ */ | 1663 | #endif /* #ifdef CONFIG_NO_HZ */ |
1671 | rdp->cpu = cpu; | 1664 | rdp->cpu = cpu; |
1672 | spin_unlock_irqrestore(&rnp->lock, flags); | 1665 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1673 | } | 1666 | } |
1674 | 1667 | ||
1675 | /* | 1668 | /* |
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1687 | struct rcu_node *rnp = rcu_get_root(rsp); | 1680 | struct rcu_node *rnp = rcu_get_root(rsp); |
1688 | 1681 | ||
1689 | /* Set up local state, ensuring consistent view of global state. */ | 1682 | /* Set up local state, ensuring consistent view of global state. */ |
1690 | spin_lock_irqsave(&rnp->lock, flags); | 1683 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1691 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1684 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
1692 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1685 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
1693 | rdp->beenonline = 1; /* We have now been online. */ | 1686 | rdp->beenonline = 1; /* We have now been online. */ |
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1695 | rdp->qlen_last_fqs_check = 0; | 1688 | rdp->qlen_last_fqs_check = 0; |
1696 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1689 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1697 | rdp->blimit = blimit; | 1690 | rdp->blimit = blimit; |
1698 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1691 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1699 | 1692 | ||
1700 | /* | 1693 | /* |
1701 | * A new grace period might start here. If so, we won't be part | 1694 | * A new grace period might start here. If so, we won't be part |
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1703 | */ | 1696 | */ |
1704 | 1697 | ||
1705 | /* Exclude any attempts to start a new GP on large systems. */ | 1698 | /* Exclude any attempts to start a new GP on large systems. */ |
1706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1699 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
1707 | 1700 | ||
1708 | /* Add CPU to rcu_node bitmasks. */ | 1701 | /* Add CPU to rcu_node bitmasks. */ |
1709 | rnp = rdp->mynode; | 1702 | rnp = rdp->mynode; |
1710 | mask = rdp->grpmask; | 1703 | mask = rdp->grpmask; |
1711 | do { | 1704 | do { |
1712 | /* Exclude any attempts to start a new GP on small systems. */ | 1705 | /* Exclude any attempts to start a new GP on small systems. */ |
1713 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 1706 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
1714 | rnp->qsmaskinit |= mask; | 1707 | rnp->qsmaskinit |= mask; |
1715 | mask = rnp->grpmask; | 1708 | mask = rnp->grpmask; |
1716 | if (rnp == rdp->mynode) { | 1709 | if (rnp == rdp->mynode) { |
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1718 | rdp->completed = rnp->completed; | 1711 | rdp->completed = rnp->completed; |
1719 | rdp->passed_quiesc_completed = rnp->completed - 1; | 1712 | rdp->passed_quiesc_completed = rnp->completed - 1; |
1720 | } | 1713 | } |
1721 | spin_unlock(&rnp->lock); /* irqs already disabled. */ | 1714 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
1722 | rnp = rnp->parent; | 1715 | rnp = rnp->parent; |
1723 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | 1716 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
1724 | 1717 | ||
1725 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 1718 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
1726 | } | 1719 | } |
1727 | 1720 | ||
1728 | static void __cpuinit rcu_online_cpu(int cpu) | 1721 | static void __cpuinit rcu_online_cpu(int cpu) |
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1806 | */ | 1799 | */ |
1807 | static void __init rcu_init_one(struct rcu_state *rsp) | 1800 | static void __init rcu_init_one(struct rcu_state *rsp) |
1808 | { | 1801 | { |
1802 | static char *buf[] = { "rcu_node_level_0", | ||
1803 | "rcu_node_level_1", | ||
1804 | "rcu_node_level_2", | ||
1805 | "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ | ||
1809 | int cpustride = 1; | 1806 | int cpustride = 1; |
1810 | int i; | 1807 | int i; |
1811 | int j; | 1808 | int j; |
1812 | struct rcu_node *rnp; | 1809 | struct rcu_node *rnp; |
1813 | 1810 | ||
1811 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | ||
1812 | |||
1814 | /* Initialize the level-tracking arrays. */ | 1813 | /* Initialize the level-tracking arrays. */ |
1815 | 1814 | ||
1816 | for (i = 1; i < NUM_RCU_LVLS; i++) | 1815 | for (i = 1; i < NUM_RCU_LVLS; i++) |
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1823 | cpustride *= rsp->levelspread[i]; | 1822 | cpustride *= rsp->levelspread[i]; |
1824 | rnp = rsp->level[i]; | 1823 | rnp = rsp->level[i]; |
1825 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 1824 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
1826 | spin_lock_init(&rnp->lock); | 1825 | raw_spin_lock_init(&rnp->lock); |
1827 | lockdep_set_class(&rnp->lock, &rcu_node_class[i]); | 1826 | lockdep_set_class_and_name(&rnp->lock, |
1827 | &rcu_node_class[i], buf[i]); | ||
1828 | rnp->gpnum = 0; | 1828 | rnp->gpnum = 0; |
1829 | rnp->qsmask = 0; | 1829 | rnp->qsmask = 0; |
1830 | rnp->qsmaskinit = 0; | 1830 | rnp->qsmaskinit = 0; |
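Each level of the rcu_node tree now gets a distinct, human-readable lockdep class via lockdep_set_class_and_name(): legitimate leaf-to-parent nesting, as in rcu_report_qs_rnp(), would otherwise look like recursive locking of a single class, and any splat now names the level ("rcu_node_level_1") instead of printing a bare key address. The BUILD_BUG_ON() keeps buf[] in step with MAX_RCU_LVLS. A kernel-side sketch of the per-level-class pattern (assumes a kernel build; names are illustrative):

#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

#define DEMO_LVLS 3

static struct lock_class_key demo_key[DEMO_LVLS];
static char *demo_name[] = { "demo_level_0", "demo_level_1", "demo_level_2" };

static void demo_init_level(raw_spinlock_t *lock, int level)
{
	BUILD_BUG_ON(DEMO_LVLS > ARRAY_SIZE(demo_name));	/* keep in sync */
	raw_spin_lock_init(lock);
	lockdep_set_class_and_name(lock, &demo_key[level], demo_name[level]);
}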
@@ -1876,7 +1876,7 @@ do { \ | |||
1876 | 1876 | ||
1877 | void __init rcu_init(void) | 1877 | void __init rcu_init(void) |
1878 | { | 1878 | { |
1879 | int i; | 1879 | int cpu; |
1880 | 1880 | ||
1881 | rcu_bootup_announce(); | 1881 | rcu_bootup_announce(); |
1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
@@ -1896,8 +1896,8 @@ void __init rcu_init(void) | |||
1896 | * or the scheduler are operational. | 1896 | * or the scheduler are operational. |
1897 | */ | 1897 | */ |
1898 | cpu_notifier(rcu_cpu_notify, 0); | 1898 | cpu_notifier(rcu_cpu_notify, 0); |
1899 | for_each_online_cpu(i) | 1899 | for_each_online_cpu(cpu) |
1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); | 1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
1901 | } | 1901 | } |
1902 | 1902 | ||
1903 | #include "rcutree_plugin.h" | 1903 | #include "rcutree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046f63b2..4a525a30e08e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -90,12 +90,12 @@ struct rcu_dynticks { | |||
90 | * Definition for node within the RCU grace-period-detection hierarchy. | 90 | * Definition for node within the RCU grace-period-detection hierarchy. |
91 | */ | 91 | */ |
92 | struct rcu_node { | 92 | struct rcu_node { |
93 | spinlock_t lock; /* Root rcu_node's lock protects some */ | 93 | raw_spinlock_t lock; /* Root rcu_node's lock protects some */ |
94 | /* rcu_state fields as well as following. */ | 94 | /* rcu_state fields as well as following. */ |
95 | long gpnum; /* Current grace period for this node. */ | 95 | unsigned long gpnum; /* Current grace period for this node. */ |
96 | /* This will either be equal to or one */ | 96 | /* This will either be equal to or one */ |
97 | /* behind the root rcu_node's gpnum. */ | 97 | /* behind the root rcu_node's gpnum. */ |
98 | long completed; /* Last grace period completed for this node. */ | 98 | unsigned long completed; /* Last GP completed for this node. */ |
99 | /* This will either be equal to or one */ | 99 | /* This will either be equal to or one */ |
100 | /* behind the root rcu_node's gpnum. */ | 100 | /* behind the root rcu_node's gpnum. */ |
101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ | 101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ |
@@ -161,11 +161,11 @@ struct rcu_node { | |||
161 | /* Per-CPU data for read-copy update. */ | 161 | /* Per-CPU data for read-copy update. */ |
162 | struct rcu_data { | 162 | struct rcu_data { |
163 | /* 1) quiescent-state and grace-period handling : */ | 163 | /* 1) quiescent-state and grace-period handling : */ |
164 | long completed; /* Track rsp->completed gp number */ | 164 | unsigned long completed; /* Track rsp->completed gp number */ |
165 | /* in order to detect GP end. */ | 165 | /* in order to detect GP end. */ |
166 | long gpnum; /* Highest gp number that this CPU */ | 166 | unsigned long gpnum; /* Highest gp number that this CPU */ |
167 | /* is aware of having started. */ | 167 | /* is aware of having started. */ |
168 | long passed_quiesc_completed; | 168 | unsigned long passed_quiesc_completed; |
169 | /* Value of completed at time of qs. */ | 169 | /* Value of completed at time of qs. */ |
170 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 170 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
171 | bool qs_pending; /* Core waits for quiesc state. */ | 171 | bool qs_pending; /* Core waits for quiesc state. */ |
@@ -221,14 +221,14 @@ struct rcu_data { | |||
221 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 221 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
222 | 222 | ||
223 | /* 5) __rcu_pending() statistics. */ | 223 | /* 5) __rcu_pending() statistics. */ |
224 | long n_rcu_pending; /* rcu_pending() calls since boot. */ | 224 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
225 | long n_rp_qs_pending; | 225 | unsigned long n_rp_qs_pending; |
226 | long n_rp_cb_ready; | 226 | unsigned long n_rp_cb_ready; |
227 | long n_rp_cpu_needs_gp; | 227 | unsigned long n_rp_cpu_needs_gp; |
228 | long n_rp_gp_completed; | 228 | unsigned long n_rp_gp_completed; |
229 | long n_rp_gp_started; | 229 | unsigned long n_rp_gp_started; |
230 | long n_rp_need_fqs; | 230 | unsigned long n_rp_need_fqs; |
231 | long n_rp_need_nothing; | 231 | unsigned long n_rp_need_nothing; |
232 | 232 | ||
233 | int cpu; | 233 | int cpu; |
234 | }; | 234 | }; |
@@ -237,25 +237,36 @@ struct rcu_data { | |||
237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
240 | #define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ | 240 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
241 | #define RCU_FORCE_QS 4 /* Need to force quiescent state. */ | ||
242 | #ifdef CONFIG_NO_HZ | 241 | #ifdef CONFIG_NO_HZ |
243 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 242 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
244 | #else /* #ifdef CONFIG_NO_HZ */ | 243 | #else /* #ifdef CONFIG_NO_HZ */ |
245 | #define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED | 244 | #define RCU_SIGNAL_INIT RCU_FORCE_QS |
246 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 245 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
247 | 246 | ||
248 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 247 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
249 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 248 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
250 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ | 249 | |
251 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ | 250 | #ifdef CONFIG_PROVE_RCU |
252 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 251 | #define RCU_STALL_DELAY_DELTA (5 * HZ) |
253 | /* to take at least one */ | 252 | #else |
254 | /* scheduling clock irq */ | 253 | #define RCU_STALL_DELAY_DELTA 0 |
255 | /* before ratting on them. */ | 254 | #endif |
255 | |||
256 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) | ||
257 | /* for rsp->jiffies_stall */ | ||
258 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) | ||
259 | /* for rsp->jiffies_stall */ | ||
260 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | ||
261 | /* to take at least one */ | ||
262 | /* scheduling clock irq */ | ||
263 | /* before ratting on them. */ | ||
256 | 264 | ||
257 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 265 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
258 | 266 | ||
267 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | ||
268 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
269 | |||
259 | /* | 270 | /* |
260 | * RCU global state, including node hierarchy. This hierarchy is | 271 | * RCU global state, including node hierarchy. This hierarchy is |
261 | * represented in "heap" form in a dense array. The root (first level) | 272 | * represented in "heap" form in a dense array. The root (first level) |
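Two things land in this hunk besides the removal of the RCU_SAVE_COMPLETED state: CONFIG_PROVE_RCU builds pad both stall timeouts with RCU_STALL_DELAY_DELTA (5*HZ), since the extra checking slows the kernel enough to trigger false stall warnings, and the ULONG_CMP_GE()/ULONG_CMP_LT() helpers used throughout rcutree.c are defined here next to the counters they compare. A standalone illustration of the config-dependent slack pattern (HZ and the PROVE_RCU switch stand in for the kernel configuration):

#include <stdio.h>

#define HZ 1000

#ifdef PROVE_RCU			/* cf. CONFIG_PROVE_RCU */
#define STALL_DELAY_DELTA (5 * HZ)	/* instrumented kernels run slower */
#else
#define STALL_DELAY_DELTA 0
#endif

#define SECONDS_TILL_STALL_CHECK (10 * HZ + STALL_DELAY_DELTA)

int main(void)
{
	printf("first stall check after %d ticks\n", SECONDS_TILL_STALL_CHECK);
	return 0;
}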
@@ -277,12 +288,19 @@ struct rcu_state { | |||
277 | 288 | ||
278 | u8 signaled ____cacheline_internodealigned_in_smp; | 289 | u8 signaled ____cacheline_internodealigned_in_smp; |
279 | /* Force QS state. */ | 290 | /* Force QS state. */ |
280 | long gpnum; /* Current gp number. */ | 291 | u8 fqs_active; /* force_quiescent_state() */ |
281 | long completed; /* # of last completed gp. */ | 292 | /* is running. */ |
293 | u8 fqs_need_gp; /* A CPU was prevented from */ | ||
294 | /* starting a new grace */ | ||
295 | /* period because */ | ||
296 | /* force_quiescent_state() */ | ||
297 | /* was running. */ | ||
298 | unsigned long gpnum; /* Current gp number. */ | ||
299 | unsigned long completed; /* # of last completed gp. */ | ||
282 | 300 | ||
283 | /* End of fields guarded by root rcu_node's lock. */ | 301 | /* End of fields guarded by root rcu_node's lock. */ |
284 | 302 | ||
285 | spinlock_t onofflock; /* exclude on/offline and */ | 303 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
286 | /* starting new GP. Also */ | 304 | /* starting new GP. Also */ |
287 | /* protects the following */ | 305 | /* protects the following */ |
288 | /* orphan_cbs fields. */ | 306 | /* orphan_cbs fields. */ |
@@ -292,10 +310,8 @@ struct rcu_state { | |||
292 | /* going offline. */ | 310 | /* going offline. */ |
293 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | 311 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ |
294 | long orphan_qlen; /* Number of orphaned cbs. */ | 312 | long orphan_qlen; /* Number of orphaned cbs. */ |
295 | spinlock_t fqslock; /* Only one task forcing */ | 313 | raw_spinlock_t fqslock; /* Only one task forcing */ |
296 | /* quiescent states. */ | 314 | /* quiescent states. */ |
297 | long completed_fqs; /* Value of completed @ snap. */ | ||
298 | /* Protected by fqslock. */ | ||
299 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 315 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
300 | /* force_quiescent_state(). */ | 316 | /* force_quiescent_state(). */ |
301 | unsigned long n_force_qs; /* Number of calls to */ | 317 | unsigned long n_force_qs; /* Number of calls to */ |
@@ -319,8 +335,6 @@ struct rcu_state { | |||
319 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | 335 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ |
320 | /* GP were moved to root. */ | 336 | /* GP were moved to root. */ |
321 | 337 | ||
322 | #ifdef RCU_TREE_NONCORE | ||
323 | |||
324 | /* | 338 | /* |
325 | * RCU implementation internal declarations: | 339 | * RCU implementation internal declarations: |
326 | */ | 340 | */ |
@@ -335,7 +349,7 @@ extern struct rcu_state rcu_preempt_state; | |||
335 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 349 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
336 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 350 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
337 | 351 | ||
338 | #else /* #ifdef RCU_TREE_NONCORE */ | 352 | #ifndef RCU_TREE_NONCORE |
339 | 353 | ||
340 | /* Forward declarations for rcutree_plugin.h */ | 354 | /* Forward declarations for rcutree_plugin.h */ |
341 | static void rcu_bootup_announce(void); | 355 | static void rcu_bootup_announce(void); |
@@ -347,6 +361,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
347 | unsigned long flags); | 361 | unsigned long flags); |
348 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 362 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
349 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 363 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
364 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | ||
350 | static void rcu_print_task_stall(struct rcu_node *rnp); | 365 | static void rcu_print_task_stall(struct rcu_node *rnp); |
351 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 366 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
352 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 367 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
@@ -367,5 +382,6 @@ static int rcu_preempt_needs_cpu(int cpu); | |||
367 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 382 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
368 | static void rcu_preempt_send_cbs_to_orphanage(void); | 383 | static void rcu_preempt_send_cbs_to_orphanage(void); |
369 | static void __init __rcu_init_preempt(void); | 384 | static void __init __rcu_init_preempt(void); |
385 | static void rcu_needs_cpu_flush(void); | ||
370 | 386 | ||
371 | #endif /* #else #ifdef RCU_TREE_NONCORE */ | 387 | #endif /* #ifndef RCU_TREE_NONCORE */ |
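The header trailer flips from "#ifdef RCU_TREE_NONCORE ... #else" to unconditional declarations plus an #ifndef block: the data-structure and per-CPU declarations become visible to every includer (rcutree_trace.c defines RCU_TREE_NONCORE before including this header), while the forward declarations for rcutree_plugin.h stay private to the core build. A single-file sketch of the guard split (names are illustrative; compile with and without -DDEMO_NONCORE):

#include <stdio.h>

/* Pretend this block is the shared header: */
struct demo_state { int gpnum; };	/* visible to every includer */

#ifndef DEMO_NONCORE
static void demo_core_only(struct demo_state *s)	/* core build only */
{
	s->gpnum++;
}
#endif

int main(void)
{
	struct demo_state s = { 0 };

#ifndef DEMO_NONCORE
	demo_core_only(&s);
#endif
	printf("gpnum = %d\n", s.gpnum);
	return 0;
}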
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..79b53bda8943 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -62,6 +62,15 @@ long rcu_batches_completed(void) | |||
62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * Force a quiescent state for preemptible RCU. | ||
66 | */ | ||
67 | void rcu_force_quiescent_state(void) | ||
68 | { | ||
69 | force_quiescent_state(&rcu_preempt_state, 0); | ||
70 | } | ||
71 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
72 | |||
73 | /* | ||
65 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 74 | * Record a preemptable-RCU quiescent state for the specified CPU. Note |
66 | * that this just means that the task currently running on the CPU is | 75 | * that this just means that the task currently running on the CPU is |
67 | * not in a quiescent state. There might be any number of tasks blocked | 76 | * not in a quiescent state. There might be any number of tasks blocked |
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
102 | /* Possibly blocking in an RCU read-side critical section. */ | 111 | /* Possibly blocking in an RCU read-side critical section. */ |
103 | rdp = rcu_preempt_state.rda[cpu]; | 112 | rdp = rcu_preempt_state.rda[cpu]; |
104 | rnp = rdp->mynode; | 113 | rnp = rdp->mynode; |
105 | spin_lock_irqsave(&rnp->lock, flags); | 114 | raw_spin_lock_irqsave(&rnp->lock, flags); |
106 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 115 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
107 | t->rcu_blocked_node = rnp; | 116 | t->rcu_blocked_node = rnp; |
108 | 117 | ||
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
123 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 132 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
124 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 133 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; |
125 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 134 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); |
126 | spin_unlock_irqrestore(&rnp->lock, flags); | 135 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
127 | } | 136 | } |
128 | 137 | ||
129 | /* | 138 | /* |
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
180 | struct rcu_node *rnp_p; | 189 | struct rcu_node *rnp_p; |
181 | 190 | ||
182 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 191 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
183 | spin_unlock_irqrestore(&rnp->lock, flags); | 192 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
184 | return; /* Still need more quiescent states! */ | 193 | return; /* Still need more quiescent states! */ |
185 | } | 194 | } |
186 | 195 | ||
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
197 | 206 | ||
198 | /* Report up the rest of the hierarchy. */ | 207 | /* Report up the rest of the hierarchy. */ |
199 | mask = rnp->grpmask; | 208 | mask = rnp->grpmask; |
200 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 209 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
201 | spin_lock(&rnp_p->lock); /* irqs already disabled. */ | 210 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ |
202 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | 211 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); |
203 | } | 212 | } |
204 | 213 | ||
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
248 | */ | 257 | */ |
249 | for (;;) { | 258 | for (;;) { |
250 | rnp = t->rcu_blocked_node; | 259 | rnp = t->rcu_blocked_node; |
251 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 260 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
252 | if (rnp == t->rcu_blocked_node) | 261 | if (rnp == t->rcu_blocked_node) |
253 | break; | 262 | break; |
254 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 263 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
255 | } | 264 | } |
256 | empty = !rcu_preempted_readers(rnp); | 265 | empty = !rcu_preempted_readers(rnp); |
257 | empty_exp = !rcu_preempted_readers_exp(rnp); | 266 | empty_exp = !rcu_preempted_readers_exp(rnp); |
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
265 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 274 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. |
266 | */ | 275 | */ |
267 | if (empty) | 276 | if (empty) |
268 | spin_unlock_irqrestore(&rnp->lock, flags); | 277 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
269 | else | 278 | else |
270 | rcu_report_unblock_qs_rnp(rnp, flags); | 279 | rcu_report_unblock_qs_rnp(rnp, flags); |
271 | 280 | ||
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void) | |||
295 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 304 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && |
296 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 305 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
297 | rcu_read_unlock_special(t); | 306 | rcu_read_unlock_special(t); |
307 | #ifdef CONFIG_PROVE_LOCKING | ||
308 | WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); | ||
309 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
298 | } | 310 | } |
299 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 311 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
300 | 312 | ||
301 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 313 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
302 | 314 | ||
315 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
316 | |||
317 | /* | ||
318 | * Dump detailed information for all tasks blocking the current RCU | ||
319 | * grace period on the specified rcu_node structure. | ||
320 | */ | ||
321 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
322 | { | ||
323 | unsigned long flags; | ||
324 | struct list_head *lp; | ||
325 | int phase; | ||
326 | struct task_struct *t; | ||
327 | |||
328 | if (rcu_preempted_readers(rnp)) { | ||
329 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
330 | phase = rnp->gpnum & 0x1; | ||
331 | lp = &rnp->blocked_tasks[phase]; | ||
332 | list_for_each_entry(t, lp, rcu_node_entry) | ||
333 | sched_show_task(t); | ||
334 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
335 | } | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * Dump detailed information for all tasks blocking the current RCU | ||
340 | * grace period. | ||
341 | */ | ||
342 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
343 | { | ||
344 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
345 | |||
346 | rcu_print_detail_task_stall_rnp(rnp); | ||
347 | rcu_for_each_leaf_node(rsp, rnp) | ||
348 | rcu_print_detail_task_stall_rnp(rnp); | ||
349 | } | ||
350 | |||
351 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
352 | |||
353 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
354 | { | ||
355 | } | ||
356 | |||
357 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
358 | |||
303 | /* | 359 | /* |
304 | * Scan the current list of tasks blocked within RCU read-side critical | 360 | * Scan the current list of tasks blocked within RCU read-side critical |
305 | * sections, printing out the tid of each. | 361 | * sections, printing out the tid of each. |
306 | */ | 362 | */ |
307 | static void rcu_print_task_stall(struct rcu_node *rnp) | 363 | static void rcu_print_task_stall(struct rcu_node *rnp) |
308 | { | 364 | { |
309 | unsigned long flags; | ||
310 | struct list_head *lp; | 365 | struct list_head *lp; |
311 | int phase; | 366 | int phase; |
312 | struct task_struct *t; | 367 | struct task_struct *t; |
313 | 368 | ||
314 | if (rcu_preempted_readers(rnp)) { | 369 | if (rcu_preempted_readers(rnp)) { |
315 | spin_lock_irqsave(&rnp->lock, flags); | ||
316 | phase = rnp->gpnum & 0x1; | 370 | phase = rnp->gpnum & 0x1; |
317 | lp = &rnp->blocked_tasks[phase]; | 371 | lp = &rnp->blocked_tasks[phase]; |
318 | list_for_each_entry(t, lp, rcu_node_entry) | 372 | list_for_each_entry(t, lp, rcu_node_entry) |
319 | printk(" P%d", t->pid); | 373 | printk(" P%d", t->pid); |
320 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
321 | } | 374 | } |
322 | } | 375 | } |
323 | 376 | ||
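Under CONFIG_RCU_CPU_STALL_VERBOSE, rcu_print_detail_task_stall() walks the root and then every leaf rcu_node, calling sched_show_task() for each preempted reader under the node lock; with the option off, an empty stub keeps the call site in print_other_cpu_stall() unconditional. Note also that rcu_print_task_stall() drops its own locking because that caller now holds rnp->lock around it, and that the new CONFIG_PROVE_LOCKING check in __rcu_read_unlock() likewise costs nothing in production builds while catching unbalanced unlocks in debug ones. A standalone illustration of the config-gated stub pattern:

#include <stdio.h>

#ifdef STALL_VERBOSE			/* cf. CONFIG_RCU_CPU_STALL_VERBOSE */
static void print_detail_task_stall(void)
{
	printf("dumping blocked tasks...\n");	/* sched_show_task() here */
}
#else
static void print_detail_task_stall(void) { }	/* caller stays unconditional */
#endif

int main(void)
{
	print_detail_task_stall();	/* no #ifdef at the call site */
	return 0;
}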
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
388 | lp_root = &rnp_root->blocked_tasks[i]; | 441 | lp_root = &rnp_root->blocked_tasks[i]; |
389 | while (!list_empty(lp)) { | 442 | while (!list_empty(lp)) { |
390 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 443 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); |
391 | spin_lock(&rnp_root->lock); /* irqs already disabled */ | 444 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
392 | list_del(&tp->rcu_node_entry); | 445 | list_del(&tp->rcu_node_entry); |
393 | tp->rcu_blocked_node = rnp_root; | 446 | tp->rcu_blocked_node = rnp_root; |
394 | list_add(&tp->rcu_node_entry, lp_root); | 447 | list_add(&tp->rcu_node_entry, lp_root); |
395 | spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 448 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ |
396 | } | 449 | } |
397 | } | 450 | } |
398 | return retval; | 451 | return retval; |
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
516 | unsigned long flags; | 569 | unsigned long flags; |
517 | unsigned long mask; | 570 | unsigned long mask; |
518 | 571 | ||
519 | spin_lock_irqsave(&rnp->lock, flags); | 572 | raw_spin_lock_irqsave(&rnp->lock, flags); |
520 | for (;;) { | 573 | for (;;) { |
521 | if (!sync_rcu_preempt_exp_done(rnp)) | 574 | if (!sync_rcu_preempt_exp_done(rnp)) |
522 | break; | 575 | break; |
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
525 | break; | 578 | break; |
526 | } | 579 | } |
527 | mask = rnp->grpmask; | 580 | mask = rnp->grpmask; |
528 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 581 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
529 | rnp = rnp->parent; | 582 | rnp = rnp->parent; |
530 | spin_lock(&rnp->lock); /* irqs already disabled */ | 583 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
531 | rnp->expmask &= ~mask; | 584 | rnp->expmask &= ~mask; |
532 | } | 585 | } |
533 | spin_unlock_irqrestore(&rnp->lock, flags); | 586 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
534 | } | 587 | } |
535 | 588 | ||
536 | /* | 589 | /* |
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
545 | { | 598 | { |
546 | int must_wait; | 599 | int must_wait; |
547 | 600 | ||
548 | spin_lock(&rnp->lock); /* irqs already disabled */ | 601 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
549 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 602 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); |
550 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 603 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); |
551 | must_wait = rcu_preempted_readers_exp(rnp); | 604 | must_wait = rcu_preempted_readers_exp(rnp); |
552 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 605 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
553 | if (!must_wait) | 606 | if (!must_wait) |
554 | rcu_report_exp_rnp(rsp, rnp); | 607 | rcu_report_exp_rnp(rsp, rnp); |
555 | } | 608 | } |
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void) | |||
594 | /* force all RCU readers onto blocked_tasks[]. */ | 647 | /* force all RCU readers onto blocked_tasks[]. */ |
595 | synchronize_sched_expedited(); | 648 | synchronize_sched_expedited(); |
596 | 649 | ||
597 | spin_lock_irqsave(&rsp->onofflock, flags); | 650 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
598 | 651 | ||
599 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 652 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
600 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 653 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
601 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 654 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
602 | rnp->expmask = rnp->qsmaskinit; | 655 | rnp->expmask = rnp->qsmaskinit; |
603 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 656 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
604 | } | 657 | } |
605 | 658 | ||
606 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 659 | /* Snapshot current state of ->blocked_tasks[] lists. */ |
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void) | |||
609 | if (NUM_RCU_NODES > 1) | 662 | if (NUM_RCU_NODES > 1) |
610 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | 663 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); |
611 | 664 | ||
612 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 665 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
613 | 666 | ||
614 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 667 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ |
615 | rnp = rcu_get_root(rsp); | 668 | rnp = rcu_get_root(rsp); |
@@ -713,6 +766,16 @@ long rcu_batches_completed(void) | |||
713 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 766 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
714 | 767 | ||
715 | /* | 768 | /* |
769 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
770 | * RCU, becomes the same as rcu-sched. | ||
771 | */ | ||
772 | void rcu_force_quiescent_state(void) | ||
773 | { | ||
774 | rcu_sched_force_quiescent_state(); | ||
775 | } | ||
776 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
777 | |||
778 | /* | ||
716 | * Because preemptible RCU does not exist, we never have to check for | 779 | * Because preemptible RCU does not exist, we never have to check for |
717 | * CPUs being in quiescent states. | 780 | * CPUs being in quiescent states. |
718 | */ | 781 | */ |
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) | |||
734 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 797 | /* Because preemptible RCU does not exist, no quieting of tasks. */ |
735 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 798 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) |
736 | { | 799 | { |
737 | spin_unlock_irqrestore(&rnp->lock, flags); | 800 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
738 | } | 801 | } |
739 | 802 | ||
740 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 803 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
745 | * Because preemptible RCU does not exist, we never have to check for | 808 | * Because preemptible RCU does not exist, we never have to check for |
746 | * tasks blocked within RCU read-side critical sections. | 809 | * tasks blocked within RCU read-side critical sections. |
747 | */ | 810 | */ |
811 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
812 | { | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * Because preemptible RCU does not exist, we never have to check for | ||
817 | * tasks blocked within RCU read-side critical sections. | ||
818 | */ | ||
748 | static void rcu_print_task_stall(struct rcu_node *rnp) | 819 | static void rcu_print_task_stall(struct rcu_node *rnp) |
749 | { | 820 | { |
750 | } | 821 | } |
@@ -884,3 +955,115 @@ static void __init __rcu_init_preempt(void) | |||
884 | } | 955 | } |
885 | 956 | ||
886 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 957 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
958 | |||
959 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | ||
960 | |||
961 | /* | ||
962 | * Check to see if any future RCU-related work will need to be done | ||
963 | * by the current CPU, even if none need be done immediately, returning | ||
964 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
965 | * an exported member of the RCU API. | ||
966 | * | ||
967 | * Because we have preemptible RCU, just check whether this CPU needs | ||
968 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | ||
969 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
970 | */ | ||
971 | int rcu_needs_cpu(int cpu) | ||
972 | { | ||
973 | return rcu_needs_cpu_quick_check(cpu); | ||
974 | } | ||
975 | |||
976 | /* | ||
977 | * Check to see if we need to continue a callback-flush operation to | ||
978 | * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle | ||
979 | * entry is not configured, so we never need to. | ||
980 | */ | ||
981 | static void rcu_needs_cpu_flush(void) | ||
982 | { | ||
983 | } | ||
984 | |||
985 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
986 | |||
987 | #define RCU_NEEDS_CPU_FLUSHES 5 | ||
988 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
989 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
990 | |||
991 | /* | ||
992 | * Check to see if any future RCU-related work will need to be done | ||
993 | * by the current CPU, even if none need be done immediately, returning | ||
994 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
995 | * an exported member of the RCU API. | ||
996 | * | ||
997 | * Because we are not supporting preemptible RCU, attempt to accelerate | ||
998 | * any current grace periods so that RCU no longer needs this CPU, but | ||
999 | * only if all other CPUs are already in dynticks-idle mode. This will | ||
1000 | * allow the CPU cores to be powered down immediately, as opposed to after | ||
1001 | * waiting many milliseconds for grace periods to elapse. | ||
1002 | * | ||
1003 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | ||
1004 | * disabled, we do one pass of force_quiescent_state(), then do a | ||
1005 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | ||
1006 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | ||
1007 | */ | ||
1008 | int rcu_needs_cpu(int cpu) | ||
1009 | { | ||
1010 | int c = 0; | ||
1011 | int thatcpu; | ||
1012 | |||
1013 | /* Check for being in the holdoff period. */ | ||
1014 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) | ||
1015 | return rcu_needs_cpu_quick_check(cpu); | ||
1016 | |||
1017 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | ||
1018 | for_each_cpu_not(thatcpu, nohz_cpu_mask) | ||
1019 | if (thatcpu != cpu) { | ||
1020 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
1021 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
1022 | return rcu_needs_cpu_quick_check(cpu); | ||
1023 | } | ||
1024 | |||
1025 | /* Check and update the rcu_dyntick_drain sequencing. */ | ||
1026 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
1027 | /* First time through, initialize the counter. */ | ||
1028 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | ||
1029 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
1030 | /* We have hit the limit, so time to give up. */ | ||
1031 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | ||
1032 | return rcu_needs_cpu_quick_check(cpu); | ||
1033 | } | ||
1034 | |||
1035 | /* Do one step pushing remaining RCU callbacks through. */ | ||
1036 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | ||
1037 | rcu_sched_qs(cpu); | ||
1038 | force_quiescent_state(&rcu_sched_state, 0); | ||
1039 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | ||
1040 | } | ||
1041 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | ||
1042 | rcu_bh_qs(cpu); | ||
1043 | force_quiescent_state(&rcu_bh_state, 0); | ||
1044 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | ||
1045 | } | ||
1046 | |||
1047 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | ||
1048 | if (c) | ||
1049 | raise_softirq(RCU_SOFTIRQ); | ||
1050 | return c; | ||
1051 | } | ||
1052 | |||
1053 | /* | ||
1054 | * Check to see if we need to continue a callback-flush operation to | ||
1055 | * allow the last CPU to enter dyntick-idle mode. | ||
1056 | */ | ||
1057 | static void rcu_needs_cpu_flush(void) | ||
1058 | { | ||
1059 | int cpu = smp_processor_id(); | ||
1060 | unsigned long flags; | ||
1061 | |||
1062 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) | ||
1063 | return; | ||
1064 | local_irq_save(flags); | ||
1065 | (void)rcu_needs_cpu(cpu); | ||
1066 | local_irq_restore(flags); | ||
1067 | } | ||
1068 | |||
1069 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
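
The drain/holdoff sequencing above is compact; the following standalone C sketch shows the same idea, with plain arrays standing in for the kernel's per-CPU variables and a caller-supplied tick standing in for jiffies. All names here (NR_CPUS as defined below, FLUSHES, drain, holdoff, try_flush) are illustrative, not part of the patch.

#define NR_CPUS 4
#define FLUSHES 5                       /* mirrors RCU_NEEDS_CPU_FLUSHES */

static int drain[NR_CPUS];              /* countdown of flush attempts */
static unsigned long holdoff[NR_CPUS];  /* tick at which we gave up */

/* Returns nonzero while work is still pending, like rcu_needs_cpu(). */
static int try_flush(int cpu, unsigned long tick, int work_pending)
{
        if (holdoff[cpu] == tick)       /* already gave up this tick */
                return work_pending;
        if (drain[cpu] <= 0) {          /* first attempt: arm the counter */
                drain[cpu] = FLUSHES;
        } else if (--drain[cpu] <= 0) { /* budget exhausted: hold off */
                holdoff[cpu] = tick;
                return work_pending;
        }
        /* ...one step of pushing callbacks through would go here... */
        return work_pending;
}

The holdoff stamp is what prevents a CPU from burning cycles retrying within the same tick once its flush budget is spent.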
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9d2c88423b31..d45db2e35d27 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
50 | { | 50 | { |
51 | if (!rdp->beenonline) | 51 | if (!rdp->beenonline) |
52 | return; | 52 | return; |
53 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", | 53 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", |
54 | rdp->cpu, | 54 | rdp->cpu, |
55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
56 | rdp->completed, rdp->gpnum, | 56 | rdp->completed, rdp->gpnum, |
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
105 | { | 105 | { |
106 | if (!rdp->beenonline) | 106 | if (!rdp->beenonline) |
107 | return; | 107 | return; |
108 | seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", | 108 | seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", |
109 | rdp->cpu, | 109 | rdp->cpu, |
110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
111 | rdp->completed, rdp->gpnum, | 111 | rdp->completed, rdp->gpnum, |
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = { | |||
155 | 155 | ||
156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
157 | { | 157 | { |
158 | long gpnum; | 158 | unsigned long gpnum; |
159 | int level = 0; | 159 | int level = 0; |
160 | int phase; | 160 | int phase; |
161 | struct rcu_node *rnp; | 161 | struct rcu_node *rnp; |
162 | 162 | ||
163 | gpnum = rsp->gpnum; | 163 | gpnum = rsp->gpnum; |
164 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " | 164 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", |
166 | rsp->completed, gpnum, rsp->signaled, | 166 | rsp->completed, gpnum, rsp->signaled, |
167 | (long)(rsp->jiffies_force_qs - jiffies), | 167 | (long)(rsp->jiffies_force_qs - jiffies), |
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = { | |||
215 | static int show_rcugp(struct seq_file *m, void *unused) | 215 | static int show_rcugp(struct seq_file *m, void *unused) |
216 | { | 216 | { |
217 | #ifdef CONFIG_TREE_PREEMPT_RCU | 217 | #ifdef CONFIG_TREE_PREEMPT_RCU |
218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", | 218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", |
219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | 219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); |
220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", | 221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", |
222 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 222 | rcu_sched_state.completed, rcu_sched_state.gpnum); |
223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", | 223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", |
224 | rcu_bh_state.completed, rcu_bh_state.gpnum); | 224 | rcu_bh_state.completed, rcu_bh_state.gpnum); |
225 | return 0; | 225 | return 0; |
226 | } | 226 | } |
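
The %ld to %lu switches above matter because ->gpnum and ->completed are unsigned long counters that are expected to wrap; printed as signed, they would briefly show negative values near wraparound. A trivial userspace illustration (not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned long gpnum = (unsigned long)-5; /* counter near wraparound */

        printf("g=%ld\n", (long)gpnum);  /* signed view prints g=-5 */
        printf("g=%lu\n", gpnum);        /* unsigned view prints the count */
        return 0;
}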
diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba3..3d97f2821611 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) | |||
1215 | /* | 1215 | /* |
1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
1217 | */ | 1217 | */ |
1218 | static int subbuf_splice_actor(struct file *in, | 1218 | static ssize_t subbuf_splice_actor(struct file *in, |
1219 | loff_t *ppos, | 1219 | loff_t *ppos, |
1220 | struct pipe_inode_info *pipe, | 1220 | struct pipe_inode_info *pipe, |
1221 | size_t len, | 1221 | size_t len, |
1222 | unsigned int flags, | 1222 | unsigned int flags, |
1223 | int *nonpad_ret) | 1223 | int *nonpad_ret) |
1224 | { | 1224 | { |
1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; | 1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; |
1226 | struct rchan_buf *rbuf = in->private_data; | 1226 | struct rchan_buf *rbuf = in->private_data; |
1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | 1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; |
1228 | uint64_t pos = (uint64_t) *ppos; | 1228 | uint64_t pos = (uint64_t) *ppos; |
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in, | |||
1241 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
1242 | .spd_release = relay_page_release, | 1242 | .spd_release = relay_page_release, |
1243 | }; | 1243 | }; |
1244 | ssize_t ret; | ||
1244 | 1245 | ||
1245 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1246 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
1246 | return 0; | 1247 | return 0; |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf37c40b..c7eaa37a768b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/types.h> | 10 | #include <linux/types.h> |
11 | #include <linux/parser.h> | 11 | #include <linux/parser.h> |
12 | #include <linux/fs.h> | 12 | #include <linux/fs.h> |
13 | #include <linux/slab.h> | ||
14 | #include <linux/res_counter.h> | 13 | #include <linux/res_counter.h> |
15 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
16 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54b..9c358e263534 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -188,20 +188,65 @@ static int __release_resource(struct resource *old) | |||
188 | return -EINVAL; | 188 | return -EINVAL; |
189 | } | 189 | } |
190 | 190 | ||
191 | static void __release_child_resources(struct resource *r) | ||
192 | { | ||
193 | struct resource *tmp, *p; | ||
194 | resource_size_t size; | ||
195 | |||
196 | p = r->child; | ||
197 | r->child = NULL; | ||
198 | while (p) { | ||
199 | tmp = p; | ||
200 | p = p->sibling; | ||
201 | |||
202 | tmp->parent = NULL; | ||
203 | tmp->sibling = NULL; | ||
204 | __release_child_resources(tmp); | ||
205 | |||
206 | printk(KERN_DEBUG "release child resource %pR\n", tmp); | ||
207 | /* need to restore size, and keep flags */ | ||
208 | size = resource_size(tmp); | ||
209 | tmp->start = 0; | ||
210 | tmp->end = size - 1; | ||
211 | } | ||
212 | } | ||
213 | |||
214 | void release_child_resources(struct resource *r) | ||
215 | { | ||
216 | write_lock(&resource_lock); | ||
217 | __release_child_resources(r); | ||
218 | write_unlock(&resource_lock); | ||
219 | } | ||
220 | |||
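
One plausible use of the new release_child_resources(): detach everything under a window so the window itself can be moved, then re-request it. This is a hedged sketch; move_window and its policy are illustrative, while release_resource(), request_resource() and iomem_resource are existing kernel APIs.

#include <linux/ioport.h>

static int move_window(struct resource *win, resource_size_t start,
                       resource_size_t end)
{
        release_child_resources(win);   /* children keep size, lose position */
        release_resource(win);
        win->start = start;
        win->end = end;
        return request_resource(&iomem_resource, win);
}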
191 | /** | 221 | /** |
192 | * request_resource - request and reserve an I/O or memory resource | 222 | * request_resource_conflict - request and reserve an I/O or memory resource |
193 | * @root: root resource descriptor | 223 | * @root: root resource descriptor |
194 | * @new: resource descriptor desired by caller | 224 | * @new: resource descriptor desired by caller |
195 | * | 225 | * |
196 | * Returns 0 for success, negative error code on error. | 226 | * Returns 0 for success, conflict resource on error. |
197 | */ | 227 | */ |
198 | int request_resource(struct resource *root, struct resource *new) | 228 | struct resource *request_resource_conflict(struct resource *root, struct resource *new) |
199 | { | 229 | { |
200 | struct resource *conflict; | 230 | struct resource *conflict; |
201 | 231 | ||
202 | write_lock(&resource_lock); | 232 | write_lock(&resource_lock); |
203 | conflict = __request_resource(root, new); | 233 | conflict = __request_resource(root, new); |
204 | write_unlock(&resource_lock); | 234 | write_unlock(&resource_lock); |
235 | return conflict; | ||
236 | } | ||
237 | |||
238 | /** | ||
239 | * request_resource - request and reserve an I/O or memory resource | ||
240 | * @root: root resource descriptor | ||
241 | * @new: resource descriptor desired by caller | ||
242 | * | ||
243 | * Returns 0 for success, negative error code on error. | ||
244 | */ | ||
245 | int request_resource(struct resource *root, struct resource *new) | ||
246 | { | ||
247 | struct resource *conflict; | ||
248 | |||
249 | conflict = request_resource_conflict(root, new); | ||
205 | return conflict ? -EBUSY : 0; | 250 | return conflict ? -EBUSY : 0; |
206 | } | 251 | } |
207 | 252 | ||
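
The point of the _conflict variant is that callers can report what they collided with instead of a bare -EBUSY. A minimal sketch, assuming a caller that only wants better diagnostics (claim_region and the message text are illustrative):

#include <linux/ioport.h>
#include <linux/kernel.h>

static int claim_region(struct resource *new)
{
        struct resource *conflict;

        conflict = request_resource_conflict(&iomem_resource, new);
        if (conflict) {
                printk(KERN_WARNING "%pR clashes with %pR\n", new, conflict);
                return -EBUSY;
        }
        return 0;
}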
@@ -274,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
274 | void *arg, int (*func)(unsigned long, unsigned long, void *)) | 319 | void *arg, int (*func)(unsigned long, unsigned long, void *)) |
275 | { | 320 | { |
276 | struct resource res; | 321 | struct resource res; |
277 | unsigned long pfn, len; | 322 | unsigned long pfn, end_pfn; |
278 | u64 orig_end; | 323 | u64 orig_end; |
279 | int ret = -1; | 324 | int ret = -1; |
280 | 325 | ||
@@ -284,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
284 | orig_end = res.end; | 329 | orig_end = res.end; |
285 | while ((res.start < res.end) && | 330 | while ((res.start < res.end) && |
286 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 331 | (find_next_system_ram(&res, "System RAM") >= 0)) { |
287 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); | 332 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
288 | len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); | 333 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
289 | ret = (*func)(pfn, len, arg); | 334 | if (end_pfn > pfn) |
335 | ret = (*func)(pfn, end_pfn - pfn, arg); | ||
290 | if (ret) | 336 | if (ret) |
291 | break; | 337 | break; |
292 | res.start = res.end + 1; | 338 | res.start = res.end + 1; |
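
A worked example with 4 KiB pages shows what the rounding fix changes: for a chunk with res.start = 0x1800 and res.end = 0x2fff, the old code computed pfn = 0x1800 >> 12 = 1 and len = (0x3000 - 0x1800) >> 12 = 1, so the callback saw page 1 (0x1000-0x1fff) as System RAM even though only its upper half is. The new code computes pfn = (0x1800 + 0xfff) >> 12 = 2 and end_pfn = 0x3000 >> 12 = 3, so only page 2, which is fully covered, is passed to func, and the end_pfn > pfn test skips chunks too small to cover any whole page.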
@@ -297,14 +343,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
297 | 343 | ||
298 | #endif | 344 | #endif |
299 | 345 | ||
346 | static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) | ||
347 | { | ||
348 | return 1; | ||
349 | } | ||
350 | /* | ||
351 | * This generic page_is_ram() returns true if the specified address is | ||
352 | * registered as "System RAM" in the iomem_resource list. | ||
353 | */ | ||
354 | int __weak page_is_ram(unsigned long pfn) | ||
355 | { | ||
356 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | ||
357 | } | ||
358 | |||
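
Besides the __is_ram() trick above, walk_system_ram_range() composes with any accumulator passed through its opaque arg. A hedged sketch (count_pages and ram_pages_in are illustrative names):

#include <linux/ioport.h>

static int count_pages(unsigned long pfn, unsigned long nr_pages, void *arg)
{
        *(unsigned long *)arg += nr_pages;
        return 0;                       /* nonzero would stop the walk */
}

static unsigned long ram_pages_in(unsigned long start_pfn, unsigned long nr)
{
        unsigned long count = 0;

        walk_system_ram_range(start_pfn, nr, &count, count_pages);
        return count;
}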
300 | /* | 359 | /* |
301 | * Find empty slot in the resource tree given range and alignment. | 360 | * Find empty slot in the resource tree given range and alignment. |
302 | */ | 361 | */ |
303 | static int find_resource(struct resource *root, struct resource *new, | 362 | static int find_resource(struct resource *root, struct resource *new, |
304 | resource_size_t size, resource_size_t min, | 363 | resource_size_t size, resource_size_t min, |
305 | resource_size_t max, resource_size_t align, | 364 | resource_size_t max, resource_size_t align, |
306 | void (*alignf)(void *, struct resource *, | 365 | resource_size_t (*alignf)(void *, |
307 | resource_size_t, resource_size_t), | 366 | const struct resource *, |
367 | resource_size_t, | ||
368 | resource_size_t), | ||
308 | void *alignf_data) | 369 | void *alignf_data) |
309 | { | 370 | { |
310 | struct resource *this = root->child; | 371 | struct resource *this = root->child; |
@@ -330,7 +391,7 @@ static int find_resource(struct resource *root, struct resource *new, | |||
330 | tmp.end = max; | 391 | tmp.end = max; |
331 | tmp.start = ALIGN(tmp.start, align); | 392 | tmp.start = ALIGN(tmp.start, align); |
332 | if (alignf) | 393 | if (alignf) |
333 | alignf(alignf_data, &tmp, size, align); | 394 | tmp.start = alignf(alignf_data, &tmp, size, align); |
334 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 395 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { |
335 | new->start = tmp.start; | 396 | new->start = tmp.start; |
336 | new->end = tmp.start + size - 1; | 397 | new->end = tmp.start + size - 1; |
@@ -358,8 +419,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
358 | int allocate_resource(struct resource *root, struct resource *new, | 419 | int allocate_resource(struct resource *root, struct resource *new, |
359 | resource_size_t size, resource_size_t min, | 420 | resource_size_t size, resource_size_t min, |
360 | resource_size_t max, resource_size_t align, | 421 | resource_size_t max, resource_size_t align, |
361 | void (*alignf)(void *, struct resource *, | 422 | resource_size_t (*alignf)(void *, |
362 | resource_size_t, resource_size_t), | 423 | const struct resource *, |
424 | resource_size_t, | ||
425 | resource_size_t), | ||
363 | void *alignf_data) | 426 | void *alignf_data) |
364 | { | 427 | { |
365 | int err; | 428 | int err; |
@@ -426,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou | |||
426 | } | 489 | } |
427 | 490 | ||
428 | /** | 491 | /** |
429 | * insert_resource - Inserts a resource in the resource tree | 492 | * insert_resource_conflict - Inserts a resource in the resource tree |
430 | * @parent: parent of the new resource | 493 | * @parent: parent of the new resource |
431 | * @new: new resource to insert | 494 | * @new: new resource to insert |
432 | * | 495 | * |
433 | * Returns 0 on success, -EBUSY if the resource can't be inserted. | 496 | * Returns 0 on success, conflict resource if the resource can't be inserted. |
434 | * | 497 | * |
435 | * This function is equivalent to request_resource when no conflict | 498 | * This function is equivalent to request_resource_conflict when no conflict |
436 | * happens. If a conflict happens, and the conflicting resources | 499 | * happens. If a conflict happens, and the conflicting resources |
437 | * entirely fit within the range of the new resource, then the new | 500 | * entirely fit within the range of the new resource, then the new |
438 | * resource is inserted and the conflicting resources become children of | 501 | * resource is inserted and the conflicting resources become children of |
439 | * the new resource. | 502 | * the new resource. |
440 | */ | 503 | */ |
441 | int insert_resource(struct resource *parent, struct resource *new) | 504 | struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) |
442 | { | 505 | { |
443 | struct resource *conflict; | 506 | struct resource *conflict; |
444 | 507 | ||
445 | write_lock(&resource_lock); | 508 | write_lock(&resource_lock); |
446 | conflict = __insert_resource(parent, new); | 509 | conflict = __insert_resource(parent, new); |
447 | write_unlock(&resource_lock); | 510 | write_unlock(&resource_lock); |
511 | return conflict; | ||
512 | } | ||
513 | |||
514 | /** | ||
515 | * insert_resource - Inserts a resource in the resource tree | ||
516 | * @parent: parent of the new resource | ||
517 | * @new: new resource to insert | ||
518 | * | ||
519 | * Returns 0 on success, -EBUSY if the resource can't be inserted. | ||
520 | */ | ||
521 | int insert_resource(struct resource *parent, struct resource *new) | ||
522 | { | ||
523 | struct resource *conflict; | ||
524 | |||
525 | conflict = insert_resource_conflict(parent, new); | ||
448 | return conflict ? -EBUSY : 0; | 526 | return conflict ? -EBUSY : 0; |
449 | } | 527 | } |
450 | 528 | ||
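
insert_resource_conflict() pairs naturally with the adoption semantics described in its comment: if existing resources nest entirely inside the new one they become its children, otherwise the blocker is returned. A hedged sketch (adopt_window is an illustrative name):

#include <linux/ioport.h>
#include <linux/kernel.h>

static int adopt_window(struct resource *win)
{
        struct resource *conflict;

        conflict = insert_resource_conflict(&iomem_resource, win);
        if (conflict) {
                printk(KERN_INFO "window blocked by %pR\n", conflict);
                return -EBUSY;
        }
        return 0;               /* nested resources are now children of win */
}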
diff --git a/kernel/sched.c b/kernel/sched.c index 7266b912139f..3c2a54f70ffe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -71,6 +71,7 @@ | |||
71 | #include <linux/debugfs.h> | 71 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 72 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | 73 | #include <linux/ftrace.h> |
74 | #include <linux/slab.h> | ||
74 | 75 | ||
75 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
@@ -233,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
233 | */ | 234 | */ |
234 | static DEFINE_MUTEX(sched_domains_mutex); | 235 | static DEFINE_MUTEX(sched_domains_mutex); |
235 | 236 | ||
236 | #ifdef CONFIG_GROUP_SCHED | 237 | #ifdef CONFIG_CGROUP_SCHED |
237 | 238 | ||
238 | #include <linux/cgroup.h> | 239 | #include <linux/cgroup.h> |
239 | 240 | ||
@@ -243,13 +244,7 @@ static LIST_HEAD(task_groups); | |||
243 | 244 | ||
244 | /* task group related information */ | 245 | /* task group related information */ |
245 | struct task_group { | 246 | struct task_group { |
246 | #ifdef CONFIG_CGROUP_SCHED | ||
247 | struct cgroup_subsys_state css; | 247 | struct cgroup_subsys_state css; |
248 | #endif | ||
249 | |||
250 | #ifdef CONFIG_USER_SCHED | ||
251 | uid_t uid; | ||
252 | #endif | ||
253 | 248 | ||
254 | #ifdef CONFIG_FAIR_GROUP_SCHED | 249 | #ifdef CONFIG_FAIR_GROUP_SCHED |
255 | /* schedulable entities of this group on each cpu */ | 250 | /* schedulable entities of this group on each cpu */ |
@@ -274,35 +269,7 @@ struct task_group { | |||
274 | struct list_head children; | 269 | struct list_head children; |
275 | }; | 270 | }; |
276 | 271 | ||
277 | #ifdef CONFIG_USER_SCHED | ||
278 | |||
279 | /* Helper function to pass uid information to create_sched_user() */ | ||
280 | void set_tg_uid(struct user_struct *user) | ||
281 | { | ||
282 | user->tg->uid = user->uid; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Root task group. | ||
287 | * Every UID task group (including init_task_group aka UID-0) will | ||
288 | * be a child to this group. | ||
289 | */ | ||
290 | struct task_group root_task_group; | ||
291 | |||
292 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
293 | /* Default task group's sched entity on each cpu */ | ||
294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
295 | /* Default task group's cfs_rq on each cpu */ | ||
296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); | ||
297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
298 | |||
299 | #ifdef CONFIG_RT_GROUP_SCHED | ||
300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); | ||
302 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
303 | #else /* !CONFIG_USER_SCHED */ | ||
304 | #define root_task_group init_task_group | 272 | #define root_task_group init_task_group |
305 | #endif /* CONFIG_USER_SCHED */ | ||
306 | 273 | ||
307 | /* task_group_lock serializes add/remove of task groups and also changes to | 274 | /* task_group_lock serializes add/remove of task groups and also changes to |
308 | * a task group's cpu shares. | 275 | * a task group's cpu shares. |
@@ -318,11 +285,7 @@ static int root_task_group_empty(void) | |||
318 | } | 285 | } |
319 | #endif | 286 | #endif |
320 | 287 | ||
321 | #ifdef CONFIG_USER_SCHED | ||
322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | ||
323 | #else /* !CONFIG_USER_SCHED */ | ||
324 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 288 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
325 | #endif /* CONFIG_USER_SCHED */ | ||
326 | 289 | ||
327 | /* | 290 | /* |
328 | * A weight of 0 or 1 can cause arithmetics problems. | 291 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -348,11 +311,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
348 | { | 311 | { |
349 | struct task_group *tg; | 312 | struct task_group *tg; |
350 | 313 | ||
351 | #ifdef CONFIG_USER_SCHED | 314 | #ifdef CONFIG_CGROUP_SCHED |
352 | rcu_read_lock(); | ||
353 | tg = __task_cred(p)->user->tg; | ||
354 | rcu_read_unlock(); | ||
355 | #elif defined(CONFIG_CGROUP_SCHED) | ||
356 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 315 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
357 | struct task_group, css); | 316 | struct task_group, css); |
358 | #else | 317 | #else |
@@ -364,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
364 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 323 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
365 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | 324 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
366 | { | 325 | { |
326 | /* | ||
327 | * Strictly speaking this rcu_read_lock() is not needed since the | ||
328 | * task_group is tied to the cgroup, which in turn can never go away | ||
329 | * as long as there are tasks attached to it. | ||
330 | * | ||
331 | * However since task_group() uses task_subsys_state() which is an | ||
332 | * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. | ||
333 | */ | ||
334 | rcu_read_lock(); | ||
367 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
368 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 336 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
369 | p->se.parent = task_group(p)->se[cpu]; | 337 | p->se.parent = task_group(p)->se[cpu]; |
@@ -373,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
373 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | 341 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; |
374 | p->rt.parent = task_group(p)->rt_se[cpu]; | 342 | p->rt.parent = task_group(p)->rt_se[cpu]; |
375 | #endif | 343 | #endif |
344 | rcu_read_unlock(); | ||
376 | } | 345 | } |
377 | 346 | ||
378 | #else | 347 | #else |
@@ -383,7 +352,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
383 | return NULL; | 352 | return NULL; |
384 | } | 353 | } |
385 | 354 | ||
386 | #endif /* CONFIG_GROUP_SCHED */ | 355 | #endif /* CONFIG_CGROUP_SCHED */ |
387 | 356 | ||
388 | /* CFS-related fields in a runqueue */ | 357 | /* CFS-related fields in a runqueue */ |
389 | struct cfs_rq { | 358 | struct cfs_rq { |
@@ -478,7 +447,6 @@ struct rt_rq { | |||
478 | struct rq *rq; | 447 | struct rq *rq; |
479 | struct list_head leaf_rt_rq_list; | 448 | struct list_head leaf_rt_rq_list; |
480 | struct task_group *tg; | 449 | struct task_group *tg; |
481 | struct sched_rt_entity *rt_se; | ||
482 | #endif | 450 | #endif |
483 | }; | 451 | }; |
484 | 452 | ||
@@ -645,6 +613,11 @@ static inline int cpu_of(struct rq *rq) | |||
645 | #endif | 613 | #endif |
646 | } | 614 | } |
647 | 615 | ||
616 | #define rcu_dereference_check_sched_domain(p) \ | ||
617 | rcu_dereference_check((p), \ | ||
618 | rcu_read_lock_sched_held() || \ | ||
619 | lockdep_is_held(&sched_domains_mutex)) | ||
620 | |||
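
rcu_dereference_check_sched_domain() is an instance of the general CONFIG_PROVE_RCU pattern: tell lockdep every condition under which the pointer is stable. The same shape works for any pointer protected by RCU on the read side and a lock on the update side; struct foo, my_ptr, and my_lock below are illustrative only.

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct foo { int x; };
static struct foo *my_ptr;              /* updated under my_lock */
static DEFINE_MUTEX(my_lock);

static int read_x(void)
{
        struct foo *f = rcu_dereference_check(my_ptr,
                                              rcu_read_lock_held() ||
                                              lockdep_is_held(&my_lock));
        return f ? f->x : 0;
}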
648 | /* | 621 | /* |
649 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 622 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
650 | * See detach_destroy_domains: synchronize_sched for details. | 623 | * See detach_destroy_domains: synchronize_sched for details. |
@@ -653,7 +626,7 @@ static inline int cpu_of(struct rq *rq) | |||
653 | * preempt-disabled sections. | 626 | * preempt-disabled sections. |
654 | */ | 627 | */ |
655 | #define for_each_domain(cpu, __sd) \ | 628 | #define for_each_domain(cpu, __sd) \ |
656 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 629 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
657 | 630 | ||
658 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 631 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
659 | #define this_rq() (&__get_cpu_var(runqueues)) | 632 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -941,16 +914,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
941 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 914 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
942 | 915 | ||
943 | /* | 916 | /* |
917 | * Check whether the task is waking; we use this to synchronize against | ||
918 | * ttwu() so that task_cpu() reports a stable number. | ||
919 | * | ||
920 | * We need to make an exception for PF_STARTING tasks because the fork | ||
921 | * path might require task_rq_lock() to work, eg. it can call | ||
922 | * set_cpus_allowed_ptr() from the cpuset clone_ns code. | ||
923 | */ | ||
924 | static inline int task_is_waking(struct task_struct *p) | ||
925 | { | ||
926 | return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); | ||
927 | } | ||
928 | |||
929 | /* | ||
944 | * __task_rq_lock - lock the runqueue a given task resides on. | 930 | * __task_rq_lock - lock the runqueue a given task resides on. |
945 | * Must be called with interrupts disabled. | 931 | * Must be called with interrupts disabled. |
946 | */ | 932 | */ |
947 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 933 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
948 | __acquires(rq->lock) | 934 | __acquires(rq->lock) |
949 | { | 935 | { |
936 | struct rq *rq; | ||
937 | |||
950 | for (;;) { | 938 | for (;;) { |
951 | struct rq *rq = task_rq(p); | 939 | while (task_is_waking(p)) |
940 | cpu_relax(); | ||
941 | rq = task_rq(p); | ||
952 | raw_spin_lock(&rq->lock); | 942 | raw_spin_lock(&rq->lock); |
953 | if (likely(rq == task_rq(p))) | 943 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
954 | return rq; | 944 | return rq; |
955 | raw_spin_unlock(&rq->lock); | 945 | raw_spin_unlock(&rq->lock); |
956 | } | 946 | } |
@@ -967,10 +957,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
967 | struct rq *rq; | 957 | struct rq *rq; |
968 | 958 | ||
969 | for (;;) { | 959 | for (;;) { |
960 | while (task_is_waking(p)) | ||
961 | cpu_relax(); | ||
970 | local_irq_save(*flags); | 962 | local_irq_save(*flags); |
971 | rq = task_rq(p); | 963 | rq = task_rq(p); |
972 | raw_spin_lock(&rq->lock); | 964 | raw_spin_lock(&rq->lock); |
973 | if (likely(rq == task_rq(p))) | 965 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
974 | return rq; | 966 | return rq; |
975 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 967 | raw_spin_unlock_irqrestore(&rq->lock, *flags); |
976 | } | 968 | } |
@@ -1390,32 +1382,6 @@ static const u32 prio_to_wmult[40] = { | |||
1390 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1382 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1391 | }; | 1383 | }; |
1392 | 1384 | ||
1393 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
1394 | |||
1395 | /* | ||
1396 | * runqueue iterator, to support SMP load-balancing between different | ||
1397 | * scheduling classes, without having to expose their internal data | ||
1398 | * structures to the load-balancing proper: | ||
1399 | */ | ||
1400 | struct rq_iterator { | ||
1401 | void *arg; | ||
1402 | struct task_struct *(*start)(void *); | ||
1403 | struct task_struct *(*next)(void *); | ||
1404 | }; | ||
1405 | |||
1406 | #ifdef CONFIG_SMP | ||
1407 | static unsigned long | ||
1408 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1409 | unsigned long max_load_move, struct sched_domain *sd, | ||
1410 | enum cpu_idle_type idle, int *all_pinned, | ||
1411 | int *this_best_prio, struct rq_iterator *iterator); | ||
1412 | |||
1413 | static int | ||
1414 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1415 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1416 | struct rq_iterator *iterator); | ||
1417 | #endif | ||
1418 | |||
1419 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 1385 | /* Time spent by the tasks of the cpu accounting group executing in ... */ |
1420 | enum cpuacct_stat_index { | 1386 | enum cpuacct_stat_index { |
1421 | CPUACCT_STAT_USER, /* ... user mode */ | 1387 | CPUACCT_STAT_USER, /* ... user mode */ |
@@ -1531,7 +1497,7 @@ static unsigned long target_load(int cpu, int type) | |||
1531 | 1497 | ||
1532 | static struct sched_group *group_of(int cpu) | 1498 | static struct sched_group *group_of(int cpu) |
1533 | { | 1499 | { |
1534 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | 1500 | struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); |
1535 | 1501 | ||
1536 | if (!sd) | 1502 | if (!sd) |
1537 | return NULL; | 1503 | return NULL; |
@@ -1566,7 +1532,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1566 | 1532 | ||
1567 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1533 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1568 | 1534 | ||
1569 | static __read_mostly unsigned long *update_shares_data; | 1535 | static __read_mostly unsigned long __percpu *update_shares_data; |
1570 | 1536 | ||
1571 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1537 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1572 | 1538 | ||
@@ -1701,16 +1667,6 @@ static void update_shares(struct sched_domain *sd) | |||
1701 | } | 1667 | } |
1702 | } | 1668 | } |
1703 | 1669 | ||
1704 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1705 | { | ||
1706 | if (root_task_group_empty()) | ||
1707 | return; | ||
1708 | |||
1709 | raw_spin_unlock(&rq->lock); | ||
1710 | update_shares(sd); | ||
1711 | raw_spin_lock(&rq->lock); | ||
1712 | } | ||
1713 | |||
1714 | static void update_h_load(long cpu) | 1670 | static void update_h_load(long cpu) |
1715 | { | 1671 | { |
1716 | if (root_task_group_empty()) | 1672 | if (root_task_group_empty()) |
@@ -1725,10 +1681,6 @@ static inline void update_shares(struct sched_domain *sd) | |||
1725 | { | 1681 | { |
1726 | } | 1682 | } |
1727 | 1683 | ||
1728 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1729 | { | ||
1730 | } | ||
1731 | |||
1732 | #endif | 1684 | #endif |
1733 | 1685 | ||
1734 | #ifdef CONFIG_PREEMPT | 1686 | #ifdef CONFIG_PREEMPT |
@@ -1805,6 +1757,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
1805 | raw_spin_unlock(&busiest->lock); | 1757 | raw_spin_unlock(&busiest->lock); |
1806 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1758 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
1807 | } | 1759 | } |
1760 | |||
1761 | /* | ||
1762 | * double_rq_lock - safely lock two runqueues | ||
1763 | * | ||
1764 | * Note this does not disable interrupts like task_rq_lock; | ||
1765 | * you need to do so manually before calling. | ||
1766 | */ | ||
1767 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1768 | __acquires(rq1->lock) | ||
1769 | __acquires(rq2->lock) | ||
1770 | { | ||
1771 | BUG_ON(!irqs_disabled()); | ||
1772 | if (rq1 == rq2) { | ||
1773 | raw_spin_lock(&rq1->lock); | ||
1774 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1775 | } else { | ||
1776 | if (rq1 < rq2) { | ||
1777 | raw_spin_lock(&rq1->lock); | ||
1778 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1779 | } else { | ||
1780 | raw_spin_lock(&rq2->lock); | ||
1781 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1782 | } | ||
1783 | } | ||
1784 | update_rq_clock(rq1); | ||
1785 | update_rq_clock(rq2); | ||
1786 | } | ||
1787 | |||
1788 | /* | ||
1789 | * double_rq_unlock - safely unlock two runqueues | ||
1790 | * | ||
1791 | * Note this does not restore interrupts like task_rq_unlock; | ||
1792 | * you need to do so manually after calling. | ||
1793 | */ | ||
1794 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1795 | __releases(rq1->lock) | ||
1796 | __releases(rq2->lock) | ||
1797 | { | ||
1798 | raw_spin_unlock(&rq1->lock); | ||
1799 | if (rq1 != rq2) | ||
1800 | raw_spin_unlock(&rq2->lock); | ||
1801 | else | ||
1802 | __release(rq2->lock); | ||
1803 | } | ||
1804 | |||
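
The address ordering in double_rq_lock() (always take the lower-addressed lock first) is what makes two CPUs balancing against each other deadlock-free. A sketch of the caller pattern, mirroring the pull_task() path that this patch moves into sched_fair.c; pull_one is an illustrative name:

static void pull_one(struct rq *this_rq, int this_cpu,
                     struct rq *busiest, struct task_struct *p)
{
        local_irq_disable();            /* double_rq_lock() requires it */
        double_rq_lock(this_rq, busiest);
        deactivate_task(busiest, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
        check_preempt_curr(this_rq, p, 0);
        double_rq_unlock(this_rq, busiest);
        local_irq_enable();
}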
1808 | #endif | 1805 | #endif |
1809 | 1806 | ||
1810 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1807 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1834,18 +1831,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1834 | #endif | 1831 | #endif |
1835 | } | 1832 | } |
1836 | 1833 | ||
1837 | #include "sched_stats.h" | 1834 | static const struct sched_class rt_sched_class; |
1838 | #include "sched_idletask.c" | ||
1839 | #include "sched_fair.c" | ||
1840 | #include "sched_rt.c" | ||
1841 | #ifdef CONFIG_SCHED_DEBUG | ||
1842 | # include "sched_debug.c" | ||
1843 | #endif | ||
1844 | 1835 | ||
1845 | #define sched_class_highest (&rt_sched_class) | 1836 | #define sched_class_highest (&rt_sched_class) |
1846 | #define for_each_class(class) \ | 1837 | #define for_each_class(class) \ |
1847 | for (class = sched_class_highest; class; class = class->next) | 1838 | for (class = sched_class_highest; class; class = class->next) |
1848 | 1839 | ||
1840 | #include "sched_stats.h" | ||
1841 | |||
1849 | static void inc_nr_running(struct rq *rq) | 1842 | static void inc_nr_running(struct rq *rq) |
1850 | { | 1843 | { |
1851 | rq->nr_running++; | 1844 | rq->nr_running++; |
@@ -1883,13 +1876,14 @@ static void update_avg(u64 *avg, u64 sample) | |||
1883 | *avg += diff >> 3; | 1876 | *avg += diff >> 3; |
1884 | } | 1877 | } |
1885 | 1878 | ||
1886 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1879 | static void |
1880 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
1887 | { | 1881 | { |
1888 | if (wakeup) | 1882 | if (wakeup) |
1889 | p->se.start_runtime = p->se.sum_exec_runtime; | 1883 | p->se.start_runtime = p->se.sum_exec_runtime; |
1890 | 1884 | ||
1891 | sched_info_queued(p); | 1885 | sched_info_queued(p); |
1892 | p->sched_class->enqueue_task(rq, p, wakeup); | 1886 | p->sched_class->enqueue_task(rq, p, wakeup, head); |
1893 | p->se.on_rq = 1; | 1887 | p->se.on_rq = 1; |
1894 | } | 1888 | } |
1895 | 1889 | ||
@@ -1912,6 +1906,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1912 | } | 1906 | } |
1913 | 1907 | ||
1914 | /* | 1908 | /* |
1909 | * activate_task - move a task to the runqueue. | ||
1910 | */ | ||
1911 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1912 | { | ||
1913 | if (task_contributes_to_load(p)) | ||
1914 | rq->nr_uninterruptible--; | ||
1915 | |||
1916 | enqueue_task(rq, p, wakeup, false); | ||
1917 | inc_nr_running(rq); | ||
1918 | } | ||
1919 | |||
1920 | /* | ||
1921 | * deactivate_task - remove a task from the runqueue. | ||
1922 | */ | ||
1923 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1924 | { | ||
1925 | if (task_contributes_to_load(p)) | ||
1926 | rq->nr_uninterruptible++; | ||
1927 | |||
1928 | dequeue_task(rq, p, sleep); | ||
1929 | dec_nr_running(rq); | ||
1930 | } | ||
1931 | |||
1932 | #include "sched_idletask.c" | ||
1933 | #include "sched_fair.c" | ||
1934 | #include "sched_rt.c" | ||
1935 | #ifdef CONFIG_SCHED_DEBUG | ||
1936 | # include "sched_debug.c" | ||
1937 | #endif | ||
1938 | |||
1939 | /* | ||
1915 | * __normal_prio - return the priority that is based on the static prio | 1940 | * __normal_prio - return the priority that is based on the static prio |
1916 | */ | 1941 | */ |
1917 | static inline int __normal_prio(struct task_struct *p) | 1942 | static inline int __normal_prio(struct task_struct *p) |
@@ -1957,30 +1982,6 @@ static int effective_prio(struct task_struct *p) | |||
1957 | return p->prio; | 1982 | return p->prio; |
1958 | } | 1983 | } |
1959 | 1984 | ||
1960 | /* | ||
1961 | * activate_task - move a task to the runqueue. | ||
1962 | */ | ||
1963 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
1964 | { | ||
1965 | if (task_contributes_to_load(p)) | ||
1966 | rq->nr_uninterruptible--; | ||
1967 | |||
1968 | enqueue_task(rq, p, wakeup); | ||
1969 | inc_nr_running(rq); | ||
1970 | } | ||
1971 | |||
1972 | /* | ||
1973 | * deactivate_task - remove a task from the runqueue. | ||
1974 | */ | ||
1975 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
1976 | { | ||
1977 | if (task_contributes_to_load(p)) | ||
1978 | rq->nr_uninterruptible++; | ||
1979 | |||
1980 | dequeue_task(rq, p, sleep); | ||
1981 | dec_nr_running(rq); | ||
1982 | } | ||
1983 | |||
1984 | /** | 1985 | /** |
1985 | * task_curr - is this task currently executing on a CPU? | 1986 | * task_curr - is this task currently executing on a CPU? |
1986 | * @p: the task in question. | 1987 | * @p: the task in question. |
@@ -2320,14 +2321,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2320 | } | 2321 | } |
2321 | 2322 | ||
2322 | /* | 2323 | /* |
2323 | * Called from: | 2324 | * Gets called from 3 sites (exec, fork, wakeup). Since it is called without |
2324 | * | 2325 | * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done |
2325 | * - fork, @p is stable because it isn't on the tasklist yet | 2326 | * by: |
2326 | * | 2327 | * |
2327 | * - exec, @p is unstable, retry loop | 2328 | * exec: is unstable, retry loop |
2328 | * | 2329 | * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING |
2329 | * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so | ||
2330 | * we should be good. | ||
2331 | */ | 2330 | */ |
2332 | static inline | 2331 | static inline |
2333 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 2332 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
@@ -2371,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2371 | { | 2370 | { |
2372 | int cpu, orig_cpu, this_cpu, success = 0; | 2371 | int cpu, orig_cpu, this_cpu, success = 0; |
2373 | unsigned long flags; | 2372 | unsigned long flags; |
2374 | struct rq *rq, *orig_rq; | 2373 | struct rq *rq; |
2375 | 2374 | ||
2376 | if (!sched_feat(SYNC_WAKEUPS)) | 2375 | if (!sched_feat(SYNC_WAKEUPS)) |
2377 | wake_flags &= ~WF_SYNC; | 2376 | wake_flags &= ~WF_SYNC; |
@@ -2379,7 +2378,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2379 | this_cpu = get_cpu(); | 2378 | this_cpu = get_cpu(); |
2380 | 2379 | ||
2381 | smp_wmb(); | 2380 | smp_wmb(); |
2382 | rq = orig_rq = task_rq_lock(p, &flags); | 2381 | rq = task_rq_lock(p, &flags); |
2383 | update_rq_clock(rq); | 2382 | update_rq_clock(rq); |
2384 | if (!(p->state & state)) | 2383 | if (!(p->state & state)) |
2385 | goto out; | 2384 | goto out; |
@@ -2410,14 +2409,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2410 | __task_rq_unlock(rq); | 2409 | __task_rq_unlock(rq); |
2411 | 2410 | ||
2412 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2411 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2413 | if (cpu != orig_cpu) | 2412 | if (cpu != orig_cpu) { |
2413 | /* | ||
2414 | * Since we migrate the task without holding any rq->lock, | ||
2415 | * we need to be careful with task_rq_lock(), because that | ||
2416 | * might end up locking an invalid rq. | ||
2417 | */ | ||
2414 | set_task_cpu(p, cpu); | 2418 | set_task_cpu(p, cpu); |
2419 | } | ||
2415 | 2420 | ||
2416 | rq = __task_rq_lock(p); | 2421 | rq = cpu_rq(cpu); |
2422 | raw_spin_lock(&rq->lock); | ||
2417 | update_rq_clock(rq); | 2423 | update_rq_clock(rq); |
2418 | 2424 | ||
2425 | /* | ||
2426 | * We migrated the task without holding either rq->lock; however, | ||
2427 | * since the task is not on the task list itself, nobody else | ||
2428 | * will try to migrate the task, hence the rq should match the | ||
2429 | * cpu we just moved it to. | ||
2430 | */ | ||
2431 | WARN_ON(task_cpu(p) != cpu); | ||
2419 | WARN_ON(p->state != TASK_WAKING); | 2432 | WARN_ON(p->state != TASK_WAKING); |
2420 | cpu = task_cpu(p); | ||
2421 | 2433 | ||
2422 | #ifdef CONFIG_SCHEDSTATS | 2434 | #ifdef CONFIG_SCHEDSTATS |
2423 | schedstat_inc(rq, ttwu_count); | 2435 | schedstat_inc(rq, ttwu_count); |
@@ -2620,9 +2632,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2620 | if (p->sched_class->task_fork) | 2632 | if (p->sched_class->task_fork) |
2621 | p->sched_class->task_fork(p); | 2633 | p->sched_class->task_fork(p); |
2622 | 2634 | ||
2623 | #ifdef CONFIG_SMP | ||
2624 | cpu = select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2625 | #endif | ||
2626 | set_task_cpu(p, cpu); | 2635 | set_task_cpu(p, cpu); |
2627 | 2636 | ||
2628 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2637 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
@@ -2652,8 +2661,29 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2652 | { | 2661 | { |
2653 | unsigned long flags; | 2662 | unsigned long flags; |
2654 | struct rq *rq; | 2663 | struct rq *rq; |
2664 | int cpu __maybe_unused = get_cpu(); | ||
2665 | |||
2666 | #ifdef CONFIG_SMP | ||
2667 | /* | ||
2668 | * Fork balancing, do it here and not earlier because: | ||
2669 | * - cpus_allowed can change in the fork path | ||
2670 | * - any previously selected cpu might disappear through hotplug | ||
2671 | * | ||
2672 | * We still have TASK_WAKING but PF_STARTING is gone now, meaning | ||
2673 | * ->cpus_allowed is stable, and we have preemption disabled, meaning | ||
2674 | * cpu_online_mask is stable. | ||
2675 | */ | ||
2676 | cpu = select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2677 | set_task_cpu(p, cpu); | ||
2678 | #endif | ||
2679 | |||
2680 | /* | ||
2681 | * Since the task is not on the rq and we still have TASK_WAKING set, | ||
2682 | * nobody else will migrate this task. | ||
2683 | */ | ||
2684 | rq = cpu_rq(cpu); | ||
2685 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2655 | 2686 | ||
2656 | rq = task_rq_lock(p, &flags); | ||
2657 | BUG_ON(p->state != TASK_WAKING); | 2687 | BUG_ON(p->state != TASK_WAKING); |
2658 | p->state = TASK_RUNNING; | 2688 | p->state = TASK_RUNNING; |
2659 | update_rq_clock(rq); | 2689 | update_rq_clock(rq); |
@@ -2665,6 +2695,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2665 | p->sched_class->task_woken(rq, p); | 2695 | p->sched_class->task_woken(rq, p); |
2666 | #endif | 2696 | #endif |
2667 | task_rq_unlock(rq, &flags); | 2697 | task_rq_unlock(rq, &flags); |
2698 | put_cpu(); | ||
2668 | } | 2699 | } |
2669 | 2700 | ||
2670 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2701 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -3094,50 +3125,6 @@ static void update_cpu_load(struct rq *this_rq) | |||
3094 | #ifdef CONFIG_SMP | 3125 | #ifdef CONFIG_SMP |
3095 | 3126 | ||
3096 | /* | 3127 | /* |
3097 | * double_rq_lock - safely lock two runqueues | ||
3098 | * | ||
3099 | * Note this does not disable interrupts like task_rq_lock, | ||
3100 | * you need to do so manually before calling. | ||
3101 | */ | ||
3102 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
3103 | __acquires(rq1->lock) | ||
3104 | __acquires(rq2->lock) | ||
3105 | { | ||
3106 | BUG_ON(!irqs_disabled()); | ||
3107 | if (rq1 == rq2) { | ||
3108 | raw_spin_lock(&rq1->lock); | ||
3109 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
3110 | } else { | ||
3111 | if (rq1 < rq2) { | ||
3112 | raw_spin_lock(&rq1->lock); | ||
3113 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
3114 | } else { | ||
3115 | raw_spin_lock(&rq2->lock); | ||
3116 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
3117 | } | ||
3118 | } | ||
3119 | update_rq_clock(rq1); | ||
3120 | update_rq_clock(rq2); | ||
3121 | } | ||
3122 | |||
3123 | /* | ||
3124 | * double_rq_unlock - safely unlock two runqueues | ||
3125 | * | ||
3126 | * Note this does not restore interrupts like task_rq_unlock, | ||
3127 | * you need to do so manually after calling. | ||
3128 | */ | ||
3129 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
3130 | __releases(rq1->lock) | ||
3131 | __releases(rq2->lock) | ||
3132 | { | ||
3133 | raw_spin_unlock(&rq1->lock); | ||
3134 | if (rq1 != rq2) | ||
3135 | raw_spin_unlock(&rq2->lock); | ||
3136 | else | ||
3137 | __release(rq2->lock); | ||
3138 | } | ||
3139 | |||
3140 | /* | ||
3141 | * sched_exec - execve() is a valuable balancing opportunity, because at | 3128 | * sched_exec - execve() is a valuable balancing opportunity, because at |
3142 | * this point the task has the smallest effective memory and cache footprint. | 3129 | * this point the task has the smallest effective memory and cache footprint. |
3143 | */ | 3130 | */ |
@@ -3185,1771 +3172,6 @@ again: | |||
3185 | task_rq_unlock(rq, &flags); | 3172 | task_rq_unlock(rq, &flags); |
3186 | } | 3173 | } |
3187 | 3174 | ||
3188 | /* | ||
3189 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
3190 | * Both runqueues must be locked. | ||
3191 | */ | ||
3192 | static void pull_task(struct rq *src_rq, struct task_struct *p, | ||
3193 | struct rq *this_rq, int this_cpu) | ||
3194 | { | ||
3195 | deactivate_task(src_rq, p, 0); | ||
3196 | set_task_cpu(p, this_cpu); | ||
3197 | activate_task(this_rq, p, 0); | ||
3198 | check_preempt_curr(this_rq, p, 0); | ||
3199 | } | ||
3200 | |||
3201 | /* | ||
3202 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
3203 | */ | ||
3204 | static | ||
3205 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
3206 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3207 | int *all_pinned) | ||
3208 | { | ||
3209 | int tsk_cache_hot = 0; | ||
3210 | /* | ||
3211 | * We do not migrate tasks that are: | ||
3212 | * 1) running (obviously), or | ||
3213 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
3214 | * 3) are cache-hot on their current CPU. | ||
3215 | */ | ||
3216 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
3217 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
3218 | return 0; | ||
3219 | } | ||
3220 | *all_pinned = 0; | ||
3221 | |||
3222 | if (task_running(rq, p)) { | ||
3223 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
3224 | return 0; | ||
3225 | } | ||
3226 | |||
3227 | /* | ||
3228 | * Aggressive migration if: | ||
3229 | * 1) task is cache cold, or | ||
3230 | * 2) too many balance attempts have failed. | ||
3231 | */ | ||
3232 | |||
3233 | tsk_cache_hot = task_hot(p, rq->clock, sd); | ||
3234 | if (!tsk_cache_hot || | ||
3235 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
3236 | #ifdef CONFIG_SCHEDSTATS | ||
3237 | if (tsk_cache_hot) { | ||
3238 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
3239 | schedstat_inc(p, se.nr_forced_migrations); | ||
3240 | } | ||
3241 | #endif | ||
3242 | return 1; | ||
3243 | } | ||
3244 | |||
3245 | if (tsk_cache_hot) { | ||
3246 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
3247 | return 0; | ||
3248 | } | ||
3249 | return 1; | ||
3250 | } | ||
3251 | |||
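The migration filter above reduces to three ordered checks: CPU affinity, whether the task is currently running, and cache hotness (overridden once enough balance attempts have failed). A simplified stand-alone sketch with hypothetical flat types, not the kernel's structures:

#include <stdbool.h>

struct task {
        bool running;
        bool cache_hot;
        unsigned long cpus_allowed;     /* affinity as a plain bitmask */
};

static bool can_migrate(const struct task *t, int dst_cpu,
                        int nr_balance_failed, int cache_nice_tries)
{
        if (!(t->cpus_allowed & (1UL << dst_cpu)))
                return false;           /* pinned away from dst_cpu */
        if (t->running)
                return false;           /* can't pull the running task */
        if (t->cache_hot && nr_balance_failed <= cache_nice_tries)
                return false;           /* keep cache-hot tasks in place */
        return true;                    /* cold, idle, and allowed: pull it */
}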
3252 | static unsigned long | ||
3253 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3254 | unsigned long max_load_move, struct sched_domain *sd, | ||
3255 | enum cpu_idle_type idle, int *all_pinned, | ||
3256 | int *this_best_prio, struct rq_iterator *iterator) | ||
3257 | { | ||
3258 | int loops = 0, pulled = 0, pinned = 0; | ||
3259 | struct task_struct *p; | ||
3260 | long rem_load_move = max_load_move; | ||
3261 | |||
3262 | if (max_load_move == 0) | ||
3263 | goto out; | ||
3264 | |||
3265 | pinned = 1; | ||
3266 | |||
3267 | /* | ||
3268 | * Start the load-balancing iterator: | ||
3269 | */ | ||
3270 | p = iterator->start(iterator->arg); | ||
3271 | next: | ||
3272 | if (!p || loops++ > sysctl_sched_nr_migrate) | ||
3273 | goto out; | ||
3274 | |||
3275 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
3276 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3277 | p = iterator->next(iterator->arg); | ||
3278 | goto next; | ||
3279 | } | ||
3280 | |||
3281 | pull_task(busiest, p, this_rq, this_cpu); | ||
3282 | pulled++; | ||
3283 | rem_load_move -= p->se.load.weight; | ||
3284 | |||
3285 | #ifdef CONFIG_PREEMPT | ||
3286 | /* | ||
3287 | * NEWIDLE balancing is a source of latency, so preemptible kernels | ||
3288 | * will stop after the first task is pulled to minimize the critical | ||
3289 | * section. | ||
3290 | */ | ||
3291 | if (idle == CPU_NEWLY_IDLE) | ||
3292 | goto out; | ||
3293 | #endif | ||
3294 | |||
3295 | /* | ||
3296 | * We only want to steal up to the prescribed amount of weighted load. | ||
3297 | */ | ||
3298 | if (rem_load_move > 0) { | ||
3299 | if (p->prio < *this_best_prio) | ||
3300 | *this_best_prio = p->prio; | ||
3301 | p = iterator->next(iterator->arg); | ||
3302 | goto next; | ||
3303 | } | ||
3304 | out: | ||
3305 | /* | ||
3306 | * Right now, this is one of only two places pull_task() is called, | ||
3307 | * so we can safely collect pull_task() stats here rather than | ||
3308 | * inside pull_task(). | ||
3309 | */ | ||
3310 | schedstat_add(sd, lb_gained[idle], pulled); | ||
3311 | |||
3312 | if (all_pinned) | ||
3313 | *all_pinned = pinned; | ||
3314 | |||
3315 | return max_load_move - rem_load_move; | ||
3316 | } | ||
3317 | |||
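balance_tasks() above is a budgeted scan: it stops once the weighted-load budget is spent or sysctl_sched_nr_migrate candidates have been inspected, and it skips any task whose weight exceeds twice the remaining budget. An array-based sketch of that loop shape (names and parameters are illustrative):

/* Pull candidate weights until the budget is spent or the scan limit
 * is reached; a task heavier than twice the remaining budget is skipped,
 * mirroring the (weight >> 1) > rem_load_move test above. */
static unsigned long pull_up_to(const unsigned long *weight, int ntasks,
                                unsigned long budget, int scan_limit)
{
        unsigned long moved = 0;

        for (int i = 0; i < ntasks && i < scan_limit && moved < budget; i++) {
                if ((weight[i] >> 1) > budget - moved)
                        continue;
                moved += weight[i];
        }
        return moved;
}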
3318 | /* | ||
3319 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
3320 | * this_rq, as part of a balancing operation within domain "sd". | ||
3321 | * Returns 1 if successful and 0 otherwise. | ||
3322 | * | ||
3323 | * Called with both runqueues locked. | ||
3324 | */ | ||
3325 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3326 | unsigned long max_load_move, | ||
3327 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3328 | int *all_pinned) | ||
3329 | { | ||
3330 | const struct sched_class *class = sched_class_highest; | ||
3331 | unsigned long total_load_moved = 0; | ||
3332 | int this_best_prio = this_rq->curr->prio; | ||
3333 | |||
3334 | do { | ||
3335 | total_load_moved += | ||
3336 | class->load_balance(this_rq, this_cpu, busiest, | ||
3337 | max_load_move - total_load_moved, | ||
3338 | sd, idle, all_pinned, &this_best_prio); | ||
3339 | class = class->next; | ||
3340 | |||
3341 | #ifdef CONFIG_PREEMPT | ||
3342 | /* | ||
3343 | * NEWIDLE balancing is a source of latency, so preemptible | ||
3344 | * kernels will stop after the first task is pulled to minimize | ||
3345 | * the critical section. | ||
3346 | */ | ||
3347 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3348 | break; | ||
3349 | #endif | ||
3350 | } while (class && max_load_move > total_load_moved); | ||
3351 | |||
3352 | return total_load_moved > 0; | ||
3353 | } | ||
3354 | |||
3355 | static int | ||
3356 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3357 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3358 | struct rq_iterator *iterator) | ||
3359 | { | ||
3360 | struct task_struct *p = iterator->start(iterator->arg); | ||
3361 | int pinned = 0; | ||
3362 | |||
3363 | while (p) { | ||
3364 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
3365 | pull_task(busiest, p, this_rq, this_cpu); | ||
3366 | /* | ||
3367 | * Right now, this is only the second place pull_task() | ||
3368 | * is called, so we can safely collect pull_task() | ||
3369 | * stats here rather than inside pull_task(). | ||
3370 | */ | ||
3371 | schedstat_inc(sd, lb_gained[idle]); | ||
3372 | |||
3373 | return 1; | ||
3374 | } | ||
3375 | p = iterator->next(iterator->arg); | ||
3376 | } | ||
3377 | |||
3378 | return 0; | ||
3379 | } | ||
3380 | |||
3381 | /* | ||
3382 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
3383 | * part of active balancing operations within "domain". | ||
3384 | * Returns 1 if successful and 0 otherwise. | ||
3385 | * | ||
3386 | * Called with both runqueues locked. | ||
3387 | */ | ||
3388 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3389 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
3390 | { | ||
3391 | const struct sched_class *class; | ||
3392 | |||
3393 | for_each_class(class) { | ||
3394 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | ||
3395 | return 1; | ||
3396 | } | ||
3397 | |||
3398 | return 0; | ||
3399 | } | ||
3400 | /********** Helpers for find_busiest_group ************************/ | ||
3401 | /* | ||
3402 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
3403 | * during load balancing. | ||
3404 | */ | ||
3405 | struct sd_lb_stats { | ||
3406 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
3407 | struct sched_group *this; /* Local group in this sd */ | ||
3408 | unsigned long total_load; /* Total load of all groups in sd */ | ||
3409 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
3410 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
3411 | |||
3412 | /** Statistics of this group */ | ||
3413 | unsigned long this_load; | ||
3414 | unsigned long this_load_per_task; | ||
3415 | unsigned long this_nr_running; | ||
3416 | |||
3417 | /* Statistics of the busiest group */ | ||
3418 | unsigned long max_load; | ||
3419 | unsigned long busiest_load_per_task; | ||
3420 | unsigned long busiest_nr_running; | ||
3421 | |||
3422 | int group_imb; /* Is there an imbalance in this sd? */ | ||
3423 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3424 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3425 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3426 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3427 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3428 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3429 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3430 | #endif | ||
3431 | }; | ||
3432 | |||
3433 | /* | ||
3434 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
3435 | */ | ||
3436 | struct sg_lb_stats { | ||
3437 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
3438 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
3439 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
3440 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
3441 | unsigned long group_capacity; | ||
3442 | int group_imb; /* Is there an imbalance in the group? */ | ||
3443 | }; | ||
3444 | |||
3445 | /** | ||
3446 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
3447 | * @group: The group whose first cpu is to be returned. | ||
3448 | */ | ||
3449 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
3450 | { | ||
3451 | return cpumask_first(sched_group_cpus(group)); | ||
3452 | } | ||
3453 | |||
3454 | /** | ||
3455 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
3456 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
3457 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | ||
3458 | */ | ||
3459 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
3460 | enum cpu_idle_type idle) | ||
3461 | { | ||
3462 | int load_idx; | ||
3463 | |||
3464 | switch (idle) { | ||
3465 | case CPU_NOT_IDLE: | ||
3466 | load_idx = sd->busy_idx; | ||
3467 | break; | ||
3468 | |||
3469 | case CPU_NEWLY_IDLE: | ||
3470 | load_idx = sd->newidle_idx; | ||
3471 | break; | ||
3472 | default: | ||
3473 | load_idx = sd->idle_idx; | ||
3474 | break; | ||
3475 | } | ||
3476 | |||
3477 | return load_idx; | ||
3478 | } | ||
3479 | |||
3480 | |||
3481 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3482 | /** | ||
3483 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3484 | * the given sched_domain, during load balancing. | ||
3485 | * | ||
3486 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3487 | * @sds: Variable containing the statistics for sd. | ||
3488 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3489 | */ | ||
3490 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3491 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3492 | { | ||
3493 | /* | ||
3494 | * Busy processors will not participate in power savings | ||
3495 | * balance. | ||
3496 | */ | ||
3497 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3498 | sds->power_savings_balance = 0; | ||
3499 | else { | ||
3500 | sds->power_savings_balance = 1; | ||
3501 | sds->min_nr_running = ULONG_MAX; | ||
3502 | sds->leader_nr_running = 0; | ||
3503 | } | ||
3504 | } | ||
3505 | |||
3506 | /** | ||
3507 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3508 | * sched_domain while performing load balancing. | ||
3509 | * | ||
3510 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3511 | * @sds: Variable containing the statistics of the sched_domain | ||
3512 | * @local_group: Does group contain the CPU for which we're performing | ||
3513 | * load balancing ? | ||
3514 | * @sgs: Variable containing the statistics of the group. | ||
3515 | */ | ||
3516 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3517 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3518 | { | ||
3519 | |||
3520 | if (!sds->power_savings_balance) | ||
3521 | return; | ||
3522 | |||
3523 | /* | ||
3524 | * If the local group is idle or completely loaded | ||
3525 | * no need to do power savings balance at this domain | ||
3526 | */ | ||
3527 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3528 | !sds->this_nr_running)) | ||
3529 | sds->power_savings_balance = 0; | ||
3530 | |||
3531 | /* | ||
3532 | * If a group is already running at full capacity or idle, | ||
3533 | * don't include that group in power savings calculations | ||
3534 | */ | ||
3535 | if (!sds->power_savings_balance || | ||
3536 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3537 | !sgs->sum_nr_running) | ||
3538 | return; | ||
3539 | |||
3540 | /* | ||
3541 | * Calculate the group which has the least non-idle load. | ||
3542 | * This is the group from where we need to pick up the load | ||
3543 | * for saving power | ||
3544 | */ | ||
3545 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3546 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3547 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3548 | sds->group_min = group; | ||
3549 | sds->min_nr_running = sgs->sum_nr_running; | ||
3550 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3551 | sgs->sum_nr_running; | ||
3552 | } | ||
3553 | |||
3554 | /* | ||
3555 | * Calculate the group which is nearly at its | ||
3556 | * capacity but still has some space to pick up load | ||
3557 | * from another group and save more power | ||
3558 | */ | ||
3559 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3560 | return; | ||
3561 | |||
3562 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3563 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3564 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3565 | sds->group_leader = group; | ||
3566 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3567 | } | ||
3568 | } | ||
3569 | |||
3570 | /** | ||
3571 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3572 | * @sds: Variable containing the statistics of the sched_domain | ||
3573 | * under consideration. | ||
3574 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3575 | * @imbalance: Variable to store the imbalance. | ||
3576 | * | ||
3577 | * Description: | ||
3578 | * Check if we have potential to perform some power-savings balance. | ||
3579 | * If yes, set the busiest group to be the least loaded group in the | ||
3580 | * sched_domain, so that its CPUs can be put to idle. | ||
3581 | * | ||
3582 | * Returns 1 if there is potential to perform power-savings balance. | ||
3583 | * Else returns 0. | ||
3584 | */ | ||
3585 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3586 | int this_cpu, unsigned long *imbalance) | ||
3587 | { | ||
3588 | if (!sds->power_savings_balance) | ||
3589 | return 0; | ||
3590 | |||
3591 | if (sds->this != sds->group_leader || | ||
3592 | sds->group_leader == sds->group_min) | ||
3593 | return 0; | ||
3594 | |||
3595 | *imbalance = sds->min_load_per_task; | ||
3596 | sds->busiest = sds->group_min; | ||
3597 | |||
3598 | return 1; | ||
3599 | |||
3600 | } | ||
3601 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3602 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3603 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3604 | { | ||
3605 | return; | ||
3606 | } | ||
3607 | |||
3608 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3609 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3610 | { | ||
3611 | return; | ||
3612 | } | ||
3613 | |||
3614 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3615 | int this_cpu, unsigned long *imbalance) | ||
3616 | { | ||
3617 | return 0; | ||
3618 | } | ||
3619 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3620 | |||
3621 | |||
3622 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3623 | { | ||
3624 | return SCHED_LOAD_SCALE; | ||
3625 | } | ||
3626 | |||
3627 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3628 | { | ||
3629 | return default_scale_freq_power(sd, cpu); | ||
3630 | } | ||
3631 | |||
3632 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3633 | { | ||
3634 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3635 | unsigned long smt_gain = sd->smt_gain; | ||
3636 | |||
3637 | smt_gain /= weight; | ||
3638 | |||
3639 | return smt_gain; | ||
3640 | } | ||
3641 | |||
3642 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3643 | { | ||
3644 | return default_scale_smt_power(sd, cpu); | ||
3645 | } | ||
3646 | |||
3647 | unsigned long scale_rt_power(int cpu) | ||
3648 | { | ||
3649 | struct rq *rq = cpu_rq(cpu); | ||
3650 | u64 total, available; | ||
3651 | |||
3652 | sched_avg_update(rq); | ||
3653 | |||
3654 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3655 | available = total - rq->rt_avg; | ||
3656 | |||
3657 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3658 | total = SCHED_LOAD_SCALE; | ||
3659 | |||
3660 | total >>= SCHED_LOAD_SHIFT; | ||
3661 | |||
3662 | return div_u64(available, total); | ||
3663 | } | ||
3664 | |||
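scale_rt_power() returns a fixed-point fraction scaled by SCHED_LOAD_SCALE (1024): dividing available by total >> SCHED_LOAD_SHIFT is equivalent to available * 1024 / total, i.e. the share of the averaging window left over after real-time activity. A worked stand-alone sketch using plain integers instead of rq fields:

#include <stdint.h>
#include <stdio.h>

#define LOAD_SCALE 1024ULL

/* Fraction of the window left for fair tasks, scaled by 1024. */
static uint64_t rt_scale(uint64_t total, uint64_t rt_avg)
{
        uint64_t available = total - rt_avg;
        uint64_t unit = total / LOAD_SCALE;     /* total >> SCHED_LOAD_SHIFT */

        return unit ? available / unit : LOAD_SCALE;
}

int main(void)
{
        /* RT work consumed 25% of the window: ~768 of 1024 remains. */
        printf("%llu\n", (unsigned long long)rt_scale(1000000, 250000));
        return 0;
}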
3665 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3666 | { | ||
3667 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3668 | unsigned long power = SCHED_LOAD_SCALE; | ||
3669 | struct sched_group *sdg = sd->groups; | ||
3670 | |||
3671 | if (sched_feat(ARCH_POWER)) | ||
3672 | power *= arch_scale_freq_power(sd, cpu); | ||
3673 | else | ||
3674 | power *= default_scale_freq_power(sd, cpu); | ||
3675 | |||
3676 | power >>= SCHED_LOAD_SHIFT; | ||
3677 | |||
3678 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3679 | if (sched_feat(ARCH_POWER)) | ||
3680 | power *= arch_scale_smt_power(sd, cpu); | ||
3681 | else | ||
3682 | power *= default_scale_smt_power(sd, cpu); | ||
3683 | |||
3684 | power >>= SCHED_LOAD_SHIFT; | ||
3685 | } | ||
3686 | |||
3687 | power *= scale_rt_power(cpu); | ||
3688 | power >>= SCHED_LOAD_SHIFT; | ||
3689 | |||
3690 | if (!power) | ||
3691 | power = 1; | ||
3692 | |||
3693 | sdg->cpu_power = power; | ||
3694 | } | ||
3695 | |||
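update_cpu_power() composes its scaling factors multiplicatively in fixed point: every factor is expressed relative to SCHED_LOAD_SCALE (1024), so each multiplication is followed by a >> SCHED_LOAD_SHIFT to renormalize. A worked example with assumed factor values (the numbers are illustrative, not measured):

#include <stdio.h>

#define SHIFT 10                        /* SCHED_LOAD_SHIFT */
#define SCALE (1UL << SHIFT)            /* SCHED_LOAD_SCALE  */

int main(void)
{
        unsigned long power = SCALE;    /* nominal capacity: 1024 */
        unsigned long freq  = 819;      /* ~80% of nominal frequency */
        unsigned long smt   = 589;      /* SMT sibling discount */
        unsigned long rt    = 768;      /* 75% left after RT load */

        power = (power * freq) >> SHIFT;        /* 819 */
        power = (power * smt)  >> SHIFT;        /* 471 */
        power = (power * rt)   >> SHIFT;        /* 353 */
        if (!power)
                power = 1;              /* never advertise zero power */

        printf("cpu_power = %lu\n", power);     /* prints 353 */
        return 0;
}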
3696 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3697 | { | ||
3698 | struct sched_domain *child = sd->child; | ||
3699 | struct sched_group *group, *sdg = sd->groups; | ||
3700 | unsigned long power; | ||
3701 | |||
3702 | if (!child) { | ||
3703 | update_cpu_power(sd, cpu); | ||
3704 | return; | ||
3705 | } | ||
3706 | |||
3707 | power = 0; | ||
3708 | |||
3709 | group = child->groups; | ||
3710 | do { | ||
3711 | power += group->cpu_power; | ||
3712 | group = group->next; | ||
3713 | } while (group != child->groups); | ||
3714 | |||
3715 | sdg->cpu_power = power; | ||
3716 | } | ||
3717 | |||
3718 | /** | ||
3719 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
3720 | * @sd: The sched_domain whose statistics are to be updated. | ||
3721 | * @group: sched_group whose statistics are to be updated. | ||
3722 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3723 | * @idle: Idle status of this_cpu | ||
3724 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
3725 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3726 | * @local_group: Does group contain this_cpu. | ||
3727 | * @cpus: Set of cpus considered for load balancing. | ||
3728 | * @balance: Should we balance. | ||
3729 | * @sgs: variable to hold the statistics for this group. | ||
3730 | */ | ||
3731 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
3732 | struct sched_group *group, int this_cpu, | ||
3733 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
3734 | int local_group, const struct cpumask *cpus, | ||
3735 | int *balance, struct sg_lb_stats *sgs) | ||
3736 | { | ||
3737 | unsigned long load, max_cpu_load, min_cpu_load; | ||
3738 | int i; | ||
3739 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
3740 | unsigned long sum_avg_load_per_task; | ||
3741 | unsigned long avg_load_per_task; | ||
3742 | |||
3743 | if (local_group) { | ||
3744 | balance_cpu = group_first_cpu(group); | ||
3745 | if (balance_cpu == this_cpu) | ||
3746 | update_group_power(sd, this_cpu); | ||
3747 | } | ||
3748 | |||
3749 | /* Tally up the load of all CPUs in the group */ | ||
3750 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3751 | max_cpu_load = 0; | ||
3752 | min_cpu_load = ~0UL; | ||
3753 | |||
3754 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
3755 | struct rq *rq = cpu_rq(i); | ||
3756 | |||
3757 | if (*sd_idle && rq->nr_running) | ||
3758 | *sd_idle = 0; | ||
3759 | |||
3760 | /* Bias balancing toward cpus of our domain */ | ||
3761 | if (local_group) { | ||
3762 | if (idle_cpu(i) && !first_idle_cpu) { | ||
3763 | first_idle_cpu = 1; | ||
3764 | balance_cpu = i; | ||
3765 | } | ||
3766 | |||
3767 | load = target_load(i, load_idx); | ||
3768 | } else { | ||
3769 | load = source_load(i, load_idx); | ||
3770 | if (load > max_cpu_load) | ||
3771 | max_cpu_load = load; | ||
3772 | if (min_cpu_load > load) | ||
3773 | min_cpu_load = load; | ||
3774 | } | ||
3775 | |||
3776 | sgs->group_load += load; | ||
3777 | sgs->sum_nr_running += rq->nr_running; | ||
3778 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
3779 | |||
3780 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
3781 | } | ||
3782 | |||
3783 | /* | ||
3784 | * First idle cpu or the first cpu(busiest) in this sched group | ||
3785 | * is eligible for doing load balancing at this and above | ||
3786 | * domains. In the newly idle case, we will allow all the cpu's | ||
3787 | * to do the newly idle load balance. | ||
3788 | */ | ||
3789 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
3790 | balance_cpu != this_cpu && balance) { | ||
3791 | *balance = 0; | ||
3792 | return; | ||
3793 | } | ||
3794 | |||
3795 | /* Adjust by relative CPU power of the group */ | ||
3796 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
3797 | |||
3798 | |||
3799 | /* | ||
3800 | * Consider the group unbalanced when the imbalance is larger | ||
3801 | * than the average weight of two tasks. | ||
3802 | * | ||
3803 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3804 | * might not be a suitable number - should we keep a | ||
3805 | * normalized nr_running number somewhere that negates | ||
3806 | * the hierarchy? | ||
3807 | */ | ||
3808 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / | ||
3809 | group->cpu_power; | ||
3810 | |||
3811 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3812 | sgs->group_imb = 1; | ||
3813 | |||
3814 | sgs->group_capacity = | ||
3815 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
3816 | } | ||
3817 | |||
3818 | /** | ||
3819 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
3820 | * @sd: sched_domain whose statistics are to be updated. | ||
3821 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3822 | * @idle: Idle status of this_cpu | ||
3823 | * @sd_idle: Idle status of the sched_domain containing group. | ||
3824 | * @cpus: Set of cpus considered for load balancing. | ||
3825 | * @balance: Should we balance. | ||
3826 | * @sds: variable to hold the statistics for this sched_domain. | ||
3827 | */ | ||
3828 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
3829 | enum cpu_idle_type idle, int *sd_idle, | ||
3830 | const struct cpumask *cpus, int *balance, | ||
3831 | struct sd_lb_stats *sds) | ||
3832 | { | ||
3833 | struct sched_domain *child = sd->child; | ||
3834 | struct sched_group *group = sd->groups; | ||
3835 | struct sg_lb_stats sgs; | ||
3836 | int load_idx, prefer_sibling = 0; | ||
3837 | |||
3838 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3839 | prefer_sibling = 1; | ||
3840 | |||
3841 | init_sd_power_savings_stats(sd, sds, idle); | ||
3842 | load_idx = get_sd_load_idx(sd, idle); | ||
3843 | |||
3844 | do { | ||
3845 | int local_group; | ||
3846 | |||
3847 | local_group = cpumask_test_cpu(this_cpu, | ||
3848 | sched_group_cpus(group)); | ||
3849 | memset(&sgs, 0, sizeof(sgs)); | ||
3850 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
3851 | local_group, cpus, balance, &sgs); | ||
3852 | |||
3853 | if (local_group && balance && !(*balance)) | ||
3854 | return; | ||
3855 | |||
3856 | sds->total_load += sgs.group_load; | ||
3857 | sds->total_pwr += group->cpu_power; | ||
3858 | |||
3859 | /* | ||
3860 | * In case the child domain prefers tasks go to siblings | ||
3861 | * first, lower the group capacity to one so that we'll try | ||
3862 | * and move all the excess tasks away. | ||
3863 | */ | ||
3864 | if (prefer_sibling) | ||
3865 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3866 | |||
3867 | if (local_group) { | ||
3868 | sds->this_load = sgs.avg_load; | ||
3869 | sds->this = group; | ||
3870 | sds->this_nr_running = sgs.sum_nr_running; | ||
3871 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
3872 | } else if (sgs.avg_load > sds->max_load && | ||
3873 | (sgs.sum_nr_running > sgs.group_capacity || | ||
3874 | sgs.group_imb)) { | ||
3875 | sds->max_load = sgs.avg_load; | ||
3876 | sds->busiest = group; | ||
3877 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
3878 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
3879 | sds->group_imb = sgs.group_imb; | ||
3880 | } | ||
3881 | |||
3882 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
3883 | group = group->next; | ||
3884 | } while (group != sd->groups); | ||
3885 | } | ||
3886 | |||
3887 | /** | ||
3888 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
3889 | * amongst the groups of a sched_domain, during | ||
3890 | * load balancing. | ||
3891 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
3892 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
3893 | * @imbalance: Variable to store the imbalance. | ||
3894 | */ | ||
3895 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
3896 | int this_cpu, unsigned long *imbalance) | ||
3897 | { | ||
3898 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
3899 | unsigned int imbn = 2; | ||
3900 | |||
3901 | if (sds->this_nr_running) { | ||
3902 | sds->this_load_per_task /= sds->this_nr_running; | ||
3903 | if (sds->busiest_load_per_task > | ||
3904 | sds->this_load_per_task) | ||
3905 | imbn = 1; | ||
3906 | } else | ||
3907 | sds->this_load_per_task = | ||
3908 | cpu_avg_load_per_task(this_cpu); | ||
3909 | |||
3910 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= | ||
3911 | sds->busiest_load_per_task * imbn) { | ||
3912 | *imbalance = sds->busiest_load_per_task; | ||
3913 | return; | ||
3914 | } | ||
3915 | |||
3916 | /* | ||
3917 | * OK, we don't have enough imbalance to justify moving tasks, | ||
3918 | * however we may be able to increase total CPU power used by | ||
3919 | * moving them. | ||
3920 | */ | ||
3921 | |||
3922 | pwr_now += sds->busiest->cpu_power * | ||
3923 | min(sds->busiest_load_per_task, sds->max_load); | ||
3924 | pwr_now += sds->this->cpu_power * | ||
3925 | min(sds->this_load_per_task, sds->this_load); | ||
3926 | pwr_now /= SCHED_LOAD_SCALE; | ||
3927 | |||
3928 | /* Amount of load we'd subtract */ | ||
3929 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3930 | sds->busiest->cpu_power; | ||
3931 | if (sds->max_load > tmp) | ||
3932 | pwr_move += sds->busiest->cpu_power * | ||
3933 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
3934 | |||
3935 | /* Amount of load we'd add */ | ||
3936 | if (sds->max_load * sds->busiest->cpu_power < | ||
3937 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
3938 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
3939 | sds->this->cpu_power; | ||
3940 | else | ||
3941 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
3942 | sds->this->cpu_power; | ||
3943 | pwr_move += sds->this->cpu_power * | ||
3944 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
3945 | pwr_move /= SCHED_LOAD_SCALE; | ||
3946 | |||
3947 | /* Move if we gain throughput */ | ||
3948 | if (pwr_move > pwr_now) | ||
3949 | *imbalance = sds->busiest_load_per_task; | ||
3950 | } | ||
3951 | |||
3952 | /** | ||
3953 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
3954 | * groups of a given sched_domain during load balance. | ||
3955 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
3956 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
3957 | * @imbalance: The variable to store the imbalance. | ||
3958 | */ | ||
3959 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
3960 | unsigned long *imbalance) | ||
3961 | { | ||
3962 | unsigned long max_pull; | ||
3963 | /* | ||
3964 | * In the presence of smp nice balancing, certain scenarios can have | ||
3965 | * max load less than avg load(as we skip the groups at or below | ||
3966 | * its cpu_power, while calculating max_load..) | ||
3967 | */ | ||
3968 | if (sds->max_load < sds->avg_load) { | ||
3969 | *imbalance = 0; | ||
3970 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
3971 | } | ||
3972 | |||
3973 | /* Don't want to pull so many tasks that a group would go idle */ | ||
3974 | max_pull = min(sds->max_load - sds->avg_load, | ||
3975 | sds->max_load - sds->busiest_load_per_task); | ||
3976 | |||
3977 | /* How much load to actually move to equalise the imbalance */ | ||
3978 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
3979 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
3980 | / SCHED_LOAD_SCALE; | ||
3981 | |||
3982 | /* | ||
3983 | * if *imbalance is less than the average load per runnable task | ||
3984 | * there is no guarantee that any tasks will be moved so we'll have | ||
3985 | * a think about bumping its value to force at least one task to be | ||
3986 | * moved | ||
3987 | */ | ||
3988 | if (*imbalance < sds->busiest_load_per_task) | ||
3989 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
3990 | |||
3991 | } | ||
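The two clamps above bound the move in both directions: max_pull stops the busiest group from being dragged below the domain average (or below one task's worth of load), and the final min() stops the local group from being pushed above the average. A numeric sketch in SCHED_LOAD_SCALE units (the helper is hypothetical):

#include <stdio.h>

#define SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long imbalance(unsigned long max_load, unsigned long avg_load,
                               unsigned long this_load,
                               unsigned long busiest_per_task,
                               unsigned long busiest_power,
                               unsigned long this_power)
{
        /* don't pull the busiest group below the average or below
         * one task's worth of load */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - busiest_per_task);

        /* don't push the local group above the average */
        return min_ul(max_pull * busiest_power,
                      (avg_load - this_load) * this_power) / SCALE;
}

int main(void)
{
        /* busiest at 3072, average 2048, local at 1024, one 1024 task */
        printf("%lu\n", imbalance(3072, 2048, 1024, 1024, SCALE, SCALE));
        return 0;       /* prints 1024: move one task's worth of load */
}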
3992 | /******* find_busiest_group() helpers end here *********************/ | ||
3993 | |||
3994 | /** | ||
3995 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
3996 | * if there is an imbalance. If there isn't an imbalance, and | ||
3997 | * the user has opted for power-savings, it returns a group whose | ||
3998 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
3999 | * such a group exists. | ||
4000 | * | ||
4001 | * Also calculates the amount of weighted load which should be moved | ||
4002 | * to restore balance. | ||
4003 | * | ||
4004 | * @sd: The sched_domain whose busiest group is to be returned. | ||
4005 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4006 | * @imbalance: Variable which stores amount of weighted load which should | ||
4007 | * be moved to restore balance/put a group to idle. | ||
4008 | * @idle: The idle status of this_cpu. | ||
4009 | * @sd_idle: The idleness of sd | ||
4010 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
4011 | * @balance: Pointer to a variable indicating if this_cpu | ||
4012 | * is the appropriate cpu to perform load balancing at this_level. | ||
4013 | * | ||
4014 | * Returns: - the busiest group if imbalance exists. | ||
4015 | * - If no imbalance and user has opted for power-savings balance, | ||
4016 | * return the least loaded group whose CPUs can be | ||
4017 | * put to idle by rebalancing its tasks onto our group. | ||
4018 | */ | ||
4019 | static struct sched_group * | ||
4020 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
4021 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4022 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
4023 | { | ||
4024 | struct sd_lb_stats sds; | ||
4025 | |||
4026 | memset(&sds, 0, sizeof(sds)); | ||
4027 | |||
4028 | /* | ||
4029 | * Compute the various statistics relevant for load balancing at | ||
4030 | * this level. | ||
4031 | */ | ||
4032 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
4033 | balance, &sds); | ||
4034 | |||
4035 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
4036 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
4037 | * at this level. | ||
4038 | * 2) There is no busy sibling group to pull from. | ||
4039 | * 3) This group is the busiest group. | ||
4040 | * 4) This group is busier than the avg busyness at this | ||
4041 | * sched_domain. | ||
4042 | * 5) The imbalance is within the specified limit. | ||
4043 | * 6) Any rebalance would lead to ping-pong | ||
4044 | */ | ||
4045 | if (balance && !(*balance)) | ||
4046 | goto ret; | ||
4047 | |||
4048 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
4049 | goto out_balanced; | ||
4050 | |||
4051 | if (sds.this_load >= sds.max_load) | ||
4052 | goto out_balanced; | ||
4053 | |||
4054 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
4055 | |||
4056 | if (sds.this_load >= sds.avg_load) | ||
4057 | goto out_balanced; | ||
4058 | |||
4059 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
4060 | goto out_balanced; | ||
4061 | |||
4062 | sds.busiest_load_per_task /= sds.busiest_nr_running; | ||
4063 | if (sds.group_imb) | ||
4064 | sds.busiest_load_per_task = | ||
4065 | min(sds.busiest_load_per_task, sds.avg_load); | ||
4066 | |||
4067 | /* | ||
4068 | * We're trying to get all the cpus to the average_load, so we don't | ||
4069 | * want to push ourselves above the average load, nor do we wish to | ||
4070 | * reduce the max loaded cpu below the average load, as either of these | ||
4071 | * actions would just result in more rebalancing later, and ping-pong | ||
4072 | * tasks around. Thus we look for the minimum possible imbalance. | ||
4073 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
4074 | * be counted as no imbalance for these purposes -- we can't fix that | ||
4075 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
4076 | * appear as very large values with unsigned longs. | ||
4077 | */ | ||
4078 | if (sds.max_load <= sds.busiest_load_per_task) | ||
4079 | goto out_balanced; | ||
4080 | |||
4081 | /* Looks like there is an imbalance. Compute it */ | ||
4082 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
4083 | return sds.busiest; | ||
4084 | |||
4085 | out_balanced: | ||
4086 | /* | ||
4087 | * There is no obvious imbalance. But check if we can do some balancing | ||
4088 | * to save power. | ||
4089 | */ | ||
4090 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4091 | return sds.busiest; | ||
4092 | ret: | ||
4093 | *imbalance = 0; | ||
4094 | return NULL; | ||
4095 | } | ||
4096 | |||
4097 | /* | ||
4098 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
4099 | */ | ||
4100 | static struct rq * | ||
4101 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
4102 | unsigned long imbalance, const struct cpumask *cpus) | ||
4103 | { | ||
4104 | struct rq *busiest = NULL, *rq; | ||
4105 | unsigned long max_load = 0; | ||
4106 | int i; | ||
4107 | |||
4108 | for_each_cpu(i, sched_group_cpus(group)) { | ||
4109 | unsigned long power = power_of(i); | ||
4110 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
4111 | unsigned long wl; | ||
4112 | |||
4113 | if (!cpumask_test_cpu(i, cpus)) | ||
4114 | continue; | ||
4115 | |||
4116 | rq = cpu_rq(i); | ||
4117 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; | ||
4118 | wl /= power; | ||
4119 | |||
4120 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
4121 | continue; | ||
4122 | |||
4123 | if (wl > max_load) { | ||
4124 | max_load = wl; | ||
4125 | busiest = rq; | ||
4126 | } | ||
4127 | } | ||
4128 | |||
4129 | return busiest; | ||
4130 | } | ||
4131 | |||
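find_busiest_queue() compares normalized rather than raw load: wl = weighted_cpuload * SCHED_LOAD_SCALE / power, so a lightly loaded fast CPU and a saturated slow CPU are ranked fairly. A stand-alone sketch of the same ranking, with plain arrays standing in for per-cpu data:

/* Rank cpus by load normalized to per-cpu power; returns the index of
 * the busiest cpu, or -1 if every normalized load is zero. */
static int busiest_cpu(const unsigned long *load, const unsigned long *power,
                       int ncpus)
{
        unsigned long max_wl = 0;
        int busiest = -1;

        for (int i = 0; i < ncpus; i++) {
                unsigned long wl = load[i] * 1024 / power[i];

                if (wl > max_wl) {
                        max_wl = wl;
                        busiest = i;
                }
        }
        return busiest;
}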
4132 | /* | ||
4133 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | ||
4134 | * so long as it is large enough. | ||
4135 | */ | ||
4136 | #define MAX_PINNED_INTERVAL 512 | ||
4137 | |||
4138 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
4139 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
4140 | |||
4141 | /* | ||
4142 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4143 | * tasks if there is an imbalance. | ||
4144 | */ | ||
4145 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
4146 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
4147 | int *balance) | ||
4148 | { | ||
4149 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
4150 | struct sched_group *group; | ||
4151 | unsigned long imbalance; | ||
4152 | struct rq *busiest; | ||
4153 | unsigned long flags; | ||
4154 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4155 | |||
4156 | cpumask_copy(cpus, cpu_active_mask); | ||
4157 | |||
4158 | /* | ||
4159 | * When power savings policy is enabled for the parent domain, idle | ||
4160 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4161 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
4162 | * portraying it as CPU_NOT_IDLE. | ||
4163 | */ | ||
4164 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
4165 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4166 | sd_idle = 1; | ||
4167 | |||
4168 | schedstat_inc(sd, lb_count[idle]); | ||
4169 | |||
4170 | redo: | ||
4171 | update_shares(sd); | ||
4172 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
4173 | cpus, balance); | ||
4174 | |||
4175 | if (*balance == 0) | ||
4176 | goto out_balanced; | ||
4177 | |||
4178 | if (!group) { | ||
4179 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
4180 | goto out_balanced; | ||
4181 | } | ||
4182 | |||
4183 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
4184 | if (!busiest) { | ||
4185 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
4186 | goto out_balanced; | ||
4187 | } | ||
4188 | |||
4189 | BUG_ON(busiest == this_rq); | ||
4190 | |||
4191 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
4192 | |||
4193 | ld_moved = 0; | ||
4194 | if (busiest->nr_running > 1) { | ||
4195 | /* | ||
4196 | * Attempt to move tasks. If find_busiest_group has found | ||
4197 | * an imbalance but busiest->nr_running <= 1, the group is | ||
4198 | * still unbalanced. ld_moved simply stays zero, so it is | ||
4199 | * correctly treated as an imbalance. | ||
4200 | */ | ||
4201 | local_irq_save(flags); | ||
4202 | double_rq_lock(this_rq, busiest); | ||
4203 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4204 | imbalance, sd, idle, &all_pinned); | ||
4205 | double_rq_unlock(this_rq, busiest); | ||
4206 | local_irq_restore(flags); | ||
4207 | |||
4208 | /* | ||
4209 | * some other cpu did the load balance for us. | ||
4210 | */ | ||
4211 | if (ld_moved && this_cpu != smp_processor_id()) | ||
4212 | resched_cpu(this_cpu); | ||
4213 | |||
4214 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
4215 | if (unlikely(all_pinned)) { | ||
4216 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
4217 | if (!cpumask_empty(cpus)) | ||
4218 | goto redo; | ||
4219 | goto out_balanced; | ||
4220 | } | ||
4221 | } | ||
4222 | |||
4223 | if (!ld_moved) { | ||
4224 | schedstat_inc(sd, lb_failed[idle]); | ||
4225 | sd->nr_balance_failed++; | ||
4226 | |||
4227 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | ||
4228 | |||
4229 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
4230 | |||
4231 | /* don't kick the migration_thread if the curr | ||
4232 | * task on busiest cpu can't be moved to this_cpu | ||
4233 | */ | ||
4234 | if (!cpumask_test_cpu(this_cpu, | ||
4235 | &busiest->curr->cpus_allowed)) { | ||
4236 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
4237 | flags); | ||
4238 | all_pinned = 1; | ||
4239 | goto out_one_pinned; | ||
4240 | } | ||
4241 | |||
4242 | if (!busiest->active_balance) { | ||
4243 | busiest->active_balance = 1; | ||
4244 | busiest->push_cpu = this_cpu; | ||
4245 | active_balance = 1; | ||
4246 | } | ||
4247 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
4248 | if (active_balance) | ||
4249 | wake_up_process(busiest->migration_thread); | ||
4250 | |||
4251 | /* | ||
4252 | * We've kicked active balancing, reset the failure | ||
4253 | * counter. | ||
4254 | */ | ||
4255 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
4256 | } | ||
4257 | } else | ||
4258 | sd->nr_balance_failed = 0; | ||
4259 | |||
4260 | if (likely(!active_balance)) { | ||
4261 | /* We were unbalanced, so reset the balancing interval */ | ||
4262 | sd->balance_interval = sd->min_interval; | ||
4263 | } else { | ||
4264 | /* | ||
4265 | * If we've begun active balancing, start to back off. This | ||
4266 | * case may not be covered by the all_pinned logic if there | ||
4267 | * is only 1 task on the busy runqueue (because we don't call | ||
4268 | * move_tasks). | ||
4269 | */ | ||
4270 | if (sd->balance_interval < sd->max_interval) | ||
4271 | sd->balance_interval *= 2; | ||
4272 | } | ||
4273 | |||
4274 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4275 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4276 | ld_moved = -1; | ||
4277 | |||
4278 | goto out; | ||
4279 | |||
4280 | out_balanced: | ||
4281 | schedstat_inc(sd, lb_balanced[idle]); | ||
4282 | |||
4283 | sd->nr_balance_failed = 0; | ||
4284 | |||
4285 | out_one_pinned: | ||
4286 | /* tune up the balancing interval */ | ||
4287 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
4288 | (sd->balance_interval < sd->max_interval)) | ||
4289 | sd->balance_interval *= 2; | ||
4290 | |||
4291 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4292 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4293 | ld_moved = -1; | ||
4294 | else | ||
4295 | ld_moved = 0; | ||
4296 | out: | ||
4297 | if (ld_moved) | ||
4298 | update_shares(sd); | ||
4299 | return ld_moved; | ||
4300 | } | ||
4301 | |||
4302 | /* | ||
4303 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
4304 | * tasks if there is an imbalance. | ||
4305 | * | ||
4306 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | ||
4307 | * this_rq is locked. | ||
4308 | */ | ||
4309 | static int | ||
4310 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | ||
4311 | { | ||
4312 | struct sched_group *group; | ||
4313 | struct rq *busiest = NULL; | ||
4314 | unsigned long imbalance; | ||
4315 | int ld_moved = 0; | ||
4316 | int sd_idle = 0; | ||
4317 | int all_pinned = 0; | ||
4318 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
4319 | |||
4320 | cpumask_copy(cpus, cpu_active_mask); | ||
4321 | |||
4322 | /* | ||
4323 | * When power savings policy is enabled for the parent domain, idle | ||
4324 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
4325 | * let the state of idle sibling percolate up as IDLE, instead of | ||
4326 | * portraying it as CPU_NOT_IDLE. | ||
4327 | */ | ||
4328 | if (sd->flags & SD_SHARE_CPUPOWER && | ||
4329 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4330 | sd_idle = 1; | ||
4331 | |||
4332 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | ||
4333 | redo: | ||
4334 | update_shares_locked(this_rq, sd); | ||
4335 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | ||
4336 | &sd_idle, cpus, NULL); | ||
4337 | if (!group) { | ||
4338 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | ||
4339 | goto out_balanced; | ||
4340 | } | ||
4341 | |||
4342 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); | ||
4343 | if (!busiest) { | ||
4344 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | ||
4345 | goto out_balanced; | ||
4346 | } | ||
4347 | |||
4348 | BUG_ON(busiest == this_rq); | ||
4349 | |||
4350 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | ||
4351 | |||
4352 | ld_moved = 0; | ||
4353 | if (busiest->nr_running > 1) { | ||
4354 | /* Attempt to move tasks */ | ||
4355 | double_lock_balance(this_rq, busiest); | ||
4356 | /* this_rq->clock is already updated */ | ||
4357 | update_rq_clock(busiest); | ||
4358 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
4359 | imbalance, sd, CPU_NEWLY_IDLE, | ||
4360 | &all_pinned); | ||
4361 | double_unlock_balance(this_rq, busiest); | ||
4362 | |||
4363 | if (unlikely(all_pinned)) { | ||
4364 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
4365 | if (!cpumask_empty(cpus)) | ||
4366 | goto redo; | ||
4367 | } | ||
4368 | } | ||
4369 | |||
4370 | if (!ld_moved) { | ||
4371 | int active_balance = 0; | ||
4372 | |||
4373 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | ||
4374 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4375 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4376 | return -1; | ||
4377 | |||
4378 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4379 | return -1; | ||
4380 | |||
4381 | if (sd->nr_balance_failed++ < 2) | ||
4382 | return -1; | ||
4383 | |||
4384 | /* | ||
4385 | * The only task running in a non-idle cpu can be moved to this | ||
4386 | * cpu in an attempt to completely free up the other CPU | ||
4387 | * package. The same method used to move a task in load_balance() | ||
4388 | * has been extended for load_balance_newidle() to speed up | ||
4389 | * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) | ||
4390 | * | ||
4391 | * The package power saving logic comes from | ||
4392 | * find_busiest_group(). If there is no imbalance, then | ||
4393 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4394 | * f_b_g() will select a group from which a running task may be | ||
4395 | * pulled to this cpu in order to make the other package idle. | ||
4396 | * If there is no opportunity to make a package idle and if | ||
4397 | * there is no imbalance, then f_b_g() will return NULL and no | ||
4398 | * action will be taken in load_balance_newidle(). | ||
4399 | * | ||
4400 | * Under normal task pull operation due to imbalance, there | ||
4401 | * will be more than one task in the source run queue and | ||
4402 | * move_tasks() will succeed. ld_moved will be true and this | ||
4403 | * active balance code will not be triggered. | ||
4404 | */ | ||
4405 | |||
4406 | /* Lock busiest in correct order while this_rq is held */ | ||
4407 | double_lock_balance(this_rq, busiest); | ||
4408 | |||
4409 | /* | ||
4410 | * don't kick the migration_thread if the curr | ||
4411 | * task on busiest cpu can't be moved to this_cpu | ||
4412 | */ | ||
4413 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | ||
4414 | double_unlock_balance(this_rq, busiest); | ||
4415 | all_pinned = 1; | ||
4416 | return ld_moved; | ||
4417 | } | ||
4418 | |||
4419 | if (!busiest->active_balance) { | ||
4420 | busiest->active_balance = 1; | ||
4421 | busiest->push_cpu = this_cpu; | ||
4422 | active_balance = 1; | ||
4423 | } | ||
4424 | |||
4425 | double_unlock_balance(this_rq, busiest); | ||
4426 | /* | ||
4427 | * Should not call ttwu while holding a rq->lock | ||
4428 | */ | ||
4429 | raw_spin_unlock(&this_rq->lock); | ||
4430 | if (active_balance) | ||
4431 | wake_up_process(busiest->migration_thread); | ||
4432 | raw_spin_lock(&this_rq->lock); | ||
4433 | |||
4434 | } else | ||
4435 | sd->nr_balance_failed = 0; | ||
4436 | |||
4437 | update_shares_locked(this_rq, sd); | ||
4438 | return ld_moved; | ||
4439 | |||
4440 | out_balanced: | ||
4441 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | ||
4442 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
4443 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
4444 | return -1; | ||
4445 | sd->nr_balance_failed = 0; | ||
4446 | |||
4447 | return 0; | ||
4448 | } | ||
4449 | |||
4450 | /* | ||
4451 | * idle_balance is called by schedule() if this_cpu is about to become | ||
4452 | * idle. Attempts to pull tasks from other CPUs. | ||
4453 | */ | ||
4454 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
4455 | { | ||
4456 | struct sched_domain *sd; | ||
4457 | int pulled_task = 0; | ||
4458 | unsigned long next_balance = jiffies + HZ; | ||
4459 | |||
4460 | this_rq->idle_stamp = this_rq->clock; | ||
4461 | |||
4462 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
4463 | return; | ||
4464 | |||
4465 | for_each_domain(this_cpu, sd) { | ||
4466 | unsigned long interval; | ||
4467 | |||
4468 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4469 | continue; | ||
4470 | |||
4471 | if (sd->flags & SD_BALANCE_NEWIDLE) | ||
4472 | /* If we've pulled tasks over stop searching: */ | ||
4473 | pulled_task = load_balance_newidle(this_cpu, this_rq, | ||
4474 | sd); | ||
4475 | |||
4476 | interval = msecs_to_jiffies(sd->balance_interval); | ||
4477 | if (time_after(next_balance, sd->last_balance + interval)) | ||
4478 | next_balance = sd->last_balance + interval; | ||
4479 | if (pulled_task) { | ||
4480 | this_rq->idle_stamp = 0; | ||
4481 | break; | ||
4482 | } | ||
4483 | } | ||
4484 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
4485 | /* | ||
4486 | * We are going idle. next_balance may be set based on | ||
4487 | * a busy processor. So reset next_balance. | ||
4488 | */ | ||
4489 | this_rq->next_balance = next_balance; | ||
4490 | } | ||
4491 | } | ||
4492 | |||
4493 | /* | ||
4494 | * active_load_balance is run by migration threads. It pushes running tasks | ||
4495 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
4496 | * running on each physical CPU where possible, and avoids physical / | ||
4497 | * logical imbalances. | ||
4498 | * | ||
4499 | * Called with busiest_rq locked. | ||
4500 | */ | ||
4501 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
4502 | { | ||
4503 | int target_cpu = busiest_rq->push_cpu; | ||
4504 | struct sched_domain *sd; | ||
4505 | struct rq *target_rq; | ||
4506 | |||
4507 | /* Is there any task to move? */ | ||
4508 | if (busiest_rq->nr_running <= 1) | ||
4509 | return; | ||
4510 | |||
4511 | target_rq = cpu_rq(target_cpu); | ||
4512 | |||
4513 | /* | ||
4514 | * This condition is "impossible", if it occurs | ||
4515 | * we need to fix it. Originally reported by | ||
4516 | * Bjorn Helgaas on a 128-cpu setup. | ||
4517 | */ | ||
4518 | BUG_ON(busiest_rq == target_rq); | ||
4519 | |||
4520 | /* move a task from busiest_rq to target_rq */ | ||
4521 | double_lock_balance(busiest_rq, target_rq); | ||
4522 | update_rq_clock(busiest_rq); | ||
4523 | update_rq_clock(target_rq); | ||
4524 | |||
4525 | /* Search for an sd spanning us and the target CPU. */ | ||
4526 | for_each_domain(target_cpu, sd) { | ||
4527 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
4528 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
4529 | break; | ||
4530 | } | ||
4531 | |||
4532 | if (likely(sd)) { | ||
4533 | schedstat_inc(sd, alb_count); | ||
4534 | |||
4535 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
4536 | sd, CPU_IDLE)) | ||
4537 | schedstat_inc(sd, alb_pushed); | ||
4538 | else | ||
4539 | schedstat_inc(sd, alb_failed); | ||
4540 | } | ||
4541 | double_unlock_balance(busiest_rq, target_rq); | ||
4542 | } | ||
4543 | |||
4544 | #ifdef CONFIG_NO_HZ | ||
4545 | static struct { | ||
4546 | atomic_t load_balancer; | ||
4547 | cpumask_var_t cpu_mask; | ||
4548 | cpumask_var_t ilb_grp_nohz_mask; | ||
4549 | } nohz ____cacheline_aligned = { | ||
4550 | .load_balancer = ATOMIC_INIT(-1), | ||
4551 | }; | ||
4552 | |||
4553 | int get_nohz_load_balancer(void) | ||
4554 | { | ||
4555 | return atomic_read(&nohz.load_balancer); | ||
4556 | } | ||
4557 | |||
4558 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
4559 | /** | ||
4560 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4561 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4562 | * be returned. | ||
4563 | * @flag: The flag to check for the lowest sched_domain | ||
4564 | * for the given cpu. | ||
4565 | * | ||
4566 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4567 | */ | ||
4568 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4569 | { | ||
4570 | struct sched_domain *sd; | ||
4571 | |||
4572 | for_each_domain(cpu, sd) | ||
4573 | if (sd && (sd->flags & flag)) | ||
4574 | break; | ||
4575 | |||
4576 | return sd; | ||
4577 | } | ||
4578 | |||
4579 | /** | ||
4580 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4581 | * @cpu: The cpu whose domains we're iterating over. | ||
4582 | * @sd: variable holding the value of the power_savings_sd | ||
4583 | * for cpu. | ||
4584 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4585 | * | ||
4586 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4587 | * set, starting from the lowest sched_domain to the highest. | ||
4588 | */ | ||
4589 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4590 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4591 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4592 | |||
4593 | /** | ||
4594 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4595 | * @ilb_group: group to be checked for semi-idleness | ||
4596 | * | ||
4597 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4598 | * | ||
4599 | * We define a sched_group to be semi-idle if it has at least one idle CPU | ||
4600 | * and at least one non-idle CPU. This helper function checks if the given | ||
4601 | * sched_group is semi-idle or not. | ||
4602 | */ | ||
4603 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4604 | { | ||
4605 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
4606 | sched_group_cpus(ilb_group)); | ||
4607 | |||
4608 | /* | ||
4609 | * A sched_group is semi-idle when it has at least one busy cpu | ||
4610 | * and at least one idle cpu. | ||
4611 | */ | ||
4612 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
4613 | return 0; | ||
4614 | |||
4615 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
4616 | return 0; | ||
4617 | |||
4618 | return 1; | ||
4619 | } | ||
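With cpumasks modeled as plain bitmasks, the semi-idle test is just two comparisons on the intersection with the nohz (idle) mask: it must be neither empty nor the whole group. A compact sketch using uint64_t masks instead of cpumask_var_t:

#include <stdbool.h>
#include <stdint.h>

/* Semi-idle: the group contains at least one idle cpu and at least one
 * busy cpu, i.e. (group & idle) is neither 0 nor equal to group. */
static bool is_semi_idle(uint64_t group_mask, uint64_t idle_mask)
{
        uint64_t idle_in_group = group_mask & idle_mask;

        return idle_in_group != 0 && idle_in_group != group_mask;
}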
4620 | /** | ||
4621 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4622 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4623 | * | ||
4624 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4625 | * Else, returns >= nr_cpu_ids. | ||
4626 | * | ||
4627 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4628 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4629 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4630 | * when there are other idle cpus which are better suited for that job. | ||
4631 | */ | ||
4632 | static int find_new_ilb(int cpu) | ||
4633 | { | ||
4634 | struct sched_domain *sd; | ||
4635 | struct sched_group *ilb_group; | ||
4636 | |||
4637 | /* | ||
4638 | * Have idle load balancer selection from semi-idle packages only | ||
4639 | * when power-aware load balancing is enabled | ||
4640 | */ | ||
4641 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4642 | goto out_done; | ||
4643 | |||
4644 | /* | ||
4645 | * Optimize for the case when we have no idle CPUs or only one | ||
4646 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4647 | */ | ||
4648 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
4649 | goto out_done; | ||
4650 | |||
4651 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4652 | ilb_group = sd->groups; | ||
4653 | |||
4654 | do { | ||
4655 | if (is_semi_idle_group(ilb_group)) | ||
4656 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
4657 | |||
4658 | ilb_group = ilb_group->next; | ||
4659 | |||
4660 | } while (ilb_group != sd->groups); | ||
4661 | } | ||
4662 | |||
4663 | out_done: | ||
4664 | return cpumask_first(nohz.cpu_mask); | ||
4665 | } | ||
4666 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4667 | static inline int find_new_ilb(int call_cpu) | ||
4668 | { | ||
4669 | return cpumask_first(nohz.cpu_mask); | ||
4670 | } | ||
4671 | #endif | ||
4672 | |||
4673 | /* | ||
4674 | * This routine tries to nominate an ilb (idle load balancing) | ||
4675 | * owner among the cpus whose ticks are stopped. The ilb owner does the idle | ||
4676 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
4677 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
4678 | * no need for one) and all the cpus will sleep until the next wakeup event | ||
4679 | * arrives. | ||
4680 | * | ||
4681 | * For the ilb owner, the tick is not stopped, and this tick is used | ||
4682 | * for idle load balancing. The ilb owner stays part of | ||
4683 | * nohz.cpu_mask. | ||
4684 | * | ||
4685 | * While stopping the tick, this cpu becomes the ilb owner if there | ||
4686 | * is no other owner, and it remains the owner until that cpu becomes busy | ||
4687 | * or until all cpus in the system stop their ticks, at which point | ||
4688 | * there is no need for an ilb owner. | ||
4689 | * | ||
4690 | * When the ilb owner becomes busy, it nominates another owner during the | ||
4691 | * next busy scheduler_tick(). | ||
4692 | */ | ||
4693 | int select_nohz_load_balancer(int stop_tick) | ||
4694 | { | ||
4695 | int cpu = smp_processor_id(); | ||
4696 | |||
4697 | if (stop_tick) { | ||
4698 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
4699 | |||
4700 | if (!cpu_active(cpu)) { | ||
4701 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
4702 | return 0; | ||
4703 | |||
4704 | /* | ||
4705 | * If we are going offline and still the leader, | ||
4706 | * give up! | ||
4707 | */ | ||
4708 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4709 | BUG(); | ||
4710 | |||
4711 | return 0; | ||
4712 | } | ||
4713 | |||
4714 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
4715 | |||
4716 | /* time for ilb owner also to sleep */ | ||
4717 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
4718 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4719 | atomic_set(&nohz.load_balancer, -1); | ||
4720 | return 0; | ||
4721 | } | ||
4722 | |||
4723 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4724 | /* make me the ilb owner */ | ||
4725 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
4726 | return 1; | ||
4727 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4728 | int new_ilb; | ||
4729 | |||
4730 | if (!(sched_smt_power_savings || | ||
4731 | sched_mc_power_savings)) | ||
4732 | return 1; | ||
4733 | /* | ||
4734 | * Check to see if there is a more power-efficient | ||
4735 | * ilb. | ||
4736 | */ | ||
4737 | new_ilb = find_new_ilb(cpu); | ||
4738 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4739 | atomic_set(&nohz.load_balancer, -1); | ||
4740 | resched_cpu(new_ilb); | ||
4741 | return 0; | ||
4742 | } | ||
4743 | return 1; | ||
4744 | } | ||
4745 | } else { | ||
4746 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4747 | return 0; | ||
4748 | |||
4749 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4750 | |||
4751 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
4752 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
4753 | BUG(); | ||
4754 | } | ||
4755 | return 0; | ||
4756 | } | ||
4757 | #endif | ||
4758 | |||
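The ownership handoff above leans on atomic_cmpxchg() so that only one cpu can win the nomination. A minimal standalone model of that claim step (C11 atomics rather than the kernel's atomic_t; the cpu ids are illustrative):

#include <stdatomic.h>
#include <stdio.h>

/* -1 means "no ilb owner"; a cpu becomes owner only if it wins the
 * compare-exchange against the free slot. */
static atomic_int owner = ATOMIC_VAR_INIT(-1);

static int try_claim(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&owner, &expected, cpu);
}

int main(void)
{
	printf("cpu 2 claims: %d\n", try_claim(2));	/* 1: slot was free */
	printf("cpu 5 claims: %d\n", try_claim(5));	/* 0: cpu 2 already owns it */
	return 0;
}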
4759 | static DEFINE_SPINLOCK(balancing); | ||
4760 | |||
4761 | /* | ||
4762 | * It checks each scheduling domain to see if it is due to be balanced, | ||
4763 | * and initiates a balancing operation if so. | ||
4764 | * | ||
4765 | * Balancing parameters are set up in arch_init_sched_domains. | ||
4766 | */ | ||
4767 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
4768 | { | ||
4769 | int balance = 1; | ||
4770 | struct rq *rq = cpu_rq(cpu); | ||
4771 | unsigned long interval; | ||
4772 | struct sched_domain *sd; | ||
4773 | /* Earliest time when we have to do rebalance again */ | ||
4774 | unsigned long next_balance = jiffies + 60*HZ; | ||
4775 | int update_next_balance = 0; | ||
4776 | int need_serialize; | ||
4777 | |||
4778 | for_each_domain(cpu, sd) { | ||
4779 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
4780 | continue; | ||
4781 | |||
4782 | interval = sd->balance_interval; | ||
4783 | if (idle != CPU_IDLE) | ||
4784 | interval *= sd->busy_factor; | ||
4785 | |||
4786 | /* scale ms to jiffies */ | ||
4787 | interval = msecs_to_jiffies(interval); | ||
4788 | if (unlikely(!interval)) | ||
4789 | interval = 1; | ||
4790 | if (interval > HZ*NR_CPUS/10) | ||
4791 | interval = HZ*NR_CPUS/10; | ||
4792 | |||
4793 | need_serialize = sd->flags & SD_SERIALIZE; | ||
4794 | |||
4795 | if (need_serialize) { | ||
4796 | if (!spin_trylock(&balancing)) | ||
4797 | goto out; | ||
4798 | } | ||
4799 | |||
4800 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
4801 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
4802 | /* | ||
4803 | * We've pulled tasks over so either we're no | ||
4804 | * longer idle, or one of our SMT siblings is | ||
4805 | * not idle. | ||
4806 | */ | ||
4807 | idle = CPU_NOT_IDLE; | ||
4808 | } | ||
4809 | sd->last_balance = jiffies; | ||
4810 | } | ||
4811 | if (need_serialize) | ||
4812 | spin_unlock(&balancing); | ||
4813 | out: | ||
4814 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
4815 | next_balance = sd->last_balance + interval; | ||
4816 | update_next_balance = 1; | ||
4817 | } | ||
4818 | |||
4819 | /* | ||
4820 | * Stop the load balance at this level. There is another | ||
4821 | * CPU in our sched group which is doing load balancing more | ||
4822 | * actively. | ||
4823 | */ | ||
4824 | if (!balance) | ||
4825 | break; | ||
4826 | } | ||
4827 | |||
4828 | /* | ||
4829 | * next_balance will be updated only when there is a need. | ||
4830 | * When the cpu is attached to the null domain, for example, it will | ||
4831 | * not be updated. | ||
4832 | */ | ||
4833 | if (likely(update_next_balance)) | ||
4834 | rq->next_balance = next_balance; | ||
4835 | } | ||
4836 | |||
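To make the interval handling in rebalance_domains() concrete, here is a worked standalone model (plain C; the HZ, NR_CPUS and input values are illustrative): a busy cpu stretches the per-domain interval by busy_factor, the value is converted from milliseconds to jiffies, and the result is clamped to [1, HZ*NR_CPUS/10].

#include <stdio.h>

#define HZ 250
#define NR_CPUS 8

static unsigned long balance_interval(unsigned long interval_ms,
				      unsigned int busy_factor, int idle)
{
	unsigned long interval = interval_ms;

	if (!idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = interval * HZ / 1000;	/* crude msecs_to_jiffies() */
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;
	return interval;
}

int main(void)
{
	printf("%lu\n", balance_interval(64, 64, 0));	/* busy: 4096ms, clamped to 200 */
	printf("%lu\n", balance_interval(8, 64, 1));	/* idle: 8ms -> 2 jiffies */
	return 0;
}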
4837 | /* | ||
4838 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
4839 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
4840 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
4841 | */ | ||
4842 | static void run_rebalance_domains(struct softirq_action *h) | ||
4843 | { | ||
4844 | int this_cpu = smp_processor_id(); | ||
4845 | struct rq *this_rq = cpu_rq(this_cpu); | ||
4846 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
4847 | CPU_IDLE : CPU_NOT_IDLE; | ||
4848 | |||
4849 | rebalance_domains(this_cpu, idle); | ||
4850 | |||
4851 | #ifdef CONFIG_NO_HZ | ||
4852 | /* | ||
4853 | * If this cpu is the owner for idle load balancing, then do the | ||
4854 | * balancing on behalf of the other idle cpus whose ticks are | ||
4855 | * stopped. | ||
4856 | */ | ||
4857 | if (this_rq->idle_at_tick && | ||
4858 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
4859 | struct rq *rq; | ||
4860 | int balance_cpu; | ||
4861 | |||
4862 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
4863 | if (balance_cpu == this_cpu) | ||
4864 | continue; | ||
4865 | |||
4866 | /* | ||
4867 | * If this cpu gets work to do, stop the load balancing | ||
4868 | * work being done for other cpus. The next load | ||
4869 | * balancing owner will pick it up. | ||
4870 | */ | ||
4871 | if (need_resched()) | ||
4872 | break; | ||
4873 | |||
4874 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
4875 | |||
4876 | rq = cpu_rq(balance_cpu); | ||
4877 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
4878 | this_rq->next_balance = rq->next_balance; | ||
4879 | } | ||
4880 | } | ||
4881 | #endif | ||
4882 | } | ||
4883 | |||
4884 | static inline int on_null_domain(int cpu) | ||
4885 | { | ||
4886 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
4887 | } | ||
4888 | |||
4889 | /* | ||
4890 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
4891 | * | ||
4892 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
4893 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
4894 | * if the whole system is idle. | ||
4895 | */ | ||
4896 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
4897 | { | ||
4898 | #ifdef CONFIG_NO_HZ | ||
4899 | /* | ||
4900 | * If we were in the nohz mode recently and busy at the current | ||
4901 | * scheduler tick, then check if we need to nominate a new idle | ||
4902 | * load balancer. | ||
4903 | */ | ||
4904 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
4905 | rq->in_nohz_recently = 0; | ||
4906 | |||
4907 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
4908 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
4909 | atomic_set(&nohz.load_balancer, -1); | ||
4910 | } | ||
4911 | |||
4912 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
4913 | int ilb = find_new_ilb(cpu); | ||
4914 | |||
4915 | if (ilb < nr_cpu_ids) | ||
4916 | resched_cpu(ilb); | ||
4917 | } | ||
4918 | } | ||
4919 | |||
4920 | /* | ||
4921 | * If this cpu is idle and doing idle load balancing for all the | ||
4922 | * cpus with ticks stopped, is it time for that to stop? | ||
4923 | */ | ||
4924 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
4925 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
4926 | resched_cpu(cpu); | ||
4927 | return; | ||
4928 | } | ||
4929 | |||
4930 | /* | ||
4931 | * If this cpu is idle and the idle load balancing is done by | ||
4932 | * someone else, then there is no need to raise the SCHED_SOFTIRQ. | ||
4933 | */ | ||
4934 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
4935 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
4936 | return; | ||
4937 | #endif | ||
4938 | /* Don't need to rebalance while attached to NULL domain */ | ||
4939 | if (time_after_eq(jiffies, rq->next_balance) && | ||
4940 | likely(!on_null_domain(cpu))) | ||
4941 | raise_softirq(SCHED_SOFTIRQ); | ||
4942 | } | ||
4943 | |||
4944 | #else /* CONFIG_SMP */ | ||
4945 | |||
4946 | /* | ||
4947 | * On UP we do not need to balance between CPUs: | ||
4948 | */ | ||
4949 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4950 | { | ||
4951 | } | ||
4952 | |||
4953 | #endif | 3175 | #endif |
4954 | 3176 | ||
4955 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3177 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -5568,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5568 | * the mutex owner just released it and exited. | 3790 | * the mutex owner just released it and exited. |
5569 | */ | 3791 | */ |
5570 | if (probe_kernel_address(&owner->cpu, cpu)) | 3792 | if (probe_kernel_address(&owner->cpu, cpu)) |
5571 | goto out; | 3793 | return 0; |
5572 | #else | 3794 | #else |
5573 | cpu = owner->cpu; | 3795 | cpu = owner->cpu; |
5574 | #endif | 3796 | #endif |
@@ -5578,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5578 | * the cpu field may no longer be valid. | 3800 | * the cpu field may no longer be valid. |
5579 | */ | 3801 | */ |
5580 | if (cpu >= nr_cpumask_bits) | 3802 | if (cpu >= nr_cpumask_bits) |
5581 | goto out; | 3803 | return 0; |
5582 | 3804 | ||
5583 | /* | 3805 | /* |
5584 | * We need to validate that we can do a | 3806 | * We need to validate that we can do a |
5585 | * get_cpu() and that we have the percpu area. | 3807 | * get_cpu() and that we have the percpu area. |
5586 | */ | 3808 | */ |
5587 | if (!cpu_online(cpu)) | 3809 | if (!cpu_online(cpu)) |
5588 | goto out; | 3810 | return 0; |
5589 | 3811 | ||
5590 | rq = cpu_rq(cpu); | 3812 | rq = cpu_rq(cpu); |
5591 | 3813 | ||
@@ -5604,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
5604 | 3826 | ||
5605 | cpu_relax(); | 3827 | cpu_relax(); |
5606 | } | 3828 | } |
5607 | out: | 3829 | |
5608 | return 1; | 3830 | return 1; |
5609 | } | 3831 | } |
5610 | #endif | 3832 | #endif |
@@ -6049,7 +4271,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6049 | unsigned long flags; | 4271 | unsigned long flags; |
6050 | int oldprio, on_rq, running; | 4272 | int oldprio, on_rq, running; |
6051 | struct rq *rq; | 4273 | struct rq *rq; |
6052 | const struct sched_class *prev_class = p->sched_class; | 4274 | const struct sched_class *prev_class; |
6053 | 4275 | ||
6054 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4276 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
6055 | 4277 | ||
@@ -6057,6 +4279,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6057 | update_rq_clock(rq); | 4279 | update_rq_clock(rq); |
6058 | 4280 | ||
6059 | oldprio = p->prio; | 4281 | oldprio = p->prio; |
4282 | prev_class = p->sched_class; | ||
6060 | on_rq = p->se.on_rq; | 4283 | on_rq = p->se.on_rq; |
6061 | running = task_current(rq, p); | 4284 | running = task_current(rq, p); |
6062 | if (on_rq) | 4285 | if (on_rq) |
@@ -6074,7 +4297,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6074 | if (running) | 4297 | if (running) |
6075 | p->sched_class->set_curr_task(rq); | 4298 | p->sched_class->set_curr_task(rq); |
6076 | if (on_rq) { | 4299 | if (on_rq) { |
6077 | enqueue_task(rq, p, 0); | 4300 | enqueue_task(rq, p, 0, oldprio < prio); |
6078 | 4301 | ||
6079 | check_class_changed(rq, p, prev_class, oldprio, running); | 4302 | check_class_changed(rq, p, prev_class, oldprio, running); |
6080 | } | 4303 | } |
@@ -6118,7 +4341,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
6118 | delta = p->prio - old_prio; | 4341 | delta = p->prio - old_prio; |
6119 | 4342 | ||
6120 | if (on_rq) { | 4343 | if (on_rq) { |
6121 | enqueue_task(rq, p, 0); | 4344 | enqueue_task(rq, p, 0, false); |
6122 | /* | 4345 | /* |
6123 | * If the task increased its priority or is running and | 4346 | * If the task increased its priority or is running and |
6124 | * lowered its priority, then reschedule its CPU: | 4347 | * lowered its priority, then reschedule its CPU: |
@@ -6141,7 +4364,7 @@ int can_nice(const struct task_struct *p, const int nice) | |||
6141 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4364 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
6142 | int nice_rlim = 20 - nice; | 4365 | int nice_rlim = 20 - nice; |
6143 | 4366 | ||
6144 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4367 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
6145 | capable(CAP_SYS_NICE)); | 4368 | capable(CAP_SYS_NICE)); |
6146 | } | 4369 | } |
6147 | 4370 | ||
@@ -6276,7 +4499,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6276 | { | 4499 | { |
6277 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4500 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
6278 | unsigned long flags; | 4501 | unsigned long flags; |
6279 | const struct sched_class *prev_class = p->sched_class; | 4502 | const struct sched_class *prev_class; |
6280 | struct rq *rq; | 4503 | struct rq *rq; |
6281 | int reset_on_fork; | 4504 | int reset_on_fork; |
6282 | 4505 | ||
@@ -6318,7 +4541,7 @@ recheck: | |||
6318 | 4541 | ||
6319 | if (!lock_task_sighand(p, &flags)) | 4542 | if (!lock_task_sighand(p, &flags)) |
6320 | return -ESRCH; | 4543 | return -ESRCH; |
6321 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4544 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); |
6322 | unlock_task_sighand(p, &flags); | 4545 | unlock_task_sighand(p, &flags); |
6323 | 4546 | ||
6324 | /* can't set/change the rt policy */ | 4547 | /* can't set/change the rt policy */ |
@@ -6390,6 +4613,7 @@ recheck: | |||
6390 | p->sched_reset_on_fork = reset_on_fork; | 4613 | p->sched_reset_on_fork = reset_on_fork; |
6391 | 4614 | ||
6392 | oldprio = p->prio; | 4615 | oldprio = p->prio; |
4616 | prev_class = p->sched_class; | ||
6393 | __setscheduler(rq, p, policy, param->sched_priority); | 4617 | __setscheduler(rq, p, policy, param->sched_priority); |
6394 | 4618 | ||
6395 | if (running) | 4619 | if (running) |
@@ -6689,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
6689 | int ret; | 4913 | int ret; |
6690 | cpumask_var_t mask; | 4914 | cpumask_var_t mask; |
6691 | 4915 | ||
6692 | if (len < cpumask_size()) | 4916 | if ((len * BITS_PER_BYTE) < nr_cpu_ids) |
4917 | return -EINVAL; | ||
4918 | if (len & (sizeof(unsigned long)-1)) | ||
6693 | return -EINVAL; | 4919 | return -EINVAL; |
6694 | 4920 | ||
6695 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 4921 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
@@ -6697,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
6697 | 4923 | ||
6698 | ret = sched_getaffinity(pid, mask); | 4924 | ret = sched_getaffinity(pid, mask); |
6699 | if (ret == 0) { | 4925 | if (ret == 0) { |
6700 | if (copy_to_user(user_mask_ptr, mask, cpumask_size())) | 4926 | size_t retlen = min_t(size_t, len, cpumask_size()); |
4927 | |||
4928 | if (copy_to_user(user_mask_ptr, mask, retlen)) | ||
6701 | ret = -EFAULT; | 4929 | ret = -EFAULT; |
6702 | else | 4930 | else |
6703 | ret = cpumask_size(); | 4931 | ret = retlen; |
6704 | } | 4932 | } |
6705 | free_cpumask_var(mask); | 4933 | free_cpumask_var(mask); |
6706 | 4934 | ||
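The reworked size check above accepts any user buffer that can hold nr_cpu_ids bits and is a whole number of longs, then caps the copy at the kernel's cpumask size. A standalone sketch of just the validation (plain C; nr_cpu_ids here is a stand-in value, not the kernel variable):

#include <stdio.h>
#include <stddef.h>

#define BITS_PER_BYTE 8

static long check_len(size_t len, unsigned int nr_cpu_ids)
{
	if (len * BITS_PER_BYTE < nr_cpu_ids)
		return -1;	/* -EINVAL: buffer cannot hold all cpu bits */
	if (len & (sizeof(unsigned long) - 1))
		return -1;	/* -EINVAL: not a multiple of sizeof(long) */
	return 0;
}

int main(void)
{
	printf("%ld\n", check_len(8, 64));	/* 64 bits in 8 bytes: ok */
	printf("%ld\n", check_len(4, 64));	/* too small: rejected */
	return 0;
}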
@@ -7140,23 +5368,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7140 | struct rq *rq; | 5368 | struct rq *rq; |
7141 | int ret = 0; | 5369 | int ret = 0; |
7142 | 5370 | ||
7143 | /* | ||
7144 | * Since we rely on wake-ups to migrate sleeping tasks, don't change | ||
7145 | * the ->cpus_allowed mask from under waking tasks, which would be | ||
7146 | * possible when we change rq->lock in ttwu(), so synchronize against | ||
7147 | * TASK_WAKING to avoid that. | ||
7148 | */ | ||
7149 | again: | ||
7150 | while (p->state == TASK_WAKING) | ||
7151 | cpu_relax(); | ||
7152 | |||
7153 | rq = task_rq_lock(p, &flags); | 5371 | rq = task_rq_lock(p, &flags); |
7154 | 5372 | ||
7155 | if (p->state == TASK_WAKING) { | ||
7156 | task_rq_unlock(rq, &flags); | ||
7157 | goto again; | ||
7158 | } | ||
7159 | |||
7160 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5373 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
7161 | ret = -EINVAL; | 5374 | ret = -EINVAL; |
7162 | goto out; | 5375 | goto out; |
@@ -7185,7 +5398,7 @@ again: | |||
7185 | 5398 | ||
7186 | get_task_struct(mt); | 5399 | get_task_struct(mt); |
7187 | task_rq_unlock(rq, &flags); | 5400 | task_rq_unlock(rq, &flags); |
7188 | wake_up_process(rq->migration_thread); | 5401 | wake_up_process(mt); |
7189 | put_task_struct(mt); | 5402 | put_task_struct(mt); |
7190 | wait_for_completion(&req.done); | 5403 | wait_for_completion(&req.done); |
7191 | tlb_migrate_finish(p->mm); | 5404 | tlb_migrate_finish(p->mm); |
@@ -9208,11 +7421,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
9208 | 7421 | ||
9209 | #ifdef CONFIG_SCHED_MC | 7422 | #ifdef CONFIG_SCHED_MC |
9210 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 7423 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
7424 | struct sysdev_class_attribute *attr, | ||
9211 | char *page) | 7425 | char *page) |
9212 | { | 7426 | { |
9213 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7427 | return sprintf(page, "%u\n", sched_mc_power_savings); |
9214 | } | 7428 | } |
9215 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 7429 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
7430 | struct sysdev_class_attribute *attr, | ||
9216 | const char *buf, size_t count) | 7431 | const char *buf, size_t count) |
9217 | { | 7432 | { |
9218 | return sched_power_savings_store(buf, count, 0); | 7433 | return sched_power_savings_store(buf, count, 0); |
@@ -9224,11 +7439,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | |||
9224 | 7439 | ||
9225 | #ifdef CONFIG_SCHED_SMT | 7440 | #ifdef CONFIG_SCHED_SMT |
9226 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 7441 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
7442 | struct sysdev_class_attribute *attr, | ||
9227 | char *page) | 7443 | char *page) |
9228 | { | 7444 | { |
9229 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7445 | return sprintf(page, "%u\n", sched_smt_power_savings); |
9230 | } | 7446 | } |
9231 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 7447 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
7448 | struct sysdev_class_attribute *attr, | ||
9232 | const char *buf, size_t count) | 7449 | const char *buf, size_t count) |
9233 | { | 7450 | { |
9234 | return sched_power_savings_store(buf, count, 1); | 7451 | return sched_power_savings_store(buf, count, 1); |
@@ -9443,7 +7660,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
9443 | tg->rt_rq[cpu] = rt_rq; | 7660 | tg->rt_rq[cpu] = rt_rq; |
9444 | init_rt_rq(rt_rq, rq); | 7661 | init_rt_rq(rt_rq, rq); |
9445 | rt_rq->tg = tg; | 7662 | rt_rq->tg = tg; |
9446 | rt_rq->rt_se = rt_se; | ||
9447 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7663 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
9448 | if (add) | 7664 | if (add) |
9449 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7665 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
@@ -9474,9 +7690,6 @@ void __init sched_init(void) | |||
9474 | #ifdef CONFIG_RT_GROUP_SCHED | 7690 | #ifdef CONFIG_RT_GROUP_SCHED |
9475 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7691 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
9476 | #endif | 7692 | #endif |
9477 | #ifdef CONFIG_USER_SCHED | ||
9478 | alloc_size *= 2; | ||
9479 | #endif | ||
9480 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7693 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9481 | alloc_size += num_possible_cpus() * cpumask_size(); | 7694 | alloc_size += num_possible_cpus() * cpumask_size(); |
9482 | #endif | 7695 | #endif |
@@ -9490,13 +7703,6 @@ void __init sched_init(void) | |||
9490 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7703 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; |
9491 | ptr += nr_cpu_ids * sizeof(void **); | 7704 | ptr += nr_cpu_ids * sizeof(void **); |
9492 | 7705 | ||
9493 | #ifdef CONFIG_USER_SCHED | ||
9494 | root_task_group.se = (struct sched_entity **)ptr; | ||
9495 | ptr += nr_cpu_ids * sizeof(void **); | ||
9496 | |||
9497 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
9498 | ptr += nr_cpu_ids * sizeof(void **); | ||
9499 | #endif /* CONFIG_USER_SCHED */ | ||
9500 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7706 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9501 | #ifdef CONFIG_RT_GROUP_SCHED | 7707 | #ifdef CONFIG_RT_GROUP_SCHED |
9502 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7708 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
@@ -9505,13 +7711,6 @@ void __init sched_init(void) | |||
9505 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7711 | init_task_group.rt_rq = (struct rt_rq **)ptr; |
9506 | ptr += nr_cpu_ids * sizeof(void **); | 7712 | ptr += nr_cpu_ids * sizeof(void **); |
9507 | 7713 | ||
9508 | #ifdef CONFIG_USER_SCHED | ||
9509 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
9510 | ptr += nr_cpu_ids * sizeof(void **); | ||
9511 | |||
9512 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
9513 | ptr += nr_cpu_ids * sizeof(void **); | ||
9514 | #endif /* CONFIG_USER_SCHED */ | ||
9515 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7714 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9516 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7715 | #ifdef CONFIG_CPUMASK_OFFSTACK |
9517 | for_each_possible_cpu(i) { | 7716 | for_each_possible_cpu(i) { |
@@ -9531,22 +7730,13 @@ void __init sched_init(void) | |||
9531 | #ifdef CONFIG_RT_GROUP_SCHED | 7730 | #ifdef CONFIG_RT_GROUP_SCHED |
9532 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7731 | init_rt_bandwidth(&init_task_group.rt_bandwidth, |
9533 | global_rt_period(), global_rt_runtime()); | 7732 | global_rt_period(), global_rt_runtime()); |
9534 | #ifdef CONFIG_USER_SCHED | ||
9535 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
9536 | global_rt_period(), RUNTIME_INF); | ||
9537 | #endif /* CONFIG_USER_SCHED */ | ||
9538 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7733 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9539 | 7734 | ||
9540 | #ifdef CONFIG_GROUP_SCHED | 7735 | #ifdef CONFIG_CGROUP_SCHED |
9541 | list_add(&init_task_group.list, &task_groups); | 7736 | list_add(&init_task_group.list, &task_groups); |
9542 | INIT_LIST_HEAD(&init_task_group.children); | 7737 | INIT_LIST_HEAD(&init_task_group.children); |
9543 | 7738 | ||
9544 | #ifdef CONFIG_USER_SCHED | 7739 | #endif /* CONFIG_CGROUP_SCHED */ |
9545 | INIT_LIST_HEAD(&root_task_group.children); | ||
9546 | init_task_group.parent = &root_task_group; | ||
9547 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
9548 | #endif /* CONFIG_USER_SCHED */ | ||
9549 | #endif /* CONFIG_GROUP_SCHED */ | ||
9550 | 7740 | ||
9551 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | 7741 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP |
9552 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | 7742 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), |
@@ -9586,25 +7776,6 @@ void __init sched_init(void) | |||
9586 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7776 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). |
9587 | */ | 7777 | */ |
9588 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7778 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); |
9589 | #elif defined CONFIG_USER_SCHED | ||
9590 | root_task_group.shares = NICE_0_LOAD; | ||
9591 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
9592 | /* | ||
9593 | * In case of task-groups formed thr' the user id of tasks, | ||
9594 | * init_task_group represents tasks belonging to root user. | ||
9595 | * Hence it forms a sibling of all subsequent groups formed. | ||
9596 | * In this case, init_task_group gets only a fraction of overall | ||
9597 | * system cpu resource, based on the weight assigned to root | ||
9598 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
9599 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
9600 | * (init_tg_cfs_rq) and having one entity represent this group of | ||
9601 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
9602 | */ | ||
9603 | init_tg_cfs_entry(&init_task_group, | ||
9604 | &per_cpu(init_tg_cfs_rq, i), | ||
9605 | &per_cpu(init_sched_entity, i), i, 1, | ||
9606 | root_task_group.se[i]); | ||
9607 | |||
9608 | #endif | 7779 | #endif |
9609 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9610 | 7781 | ||
@@ -9613,12 +7784,6 @@ void __init sched_init(void) | |||
9613 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7784 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
9614 | #ifdef CONFIG_CGROUP_SCHED | 7785 | #ifdef CONFIG_CGROUP_SCHED |
9615 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | 7786 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); |
9616 | #elif defined CONFIG_USER_SCHED | ||
9617 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
9618 | init_tg_rt_entry(&init_task_group, | ||
9619 | &per_cpu(init_rt_rq_var, i), | ||
9620 | &per_cpu(init_sched_rt_entity, i), i, 1, | ||
9621 | root_task_group.rt_se[i]); | ||
9622 | #endif | 7787 | #endif |
9623 | #endif | 7788 | #endif |
9624 | 7789 | ||
@@ -9703,7 +7868,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
9703 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 7868 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
9704 | } | 7869 | } |
9705 | 7870 | ||
9706 | void __might_sleep(char *file, int line, int preempt_offset) | 7871 | void __might_sleep(const char *file, int line, int preempt_offset) |
9707 | { | 7872 | { |
9708 | #ifdef in_atomic | 7873 | #ifdef in_atomic |
9709 | static unsigned long prev_jiffy; /* ratelimiting */ | 7874 | static unsigned long prev_jiffy; /* ratelimiting */ |
@@ -10014,7 +8179,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
10014 | } | 8179 | } |
10015 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8180 | #endif /* CONFIG_RT_GROUP_SCHED */ |
10016 | 8181 | ||
10017 | #ifdef CONFIG_GROUP_SCHED | 8182 | #ifdef CONFIG_CGROUP_SCHED |
10018 | static void free_sched_group(struct task_group *tg) | 8183 | static void free_sched_group(struct task_group *tg) |
10019 | { | 8184 | { |
10020 | free_fair_sched_group(tg); | 8185 | free_fair_sched_group(tg); |
@@ -10119,11 +8284,11 @@ void sched_move_task(struct task_struct *tsk) | |||
10119 | if (unlikely(running)) | 8284 | if (unlikely(running)) |
10120 | tsk->sched_class->set_curr_task(rq); | 8285 | tsk->sched_class->set_curr_task(rq); |
10121 | if (on_rq) | 8286 | if (on_rq) |
10122 | enqueue_task(rq, tsk, 0); | 8287 | enqueue_task(rq, tsk, 0, false); |
10123 | 8288 | ||
10124 | task_rq_unlock(rq, &flags); | 8289 | task_rq_unlock(rq, &flags); |
10125 | } | 8290 | } |
10126 | #endif /* CONFIG_GROUP_SCHED */ | 8291 | #endif /* CONFIG_CGROUP_SCHED */ |
10127 | 8292 | ||
10128 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8293 | #ifdef CONFIG_FAIR_GROUP_SCHED |
10129 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | 8294 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
@@ -10265,13 +8430,6 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
10265 | runtime = d->rt_runtime; | 8430 | runtime = d->rt_runtime; |
10266 | } | 8431 | } |
10267 | 8432 | ||
10268 | #ifdef CONFIG_USER_SCHED | ||
10269 | if (tg == &root_task_group) { | ||
10270 | period = global_rt_period(); | ||
10271 | runtime = global_rt_runtime(); | ||
10272 | } | ||
10273 | #endif | ||
10274 | |||
10275 | /* | 8433 | /* |
10276 | * Cannot have more runtime than the period. | 8434 | * Cannot have more runtime than the period. |
10277 | */ | 8435 | */ |
@@ -10674,7 +8832,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
10674 | struct cpuacct { | 8832 | struct cpuacct { |
10675 | struct cgroup_subsys_state css; | 8833 | struct cgroup_subsys_state css; |
10676 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8834 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
10677 | u64 *cpuusage; | 8835 | u64 __percpu *cpuusage; |
10678 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8836 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
10679 | struct cpuacct *parent; | 8837 | struct cpuacct *parent; |
10680 | }; | 8838 | }; |
@@ -10891,12 +9049,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
10891 | } | 9049 | } |
10892 | 9050 | ||
10893 | /* | 9051 | /* |
9052 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9053 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9054 | * percpu_counter_add with values large enough to always overflow the | ||
9055 | * per cpu batch limit causing bad SMP scalability. | ||
9056 | * | ||
9057 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9058 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9059 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9060 | */ | ||
9061 | #ifdef CONFIG_SMP | ||
9062 | #define CPUACCT_BATCH \ | ||
9063 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9064 | #else | ||
9065 | #define CPUACCT_BATCH 0 | ||
9066 | #endif | ||
9067 | |||
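The arithmetic behind CPUACCT_BATCH, as a worked standalone example (plain C, assuming a 64-bit long; the percpu_counter_batch and cputime_one_jiffy values are illustrative): scaling the batch by cputime_one_jiffy keeps one jiffy's worth of charged time inside the batch, and the product is capped at INT_MAX.

#include <stdio.h>
#include <limits.h>

static long cpuacct_batch(long percpu_counter_batch, long cputime_one_jiffy)
{
	long batch = percpu_counter_batch * cputime_one_jiffy;

	return batch < INT_MAX ? batch : INT_MAX;	/* cap at INT_MAX */
}

int main(void)
{
	/* e.g. batch 32, one jiffy = 1,000,000 cputime units */
	printf("%ld\n", cpuacct_batch(32, 1000000));	/* 32000000 */
	printf("%ld\n", cpuacct_batch(32, 100000000));	/* 2147483647 (capped) */
	return 0;
}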
9068 | /* | ||
10894 | * Charge the system/user time to the task's accounting group. | 9069 | * Charge the system/user time to the task's accounting group. |
10895 | */ | 9070 | */ |
10896 | static void cpuacct_update_stats(struct task_struct *tsk, | 9071 | static void cpuacct_update_stats(struct task_struct *tsk, |
10897 | enum cpuacct_stat_index idx, cputime_t val) | 9072 | enum cpuacct_stat_index idx, cputime_t val) |
10898 | { | 9073 | { |
10899 | struct cpuacct *ca; | 9074 | struct cpuacct *ca; |
9075 | int batch = CPUACCT_BATCH; | ||
10900 | 9076 | ||
10901 | if (unlikely(!cpuacct_subsys.active)) | 9077 | if (unlikely(!cpuacct_subsys.active)) |
10902 | return; | 9078 | return; |
@@ -10905,7 +9081,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, | |||
10905 | ca = task_ca(tsk); | 9081 | ca = task_ca(tsk); |
10906 | 9082 | ||
10907 | do { | 9083 | do { |
10908 | percpu_counter_add(&ca->cpustat[idx], val); | 9084 | __percpu_counter_add(&ca->cpustat[idx], val, batch); |
10909 | ca = ca->parent; | 9085 | ca = ca->parent; |
10910 | } while (ca); | 9086 | } while (ca); |
10911 | rcu_read_unlock(); | 9087 | rcu_read_unlock(); |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..e6871cb3fc83 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -27,6 +27,7 @@ | |||
27 | * of the License. | 27 | * of the License. |
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/gfp.h> | ||
30 | #include "sched_cpupri.h" | 31 | #include "sched_cpupri.h" |
31 | 32 | ||
32 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
@@ -47,9 +48,7 @@ static int convert_prio(int prio) | |||
47 | } | 48 | } |
48 | 49 | ||
49 | #define for_each_cpupri_active(array, idx) \ | 50 | #define for_each_cpupri_active(array, idx) \ |
50 | for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ | 51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) |
51 | idx < CPUPRI_NR_PRIORITIES; \ | ||
52 | idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) | ||
53 | 52 | ||
54 | /** | 53 | /** |
55 | * cpupri_find - find the best (lowest-pri) CPU in the system | 54 | * cpupri_find - find the best (lowest-pri) CPU in the system |
@@ -58,7 +57,7 @@ static int convert_prio(int prio) | |||
58 | * @lowest_mask: A mask to fill in with selected CPUs (or NULL) | 57 | * @lowest_mask: A mask to fill in with selected CPUs (or NULL) |
59 | * | 58 | * |
60 | * Note: This function returns the recommended CPUs as calculated during the | 59 | * Note: This function returns the recommended CPUs as calculated during the |
61 | * current invokation. By the time the call returns, the CPUs may have in | 60 | * current invocation. By the time the call returns, the CPUs may have in |
62 | * fact changed priorities any number of times. While not ideal, it is not | 61 | * fact changed priorities any number of times. While not ideal, it is not |
63 | * an issue of correctness since the normal rebalancer logic will correct | 62 | * an issue of correctness since the normal rebalancer logic will correct |
64 | * any discrepancies created by racing against the uncertainty of the current | 63 | * any discrepancies created by racing against the uncertainty of the current |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aada4b9..19be00ba6123 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
114 | { | 114 | { |
115 | char path[64]; | 115 | char path[64]; |
116 | 116 | ||
117 | rcu_read_lock(); | ||
117 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | 118 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); |
119 | rcu_read_unlock(); | ||
118 | SEQ_printf(m, " %s", path); | 120 | SEQ_printf(m, " %s", path); |
119 | } | 121 | } |
120 | #endif | 122 | #endif |
@@ -518,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p) | |||
518 | p->se.nr_wakeups_idle = 0; | 520 | p->se.nr_wakeups_idle = 0; |
519 | p->sched_info.bkl_count = 0; | 521 | p->sched_info.bkl_count = 0; |
520 | #endif | 522 | #endif |
521 | p->se.sum_exec_runtime = 0; | ||
522 | p->se.prev_sum_exec_runtime = 0; | ||
523 | p->nvcsw = 0; | ||
524 | p->nivcsw = 0; | ||
525 | } | 523 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8fe7ee81c552..5a5ea2cd924f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq) | |||
1053 | * increased. Here we update the fair scheduling stats and | 1053 | * increased. Here we update the fair scheduling stats and |
1054 | * then put the task into the rbtree: | 1054 | * then put the task into the rbtree: |
1055 | */ | 1055 | */ |
1056 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 1056 | static void |
1057 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
1057 | { | 1058 | { |
1058 | struct cfs_rq *cfs_rq; | 1059 | struct cfs_rq *cfs_rq; |
1059 | struct sched_entity *se = &p->se; | 1060 | struct sched_entity *se = &p->se; |
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1815 | */ | 1816 | */ |
1816 | 1817 | ||
1817 | /* | 1818 | /* |
1818 | * Load-balancing iterator. Note: while the runqueue stays locked | 1819 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1819 | * during the whole iteration, the current task might be | 1820 | * Both runqueues must be locked. |
1820 | * dequeued so the iterator has to be dequeue-safe. Here we | ||
1821 | * achieve that by always pre-iterating before returning | ||
1822 | * the current task: | ||
1823 | */ | 1821 | */ |
1824 | static struct task_struct * | 1822 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
1825 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | 1823 | struct rq *this_rq, int this_cpu) |
1826 | { | 1824 | { |
1827 | struct task_struct *p = NULL; | 1825 | deactivate_task(src_rq, p, 0); |
1828 | struct sched_entity *se; | 1826 | set_task_cpu(p, this_cpu); |
1827 | activate_task(this_rq, p, 0); | ||
1828 | check_preempt_curr(this_rq, p, 0); | ||
1829 | } | ||
1829 | 1830 | ||
1830 | if (next == &cfs_rq->tasks) | 1831 | /* |
1831 | return NULL; | 1832 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1833 | */ | ||
1834 | static | ||
1835 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
1836 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1837 | int *all_pinned) | ||
1838 | { | ||
1839 | int tsk_cache_hot = 0; | ||
1840 | /* | ||
1841 | * We do not migrate tasks that are: | ||
1842 | * 1) running (obviously), or | ||
1843 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
1844 | * 3) are cache-hot on their current CPU. | ||
1845 | */ | ||
1846 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
1847 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
1848 | return 0; | ||
1849 | } | ||
1850 | *all_pinned = 0; | ||
1832 | 1851 | ||
1833 | se = list_entry(next, struct sched_entity, group_node); | 1852 | if (task_running(rq, p)) { |
1834 | p = task_of(se); | 1853 | schedstat_inc(p, se.nr_failed_migrations_running); |
1835 | cfs_rq->balance_iterator = next->next; | 1854 | return 0; |
1855 | } | ||
1836 | 1856 | ||
1837 | return p; | 1857 | /* |
1838 | } | 1858 | * Aggressive migration if: |
1859 | * 1) task is cache cold, or | ||
1860 | * 2) too many balance attempts have failed. | ||
1861 | */ | ||
1839 | 1862 | ||
1840 | static struct task_struct *load_balance_start_fair(void *arg) | 1863 | tsk_cache_hot = task_hot(p, rq->clock, sd); |
1841 | { | 1864 | if (!tsk_cache_hot || |
1842 | struct cfs_rq *cfs_rq = arg; | 1865 | sd->nr_balance_failed > sd->cache_nice_tries) { |
1866 | #ifdef CONFIG_SCHEDSTATS | ||
1867 | if (tsk_cache_hot) { | ||
1868 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
1869 | schedstat_inc(p, se.nr_forced_migrations); | ||
1870 | } | ||
1871 | #endif | ||
1872 | return 1; | ||
1873 | } | ||
1843 | 1874 | ||
1844 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); | 1875 | if (tsk_cache_hot) { |
1876 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
1877 | return 0; | ||
1878 | } | ||
1879 | return 1; | ||
1845 | } | 1880 | } |
1846 | 1881 | ||
1847 | static struct task_struct *load_balance_next_fair(void *arg) | 1882 | /* |
1883 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
1884 | * part of active balancing operations within "domain". | ||
1885 | * Returns 1 if successful and 0 otherwise. | ||
1886 | * | ||
1887 | * Called with both runqueues locked. | ||
1888 | */ | ||
1889 | static int | ||
1890 | move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1891 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
1848 | { | 1892 | { |
1849 | struct cfs_rq *cfs_rq = arg; | 1893 | struct task_struct *p, *n; |
1894 | struct cfs_rq *cfs_rq; | ||
1895 | int pinned = 0; | ||
1896 | |||
1897 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | ||
1898 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | ||
1899 | |||
1900 | if (!can_migrate_task(p, busiest, this_cpu, | ||
1901 | sd, idle, &pinned)) | ||
1902 | continue; | ||
1850 | 1903 | ||
1851 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1904 | pull_task(busiest, p, this_rq, this_cpu); |
1905 | /* | ||
1906 | * Right now, this is only the second place pull_task() | ||
1907 | * is called, so we can safely collect pull_task() | ||
1908 | * stats here rather than inside pull_task(). | ||
1909 | */ | ||
1910 | schedstat_inc(sd, lb_gained[idle]); | ||
1911 | return 1; | ||
1912 | } | ||
1913 | } | ||
1914 | |||
1915 | return 0; | ||
1852 | } | 1916 | } |
1853 | 1917 | ||
1854 | static unsigned long | 1918 | static unsigned long |
1855 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1919 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1856 | unsigned long max_load_move, struct sched_domain *sd, | 1920 | unsigned long max_load_move, struct sched_domain *sd, |
1857 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | 1921 | enum cpu_idle_type idle, int *all_pinned, |
1858 | struct cfs_rq *cfs_rq) | 1922 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) |
1859 | { | 1923 | { |
1860 | struct rq_iterator cfs_rq_iterator; | 1924 | int loops = 0, pulled = 0, pinned = 0; |
1925 | long rem_load_move = max_load_move; | ||
1926 | struct task_struct *p, *n; | ||
1861 | 1927 | ||
1862 | cfs_rq_iterator.start = load_balance_start_fair; | 1928 | if (max_load_move == 0) |
1863 | cfs_rq_iterator.next = load_balance_next_fair; | 1929 | goto out; |
1864 | cfs_rq_iterator.arg = cfs_rq; | ||
1865 | 1930 | ||
1866 | return balance_tasks(this_rq, this_cpu, busiest, | 1931 | pinned = 1; |
1867 | max_load_move, sd, idle, all_pinned, | 1932 | |
1868 | this_best_prio, &cfs_rq_iterator); | 1933 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
1934 | if (loops++ > sysctl_sched_nr_migrate) | ||
1935 | break; | ||
1936 | |||
1937 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
1938 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | ||
1939 | continue; | ||
1940 | |||
1941 | pull_task(busiest, p, this_rq, this_cpu); | ||
1942 | pulled++; | ||
1943 | rem_load_move -= p->se.load.weight; | ||
1944 | |||
1945 | #ifdef CONFIG_PREEMPT | ||
1946 | /* | ||
1947 | * NEWIDLE balancing is a source of latency, so preemptible | ||
1948 | * kernels will stop after the first task is pulled to minimize | ||
1949 | * the critical section. | ||
1950 | */ | ||
1951 | if (idle == CPU_NEWLY_IDLE) | ||
1952 | break; | ||
1953 | #endif | ||
1954 | |||
1955 | /* | ||
1956 | * We only want to steal up to the prescribed amount of | ||
1957 | * weighted load. | ||
1958 | */ | ||
1959 | if (rem_load_move <= 0) | ||
1960 | break; | ||
1961 | |||
1962 | if (p->prio < *this_best_prio) | ||
1963 | *this_best_prio = p->prio; | ||
1964 | } | ||
1965 | out: | ||
1966 | /* | ||
1967 | * Right now, this is one of only two places pull_task() is called, | ||
1968 | * so we can safely collect pull_task() stats here rather than | ||
1969 | * inside pull_task(). | ||
1970 | */ | ||
1971 | schedstat_add(sd, lb_gained[idle], pulled); | ||
1972 | |||
1973 | if (all_pinned) | ||
1974 | *all_pinned = pinned; | ||
1975 | |||
1976 | return max_load_move - rem_load_move; | ||
1869 | } | 1977 | } |
1870 | 1978 | ||
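One detail worth calling out in balance_tasks() above is the skip test (p->se.load.weight >> 1) > rem_load_move: a task is passed over when half its weight already exceeds the load still to be moved, so a single heavy task cannot overshoot the requested imbalance by much. A standalone model (plain C; the weights are illustrative):

#include <stdio.h>

static int should_skip(long task_weight, long rem_load_move)
{
	return (task_weight >> 1) > rem_load_move;	/* too heavy to pull */
}

int main(void)
{
	printf("%d\n", should_skip(1024, 300));	/* 512 > 300: skip */
	printf("%d\n", should_skip(1024, 600));	/* 512 <= 600: pull */
	return 0;
}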
1871 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1979 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1897 | rem_load = (u64)rem_load_move * busiest_weight; | 2005 | rem_load = (u64)rem_load_move * busiest_weight; |
1898 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2006 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
1899 | 2007 | ||
1900 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, | 2008 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
1901 | rem_load, sd, idle, all_pinned, this_best_prio, | 2009 | rem_load, sd, idle, all_pinned, this_best_prio, |
1902 | tg->cfs_rq[busiest_cpu]); | 2010 | busiest_cfs_rq); |
1903 | 2011 | ||
1904 | if (!moved_load) | 2012 | if (!moved_load) |
1905 | continue; | 2013 | continue; |
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1922 | struct sched_domain *sd, enum cpu_idle_type idle, | 2030 | struct sched_domain *sd, enum cpu_idle_type idle, |
1923 | int *all_pinned, int *this_best_prio) | 2031 | int *all_pinned, int *this_best_prio) |
1924 | { | 2032 | { |
1925 | return __load_balance_fair(this_rq, this_cpu, busiest, | 2033 | return balance_tasks(this_rq, this_cpu, busiest, |
1926 | max_load_move, sd, idle, all_pinned, | 2034 | max_load_move, sd, idle, all_pinned, |
1927 | this_best_prio, &busiest->cfs); | 2035 | this_best_prio, &busiest->cfs); |
1928 | } | 2036 | } |
1929 | #endif | 2037 | #endif |
1930 | 2038 | ||
1931 | static int | 2039 | /* |
1932 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2040 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
1933 | struct sched_domain *sd, enum cpu_idle_type idle) | 2041 | * this_rq, as part of a balancing operation within domain "sd". |
2042 | * Returns 1 if successful and 0 otherwise. | ||
2043 | * | ||
2044 | * Called with both runqueues locked. | ||
2045 | */ | ||
2046 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2047 | unsigned long max_load_move, | ||
2048 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2049 | int *all_pinned) | ||
1934 | { | 2050 | { |
1935 | struct cfs_rq *busy_cfs_rq; | 2051 | unsigned long total_load_moved = 0, load_moved; |
1936 | struct rq_iterator cfs_rq_iterator; | 2052 | int this_best_prio = this_rq->curr->prio; |
1937 | 2053 | ||
1938 | cfs_rq_iterator.start = load_balance_start_fair; | 2054 | do { |
1939 | cfs_rq_iterator.next = load_balance_next_fair; | 2055 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2056 | max_load_move - total_load_moved, | ||
2057 | sd, idle, all_pinned, &this_best_prio); | ||
1940 | 2058 | ||
1941 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 2059 | total_load_moved += load_moved; |
2060 | |||
2061 | #ifdef CONFIG_PREEMPT | ||
1942 | /* | 2062 | /* |
1943 | * pass busy_cfs_rq argument into | 2063 | * NEWIDLE balancing is a source of latency, so preemptible |
1944 | * load_balance_[start|next]_fair iterators | 2064 | * kernels will stop after the first task is pulled to minimize |
2065 | * the critical section. | ||
1945 | */ | 2066 | */ |
1946 | cfs_rq_iterator.arg = busy_cfs_rq; | 2067 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) |
1947 | if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 2068 | break; |
1948 | &cfs_rq_iterator)) | 2069 | |
1949 | return 1; | 2070 | if (raw_spin_is_contended(&this_rq->lock) || |
2071 | raw_spin_is_contended(&busiest->lock)) | ||
2072 | break; | ||
2073 | #endif | ||
2074 | } while (load_moved && max_load_move > total_load_moved); | ||
2075 | |||
2076 | return total_load_moved > 0; | ||
2077 | } | ||
2078 | |||
2079 | /********** Helpers for find_busiest_group ************************/ | ||
2080 | /* | ||
2081 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
2082 | * during load balancing. | ||
2083 | */ | ||
2084 | struct sd_lb_stats { | ||
2085 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
2086 | struct sched_group *this; /* Local group in this sd */ | ||
2087 | unsigned long total_load; /* Total load of all groups in sd */ | ||
2088 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
2089 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
2090 | |||
2091 | /** Statistics of this group */ | ||
2092 | unsigned long this_load; | ||
2093 | unsigned long this_load_per_task; | ||
2094 | unsigned long this_nr_running; | ||
2095 | |||
2096 | /* Statistics of the busiest group */ | ||
2097 | unsigned long max_load; | ||
2098 | unsigned long busiest_load_per_task; | ||
2099 | unsigned long busiest_nr_running; | ||
2100 | unsigned long busiest_group_capacity; | ||
2101 | |||
2102 | int group_imb; /* Is there imbalance in this sd */ | ||
2103 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2104 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
2105 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
2106 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
2107 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
2108 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
2109 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
2110 | #endif | ||
2111 | }; | ||
2112 | |||
2113 | /* | ||
2114 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
2115 | */ | ||
2116 | struct sg_lb_stats { | ||
2117 | unsigned long avg_load; /* Avg load across the CPUs of the group */ | ||
2118 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
2119 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
2120 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
2121 | unsigned long group_capacity; | ||
2122 | int group_imb; /* Is there an imbalance in the group? */ | ||
2123 | }; | ||
2124 | |||
2125 | /** | ||
2126 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
2127 | * @group: The group whose first cpu is to be returned. | ||
2128 | */ | ||
2129 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
2130 | { | ||
2131 | return cpumask_first(sched_group_cpus(group)); | ||
2132 | } | ||
2133 | |||
2134 | /** | ||
2135 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
2136 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
2137 | * @idle: The idle status of the CPU whose sd load_idx is obtained. | ||
2138 | */ | ||
2139 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
2140 | enum cpu_idle_type idle) | ||
2141 | { | ||
2142 | int load_idx; | ||
2143 | |||
2144 | switch (idle) { | ||
2145 | case CPU_NOT_IDLE: | ||
2146 | load_idx = sd->busy_idx; | ||
2147 | break; | ||
2148 | |||
2149 | case CPU_NEWLY_IDLE: | ||
2150 | load_idx = sd->newidle_idx; | ||
2151 | break; | ||
2152 | default: | ||
2153 | load_idx = sd->idle_idx; | ||
2154 | break; | ||
1950 | } | 2155 | } |
1951 | 2156 | ||
2157 | return load_idx; | ||
2158 | } | ||
2159 | |||
2160 | |||
2161 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2162 | /** | ||
2163 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
2164 | * the given sched_domain, during load balancing. | ||
2165 | * | ||
2166 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
2167 | * @sds: Variable containing the statistics for sd. | ||
2168 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
2169 | */ | ||
2170 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
2171 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
2172 | { | ||
2173 | /* | ||
2174 | * Busy processors will not participate in power savings | ||
2175 | * balance. | ||
2176 | */ | ||
2177 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2178 | sds->power_savings_balance = 0; | ||
2179 | else { | ||
2180 | sds->power_savings_balance = 1; | ||
2181 | sds->min_nr_running = ULONG_MAX; | ||
2182 | sds->leader_nr_running = 0; | ||
2183 | } | ||
2184 | } | ||
2185 | |||
2186 | /** | ||
2187 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
2188 | * sched_domain while performing load balancing. | ||
2189 | * | ||
2190 | * @group: sched_group belonging to the sched_domain under consideration. | ||
2191 | * @sds: Variable containing the statistics of the sched_domain | ||
2192 | * @local_group: Does group contain the CPU for which we're performing | ||
2193 | * load balancing? | ||
2194 | * @sgs: Variable containing the statistics of the group. | ||
2195 | */ | ||
2196 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
2197 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
2198 | { | ||
2199 | |||
2200 | if (!sds->power_savings_balance) | ||
2201 | return; | ||
2202 | |||
2203 | /* | ||
2204 | * If the local group is idle or completely loaded, there is | ||
2205 | * no need to do power savings balance at this domain | ||
2206 | */ | ||
2207 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
2208 | !sds->this_nr_running)) | ||
2209 | sds->power_savings_balance = 0; | ||
2210 | |||
2211 | /* | ||
2212 | * If a group is already running at full capacity or idle, | ||
2213 | * don't include that group in power savings calculations | ||
2214 | */ | ||
2215 | if (!sds->power_savings_balance || | ||
2216 | sgs->sum_nr_running >= sgs->group_capacity || | ||
2217 | !sgs->sum_nr_running) | ||
2218 | return; | ||
2219 | |||
2220 | /* | ||
2221 | * Calculate the group which has the least non-idle load. | ||
2222 | * This is the group from which we need to pick up the load | ||
2223 | * for saving power | ||
2224 | */ | ||
2225 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
2226 | (sgs->sum_nr_running == sds->min_nr_running && | ||
2227 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
2228 | sds->group_min = group; | ||
2229 | sds->min_nr_running = sgs->sum_nr_running; | ||
2230 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
2231 | sgs->sum_nr_running; | ||
2232 | } | ||
2233 | |||
2234 | /* | ||
2235 | * Calculate the group which is near its | ||
2236 | * capacity but still has some space to pick up some load | ||
2237 | * from another group and save more power | ||
2238 | */ | ||
2239 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
2240 | return; | ||
2241 | |||
2242 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
2243 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
2244 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
2245 | sds->group_leader = group; | ||
2246 | sds->leader_nr_running = sgs->sum_nr_running; | ||
2247 | } | ||
2248 | } | ||
2249 | |||
2250 | /** | ||
2251 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
2252 | * @sds: Variable containing the statistics of the sched_domain | ||
2253 | * under consideration. | ||
2254 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
2255 | * @imbalance: Variable to store the imbalance. | ||
2256 | * | ||
2257 | * Description: | ||
2258 | * Check if we have potential to perform some power-savings balance. | ||
2259 | * If yes, set the busiest group to be the least loaded group in the | ||
2260 | * sched_domain, so that its CPUs can be put to idle. | ||
2261 | * | ||
2262 | * Returns 1 if there is potential to perform power-savings balance. | ||
2263 | * Else returns 0. | ||
2264 | */ | ||
2265 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
2266 | int this_cpu, unsigned long *imbalance) | ||
2267 | { | ||
2268 | if (!sds->power_savings_balance) | ||
2269 | return 0; | ||
2270 | |||
2271 | if (sds->this != sds->group_leader || | ||
2272 | sds->group_leader == sds->group_min) | ||
2273 | return 0; | ||
2274 | |||
2275 | *imbalance = sds->min_load_per_task; | ||
2276 | sds->busiest = sds->group_min; | ||
2277 | |||
2278 | return 1; | ||
2279 | |||
2280 | } | ||
2281 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
2282 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
2283 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
2284 | { | ||
2285 | return; | ||
2286 | } | ||
2287 | |||
2288 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
2289 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
2290 | { | ||
2291 | return; | ||
2292 | } | ||
2293 | |||
2294 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
2295 | int this_cpu, unsigned long *imbalance) | ||
2296 | { | ||
1952 | return 0; | 2297 | return 0; |
1953 | } | 2298 | } |
2299 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
2300 | |||
2301 | |||
2302 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
2303 | { | ||
2304 | return SCHED_LOAD_SCALE; | ||
2305 | } | ||
2306 | |||
2307 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
2308 | { | ||
2309 | return default_scale_freq_power(sd, cpu); | ||
2310 | } | ||
2311 | |||
2312 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
2313 | { | ||
2314 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
2315 | unsigned long smt_gain = sd->smt_gain; | ||
2316 | |||
2317 | smt_gain /= weight; | ||
2318 | |||
2319 | return smt_gain; | ||
2320 | } | ||
2321 | |||
2322 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
2323 | { | ||
2324 | return default_scale_smt_power(sd, cpu); | ||
2325 | } | ||
2326 | |||
2327 | unsigned long scale_rt_power(int cpu) | ||
2328 | { | ||
2329 | struct rq *rq = cpu_rq(cpu); | ||
2330 | u64 total, available; | ||
2331 | |||
2332 | sched_avg_update(rq); | ||
2333 | |||
2334 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
2335 | available = total - rq->rt_avg; | ||
2336 | |||
2337 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
2338 | total = SCHED_LOAD_SCALE; | ||
2339 | |||
2340 | total >>= SCHED_LOAD_SHIFT; | ||
2341 | |||
2342 | return div_u64(available, total); | ||
2343 | } | ||
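For reference, scale_rt_power() above expresses the CPU time left over after real-time activity as a fraction of SCHED_LOAD_SCALE. Below is a minimal userspace sketch of the same fixed-point arithmetic; the 1024 scale constant matches the kernel's, but the sample window and RT numbers are invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Fraction of CPU time not consumed by RT tasks, scaled to 0..1024. */
static unsigned long toy_scale_rt_power(uint64_t period_ns, uint64_t rt_ns)
{
        uint64_t total = period_ns;
        uint64_t available = total - rt_ns; /* assumes rt_ns <= period_ns */

        /* Clamp tiny windows so the shifted divisor cannot reach zero. */
        if ((int64_t)total < (int64_t)SCHED_LOAD_SCALE)
                total = SCHED_LOAD_SCALE;

        total >>= SCHED_LOAD_SHIFT;

        return (unsigned long)(available / total);
}

int main(void)
{
        /* 1s averaging window, 250ms spent in RT: prints 768, i.e.
         * roughly 75% of SCHED_LOAD_SCALE remains for fair tasks. */
        printf("%lu\n", toy_scale_rt_power(1000000000ULL, 250000000ULL));
        return 0;
}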
2344 | |||
2345 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
2346 | { | ||
2347 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
2348 | unsigned long power = SCHED_LOAD_SCALE; | ||
2349 | struct sched_group *sdg = sd->groups; | ||
2350 | |||
2351 | if (sched_feat(ARCH_POWER)) | ||
2352 | power *= arch_scale_freq_power(sd, cpu); | ||
2353 | else | ||
2354 | power *= default_scale_freq_power(sd, cpu); | ||
2355 | |||
2356 | power >>= SCHED_LOAD_SHIFT; | ||
2357 | |||
2358 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
2359 | if (sched_feat(ARCH_POWER)) | ||
2360 | power *= arch_scale_smt_power(sd, cpu); | ||
2361 | else | ||
2362 | power *= default_scale_smt_power(sd, cpu); | ||
2363 | |||
2364 | power >>= SCHED_LOAD_SHIFT; | ||
2365 | } | ||
2366 | |||
2367 | power *= scale_rt_power(cpu); | ||
2368 | power >>= SCHED_LOAD_SHIFT; | ||
2369 | |||
2370 | if (!power) | ||
2371 | power = 1; | ||
2372 | |||
2373 | sdg->cpu_power = power; | ||
2374 | } | ||
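update_cpu_power() chains several scaling factors in the same 10-bit fixed point: each factor is a value near SCHED_LOAD_SCALE, and every multiply is followed by a shift back down so the running product stays in range. A hedged sketch of that pattern follows; the three factor values are invented, not measured.

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* One fixed-point step: power = power * factor / 1024. */
static unsigned long compose(unsigned long power, unsigned long factor)
{
        power *= factor;
        power >>= SCHED_LOAD_SHIFT;
        return power;
}

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power = compose(power, 1024);   /* freq factor: nominal        */
        power = compose(power, 590);    /* smt factor: gain split ways */
        power = compose(power, 768);    /* rt factor: 75% time left    */

        if (!power)                     /* never let cpu_power hit 0   */
                power = 1;

        printf("cpu_power = %lu\n", power);     /* prints 442 */
        return 0;
}

The final clamp to 1 mirrors the kernel's refusal to let cpu_power reach zero, which would break the divisions in the load calculations above.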
2375 | |||
2376 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
2377 | { | ||
2378 | struct sched_domain *child = sd->child; | ||
2379 | struct sched_group *group, *sdg = sd->groups; | ||
2380 | unsigned long power; | ||
2381 | |||
2382 | if (!child) { | ||
2383 | update_cpu_power(sd, cpu); | ||
2384 | return; | ||
2385 | } | ||
2386 | |||
2387 | power = 0; | ||
2388 | |||
2389 | group = child->groups; | ||
2390 | do { | ||
2391 | power += group->cpu_power; | ||
2392 | group = group->next; | ||
2393 | } while (group != child->groups); | ||
2394 | |||
2395 | sdg->cpu_power = power; | ||
2396 | } | ||
2397 | |||
2398 | /** | ||
2399 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
2400 | * @sd: The sched_domain whose statistics are to be updated. | ||
2401 | * @group: sched_group whose statistics are to be updated. | ||
2402 | * @this_cpu: Cpu for which load balance is currently performed. | ||
2403 | * @idle: Idle status of this_cpu | ||
2404 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
2405 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2406 | * @local_group: Does group contain this_cpu. | ||
2407 | * @cpus: Set of cpus considered for load balancing. | ||
2408 | * @balance: Should we balance. | ||
2409 | * @sgs: variable to hold the statistics for this group. | ||
2410 | */ | ||
2411 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
2412 | struct sched_group *group, int this_cpu, | ||
2413 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
2414 | int local_group, const struct cpumask *cpus, | ||
2415 | int *balance, struct sg_lb_stats *sgs) | ||
2416 | { | ||
2417 | unsigned long load, max_cpu_load, min_cpu_load; | ||
2418 | int i; | ||
2419 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
2420 | unsigned long avg_load_per_task = 0; | ||
2421 | |||
2422 | if (local_group) | ||
2423 | balance_cpu = group_first_cpu(group); | ||
2424 | |||
2425 | /* Tally up the load of all CPUs in the group */ | ||
2426 | max_cpu_load = 0; | ||
2427 | min_cpu_load = ~0UL; | ||
2428 | |||
2429 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
2430 | struct rq *rq = cpu_rq(i); | ||
2431 | |||
2432 | if (*sd_idle && rq->nr_running) | ||
2433 | *sd_idle = 0; | ||
2434 | |||
2435 | /* Bias balancing toward cpus of our domain */ | ||
2436 | if (local_group) { | ||
2437 | if (idle_cpu(i) && !first_idle_cpu) { | ||
2438 | first_idle_cpu = 1; | ||
2439 | balance_cpu = i; | ||
2440 | } | ||
2441 | |||
2442 | load = target_load(i, load_idx); | ||
2443 | } else { | ||
2444 | load = source_load(i, load_idx); | ||
2445 | if (load > max_cpu_load) | ||
2446 | max_cpu_load = load; | ||
2447 | if (min_cpu_load > load) | ||
2448 | min_cpu_load = load; | ||
2449 | } | ||
2450 | |||
2451 | sgs->group_load += load; | ||
2452 | sgs->sum_nr_running += rq->nr_running; | ||
2453 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
2454 | |||
2455 | } | ||
2456 | |||
2457 | /* | ||
2458 | * First idle cpu or the first cpu (busiest) in this sched group | ||
2459 | * is eligible for doing load balancing at this and above | ||
2460 | * domains. In the newly idle case, we will allow all the cpus | ||
2461 | * to do the newly idle load balance. | ||
2462 | */ | ||
2463 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
2464 | balance_cpu != this_cpu) { | ||
2465 | *balance = 0; | ||
2466 | return; | ||
2467 | } | ||
2468 | |||
2469 | update_group_power(sd, this_cpu); | ||
2470 | |||
2471 | /* Adjust by relative CPU power of the group */ | ||
2472 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
2473 | |||
2474 | /* | ||
2475 | * Consider the group unbalanced when the imbalance is larger | ||
2476 | * than the average weight of two tasks. | ||
2477 | * | ||
2478 | * APZ: with cgroup the avg task weight can vary wildly and | ||
2479 | * might not be a suitable number - should we keep a | ||
2480 | * normalized nr_running number somewhere that negates | ||
2481 | * the hierarchy? | ||
2482 | */ | ||
2483 | if (sgs->sum_nr_running) | ||
2484 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | ||
2485 | |||
2486 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
2487 | sgs->group_imb = 1; | ||
2488 | |||
2489 | sgs->group_capacity = | ||
2490 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
2491 | } | ||
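The tail of update_sg_lb_stats() derives three things from the raw sums: avg_load normalized by the group's power, the group_imb flag when the intra-group load spread exceeds twice the average task weight, and capacity from rounding cpu_power to the nearest whole CPU. A standalone sketch with invented numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        /* Invented sums for one group of two nominal CPUs. */
        unsigned long group_load = 3584, cpu_power = 2048;
        unsigned long sum_nr_running = 3, sum_weighted_load = 3584;
        unsigned long max_cpu_load = 3072, min_cpu_load = 512;

        /* Normalize by the group's power. */
        unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;

        unsigned long avg_task = sum_nr_running ?
                        sum_weighted_load / sum_nr_running : 0;

        /* A spread wider than two average tasks flags the group. */
        int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_task;

        /* Power rounded to the nearest whole CPU's worth. */
        unsigned long capacity =
                        DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

        printf("avg_load=%lu imb=%d capacity=%lu\n",
               avg_load, group_imb, capacity);  /* 1792 1 2 */
        return 0;
}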
2492 | |||
2493 | /** | ||
2494 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | ||
2495 | * @sd: sched_domain whose statistics are to be updated. | ||
2496 | * @this_cpu: Cpu for which load balance is currently performed. | ||
2497 | * @idle: Idle status of this_cpu | ||
2498 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2499 | * @cpus: Set of cpus considered for load balancing. | ||
2500 | * @balance: Should we balance. | ||
2501 | * @sds: variable to hold the statistics for this sched_domain. | ||
2502 | */ | ||
2503 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
2504 | enum cpu_idle_type idle, int *sd_idle, | ||
2505 | const struct cpumask *cpus, int *balance, | ||
2506 | struct sd_lb_stats *sds) | ||
2507 | { | ||
2508 | struct sched_domain *child = sd->child; | ||
2509 | struct sched_group *group = sd->groups; | ||
2510 | struct sg_lb_stats sgs; | ||
2511 | int load_idx, prefer_sibling = 0; | ||
2512 | |||
2513 | if (child && child->flags & SD_PREFER_SIBLING) | ||
2514 | prefer_sibling = 1; | ||
2515 | |||
2516 | init_sd_power_savings_stats(sd, sds, idle); | ||
2517 | load_idx = get_sd_load_idx(sd, idle); | ||
2518 | |||
2519 | do { | ||
2520 | int local_group; | ||
2521 | |||
2522 | local_group = cpumask_test_cpu(this_cpu, | ||
2523 | sched_group_cpus(group)); | ||
2524 | memset(&sgs, 0, sizeof(sgs)); | ||
2525 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
2526 | local_group, cpus, balance, &sgs); | ||
2527 | |||
2528 | if (local_group && !(*balance)) | ||
2529 | return; | ||
2530 | |||
2531 | sds->total_load += sgs.group_load; | ||
2532 | sds->total_pwr += group->cpu_power; | ||
2533 | |||
2534 | /* | ||
2535 | * In case the child domain prefers tasks go to siblings | ||
2536 | * first, lower the group capacity to one so that we'll try | ||
2537 | * and move all the excess tasks away. | ||
2538 | */ | ||
2539 | if (prefer_sibling) | ||
2540 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
2541 | |||
2542 | if (local_group) { | ||
2543 | sds->this_load = sgs.avg_load; | ||
2544 | sds->this = group; | ||
2545 | sds->this_nr_running = sgs.sum_nr_running; | ||
2546 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
2547 | } else if (sgs.avg_load > sds->max_load && | ||
2548 | (sgs.sum_nr_running > sgs.group_capacity || | ||
2549 | sgs.group_imb)) { | ||
2550 | sds->max_load = sgs.avg_load; | ||
2551 | sds->busiest = group; | ||
2552 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
2553 | sds->busiest_group_capacity = sgs.group_capacity; | ||
2554 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
2555 | sds->group_imb = sgs.group_imb; | ||
2556 | } | ||
2557 | |||
2558 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
2559 | group = group->next; | ||
2560 | } while (group != sd->groups); | ||
2561 | } | ||
2562 | |||
2563 | /** | ||
2564 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
2565 | * amongst the groups of a sched_domain, during | ||
2566 | * load balancing. | ||
2567 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
2568 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
2569 | * @imbalance: Variable to store the imbalance. | ||
2570 | */ | ||
2571 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
2572 | int this_cpu, unsigned long *imbalance) | ||
2573 | { | ||
2574 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
2575 | unsigned int imbn = 2; | ||
2576 | unsigned long scaled_busy_load_per_task; | ||
2577 | |||
2578 | if (sds->this_nr_running) { | ||
2579 | sds->this_load_per_task /= sds->this_nr_running; | ||
2580 | if (sds->busiest_load_per_task > | ||
2581 | sds->this_load_per_task) | ||
2582 | imbn = 1; | ||
2583 | } else | ||
2584 | sds->this_load_per_task = | ||
2585 | cpu_avg_load_per_task(this_cpu); | ||
2586 | |||
2587 | scaled_busy_load_per_task = sds->busiest_load_per_task | ||
2588 | * SCHED_LOAD_SCALE; | ||
2589 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | ||
2590 | |||
2591 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | ||
2592 | (scaled_busy_load_per_task * imbn)) { | ||
2593 | *imbalance = sds->busiest_load_per_task; | ||
2594 | return; | ||
2595 | } | ||
2596 | |||
2597 | /* | ||
2598 | * OK, we don't have enough imbalance to justify moving tasks; | ||
2599 | * however, we may be able to increase total CPU power used by | ||
2600 | * moving them. | ||
2601 | */ | ||
2602 | |||
2603 | pwr_now += sds->busiest->cpu_power * | ||
2604 | min(sds->busiest_load_per_task, sds->max_load); | ||
2605 | pwr_now += sds->this->cpu_power * | ||
2606 | min(sds->this_load_per_task, sds->this_load); | ||
2607 | pwr_now /= SCHED_LOAD_SCALE; | ||
2608 | |||
2609 | /* Amount of load we'd subtract */ | ||
2610 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
2611 | sds->busiest->cpu_power; | ||
2612 | if (sds->max_load > tmp) | ||
2613 | pwr_move += sds->busiest->cpu_power * | ||
2614 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
2615 | |||
2616 | /* Amount of load we'd add */ | ||
2617 | if (sds->max_load * sds->busiest->cpu_power < | ||
2618 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
2619 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
2620 | sds->this->cpu_power; | ||
2621 | else | ||
2622 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
2623 | sds->this->cpu_power; | ||
2624 | pwr_move += sds->this->cpu_power * | ||
2625 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
2626 | pwr_move /= SCHED_LOAD_SCALE; | ||
2627 | |||
2628 | /* Move if we gain throughput */ | ||
2629 | if (pwr_move > pwr_now) | ||
2630 | *imbalance = sds->busiest_load_per_task; | ||
2631 | } | ||
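fix_small_imbalance() asks whether moving one task still pays off when the computed imbalance is tiny: it compares throughput as-is (pwr_now) against throughput after a hypothetical one-task move (pwr_move), both in cpu_power units. A toy version of that comparison, with all loads and powers invented and the "load we'd add" term simplified to the branch these particular numbers take:

#include <stdio.h>

#define SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented statistics: the busiest group averages 512 load per
         * task; this group is idle. Powers are one nominal CPU each. */
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long busiest_lpt = 512;        /* load per task */
        unsigned long max_load = 1536, this_load = 0, this_lpt = 512;
        unsigned long pwr_now = 0, pwr_move = 0, tmp;

        pwr_now += busiest_power * min_ul(busiest_lpt, max_load);
        pwr_now += this_power * min_ul(this_lpt, this_load);
        pwr_now /= SCALE;

        /* Load the move would subtract from the busiest group. */
        tmp = busiest_lpt * SCALE / busiest_power;
        if (max_load > tmp)
                pwr_move += busiest_power *
                        min_ul(busiest_lpt, max_load - tmp);

        /* Load the move would add to this group (common branch). */
        tmp = busiest_lpt * SCALE / this_power;
        pwr_move += this_power * min_ul(this_lpt, this_load + tmp);
        pwr_move /= SCALE;

        /* Move one task's worth of load if throughput improves. */
        printf("pwr_now=%lu pwr_move=%lu move=%s\n", pwr_now, pwr_move,
               pwr_move > pwr_now ? "yes" : "no");
        return 0;
}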
2632 | |||
2633 | /** | ||
2634 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
2635 | * groups of a given sched_domain during load balance. | ||
2636 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
2637 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
2638 | * @imbalance: The variable to store the imbalance. | ||
2639 | */ | ||
2640 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
2641 | unsigned long *imbalance) | ||
2642 | { | ||
2643 | unsigned long max_pull, load_above_capacity = ~0UL; | ||
2644 | |||
2645 | sds->busiest_load_per_task /= sds->busiest_nr_running; | ||
2646 | if (sds->group_imb) { | ||
2647 | sds->busiest_load_per_task = | ||
2648 | min(sds->busiest_load_per_task, sds->avg_load); | ||
2649 | } | ||
2650 | |||
2651 | /* | ||
2652 | * In the presence of smp nice balancing, certain scenarios can have | ||
2653 | * max load less than avg load (as we skip the groups at or below | ||
2654 | * its cpu_power while calculating max_load). | ||
2655 | */ | ||
2656 | if (sds->max_load < sds->avg_load) { | ||
2657 | *imbalance = 0; | ||
2658 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
2659 | } | ||
2660 | |||
2661 | if (!sds->group_imb) { | ||
2662 | /* | ||
2663 | * Don't want to pull so many tasks that a group would go idle. | ||
2664 | */ | ||
2665 | load_above_capacity = (sds->busiest_nr_running - | ||
2666 | sds->busiest_group_capacity); | ||
2667 | |||
2668 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | ||
2669 | |||
2670 | load_above_capacity /= sds->busiest->cpu_power; | ||
2671 | } | ||
2672 | |||
2673 | /* | ||
2674 | * We're trying to get all the cpus to the average_load, so we don't | ||
2675 | * want to push ourselves above the average load, nor do we wish to | ||
2676 | * reduce the max loaded cpu below the average load. At the same time, | ||
2677 | * we also don't want to reduce the group load below the group capacity | ||
2678 | * (so that we can implement power-savings policies etc). Thus we look | ||
2679 | * for the minimum possible imbalance. | ||
2680 | * Be careful of negative numbers as they'll appear as very large values | ||
2681 | * with unsigned longs. | ||
2682 | */ | ||
2683 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | ||
2684 | |||
2685 | /* How much load to actually move to equalise the imbalance */ | ||
2686 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
2687 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
2688 | / SCHED_LOAD_SCALE; | ||
2689 | |||
2690 | /* | ||
2691 | * if *imbalance is less than the average load per runnable task | ||
2692 | * there is no guarantee that any tasks will be moved, so we'll have | ||
2693 | * a think about bumping its value to force at least one task to be | ||
2694 | * moved | ||
2695 | */ | ||
2696 | if (*imbalance < sds->busiest_load_per_task) | ||
2697 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
2698 | |||
2699 | } | ||
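calculate_imbalance() clamps the pull by two ceilings: stay at or above the domain average, and don't move more than the load in excess of the busiest group's capacity. It then converts back to task-load units against this group's power. A sketch with invented statistics; the unsigned-underflow caveat in the comment above is exactly why the function bails out to fix_small_imbalance() when max_load < avg_load:

#include <stdio.h>

#define SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented domain statistics, all pre-scaled by SCALE. */
        unsigned long max_load = 2048, avg_load = 1024, this_load = 512;
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long nr_running = 3, capacity = 2;

        /* Load above capacity, rescaled against the group's power. */
        unsigned long above = (nr_running - capacity) * SCALE * SCALE /
                              busiest_power;

        /* Both subtractions are non-negative here. If max_load could
         * drop below avg_load, the difference would wrap to a huge
         * unsigned value, which is why that case is handled first. */
        unsigned long max_pull = min_ul(max_load - avg_load, above);

        unsigned long imbalance = min_ul(max_pull * busiest_power,
                        (avg_load - this_load) * this_power) / SCALE;

        printf("imbalance = %lu\n", imbalance);         /* 512 */
        return 0;
}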
2700 | /******* find_busiest_group() helpers end here *********************/ | ||
2701 | |||
2702 | /** | ||
2703 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
2704 | * if there is an imbalance. If there isn't an imbalance, and | ||
2705 | * the user has opted for power-savings, it returns a group whose | ||
2706 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
2707 | * such a group exists. | ||
2708 | * | ||
2709 | * Also calculates the amount of weighted load which should be moved | ||
2710 | * to restore balance. | ||
2711 | * | ||
2712 | * @sd: The sched_domain whose busiest group is to be returned. | ||
2713 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
2714 | * @imbalance: Variable which stores amount of weighted load which should | ||
2715 | * be moved to restore balance/put a group to idle. | ||
2716 | * @idle: The idle status of this_cpu. | ||
2717 | * @sd_idle: The idleness of sd | ||
2718 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
2719 | * @balance: Pointer to a variable indicating if this_cpu | ||
2720 | * is the appropriate cpu to perform load balancing at this_level. | ||
2721 | * | ||
2722 | * Returns: - the busiest group if imbalance exists. | ||
2723 | * - If no imbalance and user has opted for power-savings balance, | ||
2724 | * return the least loaded group whose CPUs can be | ||
2725 | * put to idle by rebalancing its tasks onto our group. | ||
2726 | */ | ||
2727 | static struct sched_group * | ||
2728 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
2729 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
2730 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
2731 | { | ||
2732 | struct sd_lb_stats sds; | ||
2733 | |||
2734 | memset(&sds, 0, sizeof(sds)); | ||
2735 | |||
2736 | /* | ||
2737 | * Compute the various statistics relevant for load balancing at | ||
2738 | * this level. | ||
2739 | */ | ||
2740 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
2741 | balance, &sds); | ||
2742 | |||
2743 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
2744 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
2745 | * at this level. | ||
2746 | * 2) There is no busy sibling group to pull from. | ||
2747 | * 3) This group is the busiest group. | ||
2748 | * 4) This group is busier than the average busyness at this | ||
2749 | * sched_domain. | ||
2750 | * 5) The imbalance is within the specified limit. | ||
2751 | */ | ||
2752 | if (!(*balance)) | ||
2753 | goto ret; | ||
2754 | |||
2755 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
2756 | goto out_balanced; | ||
2757 | |||
2758 | if (sds.this_load >= sds.max_load) | ||
2759 | goto out_balanced; | ||
2760 | |||
2761 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
2762 | |||
2763 | if (sds.this_load >= sds.avg_load) | ||
2764 | goto out_balanced; | ||
2765 | |||
2766 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
2767 | goto out_balanced; | ||
2768 | |||
2769 | /* Looks like there is an imbalance. Compute it */ | ||
2770 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
2771 | return sds.busiest; | ||
2772 | |||
2773 | out_balanced: | ||
2774 | /* | ||
2775 | * There is no obvious imbalance. But check if we can do some balancing | ||
2776 | * to save power. | ||
2777 | */ | ||
2778 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
2779 | return sds.busiest; | ||
2780 | ret: | ||
2781 | *imbalance = 0; | ||
2782 | return NULL; | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
2787 | */ | ||
2788 | static struct rq * | ||
2789 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
2790 | unsigned long imbalance, const struct cpumask *cpus) | ||
2791 | { | ||
2792 | struct rq *busiest = NULL, *rq; | ||
2793 | unsigned long max_load = 0; | ||
2794 | int i; | ||
2795 | |||
2796 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2797 | unsigned long power = power_of(i); | ||
2798 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
2799 | unsigned long wl; | ||
2800 | |||
2801 | if (!cpumask_test_cpu(i, cpus)) | ||
2802 | continue; | ||
2803 | |||
2804 | rq = cpu_rq(i); | ||
2805 | wl = weighted_cpuload(i); | ||
2806 | |||
2807 | /* | ||
2808 | * When comparing with imbalance, use weighted_cpuload() | ||
2809 | * which is not scaled with the cpu power. | ||
2810 | */ | ||
2811 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
2812 | continue; | ||
2813 | |||
2814 | /* | ||
2815 | * For the load comparisons with the other cpus, consider | ||
2816 | * the weighted_cpuload() scaled with the cpu power, so that | ||
2817 | * the load can be moved away from the cpu that is potentially | ||
2818 | * running at a lower capacity. | ||
2819 | */ | ||
2820 | wl = (wl * SCHED_LOAD_SCALE) / power; | ||
2821 | |||
2822 | if (wl > max_load) { | ||
2823 | max_load = wl; | ||
2824 | busiest = rq; | ||
2825 | } | ||
2826 | } | ||
2827 | |||
2828 | return busiest; | ||
2829 | } | ||
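find_busiest_queue() normalizes each runqueue's weighted load by its cpu_power before comparing, so a CPU running at reduced capacity looks busier for the same raw load. A sketch over three invented CPUs:

#include <stdio.h>

#define SCALE 1024UL

int main(void)
{
        /* Raw weighted load and cpu_power for three invented CPUs. */
        unsigned long wl[]    = { 1024, 1024, 900 };
        unsigned long power[] = { 1024,  512, 1024 };
        unsigned long max_load = 0;
        int i, busiest = -1;

        for (i = 0; i < 3; i++) {
                /* Scale load up on CPUs with less capacity. */
                unsigned long scaled = wl[i] * SCALE / power[i];

                if (scaled > max_load) {
                        max_load = scaled;
                        busiest = i;
                }
        }

        /* CPU 1 wins: same raw load as CPU 0, but half the power. */
        printf("busiest=%d scaled_load=%lu\n", busiest, max_load);
        return 0;
}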
2830 | |||
2831 | /* | ||
2832 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | ||
2833 | * so long as it is large enough. | ||
2834 | */ | ||
2835 | #define MAX_PINNED_INTERVAL 512 | ||
2836 | |||
2837 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
2838 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
2839 | |||
2840 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | ||
2841 | { | ||
2842 | if (idle == CPU_NEWLY_IDLE) { | ||
2843 | /* | ||
2844 | * The only task running in a non-idle cpu can be moved to this | ||
2845 | * cpu in an attempt to completely free up the other CPU | ||
2846 | * package. | ||
2847 | * | ||
2848 | * The package power saving logic comes from | ||
2849 | * find_busiest_group(). If there is no imbalance, then | ||
2850 | * f_b_g() will return NULL. However, when sched_mc={1,2}, | ||
2851 | * f_b_g() will select a group from which a running task may be | ||
2852 | * pulled to this cpu in order to make the other package idle. | ||
2853 | * If there is no opportunity to make a package idle and if | ||
2854 | * there is no imbalance, then f_b_g() will return NULL and no | ||
2855 | * action will be taken in load_balance_newidle(). | ||
2856 | * | ||
2857 | * Under normal task pull operation due to imbalance, there | ||
2858 | * will be more than one task in the source run queue and | ||
2859 | * move_tasks() will succeed. ld_moved will be true and this | ||
2860 | * active balance code will not be triggered. | ||
2861 | */ | ||
2862 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
2863 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2864 | return 0; | ||
2865 | |||
2866 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
2867 | return 0; | ||
2868 | } | ||
2869 | |||
2870 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | ||
2871 | } | ||
2872 | |||
2873 | /* | ||
2874 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
2875 | * tasks if there is an imbalance. | ||
2876 | */ | ||
2877 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
2878 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2879 | int *balance) | ||
2880 | { | ||
2881 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
2882 | struct sched_group *group; | ||
2883 | unsigned long imbalance; | ||
2884 | struct rq *busiest; | ||
2885 | unsigned long flags; | ||
2886 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
2887 | |||
2888 | cpumask_copy(cpus, cpu_active_mask); | ||
2889 | |||
2890 | /* | ||
2891 | * When power savings policy is enabled for the parent domain, idle | ||
2892 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
2893 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
2894 | * portraying it as CPU_NOT_IDLE. | ||
2895 | */ | ||
2896 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
2897 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2898 | sd_idle = 1; | ||
2899 | |||
2900 | schedstat_inc(sd, lb_count[idle]); | ||
2901 | |||
2902 | redo: | ||
2903 | update_shares(sd); | ||
2904 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
2905 | cpus, balance); | ||
2906 | |||
2907 | if (*balance == 0) | ||
2908 | goto out_balanced; | ||
2909 | |||
2910 | if (!group) { | ||
2911 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
2912 | goto out_balanced; | ||
2913 | } | ||
2914 | |||
2915 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
2916 | if (!busiest) { | ||
2917 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
2918 | goto out_balanced; | ||
2919 | } | ||
2920 | |||
2921 | BUG_ON(busiest == this_rq); | ||
2922 | |||
2923 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
2924 | |||
2925 | ld_moved = 0; | ||
2926 | if (busiest->nr_running > 1) { | ||
2927 | /* | ||
2928 | * Attempt to move tasks. If find_busiest_group has found | ||
2929 | * an imbalance but busiest->nr_running <= 1, the group is | ||
2930 | * still unbalanced. ld_moved simply stays zero, so it is | ||
2931 | * correctly treated as an imbalance. | ||
2932 | */ | ||
2933 | local_irq_save(flags); | ||
2934 | double_rq_lock(this_rq, busiest); | ||
2935 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
2936 | imbalance, sd, idle, &all_pinned); | ||
2937 | double_rq_unlock(this_rq, busiest); | ||
2938 | local_irq_restore(flags); | ||
2939 | |||
2940 | /* | ||
2941 | * some other cpu did the load balance for us. | ||
2942 | */ | ||
2943 | if (ld_moved && this_cpu != smp_processor_id()) | ||
2944 | resched_cpu(this_cpu); | ||
2945 | |||
2946 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
2947 | if (unlikely(all_pinned)) { | ||
2948 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
2949 | if (!cpumask_empty(cpus)) | ||
2950 | goto redo; | ||
2951 | goto out_balanced; | ||
2952 | } | ||
2953 | } | ||
2954 | |||
2955 | if (!ld_moved) { | ||
2956 | schedstat_inc(sd, lb_failed[idle]); | ||
2957 | sd->nr_balance_failed++; | ||
2958 | |||
2959 | if (need_active_balance(sd, sd_idle, idle)) { | ||
2960 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
2961 | |||
2962 | /* don't kick the migration_thread if the curr | ||
2963 | * task on busiest cpu can't be moved to this_cpu | ||
2964 | */ | ||
2965 | if (!cpumask_test_cpu(this_cpu, | ||
2966 | &busiest->curr->cpus_allowed)) { | ||
2967 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
2968 | flags); | ||
2969 | all_pinned = 1; | ||
2970 | goto out_one_pinned; | ||
2971 | } | ||
2972 | |||
2973 | if (!busiest->active_balance) { | ||
2974 | busiest->active_balance = 1; | ||
2975 | busiest->push_cpu = this_cpu; | ||
2976 | active_balance = 1; | ||
2977 | } | ||
2978 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
2979 | if (active_balance) | ||
2980 | wake_up_process(busiest->migration_thread); | ||
2981 | |||
2982 | /* | ||
2983 | * We've kicked active balancing, reset the failure | ||
2984 | * counter. | ||
2985 | */ | ||
2986 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
2987 | } | ||
2988 | } else | ||
2989 | sd->nr_balance_failed = 0; | ||
2990 | |||
2991 | if (likely(!active_balance)) { | ||
2992 | /* We were unbalanced, so reset the balancing interval */ | ||
2993 | sd->balance_interval = sd->min_interval; | ||
2994 | } else { | ||
2995 | /* | ||
2996 | * If we've begun active balancing, start to back off. This | ||
2997 | * case may not be covered by the all_pinned logic if there | ||
2998 | * is only 1 task on the busy runqueue (because we don't call | ||
2999 | * move_tasks). | ||
3000 | */ | ||
3001 | if (sd->balance_interval < sd->max_interval) | ||
3002 | sd->balance_interval *= 2; | ||
3003 | } | ||
3004 | |||
3005 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3006 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3007 | ld_moved = -1; | ||
3008 | |||
3009 | goto out; | ||
3010 | |||
3011 | out_balanced: | ||
3012 | schedstat_inc(sd, lb_balanced[idle]); | ||
3013 | |||
3014 | sd->nr_balance_failed = 0; | ||
3015 | |||
3016 | out_one_pinned: | ||
3017 | /* tune up the balancing interval */ | ||
3018 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
3019 | (sd->balance_interval < sd->max_interval)) | ||
3020 | sd->balance_interval *= 2; | ||
3021 | |||
3022 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3023 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3024 | ld_moved = -1; | ||
3025 | else | ||
3026 | ld_moved = 0; | ||
3027 | out: | ||
3028 | if (ld_moved) | ||
3029 | update_shares(sd); | ||
3030 | return ld_moved; | ||
3031 | } | ||
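On its failure paths, load_balance() above doubles the balancing interval (bounded by max_interval, or by MAX_PINNED_INTERVAL when every task was pinned) and resets it to min_interval once a balance succeeds. A merged sketch of that adaptive back-off; the interval values are invented and treated as plain milliseconds:

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

struct toy_sd {
        unsigned long balance_interval; /* ms, for this sketch */
        unsigned long min_interval;
        unsigned long max_interval;
};

static void balance_outcome(struct toy_sd *sd, int moved, int all_pinned)
{
        if (moved) {
                /* We were unbalanced: probe again soon. */
                sd->balance_interval = sd->min_interval;
        } else if ((all_pinned &&
                    sd->balance_interval < MAX_PINNED_INTERVAL) ||
                   sd->balance_interval < sd->max_interval) {
                /* Nothing movable: back off exponentially. */
                sd->balance_interval *= 2;
        }
}

int main(void)
{
        struct toy_sd sd = { 8, 8, 64 };
        int i;

        for (i = 0; i < 5; i++) {
                balance_outcome(&sd, 0, 0);
                printf("interval=%lu\n", sd.balance_interval);
        }
        /* Prints 16, 32, 64, 64, 64 (clamped), then 8 after success. */
        balance_outcome(&sd, 1, 0);
        printf("after success: %lu\n", sd.balance_interval);
        return 0;
}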
3032 | |||
3033 | /* | ||
3034 | * idle_balance is called by schedule() if this_cpu is about to become | ||
3035 | * idle. Attempts to pull tasks from other CPUs. | ||
3036 | */ | ||
3037 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
3038 | { | ||
3039 | struct sched_domain *sd; | ||
3040 | int pulled_task = 0; | ||
3041 | unsigned long next_balance = jiffies + HZ; | ||
3042 | |||
3043 | this_rq->idle_stamp = this_rq->clock; | ||
3044 | |||
3045 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
3046 | return; | ||
3047 | |||
3048 | /* | ||
3049 | * Drop the rq->lock, but keep IRQ/preempt disabled. | ||
3050 | */ | ||
3051 | raw_spin_unlock(&this_rq->lock); | ||
3052 | |||
3053 | for_each_domain(this_cpu, sd) { | ||
3054 | unsigned long interval; | ||
3055 | int balance = 1; | ||
3056 | |||
3057 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
3058 | continue; | ||
3059 | |||
3060 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
3061 | /* If we've pulled tasks over stop searching: */ | ||
3062 | pulled_task = load_balance(this_cpu, this_rq, | ||
3063 | sd, CPU_NEWLY_IDLE, &balance); | ||
3064 | } | ||
3065 | |||
3066 | interval = msecs_to_jiffies(sd->balance_interval); | ||
3067 | if (time_after(next_balance, sd->last_balance + interval)) | ||
3068 | next_balance = sd->last_balance + interval; | ||
3069 | if (pulled_task) { | ||
3070 | this_rq->idle_stamp = 0; | ||
3071 | break; | ||
3072 | } | ||
3073 | } | ||
3074 | |||
3075 | raw_spin_lock(&this_rq->lock); | ||
3076 | |||
3077 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
3078 | /* | ||
3079 | * We are going idle. next_balance may be set based on | ||
3080 | * a busy processor. So reset next_balance. | ||
3081 | */ | ||
3082 | this_rq->next_balance = next_balance; | ||
3083 | } | ||
3084 | } | ||
3085 | |||
3086 | /* | ||
3087 | * active_load_balance is run by migration threads. It pushes running tasks | ||
3088 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
3089 | * running on each physical CPU where possible, and avoids physical / | ||
3090 | * logical imbalances. | ||
3091 | * | ||
3092 | * Called with busiest_rq locked. | ||
3093 | */ | ||
3094 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
3095 | { | ||
3096 | int target_cpu = busiest_rq->push_cpu; | ||
3097 | struct sched_domain *sd; | ||
3098 | struct rq *target_rq; | ||
3099 | |||
3100 | /* Is there any task to move? */ | ||
3101 | if (busiest_rq->nr_running <= 1) | ||
3102 | return; | ||
3103 | |||
3104 | target_rq = cpu_rq(target_cpu); | ||
3105 | |||
3106 | /* | ||
3107 | * This condition is "impossible", if it occurs | ||
3108 | * we need to fix it. Originally reported by | ||
3109 | * Bjorn Helgaas on a 128-cpu setup. | ||
3110 | */ | ||
3111 | BUG_ON(busiest_rq == target_rq); | ||
3112 | |||
3113 | /* move a task from busiest_rq to target_rq */ | ||
3114 | double_lock_balance(busiest_rq, target_rq); | ||
3115 | update_rq_clock(busiest_rq); | ||
3116 | update_rq_clock(target_rq); | ||
3117 | |||
3118 | /* Search for an sd spanning us and the target CPU. */ | ||
3119 | for_each_domain(target_cpu, sd) { | ||
3120 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
3121 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
3122 | break; | ||
3123 | } | ||
3124 | |||
3125 | if (likely(sd)) { | ||
3126 | schedstat_inc(sd, alb_count); | ||
3127 | |||
3128 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
3129 | sd, CPU_IDLE)) | ||
3130 | schedstat_inc(sd, alb_pushed); | ||
3131 | else | ||
3132 | schedstat_inc(sd, alb_failed); | ||
3133 | } | ||
3134 | double_unlock_balance(busiest_rq, target_rq); | ||
3135 | } | ||
3136 | |||
3137 | #ifdef CONFIG_NO_HZ | ||
3138 | static struct { | ||
3139 | atomic_t load_balancer; | ||
3140 | cpumask_var_t cpu_mask; | ||
3141 | cpumask_var_t ilb_grp_nohz_mask; | ||
3142 | } nohz ____cacheline_aligned = { | ||
3143 | .load_balancer = ATOMIC_INIT(-1), | ||
3144 | }; | ||
3145 | |||
3146 | int get_nohz_load_balancer(void) | ||
3147 | { | ||
3148 | return atomic_read(&nohz.load_balancer); | ||
3149 | } | ||
3150 | |||
3151 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3152 | /** | ||
3153 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
3154 | * @cpu: The cpu whose lowest level of sched domain is to | ||
3155 | * be returned. | ||
3156 | * @flag: The flag to check for the lowest sched_domain | ||
3157 | * for the given cpu. | ||
3158 | * | ||
3159 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
3160 | */ | ||
3161 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
3162 | { | ||
3163 | struct sched_domain *sd; | ||
3164 | |||
3165 | for_each_domain(cpu, sd) | ||
3166 | if (sd && (sd->flags & flag)) | ||
3167 | break; | ||
3168 | |||
3169 | return sd; | ||
3170 | } | ||
3171 | |||
3172 | /** | ||
3173 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
3174 | * @cpu: The cpu whose domains we're iterating over. | ||
3175 | * @sd: variable holding the value of the power_savings_sd | ||
3176 | * for cpu. | ||
3177 | * @flag: The flag to filter the sched_domains to be iterated. | ||
3178 | * | ||
3179 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
3180 | * set, starting from the lowest sched_domain to the highest. | ||
3181 | */ | ||
3182 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
3183 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
3184 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
3185 | |||
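for_each_flag_domain() starts at the lowest domain carrying the flag and follows ->parent pointers while the flag stays set. A toy version over a hand-built parent chain; the domain names and flag bit are invented, and the real macro obtains its starting point from lowest_flag_domain():

#include <stdio.h>

#define FLAG_POWERSAVE 0x1

struct dom { struct dom *parent; int flags; const char *name; };

/* Walk upward from start while the flag remains set. */
#define for_each_flag_dom(sd, start, flag) \
        for (sd = (start); sd && (sd->flags & (flag)); sd = sd->parent)

int main(void)
{
        struct dom numa = { 0,     0,              "numa" };
        struct dom cpu  = { &numa, FLAG_POWERSAVE, "cpu"  };
        struct dom mc   = { &cpu,  FLAG_POWERSAVE, "mc"   };
        struct dom *sd;

        /* Prints "mc" then "cpu"; stops at "numa", which lacks the flag. */
        for_each_flag_dom(sd, &mc, FLAG_POWERSAVE)
                printf("%s\n", sd->name);
        return 0;
}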
3186 | /** | ||
3187 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
3188 | * @ilb_group: group to be checked for semi-idleness | ||
3189 | * | ||
3190 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
3191 | * | ||
3192 | * We define a sched_group to be semi-idle if it has at least one idle CPU | ||
3193 | * and at least one non-idle CPU. This helper function checks if the given | ||
3194 | * sched_group is semi-idle or not. | ||
3195 | */ | ||
3196 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
3197 | { | ||
3198 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
3199 | sched_group_cpus(ilb_group)); | ||
3200 | |||
3201 | /* | ||
3202 | * A sched_group is semi-idle when it has at least one busy cpu | ||
3203 | * and at least one idle cpu. | ||
3204 | */ | ||
3205 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
3206 | return 0; | ||
3207 | |||
3208 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
3209 | return 0; | ||
3210 | |||
3211 | return 1; | ||
3212 | } | ||
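The semi-idle test is a pair of cpumask comparisons: the intersection of the group with the tickless set must be neither empty nor the whole group. The same logic with plain 64-bit masks standing in for cpumasks:

#include <stdio.h>
#include <stdint.h>

/* 1 bit per CPU; a group is semi-idle when its intersection with the
 * idle set is neither empty nor the whole group. */
static int is_semi_idle(uint64_t group, uint64_t idle)
{
        uint64_t both = group & idle;

        if (both == 0)          /* no idle CPU in the group */
                return 0;
        if (both == group)      /* every CPU in the group is idle */
                return 0;
        return 1;
}

int main(void)
{
        uint64_t group = 0x0f;  /* CPUs 0-3 */

        printf("%d\n", is_semi_idle(group, 0x00)); /* 0: all busy  */
        printf("%d\n", is_semi_idle(group, 0x03)); /* 1: semi-idle */
        printf("%d\n", is_semi_idle(group, 0x0f)); /* 0: all idle  */
        return 0;
}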
3213 | /** | ||
3214 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
3215 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
3216 | * | ||
3217 | * Returns: The id of the idle load balancer if it exists; | ||
3218 | * else, returns >= nr_cpu_ids. | ||
3219 | * | ||
3220 | * This algorithm picks the idle load balancer such that it belongs to a | ||
3221 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
3222 | * completely idle packages/cores just for the purpose of idle load balancing | ||
3223 | * when there are other idle cpus which are better suited for that job. | ||
3224 | */ | ||
3225 | static int find_new_ilb(int cpu) | ||
3226 | { | ||
3227 | struct sched_domain *sd; | ||
3228 | struct sched_group *ilb_group; | ||
3229 | |||
3230 | /* | ||
3231 | * Have idle load balancer selection from semi-idle packages only | ||
3232 | * when power-aware load balancing is enabled | ||
3233 | */ | ||
3234 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
3235 | goto out_done; | ||
3236 | |||
3237 | /* | ||
3238 | * Optimize for the case when we have no idle CPUs or only one | ||
3239 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
3240 | */ | ||
3241 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
3242 | goto out_done; | ||
3243 | |||
3244 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
3245 | ilb_group = sd->groups; | ||
3246 | |||
3247 | do { | ||
3248 | if (is_semi_idle_group(ilb_group)) | ||
3249 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
3250 | |||
3251 | ilb_group = ilb_group->next; | ||
3252 | |||
3253 | } while (ilb_group != sd->groups); | ||
3254 | } | ||
3255 | |||
3256 | out_done: | ||
3257 | return cpumask_first(nohz.cpu_mask); | ||
3258 | } | ||
3259 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
3260 | static inline int find_new_ilb(int call_cpu) | ||
3261 | { | ||
3262 | return cpumask_first(nohz.cpu_mask); | ||
3263 | } | ||
3264 | #endif | ||
3265 | |||
3266 | /* | ||
3267 | * This routine will try to nominate the ilb (idle load balancing) | ||
3268 | * owner among the cpus whose ticks are stopped. The ilb owner will do the idle | ||
3269 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
3270 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3271 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3272 | * arrives... | ||
3273 | * | ||
3274 | * For the ilb owner, the tick is not stopped, and this tick will be | ||
3275 | * used for idle load balancing. The ilb owner will still be part of | ||
3276 | * nohz.cpu_mask. | ||
3277 | * | ||
3278 | * While stopping the tick, this cpu will become the ilb owner if there | ||
3279 | * is no other owner, and will remain the owner until this cpu becomes | ||
3280 | * busy or all cpus in the system stop their ticks, at which point | ||
3281 | * there is no need for an ilb owner. | ||
3282 | * | ||
3283 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
3284 | * next busy scheduler_tick() | ||
3285 | */ | ||
3286 | int select_nohz_load_balancer(int stop_tick) | ||
3287 | { | ||
3288 | int cpu = smp_processor_id(); | ||
3289 | |||
3290 | if (stop_tick) { | ||
3291 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3292 | |||
3293 | if (!cpu_active(cpu)) { | ||
3294 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
3295 | return 0; | ||
3296 | |||
3297 | /* | ||
3298 | * If we are going offline and still the leader, | ||
3299 | * give up! | ||
3300 | */ | ||
3301 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3302 | BUG(); | ||
3303 | |||
3304 | return 0; | ||
3305 | } | ||
3306 | |||
3307 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
3308 | |||
3309 | /* time for ilb owner also to sleep */ | ||
3310 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
3311 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3312 | atomic_set(&nohz.load_balancer, -1); | ||
3313 | return 0; | ||
3314 | } | ||
3315 | |||
3316 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3317 | /* make me the ilb owner */ | ||
3318 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3319 | return 1; | ||
3320 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3321 | int new_ilb; | ||
3322 | |||
3323 | if (!(sched_smt_power_savings || | ||
3324 | sched_mc_power_savings)) | ||
3325 | return 1; | ||
3326 | /* | ||
3327 | * Check to see if there is a more power-efficient | ||
3328 | * ilb. | ||
3329 | */ | ||
3330 | new_ilb = find_new_ilb(cpu); | ||
3331 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
3332 | atomic_set(&nohz.load_balancer, -1); | ||
3333 | resched_cpu(new_ilb); | ||
3334 | return 0; | ||
3335 | } | ||
3336 | return 1; | ||
3337 | } | ||
3338 | } else { | ||
3339 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3340 | return 0; | ||
3341 | |||
3342 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3343 | |||
3344 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3345 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3346 | BUG(); | ||
3347 | } | ||
3348 | return 0; | ||
3349 | } | ||
3350 | #endif | ||
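The ilb owner is elected lock-free: a CPU becomes owner only if nohz.load_balancer still holds -1, and only the current owner may clear it back. A userspace analogue using C11 atomics; this is a sketch of the idea, not the kernel's atomic_cmpxchg() API:

#include <stdio.h>
#include <stdatomic.h>

static atomic_int load_balancer = -1;

/* Try to become the idle-load-balance owner; returns 1 on success. */
static int claim_ilb(int cpu)
{
        int expected = -1;

        /* Succeeds only if no owner was installed in the meantime. */
        return atomic_compare_exchange_strong(&load_balancer,
                                              &expected, cpu);
}

/* Give up ownership; only the current owner succeeds. */
static int release_ilb(int cpu)
{
        int expected = cpu;

        return atomic_compare_exchange_strong(&load_balancer,
                                              &expected, -1);
}

int main(void)
{
        printf("cpu2 claims: %d\n", claim_ilb(2));   /* 1: slot was free */
        printf("cpu3 claims: %d\n", claim_ilb(3));   /* 0: already owned */
        printf("cpu3 frees:  %d\n", release_ilb(3)); /* 0: not the owner */
        printf("cpu2 frees:  %d\n", release_ilb(2)); /* 1 */
        return 0;
}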
3351 | |||
3352 | static DEFINE_SPINLOCK(balancing); | ||
3353 | |||
3354 | /* | ||
3355 | * It checks each scheduling domain to see if it is due to be balanced, | ||
3356 | * and initiates a balancing operation if so. | ||
3357 | * | ||
3358 | * Balancing parameters are set up in arch_init_sched_domains. | ||
3359 | */ | ||
3360 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
3361 | { | ||
3362 | int balance = 1; | ||
3363 | struct rq *rq = cpu_rq(cpu); | ||
3364 | unsigned long interval; | ||
3365 | struct sched_domain *sd; | ||
3366 | /* Earliest time when we have to do rebalance again */ | ||
3367 | unsigned long next_balance = jiffies + 60*HZ; | ||
3368 | int update_next_balance = 0; | ||
3369 | int need_serialize; | ||
3370 | |||
3371 | for_each_domain(cpu, sd) { | ||
3372 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
3373 | continue; | ||
3374 | |||
3375 | interval = sd->balance_interval; | ||
3376 | if (idle != CPU_IDLE) | ||
3377 | interval *= sd->busy_factor; | ||
3378 | |||
3379 | /* scale ms to jiffies */ | ||
3380 | interval = msecs_to_jiffies(interval); | ||
3381 | if (unlikely(!interval)) | ||
3382 | interval = 1; | ||
3383 | if (interval > HZ*NR_CPUS/10) | ||
3384 | interval = HZ*NR_CPUS/10; | ||
3385 | |||
3386 | need_serialize = sd->flags & SD_SERIALIZE; | ||
3387 | |||
3388 | if (need_serialize) { | ||
3389 | if (!spin_trylock(&balancing)) | ||
3390 | goto out; | ||
3391 | } | ||
3392 | |||
3393 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
3394 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
3395 | /* | ||
3396 | * We've pulled tasks over so either we're no | ||
3397 | * longer idle, or one of our SMT siblings is | ||
3398 | * not idle. | ||
3399 | */ | ||
3400 | idle = CPU_NOT_IDLE; | ||
3401 | } | ||
3402 | sd->last_balance = jiffies; | ||
3403 | } | ||
3404 | if (need_serialize) | ||
3405 | spin_unlock(&balancing); | ||
3406 | out: | ||
3407 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
3408 | next_balance = sd->last_balance + interval; | ||
3409 | update_next_balance = 1; | ||
3410 | } | ||
3411 | |||
3412 | /* | ||
3413 | * Stop the load balance at this level. There is another | ||
3414 | * CPU in our sched group which is doing load balancing more | ||
3415 | * actively. | ||
3416 | */ | ||
3417 | if (!balance) | ||
3418 | break; | ||
3419 | } | ||
3420 | |||
3421 | /* | ||
3422 | * next_balance will be updated only when there is a need. | ||
3423 | * When the cpu is attached to a null domain, for example, it will not be | ||
3424 | * updated. | ||
3425 | */ | ||
3426 | if (likely(update_next_balance)) | ||
3427 | rq->next_balance = next_balance; | ||
3428 | } | ||
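rebalance_domains() stretches the per-domain interval by busy_factor when the CPU isn't idle, converts milliseconds to jiffies, and clamps the result to [1, HZ*NR_CPUS/10]. A sketch of that computation; the HZ, NR_CPUS and factor values below are invented:

#include <stdio.h>

#define HZ 250
#define NR_CPUS 8

/* ms -> jiffies at HZ ticks per second (rounded up). */
static unsigned long msecs_to_jiffies(unsigned long ms)
{
        return (ms * HZ + 999) / 1000;
}

static unsigned long balance_interval(unsigned long base_ms,
                                      unsigned long busy_factor,
                                      int cpu_idle)
{
        unsigned long interval = base_ms;

        if (!cpu_idle)          /* balance less often when busy */
                interval *= busy_factor;

        interval = msecs_to_jiffies(interval);
        if (!interval)          /* never allow a zero interval  */
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;

        return interval;
}

int main(void)
{
        printf("idle: %lu jiffies\n", balance_interval(8, 32, 1)); /* 2  */
        printf("busy: %lu jiffies\n", balance_interval(8, 32, 0)); /* 64 */
        return 0;
}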
3429 | |||
3430 | /* | ||
3431 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3432 | * In the CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3433 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
3434 | */ | ||
3435 | static void run_rebalance_domains(struct softirq_action *h) | ||
3436 | { | ||
3437 | int this_cpu = smp_processor_id(); | ||
3438 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3439 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
3440 | CPU_IDLE : CPU_NOT_IDLE; | ||
3441 | |||
3442 | rebalance_domains(this_cpu, idle); | ||
3443 | |||
3444 | #ifdef CONFIG_NO_HZ | ||
3445 | /* | ||
3446 | * If this cpu is the owner for idle load balancing, then do the | ||
3447 | * balancing on behalf of the other idle cpus whose ticks are | ||
3448 | * stopped. | ||
3449 | */ | ||
3450 | if (this_rq->idle_at_tick && | ||
3451 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3452 | struct rq *rq; | ||
3453 | int balance_cpu; | ||
3454 | |||
3455 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3456 | if (balance_cpu == this_cpu) | ||
3457 | continue; | ||
3458 | |||
3459 | /* | ||
3460 | * If this cpu gets work to do, stop the load balancing | ||
3461 | * work being done for other cpus. The next load | ||
3462 | * balancing owner will pick it up. | ||
3463 | */ | ||
3464 | if (need_resched()) | ||
3465 | break; | ||
3466 | |||
3467 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3468 | |||
3469 | rq = cpu_rq(balance_cpu); | ||
3470 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3471 | this_rq->next_balance = rq->next_balance; | ||
3472 | } | ||
3473 | } | ||
3474 | #endif | ||
3475 | } | ||
3476 | |||
3477 | static inline int on_null_domain(int cpu) | ||
3478 | { | ||
3479 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | ||
3480 | } | ||
3481 | |||
3482 | /* | ||
3483 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
3484 | * | ||
3485 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3486 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3487 | * if the whole system is idle. | ||
3488 | */ | ||
3489 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
3490 | { | ||
3491 | #ifdef CONFIG_NO_HZ | ||
3492 | /* | ||
3493 | * If we were in the nohz mode recently and busy at the current | ||
3494 | * scheduler tick, then check if we need to nominate new idle | ||
3495 | * load balancer. | ||
3496 | */ | ||
3497 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3498 | rq->in_nohz_recently = 0; | ||
3499 | |||
3500 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3501 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3502 | atomic_set(&nohz.load_balancer, -1); | ||
3503 | } | ||
3504 | |||
3505 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3506 | int ilb = find_new_ilb(cpu); | ||
3507 | |||
3508 | if (ilb < nr_cpu_ids) | ||
3509 | resched_cpu(ilb); | ||
3510 | } | ||
3511 | } | ||
3512 | |||
3513 | /* | ||
3514 | * If this cpu is idle and doing idle load balancing for all the | ||
3515 | * cpus with ticks stopped, is it time for that to stop? | ||
3516 | */ | ||
3517 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3518 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3519 | resched_cpu(cpu); | ||
3520 | return; | ||
3521 | } | ||
3522 | |||
3523 | /* | ||
3524 | * If this cpu is idle and the idle load balancing is done by | ||
3525 | * someone else, then there is no need to raise the SCHED_SOFTIRQ | ||
3526 | */ | ||
3527 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3528 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3529 | return; | ||
3530 | #endif | ||
3531 | /* Don't need to rebalance while attached to NULL domain */ | ||
3532 | if (time_after_eq(jiffies, rq->next_balance) && | ||
3533 | likely(!on_null_domain(cpu))) | ||
3534 | raise_softirq(SCHED_SOFTIRQ); | ||
3535 | } | ||
1954 | 3536 | ||
1955 | static void rq_online_fair(struct rq *rq) | 3537 | static void rq_online_fair(struct rq *rq) |
1956 | { | 3538 | { |
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq) | |||
1962 | update_sysctl(); | 3544 | update_sysctl(); |
1963 | } | 3545 | } |
1964 | 3546 | ||
3547 | #else /* CONFIG_SMP */ | ||
3548 | |||
3549 | /* | ||
3550 | * on UP we do not need to balance between CPUs: | ||
3551 | */ | ||
3552 | static inline void idle_balance(int cpu, struct rq *rq) | ||
3553 | { | ||
3554 | } | ||
3555 | |||
1965 | #endif /* CONFIG_SMP */ | 3556 | #endif /* CONFIG_SMP */ |
1966 | 3557 | ||
1967 | /* | 3558 | /* |
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq) | |||
2076 | } | 3667 | } |
2077 | #endif | 3668 | #endif |
2078 | 3669 | ||
2079 | unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 3670 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
2080 | { | 3671 | { |
2081 | struct sched_entity *se = &task->se; | 3672 | struct sched_entity *se = &task->se; |
2082 | unsigned int rr_interval = 0; | 3673 | unsigned int rr_interval = 0; |
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = { | |||
2108 | #ifdef CONFIG_SMP | 3699 | #ifdef CONFIG_SMP |
2109 | .select_task_rq = select_task_rq_fair, | 3700 | .select_task_rq = select_task_rq_fair, |
2110 | 3701 | ||
2111 | .load_balance = load_balance_fair, | ||
2112 | .move_one_task = move_one_task_fair, | ||
2113 | .rq_online = rq_online_fair, | 3702 | .rq_online = rq_online_fair, |
2114 | .rq_offline = rq_offline_fair, | 3703 | .rq_offline = rq_offline_fair, |
2115 | 3704 | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5f93b570d383..a8a6d8a50947 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | |||
44 | { | 44 | { |
45 | } | 45 | } |
46 | 46 | ||
47 | #ifdef CONFIG_SMP | ||
48 | static unsigned long | ||
49 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
50 | unsigned long max_load_move, | ||
51 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
52 | int *all_pinned, int *this_best_prio) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int | ||
58 | move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
59 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
60 | { | ||
61 | return 0; | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | 47 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
66 | { | 48 | { |
67 | } | 49 | } |
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, | |||
97 | check_preempt_curr(rq, p, 0); | 79 | check_preempt_curr(rq, p, 0); |
98 | } | 80 | } |
99 | 81 | ||
100 | unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
101 | { | 83 | { |
102 | return 0; | 84 | return 0; |
103 | } | 85 | } |
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { | |||
119 | 101 | ||
120 | #ifdef CONFIG_SMP | 102 | #ifdef CONFIG_SMP |
121 | .select_task_rq = select_task_rq_idle, | 103 | .select_task_rq = select_task_rq_idle, |
122 | |||
123 | .load_balance = load_balance_idle, | ||
124 | .move_one_task = move_one_task_idle, | ||
125 | #endif | 104 | #endif |
126 | 105 | ||
127 | .set_curr_task = set_curr_task_idle, | 106 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f48328ac216f..b5b920ae2ea7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
194 | return rt_se->my_q; | 194 | return rt_se->my_q; |
195 | } | 195 | } |
196 | 196 | ||
197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | 197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); |
198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | 198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); |
199 | 199 | ||
200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
201 | { | 201 | { |
202 | int this_cpu = smp_processor_id(); | ||
202 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 203 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
203 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 204 | struct sched_rt_entity *rt_se; |
205 | |||
206 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
204 | 207 | ||
205 | if (rt_rq->rt_nr_running) { | 208 | if (rt_rq->rt_nr_running) { |
206 | if (rt_se && !on_rt_rq(rt_se)) | 209 | if (rt_se && !on_rt_rq(rt_se)) |
207 | enqueue_rt_entity(rt_se); | 210 | enqueue_rt_entity(rt_se, false); |
208 | if (rt_rq->highest_prio.curr < curr->prio) | 211 | if (rt_rq->highest_prio.curr < curr->prio) |
209 | resched_task(curr); | 212 | resched_task(curr); |
210 | } | 213 | } |
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
212 | 215 | ||
213 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 216 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
214 | { | 217 | { |
215 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 218 | int this_cpu = smp_processor_id(); |
219 | struct sched_rt_entity *rt_se; | ||
220 | |||
221 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
216 | 222 | ||
217 | if (rt_se && on_rt_rq(rt_se)) | 223 | if (rt_se && on_rt_rq(rt_se)) |
218 | dequeue_rt_entity(rt_se); | 224 | dequeue_rt_entity(rt_se); |
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
803 | dec_rt_group(rt_se, rt_rq); | 809 | dec_rt_group(rt_se, rt_rq); |
804 | } | 810 | } |
805 | 811 | ||
806 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | 812 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
807 | { | 813 | { |
808 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 814 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
809 | struct rt_prio_array *array = &rt_rq->active; | 815 | struct rt_prio_array *array = &rt_rq->active; |
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
819 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
820 | return; | 826 | return; |
821 | 827 | ||
822 | list_add_tail(&rt_se->run_list, queue); | 828 | if (head) |
829 | list_add(&rt_se->run_list, queue); | ||
830 | else | ||
831 | list_add_tail(&rt_se->run_list, queue); | ||
823 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 832 | __set_bit(rt_se_prio(rt_se), array->bitmap); |
824 | 833 | ||
825 | inc_rt_tasks(rt_se, rt_rq); | 834 | inc_rt_tasks(rt_se, rt_rq); |
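The new head flag above chooses between list_add(), which queues the entity at the front of its priority list so it runs next among tasks of equal priority, and the old list_add_tail() behaviour. A toy circular list (a hand-rolled stand-in for the kernel's list.h helpers) showing the difference:

#include <stdio.h>

struct node { struct node *prev, *next; int id; };

static void list_init(struct node *h) { h->prev = h->next = h; }

/* Insert n right after the head: runs first at this priority. */
static void list_add(struct node *n, struct node *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

/* Insert n right before the head: runs last at this priority. */
static void list_add_tail(struct node *n, struct node *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

int main(void)
{
        struct node q, a = { 0, 0, 1 }, b = { 0, 0, 2 }, c = { 0, 0, 3 };
        struct node *p;

        list_init(&q);
        list_add_tail(&a, &q);  /* normal enqueue: 1     */
        list_add_tail(&b, &q);  /* normal enqueue: 1 2   */
        list_add(&c, &q);       /* head enqueue:   3 1 2 */

        for (p = q.next; p != &q; p = p->next)
                printf("%d ", p->id);
        printf("\n");
        return 0;
}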
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
856 | } | 865 | } |
857 | } | 866 | } |
858 | 867 | ||
859 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | 868 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
860 | { | 869 | { |
861 | dequeue_rt_stack(rt_se); | 870 | dequeue_rt_stack(rt_se); |
862 | for_each_sched_rt_entity(rt_se) | 871 | for_each_sched_rt_entity(rt_se) |
863 | __enqueue_rt_entity(rt_se); | 872 | __enqueue_rt_entity(rt_se, head); |
864 | } | 873 | } |
865 | 874 | ||
866 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 875 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) |
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
871 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 880 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
872 | 881 | ||
873 | if (rt_rq && rt_rq->rt_nr_running) | 882 | if (rt_rq && rt_rq->rt_nr_running) |
874 | __enqueue_rt_entity(rt_se); | 883 | __enqueue_rt_entity(rt_se, false); |
875 | } | 884 | } |
876 | } | 885 | } |
877 | 886 | ||
878 | /* | 887 | /* |
879 | * Adding/removing a task to/from a priority array: | 888 | * Adding/removing a task to/from a priority array: |
880 | */ | 889 | */ |
881 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 890 | static void |
891 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
882 | { | 892 | { |
883 | struct sched_rt_entity *rt_se = &p->rt; | 893 | struct sched_rt_entity *rt_se = &p->rt; |
884 | 894 | ||
885 | if (wakeup) | 895 | if (wakeup) |
886 | rt_se->timeout = 0; | 896 | rt_se->timeout = 0; |
887 | 897 | ||
888 | enqueue_rt_entity(rt_se); | 898 | enqueue_rt_entity(rt_se, head); |
889 | 899 | ||
890 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 900 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
891 | enqueue_pushable_task(rq, p); | 901 | enqueue_pushable_task(rq, p); |
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
1136 | if (next && next->prio < idx) | 1146 | if (next && next->prio < idx) |
1137 | continue; | 1147 | continue; |
1138 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | 1148 | list_for_each_entry(rt_se, array->queue + idx, run_list) { |
1139 | struct task_struct *p = rt_task_of(rt_se); | 1149 | struct task_struct *p; |
1150 | |||
1151 | if (!rt_entity_is_task(rt_se)) | ||
1152 | continue; | ||
1153 | |||
1154 | p = rt_task_of(rt_se); | ||
1140 | if (pick_rt_task(rq, p, cpu)) { | 1155 | if (pick_rt_task(rq, p, cpu)) { |
1141 | next = p; | 1156 | next = p; |
1142 | break; | 1157 | break; |
@@ -1481,24 +1496,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1481 | push_rt_tasks(rq); | 1496 | push_rt_tasks(rq); |
1482 | } | 1497 | } |
1483 | 1498 | ||
1484 | static unsigned long | ||
1485 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1486 | unsigned long max_load_move, | ||
1487 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1488 | int *all_pinned, int *this_best_prio) | ||
1489 | { | ||
1490 | /* don't touch RT tasks */ | ||
1491 | return 0; | ||
1492 | } | ||
1493 | |||
1494 | static int | ||
1495 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1496 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
1497 | { | ||
1498 | /* don't touch RT tasks */ | ||
1499 | return 0; | ||
1500 | } | ||
1501 | |||
1502 | static void set_cpus_allowed_rt(struct task_struct *p, | 1499 | static void set_cpus_allowed_rt(struct task_struct *p, |
1503 | const struct cpumask *new_mask) | 1500 | const struct cpumask *new_mask) |
1504 | { | 1501 | { |
@@ -1670,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1670 | if (!p->signal) | 1667 | if (!p->signal) |
1671 | return; | 1668 | return; |
1672 | 1669 | ||
1673 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | 1670 | /* max may change after cur was read; this will be fixed next tick */ |
1674 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | 1671 | soft = task_rlimit(p, RLIMIT_RTTIME); |
1672 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | ||
1675 | 1673 | ||
1676 | if (soft != RLIM_INFINITY) { | 1674 | if (soft != RLIM_INFINITY) { |
1677 | unsigned long next; | 1675 | unsigned long next; |
@@ -1721,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
1721 | dequeue_pushable_task(rq, p); | 1719 | dequeue_pushable_task(rq, p); |
1722 | } | 1720 | } |
1723 | 1721 | ||
1724 | unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | 1722 | static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) |
1725 | { | 1723 | { |
1726 | /* | 1724 | /* |
1727 | * Time slice is 0 for SCHED_FIFO tasks | 1725 | * Time slice is 0 for SCHED_FIFO tasks |
@@ -1746,8 +1744,6 @@ static const struct sched_class rt_sched_class = { | |||
1746 | #ifdef CONFIG_SMP | 1744 | #ifdef CONFIG_SMP |
1747 | .select_task_rq = select_task_rq_rt, | 1745 | .select_task_rq = select_task_rq_rt, |
1748 | 1746 | ||
1749 | .load_balance = load_balance_rt, | ||
1750 | .move_one_task = move_one_task_rt, | ||
1751 | .set_cpus_allowed = set_cpus_allowed_rt, | 1747 | .set_cpus_allowed = set_cpus_allowed_rt, |
1752 | .rq_online = rq_online_rt, | 1748 | .rq_online = rq_online_rt, |
1753 | .rq_offline = rq_offline_rt, | 1749 | .rq_offline = rq_offline_rt, |
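
The sched_rt.c hunks above thread a new `bool head` flag from enqueue_task_rt() down to __enqueue_rt_entity() so an RT entity can be requeued at the head of its priority list rather than the tail (the same hunk set also drops the now-unused RT load-balance callbacks and switches the watchdog to the task_rlimit() helpers). A minimal user-space sketch of the list primitive behind the flag; the names mirror include/linux/list.h, but this is an illustration, not kernel code:

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void __list_add(struct list_head *new,
                           struct list_head *prev, struct list_head *next)
    {
            next->prev = new;
            new->next = next;
            new->prev = prev;
            prev->next = new;
    }

    /* head == true  -> list_add(): entity jumps ahead of its peers */
    static void list_add(struct list_head *new, struct list_head *head)
    {
            __list_add(new, head, head->next);
    }

    /* head == false -> list_add_tail(): normal round-robin order */
    static void list_add_tail(struct list_head *new, struct list_head *head)
    {
            __list_add(new, head->prev, head);
    }

    int main(void)
    {
            struct list_head q = { &q, &q };        /* empty queue */
            struct list_head a, b;

            list_add_tail(&a, &q);          /* a would run first ...   */
            list_add(&b, &q);               /* ... until b is queued at head */
            printf("next to run: %s\n", q.next == &b ? "b" : "a");
            return 0;
    }

Head insertion lets a caller put an entity in front of its equal-priority peers; tail insertion preserves the usual FIFO fairness within a priority level.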
diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b9..dbd7fe073c55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -159,6 +159,10 @@ void recalc_sigpending(void) | |||
159 | 159 | ||
160 | /* Given the mask, find the first available signal that should be serviced. */ | 160 | /* Given the mask, find the first available signal that should be serviced. */ |
161 | 161 | ||
162 | #define SYNCHRONOUS_MASK \ | ||
163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | ||
164 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | ||
165 | |||
162 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
163 | { | 167 | { |
164 | unsigned long i, *s, *m, x; | 168 | unsigned long i, *s, *m, x; |
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) | |||
166 | 170 | ||
167 | s = pending->signal.sig; | 171 | s = pending->signal.sig; |
168 | m = mask->sig; | 172 | m = mask->sig; |
173 | |||
174 | /* | ||
175 | * Handle the first word specially: it contains the | ||
176 | * synchronous signals that need to be dequeued first. | ||
177 | */ | ||
178 | x = *s &~ *m; | ||
179 | if (x) { | ||
180 | if (x & SYNCHRONOUS_MASK) | ||
181 | x &= SYNCHRONOUS_MASK; | ||
182 | sig = ffz(~x) + 1; | ||
183 | return sig; | ||
184 | } | ||
185 | |||
169 | switch (_NSIG_WORDS) { | 186 | switch (_NSIG_WORDS) { |
170 | default: | 187 | default: |
171 | for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) | 188 | for (i = 1; i < _NSIG_WORDS; ++i) { |
172 | if ((x = *s &~ *m) != 0) { | 189 | x = *++s &~ *++m; |
173 | sig = ffz(~x) + i*_NSIG_BPW + 1; | 190 | if (!x) |
174 | break; | 191 | continue; |
175 | } | 192 | sig = ffz(~x) + i*_NSIG_BPW + 1; |
193 | break; | ||
194 | } | ||
176 | break; | 195 | break; |
177 | 196 | ||
178 | case 2: if ((x = s[0] &~ m[0]) != 0) | 197 | case 2: |
179 | sig = 1; | 198 | x = s[1] &~ m[1]; |
180 | else if ((x = s[1] &~ m[1]) != 0) | 199 | if (!x) |
181 | sig = _NSIG_BPW + 1; | ||
182 | else | ||
183 | break; | 200 | break; |
184 | sig += ffz(~x); | 201 | sig = ffz(~x) + _NSIG_BPW + 1; |
185 | break; | 202 | break; |
186 | 203 | ||
187 | case 1: if ((x = *s &~ *m) != 0) | 204 | case 1: |
188 | sig = ffz(~x) + 1; | 205 | /* Nothing to do */ |
189 | break; | 206 | break; |
190 | } | 207 | } |
191 | 208 | ||
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi | |||
228 | 245 | ||
229 | if (override_rlimit || | 246 | if (override_rlimit || |
230 | atomic_read(&user->sigpending) <= | 247 | atomic_read(&user->sigpending) <= |
231 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { | 248 | task_rlimit(t, RLIMIT_SIGPENDING)) { |
232 | q = kmem_cache_alloc(sigqueue_cachep, flags); | 249 | q = kmem_cache_alloc(sigqueue_cachep, flags); |
233 | } else { | 250 | } else { |
234 | print_dropped_signal(sig); | 251 | print_dropped_signal(sig); |
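
The next_signal() rework above special-cases the first signal word so synchronous fault signals (SEGV, BUS, ILL, TRAP, FPE) are dequeued ahead of everything else, and the per-word loops then skip word 0 entirely. A hedged user-space model of the selection logic; sigmask() is modeled as 1UL << (sig - 1) as in the kernel, and __builtin_ctzl stands in for ffz(~x):

    #include <stdio.h>
    #include <signal.h>

    #define SYNC_MASK ((1UL << (SIGSEGV - 1)) | (1UL << (SIGBUS - 1)) | \
                       (1UL << (SIGILL - 1))  | (1UL << (SIGTRAP - 1)) | \
                       (1UL << (SIGFPE - 1)))

    /* Among pending, unblocked signals in word 0, a synchronous
     * (fault) signal is always picked before anything else. */
    static int pick(unsigned long pending, unsigned long blocked)
    {
            unsigned long x = pending & ~blocked;

            if (!x)
                    return 0;
            if (x & SYNC_MASK)
                    x &= SYNC_MASK;
            return __builtin_ctzl(x) + 1;   /* ffz(~x) + 1 equivalent */
    }

    int main(void)
    {
            /* SIGTERM and SIGSEGV both pending: SIGSEGV wins. */
            unsigned long pending = (1UL << (SIGTERM - 1)) |
                                    (1UL << (SIGSEGV - 1));
            printf("delivered first: %d\n", pick(pending, 0)); /* 11 */
            return 0;
    }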
diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf5a270..7d3f4fa9ef4f 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c | |||
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, | |||
637 | goto cancelled; | 637 | goto cancelled; |
638 | 638 | ||
639 | /* the timer holds a reference whilst it is pending */ | 639 | /* the timer holds a reference whilst it is pending */ |
640 | ret = work->ops->get_ref(work); | 640 | ret = slow_work_get_ref(work); |
641 | if (ret < 0) | 641 | if (ret < 0) |
642 | goto cant_get_ref; | 642 | goto cant_get_ref; |
643 | 643 | ||
diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c59d732..a29ebd1ef41d 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h | |||
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); | |||
43 | */ | 43 | */ |
44 | static inline void slow_work_set_thread_pid(int id, pid_t pid) | 44 | static inline void slow_work_set_thread_pid(int id, pid_t pid) |
45 | { | 45 | { |
46 | #ifdef CONFIG_SLOW_WORK_PROC | 46 | #ifdef CONFIG_SLOW_WORK_DEBUG |
47 | slow_work_pids[id] = pid; | 47 | slow_work_pids[id] = pid; |
48 | #endif | 48 | #endif |
49 | } | 49 | } |
50 | 50 | ||
51 | static inline void slow_work_mark_time(struct slow_work *work) | 51 | static inline void slow_work_mark_time(struct slow_work *work) |
52 | { | 52 | { |
53 | #ifdef CONFIG_SLOW_WORK_PROC | 53 | #ifdef CONFIG_SLOW_WORK_DEBUG |
54 | work->mark = CURRENT_TIME; | 54 | work->mark = CURRENT_TIME; |
55 | #endif | 55 | #endif |
56 | } | 56 | } |
57 | 57 | ||
58 | static inline void slow_work_begin_exec(int id, struct slow_work *work) | 58 | static inline void slow_work_begin_exec(int id, struct slow_work *work) |
59 | { | 59 | { |
60 | #ifdef CONFIG_SLOW_WORK_PROC | 60 | #ifdef CONFIG_SLOW_WORK_DEBUG |
61 | slow_work_execs[id] = work; | 61 | slow_work_execs[id] = work; |
62 | #endif | 62 | #endif |
63 | } | 63 | } |
64 | 64 | ||
65 | static inline void slow_work_end_exec(int id, struct slow_work *work) | 65 | static inline void slow_work_end_exec(int id, struct slow_work *work) |
66 | { | 66 | { |
67 | #ifdef CONFIG_SLOW_WORK_PROC | 67 | #ifdef CONFIG_SLOW_WORK_DEBUG |
68 | write_lock(&slow_work_execs_lock); | 68 | write_lock(&slow_work_execs_lock); |
69 | slow_work_execs[id] = NULL; | 69 | slow_work_execs[id] = NULL; |
70 | write_unlock(&slow_work_execs_lock); | 70 | write_unlock(&slow_work_execs_lock); |
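
The slow-work.h hunk is purely a config-symbol rename (CONFIG_SLOW_WORK_PROC became CONFIG_SLOW_WORK_DEBUG), but it shows a common kernel idiom: debug hooks live in one header as static inlines that compile to nothing when the option is off, keeping #ifdefs out of the call sites. A compilable toy version, with CONFIG_FOO_DEBUG as a made-up stand-in:

    #include <time.h>

    struct foo_work { time_t mark; };

    /* CONFIG_FOO_DEBUG is a hypothetical stand-in for CONFIG_SLOW_WORK_DEBUG. */
    static inline void foo_mark_time(struct foo_work *work)
    {
    #ifdef CONFIG_FOO_DEBUG
            work->mark = time(NULL);
    #else
            (void)work;             /* compiles away entirely */
    #endif
    }

    int main(void)
    {
            struct foo_work w = { 0 };

            foo_mark_time(&w);      /* no-op unless built with -DCONFIG_FOO_DEBUG */
            return 0;
    }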
diff --git a/kernel/smp.c b/kernel/smp.c index f10408422444..3fc697336183 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -9,11 +9,10 @@ | |||
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/percpu.h> | 10 | #include <linux/percpu.h> |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/gfp.h> | ||
12 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
13 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
14 | 15 | ||
15 | static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); | ||
16 | |||
17 | static struct { | 16 | static struct { |
18 | struct list_head queue; | 17 | struct list_head queue; |
19 | raw_spinlock_t lock; | 18 | raw_spinlock_t lock; |
@@ -33,12 +32,14 @@ struct call_function_data { | |||
33 | cpumask_var_t cpumask; | 32 | cpumask_var_t cpumask; |
34 | }; | 33 | }; |
35 | 34 | ||
35 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | ||
36 | |||
36 | struct call_single_queue { | 37 | struct call_single_queue { |
37 | struct list_head list; | 38 | struct list_head list; |
38 | raw_spinlock_t lock; | 39 | raw_spinlock_t lock; |
39 | }; | 40 | }; |
40 | 41 | ||
41 | static DEFINE_PER_CPU(struct call_function_data, cfd_data); | 42 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); |
42 | 43 | ||
43 | static int | 44 | static int |
44 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | 45 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) |
@@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void) | |||
256 | } | 257 | } |
257 | } | 258 | } |
258 | 259 | ||
259 | static DEFINE_PER_CPU(struct call_single_data, csd_data); | 260 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); |
260 | 261 | ||
261 | /* | 262 | /* |
262 | * smp_call_function_single - Run a function on a specific CPU | 263 | * smp_call_function_single - Run a function on a specific CPU |
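
In smp.c, the per-CPU call-function structures move to DEFINE_PER_CPU_SHARED_ALIGNED because remote CPUs write into them; padding each instance to its own cache line avoids false sharing on the hot IPI path. A rough user-space analogue of the same layout trick (a 64-byte line size is assumed):

    #include <stdio.h>

    struct call_single_queue {
            void *list_head;
            int   lock;
    } __attribute__((aligned(64)));

    int main(void)
    {
            struct call_single_queue q[2];

            /* 64, not the ~12 bytes the fields alone would need */
            printf("stride: %zu bytes\n", sizeof(q[0]));
            (void)q;
            return 0;
    }

With each element on its own line, a remote writer dirtying q[0] never bounces the cache line holding q[1].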
diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef75..7c1a67ef0274 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); | |||
500 | */ | 500 | */ |
501 | 501 | ||
502 | /* | 502 | /* |
503 | * The trampoline is called when the hrtimer expires. If this is | 503 | * The trampoline is called when the hrtimer expires. It schedules a tasklet |
504 | * called from the hrtimer interrupt then we schedule the tasklet as | 504 | * to run __tasklet_hrtimer_trampoline() which in turn will call the intended |
505 | * the timer callback function expects to run in softirq context. If | 505 | * hrtimer callback, but from softirq context. |
506 | * it's called in softirq context anyway (i.e. high resolution timers | ||
507 | * disabled) then the hrtimer callback is called right away. | ||
508 | */ | 506 | */ |
509 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) | 507 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) |
510 | { | 508 | { |
511 | struct tasklet_hrtimer *ttimer = | 509 | struct tasklet_hrtimer *ttimer = |
512 | container_of(timer, struct tasklet_hrtimer, timer); | 510 | container_of(timer, struct tasklet_hrtimer, timer); |
513 | 511 | ||
514 | if (hrtimer_is_hres_active(timer)) { | 512 | tasklet_hi_schedule(&ttimer->tasklet); |
515 | tasklet_hi_schedule(&ttimer->tasklet); | 513 | return HRTIMER_NORESTART; |
516 | return HRTIMER_NORESTART; | ||
517 | } | ||
518 | return ttimer->function(timer); | ||
519 | } | 514 | } |
520 | 515 | ||
521 | /* | 516 | /* |
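
The softirq.c change makes the tasklet_hrtimer trampoline unconditionally defer to a tasklet, so the user's callback now always runs in softirq context, whether or not high-resolution mode is active. Illustrative kernel-side use of the affected API (a sketch only; error handling and module boilerplate omitted):

    #include <linux/interrupt.h>
    #include <linux/hrtimer.h>

    static struct tasklet_hrtimer my_timer;

    static enum hrtimer_restart my_cb(struct hrtimer *t)
    {
            /* after this change, softirq context is guaranteed here */
            return HRTIMER_NORESTART;
    }

    static void my_init(void)
    {
            tasklet_hrtimer_init(&my_timer, my_cb,
                                 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            tasklet_hrtimer_start(&my_timer, ktime_set(1, 0),
                                  HRTIMER_MODE_REL);
    }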
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..4b493f67dcb5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); | |||
25 | static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ | 25 | static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ |
26 | static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ | 26 | static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ |
27 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 27 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
28 | static DEFINE_PER_CPU(bool, softlock_touch_sync); | ||
28 | 29 | ||
29 | static int __read_mostly did_panic; | 30 | static int __read_mostly did_panic; |
30 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly softlockup_thresh = 60; |
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) | |||
79 | } | 80 | } |
80 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 81 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
81 | 82 | ||
83 | void touch_softlockup_watchdog_sync(void) | ||
84 | { | ||
85 | __raw_get_cpu_var(softlock_touch_sync) = true; | ||
86 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
87 | } | ||
88 | |||
82 | void touch_all_softlockup_watchdogs(void) | 89 | void touch_all_softlockup_watchdogs(void) |
83 | { | 90 | { |
84 | int cpu; | 91 | int cpu; |
@@ -118,6 +125,14 @@ void softlockup_tick(void) | |||
118 | } | 125 | } |
119 | 126 | ||
120 | if (touch_ts == 0) { | 127 | if (touch_ts == 0) { |
128 | if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { | ||
129 | /* | ||
130 | * If the time stamp was touched atomically | ||
131 | * make sure the scheduler tick is up to date. | ||
132 | */ | ||
133 | per_cpu(softlock_touch_sync, this_cpu) = false; | ||
134 | sched_clock_tick(); | ||
135 | } | ||
121 | __touch_softlockup_watchdog(); | 136 | __touch_softlockup_watchdog(); |
122 | return; | 137 | return; |
123 | } | 138 | } |
@@ -140,11 +155,11 @@ void softlockup_tick(void) | |||
140 | * Wake up the high-prio watchdog task twice per | 155 | * Wake up the high-prio watchdog task twice per |
141 | * threshold timespan. | 156 | * threshold timespan. |
142 | */ | 157 | */ |
143 | if (now > touch_ts + softlockup_thresh/2) | 158 | if (time_after(now - softlockup_thresh/2, touch_ts)) |
144 | wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); | 159 | wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); |
145 | 160 | ||
146 | /* Warn about unreasonable delays: */ | 161 | /* Warn about unreasonable delays: */ |
147 | if (now <= (touch_ts + softlockup_thresh)) | 162 | if (time_before_eq(now - softlockup_thresh, touch_ts)) |
148 | return; | 163 | return; |
149 | 164 | ||
150 | per_cpu(softlockup_print_ts, this_cpu) = touch_ts; | 165 | per_cpu(softlockup_print_ts, this_cpu) = touch_ts; |
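
Besides adding touch_softlockup_watchdog_sync(), the softlockup hunks replace raw timestamp comparisons with time_after()-style ones, which stay correct when the counter wraps. The trick is an unsigned subtraction interpreted as signed; a self-contained demo of the simplified macro (the real kernel macro adds type checking):

    #include <stdio.h>

    #define time_after(a, b)  ((long)((b) - (a)) < 0)

    int main(void)
    {
            unsigned long touch_ts = (unsigned long)-5; /* just before wrap */
            unsigned long now      = 10;                /* just after wrap  */

            /* A plain 'now > touch_ts' would be false here; this is true: */
            printf("expired: %d\n", time_after(now, touch_ts));
            return 0;
    }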
diff --git a/kernel/srcu.c b/kernel/srcu.c index 818d7d9aa03c..2980da3fd509 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -30,10 +30,33 @@ | |||
30 | #include <linux/preempt.h> | 30 | #include <linux/preempt.h> |
31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
33 | #include <linux/slab.h> | ||
34 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
35 | #include <linux/srcu.h> | 34 | #include <linux/srcu.h> |
36 | 35 | ||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
37 | { | ||
38 | sp->completed = 0; | ||
39 | mutex_init(&sp->mutex); | ||
40 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
41 | return sp->per_cpu_ref ? 0 : -ENOMEM; | ||
42 | } | ||
43 | |||
44 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
45 | |||
46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
47 | struct lock_class_key *key) | ||
48 | { | ||
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
50 | /* Don't re-initialize a lock while it is held. */ | ||
51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
52 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | return init_srcu_struct_fields(sp); | ||
55 | } | ||
56 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
57 | |||
58 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
59 | |||
37 | /** | 60 | /** |
38 | * init_srcu_struct - initialize a sleep-RCU structure | 61 | * init_srcu_struct - initialize a sleep-RCU structure |
39 | * @sp: structure to initialize. | 62 | * @sp: structure to initialize. |
@@ -44,13 +67,12 @@ | |||
44 | */ | 67 | */ |
45 | int init_srcu_struct(struct srcu_struct *sp) | 68 | int init_srcu_struct(struct srcu_struct *sp) |
46 | { | 69 | { |
47 | sp->completed = 0; | 70 | return init_srcu_struct_fields(sp); |
48 | mutex_init(&sp->mutex); | ||
49 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
50 | return (sp->per_cpu_ref ? 0 : -ENOMEM); | ||
51 | } | 71 | } |
52 | EXPORT_SYMBOL_GPL(init_srcu_struct); | 72 | EXPORT_SYMBOL_GPL(init_srcu_struct); |
53 | 73 | ||
74 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
75 | |||
54 | /* | 76 | /* |
55 | * srcu_readers_active_idx -- returns approximate number of readers | 77 | * srcu_readers_active_idx -- returns approximate number of readers |
56 | * active on the specified rank of per-CPU counters. | 78 | * active on the specified rank of per-CPU counters. |
@@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
100 | } | 122 | } |
101 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | 123 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
102 | 124 | ||
103 | /** | 125 | /* |
104 | * srcu_read_lock - register a new reader for an SRCU-protected structure. | ||
105 | * @sp: srcu_struct in which to register the new reader. | ||
106 | * | ||
107 | * Counts the new reader in the appropriate per-CPU element of the | 126 | * Counts the new reader in the appropriate per-CPU element of the |
108 | * srcu_struct. Must be called from process context. | 127 | * srcu_struct. Must be called from process context. |
109 | * Returns an index that must be passed to the matching srcu_read_unlock(). | 128 | * Returns an index that must be passed to the matching srcu_read_unlock(). |
110 | */ | 129 | */ |
111 | int srcu_read_lock(struct srcu_struct *sp) | 130 | int __srcu_read_lock(struct srcu_struct *sp) |
112 | { | 131 | { |
113 | int idx; | 132 | int idx; |
114 | 133 | ||
@@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp) | |||
120 | preempt_enable(); | 139 | preempt_enable(); |
121 | return idx; | 140 | return idx; |
122 | } | 141 | } |
123 | EXPORT_SYMBOL_GPL(srcu_read_lock); | 142 | EXPORT_SYMBOL_GPL(__srcu_read_lock); |
124 | 143 | ||
125 | /** | 144 | /* |
126 | * srcu_read_unlock - unregister an old reader from an SRCU-protected structure. | ||
127 | * @sp: srcu_struct in which to unregister the old reader. | ||
128 | * @idx: return value from corresponding srcu_read_lock(). | ||
129 | * | ||
130 | * Removes the count for the old reader from the appropriate per-CPU | 145 | * Removes the count for the old reader from the appropriate per-CPU |
131 | * element of the srcu_struct. Note that this may well be a different | 146 | * element of the srcu_struct. Note that this may well be a different |
132 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | 147 | * CPU than that which was incremented by the corresponding srcu_read_lock(). |
133 | * Must be called from process context. | 148 | * Must be called from process context. |
134 | */ | 149 | */ |
135 | void srcu_read_unlock(struct srcu_struct *sp, int idx) | 150 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
136 | { | 151 | { |
137 | preempt_disable(); | 152 | preempt_disable(); |
138 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 153 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ |
139 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 154 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; |
140 | preempt_enable(); | 155 | preempt_enable(); |
141 | } | 156 | } |
142 | EXPORT_SYMBOL_GPL(srcu_read_unlock); | 157 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
143 | 158 | ||
144 | /* | 159 | /* |
145 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 160 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
146 | */ | 161 | */ |
147 | void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 162 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
148 | { | 163 | { |
149 | int idx; | 164 | int idx; |
150 | 165 | ||
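
The srcu.c rework splits field initialization into init_srcu_struct_fields() so a lockdep-aware __init_srcu_struct() variant can wrap it. The header side pairs that with a macro that mints a distinct lock_class_key per call site, letting lockdep distinguish different srcu_structs; a sketch of how include/linux/srcu.h plausibly wires it up (reconstructed, not quoted from this diff):

    #ifdef CONFIG_DEBUG_LOCK_ALLOC
    #define init_srcu_struct(sp)                                    \
    ({                                                              \
            static struct lock_class_key __srcu_key;                \
                                                                    \
            /* one static key per call site = one lockdep class */  \
            __init_srcu_struct((sp), #sp, &__srcu_key);             \
    })
    #endif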
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..9bb9fb1bd79c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -45,7 +45,7 @@ static int refcount; | |||
45 | static struct workqueue_struct *stop_machine_wq; | 45 | static struct workqueue_struct *stop_machine_wq; |
46 | static struct stop_machine_data active, idle; | 46 | static struct stop_machine_data active, idle; |
47 | static const struct cpumask *active_cpus; | 47 | static const struct cpumask *active_cpus; |
48 | static void *stop_machine_work; | 48 | static void __percpu *stop_machine_work; |
49 | 49 | ||
50 | static void set_state(enum stopmachine_state newstate) | 50 | static void set_state(enum stopmachine_state newstate) |
51 | { | 51 | { |
diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..7cb426a58965 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -33,8 +33,10 @@ | |||
33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
34 | #include <linux/seccomp.h> | 34 | #include <linux/seccomp.h> |
35 | #include <linux/cpu.h> | 35 | #include <linux/cpu.h> |
36 | #include <linux/personality.h> | ||
36 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
37 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
39 | #include <linux/gfp.h> | ||
38 | 40 | ||
39 | #include <linux/compat.h> | 41 | #include <linux/compat.h> |
40 | #include <linux/syscalls.h> | 42 | #include <linux/syscalls.h> |
@@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
222 | if (which > PRIO_USER || which < PRIO_PROCESS) | 224 | if (which > PRIO_USER || which < PRIO_PROCESS) |
223 | return -EINVAL; | 225 | return -EINVAL; |
224 | 226 | ||
227 | rcu_read_lock(); | ||
225 | read_lock(&tasklist_lock); | 228 | read_lock(&tasklist_lock); |
226 | switch (which) { | 229 | switch (which) { |
227 | case PRIO_PROCESS: | 230 | case PRIO_PROCESS: |
@@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
267 | } | 270 | } |
268 | out_unlock: | 271 | out_unlock: |
269 | read_unlock(&tasklist_lock); | 272 | read_unlock(&tasklist_lock); |
273 | rcu_read_unlock(); | ||
270 | 274 | ||
271 | return retval; | 275 | return retval; |
272 | } | 276 | } |
@@ -569,13 +573,7 @@ static int set_user(struct cred *new) | |||
569 | if (!new_user) | 573 | if (!new_user) |
570 | return -EAGAIN; | 574 | return -EAGAIN; |
571 | 575 | ||
572 | if (!task_can_switch_user(new_user, current)) { | 576 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && |
573 | free_uid(new_user); | ||
574 | return -EINVAL; | ||
575 | } | ||
576 | |||
577 | if (atomic_read(&new_user->processes) >= | ||
578 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | ||
579 | new_user != INIT_USER) { | 577 | new_user != INIT_USER) { |
580 | free_uid(new_user); | 578 | free_uid(new_user); |
581 | return -EAGAIN; | 579 | return -EAGAIN; |
@@ -1118,6 +1116,15 @@ out: | |||
1118 | 1116 | ||
1119 | DECLARE_RWSEM(uts_sem); | 1117 | DECLARE_RWSEM(uts_sem); |
1120 | 1118 | ||
1119 | #ifdef COMPAT_UTS_MACHINE | ||
1120 | #define override_architecture(name) \ | ||
1121 | (personality(current->personality) == PER_LINUX32 && \ | ||
1122 | copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ | ||
1123 | sizeof(COMPAT_UTS_MACHINE))) | ||
1124 | #else | ||
1125 | #define override_architecture(name) 0 | ||
1126 | #endif | ||
1127 | |||
1121 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | 1128 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) |
1122 | { | 1129 | { |
1123 | int errno = 0; | 1130 | int errno = 0; |
@@ -1126,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | |||
1126 | if (copy_to_user(name, utsname(), sizeof *name)) | 1133 | if (copy_to_user(name, utsname(), sizeof *name)) |
1127 | errno = -EFAULT; | 1134 | errno = -EFAULT; |
1128 | up_read(&uts_sem); | 1135 | up_read(&uts_sem); |
1136 | |||
1137 | if (!errno && override_architecture(name)) | ||
1138 | errno = -EFAULT; | ||
1129 | return errno; | 1139 | return errno; |
1130 | } | 1140 | } |
1131 | 1141 | ||
1142 | #ifdef __ARCH_WANT_SYS_OLD_UNAME | ||
1143 | /* | ||
1144 | * Old cruft | ||
1145 | */ | ||
1146 | SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) | ||
1147 | { | ||
1148 | int error = 0; | ||
1149 | |||
1150 | if (!name) | ||
1151 | return -EFAULT; | ||
1152 | |||
1153 | down_read(&uts_sem); | ||
1154 | if (copy_to_user(name, utsname(), sizeof(*name))) | ||
1155 | error = -EFAULT; | ||
1156 | up_read(&uts_sem); | ||
1157 | |||
1158 | if (!error && override_architecture(name)) | ||
1159 | error = -EFAULT; | ||
1160 | return error; | ||
1161 | } | ||
1162 | |||
1163 | SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) | ||
1164 | { | ||
1165 | int error; | ||
1166 | |||
1167 | if (!name) | ||
1168 | return -EFAULT; | ||
1169 | if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) | ||
1170 | return -EFAULT; | ||
1171 | |||
1172 | down_read(&uts_sem); | ||
1173 | error = __copy_to_user(&name->sysname, &utsname()->sysname, | ||
1174 | __OLD_UTS_LEN); | ||
1175 | error |= __put_user(0, name->sysname + __OLD_UTS_LEN); | ||
1176 | error |= __copy_to_user(&name->nodename, &utsname()->nodename, | ||
1177 | __OLD_UTS_LEN); | ||
1178 | error |= __put_user(0, name->nodename + __OLD_UTS_LEN); | ||
1179 | error |= __copy_to_user(&name->release, &utsname()->release, | ||
1180 | __OLD_UTS_LEN); | ||
1181 | error |= __put_user(0, name->release + __OLD_UTS_LEN); | ||
1182 | error |= __copy_to_user(&name->version, &utsname()->version, | ||
1183 | __OLD_UTS_LEN); | ||
1184 | error |= __put_user(0, name->version + __OLD_UTS_LEN); | ||
1185 | error |= __copy_to_user(&name->machine, &utsname()->machine, | ||
1186 | __OLD_UTS_LEN); | ||
1187 | error |= __put_user(0, name->machine + __OLD_UTS_LEN); | ||
1188 | up_read(&uts_sem); | ||
1189 | |||
1190 | if (!error && override_architecture(name)) | ||
1191 | error = -EFAULT; | ||
1192 | return error ? -EFAULT : 0; | ||
1193 | } | ||
1194 | #endif | ||
1195 | |||
1132 | SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | 1196 | SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) |
1133 | { | 1197 | { |
1134 | int errno; | 1198 | int errno; |
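
The new override_architecture() macro in sys.c makes uname() report the compat machine string when a task runs under the PER_LINUX32 personality. A hedged user-space demonstration of the visible effect (on x86_64 the machine field would then read something like "i686"):

    #include <stdio.h>
    #include <sys/personality.h>
    #include <sys/utsname.h>

    int main(void)
    {
            struct utsname u;

            personality(PER_LINUX32);       /* request 32-bit emulation */
            if (uname(&u) == 0)
                    printf("machine: %s\n", u.machine);
            return 0;
    }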
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7d..70f2ea758ffe 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); | |||
126 | cond_syscall(sys_setuid16); | 126 | cond_syscall(sys_setuid16); |
127 | cond_syscall(sys_vm86old); | 127 | cond_syscall(sys_vm86old); |
128 | cond_syscall(sys_vm86); | 128 | cond_syscall(sys_vm86); |
129 | cond_syscall(sys_ipc); | ||
129 | cond_syscall(compat_sys_ipc); | 130 | cond_syscall(compat_sys_ipc); |
130 | cond_syscall(compat_sys_sysctl); | 131 | cond_syscall(compat_sys_sysctl); |
131 | cond_syscall(sys_flock); | 132 | cond_syscall(sys_flock); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac72c9e6bd9b..a38af430f0d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/signal.h> | ||
26 | #include <linux/proc_fs.h> | 27 | #include <linux/proc_fs.h> |
27 | #include <linux/security.h> | 28 | #include <linux/security.h> |
28 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
@@ -50,6 +51,7 @@ | |||
50 | #include <linux/ftrace.h> | 51 | #include <linux/ftrace.h> |
51 | #include <linux/slow-work.h> | 52 | #include <linux/slow-work.h> |
52 | #include <linux/perf_event.h> | 53 | #include <linux/perf_event.h> |
54 | #include <linux/kprobes.h> | ||
53 | 55 | ||
54 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
55 | #include <asm/processor.h> | 57 | #include <asm/processor.h> |
@@ -59,6 +61,18 @@ | |||
59 | #include <asm/stacktrace.h> | 61 | #include <asm/stacktrace.h> |
60 | #include <asm/io.h> | 62 | #include <asm/io.h> |
61 | #endif | 63 | #endif |
64 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
65 | #include <linux/acct.h> | ||
66 | #endif | ||
67 | #ifdef CONFIG_RT_MUTEXES | ||
68 | #include <linux/rtmutex.h> | ||
69 | #endif | ||
70 | #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) | ||
71 | #include <linux/lockdep.h> | ||
72 | #endif | ||
73 | #ifdef CONFIG_CHR_DEV_SG | ||
74 | #include <scsi/sg.h> | ||
75 | #endif | ||
62 | 76 | ||
63 | #ifdef CONFIG_NMI_WATCHDOG | 77 | #ifdef CONFIG_NMI_WATCHDOG |
64 | #include <linux/nmi.h> | 78 | #include <linux/nmi.h> |
@@ -68,8 +82,6 @@ | |||
68 | #if defined(CONFIG_SYSCTL) | 82 | #if defined(CONFIG_SYSCTL) |
69 | 83 | ||
70 | /* External variables not in a header file. */ | 84 | /* External variables not in a header file. */ |
71 | extern int C_A_D; | ||
72 | extern int print_fatal_signals; | ||
73 | extern int sysctl_overcommit_memory; | 85 | extern int sysctl_overcommit_memory; |
74 | extern int sysctl_overcommit_ratio; | 86 | extern int sysctl_overcommit_ratio; |
75 | extern int sysctl_panic_on_oom; | 87 | extern int sysctl_panic_on_oom; |
@@ -91,9 +103,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; | |||
91 | #ifndef CONFIG_MMU | 103 | #ifndef CONFIG_MMU |
92 | extern int sysctl_nr_trim_pages; | 104 | extern int sysctl_nr_trim_pages; |
93 | #endif | 105 | #endif |
94 | #ifdef CONFIG_RCU_TORTURE_TEST | ||
95 | extern int rcutorture_runnable; | ||
96 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ | ||
97 | #ifdef CONFIG_BLOCK | 106 | #ifdef CONFIG_BLOCK |
98 | extern int blk_iopoll_enabled; | 107 | extern int blk_iopoll_enabled; |
99 | #endif | 108 | #endif |
@@ -123,14 +132,6 @@ static int min_percpu_pagelist_fract = 8; | |||
123 | 132 | ||
124 | static int ngroups_max = NGROUPS_MAX; | 133 | static int ngroups_max = NGROUPS_MAX; |
125 | 134 | ||
126 | #ifdef CONFIG_MODULES | ||
127 | extern char modprobe_path[]; | ||
128 | extern int modules_disabled; | ||
129 | #endif | ||
130 | #ifdef CONFIG_CHR_DEV_SG | ||
131 | extern int sg_big_buff; | ||
132 | #endif | ||
133 | |||
134 | #ifdef CONFIG_SPARC | 135 | #ifdef CONFIG_SPARC |
135 | #include <asm/system.h> | 136 | #include <asm/system.h> |
136 | #endif | 137 | #endif |
@@ -152,10 +153,6 @@ extern int sysctl_userprocess_debug; | |||
152 | extern int spin_retry; | 153 | extern int spin_retry; |
153 | #endif | 154 | #endif |
154 | 155 | ||
155 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
156 | extern int acct_parm[]; | ||
157 | #endif | ||
158 | |||
159 | #ifdef CONFIG_IA64 | 156 | #ifdef CONFIG_IA64 |
160 | extern int no_unaligned_warning; | 157 | extern int no_unaligned_warning; |
161 | extern int unaligned_dump_stack; | 158 | extern int unaligned_dump_stack; |
@@ -163,10 +160,6 @@ extern int unaligned_dump_stack; | |||
163 | 160 | ||
164 | extern struct ratelimit_state printk_ratelimit_state; | 161 | extern struct ratelimit_state printk_ratelimit_state; |
165 | 162 | ||
166 | #ifdef CONFIG_RT_MUTEXES | ||
167 | extern int max_lock_depth; | ||
168 | #endif | ||
169 | |||
170 | #ifdef CONFIG_PROC_SYSCTL | 163 | #ifdef CONFIG_PROC_SYSCTL |
171 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 164 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
172 | void __user *buffer, size_t *lenp, loff_t *ppos); | 165 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -205,9 +198,6 @@ extern struct ctl_table epoll_table[]; | |||
205 | int sysctl_legacy_va_layout; | 198 | int sysctl_legacy_va_layout; |
206 | #endif | 199 | #endif |
207 | 200 | ||
208 | extern int prove_locking; | ||
209 | extern int lock_stat; | ||
210 | |||
211 | /* The default sysctl tables: */ | 201 | /* The default sysctl tables: */ |
212 | 202 | ||
213 | static struct ctl_table root_table[] = { | 203 | static struct ctl_table root_table[] = { |
@@ -1454,7 +1444,7 @@ static struct ctl_table fs_table[] = { | |||
1454 | }; | 1444 | }; |
1455 | 1445 | ||
1456 | static struct ctl_table debug_table[] = { | 1446 | static struct ctl_table debug_table[] = { |
1457 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | 1447 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) |
1458 | { | 1448 | { |
1459 | .procname = "exception-trace", | 1449 | .procname = "exception-trace", |
1460 | .data = &show_unhandled_signals, | 1450 | .data = &show_unhandled_signals, |
@@ -1463,6 +1453,17 @@ static struct ctl_table debug_table[] = { | |||
1463 | .proc_handler = proc_dointvec | 1453 | .proc_handler = proc_dointvec |
1464 | }, | 1454 | }, |
1465 | #endif | 1455 | #endif |
1456 | #if defined(CONFIG_OPTPROBES) | ||
1457 | { | ||
1458 | .procname = "kprobes-optimization", | ||
1459 | .data = &sysctl_kprobes_optimization, | ||
1460 | .maxlen = sizeof(int), | ||
1461 | .mode = 0644, | ||
1462 | .proc_handler = proc_kprobes_optimization_handler, | ||
1463 | .extra1 = &zero, | ||
1464 | .extra2 = &one, | ||
1465 | }, | ||
1466 | #endif | ||
1466 | { } | 1467 | { } |
1467 | }; | 1468 | }; |
1468 | 1469 | ||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..59030570f5ca 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/netdevice.h> | 15 | #include <linux/netdevice.h> |
16 | #include <linux/slab.h> | ||
16 | 17 | ||
17 | #ifdef CONFIG_SYSCTL_SYSCALL | 18 | #ifdef CONFIG_SYSCTL_SYSCALL |
18 | 19 | ||
@@ -1331,7 +1332,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1331 | ssize_t result; | 1332 | ssize_t result; |
1332 | char *pathname; | 1333 | char *pathname; |
1333 | int flags; | 1334 | int flags; |
1334 | int acc_mode, fmode; | 1335 | int acc_mode; |
1335 | 1336 | ||
1336 | pathname = sysctl_getname(name, nlen, &table); | 1337 | pathname = sysctl_getname(name, nlen, &table); |
1337 | result = PTR_ERR(pathname); | 1338 | result = PTR_ERR(pathname); |
@@ -1342,15 +1343,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1342 | if (oldval && oldlen && newval && newlen) { | 1343 | if (oldval && oldlen && newval && newlen) { |
1343 | flags = O_RDWR; | 1344 | flags = O_RDWR; |
1344 | acc_mode = MAY_READ | MAY_WRITE; | 1345 | acc_mode = MAY_READ | MAY_WRITE; |
1345 | fmode = FMODE_READ | FMODE_WRITE; | ||
1346 | } else if (newval && newlen) { | 1346 | } else if (newval && newlen) { |
1347 | flags = O_WRONLY; | 1347 | flags = O_WRONLY; |
1348 | acc_mode = MAY_WRITE; | 1348 | acc_mode = MAY_WRITE; |
1349 | fmode = FMODE_WRITE; | ||
1350 | } else if (oldval && oldlen) { | 1349 | } else if (oldval && oldlen) { |
1351 | flags = O_RDONLY; | 1350 | flags = O_RDONLY; |
1352 | acc_mode = MAY_READ; | 1351 | acc_mode = MAY_READ; |
1353 | fmode = FMODE_READ; | ||
1354 | } else { | 1352 | } else { |
1355 | result = 0; | 1353 | result = 0; |
1356 | goto out_putname; | 1354 | goto out_putname; |
@@ -1361,7 +1359,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1361 | if (result) | 1359 | if (result) |
1362 | goto out_putname; | 1360 | goto out_putname; |
1363 | 1361 | ||
1364 | result = may_open(&nd.path, acc_mode, fmode); | 1362 | result = may_open(&nd.path, acc_mode, flags); |
1365 | if (result) | 1363 | if (result) |
1366 | goto out_putpath; | 1364 | goto out_putpath; |
1367 | 1365 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa7..11281d5792bd 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/delayacct.h> | 22 | #include <linux/delayacct.h> |
23 | #include <linux/cpumask.h> | 23 | #include <linux/cpumask.h> |
24 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
25 | #include <linux/slab.h> | ||
25 | #include <linux/cgroupstats.h> | 26 | #include <linux/cgroupstats.h> |
26 | #include <linux/cgroup.h> | 27 | #include <linux/cgroup.h> |
27 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
@@ -46,15 +47,13 @@ static struct genl_family family = { | |||
46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | 47 | .maxattr = TASKSTATS_CMD_ATTR_MAX, |
47 | }; | 48 | }; |
48 | 49 | ||
49 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | 50 | static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { |
50 | __read_mostly = { | ||
51 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | 51 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, |
52 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | 52 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, |
53 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | 53 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
54 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | 54 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; |
55 | 55 | ||
56 | static struct nla_policy | 56 | static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { |
57 | cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { | ||
58 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, | 57 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, |
59 | }; | 58 | }; |
60 | 59 | ||
diff --git a/kernel/time.c b/kernel/time.c index 804798005d19..656dccfe1cbb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/security.h> | 36 | #include <linux/security.h> |
37 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
38 | #include <linux/slab.h> | ||
39 | #include <linux/math64.h> | 38 | #include <linux/math64.h> |
40 | #include <linux/ptrace.h> | 39 | #include <linux/ptrace.h> |
41 | 40 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d34..1f5dde637457 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) | |||
343 | { | 343 | { |
344 | unsigned long flags; | 344 | unsigned long flags; |
345 | 345 | ||
346 | spin_lock_irqsave(&watchdog_lock, flags); | 346 | /* |
347 | * We use trylock here to avoid a potential deadlock when | ||
348 | * kgdb calls this code after the kernel has been stopped with | ||
349 | * watchdog_lock held. When watchdog_lock is held we just | ||
350 | * return and accept that the watchdog might trigger and mark | ||
351 | * the monitored clock source (usually TSC) unstable. | ||
352 | * | ||
353 | * This does not affect the other caller clocksource_resume() | ||
354 | * because at this point the kernel is UP, interrupts are | ||
355 | * disabled and nothing can hold watchdog_lock. | ||
356 | */ | ||
357 | if (!spin_trylock_irqsave(&watchdog_lock, flags)) | ||
358 | return; | ||
347 | clocksource_reset_watchdog(); | 359 | clocksource_reset_watchdog(); |
348 | spin_unlock_irqrestore(&watchdog_lock, flags); | 360 | spin_unlock_irqrestore(&watchdog_lock, flags); |
349 | } | 361 | } |
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; } | |||
441 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 453 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
442 | 454 | ||
443 | /** | 455 | /** |
456 | * clocksource_suspend - suspend the clocksource(s) | ||
457 | */ | ||
458 | void clocksource_suspend(void) | ||
459 | { | ||
460 | struct clocksource *cs; | ||
461 | |||
462 | list_for_each_entry_reverse(cs, &clocksource_list, list) | ||
463 | if (cs->suspend) | ||
464 | cs->suspend(cs); | ||
465 | } | ||
466 | |||
467 | /** | ||
444 | * clocksource_resume - resume the clocksource(s) | 468 | * clocksource_resume - resume the clocksource(s) |
445 | */ | 469 | */ |
446 | void clocksource_resume(void) | 470 | void clocksource_resume(void) |
@@ -449,7 +473,7 @@ void clocksource_resume(void) | |||
449 | 473 | ||
450 | list_for_each_entry(cs, &clocksource_list, list) | 474 | list_for_each_entry(cs, &clocksource_list, list) |
451 | if (cs->resume) | 475 | if (cs->resume) |
452 | cs->resume(); | 476 | cs->resume(cs); |
453 | 477 | ||
454 | clocksource_resume_watchdog(); | 478 | clocksource_resume_watchdog(); |
455 | } | 479 | } |
@@ -458,8 +482,8 @@ void clocksource_resume(void) | |||
458 | * clocksource_touch_watchdog - Update watchdog | 482 | * clocksource_touch_watchdog - Update watchdog |
459 | * | 483 | * |
460 | * Update the watchdog after exception contexts such as kgdb so as not | 484 | * Update the watchdog after exception contexts such as kgdb so as not |
461 | * to incorrectly trip the watchdog. | 485 | * to incorrectly trip the watchdog. This might fail when the kernel |
462 | * | 486 | * was stopped in code which holds watchdog_lock. |
463 | */ | 487 | */ |
464 | void clocksource_touch_watchdog(void) | 488 | void clocksource_touch_watchdog(void) |
465 | { | 489 | { |
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { } | |||
568 | */ | 592 | */ |
569 | static int __init clocksource_done_booting(void) | 593 | static int __init clocksource_done_booting(void) |
570 | { | 594 | { |
595 | mutex_lock(&clocksource_mutex); | ||
596 | curr_clocksource = clocksource_default_clock(); | ||
597 | mutex_unlock(&clocksource_mutex); | ||
598 | |||
571 | finished_booting = 1; | 599 | finished_booting = 1; |
572 | 600 | ||
573 | /* | 601 | /* |
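
The clocksource_resume_watchdog() change is a textbook trylock-to-avoid-deadlock move: the path can be entered (via kgdb) while watchdog_lock is already held, so it now bails out rather than spinning forever, accepting a possible spurious watchdog trip. The pattern in isolation, with a pthread mutex standing in for the kernel spinlock:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t watchdog_lock = PTHREAD_MUTEX_INITIALIZER;

    static void resume_watchdog(void)
    {
            if (pthread_mutex_trylock(&watchdog_lock) != 0)
                    return;         /* accept a spurious trip over a deadlock */
            /* ... reset watchdog state ... */
            pthread_mutex_unlock(&watchdog_lock);
    }

    int main(void)
    {
            resume_watchdog();
            puts("ok");
            return 0;
    }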
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910e..7c0f180d6e9d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -58,10 +58,10 @@ static s64 time_offset; | |||
58 | static long time_constant = 2; | 58 | static long time_constant = 2; |
59 | 59 | ||
60 | /* maximum error (usecs): */ | 60 | /* maximum error (usecs): */ |
61 | long time_maxerror = NTP_PHASE_LIMIT; | 61 | static long time_maxerror = NTP_PHASE_LIMIT; |
62 | 62 | ||
63 | /* estimated error (usecs): */ | 63 | /* estimated error (usecs): */ |
64 | long time_esterror = NTP_PHASE_LIMIT; | 64 | static long time_esterror = NTP_PHASE_LIMIT; |
65 | 65 | ||
66 | /* frequency offset (scaled nsecs/secs): */ | 66 | /* frequency offset (scaled nsecs/secs): */ |
67 | static s64 time_freq; | 67 | static s64 time_freq; |
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset) | |||
142 | * Select how the frequency is to be controlled | 142 | * Select how the frequency is to be controlled |
143 | * and in which mode (PLL or FLL). | 143 | * and in which mode (PLL or FLL). |
144 | */ | 144 | */ |
145 | secs = xtime.tv_sec - time_reftime; | 145 | secs = get_seconds() - time_reftime; |
146 | if (unlikely(time_status & STA_FREQHOLD)) | 146 | if (unlikely(time_status & STA_FREQHOLD)) |
147 | secs = 0; | 147 | secs = 0; |
148 | 148 | ||
149 | time_reftime = xtime.tv_sec; | 149 | time_reftime = get_seconds(); |
150 | 150 | ||
151 | offset64 = offset; | 151 | offset64 = offset; |
152 | freq_adj = (offset64 * secs) << | 152 | freq_adj = (offset64 * secs) << |
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
368 | * reference time to current time. | 368 | * reference time to current time. |
369 | */ | 369 | */ |
370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) | 370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) |
371 | time_reftime = xtime.tv_sec; | 371 | time_reftime = get_seconds(); |
372 | 372 | ||
373 | /* only set allowed bits */ | 373 | /* only set allowed bits */ |
374 | time_status &= STA_RONLY; | 374 | time_status &= STA_RONLY; |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f0..aada0e52680a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -22,6 +22,29 @@ | |||
22 | 22 | ||
23 | #include "tick-internal.h" | 23 | #include "tick-internal.h" |
24 | 24 | ||
25 | /* Limit min_delta to a jiffie */ | ||
26 | #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) | ||
27 | |||
28 | static int tick_increase_min_delta(struct clock_event_device *dev) | ||
29 | { | ||
30 | /* Nothing to do if we already reached the limit */ | ||
31 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) | ||
32 | return -ETIME; | ||
33 | |||
34 | if (dev->min_delta_ns < 5000) | ||
35 | dev->min_delta_ns = 5000; | ||
36 | else | ||
37 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
38 | |||
39 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | ||
40 | dev->min_delta_ns = MIN_DELTA_LIMIT; | ||
41 | |||
42 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | ||
43 | dev->name ? dev->name : "?", | ||
44 | (unsigned long long) dev->min_delta_ns); | ||
45 | return 0; | ||
46 | } | ||
47 | |||
25 | /** | 48 | /** |
26 | * tick_program_event internal worker function | 49 | * tick_program_event internal worker function |
27 | */ | 50 | */ |
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | |||
37 | if (!ret || !force) | 60 | if (!ret || !force) |
38 | return ret; | 61 | return ret; |
39 | 62 | ||
63 | dev->retries++; | ||
40 | /* | 64 | /* |
41 | * We tried 2 times to program the device with the given | 65 | * We tried 3 times to program the device with the given |
42 | * min_delta_ns. If that's not working then we double it | 66 | * min_delta_ns. If that's not working then we increase it |
43 | * and emit a warning. | 67 | * and emit a warning. |
44 | */ | 68 | */ |
45 | if (++i > 2) { | 69 | if (++i > 2) { |
46 | /* Increase the min. delta and try again */ | 70 | /* Increase the min. delta and try again */ |
47 | if (!dev->min_delta_ns) | 71 | if (tick_increase_min_delta(dev)) { |
48 | dev->min_delta_ns = 5000; | 72 | /* |
49 | else | 73 | * Get out of the loop if min_delta_ns |
50 | dev->min_delta_ns += dev->min_delta_ns >> 1; | 74 | * hit the limit already. That's |
51 | 75 | * better than staying here forever. | |
52 | printk(KERN_WARNING | 76 | * |
53 | "CE: %s increasing min_delta_ns to %llu nsec\n", | 77 | * We clear next_event so we have a |
54 | dev->name ? dev->name : "?", | 78 | * chance that the box survives. |
55 | (unsigned long long) dev->min_delta_ns << 1); | 79 | */ |
56 | 80 | printk(KERN_WARNING | |
81 | "CE: Reprogramming failure. Giving up\n"); | ||
82 | dev->next_event.tv64 = KTIME_MAX; | ||
83 | return -ETIME; | ||
84 | } | ||
57 | i = 0; | 85 | i = 0; |
58 | } | 86 | } |
59 | 87 | ||
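
tick_increase_min_delta() caps the old unbounded doubling of min_delta_ns: start at 5 us, grow by 50% per reprogramming failure, and give up with -ETIME once the value reaches one jiffy. A stand-alone model of the escalation sequence (HZ=250 is an assumption for the limit):

    #include <stdio.h>

    #define NSEC_PER_SEC    1000000000ULL
    #define HZ              250
    #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)     /* 4,000,000 ns */

    int main(void)
    {
            unsigned long long min_delta_ns = 0;

            while (min_delta_ns < MIN_DELTA_LIMIT) {
                    if (min_delta_ns < 5000)
                            min_delta_ns = 5000;
                    else
                            min_delta_ns += min_delta_ns >> 1;
                    if (min_delta_ns > MIN_DELTA_LIMIT)
                            min_delta_ns = MIN_DELTA_LIMIT;
                    printf("%llu\n", min_delta_ns);
            }
            return 0;       /* the next failure would give up with -ETIME */
    }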
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 12f5c55090be..ac38fbb176cc 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c | |||
@@ -19,6 +19,7 @@ | |||
19 | 19 | ||
20 | #include <linux/timecompare.h> | 20 | #include <linux/timecompare.h> |
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/slab.h> | ||
22 | #include <linux/math64.h> | 23 | #include <linux/math64.h> |
23 | 24 | ||
24 | /* | 25 | /* |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4f..39f6177fafac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
622 | write_sequnlock_irqrestore(&xtime_lock, flags); | 622 | write_sequnlock_irqrestore(&xtime_lock, flags); |
623 | 623 | ||
624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
625 | clocksource_suspend(); | ||
625 | 626 | ||
626 | return 0; | 627 | return 0; |
627 | } | 628 | } |
@@ -817,7 +818,8 @@ void update_wall_time(void) | |||
817 | shift = min(shift, maxshift); | 818 | shift = min(shift, maxshift); |
818 | while (offset >= timekeeper.cycle_interval) { | 819 | while (offset >= timekeeper.cycle_interval) { |
819 | offset = logarithmic_accumulation(offset, shift); | 820 | offset = logarithmic_accumulation(offset, shift); |
820 | shift--; | 821 | if (offset < timekeeper.cycle_interval << shift) |
822 | shift--; | ||
821 | } | 823 | } |
822 | 824 | ||
823 | /* correct the clock when NTP error is too big */ | 825 | /* correct the clock when NTP error is too big */ |
@@ -880,6 +882,7 @@ void getboottime(struct timespec *ts) | |||
880 | 882 | ||
881 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | 883 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); |
882 | } | 884 | } |
885 | EXPORT_SYMBOL_GPL(getboottime); | ||
883 | 886 | ||
884 | /** | 887 | /** |
885 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | 888 | * monotonic_to_bootbased - Convert the monotonic time to boot based. |
@@ -889,6 +892,7 @@ void monotonic_to_bootbased(struct timespec *ts) | |||
889 | { | 892 | { |
890 | *ts = timespec_add_safe(*ts, total_sleep_time); | 893 | *ts = timespec_add_safe(*ts, total_sleep_time); |
891 | } | 894 | } |
895 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | ||
892 | 896 | ||
893 | unsigned long get_seconds(void) | 897 | unsigned long get_seconds(void) |
894 | { | 898 | { |
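
The one-line update_wall_time() fix guards the shift decrement so the accumulation loop keeps using the largest chunk that still fits in the remaining offset, instead of shrinking the chunk on every pass. A loose stand-alone model of the guarded loop (the values are arbitrary, and logarithmic_accumulation() is reduced to a subtraction):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long cycle_interval = 1000;
            unsigned long long offset = 17000;      /* pretend long delay */
            int shift = 4;                          /* chunk: 16 intervals */

            while (offset >= cycle_interval) {
                    if (offset >= (cycle_interval << shift)) {
                            offset -= cycle_interval << shift;
                            printf("accumulated %d intervals\n", 1 << shift);
                    }
                    if (offset < (cycle_interval << shift))
                            shift--;        /* only shrink when needed */
            }
            printf("leftover: %llu\n", offset);
            return 0;
    }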
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050c..1a4a7dd78777 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
228 | SEQ_printf(m, " event_handler: "); | 228 | SEQ_printf(m, " event_handler: "); |
229 | print_name_offset(m, dev->event_handler); | 229 | print_name_offset(m, dev->event_handler); |
230 | SEQ_printf(m, "\n"); | 230 | SEQ_printf(m, "\n"); |
231 | SEQ_printf(m, " retries: %lu\n", dev->retries); | ||
231 | } | 232 | } |
232 | 233 | ||
233 | static void timer_list_show_tickdevices(struct seq_file *m) | 234 | static void timer_list_show_tickdevices(struct seq_file *m) |
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v) | |||
257 | u64 now = ktime_to_ns(ktime_get()); | 258 | u64 now = ktime_to_ns(ktime_get()); |
258 | int cpu; | 259 | int cpu; |
259 | 260 | ||
260 | SEQ_printf(m, "Timer List Version: v0.5\n"); | 261 | SEQ_printf(m, "Timer List Version: v0.6\n"); |
261 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | 262 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); |
262 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | 263 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); |
263 | 264 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387f..aeb6a54f2771 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/perf_event.h> | 40 | #include <linux/perf_event.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/slab.h> | ||
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
44 | #include <asm/unistd.h> | 45 | #include <asm/unistd.h> |
@@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
880 | if (base->running_timer == timer) | 881 | if (base->running_timer == timer) |
881 | goto out; | 882 | goto out; |
882 | 883 | ||
884 | timer_stats_timer_clear_start_info(timer); | ||
883 | ret = 0; | 885 | ret = 0; |
884 | if (timer_pending(timer)) { | 886 | if (timer_pending(timer)) { |
885 | detach_timer(timer, 1); | 887 | detach_timer(timer, 1); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f289..13e13d428cd3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER | |||
27 | config HAVE_FUNCTION_GRAPH_FP_TEST | 27 | config HAVE_FUNCTION_GRAPH_FP_TEST |
28 | bool | 28 | bool |
29 | help | 29 | help |
30 | An arch may pass in a unique value (frame pointer) to both the | 30 | See Documentation/trace/ftrace-design.txt |
31 | entering and exiting of a function. On exit, the value is compared | ||
32 | and if it does not match, then it will panic the kernel. | ||
33 | 31 | ||
34 | config HAVE_FUNCTION_TRACE_MCOUNT_TEST | 32 | config HAVE_FUNCTION_TRACE_MCOUNT_TEST |
35 | bool | 33 | bool |
@@ -330,15 +328,6 @@ config BRANCH_TRACER | |||
330 | 328 | ||
331 | Say N if unsure. | 329 | Say N if unsure. |
332 | 330 | ||
333 | config POWER_TRACER | ||
334 | bool "Trace power consumption behavior" | ||
335 | depends on X86 | ||
336 | select GENERIC_TRACER | ||
337 | help | ||
338 | This tracer helps developers to analyze and optimize the kernel's | ||
339 | power management decisions, specifically the C-state and P-state | ||
340 | behavior. | ||
341 | |||
342 | config KSYM_TRACER | 331 | config KSYM_TRACER |
343 | bool "Trace read and write access on kernel memory locations" | 332 | bool "Trace read and write access on kernel memory locations" |
344 | depends on HAVE_HW_BREAKPOINT | 333 | depends on HAVE_HW_BREAKPOINT |
@@ -451,7 +440,7 @@ config BLK_DEV_IO_TRACE | |||
451 | 440 | ||
452 | config KPROBE_EVENT | 441 | config KPROBE_EVENT |
453 | depends on KPROBES | 442 | depends on KPROBES |
454 | depends on X86 | 443 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
455 | bool "Enable kprobes-based dynamic events" | 444 | bool "Enable kprobes-based dynamic events" |
456 | select TRACING | 445 | select TRACING |
457 | default y | 446 | default y |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d00c6fe23f54..78edc6490038 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o | |||
52 | obj-$(CONFIG_EVENT_TRACING) += trace_export.o | 52 | obj-$(CONFIG_EVENT_TRACING) += trace_export.o |
53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o | 53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o |
54 | ifeq ($(CONFIG_PERF_EVENTS),y) | 54 | ifeq ($(CONFIG_PERF_EVENTS),y) |
55 | obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o |
56 | endif | 56 | endif |
57 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 57 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
58 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 58 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d9d6206e0b14..b3bc91a3f510 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/percpu.h> | 21 | #include <linux/percpu.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | ||
24 | #include <linux/debugfs.h> | 25 | #include <linux/debugfs.h> |
25 | #include <linux/smp_lock.h> | 26 | #include <linux/smp_lock.h> |
26 | #include <linux/time.h> | 27 | #include <linux/time.h> |
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
540 | if (ret) | 541 | if (ret) |
541 | return ret; | 542 | return ret; |
542 | 543 | ||
543 | if (copy_to_user(arg, &buts, sizeof(buts))) | 544 | if (copy_to_user(arg, &buts, sizeof(buts))) { |
545 | blk_trace_remove(q); | ||
544 | return -EFAULT; | 546 | return -EFAULT; |
545 | 547 | } | |
546 | return 0; | 548 | return 0; |
547 | } | 549 | } |
548 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 550 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
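The blk_trace_setup() hunk above closes a leak: if copying the setup results back to userspace fails, the just-created trace state is now torn down via blk_trace_remove() before returning -EFAULT. A minimal userspace sketch of that undo-on-copy-failure pattern; setup_resource()/teardown_resource() are hypothetical stand-ins for the blktrace internals, and simulate_copy_failure models copy_to_user() failing.

#include <errno.h>
#include <string.h>

struct buts { int buf_size; int buf_nr; };

static struct buts current_setup;
static int setup_done;

/* Pretend these create and destroy kernel-side trace state. */
static int setup_resource(const struct buts *req)
{
        current_setup = *req;
        setup_done = 1;
        return 0;
}

static void teardown_resource(void)    /* the blk_trace_remove() analogue */
{
        setup_done = 0;
}

static int do_setup(struct buts *user_arg, int simulate_copy_failure)
{
        struct buts buts = { .buf_size = 4096, .buf_nr = 16 };
        int ret = setup_resource(&buts);

        if (ret)
                return ret;

        if (simulate_copy_failure || !user_arg) {
                /* The caller never learns the setup succeeded, so undo
                 * it rather than leak it -- the fix the hunk makes. */
                teardown_resource();
                return -EFAULT;
        }
        memcpy(user_arg, &buts, sizeof(buts));
        return 0;
}

int main(void)
{
        struct buts out;

        return do_setup(&out, 1) == -EFAULT ? 0 : 1;
}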
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1904797f4a8a..2404b59b3097 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -24,9 +24,11 @@ | |||
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/ftrace.h> | 25 | #include <linux/ftrace.h> |
26 | #include <linux/sysctl.h> | 26 | #include <linux/sysctl.h> |
27 | #include <linux/slab.h> | ||
27 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
28 | #include <linux/list.h> | 29 | #include <linux/list.h> |
29 | #include <linux/hash.h> | 30 | #include <linux/hash.h> |
31 | #include <linux/rcupdate.h> | ||
30 | 32 | ||
31 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
32 | 34 | ||
@@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | |||
84 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 86 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
85 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 87 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
86 | 88 | ||
87 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 89 | /* |
88 | static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); | 90 | * Traverse the ftrace_list, invoking all entries. The reason that we |
89 | #endif | 91 | * can use rcu_dereference_raw() is that elements removed from this list |
90 | 92 | * are simply leaked, so there is no need to interact with a grace-period | |
93 | * mechanism. The rcu_dereference_raw() calls are needed to handle | ||
94 | * concurrent insertions into the ftrace_list. | ||
95 | * | ||
96 | * Silly Alpha and silly pointer-speculation compiler optimizations! | ||
97 | */ | ||
91 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) | 98 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) |
92 | { | 99 | { |
93 | struct ftrace_ops *op = ftrace_list; | 100 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /* see above */ |
94 | |||
95 | /* in case someone actually ports this to alpha! */ | ||
96 | read_barrier_depends(); | ||
97 | 101 | ||
98 | while (op != &ftrace_list_end) { | 102 | while (op != &ftrace_list_end) { |
99 | /* silly alpha */ | ||
100 | read_barrier_depends(); | ||
101 | op->func(ip, parent_ip); | 103 | op->func(ip, parent_ip); |
102 | op = op->next; | 104 | op = rcu_dereference_raw(op->next); /* see above */ |
103 | }; | 105 | }; |
104 | } | 106 | } |
105 | 107 | ||
@@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
154 | * the ops->next pointer is valid before another CPU sees | 156 | * the ops->next pointer is valid before another CPU sees |
155 | * the ops pointer included into the ftrace_list. | 157 | * the ops pointer included into the ftrace_list. |
156 | */ | 158 | */ |
157 | smp_wmb(); | 159 | rcu_assign_pointer(ftrace_list, ops); |
158 | ftrace_list = ops; | ||
159 | 160 | ||
160 | if (ftrace_enabled) { | 161 | if (ftrace_enabled) { |
161 | ftrace_func_t func; | 162 | ftrace_func_t func; |
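The two ftrace.c hunks above replace an open-coded smp_wmb() publish and per-step read_barrier_depends() with rcu_assign_pointer() and rcu_dereference_raw(). A rough userspace model of the same publish/subscribe ordering using C11 atomics; the release store plays the role of rcu_assign_pointer(), and all names here are illustrative rather than kernel APIs.

#include <stdatomic.h>
#include <stdio.h>

struct ops {
        void (*func)(unsigned long ip);
        struct ops *next;
};

static struct ops ops_list_end;              /* sentinel, like ftrace_list_end */
static _Atomic(struct ops *) ops_list = &ops_list_end;

static void register_ops(struct ops *op)
{
        /* Fully initialise the node first... */
        op->next = atomic_load_explicit(&ops_list, memory_order_relaxed);
        /* ...then publish it; the release barrier takes the place of
         * the smp_wmb() the hunk removes. */
        atomic_store_explicit(&ops_list, op, memory_order_release);
}

static void call_all(unsigned long ip)
{
        /* Acquire load: models rcu_dereference_raw() on the list head. */
        struct ops *op = atomic_load_explicit(&ops_list, memory_order_acquire);

        while (op != &ops_list_end) {
                op->func(ip);
                op = op->next;
        }
}

static void hello(unsigned long ip)
{
        printf("traced %#lx\n", ip);
}

int main(void)
{
        static struct ops my_ops = { .func = hello };

        register_ops(&my_ops);
        call_all(0x1234);
        return 0;
}

The list is only ever appended to (removed entries are simply leaked, per the new comment), which is why no grace-period machinery is needed before reuse.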
@@ -2276,6 +2277,8 @@ __setup("ftrace_filter=", set_ftrace_filter); | |||
2276 | 2277 | ||
2277 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 2278 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
2278 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | 2279 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; |
2280 | static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); | ||
2281 | |||
2279 | static int __init set_graph_function(char *str) | 2282 | static int __init set_graph_function(char *str) |
2280 | { | 2283 | { |
2281 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); | 2284 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); |
@@ -2402,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
2402 | static DEFINE_MUTEX(graph_lock); | 2405 | static DEFINE_MUTEX(graph_lock); |
2403 | 2406 | ||
2404 | int ftrace_graph_count; | 2407 | int ftrace_graph_count; |
2408 | int ftrace_graph_filter_enabled; | ||
2405 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; | 2409 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; |
2406 | 2410 | ||
2407 | static void * | 2411 | static void * |
@@ -2424,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) | |||
2424 | mutex_lock(&graph_lock); | 2428 | mutex_lock(&graph_lock); |
2425 | 2429 | ||
2426 | /* Nothing, tell g_show to print all functions are enabled */ | 2430 | /* Nothing, tell g_show to print all functions are enabled */ |
2427 | if (!ftrace_graph_count && !*pos) | 2431 | if (!ftrace_graph_filter_enabled && !*pos) |
2428 | return (void *)1; | 2432 | return (void *)1; |
2429 | 2433 | ||
2430 | return __g_next(m, pos); | 2434 | return __g_next(m, pos); |
@@ -2470,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) | |||
2470 | mutex_lock(&graph_lock); | 2474 | mutex_lock(&graph_lock); |
2471 | if ((file->f_mode & FMODE_WRITE) && | 2475 | if ((file->f_mode & FMODE_WRITE) && |
2472 | (file->f_flags & O_TRUNC)) { | 2476 | (file->f_flags & O_TRUNC)) { |
2477 | ftrace_graph_filter_enabled = 0; | ||
2473 | ftrace_graph_count = 0; | 2478 | ftrace_graph_count = 0; |
2474 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); | 2479 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); |
2475 | } | 2480 | } |
@@ -2495,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2495 | struct dyn_ftrace *rec; | 2500 | struct dyn_ftrace *rec; |
2496 | struct ftrace_page *pg; | 2501 | struct ftrace_page *pg; |
2497 | int search_len; | 2502 | int search_len; |
2498 | int found = 0; | 2503 | int fail = 1; |
2499 | int type, not; | 2504 | int type, not; |
2500 | char *search; | 2505 | char *search; |
2501 | bool exists; | 2506 | bool exists; |
@@ -2506,37 +2511,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2506 | 2511 | ||
2507 | /* decode regex */ | 2512 | /* decode regex */ |
2508 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 2513 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
2509 | if (not) | 2514 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
2510 | return -EINVAL; | 2515 | return -EBUSY; |
2511 | 2516 | ||
2512 | search_len = strlen(search); | 2517 | search_len = strlen(search); |
2513 | 2518 | ||
2514 | mutex_lock(&ftrace_lock); | 2519 | mutex_lock(&ftrace_lock); |
2515 | do_for_each_ftrace_rec(pg, rec) { | 2520 | do_for_each_ftrace_rec(pg, rec) { |
2516 | 2521 | ||
2517 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
2518 | break; | ||
2519 | |||
2520 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 2522 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) |
2521 | continue; | 2523 | continue; |
2522 | 2524 | ||
2523 | if (ftrace_match_record(rec, search, search_len, type)) { | 2525 | if (ftrace_match_record(rec, search, search_len, type)) { |
2524 | /* ensure it is not already in the array */ | 2526 | /* check if it is already in the array */ |
2525 | exists = false; | 2527 | exists = false; |
2526 | for (i = 0; i < *idx; i++) | 2528 | for (i = 0; i < *idx; i++) { |
2527 | if (array[i] == rec->ip) { | 2529 | if (array[i] == rec->ip) { |
2528 | exists = true; | 2530 | exists = true; |
2529 | break; | 2531 | break; |
2530 | } | 2532 | } |
2531 | if (!exists) | 2533 | } |
2532 | array[(*idx)++] = rec->ip; | 2534 | |
2533 | found = 1; | 2535 | if (!not) { |
2536 | fail = 0; | ||
2537 | if (!exists) { | ||
2538 | array[(*idx)++] = rec->ip; | ||
2539 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
2540 | goto out; | ||
2541 | } | ||
2542 | } else { | ||
2543 | if (exists) { | ||
2544 | array[i] = array[--(*idx)]; | ||
2545 | array[*idx] = 0; | ||
2546 | fail = 0; | ||
2547 | } | ||
2548 | } | ||
2534 | } | 2549 | } |
2535 | } while_for_each_ftrace_rec(); | 2550 | } while_for_each_ftrace_rec(); |
2536 | 2551 | out: | |
2537 | mutex_unlock(&ftrace_lock); | 2552 | mutex_unlock(&ftrace_lock); |
2538 | 2553 | ||
2539 | return found ? 0 : -EINVAL; | 2554 | if (fail) |
2555 | return -EINVAL; | ||
2556 | |||
2557 | ftrace_graph_filter_enabled = 1; | ||
2558 | return 0; | ||
2540 | } | 2559 | } |
2541 | 2560 | ||
2542 | static ssize_t | 2561 | static ssize_t |
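The rewritten ftrace_set_func() above gains support for negated patterns: a plain pattern appends matches until the array is full, while a '!' pattern removes an existing entry by swapping the last element into its slot, and -EINVAL is returned only when nothing matched. A standalone sketch of that dense fixed-array add/remove logic, with illustrative names rather than the kernel's:

#include <stdio.h>

#define MAX_FUNCS 32                    /* like FTRACE_GRAPH_MAX_FUNCS */

static unsigned long funcs[MAX_FUNCS];
static int count;

static int filter_update(unsigned long ip, int not)
{
        int i, exists = 0;

        for (i = 0; i < count; i++) {
                if (funcs[i] == ip) {
                        exists = 1;
                        break;
                }
        }

        if (!not) {
                if (count >= MAX_FUNCS)
                        return -1;      /* the up-front -EBUSY check */
                if (!exists)
                        funcs[count++] = ip;
                return 0;
        }

        if (!exists)
                return -1;              /* nothing matched: -EINVAL */

        /* Swap-with-last deletion, as in "array[i] = array[--(*idx)]". */
        funcs[i] = funcs[--count];
        funcs[count] = 0;
        return 0;
}

int main(void)
{
        filter_update(0x100, 0);
        filter_update(0x200, 0);
        filter_update(0x100, 1);        /* remove: 0x200 swaps into slot 0 */
        printf("count=%d first=%#lx\n", count, funcs[0]);
        return 0;
}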
@@ -2546,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, | |||
2546 | struct trace_parser parser; | 2565 | struct trace_parser parser; |
2547 | ssize_t read, ret; | 2566 | ssize_t read, ret; |
2548 | 2567 | ||
2549 | if (!cnt || cnt < 0) | 2568 | if (!cnt) |
2550 | return 0; | 2569 | return 0; |
2551 | 2570 | ||
2552 | mutex_lock(&graph_lock); | 2571 | mutex_lock(&graph_lock); |
2553 | 2572 | ||
2554 | if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { | ||
2555 | ret = -EBUSY; | ||
2556 | goto out_unlock; | ||
2557 | } | ||
2558 | |||
2559 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { | 2573 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { |
2560 | ret = -ENOMEM; | 2574 | ret = -ENOMEM; |
2561 | goto out_unlock; | 2575 | goto out_unlock; |
@@ -3340,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
3340 | { | 3354 | { |
3341 | /* Make sure we do not use the parent ret_stack */ | 3355 | /* Make sure we do not use the parent ret_stack */ |
3342 | t->ret_stack = NULL; | 3356 | t->ret_stack = NULL; |
3357 | t->curr_ret_stack = -1; | ||
3343 | 3358 | ||
3344 | if (ftrace_graph_active) { | 3359 | if (ftrace_graph_active) { |
3345 | struct ftrace_ret_stack *ret_stack; | 3360 | struct ftrace_ret_stack *ret_stack; |
@@ -3349,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
3349 | GFP_KERNEL); | 3364 | GFP_KERNEL); |
3350 | if (!ret_stack) | 3365 | if (!ret_stack) |
3351 | return; | 3366 | return; |
3352 | t->curr_ret_stack = -1; | ||
3353 | atomic_set(&t->tracing_graph_pause, 0); | 3367 | atomic_set(&t->tracing_graph_pause, 0); |
3354 | atomic_set(&t->trace_overrun, 0); | 3368 | atomic_set(&t->trace_overrun, 0); |
3355 | t->ftrace_timestamp = 0; | 3369 | t->ftrace_timestamp = 0; |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565b01e6..a22582a06161 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/workqueue.h> | 9 | #include <linux/workqueue.h> |
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/slab.h> | ||
13 | 12 | ||
14 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
15 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801b..41ca394feb22 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -14,12 +14,14 @@ | |||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/percpu.h> | 15 | #include <linux/percpu.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/slab.h> | ||
17 | #include <linux/init.h> | 18 | #include <linux/init.h> |
18 | #include <linux/hash.h> | 19 | #include <linux/hash.h> |
19 | #include <linux/list.h> | 20 | #include <linux/list.h> |
20 | #include <linux/cpu.h> | 21 | #include <linux/cpu.h> |
21 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
22 | 23 | ||
24 | #include <asm/local.h> | ||
23 | #include "trace.h" | 25 | #include "trace.h" |
24 | 26 | ||
25 | /* | 27 | /* |
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); | |||
206 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 208 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
207 | #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ | 209 | #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ |
208 | 210 | ||
211 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
212 | # define RB_FORCE_8BYTE_ALIGNMENT 0 | ||
213 | # define RB_ARCH_ALIGNMENT RB_ALIGNMENT | ||
214 | #else | ||
215 | # define RB_FORCE_8BYTE_ALIGNMENT 1 | ||
216 | # define RB_ARCH_ALIGNMENT 8U | ||
217 | #endif | ||
218 | |||
209 | /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ | 219 | /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ |
210 | #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX | 220 | #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX |
211 | 221 | ||
@@ -464,6 +474,8 @@ struct ring_buffer_iter { | |||
464 | struct ring_buffer_per_cpu *cpu_buffer; | 474 | struct ring_buffer_per_cpu *cpu_buffer; |
465 | unsigned long head; | 475 | unsigned long head; |
466 | struct buffer_page *head_page; | 476 | struct buffer_page *head_page; |
477 | struct buffer_page *cache_reader_page; | ||
478 | unsigned long cache_read; | ||
467 | u64 read_stamp; | 479 | u64 read_stamp; |
468 | }; | 480 | }; |
469 | 481 | ||
@@ -1198,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1198 | 1210 | ||
1199 | for (i = 0; i < nr_pages; i++) { | 1211 | for (i = 0; i < nr_pages; i++) { |
1200 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1212 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) |
1201 | return; | 1213 | goto out; |
1202 | p = cpu_buffer->pages->next; | 1214 | p = cpu_buffer->pages->next; |
1203 | bpage = list_entry(p, struct buffer_page, list); | 1215 | bpage = list_entry(p, struct buffer_page, list); |
1204 | list_del_init(&bpage->list); | 1216 | list_del_init(&bpage->list); |
1205 | free_buffer_page(bpage); | 1217 | free_buffer_page(bpage); |
1206 | } | 1218 | } |
1207 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1219 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) |
1208 | return; | 1220 | goto out; |
1209 | 1221 | ||
1210 | rb_reset_cpu(cpu_buffer); | 1222 | rb_reset_cpu(cpu_buffer); |
1211 | rb_check_pages(cpu_buffer); | 1223 | rb_check_pages(cpu_buffer); |
1212 | 1224 | ||
1225 | out: | ||
1213 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1226 | spin_unlock_irq(&cpu_buffer->reader_lock); |
1214 | } | 1227 | } |
1215 | 1228 | ||
@@ -1226,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1226 | 1239 | ||
1227 | for (i = 0; i < nr_pages; i++) { | 1240 | for (i = 0; i < nr_pages; i++) { |
1228 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1241 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) |
1229 | return; | 1242 | goto out; |
1230 | p = pages->next; | 1243 | p = pages->next; |
1231 | bpage = list_entry(p, struct buffer_page, list); | 1244 | bpage = list_entry(p, struct buffer_page, list); |
1232 | list_del_init(&bpage->list); | 1245 | list_del_init(&bpage->list); |
@@ -1235,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1235 | rb_reset_cpu(cpu_buffer); | 1248 | rb_reset_cpu(cpu_buffer); |
1236 | rb_check_pages(cpu_buffer); | 1249 | rb_check_pages(cpu_buffer); |
1237 | 1250 | ||
1251 | out: | ||
1238 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1252 | spin_unlock_irq(&cpu_buffer->reader_lock); |
1239 | } | 1253 | } |
1240 | 1254 | ||
@@ -1544,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event, | |||
1544 | 1558 | ||
1545 | case 0: | 1559 | case 0: |
1546 | length -= RB_EVNT_HDR_SIZE; | 1560 | length -= RB_EVNT_HDR_SIZE; |
1547 | if (length > RB_MAX_SMALL_DATA) | 1561 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) |
1548 | event->array[0] = length; | 1562 | event->array[0] = length; |
1549 | else | 1563 | else |
1550 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | 1564 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); |
@@ -1719,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length) | |||
1719 | if (!length) | 1733 | if (!length) |
1720 | length = 1; | 1734 | length = 1; |
1721 | 1735 | ||
1722 | if (length > RB_MAX_SMALL_DATA) | 1736 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) |
1723 | length += sizeof(event.array[0]); | 1737 | length += sizeof(event.array[0]); |
1724 | 1738 | ||
1725 | length += RB_EVNT_HDR_SIZE; | 1739 | length += RB_EVNT_HDR_SIZE; |
1726 | length = ALIGN(length, RB_ALIGNMENT); | 1740 | length = ALIGN(length, RB_ARCH_ALIGNMENT); |
1727 | 1741 | ||
1728 | return length; | 1742 | return length; |
1729 | } | 1743 | } |
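With RB_FORCE_8BYTE_ALIGNMENT in effect (64-bit architectures without efficient unaligned access), every event now stores its size in array[0] and the reserved length is rounded up to 8 bytes instead of 4. A small worked model of rb_calculate_event_length() under those definitions; the constants mirror the hunks above, but this is an illustration, not the ring-buffer implementation.

#include <stdio.h>

#define RB_EVNT_HDR_SIZE   4U           /* packed type_len + time-delta word */
#define RB_ALIGNMENT       4U
#define RB_MAX_SMALL_DATA  (RB_ALIGNMENT * 28)  /* 28 == type_len max */

/* Set to 1 to model a 64-bit arch without efficient unaligned access. */
#define RB_FORCE_8BYTE_ALIGNMENT 1
#define RB_ARCH_ALIGNMENT (RB_FORCE_8BYTE_ALIGNMENT ? 8U : RB_ALIGNMENT)

#define ALIGN_UP(x, a) (((x) + ((a) - 1)) & ~((a) - 1))

static unsigned int event_length(unsigned int length)
{
        if (!length)
                length = 1;
        /* Large payloads -- or all payloads when forcing 8-byte
         * alignment -- carry their size in event->array[0]. */
        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                length += sizeof(unsigned int);  /* event.array[0] */
        length += RB_EVNT_HDR_SIZE;
        return ALIGN_UP(length, RB_ARCH_ALIGNMENT);
}

int main(void)
{
        /* 10-byte payload: 10 + 4 (array[0]) + 4 (header) = 18 -> 24. */
        printf("10-byte payload reserves %u bytes\n", event_length(10));
        return 0;
}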
@@ -2230,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2230 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2244 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2231 | return NULL; | 2245 | return NULL; |
2232 | 2246 | ||
2233 | if (atomic_read(&buffer->record_disabled)) | ||
2234 | return NULL; | ||
2235 | |||
2236 | /* If we are tracing schedule, we don't want to recurse */ | 2247 | /* If we are tracing schedule, we don't want to recurse */ |
2237 | resched = ftrace_preempt_disable(); | 2248 | resched = ftrace_preempt_disable(); |
2238 | 2249 | ||
2250 | if (atomic_read(&buffer->record_disabled)) | ||
2251 | goto out_nocheck; | ||
2252 | |||
2239 | if (trace_recursive_lock()) | 2253 | if (trace_recursive_lock()) |
2240 | goto out_nocheck; | 2254 | goto out_nocheck; |
2241 | 2255 | ||
@@ -2467,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2467 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2481 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2468 | return -EBUSY; | 2482 | return -EBUSY; |
2469 | 2483 | ||
2470 | if (atomic_read(&buffer->record_disabled)) | ||
2471 | return -EBUSY; | ||
2472 | |||
2473 | resched = ftrace_preempt_disable(); | 2484 | resched = ftrace_preempt_disable(); |
2474 | 2485 | ||
2486 | if (atomic_read(&buffer->record_disabled)) | ||
2487 | goto out; | ||
2488 | |||
2475 | cpu = raw_smp_processor_id(); | 2489 | cpu = raw_smp_processor_id(); |
2476 | 2490 | ||
2477 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2491 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
@@ -2539,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); | |||
2539 | * @buffer: The ring buffer to enable writes | 2553 | * @buffer: The ring buffer to enable writes |
2540 | * | 2554 | * |
2541 | * Note, multiple disables will need the same number of enables | 2555 | * Note, multiple disables will need the same number of enables |
2542 | * to truely enable the writing (much like preempt_disable). | 2556 | * to truly enable the writing (much like preempt_disable). |
2543 | */ | 2557 | */ |
2544 | void ring_buffer_record_enable(struct ring_buffer *buffer) | 2558 | void ring_buffer_record_enable(struct ring_buffer *buffer) |
2545 | { | 2559 | { |
@@ -2575,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); | |||
2575 | * @cpu: The CPU to enable. | 2589 | * @cpu: The CPU to enable. |
2576 | * | 2590 | * |
2577 | * Note, multiple disables will need the same number of enables | 2591 | * Note, multiple disables will need the same number of enables |
2578 | * to truely enable the writing (much like preempt_disable). | 2592 | * to truly enable the writing (much like preempt_disable). |
2579 | */ | 2593 | */ |
2580 | void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | 2594 | void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) |
2581 | { | 2595 | { |
@@ -2716,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) | |||
2716 | iter->read_stamp = cpu_buffer->read_stamp; | 2730 | iter->read_stamp = cpu_buffer->read_stamp; |
2717 | else | 2731 | else |
2718 | iter->read_stamp = iter->head_page->page->time_stamp; | 2732 | iter->read_stamp = iter->head_page->page->time_stamp; |
2733 | iter->cache_reader_page = cpu_buffer->reader_page; | ||
2734 | iter->cache_read = cpu_buffer->read; | ||
2719 | } | 2735 | } |
2720 | 2736 | ||
2721 | /** | 2737 | /** |
@@ -3060,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3060 | struct ring_buffer_event *event; | 3076 | struct ring_buffer_event *event; |
3061 | int nr_loops = 0; | 3077 | int nr_loops = 0; |
3062 | 3078 | ||
3063 | if (ring_buffer_iter_empty(iter)) | ||
3064 | return NULL; | ||
3065 | |||
3066 | cpu_buffer = iter->cpu_buffer; | 3079 | cpu_buffer = iter->cpu_buffer; |
3067 | buffer = cpu_buffer->buffer; | 3080 | buffer = cpu_buffer->buffer; |
3068 | 3081 | ||
3082 | /* | ||
3083 | * Check if someone performed a consuming read to | ||
3084 | * the buffer. A consuming read invalidates the iterator | ||
3085 | * and we need to reset the iterator in this case. | ||
3086 | */ | ||
3087 | if (unlikely(iter->cache_read != cpu_buffer->read || | ||
3088 | iter->cache_reader_page != cpu_buffer->reader_page)) | ||
3089 | rb_iter_reset(iter); | ||
3090 | |||
3069 | again: | 3091 | again: |
3092 | if (ring_buffer_iter_empty(iter)) | ||
3093 | return NULL; | ||
3094 | |||
3070 | /* | 3095 | /* |
3071 | * We repeat when a timestamp is encountered. | 3096 | * We repeat when a timestamp is encountered. |
3072 | * We can get multiple timestamps by nested interrupts or also | 3097 | * We can get multiple timestamps by nested interrupts or also |
@@ -3081,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3081 | if (rb_per_cpu_empty(cpu_buffer)) | 3106 | if (rb_per_cpu_empty(cpu_buffer)) |
3082 | return NULL; | 3107 | return NULL; |
3083 | 3108 | ||
3109 | if (iter->head >= local_read(&iter->head_page->page->commit)) { | ||
3110 | rb_inc_iter(iter); | ||
3111 | goto again; | ||
3112 | } | ||
3113 | |||
3084 | event = rb_iter_head_event(iter); | 3114 | event = rb_iter_head_event(iter); |
3085 | 3115 | ||
3086 | switch (event->type_len) { | 3116 | switch (event->type_len) { |
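The rb_iter_peek() changes above make the iterator snapshot the reader page and read count when it is reset, then re-sync itself whenever a consuming read has invalidated those cached values. A standalone model of that generation-style invalidation, with a single counter standing in for the cache_read/cache_reader_page pair:

#include <stdio.h>

struct buffer {
        int data[64];
        int head;               /* next slot a consuming read takes */
        int tail;               /* next free slot */
        unsigned long read_gen; /* bumped by every consuming read */
};

struct iter {
        struct buffer *b;
        int pos;
        unsigned long cached_gen;
};

static void iter_reset(struct iter *it)
{
        it->pos = it->b->head;
        it->cached_gen = it->b->read_gen;
}

static int iter_peek(struct iter *it, int *out)
{
        /* A consuming read invalidated the iterator: re-sync, as the
         * new check in rb_iter_peek() does. */
        if (it->cached_gen != it->b->read_gen)
                iter_reset(it);

        if (it->pos >= it->b->tail)
                return 0;       /* empty */
        *out = it->b->data[it->pos++];
        return 1;
}

static int consume(struct buffer *b, int *out)
{
        if (b->head >= b->tail)
                return 0;
        *out = b->data[b->head++];
        b->read_gen++;          /* what invalidates live iterators */
        return 1;
}

int main(void)
{
        struct buffer b = { .data = {1, 2, 3}, .tail = 3 };
        struct iter it = { .b = &b };
        int v;

        iter_reset(&it);
        iter_peek(&it, &v);     /* v == 1 */
        consume(&b, &v);        /* consuming read bumps read_gen */
        iter_peek(&it, &v);     /* iterator re-syncs; v == 2, not stale */
        printf("peeked %d after the consuming read\n", v);
        return 0;
}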
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/time.h> | 10 | #include <linux/time.h> |
11 | #include <asm/local.h> | ||
11 | 12 | ||
12 | struct rb_page { | 13 | struct rb_page { |
13 | u64 ts; | 14 | u64 ts; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9e..44f916a04065 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -32,10 +32,11 @@ | |||
32 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
33 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
34 | #include <linux/string.h> | 34 | #include <linux/string.h> |
35 | #include <linux/rwsem.h> | ||
36 | #include <linux/slab.h> | ||
35 | #include <linux/ctype.h> | 37 | #include <linux/ctype.h> |
36 | #include <linux/init.h> | 38 | #include <linux/init.h> |
37 | #include <linux/poll.h> | 39 | #include <linux/poll.h> |
38 | #include <linux/gfp.h> | ||
39 | #include <linux/fs.h> | 40 | #include <linux/fs.h> |
40 | 41 | ||
41 | #include "trace.h" | 42 | #include "trace.h" |
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); | |||
91 | static inline void ftrace_disable_cpu(void) | 92 | static inline void ftrace_disable_cpu(void) |
92 | { | 93 | { |
93 | preempt_disable(); | 94 | preempt_disable(); |
94 | __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); | 95 | __this_cpu_inc(ftrace_cpu_disabled); |
95 | } | 96 | } |
96 | 97 | ||
97 | static inline void ftrace_enable_cpu(void) | 98 | static inline void ftrace_enable_cpu(void) |
98 | { | 99 | { |
99 | __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); | 100 | __this_cpu_dec(ftrace_cpu_disabled); |
100 | preempt_enable(); | 101 | preempt_enable(); |
101 | } | 102 | } |
102 | 103 | ||
103 | static cpumask_var_t __read_mostly tracing_buffer_mask; | 104 | static cpumask_var_t __read_mostly tracing_buffer_mask; |
104 | 105 | ||
105 | /* Define which cpu buffers are currently read in trace_pipe */ | ||
106 | static cpumask_var_t tracing_reader_cpumask; | ||
107 | |||
108 | #define for_each_tracing_cpu(cpu) \ | 106 | #define for_each_tracing_cpu(cpu) \ |
109 | for_each_cpu(cpu, tracing_buffer_mask) | 107 | for_each_cpu(cpu, tracing_buffer_mask) |
110 | 108 | ||
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; | |||
243 | 241 | ||
244 | /* | 242 | /* |
245 | * trace_types_lock is used to protect the trace_types list. | 243 | * trace_types_lock is used to protect the trace_types list. |
246 | * This lock is also used to keep user access serialized. | ||
247 | * Accesses from userspace will grab this lock while userspace | ||
248 | * activities happen inside the kernel. | ||
249 | */ | 244 | */ |
250 | static DEFINE_MUTEX(trace_types_lock); | 245 | static DEFINE_MUTEX(trace_types_lock); |
251 | 246 | ||
247 | /* | ||
248 | * serialize access to the ring buffer | ||
249 | * | ||
250 | * The ring buffer serializes readers, but that is only low-level protection. | ||
251 | * The validity of the events (as returned by ring_buffer_peek() etc.) | ||
252 | * is not protected by the ring buffer. | ||
253 | * | ||
254 | * The content of events may become garbage if we allow other processes to | ||
255 | * consume these events concurrently: | ||
256 | * A) the page of the consumed events may become a normal page | ||
257 | * (not a reader page) in the ring buffer, and this page will be rewritten | ||
258 | * by the events producer. | ||
259 | * B) The page of the consumed events may become a page for splice_read, | ||
260 | * and this page will be returned to the system. | ||
261 | * | ||
262 | * These primitives allow multiple processes to access different cpu ring | ||
263 | * buffers concurrently. | ||
264 | * | ||
265 | * These primitives don't distinguish read-only from read-consume access. | ||
266 | * Multiple read-only accesses are also serialized. | ||
267 | */ | ||
268 | |||
269 | #ifdef CONFIG_SMP | ||
270 | static DECLARE_RWSEM(all_cpu_access_lock); | ||
271 | static DEFINE_PER_CPU(struct mutex, cpu_access_lock); | ||
272 | |||
273 | static inline void trace_access_lock(int cpu) | ||
274 | { | ||
275 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
276 | /* gain it for accessing the whole ring buffer. */ | ||
277 | down_write(&all_cpu_access_lock); | ||
278 | } else { | ||
279 | /* gain it for accessing a cpu ring buffer. */ | ||
280 | |||
281 | /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ | ||
282 | down_read(&all_cpu_access_lock); | ||
283 | |||
284 | /* Secondly block other access to this @cpu ring buffer. */ | ||
285 | mutex_lock(&per_cpu(cpu_access_lock, cpu)); | ||
286 | } | ||
287 | } | ||
288 | |||
289 | static inline void trace_access_unlock(int cpu) | ||
290 | { | ||
291 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
292 | up_write(&all_cpu_access_lock); | ||
293 | } else { | ||
294 | mutex_unlock(&per_cpu(cpu_access_lock, cpu)); | ||
295 | up_read(&all_cpu_access_lock); | ||
296 | } | ||
297 | } | ||
298 | |||
299 | static inline void trace_access_lock_init(void) | ||
300 | { | ||
301 | int cpu; | ||
302 | |||
303 | for_each_possible_cpu(cpu) | ||
304 | mutex_init(&per_cpu(cpu_access_lock, cpu)); | ||
305 | } | ||
306 | |||
307 | #else | ||
308 | |||
309 | static DEFINE_MUTEX(access_lock); | ||
310 | |||
311 | static inline void trace_access_lock(int cpu) | ||
312 | { | ||
313 | (void)cpu; | ||
314 | mutex_lock(&access_lock); | ||
315 | } | ||
316 | |||
317 | static inline void trace_access_unlock(int cpu) | ||
318 | { | ||
319 | (void)cpu; | ||
320 | mutex_unlock(&access_lock); | ||
321 | } | ||
322 | |||
323 | static inline void trace_access_lock_init(void) | ||
324 | { | ||
325 | } | ||
326 | |||
327 | #endif | ||
328 | |||
252 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ | 329 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ |
253 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | 330 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); |
254 | 331 | ||
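The comment block above describes a two-level scheme: a whole-buffer reader takes all_cpu_access_lock for write, while a per-cpu reader takes it for read plus that cpu's mutex, so readers of different cpu buffers never serialize against each other. A userspace sketch of the same idea with pthreads; NCPUS and ALL_CPUS are illustrative values, not kernel definitions.

#include <pthread.h>

#define NCPUS    8
#define ALL_CPUS (-1)           /* like TRACE_PIPE_ALL_CPU */

static pthread_rwlock_t all_cpu_access = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpu_access[NCPUS] = {
        [0 ... NCPUS - 1] = PTHREAD_MUTEX_INITIALIZER  /* GNU range init */
};

static void trace_access_lock(int cpu)
{
        if (cpu == ALL_CPUS) {
                /* Gain exclusive access to every cpu buffer at once. */
                pthread_rwlock_wrlock(&all_cpu_access);
        } else {
                /* First block whole-buffer users... */
                pthread_rwlock_rdlock(&all_cpu_access);
                /* ...then other users of this cpu's buffer. */
                pthread_mutex_lock(&cpu_access[cpu]);
        }
}

static void trace_access_unlock(int cpu)
{
        if (cpu == ALL_CPUS) {
                pthread_rwlock_unlock(&all_cpu_access);
        } else {
                pthread_mutex_unlock(&cpu_access[cpu]);
                pthread_rwlock_unlock(&all_cpu_access);
        }
}

int main(void)
{
        trace_access_lock(3);   /* two different cpus would not contend */
        trace_access_unlock(3);
        trace_access_lock(ALL_CPUS);
        trace_access_unlock(ALL_CPUS);
        return 0;
}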
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str) | |||
297 | } | 374 | } |
298 | __setup("trace_buf_size=", set_buf_size); | 375 | __setup("trace_buf_size=", set_buf_size); |
299 | 376 | ||
377 | static int __init set_tracing_thresh(char *str) | ||
378 | { | ||
379 | unsigned long threshhold; | ||
380 | int ret; | ||
381 | |||
382 | if (!str) | ||
383 | return 0; | ||
384 | ret = strict_strtoul(str, 0, &threshhold); | ||
385 | if (ret < 0) | ||
386 | return 0; | ||
387 | tracing_thresh = threshhold * 1000; | ||
388 | return 1; | ||
389 | } | ||
390 | __setup("tracing_thresh=", set_tracing_thresh); | ||
391 | |||
300 | unsigned long nsecs_to_usecs(unsigned long nsecs) | 392 | unsigned long nsecs_to_usecs(unsigned long nsecs) |
301 | { | 393 | { |
302 | return nsecs / 1000; | 394 | return nsecs / 1000; |
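set_tracing_thresh() above parses the new tracing_thresh= boot parameter and scales the value by 1000 before storing it. A userspace model of that parser; strict_strtoul() is approximated with strtoul() plus a full-consumption check, and the names are illustrative.

#include <stdio.h>
#include <stdlib.h>

static unsigned long tracing_thresh;    /* stored scaled, as in the hunk */

static int set_tracing_thresh(const char *str)
{
        unsigned long thresh;
        char *end;

        if (!str || !*str)
                return 0;               /* reject, as the kernel does */
        thresh = strtoul(str, &end, 0);
        if (*end)
                return 0;               /* trailing junk: reject */
        tracing_thresh = thresh * 1000; /* scale by 1000 before storing */
        return 1;
}

int main(void)
{
        if (set_tracing_thresh("250"))
                printf("tracing_thresh = %lu\n", tracing_thresh);
        return 0;
}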
@@ -502,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
502 | static arch_spinlock_t ftrace_max_lock = | 594 | static arch_spinlock_t ftrace_max_lock = |
503 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 595 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
504 | 596 | ||
597 | unsigned long __read_mostly tracing_thresh; | ||
598 | |||
505 | #ifdef CONFIG_TRACER_MAX_TRACE | 599 | #ifdef CONFIG_TRACER_MAX_TRACE |
506 | unsigned long __read_mostly tracing_max_latency; | 600 | unsigned long __read_mostly tracing_max_latency; |
507 | unsigned long __read_mostly tracing_thresh; | ||
508 | 601 | ||
509 | /* | 602 | /* |
510 | * Copy the new maximum trace into the separate maximum-trace | 603 | * Copy the new maximum trace into the separate maximum-trace |
@@ -515,7 +608,7 @@ static void | |||
515 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | 608 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) |
516 | { | 609 | { |
517 | struct trace_array_cpu *data = tr->data[cpu]; | 610 | struct trace_array_cpu *data = tr->data[cpu]; |
518 | struct trace_array_cpu *max_data = tr->data[cpu]; | 611 | struct trace_array_cpu *max_data; |
519 | 612 | ||
520 | max_tr.cpu = cpu; | 613 | max_tr.cpu = cpu; |
521 | max_tr.time_start = data->preempt_timestamp; | 614 | max_tr.time_start = data->preempt_timestamp; |
@@ -525,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
525 | max_data->critical_start = data->critical_start; | 618 | max_data->critical_start = data->critical_start; |
526 | max_data->critical_end = data->critical_end; | 619 | max_data->critical_end = data->critical_end; |
527 | 620 | ||
528 | memcpy(data->comm, tsk->comm, TASK_COMM_LEN); | 621 | memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); |
529 | max_data->pid = tsk->pid; | 622 | max_data->pid = tsk->pid; |
530 | max_data->uid = task_uid(tsk); | 623 | max_data->uid = task_uid(tsk); |
531 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; | 624 | max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; |
@@ -747,10 +840,10 @@ out: | |||
747 | mutex_unlock(&trace_types_lock); | 840 | mutex_unlock(&trace_types_lock); |
748 | } | 841 | } |
749 | 842 | ||
750 | static void __tracing_reset(struct trace_array *tr, int cpu) | 843 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) |
751 | { | 844 | { |
752 | ftrace_disable_cpu(); | 845 | ftrace_disable_cpu(); |
753 | ring_buffer_reset_cpu(tr->buffer, cpu); | 846 | ring_buffer_reset_cpu(buffer, cpu); |
754 | ftrace_enable_cpu(); | 847 | ftrace_enable_cpu(); |
755 | } | 848 | } |
756 | 849 | ||
@@ -762,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
762 | 855 | ||
763 | /* Make sure all commits have finished */ | 856 | /* Make sure all commits have finished */ |
764 | synchronize_sched(); | 857 | synchronize_sched(); |
765 | __tracing_reset(tr, cpu); | 858 | __tracing_reset(buffer, cpu); |
766 | 859 | ||
767 | ring_buffer_record_enable(buffer); | 860 | ring_buffer_record_enable(buffer); |
768 | } | 861 | } |
@@ -780,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
780 | tr->time_start = ftrace_now(tr->cpu); | 873 | tr->time_start = ftrace_now(tr->cpu); |
781 | 874 | ||
782 | for_each_online_cpu(cpu) | 875 | for_each_online_cpu(cpu) |
783 | __tracing_reset(tr, cpu); | 876 | __tracing_reset(buffer, cpu); |
784 | 877 | ||
785 | ring_buffer_record_enable(buffer); | 878 | ring_buffer_record_enable(buffer); |
786 | } | 879 | } |
@@ -857,6 +950,8 @@ void tracing_start(void) | |||
857 | goto out; | 950 | goto out; |
858 | } | 951 | } |
859 | 952 | ||
953 | /* Prevent the buffers from switching */ | ||
954 | arch_spin_lock(&ftrace_max_lock); | ||
860 | 955 | ||
861 | buffer = global_trace.buffer; | 956 | buffer = global_trace.buffer; |
862 | if (buffer) | 957 | if (buffer) |
@@ -866,6 +961,8 @@ void tracing_start(void) | |||
866 | if (buffer) | 961 | if (buffer) |
867 | ring_buffer_record_enable(buffer); | 962 | ring_buffer_record_enable(buffer); |
868 | 963 | ||
964 | arch_spin_unlock(&ftrace_max_lock); | ||
965 | |||
869 | ftrace_start(); | 966 | ftrace_start(); |
870 | out: | 967 | out: |
871 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 968 | spin_unlock_irqrestore(&tracing_start_lock, flags); |
@@ -887,6 +984,9 @@ void tracing_stop(void) | |||
887 | if (trace_stop_count++) | 984 | if (trace_stop_count++) |
888 | goto out; | 985 | goto out; |
889 | 986 | ||
987 | /* Prevent the buffers from switching */ | ||
988 | arch_spin_lock(&ftrace_max_lock); | ||
989 | |||
890 | buffer = global_trace.buffer; | 990 | buffer = global_trace.buffer; |
891 | if (buffer) | 991 | if (buffer) |
892 | ring_buffer_record_disable(buffer); | 992 | ring_buffer_record_disable(buffer); |
@@ -895,6 +995,8 @@ void tracing_stop(void) | |||
895 | if (buffer) | 995 | if (buffer) |
896 | ring_buffer_record_disable(buffer); | 996 | ring_buffer_record_disable(buffer); |
897 | 997 | ||
998 | arch_spin_unlock(&ftrace_max_lock); | ||
999 | |||
898 | out: | 1000 | out: |
899 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 1001 | spin_unlock_irqrestore(&tracing_start_lock, flags); |
900 | } | 1002 | } |
@@ -951,6 +1053,11 @@ void trace_find_cmdline(int pid, char comm[]) | |||
951 | return; | 1053 | return; |
952 | } | 1054 | } |
953 | 1055 | ||
1056 | if (WARN_ON_ONCE(pid < 0)) { | ||
1057 | strcpy(comm, "<XXX>"); | ||
1058 | return; | ||
1059 | } | ||
1060 | |||
954 | if (pid > PID_MAX_DEFAULT) { | 1061 | if (pid > PID_MAX_DEFAULT) { |
955 | strcpy(comm, "<...>"); | 1062 | strcpy(comm, "<...>"); |
956 | return; | 1063 | return; |
@@ -1084,7 +1191,7 @@ trace_function(struct trace_array *tr, | |||
1084 | struct ftrace_entry *entry; | 1191 | struct ftrace_entry *entry; |
1085 | 1192 | ||
1086 | /* If we are reading the ring buffer, don't trace */ | 1193 | /* If we are reading the ring buffer, don't trace */ |
1087 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 1194 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
1088 | return; | 1195 | return; |
1089 | 1196 | ||
1090 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), | 1197 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), |
@@ -1177,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1177 | if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) | 1284 | if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) |
1178 | return; | 1285 | return; |
1179 | 1286 | ||
1287 | /* | ||
1288 | * NMIs cannot handle page faults, even with fixups. | ||
1289 | * Saving the user stack can (and often does) fault. | ||
1290 | */ | ||
1291 | if (unlikely(in_nmi())) | ||
1292 | return; | ||
1293 | |||
1180 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1294 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
1181 | sizeof(*entry), flags, pc); | 1295 | sizeof(*entry), flags, pc); |
1182 | if (!event) | 1296 | if (!event) |
@@ -1315,8 +1429,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1315 | entry->fmt = fmt; | 1429 | entry->fmt = fmt; |
1316 | 1430 | ||
1317 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1431 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); |
1318 | if (!filter_check_discard(call, entry, buffer, event)) | 1432 | if (!filter_check_discard(call, entry, buffer, event)) { |
1319 | ring_buffer_unlock_commit(buffer, event); | 1433 | ring_buffer_unlock_commit(buffer, event); |
1434 | ftrace_trace_stack(buffer, flags, 6, pc); | ||
1435 | } | ||
1320 | 1436 | ||
1321 | out_unlock: | 1437 | out_unlock: |
1322 | arch_spin_unlock(&trace_buf_lock); | 1438 | arch_spin_unlock(&trace_buf_lock); |
@@ -1389,8 +1505,10 @@ int trace_array_vprintk(struct trace_array *tr, | |||
1389 | 1505 | ||
1390 | memcpy(&entry->buf, trace_buf, len); | 1506 | memcpy(&entry->buf, trace_buf, len); |
1391 | entry->buf[len] = '\0'; | 1507 | entry->buf[len] = '\0'; |
1392 | if (!filter_check_discard(call, entry, buffer, event)) | 1508 | if (!filter_check_discard(call, entry, buffer, event)) { |
1393 | ring_buffer_unlock_commit(buffer, event); | 1509 | ring_buffer_unlock_commit(buffer, event); |
1510 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | ||
1511 | } | ||
1394 | 1512 | ||
1395 | out_unlock: | 1513 | out_unlock: |
1396 | arch_spin_unlock(&trace_buf_lock); | 1514 | arch_spin_unlock(&trace_buf_lock); |
@@ -1580,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
1580 | } | 1698 | } |
1581 | 1699 | ||
1582 | /* | 1700 | /* |
1583 | * No necessary locking here. The worst thing which can | ||
1584 | * happen is loosing events consumed at the same time | ||
1585 | * by a trace_pipe reader. | ||
1586 | * Other than that, we don't risk to crash the ring buffer | ||
1587 | * because it serializes the readers. | ||
1588 | * | ||
1589 | * The current tracer is copied to avoid a global locking | 1701 | * The current tracer is copied to avoid a global locking |
1590 | * all around. | 1702 | * all around. |
1591 | */ | 1703 | */ |
@@ -1623,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1623 | 1735 | ||
1624 | ftrace_enable_cpu(); | 1736 | ftrace_enable_cpu(); |
1625 | 1737 | ||
1738 | iter->leftover = 0; | ||
1626 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1739 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1627 | ; | 1740 | ; |
1628 | 1741 | ||
@@ -1640,12 +1753,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1640 | } | 1753 | } |
1641 | 1754 | ||
1642 | trace_event_read_lock(); | 1755 | trace_event_read_lock(); |
1756 | trace_access_lock(cpu_file); | ||
1643 | return p; | 1757 | return p; |
1644 | } | 1758 | } |
1645 | 1759 | ||
1646 | static void s_stop(struct seq_file *m, void *p) | 1760 | static void s_stop(struct seq_file *m, void *p) |
1647 | { | 1761 | { |
1762 | struct trace_iterator *iter = m->private; | ||
1763 | |||
1648 | atomic_dec(&trace_record_cmdline_disabled); | 1764 | atomic_dec(&trace_record_cmdline_disabled); |
1765 | trace_access_unlock(iter->cpu_file); | ||
1649 | trace_event_read_unlock(); | 1766 | trace_event_read_unlock(); |
1650 | } | 1767 | } |
1651 | 1768 | ||
@@ -2836,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
2836 | 2953 | ||
2837 | mutex_lock(&trace_types_lock); | 2954 | mutex_lock(&trace_types_lock); |
2838 | 2955 | ||
2839 | /* We only allow one reader per cpu */ | ||
2840 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | ||
2841 | if (!cpumask_empty(tracing_reader_cpumask)) { | ||
2842 | ret = -EBUSY; | ||
2843 | goto out; | ||
2844 | } | ||
2845 | cpumask_setall(tracing_reader_cpumask); | ||
2846 | } else { | ||
2847 | if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) | ||
2848 | cpumask_set_cpu(cpu_file, tracing_reader_cpumask); | ||
2849 | else { | ||
2850 | ret = -EBUSY; | ||
2851 | goto out; | ||
2852 | } | ||
2853 | } | ||
2854 | |||
2855 | /* create a buffer to store the information to pass to userspace */ | 2956 | /* create a buffer to store the information to pass to userspace */ |
2856 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2957 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); |
2857 | if (!iter) { | 2958 | if (!iter) { |
@@ -2907,12 +3008,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
2907 | 3008 | ||
2908 | mutex_lock(&trace_types_lock); | 3009 | mutex_lock(&trace_types_lock); |
2909 | 3010 | ||
2910 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) | ||
2911 | cpumask_clear(tracing_reader_cpumask); | ||
2912 | else | ||
2913 | cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); | ||
2914 | |||
2915 | |||
2916 | if (iter->trace->pipe_close) | 3011 | if (iter->trace->pipe_close) |
2917 | iter->trace->pipe_close(iter); | 3012 | iter->trace->pipe_close(iter); |
2918 | 3013 | ||
@@ -3074,6 +3169,7 @@ waitagain: | |||
3074 | iter->pos = -1; | 3169 | iter->pos = -1; |
3075 | 3170 | ||
3076 | trace_event_read_lock(); | 3171 | trace_event_read_lock(); |
3172 | trace_access_lock(iter->cpu_file); | ||
3077 | while (find_next_entry_inc(iter) != NULL) { | 3173 | while (find_next_entry_inc(iter) != NULL) { |
3078 | enum print_line_t ret; | 3174 | enum print_line_t ret; |
3079 | int len = iter->seq.len; | 3175 | int len = iter->seq.len; |
@@ -3090,6 +3186,7 @@ waitagain: | |||
3090 | if (iter->seq.len >= cnt) | 3186 | if (iter->seq.len >= cnt) |
3091 | break; | 3187 | break; |
3092 | } | 3188 | } |
3189 | trace_access_unlock(iter->cpu_file); | ||
3093 | trace_event_read_unlock(); | 3190 | trace_event_read_unlock(); |
3094 | 3191 | ||
3095 | /* Now copy what we have to the user */ | 3192 | /* Now copy what we have to the user */ |
@@ -3215,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3215 | } | 3312 | } |
3216 | 3313 | ||
3217 | trace_event_read_lock(); | 3314 | trace_event_read_lock(); |
3315 | trace_access_lock(iter->cpu_file); | ||
3218 | 3316 | ||
3219 | /* Fill as many pages as possible. */ | 3317 | /* Fill as many pages as possible. */ |
3220 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { | 3318 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { |
@@ -3238,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3238 | trace_seq_init(&iter->seq); | 3336 | trace_seq_init(&iter->seq); |
3239 | } | 3337 | } |
3240 | 3338 | ||
3339 | trace_access_unlock(iter->cpu_file); | ||
3241 | trace_event_read_unlock(); | 3340 | trace_event_read_unlock(); |
3242 | mutex_unlock(&iter->mutex); | 3341 | mutex_unlock(&iter->mutex); |
3243 | 3342 | ||
@@ -3539,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3539 | 3638 | ||
3540 | info->read = 0; | 3639 | info->read = 0; |
3541 | 3640 | ||
3641 | trace_access_lock(info->cpu); | ||
3542 | ret = ring_buffer_read_page(info->tr->buffer, | 3642 | ret = ring_buffer_read_page(info->tr->buffer, |
3543 | &info->spare, | 3643 | &info->spare, |
3544 | count, | 3644 | count, |
3545 | info->cpu, 0); | 3645 | info->cpu, 0); |
3646 | trace_access_unlock(info->cpu); | ||
3546 | if (ret < 0) | 3647 | if (ret < 0) |
3547 | return 0; | 3648 | return 0; |
3548 | 3649 | ||
@@ -3670,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3670 | len &= PAGE_MASK; | 3771 | len &= PAGE_MASK; |
3671 | } | 3772 | } |
3672 | 3773 | ||
3774 | trace_access_lock(info->cpu); | ||
3673 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3775 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
3674 | 3776 | ||
3675 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { | 3777 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { |
@@ -3717,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3717 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3819 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
3718 | } | 3820 | } |
3719 | 3821 | ||
3822 | trace_access_unlock(info->cpu); | ||
3720 | spd.nr_pages = i; | 3823 | spd.nr_pages = i; |
3721 | 3824 | ||
3722 | /* did we read anything? */ | 3825 | /* did we read anything? */ |
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void) | |||
4153 | struct dentry *d_tracer; | 4256 | struct dentry *d_tracer; |
4154 | int cpu; | 4257 | int cpu; |
4155 | 4258 | ||
4259 | trace_access_lock_init(); | ||
4260 | |||
4156 | d_tracer = tracing_init_dentry(); | 4261 | d_tracer = tracing_init_dentry(); |
4157 | 4262 | ||
4158 | trace_create_file("tracing_enabled", 0644, d_tracer, | 4263 | trace_create_file("tracing_enabled", 0644, d_tracer, |
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void) | |||
4176 | #ifdef CONFIG_TRACER_MAX_TRACE | 4281 | #ifdef CONFIG_TRACER_MAX_TRACE |
4177 | trace_create_file("tracing_max_latency", 0644, d_tracer, | 4282 | trace_create_file("tracing_max_latency", 0644, d_tracer, |
4178 | &tracing_max_latency, &tracing_max_lat_fops); | 4283 | &tracing_max_latency, &tracing_max_lat_fops); |
4284 | #endif | ||
4179 | 4285 | ||
4180 | trace_create_file("tracing_thresh", 0644, d_tracer, | 4286 | trace_create_file("tracing_thresh", 0644, d_tracer, |
4181 | &tracing_thresh, &tracing_max_lat_fops); | 4287 | &tracing_thresh, &tracing_max_lat_fops); |
4182 | #endif | ||
4183 | 4288 | ||
4184 | trace_create_file("README", 0444, d_tracer, | 4289 | trace_create_file("README", 0444, d_tracer, |
4185 | NULL, &tracing_readme_fops); | 4290 | NULL, &tracing_readme_fops); |
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void) | |||
4387 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 4492 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4388 | goto out_free_buffer_mask; | 4493 | goto out_free_buffer_mask; |
4389 | 4494 | ||
4390 | if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) | ||
4391 | goto out_free_tracing_cpumask; | ||
4392 | |||
4393 | /* To save memory, keep the ring buffer size to its minimum */ | 4495 | /* To save memory, keep the ring buffer size to its minimum */ |
4394 | if (ring_buffer_expanded) | 4496 | if (ring_buffer_expanded) |
4395 | ring_buf_size = trace_buf_size; | 4497 | ring_buf_size = trace_buf_size; |
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void) | |||
4447 | return 0; | 4549 | return 0; |
4448 | 4550 | ||
4449 | out_free_cpumask: | 4551 | out_free_cpumask: |
4450 | free_cpumask_var(tracing_reader_cpumask); | ||
4451 | out_free_tracing_cpumask: | ||
4452 | free_cpumask_var(tracing_cpumask); | 4552 | free_cpumask_var(tracing_cpumask); |
4453 | out_free_buffer_mask: | 4553 | out_free_buffer_mask: |
4454 | free_cpumask_var(tracing_buffer_mask); | 4554 | free_cpumask_var(tracing_buffer_mask); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..2825ef2c0b15 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); | |||
396 | 396 | ||
397 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); | 397 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); |
398 | 398 | ||
399 | extern unsigned long tracing_thresh; | ||
400 | |||
399 | #ifdef CONFIG_TRACER_MAX_TRACE | 401 | #ifdef CONFIG_TRACER_MAX_TRACE |
400 | extern unsigned long tracing_max_latency; | 402 | extern unsigned long tracing_max_latency; |
401 | extern unsigned long tracing_thresh; | ||
402 | 403 | ||
403 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); | 404 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); |
404 | void update_max_tr_single(struct trace_array *tr, | 405 | void update_max_tr_single(struct trace_array *tr, |
@@ -497,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); | |||
497 | #ifdef CONFIG_DYNAMIC_FTRACE | 498 | #ifdef CONFIG_DYNAMIC_FTRACE |
498 | /* TODO: make this variable */ | 499 | /* TODO: make this variable */ |
499 | #define FTRACE_GRAPH_MAX_FUNCS 32 | 500 | #define FTRACE_GRAPH_MAX_FUNCS 32 |
501 | extern int ftrace_graph_filter_enabled; | ||
500 | extern int ftrace_graph_count; | 502 | extern int ftrace_graph_count; |
501 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; | 503 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; |
502 | 504 | ||
@@ -504,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
504 | { | 506 | { |
505 | int i; | 507 | int i; |
506 | 508 | ||
507 | if (!ftrace_graph_count || test_tsk_trace_graph(current)) | 509 | if (!ftrace_graph_filter_enabled) |
508 | return 1; | 510 | return 1; |
509 | 511 | ||
510 | for (i = 0; i < ftrace_graph_count; i++) { | 512 | for (i = 0; i < ftrace_graph_count; i++) { |
@@ -549,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task) | |||
549 | * struct trace_parser - serves for reading the user input separated by spaces | 551 |
550 | * @cont: set if the input is not complete - no final space char was found | 552 | * @cont: set if the input is not complete - no final space char was found |
551 | * @buffer: holds the parsed user input | 553 | * @buffer: holds the parsed user input |
552 | * @idx: user input lenght | 554 | * @idx: user input length |
553 | * @size: buffer size | 555 | * @size: buffer size |
554 | */ | 556 | */ |
555 | struct trace_parser { | 557 | struct trace_parser { |
@@ -791,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
791 | 793 | ||
792 | #undef FTRACE_ENTRY | 794 | #undef FTRACE_ENTRY |
793 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ | 795 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ |
794 | extern struct ftrace_event_call event_##call; | 796 | extern struct ftrace_event_call \ |
797 | __attribute__((__aligned__(4))) event_##call; | ||
795 | #undef FTRACE_ENTRY_DUP | 798 | #undef FTRACE_ENTRY_DUP |
796 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ | 799 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ |
797 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 800 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88c..b9bc4d470177 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) | |||
307 | return -1; | 307 | return -1; |
308 | if (percent_a > percent_b) | 308 | if (percent_a > percent_b) |
309 | return 1; | 309 | return 1; |
310 | else | 310 | |
311 | return 0; | 311 | if (a->incorrect < b->incorrect) |
312 | return -1; | ||
313 | if (a->incorrect > b->incorrect) | ||
314 | return 1; | ||
315 | |||
316 | /* | ||
317 | * Since the above shows worse (incorrect) cases | ||
318 | * first, we continue that by showing best (correct) | ||
319 | * cases last. | ||
320 | */ | ||
321 | if (a->correct > b->correct) | ||
322 | return -1; | ||
323 | if (a->correct < b->correct) | ||
324 | return 1; | ||
325 | |||
326 | return 0; | ||
312 | } | 327 | } |
313 | 328 | ||
314 | static struct tracer_stat annotated_branch_stats = { | 329 | static struct tracer_stat annotated_branch_stats = { |
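The comparator above now breaks percentage ties by the absolute incorrect count, and then by the correct count in the opposite direction, mirroring the added comment about showing the best cases last. A plain qsort() sketch of the same multi-key cascade on made-up data, not the kernel's tracer_stat types:

#include <stdio.h>
#include <stdlib.h>

struct branch_stat {
        unsigned long correct;
        unsigned long incorrect;
};

static unsigned long percent(const struct branch_stat *s)
{
        unsigned long total = s->correct + s->incorrect;

        return total ? s->incorrect * 100 / total : 0;
}

static int stat_cmp(const void *p1, const void *p2)
{
        const struct branch_stat *a = p1, *b = p2;
        unsigned long pa = percent(a), pb = percent(b);

        if (pa < pb)
                return -1;
        if (pa > pb)
                return 1;
        /* First tie-break: absolute number of incorrect predictions. */
        if (a->incorrect < b->incorrect)
                return -1;
        if (a->incorrect > b->incorrect)
                return 1;
        /* Second tie-break runs the other way, so the entries with
         * the most correct predictions end up last. */
        if (a->correct > b->correct)
                return -1;
        if (a->correct < b->correct)
                return 1;
        return 0;
}

int main(void)
{
        struct branch_stat s[] = { {90, 10}, {9, 1}, {900, 100} };

        qsort(s, 3, sizeof(s[0]), stat_cmp);
        /* All three miss 10% of the time; the counts break the ties. */
        printf("first entry: %lu correct / %lu incorrect\n",
               s[0].correct, s[0].incorrect);
        return 0;
}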
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072a..9d589d8dcd1a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -13,6 +13,7 @@ | |||
13 | * Tracer plugins will choose a default from these clocks. | 13 |
14 | */ | 14 | */ |
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
16 | #include <linux/irqflags.h> | ||
16 | #include <linux/hardirq.h> | 17 | #include <linux/hardirq.h> |
17 | #include <linux/module.h> | 18 | #include <linux/module.h> |
18 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void) | |||
83 | int this_cpu; | 84 | int this_cpu; |
84 | u64 now; | 85 | u64 now; |
85 | 86 | ||
86 | raw_local_irq_save(flags); | 87 | local_irq_save(flags); |
87 | 88 | ||
88 | this_cpu = raw_smp_processor_id(); | 89 | this_cpu = raw_smp_processor_id(); |
89 | now = cpu_clock(this_cpu); | 90 | now = cpu_clock(this_cpu); |
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void) | |||
109 | arch_spin_unlock(&trace_clock_struct.lock); | 110 | arch_spin_unlock(&trace_clock_struct.lock); |
110 | 111 | ||
111 | out: | 112 | out: |
112 | raw_local_irq_restore(flags); | 113 | local_irq_restore(flags); |
113 | 114 | ||
114 | return now; | 115 | return now; |
115 | } | 116 | } |
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c index f0d693005075..0565bb42566f 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -1,32 +1,41 @@ | |||
1 | /* | 1 | /* |
2 | * trace event based perf counter profiling | 2 | * trace event based perf event profiling/tracing |
3 | * | 3 | * |
4 | * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> | 4 | * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> |
5 | * | 5 | * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | 11 | ||
12 | DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); | ||
13 | EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); | ||
14 | |||
15 | EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); | ||
12 | 16 | ||
13 | static char *perf_trace_buf; | 17 | static char *perf_trace_buf; |
14 | static char *perf_trace_buf_nmi; | 18 | static char *perf_trace_buf_nmi; |
15 | 19 | ||
16 | typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; | 20 | /* |
21 | * Force it to be aligned to unsigned long to avoid misaligned accesses | ||
22 | * suprises | ||
23 | */ | ||
24 | typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | ||
25 | perf_trace_t; | ||
17 | 26 | ||
18 | /* Count the events in use (per event id, not per instance) */ | 27 | /* Count the events in use (per event id, not per instance) */ |
19 | static int total_profile_count; | 28 | static int total_ref_count; |
20 | 29 | ||
21 | static int ftrace_profile_enable_event(struct ftrace_event_call *event) | 30 | static int perf_trace_event_enable(struct ftrace_event_call *event) |
22 | { | 31 | { |
23 | char *buf; | 32 | char *buf; |
24 | int ret = -ENOMEM; | 33 | int ret = -ENOMEM; |
25 | 34 | ||
26 | if (event->profile_count++ > 0) | 35 | if (event->perf_refcount++ > 0) |
27 | return 0; | 36 | return 0; |
28 | 37 | ||
29 | if (!total_profile_count) { | 38 | if (!total_ref_count) { |
30 | buf = (char *)alloc_percpu(perf_trace_t); | 39 | buf = (char *)alloc_percpu(perf_trace_t); |
31 | if (!buf) | 40 | if (!buf) |
32 | goto fail_buf; | 41 | goto fail_buf; |
@@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) | |||
40 | rcu_assign_pointer(perf_trace_buf_nmi, buf); | 49 | rcu_assign_pointer(perf_trace_buf_nmi, buf); |
41 | } | 50 | } |
42 | 51 | ||
43 | ret = event->profile_enable(event); | 52 | ret = event->perf_event_enable(event); |
44 | if (!ret) { | 53 | if (!ret) { |
45 | total_profile_count++; | 54 | total_ref_count++; |
46 | return 0; | 55 | return 0; |
47 | } | 56 | } |
48 | 57 | ||
49 | fail_buf_nmi: | 58 | fail_buf_nmi: |
50 | if (!total_profile_count) { | 59 | if (!total_ref_count) { |
51 | free_percpu(perf_trace_buf_nmi); | 60 | free_percpu(perf_trace_buf_nmi); |
52 | free_percpu(perf_trace_buf); | 61 | free_percpu(perf_trace_buf); |
53 | perf_trace_buf_nmi = NULL; | 62 | perf_trace_buf_nmi = NULL; |
54 | perf_trace_buf = NULL; | 63 | perf_trace_buf = NULL; |
55 | } | 64 | } |
56 | fail_buf: | 65 | fail_buf: |
57 | event->profile_count--; | 66 | event->perf_refcount--; |
58 | 67 | ||
59 | return ret; | 68 | return ret; |
60 | } | 69 | } |
61 | 70 | ||
62 | int ftrace_profile_enable(int event_id) | 71 | int perf_trace_enable(int event_id) |
63 | { | 72 | { |
64 | struct ftrace_event_call *event; | 73 | struct ftrace_event_call *event; |
65 | int ret = -EINVAL; | 74 | int ret = -EINVAL; |
66 | 75 | ||
67 | mutex_lock(&event_mutex); | 76 | mutex_lock(&event_mutex); |
68 | list_for_each_entry(event, &ftrace_events, list) { | 77 | list_for_each_entry(event, &ftrace_events, list) { |
69 | if (event->id == event_id && event->profile_enable && | 78 | if (event->id == event_id && event->perf_event_enable && |
70 | try_module_get(event->mod)) { | 79 | try_module_get(event->mod)) { |
71 | ret = ftrace_profile_enable_event(event); | 80 | ret = perf_trace_event_enable(event); |
72 | break; | 81 | break; |
73 | } | 82 | } |
74 | } | 83 | } |
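perf_trace_event_enable()/perf_trace_enable() above implement a two-level refcount: each event counts its own users in perf_refcount, while total_ref_count lazily allocates the shared buffers for the first user anywhere and (in the disable path below) frees them with the last. A stripped-down userspace model of that lifecycle, with malloc() standing in for alloc_percpu():

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the shared per-cpu trace buffers. */
static char *shared_buf;
static int total_ref_count;

struct event {
	int refcount;
};

static int event_enable(struct event *ev)
{
	if (ev->refcount++ > 0)		/* already on: just count the user */
		return 0;

	if (!total_ref_count) {		/* first user system-wide: allocate */
		shared_buf = malloc(4096);
		if (!shared_buf)
			goto fail;
	}
	total_ref_count++;
	return 0;
fail:
	ev->refcount--;
	return -1;
}

static void event_disable(struct event *ev)
{
	if (--ev->refcount > 0)		/* other users remain on this event */
		return;

	if (!--total_ref_count) {	/* last user system-wide: free */
		free(shared_buf);
		shared_buf = NULL;
	}
}

int main(void)
{
	struct event a = {0}, b = {0};

	event_enable(&a);
	event_enable(&b);	/* buffer allocated once, counted twice */
	event_disable(&a);
	event_disable(&b);	/* buffer freed only here */
	printf("buf=%p refs=%d\n", (void *)shared_buf, total_ref_count);
	return 0;
}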
@@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id) | |||
77 | return ret; | 86 | return ret; |
78 | } | 87 | } |
79 | 88 | ||
80 | static void ftrace_profile_disable_event(struct ftrace_event_call *event) | 89 | static void perf_trace_event_disable(struct ftrace_event_call *event) |
81 | { | 90 | { |
82 | char *buf, *nmi_buf; | 91 | char *buf, *nmi_buf; |
83 | 92 | ||
84 | if (--event->profile_count > 0) | 93 | if (--event->perf_refcount > 0) |
85 | return; | 94 | return; |
86 | 95 | ||
87 | event->profile_disable(event); | 96 | event->perf_event_disable(event); |
88 | 97 | ||
89 | if (!--total_profile_count) { | 98 | if (!--total_ref_count) { |
90 | buf = perf_trace_buf; | 99 | buf = perf_trace_buf; |
91 | rcu_assign_pointer(perf_trace_buf, NULL); | 100 | rcu_assign_pointer(perf_trace_buf, NULL); |
92 | 101 | ||
@@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) | |||
104 | } | 113 | } |
105 | } | 114 | } |
106 | 115 | ||
107 | void ftrace_profile_disable(int event_id) | 116 | void perf_trace_disable(int event_id) |
108 | { | 117 | { |
109 | struct ftrace_event_call *event; | 118 | struct ftrace_event_call *event; |
110 | 119 | ||
111 | mutex_lock(&event_mutex); | 120 | mutex_lock(&event_mutex); |
112 | list_for_each_entry(event, &ftrace_events, list) { | 121 | list_for_each_entry(event, &ftrace_events, list) { |
113 | if (event->id == event_id) { | 122 | if (event->id == event_id) { |
114 | ftrace_profile_disable_event(event); | 123 | perf_trace_event_disable(event); |
115 | module_put(event->mod); | 124 | module_put(event->mod); |
116 | break; | 125 | break; |
117 | } | 126 | } |
@@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id) | |||
119 | mutex_unlock(&event_mutex); | 128 | mutex_unlock(&event_mutex); |
120 | } | 129 | } |
121 | 130 | ||
122 | __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, | 131 | __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, |
123 | int *rctxp, unsigned long *irq_flags) | 132 | int *rctxp, unsigned long *irq_flags) |
124 | { | 133 | { |
125 | struct trace_entry *entry; | 134 | struct trace_entry *entry; |
126 | char *trace_buf, *raw_data; | 135 | char *trace_buf, *raw_data; |
127 | int pc, cpu; | 136 | int pc, cpu; |
128 | 137 | ||
138 | BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); | ||
139 | |||
129 | pc = preempt_count(); | 140 | pc = preempt_count(); |
130 | 141 | ||
131 | /* Protect the per cpu buffer, begin the rcu read side */ | 142 | /* Protect the per cpu buffer, begin the rcu read side */ |
@@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, | |||
138 | cpu = smp_processor_id(); | 149 | cpu = smp_processor_id(); |
139 | 150 | ||
140 | if (in_nmi()) | 151 | if (in_nmi()) |
141 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | 152 | trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); |
142 | else | 153 | else |
143 | trace_buf = rcu_dereference(perf_trace_buf); | 154 | trace_buf = rcu_dereference_sched(perf_trace_buf); |
144 | 155 | ||
145 | if (!trace_buf) | 156 | if (!trace_buf) |
146 | goto err; | 157 | goto err; |
@@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, | |||
148 | raw_data = per_cpu_ptr(trace_buf, cpu); | 159 | raw_data = per_cpu_ptr(trace_buf, cpu); |
149 | 160 | ||
150 | /* zero the dead bytes from align to not leak stack to user */ | 161 | /* zero the dead bytes from align to not leak stack to user */ |
151 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | 162 | memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); |
152 | 163 | ||
153 | entry = (struct trace_entry *)raw_data; | 164 | entry = (struct trace_entry *)raw_data; |
154 | tracing_generic_entry_update(entry, *irq_flags, pc); | 165 | tracing_generic_entry_update(entry, *irq_flags, pc); |
@@ -161,4 +172,4 @@ err_recursion: | |||
161 | local_irq_restore(*irq_flags); | 172 | local_irq_restore(*irq_flags); |
162 | return NULL; | 173 | return NULL; |
163 | } | 174 | } |
164 | EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); | 175 | EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); |
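Two details of perf_trace_buf_prepare() are easy to miss. The rcu_dereference() calls become rcu_dereference_sched() because the buffers are protected by disabled preemption rather than an rcu_read_lock() section. And the tail-zeroing switches from a direct u64 store to memset(): with the buffer now only unsigned long aligned, an 8-byte store could be misaligned on 32-bit targets, while memset() makes no alignment assumption. A small sketch of the tail zeroing (the 48-byte size is made up):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	char raw[64];
	size_t size = 48;	/* hypothetical record length; the real
				 * callers size it so the last u64 covers
				 * exactly the alignment padding */

	memset(raw, 0xff, sizeof(raw));	/* simulate stack garbage */

	/* Zero the dead padding bytes so stack contents never leak to
	 * userspace; memset() is safe at any alignment, unlike
	 * *(uint64_t *)&raw[size - 8] = 0ULL. */
	memset(&raw[size - sizeof(uint64_t)], 0, sizeof(uint64_t));

	printf("last byte: %d\n", raw[size - 1]);
	return 0;
}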
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..c697c7043349 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/ctype.h> | 17 | #include <linux/ctype.h> |
18 | #include <linux/slab.h> | ||
18 | #include <linux/delay.h> | 19 | #include <linux/delay.h> |
19 | 20 | ||
20 | #include <asm/setup.h> | 21 | #include <asm/setup.h> |
@@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
60 | return 0; | 61 | return 0; |
61 | 62 | ||
62 | err: | 63 | err: |
63 | if (field) { | 64 | if (field) |
64 | kfree(field->name); | 65 | kfree(field->name); |
65 | kfree(field->type); | ||
66 | } | ||
67 | kfree(field); | 66 | kfree(field); |
68 | 67 | ||
69 | return -ENOMEM; | 68 | return -ENOMEM; |
@@ -520,41 +519,16 @@ out: | |||
520 | return ret; | 519 | return ret; |
521 | } | 520 | } |
522 | 521 | ||
523 | extern char *__bad_type_size(void); | ||
524 | |||
525 | #undef FIELD | ||
526 | #define FIELD(type, name) \ | ||
527 | sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ | ||
528 | #type, "common_" #name, offsetof(typeof(field), name), \ | ||
529 | sizeof(field.name), is_signed_type(type) | ||
530 | |||
531 | static int trace_write_header(struct trace_seq *s) | ||
532 | { | ||
533 | struct trace_entry field; | ||
534 | |||
535 | /* struct trace_entry */ | ||
536 | return trace_seq_printf(s, | ||
537 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
538 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
539 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
540 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
541 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
542 | "\n", | ||
543 | FIELD(unsigned short, type), | ||
544 | FIELD(unsigned char, flags), | ||
545 | FIELD(unsigned char, preempt_count), | ||
546 | FIELD(int, pid), | ||
547 | FIELD(int, lock_depth)); | ||
548 | } | ||
549 | |||
550 | static ssize_t | 522 | static ssize_t |
551 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | 523 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, |
552 | loff_t *ppos) | 524 | loff_t *ppos) |
553 | { | 525 | { |
554 | struct ftrace_event_call *call = filp->private_data; | 526 | struct ftrace_event_call *call = filp->private_data; |
527 | struct ftrace_event_field *field; | ||
555 | struct trace_seq *s; | 528 | struct trace_seq *s; |
529 | int common_field_count = 5; | ||
556 | char *buf; | 530 | char *buf; |
557 | int r; | 531 | int r = 0; |
558 | 532 | ||
559 | if (*ppos) | 533 | if (*ppos) |
560 | return 0; | 534 | return 0; |
@@ -565,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
565 | 539 | ||
566 | trace_seq_init(s); | 540 | trace_seq_init(s); |
567 | 541 | ||
568 | /* If any of the first writes fail, so will the show_format. */ | ||
569 | |||
570 | trace_seq_printf(s, "name: %s\n", call->name); | 542 | trace_seq_printf(s, "name: %s\n", call->name); |
571 | trace_seq_printf(s, "ID: %d\n", call->id); | 543 | trace_seq_printf(s, "ID: %d\n", call->id); |
572 | trace_seq_printf(s, "format:\n"); | 544 | trace_seq_printf(s, "format:\n"); |
573 | trace_write_header(s); | ||
574 | 545 | ||
575 | r = call->show_format(call, s); | 546 | list_for_each_entry_reverse(field, &call->fields, link) { |
547 | /* | ||
548 | * Smartly shows the array type (except dynamic arrays). | ||
549 | * Normal: | ||
550 | * field:TYPE VAR | ||
551 | * If TYPE := TYPE[LEN], it is shown: | ||
552 | * field:TYPE VAR[LEN] | ||
553 | */ | ||
554 | const char *array_descriptor = strchr(field->type, '['); | ||
555 | |||
556 | if (!strncmp(field->type, "__data_loc", 10)) | ||
557 | array_descriptor = NULL; | ||
558 | |||
559 | if (!array_descriptor) { | ||
560 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | ||
561 | "\tsize:%u;\tsigned:%d;\n", | ||
562 | field->type, field->name, field->offset, | ||
563 | field->size, !!field->is_signed); | ||
564 | } else { | ||
565 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | ||
566 | "\tsize:%u;\tsigned:%d;\n", | ||
567 | (int)(array_descriptor - field->type), | ||
568 | field->type, field->name, | ||
569 | array_descriptor, field->offset, | ||
570 | field->size, !!field->is_signed); | ||
571 | } | ||
572 | |||
573 | if (--common_field_count == 0) | ||
574 | r = trace_seq_printf(s, "\n"); | ||
575 | |||
576 | if (!r) | ||
577 | break; | ||
578 | } | ||
579 | |||
580 | if (r) | ||
581 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | ||
582 | call->print_fmt); | ||
583 | |||
576 | if (!r) { | 584 | if (!r) { |
577 | /* | 585 | /* |
578 | * ug! The format output is bigger than a PAGE!! | 586 | * ug! The format output is bigger than a PAGE!! |
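The new loop in event_format_read() replaces the static trace_write_header()/show_format pair with a walk of the field list, pretty-printing array types on the way: when a stored type string embeds a length, e.g. "char[16]", the "[LEN]" suffix is split off and re-attached after the variable name. A userspace sketch of that string surgery (the sample fields are invented):

#include <stdio.h>
#include <string.h>

static void print_field(const char *type, const char *name)
{
	const char *array_descriptor = strchr(type, '[');

	/* __data_loc fields encode dynamic arrays; leave them alone. */
	if (!strncmp(type, "__data_loc", 10))
		array_descriptor = NULL;

	if (!array_descriptor) {
		printf("\tfield:%s %s;\n", type, name);
	} else {
		/* "char[16] comm" becomes "char comm[16]" */
		printf("\tfield:%.*s %s%s;\n",
		       (int)(array_descriptor - type), type,
		       name, array_descriptor);
	}
}

int main(void)
{
	print_field("unsigned long", "ip");	 /* plain field */
	print_field("char[16]", "comm");	 /* fixed-size array */
	print_field("__data_loc char[]", "msg"); /* dynamic array */
	return 0;
}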
@@ -931,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
931 | trace_create_file("enable", 0644, call->dir, call, | 939 | trace_create_file("enable", 0644, call->dir, call, |
932 | enable); | 940 | enable); |
933 | 941 | ||
934 | if (call->id && call->profile_enable) | 942 | if (call->id && call->perf_event_enable) |
935 | trace_create_file("id", 0444, call->dir, call, | 943 | trace_create_file("id", 0444, call->dir, call, |
936 | id); | 944 | id); |
937 | 945 | ||
@@ -948,10 +956,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
948 | filter); | 956 | filter); |
949 | } | 957 | } |
950 | 958 | ||
951 | /* A trace may not want to export its format */ | ||
952 | if (!call->show_format) | ||
953 | return 0; | ||
954 | |||
955 | trace_create_file("format", 0444, call->dir, call, | 959 | trace_create_file("format", 0444, call->dir, call, |
956 | format); | 960 | format); |
957 | 961 | ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4615f62a04f1..88c0b6dbd7fe 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/ctype.h> | 22 | #include <linux/ctype.h> |
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/perf_event.h> | 24 | #include <linux/perf_event.h> |
25 | #include <linux/slab.h> | ||
25 | 26 | ||
26 | #include "trace.h" | 27 | #include "trace.h" |
27 | #include "trace_output.h" | 28 | #include "trace_output.h" |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
62 | 62 | ||
63 | #include "trace_entries.h" | 63 | #include "trace_entries.h" |
64 | 64 | ||
65 | |||
66 | #undef __field | ||
67 | #define __field(type, item) \ | ||
68 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
69 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
70 | offsetof(typeof(field), item), \ | ||
71 | sizeof(field.item), is_signed_type(type)); \ | ||
72 | if (!ret) \ | ||
73 | return 0; | ||
74 | |||
75 | #undef __field_desc | ||
76 | #define __field_desc(type, container, item) \ | ||
77 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
78 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
79 | offsetof(typeof(field), container.item), \ | ||
80 | sizeof(field.container.item), \ | ||
81 | is_signed_type(type)); \ | ||
82 | if (!ret) \ | ||
83 | return 0; | ||
84 | |||
85 | #undef __array | ||
86 | #define __array(type, item, len) \ | ||
87 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
88 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
89 | offsetof(typeof(field), item), \ | ||
90 | sizeof(field.item), is_signed_type(type)); \ | ||
91 | if (!ret) \ | ||
92 | return 0; | ||
93 | |||
94 | #undef __array_desc | ||
95 | #define __array_desc(type, container, item, len) \ | ||
96 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
97 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
98 | offsetof(typeof(field), container.item), \ | ||
99 | sizeof(field.container.item), \ | ||
100 | is_signed_type(type)); \ | ||
101 | if (!ret) \ | ||
102 | return 0; | ||
103 | |||
104 | #undef __dynamic_array | ||
105 | #define __dynamic_array(type, item) \ | ||
106 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
107 | "offset:%zu;\tsize:0;\tsigned:%u;\n", \ | ||
108 | offsetof(typeof(field), item), \ | ||
109 | is_signed_type(type)); \ | ||
110 | if (!ret) \ | ||
111 | return 0; | ||
112 | |||
113 | #undef F_printk | ||
114 | #define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) | ||
115 | |||
116 | #undef __entry | ||
117 | #define __entry REC | ||
118 | |||
119 | #undef FTRACE_ENTRY | ||
120 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | ||
121 | static int \ | ||
122 | ftrace_format_##name(struct ftrace_event_call *unused, \ | ||
123 | struct trace_seq *s) \ | ||
124 | { \ | ||
125 | struct struct_name field __attribute__((unused)); \ | ||
126 | int ret = 0; \ | ||
127 | \ | ||
128 | tstruct; \ | ||
129 | \ | ||
130 | trace_seq_printf(s, "\nprint fmt: " print); \ | ||
131 | \ | ||
132 | return ret; \ | ||
133 | } | ||
134 | |||
135 | #include "trace_entries.h" | ||
136 | |||
137 | #undef __field | 65 | #undef __field |
138 | #define __field(type, item) \ | 66 | #define __field(type, item) \ |
139 | ret = trace_define_field(event_call, #type, #item, \ | 67 | ret = trace_define_field(event_call, #type, #item, \ |
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ | |||
175 | return ret; | 103 | return ret; |
176 | 104 | ||
177 | #undef __dynamic_array | 105 | #undef __dynamic_array |
178 | #define __dynamic_array(type, item) | 106 | #define __dynamic_array(type, item) \ |
107 | ret = trace_define_field(event_call, #type, #item, \ | ||
108 | offsetof(typeof(field), item), \ | ||
109 | 0, is_signed_type(type), FILTER_OTHER);\ | ||
110 | if (ret) \ | ||
111 | return ret; | ||
179 | 112 | ||
180 | #undef FTRACE_ENTRY | 113 | #undef FTRACE_ENTRY |
181 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 114 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ |
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
198 | return 0; | 131 | return 0; |
199 | } | 132 | } |
200 | 133 | ||
134 | #undef __entry | ||
135 | #define __entry REC | ||
136 | |||
201 | #undef __field | 137 | #undef __field |
202 | #define __field(type, item) | 138 | #define __field(type, item) |
203 | 139 | ||
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
213 | #undef __dynamic_array | 149 | #undef __dynamic_array |
214 | #define __dynamic_array(type, item) | 150 | #define __dynamic_array(type, item) |
215 | 151 | ||
152 | #undef F_printk | ||
153 | #define F_printk(fmt, args...) #fmt ", " __stringify(args) | ||
154 | |||
216 | #undef FTRACE_ENTRY | 155 | #undef FTRACE_ENTRY |
217 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ | 156 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ |
218 | \ | 157 | \ |
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ | |||
223 | .id = type, \ | 162 | .id = type, \ |
224 | .system = __stringify(TRACE_SYSTEM), \ | 163 | .system = __stringify(TRACE_SYSTEM), \ |
225 | .raw_init = ftrace_raw_init_event, \ | 164 | .raw_init = ftrace_raw_init_event, \ |
226 | .show_format = ftrace_format_##call, \ | 165 | .print_fmt = print, \ |
227 | .define_fields = ftrace_define_fields_##call, \ | 166 | .define_fields = ftrace_define_fields_##call, \ |
228 | }; \ | 167 | }; \ |
229 | 168 | ||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37cf..9aed1a5cf553 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/debugfs.h> | 9 | #include <linux/debugfs.h> |
10 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
11 | #include <linux/ftrace.h> | 11 | #include <linux/ftrace.h> |
12 | #include <linux/slab.h> | ||
12 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
13 | 14 | ||
14 | #include "trace.h" | 15 | #include "trace.h" |
@@ -18,6 +19,7 @@ struct fgraph_cpu_data { | |||
18 | pid_t last_pid; | 19 | pid_t last_pid; |
19 | int depth; | 20 | int depth; |
20 | int ignore; | 21 | int ignore; |
22 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | ||
21 | }; | 23 | }; |
22 | 24 | ||
23 | struct fgraph_data { | 25 | struct fgraph_data { |
@@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr, | |||
187 | struct ring_buffer *buffer = tr->buffer; | 189 | struct ring_buffer *buffer = tr->buffer; |
188 | struct ftrace_graph_ent_entry *entry; | 190 | struct ftrace_graph_ent_entry *entry; |
189 | 191 | ||
190 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 192 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
191 | return 0; | 193 | return 0; |
192 | 194 | ||
193 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, | 195 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, |
@@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
212 | int cpu; | 214 | int cpu; |
213 | int pc; | 215 | int pc; |
214 | 216 | ||
215 | if (unlikely(!tr)) | ||
216 | return 0; | ||
217 | |||
218 | if (!ftrace_trace_task(current)) | 217 | if (!ftrace_trace_task(current)) |
219 | return 0; | 218 | return 0; |
220 | 219 | ||
221 | if (!ftrace_graph_addr(trace->func)) | 220 | /* trace it when it is nested in an enabled function, or is one itself. */ |
221 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | ||
222 | return 0; | 222 | return 0; |
223 | 223 | ||
224 | local_irq_save(flags); | 224 | local_irq_save(flags); |
@@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
231 | } else { | 231 | } else { |
232 | ret = 0; | 232 | ret = 0; |
233 | } | 233 | } |
234 | /* Only do the atomic if it is not already set */ | ||
235 | if (!test_tsk_trace_graph(current)) | ||
236 | set_tsk_trace_graph(current); | ||
237 | 234 | ||
238 | atomic_dec(&data->disabled); | 235 | atomic_dec(&data->disabled); |
239 | local_irq_restore(flags); | 236 | local_irq_restore(flags); |
@@ -241,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
241 | return ret; | 238 | return ret; |
242 | } | 239 | } |
243 | 240 | ||
241 | int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | ||
242 | { | ||
243 | if (tracing_thresh) | ||
244 | return 1; | ||
245 | else | ||
246 | return trace_graph_entry(trace); | ||
247 | } | ||
248 | |||
244 | static void __trace_graph_return(struct trace_array *tr, | 249 | static void __trace_graph_return(struct trace_array *tr, |
245 | struct ftrace_graph_ret *trace, | 250 | struct ftrace_graph_ret *trace, |
246 | unsigned long flags, | 251 | unsigned long flags, |
@@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr, | |||
251 | struct ring_buffer *buffer = tr->buffer; | 256 | struct ring_buffer *buffer = tr->buffer; |
252 | struct ftrace_graph_ret_entry *entry; | 257 | struct ftrace_graph_ret_entry *entry; |
253 | 258 | ||
254 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 259 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
255 | return; | 260 | return; |
256 | 261 | ||
257 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, | 262 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, |
@@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace) | |||
281 | pc = preempt_count(); | 286 | pc = preempt_count(); |
282 | __trace_graph_return(tr, trace, flags, pc); | 287 | __trace_graph_return(tr, trace, flags, pc); |
283 | } | 288 | } |
284 | if (!trace->depth) | ||
285 | clear_tsk_trace_graph(current); | ||
286 | atomic_dec(&data->disabled); | 289 | atomic_dec(&data->disabled); |
287 | local_irq_restore(flags); | 290 | local_irq_restore(flags); |
288 | } | 291 | } |
289 | 292 | ||
293 | void set_graph_array(struct trace_array *tr) | ||
294 | { | ||
295 | graph_array = tr; | ||
296 | |||
297 | /* Make graph_array visible before we start tracing */ | ||
298 | |||
299 | smp_mb(); | ||
300 | } | ||
301 | |||
302 | void trace_graph_thresh_return(struct ftrace_graph_ret *trace) | ||
303 | { | ||
304 | if (tracing_thresh && | ||
305 | (trace->rettime - trace->calltime < tracing_thresh)) | ||
306 | return; | ||
307 | else | ||
308 | trace_graph_return(trace); | ||
309 | } | ||
310 | |||
290 | static int graph_trace_init(struct trace_array *tr) | 311 | static int graph_trace_init(struct trace_array *tr) |
291 | { | 312 | { |
292 | int ret; | 313 | int ret; |
293 | 314 | ||
294 | graph_array = tr; | 315 | set_graph_array(tr); |
295 | ret = register_ftrace_graph(&trace_graph_return, | 316 | if (tracing_thresh) |
296 | &trace_graph_entry); | 317 | ret = register_ftrace_graph(&trace_graph_thresh_return, |
318 | &trace_graph_thresh_entry); | ||
319 | else | ||
320 | ret = register_ftrace_graph(&trace_graph_return, | ||
321 | &trace_graph_entry); | ||
297 | if (ret) | 322 | if (ret) |
298 | return ret; | 323 | return ret; |
299 | tracing_start_cmdline_record(); | 324 | tracing_start_cmdline_record(); |
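graph_trace_init() now selects its callback pair once, up front: with tracing_thresh set, the entry handler records nothing (it returns 1 only so the return hook stays armed) and the return handler emits an event only when the function ran at least as long as the threshold. A compressed model of that selection, with invented callback signatures:

#include <stdio.h>

typedef int  (*entry_fn)(unsigned long func);
typedef void (*ret_fn)(unsigned long func,
		       unsigned long long calltime,
		       unsigned long long rettime);

static unsigned long long tracing_thresh = 1000;	/* 0 = trace everything */

static int full_entry(unsigned long func)
{
	printf("enter %#lx\n", func);	/* record the entry event */
	return 1;
}

static void full_return(unsigned long func,
			unsigned long long calltime,
			unsigned long long rettime)
{
	printf("exit  %#lx after %llu\n", func, rettime - calltime);
}

static int thresh_entry(unsigned long func)
{
	(void)func;
	return 1;	/* keep the return hook armed, but record nothing */
}

static void thresh_return(unsigned long func,
			  unsigned long long calltime,
			  unsigned long long rettime)
{
	if (rettime - calltime < tracing_thresh)
		return;				/* too fast: drop it */
	full_return(func, calltime, rettime);
}

int main(void)
{
	/* Mirrors graph_trace_init(): choose the callback pair once. */
	entry_fn entry = tracing_thresh ? thresh_entry : full_entry;
	ret_fn   ret   = tracing_thresh ? thresh_return : full_return;

	entry(0x1234);
	ret(0x1234, 100, 2500);	/* 2400 >= 1000: reported */
	ret(0x1234, 100, 600);	/* 500 < 1000: dropped */
	return 0;
}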
@@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr) | |||
301 | return 0; | 326 | return 0; |
302 | } | 327 | } |
303 | 328 | ||
304 | void set_graph_array(struct trace_array *tr) | ||
305 | { | ||
306 | graph_array = tr; | ||
307 | } | ||
308 | |||
309 | static void graph_trace_reset(struct trace_array *tr) | 329 | static void graph_trace_reset(struct trace_array *tr) |
310 | { | 330 | { |
311 | tracing_stop_cmdline_record(); | 331 | tracing_stop_cmdline_record(); |
@@ -673,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
673 | duration = graph_ret->rettime - graph_ret->calltime; | 693 | duration = graph_ret->rettime - graph_ret->calltime; |
674 | 694 | ||
675 | if (data) { | 695 | if (data) { |
696 | struct fgraph_cpu_data *cpu_data; | ||
676 | int cpu = iter->cpu; | 697 | int cpu = iter->cpu; |
677 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 698 | |
699 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
678 | 700 | ||
679 | /* | 701 | /* |
680 | * Comments display at + 1 to depth. Since | 702 | * Comments display at + 1 to depth. Since |
681 | * this is a leaf function, keep the comments | 703 | * this is a leaf function, keep the comments |
682 | * equal to this depth. | 704 | * equal to this depth. |
683 | */ | 705 | */ |
684 | *depth = call->depth - 1; | 706 | cpu_data->depth = call->depth - 1; |
707 | |||
708 | /* No need to keep this function around for this depth */ | ||
709 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
710 | cpu_data->enter_funcs[call->depth] = 0; | ||
685 | } | 711 | } |
686 | 712 | ||
687 | /* Overhead */ | 713 | /* Overhead */ |
@@ -721,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
721 | int i; | 747 | int i; |
722 | 748 | ||
723 | if (data) { | 749 | if (data) { |
750 | struct fgraph_cpu_data *cpu_data; | ||
724 | int cpu = iter->cpu; | 751 | int cpu = iter->cpu; |
725 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | ||
726 | 752 | ||
727 | *depth = call->depth; | 753 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); |
754 | cpu_data->depth = call->depth; | ||
755 | |||
756 | /* Save this function pointer to see if the exit matches */ | ||
757 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
758 | cpu_data->enter_funcs[call->depth] = call->func; | ||
728 | } | 759 | } |
729 | 760 | ||
730 | /* No overhead */ | 761 | /* No overhead */ |
@@ -854,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
854 | struct fgraph_data *data = iter->private; | 885 | struct fgraph_data *data = iter->private; |
855 | pid_t pid = ent->pid; | 886 | pid_t pid = ent->pid; |
856 | int cpu = iter->cpu; | 887 | int cpu = iter->cpu; |
888 | int func_match = 1; | ||
857 | int ret; | 889 | int ret; |
858 | int i; | 890 | int i; |
859 | 891 | ||
860 | if (data) { | 892 | if (data) { |
893 | struct fgraph_cpu_data *cpu_data; | ||
861 | int cpu = iter->cpu; | 894 | int cpu = iter->cpu; |
862 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 895 | |
896 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
863 | 897 | ||
864 | /* | 898 | /* |
865 | * Comments display at + 1 to depth. This is the | 899 | * Comments display at + 1 to depth. This is the |
866 | * return from a function, we now want the comments | 900 | * return from a function, we now want the comments |
867 | * to display at the same level of the bracket. | 901 | * to display at the same level of the bracket. |
868 | */ | 902 | */ |
869 | *depth = trace->depth - 1; | 903 | cpu_data->depth = trace->depth - 1; |
904 | |||
905 | if (trace->depth < FTRACE_RETFUNC_DEPTH) { | ||
906 | if (cpu_data->enter_funcs[trace->depth] != trace->func) | ||
907 | func_match = 0; | ||
908 | cpu_data->enter_funcs[trace->depth] = 0; | ||
909 | } | ||
870 | } | 910 | } |
871 | 911 | ||
872 | if (print_graph_prologue(iter, s, 0, 0)) | 912 | if (print_graph_prologue(iter, s, 0, 0)) |
@@ -891,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
891 | return TRACE_TYPE_PARTIAL_LINE; | 931 | return TRACE_TYPE_PARTIAL_LINE; |
892 | } | 932 | } |
893 | 933 | ||
894 | ret = trace_seq_printf(s, "}\n"); | 934 | /* |
895 | if (!ret) | 935 | * If the return function does not have a matching entry, |
896 | return TRACE_TYPE_PARTIAL_LINE; | 936 | * then the entry was lost. Instead of just printing |
937 | * the '}' and letting the user guess what function this | ||
938 | * belongs to, write out the function name. | ||
939 | */ | ||
940 | if (func_match) { | ||
941 | ret = trace_seq_printf(s, "}\n"); | ||
942 | if (!ret) | ||
943 | return TRACE_TYPE_PARTIAL_LINE; | ||
944 | } else { | ||
945 | ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); | ||
946 | if (!ret) | ||
947 | return TRACE_TYPE_PARTIAL_LINE; | ||
948 | } | ||
897 | 949 | ||
898 | /* Overrun */ | 950 | /* Overrun */ |
899 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { | 951 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { |
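The enter_funcs[] additions let print_graph_return() detect a lost entry event: each nested entry stores its function pointer at its depth, the matching return compares and clears the slot, and on a mismatch the closing brace is annotated with the function name instead of leaving the reader to guess. A toy, single-CPU version of the matching logic:

#include <stdio.h>

#define RETFUNC_DEPTH 50

static unsigned long enter_funcs[RETFUNC_DEPTH];	/* per-cpu in the kernel */

static void on_entry(int depth, unsigned long func)
{
	if (depth < RETFUNC_DEPTH)
		enter_funcs[depth] = func;	/* remember who we entered */
}

static void on_return(int depth, unsigned long func)
{
	int func_match = 1;

	if (depth < RETFUNC_DEPTH) {
		if (enter_funcs[depth] != func)
			func_match = 0;		/* entry event was lost */
		enter_funcs[depth] = 0;
	}

	if (func_match)
		printf("}\n");
	else
		printf("} /* func %#lx */\n", func);	/* name the orphan */
}

int main(void)
{
	on_entry(0, 0xaaaa);
	on_return(0, 0xaaaa);	/* matched: plain brace */
	on_return(0, 0xbbbb);	/* no stored entry: annotated brace */
	return 0;
}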
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6178abf3637e..1251e367bae9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv) | |||
635 | event = strchr(group, '/') + 1; | 635 | event = strchr(group, '/') + 1; |
636 | event[-1] = '\0'; | 636 | event[-1] = '\0'; |
637 | if (strlen(group) == 0) { | 637 | if (strlen(group) == 0) { |
638 | pr_info("Group name is not specifiled\n"); | 638 | pr_info("Group name is not specified\n"); |
639 | return -EINVAL; | 639 | return -EINVAL; |
640 | } | 640 | } |
641 | } | 641 | } |
642 | if (strlen(event) == 0) { | 642 | if (strlen(event) == 0) { |
643 | pr_info("Event name is not specifiled\n"); | 643 | pr_info("Event name is not specified\n"); |
644 | return -EINVAL; | 644 | return -EINVAL; |
645 | } | 645 | } |
646 | } | 646 | } |
@@ -673,7 +673,7 @@ static int create_trace_probe(int argc, char **argv) | |||
673 | return -EINVAL; | 673 | return -EINVAL; |
674 | } | 674 | } |
675 | /* an address specified */ | 675 | /* an address specified */ |
676 | ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); | 676 | ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); |
677 | if (ret) { | 677 | if (ret) { |
678 | pr_info("Failed to parse address.\n"); | 678 | pr_info("Failed to parse address.\n"); |
679 | return ret; | 679 | return ret; |
@@ -1155,86 +1155,66 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1155 | return 0; | 1155 | return 0; |
1156 | } | 1156 | } |
1157 | 1157 | ||
1158 | static int __probe_event_show_format(struct trace_seq *s, | 1158 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) |
1159 | struct trace_probe *tp, const char *fmt, | ||
1160 | const char *arg) | ||
1161 | { | 1159 | { |
1162 | int i; | 1160 | int i; |
1161 | int pos = 0; | ||
1163 | 1162 | ||
1164 | /* Show format */ | 1163 | const char *fmt, *arg; |
1165 | if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) | ||
1166 | return 0; | ||
1167 | 1164 | ||
1168 | for (i = 0; i < tp->nr_args; i++) | 1165 | if (!probe_is_return(tp)) { |
1169 | if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) | 1166 | fmt = "(%lx)"; |
1170 | return 0; | 1167 | arg = "REC->" FIELD_STRING_IP; |
1168 | } else { | ||
1169 | fmt = "(%lx <- %lx)"; | ||
1170 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
1171 | } | ||
1171 | 1172 | ||
1172 | if (!trace_seq_printf(s, "\", %s", arg)) | 1173 | /* When len=0, we just calculate the needed length */ |
1173 | return 0; | 1174 | #define LEN_OR_ZERO (len ? len - pos : 0) |
1174 | 1175 | ||
1175 | for (i = 0; i < tp->nr_args; i++) | 1176 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); |
1176 | if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) | ||
1177 | return 0; | ||
1178 | 1177 | ||
1179 | return trace_seq_puts(s, "\n"); | 1178 | for (i = 0; i < tp->nr_args; i++) { |
1180 | } | 1179 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", |
1180 | tp->args[i].name); | ||
1181 | } | ||
1181 | 1182 | ||
1182 | #undef SHOW_FIELD | 1183 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
1183 | #define SHOW_FIELD(type, item, name) \ | ||
1184 | do { \ | ||
1185 | ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ | ||
1186 | "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ | ||
1187 | (unsigned int)offsetof(typeof(field), item),\ | ||
1188 | (unsigned int)sizeof(type), \ | ||
1189 | is_signed_type(type)); \ | ||
1190 | if (!ret) \ | ||
1191 | return 0; \ | ||
1192 | } while (0) | ||
1193 | 1184 | ||
1194 | static int kprobe_event_show_format(struct ftrace_event_call *call, | 1185 | for (i = 0; i < tp->nr_args; i++) { |
1195 | struct trace_seq *s) | 1186 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", |
1196 | { | 1187 | tp->args[i].name); |
1197 | struct kprobe_trace_entry field __attribute__((unused)); | 1188 | } |
1198 | int ret, i; | ||
1199 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1200 | |||
1201 | SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); | ||
1202 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | ||
1203 | 1189 | ||
1204 | /* Show fields */ | 1190 | #undef LEN_OR_ZERO |
1205 | for (i = 0; i < tp->nr_args; i++) | ||
1206 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | ||
1207 | trace_seq_puts(s, "\n"); | ||
1208 | 1191 | ||
1209 | return __probe_event_show_format(s, tp, "(%lx)", | 1192 | /* return the length of print_fmt */ |
1210 | "REC->" FIELD_STRING_IP); | 1193 | return pos; |
1211 | } | 1194 | } |
1212 | 1195 | ||
1213 | static int kretprobe_event_show_format(struct ftrace_event_call *call, | 1196 | static int set_print_fmt(struct trace_probe *tp) |
1214 | struct trace_seq *s) | ||
1215 | { | 1197 | { |
1216 | struct kretprobe_trace_entry field __attribute__((unused)); | 1198 | int len; |
1217 | int ret, i; | 1199 | char *print_fmt; |
1218 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1219 | 1200 | ||
1220 | SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); | 1201 | /* First: called with 0 length to calculate the needed length */ |
1221 | SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); | 1202 | len = __set_print_fmt(tp, NULL, 0); |
1222 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | 1203 | print_fmt = kmalloc(len + 1, GFP_KERNEL); |
1204 | if (!print_fmt) | ||
1205 | return -ENOMEM; | ||
1223 | 1206 | ||
1224 | /* Show fields */ | 1207 | /* Second: actually write the @print_fmt */ |
1225 | for (i = 0; i < tp->nr_args; i++) | 1208 | __set_print_fmt(tp, print_fmt, len + 1); |
1226 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | 1209 | tp->call.print_fmt = print_fmt; |
1227 | trace_seq_puts(s, "\n"); | ||
1228 | 1210 | ||
1229 | return __probe_event_show_format(s, tp, "(%lx <- %lx)", | 1211 | return 0; |
1230 | "REC->" FIELD_STRING_FUNC | ||
1231 | ", REC->" FIELD_STRING_RETIP); | ||
1232 | } | 1212 | } |
1233 | 1213 | ||
1234 | #ifdef CONFIG_PERF_EVENTS | 1214 | #ifdef CONFIG_PERF_EVENTS |
1235 | 1215 | ||
1236 | /* Kprobe profile handler */ | 1216 | /* Kprobe profile handler */ |
1237 | static __kprobes void kprobe_profile_func(struct kprobe *kp, | 1217 | static __kprobes void kprobe_perf_func(struct kprobe *kp, |
1238 | struct pt_regs *regs) | 1218 | struct pt_regs *regs) |
1239 | { | 1219 | { |
1240 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | 1220 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); |
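The __set_print_fmt()/set_print_fmt() rework above is the classic two-pass snprintf() sizing idiom: call once with a zero length to measure (snprintf() returns what it would have written), allocate exactly that, then call again to fill. LEN_OR_ZERO keeps the remaining-space argument from going negative on the measuring pass. A userspace rendition of the same pattern (the argument names are made up):

#include <stdio.h>
#include <stdlib.h>

static int build_fmt(char *buf, int len, const char **args, int nr_args)
{
	int i, pos = 0;

/* When len == 0 we only measure; never hand snprintf a negative size. */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < nr_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", args[i]);
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

#undef LEN_OR_ZERO
	return pos;	/* total length, excluding the trailing NUL */
}

int main(void)
{
	const char *args[] = { "ip", "arg1", "arg2" };
	int len = build_fmt(NULL, 0, args, 3);	/* pass 1: measure */
	char *fmt = malloc(len + 1);

	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1, args, 3);	/* pass 2: fill */
	printf("print fmt: %s (len=%d)\n", fmt, len);
	free(fmt);
	return 0;
}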
@@ -1247,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, | |||
1247 | __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); | 1227 | __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); |
1248 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1228 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1249 | size -= sizeof(u32); | 1229 | size -= sizeof(u32); |
1250 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 1230 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
1251 | "profile buffer not large enough")) | 1231 | "profile buffer not large enough")) |
1252 | return; | 1232 | return; |
1253 | 1233 | ||
1254 | entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); | 1234 | entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); |
1255 | if (!entry) | 1235 | if (!entry) |
1256 | return; | 1236 | return; |
1257 | 1237 | ||
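The size arithmetic in kprobe_perf_func() deserves a note: perf prepends a u32 size header to each record, so the payload is padded until header plus payload together end on a u64 boundary, then the header's 4 bytes are subtracted back out. A quick check of that arithmetic:

#include <stdio.h>

/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long payload;

	for (payload = 1; payload <= 24; payload += 7) {
		/* perf prepends a u32 size header; pad the payload so
		 * header + payload together end on a u64 boundary. */
		unsigned long size = ALIGN(payload + sizeof(unsigned int), 8)
				     - sizeof(unsigned int);

		printf("payload=%2lu -> size=%2lu (header+size %% 8 = %lu)\n",
		       payload, size,
		       (unsigned long)((size + sizeof(unsigned int)) % 8));
	}
	return 0;
}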
@@ -1260,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, | |||
1260 | for (i = 0; i < tp->nr_args; i++) | 1240 | for (i = 0; i < tp->nr_args; i++) |
1261 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | 1241 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); |
1262 | 1242 | ||
1263 | ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); | 1243 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); |
1264 | } | 1244 | } |
1265 | 1245 | ||
1266 | /* Kretprobe profile handler */ | 1246 | /* Kretprobe profile handler */ |
1267 | static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, | 1247 | static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, |
1268 | struct pt_regs *regs) | 1248 | struct pt_regs *regs) |
1269 | { | 1249 | { |
1270 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | 1250 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); |
@@ -1277,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, | |||
1277 | __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); | 1257 | __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); |
1278 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1258 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1279 | size -= sizeof(u32); | 1259 | size -= sizeof(u32); |
1280 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 1260 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
1281 | "profile buffer not large enough")) | 1261 | "profile buffer not large enough")) |
1282 | return; | 1262 | return; |
1283 | 1263 | ||
1284 | entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); | 1264 | entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); |
1285 | if (!entry) | 1265 | if (!entry) |
1286 | return; | 1266 | return; |
1287 | 1267 | ||
@@ -1291,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, | |||
1291 | for (i = 0; i < tp->nr_args; i++) | 1271 | for (i = 0; i < tp->nr_args; i++) |
1292 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | 1272 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); |
1293 | 1273 | ||
1294 | ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); | 1274 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, |
1275 | irq_flags, regs); | ||
1295 | } | 1276 | } |
1296 | 1277 | ||
1297 | static int probe_profile_enable(struct ftrace_event_call *call) | 1278 | static int probe_perf_enable(struct ftrace_event_call *call) |
1298 | { | 1279 | { |
1299 | struct trace_probe *tp = (struct trace_probe *)call->data; | 1280 | struct trace_probe *tp = (struct trace_probe *)call->data; |
1300 | 1281 | ||
@@ -1306,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call) | |||
1306 | return enable_kprobe(&tp->rp.kp); | 1287 | return enable_kprobe(&tp->rp.kp); |
1307 | } | 1288 | } |
1308 | 1289 | ||
1309 | static void probe_profile_disable(struct ftrace_event_call *call) | 1290 | static void probe_perf_disable(struct ftrace_event_call *call) |
1310 | { | 1291 | { |
1311 | struct trace_probe *tp = (struct trace_probe *)call->data; | 1292 | struct trace_probe *tp = (struct trace_probe *)call->data; |
1312 | 1293 | ||
@@ -1331,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
1331 | kprobe_trace_func(kp, regs); | 1312 | kprobe_trace_func(kp, regs); |
1332 | #ifdef CONFIG_PERF_EVENTS | 1313 | #ifdef CONFIG_PERF_EVENTS |
1333 | if (tp->flags & TP_FLAG_PROFILE) | 1314 | if (tp->flags & TP_FLAG_PROFILE) |
1334 | kprobe_profile_func(kp, regs); | 1315 | kprobe_perf_func(kp, regs); |
1335 | #endif | 1316 | #endif |
1336 | return 0; /* We don't tweak the kernel, so just return 0 */ | 1317 | return 0; /* We don't tweak the kernel, so just return 0 */ |
1337 | } | 1318 | } |
@@ -1345,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
1345 | kretprobe_trace_func(ri, regs); | 1326 | kretprobe_trace_func(ri, regs); |
1346 | #ifdef CONFIG_PERF_EVENTS | 1327 | #ifdef CONFIG_PERF_EVENTS |
1347 | if (tp->flags & TP_FLAG_PROFILE) | 1328 | if (tp->flags & TP_FLAG_PROFILE) |
1348 | kretprobe_profile_func(ri, regs); | 1329 | kretprobe_perf_func(ri, regs); |
1349 | #endif | 1330 | #endif |
1350 | return 0; /* We don't tweak the kernel, so just return 0 */ | 1331 | return 0; /* We don't tweak the kernel, so just return 0 */ |
1351 | } | 1332 | } |
@@ -1359,30 +1340,33 @@ static int register_probe_event(struct trace_probe *tp) | |||
1359 | if (probe_is_return(tp)) { | 1340 | if (probe_is_return(tp)) { |
1360 | tp->event.trace = print_kretprobe_event; | 1341 | tp->event.trace = print_kretprobe_event; |
1361 | call->raw_init = probe_event_raw_init; | 1342 | call->raw_init = probe_event_raw_init; |
1362 | call->show_format = kretprobe_event_show_format; | ||
1363 | call->define_fields = kretprobe_event_define_fields; | 1343 | call->define_fields = kretprobe_event_define_fields; |
1364 | } else { | 1344 | } else { |
1365 | tp->event.trace = print_kprobe_event; | 1345 | tp->event.trace = print_kprobe_event; |
1366 | call->raw_init = probe_event_raw_init; | 1346 | call->raw_init = probe_event_raw_init; |
1367 | call->show_format = kprobe_event_show_format; | ||
1368 | call->define_fields = kprobe_event_define_fields; | 1347 | call->define_fields = kprobe_event_define_fields; |
1369 | } | 1348 | } |
1349 | if (set_print_fmt(tp) < 0) | ||
1350 | return -ENOMEM; | ||
1370 | call->event = &tp->event; | 1351 | call->event = &tp->event; |
1371 | call->id = register_ftrace_event(&tp->event); | 1352 | call->id = register_ftrace_event(&tp->event); |
1372 | if (!call->id) | 1353 | if (!call->id) { |
1354 | kfree(call->print_fmt); | ||
1373 | return -ENODEV; | 1355 | return -ENODEV; |
1356 | } | ||
1374 | call->enabled = 0; | 1357 | call->enabled = 0; |
1375 | call->regfunc = probe_event_enable; | 1358 | call->regfunc = probe_event_enable; |
1376 | call->unregfunc = probe_event_disable; | 1359 | call->unregfunc = probe_event_disable; |
1377 | 1360 | ||
1378 | #ifdef CONFIG_PERF_EVENTS | 1361 | #ifdef CONFIG_PERF_EVENTS |
1379 | call->profile_enable = probe_profile_enable; | 1362 | call->perf_event_enable = probe_perf_enable; |
1380 | call->profile_disable = probe_profile_disable; | 1363 | call->perf_event_disable = probe_perf_disable; |
1381 | #endif | 1364 | #endif |
1382 | call->data = tp; | 1365 | call->data = tp; |
1383 | ret = trace_add_event_call(call); | 1366 | ret = trace_add_event_call(call); |
1384 | if (ret) { | 1367 | if (ret) { |
1385 | pr_info("Failed to register kprobe event: %s\n", call->name); | 1368 | pr_info("Failed to register kprobe event: %s\n", call->name); |
1369 | kfree(call->print_fmt); | ||
1386 | unregister_ftrace_event(&tp->event); | 1370 | unregister_ftrace_event(&tp->event); |
1387 | } | 1371 | } |
1388 | return ret; | 1372 | return ret; |
@@ -1392,6 +1376,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
1392 | { | 1376 | { |
1393 | /* tp->event is unregistered in trace_remove_event_call() */ | 1377 | /* tp->event is unregistered in trace_remove_event_call() */ |
1394 | trace_remove_event_call(&tp->call); | 1378 | trace_remove_event_call(&tp->call); |
1379 | kfree(tp->call.print_fmt); | ||
1395 | } | 1380 | } |
1396 | 1381 | ||
1397 | /* Make a debugfs interface for controlling probe points */ | 1382 |
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 94103cdcf9d8..d59cd6879477 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/debugfs.h> | 23 | #include <linux/debugfs.h> |
24 | #include <linux/ftrace.h> | 24 | #include <linux/ftrace.h> |
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/slab.h> | ||
26 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
27 | 28 | ||
28 | #include "trace_output.h" | 29 | #include "trace_output.h" |
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834659ed..017fa376505d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
10 | #include <linux/mmiotrace.h> | 10 | #include <linux/mmiotrace.h> |
11 | #include <linux/pci.h> | 11 | #include <linux/pci.h> |
12 | #include <linux/slab.h> | ||
12 | #include <linux/time.h> | 13 | #include <linux/time.h> |
13 | 14 | ||
14 | #include <asm/atomic.h> | 15 | #include <asm/atomic.h> |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..81003b4d617f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/stringify.h> | 3 | #include <linux/stringify.h> |
4 | #include <linux/kthread.h> | 4 | #include <linux/kthread.h> |
5 | #include <linux/delay.h> | 5 | #include <linux/delay.h> |
6 | #include <linux/slab.h> | ||
6 | 7 | ||
7 | static inline int trace_valid_entry(struct trace_entry *entry) | 8 | static inline int trace_valid_entry(struct trace_entry *entry) |
8 | { | 9 | { |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
157 | unsigned long val, flags; | 157 | unsigned long val, flags; |
158 | char buf[64]; | 158 | char buf[64]; |
159 | int ret; | 159 | int ret; |
160 | int cpu; | ||
160 | 161 | ||
161 | if (count >= sizeof(buf)) | 162 | if (count >= sizeof(buf)) |
162 | return -EINVAL; | 163 | return -EINVAL; |
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
171 | return ret; | 172 | return ret; |
172 | 173 | ||
173 | local_irq_save(flags); | 174 | local_irq_save(flags); |
175 | |||
176 | /* | ||
177 | * In case we trace inside arch_spin_lock() or after (NMI), | ||
178 | * we would cause a circular locking deadlock, so we also need to increase | ||
179 | * the percpu trace_active here. | ||
180 | */ | ||
181 | cpu = smp_processor_id(); | ||
182 | per_cpu(trace_active, cpu)++; | ||
183 | |||
174 | arch_spin_lock(&max_stack_lock); | 184 | arch_spin_lock(&max_stack_lock); |
175 | *ptr = val; | 185 | *ptr = val; |
176 | arch_spin_unlock(&max_stack_lock); | 186 | arch_spin_unlock(&max_stack_lock); |
187 | |||
188 | per_cpu(trace_active, cpu)--; | ||
177 | local_irq_restore(flags); | 189 | local_irq_restore(flags); |
178 | 190 | ||
179 | return count; | 191 | return count; |
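Bumping trace_active before taking max_stack_lock is a recursion guard: the stack tracer's callback checks the same per-cpu counter and bails out, so a trace that fires inside arch_spin_lock() (or from an NMI) cannot try to retake the very lock being held. A schematic, single-threaded stand-in for the guard:

#include <stdio.h>

static int trace_active;	/* per-cpu counter in the kernel */
static int lock_held;		/* stand-in for arch_spinlock_t max_stack_lock */

static void stack_trace_callback(void)
{
	if (trace_active)	/* the tracer refuses to recurse */
		return;
	/* ... would sample the stack and take the lock itself ... */
}

static void write_max_size(unsigned long val, unsigned long *ptr)
{
	trace_active++;		/* raise the guard BEFORE locking, so a
				 * trace firing inside the lock backs off */
	lock_held = 1;		/* arch_spin_lock(&max_stack_lock); */

	stack_trace_callback();	/* would self-deadlock without the guard */
	*ptr = val;

	lock_held = 0;		/* arch_spin_unlock(&max_stack_lock); */
	trace_active--;
}

int main(void)
{
	unsigned long max = 0;

	write_max_size(4096, &max);
	printf("max=%lu trace_active=%d lock=%d\n", max, trace_active, lock_held);
	return 0;
}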
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
206 | 218 | ||
207 | static void *t_start(struct seq_file *m, loff_t *pos) | 219 | static void *t_start(struct seq_file *m, loff_t *pos) |
208 | { | 220 | { |
221 | int cpu; | ||
222 | |||
209 | local_irq_disable(); | 223 | local_irq_disable(); |
224 | |||
225 | cpu = smp_processor_id(); | ||
226 | per_cpu(trace_active, cpu)++; | ||
227 | |||
210 | arch_spin_lock(&max_stack_lock); | 228 | arch_spin_lock(&max_stack_lock); |
211 | 229 | ||
212 | if (*pos == 0) | 230 | if (*pos == 0) |
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
217 | 235 | ||
218 | static void t_stop(struct seq_file *m, void *p) | 236 | static void t_stop(struct seq_file *m, void *p) |
219 | { | 237 | { |
238 | int cpu; | ||
239 | |||
220 | arch_spin_unlock(&max_stack_lock); | 240 | arch_spin_unlock(&max_stack_lock); |
241 | |||
242 | cpu = smp_processor_id(); | ||
243 | per_cpu(trace_active, cpu)--; | ||
244 | |||
221 | local_irq_enable(); | 245 | local_irq_enable(); |
222 | } | 246 | } |
223 | 247 | ||
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239eb987..96cffb269e73 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -10,6 +10,7 @@ | |||
10 | 10 | ||
11 | 11 | ||
12 | #include <linux/list.h> | 12 | #include <linux/list.h> |
13 | #include <linux/slab.h> | ||
13 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> |
14 | #include <linux/debugfs.h> | 15 | #include <linux/debugfs.h> |
15 | #include "trace_stat.h" | 16 | #include "trace_stat.h" |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4e332b9e449c..4d6d711717f2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <trace/syscall.h> | 1 | #include <trace/syscall.h> |
2 | #include <trace/events/syscalls.h> | 2 | #include <trace/events/syscalls.h> |
3 | #include <linux/slab.h> | ||
3 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
4 | #include <linux/ftrace.h> | 5 | #include <linux/ftrace.h> |
5 | #include <linux/perf_event.h> | 6 | #include <linux/perf_event.h> |
@@ -143,70 +144,65 @@ extern char *__bad_type_size(void); | |||
143 | #type, #name, offsetof(typeof(trace), name), \ | 144 | #type, #name, offsetof(typeof(trace), name), \ |
144 | sizeof(trace.name), is_signed_type(type) | 145 | sizeof(trace.name), is_signed_type(type) |
145 | 146 | ||
146 | int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) | 147 | static |
148 | int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | ||
147 | { | 149 | { |
148 | int i; | 150 | int i; |
149 | int ret; | 151 | int pos = 0; |
150 | struct syscall_metadata *entry = call->data; | ||
151 | struct syscall_trace_enter trace; | ||
152 | int offset = offsetof(struct syscall_trace_enter, args); | ||
153 | 152 | ||
154 | ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 153 | /* When len=0, we just calculate the needed length */ |
155 | "\tsigned:%u;\n", | 154 | #define LEN_OR_ZERO (len ? len - pos : 0) |
156 | SYSCALL_FIELD(int, nr)); | ||
157 | if (!ret) | ||
158 | return 0; | ||
159 | 155 | ||
156 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
160 | for (i = 0; i < entry->nb_args; i++) { | 157 | for (i = 0; i < entry->nb_args; i++) { |
161 | ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], | 158 | pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", |
162 | entry->args[i]); | 159 | entry->args[i], sizeof(unsigned long), |
163 | if (!ret) | 160 | i == entry->nb_args - 1 ? "" : ", "); |
164 | return 0; | ||
165 | ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" | ||
166 | "\tsigned:%u;\n", offset, | ||
167 | sizeof(unsigned long), | ||
168 | is_signed_type(unsigned long)); | ||
169 | if (!ret) | ||
170 | return 0; | ||
171 | offset += sizeof(unsigned long); | ||
172 | } | 161 | } |
162 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
173 | 163 | ||
174 | trace_seq_puts(s, "\nprint fmt: \""); | ||
175 | for (i = 0; i < entry->nb_args; i++) { | 164 | for (i = 0; i < entry->nb_args; i++) { |
176 | ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], | 165 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
177 | sizeof(unsigned long), | 166 | ", ((unsigned long)(REC->%s))", entry->args[i]); |
178 | i == entry->nb_args - 1 ? "" : ", "); | ||
179 | if (!ret) | ||
180 | return 0; | ||
181 | } | 167 | } |
182 | trace_seq_putc(s, '"'); | ||
183 | 168 | ||
184 | for (i = 0; i < entry->nb_args; i++) { | 169 | #undef LEN_OR_ZERO |
185 | ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", | ||
186 | entry->args[i]); | ||
187 | if (!ret) | ||
188 | return 0; | ||
189 | } | ||
190 | 170 | ||
191 | return trace_seq_putc(s, '\n'); | 171 | /* return the length of print_fmt */ |
172 | return pos; | ||
192 | } | 173 | } |
193 | 174 | ||
194 | int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) | 175 | static int set_syscall_print_fmt(struct ftrace_event_call *call) |
195 | { | 176 | { |
196 | int ret; | 177 | char *print_fmt; |
197 | struct syscall_trace_exit trace; | 178 | int len; |
179 | struct syscall_metadata *entry = call->data; | ||
198 | 180 | ||
199 | ret = trace_seq_printf(s, | 181 | if (entry->enter_event != call) { |
200 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 182 | call->print_fmt = "\"0x%lx\", REC->ret"; |
201 | "\tsigned:%u;\n" | ||
202 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | ||
203 | "\tsigned:%u;\n", | ||
204 | SYSCALL_FIELD(int, nr), | ||
205 | SYSCALL_FIELD(long, ret)); | ||
206 | if (!ret) | ||
207 | return 0; | 183 | return 0; |
184 | } | ||
208 | 185 | ||
209 | return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); | 186 | /* First: called with 0 length to calculate the needed length */ |
187 | len = __set_enter_print_fmt(entry, NULL, 0); | ||
188 | |||
189 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
190 | if (!print_fmt) | ||
191 | return -ENOMEM; | ||
192 | |||
193 | /* Second: actually write the @print_fmt */ | ||
194 | __set_enter_print_fmt(entry, print_fmt, len + 1); | ||
195 | call->print_fmt = print_fmt; | ||
196 | |||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static void free_syscall_print_fmt(struct ftrace_event_call *call) | ||
201 | { | ||
202 | struct syscall_metadata *entry = call->data; | ||
203 | |||
204 | if (entry->enter_event == call) | ||
205 | kfree(call->print_fmt); | ||
210 | } | 206 | } |
211 | 207 | ||
212 | int syscall_enter_define_fields(struct ftrace_event_call *call) | 208 | int syscall_enter_define_fields(struct ftrace_event_call *call) |
@@ -386,12 +382,22 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
386 | { | 382 | { |
387 | int id; | 383 | int id; |
388 | 384 | ||
389 | id = register_ftrace_event(call->event); | 385 | if (set_syscall_print_fmt(call) < 0) |
390 | if (!id) | 386 | return -ENOMEM; |
391 | return -ENODEV; | 387 | |
392 | call->id = id; | 388 | id = trace_event_raw_init(call); |
393 | INIT_LIST_HEAD(&call->fields); | 389 | |
394 | return 0; | 390 | if (id < 0) { |
391 | free_syscall_print_fmt(call); | ||
392 | return id; | ||
393 | } | ||
394 | |||
395 | return id; | ||
396 | } | ||
397 | |||
398 | unsigned long __init arch_syscall_addr(int nr) | ||
399 | { | ||
400 | return (unsigned long)sys_call_table[nr]; | ||
395 | } | 401 | } |
396 | 402 | ||
397 | int __init init_ftrace_syscalls(void) | 403 | int __init init_ftrace_syscalls(void) |
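The generic arch_syscall_addr() added above turns a syscall number into its handler's address through sys_call_table, which lets init_ftrace_syscalls() resolve every syscall to a name and metadata once at boot instead of each arch open-coding the lookup. A userspace analogue with a made-up table:

#include <stdio.h>

static void sys_read(void)  { }
static void sys_write(void) { }

/* Stand-in for the arch's real sys_call_table. */
static void (*const sys_call_table[])(void) = { sys_read, sys_write };
#define NR_syscalls (sizeof(sys_call_table) / sizeof(sys_call_table[0]))

static unsigned long arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

int main(void)
{
	size_t nr;

	/* Mirrors init_ftrace_syscalls(): walk the table once and
	 * build the nr -> address mapping used by later lookups. */
	for (nr = 0; nr < NR_syscalls; nr++)
		printf("syscall %zu at %#lx\n", nr, arch_syscall_addr((int)nr));
	return 0;
}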
@@ -423,12 +429,12 @@ core_initcall(init_ftrace_syscalls); | |||
423 | 429 | ||
424 | #ifdef CONFIG_PERF_EVENTS | 430 | #ifdef CONFIG_PERF_EVENTS |
425 | 431 | ||
426 | static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); | 432 | static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); |
427 | static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); | 433 | static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); |
428 | static int sys_prof_refcount_enter; | 434 | static int sys_perf_refcount_enter; |
429 | static int sys_prof_refcount_exit; | 435 | static int sys_perf_refcount_exit; |
430 | 436 | ||
431 | static void prof_syscall_enter(struct pt_regs *regs, long id) | 437 | static void perf_syscall_enter(struct pt_regs *regs, long id) |
432 | { | 438 | { |
433 | struct syscall_metadata *sys_data; | 439 | struct syscall_metadata *sys_data; |
434 | struct syscall_trace_enter *rec; | 440 | struct syscall_trace_enter *rec; |
@@ -438,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
438 | int size; | 444 | int size; |
439 | 445 | ||
440 | syscall_nr = syscall_get_nr(current, regs); | 446 | syscall_nr = syscall_get_nr(current, regs); |
441 | if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) | 447 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
442 | return; | 448 | return; |
443 | 449 | ||
444 | sys_data = syscall_nr_to_meta(syscall_nr); | 450 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -450,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
450 | size = ALIGN(size + sizeof(u32), sizeof(u64)); | 456 | size = ALIGN(size + sizeof(u32), sizeof(u64)); |
451 | size -= sizeof(u32); | 457 | size -= sizeof(u32); |
452 | 458 | ||
453 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 459 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
454 | "profile buffer not large enough")) | 460 | "perf buffer not large enough")) |
455 | return; | 461 | return; |
456 | 462 | ||
457 | rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, | 463 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
458 | sys_data->enter_event->id, &rctx, &flags); | 464 | sys_data->enter_event->id, &rctx, &flags); |
459 | if (!rec) | 465 | if (!rec) |
460 | return; | 466 | return; |
@@ -462,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
462 | rec->nr = syscall_nr; | 468 | rec->nr = syscall_nr; |
463 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, | 469 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, |
464 | (unsigned long *)&rec->args); | 470 | (unsigned long *)&rec->args); |
465 | ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); | 471 | perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); |
466 | } | 472 | } |
467 | 473 | ||
468 | int prof_sysenter_enable(struct ftrace_event_call *call) | 474 | int perf_sysenter_enable(struct ftrace_event_call *call) |
469 | { | 475 | { |
470 | int ret = 0; | 476 | int ret = 0; |
471 | int num; | 477 | int num; |
@@ -473,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call) | |||
473 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 479 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
474 | 480 | ||
475 | mutex_lock(&syscall_trace_lock); | 481 | mutex_lock(&syscall_trace_lock); |
476 | if (!sys_prof_refcount_enter) | 482 | if (!sys_perf_refcount_enter) |
477 | ret = register_trace_sys_enter(prof_syscall_enter); | 483 | ret = register_trace_sys_enter(perf_syscall_enter); |
478 | if (ret) { | 484 | if (ret) { |
479 | pr_info("event trace: Could not activate " | 485 | pr_info("event trace: Could not activate " |
480 | "syscall entry trace point"); | 486 | "syscall entry trace point"); |
481 | } else { | 487 | } else { |
482 | set_bit(num, enabled_prof_enter_syscalls); | 488 | set_bit(num, enabled_perf_enter_syscalls); |
483 | sys_prof_refcount_enter++; | 489 | sys_perf_refcount_enter++; |
484 | } | 490 | } |
485 | mutex_unlock(&syscall_trace_lock); | 491 | mutex_unlock(&syscall_trace_lock); |
486 | return ret; | 492 | return ret; |
487 | } | 493 | } |
488 | 494 | ||
489 | void prof_sysenter_disable(struct ftrace_event_call *call) | 495 | void perf_sysenter_disable(struct ftrace_event_call *call) |
490 | { | 496 | { |
491 | int num; | 497 | int num; |
492 | 498 | ||
493 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 499 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
494 | 500 | ||
495 | mutex_lock(&syscall_trace_lock); | 501 | mutex_lock(&syscall_trace_lock); |
496 | sys_prof_refcount_enter--; | 502 | sys_perf_refcount_enter--; |
497 | clear_bit(num, enabled_prof_enter_syscalls); | 503 | clear_bit(num, enabled_perf_enter_syscalls); |
498 | if (!sys_prof_refcount_enter) | 504 | if (!sys_perf_refcount_enter) |
499 | unregister_trace_sys_enter(prof_syscall_enter); | 505 | unregister_trace_sys_enter(perf_syscall_enter); |
500 | mutex_unlock(&syscall_trace_lock); | 506 | mutex_unlock(&syscall_trace_lock); |
501 | } | 507 | } |
502 | 508 | ||
503 | static void prof_syscall_exit(struct pt_regs *regs, long ret) | 509 | static void perf_syscall_exit(struct pt_regs *regs, long ret) |
504 | { | 510 | { |
505 | struct syscall_metadata *sys_data; | 511 | struct syscall_metadata *sys_data; |
506 | struct syscall_trace_exit *rec; | 512 | struct syscall_trace_exit *rec; |
@@ -510,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
510 | int size; | 516 | int size; |
511 | 517 | ||
512 | syscall_nr = syscall_get_nr(current, regs); | 518 | syscall_nr = syscall_get_nr(current, regs); |
513 | if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) | 519 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
514 | return; | 520 | return; |
515 | 521 | ||
516 | sys_data = syscall_nr_to_meta(syscall_nr); | 522 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -525,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
525 | * Impossible in practice, but be paranoid about the future. | 531 | * Impossible in practice, but be paranoid about the future. |
526 | * Can this check be moved outside the runtime path? | 532 | * Can this check be moved outside the runtime path? |
527 | */ | 533 | */ |
528 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 534 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
529 | "exit event has grown above profile buffer size")) | 535 | "exit event has grown above perf buffer size")) |
530 | return; | 536 | return; |
531 | 537 | ||
532 | rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, | 538 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
533 | sys_data->exit_event->id, &rctx, &flags); | 539 | sys_data->exit_event->id, &rctx, &flags); |
534 | if (!rec) | 540 | if (!rec) |
535 | return; | 541 | return; |
@@ -537,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
537 | rec->nr = syscall_nr; | 543 | rec->nr = syscall_nr; |
538 | rec->ret = syscall_get_return_value(current, regs); | 544 | rec->ret = syscall_get_return_value(current, regs); |
539 | 545 | ||
540 | ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); | 546 | perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); |
541 | } | 547 | } |
542 | 548 | ||
543 | int prof_sysexit_enable(struct ftrace_event_call *call) | 549 | int perf_sysexit_enable(struct ftrace_event_call *call) |
544 | { | 550 | { |
545 | int ret = 0; | 551 | int ret = 0; |
546 | int num; | 552 | int num; |
@@ -548,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call) | |||
548 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 554 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
549 | 555 | ||
550 | mutex_lock(&syscall_trace_lock); | 556 | mutex_lock(&syscall_trace_lock); |
551 | if (!sys_prof_refcount_exit) | 557 | if (!sys_perf_refcount_exit) |
552 | ret = register_trace_sys_exit(prof_syscall_exit); | 558 | ret = register_trace_sys_exit(perf_syscall_exit); |
553 | if (ret) { | 559 | if (ret) { |
554 | pr_info("event trace: Could not activate " | 560 | pr_info("event trace: Could not activate " |
555 | "syscall entry trace point"); | 561 | "syscall exit trace point"); |
556 | } else { | 562 | } else { |
557 | set_bit(num, enabled_prof_exit_syscalls); | 563 | set_bit(num, enabled_perf_exit_syscalls); |
558 | sys_prof_refcount_exit++; | 564 | sys_perf_refcount_exit++; |
559 | } | 565 | } |
560 | mutex_unlock(&syscall_trace_lock); | 566 | mutex_unlock(&syscall_trace_lock); |
561 | return ret; | 567 | return ret; |
562 | } | 568 | } |
563 | 569 | ||
564 | void prof_sysexit_disable(struct ftrace_event_call *call) | 570 | void perf_sysexit_disable(struct ftrace_event_call *call) |
565 | { | 571 | { |
566 | int num; | 572 | int num; |
567 | 573 | ||
568 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 574 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
569 | 575 | ||
570 | mutex_lock(&syscall_trace_lock); | 576 | mutex_lock(&syscall_trace_lock); |
571 | sys_prof_refcount_exit--; | 577 | sys_perf_refcount_exit--; |
572 | clear_bit(num, enabled_prof_exit_syscalls); | 578 | clear_bit(num, enabled_perf_exit_syscalls); |
573 | if (!sys_prof_refcount_exit) | 579 | if (!sys_perf_refcount_exit) |
574 | unregister_trace_sys_exit(prof_syscall_exit); | 580 | unregister_trace_sys_exit(perf_syscall_exit); |
575 | mutex_unlock(&syscall_trace_lock); | 581 | mutex_unlock(&syscall_trace_lock); |
576 | } | 582 | } |
577 | 583 | ||
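The trace_syscalls.c hunks above are mostly a mechanical rename from the old prof_*/ftrace_perf_* naming to perf_*, with one functional change: perf_trace_buf_submit() now takes the pt_regs of the tracepoint site, so perf can record a callchain from the real register state. A minimal sketch of the prepare/submit pattern those handlers follow is below; event_id and the record layout are illustrative, not part of this diff.

static void example_handler(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *rec;
	unsigned long flags;
	int rctx, size;

	/* payload plus the u32 size header, rounded up to a u64 boundary */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)) - sizeof(u32);

	/* reserve space in the per-cpu perf trace buffer; NULL means drop */
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
			event_id /* illustrative */, &rctx, &flags);
	if (!rec)
		return;

	rec->nr = syscall_get_nr(current, regs);
	/* the trailing regs argument is what this merge adds */
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
}

The enable/disable paths keep their old shape: a global refcount decides when to (un)register the single tracepoint probe, while the per-syscall bitmap (enabled_perf_enter_syscalls / enabled_perf_exit_syscalls) filters inside the probe.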
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb07dffd..cc2d2faa7d9e 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <trace/events/workqueue.h> | 9 | #include <trace/events/workqueue.h> |
10 | #include <linux/list.h> | 10 | #include <linux/list.h> |
11 | #include <linux/percpu.h> | 11 | #include <linux/percpu.h> |
12 | #include <linux/slab.h> | ||
12 | #include <linux/kref.h> | 13 | #include <linux/kref.h> |
13 | #include "trace_stat.h" | 14 | #include "trace_stat.h" |
14 | #include "trace.h" | 15 | #include "trace.h" |
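The lone trace_workqueue.c change (and the tsacct.c hunk just below, which adds linux/mm.h for the same reason) comes from the tree-wide header cleanup in this merge window: slab.h is no longer pulled in implicitly through other headers, so any file calling the slab allocators must include it directly. A hedged sketch; the kzalloc() call is illustrative of the file's allocations, not quoted from it:

#include <linux/slab.h>		/* kzalloc()/kfree() now need this explicitly */

static struct cpu_workqueue_stats *example_alloc(void)
{
	/* would no longer compile without the explicit include above */
	return kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
}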
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/tsacct_kern.h> | 21 | #include <linux/tsacct_kern.h> |
22 | #include <linux/acct.h> | 22 | #include <linux/acct.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/mm.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * fill in basic accounting fields | 27 | * fill in basic accounting fields |
diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70c..766467b3bcb7 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -56,9 +56,6 @@ struct user_struct root_user = { | |||
56 | .sigpending = ATOMIC_INIT(0), | 56 | .sigpending = ATOMIC_INIT(0), |
57 | .locked_shm = 0, | 57 | .locked_shm = 0, |
58 | .user_ns = &init_user_ns, | 58 | .user_ns = &init_user_ns, |
59 | #ifdef CONFIG_USER_SCHED | ||
60 | .tg = &init_task_group, | ||
61 | #endif | ||
62 | }; | 59 | }; |
63 | 60 | ||
64 | /* | 61 | /* |
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up) | |||
75 | put_user_ns(up->user_ns); | 72 | put_user_ns(up->user_ns); |
76 | } | 73 | } |
77 | 74 | ||
78 | #ifdef CONFIG_USER_SCHED | ||
79 | |||
80 | static void sched_destroy_user(struct user_struct *up) | ||
81 | { | ||
82 | sched_destroy_group(up->tg); | ||
83 | } | ||
84 | |||
85 | static int sched_create_user(struct user_struct *up) | ||
86 | { | ||
87 | int rc = 0; | ||
88 | |||
89 | up->tg = sched_create_group(&root_task_group); | ||
90 | if (IS_ERR(up->tg)) | ||
91 | rc = -ENOMEM; | ||
92 | |||
93 | set_tg_uid(up); | ||
94 | |||
95 | return rc; | ||
96 | } | ||
97 | |||
98 | #else /* CONFIG_USER_SCHED */ | ||
99 | |||
100 | static void sched_destroy_user(struct user_struct *up) { } | ||
101 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
102 | |||
103 | #endif /* CONFIG_USER_SCHED */ | ||
104 | |||
105 | #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) | ||
106 | |||
107 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | ||
108 | { | ||
109 | struct user_struct *user; | ||
110 | struct hlist_node *h; | ||
111 | |||
112 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | ||
113 | if (user->uid == uid) { | ||
114 | /* possibly resurrect an "almost deleted" object */ | ||
115 | if (atomic_inc_return(&user->__count) == 1) | ||
116 | cancel_delayed_work(&user->work); | ||
117 | return user; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | return NULL; | ||
122 | } | ||
123 | |||
124 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ | ||
125 | static DEFINE_MUTEX(uids_mutex); | ||
126 | |||
127 | static inline void uids_mutex_lock(void) | ||
128 | { | ||
129 | mutex_lock(&uids_mutex); | ||
130 | } | ||
131 | |||
132 | static inline void uids_mutex_unlock(void) | ||
133 | { | ||
134 | mutex_unlock(&uids_mutex); | ||
135 | } | ||
136 | |||
137 | /* uid directory attributes */ | ||
138 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
139 | static ssize_t cpu_shares_show(struct kobject *kobj, | ||
140 | struct kobj_attribute *attr, | ||
141 | char *buf) | ||
142 | { | ||
143 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
144 | |||
145 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); | ||
146 | } | ||
147 | |||
148 | static ssize_t cpu_shares_store(struct kobject *kobj, | ||
149 | struct kobj_attribute *attr, | ||
150 | const char *buf, size_t size) | ||
151 | { | ||
152 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
153 | unsigned long shares; | ||
154 | int rc; | ||
155 | |||
156 | sscanf(buf, "%lu", &shares); | ||
157 | |||
158 | rc = sched_group_set_shares(up->tg, shares); | ||
159 | |||
160 | return (rc ? rc : size); | ||
161 | } | ||
162 | |||
163 | static struct kobj_attribute cpu_share_attr = | ||
164 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
165 | #endif | ||
166 | |||
167 | #ifdef CONFIG_RT_GROUP_SCHED | ||
168 | static ssize_t cpu_rt_runtime_show(struct kobject *kobj, | ||
169 | struct kobj_attribute *attr, | ||
170 | char *buf) | ||
171 | { | ||
172 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
173 | |||
174 | return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); | ||
175 | } | ||
176 | |||
177 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | ||
178 | struct kobj_attribute *attr, | ||
179 | const char *buf, size_t size) | ||
180 | { | ||
181 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
182 | unsigned long rt_runtime; | ||
183 | int rc; | ||
184 | |||
185 | sscanf(buf, "%ld", &rt_runtime); | ||
186 | |||
187 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); | ||
188 | |||
189 | return (rc ? rc : size); | ||
190 | } | ||
191 | |||
192 | static struct kobj_attribute cpu_rt_runtime_attr = | ||
193 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | ||
194 | |||
195 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
196 | struct kobj_attribute *attr, | ||
197 | char *buf) | ||
198 | { | ||
199 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
200 | |||
201 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
202 | } | ||
203 | |||
204 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
205 | struct kobj_attribute *attr, | ||
206 | const char *buf, size_t size) | ||
207 | { | ||
208 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
209 | unsigned long rt_period; | ||
210 | int rc; | ||
211 | |||
212 | sscanf(buf, "%lu", &rt_period); | ||
213 | |||
214 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
215 | |||
216 | return (rc ? rc : size); | ||
217 | } | ||
218 | |||
219 | static struct kobj_attribute cpu_rt_period_attr = | ||
220 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
221 | #endif | ||
222 | |||
223 | /* default attributes per uid directory */ | ||
224 | static struct attribute *uids_attributes[] = { | ||
225 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
226 | &cpu_share_attr.attr, | ||
227 | #endif | ||
228 | #ifdef CONFIG_RT_GROUP_SCHED | ||
229 | &cpu_rt_runtime_attr.attr, | ||
230 | &cpu_rt_period_attr.attr, | ||
231 | #endif | ||
232 | NULL | ||
233 | }; | ||
234 | |||
235 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
236 | static void uids_release(struct kobject *kobj) | ||
237 | { | ||
238 | return; | ||
239 | } | ||
240 | |||
241 | static struct kobj_type uids_ktype = { | ||
242 | .sysfs_ops = &kobj_sysfs_ops, | ||
243 | .default_attrs = uids_attributes, | ||
244 | .release = uids_release, | ||
245 | }; | ||
246 | |||
247 | /* | ||
248 | * Create /sys/kernel/uids/<uid>/cpu_share file for this user | ||
249 | * We do not create this file for users in a user namespace (until | ||
250 | * sysfs tagging is implemented). | ||
251 | * | ||
252 | * See Documentation/scheduler/sched-design-CFS.txt for ramifications. | ||
253 | */ | ||
254 | static int uids_user_create(struct user_struct *up) | ||
255 | { | ||
256 | struct kobject *kobj = &up->kobj; | ||
257 | int error; | ||
258 | |||
259 | memset(kobj, 0, sizeof(struct kobject)); | ||
260 | if (up->user_ns != &init_user_ns) | ||
261 | return 0; | ||
262 | kobj->kset = uids_kset; | ||
263 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); | ||
264 | if (error) { | ||
265 | kobject_put(kobj); | ||
266 | goto done; | ||
267 | } | ||
268 | |||
269 | kobject_uevent(kobj, KOBJ_ADD); | ||
270 | done: | ||
271 | return error; | ||
272 | } | ||
273 | |||
274 | /* create these entries in sysfs: | ||
275 | * "/sys/kernel/uids" directory | ||
276 | * "/sys/kernel/uids/0" directory (for root user) | ||
277 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
278 | */ | ||
279 | int __init uids_sysfs_init(void) | ||
280 | { | ||
281 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); | ||
282 | if (!uids_kset) | ||
283 | return -ENOMEM; | ||
284 | |||
285 | return uids_user_create(&root_user); | ||
286 | } | ||
287 | |||
288 | /* delayed work function to remove sysfs directory for a user and free up | ||
289 | * corresponding structures. | ||
290 | */ | ||
291 | static void cleanup_user_struct(struct work_struct *w) | ||
292 | { | ||
293 | struct user_struct *up = container_of(w, struct user_struct, work.work); | ||
294 | unsigned long flags; | ||
295 | int remove_user = 0; | ||
296 | |||
297 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
298 | * atomic. | ||
299 | */ | ||
300 | uids_mutex_lock(); | ||
301 | |||
302 | spin_lock_irqsave(&uidhash_lock, flags); | ||
303 | if (atomic_read(&up->__count) == 0) { | ||
304 | uid_hash_remove(up); | ||
305 | remove_user = 1; | ||
306 | } | ||
307 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
308 | |||
309 | if (!remove_user) | ||
310 | goto done; | ||
311 | |||
312 | if (up->user_ns == &init_user_ns) { | ||
313 | kobject_uevent(&up->kobj, KOBJ_REMOVE); | ||
314 | kobject_del(&up->kobj); | ||
315 | kobject_put(&up->kobj); | ||
316 | } | ||
317 | |||
318 | sched_destroy_user(up); | ||
319 | key_put(up->uid_keyring); | ||
320 | key_put(up->session_keyring); | ||
321 | kmem_cache_free(uid_cachep, up); | ||
322 | |||
323 | done: | ||
324 | uids_mutex_unlock(); | ||
325 | } | ||
326 | |||
327 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
328 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
329 | * upon function exit. | ||
330 | */ | ||
331 | static void free_user(struct user_struct *up, unsigned long flags) | ||
332 | { | ||
333 | INIT_DELAYED_WORK(&up->work, cleanup_user_struct); | ||
334 | schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); | ||
335 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
336 | } | ||
337 | |||
338 | #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ | ||
339 | |||
340 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 75 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) |
341 | { | 76 | { |
342 | struct user_struct *user; | 77 | struct user_struct *user; |
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
352 | return NULL; | 87 | return NULL; |
353 | } | 88 | } |
354 | 89 | ||
355 | int uids_sysfs_init(void) { return 0; } | ||
356 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
357 | static inline void uids_mutex_lock(void) { } | ||
358 | static inline void uids_mutex_unlock(void) { } | ||
359 | |||
360 | /* IRQs are disabled and uidhash_lock is held upon function entry. | 90 | /* IRQs are disabled and uidhash_lock is held upon function entry. |
361 | * IRQ state (as stored in flags) is restored and uidhash_lock released | 91 | * IRQ state (as stored in flags) is restored and uidhash_lock released |
362 | * upon function exit. | 92 | * upon function exit. |
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
365 | { | 95 | { |
366 | uid_hash_remove(up); | 96 | uid_hash_remove(up); |
367 | spin_unlock_irqrestore(&uidhash_lock, flags); | 97 | spin_unlock_irqrestore(&uidhash_lock, flags); |
368 | sched_destroy_user(up); | ||
369 | key_put(up->uid_keyring); | 98 | key_put(up->uid_keyring); |
370 | key_put(up->session_keyring); | 99 | key_put(up->session_keyring); |
371 | kmem_cache_free(uid_cachep, up); | 100 | kmem_cache_free(uid_cachep, up); |
372 | } | 101 | } |
373 | 102 | ||
374 | #endif | ||
375 | |||
376 | #if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) | ||
377 | /* | ||
378 | * We need to check if a setuid can take place. This function should be called | ||
379 | * before successfully completing the setuid. | ||
380 | */ | ||
381 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
382 | { | ||
383 | |||
384 | return sched_rt_can_attach(up->tg, tsk); | ||
385 | |||
386 | } | ||
387 | #else | ||
388 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
389 | { | ||
390 | return 1; | ||
391 | } | ||
392 | #endif | ||
393 | |||
394 | /* | 103 | /* |
395 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 104 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
396 | * caller must undo that ref with free_uid(). | 105 | * caller must undo that ref with free_uid(). |
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
431 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() | 140 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
432 | * atomic. | 141 | * atomic. |
433 | */ | 142 | */ |
434 | uids_mutex_lock(); | ||
435 | |||
436 | spin_lock_irq(&uidhash_lock); | 143 | spin_lock_irq(&uidhash_lock); |
437 | up = uid_hash_find(uid, hashent); | 144 | up = uid_hash_find(uid, hashent); |
438 | spin_unlock_irq(&uidhash_lock); | 145 | spin_unlock_irq(&uidhash_lock); |
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
445 | new->uid = uid; | 152 | new->uid = uid; |
446 | atomic_set(&new->__count, 1); | 153 | atomic_set(&new->__count, 1); |
447 | 154 | ||
448 | if (sched_create_user(new) < 0) | ||
449 | goto out_free_user; | ||
450 | |||
451 | new->user_ns = get_user_ns(ns); | 155 | new->user_ns = get_user_ns(ns); |
452 | 156 | ||
453 | if (uids_user_create(new)) | ||
454 | goto out_destroy_sched; | ||
455 | |||
456 | /* | 157 | /* |
457 | * Before adding this, check whether we raced | 158 | * Before adding this, check whether we raced |
458 | * on adding the same user already.. | 159 | * on adding the same user already.. |
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
475 | spin_unlock_irq(&uidhash_lock); | 176 | spin_unlock_irq(&uidhash_lock); |
476 | } | 177 | } |
477 | 178 | ||
478 | uids_mutex_unlock(); | ||
479 | |||
480 | return up; | 179 | return up; |
481 | 180 | ||
482 | out_destroy_sched: | ||
483 | sched_destroy_user(new); | ||
484 | put_user_ns(new->user_ns); | 181 | put_user_ns(new->user_ns); |
485 | out_free_user: | ||
486 | kmem_cache_free(uid_cachep, new); | 182 | kmem_cache_free(uid_cachep, new); |
487 | out_unlock: | 183 | out_unlock: |
488 | uids_mutex_unlock(); | ||
489 | return NULL; | 184 | return NULL; |
490 | } | 185 | } |
491 | 186 | ||
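The kernel/user.c diff is the removal of CONFIG_USER_SCHED: the per-uid /sys/kernel/uids/ directories, the uids_mutex, the delayed-work teardown, and the sched_create_user()/sched_destroy_user() hooks all go away, so free_user() collapses to the short unhash-and-free path kept in the right-hand column. For context, the caller side that pairs with it looks roughly like this (reconstructed from the surrounding file, not part of the hunks above):

void free_uid(struct user_struct *up)
{
	unsigned long flags;

	if (!up)
		return;

	local_irq_save(flags);
	/* drop the ref; if it hits zero, take uidhash_lock and free */
	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
		free_user(up, flags);	/* releases the lock, restores IRQs */
	else
		local_irq_restore(flags);
}

With the sysfs machinery gone there is no longer an "almost deleted" window, which is why the cancel_delayed_work() resurrection dance in the old uid_hash_find() could be deleted as well.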
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805c..5bfb213984b2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) | |||
774 | { | 774 | { |
775 | if (del_timer_sync(&dwork->timer)) { | 775 | if (del_timer_sync(&dwork->timer)) { |
776 | struct cpu_workqueue_struct *cwq; | 776 | struct cpu_workqueue_struct *cwq; |
777 | cwq = wq_per_cpu(keventd_wq, get_cpu()); | 777 | cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); |
778 | __queue_work(cwq, &dwork->work); | 778 | __queue_work(cwq, &dwork->work); |
779 | put_cpu(); | 779 | put_cpu(); |
780 | } | 780 | } |
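The final one-liner in kernel/workqueue.c fixes flush_delayed_work() for works queued on a non-default workqueue: when the delay timer is still pending, the work must be pushed onto the workqueue it was originally queued on (recovered via get_wq_data()), not hard-coded onto keventd_wq. A hedged usage sketch; the workqueue and work-function names are made up:

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	/* ... do the deferred work ... */
}

static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);
static struct workqueue_struct *my_wq;	/* assume create_workqueue("my_wq") ran */

static void example(void)
{
	queue_delayed_work(my_wq, &my_dwork, HZ);

	/*
	 * If the timer has not fired yet, the fixed code requeues the work
	 * on my_wq and flushes it there; the old code wrongly ran it on
	 * the default keventd_wq.
	 */
	flush_delayed_work(&my_dwork);
}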