diff options
Diffstat (limited to 'kernel')
77 files changed, 5859 insertions, 3895 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..7b974699f8c2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
| 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
| 13 | async.o | 13 | async.o range.o |
| 14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o | ||
| 14 | obj-y += groups.o | 15 | obj-y += groups.o |
| 15 | 16 | ||
| 16 | ifdef CONFIG_FUNCTION_TRACER | 17 | ifdef CONFIG_FUNCTION_TRACER |
| @@ -100,6 +101,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o | |||
| 100 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 101 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
| 101 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 102 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
| 102 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 103 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
| 104 | obj-$(CONFIG_PADATA) += padata.o | ||
| 103 | 105 | ||
| 104 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 106 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
| 105 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 107 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479db..028e85663f27 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
| @@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) | |||
| 548 | return 0; | 548 | return 0; |
| 549 | } | 549 | } |
| 550 | 550 | ||
| 551 | static int compare_root(struct vfsmount *mnt, void *arg) | ||
| 552 | { | ||
| 553 | return mnt->mnt_root->d_inode == arg; | ||
| 554 | } | ||
| 555 | |||
| 551 | void audit_trim_trees(void) | 556 | void audit_trim_trees(void) |
| 552 | { | 557 | { |
| 553 | struct list_head cursor; | 558 | struct list_head cursor; |
| @@ -559,7 +564,6 @@ void audit_trim_trees(void) | |||
| 559 | struct path path; | 564 | struct path path; |
| 560 | struct vfsmount *root_mnt; | 565 | struct vfsmount *root_mnt; |
| 561 | struct node *node; | 566 | struct node *node; |
| 562 | struct list_head list; | ||
| 563 | int err; | 567 | int err; |
| 564 | 568 | ||
| 565 | tree = container_of(cursor.next, struct audit_tree, list); | 569 | tree = container_of(cursor.next, struct audit_tree, list); |
| @@ -577,24 +581,16 @@ void audit_trim_trees(void) | |||
| 577 | if (!root_mnt) | 581 | if (!root_mnt) |
| 578 | goto skip_it; | 582 | goto skip_it; |
| 579 | 583 | ||
| 580 | list_add_tail(&list, &root_mnt->mnt_list); | ||
| 581 | spin_lock(&hash_lock); | 584 | spin_lock(&hash_lock); |
| 582 | list_for_each_entry(node, &tree->chunks, list) { | 585 | list_for_each_entry(node, &tree->chunks, list) { |
| 583 | struct audit_chunk *chunk = find_chunk(node); | 586 | struct inode *inode = find_chunk(node)->watch.inode; |
| 584 | struct inode *inode = chunk->watch.inode; | ||
| 585 | struct vfsmount *mnt; | ||
| 586 | node->index |= 1U<<31; | 587 | node->index |= 1U<<31; |
| 587 | list_for_each_entry(mnt, &list, mnt_list) { | 588 | if (iterate_mounts(compare_root, inode, root_mnt)) |
| 588 | if (mnt->mnt_root->d_inode == inode) { | 589 | node->index &= ~(1U<<31); |
| 589 | node->index &= ~(1U<<31); | ||
| 590 | break; | ||
| 591 | } | ||
| 592 | } | ||
| 593 | } | 590 | } |
| 594 | spin_unlock(&hash_lock); | 591 | spin_unlock(&hash_lock); |
| 595 | trim_marked(tree); | 592 | trim_marked(tree); |
| 596 | put_tree(tree); | 593 | put_tree(tree); |
| 597 | list_del_init(&list); | ||
| 598 | drop_collected_mounts(root_mnt); | 594 | drop_collected_mounts(root_mnt); |
| 599 | skip_it: | 595 | skip_it: |
| 600 | mutex_lock(&audit_filter_mutex); | 596 | mutex_lock(&audit_filter_mutex); |
| @@ -603,22 +599,6 @@ skip_it: | |||
| 603 | mutex_unlock(&audit_filter_mutex); | 599 | mutex_unlock(&audit_filter_mutex); |
| 604 | } | 600 | } |
| 605 | 601 | ||
| 606 | static int is_under(struct vfsmount *mnt, struct dentry *dentry, | ||
| 607 | struct path *path) | ||
| 608 | { | ||
| 609 | if (mnt != path->mnt) { | ||
| 610 | for (;;) { | ||
| 611 | if (mnt->mnt_parent == mnt) | ||
| 612 | return 0; | ||
| 613 | if (mnt->mnt_parent == path->mnt) | ||
| 614 | break; | ||
| 615 | mnt = mnt->mnt_parent; | ||
| 616 | } | ||
| 617 | dentry = mnt->mnt_mountpoint; | ||
| 618 | } | ||
| 619 | return is_subdir(dentry, path->dentry); | ||
| 620 | } | ||
| 621 | |||
| 622 | int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) | 602 | int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) |
| 623 | { | 603 | { |
| 624 | 604 | ||
| @@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree) | |||
| 638 | put_tree(tree); | 618 | put_tree(tree); |
| 639 | } | 619 | } |
| 640 | 620 | ||
| 621 | static int tag_mount(struct vfsmount *mnt, void *arg) | ||
| 622 | { | ||
| 623 | return tag_chunk(mnt->mnt_root->d_inode, arg); | ||
| 624 | } | ||
| 625 | |||
| 641 | /* called with audit_filter_mutex */ | 626 | /* called with audit_filter_mutex */ |
| 642 | int audit_add_tree_rule(struct audit_krule *rule) | 627 | int audit_add_tree_rule(struct audit_krule *rule) |
| 643 | { | 628 | { |
| 644 | struct audit_tree *seed = rule->tree, *tree; | 629 | struct audit_tree *seed = rule->tree, *tree; |
| 645 | struct path path; | 630 | struct path path; |
| 646 | struct vfsmount *mnt, *p; | 631 | struct vfsmount *mnt; |
| 647 | struct list_head list; | ||
| 648 | int err; | 632 | int err; |
| 649 | 633 | ||
| 650 | list_for_each_entry(tree, &tree_list, list) { | 634 | list_for_each_entry(tree, &tree_list, list) { |
| @@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule) | |||
| 670 | err = -ENOMEM; | 654 | err = -ENOMEM; |
| 671 | goto Err; | 655 | goto Err; |
| 672 | } | 656 | } |
| 673 | list_add_tail(&list, &mnt->mnt_list); | ||
| 674 | 657 | ||
| 675 | get_tree(tree); | 658 | get_tree(tree); |
| 676 | list_for_each_entry(p, &list, mnt_list) { | 659 | err = iterate_mounts(tag_mount, tree, mnt); |
| 677 | err = tag_chunk(p->mnt_root->d_inode, tree); | ||
| 678 | if (err) | ||
| 679 | break; | ||
| 680 | } | ||
| 681 | |||
| 682 | list_del(&list); | ||
| 683 | drop_collected_mounts(mnt); | 660 | drop_collected_mounts(mnt); |
| 684 | 661 | ||
| 685 | if (!err) { | 662 | if (!err) { |
| @@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new) | |||
| 714 | { | 691 | { |
| 715 | struct list_head cursor, barrier; | 692 | struct list_head cursor, barrier; |
| 716 | int failed = 0; | 693 | int failed = 0; |
| 717 | struct path path; | 694 | struct path path1, path2; |
| 718 | struct vfsmount *tagged; | 695 | struct vfsmount *tagged; |
| 719 | struct list_head list; | ||
| 720 | struct vfsmount *mnt; | ||
| 721 | struct dentry *dentry; | ||
| 722 | int err; | 696 | int err; |
| 723 | 697 | ||
| 724 | err = kern_path(new, 0, &path); | 698 | err = kern_path(new, 0, &path2); |
| 725 | if (err) | 699 | if (err) |
| 726 | return err; | 700 | return err; |
| 727 | tagged = collect_mounts(&path); | 701 | tagged = collect_mounts(&path2); |
| 728 | path_put(&path); | 702 | path_put(&path2); |
| 729 | if (!tagged) | 703 | if (!tagged) |
| 730 | return -ENOMEM; | 704 | return -ENOMEM; |
| 731 | 705 | ||
| 732 | err = kern_path(old, 0, &path); | 706 | err = kern_path(old, 0, &path1); |
| 733 | if (err) { | 707 | if (err) { |
| 734 | drop_collected_mounts(tagged); | 708 | drop_collected_mounts(tagged); |
| 735 | return err; | 709 | return err; |
| 736 | } | 710 | } |
| 737 | mnt = mntget(path.mnt); | ||
| 738 | dentry = dget(path.dentry); | ||
| 739 | path_put(&path); | ||
| 740 | |||
| 741 | list_add_tail(&list, &tagged->mnt_list); | ||
| 742 | 711 | ||
| 743 | mutex_lock(&audit_filter_mutex); | 712 | mutex_lock(&audit_filter_mutex); |
| 744 | list_add(&barrier, &tree_list); | 713 | list_add(&barrier, &tree_list); |
| @@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new) | |||
| 746 | 715 | ||
| 747 | while (cursor.next != &tree_list) { | 716 | while (cursor.next != &tree_list) { |
| 748 | struct audit_tree *tree; | 717 | struct audit_tree *tree; |
| 749 | struct vfsmount *p; | 718 | int good_one = 0; |
| 750 | 719 | ||
| 751 | tree = container_of(cursor.next, struct audit_tree, list); | 720 | tree = container_of(cursor.next, struct audit_tree, list); |
| 752 | get_tree(tree); | 721 | get_tree(tree); |
| @@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new) | |||
| 754 | list_add(&cursor, &tree->list); | 723 | list_add(&cursor, &tree->list); |
| 755 | mutex_unlock(&audit_filter_mutex); | 724 | mutex_unlock(&audit_filter_mutex); |
| 756 | 725 | ||
| 757 | err = kern_path(tree->pathname, 0, &path); | 726 | err = kern_path(tree->pathname, 0, &path2); |
| 758 | if (err) { | 727 | if (!err) { |
| 759 | put_tree(tree); | 728 | good_one = path_is_under(&path1, &path2); |
| 760 | mutex_lock(&audit_filter_mutex); | 729 | path_put(&path2); |
| 761 | continue; | ||
| 762 | } | 730 | } |
| 763 | 731 | ||
| 764 | spin_lock(&vfsmount_lock); | 732 | if (!good_one) { |
| 765 | if (!is_under(mnt, dentry, &path)) { | ||
| 766 | spin_unlock(&vfsmount_lock); | ||
| 767 | path_put(&path); | ||
| 768 | put_tree(tree); | 733 | put_tree(tree); |
| 769 | mutex_lock(&audit_filter_mutex); | 734 | mutex_lock(&audit_filter_mutex); |
| 770 | continue; | 735 | continue; |
| 771 | } | 736 | } |
| 772 | spin_unlock(&vfsmount_lock); | ||
| 773 | path_put(&path); | ||
| 774 | |||
| 775 | list_for_each_entry(p, &list, mnt_list) { | ||
| 776 | failed = tag_chunk(p->mnt_root->d_inode, tree); | ||
| 777 | if (failed) | ||
| 778 | break; | ||
| 779 | } | ||
| 780 | 737 | ||
| 738 | failed = iterate_mounts(tag_mount, tree, tagged); | ||
| 781 | if (failed) { | 739 | if (failed) { |
| 782 | put_tree(tree); | 740 | put_tree(tree); |
| 783 | mutex_lock(&audit_filter_mutex); | 741 | mutex_lock(&audit_filter_mutex); |
| @@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new) | |||
| 818 | } | 776 | } |
| 819 | list_del(&barrier); | 777 | list_del(&barrier); |
| 820 | list_del(&cursor); | 778 | list_del(&cursor); |
| 821 | list_del(&list); | ||
| 822 | mutex_unlock(&audit_filter_mutex); | 779 | mutex_unlock(&audit_filter_mutex); |
| 823 | dput(dentry); | 780 | path_put(&path1); |
| 824 | mntput(mnt); | ||
| 825 | drop_collected_mounts(tagged); | 781 | drop_collected_mounts(tagged); |
| 826 | return failed; | 782 | return failed; |
| 827 | } | 783 | } |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e7..f3a461c0970a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
| 1988 | 1988 | ||
| 1989 | /** | 1989 | /** |
| 1990 | * audit_inode_child - collect inode info for created/removed objects | 1990 | * audit_inode_child - collect inode info for created/removed objects |
| 1991 | * @dname: inode's dentry name | ||
| 1992 | * @dentry: dentry being audited | 1991 | * @dentry: dentry being audited |
| 1993 | * @parent: inode of dentry parent | 1992 | * @parent: inode of dentry parent |
| 1994 | * | 1993 | * |
| @@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
| 2000 | * must be hooked prior, in order to capture the target inode during | 1999 | * must be hooked prior, in order to capture the target inode during |
| 2001 | * unsuccessful attempts. | 2000 | * unsuccessful attempts. |
| 2002 | */ | 2001 | */ |
| 2003 | void __audit_inode_child(const char *dname, const struct dentry *dentry, | 2002 | void __audit_inode_child(const struct dentry *dentry, |
| 2004 | const struct inode *parent) | 2003 | const struct inode *parent) |
| 2005 | { | 2004 | { |
| 2006 | int idx; | 2005 | int idx; |
| 2007 | struct audit_context *context = current->audit_context; | 2006 | struct audit_context *context = current->audit_context; |
| 2008 | const char *found_parent = NULL, *found_child = NULL; | 2007 | const char *found_parent = NULL, *found_child = NULL; |
| 2009 | const struct inode *inode = dentry->d_inode; | 2008 | const struct inode *inode = dentry->d_inode; |
| 2009 | const char *dname = dentry->d_name.name; | ||
| 2010 | int dirlen = 0; | 2010 | int dirlen = 0; |
| 2011 | 2011 | ||
| 2012 | if (!context->in_syscall) | 2012 | if (!context->in_syscall) |
| @@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, | |||
| 2014 | 2014 | ||
| 2015 | if (inode) | 2015 | if (inode) |
| 2016 | handle_one(inode); | 2016 | handle_one(inode); |
| 2017 | /* determine matching parent */ | ||
| 2018 | if (!dname) | ||
| 2019 | goto add_names; | ||
| 2020 | 2017 | ||
| 2021 | /* parent is more likely, look for it first */ | 2018 | /* parent is more likely, look for it first */ |
| 2022 | for (idx = 0; idx < context->name_count; idx++) { | 2019 | for (idx = 0; idx < context->name_count; idx++) { |
diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e60521f..9e4697e9b276 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
| @@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, | |||
| 135 | if (pid && (pid != task_pid_vnr(current))) { | 135 | if (pid && (pid != task_pid_vnr(current))) { |
| 136 | struct task_struct *target; | 136 | struct task_struct *target; |
| 137 | 137 | ||
| 138 | read_lock(&tasklist_lock); | 138 | rcu_read_lock(); |
| 139 | 139 | ||
| 140 | target = find_task_by_vpid(pid); | 140 | target = find_task_by_vpid(pid); |
| 141 | if (!target) | 141 | if (!target) |
| @@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, | |||
| 143 | else | 143 | else |
| 144 | ret = security_capget(target, pEp, pIp, pPp); | 144 | ret = security_capget(target, pEp, pIp, pPp); |
| 145 | 145 | ||
| 146 | read_unlock(&tasklist_lock); | 146 | rcu_read_unlock(); |
| 147 | } else | 147 | } else |
| 148 | ret = security_capget(current, pEp, pIp, pPp); | 148 | ret = security_capget(current, pEp, pIp, pPp); |
| 149 | 149 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa3bee566446..4fd90e129772 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | */ | 23 | */ |
| 24 | 24 | ||
| 25 | #include <linux/cgroup.h> | 25 | #include <linux/cgroup.h> |
| 26 | #include <linux/module.h> | ||
| 26 | #include <linux/ctype.h> | 27 | #include <linux/ctype.h> |
| 27 | #include <linux/errno.h> | 28 | #include <linux/errno.h> |
| 28 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
| @@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
| 166 | */ | 167 | */ |
| 167 | static int need_forkexit_callback __read_mostly; | 168 | static int need_forkexit_callback __read_mostly; |
| 168 | 169 | ||
| 170 | #ifdef CONFIG_PROVE_LOCKING | ||
| 171 | int cgroup_lock_is_held(void) | ||
| 172 | { | ||
| 173 | return lockdep_is_held(&cgroup_mutex); | ||
| 174 | } | ||
| 175 | #else /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 176 | int cgroup_lock_is_held(void) | ||
| 177 | { | ||
| 178 | return mutex_is_locked(&cgroup_mutex); | ||
| 179 | } | ||
| 180 | #endif /* #else #ifdef CONFIG_PROVE_LOCKING */ | ||
| 181 | |||
| 182 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | ||
| 183 | |||
| 169 | /* convenient tests for these bits */ | 184 | /* convenient tests for these bits */ |
| 170 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 185 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
| 171 | { | 186 | { |
diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 000000000000..3cb2c661bb78 --- /dev/null +++ b/kernel/early_res.c | |||
| @@ -0,0 +1,578 @@ | |||
| 1 | /* | ||
| 2 | * early_res, could be used to replace bootmem | ||
| 3 | */ | ||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/types.h> | ||
| 6 | #include <linux/init.h> | ||
| 7 | #include <linux/bootmem.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/early_res.h> | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Early reserved memory areas. | ||
| 13 | */ | ||
| 14 | /* | ||
| 15 | * need to make sure this one is big enough before | ||
| 16 | * find_fw_memmap_area can be used | ||
| 17 | */ | ||
| 18 | #define MAX_EARLY_RES_X 32 | ||
| 19 | |||
| 20 | struct early_res { | ||
| 21 | u64 start, end; | ||
| 22 | char name[15]; | ||
| 23 | char overlap_ok; | ||
| 24 | }; | ||
| 25 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
| 26 | |||
| 27 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
| 28 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
| 29 | static int early_res_count __initdata; | ||
| 30 | |||
| 31 | static int __init find_overlapped_early(u64 start, u64 end) | ||
| 32 | { | ||
| 33 | int i; | ||
| 34 | struct early_res *r; | ||
| 35 | |||
| 36 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 37 | r = &early_res[i]; | ||
| 38 | if (end > r->start && start < r->end) | ||
| 39 | break; | ||
| 40 | } | ||
| 41 | |||
| 42 | return i; | ||
| 43 | } | ||
| 44 | |||
| 45 | /* | ||
| 46 | * Drop the i-th range from the early reservation map, | ||
| 47 | * by copying any higher ranges down one over it, and | ||
| 48 | * clearing what had been the last slot. | ||
| 49 | */ | ||
| 50 | static void __init drop_range(int i) | ||
| 51 | { | ||
| 52 | int j; | ||
| 53 | |||
| 54 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
| 55 | ; | ||
| 56 | |||
| 57 | memmove(&early_res[i], &early_res[i + 1], | ||
| 58 | (j - 1 - i) * sizeof(struct early_res)); | ||
| 59 | |||
| 60 | early_res[j - 1].end = 0; | ||
| 61 | early_res_count--; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
| 65 | { | ||
| 66 | u64 common_start, common_end; | ||
| 67 | u64 old_start, old_end; | ||
| 68 | |||
| 69 | old_start = early_res[i].start; | ||
| 70 | old_end = early_res[i].end; | ||
| 71 | common_start = max(old_start, start); | ||
| 72 | common_end = min(old_end, end); | ||
| 73 | |||
| 74 | /* no overlap ? */ | ||
| 75 | if (common_start >= common_end) | ||
| 76 | return; | ||
| 77 | |||
| 78 | if (old_start < common_start) { | ||
| 79 | /* make head segment */ | ||
| 80 | early_res[i].end = common_start; | ||
| 81 | if (old_end > common_end) { | ||
| 82 | char name[15]; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Save a local copy of the name, since the | ||
| 86 | * early_res array could get resized inside | ||
| 87 | * reserve_early_without_check() -> | ||
| 88 | * __check_and_double_early_res(), which would | ||
| 89 | * make the current name pointer invalid. | ||
| 90 | */ | ||
| 91 | strncpy(name, early_res[i].name, | ||
| 92 | sizeof(early_res[i].name) - 1); | ||
| 93 | /* add another entry for the leftover tail */ | ||
| 94 | reserve_early_without_check(common_end, old_end, name); | ||
| 95 | } | ||
| 96 | return; | ||
| 97 | } else { | ||
| 98 | if (old_end > common_end) { | ||
| 99 | /* reuse the entry for tail left */ | ||
| 100 | early_res[i].start = common_end; | ||
| 101 | return; | ||
| 102 | } | ||
| 103 | /* all covered */ | ||
| 104 | drop_range(i); | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Split any existing ranges that: | ||
| 110 | * 1) are marked 'overlap_ok', and | ||
| 111 | * 2) overlap with the stated range [start, end) | ||
| 112 | * into whatever portion (if any) of the existing range is entirely | ||
| 113 | * below or entirely above the stated range. Drop the portion | ||
| 114 | * of the existing range that overlaps with the stated range, | ||
| 115 | * which will allow the caller of this routine to then add that | ||
| 116 | * stated range without conflicting with any existing range. | ||
| 117 | */ | ||
| 118 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
| 119 | { | ||
| 120 | int i; | ||
| 121 | struct early_res *r; | ||
| 122 | u64 lower_start, lower_end; | ||
| 123 | u64 upper_start, upper_end; | ||
| 124 | char name[15]; | ||
| 125 | |||
| 126 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 127 | r = &early_res[i]; | ||
| 128 | |||
| 129 | /* Continue past non-overlapping ranges */ | ||
| 130 | if (end <= r->start || start >= r->end) | ||
| 131 | continue; | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Leave non-ok overlaps as is; let caller | ||
| 135 | * panic "Overlapping early reservations" | ||
| 136 | * when it hits this overlap. | ||
| 137 | */ | ||
| 138 | if (!r->overlap_ok) | ||
| 139 | return; | ||
| 140 | |||
| 141 | /* | ||
| 142 | * We have an ok overlap. We will drop it from the early | ||
| 143 | * reservation map, and add back in any non-overlapping | ||
| 144 | * portions (lower or upper) as separate, overlap_ok, | ||
| 145 | * non-overlapping ranges. | ||
| 146 | */ | ||
| 147 | |||
| 148 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
| 149 | strncpy(name, r->name, sizeof(name) - 1); | ||
| 150 | |||
| 151 | lower_start = lower_end = 0; | ||
| 152 | upper_start = upper_end = 0; | ||
| 153 | if (r->start < start) { | ||
| 154 | lower_start = r->start; | ||
| 155 | lower_end = start; | ||
| 156 | } | ||
| 157 | if (r->end > end) { | ||
| 158 | upper_start = end; | ||
| 159 | upper_end = r->end; | ||
| 160 | } | ||
| 161 | |||
| 162 | /* 2. Drop the original ok overlapping range */ | ||
| 163 | drop_range(i); | ||
| 164 | |||
| 165 | i--; /* resume for-loop on copied down entry */ | ||
| 166 | |||
| 167 | /* 3. Add back in any non-overlapping ranges. */ | ||
| 168 | if (lower_end) | ||
| 169 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
| 170 | if (upper_end) | ||
| 171 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
| 176 | int overlap_ok) | ||
| 177 | { | ||
| 178 | int i; | ||
| 179 | struct early_res *r; | ||
| 180 | |||
| 181 | i = find_overlapped_early(start, end); | ||
| 182 | if (i >= max_early_res) | ||
| 183 | panic("Too many early reservations"); | ||
| 184 | r = &early_res[i]; | ||
| 185 | if (r->end) | ||
| 186 | panic("Overlapping early reservations " | ||
| 187 | "%llx-%llx %s to %llx-%llx %s\n", | ||
| 188 | start, end - 1, name ? name : "", r->start, | ||
| 189 | r->end - 1, r->name); | ||
| 190 | r->start = start; | ||
| 191 | r->end = end; | ||
| 192 | r->overlap_ok = overlap_ok; | ||
| 193 | if (name) | ||
| 194 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
| 195 | early_res_count++; | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * A few early reservations come here. | ||
| 200 | * | ||
| 201 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
| 202 | * is ok for these reservations to overlap an earlier reservation. | ||
| 203 | * Rather it means that it is ok for subsequent reservations to | ||
| 204 | * overlap this one. | ||
| 205 | * | ||
| 206 | * Use this entry point to reserve early ranges when you are doing | ||
| 207 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
| 208 | * just in case, and don't mind a subsequent overlapping reservation | ||
| 209 | * that is known to be needed. | ||
| 210 | * | ||
| 211 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
| 212 | * It would be needed if we had two colliding 'overlap_ok' | ||
| 213 | * reservations, so that the second such would not panic on the | ||
| 214 | * overlap with the first. We don't have any such as of this | ||
| 215 | * writing, but might as well tolerate such if it happens in | ||
| 216 | * the future. | ||
| 217 | */ | ||
| 218 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
| 219 | { | ||
| 220 | drop_overlaps_that_are_ok(start, end); | ||
| 221 | __reserve_early(start, end, name, 1); | ||
| 222 | } | ||
| 223 | |||
| 224 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
| 225 | { | ||
| 226 | u64 start, end, size, mem; | ||
| 227 | struct early_res *new; | ||
| 228 | |||
| 229 | /* do we have enough slots left ? */ | ||
| 230 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
| 231 | return; | ||
| 232 | |||
| 233 | /* double it */ | ||
| 234 | mem = -1ULL; | ||
| 235 | size = sizeof(struct early_res) * max_early_res * 2; | ||
| 236 | if (early_res == early_res_x) | ||
| 237 | start = 0; | ||
| 238 | else | ||
| 239 | start = early_res[0].end; | ||
| 240 | end = ex_start; | ||
| 241 | if (start + size < end) | ||
| 242 | mem = find_fw_memmap_area(start, end, size, | ||
| 243 | sizeof(struct early_res)); | ||
| 244 | if (mem == -1ULL) { | ||
| 245 | start = ex_end; | ||
| 246 | end = get_max_mapped(); | ||
| 247 | if (start + size < end) | ||
| 248 | mem = find_fw_memmap_area(start, end, size, | ||
| 249 | sizeof(struct early_res)); | ||
| 250 | } | ||
| 251 | if (mem == -1ULL) | ||
| 252 | panic("can not find more space for early_res array"); | ||
| 253 | |||
| 254 | new = __va(mem); | ||
| 255 | /* reserve the first entry for the new array's own memory */ | ||
| 256 | new[0].start = mem; | ||
| 257 | new[0].end = mem + size; | ||
| 258 | new[0].overlap_ok = 0; | ||
| 259 | /* copy old to new */ | ||
| 260 | if (early_res == early_res_x) { | ||
| 261 | memcpy(&new[1], &early_res[0], | ||
| 262 | sizeof(struct early_res) * max_early_res); | ||
| 263 | memset(&new[max_early_res+1], 0, | ||
| 264 | sizeof(struct early_res) * (max_early_res - 1)); | ||
| 265 | early_res_count++; | ||
| 266 | } else { | ||
| 267 | memcpy(&new[1], &early_res[1], | ||
| 268 | sizeof(struct early_res) * (max_early_res - 1)); | ||
| 269 | memset(&new[max_early_res], 0, | ||
| 270 | sizeof(struct early_res) * max_early_res); | ||
| 271 | } | ||
| 272 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
| 273 | early_res = new; | ||
| 274 | max_early_res *= 2; | ||
| 275 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
| 276 | max_early_res, mem, mem + size - 1); | ||
| 277 | } | ||
| 278 | |||
| 279 | /* | ||
| 280 | * Most early reservations come here. | ||
| 281 | * | ||
| 282 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
| 283 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
| 284 | * range without risk of panic'ing on an overlapping overlap_ok | ||
| 285 | * early reservation. | ||
| 286 | */ | ||
| 287 | void __init reserve_early(u64 start, u64 end, char *name) | ||
| 288 | { | ||
| 289 | if (start >= end) | ||
| 290 | return; | ||
| 291 | |||
| 292 | __check_and_double_early_res(start, end); | ||
| 293 | |||
| 294 | drop_overlaps_that_are_ok(start, end); | ||
| 295 | __reserve_early(start, end, name, 0); | ||
| 296 | } | ||
| 297 | |||
| 298 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
| 299 | { | ||
| 300 | struct early_res *r; | ||
| 301 | |||
| 302 | if (start >= end) | ||
| 303 | return; | ||
| 304 | |||
| 305 | __check_and_double_early_res(start, end); | ||
| 306 | |||
| 307 | r = &early_res[early_res_count]; | ||
| 308 | |||
| 309 | r->start = start; | ||
| 310 | r->end = end; | ||
| 311 | r->overlap_ok = 0; | ||
| 312 | if (name) | ||
| 313 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
| 314 | early_res_count++; | ||
| 315 | } | ||
| 316 | |||
| 317 | void __init free_early(u64 start, u64 end) | ||
| 318 | { | ||
| 319 | struct early_res *r; | ||
| 320 | int i; | ||
| 321 | |||
| 322 | i = find_overlapped_early(start, end); | ||
| 323 | r = &early_res[i]; | ||
| 324 | if (i >= max_early_res || r->end != end || r->start != start) | ||
| 325 | panic("free_early on not reserved area: %llx-%llx!", | ||
| 326 | start, end - 1); | ||
| 327 | |||
| 328 | drop_range(i); | ||
| 329 | } | ||
| 330 | |||
| 331 | void __init free_early_partial(u64 start, u64 end) | ||
| 332 | { | ||
| 333 | struct early_res *r; | ||
| 334 | int i; | ||
| 335 | |||
| 336 | try_next: | ||
| 337 | i = find_overlapped_early(start, end); | ||
| 338 | if (i >= max_early_res) | ||
| 339 | return; | ||
| 340 | |||
| 341 | r = &early_res[i]; | ||
| 342 | /* does [start, end) lie entirely inside this reservation (a hole)? */ | ||
| 343 | if (r->end >= end && r->start <= start) { | ||
| 344 | drop_range_partial(i, start, end); | ||
| 345 | return; | ||
| 346 | } | ||
| 347 | |||
| 348 | drop_range_partial(i, start, end); | ||
| 349 | goto try_next; | ||
| 350 | } | ||
| 351 | |||
| 352 | #ifdef CONFIG_NO_BOOTMEM | ||
| 353 | static void __init subtract_early_res(struct range *range, int az) | ||
| 354 | { | ||
| 355 | int i, count; | ||
| 356 | u64 final_start, final_end; | ||
| 357 | int idx = 0; | ||
| 358 | |||
| 359 | count = 0; | ||
| 360 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 361 | count++; | ||
| 362 | |||
| 363 | /* need to skip first one ?*/ | ||
| 364 | if (early_res != early_res_x) | ||
| 365 | idx = 1; | ||
| 366 | |||
| 367 | #define DEBUG_PRINT_EARLY_RES 1 | ||
| 368 | |||
| 369 | #if DEBUG_PRINT_EARLY_RES | ||
| 370 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
| 371 | #endif | ||
| 372 | for (i = idx; i < count; i++) { | ||
| 373 | struct early_res *r = &early_res[i]; | ||
| 374 | #if DEBUG_PRINT_EARLY_RES | ||
| 375 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
| 376 | r->start, r->end, r->name); | ||
| 377 | #endif | ||
| 378 | final_start = PFN_DOWN(r->start); | ||
| 379 | final_end = PFN_UP(r->end); | ||
| 380 | if (final_start >= final_end) | ||
| 381 | continue; | ||
| 382 | subtract_range(range, az, final_start, final_end); | ||
| 383 | } | ||
| 384 | |||
| 385 | } | ||
| 386 | |||
| 387 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
| 388 | { | ||
| 389 | int i, count; | ||
| 390 | u64 start = 0, end; | ||
| 391 | u64 size; | ||
| 392 | u64 mem; | ||
| 393 | struct range *range; | ||
| 394 | int nr_range; | ||
| 395 | |||
| 396 | count = 0; | ||
| 397 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 398 | count++; | ||
| 399 | |||
| 400 | count *= 2; | ||
| 401 | |||
| 402 | size = sizeof(struct range) * count; | ||
| 403 | end = get_max_mapped(); | ||
| 404 | #ifdef MAX_DMA32_PFN | ||
| 405 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
| 406 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
| 407 | #endif | ||
| 408 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
| 409 | if (mem == -1ULL) | ||
| 410 | panic("can not find more space for range free"); | ||
| 411 | |||
| 412 | range = __va(mem); | ||
| 413 | /* use early_node_map[] and early_res to get range array at first */ | ||
| 414 | memset(range, 0, size); | ||
| 415 | nr_range = 0; | ||
| 416 | |||
| 417 | /* need to go over early_node_map to find out good range for node */ | ||
| 418 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
| 419 | #ifdef CONFIG_X86_32 | ||
| 420 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
| 421 | #endif | ||
| 422 | subtract_early_res(range, count); | ||
| 423 | nr_range = clean_sort_range(range, count); | ||
| 424 | |||
| 425 | /* need to clear it ? */ | ||
| 426 | if (nodeid == MAX_NUMNODES) { | ||
| 427 | memset(&early_res[0], 0, | ||
| 428 | sizeof(struct early_res) * max_early_res); | ||
| 429 | early_res = NULL; | ||
| 430 | max_early_res = 0; | ||
| 431 | } | ||
| 432 | |||
| 433 | *rangep = range; | ||
| 434 | return nr_range; | ||
| 435 | } | ||
| 436 | #else | ||
| 437 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
| 438 | { | ||
| 439 | int i, count; | ||
| 440 | u64 final_start, final_end; | ||
| 441 | int idx = 0; | ||
| 442 | |||
| 443 | count = 0; | ||
| 444 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 445 | count++; | ||
| 446 | |||
| 447 | /* need to skip first one ?*/ | ||
| 448 | if (early_res != early_res_x) | ||
| 449 | idx = 1; | ||
| 450 | |||
| 451 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
| 452 | count - idx, max_early_res, start, end); | ||
| 453 | for (i = idx; i < count; i++) { | ||
| 454 | struct early_res *r = &early_res[i]; | ||
| 455 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
| 456 | r->start, r->end, r->name); | ||
| 457 | final_start = max(start, r->start); | ||
| 458 | final_end = min(end, r->end); | ||
| 459 | if (final_start >= final_end) { | ||
| 460 | printk(KERN_CONT "\n"); | ||
| 461 | continue; | ||
| 462 | } | ||
| 463 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
| 464 | final_start, final_end); | ||
| 465 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
| 466 | BOOTMEM_DEFAULT); | ||
| 467 | } | ||
| 468 | /* clear them */ | ||
| 469 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
| 470 | early_res = NULL; | ||
| 471 | max_early_res = 0; | ||
| 472 | early_res_count = 0; | ||
| 473 | } | ||
| 474 | #endif | ||
| 475 | |||
| 476 | /* Check for already reserved areas */ | ||
| 477 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
| 478 | { | ||
| 479 | int i; | ||
| 480 | u64 addr = *addrp; | ||
| 481 | int changed = 0; | ||
| 482 | struct early_res *r; | ||
| 483 | again: | ||
| 484 | i = find_overlapped_early(addr, addr + size); | ||
| 485 | r = &early_res[i]; | ||
| 486 | if (i < max_early_res && r->end) { | ||
| 487 | *addrp = addr = round_up(r->end, align); | ||
| 488 | changed = 1; | ||
| 489 | goto again; | ||
| 490 | } | ||
| 491 | return changed; | ||
| 492 | } | ||
| 493 | |||
| 494 | /* Check for already reserved areas */ | ||
| 495 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
| 496 | { | ||
| 497 | int i; | ||
| 498 | u64 addr = *addrp, last; | ||
| 499 | u64 size = *sizep; | ||
| 500 | int changed = 0; | ||
| 501 | again: | ||
| 502 | last = addr + size; | ||
| 503 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 504 | struct early_res *r = &early_res[i]; | ||
| 505 | if (last > r->start && addr < r->start) { | ||
| 506 | size = r->start - addr; | ||
| 507 | changed = 1; | ||
| 508 | goto again; | ||
| 509 | } | ||
| 510 | if (last > r->end && addr < r->end) { | ||
| 511 | addr = round_up(r->end, align); | ||
| 512 | size = last - addr; | ||
| 513 | changed = 1; | ||
| 514 | goto again; | ||
| 515 | } | ||
| 516 | if (last <= r->end && addr >= r->start) { | ||
| 517 | (*sizep)++; | ||
| 518 | return 0; | ||
| 519 | } | ||
| 520 | } | ||
| 521 | if (changed) { | ||
| 522 | *addrp = addr; | ||
| 523 | *sizep = size; | ||
| 524 | } | ||
| 525 | return changed; | ||
| 526 | } | ||
| 527 | |||
| 528 | /* | ||
| 529 | * Find a free area with specified alignment in a specific range. | ||
| 530 | * only with the area.between start to end is active range from early_node_map | ||
| 531 | * so they are good as RAM | ||
| 532 | */ | ||
| 533 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
| 534 | u64 size, u64 align) | ||
| 535 | { | ||
| 536 | u64 addr, last; | ||
| 537 | |||
| 538 | addr = round_up(ei_start, align); | ||
| 539 | if (addr < start) | ||
| 540 | addr = round_up(start, align); | ||
| 541 | if (addr >= ei_last) | ||
| 542 | goto out; | ||
| 543 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
| 544 | ; | ||
| 545 | last = addr + size; | ||
| 546 | if (last > ei_last) | ||
| 547 | goto out; | ||
| 548 | if (last > end) | ||
| 549 | goto out; | ||
| 550 | |||
| 551 | return addr; | ||
| 552 | |||
| 553 | out: | ||
| 554 | return -1ULL; | ||
| 555 | } | ||
| 556 | |||
| 557 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
| 558 | u64 *sizep, u64 align) | ||
| 559 | { | ||
| 560 | u64 addr, last; | ||
| 561 | |||
| 562 | addr = round_up(ei_start, align); | ||
| 563 | if (addr < start) | ||
| 564 | addr = round_up(start, align); | ||
| 565 | if (addr >= ei_last) | ||
| 566 | goto out; | ||
| 567 | *sizep = ei_last - addr; | ||
| 568 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
| 569 | ; | ||
| 570 | last = addr + *sizep; | ||
| 571 | if (last > ei_last) | ||
| 572 | goto out; | ||
| 573 | |||
| 574 | return addr; | ||
| 575 | |||
| 576 | out: | ||
| 577 | return -1ULL; | ||
| 578 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 546774a31a66..45ed043b8bf5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 85 | BUG_ON(!sig); | 85 | BUG_ON(!sig); |
| 86 | BUG_ON(!atomic_read(&sig->count)); | 86 | BUG_ON(!atomic_read(&sig->count)); |
| 87 | 87 | ||
| 88 | sighand = rcu_dereference(tsk->sighand); | 88 | sighand = rcu_dereference_check(tsk->sighand, |
| 89 | rcu_read_lock_held() || | ||
| 90 | lockdep_is_held(&tasklist_lock)); | ||
| 89 | spin_lock(&sighand->siglock); | 91 | spin_lock(&sighand->siglock); |
| 90 | 92 | ||
| 91 | posix_cpu_timers_exit(tsk); | 93 | posix_cpu_timers_exit(tsk); |
| @@ -170,8 +172,10 @@ void release_task(struct task_struct * p) | |||
| 170 | repeat: | 172 | repeat: |
| 171 | tracehook_prepare_release_task(p); | 173 | tracehook_prepare_release_task(p); |
| 172 | /* don't need to get the RCU readlock here - the process is dead and | 174 | /* don't need to get the RCU readlock here - the process is dead and |
| 173 | * can't be modifying its own credentials */ | 175 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
| 176 | rcu_read_lock(); | ||
| 174 | atomic_dec(&__task_cred(p)->user->processes); | 177 | atomic_dec(&__task_cred(p)->user->processes); |
| 178 | rcu_read_unlock(); | ||
| 175 | 179 | ||
| 176 | proc_flush_task(p); | 180 | proc_flush_task(p); |
| 177 | 181 | ||
| @@ -473,9 +477,11 @@ static void close_files(struct files_struct * files) | |||
| 473 | /* | 477 | /* |
| 474 | * It is safe to dereference the fd table without RCU or | 478 | * It is safe to dereference the fd table without RCU or |
| 475 | * ->file_lock because this is the last reference to the | 479 | * ->file_lock because this is the last reference to the |
| 476 | * files structure. | 480 | * files structure. But use RCU to shut RCU-lockdep up. |
| 477 | */ | 481 | */ |
| 482 | rcu_read_lock(); | ||
| 478 | fdt = files_fdtable(files); | 483 | fdt = files_fdtable(files); |
| 484 | rcu_read_unlock(); | ||
| 479 | for (;;) { | 485 | for (;;) { |
| 480 | unsigned long set; | 486 | unsigned long set; |
| 481 | i = j * __NFDBITS; | 487 | i = j * __NFDBITS; |
| @@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files) | |||
| 521 | * at the end of the RCU grace period. Otherwise, | 527 | * at the end of the RCU grace period. Otherwise, |
| 522 | * you can free files immediately. | 528 | * you can free files immediately. |
| 523 | */ | 529 | */ |
| 530 | rcu_read_lock(); | ||
| 524 | fdt = files_fdtable(files); | 531 | fdt = files_fdtable(files); |
| 525 | if (fdt != &files->fdtab) | 532 | if (fdt != &files->fdtab) |
| 526 | kmem_cache_free(files_cachep, files); | 533 | kmem_cache_free(files_cachep, files); |
| 527 | free_fdtable(fdt); | 534 | free_fdtable(fdt); |
| 535 | rcu_read_unlock(); | ||
| 528 | } | 536 | } |
| 529 | } | 537 | } |
| 530 | 538 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index f88bd984df35..17bbf093356d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */ | |||
| 86 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | 86 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
| 87 | 87 | ||
| 88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | 88 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
| 89 | EXPORT_SYMBOL_GPL(tasklist_lock); | ||
| 89 | 90 | ||
| 90 | int nr_processes(void) | 91 | int nr_processes(void) |
| 91 | { | 92 | { |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 235716556bf1..d49afb2395e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
| 146 | struct task_struct *p; | 146 | struct task_struct *p; |
| 147 | 147 | ||
| 148 | ret = -ESRCH; | 148 | ret = -ESRCH; |
| 149 | read_lock(&tasklist_lock); | 149 | rcu_read_lock(); |
| 150 | p = find_task_by_vpid(pid); | 150 | p = find_task_by_vpid(pid); |
| 151 | if (!p) | 151 | if (!p) |
| 152 | goto err_unlock; | 152 | goto err_unlock; |
| @@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
| 157 | !capable(CAP_SYS_PTRACE)) | 157 | !capable(CAP_SYS_PTRACE)) |
| 158 | goto err_unlock; | 158 | goto err_unlock; |
| 159 | head = p->compat_robust_list; | 159 | head = p->compat_robust_list; |
| 160 | read_unlock(&tasklist_lock); | 160 | rcu_read_unlock(); |
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | if (put_user(sizeof(*head), len_ptr)) | 163 | if (put_user(sizeof(*head), len_ptr)) |
| @@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
| 165 | return put_user(ptr_to_compat(head), head_ptr); | 165 | return put_user(ptr_to_compat(head), head_ptr); |
| 166 | 166 | ||
| 167 | err_unlock: | 167 | err_unlock: |
| 168 | read_unlock(&tasklist_lock); | 168 | rcu_read_unlock(); |
| 169 | 169 | ||
| 170 | return ret; | 170 | return ret; |
| 171 | } | 171 | } |
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8a5c7d55ac9f..967e66143e11 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
| @@ -360,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | |||
| 360 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) | 360 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) |
| 361 | { | 361 | { |
| 362 | u64 old_addr = bp->attr.bp_addr; | 362 | u64 old_addr = bp->attr.bp_addr; |
| 363 | u64 old_len = bp->attr.bp_len; | ||
| 363 | int old_type = bp->attr.bp_type; | 364 | int old_type = bp->attr.bp_type; |
| 364 | int old_len = bp->attr.bp_len; | ||
| 365 | int err = 0; | 365 | int err = 0; |
| 366 | 366 | ||
| 367 | perf_event_disable(bp); | 367 | perf_event_disable(bp); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..d70394f12ee9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -18,11 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | #include "internals.h" | 19 | #include "internals.h" |
| 20 | 20 | ||
| 21 | /** | 21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) |
| 22 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
| 23 | * @irq: irq number to initialize | ||
| 24 | */ | ||
| 25 | void dynamic_irq_init(unsigned int irq) | ||
| 26 | { | 22 | { |
| 27 | struct irq_desc *desc; | 23 | struct irq_desc *desc; |
| 28 | unsigned long flags; | 24 | unsigned long flags; |
| @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) | |||
| 41 | desc->depth = 1; | 37 | desc->depth = 1; |
| 42 | desc->msi_desc = NULL; | 38 | desc->msi_desc = NULL; |
| 43 | desc->handler_data = NULL; | 39 | desc->handler_data = NULL; |
| 44 | desc->chip_data = NULL; | 40 | if (!keep_chip_data) |
| 41 | desc->chip_data = NULL; | ||
| 45 | desc->action = NULL; | 42 | desc->action = NULL; |
| 46 | desc->irq_count = 0; | 43 | desc->irq_count = 0; |
| 47 | desc->irqs_unhandled = 0; | 44 | desc->irqs_unhandled = 0; |
| @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) | |||
| 55 | } | 52 | } |
| 56 | 53 | ||
| 57 | /** | 54 | /** |
| 58 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 55 | * dynamic_irq_init - initialize a dynamically allocated irq |
| 59 | * @irq: irq number to initialize | 56 | * @irq: irq number to initialize |
| 60 | */ | 57 | */ |
| 61 | void dynamic_irq_cleanup(unsigned int irq) | 58 | void dynamic_irq_init(unsigned int irq) |
| 59 | { | ||
| 60 | dynamic_irq_init_x(irq, false); | ||
| 61 | } | ||
| 62 | |||
| 63 | /** | ||
| 64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
| 65 | * @irq: irq number to initialize | ||
| 66 | * | ||
| 67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
| 68 | */ | ||
| 69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
| 70 | { | ||
| 71 | dynamic_irq_init_x(irq, true); | ||
| 72 | } | ||
| 73 | |||
| 74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
| 62 | { | 75 | { |
| 63 | struct irq_desc *desc = irq_to_desc(irq); | 76 | struct irq_desc *desc = irq_to_desc(irq); |
| 64 | unsigned long flags; | 77 | unsigned long flags; |
| @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
| 77 | } | 90 | } |
| 78 | desc->msi_desc = NULL; | 91 | desc->msi_desc = NULL; |
| 79 | desc->handler_data = NULL; | 92 | desc->handler_data = NULL; |
| 80 | desc->chip_data = NULL; | 93 | if (!keep_chip_data) |
| 94 | desc->chip_data = NULL; | ||
| 81 | desc->handle_irq = handle_bad_irq; | 95 | desc->handle_irq = handle_bad_irq; |
| 82 | desc->chip = &no_irq_chip; | 96 | desc->chip = &no_irq_chip; |
| 83 | desc->name = NULL; | 97 | desc->name = NULL; |
| @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
| 85 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 99 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 86 | } | 100 | } |
| 87 | 101 | ||
| 102 | /** | ||
| 103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
| 104 | * @irq: irq number to initialize | ||
| 105 | */ | ||
| 106 | void dynamic_irq_cleanup(unsigned int irq) | ||
| 107 | { | ||
| 108 | dynamic_irq_cleanup_x(irq, false); | ||
| 109 | } | ||
| 110 | |||
| 111 | /** | ||
| 112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
| 113 | * @irq: irq number to initialize | ||
| 114 | * | ||
| 115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
| 116 | */ | ||
| 117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
| 118 | { | ||
| 119 | dynamic_irq_cleanup_x(irq, true); | ||
| 120 | } | ||
| 121 | |||
| 88 | 122 | ||
| 89 | /** | 123 | /** |
| 90 | * set_irq_chip - set the irq chip for an irq | 124 | * set_irq_chip - set the irq chip for an irq |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
| 20 | #include <linux/rculist.h> | 20 | #include <linux/rculist.h> |
| 21 | #include <linux/hash.h> | 21 | #include <linux/hash.h> |
| 22 | #include <linux/bootmem.h> | 22 | #include <linux/radix-tree.h> |
| 23 | #include <trace/events/irq.h> | 23 | #include <trace/events/irq.h> |
| 24 | 24 | ||
| 25 | #include "internals.h" | 25 | #include "internals.h" |
| @@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | |||
| 87 | { | 87 | { |
| 88 | void *ptr; | 88 | void *ptr; |
| 89 | 89 | ||
| 90 | if (slab_is_available()) | 90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), |
| 91 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | 91 | GFP_ATOMIC, node); |
| 92 | GFP_ATOMIC, node); | ||
| 93 | else | ||
| 94 | ptr = alloc_bootmem_node(NODE_DATA(node), | ||
| 95 | nr * sizeof(*desc->kstat_irqs)); | ||
| 96 | 92 | ||
| 97 | /* | 93 | /* |
| 98 | * don't overwite if can not get new one | 94 | * don't overwite if can not get new one |
| @@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | |||
| 132 | */ | 128 | */ |
| 133 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | 129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); |
| 134 | 130 | ||
| 135 | struct irq_desc **irq_desc_ptrs __read_mostly; | 131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); |
| 132 | |||
| 133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 134 | { | ||
| 135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
| 136 | } | ||
| 137 | |||
| 138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
| 139 | { | ||
| 140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
| 141 | } | ||
| 142 | |||
| 143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 144 | { | ||
| 145 | void **ptr; | ||
| 146 | |||
| 147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
| 148 | if (ptr) | ||
| 149 | radix_tree_replace_slot(ptr, desc); | ||
| 150 | } | ||
| 136 | 151 | ||
| 137 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | 152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { |
| 138 | [0 ... NR_IRQS_LEGACY-1] = { | 153 | [0 ... NR_IRQS_LEGACY-1] = { |
| @@ -164,9 +179,6 @@ int __init early_irq_init(void) | |||
| 164 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | 179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); |
| 165 | node = first_online_node; | 180 | node = first_online_node; |
| 166 | 181 | ||
| 167 | /* allocate irq_desc_ptrs array based on nr_irqs */ | ||
| 168 | irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); | ||
| 169 | |||
| 170 | /* allocate based on nr_cpu_ids */ | 182 | /* allocate based on nr_cpu_ids */ |
| 171 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | 183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * |
| 172 | sizeof(int), GFP_NOWAIT, node); | 184 | sizeof(int), GFP_NOWAIT, node); |
| @@ -180,23 +192,12 @@ int __init early_irq_init(void) | |||
| 180 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| 181 | alloc_desc_masks(&desc[i], node, true); | 193 | alloc_desc_masks(&desc[i], node, true); |
| 182 | init_desc_masks(&desc[i]); | 194 | init_desc_masks(&desc[i]); |
| 183 | irq_desc_ptrs[i] = desc + i; | 195 | set_irq_desc(i, &desc[i]); |
| 184 | } | 196 | } |
| 185 | 197 | ||
| 186 | for (i = legacy_count; i < nr_irqs; i++) | ||
| 187 | irq_desc_ptrs[i] = NULL; | ||
| 188 | |||
| 189 | return arch_early_irq_init(); | 198 | return arch_early_irq_init(); |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
| 193 | { | ||
| 194 | if (irq_desc_ptrs && irq < nr_irqs) | ||
| 195 | return irq_desc_ptrs[irq]; | ||
| 196 | |||
| 197 | return NULL; | ||
| 198 | } | ||
| 199 | |||
| 200 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | 201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) |
| 201 | { | 202 | { |
| 202 | struct irq_desc *desc; | 203 | struct irq_desc *desc; |
| @@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
| 208 | return NULL; | 209 | return NULL; |
| 209 | } | 210 | } |
| 210 | 211 | ||
| 211 | desc = irq_desc_ptrs[irq]; | 212 | desc = irq_to_desc(irq); |
| 212 | if (desc) | 213 | if (desc) |
| 213 | return desc; | 214 | return desc; |
| 214 | 215 | ||
| 215 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
| 216 | 217 | ||
| 217 | /* We have to check it to avoid races with another CPU */ | 218 | /* We have to check it to avoid races with another CPU */ |
| 218 | desc = irq_desc_ptrs[irq]; | 219 | desc = irq_to_desc(irq); |
| 219 | if (desc) | 220 | if (desc) |
| 220 | goto out_unlock; | 221 | goto out_unlock; |
| 221 | 222 | ||
| 222 | if (slab_is_available()) | 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
| 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
| 224 | else | ||
| 225 | desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); | ||
| 226 | 224 | ||
| 227 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | 225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); |
| 228 | if (!desc) { | 226 | if (!desc) { |
| @@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
| 231 | } | 229 | } |
| 232 | init_one_irq_desc(irq, desc, node); | 230 | init_one_irq_desc(irq, desc, node); |
| 233 | 231 | ||
| 234 | irq_desc_ptrs[irq] = desc; | 232 | set_irq_desc(irq, desc); |
| 235 | 233 | ||
| 236 | out_unlock: | 234 | out_unlock: |
| 237 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); | |||
| 21 | extern raw_spinlock_t sparse_irq_lock; | 21 | extern raw_spinlock_t sparse_irq_lock; |
| 22 | 22 | ||
| 23 | #ifdef CONFIG_SPARSE_IRQ | 23 | #ifdef CONFIG_SPARSE_IRQ |
| 24 | /* irq_desc_ptrs allocated at boot time */ | 24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); |
| 25 | extern struct irq_desc **irq_desc_ptrs; | ||
| 26 | #else | ||
| 27 | /* irq_desc_ptrs is a fixed size array */ | ||
| 28 | extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; | ||
| 29 | #endif | 25 | #endif |
| 30 | 26 | ||
| 31 | #ifdef CONFIG_PROC_FS | 27 | #ifdef CONFIG_PROC_FS |
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..963559dbd858 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c | |||
| @@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
| 70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
| 71 | 71 | ||
| 72 | /* We have to check it to avoid races with another CPU */ | 72 | /* We have to check it to avoid races with another CPU */ |
| 73 | desc = irq_desc_ptrs[irq]; | 73 | desc = irq_to_desc(irq); |
| 74 | 74 | ||
| 75 | if (desc && old_desc != desc) | 75 | if (desc && old_desc != desc) |
| 76 | goto out_unlock; | 76 | goto out_unlock; |
| @@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
| 90 | goto out_unlock; | 90 | goto out_unlock; |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | irq_desc_ptrs[irq] = desc; | 93 | replace_irq_desc(irq, desc); |
| 94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
| 95 | 95 | ||
| 96 | /* free the old one */ | 96 | /* free the old one */ |
diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb73155..87ebe8adc474 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -41,7 +41,7 @@ | |||
| 41 | #include <asm/sections.h> | 41 | #include <asm/sections.h> |
| 42 | 42 | ||
| 43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 43 | /* Per cpu memory for storing cpu states in case of system crash. */ |
| 44 | note_buf_t* crash_notes; | 44 | note_buf_t __percpu *crash_notes; |
| 45 | 45 | ||
| 46 | /* vmcoreinfo stuff */ | 46 | /* vmcoreinfo stuff */ |
| 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 498cabba225e..35edbe22e9a9 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
| @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) | |||
| 80 | 80 | ||
| 81 | buffer = kmalloc(size, gfp_mask); | 81 | buffer = kmalloc(size, gfp_mask); |
| 82 | if (!buffer) { | 82 | if (!buffer) { |
| 83 | _kfifo_init(fifo, 0, 0); | 83 | _kfifo_init(fifo, NULL, 0); |
| 84 | return -ENOMEM; | 84 | return -ENOMEM; |
| 85 | } | 85 | } |
| 86 | 86 | ||
| @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); | |||
| 97 | void kfifo_free(struct kfifo *fifo) | 97 | void kfifo_free(struct kfifo *fifo) |
| 98 | { | 98 | { |
| 99 | kfree(fifo->buffer); | 99 | kfree(fifo->buffer); |
| 100 | _kfifo_init(fifo, NULL, 0); | ||
| 100 | } | 101 | } |
| 101 | EXPORT_SYMBOL(kfifo_free); | 102 | EXPORT_SYMBOL(kfifo_free); |
| 102 | 103 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302a0204..fa034d29cf73 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -42,8 +42,11 @@ | |||
| 42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
| 43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
| 44 | #include <linux/debugfs.h> | 44 | #include <linux/debugfs.h> |
| 45 | #include <linux/sysctl.h> | ||
| 45 | #include <linux/kdebug.h> | 46 | #include <linux/kdebug.h> |
| 46 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
| 48 | #include <linux/ftrace.h> | ||
| 49 | #include <linux/cpu.h> | ||
| 47 | 50 | ||
| 48 | #include <asm-generic/sections.h> | 51 | #include <asm-generic/sections.h> |
| 49 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
| @@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
| 93 | {"native_get_debugreg",}, | 96 | {"native_get_debugreg",}, |
| 94 | {"irq_entries_start",}, | 97 | {"irq_entries_start",}, |
| 95 | {"common_interrupt",}, | 98 | {"common_interrupt",}, |
| 99 | {"mcount",}, /* mcount can be called from everywhere */ | ||
| 96 | {NULL} /* Terminator */ | 100 | {NULL} /* Terminator */ |
| 97 | }; | 101 | }; |
| 98 | 102 | ||
| @@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
| 103 | * stepping on the instruction on a vmalloced/kmalloced/data page | 107 | * stepping on the instruction on a vmalloced/kmalloced/data page |
| 104 | * is a recipe for disaster | 108 | * is a recipe for disaster |
| 105 | */ | 109 | */ |
| 106 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | ||
| 107 | |||
| 108 | struct kprobe_insn_page { | 110 | struct kprobe_insn_page { |
| 109 | struct list_head list; | 111 | struct list_head list; |
| 110 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 112 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
| 111 | char slot_used[INSNS_PER_PAGE]; | ||
| 112 | int nused; | 113 | int nused; |
| 113 | int ngarbage; | 114 | int ngarbage; |
| 115 | char slot_used[]; | ||
| 114 | }; | 116 | }; |
| 115 | 117 | ||
| 118 | #define KPROBE_INSN_PAGE_SIZE(slots) \ | ||
| 119 | (offsetof(struct kprobe_insn_page, slot_used) + \ | ||
| 120 | (sizeof(char) * (slots))) | ||
| 121 | |||
| 122 | struct kprobe_insn_cache { | ||
| 123 | struct list_head pages; /* list of kprobe_insn_page */ | ||
| 124 | size_t insn_size; /* size of instruction slot */ | ||
| 125 | int nr_garbage; | ||
| 126 | }; | ||
| 127 | |||
| 128 | static int slots_per_page(struct kprobe_insn_cache *c) | ||
| 129 | { | ||
| 130 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | ||
| 131 | } | ||
| 132 | |||
| 116 | enum kprobe_slot_state { | 133 | enum kprobe_slot_state { |
| 117 | SLOT_CLEAN = 0, | 134 | SLOT_CLEAN = 0, |
| 118 | SLOT_DIRTY = 1, | 135 | SLOT_DIRTY = 1, |
| 119 | SLOT_USED = 2, | 136 | SLOT_USED = 2, |
| 120 | }; | 137 | }; |
| 121 | 138 | ||
| 122 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ | 139 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ |
| 123 | static LIST_HEAD(kprobe_insn_pages); | 140 | static struct kprobe_insn_cache kprobe_insn_slots = { |
| 124 | static int kprobe_garbage_slots; | 141 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
| 125 | static int collect_garbage_slots(void); | 142 | .insn_size = MAX_INSN_SIZE, |
| 126 | 143 | .nr_garbage = 0, | |
| 127 | static int __kprobes check_safety(void) | 144 | }; |
| 128 | { | 145 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); |
| 129 | int ret = 0; | ||
| 130 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) | ||
| 131 | ret = freeze_processes(); | ||
| 132 | if (ret == 0) { | ||
| 133 | struct task_struct *p, *q; | ||
| 134 | do_each_thread(p, q) { | ||
| 135 | if (p != current && p->state == TASK_RUNNING && | ||
| 136 | p->pid != 0) { | ||
| 137 | printk("Check failed: %s is running\n",p->comm); | ||
| 138 | ret = -1; | ||
| 139 | goto loop_end; | ||
| 140 | } | ||
| 141 | } while_each_thread(p, q); | ||
| 142 | } | ||
| 143 | loop_end: | ||
| 144 | thaw_processes(); | ||
| 145 | #else | ||
| 146 | synchronize_sched(); | ||
| 147 | #endif | ||
| 148 | return ret; | ||
| 149 | } | ||
| 150 | 146 | ||
| 151 | /** | 147 | /** |
| 152 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 148 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
| 153 | * We allocate an executable page if there's no room on existing ones. | 149 | * We allocate an executable page if there's no room on existing ones. |
| 154 | */ | 150 | */ |
| 155 | static kprobe_opcode_t __kprobes *__get_insn_slot(void) | 151 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
| 156 | { | 152 | { |
| 157 | struct kprobe_insn_page *kip; | 153 | struct kprobe_insn_page *kip; |
| 158 | 154 | ||
| 159 | retry: | 155 | retry: |
| 160 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 156 | list_for_each_entry(kip, &c->pages, list) { |
| 161 | if (kip->nused < INSNS_PER_PAGE) { | 157 | if (kip->nused < slots_per_page(c)) { |
| 162 | int i; | 158 | int i; |
| 163 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 159 | for (i = 0; i < slots_per_page(c); i++) { |
| 164 | if (kip->slot_used[i] == SLOT_CLEAN) { | 160 | if (kip->slot_used[i] == SLOT_CLEAN) { |
| 165 | kip->slot_used[i] = SLOT_USED; | 161 | kip->slot_used[i] = SLOT_USED; |
| 166 | kip->nused++; | 162 | kip->nused++; |
| 167 | return kip->insns + (i * MAX_INSN_SIZE); | 163 | return kip->insns + (i * c->insn_size); |
| 168 | } | 164 | } |
| 169 | } | 165 | } |
| 170 | /* Surprise! No unused slots. Fix kip->nused. */ | 166 | /* kip->nused is broken. Fix it. */ |
| 171 | kip->nused = INSNS_PER_PAGE; | 167 | kip->nused = slots_per_page(c); |
| 168 | WARN_ON(1); | ||
| 172 | } | 169 | } |
| 173 | } | 170 | } |
| 174 | 171 | ||
| 175 | /* If there are any garbage slots, collect it and try again. */ | 172 | /* If there are any garbage slots, collect it and try again. */ |
| 176 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | 173 | if (c->nr_garbage && collect_garbage_slots(c) == 0) |
| 177 | goto retry; | 174 | goto retry; |
| 178 | } | 175 | |
| 179 | /* All out of space. Need to allocate a new page. Use slot 0. */ | 176 | /* All out of space. Need to allocate a new page. */ |
| 180 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 177 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
| 181 | if (!kip) | 178 | if (!kip) |
| 182 | return NULL; | 179 | return NULL; |
| 183 | 180 | ||
| @@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) | |||
| 192 | return NULL; | 189 | return NULL; |
| 193 | } | 190 | } |
| 194 | INIT_LIST_HEAD(&kip->list); | 191 | INIT_LIST_HEAD(&kip->list); |
| 195 | list_add(&kip->list, &kprobe_insn_pages); | 192 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
| 196 | memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); | ||
| 197 | kip->slot_used[0] = SLOT_USED; | 193 | kip->slot_used[0] = SLOT_USED; |
| 198 | kip->nused = 1; | 194 | kip->nused = 1; |
| 199 | kip->ngarbage = 0; | 195 | kip->ngarbage = 0; |
| 196 | list_add(&kip->list, &c->pages); | ||
| 200 | return kip->insns; | 197 | return kip->insns; |
| 201 | } | 198 | } |
| 202 | 199 | ||
| 200 | |||
| 203 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 201 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
| 204 | { | 202 | { |
| 205 | kprobe_opcode_t *ret; | 203 | kprobe_opcode_t *ret = NULL; |
| 204 | |||
| 206 | mutex_lock(&kprobe_insn_mutex); | 205 | mutex_lock(&kprobe_insn_mutex); |
| 207 | ret = __get_insn_slot(); | 206 | ret = __get_insn_slot(&kprobe_insn_slots); |
| 208 | mutex_unlock(&kprobe_insn_mutex); | 207 | mutex_unlock(&kprobe_insn_mutex); |
| 208 | |||
| 209 | return ret; | 209 | return ret; |
| 210 | } | 210 | } |
| 211 | 211 | ||
| @@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
| 221 | * so as not to have to set it up again the | 221 | * so as not to have to set it up again the |
| 222 | * next time somebody inserts a probe. | 222 | * next time somebody inserts a probe. |
| 223 | */ | 223 | */ |
| 224 | if (!list_is_singular(&kprobe_insn_pages)) { | 224 | if (!list_is_singular(&kip->list)) { |
| 225 | list_del(&kip->list); | 225 | list_del(&kip->list); |
| 226 | module_free(NULL, kip->insns); | 226 | module_free(NULL, kip->insns); |
| 227 | kfree(kip); | 227 | kfree(kip); |
| @@ -231,52 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
| 231 | return 0; | 231 | return 0; |
| 232 | } | 232 | } |
| 233 | 233 | ||
| 234 | static int __kprobes collect_garbage_slots(void) | 234 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) |
| 235 | { | 235 | { |
| 236 | struct kprobe_insn_page *kip, *next; | 236 | struct kprobe_insn_page *kip, *next; |
| 237 | 237 | ||
| 238 | /* Ensure no-one is preepmted on the garbages */ | 238 | /* Ensure no-one is interrupted on the garbages */ |
| 239 | if (check_safety()) | 239 | synchronize_sched(); |
| 240 | return -EAGAIN; | ||
| 241 | 240 | ||
| 242 | list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { | 241 | list_for_each_entry_safe(kip, next, &c->pages, list) { |
| 243 | int i; | 242 | int i; |
| 244 | if (kip->ngarbage == 0) | 243 | if (kip->ngarbage == 0) |
| 245 | continue; | 244 | continue; |
| 246 | kip->ngarbage = 0; /* we will collect all garbages */ | 245 | kip->ngarbage = 0; /* we will collect all garbages */ |
| 247 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 246 | for (i = 0; i < slots_per_page(c); i++) { |
| 248 | if (kip->slot_used[i] == SLOT_DIRTY && | 247 | if (kip->slot_used[i] == SLOT_DIRTY && |
| 249 | collect_one_slot(kip, i)) | 248 | collect_one_slot(kip, i)) |
| 250 | break; | 249 | break; |
| 251 | } | 250 | } |
| 252 | } | 251 | } |
| 253 | kprobe_garbage_slots = 0; | 252 | c->nr_garbage = 0; |
| 254 | return 0; | 253 | return 0; |
| 255 | } | 254 | } |
| 256 | 255 | ||
| 257 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | 256 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
| 257 | kprobe_opcode_t *slot, int dirty) | ||
| 258 | { | 258 | { |
| 259 | struct kprobe_insn_page *kip; | 259 | struct kprobe_insn_page *kip; |
| 260 | 260 | ||
| 261 | mutex_lock(&kprobe_insn_mutex); | 261 | list_for_each_entry(kip, &c->pages, list) { |
| 262 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 262 | long idx = ((long)slot - (long)kip->insns) / c->insn_size; |
| 263 | if (kip->insns <= slot && | 263 | if (idx >= 0 && idx < slots_per_page(c)) { |
| 264 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 264 | WARN_ON(kip->slot_used[idx] != SLOT_USED); |
| 265 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
| 266 | if (dirty) { | 265 | if (dirty) { |
| 267 | kip->slot_used[i] = SLOT_DIRTY; | 266 | kip->slot_used[idx] = SLOT_DIRTY; |
| 268 | kip->ngarbage++; | 267 | kip->ngarbage++; |
| 268 | if (++c->nr_garbage > slots_per_page(c)) | ||
| 269 | collect_garbage_slots(c); | ||
| 269 | } else | 270 | } else |
| 270 | collect_one_slot(kip, i); | 271 | collect_one_slot(kip, idx); |
| 271 | break; | 272 | return; |
| 272 | } | 273 | } |
| 273 | } | 274 | } |
| 275 | /* Could not free this slot. */ | ||
| 276 | WARN_ON(1); | ||
| 277 | } | ||
| 274 | 278 | ||
| 275 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | 279 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) |
| 276 | collect_garbage_slots(); | 280 | { |
| 277 | 281 | mutex_lock(&kprobe_insn_mutex); | |
| 282 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
| 278 | mutex_unlock(&kprobe_insn_mutex); | 283 | mutex_unlock(&kprobe_insn_mutex); |
| 279 | } | 284 | } |
| 285 | #ifdef CONFIG_OPTPROBES | ||
| 286 | /* For optimized_kprobe buffer */ | ||
| 287 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | ||
| 288 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | ||
| 289 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | ||
| 290 | /* .insn_size is initialized later */ | ||
| 291 | .nr_garbage = 0, | ||
| 292 | }; | ||
| 293 | /* Get a slot for optimized_kprobe buffer */ | ||
| 294 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
| 295 | { | ||
| 296 | kprobe_opcode_t *ret = NULL; | ||
| 297 | |||
| 298 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 299 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
| 300 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 301 | |||
| 302 | return ret; | ||
| 303 | } | ||
| 304 | |||
| 305 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
| 306 | { | ||
| 307 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 308 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
| 309 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 310 | } | ||
| 311 | #endif | ||
| 280 | #endif | 312 | #endif |
| 281 | 313 | ||
| 282 | /* We have preemption disabled.. so it is safe to use __ versions */ | 314 | /* We have preemption disabled.. so it is safe to use __ versions */ |
| @@ -307,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) | |||
| 307 | if (p->addr == addr) | 339 | if (p->addr == addr) |
| 308 | return p; | 340 | return p; |
| 309 | } | 341 | } |
| 342 | |||
| 310 | return NULL; | 343 | return NULL; |
| 311 | } | 344 | } |
| 312 | 345 | ||
| 346 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); | ||
| 347 | |||
| 348 | /* Return true if the kprobe is an aggregator */ | ||
| 349 | static inline int kprobe_aggrprobe(struct kprobe *p) | ||
| 350 | { | ||
| 351 | return p->pre_handler == aggr_pre_handler; | ||
| 352 | } | ||
| 353 | |||
| 354 | /* | ||
| 355 | * Keep all fields in the kprobe consistent | ||
| 356 | */ | ||
| 357 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 358 | { | ||
| 359 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
| 360 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 361 | } | ||
| 362 | |||
| 363 | #ifdef CONFIG_OPTPROBES | ||
| 364 | /* NOTE: change this value only with kprobe_mutex held */ | ||
| 365 | static bool kprobes_allow_optimization; | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Call all pre_handler on the list, but ignores its return value. | ||
| 369 | * This must be called from arch-dep optimized caller. | ||
| 370 | */ | ||
| 371 | void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
| 372 | { | ||
| 373 | struct kprobe *kp; | ||
| 374 | |||
| 375 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
| 376 | if (kp->pre_handler && likely(!kprobe_disabled(kp))) { | ||
| 377 | set_kprobe_instance(kp); | ||
| 378 | kp->pre_handler(kp, regs); | ||
| 379 | } | ||
| 380 | reset_kprobe_instance(); | ||
| 381 | } | ||
| 382 | } | ||
| 383 | |||
| 384 | /* Return true(!0) if the kprobe is ready for optimization. */ | ||
| 385 | static inline int kprobe_optready(struct kprobe *p) | ||
| 386 | { | ||
| 387 | struct optimized_kprobe *op; | ||
| 388 | |||
| 389 | if (kprobe_aggrprobe(p)) { | ||
| 390 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 391 | return arch_prepared_optinsn(&op->optinsn); | ||
| 392 | } | ||
| 393 | |||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | |||
| 397 | /* | ||
| 398 | * Return an optimized kprobe whose optimizing code replaces | ||
| 399 | * instructions including addr (exclude breakpoint). | ||
| 400 | */ | ||
| 401 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | ||
| 402 | { | ||
| 403 | int i; | ||
| 404 | struct kprobe *p = NULL; | ||
| 405 | struct optimized_kprobe *op; | ||
| 406 | |||
| 407 | /* Don't check i == 0, since that is a breakpoint case. */ | ||
| 408 | for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) | ||
| 409 | p = get_kprobe((void *)(addr - i)); | ||
| 410 | |||
| 411 | if (p && kprobe_optready(p)) { | ||
| 412 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 413 | if (arch_within_optimized_kprobe(op, addr)) | ||
| 414 | return p; | ||
| 415 | } | ||
| 416 | |||
| 417 | return NULL; | ||
| 418 | } | ||
| 419 | |||
| 420 | /* Optimization staging list, protected by kprobe_mutex */ | ||
| 421 | static LIST_HEAD(optimizing_list); | ||
| 422 | |||
| 423 | static void kprobe_optimizer(struct work_struct *work); | ||
| 424 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | ||
| 425 | #define OPTIMIZE_DELAY 5 | ||
| 426 | |||
| 427 | /* Kprobe jump optimizer */ | ||
| 428 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
| 429 | { | ||
| 430 | struct optimized_kprobe *op, *tmp; | ||
| 431 | |||
| 432 | /* Lock modules while optimizing kprobes */ | ||
| 433 | mutex_lock(&module_mutex); | ||
| 434 | mutex_lock(&kprobe_mutex); | ||
| 435 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
| 436 | goto end; | ||
| 437 | |||
| 438 | /* | ||
| 439 | * Wait for quiesence period to ensure all running interrupts | ||
| 440 | * are done. Because optprobe may modify multiple instructions | ||
| 441 | * there is a chance that Nth instruction is interrupted. In that | ||
| 442 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
| 443 | * instruction. This wait is for avoiding it. | ||
| 444 | */ | ||
| 445 | synchronize_sched(); | ||
| 446 | |||
| 447 | /* | ||
| 448 | * The optimization/unoptimization refers online_cpus via | ||
| 449 | * stop_machine() and cpu-hotplug modifies online_cpus. | ||
| 450 | * And same time, text_mutex will be held in cpu-hotplug and here. | ||
| 451 | * This combination can cause a deadlock (cpu-hotplug try to lock | ||
| 452 | * text_mutex but stop_machine can not be done because online_cpus | ||
| 453 | * has been changed) | ||
| 454 | * To avoid this deadlock, we need to call get_online_cpus() | ||
| 455 | * for preventing cpu-hotplug outside of text_mutex locking. | ||
| 456 | */ | ||
| 457 | get_online_cpus(); | ||
| 458 | mutex_lock(&text_mutex); | ||
| 459 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | ||
| 460 | WARN_ON(kprobe_disabled(&op->kp)); | ||
| 461 | if (arch_optimize_kprobe(op) < 0) | ||
| 462 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 463 | list_del_init(&op->list); | ||
| 464 | } | ||
| 465 | mutex_unlock(&text_mutex); | ||
| 466 | put_online_cpus(); | ||
| 467 | end: | ||
| 468 | mutex_unlock(&kprobe_mutex); | ||
| 469 | mutex_unlock(&module_mutex); | ||
| 470 | } | ||
| 471 | |||
| 472 | /* Optimize kprobe if p is ready to be optimized */ | ||
| 473 | static __kprobes void optimize_kprobe(struct kprobe *p) | ||
| 474 | { | ||
| 475 | struct optimized_kprobe *op; | ||
| 476 | |||
| 477 | /* Check if the kprobe is disabled or not ready for optimization. */ | ||
| 478 | if (!kprobe_optready(p) || !kprobes_allow_optimization || | ||
| 479 | (kprobe_disabled(p) || kprobes_all_disarmed)) | ||
| 480 | return; | ||
| 481 | |||
| 482 | /* Both of break_handler and post_handler are not supported. */ | ||
| 483 | if (p->break_handler || p->post_handler) | ||
| 484 | return; | ||
| 485 | |||
| 486 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 487 | |||
| 488 | /* Check there is no other kprobes at the optimized instructions */ | ||
| 489 | if (arch_check_optimized_kprobe(op) < 0) | ||
| 490 | return; | ||
| 491 | |||
| 492 | /* Check if it is already optimized. */ | ||
| 493 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | ||
| 494 | return; | ||
| 495 | |||
| 496 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | ||
| 497 | list_add(&op->list, &optimizing_list); | ||
| 498 | if (!delayed_work_pending(&optimizing_work)) | ||
| 499 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
| 500 | } | ||
| 501 | |||
| 502 | /* Unoptimize a kprobe if p is optimized */ | ||
| 503 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | ||
| 504 | { | ||
| 505 | struct optimized_kprobe *op; | ||
| 506 | |||
| 507 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | ||
| 508 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 509 | if (!list_empty(&op->list)) | ||
| 510 | /* Dequeue from the optimization queue */ | ||
| 511 | list_del_init(&op->list); | ||
| 512 | else | ||
| 513 | /* Replace jump with break */ | ||
| 514 | arch_unoptimize_kprobe(op); | ||
| 515 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 516 | } | ||
| 517 | } | ||
| 518 | |||
| 519 | /* Remove optimized instructions */ | ||
| 520 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | ||
| 521 | { | ||
| 522 | struct optimized_kprobe *op; | ||
| 523 | |||
| 524 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 525 | if (!list_empty(&op->list)) { | ||
| 526 | /* Dequeue from the optimization queue */ | ||
| 527 | list_del_init(&op->list); | ||
| 528 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 529 | } | ||
| 530 | /* Don't unoptimize, because the target code will be freed. */ | ||
| 531 | arch_remove_optimized_kprobe(op); | ||
| 532 | } | ||
| 533 | |||
| 534 | /* Try to prepare optimized instructions */ | ||
| 535 | static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | ||
| 536 | { | ||
| 537 | struct optimized_kprobe *op; | ||
| 538 | |||
| 539 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 540 | arch_prepare_optimized_kprobe(op); | ||
| 541 | } | ||
| 542 | |||
| 543 | /* Free optimized instructions and optimized_kprobe */ | ||
| 544 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 545 | { | ||
| 546 | struct optimized_kprobe *op; | ||
| 547 | |||
| 548 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 549 | arch_remove_optimized_kprobe(op); | ||
| 550 | kfree(op); | ||
| 551 | } | ||
| 552 | |||
| 553 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | ||
| 554 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
| 555 | { | ||
| 556 | struct optimized_kprobe *op; | ||
| 557 | |||
| 558 | op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); | ||
| 559 | if (!op) | ||
| 560 | return NULL; | ||
| 561 | |||
| 562 | INIT_LIST_HEAD(&op->list); | ||
| 563 | op->kp.addr = p->addr; | ||
| 564 | arch_prepare_optimized_kprobe(op); | ||
| 565 | |||
| 566 | return &op->kp; | ||
| 567 | } | ||
| 568 | |||
| 569 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); | ||
| 570 | |||
| 571 | /* | ||
| 572 | * Prepare an optimized_kprobe and optimize it | ||
| 573 | * NOTE: p must be a normal registered kprobe | ||
| 574 | */ | ||
| 575 | static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | ||
| 576 | { | ||
| 577 | struct kprobe *ap; | ||
| 578 | struct optimized_kprobe *op; | ||
| 579 | |||
| 580 | ap = alloc_aggr_kprobe(p); | ||
| 581 | if (!ap) | ||
| 582 | return; | ||
| 583 | |||
| 584 | op = container_of(ap, struct optimized_kprobe, kp); | ||
| 585 | if (!arch_prepared_optinsn(&op->optinsn)) { | ||
| 586 | /* If failed to setup optimizing, fallback to kprobe */ | ||
| 587 | free_aggr_kprobe(ap); | ||
| 588 | return; | ||
| 589 | } | ||
| 590 | |||
| 591 | init_aggr_kprobe(ap, p); | ||
| 592 | optimize_kprobe(ap); | ||
| 593 | } | ||
| 594 | |||
| 595 | #ifdef CONFIG_SYSCTL | ||
| 596 | static void __kprobes optimize_all_kprobes(void) | ||
| 597 | { | ||
| 598 | struct hlist_head *head; | ||
| 599 | struct hlist_node *node; | ||
| 600 | struct kprobe *p; | ||
| 601 | unsigned int i; | ||
| 602 | |||
| 603 | /* If optimization is already allowed, just return */ | ||
| 604 | if (kprobes_allow_optimization) | ||
| 605 | return; | ||
| 606 | |||
| 607 | kprobes_allow_optimization = true; | ||
| 608 | mutex_lock(&text_mutex); | ||
| 609 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
| 610 | head = &kprobe_table[i]; | ||
| 611 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
| 612 | if (!kprobe_disabled(p)) | ||
| 613 | optimize_kprobe(p); | ||
| 614 | } | ||
| 615 | mutex_unlock(&text_mutex); | ||
| 616 | printk(KERN_INFO "Kprobes globally optimized\n"); | ||
| 617 | } | ||
| 618 | |||
| 619 | static void __kprobes unoptimize_all_kprobes(void) | ||
| 620 | { | ||
| 621 | struct hlist_head *head; | ||
| 622 | struct hlist_node *node; | ||
| 623 | struct kprobe *p; | ||
| 624 | unsigned int i; | ||
| 625 | |||
| 626 | /* If optimization is already prohibited, just return */ | ||
| 627 | if (!kprobes_allow_optimization) | ||
| 628 | return; | ||
| 629 | |||
| 630 | kprobes_allow_optimization = false; | ||
| 631 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
| 632 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
| 633 | mutex_lock(&text_mutex); | ||
| 634 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
| 635 | head = &kprobe_table[i]; | ||
| 636 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
| 637 | if (!kprobe_disabled(p)) | ||
| 638 | unoptimize_kprobe(p); | ||
| 639 | } | ||
| 640 | } | ||
| 641 | |||
| 642 | mutex_unlock(&text_mutex); | ||
| 643 | put_online_cpus(); | ||
| 644 | /* Allow all currently running kprobes to complete */ | ||
| 645 | synchronize_sched(); | ||
| 646 | } | ||
| 647 | |||
| 648 | int sysctl_kprobes_optimization; | ||
| 649 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | ||
| 650 | void __user *buffer, size_t *length, | ||
| 651 | loff_t *ppos) | ||
| 652 | { | ||
| 653 | int ret; | ||
| 654 | |||
| 655 | mutex_lock(&kprobe_mutex); | ||
| 656 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; | ||
| 657 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
| 658 | |||
| 659 | if (sysctl_kprobes_optimization) | ||
| 660 | optimize_all_kprobes(); | ||
| 661 | else | ||
| 662 | unoptimize_all_kprobes(); | ||
| 663 | mutex_unlock(&kprobe_mutex); | ||
| 664 | |||
| 665 | return ret; | ||
| 666 | } | ||
| 667 | #endif /* CONFIG_SYSCTL */ | ||
| 668 | |||
| 669 | static void __kprobes __arm_kprobe(struct kprobe *p) | ||
| 670 | { | ||
| 671 | struct kprobe *old_p; | ||
| 672 | |||
| 673 | /* Check collision with other optimized kprobes */ | ||
| 674 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
| 675 | if (unlikely(old_p)) | ||
| 676 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | ||
| 677 | |||
| 678 | arch_arm_kprobe(p); | ||
| 679 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | ||
| 680 | } | ||
| 681 | |||
| 682 | static void __kprobes __disarm_kprobe(struct kprobe *p) | ||
| 683 | { | ||
| 684 | struct kprobe *old_p; | ||
| 685 | |||
| 686 | unoptimize_kprobe(p); /* Try to unoptimize */ | ||
| 687 | arch_disarm_kprobe(p); | ||
| 688 | |||
| 689 | /* If another kprobe was blocked, optimize it. */ | ||
| 690 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
| 691 | if (unlikely(old_p)) | ||
| 692 | optimize_kprobe(old_p); | ||
| 693 | } | ||
| 694 | |||
| 695 | #else /* !CONFIG_OPTPROBES */ | ||
| 696 | |||
| 697 | #define optimize_kprobe(p) do {} while (0) | ||
| 698 | #define unoptimize_kprobe(p) do {} while (0) | ||
| 699 | #define kill_optimized_kprobe(p) do {} while (0) | ||
| 700 | #define prepare_optimized_kprobe(p) do {} while (0) | ||
| 701 | #define try_to_optimize_kprobe(p) do {} while (0) | ||
| 702 | #define __arm_kprobe(p) arch_arm_kprobe(p) | ||
| 703 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | ||
| 704 | |||
| 705 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 706 | { | ||
| 707 | kfree(p); | ||
| 708 | } | ||
| 709 | |||
| 710 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
| 711 | { | ||
| 712 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); | ||
| 713 | } | ||
| 714 | #endif /* CONFIG_OPTPROBES */ | ||
| 715 | |||
| 313 | /* Arm a kprobe with text_mutex */ | 716 | /* Arm a kprobe with text_mutex */ |
| 314 | static void __kprobes arm_kprobe(struct kprobe *kp) | 717 | static void __kprobes arm_kprobe(struct kprobe *kp) |
| 315 | { | 718 | { |
| 719 | /* | ||
| 720 | * Here, since __arm_kprobe() doesn't use stop_machine(), | ||
| 721 | * this doesn't cause deadlock on text_mutex. So, we don't | ||
| 722 | * need get_online_cpus(). | ||
| 723 | */ | ||
| 316 | mutex_lock(&text_mutex); | 724 | mutex_lock(&text_mutex); |
| 317 | arch_arm_kprobe(kp); | 725 | __arm_kprobe(kp); |
| 318 | mutex_unlock(&text_mutex); | 726 | mutex_unlock(&text_mutex); |
| 319 | } | 727 | } |
| 320 | 728 | ||
| 321 | /* Disarm a kprobe with text_mutex */ | 729 | /* Disarm a kprobe with text_mutex */ |
| 322 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 730 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
| 323 | { | 731 | { |
| 732 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
| 324 | mutex_lock(&text_mutex); | 733 | mutex_lock(&text_mutex); |
| 325 | arch_disarm_kprobe(kp); | 734 | __disarm_kprobe(kp); |
| 326 | mutex_unlock(&text_mutex); | 735 | mutex_unlock(&text_mutex); |
| 736 | put_online_cpus(); | ||
| 327 | } | 737 | } |
| 328 | 738 | ||
| 329 | /* | 739 | /* |
| @@ -392,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
| 392 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 802 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) |
| 393 | { | 803 | { |
| 394 | struct kprobe *kp; | 804 | struct kprobe *kp; |
| 395 | if (p->pre_handler != aggr_pre_handler) { | 805 | if (!kprobe_aggrprobe(p)) { |
| 396 | p->nmissed++; | 806 | p->nmissed++; |
| 397 | } else { | 807 | } else { |
| 398 | list_for_each_entry_rcu(kp, &p->list, list) | 808 | list_for_each_entry_rcu(kp, &p->list, list) |
| @@ -516,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
| 516 | } | 926 | } |
| 517 | 927 | ||
| 518 | /* | 928 | /* |
| 519 | * Keep all fields in the kprobe consistent | ||
| 520 | */ | ||
| 521 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 522 | { | ||
| 523 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
| 524 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 525 | } | ||
| 526 | |||
| 527 | /* | ||
| 528 | * Add the new probe to ap->list. Fail if this is the | 929 | * Add the new probe to ap->list. Fail if this is the |
| 529 | * second jprobe at the address - two jprobes can't coexist | 930 | * second jprobe at the address - two jprobes can't coexist |
| 530 | */ | 931 | */ |
| 531 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | 932 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) |
| 532 | { | 933 | { |
| 533 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 934 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
| 935 | |||
| 936 | if (p->break_handler || p->post_handler) | ||
| 937 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | ||
| 938 | |||
| 534 | if (p->break_handler) { | 939 | if (p->break_handler) { |
| 535 | if (ap->break_handler) | 940 | if (ap->break_handler) |
| 536 | return -EEXIST; | 941 | return -EEXIST; |
| @@ -545,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 545 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 950 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
| 546 | if (!kprobes_all_disarmed) | 951 | if (!kprobes_all_disarmed) |
| 547 | /* Arm the breakpoint again. */ | 952 | /* Arm the breakpoint again. */ |
| 548 | arm_kprobe(ap); | 953 | __arm_kprobe(ap); |
| 549 | } | 954 | } |
| 550 | return 0; | 955 | return 0; |
| 551 | } | 956 | } |
| @@ -554,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 554 | * Fill in the required fields of the "manager kprobe". Replace the | 959 | * Fill in the required fields of the "manager kprobe". Replace the |
| 555 | * earlier kprobe in the hlist with the manager kprobe | 960 | * earlier kprobe in the hlist with the manager kprobe |
| 556 | */ | 961 | */ |
| 557 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 962 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
| 558 | { | 963 | { |
| 964 | /* Copy p's insn slot to ap */ | ||
| 559 | copy_kprobe(p, ap); | 965 | copy_kprobe(p, ap); |
| 560 | flush_insn_slot(ap); | 966 | flush_insn_slot(ap); |
| 561 | ap->addr = p->addr; | 967 | ap->addr = p->addr; |
| 562 | ap->flags = p->flags; | 968 | ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; |
| 563 | ap->pre_handler = aggr_pre_handler; | 969 | ap->pre_handler = aggr_pre_handler; |
| 564 | ap->fault_handler = aggr_fault_handler; | 970 | ap->fault_handler = aggr_fault_handler; |
| 565 | /* We don't care the kprobe which has gone. */ | 971 | /* We don't care the kprobe which has gone. */ |
| @@ -569,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 569 | ap->break_handler = aggr_break_handler; | 975 | ap->break_handler = aggr_break_handler; |
| 570 | 976 | ||
| 571 | INIT_LIST_HEAD(&ap->list); | 977 | INIT_LIST_HEAD(&ap->list); |
| 572 | list_add_rcu(&p->list, &ap->list); | 978 | INIT_HLIST_NODE(&ap->hlist); |
| 573 | 979 | ||
| 980 | list_add_rcu(&p->list, &ap->list); | ||
| 574 | hlist_replace_rcu(&p->hlist, &ap->hlist); | 981 | hlist_replace_rcu(&p->hlist, &ap->hlist); |
| 575 | } | 982 | } |
| 576 | 983 | ||
| @@ -584,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 584 | int ret = 0; | 991 | int ret = 0; |
| 585 | struct kprobe *ap = old_p; | 992 | struct kprobe *ap = old_p; |
| 586 | 993 | ||
| 587 | if (old_p->pre_handler != aggr_pre_handler) { | 994 | if (!kprobe_aggrprobe(old_p)) { |
| 588 | /* If old_p is not an aggr_probe, create new aggr_kprobe. */ | 995 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ |
| 589 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 996 | ap = alloc_aggr_kprobe(old_p); |
| 590 | if (!ap) | 997 | if (!ap) |
| 591 | return -ENOMEM; | 998 | return -ENOMEM; |
| 592 | add_aggr_kprobe(ap, old_p); | 999 | init_aggr_kprobe(ap, old_p); |
| 593 | } | 1000 | } |
| 594 | 1001 | ||
| 595 | if (kprobe_gone(ap)) { | 1002 | if (kprobe_gone(ap)) { |
| @@ -608,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 608 | */ | 1015 | */ |
| 609 | return ret; | 1016 | return ret; |
| 610 | 1017 | ||
| 1018 | /* Prepare optimized instructions if possible. */ | ||
| 1019 | prepare_optimized_kprobe(ap); | ||
| 1020 | |||
| 611 | /* | 1021 | /* |
| 612 | * Clear gone flag to prevent allocating new slot again, and | 1022 | * Clear gone flag to prevent allocating new slot again, and |
| 613 | * set disabled flag because it is not armed yet. | 1023 | * set disabled flag because it is not armed yet. |
| @@ -616,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 616 | | KPROBE_FLAG_DISABLED; | 1026 | | KPROBE_FLAG_DISABLED; |
| 617 | } | 1027 | } |
| 618 | 1028 | ||
| 1029 | /* Copy ap's insn slot to p */ | ||
| 619 | copy_kprobe(ap, p); | 1030 | copy_kprobe(ap, p); |
| 620 | return add_new_kprobe(ap, p); | 1031 | return add_new_kprobe(ap, p); |
| 621 | } | 1032 | } |
| @@ -728,7 +1139,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 728 | 1139 | ||
| 729 | preempt_disable(); | 1140 | preempt_disable(); |
| 730 | if (!kernel_text_address((unsigned long) p->addr) || | 1141 | if (!kernel_text_address((unsigned long) p->addr) || |
| 731 | in_kprobes_functions((unsigned long) p->addr)) { | 1142 | in_kprobes_functions((unsigned long) p->addr) || |
| 1143 | ftrace_text_reserved(p->addr, p->addr)) { | ||
| 732 | preempt_enable(); | 1144 | preempt_enable(); |
| 733 | return -EINVAL; | 1145 | return -EINVAL; |
| 734 | } | 1146 | } |
| @@ -765,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 765 | p->nmissed = 0; | 1177 | p->nmissed = 0; |
| 766 | INIT_LIST_HEAD(&p->list); | 1178 | INIT_LIST_HEAD(&p->list); |
| 767 | mutex_lock(&kprobe_mutex); | 1179 | mutex_lock(&kprobe_mutex); |
| 1180 | |||
| 1181 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | ||
| 1182 | mutex_lock(&text_mutex); | ||
| 1183 | |||
| 768 | old_p = get_kprobe(p->addr); | 1184 | old_p = get_kprobe(p->addr); |
| 769 | if (old_p) { | 1185 | if (old_p) { |
| 1186 | /* Since this may unoptimize old_p, locking text_mutex. */ | ||
| 770 | ret = register_aggr_kprobe(old_p, p); | 1187 | ret = register_aggr_kprobe(old_p, p); |
| 771 | goto out; | 1188 | goto out; |
| 772 | } | 1189 | } |
| 773 | 1190 | ||
| 774 | mutex_lock(&text_mutex); | ||
| 775 | ret = arch_prepare_kprobe(p); | 1191 | ret = arch_prepare_kprobe(p); |
| 776 | if (ret) | 1192 | if (ret) |
| 777 | goto out_unlock_text; | 1193 | goto out; |
| 778 | 1194 | ||
| 779 | INIT_HLIST_NODE(&p->hlist); | 1195 | INIT_HLIST_NODE(&p->hlist); |
| 780 | hlist_add_head_rcu(&p->hlist, | 1196 | hlist_add_head_rcu(&p->hlist, |
| 781 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1197 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| 782 | 1198 | ||
| 783 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1199 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
| 784 | arch_arm_kprobe(p); | 1200 | __arm_kprobe(p); |
| 1201 | |||
| 1202 | /* Try to optimize kprobe */ | ||
| 1203 | try_to_optimize_kprobe(p); | ||
| 785 | 1204 | ||
| 786 | out_unlock_text: | ||
| 787 | mutex_unlock(&text_mutex); | ||
| 788 | out: | 1205 | out: |
| 1206 | mutex_unlock(&text_mutex); | ||
| 1207 | put_online_cpus(); | ||
| 789 | mutex_unlock(&kprobe_mutex); | 1208 | mutex_unlock(&kprobe_mutex); |
| 790 | 1209 | ||
| 791 | if (probed_mod) | 1210 | if (probed_mod) |
| @@ -807,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
| 807 | return -EINVAL; | 1226 | return -EINVAL; |
| 808 | 1227 | ||
| 809 | if (old_p == p || | 1228 | if (old_p == p || |
| 810 | (old_p->pre_handler == aggr_pre_handler && | 1229 | (kprobe_aggrprobe(old_p) && |
| 811 | list_is_singular(&old_p->list))) { | 1230 | list_is_singular(&old_p->list))) { |
| 812 | /* | 1231 | /* |
| 813 | * Only probe on the hash list. Disarm only if kprobes are | 1232 | * Only probe on the hash list. Disarm only if kprobes are |
| @@ -815,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
| 815 | * already have been removed. We save on flushing icache. | 1234 | * already have been removed. We save on flushing icache. |
| 816 | */ | 1235 | */ |
| 817 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1236 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) |
| 818 | disarm_kprobe(p); | 1237 | disarm_kprobe(old_p); |
| 819 | hlist_del_rcu(&old_p->hlist); | 1238 | hlist_del_rcu(&old_p->hlist); |
| 820 | } else { | 1239 | } else { |
| 821 | if (p->break_handler && !kprobe_gone(p)) | 1240 | if (p->break_handler && !kprobe_gone(p)) |
| @@ -831,8 +1250,13 @@ noclean: | |||
| 831 | list_del_rcu(&p->list); | 1250 | list_del_rcu(&p->list); |
| 832 | if (!kprobe_disabled(old_p)) { | 1251 | if (!kprobe_disabled(old_p)) { |
| 833 | try_to_disable_aggr_kprobe(old_p); | 1252 | try_to_disable_aggr_kprobe(old_p); |
| 834 | if (!kprobes_all_disarmed && kprobe_disabled(old_p)) | 1253 | if (!kprobes_all_disarmed) { |
| 835 | disarm_kprobe(old_p); | 1254 | if (kprobe_disabled(old_p)) |
| 1255 | disarm_kprobe(old_p); | ||
| 1256 | else | ||
| 1257 | /* Try to optimize this probe again */ | ||
| 1258 | optimize_kprobe(old_p); | ||
| 1259 | } | ||
| 836 | } | 1260 | } |
| 837 | } | 1261 | } |
| 838 | return 0; | 1262 | return 0; |
| @@ -849,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | |||
| 849 | old_p = list_entry(p->list.next, struct kprobe, list); | 1273 | old_p = list_entry(p->list.next, struct kprobe, list); |
| 850 | list_del(&p->list); | 1274 | list_del(&p->list); |
| 851 | arch_remove_kprobe(old_p); | 1275 | arch_remove_kprobe(old_p); |
| 852 | kfree(old_p); | 1276 | free_aggr_kprobe(old_p); |
| 853 | } | 1277 | } |
| 854 | } | 1278 | } |
| 855 | 1279 | ||
| @@ -1145,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
| 1145 | struct kprobe *kp; | 1569 | struct kprobe *kp; |
| 1146 | 1570 | ||
| 1147 | p->flags |= KPROBE_FLAG_GONE; | 1571 | p->flags |= KPROBE_FLAG_GONE; |
| 1148 | if (p->pre_handler == aggr_pre_handler) { | 1572 | if (kprobe_aggrprobe(p)) { |
| 1149 | /* | 1573 | /* |
| 1150 | * If this is an aggr_kprobe, we have to list all the | 1574 | * If this is an aggr_kprobe, we have to list all the |
| 1151 | * chained probes and mark them GONE. | 1575 | * chained probes and mark them GONE. |
| @@ -1154,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
| 1154 | kp->flags |= KPROBE_FLAG_GONE; | 1578 | kp->flags |= KPROBE_FLAG_GONE; |
| 1155 | p->post_handler = NULL; | 1579 | p->post_handler = NULL; |
| 1156 | p->break_handler = NULL; | 1580 | p->break_handler = NULL; |
| 1581 | kill_optimized_kprobe(p); | ||
| 1157 | } | 1582 | } |
| 1158 | /* | 1583 | /* |
| 1159 | * Here, we can remove insn_slot safely, because no thread calls | 1584 | * Here, we can remove insn_slot safely, because no thread calls |
| @@ -1263,6 +1688,15 @@ static int __init init_kprobes(void) | |||
| 1263 | } | 1688 | } |
| 1264 | } | 1689 | } |
| 1265 | 1690 | ||
| 1691 | #if defined(CONFIG_OPTPROBES) | ||
| 1692 | #if defined(__ARCH_WANT_KPROBES_INSN_SLOT) | ||
| 1693 | /* Init kprobe_optinsn_slots */ | ||
| 1694 | kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; | ||
| 1695 | #endif | ||
| 1696 | /* By default, kprobes can be optimized */ | ||
| 1697 | kprobes_allow_optimization = true; | ||
| 1698 | #endif | ||
| 1699 | |||
| 1266 | /* By default, kprobes are armed */ | 1700 | /* By default, kprobes are armed */ |
| 1267 | kprobes_all_disarmed = false; | 1701 | kprobes_all_disarmed = false; |
| 1268 | 1702 | ||
| @@ -1281,7 +1715,7 @@ static int __init init_kprobes(void) | |||
| 1281 | 1715 | ||
| 1282 | #ifdef CONFIG_DEBUG_FS | 1716 | #ifdef CONFIG_DEBUG_FS |
| 1283 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 1717 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, |
| 1284 | const char *sym, int offset,char *modname) | 1718 | const char *sym, int offset, char *modname, struct kprobe *pp) |
| 1285 | { | 1719 | { |
| 1286 | char *kprobe_type; | 1720 | char *kprobe_type; |
| 1287 | 1721 | ||
| @@ -1291,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
| 1291 | kprobe_type = "j"; | 1725 | kprobe_type = "j"; |
| 1292 | else | 1726 | else |
| 1293 | kprobe_type = "k"; | 1727 | kprobe_type = "k"; |
| 1728 | |||
| 1294 | if (sym) | 1729 | if (sym) |
| 1295 | seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", | 1730 | seq_printf(pi, "%p %s %s+0x%x %s ", |
| 1296 | p->addr, kprobe_type, sym, offset, | 1731 | p->addr, kprobe_type, sym, offset, |
| 1297 | (modname ? modname : " "), | 1732 | (modname ? modname : " ")); |
| 1298 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
| 1299 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | ||
| 1300 | "[DISABLED]" : "")); | ||
| 1301 | else | 1733 | else |
| 1302 | seq_printf(pi, "%p %s %p %s%s\n", | 1734 | seq_printf(pi, "%p %s %p ", |
| 1303 | p->addr, kprobe_type, p->addr, | 1735 | p->addr, kprobe_type, p->addr); |
| 1304 | (kprobe_gone(p) ? "[GONE]" : ""), | 1736 | |
| 1305 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | 1737 | if (!pp) |
| 1306 | "[DISABLED]" : "")); | 1738 | pp = p; |
| 1739 | seq_printf(pi, "%s%s%s\n", | ||
| 1740 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
| 1741 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | ||
| 1742 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | ||
| 1307 | } | 1743 | } |
| 1308 | 1744 | ||
| 1309 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 1745 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
| @@ -1339,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
| 1339 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1775 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 1340 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 1776 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
| 1341 | &offset, &modname, namebuf); | 1777 | &offset, &modname, namebuf); |
| 1342 | if (p->pre_handler == aggr_pre_handler) { | 1778 | if (kprobe_aggrprobe(p)) { |
| 1343 | list_for_each_entry_rcu(kp, &p->list, list) | 1779 | list_for_each_entry_rcu(kp, &p->list, list) |
| 1344 | report_probe(pi, kp, sym, offset, modname); | 1780 | report_probe(pi, kp, sym, offset, modname, p); |
| 1345 | } else | 1781 | } else |
| 1346 | report_probe(pi, p, sym, offset, modname); | 1782 | report_probe(pi, p, sym, offset, modname, NULL); |
| 1347 | } | 1783 | } |
| 1348 | preempt_enable(); | 1784 | preempt_enable(); |
| 1349 | return 0; | 1785 | return 0; |
| @@ -1421,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) | |||
| 1421 | goto out; | 1857 | goto out; |
| 1422 | } | 1858 | } |
| 1423 | 1859 | ||
| 1424 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
| 1425 | arm_kprobe(p); | ||
| 1426 | |||
| 1427 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1428 | if (p != kp) | 1860 | if (p != kp) |
| 1429 | kp->flags &= ~KPROBE_FLAG_DISABLED; | 1861 | kp->flags &= ~KPROBE_FLAG_DISABLED; |
| 1862 | |||
| 1863 | if (!kprobes_all_disarmed && kprobe_disabled(p)) { | ||
| 1864 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1865 | arm_kprobe(p); | ||
| 1866 | } | ||
| 1430 | out: | 1867 | out: |
| 1431 | mutex_unlock(&kprobe_mutex); | 1868 | mutex_unlock(&kprobe_mutex); |
| 1432 | return ret; | 1869 | return ret; |
| @@ -1446,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void) | |||
| 1446 | if (!kprobes_all_disarmed) | 1883 | if (!kprobes_all_disarmed) |
| 1447 | goto already_enabled; | 1884 | goto already_enabled; |
| 1448 | 1885 | ||
| 1886 | /* Arming kprobes doesn't optimize kprobe itself */ | ||
| 1449 | mutex_lock(&text_mutex); | 1887 | mutex_lock(&text_mutex); |
| 1450 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1888 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 1451 | head = &kprobe_table[i]; | 1889 | head = &kprobe_table[i]; |
| 1452 | hlist_for_each_entry_rcu(p, node, head, hlist) | 1890 | hlist_for_each_entry_rcu(p, node, head, hlist) |
| 1453 | if (!kprobe_disabled(p)) | 1891 | if (!kprobe_disabled(p)) |
| 1454 | arch_arm_kprobe(p); | 1892 | __arm_kprobe(p); |
| 1455 | } | 1893 | } |
| 1456 | mutex_unlock(&text_mutex); | 1894 | mutex_unlock(&text_mutex); |
| 1457 | 1895 | ||
| @@ -1478,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void) | |||
| 1478 | 1916 | ||
| 1479 | kprobes_all_disarmed = true; | 1917 | kprobes_all_disarmed = true; |
| 1480 | printk(KERN_INFO "Kprobes globally disabled\n"); | 1918 | printk(KERN_INFO "Kprobes globally disabled\n"); |
| 1919 | |||
| 1920 | /* | ||
| 1921 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
| 1922 | * because disarming may also unoptimize kprobes. | ||
| 1923 | */ | ||
| 1924 | get_online_cpus(); | ||
| 1481 | mutex_lock(&text_mutex); | 1925 | mutex_lock(&text_mutex); |
| 1482 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1926 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 1483 | head = &kprobe_table[i]; | 1927 | head = &kprobe_table[i]; |
| 1484 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1928 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 1485 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 1929 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
| 1486 | arch_disarm_kprobe(p); | 1930 | __disarm_kprobe(p); |
| 1487 | } | 1931 | } |
| 1488 | } | 1932 | } |
| 1489 | 1933 | ||
| 1490 | mutex_unlock(&text_mutex); | 1934 | mutex_unlock(&text_mutex); |
| 1935 | put_online_cpus(); | ||
| 1491 | mutex_unlock(&kprobe_mutex); | 1936 | mutex_unlock(&kprobe_mutex); |
| 1492 | /* Allow all currently running kprobes to complete */ | 1937 | /* Allow all currently running kprobes to complete */ |
| 1493 | synchronize_sched(); | 1938 | synchronize_sched(); |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..6b1ccc3f0205 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -197,16 +197,8 @@ static int __init ksysfs_init(void) | |||
| 197 | goto group_exit; | 197 | goto group_exit; |
| 198 | } | 198 | } |
| 199 | 199 | ||
| 200 | /* create the /sys/kernel/uids/ directory */ | ||
| 201 | error = uids_sysfs_init(); | ||
| 202 | if (error) | ||
| 203 | goto notes_exit; | ||
| 204 | |||
| 205 | return 0; | 200 | return 0; |
| 206 | 201 | ||
| 207 | notes_exit: | ||
| 208 | if (notes_size > 0) | ||
| 209 | sysfs_remove_bin_file(kernel_kobj, &notes_attr); | ||
| 210 | group_exit: | 202 | group_exit: |
| 211 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | 203 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); |
| 212 | kset_exit: | 204 | kset_exit: |
diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e0..82ed0ea15194 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 101 | * | 101 | * |
| 102 | * Description: This helper function creates and names a kernel | 102 | * Description: This helper function creates and names a kernel |
| 103 | * thread. The thread will be stopped: use wake_up_process() to start | 103 | * thread. The thread will be stopped: use wake_up_process() to start |
| 104 | * it. See also kthread_run(), kthread_create_on_cpu(). | 104 | * it. See also kthread_run(). |
| 105 | * | 105 | * |
| 106 | * When woken, the thread will run @threadfn() with @data as its | 106 | * When woken, the thread will run @threadfn() with @data as its |
| 107 | * argument. @threadfn() can either call do_exit() directly if it is a | 107 | * argument. @threadfn() can either call do_exit() directly if it is a |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c62ec14609b9..0c30d0455de1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void) | |||
| 3809 | lockdep_print_held_locks(curr); | 3809 | lockdep_print_held_locks(curr); |
| 3810 | } | 3810 | } |
| 3811 | } | 3811 | } |
| 3812 | |||
| 3813 | void lockdep_rcu_dereference(const char *file, const int line) | ||
| 3814 | { | ||
| 3815 | struct task_struct *curr = current; | ||
| 3816 | |||
| 3817 | if (!debug_locks_off()) | ||
| 3818 | return; | ||
| 3819 | printk("\n===================================================\n"); | ||
| 3820 | printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); | ||
| 3821 | printk( "---------------------------------------------------\n"); | ||
| 3822 | printk("%s:%d invoked rcu_dereference_check() without protection!\n", | ||
| 3823 | file, line); | ||
| 3824 | printk("\nother info that might help us debug this:\n\n"); | ||
| 3825 | lockdep_print_held_locks(curr); | ||
| 3826 | printk("\nstack backtrace:\n"); | ||
| 3827 | dump_stack(); | ||
| 3828 | } | ||
| 3829 | EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); | ||
diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..e5538d5f00ad 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) | |||
| 474 | 474 | ||
| 475 | INIT_LIST_HEAD(&mod->modules_which_use_me); | 475 | INIT_LIST_HEAD(&mod->modules_which_use_me); |
| 476 | for_each_possible_cpu(cpu) | 476 | for_each_possible_cpu(cpu) |
| 477 | local_set(__module_ref_addr(mod, cpu), 0); | 477 | per_cpu_ptr(mod->refptr, cpu)->count = 0; |
| 478 | |||
| 478 | /* Hold reference count during initialization. */ | 479 | /* Hold reference count during initialization. */ |
| 479 | local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); | 480 | __this_cpu_write(mod->refptr->count, 1); |
| 480 | /* Backwards compatibility macros put refcount during init. */ | 481 | /* Backwards compatibility macros put refcount during init. */ |
| 481 | mod->waiter = current; | 482 | mod->waiter = current; |
| 482 | } | 483 | } |
| @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) | |||
| 619 | int cpu; | 620 | int cpu; |
| 620 | 621 | ||
| 621 | for_each_possible_cpu(cpu) | 622 | for_each_possible_cpu(cpu) |
| 622 | total += local_read(__module_ref_addr(mod, cpu)); | 623 | total += per_cpu_ptr(mod->refptr, cpu)->count; |
| 623 | return total; | 624 | return total; |
| 624 | } | 625 | } |
| 625 | EXPORT_SYMBOL(module_refcount); | 626 | EXPORT_SYMBOL(module_refcount); |
| @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { | |||
| 796 | void module_put(struct module *module) | 797 | void module_put(struct module *module) |
| 797 | { | 798 | { |
| 798 | if (module) { | 799 | if (module) { |
| 799 | unsigned int cpu = get_cpu(); | 800 | preempt_disable(); |
| 800 | local_dec(__module_ref_addr(module, cpu)); | 801 | __this_cpu_dec(module->refptr->count); |
| 802 | |||
| 801 | trace_module_put(module, _RET_IP_, | 803 | trace_module_put(module, _RET_IP_, |
| 802 | local_read(__module_ref_addr(module, cpu))); | 804 | __this_cpu_read(module->refptr->count)); |
| 803 | /* Maybe they're waiting for us to drop reference? */ | 805 | /* Maybe they're waiting for us to drop reference? */ |
| 804 | if (unlikely(!module_is_live(module))) | 806 | if (unlikely(!module_is_live(module))) |
| 805 | wake_up_process(module->waiter); | 807 | wake_up_process(module->waiter); |
| 806 | put_cpu(); | 808 | preempt_enable(); |
| 807 | } | 809 | } |
| 808 | } | 810 | } |
| 809 | EXPORT_SYMBOL(module_put); | 811 | EXPORT_SYMBOL(module_put); |
| @@ -1397,9 +1399,9 @@ static void free_module(struct module *mod) | |||
| 1397 | kfree(mod->args); | 1399 | kfree(mod->args); |
| 1398 | if (mod->percpu) | 1400 | if (mod->percpu) |
| 1399 | percpu_modfree(mod->percpu); | 1401 | percpu_modfree(mod->percpu); |
| 1400 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 1402 | #if defined(CONFIG_MODULE_UNLOAD) |
| 1401 | if (mod->refptr) | 1403 | if (mod->refptr) |
| 1402 | percpu_modfree(mod->refptr); | 1404 | free_percpu(mod->refptr); |
| 1403 | #endif | 1405 | #endif |
| 1404 | /* Free lock-classes: */ | 1406 | /* Free lock-classes: */ |
| 1405 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1407 | lockdep_free_key_range(mod->module_core, mod->core_size); |
| @@ -2162,9 +2164,8 @@ static noinline struct module *load_module(void __user *umod, | |||
| 2162 | mod = (void *)sechdrs[modindex].sh_addr; | 2164 | mod = (void *)sechdrs[modindex].sh_addr; |
| 2163 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | 2165 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); |
| 2164 | 2166 | ||
| 2165 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2167 | #if defined(CONFIG_MODULE_UNLOAD) |
| 2166 | mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), | 2168 | mod->refptr = alloc_percpu(struct module_ref); |
| 2167 | mod->name); | ||
| 2168 | if (!mod->refptr) { | 2169 | if (!mod->refptr) { |
| 2169 | err = -ENOMEM; | 2170 | err = -ENOMEM; |
| 2170 | goto free_init; | 2171 | goto free_init; |
| @@ -2396,8 +2397,8 @@ static noinline struct module *load_module(void __user *umod, | |||
| 2396 | kobject_put(&mod->mkobj.kobj); | 2397 | kobject_put(&mod->mkobj.kobj); |
| 2397 | free_unload: | 2398 | free_unload: |
| 2398 | module_unload_free(mod); | 2399 | module_unload_free(mod); |
| 2399 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2400 | #if defined(CONFIG_MODULE_UNLOAD) |
| 2400 | percpu_modfree(mod->refptr); | 2401 | free_percpu(mod->refptr); |
| 2401 | free_init: | 2402 | free_init: |
| 2402 | #endif | 2403 | #endif |
| 2403 | module_free(mod, mod->module_init); | 2404 | module_free(mod, mod->module_init); |
diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7643eb..2488ba7eb568 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
| 78 | int ret = NOTIFY_DONE; | 78 | int ret = NOTIFY_DONE; |
| 79 | struct notifier_block *nb, *next_nb; | 79 | struct notifier_block *nb, *next_nb; |
| 80 | 80 | ||
| 81 | nb = rcu_dereference(*nl); | 81 | nb = rcu_dereference_raw(*nl); |
| 82 | 82 | ||
| 83 | while (nb && nr_to_call) { | 83 | while (nb && nr_to_call) { |
| 84 | next_nb = rcu_dereference(nb->next); | 84 | next_nb = rcu_dereference_raw(nb->next); |
| 85 | 85 | ||
| 86 | #ifdef CONFIG_DEBUG_NOTIFIERS | 86 | #ifdef CONFIG_DEBUG_NOTIFIERS |
| 87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { | 87 | if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { |
| @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, | |||
| 309 | * racy then it does not matter what the result of the test | 309 | * racy then it does not matter what the result of the test |
| 310 | * is, we re-check the list after having taken the lock anyway: | 310 | * is, we re-check the list after having taken the lock anyway: |
| 311 | */ | 311 | */ |
| 312 | if (rcu_dereference(nh->head)) { | 312 | if (rcu_dereference_raw(nh->head)) { |
| 313 | down_read(&nh->rwsem); | 313 | down_read(&nh->rwsem); |
| 314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, | 314 | ret = notifier_call_chain(&nh->head, val, v, nr_to_call, |
| 315 | nr_calls); | 315 | nr_calls); |
diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..93caf65ff57c --- /dev/null +++ b/kernel/padata.c | |||
| @@ -0,0 +1,696 @@ | |||
| 1 | /* | ||
| 2 | * padata.c - generic interface to process data streams in parallel | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008, 2009 secunet Security Networks AG | ||
| 5 | * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify it | ||
| 8 | * under the terms and conditions of the GNU General Public License, | ||
| 9 | * version 2, as published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 14 | * more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License along with | ||
| 17 | * this program; if not, write to the Free Software Foundation, Inc., | ||
| 18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/module.h> | ||
| 22 | #include <linux/cpumask.h> | ||
| 23 | #include <linux/err.h> | ||
| 24 | #include <linux/cpu.h> | ||
| 25 | #include <linux/padata.h> | ||
| 26 | #include <linux/mutex.h> | ||
| 27 | #include <linux/sched.h> | ||
| 28 | #include <linux/rcupdate.h> | ||
| 29 | |||
| 30 | #define MAX_SEQ_NR INT_MAX - NR_CPUS | ||
| 31 | #define MAX_OBJ_NUM 10000 * NR_CPUS | ||
| 32 | |||
| 33 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | ||
| 34 | { | ||
| 35 | int cpu, target_cpu; | ||
| 36 | |||
| 37 | target_cpu = cpumask_first(pd->cpumask); | ||
| 38 | for (cpu = 0; cpu < cpu_index; cpu++) | ||
| 39 | target_cpu = cpumask_next(target_cpu, pd->cpumask); | ||
| 40 | |||
| 41 | return target_cpu; | ||
| 42 | } | ||
| 43 | |||
| 44 | static int padata_cpu_hash(struct padata_priv *padata) | ||
| 45 | { | ||
| 46 | int cpu_index; | ||
| 47 | struct parallel_data *pd; | ||
| 48 | |||
| 49 | pd = padata->pd; | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Hash the sequence numbers to the cpus by taking | ||
| 53 | * seq_nr mod. number of cpus in use. | ||
| 54 | */ | ||
| 55 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); | ||
| 56 | |||
| 57 | return padata_index_to_cpu(pd, cpu_index); | ||
| 58 | } | ||
| 59 | |||
| 60 | static void padata_parallel_worker(struct work_struct *work) | ||
| 61 | { | ||
| 62 | struct padata_queue *queue; | ||
| 63 | struct parallel_data *pd; | ||
| 64 | struct padata_instance *pinst; | ||
| 65 | LIST_HEAD(local_list); | ||
| 66 | |||
| 67 | local_bh_disable(); | ||
| 68 | queue = container_of(work, struct padata_queue, pwork); | ||
| 69 | pd = queue->pd; | ||
| 70 | pinst = pd->pinst; | ||
| 71 | |||
| 72 | spin_lock(&queue->parallel.lock); | ||
| 73 | list_replace_init(&queue->parallel.list, &local_list); | ||
| 74 | spin_unlock(&queue->parallel.lock); | ||
| 75 | |||
| 76 | while (!list_empty(&local_list)) { | ||
| 77 | struct padata_priv *padata; | ||
| 78 | |||
| 79 | padata = list_entry(local_list.next, | ||
| 80 | struct padata_priv, list); | ||
| 81 | |||
| 82 | list_del_init(&padata->list); | ||
| 83 | |||
| 84 | padata->parallel(padata); | ||
| 85 | } | ||
| 86 | |||
| 87 | local_bh_enable(); | ||
| 88 | } | ||
| 89 | |||
| 90 | /* | ||
| 91 | * padata_do_parallel - padata parallelization function | ||
| 92 | * | ||
| 93 | * @pinst: padata instance | ||
| 94 | * @padata: object to be parallelized | ||
| 95 | * @cb_cpu: cpu the serialization callback function will run on, | ||
| 96 | * must be in the cpumask of padata. | ||
| 97 | * | ||
| 98 | * The parallelization callback function will run with BHs off. | ||
| 99 | * Note: Every object which is parallelized by padata_do_parallel | ||
| 100 | * must be seen by padata_do_serial. | ||
| 101 | */ | ||
| 102 | int padata_do_parallel(struct padata_instance *pinst, | ||
| 103 | struct padata_priv *padata, int cb_cpu) | ||
| 104 | { | ||
| 105 | int target_cpu, err; | ||
| 106 | struct padata_queue *queue; | ||
| 107 | struct parallel_data *pd; | ||
| 108 | |||
| 109 | rcu_read_lock_bh(); | ||
| 110 | |||
| 111 | pd = rcu_dereference(pinst->pd); | ||
| 112 | |||
| 113 | err = 0; | ||
| 114 | if (!(pinst->flags & PADATA_INIT)) | ||
| 115 | goto out; | ||
| 116 | |||
| 117 | err = -EBUSY; | ||
| 118 | if ((pinst->flags & PADATA_RESET)) | ||
| 119 | goto out; | ||
| 120 | |||
| 121 | if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) | ||
| 122 | goto out; | ||
| 123 | |||
| 124 | err = -EINVAL; | ||
| 125 | if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) | ||
| 126 | goto out; | ||
| 127 | |||
| 128 | err = -EINPROGRESS; | ||
| 129 | atomic_inc(&pd->refcnt); | ||
| 130 | padata->pd = pd; | ||
| 131 | padata->cb_cpu = cb_cpu; | ||
| 132 | |||
| 133 | if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) | ||
| 134 | atomic_set(&pd->seq_nr, -1); | ||
| 135 | |||
| 136 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); | ||
| 137 | |||
| 138 | target_cpu = padata_cpu_hash(padata); | ||
| 139 | queue = per_cpu_ptr(pd->queue, target_cpu); | ||
| 140 | |||
| 141 | spin_lock(&queue->parallel.lock); | ||
| 142 | list_add_tail(&padata->list, &queue->parallel.list); | ||
| 143 | spin_unlock(&queue->parallel.lock); | ||
| 144 | |||
| 145 | queue_work_on(target_cpu, pinst->wq, &queue->pwork); | ||
| 146 | |||
| 147 | out: | ||
| 148 | rcu_read_unlock_bh(); | ||
| 149 | |||
| 150 | return err; | ||
| 151 | } | ||
| 152 | EXPORT_SYMBOL(padata_do_parallel); | ||
| 153 | |||
| 154 | static struct padata_priv *padata_get_next(struct parallel_data *pd) | ||
| 155 | { | ||
| 156 | int cpu, num_cpus, empty, calc_seq_nr; | ||
| 157 | int seq_nr, next_nr, overrun, next_overrun; | ||
| 158 | struct padata_queue *queue, *next_queue; | ||
| 159 | struct padata_priv *padata; | ||
| 160 | struct padata_list *reorder; | ||
| 161 | |||
| 162 | empty = 0; | ||
| 163 | next_nr = -1; | ||
| 164 | next_overrun = 0; | ||
| 165 | next_queue = NULL; | ||
| 166 | |||
| 167 | num_cpus = cpumask_weight(pd->cpumask); | ||
| 168 | |||
| 169 | for_each_cpu(cpu, pd->cpumask) { | ||
| 170 | queue = per_cpu_ptr(pd->queue, cpu); | ||
| 171 | reorder = &queue->reorder; | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Calculate the seq_nr of the object that should be | ||
| 175 | * next in this queue. | ||
| 176 | */ | ||
| 177 | overrun = 0; | ||
| 178 | calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) | ||
| 179 | + queue->cpu_index; | ||
| 180 | |||
| 181 | if (unlikely(calc_seq_nr > pd->max_seq_nr)) { | ||
| 182 | calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; | ||
| 183 | overrun = 1; | ||
| 184 | } | ||
| 185 | |||
| 186 | if (!list_empty(&reorder->list)) { | ||
| 187 | padata = list_entry(reorder->list.next, | ||
| 188 | struct padata_priv, list); | ||
| 189 | |||
| 190 | seq_nr = padata->seq_nr; | ||
| 191 | BUG_ON(calc_seq_nr != seq_nr); | ||
| 192 | } else { | ||
| 193 | seq_nr = calc_seq_nr; | ||
| 194 | empty++; | ||
| 195 | } | ||
| 196 | |||
| 197 | if (next_nr < 0 || seq_nr < next_nr | ||
| 198 | || (next_overrun && !overrun)) { | ||
| 199 | next_nr = seq_nr; | ||
| 200 | next_overrun = overrun; | ||
| 201 | next_queue = queue; | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | padata = NULL; | ||
| 206 | |||
| 207 | if (empty == num_cpus) | ||
| 208 | goto out; | ||
| 209 | |||
| 210 | reorder = &next_queue->reorder; | ||
| 211 | |||
| 212 | if (!list_empty(&reorder->list)) { | ||
| 213 | padata = list_entry(reorder->list.next, | ||
| 214 | struct padata_priv, list); | ||
| 215 | |||
| 216 | if (unlikely(next_overrun)) { | ||
| 217 | for_each_cpu(cpu, pd->cpumask) { | ||
| 218 | queue = per_cpu_ptr(pd->queue, cpu); | ||
| 219 | atomic_set(&queue->num_obj, 0); | ||
| 220 | } | ||
| 221 | } | ||
| 222 | |||
| 223 | spin_lock(&reorder->lock); | ||
| 224 | list_del_init(&padata->list); | ||
| 225 | atomic_dec(&pd->reorder_objects); | ||
| 226 | spin_unlock(&reorder->lock); | ||
| 227 | |||
| 228 | atomic_inc(&next_queue->num_obj); | ||
| 229 | |||
| 230 | goto out; | ||
| 231 | } | ||
| 232 | |||
| 233 | if (next_nr % num_cpus == next_queue->cpu_index) { | ||
| 234 | padata = ERR_PTR(-ENODATA); | ||
| 235 | goto out; | ||
| 236 | } | ||
| 237 | |||
| 238 | padata = ERR_PTR(-EINPROGRESS); | ||
| 239 | out: | ||
| 240 | return padata; | ||
| 241 | } | ||
| 242 | |||
| 243 | static void padata_reorder(struct parallel_data *pd) | ||
| 244 | { | ||
| 245 | struct padata_priv *padata; | ||
| 246 | struct padata_queue *queue; | ||
| 247 | struct padata_instance *pinst = pd->pinst; | ||
| 248 | |||
| 249 | try_again: | ||
| 250 | if (!spin_trylock_bh(&pd->lock)) | ||
| 251 | goto out; | ||
| 252 | |||
| 253 | while (1) { | ||
| 254 | padata = padata_get_next(pd); | ||
| 255 | |||
| 256 | if (!padata || PTR_ERR(padata) == -EINPROGRESS) | ||
| 257 | break; | ||
| 258 | |||
| 259 | if (PTR_ERR(padata) == -ENODATA) { | ||
| 260 | spin_unlock_bh(&pd->lock); | ||
| 261 | goto out; | ||
| 262 | } | ||
| 263 | |||
| 264 | queue = per_cpu_ptr(pd->queue, padata->cb_cpu); | ||
| 265 | |||
| 266 | spin_lock(&queue->serial.lock); | ||
| 267 | list_add_tail(&padata->list, &queue->serial.list); | ||
| 268 | spin_unlock(&queue->serial.lock); | ||
| 269 | |||
| 270 | queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); | ||
| 271 | } | ||
| 272 | |||
| 273 | spin_unlock_bh(&pd->lock); | ||
| 274 | |||
| 275 | if (atomic_read(&pd->reorder_objects)) | ||
| 276 | goto try_again; | ||
| 277 | |||
| 278 | out: | ||
| 279 | return; | ||
| 280 | } | ||
| 281 | |||
| 282 | static void padata_serial_worker(struct work_struct *work) | ||
| 283 | { | ||
| 284 | struct padata_queue *queue; | ||
| 285 | struct parallel_data *pd; | ||
| 286 | LIST_HEAD(local_list); | ||
| 287 | |||
| 288 | local_bh_disable(); | ||
| 289 | queue = container_of(work, struct padata_queue, swork); | ||
| 290 | pd = queue->pd; | ||
| 291 | |||
| 292 | spin_lock(&queue->serial.lock); | ||
| 293 | list_replace_init(&queue->serial.list, &local_list); | ||
| 294 | spin_unlock(&queue->serial.lock); | ||
| 295 | |||
| 296 | while (!list_empty(&local_list)) { | ||
| 297 | struct padata_priv *padata; | ||
| 298 | |||
| 299 | padata = list_entry(local_list.next, | ||
| 300 | struct padata_priv, list); | ||
| 301 | |||
| 302 | list_del_init(&padata->list); | ||
| 303 | |||
| 304 | padata->serial(padata); | ||
| 305 | atomic_dec(&pd->refcnt); | ||
| 306 | } | ||
| 307 | local_bh_enable(); | ||
| 308 | } | ||
| 309 | |||
| 310 | /* | ||
| 311 | * padata_do_serial - padata serialization function | ||
| 312 | * | ||
| 313 | * @padata: object to be serialized. | ||
| 314 | * | ||
| 315 | * padata_do_serial must be called for every parallelized object. | ||
| 316 | * The serialization callback function will run with BHs off. | ||
| 317 | */ | ||
| 318 | void padata_do_serial(struct padata_priv *padata) | ||
| 319 | { | ||
| 320 | int cpu; | ||
| 321 | struct padata_queue *queue; | ||
| 322 | struct parallel_data *pd; | ||
| 323 | |||
| 324 | pd = padata->pd; | ||
| 325 | |||
| 326 | cpu = get_cpu(); | ||
| 327 | queue = per_cpu_ptr(pd->queue, cpu); | ||
| 328 | |||
| 329 | spin_lock(&queue->reorder.lock); | ||
| 330 | atomic_inc(&pd->reorder_objects); | ||
| 331 | list_add_tail(&padata->list, &queue->reorder.list); | ||
| 332 | spin_unlock(&queue->reorder.lock); | ||
| 333 | |||
| 334 | put_cpu(); | ||
| 335 | |||
| 336 | padata_reorder(pd); | ||
| 337 | } | ||
| 338 | EXPORT_SYMBOL(padata_do_serial); | ||
| 339 | |||
| 340 | static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | ||
| 341 | const struct cpumask *cpumask) | ||
| 342 | { | ||
| 343 | int cpu, cpu_index, num_cpus; | ||
| 344 | struct padata_queue *queue; | ||
| 345 | struct parallel_data *pd; | ||
| 346 | |||
| 347 | cpu_index = 0; | ||
| 348 | |||
| 349 | pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); | ||
| 350 | if (!pd) | ||
| 351 | goto err; | ||
| 352 | |||
| 353 | pd->queue = alloc_percpu(struct padata_queue); | ||
| 354 | if (!pd->queue) | ||
| 355 | goto err_free_pd; | ||
| 356 | |||
| 357 | if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) | ||
| 358 | goto err_free_queue; | ||
| 359 | |||
| 360 | for_each_possible_cpu(cpu) { | ||
| 361 | queue = per_cpu_ptr(pd->queue, cpu); | ||
| 362 | |||
| 363 | queue->pd = pd; | ||
| 364 | |||
| 365 | if (cpumask_test_cpu(cpu, cpumask) | ||
| 366 | && cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
| 367 | queue->cpu_index = cpu_index; | ||
| 368 | cpu_index++; | ||
| 369 | } else | ||
| 370 | queue->cpu_index = -1; | ||
| 371 | |||
| 372 | INIT_LIST_HEAD(&queue->reorder.list); | ||
| 373 | INIT_LIST_HEAD(&queue->parallel.list); | ||
| 374 | INIT_LIST_HEAD(&queue->serial.list); | ||
| 375 | spin_lock_init(&queue->reorder.lock); | ||
| 376 | spin_lock_init(&queue->parallel.lock); | ||
| 377 | spin_lock_init(&queue->serial.lock); | ||
| 378 | |||
| 379 | INIT_WORK(&queue->pwork, padata_parallel_worker); | ||
| 380 | INIT_WORK(&queue->swork, padata_serial_worker); | ||
| 381 | atomic_set(&queue->num_obj, 0); | ||
| 382 | } | ||
| 383 | |||
| 384 | cpumask_and(pd->cpumask, cpumask, cpu_active_mask); | ||
| 385 | |||
| 386 | num_cpus = cpumask_weight(pd->cpumask); | ||
| 387 | pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; | ||
| 388 | |||
| 389 | atomic_set(&pd->seq_nr, -1); | ||
| 390 | atomic_set(&pd->reorder_objects, 0); | ||
| 391 | atomic_set(&pd->refcnt, 0); | ||
| 392 | pd->pinst = pinst; | ||
| 393 | spin_lock_init(&pd->lock); | ||
| 394 | |||
| 395 | return pd; | ||
| 396 | |||
| 397 | err_free_queue: | ||
| 398 | free_percpu(pd->queue); | ||
| 399 | err_free_pd: | ||
| 400 | kfree(pd); | ||
| 401 | err: | ||
| 402 | return NULL; | ||
| 403 | } | ||
| 404 | |||
| 405 | static void padata_free_pd(struct parallel_data *pd) | ||
| 406 | { | ||
| 407 | free_cpumask_var(pd->cpumask); | ||
| 408 | free_percpu(pd->queue); | ||
| 409 | kfree(pd); | ||
| 410 | } | ||
| 411 | |||
| 412 | static void padata_replace(struct padata_instance *pinst, | ||
| 413 | struct parallel_data *pd_new) | ||
| 414 | { | ||
| 415 | struct parallel_data *pd_old = pinst->pd; | ||
| 416 | |||
| 417 | pinst->flags |= PADATA_RESET; | ||
| 418 | |||
| 419 | rcu_assign_pointer(pinst->pd, pd_new); | ||
| 420 | |||
| 421 | synchronize_rcu(); | ||
| 422 | |||
| 423 | while (atomic_read(&pd_old->refcnt) != 0) | ||
| 424 | yield(); | ||
| 425 | |||
| 426 | flush_workqueue(pinst->wq); | ||
| 427 | |||
| 428 | padata_free_pd(pd_old); | ||
| 429 | |||
| 430 | pinst->flags &= ~PADATA_RESET; | ||
| 431 | } | ||
| 432 | |||
| 433 | /* | ||
| 434 | * padata_set_cpumask - set the cpumask that padata should use | ||
| 435 | * | ||
| 436 | * @pinst: padata instance | ||
| 437 | * @cpumask: the cpumask to use | ||
| 438 | */ | ||
| 439 | int padata_set_cpumask(struct padata_instance *pinst, | ||
| 440 | cpumask_var_t cpumask) | ||
| 441 | { | ||
| 442 | struct parallel_data *pd; | ||
| 443 | int err = 0; | ||
| 444 | |||
| 445 | might_sleep(); | ||
| 446 | |||
| 447 | mutex_lock(&pinst->lock); | ||
| 448 | |||
| 449 | pd = padata_alloc_pd(pinst, cpumask); | ||
| 450 | if (!pd) { | ||
| 451 | err = -ENOMEM; | ||
| 452 | goto out; | ||
| 453 | } | ||
| 454 | |||
| 455 | cpumask_copy(pinst->cpumask, cpumask); | ||
| 456 | |||
| 457 | padata_replace(pinst, pd); | ||
| 458 | |||
| 459 | out: | ||
| 460 | mutex_unlock(&pinst->lock); | ||
| 461 | |||
| 462 | return err; | ||
| 463 | } | ||
| 464 | EXPORT_SYMBOL(padata_set_cpumask); | ||
| 465 | |||
| 466 | static int __padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
| 467 | { | ||
| 468 | struct parallel_data *pd; | ||
| 469 | |||
| 470 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { | ||
| 471 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
| 472 | if (!pd) | ||
| 473 | return -ENOMEM; | ||
| 474 | |||
| 475 | padata_replace(pinst, pd); | ||
| 476 | } | ||
| 477 | |||
| 478 | return 0; | ||
| 479 | } | ||
| 480 | |||
| 481 | /* | ||
| 482 | * padata_add_cpu - add a cpu to the padata cpumask | ||
| 483 | * | ||
| 484 | * @pinst: padata instance | ||
| 485 | * @cpu: cpu to add | ||
| 486 | */ | ||
| 487 | int padata_add_cpu(struct padata_instance *pinst, int cpu) | ||
| 488 | { | ||
| 489 | int err; | ||
| 490 | |||
| 491 | might_sleep(); | ||
| 492 | |||
| 493 | mutex_lock(&pinst->lock); | ||
| 494 | |||
| 495 | cpumask_set_cpu(cpu, pinst->cpumask); | ||
| 496 | err = __padata_add_cpu(pinst, cpu); | ||
| 497 | |||
| 498 | mutex_unlock(&pinst->lock); | ||
| 499 | |||
| 500 | return err; | ||
| 501 | } | ||
| 502 | EXPORT_SYMBOL(padata_add_cpu); | ||
| 503 | |||
| 504 | static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
| 505 | { | ||
| 506 | struct parallel_data *pd; | ||
| 507 | |||
| 508 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { | ||
| 509 | pd = padata_alloc_pd(pinst, pinst->cpumask); | ||
| 510 | if (!pd) | ||
| 511 | return -ENOMEM; | ||
| 512 | |||
| 513 | padata_replace(pinst, pd); | ||
| 514 | } | ||
| 515 | |||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 519 | /* | ||
| 520 | * padata_remove_cpu - remove a cpu from the padata cpumask | ||
| 521 | * | ||
| 522 | * @pinst: padata instance | ||
| 523 | * @cpu: cpu to remove | ||
| 524 | */ | ||
| 525 | int padata_remove_cpu(struct padata_instance *pinst, int cpu) | ||
| 526 | { | ||
| 527 | int err; | ||
| 528 | |||
| 529 | might_sleep(); | ||
| 530 | |||
| 531 | mutex_lock(&pinst->lock); | ||
| 532 | |||
| 533 | cpumask_clear_cpu(cpu, pinst->cpumask); | ||
| 534 | err = __padata_remove_cpu(pinst, cpu); | ||
| 535 | |||
| 536 | mutex_unlock(&pinst->lock); | ||
| 537 | |||
| 538 | return err; | ||
| 539 | } | ||
| 540 | EXPORT_SYMBOL(padata_remove_cpu); | ||
| 541 | |||
| 542 | /* | ||
| 543 | * padata_start - start the parallel processing | ||
| 544 | * | ||
| 545 | * @pinst: padata instance to start | ||
| 546 | */ | ||
| 547 | void padata_start(struct padata_instance *pinst) | ||
| 548 | { | ||
| 549 | might_sleep(); | ||
| 550 | |||
| 551 | mutex_lock(&pinst->lock); | ||
| 552 | pinst->flags |= PADATA_INIT; | ||
| 553 | mutex_unlock(&pinst->lock); | ||
| 554 | } | ||
| 555 | EXPORT_SYMBOL(padata_start); | ||
| 556 | |||
| 557 | /* | ||
| 558 | * padata_stop - stop the parallel processing | ||
| 559 | * | ||
| 560 | * @pinst: padata instance to stop | ||
| 561 | */ | ||
| 562 | void padata_stop(struct padata_instance *pinst) | ||
| 563 | { | ||
| 564 | might_sleep(); | ||
| 565 | |||
| 566 | mutex_lock(&pinst->lock); | ||
| 567 | pinst->flags &= ~PADATA_INIT; | ||
| 568 | mutex_unlock(&pinst->lock); | ||
| 569 | } | ||
| 570 | EXPORT_SYMBOL(padata_stop); | ||
| 571 | |||
| 572 | static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, | ||
| 573 | unsigned long action, void *hcpu) | ||
| 574 | { | ||
| 575 | int err; | ||
| 576 | struct padata_instance *pinst; | ||
| 577 | int cpu = (unsigned long)hcpu; | ||
| 578 | |||
| 579 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | ||
| 580 | |||
| 581 | switch (action) { | ||
| 582 | case CPU_ONLINE: | ||
| 583 | case CPU_ONLINE_FROZEN: | ||
| 584 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
| 585 | break; | ||
| 586 | mutex_lock(&pinst->lock); | ||
| 587 | err = __padata_add_cpu(pinst, cpu); | ||
| 588 | mutex_unlock(&pinst->lock); | ||
| 589 | if (err) | ||
| 590 | return NOTIFY_BAD; | ||
| 591 | break; | ||
| 592 | |||
| 593 | case CPU_DOWN_PREPARE: | ||
| 594 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 595 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
| 596 | break; | ||
| 597 | mutex_lock(&pinst->lock); | ||
| 598 | err = __padata_remove_cpu(pinst, cpu); | ||
| 599 | mutex_unlock(&pinst->lock); | ||
| 600 | if (err) | ||
| 601 | return NOTIFY_BAD; | ||
| 602 | break; | ||
| 603 | |||
| 604 | case CPU_UP_CANCELED: | ||
| 605 | case CPU_UP_CANCELED_FROZEN: | ||
| 606 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
| 607 | break; | ||
| 608 | mutex_lock(&pinst->lock); | ||
| 609 | __padata_remove_cpu(pinst, cpu); | ||
| 610 | mutex_unlock(&pinst->lock); | ||
| 611 | |||
| 612 | case CPU_DOWN_FAILED: | ||
| 613 | case CPU_DOWN_FAILED_FROZEN: | ||
| 614 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | ||
| 615 | break; | ||
| 616 | mutex_lock(&pinst->lock); | ||
| 617 | __padata_add_cpu(pinst, cpu); | ||
| 618 | mutex_unlock(&pinst->lock); | ||
| 619 | } | ||
| 620 | |||
| 621 | return NOTIFY_OK; | ||
| 622 | } | ||
| 623 | |||
| 624 | /* | ||
| 625 | * padata_alloc - allocate and initialize a padata instance | ||
| 626 | * | ||
| 627 | * @cpumask: cpumask that padata uses for parallelization | ||
| 628 | * @wq: workqueue to use for the allocated padata instance | ||
| 629 | */ | ||
| 630 | struct padata_instance *padata_alloc(const struct cpumask *cpumask, | ||
| 631 | struct workqueue_struct *wq) | ||
| 632 | { | ||
| 633 | int err; | ||
| 634 | struct padata_instance *pinst; | ||
| 635 | struct parallel_data *pd; | ||
| 636 | |||
| 637 | pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); | ||
| 638 | if (!pinst) | ||
| 639 | goto err; | ||
| 640 | |||
| 641 | pd = padata_alloc_pd(pinst, cpumask); | ||
| 642 | if (!pd) | ||
| 643 | goto err_free_inst; | ||
| 644 | |||
| 645 | if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) | ||
| 646 | goto err_free_pd; | ||
| 647 | |||
| 648 | rcu_assign_pointer(pinst->pd, pd); | ||
| 649 | |||
| 650 | pinst->wq = wq; | ||
| 651 | |||
| 652 | cpumask_copy(pinst->cpumask, cpumask); | ||
| 653 | |||
| 654 | pinst->flags = 0; | ||
| 655 | |||
| 656 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
| 657 | pinst->cpu_notifier.priority = 0; | ||
| 658 | err = register_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 659 | if (err) | ||
| 660 | goto err_free_cpumask; | ||
| 661 | |||
| 662 | mutex_init(&pinst->lock); | ||
| 663 | |||
| 664 | return pinst; | ||
| 665 | |||
| 666 | err_free_cpumask: | ||
| 667 | free_cpumask_var(pinst->cpumask); | ||
| 668 | err_free_pd: | ||
| 669 | padata_free_pd(pd); | ||
| 670 | err_free_inst: | ||
| 671 | kfree(pinst); | ||
| 672 | err: | ||
| 673 | return NULL; | ||
| 674 | } | ||
| 675 | EXPORT_SYMBOL(padata_alloc); | ||
| 676 | |||
| 677 | /* | ||
| 678 | * padata_free - free a padata instance | ||
| 679 | * | ||
| 680 | * @ padata_inst: padata instance to free | ||
| 681 | */ | ||
| 682 | void padata_free(struct padata_instance *pinst) | ||
| 683 | { | ||
| 684 | padata_stop(pinst); | ||
| 685 | |||
| 686 | synchronize_rcu(); | ||
| 687 | |||
| 688 | while (atomic_read(&pinst->pd->refcnt) != 0) | ||
| 689 | yield(); | ||
| 690 | |||
| 691 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 692 | padata_free_pd(pinst->pd); | ||
| 693 | free_cpumask_var(pinst->cpumask); | ||
| 694 | kfree(pinst); | ||
| 695 | } | ||
| 696 | EXPORT_SYMBOL(padata_free); | ||
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746bd3a06..a661e7991865 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); } | |||
| 98 | 98 | ||
| 99 | void __weak hw_perf_event_setup(int cpu) { barrier(); } | 99 | void __weak hw_perf_event_setup(int cpu) { barrier(); } |
| 100 | void __weak hw_perf_event_setup_online(int cpu) { barrier(); } | 100 | void __weak hw_perf_event_setup_online(int cpu) { barrier(); } |
| 101 | void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } | ||
| 101 | 102 | ||
| 102 | int __weak | 103 | int __weak |
| 103 | hw_perf_group_sched_in(struct perf_event *group_leader, | 104 | hw_perf_group_sched_in(struct perf_event *group_leader, |
| 104 | struct perf_cpu_context *cpuctx, | 105 | struct perf_cpu_context *cpuctx, |
| 105 | struct perf_event_context *ctx, int cpu) | 106 | struct perf_event_context *ctx) |
| 106 | { | 107 | { |
| 107 | return 0; | 108 | return 0; |
| 108 | } | 109 | } |
| @@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
| 248 | 249 | ||
| 249 | static inline u64 perf_clock(void) | 250 | static inline u64 perf_clock(void) |
| 250 | { | 251 | { |
| 251 | return cpu_clock(smp_processor_id()); | 252 | return cpu_clock(raw_smp_processor_id()); |
| 252 | } | 253 | } |
| 253 | 254 | ||
| 254 | /* | 255 | /* |
| @@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event) | |||
| 289 | event->total_time_running = run_end - event->tstamp_running; | 290 | event->total_time_running = run_end - event->tstamp_running; |
| 290 | } | 291 | } |
| 291 | 292 | ||
| 293 | static struct list_head * | ||
| 294 | ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | ||
| 295 | { | ||
| 296 | if (event->attr.pinned) | ||
| 297 | return &ctx->pinned_groups; | ||
| 298 | else | ||
| 299 | return &ctx->flexible_groups; | ||
| 300 | } | ||
| 301 | |||
| 292 | /* | 302 | /* |
| 293 | * Add a event from the lists for its context. | 303 | * Add a event from the lists for its context. |
| 294 | * Must be called with ctx->mutex and ctx->lock held. | 304 | * Must be called with ctx->mutex and ctx->lock held. |
| @@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 303 | * add it straight to the context's event list, or to the group | 313 | * add it straight to the context's event list, or to the group |
| 304 | * leader's sibling list: | 314 | * leader's sibling list: |
| 305 | */ | 315 | */ |
| 306 | if (group_leader == event) | 316 | if (group_leader == event) { |
| 307 | list_add_tail(&event->group_entry, &ctx->group_list); | 317 | struct list_head *list; |
| 308 | else { | 318 | |
| 319 | if (is_software_event(event)) | ||
| 320 | event->group_flags |= PERF_GROUP_SOFTWARE; | ||
| 321 | |||
| 322 | list = ctx_group_list(event, ctx); | ||
| 323 | list_add_tail(&event->group_entry, list); | ||
| 324 | } else { | ||
| 325 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | ||
| 326 | !is_software_event(event)) | ||
| 327 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | ||
| 328 | |||
| 309 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 329 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
| 310 | group_leader->nr_siblings++; | 330 | group_leader->nr_siblings++; |
| 311 | } | 331 | } |
| @@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 355 | * to the context list directly: | 375 | * to the context list directly: |
| 356 | */ | 376 | */ |
| 357 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { | 377 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { |
| 378 | struct list_head *list; | ||
| 358 | 379 | ||
| 359 | list_move_tail(&sibling->group_entry, &ctx->group_list); | 380 | list = ctx_group_list(event, ctx); |
| 381 | list_move_tail(&sibling->group_entry, list); | ||
| 360 | sibling->group_leader = sibling; | 382 | sibling->group_leader = sibling; |
| 383 | |||
| 384 | /* Inherit group flags from the previous leader */ | ||
| 385 | sibling->group_flags = event->group_flags; | ||
| 361 | } | 386 | } |
| 362 | } | 387 | } |
| 363 | 388 | ||
| @@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event) | |||
| 608 | static int | 633 | static int |
| 609 | event_sched_in(struct perf_event *event, | 634 | event_sched_in(struct perf_event *event, |
| 610 | struct perf_cpu_context *cpuctx, | 635 | struct perf_cpu_context *cpuctx, |
| 611 | struct perf_event_context *ctx, | 636 | struct perf_event_context *ctx) |
| 612 | int cpu) | ||
| 613 | { | 637 | { |
| 614 | if (event->state <= PERF_EVENT_STATE_OFF) | 638 | if (event->state <= PERF_EVENT_STATE_OFF) |
| 615 | return 0; | 639 | return 0; |
| 616 | 640 | ||
| 617 | event->state = PERF_EVENT_STATE_ACTIVE; | 641 | event->state = PERF_EVENT_STATE_ACTIVE; |
| 618 | event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ | 642 | event->oncpu = smp_processor_id(); |
| 619 | /* | 643 | /* |
| 620 | * The new state must be visible before we turn it on in the hardware: | 644 | * The new state must be visible before we turn it on in the hardware: |
| 621 | */ | 645 | */ |
| @@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event, | |||
| 642 | static int | 666 | static int |
| 643 | group_sched_in(struct perf_event *group_event, | 667 | group_sched_in(struct perf_event *group_event, |
| 644 | struct perf_cpu_context *cpuctx, | 668 | struct perf_cpu_context *cpuctx, |
| 645 | struct perf_event_context *ctx, | 669 | struct perf_event_context *ctx) |
| 646 | int cpu) | ||
| 647 | { | 670 | { |
| 648 | struct perf_event *event, *partial_group; | 671 | struct perf_event *event, *partial_group; |
| 649 | int ret; | 672 | int ret; |
| @@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event, | |||
| 651 | if (group_event->state == PERF_EVENT_STATE_OFF) | 674 | if (group_event->state == PERF_EVENT_STATE_OFF) |
| 652 | return 0; | 675 | return 0; |
| 653 | 676 | ||
| 654 | ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); | 677 | ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); |
| 655 | if (ret) | 678 | if (ret) |
| 656 | return ret < 0 ? ret : 0; | 679 | return ret < 0 ? ret : 0; |
| 657 | 680 | ||
| 658 | if (event_sched_in(group_event, cpuctx, ctx, cpu)) | 681 | if (event_sched_in(group_event, cpuctx, ctx)) |
| 659 | return -EAGAIN; | 682 | return -EAGAIN; |
| 660 | 683 | ||
| 661 | /* | 684 | /* |
| 662 | * Schedule in siblings as one group (if any): | 685 | * Schedule in siblings as one group (if any): |
| 663 | */ | 686 | */ |
| 664 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 687 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
| 665 | if (event_sched_in(event, cpuctx, ctx, cpu)) { | 688 | if (event_sched_in(event, cpuctx, ctx)) { |
| 666 | partial_group = event; | 689 | partial_group = event; |
| 667 | goto group_error; | 690 | goto group_error; |
| 668 | } | 691 | } |
| @@ -686,24 +709,6 @@ group_error: | |||
| 686 | } | 709 | } |
| 687 | 710 | ||
| 688 | /* | 711 | /* |
| 689 | * Return 1 for a group consisting entirely of software events, | ||
| 690 | * 0 if the group contains any hardware events. | ||
| 691 | */ | ||
| 692 | static int is_software_only_group(struct perf_event *leader) | ||
| 693 | { | ||
| 694 | struct perf_event *event; | ||
| 695 | |||
| 696 | if (!is_software_event(leader)) | ||
| 697 | return 0; | ||
| 698 | |||
| 699 | list_for_each_entry(event, &leader->sibling_list, group_entry) | ||
| 700 | if (!is_software_event(event)) | ||
| 701 | return 0; | ||
| 702 | |||
| 703 | return 1; | ||
| 704 | } | ||
| 705 | |||
| 706 | /* | ||
| 707 | * Work out whether we can put this event group on the CPU now. | 712 | * Work out whether we can put this event group on the CPU now. |
| 708 | */ | 713 | */ |
| 709 | static int group_can_go_on(struct perf_event *event, | 714 | static int group_can_go_on(struct perf_event *event, |
| @@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event, | |||
| 713 | /* | 718 | /* |
| 714 | * Groups consisting entirely of software events can always go on. | 719 | * Groups consisting entirely of software events can always go on. |
| 715 | */ | 720 | */ |
| 716 | if (is_software_only_group(event)) | 721 | if (event->group_flags & PERF_GROUP_SOFTWARE) |
| 717 | return 1; | 722 | return 1; |
| 718 | /* | 723 | /* |
| 719 | * If an exclusive group is already on, no other hardware | 724 | * If an exclusive group is already on, no other hardware |
| @@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info) | |||
| 754 | struct perf_event *event = info; | 759 | struct perf_event *event = info; |
| 755 | struct perf_event_context *ctx = event->ctx; | 760 | struct perf_event_context *ctx = event->ctx; |
| 756 | struct perf_event *leader = event->group_leader; | 761 | struct perf_event *leader = event->group_leader; |
| 757 | int cpu = smp_processor_id(); | ||
| 758 | int err; | 762 | int err; |
| 759 | 763 | ||
| 760 | /* | 764 | /* |
| @@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info) | |||
| 801 | if (!group_can_go_on(event, cpuctx, 1)) | 805 | if (!group_can_go_on(event, cpuctx, 1)) |
| 802 | err = -EEXIST; | 806 | err = -EEXIST; |
| 803 | else | 807 | else |
| 804 | err = event_sched_in(event, cpuctx, ctx, cpu); | 808 | err = event_sched_in(event, cpuctx, ctx); |
| 805 | 809 | ||
| 806 | if (err) { | 810 | if (err) { |
| 807 | /* | 811 | /* |
| @@ -943,11 +947,9 @@ static void __perf_event_enable(void *info) | |||
| 943 | } else { | 947 | } else { |
| 944 | perf_disable(); | 948 | perf_disable(); |
| 945 | if (event == leader) | 949 | if (event == leader) |
| 946 | err = group_sched_in(event, cpuctx, ctx, | 950 | err = group_sched_in(event, cpuctx, ctx); |
| 947 | smp_processor_id()); | ||
| 948 | else | 951 | else |
| 949 | err = event_sched_in(event, cpuctx, ctx, | 952 | err = event_sched_in(event, cpuctx, ctx); |
| 950 | smp_processor_id()); | ||
| 951 | perf_enable(); | 953 | perf_enable(); |
| 952 | } | 954 | } |
| 953 | 955 | ||
| @@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 1043 | return 0; | 1045 | return 0; |
| 1044 | } | 1046 | } |
| 1045 | 1047 | ||
| 1046 | void __perf_event_sched_out(struct perf_event_context *ctx, | 1048 | enum event_type_t { |
| 1047 | struct perf_cpu_context *cpuctx) | 1049 | EVENT_FLEXIBLE = 0x1, |
| 1050 | EVENT_PINNED = 0x2, | ||
| 1051 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
| 1052 | }; | ||
| 1053 | |||
| 1054 | static void ctx_sched_out(struct perf_event_context *ctx, | ||
| 1055 | struct perf_cpu_context *cpuctx, | ||
| 1056 | enum event_type_t event_type) | ||
| 1048 | { | 1057 | { |
| 1049 | struct perf_event *event; | 1058 | struct perf_event *event; |
| 1050 | 1059 | ||
| @@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx, | |||
| 1055 | update_context_time(ctx); | 1064 | update_context_time(ctx); |
| 1056 | 1065 | ||
| 1057 | perf_disable(); | 1066 | perf_disable(); |
| 1058 | if (ctx->nr_active) { | 1067 | if (!ctx->nr_active) |
| 1059 | list_for_each_entry(event, &ctx->group_list, group_entry) | 1068 | goto out_enable; |
| 1069 | |||
| 1070 | if (event_type & EVENT_PINNED) | ||
| 1071 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | ||
| 1060 | group_sched_out(event, cpuctx, ctx); | 1072 | group_sched_out(event, cpuctx, ctx); |
| 1061 | } | 1073 | |
| 1074 | if (event_type & EVENT_FLEXIBLE) | ||
| 1075 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | ||
| 1076 | group_sched_out(event, cpuctx, ctx); | ||
| 1077 | |||
| 1078 | out_enable: | ||
| 1062 | perf_enable(); | 1079 | perf_enable(); |
| 1063 | out: | 1080 | out: |
| 1064 | raw_spin_unlock(&ctx->lock); | 1081 | raw_spin_unlock(&ctx->lock); |
| @@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
| 1170 | * not restart the event. | 1187 | * not restart the event. |
| 1171 | */ | 1188 | */ |
| 1172 | void perf_event_task_sched_out(struct task_struct *task, | 1189 | void perf_event_task_sched_out(struct task_struct *task, |
| 1173 | struct task_struct *next, int cpu) | 1190 | struct task_struct *next) |
| 1174 | { | 1191 | { |
| 1175 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 1192 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 1176 | struct perf_event_context *ctx = task->perf_event_ctxp; | 1193 | struct perf_event_context *ctx = task->perf_event_ctxp; |
| 1177 | struct perf_event_context *next_ctx; | 1194 | struct perf_event_context *next_ctx; |
| 1178 | struct perf_event_context *parent; | 1195 | struct perf_event_context *parent; |
| @@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
| 1220 | rcu_read_unlock(); | 1237 | rcu_read_unlock(); |
| 1221 | 1238 | ||
| 1222 | if (do_switch) { | 1239 | if (do_switch) { |
| 1223 | __perf_event_sched_out(ctx, cpuctx); | 1240 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
| 1224 | cpuctx->task_ctx = NULL; | 1241 | cpuctx->task_ctx = NULL; |
| 1225 | } | 1242 | } |
| 1226 | } | 1243 | } |
| 1227 | 1244 | ||
| 1228 | /* | 1245 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
| 1229 | * Called with IRQs disabled | 1246 | enum event_type_t event_type) |
| 1230 | */ | ||
| 1231 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
| 1232 | { | 1247 | { |
| 1233 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1248 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 1234 | 1249 | ||
| @@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx) | |||
| 1238 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) | 1253 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) |
| 1239 | return; | 1254 | return; |
| 1240 | 1255 | ||
| 1241 | __perf_event_sched_out(ctx, cpuctx); | 1256 | ctx_sched_out(ctx, cpuctx, event_type); |
| 1242 | cpuctx->task_ctx = NULL; | 1257 | cpuctx->task_ctx = NULL; |
| 1243 | } | 1258 | } |
| 1244 | 1259 | ||
| 1245 | /* | 1260 | /* |
| 1246 | * Called with IRQs disabled | 1261 | * Called with IRQs disabled |
| 1247 | */ | 1262 | */ |
| 1248 | static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) | 1263 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) |
| 1264 | { | ||
| 1265 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | /* | ||
| 1269 | * Called with IRQs disabled | ||
| 1270 | */ | ||
| 1271 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | ||
| 1272 | enum event_type_t event_type) | ||
| 1249 | { | 1273 | { |
| 1250 | __perf_event_sched_out(&cpuctx->ctx, cpuctx); | 1274 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); |
| 1251 | } | 1275 | } |
| 1252 | 1276 | ||
| 1253 | static void | 1277 | static void |
| 1254 | __perf_event_sched_in(struct perf_event_context *ctx, | 1278 | ctx_pinned_sched_in(struct perf_event_context *ctx, |
| 1255 | struct perf_cpu_context *cpuctx, int cpu) | 1279 | struct perf_cpu_context *cpuctx) |
| 1256 | { | 1280 | { |
| 1257 | struct perf_event *event; | 1281 | struct perf_event *event; |
| 1258 | int can_add_hw = 1; | ||
| 1259 | |||
| 1260 | raw_spin_lock(&ctx->lock); | ||
| 1261 | ctx->is_active = 1; | ||
| 1262 | if (likely(!ctx->nr_events)) | ||
| 1263 | goto out; | ||
| 1264 | |||
| 1265 | ctx->timestamp = perf_clock(); | ||
| 1266 | |||
| 1267 | perf_disable(); | ||
| 1268 | 1282 | ||
| 1269 | /* | 1283 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
| 1270 | * First go through the list and put on any pinned groups | 1284 | if (event->state <= PERF_EVENT_STATE_OFF) |
| 1271 | * in order to give them the best chance of going on. | ||
| 1272 | */ | ||
| 1273 | list_for_each_entry(event, &ctx->group_list, group_entry) { | ||
| 1274 | if (event->state <= PERF_EVENT_STATE_OFF || | ||
| 1275 | !event->attr.pinned) | ||
| 1276 | continue; | 1285 | continue; |
| 1277 | if (event->cpu != -1 && event->cpu != cpu) | 1286 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| 1278 | continue; | 1287 | continue; |
| 1279 | 1288 | ||
| 1280 | if (group_can_go_on(event, cpuctx, 1)) | 1289 | if (group_can_go_on(event, cpuctx, 1)) |
| 1281 | group_sched_in(event, cpuctx, ctx, cpu); | 1290 | group_sched_in(event, cpuctx, ctx); |
| 1282 | 1291 | ||
| 1283 | /* | 1292 | /* |
| 1284 | * If this pinned group hasn't been scheduled, | 1293 | * If this pinned group hasn't been scheduled, |
| @@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx, | |||
| 1289 | event->state = PERF_EVENT_STATE_ERROR; | 1298 | event->state = PERF_EVENT_STATE_ERROR; |
| 1290 | } | 1299 | } |
| 1291 | } | 1300 | } |
| 1301 | } | ||
| 1292 | 1302 | ||
| 1293 | list_for_each_entry(event, &ctx->group_list, group_entry) { | 1303 | static void |
| 1294 | /* | 1304 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
| 1295 | * Ignore events in OFF or ERROR state, and | 1305 | struct perf_cpu_context *cpuctx) |
| 1296 | * ignore pinned events since we did them already. | 1306 | { |
| 1297 | */ | 1307 | struct perf_event *event; |
| 1298 | if (event->state <= PERF_EVENT_STATE_OFF || | 1308 | int can_add_hw = 1; |
| 1299 | event->attr.pinned) | ||
| 1300 | continue; | ||
| 1301 | 1309 | ||
| 1310 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | ||
| 1311 | /* Ignore events in OFF or ERROR state */ | ||
| 1312 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
| 1313 | continue; | ||
| 1302 | /* | 1314 | /* |
| 1303 | * Listen to the 'cpu' scheduling filter constraint | 1315 | * Listen to the 'cpu' scheduling filter constraint |
| 1304 | * of events: | 1316 | * of events: |
| 1305 | */ | 1317 | */ |
| 1306 | if (event->cpu != -1 && event->cpu != cpu) | 1318 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| 1307 | continue; | 1319 | continue; |
| 1308 | 1320 | ||
| 1309 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1321 | if (group_can_go_on(event, cpuctx, can_add_hw)) |
| 1310 | if (group_sched_in(event, cpuctx, ctx, cpu)) | 1322 | if (group_sched_in(event, cpuctx, ctx)) |
| 1311 | can_add_hw = 0; | 1323 | can_add_hw = 0; |
| 1312 | } | 1324 | } |
| 1325 | } | ||
| 1326 | |||
| 1327 | static void | ||
| 1328 | ctx_sched_in(struct perf_event_context *ctx, | ||
| 1329 | struct perf_cpu_context *cpuctx, | ||
| 1330 | enum event_type_t event_type) | ||
| 1331 | { | ||
| 1332 | raw_spin_lock(&ctx->lock); | ||
| 1333 | ctx->is_active = 1; | ||
| 1334 | if (likely(!ctx->nr_events)) | ||
| 1335 | goto out; | ||
| 1336 | |||
| 1337 | ctx->timestamp = perf_clock(); | ||
| 1338 | |||
| 1339 | perf_disable(); | ||
| 1340 | |||
| 1341 | /* | ||
| 1342 | * First go through the list and put on any pinned groups | ||
| 1343 | * in order to give them the best chance of going on. | ||
| 1344 | */ | ||
| 1345 | if (event_type & EVENT_PINNED) | ||
| 1346 | ctx_pinned_sched_in(ctx, cpuctx); | ||
| 1347 | |||
| 1348 | /* Then walk through the lower prio flexible groups */ | ||
| 1349 | if (event_type & EVENT_FLEXIBLE) | ||
| 1350 | ctx_flexible_sched_in(ctx, cpuctx); | ||
| 1351 | |||
| 1313 | perf_enable(); | 1352 | perf_enable(); |
| 1314 | out: | 1353 | out: |
| 1315 | raw_spin_unlock(&ctx->lock); | 1354 | raw_spin_unlock(&ctx->lock); |
| 1316 | } | 1355 | } |
| 1317 | 1356 | ||
| 1357 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | ||
| 1358 | enum event_type_t event_type) | ||
| 1359 | { | ||
| 1360 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
| 1361 | |||
| 1362 | ctx_sched_in(ctx, cpuctx, event_type); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | static void task_ctx_sched_in(struct task_struct *task, | ||
| 1366 | enum event_type_t event_type) | ||
| 1367 | { | ||
| 1368 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 1369 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
| 1370 | |||
| 1371 | if (likely(!ctx)) | ||
| 1372 | return; | ||
| 1373 | if (cpuctx->task_ctx == ctx) | ||
| 1374 | return; | ||
| 1375 | ctx_sched_in(ctx, cpuctx, event_type); | ||
| 1376 | cpuctx->task_ctx = ctx; | ||
| 1377 | } | ||
| 1318 | /* | 1378 | /* |
| 1319 | * Called from scheduler to add the events of the current task | 1379 | * Called from scheduler to add the events of the current task |
| 1320 | * with interrupts disabled. | 1380 | * with interrupts disabled. |
| @@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx, | |||
| 1326 | * accessing the event control register. If a NMI hits, then it will | 1386 | * accessing the event control register. If a NMI hits, then it will |
| 1327 | * keep the event running. | 1387 | * keep the event running. |
| 1328 | */ | 1388 | */ |
| 1329 | void perf_event_task_sched_in(struct task_struct *task, int cpu) | 1389 | void perf_event_task_sched_in(struct task_struct *task) |
| 1330 | { | 1390 | { |
| 1331 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 1391 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 1332 | struct perf_event_context *ctx = task->perf_event_ctxp; | 1392 | struct perf_event_context *ctx = task->perf_event_ctxp; |
| 1333 | 1393 | ||
| 1334 | if (likely(!ctx)) | 1394 | if (likely(!ctx)) |
| 1335 | return; | 1395 | return; |
| 1396 | |||
| 1336 | if (cpuctx->task_ctx == ctx) | 1397 | if (cpuctx->task_ctx == ctx) |
| 1337 | return; | 1398 | return; |
| 1338 | __perf_event_sched_in(ctx, cpuctx, cpu); | 1399 | |
| 1400 | /* | ||
| 1401 | * We want to keep the following priority order: | ||
| 1402 | * cpu pinned (that don't need to move), task pinned, | ||
| 1403 | * cpu flexible, task flexible. | ||
| 1404 | */ | ||
| 1405 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | ||
| 1406 | |||
| 1407 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | ||
| 1408 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | ||
| 1409 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | ||
| 1410 | |||
| 1339 | cpuctx->task_ctx = ctx; | 1411 | cpuctx->task_ctx = ctx; |
| 1340 | } | 1412 | } |
| 1341 | 1413 | ||
| 1342 | static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) | 1414 | #define MAX_INTERRUPTS (~0ULL) |
| 1415 | |||
| 1416 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
| 1417 | |||
| 1418 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | ||
| 1343 | { | 1419 | { |
| 1344 | struct perf_event_context *ctx = &cpuctx->ctx; | 1420 | u64 frequency = event->attr.sample_freq; |
| 1421 | u64 sec = NSEC_PER_SEC; | ||
| 1422 | u64 divisor, dividend; | ||
| 1423 | |||
| 1424 | int count_fls, nsec_fls, frequency_fls, sec_fls; | ||
| 1425 | |||
| 1426 | count_fls = fls64(count); | ||
| 1427 | nsec_fls = fls64(nsec); | ||
| 1428 | frequency_fls = fls64(frequency); | ||
| 1429 | sec_fls = 30; | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * We got @count in @nsec, with a target of sample_freq HZ | ||
| 1433 | * the target period becomes: | ||
| 1434 | * | ||
| 1435 | * @count * 10^9 | ||
| 1436 | * period = ------------------- | ||
| 1437 | * @nsec * sample_freq | ||
| 1438 | * | ||
| 1439 | */ | ||
| 1440 | |||
| 1441 | /* | ||
| 1442 | * Reduce accuracy by one bit such that @a and @b converge | ||
| 1443 | * to a similar magnitude. | ||
| 1444 | */ | ||
| 1445 | #define REDUCE_FLS(a, b) \ | ||
| 1446 | do { \ | ||
| 1447 | if (a##_fls > b##_fls) { \ | ||
| 1448 | a >>= 1; \ | ||
| 1449 | a##_fls--; \ | ||
| 1450 | } else { \ | ||
| 1451 | b >>= 1; \ | ||
| 1452 | b##_fls--; \ | ||
| 1453 | } \ | ||
| 1454 | } while (0) | ||
| 1455 | |||
| 1456 | /* | ||
| 1457 | * Reduce accuracy until either term fits in a u64, then proceed with | ||
| 1458 | * the other, so that finally we can do a u64/u64 division. | ||
| 1459 | */ | ||
| 1460 | while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { | ||
| 1461 | REDUCE_FLS(nsec, frequency); | ||
| 1462 | REDUCE_FLS(sec, count); | ||
| 1463 | } | ||
| 1464 | |||
| 1465 | if (count_fls + sec_fls > 64) { | ||
| 1466 | divisor = nsec * frequency; | ||
| 1345 | 1467 | ||
| 1346 | __perf_event_sched_in(ctx, cpuctx, cpu); | 1468 | while (count_fls + sec_fls > 64) { |
| 1469 | REDUCE_FLS(count, sec); | ||
| 1470 | divisor >>= 1; | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | dividend = count * sec; | ||
| 1474 | } else { | ||
| 1475 | dividend = count * sec; | ||
| 1476 | |||
| 1477 | while (nsec_fls + frequency_fls > 64) { | ||
| 1478 | REDUCE_FLS(nsec, frequency); | ||
| 1479 | dividend >>= 1; | ||
| 1480 | } | ||
| 1481 | |||
| 1482 | divisor = nsec * frequency; | ||
| 1483 | } | ||
| 1484 | |||
| 1485 | return div64_u64(dividend, divisor); | ||
| 1347 | } | 1486 | } |
| 1348 | 1487 | ||
| 1349 | #define MAX_INTERRUPTS (~0ULL) | 1488 | static void perf_event_stop(struct perf_event *event) |
| 1489 | { | ||
| 1490 | if (!event->pmu->stop) | ||
| 1491 | return event->pmu->disable(event); | ||
| 1350 | 1492 | ||
| 1351 | static void perf_log_throttle(struct perf_event *event, int enable); | 1493 | return event->pmu->stop(event); |
| 1494 | } | ||
| 1495 | |||
| 1496 | static int perf_event_start(struct perf_event *event) | ||
| 1497 | { | ||
| 1498 | if (!event->pmu->start) | ||
| 1499 | return event->pmu->enable(event); | ||
| 1500 | |||
| 1501 | return event->pmu->start(event); | ||
| 1502 | } | ||
| 1352 | 1503 | ||
| 1353 | static void perf_adjust_period(struct perf_event *event, u64 events) | 1504 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
| 1354 | { | 1505 | { |
| 1355 | struct hw_perf_event *hwc = &event->hw; | 1506 | struct hw_perf_event *hwc = &event->hw; |
| 1356 | u64 period, sample_period; | 1507 | u64 period, sample_period; |
| 1357 | s64 delta; | 1508 | s64 delta; |
| 1358 | 1509 | ||
| 1359 | events *= hwc->sample_period; | 1510 | period = perf_calculate_period(event, nsec, count); |
| 1360 | period = div64_u64(events, event->attr.sample_freq); | ||
| 1361 | 1511 | ||
| 1362 | delta = (s64)(period - hwc->sample_period); | 1512 | delta = (s64)(period - hwc->sample_period); |
| 1363 | delta = (delta + 7) / 8; /* low pass filter */ | 1513 | delta = (delta + 7) / 8; /* low pass filter */ |
| @@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) | |||
| 1368 | sample_period = 1; | 1518 | sample_period = 1; |
| 1369 | 1519 | ||
| 1370 | hwc->sample_period = sample_period; | 1520 | hwc->sample_period = sample_period; |
| 1521 | |||
| 1522 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { | ||
| 1523 | perf_disable(); | ||
| 1524 | perf_event_stop(event); | ||
| 1525 | atomic64_set(&hwc->period_left, 0); | ||
| 1526 | perf_event_start(event); | ||
| 1527 | perf_enable(); | ||
| 1528 | } | ||
| 1371 | } | 1529 | } |
| 1372 | 1530 | ||
| 1373 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1531 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) |
| 1374 | { | 1532 | { |
| 1375 | struct perf_event *event; | 1533 | struct perf_event *event; |
| 1376 | struct hw_perf_event *hwc; | 1534 | struct hw_perf_event *hwc; |
| 1377 | u64 interrupts, freq; | 1535 | u64 interrupts, now; |
| 1536 | s64 delta; | ||
| 1378 | 1537 | ||
| 1379 | raw_spin_lock(&ctx->lock); | 1538 | raw_spin_lock(&ctx->lock); |
| 1380 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 1539 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
| @@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
| 1395 | if (interrupts == MAX_INTERRUPTS) { | 1554 | if (interrupts == MAX_INTERRUPTS) { |
| 1396 | perf_log_throttle(event, 1); | 1555 | perf_log_throttle(event, 1); |
| 1397 | event->pmu->unthrottle(event); | 1556 | event->pmu->unthrottle(event); |
| 1398 | interrupts = 2*sysctl_perf_event_sample_rate/HZ; | ||
| 1399 | } | 1557 | } |
| 1400 | 1558 | ||
| 1401 | if (!event->attr.freq || !event->attr.sample_freq) | 1559 | if (!event->attr.freq || !event->attr.sample_freq) |
| 1402 | continue; | 1560 | continue; |
| 1403 | 1561 | ||
| 1404 | /* | 1562 | event->pmu->read(event); |
| 1405 | * if the specified freq < HZ then we need to skip ticks | 1563 | now = atomic64_read(&event->count); |
| 1406 | */ | 1564 | delta = now - hwc->freq_count_stamp; |
| 1407 | if (event->attr.sample_freq < HZ) { | 1565 | hwc->freq_count_stamp = now; |
| 1408 | freq = event->attr.sample_freq; | ||
| 1409 | |||
| 1410 | hwc->freq_count += freq; | ||
| 1411 | hwc->freq_interrupts += interrupts; | ||
| 1412 | |||
| 1413 | if (hwc->freq_count < HZ) | ||
| 1414 | continue; | ||
| 1415 | |||
| 1416 | interrupts = hwc->freq_interrupts; | ||
| 1417 | hwc->freq_interrupts = 0; | ||
| 1418 | hwc->freq_count -= HZ; | ||
| 1419 | } else | ||
| 1420 | freq = HZ; | ||
| 1421 | |||
| 1422 | perf_adjust_period(event, freq * interrupts); | ||
| 1423 | 1566 | ||
| 1424 | /* | 1567 | if (delta > 0) |
| 1425 | * In order to avoid being stalled by an (accidental) huge | 1568 | perf_adjust_period(event, TICK_NSEC, delta); |
| 1426 | * sample period, force reset the sample period if we didn't | ||
| 1427 | * get any events in this freq period. | ||
| 1428 | */ | ||
| 1429 | if (!interrupts) { | ||
| 1430 | perf_disable(); | ||
| 1431 | event->pmu->disable(event); | ||
| 1432 | atomic64_set(&hwc->period_left, 0); | ||
| 1433 | event->pmu->enable(event); | ||
| 1434 | perf_enable(); | ||
| 1435 | } | ||
| 1436 | } | 1569 | } |
| 1437 | raw_spin_unlock(&ctx->lock); | 1570 | raw_spin_unlock(&ctx->lock); |
| 1438 | } | 1571 | } |
| @@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
| 1442 | */ | 1575 | */ |
| 1443 | static void rotate_ctx(struct perf_event_context *ctx) | 1576 | static void rotate_ctx(struct perf_event_context *ctx) |
| 1444 | { | 1577 | { |
| 1445 | struct perf_event *event; | ||
| 1446 | |||
| 1447 | if (!ctx->nr_events) | 1578 | if (!ctx->nr_events) |
| 1448 | return; | 1579 | return; |
| 1449 | 1580 | ||
| 1450 | raw_spin_lock(&ctx->lock); | 1581 | raw_spin_lock(&ctx->lock); |
| 1451 | /* | 1582 | |
| 1452 | * Rotate the first entry last (works just fine for group events too): | 1583 | /* Rotate the first entry last of non-pinned groups */ |
| 1453 | */ | 1584 | list_rotate_left(&ctx->flexible_groups); |
| 1454 | perf_disable(); | ||
| 1455 | list_for_each_entry(event, &ctx->group_list, group_entry) { | ||
| 1456 | list_move_tail(&event->group_entry, &ctx->group_list); | ||
| 1457 | break; | ||
| 1458 | } | ||
| 1459 | perf_enable(); | ||
| 1460 | 1585 | ||
| 1461 | raw_spin_unlock(&ctx->lock); | 1586 | raw_spin_unlock(&ctx->lock); |
| 1462 | } | 1587 | } |
| 1463 | 1588 | ||
| 1464 | void perf_event_task_tick(struct task_struct *curr, int cpu) | 1589 | void perf_event_task_tick(struct task_struct *curr) |
| 1465 | { | 1590 | { |
| 1466 | struct perf_cpu_context *cpuctx; | 1591 | struct perf_cpu_context *cpuctx; |
| 1467 | struct perf_event_context *ctx; | 1592 | struct perf_event_context *ctx; |
| @@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) | |||
| 1469 | if (!atomic_read(&nr_events)) | 1594 | if (!atomic_read(&nr_events)) |
| 1470 | return; | 1595 | return; |
| 1471 | 1596 | ||
| 1472 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 1597 | cpuctx = &__get_cpu_var(perf_cpu_context); |
| 1473 | ctx = curr->perf_event_ctxp; | 1598 | ctx = curr->perf_event_ctxp; |
| 1474 | 1599 | ||
| 1600 | perf_disable(); | ||
| 1601 | |||
| 1475 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1602 | perf_ctx_adjust_freq(&cpuctx->ctx); |
| 1476 | if (ctx) | 1603 | if (ctx) |
| 1477 | perf_ctx_adjust_freq(ctx); | 1604 | perf_ctx_adjust_freq(ctx); |
| 1478 | 1605 | ||
| 1479 | perf_event_cpu_sched_out(cpuctx); | 1606 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 1480 | if (ctx) | 1607 | if (ctx) |
| 1481 | __perf_event_task_sched_out(ctx); | 1608 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
| 1482 | 1609 | ||
| 1483 | rotate_ctx(&cpuctx->ctx); | 1610 | rotate_ctx(&cpuctx->ctx); |
| 1484 | if (ctx) | 1611 | if (ctx) |
| 1485 | rotate_ctx(ctx); | 1612 | rotate_ctx(ctx); |
| 1486 | 1613 | ||
| 1487 | perf_event_cpu_sched_in(cpuctx, cpu); | 1614 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
| 1488 | if (ctx) | 1615 | if (ctx) |
| 1489 | perf_event_task_sched_in(curr, cpu); | 1616 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); |
| 1617 | |||
| 1618 | perf_enable(); | ||
| 1619 | } | ||
| 1620 | |||
| 1621 | static int event_enable_on_exec(struct perf_event *event, | ||
| 1622 | struct perf_event_context *ctx) | ||
| 1623 | { | ||
| 1624 | if (!event->attr.enable_on_exec) | ||
| 1625 | return 0; | ||
| 1626 | |||
| 1627 | event->attr.enable_on_exec = 0; | ||
| 1628 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | ||
| 1629 | return 0; | ||
| 1630 | |||
| 1631 | __perf_event_mark_enabled(event, ctx); | ||
| 1632 | |||
| 1633 | return 1; | ||
| 1490 | } | 1634 | } |
| 1491 | 1635 | ||
| 1492 | /* | 1636 | /* |
| @@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
| 1499 | struct perf_event *event; | 1643 | struct perf_event *event; |
| 1500 | unsigned long flags; | 1644 | unsigned long flags; |
| 1501 | int enabled = 0; | 1645 | int enabled = 0; |
| 1646 | int ret; | ||
| 1502 | 1647 | ||
| 1503 | local_irq_save(flags); | 1648 | local_irq_save(flags); |
| 1504 | ctx = task->perf_event_ctxp; | 1649 | ctx = task->perf_event_ctxp; |
| @@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
| 1509 | 1654 | ||
| 1510 | raw_spin_lock(&ctx->lock); | 1655 | raw_spin_lock(&ctx->lock); |
| 1511 | 1656 | ||
| 1512 | list_for_each_entry(event, &ctx->group_list, group_entry) { | 1657 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
| 1513 | if (!event->attr.enable_on_exec) | 1658 | ret = event_enable_on_exec(event, ctx); |
| 1514 | continue; | 1659 | if (ret) |
| 1515 | event->attr.enable_on_exec = 0; | 1660 | enabled = 1; |
| 1516 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1661 | } |
| 1517 | continue; | 1662 | |
| 1518 | __perf_event_mark_enabled(event, ctx); | 1663 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { |
| 1519 | enabled = 1; | 1664 | ret = event_enable_on_exec(event, ctx); |
| 1665 | if (ret) | ||
| 1666 | enabled = 1; | ||
| 1520 | } | 1667 | } |
| 1521 | 1668 | ||
| 1522 | /* | 1669 | /* |
| @@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
| 1527 | 1674 | ||
| 1528 | raw_spin_unlock(&ctx->lock); | 1675 | raw_spin_unlock(&ctx->lock); |
| 1529 | 1676 | ||
| 1530 | perf_event_task_sched_in(task, smp_processor_id()); | 1677 | perf_event_task_sched_in(task); |
| 1531 | out: | 1678 | out: |
| 1532 | local_irq_restore(flags); | 1679 | local_irq_restore(flags); |
| 1533 | } | 1680 | } |
| @@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
| 1590 | { | 1737 | { |
| 1591 | raw_spin_lock_init(&ctx->lock); | 1738 | raw_spin_lock_init(&ctx->lock); |
| 1592 | mutex_init(&ctx->mutex); | 1739 | mutex_init(&ctx->mutex); |
| 1593 | INIT_LIST_HEAD(&ctx->group_list); | 1740 | INIT_LIST_HEAD(&ctx->pinned_groups); |
| 1741 | INIT_LIST_HEAD(&ctx->flexible_groups); | ||
| 1594 | INIT_LIST_HEAD(&ctx->event_list); | 1742 | INIT_LIST_HEAD(&ctx->event_list); |
| 1595 | atomic_set(&ctx->refcount, 1); | 1743 | atomic_set(&ctx->refcount, 1); |
| 1596 | ctx->task = task; | 1744 | ctx->task = task; |
| @@ -3259,8 +3407,6 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3259 | task_event->event_id.tid = perf_event_tid(event, task); | 3407 | task_event->event_id.tid = perf_event_tid(event, task); |
| 3260 | task_event->event_id.ptid = perf_event_tid(event, current); | 3408 | task_event->event_id.ptid = perf_event_tid(event, current); |
| 3261 | 3409 | ||
| 3262 | task_event->event_id.time = perf_clock(); | ||
| 3263 | |||
| 3264 | perf_output_put(&handle, task_event->event_id); | 3410 | perf_output_put(&handle, task_event->event_id); |
| 3265 | 3411 | ||
| 3266 | perf_output_end(&handle); | 3412 | perf_output_end(&handle); |
| @@ -3268,7 +3414,7 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3268 | 3414 | ||
| 3269 | static int perf_event_task_match(struct perf_event *event) | 3415 | static int perf_event_task_match(struct perf_event *event) |
| 3270 | { | 3416 | { |
| 3271 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3417 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3272 | return 0; | 3418 | return 0; |
| 3273 | 3419 | ||
| 3274 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3420 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| @@ -3300,7 +3446,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
| 3300 | cpuctx = &get_cpu_var(perf_cpu_context); | 3446 | cpuctx = &get_cpu_var(perf_cpu_context); |
| 3301 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3447 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
| 3302 | if (!ctx) | 3448 | if (!ctx) |
| 3303 | ctx = rcu_dereference(task_event->task->perf_event_ctxp); | 3449 | ctx = rcu_dereference(current->perf_event_ctxp); |
| 3304 | if (ctx) | 3450 | if (ctx) |
| 3305 | perf_event_task_ctx(ctx, task_event); | 3451 | perf_event_task_ctx(ctx, task_event); |
| 3306 | put_cpu_var(perf_cpu_context); | 3452 | put_cpu_var(perf_cpu_context); |
| @@ -3331,6 +3477,7 @@ static void perf_event_task(struct task_struct *task, | |||
| 3331 | /* .ppid */ | 3477 | /* .ppid */ |
| 3332 | /* .tid */ | 3478 | /* .tid */ |
| 3333 | /* .ptid */ | 3479 | /* .ptid */ |
| 3480 | .time = perf_clock(), | ||
| 3334 | }, | 3481 | }, |
| 3335 | }; | 3482 | }; |
| 3336 | 3483 | ||
| @@ -3380,7 +3527,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 3380 | 3527 | ||
| 3381 | static int perf_event_comm_match(struct perf_event *event) | 3528 | static int perf_event_comm_match(struct perf_event *event) |
| 3382 | { | 3529 | { |
| 3383 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3530 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3384 | return 0; | 3531 | return 0; |
| 3385 | 3532 | ||
| 3386 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3533 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| @@ -3500,7 +3647,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 3500 | static int perf_event_mmap_match(struct perf_event *event, | 3647 | static int perf_event_mmap_match(struct perf_event *event, |
| 3501 | struct perf_mmap_event *mmap_event) | 3648 | struct perf_mmap_event *mmap_event) |
| 3502 | { | 3649 | { |
| 3503 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3650 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3504 | return 0; | 3651 | return 0; |
| 3505 | 3652 | ||
| 3506 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3653 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| @@ -3609,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) | |||
| 3609 | /* .tid */ | 3756 | /* .tid */ |
| 3610 | .start = vma->vm_start, | 3757 | .start = vma->vm_start, |
| 3611 | .len = vma->vm_end - vma->vm_start, | 3758 | .len = vma->vm_end - vma->vm_start, |
| 3612 | .pgoff = vma->vm_pgoff, | 3759 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
| 3613 | }, | 3760 | }, |
| 3614 | }; | 3761 | }; |
| 3615 | 3762 | ||
| @@ -3689,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 3689 | 3836 | ||
| 3690 | if (event->attr.freq) { | 3837 | if (event->attr.freq) { |
| 3691 | u64 now = perf_clock(); | 3838 | u64 now = perf_clock(); |
| 3692 | s64 delta = now - hwc->freq_stamp; | 3839 | s64 delta = now - hwc->freq_time_stamp; |
| 3693 | 3840 | ||
| 3694 | hwc->freq_stamp = now; | 3841 | hwc->freq_time_stamp = now; |
| 3695 | 3842 | ||
| 3696 | if (delta > 0 && delta < TICK_NSEC) | 3843 | if (delta > 0 && delta < 2*TICK_NSEC) |
| 3697 | perf_adjust_period(event, NSEC_PER_SEC / (int)delta); | 3844 | perf_adjust_period(event, delta, hwc->last_period); |
| 3698 | } | 3845 | } |
| 3699 | 3846 | ||
| 3700 | /* | 3847 | /* |
| @@ -4185,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = { | |||
| 4185 | .read = task_clock_perf_event_read, | 4332 | .read = task_clock_perf_event_read, |
| 4186 | }; | 4333 | }; |
| 4187 | 4334 | ||
| 4188 | #ifdef CONFIG_EVENT_PROFILE | 4335 | #ifdef CONFIG_EVENT_TRACING |
| 4189 | 4336 | ||
| 4190 | void perf_tp_event(int event_id, u64 addr, u64 count, void *record, | 4337 | void perf_tp_event(int event_id, u64 addr, u64 count, void *record, |
| 4191 | int entry_size) | 4338 | int entry_size) |
| @@ -4290,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 4290 | { | 4437 | { |
| 4291 | } | 4438 | } |
| 4292 | 4439 | ||
| 4293 | #endif /* CONFIG_EVENT_PROFILE */ | 4440 | #endif /* CONFIG_EVENT_TRACING */ |
| 4294 | 4441 | ||
| 4295 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4442 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
| 4296 | static void bp_perf_event_destroy(struct perf_event *event) | 4443 | static void bp_perf_event_destroy(struct perf_event *event) |
| @@ -4580,7 +4727,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 4580 | if (attr->type >= PERF_TYPE_MAX) | 4727 | if (attr->type >= PERF_TYPE_MAX) |
| 4581 | return -EINVAL; | 4728 | return -EINVAL; |
| 4582 | 4729 | ||
| 4583 | if (attr->__reserved_1 || attr->__reserved_2) | 4730 | if (attr->__reserved_1) |
| 4584 | return -EINVAL; | 4731 | return -EINVAL; |
| 4585 | 4732 | ||
| 4586 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) | 4733 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) |
| @@ -4871,8 +5018,15 @@ inherit_event(struct perf_event *parent_event, | |||
| 4871 | else | 5018 | else |
| 4872 | child_event->state = PERF_EVENT_STATE_OFF; | 5019 | child_event->state = PERF_EVENT_STATE_OFF; |
| 4873 | 5020 | ||
| 4874 | if (parent_event->attr.freq) | 5021 | if (parent_event->attr.freq) { |
| 4875 | child_event->hw.sample_period = parent_event->hw.sample_period; | 5022 | u64 sample_period = parent_event->hw.sample_period; |
| 5023 | struct hw_perf_event *hwc = &child_event->hw; | ||
| 5024 | |||
| 5025 | hwc->sample_period = sample_period; | ||
| 5026 | hwc->last_period = sample_period; | ||
| 5027 | |||
| 5028 | atomic64_set(&hwc->period_left, sample_period); | ||
| 5029 | } | ||
| 4876 | 5030 | ||
| 4877 | child_event->overflow_handler = parent_event->overflow_handler; | 5031 | child_event->overflow_handler = parent_event->overflow_handler; |
| 4878 | 5032 | ||
| @@ -5040,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child) | |||
| 5040 | mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); | 5194 | mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); |
| 5041 | 5195 | ||
| 5042 | again: | 5196 | again: |
| 5043 | list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, | 5197 | list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, |
| 5198 | group_entry) | ||
| 5199 | __perf_event_exit_task(child_event, child_ctx, child); | ||
| 5200 | |||
| 5201 | list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, | ||
| 5044 | group_entry) | 5202 | group_entry) |
| 5045 | __perf_event_exit_task(child_event, child_ctx, child); | 5203 | __perf_event_exit_task(child_event, child_ctx, child); |
| 5046 | 5204 | ||
| @@ -5049,7 +5207,8 @@ again: | |||
| 5049 | * its siblings to the list, but we obtained 'tmp' before that which | 5207 | * its siblings to the list, but we obtained 'tmp' before that which |
| 5050 | * will still point to the list head terminating the iteration. | 5208 | * will still point to the list head terminating the iteration. |
| 5051 | */ | 5209 | */ |
| 5052 | if (!list_empty(&child_ctx->group_list)) | 5210 | if (!list_empty(&child_ctx->pinned_groups) || |
| 5211 | !list_empty(&child_ctx->flexible_groups)) | ||
| 5053 | goto again; | 5212 | goto again; |
| 5054 | 5213 | ||
| 5055 | mutex_unlock(&child_ctx->mutex); | 5214 | mutex_unlock(&child_ctx->mutex); |
| @@ -5057,6 +5216,24 @@ again: | |||
| 5057 | put_ctx(child_ctx); | 5216 | put_ctx(child_ctx); |
| 5058 | } | 5217 | } |
| 5059 | 5218 | ||
| 5219 | static void perf_free_event(struct perf_event *event, | ||
| 5220 | struct perf_event_context *ctx) | ||
| 5221 | { | ||
| 5222 | struct perf_event *parent = event->parent; | ||
| 5223 | |||
| 5224 | if (WARN_ON_ONCE(!parent)) | ||
| 5225 | return; | ||
| 5226 | |||
| 5227 | mutex_lock(&parent->child_mutex); | ||
| 5228 | list_del_init(&event->child_list); | ||
| 5229 | mutex_unlock(&parent->child_mutex); | ||
| 5230 | |||
| 5231 | fput(parent->filp); | ||
| 5232 | |||
| 5233 | list_del_event(event, ctx); | ||
| 5234 | free_event(event); | ||
| 5235 | } | ||
| 5236 | |||
| 5060 | /* | 5237 | /* |
| 5061 | * free an unexposed, unused context as created by inheritance by | 5238 | * free an unexposed, unused context as created by inheritance by |
| 5062 | * init_task below, used by fork() in case of fail. | 5239 | * init_task below, used by fork() in case of fail. |
| @@ -5071,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task) | |||
| 5071 | 5248 | ||
| 5072 | mutex_lock(&ctx->mutex); | 5249 | mutex_lock(&ctx->mutex); |
| 5073 | again: | 5250 | again: |
| 5074 | list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { | 5251 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
| 5075 | struct perf_event *parent = event->parent; | 5252 | perf_free_event(event, ctx); |
| 5076 | 5253 | ||
| 5077 | if (WARN_ON_ONCE(!parent)) | 5254 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
| 5078 | continue; | 5255 | group_entry) |
| 5256 | perf_free_event(event, ctx); | ||
| 5257 | |||
| 5258 | if (!list_empty(&ctx->pinned_groups) || | ||
| 5259 | !list_empty(&ctx->flexible_groups)) | ||
| 5260 | goto again; | ||
| 5079 | 5261 | ||
| 5080 | mutex_lock(&parent->child_mutex); | 5262 | mutex_unlock(&ctx->mutex); |
| 5081 | list_del_init(&event->child_list); | ||
| 5082 | mutex_unlock(&parent->child_mutex); | ||
| 5083 | 5263 | ||
| 5084 | fput(parent->filp); | 5264 | put_ctx(ctx); |
| 5265 | } | ||
| 5085 | 5266 | ||
| 5086 | list_del_event(event, ctx); | 5267 | static int |
| 5087 | free_event(event); | 5268 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
| 5269 | struct perf_event_context *parent_ctx, | ||
| 5270 | struct task_struct *child, | ||
| 5271 | int *inherited_all) | ||
| 5272 | { | ||
| 5273 | int ret; | ||
| 5274 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | ||
| 5275 | |||
| 5276 | if (!event->attr.inherit) { | ||
| 5277 | *inherited_all = 0; | ||
| 5278 | return 0; | ||
| 5088 | } | 5279 | } |
| 5089 | 5280 | ||
| 5090 | if (!list_empty(&ctx->group_list)) | 5281 | if (!child_ctx) { |
| 5091 | goto again; | 5282 | /* |
| 5283 | * This is executed from the parent task context, so | ||
| 5284 | * inherit events that have been marked for cloning. | ||
| 5285 | * First allocate and initialize a context for the | ||
| 5286 | * child. | ||
| 5287 | */ | ||
| 5092 | 5288 | ||
| 5093 | mutex_unlock(&ctx->mutex); | 5289 | child_ctx = kzalloc(sizeof(struct perf_event_context), |
| 5290 | GFP_KERNEL); | ||
| 5291 | if (!child_ctx) | ||
| 5292 | return -ENOMEM; | ||
| 5094 | 5293 | ||
| 5095 | put_ctx(ctx); | 5294 | __perf_event_init_context(child_ctx, child); |
| 5295 | child->perf_event_ctxp = child_ctx; | ||
| 5296 | get_task_struct(child); | ||
| 5297 | } | ||
| 5298 | |||
| 5299 | ret = inherit_group(event, parent, parent_ctx, | ||
| 5300 | child, child_ctx); | ||
| 5301 | |||
| 5302 | if (ret) | ||
| 5303 | *inherited_all = 0; | ||
| 5304 | |||
| 5305 | return ret; | ||
| 5096 | } | 5306 | } |
| 5097 | 5307 | ||
| 5308 | |||
| 5098 | /* | 5309 | /* |
| 5099 | * Initialize the perf_event context in task_struct | 5310 | * Initialize the perf_event context in task_struct |
| 5100 | */ | 5311 | */ |
| 5101 | int perf_event_init_task(struct task_struct *child) | 5312 | int perf_event_init_task(struct task_struct *child) |
| 5102 | { | 5313 | { |
| 5103 | struct perf_event_context *child_ctx = NULL, *parent_ctx; | 5314 | struct perf_event_context *child_ctx, *parent_ctx; |
| 5104 | struct perf_event_context *cloned_ctx; | 5315 | struct perf_event_context *cloned_ctx; |
| 5105 | struct perf_event *event; | 5316 | struct perf_event *event; |
| 5106 | struct task_struct *parent = current; | 5317 | struct task_struct *parent = current; |
| @@ -5138,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child) | |||
| 5138 | * We dont have to disable NMIs - we are only looking at | 5349 | * We dont have to disable NMIs - we are only looking at |
| 5139 | * the list, not manipulating it: | 5350 | * the list, not manipulating it: |
| 5140 | */ | 5351 | */ |
| 5141 | list_for_each_entry(event, &parent_ctx->group_list, group_entry) { | 5352 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
| 5142 | 5353 | ret = inherit_task_group(event, parent, parent_ctx, child, | |
| 5143 | if (!event->attr.inherit) { | 5354 | &inherited_all); |
| 5144 | inherited_all = 0; | 5355 | if (ret) |
| 5145 | continue; | 5356 | break; |
| 5146 | } | 5357 | } |
| 5147 | |||
| 5148 | if (!child->perf_event_ctxp) { | ||
| 5149 | /* | ||
| 5150 | * This is executed from the parent task context, so | ||
| 5151 | * inherit events that have been marked for cloning. | ||
| 5152 | * First allocate and initialize a context for the | ||
| 5153 | * child. | ||
| 5154 | */ | ||
| 5155 | |||
| 5156 | child_ctx = kzalloc(sizeof(struct perf_event_context), | ||
| 5157 | GFP_KERNEL); | ||
| 5158 | if (!child_ctx) { | ||
| 5159 | ret = -ENOMEM; | ||
| 5160 | break; | ||
| 5161 | } | ||
| 5162 | |||
| 5163 | __perf_event_init_context(child_ctx, child); | ||
| 5164 | child->perf_event_ctxp = child_ctx; | ||
| 5165 | get_task_struct(child); | ||
| 5166 | } | ||
| 5167 | 5358 | ||
| 5168 | ret = inherit_group(event, parent, parent_ctx, | 5359 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
| 5169 | child, child_ctx); | 5360 | ret = inherit_task_group(event, parent, parent_ctx, child, |
| 5170 | if (ret) { | 5361 | &inherited_all); |
| 5171 | inherited_all = 0; | 5362 | if (ret) |
| 5172 | break; | 5363 | break; |
| 5173 | } | ||
| 5174 | } | 5364 | } |
| 5175 | 5365 | ||
| 5366 | child_ctx = child->perf_event_ctxp; | ||
| 5367 | |||
| 5176 | if (child_ctx && inherited_all) { | 5368 | if (child_ctx && inherited_all) { |
| 5177 | /* | 5369 | /* |
| 5178 | * Mark the child context as a clone of the parent | 5370 | * Mark the child context as a clone of the parent |
| @@ -5221,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info) | |||
| 5221 | struct perf_event_context *ctx = &cpuctx->ctx; | 5413 | struct perf_event_context *ctx = &cpuctx->ctx; |
| 5222 | struct perf_event *event, *tmp; | 5414 | struct perf_event *event, *tmp; |
| 5223 | 5415 | ||
| 5224 | list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) | 5416 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
| 5417 | __perf_event_remove_from_context(event); | ||
| 5418 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | ||
| 5225 | __perf_event_remove_from_context(event); | 5419 | __perf_event_remove_from_context(event); |
| 5226 | } | 5420 | } |
| 5227 | static void perf_event_exit_cpu(int cpu) | 5421 | static void perf_event_exit_cpu(int cpu) |
| @@ -5259,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
| 5259 | perf_event_exit_cpu(cpu); | 5453 | perf_event_exit_cpu(cpu); |
| 5260 | break; | 5454 | break; |
| 5261 | 5455 | ||
| 5456 | case CPU_DEAD: | ||
| 5457 | hw_perf_event_setup_offline(cpu); | ||
| 5458 | break; | ||
| 5459 | |||
| 5262 | default: | 5460 | default: |
| 5263 | break; | 5461 | break; |
| 5264 | } | 5462 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index 2e17c9c92cbe..b08e697cd83f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
| 367 | struct task_struct *result = NULL; | 367 | struct task_struct *result = NULL; |
| 368 | if (pid) { | 368 | if (pid) { |
| 369 | struct hlist_node *first; | 369 | struct hlist_node *first; |
| 370 | first = rcu_dereference(pid->tasks[type].first); | 370 | first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); |
| 371 | if (first) | 371 | if (first) |
| 372 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 372 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
| 373 | } | 373 | } |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 495440779ce3..00d1fda58ab6 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, | |||
| 256 | return 0; | 256 | return 0; |
| 257 | } | 257 | } |
| 258 | 258 | ||
| 259 | int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) | 259 | static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) |
| 260 | { | 260 | { |
| 261 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 261 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
| 262 | return 0; | 262 | return 0; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb2..5c36ea9d55d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -27,6 +27,15 @@ config PM_DEBUG | |||
| 27 | code. This is helpful when debugging and reporting PM bugs, like | 27 | code. This is helpful when debugging and reporting PM bugs, like |
| 28 | suspend support. | 28 | suspend support. |
| 29 | 29 | ||
| 30 | config PM_ADVANCED_DEBUG | ||
| 31 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
| 32 | depends on PM_DEBUG | ||
| 33 | default n | ||
| 34 | ---help--- | ||
| 35 | Add extra sysfs attributes allowing one to access some Power Management | ||
| 36 | fields of device objects from user space. If you are not a kernel | ||
| 37 | developer interested in debugging/testing Power Management, say "no". | ||
| 38 | |||
| 30 | config PM_VERBOSE | 39 | config PM_VERBOSE |
| 31 | bool "Verbose Power Management debugging" | 40 | bool "Verbose Power Management debugging" |
| 32 | depends on PM_DEBUG | 41 | depends on PM_DEBUG |
| @@ -85,6 +94,11 @@ config PM_SLEEP | |||
| 85 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | 94 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE |
| 86 | default y | 95 | default y |
| 87 | 96 | ||
| 97 | config PM_SLEEP_ADVANCED_DEBUG | ||
| 98 | bool | ||
| 99 | depends on PM_ADVANCED_DEBUG | ||
| 100 | default n | ||
| 101 | |||
| 88 | config SUSPEND | 102 | config SUSPEND |
| 89 | bool "Suspend to RAM and standby" | 103 | bool "Suspend to RAM and standby" |
| 90 | depends on PM && ARCH_SUSPEND_POSSIBLE | 104 | depends on PM && ARCH_SUSPEND_POSSIBLE |
| @@ -222,3 +236,8 @@ config PM_RUNTIME | |||
| 222 | and the bus type drivers of the buses the devices are on are | 236 | and the bus type drivers of the buses the devices are on are |
| 223 | responsible for the actual handling of the autosuspend requests and | 237 | responsible for the actual handling of the autosuspend requests and |
| 224 | wake-up events. | 238 | wake-up events. |
| 239 | |||
| 240 | config PM_OPS | ||
| 241 | bool | ||
| 242 | depends on PM_SLEEP || PM_RUNTIME | ||
| 243 | default y | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c7139053..b58800b21fc0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) | |||
| 44 | == NOTIFY_BAD) ? -EINVAL : 0; | 44 | == NOTIFY_BAD) ? -EINVAL : 0; |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | /* If set, devices may be suspended and resumed asynchronously. */ | ||
| 48 | int pm_async_enabled = 1; | ||
| 49 | |||
| 50 | static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 51 | char *buf) | ||
| 52 | { | ||
| 53 | return sprintf(buf, "%d\n", pm_async_enabled); | ||
| 54 | } | ||
| 55 | |||
| 56 | static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 57 | const char *buf, size_t n) | ||
| 58 | { | ||
| 59 | unsigned long val; | ||
| 60 | |||
| 61 | if (strict_strtoul(buf, 10, &val)) | ||
| 62 | return -EINVAL; | ||
| 63 | |||
| 64 | if (val > 1) | ||
| 65 | return -EINVAL; | ||
| 66 | |||
| 67 | pm_async_enabled = val; | ||
| 68 | return n; | ||
| 69 | } | ||
| 70 | |||
| 71 | power_attr(pm_async); | ||
| 72 | |||
| 47 | #ifdef CONFIG_PM_DEBUG | 73 | #ifdef CONFIG_PM_DEBUG |
| 48 | int pm_test_level = TEST_NONE; | 74 | int pm_test_level = TEST_NONE; |
| 49 | 75 | ||
| @@ -208,9 +234,12 @@ static struct attribute * g[] = { | |||
| 208 | #ifdef CONFIG_PM_TRACE | 234 | #ifdef CONFIG_PM_TRACE |
| 209 | &pm_trace_attr.attr, | 235 | &pm_trace_attr.attr, |
| 210 | #endif | 236 | #endif |
| 211 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) | 237 | #ifdef CONFIG_PM_SLEEP |
| 238 | &pm_async_attr.attr, | ||
| 239 | #ifdef CONFIG_PM_DEBUG | ||
| 212 | &pm_test_attr.attr, | 240 | &pm_test_attr.attr, |
| 213 | #endif | 241 | #endif |
| 242 | #endif | ||
| 214 | NULL, | 243 | NULL, |
| 215 | }; | 244 | }; |
| 216 | 245 | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e4330..830cadecbdfc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void) | |||
| 1181 | 1181 | ||
| 1182 | memory_bm_position_reset(&copy_bm); | 1182 | memory_bm_position_reset(&copy_bm); |
| 1183 | 1183 | ||
| 1184 | while (to_free_normal > 0 && to_free_highmem > 0) { | 1184 | while (to_free_normal > 0 || to_free_highmem > 0) { |
| 1185 | unsigned long pfn = memory_bm_next_pfn(&copy_bm); | 1185 | unsigned long pfn = memory_bm_next_pfn(&copy_bm); |
| 1186 | struct page *page = pfn_to_page(pfn); | 1186 | struct page *page = pfn_to_page(pfn); |
| 1187 | 1187 | ||
| @@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void) | |||
| 1500 | { | 1500 | { |
| 1501 | unsigned int nr_pages, nr_highmem; | 1501 | unsigned int nr_pages, nr_highmem; |
| 1502 | 1502 | ||
| 1503 | printk(KERN_INFO "PM: Creating hibernation image: \n"); | 1503 | printk(KERN_INFO "PM: Creating hibernation image:\n"); |
| 1504 | 1504 | ||
| 1505 | drain_local_pages(NULL); | 1505 | drain_local_pages(NULL); |
| 1506 | nr_pages = count_data_pages(); | 1506 | nr_pages = count_data_pages(); |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9d..1d575733d4e1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p) | |||
| 657 | struct swsusp_info *header; | 657 | struct swsusp_info *header; |
| 658 | 658 | ||
| 659 | *flags_p = swsusp_header->flags; | 659 | *flags_p = swsusp_header->flags; |
| 660 | if (IS_ERR(resume_bdev)) { | ||
| 661 | pr_debug("PM: Image device not initialised\n"); | ||
| 662 | return PTR_ERR(resume_bdev); | ||
| 663 | } | ||
| 664 | 660 | ||
| 665 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 661 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
| 666 | error = snapshot_write_next(&snapshot, PAGE_SIZE); | 662 | error = snapshot_write_next(&snapshot, PAGE_SIZE); |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd1893..000000000000 --- a/kernel/power/swsusp.c +++ /dev/null | |||
| @@ -1,58 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/power/swsusp.c | ||
| 3 | * | ||
| 4 | * This file provides code to write suspend image to swap and read it back. | ||
| 5 | * | ||
| 6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | ||
| 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | ||
| 8 | * | ||
| 9 | * This file is released under the GPLv2. | ||
| 10 | * | ||
| 11 | * I'd like to thank the following people for their work: | ||
| 12 | * | ||
| 13 | * Pavel Machek <pavel@ucw.cz>: | ||
| 14 | * Modifications, defectiveness pointing, being with me at the very beginning, | ||
| 15 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | ||
| 16 | * | ||
| 17 | * Steve Doddi <dirk@loth.demon.co.uk>: | ||
| 18 | * Support the possibility of hardware state restoring. | ||
| 19 | * | ||
| 20 | * Raph <grey.havens@earthling.net>: | ||
| 21 | * Support for preserving states of network devices and virtual console | ||
| 22 | * (including X and svgatextmode) | ||
| 23 | * | ||
| 24 | * Kurt Garloff <garloff@suse.de>: | ||
| 25 | * Straightened the critical function in order to prevent compilers from | ||
| 26 | * playing tricks with local variables. | ||
| 27 | * | ||
| 28 | * Andreas Mohr <a.mohr@mailto.de> | ||
| 29 | * | ||
| 30 | * Alex Badea <vampire@go.ro>: | ||
| 31 | * Fixed runaway init | ||
| 32 | * | ||
| 33 | * Rafael J. Wysocki <rjw@sisk.pl> | ||
| 34 | * Reworked the freeing of memory and the handling of swap | ||
| 35 | * | ||
| 36 | * More state savers are welcome. Especially for the scsi layer... | ||
| 37 | * | ||
| 38 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | ||
| 39 | */ | ||
| 40 | |||
| 41 | #include <linux/mm.h> | ||
| 42 | #include <linux/suspend.h> | ||
| 43 | #include <linux/spinlock.h> | ||
| 44 | #include <linux/kernel.h> | ||
| 45 | #include <linux/major.h> | ||
| 46 | #include <linux/swap.h> | ||
| 47 | #include <linux/pm.h> | ||
| 48 | #include <linux/swapops.h> | ||
| 49 | #include <linux/bootmem.h> | ||
| 50 | #include <linux/syscalls.h> | ||
| 51 | #include <linux/highmem.h> | ||
| 52 | #include <linux/time.h> | ||
| 53 | #include <linux/rbtree.h> | ||
| 54 | #include <linux/io.h> | ||
| 55 | |||
| 56 | #include "power.h" | ||
| 57 | |||
| 58 | int in_suspend __nosavedata = 0; | ||
diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f0..4d2289626a84 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
| 195 | return res; | 195 | return res; |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | static void snapshot_deprecated_ioctl(unsigned int cmd) | ||
| 199 | { | ||
| 200 | if (printk_ratelimit()) | ||
| 201 | printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " | ||
| 202 | "be removed soon, update your suspend-to-disk " | ||
| 203 | "utilities\n", | ||
| 204 | __builtin_return_address(0), cmd); | ||
| 205 | } | ||
| 206 | |||
| 198 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, | 207 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, |
| 199 | unsigned long arg) | 208 | unsigned long arg) |
| 200 | { | 209 | { |
| @@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 246 | data->frozen = 0; | 255 | data->frozen = 0; |
| 247 | break; | 256 | break; |
| 248 | 257 | ||
| 249 | case SNAPSHOT_CREATE_IMAGE: | ||
| 250 | case SNAPSHOT_ATOMIC_SNAPSHOT: | 258 | case SNAPSHOT_ATOMIC_SNAPSHOT: |
| 259 | snapshot_deprecated_ioctl(cmd); | ||
| 260 | case SNAPSHOT_CREATE_IMAGE: | ||
| 251 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 261 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
| 252 | error = -EPERM; | 262 | error = -EPERM; |
| 253 | break; | 263 | break; |
| @@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 275 | data->ready = 0; | 285 | data->ready = 0; |
| 276 | break; | 286 | break; |
| 277 | 287 | ||
| 278 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
| 279 | case SNAPSHOT_SET_IMAGE_SIZE: | 288 | case SNAPSHOT_SET_IMAGE_SIZE: |
| 289 | snapshot_deprecated_ioctl(cmd); | ||
| 290 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
| 280 | image_size = arg; | 291 | image_size = arg; |
| 281 | break; | 292 | break; |
| 282 | 293 | ||
| @@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 290 | error = put_user(size, (loff_t __user *)arg); | 301 | error = put_user(size, (loff_t __user *)arg); |
| 291 | break; | 302 | break; |
| 292 | 303 | ||
| 293 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
| 294 | case SNAPSHOT_AVAIL_SWAP: | 304 | case SNAPSHOT_AVAIL_SWAP: |
| 305 | snapshot_deprecated_ioctl(cmd); | ||
| 306 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
| 295 | size = count_swap_pages(data->swap, 1); | 307 | size = count_swap_pages(data->swap, 1); |
| 296 | size <<= PAGE_SHIFT; | 308 | size <<= PAGE_SHIFT; |
| 297 | error = put_user(size, (loff_t __user *)arg); | 309 | error = put_user(size, (loff_t __user *)arg); |
| 298 | break; | 310 | break; |
| 299 | 311 | ||
| 300 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
| 301 | case SNAPSHOT_GET_SWAP_PAGE: | 312 | case SNAPSHOT_GET_SWAP_PAGE: |
| 313 | snapshot_deprecated_ioctl(cmd); | ||
| 314 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
| 302 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 315 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
| 303 | error = -ENODEV; | 316 | error = -ENODEV; |
| 304 | break; | 317 | break; |
| @@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 321 | break; | 334 | break; |
| 322 | 335 | ||
| 323 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ | 336 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ |
| 337 | snapshot_deprecated_ioctl(cmd); | ||
| 324 | if (!swsusp_swap_in_use()) { | 338 | if (!swsusp_swap_in_use()) { |
| 325 | /* | 339 | /* |
| 326 | * User space encodes device types as two-byte values, | 340 | * User space encodes device types as two-byte values, |
| @@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 362 | break; | 376 | break; |
| 363 | 377 | ||
| 364 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | 378 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ |
| 379 | snapshot_deprecated_ioctl(cmd); | ||
| 365 | error = -EINVAL; | 380 | error = -EINVAL; |
| 366 | 381 | ||
| 367 | switch (arg) { | 382 | switch (arg) { |
diff --git a/kernel/printk.c b/kernel/printk.c index 1751c456b71f..40674122ecf2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/kexec.h> | 35 | #include <linux/kexec.h> |
| 36 | #include <linux/ratelimit.h> | 36 | #include <linux/ratelimit.h> |
| 37 | #include <linux/kmsg_dump.h> | 37 | #include <linux/kmsg_dump.h> |
| 38 | #include <linux/syslog.h> | ||
| 38 | 39 | ||
| 39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
| 40 | 41 | ||
| @@ -258,38 +259,23 @@ static inline void boot_delay_msec(void) | |||
| 258 | } | 259 | } |
| 259 | #endif | 260 | #endif |
| 260 | 261 | ||
| 261 | /* | 262 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
| 262 | * Commands to do_syslog: | ||
| 263 | * | ||
| 264 | * 0 -- Close the log. Currently a NOP. | ||
| 265 | * 1 -- Open the log. Currently a NOP. | ||
| 266 | * 2 -- Read from the log. | ||
| 267 | * 3 -- Read all messages remaining in the ring buffer. | ||
| 268 | * 4 -- Read and clear all messages remaining in the ring buffer | ||
| 269 | * 5 -- Clear ring buffer. | ||
| 270 | * 6 -- Disable printk's to console | ||
| 271 | * 7 -- Enable printk's to console | ||
| 272 | * 8 -- Set level of messages printed to console | ||
| 273 | * 9 -- Return number of unread characters in the log buffer | ||
| 274 | * 10 -- Return size of the log buffer | ||
| 275 | */ | ||
| 276 | int do_syslog(int type, char __user *buf, int len) | ||
| 277 | { | 263 | { |
| 278 | unsigned i, j, limit, count; | 264 | unsigned i, j, limit, count; |
| 279 | int do_clear = 0; | 265 | int do_clear = 0; |
| 280 | char c; | 266 | char c; |
| 281 | int error = 0; | 267 | int error = 0; |
| 282 | 268 | ||
| 283 | error = security_syslog(type); | 269 | error = security_syslog(type, from_file); |
| 284 | if (error) | 270 | if (error) |
| 285 | return error; | 271 | return error; |
| 286 | 272 | ||
| 287 | switch (type) { | 273 | switch (type) { |
| 288 | case 0: /* Close log */ | 274 | case SYSLOG_ACTION_CLOSE: /* Close log */ |
| 289 | break; | 275 | break; |
| 290 | case 1: /* Open log */ | 276 | case SYSLOG_ACTION_OPEN: /* Open log */ |
| 291 | break; | 277 | break; |
| 292 | case 2: /* Read from log */ | 278 | case SYSLOG_ACTION_READ: /* Read from log */ |
| 293 | error = -EINVAL; | 279 | error = -EINVAL; |
| 294 | if (!buf || len < 0) | 280 | if (!buf || len < 0) |
| 295 | goto out; | 281 | goto out; |
| @@ -320,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 320 | if (!error) | 306 | if (!error) |
| 321 | error = i; | 307 | error = i; |
| 322 | break; | 308 | break; |
| 323 | case 4: /* Read/clear last kernel messages */ | 309 | /* Read/clear last kernel messages */ |
| 310 | case SYSLOG_ACTION_READ_CLEAR: | ||
| 324 | do_clear = 1; | 311 | do_clear = 1; |
| 325 | /* FALL THRU */ | 312 | /* FALL THRU */ |
| 326 | case 3: /* Read last kernel messages */ | 313 | /* Read last kernel messages */ |
| 314 | case SYSLOG_ACTION_READ_ALL: | ||
| 327 | error = -EINVAL; | 315 | error = -EINVAL; |
| 328 | if (!buf || len < 0) | 316 | if (!buf || len < 0) |
| 329 | goto out; | 317 | goto out; |
| @@ -376,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 376 | } | 364 | } |
| 377 | } | 365 | } |
| 378 | break; | 366 | break; |
| 379 | case 5: /* Clear ring buffer */ | 367 | /* Clear ring buffer */ |
| 368 | case SYSLOG_ACTION_CLEAR: | ||
| 380 | logged_chars = 0; | 369 | logged_chars = 0; |
| 381 | break; | 370 | break; |
| 382 | case 6: /* Disable logging to console */ | 371 | /* Disable logging to console */ |
| 372 | case SYSLOG_ACTION_CONSOLE_OFF: | ||
| 383 | if (saved_console_loglevel == -1) | 373 | if (saved_console_loglevel == -1) |
| 384 | saved_console_loglevel = console_loglevel; | 374 | saved_console_loglevel = console_loglevel; |
| 385 | console_loglevel = minimum_console_loglevel; | 375 | console_loglevel = minimum_console_loglevel; |
| 386 | break; | 376 | break; |
| 387 | case 7: /* Enable logging to console */ | 377 | /* Enable logging to console */ |
| 378 | case SYSLOG_ACTION_CONSOLE_ON: | ||
| 388 | if (saved_console_loglevel != -1) { | 379 | if (saved_console_loglevel != -1) { |
| 389 | console_loglevel = saved_console_loglevel; | 380 | console_loglevel = saved_console_loglevel; |
| 390 | saved_console_loglevel = -1; | 381 | saved_console_loglevel = -1; |
| 391 | } | 382 | } |
| 392 | break; | 383 | break; |
| 393 | case 8: /* Set level of messages printed to console */ | 384 | /* Set level of messages printed to console */ |
| 385 | case SYSLOG_ACTION_CONSOLE_LEVEL: | ||
| 394 | error = -EINVAL; | 386 | error = -EINVAL; |
| 395 | if (len < 1 || len > 8) | 387 | if (len < 1 || len > 8) |
| 396 | goto out; | 388 | goto out; |
| @@ -401,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 401 | saved_console_loglevel = -1; | 393 | saved_console_loglevel = -1; |
| 402 | error = 0; | 394 | error = 0; |
| 403 | break; | 395 | break; |
| 404 | case 9: /* Number of chars in the log buffer */ | 396 | /* Number of chars in the log buffer */ |
| 397 | case SYSLOG_ACTION_SIZE_UNREAD: | ||
| 405 | error = log_end - log_start; | 398 | error = log_end - log_start; |
| 406 | break; | 399 | break; |
| 407 | case 10: /* Size of the log buffer */ | 400 | /* Size of the log buffer */ |
| 401 | case SYSLOG_ACTION_SIZE_BUFFER: | ||
| 408 | error = log_buf_len; | 402 | error = log_buf_len; |
| 409 | break; | 403 | break; |
| 410 | default: | 404 | default: |
| @@ -417,7 +411,7 @@ out: | |||
| 417 | 411 | ||
| 418 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | 412 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) |
| 419 | { | 413 | { |
| 420 | return do_syslog(type, buf, len); | 414 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
| 421 | } | 415 | } |
| 422 | 416 | ||
| 423 | /* | 417 | /* |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 23bd09cd042e..42ad8ae729a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
| 23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
| 24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
| 25 | #include <linux/regset.h> | ||
| 25 | 26 | ||
| 26 | 27 | ||
| 27 | /* | 28 | /* |
| @@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data) | |||
| 511 | return 0; | 512 | return 0; |
| 512 | } | 513 | } |
| 513 | 514 | ||
| 515 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 516 | |||
| 517 | static const struct user_regset * | ||
| 518 | find_regset(const struct user_regset_view *view, unsigned int type) | ||
| 519 | { | ||
| 520 | const struct user_regset *regset; | ||
| 521 | int n; | ||
| 522 | |||
| 523 | for (n = 0; n < view->n; ++n) { | ||
| 524 | regset = view->regsets + n; | ||
| 525 | if (regset->core_note_type == type) | ||
| 526 | return regset; | ||
| 527 | } | ||
| 528 | |||
| 529 | return NULL; | ||
| 530 | } | ||
| 531 | |||
| 532 | static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | ||
| 533 | struct iovec *kiov) | ||
| 534 | { | ||
| 535 | const struct user_regset_view *view = task_user_regset_view(task); | ||
| 536 | const struct user_regset *regset = find_regset(view, type); | ||
| 537 | int regset_no; | ||
| 538 | |||
| 539 | if (!regset || (kiov->iov_len % regset->size) != 0) | ||
| 540 | return -EINVAL; | ||
| 541 | |||
| 542 | regset_no = regset - view->regsets; | ||
| 543 | kiov->iov_len = min(kiov->iov_len, | ||
| 544 | (__kernel_size_t) (regset->n * regset->size)); | ||
| 545 | |||
| 546 | if (req == PTRACE_GETREGSET) | ||
| 547 | return copy_regset_to_user(task, view, regset_no, 0, | ||
| 548 | kiov->iov_len, kiov->iov_base); | ||
| 549 | else | ||
| 550 | return copy_regset_from_user(task, view, regset_no, 0, | ||
| 551 | kiov->iov_len, kiov->iov_base); | ||
| 552 | } | ||
| 553 | |||
| 554 | #endif | ||
| 555 | |||
| 514 | int ptrace_request(struct task_struct *child, long request, | 556 | int ptrace_request(struct task_struct *child, long request, |
| 515 | long addr, long data) | 557 | long addr, long data) |
| 516 | { | 558 | { |
| @@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 573 | return 0; | 615 | return 0; |
| 574 | return ptrace_resume(child, request, SIGKILL); | 616 | return ptrace_resume(child, request, SIGKILL); |
| 575 | 617 | ||
| 618 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 619 | case PTRACE_GETREGSET: | ||
| 620 | case PTRACE_SETREGSET: | ||
| 621 | { | ||
| 622 | struct iovec kiov; | ||
| 623 | struct iovec __user *uiov = (struct iovec __user *) data; | ||
| 624 | |||
| 625 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
| 626 | return -EFAULT; | ||
| 627 | |||
| 628 | if (__get_user(kiov.iov_base, &uiov->iov_base) || | ||
| 629 | __get_user(kiov.iov_len, &uiov->iov_len)) | ||
| 630 | return -EFAULT; | ||
| 631 | |||
| 632 | ret = ptrace_regset(child, request, addr, &kiov); | ||
| 633 | if (!ret) | ||
| 634 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
| 635 | break; | ||
| 636 | } | ||
| 637 | #endif | ||
| 576 | default: | 638 | default: |
| 577 | break; | 639 | break; |
| 578 | } | 640 | } |
| @@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 711 | else | 773 | else |
| 712 | ret = ptrace_setsiginfo(child, &siginfo); | 774 | ret = ptrace_setsiginfo(child, &siginfo); |
| 713 | break; | 775 | break; |
| 776 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 777 | case PTRACE_GETREGSET: | ||
| 778 | case PTRACE_SETREGSET: | ||
| 779 | { | ||
| 780 | struct iovec kiov; | ||
| 781 | struct compat_iovec __user *uiov = | ||
| 782 | (struct compat_iovec __user *) datap; | ||
| 783 | compat_uptr_t ptr; | ||
| 784 | compat_size_t len; | ||
| 785 | |||
| 786 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | ||
| 787 | return -EFAULT; | ||
| 788 | |||
| 789 | if (__get_user(ptr, &uiov->iov_base) || | ||
| 790 | __get_user(len, &uiov->iov_len)) | ||
| 791 | return -EFAULT; | ||
| 792 | |||
| 793 | kiov.iov_base = compat_ptr(ptr); | ||
| 794 | kiov.iov_len = len; | ||
| 795 | |||
| 796 | ret = ptrace_regset(child, request, addr, &kiov); | ||
| 797 | if (!ret) | ||
| 798 | ret = __put_user(kiov.iov_len, &uiov->iov_len); | ||
| 799 | break; | ||
| 800 | } | ||
| 801 | #endif | ||
| 714 | 802 | ||
| 715 | default: | 803 | default: |
| 716 | ret = ptrace_request(child, request, addr, data); | 804 | ret = ptrace_request(child, request, addr, data); |
diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 000000000000..74e2e6114927 --- /dev/null +++ b/kernel/range.c | |||
| @@ -0,0 +1,163 @@ | |||
| 1 | /* | ||
| 2 | * Range add and subtract | ||
| 3 | */ | ||
| 4 | #include <linux/module.h> | ||
| 5 | #include <linux/init.h> | ||
| 6 | #include <linux/sort.h> | ||
| 7 | |||
| 8 | #include <linux/range.h> | ||
| 9 | |||
| 10 | #ifndef ARRAY_SIZE | ||
| 11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
| 12 | #endif | ||
| 13 | |||
| 14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | ||
| 15 | { | ||
| 16 | if (start >= end) | ||
| 17 | return nr_range; | ||
| 18 | |||
| 19 | /* Out of slots: */ | ||
| 20 | if (nr_range >= az) | ||
| 21 | return nr_range; | ||
| 22 | |||
| 23 | range[nr_range].start = start; | ||
| 24 | range[nr_range].end = end; | ||
| 25 | |||
| 26 | nr_range++; | ||
| 27 | |||
| 28 | return nr_range; | ||
| 29 | } | ||
| 30 | |||
| 31 | int add_range_with_merge(struct range *range, int az, int nr_range, | ||
| 32 | u64 start, u64 end) | ||
| 33 | { | ||
| 34 | int i; | ||
| 35 | |||
| 36 | if (start >= end) | ||
| 37 | return nr_range; | ||
| 38 | |||
| 39 | /* Try to merge it with old one: */ | ||
| 40 | for (i = 0; i < nr_range; i++) { | ||
| 41 | u64 final_start, final_end; | ||
| 42 | u64 common_start, common_end; | ||
| 43 | |||
| 44 | if (!range[i].end) | ||
| 45 | continue; | ||
| 46 | |||
| 47 | common_start = max(range[i].start, start); | ||
| 48 | common_end = min(range[i].end, end); | ||
| 49 | if (common_start > common_end) | ||
| 50 | continue; | ||
| 51 | |||
| 52 | final_start = min(range[i].start, start); | ||
| 53 | final_end = max(range[i].end, end); | ||
| 54 | |||
| 55 | range[i].start = final_start; | ||
| 56 | range[i].end = final_end; | ||
| 57 | return nr_range; | ||
| 58 | } | ||
| 59 | |||
| 60 | /* Need to add it: */ | ||
| 61 | return add_range(range, az, nr_range, start, end); | ||
| 62 | } | ||
| 63 | |||
| 64 | void subtract_range(struct range *range, int az, u64 start, u64 end) | ||
| 65 | { | ||
| 66 | int i, j; | ||
| 67 | |||
| 68 | if (start >= end) | ||
| 69 | return; | ||
| 70 | |||
| 71 | for (j = 0; j < az; j++) { | ||
| 72 | if (!range[j].end) | ||
| 73 | continue; | ||
| 74 | |||
| 75 | if (start <= range[j].start && end >= range[j].end) { | ||
| 76 | range[j].start = 0; | ||
| 77 | range[j].end = 0; | ||
| 78 | continue; | ||
| 79 | } | ||
| 80 | |||
| 81 | if (start <= range[j].start && end < range[j].end && | ||
| 82 | range[j].start < end) { | ||
| 83 | range[j].start = end; | ||
| 84 | continue; | ||
| 85 | } | ||
| 86 | |||
| 87 | |||
| 88 | if (start > range[j].start && end >= range[j].end && | ||
| 89 | range[j].end > start) { | ||
| 90 | range[j].end = start; | ||
| 91 | continue; | ||
| 92 | } | ||
| 93 | |||
| 94 | if (start > range[j].start && end < range[j].end) { | ||
| 95 | /* Find the new spare: */ | ||
| 96 | for (i = 0; i < az; i++) { | ||
| 97 | if (range[i].end == 0) | ||
| 98 | break; | ||
| 99 | } | ||
| 100 | if (i < az) { | ||
| 101 | range[i].end = range[j].end; | ||
| 102 | range[i].start = end; | ||
| 103 | } else { | ||
| 104 | printk(KERN_ERR "run of slot in ranges\n"); | ||
| 105 | } | ||
| 106 | range[j].end = start; | ||
| 107 | continue; | ||
| 108 | } | ||
| 109 | } | ||
| 110 | } | ||
| 111 | |||
| 112 | static int cmp_range(const void *x1, const void *x2) | ||
| 113 | { | ||
| 114 | const struct range *r1 = x1; | ||
| 115 | const struct range *r2 = x2; | ||
| 116 | s64 start1, start2; | ||
| 117 | |||
| 118 | start1 = r1->start; | ||
| 119 | start2 = r2->start; | ||
| 120 | |||
| 121 | return start1 - start2; | ||
| 122 | } | ||
| 123 | |||
| 124 | int clean_sort_range(struct range *range, int az) | ||
| 125 | { | ||
| 126 | int i, j, k = az - 1, nr_range = 0; | ||
| 127 | |||
| 128 | for (i = 0; i < k; i++) { | ||
| 129 | if (range[i].end) | ||
| 130 | continue; | ||
| 131 | for (j = k; j > i; j--) { | ||
| 132 | if (range[j].end) { | ||
| 133 | k = j; | ||
| 134 | break; | ||
| 135 | } | ||
| 136 | } | ||
| 137 | if (j == i) | ||
| 138 | break; | ||
| 139 | range[i].start = range[k].start; | ||
| 140 | range[i].end = range[k].end; | ||
| 141 | range[k].start = 0; | ||
| 142 | range[k].end = 0; | ||
| 143 | k--; | ||
| 144 | } | ||
| 145 | /* count it */ | ||
| 146 | for (i = 0; i < az; i++) { | ||
| 147 | if (!range[i].end) { | ||
| 148 | nr_range = i; | ||
| 149 | break; | ||
| 150 | } | ||
| 151 | } | ||
| 152 | |||
| 153 | /* sort them */ | ||
| 154 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
| 155 | |||
| 156 | return nr_range; | ||
| 157 | } | ||
| 158 | |||
| 159 | void sort_range(struct range *range, int nr_range) | ||
| 160 | { | ||
| 161 | /* sort them */ | ||
| 162 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
| 163 | } | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 9b7fd4723878..f1125c1a6321 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -44,14 +44,43 @@ | |||
| 44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
| 45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
| 46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
| 47 | #include <linux/kernel_stat.h> | ||
| 47 | 48 | ||
| 48 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 49 | static struct lock_class_key rcu_lock_key; | 50 | static struct lock_class_key rcu_lock_key; |
| 50 | struct lockdep_map rcu_lock_map = | 51 | struct lockdep_map rcu_lock_map = |
| 51 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | 52 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); |
| 52 | EXPORT_SYMBOL_GPL(rcu_lock_map); | 53 | EXPORT_SYMBOL_GPL(rcu_lock_map); |
| 54 | |||
| 55 | static struct lock_class_key rcu_bh_lock_key; | ||
| 56 | struct lockdep_map rcu_bh_lock_map = | ||
| 57 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); | ||
| 58 | EXPORT_SYMBOL_GPL(rcu_bh_lock_map); | ||
| 59 | |||
| 60 | static struct lock_class_key rcu_sched_lock_key; | ||
| 61 | struct lockdep_map rcu_sched_lock_map = | ||
| 62 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | ||
| 63 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | ||
| 53 | #endif | 64 | #endif |
| 54 | 65 | ||
| 66 | int rcu_scheduler_active __read_mostly; | ||
| 67 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
| 68 | |||
| 69 | /* | ||
| 70 | * This function is invoked towards the end of the scheduler's initialization | ||
| 71 | * process. Before this is called, the idle task might contain | ||
| 72 | * RCU read-side critical sections (during which time, this idle | ||
| 73 | * task is booting the system). After this function is called, the | ||
| 74 | * idle tasks are prohibited from containing RCU read-side critical | ||
| 75 | * sections. | ||
| 76 | */ | ||
| 77 | void rcu_scheduler_starting(void) | ||
| 78 | { | ||
| 79 | WARN_ON(num_online_cpus() != 1); | ||
| 80 | WARN_ON(nr_context_switches() > 0); | ||
| 81 | rcu_scheduler_active = 1; | ||
| 82 | } | ||
| 83 | |||
| 55 | /* | 84 | /* |
| 56 | * Awaken the corresponding synchronize_rcu() instance now that a | 85 | * Awaken the corresponding synchronize_rcu() instance now that a |
| 57 | * grace period has elapsed. | 86 | * grace period has elapsed. |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..58df55bf83ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | |||
| 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
| 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
| 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
| 64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | ||
| 65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | ||
| 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
| 64 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
| 65 | 68 | ||
| 66 | module_param(nreaders, int, 0444); | 69 | module_param(nreaders, int, 0444); |
| @@ -79,6 +82,12 @@ module_param(stutter, int, 0444); | |||
| 79 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | 82 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); |
| 80 | module_param(irqreader, int, 0444); | 83 | module_param(irqreader, int, 0444); |
| 81 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | 84 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); |
| 85 | module_param(fqs_duration, int, 0444); | ||
| 86 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | ||
| 87 | module_param(fqs_holdoff, int, 0444); | ||
| 88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | ||
| 89 | module_param(fqs_stutter, int, 0444); | ||
| 90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | ||
| 82 | module_param(torture_type, charp, 0444); | 91 | module_param(torture_type, charp, 0444); |
| 83 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
| 84 | 93 | ||
| @@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; | |||
| 99 | static struct task_struct *stats_task; | 108 | static struct task_struct *stats_task; |
| 100 | static struct task_struct *shuffler_task; | 109 | static struct task_struct *shuffler_task; |
| 101 | static struct task_struct *stutter_task; | 110 | static struct task_struct *stutter_task; |
| 111 | static struct task_struct *fqs_task; | ||
| 102 | 112 | ||
| 103 | #define RCU_TORTURE_PIPE_LEN 10 | 113 | #define RCU_TORTURE_PIPE_LEN 10 |
| 104 | 114 | ||
| @@ -263,6 +273,7 @@ struct rcu_torture_ops { | |||
| 263 | void (*deferred_free)(struct rcu_torture *p); | 273 | void (*deferred_free)(struct rcu_torture *p); |
| 264 | void (*sync)(void); | 274 | void (*sync)(void); |
| 265 | void (*cb_barrier)(void); | 275 | void (*cb_barrier)(void); |
| 276 | void (*fqs)(void); | ||
| 266 | int (*stats)(char *page); | 277 | int (*stats)(char *page); |
| 267 | int irq_capable; | 278 | int irq_capable; |
| 268 | char *name; | 279 | char *name; |
| @@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
| 347 | .deferred_free = rcu_torture_deferred_free, | 358 | .deferred_free = rcu_torture_deferred_free, |
| 348 | .sync = synchronize_rcu, | 359 | .sync = synchronize_rcu, |
| 349 | .cb_barrier = rcu_barrier, | 360 | .cb_barrier = rcu_barrier, |
| 361 | .fqs = rcu_force_quiescent_state, | ||
| 350 | .stats = NULL, | 362 | .stats = NULL, |
| 351 | .irq_capable = 1, | 363 | .irq_capable = 1, |
| 352 | .name = "rcu" | 364 | .name = "rcu" |
| @@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
| 388 | .deferred_free = rcu_sync_torture_deferred_free, | 400 | .deferred_free = rcu_sync_torture_deferred_free, |
| 389 | .sync = synchronize_rcu, | 401 | .sync = synchronize_rcu, |
| 390 | .cb_barrier = NULL, | 402 | .cb_barrier = NULL, |
| 403 | .fqs = rcu_force_quiescent_state, | ||
| 391 | .stats = NULL, | 404 | .stats = NULL, |
| 392 | .irq_capable = 1, | 405 | .irq_capable = 1, |
| 393 | .name = "rcu_sync" | 406 | .name = "rcu_sync" |
| @@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
| 403 | .deferred_free = rcu_sync_torture_deferred_free, | 416 | .deferred_free = rcu_sync_torture_deferred_free, |
| 404 | .sync = synchronize_rcu_expedited, | 417 | .sync = synchronize_rcu_expedited, |
| 405 | .cb_barrier = NULL, | 418 | .cb_barrier = NULL, |
| 419 | .fqs = rcu_force_quiescent_state, | ||
| 406 | .stats = NULL, | 420 | .stats = NULL, |
| 407 | .irq_capable = 1, | 421 | .irq_capable = 1, |
| 408 | .name = "rcu_expedited" | 422 | .name = "rcu_expedited" |
| @@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
| 465 | .deferred_free = rcu_bh_torture_deferred_free, | 479 | .deferred_free = rcu_bh_torture_deferred_free, |
| 466 | .sync = rcu_bh_torture_synchronize, | 480 | .sync = rcu_bh_torture_synchronize, |
| 467 | .cb_barrier = rcu_barrier_bh, | 481 | .cb_barrier = rcu_barrier_bh, |
| 482 | .fqs = rcu_bh_force_quiescent_state, | ||
| 468 | .stats = NULL, | 483 | .stats = NULL, |
| 469 | .irq_capable = 1, | 484 | .irq_capable = 1, |
| 470 | .name = "rcu_bh" | 485 | .name = "rcu_bh" |
| @@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
| 480 | .deferred_free = rcu_sync_torture_deferred_free, | 495 | .deferred_free = rcu_sync_torture_deferred_free, |
| 481 | .sync = rcu_bh_torture_synchronize, | 496 | .sync = rcu_bh_torture_synchronize, |
| 482 | .cb_barrier = NULL, | 497 | .cb_barrier = NULL, |
| 498 | .fqs = rcu_bh_force_quiescent_state, | ||
| 483 | .stats = NULL, | 499 | .stats = NULL, |
| 484 | .irq_capable = 1, | 500 | .irq_capable = 1, |
| 485 | .name = "rcu_bh_sync" | 501 | .name = "rcu_bh_sync" |
| @@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { | |||
| 621 | .deferred_free = rcu_sched_torture_deferred_free, | 637 | .deferred_free = rcu_sched_torture_deferred_free, |
| 622 | .sync = sched_torture_synchronize, | 638 | .sync = sched_torture_synchronize, |
| 623 | .cb_barrier = rcu_barrier_sched, | 639 | .cb_barrier = rcu_barrier_sched, |
| 640 | .fqs = rcu_sched_force_quiescent_state, | ||
| 624 | .stats = NULL, | 641 | .stats = NULL, |
| 625 | .irq_capable = 1, | 642 | .irq_capable = 1, |
| 626 | .name = "sched" | 643 | .name = "sched" |
| @@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
| 636 | .deferred_free = rcu_sync_torture_deferred_free, | 653 | .deferred_free = rcu_sync_torture_deferred_free, |
| 637 | .sync = sched_torture_synchronize, | 654 | .sync = sched_torture_synchronize, |
| 638 | .cb_barrier = NULL, | 655 | .cb_barrier = NULL, |
| 656 | .fqs = rcu_sched_force_quiescent_state, | ||
| 639 | .stats = NULL, | 657 | .stats = NULL, |
| 640 | .name = "sched_sync" | 658 | .name = "sched_sync" |
| 641 | }; | 659 | }; |
| @@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
| 650 | .deferred_free = rcu_sync_torture_deferred_free, | 668 | .deferred_free = rcu_sync_torture_deferred_free, |
| 651 | .sync = synchronize_sched_expedited, | 669 | .sync = synchronize_sched_expedited, |
| 652 | .cb_barrier = NULL, | 670 | .cb_barrier = NULL, |
| 671 | .fqs = rcu_sched_force_quiescent_state, | ||
| 653 | .stats = rcu_expedited_torture_stats, | 672 | .stats = rcu_expedited_torture_stats, |
| 654 | .irq_capable = 1, | 673 | .irq_capable = 1, |
| 655 | .name = "sched_expedited" | 674 | .name = "sched_expedited" |
| 656 | }; | 675 | }; |
| 657 | 676 | ||
| 658 | /* | 677 | /* |
| 678 | * RCU torture force-quiescent-state kthread. Repeatedly induces | ||
| 679 | * bursts of calls to force_quiescent_state(), increasing the probability | ||
| 680 | * of occurrence of some important types of race conditions. | ||
| 681 | */ | ||
| 682 | static int | ||
| 683 | rcu_torture_fqs(void *arg) | ||
| 684 | { | ||
| 685 | unsigned long fqs_resume_time; | ||
| 686 | int fqs_burst_remaining; | ||
| 687 | |||
| 688 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | ||
| 689 | do { | ||
| 690 | fqs_resume_time = jiffies + fqs_stutter * HZ; | ||
| 691 | while (jiffies - fqs_resume_time > LONG_MAX) { | ||
| 692 | schedule_timeout_interruptible(1); | ||
| 693 | } | ||
| 694 | fqs_burst_remaining = fqs_duration; | ||
| 695 | while (fqs_burst_remaining > 0) { | ||
| 696 | cur_ops->fqs(); | ||
| 697 | udelay(fqs_holdoff); | ||
| 698 | fqs_burst_remaining -= fqs_holdoff; | ||
| 699 | } | ||
| 700 | rcu_stutter_wait("rcu_torture_fqs"); | ||
| 701 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 702 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); | ||
| 703 | rcutorture_shutdown_absorb("rcu_torture_fqs"); | ||
| 704 | while (!kthread_should_stop()) | ||
| 705 | schedule_timeout_uninterruptible(1); | ||
| 706 | return 0; | ||
| 707 | } | ||
| 708 | |||
| 709 | /* | ||
| 659 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 710 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
| 660 | * for that pointed to by rcu_torture_current, freeing the old structure | 711 | * for that pointed to by rcu_torture_current, freeing the old structure |
| 661 | * after a series of grace periods (the "pipeline"). | 712 | * after a series of grace periods (the "pipeline"). |
| @@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 745 | 796 | ||
| 746 | idx = cur_ops->readlock(); | 797 | idx = cur_ops->readlock(); |
| 747 | completed = cur_ops->completed(); | 798 | completed = cur_ops->completed(); |
| 748 | p = rcu_dereference(rcu_torture_current); | 799 | p = rcu_dereference_check(rcu_torture_current, |
| 800 | rcu_read_lock_held() || | ||
| 801 | rcu_read_lock_bh_held() || | ||
| 802 | rcu_read_lock_sched_held() || | ||
| 803 | srcu_read_lock_held(&srcu_ctl)); | ||
| 749 | if (p == NULL) { | 804 | if (p == NULL) { |
| 750 | /* Leave because rcu_torture_writer is not yet underway */ | 805 | /* Leave because rcu_torture_writer is not yet underway */ |
| 751 | cur_ops->readunlock(idx); | 806 | cur_ops->readunlock(idx); |
| @@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 763 | /* Should not happen, but... */ | 818 | /* Should not happen, but... */ |
| 764 | pipe_count = RCU_TORTURE_PIPE_LEN; | 819 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 765 | } | 820 | } |
| 766 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 821 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 767 | completed = cur_ops->completed() - completed; | 822 | completed = cur_ops->completed() - completed; |
| 768 | if (completed > RCU_TORTURE_PIPE_LEN) { | 823 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 769 | /* Should not happen, but... */ | 824 | /* Should not happen, but... */ |
| 770 | completed = RCU_TORTURE_PIPE_LEN; | 825 | completed = RCU_TORTURE_PIPE_LEN; |
| 771 | } | 826 | } |
| 772 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 827 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 773 | preempt_enable(); | 828 | preempt_enable(); |
| 774 | cur_ops->readunlock(idx); | 829 | cur_ops->readunlock(idx); |
| 775 | } | 830 | } |
| @@ -798,11 +853,15 @@ rcu_torture_reader(void *arg) | |||
| 798 | do { | 853 | do { |
| 799 | if (irqreader && cur_ops->irq_capable) { | 854 | if (irqreader && cur_ops->irq_capable) { |
| 800 | if (!timer_pending(&t)) | 855 | if (!timer_pending(&t)) |
| 801 | mod_timer(&t, 1); | 856 | mod_timer(&t, jiffies + 1); |
| 802 | } | 857 | } |
| 803 | idx = cur_ops->readlock(); | 858 | idx = cur_ops->readlock(); |
| 804 | completed = cur_ops->completed(); | 859 | completed = cur_ops->completed(); |
| 805 | p = rcu_dereference(rcu_torture_current); | 860 | p = rcu_dereference_check(rcu_torture_current, |
| 861 | rcu_read_lock_held() || | ||
| 862 | rcu_read_lock_bh_held() || | ||
| 863 | rcu_read_lock_sched_held() || | ||
| 864 | srcu_read_lock_held(&srcu_ctl)); | ||
| 806 | if (p == NULL) { | 865 | if (p == NULL) { |
| 807 | /* Wait for rcu_torture_writer to get underway */ | 866 | /* Wait for rcu_torture_writer to get underway */ |
| 808 | cur_ops->readunlock(idx); | 867 | cur_ops->readunlock(idx); |
| @@ -818,13 +877,13 @@ rcu_torture_reader(void *arg) | |||
| 818 | /* Should not happen, but... */ | 877 | /* Should not happen, but... */ |
| 819 | pipe_count = RCU_TORTURE_PIPE_LEN; | 878 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 820 | } | 879 | } |
| 821 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 880 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 822 | completed = cur_ops->completed() - completed; | 881 | completed = cur_ops->completed() - completed; |
| 823 | if (completed > RCU_TORTURE_PIPE_LEN) { | 882 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 824 | /* Should not happen, but... */ | 883 | /* Should not happen, but... */ |
| 825 | completed = RCU_TORTURE_PIPE_LEN; | 884 | completed = RCU_TORTURE_PIPE_LEN; |
| 826 | } | 885 | } |
| 827 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 886 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 828 | preempt_enable(); | 887 | preempt_enable(); |
| 829 | cur_ops->readunlock(idx); | 888 | cur_ops->readunlock(idx); |
| 830 | schedule(); | 889 | schedule(); |
| @@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag) | |||
| 1030 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1089 | printk(KERN_ALERT "%s" TORTURE_FLAG |
| 1031 | "--- %s: nreaders=%d nfakewriters=%d " | 1090 | "--- %s: nreaders=%d nfakewriters=%d " |
| 1032 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1091 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
| 1033 | "shuffle_interval=%d stutter=%d irqreader=%d\n", | 1092 | "shuffle_interval=%d stutter=%d irqreader=%d " |
| 1093 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | ||
| 1034 | torture_type, tag, nrealreaders, nfakewriters, | 1094 | torture_type, tag, nrealreaders, nfakewriters, |
| 1035 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1095 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
| 1036 | stutter, irqreader); | 1096 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); |
| 1037 | } | 1097 | } |
| 1038 | 1098 | ||
| 1039 | static struct notifier_block rcutorture_nb = { | 1099 | static struct notifier_block rcutorture_nb = { |
| @@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void) | |||
| 1109 | } | 1169 | } |
| 1110 | stats_task = NULL; | 1170 | stats_task = NULL; |
| 1111 | 1171 | ||
| 1172 | if (fqs_task) { | ||
| 1173 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); | ||
| 1174 | kthread_stop(fqs_task); | ||
| 1175 | } | ||
| 1176 | fqs_task = NULL; | ||
| 1177 | |||
| 1112 | /* Wait for all RCU callbacks to fire. */ | 1178 | /* Wait for all RCU callbacks to fire. */ |
| 1113 | 1179 | ||
| 1114 | if (cur_ops->cb_barrier != NULL) | 1180 | if (cur_ops->cb_barrier != NULL) |
| @@ -1154,6 +1220,11 @@ rcu_torture_init(void) | |||
| 1154 | mutex_unlock(&fullstop_mutex); | 1220 | mutex_unlock(&fullstop_mutex); |
| 1155 | return -EINVAL; | 1221 | return -EINVAL; |
| 1156 | } | 1222 | } |
| 1223 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | ||
| 1224 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | ||
| 1225 | "fqs_duration, fqs disabled.\n"); | ||
| 1226 | fqs_duration = 0; | ||
| 1227 | } | ||
| 1157 | if (cur_ops->init) | 1228 | if (cur_ops->init) |
| 1158 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1229 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
| 1159 | 1230 | ||
| @@ -1282,6 +1353,19 @@ rcu_torture_init(void) | |||
| 1282 | goto unwind; | 1353 | goto unwind; |
| 1283 | } | 1354 | } |
| 1284 | } | 1355 | } |
| 1356 | if (fqs_duration < 0) | ||
| 1357 | fqs_duration = 0; | ||
| 1358 | if (fqs_duration) { | ||
| 1359 | /* Create the stutter thread */ | ||
| 1360 | fqs_task = kthread_run(rcu_torture_fqs, NULL, | ||
| 1361 | "rcu_torture_fqs"); | ||
| 1362 | if (IS_ERR(fqs_task)) { | ||
| 1363 | firsterr = PTR_ERR(fqs_task); | ||
| 1364 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); | ||
| 1365 | fqs_task = NULL; | ||
| 1366 | goto unwind; | ||
| 1367 | } | ||
| 1368 | } | ||
| 1285 | register_reboot_notifier(&rcutorture_nb); | 1369 | register_reboot_notifier(&rcutorture_nb); |
| 1286 | mutex_unlock(&fullstop_mutex); | 1370 | mutex_unlock(&fullstop_mutex); |
| 1287 | return 0; | 1371 | return 0; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae9598f798..3ec8160fc75f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -46,7 +46,6 @@ | |||
| 46 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
| 47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
| 48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
| 49 | #include <linux/kernel_stat.h> | ||
| 50 | 49 | ||
| 51 | #include "rcutree.h" | 50 | #include "rcutree.h" |
| 52 | 51 | ||
| @@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
| 66 | .signaled = RCU_GP_IDLE, \ | 65 | .signaled = RCU_GP_IDLE, \ |
| 67 | .gpnum = -300, \ | 66 | .gpnum = -300, \ |
| 68 | .completed = -300, \ | 67 | .completed = -300, \ |
| 69 | .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ | 68 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ |
| 70 | .orphan_cbs_list = NULL, \ | 69 | .orphan_cbs_list = NULL, \ |
| 71 | .orphan_cbs_tail = &name.orphan_cbs_list, \ | 70 | .orphan_cbs_tail = &name.orphan_cbs_list, \ |
| 72 | .orphan_qlen = 0, \ | 71 | .orphan_qlen = 0, \ |
| 73 | .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ | 72 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ |
| 74 | .n_force_qs = 0, \ | 73 | .n_force_qs = 0, \ |
| 75 | .n_force_qs_ngp = 0, \ | 74 | .n_force_qs_ngp = 0, \ |
| 76 | } | 75 | } |
| @@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
| 81 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 80 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
| 82 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 81 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
| 83 | 82 | ||
| 84 | static int rcu_scheduler_active __read_mostly; | ||
| 85 | |||
| 86 | |||
| 87 | /* | 83 | /* |
| 88 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 84 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
| 89 | * permit this function to be invoked without holding the root rcu_node | 85 | * permit this function to be invoked without holding the root rcu_node |
| @@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void) | |||
| 157 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | 153 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
| 158 | 154 | ||
| 159 | /* | 155 | /* |
| 156 | * Force a quiescent state for RCU BH. | ||
| 157 | */ | ||
| 158 | void rcu_bh_force_quiescent_state(void) | ||
| 159 | { | ||
| 160 | force_quiescent_state(&rcu_bh_state, 0); | ||
| 161 | } | ||
| 162 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | ||
| 163 | |||
| 164 | /* | ||
| 165 | * Force a quiescent state for RCU-sched. | ||
| 166 | */ | ||
| 167 | void rcu_sched_force_quiescent_state(void) | ||
| 168 | { | ||
| 169 | force_quiescent_state(&rcu_sched_state, 0); | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 172 | |||
| 173 | /* | ||
| 160 | * Does the CPU have callbacks ready to be invoked? | 174 | * Does the CPU have callbacks ready to be invoked? |
| 161 | */ | 175 | */ |
| 162 | static int | 176 | static int |
| @@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 439 | 453 | ||
| 440 | /* Only let one CPU complain about others per time interval. */ | 454 | /* Only let one CPU complain about others per time interval. */ |
| 441 | 455 | ||
| 442 | spin_lock_irqsave(&rnp->lock, flags); | 456 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 443 | delta = jiffies - rsp->jiffies_stall; | 457 | delta = jiffies - rsp->jiffies_stall; |
| 444 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 458 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
| 445 | spin_unlock_irqrestore(&rnp->lock, flags); | 459 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 446 | return; | 460 | return; |
| 447 | } | 461 | } |
| 448 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 462 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
| @@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 452 | * due to CPU offlining. | 466 | * due to CPU offlining. |
| 453 | */ | 467 | */ |
| 454 | rcu_print_task_stall(rnp); | 468 | rcu_print_task_stall(rnp); |
| 455 | spin_unlock_irqrestore(&rnp->lock, flags); | 469 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 456 | 470 | ||
| 457 | /* OK, time to rat on our buddy... */ | 471 | /* OK, time to rat on our buddy... */ |
| 458 | 472 | ||
| 459 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); | 473 | printk(KERN_ERR "INFO: RCU detected CPU stalls:"); |
| 460 | rcu_for_each_leaf_node(rsp, rnp) { | 474 | rcu_for_each_leaf_node(rsp, rnp) { |
| 475 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 461 | rcu_print_task_stall(rnp); | 476 | rcu_print_task_stall(rnp); |
| 477 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 462 | if (rnp->qsmask == 0) | 478 | if (rnp->qsmask == 0) |
| 463 | continue; | 479 | continue; |
| 464 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 480 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
| @@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 469 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 485 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
| 470 | trigger_all_cpu_backtrace(); | 486 | trigger_all_cpu_backtrace(); |
| 471 | 487 | ||
| 488 | /* If so configured, complain about tasks blocking the grace period. */ | ||
| 489 | |||
| 490 | rcu_print_detail_task_stall(rsp); | ||
| 491 | |||
| 472 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 492 | force_quiescent_state(rsp, 0); /* Kick them all. */ |
| 473 | } | 493 | } |
| 474 | 494 | ||
| @@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 481 | smp_processor_id(), jiffies - rsp->gp_start); | 501 | smp_processor_id(), jiffies - rsp->gp_start); |
| 482 | trigger_all_cpu_backtrace(); | 502 | trigger_all_cpu_backtrace(); |
| 483 | 503 | ||
| 484 | spin_lock_irqsave(&rnp->lock, flags); | 504 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 485 | if ((long)(jiffies - rsp->jiffies_stall) >= 0) | 505 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
| 486 | rsp->jiffies_stall = | 506 | rsp->jiffies_stall = |
| 487 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 507 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; |
| 488 | spin_unlock_irqrestore(&rnp->lock, flags); | 508 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 489 | 509 | ||
| 490 | set_need_resched(); /* kick ourselves to get things going. */ | 510 | set_need_resched(); /* kick ourselves to get things going. */ |
| 491 | } | 511 | } |
| @@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 545 | local_irq_save(flags); | 565 | local_irq_save(flags); |
| 546 | rnp = rdp->mynode; | 566 | rnp = rdp->mynode; |
| 547 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ | 567 | if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ |
| 548 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 568 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
| 549 | local_irq_restore(flags); | 569 | local_irq_restore(flags); |
| 550 | return; | 570 | return; |
| 551 | } | 571 | } |
| 552 | __note_new_gpnum(rsp, rnp, rdp); | 572 | __note_new_gpnum(rsp, rnp, rdp); |
| 553 | spin_unlock_irqrestore(&rnp->lock, flags); | 573 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 554 | } | 574 | } |
| 555 | 575 | ||
| 556 | /* | 576 | /* |
| @@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 609 | local_irq_save(flags); | 629 | local_irq_save(flags); |
| 610 | rnp = rdp->mynode; | 630 | rnp = rdp->mynode; |
| 611 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ | 631 | if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ |
| 612 | !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ | 632 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
| 613 | local_irq_restore(flags); | 633 | local_irq_restore(flags); |
| 614 | return; | 634 | return; |
| 615 | } | 635 | } |
| 616 | __rcu_process_gp_end(rsp, rnp, rdp); | 636 | __rcu_process_gp_end(rsp, rnp, rdp); |
| 617 | spin_unlock_irqrestore(&rnp->lock, flags); | 637 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 618 | } | 638 | } |
| 619 | 639 | ||
| 620 | /* | 640 | /* |
| @@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 659 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 679 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; |
| 660 | struct rcu_node *rnp = rcu_get_root(rsp); | 680 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 661 | 681 | ||
| 662 | if (!cpu_needs_another_gp(rsp, rdp)) { | 682 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
| 683 | if (cpu_needs_another_gp(rsp, rdp)) | ||
| 684 | rsp->fqs_need_gp = 1; | ||
| 663 | if (rnp->completed == rsp->completed) { | 685 | if (rnp->completed == rsp->completed) { |
| 664 | spin_unlock_irqrestore(&rnp->lock, flags); | 686 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 665 | return; | 687 | return; |
| 666 | } | 688 | } |
| 667 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 689 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 668 | 690 | ||
| 669 | /* | 691 | /* |
| 670 | * Propagate new ->completed value to rcu_node structures | 692 | * Propagate new ->completed value to rcu_node structures |
| @@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 672 | * of the next grace period to process their callbacks. | 694 | * of the next grace period to process their callbacks. |
| 673 | */ | 695 | */ |
| 674 | rcu_for_each_node_breadth_first(rsp, rnp) { | 696 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 675 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 697 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 676 | rnp->completed = rsp->completed; | 698 | rnp->completed = rsp->completed; |
| 677 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 699 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 678 | } | 700 | } |
| 679 | local_irq_restore(flags); | 701 | local_irq_restore(flags); |
| 680 | return; | 702 | return; |
| @@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 695 | rnp->completed = rsp->completed; | 717 | rnp->completed = rsp->completed; |
| 696 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 718 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
| 697 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 719 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
| 698 | spin_unlock_irqrestore(&rnp->lock, flags); | 720 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 699 | return; | 721 | return; |
| 700 | } | 722 | } |
| 701 | 723 | ||
| 702 | spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 724 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ |
| 703 | 725 | ||
| 704 | 726 | ||
| 705 | /* Exclude any concurrent CPU-hotplug operations. */ | 727 | /* Exclude any concurrent CPU-hotplug operations. */ |
| 706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 728 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
| 707 | 729 | ||
| 708 | /* | 730 | /* |
| 709 | * Set the quiescent-state-needed bits in all the rcu_node | 731 | * Set the quiescent-state-needed bits in all the rcu_node |
| @@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 723 | * irqs disabled. | 745 | * irqs disabled. |
| 724 | */ | 746 | */ |
| 725 | rcu_for_each_node_breadth_first(rsp, rnp) { | 747 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 726 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 748 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 727 | rcu_preempt_check_blocked_tasks(rnp); | 749 | rcu_preempt_check_blocked_tasks(rnp); |
| 728 | rnp->qsmask = rnp->qsmaskinit; | 750 | rnp->qsmask = rnp->qsmaskinit; |
| 729 | rnp->gpnum = rsp->gpnum; | 751 | rnp->gpnum = rsp->gpnum; |
| 730 | rnp->completed = rsp->completed; | 752 | rnp->completed = rsp->completed; |
| 731 | if (rnp == rdp->mynode) | 753 | if (rnp == rdp->mynode) |
| 732 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 754 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
| 733 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 755 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 734 | } | 756 | } |
| 735 | 757 | ||
| 736 | rnp = rcu_get_root(rsp); | 758 | rnp = rcu_get_root(rsp); |
| 737 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 759 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 738 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 760 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
| 739 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 761 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 740 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 762 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 741 | } | 763 | } |
| 742 | 764 | ||
| 743 | /* | 765 | /* |
| @@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 776 | if (!(rnp->qsmask & mask)) { | 798 | if (!(rnp->qsmask & mask)) { |
| 777 | 799 | ||
| 778 | /* Our bit has already been cleared, so done. */ | 800 | /* Our bit has already been cleared, so done. */ |
| 779 | spin_unlock_irqrestore(&rnp->lock, flags); | 801 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 780 | return; | 802 | return; |
| 781 | } | 803 | } |
| 782 | rnp->qsmask &= ~mask; | 804 | rnp->qsmask &= ~mask; |
| 783 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 805 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
| 784 | 806 | ||
| 785 | /* Other bits still set at this level, so done. */ | 807 | /* Other bits still set at this level, so done. */ |
| 786 | spin_unlock_irqrestore(&rnp->lock, flags); | 808 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 787 | return; | 809 | return; |
| 788 | } | 810 | } |
| 789 | mask = rnp->grpmask; | 811 | mask = rnp->grpmask; |
| @@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 793 | 815 | ||
| 794 | break; | 816 | break; |
| 795 | } | 817 | } |
| 796 | spin_unlock_irqrestore(&rnp->lock, flags); | 818 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 797 | rnp_c = rnp; | 819 | rnp_c = rnp; |
| 798 | rnp = rnp->parent; | 820 | rnp = rnp->parent; |
| 799 | spin_lock_irqsave(&rnp->lock, flags); | 821 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 800 | WARN_ON_ONCE(rnp_c->qsmask); | 822 | WARN_ON_ONCE(rnp_c->qsmask); |
| 801 | } | 823 | } |
| 802 | 824 | ||
| @@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
| 825 | struct rcu_node *rnp; | 847 | struct rcu_node *rnp; |
| 826 | 848 | ||
| 827 | rnp = rdp->mynode; | 849 | rnp = rdp->mynode; |
| 828 | spin_lock_irqsave(&rnp->lock, flags); | 850 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 829 | if (lastcomp != rnp->completed) { | 851 | if (lastcomp != rnp->completed) { |
| 830 | 852 | ||
| 831 | /* | 853 | /* |
| @@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
| 837 | * race occurred. | 859 | * race occurred. |
| 838 | */ | 860 | */ |
| 839 | rdp->passed_quiesc = 0; /* try again later! */ | 861 | rdp->passed_quiesc = 0; /* try again later! */ |
| 840 | spin_unlock_irqrestore(&rnp->lock, flags); | 862 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 841 | return; | 863 | return; |
| 842 | } | 864 | } |
| 843 | mask = rdp->grpmask; | 865 | mask = rdp->grpmask; |
| 844 | if ((rnp->qsmask & mask) == 0) { | 866 | if ((rnp->qsmask & mask) == 0) { |
| 845 | spin_unlock_irqrestore(&rnp->lock, flags); | 867 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 846 | } else { | 868 | } else { |
| 847 | rdp->qs_pending = 0; | 869 | rdp->qs_pending = 0; |
| 848 | 870 | ||
| @@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
| 906 | 928 | ||
| 907 | if (rdp->nxtlist == NULL) | 929 | if (rdp->nxtlist == NULL) |
| 908 | return; /* irqs disabled, so comparison is stable. */ | 930 | return; /* irqs disabled, so comparison is stable. */ |
| 909 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 931 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
| 910 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 932 | *rsp->orphan_cbs_tail = rdp->nxtlist; |
| 911 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 933 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; |
| 912 | rdp->nxtlist = NULL; | 934 | rdp->nxtlist = NULL; |
| @@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
| 914 | rdp->nxttail[i] = &rdp->nxtlist; | 936 | rdp->nxttail[i] = &rdp->nxtlist; |
| 915 | rsp->orphan_qlen += rdp->qlen; | 937 | rsp->orphan_qlen += rdp->qlen; |
| 916 | rdp->qlen = 0; | 938 | rdp->qlen = 0; |
| 917 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 939 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
| 918 | } | 940 | } |
| 919 | 941 | ||
| 920 | /* | 942 | /* |
| @@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 925 | unsigned long flags; | 947 | unsigned long flags; |
| 926 | struct rcu_data *rdp; | 948 | struct rcu_data *rdp; |
| 927 | 949 | ||
| 928 | spin_lock_irqsave(&rsp->onofflock, flags); | 950 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 929 | rdp = rsp->rda[smp_processor_id()]; | 951 | rdp = rsp->rda[smp_processor_id()]; |
| 930 | if (rsp->orphan_cbs_list == NULL) { | 952 | if (rsp->orphan_cbs_list == NULL) { |
| 931 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 953 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 932 | return; | 954 | return; |
| 933 | } | 955 | } |
| 934 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 956 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
| @@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 937 | rsp->orphan_cbs_list = NULL; | 959 | rsp->orphan_cbs_list = NULL; |
| 938 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 960 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
| 939 | rsp->orphan_qlen = 0; | 961 | rsp->orphan_qlen = 0; |
| 940 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 962 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 941 | } | 963 | } |
| 942 | 964 | ||
| 943 | /* | 965 | /* |
| @@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 953 | struct rcu_node *rnp; | 975 | struct rcu_node *rnp; |
| 954 | 976 | ||
| 955 | /* Exclude any attempts to start a new grace period. */ | 977 | /* Exclude any attempts to start a new grace period. */ |
| 956 | spin_lock_irqsave(&rsp->onofflock, flags); | 978 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 957 | 979 | ||
| 958 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 980 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
| 959 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ | 981 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ |
| 960 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 982 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
| 961 | do { | 983 | do { |
| 962 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 984 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 963 | rnp->qsmaskinit &= ~mask; | 985 | rnp->qsmaskinit &= ~mask; |
| 964 | if (rnp->qsmaskinit != 0) { | 986 | if (rnp->qsmaskinit != 0) { |
| 965 | if (rnp != rdp->mynode) | 987 | if (rnp != rdp->mynode) |
| 966 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 988 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 967 | break; | 989 | break; |
| 968 | } | 990 | } |
| 969 | if (rnp == rdp->mynode) | 991 | if (rnp == rdp->mynode) |
| 970 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 992 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
| 971 | else | 993 | else |
| 972 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 994 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 973 | mask = rnp->grpmask; | 995 | mask = rnp->grpmask; |
| 974 | rnp = rnp->parent; | 996 | rnp = rnp->parent; |
| 975 | } while (rnp != NULL); | 997 | } while (rnp != NULL); |
| @@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 980 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock | 1002 | * because invoking rcu_report_unblock_qs_rnp() with ->onofflock |
| 981 | * held leads to deadlock. | 1003 | * held leads to deadlock. |
| 982 | */ | 1004 | */ |
| 983 | spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1005 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
| 984 | rnp = rdp->mynode; | 1006 | rnp = rdp->mynode; |
| 985 | if (need_report & RCU_OFL_TASKS_NORM_GP) | 1007 | if (need_report & RCU_OFL_TASKS_NORM_GP) |
| 986 | rcu_report_unblock_qs_rnp(rnp, flags); | 1008 | rcu_report_unblock_qs_rnp(rnp, flags); |
| 987 | else | 1009 | else |
| 988 | spin_unlock_irqrestore(&rnp->lock, flags); | 1010 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 989 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1011 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
| 990 | rcu_report_exp_rnp(rsp, rnp); | 1012 | rcu_report_exp_rnp(rsp, rnp); |
| 991 | 1013 | ||
| @@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 1144 | /* | 1166 | /* |
| 1145 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1167 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
| 1146 | * have not yet encountered a quiescent state, using the function specified. | 1168 | * have not yet encountered a quiescent state, using the function specified. |
| 1147 | * Returns 1 if the current grace period ends while scanning (possibly | 1169 | * The caller must have suppressed start of new grace periods. |
| 1148 | * because we made it end). | ||
| 1149 | */ | 1170 | */ |
| 1150 | static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | 1171 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
| 1151 | int (*f)(struct rcu_data *)) | ||
| 1152 | { | 1172 | { |
| 1153 | unsigned long bit; | 1173 | unsigned long bit; |
| 1154 | int cpu; | 1174 | int cpu; |
| @@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
| 1158 | 1178 | ||
| 1159 | rcu_for_each_leaf_node(rsp, rnp) { | 1179 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1160 | mask = 0; | 1180 | mask = 0; |
| 1161 | spin_lock_irqsave(&rnp->lock, flags); | 1181 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 1162 | if (rnp->completed != lastcomp) { | 1182 | if (!rcu_gp_in_progress(rsp)) { |
| 1163 | spin_unlock_irqrestore(&rnp->lock, flags); | 1183 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1164 | return 1; | 1184 | return; |
| 1165 | } | 1185 | } |
| 1166 | if (rnp->qsmask == 0) { | 1186 | if (rnp->qsmask == 0) { |
| 1167 | spin_unlock_irqrestore(&rnp->lock, flags); | 1187 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1168 | continue; | 1188 | continue; |
| 1169 | } | 1189 | } |
| 1170 | cpu = rnp->grplo; | 1190 | cpu = rnp->grplo; |
| @@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
| 1173 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1193 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) |
| 1174 | mask |= bit; | 1194 | mask |= bit; |
| 1175 | } | 1195 | } |
| 1176 | if (mask != 0 && rnp->completed == lastcomp) { | 1196 | if (mask != 0) { |
| 1177 | 1197 | ||
| 1178 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 1198 | /* rcu_report_qs_rnp() releases rnp->lock. */ |
| 1179 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 1199 | rcu_report_qs_rnp(mask, rsp, rnp, flags); |
| 1180 | continue; | 1200 | continue; |
| 1181 | } | 1201 | } |
| 1182 | spin_unlock_irqrestore(&rnp->lock, flags); | 1202 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1183 | } | 1203 | } |
| 1184 | return 0; | ||
| 1185 | } | 1204 | } |
| 1186 | 1205 | ||
| 1187 | /* | 1206 | /* |
| @@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, | |||
| 1191 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | 1210 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) |
| 1192 | { | 1211 | { |
| 1193 | unsigned long flags; | 1212 | unsigned long flags; |
| 1194 | long lastcomp; | ||
| 1195 | struct rcu_node *rnp = rcu_get_root(rsp); | 1213 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1196 | u8 signaled; | ||
| 1197 | u8 forcenow; | ||
| 1198 | 1214 | ||
| 1199 | if (!rcu_gp_in_progress(rsp)) | 1215 | if (!rcu_gp_in_progress(rsp)) |
| 1200 | return; /* No grace period in progress, nothing to force. */ | 1216 | return; /* No grace period in progress, nothing to force. */ |
| 1201 | if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1217 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { |
| 1202 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1218 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ |
| 1203 | return; /* Someone else is already on the job. */ | 1219 | return; /* Someone else is already on the job. */ |
| 1204 | } | 1220 | } |
| 1205 | if (relaxed && | 1221 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) |
| 1206 | (long)(rsp->jiffies_force_qs - jiffies) >= 0) | 1222 | goto unlock_fqs_ret; /* no emergency and done recently. */ |
| 1207 | goto unlock_ret; /* no emergency and done recently. */ | ||
| 1208 | rsp->n_force_qs++; | 1223 | rsp->n_force_qs++; |
| 1209 | spin_lock(&rnp->lock); | 1224 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
| 1210 | lastcomp = rsp->gpnum - 1; | ||
| 1211 | signaled = rsp->signaled; | ||
| 1212 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 1225 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
| 1213 | if(!rcu_gp_in_progress(rsp)) { | 1226 | if(!rcu_gp_in_progress(rsp)) { |
| 1214 | rsp->n_force_qs_ngp++; | 1227 | rsp->n_force_qs_ngp++; |
| 1215 | spin_unlock(&rnp->lock); | 1228 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 1216 | goto unlock_ret; /* no GP in progress, time updated. */ | 1229 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
| 1217 | } | 1230 | } |
| 1218 | spin_unlock(&rnp->lock); | 1231 | rsp->fqs_active = 1; |
| 1219 | switch (signaled) { | 1232 | switch (rsp->signaled) { |
| 1220 | case RCU_GP_IDLE: | 1233 | case RCU_GP_IDLE: |
| 1221 | case RCU_GP_INIT: | 1234 | case RCU_GP_INIT: |
| 1222 | 1235 | ||
| @@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
| 1224 | 1237 | ||
| 1225 | case RCU_SAVE_DYNTICK: | 1238 | case RCU_SAVE_DYNTICK: |
| 1226 | 1239 | ||
| 1240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 1227 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) | 1241 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) |
| 1228 | break; /* So gcc recognizes the dead code. */ | 1242 | break; /* So gcc recognizes the dead code. */ |
| 1229 | 1243 | ||
| 1230 | /* Record dyntick-idle state. */ | 1244 | /* Record dyntick-idle state. */ |
| 1231 | if (rcu_process_dyntick(rsp, lastcomp, | 1245 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
| 1232 | dyntick_save_progress_counter)) | 1246 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
| 1233 | goto unlock_ret; | 1247 | if (rcu_gp_in_progress(rsp)) |
| 1234 | /* fall into next case. */ | ||
| 1235 | |||
| 1236 | case RCU_SAVE_COMPLETED: | ||
| 1237 | |||
| 1238 | /* Update state, record completion counter. */ | ||
| 1239 | forcenow = 0; | ||
| 1240 | spin_lock(&rnp->lock); | ||
| 1241 | if (lastcomp + 1 == rsp->gpnum && | ||
| 1242 | lastcomp == rsp->completed && | ||
| 1243 | rsp->signaled == signaled) { | ||
| 1244 | rsp->signaled = RCU_FORCE_QS; | 1248 | rsp->signaled = RCU_FORCE_QS; |
| 1245 | rsp->completed_fqs = lastcomp; | 1249 | break; |
| 1246 | forcenow = signaled == RCU_SAVE_COMPLETED; | ||
| 1247 | } | ||
| 1248 | spin_unlock(&rnp->lock); | ||
| 1249 | if (!forcenow) | ||
| 1250 | break; | ||
| 1251 | /* fall into next case. */ | ||
| 1252 | 1250 | ||
| 1253 | case RCU_FORCE_QS: | 1251 | case RCU_FORCE_QS: |
| 1254 | 1252 | ||
| 1255 | /* Check dyntick-idle state, send IPI to laggarts. */ | 1253 | /* Check dyntick-idle state, send IPI to laggarts. */ |
| 1256 | if (rcu_process_dyntick(rsp, rsp->completed_fqs, | 1254 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 1257 | rcu_implicit_dynticks_qs)) | 1255 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); |
| 1258 | goto unlock_ret; | ||
| 1259 | 1256 | ||
| 1260 | /* Leave state in case more forcing is required. */ | 1257 | /* Leave state in case more forcing is required. */ |
| 1261 | 1258 | ||
| 1259 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
| 1262 | break; | 1260 | break; |
| 1263 | } | 1261 | } |
| 1264 | unlock_ret: | 1262 | rsp->fqs_active = 0; |
| 1265 | spin_unlock_irqrestore(&rsp->fqslock, flags); | 1263 | if (rsp->fqs_need_gp) { |
| 1264 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | ||
| 1265 | rsp->fqs_need_gp = 0; | ||
| 1266 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | ||
| 1267 | return; | ||
| 1268 | } | ||
| 1269 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 1270 | unlock_fqs_ret: | ||
| 1271 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | ||
| 1266 | } | 1272 | } |
| 1267 | 1273 | ||
| 1268 | #else /* #ifdef CONFIG_SMP */ | 1274 | #else /* #ifdef CONFIG_SMP */ |
| @@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1290 | * If an RCU GP has gone long enough, go check for dyntick | 1296 | * If an RCU GP has gone long enough, go check for dyntick |
| 1291 | * idle CPUs and, if needed, send resched IPIs. | 1297 | * idle CPUs and, if needed, send resched IPIs. |
| 1292 | */ | 1298 | */ |
| 1293 | if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1299 | if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
| 1294 | force_quiescent_state(rsp, 1); | 1300 | force_quiescent_state(rsp, 1); |
| 1295 | 1301 | ||
| 1296 | /* | 1302 | /* |
| @@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1304 | 1310 | ||
| 1305 | /* Does this CPU require a not-yet-started grace period? */ | 1311 | /* Does this CPU require a not-yet-started grace period? */ |
| 1306 | if (cpu_needs_another_gp(rsp, rdp)) { | 1312 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 1307 | spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); | 1313 | raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); |
| 1308 | rcu_start_gp(rsp, flags); /* releases above lock */ | 1314 | rcu_start_gp(rsp, flags); /* releases above lock */ |
| 1309 | } | 1315 | } |
| 1310 | 1316 | ||
| @@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 1335 | * grace-period manipulations above. | 1341 | * grace-period manipulations above. |
| 1336 | */ | 1342 | */ |
| 1337 | smp_mb(); /* See above block comment. */ | 1343 | smp_mb(); /* See above block comment. */ |
| 1344 | |||
| 1345 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | ||
| 1346 | rcu_needs_cpu_flush(); | ||
| 1338 | } | 1347 | } |
| 1339 | 1348 | ||
| 1340 | static void | 1349 | static void |
| @@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1369 | unsigned long nestflag; | 1378 | unsigned long nestflag; |
| 1370 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 1379 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
| 1371 | 1380 | ||
| 1372 | spin_lock_irqsave(&rnp_root->lock, nestflag); | 1381 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); |
| 1373 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | 1382 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ |
| 1374 | } | 1383 | } |
| 1375 | 1384 | ||
| @@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1387 | force_quiescent_state(rsp, 0); | 1396 | force_quiescent_state(rsp, 0); |
| 1388 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1397 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 1389 | rdp->qlen_last_fqs_check = rdp->qlen; | 1398 | rdp->qlen_last_fqs_check = rdp->qlen; |
| 1390 | } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) | 1399 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
| 1391 | force_quiescent_state(rsp, 1); | 1400 | force_quiescent_state(rsp, 1); |
| 1392 | local_irq_restore(flags); | 1401 | local_irq_restore(flags); |
| 1393 | } | 1402 | } |
| @@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1520 | 1529 | ||
| 1521 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ | 1530 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ |
| 1522 | if (rcu_gp_in_progress(rsp) && | 1531 | if (rcu_gp_in_progress(rsp) && |
| 1523 | ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { | 1532 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { |
| 1524 | rdp->n_rp_need_fqs++; | 1533 | rdp->n_rp_need_fqs++; |
| 1525 | return 1; | 1534 | return 1; |
| 1526 | } | 1535 | } |
| @@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu) | |||
| 1545 | /* | 1554 | /* |
| 1546 | * Check to see if any future RCU-related work will need to be done | 1555 | * Check to see if any future RCU-related work will need to be done |
| 1547 | * by the current CPU, even if none need be done immediately, returning | 1556 | * by the current CPU, even if none need be done immediately, returning |
| 1548 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1557 | * 1 if so. |
| 1549 | * an exported member of the RCU API. | ||
| 1550 | */ | 1558 | */ |
| 1551 | int rcu_needs_cpu(int cpu) | 1559 | static int rcu_needs_cpu_quick_check(int cpu) |
| 1552 | { | 1560 | { |
| 1553 | /* RCU callbacks either ready or pending? */ | 1561 | /* RCU callbacks either ready or pending? */ |
| 1554 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1562 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
| @@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu) | |||
| 1556 | rcu_preempt_needs_cpu(cpu); | 1564 | rcu_preempt_needs_cpu(cpu); |
| 1557 | } | 1565 | } |
| 1558 | 1566 | ||
| 1559 | /* | ||
| 1560 | * This function is invoked towards the end of the scheduler's initialization | ||
| 1561 | * process. Before this is called, the idle task might contain | ||
| 1562 | * RCU read-side critical sections (during which time, this idle | ||
| 1563 | * task is booting the system). After this function is called, the | ||
| 1564 | * idle tasks are prohibited from containing RCU read-side critical | ||
| 1565 | * sections. | ||
| 1566 | */ | ||
| 1567 | void rcu_scheduler_starting(void) | ||
| 1568 | { | ||
| 1569 | WARN_ON(num_online_cpus() != 1); | ||
| 1570 | WARN_ON(nr_context_switches() > 0); | ||
| 1571 | rcu_scheduler_active = 1; | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 1567 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
| 1575 | static atomic_t rcu_barrier_cpu_count; | 1568 | static atomic_t rcu_barrier_cpu_count; |
| 1576 | static DEFINE_MUTEX(rcu_barrier_mutex); | 1569 | static DEFINE_MUTEX(rcu_barrier_mutex); |
| @@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 1659 | struct rcu_node *rnp = rcu_get_root(rsp); | 1652 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1660 | 1653 | ||
| 1661 | /* Set up local state, ensuring consistent view of global state. */ | 1654 | /* Set up local state, ensuring consistent view of global state. */ |
| 1662 | spin_lock_irqsave(&rnp->lock, flags); | 1655 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 1663 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 1656 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
| 1664 | rdp->nxtlist = NULL; | 1657 | rdp->nxtlist = NULL; |
| 1665 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1658 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
| @@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 1669 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 1662 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 1670 | #endif /* #ifdef CONFIG_NO_HZ */ | 1663 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 1671 | rdp->cpu = cpu; | 1664 | rdp->cpu = cpu; |
| 1672 | spin_unlock_irqrestore(&rnp->lock, flags); | 1665 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1673 | } | 1666 | } |
| 1674 | 1667 | ||
| 1675 | /* | 1668 | /* |
| @@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1687 | struct rcu_node *rnp = rcu_get_root(rsp); | 1680 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1688 | 1681 | ||
| 1689 | /* Set up local state, ensuring consistent view of global state. */ | 1682 | /* Set up local state, ensuring consistent view of global state. */ |
| 1690 | spin_lock_irqsave(&rnp->lock, flags); | 1683 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 1691 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1684 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
| 1692 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1685 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
| 1693 | rdp->beenonline = 1; /* We have now been online. */ | 1686 | rdp->beenonline = 1; /* We have now been online. */ |
| @@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1695 | rdp->qlen_last_fqs_check = 0; | 1688 | rdp->qlen_last_fqs_check = 0; |
| 1696 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1689 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 1697 | rdp->blimit = blimit; | 1690 | rdp->blimit = blimit; |
| 1698 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1691 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 1699 | 1692 | ||
| 1700 | /* | 1693 | /* |
| 1701 | * A new grace period might start here. If so, we won't be part | 1694 | * A new grace period might start here. If so, we won't be part |
| @@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1703 | */ | 1696 | */ |
| 1704 | 1697 | ||
| 1705 | /* Exclude any attempts to start a new GP on large systems. */ | 1698 | /* Exclude any attempts to start a new GP on large systems. */ |
| 1706 | spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1699 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
| 1707 | 1700 | ||
| 1708 | /* Add CPU to rcu_node bitmasks. */ | 1701 | /* Add CPU to rcu_node bitmasks. */ |
| 1709 | rnp = rdp->mynode; | 1702 | rnp = rdp->mynode; |
| 1710 | mask = rdp->grpmask; | 1703 | mask = rdp->grpmask; |
| 1711 | do { | 1704 | do { |
| 1712 | /* Exclude any attempts to start a new GP on small systems. */ | 1705 | /* Exclude any attempts to start a new GP on small systems. */ |
| 1713 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 1706 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 1714 | rnp->qsmaskinit |= mask; | 1707 | rnp->qsmaskinit |= mask; |
| 1715 | mask = rnp->grpmask; | 1708 | mask = rnp->grpmask; |
| 1716 | if (rnp == rdp->mynode) { | 1709 | if (rnp == rdp->mynode) { |
| @@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1718 | rdp->completed = rnp->completed; | 1711 | rdp->completed = rnp->completed; |
| 1719 | rdp->passed_quiesc_completed = rnp->completed - 1; | 1712 | rdp->passed_quiesc_completed = rnp->completed - 1; |
| 1720 | } | 1713 | } |
| 1721 | spin_unlock(&rnp->lock); /* irqs already disabled. */ | 1714 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
| 1722 | rnp = rnp->parent; | 1715 | rnp = rnp->parent; |
| 1723 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | 1716 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
| 1724 | 1717 | ||
| 1725 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 1718 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 1726 | } | 1719 | } |
| 1727 | 1720 | ||
| 1728 | static void __cpuinit rcu_online_cpu(int cpu) | 1721 | static void __cpuinit rcu_online_cpu(int cpu) |
| @@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 1806 | */ | 1799 | */ |
| 1807 | static void __init rcu_init_one(struct rcu_state *rsp) | 1800 | static void __init rcu_init_one(struct rcu_state *rsp) |
| 1808 | { | 1801 | { |
| 1802 | static char *buf[] = { "rcu_node_level_0", | ||
| 1803 | "rcu_node_level_1", | ||
| 1804 | "rcu_node_level_2", | ||
| 1805 | "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ | ||
| 1809 | int cpustride = 1; | 1806 | int cpustride = 1; |
| 1810 | int i; | 1807 | int i; |
| 1811 | int j; | 1808 | int j; |
| 1812 | struct rcu_node *rnp; | 1809 | struct rcu_node *rnp; |
| 1813 | 1810 | ||
| 1811 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | ||
| 1812 | |||
| 1814 | /* Initialize the level-tracking arrays. */ | 1813 | /* Initialize the level-tracking arrays. */ |
| 1815 | 1814 | ||
| 1816 | for (i = 1; i < NUM_RCU_LVLS; i++) | 1815 | for (i = 1; i < NUM_RCU_LVLS; i++) |
| @@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
| 1823 | cpustride *= rsp->levelspread[i]; | 1822 | cpustride *= rsp->levelspread[i]; |
| 1824 | rnp = rsp->level[i]; | 1823 | rnp = rsp->level[i]; |
| 1825 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 1824 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
| 1826 | spin_lock_init(&rnp->lock); | 1825 | raw_spin_lock_init(&rnp->lock); |
| 1827 | lockdep_set_class(&rnp->lock, &rcu_node_class[i]); | 1826 | lockdep_set_class_and_name(&rnp->lock, |
| 1827 | &rcu_node_class[i], buf[i]); | ||
| 1828 | rnp->gpnum = 0; | 1828 | rnp->gpnum = 0; |
| 1829 | rnp->qsmask = 0; | 1829 | rnp->qsmask = 0; |
| 1830 | rnp->qsmaskinit = 0; | 1830 | rnp->qsmaskinit = 0; |
| @@ -1876,7 +1876,7 @@ do { \ | |||
| 1876 | 1876 | ||
| 1877 | void __init rcu_init(void) | 1877 | void __init rcu_init(void) |
| 1878 | { | 1878 | { |
| 1879 | int i; | 1879 | int cpu; |
| 1880 | 1880 | ||
| 1881 | rcu_bootup_announce(); | 1881 | rcu_bootup_announce(); |
| 1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 1882 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
| @@ -1896,8 +1896,8 @@ void __init rcu_init(void) | |||
| 1896 | * or the scheduler are operational. | 1896 | * or the scheduler are operational. |
| 1897 | */ | 1897 | */ |
| 1898 | cpu_notifier(rcu_cpu_notify, 0); | 1898 | cpu_notifier(rcu_cpu_notify, 0); |
| 1899 | for_each_online_cpu(i) | 1899 | for_each_online_cpu(cpu) |
| 1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); | 1900 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 1901 | } | 1901 | } |
| 1902 | 1902 | ||
| 1903 | #include "rcutree_plugin.h" | 1903 | #include "rcutree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046f63b2..1439eb504c22 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -90,12 +90,12 @@ struct rcu_dynticks { | |||
| 90 | * Definition for node within the RCU grace-period-detection hierarchy. | 90 | * Definition for node within the RCU grace-period-detection hierarchy. |
| 91 | */ | 91 | */ |
| 92 | struct rcu_node { | 92 | struct rcu_node { |
| 93 | spinlock_t lock; /* Root rcu_node's lock protects some */ | 93 | raw_spinlock_t lock; /* Root rcu_node's lock protects some */ |
| 94 | /* rcu_state fields as well as following. */ | 94 | /* rcu_state fields as well as following. */ |
| 95 | long gpnum; /* Current grace period for this node. */ | 95 | unsigned long gpnum; /* Current grace period for this node. */ |
| 96 | /* This will either be equal to or one */ | 96 | /* This will either be equal to or one */ |
| 97 | /* behind the root rcu_node's gpnum. */ | 97 | /* behind the root rcu_node's gpnum. */ |
| 98 | long completed; /* Last grace period completed for this node. */ | 98 | unsigned long completed; /* Last GP completed for this node. */ |
| 99 | /* This will either be equal to or one */ | 99 | /* This will either be equal to or one */ |
| 100 | /* behind the root rcu_node's gpnum. */ | 100 | /* behind the root rcu_node's gpnum. */ |
| 101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ | 101 | unsigned long qsmask; /* CPUs or groups that need to switch in */ |
| @@ -161,11 +161,11 @@ struct rcu_node { | |||
| 161 | /* Per-CPU data for read-copy update. */ | 161 | /* Per-CPU data for read-copy update. */ |
| 162 | struct rcu_data { | 162 | struct rcu_data { |
| 163 | /* 1) quiescent-state and grace-period handling : */ | 163 | /* 1) quiescent-state and grace-period handling : */ |
| 164 | long completed; /* Track rsp->completed gp number */ | 164 | unsigned long completed; /* Track rsp->completed gp number */ |
| 165 | /* in order to detect GP end. */ | 165 | /* in order to detect GP end. */ |
| 166 | long gpnum; /* Highest gp number that this CPU */ | 166 | unsigned long gpnum; /* Highest gp number that this CPU */ |
| 167 | /* is aware of having started. */ | 167 | /* is aware of having started. */ |
| 168 | long passed_quiesc_completed; | 168 | unsigned long passed_quiesc_completed; |
| 169 | /* Value of completed at time of qs. */ | 169 | /* Value of completed at time of qs. */ |
| 170 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 170 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
| 171 | bool qs_pending; /* Core waits for quiesc state. */ | 171 | bool qs_pending; /* Core waits for quiesc state. */ |
| @@ -221,14 +221,14 @@ struct rcu_data { | |||
| 221 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 221 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
| 222 | 222 | ||
| 223 | /* 5) __rcu_pending() statistics. */ | 223 | /* 5) __rcu_pending() statistics. */ |
| 224 | long n_rcu_pending; /* rcu_pending() calls since boot. */ | 224 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
| 225 | long n_rp_qs_pending; | 225 | unsigned long n_rp_qs_pending; |
| 226 | long n_rp_cb_ready; | 226 | unsigned long n_rp_cb_ready; |
| 227 | long n_rp_cpu_needs_gp; | 227 | unsigned long n_rp_cpu_needs_gp; |
| 228 | long n_rp_gp_completed; | 228 | unsigned long n_rp_gp_completed; |
| 229 | long n_rp_gp_started; | 229 | unsigned long n_rp_gp_started; |
| 230 | long n_rp_need_fqs; | 230 | unsigned long n_rp_need_fqs; |
| 231 | long n_rp_need_nothing; | 231 | unsigned long n_rp_need_nothing; |
| 232 | 232 | ||
| 233 | int cpu; | 233 | int cpu; |
| 234 | }; | 234 | }; |
| @@ -237,12 +237,11 @@ struct rcu_data { | |||
| 237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 237 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
| 238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 238 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
| 239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 239 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
| 240 | #define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ | 240 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
| 241 | #define RCU_FORCE_QS 4 /* Need to force quiescent state. */ | ||
| 242 | #ifdef CONFIG_NO_HZ | 241 | #ifdef CONFIG_NO_HZ |
| 243 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 242 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
| 244 | #else /* #ifdef CONFIG_NO_HZ */ | 243 | #else /* #ifdef CONFIG_NO_HZ */ |
| 245 | #define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED | 244 | #define RCU_SIGNAL_INIT RCU_FORCE_QS |
| 246 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 245 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
| 247 | 246 | ||
| 248 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 247 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
| @@ -256,6 +255,9 @@ struct rcu_data { | |||
| 256 | 255 | ||
| 257 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 256 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 258 | 257 | ||
| 258 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | ||
| 259 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
| 260 | |||
| 259 | /* | 261 | /* |
| 260 | * RCU global state, including node hierarchy. This hierarchy is | 262 | * RCU global state, including node hierarchy. This hierarchy is |
| 261 | * represented in "heap" form in a dense array. The root (first level) | 263 | * represented in "heap" form in a dense array. The root (first level) |
| @@ -277,12 +279,19 @@ struct rcu_state { | |||
| 277 | 279 | ||
| 278 | u8 signaled ____cacheline_internodealigned_in_smp; | 280 | u8 signaled ____cacheline_internodealigned_in_smp; |
| 279 | /* Force QS state. */ | 281 | /* Force QS state. */ |
| 280 | long gpnum; /* Current gp number. */ | 282 | u8 fqs_active; /* force_quiescent_state() */ |
| 281 | long completed; /* # of last completed gp. */ | 283 | /* is running. */ |
| 284 | u8 fqs_need_gp; /* A CPU was prevented from */ | ||
| 285 | /* starting a new grace */ | ||
| 286 | /* period because */ | ||
| 287 | /* force_quiescent_state() */ | ||
| 288 | /* was running. */ | ||
| 289 | unsigned long gpnum; /* Current gp number. */ | ||
| 290 | unsigned long completed; /* # of last completed gp. */ | ||
| 282 | 291 | ||
| 283 | /* End of fields guarded by root rcu_node's lock. */ | 292 | /* End of fields guarded by root rcu_node's lock. */ |
| 284 | 293 | ||
| 285 | spinlock_t onofflock; /* exclude on/offline and */ | 294 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
| 286 | /* starting new GP. Also */ | 295 | /* starting new GP. Also */ |
| 287 | /* protects the following */ | 296 | /* protects the following */ |
| 288 | /* orphan_cbs fields. */ | 297 | /* orphan_cbs fields. */ |
| @@ -292,10 +301,8 @@ struct rcu_state { | |||
| 292 | /* going offline. */ | 301 | /* going offline. */ |
| 293 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | 302 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ |
| 294 | long orphan_qlen; /* Number of orphaned cbs. */ | 303 | long orphan_qlen; /* Number of orphaned cbs. */ |
| 295 | spinlock_t fqslock; /* Only one task forcing */ | 304 | raw_spinlock_t fqslock; /* Only one task forcing */ |
| 296 | /* quiescent states. */ | 305 | /* quiescent states. */ |
| 297 | long completed_fqs; /* Value of completed @ snap. */ | ||
| 298 | /* Protected by fqslock. */ | ||
| 299 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 306 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
| 300 | /* force_quiescent_state(). */ | 307 | /* force_quiescent_state(). */ |
| 301 | unsigned long n_force_qs; /* Number of calls to */ | 308 | unsigned long n_force_qs; /* Number of calls to */ |
| @@ -319,8 +326,6 @@ struct rcu_state { | |||
| 319 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | 326 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ |
| 320 | /* GP were moved to root. */ | 327 | /* GP were moved to root. */ |
| 321 | 328 | ||
| 322 | #ifdef RCU_TREE_NONCORE | ||
| 323 | |||
| 324 | /* | 329 | /* |
| 325 | * RCU implementation internal declarations: | 330 | * RCU implementation internal declarations: |
| 326 | */ | 331 | */ |
| @@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state; | |||
| 335 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 340 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
| 336 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 341 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 337 | 342 | ||
| 338 | #else /* #ifdef RCU_TREE_NONCORE */ | 343 | #ifndef RCU_TREE_NONCORE |
| 339 | 344 | ||
| 340 | /* Forward declarations for rcutree_plugin.h */ | 345 | /* Forward declarations for rcutree_plugin.h */ |
| 341 | static void rcu_bootup_announce(void); | 346 | static void rcu_bootup_announce(void); |
| @@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
| 347 | unsigned long flags); | 352 | unsigned long flags); |
| 348 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 353 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 349 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 354 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
| 355 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | ||
| 350 | static void rcu_print_task_stall(struct rcu_node *rnp); | 356 | static void rcu_print_task_stall(struct rcu_node *rnp); |
| 351 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 357 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 352 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 358 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| @@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu); | |||
| 367 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 373 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
| 368 | static void rcu_preempt_send_cbs_to_orphanage(void); | 374 | static void rcu_preempt_send_cbs_to_orphanage(void); |
| 369 | static void __init __rcu_init_preempt(void); | 375 | static void __init __rcu_init_preempt(void); |
| 376 | static void rcu_needs_cpu_flush(void); | ||
| 370 | 377 | ||
| 371 | #endif /* #else #ifdef RCU_TREE_NONCORE */ | 378 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..464ad2cdee00 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -62,6 +62,15 @@ long rcu_batches_completed(void) | |||
| 62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 62 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * Force a quiescent state for preemptible RCU. | ||
| 66 | */ | ||
| 67 | void rcu_force_quiescent_state(void) | ||
| 68 | { | ||
| 69 | force_quiescent_state(&rcu_preempt_state, 0); | ||
| 70 | } | ||
| 71 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 72 | |||
| 73 | /* | ||
| 65 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 74 | * Record a preemptable-RCU quiescent state for the specified CPU. Note |
| 66 | * that this just means that the task currently running on the CPU is | 75 | * that this just means that the task currently running on the CPU is |
| 67 | * not in a quiescent state. There might be any number of tasks blocked | 76 | * not in a quiescent state. There might be any number of tasks blocked |
| @@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 102 | /* Possibly blocking in an RCU read-side critical section. */ | 111 | /* Possibly blocking in an RCU read-side critical section. */ |
| 103 | rdp = rcu_preempt_state.rda[cpu]; | 112 | rdp = rcu_preempt_state.rda[cpu]; |
| 104 | rnp = rdp->mynode; | 113 | rnp = rdp->mynode; |
| 105 | spin_lock_irqsave(&rnp->lock, flags); | 114 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 106 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 115 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
| 107 | t->rcu_blocked_node = rnp; | 116 | t->rcu_blocked_node = rnp; |
| 108 | 117 | ||
| @@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 123 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 132 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
| 124 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 133 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; |
| 125 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 134 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); |
| 126 | spin_unlock_irqrestore(&rnp->lock, flags); | 135 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 127 | } | 136 | } |
| 128 | 137 | ||
| 129 | /* | 138 | /* |
| @@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 180 | struct rcu_node *rnp_p; | 189 | struct rcu_node *rnp_p; |
| 181 | 190 | ||
| 182 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 191 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { |
| 183 | spin_unlock_irqrestore(&rnp->lock, flags); | 192 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 184 | return; /* Still need more quiescent states! */ | 193 | return; /* Still need more quiescent states! */ |
| 185 | } | 194 | } |
| 186 | 195 | ||
| @@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 197 | 206 | ||
| 198 | /* Report up the rest of the hierarchy. */ | 207 | /* Report up the rest of the hierarchy. */ |
| 199 | mask = rnp->grpmask; | 208 | mask = rnp->grpmask; |
| 200 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 209 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 201 | spin_lock(&rnp_p->lock); /* irqs already disabled. */ | 210 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ |
| 202 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | 211 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); |
| 203 | } | 212 | } |
| 204 | 213 | ||
| @@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 248 | */ | 257 | */ |
| 249 | for (;;) { | 258 | for (;;) { |
| 250 | rnp = t->rcu_blocked_node; | 259 | rnp = t->rcu_blocked_node; |
| 251 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 260 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 252 | if (rnp == t->rcu_blocked_node) | 261 | if (rnp == t->rcu_blocked_node) |
| 253 | break; | 262 | break; |
| 254 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 263 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 255 | } | 264 | } |
| 256 | empty = !rcu_preempted_readers(rnp); | 265 | empty = !rcu_preempted_readers(rnp); |
| 257 | empty_exp = !rcu_preempted_readers_exp(rnp); | 266 | empty_exp = !rcu_preempted_readers_exp(rnp); |
| @@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 265 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 274 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. |
| 266 | */ | 275 | */ |
| 267 | if (empty) | 276 | if (empty) |
| 268 | spin_unlock_irqrestore(&rnp->lock, flags); | 277 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 269 | else | 278 | else |
| 270 | rcu_report_unblock_qs_rnp(rnp, flags); | 279 | rcu_report_unblock_qs_rnp(rnp, flags); |
| 271 | 280 | ||
| @@ -295,29 +304,73 @@ void __rcu_read_unlock(void) | |||
| 295 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 304 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && |
| 296 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 305 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
| 297 | rcu_read_unlock_special(t); | 306 | rcu_read_unlock_special(t); |
| 307 | #ifdef CONFIG_PROVE_LOCKING | ||
| 308 | WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); | ||
| 309 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 298 | } | 310 | } |
| 299 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 311 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
| 300 | 312 | ||
| 301 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 313 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
| 302 | 314 | ||
| 315 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
| 316 | |||
| 317 | /* | ||
| 318 | * Dump detailed information for all tasks blocking the current RCU | ||
| 319 | * grace period on the specified rcu_node structure. | ||
| 320 | */ | ||
| 321 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
| 322 | { | ||
| 323 | unsigned long flags; | ||
| 324 | struct list_head *lp; | ||
| 325 | int phase; | ||
| 326 | struct task_struct *t; | ||
| 327 | |||
| 328 | if (rcu_preempted_readers(rnp)) { | ||
| 329 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 330 | phase = rnp->gpnum & 0x1; | ||
| 331 | lp = &rnp->blocked_tasks[phase]; | ||
| 332 | list_for_each_entry(t, lp, rcu_node_entry) | ||
| 333 | sched_show_task(t); | ||
| 334 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 335 | } | ||
| 336 | } | ||
| 337 | |||
| 338 | /* | ||
| 339 | * Dump detailed information for all tasks blocking the current RCU | ||
| 340 | * grace period. | ||
| 341 | */ | ||
| 342 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 343 | { | ||
| 344 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 345 | |||
| 346 | rcu_print_detail_task_stall_rnp(rnp); | ||
| 347 | rcu_for_each_leaf_node(rsp, rnp) | ||
| 348 | rcu_print_detail_task_stall_rnp(rnp); | ||
| 349 | } | ||
| 350 | |||
| 351 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 352 | |||
| 353 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 354 | { | ||
| 355 | } | ||
| 356 | |||
| 357 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 358 | |||
| 303 | /* | 359 | /* |
| 304 | * Scan the current list of tasks blocked within RCU read-side critical | 360 | * Scan the current list of tasks blocked within RCU read-side critical |
| 305 | * sections, printing out the tid of each. | 361 | * sections, printing out the tid of each. |
| 306 | */ | 362 | */ |
| 307 | static void rcu_print_task_stall(struct rcu_node *rnp) | 363 | static void rcu_print_task_stall(struct rcu_node *rnp) |
| 308 | { | 364 | { |
| 309 | unsigned long flags; | ||
| 310 | struct list_head *lp; | 365 | struct list_head *lp; |
| 311 | int phase; | 366 | int phase; |
| 312 | struct task_struct *t; | 367 | struct task_struct *t; |
| 313 | 368 | ||
| 314 | if (rcu_preempted_readers(rnp)) { | 369 | if (rcu_preempted_readers(rnp)) { |
| 315 | spin_lock_irqsave(&rnp->lock, flags); | ||
| 316 | phase = rnp->gpnum & 0x1; | 370 | phase = rnp->gpnum & 0x1; |
| 317 | lp = &rnp->blocked_tasks[phase]; | 371 | lp = &rnp->blocked_tasks[phase]; |
| 318 | list_for_each_entry(t, lp, rcu_node_entry) | 372 | list_for_each_entry(t, lp, rcu_node_entry) |
| 319 | printk(" P%d", t->pid); | 373 | printk(" P%d", t->pid); |
| 320 | spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 321 | } | 374 | } |
| 322 | } | 375 | } |
| 323 | 376 | ||
| @@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 388 | lp_root = &rnp_root->blocked_tasks[i]; | 441 | lp_root = &rnp_root->blocked_tasks[i]; |
| 389 | while (!list_empty(lp)) { | 442 | while (!list_empty(lp)) { |
| 390 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 443 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); |
| 391 | spin_lock(&rnp_root->lock); /* irqs already disabled */ | 444 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
| 392 | list_del(&tp->rcu_node_entry); | 445 | list_del(&tp->rcu_node_entry); |
| 393 | tp->rcu_blocked_node = rnp_root; | 446 | tp->rcu_blocked_node = rnp_root; |
| 394 | list_add(&tp->rcu_node_entry, lp_root); | 447 | list_add(&tp->rcu_node_entry, lp_root); |
| 395 | spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 448 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ |
| 396 | } | 449 | } |
| 397 | } | 450 | } |
| 398 | return retval; | 451 | return retval; |
| @@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 516 | unsigned long flags; | 569 | unsigned long flags; |
| 517 | unsigned long mask; | 570 | unsigned long mask; |
| 518 | 571 | ||
| 519 | spin_lock_irqsave(&rnp->lock, flags); | 572 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 520 | for (;;) { | 573 | for (;;) { |
| 521 | if (!sync_rcu_preempt_exp_done(rnp)) | 574 | if (!sync_rcu_preempt_exp_done(rnp)) |
| 522 | break; | 575 | break; |
| @@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 525 | break; | 578 | break; |
| 526 | } | 579 | } |
| 527 | mask = rnp->grpmask; | 580 | mask = rnp->grpmask; |
| 528 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 581 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 529 | rnp = rnp->parent; | 582 | rnp = rnp->parent; |
| 530 | spin_lock(&rnp->lock); /* irqs already disabled */ | 583 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
| 531 | rnp->expmask &= ~mask; | 584 | rnp->expmask &= ~mask; |
| 532 | } | 585 | } |
| 533 | spin_unlock_irqrestore(&rnp->lock, flags); | 586 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 534 | } | 587 | } |
| 535 | 588 | ||
| 536 | /* | 589 | /* |
| @@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 545 | { | 598 | { |
| 546 | int must_wait; | 599 | int must_wait; |
| 547 | 600 | ||
| 548 | spin_lock(&rnp->lock); /* irqs already disabled */ | 601 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
| 549 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 602 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); |
| 550 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 603 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); |
| 551 | must_wait = rcu_preempted_readers_exp(rnp); | 604 | must_wait = rcu_preempted_readers_exp(rnp); |
| 552 | spin_unlock(&rnp->lock); /* irqs remain disabled */ | 605 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 553 | if (!must_wait) | 606 | if (!must_wait) |
| 554 | rcu_report_exp_rnp(rsp, rnp); | 607 | rcu_report_exp_rnp(rsp, rnp); |
| 555 | } | 608 | } |
| @@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void) | |||
| 594 | /* force all RCU readers onto blocked_tasks[]. */ | 647 | /* force all RCU readers onto blocked_tasks[]. */ |
| 595 | synchronize_sched_expedited(); | 648 | synchronize_sched_expedited(); |
| 596 | 649 | ||
| 597 | spin_lock_irqsave(&rsp->onofflock, flags); | 650 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 598 | 651 | ||
| 599 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 652 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
| 600 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 653 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
| 601 | spin_lock(&rnp->lock); /* irqs already disabled. */ | 654 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 602 | rnp->expmask = rnp->qsmaskinit; | 655 | rnp->expmask = rnp->qsmaskinit; |
| 603 | spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 656 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 604 | } | 657 | } |
| 605 | 658 | ||
| 606 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 659 | /* Snapshot current state of ->blocked_tasks[] lists. */ |
| @@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void) | |||
| 609 | if (NUM_RCU_NODES > 1) | 662 | if (NUM_RCU_NODES > 1) |
| 610 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | 663 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); |
| 611 | 664 | ||
| 612 | spin_unlock_irqrestore(&rsp->onofflock, flags); | 665 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 613 | 666 | ||
| 614 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 667 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ |
| 615 | rnp = rcu_get_root(rsp); | 668 | rnp = rcu_get_root(rsp); |
| @@ -713,6 +766,16 @@ long rcu_batches_completed(void) | |||
| 713 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 766 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
| 714 | 767 | ||
| 715 | /* | 768 | /* |
| 769 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
| 770 | * RCU, becomes the same as rcu-sched. | ||
| 771 | */ | ||
| 772 | void rcu_force_quiescent_state(void) | ||
| 773 | { | ||
| 774 | rcu_sched_force_quiescent_state(); | ||
| 775 | } | ||
| 776 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 777 | |||
| 778 | /* | ||
| 716 | * Because preemptable RCU does not exist, we never have to check for | 779 | * Because preemptable RCU does not exist, we never have to check for |
| 717 | * CPUs being in quiescent states. | 780 | * CPUs being in quiescent states. |
| 718 | */ | 781 | */ |
| @@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) | |||
| 734 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 797 | /* Because preemptible RCU does not exist, no quieting of tasks. */ |
| 735 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 798 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) |
| 736 | { | 799 | { |
| 737 | spin_unlock_irqrestore(&rnp->lock, flags); | 800 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 738 | } | 801 | } |
| 739 | 802 | ||
| 740 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 803 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| @@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 745 | * Because preemptable RCU does not exist, we never have to check for | 808 | * Because preemptable RCU does not exist, we never have to check for |
| 746 | * tasks blocked within RCU read-side critical sections. | 809 | * tasks blocked within RCU read-side critical sections. |
| 747 | */ | 810 | */ |
| 811 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 812 | { | ||
| 813 | } | ||
| 814 | |||
| 815 | /* | ||
| 816 | * Because preemptable RCU does not exist, we never have to check for | ||
| 817 | * tasks blocked within RCU read-side critical sections. | ||
| 818 | */ | ||
| 748 | static void rcu_print_task_stall(struct rcu_node *rnp) | 819 | static void rcu_print_task_stall(struct rcu_node *rnp) |
| 749 | { | 820 | { |
| 750 | } | 821 | } |
| @@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void) | |||
| 884 | } | 955 | } |
| 885 | 956 | ||
| 886 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 957 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 958 | |||
| 959 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | ||
| 960 | |||
| 961 | /* | ||
| 962 | * Check to see if any future RCU-related work will need to be done | ||
| 963 | * by the current CPU, even if none need be done immediately, returning | ||
| 964 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
| 965 | * an exported member of the RCU API. | ||
| 966 | * | ||
| 967 | * Because we have preemptible RCU, just check whether this CPU needs | ||
| 968 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | ||
| 969 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
| 970 | */ | ||
| 971 | int rcu_needs_cpu(int cpu) | ||
| 972 | { | ||
| 973 | return rcu_needs_cpu_quick_check(cpu); | ||
| 974 | } | ||
| 975 | |||
| 976 | /* | ||
| 977 | * Check to see if we need to continue a callback-flush operation to | ||
| 978 | * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle | ||
| 979 | * entry is not configured, so we never do need to. | ||
| 980 | */ | ||
| 981 | static void rcu_needs_cpu_flush(void) | ||
| 982 | { | ||
| 983 | } | ||
| 984 | |||
| 985 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
| 986 | |||
| 987 | #define RCU_NEEDS_CPU_FLUSHES 5 | ||
| 988 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
| 989 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
| 990 | |||
| 991 | /* | ||
| 992 | * Check to see if any future RCU-related work will need to be done | ||
| 993 | * by the current CPU, even if none need be done immediately, returning | ||
| 994 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
| 995 | * an exported member of the RCU API. | ||
| 996 | * | ||
| 997 | * Because we are not supporting preemptible RCU, attempt to accelerate | ||
| 998 | * any current grace periods so that RCU no longer needs this CPU, but | ||
| 999 | * only if all other CPUs are already in dynticks-idle mode. This will | ||
| 1000 | * allow the CPU cores to be powered down immediately, as opposed to after | ||
| 1001 | * waiting many milliseconds for grace periods to elapse. | ||
| 1002 | * | ||
| 1003 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | ||
| 1004 | * disabled, we do one pass of force_quiescent_state(), then do a | ||
| 1005 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | ||
| 1006 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | ||
| 1007 | */ | ||
| 1008 | int rcu_needs_cpu(int cpu) | ||
| 1009 | { | ||
| 1010 | int c = 0; | ||
| 1011 | int thatcpu; | ||
| 1012 | |||
| 1013 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | ||
| 1014 | for_each_cpu_not(thatcpu, nohz_cpu_mask) | ||
| 1015 | if (thatcpu != cpu) { | ||
| 1016 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
| 1017 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
| 1018 | return rcu_needs_cpu_quick_check(cpu); | ||
| 1019 | } | ||
| 1020 | |||
| 1021 | /* Check and update the rcu_dyntick_drain sequencing. */ | ||
| 1022 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
| 1023 | /* First time through, initialize the counter. */ | ||
| 1024 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | ||
| 1025 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | ||
| 1026 | /* We have hit the limit, so time to give up. */ | ||
| 1027 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | ||
| 1028 | return rcu_needs_cpu_quick_check(cpu); | ||
| 1029 | } | ||
| 1030 | |||
| 1031 | /* Do one step pushing remaining RCU callbacks through. */ | ||
| 1032 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | ||
| 1033 | rcu_sched_qs(cpu); | ||
| 1034 | force_quiescent_state(&rcu_sched_state, 0); | ||
| 1035 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | ||
| 1036 | } | ||
| 1037 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | ||
| 1038 | rcu_bh_qs(cpu); | ||
| 1039 | force_quiescent_state(&rcu_bh_state, 0); | ||
| 1040 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | ||
| 1041 | } | ||
| 1042 | |||
| 1043 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | ||
| 1044 | if (c) { | ||
| 1045 | raise_softirq(RCU_SOFTIRQ); | ||
| 1046 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | ||
| 1047 | } | ||
| 1048 | return c; | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | /* | ||
| 1052 | * Check to see if we need to continue a callback-flush operation to | ||
| 1053 | * allow the last CPU to enter dyntick-idle mode. | ||
| 1054 | */ | ||
| 1055 | static void rcu_needs_cpu_flush(void) | ||
| 1056 | { | ||
| 1057 | int cpu = smp_processor_id(); | ||
| 1058 | unsigned long flags; | ||
| 1059 | |||
| 1060 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) | ||
| 1061 | return; | ||
| 1062 | local_irq_save(flags); | ||
| 1063 | (void)rcu_needs_cpu(cpu); | ||
| 1064 | local_irq_restore(flags); | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9d2c88423b31..d45db2e35d27 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 50 | { | 50 | { |
| 51 | if (!rdp->beenonline) | 51 | if (!rdp->beenonline) |
| 52 | return; | 52 | return; |
| 53 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", | 53 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", |
| 54 | rdp->cpu, | 54 | rdp->cpu, |
| 55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 55 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
| 56 | rdp->completed, rdp->gpnum, | 56 | rdp->completed, rdp->gpnum, |
| @@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
| 105 | { | 105 | { |
| 106 | if (!rdp->beenonline) | 106 | if (!rdp->beenonline) |
| 107 | return; | 107 | return; |
| 108 | seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", | 108 | seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", |
| 109 | rdp->cpu, | 109 | rdp->cpu, |
| 110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 110 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
| 111 | rdp->completed, rdp->gpnum, | 111 | rdp->completed, rdp->gpnum, |
| @@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = { | |||
| 155 | 155 | ||
| 156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
| 157 | { | 157 | { |
| 158 | long gpnum; | 158 | unsigned long gpnum; |
| 159 | int level = 0; | 159 | int level = 0; |
| 160 | int phase; | 160 | int phase; |
| 161 | struct rcu_node *rnp; | 161 | struct rcu_node *rnp; |
| 162 | 162 | ||
| 163 | gpnum = rsp->gpnum; | 163 | gpnum = rsp->gpnum; |
| 164 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " | 164 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
| 165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", |
| 166 | rsp->completed, gpnum, rsp->signaled, | 166 | rsp->completed, gpnum, rsp->signaled, |
| 167 | (long)(rsp->jiffies_force_qs - jiffies), | 167 | (long)(rsp->jiffies_force_qs - jiffies), |
| @@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = { | |||
| 215 | static int show_rcugp(struct seq_file *m, void *unused) | 215 | static int show_rcugp(struct seq_file *m, void *unused) |
| 216 | { | 216 | { |
| 217 | #ifdef CONFIG_TREE_PREEMPT_RCU | 217 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", | 218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", |
| 219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | 219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); |
| 220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", | 221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", |
| 222 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 222 | rcu_sched_state.completed, rcu_sched_state.gpnum); |
| 223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", | 223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", |
| 224 | rcu_bh_state.completed, rcu_bh_state.gpnum); | 224 | rcu_bh_state.completed, rcu_bh_state.gpnum); |
| 225 | return 0; | 225 | return 0; |
| 226 | } | 226 | } |
diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54b..2d5be5d9bf5f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -188,6 +188,36 @@ static int __release_resource(struct resource *old) | |||
| 188 | return -EINVAL; | 188 | return -EINVAL; |
| 189 | } | 189 | } |
| 190 | 190 | ||
| 191 | static void __release_child_resources(struct resource *r) | ||
| 192 | { | ||
| 193 | struct resource *tmp, *p; | ||
| 194 | resource_size_t size; | ||
| 195 | |||
| 196 | p = r->child; | ||
| 197 | r->child = NULL; | ||
| 198 | while (p) { | ||
| 199 | tmp = p; | ||
| 200 | p = p->sibling; | ||
| 201 | |||
| 202 | tmp->parent = NULL; | ||
| 203 | tmp->sibling = NULL; | ||
| 204 | __release_child_resources(tmp); | ||
| 205 | |||
| 206 | printk(KERN_DEBUG "release child resource %pR\n", tmp); | ||
| 207 | /* need to restore size, and keep flags */ | ||
| 208 | size = resource_size(tmp); | ||
| 209 | tmp->start = 0; | ||
| 210 | tmp->end = size - 1; | ||
| 211 | } | ||
| 212 | } | ||
| 213 | |||
| 214 | void release_child_resources(struct resource *r) | ||
| 215 | { | ||
| 216 | write_lock(&resource_lock); | ||
| 217 | __release_child_resources(r); | ||
| 218 | write_unlock(&resource_lock); | ||
| 219 | } | ||
| 220 | |||
| 191 | /** | 221 | /** |
| 192 | * request_resource - request and reserve an I/O or memory resource | 222 | * request_resource - request and reserve an I/O or memory resource |
| 193 | * @root: root resource descriptor | 223 | * @root: root resource descriptor |
| @@ -274,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 274 | void *arg, int (*func)(unsigned long, unsigned long, void *)) | 304 | void *arg, int (*func)(unsigned long, unsigned long, void *)) |
| 275 | { | 305 | { |
| 276 | struct resource res; | 306 | struct resource res; |
| 277 | unsigned long pfn, len; | 307 | unsigned long pfn, end_pfn; |
| 278 | u64 orig_end; | 308 | u64 orig_end; |
| 279 | int ret = -1; | 309 | int ret = -1; |
| 280 | 310 | ||
| @@ -284,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 284 | orig_end = res.end; | 314 | orig_end = res.end; |
| 285 | while ((res.start < res.end) && | 315 | while ((res.start < res.end) && |
| 286 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 316 | (find_next_system_ram(&res, "System RAM") >= 0)) { |
| 287 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); | 317 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 288 | len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); | 318 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
| 289 | ret = (*func)(pfn, len, arg); | 319 | if (end_pfn > pfn) |
| 320 | ret = (*func)(pfn, end_pfn - pfn, arg); | ||
| 290 | if (ret) | 321 | if (ret) |
| 291 | break; | 322 | break; |
| 292 | res.start = res.end + 1; | 323 | res.start = res.end + 1; |
| @@ -297,14 +328,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 297 | 328 | ||
| 298 | #endif | 329 | #endif |
| 299 | 330 | ||
| 331 | static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) | ||
| 332 | { | ||
| 333 | return 1; | ||
| 334 | } | ||
| 335 | /* | ||
| 336 | * This generic page_is_ram() returns true if specified address is | ||
| 337 | * registered as "System RAM" in iomem_resource list. | ||
| 338 | */ | ||
| 339 | int __weak page_is_ram(unsigned long pfn) | ||
| 340 | { | ||
| 341 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | ||
| 342 | } | ||
| 343 | |||
| 300 | /* | 344 | /* |
| 301 | * Find empty slot in the resource tree given range and alignment. | 345 | * Find empty slot in the resource tree given range and alignment. |
| 302 | */ | 346 | */ |
| 303 | static int find_resource(struct resource *root, struct resource *new, | 347 | static int find_resource(struct resource *root, struct resource *new, |
| 304 | resource_size_t size, resource_size_t min, | 348 | resource_size_t size, resource_size_t min, |
| 305 | resource_size_t max, resource_size_t align, | 349 | resource_size_t max, resource_size_t align, |
| 306 | void (*alignf)(void *, struct resource *, | 350 | resource_size_t (*alignf)(void *, |
| 307 | resource_size_t, resource_size_t), | 351 | const struct resource *, |
| 352 | resource_size_t, | ||
| 353 | resource_size_t), | ||
| 308 | void *alignf_data) | 354 | void *alignf_data) |
| 309 | { | 355 | { |
| 310 | struct resource *this = root->child; | 356 | struct resource *this = root->child; |
| @@ -330,7 +376,7 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 330 | tmp.end = max; | 376 | tmp.end = max; |
| 331 | tmp.start = ALIGN(tmp.start, align); | 377 | tmp.start = ALIGN(tmp.start, align); |
| 332 | if (alignf) | 378 | if (alignf) |
| 333 | alignf(alignf_data, &tmp, size, align); | 379 | tmp.start = alignf(alignf_data, &tmp, size, align); |
| 334 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 380 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { |
| 335 | new->start = tmp.start; | 381 | new->start = tmp.start; |
| 336 | new->end = tmp.start + size - 1; | 382 | new->end = tmp.start + size - 1; |
| @@ -358,8 +404,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 358 | int allocate_resource(struct resource *root, struct resource *new, | 404 | int allocate_resource(struct resource *root, struct resource *new, |
| 359 | resource_size_t size, resource_size_t min, | 405 | resource_size_t size, resource_size_t min, |
| 360 | resource_size_t max, resource_size_t align, | 406 | resource_size_t max, resource_size_t align, |
| 361 | void (*alignf)(void *, struct resource *, | 407 | resource_size_t (*alignf)(void *, |
| 362 | resource_size_t, resource_size_t), | 408 | const struct resource *, |
| 409 | resource_size_t, | ||
| 410 | resource_size_t), | ||
| 363 | void *alignf_data) | 411 | void *alignf_data) |
| 364 | { | 412 | { |
| 365 | int err; | 413 | int err; |
diff --git a/kernel/sched.c b/kernel/sched.c index 3a8fb30a91b1..abb36b16b93b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 233 | */ | 233 | */ |
| 234 | static DEFINE_MUTEX(sched_domains_mutex); | 234 | static DEFINE_MUTEX(sched_domains_mutex); |
| 235 | 235 | ||
| 236 | #ifdef CONFIG_GROUP_SCHED | 236 | #ifdef CONFIG_CGROUP_SCHED |
| 237 | 237 | ||
| 238 | #include <linux/cgroup.h> | 238 | #include <linux/cgroup.h> |
| 239 | 239 | ||
| @@ -243,13 +243,7 @@ static LIST_HEAD(task_groups); | |||
| 243 | 243 | ||
| 244 | /* task group related information */ | 244 | /* task group related information */ |
| 245 | struct task_group { | 245 | struct task_group { |
| 246 | #ifdef CONFIG_CGROUP_SCHED | ||
| 247 | struct cgroup_subsys_state css; | 246 | struct cgroup_subsys_state css; |
| 248 | #endif | ||
| 249 | |||
| 250 | #ifdef CONFIG_USER_SCHED | ||
| 251 | uid_t uid; | ||
| 252 | #endif | ||
| 253 | 247 | ||
| 254 | #ifdef CONFIG_FAIR_GROUP_SCHED | 248 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 255 | /* schedulable entities of this group on each cpu */ | 249 | /* schedulable entities of this group on each cpu */ |
| @@ -274,35 +268,7 @@ struct task_group { | |||
| 274 | struct list_head children; | 268 | struct list_head children; |
| 275 | }; | 269 | }; |
| 276 | 270 | ||
| 277 | #ifdef CONFIG_USER_SCHED | ||
| 278 | |||
| 279 | /* Helper function to pass uid information to create_sched_user() */ | ||
| 280 | void set_tg_uid(struct user_struct *user) | ||
| 281 | { | ||
| 282 | user->tg->uid = user->uid; | ||
| 283 | } | ||
| 284 | |||
| 285 | /* | ||
| 286 | * Root task group. | ||
| 287 | * Every UID task group (including init_task_group aka UID-0) will | ||
| 288 | * be a child to this group. | ||
| 289 | */ | ||
| 290 | struct task_group root_task_group; | ||
| 291 | |||
| 292 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 293 | /* Default task group's sched entity on each cpu */ | ||
| 294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
| 295 | /* Default task group's cfs_rq on each cpu */ | ||
| 296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); | ||
| 297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 298 | |||
| 299 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
| 301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); | ||
| 302 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
| 303 | #else /* !CONFIG_USER_SCHED */ | ||
| 304 | #define root_task_group init_task_group | 271 | #define root_task_group init_task_group |
| 305 | #endif /* CONFIG_USER_SCHED */ | ||
| 306 | 272 | ||
| 307 | /* task_group_lock serializes add/remove of task groups and also changes to | 273 | /* task_group_lock serializes add/remove of task groups and also changes to |
| 308 | * a task group's cpu shares. | 274 | * a task group's cpu shares. |
| @@ -318,11 +284,7 @@ static int root_task_group_empty(void) | |||
| 318 | } | 284 | } |
| 319 | #endif | 285 | #endif |
| 320 | 286 | ||
| 321 | #ifdef CONFIG_USER_SCHED | ||
| 322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | ||
| 323 | #else /* !CONFIG_USER_SCHED */ | ||
| 324 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 287 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
| 325 | #endif /* CONFIG_USER_SCHED */ | ||
| 326 | 288 | ||
| 327 | /* | 289 | /* |
| 328 | * A weight of 0 or 1 can cause arithmetics problems. | 290 | * A weight of 0 or 1 can cause arithmetics problems. |
| @@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 348 | { | 310 | { |
| 349 | struct task_group *tg; | 311 | struct task_group *tg; |
| 350 | 312 | ||
| 351 | #ifdef CONFIG_USER_SCHED | 313 | #ifdef CONFIG_CGROUP_SCHED |
| 352 | rcu_read_lock(); | ||
| 353 | tg = __task_cred(p)->user->tg; | ||
| 354 | rcu_read_unlock(); | ||
| 355 | #elif defined(CONFIG_CGROUP_SCHED) | ||
| 356 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 314 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
| 357 | struct task_group, css); | 315 | struct task_group, css); |
| 358 | #else | 316 | #else |
| @@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 383 | return NULL; | 341 | return NULL; |
| 384 | } | 342 | } |
| 385 | 343 | ||
| 386 | #endif /* CONFIG_GROUP_SCHED */ | 344 | #endif /* CONFIG_CGROUP_SCHED */ |
| 387 | 345 | ||
| 388 | /* CFS-related fields in a runqueue */ | 346 | /* CFS-related fields in a runqueue */ |
| 389 | struct cfs_rq { | 347 | struct cfs_rq { |
| @@ -478,7 +436,6 @@ struct rt_rq { | |||
| 478 | struct rq *rq; | 436 | struct rq *rq; |
| 479 | struct list_head leaf_rt_rq_list; | 437 | struct list_head leaf_rt_rq_list; |
| 480 | struct task_group *tg; | 438 | struct task_group *tg; |
| 481 | struct sched_rt_entity *rt_se; | ||
| 482 | #endif | 439 | #endif |
| 483 | }; | 440 | }; |
| 484 | 441 | ||
| @@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq) | |||
| 645 | #endif | 602 | #endif |
| 646 | } | 603 | } |
| 647 | 604 | ||
| 605 | #define rcu_dereference_check_sched_domain(p) \ | ||
| 606 | rcu_dereference_check((p), \ | ||
| 607 | rcu_read_lock_sched_held() || \ | ||
| 608 | lockdep_is_held(&sched_domains_mutex)) | ||
| 609 | |||
| 648 | /* | 610 | /* |
| 649 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 611 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
| 650 | * See detach_destroy_domains: synchronize_sched for details. | 612 | * See detach_destroy_domains: synchronize_sched for details. |
| @@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 653 | * preempt-disabled sections. | 615 | * preempt-disabled sections. |
| 654 | */ | 616 | */ |
| 655 | #define for_each_domain(cpu, __sd) \ | 617 | #define for_each_domain(cpu, __sd) \ |
| 656 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 618 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
| 657 | 619 | ||
| 658 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 620 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 659 | #define this_rq() (&__get_cpu_var(runqueues)) | 621 | #define this_rq() (&__get_cpu_var(runqueues)) |
| @@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 941 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 903 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 942 | 904 | ||
| 943 | /* | 905 | /* |
| 906 | * Check whether the task is waking, we use this to synchronize against | ||
| 907 | * ttwu() so that task_cpu() reports a stable number. | ||
| 908 | * | ||
| 909 | * We need to make an exception for PF_STARTING tasks because the fork | ||
| 910 | * path might require task_rq_lock() to work, eg. it can call | ||
| 911 | * set_cpus_allowed_ptr() from the cpuset clone_ns code. | ||
| 912 | */ | ||
| 913 | static inline int task_is_waking(struct task_struct *p) | ||
| 914 | { | ||
| 915 | return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); | ||
| 916 | } | ||
| 917 | |||
| 918 | /* | ||
| 944 | * __task_rq_lock - lock the runqueue a given task resides on. | 919 | * __task_rq_lock - lock the runqueue a given task resides on. |
| 945 | * Must be called interrupts disabled. | 920 | * Must be called interrupts disabled. |
| 946 | */ | 921 | */ |
| 947 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 922 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
| 948 | __acquires(rq->lock) | 923 | __acquires(rq->lock) |
| 949 | { | 924 | { |
| 925 | struct rq *rq; | ||
| 926 | |||
| 950 | for (;;) { | 927 | for (;;) { |
| 951 | struct rq *rq = task_rq(p); | 928 | while (task_is_waking(p)) |
| 929 | cpu_relax(); | ||
| 930 | rq = task_rq(p); | ||
| 952 | raw_spin_lock(&rq->lock); | 931 | raw_spin_lock(&rq->lock); |
| 953 | if (likely(rq == task_rq(p))) | 932 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
| 954 | return rq; | 933 | return rq; |
| 955 | raw_spin_unlock(&rq->lock); | 934 | raw_spin_unlock(&rq->lock); |
| 956 | } | 935 | } |
| @@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 967 | struct rq *rq; | 946 | struct rq *rq; |
| 968 | 947 | ||
| 969 | for (;;) { | 948 | for (;;) { |
| 949 | while (task_is_waking(p)) | ||
| 950 | cpu_relax(); | ||
| 970 | local_irq_save(*flags); | 951 | local_irq_save(*flags); |
| 971 | rq = task_rq(p); | 952 | rq = task_rq(p); |
| 972 | raw_spin_lock(&rq->lock); | 953 | raw_spin_lock(&rq->lock); |
| 973 | if (likely(rq == task_rq(p))) | 954 | if (likely(rq == task_rq(p) && !task_is_waking(p))) |
| 974 | return rq; | 955 | return rq; |
| 975 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 956 | raw_spin_unlock_irqrestore(&rq->lock, *flags); |
| 976 | } | 957 | } |
| @@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = { | |||
| 1390 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1371 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
| 1391 | }; | 1372 | }; |
| 1392 | 1373 | ||
| 1393 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
| 1394 | |||
| 1395 | /* | ||
| 1396 | * runqueue iterator, to support SMP load-balancing between different | ||
| 1397 | * scheduling classes, without having to expose their internal data | ||
| 1398 | * structures to the load-balancing proper: | ||
| 1399 | */ | ||
| 1400 | struct rq_iterator { | ||
| 1401 | void *arg; | ||
| 1402 | struct task_struct *(*start)(void *); | ||
| 1403 | struct task_struct *(*next)(void *); | ||
| 1404 | }; | ||
| 1405 | |||
| 1406 | #ifdef CONFIG_SMP | ||
| 1407 | static unsigned long | ||
| 1408 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1409 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1410 | enum cpu_idle_type idle, int *all_pinned, | ||
| 1411 | int *this_best_prio, struct rq_iterator *iterator); | ||
| 1412 | |||
| 1413 | static int | ||
| 1414 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1415 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1416 | struct rq_iterator *iterator); | ||
| 1417 | #endif | ||
| 1418 | |||
| 1419 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 1374 | /* Time spent by the tasks of the cpu accounting group executing in ... */ |
| 1420 | enum cpuacct_stat_index { | 1375 | enum cpuacct_stat_index { |
| 1421 | CPUACCT_STAT_USER, /* ... user mode */ | 1376 | CPUACCT_STAT_USER, /* ... user mode */ |
| @@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type) | |||
| 1531 | 1486 | ||
| 1532 | static struct sched_group *group_of(int cpu) | 1487 | static struct sched_group *group_of(int cpu) |
| 1533 | { | 1488 | { |
| 1534 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | 1489 | struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); |
| 1535 | 1490 | ||
| 1536 | if (!sd) | 1491 | if (!sd) |
| 1537 | return NULL; | 1492 | return NULL; |
| @@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1566 | 1521 | ||
| 1567 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1522 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1568 | 1523 | ||
| 1569 | static __read_mostly unsigned long *update_shares_data; | 1524 | static __read_mostly unsigned long __percpu *update_shares_data; |
| 1570 | 1525 | ||
| 1571 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1526 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1572 | 1527 | ||
| @@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd) | |||
| 1701 | } | 1656 | } |
| 1702 | } | 1657 | } |
| 1703 | 1658 | ||
| 1704 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
| 1705 | { | ||
| 1706 | if (root_task_group_empty()) | ||
| 1707 | return; | ||
| 1708 | |||
| 1709 | raw_spin_unlock(&rq->lock); | ||
| 1710 | update_shares(sd); | ||
| 1711 | raw_spin_lock(&rq->lock); | ||
| 1712 | } | ||
| 1713 | |||
| 1714 | static void update_h_load(long cpu) | 1659 | static void update_h_load(long cpu) |
| 1715 | { | 1660 | { |
| 1716 | if (root_task_group_empty()) | 1661 | if (root_task_group_empty()) |
| @@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd) | |||
| 1725 | { | 1670 | { |
| 1726 | } | 1671 | } |
| 1727 | 1672 | ||
| 1728 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
| 1729 | { | ||
| 1730 | } | ||
| 1731 | |||
| 1732 | #endif | 1673 | #endif |
| 1733 | 1674 | ||
| 1734 | #ifdef CONFIG_PREEMPT | 1675 | #ifdef CONFIG_PREEMPT |
| @@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1805 | raw_spin_unlock(&busiest->lock); | 1746 | raw_spin_unlock(&busiest->lock); |
| 1806 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1747 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
| 1807 | } | 1748 | } |
| 1749 | |||
| 1750 | /* | ||
| 1751 | * double_rq_lock - safely lock two runqueues | ||
| 1752 | * | ||
| 1753 | * Note this does not disable interrupts like task_rq_lock, | ||
| 1754 | * you need to do so manually before calling. | ||
| 1755 | */ | ||
| 1756 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
| 1757 | __acquires(rq1->lock) | ||
| 1758 | __acquires(rq2->lock) | ||
| 1759 | { | ||
| 1760 | BUG_ON(!irqs_disabled()); | ||
| 1761 | if (rq1 == rq2) { | ||
| 1762 | raw_spin_lock(&rq1->lock); | ||
| 1763 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
| 1764 | } else { | ||
| 1765 | if (rq1 < rq2) { | ||
| 1766 | raw_spin_lock(&rq1->lock); | ||
| 1767 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
| 1768 | } else { | ||
| 1769 | raw_spin_lock(&rq2->lock); | ||
| 1770 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
| 1771 | } | ||
| 1772 | } | ||
| 1773 | update_rq_clock(rq1); | ||
| 1774 | update_rq_clock(rq2); | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | /* | ||
| 1778 | * double_rq_unlock - safely unlock two runqueues | ||
| 1779 | * | ||
| 1780 | * Note this does not restore interrupts like task_rq_unlock, | ||
| 1781 | * you need to do so manually after calling. | ||
| 1782 | */ | ||
| 1783 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
| 1784 | __releases(rq1->lock) | ||
| 1785 | __releases(rq2->lock) | ||
| 1786 | { | ||
| 1787 | raw_spin_unlock(&rq1->lock); | ||
| 1788 | if (rq1 != rq2) | ||
| 1789 | raw_spin_unlock(&rq2->lock); | ||
| 1790 | else | ||
| 1791 | __release(rq2->lock); | ||
| 1792 | } | ||
| 1793 | |||
| 1808 | #endif | 1794 | #endif |
| 1809 | 1795 | ||
| 1810 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1796 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1834 | #endif | 1820 | #endif |
| 1835 | } | 1821 | } |
| 1836 | 1822 | ||
| 1837 | #include "sched_stats.h" | 1823 | static const struct sched_class rt_sched_class; |
| 1838 | #include "sched_idletask.c" | ||
| 1839 | #include "sched_fair.c" | ||
| 1840 | #include "sched_rt.c" | ||
| 1841 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1842 | # include "sched_debug.c" | ||
| 1843 | #endif | ||
| 1844 | 1824 | ||
| 1845 | #define sched_class_highest (&rt_sched_class) | 1825 | #define sched_class_highest (&rt_sched_class) |
| 1846 | #define for_each_class(class) \ | 1826 | #define for_each_class(class) \ |
| 1847 | for (class = sched_class_highest; class; class = class->next) | 1827 | for (class = sched_class_highest; class; class = class->next) |
| 1848 | 1828 | ||
| 1829 | #include "sched_stats.h" | ||
| 1830 | |||
| 1849 | static void inc_nr_running(struct rq *rq) | 1831 | static void inc_nr_running(struct rq *rq) |
| 1850 | { | 1832 | { |
| 1851 | rq->nr_running++; | 1833 | rq->nr_running++; |
| @@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 1883 | *avg += diff >> 3; | 1865 | *avg += diff >> 3; |
| 1884 | } | 1866 | } |
| 1885 | 1867 | ||
| 1886 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1868 | static void |
| 1869 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
| 1887 | { | 1870 | { |
| 1888 | if (wakeup) | 1871 | if (wakeup) |
| 1889 | p->se.start_runtime = p->se.sum_exec_runtime; | 1872 | p->se.start_runtime = p->se.sum_exec_runtime; |
| 1890 | 1873 | ||
| 1891 | sched_info_queued(p); | 1874 | sched_info_queued(p); |
| 1892 | p->sched_class->enqueue_task(rq, p, wakeup); | 1875 | p->sched_class->enqueue_task(rq, p, wakeup, head); |
| 1893 | p->se.on_rq = 1; | 1876 | p->se.on_rq = 1; |
| 1894 | } | 1877 | } |
| 1895 | 1878 | ||
| @@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | |||
| 1912 | } | 1895 | } |
| 1913 | 1896 | ||
| 1914 | /* | 1897 | /* |
| 1898 | * activate_task - move a task to the runqueue. | ||
| 1899 | */ | ||
| 1900 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
| 1901 | { | ||
| 1902 | if (task_contributes_to_load(p)) | ||
| 1903 | rq->nr_uninterruptible--; | ||
| 1904 | |||
| 1905 | enqueue_task(rq, p, wakeup, false); | ||
| 1906 | inc_nr_running(rq); | ||
| 1907 | } | ||
| 1908 | |||
| 1909 | /* | ||
| 1910 | * deactivate_task - remove a task from the runqueue. | ||
| 1911 | */ | ||
| 1912 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
| 1913 | { | ||
| 1914 | if (task_contributes_to_load(p)) | ||
| 1915 | rq->nr_uninterruptible++; | ||
| 1916 | |||
| 1917 | dequeue_task(rq, p, sleep); | ||
| 1918 | dec_nr_running(rq); | ||
| 1919 | } | ||
| 1920 | |||
| 1921 | #include "sched_idletask.c" | ||
| 1922 | #include "sched_fair.c" | ||
| 1923 | #include "sched_rt.c" | ||
| 1924 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1925 | # include "sched_debug.c" | ||
| 1926 | #endif | ||
| 1927 | |||
| 1928 | /* | ||
| 1915 | * __normal_prio - return the priority that is based on the static prio | 1929 | * __normal_prio - return the priority that is based on the static prio |
| 1916 | */ | 1930 | */ |
| 1917 | static inline int __normal_prio(struct task_struct *p) | 1931 | static inline int __normal_prio(struct task_struct *p) |
| @@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p) | |||
| 1957 | return p->prio; | 1971 | return p->prio; |
| 1958 | } | 1972 | } |
| 1959 | 1973 | ||
| 1960 | /* | ||
| 1961 | * activate_task - move a task to the runqueue. | ||
| 1962 | */ | ||
| 1963 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | ||
| 1964 | { | ||
| 1965 | if (task_contributes_to_load(p)) | ||
| 1966 | rq->nr_uninterruptible--; | ||
| 1967 | |||
| 1968 | enqueue_task(rq, p, wakeup); | ||
| 1969 | inc_nr_running(rq); | ||
| 1970 | } | ||
| 1971 | |||
| 1972 | /* | ||
| 1973 | * deactivate_task - remove a task from the runqueue. | ||
| 1974 | */ | ||
| 1975 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | ||
| 1976 | { | ||
| 1977 | if (task_contributes_to_load(p)) | ||
| 1978 | rq->nr_uninterruptible++; | ||
| 1979 | |||
| 1980 | dequeue_task(rq, p, sleep); | ||
| 1981 | dec_nr_running(rq); | ||
| 1982 | } | ||
| 1983 | |||
| 1984 | /** | 1974 | /** |
| 1985 | * task_curr - is this task currently executing on a CPU? | 1975 | * task_curr - is this task currently executing on a CPU? |
| 1986 | * @p: the task in question. | 1976 | * @p: the task in question. |
| @@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2408 | __task_rq_unlock(rq); | 2398 | __task_rq_unlock(rq); |
| 2409 | 2399 | ||
| 2410 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2400 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
| 2411 | if (cpu != orig_cpu) | 2401 | if (cpu != orig_cpu) { |
| 2402 | /* | ||
| 2403 | * Since we migrate the task without holding any rq->lock, | ||
| 2404 | * we need to be careful with task_rq_lock(), since that | ||
| 2405 | * might end up locking an invalid rq. | ||
| 2406 | */ | ||
| 2412 | set_task_cpu(p, cpu); | 2407 | set_task_cpu(p, cpu); |
| 2408 | } | ||
| 2413 | 2409 | ||
| 2414 | rq = __task_rq_lock(p); | 2410 | rq = cpu_rq(cpu); |
| 2411 | raw_spin_lock(&rq->lock); | ||
| 2415 | update_rq_clock(rq); | 2412 | update_rq_clock(rq); |
| 2416 | 2413 | ||
| 2414 | /* | ||
| 2415 | * We migrated the task without holding either rq->lock, however | ||
| 2416 | * since the task is not on the task list itself, nobody else | ||
| 2417 | * will try and migrate the task, hence the rq should match the | ||
| 2418 | * cpu we just moved it to. | ||
| 2419 | */ | ||
| 2420 | WARN_ON(task_cpu(p) != cpu); | ||
| 2417 | WARN_ON(p->state != TASK_WAKING); | 2421 | WARN_ON(p->state != TASK_WAKING); |
| 2418 | cpu = task_cpu(p); | ||
| 2419 | 2422 | ||
| 2420 | #ifdef CONFIG_SCHEDSTATS | 2423 | #ifdef CONFIG_SCHEDSTATS |
| 2421 | schedstat_inc(rq, ttwu_count); | 2424 | schedstat_inc(rq, ttwu_count); |
| @@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2663 | set_task_cpu(p, cpu); | 2666 | set_task_cpu(p, cpu); |
| 2664 | #endif | 2667 | #endif |
| 2665 | 2668 | ||
| 2666 | rq = task_rq_lock(p, &flags); | 2669 | /* |
| 2670 | * Since the task is not on the rq and we still have TASK_WAKING set | ||
| 2671 | * nobody else will migrate this task. | ||
| 2672 | */ | ||
| 2673 | rq = cpu_rq(cpu); | ||
| 2674 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 2675 | |||
| 2667 | BUG_ON(p->state != TASK_WAKING); | 2676 | BUG_ON(p->state != TASK_WAKING); |
| 2668 | p->state = TASK_RUNNING; | 2677 | p->state = TASK_RUNNING; |
| 2669 | update_rq_clock(rq); | 2678 | update_rq_clock(rq); |
| @@ -2794,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2794 | */ | 2803 | */ |
| 2795 | prev_state = prev->state; | 2804 | prev_state = prev->state; |
| 2796 | finish_arch_switch(prev); | 2805 | finish_arch_switch(prev); |
| 2797 | perf_event_task_sched_in(current, cpu_of(rq)); | 2806 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
| 2807 | local_irq_disable(); | ||
| 2808 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 2809 | perf_event_task_sched_in(current); | ||
| 2810 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 2811 | local_irq_enable(); | ||
| 2812 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 2798 | finish_lock_switch(rq, prev); | 2813 | finish_lock_switch(rq, prev); |
| 2799 | 2814 | ||
| 2800 | fire_sched_in_preempt_notifiers(current); | 2815 | fire_sched_in_preempt_notifiers(current); |
| @@ -3099,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq) | |||
| 3099 | #ifdef CONFIG_SMP | 3114 | #ifdef CONFIG_SMP |
| 3100 | 3115 | ||
| 3101 | /* | 3116 | /* |
| 3102 | * double_rq_lock - safely lock two runqueues | ||
| 3103 | * | ||
| 3104 | * Note this does not disable interrupts like task_rq_lock, | ||
| 3105 | * you need to do so manually before calling. | ||
| 3106 | */ | ||
| 3107 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
| 3108 | __acquires(rq1->lock) | ||
| 3109 | __acquires(rq2->lock) | ||
| 3110 | { | ||
| 3111 | BUG_ON(!irqs_disabled()); | ||
| 3112 | if (rq1 == rq2) { | ||
| 3113 | raw_spin_lock(&rq1->lock); | ||
| 3114 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
| 3115 | } else { | ||
| 3116 | if (rq1 < rq2) { | ||
| 3117 | raw_spin_lock(&rq1->lock); | ||
| 3118 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
| 3119 | } else { | ||
| 3120 | raw_spin_lock(&rq2->lock); | ||
| 3121 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
| 3122 | } | ||
| 3123 | } | ||
| 3124 | update_rq_clock(rq1); | ||
| 3125 | update_rq_clock(rq2); | ||
| 3126 | } | ||
| 3127 | |||
| 3128 | /* | ||
| 3129 | * double_rq_unlock - safely unlock two runqueues | ||
| 3130 | * | ||
| 3131 | * Note this does not restore interrupts like task_rq_unlock, | ||
| 3132 | * you need to do so manually after calling. | ||
| 3133 | */ | ||
| 3134 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
| 3135 | __releases(rq1->lock) | ||
| 3136 | __releases(rq2->lock) | ||
| 3137 | { | ||
| 3138 | raw_spin_unlock(&rq1->lock); | ||
| 3139 | if (rq1 != rq2) | ||
| 3140 | raw_spin_unlock(&rq2->lock); | ||
| 3141 | else | ||
| 3142 | __release(rq2->lock); | ||
| 3143 | } | ||
| 3144 | |||
| 3145 | /* | ||
| 3146 | * sched_exec - execve() is a valuable balancing opportunity, because at | 3117 | * sched_exec - execve() is a valuable balancing opportunity, because at |
| 3147 | * this point the task has the smallest effective memory and cache footprint. | 3118 | * this point the task has the smallest effective memory and cache footprint. |
| 3148 | */ | 3119 | */ |
| @@ -3190,1771 +3161,6 @@ again: | |||
| 3190 | task_rq_unlock(rq, &flags); | 3161 | task_rq_unlock(rq, &flags); |
| 3191 | } | 3162 | } |
| 3192 | 3163 | ||
| 3193 | /* | ||
| 3194 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
| 3195 | * Both runqueues must be locked. | ||
| 3196 | */ | ||
| 3197 | static void pull_task(struct rq *src_rq, struct task_struct *p, | ||
| 3198 | struct rq *this_rq, int this_cpu) | ||
| 3199 | { | ||
| 3200 | deactivate_task(src_rq, p, 0); | ||
| 3201 | set_task_cpu(p, this_cpu); | ||
| 3202 | activate_task(this_rq, p, 0); | ||
| 3203 | check_preempt_curr(this_rq, p, 0); | ||
| 3204 | } | ||
| 3205 | |||
| 3206 | /* | ||
| 3207 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
| 3208 | */ | ||
| 3209 | static | ||
| 3210 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
| 3211 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 3212 | int *all_pinned) | ||
| 3213 | { | ||
| 3214 | int tsk_cache_hot = 0; | ||
| 3215 | /* | ||
| 3216 | * We do not migrate tasks that are: | ||
| 3217 | * 1) running (obviously), or | ||
| 3218 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
| 3219 | * 3) are cache-hot on their current CPU. | ||
| 3220 | */ | ||
| 3221 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
| 3222 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
| 3223 | return 0; | ||
| 3224 | } | ||
| 3225 | *all_pinned = 0; | ||
| 3226 | |||
| 3227 | if (task_running(rq, p)) { | ||
| 3228 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
| 3229 | return 0; | ||
| 3230 | } | ||
| 3231 | |||
| 3232 | /* | ||
| 3233 | * Aggressive migration if: | ||
| 3234 | * 1) task is cache cold, or | ||
| 3235 | * 2) too many balance attempts have failed. | ||
| 3236 | */ | ||
| 3237 | |||
| 3238 | tsk_cache_hot = task_hot(p, rq->clock, sd); | ||
| 3239 | if (!tsk_cache_hot || | ||
| 3240 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
| 3241 | #ifdef CONFIG_SCHEDSTATS | ||
| 3242 | if (tsk_cache_hot) { | ||
| 3243 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 3244 | schedstat_inc(p, se.nr_forced_migrations); | ||
| 3245 | } | ||
| 3246 | #endif | ||
| 3247 | return 1; | ||
| 3248 | } | ||
| 3249 | |||
| 3250 | if (tsk_cache_hot) { | ||
| 3251 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
| 3252 | return 0; | ||
| 3253 | } | ||
| 3254 | return 1; | ||
| 3255 | } | ||
| 3256 | |||
| 3257 | static unsigned long | ||
| 3258 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 3259 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 3260 | enum cpu_idle_type idle, int *all_pinned, | ||
| 3261 | int *this_best_prio, struct rq_iterator *iterator) | ||
| 3262 | { | ||
| 3263 | int loops = 0, pulled = 0, pinned = 0; | ||
| 3264 | struct task_struct *p; | ||
| 3265 | long rem_load_move = max_load_move; | ||
| 3266 | |||
| 3267 | if (max_load_move == 0) | ||
| 3268 | goto out; | ||
| 3269 | |||
| 3270 | pinned = 1; | ||
| 3271 | |||
| 3272 | /* | ||
| 3273 | * Start the load-balancing iterator: | ||
| 3274 | */ | ||
| 3275 | p = iterator->start(iterator->arg); | ||
| 3276 | next: | ||
| 3277 | if (!p || loops++ > sysctl_sched_nr_migrate) | ||
| 3278 | goto out; | ||
| 3279 | |||
| 3280 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
| 3281 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
| 3282 | p = iterator->next(iterator->arg); | ||
| 3283 | goto next; | ||
| 3284 | } | ||
| 3285 | |||
| 3286 | pull_task(busiest, p, this_rq, this_cpu); | ||
| 3287 | pulled++; | ||
| 3288 | rem_load_move -= p->se.load.weight; | ||
| 3289 | |||
| 3290 | #ifdef CONFIG_PREEMPT | ||
| 3291 | /* | ||
| 3292 | * NEWIDLE balancing is a source of latency, so preemptible kernels | ||
| 3293 | * will stop after the first task is pulled to minimize the critical | ||
| 3294 | * section. | ||
| 3295 | */ | ||
| 3296 | if (idle == CPU_NEWLY_IDLE) | ||
| 3297 | goto out; | ||
| 3298 | #endif | ||
| 3299 | |||
| 3300 | /* | ||
| 3301 | * We only want to steal up to the prescribed amount of weighted load. | ||
| 3302 | */ | ||
| 3303 | if (rem_load_move > 0) { | ||
| 3304 | if (p->prio < *this_best_prio) | ||
| 3305 | *this_best_prio = p->prio; | ||
| 3306 | p = iterator->next(iterator->arg); | ||
| 3307 | goto next; | ||
| 3308 | } | ||
| 3309 | out: | ||
| 3310 | /* | ||
| 3311 | * Right now, this is one of only two places pull_task() is called, | ||
| 3312 | * so we can safely collect pull_task() stats here rather than | ||
| 3313 | * inside pull_task(). | ||
| 3314 | */ | ||
| 3315 | schedstat_add(sd, lb_gained[idle], pulled); | ||
| 3316 | |||
| 3317 | if (all_pinned) | ||
| 3318 | *all_pinned = pinned; | ||
| 3319 | |||
| 3320 | return max_load_move - rem_load_move; | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | /* | ||
| 3324 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
| 3325 | * this_rq, as part of a balancing operation within domain "sd". | ||
| 3326 | * Returns 1 if successful and 0 otherwise. | ||
| 3327 | * | ||
| 3328 | * Called with both runqueues locked. | ||
| 3329 | */ | ||
| 3330 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 3331 | unsigned long max_load_move, | ||
| 3332 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 3333 | int *all_pinned) | ||
| 3334 | { | ||
| 3335 | const struct sched_class *class = sched_class_highest; | ||
| 3336 | unsigned long total_load_moved = 0; | ||
| 3337 | int this_best_prio = this_rq->curr->prio; | ||
| 3338 | |||
| 3339 | do { | ||
| 3340 | total_load_moved += | ||
| 3341 | class->load_balance(this_rq, this_cpu, busiest, | ||
| 3342 | max_load_move - total_load_moved, | ||
| 3343 | sd, idle, all_pinned, &this_best_prio); | ||
| 3344 | class = class->next; | ||
| 3345 | |||
| 3346 | #ifdef CONFIG_PREEMPT | ||
| 3347 | /* | ||
| 3348 | * NEWIDLE balancing is a source of latency, so preemptible | ||
| 3349 | * kernels will stop after the first task is pulled to minimize | ||
| 3350 | * the critical section. | ||
| 3351 | */ | ||
| 3352 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
| 3353 | break; | ||
| 3354 | #endif | ||
| 3355 | } while (class && max_load_move > total_load_moved); | ||
| 3356 | |||
| 3357 | return total_load_moved > 0; | ||
| 3358 | } | ||
| 3359 | |||
| 3360 | static int | ||
| 3361 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 3362 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 3363 | struct rq_iterator *iterator) | ||
| 3364 | { | ||
| 3365 | struct task_struct *p = iterator->start(iterator->arg); | ||
| 3366 | int pinned = 0; | ||
| 3367 | |||
| 3368 | while (p) { | ||
| 3369 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | ||
| 3370 | pull_task(busiest, p, this_rq, this_cpu); | ||
| 3371 | /* | ||
| 3372 | * Right now, this is only the second place pull_task() | ||
| 3373 | * is called, so we can safely collect pull_task() | ||
| 3374 | * stats here rather than inside pull_task(). | ||
| 3375 | */ | ||
| 3376 | schedstat_inc(sd, lb_gained[idle]); | ||
| 3377 | |||
| 3378 | return 1; | ||
| 3379 | } | ||
| 3380 | p = iterator->next(iterator->arg); | ||
| 3381 | } | ||
| 3382 | |||
| 3383 | return 0; | ||
| 3384 | } | ||
| 3385 | |||
| 3386 | /* | ||
| 3387 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
| 3388 | * part of active balancing operations within "domain". | ||
| 3389 | * Returns 1 if successful and 0 otherwise. | ||
| 3390 | * | ||
| 3391 | * Called with both runqueues locked. | ||
| 3392 | */ | ||
| 3393 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 3394 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
| 3395 | { | ||
| 3396 | const struct sched_class *class; | ||
| 3397 | |||
| 3398 | for_each_class(class) { | ||
| 3399 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | ||
| 3400 | return 1; | ||
| 3401 | } | ||
| 3402 | |||
| 3403 | return 0; | ||
| 3404 | } | ||
| 3405 | /********** Helpers for find_busiest_group ************************/ | ||
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		during load balancing.
 *
 * find_busiest_group() zeroes one of these and update_sd_lb_stats() fills
 * it in while walking the groups of the domain.
 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;  /* Local group in this sd */
	unsigned long total_load;  /* Total load of all groups in sd */
	unsigned long total_pwr;   /* Total power of all groups in sd */
	unsigned long avg_load;	   /* Average load across all groups in sd */

	/* Statistics of this (local) group */
	unsigned long this_load;	/* avg_load of the local group */
	/*
	 * Filled with the local group's summed weighted load; only divided
	 * by this_nr_running later, in fix_small_imbalance().
	 */
	unsigned long this_load_per_task;
	unsigned long this_nr_running;	/* Nr tasks running in the local group */

	/* Statistics of the busiest group */
	unsigned long max_load;		/* avg_load of the busiest group */
	/*
	 * Filled with the busiest group's summed weighted load; only divided
	 * by busiest_nr_running later, in find_busiest_group().
	 */
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running; /* Nr tasks running in busiest */

	int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance; /* Is powersave balance needed for this sd */
	struct sched_group *group_min; /* Least loaded group in sd */
	struct sched_group *group_leader; /* Group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* Nr running of group_leader */
	unsigned long min_nr_running; /* Nr running of group_min */
#endif
};
| 3437 | |||
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 *
 * update_sd_lb_stats() zeroes one of these per group and
 * update_sg_lb_stats() fills it in.
 */
struct sg_lb_stats {
	unsigned long avg_load; /* group_load scaled by the group's cpu_power */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	/* cpu_power / SCHED_LOAD_SCALE, rounded to the closest integer */
	unsigned long group_capacity;
	int group_imb; /* Is there an imbalance in the group ? */
};
| 3449 | |||
/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 *
 * The result is whatever cpumask_first() yields for the group's cpumask.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
| 3458 | |||
| 3459 | /** | ||
| 3460 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
| 3461 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
| 3462 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | ||
| 3463 | */ | ||
| 3464 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
| 3465 | enum cpu_idle_type idle) | ||
| 3466 | { | ||
| 3467 | int load_idx; | ||
| 3468 | |||
| 3469 | switch (idle) { | ||
| 3470 | case CPU_NOT_IDLE: | ||
| 3471 | load_idx = sd->busy_idx; | ||
| 3472 | break; | ||
| 3473 | |||
| 3474 | case CPU_NEWLY_IDLE: | ||
| 3475 | load_idx = sd->newidle_idx; | ||
| 3476 | break; | ||
| 3477 | default: | ||
| 3478 | load_idx = sd->idle_idx; | ||
| 3479 | break; | ||
| 3480 | } | ||
| 3481 | |||
| 3482 | return load_idx; | ||
| 3483 | } | ||
| 3484 | |||
| 3485 | |||
| 3486 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 3487 | /** | ||
| 3488 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
| 3489 | * the given sched_domain, during load balancing. | ||
| 3490 | * | ||
| 3491 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
| 3492 | * @sds: Variable containing the statistics for sd. | ||
| 3493 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
| 3494 | */ | ||
| 3495 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
| 3496 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
| 3497 | { | ||
| 3498 | /* | ||
| 3499 | * Busy processors will not participate in power savings | ||
| 3500 | * balance. | ||
| 3501 | */ | ||
| 3502 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 3503 | sds->power_savings_balance = 0; | ||
| 3504 | else { | ||
| 3505 | sds->power_savings_balance = 1; | ||
| 3506 | sds->min_nr_running = ULONG_MAX; | ||
| 3507 | sds->leader_nr_running = 0; | ||
| 3508 | } | ||
| 3509 | } | ||
| 3510 | |||
| 3511 | /** | ||
| 3512 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
| 3513 | * sched_domain while performing load balancing. | ||
| 3514 | * | ||
| 3515 | * @group: sched_group belonging to the sched_domain under consideration. | ||
| 3516 | * @sds: Variable containing the statistics of the sched_domain | ||
| 3517 | * @local_group: Does group contain the CPU for which we're performing | ||
| 3518 | * load balancing ? | ||
| 3519 | * @sgs: Variable containing the statistics of the group. | ||
| 3520 | */ | ||
| 3521 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
| 3522 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
| 3523 | { | ||
| 3524 | |||
| 3525 | if (!sds->power_savings_balance) | ||
| 3526 | return; | ||
| 3527 | |||
| 3528 | /* | ||
| 3529 | * If the local group is idle or completely loaded | ||
| 3530 | * no need to do power savings balance at this domain | ||
| 3531 | */ | ||
| 3532 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
| 3533 | !sds->this_nr_running)) | ||
| 3534 | sds->power_savings_balance = 0; | ||
| 3535 | |||
| 3536 | /* | ||
| 3537 | * If a group is already running at full capacity or idle, | ||
| 3538 | * don't include that group in power savings calculations | ||
| 3539 | */ | ||
| 3540 | if (!sds->power_savings_balance || | ||
| 3541 | sgs->sum_nr_running >= sgs->group_capacity || | ||
| 3542 | !sgs->sum_nr_running) | ||
| 3543 | return; | ||
| 3544 | |||
| 3545 | /* | ||
| 3546 | * Calculate the group which has the least non-idle load. | ||
| 3547 | * This is the group from where we need to pick up the load | ||
| 3548 | * for saving power | ||
| 3549 | */ | ||
| 3550 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
| 3551 | (sgs->sum_nr_running == sds->min_nr_running && | ||
| 3552 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
| 3553 | sds->group_min = group; | ||
| 3554 | sds->min_nr_running = sgs->sum_nr_running; | ||
| 3555 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
| 3556 | sgs->sum_nr_running; | ||
| 3557 | } | ||
| 3558 | |||
| 3559 | /* | ||
| 3560 | * Calculate the group which is almost near its | ||
| 3561 | * capacity but still has some space to pick up some load | ||
| 3562 | * from other group and save more power | ||
| 3563 | */ | ||
| 3564 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
| 3565 | return; | ||
| 3566 | |||
| 3567 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
| 3568 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
| 3569 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
| 3570 | sds->group_leader = group; | ||
| 3571 | sds->leader_nr_running = sgs->sum_nr_running; | ||
| 3572 | } | ||
| 3573 | } | ||
| 3574 | |||
| 3575 | /** | ||
| 3576 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
| 3577 | * @sds: Variable containing the statistics of the sched_domain | ||
| 3578 | * under consideration. | ||
| 3579 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
| 3580 | * @imbalance: Variable to store the imbalance. | ||
| 3581 | * | ||
| 3582 | * Description: | ||
| 3583 | * Check if we have potential to perform some power-savings balance. | ||
| 3584 | * If yes, set the busiest group to be the least loaded group in the | ||
| 3585 | * sched_domain, so that it's CPUs can be put to idle. | ||
| 3586 | * | ||
| 3587 | * Returns 1 if there is potential to perform power-savings balance. | ||
| 3588 | * Else returns 0. | ||
| 3589 | */ | ||
| 3590 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
| 3591 | int this_cpu, unsigned long *imbalance) | ||
| 3592 | { | ||
| 3593 | if (!sds->power_savings_balance) | ||
| 3594 | return 0; | ||
| 3595 | |||
| 3596 | if (sds->this != sds->group_leader || | ||
| 3597 | sds->group_leader == sds->group_min) | ||
| 3598 | return 0; | ||
| 3599 | |||
| 3600 | *imbalance = sds->min_load_per_task; | ||
| 3601 | sds->busiest = sds->group_min; | ||
| 3602 | |||
| 3603 | return 1; | ||
| 3604 | |||
| 3605 | } | ||
| 3606 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
| 3607 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
| 3608 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
| 3609 | { | ||
| 3610 | return; | ||
| 3611 | } | ||
| 3612 | |||
/* Without SCHED_MC/SCHED_SMT there are no power-savings stats to update. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
| 3618 | |||
/*
 * Without SCHED_MC/SCHED_SMT power-savings balance is never possible:
 * always report 0 and leave @sds and @imbalance untouched.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	const int no_power_savings = 0;

	return no_power_savings;
}
| 3624 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
| 3625 | |||
| 3626 | |||
| 3627 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 3628 | { | ||
| 3629 | return SCHED_LOAD_SCALE; | ||
| 3630 | } | ||
| 3631 | |||
| 3632 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 3633 | { | ||
| 3634 | return default_scale_freq_power(sd, cpu); | ||
| 3635 | } | ||
| 3636 | |||
| 3637 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3638 | { | ||
| 3639 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3640 | unsigned long smt_gain = sd->smt_gain; | ||
| 3641 | |||
| 3642 | smt_gain /= weight; | ||
| 3643 | |||
| 3644 | return smt_gain; | ||
| 3645 | } | ||
| 3646 | |||
| 3647 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3648 | { | ||
| 3649 | return default_scale_smt_power(sd, cpu); | ||
| 3650 | } | ||
| 3651 | |||
| 3652 | unsigned long scale_rt_power(int cpu) | ||
| 3653 | { | ||
| 3654 | struct rq *rq = cpu_rq(cpu); | ||
| 3655 | u64 total, available; | ||
| 3656 | |||
| 3657 | sched_avg_update(rq); | ||
| 3658 | |||
| 3659 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
| 3660 | available = total - rq->rt_avg; | ||
| 3661 | |||
| 3662 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
| 3663 | total = SCHED_LOAD_SCALE; | ||
| 3664 | |||
| 3665 | total >>= SCHED_LOAD_SHIFT; | ||
| 3666 | |||
| 3667 | return div_u64(available, total); | ||
| 3668 | } | ||
| 3669 | |||
| 3670 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
| 3671 | { | ||
| 3672 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3673 | unsigned long power = SCHED_LOAD_SCALE; | ||
| 3674 | struct sched_group *sdg = sd->groups; | ||
| 3675 | |||
| 3676 | if (sched_feat(ARCH_POWER)) | ||
| 3677 | power *= arch_scale_freq_power(sd, cpu); | ||
| 3678 | else | ||
| 3679 | power *= default_scale_freq_power(sd, cpu); | ||
| 3680 | |||
| 3681 | power >>= SCHED_LOAD_SHIFT; | ||
| 3682 | |||
| 3683 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 3684 | if (sched_feat(ARCH_POWER)) | ||
| 3685 | power *= arch_scale_smt_power(sd, cpu); | ||
| 3686 | else | ||
| 3687 | power *= default_scale_smt_power(sd, cpu); | ||
| 3688 | |||
| 3689 | power >>= SCHED_LOAD_SHIFT; | ||
| 3690 | } | ||
| 3691 | |||
| 3692 | power *= scale_rt_power(cpu); | ||
| 3693 | power >>= SCHED_LOAD_SHIFT; | ||
| 3694 | |||
| 3695 | if (!power) | ||
| 3696 | power = 1; | ||
| 3697 | |||
| 3698 | sdg->cpu_power = power; | ||
| 3699 | } | ||
| 3700 | |||
| 3701 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
| 3702 | { | ||
| 3703 | struct sched_domain *child = sd->child; | ||
| 3704 | struct sched_group *group, *sdg = sd->groups; | ||
| 3705 | unsigned long power; | ||
| 3706 | |||
| 3707 | if (!child) { | ||
| 3708 | update_cpu_power(sd, cpu); | ||
| 3709 | return; | ||
| 3710 | } | ||
| 3711 | |||
| 3712 | power = 0; | ||
| 3713 | |||
| 3714 | group = child->groups; | ||
| 3715 | do { | ||
| 3716 | power += group->cpu_power; | ||
| 3717 | group = group->next; | ||
| 3718 | } while (group != child->groups); | ||
| 3719 | |||
| 3720 | sdg->cpu_power = power; | ||
| 3721 | } | ||
| 3722 | |||
| 3723 | /** | ||
| 3724 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
| 3725 | * @sd: The sched_domain whose statistics are to be updated. | ||
| 3726 | * @group: sched_group whose statistics are to be updated. | ||
| 3727 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3728 | * @idle: Idle status of this_cpu | ||
| 3729 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
| 3730 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 3731 | * @local_group: Does group contain this_cpu. | ||
| 3732 | * @cpus: Set of cpus considered for load balancing. | ||
| 3733 | * @balance: Should we balance. | ||
| 3734 | * @sgs: variable to hold the statistics for this group. | ||
| 3735 | */ | ||
| 3736 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
| 3737 | struct sched_group *group, int this_cpu, | ||
| 3738 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
| 3739 | int local_group, const struct cpumask *cpus, | ||
| 3740 | int *balance, struct sg_lb_stats *sgs) | ||
| 3741 | { | ||
| 3742 | unsigned long load, max_cpu_load, min_cpu_load; | ||
| 3743 | int i; | ||
| 3744 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 3745 | unsigned long sum_avg_load_per_task; | ||
| 3746 | unsigned long avg_load_per_task; | ||
| 3747 | |||
| 3748 | if (local_group) { | ||
| 3749 | balance_cpu = group_first_cpu(group); | ||
| 3750 | if (balance_cpu == this_cpu) | ||
| 3751 | update_group_power(sd, this_cpu); | ||
| 3752 | } | ||
| 3753 | |||
| 3754 | /* Tally up the load of all CPUs in the group */ | ||
| 3755 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
| 3756 | max_cpu_load = 0; | ||
| 3757 | min_cpu_load = ~0UL; | ||
| 3758 | |||
| 3759 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
| 3760 | struct rq *rq = cpu_rq(i); | ||
| 3761 | |||
| 3762 | if (*sd_idle && rq->nr_running) | ||
| 3763 | *sd_idle = 0; | ||
| 3764 | |||
| 3765 | /* Bias balancing toward cpus of our domain */ | ||
| 3766 | if (local_group) { | ||
| 3767 | if (idle_cpu(i) && !first_idle_cpu) { | ||
| 3768 | first_idle_cpu = 1; | ||
| 3769 | balance_cpu = i; | ||
| 3770 | } | ||
| 3771 | |||
| 3772 | load = target_load(i, load_idx); | ||
| 3773 | } else { | ||
| 3774 | load = source_load(i, load_idx); | ||
| 3775 | if (load > max_cpu_load) | ||
| 3776 | max_cpu_load = load; | ||
| 3777 | if (min_cpu_load > load) | ||
| 3778 | min_cpu_load = load; | ||
| 3779 | } | ||
| 3780 | |||
| 3781 | sgs->group_load += load; | ||
| 3782 | sgs->sum_nr_running += rq->nr_running; | ||
| 3783 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
| 3784 | |||
| 3785 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
| 3786 | } | ||
| 3787 | |||
| 3788 | /* | ||
| 3789 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 3790 | * is eligible for doing load balancing at this and above | ||
| 3791 | * domains. In the newly idle case, we will allow all the cpu's | ||
| 3792 | * to do the newly idle load balance. | ||
| 3793 | */ | ||
| 3794 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
| 3795 | balance_cpu != this_cpu && balance) { | ||
| 3796 | *balance = 0; | ||
| 3797 | return; | ||
| 3798 | } | ||
| 3799 | |||
| 3800 | /* Adjust by relative CPU power of the group */ | ||
| 3801 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
| 3802 | |||
| 3803 | |||
| 3804 | /* | ||
| 3805 | * Consider the group unbalanced when the imbalance is larger | ||
| 3806 | * than the average weight of two tasks. | ||
| 3807 | * | ||
| 3808 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 3809 | * might not be a suitable number - should we keep a | ||
| 3810 | * normalized nr_running number somewhere that negates | ||
| 3811 | * the hierarchy? | ||
| 3812 | */ | ||
| 3813 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / | ||
| 3814 | group->cpu_power; | ||
| 3815 | |||
| 3816 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
| 3817 | sgs->group_imb = 1; | ||
| 3818 | |||
| 3819 | sgs->group_capacity = | ||
| 3820 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
| 3821 | } | ||
| 3822 | |||
| 3823 | /** | ||
| 3824 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | ||
| 3825 | * @sd: sched_domain whose statistics are to be updated. | ||
| 3826 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3827 | * @idle: Idle status of this_cpu | ||
| 3828 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 3829 | * @cpus: Set of cpus considered for load balancing. | ||
| 3830 | * @balance: Should we balance. | ||
| 3831 | * @sds: variable to hold the statistics for this sched_domain. | ||
| 3832 | */ | ||
| 3833 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
| 3834 | enum cpu_idle_type idle, int *sd_idle, | ||
| 3835 | const struct cpumask *cpus, int *balance, | ||
| 3836 | struct sd_lb_stats *sds) | ||
| 3837 | { | ||
| 3838 | struct sched_domain *child = sd->child; | ||
| 3839 | struct sched_group *group = sd->groups; | ||
| 3840 | struct sg_lb_stats sgs; | ||
| 3841 | int load_idx, prefer_sibling = 0; | ||
| 3842 | |||
| 3843 | if (child && child->flags & SD_PREFER_SIBLING) | ||
| 3844 | prefer_sibling = 1; | ||
| 3845 | |||
| 3846 | init_sd_power_savings_stats(sd, sds, idle); | ||
| 3847 | load_idx = get_sd_load_idx(sd, idle); | ||
| 3848 | |||
| 3849 | do { | ||
| 3850 | int local_group; | ||
| 3851 | |||
| 3852 | local_group = cpumask_test_cpu(this_cpu, | ||
| 3853 | sched_group_cpus(group)); | ||
| 3854 | memset(&sgs, 0, sizeof(sgs)); | ||
| 3855 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
| 3856 | local_group, cpus, balance, &sgs); | ||
| 3857 | |||
| 3858 | if (local_group && balance && !(*balance)) | ||
| 3859 | return; | ||
| 3860 | |||
| 3861 | sds->total_load += sgs.group_load; | ||
| 3862 | sds->total_pwr += group->cpu_power; | ||
| 3863 | |||
| 3864 | /* | ||
| 3865 | * In case the child domain prefers tasks go to siblings | ||
| 3866 | * first, lower the group capacity to one so that we'll try | ||
| 3867 | * and move all the excess tasks away. | ||
| 3868 | */ | ||
| 3869 | if (prefer_sibling) | ||
| 3870 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
| 3871 | |||
| 3872 | if (local_group) { | ||
| 3873 | sds->this_load = sgs.avg_load; | ||
| 3874 | sds->this = group; | ||
| 3875 | sds->this_nr_running = sgs.sum_nr_running; | ||
| 3876 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
| 3877 | } else if (sgs.avg_load > sds->max_load && | ||
| 3878 | (sgs.sum_nr_running > sgs.group_capacity || | ||
| 3879 | sgs.group_imb)) { | ||
| 3880 | sds->max_load = sgs.avg_load; | ||
| 3881 | sds->busiest = group; | ||
| 3882 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
| 3883 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
| 3884 | sds->group_imb = sgs.group_imb; | ||
| 3885 | } | ||
| 3886 | |||
| 3887 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
| 3888 | group = group->next; | ||
| 3889 | } while (group != sd->groups); | ||
| 3890 | } | ||
| 3891 | |||
| 3892 | /** | ||
| 3893 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
| 3894 | * amongst the groups of a sched_domain, during | ||
| 3895 | * load balancing. | ||
| 3896 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
| 3897 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3898 | * @imbalance: Variable to store the imbalance. | ||
| 3899 | */ | ||
| 3900 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
| 3901 | int this_cpu, unsigned long *imbalance) | ||
| 3902 | { | ||
| 3903 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
| 3904 | unsigned int imbn = 2; | ||
| 3905 | |||
| 3906 | if (sds->this_nr_running) { | ||
| 3907 | sds->this_load_per_task /= sds->this_nr_running; | ||
| 3908 | if (sds->busiest_load_per_task > | ||
| 3909 | sds->this_load_per_task) | ||
| 3910 | imbn = 1; | ||
| 3911 | } else | ||
| 3912 | sds->this_load_per_task = | ||
| 3913 | cpu_avg_load_per_task(this_cpu); | ||
| 3914 | |||
| 3915 | if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= | ||
| 3916 | sds->busiest_load_per_task * imbn) { | ||
| 3917 | *imbalance = sds->busiest_load_per_task; | ||
| 3918 | return; | ||
| 3919 | } | ||
| 3920 | |||
| 3921 | /* | ||
| 3922 | * OK, we don't have enough imbalance to justify moving tasks, | ||
| 3923 | * however we may be able to increase total CPU power used by | ||
| 3924 | * moving them. | ||
| 3925 | */ | ||
| 3926 | |||
| 3927 | pwr_now += sds->busiest->cpu_power * | ||
| 3928 | min(sds->busiest_load_per_task, sds->max_load); | ||
| 3929 | pwr_now += sds->this->cpu_power * | ||
| 3930 | min(sds->this_load_per_task, sds->this_load); | ||
| 3931 | pwr_now /= SCHED_LOAD_SCALE; | ||
| 3932 | |||
| 3933 | /* Amount of load we'd subtract */ | ||
| 3934 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
| 3935 | sds->busiest->cpu_power; | ||
| 3936 | if (sds->max_load > tmp) | ||
| 3937 | pwr_move += sds->busiest->cpu_power * | ||
| 3938 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
| 3939 | |||
| 3940 | /* Amount of load we'd add */ | ||
| 3941 | if (sds->max_load * sds->busiest->cpu_power < | ||
| 3942 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
| 3943 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
| 3944 | sds->this->cpu_power; | ||
| 3945 | else | ||
| 3946 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
| 3947 | sds->this->cpu_power; | ||
| 3948 | pwr_move += sds->this->cpu_power * | ||
| 3949 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
| 3950 | pwr_move /= SCHED_LOAD_SCALE; | ||
| 3951 | |||
| 3952 | /* Move if we gain throughput */ | ||
| 3953 | if (pwr_move > pwr_now) | ||
| 3954 | *imbalance = sds->busiest_load_per_task; | ||
| 3955 | } | ||
| 3956 | |||
| 3957 | /** | ||
| 3958 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
| 3959 | * groups of a given sched_domain during load balance. | ||
| 3960 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
| 3961 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
| 3962 | * @imbalance: The variable to store the imbalance. | ||
| 3963 | */ | ||
| 3964 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
| 3965 | unsigned long *imbalance) | ||
| 3966 | { | ||
| 3967 | unsigned long max_pull; | ||
| 3968 | /* | ||
| 3969 | * In the presence of smp nice balancing, certain scenarios can have | ||
| 3970 | * max load less than avg load(as we skip the groups at or below | ||
| 3971 | * its cpu_power, while calculating max_load..) | ||
| 3972 | */ | ||
| 3973 | if (sds->max_load < sds->avg_load) { | ||
| 3974 | *imbalance = 0; | ||
| 3975 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
| 3976 | } | ||
| 3977 | |||
| 3978 | /* Don't want to pull so many tasks that a group would go idle */ | ||
| 3979 | max_pull = min(sds->max_load - sds->avg_load, | ||
| 3980 | sds->max_load - sds->busiest_load_per_task); | ||
| 3981 | |||
| 3982 | /* How much load to actually move to equalise the imbalance */ | ||
| 3983 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
| 3984 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
| 3985 | / SCHED_LOAD_SCALE; | ||
| 3986 | |||
| 3987 | /* | ||
| 3988 | * if *imbalance is less than the average load per runnable task | ||
| 3989 | * there is no gaurantee that any tasks will be moved so we'll have | ||
| 3990 | * a think about bumping its value to force at least one task to be | ||
| 3991 | * moved | ||
| 3992 | */ | ||
| 3993 | if (*imbalance < sds->busiest_load_per_task) | ||
| 3994 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
| 3995 | |||
| 3996 | } | ||
| 3997 | /******* find_busiest_group() helpers end here *********************/ | ||
| 3998 | |||
| 3999 | /** | ||
| 4000 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
| 4001 | * if there is an imbalance. If there isn't an imbalance, and | ||
| 4002 | * the user has opted for power-savings, it returns a group whose | ||
| 4003 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
| 4004 | * such a group exists. | ||
| 4005 | * | ||
| 4006 | * Also calculates the amount of weighted load which should be moved | ||
| 4007 | * to restore balance. | ||
| 4008 | * | ||
| 4009 | * @sd: The sched_domain whose busiest group is to be returned. | ||
| 4010 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
| 4011 | * @imbalance: Variable which stores amount of weighted load which should | ||
| 4012 | * be moved to restore balance/put a group to idle. | ||
| 4013 | * @idle: The idle status of this_cpu. | ||
| 4014 | * @sd_idle: The idleness of sd | ||
| 4015 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
| 4016 | * @balance: Pointer to a variable indicating if this_cpu | ||
| 4017 | * is the appropriate cpu to perform load balancing at this_level. | ||
| 4018 | * | ||
| 4019 | * Returns: - the busiest group if imbalance exists. | ||
| 4020 | * - If no imbalance and user has opted for power-savings balance, | ||
| 4021 | * return the least loaded group whose CPUs can be | ||
| 4022 | * put to idle by rebalancing its tasks onto our group. | ||
| 4023 | */ | ||
| 4024 | static struct sched_group * | ||
| 4025 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
| 4026 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
| 4027 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
| 4028 | { | ||
| 4029 | struct sd_lb_stats sds; | ||
| 4030 | |||
| 4031 | memset(&sds, 0, sizeof(sds)); | ||
| 4032 | |||
| 4033 | /* | ||
| 4034 | * Compute the various statistics relevant for load balancing at | ||
| 4035 | * this level. | ||
| 4036 | */ | ||
| 4037 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
| 4038 | balance, &sds); | ||
| 4039 | |||
| 4040 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
| 4041 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
| 4042 | * at this level. | ||
| 4043 | * 2) There is no busy sibling group to pull from. | ||
| 4044 | * 3) This group is the busiest group. | ||
| 4045 | * 4) This group is more busy than the avg busyness at this | ||
| 4046 | * sched_domain. | ||
| 4047 | * 5) The imbalance is within the specified limit. | ||
| 4048 | * 6) Any rebalance would lead to ping-pong | ||
| 4049 | */ | ||
| 4050 | if (balance && !(*balance)) /* case 1; balance may be NULL */ | ||
| 4051 | goto ret; | ||
| 4052 | |||
| 4053 | if (!sds.busiest || sds.busiest_nr_running == 0) /* case 2 */ | ||
| 4054 | goto out_balanced; | ||
| 4055 | |||
| 4056 | if (sds.this_load >= sds.max_load) /* case 3 */ | ||
| 4057 | goto out_balanced; | ||
| 4058 | |||
| 4059 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
| 4060 | |||
| 4061 | if (sds.this_load >= sds.avg_load) /* case 4 */ | ||
| 4062 | goto out_balanced; | ||
| 4063 | |||
| 4064 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) /* case 5 */ | ||
| 4065 | goto out_balanced; | ||
| 4066 | |||
| 4067 | sds.busiest_load_per_task /= sds.busiest_nr_running; /* non-zero: checked for case 2 above */ | ||
| 4068 | if (sds.group_imb) | ||
| 4069 | sds.busiest_load_per_task = | ||
| 4070 | min(sds.busiest_load_per_task, sds.avg_load); | ||
| 4071 | |||
| 4072 | /* | ||
| 4073 | * We're trying to get all the cpus to the average_load, so we don't | ||
| 4074 | * want to push ourselves above the average load, nor do we wish to | ||
| 4075 | * reduce the max loaded cpu below the average load, as either of these | ||
| 4076 | * actions would just result in more rebalancing later, and ping-pong | ||
| 4077 | * tasks around. Thus we look for the minimum possible imbalance. | ||
| 4078 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
| 4079 | * be counted as no imbalance for these purposes -- we can't fix that | ||
| 4080 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
| 4081 | * appear as very large values with unsigned longs. | ||
| 4082 | */ | ||
| 4083 | if (sds.max_load <= sds.busiest_load_per_task) /* presumably case 6 */ | ||
| 4084 | goto out_balanced; | ||
| 4085 | |||
| 4086 | /* Looks like there is an imbalance. Compute it */ | ||
| 4087 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
| 4088 | return sds.busiest; | ||
| 4089 | |||
| 4090 | out_balanced: | ||
| 4091 | /* | ||
| 4092 | * There is no obvious imbalance. But check if we can do some balancing | ||
| 4093 | * to save power. | ||
| 4094 | */ | ||
| 4095 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
| 4096 | return sds.busiest; | ||
| 4097 | ret: | ||
| 4098 | *imbalance = 0; /* nothing to pull: report a zero imbalance */ | ||
| 4099 | return NULL; | ||
| 4100 | } | ||
| 4101 | |||
| 4102 | /* | ||
| 4103 | * find_busiest_queue - find the busiest runqueue among the cpus in group. Only cpus that are also set in @cpus are considered, and each cpu's weighted load is scaled by its power before comparison. Returns NULL when no runqueue qualifies. @idle is not referenced in the body. | ||
| 4104 | */ | ||
| 4105 | static struct rq * | ||
| 4106 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
| 4107 | unsigned long imbalance, const struct cpumask *cpus) | ||
| 4108 | { | ||
| 4109 | struct rq *busiest = NULL, *rq; | ||
| 4110 | unsigned long max_load = 0; | ||
| 4111 | int i; | ||
| 4112 | |||
| 4113 | for_each_cpu(i, sched_group_cpus(group)) { | ||
| 4114 | unsigned long power = power_of(i); | ||
| 4115 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); /* power in whole SCHED_LOAD_SCALE units, rounded to nearest */ | ||
| 4116 | unsigned long wl; | ||
| 4117 | |||
| 4118 | if (!cpumask_test_cpu(i, cpus)) | ||
| 4119 | continue; | ||
| 4120 | |||
| 4121 | rq = cpu_rq(i); | ||
| 4122 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; | ||
| 4123 | wl /= power; /* wl is now the cpu's weighted load scaled by its power */ | ||
| 4124 | |||
| 4125 | if (capacity && rq->nr_running == 1 && wl > imbalance) /* skip a cpu with non-zero capacity whose single task outweighs the imbalance */ | ||
| 4126 | continue; | ||
| 4127 | |||
| 4128 | if (wl > max_load) { | ||
| 4129 | max_load = wl; | ||
| 4130 | busiest = rq; | ||
| 4131 | } | ||
| 4132 | } | ||
| 4133 | |||
| 4134 | return busiest; | ||
| 4135 | } | ||
| 4136 | |||
| 4137 | /* | ||
| 4138 | * Max backoff if we encounter pinned tasks. The value is fairly arbitrary; | ||
| 4139 | * it only needs to be large enough. It is compared against sd->balance_interval. | ||
| 4140 | */ | ||
| 4141 | #define MAX_PINNED_INTERVAL 512 | ||
| 4142 | |||
| 4143 | /* Per-cpu working cpumask for load_balance and load_balance_newidle. */ | ||
| 4144 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
| 4145 | |||
| 4146 | /* | ||
| 4147 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
| 4148 | * tasks if there is an imbalance. Returns the move_tasks() result when tasks were moved; otherwise 0, or -1 when sd_idle is clear, sd has SD_SHARE_CPUPOWER set and test_sd_parent(sd, SD_POWERSAVINGS_BALANCE) is false. | ||
| 4149 | */ | ||
| 4150 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
| 4151 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 4152 | int *balance) | ||
| 4153 | { | ||
| 4154 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
| 4155 | struct sched_group *group; | ||
| 4156 | unsigned long imbalance; | ||
| 4157 | struct rq *busiest; | ||
| 4158 | unsigned long flags; | ||
| 4159 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
| 4160 | |||
| 4161 | cpumask_copy(cpus, cpu_active_mask); /* start with every active cpu as a candidate */ | ||
| 4162 | |||
| 4163 | /* | ||
| 4164 | * When power savings policy is enabled for the parent domain, idle | ||
| 4165 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
| 4166 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
| 4167 | * portraying it as CPU_NOT_IDLE. | ||
| 4168 | */ | ||
| 4169 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
| 4170 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4171 | sd_idle = 1; | ||
| 4172 | |||
| 4173 | schedstat_inc(sd, lb_count[idle]); | ||
| 4174 | |||
| 4175 | redo: | ||
| 4176 | update_shares(sd); | ||
| 4177 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
| 4178 | cpus, balance); | ||
| 4179 | |||
| 4180 | if (*balance == 0) /* balance is dereferenced unconditionally: callers must not pass NULL */ | ||
| 4181 | goto out_balanced; | ||
| 4182 | |||
| 4183 | if (!group) { | ||
| 4184 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
| 4185 | goto out_balanced; | ||
| 4186 | } | ||
| 4187 | |||
| 4188 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
| 4189 | if (!busiest) { | ||
| 4190 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
| 4191 | goto out_balanced; | ||
| 4192 | } | ||
| 4193 | |||
| 4194 | BUG_ON(busiest == this_rq); | ||
| 4195 | |||
| 4196 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
| 4197 | |||
| 4198 | ld_moved = 0; | ||
| 4199 | if (busiest->nr_running > 1) { | ||
| 4200 | /* | ||
| 4201 | * Attempt to move tasks. If find_busiest_group has found | ||
| 4202 | * an imbalance but busiest->nr_running <= 1, the group is | ||
| 4203 | * still unbalanced. ld_moved simply stays zero, so it is | ||
| 4204 | * correctly treated as an imbalance. | ||
| 4205 | */ | ||
| 4206 | local_irq_save(flags); | ||
| 4207 | double_rq_lock(this_rq, busiest); | ||
| 4208 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
| 4209 | imbalance, sd, idle, &all_pinned); | ||
| 4210 | double_rq_unlock(this_rq, busiest); | ||
| 4211 | local_irq_restore(flags); | ||
| 4212 | |||
| 4213 | /* | ||
| 4214 | * We balanced on behalf of another cpu (this_cpu is not the cpu we are running on): reschedule it. | ||
| 4215 | */ | ||
| 4216 | if (ld_moved && this_cpu != smp_processor_id()) | ||
| 4217 | resched_cpu(this_cpu); | ||
| 4218 | |||
| 4219 | /* All tasks on the busiest runqueue were pinned by CPU affinity: drop that cpu from the candidate mask and retry */ | ||
| 4220 | if (unlikely(all_pinned)) { | ||
| 4221 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
| 4222 | if (!cpumask_empty(cpus)) | ||
| 4223 | goto redo; | ||
| 4224 | goto out_balanced; | ||
| 4225 | } | ||
| 4226 | } | ||
| 4227 | |||
| 4228 | if (!ld_moved) { | ||
| 4229 | schedstat_inc(sd, lb_failed[idle]); | ||
| 4230 | sd->nr_balance_failed++; | ||
| 4231 | |||
| 4232 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | ||
| 4233 | |||
| 4234 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
| 4235 | |||
| 4236 | /* don't kick the migration_thread, if the curr | ||
| 4237 | * task on busiest cpu can't be moved to this_cpu | ||
| 4238 | */ | ||
| 4239 | if (!cpumask_test_cpu(this_cpu, | ||
| 4240 | &busiest->curr->cpus_allowed)) { | ||
| 4241 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
| 4242 | flags); | ||
| 4243 | all_pinned = 1; | ||
| 4244 | goto out_one_pinned; | ||
| 4245 | } | ||
| 4246 | |||
| 4247 | if (!busiest->active_balance) { | ||
| 4248 | busiest->active_balance = 1; | ||
| 4249 | busiest->push_cpu = this_cpu; | ||
| 4250 | active_balance = 1; | ||
| 4251 | } | ||
| 4252 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
| 4253 | if (active_balance) | ||
| 4254 | wake_up_process(busiest->migration_thread); /* woken only after busiest->lock has been dropped */ | ||
| 4255 | |||
| 4256 | /* | ||
| 4257 | * We've kicked active balancing, reset the failure | ||
| 4258 | * counter. | ||
| 4259 | */ | ||
| 4260 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
| 4261 | } | ||
| 4262 | } else | ||
| 4263 | sd->nr_balance_failed = 0; | ||
| 4264 | |||
| 4265 | if (likely(!active_balance)) { | ||
| 4266 | /* We were unbalanced, so reset the balancing interval */ | ||
| 4267 | sd->balance_interval = sd->min_interval; | ||
| 4268 | } else { | ||
| 4269 | /* | ||
| 4270 | * If we've begun active balancing, start to back off. This | ||
| 4271 | * case may not be covered by the all_pinned logic if there | ||
| 4272 | * is only 1 task on the busy runqueue (because we don't call | ||
| 4273 | * move_tasks). | ||
| 4274 | */ | ||
| 4275 | if (sd->balance_interval < sd->max_interval) | ||
| 4276 | sd->balance_interval *= 2; | ||
| 4277 | } | ||
| 4278 | |||
| 4279 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 4280 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4281 | ld_moved = -1; | ||
| 4282 | |||
| 4283 | goto out; | ||
| 4284 | |||
| 4285 | out_balanced: | ||
| 4286 | schedstat_inc(sd, lb_balanced[idle]); | ||
| 4287 | |||
| 4288 | sd->nr_balance_failed = 0; | ||
| 4289 | |||
| 4290 | out_one_pinned: | ||
| 4291 | /* back off: double the balancing interval while it is below max_interval, or below MAX_PINNED_INTERVAL when all_pinned is set */ | ||
| 4292 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
| 4293 | (sd->balance_interval < sd->max_interval)) | ||
| 4294 | sd->balance_interval *= 2; | ||
| 4295 | |||
| 4296 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 4297 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4298 | ld_moved = -1; | ||
| 4299 | else | ||
| 4300 | ld_moved = 0; | ||
| 4301 | out: | ||
| 4302 | if (ld_moved) /* also true for the -1 result */ | ||
| 4303 | update_shares(sd); | ||
| 4304 | return ld_moved; | ||
| 4305 | } | ||
| 4306 | |||
| 4307 | /* | ||
| 4308 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
| 4309 | * tasks if there is an imbalance. | ||
| 4310 | * | ||
| 4311 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | ||
| 4312 | * this_rq is locked. Returns the move_tasks() result when tasks were pulled; otherwise 0 or -1 (see the individual return statements). | ||
| 4313 | */ | ||
| 4314 | static int | ||
| 4315 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | ||
| 4316 | { | ||
| 4317 | struct sched_group *group; | ||
| 4318 | struct rq *busiest = NULL; | ||
| 4319 | unsigned long imbalance; | ||
| 4320 | int ld_moved = 0; | ||
| 4321 | int sd_idle = 0; | ||
| 4322 | int all_pinned = 0; | ||
| 4323 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
| 4324 | |||
| 4325 | cpumask_copy(cpus, cpu_active_mask); /* start with every active cpu as a candidate */ | ||
| 4326 | |||
| 4327 | /* | ||
| 4328 | * When power savings policy is enabled for the parent domain, idle | ||
| 4329 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
| 4330 | * let the state of idle sibling percolate up as IDLE, instead of | ||
| 4331 | * portraying it as CPU_NOT_IDLE. | ||
| 4332 | */ | ||
| 4333 | if (sd->flags & SD_SHARE_CPUPOWER && | ||
| 4334 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4335 | sd_idle = 1; | ||
| 4336 | |||
| 4337 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | ||
| 4338 | redo: | ||
| 4339 | update_shares_locked(this_rq, sd); | ||
| 4340 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | ||
| 4341 | &sd_idle, cpus, NULL); | ||
| 4342 | if (!group) { | ||
| 4343 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | ||
| 4344 | goto out_balanced; | ||
| 4345 | } | ||
| 4346 | |||
| 4347 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); | ||
| 4348 | if (!busiest) { | ||
| 4349 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | ||
| 4350 | goto out_balanced; | ||
| 4351 | } | ||
| 4352 | |||
| 4353 | BUG_ON(busiest == this_rq); | ||
| 4354 | |||
| 4355 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | ||
| 4356 | |||
| 4357 | ld_moved = 0; | ||
| 4358 | if (busiest->nr_running > 1) { | ||
| 4359 | /* Attempt to move tasks */ | ||
| 4360 | double_lock_balance(this_rq, busiest); | ||
| 4361 | /* this_rq->clock is already updated */ | ||
| 4362 | update_rq_clock(busiest); | ||
| 4363 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
| 4364 | imbalance, sd, CPU_NEWLY_IDLE, | ||
| 4365 | &all_pinned); | ||
| 4366 | double_unlock_balance(this_rq, busiest); | ||
| 4367 | |||
| 4368 | if (unlikely(all_pinned)) { /* drop the busiest cpu from the candidate mask and retry while candidates remain */ | ||
| 4369 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
| 4370 | if (!cpumask_empty(cpus)) | ||
| 4371 | goto redo; | ||
| 4372 | } | ||
| 4373 | } | ||
| 4374 | |||
| 4375 | if (!ld_moved) { | ||
| 4376 | int active_balance = 0; | ||
| 4377 | |||
| 4378 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | ||
| 4379 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 4380 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4381 | return -1; | ||
| 4382 | |||
| 4383 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
| 4384 | return -1; | ||
| 4385 | |||
| 4386 | if (sd->nr_balance_failed++ < 2) /* the counter is incremented even when we return here */ | ||
| 4387 | return -1; | ||
| 4388 | |||
| 4389 | /* | ||
| 4390 | * The only task running in a non-idle cpu can be moved to this | ||
| 4391 | * cpu in an attempt to completely free up the other CPU | ||
| 4392 | * package. The same method used to move a task in load_balance() | ||
| 4393 | * has been extended for load_balance_newidle() to speed up | ||
| 4394 | * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) | ||
| 4395 | * | ||
| 4396 | * The package power saving logic comes from | ||
| 4397 | * find_busiest_group(). If there is no imbalance, then | ||
| 4398 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
| 4399 | * f_b_g() will select a group from which a running task may be | ||
| 4400 | * pulled to this cpu in order to make the other package idle. | ||
| 4401 | * If there is no opportunity to make a package idle and if | ||
| 4402 | * there is no imbalance, then f_b_g() will return NULL and no | ||
| 4403 | * action will be taken in load_balance_newidle(). | ||
| 4404 | * | ||
| 4405 | * Under normal task pull operation due to imbalance, there | ||
| 4406 | * will be more than one task in the source run queue and | ||
| 4407 | * move_tasks() will succeed. ld_moved will be true and this | ||
| 4408 | * active balance code will not be triggered. | ||
| 4409 | */ | ||
| 4410 | |||
| 4411 | /* Lock busiest in correct order while this_rq is held */ | ||
| 4412 | double_lock_balance(this_rq, busiest); | ||
| 4413 | |||
| 4414 | /* | ||
| 4415 | * don't kick the migration_thread, if the curr | ||
| 4416 | * task on busiest cpu can't be moved to this_cpu | ||
| 4417 | */ | ||
| 4418 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | ||
| 4419 | double_unlock_balance(this_rq, busiest); | ||
| 4420 | all_pinned = 1; /* NOTE(review): all_pinned is a local and is not read after this point */ | ||
| 4421 | return ld_moved; | ||
| 4422 | } | ||
| 4423 | |||
| 4424 | if (!busiest->active_balance) { | ||
| 4425 | busiest->active_balance = 1; | ||
| 4426 | busiest->push_cpu = this_cpu; | ||
| 4427 | active_balance = 1; | ||
| 4428 | } | ||
| 4429 | |||
| 4430 | double_unlock_balance(this_rq, busiest); | ||
| 4431 | /* | ||
| 4432 | * Should not call ttwu while holding a rq->lock | ||
| 4433 | */ | ||
| 4434 | raw_spin_unlock(&this_rq->lock); | ||
| 4435 | if (active_balance) | ||
| 4436 | wake_up_process(busiest->migration_thread); | ||
| 4437 | raw_spin_lock(&this_rq->lock); /* re-take this_rq->lock before continuing */ | ||
| 4438 | |||
| 4439 | } else | ||
| 4440 | sd->nr_balance_failed = 0; | ||
| 4441 | |||
| 4442 | update_shares_locked(this_rq, sd); | ||
| 4443 | return ld_moved; | ||
| 4444 | |||
| 4445 | out_balanced: | ||
| 4446 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | ||
| 4447 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 4448 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 4449 | return -1; | ||
| 4450 | sd->nr_balance_failed = 0; | ||
| 4451 | |||
| 4452 | return 0; | ||
| 4453 | } | ||
| 4454 | |||
| 4455 | /* | ||
| 4456 | * idle_balance is called by schedule() if this_cpu is about to become | ||
| 4457 | * idle. Attempts to pull tasks from other CPUs. Records this_rq->idle_stamp, and clears it again if a domain reports a pull. | ||
| 4458 | */ | ||
| 4459 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
| 4460 | { | ||
| 4461 | struct sched_domain *sd; | ||
| 4462 | int pulled_task = 0; | ||
| 4463 | unsigned long next_balance = jiffies + HZ; | ||
| 4464 | |||
| 4465 | this_rq->idle_stamp = this_rq->clock; | ||
| 4466 | |||
| 4467 | if (this_rq->avg_idle < sysctl_sched_migration_cost) /* skip balancing when avg_idle is below the migration cost */ | ||
| 4468 | return; | ||
| 4469 | |||
| 4470 | for_each_domain(this_cpu, sd) { | ||
| 4471 | unsigned long interval; | ||
| 4472 | |||
| 4473 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 4474 | continue; | ||
| 4475 | |||
| 4476 | if (sd->flags & SD_BALANCE_NEWIDLE) | ||
| 4477 | /* If we've pulled tasks over, stop searching (see the break below): */ | ||
| 4478 | pulled_task = load_balance_newidle(this_cpu, this_rq, | ||
| 4479 | sd); | ||
| 4480 | |||
| 4481 | interval = msecs_to_jiffies(sd->balance_interval); | ||
| 4482 | if (time_after(next_balance, sd->last_balance + interval)) | ||
| 4483 | next_balance = sd->last_balance + interval; | ||
| 4484 | if (pulled_task) { /* non-zero includes the -1 that load_balance_newidle() can return */ | ||
| 4485 | this_rq->idle_stamp = 0; | ||
| 4486 | break; | ||
| 4487 | } | ||
| 4488 | } | ||
| 4489 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
| 4490 | /* | ||
| 4491 | * We are going idle. next_balance may be set based on | ||
| 4492 | * a busy processor. So reset next_balance. | ||
| 4493 | */ | ||
| 4494 | this_rq->next_balance = next_balance; | ||
| 4495 | } | ||
| 4496 | } | ||
| 4497 | |||
| 4498 | /* | ||
| 4499 | * active_load_balance is run by migration threads. It pushes running tasks | ||
| 4500 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
| 4501 | * running on each physical CPU where possible, and avoids physical / | ||
| 4502 | * logical imbalances. | ||
| 4503 | * | ||
| 4504 | * Called with busiest_rq locked. The target cpu is taken from busiest_rq->push_cpu. | ||
| 4505 | */ | ||
| 4506 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
| 4507 | { | ||
| 4508 | int target_cpu = busiest_rq->push_cpu; | ||
| 4509 | struct sched_domain *sd; | ||
| 4510 | struct rq *target_rq; | ||
| 4511 | |||
| 4512 | /* Is there any task to move? With at most one task there is nothing to push. */ | ||
| 4513 | if (busiest_rq->nr_running <= 1) | ||
| 4514 | return; | ||
| 4515 | |||
| 4516 | target_rq = cpu_rq(target_cpu); | ||
| 4517 | |||
| 4518 | /* | ||
| 4519 | * This condition is "impossible", if it occurs | ||
| 4520 | * we need to fix it. Originally reported by | ||
| 4521 | * Bjorn Helgaas on a 128-cpu setup. | ||
| 4522 | */ | ||
| 4523 | BUG_ON(busiest_rq == target_rq); | ||
| 4524 | |||
| 4525 | /* move a task from busiest_rq to target_rq */ | ||
| 4526 | double_lock_balance(busiest_rq, target_rq); | ||
| 4527 | update_rq_clock(busiest_rq); | ||
| 4528 | update_rq_clock(target_rq); | ||
| 4529 | |||
| 4530 | /* Search the target CPU's domains for a load-balancing sd that also spans busiest_cpu. */ | ||
| 4531 | for_each_domain(target_cpu, sd) { | ||
| 4532 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
| 4533 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
| 4534 | break; | ||
| 4535 | } | ||
| 4536 | |||
| 4537 | if (likely(sd)) { /* presumably sd is NULL when no domain matched above - TODO confirm against for_each_domain */ | ||
| 4538 | schedstat_inc(sd, alb_count); | ||
| 4539 | |||
| 4540 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
| 4541 | sd, CPU_IDLE)) | ||
| 4542 | schedstat_inc(sd, alb_pushed); | ||
| 4543 | else | ||
| 4544 | schedstat_inc(sd, alb_failed); | ||
| 4545 | } | ||
| 4546 | double_unlock_balance(busiest_rq, target_rq); | ||
| 4547 | } | ||
| 4548 | |||
| 4549 | #ifdef CONFIG_NO_HZ | ||
| 4550 | static struct { | ||
| 4551 | atomic_t load_balancer; /* cpu id of the current idle load balancer (ilb) owner, -1 when there is none */ | ||
| 4552 | cpumask_var_t cpu_mask; /* cpus added by select_nohz_load_balancer() when they stop their tick */ | ||
| 4553 | cpumask_var_t ilb_grp_nohz_mask; /* scratch mask filled in by is_semi_idle_group() */ | ||
| 4554 | } nohz ____cacheline_aligned = { | ||
| 4555 | .load_balancer = ATOMIC_INIT(-1), | ||
| 4556 | }; | ||
| 4557 | |||
| 4558 | int get_nohz_load_balancer(void) | ||
| 4559 | { | ||
| 4560 | return atomic_read(&nohz.load_balancer); /* current value of nohz.load_balancer; it is initialised to -1 */ | ||
| 4561 | } | ||
| 4562 | |||
| 4563 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 4564 | /** | ||
| 4565 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
| 4566 | * @cpu: The cpu whose lowest level of sched domain is to | ||
| 4567 | * be returned. | ||
| 4568 | * @flag: The flag to check for the lowest sched_domain | ||
| 4569 | * for the given cpu. | ||
| 4570 | * | ||
| 4571 | * Returns the lowest sched_domain of a cpu which contains the given flag. Callers in this file also handle a NULL result (presumably returned when no domain has the flag - TODO confirm against for_each_domain). | ||
| 4572 | */ | ||
| 4573 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
| 4574 | { | ||
| 4575 | struct sched_domain *sd; | ||
| 4576 | |||
| 4577 | for_each_domain(cpu, sd) | ||
| 4578 | if (sd && (sd->flags & flag)) /* stop at the first domain in the walk that has the flag */ | ||
| 4579 | break; | ||
| 4580 | |||
| 4581 | return sd; | ||
| 4582 | } | ||
| 4583 | |||
| 4584 | /** | ||
| 4585 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
| 4586 | * @cpu: The cpu whose domains we're iterating over. | ||
| 4587 | * @sd: iteration variable (struct sched_domain *) that holds the | ||
| 4588 | * current domain on each pass. | ||
| 4589 | * @flag: The flag to filter the sched_domains to be iterated. | ||
| 4590 | * | ||
| 4591 | * Iterates over the scheduler domains for a given cpu that have the 'flag' | ||
| 4592 | * set, starting from the lowest such sched_domain and following ->parent. Iteration stops at the first parent that lacks the flag. NOTE: @sd and @flag are expanded several times and without parentheses, so pass simple expressions. | ||
| 4593 | */ | ||
| 4594 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
| 4595 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
| 4596 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
| 4597 | |||
| 4598 | /** | ||
| 4599 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
| 4600 | * @ilb_group: group to be checked for semi-idleness | ||
| 4601 | * | ||
| 4602 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
| 4603 | * | ||
| 4604 | * We define a sched_group to be semi idle if it has at least one idle-CPU | ||
| 4605 | * and at least one non-idle CPU. This helper function checks if the given | ||
| 4606 | * sched_group is semi-idle or not. Side effect: nohz.ilb_grp_nohz_mask is overwritten with the group's cpus that are also in nohz.cpu_mask. | ||
| 4607 | */ | ||
| 4608 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
| 4609 | { | ||
| 4610 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
| 4611 | sched_group_cpus(ilb_group)); | ||
| 4612 | |||
| 4613 | /* | ||
| 4614 | * A sched_group is semi-idle when it has at least one busy cpu | ||
| 4615 | * and at least one idle cpu. | ||
| 4616 | */ | ||
| 4617 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) /* no cpu of the group is in nohz.cpu_mask */ | ||
| 4618 | return 0; | ||
| 4619 | |||
| 4620 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) /* every cpu of the group is in nohz.cpu_mask */ | ||
| 4621 | return 0; | ||
| 4622 | |||
| 4623 | return 1; | ||
| 4624 | } | ||
| 4625 | /** | ||
| 4626 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
| 4627 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
| 4628 | * | ||
| 4629 | * Returns: the id of the idle load balancer if it exists, | ||
| 4630 | * else a value >= nr_cpu_ids. | ||
| 4631 | * | ||
| 4632 | * This algorithm picks the idle load balancer such that it belongs to a | ||
| 4633 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
| 4634 | * completely idle packages/cores just for the purpose of idle load balancing | ||
| 4635 | * when there are other idle cpu's which are better suited for that job. | ||
| 4636 | */ | ||
| 4637 | static int find_new_ilb(int cpu) | ||
| 4638 | { | ||
| 4639 | struct sched_domain *sd; | ||
| 4640 | struct sched_group *ilb_group; | ||
| 4641 | |||
| 4642 | /* | ||
| 4643 | * Have idle load balancer selection from semi-idle packages only | ||
| 4644 | * when power-aware load balancing is enabled | ||
| 4645 | */ | ||
| 4646 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
| 4647 | goto out_done; | ||
| 4648 | |||
| 4649 | /* | ||
| 4650 | * Optimize for the case when we have no idle CPUs or only one | ||
| 4651 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
| 4652 | */ | ||
| 4653 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
| 4654 | goto out_done; | ||
| 4655 | |||
| 4656 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
| 4657 | ilb_group = sd->groups; | ||
| 4658 | |||
| 4659 | do { | ||
| 4660 | if (is_semi_idle_group(ilb_group)) | ||
| 4661 | return cpumask_first(nohz.ilb_grp_nohz_mask); /* the mask was just filled in by is_semi_idle_group() */ | ||
| 4662 | |||
| 4663 | ilb_group = ilb_group->next; | ||
| 4664 | |||
| 4665 | } while (ilb_group != sd->groups); /* stop once the walk is back at sd->groups */ | ||
| 4666 | } | ||
| 4667 | |||
| 4668 | out_done: | ||
| 4669 | return cpumask_first(nohz.cpu_mask); /* fallback: first cpu in nohz.cpu_mask */ | ||
| 4670 | } | ||
| 4671 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
| 4672 | static inline int find_new_ilb(int call_cpu) | ||
| 4673 | { | ||
| 4674 | return cpumask_first(nohz.cpu_mask); /* call_cpu is unused in this variant: just take the first cpu in nohz.cpu_mask */ | ||
| 4675 | } | ||
| 4676 | #endif | ||
| 4677 | |||
| 4678 | /* | ||
| 4679 | * This routine will try to nominate the ilb (idle load balancing) | ||
| 4680 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
| 4681 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
| 4682 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
| 4683 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
| 4684 | * arrives... | ||
| 4685 | * | ||
| 4686 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
| 4687 | * for idle load balancing. ilb owner will still be part of | ||
| 4688 | * nohz.cpu_mask. | ||
| 4689 | * | ||
| 4690 | * While stopping the tick, this cpu will become the ilb owner if there | ||
| 4691 | * is no other owner. And will be the owner till that cpu becomes busy | ||
| 4692 | * or if all cpus in the system stop their ticks at which point | ||
| 4693 | * there is no need for ilb owner. | ||
| 4694 | * | ||
| 4695 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
| 4696 | * next busy scheduler_tick(). Returns 1 when this cpu is, or has just become, the ilb owner; 0 otherwise. | ||
| 4697 | */ | ||
| 4698 | int select_nohz_load_balancer(int stop_tick) | ||
| 4699 | { | ||
| 4700 | int cpu = smp_processor_id(); | ||
| 4701 | |||
| 4702 | if (stop_tick) { | ||
| 4703 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
| 4704 | |||
| 4705 | if (!cpu_active(cpu)) { | ||
| 4706 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
| 4707 | return 0; | ||
| 4708 | |||
| 4709 | /* | ||
| 4710 | * If we are going offline and still the leader, | ||
| 4711 | * give up! | ||
| 4712 | */ | ||
| 4713 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
| 4714 | BUG(); | ||
| 4715 | |||
| 4716 | return 0; | ||
| 4717 | } | ||
| 4718 | |||
| 4719 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
| 4720 | |||
| 4721 | /* time for ilb owner also to sleep: every active cpu is now in nohz.cpu_mask */ | ||
| 4722 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
| 4723 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
| 4724 | atomic_set(&nohz.load_balancer, -1); | ||
| 4725 | return 0; | ||
| 4726 | } | ||
| 4727 | |||
| 4728 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
| 4729 | /* make me the ilb owner */ | ||
| 4730 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
| 4731 | return 1; | ||
| 4732 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 4733 | int new_ilb; | ||
| 4734 | |||
| 4735 | if (!(sched_smt_power_savings || | ||
| 4736 | sched_mc_power_savings)) | ||
| 4737 | return 1; | ||
| 4738 | /* | ||
| 4739 | * Check to see if there is a more power-efficient | ||
| 4740 | * ilb. | ||
| 4741 | */ | ||
| 4742 | new_ilb = find_new_ilb(cpu); | ||
| 4743 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
| 4744 | atomic_set(&nohz.load_balancer, -1); /* give up ownership and kick the better candidate */ | ||
| 4745 | resched_cpu(new_ilb); | ||
| 4746 | return 0; | ||
| 4747 | } | ||
| 4748 | return 1; | ||
| 4749 | } | ||
| 4750 | } else { | ||
| 4751 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
| 4752 | return 0; | ||
| 4753 | |||
| 4754 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
| 4755 | |||
| 4756 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
| 4757 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
| 4758 | BUG(); | ||
| 4759 | } | ||
| 4760 | return 0; | ||
| 4761 | } | ||
| 4762 | #endif | ||
| 4763 | |||
| 4764 | static DEFINE_SPINLOCK(balancing); /* taken with spin_trylock() around balancing of SD_SERIALIZE domains */ | ||
| 4765 | |||
| 4766 | /* | ||
| 4767 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 4768 | * and initiates a balancing operation if so. | ||
| 4769 | * | ||
| 4770 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 4771 | */ | ||
| 4772 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
| 4773 | { | ||
| 4774 | int balance = 1; | ||
| 4775 | struct rq *rq = cpu_rq(cpu); | ||
| 4776 | unsigned long interval; | ||
| 4777 | struct sched_domain *sd; | ||
| 4778 | /* Earliest time when we have to do rebalance again */ | ||
| 4779 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 4780 | int update_next_balance = 0; | ||
| 4781 | int need_serialize; | ||
| 4782 | |||
| 4783 | for_each_domain(cpu, sd) { | ||
| 4784 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 4785 | continue; | ||
| 4786 | |||
| 4787 | interval = sd->balance_interval; | ||
| 4788 | if (idle != CPU_IDLE) | ||
| 4789 | interval *= sd->busy_factor; /* stretch the interval by busy_factor when not CPU_IDLE */ | ||
| 4790 | |||
| 4791 | /* scale ms to jiffies */ | ||
| 4792 | interval = msecs_to_jiffies(interval); | ||
| 4793 | if (unlikely(!interval)) | ||
| 4794 | interval = 1; | ||
| 4795 | if (interval > HZ*NR_CPUS/10) /* clamp the interval, now in jiffies */ | ||
| 4796 | interval = HZ*NR_CPUS/10; | ||
| 4797 | |||
| 4798 | need_serialize = sd->flags & SD_SERIALIZE; | ||
| 4799 | |||
| 4800 | if (need_serialize) { | ||
| 4801 | if (!spin_trylock(&balancing)) | ||
| 4802 | goto out; /* lock is held elsewhere: skip balancing this domain on this pass */ | ||
| 4803 | } | ||
| 4804 | |||
| 4805 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
| 4806 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
| 4807 | /* | ||
| 4808 | * We've pulled tasks over so either we're no | ||
| 4809 | * longer idle, or one of our SMT siblings is | ||
| 4810 | * not idle. | ||
| 4811 | */ | ||
| 4812 | idle = CPU_NOT_IDLE; | ||
| 4813 | } | ||
| 4814 | sd->last_balance = jiffies; | ||
| 4815 | } | ||
| 4816 | if (need_serialize) | ||
| 4817 | spin_unlock(&balancing); | ||
| 4818 | out: | ||
| 4819 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
| 4820 | next_balance = sd->last_balance + interval; | ||
| 4821 | update_next_balance = 1; | ||
| 4822 | } | ||
| 4823 | |||
| 4824 | /* | ||
| 4825 | * Stop the load balance at this level. There is another | ||
| 4826 | * CPU in our sched group which is doing load balancing more | ||
| 4827 | * actively. | ||
| 4828 | */ | ||
| 4829 | if (!balance) | ||
| 4830 | break; | ||
| 4831 | } | ||
| 4832 | |||
| 4833 | /* | ||
| 4834 | * next_balance will be updated only when there is a need. | ||
| 4835 | * When the cpu is attached to null domain for ex, it will not be | ||
| 4836 | * updated. | ||
| 4837 | */ | ||
| 4838 | if (likely(update_next_balance)) | ||
| 4839 | rq->next_balance = next_balance; | ||
| 4840 | } | ||
| 4841 | |||
| 4842 | /* | ||
| 4843 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
| 4844 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
| 4845 | * rebalancing for all the cpus whose scheduler ticks are stopped. | ||
| 4846 | */ | ||
| 4847 | static void run_rebalance_domains(struct softirq_action *h) | ||
| 4848 | { | ||
| 4849 | int this_cpu = smp_processor_id(); | ||
| 4850 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 4851 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
| 4852 | CPU_IDLE : CPU_NOT_IDLE; | ||
| 4853 | |||
| 4854 | rebalance_domains(this_cpu, idle); | ||
| 4855 | |||
| 4856 | #ifdef CONFIG_NO_HZ | ||
| 4857 | /* | ||
| 4858 | * If this cpu is the owner for idle load balancing, then do the | ||
| 4859 | * balancing on behalf of the other idle cpus whose ticks are | ||
| 4860 | * stopped. | ||
| 4861 | */ | ||
| 4862 | if (this_rq->idle_at_tick && | ||
| 4863 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
| 4864 | struct rq *rq; | ||
| 4865 | int balance_cpu; | ||
| 4866 | |||
| 4867 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
| 4868 | if (balance_cpu == this_cpu) /* this cpu was already balanced above */ | ||
| 4869 | continue; | ||
| 4870 | |||
| 4871 | /* | ||
| 4872 | * If this cpu gets work to do, stop the load balancing | ||
| 4873 | * work being done for other cpus. Next load | ||
| 4874 | * balancing owner will pick it up. | ||
| 4875 | */ | ||
| 4876 | if (need_resched()) | ||
| 4877 | break; | ||
| 4878 | |||
| 4879 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
| 4880 | |||
| 4881 | rq = cpu_rq(balance_cpu); | ||
| 4882 | if (time_after(this_rq->next_balance, rq->next_balance)) /* keep this_rq->next_balance no later than the balanced cpu's */ | ||
| 4883 | this_rq->next_balance = rq->next_balance; | ||
| 4884 | } | ||
| 4885 | } | ||
| 4886 | #endif | ||
| 4887 | } | ||
| 4888 | |||
| 4889 | static inline int on_null_domain(int cpu) | ||
| 4890 | { | ||
| 4891 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
| 4892 | } | ||
| 4893 | |||
| 4894 | /* | ||
| 4895 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
| 4896 | * | ||
| 4897 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
| 4898 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
| 4899 | * if the whole system is idle. | ||
| 4900 | */ | ||
| 4901 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
| 4902 | { | ||
| 4903 | #ifdef CONFIG_NO_HZ | ||
| 4904 | /* | ||
| 4905 | * If we were in the nohz mode recently and busy at the current | ||
| 4906 | * scheduler tick, then check if we need to nominate new idle | ||
| 4907 | * load balancer. | ||
| 4908 | */ | ||
| 4909 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
| 4910 | rq->in_nohz_recently = 0; | ||
| 4911 | |||
| 4912 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 4913 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
| 4914 | atomic_set(&nohz.load_balancer, -1); | ||
| 4915 | } | ||
| 4916 | |||
| 4917 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
| 4918 | int ilb = find_new_ilb(cpu); | ||
| 4919 | |||
| 4920 | if (ilb < nr_cpu_ids) | ||
| 4921 | resched_cpu(ilb); | ||
| 4922 | } | ||
| 4923 | } | ||
| 4924 | |||
| 4925 | /* | ||
| 4926 | * If this cpu is idle and doing idle load balancing for all the | ||
| 4927 | * cpus with ticks stopped, is it time for that to stop? | ||
| 4928 | */ | ||
| 4929 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
| 4930 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
| 4931 | resched_cpu(cpu); | ||
| 4932 | return; | ||
| 4933 | } | ||
| 4934 | |||
| 4935 | /* | ||
| 4936 | * If this cpu is idle and the idle load balancing is done by | ||
| 4937 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
| 4938 | */ | ||
| 4939 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
| 4940 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
| 4941 | return; | ||
| 4942 | #endif | ||
| 4943 | /* Don't need to rebalance while attached to NULL domain */ | ||
| 4944 | if (time_after_eq(jiffies, rq->next_balance) && | ||
| 4945 | likely(!on_null_domain(cpu))) | ||
| 4946 | raise_softirq(SCHED_SOFTIRQ); | ||
| 4947 | } | ||
| 4948 | |||
| 4949 | #else /* CONFIG_SMP */ | ||
| 4950 | |||
| 4951 | /* | ||
| 4952 | * on UP we do not need to balance between CPUs: | ||
| 4953 | */ | ||
| 4954 | static inline void idle_balance(int cpu, struct rq *rq) | ||
| 4955 | { | ||
| 4956 | } | ||
| 4957 | |||
| 4958 | #endif | 3164 | #endif |
| 4959 | 3165 | ||
| 4960 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3166 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
| @@ -5309,7 +3515,7 @@ void scheduler_tick(void) | |||
| 5309 | curr->sched_class->task_tick(rq, curr, 0); | 3515 | curr->sched_class->task_tick(rq, curr, 0); |
| 5310 | raw_spin_unlock(&rq->lock); | 3516 | raw_spin_unlock(&rq->lock); |
| 5311 | 3517 | ||
| 5312 | perf_event_task_tick(curr, cpu); | 3518 | perf_event_task_tick(curr); |
| 5313 | 3519 | ||
| 5314 | #ifdef CONFIG_SMP | 3520 | #ifdef CONFIG_SMP |
| 5315 | rq->idle_at_tick = idle_cpu(cpu); | 3521 | rq->idle_at_tick = idle_cpu(cpu); |
| @@ -5523,7 +3729,7 @@ need_resched_nonpreemptible: | |||
| 5523 | 3729 | ||
| 5524 | if (likely(prev != next)) { | 3730 | if (likely(prev != next)) { |
| 5525 | sched_info_switch(prev, next); | 3731 | sched_info_switch(prev, next); |
| 5526 | perf_event_task_sched_out(prev, next, cpu); | 3732 | perf_event_task_sched_out(prev, next); |
| 5527 | 3733 | ||
| 5528 | rq->nr_switches++; | 3734 | rq->nr_switches++; |
| 5529 | rq->curr = next; | 3735 | rq->curr = next; |
| @@ -6054,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 6054 | unsigned long flags; | 4260 | unsigned long flags; |
| 6055 | int oldprio, on_rq, running; | 4261 | int oldprio, on_rq, running; |
| 6056 | struct rq *rq; | 4262 | struct rq *rq; |
| 6057 | const struct sched_class *prev_class = p->sched_class; | 4263 | const struct sched_class *prev_class; |
| 6058 | 4264 | ||
| 6059 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4265 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| 6060 | 4266 | ||
| @@ -6062,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 6062 | update_rq_clock(rq); | 4268 | update_rq_clock(rq); |
| 6063 | 4269 | ||
| 6064 | oldprio = p->prio; | 4270 | oldprio = p->prio; |
| 4271 | prev_class = p->sched_class; | ||
| 6065 | on_rq = p->se.on_rq; | 4272 | on_rq = p->se.on_rq; |
| 6066 | running = task_current(rq, p); | 4273 | running = task_current(rq, p); |
| 6067 | if (on_rq) | 4274 | if (on_rq) |
| @@ -6079,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 6079 | if (running) | 4286 | if (running) |
| 6080 | p->sched_class->set_curr_task(rq); | 4287 | p->sched_class->set_curr_task(rq); |
| 6081 | if (on_rq) { | 4288 | if (on_rq) { |
| 6082 | enqueue_task(rq, p, 0); | 4289 | enqueue_task(rq, p, 0, oldprio < prio); |
| 6083 | 4290 | ||
| 6084 | check_class_changed(rq, p, prev_class, oldprio, running); | 4291 | check_class_changed(rq, p, prev_class, oldprio, running); |
| 6085 | } | 4292 | } |
| @@ -6123,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 6123 | delta = p->prio - old_prio; | 4330 | delta = p->prio - old_prio; |
| 6124 | 4331 | ||
| 6125 | if (on_rq) { | 4332 | if (on_rq) { |
| 6126 | enqueue_task(rq, p, 0); | 4333 | enqueue_task(rq, p, 0, false); |
| 6127 | /* | 4334 | /* |
| 6128 | * If the task increased its priority or is running and | 4335 | * If the task increased its priority or is running and |
| 6129 | * lowered its priority, then reschedule its CPU: | 4336 | * lowered its priority, then reschedule its CPU: |
| @@ -6281,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
| 6281 | { | 4488 | { |
| 6282 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4489 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 6283 | unsigned long flags; | 4490 | unsigned long flags; |
| 6284 | const struct sched_class *prev_class = p->sched_class; | 4491 | const struct sched_class *prev_class; |
| 6285 | struct rq *rq; | 4492 | struct rq *rq; |
| 6286 | int reset_on_fork; | 4493 | int reset_on_fork; |
| 6287 | 4494 | ||
| @@ -6395,6 +4602,7 @@ recheck: | |||
| 6395 | p->sched_reset_on_fork = reset_on_fork; | 4602 | p->sched_reset_on_fork = reset_on_fork; |
| 6396 | 4603 | ||
| 6397 | oldprio = p->prio; | 4604 | oldprio = p->prio; |
| 4605 | prev_class = p->sched_class; | ||
| 6398 | __setscheduler(rq, p, policy, param->sched_priority); | 4606 | __setscheduler(rq, p, policy, param->sched_priority); |
| 6399 | 4607 | ||
| 6400 | if (running) | 4608 | if (running) |
| @@ -7145,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 7145 | struct rq *rq; | 5353 | struct rq *rq; |
| 7146 | int ret = 0; | 5354 | int ret = 0; |
| 7147 | 5355 | ||
| 7148 | /* | ||
| 7149 | * Since we rely on wake-ups to migrate sleeping tasks, don't change | ||
| 7150 | * the ->cpus_allowed mask from under waking tasks, which would be | ||
| 7151 | * possible when we change rq->lock in ttwu(), so synchronize against | ||
| 7152 | * TASK_WAKING to avoid that. | ||
| 7153 | * | ||
| 7154 | * Make an exception for freshly cloned tasks, since cpuset namespaces | ||
| 7155 | * might move the task about, we have to validate the target in | ||
| 7156 | * wake_up_new_task() anyway since the cpu might have gone away. | ||
| 7157 | */ | ||
| 7158 | again: | ||
| 7159 | while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) | ||
| 7160 | cpu_relax(); | ||
| 7161 | |||
| 7162 | rq = task_rq_lock(p, &flags); | 5356 | rq = task_rq_lock(p, &flags); |
| 7163 | 5357 | ||
| 7164 | if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { | ||
| 7165 | task_rq_unlock(rq, &flags); | ||
| 7166 | goto again; | ||
| 7167 | } | ||
| 7168 | |||
| 7169 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5358 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
| 7170 | ret = -EINVAL; | 5359 | ret = -EINVAL; |
| 7171 | goto out; | 5360 | goto out; |
| @@ -9452,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
| 9452 | tg->rt_rq[cpu] = rt_rq; | 7641 | tg->rt_rq[cpu] = rt_rq; |
| 9453 | init_rt_rq(rt_rq, rq); | 7642 | init_rt_rq(rt_rq, rq); |
| 9454 | rt_rq->tg = tg; | 7643 | rt_rq->tg = tg; |
| 9455 | rt_rq->rt_se = rt_se; | ||
| 9456 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7644 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 9457 | if (add) | 7645 | if (add) |
| 9458 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7646 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
| @@ -9483,9 +7671,6 @@ void __init sched_init(void) | |||
| 9483 | #ifdef CONFIG_RT_GROUP_SCHED | 7671 | #ifdef CONFIG_RT_GROUP_SCHED |
| 9484 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7672 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
| 9485 | #endif | 7673 | #endif |
| 9486 | #ifdef CONFIG_USER_SCHED | ||
| 9487 | alloc_size *= 2; | ||
| 9488 | #endif | ||
| 9489 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7674 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 9490 | alloc_size += num_possible_cpus() * cpumask_size(); | 7675 | alloc_size += num_possible_cpus() * cpumask_size(); |
| 9491 | #endif | 7676 | #endif |
| @@ -9499,13 +7684,6 @@ void __init sched_init(void) | |||
| 9499 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7684 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; |
| 9500 | ptr += nr_cpu_ids * sizeof(void **); | 7685 | ptr += nr_cpu_ids * sizeof(void **); |
| 9501 | 7686 | ||
| 9502 | #ifdef CONFIG_USER_SCHED | ||
| 9503 | root_task_group.se = (struct sched_entity **)ptr; | ||
| 9504 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 9505 | |||
| 9506 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
| 9507 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 9508 | #endif /* CONFIG_USER_SCHED */ | ||
| 9509 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7687 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 9510 | #ifdef CONFIG_RT_GROUP_SCHED | 7688 | #ifdef CONFIG_RT_GROUP_SCHED |
| 9511 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7689 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
| @@ -9514,13 +7692,6 @@ void __init sched_init(void) | |||
| 9514 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7692 | init_task_group.rt_rq = (struct rt_rq **)ptr; |
| 9515 | ptr += nr_cpu_ids * sizeof(void **); | 7693 | ptr += nr_cpu_ids * sizeof(void **); |
| 9516 | 7694 | ||
| 9517 | #ifdef CONFIG_USER_SCHED | ||
| 9518 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
| 9519 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 9520 | |||
| 9521 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
| 9522 | ptr += nr_cpu_ids * sizeof(void **); | ||
| 9523 | #endif /* CONFIG_USER_SCHED */ | ||
| 9524 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7695 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 9525 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7696 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 9526 | for_each_possible_cpu(i) { | 7697 | for_each_possible_cpu(i) { |
| @@ -9540,22 +7711,13 @@ void __init sched_init(void) | |||
| 9540 | #ifdef CONFIG_RT_GROUP_SCHED | 7711 | #ifdef CONFIG_RT_GROUP_SCHED |
| 9541 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7712 | init_rt_bandwidth(&init_task_group.rt_bandwidth, |
| 9542 | global_rt_period(), global_rt_runtime()); | 7713 | global_rt_period(), global_rt_runtime()); |
| 9543 | #ifdef CONFIG_USER_SCHED | ||
| 9544 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
| 9545 | global_rt_period(), RUNTIME_INF); | ||
| 9546 | #endif /* CONFIG_USER_SCHED */ | ||
| 9547 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7714 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 9548 | 7715 | ||
| 9549 | #ifdef CONFIG_GROUP_SCHED | 7716 | #ifdef CONFIG_CGROUP_SCHED |
| 9550 | list_add(&init_task_group.list, &task_groups); | 7717 | list_add(&init_task_group.list, &task_groups); |
| 9551 | INIT_LIST_HEAD(&init_task_group.children); | 7718 | INIT_LIST_HEAD(&init_task_group.children); |
| 9552 | 7719 | ||
| 9553 | #ifdef CONFIG_USER_SCHED | 7720 | #endif /* CONFIG_CGROUP_SCHED */ |
| 9554 | INIT_LIST_HEAD(&root_task_group.children); | ||
| 9555 | init_task_group.parent = &root_task_group; | ||
| 9556 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
| 9557 | #endif /* CONFIG_USER_SCHED */ | ||
| 9558 | #endif /* CONFIG_GROUP_SCHED */ | ||
| 9559 | 7721 | ||
| 9560 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | 7722 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP |
| 9561 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | 7723 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), |
| @@ -9595,25 +7757,6 @@ void __init sched_init(void) | |||
| 9595 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7757 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). |
| 9596 | */ | 7758 | */ |
| 9597 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7759 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); |
| 9598 | #elif defined CONFIG_USER_SCHED | ||
| 9599 | root_task_group.shares = NICE_0_LOAD; | ||
| 9600 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
| 9601 | /* | ||
| 9602 | * In case of task-groups formed thr' the user id of tasks, | ||
| 9603 | * init_task_group represents tasks belonging to root user. | ||
| 9604 | * Hence it forms a sibling of all subsequent groups formed. | ||
| 9605 | * In this case, init_task_group gets only a fraction of overall | ||
| 9606 | * system cpu resource, based on the weight assigned to root | ||
| 9607 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
| 9608 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
| 9609 | * (init_tg_cfs_rq) and having one entity represent this group of | ||
| 9610 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
| 9611 | */ | ||
| 9612 | init_tg_cfs_entry(&init_task_group, | ||
| 9613 | &per_cpu(init_tg_cfs_rq, i), | ||
| 9614 | &per_cpu(init_sched_entity, i), i, 1, | ||
| 9615 | root_task_group.se[i]); | ||
| 9616 | |||
| 9617 | #endif | 7760 | #endif |
| 9618 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7761 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 9619 | 7762 | ||
| @@ -9622,12 +7765,6 @@ void __init sched_init(void) | |||
| 9622 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7765 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
| 9623 | #ifdef CONFIG_CGROUP_SCHED | 7766 | #ifdef CONFIG_CGROUP_SCHED |
| 9624 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | 7767 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); |
| 9625 | #elif defined CONFIG_USER_SCHED | ||
| 9626 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
| 9627 | init_tg_rt_entry(&init_task_group, | ||
| 9628 | &per_cpu(init_rt_rq_var, i), | ||
| 9629 | &per_cpu(init_sched_rt_entity, i), i, 1, | ||
| 9630 | root_task_group.rt_se[i]); | ||
| 9631 | #endif | 7768 | #endif |
| 9632 | #endif | 7769 | #endif |
| 9633 | 7770 | ||
| @@ -9712,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
| 9712 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 7849 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
| 9713 | } | 7850 | } |
| 9714 | 7851 | ||
| 9715 | void __might_sleep(char *file, int line, int preempt_offset) | 7852 | void __might_sleep(const char *file, int line, int preempt_offset) |
| 9716 | { | 7853 | { |
| 9717 | #ifdef in_atomic | 7854 | #ifdef in_atomic |
| 9718 | static unsigned long prev_jiffy; /* ratelimiting */ | 7855 | static unsigned long prev_jiffy; /* ratelimiting */ |
| @@ -10023,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
| 10023 | } | 8160 | } |
| 10024 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8161 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 10025 | 8162 | ||
| 10026 | #ifdef CONFIG_GROUP_SCHED | 8163 | #ifdef CONFIG_CGROUP_SCHED |
| 10027 | static void free_sched_group(struct task_group *tg) | 8164 | static void free_sched_group(struct task_group *tg) |
| 10028 | { | 8165 | { |
| 10029 | free_fair_sched_group(tg); | 8166 | free_fair_sched_group(tg); |
| @@ -10128,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk) | |||
| 10128 | if (unlikely(running)) | 8265 | if (unlikely(running)) |
| 10129 | tsk->sched_class->set_curr_task(rq); | 8266 | tsk->sched_class->set_curr_task(rq); |
| 10130 | if (on_rq) | 8267 | if (on_rq) |
| 10131 | enqueue_task(rq, tsk, 0); | 8268 | enqueue_task(rq, tsk, 0, false); |
| 10132 | 8269 | ||
| 10133 | task_rq_unlock(rq, &flags); | 8270 | task_rq_unlock(rq, &flags); |
| 10134 | } | 8271 | } |
| 10135 | #endif /* CONFIG_GROUP_SCHED */ | 8272 | #endif /* CONFIG_CGROUP_SCHED */ |
| 10136 | 8273 | ||
| 10137 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8274 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 10138 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | 8275 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
| @@ -10274,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
| 10274 | runtime = d->rt_runtime; | 8411 | runtime = d->rt_runtime; |
| 10275 | } | 8412 | } |
| 10276 | 8413 | ||
| 10277 | #ifdef CONFIG_USER_SCHED | ||
| 10278 | if (tg == &root_task_group) { | ||
| 10279 | period = global_rt_period(); | ||
| 10280 | runtime = global_rt_runtime(); | ||
| 10281 | } | ||
| 10282 | #endif | ||
| 10283 | |||
| 10284 | /* | 8414 | /* |
| 10285 | * Cannot have more runtime than the period. | 8415 | * Cannot have more runtime than the period. |
| 10286 | */ | 8416 | */ |
| @@ -10683,7 +8813,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 10683 | struct cpuacct { | 8813 | struct cpuacct { |
| 10684 | struct cgroup_subsys_state css; | 8814 | struct cgroup_subsys_state css; |
| 10685 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8815 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
| 10686 | u64 *cpuusage; | 8816 | u64 __percpu *cpuusage; |
| 10687 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8817 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
| 10688 | struct cpuacct *parent; | 8818 | struct cpuacct *parent; |
| 10689 | }; | 8819 | }; |
| @@ -10900,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
| 10900 | } | 9030 | } |
| 10901 | 9031 | ||
| 10902 | /* | 9032 | /* |
| 9033 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
| 9034 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
| 9035 | * percpu_counter_add with values large enough to always overflow the | ||
| 9036 | * per cpu batch limit causing bad SMP scalability. | ||
| 9037 | * | ||
| 9038 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
| 9039 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
| 9040 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
| 9041 | */ | ||
| 9042 | #ifdef CONFIG_SMP | ||
| 9043 | #define CPUACCT_BATCH \ | ||
| 9044 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
| 9045 | #else | ||
| 9046 | #define CPUACCT_BATCH 0 | ||
| 9047 | #endif | ||
| 9048 | |||
| 9049 | /* | ||
| 10903 | * Charge the system/user time to the task's accounting group. | 9050 | * Charge the system/user time to the task's accounting group. |
| 10904 | */ | 9051 | */ |
| 10905 | static void cpuacct_update_stats(struct task_struct *tsk, | 9052 | static void cpuacct_update_stats(struct task_struct *tsk, |
| 10906 | enum cpuacct_stat_index idx, cputime_t val) | 9053 | enum cpuacct_stat_index idx, cputime_t val) |
| 10907 | { | 9054 | { |
| 10908 | struct cpuacct *ca; | 9055 | struct cpuacct *ca; |
| 9056 | int batch = CPUACCT_BATCH; | ||
| 10909 | 9057 | ||
| 10910 | if (unlikely(!cpuacct_subsys.active)) | 9058 | if (unlikely(!cpuacct_subsys.active)) |
| 10911 | return; | 9059 | return; |
| @@ -10914,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, | |||
| 10914 | ca = task_ca(tsk); | 9062 | ca = task_ca(tsk); |
| 10915 | 9063 | ||
| 10916 | do { | 9064 | do { |
| 10917 | percpu_counter_add(&ca->cpustat[idx], val); | 9065 | __percpu_counter_add(&ca->cpustat[idx], val, batch); |
| 10918 | ca = ca->parent; | 9066 | ca = ca->parent; |
| 10919 | } while (ca); | 9067 | } while (ca); |
| 10920 | rcu_read_unlock(); | 9068 | rcu_read_unlock(); |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..eeb3506c4834 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
| @@ -47,9 +47,7 @@ static int convert_prio(int prio) | |||
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | #define for_each_cpupri_active(array, idx) \ | 49 | #define for_each_cpupri_active(array, idx) \ |
| 50 | for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ | 50 | for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) |
| 51 | idx < CPUPRI_NR_PRIORITIES; \ | ||
| 52 | idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) | ||
| 53 | 51 | ||
| 54 | /** | 52 | /** |
| 55 | * cpupri_find - find the best (lowest-pri) CPU in the system | 53 | * cpupri_find - find the best (lowest-pri) CPU in the system |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8fe7ee81c552..3e1fd96c6cf9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq) | |||
| 1053 | * increased. Here we update the fair scheduling stats and | 1053 | * increased. Here we update the fair scheduling stats and |
| 1054 | * then put the task into the rbtree: | 1054 | * then put the task into the rbtree: |
| 1055 | */ | 1055 | */ |
| 1056 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 1056 | static void |
| 1057 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
| 1057 | { | 1058 | { |
| 1058 | struct cfs_rq *cfs_rq; | 1059 | struct cfs_rq *cfs_rq; |
| 1059 | struct sched_entity *se = &p->se; | 1060 | struct sched_entity *se = &p->se; |
| @@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1815 | */ | 1816 | */ |
| 1816 | 1817 | ||
| 1817 | /* | 1818 | /* |
| 1818 | * Load-balancing iterator. Note: while the runqueue stays locked | 1819 | * pull_task - move a task from a remote runqueue to the local runqueue. |
| 1819 | * during the whole iteration, the current task might be | 1820 | * Both runqueues must be locked. |
| 1820 | * dequeued so the iterator has to be dequeue-safe. Here we | ||
| 1821 | * achieve that by always pre-iterating before returning | ||
| 1822 | * the current task: | ||
| 1823 | */ | 1821 | */ |
| 1824 | static struct task_struct * | 1822 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
| 1825 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | 1823 | struct rq *this_rq, int this_cpu) |
| 1826 | { | 1824 | { |
| 1827 | struct task_struct *p = NULL; | 1825 | deactivate_task(src_rq, p, 0); |
| 1828 | struct sched_entity *se; | 1826 | set_task_cpu(p, this_cpu); |
| 1827 | activate_task(this_rq, p, 0); | ||
| 1828 | check_preempt_curr(this_rq, p, 0); | ||
| 1829 | } | ||
| 1829 | 1830 | ||
| 1830 | if (next == &cfs_rq->tasks) | 1831 | /* |
| 1831 | return NULL; | 1832 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 1833 | */ | ||
| 1834 | static | ||
| 1835 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | ||
| 1836 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1837 | int *all_pinned) | ||
| 1838 | { | ||
| 1839 | int tsk_cache_hot = 0; | ||
| 1840 | /* | ||
| 1841 | * We do not migrate tasks that are: | ||
| 1842 | * 1) running (obviously), or | ||
| 1843 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
| 1844 | * 3) are cache-hot on their current CPU. | ||
| 1845 | */ | ||
| 1846 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | ||
| 1847 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
| 1848 | return 0; | ||
| 1849 | } | ||
| 1850 | *all_pinned = 0; | ||
| 1832 | 1851 | ||
| 1833 | se = list_entry(next, struct sched_entity, group_node); | 1852 | if (task_running(rq, p)) { |
| 1834 | p = task_of(se); | 1853 | schedstat_inc(p, se.nr_failed_migrations_running); |
| 1835 | cfs_rq->balance_iterator = next->next; | 1854 | return 0; |
| 1855 | } | ||
| 1836 | 1856 | ||
| 1837 | return p; | 1857 | /* |
| 1838 | } | 1858 | * Aggressive migration if: |
| 1859 | * 1) task is cache cold, or | ||
| 1860 | * 2) too many balance attempts have failed. | ||
| 1861 | */ | ||
| 1839 | 1862 | ||
| 1840 | static struct task_struct *load_balance_start_fair(void *arg) | 1863 | tsk_cache_hot = task_hot(p, rq->clock, sd); |
| 1841 | { | 1864 | if (!tsk_cache_hot || |
| 1842 | struct cfs_rq *cfs_rq = arg; | 1865 | sd->nr_balance_failed > sd->cache_nice_tries) { |
| 1866 | #ifdef CONFIG_SCHEDSTATS | ||
| 1867 | if (tsk_cache_hot) { | ||
| 1868 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 1869 | schedstat_inc(p, se.nr_forced_migrations); | ||
| 1870 | } | ||
| 1871 | #endif | ||
| 1872 | return 1; | ||
| 1873 | } | ||
| 1843 | 1874 | ||
| 1844 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); | 1875 | if (tsk_cache_hot) { |
| 1876 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
| 1877 | return 0; | ||
| 1878 | } | ||
| 1879 | return 1; | ||
| 1845 | } | 1880 | } |
| 1846 | 1881 | ||
| 1847 | static struct task_struct *load_balance_next_fair(void *arg) | 1882 | /* |
| 1883 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
| 1884 | * part of active balancing operations within "domain". | ||
| 1885 | * Returns 1 if successful and 0 otherwise. | ||
| 1886 | * | ||
| 1887 | * Called with both runqueues locked. | ||
| 1888 | */ | ||
| 1889 | static int | ||
| 1890 | move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1891 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
| 1848 | { | 1892 | { |
| 1849 | struct cfs_rq *cfs_rq = arg; | 1893 | struct task_struct *p, *n; |
| 1894 | struct cfs_rq *cfs_rq; | ||
| 1895 | int pinned = 0; | ||
| 1896 | |||
| 1897 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | ||
| 1898 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | ||
| 1899 | |||
| 1900 | if (!can_migrate_task(p, busiest, this_cpu, | ||
| 1901 | sd, idle, &pinned)) | ||
| 1902 | continue; | ||
| 1850 | 1903 | ||
| 1851 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1904 | pull_task(busiest, p, this_rq, this_cpu); |
| 1905 | /* | ||
| 1906 | * Right now, this is only the second place pull_task() | ||
| 1907 | * is called, so we can safely collect pull_task() | ||
| 1908 | * stats here rather than inside pull_task(). | ||
| 1909 | */ | ||
| 1910 | schedstat_inc(sd, lb_gained[idle]); | ||
| 1911 | return 1; | ||
| 1912 | } | ||
| 1913 | } | ||
| 1914 | |||
| 1915 | return 0; | ||
| 1852 | } | 1916 | } |
| 1853 | 1917 | ||
| 1854 | static unsigned long | 1918 | static unsigned long |
| 1855 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1919 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1856 | unsigned long max_load_move, struct sched_domain *sd, | 1920 | unsigned long max_load_move, struct sched_domain *sd, |
| 1857 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | 1921 | enum cpu_idle_type idle, int *all_pinned, |
| 1858 | struct cfs_rq *cfs_rq) | 1922 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) |
| 1859 | { | 1923 | { |
| 1860 | struct rq_iterator cfs_rq_iterator; | 1924 | int loops = 0, pulled = 0, pinned = 0; |
| 1925 | long rem_load_move = max_load_move; | ||
| 1926 | struct task_struct *p, *n; | ||
| 1861 | 1927 | ||
| 1862 | cfs_rq_iterator.start = load_balance_start_fair; | 1928 | if (max_load_move == 0) |
| 1863 | cfs_rq_iterator.next = load_balance_next_fair; | 1929 | goto out; |
| 1864 | cfs_rq_iterator.arg = cfs_rq; | ||
| 1865 | 1930 | ||
| 1866 | return balance_tasks(this_rq, this_cpu, busiest, | 1931 | pinned = 1; |
| 1867 | max_load_move, sd, idle, all_pinned, | 1932 | |
| 1868 | this_best_prio, &cfs_rq_iterator); | 1933 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
| 1934 | if (loops++ > sysctl_sched_nr_migrate) | ||
| 1935 | break; | ||
| 1936 | |||
| 1937 | if ((p->se.load.weight >> 1) > rem_load_move || | ||
| 1938 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | ||
| 1939 | continue; | ||
| 1940 | |||
| 1941 | pull_task(busiest, p, this_rq, this_cpu); | ||
| 1942 | pulled++; | ||
| 1943 | rem_load_move -= p->se.load.weight; | ||
| 1944 | |||
| 1945 | #ifdef CONFIG_PREEMPT | ||
| 1946 | /* | ||
| 1947 | * NEWIDLE balancing is a source of latency, so preemptible | ||
| 1948 | * kernels will stop after the first task is pulled to minimize | ||
| 1949 | * the critical section. | ||
| 1950 | */ | ||
| 1951 | if (idle == CPU_NEWLY_IDLE) | ||
| 1952 | break; | ||
| 1953 | #endif | ||
| 1954 | |||
| 1955 | /* | ||
| 1956 | * We only want to steal up to the prescribed amount of | ||
| 1957 | * weighted load. | ||
| 1958 | */ | ||
| 1959 | if (rem_load_move <= 0) | ||
| 1960 | break; | ||
| 1961 | |||
| 1962 | if (p->prio < *this_best_prio) | ||
| 1963 | *this_best_prio = p->prio; | ||
| 1964 | } | ||
| 1965 | out: | ||
| 1966 | /* | ||
| 1967 | * Right now, this is one of only two places pull_task() is called, | ||
| 1968 | * so we can safely collect pull_task() stats here rather than | ||
| 1969 | * inside pull_task(). | ||
| 1970 | */ | ||
| 1971 | schedstat_add(sd, lb_gained[idle], pulled); | ||
| 1972 | |||
| 1973 | if (all_pinned) | ||
| 1974 | *all_pinned = pinned; | ||
| 1975 | |||
| 1976 | return max_load_move - rem_load_move; | ||
| 1869 | } | 1977 | } |
| 1870 | 1978 | ||
| 1871 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1979 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1897 | rem_load = (u64)rem_load_move * busiest_weight; | 2005 | rem_load = (u64)rem_load_move * busiest_weight; |
| 1898 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2006 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
| 1899 | 2007 | ||
| 1900 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, | 2008 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
| 1901 | rem_load, sd, idle, all_pinned, this_best_prio, | 2009 | rem_load, sd, idle, all_pinned, this_best_prio, |
| 1902 | tg->cfs_rq[busiest_cpu]); | 2010 | busiest_cfs_rq); |
| 1903 | 2011 | ||
| 1904 | if (!moved_load) | 2012 | if (!moved_load) |
| 1905 | continue; | 2013 | continue; |
| @@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1922 | struct sched_domain *sd, enum cpu_idle_type idle, | 2030 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1923 | int *all_pinned, int *this_best_prio) | 2031 | int *all_pinned, int *this_best_prio) |
| 1924 | { | 2032 | { |
| 1925 | return __load_balance_fair(this_rq, this_cpu, busiest, | 2033 | return balance_tasks(this_rq, this_cpu, busiest, |
| 1926 | max_load_move, sd, idle, all_pinned, | 2034 | max_load_move, sd, idle, all_pinned, |
| 1927 | this_best_prio, &busiest->cfs); | 2035 | this_best_prio, &busiest->cfs); |
| 1928 | } | 2036 | } |
| 1929 | #endif | 2037 | #endif |
| 1930 | 2038 | ||
| 1931 | static int | 2039 | /* |
| 1932 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2040 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
| 1933 | struct sched_domain *sd, enum cpu_idle_type idle) | 2041 | * this_rq, as part of a balancing operation within domain "sd". |
| 2042 | * Returns 1 if successful and 0 otherwise. | ||
| 2043 | * | ||
| 2044 | * Called with both runqueues locked. | ||
| 2045 | */ | ||
| 2046 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 2047 | unsigned long max_load_move, | ||
| 2048 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 2049 | int *all_pinned) | ||
| 1934 | { | 2050 | { |
| 1935 | struct cfs_rq *busy_cfs_rq; | 2051 | unsigned long total_load_moved = 0, load_moved; |
| 1936 | struct rq_iterator cfs_rq_iterator; | 2052 | int this_best_prio = this_rq->curr->prio; |
| 1937 | 2053 | ||
| 1938 | cfs_rq_iterator.start = load_balance_start_fair; | 2054 | do { |
| 1939 | cfs_rq_iterator.next = load_balance_next_fair; | 2055 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
| 2056 | max_load_move - total_load_moved, | ||
| 2057 | sd, idle, all_pinned, &this_best_prio); | ||
| 1940 | 2058 | ||
| 1941 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 2059 | total_load_moved += load_moved; |
| 2060 | |||
| 2061 | #ifdef CONFIG_PREEMPT | ||
| 1942 | /* | 2062 | /* |
| 1943 | * pass busy_cfs_rq argument into | 2063 | * NEWIDLE balancing is a source of latency, so preemptible |
| 1944 | * load_balance_[start|next]_fair iterators | 2064 | * kernels will stop after the first task is pulled to minimize |
| 2065 | * the critical section. | ||
| 1945 | */ | 2066 | */ |
| 1946 | cfs_rq_iterator.arg = busy_cfs_rq; | 2067 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) |
| 1947 | if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 2068 | break; |
| 1948 | &cfs_rq_iterator)) | 2069 | |
| 1949 | return 1; | 2070 | if (raw_spin_is_contended(&this_rq->lock) || |
| 2071 | raw_spin_is_contended(&busiest->lock)) | ||
| 2072 | break; | ||
| 2073 | #endif | ||
| 2074 | } while (load_moved && max_load_move > total_load_moved); | ||
| 2075 | |||
| 2076 | return total_load_moved > 0; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | /********** Helpers for find_busiest_group ************************/ | ||
| 2080 | /* | ||
| 2081 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 2082 | * during load balancing. | ||
| 2083 | */ | ||
| 2084 | struct sd_lb_stats { | ||
| 2085 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 2086 | struct sched_group *this; /* Local group in this sd */ | ||
| 2087 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 2088 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 2089 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 2090 | |||
| 2091 | /** Statistics of this group */ | ||
| 2092 | unsigned long this_load; | ||
| 2093 | unsigned long this_load_per_task; | ||
| 2094 | unsigned long this_nr_running; | ||
| 2095 | |||
| 2096 | /* Statistics of the busiest group */ | ||
| 2097 | unsigned long max_load; | ||
| 2098 | unsigned long busiest_load_per_task; | ||
| 2099 | unsigned long busiest_nr_running; | ||
| 2100 | unsigned long busiest_group_capacity; | ||
| 2101 | |||
| 2102 | int group_imb; /* Is there imbalance in this sd */ | ||
| 2103 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2104 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
| 2105 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
| 2106 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
| 2107 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
| 2108 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
| 2109 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
| 2110 | #endif | ||
| 2111 | }; | ||
| 2112 | |||
| 2113 | /* | ||
| 2114 | * sg_lb_stats - stats of a sched_group required for load_balancing | ||
| 2115 | */ | ||
| 2116 | struct sg_lb_stats { | ||
| 2117 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | ||
| 2118 | unsigned long group_load; /* Total load over the CPUs of the group */ | ||
| 2119 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
| 2120 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
| 2121 | unsigned long group_capacity; | ||
| 2122 | int group_imb; /* Is there an imbalance in the group ? */ | ||
| 2123 | }; | ||
| 2124 | |||
| 2125 | /** | ||
| 2126 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
| 2127 | * @group: The group whose first cpu is to be returned. | ||
| 2128 | */ | ||
| 2129 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
| 2130 | { | ||
| 2131 | return cpumask_first(sched_group_cpus(group)); | ||
| 2132 | } | ||
| 2133 | |||
| 2134 | /** | ||
| 2135 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
| 2136 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
| 2137 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. | ||
| 2138 | */ | ||
| 2139 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
| 2140 | enum cpu_idle_type idle) | ||
| 2141 | { | ||
| 2142 | int load_idx; | ||
| 2143 | |||
| 2144 | switch (idle) { | ||
| 2145 | case CPU_NOT_IDLE: | ||
| 2146 | load_idx = sd->busy_idx; | ||
| 2147 | break; | ||
| 2148 | |||
| 2149 | case CPU_NEWLY_IDLE: | ||
| 2150 | load_idx = sd->newidle_idx; | ||
| 2151 | break; | ||
| 2152 | default: | ||
| 2153 | load_idx = sd->idle_idx; | ||
| 2154 | break; | ||
| 1950 | } | 2155 | } |
| 1951 | 2156 | ||
| 2157 | return load_idx; | ||
| 2158 | } | ||
| 2159 | |||
| 2160 | |||
| 2161 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2162 | /** | ||
| 2163 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
| 2164 | * the given sched_domain, during load balancing. | ||
| 2165 | * | ||
| 2166 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
| 2167 | * @sds: Variable containing the statistics for sd. | ||
| 2168 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
| 2169 | */ | ||
| 2170 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
| 2171 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
| 2172 | { | ||
| 2173 | /* | ||
| 2174 | * Busy processors will not participate in power savings | ||
| 2175 | * balance. | ||
| 2176 | */ | ||
| 2177 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 2178 | sds->power_savings_balance = 0; | ||
| 2179 | else { | ||
| 2180 | sds->power_savings_balance = 1; | ||
| 2181 | sds->min_nr_running = ULONG_MAX; | ||
| 2182 | sds->leader_nr_running = 0; | ||
| 2183 | } | ||
| 2184 | } | ||
| 2185 | |||
| 2186 | /** | ||
| 2187 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
| 2188 | * sched_domain while performing load balancing. | ||
| 2189 | * | ||
| 2190 | * @group: sched_group belonging to the sched_domain under consideration. | ||
| 2191 | * @sds: Variable containing the statistics of the sched_domain | ||
| 2192 | * @local_group: Does group contain the CPU for which we're performing | ||
| 2193 | * load balancing ? | ||
| 2194 | * @sgs: Variable containing the statistics of the group. | ||
| 2195 | */ | ||
| 2196 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
| 2197 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
| 2198 | { | ||
| 2199 | |||
| 2200 | if (!sds->power_savings_balance) | ||
| 2201 | return; | ||
| 2202 | |||
| 2203 | /* | ||
| 2204 | * If the local group is idle or completely loaded | ||
| 2205 | * no need to do power savings balance at this domain | ||
| 2206 | */ | ||
| 2207 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
| 2208 | !sds->this_nr_running)) | ||
| 2209 | sds->power_savings_balance = 0; | ||
| 2210 | |||
| 2211 | /* | ||
| 2212 | * If a group is already running at full capacity or idle, | ||
| 2213 | * don't include that group in power savings calculations | ||
| 2214 | */ | ||
| 2215 | if (!sds->power_savings_balance || | ||
| 2216 | sgs->sum_nr_running >= sgs->group_capacity || | ||
| 2217 | !sgs->sum_nr_running) | ||
| 2218 | return; | ||
| 2219 | |||
| 2220 | /* | ||
| 2221 | * Calculate the group which has the least non-idle load. | ||
| 2222 | * This is the group from where we need to pick up the load | ||
| 2223 | * for saving power | ||
| 2224 | */ | ||
| 2225 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
| 2226 | (sgs->sum_nr_running == sds->min_nr_running && | ||
| 2227 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
| 2228 | sds->group_min = group; | ||
| 2229 | sds->min_nr_running = sgs->sum_nr_running; | ||
| 2230 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
| 2231 | sgs->sum_nr_running; | ||
| 2232 | } | ||
| 2233 | |||
| 2234 | /* | ||
| 2235 | * Calculate the group which is almost near its | ||
| 2236 | * capacity but still has some space to pick up some load | ||
| 2237 | * from other group and save more power | ||
| 2238 | */ | ||
| 2239 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
| 2240 | return; | ||
| 2241 | |||
| 2242 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
| 2243 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
| 2244 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
| 2245 | sds->group_leader = group; | ||
| 2246 | sds->leader_nr_running = sgs->sum_nr_running; | ||
| 2247 | } | ||
| 2248 | } | ||
| 2249 | |||
| 2250 | /** | ||
| 2251 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
| 2252 | * @sds: Variable containing the statistics of the sched_domain | ||
| 2253 | * under consideration. | ||
| 2254 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
| 2255 | * @imbalance: Variable to store the imbalance. | ||
| 2256 | * | ||
| 2257 | * Description: | ||
| 2258 | * Check if we have potential to perform some power-savings balance. | ||
| 2259 | * If yes, set the busiest group to be the least loaded group in the | ||
| 2260 | * sched_domain, so that its CPUs can be put to idle. | ||
| 2261 | * | ||
| 2262 | * Returns 1 if there is potential to perform power-savings balance. | ||
| 2263 | * Else returns 0. | ||
| 2264 | */ | ||
| 2265 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
| 2266 | int this_cpu, unsigned long *imbalance) | ||
| 2267 | { | ||
| 2268 | if (!sds->power_savings_balance) | ||
| 2269 | return 0; | ||
| 2270 | |||
| 2271 | if (sds->this != sds->group_leader || | ||
| 2272 | sds->group_leader == sds->group_min) | ||
| 2273 | return 0; | ||
| 2274 | |||
| 2275 | *imbalance = sds->min_load_per_task; | ||
| 2276 | sds->busiest = sds->group_min; | ||
| 2277 | |||
| 2278 | return 1; | ||
| 2279 | |||
| 2280 | } | ||
| 2281 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
| 2282 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
| 2283 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
| 2284 | { | ||
| 2285 | return; | ||
| 2286 | } | ||
| 2287 | |||
| 2288 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
| 2289 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
| 2290 | { | ||
| 2291 | return; | ||
| 2292 | } | ||
| 2293 | |||
| 2294 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
| 2295 | int this_cpu, unsigned long *imbalance) | ||
| 2296 | { | ||
| 1952 | return 0; | 2297 | return 0; |
| 1953 | } | 2298 | } |
| 2299 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
| 2300 | |||
| 2301 | |||
| 2302 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 2303 | { | ||
| 2304 | return SCHED_LOAD_SCALE; | ||
| 2305 | } | ||
| 2306 | |||
| 2307 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 2308 | { | ||
| 2309 | return default_scale_freq_power(sd, cpu); | ||
| 2310 | } | ||
| 2311 | |||
| 2312 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 2313 | { | ||
| 2314 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 2315 | unsigned long smt_gain = sd->smt_gain; | ||
| 2316 | |||
| 2317 | smt_gain /= weight; | ||
| 2318 | |||
| 2319 | return smt_gain; | ||
| 2320 | } | ||
| 2321 | |||
| 2322 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 2323 | { | ||
| 2324 | return default_scale_smt_power(sd, cpu); | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | unsigned long scale_rt_power(int cpu) | ||
| 2328 | { | ||
| 2329 | struct rq *rq = cpu_rq(cpu); | ||
| 2330 | u64 total, available; | ||
| 2331 | |||
| 2332 | sched_avg_update(rq); | ||
| 2333 | |||
| 2334 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
| 2335 | available = total - rq->rt_avg; | ||
| 2336 | |||
| 2337 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
| 2338 | total = SCHED_LOAD_SCALE; | ||
| 2339 | |||
| 2340 | total >>= SCHED_LOAD_SHIFT; | ||
| 2341 | |||
| 2342 | return div_u64(available, total); | ||
| 2343 | } | ||
| 2344 | |||
| 2345 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
| 2346 | { | ||
| 2347 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 2348 | unsigned long power = SCHED_LOAD_SCALE; | ||
| 2349 | struct sched_group *sdg = sd->groups; | ||
| 2350 | |||
| 2351 | if (sched_feat(ARCH_POWER)) | ||
| 2352 | power *= arch_scale_freq_power(sd, cpu); | ||
| 2353 | else | ||
| 2354 | power *= default_scale_freq_power(sd, cpu); | ||
| 2355 | |||
| 2356 | power >>= SCHED_LOAD_SHIFT; | ||
| 2357 | |||
| 2358 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 2359 | if (sched_feat(ARCH_POWER)) | ||
| 2360 | power *= arch_scale_smt_power(sd, cpu); | ||
| 2361 | else | ||
| 2362 | power *= default_scale_smt_power(sd, cpu); | ||
| 2363 | |||
| 2364 | power >>= SCHED_LOAD_SHIFT; | ||
| 2365 | } | ||
| 2366 | |||
| 2367 | power *= scale_rt_power(cpu); | ||
| 2368 | power >>= SCHED_LOAD_SHIFT; | ||
| 2369 | |||
| 2370 | if (!power) | ||
| 2371 | power = 1; | ||
| 2372 | |||
| 2373 | sdg->cpu_power = power; | ||
| 2374 | } | ||
| 2375 | |||
| 2376 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
| 2377 | { | ||
| 2378 | struct sched_domain *child = sd->child; | ||
| 2379 | struct sched_group *group, *sdg = sd->groups; | ||
| 2380 | unsigned long power; | ||
| 2381 | |||
| 2382 | if (!child) { | ||
| 2383 | update_cpu_power(sd, cpu); | ||
| 2384 | return; | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | power = 0; | ||
| 2388 | |||
| 2389 | group = child->groups; | ||
| 2390 | do { | ||
| 2391 | power += group->cpu_power; | ||
| 2392 | group = group->next; | ||
| 2393 | } while (group != child->groups); | ||
| 2394 | |||
| 2395 | sdg->cpu_power = power; | ||
| 2396 | } | ||
| 2397 | |||
| 2398 | /** | ||
| 2399 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | ||
| 2400 | * @sd: The sched_domain whose statistics are to be updated. | ||
| 2401 | * @group: sched_group whose statistics are to be updated. | ||
| 2402 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 2403 | * @idle: Idle status of this_cpu | ||
| 2404 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | ||
| 2405 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 2406 | * @local_group: Does group contain this_cpu. | ||
| 2407 | * @cpus: Set of cpus considered for load balancing. | ||
| 2408 | * @balance: Should we balance. | ||
| 2409 | * @sgs: variable to hold the statistics for this group. | ||
| 2410 | */ | ||
| 2411 | static inline void update_sg_lb_stats(struct sched_domain *sd, | ||
| 2412 | struct sched_group *group, int this_cpu, | ||
| 2413 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | ||
| 2414 | int local_group, const struct cpumask *cpus, | ||
| 2415 | int *balance, struct sg_lb_stats *sgs) | ||
| 2416 | { | ||
| 2417 | unsigned long load, max_cpu_load, min_cpu_load; | ||
| 2418 | int i; | ||
| 2419 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 2420 | unsigned long avg_load_per_task = 0; | ||
| 2421 | |||
| 2422 | if (local_group) | ||
| 2423 | balance_cpu = group_first_cpu(group); | ||
| 2424 | |||
| 2425 | /* Tally up the load of all CPUs in the group */ | ||
| 2426 | max_cpu_load = 0; | ||
| 2427 | min_cpu_load = ~0UL; | ||
| 2428 | |||
| 2429 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | ||
| 2430 | struct rq *rq = cpu_rq(i); | ||
| 2431 | |||
| 2432 | if (*sd_idle && rq->nr_running) | ||
| 2433 | *sd_idle = 0; | ||
| 2434 | |||
| 2435 | /* Bias balancing toward cpus of our domain */ | ||
| 2436 | if (local_group) { | ||
| 2437 | if (idle_cpu(i) && !first_idle_cpu) { | ||
| 2438 | first_idle_cpu = 1; | ||
| 2439 | balance_cpu = i; | ||
| 2440 | } | ||
| 2441 | |||
| 2442 | load = target_load(i, load_idx); | ||
| 2443 | } else { | ||
| 2444 | load = source_load(i, load_idx); | ||
| 2445 | if (load > max_cpu_load) | ||
| 2446 | max_cpu_load = load; | ||
| 2447 | if (min_cpu_load > load) | ||
| 2448 | min_cpu_load = load; | ||
| 2449 | } | ||
| 2450 | |||
| 2451 | sgs->group_load += load; | ||
| 2452 | sgs->sum_nr_running += rq->nr_running; | ||
| 2453 | sgs->sum_weighted_load += weighted_cpuload(i); | ||
| 2454 | |||
| 2455 | } | ||
| 2456 | |||
| 2457 | /* | ||
| 2458 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 2459 | * is eligible for doing load balancing at this and above | ||
| 2460 | * domains. In the newly idle case, we will allow all the cpu's | ||
| 2461 | * to do the newly idle load balance. | ||
| 2462 | */ | ||
| 2463 | if (idle != CPU_NEWLY_IDLE && local_group && | ||
| 2464 | balance_cpu != this_cpu) { | ||
| 2465 | *balance = 0; | ||
| 2466 | return; | ||
| 2467 | } | ||
| 2468 | |||
| 2469 | update_group_power(sd, this_cpu); | ||
| 2470 | |||
| 2471 | /* Adjust by relative CPU power of the group */ | ||
| 2472 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
| 2473 | |||
| 2474 | /* | ||
| 2475 | * Consider the group unbalanced when the imbalance is larger | ||
| 2476 | * than the average weight of two tasks. | ||
| 2477 | * | ||
| 2478 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 2479 | * might not be a suitable number - should we keep a | ||
| 2480 | * normalized nr_running number somewhere that negates | ||
| 2481 | * the hierarchy? | ||
| 2482 | */ | ||
| 2483 | if (sgs->sum_nr_running) | ||
| 2484 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | ||
| 2485 | |||
| 2486 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
| 2487 | sgs->group_imb = 1; | ||
| 2488 | |||
| 2489 | sgs->group_capacity = | ||
| 2490 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
| 2491 | } | ||
| 2492 | |||
| 2493 | /** | ||
| 2494 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | ||
| 2495 | * @sd: sched_domain whose statistics are to be updated. | ||
| 2496 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 2497 | * @idle: Idle status of this_cpu | ||
| 2498 | * @sd_idle: Idle status of the sched_domain containing group. | ||
| 2499 | * @cpus: Set of cpus considered for load balancing. | ||
| 2500 | * @balance: Should we balance. | ||
| 2501 | * @sds: variable to hold the statistics for this sched_domain. | ||
| 2502 | */ | ||
| 2503 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | ||
| 2504 | enum cpu_idle_type idle, int *sd_idle, | ||
| 2505 | const struct cpumask *cpus, int *balance, | ||
| 2506 | struct sd_lb_stats *sds) | ||
| 2507 | { | ||
| 2508 | struct sched_domain *child = sd->child; | ||
| 2509 | struct sched_group *group = sd->groups; | ||
| 2510 | struct sg_lb_stats sgs; | ||
| 2511 | int load_idx, prefer_sibling = 0; | ||
| 2512 | |||
| 2513 | if (child && child->flags & SD_PREFER_SIBLING) | ||
| 2514 | prefer_sibling = 1; | ||
| 2515 | |||
| 2516 | init_sd_power_savings_stats(sd, sds, idle); | ||
| 2517 | load_idx = get_sd_load_idx(sd, idle); | ||
| 2518 | |||
| 2519 | do { | ||
| 2520 | int local_group; | ||
| 2521 | |||
| 2522 | local_group = cpumask_test_cpu(this_cpu, | ||
| 2523 | sched_group_cpus(group)); | ||
| 2524 | memset(&sgs, 0, sizeof(sgs)); | ||
| 2525 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | ||
| 2526 | local_group, cpus, balance, &sgs); | ||
| 2527 | |||
| 2528 | if (local_group && !(*balance)) | ||
| 2529 | return; | ||
| 2530 | |||
| 2531 | sds->total_load += sgs.group_load; | ||
| 2532 | sds->total_pwr += group->cpu_power; | ||
| 2533 | |||
| 2534 | /* | ||
| 2535 | * In case the child domain prefers tasks go to siblings | ||
| 2536 | * first, lower the group capacity to one so that we'll try | ||
| 2537 | * and move all the excess tasks away. | ||
| 2538 | */ | ||
| 2539 | if (prefer_sibling) | ||
| 2540 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
| 2541 | |||
| 2542 | if (local_group) { | ||
| 2543 | sds->this_load = sgs.avg_load; | ||
| 2544 | sds->this = group; | ||
| 2545 | sds->this_nr_running = sgs.sum_nr_running; | ||
| 2546 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
| 2547 | } else if (sgs.avg_load > sds->max_load && | ||
| 2548 | (sgs.sum_nr_running > sgs.group_capacity || | ||
| 2549 | sgs.group_imb)) { | ||
| 2550 | sds->max_load = sgs.avg_load; | ||
| 2551 | sds->busiest = group; | ||
| 2552 | sds->busiest_nr_running = sgs.sum_nr_running; | ||
| 2553 | sds->busiest_group_capacity = sgs.group_capacity; | ||
| 2554 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
| 2555 | sds->group_imb = sgs.group_imb; | ||
| 2556 | } | ||
| 2557 | |||
| 2558 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | ||
| 2559 | group = group->next; | ||
| 2560 | } while (group != sd->groups); | ||
| 2561 | } | ||
| 2562 | |||
| 2563 | /** | ||
| 2564 | * fix_small_imbalance - Calculate the minor imbalance that exists | ||
| 2565 | * amongst the groups of a sched_domain, during | ||
| 2566 | * load balancing. | ||
| 2567 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | ||
| 2568 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 2569 | * @imbalance: Variable to store the imbalance. | ||
| 2570 | */ | ||
| 2571 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | ||
| 2572 | int this_cpu, unsigned long *imbalance) | ||
| 2573 | { | ||
| 2574 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | ||
| 2575 | unsigned int imbn = 2; | ||
| 2576 | unsigned long scaled_busy_load_per_task; | ||
| 2577 | |||
| 2578 | if (sds->this_nr_running) { | ||
| 2579 | sds->this_load_per_task /= sds->this_nr_running; | ||
| 2580 | if (sds->busiest_load_per_task > | ||
| 2581 | sds->this_load_per_task) | ||
| 2582 | imbn = 1; | ||
| 2583 | } else | ||
| 2584 | sds->this_load_per_task = | ||
| 2585 | cpu_avg_load_per_task(this_cpu); | ||
| 2586 | |||
| 2587 | scaled_busy_load_per_task = sds->busiest_load_per_task | ||
| 2588 | * SCHED_LOAD_SCALE; | ||
| 2589 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | ||
| 2590 | |||
| 2591 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | ||
| 2592 | (scaled_busy_load_per_task * imbn)) { | ||
| 2593 | *imbalance = sds->busiest_load_per_task; | ||
| 2594 | return; | ||
| 2595 | } | ||
| 2596 | |||
| 2597 | /* | ||
| 2598 | * OK, we don't have enough imbalance to justify moving tasks, | ||
| 2599 | * however we may be able to increase total CPU power used by | ||
| 2600 | * moving them. | ||
| 2601 | */ | ||
| 2602 | |||
| 2603 | pwr_now += sds->busiest->cpu_power * | ||
| 2604 | min(sds->busiest_load_per_task, sds->max_load); | ||
| 2605 | pwr_now += sds->this->cpu_power * | ||
| 2606 | min(sds->this_load_per_task, sds->this_load); | ||
| 2607 | pwr_now /= SCHED_LOAD_SCALE; | ||
| 2608 | |||
| 2609 | /* Amount of load we'd subtract */ | ||
| 2610 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
| 2611 | sds->busiest->cpu_power; | ||
| 2612 | if (sds->max_load > tmp) | ||
| 2613 | pwr_move += sds->busiest->cpu_power * | ||
| 2614 | min(sds->busiest_load_per_task, sds->max_load - tmp); | ||
| 2615 | |||
| 2616 | /* Amount of load we'd add */ | ||
| 2617 | if (sds->max_load * sds->busiest->cpu_power < | ||
| 2618 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | ||
| 2619 | tmp = (sds->max_load * sds->busiest->cpu_power) / | ||
| 2620 | sds->this->cpu_power; | ||
| 2621 | else | ||
| 2622 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | ||
| 2623 | sds->this->cpu_power; | ||
| 2624 | pwr_move += sds->this->cpu_power * | ||
| 2625 | min(sds->this_load_per_task, sds->this_load + tmp); | ||
| 2626 | pwr_move /= SCHED_LOAD_SCALE; | ||
| 2627 | |||
| 2628 | /* Move if we gain throughput */ | ||
| 2629 | if (pwr_move > pwr_now) | ||
| 2630 | *imbalance = sds->busiest_load_per_task; | ||
| 2631 | } | ||
| 2632 | |||
| 2633 | /** | ||
| 2634 | * calculate_imbalance - Calculate the amount of imbalance present within the | ||
| 2635 | * groups of a given sched_domain during load balance. | ||
| 2636 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | ||
| 2637 | * @this_cpu: Cpu for which load balance is currently being performed. | ||
| 2638 | * @imbalance: The variable to store the imbalance. | ||
| 2639 | */ | ||
| 2640 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | ||
| 2641 | unsigned long *imbalance) | ||
| 2642 | { | ||
| 2643 | unsigned long max_pull, load_above_capacity = ~0UL; | ||
| 2644 | |||
| 2645 | sds->busiest_load_per_task /= sds->busiest_nr_running; | ||
| 2646 | if (sds->group_imb) { | ||
| 2647 | sds->busiest_load_per_task = | ||
| 2648 | min(sds->busiest_load_per_task, sds->avg_load); | ||
| 2649 | } | ||
| 2650 | |||
| 2651 | /* | ||
| 2652 | * In the presence of smp nice balancing, certain scenarios can have | ||
| 2653 | * max load less than avg load(as we skip the groups at or below | ||
| 2654 | * its cpu_power, while calculating max_load..) | ||
| 2655 | */ | ||
| 2656 | if (sds->max_load < sds->avg_load) { | ||
| 2657 | *imbalance = 0; | ||
| 2658 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
| 2659 | } | ||
| 2660 | |||
| 2661 | if (!sds->group_imb) { | ||
| 2662 | /* | ||
| 2663 | * Don't want to pull so many tasks that a group would go idle. | ||
| 2664 | */ | ||
| 2665 | load_above_capacity = (sds->busiest_nr_running - | ||
| 2666 | sds->busiest_group_capacity); | ||
| 2667 | |||
| 2668 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | ||
| 2669 | |||
| 2670 | load_above_capacity /= sds->busiest->cpu_power; | ||
| 2671 | } | ||
| 2672 | |||
| 2673 | /* | ||
| 2674 | * We're trying to get all the cpus to the average_load, so we don't | ||
| 2675 | * want to push ourselves above the average load, nor do we wish to | ||
| 2676 | * reduce the max loaded cpu below the average load. At the same time, | ||
| 2677 | * we also don't want to reduce the group load below the group capacity | ||
| 2678 | * (so that we can implement power-savings policies etc). Thus we look | ||
| 2679 | * for the minimum possible imbalance. | ||
| 2680 | * Be careful of negative numbers as they'll appear as very large values | ||
| 2681 | * with unsigned longs. | ||
| 2682 | */ | ||
| 2683 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | ||
| 2684 | |||
| 2685 | /* How much load to actually move to equalise the imbalance */ | ||
| 2686 | *imbalance = min(max_pull * sds->busiest->cpu_power, | ||
| 2687 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | ||
| 2688 | / SCHED_LOAD_SCALE; | ||
| 2689 | |||
| 2690 | /* | ||
| 2691 | * if *imbalance is less than the average load per runnable task | ||
| 2692 | * there is no guarantee that any tasks will be moved so we'll have | ||
| 2693 | * a think about bumping its value to force at least one task to be | ||
| 2694 | * moved | ||
| 2695 | */ | ||
| 2696 | if (*imbalance < sds->busiest_load_per_task) | ||
| 2697 | return fix_small_imbalance(sds, this_cpu, imbalance); | ||
| 2698 | |||
| 2699 | } | ||
| 2700 | /******* find_busiest_group() helpers end here *********************/ | ||
| 2701 | |||
| 2702 | /** | ||
| 2703 | * find_busiest_group - Returns the busiest group within the sched_domain | ||
| 2704 | * if there is an imbalance. If there isn't an imbalance, and | ||
| 2705 | * the user has opted for power-savings, it returns a group whose | ||
| 2706 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | ||
| 2707 | * such a group exists. | ||
| 2708 | * | ||
| 2709 | * Also calculates the amount of weighted load which should be moved | ||
| 2710 | * to restore balance. | ||
| 2711 | * | ||
| 2712 | * @sd: The sched_domain whose busiest group is to be returned. | ||
| 2713 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
| 2714 | * @imbalance: Variable which stores amount of weighted load which should | ||
| 2715 | * be moved to restore balance/put a group to idle. | ||
| 2716 | * @idle: The idle status of this_cpu. | ||
| 2717 | * @sd_idle: The idleness of sd | ||
| 2718 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
| 2719 | * @balance: Pointer to a variable indicating if this_cpu | ||
| 2720 | * is the appropriate cpu to perform load balancing at this_level. | ||
| 2721 | * | ||
| 2722 | * Returns: - the busiest group if imbalance exists. | ||
| 2723 | * - If no imbalance and user has opted for power-savings balance, | ||
| 2724 | * return the least loaded group whose CPUs can be | ||
| 2725 | * put to idle by rebalancing its tasks onto our group. | ||
| 2726 | */ | ||
| 2727 | static struct sched_group * | ||
| 2728 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
| 2729 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
| 2730 | int *sd_idle, const struct cpumask *cpus, int *balance) | ||
| 2731 | { | ||
| 2732 | struct sd_lb_stats sds; | ||
| 2733 | |||
| 2734 | memset(&sds, 0, sizeof(sds)); | ||
| 2735 | |||
| 2736 | /* | ||
| 2737 | * Compute the various statistics relevant for load balancing at | ||
| 2738 | * this level. | ||
| 2739 | */ | ||
| 2740 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | ||
| 2741 | balance, &sds); | ||
| 2742 | |||
| 2743 | /* Cases where imbalance does not exist from POV of this_cpu */ | ||
| 2744 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | ||
| 2745 | * at this level. | ||
| 2746 | * 2) There is no busy sibling group to pull from. | ||
| 2747 | * 3) This group is the busiest group. | ||
| 2748 | * 4) This group is more busy than the avg busyness at this | ||
| 2749 | * sched_domain. | ||
| 2750 | * 5) The imbalance is within the specified limit. | ||
| 2751 | */ | ||
| 2752 | if (!(*balance)) | ||
| 2753 | goto ret; | ||
| 2754 | |||
| 2755 | if (!sds.busiest || sds.busiest_nr_running == 0) | ||
| 2756 | goto out_balanced; | ||
| 2757 | |||
| 2758 | if (sds.this_load >= sds.max_load) | ||
| 2759 | goto out_balanced; | ||
| 2760 | |||
| 2761 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
| 2762 | |||
| 2763 | if (sds.this_load >= sds.avg_load) | ||
| 2764 | goto out_balanced; | ||
| 2765 | |||
| 2766 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
| 2767 | goto out_balanced; | ||
| 2768 | |||
| 2769 | /* Looks like there is an imbalance. Compute it */ | ||
| 2770 | calculate_imbalance(&sds, this_cpu, imbalance); | ||
| 2771 | return sds.busiest; | ||
| 2772 | |||
| 2773 | out_balanced: | ||
| 2774 | /* | ||
| 2775 | * There is no obvious imbalance. But check if we can do some balancing | ||
| 2776 | * to save power. | ||
| 2777 | */ | ||
| 2778 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
| 2779 | return sds.busiest; | ||
| 2780 | ret: | ||
| 2781 | *imbalance = 0; | ||
| 2782 | return NULL; | ||
| 2783 | } | ||
| 2784 | |||
| 2785 | /* | ||
| 2786 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
| 2787 | */ | ||
| 2788 | static struct rq * | ||
| 2789 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | ||
| 2790 | unsigned long imbalance, const struct cpumask *cpus) | ||
| 2791 | { | ||
| 2792 | struct rq *busiest = NULL, *rq; | ||
| 2793 | unsigned long max_load = 0; | ||
| 2794 | int i; | ||
| 2795 | |||
| 2796 | for_each_cpu(i, sched_group_cpus(group)) { | ||
| 2797 | unsigned long power = power_of(i); | ||
| 2798 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
| 2799 | unsigned long wl; | ||
| 2800 | |||
| 2801 | if (!cpumask_test_cpu(i, cpus)) | ||
| 2802 | continue; | ||
| 2803 | |||
| 2804 | rq = cpu_rq(i); | ||
| 2805 | wl = weighted_cpuload(i); | ||
| 2806 | |||
| 2807 | /* | ||
| 2808 | * When comparing with imbalance, use weighted_cpuload() | ||
| 2809 | * which is not scaled with the cpu power. | ||
| 2810 | */ | ||
| 2811 | if (capacity && rq->nr_running == 1 && wl > imbalance) | ||
| 2812 | continue; | ||
| 2813 | |||
| 2814 | /* | ||
| 2815 | * For the load comparisons with the other cpu's, consider | ||
| 2816 | * the weighted_cpuload() scaled with the cpu power, so that | ||
| 2817 | * the load can be moved away from the cpu that is potentially | ||
| 2818 | * running at a lower capacity. | ||
| 2819 | */ | ||
| 2820 | wl = (wl * SCHED_LOAD_SCALE) / power; | ||
| 2821 | |||
| 2822 | if (wl > max_load) { | ||
| 2823 | max_load = wl; | ||
| 2824 | busiest = rq; | ||
| 2825 | } | ||
| 2826 | } | ||
| 2827 | |||
| 2828 | return busiest; | ||
| 2829 | } | ||
| 2830 | |||
| 2831 | /* | ||
| 2832 | * Max backoff if we encounter pinned tasks. The value is fairly arbitrary; | ||
| 2833 | * it only needs to be large enough. | ||
| 2834 | */ | ||
| 2835 | #define MAX_PINNED_INTERVAL 512 | ||
| 2836 | |||
| 2837 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
| 2838 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | ||
| 2839 | |||
| 2840 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | ||
| 2841 | { | ||
| 2842 | if (idle == CPU_NEWLY_IDLE) { | ||
| 2843 | /* | ||
| 2844 | * The only task running in a non-idle cpu can be moved to this | ||
| 2845 | * cpu in an attempt to completely free up the other CPU | ||
| 2846 | * package. | ||
| 2847 | * | ||
| 2848 | * The package power saving logic comes from | ||
| 2849 | * find_busiest_group(). If there is no imbalance, then | ||
| 2850 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
| 2851 | * f_b_g() will select a group from which a running task may be | ||
| 2852 | * pulled to this cpu in order to make the other package idle. | ||
| 2853 | * If there is no opportunity to make a package idle and if | ||
| 2854 | * there is no imbalance, then f_b_g() will return NULL and no | ||
| 2855 | * action will be taken in load_balance_newidle(). | ||
| 2856 | * | ||
| 2857 | * Under normal task pull operation due to imbalance, there | ||
| 2858 | * will be more than one task in the source run queue and | ||
| 2859 | * move_tasks() will succeed. ld_moved will be true and this | ||
| 2860 | * active balance code will not be triggered. | ||
| 2861 | */ | ||
| 2862 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 2863 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 2864 | return 0; | ||
| 2865 | |||
| 2866 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
| 2867 | return 0; | ||
| 2868 | } | ||
| 2869 | |||
| 2870 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | ||
| 2871 | } | ||
| 2872 | |||
| 2873 | /* | ||
| 2874 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
| 2875 | * tasks if there is an imbalance. | ||
| 2876 | */ | ||
| 2877 | static int load_balance(int this_cpu, struct rq *this_rq, | ||
| 2878 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 2879 | int *balance) | ||
| 2880 | { | ||
| 2881 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
| 2882 | struct sched_group *group; | ||
| 2883 | unsigned long imbalance; | ||
| 2884 | struct rq *busiest; | ||
| 2885 | unsigned long flags; | ||
| 2886 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | ||
| 2887 | |||
| 2888 | cpumask_copy(cpus, cpu_active_mask); | ||
| 2889 | |||
| 2890 | /* | ||
| 2891 | * When power savings policy is enabled for the parent domain, idle | ||
| 2892 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
| 2893 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
| 2894 | * portraying it as CPU_NOT_IDLE. | ||
| 2895 | */ | ||
| 2896 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
| 2897 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 2898 | sd_idle = 1; | ||
| 2899 | |||
| 2900 | schedstat_inc(sd, lb_count[idle]); | ||
| 2901 | |||
| 2902 | redo: | ||
| 2903 | update_shares(sd); | ||
| 2904 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
| 2905 | cpus, balance); | ||
| 2906 | |||
| 2907 | if (*balance == 0) | ||
| 2908 | goto out_balanced; | ||
| 2909 | |||
| 2910 | if (!group) { | ||
| 2911 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
| 2912 | goto out_balanced; | ||
| 2913 | } | ||
| 2914 | |||
| 2915 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | ||
| 2916 | if (!busiest) { | ||
| 2917 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
| 2918 | goto out_balanced; | ||
| 2919 | } | ||
| 2920 | |||
| 2921 | BUG_ON(busiest == this_rq); | ||
| 2922 | |||
| 2923 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
| 2924 | |||
| 2925 | ld_moved = 0; | ||
| 2926 | if (busiest->nr_running > 1) { | ||
| 2927 | /* | ||
| 2928 | * Attempt to move tasks. If find_busiest_group has found | ||
| 2929 | * an imbalance but busiest->nr_running <= 1, the group is | ||
| 2930 | * still unbalanced. ld_moved simply stays zero, so it is | ||
| 2931 | * correctly treated as an imbalance. | ||
| 2932 | */ | ||
| 2933 | local_irq_save(flags); | ||
| 2934 | double_rq_lock(this_rq, busiest); | ||
| 2935 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | ||
| 2936 | imbalance, sd, idle, &all_pinned); | ||
| 2937 | double_rq_unlock(this_rq, busiest); | ||
| 2938 | local_irq_restore(flags); | ||
| 2939 | |||
| 2940 | /* | ||
| 2941 | * some other cpu did the load balance for us. | ||
| 2942 | */ | ||
| 2943 | if (ld_moved && this_cpu != smp_processor_id()) | ||
| 2944 | resched_cpu(this_cpu); | ||
| 2945 | |||
| 2946 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
| 2947 | if (unlikely(all_pinned)) { | ||
| 2948 | cpumask_clear_cpu(cpu_of(busiest), cpus); | ||
| 2949 | if (!cpumask_empty(cpus)) | ||
| 2950 | goto redo; | ||
| 2951 | goto out_balanced; | ||
| 2952 | } | ||
| 2953 | } | ||
| 2954 | |||
| 2955 | if (!ld_moved) { | ||
| 2956 | schedstat_inc(sd, lb_failed[idle]); | ||
| 2957 | sd->nr_balance_failed++; | ||
| 2958 | |||
| 2959 | if (need_active_balance(sd, sd_idle, idle)) { | ||
| 2960 | raw_spin_lock_irqsave(&busiest->lock, flags); | ||
| 2961 | |||
| 2962 | /* don't kick the migration_thread, if the curr | ||
| 2963 | * task on busiest cpu can't be moved to this_cpu | ||
| 2964 | */ | ||
| 2965 | if (!cpumask_test_cpu(this_cpu, | ||
| 2966 | &busiest->curr->cpus_allowed)) { | ||
| 2967 | raw_spin_unlock_irqrestore(&busiest->lock, | ||
| 2968 | flags); | ||
| 2969 | all_pinned = 1; | ||
| 2970 | goto out_one_pinned; | ||
| 2971 | } | ||
| 2972 | |||
| 2973 | if (!busiest->active_balance) { | ||
| 2974 | busiest->active_balance = 1; | ||
| 2975 | busiest->push_cpu = this_cpu; | ||
| 2976 | active_balance = 1; | ||
| 2977 | } | ||
| 2978 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | ||
| 2979 | if (active_balance) | ||
| 2980 | wake_up_process(busiest->migration_thread); | ||
| 2981 | |||
| 2982 | /* | ||
| 2983 | * We've kicked active balancing, reset the failure | ||
| 2984 | * counter. | ||
| 2985 | */ | ||
| 2986 | sd->nr_balance_failed = sd->cache_nice_tries+1; | ||
| 2987 | } | ||
| 2988 | } else | ||
| 2989 | sd->nr_balance_failed = 0; | ||
| 2990 | |||
| 2991 | if (likely(!active_balance)) { | ||
| 2992 | /* We were unbalanced, so reset the balancing interval */ | ||
| 2993 | sd->balance_interval = sd->min_interval; | ||
| 2994 | } else { | ||
| 2995 | /* | ||
| 2996 | * If we've begun active balancing, start to back off. This | ||
| 2997 | * case may not be covered by the all_pinned logic if there | ||
| 2998 | * is only 1 task on the busy runqueue (because we don't call | ||
| 2999 | * move_tasks). | ||
| 3000 | */ | ||
| 3001 | if (sd->balance_interval < sd->max_interval) | ||
| 3002 | sd->balance_interval *= 2; | ||
| 3003 | } | ||
| 3004 | |||
| 3005 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 3006 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3007 | ld_moved = -1; | ||
| 3008 | |||
| 3009 | goto out; | ||
| 3010 | |||
| 3011 | out_balanced: | ||
| 3012 | schedstat_inc(sd, lb_balanced[idle]); | ||
| 3013 | |||
| 3014 | sd->nr_balance_failed = 0; | ||
| 3015 | |||
| 3016 | out_one_pinned: | ||
| 3017 | /* tune up the balancing interval */ | ||
| 3018 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
| 3019 | (sd->balance_interval < sd->max_interval)) | ||
| 3020 | sd->balance_interval *= 2; | ||
| 3021 | |||
| 3022 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
| 3023 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
| 3024 | ld_moved = -1; | ||
| 3025 | else | ||
| 3026 | ld_moved = 0; | ||
| 3027 | out: | ||
| 3028 | if (ld_moved) | ||
| 3029 | update_shares(sd); | ||
| 3030 | return ld_moved; | ||
| 3031 | } | ||
| 3032 | |||
| 3033 | /* | ||
| 3034 | * idle_balance is called by schedule() if this_cpu is about to become | ||
| 3035 | * idle. Attempts to pull tasks from other CPUs. | ||
| 3036 | */ | ||
| 3037 | static void idle_balance(int this_cpu, struct rq *this_rq) | ||
| 3038 | { | ||
| 3039 | struct sched_domain *sd; | ||
| 3040 | int pulled_task = 0; | ||
| 3041 | unsigned long next_balance = jiffies + HZ; | ||
| 3042 | |||
| 3043 | this_rq->idle_stamp = this_rq->clock; | ||
| 3044 | |||
| 3045 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
| 3046 | return; | ||
| 3047 | |||
| 3048 | /* | ||
| 3049 | * Drop the rq->lock, but keep IRQ/preempt disabled. | ||
| 3050 | */ | ||
| 3051 | raw_spin_unlock(&this_rq->lock); | ||
| 3052 | |||
| 3053 | for_each_domain(this_cpu, sd) { | ||
| 3054 | unsigned long interval; | ||
| 3055 | int balance = 1; | ||
| 3056 | |||
| 3057 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 3058 | continue; | ||
| 3059 | |||
| 3060 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
| 3061 | /* If we've pulled tasks over stop searching: */ | ||
| 3062 | pulled_task = load_balance(this_cpu, this_rq, | ||
| 3063 | sd, CPU_NEWLY_IDLE, &balance); | ||
| 3064 | } | ||
| 3065 | |||
| 3066 | interval = msecs_to_jiffies(sd->balance_interval); | ||
| 3067 | if (time_after(next_balance, sd->last_balance + interval)) | ||
| 3068 | next_balance = sd->last_balance + interval; | ||
| 3069 | if (pulled_task) { | ||
| 3070 | this_rq->idle_stamp = 0; | ||
| 3071 | break; | ||
| 3072 | } | ||
| 3073 | } | ||
| 3074 | |||
| 3075 | raw_spin_lock(&this_rq->lock); | ||
| 3076 | |||
| 3077 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | ||
| 3078 | /* | ||
| 3079 | * We are going idle. next_balance may be set based on | ||
| 3080 | * a busy processor. So reset next_balance. | ||
| 3081 | */ | ||
| 3082 | this_rq->next_balance = next_balance; | ||
| 3083 | } | ||
| 3084 | } | ||
| 3085 | |||
| 3086 | /* | ||
| 3087 | * active_load_balance is run by migration threads. It pushes running tasks | ||
| 3088 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
| 3089 | * running on each physical CPU where possible, and avoids physical / | ||
| 3090 | * logical imbalances. | ||
| 3091 | * | ||
| 3092 | * Called with busiest_rq locked. | ||
| 3093 | */ | ||
| 3094 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | ||
| 3095 | { | ||
| 3096 | int target_cpu = busiest_rq->push_cpu; | ||
| 3097 | struct sched_domain *sd; | ||
| 3098 | struct rq *target_rq; | ||
| 3099 | |||
| 3100 | /* Is there any task to move? */ | ||
| 3101 | if (busiest_rq->nr_running <= 1) | ||
| 3102 | return; | ||
| 3103 | |||
| 3104 | target_rq = cpu_rq(target_cpu); | ||
| 3105 | |||
| 3106 | /* | ||
| 3107 | * This condition is "impossible", if it occurs | ||
| 3108 | * we need to fix it. Originally reported by | ||
| 3109 | * Bjorn Helgaas on a 128-cpu setup. | ||
| 3110 | */ | ||
| 3111 | BUG_ON(busiest_rq == target_rq); | ||
| 3112 | |||
| 3113 | /* move a task from busiest_rq to target_rq */ | ||
| 3114 | double_lock_balance(busiest_rq, target_rq); | ||
| 3115 | update_rq_clock(busiest_rq); | ||
| 3116 | update_rq_clock(target_rq); | ||
| 3117 | |||
| 3118 | /* Search for an sd spanning us and the target CPU. */ | ||
| 3119 | for_each_domain(target_cpu, sd) { | ||
| 3120 | if ((sd->flags & SD_LOAD_BALANCE) && | ||
| 3121 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | ||
| 3122 | break; | ||
| 3123 | } | ||
| 3124 | |||
| 3125 | if (likely(sd)) { | ||
| 3126 | schedstat_inc(sd, alb_count); | ||
| 3127 | |||
| 3128 | if (move_one_task(target_rq, target_cpu, busiest_rq, | ||
| 3129 | sd, CPU_IDLE)) | ||
| 3130 | schedstat_inc(sd, alb_pushed); | ||
| 3131 | else | ||
| 3132 | schedstat_inc(sd, alb_failed); | ||
| 3133 | } | ||
| 3134 | double_unlock_balance(busiest_rq, target_rq); | ||
| 3135 | } | ||
| 3136 | |||
| 3137 | #ifdef CONFIG_NO_HZ | ||
| 3138 | static struct { | ||
| 3139 | atomic_t load_balancer; | ||
| 3140 | cpumask_var_t cpu_mask; | ||
| 3141 | cpumask_var_t ilb_grp_nohz_mask; | ||
| 3142 | } nohz ____cacheline_aligned = { | ||
| 3143 | .load_balancer = ATOMIC_INIT(-1), | ||
| 3144 | }; | ||
| 3145 | |||
| 3146 | int get_nohz_load_balancer(void) | ||
| 3147 | { | ||
| 3148 | return atomic_read(&nohz.load_balancer); | ||
| 3149 | } | ||
| 3150 | |||
| 3151 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 3152 | /** | ||
| 3153 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
| 3154 | * @cpu: The cpu whose lowest level of sched domain is to | ||
| 3155 | * be returned. | ||
| 3156 | * @flag: The flag to check for the lowest sched_domain | ||
| 3157 | * for the given cpu. | ||
| 3158 | * | ||
| 3159 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
| 3160 | */ | ||
| 3161 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
| 3162 | { | ||
| 3163 | struct sched_domain *sd; | ||
| 3164 | |||
| 3165 | for_each_domain(cpu, sd) | ||
| 3166 | if (sd && (sd->flags & flag)) | ||
| 3167 | break; | ||
| 3168 | |||
| 3169 | return sd; | ||
| 3170 | } | ||
| 3171 | |||
| 3172 | /** | ||
| 3173 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
| 3174 | * @cpu: The cpu whose domains we're iterating over. | ||
| 3175 | * @sd: variable holding the value of the power_savings_sd | ||
| 3176 | * for cpu. | ||
| 3177 | * @flag: The flag to filter the sched_domains to be iterated. | ||
| 3178 | * | ||
| 3179 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
| 3180 | * set, starting from the lowest sched_domain to the highest. | ||
| 3181 | */ | ||
| 3182 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
| 3183 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
| 3184 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
| 3185 | |||
| 3186 | /** | ||
| 3187 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
| 3188 | * @ilb_group: group to be checked for semi-idleness | ||
| 3189 | * | ||
| 3190 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
| 3191 | * | ||
| 3192 | * We define a sched_group to be semi idle if it has at least one idle-CPU | ||
| 3193 | * and at least one non-idle CPU. This helper function checks if the given | ||
| 3194 | * sched_group is semi-idle or not. | ||
| 3195 | */ | ||
| 3196 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
| 3197 | { | ||
| 3198 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
| 3199 | sched_group_cpus(ilb_group)); | ||
| 3200 | |||
| 3201 | /* | ||
| 3202 | * A sched_group is semi-idle when it has at least one busy cpu | ||
| 3203 | * and at least one idle cpu. | ||
| 3204 | */ | ||
| 3205 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
| 3206 | return 0; | ||
| 3207 | |||
| 3208 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
| 3209 | return 0; | ||
| 3210 | |||
| 3211 | return 1; | ||
| 3212 | } | ||
| 3213 | /** | ||
| 3214 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
| 3215 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
| 3216 | * | ||
| 3217 | * Returns: The id of the idle load balancer if it exists, | ||
| 3218 | * Else, returns >= nr_cpu_ids. | ||
| 3219 | * | ||
| 3220 | * This algorithm picks the idle load balancer such that it belongs to a | ||
| 3221 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
| 3222 | * completely idle packages/cores just for the purpose of idle load balancing | ||
| 3223 | * when there are other idle cpu's which are better suited for that job. | ||
| 3224 | */ | ||
| 3225 | static int find_new_ilb(int cpu) | ||
| 3226 | { | ||
| 3227 | struct sched_domain *sd; | ||
| 3228 | struct sched_group *ilb_group; | ||
| 3229 | |||
| 3230 | /* | ||
| 3231 | * Have idle load balancer selection from semi-idle packages only | ||
| 3232 | * when power-aware load balancing is enabled | ||
| 3233 | */ | ||
| 3234 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
| 3235 | goto out_done; | ||
| 3236 | |||
| 3237 | /* | ||
| 3238 | * Optimize for the case when we have no idle CPUs or only one | ||
| 3239 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
| 3240 | */ | ||
| 3241 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
| 3242 | goto out_done; | ||
| 3243 | |||
| 3244 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
| 3245 | ilb_group = sd->groups; | ||
| 3246 | |||
| 3247 | do { | ||
| 3248 | if (is_semi_idle_group(ilb_group)) | ||
| 3249 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
| 3250 | |||
| 3251 | ilb_group = ilb_group->next; | ||
| 3252 | |||
| 3253 | } while (ilb_group != sd->groups); | ||
| 3254 | } | ||
| 3255 | |||
| 3256 | out_done: | ||
| 3257 | return cpumask_first(nohz.cpu_mask); | ||
| 3258 | } | ||
| 3259 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
| 3260 | static inline int find_new_ilb(int call_cpu) | ||
| 3261 | { | ||
| 3262 | return cpumask_first(nohz.cpu_mask); | ||
| 3263 | } | ||
| 3264 | #endif | ||
| 3265 | |||
| 3266 | /* | ||
| 3267 | * This routine will try to nominate the ilb (idle load balancing) | ||
| 3268 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
| 3269 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
| 3270 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
| 3271 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
| 3272 | * arrives... | ||
| 3273 | * | ||
| 3274 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
| 3275 | * for idle load balancing. ilb owner will still be part of | ||
| 3276 | * nohz.cpu_mask.. | ||
| 3277 | * | ||
| 3278 | * While stopping the tick, this cpu will become the ilb owner if there | ||
| 3279 | * is no other owner. And will be the owner till that cpu becomes busy | ||
| 3280 | * or if all cpus in the system stop their ticks at which point | ||
| 3281 | * there is no need for ilb owner. | ||
| 3282 | * | ||
| 3283 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
| 3284 | * next busy scheduler_tick() | ||
| 3285 | */ | ||
| 3286 | int select_nohz_load_balancer(int stop_tick) | ||
| 3287 | { | ||
| 3288 | int cpu = smp_processor_id(); | ||
| 3289 | |||
| 3290 | if (stop_tick) { | ||
| 3291 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
| 3292 | |||
| 3293 | if (!cpu_active(cpu)) { | ||
| 3294 | if (atomic_read(&nohz.load_balancer) != cpu) | ||
| 3295 | return 0; | ||
| 3296 | |||
| 3297 | /* | ||
| 3298 | * If we are going offline and still the leader, | ||
| 3299 | * give up! | ||
| 3300 | */ | ||
| 3301 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
| 3302 | BUG(); | ||
| 3303 | |||
| 3304 | return 0; | ||
| 3305 | } | ||
| 3306 | |||
| 3307 | cpumask_set_cpu(cpu, nohz.cpu_mask); | ||
| 3308 | |||
| 3309 | /* time for ilb owner also to sleep */ | ||
| 3310 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | ||
| 3311 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
| 3312 | atomic_set(&nohz.load_balancer, -1); | ||
| 3313 | return 0; | ||
| 3314 | } | ||
| 3315 | |||
| 3316 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
| 3317 | /* make me the ilb owner */ | ||
| 3318 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
| 3319 | return 1; | ||
| 3320 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 3321 | int new_ilb; | ||
| 3322 | |||
| 3323 | if (!(sched_smt_power_savings || | ||
| 3324 | sched_mc_power_savings)) | ||
| 3325 | return 1; | ||
| 3326 | /* | ||
| 3327 | * Check to see if there is a more power-efficient | ||
| 3328 | * ilb. | ||
| 3329 | */ | ||
| 3330 | new_ilb = find_new_ilb(cpu); | ||
| 3331 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
| 3332 | atomic_set(&nohz.load_balancer, -1); | ||
| 3333 | resched_cpu(new_ilb); | ||
| 3334 | return 0; | ||
| 3335 | } | ||
| 3336 | return 1; | ||
| 3337 | } | ||
| 3338 | } else { | ||
| 3339 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
| 3340 | return 0; | ||
| 3341 | |||
| 3342 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
| 3343 | |||
| 3344 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
| 3345 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
| 3346 | BUG(); | ||
| 3347 | } | ||
| 3348 | return 0; | ||
| 3349 | } | ||
| 3350 | #endif | ||
| 3351 | |||
| 3352 | static DEFINE_SPINLOCK(balancing); | ||
| 3353 | |||
| 3354 | /* | ||
| 3355 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 3356 | * and initiates a balancing operation if so. | ||
| 3357 | * | ||
| 3358 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 3359 | */ | ||
| 3360 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | ||
| 3361 | { | ||
| 3362 | int balance = 1; | ||
| 3363 | struct rq *rq = cpu_rq(cpu); | ||
| 3364 | unsigned long interval; | ||
| 3365 | struct sched_domain *sd; | ||
| 3366 | /* Earliest time when we have to do rebalance again */ | ||
| 3367 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 3368 | int update_next_balance = 0; | ||
| 3369 | int need_serialize; | ||
| 3370 | |||
| 3371 | for_each_domain(cpu, sd) { | ||
| 3372 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 3373 | continue; | ||
| 3374 | |||
| 3375 | interval = sd->balance_interval; | ||
| 3376 | if (idle != CPU_IDLE) | ||
| 3377 | interval *= sd->busy_factor; | ||
| 3378 | |||
| 3379 | /* scale ms to jiffies */ | ||
| 3380 | interval = msecs_to_jiffies(interval); | ||
| 3381 | if (unlikely(!interval)) | ||
| 3382 | interval = 1; | ||
| 3383 | if (interval > HZ*NR_CPUS/10) | ||
| 3384 | interval = HZ*NR_CPUS/10; | ||
| 3385 | |||
| 3386 | need_serialize = sd->flags & SD_SERIALIZE; | ||
| 3387 | |||
| 3388 | if (need_serialize) { | ||
| 3389 | if (!spin_trylock(&balancing)) | ||
| 3390 | goto out; | ||
| 3391 | } | ||
| 3392 | |||
| 3393 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
| 3394 | if (load_balance(cpu, rq, sd, idle, &balance)) { | ||
| 3395 | /* | ||
| 3396 | * We've pulled tasks over so either we're no | ||
| 3397 | * longer idle, or one of our SMT siblings is | ||
| 3398 | * not idle. | ||
| 3399 | */ | ||
| 3400 | idle = CPU_NOT_IDLE; | ||
| 3401 | } | ||
| 3402 | sd->last_balance = jiffies; | ||
| 3403 | } | ||
| 3404 | if (need_serialize) | ||
| 3405 | spin_unlock(&balancing); | ||
| 3406 | out: | ||
| 3407 | if (time_after(next_balance, sd->last_balance + interval)) { | ||
| 3408 | next_balance = sd->last_balance + interval; | ||
| 3409 | update_next_balance = 1; | ||
| 3410 | } | ||
| 3411 | |||
| 3412 | /* | ||
| 3413 | * Stop the load balance at this level. There is another | ||
| 3414 | * CPU in our sched group which is doing load balancing more | ||
| 3415 | * actively. | ||
| 3416 | */ | ||
| 3417 | if (!balance) | ||
| 3418 | break; | ||
| 3419 | } | ||
| 3420 | |||
| 3421 | /* | ||
| 3422 | * next_balance will be updated only when there is a need. | ||
| 3423 | * When the cpu is attached to the null domain, for example, it will not be | ||
| 3424 | * updated. | ||
| 3425 | */ | ||
| 3426 | if (likely(update_next_balance)) | ||
| 3427 | rq->next_balance = next_balance; | ||
| 3428 | } | ||
| 3429 | |||
| 3430 | /* | ||
| 3431 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
| 3432 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
| 3433 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
| 3434 | */ | ||
| 3435 | static void run_rebalance_domains(struct softirq_action *h) | ||
| 3436 | { | ||
| 3437 | int this_cpu = smp_processor_id(); | ||
| 3438 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 3439 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | ||
| 3440 | CPU_IDLE : CPU_NOT_IDLE; | ||
| 3441 | |||
| 3442 | rebalance_domains(this_cpu, idle); | ||
| 3443 | |||
| 3444 | #ifdef CONFIG_NO_HZ | ||
| 3445 | /* | ||
| 3446 | * If this cpu is the owner for idle load balancing, then do the | ||
| 3447 | * balancing on behalf of the other idle cpus whose ticks are | ||
| 3448 | * stopped. | ||
| 3449 | */ | ||
| 3450 | if (this_rq->idle_at_tick && | ||
| 3451 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
| 3452 | struct rq *rq; | ||
| 3453 | int balance_cpu; | ||
| 3454 | |||
| 3455 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
| 3456 | if (balance_cpu == this_cpu) | ||
| 3457 | continue; | ||
| 3458 | |||
| 3459 | /* | ||
| 3460 | * If this cpu gets work to do, stop the load balancing | ||
| 3461 | * work being done for other cpus. Next load | ||
| 3462 | * balancing owner will pick it up. | ||
| 3463 | */ | ||
| 3464 | if (need_resched()) | ||
| 3465 | break; | ||
| 3466 | |||
| 3467 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
| 3468 | |||
| 3469 | rq = cpu_rq(balance_cpu); | ||
| 3470 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
| 3471 | this_rq->next_balance = rq->next_balance; | ||
| 3472 | } | ||
| 3473 | } | ||
| 3474 | #endif | ||
| 3475 | } | ||
| 3476 | |||
| 3477 | static inline int on_null_domain(int cpu) | ||
| 3478 | { | ||
| 3479 | return !rcu_dereference(cpu_rq(cpu)->sd); | ||
| 3480 | } | ||
| 3481 | |||
| 3482 | /* | ||
| 3483 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
| 3484 | * | ||
| 3485 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
| 3486 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
| 3487 | * if the whole system is idle. | ||
| 3488 | */ | ||
| 3489 | static inline void trigger_load_balance(struct rq *rq, int cpu) | ||
| 3490 | { | ||
| 3491 | #ifdef CONFIG_NO_HZ | ||
| 3492 | /* | ||
| 3493 | * If we were in the nohz mode recently and busy at the current | ||
| 3494 | * scheduler tick, then check if we need to nominate new idle | ||
| 3495 | * load balancer. | ||
| 3496 | */ | ||
| 3497 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
| 3498 | rq->in_nohz_recently = 0; | ||
| 3499 | |||
| 3500 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 3501 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
| 3502 | atomic_set(&nohz.load_balancer, -1); | ||
| 3503 | } | ||
| 3504 | |||
| 3505 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
| 3506 | int ilb = find_new_ilb(cpu); | ||
| 3507 | |||
| 3508 | if (ilb < nr_cpu_ids) | ||
| 3509 | resched_cpu(ilb); | ||
| 3510 | } | ||
| 3511 | } | ||
| 3512 | |||
| 3513 | /* | ||
| 3514 | * If this cpu is idle and doing idle load balancing for all the | ||
| 3515 | * cpus with ticks stopped, is it time for that to stop? | ||
| 3516 | */ | ||
| 3517 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
| 3518 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
| 3519 | resched_cpu(cpu); | ||
| 3520 | return; | ||
| 3521 | } | ||
| 3522 | |||
| 3523 | /* | ||
| 3524 | * If this cpu is idle and the idle load balancing is done by | ||
| 3525 | * someone else, then there is no need to raise the SCHED_SOFTIRQ | ||
| 3526 | */ | ||
| 3527 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
| 3528 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
| 3529 | return; | ||
| 3530 | #endif | ||
| 3531 | /* Don't need to rebalance while attached to NULL domain */ | ||
| 3532 | if (time_after_eq(jiffies, rq->next_balance) && | ||
| 3533 | likely(!on_null_domain(cpu))) | ||
| 3534 | raise_softirq(SCHED_SOFTIRQ); | ||
| 3535 | } | ||
| 1954 | 3536 | ||
| 1955 | static void rq_online_fair(struct rq *rq) | 3537 | static void rq_online_fair(struct rq *rq) |
| 1956 | { | 3538 | { |
| @@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq) | |||
| 1962 | update_sysctl(); | 3544 | update_sysctl(); |
| 1963 | } | 3545 | } |
| 1964 | 3546 | ||
| 3547 | #else /* CONFIG_SMP */ | ||
| 3548 | |||
| 3549 | /* | ||
| 3550 | * on UP we do not need to balance between CPUs: | ||
| 3551 | */ | ||
| 3552 | static inline void idle_balance(int cpu, struct rq *rq) | ||
| 3553 | { | ||
| 3554 | } | ||
| 3555 | |||
| 1965 | #endif /* CONFIG_SMP */ | 3556 | #endif /* CONFIG_SMP */ |
| 1966 | 3557 | ||
| 1967 | /* | 3558 | /* |
| @@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq) | |||
| 2076 | } | 3667 | } |
| 2077 | #endif | 3668 | #endif |
| 2078 | 3669 | ||
| 2079 | unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 3670 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
| 2080 | { | 3671 | { |
| 2081 | struct sched_entity *se = &task->se; | 3672 | struct sched_entity *se = &task->se; |
| 2082 | unsigned int rr_interval = 0; | 3673 | unsigned int rr_interval = 0; |
| @@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = { | |||
| 2108 | #ifdef CONFIG_SMP | 3699 | #ifdef CONFIG_SMP |
| 2109 | .select_task_rq = select_task_rq_fair, | 3700 | .select_task_rq = select_task_rq_fair, |
| 2110 | 3701 | ||
| 2111 | .load_balance = load_balance_fair, | ||
| 2112 | .move_one_task = move_one_task_fair, | ||
| 2113 | .rq_online = rq_online_fair, | 3702 | .rq_online = rq_online_fair, |
| 2114 | .rq_offline = rq_offline_fair, | 3703 | .rq_offline = rq_offline_fair, |
| 2115 | 3704 | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5f93b570d383..a8a6d8a50947 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | |||
| 44 | { | 44 | { |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | #ifdef CONFIG_SMP | ||
| 48 | static unsigned long | ||
| 49 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 50 | unsigned long max_load_move, | ||
| 51 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 52 | int *all_pinned, int *this_best_prio) | ||
| 53 | { | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static int | ||
| 58 | move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 59 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
| 60 | { | ||
| 61 | return 0; | ||
| 62 | } | ||
| 63 | #endif | ||
| 64 | |||
| 65 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | 47 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
| 66 | { | 48 | { |
| 67 | } | 49 | } |
| @@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, | |||
| 97 | check_preempt_curr(rq, p, 0); | 79 | check_preempt_curr(rq, p, 0); |
| 98 | } | 80 | } |
| 99 | 81 | ||
| 100 | unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
| 101 | { | 83 | { |
| 102 | return 0; | 84 | return 0; |
| 103 | } | 85 | } |
| @@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { | |||
| 119 | 101 | ||
| 120 | #ifdef CONFIG_SMP | 102 | #ifdef CONFIG_SMP |
| 121 | .select_task_rq = select_task_rq_idle, | 103 | .select_task_rq = select_task_rq_idle, |
| 122 | |||
| 123 | .load_balance = load_balance_idle, | ||
| 124 | .move_one_task = move_one_task_idle, | ||
| 125 | #endif | 104 | #endif |
| 126 | 105 | ||
| 127 | .set_curr_task = set_curr_task_idle, | 106 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f48328ac216f..bf3e38fdbe6d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
| 194 | return rt_se->my_q; | 194 | return rt_se->my_q; |
| 195 | } | 195 | } |
| 196 | 196 | ||
| 197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | 197 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); |
| 198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | 198 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); |
| 199 | 199 | ||
| 200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
| 201 | { | 201 | { |
| 202 | int this_cpu = smp_processor_id(); | ||
| 202 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 203 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
| 203 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 204 | struct sched_rt_entity *rt_se; |
| 205 | |||
| 206 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
| 204 | 207 | ||
| 205 | if (rt_rq->rt_nr_running) { | 208 | if (rt_rq->rt_nr_running) { |
| 206 | if (rt_se && !on_rt_rq(rt_se)) | 209 | if (rt_se && !on_rt_rq(rt_se)) |
| 207 | enqueue_rt_entity(rt_se); | 210 | enqueue_rt_entity(rt_se, false); |
| 208 | if (rt_rq->highest_prio.curr < curr->prio) | 211 | if (rt_rq->highest_prio.curr < curr->prio) |
| 209 | resched_task(curr); | 212 | resched_task(curr); |
| 210 | } | 213 | } |
| @@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
| 212 | 215 | ||
| 213 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 216 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
| 214 | { | 217 | { |
| 215 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | 218 | int this_cpu = smp_processor_id(); |
| 219 | struct sched_rt_entity *rt_se; | ||
| 220 | |||
| 221 | rt_se = rt_rq->tg->rt_se[this_cpu]; | ||
| 216 | 222 | ||
| 217 | if (rt_se && on_rt_rq(rt_se)) | 223 | if (rt_se && on_rt_rq(rt_se)) |
| 218 | dequeue_rt_entity(rt_se); | 224 | dequeue_rt_entity(rt_se); |
| @@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 803 | dec_rt_group(rt_se, rt_rq); | 809 | dec_rt_group(rt_se, rt_rq); |
| 804 | } | 810 | } |
| 805 | 811 | ||
| 806 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | 812 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
| 807 | { | 813 | { |
| 808 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 814 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 809 | struct rt_prio_array *array = &rt_rq->active; | 815 | struct rt_prio_array *array = &rt_rq->active; |
| @@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 819 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
| 820 | return; | 826 | return; |
| 821 | 827 | ||
| 822 | list_add_tail(&rt_se->run_list, queue); | 828 | if (head) |
| 829 | list_add(&rt_se->run_list, queue); | ||
| 830 | else | ||
| 831 | list_add_tail(&rt_se->run_list, queue); | ||
| 823 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 832 | __set_bit(rt_se_prio(rt_se), array->bitmap); |
| 824 | 833 | ||
| 825 | inc_rt_tasks(rt_se, rt_rq); | 834 | inc_rt_tasks(rt_se, rt_rq); |
| @@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
| 856 | } | 865 | } |
| 857 | } | 866 | } |
| 858 | 867 | ||
| 859 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | 868 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
| 860 | { | 869 | { |
| 861 | dequeue_rt_stack(rt_se); | 870 | dequeue_rt_stack(rt_se); |
| 862 | for_each_sched_rt_entity(rt_se) | 871 | for_each_sched_rt_entity(rt_se) |
| 863 | __enqueue_rt_entity(rt_se); | 872 | __enqueue_rt_entity(rt_se, head); |
| 864 | } | 873 | } |
| 865 | 874 | ||
| 866 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 875 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) |
| @@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 871 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 880 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
| 872 | 881 | ||
| 873 | if (rt_rq && rt_rq->rt_nr_running) | 882 | if (rt_rq && rt_rq->rt_nr_running) |
| 874 | __enqueue_rt_entity(rt_se); | 883 | __enqueue_rt_entity(rt_se, false); |
| 875 | } | 884 | } |
| 876 | } | 885 | } |
| 877 | 886 | ||
| 878 | /* | 887 | /* |
| 879 | * Adding/removing a task to/from a priority array: | 888 | * Adding/removing a task to/from a priority array: |
| 880 | */ | 889 | */ |
| 881 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 890 | static void |
| 891 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) | ||
| 882 | { | 892 | { |
| 883 | struct sched_rt_entity *rt_se = &p->rt; | 893 | struct sched_rt_entity *rt_se = &p->rt; |
| 884 | 894 | ||
| 885 | if (wakeup) | 895 | if (wakeup) |
| 886 | rt_se->timeout = 0; | 896 | rt_se->timeout = 0; |
| 887 | 897 | ||
| 888 | enqueue_rt_entity(rt_se); | 898 | enqueue_rt_entity(rt_se, head); |
| 889 | 899 | ||
| 890 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 900 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
| 891 | enqueue_pushable_task(rq, p); | 901 | enqueue_pushable_task(rq, p); |
| @@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
| 1481 | push_rt_tasks(rq); | 1491 | push_rt_tasks(rq); |
| 1482 | } | 1492 | } |
| 1483 | 1493 | ||
| 1484 | static unsigned long | ||
| 1485 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1486 | unsigned long max_load_move, | ||
| 1487 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1488 | int *all_pinned, int *this_best_prio) | ||
| 1489 | { | ||
| 1490 | /* don't touch RT tasks */ | ||
| 1491 | return 0; | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | static int | ||
| 1495 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1496 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
| 1497 | { | ||
| 1498 | /* don't touch RT tasks */ | ||
| 1499 | return 0; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | static void set_cpus_allowed_rt(struct task_struct *p, | 1494 | static void set_cpus_allowed_rt(struct task_struct *p, |
| 1503 | const struct cpumask *new_mask) | 1495 | const struct cpumask *new_mask) |
| 1504 | { | 1496 | { |
| @@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
| 1721 | dequeue_pushable_task(rq, p); | 1713 | dequeue_pushable_task(rq, p); |
| 1722 | } | 1714 | } |
| 1723 | 1715 | ||
| 1724 | unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | 1716 | static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) |
| 1725 | { | 1717 | { |
| 1726 | /* | 1718 | /* |
| 1727 | * Time slice is 0 for SCHED_FIFO tasks | 1719 | * Time slice is 0 for SCHED_FIFO tasks |
| @@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = { | |||
| 1746 | #ifdef CONFIG_SMP | 1738 | #ifdef CONFIG_SMP |
| 1747 | .select_task_rq = select_task_rq_rt, | 1739 | .select_task_rq = select_task_rq_rt, |
| 1748 | 1740 | ||
| 1749 | .load_balance = load_balance_rt, | ||
| 1750 | .move_one_task = move_one_task_rt, | ||
| 1751 | .set_cpus_allowed = set_cpus_allowed_rt, | 1741 | .set_cpus_allowed = set_cpus_allowed_rt, |
| 1752 | .rq_online = rq_online_rt, | 1742 | .rq_online = rq_online_rt, |
| 1753 | .rq_offline = rq_offline_rt, | 1743 | .rq_offline = rq_offline_rt, |
diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b9..5bb9baffa4f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -159,6 +159,10 @@ void recalc_sigpending(void) | |||
| 159 | 159 | ||
| 160 | /* Given the mask, find the first available signal that should be serviced. */ | 160 | /* Given the mask, find the first available signal that should be serviced. */ |
| 161 | 161 | ||
| 162 | #define SYNCHRONOUS_MASK \ | ||
| 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | ||
| 164 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | ||
| 165 | |||
| 162 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
| 163 | { | 167 | { |
| 164 | unsigned long i, *s, *m, x; | 168 | unsigned long i, *s, *m, x; |
| @@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) | |||
| 166 | 170 | ||
| 167 | s = pending->signal.sig; | 171 | s = pending->signal.sig; |
| 168 | m = mask->sig; | 172 | m = mask->sig; |
| 173 | |||
| 174 | /* | ||
| 175 | * Handle the first word specially: it contains the | ||
| 176 | * synchronous signals that need to be dequeued first. | ||
| 177 | */ | ||
| 178 | x = *s &~ *m; | ||
| 179 | if (x) { | ||
| 180 | if (x & SYNCHRONOUS_MASK) | ||
| 181 | x &= SYNCHRONOUS_MASK; | ||
| 182 | sig = ffz(~x) + 1; | ||
| 183 | return sig; | ||
| 184 | } | ||
| 185 | |||
| 169 | switch (_NSIG_WORDS) { | 186 | switch (_NSIG_WORDS) { |
| 170 | default: | 187 | default: |
| 171 | for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) | 188 | for (i = 1; i < _NSIG_WORDS; ++i) { |
| 172 | if ((x = *s &~ *m) != 0) { | 189 | x = *++s &~ *++m; |
| 173 | sig = ffz(~x) + i*_NSIG_BPW + 1; | 190 | if (!x) |
| 174 | break; | 191 | continue; |
| 175 | } | 192 | sig = ffz(~x) + i*_NSIG_BPW + 1; |
| 193 | break; | ||
| 194 | } | ||
| 176 | break; | 195 | break; |
| 177 | 196 | ||
| 178 | case 2: if ((x = s[0] &~ m[0]) != 0) | 197 | case 2: |
| 179 | sig = 1; | 198 | x = s[1] &~ m[1]; |
| 180 | else if ((x = s[1] &~ m[1]) != 0) | 199 | if (!x) |
| 181 | sig = _NSIG_BPW + 1; | ||
| 182 | else | ||
| 183 | break; | 200 | break; |
| 184 | sig += ffz(~x); | 201 | sig = ffz(~x) + _NSIG_BPW + 1; |
| 185 | break; | 202 | break; |
| 186 | 203 | ||
| 187 | case 1: if ((x = *s &~ *m) != 0) | 204 | case 1: |
| 188 | sig = ffz(~x) + 1; | 205 | /* Nothing to do */ |
| 189 | break; | 206 | break; |
| 190 | } | 207 | } |
| 191 | 208 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index f10408422444..9867b6bfefce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -12,8 +12,6 @@ | |||
| 12 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
| 13 | #include <linux/cpu.h> | 13 | #include <linux/cpu.h> |
| 14 | 14 | ||
| 15 | static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); | ||
| 16 | |||
| 17 | static struct { | 15 | static struct { |
| 18 | struct list_head queue; | 16 | struct list_head queue; |
| 19 | raw_spinlock_t lock; | 17 | raw_spinlock_t lock; |
| @@ -33,12 +31,14 @@ struct call_function_data { | |||
| 33 | cpumask_var_t cpumask; | 31 | cpumask_var_t cpumask; |
| 34 | }; | 32 | }; |
| 35 | 33 | ||
| 34 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | ||
| 35 | |||
| 36 | struct call_single_queue { | 36 | struct call_single_queue { |
| 37 | struct list_head list; | 37 | struct list_head list; |
| 38 | raw_spinlock_t lock; | 38 | raw_spinlock_t lock; |
| 39 | }; | 39 | }; |
| 40 | 40 | ||
| 41 | static DEFINE_PER_CPU(struct call_function_data, cfd_data); | 41 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); |
| 42 | 42 | ||
| 43 | static int | 43 | static int |
| 44 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | 44 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| @@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void) | |||
| 256 | } | 256 | } |
| 257 | } | 257 | } |
| 258 | 258 | ||
| 259 | static DEFINE_PER_CPU(struct call_single_data, csd_data); | 259 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); |
| 260 | 260 | ||
| 261 | /* | 261 | /* |
| 262 | * smp_call_function_single - Run a function on a specific CPU | 262 | * smp_call_function_single - Run a function on a specific CPU |
diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef75..7c1a67ef0274 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); | |||
| 500 | */ | 500 | */ |
| 501 | 501 | ||
| 502 | /* | 502 | /* |
| 503 | * The trampoline is called when the hrtimer expires. If this is | 503 | * The trampoline is called when the hrtimer expires. It schedules a tasklet |
| 504 | * called from the hrtimer interrupt then we schedule the tasklet as | 504 | * to run __tasklet_hrtimer_trampoline() which in turn will call the intended |
| 505 | * the timer callback function expects to run in softirq context. If | 505 | * hrtimer callback, but from softirq context. |
| 506 | * it's called in softirq context anyway (i.e. high resolution timers | ||
| 507 | * disabled) then the hrtimer callback is called right away. | ||
| 508 | */ | 506 | */ |
| 509 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) | 507 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) |
| 510 | { | 508 | { |
| 511 | struct tasklet_hrtimer *ttimer = | 509 | struct tasklet_hrtimer *ttimer = |
| 512 | container_of(timer, struct tasklet_hrtimer, timer); | 510 | container_of(timer, struct tasklet_hrtimer, timer); |
| 513 | 511 | ||
| 514 | if (hrtimer_is_hres_active(timer)) { | 512 | tasklet_hi_schedule(&ttimer->tasklet); |
| 515 | tasklet_hi_schedule(&ttimer->tasklet); | 513 | return HRTIMER_NORESTART; |
| 516 | return HRTIMER_NORESTART; | ||
| 517 | } | ||
| 518 | return ttimer->function(timer); | ||
| 519 | } | 514 | } |
| 520 | 515 | ||
| 521 | /* | 516 | /* |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 818d7d9aa03c..bde4295774c8 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
| @@ -34,6 +34,30 @@ | |||
| 34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
| 35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
| 36 | 36 | ||
| 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
| 38 | { | ||
| 39 | sp->completed = 0; | ||
| 40 | mutex_init(&sp->mutex); | ||
| 41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
| 42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | ||
| 43 | } | ||
| 44 | |||
| 45 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 46 | |||
| 47 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
| 48 | struct lock_class_key *key) | ||
| 49 | { | ||
| 50 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 51 | /* Don't re-initialize a lock while it is held. */ | ||
| 52 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
| 53 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
| 54 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 55 | return init_srcu_struct_fields(sp); | ||
| 56 | } | ||
| 57 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
| 58 | |||
| 59 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 60 | |||
| 37 | /** | 61 | /** |
| 38 | * init_srcu_struct - initialize a sleep-RCU structure | 62 | * init_srcu_struct - initialize a sleep-RCU structure |
| 39 | * @sp: structure to initialize. | 63 | * @sp: structure to initialize. |
| @@ -44,13 +68,12 @@ | |||
| 44 | */ | 68 | */ |
| 45 | int init_srcu_struct(struct srcu_struct *sp) | 69 | int init_srcu_struct(struct srcu_struct *sp) |
| 46 | { | 70 | { |
| 47 | sp->completed = 0; | 71 | return init_srcu_struct_fields(sp); |
| 48 | mutex_init(&sp->mutex); | ||
| 49 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
| 50 | return (sp->per_cpu_ref ? 0 : -ENOMEM); | ||
| 51 | } | 72 | } |
| 52 | EXPORT_SYMBOL_GPL(init_srcu_struct); | 73 | EXPORT_SYMBOL_GPL(init_srcu_struct); |
| 53 | 74 | ||
| 75 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 76 | |||
| 54 | /* | 77 | /* |
| 55 | * srcu_readers_active_idx -- returns approximate number of readers | 78 | * srcu_readers_active_idx -- returns approximate number of readers |
| 56 | * active on the specified rank of per-CPU counters. | 79 | * active on the specified rank of per-CPU counters. |
| @@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
| 100 | } | 123 | } |
| 101 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | 124 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
| 102 | 125 | ||
| 103 | /** | 126 | /* |
| 104 | * srcu_read_lock - register a new reader for an SRCU-protected structure. | ||
| 105 | * @sp: srcu_struct in which to register the new reader. | ||
| 106 | * | ||
| 107 | * Counts the new reader in the appropriate per-CPU element of the | 127 | * Counts the new reader in the appropriate per-CPU element of the |
| 108 | * srcu_struct. Must be called from process context. | 128 | * srcu_struct. Must be called from process context. |
| 109 | * Returns an index that must be passed to the matching srcu_read_unlock(). | 129 | * Returns an index that must be passed to the matching srcu_read_unlock(). |
| 110 | */ | 130 | */ |
| 111 | int srcu_read_lock(struct srcu_struct *sp) | 131 | int __srcu_read_lock(struct srcu_struct *sp) |
| 112 | { | 132 | { |
| 113 | int idx; | 133 | int idx; |
| 114 | 134 | ||
| @@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp) | |||
| 120 | preempt_enable(); | 140 | preempt_enable(); |
| 121 | return idx; | 141 | return idx; |
| 122 | } | 142 | } |
| 123 | EXPORT_SYMBOL_GPL(srcu_read_lock); | 143 | EXPORT_SYMBOL_GPL(__srcu_read_lock); |
| 124 | 144 | ||
| 125 | /** | 145 | /* |
| 126 | * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. | ||
| 127 | * @sp: srcu_struct in which to unregister the old reader. | ||
| 128 | * @idx: return value from corresponding srcu_read_lock(). | ||
| 129 | * | ||
| 130 | * Removes the count for the old reader from the appropriate per-CPU | 146 | * Removes the count for the old reader from the appropriate per-CPU |
| 131 | * element of the srcu_struct. Note that this may well be a different | 147 | * element of the srcu_struct. Note that this may well be a different |
| 132 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | 148 | * CPU than that which was incremented by the corresponding srcu_read_lock(). |
| 133 | * Must be called from process context. | 149 | * Must be called from process context. |
| 134 | */ | 150 | */ |
| 135 | void srcu_read_unlock(struct srcu_struct *sp, int idx) | 151 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
| 136 | { | 152 | { |
| 137 | preempt_disable(); | 153 | preempt_disable(); |
| 138 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 154 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ |
| 139 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 155 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; |
| 140 | preempt_enable(); | 156 | preempt_enable(); |
| 141 | } | 157 | } |
| 142 | EXPORT_SYMBOL_GPL(srcu_read_unlock); | 158 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
| 143 | 159 | ||
| 144 | /* | 160 | /* |
| 145 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 161 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
| 146 | */ | 162 | */ |
| 147 | void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 163 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
| 148 | { | 164 | { |
| 149 | int idx; | 165 | int idx; |
| 150 | 166 | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..9bb9fb1bd79c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -45,7 +45,7 @@ static int refcount; | |||
| 45 | static struct workqueue_struct *stop_machine_wq; | 45 | static struct workqueue_struct *stop_machine_wq; |
| 46 | static struct stop_machine_data active, idle; | 46 | static struct stop_machine_data active, idle; |
| 47 | static const struct cpumask *active_cpus; | 47 | static const struct cpumask *active_cpus; |
| 48 | static void *stop_machine_work; | 48 | static void __percpu *stop_machine_work; |
| 49 | 49 | ||
| 50 | static void set_state(enum stopmachine_state newstate) | 50 | static void set_state(enum stopmachine_state newstate) |
| 51 | { | 51 | { |
diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..877fe4f8e05e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 222 | if (which > PRIO_USER || which < PRIO_PROCESS) | 222 | if (which > PRIO_USER || which < PRIO_PROCESS) |
| 223 | return -EINVAL; | 223 | return -EINVAL; |
| 224 | 224 | ||
| 225 | rcu_read_lock(); | ||
| 225 | read_lock(&tasklist_lock); | 226 | read_lock(&tasklist_lock); |
| 226 | switch (which) { | 227 | switch (which) { |
| 227 | case PRIO_PROCESS: | 228 | case PRIO_PROCESS: |
| @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 267 | } | 268 | } |
| 268 | out_unlock: | 269 | out_unlock: |
| 269 | read_unlock(&tasklist_lock); | 270 | read_unlock(&tasklist_lock); |
| 271 | rcu_read_unlock(); | ||
| 270 | 272 | ||
| 271 | return retval; | 273 | return retval; |
| 272 | } | 274 | } |
| @@ -569,11 +571,6 @@ static int set_user(struct cred *new) | |||
| 569 | if (!new_user) | 571 | if (!new_user) |
| 570 | return -EAGAIN; | 572 | return -EAGAIN; |
| 571 | 573 | ||
| 572 | if (!task_can_switch_user(new_user, current)) { | ||
| 573 | free_uid(new_user); | ||
| 574 | return -EINVAL; | ||
| 575 | } | ||
| 576 | |||
| 577 | if (atomic_read(&new_user->processes) >= | 574 | if (atomic_read(&new_user->processes) >= |
| 578 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | 575 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && |
| 579 | new_user != INIT_USER) { | 576 | new_user != INIT_USER) { |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..0ef19c614f6d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -50,6 +50,7 @@ | |||
| 50 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
| 51 | #include <linux/slow-work.h> | 51 | #include <linux/slow-work.h> |
| 52 | #include <linux/perf_event.h> | 52 | #include <linux/perf_event.h> |
| 53 | #include <linux/kprobes.h> | ||
| 53 | 54 | ||
| 54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
| 55 | #include <asm/processor.h> | 56 | #include <asm/processor.h> |
| @@ -1441,7 +1442,7 @@ static struct ctl_table fs_table[] = { | |||
| 1441 | }; | 1442 | }; |
| 1442 | 1443 | ||
| 1443 | static struct ctl_table debug_table[] = { | 1444 | static struct ctl_table debug_table[] = { |
| 1444 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | 1445 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) |
| 1445 | { | 1446 | { |
| 1446 | .procname = "exception-trace", | 1447 | .procname = "exception-trace", |
| 1447 | .data = &show_unhandled_signals, | 1448 | .data = &show_unhandled_signals, |
| @@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = { | |||
| 1450 | .proc_handler = proc_dointvec | 1451 | .proc_handler = proc_dointvec |
| 1451 | }, | 1452 | }, |
| 1452 | #endif | 1453 | #endif |
| 1454 | #if defined(CONFIG_OPTPROBES) | ||
| 1455 | { | ||
| 1456 | .procname = "kprobes-optimization", | ||
| 1457 | .data = &sysctl_kprobes_optimization, | ||
| 1458 | .maxlen = sizeof(int), | ||
| 1459 | .mode = 0644, | ||
| 1460 | .proc_handler = proc_kprobes_optimization_handler, | ||
| 1461 | .extra1 = &zero, | ||
| 1462 | .extra2 = &one, | ||
| 1463 | }, | ||
| 1464 | #endif | ||
| 1453 | { } | 1465 | { } |
| 1454 | }; | 1466 | }; |
| 1455 | 1467 | ||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..8cd50d8f9bde 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1331 | ssize_t result; | 1331 | ssize_t result; |
| 1332 | char *pathname; | 1332 | char *pathname; |
| 1333 | int flags; | 1333 | int flags; |
| 1334 | int acc_mode, fmode; | 1334 | int acc_mode; |
| 1335 | 1335 | ||
| 1336 | pathname = sysctl_getname(name, nlen, &table); | 1336 | pathname = sysctl_getname(name, nlen, &table); |
| 1337 | result = PTR_ERR(pathname); | 1337 | result = PTR_ERR(pathname); |
| @@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1342 | if (oldval && oldlen && newval && newlen) { | 1342 | if (oldval && oldlen && newval && newlen) { |
| 1343 | flags = O_RDWR; | 1343 | flags = O_RDWR; |
| 1344 | acc_mode = MAY_READ | MAY_WRITE; | 1344 | acc_mode = MAY_READ | MAY_WRITE; |
| 1345 | fmode = FMODE_READ | FMODE_WRITE; | ||
| 1346 | } else if (newval && newlen) { | 1345 | } else if (newval && newlen) { |
| 1347 | flags = O_WRONLY; | 1346 | flags = O_WRONLY; |
| 1348 | acc_mode = MAY_WRITE; | 1347 | acc_mode = MAY_WRITE; |
| 1349 | fmode = FMODE_WRITE; | ||
| 1350 | } else if (oldval && oldlen) { | 1348 | } else if (oldval && oldlen) { |
| 1351 | flags = O_RDONLY; | 1349 | flags = O_RDONLY; |
| 1352 | acc_mode = MAY_READ; | 1350 | acc_mode = MAY_READ; |
| 1353 | fmode = FMODE_READ; | ||
| 1354 | } else { | 1351 | } else { |
| 1355 | result = 0; | 1352 | result = 0; |
| 1356 | goto out_putname; | 1353 | goto out_putname; |
| @@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1361 | if (result) | 1358 | if (result) |
| 1362 | goto out_putname; | 1359 | goto out_putname; |
| 1363 | 1360 | ||
| 1364 | result = may_open(&nd.path, acc_mode, fmode); | 1361 | result = may_open(&nd.path, acc_mode, flags); |
| 1365 | if (result) | 1362 | if (result) |
| 1366 | goto out_putpath; | 1363 | goto out_putpath; |
| 1367 | 1364 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa7..899ca51be5e8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -46,15 +46,13 @@ static struct genl_family family = { | |||
| 46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | 46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, |
| 47 | }; | 47 | }; |
| 48 | 48 | ||
| 49 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | 49 | static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { |
| 50 | __read_mostly = { | ||
| 51 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | 50 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, |
| 52 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | 51 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, |
| 53 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | 52 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
| 54 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | 53 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; |
| 55 | 54 | ||
| 56 | static struct nla_policy | 55 | static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { |
| 57 | cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { | ||
| 58 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, | 56 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, |
| 59 | }; | 57 | }; |
| 60 | 58 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 13700833c181..1f663d23e85e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -453,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; } | |||
| 453 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 453 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
| 454 | 454 | ||
| 455 | /** | 455 | /** |
| 456 | * clocksource_suspend - suspend the clocksource(s) | ||
| 457 | */ | ||
| 458 | void clocksource_suspend(void) | ||
| 459 | { | ||
| 460 | struct clocksource *cs; | ||
| 461 | |||
| 462 | list_for_each_entry_reverse(cs, &clocksource_list, list) | ||
| 463 | if (cs->suspend) | ||
| 464 | cs->suspend(cs); | ||
| 465 | } | ||
| 466 | |||
| 467 | /** | ||
| 456 | * clocksource_resume - resume the clocksource(s) | 468 | * clocksource_resume - resume the clocksource(s) |
| 457 | */ | 469 | */ |
| 458 | void clocksource_resume(void) | 470 | void clocksource_resume(void) |
| @@ -461,7 +473,7 @@ void clocksource_resume(void) | |||
| 461 | 473 | ||
| 462 | list_for_each_entry(cs, &clocksource_list, list) | 474 | list_for_each_entry(cs, &clocksource_list, list) |
| 463 | if (cs->resume) | 475 | if (cs->resume) |
| 464 | cs->resume(); | 476 | cs->resume(cs); |
| 465 | 477 | ||
| 466 | clocksource_resume_watchdog(); | 478 | clocksource_resume_watchdog(); |
| 467 | } | 479 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910e..7c0f180d6e9d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -58,10 +58,10 @@ static s64 time_offset; | |||
| 58 | static long time_constant = 2; | 58 | static long time_constant = 2; |
| 59 | 59 | ||
| 60 | /* maximum error (usecs): */ | 60 | /* maximum error (usecs): */ |
| 61 | long time_maxerror = NTP_PHASE_LIMIT; | 61 | static long time_maxerror = NTP_PHASE_LIMIT; |
| 62 | 62 | ||
| 63 | /* estimated error (usecs): */ | 63 | /* estimated error (usecs): */ |
| 64 | long time_esterror = NTP_PHASE_LIMIT; | 64 | static long time_esterror = NTP_PHASE_LIMIT; |
| 65 | 65 | ||
| 66 | /* frequency offset (scaled nsecs/secs): */ | 66 | /* frequency offset (scaled nsecs/secs): */ |
| 67 | static s64 time_freq; | 67 | static s64 time_freq; |
| @@ -142,11 +142,11 @@ static void ntp_update_offset(long offset) | |||
| 142 | * Select how the frequency is to be controlled | 142 | * Select how the frequency is to be controlled |
| 143 | * and in which mode (PLL or FLL). | 143 | * and in which mode (PLL or FLL). |
| 144 | */ | 144 | */ |
| 145 | secs = xtime.tv_sec - time_reftime; | 145 | secs = get_seconds() - time_reftime; |
| 146 | if (unlikely(time_status & STA_FREQHOLD)) | 146 | if (unlikely(time_status & STA_FREQHOLD)) |
| 147 | secs = 0; | 147 | secs = 0; |
| 148 | 148 | ||
| 149 | time_reftime = xtime.tv_sec; | 149 | time_reftime = get_seconds(); |
| 150 | 150 | ||
| 151 | offset64 = offset; | 151 | offset64 = offset; |
| 152 | freq_adj = (offset64 * secs) << | 152 | freq_adj = (offset64 * secs) << |
| @@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
| 368 | * reference time to current time. | 368 | * reference time to current time. |
| 369 | */ | 369 | */ |
| 370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) | 370 | if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) |
| 371 | time_reftime = xtime.tv_sec; | 371 | time_reftime = get_seconds(); |
| 372 | 372 | ||
| 373 | /* only set allowed bits */ | 373 | /* only set allowed bits */ |
| 374 | time_status &= STA_RONLY; | 374 | time_status &= STA_RONLY; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4f..16736379a9ca 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
| 622 | write_sequnlock_irqrestore(&xtime_lock, flags); | 622 | write_sequnlock_irqrestore(&xtime_lock, flags); |
| 623 | 623 | ||
| 624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 624 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
| 625 | clocksource_suspend(); | ||
| 625 | 626 | ||
| 626 | return 0; | 627 | return 0; |
| 627 | } | 628 | } |
| @@ -880,6 +881,7 @@ void getboottime(struct timespec *ts) | |||
| 880 | 881 | ||
| 881 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | 882 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); |
| 882 | } | 883 | } |
| 884 | EXPORT_SYMBOL_GPL(getboottime); | ||
| 883 | 885 | ||
| 884 | /** | 886 | /** |
| 885 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | 887 | * monotonic_to_bootbased - Convert the monotonic time to boot based. |
| @@ -889,6 +891,7 @@ void monotonic_to_bootbased(struct timespec *ts) | |||
| 889 | { | 891 | { |
| 890 | *ts = timespec_add_safe(*ts, total_sleep_time); | 892 | *ts = timespec_add_safe(*ts, total_sleep_time); |
| 891 | } | 893 | } |
| 894 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | ||
| 892 | 895 | ||
| 893 | unsigned long get_seconds(void) | 896 | unsigned long get_seconds(void) |
| 894 | { | 897 | { |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 60e2ce0181ee..13e13d428cd3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -328,15 +328,6 @@ config BRANCH_TRACER | |||
| 328 | 328 | ||
| 329 | Say N if unsure. | 329 | Say N if unsure. |
| 330 | 330 | ||
| 331 | config POWER_TRACER | ||
| 332 | bool "Trace power consumption behavior" | ||
| 333 | depends on X86 | ||
| 334 | select GENERIC_TRACER | ||
| 335 | help | ||
| 336 | This tracer helps developers to analyze and optimize the kernel's | ||
| 337 | power management decisions, specifically the C-state and P-state | ||
| 338 | behavior. | ||
| 339 | |||
| 340 | config KSYM_TRACER | 331 | config KSYM_TRACER |
| 341 | bool "Trace read and write access on kernel memory locations" | 332 | bool "Trace read and write access on kernel memory locations" |
| 342 | depends on HAVE_HW_BREAKPOINT | 333 | depends on HAVE_HW_BREAKPOINT |
| @@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE | |||
| 449 | 440 | ||
| 450 | config KPROBE_EVENT | 441 | config KPROBE_EVENT |
| 451 | depends on KPROBES | 442 | depends on KPROBES |
| 452 | depends on X86 | 443 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
| 453 | bool "Enable kprobes-based dynamic events" | 444 | bool "Enable kprobes-based dynamic events" |
| 454 | select TRACING | 445 | select TRACING |
| 455 | default y | 446 | default y |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec77..d00c6fe23f54 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -51,7 +51,9 @@ endif | |||
| 51 | obj-$(CONFIG_EVENT_TRACING) += trace_events.o | 51 | obj-$(CONFIG_EVENT_TRACING) += trace_events.o |
| 52 | obj-$(CONFIG_EVENT_TRACING) += trace_export.o | 52 | obj-$(CONFIG_EVENT_TRACING) += trace_export.o |
| 53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o | 53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o |
| 54 | obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o | 54 | ifeq ($(CONFIG_PERF_EVENTS),y) |
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o | ||
| 56 | endif | ||
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 57 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
| 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 58 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
| 57 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o | 59 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d9d6206e0b14..07f945a99430 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
| 540 | if (ret) | 540 | if (ret) |
| 541 | return ret; | 541 | return ret; |
| 542 | 542 | ||
| 543 | if (copy_to_user(arg, &buts, sizeof(buts))) | 543 | if (copy_to_user(arg, &buts, sizeof(buts))) { |
| 544 | blk_trace_remove(q); | ||
| 544 | return -EFAULT; | 545 | return -EFAULT; |
| 545 | 546 | } | |
| 546 | return 0; | 547 | return 0; |
| 547 | } | 548 | } |
| 548 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 549 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f80454..83783579378f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -22,7 +22,6 @@ | |||
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
| 23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
| 24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
| 25 | #include <linux/kprobes.h> | ||
| 26 | #include <linux/ftrace.h> | 25 | #include <linux/ftrace.h> |
| 27 | #include <linux/sysctl.h> | 26 | #include <linux/sysctl.h> |
| 28 | #include <linux/ctype.h> | 27 | #include <linux/ctype.h> |
| @@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records; | |||
| 898 | } \ | 897 | } \ |
| 899 | } | 898 | } |
| 900 | 899 | ||
| 901 | #ifdef CONFIG_KPROBES | ||
| 902 | |||
| 903 | static int frozen_record_count; | ||
| 904 | |||
| 905 | static inline void freeze_record(struct dyn_ftrace *rec) | ||
| 906 | { | ||
| 907 | if (!(rec->flags & FTRACE_FL_FROZEN)) { | ||
| 908 | rec->flags |= FTRACE_FL_FROZEN; | ||
| 909 | frozen_record_count++; | ||
| 910 | } | ||
| 911 | } | ||
| 912 | |||
| 913 | static inline void unfreeze_record(struct dyn_ftrace *rec) | ||
| 914 | { | ||
| 915 | if (rec->flags & FTRACE_FL_FROZEN) { | ||
| 916 | rec->flags &= ~FTRACE_FL_FROZEN; | ||
| 917 | frozen_record_count--; | ||
| 918 | } | ||
| 919 | } | ||
| 920 | |||
| 921 | static inline int record_frozen(struct dyn_ftrace *rec) | ||
| 922 | { | ||
| 923 | return rec->flags & FTRACE_FL_FROZEN; | ||
| 924 | } | ||
| 925 | #else | ||
| 926 | # define freeze_record(rec) ({ 0; }) | ||
| 927 | # define unfreeze_record(rec) ({ 0; }) | ||
| 928 | # define record_frozen(rec) ({ 0; }) | ||
| 929 | #endif /* CONFIG_KPROBES */ | ||
| 930 | |||
| 931 | static void ftrace_free_rec(struct dyn_ftrace *rec) | 900 | static void ftrace_free_rec(struct dyn_ftrace *rec) |
| 932 | { | 901 | { |
| 933 | rec->freelist = ftrace_free_records; | 902 | rec->freelist = ftrace_free_records; |
| @@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip) | |||
| 1025 | } | 994 | } |
| 1026 | 995 | ||
| 1027 | 996 | ||
| 997 | /* Return 1 if the address range is reserved for ftrace */ | ||
| 998 | int ftrace_text_reserved(void *start, void *end) | ||
| 999 | { | ||
| 1000 | struct dyn_ftrace *rec; | ||
| 1001 | struct ftrace_page *pg; | ||
| 1002 | |||
| 1003 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1004 | if (rec->ip <= (unsigned long)end && | ||
| 1005 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
| 1006 | return 1; | ||
| 1007 | } while_for_each_ftrace_rec(); | ||
| 1008 | return 0; | ||
| 1009 | } | ||
| 1010 | |||
| 1011 | |||
| 1028 | static int | 1012 | static int |
| 1029 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1013 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) |
| 1030 | { | 1014 | { |
| @@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable) | |||
| 1076 | !(rec->flags & FTRACE_FL_CONVERTED)) | 1060 | !(rec->flags & FTRACE_FL_CONVERTED)) |
| 1077 | continue; | 1061 | continue; |
| 1078 | 1062 | ||
| 1079 | /* ignore updates to this record's mcount site */ | ||
| 1080 | if (get_kprobe((void *)rec->ip)) { | ||
| 1081 | freeze_record(rec); | ||
| 1082 | continue; | ||
| 1083 | } else { | ||
| 1084 | unfreeze_record(rec); | ||
| 1085 | } | ||
| 1086 | |||
| 1087 | failed = __ftrace_replace_code(rec, enable); | 1063 | failed = __ftrace_replace_code(rec, enable); |
| 1088 | if (failed) { | 1064 | if (failed) { |
| 1089 | rec->flags |= FTRACE_FL_FAILED; | 1065 | rec->flags |= FTRACE_FL_FAILED; |
| @@ -2426,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
| 2426 | static DEFINE_MUTEX(graph_lock); | 2402 | static DEFINE_MUTEX(graph_lock); |
| 2427 | 2403 | ||
| 2428 | int ftrace_graph_count; | 2404 | int ftrace_graph_count; |
| 2405 | int ftrace_graph_filter_enabled; | ||
| 2429 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; | 2406 | unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; |
| 2430 | 2407 | ||
| 2431 | static void * | 2408 | static void * |
| @@ -2448,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) | |||
| 2448 | mutex_lock(&graph_lock); | 2425 | mutex_lock(&graph_lock); |
| 2449 | 2426 | ||
| 2450 | /* Nothing, tell g_show to print all functions are enabled */ | 2427 | /* Nothing, tell g_show to print all functions are enabled */ |
| 2451 | if (!ftrace_graph_count && !*pos) | 2428 | if (!ftrace_graph_filter_enabled && !*pos) |
| 2452 | return (void *)1; | 2429 | return (void *)1; |
| 2453 | 2430 | ||
| 2454 | return __g_next(m, pos); | 2431 | return __g_next(m, pos); |
| @@ -2494,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) | |||
| 2494 | mutex_lock(&graph_lock); | 2471 | mutex_lock(&graph_lock); |
| 2495 | if ((file->f_mode & FMODE_WRITE) && | 2472 | if ((file->f_mode & FMODE_WRITE) && |
| 2496 | (file->f_flags & O_TRUNC)) { | 2473 | (file->f_flags & O_TRUNC)) { |
| 2474 | ftrace_graph_filter_enabled = 0; | ||
| 2497 | ftrace_graph_count = 0; | 2475 | ftrace_graph_count = 0; |
| 2498 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); | 2476 | memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); |
| 2499 | } | 2477 | } |
| @@ -2519,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
| 2519 | struct dyn_ftrace *rec; | 2497 | struct dyn_ftrace *rec; |
| 2520 | struct ftrace_page *pg; | 2498 | struct ftrace_page *pg; |
| 2521 | int search_len; | 2499 | int search_len; |
| 2522 | int found = 0; | 2500 | int fail = 1; |
| 2523 | int type, not; | 2501 | int type, not; |
| 2524 | char *search; | 2502 | char *search; |
| 2525 | bool exists; | 2503 | bool exists; |
| @@ -2530,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
| 2530 | 2508 | ||
| 2531 | /* decode regex */ | 2509 | /* decode regex */ |
| 2532 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 2510 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
| 2533 | if (not) | 2511 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
| 2534 | return -EINVAL; | 2512 | return -EBUSY; |
| 2535 | 2513 | ||
| 2536 | search_len = strlen(search); | 2514 | search_len = strlen(search); |
| 2537 | 2515 | ||
| 2538 | mutex_lock(&ftrace_lock); | 2516 | mutex_lock(&ftrace_lock); |
| 2539 | do_for_each_ftrace_rec(pg, rec) { | 2517 | do_for_each_ftrace_rec(pg, rec) { |
| 2540 | 2518 | ||
| 2541 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
| 2542 | break; | ||
| 2543 | |||
| 2544 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 2519 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) |
| 2545 | continue; | 2520 | continue; |
| 2546 | 2521 | ||
| 2547 | if (ftrace_match_record(rec, search, search_len, type)) { | 2522 | if (ftrace_match_record(rec, search, search_len, type)) { |
| 2548 | /* ensure it is not already in the array */ | 2523 | /* if it is in the array */ |
| 2549 | exists = false; | 2524 | exists = false; |
| 2550 | for (i = 0; i < *idx; i++) | 2525 | for (i = 0; i < *idx; i++) { |
| 2551 | if (array[i] == rec->ip) { | 2526 | if (array[i] == rec->ip) { |
| 2552 | exists = true; | 2527 | exists = true; |
| 2553 | break; | 2528 | break; |
| 2554 | } | 2529 | } |
| 2555 | if (!exists) | 2530 | } |
| 2556 | array[(*idx)++] = rec->ip; | 2531 | |
| 2557 | found = 1; | 2532 | if (!not) { |
| 2533 | fail = 0; | ||
| 2534 | if (!exists) { | ||
| 2535 | array[(*idx)++] = rec->ip; | ||
| 2536 | if (*idx >= FTRACE_GRAPH_MAX_FUNCS) | ||
| 2537 | goto out; | ||
| 2538 | } | ||
| 2539 | } else { | ||
| 2540 | if (exists) { | ||
| 2541 | array[i] = array[--(*idx)]; | ||
| 2542 | array[*idx] = 0; | ||
| 2543 | fail = 0; | ||
| 2544 | } | ||
| 2545 | } | ||
| 2558 | } | 2546 | } |
| 2559 | } while_for_each_ftrace_rec(); | 2547 | } while_for_each_ftrace_rec(); |
| 2560 | 2548 | out: | |
| 2561 | mutex_unlock(&ftrace_lock); | 2549 | mutex_unlock(&ftrace_lock); |
| 2562 | 2550 | ||
| 2563 | return found ? 0 : -EINVAL; | 2551 | if (fail) |
| 2552 | return -EINVAL; | ||
| 2553 | |||
| 2554 | ftrace_graph_filter_enabled = 1; | ||
| 2555 | return 0; | ||
| 2564 | } | 2556 | } |
| 2565 | 2557 | ||
| 2566 | static ssize_t | 2558 | static ssize_t |
| @@ -2570,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, | |||
| 2570 | struct trace_parser parser; | 2562 | struct trace_parser parser; |
| 2571 | ssize_t read, ret; | 2563 | ssize_t read, ret; |
| 2572 | 2564 | ||
| 2573 | if (!cnt || cnt < 0) | 2565 | if (!cnt) |
| 2574 | return 0; | 2566 | return 0; |
| 2575 | 2567 | ||
| 2576 | mutex_lock(&graph_lock); | 2568 | mutex_lock(&graph_lock); |
| 2577 | 2569 | ||
| 2578 | if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { | ||
| 2579 | ret = -EBUSY; | ||
| 2580 | goto out_unlock; | ||
| 2581 | } | ||
| 2582 | |||
| 2583 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { | 2570 | if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { |
| 2584 | ret = -ENOMEM; | 2571 | ret = -ENOMEM; |
| 2585 | goto out_unlock; | 2572 | goto out_unlock; |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..0287f9f52f5a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
| 21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 22 | 22 | ||
| 23 | #include <asm/local.h> | ||
| 23 | #include "trace.h" | 24 | #include "trace.h" |
| 24 | 25 | ||
| 25 | /* | 26 | /* |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/time.h> | 10 | #include <linux/time.h> |
| 11 | #include <asm/local.h> | ||
| 11 | 12 | ||
| 12 | struct rb_page { | 13 | struct rb_page { |
| 13 | u64 ts; | 14 | u64 ts; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eac6875cb990..ed01fdba4a55 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
| 33 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
| 34 | #include <linux/string.h> | 34 | #include <linux/string.h> |
| 35 | #include <linux/rwsem.h> | ||
| 35 | #include <linux/ctype.h> | 36 | #include <linux/ctype.h> |
| 36 | #include <linux/init.h> | 37 | #include <linux/init.h> |
| 37 | #include <linux/poll.h> | 38 | #include <linux/poll.h> |
| @@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); | |||
| 91 | static inline void ftrace_disable_cpu(void) | 92 | static inline void ftrace_disable_cpu(void) |
| 92 | { | 93 | { |
| 93 | preempt_disable(); | 94 | preempt_disable(); |
| 94 | __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); | 95 | __this_cpu_inc(ftrace_cpu_disabled); |
| 95 | } | 96 | } |
| 96 | 97 | ||
| 97 | static inline void ftrace_enable_cpu(void) | 98 | static inline void ftrace_enable_cpu(void) |
| 98 | { | 99 | { |
| 99 | __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); | 100 | __this_cpu_dec(ftrace_cpu_disabled); |
| 100 | preempt_enable(); | 101 | preempt_enable(); |
| 101 | } | 102 | } |
| 102 | 103 | ||
| 103 | static cpumask_var_t __read_mostly tracing_buffer_mask; | 104 | static cpumask_var_t __read_mostly tracing_buffer_mask; |
| 104 | 105 | ||
| 105 | /* Define which cpu buffers are currently read in trace_pipe */ | ||
| 106 | static cpumask_var_t tracing_reader_cpumask; | ||
| 107 | |||
| 108 | #define for_each_tracing_cpu(cpu) \ | 106 | #define for_each_tracing_cpu(cpu) \ |
| 109 | for_each_cpu(cpu, tracing_buffer_mask) | 107 | for_each_cpu(cpu, tracing_buffer_mask) |
| 110 | 108 | ||
| @@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; | |||
| 243 | 241 | ||
| 244 | /* | 242 | /* |
| 245 | * trace_types_lock is used to protect the trace_types list. | 243 | * trace_types_lock is used to protect the trace_types list. |
| 246 | * This lock is also used to keep user access serialized. | ||
| 247 | * Accesses from userspace will grab this lock while userspace | ||
| 248 | * activities happen inside the kernel. | ||
| 249 | */ | 244 | */ |
| 250 | static DEFINE_MUTEX(trace_types_lock); | 245 | static DEFINE_MUTEX(trace_types_lock); |
| 251 | 246 | ||
| 247 | /* | ||
| 248 | * serialize the access of the ring buffer | ||
| 249 | * | ||
| 250 | * ring buffer serializes readers, but it is low level protection. | ||
| 251 | * The validity of the events (which returns by ring_buffer_peek() ..etc) | ||
| 252 | * are not protected by ring buffer. | ||
| 253 | * | ||
| 254 | * The content of events may become garbage if we allow other process consumes | ||
| 255 | * these events concurrently: | ||
| 256 | * A) the page of the consumed events may become a normal page | ||
| 257 | * (not reader page) in ring buffer, and this page will be rewrited | ||
| 258 | * by events producer. | ||
| 259 | * B) The page of the consumed events may become a page for splice_read, | ||
| 260 | * and this page will be returned to system. | ||
| 261 | * | ||
| 262 | * These primitives allow multi process access to different cpu ring buffer | ||
| 263 | * concurrently. | ||
| 264 | * | ||
| 265 | * These primitives don't distinguish read-only and read-consume access. | ||
| 266 | * Multi read-only access are also serialized. | ||
| 267 | */ | ||
| 268 | |||
| 269 | #ifdef CONFIG_SMP | ||
| 270 | static DECLARE_RWSEM(all_cpu_access_lock); | ||
| 271 | static DEFINE_PER_CPU(struct mutex, cpu_access_lock); | ||
| 272 | |||
| 273 | static inline void trace_access_lock(int cpu) | ||
| 274 | { | ||
| 275 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
| 276 | /* gain it for accessing the whole ring buffer. */ | ||
| 277 | down_write(&all_cpu_access_lock); | ||
| 278 | } else { | ||
| 279 | /* gain it for accessing a cpu ring buffer. */ | ||
| 280 | |||
| 281 | /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ | ||
| 282 | down_read(&all_cpu_access_lock); | ||
| 283 | |||
| 284 | /* Secondly block other access to this @cpu ring buffer. */ | ||
| 285 | mutex_lock(&per_cpu(cpu_access_lock, cpu)); | ||
| 286 | } | ||
| 287 | } | ||
| 288 | |||
| 289 | static inline void trace_access_unlock(int cpu) | ||
| 290 | { | ||
| 291 | if (cpu == TRACE_PIPE_ALL_CPU) { | ||
| 292 | up_write(&all_cpu_access_lock); | ||
| 293 | } else { | ||
| 294 | mutex_unlock(&per_cpu(cpu_access_lock, cpu)); | ||
| 295 | up_read(&all_cpu_access_lock); | ||
| 296 | } | ||
| 297 | } | ||
| 298 | |||
| 299 | static inline void trace_access_lock_init(void) | ||
| 300 | { | ||
| 301 | int cpu; | ||
| 302 | |||
| 303 | for_each_possible_cpu(cpu) | ||
| 304 | mutex_init(&per_cpu(cpu_access_lock, cpu)); | ||
| 305 | } | ||
| 306 | |||
| 307 | #else | ||
| 308 | |||
| 309 | static DEFINE_MUTEX(access_lock); | ||
| 310 | |||
| 311 | static inline void trace_access_lock(int cpu) | ||
| 312 | { | ||
| 313 | (void)cpu; | ||
| 314 | mutex_lock(&access_lock); | ||
| 315 | } | ||
| 316 | |||
| 317 | static inline void trace_access_unlock(int cpu) | ||
| 318 | { | ||
| 319 | (void)cpu; | ||
| 320 | mutex_unlock(&access_lock); | ||
| 321 | } | ||
| 322 | |||
| 323 | static inline void trace_access_lock_init(void) | ||
| 324 | { | ||
| 325 | } | ||
| 326 | |||
| 327 | #endif | ||
| 328 | |||
| 252 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ | 329 | /* trace_wait is a waitqueue for tasks blocked on trace_poll */ |
| 253 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | 330 | static DECLARE_WAIT_QUEUE_HEAD(trace_wait); |
| 254 | 331 | ||
| @@ -1089,7 +1166,7 @@ trace_function(struct trace_array *tr, | |||
| 1089 | struct ftrace_entry *entry; | 1166 | struct ftrace_entry *entry; |
| 1090 | 1167 | ||
| 1091 | /* If we are reading the ring buffer, don't trace */ | 1168 | /* If we are reading the ring buffer, don't trace */ |
| 1092 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 1169 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 1093 | return; | 1170 | return; |
| 1094 | 1171 | ||
| 1095 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), | 1172 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), |
| @@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
| 1320 | entry->fmt = fmt; | 1397 | entry->fmt = fmt; |
| 1321 | 1398 | ||
| 1322 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1399 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); |
| 1323 | if (!filter_check_discard(call, entry, buffer, event)) | 1400 | if (!filter_check_discard(call, entry, buffer, event)) { |
| 1324 | ring_buffer_unlock_commit(buffer, event); | 1401 | ring_buffer_unlock_commit(buffer, event); |
| 1402 | ftrace_trace_stack(buffer, flags, 6, pc); | ||
| 1403 | } | ||
| 1325 | 1404 | ||
| 1326 | out_unlock: | 1405 | out_unlock: |
| 1327 | arch_spin_unlock(&trace_buf_lock); | 1406 | arch_spin_unlock(&trace_buf_lock); |
| @@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr, | |||
| 1394 | 1473 | ||
| 1395 | memcpy(&entry->buf, trace_buf, len); | 1474 | memcpy(&entry->buf, trace_buf, len); |
| 1396 | entry->buf[len] = '\0'; | 1475 | entry->buf[len] = '\0'; |
| 1397 | if (!filter_check_discard(call, entry, buffer, event)) | 1476 | if (!filter_check_discard(call, entry, buffer, event)) { |
| 1398 | ring_buffer_unlock_commit(buffer, event); | 1477 | ring_buffer_unlock_commit(buffer, event); |
| 1478 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | ||
| 1479 | } | ||
| 1399 | 1480 | ||
| 1400 | out_unlock: | 1481 | out_unlock: |
| 1401 | arch_spin_unlock(&trace_buf_lock); | 1482 | arch_spin_unlock(&trace_buf_lock); |
| @@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
| 1585 | } | 1666 | } |
| 1586 | 1667 | ||
| 1587 | /* | 1668 | /* |
| 1588 | * No necessary locking here. The worst thing which can | ||
| 1589 | * happen is loosing events consumed at the same time | ||
| 1590 | * by a trace_pipe reader. | ||
| 1591 | * Other than that, we don't risk to crash the ring buffer | ||
| 1592 | * because it serializes the readers. | ||
| 1593 | * | ||
| 1594 | * The current tracer is copied to avoid a global locking | 1669 | * The current tracer is copied to avoid a global locking |
| 1595 | * all around. | 1670 | * all around. |
| 1596 | */ | 1671 | */ |
| @@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
| 1645 | } | 1720 | } |
| 1646 | 1721 | ||
| 1647 | trace_event_read_lock(); | 1722 | trace_event_read_lock(); |
| 1723 | trace_access_lock(cpu_file); | ||
| 1648 | return p; | 1724 | return p; |
| 1649 | } | 1725 | } |
| 1650 | 1726 | ||
| 1651 | static void s_stop(struct seq_file *m, void *p) | 1727 | static void s_stop(struct seq_file *m, void *p) |
| 1652 | { | 1728 | { |
| 1729 | struct trace_iterator *iter = m->private; | ||
| 1730 | |||
| 1653 | atomic_dec(&trace_record_cmdline_disabled); | 1731 | atomic_dec(&trace_record_cmdline_disabled); |
| 1732 | trace_access_unlock(iter->cpu_file); | ||
| 1654 | trace_event_read_unlock(); | 1733 | trace_event_read_unlock(); |
| 1655 | } | 1734 | } |
| 1656 | 1735 | ||
| @@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 2841 | 2920 | ||
| 2842 | mutex_lock(&trace_types_lock); | 2921 | mutex_lock(&trace_types_lock); |
| 2843 | 2922 | ||
| 2844 | /* We only allow one reader per cpu */ | ||
| 2845 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | ||
| 2846 | if (!cpumask_empty(tracing_reader_cpumask)) { | ||
| 2847 | ret = -EBUSY; | ||
| 2848 | goto out; | ||
| 2849 | } | ||
| 2850 | cpumask_setall(tracing_reader_cpumask); | ||
| 2851 | } else { | ||
| 2852 | if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) | ||
| 2853 | cpumask_set_cpu(cpu_file, tracing_reader_cpumask); | ||
| 2854 | else { | ||
| 2855 | ret = -EBUSY; | ||
| 2856 | goto out; | ||
| 2857 | } | ||
| 2858 | } | ||
| 2859 | |||
| 2860 | /* create a buffer to store the information to pass to userspace */ | 2923 | /* create a buffer to store the information to pass to userspace */ |
| 2861 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2924 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); |
| 2862 | if (!iter) { | 2925 | if (!iter) { |
| @@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
| 2912 | 2975 | ||
| 2913 | mutex_lock(&trace_types_lock); | 2976 | mutex_lock(&trace_types_lock); |
| 2914 | 2977 | ||
| 2915 | if (iter->cpu_file == TRACE_PIPE_ALL_CPU) | ||
| 2916 | cpumask_clear(tracing_reader_cpumask); | ||
| 2917 | else | ||
| 2918 | cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); | ||
| 2919 | |||
| 2920 | |||
| 2921 | if (iter->trace->pipe_close) | 2978 | if (iter->trace->pipe_close) |
| 2922 | iter->trace->pipe_close(iter); | 2979 | iter->trace->pipe_close(iter); |
| 2923 | 2980 | ||
| @@ -3079,6 +3136,7 @@ waitagain: | |||
| 3079 | iter->pos = -1; | 3136 | iter->pos = -1; |
| 3080 | 3137 | ||
| 3081 | trace_event_read_lock(); | 3138 | trace_event_read_lock(); |
| 3139 | trace_access_lock(iter->cpu_file); | ||
| 3082 | while (find_next_entry_inc(iter) != NULL) { | 3140 | while (find_next_entry_inc(iter) != NULL) { |
| 3083 | enum print_line_t ret; | 3141 | enum print_line_t ret; |
| 3084 | int len = iter->seq.len; | 3142 | int len = iter->seq.len; |
| @@ -3095,6 +3153,7 @@ waitagain: | |||
| 3095 | if (iter->seq.len >= cnt) | 3153 | if (iter->seq.len >= cnt) |
| 3096 | break; | 3154 | break; |
| 3097 | } | 3155 | } |
| 3156 | trace_access_unlock(iter->cpu_file); | ||
| 3098 | trace_event_read_unlock(); | 3157 | trace_event_read_unlock(); |
| 3099 | 3158 | ||
| 3100 | /* Now copy what we have to the user */ | 3159 | /* Now copy what we have to the user */ |
| @@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 3220 | } | 3279 | } |
| 3221 | 3280 | ||
| 3222 | trace_event_read_lock(); | 3281 | trace_event_read_lock(); |
| 3282 | trace_access_lock(iter->cpu_file); | ||
| 3223 | 3283 | ||
| 3224 | /* Fill as many pages as possible. */ | 3284 | /* Fill as many pages as possible. */ |
| 3225 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { | 3285 | for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { |
| @@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 3243 | trace_seq_init(&iter->seq); | 3303 | trace_seq_init(&iter->seq); |
| 3244 | } | 3304 | } |
| 3245 | 3305 | ||
| 3306 | trace_access_unlock(iter->cpu_file); | ||
| 3246 | trace_event_read_unlock(); | 3307 | trace_event_read_unlock(); |
| 3247 | mutex_unlock(&iter->mutex); | 3308 | mutex_unlock(&iter->mutex); |
| 3248 | 3309 | ||
| @@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 3544 | 3605 | ||
| 3545 | info->read = 0; | 3606 | info->read = 0; |
| 3546 | 3607 | ||
| 3608 | trace_access_lock(info->cpu); | ||
| 3547 | ret = ring_buffer_read_page(info->tr->buffer, | 3609 | ret = ring_buffer_read_page(info->tr->buffer, |
| 3548 | &info->spare, | 3610 | &info->spare, |
| 3549 | count, | 3611 | count, |
| 3550 | info->cpu, 0); | 3612 | info->cpu, 0); |
| 3613 | trace_access_unlock(info->cpu); | ||
| 3551 | if (ret < 0) | 3614 | if (ret < 0) |
| 3552 | return 0; | 3615 | return 0; |
| 3553 | 3616 | ||
| @@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 3675 | len &= PAGE_MASK; | 3738 | len &= PAGE_MASK; |
| 3676 | } | 3739 | } |
| 3677 | 3740 | ||
| 3741 | trace_access_lock(info->cpu); | ||
| 3678 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3742 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
| 3679 | 3743 | ||
| 3680 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { | 3744 | for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { |
| @@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 3722 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); | 3786 | entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); |
| 3723 | } | 3787 | } |
| 3724 | 3788 | ||
| 3789 | trace_access_unlock(info->cpu); | ||
| 3725 | spd.nr_pages = i; | 3790 | spd.nr_pages = i; |
| 3726 | 3791 | ||
| 3727 | /* did we read anything? */ | 3792 | /* did we read anything? */ |
| @@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void) | |||
| 4158 | struct dentry *d_tracer; | 4223 | struct dentry *d_tracer; |
| 4159 | int cpu; | 4224 | int cpu; |
| 4160 | 4225 | ||
| 4226 | trace_access_lock_init(); | ||
| 4227 | |||
| 4161 | d_tracer = tracing_init_dentry(); | 4228 | d_tracer = tracing_init_dentry(); |
| 4162 | 4229 | ||
| 4163 | trace_create_file("tracing_enabled", 0644, d_tracer, | 4230 | trace_create_file("tracing_enabled", 0644, d_tracer, |
| @@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void) | |||
| 4392 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 4459 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
| 4393 | goto out_free_buffer_mask; | 4460 | goto out_free_buffer_mask; |
| 4394 | 4461 | ||
| 4395 | if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) | ||
| 4396 | goto out_free_tracing_cpumask; | ||
| 4397 | |||
| 4398 | /* To save memory, keep the ring buffer size to its minimum */ | 4462 | /* To save memory, keep the ring buffer size to its minimum */ |
| 4399 | if (ring_buffer_expanded) | 4463 | if (ring_buffer_expanded) |
| 4400 | ring_buf_size = trace_buf_size; | 4464 | ring_buf_size = trace_buf_size; |
| @@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void) | |||
| 4452 | return 0; | 4516 | return 0; |
| 4453 | 4517 | ||
| 4454 | out_free_cpumask: | 4518 | out_free_cpumask: |
| 4455 | free_cpumask_var(tracing_reader_cpumask); | ||
| 4456 | out_free_tracing_cpumask: | ||
| 4457 | free_cpumask_var(tracing_cpumask); | 4519 | free_cpumask_var(tracing_cpumask); |
| 4458 | out_free_buffer_mask: | 4520 | out_free_buffer_mask: |
| 4459 | free_cpumask_var(tracing_buffer_mask); | 4521 | free_cpumask_var(tracing_buffer_mask); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..fd05bcaf91b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); | |||
| 497 | #ifdef CONFIG_DYNAMIC_FTRACE | 497 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 498 | /* TODO: make this variable */ | 498 | /* TODO: make this variable */ |
| 499 | #define FTRACE_GRAPH_MAX_FUNCS 32 | 499 | #define FTRACE_GRAPH_MAX_FUNCS 32 |
| 500 | extern int ftrace_graph_filter_enabled; | ||
| 500 | extern int ftrace_graph_count; | 501 | extern int ftrace_graph_count; |
| 501 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; | 502 | extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; |
| 502 | 503 | ||
| @@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
| 504 | { | 505 | { |
| 505 | int i; | 506 | int i; |
| 506 | 507 | ||
| 507 | if (!ftrace_graph_count || test_tsk_trace_graph(current)) | 508 | if (!ftrace_graph_filter_enabled) |
| 508 | return 1; | 509 | return 1; |
| 509 | 510 | ||
| 510 | for (i = 0; i < ftrace_graph_count; i++) { | 511 | for (i = 0; i < ftrace_graph_count; i++) { |
| @@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
| 791 | 792 | ||
| 792 | #undef FTRACE_ENTRY | 793 | #undef FTRACE_ENTRY |
| 793 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ | 794 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ |
| 794 | extern struct ftrace_event_call event_##call; | 795 | extern struct ftrace_event_call \ |
| 796 | __attribute__((__aligned__(4))) event_##call; | ||
| 795 | #undef FTRACE_ENTRY_DUP | 797 | #undef FTRACE_ENTRY_DUP |
| 796 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ | 798 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ |
| 797 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 799 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88c..b9bc4d470177 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
| @@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) | |||
| 307 | return -1; | 307 | return -1; |
| 308 | if (percent_a > percent_b) | 308 | if (percent_a > percent_b) |
| 309 | return 1; | 309 | return 1; |
| 310 | else | 310 | |
| 311 | return 0; | 311 | if (a->incorrect < b->incorrect) |
| 312 | return -1; | ||
| 313 | if (a->incorrect > b->incorrect) | ||
| 314 | return 1; | ||
| 315 | |||
| 316 | /* | ||
| 317 | * Since the above shows worse (incorrect) cases | ||
| 318 | * first, we continue that by showing best (correct) | ||
| 319 | * cases last. | ||
| 320 | */ | ||
| 321 | if (a->correct > b->correct) | ||
| 322 | return -1; | ||
| 323 | if (a->correct < b->correct) | ||
| 324 | return 1; | ||
| 325 | |||
| 326 | return 0; | ||
| 312 | } | 327 | } |
| 313 | 328 | ||
| 314 | static struct tracer_stat annotated_branch_stats = { | 329 | static struct tracer_stat annotated_branch_stats = { |
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 9e25573242cf..f0d693005075 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c | |||
| @@ -6,14 +6,12 @@ | |||
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
| 9 | #include <linux/kprobes.h> | ||
| 9 | #include "trace.h" | 10 | #include "trace.h" |
| 10 | 11 | ||
| 11 | 12 | ||
| 12 | char *perf_trace_buf; | 13 | static char *perf_trace_buf; |
| 13 | EXPORT_SYMBOL_GPL(perf_trace_buf); | 14 | static char *perf_trace_buf_nmi; |
| 14 | |||
| 15 | char *perf_trace_buf_nmi; | ||
| 16 | EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); | ||
| 17 | 15 | ||
| 18 | typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; | 16 | typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; |
| 19 | 17 | ||
| @@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id) | |||
| 120 | } | 118 | } |
| 121 | mutex_unlock(&event_mutex); | 119 | mutex_unlock(&event_mutex); |
| 122 | } | 120 | } |
| 121 | |||
| 122 | __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, | ||
| 123 | int *rctxp, unsigned long *irq_flags) | ||
| 124 | { | ||
| 125 | struct trace_entry *entry; | ||
| 126 | char *trace_buf, *raw_data; | ||
| 127 | int pc, cpu; | ||
| 128 | |||
| 129 | pc = preempt_count(); | ||
| 130 | |||
| 131 | /* Protect the per cpu buffer, begin the rcu read side */ | ||
| 132 | local_irq_save(*irq_flags); | ||
| 133 | |||
| 134 | *rctxp = perf_swevent_get_recursion_context(); | ||
| 135 | if (*rctxp < 0) | ||
| 136 | goto err_recursion; | ||
| 137 | |||
| 138 | cpu = smp_processor_id(); | ||
| 139 | |||
| 140 | if (in_nmi()) | ||
| 141 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | ||
| 142 | else | ||
| 143 | trace_buf = rcu_dereference(perf_trace_buf); | ||
| 144 | |||
| 145 | if (!trace_buf) | ||
| 146 | goto err; | ||
| 147 | |||
| 148 | raw_data = per_cpu_ptr(trace_buf, cpu); | ||
| 149 | |||
| 150 | /* zero the dead bytes from align to not leak stack to user */ | ||
| 151 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
| 152 | |||
| 153 | entry = (struct trace_entry *)raw_data; | ||
| 154 | tracing_generic_entry_update(entry, *irq_flags, pc); | ||
| 155 | entry->type = type; | ||
| 156 | |||
| 157 | return raw_data; | ||
| 158 | err: | ||
| 159 | perf_swevent_put_recursion_context(*rctxp); | ||
| 160 | err_recursion: | ||
| 161 | local_irq_restore(*irq_flags); | ||
| 162 | return NULL; | ||
| 163 | } | ||
| 164 | EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..3f972ad98d04 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
| 60 | return 0; | 60 | return 0; |
| 61 | 61 | ||
| 62 | err: | 62 | err: |
| 63 | if (field) { | 63 | if (field) |
| 64 | kfree(field->name); | 64 | kfree(field->name); |
| 65 | kfree(field->type); | ||
| 66 | } | ||
| 67 | kfree(field); | 65 | kfree(field); |
| 68 | 66 | ||
| 69 | return -ENOMEM; | 67 | return -ENOMEM; |
| @@ -520,41 +518,16 @@ out: | |||
| 520 | return ret; | 518 | return ret; |
| 521 | } | 519 | } |
| 522 | 520 | ||
| 523 | extern char *__bad_type_size(void); | ||
| 524 | |||
| 525 | #undef FIELD | ||
| 526 | #define FIELD(type, name) \ | ||
| 527 | sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ | ||
| 528 | #type, "common_" #name, offsetof(typeof(field), name), \ | ||
| 529 | sizeof(field.name), is_signed_type(type) | ||
| 530 | |||
| 531 | static int trace_write_header(struct trace_seq *s) | ||
| 532 | { | ||
| 533 | struct trace_entry field; | ||
| 534 | |||
| 535 | /* struct trace_entry */ | ||
| 536 | return trace_seq_printf(s, | ||
| 537 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
| 538 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
| 539 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
| 540 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
| 541 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" | ||
| 542 | "\n", | ||
| 543 | FIELD(unsigned short, type), | ||
| 544 | FIELD(unsigned char, flags), | ||
| 545 | FIELD(unsigned char, preempt_count), | ||
| 546 | FIELD(int, pid), | ||
| 547 | FIELD(int, lock_depth)); | ||
| 548 | } | ||
| 549 | |||
| 550 | static ssize_t | 521 | static ssize_t |
| 551 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | 522 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, |
| 552 | loff_t *ppos) | 523 | loff_t *ppos) |
| 553 | { | 524 | { |
| 554 | struct ftrace_event_call *call = filp->private_data; | 525 | struct ftrace_event_call *call = filp->private_data; |
| 526 | struct ftrace_event_field *field; | ||
| 555 | struct trace_seq *s; | 527 | struct trace_seq *s; |
| 528 | int common_field_count = 5; | ||
| 556 | char *buf; | 529 | char *buf; |
| 557 | int r; | 530 | int r = 0; |
| 558 | 531 | ||
| 559 | if (*ppos) | 532 | if (*ppos) |
| 560 | return 0; | 533 | return 0; |
| @@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 565 | 538 | ||
| 566 | trace_seq_init(s); | 539 | trace_seq_init(s); |
| 567 | 540 | ||
| 568 | /* If any of the first writes fail, so will the show_format. */ | ||
| 569 | |||
| 570 | trace_seq_printf(s, "name: %s\n", call->name); | 541 | trace_seq_printf(s, "name: %s\n", call->name); |
| 571 | trace_seq_printf(s, "ID: %d\n", call->id); | 542 | trace_seq_printf(s, "ID: %d\n", call->id); |
| 572 | trace_seq_printf(s, "format:\n"); | 543 | trace_seq_printf(s, "format:\n"); |
| 573 | trace_write_header(s); | ||
| 574 | 544 | ||
| 575 | r = call->show_format(call, s); | 545 | list_for_each_entry_reverse(field, &call->fields, link) { |
| 546 | /* | ||
| 547 | * Smartly shows the array type(except dynamic array). | ||
| 548 | * Normal: | ||
| 549 | * field:TYPE VAR | ||
| 550 | * If TYPE := TYPE[LEN], it is shown: | ||
| 551 | * field:TYPE VAR[LEN] | ||
| 552 | */ | ||
| 553 | const char *array_descriptor = strchr(field->type, '['); | ||
| 554 | |||
| 555 | if (!strncmp(field->type, "__data_loc", 10)) | ||
| 556 | array_descriptor = NULL; | ||
| 557 | |||
| 558 | if (!array_descriptor) { | ||
| 559 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | ||
| 560 | "\tsize:%u;\tsigned:%d;\n", | ||
| 561 | field->type, field->name, field->offset, | ||
| 562 | field->size, !!field->is_signed); | ||
| 563 | } else { | ||
| 564 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | ||
| 565 | "\tsize:%u;\tsigned:%d;\n", | ||
| 566 | (int)(array_descriptor - field->type), | ||
| 567 | field->type, field->name, | ||
| 568 | array_descriptor, field->offset, | ||
| 569 | field->size, !!field->is_signed); | ||
| 570 | } | ||
| 571 | |||
| 572 | if (--common_field_count == 0) | ||
| 573 | r = trace_seq_printf(s, "\n"); | ||
| 574 | |||
| 575 | if (!r) | ||
| 576 | break; | ||
| 577 | } | ||
| 578 | |||
| 579 | if (r) | ||
| 580 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | ||
| 581 | call->print_fmt); | ||
| 582 | |||
| 576 | if (!r) { | 583 | if (!r) { |
| 577 | /* | 584 | /* |
| 578 | * ug! The format output is bigger than a PAGE!! | 585 | * ug! The format output is bigger than a PAGE!! |
| @@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
| 948 | filter); | 955 | filter); |
| 949 | } | 956 | } |
| 950 | 957 | ||
| 951 | /* A trace may not want to export its format */ | ||
| 952 | if (!call->show_format) | ||
| 953 | return 0; | ||
| 954 | |||
| 955 | trace_create_file("format", 0444, call->dir, call, | 958 | trace_create_file("format", 0444, call->dir, call, |
| 956 | format); | 959 | format); |
| 957 | 960 | ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e42af9aad69f..4615f62a04f1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -1371,7 +1371,7 @@ out_unlock: | |||
| 1371 | return err; | 1371 | return err; |
| 1372 | } | 1372 | } |
| 1373 | 1373 | ||
| 1374 | #ifdef CONFIG_EVENT_PROFILE | 1374 | #ifdef CONFIG_PERF_EVENTS |
| 1375 | 1375 | ||
| 1376 | void ftrace_profile_free_filter(struct perf_event *event) | 1376 | void ftrace_profile_free_filter(struct perf_event *event) |
| 1377 | { | 1377 | { |
| @@ -1439,5 +1439,5 @@ out_unlock: | |||
| 1439 | return err; | 1439 | return err; |
| 1440 | } | 1440 | } |
| 1441 | 1441 | ||
| 1442 | #endif /* CONFIG_EVENT_PROFILE */ | 1442 | #endif /* CONFIG_PERF_EVENTS */ |
| 1443 | 1443 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
| 62 | 62 | ||
| 63 | #include "trace_entries.h" | 63 | #include "trace_entries.h" |
| 64 | 64 | ||
| 65 | |||
| 66 | #undef __field | ||
| 67 | #define __field(type, item) \ | ||
| 68 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
| 69 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
| 70 | offsetof(typeof(field), item), \ | ||
| 71 | sizeof(field.item), is_signed_type(type)); \ | ||
| 72 | if (!ret) \ | ||
| 73 | return 0; | ||
| 74 | |||
| 75 | #undef __field_desc | ||
| 76 | #define __field_desc(type, container, item) \ | ||
| 77 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
| 78 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
| 79 | offsetof(typeof(field), container.item), \ | ||
| 80 | sizeof(field.container.item), \ | ||
| 81 | is_signed_type(type)); \ | ||
| 82 | if (!ret) \ | ||
| 83 | return 0; | ||
| 84 | |||
| 85 | #undef __array | ||
| 86 | #define __array(type, item, len) \ | ||
| 87 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
| 88 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
| 89 | offsetof(typeof(field), item), \ | ||
| 90 | sizeof(field.item), is_signed_type(type)); \ | ||
| 91 | if (!ret) \ | ||
| 92 | return 0; | ||
| 93 | |||
| 94 | #undef __array_desc | ||
| 95 | #define __array_desc(type, container, item, len) \ | ||
| 96 | ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ | ||
| 97 | "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ | ||
| 98 | offsetof(typeof(field), container.item), \ | ||
| 99 | sizeof(field.container.item), \ | ||
| 100 | is_signed_type(type)); \ | ||
| 101 | if (!ret) \ | ||
| 102 | return 0; | ||
| 103 | |||
| 104 | #undef __dynamic_array | ||
| 105 | #define __dynamic_array(type, item) \ | ||
| 106 | ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ | ||
| 107 | "offset:%zu;\tsize:0;\tsigned:%u;\n", \ | ||
| 108 | offsetof(typeof(field), item), \ | ||
| 109 | is_signed_type(type)); \ | ||
| 110 | if (!ret) \ | ||
| 111 | return 0; | ||
| 112 | |||
| 113 | #undef F_printk | ||
| 114 | #define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) | ||
| 115 | |||
| 116 | #undef __entry | ||
| 117 | #define __entry REC | ||
| 118 | |||
| 119 | #undef FTRACE_ENTRY | ||
| 120 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | ||
| 121 | static int \ | ||
| 122 | ftrace_format_##name(struct ftrace_event_call *unused, \ | ||
| 123 | struct trace_seq *s) \ | ||
| 124 | { \ | ||
| 125 | struct struct_name field __attribute__((unused)); \ | ||
| 126 | int ret = 0; \ | ||
| 127 | \ | ||
| 128 | tstruct; \ | ||
| 129 | \ | ||
| 130 | trace_seq_printf(s, "\nprint fmt: " print); \ | ||
| 131 | \ | ||
| 132 | return ret; \ | ||
| 133 | } | ||
| 134 | |||
| 135 | #include "trace_entries.h" | ||
| 136 | |||
| 137 | #undef __field | 65 | #undef __field |
| 138 | #define __field(type, item) \ | 66 | #define __field(type, item) \ |
| 139 | ret = trace_define_field(event_call, #type, #item, \ | 67 | ret = trace_define_field(event_call, #type, #item, \ |
| @@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ | |||
| 175 | return ret; | 103 | return ret; |
| 176 | 104 | ||
| 177 | #undef __dynamic_array | 105 | #undef __dynamic_array |
| 178 | #define __dynamic_array(type, item) | 106 | #define __dynamic_array(type, item) \ |
| 107 | ret = trace_define_field(event_call, #type, #item, \ | ||
| 108 | offsetof(typeof(field), item), \ | ||
| 109 | 0, is_signed_type(type), FILTER_OTHER);\ | ||
| 110 | if (ret) \ | ||
| 111 | return ret; | ||
| 179 | 112 | ||
| 180 | #undef FTRACE_ENTRY | 113 | #undef FTRACE_ENTRY |
| 181 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 114 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ |
| @@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
| 198 | return 0; | 131 | return 0; |
| 199 | } | 132 | } |
| 200 | 133 | ||
| 134 | #undef __entry | ||
| 135 | #define __entry REC | ||
| 136 | |||
| 201 | #undef __field | 137 | #undef __field |
| 202 | #define __field(type, item) | 138 | #define __field(type, item) |
| 203 | 139 | ||
| @@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
| 213 | #undef __dynamic_array | 149 | #undef __dynamic_array |
| 214 | #define __dynamic_array(type, item) | 150 | #define __dynamic_array(type, item) |
| 215 | 151 | ||
| 152 | #undef F_printk | ||
| 153 | #define F_printk(fmt, args...) #fmt ", " __stringify(args) | ||
| 154 | |||
| 216 | #undef FTRACE_ENTRY | 155 | #undef FTRACE_ENTRY |
| 217 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ | 156 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ |
| 218 | \ | 157 | \ |
| @@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ | |||
| 223 | .id = type, \ | 162 | .id = type, \ |
| 224 | .system = __stringify(TRACE_SYSTEM), \ | 163 | .system = __stringify(TRACE_SYSTEM), \ |
| 225 | .raw_init = ftrace_raw_init_event, \ | 164 | .raw_init = ftrace_raw_init_event, \ |
| 226 | .show_format = ftrace_format_##call, \ | 165 | .print_fmt = print, \ |
| 227 | .define_fields = ftrace_define_fields_##call, \ | 166 | .define_fields = ftrace_define_fields_##call, \ |
| 228 | }; \ | 167 | }; \ |
| 229 | 168 | ||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37cf..3fc2a575664f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -18,6 +18,7 @@ struct fgraph_cpu_data { | |||
| 18 | pid_t last_pid; | 18 | pid_t last_pid; |
| 19 | int depth; | 19 | int depth; |
| 20 | int ignore; | 20 | int ignore; |
| 21 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | ||
| 21 | }; | 22 | }; |
| 22 | 23 | ||
| 23 | struct fgraph_data { | 24 | struct fgraph_data { |
| @@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr, | |||
| 187 | struct ring_buffer *buffer = tr->buffer; | 188 | struct ring_buffer *buffer = tr->buffer; |
| 188 | struct ftrace_graph_ent_entry *entry; | 189 | struct ftrace_graph_ent_entry *entry; |
| 189 | 190 | ||
| 190 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 191 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 191 | return 0; | 192 | return 0; |
| 192 | 193 | ||
| 193 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, | 194 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, |
| @@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 212 | int cpu; | 213 | int cpu; |
| 213 | int pc; | 214 | int pc; |
| 214 | 215 | ||
| 215 | if (unlikely(!tr)) | ||
| 216 | return 0; | ||
| 217 | |||
| 218 | if (!ftrace_trace_task(current)) | 216 | if (!ftrace_trace_task(current)) |
| 219 | return 0; | 217 | return 0; |
| 220 | 218 | ||
| 221 | if (!ftrace_graph_addr(trace->func)) | 219 | /* trace it when it is-nested-in or is a function enabled. */ |
| 220 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | ||
| 222 | return 0; | 221 | return 0; |
| 223 | 222 | ||
| 224 | local_irq_save(flags); | 223 | local_irq_save(flags); |
| @@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 231 | } else { | 230 | } else { |
| 232 | ret = 0; | 231 | ret = 0; |
| 233 | } | 232 | } |
| 234 | /* Only do the atomic if it is not already set */ | ||
| 235 | if (!test_tsk_trace_graph(current)) | ||
| 236 | set_tsk_trace_graph(current); | ||
| 237 | 233 | ||
| 238 | atomic_dec(&data->disabled); | 234 | atomic_dec(&data->disabled); |
| 239 | local_irq_restore(flags); | 235 | local_irq_restore(flags); |
| @@ -251,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr, | |||
| 251 | struct ring_buffer *buffer = tr->buffer; | 247 | struct ring_buffer *buffer = tr->buffer; |
| 252 | struct ftrace_graph_ret_entry *entry; | 248 | struct ftrace_graph_ret_entry *entry; |
| 253 | 249 | ||
| 254 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 250 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 255 | return; | 251 | return; |
| 256 | 252 | ||
| 257 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, | 253 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, |
| @@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace) | |||
| 281 | pc = preempt_count(); | 277 | pc = preempt_count(); |
| 282 | __trace_graph_return(tr, trace, flags, pc); | 278 | __trace_graph_return(tr, trace, flags, pc); |
| 283 | } | 279 | } |
| 284 | if (!trace->depth) | ||
| 285 | clear_tsk_trace_graph(current); | ||
| 286 | atomic_dec(&data->disabled); | 280 | atomic_dec(&data->disabled); |
| 287 | local_irq_restore(flags); | 281 | local_irq_restore(flags); |
| 288 | } | 282 | } |
| 289 | 283 | ||
| 284 | void set_graph_array(struct trace_array *tr) | ||
| 285 | { | ||
| 286 | graph_array = tr; | ||
| 287 | |||
| 288 | /* Make graph_array visible before we start tracing */ | ||
| 289 | |||
| 290 | smp_mb(); | ||
| 291 | } | ||
| 292 | |||
| 290 | static int graph_trace_init(struct trace_array *tr) | 293 | static int graph_trace_init(struct trace_array *tr) |
| 291 | { | 294 | { |
| 292 | int ret; | 295 | int ret; |
| 293 | 296 | ||
| 294 | graph_array = tr; | 297 | set_graph_array(tr); |
| 295 | ret = register_ftrace_graph(&trace_graph_return, | 298 | ret = register_ftrace_graph(&trace_graph_return, |
| 296 | &trace_graph_entry); | 299 | &trace_graph_entry); |
| 297 | if (ret) | 300 | if (ret) |
| @@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr) | |||
| 301 | return 0; | 304 | return 0; |
| 302 | } | 305 | } |
| 303 | 306 | ||
| 304 | void set_graph_array(struct trace_array *tr) | ||
| 305 | { | ||
| 306 | graph_array = tr; | ||
| 307 | } | ||
| 308 | |||
| 309 | static void graph_trace_reset(struct trace_array *tr) | 307 | static void graph_trace_reset(struct trace_array *tr) |
| 310 | { | 308 | { |
| 311 | tracing_stop_cmdline_record(); | 309 | tracing_stop_cmdline_record(); |
| @@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 673 | duration = graph_ret->rettime - graph_ret->calltime; | 671 | duration = graph_ret->rettime - graph_ret->calltime; |
| 674 | 672 | ||
| 675 | if (data) { | 673 | if (data) { |
| 674 | struct fgraph_cpu_data *cpu_data; | ||
| 676 | int cpu = iter->cpu; | 675 | int cpu = iter->cpu; |
| 677 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 676 | |
| 677 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
| 678 | 678 | ||
| 679 | /* | 679 | /* |
| 680 | * Comments display at + 1 to depth. Since | 680 | * Comments display at + 1 to depth. Since |
| 681 | * this is a leaf function, keep the comments | 681 | * this is a leaf function, keep the comments |
| 682 | * equal to this depth. | 682 | * equal to this depth. |
| 683 | */ | 683 | */ |
| 684 | *depth = call->depth - 1; | 684 | cpu_data->depth = call->depth - 1; |
| 685 | |||
| 686 | /* No need to keep this function around for this depth */ | ||
| 687 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
| 688 | cpu_data->enter_funcs[call->depth] = 0; | ||
| 685 | } | 689 | } |
| 686 | 690 | ||
| 687 | /* Overhead */ | 691 | /* Overhead */ |
| @@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 721 | int i; | 725 | int i; |
| 722 | 726 | ||
| 723 | if (data) { | 727 | if (data) { |
| 728 | struct fgraph_cpu_data *cpu_data; | ||
| 724 | int cpu = iter->cpu; | 729 | int cpu = iter->cpu; |
| 725 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | ||
| 726 | 730 | ||
| 727 | *depth = call->depth; | 731 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); |
| 732 | cpu_data->depth = call->depth; | ||
| 733 | |||
| 734 | /* Save this function pointer to see if the exit matches */ | ||
| 735 | if (call->depth < FTRACE_RETFUNC_DEPTH) | ||
| 736 | cpu_data->enter_funcs[call->depth] = call->func; | ||
| 728 | } | 737 | } |
| 729 | 738 | ||
| 730 | /* No overhead */ | 739 | /* No overhead */ |
| @@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 854 | struct fgraph_data *data = iter->private; | 863 | struct fgraph_data *data = iter->private; |
| 855 | pid_t pid = ent->pid; | 864 | pid_t pid = ent->pid; |
| 856 | int cpu = iter->cpu; | 865 | int cpu = iter->cpu; |
| 866 | int func_match = 1; | ||
| 857 | int ret; | 867 | int ret; |
| 858 | int i; | 868 | int i; |
| 859 | 869 | ||
| 860 | if (data) { | 870 | if (data) { |
| 871 | struct fgraph_cpu_data *cpu_data; | ||
| 861 | int cpu = iter->cpu; | 872 | int cpu = iter->cpu; |
| 862 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 873 | |
| 874 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | ||
| 863 | 875 | ||
| 864 | /* | 876 | /* |
| 865 | * Comments display at + 1 to depth. This is the | 877 | * Comments display at + 1 to depth. This is the |
| 866 | * return from a function, we now want the comments | 878 | * return from a function, we now want the comments |
| 867 | * to display at the same level of the bracket. | 879 | * to display at the same level of the bracket. |
| 868 | */ | 880 | */ |
| 869 | *depth = trace->depth - 1; | 881 | cpu_data->depth = trace->depth - 1; |
| 882 | |||
| 883 | if (trace->depth < FTRACE_RETFUNC_DEPTH) { | ||
| 884 | if (cpu_data->enter_funcs[trace->depth] != trace->func) | ||
| 885 | func_match = 0; | ||
| 886 | cpu_data->enter_funcs[trace->depth] = 0; | ||
| 887 | } | ||
| 870 | } | 888 | } |
| 871 | 889 | ||
| 872 | if (print_graph_prologue(iter, s, 0, 0)) | 890 | if (print_graph_prologue(iter, s, 0, 0)) |
| @@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 891 | return TRACE_TYPE_PARTIAL_LINE; | 909 | return TRACE_TYPE_PARTIAL_LINE; |
| 892 | } | 910 | } |
| 893 | 911 | ||
| 894 | ret = trace_seq_printf(s, "}\n"); | 912 | /* |
| 895 | if (!ret) | 913 | * If the return function does not have a matching entry, |
| 896 | return TRACE_TYPE_PARTIAL_LINE; | 914 | * then the entry was lost. Instead of just printing |
| 915 | * the '}' and letting the user guess what function this | ||
| 916 | * belongs to, write out the function name. | ||
| 917 | */ | ||
| 918 | if (func_match) { | ||
| 919 | ret = trace_seq_printf(s, "}\n"); | ||
| 920 | if (!ret) | ||
| 921 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 922 | } else { | ||
| 923 | ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); | ||
| 924 | if (!ret) | ||
| 925 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 926 | } | ||
| 897 | 927 | ||
| 898 | /* Overrun */ | 928 | /* Overrun */ |
| 899 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { | 929 | if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c96..505c92273b1a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) | |||
| 91 | return retval; | 91 | return retval; |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) | ||
| 95 | { | ||
| 96 | return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); | ||
| 97 | } | ||
| 98 | |||
| 99 | static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, | 94 | static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, |
| 100 | void *dummy) | 95 | void *dummy) |
| 101 | { | 96 | { |
| @@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) | |||
| 231 | { | 226 | { |
| 232 | int ret = -EINVAL; | 227 | int ret = -EINVAL; |
| 233 | 228 | ||
| 234 | if (ff->func == fetch_argument) | 229 | if (ff->func == fetch_register) { |
| 235 | ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); | ||
| 236 | else if (ff->func == fetch_register) { | ||
| 237 | const char *name; | 230 | const char *name; |
| 238 | name = regs_query_register_name((unsigned int)((long)ff->data)); | 231 | name = regs_query_register_name((unsigned int)((long)ff->data)); |
| 239 | ret = snprintf(buf, n, "%%%s", name); | 232 | ret = snprintf(buf, n, "%%%s", name); |
| @@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) | |||
| 489 | } | 482 | } |
| 490 | } else | 483 | } else |
| 491 | ret = -EINVAL; | 484 | ret = -EINVAL; |
| 492 | } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { | ||
| 493 | ret = strict_strtoul(arg + 3, 10, &param); | ||
| 494 | if (ret || param > PARAM_MAX_ARGS) | ||
| 495 | ret = -EINVAL; | ||
| 496 | else { | ||
| 497 | ff->func = fetch_argument; | ||
| 498 | ff->data = (void *)param; | ||
| 499 | } | ||
| 500 | } else | 485 | } else |
| 501 | ret = -EINVAL; | 486 | ret = -EINVAL; |
| 502 | return ret; | 487 | return ret; |
| @@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv) | |||
| 611 | * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] | 596 | * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] |
| 612 | * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] | 597 | * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] |
| 613 | * Fetch args: | 598 | * Fetch args: |
| 614 | * $argN : fetch Nth of function argument. (N:0-) | ||
| 615 | * $retval : fetch return value | 599 | * $retval : fetch return value |
| 616 | * $stack : fetch stack address | 600 | * $stack : fetch stack address |
| 617 | * $stackN : fetch Nth of stack (N:0-) | 601 | * $stackN : fetch Nth of stack (N:0-) |
| @@ -651,12 +635,12 @@ static int create_trace_probe(int argc, char **argv) | |||
| 651 | event = strchr(group, '/') + 1; | 635 | event = strchr(group, '/') + 1; |
| 652 | event[-1] = '\0'; | 636 | event[-1] = '\0'; |
| 653 | if (strlen(group) == 0) { | 637 | if (strlen(group) == 0) { |
| 654 | pr_info("Group name is not specifiled\n"); | 638 | pr_info("Group name is not specified\n"); |
| 655 | return -EINVAL; | 639 | return -EINVAL; |
| 656 | } | 640 | } |
| 657 | } | 641 | } |
| 658 | if (strlen(event) == 0) { | 642 | if (strlen(event) == 0) { |
| 659 | pr_info("Event name is not specifiled\n"); | 643 | pr_info("Event name is not specified\n"); |
| 660 | return -EINVAL; | 644 | return -EINVAL; |
| 661 | } | 645 | } |
| 662 | } | 646 | } |
| @@ -689,7 +673,7 @@ static int create_trace_probe(int argc, char **argv) | |||
| 689 | return -EINVAL; | 673 | return -EINVAL; |
| 690 | } | 674 | } |
| 691 | /* an address specified */ | 675 | /* an address specified */ |
| 692 | ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); | 676 | ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); |
| 693 | if (ret) { | 677 | if (ret) { |
| 694 | pr_info("Failed to parse address.\n"); | 678 | pr_info("Failed to parse address.\n"); |
| 695 | return ret; | 679 | return ret; |
| @@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = { | |||
| 958 | }; | 942 | }; |
| 959 | 943 | ||
| 960 | /* Kprobe handler */ | 944 | /* Kprobe handler */ |
| 961 | static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | 945 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) |
| 962 | { | 946 | { |
| 963 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | 947 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); |
| 964 | struct kprobe_trace_entry *entry; | 948 | struct kprobe_trace_entry *entry; |
| @@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 978 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, | 962 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, |
| 979 | irq_flags, pc); | 963 | irq_flags, pc); |
| 980 | if (!event) | 964 | if (!event) |
| 981 | return 0; | 965 | return; |
| 982 | 966 | ||
| 983 | entry = ring_buffer_event_data(event); | 967 | entry = ring_buffer_event_data(event); |
| 984 | entry->nargs = tp->nr_args; | 968 | entry->nargs = tp->nr_args; |
| @@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 988 | 972 | ||
| 989 | if (!filter_current_check_discard(buffer, call, entry, event)) | 973 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 990 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 974 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
| 991 | return 0; | ||
| 992 | } | 975 | } |
| 993 | 976 | ||
| 994 | /* Kretprobe handler */ | 977 | /* Kretprobe handler */ |
| 995 | static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, | 978 | static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, |
| 996 | struct pt_regs *regs) | 979 | struct pt_regs *regs) |
| 997 | { | 980 | { |
| 998 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | 981 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); |
| @@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, | |||
| 1011 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, | 994 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, |
| 1012 | irq_flags, pc); | 995 | irq_flags, pc); |
| 1013 | if (!event) | 996 | if (!event) |
| 1014 | return 0; | 997 | return; |
| 1015 | 998 | ||
| 1016 | entry = ring_buffer_event_data(event); | 999 | entry = ring_buffer_event_data(event); |
| 1017 | entry->nargs = tp->nr_args; | 1000 | entry->nargs = tp->nr_args; |
| @@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, | |||
| 1022 | 1005 | ||
| 1023 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1006 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 1024 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1007 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
| 1025 | |||
| 1026 | return 0; | ||
| 1027 | } | 1008 | } |
| 1028 | 1009 | ||
| 1029 | /* Event entry printers */ | 1010 | /* Event entry printers */ |
| @@ -1174,213 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
| 1174 | return 0; | 1155 | return 0; |
| 1175 | } | 1156 | } |
| 1176 | 1157 | ||
| 1177 | static int __probe_event_show_format(struct trace_seq *s, | 1158 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) |
| 1178 | struct trace_probe *tp, const char *fmt, | ||
| 1179 | const char *arg) | ||
| 1180 | { | 1159 | { |
| 1181 | int i; | 1160 | int i; |
| 1161 | int pos = 0; | ||
| 1182 | 1162 | ||
| 1183 | /* Show format */ | 1163 | const char *fmt, *arg; |
| 1184 | if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) | ||
| 1185 | return 0; | ||
| 1186 | 1164 | ||
| 1187 | for (i = 0; i < tp->nr_args; i++) | 1165 | if (!probe_is_return(tp)) { |
| 1188 | if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) | 1166 | fmt = "(%lx)"; |
| 1189 | return 0; | 1167 | arg = "REC->" FIELD_STRING_IP; |
| 1168 | } else { | ||
| 1169 | fmt = "(%lx <- %lx)"; | ||
| 1170 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
| 1171 | } | ||
| 1190 | 1172 | ||
| 1191 | if (!trace_seq_printf(s, "\", %s", arg)) | 1173 | /* When len=0, we just calculate the needed length */ |
| 1192 | return 0; | 1174 | #define LEN_OR_ZERO (len ? len - pos : 0) |
| 1193 | 1175 | ||
| 1194 | for (i = 0; i < tp->nr_args; i++) | 1176 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); |
| 1195 | if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) | ||
| 1196 | return 0; | ||
| 1197 | |||
| 1198 | return trace_seq_puts(s, "\n"); | ||
| 1199 | } | ||
| 1200 | 1177 | ||
| 1201 | #undef SHOW_FIELD | 1178 | for (i = 0; i < tp->nr_args; i++) { |
| 1202 | #define SHOW_FIELD(type, item, name) \ | 1179 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", |
| 1203 | do { \ | 1180 | tp->args[i].name); |
| 1204 | ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ | 1181 | } |
| 1205 | "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ | ||
| 1206 | (unsigned int)offsetof(typeof(field), item),\ | ||
| 1207 | (unsigned int)sizeof(type), \ | ||
| 1208 | is_signed_type(type)); \ | ||
| 1209 | if (!ret) \ | ||
| 1210 | return 0; \ | ||
| 1211 | } while (0) | ||
| 1212 | 1182 | ||
| 1213 | static int kprobe_event_show_format(struct ftrace_event_call *call, | 1183 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
| 1214 | struct trace_seq *s) | ||
| 1215 | { | ||
| 1216 | struct kprobe_trace_entry field __attribute__((unused)); | ||
| 1217 | int ret, i; | ||
| 1218 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1219 | 1184 | ||
| 1220 | SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); | 1185 | for (i = 0; i < tp->nr_args; i++) { |
| 1221 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | 1186 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", |
| 1187 | tp->args[i].name); | ||
| 1188 | } | ||
| 1222 | 1189 | ||
| 1223 | /* Show fields */ | 1190 | #undef LEN_OR_ZERO |
| 1224 | for (i = 0; i < tp->nr_args; i++) | ||
| 1225 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | ||
| 1226 | trace_seq_puts(s, "\n"); | ||
| 1227 | 1191 | ||
| 1228 | return __probe_event_show_format(s, tp, "(%lx)", | 1192 | /* return the length of print_fmt */ |
| 1229 | "REC->" FIELD_STRING_IP); | 1193 | return pos; |
| 1230 | } | 1194 | } |
| 1231 | 1195 | ||
| 1232 | static int kretprobe_event_show_format(struct ftrace_event_call *call, | 1196 | static int set_print_fmt(struct trace_probe *tp) |
| 1233 | struct trace_seq *s) | ||
| 1234 | { | 1197 | { |
| 1235 | struct kretprobe_trace_entry field __attribute__((unused)); | 1198 | int len; |
| 1236 | int ret, i; | 1199 | char *print_fmt; |
| 1237 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1238 | 1200 | ||
| 1239 | SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); | 1201 | /* First: called with 0 length to calculate the needed length */ |
| 1240 | SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); | 1202 | len = __set_print_fmt(tp, NULL, 0); |
| 1241 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | 1203 | print_fmt = kmalloc(len + 1, GFP_KERNEL); |
| 1204 | if (!print_fmt) | ||
| 1205 | return -ENOMEM; | ||
| 1242 | 1206 | ||
| 1243 | /* Show fields */ | 1207 | /* Second: actually write the @print_fmt */ |
| 1244 | for (i = 0; i < tp->nr_args; i++) | 1208 | __set_print_fmt(tp, print_fmt, len + 1); |
| 1245 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | 1209 | tp->call.print_fmt = print_fmt; |
| 1246 | trace_seq_puts(s, "\n"); | ||
| 1247 | 1210 | ||
| 1248 | return __probe_event_show_format(s, tp, "(%lx <- %lx)", | 1211 | return 0; |
| 1249 | "REC->" FIELD_STRING_FUNC | ||
| 1250 | ", REC->" FIELD_STRING_RETIP); | ||
| 1251 | } | 1212 | } |
| 1252 | 1213 | ||
| 1253 | #ifdef CONFIG_EVENT_PROFILE | 1214 | #ifdef CONFIG_PERF_EVENTS |
| 1254 | 1215 | ||
| 1255 | /* Kprobe profile handler */ | 1216 | /* Kprobe profile handler */ |
| 1256 | static __kprobes int kprobe_profile_func(struct kprobe *kp, | 1217 | static __kprobes void kprobe_profile_func(struct kprobe *kp, |
| 1257 | struct pt_regs *regs) | 1218 | struct pt_regs *regs) |
| 1258 | { | 1219 | { |
| 1259 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | 1220 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); |
| 1260 | struct ftrace_event_call *call = &tp->call; | 1221 | struct ftrace_event_call *call = &tp->call; |
| 1261 | struct kprobe_trace_entry *entry; | 1222 | struct kprobe_trace_entry *entry; |
| 1262 | struct trace_entry *ent; | 1223 | int size, __size, i; |
| 1263 | int size, __size, i, pc, __cpu; | ||
| 1264 | unsigned long irq_flags; | 1224 | unsigned long irq_flags; |
| 1265 | char *trace_buf; | ||
| 1266 | char *raw_data; | ||
| 1267 | int rctx; | 1225 | int rctx; |
| 1268 | 1226 | ||
| 1269 | pc = preempt_count(); | ||
| 1270 | __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); | 1227 | __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); |
| 1271 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1228 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1272 | size -= sizeof(u32); | 1229 | size -= sizeof(u32); |
| 1273 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 1230 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, |
| 1274 | "profile buffer not large enough")) | 1231 | "profile buffer not large enough")) |
| 1275 | return 0; | 1232 | return; |
| 1276 | |||
| 1277 | /* | ||
| 1278 | * Protect the non nmi buffer | ||
| 1279 | * This also protects the rcu read side | ||
| 1280 | */ | ||
| 1281 | local_irq_save(irq_flags); | ||
| 1282 | |||
| 1283 | rctx = perf_swevent_get_recursion_context(); | ||
| 1284 | if (rctx < 0) | ||
| 1285 | goto end_recursion; | ||
| 1286 | |||
| 1287 | __cpu = smp_processor_id(); | ||
| 1288 | |||
| 1289 | if (in_nmi()) | ||
| 1290 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | ||
| 1291 | else | ||
| 1292 | trace_buf = rcu_dereference(perf_trace_buf); | ||
| 1293 | 1233 | ||
| 1294 | if (!trace_buf) | 1234 | entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); |
| 1295 | goto end; | 1235 | if (!entry) |
| 1296 | 1236 | return; | |
| 1297 | raw_data = per_cpu_ptr(trace_buf, __cpu); | ||
| 1298 | |||
| 1299 | /* Zero dead bytes from alignment to avoid buffer leak to userspace */ | ||
| 1300 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
| 1301 | entry = (struct kprobe_trace_entry *)raw_data; | ||
| 1302 | ent = &entry->ent; | ||
| 1303 | 1237 | ||
| 1304 | tracing_generic_entry_update(ent, irq_flags, pc); | ||
| 1305 | ent->type = call->id; | ||
| 1306 | entry->nargs = tp->nr_args; | 1238 | entry->nargs = tp->nr_args; |
| 1307 | entry->ip = (unsigned long)kp->addr; | 1239 | entry->ip = (unsigned long)kp->addr; |
| 1308 | for (i = 0; i < tp->nr_args; i++) | 1240 | for (i = 0; i < tp->nr_args; i++) |
| 1309 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | 1241 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); |
| 1310 | perf_tp_event(call->id, entry->ip, 1, entry, size); | ||
| 1311 | |||
| 1312 | end: | ||
| 1313 | perf_swevent_put_recursion_context(rctx); | ||
| 1314 | end_recursion: | ||
| 1315 | local_irq_restore(irq_flags); | ||
| 1316 | 1242 | ||
| 1317 | return 0; | 1243 | ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); |
| 1318 | } | 1244 | } |
| 1319 | 1245 | ||
| 1320 | /* Kretprobe profile handler */ | 1246 | /* Kretprobe profile handler */ |
| 1321 | static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, | 1247 | static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, |
| 1322 | struct pt_regs *regs) | 1248 | struct pt_regs *regs) |
| 1323 | { | 1249 | { |
| 1324 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | 1250 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); |
| 1325 | struct ftrace_event_call *call = &tp->call; | 1251 | struct ftrace_event_call *call = &tp->call; |
| 1326 | struct kretprobe_trace_entry *entry; | 1252 | struct kretprobe_trace_entry *entry; |
| 1327 | struct trace_entry *ent; | 1253 | int size, __size, i; |
| 1328 | int size, __size, i, pc, __cpu; | ||
| 1329 | unsigned long irq_flags; | 1254 | unsigned long irq_flags; |
| 1330 | char *trace_buf; | ||
| 1331 | char *raw_data; | ||
| 1332 | int rctx; | 1255 | int rctx; |
| 1333 | 1256 | ||
| 1334 | pc = preempt_count(); | ||
| 1335 | __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); | 1257 | __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); |
| 1336 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1258 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1337 | size -= sizeof(u32); | 1259 | size -= sizeof(u32); |
| 1338 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | 1260 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, |
| 1339 | "profile buffer not large enough")) | 1261 | "profile buffer not large enough")) |
| 1340 | return 0; | 1262 | return; |
| 1341 | |||
| 1342 | /* | ||
| 1343 | * Protect the non nmi buffer | ||
| 1344 | * This also protects the rcu read side | ||
| 1345 | */ | ||
| 1346 | local_irq_save(irq_flags); | ||
| 1347 | |||
| 1348 | rctx = perf_swevent_get_recursion_context(); | ||
| 1349 | if (rctx < 0) | ||
| 1350 | goto end_recursion; | ||
| 1351 | |||
| 1352 | __cpu = smp_processor_id(); | ||
| 1353 | 1263 | ||
| 1354 | if (in_nmi()) | 1264 | entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); |
| 1355 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | 1265 | if (!entry) |
| 1356 | else | 1266 | return; |
| 1357 | trace_buf = rcu_dereference(perf_trace_buf); | ||
| 1358 | |||
| 1359 | if (!trace_buf) | ||
| 1360 | goto end; | ||
| 1361 | |||
| 1362 | raw_data = per_cpu_ptr(trace_buf, __cpu); | ||
| 1363 | |||
| 1364 | /* Zero dead bytes from alignment to avoid buffer leak to userspace */ | ||
| 1365 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
| 1366 | entry = (struct kretprobe_trace_entry *)raw_data; | ||
| 1367 | ent = &entry->ent; | ||
| 1368 | 1267 | ||
| 1369 | tracing_generic_entry_update(ent, irq_flags, pc); | ||
| 1370 | ent->type = call->id; | ||
| 1371 | entry->nargs = tp->nr_args; | 1268 | entry->nargs = tp->nr_args; |
| 1372 | entry->func = (unsigned long)tp->rp.kp.addr; | 1269 | entry->func = (unsigned long)tp->rp.kp.addr; |
| 1373 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1270 | entry->ret_ip = (unsigned long)ri->ret_addr; |
| 1374 | for (i = 0; i < tp->nr_args; i++) | 1271 | for (i = 0; i < tp->nr_args; i++) |
| 1375 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | 1272 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); |
| 1376 | perf_tp_event(call->id, entry->ret_ip, 1, entry, size); | ||
| 1377 | |||
| 1378 | end: | ||
| 1379 | perf_swevent_put_recursion_context(rctx); | ||
| 1380 | end_recursion: | ||
| 1381 | local_irq_restore(irq_flags); | ||
| 1382 | 1273 | ||
| 1383 | return 0; | 1274 | ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); |
| 1384 | } | 1275 | } |
| 1385 | 1276 | ||
| 1386 | static int probe_profile_enable(struct ftrace_event_call *call) | 1277 | static int probe_profile_enable(struct ftrace_event_call *call) |
| @@ -1408,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call) | |||
| 1408 | disable_kprobe(&tp->rp.kp); | 1299 | disable_kprobe(&tp->rp.kp); |
| 1409 | } | 1300 | } |
| 1410 | } | 1301 | } |
| 1411 | #endif /* CONFIG_EVENT_PROFILE */ | 1302 | #endif /* CONFIG_PERF_EVENTS */ |
| 1412 | 1303 | ||
| 1413 | 1304 | ||
| 1414 | static __kprobes | 1305 | static __kprobes |
| @@ -1418,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
| 1418 | 1309 | ||
| 1419 | if (tp->flags & TP_FLAG_TRACE) | 1310 | if (tp->flags & TP_FLAG_TRACE) |
| 1420 | kprobe_trace_func(kp, regs); | 1311 | kprobe_trace_func(kp, regs); |
| 1421 | #ifdef CONFIG_EVENT_PROFILE | 1312 | #ifdef CONFIG_PERF_EVENTS |
| 1422 | if (tp->flags & TP_FLAG_PROFILE) | 1313 | if (tp->flags & TP_FLAG_PROFILE) |
| 1423 | kprobe_profile_func(kp, regs); | 1314 | kprobe_profile_func(kp, regs); |
| 1424 | #endif /* CONFIG_EVENT_PROFILE */ | 1315 | #endif |
| 1425 | return 0; /* We don't tweek kernel, so just return 0 */ | 1316 | return 0; /* We don't tweek kernel, so just return 0 */ |
| 1426 | } | 1317 | } |
| 1427 | 1318 | ||
| @@ -1432,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
| 1432 | 1323 | ||
| 1433 | if (tp->flags & TP_FLAG_TRACE) | 1324 | if (tp->flags & TP_FLAG_TRACE) |
| 1434 | kretprobe_trace_func(ri, regs); | 1325 | kretprobe_trace_func(ri, regs); |
| 1435 | #ifdef CONFIG_EVENT_PROFILE | 1326 | #ifdef CONFIG_PERF_EVENTS |
| 1436 | if (tp->flags & TP_FLAG_PROFILE) | 1327 | if (tp->flags & TP_FLAG_PROFILE) |
| 1437 | kretprobe_profile_func(ri, regs); | 1328 | kretprobe_profile_func(ri, regs); |
| 1438 | #endif /* CONFIG_EVENT_PROFILE */ | 1329 | #endif |
| 1439 | return 0; /* We don't tweek kernel, so just return 0 */ | 1330 | return 0; /* We don't tweek kernel, so just return 0 */ |
| 1440 | } | 1331 | } |
| 1441 | 1332 | ||
| @@ -1448,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp) | |||
| 1448 | if (probe_is_return(tp)) { | 1339 | if (probe_is_return(tp)) { |
| 1449 | tp->event.trace = print_kretprobe_event; | 1340 | tp->event.trace = print_kretprobe_event; |
| 1450 | call->raw_init = probe_event_raw_init; | 1341 | call->raw_init = probe_event_raw_init; |
| 1451 | call->show_format = kretprobe_event_show_format; | ||
| 1452 | call->define_fields = kretprobe_event_define_fields; | 1342 | call->define_fields = kretprobe_event_define_fields; |
| 1453 | } else { | 1343 | } else { |
| 1454 | tp->event.trace = print_kprobe_event; | 1344 | tp->event.trace = print_kprobe_event; |
| 1455 | call->raw_init = probe_event_raw_init; | 1345 | call->raw_init = probe_event_raw_init; |
| 1456 | call->show_format = kprobe_event_show_format; | ||
| 1457 | call->define_fields = kprobe_event_define_fields; | 1346 | call->define_fields = kprobe_event_define_fields; |
| 1458 | } | 1347 | } |
| 1348 | if (set_print_fmt(tp) < 0) | ||
| 1349 | return -ENOMEM; | ||
| 1459 | call->event = &tp->event; | 1350 | call->event = &tp->event; |
| 1460 | call->id = register_ftrace_event(&tp->event); | 1351 | call->id = register_ftrace_event(&tp->event); |
| 1461 | if (!call->id) | 1352 | if (!call->id) { |
| 1353 | kfree(call->print_fmt); | ||
| 1462 | return -ENODEV; | 1354 | return -ENODEV; |
| 1355 | } | ||
| 1463 | call->enabled = 0; | 1356 | call->enabled = 0; |
| 1464 | call->regfunc = probe_event_enable; | 1357 | call->regfunc = probe_event_enable; |
| 1465 | call->unregfunc = probe_event_disable; | 1358 | call->unregfunc = probe_event_disable; |
| 1466 | 1359 | ||
| 1467 | #ifdef CONFIG_EVENT_PROFILE | 1360 | #ifdef CONFIG_PERF_EVENTS |
| 1468 | call->profile_enable = probe_profile_enable; | 1361 | call->profile_enable = probe_profile_enable; |
| 1469 | call->profile_disable = probe_profile_disable; | 1362 | call->profile_disable = probe_profile_disable; |
| 1470 | #endif | 1363 | #endif |
| @@ -1472,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp) | |||
| 1472 | ret = trace_add_event_call(call); | 1365 | ret = trace_add_event_call(call); |
| 1473 | if (ret) { | 1366 | if (ret) { |
| 1474 | pr_info("Failed to register kprobe event: %s\n", call->name); | 1367 | pr_info("Failed to register kprobe event: %s\n", call->name); |
| 1368 | kfree(call->print_fmt); | ||
| 1475 | unregister_ftrace_event(&tp->event); | 1369 | unregister_ftrace_event(&tp->event); |
| 1476 | } | 1370 | } |
| 1477 | return ret; | 1371 | return ret; |
| @@ -1481,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
| 1481 | { | 1375 | { |
| 1482 | /* tp->event is unregistered in trace_remove_event_call() */ | 1376 | /* tp->event is unregistered in trace_remove_event_call() */ |
| 1483 | trace_remove_event_call(&tp->call); | 1377 | trace_remove_event_call(&tp->call); |
| 1378 | kfree(tp->call.print_fmt); | ||
| 1484 | } | 1379 | } |
| 1485 | 1380 | ||
| 1486 | /* Make a debugfs interface for controling probe points */ | 1381 | /* Make a debugfs interface for controling probe points */ |
| @@ -1523,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3, | |||
| 1523 | 1418 | ||
| 1524 | static __init int kprobe_trace_self_tests_init(void) | 1419 | static __init int kprobe_trace_self_tests_init(void) |
| 1525 | { | 1420 | { |
| 1526 | int ret; | 1421 | int ret, warn = 0; |
| 1527 | int (*target)(int, int, int, int, int, int); | 1422 | int (*target)(int, int, int, int, int, int); |
| 1423 | struct trace_probe *tp; | ||
| 1528 | 1424 | ||
| 1529 | target = kprobe_trace_selftest_target; | 1425 | target = kprobe_trace_selftest_target; |
| 1530 | 1426 | ||
| 1531 | pr_info("Testing kprobe tracing: "); | 1427 | pr_info("Testing kprobe tracing: "); |
| 1532 | 1428 | ||
| 1533 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " | 1429 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " |
| 1534 | "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); | 1430 | "$stack $stack0 +0($stack)"); |
| 1535 | if (WARN_ON_ONCE(ret)) | 1431 | if (WARN_ON_ONCE(ret)) { |
| 1536 | pr_warning("error enabling function entry\n"); | 1432 | pr_warning("error on probing function entry.\n"); |
| 1433 | warn++; | ||
| 1434 | } else { | ||
| 1435 | /* Enable trace point */ | ||
| 1436 | tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); | ||
| 1437 | if (WARN_ON_ONCE(tp == NULL)) { | ||
| 1438 | pr_warning("error on getting new probe.\n"); | ||
| 1439 | warn++; | ||
| 1440 | } else | ||
| 1441 | probe_event_enable(&tp->call); | ||
| 1442 | } | ||
| 1537 | 1443 | ||
| 1538 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 1444 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " |
| 1539 | "$retval"); | 1445 | "$retval"); |
| 1540 | if (WARN_ON_ONCE(ret)) | 1446 | if (WARN_ON_ONCE(ret)) { |
| 1541 | pr_warning("error enabling function return\n"); | 1447 | pr_warning("error on probing function return.\n"); |
| 1448 | warn++; | ||
| 1449 | } else { | ||
| 1450 | /* Enable trace point */ | ||
| 1451 | tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); | ||
| 1452 | if (WARN_ON_ONCE(tp == NULL)) { | ||
| 1453 | pr_warning("error on getting new probe.\n"); | ||
| 1454 | warn++; | ||
| 1455 | } else | ||
| 1456 | probe_event_enable(&tp->call); | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | if (warn) | ||
| 1460 | goto end; | ||
| 1542 | 1461 | ||
| 1543 | ret = target(1, 2, 3, 4, 5, 6); | 1462 | ret = target(1, 2, 3, 4, 5, 6); |
| 1544 | 1463 | ||
| 1545 | cleanup_all_probes(); | 1464 | ret = command_trace_probe("-:testprobe"); |
| 1465 | if (WARN_ON_ONCE(ret)) { | ||
| 1466 | pr_warning("error on deleting a probe.\n"); | ||
| 1467 | warn++; | ||
| 1468 | } | ||
| 1546 | 1469 | ||
| 1547 | pr_cont("OK\n"); | 1470 | ret = command_trace_probe("-:testprobe2"); |
| 1471 | if (WARN_ON_ONCE(ret)) { | ||
| 1472 | pr_warning("error on deleting a probe.\n"); | ||
| 1473 | warn++; | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | end: | ||
| 1477 | cleanup_all_probes(); | ||
| 1478 | if (warn) | ||
| 1479 | pr_cont("NG: Some tests are failed. Please check them.\n"); | ||
| 1480 | else | ||
| 1481 | pr_cont("OK\n"); | ||
| 1548 | return 0; | 1482 | return 0; |
| 1549 | } | 1483 | } |
| 1550 | 1484 | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
| 157 | unsigned long val, flags; | 157 | unsigned long val, flags; |
| 158 | char buf[64]; | 158 | char buf[64]; |
| 159 | int ret; | 159 | int ret; |
| 160 | int cpu; | ||
| 160 | 161 | ||
| 161 | if (count >= sizeof(buf)) | 162 | if (count >= sizeof(buf)) |
| 162 | return -EINVAL; | 163 | return -EINVAL; |
| @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
| 171 | return ret; | 172 | return ret; |
| 172 | 173 | ||
| 173 | local_irq_save(flags); | 174 | local_irq_save(flags); |
| 175 | |||
| 176 | /* | ||
| 177 | * In case we trace inside arch_spin_lock() or after (NMI), | ||
| 178 | * we will cause circular lock, so we also need to increase | ||
| 179 | * the percpu trace_active here. | ||
| 180 | */ | ||
| 181 | cpu = smp_processor_id(); | ||
| 182 | per_cpu(trace_active, cpu)++; | ||
| 183 | |||
| 174 | arch_spin_lock(&max_stack_lock); | 184 | arch_spin_lock(&max_stack_lock); |
| 175 | *ptr = val; | 185 | *ptr = val; |
| 176 | arch_spin_unlock(&max_stack_lock); | 186 | arch_spin_unlock(&max_stack_lock); |
| 187 | |||
| 188 | per_cpu(trace_active, cpu)--; | ||
| 177 | local_irq_restore(flags); | 189 | local_irq_restore(flags); |
| 178 | 190 | ||
| 179 | return count; | 191 | return count; |
| @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 206 | 218 | ||
| 207 | static void *t_start(struct seq_file *m, loff_t *pos) | 219 | static void *t_start(struct seq_file *m, loff_t *pos) |
| 208 | { | 220 | { |
| 221 | int cpu; | ||
| 222 | |||
| 209 | local_irq_disable(); | 223 | local_irq_disable(); |
| 224 | |||
| 225 | cpu = smp_processor_id(); | ||
| 226 | per_cpu(trace_active, cpu)++; | ||
| 227 | |||
| 210 | arch_spin_lock(&max_stack_lock); | 228 | arch_spin_lock(&max_stack_lock); |
| 211 | 229 | ||
| 212 | if (*pos == 0) | 230 | if (*pos == 0) |
| @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 217 | 235 | ||
| 218 | static void t_stop(struct seq_file *m, void *p) | 236 | static void t_stop(struct seq_file *m, void *p) |
| 219 | { | 237 | { |
| 238 | int cpu; | ||
| 239 | |||
| 220 | arch_spin_unlock(&max_stack_lock); | 240 | arch_spin_unlock(&max_stack_lock); |
| 241 | |||
| 242 | cpu = smp_processor_id(); | ||
| 243 | per_cpu(trace_active, cpu)--; | ||
| 244 | |||
| 221 | local_irq_enable(); | 245 | local_irq_enable(); |
| 222 | } | 246 | } |
| 223 | 247 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..cba47d7935cc 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -143,70 +143,65 @@ extern char *__bad_type_size(void); | |||
| 143 | #type, #name, offsetof(typeof(trace), name), \ | 143 | #type, #name, offsetof(typeof(trace), name), \ |
| 144 | sizeof(trace.name), is_signed_type(type) | 144 | sizeof(trace.name), is_signed_type(type) |
| 145 | 145 | ||
| 146 | int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) | 146 | static |
| 147 | int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | ||
| 147 | { | 148 | { |
| 148 | int i; | 149 | int i; |
| 149 | int ret; | 150 | int pos = 0; |
| 150 | struct syscall_metadata *entry = call->data; | ||
| 151 | struct syscall_trace_enter trace; | ||
| 152 | int offset = offsetof(struct syscall_trace_enter, args); | ||
| 153 | 151 | ||
| 154 | ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 152 | /* When len=0, we just calculate the needed length */ |
| 155 | "\tsigned:%u;\n", | 153 | #define LEN_OR_ZERO (len ? len - pos : 0) |
| 156 | SYSCALL_FIELD(int, nr)); | ||
| 157 | if (!ret) | ||
| 158 | return 0; | ||
| 159 | 154 | ||
| 155 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
| 160 | for (i = 0; i < entry->nb_args; i++) { | 156 | for (i = 0; i < entry->nb_args; i++) { |
| 161 | ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], | 157 | pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", |
| 162 | entry->args[i]); | 158 | entry->args[i], sizeof(unsigned long), |
| 163 | if (!ret) | 159 | i == entry->nb_args - 1 ? "" : ", "); |
| 164 | return 0; | ||
| 165 | ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" | ||
| 166 | "\tsigned:%u;\n", offset, | ||
| 167 | sizeof(unsigned long), | ||
| 168 | is_signed_type(unsigned long)); | ||
| 169 | if (!ret) | ||
| 170 | return 0; | ||
| 171 | offset += sizeof(unsigned long); | ||
| 172 | } | 160 | } |
| 161 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | ||
| 173 | 162 | ||
| 174 | trace_seq_puts(s, "\nprint fmt: \""); | ||
| 175 | for (i = 0; i < entry->nb_args; i++) { | 163 | for (i = 0; i < entry->nb_args; i++) { |
| 176 | ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], | 164 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
| 177 | sizeof(unsigned long), | 165 | ", ((unsigned long)(REC->%s))", entry->args[i]); |
| 178 | i == entry->nb_args - 1 ? "" : ", "); | ||
| 179 | if (!ret) | ||
| 180 | return 0; | ||
| 181 | } | 166 | } |
| 182 | trace_seq_putc(s, '"'); | ||
| 183 | 167 | ||
| 184 | for (i = 0; i < entry->nb_args; i++) { | 168 | #undef LEN_OR_ZERO |
| 185 | ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", | ||
| 186 | entry->args[i]); | ||
| 187 | if (!ret) | ||
| 188 | return 0; | ||
| 189 | } | ||
| 190 | 169 | ||
| 191 | return trace_seq_putc(s, '\n'); | 170 | /* return the length of print_fmt */ |
| 171 | return pos; | ||
| 192 | } | 172 | } |
| 193 | 173 | ||
| 194 | int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) | 174 | static int set_syscall_print_fmt(struct ftrace_event_call *call) |
| 195 | { | 175 | { |
| 196 | int ret; | 176 | char *print_fmt; |
| 197 | struct syscall_trace_exit trace; | 177 | int len; |
| 178 | struct syscall_metadata *entry = call->data; | ||
| 198 | 179 | ||
| 199 | ret = trace_seq_printf(s, | 180 | if (entry->enter_event != call) { |
| 200 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | 181 | call->print_fmt = "\"0x%lx\", REC->ret"; |
| 201 | "\tsigned:%u;\n" | ||
| 202 | "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" | ||
| 203 | "\tsigned:%u;\n", | ||
| 204 | SYSCALL_FIELD(int, nr), | ||
| 205 | SYSCALL_FIELD(long, ret)); | ||
| 206 | if (!ret) | ||
| 207 | return 0; | 182 | return 0; |
| 183 | } | ||
| 184 | |||
| 185 | /* First: called with 0 length to calculate the needed length */ | ||
| 186 | len = __set_enter_print_fmt(entry, NULL, 0); | ||
| 187 | |||
| 188 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
| 189 | if (!print_fmt) | ||
| 190 | return -ENOMEM; | ||
| 191 | |||
| 192 | /* Second: actually write the @print_fmt */ | ||
| 193 | __set_enter_print_fmt(entry, print_fmt, len + 1); | ||
| 194 | call->print_fmt = print_fmt; | ||
| 208 | 195 | ||
| 209 | return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); | 196 | return 0; |
| 197 | } | ||
| 198 | |||
| 199 | static void free_syscall_print_fmt(struct ftrace_event_call *call) | ||
| 200 | { | ||
| 201 | struct syscall_metadata *entry = call->data; | ||
| 202 | |||
| 203 | if (entry->enter_event == call) | ||
| 204 | kfree(call->print_fmt); | ||
| 210 | } | 205 | } |
| 211 | 206 | ||
| 212 | int syscall_enter_define_fields(struct ftrace_event_call *call) | 207 | int syscall_enter_define_fields(struct ftrace_event_call *call) |
| @@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
| 386 | { | 381 | { |
| 387 | int id; | 382 | int id; |
| 388 | 383 | ||
| 389 | id = register_ftrace_event(call->event); | 384 | if (set_syscall_print_fmt(call) < 0) |
| 390 | if (!id) | 385 | return -ENOMEM; |
| 391 | return -ENODEV; | 386 | |
| 392 | call->id = id; | 387 | id = trace_event_raw_init(call); |
| 393 | INIT_LIST_HEAD(&call->fields); | 388 | |
| 394 | return 0; | 389 | if (id < 0) { |
| 390 | free_syscall_print_fmt(call); | ||
| 391 | return id; | ||
| 392 | } | ||
| 393 | |||
| 394 | return id; | ||
| 395 | } | ||
| 396 | |||
| 397 | unsigned long __init arch_syscall_addr(int nr) | ||
| 398 | { | ||
| 399 | return (unsigned long)sys_call_table[nr]; | ||
| 395 | } | 400 | } |
| 396 | 401 | ||
| 397 | int __init init_ftrace_syscalls(void) | 402 | int __init init_ftrace_syscalls(void) |
| @@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void) | |||
| 421 | } | 426 | } |
| 422 | core_initcall(init_ftrace_syscalls); | 427 | core_initcall(init_ftrace_syscalls); |
| 423 | 428 | ||
| 424 | #ifdef CONFIG_EVENT_PROFILE | 429 | #ifdef CONFIG_PERF_EVENTS |
| 425 | 430 | ||
| 426 | static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); | 431 | static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); |
| 427 | static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); | 432 | static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); |
| @@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
| 433 | struct syscall_metadata *sys_data; | 438 | struct syscall_metadata *sys_data; |
| 434 | struct syscall_trace_enter *rec; | 439 | struct syscall_trace_enter *rec; |
| 435 | unsigned long flags; | 440 | unsigned long flags; |
| 436 | char *trace_buf; | ||
| 437 | char *raw_data; | ||
| 438 | int syscall_nr; | 441 | int syscall_nr; |
| 439 | int rctx; | 442 | int rctx; |
| 440 | int size; | 443 | int size; |
| 441 | int cpu; | ||
| 442 | 444 | ||
| 443 | syscall_nr = syscall_get_nr(current, regs); | 445 | syscall_nr = syscall_get_nr(current, regs); |
| 444 | if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) | 446 | if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) |
| @@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
| 457 | "profile buffer not large enough")) | 459 | "profile buffer not large enough")) |
| 458 | return; | 460 | return; |
| 459 | 461 | ||
| 460 | /* Protect the per cpu buffer, begin the rcu read side */ | 462 | rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, |
| 461 | local_irq_save(flags); | 463 | sys_data->enter_event->id, &rctx, &flags); |
| 462 | 464 | if (!rec) | |
| 463 | rctx = perf_swevent_get_recursion_context(); | 465 | return; |
| 464 | if (rctx < 0) | ||
| 465 | goto end_recursion; | ||
| 466 | |||
| 467 | cpu = smp_processor_id(); | ||
| 468 | |||
| 469 | trace_buf = rcu_dereference(perf_trace_buf); | ||
| 470 | |||
| 471 | if (!trace_buf) | ||
| 472 | goto end; | ||
| 473 | |||
| 474 | raw_data = per_cpu_ptr(trace_buf, cpu); | ||
| 475 | |||
| 476 | /* zero the dead bytes from align to not leak stack to user */ | ||
| 477 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
| 478 | 466 | ||
| 479 | rec = (struct syscall_trace_enter *) raw_data; | ||
| 480 | tracing_generic_entry_update(&rec->ent, 0, 0); | ||
| 481 | rec->ent.type = sys_data->enter_event->id; | ||
| 482 | rec->nr = syscall_nr; | 467 | rec->nr = syscall_nr; |
| 483 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, | 468 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, |
| 484 | (unsigned long *)&rec->args); | 469 | (unsigned long *)&rec->args); |
| 485 | perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); | 470 | ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); |
| 486 | |||
| 487 | end: | ||
| 488 | perf_swevent_put_recursion_context(rctx); | ||
| 489 | end_recursion: | ||
| 490 | local_irq_restore(flags); | ||
| 491 | } | 471 | } |
| 492 | 472 | ||
| 493 | int prof_sysenter_enable(struct ftrace_event_call *call) | 473 | int prof_sysenter_enable(struct ftrace_event_call *call) |
| @@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
| 531 | struct syscall_trace_exit *rec; | 511 | struct syscall_trace_exit *rec; |
| 532 | unsigned long flags; | 512 | unsigned long flags; |
| 533 | int syscall_nr; | 513 | int syscall_nr; |
| 534 | char *trace_buf; | ||
| 535 | char *raw_data; | ||
| 536 | int rctx; | 514 | int rctx; |
| 537 | int size; | 515 | int size; |
| 538 | int cpu; | ||
| 539 | 516 | ||
| 540 | syscall_nr = syscall_get_nr(current, regs); | 517 | syscall_nr = syscall_get_nr(current, regs); |
| 541 | if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) | 518 | if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) |
| @@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
| 557 | "exit event has grown above profile buffer size")) | 534 | "exit event has grown above profile buffer size")) |
| 558 | return; | 535 | return; |
| 559 | 536 | ||
| 560 | /* Protect the per cpu buffer, begin the rcu read side */ | 537 | rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, |
| 561 | local_irq_save(flags); | 538 | sys_data->exit_event->id, &rctx, &flags); |
| 562 | 539 | if (!rec) | |
| 563 | rctx = perf_swevent_get_recursion_context(); | 540 | return; |
| 564 | if (rctx < 0) | ||
| 565 | goto end_recursion; | ||
| 566 | |||
| 567 | cpu = smp_processor_id(); | ||
| 568 | |||
| 569 | trace_buf = rcu_dereference(perf_trace_buf); | ||
| 570 | |||
| 571 | if (!trace_buf) | ||
| 572 | goto end; | ||
| 573 | |||
| 574 | raw_data = per_cpu_ptr(trace_buf, cpu); | ||
| 575 | |||
| 576 | /* zero the dead bytes from align to not leak stack to user */ | ||
| 577 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
| 578 | |||
| 579 | rec = (struct syscall_trace_exit *)raw_data; | ||
| 580 | 541 | ||
| 581 | tracing_generic_entry_update(&rec->ent, 0, 0); | ||
| 582 | rec->ent.type = sys_data->exit_event->id; | ||
| 583 | rec->nr = syscall_nr; | 542 | rec->nr = syscall_nr; |
| 584 | rec->ret = syscall_get_return_value(current, regs); | 543 | rec->ret = syscall_get_return_value(current, regs); |
| 585 | 544 | ||
| 586 | perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); | 545 | ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); |
| 587 | |||
| 588 | end: | ||
| 589 | perf_swevent_put_recursion_context(rctx); | ||
| 590 | end_recursion: | ||
| 591 | local_irq_restore(flags); | ||
| 592 | } | 546 | } |
| 593 | 547 | ||
| 594 | int prof_sysexit_enable(struct ftrace_event_call *call) | 548 | int prof_sysexit_enable(struct ftrace_event_call *call) |
| @@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call) | |||
| 603 | ret = register_trace_sys_exit(prof_syscall_exit); | 557 | ret = register_trace_sys_exit(prof_syscall_exit); |
| 604 | if (ret) { | 558 | if (ret) { |
| 605 | pr_info("event trace: Could not activate" | 559 | pr_info("event trace: Could not activate" |
| 606 | "syscall entry trace point"); | 560 | "syscall exit trace point"); |
| 607 | } else { | 561 | } else { |
| 608 | set_bit(num, enabled_prof_exit_syscalls); | 562 | set_bit(num, enabled_prof_exit_syscalls); |
| 609 | sys_prof_refcount_exit++; | 563 | sys_prof_refcount_exit++; |
| @@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call) | |||
| 626 | mutex_unlock(&syscall_trace_lock); | 580 | mutex_unlock(&syscall_trace_lock); |
| 627 | } | 581 | } |
| 628 | 582 | ||
| 629 | #endif | 583 | #endif /* CONFIG_PERF_EVENTS */ |
| 630 | |||
| 631 | 584 | ||
diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70c..766467b3bcb7 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -56,9 +56,6 @@ struct user_struct root_user = { | |||
| 56 | .sigpending = ATOMIC_INIT(0), | 56 | .sigpending = ATOMIC_INIT(0), |
| 57 | .locked_shm = 0, | 57 | .locked_shm = 0, |
| 58 | .user_ns = &init_user_ns, | 58 | .user_ns = &init_user_ns, |
| 59 | #ifdef CONFIG_USER_SCHED | ||
| 60 | .tg = &init_task_group, | ||
| 61 | #endif | ||
| 62 | }; | 59 | }; |
| 63 | 60 | ||
| 64 | /* | 61 | /* |
| @@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up) | |||
| 75 | put_user_ns(up->user_ns); | 72 | put_user_ns(up->user_ns); |
| 76 | } | 73 | } |
| 77 | 74 | ||
| 78 | #ifdef CONFIG_USER_SCHED | ||
| 79 | |||
| 80 | static void sched_destroy_user(struct user_struct *up) | ||
| 81 | { | ||
| 82 | sched_destroy_group(up->tg); | ||
| 83 | } | ||
| 84 | |||
| 85 | static int sched_create_user(struct user_struct *up) | ||
| 86 | { | ||
| 87 | int rc = 0; | ||
| 88 | |||
| 89 | up->tg = sched_create_group(&root_task_group); | ||
| 90 | if (IS_ERR(up->tg)) | ||
| 91 | rc = -ENOMEM; | ||
| 92 | |||
| 93 | set_tg_uid(up); | ||
| 94 | |||
| 95 | return rc; | ||
| 96 | } | ||
| 97 | |||
| 98 | #else /* CONFIG_USER_SCHED */ | ||
| 99 | |||
| 100 | static void sched_destroy_user(struct user_struct *up) { } | ||
| 101 | static int sched_create_user(struct user_struct *up) { return 0; } | ||
| 102 | |||
| 103 | #endif /* CONFIG_USER_SCHED */ | ||
| 104 | |||
| 105 | #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) | ||
| 106 | |||
| 107 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | ||
| 108 | { | ||
| 109 | struct user_struct *user; | ||
| 110 | struct hlist_node *h; | ||
| 111 | |||
| 112 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | ||
| 113 | if (user->uid == uid) { | ||
| 114 | /* possibly resurrect an "almost deleted" object */ | ||
| 115 | if (atomic_inc_return(&user->__count) == 1) | ||
| 116 | cancel_delayed_work(&user->work); | ||
| 117 | return user; | ||
| 118 | } | ||
| 119 | } | ||
| 120 | |||
| 121 | return NULL; | ||
| 122 | } | ||
| 123 | |||
| 124 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ | ||
| 125 | static DEFINE_MUTEX(uids_mutex); | ||
| 126 | |||
| 127 | static inline void uids_mutex_lock(void) | ||
| 128 | { | ||
| 129 | mutex_lock(&uids_mutex); | ||
| 130 | } | ||
| 131 | |||
| 132 | static inline void uids_mutex_unlock(void) | ||
| 133 | { | ||
| 134 | mutex_unlock(&uids_mutex); | ||
| 135 | } | ||
| 136 | |||
| 137 | /* uid directory attributes */ | ||
| 138 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 139 | static ssize_t cpu_shares_show(struct kobject *kobj, | ||
| 140 | struct kobj_attribute *attr, | ||
| 141 | char *buf) | ||
| 142 | { | ||
| 143 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 144 | |||
| 145 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); | ||
| 146 | } | ||
| 147 | |||
| 148 | static ssize_t cpu_shares_store(struct kobject *kobj, | ||
| 149 | struct kobj_attribute *attr, | ||
| 150 | const char *buf, size_t size) | ||
| 151 | { | ||
| 152 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 153 | unsigned long shares; | ||
| 154 | int rc; | ||
| 155 | |||
| 156 | sscanf(buf, "%lu", &shares); | ||
| 157 | |||
| 158 | rc = sched_group_set_shares(up->tg, shares); | ||
| 159 | |||
| 160 | return (rc ? rc : size); | ||
| 161 | } | ||
| 162 | |||
| 163 | static struct kobj_attribute cpu_share_attr = | ||
| 164 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
| 165 | #endif | ||
| 166 | |||
| 167 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 168 | static ssize_t cpu_rt_runtime_show(struct kobject *kobj, | ||
| 169 | struct kobj_attribute *attr, | ||
| 170 | char *buf) | ||
| 171 | { | ||
| 172 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 173 | |||
| 174 | return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); | ||
| 175 | } | ||
| 176 | |||
| 177 | static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | ||
| 178 | struct kobj_attribute *attr, | ||
| 179 | const char *buf, size_t size) | ||
| 180 | { | ||
| 181 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 182 | unsigned long rt_runtime; | ||
| 183 | int rc; | ||
| 184 | |||
| 185 | sscanf(buf, "%ld", &rt_runtime); | ||
| 186 | |||
| 187 | rc = sched_group_set_rt_runtime(up->tg, rt_runtime); | ||
| 188 | |||
| 189 | return (rc ? rc : size); | ||
| 190 | } | ||
| 191 | |||
| 192 | static struct kobj_attribute cpu_rt_runtime_attr = | ||
| 193 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | ||
| 194 | |||
| 195 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
| 196 | struct kobj_attribute *attr, | ||
| 197 | char *buf) | ||
| 198 | { | ||
| 199 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 200 | |||
| 201 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
| 202 | } | ||
| 203 | |||
| 204 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
| 205 | struct kobj_attribute *attr, | ||
| 206 | const char *buf, size_t size) | ||
| 207 | { | ||
| 208 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
| 209 | unsigned long rt_period; | ||
| 210 | int rc; | ||
| 211 | |||
| 212 | sscanf(buf, "%lu", &rt_period); | ||
| 213 | |||
| 214 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
| 215 | |||
| 216 | return (rc ? rc : size); | ||
| 217 | } | ||
| 218 | |||
| 219 | static struct kobj_attribute cpu_rt_period_attr = | ||
| 220 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
| 221 | #endif | ||
| 222 | |||
| 223 | /* default attributes per uid directory */ | ||
| 224 | static struct attribute *uids_attributes[] = { | ||
| 225 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 226 | &cpu_share_attr.attr, | ||
| 227 | #endif | ||
| 228 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 229 | &cpu_rt_runtime_attr.attr, | ||
| 230 | &cpu_rt_period_attr.attr, | ||
| 231 | #endif | ||
| 232 | NULL | ||
| 233 | }; | ||
| 234 | |||
| 235 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
| 236 | static void uids_release(struct kobject *kobj) | ||
| 237 | { | ||
| 238 | return; | ||
| 239 | } | ||
| 240 | |||
| 241 | static struct kobj_type uids_ktype = { | ||
| 242 | .sysfs_ops = &kobj_sysfs_ops, | ||
| 243 | .default_attrs = uids_attributes, | ||
| 244 | .release = uids_release, | ||
| 245 | }; | ||
| 246 | |||
| 247 | /* | ||
| 248 | * Create /sys/kernel/uids/<uid>/cpu_share file for this user | ||
| 249 | * We do not create this file for users in a user namespace (until | ||
| 250 | * sysfs tagging is implemented). | ||
| 251 | * | ||
| 252 | * See Documentation/scheduler/sched-design-CFS.txt for ramifications. | ||
| 253 | */ | ||
| 254 | static int uids_user_create(struct user_struct *up) | ||
| 255 | { | ||
| 256 | struct kobject *kobj = &up->kobj; | ||
| 257 | int error; | ||
| 258 | |||
| 259 | memset(kobj, 0, sizeof(struct kobject)); | ||
| 260 | if (up->user_ns != &init_user_ns) | ||
| 261 | return 0; | ||
| 262 | kobj->kset = uids_kset; | ||
| 263 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); | ||
| 264 | if (error) { | ||
| 265 | kobject_put(kobj); | ||
| 266 | goto done; | ||
| 267 | } | ||
| 268 | |||
| 269 | kobject_uevent(kobj, KOBJ_ADD); | ||
| 270 | done: | ||
| 271 | return error; | ||
| 272 | } | ||
| 273 | |||
| 274 | /* create these entries in sysfs: | ||
| 275 | * "/sys/kernel/uids" directory | ||
| 276 | * "/sys/kernel/uids/0" directory (for root user) | ||
| 277 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | ||
| 278 | */ | ||
| 279 | int __init uids_sysfs_init(void) | ||
| 280 | { | ||
| 281 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); | ||
| 282 | if (!uids_kset) | ||
| 283 | return -ENOMEM; | ||
| 284 | |||
| 285 | return uids_user_create(&root_user); | ||
| 286 | } | ||
| 287 | |||
| 288 | /* delayed work function to remove sysfs directory for a user and free up | ||
| 289 | * corresponding structures. | ||
| 290 | */ | ||
| 291 | static void cleanup_user_struct(struct work_struct *w) | ||
| 292 | { | ||
| 293 | struct user_struct *up = container_of(w, struct user_struct, work.work); | ||
| 294 | unsigned long flags; | ||
| 295 | int remove_user = 0; | ||
| 296 | |||
| 297 | /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() | ||
| 298 | * atomic. | ||
| 299 | */ | ||
| 300 | uids_mutex_lock(); | ||
| 301 | |||
| 302 | spin_lock_irqsave(&uidhash_lock, flags); | ||
| 303 | if (atomic_read(&up->__count) == 0) { | ||
| 304 | uid_hash_remove(up); | ||
| 305 | remove_user = 1; | ||
| 306 | } | ||
| 307 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
| 308 | |||
| 309 | if (!remove_user) | ||
| 310 | goto done; | ||
| 311 | |||
| 312 | if (up->user_ns == &init_user_ns) { | ||
| 313 | kobject_uevent(&up->kobj, KOBJ_REMOVE); | ||
| 314 | kobject_del(&up->kobj); | ||
| 315 | kobject_put(&up->kobj); | ||
| 316 | } | ||
| 317 | |||
| 318 | sched_destroy_user(up); | ||
| 319 | key_put(up->uid_keyring); | ||
| 320 | key_put(up->session_keyring); | ||
| 321 | kmem_cache_free(uid_cachep, up); | ||
| 322 | |||
| 323 | done: | ||
| 324 | uids_mutex_unlock(); | ||
| 325 | } | ||
| 326 | |||
| 327 | /* IRQs are disabled and uidhash_lock is held upon function entry. | ||
| 328 | * IRQ state (as stored in flags) is restored and uidhash_lock released | ||
| 329 | * upon function exit. | ||
| 330 | */ | ||
| 331 | static void free_user(struct user_struct *up, unsigned long flags) | ||
| 332 | { | ||
| 333 | INIT_DELAYED_WORK(&up->work, cleanup_user_struct); | ||
| 334 | schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); | ||
| 335 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
| 336 | } | ||
| 337 | |||
| 338 | #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ | ||
| 339 | |||
| 340 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 75 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) |
| 341 | { | 76 | { |
| 342 | struct user_struct *user; | 77 | struct user_struct *user; |
| @@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
| 352 | return NULL; | 87 | return NULL; |
| 353 | } | 88 | } |
| 354 | 89 | ||
| 355 | int uids_sysfs_init(void) { return 0; } | ||
| 356 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
| 357 | static inline void uids_mutex_lock(void) { } | ||
| 358 | static inline void uids_mutex_unlock(void) { } | ||
| 359 | |||
| 360 | /* IRQs are disabled and uidhash_lock is held upon function entry. | 90 | /* IRQs are disabled and uidhash_lock is held upon function entry. |
| 361 | * IRQ state (as stored in flags) is restored and uidhash_lock released | 91 | * IRQ state (as stored in flags) is restored and uidhash_lock released |
| 362 | * upon function exit. | 92 | * upon function exit. |
| @@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
| 365 | { | 95 | { |
| 366 | uid_hash_remove(up); | 96 | uid_hash_remove(up); |
| 367 | spin_unlock_irqrestore(&uidhash_lock, flags); | 97 | spin_unlock_irqrestore(&uidhash_lock, flags); |
| 368 | sched_destroy_user(up); | ||
| 369 | key_put(up->uid_keyring); | 98 | key_put(up->uid_keyring); |
| 370 | key_put(up->session_keyring); | 99 | key_put(up->session_keyring); |
| 371 | kmem_cache_free(uid_cachep, up); | 100 | kmem_cache_free(uid_cachep, up); |
| 372 | } | 101 | } |
| 373 | 102 | ||
| 374 | #endif | ||
| 375 | |||
| 376 | #if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) | ||
| 377 | /* | ||
| 378 | * We need to check if a setuid can take place. This function should be called | ||
| 379 | * before successfully completing the setuid. | ||
| 380 | */ | ||
| 381 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
| 382 | { | ||
| 383 | |||
| 384 | return sched_rt_can_attach(up->tg, tsk); | ||
| 385 | |||
| 386 | } | ||
| 387 | #else | ||
| 388 | int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) | ||
| 389 | { | ||
| 390 | return 1; | ||
| 391 | } | ||
| 392 | #endif | ||
| 393 | |||
| 394 | /* | 103 | /* |
| 395 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | 104 | * Locate the user_struct for the passed UID. If found, take a ref on it. The |
| 396 | * caller must undo that ref with free_uid(). | 105 | * caller must undo that ref with free_uid(). |
| @@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 431 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() | 140 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
| 432 | * atomic. | 141 | * atomic. |
| 433 | */ | 142 | */ |
| 434 | uids_mutex_lock(); | ||
| 435 | |||
| 436 | spin_lock_irq(&uidhash_lock); | 143 | spin_lock_irq(&uidhash_lock); |
| 437 | up = uid_hash_find(uid, hashent); | 144 | up = uid_hash_find(uid, hashent); |
| 438 | spin_unlock_irq(&uidhash_lock); | 145 | spin_unlock_irq(&uidhash_lock); |
| @@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 445 | new->uid = uid; | 152 | new->uid = uid; |
| 446 | atomic_set(&new->__count, 1); | 153 | atomic_set(&new->__count, 1); |
| 447 | 154 | ||
| 448 | if (sched_create_user(new) < 0) | ||
| 449 | goto out_free_user; | ||
| 450 | |||
| 451 | new->user_ns = get_user_ns(ns); | 155 | new->user_ns = get_user_ns(ns); |
| 452 | 156 | ||
| 453 | if (uids_user_create(new)) | ||
| 454 | goto out_destoy_sched; | ||
| 455 | |||
| 456 | /* | 157 | /* |
| 457 | * Before adding this, check whether we raced | 158 | * Before adding this, check whether we raced |
| 458 | * on adding the same user already.. | 159 | * on adding the same user already.. |
| @@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 475 | spin_unlock_irq(&uidhash_lock); | 176 | spin_unlock_irq(&uidhash_lock); |
| 476 | } | 177 | } |
| 477 | 178 | ||
| 478 | uids_mutex_unlock(); | ||
| 479 | |||
| 480 | return up; | 179 | return up; |
| 481 | 180 | ||
| 482 | out_destoy_sched: | ||
| 483 | sched_destroy_user(new); | ||
| 484 | put_user_ns(new->user_ns); | 181 | put_user_ns(new->user_ns); |
| 485 | out_free_user: | ||
| 486 | kmem_cache_free(uid_cachep, new); | 182 | kmem_cache_free(uid_cachep, new); |
| 487 | out_unlock: | 183 | out_unlock: |
| 488 | uids_mutex_unlock(); | ||
| 489 | return NULL; | 184 | return NULL; |
| 490 | } | 185 | } |
| 491 | 186 | ||
