diff options
Diffstat (limited to 'kernel')
57 files changed, 5057 insertions, 1393 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e9..6c5f081132a4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o | 12 | notifier.o ksysfs.o pm_qos_params.o |
13 | 13 | ||
14 | obj-$(CONFIG_SYSCTL) += sysctl_check.o | 14 | obj-$(CONFIG_SYSCTL) += sysctl_check.o |
@@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | |||
53 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 53 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
54 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 54 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
55 | obj-$(CONFIG_KPROBES) += kprobes.o | 55 | obj-$(CONFIG_KPROBES) += kprobes.o |
56 | obj-$(CONFIG_KGDB) += kgdb.o | ||
56 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | 57 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o |
57 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 58 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
58 | obj-$(CONFIG_SECCOMP) += seccomp.o | 59 | obj-$(CONFIG_SECCOMP) += seccomp.o |
diff --git a/kernel/audit.c b/kernel/audit.c index b782b046543d..a7b16086d36f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -21,7 +21,7 @@ | |||
21 | * | 21 | * |
22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> |
23 | * | 23 | * |
24 | * Goals: 1) Integrate fully with SELinux. | 24 | * Goals: 1) Integrate fully with Security Modules. |
25 | * 2) Minimal run-time overhead: | 25 | * 2) Minimal run-time overhead: |
26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). | 26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). |
27 | * b) Small when syscall auditing is enabled and no audit record | 27 | * b) Small when syscall auditing is enabled and no audit record |
@@ -55,7 +55,6 @@ | |||
55 | #include <net/netlink.h> | 55 | #include <net/netlink.h> |
56 | #include <linux/skbuff.h> | 56 | #include <linux/skbuff.h> |
57 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
58 | #include <linux/selinux.h> | ||
59 | #include <linux/inotify.h> | 58 | #include <linux/inotify.h> |
60 | #include <linux/freezer.h> | 59 | #include <linux/freezer.h> |
61 | #include <linux/tty.h> | 60 | #include <linux/tty.h> |
@@ -265,13 +264,13 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
265 | char *ctx = NULL; | 264 | char *ctx = NULL; |
266 | u32 len; | 265 | u32 len; |
267 | 266 | ||
268 | rc = selinux_sid_to_string(sid, &ctx, &len); | 267 | rc = security_secid_to_secctx(sid, &ctx, &len); |
269 | if (rc) { | 268 | if (rc) { |
270 | audit_log_format(ab, " sid=%u", sid); | 269 | audit_log_format(ab, " sid=%u", sid); |
271 | allow_changes = 0; /* Something weird, deny request */ | 270 | allow_changes = 0; /* Something weird, deny request */ |
272 | } else { | 271 | } else { |
273 | audit_log_format(ab, " subj=%s", ctx); | 272 | audit_log_format(ab, " subj=%s", ctx); |
274 | kfree(ctx); | 273 | security_release_secctx(ctx, len); |
275 | } | 274 | } |
276 | } | 275 | } |
277 | audit_log_format(ab, " res=%d", allow_changes); | 276 | audit_log_format(ab, " res=%d", allow_changes); |
@@ -550,12 +549,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
550 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u", | 549 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u", |
551 | pid, uid, auid); | 550 | pid, uid, auid); |
552 | if (sid) { | 551 | if (sid) { |
553 | rc = selinux_sid_to_string(sid, &ctx, &len); | 552 | rc = security_secid_to_secctx(sid, &ctx, &len); |
554 | if (rc) | 553 | if (rc) |
555 | audit_log_format(*ab, " ssid=%u", sid); | 554 | audit_log_format(*ab, " ssid=%u", sid); |
556 | else | 555 | else { |
557 | audit_log_format(*ab, " subj=%s", ctx); | 556 | audit_log_format(*ab, " subj=%s", ctx); |
558 | kfree(ctx); | 557 | security_release_secctx(ctx, len); |
558 | } | ||
559 | } | 559 | } |
560 | 560 | ||
561 | return rc; | 561 | return rc; |
@@ -758,18 +758,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
758 | break; | 758 | break; |
759 | } | 759 | } |
760 | case AUDIT_SIGNAL_INFO: | 760 | case AUDIT_SIGNAL_INFO: |
761 | err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); | 761 | err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); |
762 | if (err) | 762 | if (err) |
763 | return err; | 763 | return err; |
764 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); | 764 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); |
765 | if (!sig_data) { | 765 | if (!sig_data) { |
766 | kfree(ctx); | 766 | security_release_secctx(ctx, len); |
767 | return -ENOMEM; | 767 | return -ENOMEM; |
768 | } | 768 | } |
769 | sig_data->uid = audit_sig_uid; | 769 | sig_data->uid = audit_sig_uid; |
770 | sig_data->pid = audit_sig_pid; | 770 | sig_data->pid = audit_sig_pid; |
771 | memcpy(sig_data->ctx, ctx, len); | 771 | memcpy(sig_data->ctx, ctx, len); |
772 | kfree(ctx); | 772 | security_release_secctx(ctx, len); |
773 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 773 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, |
774 | 0, 0, sig_data, sizeof(*sig_data) + len); | 774 | 0, 0, sig_data, sizeof(*sig_data) + len); |
775 | kfree(sig_data); | 775 | kfree(sig_data); |
@@ -881,10 +881,6 @@ static int __init audit_init(void) | |||
881 | audit_enabled = audit_default; | 881 | audit_enabled = audit_default; |
882 | audit_ever_enabled |= !!audit_default; | 882 | audit_ever_enabled |= !!audit_default; |
883 | 883 | ||
884 | /* Register the callback with selinux. This callback will be invoked | ||
885 | * when a new policy is loaded. */ | ||
886 | selinux_audit_set_callback(&selinux_audit_rule_update); | ||
887 | |||
888 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 884 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
889 | 885 | ||
890 | #ifdef CONFIG_AUDITSYSCALL | 886 | #ifdef CONFIG_AUDITSYSCALL |
diff --git a/kernel/audit.h b/kernel/audit.h index 2554bd524fd1..3cfc54ee3e1f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
@@ -65,34 +65,9 @@ struct audit_watch { | |||
65 | struct list_head rules; /* associated rules */ | 65 | struct list_head rules; /* associated rules */ |
66 | }; | 66 | }; |
67 | 67 | ||
68 | struct audit_field { | ||
69 | u32 type; | ||
70 | u32 val; | ||
71 | u32 op; | ||
72 | char *se_str; | ||
73 | struct selinux_audit_rule *se_rule; | ||
74 | }; | ||
75 | |||
76 | struct audit_tree; | 68 | struct audit_tree; |
77 | struct audit_chunk; | 69 | struct audit_chunk; |
78 | 70 | ||
79 | struct audit_krule { | ||
80 | int vers_ops; | ||
81 | u32 flags; | ||
82 | u32 listnr; | ||
83 | u32 action; | ||
84 | u32 mask[AUDIT_BITMASK_SIZE]; | ||
85 | u32 buflen; /* for data alloc on list rules */ | ||
86 | u32 field_count; | ||
87 | char *filterkey; /* ties events to rules */ | ||
88 | struct audit_field *fields; | ||
89 | struct audit_field *arch_f; /* quick access to arch field */ | ||
90 | struct audit_field *inode_f; /* quick access to an inode field */ | ||
91 | struct audit_watch *watch; /* associated watch */ | ||
92 | struct audit_tree *tree; /* associated watched tree */ | ||
93 | struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ | ||
94 | }; | ||
95 | |||
96 | struct audit_entry { | 71 | struct audit_entry { |
97 | struct list_head list; | 72 | struct list_head list; |
98 | struct rcu_head rcu; | 73 | struct rcu_head rcu; |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2f2914b7cc30..28fef6bf8534 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/netlink.h> | 28 | #include <linux/netlink.h> |
29 | #include <linux/sched.h> | 29 | #include <linux/sched.h> |
30 | #include <linux/inotify.h> | 30 | #include <linux/inotify.h> |
31 | #include <linux/selinux.h> | 31 | #include <linux/security.h> |
32 | #include "audit.h" | 32 | #include "audit.h" |
33 | 33 | ||
34 | /* | 34 | /* |
@@ -38,7 +38,7 @@ | |||
38 | * Synchronizes writes and blocking reads of audit's filterlist | 38 | * Synchronizes writes and blocking reads of audit's filterlist |
39 | * data. Rcu is used to traverse the filterlist and access | 39 | * data. Rcu is used to traverse the filterlist and access |
40 | * contents of structs audit_entry, audit_watch and opaque | 40 | * contents of structs audit_entry, audit_watch and opaque |
41 | * selinux rules during filtering. If modified, these structures | 41 | * LSM rules during filtering. If modified, these structures |
42 | * must be copied and replace their counterparts in the filterlist. | 42 | * must be copied and replace their counterparts in the filterlist. |
43 | * An audit_parent struct is not accessed during filtering, so may | 43 | * An audit_parent struct is not accessed during filtering, so may |
44 | * be written directly provided audit_filter_mutex is held. | 44 | * be written directly provided audit_filter_mutex is held. |
@@ -139,8 +139,8 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
139 | if (e->rule.fields) | 139 | if (e->rule.fields) |
140 | for (i = 0; i < e->rule.field_count; i++) { | 140 | for (i = 0; i < e->rule.field_count; i++) { |
141 | struct audit_field *f = &e->rule.fields[i]; | 141 | struct audit_field *f = &e->rule.fields[i]; |
142 | kfree(f->se_str); | 142 | kfree(f->lsm_str); |
143 | selinux_audit_rule_free(f->se_rule); | 143 | security_audit_rule_free(f->lsm_rule); |
144 | } | 144 | } |
145 | kfree(e->rule.fields); | 145 | kfree(e->rule.fields); |
146 | kfree(e->rule.filterkey); | 146 | kfree(e->rule.filterkey); |
@@ -554,8 +554,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
554 | f->op = data->fieldflags[i] & AUDIT_OPERATORS; | 554 | f->op = data->fieldflags[i] & AUDIT_OPERATORS; |
555 | f->type = data->fields[i]; | 555 | f->type = data->fields[i]; |
556 | f->val = data->values[i]; | 556 | f->val = data->values[i]; |
557 | f->se_str = NULL; | 557 | f->lsm_str = NULL; |
558 | f->se_rule = NULL; | 558 | f->lsm_rule = NULL; |
559 | switch(f->type) { | 559 | switch(f->type) { |
560 | case AUDIT_PID: | 560 | case AUDIT_PID: |
561 | case AUDIT_UID: | 561 | case AUDIT_UID: |
@@ -597,12 +597,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
597 | goto exit_free; | 597 | goto exit_free; |
598 | entry->rule.buflen += f->val; | 598 | entry->rule.buflen += f->val; |
599 | 599 | ||
600 | err = selinux_audit_rule_init(f->type, f->op, str, | 600 | err = security_audit_rule_init(f->type, f->op, str, |
601 | &f->se_rule); | 601 | (void **)&f->lsm_rule); |
602 | /* Keep currently invalid fields around in case they | 602 | /* Keep currently invalid fields around in case they |
603 | * become valid after a policy reload. */ | 603 | * become valid after a policy reload. */ |
604 | if (err == -EINVAL) { | 604 | if (err == -EINVAL) { |
605 | printk(KERN_WARNING "audit rule for selinux " | 605 | printk(KERN_WARNING "audit rule for LSM " |
606 | "\'%s\' is invalid\n", str); | 606 | "\'%s\' is invalid\n", str); |
607 | err = 0; | 607 | err = 0; |
608 | } | 608 | } |
@@ -610,7 +610,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
610 | kfree(str); | 610 | kfree(str); |
611 | goto exit_free; | 611 | goto exit_free; |
612 | } else | 612 | } else |
613 | f->se_str = str; | 613 | f->lsm_str = str; |
614 | break; | 614 | break; |
615 | case AUDIT_WATCH: | 615 | case AUDIT_WATCH: |
616 | str = audit_unpack_string(&bufp, &remain, f->val); | 616 | str = audit_unpack_string(&bufp, &remain, f->val); |
@@ -754,7 +754,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
754 | case AUDIT_OBJ_LEV_LOW: | 754 | case AUDIT_OBJ_LEV_LOW: |
755 | case AUDIT_OBJ_LEV_HIGH: | 755 | case AUDIT_OBJ_LEV_HIGH: |
756 | data->buflen += data->values[i] = | 756 | data->buflen += data->values[i] = |
757 | audit_pack_string(&bufp, f->se_str); | 757 | audit_pack_string(&bufp, f->lsm_str); |
758 | break; | 758 | break; |
759 | case AUDIT_WATCH: | 759 | case AUDIT_WATCH: |
760 | data->buflen += data->values[i] = | 760 | data->buflen += data->values[i] = |
@@ -806,7 +806,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
806 | case AUDIT_OBJ_TYPE: | 806 | case AUDIT_OBJ_TYPE: |
807 | case AUDIT_OBJ_LEV_LOW: | 807 | case AUDIT_OBJ_LEV_LOW: |
808 | case AUDIT_OBJ_LEV_HIGH: | 808 | case AUDIT_OBJ_LEV_HIGH: |
809 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | 809 | if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) |
810 | return 1; | 810 | return 1; |
811 | break; | 811 | break; |
812 | case AUDIT_WATCH: | 812 | case AUDIT_WATCH: |
@@ -862,28 +862,28 @@ out: | |||
862 | return new; | 862 | return new; |
863 | } | 863 | } |
864 | 864 | ||
865 | /* Duplicate selinux field information. The se_rule is opaque, so must be | 865 | /* Duplicate LSM field information. The lsm_rule is opaque, so must be |
866 | * re-initialized. */ | 866 | * re-initialized. */ |
867 | static inline int audit_dupe_selinux_field(struct audit_field *df, | 867 | static inline int audit_dupe_lsm_field(struct audit_field *df, |
868 | struct audit_field *sf) | 868 | struct audit_field *sf) |
869 | { | 869 | { |
870 | int ret = 0; | 870 | int ret = 0; |
871 | char *se_str; | 871 | char *lsm_str; |
872 | 872 | ||
873 | /* our own copy of se_str */ | 873 | /* our own copy of lsm_str */ |
874 | se_str = kstrdup(sf->se_str, GFP_KERNEL); | 874 | lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); |
875 | if (unlikely(!se_str)) | 875 | if (unlikely(!lsm_str)) |
876 | return -ENOMEM; | 876 | return -ENOMEM; |
877 | df->se_str = se_str; | 877 | df->lsm_str = lsm_str; |
878 | 878 | ||
879 | /* our own (refreshed) copy of se_rule */ | 879 | /* our own (refreshed) copy of lsm_rule */ |
880 | ret = selinux_audit_rule_init(df->type, df->op, df->se_str, | 880 | ret = security_audit_rule_init(df->type, df->op, df->lsm_str, |
881 | &df->se_rule); | 881 | (void **)&df->lsm_rule); |
882 | /* Keep currently invalid fields around in case they | 882 | /* Keep currently invalid fields around in case they |
883 | * become valid after a policy reload. */ | 883 | * become valid after a policy reload. */ |
884 | if (ret == -EINVAL) { | 884 | if (ret == -EINVAL) { |
885 | printk(KERN_WARNING "audit rule for selinux \'%s\' is " | 885 | printk(KERN_WARNING "audit rule for LSM \'%s\' is " |
886 | "invalid\n", df->se_str); | 886 | "invalid\n", df->lsm_str); |
887 | ret = 0; | 887 | ret = 0; |
888 | } | 888 | } |
889 | 889 | ||
@@ -891,7 +891,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, | |||
891 | } | 891 | } |
892 | 892 | ||
893 | /* Duplicate an audit rule. This will be a deep copy with the exception | 893 | /* Duplicate an audit rule. This will be a deep copy with the exception |
894 | * of the watch - that pointer is carried over. The selinux specific fields | 894 | * of the watch - that pointer is carried over. The LSM specific fields |
895 | * will be updated in the copy. The point is to be able to replace the old | 895 | * will be updated in the copy. The point is to be able to replace the old |
896 | * rule with the new rule in the filterlist, then free the old rule. | 896 | * rule with the new rule in the filterlist, then free the old rule. |
897 | * The rlist element is undefined; list manipulations are handled apart from | 897 | * The rlist element is undefined; list manipulations are handled apart from |
@@ -930,7 +930,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
930 | new->tree = old->tree; | 930 | new->tree = old->tree; |
931 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); | 931 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); |
932 | 932 | ||
933 | /* deep copy this information, updating the se_rule fields, because | 933 | /* deep copy this information, updating the lsm_rule fields, because |
934 | * the originals will all be freed when the old rule is freed. */ | 934 | * the originals will all be freed when the old rule is freed. */ |
935 | for (i = 0; i < fcount; i++) { | 935 | for (i = 0; i < fcount; i++) { |
936 | switch (new->fields[i].type) { | 936 | switch (new->fields[i].type) { |
@@ -944,7 +944,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
944 | case AUDIT_OBJ_TYPE: | 944 | case AUDIT_OBJ_TYPE: |
945 | case AUDIT_OBJ_LEV_LOW: | 945 | case AUDIT_OBJ_LEV_LOW: |
946 | case AUDIT_OBJ_LEV_HIGH: | 946 | case AUDIT_OBJ_LEV_HIGH: |
947 | err = audit_dupe_selinux_field(&new->fields[i], | 947 | err = audit_dupe_lsm_field(&new->fields[i], |
948 | &old->fields[i]); | 948 | &old->fields[i]); |
949 | break; | 949 | break; |
950 | case AUDIT_FILTERKEY: | 950 | case AUDIT_FILTERKEY: |
@@ -1515,11 +1515,12 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | |||
1515 | if (sid) { | 1515 | if (sid) { |
1516 | char *ctx = NULL; | 1516 | char *ctx = NULL; |
1517 | u32 len; | 1517 | u32 len; |
1518 | if (selinux_sid_to_string(sid, &ctx, &len)) | 1518 | if (security_secid_to_secctx(sid, &ctx, &len)) |
1519 | audit_log_format(ab, " ssid=%u", sid); | 1519 | audit_log_format(ab, " ssid=%u", sid); |
1520 | else | 1520 | else { |
1521 | audit_log_format(ab, " subj=%s", ctx); | 1521 | audit_log_format(ab, " subj=%s", ctx); |
1522 | kfree(ctx); | 1522 | security_release_secctx(ctx, len); |
1523 | } | ||
1523 | } | 1524 | } |
1524 | audit_log_format(ab, " op=%s rule key=", action); | 1525 | audit_log_format(ab, " op=%s rule key=", action); |
1525 | if (rule->filterkey) | 1526 | if (rule->filterkey) |
@@ -1761,38 +1762,12 @@ unlock_and_return: | |||
1761 | return result; | 1762 | return result; |
1762 | } | 1763 | } |
1763 | 1764 | ||
1764 | /* Check to see if the rule contains any selinux fields. Returns 1 if there | 1765 | /* This function will re-initialize the lsm_rule field of all applicable rules. |
1765 | are selinux fields specified in the rule, 0 otherwise. */ | 1766 | * It will traverse the filter lists serarching for rules that contain LSM |
1766 | static inline int audit_rule_has_selinux(struct audit_krule *rule) | ||
1767 | { | ||
1768 | int i; | ||
1769 | |||
1770 | for (i = 0; i < rule->field_count; i++) { | ||
1771 | struct audit_field *f = &rule->fields[i]; | ||
1772 | switch (f->type) { | ||
1773 | case AUDIT_SUBJ_USER: | ||
1774 | case AUDIT_SUBJ_ROLE: | ||
1775 | case AUDIT_SUBJ_TYPE: | ||
1776 | case AUDIT_SUBJ_SEN: | ||
1777 | case AUDIT_SUBJ_CLR: | ||
1778 | case AUDIT_OBJ_USER: | ||
1779 | case AUDIT_OBJ_ROLE: | ||
1780 | case AUDIT_OBJ_TYPE: | ||
1781 | case AUDIT_OBJ_LEV_LOW: | ||
1782 | case AUDIT_OBJ_LEV_HIGH: | ||
1783 | return 1; | ||
1784 | } | ||
1785 | } | ||
1786 | |||
1787 | return 0; | ||
1788 | } | ||
1789 | |||
1790 | /* This function will re-initialize the se_rule field of all applicable rules. | ||
1791 | * It will traverse the filter lists serarching for rules that contain selinux | ||
1792 | * specific filter fields. When such a rule is found, it is copied, the | 1767 | * specific filter fields. When such a rule is found, it is copied, the |
1793 | * selinux field is re-initialized, and the old rule is replaced with the | 1768 | * LSM field is re-initialized, and the old rule is replaced with the |
1794 | * updated rule. */ | 1769 | * updated rule. */ |
1795 | int selinux_audit_rule_update(void) | 1770 | int audit_update_lsm_rules(void) |
1796 | { | 1771 | { |
1797 | struct audit_entry *entry, *n, *nentry; | 1772 | struct audit_entry *entry, *n, *nentry; |
1798 | struct audit_watch *watch; | 1773 | struct audit_watch *watch; |
@@ -1804,7 +1779,7 @@ int selinux_audit_rule_update(void) | |||
1804 | 1779 | ||
1805 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { | 1780 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { |
1806 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { | 1781 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { |
1807 | if (!audit_rule_has_selinux(&entry->rule)) | 1782 | if (!security_audit_rule_known(&entry->rule)) |
1808 | continue; | 1783 | continue; |
1809 | 1784 | ||
1810 | watch = entry->rule.watch; | 1785 | watch = entry->rule.watch; |
@@ -1815,7 +1790,7 @@ int selinux_audit_rule_update(void) | |||
1815 | * return value */ | 1790 | * return value */ |
1816 | if (!err) | 1791 | if (!err) |
1817 | err = PTR_ERR(nentry); | 1792 | err = PTR_ERR(nentry); |
1818 | audit_panic("error updating selinux filters"); | 1793 | audit_panic("error updating LSM filters"); |
1819 | if (watch) | 1794 | if (watch) |
1820 | list_del(&entry->rule.rlist); | 1795 | list_del(&entry->rule.rlist); |
1821 | list_del_rcu(&entry->list); | 1796 | list_del_rcu(&entry->list); |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 782262e4107d..56e56ed594a8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -61,7 +61,6 @@ | |||
61 | #include <linux/security.h> | 61 | #include <linux/security.h> |
62 | #include <linux/list.h> | 62 | #include <linux/list.h> |
63 | #include <linux/tty.h> | 63 | #include <linux/tty.h> |
64 | #include <linux/selinux.h> | ||
65 | #include <linux/binfmts.h> | 64 | #include <linux/binfmts.h> |
66 | #include <linux/highmem.h> | 65 | #include <linux/highmem.h> |
67 | #include <linux/syscalls.h> | 66 | #include <linux/syscalls.h> |
@@ -528,14 +527,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
528 | match for now to avoid losing information that | 527 | match for now to avoid losing information that |
529 | may be wanted. An error message will also be | 528 | may be wanted. An error message will also be |
530 | logged upon error */ | 529 | logged upon error */ |
531 | if (f->se_rule) { | 530 | if (f->lsm_rule) { |
532 | if (need_sid) { | 531 | if (need_sid) { |
533 | selinux_get_task_sid(tsk, &sid); | 532 | security_task_getsecid(tsk, &sid); |
534 | need_sid = 0; | 533 | need_sid = 0; |
535 | } | 534 | } |
536 | result = selinux_audit_rule_match(sid, f->type, | 535 | result = security_audit_rule_match(sid, f->type, |
537 | f->op, | 536 | f->op, |
538 | f->se_rule, | 537 | f->lsm_rule, |
539 | ctx); | 538 | ctx); |
540 | } | 539 | } |
541 | break; | 540 | break; |
@@ -546,18 +545,18 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
546 | case AUDIT_OBJ_LEV_HIGH: | 545 | case AUDIT_OBJ_LEV_HIGH: |
547 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR | 546 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR |
548 | also applies here */ | 547 | also applies here */ |
549 | if (f->se_rule) { | 548 | if (f->lsm_rule) { |
550 | /* Find files that match */ | 549 | /* Find files that match */ |
551 | if (name) { | 550 | if (name) { |
552 | result = selinux_audit_rule_match( | 551 | result = security_audit_rule_match( |
553 | name->osid, f->type, f->op, | 552 | name->osid, f->type, f->op, |
554 | f->se_rule, ctx); | 553 | f->lsm_rule, ctx); |
555 | } else if (ctx) { | 554 | } else if (ctx) { |
556 | for (j = 0; j < ctx->name_count; j++) { | 555 | for (j = 0; j < ctx->name_count; j++) { |
557 | if (selinux_audit_rule_match( | 556 | if (security_audit_rule_match( |
558 | ctx->names[j].osid, | 557 | ctx->names[j].osid, |
559 | f->type, f->op, | 558 | f->type, f->op, |
560 | f->se_rule, ctx)) { | 559 | f->lsm_rule, ctx)) { |
561 | ++result; | 560 | ++result; |
562 | break; | 561 | break; |
563 | } | 562 | } |
@@ -570,7 +569,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
570 | aux = aux->next) { | 569 | aux = aux->next) { |
571 | if (aux->type == AUDIT_IPC) { | 570 | if (aux->type == AUDIT_IPC) { |
572 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 571 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
573 | if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { | 572 | if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { |
574 | ++result; | 573 | ++result; |
575 | break; | 574 | break; |
576 | } | 575 | } |
@@ -885,11 +884,11 @@ void audit_log_task_context(struct audit_buffer *ab) | |||
885 | int error; | 884 | int error; |
886 | u32 sid; | 885 | u32 sid; |
887 | 886 | ||
888 | selinux_get_task_sid(current, &sid); | 887 | security_task_getsecid(current, &sid); |
889 | if (!sid) | 888 | if (!sid) |
890 | return; | 889 | return; |
891 | 890 | ||
892 | error = selinux_sid_to_string(sid, &ctx, &len); | 891 | error = security_secid_to_secctx(sid, &ctx, &len); |
893 | if (error) { | 892 | if (error) { |
894 | if (error != -EINVAL) | 893 | if (error != -EINVAL) |
895 | goto error_path; | 894 | goto error_path; |
@@ -897,7 +896,7 @@ void audit_log_task_context(struct audit_buffer *ab) | |||
897 | } | 896 | } |
898 | 897 | ||
899 | audit_log_format(ab, " subj=%s", ctx); | 898 | audit_log_format(ab, " subj=%s", ctx); |
900 | kfree(ctx); | 899 | security_release_secctx(ctx, len); |
901 | return; | 900 | return; |
902 | 901 | ||
903 | error_path: | 902 | error_path: |
@@ -941,7 +940,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
941 | u32 sid, char *comm) | 940 | u32 sid, char *comm) |
942 | { | 941 | { |
943 | struct audit_buffer *ab; | 942 | struct audit_buffer *ab; |
944 | char *s = NULL; | 943 | char *ctx = NULL; |
945 | u32 len; | 944 | u32 len; |
946 | int rc = 0; | 945 | int rc = 0; |
947 | 946 | ||
@@ -951,15 +950,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
951 | 950 | ||
952 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, | 951 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, |
953 | uid, sessionid); | 952 | uid, sessionid); |
954 | if (selinux_sid_to_string(sid, &s, &len)) { | 953 | if (security_secid_to_secctx(sid, &ctx, &len)) { |
955 | audit_log_format(ab, " obj=(none)"); | 954 | audit_log_format(ab, " obj=(none)"); |
956 | rc = 1; | 955 | rc = 1; |
957 | } else | 956 | } else { |
958 | audit_log_format(ab, " obj=%s", s); | 957 | audit_log_format(ab, " obj=%s", ctx); |
958 | security_release_secctx(ctx, len); | ||
959 | } | ||
959 | audit_log_format(ab, " ocomm="); | 960 | audit_log_format(ab, " ocomm="); |
960 | audit_log_untrustedstring(ab, comm); | 961 | audit_log_untrustedstring(ab, comm); |
961 | audit_log_end(ab); | 962 | audit_log_end(ab); |
962 | kfree(s); | ||
963 | 963 | ||
964 | return rc; | 964 | return rc; |
965 | } | 965 | } |
@@ -1271,14 +1271,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1271 | if (axi->osid != 0) { | 1271 | if (axi->osid != 0) { |
1272 | char *ctx = NULL; | 1272 | char *ctx = NULL; |
1273 | u32 len; | 1273 | u32 len; |
1274 | if (selinux_sid_to_string( | 1274 | if (security_secid_to_secctx( |
1275 | axi->osid, &ctx, &len)) { | 1275 | axi->osid, &ctx, &len)) { |
1276 | audit_log_format(ab, " osid=%u", | 1276 | audit_log_format(ab, " osid=%u", |
1277 | axi->osid); | 1277 | axi->osid); |
1278 | call_panic = 1; | 1278 | call_panic = 1; |
1279 | } else | 1279 | } else { |
1280 | audit_log_format(ab, " obj=%s", ctx); | 1280 | audit_log_format(ab, " obj=%s", ctx); |
1281 | kfree(ctx); | 1281 | security_release_secctx(ctx, len); |
1282 | } | ||
1282 | } | 1283 | } |
1283 | break; } | 1284 | break; } |
1284 | 1285 | ||
@@ -1392,13 +1393,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1392 | if (n->osid != 0) { | 1393 | if (n->osid != 0) { |
1393 | char *ctx = NULL; | 1394 | char *ctx = NULL; |
1394 | u32 len; | 1395 | u32 len; |
1395 | if (selinux_sid_to_string( | 1396 | if (security_secid_to_secctx( |
1396 | n->osid, &ctx, &len)) { | 1397 | n->osid, &ctx, &len)) { |
1397 | audit_log_format(ab, " osid=%u", n->osid); | 1398 | audit_log_format(ab, " osid=%u", n->osid); |
1398 | call_panic = 2; | 1399 | call_panic = 2; |
1399 | } else | 1400 | } else { |
1400 | audit_log_format(ab, " obj=%s", ctx); | 1401 | audit_log_format(ab, " obj=%s", ctx); |
1401 | kfree(ctx); | 1402 | security_release_secctx(ctx, len); |
1403 | } | ||
1402 | } | 1404 | } |
1403 | 1405 | ||
1404 | audit_log_end(ab); | 1406 | audit_log_end(ab); |
@@ -1775,7 +1777,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode | |||
1775 | name->uid = inode->i_uid; | 1777 | name->uid = inode->i_uid; |
1776 | name->gid = inode->i_gid; | 1778 | name->gid = inode->i_gid; |
1777 | name->rdev = inode->i_rdev; | 1779 | name->rdev = inode->i_rdev; |
1778 | selinux_get_inode_sid(inode, &name->osid); | 1780 | security_inode_getsecid(inode, &name->osid); |
1779 | } | 1781 | } |
1780 | 1782 | ||
1781 | /** | 1783 | /** |
@@ -2190,8 +2192,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
2190 | ax->uid = ipcp->uid; | 2192 | ax->uid = ipcp->uid; |
2191 | ax->gid = ipcp->gid; | 2193 | ax->gid = ipcp->gid; |
2192 | ax->mode = ipcp->mode; | 2194 | ax->mode = ipcp->mode; |
2193 | selinux_get_ipc_sid(ipcp, &ax->osid); | 2195 | security_ipc_getsecid(ipcp, &ax->osid); |
2194 | |||
2195 | ax->d.type = AUDIT_IPC; | 2196 | ax->d.type = AUDIT_IPC; |
2196 | ax->d.next = context->aux; | 2197 | ax->d.next = context->aux; |
2197 | context->aux = (void *)ax; | 2198 | context->aux = (void *)ax; |
@@ -2343,7 +2344,7 @@ void __audit_ptrace(struct task_struct *t) | |||
2343 | context->target_auid = audit_get_loginuid(t); | 2344 | context->target_auid = audit_get_loginuid(t); |
2344 | context->target_uid = t->uid; | 2345 | context->target_uid = t->uid; |
2345 | context->target_sessionid = audit_get_sessionid(t); | 2346 | context->target_sessionid = audit_get_sessionid(t); |
2346 | selinux_get_task_sid(t, &context->target_sid); | 2347 | security_task_getsecid(t, &context->target_sid); |
2347 | memcpy(context->target_comm, t->comm, TASK_COMM_LEN); | 2348 | memcpy(context->target_comm, t->comm, TASK_COMM_LEN); |
2348 | } | 2349 | } |
2349 | 2350 | ||
@@ -2371,7 +2372,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2371 | audit_sig_uid = tsk->loginuid; | 2372 | audit_sig_uid = tsk->loginuid; |
2372 | else | 2373 | else |
2373 | audit_sig_uid = tsk->uid; | 2374 | audit_sig_uid = tsk->uid; |
2374 | selinux_get_task_sid(tsk, &audit_sig_sid); | 2375 | security_task_getsecid(tsk, &audit_sig_sid); |
2375 | } | 2376 | } |
2376 | if (!audit_signals || audit_dummy_context()) | 2377 | if (!audit_signals || audit_dummy_context()) |
2377 | return 0; | 2378 | return 0; |
@@ -2384,7 +2385,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2384 | ctx->target_auid = audit_get_loginuid(t); | 2385 | ctx->target_auid = audit_get_loginuid(t); |
2385 | ctx->target_uid = t->uid; | 2386 | ctx->target_uid = t->uid; |
2386 | ctx->target_sessionid = audit_get_sessionid(t); | 2387 | ctx->target_sessionid = audit_get_sessionid(t); |
2387 | selinux_get_task_sid(t, &ctx->target_sid); | 2388 | security_task_getsecid(t, &ctx->target_sid); |
2388 | memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); | 2389 | memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); |
2389 | return 0; | 2390 | return 0; |
2390 | } | 2391 | } |
@@ -2405,7 +2406,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2405 | axp->target_auid[axp->pid_count] = audit_get_loginuid(t); | 2406 | axp->target_auid[axp->pid_count] = audit_get_loginuid(t); |
2406 | axp->target_uid[axp->pid_count] = t->uid; | 2407 | axp->target_uid[axp->pid_count] = t->uid; |
2407 | axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); | 2408 | axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); |
2408 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); | 2409 | security_task_getsecid(t, &axp->target_sid[axp->pid_count]); |
2409 | memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); | 2410 | memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); |
2410 | axp->pid_count++; | 2411 | axp->pid_count++; |
2411 | 2412 | ||
@@ -2435,16 +2436,17 @@ void audit_core_dumps(long signr) | |||
2435 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2436 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2436 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2437 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", |
2437 | auid, current->uid, current->gid, sessionid); | 2438 | auid, current->uid, current->gid, sessionid); |
2438 | selinux_get_task_sid(current, &sid); | 2439 | security_task_getsecid(current, &sid); |
2439 | if (sid) { | 2440 | if (sid) { |
2440 | char *ctx = NULL; | 2441 | char *ctx = NULL; |
2441 | u32 len; | 2442 | u32 len; |
2442 | 2443 | ||
2443 | if (selinux_sid_to_string(sid, &ctx, &len)) | 2444 | if (security_secid_to_secctx(sid, &ctx, &len)) |
2444 | audit_log_format(ab, " ssid=%u", sid); | 2445 | audit_log_format(ab, " ssid=%u", sid); |
2445 | else | 2446 | else { |
2446 | audit_log_format(ab, " subj=%s", ctx); | 2447 | audit_log_format(ab, " subj=%s", ctx); |
2447 | kfree(ctx); | 2448 | security_release_secctx(ctx, len); |
2449 | } | ||
2448 | } | 2450 | } |
2449 | audit_log_format(ab, " pid=%d comm=", current->pid); | 2451 | audit_log_format(ab, " pid=%d comm=", current->pid); |
2450 | audit_log_untrustedstring(ab, current->comm); | 2452 | audit_log_untrustedstring(ab, current->comm); |
diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 000000000000..c3c55544db2f --- /dev/null +++ b/kernel/bounds.c | |||
@@ -0,0 +1,23 @@ | |||
1 | /* | ||
2 | * Generate definitions needed by the preprocessor. | ||
3 | * This code generates raw asm output which is post-processed | ||
4 | * to extract and format the required data. | ||
5 | */ | ||
6 | |||
7 | #define __GENERATING_BOUNDS_H | ||
8 | /* Include headers that define the enum constants of interest */ | ||
9 | #include <linux/page-flags.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | |||
12 | #define DEFINE(sym, val) \ | ||
13 | asm volatile("\n->" #sym " %0 " #val : : "i" (val)) | ||
14 | |||
15 | #define BLANK() asm volatile("\n->" : : ) | ||
16 | |||
17 | void foo(void) | ||
18 | { | ||
19 | /* The enum constants to put into include/linux/bounds.h */ | ||
20 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | ||
21 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | ||
22 | /* End of constants */ | ||
23 | } | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 62f1a5231fe9..6d8de051382b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -1722,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void) | |||
1722 | use_task_css_set_links = 1; | 1722 | use_task_css_set_links = 1; |
1723 | do_each_thread(g, p) { | 1723 | do_each_thread(g, p) { |
1724 | task_lock(p); | 1724 | task_lock(p); |
1725 | if (list_empty(&p->cg_list)) | 1725 | /* |
1726 | * We should check if the process is exiting, otherwise | ||
1727 | * it will race with cgroup_exit() in that the list | ||
1728 | * entry won't be deleted though the process has exited. | ||
1729 | */ | ||
1730 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | ||
1726 | list_add(&p->cg_list, &p->cgroups->tasks); | 1731 | list_add(&p->cg_list, &p->cgroups->tasks); |
1727 | task_unlock(p); | 1732 | task_unlock(p); |
1728 | } while_each_thread(g, p); | 1733 | } while_each_thread(g, p); |
@@ -2569,6 +2574,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) | |||
2569 | /* Skip this hierarchy if it has no active subsystems */ | 2574 | /* Skip this hierarchy if it has no active subsystems */ |
2570 | if (!root->actual_subsys_bits) | 2575 | if (!root->actual_subsys_bits) |
2571 | continue; | 2576 | continue; |
2577 | seq_printf(m, "%lu:", root->subsys_bits); | ||
2572 | for_each_subsys(root, ss) | 2578 | for_each_subsys(root, ss) |
2573 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 2579 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
2574 | seq_putc(m, ':'); | 2580 | seq_putc(m, ':'); |
diff --git a/kernel/compat.c b/kernel/compat.c index 5f0e201bcfd3..e1ef04870c2a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart) | |||
47 | mm_segment_t oldfs; | 47 | mm_segment_t oldfs; |
48 | long ret; | 48 | long ret; |
49 | 49 | ||
50 | rmtp = (struct compat_timespec __user *)(restart->arg1); | 50 | restart->nanosleep.rmtp = (struct timespec __user *) &rmt; |
51 | restart->arg1 = (unsigned long)&rmt; | ||
52 | oldfs = get_fs(); | 51 | oldfs = get_fs(); |
53 | set_fs(KERNEL_DS); | 52 | set_fs(KERNEL_DS); |
54 | ret = hrtimer_nanosleep_restart(restart); | 53 | ret = hrtimer_nanosleep_restart(restart); |
55 | set_fs(oldfs); | 54 | set_fs(oldfs); |
56 | 55 | ||
57 | if (ret) { | 56 | if (ret) { |
58 | restart->arg1 = (unsigned long)rmtp; | 57 | rmtp = restart->nanosleep.compat_rmtp; |
59 | 58 | ||
60 | if (rmtp && put_compat_timespec(&rmt, rmtp)) | 59 | if (rmtp && put_compat_timespec(&rmt, rmtp)) |
61 | return -EFAULT; | 60 | return -EFAULT; |
@@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, | |||
89 | = &current_thread_info()->restart_block; | 88 | = &current_thread_info()->restart_block; |
90 | 89 | ||
91 | restart->fn = compat_nanosleep_restart; | 90 | restart->fn = compat_nanosleep_restart; |
92 | restart->arg1 = (unsigned long)rmtp; | 91 | restart->nanosleep.compat_rmtp = rmtp; |
93 | 92 | ||
94 | if (rmtp && put_compat_timespec(&rmt, rmtp)) | 93 | if (rmtp && put_compat_timespec(&rmt, rmtp)) |
95 | return -EFAULT; | 94 | return -EFAULT; |
@@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, | |||
446 | if (retval) | 445 | if (retval) |
447 | return retval; | 446 | return retval; |
448 | 447 | ||
449 | return sched_setaffinity(pid, new_mask); | 448 | return sched_setaffinity(pid, &new_mask); |
450 | } | 449 | } |
451 | 450 | ||
452 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, | 451 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, |
@@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) | |||
607 | long err; | 606 | long err; |
608 | mm_segment_t oldfs; | 607 | mm_segment_t oldfs; |
609 | struct timespec tu; | 608 | struct timespec tu; |
610 | struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); | 609 | struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; |
611 | 610 | ||
612 | restart->arg1 = (unsigned long) &tu; | 611 | restart->nanosleep.rmtp = (struct timespec __user *) &tu; |
613 | oldfs = get_fs(); | 612 | oldfs = get_fs(); |
614 | set_fs(KERNEL_DS); | 613 | set_fs(KERNEL_DS); |
615 | err = clock_nanosleep_restart(restart); | 614 | err = clock_nanosleep_restart(restart); |
@@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) | |||
621 | 620 | ||
622 | if (err == -ERESTART_RESTARTBLOCK) { | 621 | if (err == -ERESTART_RESTARTBLOCK) { |
623 | restart->fn = compat_clock_nanosleep_restart; | 622 | restart->fn = compat_clock_nanosleep_restart; |
624 | restart->arg1 = (unsigned long) rmtp; | 623 | restart->nanosleep.compat_rmtp = rmtp; |
625 | } | 624 | } |
626 | return err; | 625 | return err; |
627 | } | 626 | } |
@@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
652 | if (err == -ERESTART_RESTARTBLOCK) { | 651 | if (err == -ERESTART_RESTARTBLOCK) { |
653 | restart = &current_thread_info()->restart_block; | 652 | restart = &current_thread_info()->restart_block; |
654 | restart->fn = compat_clock_nanosleep_restart; | 653 | restart->fn = compat_clock_nanosleep_restart; |
655 | restart->arg1 = (unsigned long) rmtp; | 654 | restart->nanosleep.compat_rmtp = rmtp; |
656 | } | 655 | } |
657 | return err; | 656 | return err; |
658 | } | 657 | } |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abed..2011ad8d2697 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
232 | 232 | ||
233 | /* Ensure that we are not runnable on dying cpu */ | 233 | /* Ensure that we are not runnable on dying cpu */ |
234 | old_allowed = current->cpus_allowed; | 234 | old_allowed = current->cpus_allowed; |
235 | tmp = CPU_MASK_ALL; | 235 | cpus_setall(tmp); |
236 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
237 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed_ptr(current, &tmp); |
238 | 238 | ||
239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
240 | 240 | ||
@@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
268 | out_thread: | 268 | out_thread: |
269 | err = kthread_stop(p); | 269 | err = kthread_stop(p); |
270 | out_allowed: | 270 | out_allowed: |
271 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed_ptr(current, &old_allowed); |
272 | out_release: | 272 | out_release: |
273 | cpu_hotplug_done(); | 273 | cpu_hotplug_done(); |
274 | return err; | 274 | return err; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f414228..48a976c52cf5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -98,6 +98,9 @@ struct cpuset { | |||
98 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
99 | int pn; | 99 | int pn; |
100 | 100 | ||
101 | /* for custom sched domain */ | ||
102 | int relax_domain_level; | ||
103 | |||
101 | /* used for walking a cpuset heirarchy */ | 104 | /* used for walking a cpuset heirarchy */ |
102 | struct list_head stack_list; | 105 | struct list_head stack_list; |
103 | }; | 106 | }; |
@@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
478 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 481 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
479 | } | 482 | } |
480 | 483 | ||
484 | static void | ||
485 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | ||
486 | { | ||
487 | if (!dattr) | ||
488 | return; | ||
489 | if (dattr->relax_domain_level < c->relax_domain_level) | ||
490 | dattr->relax_domain_level = c->relax_domain_level; | ||
491 | return; | ||
492 | } | ||
493 | |||
481 | /* | 494 | /* |
482 | * rebuild_sched_domains() | 495 | * rebuild_sched_domains() |
483 | * | 496 | * |
@@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) | |||
553 | int csn; /* how many cpuset ptrs in csa so far */ | 566 | int csn; /* how many cpuset ptrs in csa so far */ |
554 | int i, j, k; /* indices for partition finding loops */ | 567 | int i, j, k; /* indices for partition finding loops */ |
555 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | 568 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ |
569 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | ||
556 | int ndoms; /* number of sched domains in result */ | 570 | int ndoms; /* number of sched domains in result */ |
557 | int nslot; /* next empty doms[] cpumask_t slot */ | 571 | int nslot; /* next empty doms[] cpumask_t slot */ |
558 | 572 | ||
559 | q = NULL; | 573 | q = NULL; |
560 | csa = NULL; | 574 | csa = NULL; |
561 | doms = NULL; | 575 | doms = NULL; |
576 | dattr = NULL; | ||
562 | 577 | ||
563 | /* Special case for the 99% of systems with one, full, sched domain */ | 578 | /* Special case for the 99% of systems with one, full, sched domain */ |
564 | if (is_sched_load_balance(&top_cpuset)) { | 579 | if (is_sched_load_balance(&top_cpuset)) { |
@@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) | |||
566 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 581 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
567 | if (!doms) | 582 | if (!doms) |
568 | goto rebuild; | 583 | goto rebuild; |
584 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
585 | if (dattr) { | ||
586 | *dattr = SD_ATTR_INIT; | ||
587 | update_domain_attr(dattr, &top_cpuset); | ||
588 | } | ||
569 | *doms = top_cpuset.cpus_allowed; | 589 | *doms = top_cpuset.cpus_allowed; |
570 | goto rebuild; | 590 | goto rebuild; |
571 | } | 591 | } |
@@ -622,6 +642,7 @@ restart: | |||
622 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 642 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
623 | if (!doms) | 643 | if (!doms) |
624 | goto rebuild; | 644 | goto rebuild; |
645 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | ||
625 | 646 | ||
626 | for (nslot = 0, i = 0; i < csn; i++) { | 647 | for (nslot = 0, i = 0; i < csn; i++) { |
627 | struct cpuset *a = csa[i]; | 648 | struct cpuset *a = csa[i]; |
@@ -644,12 +665,15 @@ restart: | |||
644 | } | 665 | } |
645 | 666 | ||
646 | cpus_clear(*dp); | 667 | cpus_clear(*dp); |
668 | if (dattr) | ||
669 | *(dattr + nslot) = SD_ATTR_INIT; | ||
647 | for (j = i; j < csn; j++) { | 670 | for (j = i; j < csn; j++) { |
648 | struct cpuset *b = csa[j]; | 671 | struct cpuset *b = csa[j]; |
649 | 672 | ||
650 | if (apn == b->pn) { | 673 | if (apn == b->pn) { |
651 | cpus_or(*dp, *dp, b->cpus_allowed); | 674 | cpus_or(*dp, *dp, b->cpus_allowed); |
652 | b->pn = -1; | 675 | b->pn = -1; |
676 | update_domain_attr(dattr, b); | ||
653 | } | 677 | } |
654 | } | 678 | } |
655 | nslot++; | 679 | nslot++; |
@@ -660,7 +684,7 @@ restart: | |||
660 | rebuild: | 684 | rebuild: |
661 | /* Have scheduler rebuild sched domains */ | 685 | /* Have scheduler rebuild sched domains */ |
662 | get_online_cpus(); | 686 | get_online_cpus(); |
663 | partition_sched_domains(ndoms, doms); | 687 | partition_sched_domains(ndoms, doms, dattr); |
664 | put_online_cpus(); | 688 | put_online_cpus(); |
665 | 689 | ||
666 | done: | 690 | done: |
@@ -668,6 +692,7 @@ done: | |||
668 | kfifo_free(q); | 692 | kfifo_free(q); |
669 | kfree(csa); | 693 | kfree(csa); |
670 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 694 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
695 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | ||
671 | } | 696 | } |
672 | 697 | ||
673 | static inline int started_after_time(struct task_struct *t1, | 698 | static inline int started_after_time(struct task_struct *t1, |
@@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | |||
729 | */ | 754 | */ |
730 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | 755 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) |
731 | { | 756 | { |
732 | set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); | 757 | set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); |
733 | } | 758 | } |
734 | 759 | ||
735 | /** | 760 | /** |
@@ -916,7 +941,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
916 | cs->mems_generation = cpuset_mems_generation++; | 941 | cs->mems_generation = cpuset_mems_generation++; |
917 | mutex_unlock(&callback_mutex); | 942 | mutex_unlock(&callback_mutex); |
918 | 943 | ||
919 | cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ | 944 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
920 | 945 | ||
921 | fudge = 10; /* spare mmarray[] slots */ | 946 | fudge = 10; /* spare mmarray[] slots */ |
922 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | 947 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ |
@@ -967,7 +992,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
967 | * rebind the vma mempolicies of each mm in mmarray[] to their | 992 | * rebind the vma mempolicies of each mm in mmarray[] to their |
968 | * new cpuset, and release that mm. The mpol_rebind_mm() | 993 | * new cpuset, and release that mm. The mpol_rebind_mm() |
969 | * call takes mmap_sem, which we couldn't take while holding | 994 | * call takes mmap_sem, which we couldn't take while holding |
970 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 995 | * tasklist_lock. Forks can happen again now - the mpol_dup() |
971 | * cpuset_being_rebound check will catch such forks, and rebind | 996 | * cpuset_being_rebound check will catch such forks, and rebind |
972 | * their vma mempolicies too. Because we still hold the global | 997 | * their vma mempolicies too. Because we still hold the global |
973 | * cgroup_mutex, we know that no other rebind effort will | 998 | * cgroup_mutex, we know that no other rebind effort will |
@@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
1011 | return 0; | 1036 | return 0; |
1012 | } | 1037 | } |
1013 | 1038 | ||
1039 | static int update_relax_domain_level(struct cpuset *cs, char *buf) | ||
1040 | { | ||
1041 | int val = simple_strtol(buf, NULL, 10); | ||
1042 | |||
1043 | if (val < 0) | ||
1044 | val = -1; | ||
1045 | |||
1046 | if (val != cs->relax_domain_level) { | ||
1047 | cs->relax_domain_level = val; | ||
1048 | rebuild_sched_domains(); | ||
1049 | } | ||
1050 | |||
1051 | return 0; | ||
1052 | } | ||
1053 | |||
1014 | /* | 1054 | /* |
1015 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1055 | * update_flag - read a 0 or a 1 in a file and update associated flag |
1016 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 1056 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
@@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1178 | 1218 | ||
1179 | mutex_lock(&callback_mutex); | 1219 | mutex_lock(&callback_mutex); |
1180 | guarantee_online_cpus(cs, &cpus); | 1220 | guarantee_online_cpus(cs, &cpus); |
1181 | set_cpus_allowed(tsk, cpus); | 1221 | set_cpus_allowed_ptr(tsk, &cpus); |
1182 | mutex_unlock(&callback_mutex); | 1222 | mutex_unlock(&callback_mutex); |
1183 | 1223 | ||
1184 | from = oldcs->mems_allowed; | 1224 | from = oldcs->mems_allowed; |
@@ -1202,6 +1242,7 @@ typedef enum { | |||
1202 | FILE_CPU_EXCLUSIVE, | 1242 | FILE_CPU_EXCLUSIVE, |
1203 | FILE_MEM_EXCLUSIVE, | 1243 | FILE_MEM_EXCLUSIVE, |
1204 | FILE_SCHED_LOAD_BALANCE, | 1244 | FILE_SCHED_LOAD_BALANCE, |
1245 | FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
1205 | FILE_MEMORY_PRESSURE_ENABLED, | 1246 | FILE_MEMORY_PRESSURE_ENABLED, |
1206 | FILE_MEMORY_PRESSURE, | 1247 | FILE_MEMORY_PRESSURE, |
1207 | FILE_SPREAD_PAGE, | 1248 | FILE_SPREAD_PAGE, |
@@ -1224,7 +1265,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
1224 | return -E2BIG; | 1265 | return -E2BIG; |
1225 | 1266 | ||
1226 | /* +1 for nul-terminator */ | 1267 | /* +1 for nul-terminator */ |
1227 | if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) | 1268 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); |
1269 | if (!buffer) | ||
1228 | return -ENOMEM; | 1270 | return -ENOMEM; |
1229 | 1271 | ||
1230 | if (copy_from_user(buffer, userbuf, nbytes)) { | 1272 | if (copy_from_user(buffer, userbuf, nbytes)) { |
@@ -1256,6 +1298,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
1256 | case FILE_SCHED_LOAD_BALANCE: | 1298 | case FILE_SCHED_LOAD_BALANCE: |
1257 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); | 1299 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); |
1258 | break; | 1300 | break; |
1301 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
1302 | retval = update_relax_domain_level(cs, buffer); | ||
1303 | break; | ||
1259 | case FILE_MEMORY_MIGRATE: | 1304 | case FILE_MEMORY_MIGRATE: |
1260 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1305 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); |
1261 | break; | 1306 | break; |
@@ -1354,6 +1399,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, | |||
1354 | case FILE_SCHED_LOAD_BALANCE: | 1399 | case FILE_SCHED_LOAD_BALANCE: |
1355 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; | 1400 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; |
1356 | break; | 1401 | break; |
1402 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | ||
1403 | s += sprintf(s, "%d", cs->relax_domain_level); | ||
1404 | break; | ||
1357 | case FILE_MEMORY_MIGRATE: | 1405 | case FILE_MEMORY_MIGRATE: |
1358 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | 1406 | *s++ = is_memory_migrate(cs) ? '1' : '0'; |
1359 | break; | 1407 | break; |
@@ -1424,6 +1472,13 @@ static struct cftype cft_sched_load_balance = { | |||
1424 | .private = FILE_SCHED_LOAD_BALANCE, | 1472 | .private = FILE_SCHED_LOAD_BALANCE, |
1425 | }; | 1473 | }; |
1426 | 1474 | ||
1475 | static struct cftype cft_sched_relax_domain_level = { | ||
1476 | .name = "sched_relax_domain_level", | ||
1477 | .read = cpuset_common_file_read, | ||
1478 | .write = cpuset_common_file_write, | ||
1479 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, | ||
1480 | }; | ||
1481 | |||
1427 | static struct cftype cft_memory_migrate = { | 1482 | static struct cftype cft_memory_migrate = { |
1428 | .name = "memory_migrate", | 1483 | .name = "memory_migrate", |
1429 | .read = cpuset_common_file_read, | 1484 | .read = cpuset_common_file_read, |
@@ -1475,6 +1530,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1475 | return err; | 1530 | return err; |
1476 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) | 1531 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) |
1477 | return err; | 1532 | return err; |
1533 | if ((err = cgroup_add_file(cont, ss, | ||
1534 | &cft_sched_relax_domain_level)) < 0) | ||
1535 | return err; | ||
1478 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) | 1536 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) |
1479 | return err; | 1537 | return err; |
1480 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) | 1538 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) |
@@ -1555,10 +1613,11 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1555 | if (is_spread_slab(parent)) | 1613 | if (is_spread_slab(parent)) |
1556 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1614 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1557 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1615 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1558 | cs->cpus_allowed = CPU_MASK_NONE; | 1616 | cpus_clear(cs->cpus_allowed); |
1559 | cs->mems_allowed = NODE_MASK_NONE; | 1617 | nodes_clear(cs->mems_allowed); |
1560 | cs->mems_generation = cpuset_mems_generation++; | 1618 | cs->mems_generation = cpuset_mems_generation++; |
1561 | fmeter_init(&cs->fmeter); | 1619 | fmeter_init(&cs->fmeter); |
1620 | cs->relax_domain_level = -1; | ||
1562 | 1621 | ||
1563 | cs->parent = parent; | 1622 | cs->parent = parent; |
1564 | number_of_cpusets++; | 1623 | number_of_cpusets++; |
@@ -1625,12 +1684,13 @@ int __init cpuset_init(void) | |||
1625 | { | 1684 | { |
1626 | int err = 0; | 1685 | int err = 0; |
1627 | 1686 | ||
1628 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1687 | cpus_setall(top_cpuset.cpus_allowed); |
1629 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1688 | nodes_setall(top_cpuset.mems_allowed); |
1630 | 1689 | ||
1631 | fmeter_init(&top_cpuset.fmeter); | 1690 | fmeter_init(&top_cpuset.fmeter); |
1632 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1691 | top_cpuset.mems_generation = cpuset_mems_generation++; |
1633 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1692 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
1693 | top_cpuset.relax_domain_level = -1; | ||
1634 | 1694 | ||
1635 | err = register_filesystem(&cpuset_fs_type); | 1695 | err = register_filesystem(&cpuset_fs_type); |
1636 | if (err < 0) | 1696 | if (err < 0) |
@@ -1844,6 +1904,7 @@ void __init cpuset_init_smp(void) | |||
1844 | 1904 | ||
1845 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 1905 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
1846 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 1906 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
1907 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | ||
1847 | * | 1908 | * |
1848 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | 1909 | * Description: Returns the cpumask_t cpus_allowed of the cpuset |
1849 | * attached to the specified @tsk. Guaranteed to return some non-empty | 1910 | * attached to the specified @tsk. Guaranteed to return some non-empty |
@@ -1851,35 +1912,27 @@ void __init cpuset_init_smp(void) | |||
1851 | * tasks cpuset. | 1912 | * tasks cpuset. |
1852 | **/ | 1913 | **/ |
1853 | 1914 | ||
1854 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | 1915 | void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) |
1855 | { | 1916 | { |
1856 | cpumask_t mask; | ||
1857 | |||
1858 | mutex_lock(&callback_mutex); | 1917 | mutex_lock(&callback_mutex); |
1859 | mask = cpuset_cpus_allowed_locked(tsk); | 1918 | cpuset_cpus_allowed_locked(tsk, pmask); |
1860 | mutex_unlock(&callback_mutex); | 1919 | mutex_unlock(&callback_mutex); |
1861 | |||
1862 | return mask; | ||
1863 | } | 1920 | } |
1864 | 1921 | ||
1865 | /** | 1922 | /** |
1866 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1923 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
1867 | * Must be called with callback_mutex held. | 1924 | * Must be called with callback_mutex held. |
1868 | **/ | 1925 | **/ |
1869 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1926 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) |
1870 | { | 1927 | { |
1871 | cpumask_t mask; | ||
1872 | |||
1873 | task_lock(tsk); | 1928 | task_lock(tsk); |
1874 | guarantee_online_cpus(task_cs(tsk), &mask); | 1929 | guarantee_online_cpus(task_cs(tsk), pmask); |
1875 | task_unlock(tsk); | 1930 | task_unlock(tsk); |
1876 | |||
1877 | return mask; | ||
1878 | } | 1931 | } |
1879 | 1932 | ||
1880 | void cpuset_init_current_mems_allowed(void) | 1933 | void cpuset_init_current_mems_allowed(void) |
1881 | { | 1934 | { |
1882 | current->mems_allowed = NODE_MASK_ALL; | 1935 | nodes_setall(current->mems_allowed); |
1883 | } | 1936 | } |
1884 | 1937 | ||
1885 | /** | 1938 | /** |
@@ -1906,22 +1959,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | |||
1906 | } | 1959 | } |
1907 | 1960 | ||
1908 | /** | 1961 | /** |
1909 | * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed | 1962 | * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed |
1910 | * @zl: the zonelist to be checked | 1963 | * @nodemask: the nodemask to be checked |
1911 | * | 1964 | * |
1912 | * Are any of the nodes on zonelist zl allowed in current->mems_allowed? | 1965 | * Are any of the nodes in the nodemask allowed in current->mems_allowed? |
1913 | */ | 1966 | */ |
1914 | int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | 1967 | int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) |
1915 | { | 1968 | { |
1916 | int i; | 1969 | return nodes_intersects(*nodemask, current->mems_allowed); |
1917 | |||
1918 | for (i = 0; zl->zones[i]; i++) { | ||
1919 | int nid = zone_to_nid(zl->zones[i]); | ||
1920 | |||
1921 | if (node_isset(nid, current->mems_allowed)) | ||
1922 | return 1; | ||
1923 | } | ||
1924 | return 0; | ||
1925 | } | 1970 | } |
1926 | 1971 | ||
1927 | /* | 1972 | /* |
@@ -2261,8 +2306,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | |||
2261 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, | 2306 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, |
2262 | task->cpus_allowed); | 2307 | task->cpus_allowed); |
2263 | seq_printf(m, "\n"); | 2308 | seq_printf(m, "\n"); |
2309 | seq_printf(m, "Cpus_allowed_list:\t"); | ||
2310 | m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, | ||
2311 | task->cpus_allowed); | ||
2312 | seq_printf(m, "\n"); | ||
2264 | seq_printf(m, "Mems_allowed:\t"); | 2313 | seq_printf(m, "Mems_allowed:\t"); |
2265 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, | 2314 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, |
2266 | task->mems_allowed); | 2315 | task->mems_allowed); |
2267 | seq_printf(m, "\n"); | 2316 | seq_printf(m, "\n"); |
2317 | seq_printf(m, "Mems_allowed_list:\t"); | ||
2318 | m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, | ||
2319 | task->mems_allowed); | ||
2320 | seq_printf(m, "\n"); | ||
2268 | } | 2321 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index 53872bf993fa..2a9d98c641ac 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -507,10 +507,9 @@ void put_files_struct(struct files_struct *files) | |||
507 | } | 507 | } |
508 | } | 508 | } |
509 | 509 | ||
510 | EXPORT_SYMBOL(put_files_struct); | 510 | void reset_files_struct(struct files_struct *files) |
511 | |||
512 | void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | ||
513 | { | 511 | { |
512 | struct task_struct *tsk = current; | ||
514 | struct files_struct *old; | 513 | struct files_struct *old; |
515 | 514 | ||
516 | old = tsk->files; | 515 | old = tsk->files; |
@@ -519,9 +518,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | |||
519 | task_unlock(tsk); | 518 | task_unlock(tsk); |
520 | put_files_struct(old); | 519 | put_files_struct(old); |
521 | } | 520 | } |
522 | EXPORT_SYMBOL(reset_files_struct); | ||
523 | 521 | ||
524 | static void __exit_files(struct task_struct *tsk) | 522 | void exit_files(struct task_struct *tsk) |
525 | { | 523 | { |
526 | struct files_struct * files = tsk->files; | 524 | struct files_struct * files = tsk->files; |
527 | 525 | ||
@@ -533,12 +531,7 @@ static void __exit_files(struct task_struct *tsk) | |||
533 | } | 531 | } |
534 | } | 532 | } |
535 | 533 | ||
536 | void exit_files(struct task_struct *tsk) | 534 | void put_fs_struct(struct fs_struct *fs) |
537 | { | ||
538 | __exit_files(tsk); | ||
539 | } | ||
540 | |||
541 | static void __put_fs_struct(struct fs_struct *fs) | ||
542 | { | 535 | { |
543 | /* No need to hold fs->lock if we are killing it */ | 536 | /* No need to hold fs->lock if we are killing it */ |
544 | if (atomic_dec_and_test(&fs->count)) { | 537 | if (atomic_dec_and_test(&fs->count)) { |
@@ -550,12 +543,7 @@ static void __put_fs_struct(struct fs_struct *fs) | |||
550 | } | 543 | } |
551 | } | 544 | } |
552 | 545 | ||
553 | void put_fs_struct(struct fs_struct *fs) | 546 | void exit_fs(struct task_struct *tsk) |
554 | { | ||
555 | __put_fs_struct(fs); | ||
556 | } | ||
557 | |||
558 | static void __exit_fs(struct task_struct *tsk) | ||
559 | { | 547 | { |
560 | struct fs_struct * fs = tsk->fs; | 548 | struct fs_struct * fs = tsk->fs; |
561 | 549 | ||
@@ -563,15 +551,10 @@ static void __exit_fs(struct task_struct *tsk) | |||
563 | task_lock(tsk); | 551 | task_lock(tsk); |
564 | tsk->fs = NULL; | 552 | tsk->fs = NULL; |
565 | task_unlock(tsk); | 553 | task_unlock(tsk); |
566 | __put_fs_struct(fs); | 554 | put_fs_struct(fs); |
567 | } | 555 | } |
568 | } | 556 | } |
569 | 557 | ||
570 | void exit_fs(struct task_struct *tsk) | ||
571 | { | ||
572 | __exit_fs(tsk); | ||
573 | } | ||
574 | |||
575 | EXPORT_SYMBOL_GPL(exit_fs); | 558 | EXPORT_SYMBOL_GPL(exit_fs); |
576 | 559 | ||
577 | /* | 560 | /* |
@@ -967,8 +950,8 @@ NORET_TYPE void do_exit(long code) | |||
967 | if (group_dead) | 950 | if (group_dead) |
968 | acct_process(); | 951 | acct_process(); |
969 | exit_sem(tsk); | 952 | exit_sem(tsk); |
970 | __exit_files(tsk); | 953 | exit_files(tsk); |
971 | __exit_fs(tsk); | 954 | exit_fs(tsk); |
972 | check_stack_usage(); | 955 | check_stack_usage(); |
973 | exit_thread(); | 956 | exit_thread(); |
974 | cgroup_exit(tsk, 1); | 957 | cgroup_exit(tsk, 1); |
@@ -984,7 +967,7 @@ NORET_TYPE void do_exit(long code) | |||
984 | proc_exit_connector(tsk); | 967 | proc_exit_connector(tsk); |
985 | exit_notify(tsk, group_dead); | 968 | exit_notify(tsk, group_dead); |
986 | #ifdef CONFIG_NUMA | 969 | #ifdef CONFIG_NUMA |
987 | mpol_free(tsk->mempolicy); | 970 | mpol_put(tsk->mempolicy); |
988 | tsk->mempolicy = NULL; | 971 | tsk->mempolicy = NULL; |
989 | #endif | 972 | #endif |
990 | #ifdef CONFIG_FUTEX | 973 | #ifdef CONFIG_FUTEX |
@@ -1608,7 +1591,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, | |||
1608 | put_pid(pid); | 1591 | put_pid(pid); |
1609 | 1592 | ||
1610 | /* avoid REGPARM breakage on x86: */ | 1593 | /* avoid REGPARM breakage on x86: */ |
1611 | prevent_tail_call(ret); | 1594 | asmlinkage_protect(5, ret, which, upid, infop, options, ru); |
1612 | return ret; | 1595 | return ret; |
1613 | } | 1596 | } |
1614 | 1597 | ||
@@ -1640,7 +1623,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, | |||
1640 | put_pid(pid); | 1623 | put_pid(pid); |
1641 | 1624 | ||
1642 | /* avoid REGPARM breakage on x86: */ | 1625 | /* avoid REGPARM breakage on x86: */ |
1643 | prevent_tail_call(ret); | 1626 | asmlinkage_protect(4, ret, upid, stat_addr, options, ru); |
1644 | return ret; | 1627 | return ret; |
1645 | } | 1628 | } |
1646 | 1629 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 9c042f901570..6067e429f281 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -132,6 +132,14 @@ void __put_task_struct(struct task_struct *tsk) | |||
132 | free_task(tsk); | 132 | free_task(tsk); |
133 | } | 133 | } |
134 | 134 | ||
135 | /* | ||
136 | * macro override instead of weak attribute alias, to workaround | ||
137 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
138 | */ | ||
139 | #ifndef arch_task_cache_init | ||
140 | #define arch_task_cache_init() | ||
141 | #endif | ||
142 | |||
135 | void __init fork_init(unsigned long mempages) | 143 | void __init fork_init(unsigned long mempages) |
136 | { | 144 | { |
137 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 145 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
@@ -144,6 +152,9 @@ void __init fork_init(unsigned long mempages) | |||
144 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); | 152 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); |
145 | #endif | 153 | #endif |
146 | 154 | ||
155 | /* do the arch specific task caches init */ | ||
156 | arch_task_cache_init(); | ||
157 | |||
147 | /* | 158 | /* |
148 | * The default maximum number of threads is set to a safe | 159 | * The default maximum number of threads is set to a safe |
149 | * value: the thread structures can take up at most half | 160 | * value: the thread structures can take up at most half |
@@ -163,6 +174,13 @@ void __init fork_init(unsigned long mempages) | |||
163 | init_task.signal->rlim[RLIMIT_NPROC]; | 174 | init_task.signal->rlim[RLIMIT_NPROC]; |
164 | } | 175 | } |
165 | 176 | ||
177 | int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, | ||
178 | struct task_struct *src) | ||
179 | { | ||
180 | *dst = *src; | ||
181 | return 0; | ||
182 | } | ||
183 | |||
166 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 184 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
167 | { | 185 | { |
168 | struct task_struct *tsk; | 186 | struct task_struct *tsk; |
@@ -181,15 +199,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
181 | return NULL; | 199 | return NULL; |
182 | } | 200 | } |
183 | 201 | ||
184 | *tsk = *orig; | 202 | err = arch_dup_task_struct(tsk, orig); |
203 | if (err) | ||
204 | goto out; | ||
205 | |||
185 | tsk->stack = ti; | 206 | tsk->stack = ti; |
186 | 207 | ||
187 | err = prop_local_init_single(&tsk->dirties); | 208 | err = prop_local_init_single(&tsk->dirties); |
188 | if (err) { | 209 | if (err) |
189 | free_thread_info(ti); | 210 | goto out; |
190 | free_task_struct(tsk); | ||
191 | return NULL; | ||
192 | } | ||
193 | 211 | ||
194 | setup_thread_stack(tsk, orig); | 212 | setup_thread_stack(tsk, orig); |
195 | 213 | ||
@@ -205,6 +223,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
205 | #endif | 223 | #endif |
206 | tsk->splice_pipe = NULL; | 224 | tsk->splice_pipe = NULL; |
207 | return tsk; | 225 | return tsk; |
226 | |||
227 | out: | ||
228 | free_thread_info(ti); | ||
229 | free_task_struct(tsk); | ||
230 | return NULL; | ||
208 | } | 231 | } |
209 | 232 | ||
210 | #ifdef CONFIG_MMU | 233 | #ifdef CONFIG_MMU |
@@ -256,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
256 | if (!tmp) | 279 | if (!tmp) |
257 | goto fail_nomem; | 280 | goto fail_nomem; |
258 | *tmp = *mpnt; | 281 | *tmp = *mpnt; |
259 | pol = mpol_copy(vma_policy(mpnt)); | 282 | pol = mpol_dup(vma_policy(mpnt)); |
260 | retval = PTR_ERR(pol); | 283 | retval = PTR_ERR(pol); |
261 | if (IS_ERR(pol)) | 284 | if (IS_ERR(pol)) |
262 | goto fail_nomem_policy; | 285 | goto fail_nomem_policy; |
@@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
498 | * Allocate a new mm structure and copy contents from the | 521 | * Allocate a new mm structure and copy contents from the |
499 | * mm structure of the passed in task structure. | 522 | * mm structure of the passed in task structure. |
500 | */ | 523 | */ |
501 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 524 | struct mm_struct *dup_mm(struct task_struct *tsk) |
502 | { | 525 | { |
503 | struct mm_struct *mm, *oldmm = current->mm; | 526 | struct mm_struct *mm, *oldmm = current->mm; |
504 | int err; | 527 | int err; |
@@ -782,12 +805,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
782 | goto out; | 805 | goto out; |
783 | } | 806 | } |
784 | 807 | ||
785 | /* | ||
786 | * Note: we may be using current for both targets (See exec.c) | ||
787 | * This works because we cache current->files (old) as oldf. Don't | ||
788 | * break this. | ||
789 | */ | ||
790 | tsk->files = NULL; | ||
791 | newf = dup_fd(oldf, &error); | 808 | newf = dup_fd(oldf, &error); |
792 | if (!newf) | 809 | if (!newf) |
793 | goto out; | 810 | goto out; |
@@ -823,34 +840,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
823 | return 0; | 840 | return 0; |
824 | } | 841 | } |
825 | 842 | ||
826 | /* | ||
827 | * Helper to unshare the files of the current task. | ||
828 | * We don't want to expose copy_files internals to | ||
829 | * the exec layer of the kernel. | ||
830 | */ | ||
831 | |||
832 | int unshare_files(void) | ||
833 | { | ||
834 | struct files_struct *files = current->files; | ||
835 | int rc; | ||
836 | |||
837 | BUG_ON(!files); | ||
838 | |||
839 | /* This can race but the race causes us to copy when we don't | ||
840 | need to and drop the copy */ | ||
841 | if(atomic_read(&files->count) == 1) | ||
842 | { | ||
843 | atomic_inc(&files->count); | ||
844 | return 0; | ||
845 | } | ||
846 | rc = copy_files(0, current); | ||
847 | if(rc) | ||
848 | current->files = files; | ||
849 | return rc; | ||
850 | } | ||
851 | |||
852 | EXPORT_SYMBOL(unshare_files); | ||
853 | |||
854 | static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | 843 | static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) |
855 | { | 844 | { |
856 | struct sighand_struct *sig; | 845 | struct sighand_struct *sig; |
@@ -1127,7 +1116,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1127 | p->audit_context = NULL; | 1116 | p->audit_context = NULL; |
1128 | cgroup_fork(p); | 1117 | cgroup_fork(p); |
1129 | #ifdef CONFIG_NUMA | 1118 | #ifdef CONFIG_NUMA |
1130 | p->mempolicy = mpol_copy(p->mempolicy); | 1119 | p->mempolicy = mpol_dup(p->mempolicy); |
1131 | if (IS_ERR(p->mempolicy)) { | 1120 | if (IS_ERR(p->mempolicy)) { |
1132 | retval = PTR_ERR(p->mempolicy); | 1121 | retval = PTR_ERR(p->mempolicy); |
1133 | p->mempolicy = NULL; | 1122 | p->mempolicy = NULL; |
@@ -1385,7 +1374,7 @@ bad_fork_cleanup_security: | |||
1385 | security_task_free(p); | 1374 | security_task_free(p); |
1386 | bad_fork_cleanup_policy: | 1375 | bad_fork_cleanup_policy: |
1387 | #ifdef CONFIG_NUMA | 1376 | #ifdef CONFIG_NUMA |
1388 | mpol_free(p->mempolicy); | 1377 | mpol_put(p->mempolicy); |
1389 | bad_fork_cleanup_cgroup: | 1378 | bad_fork_cleanup_cgroup: |
1390 | #endif | 1379 | #endif |
1391 | cgroup_exit(p, cgroup_callbacks_done); | 1380 | cgroup_exit(p, cgroup_callbacks_done); |
@@ -1788,3 +1777,27 @@ bad_unshare_cleanup_thread: | |||
1788 | bad_unshare_out: | 1777 | bad_unshare_out: |
1789 | return err; | 1778 | return err; |
1790 | } | 1779 | } |
1780 | |||
1781 | /* | ||
1782 | * Helper to unshare the files of the current task. | ||
1783 | * We don't want to expose copy_files internals to | ||
1784 | * the exec layer of the kernel. | ||
1785 | */ | ||
1786 | |||
1787 | int unshare_files(struct files_struct **displaced) | ||
1788 | { | ||
1789 | struct task_struct *task = current; | ||
1790 | struct files_struct *copy = NULL; | ||
1791 | int error; | ||
1792 | |||
1793 | error = unshare_fd(CLONE_FILES, &copy); | ||
1794 | if (error || !copy) { | ||
1795 | *displaced = NULL; | ||
1796 | return error; | ||
1797 | } | ||
1798 | *displaced = task->files; | ||
1799 | task_lock(task); | ||
1800 | task->files = copy; | ||
1801 | task_unlock(task); | ||
1802 | return 0; | ||
1803 | } | ||
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 98bee013f71f..dea4c9124ac8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
590 | list_add_tail(&timer->cb_entry, | 590 | list_add_tail(&timer->cb_entry, |
591 | &base->cpu_base->cb_pending); | 591 | &base->cpu_base->cb_pending); |
592 | timer->state = HRTIMER_STATE_PENDING; | 592 | timer->state = HRTIMER_STATE_PENDING; |
593 | raise_softirq(HRTIMER_SOFTIRQ); | ||
594 | return 1; | 593 | return 1; |
595 | default: | 594 | default: |
596 | BUG(); | 595 | BUG(); |
@@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void) | |||
633 | return 1; | 632 | return 1; |
634 | } | 633 | } |
635 | 634 | ||
635 | static inline void hrtimer_raise_softirq(void) | ||
636 | { | ||
637 | raise_softirq(HRTIMER_SOFTIRQ); | ||
638 | } | ||
639 | |||
636 | #else | 640 | #else |
637 | 641 | ||
638 | static inline int hrtimer_hres_active(void) { return 0; } | 642 | static inline int hrtimer_hres_active(void) { return 0; } |
@@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, | |||
651 | { | 655 | { |
652 | return 0; | 656 | return 0; |
653 | } | 657 | } |
658 | static inline void hrtimer_raise_softirq(void) { } | ||
654 | 659 | ||
655 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 660 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
656 | 661 | ||
@@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
850 | { | 855 | { |
851 | struct hrtimer_clock_base *base, *new_base; | 856 | struct hrtimer_clock_base *base, *new_base; |
852 | unsigned long flags; | 857 | unsigned long flags; |
853 | int ret; | 858 | int ret, raise; |
854 | 859 | ||
855 | base = lock_hrtimer_base(timer, &flags); | 860 | base = lock_hrtimer_base(timer, &flags); |
856 | 861 | ||
@@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
884 | enqueue_hrtimer(timer, new_base, | 889 | enqueue_hrtimer(timer, new_base, |
885 | new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); | 890 | new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); |
886 | 891 | ||
892 | /* | ||
893 | * The timer may be expired and moved to the cb_pending | ||
894 | * list. We can not raise the softirq with base lock held due | ||
895 | * to a possible deadlock with runqueue lock. | ||
896 | */ | ||
897 | raise = timer->state == HRTIMER_STATE_PENDING; | ||
898 | |||
887 | unlock_hrtimer_base(timer, &flags); | 899 | unlock_hrtimer_base(timer, &flags); |
888 | 900 | ||
901 | if (raise) | ||
902 | hrtimer_raise_softirq(); | ||
903 | |||
889 | return ret; | 904 | return ret; |
890 | } | 905 | } |
891 | EXPORT_SYMBOL_GPL(hrtimer_start); | 906 | EXPORT_SYMBOL_GPL(hrtimer_start); |
@@ -1080,8 +1095,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | |||
1080 | * If the timer was rearmed on another CPU, reprogram | 1095 | * If the timer was rearmed on another CPU, reprogram |
1081 | * the event device. | 1096 | * the event device. |
1082 | */ | 1097 | */ |
1083 | if (timer->base->first == &timer->node) | 1098 | struct hrtimer_clock_base *base = timer->base; |
1084 | hrtimer_reprogram(timer, timer->base); | 1099 | |
1100 | if (base->first == &timer->node && | ||
1101 | hrtimer_reprogram(timer, base)) { | ||
1102 | /* | ||
1103 | * Timer is expired. Thus move it from tree to | ||
1104 | * pending list again. | ||
1105 | */ | ||
1106 | __remove_hrtimer(timer, base, | ||
1107 | HRTIMER_STATE_PENDING, 0); | ||
1108 | list_add_tail(&timer->cb_entry, | ||
1109 | &base->cpu_base->cb_pending); | ||
1110 | } | ||
1085 | } | 1111 | } |
1086 | } | 1112 | } |
1087 | spin_unlock_irq(&cpu_base->lock); | 1113 | spin_unlock_irq(&cpu_base->lock); |
@@ -1238,51 +1264,50 @@ void hrtimer_run_pending(void) | |||
1238 | /* | 1264 | /* |
1239 | * Called from hardirq context every jiffy | 1265 | * Called from hardirq context every jiffy |
1240 | */ | 1266 | */ |
1241 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1267 | void hrtimer_run_queues(void) |
1242 | int index) | ||
1243 | { | 1268 | { |
1244 | struct rb_node *node; | 1269 | struct rb_node *node; |
1245 | struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; | 1270 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1271 | struct hrtimer_clock_base *base; | ||
1272 | int index, gettime = 1; | ||
1246 | 1273 | ||
1247 | if (!base->first) | 1274 | if (hrtimer_hres_active()) |
1248 | return; | 1275 | return; |
1249 | 1276 | ||
1250 | if (base->get_softirq_time) | 1277 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
1251 | base->softirq_time = base->get_softirq_time(); | 1278 | base = &cpu_base->clock_base[index]; |
1252 | |||
1253 | spin_lock(&cpu_base->lock); | ||
1254 | |||
1255 | while ((node = base->first)) { | ||
1256 | struct hrtimer *timer; | ||
1257 | |||
1258 | timer = rb_entry(node, struct hrtimer, node); | ||
1259 | if (base->softirq_time.tv64 <= timer->expires.tv64) | ||
1260 | break; | ||
1261 | 1279 | ||
1262 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { | 1280 | if (!base->first) |
1263 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); | ||
1264 | list_add_tail(&timer->cb_entry, | ||
1265 | &base->cpu_base->cb_pending); | ||
1266 | continue; | 1281 | continue; |
1282 | |||
1283 | if (base->get_softirq_time) | ||
1284 | base->softirq_time = base->get_softirq_time(); | ||
1285 | else if (gettime) { | ||
1286 | hrtimer_get_softirq_time(cpu_base); | ||
1287 | gettime = 0; | ||
1267 | } | 1288 | } |
1268 | 1289 | ||
1269 | __run_hrtimer(timer); | 1290 | spin_lock(&cpu_base->lock); |
1270 | } | ||
1271 | spin_unlock(&cpu_base->lock); | ||
1272 | } | ||
1273 | 1291 | ||
1274 | void hrtimer_run_queues(void) | 1292 | while ((node = base->first)) { |
1275 | { | 1293 | struct hrtimer *timer; |
1276 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1277 | int i; | ||
1278 | 1294 | ||
1279 | if (hrtimer_hres_active()) | 1295 | timer = rb_entry(node, struct hrtimer, node); |
1280 | return; | 1296 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
1297 | break; | ||
1281 | 1298 | ||
1282 | hrtimer_get_softirq_time(cpu_base); | 1299 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
1300 | __remove_hrtimer(timer, base, | ||
1301 | HRTIMER_STATE_PENDING, 0); | ||
1302 | list_add_tail(&timer->cb_entry, | ||
1303 | &base->cpu_base->cb_pending); | ||
1304 | continue; | ||
1305 | } | ||
1283 | 1306 | ||
1284 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1307 | __run_hrtimer(timer); |
1285 | run_hrtimer_queue(cpu_base, i); | 1308 | } |
1309 | spin_unlock(&cpu_base->lock); | ||
1310 | } | ||
1286 | } | 1311 | } |
1287 | 1312 | ||
1288 | /* | 1313 | /* |
@@ -1354,13 +1379,13 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
1354 | struct hrtimer_sleeper t; | 1379 | struct hrtimer_sleeper t; |
1355 | struct timespec __user *rmtp; | 1380 | struct timespec __user *rmtp; |
1356 | 1381 | ||
1357 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); | 1382 | hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); |
1358 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; | 1383 | t.timer.expires.tv64 = restart->nanosleep.expires; |
1359 | 1384 | ||
1360 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) | 1385 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) |
1361 | return 0; | 1386 | return 0; |
1362 | 1387 | ||
1363 | rmtp = (struct timespec __user *)restart->arg1; | 1388 | rmtp = restart->nanosleep.rmtp; |
1364 | if (rmtp) { | 1389 | if (rmtp) { |
1365 | int ret = update_rmtp(&t.timer, rmtp); | 1390 | int ret = update_rmtp(&t.timer, rmtp); |
1366 | if (ret <= 0) | 1391 | if (ret <= 0) |
@@ -1394,10 +1419,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1394 | 1419 | ||
1395 | restart = &current_thread_info()->restart_block; | 1420 | restart = &current_thread_info()->restart_block; |
1396 | restart->fn = hrtimer_nanosleep_restart; | 1421 | restart->fn = hrtimer_nanosleep_restart; |
1397 | restart->arg0 = (unsigned long) t.timer.base->index; | 1422 | restart->nanosleep.index = t.timer.base->index; |
1398 | restart->arg1 = (unsigned long) rmtp; | 1423 | restart->nanosleep.rmtp = rmtp; |
1399 | restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; | 1424 | restart->nanosleep.expires = t.timer.expires.tv64; |
1400 | restart->arg3 = t.timer.expires.tv64 >> 32; | ||
1401 | 1425 | ||
1402 | return -ERESTART_RESTARTBLOCK; | 1426 | return -ERESTART_RESTARTBLOCK; |
1403 | } | 1427 | } |
@@ -1425,7 +1449,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1425 | int i; | 1449 | int i; |
1426 | 1450 | ||
1427 | spin_lock_init(&cpu_base->lock); | 1451 | spin_lock_init(&cpu_base->lock); |
1428 | lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); | ||
1429 | 1452 | ||
1430 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1453 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1431 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1454 | cpu_base->clock_base[i].cpu_base = cpu_base; |
@@ -1466,16 +1489,16 @@ static void migrate_hrtimers(int cpu) | |||
1466 | tick_cancel_sched_timer(cpu); | 1489 | tick_cancel_sched_timer(cpu); |
1467 | 1490 | ||
1468 | local_irq_disable(); | 1491 | local_irq_disable(); |
1469 | double_spin_lock(&new_base->lock, &old_base->lock, | 1492 | spin_lock(&new_base->lock); |
1470 | smp_processor_id() < cpu); | 1493 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
1471 | 1494 | ||
1472 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1495 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1473 | migrate_hrtimer_list(&old_base->clock_base[i], | 1496 | migrate_hrtimer_list(&old_base->clock_base[i], |
1474 | &new_base->clock_base[i]); | 1497 | &new_base->clock_base[i]); |
1475 | } | 1498 | } |
1476 | 1499 | ||
1477 | double_spin_unlock(&new_base->lock, &old_base->lock, | 1500 | spin_unlock(&old_base->lock); |
1478 | smp_processor_id() < cpu); | 1501 | spin_unlock(&new_base->lock); |
1479 | local_irq_enable(); | 1502 | local_irq_enable(); |
1480 | put_cpu_var(hrtimer_bases); | 1503 | put_cpu_var(hrtimer_bases); |
1481 | } | 1504 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c4..964964baefa2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) | |||
47 | desc->irq_count = 0; | 47 | desc->irq_count = 0; |
48 | desc->irqs_unhandled = 0; | 48 | desc->irqs_unhandled = 0; |
49 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
50 | desc->affinity = CPU_MASK_ALL; | 50 | cpus_setall(desc->affinity); |
51 | #endif | 51 | #endif |
52 | spin_unlock_irqrestore(&desc->lock, flags); | 52 | spin_unlock_irqrestore(&desc->lock, flags); |
53 | } | 53 | } |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 06a0e2775651..cb85c79989b4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
30 | #include <asm/io.h> | 30 | #include <asm/io.h> |
31 | #include <asm/system.h> | 31 | #include <asm/system.h> |
32 | #include <asm/semaphore.h> | ||
33 | #include <asm/sections.h> | 32 | #include <asm/sections.h> |
34 | 33 | ||
35 | /* Per cpu memory for storing cpu states in case of system crash. */ | 34 | /* Per cpu memory for storing cpu states in case of system crash. */ |
@@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | 1405 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); |
1407 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | 1406 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); |
1408 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | 1407 | VMCOREINFO_NUMBER(NR_FREE_PAGES); |
1408 | VMCOREINFO_NUMBER(PG_lru); | ||
1409 | VMCOREINFO_NUMBER(PG_private); | ||
1410 | VMCOREINFO_NUMBER(PG_swapcache); | ||
1409 | 1411 | ||
1410 | arch_crash_save_vmcoreinfo(); | 1412 | arch_crash_save_vmcoreinfo(); |
1411 | 1413 | ||
diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 000000000000..1bd0ec1c80b2 --- /dev/null +++ b/kernel/kgdb.c | |||
@@ -0,0 +1,1700 @@ | |||
1 | /* | ||
2 | * KGDB stub. | ||
3 | * | ||
4 | * Maintainer: Jason Wessel <jason.wessel@windriver.com> | ||
5 | * | ||
6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. | ||
7 | * Copyright (C) 2002-2004 Timesys Corporation | ||
8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> | ||
9 | * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> | ||
10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> | ||
11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. | ||
12 | * Copyright (C) 2005-2008 Wind River Systems, Inc. | ||
13 | * Copyright (C) 2007 MontaVista Software, Inc. | ||
14 | * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
15 | * | ||
16 | * Contributors at various stages not listed above: | ||
17 | * Jason Wessel ( jason.wessel@windriver.com ) | ||
18 | * George Anzinger <george@mvista.com> | ||
19 | * Anurekh Saxena (anurekh.saxena@timesys.com) | ||
20 | * Lake Stevens Instrument Division (Glenn Engel) | ||
21 | * Jim Kingdon, Cygnus Support. | ||
22 | * | ||
23 | * Original KGDB stub: David Grothe <dave@gcom.com>, | ||
24 | * Tigran Aivazian <tigran@sco.com> | ||
25 | * | ||
26 | * This file is licensed under the terms of the GNU General Public License | ||
27 | * version 2. This program is licensed "as is" without any warranty of any | ||
28 | * kind, whether express or implied. | ||
29 | */ | ||
30 | #include <linux/pid_namespace.h> | ||
31 | #include <linux/clocksource.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/console.h> | ||
35 | #include <linux/threads.h> | ||
36 | #include <linux/uaccess.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/module.h> | ||
39 | #include <linux/ptrace.h> | ||
40 | #include <linux/reboot.h> | ||
41 | #include <linux/string.h> | ||
42 | #include <linux/delay.h> | ||
43 | #include <linux/sched.h> | ||
44 | #include <linux/sysrq.h> | ||
45 | #include <linux/init.h> | ||
46 | #include <linux/kgdb.h> | ||
47 | #include <linux/pid.h> | ||
48 | #include <linux/smp.h> | ||
49 | #include <linux/mm.h> | ||
50 | |||
51 | #include <asm/cacheflush.h> | ||
52 | #include <asm/byteorder.h> | ||
53 | #include <asm/atomic.h> | ||
54 | #include <asm/system.h> | ||
55 | |||
56 | static int kgdb_break_asap; | ||
57 | |||
58 | struct kgdb_state { | ||
59 | int ex_vector; | ||
60 | int signo; | ||
61 | int err_code; | ||
62 | int cpu; | ||
63 | int pass_exception; | ||
64 | long threadid; | ||
65 | long kgdb_usethreadid; | ||
66 | struct pt_regs *linux_regs; | ||
67 | }; | ||
68 | |||
69 | static struct debuggerinfo_struct { | ||
70 | void *debuggerinfo; | ||
71 | struct task_struct *task; | ||
72 | } kgdb_info[NR_CPUS]; | ||
73 | |||
74 | /** | ||
75 | * kgdb_connected - Is a host GDB connected to us? | ||
76 | */ | ||
77 | int kgdb_connected; | ||
78 | EXPORT_SYMBOL_GPL(kgdb_connected); | ||
79 | |||
80 | /* All the KGDB handlers are installed */ | ||
81 | static int kgdb_io_module_registered; | ||
82 | |||
83 | /* Guard for recursive entry */ | ||
84 | static int exception_level; | ||
85 | |||
86 | static struct kgdb_io *kgdb_io_ops; | ||
87 | static DEFINE_SPINLOCK(kgdb_registration_lock); | ||
88 | |||
89 | /* kgdb console driver is loaded */ | ||
90 | static int kgdb_con_registered; | ||
91 | /* determine if kgdb console output should be used */ | ||
92 | static int kgdb_use_con; | ||
93 | |||
94 | static int __init opt_kgdb_con(char *str) | ||
95 | { | ||
96 | kgdb_use_con = 1; | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | early_param("kgdbcon", opt_kgdb_con); | ||
101 | |||
102 | module_param(kgdb_use_con, int, 0644); | ||
103 | |||
104 | /* | ||
105 | * Holds information about breakpoints in a kernel. These breakpoints are | ||
106 | * added and removed by gdb. | ||
107 | */ | ||
108 | static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { | ||
109 | [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } | ||
110 | }; | ||
111 | |||
112 | /* | ||
113 | * The CPU# of the active CPU, or -1 if none: | ||
114 | */ | ||
115 | atomic_t kgdb_active = ATOMIC_INIT(-1); | ||
116 | |||
117 | /* | ||
118 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early | ||
119 | * bootup code (which might not have percpu set up yet): | ||
120 | */ | ||
121 | static atomic_t passive_cpu_wait[NR_CPUS]; | ||
122 | static atomic_t cpu_in_kgdb[NR_CPUS]; | ||
123 | atomic_t kgdb_setting_breakpoint; | ||
124 | |||
125 | struct task_struct *kgdb_usethread; | ||
126 | struct task_struct *kgdb_contthread; | ||
127 | |||
128 | int kgdb_single_step; | ||
129 | |||
130 | /* Our I/O buffers. */ | ||
131 | static char remcom_in_buffer[BUFMAX]; | ||
132 | static char remcom_out_buffer[BUFMAX]; | ||
133 | |||
134 | /* Storage for the registers, in GDB format. */ | ||
135 | static unsigned long gdb_regs[(NUMREGBYTES + | ||
136 | sizeof(unsigned long) - 1) / | ||
137 | sizeof(unsigned long)]; | ||
138 | |||
139 | /* to keep track of the CPU which is doing the single stepping*/ | ||
140 | atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); | ||
141 | |||
142 | /* | ||
143 | * If you are debugging a problem where roundup (the collection of | ||
144 | * all other CPUs) is a problem [this should be extremely rare], | ||
145 | * then use the nokgdbroundup option to avoid roundup. In that case | ||
146 | * the other CPUs might interfere with your debugging context, so | ||
147 | * use this with care: | ||
148 | */ | ||
149 | int kgdb_do_roundup = 1; | ||
150 | |||
151 | static int __init opt_nokgdbroundup(char *str) | ||
152 | { | ||
153 | kgdb_do_roundup = 0; | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | early_param("nokgdbroundup", opt_nokgdbroundup); | ||
159 | |||
160 | /* | ||
161 | * Finally, some KGDB code :-) | ||
162 | */ | ||
163 | |||
164 | /* | ||
165 | * Weak aliases for breakpoint management, | ||
166 | * can be overriden by architectures when needed: | ||
167 | */ | ||
168 | int __weak kgdb_validate_break_address(unsigned long addr) | ||
169 | { | ||
170 | char tmp_variable[BREAK_INSTR_SIZE]; | ||
171 | |||
172 | return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); | ||
173 | } | ||
174 | |||
175 | int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) | ||
176 | { | ||
177 | int err; | ||
178 | |||
179 | err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); | ||
180 | if (err) | ||
181 | return err; | ||
182 | |||
183 | return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, | ||
184 | BREAK_INSTR_SIZE); | ||
185 | } | ||
186 | |||
187 | int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) | ||
188 | { | ||
189 | return probe_kernel_write((char *)addr, | ||
190 | (char *)bundle, BREAK_INSTR_SIZE); | ||
191 | } | ||
192 | |||
193 | unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) | ||
194 | { | ||
195 | return instruction_pointer(regs); | ||
196 | } | ||
197 | |||
198 | int __weak kgdb_arch_init(void) | ||
199 | { | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | int __weak kgdb_skipexception(int exception, struct pt_regs *regs) | ||
204 | { | ||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | void __weak | ||
209 | kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) | ||
210 | { | ||
211 | return; | ||
212 | } | ||
213 | |||
214 | /** | ||
215 | * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. | ||
216 | * @regs: Current &struct pt_regs. | ||
217 | * | ||
218 | * This function will be called if the particular architecture must | ||
219 | * disable hardware debugging while it is processing gdb packets or | ||
220 | * handling exception. | ||
221 | */ | ||
222 | void __weak kgdb_disable_hw_debug(struct pt_regs *regs) | ||
223 | { | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * GDB remote protocol parser: | ||
228 | */ | ||
229 | |||
230 | static const char hexchars[] = "0123456789abcdef"; | ||
231 | |||
232 | static int hex(char ch) | ||
233 | { | ||
234 | if ((ch >= 'a') && (ch <= 'f')) | ||
235 | return ch - 'a' + 10; | ||
236 | if ((ch >= '0') && (ch <= '9')) | ||
237 | return ch - '0'; | ||
238 | if ((ch >= 'A') && (ch <= 'F')) | ||
239 | return ch - 'A' + 10; | ||
240 | return -1; | ||
241 | } | ||
242 | |||
243 | /* scan for the sequence $<data>#<checksum> */ | ||
244 | static void get_packet(char *buffer) | ||
245 | { | ||
246 | unsigned char checksum; | ||
247 | unsigned char xmitcsum; | ||
248 | int count; | ||
249 | char ch; | ||
250 | |||
251 | do { | ||
252 | /* | ||
253 | * Spin and wait around for the start character, ignore all | ||
254 | * other characters: | ||
255 | */ | ||
256 | while ((ch = (kgdb_io_ops->read_char())) != '$') | ||
257 | /* nothing */; | ||
258 | |||
259 | kgdb_connected = 1; | ||
260 | checksum = 0; | ||
261 | xmitcsum = -1; | ||
262 | |||
263 | count = 0; | ||
264 | |||
265 | /* | ||
266 | * now, read until a # or end of buffer is found: | ||
267 | */ | ||
268 | while (count < (BUFMAX - 1)) { | ||
269 | ch = kgdb_io_ops->read_char(); | ||
270 | if (ch == '#') | ||
271 | break; | ||
272 | checksum = checksum + ch; | ||
273 | buffer[count] = ch; | ||
274 | count = count + 1; | ||
275 | } | ||
276 | buffer[count] = 0; | ||
277 | |||
278 | if (ch == '#') { | ||
279 | xmitcsum = hex(kgdb_io_ops->read_char()) << 4; | ||
280 | xmitcsum += hex(kgdb_io_ops->read_char()); | ||
281 | |||
282 | if (checksum != xmitcsum) | ||
283 | /* failed checksum */ | ||
284 | kgdb_io_ops->write_char('-'); | ||
285 | else | ||
286 | /* successful transfer */ | ||
287 | kgdb_io_ops->write_char('+'); | ||
288 | if (kgdb_io_ops->flush) | ||
289 | kgdb_io_ops->flush(); | ||
290 | } | ||
291 | } while (checksum != xmitcsum); | ||
292 | } | ||
293 | |||
/*
 * Send the NUL-terminated string in buffer as a packet,
 * $<packet info>#<checksum>, and wait for the peer's reply.
 *
 * The packet is retransmitted until the peer answers '+'.  If the peer
 * answers with '$' (the start of a new packet) a '-' is written and the
 * send is abandoned.  The checksum is the 8-bit sum of the payload
 * characters, sent as two lowercase hex digits.
 */
static void put_packet(char *buffer)
{
	unsigned char checksum;
	int count;
	char ch;

	/*
	 * $<packet info>#<checksum>.
	 */
	while (1) {
		kgdb_io_ops->write_char('$');
		checksum = 0;
		count = 0;

		/* Emit the payload while accumulating its checksum. */
		while ((ch = buffer[count])) {
			kgdb_io_ops->write_char(ch);
			checksum += ch;
			count++;
		}

		kgdb_io_ops->write_char('#');
		kgdb_io_ops->write_char(hexchars[checksum >> 4]);
		kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
		/* flush is an optional hook of the I/O driver */
		if (kgdb_io_ops->flush)
			kgdb_io_ops->flush();

		/* Now see what we get in reply. */
		ch = kgdb_io_ops->read_char();

		/*
		 * A reply byte of 3 is skipped and the next character is used
		 * instead (presumably the Ctrl-C break character sent by GDB -
		 * confirm).
		 */
		if (ch == 3)
			ch = kgdb_io_ops->read_char();

		/* If we get an ACK, we are done. */
		if (ch == '+')
			return;

		/*
		 * If we get the start of another packet, this means
		 * that GDB is attempting to reconnect.  We will NAK
		 * the packet being sent, and stop trying to send this
		 * packet.
		 */
		if (ch == '$') {
			kgdb_io_ops->write_char('-');
			if (kgdb_io_ops->flush)
				kgdb_io_ops->flush();
			return;
		}
		/* Any other reply: fall through and resend the packet. */
	}
}
348 | |||
349 | static char *pack_hex_byte(char *pkt, u8 byte) | ||
350 | { | ||
351 | *pkt++ = hexchars[byte >> 4]; | ||
352 | *pkt++ = hexchars[byte & 0xf]; | ||
353 | |||
354 | return pkt; | ||
355 | } | ||
356 | |||
/*
 * Read count bytes starting at mem and store them in buf as 2 * count
 * hex digits followed by a NUL terminator.
 *
 * Returns the result of probe_kernel_read(); when that is non-zero no
 * hex digits and no terminator are written.
 */
int kgdb_mem2hex(char *mem, char *buf, int count)
{
	/*
	 * The raw bytes are staged in the upper half of buf and expanded
	 * into hex from the start of buf.
	 */
	char *raw = buf + count;
	int err;
	int i;

	err = probe_kernel_read(raw, mem, count);
	if (err)
		return err;

	for (i = 0; i < count; i++)
		buf = pack_hex_byte(buf, raw[i]);
	*buf = 0;

	return err;
}
385 | |||
/*
 * Copy count bytes of escaped binary data from buf into mem.  A 0x7d
 * byte in buf is an escape marker: the byte that follows it, XORed
 * with 0x20, is the data byte.  Each byte is stored with
 * probe_kernel_write().
 *
 * Returns 0 on success, or the first non-zero probe_kernel_write()
 * result, at which point copying stops.
 */
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
	char byte;
	int err = 0;
	int i;

	for (i = 0; i < count; i++) {
		byte = *buf++;
		if (byte == 0x7d)
			byte = *buf++ ^ 0x20;

		err = probe_kernel_write(mem + i, &byte, 1);
		if (err)
			break;
	}

	return err;
}
410 | |||
/*
 * Decode 2 * count hex digits from buf and write the resulting count
 * bytes to mem.  The decoding is done in place: digit pairs are
 * consumed from the end of the hex string backwards and the raw bytes
 * are stored in the upper half of that same region.
 *
 * Returns the result of probe_kernel_write().
 */
int kgdb_hex2mem(char *buf, char *mem, int count)
{
	char *raw = buf + count * 2;
	int i;

	/* i indexes the low-nibble digit of each pair, last pair first */
	for (i = count * 2 - 1; i > 0; i -= 2) {
		raw--;
		*raw = hex(buf[i]);
		*raw |= hex(buf[i - 1]) << 4;
	}

	return probe_kernel_write(mem, raw, count);
}
436 | |||
/*
 * Parse a run of hex digits at *ptr into *long_val.
 *
 * *ptr is advanced past every digit consumed; parsing stops at the
 * first character hex() rejects or at the NUL terminator.  *long_val
 * is always written (0 when no digit was found).
 *
 * Returns the number of characters processed.
 */
int kgdb_hex2long(char **ptr, long *long_val)
{
	/*
	 * Accumulate in an unsigned type: shifting a signed long left
	 * once it is negative or about to overflow is undefined, and a
	 * full-width value such as "ffffffff..." hits exactly that case.
	 */
	unsigned long acc = 0;
	int hex_val;
	int num = 0;

	while (**ptr) {
		hex_val = hex(**ptr);
		if (hex_val < 0)
			break;

		acc = (acc << 4) | (unsigned long)hex_val;
		num++;
		(*ptr)++;
	}

	*long_val = (long)acc;

	return num;
}
460 | |||
461 | /* Write memory due to an 'M' or 'X' packet. */ | ||
462 | static int write_mem_msg(int binary) | ||
463 | { | ||
464 | char *ptr = &remcom_in_buffer[1]; | ||
465 | unsigned long addr; | ||
466 | unsigned long length; | ||
467 | int err; | ||
468 | |||
469 | if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && | ||
470 | kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { | ||
471 | if (binary) | ||
472 | err = kgdb_ebin2mem(ptr, (char *)addr, length); | ||
473 | else | ||
474 | err = kgdb_hex2mem(ptr, (char *)addr, length); | ||
475 | if (err) | ||
476 | return err; | ||
477 | if (CACHE_FLUSH_IS_SAFE) | ||
478 | flush_icache_range(addr, addr + length + 1); | ||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | return -EINVAL; | ||
483 | } | ||
484 | |||
485 | static void error_packet(char *pkt, int error) | ||
486 | { | ||
487 | error = -error; | ||
488 | pkt[0] = 'E'; | ||
489 | pkt[1] = hexchars[(error / 10)]; | ||
490 | pkt[2] = hexchars[(error % 10)]; | ||
491 | pkt[3] = '\0'; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Thread ID accessors. We represent a flat TID space to GDB, where | ||
496 | * the per CPU idle threads (which under Linux all have PID 0) are | ||
497 | * remapped to negative TIDs. | ||
498 | */ | ||
499 | |||
#define BUF_THREAD_ID_SIZE	16

/*
 * Write the thread reference id as BUF_THREAD_ID_SIZE hex digits at
 * pkt (two digits per byte of id) and return the position just past
 * them.  No terminator is written.
 */
static char *pack_threadid(char *pkt, unsigned char *id)
{
	int i;

	for (i = 0; i < BUF_THREAD_ID_SIZE / 2; i++)
		pkt = pack_hex_byte(pkt, id[i]);

	return pkt;
}
512 | |||
/*
 * Fill the 8-byte thread reference id from value: four zero bytes
 * followed by the four bytes of value, most significant first.
 */
static void int_to_threadref(unsigned char *id, int value)
{
	int shift;
	int pos;

	for (pos = 0; pos < 4; pos++)
		id[pos] = 0;

	for (shift = 24; shift >= 0; shift -= 8)
		id[pos++] = (value >> shift) & 0xff;
}
526 | |||
527 | static struct task_struct *getthread(struct pt_regs *regs, int tid) | ||
528 | { | ||
529 | /* | ||
530 | * Non-positive TIDs are remapped idle tasks: | ||
531 | */ | ||
532 | if (tid <= 0) | ||
533 | return idle_task(-tid); | ||
534 | |||
535 | /* | ||
536 | * find_task_by_pid_ns() does not take the tasklist lock anymore | ||
537 | * but is nicely RCU locked - hence is a pretty resilient | ||
538 | * thing to use: | ||
539 | */ | ||
540 | return find_task_by_pid_ns(tid, &init_pid_ns); | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * CPU debug state control: | ||
545 | */ | ||
546 | |||
#ifdef CONFIG_SMP
/*
 * Park the calling CPU while another CPU runs the debugger.
 *
 * With local interrupts disabled, the CPU publishes its register frame
 * and current task in kgdb_info[], raises its cpu_in_kgdb[] flag and
 * spins until its passive_cpu_wait[] entry reads zero.  It then clears
 * the published state, lets the architecture re-sync hardware
 * breakpoints, lowers the flag and restores the interrupt state.
 */
static void kgdb_wait(struct pt_regs *regs)
{
	unsigned long flags;
	int cpu;

	local_irq_save(flags);
	cpu = raw_smp_processor_id();
	kgdb_info[cpu].debuggerinfo = regs;
	kgdb_info[cpu].task = current;
	/*
	 * Make sure the above info reaches the primary CPU before
	 * our cpu_in_kgdb[] flag setting does:
	 */
	smp_wmb();
	atomic_set(&cpu_in_kgdb[cpu], 1);

	/* Wait till primary CPU is done with debugging */
	while (atomic_read(&passive_cpu_wait[cpu]))
		cpu_relax();

	/* Withdraw the published state before signalling completion. */
	kgdb_info[cpu].debuggerinfo = NULL;
	kgdb_info[cpu].task = NULL;

	/* fix up hardware debug registers on local cpu */
	if (arch_kgdb_ops.correct_hw_break)
		arch_kgdb_ops.correct_hw_break();

	/* Signal the primary CPU that we are done: */
	atomic_set(&cpu_in_kgdb[cpu], 0);
	clocksource_touch_watchdog();
	local_irq_restore(flags);
}
#endif
581 | |||
582 | /* | ||
583 | * Some architectures need cache flushes when we set/clear a | ||
584 | * breakpoint: | ||
585 | */ | ||
586 | static void kgdb_flush_swbreak_addr(unsigned long addr) | ||
587 | { | ||
588 | if (!CACHE_FLUSH_IS_SAFE) | ||
589 | return; | ||
590 | |||
591 | if (current->mm && current->mm->mmap_cache) { | ||
592 | flush_cache_range(current->mm->mmap_cache, | ||
593 | addr, addr + BREAK_INSTR_SIZE); | ||
594 | } | ||
595 | /* Force flush instruction cache if it was outside the mm */ | ||
596 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * SW breakpoint management: | ||
601 | */ | ||
602 | static int kgdb_activate_sw_breakpoints(void) | ||
603 | { | ||
604 | unsigned long addr; | ||
605 | int error = 0; | ||
606 | int i; | ||
607 | |||
608 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
609 | if (kgdb_break[i].state != BP_SET) | ||
610 | continue; | ||
611 | |||
612 | addr = kgdb_break[i].bpt_addr; | ||
613 | error = kgdb_arch_set_breakpoint(addr, | ||
614 | kgdb_break[i].saved_instr); | ||
615 | if (error) | ||
616 | return error; | ||
617 | |||
618 | kgdb_flush_swbreak_addr(addr); | ||
619 | kgdb_break[i].state = BP_ACTIVE; | ||
620 | } | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | static int kgdb_set_sw_break(unsigned long addr) | ||
625 | { | ||
626 | int err = kgdb_validate_break_address(addr); | ||
627 | int breakno = -1; | ||
628 | int i; | ||
629 | |||
630 | if (err) | ||
631 | return err; | ||
632 | |||
633 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
634 | if ((kgdb_break[i].state == BP_SET) && | ||
635 | (kgdb_break[i].bpt_addr == addr)) | ||
636 | return -EEXIST; | ||
637 | } | ||
638 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
639 | if (kgdb_break[i].state == BP_REMOVED && | ||
640 | kgdb_break[i].bpt_addr == addr) { | ||
641 | breakno = i; | ||
642 | break; | ||
643 | } | ||
644 | } | ||
645 | |||
646 | if (breakno == -1) { | ||
647 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
648 | if (kgdb_break[i].state == BP_UNDEFINED) { | ||
649 | breakno = i; | ||
650 | break; | ||
651 | } | ||
652 | } | ||
653 | } | ||
654 | |||
655 | if (breakno == -1) | ||
656 | return -E2BIG; | ||
657 | |||
658 | kgdb_break[breakno].state = BP_SET; | ||
659 | kgdb_break[breakno].type = BP_BREAKPOINT; | ||
660 | kgdb_break[breakno].bpt_addr = addr; | ||
661 | |||
662 | return 0; | ||
663 | } | ||
664 | |||
665 | static int kgdb_deactivate_sw_breakpoints(void) | ||
666 | { | ||
667 | unsigned long addr; | ||
668 | int error = 0; | ||
669 | int i; | ||
670 | |||
671 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
672 | if (kgdb_break[i].state != BP_ACTIVE) | ||
673 | continue; | ||
674 | addr = kgdb_break[i].bpt_addr; | ||
675 | error = kgdb_arch_remove_breakpoint(addr, | ||
676 | kgdb_break[i].saved_instr); | ||
677 | if (error) | ||
678 | return error; | ||
679 | |||
680 | kgdb_flush_swbreak_addr(addr); | ||
681 | kgdb_break[i].state = BP_SET; | ||
682 | } | ||
683 | return 0; | ||
684 | } | ||
685 | |||
686 | static int kgdb_remove_sw_break(unsigned long addr) | ||
687 | { | ||
688 | int i; | ||
689 | |||
690 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
691 | if ((kgdb_break[i].state == BP_SET) && | ||
692 | (kgdb_break[i].bpt_addr == addr)) { | ||
693 | kgdb_break[i].state = BP_REMOVED; | ||
694 | return 0; | ||
695 | } | ||
696 | } | ||
697 | return -ENOENT; | ||
698 | } | ||
699 | |||
700 | int kgdb_isremovedbreak(unsigned long addr) | ||
701 | { | ||
702 | int i; | ||
703 | |||
704 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
705 | if ((kgdb_break[i].state == BP_REMOVED) && | ||
706 | (kgdb_break[i].bpt_addr == addr)) | ||
707 | return 1; | ||
708 | } | ||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | int remove_all_break(void) | ||
713 | { | ||
714 | unsigned long addr; | ||
715 | int error; | ||
716 | int i; | ||
717 | |||
718 | /* Clear memory breakpoints. */ | ||
719 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | ||
720 | if (kgdb_break[i].state != BP_ACTIVE) | ||
721 | goto setundefined; | ||
722 | addr = kgdb_break[i].bpt_addr; | ||
723 | error = kgdb_arch_remove_breakpoint(addr, | ||
724 | kgdb_break[i].saved_instr); | ||
725 | if (error) | ||
726 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | ||
727 | addr); | ||
728 | setundefined: | ||
729 | kgdb_break[i].state = BP_UNDEFINED; | ||
730 | } | ||
731 | |||
732 | /* Clear hardware breakpoints. */ | ||
733 | if (arch_kgdb_ops.remove_all_hw_break) | ||
734 | arch_kgdb_ops.remove_all_hw_break(); | ||
735 | |||
736 | return 0; | ||
737 | } | ||
738 | |||
/*
 * Map a PID to the thread id reported to GDB: a non-zero PID is used
 * as is, PID 0 becomes -1 - <id of the executing CPU>.
 */
static inline int shadow_pid(int realpid)
{
	int tid = realpid;

	if (!tid)
		tid = -1-raw_smp_processor_id();

	return tid;
}
749 | |||
750 | static char gdbmsgbuf[BUFMAX + 1]; | ||
751 | |||
752 | static void kgdb_msg_write(const char *s, int len) | ||
753 | { | ||
754 | char *bufptr; | ||
755 | int wcount; | ||
756 | int i; | ||
757 | |||
758 | /* 'O'utput */ | ||
759 | gdbmsgbuf[0] = 'O'; | ||
760 | |||
761 | /* Fill and send buffers... */ | ||
762 | while (len > 0) { | ||
763 | bufptr = gdbmsgbuf + 1; | ||
764 | |||
765 | /* Calculate how many this time */ | ||
766 | if ((len << 1) > (BUFMAX - 2)) | ||
767 | wcount = (BUFMAX - 2) >> 1; | ||
768 | else | ||
769 | wcount = len; | ||
770 | |||
771 | /* Pack in hex chars */ | ||
772 | for (i = 0; i < wcount; i++) | ||
773 | bufptr = pack_hex_byte(bufptr, s[i]); | ||
774 | *bufptr = '\0'; | ||
775 | |||
776 | /* Move up */ | ||
777 | s += wcount; | ||
778 | len -= wcount; | ||
779 | |||
780 | /* Write packet */ | ||
781 | put_packet(gdbmsgbuf); | ||
782 | } | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * Return true if there is a valid kgdb I/O module. Also if no | ||
787 | * debugger is attached a message can be printed to the console about | ||
788 | * waiting for the debugger to attach. | ||
789 | * | ||
790 | * The print_wait argument is only to be true when called from inside | ||
791 | * the core kgdb_handle_exception, because it will wait for the | ||
792 | * debugger to attach. | ||
793 | */ | ||
794 | static int kgdb_io_ready(int print_wait) | ||
795 | { | ||
796 | if (!kgdb_io_ops) | ||
797 | return 0; | ||
798 | if (kgdb_connected) | ||
799 | return 1; | ||
800 | if (atomic_read(&kgdb_setting_breakpoint)) | ||
801 | return 1; | ||
802 | if (print_wait) | ||
803 | printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); | ||
804 | return 1; | ||
805 | } | ||
806 | |||
807 | /* | ||
808 | * All the functions that start with gdb_cmd are the various | ||
809 | * operations to implement the handlers for the gdbserial protocol | ||
810 | * where KGDB is communicating with an external debugger | ||
811 | */ | ||
812 | |||
813 | /* Handle the '?' status packets */ | ||
814 | static void gdb_cmd_status(struct kgdb_state *ks) | ||
815 | { | ||
816 | /* | ||
817 | * We know that this packet is only sent | ||
818 | * during initial connect. So to be safe, | ||
819 | * we clear out our breakpoints now in case | ||
820 | * GDB is reconnecting. | ||
821 | */ | ||
822 | remove_all_break(); | ||
823 | |||
824 | remcom_out_buffer[0] = 'S'; | ||
825 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); | ||
826 | } | ||
827 | |||
828 | /* Handle the 'g' get registers request */ | ||
829 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
830 | { | ||
831 | struct task_struct *thread; | ||
832 | void *local_debuggerinfo; | ||
833 | int i; | ||
834 | |||
835 | thread = kgdb_usethread; | ||
836 | if (!thread) { | ||
837 | thread = kgdb_info[ks->cpu].task; | ||
838 | local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; | ||
839 | } else { | ||
840 | local_debuggerinfo = NULL; | ||
841 | for (i = 0; i < NR_CPUS; i++) { | ||
842 | /* | ||
843 | * Try to find the task on some other | ||
844 | * or possibly this node if we do not | ||
845 | * find the matching task then we try | ||
846 | * to approximate the results. | ||
847 | */ | ||
848 | if (thread == kgdb_info[i].task) | ||
849 | local_debuggerinfo = kgdb_info[i].debuggerinfo; | ||
850 | } | ||
851 | } | ||
852 | |||
853 | /* | ||
854 | * All threads that don't have debuggerinfo should be | ||
855 | * in __schedule() sleeping, since all other CPUs | ||
856 | * are in kgdb_wait, and thus have debuggerinfo. | ||
857 | */ | ||
858 | if (local_debuggerinfo) { | ||
859 | pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); | ||
860 | } else { | ||
861 | /* | ||
862 | * Pull stuff saved during switch_to; nothing | ||
863 | * else is accessible (or even particularly | ||
864 | * relevant). | ||
865 | * | ||
866 | * This should be enough for a stack trace. | ||
867 | */ | ||
868 | sleeping_thread_to_gdb_regs(gdb_regs, thread); | ||
869 | } | ||
870 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); | ||
871 | } | ||
872 | |||
873 | /* Handle the 'G' set registers request */ | ||
874 | static void gdb_cmd_setregs(struct kgdb_state *ks) | ||
875 | { | ||
876 | kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); | ||
877 | |||
878 | if (kgdb_usethread && kgdb_usethread != current) { | ||
879 | error_packet(remcom_out_buffer, -EINVAL); | ||
880 | } else { | ||
881 | gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); | ||
882 | strcpy(remcom_out_buffer, "OK"); | ||
883 | } | ||
884 | } | ||
885 | |||
886 | /* Handle the 'm' memory read bytes */ | ||
887 | static void gdb_cmd_memread(struct kgdb_state *ks) | ||
888 | { | ||
889 | char *ptr = &remcom_in_buffer[1]; | ||
890 | unsigned long length; | ||
891 | unsigned long addr; | ||
892 | int err; | ||
893 | |||
894 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && | ||
895 | kgdb_hex2long(&ptr, &length) > 0) { | ||
896 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); | ||
897 | if (err) | ||
898 | error_packet(remcom_out_buffer, err); | ||
899 | } else { | ||
900 | error_packet(remcom_out_buffer, -EINVAL); | ||
901 | } | ||
902 | } | ||
903 | |||
904 | /* Handle the 'M' memory write bytes */ | ||
905 | static void gdb_cmd_memwrite(struct kgdb_state *ks) | ||
906 | { | ||
907 | int err = write_mem_msg(0); | ||
908 | |||
909 | if (err) | ||
910 | error_packet(remcom_out_buffer, err); | ||
911 | else | ||
912 | strcpy(remcom_out_buffer, "OK"); | ||
913 | } | ||
914 | |||
915 | /* Handle the 'X' memory binary write bytes */ | ||
916 | static void gdb_cmd_binwrite(struct kgdb_state *ks) | ||
917 | { | ||
918 | int err = write_mem_msg(1); | ||
919 | |||
920 | if (err) | ||
921 | error_packet(remcom_out_buffer, err); | ||
922 | else | ||
923 | strcpy(remcom_out_buffer, "OK"); | ||
924 | } | ||
925 | |||
926 | /* Handle the 'D' or 'k', detach or kill packets */ | ||
927 | static void gdb_cmd_detachkill(struct kgdb_state *ks) | ||
928 | { | ||
929 | int error; | ||
930 | |||
931 | /* The detach case */ | ||
932 | if (remcom_in_buffer[0] == 'D') { | ||
933 | error = remove_all_break(); | ||
934 | if (error < 0) { | ||
935 | error_packet(remcom_out_buffer, error); | ||
936 | } else { | ||
937 | strcpy(remcom_out_buffer, "OK"); | ||
938 | kgdb_connected = 0; | ||
939 | } | ||
940 | put_packet(remcom_out_buffer); | ||
941 | } else { | ||
942 | /* | ||
943 | * Assume the kill case, with no exit code checking, | ||
944 | * trying to force detach the debugger: | ||
945 | */ | ||
946 | remove_all_break(); | ||
947 | kgdb_connected = 0; | ||
948 | } | ||
949 | } | ||
950 | |||
951 | /* Handle the 'R' reboot packets */ | ||
952 | static int gdb_cmd_reboot(struct kgdb_state *ks) | ||
953 | { | ||
954 | /* For now, only honor R0 */ | ||
955 | if (strcmp(remcom_in_buffer, "R0") == 0) { | ||
956 | printk(KERN_CRIT "Executing emergency reboot\n"); | ||
957 | strcpy(remcom_out_buffer, "OK"); | ||
958 | put_packet(remcom_out_buffer); | ||
959 | |||
960 | /* | ||
961 | * Execution should not return from | ||
962 | * machine_emergency_restart() | ||
963 | */ | ||
964 | machine_emergency_restart(); | ||
965 | kgdb_connected = 0; | ||
966 | |||
967 | return 1; | ||
968 | } | ||
969 | return 0; | ||
970 | } | ||
971 | |||
/*
 * Handle the 'q' query packets.  The second character of the packet
 * selects the query:
 *
 *   qfThreadInfo / qsThreadInfo  - list thread ids, 'f' restarting the
 *                                  enumeration at id 1
 *   qC                           - report the current thread id
 *   qThreadExtraInfo,<id>        - describe thread <id>
 *
 * Any other query leaves remcom_out_buffer untouched.
 */
static void gdb_cmd_query(struct kgdb_state *ks)
{
	struct task_struct *thread;
	unsigned char thref[8];
	char *ptr;
	int i;

	switch (remcom_in_buffer[1]) {
	case 's':
	case 'f':
		if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
			error_packet(remcom_out_buffer, -EINVAL);
			break;
		}

		/* 'f' (first) restarts at id 1; 's' continues from ks->threadid */
		if (remcom_in_buffer[1] == 'f')
			ks->threadid = 1;

		remcom_out_buffer[0] = 'm';
		ptr = remcom_out_buffer + 1;

		/*
		 * Emit up to 17 comma-separated thread ids per reply.
		 *
		 * NOTE(review): i only advances when getthread() finds a
		 * task, so the loop keeps probing increasing ids until 17
		 * tasks were found; with fewer tasks left it does not
		 * terminate on its own - confirm.
		 */
		for (i = 0; i < 17; ks->threadid++) {
			thread = getthread(ks->linux_regs, ks->threadid);
			if (thread) {
				int_to_threadref(thref, ks->threadid);
				pack_threadid(ptr, thref);
				ptr += BUF_THREAD_ID_SIZE;
				*(ptr++) = ',';
				i++;
			}
		}
		/* Replace the trailing ',' with the terminator. */
		*(--ptr) = '\0';
		break;

	case 'C':
		/* Current thread id */
		strcpy(remcom_out_buffer, "QC");
		ks->threadid = shadow_pid(current->pid);
		int_to_threadref(thref, ks->threadid);
		pack_threadid(remcom_out_buffer + 2, thref);
		break;
	case 'T':
		if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
			error_packet(remcom_out_buffer, -EINVAL);
			break;
		}
		ks->threadid = 0;
		/* Skip 'q' plus the 16-character "ThreadExtraInfo," prefix. */
		ptr = remcom_in_buffer + 17;
		kgdb_hex2long(&ptr, &ks->threadid);
		if (!getthread(ks->linux_regs, ks->threadid)) {
			error_packet(remcom_out_buffer, -EINVAL);
			break;
		}
		if (ks->threadid > 0) {
			/* Regular task: reply with 16 bytes of its comm, hex-encoded. */
			kgdb_mem2hex(getthread(ks->linux_regs,
					ks->threadid)->comm,
					remcom_out_buffer, 16);
		} else {
			/* Non-positive id: reply with a generated description. */
			static char tmpstr[23 + BUF_THREAD_ID_SIZE];

			sprintf(tmpstr, "Shadow task %d for pid 0",
					(int)(-ks->threadid-1));
			kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
		}
		break;
	}
}
1040 | |||
1041 | /* Handle the 'H' task query packets */ | ||
1042 | static void gdb_cmd_task(struct kgdb_state *ks) | ||
1043 | { | ||
1044 | struct task_struct *thread; | ||
1045 | char *ptr; | ||
1046 | |||
1047 | switch (remcom_in_buffer[1]) { | ||
1048 | case 'g': | ||
1049 | ptr = &remcom_in_buffer[2]; | ||
1050 | kgdb_hex2long(&ptr, &ks->threadid); | ||
1051 | thread = getthread(ks->linux_regs, ks->threadid); | ||
1052 | if (!thread && ks->threadid > 0) { | ||
1053 | error_packet(remcom_out_buffer, -EINVAL); | ||
1054 | break; | ||
1055 | } | ||
1056 | kgdb_usethread = thread; | ||
1057 | ks->kgdb_usethreadid = ks->threadid; | ||
1058 | strcpy(remcom_out_buffer, "OK"); | ||
1059 | break; | ||
1060 | case 'c': | ||
1061 | ptr = &remcom_in_buffer[2]; | ||
1062 | kgdb_hex2long(&ptr, &ks->threadid); | ||
1063 | if (!ks->threadid) { | ||
1064 | kgdb_contthread = NULL; | ||
1065 | } else { | ||
1066 | thread = getthread(ks->linux_regs, ks->threadid); | ||
1067 | if (!thread && ks->threadid > 0) { | ||
1068 | error_packet(remcom_out_buffer, -EINVAL); | ||
1069 | break; | ||
1070 | } | ||
1071 | kgdb_contthread = thread; | ||
1072 | } | ||
1073 | strcpy(remcom_out_buffer, "OK"); | ||
1074 | break; | ||
1075 | } | ||
1076 | } | ||
1077 | |||
1078 | /* Handle the 'T' thread query packets */ | ||
1079 | static void gdb_cmd_thread(struct kgdb_state *ks) | ||
1080 | { | ||
1081 | char *ptr = &remcom_in_buffer[1]; | ||
1082 | struct task_struct *thread; | ||
1083 | |||
1084 | kgdb_hex2long(&ptr, &ks->threadid); | ||
1085 | thread = getthread(ks->linux_regs, ks->threadid); | ||
1086 | if (thread) | ||
1087 | strcpy(remcom_out_buffer, "OK"); | ||
1088 | else | ||
1089 | error_packet(remcom_out_buffer, -EINVAL); | ||
1090 | } | ||
1091 | |||
/*
 * Handle the 'z' or 'Z' breakpoint remove or set packets,
 * "[zZ]<type>,<addr>,<length>".
 *
 * Type '0' is a software breakpoint handled by the generic code;
 * other accepted types are passed to the architecture's hardware
 * breakpoint hooks.  The reply is "OK" on success or an error packet.
 * For an unsupported type the function returns without writing a
 * reply, leaving remcom_out_buffer untouched.
 */
static void gdb_cmd_break(struct kgdb_state *ks)
{
	/*
	 * Since GDB-5.3, it's been drafted that '0' is a software
	 * breakpoint, '1' is a hardware breakpoint, so let's do that.
	 */
	char *bpt_type = &remcom_in_buffer[1];
	char *ptr = &remcom_in_buffer[2];
	unsigned long addr;
	unsigned long length;
	int error = 0;

	/*
	 * With a set_hw_breakpoint hook, types '1'..'4' are accepted;
	 * otherwise only '0' and '1' get past this check.
	 */
	if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
		/* Unsupported */
		if (*bpt_type > '4')
			return;
	} else {
		if (*bpt_type != '0' && *bpt_type != '1')
			/* Unsupported. */
			return;
	}

	/*
	 * Test if this is a hardware breakpoint, and
	 * if we support it:
	 */
	if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
		/* Unsupported. */
		return;

	/* Parse ",<addr>,<length>"; any malformed field is -EINVAL. */
	if (*(ptr++) != ',') {
		error_packet(remcom_out_buffer, -EINVAL);
		return;
	}
	if (!kgdb_hex2long(&ptr, &addr)) {
		error_packet(remcom_out_buffer, -EINVAL);
		return;
	}
	if (*(ptr++) != ',' ||
		!kgdb_hex2long(&ptr, &length)) {
		error_packet(remcom_out_buffer, -EINVAL);
		return;
	}

	/*
	 * NOTE(review): the hardware hooks below are called without a
	 * NULL check.  remove_hw_breakpoint is never tested, and
	 * set_hw_breakpoint can be NULL for type '1' when only the
	 * KGDB_HW_BREAKPOINT flag is set - confirm the architectures
	 * always provide both.
	 */
	if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
		error = kgdb_set_sw_break(addr);
	else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
		error = kgdb_remove_sw_break(addr);
	else if (remcom_in_buffer[0] == 'Z')
		error = arch_kgdb_ops.set_hw_breakpoint(addr,
			(int)length, *bpt_type - '0');
	else if (remcom_in_buffer[0] == 'z')
		error = arch_kgdb_ops.remove_hw_breakpoint(addr,
			(int) length, *bpt_type - '0');

	if (error == 0)
		strcpy(remcom_out_buffer, "OK");
	else
		error_packet(remcom_out_buffer, error);
}
1153 | |||
1154 | /* Handle the 'C' signal / exception passing packets */ | ||
1155 | static int gdb_cmd_exception_pass(struct kgdb_state *ks) | ||
1156 | { | ||
1157 | /* C09 == pass exception | ||
1158 | * C15 == detach kgdb, pass exception | ||
1159 | */ | ||
1160 | if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { | ||
1161 | |||
1162 | ks->pass_exception = 1; | ||
1163 | remcom_in_buffer[0] = 'c'; | ||
1164 | |||
1165 | } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { | ||
1166 | |||
1167 | ks->pass_exception = 1; | ||
1168 | remcom_in_buffer[0] = 'D'; | ||
1169 | remove_all_break(); | ||
1170 | kgdb_connected = 0; | ||
1171 | return 1; | ||
1172 | |||
1173 | } else { | ||
1174 | error_packet(remcom_out_buffer, -EINVAL); | ||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | /* Indicate fall through */ | ||
1179 | return -1; | ||
1180 | } | ||
1181 | |||
/*
 * This function performs all gdbserial command processing.
 *
 * If a debugger is already connected, a "T<signo>thread:<id>;" stop
 * reply is sent first.  Packets are then read and dispatched on their
 * first character until one of them ends the session (detach, kill,
 * or a command for which kgdb_arch_handle_exception() returns a
 * non-negative value).
 *
 * Returns 1 when the exception is to be passed on
 * (ks->pass_exception), 0 otherwise.
 */
static int gdb_serial_stub(struct kgdb_state *ks)
{
	int error = 0;
	int tmp;

	/* Clear the out buffer. */
	memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));

	if (kgdb_connected) {
		unsigned char thref[8];
		char *ptr;

		/* Reply to host that an exception has occurred */
		ptr = remcom_out_buffer;
		*ptr++ = 'T';
		ptr = pack_hex_byte(ptr, ks->signo);
		ptr += strlen(strcpy(ptr, "thread:"));
		int_to_threadref(thref, shadow_pid(current->pid));
		ptr = pack_threadid(ptr, thref);
		*ptr++ = ';';
		put_packet(remcom_out_buffer);
	}

	/* Default the register thread to the task recorded for this CPU. */
	kgdb_usethread = kgdb_info[ks->cpu].task;
	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
	ks->pass_exception = 0;

	while (1) {
		error = 0;

		/* Clear the out buffer. */
		memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));

		get_packet(remcom_in_buffer);

		switch (remcom_in_buffer[0]) {
		case '?': /* gdbserial status */
			gdb_cmd_status(ks);
			break;
		case 'g': /* return the value of the CPU registers */
			gdb_cmd_getregs(ks);
			break;
		case 'G': /* set the value of the CPU registers - return OK */
			gdb_cmd_setregs(ks);
			break;
		case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
			gdb_cmd_memread(ks);
			break;
		case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
			gdb_cmd_memwrite(ks);
			break;
		case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
			gdb_cmd_binwrite(ks);
			break;
			/* kill or detach. KGDB should treat this like a
			 * continue.
			 */
		case 'D': /* Debugger detach */
		case 'k': /* Debugger detach via kill */
			gdb_cmd_detachkill(ks);
			goto default_handle;
		case 'R': /* Reboot */
			/* Non-zero only if the restart call returned. */
			if (gdb_cmd_reboot(ks))
				goto default_handle;
			break;
		case 'q': /* query command */
			gdb_cmd_query(ks);
			break;
		case 'H': /* task related */
			gdb_cmd_task(ks);
			break;
		case 'T': /* Query thread status */
			gdb_cmd_thread(ks);
			break;
		case 'z': /* Break point remove */
		case 'Z': /* Break point set */
			gdb_cmd_break(ks);
			break;
		case 'C': /* Exception passing */
			/*
			 * tmp > 0: detach and pass the exception,
			 * tmp == 0: error reply already prepared,
			 * tmp < 0: command was rewritten to 'c'.
			 */
			tmp = gdb_cmd_exception_pass(ks);
			if (tmp > 0)
				goto default_handle;
			if (tmp == 0)
				break;
			/* Fall through on tmp < 0 */
		case 'c': /* Continue packet */
		case 's': /* Single step packet */
			if (kgdb_contthread && kgdb_contthread != current) {
				/* Can't switch threads in kgdb */
				error_packet(remcom_out_buffer, -EINVAL);
				break;
			}
			kgdb_activate_sw_breakpoints();
			/* Fall through to default processing */
		default:
default_handle:
			/* Let the architecture code act on the packet. */
			error = kgdb_arch_handle_exception(ks->ex_vector,
						ks->signo,
						ks->err_code,
						remcom_in_buffer,
						remcom_out_buffer,
						ks->linux_regs);
			/*
			 * Leave cmd processing on error, detach,
			 * kill, continue, or single step.
			 */
			if (error >= 0 || remcom_in_buffer[0] == 'D' ||
			    remcom_in_buffer[0] == 'k') {
				error = 0;
				goto kgdb_exit;
			}

		}

		/* reply to the request */
		put_packet(remcom_out_buffer);
	}

kgdb_exit:
	if (ks->pass_exception)
		error = 1;
	return error;
}
1308 | |||
1309 | static int kgdb_reenter_check(struct kgdb_state *ks) | ||
1310 | { | ||
1311 | unsigned long addr; | ||
1312 | |||
1313 | if (atomic_read(&kgdb_active) != raw_smp_processor_id()) | ||
1314 | return 0; | ||
1315 | |||
1316 | /* Panic on recursive debugger calls: */ | ||
1317 | exception_level++; | ||
1318 | addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); | ||
1319 | kgdb_deactivate_sw_breakpoints(); | ||
1320 | |||
1321 | /* | ||
1322 | * If the break point removed ok at the place exception | ||
1323 | * occurred, try to recover and print a warning to the end | ||
1324 | * user because the user planted a breakpoint in a place that | ||
1325 | * KGDB needs in order to function. | ||
1326 | */ | ||
1327 | if (kgdb_remove_sw_break(addr) == 0) { | ||
1328 | exception_level = 0; | ||
1329 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | ||
1330 | kgdb_activate_sw_breakpoints(); | ||
1331 | printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", | ||
1332 | addr); | ||
1333 | WARN_ON_ONCE(1); | ||
1334 | |||
1335 | return 1; | ||
1336 | } | ||
1337 | remove_all_break(); | ||
1338 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | ||
1339 | |||
1340 | if (exception_level > 1) { | ||
1341 | dump_stack(); | ||
1342 | panic("Recursive entry to debugger"); | ||
1343 | } | ||
1344 | |||
1345 | printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); | ||
1346 | dump_stack(); | ||
1347 | panic("Recursive entry to debugger"); | ||
1348 | |||
1349 | return 1; | ||
1350 | } | ||
1351 | |||
1352 | /* | ||
1353 | * kgdb_handle_exception() - main entry point from a kernel exception | ||
1354 | * | ||
1355 | * Locking hierarchy: | ||
1356 | * interface locks, if any (begin_session) | ||
1357 | * kgdb lock (kgdb_active) | ||
1358 | */ | ||
1359 | int | ||
1360 | kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | ||
1361 | { | ||
1362 | struct kgdb_state kgdb_var; | ||
1363 | struct kgdb_state *ks = &kgdb_var; | ||
1364 | unsigned long flags; | ||
1365 | int error = 0; | ||
1366 | int i, cpu; | ||
1367 | |||
1368 | ks->cpu = raw_smp_processor_id(); | ||
1369 | ks->ex_vector = evector; | ||
1370 | ks->signo = signo; | ||
1371 | ks->ex_vector = evector; | ||
1372 | ks->err_code = ecode; | ||
1373 | ks->kgdb_usethreadid = 0; | ||
1374 | ks->linux_regs = regs; | ||
1375 | |||
1376 | if (kgdb_reenter_check(ks)) | ||
1377 | return 0; /* Ouch, double exception ! */ | ||
1378 | |||
1379 | acquirelock: | ||
1380 | /* | ||
1381 | * Interrupts will be restored by the 'trap return' code, except when | ||
1382 | * single stepping. | ||
1383 | */ | ||
1384 | local_irq_save(flags); | ||
1385 | |||
1386 | cpu = raw_smp_processor_id(); | ||
1387 | |||
1388 | /* | ||
1389 | * Acquire the kgdb_active lock: | ||
1390 | */ | ||
1391 | while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) | ||
1392 | cpu_relax(); | ||
1393 | |||
1394 | /* | ||
1395 | * Do not start the debugger connection on this CPU if the last | ||
1396 | * instance of the exception handler wanted to come into the | ||
1397 | * debugger on a different CPU via a single step | ||
1398 | */ | ||
1399 | if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && | ||
1400 | atomic_read(&kgdb_cpu_doing_single_step) != cpu) { | ||
1401 | |||
1402 | atomic_set(&kgdb_active, -1); | ||
1403 | clocksource_touch_watchdog(); | ||
1404 | local_irq_restore(flags); | ||
1405 | |||
1406 | goto acquirelock; | ||
1407 | } | ||
1408 | |||
1409 | if (!kgdb_io_ready(1)) { | ||
1410 | error = 1; | ||
1411 | goto kgdb_restore; /* No I/O connection, so resume the system */ | ||
1412 | } | ||
1413 | |||
1414 | /* | ||
1415 | * Don't enter if we have hit a removed breakpoint. | ||
1416 | */ | ||
1417 | if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) | ||
1418 | goto kgdb_restore; | ||
1419 | |||
1420 | /* Call the I/O driver's pre_exception routine */ | ||
1421 | if (kgdb_io_ops->pre_exception) | ||
1422 | kgdb_io_ops->pre_exception(); | ||
1423 | |||
1424 | kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; | ||
1425 | kgdb_info[ks->cpu].task = current; | ||
1426 | |||
1427 | kgdb_disable_hw_debug(ks->linux_regs); | ||
1428 | |||
1429 | /* | ||
1430 | * Get the passive CPU lock which will hold all the non-primary | ||
1431 | * CPU in a spin state while the debugger is active | ||
1432 | */ | ||
1433 | if (!kgdb_single_step || !kgdb_contthread) { | ||
1434 | for (i = 0; i < NR_CPUS; i++) | ||
1435 | atomic_set(&passive_cpu_wait[i], 1); | ||
1436 | } | ||
1437 | |||
1438 | /* | ||
1439 | * spin_lock code is good enough as a barrier so we don't | ||
1440 | * need one here: | ||
1441 | */ | ||
1442 | atomic_set(&cpu_in_kgdb[ks->cpu], 1); | ||
1443 | |||
1444 | #ifdef CONFIG_SMP | ||
1445 | /* Signal the other CPUs to enter kgdb_wait() */ | ||
1446 | if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) | ||
1447 | kgdb_roundup_cpus(flags); | ||
1448 | #endif | ||
1449 | |||
1450 | /* | ||
1451 | * Wait for the other CPUs to be notified and be waiting for us: | ||
1452 | */ | ||
1453 | for_each_online_cpu(i) { | ||
1454 | while (!atomic_read(&cpu_in_kgdb[i])) | ||
1455 | cpu_relax(); | ||
1456 | } | ||
1457 | |||
1458 | /* | ||
1459 | * At this point the primary processor is completely | ||
1460 | * in the debugger and all secondary CPUs are quiescent | ||
1461 | */ | ||
1462 | kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); | ||
1463 | kgdb_deactivate_sw_breakpoints(); | ||
1464 | kgdb_single_step = 0; | ||
1465 | kgdb_contthread = NULL; | ||
1466 | exception_level = 0; | ||
1467 | |||
1468 | /* Talk to debugger with gdbserial protocol */ | ||
1469 | error = gdb_serial_stub(ks); | ||
1470 | |||
1471 | /* Call the I/O driver's post_exception routine */ | ||
1472 | if (kgdb_io_ops->post_exception) | ||
1473 | kgdb_io_ops->post_exception(); | ||
1474 | |||
1475 | kgdb_info[ks->cpu].debuggerinfo = NULL; | ||
1476 | kgdb_info[ks->cpu].task = NULL; | ||
1477 | atomic_set(&cpu_in_kgdb[ks->cpu], 0); | ||
1478 | |||
1479 | if (!kgdb_single_step || !kgdb_contthread) { | ||
1480 | for (i = NR_CPUS-1; i >= 0; i--) | ||
1481 | atomic_set(&passive_cpu_wait[i], 0); | ||
1482 | /* | ||
1483 | * Wait till all the CPUs have quit | ||
1484 | * from the debugger. | ||
1485 | */ | ||
1486 | for_each_online_cpu(i) { | ||
1487 | while (atomic_read(&cpu_in_kgdb[i])) | ||
1488 | cpu_relax(); | ||
1489 | } | ||
1490 | } | ||
1491 | |||
1492 | kgdb_restore: | ||
1493 | /* Free kgdb_active */ | ||
1494 | atomic_set(&kgdb_active, -1); | ||
1495 | clocksource_touch_watchdog(); | ||
1496 | local_irq_restore(flags); | ||
1497 | |||
1498 | return error; | ||
1499 | } | ||
1500 | |||
/*
 * kgdb_nmicallback() - park a CPU in kgdb_wait() on behalf of the debugger.
 *
 * @cpu:	the CPU the callback is running on
 * @regs:	register state, passed on to kgdb_wait() as struct pt_regs
 *
 * Returns 0 if the CPU was sent into kgdb_wait(), 1 if nothing was done.
 *
 * kgdb_active holds -1 while no CPU owns the debugger, so it is sampled
 * once and checked before being used as an index into cpu_in_kgdb[].
 */
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
	int master = atomic_read(&kgdb_active);

	if (master != -1 && master != cpu &&
	    !atomic_read(&cpu_in_kgdb[cpu]) &&
	    atomic_read(&cpu_in_kgdb[master])) {
		kgdb_wait((struct pt_regs *)regs);
		return 0;
	}
#endif
	return 1;
}
1513 | |||
1514 | void kgdb_console_write(struct console *co, const char *s, unsigned count) | ||
1515 | { | ||
1516 | unsigned long flags; | ||
1517 | |||
1518 | /* If we're debugging, or KGDB has not connected, don't try | ||
1519 | * and print. */ | ||
1520 | if (!kgdb_connected || atomic_read(&kgdb_active) != -1) | ||
1521 | return; | ||
1522 | |||
1523 | local_irq_save(flags); | ||
1524 | kgdb_msg_write(s, count); | ||
1525 | local_irq_restore(flags); | ||
1526 | } | ||
1527 | |||
1528 | static struct console kgdbcons = { | ||
1529 | .name = "kgdb", | ||
1530 | .write = kgdb_console_write, | ||
1531 | .flags = CON_PRINTBUFFER | CON_ENABLED, | ||
1532 | .index = -1, | ||
1533 | }; | ||
1534 | |||
#ifdef CONFIG_MAGIC_SYSRQ
/*
 * SysRq handler: break into the debugger, provided an I/O module has been
 * registered.  An "Entering KGDB" notice is printed first when no debugger
 * is connected yet.
 */
static void sysrq_handle_gdb(int key, struct tty_struct *tty)
{
	if (kgdb_io_ops == NULL) {
		printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
		return;
	}

	if (!kgdb_connected)
		printk(KERN_CRIT "Entering KGDB\n");

	kgdb_breakpoint();
}

/* SysRq operation bound to the 'g' key by kgdb_register_callbacks(). */
static struct sysrq_key_op sysrq_gdb_op = {
	.handler	= sysrq_handle_gdb,
	.help_msg	= "Gdb",
	.action_msg	= "GDB",
};
#endif
1554 | |||
1555 | static void kgdb_register_callbacks(void) | ||
1556 | { | ||
1557 | if (!kgdb_io_module_registered) { | ||
1558 | kgdb_io_module_registered = 1; | ||
1559 | kgdb_arch_init(); | ||
1560 | #ifdef CONFIG_MAGIC_SYSRQ | ||
1561 | register_sysrq_key('g', &sysrq_gdb_op); | ||
1562 | #endif | ||
1563 | if (kgdb_use_con && !kgdb_con_registered) { | ||
1564 | register_console(&kgdbcons); | ||
1565 | kgdb_con_registered = 1; | ||
1566 | } | ||
1567 | } | ||
1568 | } | ||
1569 | |||
1570 | static void kgdb_unregister_callbacks(void) | ||
1571 | { | ||
1572 | /* | ||
1573 | * When this routine is called KGDB should unregister from the | ||
1574 | * panic handler and clean up, making sure it is not handling any | ||
1575 | * break exceptions at the time. | ||
1576 | */ | ||
1577 | if (kgdb_io_module_registered) { | ||
1578 | kgdb_io_module_registered = 0; | ||
1579 | kgdb_arch_exit(); | ||
1580 | #ifdef CONFIG_MAGIC_SYSRQ | ||
1581 | unregister_sysrq_key('g', &sysrq_gdb_op); | ||
1582 | #endif | ||
1583 | if (kgdb_con_registered) { | ||
1584 | unregister_console(&kgdbcons); | ||
1585 | kgdb_con_registered = 0; | ||
1586 | } | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | static void kgdb_initial_breakpoint(void) | ||
1591 | { | ||
1592 | kgdb_break_asap = 0; | ||
1593 | |||
1594 | printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); | ||
1595 | kgdb_breakpoint(); | ||
1596 | } | ||
1597 | |||
1598 | /** | ||
1599 | * kgdb_register_io_module - register KGDB IO module | ||
1600 | * @new_kgdb_io_ops: the io ops vector | ||
1601 | * | ||
1602 | * Register it with the KGDB core. | ||
1603 | */ | ||
1604 | int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) | ||
1605 | { | ||
1606 | int err; | ||
1607 | |||
1608 | spin_lock(&kgdb_registration_lock); | ||
1609 | |||
1610 | if (kgdb_io_ops) { | ||
1611 | spin_unlock(&kgdb_registration_lock); | ||
1612 | |||
1613 | printk(KERN_ERR "kgdb: Another I/O driver is already " | ||
1614 | "registered with KGDB.\n"); | ||
1615 | return -EBUSY; | ||
1616 | } | ||
1617 | |||
1618 | if (new_kgdb_io_ops->init) { | ||
1619 | err = new_kgdb_io_ops->init(); | ||
1620 | if (err) { | ||
1621 | spin_unlock(&kgdb_registration_lock); | ||
1622 | return err; | ||
1623 | } | ||
1624 | } | ||
1625 | |||
1626 | kgdb_io_ops = new_kgdb_io_ops; | ||
1627 | |||
1628 | spin_unlock(&kgdb_registration_lock); | ||
1629 | |||
1630 | printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", | ||
1631 | new_kgdb_io_ops->name); | ||
1632 | |||
1633 | /* Arm KGDB now. */ | ||
1634 | kgdb_register_callbacks(); | ||
1635 | |||
1636 | if (kgdb_break_asap) | ||
1637 | kgdb_initial_breakpoint(); | ||
1638 | |||
1639 | return 0; | ||
1640 | } | ||
1641 | EXPORT_SYMBOL_GPL(kgdb_register_io_module); | ||
1642 | |||
1643 | /** | ||
1644 | * kkgdb_unregister_io_module - unregister KGDB IO module | ||
1645 | * @old_kgdb_io_ops: the io ops vector | ||
1646 | * | ||
1647 | * Unregister it with the KGDB core. | ||
1648 | */ | ||
1649 | void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) | ||
1650 | { | ||
1651 | BUG_ON(kgdb_connected); | ||
1652 | |||
1653 | /* | ||
1654 | * KGDB is no longer able to communicate out, so | ||
1655 | * unregister our callbacks and reset state. | ||
1656 | */ | ||
1657 | kgdb_unregister_callbacks(); | ||
1658 | |||
1659 | spin_lock(&kgdb_registration_lock); | ||
1660 | |||
1661 | WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); | ||
1662 | kgdb_io_ops = NULL; | ||
1663 | |||
1664 | spin_unlock(&kgdb_registration_lock); | ||
1665 | |||
1666 | printk(KERN_INFO | ||
1667 | "kgdb: Unregistered I/O driver %s, debugger disabled.\n", | ||
1668 | old_kgdb_io_ops->name); | ||
1669 | } | ||
1670 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | ||
1671 | |||
1672 | /** | ||
1673 | * kgdb_breakpoint - generate breakpoint exception | ||
1674 | * | ||
1675 | * This function will generate a breakpoint exception. It is used at the | ||
1676 | * beginning of a program to sync up with a debugger and can be used | ||
1677 | * otherwise as a quick means to stop program execution and "break" into | ||
1678 | * the debugger. | ||
1679 | */ | ||
1680 | void kgdb_breakpoint(void) | ||
1681 | { | ||
1682 | atomic_set(&kgdb_setting_breakpoint, 1); | ||
1683 | wmb(); /* Sync point before breakpoint */ | ||
1684 | arch_kgdb_breakpoint(); | ||
1685 | wmb(); /* Sync point after breakpoint */ | ||
1686 | atomic_set(&kgdb_setting_breakpoint, 0); | ||
1687 | } | ||
1688 | EXPORT_SYMBOL_GPL(kgdb_breakpoint); | ||
1689 | |||
1690 | static int __init opt_kgdb_wait(char *str) | ||
1691 | { | ||
1692 | kgdb_break_asap = 1; | ||
1693 | |||
1694 | if (kgdb_io_module_registered) | ||
1695 | kgdb_initial_breakpoint(); | ||
1696 | |||
1697 | return 0; | ||
1698 | } | ||
1699 | |||
1700 | early_param("kgdbwait", opt_kgdb_wait); | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f363..e2764047ec03 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) | |||
165 | } | 165 | } |
166 | 166 | ||
167 | /* We can run anywhere, unlike our parent keventd(). */ | 167 | /* We can run anywhere, unlike our parent keventd(). */ |
168 | set_cpus_allowed(current, CPU_MASK_ALL); | 168 | set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); |
169 | 169 | ||
170 | /* | 170 | /* |
171 | * Our parent is keventd, which runs with elevated scheduling priority. | 171 | * Our parent is keventd, which runs with elevated scheduling priority. |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fcfb580c3afc..1e0250cb9486 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | |||
72 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 72 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
74 | 74 | ||
75 | /* | ||
76 | * Normally, functions in which we want to prohibit kprobes are marked | ||
77 | * __kprobes. But, there are cases where such functions already belong to | ||
78 | * a different section (__sched for preempt_schedule) | ||
79 | * | ||
80 | * For such cases, we now have a blacklist | ||
81 | */ | ||
82 | struct kprobe_blackpoint kprobe_blacklist[] = { | ||
83 | {"preempt_schedule",}, | ||
84 | {NULL} /* Terminator */ | ||
85 | }; | ||
86 | |||
75 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 87 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
76 | /* | 88 | /* |
77 | * kprobe->ainsn.insn points to the copy of the instruction to be | 89 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) | |||
417 | } | 429 | } |
418 | } | 430 | } |
419 | 431 | ||
432 | static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | ||
433 | { | ||
434 | unsigned long flags; | ||
435 | struct kretprobe_instance *ri; | ||
436 | struct hlist_node *pos, *next; | ||
437 | /* No race here */ | ||
438 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
439 | hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { | ||
440 | ri->rp = NULL; | ||
441 | hlist_del(&ri->uflist); | ||
442 | } | ||
443 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
444 | free_rp_inst(rp); | ||
445 | } | ||
446 | |||
420 | /* | 447 | /* |
421 | * Keep all fields in the kprobe consistent | 448 | * Keep all fields in the kprobe consistent |
422 | */ | 449 | */ |
@@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
492 | 519 | ||
493 | static int __kprobes in_kprobes_functions(unsigned long addr) | 520 | static int __kprobes in_kprobes_functions(unsigned long addr) |
494 | { | 521 | { |
522 | struct kprobe_blackpoint *kb; | ||
523 | |||
495 | if (addr >= (unsigned long)__kprobes_text_start && | 524 | if (addr >= (unsigned long)__kprobes_text_start && |
496 | addr < (unsigned long)__kprobes_text_end) | 525 | addr < (unsigned long)__kprobes_text_end) |
497 | return -EINVAL; | 526 | return -EINVAL; |
527 | /* | ||
528 | * If there exists a kprobe_blacklist, verify and | ||
529 | * fail any probe registration in the prohibited area | ||
530 | */ | ||
531 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | ||
532 | if (kb->start_addr) { | ||
533 | if (addr >= kb->start_addr && | ||
534 | addr < (kb->start_addr + kb->range)) | ||
535 | return -EINVAL; | ||
536 | } | ||
537 | } | ||
498 | return 0; | 538 | return 0; |
499 | } | 539 | } |
500 | 540 | ||
@@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
555 | } | 595 | } |
556 | 596 | ||
557 | p->nmissed = 0; | 597 | p->nmissed = 0; |
598 | INIT_LIST_HEAD(&p->list); | ||
558 | mutex_lock(&kprobe_mutex); | 599 | mutex_lock(&kprobe_mutex); |
559 | old_p = get_kprobe(p->addr); | 600 | old_p = get_kprobe(p->addr); |
560 | if (old_p) { | 601 | if (old_p) { |
@@ -581,35 +622,28 @@ out: | |||
581 | return ret; | 622 | return ret; |
582 | } | 623 | } |
583 | 624 | ||
584 | int __kprobes register_kprobe(struct kprobe *p) | 625 | /* |
585 | { | 626 | * Unregister a kprobe without a scheduler synchronization. |
586 | return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); | 627 | */ |
587 | } | 628 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
588 | |||
589 | void __kprobes unregister_kprobe(struct kprobe *p) | ||
590 | { | 629 | { |
591 | struct module *mod; | ||
592 | struct kprobe *old_p, *list_p; | 630 | struct kprobe *old_p, *list_p; |
593 | int cleanup_p; | ||
594 | 631 | ||
595 | mutex_lock(&kprobe_mutex); | ||
596 | old_p = get_kprobe(p->addr); | 632 | old_p = get_kprobe(p->addr); |
597 | if (unlikely(!old_p)) { | 633 | if (unlikely(!old_p)) |
598 | mutex_unlock(&kprobe_mutex); | 634 | return -EINVAL; |
599 | return; | 635 | |
600 | } | ||
601 | if (p != old_p) { | 636 | if (p != old_p) { |
602 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 637 | list_for_each_entry_rcu(list_p, &old_p->list, list) |
603 | if (list_p == p) | 638 | if (list_p == p) |
604 | /* kprobe p is a valid probe */ | 639 | /* kprobe p is a valid probe */ |
605 | goto valid_p; | 640 | goto valid_p; |
606 | mutex_unlock(&kprobe_mutex); | 641 | return -EINVAL; |
607 | return; | ||
608 | } | 642 | } |
609 | valid_p: | 643 | valid_p: |
610 | if (old_p == p || | 644 | if (old_p == p || |
611 | (old_p->pre_handler == aggr_pre_handler && | 645 | (old_p->pre_handler == aggr_pre_handler && |
612 | p->list.next == &old_p->list && p->list.prev == &old_p->list)) { | 646 | list_is_singular(&old_p->list))) { |
613 | /* | 647 | /* |
614 | * Only probe on the hash list. Disarm only if kprobes are | 648 | * Only probe on the hash list. Disarm only if kprobes are |
615 | * enabled - otherwise, the breakpoint would already have | 649 | * enabled - otherwise, the breakpoint would already have |
@@ -618,43 +652,97 @@ valid_p: | |||
618 | if (kprobe_enabled) | 652 | if (kprobe_enabled) |
619 | arch_disarm_kprobe(p); | 653 | arch_disarm_kprobe(p); |
620 | hlist_del_rcu(&old_p->hlist); | 654 | hlist_del_rcu(&old_p->hlist); |
621 | cleanup_p = 1; | ||
622 | } else { | 655 | } else { |
656 | if (p->break_handler) | ||
657 | old_p->break_handler = NULL; | ||
658 | if (p->post_handler) { | ||
659 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | ||
660 | if ((list_p != p) && (list_p->post_handler)) | ||
661 | goto noclean; | ||
662 | } | ||
663 | old_p->post_handler = NULL; | ||
664 | } | ||
665 | noclean: | ||
623 | list_del_rcu(&p->list); | 666 | list_del_rcu(&p->list); |
624 | cleanup_p = 0; | ||
625 | } | 667 | } |
668 | return 0; | ||
669 | } | ||
626 | 670 | ||
627 | mutex_unlock(&kprobe_mutex); | 671 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
672 | { | ||
673 | struct module *mod; | ||
674 | struct kprobe *old_p; | ||
628 | 675 | ||
629 | synchronize_sched(); | ||
630 | if (p->mod_refcounted) { | 676 | if (p->mod_refcounted) { |
631 | mod = module_text_address((unsigned long)p->addr); | 677 | mod = module_text_address((unsigned long)p->addr); |
632 | if (mod) | 678 | if (mod) |
633 | module_put(mod); | 679 | module_put(mod); |
634 | } | 680 | } |
635 | 681 | ||
636 | if (cleanup_p) { | 682 | if (list_empty(&p->list) || list_is_singular(&p->list)) { |
637 | if (p != old_p) { | 683 | if (!list_empty(&p->list)) { |
638 | list_del_rcu(&p->list); | 684 | /* "p" is the last child of an aggr_kprobe */ |
685 | old_p = list_entry(p->list.next, struct kprobe, list); | ||
686 | list_del(&p->list); | ||
639 | kfree(old_p); | 687 | kfree(old_p); |
640 | } | 688 | } |
641 | arch_remove_kprobe(p); | 689 | arch_remove_kprobe(p); |
642 | } else { | 690 | } |
643 | mutex_lock(&kprobe_mutex); | 691 | } |
644 | if (p->break_handler) | 692 | |
645 | old_p->break_handler = NULL; | 693 | static int __register_kprobes(struct kprobe **kps, int num, |
646 | if (p->post_handler){ | 694 | unsigned long called_from) |
647 | list_for_each_entry_rcu(list_p, &old_p->list, list){ | 695 | { |
648 | if (list_p->post_handler){ | 696 | int i, ret = 0; |
649 | cleanup_p = 2; | 697 | |
650 | break; | 698 | if (num <= 0) |
651 | } | 699 | return -EINVAL; |
652 | } | 700 | for (i = 0; i < num; i++) { |
653 | if (cleanup_p == 0) | 701 | ret = __register_kprobe(kps[i], called_from); |
654 | old_p->post_handler = NULL; | 702 | if (ret < 0 && i > 0) { |
703 | unregister_kprobes(kps, i); | ||
704 | break; | ||
655 | } | 705 | } |
656 | mutex_unlock(&kprobe_mutex); | ||
657 | } | 706 | } |
707 | return ret; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Registration and unregistration functions for kprobe. | ||
712 | */ | ||
713 | int __kprobes register_kprobe(struct kprobe *p) | ||
714 | { | ||
715 | return __register_kprobes(&p, 1, | ||
716 | (unsigned long)__builtin_return_address(0)); | ||
717 | } | ||
718 | |||
719 | void __kprobes unregister_kprobe(struct kprobe *p) | ||
720 | { | ||
721 | unregister_kprobes(&p, 1); | ||
722 | } | ||
723 | |||
724 | int __kprobes register_kprobes(struct kprobe **kps, int num) | ||
725 | { | ||
726 | return __register_kprobes(kps, num, | ||
727 | (unsigned long)__builtin_return_address(0)); | ||
728 | } | ||
729 | |||
730 | void __kprobes unregister_kprobes(struct kprobe **kps, int num) | ||
731 | { | ||
732 | int i; | ||
733 | |||
734 | if (num <= 0) | ||
735 | return; | ||
736 | mutex_lock(&kprobe_mutex); | ||
737 | for (i = 0; i < num; i++) | ||
738 | if (__unregister_kprobe_top(kps[i]) < 0) | ||
739 | kps[i]->addr = NULL; | ||
740 | mutex_unlock(&kprobe_mutex); | ||
741 | |||
742 | synchronize_sched(); | ||
743 | for (i = 0; i < num; i++) | ||
744 | if (kps[i]->addr) | ||
745 | __unregister_kprobe_bottom(kps[i]); | ||
658 | } | 746 | } |
659 | 747 | ||
660 | static struct notifier_block kprobe_exceptions_nb = { | 748 | static struct notifier_block kprobe_exceptions_nb = { |
@@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry) | |||
667 | return (unsigned long)entry; | 755 | return (unsigned long)entry; |
668 | } | 756 | } |
669 | 757 | ||
670 | int __kprobes register_jprobe(struct jprobe *jp) | 758 | static int __register_jprobes(struct jprobe **jps, int num, |
759 | unsigned long called_from) | ||
671 | { | 760 | { |
672 | unsigned long addr = arch_deref_entry_point(jp->entry); | 761 | struct jprobe *jp; |
762 | int ret = 0, i; | ||
673 | 763 | ||
674 | if (!kernel_text_address(addr)) | 764 | if (num <= 0) |
675 | return -EINVAL; | 765 | return -EINVAL; |
766 | for (i = 0; i < num; i++) { | ||
767 | unsigned long addr; | ||
768 | jp = jps[i]; | ||
769 | addr = arch_deref_entry_point(jp->entry); | ||
770 | |||
771 | if (!kernel_text_address(addr)) | ||
772 | ret = -EINVAL; | ||
773 | else { | ||
774 | /* Todo: Verify probepoint is a function entry point */ | ||
775 | jp->kp.pre_handler = setjmp_pre_handler; | ||
776 | jp->kp.break_handler = longjmp_break_handler; | ||
777 | ret = __register_kprobe(&jp->kp, called_from); | ||
778 | } | ||
779 | if (ret < 0 && i > 0) { | ||
780 | unregister_jprobes(jps, i); | ||
781 | break; | ||
782 | } | ||
783 | } | ||
784 | return ret; | ||
785 | } | ||
676 | 786 | ||
677 | /* Todo: Verify probepoint is a function entry point */ | 787 | int __kprobes register_jprobe(struct jprobe *jp) |
678 | jp->kp.pre_handler = setjmp_pre_handler; | 788 | { |
679 | jp->kp.break_handler = longjmp_break_handler; | 789 | return __register_jprobes(&jp, 1, |
680 | |||
681 | return __register_kprobe(&jp->kp, | ||
682 | (unsigned long)__builtin_return_address(0)); | 790 | (unsigned long)__builtin_return_address(0)); |
683 | } | 791 | } |
684 | 792 | ||
685 | void __kprobes unregister_jprobe(struct jprobe *jp) | 793 | void __kprobes unregister_jprobe(struct jprobe *jp) |
686 | { | 794 | { |
687 | unregister_kprobe(&jp->kp); | 795 | unregister_jprobes(&jp, 1); |
796 | } | ||
797 | |||
798 | int __kprobes register_jprobes(struct jprobe **jps, int num) | ||
799 | { | ||
800 | return __register_jprobes(jps, num, | ||
801 | (unsigned long)__builtin_return_address(0)); | ||
802 | } | ||
803 | |||
804 | void __kprobes unregister_jprobes(struct jprobe **jps, int num) | ||
805 | { | ||
806 | int i; | ||
807 | |||
808 | if (num <= 0) | ||
809 | return; | ||
810 | mutex_lock(&kprobe_mutex); | ||
811 | for (i = 0; i < num; i++) | ||
812 | if (__unregister_kprobe_top(&jps[i]->kp) < 0) | ||
813 | jps[i]->kp.addr = NULL; | ||
814 | mutex_unlock(&kprobe_mutex); | ||
815 | |||
816 | synchronize_sched(); | ||
817 | for (i = 0; i < num; i++) { | ||
818 | if (jps[i]->kp.addr) | ||
819 | __unregister_kprobe_bottom(&jps[i]->kp); | ||
820 | } | ||
688 | } | 821 | } |
689 | 822 | ||
690 | #ifdef CONFIG_KRETPROBES | 823 | #ifdef CONFIG_KRETPROBES |
@@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
725 | return 0; | 858 | return 0; |
726 | } | 859 | } |
727 | 860 | ||
728 | int __kprobes register_kretprobe(struct kretprobe *rp) | 861 | static int __kprobes __register_kretprobe(struct kretprobe *rp, |
862 | unsigned long called_from) | ||
729 | { | 863 | { |
730 | int ret = 0; | 864 | int ret = 0; |
731 | struct kretprobe_instance *inst; | 865 | struct kretprobe_instance *inst; |
@@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
771 | 905 | ||
772 | rp->nmissed = 0; | 906 | rp->nmissed = 0; |
773 | /* Establish function entry probe point */ | 907 | /* Establish function entry probe point */ |
774 | if ((ret = __register_kprobe(&rp->kp, | 908 | ret = __register_kprobe(&rp->kp, called_from); |
775 | (unsigned long)__builtin_return_address(0))) != 0) | 909 | if (ret != 0) |
776 | free_rp_inst(rp); | 910 | free_rp_inst(rp); |
777 | return ret; | 911 | return ret; |
778 | } | 912 | } |
779 | 913 | ||
914 | static int __register_kretprobes(struct kretprobe **rps, int num, | ||
915 | unsigned long called_from) | ||
916 | { | ||
917 | int ret = 0, i; | ||
918 | |||
919 | if (num <= 0) | ||
920 | return -EINVAL; | ||
921 | for (i = 0; i < num; i++) { | ||
922 | ret = __register_kretprobe(rps[i], called_from); | ||
923 | if (ret < 0 && i > 0) { | ||
924 | unregister_kretprobes(rps, i); | ||
925 | break; | ||
926 | } | ||
927 | } | ||
928 | return ret; | ||
929 | } | ||
930 | |||
931 | int __kprobes register_kretprobe(struct kretprobe *rp) | ||
932 | { | ||
933 | return __register_kretprobes(&rp, 1, | ||
934 | (unsigned long)__builtin_return_address(0)); | ||
935 | } | ||
936 | |||
937 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | ||
938 | { | ||
939 | unregister_kretprobes(&rp, 1); | ||
940 | } | ||
941 | |||
942 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) | ||
943 | { | ||
944 | return __register_kretprobes(rps, num, | ||
945 | (unsigned long)__builtin_return_address(0)); | ||
946 | } | ||
947 | |||
948 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | ||
949 | { | ||
950 | int i; | ||
951 | |||
952 | if (num <= 0) | ||
953 | return; | ||
954 | mutex_lock(&kprobe_mutex); | ||
955 | for (i = 0; i < num; i++) | ||
956 | if (__unregister_kprobe_top(&rps[i]->kp) < 0) | ||
957 | rps[i]->kp.addr = NULL; | ||
958 | mutex_unlock(&kprobe_mutex); | ||
959 | |||
960 | synchronize_sched(); | ||
961 | for (i = 0; i < num; i++) { | ||
962 | if (rps[i]->kp.addr) { | ||
963 | __unregister_kprobe_bottom(&rps[i]->kp); | ||
964 | cleanup_rp_inst(rps[i]); | ||
965 | } | ||
966 | } | ||
967 | } | ||
968 | |||
780 | #else /* CONFIG_KRETPROBES */ | 969 | #else /* CONFIG_KRETPROBES */ |
781 | int __kprobes register_kretprobe(struct kretprobe *rp) | 970 | int __kprobes register_kretprobe(struct kretprobe *rp) |
782 | { | 971 | { |
783 | return -ENOSYS; | 972 | return -ENOSYS; |
784 | } | 973 | } |
785 | 974 | ||
786 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | 975 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) |
787 | struct pt_regs *regs) | ||
788 | { | 976 | { |
789 | return 0; | 977 | return -ENOSYS; |
790 | } | 978 | } |
791 | #endif /* CONFIG_KRETPROBES */ | ||
792 | |||
793 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 979 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
794 | { | 980 | { |
795 | unsigned long flags; | 981 | } |
796 | struct kretprobe_instance *ri; | ||
797 | struct hlist_node *pos, *next; | ||
798 | 982 | ||
799 | unregister_kprobe(&rp->kp); | 983 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) |
984 | { | ||
985 | } | ||
800 | 986 | ||
801 | /* No race here */ | 987 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, |
802 | spin_lock_irqsave(&kretprobe_lock, flags); | 988 | struct pt_regs *regs) |
803 | hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { | 989 | { |
804 | ri->rp = NULL; | 990 | return 0; |
805 | hlist_del(&ri->uflist); | ||
806 | } | ||
807 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
808 | free_rp_inst(rp); | ||
809 | } | 991 | } |
810 | 992 | ||
993 | #endif /* CONFIG_KRETPROBES */ | ||
994 | |||
811 | static int __init init_kprobes(void) | 995 | static int __init init_kprobes(void) |
812 | { | 996 | { |
813 | int i, err = 0; | 997 | int i, err = 0; |
998 | unsigned long offset = 0, size = 0; | ||
999 | char *modname, namebuf[128]; | ||
1000 | const char *symbol_name; | ||
1001 | void *addr; | ||
1002 | struct kprobe_blackpoint *kb; | ||
814 | 1003 | ||
815 | /* FIXME allocate the probe table, currently defined statically */ | 1004 | /* FIXME allocate the probe table, currently defined statically */ |
816 | /* initialize all list heads */ | 1005 | /* initialize all list heads */ |
@@ -819,6 +1008,28 @@ static int __init init_kprobes(void) | |||
819 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 1008 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
820 | } | 1009 | } |
821 | 1010 | ||
1011 | /* | ||
1012 | * Lookup and populate the kprobe_blacklist. | ||
1013 | * | ||
1014 | * Unlike the kretprobe blacklist, we'll need to determine | ||
1015 | * the range of addresses that belong to the said functions, | ||
1016 | * since a kprobe need not necessarily be at the beginning | ||
1017 | * of a function. | ||
1018 | */ | ||
1019 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | ||
1020 | kprobe_lookup_name(kb->name, addr); | ||
1021 | if (!addr) | ||
1022 | continue; | ||
1023 | |||
1024 | kb->start_addr = (unsigned long)addr; | ||
1025 | symbol_name = kallsyms_lookup(kb->start_addr, | ||
1026 | &size, &offset, &modname, namebuf); | ||
1027 | if (!symbol_name) | ||
1028 | kb->range = 0; | ||
1029 | else | ||
1030 | kb->range = size; | ||
1031 | } | ||
1032 | |||
822 | if (kretprobe_blacklist_size) { | 1033 | if (kretprobe_blacklist_size) { |
823 | /* lookup the function address from its name */ | 1034 | /* lookup the function address from its name */ |
824 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 1035 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
@@ -1066,8 +1277,12 @@ module_init(init_kprobes); | |||
1066 | 1277 | ||
1067 | EXPORT_SYMBOL_GPL(register_kprobe); | 1278 | EXPORT_SYMBOL_GPL(register_kprobe); |
1068 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 1279 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
1280 | EXPORT_SYMBOL_GPL(register_kprobes); | ||
1281 | EXPORT_SYMBOL_GPL(unregister_kprobes); | ||
1069 | EXPORT_SYMBOL_GPL(register_jprobe); | 1282 | EXPORT_SYMBOL_GPL(register_jprobe); |
1070 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 1283 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
1284 | EXPORT_SYMBOL_GPL(register_jprobes); | ||
1285 | EXPORT_SYMBOL_GPL(unregister_jprobes); | ||
1071 | #ifdef CONFIG_KPROBES | 1286 | #ifdef CONFIG_KPROBES |
1072 | EXPORT_SYMBOL_GPL(jprobe_return); | 1287 | EXPORT_SYMBOL_GPL(jprobe_return); |
1073 | #endif | 1288 | #endif |
@@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); | |||
1075 | #ifdef CONFIG_KPROBES | 1290 | #ifdef CONFIG_KPROBES |
1076 | EXPORT_SYMBOL_GPL(register_kretprobe); | 1291 | EXPORT_SYMBOL_GPL(register_kretprobe); |
1077 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 1292 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
1293 | EXPORT_SYMBOL_GPL(register_kretprobes); | ||
1294 | EXPORT_SYMBOL_GPL(unregister_kretprobes); | ||
1078 | #endif | 1295 | #endif |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f90..92cf6930ab51 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <asm/semaphore.h> | ||
17 | 16 | ||
18 | #define KTHREAD_NICE_LEVEL (-5) | 17 | #define KTHREAD_NICE_LEVEL (-5) |
19 | 18 | ||
@@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
180 | wait_task_inactive(k); | 179 | wait_task_inactive(k); |
181 | set_task_cpu(k, cpu); | 180 | set_task_cpu(k, cpu); |
182 | k->cpus_allowed = cpumask_of_cpu(cpu); | 181 | k->cpus_allowed = cpumask_of_cpu(cpu); |
182 | k->rt.nr_cpus_allowed = 1; | ||
183 | } | 183 | } |
184 | EXPORT_SYMBOL(kthread_bind); | 184 | EXPORT_SYMBOL(kthread_bind); |
185 | 185 | ||
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe74..7c74dab0d21b 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
64 | return; | 64 | return; |
65 | 65 | ||
66 | for (i = 0; i < MAXLR; i++) { | 66 | for (i = 0; i < MAXLR; i++) { |
67 | int q; | 67 | int q, same = 1; |
68 | int same = 1; | 68 | |
69 | /* Nothing stored: */ | 69 | /* Nothing stored: */ |
70 | if (!latency_record[i].backtrace[0]) { | 70 | if (!latency_record[i].backtrace[0]) { |
71 | if (firstnonnull > i) | 71 | if (firstnonnull > i) |
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record | |||
73 | continue; | 73 | continue; |
74 | } | 74 | } |
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
76 | if (latency_record[i].backtrace[q] != | 76 | unsigned long record = lat->backtrace[q]; |
77 | lat->backtrace[q]) | 77 | |
78 | if (latency_record[i].backtrace[q] != record) { | ||
78 | same = 0; | 79 | same = 0; |
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | 80 | break; |
81 | if (same && lat->backtrace[q] == ULONG_MAX) | 81 | } |
82 | |||
83 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
84 | if (record == 0 || record == ULONG_MAX) | ||
82 | break; | 85 | break; |
83 | } | 86 | } |
84 | if (same) { | 87 | if (same) { |
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | 146 | for (i = 0; i < LT_SAVECOUNT ; i++) { |
144 | struct latency_record *mylat; | 147 | struct latency_record *mylat; |
145 | int same = 1; | 148 | int same = 1; |
149 | |||
146 | mylat = &tsk->latency_record[i]; | 150 | mylat = &tsk->latency_record[i]; |
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | 151 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { |
148 | if (mylat->backtrace[q] != | 152 | unsigned long record = lat.backtrace[q]; |
149 | lat.backtrace[q]) | 153 | |
154 | if (mylat->backtrace[q] != record) { | ||
150 | same = 0; | 155 | same = 0; |
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | 156 | break; |
153 | if (same && lat.backtrace[q] == ULONG_MAX) | 157 | } |
158 | |||
159 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | ||
160 | if (record == 0 || record == ULONG_MAX) | ||
154 | break; | 161 | break; |
155 | } | 162 | } |
156 | if (same) { | 163 | if (same) { |
diff --git a/kernel/module.c b/kernel/module.c index 5d437bffd8dc..8d6cccc6c3cf 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -43,7 +43,6 @@ | |||
43 | #include <linux/mutex.h> | 43 | #include <linux/mutex.h> |
44 | #include <linux/unwind.h> | 44 | #include <linux/unwind.h> |
45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
46 | #include <asm/semaphore.h> | ||
47 | #include <asm/cacheflush.h> | 46 | #include <asm/cacheflush.h> |
48 | #include <linux/license.h> | 47 | #include <linux/license.h> |
49 | #include <asm/sections.h> | 48 | #include <asm/sections.h> |
@@ -664,7 +663,7 @@ static void free_module(struct module *mod); | |||
664 | 663 | ||
665 | static void wait_for_zero_refcount(struct module *mod) | 664 | static void wait_for_zero_refcount(struct module *mod) |
666 | { | 665 | { |
667 | /* Since we might sleep for some time, drop the semaphore first */ | 666 | /* Since we might sleep for some time, release the mutex first */ |
668 | mutex_unlock(&module_mutex); | 667 | mutex_unlock(&module_mutex); |
669 | for (;;) { | 668 | for (;;) { |
670 | DEBUGP("Looking at refcount...\n"); | 669 | DEBUGP("Looking at refcount...\n"); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d854..5ca37fa50beb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) | |||
92 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 92 | atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); |
93 | 93 | ||
94 | for (i = 1; i < PIDMAP_ENTRIES; i++) { | 94 | for (i = 1; i < PIDMAP_ENTRIES; i++) { |
95 | ns->pidmap[i].page = 0; | 95 | ns->pidmap[i].page = NULL; |
96 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 96 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
97 | } | 97 | } |
98 | 98 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2eae91f954ca..ae5c6c147c4b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -1087,45 +1087,45 @@ static void check_process_timers(struct task_struct *tsk, | |||
1087 | maxfire = 20; | 1087 | maxfire = 20; |
1088 | prof_expires = cputime_zero; | 1088 | prof_expires = cputime_zero; |
1089 | while (!list_empty(timers)) { | 1089 | while (!list_empty(timers)) { |
1090 | struct cpu_timer_list *t = list_first_entry(timers, | 1090 | struct cpu_timer_list *tl = list_first_entry(timers, |
1091 | struct cpu_timer_list, | 1091 | struct cpu_timer_list, |
1092 | entry); | 1092 | entry); |
1093 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { | 1093 | if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { |
1094 | prof_expires = t->expires.cpu; | 1094 | prof_expires = tl->expires.cpu; |
1095 | break; | 1095 | break; |
1096 | } | 1096 | } |
1097 | t->firing = 1; | 1097 | tl->firing = 1; |
1098 | list_move_tail(&t->entry, firing); | 1098 | list_move_tail(&tl->entry, firing); |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | ++timers; | 1101 | ++timers; |
1102 | maxfire = 20; | 1102 | maxfire = 20; |
1103 | virt_expires = cputime_zero; | 1103 | virt_expires = cputime_zero; |
1104 | while (!list_empty(timers)) { | 1104 | while (!list_empty(timers)) { |
1105 | struct cpu_timer_list *t = list_first_entry(timers, | 1105 | struct cpu_timer_list *tl = list_first_entry(timers, |
1106 | struct cpu_timer_list, | 1106 | struct cpu_timer_list, |
1107 | entry); | 1107 | entry); |
1108 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { | 1108 | if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { |
1109 | virt_expires = t->expires.cpu; | 1109 | virt_expires = tl->expires.cpu; |
1110 | break; | 1110 | break; |
1111 | } | 1111 | } |
1112 | t->firing = 1; | 1112 | tl->firing = 1; |
1113 | list_move_tail(&t->entry, firing); | 1113 | list_move_tail(&tl->entry, firing); |
1114 | } | 1114 | } |
1115 | 1115 | ||
1116 | ++timers; | 1116 | ++timers; |
1117 | maxfire = 20; | 1117 | maxfire = 20; |
1118 | sched_expires = 0; | 1118 | sched_expires = 0; |
1119 | while (!list_empty(timers)) { | 1119 | while (!list_empty(timers)) { |
1120 | struct cpu_timer_list *t = list_first_entry(timers, | 1120 | struct cpu_timer_list *tl = list_first_entry(timers, |
1121 | struct cpu_timer_list, | 1121 | struct cpu_timer_list, |
1122 | entry); | 1122 | entry); |
1123 | if (!--maxfire || sum_sched_runtime < t->expires.sched) { | 1123 | if (!--maxfire || sum_sched_runtime < tl->expires.sched) { |
1124 | sched_expires = t->expires.sched; | 1124 | sched_expires = tl->expires.sched; |
1125 | break; | 1125 | break; |
1126 | } | 1126 | } |
1127 | t->firing = 1; | 1127 | tl->firing = 1; |
1128 | list_move_tail(&t->entry, firing); | 1128 | list_move_tail(&tl->entry, firing); |
1129 | } | 1129 | } |
1130 | 1130 | ||
1131 | /* | 1131 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a9b04203a66d..8476956ffd92 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -37,7 +37,6 @@ | |||
37 | #include <linux/mutex.h> | 37 | #include <linux/mutex.h> |
38 | 38 | ||
39 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
40 | #include <asm/semaphore.h> | ||
41 | #include <linux/list.h> | 40 | #include <linux/list.h> |
42 | #include <linux/init.h> | 41 | #include <linux/init.h> |
43 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6233f3b4ae66..b45da40e8d25 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -19,16 +19,6 @@ config PM | |||
19 | will issue the hlt instruction if nothing is to be done, thereby | 19 | will issue the hlt instruction if nothing is to be done, thereby |
20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
21 | 21 | ||
22 | config PM_LEGACY | ||
23 | bool "Legacy Power Management API (DEPRECATED)" | ||
24 | depends on PM | ||
25 | default n | ||
26 | ---help--- | ||
27 | Support for pm_register() and friends. This old API is obsoleted | ||
28 | by the driver model. | ||
29 | |||
30 | If unsure, say N. | ||
31 | |||
32 | config PM_DEBUG | 22 | config PM_DEBUG |
33 | bool "Power Management Debug Support" | 23 | bool "Power Management Debug Support" |
34 | depends on PM | 24 | depends on PM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecdb..597823b5b700 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG | |||
4 | endif | 4 | endif |
5 | 5 | ||
6 | obj-y := main.o | 6 | obj-y := main.o |
7 | obj-$(CONFIG_PM_LEGACY) += pm.o | ||
8 | obj-$(CONFIG_PM_SLEEP) += process.o console.o | 7 | obj-$(CONFIG_PM_SLEEP) += process.o console.o |
9 | obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o | 8 | obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o |
10 | 9 | ||
diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee5..b8628be2a465 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -7,17 +7,39 @@ | |||
7 | #include <linux/vt_kern.h> | 7 | #include <linux/vt_kern.h> |
8 | #include <linux/kbd_kern.h> | 8 | #include <linux/kbd_kern.h> |
9 | #include <linux/console.h> | 9 | #include <linux/console.h> |
10 | #include <linux/module.h> | ||
10 | #include "power.h" | 11 | #include "power.h" |
11 | 12 | ||
12 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) | 13 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) |
13 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 14 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
14 | 15 | ||
15 | static int orig_fgconsole, orig_kmsg; | 16 | static int orig_fgconsole, orig_kmsg; |
17 | static int disable_vt_switch; | ||
18 | |||
19 | /* | ||
20 | * Normally during a suspend, we allocate a new console and switch to it. | ||
21 | * When we resume, we switch back to the original console. This switch | ||
22 | * can be slow, so on systems where the framebuffer can handle restoration | ||
23 | * of video registers anyways, there's little point in doing the console | ||
24 | * switch. This function allows you to disable it by passing it '0'. | ||
25 | */ | ||
26 | void pm_set_vt_switch(int do_switch) | ||
27 | { | ||
28 | acquire_console_sem(); | ||
29 | disable_vt_switch = !do_switch; | ||
30 | release_console_sem(); | ||
31 | } | ||
32 | EXPORT_SYMBOL(pm_set_vt_switch); | ||
16 | 33 | ||
17 | int pm_prepare_console(void) | 34 | int pm_prepare_console(void) |
18 | { | 35 | { |
19 | acquire_console_sem(); | 36 | acquire_console_sem(); |
20 | 37 | ||
38 | if (disable_vt_switch) { | ||
39 | release_console_sem(); | ||
40 | return 0; | ||
41 | } | ||
42 | |||
21 | orig_fgconsole = fg_console; | 43 | orig_fgconsole = fg_console; |
22 | 44 | ||
23 | if (vc_allocate(SUSPEND_CONSOLE)) { | 45 | if (vc_allocate(SUSPEND_CONSOLE)) { |
@@ -50,9 +72,12 @@ int pm_prepare_console(void) | |||
50 | void pm_restore_console(void) | 72 | void pm_restore_console(void) |
51 | { | 73 | { |
52 | acquire_console_sem(); | 74 | acquire_console_sem(); |
75 | if (disable_vt_switch) { | ||
76 | release_console_sem(); | ||
77 | return; | ||
78 | } | ||
53 | set_console(orig_fgconsole); | 79 | set_console(orig_fgconsole); |
54 | release_console_sem(); | 80 | release_console_sem(); |
55 | kmsg_redirect = orig_kmsg; | 81 | kmsg_redirect = orig_kmsg; |
56 | return; | ||
57 | } | 82 | } |
58 | #endif | 83 | #endif |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d5..000000000000 --- a/kernel/power/pm.c +++ /dev/null | |||
@@ -1,205 +0,0 @@ | |||
1 | /* | ||
2 | * pm.c - Power management interface | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrew Henroid | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/pm.h> | ||
26 | #include <linux/pm_legacy.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/mutex.h> | ||
29 | |||
30 | /* | ||
31 | * Locking notes: | ||
32 | * pm_devs_lock can be a semaphore providing pm ops are not called | ||
33 | * from an interrupt handler (already a bad idea so no change here). Each | ||
34 | * change must be protected so that an unlink of an entry doesn't clash | ||
35 | * with a pm send - which is permitted to sleep in the current architecture | ||
36 | * | ||
37 | * Module unloads clashing with pm events now work out safely, the module | ||
38 | * unload path will block until the event has been sent. It may well block | ||
39 | * until a resume but that will be fine. | ||
40 | */ | ||
41 | |||
42 | static DEFINE_MUTEX(pm_devs_lock); | ||
43 | static LIST_HEAD(pm_devs); | ||
44 | |||
45 | /** | ||
46 | * pm_register - register a device with power management | ||
47 | * @type: device type | ||
48 | * @id: device ID | ||
49 | * @callback: callback function | ||
50 | * | ||
51 | * Add a device to the list of devices that wish to be notified about | ||
52 | * power management events. A &pm_dev structure is returned on success, | ||
53 | * on failure the return is %NULL. | ||
54 | * | ||
55 | * The callback function will be called in process context and | ||
56 | * it may sleep. | ||
57 | */ | ||
58 | |||
59 | struct pm_dev *pm_register(pm_dev_t type, | ||
60 | unsigned long id, | ||
61 | pm_callback callback) | ||
62 | { | ||
63 | struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); | ||
64 | if (dev) { | ||
65 | dev->type = type; | ||
66 | dev->id = id; | ||
67 | dev->callback = callback; | ||
68 | |||
69 | mutex_lock(&pm_devs_lock); | ||
70 | list_add(&dev->entry, &pm_devs); | ||
71 | mutex_unlock(&pm_devs_lock); | ||
72 | } | ||
73 | return dev; | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * pm_send - send request to a single device | ||
78 | * @dev: device to send to | ||
79 | * @rqst: power management request | ||
80 | * @data: data for the callback | ||
81 | * | ||
82 | * Issue a power management request to a given device. The | ||
83 | * %PM_SUSPEND and %PM_RESUME events are handled specially. The | ||
84 | * data field must hold the intended next state. No call is made | ||
85 | * if the state matches. | ||
86 | * | ||
87 | * BUGS: what stops two power management requests occurring in parallel | ||
88 | * and conflicting. | ||
89 | * | ||
90 | * WARNING: Calling pm_send directly is not generally recommended, in | ||
91 | * particular there is no locking against the pm_dev going away. The | ||
92 | * caller must maintain all needed locking or have 'inside knowledge' | ||
93 | * on the safety. Also remember that this function is not locked against | ||
94 | * pm_unregister. This means that you must handle SMP races on callback | ||
95 | * execution and unload yourself. | ||
96 | */ | ||
97 | |||
98 | static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) | ||
99 | { | ||
100 | int status = 0; | ||
101 | unsigned long prev_state, next_state; | ||
102 | |||
103 | if (in_interrupt()) | ||
104 | BUG(); | ||
105 | |||
106 | switch (rqst) { | ||
107 | case PM_SUSPEND: | ||
108 | case PM_RESUME: | ||
109 | prev_state = dev->state; | ||
110 | next_state = (unsigned long) data; | ||
111 | if (prev_state != next_state) { | ||
112 | if (dev->callback) | ||
113 | status = (*dev->callback)(dev, rqst, data); | ||
114 | if (!status) { | ||
115 | dev->state = next_state; | ||
116 | dev->prev_state = prev_state; | ||
117 | } | ||
118 | } | ||
119 | else { | ||
120 | dev->prev_state = prev_state; | ||
121 | } | ||
122 | break; | ||
123 | default: | ||
124 | if (dev->callback) | ||
125 | status = (*dev->callback)(dev, rqst, data); | ||
126 | break; | ||
127 | } | ||
128 | return status; | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Undo incomplete request | ||
133 | */ | ||
134 | static void pm_undo_all(struct pm_dev *last) | ||
135 | { | ||
136 | struct list_head *entry = last->entry.prev; | ||
137 | while (entry != &pm_devs) { | ||
138 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
139 | if (dev->state != dev->prev_state) { | ||
140 | /* previous state was zero (running) resume or | ||
141 | * previous state was non-zero (suspended) suspend | ||
142 | */ | ||
143 | pm_request_t undo = (dev->prev_state | ||
144 | ? PM_SUSPEND:PM_RESUME); | ||
145 | pm_send(dev, undo, (void*) dev->prev_state); | ||
146 | } | ||
147 | entry = entry->prev; | ||
148 | } | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * pm_send_all - send request to all managed devices | ||
153 | * @rqst: power management request | ||
154 | * @data: data for the callback | ||
155 | * | ||
156 | * Issue a power management request to a all devices. The | ||
157 | * %PM_SUSPEND events are handled specially. Any device is | ||
158 | * permitted to fail a suspend by returning a non zero (error) | ||
159 | * value from its callback function. If any device vetoes a | ||
160 | * suspend request then all other devices that have suspended | ||
161 | * during the processing of this request are restored to their | ||
162 | * previous state. | ||
163 | * | ||
164 | * WARNING: This function takes the pm_devs_lock. The lock is not dropped until | ||
165 | * the callbacks have completed. This prevents races against pm locking | ||
166 | * functions, races against module unload pm_unregister code. It does | ||
167 | * mean however that you must not issue pm_ functions within the callback | ||
168 | * or you will deadlock and users will hate you. | ||
169 | * | ||
170 | * Zero is returned on success. If a suspend fails then the status | ||
171 | * from the device that vetoes the suspend is returned. | ||
172 | * | ||
173 | * BUGS: what stops two power management requests occurring in parallel | ||
174 | * and conflicting. | ||
175 | */ | ||
176 | |||
177 | int pm_send_all(pm_request_t rqst, void *data) | ||
178 | { | ||
179 | struct list_head *entry; | ||
180 | |||
181 | mutex_lock(&pm_devs_lock); | ||
182 | entry = pm_devs.next; | ||
183 | while (entry != &pm_devs) { | ||
184 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
185 | if (dev->callback) { | ||
186 | int status = pm_send(dev, rqst, data); | ||
187 | if (status) { | ||
188 | /* return devices to previous state on | ||
189 | * failed suspend request | ||
190 | */ | ||
191 | if (rqst == PM_SUSPEND) | ||
192 | pm_undo_all(dev); | ||
193 | mutex_unlock(&pm_devs_lock); | ||
194 | return status; | ||
195 | } | ||
196 | } | ||
197 | entry = entry->next; | ||
198 | } | ||
199 | mutex_unlock(&pm_devs_lock); | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | EXPORT_SYMBOL(pm_register); | ||
204 | EXPORT_SYMBOL(pm_send_all); | ||
205 | |||
diff --git a/kernel/printk.c b/kernel/printk.c index c46a20a19a15..bdd4ea8c3f2b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -643,8 +643,21 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) | |||
643 | { | 643 | { |
644 | int retval = 0; | 644 | int retval = 0; |
645 | 645 | ||
646 | if (can_use_console(cpu)) | 646 | if (!try_acquire_console_sem()) { |
647 | retval = !try_acquire_console_sem(); | 647 | retval = 1; |
648 | |||
649 | /* | ||
650 | * If we can't use the console, we need to release | ||
651 | * the console semaphore by hand to avoid flushing | ||
652 | * the buffer. We need to hold the console semaphore | ||
653 | * in order to do this test safely. | ||
654 | */ | ||
655 | if (!can_use_console(cpu)) { | ||
656 | console_locked = 0; | ||
657 | up(&console_sem); | ||
658 | retval = 0; | ||
659 | } | ||
660 | } | ||
648 | printk_cpu = UINT_MAX; | 661 | printk_cpu = UINT_MAX; |
649 | spin_unlock(&logbuf_lock); | 662 | spin_unlock(&logbuf_lock); |
650 | return retval; | 663 | return retval; |
diff --git a/kernel/profile.c b/kernel/profile.c index 3b7a1b055122..606d7387265c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/mutex.h> | 24 | #include <linux/mutex.h> |
25 | #include <asm/sections.h> | 25 | #include <asm/sections.h> |
26 | #include <asm/semaphore.h> | ||
27 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
28 | #include <asm/ptrace.h> | 27 | #include <asm/ptrace.h> |
29 | 28 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fdb34e86f923..dac4b4e57293 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -323,9 +323,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) | |||
323 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; | 323 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; |
324 | } | 324 | } |
325 | 325 | ||
326 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | 326 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) |
327 | { | 327 | { |
328 | siginfo_t lastinfo; | ||
329 | int error = -ESRCH; | 328 | int error = -ESRCH; |
330 | 329 | ||
331 | read_lock(&tasklist_lock); | 330 | read_lock(&tasklist_lock); |
@@ -333,31 +332,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
333 | error = -EINVAL; | 332 | error = -EINVAL; |
334 | spin_lock_irq(&child->sighand->siglock); | 333 | spin_lock_irq(&child->sighand->siglock); |
335 | if (likely(child->last_siginfo != NULL)) { | 334 | if (likely(child->last_siginfo != NULL)) { |
336 | lastinfo = *child->last_siginfo; | 335 | *info = *child->last_siginfo; |
337 | error = 0; | 336 | error = 0; |
338 | } | 337 | } |
339 | spin_unlock_irq(&child->sighand->siglock); | 338 | spin_unlock_irq(&child->sighand->siglock); |
340 | } | 339 | } |
341 | read_unlock(&tasklist_lock); | 340 | read_unlock(&tasklist_lock); |
342 | if (!error) | ||
343 | return copy_siginfo_to_user(data, &lastinfo); | ||
344 | return error; | 341 | return error; |
345 | } | 342 | } |
346 | 343 | ||
347 | static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | 344 | static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) |
348 | { | 345 | { |
349 | siginfo_t newinfo; | ||
350 | int error = -ESRCH; | 346 | int error = -ESRCH; |
351 | 347 | ||
352 | if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) | ||
353 | return -EFAULT; | ||
354 | |||
355 | read_lock(&tasklist_lock); | 348 | read_lock(&tasklist_lock); |
356 | if (likely(child->sighand != NULL)) { | 349 | if (likely(child->sighand != NULL)) { |
357 | error = -EINVAL; | 350 | error = -EINVAL; |
358 | spin_lock_irq(&child->sighand->siglock); | 351 | spin_lock_irq(&child->sighand->siglock); |
359 | if (likely(child->last_siginfo != NULL)) { | 352 | if (likely(child->last_siginfo != NULL)) { |
360 | *child->last_siginfo = newinfo; | 353 | *child->last_siginfo = *info; |
361 | error = 0; | 354 | error = 0; |
362 | } | 355 | } |
363 | spin_unlock_irq(&child->sighand->siglock); | 356 | spin_unlock_irq(&child->sighand->siglock); |
@@ -424,6 +417,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
424 | long addr, long data) | 417 | long addr, long data) |
425 | { | 418 | { |
426 | int ret = -EIO; | 419 | int ret = -EIO; |
420 | siginfo_t siginfo; | ||
427 | 421 | ||
428 | switch (request) { | 422 | switch (request) { |
429 | case PTRACE_PEEKTEXT: | 423 | case PTRACE_PEEKTEXT: |
@@ -442,12 +436,22 @@ int ptrace_request(struct task_struct *child, long request, | |||
442 | case PTRACE_GETEVENTMSG: | 436 | case PTRACE_GETEVENTMSG: |
443 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 437 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); |
444 | break; | 438 | break; |
439 | |||
445 | case PTRACE_GETSIGINFO: | 440 | case PTRACE_GETSIGINFO: |
446 | ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); | 441 | ret = ptrace_getsiginfo(child, &siginfo); |
442 | if (!ret) | ||
443 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | ||
444 | &siginfo); | ||
447 | break; | 445 | break; |
446 | |||
448 | case PTRACE_SETSIGINFO: | 447 | case PTRACE_SETSIGINFO: |
449 | ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); | 448 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, |
449 | sizeof siginfo)) | ||
450 | ret = -EFAULT; | ||
451 | else | ||
452 | ret = ptrace_setsiginfo(child, &siginfo); | ||
450 | break; | 453 | break; |
454 | |||
451 | case PTRACE_DETACH: /* detach a process that was attached. */ | 455 | case PTRACE_DETACH: /* detach a process that was attached. */ |
452 | ret = ptrace_detach(child, data); | 456 | ret = ptrace_detach(child, data); |
453 | break; | 457 | break; |
@@ -608,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | |||
608 | return (copied == sizeof(data)) ? 0 : -EIO; | 612 | return (copied == sizeof(data)) ? 0 : -EIO; |
609 | } | 613 | } |
610 | 614 | ||
611 | #ifdef CONFIG_COMPAT | 615 | #if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE |
612 | #include <linux/compat.h> | 616 | #include <linux/compat.h> |
613 | 617 | ||
614 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | 618 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, |
@@ -616,6 +620,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
616 | { | 620 | { |
617 | compat_ulong_t __user *datap = compat_ptr(data); | 621 | compat_ulong_t __user *datap = compat_ptr(data); |
618 | compat_ulong_t word; | 622 | compat_ulong_t word; |
623 | siginfo_t siginfo; | ||
619 | int ret; | 624 | int ret; |
620 | 625 | ||
621 | switch (request) { | 626 | switch (request) { |
@@ -638,6 +643,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | 643 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); |
639 | break; | 644 | break; |
640 | 645 | ||
646 | case PTRACE_GETSIGINFO: | ||
647 | ret = ptrace_getsiginfo(child, &siginfo); | ||
648 | if (!ret) | ||
649 | ret = copy_siginfo_to_user32( | ||
650 | (struct compat_siginfo __user *) datap, | ||
651 | &siginfo); | ||
652 | break; | ||
653 | |||
654 | case PTRACE_SETSIGINFO: | ||
655 | memset(&siginfo, 0, sizeof siginfo); | ||
656 | if (copy_siginfo_from_user32( | ||
657 | &siginfo, (struct compat_siginfo __user *) datap)) | ||
658 | ret = -EFAULT; | ||
659 | else | ||
660 | ret = ptrace_setsiginfo(child, &siginfo); | ||
661 | break; | ||
662 | |||
641 | default: | 663 | default: |
642 | ret = ptrace_request(child, request, addr, data); | 664 | ret = ptrace_request(child, request, addr, data); |
643 | } | 665 | } |
@@ -645,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
645 | return ret; | 667 | return ret; |
646 | } | 668 | } |
647 | 669 | ||
648 | #ifdef __ARCH_WANT_COMPAT_SYS_PTRACE | ||
649 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | 670 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, |
650 | compat_long_t addr, compat_long_t data) | 671 | compat_long_t addr, compat_long_t data) |
651 | { | 672 | { |
@@ -688,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
688 | unlock_kernel(); | 709 | unlock_kernel(); |
689 | return ret; | 710 | return ret; |
690 | } | 711 | } |
691 | #endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ | 712 | #endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ |
692 | |||
693 | #endif /* CONFIG_COMPAT */ | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57c..e1cdf196a515 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
@@ -1007,10 +1007,10 @@ void __synchronize_sched(void) | |||
1007 | if (sched_getaffinity(0, &oldmask) < 0) | 1007 | if (sched_getaffinity(0, &oldmask) < 0) |
1008 | oldmask = cpu_possible_map; | 1008 | oldmask = cpu_possible_map; |
1009 | for_each_online_cpu(cpu) { | 1009 | for_each_online_cpu(cpu) { |
1010 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | 1010 | sched_setaffinity(0, &cpumask_of_cpu(cpu)); |
1011 | schedule(); | 1011 | schedule(); |
1012 | } | 1012 | } |
1013 | sched_setaffinity(0, oldmask); | 1013 | sched_setaffinity(0, &oldmask); |
1014 | } | 1014 | } |
1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); | 1015 | EXPORT_SYMBOL_GPL(__synchronize_sched); |
1016 | 1016 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72a..47894f919d4e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | |||
723 | */ | 723 | */ |
724 | static void rcu_torture_shuffle_tasks(void) | 724 | static void rcu_torture_shuffle_tasks(void) |
725 | { | 725 | { |
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | cpus_setall(tmp_mask); | ||
729 | get_online_cpus(); | 730 | get_online_cpus(); |
730 | 731 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 732 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
@@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) | |||
737 | if (rcu_idle_cpu != -1) | 738 | if (rcu_idle_cpu != -1) |
738 | cpu_clear(rcu_idle_cpu, tmp_mask); | 739 | cpu_clear(rcu_idle_cpu, tmp_mask); |
739 | 740 | ||
740 | set_cpus_allowed(current, tmp_mask); | 741 | set_cpus_allowed_ptr(current, &tmp_mask); |
741 | 742 | ||
742 | if (reader_tasks) { | 743 | if (reader_tasks) { |
743 | for (i = 0; i < nrealreaders; i++) | 744 | for (i = 0; i < nrealreaders; i++) |
744 | if (reader_tasks[i]) | 745 | if (reader_tasks[i]) |
745 | set_cpus_allowed(reader_tasks[i], tmp_mask); | 746 | set_cpus_allowed_ptr(reader_tasks[i], |
747 | &tmp_mask); | ||
746 | } | 748 | } |
747 | 749 | ||
748 | if (fakewriter_tasks) { | 750 | if (fakewriter_tasks) { |
749 | for (i = 0; i < nfakewriters; i++) | 751 | for (i = 0; i < nfakewriters; i++) |
750 | if (fakewriter_tasks[i]) | 752 | if (fakewriter_tasks[i]) |
751 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); | 753 | set_cpus_allowed_ptr(fakewriter_tasks[i], |
754 | &tmp_mask); | ||
752 | } | 755 | } |
753 | 756 | ||
754 | if (writer_task) | 757 | if (writer_task) |
755 | set_cpus_allowed(writer_task, tmp_mask); | 758 | set_cpus_allowed_ptr(writer_task, &tmp_mask); |
756 | 759 | ||
757 | if (stats_task) | 760 | if (stats_task) |
758 | set_cpus_allowed(stats_task, tmp_mask); | 761 | set_cpus_allowed_ptr(stats_task, &tmp_mask); |
759 | 762 | ||
760 | if (rcu_idle_cpu == -1) | 763 | if (rcu_idle_cpu == -1) |
761 | rcu_idle_cpu = num_online_cpus() - 1; | 764 | rcu_idle_cpu = num_online_cpus() - 1; |
diff --git a/kernel/resource.c b/kernel/resource.c index 82aea814d409..cee12cc47cab 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -486,6 +486,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
486 | 486 | ||
487 | EXPORT_SYMBOL(adjust_resource); | 487 | EXPORT_SYMBOL(adjust_resource); |
488 | 488 | ||
489 | /** | ||
490 | * resource_alignment - calculate resource's alignment | ||
491 | * @res: resource pointer | ||
492 | * | ||
493 | * Returns alignment on success, 0 (invalid alignment) on failure. | ||
494 | */ | ||
495 | resource_size_t resource_alignment(struct resource *res) | ||
496 | { | ||
497 | switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { | ||
498 | case IORESOURCE_SIZEALIGN: | ||
499 | return res->end - res->start + 1; | ||
500 | case IORESOURCE_STARTALIGN: | ||
501 | return res->start; | ||
502 | default: | ||
503 | return 0; | ||
504 | } | ||
505 | } | ||
506 | |||
489 | /* | 507 | /* |
490 | * This is compatibility stuff for IO resources. | 508 | * This is compatibility stuff for IO resources. |
491 | * | 509 | * |
diff --git a/kernel/sched.c b/kernel/sched.c index 8dcdec6fe0fe..740fb409e5bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -66,6 +66,10 @@ | |||
66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
69 | #include <linux/tick.h> | ||
70 | #include <linux/bootmem.h> | ||
71 | #include <linux/debugfs.h> | ||
72 | #include <linux/ctype.h> | ||
69 | 73 | ||
70 | #include <asm/tlb.h> | 74 | #include <asm/tlb.h> |
71 | #include <asm/irq_regs.h> | 75 | #include <asm/irq_regs.h> |
@@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
114 | */ | 118 | */ |
115 | #define DEF_TIMESLICE (100 * HZ / 1000) | 119 | #define DEF_TIMESLICE (100 * HZ / 1000) |
116 | 120 | ||
121 | /* | ||
122 | * single value that denotes runtime == period, ie unlimited time. | ||
123 | */ | ||
124 | #define RUNTIME_INF ((u64)~0ULL) | ||
125 | |||
117 | #ifdef CONFIG_SMP | 126 | #ifdef CONFIG_SMP |
118 | /* | 127 | /* |
119 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
@@ -155,6 +164,84 @@ struct rt_prio_array { | |||
155 | struct list_head queue[MAX_RT_PRIO]; | 164 | struct list_head queue[MAX_RT_PRIO]; |
156 | }; | 165 | }; |
157 | 166 | ||
167 | struct rt_bandwidth { | ||
168 | /* nests inside the rq lock: */ | ||
169 | spinlock_t rt_runtime_lock; | ||
170 | ktime_t rt_period; | ||
171 | u64 rt_runtime; | ||
172 | struct hrtimer rt_period_timer; | ||
173 | }; | ||
174 | |||
175 | static struct rt_bandwidth def_rt_bandwidth; | ||
176 | |||
177 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
178 | |||
179 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
180 | { | ||
181 | struct rt_bandwidth *rt_b = | ||
182 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
183 | ktime_t now; | ||
184 | int overrun; | ||
185 | int idle = 0; | ||
186 | |||
187 | for (;;) { | ||
188 | now = hrtimer_cb_get_time(timer); | ||
189 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
190 | |||
191 | if (!overrun) | ||
192 | break; | ||
193 | |||
194 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
195 | } | ||
196 | |||
197 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
198 | } | ||
199 | |||
200 | static | ||
201 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
202 | { | ||
203 | rt_b->rt_period = ns_to_ktime(period); | ||
204 | rt_b->rt_runtime = runtime; | ||
205 | |||
206 | spin_lock_init(&rt_b->rt_runtime_lock); | ||
207 | |||
208 | hrtimer_init(&rt_b->rt_period_timer, | ||
209 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
210 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
211 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
212 | } | ||
213 | |||
214 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
215 | { | ||
216 | ktime_t now; | ||
217 | |||
218 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
219 | return; | ||
220 | |||
221 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
222 | return; | ||
223 | |||
224 | spin_lock(&rt_b->rt_runtime_lock); | ||
225 | for (;;) { | ||
226 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
227 | break; | ||
228 | |||
229 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
230 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
231 | hrtimer_start(&rt_b->rt_period_timer, | ||
232 | rt_b->rt_period_timer.expires, | ||
233 | HRTIMER_MODE_ABS); | ||
234 | } | ||
235 | spin_unlock(&rt_b->rt_runtime_lock); | ||
236 | } | ||
237 | |||
238 | #ifdef CONFIG_RT_GROUP_SCHED | ||
239 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
240 | { | ||
241 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
242 | } | ||
243 | #endif | ||
244 | |||
158 | #ifdef CONFIG_GROUP_SCHED | 245 | #ifdef CONFIG_GROUP_SCHED |
159 | 246 | ||
160 | #include <linux/cgroup.h> | 247 | #include <linux/cgroup.h> |
@@ -181,29 +268,39 @@ struct task_group { | |||
181 | struct sched_rt_entity **rt_se; | 268 | struct sched_rt_entity **rt_se; |
182 | struct rt_rq **rt_rq; | 269 | struct rt_rq **rt_rq; |
183 | 270 | ||
184 | u64 rt_runtime; | 271 | struct rt_bandwidth rt_bandwidth; |
185 | #endif | 272 | #endif |
186 | 273 | ||
187 | struct rcu_head rcu; | 274 | struct rcu_head rcu; |
188 | struct list_head list; | 275 | struct list_head list; |
276 | |||
277 | struct task_group *parent; | ||
278 | struct list_head siblings; | ||
279 | struct list_head children; | ||
189 | }; | 280 | }; |
190 | 281 | ||
282 | #ifdef CONFIG_USER_SCHED | ||
283 | |||
284 | /* | ||
285 | * Root task group. | ||
286 | * Every UID task group (including init_task_group aka UID-0) will | ||
287 | * be a child to this group. | ||
288 | */ | ||
289 | struct task_group root_task_group; | ||
290 | |||
191 | #ifdef CONFIG_FAIR_GROUP_SCHED | 291 | #ifdef CONFIG_FAIR_GROUP_SCHED |
192 | /* Default task group's sched entity on each cpu */ | 292 | /* Default task group's sched entity on each cpu */ |
193 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 293 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
194 | /* Default task group's cfs_rq on each cpu */ | 294 | /* Default task group's cfs_rq on each cpu */ |
195 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 295 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
196 | |||
197 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
198 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
199 | #endif | 296 | #endif |
200 | 297 | ||
201 | #ifdef CONFIG_RT_GROUP_SCHED | 298 | #ifdef CONFIG_RT_GROUP_SCHED |
202 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 299 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
203 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 300 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
204 | 301 | #endif | |
205 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | 302 | #else |
206 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | 303 | #define root_task_group init_task_group |
207 | #endif | 304 | #endif |
208 | 305 | ||
209 | /* task_group_lock serializes add/remove of task groups and also changes to | 306 | /* task_group_lock serializes add/remove of task groups and also changes to |
@@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex); | |||
221 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 318 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
222 | #endif | 319 | #endif |
223 | 320 | ||
321 | #define MIN_SHARES 2 | ||
322 | |||
224 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 323 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
225 | #endif | 324 | #endif |
226 | 325 | ||
227 | /* Default task group. | 326 | /* Default task group. |
228 | * Every task in system belong to this group at bootup. | 327 | * Every task in system belong to this group at bootup. |
229 | */ | 328 | */ |
230 | struct task_group init_task_group = { | 329 | struct task_group init_task_group; |
231 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
232 | .se = init_sched_entity_p, | ||
233 | .cfs_rq = init_cfs_rq_p, | ||
234 | #endif | ||
235 | |||
236 | #ifdef CONFIG_RT_GROUP_SCHED | ||
237 | .rt_se = init_sched_rt_entity_p, | ||
238 | .rt_rq = init_rt_rq_p, | ||
239 | #endif | ||
240 | }; | ||
241 | 330 | ||
242 | /* return group to which a task belongs */ | 331 | /* return group to which a task belongs */ |
243 | static inline struct task_group *task_group(struct task_struct *p) | 332 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -297,8 +386,12 @@ struct cfs_rq { | |||
297 | 386 | ||
298 | struct rb_root tasks_timeline; | 387 | struct rb_root tasks_timeline; |
299 | struct rb_node *rb_leftmost; | 388 | struct rb_node *rb_leftmost; |
300 | struct rb_node *rb_load_balance_curr; | 389 | |
301 | /* 'curr' points to currently running entity on this cfs_rq. | 390 | struct list_head tasks; |
391 | struct list_head *balance_iterator; | ||
392 | |||
393 | /* | ||
394 | * 'curr' points to currently running entity on this cfs_rq. | ||
302 | * It is set to NULL otherwise (i.e when none are currently running). | 395 | * It is set to NULL otherwise (i.e when none are currently running). |
303 | */ | 396 | */ |
304 | struct sched_entity *curr, *next; | 397 | struct sched_entity *curr, *next; |
@@ -318,6 +411,43 @@ struct cfs_rq { | |||
318 | */ | 411 | */ |
319 | struct list_head leaf_cfs_rq_list; | 412 | struct list_head leaf_cfs_rq_list; |
320 | struct task_group *tg; /* group that "owns" this runqueue */ | 413 | struct task_group *tg; /* group that "owns" this runqueue */ |
414 | |||
415 | #ifdef CONFIG_SMP | ||
416 | unsigned long task_weight; | ||
417 | unsigned long shares; | ||
418 | /* | ||
419 | * We need space to build a sched_domain wide view of the full task | ||
420 | * group tree, in order to avoid depending on dynamic memory allocation | ||
421 | * during the load balancing we place this in the per cpu task group | ||
422 | * hierarchy. This limits the load balancing to one instance per cpu, | ||
423 | * but more should not be needed anyway. | ||
424 | */ | ||
425 | struct aggregate_struct { | ||
426 | /* | ||
427 | * load = weight(cpus) * f(tg) | ||
428 | * | ||
429 | * Where f(tg) is the recursive weight fraction assigned to | ||
430 | * this group. | ||
431 | */ | ||
432 | unsigned long load; | ||
433 | |||
434 | /* | ||
435 | * part of the group weight distributed to this span. | ||
436 | */ | ||
437 | unsigned long shares; | ||
438 | |||
439 | /* | ||
440 | * The sum of all runqueue weights within this span. | ||
441 | */ | ||
442 | unsigned long rq_weight; | ||
443 | |||
444 | /* | ||
445 | * Weight contributed by tasks; this is the part we can | ||
446 | * influence by moving tasks around. | ||
447 | */ | ||
448 | unsigned long task_weight; | ||
449 | } aggregate; | ||
450 | #endif | ||
321 | #endif | 451 | #endif |
322 | }; | 452 | }; |
323 | 453 | ||
@@ -334,6 +464,9 @@ struct rt_rq { | |||
334 | #endif | 464 | #endif |
335 | int rt_throttled; | 465 | int rt_throttled; |
336 | u64 rt_time; | 466 | u64 rt_time; |
467 | u64 rt_runtime; | ||
468 | /* Nests inside the rq lock: */ | ||
469 | spinlock_t rt_runtime_lock; | ||
337 | 470 | ||
338 | #ifdef CONFIG_RT_GROUP_SCHED | 471 | #ifdef CONFIG_RT_GROUP_SCHED |
339 | unsigned long rt_nr_boosted; | 472 | unsigned long rt_nr_boosted; |
@@ -396,6 +529,7 @@ struct rq { | |||
396 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 529 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
397 | unsigned char idle_at_tick; | 530 | unsigned char idle_at_tick; |
398 | #ifdef CONFIG_NO_HZ | 531 | #ifdef CONFIG_NO_HZ |
532 | unsigned long last_tick_seen; | ||
399 | unsigned char in_nohz_recently; | 533 | unsigned char in_nohz_recently; |
400 | #endif | 534 | #endif |
401 | /* capture load from *all* tasks on this cpu: */ | 535 | /* capture load from *all* tasks on this cpu: */ |
@@ -405,8 +539,6 @@ struct rq { | |||
405 | 539 | ||
406 | struct cfs_rq cfs; | 540 | struct cfs_rq cfs; |
407 | struct rt_rq rt; | 541 | struct rt_rq rt; |
408 | u64 rt_period_expire; | ||
409 | int rt_throttled; | ||
410 | 542 | ||
411 | #ifdef CONFIG_FAIR_GROUP_SCHED | 543 | #ifdef CONFIG_FAIR_GROUP_SCHED |
412 | /* list of leaf cfs_rq on this cpu: */ | 544 | /* list of leaf cfs_rq on this cpu: */ |
@@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq) | |||
499 | #endif | 631 | #endif |
500 | } | 632 | } |
501 | 633 | ||
634 | #ifdef CONFIG_NO_HZ | ||
635 | static inline bool nohz_on(int cpu) | ||
636 | { | ||
637 | return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; | ||
638 | } | ||
639 | |||
640 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
641 | { | ||
642 | return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; | ||
643 | } | ||
644 | |||
645 | static inline void update_last_tick_seen(struct rq *rq) | ||
646 | { | ||
647 | rq->last_tick_seen = jiffies; | ||
648 | } | ||
649 | #else | ||
650 | static inline u64 max_skipped_ticks(struct rq *rq) | ||
651 | { | ||
652 | return 1; | ||
653 | } | ||
654 | |||
655 | static inline void update_last_tick_seen(struct rq *rq) | ||
656 | { | ||
657 | } | ||
658 | #endif | ||
659 | |||
502 | /* | 660 | /* |
503 | * Update the per-runqueue clock, as finegrained as the platform can give | 661 | * Update the per-runqueue clock, as finegrained as the platform can give |
504 | * us, but without assuming monotonicity, etc.: | 662 | * us, but without assuming monotonicity, etc.: |
@@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) | |||
523 | /* | 681 | /* |
524 | * Catch too large forward jumps too: | 682 | * Catch too large forward jumps too: |
525 | */ | 683 | */ |
526 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { | 684 | u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; |
527 | if (clock < rq->tick_timestamp + TICK_NSEC) | 685 | u64 max_time = rq->tick_timestamp + max_jump; |
528 | clock = rq->tick_timestamp + TICK_NSEC; | 686 | |
687 | if (unlikely(clock + delta > max_time)) { | ||
688 | if (clock < max_time) | ||
689 | clock = max_time; | ||
529 | else | 690 | else |
530 | clock++; | 691 | clock++; |
531 | rq->clock_overflows++; | 692 | rq->clock_overflows++; |
@@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq) | |||
561 | #define task_rq(p) cpu_rq(task_cpu(p)) | 722 | #define task_rq(p) cpu_rq(task_cpu(p)) |
562 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 723 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
563 | 724 | ||
564 | unsigned long rt_needs_cpu(int cpu) | ||
565 | { | ||
566 | struct rq *rq = cpu_rq(cpu); | ||
567 | u64 delta; | ||
568 | |||
569 | if (!rq->rt_throttled) | ||
570 | return 0; | ||
571 | |||
572 | if (rq->clock > rq->rt_period_expire) | ||
573 | return 1; | ||
574 | |||
575 | delta = rq->rt_period_expire - rq->clock; | ||
576 | do_div(delta, NSEC_PER_SEC / HZ); | ||
577 | |||
578 | return (unsigned long)delta; | ||
579 | } | ||
580 | |||
581 | /* | 725 | /* |
582 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 726 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
583 | */ | 727 | */ |
@@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu) | |||
590 | /* | 734 | /* |
591 | * Debugging: various feature bits | 735 | * Debugging: various feature bits |
592 | */ | 736 | */ |
737 | |||
738 | #define SCHED_FEAT(name, enabled) \ | ||
739 | __SCHED_FEAT_##name , | ||
740 | |||
593 | enum { | 741 | enum { |
594 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 742 | #include "sched_features.h" |
595 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | ||
596 | SCHED_FEAT_START_DEBIT = 4, | ||
597 | SCHED_FEAT_HRTICK = 8, | ||
598 | SCHED_FEAT_DOUBLE_TICK = 16, | ||
599 | }; | 743 | }; |
600 | 744 | ||
745 | #undef SCHED_FEAT | ||
746 | |||
747 | #define SCHED_FEAT(name, enabled) \ | ||
748 | (1UL << __SCHED_FEAT_##name) * enabled | | ||
749 | |||
601 | const_debug unsigned int sysctl_sched_features = | 750 | const_debug unsigned int sysctl_sched_features = |
602 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 751 | #include "sched_features.h" |
603 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 752 | 0; |
604 | SCHED_FEAT_START_DEBIT * 1 | | 753 | |
605 | SCHED_FEAT_HRTICK * 1 | | 754 | #undef SCHED_FEAT |
606 | SCHED_FEAT_DOUBLE_TICK * 0; | 755 | |
756 | #ifdef CONFIG_SCHED_DEBUG | ||
757 | #define SCHED_FEAT(name, enabled) \ | ||
758 | #name , | ||
759 | |||
760 | __read_mostly char *sched_feat_names[] = { | ||
761 | #include "sched_features.h" | ||
762 | NULL | ||
763 | }; | ||
764 | |||
765 | #undef SCHED_FEAT | ||
607 | 766 | ||
608 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 767 | int sched_feat_open(struct inode *inode, struct file *filp) |
768 | { | ||
769 | filp->private_data = inode->i_private; | ||
770 | return 0; | ||
771 | } | ||
772 | |||
773 | static ssize_t | ||
774 | sched_feat_read(struct file *filp, char __user *ubuf, | ||
775 | size_t cnt, loff_t *ppos) | ||
776 | { | ||
777 | char *buf; | ||
778 | int r = 0; | ||
779 | int len = 0; | ||
780 | int i; | ||
781 | |||
782 | for (i = 0; sched_feat_names[i]; i++) { | ||
783 | len += strlen(sched_feat_names[i]); | ||
784 | len += 4; | ||
785 | } | ||
786 | |||
787 | buf = kmalloc(len + 2, GFP_KERNEL); | ||
788 | if (!buf) | ||
789 | return -ENOMEM; | ||
790 | |||
791 | for (i = 0; sched_feat_names[i]; i++) { | ||
792 | if (sysctl_sched_features & (1UL << i)) | ||
793 | r += sprintf(buf + r, "%s ", sched_feat_names[i]); | ||
794 | else | ||
795 | r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); | ||
796 | } | ||
797 | |||
798 | r += sprintf(buf + r, "\n"); | ||
799 | WARN_ON(r >= len + 2); | ||
800 | |||
801 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
802 | |||
803 | kfree(buf); | ||
804 | |||
805 | return r; | ||
806 | } | ||
807 | |||
808 | static ssize_t | ||
809 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
810 | size_t cnt, loff_t *ppos) | ||
811 | { | ||
812 | char buf[64]; | ||
813 | char *cmp = buf; | ||
814 | int neg = 0; | ||
815 | int i; | ||
816 | |||
817 | if (cnt > 63) | ||
818 | cnt = 63; | ||
819 | |||
820 | if (copy_from_user(&buf, ubuf, cnt)) | ||
821 | return -EFAULT; | ||
822 | |||
823 | buf[cnt] = 0; | ||
824 | |||
825 | if (strncmp(buf, "NO_", 3) == 0) { | ||
826 | neg = 1; | ||
827 | cmp += 3; | ||
828 | } | ||
829 | |||
830 | for (i = 0; sched_feat_names[i]; i++) { | ||
831 | int len = strlen(sched_feat_names[i]); | ||
832 | |||
833 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
834 | if (neg) | ||
835 | sysctl_sched_features &= ~(1UL << i); | ||
836 | else | ||
837 | sysctl_sched_features |= (1UL << i); | ||
838 | break; | ||
839 | } | ||
840 | } | ||
841 | |||
842 | if (!sched_feat_names[i]) | ||
843 | return -EINVAL; | ||
844 | |||
845 | filp->f_pos += cnt; | ||
846 | |||
847 | return cnt; | ||
848 | } | ||
849 | |||
850 | static struct file_operations sched_feat_fops = { | ||
851 | .open = sched_feat_open, | ||
852 | .read = sched_feat_read, | ||
853 | .write = sched_feat_write, | ||
854 | }; | ||
855 | |||
856 | static __init int sched_init_debug(void) | ||
857 | { | ||
858 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
859 | &sched_feat_fops); | ||
860 | |||
861 | return 0; | ||
862 | } | ||
863 | late_initcall(sched_init_debug); | ||
864 | |||
865 | #endif | ||
866 | |||
867 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
609 | 868 | ||
610 | /* | 869 | /* |
611 | * Number of tasks to iterate in a single balance run. | 870 | * Number of tasks to iterate in a single balance run. |
@@ -627,16 +886,52 @@ static __read_mostly int scheduler_running; | |||
627 | */ | 886 | */ |
628 | int sysctl_sched_rt_runtime = 950000; | 887 | int sysctl_sched_rt_runtime = 950000; |
629 | 888 | ||
630 | /* | 889 | static inline u64 global_rt_period(void) |
631 | * single value that denotes runtime == period, ie unlimited time. | 890 | { |
632 | */ | 891 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
633 | #define RUNTIME_INF ((u64)~0ULL) | 892 | } |
893 | |||
894 | static inline u64 global_rt_runtime(void) | ||
895 | { | ||
896 | if (sysctl_sched_rt_period < 0) | ||
897 | return RUNTIME_INF; | ||
898 | |||
899 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
900 | } | ||
901 | |||
902 | static const unsigned long long time_sync_thresh = 100000; | ||
903 | |||
904 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
905 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
634 | 906 | ||
635 | /* | 907 | /* |
636 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 908 | * Global lock which we take every now and then to synchronize |
637 | * clock constructed from sched_clock(): | 909 | * the CPUs time. This method is not warp-safe, but it's good |
910 | * enough to synchronize slowly diverging time sources and thus | ||
911 | * it's good enough for tracing: | ||
638 | */ | 912 | */ |
639 | unsigned long long cpu_clock(int cpu) | 913 | static DEFINE_SPINLOCK(time_sync_lock); |
914 | static unsigned long long prev_global_time; | ||
915 | |||
916 | static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) | ||
917 | { | ||
918 | unsigned long flags; | ||
919 | |||
920 | spin_lock_irqsave(&time_sync_lock, flags); | ||
921 | |||
922 | if (time < prev_global_time) { | ||
923 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
924 | time = prev_global_time; | ||
925 | } else { | ||
926 | prev_global_time = time; | ||
927 | } | ||
928 | |||
929 | spin_unlock_irqrestore(&time_sync_lock, flags); | ||
930 | |||
931 | return time; | ||
932 | } | ||
933 | |||
934 | static unsigned long long __cpu_clock(int cpu) | ||
640 | { | 935 | { |
641 | unsigned long long now; | 936 | unsigned long long now; |
642 | unsigned long flags; | 937 | unsigned long flags; |
@@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu) | |||
657 | 952 | ||
658 | return now; | 953 | return now; |
659 | } | 954 | } |
955 | |||
956 | /* | ||
957 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
958 | * clock constructed from sched_clock(): | ||
959 | */ | ||
960 | unsigned long long cpu_clock(int cpu) | ||
961 | { | ||
962 | unsigned long long prev_cpu_time, time, delta_time; | ||
963 | |||
964 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
965 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
966 | delta_time = time-prev_cpu_time; | ||
967 | |||
968 | if (unlikely(delta_time > time_sync_thresh)) | ||
969 | time = __sync_cpu_clock(time, cpu); | ||
970 | |||
971 | return time; | ||
972 | } | ||
660 | EXPORT_SYMBOL_GPL(cpu_clock); | 973 | EXPORT_SYMBOL_GPL(cpu_clock); |
661 | 974 | ||
662 | #ifndef prepare_arch_switch | 975 | #ifndef prepare_arch_switch |
@@ -1116,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
1116 | */ | 1429 | */ |
1117 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1430 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1118 | 1431 | ||
1432 | /* | ||
1433 | * delta *= weight / lw | ||
1434 | */ | ||
1119 | static unsigned long | 1435 | static unsigned long |
1120 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1436 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1121 | struct load_weight *lw) | 1437 | struct load_weight *lw) |
@@ -1138,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1138 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1454 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1139 | } | 1455 | } |
1140 | 1456 | ||
1141 | static inline unsigned long | ||
1142 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
1143 | { | ||
1144 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
1145 | } | ||
1146 | |||
1147 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1457 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1148 | { | 1458 | { |
1149 | lw->weight += inc; | 1459 | lw->weight += inc; |
@@ -1241,11 +1551,347 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
1241 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1551 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
1242 | #endif | 1552 | #endif |
1243 | 1553 | ||
1554 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1555 | { | ||
1556 | update_load_add(&rq->load, load); | ||
1557 | } | ||
1558 | |||
1559 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1560 | { | ||
1561 | update_load_sub(&rq->load, load); | ||
1562 | } | ||
1563 | |||
1244 | #ifdef CONFIG_SMP | 1564 | #ifdef CONFIG_SMP |
1245 | static unsigned long source_load(int cpu, int type); | 1565 | static unsigned long source_load(int cpu, int type); |
1246 | static unsigned long target_load(int cpu, int type); | 1566 | static unsigned long target_load(int cpu, int type); |
1247 | static unsigned long cpu_avg_load_per_task(int cpu); | 1567 | static unsigned long cpu_avg_load_per_task(int cpu); |
1248 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1568 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1569 | |||
1570 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1571 | |||
1572 | /* | ||
1573 | * Group load balancing. | ||
1574 | * | ||
1575 | * We calculate a few balance domain wide aggregate numbers; load and weight. | ||
1576 | * Given the pictures below, and assuming each item has equal weight: | ||
1577 | * | ||
1578 | * root 1 - thread | ||
1579 | * / | \ A - group | ||
1580 | * A 1 B | ||
1581 | * /|\ / \ | ||
1582 | * C 2 D 3 4 | ||
1583 | * | | | ||
1584 | * 5 6 | ||
1585 | * | ||
1586 | * load: | ||
1587 | * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, | ||
1588 | * which equals 1/9-th of the total load. | ||
1589 | * | ||
1590 | * shares: | ||
1591 | * The weight of this group on the selected cpus. | ||
1592 | * | ||
1593 | * rq_weight: | ||
1594 | * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while | ||
1595 | * B would get 2. | ||
1596 | * | ||
1597 | * task_weight: | ||
1598 | * Part of the rq_weight contributed by tasks; all groups except B would | ||
1599 | * get 1, B gets 2. | ||
1600 | */ | ||
1601 | |||
1602 | static inline struct aggregate_struct * | ||
1603 | aggregate(struct task_group *tg, struct sched_domain *sd) | ||
1604 | { | ||
1605 | return &tg->cfs_rq[sd->first_cpu]->aggregate; | ||
1606 | } | ||
1607 | |||
1608 | typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); | ||
1609 | |||
1610 | /* | ||
1611 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1612 | * leaving it for the final time. | ||
1613 | */ | ||
1614 | static | ||
1615 | void aggregate_walk_tree(aggregate_func down, aggregate_func up, | ||
1616 | struct sched_domain *sd) | ||
1617 | { | ||
1618 | struct task_group *parent, *child; | ||
1619 | |||
1620 | rcu_read_lock(); | ||
1621 | parent = &root_task_group; | ||
1622 | down: | ||
1623 | (*down)(parent, sd); | ||
1624 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
1625 | parent = child; | ||
1626 | goto down; | ||
1627 | |||
1628 | up: | ||
1629 | continue; | ||
1630 | } | ||
1631 | (*up)(parent, sd); | ||
1632 | |||
1633 | child = parent; | ||
1634 | parent = parent->parent; | ||
1635 | if (parent) | ||
1636 | goto up; | ||
1637 | rcu_read_unlock(); | ||
1638 | } | ||
1639 | |||
1640 | /* | ||
1641 | * Calculate the aggregate runqueue weight. | ||
1642 | */ | ||
1643 | static | ||
1644 | void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) | ||
1645 | { | ||
1646 | unsigned long rq_weight = 0; | ||
1647 | unsigned long task_weight = 0; | ||
1648 | int i; | ||
1649 | |||
1650 | for_each_cpu_mask(i, sd->span) { | ||
1651 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1652 | task_weight += tg->cfs_rq[i]->task_weight; | ||
1653 | } | ||
1654 | |||
1655 | aggregate(tg, sd)->rq_weight = rq_weight; | ||
1656 | aggregate(tg, sd)->task_weight = task_weight; | ||
1657 | } | ||
1658 | |||
1659 | /* | ||
1660 | * Compute the weight of this group on the given cpus. | ||
1661 | */ | ||
1662 | static | ||
1663 | void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) | ||
1664 | { | ||
1665 | unsigned long shares = 0; | ||
1666 | int i; | ||
1667 | |||
1668 | for_each_cpu_mask(i, sd->span) | ||
1669 | shares += tg->cfs_rq[i]->shares; | ||
1670 | |||
1671 | if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) | ||
1672 | shares = tg->shares; | ||
1673 | |||
1674 | aggregate(tg, sd)->shares = shares; | ||
1675 | } | ||
1676 | |||
1677 | /* | |
1678 | * Compute the load fraction assigned to this group, relies on the aggregate | |
1679 | * weight and this group's parent's load, i.e. top-down. A group without a parent takes the sum of cpu_rq(i)->load.weight over sd->span instead. | |
1680 | */ | |
1681 | static | |
1682 | void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) | |
1683 | { | |
1684 | unsigned long load; | |
1685 | |
1686 | if (!tg->parent) { | |
1687 | int i; | |
1688 | |
1689 | load = 0; | |
1690 | for_each_cpu_mask(i, sd->span) | |
1691 | load += cpu_rq(i)->load.weight; | |
1692 | |
1693 | } else { | |
1694 | load = aggregate(tg->parent, sd)->load; | |
1695 | |
1696 | /* | |
1697 | * shares is our weight in the parent's rq so | |
1698 | * shares/parent->rq_weight gives our fraction of the load; the divisor used is rq_weight + 1, presumably so that it can never be zero (TODO confirm) | |
1699 | */ | |
1700 | load *= aggregate(tg, sd)->shares; | |
1701 | load /= aggregate(tg->parent, sd)->rq_weight + 1; | |
1702 | } | |
1703 | |
1704 | aggregate(tg, sd)->load = load; | |
1705 | } | |
1706 | |||
1707 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1708 | |||
1709 | /* | |
1710 | * Calculate and set the group shares of tg on cpu tcpu. Returns without doing anything when tg has no sched_entity on that cpu (tg->se[tcpu] is NULL). | |
1711 | */ | |
1712 | static void | |
1713 | __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | |
1714 | int tcpu) | |
1715 | { | |
1716 | int boost = 0; | |
1717 | unsigned long shares; | |
1718 | unsigned long rq_weight; | |
1719 | |
1720 | if (!tg->se[tcpu]) | |
1721 | return; | |
1722 | |
1723 | rq_weight = tg->cfs_rq[tcpu]->load.weight; | |
1724 | |
1725 | /* | |
1726 | * If there are currently no tasks on the cpu pretend there is one of | |
1727 | * average load so that when a new task gets to run here it will not | |
1728 | * get delayed by group starvation. | |
1729 | */ | |
1730 | if (!rq_weight) { | |
1731 | boost = 1; | |
1732 | rq_weight = NICE_0_LOAD; | |
1733 | } | |
1734 | |
1735 | /* | |
1736 | * \Sum shares * rq_weight | |
1737 | * shares = ----------------------- | |
1738 | * \Sum rq_weight | |
1739 | * | |
1740 | */ | |
1741 | shares = aggregate(tg, sd)->shares * rq_weight; | |
1742 | shares /= aggregate(tg, sd)->rq_weight + 1; | |
1743 | |
1744 | /* | |
1745 | * record the actual number of shares, not the boosted amount: a boosted (empty) cpu records 0, while the sched_entity below still gets at least MIN_SHARES. | |
1746 | */ | |
1747 | tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; | |
1748 | |
1749 | if (shares < MIN_SHARES) | |
1750 | shares = MIN_SHARES; | |
1751 | |
1752 | __set_se_shares(tg->se[tcpu], shares); | |
1753 | } | |
1754 | |||
1755 | /* | |
1756 | * Re-adjust the weights on the cpu the task came from and on the cpu the | |
1757 | * task went to. | |
1758 | */ | |
1759 | static void | |
1760 | __move_group_shares(struct task_group *tg, struct sched_domain *sd, | |
1761 | int scpu, int dcpu) | |
1762 | { | |
1763 | unsigned long shares; | |
1764 | |
1765 | shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | |
1766 | |
1767 | __update_group_shares_cpu(tg, sd, scpu); | |
1768 | __update_group_shares_cpu(tg, sd, dcpu); | |
1769 | |
1770 | /* | |
1771 | * ensure we never lose shares due to rounding errors in the | |
1772 | * above redistribution: any difference between the old and new scpu + dcpu totals is added to the dcpu cfs_rq. | |
1773 | */ | |
1774 | shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | |
1775 | if (shares) | |
1776 | tg->cfs_rq[dcpu]->shares += shares; | |
1777 | } | |
1778 | |||
1779 | /* | |
1780 | * Because changing a group's shares changes the weight of the super-group | |
1781 | * we need to walk up the tree and change all shares until we hit the root, i.e. run __move_group_shares() on tg and then on each tg->parent in turn. | |
1782 | */ | |
1783 | static void | |
1784 | move_group_shares(struct task_group *tg, struct sched_domain *sd, | |
1785 | int scpu, int dcpu) | |
1786 | { | |
1787 | while (tg) { | |
1788 | __move_group_shares(tg, sd, scpu, dcpu); | |
1789 | tg = tg->parent; | |
1790 | } | |
1791 | } | |
1792 | |||
1793 | static | |
1794 | void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) | |
1795 | { | |
1796 | unsigned long shares = aggregate(tg, sd)->shares; | |
1797 | int i; | |
1798 | |
1799 | for_each_cpu_mask(i, sd->span) { | |
1800 | struct rq *rq = cpu_rq(i); | |
1801 | unsigned long flags; | |
1802 | |
1803 | spin_lock_irqsave(&rq->lock, flags); | |
1804 | __update_group_shares_cpu(tg, sd, i); | |
1805 | spin_unlock_irqrestore(&rq->lock, flags); | |
1806 | } | |
1807 | |
1808 | aggregate_group_shares(tg, sd); | |
1809 | |
1810 | /* | |
1811 | * ensure we never lose shares due to rounding errors in the | |
1812 | * above redistribution: the difference from the previous aggregate is added to the cfs_rq of sd->first_cpu and to the aggregate itself. | |
1813 | */ | |
1814 | shares -= aggregate(tg, sd)->shares; | |
1815 | if (shares) { | |
1816 | tg->cfs_rq[sd->first_cpu]->shares += shares; | |
1817 | aggregate(tg, sd)->shares += shares; | |
1818 | } | |
1819 | } | |
1820 | |||
1821 | /* | |
1822 | * Calculate the accumulative weight and recursive load of each task group | |
1823 | * while walking down the tree. Order matters: the shares step reads the aggregate rq_weight and the load step reads the aggregate shares computed just before it. | |
1824 | */ | |
1825 | static | |
1826 | void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) | |
1827 | { | |
1828 | aggregate_group_weight(tg, sd); | |
1829 | aggregate_group_shares(tg, sd); | |
1830 | aggregate_group_load(tg, sd); | |
1831 | } | |
1832 | |||
1833 | /* | |
1834 | * Rebalance the cpu shares while walking back up the tree; thin wrapper around aggregate_group_set_shares(). | |
1835 | */ | |
1836 | static | |
1837 | void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) | |
1838 | { | |
1839 | aggregate_group_set_shares(tg, sd); | |
1840 | } | |
1841 | |||
1842 | static DEFINE_PER_CPU(spinlock_t, aggregate_lock); /* taken by get_aggregate(), released by put_aggregate() */ | |
1843 | |
1844 | static void __init init_aggregate(void) /* spin_lock_init() the aggregate_lock of every possible cpu */ | |
1845 | { | |
1846 | int i; | |
1847 | |
1848 | for_each_possible_cpu(i) | |
1849 | spin_lock_init(&per_cpu(aggregate_lock, i)); | |
1850 | } | |
1851 | |
1852 | static int get_aggregate(struct sched_domain *sd) /* returns 1 with the sd->first_cpu lock still held after walking the tree, or 0 if the trylock failed and nothing was done */ | |
1853 | { | |
1854 | if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) | |
1855 | return 0; | |
1856 | |
1857 | aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); | |
1858 | return 1; | |
1859 | } | |
1860 | |
1861 | static void put_aggregate(struct sched_domain *sd) /* releases the sd->first_cpu lock; only to be called after get_aggregate() returned 1 */ | |
1862 | { | |
1863 | spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); | |
1864 | } | |
1865 | |||
1866 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) /* records shares in cfs_rq->shares; the variant in the !CONFIG_SMP branch below is an empty stub */ | |
1867 | { | |
1868 | cfs_rq->shares = shares; | |
1869 | } | |
1870 | |||
1871 | #else | ||
1872 | |||
1873 | static inline void init_aggregate(void) /* stub: nothing to initialise in this configuration */ | |
1874 | { | |
1875 | } | |
1876 | |
1877 | static inline int get_aggregate(struct sched_domain *sd) /* stub: always 0, so callers that gate put_aggregate() on the result never call it */ | |
1878 | { | |
1879 | return 0; | |
1880 | } | |
1881 | |
1882 | static inline void put_aggregate(struct sched_domain *sd) /* stub: no lock to release */ | |
1883 | { | |
1884 | } | |
1885 | #endif | ||
1886 | |||
1887 | #else /* CONFIG_SMP */ | ||
1888 | |||
1889 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1890 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) /* !CONFIG_SMP stub: the shares value is ignored */ | |
1891 | { | |
1892 | } | |
1893 | #endif | ||
1894 | |||
1249 | #endif /* CONFIG_SMP */ | 1895 | #endif /* CONFIG_SMP */ |
1250 | 1896 | ||
1251 | #include "sched_stats.h" | 1897 | #include "sched_stats.h" |
@@ -1258,26 +1904,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | |||
1258 | 1904 | ||
1259 | #define sched_class_highest (&rt_sched_class) | 1905 | #define sched_class_highest (&rt_sched_class) |
1260 | 1906 | ||
1261 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1907 | static void inc_nr_running(struct rq *rq) |
1262 | { | ||
1263 | update_load_add(&rq->load, p->se.load.weight); | ||
1264 | } | ||
1265 | |||
1266 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1267 | { | ||
1268 | update_load_sub(&rq->load, p->se.load.weight); | ||
1269 | } | ||
1270 | |||
1271 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1272 | { | 1908 | { |
1273 | rq->nr_running++; | 1909 | rq->nr_running++; |
1274 | inc_load(rq, p); | ||
1275 | } | 1910 | } |
1276 | 1911 | ||
1277 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1912 | static void dec_nr_running(struct rq *rq) |
1278 | { | 1913 | { |
1279 | rq->nr_running--; | 1914 | rq->nr_running--; |
1280 | dec_load(rq, p); | ||
1281 | } | 1915 | } |
1282 | 1916 | ||
1283 | static void set_load_weight(struct task_struct *p) | 1917 | static void set_load_weight(struct task_struct *p) |
@@ -1369,7 +2003,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1369 | rq->nr_uninterruptible--; | 2003 | rq->nr_uninterruptible--; |
1370 | 2004 | ||
1371 | enqueue_task(rq, p, wakeup); | 2005 | enqueue_task(rq, p, wakeup); |
1372 | inc_nr_running(p, rq); | 2006 | inc_nr_running(rq); |
1373 | } | 2007 | } |
1374 | 2008 | ||
1375 | /* | 2009 | /* |
@@ -1381,7 +2015,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1381 | rq->nr_uninterruptible++; | 2015 | rq->nr_uninterruptible++; |
1382 | 2016 | ||
1383 | dequeue_task(rq, p, sleep); | 2017 | dequeue_task(rq, p, sleep); |
1384 | dec_nr_running(p, rq); | 2018 | dec_nr_running(rq); |
1385 | } | 2019 | } |
1386 | 2020 | ||
1387 | /** | 2021 | /** |
@@ -1438,7 +2072,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
1438 | /* | 2072 | /* |
1439 | * Buddy candidates are cache hot: | 2073 | * Buddy candidates are cache hot: |
1440 | */ | 2074 | */ |
1441 | if (&p->se == cfs_rq_of(&p->se)->next) | 2075 | if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) |
1442 | return 1; | 2076 | return 1; |
1443 | 2077 | ||
1444 | if (p->sched_class != &fair_sched_class) | 2078 | if (p->sched_class != &fair_sched_class) |
@@ -1728,17 +2362,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1728 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 2362 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
1729 | */ | 2363 | */ |
1730 | static int | 2364 | static int |
1731 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 2365 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, |
2366 | cpumask_t *tmp) | ||
1732 | { | 2367 | { |
1733 | cpumask_t tmp; | ||
1734 | unsigned long load, min_load = ULONG_MAX; | 2368 | unsigned long load, min_load = ULONG_MAX; |
1735 | int idlest = -1; | 2369 | int idlest = -1; |
1736 | int i; | 2370 | int i; |
1737 | 2371 | ||
1738 | /* Traverse only the allowed CPUs */ | 2372 | /* Traverse only the allowed CPUs */ |
1739 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 2373 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); |
1740 | 2374 | ||
1741 | for_each_cpu_mask(i, tmp) { | 2375 | for_each_cpu_mask(i, *tmp) { |
1742 | load = weighted_cpuload(i); | 2376 | load = weighted_cpuload(i); |
1743 | 2377 | ||
1744 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2378 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -1777,7 +2411,7 @@ static int sched_balance_self(int cpu, int flag) | |||
1777 | } | 2411 | } |
1778 | 2412 | ||
1779 | while (sd) { | 2413 | while (sd) { |
1780 | cpumask_t span; | 2414 | cpumask_t span, tmpmask; |
1781 | struct sched_group *group; | 2415 | struct sched_group *group; |
1782 | int new_cpu, weight; | 2416 | int new_cpu, weight; |
1783 | 2417 | ||
@@ -1793,7 +2427,7 @@ static int sched_balance_self(int cpu, int flag) | |||
1793 | continue; | 2427 | continue; |
1794 | } | 2428 | } |
1795 | 2429 | ||
1796 | new_cpu = find_idlest_cpu(group, t, cpu); | 2430 | new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); |
1797 | if (new_cpu == -1 || new_cpu == cpu) { | 2431 | if (new_cpu == -1 || new_cpu == cpu) { |
1798 | /* Now try balancing at a lower domain level of cpu */ | 2432 | /* Now try balancing at a lower domain level of cpu */ |
1799 | sd = sd->child; | 2433 | sd = sd->child; |
@@ -1839,6 +2473,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1839 | long old_state; | 2473 | long old_state; |
1840 | struct rq *rq; | 2474 | struct rq *rq; |
1841 | 2475 | ||
2476 | if (!sched_feat(SYNC_WAKEUPS)) | ||
2477 | sync = 0; | ||
2478 | |||
1842 | smp_wmb(); | 2479 | smp_wmb(); |
1843 | rq = task_rq_lock(p, &flags); | 2480 | rq = task_rq_lock(p, &flags); |
1844 | old_state = p->state; | 2481 | old_state = p->state; |
@@ -1955,6 +2592,7 @@ static void __sched_fork(struct task_struct *p) | |||
1955 | 2592 | ||
1956 | INIT_LIST_HEAD(&p->rt.run_list); | 2593 | INIT_LIST_HEAD(&p->rt.run_list); |
1957 | p->se.on_rq = 0; | 2594 | p->se.on_rq = 0; |
2595 | INIT_LIST_HEAD(&p->se.group_node); | ||
1958 | 2596 | ||
1959 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2597 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1960 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2598 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2030,7 +2668,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2030 | * management (if any): | 2668 | * management (if any): |
2031 | */ | 2669 | */ |
2032 | p->sched_class->task_new(rq, p); | 2670 | p->sched_class->task_new(rq, p); |
2033 | inc_nr_running(p, rq); | 2671 | inc_nr_running(rq); |
2034 | } | 2672 | } |
2035 | check_preempt_curr(rq, p); | 2673 | check_preempt_curr(rq, p); |
2036 | #ifdef CONFIG_SMP | 2674 | #ifdef CONFIG_SMP |
@@ -2674,7 +3312,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2674 | static struct sched_group * | 3312 | static struct sched_group * |
2675 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3313 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2676 | unsigned long *imbalance, enum cpu_idle_type idle, | 3314 | unsigned long *imbalance, enum cpu_idle_type idle, |
2677 | int *sd_idle, cpumask_t *cpus, int *balance) | 3315 | int *sd_idle, const cpumask_t *cpus, int *balance) |
2678 | { | 3316 | { |
2679 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 3317 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2680 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 3318 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
@@ -2975,7 +3613,7 @@ ret: | |||
2975 | */ | 3613 | */ |
2976 | static struct rq * | 3614 | static struct rq * |
2977 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 3615 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
2978 | unsigned long imbalance, cpumask_t *cpus) | 3616 | unsigned long imbalance, const cpumask_t *cpus) |
2979 | { | 3617 | { |
2980 | struct rq *busiest = NULL, *rq; | 3618 | struct rq *busiest = NULL, *rq; |
2981 | unsigned long max_load = 0; | 3619 | unsigned long max_load = 0; |
@@ -3014,14 +3652,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3014 | */ | 3652 | */ |
3015 | static int load_balance(int this_cpu, struct rq *this_rq, | 3653 | static int load_balance(int this_cpu, struct rq *this_rq, |
3016 | struct sched_domain *sd, enum cpu_idle_type idle, | 3654 | struct sched_domain *sd, enum cpu_idle_type idle, |
3017 | int *balance) | 3655 | int *balance, cpumask_t *cpus) |
3018 | { | 3656 | { |
3019 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3657 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
3020 | struct sched_group *group; | 3658 | struct sched_group *group; |
3021 | unsigned long imbalance; | 3659 | unsigned long imbalance; |
3022 | struct rq *busiest; | 3660 | struct rq *busiest; |
3023 | cpumask_t cpus = CPU_MASK_ALL; | ||
3024 | unsigned long flags; | 3661 | unsigned long flags; |
3662 | int unlock_aggregate; | ||
3663 | |||
3664 | cpus_setall(*cpus); | ||
3665 | |||
3666 | unlock_aggregate = get_aggregate(sd); | ||
3025 | 3667 | ||
3026 | /* | 3668 | /* |
3027 | * When power savings policy is enabled for the parent domain, idle | 3669 | * When power savings policy is enabled for the parent domain, idle |
@@ -3037,7 +3679,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3037 | 3679 | ||
3038 | redo: | 3680 | redo: |
3039 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3681 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3040 | &cpus, balance); | 3682 | cpus, balance); |
3041 | 3683 | ||
3042 | if (*balance == 0) | 3684 | if (*balance == 0) |
3043 | goto out_balanced; | 3685 | goto out_balanced; |
@@ -3047,7 +3689,7 @@ redo: | |||
3047 | goto out_balanced; | 3689 | goto out_balanced; |
3048 | } | 3690 | } |
3049 | 3691 | ||
3050 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 3692 | busiest = find_busiest_queue(group, idle, imbalance, cpus); |
3051 | if (!busiest) { | 3693 | if (!busiest) { |
3052 | schedstat_inc(sd, lb_nobusyq[idle]); | 3694 | schedstat_inc(sd, lb_nobusyq[idle]); |
3053 | goto out_balanced; | 3695 | goto out_balanced; |
@@ -3080,8 +3722,8 @@ redo: | |||
3080 | 3722 | ||
3081 | /* All tasks on this runqueue were pinned by CPU affinity */ | 3723 | /* All tasks on this runqueue were pinned by CPU affinity */ |
3082 | if (unlikely(all_pinned)) { | 3724 | if (unlikely(all_pinned)) { |
3083 | cpu_clear(cpu_of(busiest), cpus); | 3725 | cpu_clear(cpu_of(busiest), *cpus); |
3084 | if (!cpus_empty(cpus)) | 3726 | if (!cpus_empty(*cpus)) |
3085 | goto redo; | 3727 | goto redo; |
3086 | goto out_balanced; | 3728 | goto out_balanced; |
3087 | } | 3729 | } |
@@ -3138,8 +3780,9 @@ redo: | |||
3138 | 3780 | ||
3139 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3781 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3140 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3782 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3141 | return -1; | 3783 | ld_moved = -1; |
3142 | return ld_moved; | 3784 | |
3785 | goto out; | ||
3143 | 3786 | ||
3144 | out_balanced: | 3787 | out_balanced: |
3145 | schedstat_inc(sd, lb_balanced[idle]); | 3788 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -3154,8 +3797,13 @@ out_one_pinned: | |||
3154 | 3797 | ||
3155 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3798 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3156 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3799 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3157 | return -1; | 3800 | ld_moved = -1; |
3158 | return 0; | 3801 | else |
3802 | ld_moved = 0; | ||
3803 | out: | ||
3804 | if (unlock_aggregate) | ||
3805 | put_aggregate(sd); | ||
3806 | return ld_moved; | ||
3159 | } | 3807 | } |
3160 | 3808 | ||
3161 | /* | 3809 | /* |
@@ -3166,7 +3814,8 @@ out_one_pinned: | |||
3166 | * this_rq is locked. | 3814 | * this_rq is locked. |
3167 | */ | 3815 | */ |
3168 | static int | 3816 | static int |
3169 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 3817 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, |
3818 | cpumask_t *cpus) | ||
3170 | { | 3819 | { |
3171 | struct sched_group *group; | 3820 | struct sched_group *group; |
3172 | struct rq *busiest = NULL; | 3821 | struct rq *busiest = NULL; |
@@ -3174,7 +3823,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
3174 | int ld_moved = 0; | 3823 | int ld_moved = 0; |
3175 | int sd_idle = 0; | 3824 | int sd_idle = 0; |
3176 | int all_pinned = 0; | 3825 | int all_pinned = 0; |
3177 | cpumask_t cpus = CPU_MASK_ALL; | 3826 | |
3827 | cpus_setall(*cpus); | ||
3178 | 3828 | ||
3179 | /* | 3829 | /* |
3180 | * When power savings policy is enabled for the parent domain, idle | 3830 | * When power savings policy is enabled for the parent domain, idle |
@@ -3189,14 +3839,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
3189 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3839 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3190 | redo: | 3840 | redo: |
3191 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3841 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3192 | &sd_idle, &cpus, NULL); | 3842 | &sd_idle, cpus, NULL); |
3193 | if (!group) { | 3843 | if (!group) { |
3194 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 3844 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
3195 | goto out_balanced; | 3845 | goto out_balanced; |
3196 | } | 3846 | } |
3197 | 3847 | ||
3198 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 3848 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); |
3199 | &cpus); | ||
3200 | if (!busiest) { | 3849 | if (!busiest) { |
3201 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 3850 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
3202 | goto out_balanced; | 3851 | goto out_balanced; |
@@ -3218,8 +3867,8 @@ redo: | |||
3218 | spin_unlock(&busiest->lock); | 3867 | spin_unlock(&busiest->lock); |
3219 | 3868 | ||
3220 | if (unlikely(all_pinned)) { | 3869 | if (unlikely(all_pinned)) { |
3221 | cpu_clear(cpu_of(busiest), cpus); | 3870 | cpu_clear(cpu_of(busiest), *cpus); |
3222 | if (!cpus_empty(cpus)) | 3871 | if (!cpus_empty(*cpus)) |
3223 | goto redo; | 3872 | goto redo; |
3224 | } | 3873 | } |
3225 | } | 3874 | } |
@@ -3253,6 +3902,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3253 | struct sched_domain *sd; | 3902 | struct sched_domain *sd; |
3254 | int pulled_task = -1; | 3903 | int pulled_task = -1; |
3255 | unsigned long next_balance = jiffies + HZ; | 3904 | unsigned long next_balance = jiffies + HZ; |
3905 | cpumask_t tmpmask; | ||
3256 | 3906 | ||
3257 | for_each_domain(this_cpu, sd) { | 3907 | for_each_domain(this_cpu, sd) { |
3258 | unsigned long interval; | 3908 | unsigned long interval; |
@@ -3262,8 +3912,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3262 | 3912 | ||
3263 | if (sd->flags & SD_BALANCE_NEWIDLE) | 3913 | if (sd->flags & SD_BALANCE_NEWIDLE) |
3264 | /* If we've pulled tasks over stop searching: */ | 3914 | /* If we've pulled tasks over stop searching: */ |
3265 | pulled_task = load_balance_newidle(this_cpu, | 3915 | pulled_task = load_balance_newidle(this_cpu, this_rq, |
3266 | this_rq, sd); | 3916 | sd, &tmpmask); |
3267 | 3917 | ||
3268 | interval = msecs_to_jiffies(sd->balance_interval); | 3918 | interval = msecs_to_jiffies(sd->balance_interval); |
3269 | if (time_after(next_balance, sd->last_balance + interval)) | 3919 | if (time_after(next_balance, sd->last_balance + interval)) |
@@ -3422,6 +4072,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3422 | /* Earliest time when we have to do rebalance again */ | 4072 | /* Earliest time when we have to do rebalance again */ |
3423 | unsigned long next_balance = jiffies + 60*HZ; | 4073 | unsigned long next_balance = jiffies + 60*HZ; |
3424 | int update_next_balance = 0; | 4074 | int update_next_balance = 0; |
4075 | cpumask_t tmp; | ||
3425 | 4076 | ||
3426 | for_each_domain(cpu, sd) { | 4077 | for_each_domain(cpu, sd) { |
3427 | if (!(sd->flags & SD_LOAD_BALANCE)) | 4078 | if (!(sd->flags & SD_LOAD_BALANCE)) |
@@ -3445,7 +4096,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3445 | } | 4096 | } |
3446 | 4097 | ||
3447 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 4098 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
3448 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 4099 | if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { |
3449 | /* | 4100 | /* |
3450 | * We've pulled tasks over so either we're no | 4101 | * We've pulled tasks over so either we're no |
3451 | * longer idle, or one of our SMT siblings is | 4102 | * longer idle, or one of our SMT siblings is |
@@ -3561,7 +4212,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
3561 | */ | 4212 | */ |
3562 | int ilb = first_cpu(nohz.cpu_mask); | 4213 | int ilb = first_cpu(nohz.cpu_mask); |
3563 | 4214 | ||
3564 | if (ilb != NR_CPUS) | 4215 | if (ilb < nr_cpu_ids) |
3565 | resched_cpu(ilb); | 4216 | resched_cpu(ilb); |
3566 | } | 4217 | } |
3567 | } | 4218 | } |
@@ -3765,9 +4416,9 @@ void scheduler_tick(void) | |||
3765 | rq->clock_underflows++; | 4416 | rq->clock_underflows++; |
3766 | } | 4417 | } |
3767 | rq->tick_timestamp = rq->clock; | 4418 | rq->tick_timestamp = rq->clock; |
4419 | update_last_tick_seen(rq); | ||
3768 | update_cpu_load(rq); | 4420 | update_cpu_load(rq); |
3769 | curr->sched_class->task_tick(rq, curr, 0); | 4421 | curr->sched_class->task_tick(rq, curr, 0); |
3770 | update_sched_rt_period(rq); | ||
3771 | spin_unlock(&rq->lock); | 4422 | spin_unlock(&rq->lock); |
3772 | 4423 | ||
3773 | #ifdef CONFIG_SMP | 4424 | #ifdef CONFIG_SMP |
@@ -4367,10 +5018,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4367 | goto out_unlock; | 5018 | goto out_unlock; |
4368 | } | 5019 | } |
4369 | on_rq = p->se.on_rq; | 5020 | on_rq = p->se.on_rq; |
4370 | if (on_rq) { | 5021 | if (on_rq) |
4371 | dequeue_task(rq, p, 0); | 5022 | dequeue_task(rq, p, 0); |
4372 | dec_load(rq, p); | ||
4373 | } | ||
4374 | 5023 | ||
4375 | p->static_prio = NICE_TO_PRIO(nice); | 5024 | p->static_prio = NICE_TO_PRIO(nice); |
4376 | set_load_weight(p); | 5025 | set_load_weight(p); |
@@ -4380,7 +5029,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4380 | 5029 | ||
4381 | if (on_rq) { | 5030 | if (on_rq) { |
4382 | enqueue_task(rq, p, 0); | 5031 | enqueue_task(rq, p, 0); |
4383 | inc_load(rq, p); | ||
4384 | /* | 5032 | /* |
4385 | * If the task increased its priority or is running and | 5033 | * If the task increased its priority or is running and |
4386 | * lowered its priority, then reschedule its CPU: | 5034 | * lowered its priority, then reschedule its CPU: |
@@ -4602,7 +5250,7 @@ recheck: | |||
4602 | * Do not allow realtime tasks into groups that have no runtime | 5250 | * Do not allow realtime tasks into groups that have no runtime |
4603 | * assigned. | 5251 | * assigned. |
4604 | */ | 5252 | */ |
4605 | if (rt_policy(policy) && task_group(p)->rt_runtime == 0) | 5253 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) |
4606 | return -EPERM; | 5254 | return -EPERM; |
4607 | #endif | 5255 | #endif |
4608 | 5256 | ||
@@ -4764,9 +5412,10 @@ out_unlock: | |||
4764 | return retval; | 5412 | return retval; |
4765 | } | 5413 | } |
4766 | 5414 | ||
4767 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 5415 | long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) |
4768 | { | 5416 | { |
4769 | cpumask_t cpus_allowed; | 5417 | cpumask_t cpus_allowed; |
5418 | cpumask_t new_mask = *in_mask; | ||
4770 | struct task_struct *p; | 5419 | struct task_struct *p; |
4771 | int retval; | 5420 | int retval; |
4772 | 5421 | ||
@@ -4797,13 +5446,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4797 | if (retval) | 5446 | if (retval) |
4798 | goto out_unlock; | 5447 | goto out_unlock; |
4799 | 5448 | ||
4800 | cpus_allowed = cpuset_cpus_allowed(p); | 5449 | cpuset_cpus_allowed(p, &cpus_allowed); |
4801 | cpus_and(new_mask, new_mask, cpus_allowed); | 5450 | cpus_and(new_mask, new_mask, cpus_allowed); |
4802 | again: | 5451 | again: |
4803 | retval = set_cpus_allowed(p, new_mask); | 5452 | retval = set_cpus_allowed_ptr(p, &new_mask); |
4804 | 5453 | ||
4805 | if (!retval) { | 5454 | if (!retval) { |
4806 | cpus_allowed = cpuset_cpus_allowed(p); | 5455 | cpuset_cpus_allowed(p, &cpus_allowed); |
4807 | if (!cpus_subset(new_mask, cpus_allowed)) { | 5456 | if (!cpus_subset(new_mask, cpus_allowed)) { |
4808 | /* | 5457 | /* |
4809 | * We must have raced with a concurrent cpuset | 5458 | * We must have raced with a concurrent cpuset |
@@ -4847,7 +5496,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
4847 | if (retval) | 5496 | if (retval) |
4848 | return retval; | 5497 | return retval; |
4849 | 5498 | ||
4850 | return sched_setaffinity(pid, new_mask); | 5499 | return sched_setaffinity(pid, &new_mask); |
4851 | } | 5500 | } |
4852 | 5501 | ||
4853 | /* | 5502 | /* |
@@ -5309,7 +5958,6 @@ static inline void sched_init_granularity(void) | |||
5309 | sysctl_sched_latency = limit; | 5958 | sysctl_sched_latency = limit; |
5310 | 5959 | ||
5311 | sysctl_sched_wakeup_granularity *= factor; | 5960 | sysctl_sched_wakeup_granularity *= factor; |
5312 | sysctl_sched_batch_wakeup_granularity *= factor; | ||
5313 | } | 5961 | } |
5314 | 5962 | ||
5315 | #ifdef CONFIG_SMP | 5963 | #ifdef CONFIG_SMP |
@@ -5338,7 +5986,7 @@ static inline void sched_init_granularity(void) | |||
5338 | * task must not exit() & deallocate itself prematurely. The | 5986 | * task must not exit() & deallocate itself prematurely. The |
5339 | * call is not atomic; no spinlocks may be held. | 5987 | * call is not atomic; no spinlocks may be held. |
5340 | */ | 5988 | */ |
5341 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 5989 | int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) |
5342 | { | 5990 | { |
5343 | struct migration_req req; | 5991 | struct migration_req req; |
5344 | unsigned long flags; | 5992 | unsigned long flags; |
@@ -5346,23 +5994,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5346 | int ret = 0; | 5994 | int ret = 0; |
5347 | 5995 | ||
5348 | rq = task_rq_lock(p, &flags); | 5996 | rq = task_rq_lock(p, &flags); |
5349 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 5997 | if (!cpus_intersects(*new_mask, cpu_online_map)) { |
5350 | ret = -EINVAL; | 5998 | ret = -EINVAL; |
5351 | goto out; | 5999 | goto out; |
5352 | } | 6000 | } |
5353 | 6001 | ||
5354 | if (p->sched_class->set_cpus_allowed) | 6002 | if (p->sched_class->set_cpus_allowed) |
5355 | p->sched_class->set_cpus_allowed(p, &new_mask); | 6003 | p->sched_class->set_cpus_allowed(p, new_mask); |
5356 | else { | 6004 | else { |
5357 | p->cpus_allowed = new_mask; | 6005 | p->cpus_allowed = *new_mask; |
5358 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | 6006 | p->rt.nr_cpus_allowed = cpus_weight(*new_mask); |
5359 | } | 6007 | } |
5360 | 6008 | ||
5361 | /* Can the task run on the task's current CPU? If so, we're done */ | 6009 | /* Can the task run on the task's current CPU? If so, we're done */ |
5362 | if (cpu_isset(task_cpu(p), new_mask)) | 6010 | if (cpu_isset(task_cpu(p), *new_mask)) |
5363 | goto out; | 6011 | goto out; |
5364 | 6012 | ||
5365 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 6013 | if (migrate_task(p, any_online_cpu(*new_mask), &req)) { |
5366 | /* Need help from migration thread: drop lock and wait. */ | 6014 | /* Need help from migration thread: drop lock and wait. */ |
5367 | task_rq_unlock(rq, &flags); | 6015 | task_rq_unlock(rq, &flags); |
5368 | wake_up_process(rq->migration_thread); | 6016 | wake_up_process(rq->migration_thread); |
@@ -5375,7 +6023,7 @@ out: | |||
5375 | 6023 | ||
5376 | return ret; | 6024 | return ret; |
5377 | } | 6025 | } |
5378 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 6026 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
5379 | 6027 | ||
5380 | /* | 6028 | /* |
5381 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 6029 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
@@ -5513,12 +6161,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5513 | dest_cpu = any_online_cpu(mask); | 6161 | dest_cpu = any_online_cpu(mask); |
5514 | 6162 | ||
5515 | /* On any allowed CPU? */ | 6163 | /* On any allowed CPU? */ |
5516 | if (dest_cpu == NR_CPUS) | 6164 | if (dest_cpu >= nr_cpu_ids) |
5517 | dest_cpu = any_online_cpu(p->cpus_allowed); | 6165 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5518 | 6166 | ||
5519 | /* No more Mr. Nice Guy. */ | 6167 | /* No more Mr. Nice Guy. */ |
5520 | if (dest_cpu == NR_CPUS) { | 6168 | if (dest_cpu >= nr_cpu_ids) { |
5521 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); | 6169 | cpumask_t cpus_allowed; |
6170 | |||
6171 | cpuset_cpus_allowed_locked(p, &cpus_allowed); | ||
5522 | /* | 6172 | /* |
5523 | * Try to stay on the same cpuset, where the | 6173 | * Try to stay on the same cpuset, where the |
5524 | * current cpuset may be a subset of all cpus. | 6174 | * current cpuset may be a subset of all cpus. |
@@ -5554,7 +6204,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5554 | */ | 6204 | */ |
5555 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6205 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5556 | { | 6206 | { |
5557 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 6207 | struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); |
5558 | unsigned long flags; | 6208 | unsigned long flags; |
5559 | 6209 | ||
5560 | local_irq_save(flags); | 6210 | local_irq_save(flags); |
@@ -5966,20 +6616,16 @@ void __init migration_init(void) | |||
5966 | 6616 | ||
5967 | #ifdef CONFIG_SMP | 6617 | #ifdef CONFIG_SMP |
5968 | 6618 | ||
5969 | /* Number of possible processor ids */ | ||
5970 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
5971 | EXPORT_SYMBOL(nr_cpu_ids); | ||
5972 | |||
5973 | #ifdef CONFIG_SCHED_DEBUG | 6619 | #ifdef CONFIG_SCHED_DEBUG |
5974 | 6620 | ||
5975 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | 6621 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6622 | cpumask_t *groupmask) | ||
5976 | { | 6623 | { |
5977 | struct sched_group *group = sd->groups; | 6624 | struct sched_group *group = sd->groups; |
5978 | cpumask_t groupmask; | 6625 | char str[256]; |
5979 | char str[NR_CPUS]; | ||
5980 | 6626 | ||
5981 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 6627 | cpulist_scnprintf(str, sizeof(str), sd->span); |
5982 | cpus_clear(groupmask); | 6628 | cpus_clear(*groupmask); |
5983 | 6629 | ||
5984 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 6630 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
5985 | 6631 | ||
@@ -6023,25 +6669,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
6023 | break; | 6669 | break; |
6024 | } | 6670 | } |
6025 | 6671 | ||
6026 | if (cpus_intersects(groupmask, group->cpumask)) { | 6672 | if (cpus_intersects(*groupmask, group->cpumask)) { |
6027 | printk(KERN_CONT "\n"); | 6673 | printk(KERN_CONT "\n"); |
6028 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 6674 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
6029 | break; | 6675 | break; |
6030 | } | 6676 | } |
6031 | 6677 | ||
6032 | cpus_or(groupmask, groupmask, group->cpumask); | 6678 | cpus_or(*groupmask, *groupmask, group->cpumask); |
6033 | 6679 | ||
6034 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 6680 | cpulist_scnprintf(str, sizeof(str), group->cpumask); |
6035 | printk(KERN_CONT " %s", str); | 6681 | printk(KERN_CONT " %s", str); |
6036 | 6682 | ||
6037 | group = group->next; | 6683 | group = group->next; |
6038 | } while (group != sd->groups); | 6684 | } while (group != sd->groups); |
6039 | printk(KERN_CONT "\n"); | 6685 | printk(KERN_CONT "\n"); |
6040 | 6686 | ||
6041 | if (!cpus_equal(sd->span, groupmask)) | 6687 | if (!cpus_equal(sd->span, *groupmask)) |
6042 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6688 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
6043 | 6689 | ||
6044 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) | 6690 | if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) |
6045 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6691 | printk(KERN_ERR "ERROR: parent span is not a superset " |
6046 | "of domain->span\n"); | 6692 | "of domain->span\n"); |
6047 | return 0; | 6693 | return 0; |
@@ -6049,6 +6695,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | |||
6049 | 6695 | ||
6050 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6696 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6051 | { | 6697 | { |
6698 | cpumask_t *groupmask; | ||
6052 | int level = 0; | 6699 | int level = 0; |
6053 | 6700 | ||
6054 | if (!sd) { | 6701 | if (!sd) { |
@@ -6058,14 +6705,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6058 | 6705 | ||
6059 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6706 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6060 | 6707 | ||
6708 | groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | ||
6709 | if (!groupmask) { | ||
6710 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6711 | return; | ||
6712 | } | ||
6713 | |||
6061 | for (;;) { | 6714 | for (;;) { |
6062 | if (sched_domain_debug_one(sd, cpu, level)) | 6715 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) |
6063 | break; | 6716 | break; |
6064 | level++; | 6717 | level++; |
6065 | sd = sd->parent; | 6718 | sd = sd->parent; |
6066 | if (!sd) | 6719 | if (!sd) |
6067 | break; | 6720 | break; |
6068 | } | 6721 | } |
6722 | kfree(groupmask); | ||
6069 | } | 6723 | } |
6070 | #else | 6724 | #else |
6071 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6725 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6253,30 +6907,33 @@ __setup("isolcpus=", isolated_cpu_setup); | |||
6253 | * and ->cpu_power to 0. | 6907 | * and ->cpu_power to 0. |
6254 | */ | 6908 | */ |
6255 | static void | 6909 | static void |
6256 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 6910 | init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, |
6257 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 6911 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
6258 | struct sched_group **sg)) | 6912 | struct sched_group **sg, |
6913 | cpumask_t *tmpmask), | ||
6914 | cpumask_t *covered, cpumask_t *tmpmask) | ||
6259 | { | 6915 | { |
6260 | struct sched_group *first = NULL, *last = NULL; | 6916 | struct sched_group *first = NULL, *last = NULL; |
6261 | cpumask_t covered = CPU_MASK_NONE; | ||
6262 | int i; | 6917 | int i; |
6263 | 6918 | ||
6264 | for_each_cpu_mask(i, span) { | 6919 | cpus_clear(*covered); |
6920 | |||
6921 | for_each_cpu_mask(i, *span) { | ||
6265 | struct sched_group *sg; | 6922 | struct sched_group *sg; |
6266 | int group = group_fn(i, cpu_map, &sg); | 6923 | int group = group_fn(i, cpu_map, &sg, tmpmask); |
6267 | int j; | 6924 | int j; |
6268 | 6925 | ||
6269 | if (cpu_isset(i, covered)) | 6926 | if (cpu_isset(i, *covered)) |
6270 | continue; | 6927 | continue; |
6271 | 6928 | ||
6272 | sg->cpumask = CPU_MASK_NONE; | 6929 | cpus_clear(sg->cpumask); |
6273 | sg->__cpu_power = 0; | 6930 | sg->__cpu_power = 0; |
6274 | 6931 | ||
6275 | for_each_cpu_mask(j, span) { | 6932 | for_each_cpu_mask(j, *span) { |
6276 | if (group_fn(j, cpu_map, NULL) != group) | 6933 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
6277 | continue; | 6934 | continue; |
6278 | 6935 | ||
6279 | cpu_set(j, covered); | 6936 | cpu_set(j, *covered); |
6280 | cpu_set(j, sg->cpumask); | 6937 | cpu_set(j, sg->cpumask); |
6281 | } | 6938 | } |
6282 | if (!first) | 6939 | if (!first) |
@@ -6302,7 +6959,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
6302 | * | 6959 | * |
6303 | * Should use nodemask_t. | 6960 | * Should use nodemask_t. |
6304 | */ | 6961 | */ |
6305 | static int find_next_best_node(int node, unsigned long *used_nodes) | 6962 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6306 | { | 6963 | { |
6307 | int i, n, val, min_val, best_node = 0; | 6964 | int i, n, val, min_val, best_node = 0; |
6308 | 6965 | ||
@@ -6316,7 +6973,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
6316 | continue; | 6973 | continue; |
6317 | 6974 | ||
6318 | /* Skip already used nodes */ | 6975 | /* Skip already used nodes */ |
6319 | if (test_bit(n, used_nodes)) | 6976 | if (node_isset(n, *used_nodes)) |
6320 | continue; | 6977 | continue; |
6321 | 6978 | ||
6322 | /* Simple min distance search */ | 6979 | /* Simple min distance search */ |
@@ -6328,40 +6985,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
6328 | } | 6985 | } |
6329 | } | 6986 | } |
6330 | 6987 | ||
6331 | set_bit(best_node, used_nodes); | 6988 | node_set(best_node, *used_nodes); |
6332 | return best_node; | 6989 | return best_node; |
6333 | } | 6990 | } |
6334 | 6991 | ||
6335 | /** | 6992 | /** |
6336 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 6993 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
6337 | * @node: node whose cpumask we're constructing | 6994 | * @node: node whose cpumask we're constructing |
6338 | * @size: number of nodes to include in this span | 6995 | * @span: resulting cpumask |
6339 | * | 6996 | * |
6340 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6997 | * Given a node, construct a good cpumask for its sched_domain to span. It |
6341 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6998 | * should be one that prevents unnecessary balancing, but also spreads tasks |
6342 | * out optimally. | 6999 | * out optimally. |
6343 | */ | 7000 | */ |
6344 | static cpumask_t sched_domain_node_span(int node) | 7001 | static void sched_domain_node_span(int node, cpumask_t *span) |
6345 | { | 7002 | { |
6346 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 7003 | nodemask_t used_nodes; |
6347 | cpumask_t span, nodemask; | 7004 | node_to_cpumask_ptr(nodemask, node); |
6348 | int i; | 7005 | int i; |
6349 | 7006 | ||
6350 | cpus_clear(span); | 7007 | cpus_clear(*span); |
6351 | bitmap_zero(used_nodes, MAX_NUMNODES); | 7008 | nodes_clear(used_nodes); |
6352 | 7009 | ||
6353 | nodemask = node_to_cpumask(node); | 7010 | cpus_or(*span, *span, *nodemask); |
6354 | cpus_or(span, span, nodemask); | 7011 | node_set(node, used_nodes); |
6355 | set_bit(node, used_nodes); | ||
6356 | 7012 | ||
6357 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7013 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6358 | int next_node = find_next_best_node(node, used_nodes); | 7014 | int next_node = find_next_best_node(node, &used_nodes); |
6359 | 7015 | ||
6360 | nodemask = node_to_cpumask(next_node); | 7016 | node_to_cpumask_ptr_next(nodemask, next_node); |
6361 | cpus_or(span, span, nodemask); | 7017 | cpus_or(*span, *span, *nodemask); |
6362 | } | 7018 | } |
6363 | |||
6364 | return span; | ||
6365 | } | 7019 | } |
6366 | #endif | 7020 | #endif |
6367 | 7021 | ||
@@ -6375,7 +7029,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | |||
6375 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 7029 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6376 | 7030 | ||
6377 | static int | 7031 | static int |
6378 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7032 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7033 | cpumask_t *unused) | ||
6379 | { | 7034 | { |
6380 | if (sg) | 7035 | if (sg) |
6381 | *sg = &per_cpu(sched_group_cpus, cpu); | 7036 | *sg = &per_cpu(sched_group_cpus, cpu); |
@@ -6393,19 +7048,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); | |||
6393 | 7048 | ||
6394 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7049 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6395 | static int | 7050 | static int |
6396 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7051 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7052 | cpumask_t *mask) | ||
6397 | { | 7053 | { |
6398 | int group; | 7054 | int group; |
6399 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7055 | |
6400 | cpus_and(mask, mask, *cpu_map); | 7056 | *mask = per_cpu(cpu_sibling_map, cpu); |
6401 | group = first_cpu(mask); | 7057 | cpus_and(*mask, *mask, *cpu_map); |
7058 | group = first_cpu(*mask); | ||
6402 | if (sg) | 7059 | if (sg) |
6403 | *sg = &per_cpu(sched_group_core, group); | 7060 | *sg = &per_cpu(sched_group_core, group); |
6404 | return group; | 7061 | return group; |
6405 | } | 7062 | } |
6406 | #elif defined(CONFIG_SCHED_MC) | 7063 | #elif defined(CONFIG_SCHED_MC) |
6407 | static int | 7064 | static int |
6408 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7065 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7066 | cpumask_t *unused) | ||
6409 | { | 7067 | { |
6410 | if (sg) | 7068 | if (sg) |
6411 | *sg = &per_cpu(sched_group_core, cpu); | 7069 | *sg = &per_cpu(sched_group_core, cpu); |
@@ -6417,17 +7075,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); | |||
6417 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 7075 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6418 | 7076 | ||
6419 | static int | 7077 | static int |
6420 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 7078 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, |
7079 | cpumask_t *mask) | ||
6421 | { | 7080 | { |
6422 | int group; | 7081 | int group; |
6423 | #ifdef CONFIG_SCHED_MC | 7082 | #ifdef CONFIG_SCHED_MC |
6424 | cpumask_t mask = cpu_coregroup_map(cpu); | 7083 | *mask = cpu_coregroup_map(cpu); |
6425 | cpus_and(mask, mask, *cpu_map); | 7084 | cpus_and(*mask, *mask, *cpu_map); |
6426 | group = first_cpu(mask); | 7085 | group = first_cpu(*mask); |
6427 | #elif defined(CONFIG_SCHED_SMT) | 7086 | #elif defined(CONFIG_SCHED_SMT) |
6428 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 7087 | *mask = per_cpu(cpu_sibling_map, cpu); |
6429 | cpus_and(mask, mask, *cpu_map); | 7088 | cpus_and(*mask, *mask, *cpu_map); |
6430 | group = first_cpu(mask); | 7089 | group = first_cpu(*mask); |
6431 | #else | 7090 | #else |
6432 | group = cpu; | 7091 | group = cpu; |
6433 | #endif | 7092 | #endif |
@@ -6443,19 +7102,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | |||
6443 | * gets dynamically allocated. | 7102 | * gets dynamically allocated. |
6444 | */ | 7103 | */ |
6445 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 7104 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
6446 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 7105 | static struct sched_group ***sched_group_nodes_bycpu; |
6447 | 7106 | ||
6448 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 7107 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6449 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 7108 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6450 | 7109 | ||
6451 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 7110 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6452 | struct sched_group **sg) | 7111 | struct sched_group **sg, cpumask_t *nodemask) |
6453 | { | 7112 | { |
6454 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | ||
6455 | int group; | 7113 | int group; |
6456 | 7114 | ||
6457 | cpus_and(nodemask, nodemask, *cpu_map); | 7115 | *nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6458 | group = first_cpu(nodemask); | 7116 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7117 | group = first_cpu(*nodemask); | ||
6459 | 7118 | ||
6460 | if (sg) | 7119 | if (sg) |
6461 | *sg = &per_cpu(sched_group_allnodes, group); | 7120 | *sg = &per_cpu(sched_group_allnodes, group); |
@@ -6491,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6491 | 7150 | ||
6492 | #ifdef CONFIG_NUMA | 7151 | #ifdef CONFIG_NUMA |
6493 | /* Free memory allocated for various sched_group structures */ | 7152 | /* Free memory allocated for various sched_group structures */ |
6494 | static void free_sched_groups(const cpumask_t *cpu_map) | 7153 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6495 | { | 7154 | { |
6496 | int cpu, i; | 7155 | int cpu, i; |
6497 | 7156 | ||
@@ -6503,11 +7162,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
6503 | continue; | 7162 | continue; |
6504 | 7163 | ||
6505 | for (i = 0; i < MAX_NUMNODES; i++) { | 7164 | for (i = 0; i < MAX_NUMNODES; i++) { |
6506 | cpumask_t nodemask = node_to_cpumask(i); | ||
6507 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7165 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6508 | 7166 | ||
6509 | cpus_and(nodemask, nodemask, *cpu_map); | 7167 | *nodemask = node_to_cpumask(i); |
6510 | if (cpus_empty(nodemask)) | 7168 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7169 | if (cpus_empty(*nodemask)) | ||
6511 | continue; | 7170 | continue; |
6512 | 7171 | ||
6513 | if (sg == NULL) | 7172 | if (sg == NULL) |
@@ -6525,7 +7184,7 @@ next_sg: | |||
6525 | } | 7184 | } |
6526 | } | 7185 | } |
6527 | #else | 7186 | #else |
6528 | static void free_sched_groups(const cpumask_t *cpu_map) | 7187 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6529 | { | 7188 | { |
6530 | } | 7189 | } |
6531 | #endif | 7190 | #endif |
@@ -6583,13 +7242,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6583 | } | 7242 | } |
6584 | 7243 | ||
6585 | /* | 7244 | /* |
7245 | * Initializers for schedule domains | ||
7246 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
7247 | */ | ||
7248 | |||
7249 | #define SD_INIT(sd, type) sd_init_##type(sd) | ||
7250 | #define SD_INIT_FUNC(type) \ | ||
7251 | static noinline void sd_init_##type(struct sched_domain *sd) \ | ||
7252 | { \ | ||
7253 | memset(sd, 0, sizeof(*sd)); \ | ||
7254 | *sd = SD_##type##_INIT; \ | ||
7255 | sd->level = SD_LV_##type; \ | ||
7256 | } | ||
7257 | |||
7258 | SD_INIT_FUNC(CPU) | ||
7259 | #ifdef CONFIG_NUMA | ||
7260 | SD_INIT_FUNC(ALLNODES) | ||
7261 | SD_INIT_FUNC(NODE) | ||
7262 | #endif | ||
7263 | #ifdef CONFIG_SCHED_SMT | ||
7264 | SD_INIT_FUNC(SIBLING) | ||
7265 | #endif | ||
7266 | #ifdef CONFIG_SCHED_MC | ||
7267 | SD_INIT_FUNC(MC) | ||
7268 | #endif | ||
7269 | |||
7270 | /* | ||
7271 | * To minimize stack usage kmalloc room for cpumasks and share the | ||
7272 | * space as the usage in build_sched_domains() dictates. Used only | ||
7273 | * if the amount of space is significant. | ||
7274 | */ | ||
7275 | struct allmasks { | ||
7276 | cpumask_t tmpmask; /* make this one first */ | ||
7277 | union { | ||
7278 | cpumask_t nodemask; | ||
7279 | cpumask_t this_sibling_map; | ||
7280 | cpumask_t this_core_map; | ||
7281 | }; | ||
7282 | cpumask_t send_covered; | ||
7283 | |||
7284 | #ifdef CONFIG_NUMA | ||
7285 | cpumask_t domainspan; | ||
7286 | cpumask_t covered; | ||
7287 | cpumask_t notcovered; | ||
7288 | #endif | ||
7289 | }; | ||
7290 | |||
7291 | #if NR_CPUS > 128 | ||
7292 | #define SCHED_CPUMASK_ALLOC 1 | ||
7293 | #define SCHED_CPUMASK_FREE(v) kfree(v) | ||
7294 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v | ||
7295 | #else | ||
7296 | #define SCHED_CPUMASK_ALLOC 0 | ||
7297 | #define SCHED_CPUMASK_FREE(v) | ||
7298 | #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v | ||
7299 | #endif | ||
7300 | |||
7301 | #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ | ||
7302 | ((unsigned long)(a) + offsetof(struct allmasks, v)) | ||
7303 | |||
7304 | static int default_relax_domain_level = -1; | ||
7305 | |||
7306 | static int __init setup_relax_domain_level(char *str) | ||
7307 | { | ||
7308 | default_relax_domain_level = simple_strtoul(str, NULL, 0); | ||
7309 | return 1; | ||
7310 | } | ||
7311 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
7312 | |||
7313 | static void set_domain_attribute(struct sched_domain *sd, | ||
7314 | struct sched_domain_attr *attr) | ||
7315 | { | ||
7316 | int request; | ||
7317 | |||
7318 | if (!attr || attr->relax_domain_level < 0) { | ||
7319 | if (default_relax_domain_level < 0) | ||
7320 | return; | ||
7321 | else | ||
7322 | request = default_relax_domain_level; | ||
7323 | } else | ||
7324 | request = attr->relax_domain_level; | ||
7325 | if (request < sd->level) { | ||
7326 | /* turn off idle balance on this domain */ | ||
7327 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | ||
7328 | } else { | ||
7329 | /* turn on idle balance on this domain */ | ||
7330 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | ||
7331 | } | ||
7332 | } | ||
7333 | |||
7334 | /* | ||
6586 | * Build sched domains for a given set of cpus and attach the sched domains | 7335 | * Build sched domains for a given set of cpus and attach the sched domains |
6587 | * to the individual cpus | 7336 | * to the individual cpus |
6588 | */ | 7337 | */ |
6589 | static int build_sched_domains(const cpumask_t *cpu_map) | 7338 | static int __build_sched_domains(const cpumask_t *cpu_map, |
7339 | struct sched_domain_attr *attr) | ||
6590 | { | 7340 | { |
6591 | int i; | 7341 | int i; |
6592 | struct root_domain *rd; | 7342 | struct root_domain *rd; |
7343 | SCHED_CPUMASK_DECLARE(allmasks); | ||
7344 | cpumask_t *tmpmask; | ||
6593 | #ifdef CONFIG_NUMA | 7345 | #ifdef CONFIG_NUMA |
6594 | struct sched_group **sched_group_nodes = NULL; | 7346 | struct sched_group **sched_group_nodes = NULL; |
6595 | int sd_allnodes = 0; | 7347 | int sd_allnodes = 0; |
@@ -6603,39 +7355,65 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6603 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7355 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6604 | return -ENOMEM; | 7356 | return -ENOMEM; |
6605 | } | 7357 | } |
6606 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
6607 | #endif | 7358 | #endif |
6608 | 7359 | ||
6609 | rd = alloc_rootdomain(); | 7360 | rd = alloc_rootdomain(); |
6610 | if (!rd) { | 7361 | if (!rd) { |
6611 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7362 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
7363 | #ifdef CONFIG_NUMA | ||
7364 | kfree(sched_group_nodes); | ||
7365 | #endif | ||
7366 | return -ENOMEM; | ||
7367 | } | ||
7368 | |||
7369 | #if SCHED_CPUMASK_ALLOC | ||
7370 | /* get space for all scratch cpumask variables */ | ||
7371 | allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); | ||
7372 | if (!allmasks) { | ||
7373 | printk(KERN_WARNING "Cannot alloc cpumask array\n"); | ||
7374 | kfree(rd); | ||
7375 | #ifdef CONFIG_NUMA | ||
7376 | kfree(sched_group_nodes); | ||
7377 | #endif | ||
6612 | return -ENOMEM; | 7378 | return -ENOMEM; |
6613 | } | 7379 | } |
7380 | #endif | ||
7381 | tmpmask = (cpumask_t *)allmasks; | ||
7382 | |||
7383 | |||
7384 | #ifdef CONFIG_NUMA | ||
7385 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
7386 | #endif | ||
6614 | 7387 | ||
6615 | /* | 7388 | /* |
6616 | * Set up domains for cpus specified by the cpu_map. | 7389 | * Set up domains for cpus specified by the cpu_map. |
6617 | */ | 7390 | */ |
6618 | for_each_cpu_mask(i, *cpu_map) { | 7391 | for_each_cpu_mask(i, *cpu_map) { |
6619 | struct sched_domain *sd = NULL, *p; | 7392 | struct sched_domain *sd = NULL, *p; |
6620 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 7393 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
6621 | 7394 | ||
6622 | cpus_and(nodemask, nodemask, *cpu_map); | 7395 | *nodemask = node_to_cpumask(cpu_to_node(i)); |
7396 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
6623 | 7397 | ||
6624 | #ifdef CONFIG_NUMA | 7398 | #ifdef CONFIG_NUMA |
6625 | if (cpus_weight(*cpu_map) > | 7399 | if (cpus_weight(*cpu_map) > |
6626 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 7400 | SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { |
6627 | sd = &per_cpu(allnodes_domains, i); | 7401 | sd = &per_cpu(allnodes_domains, i); |
6628 | *sd = SD_ALLNODES_INIT; | 7402 | SD_INIT(sd, ALLNODES); |
7403 | set_domain_attribute(sd, attr); | ||
6629 | sd->span = *cpu_map; | 7404 | sd->span = *cpu_map; |
6630 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 7405 | sd->first_cpu = first_cpu(sd->span); |
7406 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
6631 | p = sd; | 7407 | p = sd; |
6632 | sd_allnodes = 1; | 7408 | sd_allnodes = 1; |
6633 | } else | 7409 | } else |
6634 | p = NULL; | 7410 | p = NULL; |
6635 | 7411 | ||
6636 | sd = &per_cpu(node_domains, i); | 7412 | sd = &per_cpu(node_domains, i); |
6637 | *sd = SD_NODE_INIT; | 7413 | SD_INIT(sd, NODE); |
6638 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 7414 | set_domain_attribute(sd, attr); |
7415 | sched_domain_node_span(cpu_to_node(i), &sd->span); | ||
7416 | sd->first_cpu = first_cpu(sd->span); | ||
6639 | sd->parent = p; | 7417 | sd->parent = p; |
6640 | if (p) | 7418 | if (p) |
6641 | p->child = sd; | 7419 | p->child = sd; |
@@ -6644,94 +7422,120 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6644 | 7422 | ||
6645 | p = sd; | 7423 | p = sd; |
6646 | sd = &per_cpu(phys_domains, i); | 7424 | sd = &per_cpu(phys_domains, i); |
6647 | *sd = SD_CPU_INIT; | 7425 | SD_INIT(sd, CPU); |
6648 | sd->span = nodemask; | 7426 | set_domain_attribute(sd, attr); |
7427 | sd->span = *nodemask; | ||
7428 | sd->first_cpu = first_cpu(sd->span); | ||
6649 | sd->parent = p; | 7429 | sd->parent = p; |
6650 | if (p) | 7430 | if (p) |
6651 | p->child = sd; | 7431 | p->child = sd; |
6652 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 7432 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); |
6653 | 7433 | ||
6654 | #ifdef CONFIG_SCHED_MC | 7434 | #ifdef CONFIG_SCHED_MC |
6655 | p = sd; | 7435 | p = sd; |
6656 | sd = &per_cpu(core_domains, i); | 7436 | sd = &per_cpu(core_domains, i); |
6657 | *sd = SD_MC_INIT; | 7437 | SD_INIT(sd, MC); |
7438 | set_domain_attribute(sd, attr); | ||
6658 | sd->span = cpu_coregroup_map(i); | 7439 | sd->span = cpu_coregroup_map(i); |
7440 | sd->first_cpu = first_cpu(sd->span); | ||
6659 | cpus_and(sd->span, sd->span, *cpu_map); | 7441 | cpus_and(sd->span, sd->span, *cpu_map); |
6660 | sd->parent = p; | 7442 | sd->parent = p; |
6661 | p->child = sd; | 7443 | p->child = sd; |
6662 | cpu_to_core_group(i, cpu_map, &sd->groups); | 7444 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); |
6663 | #endif | 7445 | #endif |
6664 | 7446 | ||
6665 | #ifdef CONFIG_SCHED_SMT | 7447 | #ifdef CONFIG_SCHED_SMT |
6666 | p = sd; | 7448 | p = sd; |
6667 | sd = &per_cpu(cpu_domains, i); | 7449 | sd = &per_cpu(cpu_domains, i); |
6668 | *sd = SD_SIBLING_INIT; | 7450 | SD_INIT(sd, SIBLING); |
7451 | set_domain_attribute(sd, attr); | ||
6669 | sd->span = per_cpu(cpu_sibling_map, i); | 7452 | sd->span = per_cpu(cpu_sibling_map, i); |
7453 | sd->first_cpu = first_cpu(sd->span); | ||
6670 | cpus_and(sd->span, sd->span, *cpu_map); | 7454 | cpus_and(sd->span, sd->span, *cpu_map); |
6671 | sd->parent = p; | 7455 | sd->parent = p; |
6672 | p->child = sd; | 7456 | p->child = sd; |
6673 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 7457 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); |
6674 | #endif | 7458 | #endif |
6675 | } | 7459 | } |
6676 | 7460 | ||
6677 | #ifdef CONFIG_SCHED_SMT | 7461 | #ifdef CONFIG_SCHED_SMT |
6678 | /* Set up CPU (sibling) groups */ | 7462 | /* Set up CPU (sibling) groups */ |
6679 | for_each_cpu_mask(i, *cpu_map) { | 7463 | for_each_cpu_mask(i, *cpu_map) { |
6680 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); | 7464 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); |
6681 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 7465 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6682 | if (i != first_cpu(this_sibling_map)) | 7466 | |
7467 | *this_sibling_map = per_cpu(cpu_sibling_map, i); | ||
7468 | cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); | ||
7469 | if (i != first_cpu(*this_sibling_map)) | ||
6683 | continue; | 7470 | continue; |
6684 | 7471 | ||
6685 | init_sched_build_groups(this_sibling_map, cpu_map, | 7472 | init_sched_build_groups(this_sibling_map, cpu_map, |
6686 | &cpu_to_cpu_group); | 7473 | &cpu_to_cpu_group, |
7474 | send_covered, tmpmask); | ||
6687 | } | 7475 | } |
6688 | #endif | 7476 | #endif |
6689 | 7477 | ||
6690 | #ifdef CONFIG_SCHED_MC | 7478 | #ifdef CONFIG_SCHED_MC |
6691 | /* Set up multi-core groups */ | 7479 | /* Set up multi-core groups */ |
6692 | for_each_cpu_mask(i, *cpu_map) { | 7480 | for_each_cpu_mask(i, *cpu_map) { |
6693 | cpumask_t this_core_map = cpu_coregroup_map(i); | 7481 | SCHED_CPUMASK_VAR(this_core_map, allmasks); |
6694 | cpus_and(this_core_map, this_core_map, *cpu_map); | 7482 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6695 | if (i != first_cpu(this_core_map)) | 7483 | |
7484 | *this_core_map = cpu_coregroup_map(i); | ||
7485 | cpus_and(*this_core_map, *this_core_map, *cpu_map); | ||
7486 | if (i != first_cpu(*this_core_map)) | ||
6696 | continue; | 7487 | continue; |
7488 | |||
6697 | init_sched_build_groups(this_core_map, cpu_map, | 7489 | init_sched_build_groups(this_core_map, cpu_map, |
6698 | &cpu_to_core_group); | 7490 | &cpu_to_core_group, |
7491 | send_covered, tmpmask); | ||
6699 | } | 7492 | } |
6700 | #endif | 7493 | #endif |
6701 | 7494 | ||
6702 | /* Set up physical groups */ | 7495 | /* Set up physical groups */ |
6703 | for (i = 0; i < MAX_NUMNODES; i++) { | 7496 | for (i = 0; i < MAX_NUMNODES; i++) { |
6704 | cpumask_t nodemask = node_to_cpumask(i); | 7497 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7498 | SCHED_CPUMASK_VAR(send_covered, allmasks); | ||
6705 | 7499 | ||
6706 | cpus_and(nodemask, nodemask, *cpu_map); | 7500 | *nodemask = node_to_cpumask(i); |
6707 | if (cpus_empty(nodemask)) | 7501 | cpus_and(*nodemask, *nodemask, *cpu_map); |
7502 | if (cpus_empty(*nodemask)) | ||
6708 | continue; | 7503 | continue; |
6709 | 7504 | ||
6710 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 7505 | init_sched_build_groups(nodemask, cpu_map, |
7506 | &cpu_to_phys_group, | ||
7507 | send_covered, tmpmask); | ||
6711 | } | 7508 | } |
6712 | 7509 | ||
6713 | #ifdef CONFIG_NUMA | 7510 | #ifdef CONFIG_NUMA |
6714 | /* Set up node groups */ | 7511 | /* Set up node groups */ |
6715 | if (sd_allnodes) | 7512 | if (sd_allnodes) { |
6716 | init_sched_build_groups(*cpu_map, cpu_map, | 7513 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
6717 | &cpu_to_allnodes_group); | 7514 | |
7515 | init_sched_build_groups(cpu_map, cpu_map, | ||
7516 | &cpu_to_allnodes_group, | ||
7517 | send_covered, tmpmask); | ||
7518 | } | ||
6718 | 7519 | ||
6719 | for (i = 0; i < MAX_NUMNODES; i++) { | 7520 | for (i = 0; i < MAX_NUMNODES; i++) { |
6720 | /* Set up node groups */ | 7521 | /* Set up node groups */ |
6721 | struct sched_group *sg, *prev; | 7522 | struct sched_group *sg, *prev; |
6722 | cpumask_t nodemask = node_to_cpumask(i); | 7523 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
6723 | cpumask_t domainspan; | 7524 | SCHED_CPUMASK_VAR(domainspan, allmasks); |
6724 | cpumask_t covered = CPU_MASK_NONE; | 7525 | SCHED_CPUMASK_VAR(covered, allmasks); |
6725 | int j; | 7526 | int j; |
6726 | 7527 | ||
6727 | cpus_and(nodemask, nodemask, *cpu_map); | 7528 | *nodemask = node_to_cpumask(i); |
6728 | if (cpus_empty(nodemask)) { | 7529 | cpus_clear(*covered); |
7530 | |||
7531 | cpus_and(*nodemask, *nodemask, *cpu_map); | ||
7532 | if (cpus_empty(*nodemask)) { | ||
6729 | sched_group_nodes[i] = NULL; | 7533 | sched_group_nodes[i] = NULL; |
6730 | continue; | 7534 | continue; |
6731 | } | 7535 | } |
6732 | 7536 | ||
6733 | domainspan = sched_domain_node_span(i); | 7537 | sched_domain_node_span(i, domainspan); |
6734 | cpus_and(domainspan, domainspan, *cpu_map); | 7538 | cpus_and(*domainspan, *domainspan, *cpu_map); |
6735 | 7539 | ||
6736 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 7540 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6737 | if (!sg) { | 7541 | if (!sg) { |
@@ -6740,31 +7544,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6740 | goto error; | 7544 | goto error; |
6741 | } | 7545 | } |
6742 | sched_group_nodes[i] = sg; | 7546 | sched_group_nodes[i] = sg; |
6743 | for_each_cpu_mask(j, nodemask) { | 7547 | for_each_cpu_mask(j, *nodemask) { |
6744 | struct sched_domain *sd; | 7548 | struct sched_domain *sd; |
6745 | 7549 | ||
6746 | sd = &per_cpu(node_domains, j); | 7550 | sd = &per_cpu(node_domains, j); |
6747 | sd->groups = sg; | 7551 | sd->groups = sg; |
6748 | } | 7552 | } |
6749 | sg->__cpu_power = 0; | 7553 | sg->__cpu_power = 0; |
6750 | sg->cpumask = nodemask; | 7554 | sg->cpumask = *nodemask; |
6751 | sg->next = sg; | 7555 | sg->next = sg; |
6752 | cpus_or(covered, covered, nodemask); | 7556 | cpus_or(*covered, *covered, *nodemask); |
6753 | prev = sg; | 7557 | prev = sg; |
6754 | 7558 | ||
6755 | for (j = 0; j < MAX_NUMNODES; j++) { | 7559 | for (j = 0; j < MAX_NUMNODES; j++) { |
6756 | cpumask_t tmp, notcovered; | 7560 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
6757 | int n = (i + j) % MAX_NUMNODES; | 7561 | int n = (i + j) % MAX_NUMNODES; |
7562 | node_to_cpumask_ptr(pnodemask, n); | ||
6758 | 7563 | ||
6759 | cpus_complement(notcovered, covered); | 7564 | cpus_complement(*notcovered, *covered); |
6760 | cpus_and(tmp, notcovered, *cpu_map); | 7565 | cpus_and(*tmpmask, *notcovered, *cpu_map); |
6761 | cpus_and(tmp, tmp, domainspan); | 7566 | cpus_and(*tmpmask, *tmpmask, *domainspan); |
6762 | if (cpus_empty(tmp)) | 7567 | if (cpus_empty(*tmpmask)) |
6763 | break; | 7568 | break; |
6764 | 7569 | ||
6765 | nodemask = node_to_cpumask(n); | 7570 | cpus_and(*tmpmask, *tmpmask, *pnodemask); |
6766 | cpus_and(tmp, tmp, nodemask); | 7571 | if (cpus_empty(*tmpmask)) |
6767 | if (cpus_empty(tmp)) | ||
6768 | continue; | 7572 | continue; |
6769 | 7573 | ||
6770 | sg = kmalloc_node(sizeof(struct sched_group), | 7574 | sg = kmalloc_node(sizeof(struct sched_group), |
@@ -6775,9 +7579,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6775 | goto error; | 7579 | goto error; |
6776 | } | 7580 | } |
6777 | sg->__cpu_power = 0; | 7581 | sg->__cpu_power = 0; |
6778 | sg->cpumask = tmp; | 7582 | sg->cpumask = *tmpmask; |
6779 | sg->next = prev->next; | 7583 | sg->next = prev->next; |
6780 | cpus_or(covered, covered, tmp); | 7584 | cpus_or(*covered, *covered, *tmpmask); |
6781 | prev->next = sg; | 7585 | prev->next = sg; |
6782 | prev = sg; | 7586 | prev = sg; |
6783 | } | 7587 | } |
@@ -6813,7 +7617,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6813 | if (sd_allnodes) { | 7617 | if (sd_allnodes) { |
6814 | struct sched_group *sg; | 7618 | struct sched_group *sg; |
6815 | 7619 | ||
6816 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 7620 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, |
7621 | tmpmask); | ||
6817 | init_numa_sched_groups_power(sg); | 7622 | init_numa_sched_groups_power(sg); |
6818 | } | 7623 | } |
6819 | #endif | 7624 | #endif |
@@ -6831,17 +7636,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6831 | cpu_attach_domain(sd, rd, i); | 7636 | cpu_attach_domain(sd, rd, i); |
6832 | } | 7637 | } |
6833 | 7638 | ||
7639 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
6834 | return 0; | 7640 | return 0; |
6835 | 7641 | ||
6836 | #ifdef CONFIG_NUMA | 7642 | #ifdef CONFIG_NUMA |
6837 | error: | 7643 | error: |
6838 | free_sched_groups(cpu_map); | 7644 | free_sched_groups(cpu_map, tmpmask); |
7645 | SCHED_CPUMASK_FREE((void *)allmasks); | ||
6839 | return -ENOMEM; | 7646 | return -ENOMEM; |
6840 | #endif | 7647 | #endif |
6841 | } | 7648 | } |
6842 | 7649 | ||
7650 | static int build_sched_domains(const cpumask_t *cpu_map) | ||
7651 | { | ||
7652 | return __build_sched_domains(cpu_map, NULL); | ||
7653 | } | ||
7654 | |||
6843 | static cpumask_t *doms_cur; /* current sched domains */ | 7655 | static cpumask_t *doms_cur; /* current sched domains */ |
6844 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7656 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
7657 | static struct sched_domain_attr *dattr_cur; /* attributes of custom domains | ||
7658 | in 'doms_cur' */ | ||
6845 | 7659 | ||
6846 | /* | 7660 | /* |
6847 | * Special case: If a kmalloc of a doms_cur partition (array of | 7661 | * Special case: If a kmalloc of a doms_cur partition (array of |
@@ -6869,15 +7683,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
6869 | if (!doms_cur) | 7683 | if (!doms_cur) |
6870 | doms_cur = &fallback_doms; | 7684 | doms_cur = &fallback_doms; |
6871 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); | 7685 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
7686 | dattr_cur = NULL; | ||
6872 | err = build_sched_domains(doms_cur); | 7687 | err = build_sched_domains(doms_cur); |
6873 | register_sched_domain_sysctl(); | 7688 | register_sched_domain_sysctl(); |
6874 | 7689 | ||
6875 | return err; | 7690 | return err; |
6876 | } | 7691 | } |
6877 | 7692 | ||
6878 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 7693 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map, |
7694 | cpumask_t *tmpmask) | ||
6879 | { | 7695 | { |
6880 | free_sched_groups(cpu_map); | 7696 | free_sched_groups(cpu_map, tmpmask); |
6881 | } | 7697 | } |
6882 | 7698 | ||
6883 | /* | 7699 | /* |
@@ -6886,6 +7702,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | |||
6886 | */ | 7702 | */ |
6887 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 7703 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
6888 | { | 7704 | { |
7705 | cpumask_t tmpmask; | ||
6889 | int i; | 7706 | int i; |
6890 | 7707 | ||
6891 | unregister_sched_domain_sysctl(); | 7708 | unregister_sched_domain_sysctl(); |
@@ -6893,7 +7710,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6893 | for_each_cpu_mask(i, *cpu_map) | 7710 | for_each_cpu_mask(i, *cpu_map) |
6894 | cpu_attach_domain(NULL, &def_root_domain, i); | 7711 | cpu_attach_domain(NULL, &def_root_domain, i); |
6895 | synchronize_sched(); | 7712 | synchronize_sched(); |
6896 | arch_destroy_sched_domains(cpu_map); | 7713 | arch_destroy_sched_domains(cpu_map, &tmpmask); |
7714 | } | ||
7715 | |||
7716 | /* handle null as "default" */ | ||
7717 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7718 | struct sched_domain_attr *new, int idx_new) | ||
7719 | { | ||
7720 | struct sched_domain_attr tmp; | ||
7721 | |||
7722 | /* fast path */ | ||
7723 | if (!new && !cur) | ||
7724 | return 1; | ||
7725 | |||
7726 | tmp = SD_ATTR_INIT; | ||
7727 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7728 | new ? (new + idx_new) : &tmp, | ||
7729 | sizeof(struct sched_domain_attr)); | ||
6897 | } | 7730 | } |
6898 | 7731 | ||
6899 | /* | 7732 | /* |
@@ -6917,7 +7750,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6917 | * | 7750 | * |
6918 | * Call with hotplug lock held | 7751 | * Call with hotplug lock held |
6919 | */ | 7752 | */ |
6920 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | 7753 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7754 | struct sched_domain_attr *dattr_new) | ||
6921 | { | 7755 | { |
6922 | int i, j; | 7756 | int i, j; |
6923 | 7757 | ||
@@ -6930,12 +7764,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6930 | ndoms_new = 1; | 7764 | ndoms_new = 1; |
6931 | doms_new = &fallback_doms; | 7765 | doms_new = &fallback_doms; |
6932 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7766 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7767 | dattr_new = NULL; | ||
6933 | } | 7768 | } |
6934 | 7769 | ||
6935 | /* Destroy deleted domains */ | 7770 | /* Destroy deleted domains */ |
6936 | for (i = 0; i < ndoms_cur; i++) { | 7771 | for (i = 0; i < ndoms_cur; i++) { |
6937 | for (j = 0; j < ndoms_new; j++) { | 7772 | for (j = 0; j < ndoms_new; j++) { |
6938 | if (cpus_equal(doms_cur[i], doms_new[j])) | 7773 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7774 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
6939 | goto match1; | 7775 | goto match1; |
6940 | } | 7776 | } |
6941 | /* no match - a current sched domain not in new doms_new[] */ | 7777 | /* no match - a current sched domain not in new doms_new[] */ |
@@ -6947,11 +7783,13 @@ match1: | |||
6947 | /* Build new domains */ | 7783 | /* Build new domains */ |
6948 | for (i = 0; i < ndoms_new; i++) { | 7784 | for (i = 0; i < ndoms_new; i++) { |
6949 | for (j = 0; j < ndoms_cur; j++) { | 7785 | for (j = 0; j < ndoms_cur; j++) { |
6950 | if (cpus_equal(doms_new[i], doms_cur[j])) | 7786 | if (cpus_equal(doms_new[i], doms_cur[j]) |
7787 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
6951 | goto match2; | 7788 | goto match2; |
6952 | } | 7789 | } |
6953 | /* no match - add a new doms_new */ | 7790 | /* no match - add a new doms_new */ |
6954 | build_sched_domains(doms_new + i); | 7791 | __build_sched_domains(doms_new + i, |
7792 | dattr_new ? dattr_new + i : NULL); | ||
6955 | match2: | 7793 | match2: |
6956 | ; | 7794 | ; |
6957 | } | 7795 | } |
@@ -6959,7 +7797,9 @@ match2: | |||
6959 | /* Remember the new sched domains */ | 7797 | /* Remember the new sched domains */ |
6960 | if (doms_cur != &fallback_doms) | 7798 | if (doms_cur != &fallback_doms) |
6961 | kfree(doms_cur); | 7799 | kfree(doms_cur); |
7800 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
6962 | doms_cur = doms_new; | 7801 | doms_cur = doms_new; |
7802 | dattr_cur = dattr_new; | ||
6963 | ndoms_cur = ndoms_new; | 7803 | ndoms_cur = ndoms_new; |
6964 | 7804 | ||
6965 | register_sched_domain_sysctl(); | 7805 | register_sched_domain_sysctl(); |
@@ -7086,6 +7926,11 @@ void __init sched_init_smp(void) | |||
7086 | { | 7926 | { |
7087 | cpumask_t non_isolated_cpus; | 7927 | cpumask_t non_isolated_cpus; |
7088 | 7928 | ||
7929 | #if defined(CONFIG_NUMA) | ||
7930 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7931 | GFP_KERNEL); | ||
7932 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7933 | #endif | ||
7089 | get_online_cpus(); | 7934 | get_online_cpus(); |
7090 | arch_init_sched_domains(&cpu_online_map); | 7935 | arch_init_sched_domains(&cpu_online_map); |
7091 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7936 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
@@ -7096,7 +7941,7 @@ void __init sched_init_smp(void) | |||
7096 | hotcpu_notifier(update_sched_domains, 0); | 7941 | hotcpu_notifier(update_sched_domains, 0); |
7097 | 7942 | ||
7098 | /* Move init over to a non-isolated CPU */ | 7943 | /* Move init over to a non-isolated CPU */ |
7099 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7944 | if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) |
7100 | BUG(); | 7945 | BUG(); |
7101 | sched_init_granularity(); | 7946 | sched_init_granularity(); |
7102 | } | 7947 | } |
@@ -7117,6 +7962,7 @@ int in_sched_functions(unsigned long addr) | |||
7117 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 7962 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
7118 | { | 7963 | { |
7119 | cfs_rq->tasks_timeline = RB_ROOT; | 7964 | cfs_rq->tasks_timeline = RB_ROOT; |
7965 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
7120 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7966 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7121 | cfs_rq->rq = rq; | 7967 | cfs_rq->rq = rq; |
7122 | #endif | 7968 | #endif |
@@ -7146,6 +7992,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7146 | 7992 | ||
7147 | rt_rq->rt_time = 0; | 7993 | rt_rq->rt_time = 0; |
7148 | rt_rq->rt_throttled = 0; | 7994 | rt_rq->rt_throttled = 0; |
7995 | rt_rq->rt_runtime = 0; | ||
7996 | spin_lock_init(&rt_rq->rt_runtime_lock); | ||
7149 | 7997 | ||
7150 | #ifdef CONFIG_RT_GROUP_SCHED | 7998 | #ifdef CONFIG_RT_GROUP_SCHED |
7151 | rt_rq->rt_nr_boosted = 0; | 7999 | rt_rq->rt_nr_boosted = 0; |
@@ -7154,10 +8002,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7154 | } | 8002 | } |
7155 | 8003 | ||
7156 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8004 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7157 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | 8005 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7158 | struct cfs_rq *cfs_rq, struct sched_entity *se, | 8006 | struct sched_entity *se, int cpu, int add, |
7159 | int cpu, int add) | 8007 | struct sched_entity *parent) |
7160 | { | 8008 | { |
8009 | struct rq *rq = cpu_rq(cpu); | ||
7161 | tg->cfs_rq[cpu] = cfs_rq; | 8010 | tg->cfs_rq[cpu] = cfs_rq; |
7162 | init_cfs_rq(cfs_rq, rq); | 8011 | init_cfs_rq(cfs_rq, rq); |
7163 | cfs_rq->tg = tg; | 8012 | cfs_rq->tg = tg; |
@@ -7165,45 +8014,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
7165 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 8014 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7166 | 8015 | ||
7167 | tg->se[cpu] = se; | 8016 | tg->se[cpu] = se; |
7168 | se->cfs_rq = &rq->cfs; | 8017 | /* se could be NULL for init_task_group */ |
8018 | if (!se) | ||
8019 | return; | ||
8020 | |||
8021 | if (!parent) | ||
8022 | se->cfs_rq = &rq->cfs; | ||
8023 | else | ||
8024 | se->cfs_rq = parent->my_q; | ||
8025 | |||
7169 | se->my_q = cfs_rq; | 8026 | se->my_q = cfs_rq; |
7170 | se->load.weight = tg->shares; | 8027 | se->load.weight = tg->shares; |
7171 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | 8028 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); |
7172 | se->parent = NULL; | 8029 | se->parent = parent; |
7173 | } | 8030 | } |
7174 | #endif | 8031 | #endif |
7175 | 8032 | ||
7176 | #ifdef CONFIG_RT_GROUP_SCHED | 8033 | #ifdef CONFIG_RT_GROUP_SCHED |
7177 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | 8034 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7178 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | 8035 | struct sched_rt_entity *rt_se, int cpu, int add, |
7179 | int cpu, int add) | 8036 | struct sched_rt_entity *parent) |
7180 | { | 8037 | { |
8038 | struct rq *rq = cpu_rq(cpu); | ||
8039 | |||
7181 | tg->rt_rq[cpu] = rt_rq; | 8040 | tg->rt_rq[cpu] = rt_rq; |
7182 | init_rt_rq(rt_rq, rq); | 8041 | init_rt_rq(rt_rq, rq); |
7183 | rt_rq->tg = tg; | 8042 | rt_rq->tg = tg; |
7184 | rt_rq->rt_se = rt_se; | 8043 | rt_rq->rt_se = rt_se; |
8044 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
7185 | if (add) | 8045 | if (add) |
7186 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 8046 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7187 | 8047 | ||
7188 | tg->rt_se[cpu] = rt_se; | 8048 | tg->rt_se[cpu] = rt_se; |
8049 | if (!rt_se) | ||
8050 | return; | ||
8051 | |||
8052 | if (!parent) | ||
8053 | rt_se->rt_rq = &rq->rt; | ||
8054 | else | ||
8055 | rt_se->rt_rq = parent->my_q; | ||
8056 | |||
7189 | rt_se->rt_rq = &rq->rt; | 8057 | rt_se->rt_rq = &rq->rt; |
7190 | rt_se->my_q = rt_rq; | 8058 | rt_se->my_q = rt_rq; |
7191 | rt_se->parent = NULL; | 8059 | rt_se->parent = parent; |
7192 | INIT_LIST_HEAD(&rt_se->run_list); | 8060 | INIT_LIST_HEAD(&rt_se->run_list); |
7193 | } | 8061 | } |
7194 | #endif | 8062 | #endif |
7195 | 8063 | ||
7196 | void __init sched_init(void) | 8064 | void __init sched_init(void) |
7197 | { | 8065 | { |
7198 | int highest_cpu = 0; | ||
7199 | int i, j; | 8066 | int i, j; |
8067 | unsigned long alloc_size = 0, ptr; | ||
8068 | |||
8069 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8070 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
8071 | #endif | ||
8072 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8073 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | ||
8074 | #endif | ||
8075 | #ifdef CONFIG_USER_SCHED | ||
8076 | alloc_size *= 2; | ||
8077 | #endif | ||
8078 | /* | ||
8079 | * As sched_init() is called before page_alloc is setup, | ||
8080 | * we use alloc_bootmem(). | ||
8081 | */ | ||
8082 | if (alloc_size) { | ||
8083 | ptr = (unsigned long)alloc_bootmem(alloc_size); | ||
8084 | |||
8085 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8086 | init_task_group.se = (struct sched_entity **)ptr; | ||
8087 | ptr += nr_cpu_ids * sizeof(void **); | ||
8088 | |||
8089 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
8090 | ptr += nr_cpu_ids * sizeof(void **); | ||
8091 | |||
8092 | #ifdef CONFIG_USER_SCHED | ||
8093 | root_task_group.se = (struct sched_entity **)ptr; | ||
8094 | ptr += nr_cpu_ids * sizeof(void **); | ||
8095 | |||
8096 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | ||
8097 | ptr += nr_cpu_ids * sizeof(void **); | ||
8098 | #endif | ||
8099 | #endif | ||
8100 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8101 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
8102 | ptr += nr_cpu_ids * sizeof(void **); | ||
8103 | |||
8104 | init_task_group.rt_rq = (struct rt_rq **)ptr; | ||
8105 | ptr += nr_cpu_ids * sizeof(void **); | ||
8106 | |||
8107 | #ifdef CONFIG_USER_SCHED | ||
8108 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; | ||
8109 | ptr += nr_cpu_ids * sizeof(void **); | ||
8110 | |||
8111 | root_task_group.rt_rq = (struct rt_rq **)ptr; | ||
8112 | ptr += nr_cpu_ids * sizeof(void **); | ||
8113 | #endif | ||
8114 | #endif | ||
8115 | } | ||
7200 | 8116 | ||
7201 | #ifdef CONFIG_SMP | 8117 | #ifdef CONFIG_SMP |
8118 | init_aggregate(); | ||
7202 | init_defrootdomain(); | 8119 | init_defrootdomain(); |
7203 | #endif | 8120 | #endif |
7204 | 8121 | ||
8122 | init_rt_bandwidth(&def_rt_bandwidth, | ||
8123 | global_rt_period(), global_rt_runtime()); | ||
8124 | |||
8125 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8126 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | ||
8127 | global_rt_period(), global_rt_runtime()); | ||
8128 | #ifdef CONFIG_USER_SCHED | ||
8129 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | ||
8130 | global_rt_period(), RUNTIME_INF); | ||
8131 | #endif | ||
8132 | #endif | ||
8133 | |||
7205 | #ifdef CONFIG_GROUP_SCHED | 8134 | #ifdef CONFIG_GROUP_SCHED |
7206 | list_add(&init_task_group.list, &task_groups); | 8135 | list_add(&init_task_group.list, &task_groups); |
8136 | INIT_LIST_HEAD(&init_task_group.children); | ||
8137 | |||
8138 | #ifdef CONFIG_USER_SCHED | ||
8139 | INIT_LIST_HEAD(&root_task_group.children); | ||
8140 | init_task_group.parent = &root_task_group; | ||
8141 | list_add(&init_task_group.siblings, &root_task_group.children); | ||
8142 | #endif | ||
7207 | #endif | 8143 | #endif |
7208 | 8144 | ||
7209 | for_each_possible_cpu(i) { | 8145 | for_each_possible_cpu(i) { |
@@ -7214,26 +8150,68 @@ void __init sched_init(void) | |||
7214 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 8150 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
7215 | rq->nr_running = 0; | 8151 | rq->nr_running = 0; |
7216 | rq->clock = 1; | 8152 | rq->clock = 1; |
8153 | update_last_tick_seen(rq); | ||
7217 | init_cfs_rq(&rq->cfs, rq); | 8154 | init_cfs_rq(&rq->cfs, rq); |
7218 | init_rt_rq(&rq->rt, rq); | 8155 | init_rt_rq(&rq->rt, rq); |
7219 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8156 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7220 | init_task_group.shares = init_task_group_load; | 8157 | init_task_group.shares = init_task_group_load; |
7221 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8158 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7222 | init_tg_cfs_entry(rq, &init_task_group, | 8159 | #ifdef CONFIG_CGROUP_SCHED |
8160 | /* | ||
8161 | * How much cpu bandwidth does init_task_group get? | ||
8162 | * | ||
8163 | * In case of task-groups formed through the cgroup filesystem, it | ||
8164 | * gets 100% of the cpu resources in the system. This overall | ||
8165 | * system cpu resource is divided among the tasks of | ||
8166 | * init_task_group and its child task-groups in a fair manner, | ||
8167 | * based on each entity's (task or task-group's) weight | ||
8168 | * (se->load.weight). | ||
8169 | * | ||
8170 | * In other words, if init_task_group has 10 tasks (of weight | ||
8171 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
8172 | * then A0's share of the cpu resource is: | ||
8173 | * | ||
8174 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
8175 | * | ||
8176 | * We achieve this by letting init_task_group's tasks sit | ||
8177 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
8178 | */ | ||
8179 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | ||
8180 | #elif defined CONFIG_USER_SCHED | ||
8181 | root_task_group.shares = NICE_0_LOAD; | ||
8182 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); | ||
8183 | /* | ||
8184 | * In case of task-groups formed through the user id of tasks, | ||
8185 | * init_task_group represents tasks belonging to root user. | ||
8186 | * Hence it forms a sibling of all subsequent groups formed. | ||
8187 | * In this case, init_task_group gets only a fraction of overall | ||
8188 | * system cpu resource, based on the weight assigned to root | ||
8189 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
8190 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
8191 | * (init_cfs_rq) and having one entity represent this group of | ||
8192 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
8193 | */ | ||
8194 | init_tg_cfs_entry(&init_task_group, | ||
7223 | &per_cpu(init_cfs_rq, i), | 8195 | &per_cpu(init_cfs_rq, i), |
7224 | &per_cpu(init_sched_entity, i), i, 1); | 8196 | &per_cpu(init_sched_entity, i), i, 1, |
8197 | root_task_group.se[i]); | ||
7225 | 8198 | ||
7226 | #endif | 8199 | #endif |
8200 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
8201 | |||
8202 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
7227 | #ifdef CONFIG_RT_GROUP_SCHED | 8203 | #ifdef CONFIG_RT_GROUP_SCHED |
7228 | init_task_group.rt_runtime = | ||
7229 | sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
7230 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8204 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7231 | init_tg_rt_entry(rq, &init_task_group, | 8205 | #ifdef CONFIG_CGROUP_SCHED |
8206 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
8207 | #elif defined CONFIG_USER_SCHED | ||
8208 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); | ||
8209 | init_tg_rt_entry(&init_task_group, | ||
7232 | &per_cpu(init_rt_rq, i), | 8210 | &per_cpu(init_rt_rq, i), |
7233 | &per_cpu(init_sched_rt_entity, i), i, 1); | 8211 | &per_cpu(init_sched_rt_entity, i), i, 1, |
8212 | root_task_group.rt_se[i]); | ||
8213 | #endif | ||
7234 | #endif | 8214 | #endif |
7235 | rq->rt_period_expire = 0; | ||
7236 | rq->rt_throttled = 0; | ||
7237 | 8215 | ||
7238 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8216 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7239 | rq->cpu_load[j] = 0; | 8217 | rq->cpu_load[j] = 0; |
@@ -7250,7 +8228,6 @@ void __init sched_init(void) | |||
7250 | #endif | 8228 | #endif |
7251 | init_rq_hrtick(rq); | 8229 | init_rq_hrtick(rq); |
7252 | atomic_set(&rq->nr_iowait, 0); | 8230 | atomic_set(&rq->nr_iowait, 0); |
7253 | highest_cpu = i; | ||
7254 | } | 8231 | } |
7255 | 8232 | ||
7256 | set_load_weight(&init_task); | 8233 | set_load_weight(&init_task); |
@@ -7260,7 +8237,6 @@ void __init sched_init(void) | |||
7260 | #endif | 8237 | #endif |
7261 | 8238 | ||
7262 | #ifdef CONFIG_SMP | 8239 | #ifdef CONFIG_SMP |
7263 | nr_cpu_ids = highest_cpu + 1; | ||
7264 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8240 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
7265 | #endif | 8241 | #endif |
7266 | 8242 | ||
@@ -7419,8 +8395,6 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
7419 | 8395 | ||
7420 | #endif | 8396 | #endif |
7421 | 8397 | ||
7422 | #ifdef CONFIG_GROUP_SCHED | ||
7423 | |||
7424 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8398 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7425 | static void free_fair_sched_group(struct task_group *tg) | 8399 | static void free_fair_sched_group(struct task_group *tg) |
7426 | { | 8400 | { |
@@ -7437,17 +8411,18 @@ static void free_fair_sched_group(struct task_group *tg) | |||
7437 | kfree(tg->se); | 8411 | kfree(tg->se); |
7438 | } | 8412 | } |
7439 | 8413 | ||
7440 | static int alloc_fair_sched_group(struct task_group *tg) | 8414 | static |
8415 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
7441 | { | 8416 | { |
7442 | struct cfs_rq *cfs_rq; | 8417 | struct cfs_rq *cfs_rq; |
7443 | struct sched_entity *se; | 8418 | struct sched_entity *se, *parent_se; |
7444 | struct rq *rq; | 8419 | struct rq *rq; |
7445 | int i; | 8420 | int i; |
7446 | 8421 | ||
7447 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | 8422 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
7448 | if (!tg->cfs_rq) | 8423 | if (!tg->cfs_rq) |
7449 | goto err; | 8424 | goto err; |
7450 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 8425 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); |
7451 | if (!tg->se) | 8426 | if (!tg->se) |
7452 | goto err; | 8427 | goto err; |
7453 | 8428 | ||
@@ -7466,7 +8441,8 @@ static int alloc_fair_sched_group(struct task_group *tg) | |||
7466 | if (!se) | 8441 | if (!se) |
7467 | goto err; | 8442 | goto err; |
7468 | 8443 | ||
7469 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); | 8444 | parent_se = parent ? parent->se[i] : NULL; |
8445 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); | ||
7470 | } | 8446 | } |
7471 | 8447 | ||
7472 | return 1; | 8448 | return 1; |
@@ -7490,7 +8466,8 @@ static inline void free_fair_sched_group(struct task_group *tg) | |||
7490 | { | 8466 | { |
7491 | } | 8467 | } |
7492 | 8468 | ||
7493 | static inline int alloc_fair_sched_group(struct task_group *tg) | 8469 | static inline |
8470 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
7494 | { | 8471 | { |
7495 | return 1; | 8472 | return 1; |
7496 | } | 8473 | } |
@@ -7509,6 +8486,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
7509 | { | 8486 | { |
7510 | int i; | 8487 | int i; |
7511 | 8488 | ||
8489 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8490 | |||
7512 | for_each_possible_cpu(i) { | 8491 | for_each_possible_cpu(i) { |
7513 | if (tg->rt_rq) | 8492 | if (tg->rt_rq) |
7514 | kfree(tg->rt_rq[i]); | 8493 | kfree(tg->rt_rq[i]); |
@@ -7520,21 +8499,23 @@ static void free_rt_sched_group(struct task_group *tg) | |||
7520 | kfree(tg->rt_se); | 8499 | kfree(tg->rt_se); |
7521 | } | 8500 | } |
7522 | 8501 | ||
7523 | static int alloc_rt_sched_group(struct task_group *tg) | 8502 | static |
8503 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
7524 | { | 8504 | { |
7525 | struct rt_rq *rt_rq; | 8505 | struct rt_rq *rt_rq; |
7526 | struct sched_rt_entity *rt_se; | 8506 | struct sched_rt_entity *rt_se, *parent_se; |
7527 | struct rq *rq; | 8507 | struct rq *rq; |
7528 | int i; | 8508 | int i; |
7529 | 8509 | ||
7530 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | 8510 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
7531 | if (!tg->rt_rq) | 8511 | if (!tg->rt_rq) |
7532 | goto err; | 8512 | goto err; |
7533 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | 8513 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); |
7534 | if (!tg->rt_se) | 8514 | if (!tg->rt_se) |
7535 | goto err; | 8515 | goto err; |
7536 | 8516 | ||
7537 | tg->rt_runtime = 0; | 8517 | init_rt_bandwidth(&tg->rt_bandwidth, |
8518 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
7538 | 8519 | ||
7539 | for_each_possible_cpu(i) { | 8520 | for_each_possible_cpu(i) { |
7540 | rq = cpu_rq(i); | 8521 | rq = cpu_rq(i); |
@@ -7549,7 +8530,8 @@ static int alloc_rt_sched_group(struct task_group *tg) | |||
7549 | if (!rt_se) | 8530 | if (!rt_se) |
7550 | goto err; | 8531 | goto err; |
7551 | 8532 | ||
7552 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); | 8533 | parent_se = parent ? parent->rt_se[i] : NULL; |
8534 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); | ||
7553 | } | 8535 | } |
7554 | 8536 | ||
7555 | return 1; | 8537 | return 1; |
@@ -7573,7 +8555,8 @@ static inline void free_rt_sched_group(struct task_group *tg) | |||
7573 | { | 8555 | { |
7574 | } | 8556 | } |
7575 | 8557 | ||
7576 | static inline int alloc_rt_sched_group(struct task_group *tg) | 8558 | static inline |
8559 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
7577 | { | 8560 | { |
7578 | return 1; | 8561 | return 1; |
7579 | } | 8562 | } |
@@ -7587,6 +8570,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
7587 | } | 8570 | } |
7588 | #endif | 8571 | #endif |
7589 | 8572 | ||
8573 | #ifdef CONFIG_GROUP_SCHED | ||
7590 | static void free_sched_group(struct task_group *tg) | 8574 | static void free_sched_group(struct task_group *tg) |
7591 | { | 8575 | { |
7592 | free_fair_sched_group(tg); | 8576 | free_fair_sched_group(tg); |
@@ -7595,7 +8579,7 @@ static void free_sched_group(struct task_group *tg) | |||
7595 | } | 8579 | } |
7596 | 8580 | ||
7597 | /* allocate runqueue etc for a new task group */ | 8581 | /* allocate runqueue etc for a new task group */ |
7598 | struct task_group *sched_create_group(void) | 8582 | struct task_group *sched_create_group(struct task_group *parent) |
7599 | { | 8583 | { |
7600 | struct task_group *tg; | 8584 | struct task_group *tg; |
7601 | unsigned long flags; | 8585 | unsigned long flags; |
@@ -7605,10 +8589,10 @@ struct task_group *sched_create_group(void) | |||
7605 | if (!tg) | 8589 | if (!tg) |
7606 | return ERR_PTR(-ENOMEM); | 8590 | return ERR_PTR(-ENOMEM); |
7607 | 8591 | ||
7608 | if (!alloc_fair_sched_group(tg)) | 8592 | if (!alloc_fair_sched_group(tg, parent)) |
7609 | goto err; | 8593 | goto err; |
7610 | 8594 | ||
7611 | if (!alloc_rt_sched_group(tg)) | 8595 | if (!alloc_rt_sched_group(tg, parent)) |
7612 | goto err; | 8596 | goto err; |
7613 | 8597 | ||
7614 | spin_lock_irqsave(&task_group_lock, flags); | 8598 | spin_lock_irqsave(&task_group_lock, flags); |
@@ -7617,6 +8601,12 @@ struct task_group *sched_create_group(void) | |||
7617 | register_rt_sched_group(tg, i); | 8601 | register_rt_sched_group(tg, i); |
7618 | } | 8602 | } |
7619 | list_add_rcu(&tg->list, &task_groups); | 8603 | list_add_rcu(&tg->list, &task_groups); |
8604 | |||
8605 | WARN_ON(!parent); /* root should already exist */ | ||
8606 | |||
8607 | tg->parent = parent; | ||
8608 | list_add_rcu(&tg->siblings, &parent->children); | ||
8609 | INIT_LIST_HEAD(&tg->children); | ||
7620 | spin_unlock_irqrestore(&task_group_lock, flags); | 8610 | spin_unlock_irqrestore(&task_group_lock, flags); |
7621 | 8611 | ||
7622 | return tg; | 8612 | return tg; |
@@ -7645,6 +8635,7 @@ void sched_destroy_group(struct task_group *tg) | |||
7645 | unregister_rt_sched_group(tg, i); | 8635 | unregister_rt_sched_group(tg, i); |
7646 | } | 8636 | } |
7647 | list_del_rcu(&tg->list); | 8637 | list_del_rcu(&tg->list); |
8638 | list_del_rcu(&tg->siblings); | ||
7648 | spin_unlock_irqrestore(&task_group_lock, flags); | 8639 | spin_unlock_irqrestore(&task_group_lock, flags); |
7649 | 8640 | ||
7650 | /* wait for possible concurrent references to cfs_rqs complete */ | 8641 | /* wait for possible concurrent references to cfs_rqs complete */ |
@@ -7688,16 +8679,14 @@ void sched_move_task(struct task_struct *tsk) | |||
7688 | 8679 | ||
7689 | task_rq_unlock(rq, &flags); | 8680 | task_rq_unlock(rq, &flags); |
7690 | } | 8681 | } |
8682 | #endif | ||
7691 | 8683 | ||
7692 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8684 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7693 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8685 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
7694 | { | 8686 | { |
7695 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8687 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7696 | struct rq *rq = cfs_rq->rq; | ||
7697 | int on_rq; | 8688 | int on_rq; |
7698 | 8689 | ||
7699 | spin_lock_irq(&rq->lock); | ||
7700 | |||
7701 | on_rq = se->on_rq; | 8690 | on_rq = se->on_rq; |
7702 | if (on_rq) | 8691 | if (on_rq) |
7703 | dequeue_entity(cfs_rq, se, 0); | 8692 | dequeue_entity(cfs_rq, se, 0); |
@@ -7707,8 +8696,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
7707 | 8696 | ||
7708 | if (on_rq) | 8697 | if (on_rq) |
7709 | enqueue_entity(cfs_rq, se, 0); | 8698 | enqueue_entity(cfs_rq, se, 0); |
8699 | } | ||
7710 | 8700 | ||
7711 | spin_unlock_irq(&rq->lock); | 8701 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
8702 | { | ||
8703 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8704 | struct rq *rq = cfs_rq->rq; | ||
8705 | unsigned long flags; | ||
8706 | |||
8707 | spin_lock_irqsave(&rq->lock, flags); | ||
8708 | __set_se_shares(se, shares); | ||
8709 | spin_unlock_irqrestore(&rq->lock, flags); | ||
7712 | } | 8710 | } |
7713 | 8711 | ||
7714 | static DEFINE_MUTEX(shares_mutex); | 8712 | static DEFINE_MUTEX(shares_mutex); |
@@ -7719,12 +8717,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7719 | unsigned long flags; | 8717 | unsigned long flags; |
7720 | 8718 | ||
7721 | /* | 8719 | /* |
8720 | * We can't change the weight of the root cgroup. | ||
8721 | */ | ||
8722 | if (!tg->se[0]) | ||
8723 | return -EINVAL; | ||
8724 | |||
8725 | /* | ||
7722 | * A weight of 0 or 1 can cause arithmetics problems. | 8726 | * A weight of 0 or 1 can cause arithmetics problems. |
7723 | * (The default weight is 1024 - so there's no practical | 8727 | * (The default weight is 1024 - so there's no practical |
7724 | * limitation from this.) | 8728 | * limitation from this.) |
7725 | */ | 8729 | */ |
7726 | if (shares < 2) | 8730 | if (shares < MIN_SHARES) |
7727 | shares = 2; | 8731 | shares = MIN_SHARES; |
7728 | 8732 | ||
7729 | mutex_lock(&shares_mutex); | 8733 | mutex_lock(&shares_mutex); |
7730 | if (tg->shares == shares) | 8734 | if (tg->shares == shares) |
@@ -7733,6 +8737,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7733 | spin_lock_irqsave(&task_group_lock, flags); | 8737 | spin_lock_irqsave(&task_group_lock, flags); |
7734 | for_each_possible_cpu(i) | 8738 | for_each_possible_cpu(i) |
7735 | unregister_fair_sched_group(tg, i); | 8739 | unregister_fair_sched_group(tg, i); |
8740 | list_del_rcu(&tg->siblings); | ||
7736 | spin_unlock_irqrestore(&task_group_lock, flags); | 8741 | spin_unlock_irqrestore(&task_group_lock, flags); |
7737 | 8742 | ||
7738 | /* wait for any ongoing reference to this group to finish */ | 8743 | /* wait for any ongoing reference to this group to finish */ |
@@ -7743,8 +8748,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7743 | * w/o tripping rebalance_share or load_balance_fair. | 8748 | * w/o tripping rebalance_share or load_balance_fair. |
7744 | */ | 8749 | */ |
7745 | tg->shares = shares; | 8750 | tg->shares = shares; |
7746 | for_each_possible_cpu(i) | 8751 | for_each_possible_cpu(i) { |
7747 | set_se_shares(tg->se[i], shares); | 8752 | /* |
8753 | * force a rebalance | ||
8754 | */ | ||
8755 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8756 | set_se_shares(tg->se[i], shares/nr_cpu_ids); | ||
8757 | } | ||
7748 | 8758 | ||
7749 | /* | 8759 | /* |
7750 | * Enable load balance activity on this group, by inserting it back on | 8760 | * Enable load balance activity on this group, by inserting it back on |
@@ -7753,6 +8763,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
7753 | spin_lock_irqsave(&task_group_lock, flags); | 8763 | spin_lock_irqsave(&task_group_lock, flags); |
7754 | for_each_possible_cpu(i) | 8764 | for_each_possible_cpu(i) |
7755 | register_fair_sched_group(tg, i); | 8765 | register_fair_sched_group(tg, i); |
8766 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
7756 | spin_unlock_irqrestore(&task_group_lock, flags); | 8767 | spin_unlock_irqrestore(&task_group_lock, flags); |
7757 | done: | 8768 | done: |
7758 | mutex_unlock(&shares_mutex); | 8769 | mutex_unlock(&shares_mutex); |
@@ -7779,26 +8790,58 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
7779 | return div64_64(runtime << 16, period); | 8790 | return div64_64(runtime << 16, period); |
7780 | } | 8791 | } |
7781 | 8792 | ||
8793 | #ifdef CONFIG_CGROUP_SCHED | ||
8794 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
8795 | { | ||
8796 | struct task_group *tgi, *parent = tg->parent; | ||
8797 | unsigned long total = 0; | ||
8798 | |||
8799 | if (!parent) { | ||
8800 | if (global_rt_period() < period) | ||
8801 | return 0; | ||
8802 | |||
8803 | return to_ratio(period, runtime) < | ||
8804 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
8805 | } | ||
8806 | |||
8807 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | ||
8808 | return 0; | ||
8809 | |||
8810 | rcu_read_lock(); | ||
8811 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | ||
8812 | if (tgi == tg) | ||
8813 | continue; | ||
8814 | |||
8815 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | ||
8816 | tgi->rt_bandwidth.rt_runtime); | ||
8817 | } | ||
8818 | rcu_read_unlock(); | ||
8819 | |||
8820 | return total + to_ratio(period, runtime) < | ||
8821 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | ||
8822 | parent->rt_bandwidth.rt_runtime); | ||
8823 | } | ||
8824 | #elif defined CONFIG_USER_SCHED | ||
7782 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8825 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
7783 | { | 8826 | { |
7784 | struct task_group *tgi; | 8827 | struct task_group *tgi; |
7785 | unsigned long total = 0; | 8828 | unsigned long total = 0; |
7786 | unsigned long global_ratio = | 8829 | unsigned long global_ratio = |
7787 | to_ratio(sysctl_sched_rt_period, | 8830 | to_ratio(global_rt_period(), global_rt_runtime()); |
7788 | sysctl_sched_rt_runtime < 0 ? | ||
7789 | RUNTIME_INF : sysctl_sched_rt_runtime); | ||
7790 | 8831 | ||
7791 | rcu_read_lock(); | 8832 | rcu_read_lock(); |
7792 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8833 | list_for_each_entry_rcu(tgi, &task_groups, list) { |
7793 | if (tgi == tg) | 8834 | if (tgi == tg) |
7794 | continue; | 8835 | continue; |
7795 | 8836 | ||
7796 | total += to_ratio(period, tgi->rt_runtime); | 8837 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), |
8838 | tgi->rt_bandwidth.rt_runtime); | ||
7797 | } | 8839 | } |
7798 | rcu_read_unlock(); | 8840 | rcu_read_unlock(); |
7799 | 8841 | ||
7800 | return total + to_ratio(period, runtime) < global_ratio; | 8842 | return total + to_ratio(period, runtime) < global_ratio; |
7801 | } | 8843 | } |
8844 | #endif | ||
7802 | 8845 | ||
7803 | /* Must be called with tasklist_lock held */ | 8846 | /* Must be called with tasklist_lock held */ |
7804 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8847 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -7811,19 +8854,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
7811 | return 0; | 8854 | return 0; |
7812 | } | 8855 | } |
7813 | 8856 | ||
7814 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | 8857 | static int tg_set_bandwidth(struct task_group *tg, |
8858 | u64 rt_period, u64 rt_runtime) | ||
7815 | { | 8859 | { |
7816 | u64 rt_runtime, rt_period; | 8860 | int i, err = 0; |
7817 | int err = 0; | ||
7818 | |||
7819 | rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
7820 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
7821 | if (rt_runtime_us == -1) | ||
7822 | rt_runtime = RUNTIME_INF; | ||
7823 | 8861 | ||
7824 | mutex_lock(&rt_constraints_mutex); | 8862 | mutex_lock(&rt_constraints_mutex); |
7825 | read_lock(&tasklist_lock); | 8863 | read_lock(&tasklist_lock); |
7826 | if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { | 8864 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { |
7827 | err = -EBUSY; | 8865 | err = -EBUSY; |
7828 | goto unlock; | 8866 | goto unlock; |
7829 | } | 8867 | } |
@@ -7831,7 +8869,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7831 | err = -EINVAL; | 8869 | err = -EINVAL; |
7832 | goto unlock; | 8870 | goto unlock; |
7833 | } | 8871 | } |
7834 | tg->rt_runtime = rt_runtime; | 8872 | |
8873 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
8874 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
8875 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
8876 | |||
8877 | for_each_possible_cpu(i) { | ||
8878 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
8879 | |||
8880 | spin_lock(&rt_rq->rt_runtime_lock); | ||
8881 | rt_rq->rt_runtime = rt_runtime; | ||
8882 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
8883 | } | ||
8884 | spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
7835 | unlock: | 8885 | unlock: |
7836 | read_unlock(&tasklist_lock); | 8886 | read_unlock(&tasklist_lock); |
7837 | mutex_unlock(&rt_constraints_mutex); | 8887 | mutex_unlock(&rt_constraints_mutex); |
@@ -7839,19 +8889,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
7839 | return err; | 8889 | return err; |
7840 | } | 8890 | } |
7841 | 8891 | ||
8892 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
8893 | { | ||
8894 | u64 rt_runtime, rt_period; | ||
8895 | |||
8896 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8897 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
8898 | if (rt_runtime_us < 0) | ||
8899 | rt_runtime = RUNTIME_INF; | ||
8900 | |||
8901 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
8902 | } | ||
8903 | |||
7842 | long sched_group_rt_runtime(struct task_group *tg) | 8904 | long sched_group_rt_runtime(struct task_group *tg) |
7843 | { | 8905 | { |
7844 | u64 rt_runtime_us; | 8906 | u64 rt_runtime_us; |
7845 | 8907 | ||
7846 | if (tg->rt_runtime == RUNTIME_INF) | 8908 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
7847 | return -1; | 8909 | return -1; |
7848 | 8910 | ||
7849 | rt_runtime_us = tg->rt_runtime; | 8911 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
7850 | do_div(rt_runtime_us, NSEC_PER_USEC); | 8912 | do_div(rt_runtime_us, NSEC_PER_USEC); |
7851 | return rt_runtime_us; | 8913 | return rt_runtime_us; |
7852 | } | 8914 | } |
8915 | |||
8916 | int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | ||
8917 | { | ||
8918 | u64 rt_runtime, rt_period; | ||
8919 | |||
8920 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | ||
8921 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8922 | |||
8923 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | ||
8924 | } | ||
8925 | |||
8926 | long sched_group_rt_period(struct task_group *tg) | ||
8927 | { | ||
8928 | u64 rt_period_us; | ||
8929 | |||
8930 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8931 | do_div(rt_period_us, NSEC_PER_USEC); | ||
8932 | return rt_period_us; | ||
8933 | } | ||
8934 | |||
8935 | static int sched_rt_global_constraints(void) | ||
8936 | { | ||
8937 | int ret = 0; | ||
8938 | |||
8939 | mutex_lock(&rt_constraints_mutex); | ||
8940 | if (!__rt_schedulable(NULL, 1, 0)) | ||
8941 | ret = -EINVAL; | ||
8942 | mutex_unlock(&rt_constraints_mutex); | ||
8943 | |||
8944 | return ret; | ||
8945 | } | ||
8946 | #else | ||
8947 | static int sched_rt_global_constraints(void) | ||
8948 | { | ||
8949 | unsigned long flags; | ||
8950 | int i; | ||
8951 | |||
8952 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
8953 | for_each_possible_cpu(i) { | ||
8954 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
8955 | |||
8956 | spin_lock(&rt_rq->rt_runtime_lock); | ||
8957 | rt_rq->rt_runtime = global_rt_runtime(); | ||
8958 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
8959 | } | ||
8960 | spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
8961 | |||
8962 | return 0; | ||
8963 | } | ||
7853 | #endif | 8964 | #endif |
7854 | #endif /* CONFIG_GROUP_SCHED */ | 8965 | |
8966 | int sched_rt_handler(struct ctl_table *table, int write, | ||
8967 | struct file *filp, void __user *buffer, size_t *lenp, | ||
8968 | loff_t *ppos) | ||
8969 | { | ||
8970 | int ret; | ||
8971 | int old_period, old_runtime; | ||
8972 | static DEFINE_MUTEX(mutex); | ||
8973 | |||
8974 | mutex_lock(&mutex); | ||
8975 | old_period = sysctl_sched_rt_period; | ||
8976 | old_runtime = sysctl_sched_rt_runtime; | ||
8977 | |||
8978 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
8979 | |||
8980 | if (!ret && write) { | ||
8981 | ret = sched_rt_global_constraints(); | ||
8982 | if (ret) { | ||
8983 | sysctl_sched_rt_period = old_period; | ||
8984 | sysctl_sched_rt_runtime = old_runtime; | ||
8985 | } else { | ||
8986 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
8987 | def_rt_bandwidth.rt_period = | ||
8988 | ns_to_ktime(global_rt_period()); | ||
8989 | } | ||
8990 | } | ||
8991 | mutex_unlock(&mutex); | ||
8992 | |||
8993 | return ret; | ||
8994 | } | ||
7855 | 8995 | ||
7856 | #ifdef CONFIG_CGROUP_SCHED | 8996 | #ifdef CONFIG_CGROUP_SCHED |
7857 | 8997 | ||
@@ -7865,7 +9005,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7865 | static struct cgroup_subsys_state * | 9005 | static struct cgroup_subsys_state * |
7866 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9006 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
7867 | { | 9007 | { |
7868 | struct task_group *tg; | 9008 | struct task_group *tg, *parent; |
7869 | 9009 | ||
7870 | if (!cgrp->parent) { | 9010 | if (!cgrp->parent) { |
7871 | /* This is early initialization for the top cgroup */ | 9011 | /* This is early initialization for the top cgroup */ |
@@ -7873,11 +9013,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
7873 | return &init_task_group.css; | 9013 | return &init_task_group.css; |
7874 | } | 9014 | } |
7875 | 9015 | ||
7876 | /* we support only 1-level deep hierarchical scheduler atm */ | 9016 | parent = cgroup_tg(cgrp->parent); |
7877 | if (cgrp->parent->parent) | 9017 | tg = sched_create_group(parent); |
7878 | return ERR_PTR(-EINVAL); | ||
7879 | |||
7880 | tg = sched_create_group(); | ||
7881 | if (IS_ERR(tg)) | 9018 | if (IS_ERR(tg)) |
7882 | return ERR_PTR(-ENOMEM); | 9019 | return ERR_PTR(-ENOMEM); |
7883 | 9020 | ||
@@ -7901,7 +9038,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7901 | { | 9038 | { |
7902 | #ifdef CONFIG_RT_GROUP_SCHED | 9039 | #ifdef CONFIG_RT_GROUP_SCHED |
7903 | /* Don't accept realtime tasks when there is no way for them to run */ | 9040 | /* Don't accept realtime tasks when there is no way for them to run */ |
7904 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) | 9041 | if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) |
7905 | return -EINVAL; | 9042 | return -EINVAL; |
7906 | #else | 9043 | #else |
7907 | /* We don't support RT-tasks being in separate groups */ | 9044 | /* We don't support RT-tasks being in separate groups */ |
@@ -7935,7 +9072,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7935 | #endif | 9072 | #endif |
7936 | 9073 | ||
7937 | #ifdef CONFIG_RT_GROUP_SCHED | 9074 | #ifdef CONFIG_RT_GROUP_SCHED |
7938 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 9075 | static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
7939 | struct file *file, | 9076 | struct file *file, |
7940 | const char __user *userbuf, | 9077 | const char __user *userbuf, |
7941 | size_t nbytes, loff_t *unused_ppos) | 9078 | size_t nbytes, loff_t *unused_ppos) |
@@ -7979,6 +9116,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, | |||
7979 | 9116 | ||
7980 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 9117 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
7981 | } | 9118 | } |
9119 | |||
9120 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
9121 | u64 rt_period_us) | ||
9122 | { | ||
9123 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | ||
9124 | } | ||
9125 | |||
9126 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
9127 | { | ||
9128 | return sched_group_rt_period(cgroup_tg(cgrp)); | ||
9129 | } | ||
7982 | #endif | 9130 | #endif |
7983 | 9131 | ||
7984 | static struct cftype cpu_files[] = { | 9132 | static struct cftype cpu_files[] = { |
@@ -7995,6 +9143,11 @@ static struct cftype cpu_files[] = { | |||
7995 | .read = cpu_rt_runtime_read, | 9143 | .read = cpu_rt_runtime_read, |
7996 | .write = cpu_rt_runtime_write, | 9144 | .write = cpu_rt_runtime_write, |
7997 | }, | 9145 | }, |
9146 | { | ||
9147 | .name = "rt_period_us", | ||
9148 | .read_uint = cpu_rt_period_read_uint, | ||
9149 | .write_uint = cpu_rt_period_write_uint, | ||
9150 | }, | ||
7998 | #endif | 9151 | #endif |
7999 | }; | 9152 | }; |
8000 | 9153 | ||
@@ -8035,9 +9188,9 @@ struct cpuacct { | |||
8035 | struct cgroup_subsys cpuacct_subsys; | 9188 | struct cgroup_subsys cpuacct_subsys; |
8036 | 9189 | ||
8037 | /* return cpu accounting group corresponding to this container */ | 9190 | /* return cpu accounting group corresponding to this container */ |
8038 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | 9191 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
8039 | { | 9192 | { |
8040 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | 9193 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), |
8041 | struct cpuacct, css); | 9194 | struct cpuacct, css); |
8042 | } | 9195 | } |
8043 | 9196 | ||
@@ -8050,7 +9203,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
8050 | 9203 | ||
8051 | /* create a new cpu accounting group */ | 9204 | /* create a new cpu accounting group */ |
8052 | static struct cgroup_subsys_state *cpuacct_create( | 9205 | static struct cgroup_subsys_state *cpuacct_create( |
8053 | struct cgroup_subsys *ss, struct cgroup *cont) | 9206 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
8054 | { | 9207 | { |
8055 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 9208 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
8056 | 9209 | ||
@@ -8068,18 +9221,18 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
8068 | 9221 | ||
8069 | /* destroy an existing cpu accounting group */ | 9222 | /* destroy an existing cpu accounting group */ |
8070 | static void | 9223 | static void |
8071 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 9224 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8072 | { | 9225 | { |
8073 | struct cpuacct *ca = cgroup_ca(cont); | 9226 | struct cpuacct *ca = cgroup_ca(cgrp); |
8074 | 9227 | ||
8075 | free_percpu(ca->cpuusage); | 9228 | free_percpu(ca->cpuusage); |
8076 | kfree(ca); | 9229 | kfree(ca); |
8077 | } | 9230 | } |
8078 | 9231 | ||
8079 | /* return total cpu usage (in nanoseconds) of a group */ | 9232 | /* return total cpu usage (in nanoseconds) of a group */ |
8080 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | 9233 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) |
8081 | { | 9234 | { |
8082 | struct cpuacct *ca = cgroup_ca(cont); | 9235 | struct cpuacct *ca = cgroup_ca(cgrp); |
8083 | u64 totalcpuusage = 0; | 9236 | u64 totalcpuusage = 0; |
8084 | int i; | 9237 | int i; |
8085 | 9238 | ||
@@ -8098,16 +9251,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | |||
8098 | return totalcpuusage; | 9251 | return totalcpuusage; |
8099 | } | 9252 | } |
8100 | 9253 | ||
9254 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
9255 | u64 reset) | ||
9256 | { | ||
9257 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
9258 | int err = 0; | ||
9259 | int i; | ||
9260 | |||
9261 | if (reset) { | ||
9262 | err = -EINVAL; | ||
9263 | goto out; | ||
9264 | } | ||
9265 | |||
9266 | for_each_possible_cpu(i) { | ||
9267 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | ||
9268 | |||
9269 | spin_lock_irq(&cpu_rq(i)->lock); | ||
9270 | *cpuusage = 0; | ||
9271 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
9272 | } | ||
9273 | out: | ||
9274 | return err; | ||
9275 | } | ||
9276 | |||
8101 | static struct cftype files[] = { | 9277 | static struct cftype files[] = { |
8102 | { | 9278 | { |
8103 | .name = "usage", | 9279 | .name = "usage", |
8104 | .read_uint = cpuusage_read, | 9280 | .read_uint = cpuusage_read, |
9281 | .write_uint = cpuusage_write, | ||
8105 | }, | 9282 | }, |
8106 | }; | 9283 | }; |
8107 | 9284 | ||
8108 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 9285 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
8109 | { | 9286 | { |
8110 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 9287 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); |
8111 | } | 9288 | } |
8112 | 9289 | ||
8113 | /* | 9290 | /* |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba07683..f3f4af4b8b0f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
67 | (long long)(p->nvcsw + p->nivcsw), | 67 | (long long)(p->nvcsw + p->nivcsw), |
68 | p->prio); | 68 | p->prio); |
69 | #ifdef CONFIG_SCHEDSTATS | 69 | #ifdef CONFIG_SCHEDSTATS |
70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", | 70 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
71 | SPLIT_NS(p->se.vruntime), | 71 | SPLIT_NS(p->se.vruntime), |
72 | SPLIT_NS(p->se.sum_exec_runtime), | 72 | SPLIT_NS(p->se.sum_exec_runtime), |
73 | SPLIT_NS(p->se.sum_sleep_runtime)); | 73 | SPLIT_NS(p->se.sum_sleep_runtime)); |
74 | #else | 74 | #else |
75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", | 75 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 76 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
77 | #endif | 77 | #endif |
78 | |||
79 | #ifdef CONFIG_CGROUP_SCHED | ||
80 | { | ||
81 | char path[64]; | ||
82 | |||
83 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
84 | SEQ_printf(m, " %s", path); | ||
85 | } | ||
86 | #endif | ||
87 | SEQ_printf(m, "\n"); | ||
78 | } | 88 | } |
79 | 89 | ||
80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 90 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
@@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
109 | struct sched_entity *last; | 119 | struct sched_entity *last; |
110 | unsigned long flags; | 120 | unsigned long flags; |
111 | 121 | ||
112 | SEQ_printf(m, "\ncfs_rq\n"); | 122 | #if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) |
123 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | ||
124 | #else | ||
125 | char path[128] = ""; | ||
126 | struct cgroup *cgroup = NULL; | ||
127 | struct task_group *tg = cfs_rq->tg; | ||
128 | |||
129 | if (tg) | ||
130 | cgroup = tg->css.cgroup; | ||
131 | |||
132 | if (cgroup) | ||
133 | cgroup_path(cgroup, path, sizeof(path)); | ||
134 | |||
135 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
136 | #endif | ||
113 | 137 | ||
114 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 138 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
115 | SPLIT_NS(cfs_rq->exec_clock)); | 139 | SPLIT_NS(cfs_rq->exec_clock)); |
@@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
143 | #endif | 167 | #endif |
144 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", | 168 | SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", |
145 | cfs_rq->nr_spread_over); | 169 | cfs_rq->nr_spread_over); |
170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
171 | #ifdef CONFIG_SMP | ||
172 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | ||
173 | #endif | ||
174 | #endif | ||
146 | } | 175 | } |
147 | 176 | ||
148 | static void print_cpu(struct seq_file *m, int cpu) | 177 | static void print_cpu(struct seq_file *m, int cpu) |
@@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
214 | PN(sysctl_sched_latency); | 243 | PN(sysctl_sched_latency); |
215 | PN(sysctl_sched_min_granularity); | 244 | PN(sysctl_sched_min_granularity); |
216 | PN(sysctl_sched_wakeup_granularity); | 245 | PN(sysctl_sched_wakeup_granularity); |
217 | PN(sysctl_sched_batch_wakeup_granularity); | ||
218 | PN(sysctl_sched_child_runs_first); | 246 | PN(sysctl_sched_child_runs_first); |
219 | P(sysctl_sched_features); | 247 | P(sysctl_sched_features); |
220 | #undef PN | 248 | #undef PN |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86a93376282c..89fa32b4edf2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; | |||
62 | unsigned int __read_mostly sysctl_sched_compat_yield; | 62 | unsigned int __read_mostly sysctl_sched_compat_yield; |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * SCHED_BATCH wake-up granularity. | ||
66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
67 | * | ||
68 | * This option delays the preemption effects of decoupled workloads | ||
69 | * and reduces their over-scheduling. Synchronous workloads will still | ||
70 | * have immediate wakeup/sleep latencies. | ||
71 | */ | ||
72 | unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | ||
73 | |||
74 | /* | ||
75 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
76 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
77 | * | 67 | * |
78 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
79 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
80 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
81 | */ | 71 | */ |
82 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
83 | 73 | ||
84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
85 | 75 | ||
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
87 | * CFS operations on generic schedulable entities: | 77 | * CFS operations on generic schedulable entities: |
88 | */ | 78 | */ |
89 | 79 | ||
80 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
81 | { | ||
82 | return container_of(se, struct task_struct, se); | ||
83 | } | ||
84 | |||
90 | #ifdef CONFIG_FAIR_GROUP_SCHED | 85 | #ifdef CONFIG_FAIR_GROUP_SCHED |
91 | 86 | ||
92 | /* cpu runqueue to which this cfs_rq is attached */ | 87 | /* cpu runqueue to which this cfs_rq is attached */ |
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
98 | /* An entity is a task if it doesn't "own" a runqueue */ | 93 | /* An entity is a task if it doesn't "own" a runqueue */ |
99 | #define entity_is_task(se) (!se->my_q) | 94 | #define entity_is_task(se) (!se->my_q) |
100 | 95 | ||
96 | /* Walk up scheduling entities hierarchy */ | ||
97 | #define for_each_sched_entity(se) \ | ||
98 | for (; se; se = se->parent) | ||
99 | |||
100 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
101 | { | ||
102 | return p->se.cfs_rq; | ||
103 | } | ||
104 | |||
105 | /* runqueue on which this entity is (to be) queued */ | ||
106 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
107 | { | ||
108 | return se->cfs_rq; | ||
109 | } | ||
110 | |||
111 | /* runqueue "owned" by this group */ | ||
112 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
113 | { | ||
114 | return grp->my_q; | ||
115 | } | ||
116 | |||
117 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
118 | * another cpu ('this_cpu') | ||
119 | */ | ||
120 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
121 | { | ||
122 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
123 | } | ||
124 | |||
125 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
126 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
127 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
128 | |||
129 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
130 | static inline int | ||
131 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
132 | { | ||
133 | if (se->cfs_rq == pse->cfs_rq) | ||
134 | return 1; | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
140 | { | ||
141 | return se->parent; | ||
142 | } | ||
143 | |||
101 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 144 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
102 | 145 | ||
103 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 146 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
107 | 150 | ||
108 | #define entity_is_task(se) 1 | 151 | #define entity_is_task(se) 1 |
109 | 152 | ||
110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 153 | #define for_each_sched_entity(se) \ |
154 | for (; se; se = NULL) | ||
111 | 155 | ||
112 | static inline struct task_struct *task_of(struct sched_entity *se) | 156 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
113 | { | 157 | { |
114 | return container_of(se, struct task_struct, se); | 158 | return &task_rq(p)->cfs; |
159 | } | ||
160 | |||
161 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
162 | { | ||
163 | struct task_struct *p = task_of(se); | ||
164 | struct rq *rq = task_rq(p); | ||
165 | |||
166 | return &rq->cfs; | ||
115 | } | 167 | } |
116 | 168 | ||
169 | /* runqueue "owned" by this group */ | ||
170 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
171 | { | ||
172 | return NULL; | ||
173 | } | ||
174 | |||
175 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
176 | { | ||
177 | return &cpu_rq(this_cpu)->cfs; | ||
178 | } | ||
179 | |||
180 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
181 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
182 | |||
183 | static inline int | ||
184 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
185 | { | ||
186 | return 1; | ||
187 | } | ||
188 | |||
189 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
190 | { | ||
191 | return NULL; | ||
192 | } | ||
193 | |||
194 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
195 | |||
117 | 196 | ||
118 | /************************************************************** | 197 | /************************************************************** |
119 | * Scheduling class tree data structure manipulation methods: | 198 | * Scheduling class tree data structure manipulation methods: |
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
255 | #endif | 334 | #endif |
256 | 335 | ||
257 | /* | 336 | /* |
337 | * delta *= w / rw | ||
338 | */ | ||
339 | static inline unsigned long | ||
340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
341 | { | ||
342 | for_each_sched_entity(se) { | ||
343 | delta = calc_delta_mine(delta, | ||
344 | se->load.weight, &cfs_rq_of(se)->load); | ||
345 | } | ||
346 | |||
347 | return delta; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * delta *= rw / w | ||
352 | */ | ||
353 | static inline unsigned long | ||
354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
355 | { | ||
356 | for_each_sched_entity(se) { | ||
357 | delta = calc_delta_mine(delta, | ||
358 | cfs_rq_of(se)->load.weight, &se->load); | ||
359 | } | ||
360 | |||
361 | return delta; | ||
362 | } | ||
363 | |||
364 | /* | ||
258 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
259 | * | 366 | * |
260 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running) | |||
283 | */ | 390 | */ |
284 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
285 | { | 392 | { |
286 | return calc_delta_mine(__sched_period(cfs_rq->nr_running), | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
287 | se->load.weight, &cfs_rq->load); | ||
288 | } | 394 | } |
289 | 395 | ||
290 | /* | 396 | /* |
291 | * We calculate the vruntime slice. | 397 | * We calculate the vruntime slice of a to be inserted task |
292 | * | 398 | * |
293 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
294 | */ | 400 | */ |
295 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
296 | { | 402 | { |
297 | u64 vslice = __sched_period(nr_running); | 403 | unsigned long nr_running = cfs_rq->nr_running; |
298 | 404 | ||
299 | vslice *= NICE_0_LOAD; | 405 | if (!se->on_rq) |
300 | do_div(vslice, rq_weight); | 406 | nr_running++; |
301 | 407 | ||
302 | return vslice; | 408 | return __sched_period(nr_running); |
303 | } | 409 | } |
304 | 410 | ||
305 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 411 | /* |
412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
413 | * that it favours >=0 over <0. | ||
414 | * | ||
415 | * -20 | | ||
416 | * | | ||
417 | * 0 --------+------- | ||
418 | * .' | ||
419 | * 19 .' | ||
420 | * | ||
421 | */ | ||
422 | static unsigned long | ||
423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
306 | { | 424 | { |
307 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, | 425 | struct load_weight lw = { |
308 | cfs_rq->nr_running + 1); | 426 | .weight = NICE_0_LOAD, |
427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
428 | }; | ||
429 | |||
430 | for_each_sched_entity(se) { | ||
431 | struct load_weight *se_lw = &se->load; | ||
432 | |||
433 | if (se->load.weight < NICE_0_LOAD) | ||
434 | se_lw = &lw; | ||
435 | |||
436 | delta = calc_delta_mine(delta, | ||
437 | cfs_rq_of(se)->load.weight, se_lw); | ||
438 | } | ||
439 | |||
440 | return delta; | ||
309 | } | 441 | } |
310 | 442 | ||
311 | /* | 443 | /* |
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
322 | 454 | ||
323 | curr->sum_exec_runtime += delta_exec; | 455 | curr->sum_exec_runtime += delta_exec; |
324 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 456 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
325 | delta_exec_weighted = delta_exec; | 457 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
326 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
327 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
328 | &curr->load); | ||
329 | } | ||
330 | curr->vruntime += delta_exec_weighted; | 458 | curr->vruntime += delta_exec_weighted; |
331 | } | 459 | } |
332 | 460 | ||
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
413 | * Scheduling class queueing methods: | 541 | * Scheduling class queueing methods: |
414 | */ | 542 | */ |
415 | 543 | ||
/*
 * Adjust the per-cfs_rq sum of task weights by @weight. The field is only
 * maintained when both SMP and fair group scheduling are configured;
 * otherwise this is a no-op. The dequeue path passes a negated weight, so
 * the addition relies on unsigned wrap-around to subtract.
 */
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight = cfs_rq->task_weight + weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	/* task_weight is not tracked in this configuration */
}
#endif
556 | |||
416 | static void | 557 | static void |
417 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 558 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
418 | { | 559 | { |
419 | update_load_add(&cfs_rq->load, se->load.weight); | 560 | update_load_add(&cfs_rq->load, se->load.weight); |
561 | if (!parent_entity(se)) | ||
562 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
563 | if (entity_is_task(se)) | ||
564 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
420 | cfs_rq->nr_running++; | 565 | cfs_rq->nr_running++; |
421 | se->on_rq = 1; | 566 | se->on_rq = 1; |
567 | list_add(&se->group_node, &cfs_rq->tasks); | ||
422 | } | 568 | } |
423 | 569 | ||
424 | static void | 570 | static void |
425 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 571 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
426 | { | 572 | { |
427 | update_load_sub(&cfs_rq->load, se->load.weight); | 573 | update_load_sub(&cfs_rq->load, se->load.weight); |
574 | if (!parent_entity(se)) | ||
575 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
576 | if (entity_is_task(se)) | ||
577 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
428 | cfs_rq->nr_running--; | 578 | cfs_rq->nr_running--; |
429 | se->on_rq = 0; | 579 | se->on_rq = 0; |
580 | list_del_init(&se->group_node); | ||
430 | } | 581 | } |
431 | 582 | ||
432 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 583 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -511,8 +662,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
511 | if (!initial) { | 662 | if (!initial) { |
512 | /* sleeps upto a single latency don't count. */ | 663 | /* sleeps upto a single latency don't count. */ |
513 | if (sched_feat(NEW_FAIR_SLEEPERS)) { | 664 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
514 | vruntime -= calc_delta_fair(sysctl_sched_latency, | 665 | if (sched_feat(NORMALIZED_SLEEPER)) |
515 | &cfs_rq->load); | 666 | vruntime -= calc_delta_weight(sysctl_sched_latency, se); |
667 | else | ||
668 | vruntime -= sysctl_sched_latency; | ||
516 | } | 669 | } |
517 | 670 | ||
518 | /* ensure we never gain time by being placed backwards. */ | 671 | /* ensure we never gain time by being placed backwards. */ |
@@ -629,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
629 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 782 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
630 | } | 783 | } |
631 | 784 | ||
785 | static int | ||
786 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
787 | |||
632 | static struct sched_entity * | 788 | static struct sched_entity * |
633 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 789 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
634 | { | 790 | { |
635 | s64 diff, gran; | ||
636 | |||
637 | if (!cfs_rq->next) | 791 | if (!cfs_rq->next) |
638 | return se; | 792 | return se; |
639 | 793 | ||
640 | diff = cfs_rq->next->vruntime - se->vruntime; | 794 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) |
641 | if (diff < 0) | ||
642 | return se; | ||
643 | |||
644 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); | ||
645 | if (diff > gran) | ||
646 | return se; | 795 | return se; |
647 | 796 | ||
648 | return cfs_rq->next; | 797 | return cfs_rq->next; |
@@ -710,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
710 | * CFS operations on tasks: | 859 | * CFS operations on tasks: |
711 | */ | 860 | */ |
712 | 861 | ||
713 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
714 | |||
715 | /* Walk up scheduling entities hierarchy */ | ||
716 | #define for_each_sched_entity(se) \ | ||
717 | for (; se; se = se->parent) | ||
718 | |||
719 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
720 | { | ||
721 | return p->se.cfs_rq; | ||
722 | } | ||
723 | |||
724 | /* runqueue on which this entity is (to be) queued */ | ||
725 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
726 | { | ||
727 | return se->cfs_rq; | ||
728 | } | ||
729 | |||
730 | /* runqueue "owned" by this group */ | ||
731 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
732 | { | ||
733 | return grp->my_q; | ||
734 | } | ||
735 | |||
736 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
737 | * another cpu ('this_cpu') | ||
738 | */ | ||
739 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
740 | { | ||
741 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
742 | } | ||
743 | |||
744 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
745 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
746 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
747 | |||
748 | /* Do the two (enqueued) entities belong to the same group ? */ | ||
749 | static inline int | ||
750 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
751 | { | ||
752 | if (se->cfs_rq == pse->cfs_rq) | ||
753 | return 1; | ||
754 | |||
755 | return 0; | ||
756 | } | ||
757 | |||
758 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
759 | { | ||
760 | return se->parent; | ||
761 | } | ||
762 | |||
763 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
764 | |||
765 | #define for_each_sched_entity(se) \ | ||
766 | for (; se; se = NULL) | ||
767 | |||
768 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
769 | { | ||
770 | return &task_rq(p)->cfs; | ||
771 | } | ||
772 | |||
773 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
774 | { | ||
775 | struct task_struct *p = task_of(se); | ||
776 | struct rq *rq = task_rq(p); | ||
777 | |||
778 | return &rq->cfs; | ||
779 | } | ||
780 | |||
781 | /* runqueue "owned" by this group */ | ||
782 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
783 | { | ||
784 | return NULL; | ||
785 | } | ||
786 | |||
787 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
788 | { | ||
789 | return &cpu_rq(this_cpu)->cfs; | ||
790 | } | ||
791 | |||
792 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
793 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
794 | |||
795 | static inline int | ||
796 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
797 | { | ||
798 | return 1; | ||
799 | } | ||
800 | |||
801 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
802 | { | ||
803 | return NULL; | ||
804 | } | ||
805 | |||
806 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
807 | |||
808 | #ifdef CONFIG_SCHED_HRTICK | 862 | #ifdef CONFIG_SCHED_HRTICK |
809 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | 863 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) |
810 | { | 864 | { |
@@ -918,7 +972,7 @@ static void yield_task_fair(struct rq *rq) | |||
918 | /* | 972 | /* |
919 | * Already in the rightmost position? | 973 | * Already in the rightmost position? |
920 | */ | 974 | */ |
921 | if (unlikely(rightmost->vruntime < se->vruntime)) | 975 | if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) |
922 | return; | 976 | return; |
923 | 977 | ||
924 | /* | 978 | /* |
@@ -957,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
957 | return cpu; | 1011 | return cpu; |
958 | 1012 | ||
959 | for_each_domain(cpu, sd) { | 1013 | for_each_domain(cpu, sd) { |
960 | if (sd->flags & SD_WAKE_IDLE) { | 1014 | if ((sd->flags & SD_WAKE_IDLE) |
1015 | || ((sd->flags & SD_WAKE_IDLE_FAR) | ||
1016 | && !task_hot(p, task_rq(p)->clock, sd))) { | ||
961 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1017 | cpus_and(tmp, sd->span, p->cpus_allowed); |
962 | for_each_cpu_mask(i, tmp) { | 1018 | for_each_cpu_mask(i, tmp) { |
963 | if (idle_cpu(i)) { | 1019 | if (idle_cpu(i)) { |
@@ -1101,6 +1157,58 @@ out: | |||
1101 | } | 1157 | } |
1102 | #endif /* CONFIG_SMP */ | 1158 | #endif /* CONFIG_SMP */ |
1103 | 1159 | ||
1160 | static unsigned long wakeup_gran(struct sched_entity *se) | ||
1161 | { | ||
1162 | unsigned long gran = sysctl_sched_wakeup_granularity; | ||
1163 | |||
1164 | /* | ||
1165 | * More easily preempt - nice tasks, while not making it harder for | ||
1166 | * + nice tasks. | ||
1167 | */ | ||
1168 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | ||
1169 | |||
1170 | return gran; | ||
1171 | } | ||
1172 | |||
1173 | /* | ||
1174 | * Should 'se' preempt 'curr'. | ||
1175 | * | ||
1176 | * |s1 | ||
1177 | * |s2 | ||
1178 | * |s3 | ||
1179 | * g | ||
1180 | * |<--->|c | ||
1181 | * | ||
1182 | * w(c, s1) = -1 | ||
1183 | * w(c, s2) = 0 | ||
1184 | * w(c, s3) = 1 | ||
1185 | * | ||
1186 | */ | ||
1187 | static int | ||
1188 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | ||
1189 | { | ||
1190 | s64 gran, vdiff = curr->vruntime - se->vruntime; | ||
1191 | |||
1192 | if (vdiff < 0) | ||
1193 | return -1; | ||
1194 | |||
1195 | gran = wakeup_gran(curr); | ||
1196 | if (vdiff > gran) | ||
1197 | return 1; | ||
1198 | |||
1199 | return 0; | ||
1200 | } | ||
1201 | |||
1202 | /* return depth at which a sched entity is present in the hierarchy */ | ||
1203 | static inline int depth_se(struct sched_entity *se) | ||
1204 | { | ||
1205 | int depth = 0; | ||
1206 | |||
1207 | for_each_sched_entity(se) | ||
1208 | depth++; | ||
1209 | |||
1210 | return depth; | ||
1211 | } | ||
1104 | 1212 | ||
1105 | /* | 1213 | /* |
1106 | * Preempt the current task with a newly woken task if needed: | 1214 | * Preempt the current task with a newly woken task if needed: |
@@ -1110,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1110 | struct task_struct *curr = rq->curr; | 1218 | struct task_struct *curr = rq->curr; |
1111 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1219 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1112 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1220 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1113 | unsigned long gran; | 1221 | int se_depth, pse_depth; |
1114 | 1222 | ||
1115 | if (unlikely(rt_prio(p->prio))) { | 1223 | if (unlikely(rt_prio(p->prio))) { |
1116 | update_rq_clock(rq); | 1224 | update_rq_clock(rq); |
@@ -1135,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1135 | if (!sched_feat(WAKEUP_PREEMPT)) | 1243 | if (!sched_feat(WAKEUP_PREEMPT)) |
1136 | return; | 1244 | return; |
1137 | 1245 | ||
1138 | while (!is_same_group(se, pse)) { | 1246 | /* |
1247 | * preemption test can be made between sibling entities who are in the | ||
1248 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | ||
1249 | * both tasks until we find their ancestors who are siblings of common | ||
1250 | * parent. | ||
1251 | */ | ||
1252 | |||
1253 | /* First walk up until both entities are at same depth */ | ||
1254 | se_depth = depth_se(se); | ||
1255 | pse_depth = depth_se(pse); | ||
1256 | |||
1257 | while (se_depth > pse_depth) { | ||
1258 | se_depth--; | ||
1139 | se = parent_entity(se); | 1259 | se = parent_entity(se); |
1260 | } | ||
1261 | |||
1262 | while (pse_depth > se_depth) { | ||
1263 | pse_depth--; | ||
1140 | pse = parent_entity(pse); | 1264 | pse = parent_entity(pse); |
1141 | } | 1265 | } |
1142 | 1266 | ||
1143 | gran = sysctl_sched_wakeup_granularity; | 1267 | while (!is_same_group(se, pse)) { |
1144 | /* | 1268 | se = parent_entity(se); |
1145 | * More easily preempt - nice tasks, while not making | 1269 | pse = parent_entity(pse); |
1146 | * it harder for + nice tasks. | 1270 | } |
1147 | */ | ||
1148 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
1149 | gran = calc_delta_fair(gran, &se->load); | ||
1150 | 1271 | ||
1151 | if (pse->vruntime + gran < se->vruntime) | 1272 | if (wakeup_preempt_entity(se, pse) == 1) |
1152 | resched_task(curr); | 1273 | resched_task(curr); |
1153 | } | 1274 | } |
1154 | 1275 | ||
@@ -1199,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1199 | * the current task: | 1320 | * the current task: |
1200 | */ | 1321 | */ |
1201 | static struct task_struct * | 1322 | static struct task_struct * |
1202 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1323 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) |
1203 | { | 1324 | { |
1204 | struct task_struct *p; | 1325 | struct task_struct *p = NULL; |
1326 | struct sched_entity *se; | ||
1327 | |||
1328 | if (next == &cfs_rq->tasks) | ||
1329 | return NULL; | ||
1330 | |||
1331 | /* Skip over entities that are not tasks */ | ||
1332 | do { | ||
1333 | se = list_entry(next, struct sched_entity, group_node); | ||
1334 | next = next->next; | ||
1335 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
1205 | 1336 | ||
1206 | if (!curr) | 1337 | if (next == &cfs_rq->tasks) |
1207 | return NULL; | 1338 | return NULL; |
1208 | 1339 | ||
1209 | p = rb_entry(curr, struct task_struct, se.run_node); | 1340 | cfs_rq->balance_iterator = next; |
1210 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1341 | |
1342 | if (entity_is_task(se)) | ||
1343 | p = task_of(se); | ||
1211 | 1344 | ||
1212 | return p; | 1345 | return p; |
1213 | } | 1346 | } |
@@ -1216,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg) | |||
1216 | { | 1349 | { |
1217 | struct cfs_rq *cfs_rq = arg; | 1350 | struct cfs_rq *cfs_rq = arg; |
1218 | 1351 | ||
1219 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | 1352 | return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); |
1220 | } | 1353 | } |
1221 | 1354 | ||
1222 | static struct task_struct *load_balance_next_fair(void *arg) | 1355 | static struct task_struct *load_balance_next_fair(void *arg) |
1223 | { | 1356 | { |
1224 | struct cfs_rq *cfs_rq = arg; | 1357 | struct cfs_rq *cfs_rq = arg; |
1225 | 1358 | ||
1226 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1359 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
1227 | } | 1360 | } |
1228 | 1361 | ||
1229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1362 | static unsigned long |
1230 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1363 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1364 | unsigned long max_load_move, struct sched_domain *sd, | ||
1365 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
1366 | struct cfs_rq *cfs_rq) | ||
1231 | { | 1367 | { |
1232 | struct sched_entity *curr; | 1368 | struct rq_iterator cfs_rq_iterator; |
1233 | struct task_struct *p; | ||
1234 | |||
1235 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
1236 | return MAX_PRIO; | ||
1237 | |||
1238 | curr = cfs_rq->curr; | ||
1239 | if (!curr) | ||
1240 | curr = __pick_next_entity(cfs_rq); | ||
1241 | 1369 | ||
1242 | p = task_of(curr); | 1370 | cfs_rq_iterator.start = load_balance_start_fair; |
1371 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1372 | cfs_rq_iterator.arg = cfs_rq; | ||
1243 | 1373 | ||
1244 | return p->prio; | 1374 | return balance_tasks(this_rq, this_cpu, busiest, |
1375 | max_load_move, sd, idle, all_pinned, | ||
1376 | this_best_prio, &cfs_rq_iterator); | ||
1245 | } | 1377 | } |
1246 | #endif | ||
1247 | 1378 | ||
1379 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1248 | static unsigned long | 1380 | static unsigned long |
1249 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1381 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1250 | unsigned long max_load_move, | 1382 | unsigned long max_load_move, |
1251 | struct sched_domain *sd, enum cpu_idle_type idle, | 1383 | struct sched_domain *sd, enum cpu_idle_type idle, |
1252 | int *all_pinned, int *this_best_prio) | 1384 | int *all_pinned, int *this_best_prio) |
1253 | { | 1385 | { |
1254 | struct cfs_rq *busy_cfs_rq; | ||
1255 | long rem_load_move = max_load_move; | 1386 | long rem_load_move = max_load_move; |
1256 | struct rq_iterator cfs_rq_iterator; | 1387 | int busiest_cpu = cpu_of(busiest); |
1257 | 1388 | struct task_group *tg; | |
1258 | cfs_rq_iterator.start = load_balance_start_fair; | ||
1259 | cfs_rq_iterator.next = load_balance_next_fair; | ||
1260 | 1389 | ||
1261 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1390 | rcu_read_lock(); |
1262 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1391 | list_for_each_entry(tg, &task_groups, list) { |
1263 | struct cfs_rq *this_cfs_rq; | ||
1264 | long imbalance; | 1392 | long imbalance; |
1265 | unsigned long maxload; | 1393 | unsigned long this_weight, busiest_weight; |
1394 | long rem_load, max_load, moved_load; | ||
1395 | |||
1396 | /* | ||
1397 | * empty group | ||
1398 | */ | ||
1399 | if (!aggregate(tg, sd)->task_weight) | ||
1400 | continue; | ||
1401 | |||
1402 | rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; | ||
1403 | rem_load /= aggregate(tg, sd)->load + 1; | ||
1404 | |||
1405 | this_weight = tg->cfs_rq[this_cpu]->task_weight; | ||
1406 | busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; | ||
1407 | |||
1408 | imbalance = (busiest_weight - this_weight) / 2; | ||
1266 | 1409 | ||
1267 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1410 | if (imbalance < 0) |
1411 | imbalance = busiest_weight; | ||
1268 | 1412 | ||
1269 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1413 | max_load = max(rem_load, imbalance); |
1270 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1414 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
1271 | if (imbalance <= 0) | 1415 | max_load, sd, idle, all_pinned, this_best_prio, |
1416 | tg->cfs_rq[busiest_cpu]); | ||
1417 | |||
1418 | if (!moved_load) | ||
1272 | continue; | 1419 | continue; |
1273 | 1420 | ||
1274 | /* Don't pull more than imbalance/2 */ | 1421 | move_group_shares(tg, sd, busiest_cpu, this_cpu); |
1275 | imbalance /= 2; | ||
1276 | maxload = min(rem_load_move, imbalance); | ||
1277 | 1422 | ||
1278 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1423 | moved_load *= aggregate(tg, sd)->load; |
1279 | #else | 1424 | moved_load /= aggregate(tg, sd)->rq_weight + 1; |
1280 | # define maxload rem_load_move | ||
1281 | #endif | ||
1282 | /* | ||
1283 | * pass busy_cfs_rq argument into | ||
1284 | * load_balance_[start|next]_fair iterators | ||
1285 | */ | ||
1286 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
1287 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
1288 | maxload, sd, idle, all_pinned, | ||
1289 | this_best_prio, | ||
1290 | &cfs_rq_iterator); | ||
1291 | 1425 | ||
1292 | if (rem_load_move <= 0) | 1426 | rem_load_move -= moved_load; |
1427 | if (rem_load_move < 0) | ||
1293 | break; | 1428 | break; |
1294 | } | 1429 | } |
1430 | rcu_read_unlock(); | ||
1295 | 1431 | ||
1296 | return max_load_move - rem_load_move; | 1432 | return max_load_move - rem_load_move; |
1297 | } | 1433 | } |
1434 | #else | ||
1435 | static unsigned long | ||
1436 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
1437 | unsigned long max_load_move, | ||
1438 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
1439 | int *all_pinned, int *this_best_prio) | ||
1440 | { | ||
1441 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
1442 | max_load_move, sd, idle, all_pinned, | ||
1443 | this_best_prio, &busiest->cfs); | ||
1444 | } | ||
1445 | #endif | ||
1298 | 1446 | ||
1299 | static int | 1447 | static int |
1300 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1448 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
@@ -1463,16 +1611,40 @@ static const struct sched_class fair_sched_class = { | |||
1463 | }; | 1611 | }; |
1464 | 1612 | ||
1465 | #ifdef CONFIG_SCHED_DEBUG | 1613 | #ifdef CONFIG_SCHED_DEBUG |
1614 | static void | ||
1615 | print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) | ||
1616 | { | ||
1617 | struct sched_entity *se; | ||
1618 | |||
1619 | if (!cfs_rq) | ||
1620 | return; | ||
1621 | |||
1622 | list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { | ||
1623 | int i; | ||
1624 | |||
1625 | for (i = depth; i; i--) | ||
1626 | seq_puts(m, " "); | ||
1627 | |||
1628 | seq_printf(m, "%lu %s %lu\n", | ||
1629 | se->load.weight, | ||
1630 | entity_is_task(se) ? "T" : "G", | ||
1631 | calc_delta_weight(SCHED_LOAD_SCALE, se) | ||
1632 | ); | ||
1633 | if (!entity_is_task(se)) | ||
1634 | print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1466 | static void print_cfs_stats(struct seq_file *m, int cpu) | 1638 | static void print_cfs_stats(struct seq_file *m, int cpu) |
1467 | { | 1639 | { |
1468 | struct cfs_rq *cfs_rq; | 1640 | struct cfs_rq *cfs_rq; |
1469 | 1641 | ||
1470 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1471 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1472 | #endif | ||
1473 | rcu_read_lock(); | 1642 | rcu_read_lock(); |
1474 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1643 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1475 | print_cfs_rq(m, cpu, cfs_rq); | 1644 | print_cfs_rq(m, cpu, cfs_rq); |
1645 | |||
1646 | seq_printf(m, "\nWeight tree:\n"); | ||
1647 | print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); | ||
1476 | rcu_read_unlock(); | 1648 | rcu_read_unlock(); |
1477 | } | 1649 | } |
1478 | #endif | 1650 | #endif |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 000000000000..1c7283cb9581 --- /dev/null +++ b/kernel/sched_features.h | |||
@@ -0,0 +1,10 @@ | |||
1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) | ||
2 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
3 | SCHED_FEAT(START_DEBIT, 1) | ||
4 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | ||
5 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | ||
6 | SCHED_FEAT(SYNC_WAKEUPS, 1) | ||
7 | SCHED_FEAT(HRTICK, 1) | ||
8 | SCHED_FEAT(DOUBLE_TICK, 0) | ||
9 | SCHED_FEAT(NORMALIZED_SLEEPER, 1) | ||
10 | SCHED_FEAT(DEADLINE, 1) | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..c2730a5a4f05 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | |||
62 | if (!rt_rq->tg) | 62 | if (!rt_rq->tg) |
63 | return RUNTIME_INF; | 63 | return RUNTIME_INF; |
64 | 64 | ||
65 | return rt_rq->tg->rt_runtime; | 65 | return rt_rq->rt_runtime; |
66 | } | ||
67 | |||
68 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) | ||
69 | { | ||
70 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | ||
66 | } | 71 | } |
67 | 72 | ||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 73 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) | |||
127 | return p->prio != p->normal_prio; | 132 | return p->prio != p->normal_prio; |
128 | } | 133 | } |
129 | 134 | ||
135 | #ifdef CONFIG_SMP | ||
136 | static inline cpumask_t sched_rt_period_mask(void) | ||
137 | { | ||
138 | return cpu_rq(smp_processor_id())->rd->span; | ||
139 | } | ||
140 | #else | ||
141 | static inline cpumask_t sched_rt_period_mask(void) | ||
142 | { | ||
143 | return cpu_online_map; | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline | ||
148 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
149 | { | ||
150 | return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; | ||
151 | } | ||
152 | |||
153 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
154 | { | ||
155 | return &rt_rq->tg->rt_bandwidth; | ||
156 | } | ||
157 | |||
130 | #else | 158 | #else |
131 | 159 | ||
132 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) | 160 | static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) |
133 | { | 161 | { |
134 | if (sysctl_sched_rt_runtime == -1) | 162 | return rt_rq->rt_runtime; |
135 | return RUNTIME_INF; | 163 | } |
136 | 164 | ||
137 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 165 | static inline u64 sched_rt_period(struct rt_rq *rt_rq) |
166 | { | ||
167 | return ktime_to_ns(def_rt_bandwidth.rt_period); | ||
138 | } | 168 | } |
139 | 169 | ||
140 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 170 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
@@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) | |||
173 | { | 203 | { |
174 | return rt_rq->rt_throttled; | 204 | return rt_rq->rt_throttled; |
175 | } | 205 | } |
206 | |||
207 | static inline cpumask_t sched_rt_period_mask(void) | ||
208 | { | ||
209 | return cpu_online_map; | ||
210 | } | ||
211 | |||
212 | static inline | ||
213 | struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) | ||
214 | { | ||
215 | return &cpu_rq(cpu)->rt; | ||
216 | } | ||
217 | |||
218 | static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | ||
219 | { | ||
220 | return &def_rt_bandwidth; | ||
221 | } | ||
222 | |||
223 | #endif | ||
224 | |||
225 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | ||
226 | { | ||
227 | int i, idle = 1; | ||
228 | cpumask_t span; | ||
229 | |||
230 | if (rt_b->rt_runtime == RUNTIME_INF) | ||
231 | return 1; | ||
232 | |||
233 | span = sched_rt_period_mask(); | ||
234 | for_each_cpu_mask(i, span) { | ||
235 | int enqueue = 0; | ||
236 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | ||
237 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
238 | |||
239 | spin_lock(&rq->lock); | ||
240 | if (rt_rq->rt_time) { | ||
241 | u64 runtime; | ||
242 | |||
243 | spin_lock(&rt_rq->rt_runtime_lock); | ||
244 | runtime = rt_rq->rt_runtime; | ||
245 | rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); | ||
246 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
247 | rt_rq->rt_throttled = 0; | ||
248 | enqueue = 1; | ||
249 | } | ||
250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | ||
251 | idle = 0; | ||
252 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
253 | } | ||
254 | |||
255 | if (enqueue) | ||
256 | sched_rt_rq_enqueue(rt_rq); | ||
257 | spin_unlock(&rq->lock); | ||
258 | } | ||
259 | |||
260 | return idle; | ||
261 | } | ||
262 | |||
263 | #ifdef CONFIG_SMP | ||
264 | static int balance_runtime(struct rt_rq *rt_rq) | ||
265 | { | ||
266 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
267 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
268 | int i, weight, more = 0; | ||
269 | u64 rt_period; | ||
270 | |||
271 | weight = cpus_weight(rd->span); | ||
272 | |||
273 | spin_lock(&rt_b->rt_runtime_lock); | ||
274 | rt_period = ktime_to_ns(rt_b->rt_period); | ||
275 | for_each_cpu_mask(i, rd->span) { | ||
276 | struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); | ||
277 | s64 diff; | ||
278 | |||
279 | if (iter == rt_rq) | ||
280 | continue; | ||
281 | |||
282 | spin_lock(&iter->rt_runtime_lock); | ||
283 | diff = iter->rt_runtime - iter->rt_time; | ||
284 | if (diff > 0) { | ||
285 | do_div(diff, weight); | ||
286 | if (rt_rq->rt_runtime + diff > rt_period) | ||
287 | diff = rt_period - rt_rq->rt_runtime; | ||
288 | iter->rt_runtime -= diff; | ||
289 | rt_rq->rt_runtime += diff; | ||
290 | more = 1; | ||
291 | if (rt_rq->rt_runtime == rt_period) { | ||
292 | spin_unlock(&iter->rt_runtime_lock); | ||
293 | break; | ||
294 | } | ||
295 | } | ||
296 | spin_unlock(&iter->rt_runtime_lock); | ||
297 | } | ||
298 | spin_unlock(&rt_b->rt_runtime_lock); | ||
299 | |||
300 | return more; | ||
301 | } | ||
176 | #endif | 302 | #endif |
177 | 303 | ||
178 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | 304 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) |
@@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
197 | if (rt_rq->rt_throttled) | 323 | if (rt_rq->rt_throttled) |
198 | return rt_rq_throttled(rt_rq); | 324 | return rt_rq_throttled(rt_rq); |
199 | 325 | ||
326 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | ||
327 | return 0; | ||
328 | |||
329 | #ifdef CONFIG_SMP | ||
200 | if (rt_rq->rt_time > runtime) { | 330 | if (rt_rq->rt_time > runtime) { |
201 | struct rq *rq = rq_of_rt_rq(rt_rq); | 331 | int more; |
202 | 332 | ||
203 | rq->rt_throttled = 1; | 333 | spin_unlock(&rt_rq->rt_runtime_lock); |
204 | rt_rq->rt_throttled = 1; | 334 | more = balance_runtime(rt_rq); |
335 | spin_lock(&rt_rq->rt_runtime_lock); | ||
205 | 336 | ||
337 | if (more) | ||
338 | runtime = sched_rt_runtime(rt_rq); | ||
339 | } | ||
340 | #endif | ||
341 | |||
342 | if (rt_rq->rt_time > runtime) { | ||
343 | rt_rq->rt_throttled = 1; | ||
206 | if (rt_rq_throttled(rt_rq)) { | 344 | if (rt_rq_throttled(rt_rq)) { |
207 | sched_rt_rq_dequeue(rt_rq); | 345 | sched_rt_rq_dequeue(rt_rq); |
208 | return 1; | 346 | return 1; |
@@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
212 | return 0; | 350 | return 0; |
213 | } | 351 | } |
214 | 352 | ||
215 | static void update_sched_rt_period(struct rq *rq) | ||
216 | { | ||
217 | struct rt_rq *rt_rq; | ||
218 | u64 period; | ||
219 | |||
220 | while (rq->clock > rq->rt_period_expire) { | ||
221 | period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
222 | rq->rt_period_expire += period; | ||
223 | |||
224 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
225 | u64 runtime = sched_rt_runtime(rt_rq); | ||
226 | |||
227 | rt_rq->rt_time -= min(rt_rq->rt_time, runtime); | ||
228 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | ||
229 | rt_rq->rt_throttled = 0; | ||
230 | sched_rt_rq_enqueue(rt_rq); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | rq->rt_throttled = 0; | ||
235 | } | ||
236 | } | ||
237 | |||
238 | /* | 353 | /* |
239 | * Update the current task's runtime statistics. Skip current tasks that | 354 | * Update the current task's runtime statistics. Skip current tasks that |
240 | * are not in our scheduling class. | 355 | * are not in our scheduling class. |
@@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
259 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
260 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
261 | 376 | ||
262 | rt_rq->rt_time += delta_exec; | 377 | for_each_sched_rt_entity(rt_se) { |
263 | if (sched_rt_runtime_exceeded(rt_rq)) | 378 | rt_rq = rt_rq_of_se(rt_se); |
264 | resched_task(curr); | 379 | |
380 | spin_lock(&rt_rq->rt_runtime_lock); | ||
381 | rt_rq->rt_time += delta_exec; | ||
382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
383 | resched_task(curr); | ||
384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
385 | } | ||
265 | } | 386 | } |
266 | 387 | ||
267 | static inline | 388 | static inline |
@@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
284 | #ifdef CONFIG_RT_GROUP_SCHED | 405 | #ifdef CONFIG_RT_GROUP_SCHED |
285 | if (rt_se_boosted(rt_se)) | 406 | if (rt_se_boosted(rt_se)) |
286 | rt_rq->rt_nr_boosted++; | 407 | rt_rq->rt_nr_boosted++; |
408 | |||
409 | if (rt_rq->tg) | ||
410 | start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); | ||
411 | #else | ||
412 | start_rt_bandwidth(&def_rt_bandwidth); | ||
287 | #endif | 413 | #endif |
288 | } | 414 | } |
289 | 415 | ||
@@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
353 | /* | 479 | /* |
354 | * Because the prio of an upper entry depends on the lower | 480 | * Because the prio of an upper entry depends on the lower |
355 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
356 | * | ||
357 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
358 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
359 | */ | 482 | */ |
360 | static void dequeue_rt_stack(struct task_struct *p) | 483 | static void dequeue_rt_stack(struct task_struct *p) |
361 | { | 484 | { |
362 | struct sched_rt_entity *rt_se, *top_se; | 485 | struct sched_rt_entity *rt_se, *back = NULL; |
363 | 486 | ||
364 | /* | 487 | rt_se = &p->rt; |
365 | * dequeue all, top - down. | 488 | for_each_sched_rt_entity(rt_se) { |
366 | */ | 489 | rt_se->back = back; |
367 | do { | 490 | back = rt_se; |
368 | rt_se = &p->rt; | 491 | } |
369 | top_se = NULL; | 492 | |
370 | for_each_sched_rt_entity(rt_se) { | 493 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
371 | if (on_rt_rq(rt_se)) | 494 | if (on_rt_rq(rt_se)) |
372 | top_se = rt_se; | 495 | dequeue_rt_entity(rt_se); |
373 | } | 496 | } |
374 | if (top_se) | ||
375 | dequeue_rt_entity(top_se); | ||
376 | } while (top_se); | ||
377 | } | 497 | } |
378 | 498 | ||
379 | /* | 499 | /* |
@@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | |||
393 | */ | 513 | */ |
394 | for_each_sched_rt_entity(rt_se) | 514 | for_each_sched_rt_entity(rt_se) |
395 | enqueue_rt_entity(rt_se); | 515 | enqueue_rt_entity(rt_se); |
516 | |||
517 | inc_cpu_load(rq, p->se.load.weight); | ||
396 | } | 518 | } |
397 | 519 | ||
398 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 520 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
@@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | |||
412 | if (rt_rq && rt_rq->rt_nr_running) | 534 | if (rt_rq && rt_rq->rt_nr_running) |
413 | enqueue_rt_entity(rt_se); | 535 | enqueue_rt_entity(rt_se); |
414 | } | 536 | } |
537 | |||
538 | dec_cpu_load(rq, p->se.load.weight); | ||
415 | } | 539 | } |
416 | 540 | ||
417 | /* | 541 | /* |
@@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1001 | return 0; | 1125 | return 0; |
1002 | } | 1126 | } |
1003 | 1127 | ||
1004 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | 1128 | static void set_cpus_allowed_rt(struct task_struct *p, |
1129 | const cpumask_t *new_mask) | ||
1005 | { | 1130 | { |
1006 | int weight = cpus_weight(*new_mask); | 1131 | int weight = cpus_weight(*new_mask); |
1007 | 1132 | ||
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee5..5bae2e0c3ff2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -9,6 +9,11 @@ | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | 9 | static int show_schedstat(struct seq_file *seq, void *v) |
10 | { | 10 | { |
11 | int cpu; | 11 | int cpu; |
12 | int mask_len = NR_CPUS/32 * 9; | ||
13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
14 | |||
15 | if (mask_str == NULL) | ||
16 | return -ENOMEM; | ||
12 | 17 | ||
13 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
14 | seq_printf(seq, "timestamp %lu\n", jiffies); | 19 | seq_printf(seq, "timestamp %lu\n", jiffies); |
@@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
36 | preempt_disable(); | 41 | preempt_disable(); |
37 | for_each_domain(cpu, sd) { | 42 | for_each_domain(cpu, sd) { |
38 | enum cpu_idle_type itype; | 43 | enum cpu_idle_type itype; |
39 | char mask_str[NR_CPUS]; | ||
40 | 44 | ||
41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 45 | cpumask_scnprintf(mask_str, mask_len, sd->span); |
42 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | 46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); |
43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
44 | itype++) { | 48 | itype++) { |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 000000000000..5c2942e768cd --- /dev/null +++ b/kernel/semaphore.c | |||
@@ -0,0 +1,264 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2008 Intel Corporation | ||
3 | * Author: Matthew Wilcox <willy@linux.intel.com> | ||
4 | * | ||
5 | * Distributed under the terms of the GNU GPL, version 2 | ||
6 | * | ||
7 | * This file implements counting semaphores. | ||
8 | * A counting semaphore may be acquired 'n' times before sleeping. | ||
9 | * See mutex.c for single-acquisition sleeping locks which enforce | ||
10 | * rules which allow code to be debugged more easily. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * Some notes on the implementation: | ||
15 | * | ||
16 | * The spinlock controls access to the other members of the semaphore. | ||
17 | * down_trylock() and up() can be called from interrupt context, so we | ||
18 | * have to disable interrupts when taking the lock. It turns out various | ||
19 | * parts of the kernel expect to be able to use down() on a semaphore in | ||
20 | * interrupt context when they know it will succeed, so we have to use | ||
21 | * irqsave variants for down(), down_interruptible() and down_killable() | ||
22 | * too. | ||
23 | * | ||
24 | * The ->count variable represents how many more tasks can acquire this | ||
25 | * semaphore. If it's zero, there may be tasks waiting on the wait_list. | ||
26 | */ | ||
27 | |||
28 | #include <linux/compiler.h> | ||
29 | #include <linux/kernel.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/semaphore.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | |||
35 | static noinline void __down(struct semaphore *sem); | ||
36 | static noinline int __down_interruptible(struct semaphore *sem); | ||
37 | static noinline int __down_killable(struct semaphore *sem); | ||
38 | static noinline int __down_timeout(struct semaphore *sem, long jiffies); | ||
39 | static noinline void __up(struct semaphore *sem); | ||
40 | |||
41 | /** | ||
42 | * down - acquire the semaphore | ||
43 | * @sem: the semaphore to be acquired | ||
44 | * | ||
45 | * Acquires the semaphore. If no more tasks are allowed to acquire the | ||
46 | * semaphore, calling this function will put the task to sleep until the | ||
47 | * semaphore is released. | ||
48 | * | ||
49 | * Use of this function is deprecated, please use down_interruptible() or | ||
50 | * down_killable() instead. | ||
51 | */ | ||
52 | void down(struct semaphore *sem) | ||
53 | { | ||
54 | unsigned long flags; | ||
55 | |||
56 | spin_lock_irqsave(&sem->lock, flags); | ||
57 | if (likely(sem->count > 0)) | ||
58 | sem->count--; | ||
59 | else | ||
60 | __down(sem); | ||
61 | spin_unlock_irqrestore(&sem->lock, flags); | ||
62 | } | ||
63 | EXPORT_SYMBOL(down); | ||
64 | |||
65 | /** | ||
66 | * down_interruptible - acquire the semaphore unless interrupted | ||
67 | * @sem: the semaphore to be acquired | ||
68 | * | ||
69 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
70 | * acquire the semaphore, calling this function will put the task to sleep. | ||
71 | * If the sleep is interrupted by a signal, this function will return -EINTR. | ||
72 | * If the semaphore is successfully acquired, this function returns 0. | ||
73 | */ | ||
74 | int down_interruptible(struct semaphore *sem) | ||
75 | { | ||
76 | unsigned long flags; | ||
77 | int result = 0; | ||
78 | |||
79 | spin_lock_irqsave(&sem->lock, flags); | ||
80 | if (likely(sem->count > 0)) | ||
81 | sem->count--; | ||
82 | else | ||
83 | result = __down_interruptible(sem); | ||
84 | spin_unlock_irqrestore(&sem->lock, flags); | ||
85 | |||
86 | return result; | ||
87 | } | ||
88 | EXPORT_SYMBOL(down_interruptible); | ||
89 | |||
90 | /** | ||
91 | * down_killable - acquire the semaphore unless killed | ||
92 | * @sem: the semaphore to be acquired | ||
93 | * | ||
94 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
95 | * acquire the semaphore, calling this function will put the task to sleep. | ||
96 | * If the sleep is interrupted by a fatal signal, this function will return | ||
97 | * -EINTR. If the semaphore is successfully acquired, this function returns | ||
98 | * 0. | ||
99 | */ | ||
100 | int down_killable(struct semaphore *sem) | ||
101 | { | ||
102 | unsigned long flags; | ||
103 | int result = 0; | ||
104 | |||
105 | spin_lock_irqsave(&sem->lock, flags); | ||
106 | if (likely(sem->count > 0)) | ||
107 | sem->count--; | ||
108 | else | ||
109 | result = __down_killable(sem); | ||
110 | spin_unlock_irqrestore(&sem->lock, flags); | ||
111 | |||
112 | return result; | ||
113 | } | ||
114 | EXPORT_SYMBOL(down_killable); | ||
115 | |||
116 | /** | ||
117 | * down_trylock - try to acquire the semaphore, without waiting | ||
118 | * @sem: the semaphore to be acquired | ||
119 | * | ||
120 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has | ||
121 | * been acquired successfully or 1 if it cannot be acquired. | ||
122 | * | ||
123 | * NOTE: This return value is inverted from both spin_trylock and | ||
124 | * mutex_trylock! Be careful about this when converting code. | ||
125 | * | ||
126 | * Unlike mutex_trylock, this function can be used from interrupt context, | ||
127 | * and the semaphore can be released by any task or interrupt. | ||
128 | */ | ||
129 | int down_trylock(struct semaphore *sem) | ||
130 | { | ||
131 | unsigned long flags; | ||
132 | int count; | ||
133 | |||
134 | spin_lock_irqsave(&sem->lock, flags); | ||
135 | count = sem->count - 1; | ||
136 | if (likely(count >= 0)) | ||
137 | sem->count = count; | ||
138 | spin_unlock_irqrestore(&sem->lock, flags); | ||
139 | |||
140 | return (count < 0); | ||
141 | } | ||
142 | EXPORT_SYMBOL(down_trylock); | ||
143 | |||
144 | /** | ||
145 | * down_timeout - acquire the semaphore within a specified time | ||
146 | * @sem: the semaphore to be acquired | ||
147 | * @jiffies: how long to wait before failing | ||
148 | * | ||
149 | * Attempts to acquire the semaphore. If no more tasks are allowed to | ||
150 | * acquire the semaphore, calling this function will put the task to sleep. | ||
151 | * If the semaphore is not released within the specified number of jiffies, | ||
152 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. | ||
153 | */ | ||
154 | int down_timeout(struct semaphore *sem, long jiffies) | ||
155 | { | ||
156 | unsigned long flags; | ||
157 | int result = 0; | ||
158 | |||
159 | spin_lock_irqsave(&sem->lock, flags); | ||
160 | if (likely(sem->count > 0)) | ||
161 | sem->count--; | ||
162 | else | ||
163 | result = __down_timeout(sem, jiffies); | ||
164 | spin_unlock_irqrestore(&sem->lock, flags); | ||
165 | |||
166 | return result; | ||
167 | } | ||
168 | EXPORT_SYMBOL(down_timeout); | ||
169 | |||
170 | /** | ||
171 | * up - release the semaphore | ||
172 | * @sem: the semaphore to release | ||
173 | * | ||
174 | * Release the semaphore. Unlike mutexes, up() may be called from any | ||
175 | * context and even by tasks which have never called down(). | ||
176 | */ | ||
177 | void up(struct semaphore *sem) | ||
178 | { | ||
179 | unsigned long flags; | ||
180 | |||
181 | spin_lock_irqsave(&sem->lock, flags); | ||
182 | if (likely(list_empty(&sem->wait_list))) | ||
183 | sem->count++; | ||
184 | else | ||
185 | __up(sem); | ||
186 | spin_unlock_irqrestore(&sem->lock, flags); | ||
187 | } | ||
188 | EXPORT_SYMBOL(up); | ||
189 | |||
190 | /* Functions for the contended case */ | ||
191 | |||
192 | struct semaphore_waiter { | ||
193 | struct list_head list; | ||
194 | struct task_struct *task; | ||
195 | int up; | ||
196 | }; | ||
197 | |||
198 | /* | ||
199 | * Because this function is inlined, the 'state' parameter will be | ||
200 | * constant, and thus optimised away by the compiler. Likewise the | ||
201 | * 'timeout' parameter for the cases without timeouts. | ||
202 | */ | ||
203 | static inline int __sched __down_common(struct semaphore *sem, long state, | ||
204 | long timeout) | ||
205 | { | ||
206 | struct task_struct *task = current; | ||
207 | struct semaphore_waiter waiter; | ||
208 | |||
209 | list_add_tail(&waiter.list, &sem->wait_list); | ||
210 | waiter.task = task; | ||
211 | waiter.up = 0; | ||
212 | |||
213 | for (;;) { | ||
214 | if (state == TASK_INTERRUPTIBLE && signal_pending(task)) | ||
215 | goto interrupted; | ||
216 | if (state == TASK_KILLABLE && fatal_signal_pending(task)) | ||
217 | goto interrupted; | ||
218 | if (timeout <= 0) | ||
219 | goto timed_out; | ||
220 | __set_task_state(task, state); | ||
221 | spin_unlock_irq(&sem->lock); | ||
222 | timeout = schedule_timeout(timeout); | ||
223 | spin_lock_irq(&sem->lock); | ||
224 | if (waiter.up) | ||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | timed_out: | ||
229 | list_del(&waiter.list); | ||
230 | return -ETIME; | ||
231 | |||
232 | interrupted: | ||
233 | list_del(&waiter.list); | ||
234 | return -EINTR; | ||
235 | } | ||
236 | |||
237 | static noinline void __sched __down(struct semaphore *sem) | ||
238 | { | ||
239 | __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
240 | } | ||
241 | |||
242 | static noinline int __sched __down_interruptible(struct semaphore *sem) | ||
243 | { | ||
244 | return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
245 | } | ||
246 | |||
247 | static noinline int __sched __down_killable(struct semaphore *sem) | ||
248 | { | ||
249 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); | ||
250 | } | ||
251 | |||
252 | static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) | ||
253 | { | ||
254 | return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); | ||
255 | } | ||
256 | |||
257 | static noinline void __sched __up(struct semaphore *sem) | ||
258 | { | ||
259 | struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, | ||
260 | struct semaphore_waiter, list); | ||
261 | list_del(&waiter->list); | ||
262 | waiter->up = 1; | ||
263 | wake_up_process(waiter->task); | ||
264 | } | ||
diff --git a/kernel/signal.c b/kernel/signal.c index 6af1210092c3..64ad0ed15992 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t) | |||
220 | unsigned long flags; | 220 | unsigned long flags; |
221 | 221 | ||
222 | spin_lock_irqsave(&t->sighand->siglock, flags); | 222 | spin_lock_irqsave(&t->sighand->siglock, flags); |
223 | clear_tsk_thread_flag(t,TIF_SIGPENDING); | 223 | clear_tsk_thread_flag(t, TIF_SIGPENDING); |
224 | flush_sigqueue(&t->pending); | 224 | flush_sigqueue(&t->pending); |
225 | flush_sigqueue(&t->signal->shared_pending); | 225 | flush_sigqueue(&t->signal->shared_pending); |
226 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 226 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
@@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
424 | } | 424 | } |
425 | if (signr && | 425 | if (signr && |
426 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 426 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
427 | info->si_sys_private){ | 427 | info->si_sys_private) { |
428 | /* | 428 | /* |
429 | * Release the siglock to ensure proper locking order | 429 | * Release the siglock to ensure proper locking order |
430 | * of timer locks outside of siglocks. Note, we leave | 430 | * of timer locks outside of siglocks. Note, we leave |
@@ -1757,6 +1757,45 @@ static int do_signal_stop(int signr) | |||
1757 | return 1; | 1757 | return 1; |
1758 | } | 1758 | } |
1759 | 1759 | ||
1760 | static int ptrace_signal(int signr, siginfo_t *info, | ||
1761 | struct pt_regs *regs, void *cookie) | ||
1762 | { | ||
1763 | if (!(current->ptrace & PT_PTRACED)) | ||
1764 | return signr; | ||
1765 | |||
1766 | ptrace_signal_deliver(regs, cookie); | ||
1767 | |||
1768 | /* Let the debugger run. */ | ||
1769 | ptrace_stop(signr, 0, info); | ||
1770 | |||
1771 | /* We're back. Did the debugger cancel the sig? */ | ||
1772 | signr = current->exit_code; | ||
1773 | if (signr == 0) | ||
1774 | return signr; | ||
1775 | |||
1776 | current->exit_code = 0; | ||
1777 | |||
1778 | /* Update the siginfo structure if the signal has | ||
1779 | changed. If the debugger wanted something | ||
1780 | specific in the siginfo structure then it should | ||
1781 | have updated *info via PTRACE_SETSIGINFO. */ | ||
1782 | if (signr != info->si_signo) { | ||
1783 | info->si_signo = signr; | ||
1784 | info->si_errno = 0; | ||
1785 | info->si_code = SI_USER; | ||
1786 | info->si_pid = task_pid_vnr(current->parent); | ||
1787 | info->si_uid = current->parent->uid; | ||
1788 | } | ||
1789 | |||
1790 | /* If the (new) signal is now blocked, requeue it. */ | ||
1791 | if (sigismember(&current->blocked, signr)) { | ||
1792 | specific_send_sig_info(signr, info, current); | ||
1793 | signr = 0; | ||
1794 | } | ||
1795 | |||
1796 | return signr; | ||
1797 | } | ||
1798 | |||
1760 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | 1799 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, |
1761 | struct pt_regs *regs, void *cookie) | 1800 | struct pt_regs *regs, void *cookie) |
1762 | { | 1801 | { |
@@ -1785,36 +1824,10 @@ relock: | |||
1785 | if (!signr) | 1824 | if (!signr) |
1786 | break; /* will return 0 */ | 1825 | break; /* will return 0 */ |
1787 | 1826 | ||
1788 | if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { | 1827 | if (signr != SIGKILL) { |
1789 | ptrace_signal_deliver(regs, cookie); | 1828 | signr = ptrace_signal(signr, info, regs, cookie); |
1790 | 1829 | if (!signr) | |
1791 | /* Let the debugger run. */ | ||
1792 | ptrace_stop(signr, 0, info); | ||
1793 | |||
1794 | /* We're back. Did the debugger cancel the sig? */ | ||
1795 | signr = current->exit_code; | ||
1796 | if (signr == 0) | ||
1797 | continue; | ||
1798 | |||
1799 | current->exit_code = 0; | ||
1800 | |||
1801 | /* Update the siginfo structure if the signal has | ||
1802 | changed. If the debugger wanted something | ||
1803 | specific in the siginfo structure then it should | ||
1804 | have updated *info via PTRACE_SETSIGINFO. */ | ||
1805 | if (signr != info->si_signo) { | ||
1806 | info->si_signo = signr; | ||
1807 | info->si_errno = 0; | ||
1808 | info->si_code = SI_USER; | ||
1809 | info->si_pid = task_pid_vnr(current->parent); | ||
1810 | info->si_uid = current->parent->uid; | ||
1811 | } | ||
1812 | |||
1813 | /* If the (new) signal is now blocked, requeue it. */ | ||
1814 | if (sigismember(&current->blocked, signr)) { | ||
1815 | specific_send_sig_info(signr, info, current); | ||
1816 | continue; | 1830 | continue; |
1817 | } | ||
1818 | } | 1831 | } |
1819 | 1832 | ||
1820 | ka = &current->sighand->action[signr-1]; | 1833 | ka = &current->sighand->action[signr-1]; |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a47928..3c44956ee7e2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
356 | /* Tasklets */ | 356 | /* Tasklets */ |
357 | struct tasklet_head | 357 | struct tasklet_head |
358 | { | 358 | { |
359 | struct tasklet_struct *list; | 359 | struct tasklet_struct *head; |
360 | struct tasklet_struct **tail; | ||
360 | }; | 361 | }; |
361 | 362 | ||
362 | /* Some compilers disobey section attribute on statics when not | 363 | /* Some compilers disobey section attribute on statics when not |
@@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
369 | unsigned long flags; | 370 | unsigned long flags; |
370 | 371 | ||
371 | local_irq_save(flags); | 372 | local_irq_save(flags); |
372 | t->next = __get_cpu_var(tasklet_vec).list; | 373 | t->next = NULL; |
373 | __get_cpu_var(tasklet_vec).list = t; | 374 | *__get_cpu_var(tasklet_vec).tail = t; |
375 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
374 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 376 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
375 | local_irq_restore(flags); | 377 | local_irq_restore(flags); |
376 | } | 378 | } |
@@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
382 | unsigned long flags; | 384 | unsigned long flags; |
383 | 385 | ||
384 | local_irq_save(flags); | 386 | local_irq_save(flags); |
385 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 387 | t->next = NULL; |
386 | __get_cpu_var(tasklet_hi_vec).list = t; | 388 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
389 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
387 | raise_softirq_irqoff(HI_SOFTIRQ); | 390 | raise_softirq_irqoff(HI_SOFTIRQ); |
388 | local_irq_restore(flags); | 391 | local_irq_restore(flags); |
389 | } | 392 | } |
@@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) | |||
395 | struct tasklet_struct *list; | 398 | struct tasklet_struct *list; |
396 | 399 | ||
397 | local_irq_disable(); | 400 | local_irq_disable(); |
398 | list = __get_cpu_var(tasklet_vec).list; | 401 | list = __get_cpu_var(tasklet_vec).head; |
399 | __get_cpu_var(tasklet_vec).list = NULL; | 402 | __get_cpu_var(tasklet_vec).head = NULL; |
403 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | ||
400 | local_irq_enable(); | 404 | local_irq_enable(); |
401 | 405 | ||
402 | while (list) { | 406 | while (list) { |
@@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) | |||
416 | } | 420 | } |
417 | 421 | ||
418 | local_irq_disable(); | 422 | local_irq_disable(); |
419 | t->next = __get_cpu_var(tasklet_vec).list; | 423 | t->next = NULL; |
420 | __get_cpu_var(tasklet_vec).list = t; | 424 | *__get_cpu_var(tasklet_vec).tail = t; |
425 | __get_cpu_var(tasklet_vec).tail = &(t->next); | ||
421 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 426 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
422 | local_irq_enable(); | 427 | local_irq_enable(); |
423 | } | 428 | } |
@@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
428 | struct tasklet_struct *list; | 433 | struct tasklet_struct *list; |
429 | 434 | ||
430 | local_irq_disable(); | 435 | local_irq_disable(); |
431 | list = __get_cpu_var(tasklet_hi_vec).list; | 436 | list = __get_cpu_var(tasklet_hi_vec).head; |
432 | __get_cpu_var(tasklet_hi_vec).list = NULL; | 437 | __get_cpu_var(tasklet_hi_vec).head = NULL; |
438 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | ||
433 | local_irq_enable(); | 439 | local_irq_enable(); |
434 | 440 | ||
435 | while (list) { | 441 | while (list) { |
@@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
449 | } | 455 | } |
450 | 456 | ||
451 | local_irq_disable(); | 457 | local_irq_disable(); |
452 | t->next = __get_cpu_var(tasklet_hi_vec).list; | 458 | t->next = NULL; |
453 | __get_cpu_var(tasklet_hi_vec).list = t; | 459 | *__get_cpu_var(tasklet_hi_vec).tail = t; |
460 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | ||
454 | __raise_softirq_irqoff(HI_SOFTIRQ); | 461 | __raise_softirq_irqoff(HI_SOFTIRQ); |
455 | local_irq_enable(); | 462 | local_irq_enable(); |
456 | } | 463 | } |
@@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); | |||
487 | 494 | ||
488 | void __init softirq_init(void) | 495 | void __init softirq_init(void) |
489 | { | 496 | { |
497 | int cpu; | ||
498 | |||
499 | for_each_possible_cpu(cpu) { | ||
500 | per_cpu(tasklet_vec, cpu).tail = | ||
501 | &per_cpu(tasklet_vec, cpu).head; | ||
502 | per_cpu(tasklet_hi_vec, cpu).tail = | ||
503 | &per_cpu(tasklet_hi_vec, cpu).head; | ||
504 | } | ||
505 | |||
490 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); | 506 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); |
491 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); | 507 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); |
492 | } | 508 | } |
@@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
555 | return; | 571 | return; |
556 | 572 | ||
557 | /* CPU is dead, so no lock needed. */ | 573 | /* CPU is dead, so no lock needed. */ |
558 | for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { | 574 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { |
559 | if (*i == t) { | 575 | if (*i == t) { |
560 | *i = t->next; | 576 | *i = t->next; |
577 | /* If this was the tail element, move the tail ptr */ | ||
578 | if (*i == NULL) | ||
579 | per_cpu(tasklet_vec, cpu).tail = i; | ||
561 | return; | 580 | return; |
562 | } | 581 | } |
563 | } | 582 | } |
@@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
566 | 585 | ||
567 | static void takeover_tasklets(unsigned int cpu) | 586 | static void takeover_tasklets(unsigned int cpu) |
568 | { | 587 | { |
569 | struct tasklet_struct **i; | ||
570 | |||
571 | /* CPU is dead, so no lock needed. */ | 588 | /* CPU is dead, so no lock needed. */ |
572 | local_irq_disable(); | 589 | local_irq_disable(); |
573 | 590 | ||
574 | /* Find end, append list for that CPU. */ | 591 | /* Find end, append list for that CPU. */ |
575 | for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); | 592 | *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; |
576 | *i = per_cpu(tasklet_vec, cpu).list; | 593 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; |
577 | per_cpu(tasklet_vec, cpu).list = NULL; | 594 | per_cpu(tasklet_vec, cpu).head = NULL; |
595 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | ||
578 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 596 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
579 | 597 | ||
580 | for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); | 598 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; |
581 | *i = per_cpu(tasklet_hi_vec, cpu).list; | 599 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; |
582 | per_cpu(tasklet_hi_vec, cpu).list = NULL; | 600 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
601 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | ||
583 | raise_softirq_irqoff(HI_SOFTIRQ); | 602 | raise_softirq_irqoff(HI_SOFTIRQ); |
584 | 603 | ||
585 | local_irq_enable(); | 604 | local_irq_enable(); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70c..0101aeef7ed7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -11,7 +11,6 @@ | |||
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | 12 | ||
13 | #include <asm/atomic.h> | 13 | #include <asm/atomic.h> |
14 | #include <asm/semaphore.h> | ||
15 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
16 | 15 | ||
17 | /* Since we affect priority and affinity (both of which are visible | 16 | /* Since we affect priority and affinity (both of which are visible |
@@ -35,7 +34,7 @@ static int stopmachine(void *cpu) | |||
35 | int irqs_disabled = 0; | 34 | int irqs_disabled = 0; |
36 | int prepared = 0; | 35 | int prepared = 0; |
37 | 36 | ||
38 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | 37 | set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); |
39 | 38 | ||
40 | /* Ack: we are alive */ | 39 | /* Ack: we are alive */ |
41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | 40 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ |
@@ -135,8 +134,7 @@ static void restart_machine(void) | |||
135 | preempt_enable_no_resched(); | 134 | preempt_enable_no_resched(); |
136 | } | 135 | } |
137 | 136 | ||
138 | struct stop_machine_data | 137 | struct stop_machine_data { |
139 | { | ||
140 | int (*fn)(void *); | 138 | int (*fn)(void *); |
141 | void *data; | 139 | void *data; |
142 | struct completion done; | 140 | struct completion done; |
diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5db..f2a451366953 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -67,6 +67,12 @@ | |||
67 | #ifndef SET_ENDIAN | 67 | #ifndef SET_ENDIAN |
68 | # define SET_ENDIAN(a,b) (-EINVAL) | 68 | # define SET_ENDIAN(a,b) (-EINVAL) |
69 | #endif | 69 | #endif |
70 | #ifndef GET_TSC_CTL | ||
71 | # define GET_TSC_CTL(a) (-EINVAL) | ||
72 | #endif | ||
73 | #ifndef SET_TSC_CTL | ||
74 | # define SET_TSC_CTL(a) (-EINVAL) | ||
75 | #endif | ||
70 | 76 | ||
71 | /* | 77 | /* |
72 | * this is where the system-wide overflow UID and GID are defined, for | 78 | * this is where the system-wide overflow UID and GID are defined, for |
@@ -1626,10 +1632,9 @@ asmlinkage long sys_umask(int mask) | |||
1626 | asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | 1632 | asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, |
1627 | unsigned long arg4, unsigned long arg5) | 1633 | unsigned long arg4, unsigned long arg5) |
1628 | { | 1634 | { |
1629 | long error; | 1635 | long uninitialized_var(error); |
1630 | 1636 | ||
1631 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); | 1637 | if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) |
1632 | if (error) | ||
1633 | return error; | 1638 | return error; |
1634 | 1639 | ||
1635 | switch (option) { | 1640 | switch (option) { |
@@ -1682,17 +1687,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1682 | error = -EINVAL; | 1687 | error = -EINVAL; |
1683 | break; | 1688 | break; |
1684 | 1689 | ||
1685 | case PR_GET_KEEPCAPS: | ||
1686 | if (current->keep_capabilities) | ||
1687 | error = 1; | ||
1688 | break; | ||
1689 | case PR_SET_KEEPCAPS: | ||
1690 | if (arg2 != 0 && arg2 != 1) { | ||
1691 | error = -EINVAL; | ||
1692 | break; | ||
1693 | } | ||
1694 | current->keep_capabilities = arg2; | ||
1695 | break; | ||
1696 | case PR_SET_NAME: { | 1690 | case PR_SET_NAME: { |
1697 | struct task_struct *me = current; | 1691 | struct task_struct *me = current; |
1698 | unsigned char ncomm[sizeof(me->comm)]; | 1692 | unsigned char ncomm[sizeof(me->comm)]; |
@@ -1726,18 +1720,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1726 | case PR_SET_SECCOMP: | 1720 | case PR_SET_SECCOMP: |
1727 | error = prctl_set_seccomp(arg2); | 1721 | error = prctl_set_seccomp(arg2); |
1728 | break; | 1722 | break; |
1729 | 1723 | case PR_GET_TSC: | |
1730 | case PR_CAPBSET_READ: | 1724 | error = GET_TSC_CTL(arg2); |
1731 | if (!cap_valid(arg2)) | 1725 | break; |
1732 | return -EINVAL; | 1726 | case PR_SET_TSC: |
1733 | return !!cap_raised(current->cap_bset, arg2); | 1727 | error = SET_TSC_CTL(arg2); |
1734 | case PR_CAPBSET_DROP: | 1728 | break; |
1735 | #ifdef CONFIG_SECURITY_FILE_CAPABILITIES | ||
1736 | return cap_prctl_drop(arg2); | ||
1737 | #else | ||
1738 | return -EINVAL; | ||
1739 | #endif | ||
1740 | |||
1741 | default: | 1729 | default: |
1742 | error = -EINVAL; | 1730 | error = -EINVAL; |
1743 | break; | 1731 | break; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889bab..fd3364827ccf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = { | |||
270 | }, | 270 | }, |
271 | { | 271 | { |
272 | .ctl_name = CTL_UNNUMBERED, | 272 | .ctl_name = CTL_UNNUMBERED, |
273 | .procname = "sched_batch_wakeup_granularity_ns", | ||
274 | .data = &sysctl_sched_batch_wakeup_granularity, | ||
275 | .maxlen = sizeof(unsigned int), | ||
276 | .mode = 0644, | ||
277 | .proc_handler = &proc_dointvec_minmax, | ||
278 | .strategy = &sysctl_intvec, | ||
279 | .extra1 = &min_wakeup_granularity_ns, | ||
280 | .extra2 = &max_wakeup_granularity_ns, | ||
281 | }, | ||
282 | { | ||
283 | .ctl_name = CTL_UNNUMBERED, | ||
284 | .procname = "sched_child_runs_first", | 273 | .procname = "sched_child_runs_first", |
285 | .data = &sysctl_sched_child_runs_first, | 274 | .data = &sysctl_sched_child_runs_first, |
286 | .maxlen = sizeof(unsigned int), | 275 | .maxlen = sizeof(unsigned int), |
@@ -318,7 +307,7 @@ static struct ctl_table kern_table[] = { | |||
318 | .data = &sysctl_sched_rt_period, | 307 | .data = &sysctl_sched_rt_period, |
319 | .maxlen = sizeof(unsigned int), | 308 | .maxlen = sizeof(unsigned int), |
320 | .mode = 0644, | 309 | .mode = 0644, |
321 | .proc_handler = &proc_dointvec, | 310 | .proc_handler = &sched_rt_handler, |
322 | }, | 311 | }, |
323 | { | 312 | { |
324 | .ctl_name = CTL_UNNUMBERED, | 313 | .ctl_name = CTL_UNNUMBERED, |
@@ -326,7 +315,7 @@ static struct ctl_table kern_table[] = { | |||
326 | .data = &sysctl_sched_rt_runtime, | 315 | .data = &sysctl_sched_rt_runtime, |
327 | .maxlen = sizeof(int), | 316 | .maxlen = sizeof(int), |
328 | .mode = 0644, | 317 | .mode = 0644, |
329 | .proc_handler = &proc_dointvec, | 318 | .proc_handler = &sched_rt_handler, |
330 | }, | 319 | }, |
331 | { | 320 | { |
332 | .ctl_name = CTL_UNNUMBERED, | 321 | .ctl_name = CTL_UNNUMBERED, |
diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c80..35d373a98782 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -379,6 +379,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | |||
379 | ts->tv_sec = sec; | 379 | ts->tv_sec = sec; |
380 | ts->tv_nsec = nsec; | 380 | ts->tv_nsec = nsec; |
381 | } | 381 | } |
382 | EXPORT_SYMBOL(set_normalized_timespec); | ||
382 | 383 | ||
383 | /** | 384 | /** |
384 | * ns_to_timespec - Convert nanoseconds to timespec | 385 | * ns_to_timespec - Convert nanoseconds to timespec |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7f60097d443a..73961f35fdc8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data) | |||
141 | } | 141 | } |
142 | 142 | ||
143 | if (!list_empty(&watchdog_list)) { | 143 | if (!list_empty(&watchdog_list)) { |
144 | __mod_timer(&watchdog_timer, | 144 | /* |
145 | watchdog_timer.expires + WATCHDOG_INTERVAL); | 145 | * Cycle through CPUs to check if the CPUs stay |
146 | * synchronized to each other. | ||
147 | */ | ||
148 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
149 | |||
150 | if (next_cpu >= NR_CPUS) | ||
151 | next_cpu = first_cpu(cpu_online_map); | ||
152 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
153 | add_timer_on(&watchdog_timer, next_cpu); | ||
146 | } | 154 | } |
147 | spin_unlock(&watchdog_lock); | 155 | spin_unlock(&watchdog_lock); |
148 | } | 156 | } |
@@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
164 | if (!started && watchdog) { | 172 | if (!started && watchdog) { |
165 | watchdog_last = watchdog->read(); | 173 | watchdog_last = watchdog->read(); |
166 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 174 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
167 | add_timer(&watchdog_timer); | 175 | add_timer_on(&watchdog_timer, |
176 | first_cpu(cpu_online_map)); | ||
168 | } | 177 | } |
169 | } else { | 178 | } else { |
170 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 179 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
@@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
185 | watchdog_last = watchdog->read(); | 194 | watchdog_last = watchdog->read(); |
186 | watchdog_timer.expires = | 195 | watchdog_timer.expires = |
187 | jiffies + WATCHDOG_INTERVAL; | 196 | jiffies + WATCHDOG_INTERVAL; |
188 | add_timer(&watchdog_timer); | 197 | add_timer_on(&watchdog_timer, |
198 | first_cpu(cpu_online_map)); | ||
189 | } | 199 | } |
190 | } | 200 | } |
191 | } | 201 | } |
@@ -222,6 +232,18 @@ void clocksource_resume(void) | |||
222 | } | 232 | } |
223 | 233 | ||
224 | /** | 234 | /** |
235 | * clocksource_touch_watchdog - Update watchdog | ||
236 | * | ||
237 | * Update the watchdog after exception contexts such as kgdb so as not | ||
238 | * to incorrectly trip the watchdog. | ||
239 | * | ||
240 | */ | ||
241 | void clocksource_touch_watchdog(void) | ||
242 | { | ||
243 | clocksource_resume_watchdog(); | ||
244 | } | ||
245 | |||
246 | /** | ||
225 | * clocksource_get_next - Returns the selected clocksource | 247 | * clocksource_get_next - Returns the selected clocksource |
226 | * | 248 | * |
227 | */ | 249 | */ |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5d..57a1f02e5ec0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
@@ -262,7 +262,7 @@ out: | |||
262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | 262 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) |
263 | { | 263 | { |
264 | if (!cpu_isset(*oncpu, cpu_online_map)) | 264 | if (!cpu_isset(*oncpu, cpu_online_map)) |
265 | printk(KERN_ERR "tick-braodcast: ignoring broadcast for " | 265 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " |
266 | "offline CPU #%d\n", *oncpu); | 266 | "offline CPU #%d\n", *oncpu); |
267 | else | 267 | else |
268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | 268 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef0..4f3886562b8c 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -14,12 +14,14 @@ | |||
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | 21 | #include <linux/tick.h> |
22 | 22 | ||
23 | #include <asm/irq_regs.h> | ||
24 | |||
23 | #include "tick-internal.h" | 25 | #include "tick-internal.h" |
24 | 26 | ||
25 | /* | 27 | /* |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d54..450c04935b66 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
17 | #include <linux/irq.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 686da821d376..b854a895591e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu) | |||
158 | } | 158 | } |
159 | } | 159 | } |
160 | 160 | ||
161 | static ktime_t tick_nohz_start_idle(int cpu) | 161 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) |
162 | { | 162 | { |
163 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
164 | ktime_t now, delta; | 163 | ktime_t now, delta; |
165 | 164 | ||
166 | now = ktime_get(); | 165 | now = ktime_get(); |
@@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
192 | void tick_nohz_stop_sched_tick(void) | 191 | void tick_nohz_stop_sched_tick(void) |
193 | { | 192 | { |
194 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 193 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
195 | unsigned long rt_jiffies; | ||
196 | struct tick_sched *ts; | 194 | struct tick_sched *ts; |
197 | ktime_t last_update, expires, now; | 195 | ktime_t last_update, expires, now; |
198 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 196 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
@@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void) | |||
201 | local_irq_save(flags); | 199 | local_irq_save(flags); |
202 | 200 | ||
203 | cpu = smp_processor_id(); | 201 | cpu = smp_processor_id(); |
204 | now = tick_nohz_start_idle(cpu); | ||
205 | ts = &per_cpu(tick_cpu_sched, cpu); | 202 | ts = &per_cpu(tick_cpu_sched, cpu); |
203 | now = tick_nohz_start_idle(ts); | ||
206 | 204 | ||
207 | /* | 205 | /* |
208 | * If this cpu is offline and it is the one which updates | 206 | * If this cpu is offline and it is the one which updates |
@@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void) | |||
222 | if (need_resched()) | 220 | if (need_resched()) |
223 | goto end; | 221 | goto end; |
224 | 222 | ||
225 | cpu = smp_processor_id(); | ||
226 | if (unlikely(local_softirq_pending())) { | 223 | if (unlikely(local_softirq_pending())) { |
227 | static int ratelimit; | 224 | static int ratelimit; |
228 | 225 | ||
@@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) | |||
245 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 242 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
246 | delta_jiffies = next_jiffies - last_jiffies; | 243 | delta_jiffies = next_jiffies - last_jiffies; |
247 | 244 | ||
248 | rt_jiffies = rt_needs_cpu(cpu); | ||
249 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
250 | delta_jiffies = rt_jiffies; | ||
251 | |||
252 | if (rcu_needs_cpu(cpu)) | 245 | if (rcu_needs_cpu(cpu)) |
253 | delta_jiffies = 1; | 246 | delta_jiffies = 1; |
254 | /* | 247 | /* |
@@ -400,6 +393,7 @@ void tick_nohz_restart_sched_tick(void) | |||
400 | sub_preempt_count(HARDIRQ_OFFSET); | 393 | sub_preempt_count(HARDIRQ_OFFSET); |
401 | } | 394 | } |
402 | 395 | ||
396 | touch_softlockup_watchdog(); | ||
403 | /* | 397 | /* |
404 | * Cancel the scheduled timer and restore the tick | 398 | * Cancel the scheduled timer and restore the tick |
405 | */ | 399 | */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a3fa587c350c..2d6087c7cf98 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -178,6 +178,7 @@ static void change_clocksource(void) | |||
178 | if (clock == new) | 178 | if (clock == new) |
179 | return; | 179 | return; |
180 | 180 | ||
181 | new->cycle_last = 0; | ||
181 | now = clocksource_read(new); | 182 | now = clocksource_read(new); |
182 | nsec = __get_nsec_offset(); | 183 | nsec = __get_nsec_offset(); |
183 | timespec_add_ns(&xtime, nsec); | 184 | timespec_add_ns(&xtime, nsec); |
@@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
295 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | 296 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); |
296 | update_xtime_cache(0); | 297 | update_xtime_cache(0); |
297 | /* re-base the last cycle value */ | 298 | /* re-base the last cycle value */ |
299 | clock->cycle_last = 0; | ||
298 | clock->cycle_last = clocksource_read(clock); | 300 | clock->cycle_last = clocksource_read(clock); |
299 | clock->error = 0; | 301 | clock->error = 0; |
300 | timekeeping_suspended = 0; | 302 | timekeeping_suspended = 0; |
diff --git a/kernel/timer.c b/kernel/timer.c index b024106daa70..f3d35d4ea42e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1228,13 +1228,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1228 | return 0; | 1228 | return 0; |
1229 | } | 1229 | } |
1230 | 1230 | ||
1231 | /* | ||
1232 | * lockdep: we want to track each per-CPU base as a separate lock-class, | ||
1233 | * but timer-bases are kmalloc()-ed, so we need to attach separate | ||
1234 | * keys to them: | ||
1235 | */ | ||
1236 | static struct lock_class_key base_lock_keys[NR_CPUS]; | ||
1237 | |||
1238 | static int __cpuinit init_timers_cpu(int cpu) | 1231 | static int __cpuinit init_timers_cpu(int cpu) |
1239 | { | 1232 | { |
1240 | int j; | 1233 | int j; |
@@ -1277,7 +1270,6 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
1277 | } | 1270 | } |
1278 | 1271 | ||
1279 | spin_lock_init(&base->lock); | 1272 | spin_lock_init(&base->lock); |
1280 | lockdep_set_class(&base->lock, base_lock_keys + cpu); | ||
1281 | 1273 | ||
1282 | for (j = 0; j < TVN_SIZE; j++) { | 1274 | for (j = 0; j < TVN_SIZE; j++) { |
1283 | INIT_LIST_HEAD(base->tv5.vec + j); | 1275 | INIT_LIST_HEAD(base->tv5.vec + j); |
@@ -1316,8 +1308,8 @@ static void __cpuinit migrate_timers(int cpu) | |||
1316 | new_base = get_cpu_var(tvec_bases); | 1308 | new_base = get_cpu_var(tvec_bases); |
1317 | 1309 | ||
1318 | local_irq_disable(); | 1310 | local_irq_disable(); |
1319 | double_spin_lock(&new_base->lock, &old_base->lock, | 1311 | spin_lock(&new_base->lock); |
1320 | smp_processor_id() < cpu); | 1312 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
1321 | 1313 | ||
1322 | BUG_ON(old_base->running_timer); | 1314 | BUG_ON(old_base->running_timer); |
1323 | 1315 | ||
@@ -1330,8 +1322,8 @@ static void __cpuinit migrate_timers(int cpu) | |||
1330 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1322 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1331 | } | 1323 | } |
1332 | 1324 | ||
1333 | double_spin_unlock(&new_base->lock, &old_base->lock, | 1325 | spin_unlock(&old_base->lock); |
1334 | smp_processor_id() < cpu); | 1326 | spin_unlock(&new_base->lock); |
1335 | local_irq_enable(); | 1327 | local_irq_enable(); |
1336 | put_cpu_var(tvec_bases); | 1328 | put_cpu_var(tvec_bases); |
1337 | } | 1329 | } |
diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03b..3e41c1673e2f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi | |||
21 | { | 21 | { |
22 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); | 22 | long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); |
23 | /* avoid REGPARM breakage on x86: */ | 23 | /* avoid REGPARM breakage on x86: */ |
24 | prevent_tail_call(ret); | 24 | asmlinkage_protect(3, ret, filename, user, group); |
25 | return ret; | 25 | return ret; |
26 | } | 26 | } |
27 | 27 | ||
@@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g | |||
29 | { | 29 | { |
30 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); | 30 | long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); |
31 | /* avoid REGPARM breakage on x86: */ | 31 | /* avoid REGPARM breakage on x86: */ |
32 | prevent_tail_call(ret); | 32 | asmlinkage_protect(3, ret, filename, user, group); |
33 | return ret; | 33 | return ret; |
34 | } | 34 | } |
35 | 35 | ||
@@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) | |||
37 | { | 37 | { |
38 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); | 38 | long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); |
39 | /* avoid REGPARM breakage on x86: */ | 39 | /* avoid REGPARM breakage on x86: */ |
40 | prevent_tail_call(ret); | 40 | asmlinkage_protect(3, ret, fd, user, group); |
41 | return ret; | 41 | return ret; |
42 | } | 42 | } |
43 | 43 | ||
@@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) | |||
45 | { | 45 | { |
46 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); | 46 | long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); |
47 | /* avoid REGPARM breakage on x86: */ | 47 | /* avoid REGPARM breakage on x86: */ |
48 | prevent_tail_call(ret); | 48 | asmlinkage_protect(2, ret, rgid, egid); |
49 | return ret; | 49 | return ret; |
50 | } | 50 | } |
51 | 51 | ||
@@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) | |||
53 | { | 53 | { |
54 | long ret = sys_setgid(low2highgid(gid)); | 54 | long ret = sys_setgid(low2highgid(gid)); |
55 | /* avoid REGPARM breakage on x86: */ | 55 | /* avoid REGPARM breakage on x86: */ |
56 | prevent_tail_call(ret); | 56 | asmlinkage_protect(1, ret, gid); |
57 | return ret; | 57 | return ret; |
58 | } | 58 | } |
59 | 59 | ||
@@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) | |||
61 | { | 61 | { |
62 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 62 | long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
63 | /* avoid REGPARM breakage on x86: */ | 63 | /* avoid REGPARM breakage on x86: */ |
64 | prevent_tail_call(ret); | 64 | asmlinkage_protect(2, ret, ruid, euid); |
65 | return ret; | 65 | return ret; |
66 | } | 66 | } |
67 | 67 | ||
@@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) | |||
69 | { | 69 | { |
70 | long ret = sys_setuid(low2highuid(uid)); | 70 | long ret = sys_setuid(low2highuid(uid)); |
71 | /* avoid REGPARM breakage on x86: */ | 71 | /* avoid REGPARM breakage on x86: */ |
72 | prevent_tail_call(ret); | 72 | asmlinkage_protect(1, ret, uid); |
73 | return ret; | 73 | return ret; |
74 | } | 74 | } |
75 | 75 | ||
@@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) | |||
78 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), | 78 | long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), |
79 | low2highuid(suid)); | 79 | low2highuid(suid)); |
80 | /* avoid REGPARM breakage on x86: */ | 80 | /* avoid REGPARM breakage on x86: */ |
81 | prevent_tail_call(ret); | 81 | asmlinkage_protect(3, ret, ruid, euid, suid); |
82 | return ret; | 82 | return ret; |
83 | } | 83 | } |
84 | 84 | ||
@@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) | |||
98 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), | 98 | long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), |
99 | low2highgid(sgid)); | 99 | low2highgid(sgid)); |
100 | /* avoid REGPARM breakage on x86: */ | 100 | /* avoid REGPARM breakage on x86: */ |
101 | prevent_tail_call(ret); | 101 | asmlinkage_protect(3, ret, rgid, egid, sgid); |
102 | return ret; | 102 | return ret; |
103 | } | 103 | } |
104 | 104 | ||
@@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) | |||
117 | { | 117 | { |
118 | long ret = sys_setfsuid(low2highuid(uid)); | 118 | long ret = sys_setfsuid(low2highuid(uid)); |
119 | /* avoid REGPARM breakage on x86: */ | 119 | /* avoid REGPARM breakage on x86: */ |
120 | prevent_tail_call(ret); | 120 | asmlinkage_protect(1, ret, uid); |
121 | return ret; | 121 | return ret; |
122 | } | 122 | } |
123 | 123 | ||
@@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) | |||
125 | { | 125 | { |
126 | long ret = sys_setfsgid(low2highgid(gid)); | 126 | long ret = sys_setfsgid(low2highgid(gid)); |
127 | /* avoid REGPARM breakage on x86: */ | 127 | /* avoid REGPARM breakage on x86: */ |
128 | prevent_tail_call(ret); | 128 | asmlinkage_protect(1, ret, gid); |
129 | return ret; | 129 | return ret; |
130 | } | 130 | } |
131 | 131 | ||
diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..debce602bfdd 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) | |||
101 | { | 101 | { |
102 | int rc = 0; | 102 | int rc = 0; |
103 | 103 | ||
104 | up->tg = sched_create_group(); | 104 | up->tg = sched_create_group(&root_task_group); |
105 | if (IS_ERR(up->tg)) | 105 | if (IS_ERR(up->tg)) |
106 | rc = -ENOMEM; | 106 | rc = -ENOMEM; |
107 | 107 | ||
@@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, | |||
193 | 193 | ||
194 | static struct kobj_attribute cpu_rt_runtime_attr = | 194 | static struct kobj_attribute cpu_rt_runtime_attr = |
195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); | 195 | __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); |
196 | |||
197 | static ssize_t cpu_rt_period_show(struct kobject *kobj, | ||
198 | struct kobj_attribute *attr, | ||
199 | char *buf) | ||
200 | { | ||
201 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
202 | |||
203 | return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); | ||
204 | } | ||
205 | |||
206 | static ssize_t cpu_rt_period_store(struct kobject *kobj, | ||
207 | struct kobj_attribute *attr, | ||
208 | const char *buf, size_t size) | ||
209 | { | ||
210 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); | ||
211 | unsigned long rt_period; | ||
212 | int rc; | ||
213 | |||
214 | sscanf(buf, "%lu", &rt_period); | ||
215 | |||
216 | rc = sched_group_set_rt_period(up->tg, rt_period); | ||
217 | |||
218 | return (rc ? rc : size); | ||
219 | } | ||
220 | |||
221 | static struct kobj_attribute cpu_rt_period_attr = | ||
222 | __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); | ||
196 | #endif | 223 | #endif |
197 | 224 | ||
198 | /* default attributes per uid directory */ | 225 | /* default attributes per uid directory */ |
@@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { | |||
202 | #endif | 229 | #endif |
203 | #ifdef CONFIG_RT_GROUP_SCHED | 230 | #ifdef CONFIG_RT_GROUP_SCHED |
204 | &cpu_rt_runtime_attr.attr, | 231 | &cpu_rt_runtime_attr.attr, |
232 | &cpu_rt_period_attr.attr, | ||
205 | #endif | 233 | #endif |
206 | NULL | 234 | NULL |
207 | }; | 235 | }; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff06611655af..00ff4d08e370 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -219,6 +219,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
219 | struct timer_list *timer = &dwork->timer; | 219 | struct timer_list *timer = &dwork->timer; |
220 | struct work_struct *work = &dwork->work; | 220 | struct work_struct *work = &dwork->work; |
221 | 221 | ||
222 | timer_stats_timer_set_start_info(&dwork->timer); | ||
222 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 223 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { |
223 | BUG_ON(timer_pending(timer)); | 224 | BUG_ON(timer_pending(timer)); |
224 | BUG_ON(!list_empty(&work->entry)); | 225 | BUG_ON(!list_empty(&work->entry)); |
@@ -580,6 +581,7 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
580 | int schedule_delayed_work_on(int cpu, | 581 | int schedule_delayed_work_on(int cpu, |
581 | struct delayed_work *dwork, unsigned long delay) | 582 | struct delayed_work *dwork, unsigned long delay) |
582 | { | 583 | { |
584 | timer_stats_timer_set_start_info(&dwork->timer); | ||
583 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); | 585 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); |
584 | } | 586 | } |
585 | EXPORT_SYMBOL(schedule_delayed_work_on); | 587 | EXPORT_SYMBOL(schedule_delayed_work_on); |