Diffstat (limited to 'kernel')
78 files changed, 6467 insertions, 3656 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 642d4277c2..2a999836ca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,11 +4,12 @@ | |||
4 | 4 | ||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | 5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ |
6 | exit.o itimer.o time.o softirq.o resource.o \ | 6 | exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o | 11 | hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ |
12 | utsname.o | ||
12 | 13 | ||
13 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 14 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
14 | obj-y += time/ | 15 | obj-y += time/ |
@@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o | |||
48 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 49 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
49 | obj-$(CONFIG_RELAY) += relay.o | 50 | obj-$(CONFIG_RELAY) += relay.o |
50 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 51 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
51 | obj-$(CONFIG_UTS_NS) += utsname.o | ||
52 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 52 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
53 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 53 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
54 | 54 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index d13276d414..eb0f9165b4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,7 @@ | |||
58 | #include <linux/selinux.h> | 58 | #include <linux/selinux.h> |
59 | #include <linux/inotify.h> | 59 | #include <linux/inotify.h> |
60 | #include <linux/freezer.h> | 60 | #include <linux/freezer.h> |
61 | #include <linux/tty.h> | ||
61 | 62 | ||
62 | #include "audit.h" | 63 | #include "audit.h" |
63 | 64 | ||
@@ -391,6 +392,7 @@ static int kauditd_thread(void *dummy) | |||
391 | { | 392 | { |
392 | struct sk_buff *skb; | 393 | struct sk_buff *skb; |
393 | 394 | ||
395 | set_freezable(); | ||
394 | while (!kthread_should_stop()) { | 396 | while (!kthread_should_stop()) { |
395 | skb = skb_dequeue(&audit_skb_queue); | 397 | skb = skb_dequeue(&audit_skb_queue); |
396 | wake_up(&audit_backlog_wait); | 398 | wake_up(&audit_backlog_wait); |
@@ -423,6 +425,31 @@ static int kauditd_thread(void *dummy) | |||
423 | return 0; | 425 | return 0; |
424 | } | 426 | } |
425 | 427 | ||
428 | static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) | ||
429 | { | ||
430 | struct task_struct *tsk; | ||
431 | int err; | ||
432 | |||
433 | read_lock(&tasklist_lock); | ||
434 | tsk = find_task_by_pid(pid); | ||
435 | err = -ESRCH; | ||
436 | if (!tsk) | ||
437 | goto out; | ||
438 | err = 0; | ||
439 | |||
440 | spin_lock_irq(&tsk->sighand->siglock); | ||
441 | if (!tsk->signal->audit_tty) | ||
442 | err = -EPERM; | ||
443 | spin_unlock_irq(&tsk->sighand->siglock); | ||
444 | if (err) | ||
445 | goto out; | ||
446 | |||
447 | tty_audit_push_task(tsk, loginuid); | ||
448 | out: | ||
449 | read_unlock(&tasklist_lock); | ||
450 | return err; | ||
451 | } | ||
452 | |||
426 | int audit_send_list(void *_dest) | 453 | int audit_send_list(void *_dest) |
427 | { | 454 | { |
428 | struct audit_netlink_list *dest = _dest; | 455 | struct audit_netlink_list *dest = _dest; |
@@ -511,6 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
511 | case AUDIT_DEL: | 538 | case AUDIT_DEL: |
512 | case AUDIT_DEL_RULE: | 539 | case AUDIT_DEL_RULE: |
513 | case AUDIT_SIGNAL_INFO: | 540 | case AUDIT_SIGNAL_INFO: |
541 | case AUDIT_TTY_GET: | ||
542 | case AUDIT_TTY_SET: | ||
514 | if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) | 543 | if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) |
515 | err = -EPERM; | 544 | err = -EPERM; |
516 | break; | 545 | break; |
@@ -622,6 +651,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
622 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); | 651 | err = audit_filter_user(&NETLINK_CB(skb), msg_type); |
623 | if (err == 1) { | 652 | if (err == 1) { |
624 | err = 0; | 653 | err = 0; |
654 | if (msg_type == AUDIT_USER_TTY) { | ||
655 | err = audit_prepare_user_tty(pid, loginuid); | ||
656 | if (err) | ||
657 | break; | ||
658 | } | ||
625 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 659 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
626 | if (ab) { | 660 | if (ab) { |
627 | audit_log_format(ab, | 661 | audit_log_format(ab, |
@@ -638,8 +672,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
638 | " subj=%s", ctx); | 672 | " subj=%s", ctx); |
639 | kfree(ctx); | 673 | kfree(ctx); |
640 | } | 674 | } |
641 | audit_log_format(ab, " msg='%.1024s'", | 675 | if (msg_type != AUDIT_USER_TTY) |
642 | (char *)data); | 676 | audit_log_format(ab, " msg='%.1024s'", |
677 | (char *)data); | ||
678 | else { | ||
679 | int size; | ||
680 | |||
681 | audit_log_format(ab, " msg="); | ||
682 | size = nlmsg_len(nlh); | ||
683 | audit_log_n_untrustedstring(ab, size, | ||
684 | data); | ||
685 | } | ||
643 | audit_set_pid(ab, pid); | 686 | audit_set_pid(ab, pid); |
644 | audit_log_end(ab); | 687 | audit_log_end(ab); |
645 | } | 688 | } |
@@ -730,6 +773,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
730 | 0, 0, sig_data, sizeof(*sig_data) + len); | 773 | 0, 0, sig_data, sizeof(*sig_data) + len); |
731 | kfree(sig_data); | 774 | kfree(sig_data); |
732 | break; | 775 | break; |
776 | case AUDIT_TTY_GET: { | ||
777 | struct audit_tty_status s; | ||
778 | struct task_struct *tsk; | ||
779 | |||
780 | read_lock(&tasklist_lock); | ||
781 | tsk = find_task_by_pid(pid); | ||
782 | if (!tsk) | ||
783 | err = -ESRCH; | ||
784 | else { | ||
785 | spin_lock_irq(&tsk->sighand->siglock); | ||
786 | s.enabled = tsk->signal->audit_tty != 0; | ||
787 | spin_unlock_irq(&tsk->sighand->siglock); | ||
788 | } | ||
789 | read_unlock(&tasklist_lock); | ||
790 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, | ||
791 | &s, sizeof(s)); | ||
792 | break; | ||
793 | } | ||
794 | case AUDIT_TTY_SET: { | ||
795 | struct audit_tty_status *s; | ||
796 | struct task_struct *tsk; | ||
797 | |||
798 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) | ||
799 | return -EINVAL; | ||
800 | s = data; | ||
801 | if (s->enabled != 0 && s->enabled != 1) | ||
802 | return -EINVAL; | ||
803 | read_lock(&tasklist_lock); | ||
804 | tsk = find_task_by_pid(pid); | ||
805 | if (!tsk) | ||
806 | err = -ESRCH; | ||
807 | else { | ||
808 | spin_lock_irq(&tsk->sighand->siglock); | ||
809 | tsk->signal->audit_tty = s->enabled != 0; | ||
810 | spin_unlock_irq(&tsk->sighand->siglock); | ||
811 | } | ||
812 | read_unlock(&tasklist_lock); | ||
813 | break; | ||
814 | } | ||
733 | default: | 815 | default: |
734 | err = -EINVAL; | 816 | err = -EINVAL; |
735 | break; | 817 | break; |
@@ -1185,7 +1267,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | |||
1185 | } | 1267 | } |
1186 | 1268 | ||
1187 | /** | 1269 | /** |
1188 | * audit_log_n_unstrustedstring - log a string that may contain random characters | 1270 | * audit_log_n_untrustedstring - log a string that may contain random characters |
1189 | * @ab: audit_buffer | 1271 | * @ab: audit_buffer |
1190 | * @len: lenth of string (not including trailing null) | 1272 | * @len: lenth of string (not including trailing null) |
1191 | * @string: string to be logged | 1273 | * @string: string to be logged |
@@ -1201,25 +1283,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | |||
1201 | const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, | 1283 | const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, |
1202 | const char *string) | 1284 | const char *string) |
1203 | { | 1285 | { |
1204 | const unsigned char *p = string; | 1286 | const unsigned char *p; |
1205 | 1287 | ||
1206 | while (*p) { | 1288 | for (p = string; p < (const unsigned char *)string + len && *p; p++) { |
1207 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { | 1289 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { |
1208 | audit_log_hex(ab, string, len); | 1290 | audit_log_hex(ab, string, len); |
1209 | return string + len + 1; | 1291 | return string + len + 1; |
1210 | } | 1292 | } |
1211 | p++; | ||
1212 | } | 1293 | } |
1213 | audit_log_n_string(ab, len, string); | 1294 | audit_log_n_string(ab, len, string); |
1214 | return p + 1; | 1295 | return p + 1; |
1215 | } | 1296 | } |
1216 | 1297 | ||
1217 | /** | 1298 | /** |
1218 | * audit_log_unstrustedstring - log a string that may contain random characters | 1299 | * audit_log_untrustedstring - log a string that may contain random characters |
1219 | * @ab: audit_buffer | 1300 | * @ab: audit_buffer |
1220 | * @string: string to be logged | 1301 | * @string: string to be logged |
1221 | * | 1302 | * |
1222 | * Same as audit_log_n_unstrustedstring(), except that strlen is used to | 1303 | * Same as audit_log_n_untrustedstring(), except that strlen is used to |
1223 | * determine string length. | 1304 | * determine string length. |
1224 | */ | 1305 | */ |
1225 | const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 1306 | const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
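Note (illustration only, not part of the patch): the audit.c hunks above add two new audit netlink commands, AUDIT_TTY_GET and AUDIT_TTY_SET, which read and set the per-process signal->audit_tty flag and are gated on CAP_AUDIT_CONTROL. A hedged sketch of how a userspace caller might enable TTY auditing for itself follows; it assumes the AUDIT_TTY_SET constant and struct audit_tty_status from the matching linux/audit.h update, and trims error handling:

    #include <linux/audit.h>
    #include <linux/netlink.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Sketch: ask the kernel to enable TTY auditing for this process. */
    static int tty_audit_enable(void)
    {
            struct {
                    struct nlmsghdr nlh;
                    struct audit_tty_status s;
            } req;
            struct sockaddr_nl kern_addr = { .nl_family = AF_NETLINK };
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

            if (fd < 0)
                    return -1;
            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.s));
            req.nlh.nlmsg_type = AUDIT_TTY_SET;
            req.nlh.nlmsg_flags = NLM_F_REQUEST;
            req.s.enabled = 1;      /* the kernel accepts only 0 or 1 */
            if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                       (struct sockaddr *)&kern_addr, sizeof(kern_addr)) < 0) {
                    close(fd);
                    return -1;
            }
            close(fd);
            return 0;
    }

Per the audit_receive_msg() change above, the kernel applies the setting to the sending process, so no target pid is carried in the payload.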
diff --git a/kernel/audit.h b/kernel/audit.h
index 815d6f5c04..95877435c3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | |||
115 | extern void audit_send_reply(int pid, int seq, int type, | 115 | extern void audit_send_reply(int pid, int seq, int type, |
116 | int done, int multi, | 116 | int done, int multi, |
117 | void *payload, int size); | 117 | void *payload, int size); |
118 | extern void audit_log_lost(const char *message); | ||
119 | extern void audit_panic(const char *message); | 118 | extern void audit_panic(const char *message); |
120 | 119 | ||
121 | struct audit_netlink_list { | 120 | struct audit_netlink_list { |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce61f42354..1bf093dcff 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1210,8 +1210,8 @@ static inline int audit_add_rule(struct audit_entry *entry, | |||
1210 | struct audit_entry *e; | 1210 | struct audit_entry *e; |
1211 | struct audit_field *inode_f = entry->rule.inode_f; | 1211 | struct audit_field *inode_f = entry->rule.inode_f; |
1212 | struct audit_watch *watch = entry->rule.watch; | 1212 | struct audit_watch *watch = entry->rule.watch; |
1213 | struct nameidata *ndp, *ndw; | 1213 | struct nameidata *ndp = NULL, *ndw = NULL; |
1214 | int h, err, putnd_needed = 0; | 1214 | int h, err; |
1215 | #ifdef CONFIG_AUDITSYSCALL | 1215 | #ifdef CONFIG_AUDITSYSCALL |
1216 | int dont_count = 0; | 1216 | int dont_count = 0; |
1217 | 1217 | ||
@@ -1239,7 +1239,6 @@ static inline int audit_add_rule(struct audit_entry *entry, | |||
1239 | err = audit_get_nd(watch->path, &ndp, &ndw); | 1239 | err = audit_get_nd(watch->path, &ndp, &ndw); |
1240 | if (err) | 1240 | if (err) |
1241 | goto error; | 1241 | goto error; |
1242 | putnd_needed = 1; | ||
1243 | } | 1242 | } |
1244 | 1243 | ||
1245 | mutex_lock(&audit_filter_mutex); | 1244 | mutex_lock(&audit_filter_mutex); |
@@ -1269,14 +1268,11 @@ static inline int audit_add_rule(struct audit_entry *entry, | |||
1269 | #endif | 1268 | #endif |
1270 | mutex_unlock(&audit_filter_mutex); | 1269 | mutex_unlock(&audit_filter_mutex); |
1271 | 1270 | ||
1272 | if (putnd_needed) | 1271 | audit_put_nd(ndp, ndw); /* NULL args OK */ |
1273 | audit_put_nd(ndp, ndw); | ||
1274 | |||
1275 | return 0; | 1272 | return 0; |
1276 | 1273 | ||
1277 | error: | 1274 | error: |
1278 | if (putnd_needed) | 1275 | audit_put_nd(ndp, ndw); /* NULL args OK */ |
1279 | audit_put_nd(ndp, ndw); | ||
1280 | if (watch) | 1276 | if (watch) |
1281 | audit_put_watch(watch); /* tmp watch, matches initial get */ | 1277 | audit_put_watch(watch); /* tmp watch, matches initial get */ |
1282 | return err; | 1278 | return err; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e36481ed61..145cbb79c4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,9 +71,6 @@ | |||
71 | 71 | ||
72 | extern struct list_head audit_filter_list[]; | 72 | extern struct list_head audit_filter_list[]; |
73 | 73 | ||
74 | /* No syscall auditing will take place unless audit_enabled != 0. */ | ||
75 | extern int audit_enabled; | ||
76 | |||
77 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 74 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
78 | * for saving names from getname(). */ | 75 | * for saving names from getname(). */ |
79 | #define AUDIT_NAMES 20 | 76 | #define AUDIT_NAMES 20 |
@@ -156,7 +153,7 @@ struct audit_aux_data_execve { | |||
156 | struct audit_aux_data d; | 153 | struct audit_aux_data d; |
157 | int argc; | 154 | int argc; |
158 | int envc; | 155 | int envc; |
159 | char mem[0]; | 156 | struct mm_struct *mm; |
160 | }; | 157 | }; |
161 | 158 | ||
162 | struct audit_aux_data_socketcall { | 159 | struct audit_aux_data_socketcall { |
@@ -834,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
834 | return rc; | 831 | return rc; |
835 | } | 832 | } |
836 | 833 | ||
834 | static void audit_log_execve_info(struct audit_buffer *ab, | ||
835 | struct audit_aux_data_execve *axi) | ||
836 | { | ||
837 | int i; | ||
838 | long len, ret; | ||
839 | const char __user *p = (const char __user *)axi->mm->arg_start; | ||
840 | char *buf; | ||
841 | |||
842 | if (axi->mm != current->mm) | ||
843 | return; /* execve failed, no additional info */ | ||
844 | |||
845 | for (i = 0; i < axi->argc; i++, p += len) { | ||
846 | len = strnlen_user(p, MAX_ARG_STRLEN); | ||
847 | /* | ||
848 | * We just created this mm, if we can't find the strings | ||
849 | * we just copied into it something is _very_ wrong. Similar | ||
850 | * for strings that are too long, we should not have created | ||
851 | * any. | ||
852 | */ | ||
853 | if (!len || len > MAX_ARG_STRLEN) { | ||
854 | WARN_ON(1); | ||
855 | send_sig(SIGKILL, current, 0); | ||
856 | } | ||
857 | |||
858 | buf = kmalloc(len, GFP_KERNEL); | ||
859 | if (!buf) { | ||
860 | audit_panic("out of memory for argv string\n"); | ||
861 | break; | ||
862 | } | ||
863 | |||
864 | ret = copy_from_user(buf, p, len); | ||
865 | /* | ||
866 | * There is no reason for this copy to be short. We just | ||
867 | * copied them here, and the mm hasn't been exposed to user- | ||
868 | * space yet. | ||
869 | */ | ||
870 | if (!ret) { | ||
871 | WARN_ON(1); | ||
872 | send_sig(SIGKILL, current, 0); | ||
873 | } | ||
874 | |||
875 | audit_log_format(ab, "a%d=", i); | ||
876 | audit_log_untrustedstring(ab, buf); | ||
877 | audit_log_format(ab, "\n"); | ||
878 | |||
879 | kfree(buf); | ||
880 | } | ||
881 | } | ||
882 | |||
837 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 883 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
838 | { | 884 | { |
839 | int i, call_panic = 0; | 885 | int i, call_panic = 0; |
@@ -974,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
974 | 1020 | ||
975 | case AUDIT_EXECVE: { | 1021 | case AUDIT_EXECVE: { |
976 | struct audit_aux_data_execve *axi = (void *)aux; | 1022 | struct audit_aux_data_execve *axi = (void *)aux; |
977 | int i; | 1023 | audit_log_execve_info(ab, axi); |
978 | const char *p; | ||
979 | for (i = 0, p = axi->mem; i < axi->argc; i++) { | ||
980 | audit_log_format(ab, "a%d=", i); | ||
981 | p = audit_log_untrustedstring(ab, p); | ||
982 | audit_log_format(ab, "\n"); | ||
983 | } | ||
984 | break; } | 1024 | break; } |
985 | 1025 | ||
986 | case AUDIT_SOCKETCALL: { | 1026 | case AUDIT_SOCKETCALL: { |
@@ -1824,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode | |||
1824 | return 0; | 1864 | return 0; |
1825 | } | 1865 | } |
1826 | 1866 | ||
1867 | int audit_argv_kb = 32; | ||
1868 | |||
1827 | int audit_bprm(struct linux_binprm *bprm) | 1869 | int audit_bprm(struct linux_binprm *bprm) |
1828 | { | 1870 | { |
1829 | struct audit_aux_data_execve *ax; | 1871 | struct audit_aux_data_execve *ax; |
1830 | struct audit_context *context = current->audit_context; | 1872 | struct audit_context *context = current->audit_context; |
1831 | unsigned long p, next; | ||
1832 | void *to; | ||
1833 | 1873 | ||
1834 | if (likely(!audit_enabled || !context || context->dummy)) | 1874 | if (likely(!audit_enabled || !context || context->dummy)) |
1835 | return 0; | 1875 | return 0; |
1836 | 1876 | ||
1837 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, | 1877 | /* |
1838 | GFP_KERNEL); | 1878 | * Even though the stack code doesn't limit the arg+env size any more, |
1879 | * the audit code requires that _all_ arguments be logged in a single | ||
1880 | * netlink skb. Hence cap it :-( | ||
1881 | */ | ||
1882 | if (bprm->argv_len > (audit_argv_kb << 10)) | ||
1883 | return -E2BIG; | ||
1884 | |||
1885 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | ||
1839 | if (!ax) | 1886 | if (!ax) |
1840 | return -ENOMEM; | 1887 | return -ENOMEM; |
1841 | 1888 | ||
1842 | ax->argc = bprm->argc; | 1889 | ax->argc = bprm->argc; |
1843 | ax->envc = bprm->envc; | 1890 | ax->envc = bprm->envc; |
1844 | for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { | 1891 | ax->mm = bprm->mm; |
1845 | struct page *page = bprm->page[p / PAGE_SIZE]; | ||
1846 | void *kaddr = kmap(page); | ||
1847 | next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); | ||
1848 | memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); | ||
1849 | to += next - p; | ||
1850 | kunmap(page); | ||
1851 | } | ||
1852 | |||
1853 | ax->d.type = AUDIT_EXECVE; | 1892 | ax->d.type = AUDIT_EXECVE; |
1854 | ax->d.next = context->aux; | 1893 | ax->d.next = context->aux; |
1855 | context->aux = (void *)ax; | 1894 | context->aux = (void *)ax; |
@@ -2040,7 +2079,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2040 | 2079 | ||
2041 | /** | 2080 | /** |
2042 | * audit_core_dumps - record information about processes that end abnormally | 2081 | * audit_core_dumps - record information about processes that end abnormally |
2043 | * @sig: signal value | 2082 | * @signr: signal value |
2044 | * | 2083 | * |
2045 | * If a process ends with a core dump, something fishy is going on and we | 2084 | * If a process ends with a core dump, something fishy is going on and we |
2046 | * should record the event for investigation. | 2085 | * should record the event for investigation. |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 208cf3497c..181ae70860 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu) | |||
103 | write_unlock_irq(&tasklist_lock); | 103 | write_unlock_irq(&tasklist_lock); |
104 | } | 104 | } |
105 | 105 | ||
106 | struct take_cpu_down_param { | ||
107 | unsigned long mod; | ||
108 | void *hcpu; | ||
109 | }; | ||
110 | |||
106 | /* Take this CPU down. */ | 111 | /* Take this CPU down. */ |
107 | static int take_cpu_down(void *unused) | 112 | static int take_cpu_down(void *_param) |
108 | { | 113 | { |
114 | struct take_cpu_down_param *param = _param; | ||
109 | int err; | 115 | int err; |
110 | 116 | ||
117 | raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, | ||
118 | param->hcpu); | ||
111 | /* Ensure this CPU doesn't handle any more interrupts. */ | 119 | /* Ensure this CPU doesn't handle any more interrupts. */ |
112 | err = __cpu_disable(); | 120 | err = __cpu_disable(); |
113 | if (err < 0) | 121 | if (err < 0) |
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
127 | cpumask_t old_allowed, tmp; | 135 | cpumask_t old_allowed, tmp; |
128 | void *hcpu = (void *)(long)cpu; | 136 | void *hcpu = (void *)(long)cpu; |
129 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 137 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
138 | struct take_cpu_down_param tcd_param = { | ||
139 | .mod = mod, | ||
140 | .hcpu = hcpu, | ||
141 | }; | ||
130 | 142 | ||
131 | if (num_online_cpus() == 1) | 143 | if (num_online_cpus() == 1) |
132 | return -EBUSY; | 144 | return -EBUSY; |
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
153 | set_cpus_allowed(current, tmp); | 165 | set_cpus_allowed(current, tmp); |
154 | 166 | ||
155 | mutex_lock(&cpu_bitmask_lock); | 167 | mutex_lock(&cpu_bitmask_lock); |
156 | p = __stop_machine_run(take_cpu_down, NULL, cpu); | 168 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
157 | mutex_unlock(&cpu_bitmask_lock); | 169 | mutex_unlock(&cpu_bitmask_lock); |
158 | 170 | ||
159 | if (IS_ERR(p) || cpu_online(cpu)) { | 171 | if (IS_ERR(p) || cpu_online(cpu)) { |
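Note (illustration only, not part of the patch): the cpu.c change above makes take_cpu_down() raise a CPU_DYING (or CPU_DYING_FROZEN) notification on the outgoing CPU itself, passing the frozen-tasks modifier through the new take_cpu_down_param block. As a hedged sketch of how a subsystem might consume that event, a hotplug callback could look like the following; my_percpu_drain() is a hypothetical helper, and the callback shape simply follows the raw notifier convention used elsewhere in this diff:

    #include <linux/cpu.h>
    #include <linux/notifier.h>

    extern void my_percpu_drain(unsigned int cpu);  /* hypothetical helper */

    /* Sketch: CPU_DYING runs on the dying CPU from stop_machine context,
     * so this must not sleep or take sleeping locks. */
    static int my_cpu_callback(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_DYING:
                    my_percpu_drain(cpu);
                    break;
            default:
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block my_cpu_nb = {
            .notifier_call = my_cpu_callback,
    };
    /* registered at init time, e.g. register_cpu_notifier(&my_cpu_nb) */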
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4c49188cc4..57e6448b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) | |||
516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
517 | envp[i] = NULL; | 517 | envp[i] = NULL; |
518 | 518 | ||
519 | call_usermodehelper(argv[0], argv, envp, 0); | 519 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
520 | kfree(pathbuf); | 520 | kfree(pathbuf); |
521 | } | 521 | } |
522 | 522 | ||
@@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
981 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | 981 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); |
982 | if (!mmarray) | 982 | if (!mmarray) |
983 | goto done; | 983 | goto done; |
984 | write_lock_irq(&tasklist_lock); /* block fork */ | 984 | read_lock(&tasklist_lock); /* block fork */ |
985 | if (atomic_read(&cs->count) <= ntasks) | 985 | if (atomic_read(&cs->count) <= ntasks) |
986 | break; /* got enough */ | 986 | break; /* got enough */ |
987 | write_unlock_irq(&tasklist_lock); /* try again */ | 987 | read_unlock(&tasklist_lock); /* try again */ |
988 | kfree(mmarray); | 988 | kfree(mmarray); |
989 | } | 989 | } |
990 | 990 | ||
@@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
1006 | continue; | 1006 | continue; |
1007 | mmarray[n++] = mm; | 1007 | mmarray[n++] = mm; |
1008 | } while_each_thread(g, p); | 1008 | } while_each_thread(g, p); |
1009 | write_unlock_irq(&tasklist_lock); | 1009 | read_unlock(&tasklist_lock); |
1010 | 1010 | ||
1011 | /* | 1011 | /* |
1012 | * Now that we've dropped the tasklist spinlock, we can | 1012 | * Now that we've dropped the tasklist spinlock, we can |
@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
2138 | static int cpuset_handle_cpuhp(struct notifier_block *nb, | 2138 | static int cpuset_handle_cpuhp(struct notifier_block *nb, |
2139 | unsigned long phase, void *cpu) | 2139 | unsigned long phase, void *cpu) |
2140 | { | 2140 | { |
2141 | if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) | ||
2142 | return NOTIFY_DONE; | ||
2143 | |||
2141 | common_cpu_mem_hotplug_unplug(); | 2144 | common_cpu_mem_hotplug_unplug(); |
2142 | return 0; | 2145 | return 0; |
2143 | } | 2146 | } |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992..81e6978296 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void) | |||
99 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | 99 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) |
100 | { | 100 | { |
101 | s64 tmp; | 101 | s64 tmp; |
102 | struct timespec ts; | 102 | unsigned long t1; |
103 | unsigned long t1,t2,t3; | 103 | unsigned long long t2, t3; |
104 | unsigned long flags; | 104 | unsigned long flags; |
105 | struct timespec ts; | ||
105 | 106 | ||
106 | /* Though tsk->delays accessed later, early exit avoids | 107 | /* Though tsk->delays accessed later, early exit avoids |
107 | * unnecessary returning of other data | 108 | * unnecessary returning of other data |
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
124 | 125 | ||
125 | d->cpu_count += t1; | 126 | d->cpu_count += t1; |
126 | 127 | ||
127 | jiffies_to_timespec(t2, &ts); | 128 | tmp = (s64)d->cpu_delay_total + t2; |
128 | tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); | ||
129 | d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; | 129 | d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; |
130 | 130 | ||
131 | tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; | 131 | tmp = (s64)d->cpu_run_virtual_total + t3; |
132 | d->cpu_run_virtual_total = | 132 | d->cpu_run_virtual_total = |
133 | (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; | 133 | (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; |
134 | 134 | ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19..464c2b172f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/taskstats_kern.h> | 32 | #include <linux/taskstats_kern.h> |
33 | #include <linux/delayacct.h> | 33 | #include <linux/delayacct.h> |
34 | #include <linux/freezer.h> | ||
34 | #include <linux/cpuset.h> | 35 | #include <linux/cpuset.h> |
35 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
36 | #include <linux/signal.h> | 37 | #include <linux/signal.h> |
@@ -44,6 +45,7 @@ | |||
44 | #include <linux/resource.h> | 45 | #include <linux/resource.h> |
45 | #include <linux/blkdev.h> | 46 | #include <linux/blkdev.h> |
46 | #include <linux/task_io_accounting_ops.h> | 47 | #include <linux/task_io_accounting_ops.h> |
48 | #include <linux/freezer.h> | ||
47 | 49 | ||
48 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
49 | #include <asm/unistd.h> | 51 | #include <asm/unistd.h> |
@@ -122,9 +124,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
122 | sig->maj_flt += tsk->maj_flt; | 124 | sig->maj_flt += tsk->maj_flt; |
123 | sig->nvcsw += tsk->nvcsw; | 125 | sig->nvcsw += tsk->nvcsw; |
124 | sig->nivcsw += tsk->nivcsw; | 126 | sig->nivcsw += tsk->nivcsw; |
125 | sig->sched_time += tsk->sched_time; | ||
126 | sig->inblock += task_io_get_inblock(tsk); | 127 | sig->inblock += task_io_get_inblock(tsk); |
127 | sig->oublock += task_io_get_oublock(tsk); | 128 | sig->oublock += task_io_get_oublock(tsk); |
129 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
128 | sig = NULL; /* Marker for below. */ | 130 | sig = NULL; /* Marker for below. */ |
129 | } | 131 | } |
130 | 132 | ||
@@ -182,7 +184,6 @@ repeat: | |||
182 | zap_leader = (leader->exit_signal == -1); | 184 | zap_leader = (leader->exit_signal == -1); |
183 | } | 185 | } |
184 | 186 | ||
185 | sched_exit(p); | ||
186 | write_unlock_irq(&tasklist_lock); | 187 | write_unlock_irq(&tasklist_lock); |
187 | proc_flush_task(p); | 188 | proc_flush_task(p); |
188 | release_thread(p); | 189 | release_thread(p); |
@@ -291,7 +292,7 @@ static void reparent_to_kthreadd(void) | |||
291 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 292 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
292 | current->exit_signal = SIGCHLD; | 293 | current->exit_signal = SIGCHLD; |
293 | 294 | ||
294 | if (!has_rt_policy(current) && (task_nice(current) < 0)) | 295 | if (task_nice(current) < 0) |
295 | set_user_nice(current, 0); | 296 | set_user_nice(current, 0); |
296 | /* cpus_allowed? */ | 297 | /* cpus_allowed? */ |
297 | /* rt_priority? */ | 298 | /* rt_priority? */ |
@@ -388,6 +389,11 @@ void daemonize(const char *name, ...) | |||
388 | * they would be locked into memory. | 389 | * they would be locked into memory. |
389 | */ | 390 | */ |
390 | exit_mm(current); | 391 | exit_mm(current); |
392 | /* | ||
393 | * We don't want to have TIF_FREEZE set if the system-wide hibernation | ||
394 | * or suspend transition begins right now. | ||
395 | */ | ||
396 | current->flags |= PF_NOFREEZE; | ||
391 | 397 | ||
392 | set_special_pids(1, 1); | 398 | set_special_pids(1, 1); |
393 | proc_clear_tty(current); | 399 | proc_clear_tty(current); |
@@ -589,6 +595,8 @@ static void exit_mm(struct task_struct * tsk) | |||
589 | tsk->mm = NULL; | 595 | tsk->mm = NULL; |
590 | up_read(&mm->mmap_sem); | 596 | up_read(&mm->mmap_sem); |
591 | enter_lazy_tlb(mm, current); | 597 | enter_lazy_tlb(mm, current); |
598 | /* We don't want this task to be frozen prematurely */ | ||
599 | clear_freeze_flag(tsk); | ||
592 | task_unlock(tsk); | 600 | task_unlock(tsk); |
593 | mmput(mm); | 601 | mmput(mm); |
594 | } | 602 | } |
@@ -859,6 +867,34 @@ static void exit_notify(struct task_struct *tsk) | |||
859 | release_task(tsk); | 867 | release_task(tsk); |
860 | } | 868 | } |
861 | 869 | ||
870 | #ifdef CONFIG_DEBUG_STACK_USAGE | ||
871 | static void check_stack_usage(void) | ||
872 | { | ||
873 | static DEFINE_SPINLOCK(low_water_lock); | ||
874 | static int lowest_to_date = THREAD_SIZE; | ||
875 | unsigned long *n = end_of_stack(current); | ||
876 | unsigned long free; | ||
877 | |||
878 | while (*n == 0) | ||
879 | n++; | ||
880 | free = (unsigned long)n - (unsigned long)end_of_stack(current); | ||
881 | |||
882 | if (free >= lowest_to_date) | ||
883 | return; | ||
884 | |||
885 | spin_lock(&low_water_lock); | ||
886 | if (free < lowest_to_date) { | ||
887 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " | ||
888 | "left\n", | ||
889 | current->comm, free); | ||
890 | lowest_to_date = free; | ||
891 | } | ||
892 | spin_unlock(&low_water_lock); | ||
893 | } | ||
894 | #else | ||
895 | static inline void check_stack_usage(void) {} | ||
896 | #endif | ||
897 | |||
862 | fastcall NORET_TYPE void do_exit(long code) | 898 | fastcall NORET_TYPE void do_exit(long code) |
863 | { | 899 | { |
864 | struct task_struct *tsk = current; | 900 | struct task_struct *tsk = current; |
@@ -938,6 +974,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
938 | if (unlikely(tsk->compat_robust_list)) | 974 | if (unlikely(tsk->compat_robust_list)) |
939 | compat_exit_robust_list(tsk); | 975 | compat_exit_robust_list(tsk); |
940 | #endif | 976 | #endif |
977 | if (group_dead) | ||
978 | tty_audit_exit(); | ||
941 | if (unlikely(tsk->audit_context)) | 979 | if (unlikely(tsk->audit_context)) |
942 | audit_free(tsk); | 980 | audit_free(tsk); |
943 | 981 | ||
@@ -950,6 +988,7 @@ fastcall NORET_TYPE void do_exit(long code) | |||
950 | exit_sem(tsk); | 988 | exit_sem(tsk); |
951 | __exit_files(tsk); | 989 | __exit_files(tsk); |
952 | __exit_fs(tsk); | 990 | __exit_fs(tsk); |
991 | check_stack_usage(); | ||
953 | exit_thread(); | 992 | exit_thread(); |
954 | cpuset_exit(tsk); | 993 | cpuset_exit(tsk); |
955 | exit_keys(tsk); | 994 | exit_keys(tsk); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1b..7332e236d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/delayacct.h> | 49 | #include <linux/delayacct.h> |
50 | #include <linux/taskstats_kern.h> | 50 | #include <linux/taskstats_kern.h> |
51 | #include <linux/random.h> | 51 | #include <linux/random.h> |
52 | #include <linux/tty.h> | ||
52 | 53 | ||
53 | #include <asm/pgtable.h> | 54 | #include <asm/pgtable.h> |
54 | #include <asm/pgalloc.h> | 55 | #include <asm/pgalloc.h> |
@@ -136,7 +137,7 @@ void __init fork_init(unsigned long mempages) | |||
136 | /* create a slab on which task_structs can be allocated */ | 137 | /* create a slab on which task_structs can be allocated */ |
137 | task_struct_cachep = | 138 | task_struct_cachep = |
138 | kmem_cache_create("task_struct", sizeof(struct task_struct), | 139 | kmem_cache_create("task_struct", sizeof(struct task_struct), |
139 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); | 140 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); |
140 | #endif | 141 | #endif |
141 | 142 | ||
142 | /* | 143 | /* |
@@ -333,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
333 | atomic_set(&mm->mm_count, 1); | 334 | atomic_set(&mm->mm_count, 1); |
334 | init_rwsem(&mm->mmap_sem); | 335 | init_rwsem(&mm->mmap_sem); |
335 | INIT_LIST_HEAD(&mm->mmlist); | 336 | INIT_LIST_HEAD(&mm->mmlist); |
337 | mm->flags = (current->mm) ? current->mm->flags | ||
338 | : MMF_DUMP_FILTER_DEFAULT; | ||
336 | mm->core_waiters = 0; | 339 | mm->core_waiters = 0; |
337 | mm->nr_ptes = 0; | 340 | mm->nr_ptes = 0; |
338 | set_mm_counter(mm, file_rss, 0); | 341 | set_mm_counter(mm, file_rss, 0); |
@@ -877,7 +880,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
877 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | 880 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
878 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | 881 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
879 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | 882 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
880 | sig->sched_time = 0; | 883 | sig->sum_sched_runtime = 0; |
881 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | 884 | INIT_LIST_HEAD(&sig->cpu_timers[0]); |
882 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | 885 | INIT_LIST_HEAD(&sig->cpu_timers[1]); |
883 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | 886 | INIT_LIST_HEAD(&sig->cpu_timers[2]); |
@@ -897,6 +900,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
897 | } | 900 | } |
898 | acct_init_pacct(&sig->pacct); | 901 | acct_init_pacct(&sig->pacct); |
899 | 902 | ||
903 | tty_audit_fork(sig); | ||
904 | |||
900 | return 0; | 905 | return 0; |
901 | } | 906 | } |
902 | 907 | ||
@@ -920,7 +925,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
920 | { | 925 | { |
921 | unsigned long new_flags = p->flags; | 926 | unsigned long new_flags = p->flags; |
922 | 927 | ||
923 | new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); | 928 | new_flags &= ~PF_SUPERPRIV; |
924 | new_flags |= PF_FORKNOEXEC; | 929 | new_flags |= PF_FORKNOEXEC; |
925 | if (!(clone_flags & CLONE_PTRACE)) | 930 | if (!(clone_flags & CLONE_PTRACE)) |
926 | p->ptrace = 0; | 931 | p->ptrace = 0; |
@@ -999,7 +1004,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
999 | if (atomic_read(&p->user->processes) >= | 1004 | if (atomic_read(&p->user->processes) >= |
1000 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 1005 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { |
1001 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | 1006 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && |
1002 | p->user != &root_user) | 1007 | p->user != current->nsproxy->user_ns->root_user) |
1003 | goto bad_fork_free; | 1008 | goto bad_fork_free; |
1004 | } | 1009 | } |
1005 | 1010 | ||
@@ -1040,7 +1045,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1040 | 1045 | ||
1041 | p->utime = cputime_zero; | 1046 | p->utime = cputime_zero; |
1042 | p->stime = cputime_zero; | 1047 | p->stime = cputime_zero; |
1043 | p->sched_time = 0; | 1048 | |
1044 | #ifdef CONFIG_TASK_XACCT | 1049 | #ifdef CONFIG_TASK_XACCT |
1045 | p->rchar = 0; /* I/O counter: bytes read */ | 1050 | p->rchar = 0; /* I/O counter: bytes read */ |
1046 | p->wchar = 0; /* I/O counter: bytes written */ | 1051 | p->wchar = 0; /* I/O counter: bytes written */ |
@@ -1059,6 +1064,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1059 | 1064 | ||
1060 | p->lock_depth = -1; /* -1 = no lock */ | 1065 | p->lock_depth = -1; /* -1 = no lock */ |
1061 | do_posix_clock_monotonic_gettime(&p->start_time); | 1066 | do_posix_clock_monotonic_gettime(&p->start_time); |
1067 | p->real_start_time = p->start_time; | ||
1068 | monotonic_to_bootbased(&p->real_start_time); | ||
1062 | p->security = NULL; | 1069 | p->security = NULL; |
1063 | p->io_context = NULL; | 1070 | p->io_context = NULL; |
1064 | p->io_wait = NULL; | 1071 | p->io_wait = NULL; |
@@ -1439,22 +1446,22 @@ void __init proc_caches_init(void) | |||
1439 | sighand_cachep = kmem_cache_create("sighand_cache", | 1446 | sighand_cachep = kmem_cache_create("sighand_cache", |
1440 | sizeof(struct sighand_struct), 0, | 1447 | sizeof(struct sighand_struct), 0, |
1441 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, | 1448 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, |
1442 | sighand_ctor, NULL); | 1449 | sighand_ctor); |
1443 | signal_cachep = kmem_cache_create("signal_cache", | 1450 | signal_cachep = kmem_cache_create("signal_cache", |
1444 | sizeof(struct signal_struct), 0, | 1451 | sizeof(struct signal_struct), 0, |
1445 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1452 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1446 | files_cachep = kmem_cache_create("files_cache", | 1453 | files_cachep = kmem_cache_create("files_cache", |
1447 | sizeof(struct files_struct), 0, | 1454 | sizeof(struct files_struct), 0, |
1448 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1455 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1449 | fs_cachep = kmem_cache_create("fs_cache", | 1456 | fs_cachep = kmem_cache_create("fs_cache", |
1450 | sizeof(struct fs_struct), 0, | 1457 | sizeof(struct fs_struct), 0, |
1451 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1458 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1452 | vm_area_cachep = kmem_cache_create("vm_area_struct", | 1459 | vm_area_cachep = kmem_cache_create("vm_area_struct", |
1453 | sizeof(struct vm_area_struct), 0, | 1460 | sizeof(struct vm_area_struct), 0, |
1454 | SLAB_PANIC, NULL, NULL); | 1461 | SLAB_PANIC, NULL); |
1455 | mm_cachep = kmem_cache_create("mm_struct", | 1462 | mm_cachep = kmem_cache_create("mm_struct", |
1456 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1463 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1457 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1464 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1458 | } | 1465 | } |
1459 | 1466 | ||
1460 | /* | 1467 | /* |
@@ -1601,7 +1608,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1601 | err = -EINVAL; | 1608 | err = -EINVAL; |
1602 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1609 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1603 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1610 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1604 | CLONE_NEWUTS|CLONE_NEWIPC)) | 1611 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) |
1605 | goto bad_unshare_out; | 1612 | goto bad_unshare_out; |
1606 | 1613 | ||
1607 | if ((err = unshare_thread(unshare_flags))) | 1614 | if ((err = unshare_thread(unshare_flags))) |
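Note (illustration only, not part of the patch): the fork.c hunk above lets sys_unshare() accept CLONE_NEWUSER, matching the new user_namespace.o object and the per-namespace root_user lookup earlier in this diff. For orientation, a minimal userspace sketch of exercising that flag might look like the following; it assumes a kernel built with the new user-namespace support, userspace headers that already define CLONE_NEWUSER, and sufficient privilege (in this era typically CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* Detach from the parent's user namespace. */
            if (unshare(CLONE_NEWUSER) != 0) {
                    perror("unshare(CLONE_NEWUSER)");
                    return 1;
            }
            printf("running in a freshly unshared user namespace\n");
            return 0;
    }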
diff --git a/kernel/futex.c b/kernel/futex.c
index 45490bec58..a12425051e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; | |||
121 | static struct vfsmount *futex_mnt; | 121 | static struct vfsmount *futex_mnt; |
122 | 122 | ||
123 | /* | 123 | /* |
124 | * Take mm->mmap_sem, when futex is shared | ||
125 | */ | ||
126 | static inline void futex_lock_mm(struct rw_semaphore *fshared) | ||
127 | { | ||
128 | if (fshared) | ||
129 | down_read(fshared); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Release mm->mmap_sem, when the futex is shared | ||
134 | */ | ||
135 | static inline void futex_unlock_mm(struct rw_semaphore *fshared) | ||
136 | { | ||
137 | if (fshared) | ||
138 | up_read(fshared); | ||
139 | } | ||
140 | |||
141 | /* | ||
124 | * We hash on the keys returned from get_futex_key (see below). | 142 | * We hash on the keys returned from get_futex_key (see below). |
125 | */ | 143 | */ |
126 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | 144 | static struct futex_hash_bucket *hash_futex(union futex_key *key) |
@@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key) | |||
287 | } | 305 | } |
288 | EXPORT_SYMBOL_GPL(drop_futex_key_refs); | 306 | EXPORT_SYMBOL_GPL(drop_futex_key_refs); |
289 | 307 | ||
290 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) | 308 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) |
309 | { | ||
310 | u32 curval; | ||
311 | |||
312 | pagefault_disable(); | ||
313 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
314 | pagefault_enable(); | ||
315 | |||
316 | return curval; | ||
317 | } | ||
318 | |||
319 | static int get_futex_value_locked(u32 *dest, u32 __user *from) | ||
291 | { | 320 | { |
292 | int ret; | 321 | int ret; |
293 | 322 | ||
@@ -317,15 +346,20 @@ static int futex_handle_fault(unsigned long address, | |||
317 | vma = find_vma(mm, address); | 346 | vma = find_vma(mm, address); |
318 | if (vma && address >= vma->vm_start && | 347 | if (vma && address >= vma->vm_start && |
319 | (vma->vm_flags & VM_WRITE)) { | 348 | (vma->vm_flags & VM_WRITE)) { |
320 | switch (handle_mm_fault(mm, vma, address, 1)) { | 349 | int fault; |
321 | case VM_FAULT_MINOR: | 350 | fault = handle_mm_fault(mm, vma, address, 1); |
322 | ret = 0; | 351 | if (unlikely((fault & VM_FAULT_ERROR))) { |
323 | current->min_flt++; | 352 | #if 0 |
324 | break; | 353 | /* XXX: let's do this when we verify it is OK */ |
325 | case VM_FAULT_MAJOR: | 354 | if (ret & VM_FAULT_OOM) |
355 | ret = -ENOMEM; | ||
356 | #endif | ||
357 | } else { | ||
326 | ret = 0; | 358 | ret = 0; |
327 | current->maj_flt++; | 359 | if (fault & VM_FAULT_MAJOR) |
328 | break; | 360 | current->maj_flt++; |
361 | else | ||
362 | current->min_flt++; | ||
329 | } | 363 | } |
330 | } | 364 | } |
331 | if (!fshared) | 365 | if (!fshared) |
@@ -620,9 +654,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
620 | 654 | ||
621 | newval = FUTEX_WAITERS | new_owner->pid; | 655 | newval = FUTEX_WAITERS | new_owner->pid; |
622 | 656 | ||
623 | pagefault_disable(); | 657 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); |
624 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
625 | pagefault_enable(); | ||
626 | 658 | ||
627 | if (curval == -EFAULT) | 659 | if (curval == -EFAULT) |
628 | ret = -EFAULT; | 660 | ret = -EFAULT; |
@@ -659,9 +691,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
659 | * There is no waiter, so we unlock the futex. The owner died | 691 | * There is no waiter, so we unlock the futex. The owner died |
660 | * bit has not to be preserved here. We are the owner: | 692 | * bit has not to be preserved here. We are the owner: |
661 | */ | 693 | */ |
662 | pagefault_disable(); | 694 | oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); |
663 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
664 | pagefault_enable(); | ||
665 | 695 | ||
666 | if (oldval == -EFAULT) | 696 | if (oldval == -EFAULT) |
667 | return oldval; | 697 | return oldval; |
@@ -700,8 +730,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
700 | union futex_key key; | 730 | union futex_key key; |
701 | int ret; | 731 | int ret; |
702 | 732 | ||
703 | if (fshared) | 733 | futex_lock_mm(fshared); |
704 | down_read(fshared); | ||
705 | 734 | ||
706 | ret = get_futex_key(uaddr, fshared, &key); | 735 | ret = get_futex_key(uaddr, fshared, &key); |
707 | if (unlikely(ret != 0)) | 736 | if (unlikely(ret != 0)) |
@@ -725,8 +754,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
725 | 754 | ||
726 | spin_unlock(&hb->lock); | 755 | spin_unlock(&hb->lock); |
727 | out: | 756 | out: |
728 | if (fshared) | 757 | futex_unlock_mm(fshared); |
729 | up_read(fshared); | ||
730 | return ret; | 758 | return ret; |
731 | } | 759 | } |
732 | 760 | ||
@@ -746,8 +774,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, | |||
746 | int ret, op_ret, attempt = 0; | 774 | int ret, op_ret, attempt = 0; |
747 | 775 | ||
748 | retryfull: | 776 | retryfull: |
749 | if (fshared) | 777 | futex_lock_mm(fshared); |
750 | down_read(fshared); | ||
751 | 778 | ||
752 | ret = get_futex_key(uaddr1, fshared, &key1); | 779 | ret = get_futex_key(uaddr1, fshared, &key1); |
753 | if (unlikely(ret != 0)) | 780 | if (unlikely(ret != 0)) |
@@ -793,7 +820,7 @@ retry: | |||
793 | */ | 820 | */ |
794 | if (attempt++) { | 821 | if (attempt++) { |
795 | ret = futex_handle_fault((unsigned long)uaddr2, | 822 | ret = futex_handle_fault((unsigned long)uaddr2, |
796 | fshared, attempt); | 823 | fshared, attempt); |
797 | if (ret) | 824 | if (ret) |
798 | goto out; | 825 | goto out; |
799 | goto retry; | 826 | goto retry; |
@@ -803,8 +830,7 @@ retry: | |||
803 | * If we would have faulted, release mmap_sem, | 830 | * If we would have faulted, release mmap_sem, |
804 | * fault it in and start all over again. | 831 | * fault it in and start all over again. |
805 | */ | 832 | */ |
806 | if (fshared) | 833 | futex_unlock_mm(fshared); |
807 | up_read(fshared); | ||
808 | 834 | ||
809 | ret = get_user(dummy, uaddr2); | 835 | ret = get_user(dummy, uaddr2); |
810 | if (ret) | 836 | if (ret) |
@@ -841,8 +867,8 @@ retry: | |||
841 | if (hb1 != hb2) | 867 | if (hb1 != hb2) |
842 | spin_unlock(&hb2->lock); | 868 | spin_unlock(&hb2->lock); |
843 | out: | 869 | out: |
844 | if (fshared) | 870 | futex_unlock_mm(fshared); |
845 | up_read(fshared); | 871 | |
846 | return ret; | 872 | return ret; |
847 | } | 873 | } |
848 | 874 | ||
@@ -861,8 +887,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, | |||
861 | int ret, drop_count = 0; | 887 | int ret, drop_count = 0; |
862 | 888 | ||
863 | retry: | 889 | retry: |
864 | if (fshared) | 890 | futex_lock_mm(fshared); |
865 | down_read(fshared); | ||
866 | 891 | ||
867 | ret = get_futex_key(uaddr1, fshared, &key1); | 892 | ret = get_futex_key(uaddr1, fshared, &key1); |
868 | if (unlikely(ret != 0)) | 893 | if (unlikely(ret != 0)) |
@@ -890,8 +915,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, | |||
890 | * If we would have faulted, release mmap_sem, fault | 915 | * If we would have faulted, release mmap_sem, fault |
891 | * it in and start all over again. | 916 | * it in and start all over again. |
892 | */ | 917 | */ |
893 | if (fshared) | 918 | futex_unlock_mm(fshared); |
894 | up_read(fshared); | ||
895 | 919 | ||
896 | ret = get_user(curval, uaddr1); | 920 | ret = get_user(curval, uaddr1); |
897 | 921 | ||
@@ -944,8 +968,7 @@ out_unlock: | |||
944 | drop_futex_key_refs(&key1); | 968 | drop_futex_key_refs(&key1); |
945 | 969 | ||
946 | out: | 970 | out: |
947 | if (fshared) | 971 | futex_unlock_mm(fshared); |
948 | up_read(fshared); | ||
949 | return ret; | 972 | return ret; |
950 | } | 973 | } |
951 | 974 | ||
@@ -1113,10 +1136,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1113 | while (!ret) { | 1136 | while (!ret) { |
1114 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 1137 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
1115 | 1138 | ||
1116 | pagefault_disable(); | 1139 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); |
1117 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1118 | uval, newval); | ||
1119 | pagefault_enable(); | ||
1120 | 1140 | ||
1121 | if (curval == -EFAULT) | 1141 | if (curval == -EFAULT) |
1122 | ret = -EFAULT; | 1142 | ret = -EFAULT; |
@@ -1134,6 +1154,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1134 | #define ARG3_SHARED 1 | 1154 | #define ARG3_SHARED 1 |
1135 | 1155 | ||
1136 | static long futex_wait_restart(struct restart_block *restart); | 1156 | static long futex_wait_restart(struct restart_block *restart); |
1157 | |||
1137 | static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | 1158 | static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, |
1138 | u32 val, ktime_t *abs_time) | 1159 | u32 val, ktime_t *abs_time) |
1139 | { | 1160 | { |
@@ -1148,8 +1169,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1148 | 1169 | ||
1149 | q.pi_state = NULL; | 1170 | q.pi_state = NULL; |
1150 | retry: | 1171 | retry: |
1151 | if (fshared) | 1172 | futex_lock_mm(fshared); |
1152 | down_read(fshared); | ||
1153 | 1173 | ||
1154 | ret = get_futex_key(uaddr, fshared, &q.key); | 1174 | ret = get_futex_key(uaddr, fshared, &q.key); |
1155 | if (unlikely(ret != 0)) | 1175 | if (unlikely(ret != 0)) |
@@ -1186,8 +1206,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1186 | * If we would have faulted, release mmap_sem, fault it in and | 1206 | * If we would have faulted, release mmap_sem, fault it in and |
1187 | * start all over again. | 1207 | * start all over again. |
1188 | */ | 1208 | */ |
1189 | if (fshared) | 1209 | futex_unlock_mm(fshared); |
1190 | up_read(fshared); | ||
1191 | 1210 | ||
1192 | ret = get_user(uval, uaddr); | 1211 | ret = get_user(uval, uaddr); |
1193 | 1212 | ||
@@ -1206,8 +1225,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1206 | * Now the futex is queued and we have checked the data, we | 1225 | * Now the futex is queued and we have checked the data, we |
1207 | * don't want to hold mmap_sem while we sleep. | 1226 | * don't want to hold mmap_sem while we sleep. |
1208 | */ | 1227 | */ |
1209 | if (fshared) | 1228 | futex_unlock_mm(fshared); |
1210 | up_read(fshared); | ||
1211 | 1229 | ||
1212 | /* | 1230 | /* |
1213 | * There might have been scheduling since the queue_me(), as we | 1231 | * There might have been scheduling since the queue_me(), as we |
@@ -1285,8 +1303,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1285 | queue_unlock(&q, hb); | 1303 | queue_unlock(&q, hb); |
1286 | 1304 | ||
1287 | out_release_sem: | 1305 | out_release_sem: |
1288 | if (fshared) | 1306 | futex_unlock_mm(fshared); |
1289 | up_read(fshared); | ||
1290 | return ret; | 1307 | return ret; |
1291 | } | 1308 | } |
1292 | 1309 | ||
@@ -1333,8 +1350,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1333 | 1350 | ||
1334 | q.pi_state = NULL; | 1351 | q.pi_state = NULL; |
1335 | retry: | 1352 | retry: |
1336 | if (fshared) | 1353 | futex_lock_mm(fshared); |
1337 | down_read(fshared); | ||
1338 | 1354 | ||
1339 | ret = get_futex_key(uaddr, fshared, &q.key); | 1355 | ret = get_futex_key(uaddr, fshared, &q.key); |
1340 | if (unlikely(ret != 0)) | 1356 | if (unlikely(ret != 0)) |
@@ -1353,9 +1369,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1353 | */ | 1369 | */ |
1354 | newval = current->pid; | 1370 | newval = current->pid; |
1355 | 1371 | ||
1356 | pagefault_disable(); | 1372 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); |
1357 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
1358 | pagefault_enable(); | ||
1359 | 1373 | ||
1360 | if (unlikely(curval == -EFAULT)) | 1374 | if (unlikely(curval == -EFAULT)) |
1361 | goto uaddr_faulted; | 1375 | goto uaddr_faulted; |
@@ -1398,9 +1412,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1398 | lock_taken = 1; | 1412 | lock_taken = 1; |
1399 | } | 1413 | } |
1400 | 1414 | ||
1401 | pagefault_disable(); | 1415 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); |
1402 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
1403 | pagefault_enable(); | ||
1404 | 1416 | ||
1405 | if (unlikely(curval == -EFAULT)) | 1417 | if (unlikely(curval == -EFAULT)) |
1406 | goto uaddr_faulted; | 1418 | goto uaddr_faulted; |
@@ -1428,8 +1440,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1428 | * exit to complete. | 1440 | * exit to complete. |
1429 | */ | 1441 | */ |
1430 | queue_unlock(&q, hb); | 1442 | queue_unlock(&q, hb); |
1431 | if (fshared) | 1443 | futex_unlock_mm(fshared); |
1432 | up_read(fshared); | ||
1433 | cond_resched(); | 1444 | cond_resched(); |
1434 | goto retry; | 1445 | goto retry; |
1435 | 1446 | ||
@@ -1465,8 +1476,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1465 | * Now the futex is queued and we have checked the data, we | 1476 | * Now the futex is queued and we have checked the data, we |
1466 | * don't want to hold mmap_sem while we sleep. | 1477 | * don't want to hold mmap_sem while we sleep. |
1467 | */ | 1478 | */ |
1468 | if (fshared) | 1479 | futex_unlock_mm(fshared); |
1469 | up_read(fshared); | ||
1470 | 1480 | ||
1471 | WARN_ON(!q.pi_state); | 1481 | WARN_ON(!q.pi_state); |
1472 | /* | 1482 | /* |
@@ -1480,8 +1490,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1480 | ret = ret ? 0 : -EWOULDBLOCK; | 1490 | ret = ret ? 0 : -EWOULDBLOCK; |
1481 | } | 1491 | } |
1482 | 1492 | ||
1483 | if (fshared) | 1493 | futex_lock_mm(fshared); |
1484 | down_read(fshared); | ||
1485 | spin_lock(q.lock_ptr); | 1494 | spin_lock(q.lock_ptr); |
1486 | 1495 | ||
1487 | if (!ret) { | 1496 | if (!ret) { |
@@ -1518,8 +1527,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1518 | 1527 | ||
1519 | /* Unqueue and drop the lock */ | 1528 | /* Unqueue and drop the lock */ |
1520 | unqueue_me_pi(&q); | 1529 | unqueue_me_pi(&q); |
1521 | if (fshared) | 1530 | futex_unlock_mm(fshared); |
1522 | up_read(fshared); | ||
1523 | 1531 | ||
1524 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 1532 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
1525 | 1533 | ||
@@ -1527,8 +1535,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1527 | queue_unlock(&q, hb); | 1535 | queue_unlock(&q, hb); |
1528 | 1536 | ||
1529 | out_release_sem: | 1537 | out_release_sem: |
1530 | if (fshared) | 1538 | futex_unlock_mm(fshared); |
1531 | up_read(fshared); | ||
1532 | return ret; | 1539 | return ret; |
1533 | 1540 | ||
1534 | uaddr_faulted: | 1541 | uaddr_faulted: |
@@ -1550,8 +1557,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1550 | goto retry_unlocked; | 1557 | goto retry_unlocked; |
1551 | } | 1558 | } |
1552 | 1559 | ||
1553 | if (fshared) | 1560 | futex_unlock_mm(fshared); |
1554 | up_read(fshared); | ||
1555 | 1561 | ||
1556 | ret = get_user(uval, uaddr); | 1562 | ret = get_user(uval, uaddr); |
1557 | if (!ret && (uval != -EFAULT)) | 1563 | if (!ret && (uval != -EFAULT)) |
@@ -1585,8 +1591,7 @@ retry: | |||
1585 | /* | 1591 | /* |
1586 | * First take all the futex related locks: | 1592 | * First take all the futex related locks: |
1587 | */ | 1593 | */ |
1588 | if (fshared) | 1594 | futex_lock_mm(fshared); |
1589 | down_read(fshared); | ||
1590 | 1595 | ||
1591 | ret = get_futex_key(uaddr, fshared, &key); | 1596 | ret = get_futex_key(uaddr, fshared, &key); |
1592 | if (unlikely(ret != 0)) | 1597 | if (unlikely(ret != 0)) |
@@ -1601,11 +1606,9 @@ retry_unlocked: | |||
1601 | * again. If it succeeds then we can return without waking | 1606 | * again. If it succeeds then we can return without waking |
1602 | * anyone else up: | 1607 | * anyone else up: |
1603 | */ | 1608 | */ |
1604 | if (!(uval & FUTEX_OWNER_DIED)) { | 1609 | if (!(uval & FUTEX_OWNER_DIED)) |
1605 | pagefault_disable(); | 1610 | uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); |
1606 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | 1611 | |
1607 | pagefault_enable(); | ||
1608 | } | ||
1609 | 1612 | ||
1610 | if (unlikely(uval == -EFAULT)) | 1613 | if (unlikely(uval == -EFAULT)) |
1611 | goto pi_faulted; | 1614 | goto pi_faulted; |
@@ -1647,8 +1650,7 @@ retry_unlocked: | |||
1647 | out_unlock: | 1650 | out_unlock: |
1648 | spin_unlock(&hb->lock); | 1651 | spin_unlock(&hb->lock); |
1649 | out: | 1652 | out: |
1650 | if (fshared) | 1653 | futex_unlock_mm(fshared); |
1651 | up_read(fshared); | ||
1652 | 1654 | ||
1653 | return ret; | 1655 | return ret; |
1654 | 1656 | ||
@@ -1671,8 +1673,7 @@ pi_faulted: | |||
1671 | goto retry_unlocked; | 1673 | goto retry_unlocked; |
1672 | } | 1674 | } |
1673 | 1675 | ||
1674 | if (fshared) | 1676 | futex_unlock_mm(fshared); |
1675 | up_read(fshared); | ||
1676 | 1677 | ||
1677 | ret = get_user(uval, uaddr); | 1678 | ret = get_user(uval, uaddr); |
1678 | if (!ret && (uval != -EFAULT)) | 1679 | if (!ret && (uval != -EFAULT)) |
@@ -1729,8 +1730,8 @@ static int futex_fd(u32 __user *uaddr, int signal) | |||
1729 | 1730 | ||
1730 | if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { | 1731 | if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { |
1731 | printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " | 1732 | printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " |
1732 | "will be removed from the kernel in June 2007\n", | 1733 | "will be removed from the kernel in June 2007\n", |
1733 | current->comm); | 1734 | current->comm); |
1734 | } | 1735 | } |
1735 | 1736 | ||
1736 | ret = -EINVAL; | 1737 | ret = -EINVAL; |
@@ -1908,10 +1909,8 @@ retry: | |||
1908 | * Wake robust non-PI futexes here. The wakeup of | 1909 | * Wake robust non-PI futexes here. The wakeup of |
1909 | * PI futexes happens in exit_pi_state(): | 1910 | * PI futexes happens in exit_pi_state(): |
1910 | */ | 1911 | */ |
1911 | if (!pi) { | 1912 | if (!pi && (uval & FUTEX_WAITERS)) |
1912 | if (uval & FUTEX_WAITERS) | ||
1913 | futex_wake(uaddr, &curr->mm->mmap_sem, 1); | 1913 | futex_wake(uaddr, &curr->mm->mmap_sem, 1); |
1914 | } | ||
1915 | } | 1914 | } |
1916 | return 0; | 1915 | return 0; |
1917 | } | 1916 | } |
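The repeated "if (fshared) down_read(fshared)" / "up_read(fshared)" pairs in these hunks are folded into futex_lock_mm()/futex_unlock_mm(), and the open-coded pagefault_disable()/futex_atomic_cmpxchg_inatomic()/pagefault_enable() sequence becomes cmpxchg_futex_value_locked(). The helpers themselves are defined earlier in kernel/futex.c and are not part of these hunks; a minimal sketch of what they presumably look like, given how the call sites use them:

/* Sketch only -- the real helpers live earlier in kernel/futex.c.
 * A NULL 'fshared' means the futex is process-private, so no
 * mmap_sem protection is taken. */
static inline void futex_lock_mm(struct rw_semaphore *fshared)
{
	if (fshared)
		down_read(fshared);
}

static inline void futex_unlock_mm(struct rw_semaphore *fshared)
{
	if (fshared)
		up_read(fshared);
}

/* Atomic cmpxchg on a user-space futex word with page faults disabled;
 * returns the value found at *uaddr, or -EFAULT on fault. */
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 curval;

	pagefault_disable();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
	pagefault_enable();

	return curval;
}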
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 23c03f43e1..eb1ddebd2c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
558 | */ | 558 | */ |
559 | static int hrtimer_switch_to_hres(void) | 559 | static int hrtimer_switch_to_hres(void) |
560 | { | 560 | { |
561 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 561 | int cpu = smp_processor_id(); |
562 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); | ||
562 | unsigned long flags; | 563 | unsigned long flags; |
563 | 564 | ||
564 | if (base->hres_active) | 565 | if (base->hres_active) |
@@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void) | |||
568 | 569 | ||
569 | if (tick_init_highres()) { | 570 | if (tick_init_highres()) { |
570 | local_irq_restore(flags); | 571 | local_irq_restore(flags); |
572 | printk(KERN_WARNING "Could not switch to high resolution " | ||
573 | "mode on CPU %d\n", cpu); | ||
571 | return 0; | 574 | return 0; |
572 | } | 575 | } |
573 | base->hres_active = 1; | 576 | base->hres_active = 1; |
@@ -683,6 +686,7 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
683 | struct rb_node **link = &base->active.rb_node; | 686 | struct rb_node **link = &base->active.rb_node; |
684 | struct rb_node *parent = NULL; | 687 | struct rb_node *parent = NULL; |
685 | struct hrtimer *entry; | 688 | struct hrtimer *entry; |
689 | int leftmost = 1; | ||
686 | 690 | ||
687 | /* | 691 | /* |
688 | * Find the right place in the rbtree: | 692 | * Find the right place in the rbtree: |
@@ -694,18 +698,19 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
694 | * We dont care about collisions. Nodes with | 698 | * We dont care about collisions. Nodes with |
695 | * the same expiry time stay together. | 699 | * the same expiry time stay together. |
696 | */ | 700 | */ |
697 | if (timer->expires.tv64 < entry->expires.tv64) | 701 | if (timer->expires.tv64 < entry->expires.tv64) { |
698 | link = &(*link)->rb_left; | 702 | link = &(*link)->rb_left; |
699 | else | 703 | } else { |
700 | link = &(*link)->rb_right; | 704 | link = &(*link)->rb_right; |
705 | leftmost = 0; | ||
706 | } | ||
701 | } | 707 | } |
702 | 708 | ||
703 | /* | 709 | /* |
704 | * Insert the timer to the rbtree and check whether it | 710 | * Insert the timer to the rbtree and check whether it |
705 | * replaces the first pending timer | 711 | * replaces the first pending timer |
706 | */ | 712 | */ |
707 | if (!base->first || timer->expires.tv64 < | 713 | if (leftmost) { |
708 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) { | ||
709 | /* | 714 | /* |
710 | * Reprogram the clock event device. When the timer is already | 715 | * Reprogram the clock event device. When the timer is already |
711 | * expired hrtimer_enqueue_reprogram has either called the | 716 | * expired hrtimer_enqueue_reprogram has either called the |
@@ -1406,7 +1411,7 @@ static void migrate_hrtimers(int cpu) | |||
1406 | static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, | 1411 | static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, |
1407 | unsigned long action, void *hcpu) | 1412 | unsigned long action, void *hcpu) |
1408 | { | 1413 | { |
1409 | long cpu = (long)hcpu; | 1414 | unsigned int cpu = (long)hcpu; |
1410 | 1415 | ||
1411 | switch (action) { | 1416 | switch (action) { |
1412 | 1417 | ||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b4f1674fca..50b81b9804 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir; | |||
19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
20 | int count, int *eof, void *data) | 20 | int count, int *eof, void *data) |
21 | { | 21 | { |
22 | int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); | 22 | struct irq_desc *desc = irq_desc + (long)data; |
23 | cpumask_t *mask = &desc->affinity; | ||
24 | int len; | ||
25 | |||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
27 | if (desc->status & IRQ_MOVE_PENDING) | ||
28 | mask = &desc->pending_mask; | ||
29 | #endif | ||
30 | len = cpumask_scnprintf(page, count, *mask); | ||
23 | 31 | ||
24 | if (count - len < 2) | 32 | if (count - len < 2) |
25 | return -EINVAL; | 33 | return -EINVAL; |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bd9e272d55..32b161972f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
172 | irqreturn_t action_ret) | 172 | irqreturn_t action_ret) |
173 | { | 173 | { |
174 | if (unlikely(action_ret != IRQ_HANDLED)) { | 174 | if (unlikely(action_ret != IRQ_HANDLED)) { |
175 | desc->irqs_unhandled++; | 175 | /* |
176 | * If we are seeing only the odd spurious IRQ caused by | ||
177 | * bus asynchronicity then don't eventually trigger an error, | ||
178 | * otherwise the counter becomes a doomsday timer for otherwise | ||
179 | * working systems | ||
180 | */ | ||
181 | if (jiffies - desc->last_unhandled > HZ/10) | ||
182 | desc->irqs_unhandled = 1; | ||
183 | else | ||
184 | desc->irqs_unhandled++; | ||
185 | desc->last_unhandled = jiffies; | ||
176 | if (unlikely(action_ret != IRQ_NONE)) | 186 | if (unlikely(action_ret != IRQ_NONE)) |
177 | report_bad_irq(irq, desc, action_ret); | 187 | report_bad_irq(irq, desc, action_ret); |
178 | } | 188 | } |
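The note_interrupt() change above stops treating irqs_unhandled as a counter that can only grow: whenever unhandled events arrive more than HZ/10 apart the count restarts at 1, so only a dense burst of unhandled interrupts can reach the spurious-IRQ shutdown threshold. A standalone sketch of that decay rule (the HZ value and window here are illustrative, not kernel code):

#include <stdio.h>

#define HZ	250		/* illustrative tick rate */
#define WINDOW	(HZ / 10)	/* gaps larger than this restart the count */

static unsigned long last_unhandled;
static unsigned int irqs_unhandled;

static void note_unhandled(unsigned long now)
{
	if (now - last_unhandled > WINDOW)
		irqs_unhandled = 1;	/* isolated glitch: restart */
	else
		irqs_unhandled++;	/* dense burst: accumulate */
	last_unhandled = now;
}

int main(void)
{
	note_unhandled(1000);
	note_unhandled(1001);
	printf("after burst: %u\n", irqs_unhandled);	/* 2 */
	note_unhandled(1001 + 5 * HZ);
	printf("after gap:   %u\n", irqs_unhandled);	/* back to 1 */
	return 0;
}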
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fed5441862..474219a419 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos) | |||
152 | /* Lookup the address for this symbol. Returns 0 if not found. */ | 152 | /* Lookup the address for this symbol. Returns 0 if not found. */ |
153 | unsigned long kallsyms_lookup_name(const char *name) | 153 | unsigned long kallsyms_lookup_name(const char *name) |
154 | { | 154 | { |
155 | char namebuf[KSYM_NAME_LEN+1]; | 155 | char namebuf[KSYM_NAME_LEN]; |
156 | unsigned long i; | 156 | unsigned long i; |
157 | unsigned int off; | 157 | unsigned int off; |
158 | 158 | ||
@@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr, | |||
248 | { | 248 | { |
249 | const char *msym; | 249 | const char *msym; |
250 | 250 | ||
251 | namebuf[KSYM_NAME_LEN] = 0; | 251 | namebuf[KSYM_NAME_LEN - 1] = 0; |
252 | namebuf[0] = 0; | 252 | namebuf[0] = 0; |
253 | 253 | ||
254 | if (is_ksym_addr(addr)) { | 254 | if (is_ksym_addr(addr)) { |
@@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr, | |||
265 | /* see if it's in a module */ | 265 | /* see if it's in a module */ |
266 | msym = module_address_lookup(addr, symbolsize, offset, modname); | 266 | msym = module_address_lookup(addr, symbolsize, offset, modname); |
267 | if (msym) | 267 | if (msym) |
268 | return strncpy(namebuf, msym, KSYM_NAME_LEN); | 268 | return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); |
269 | 269 | ||
270 | return NULL; | 270 | return NULL; |
271 | } | 271 | } |
@@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr, | |||
273 | int lookup_symbol_name(unsigned long addr, char *symname) | 273 | int lookup_symbol_name(unsigned long addr, char *symname) |
274 | { | 274 | { |
275 | symname[0] = '\0'; | 275 | symname[0] = '\0'; |
276 | symname[KSYM_NAME_LEN] = '\0'; | 276 | symname[KSYM_NAME_LEN - 1] = '\0'; |
277 | 277 | ||
278 | if (is_ksym_addr(addr)) { | 278 | if (is_ksym_addr(addr)) { |
279 | unsigned long pos; | 279 | unsigned long pos; |
@@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
291 | unsigned long *offset, char *modname, char *name) | 291 | unsigned long *offset, char *modname, char *name) |
292 | { | 292 | { |
293 | name[0] = '\0'; | 293 | name[0] = '\0'; |
294 | name[KSYM_NAME_LEN] = '\0'; | 294 | name[KSYM_NAME_LEN - 1] = '\0'; |
295 | 295 | ||
296 | if (is_ksym_addr(addr)) { | 296 | if (is_ksym_addr(addr)) { |
297 | unsigned long pos; | 297 | unsigned long pos; |
@@ -312,18 +312,17 @@ int sprint_symbol(char *buffer, unsigned long address) | |||
312 | char *modname; | 312 | char *modname; |
313 | const char *name; | 313 | const char *name; |
314 | unsigned long offset, size; | 314 | unsigned long offset, size; |
315 | char namebuf[KSYM_NAME_LEN+1]; | 315 | char namebuf[KSYM_NAME_LEN]; |
316 | 316 | ||
317 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); | 317 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); |
318 | if (!name) | 318 | if (!name) |
319 | return sprintf(buffer, "0x%lx", address); | 319 | return sprintf(buffer, "0x%lx", address); |
320 | else { | 320 | |
321 | if (modname) | 321 | if (modname) |
322 | return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, | 322 | return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, |
323 | size, modname); | 323 | size, modname); |
324 | else | 324 | else |
325 | return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); | 325 | return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); |
326 | } | ||
327 | } | 326 | } |
328 | 327 | ||
329 | /* Look up a kernel symbol and print it to the kernel messages. */ | 328 | /* Look up a kernel symbol and print it to the kernel messages. */ |
@@ -343,8 +342,8 @@ struct kallsym_iter | |||
343 | unsigned long value; | 342 | unsigned long value; |
344 | unsigned int nameoff; /* If iterating in core kernel symbols */ | 343 | unsigned int nameoff; /* If iterating in core kernel symbols */ |
345 | char type; | 344 | char type; |
346 | char name[KSYM_NAME_LEN+1]; | 345 | char name[KSYM_NAME_LEN]; |
347 | char module_name[MODULE_NAME_LEN + 1]; | 346 | char module_name[MODULE_NAME_LEN]; |
348 | int exported; | 347 | int exported; |
349 | }; | 348 | }; |
350 | 349 | ||
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index cee419143f..bc41ad0f24 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/kfifo.h> | 26 | #include <linux/kfifo.h> |
27 | #include <linux/log2.h> | ||
27 | 28 | ||
28 | /** | 29 | /** |
29 | * kfifo_init - allocates a new FIFO using a preallocated buffer | 30 | * kfifo_init - allocates a new FIFO using a preallocated buffer |
@@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | |||
41 | struct kfifo *fifo; | 42 | struct kfifo *fifo; |
42 | 43 | ||
43 | /* size must be a power of 2 */ | 44 | /* size must be a power of 2 */ |
44 | BUG_ON(size & (size - 1)); | 45 | BUG_ON(!is_power_of_2(size)); |
45 | 46 | ||
46 | fifo = kmalloc(sizeof(struct kfifo), gfp_mask); | 47 | fifo = kmalloc(sizeof(struct kfifo), gfp_mask); |
47 | if (!fifo) | 48 | if (!fifo) |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb0771..beedbdc646 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -33,6 +33,8 @@ | |||
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/init.h> | 34 | #include <linux/init.h> |
35 | #include <linux/resource.h> | 35 | #include <linux/resource.h> |
36 | #include <linux/notifier.h> | ||
37 | #include <linux/suspend.h> | ||
36 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
37 | 39 | ||
38 | extern int max_threads; | 40 | extern int max_threads; |
@@ -119,9 +121,10 @@ struct subprocess_info { | |||
119 | char **argv; | 121 | char **argv; |
120 | char **envp; | 122 | char **envp; |
121 | struct key *ring; | 123 | struct key *ring; |
122 | int wait; | 124 | enum umh_wait wait; |
123 | int retval; | 125 | int retval; |
124 | struct file *stdin; | 126 | struct file *stdin; |
127 | void (*cleanup)(char **argv, char **envp); | ||
125 | }; | 128 | }; |
126 | 129 | ||
127 | /* | 130 | /* |
@@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data) | |||
180 | do_exit(0); | 183 | do_exit(0); |
181 | } | 184 | } |
182 | 185 | ||
186 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
187 | { | ||
188 | if (info->cleanup) | ||
189 | (*info->cleanup)(info->argv, info->envp); | ||
190 | kfree(info); | ||
191 | } | ||
192 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
193 | |||
183 | /* Keventd can't block, but this (a child) can. */ | 194 | /* Keventd can't block, but this (a child) can. */ |
184 | static int wait_for_helper(void *data) | 195 | static int wait_for_helper(void *data) |
185 | { | 196 | { |
@@ -216,8 +227,8 @@ static int wait_for_helper(void *data) | |||
216 | sub_info->retval = ret; | 227 | sub_info->retval = ret; |
217 | } | 228 | } |
218 | 229 | ||
219 | if (sub_info->wait < 0) | 230 | if (sub_info->wait == UMH_NO_WAIT) |
220 | kfree(sub_info); | 231 | call_usermodehelper_freeinfo(sub_info); |
221 | else | 232 | else |
222 | complete(sub_info->complete); | 233 | complete(sub_info->complete); |
223 | return 0; | 234 | return 0; |
@@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work) | |||
229 | struct subprocess_info *sub_info = | 240 | struct subprocess_info *sub_info = |
230 | container_of(work, struct subprocess_info, work); | 241 | container_of(work, struct subprocess_info, work); |
231 | pid_t pid; | 242 | pid_t pid; |
232 | int wait = sub_info->wait; | 243 | enum umh_wait wait = sub_info->wait; |
233 | 244 | ||
234 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 245 | /* CLONE_VFORK: wait until the usermode helper has execve'd |
235 | * successfully We need the data structures to stay around | 246 | * successfully We need the data structures to stay around |
236 | * until that is done. */ | 247 | * until that is done. */ |
237 | if (wait) | 248 | if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) |
238 | pid = kernel_thread(wait_for_helper, sub_info, | 249 | pid = kernel_thread(wait_for_helper, sub_info, |
239 | CLONE_FS | CLONE_FILES | SIGCHLD); | 250 | CLONE_FS | CLONE_FILES | SIGCHLD); |
240 | else | 251 | else |
241 | pid = kernel_thread(____call_usermodehelper, sub_info, | 252 | pid = kernel_thread(____call_usermodehelper, sub_info, |
242 | CLONE_VFORK | SIGCHLD); | 253 | CLONE_VFORK | SIGCHLD); |
243 | 254 | ||
244 | if (wait < 0) | 255 | switch (wait) { |
245 | return; | 256 | case UMH_NO_WAIT: |
257 | break; | ||
246 | 258 | ||
247 | if (pid < 0) { | 259 | case UMH_WAIT_PROC: |
260 | if (pid > 0) | ||
261 | break; | ||
248 | sub_info->retval = pid; | 262 | sub_info->retval = pid; |
263 | /* FALLTHROUGH */ | ||
264 | |||
265 | case UMH_WAIT_EXEC: | ||
249 | complete(sub_info->complete); | 266 | complete(sub_info->complete); |
250 | } else if (!wait) | 267 | } |
251 | complete(sub_info->complete); | 268 | } |
269 | |||
270 | #ifdef CONFIG_PM | ||
271 | /* | ||
272 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | ||
273 | * (used for preventing user land processes from being created after the user | ||
274 | * land has been frozen during a system-wide hibernation or suspend operation). | ||
275 | */ | ||
276 | static int usermodehelper_disabled; | ||
277 | |||
278 | /* Number of helpers running */ | ||
279 | static atomic_t running_helpers = ATOMIC_INIT(0); | ||
280 | |||
281 | /* | ||
282 | * Wait queue head used by usermodehelper_pm_callback() to wait for all running | ||
283 | * helpers to finish. | ||
284 | */ | ||
285 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); | ||
286 | |||
287 | /* | ||
288 | * Time to wait for running_helpers to become zero before the setting of | ||
289 | * usermodehelper_disabled in usermodehelper_pm_callback() fails | ||
290 | */ | ||
291 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) | ||
292 | |||
293 | static int usermodehelper_pm_callback(struct notifier_block *nfb, | ||
294 | unsigned long action, | ||
295 | void *ignored) | ||
296 | { | ||
297 | long retval; | ||
298 | |||
299 | switch (action) { | ||
300 | case PM_HIBERNATION_PREPARE: | ||
301 | case PM_SUSPEND_PREPARE: | ||
302 | usermodehelper_disabled = 1; | ||
303 | smp_mb(); | ||
304 | /* | ||
305 | * From now on call_usermodehelper_exec() won't start any new | ||
306 | * helpers, so it is sufficient if running_helpers turns out to | ||
307 | * be zero at one point (it may be increased later, but that | ||
308 | * doesn't matter). | ||
309 | */ | ||
310 | retval = wait_event_timeout(running_helpers_waitq, | ||
311 | atomic_read(&running_helpers) == 0, | ||
312 | RUNNING_HELPERS_TIMEOUT); | ||
313 | if (retval) { | ||
314 | return NOTIFY_OK; | ||
315 | } else { | ||
316 | usermodehelper_disabled = 0; | ||
317 | return NOTIFY_BAD; | ||
318 | } | ||
319 | case PM_POST_HIBERNATION: | ||
320 | case PM_POST_SUSPEND: | ||
321 | usermodehelper_disabled = 0; | ||
322 | return NOTIFY_OK; | ||
323 | } | ||
324 | |||
325 | return NOTIFY_DONE; | ||
326 | } | ||
327 | |||
328 | static void helper_lock(void) | ||
329 | { | ||
330 | atomic_inc(&running_helpers); | ||
331 | smp_mb__after_atomic_inc(); | ||
332 | } | ||
333 | |||
334 | static void helper_unlock(void) | ||
335 | { | ||
336 | if (atomic_dec_and_test(&running_helpers)) | ||
337 | wake_up(&running_helpers_waitq); | ||
338 | } | ||
339 | |||
340 | static void register_pm_notifier_callback(void) | ||
341 | { | ||
342 | pm_notifier(usermodehelper_pm_callback, 0); | ||
252 | } | 343 | } |
344 | #else /* CONFIG_PM */ | ||
345 | #define usermodehelper_disabled 0 | ||
346 | |||
347 | static inline void helper_lock(void) {} | ||
348 | static inline void helper_unlock(void) {} | ||
349 | static inline void register_pm_notifier_callback(void) {} | ||
350 | #endif /* CONFIG_PM */ | ||
253 | 351 | ||
254 | /** | 352 | /** |
255 | * call_usermodehelper_keys - start a usermode application | 353 | * call_usermodehelper_setup - prepare to call a usermode helper |
256 | * @path: pathname for the application | 354 | * @path - path to usermode executable |
257 | * @argv: null-terminated argument list | 355 | * @argv - arg vector for process |
258 | * @envp: null-terminated environment list | 356 | * @envp - environment for process |
259 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 357 | * |
358 | * Returns either NULL on allocation failure, or a subprocess_info | ||
359 | * structure. This should be passed to call_usermodehelper_exec to | ||
360 | * exec the process and free the structure. | ||
361 | */ | ||
362 | struct subprocess_info *call_usermodehelper_setup(char *path, | ||
363 | char **argv, char **envp) | ||
364 | { | ||
365 | struct subprocess_info *sub_info; | ||
366 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | ||
367 | if (!sub_info) | ||
368 | goto out; | ||
369 | |||
370 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
371 | sub_info->path = path; | ||
372 | sub_info->argv = argv; | ||
373 | sub_info->envp = envp; | ||
374 | |||
375 | out: | ||
376 | return sub_info; | ||
377 | } | ||
378 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
379 | |||
380 | /** | ||
381 | * call_usermodehelper_setkeys - set the session keys for usermode helper | ||
382 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
383 | * @session_keyring: the session keyring for the process | ||
384 | */ | ||
385 | void call_usermodehelper_setkeys(struct subprocess_info *info, | ||
386 | struct key *session_keyring) | ||
387 | { | ||
388 | info->ring = session_keyring; | ||
389 | } | ||
390 | EXPORT_SYMBOL(call_usermodehelper_setkeys); | ||
391 | |||
392 | /** | ||
393 | * call_usermodehelper_setcleanup - set a cleanup function | ||
394 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
395 | * @cleanup: a cleanup function | ||
396 | * | ||
397 | * The cleanup function is called just before the subprocess_info is | ||
398 | * about to be freed. This can be used for freeing the argv and envp. The | ||
399 | * function must be runnable in either a process context or the | ||
400 | * context in which call_usermodehelper_exec is called. | ||
401 | */ | ||
402 | void call_usermodehelper_setcleanup(struct subprocess_info *info, | ||
403 | void (*cleanup)(char **argv, char **envp)) | ||
404 | { | ||
405 | info->cleanup = cleanup; | ||
406 | } | ||
407 | EXPORT_SYMBOL(call_usermodehelper_setcleanup); | ||
408 | |||
409 | /** | ||
410 | * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin | ||
411 | * @sub_info: a subprocess_info returned by call_usermodehelper_setup | ||
412 | * @filp: set to the write-end of a pipe | ||
413 | * | ||
414 | * This constructs a pipe, and sets the read end to be the stdin of the | ||
415 | * subprocess, and returns the write-end in *@filp. | ||
416 | */ | ||
417 | int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, | ||
418 | struct file **filp) | ||
419 | { | ||
420 | struct file *f; | ||
421 | |||
422 | f = create_write_pipe(); | ||
423 | if (IS_ERR(f)) | ||
424 | return PTR_ERR(f); | ||
425 | *filp = f; | ||
426 | |||
427 | f = create_read_pipe(f); | ||
428 | if (IS_ERR(f)) { | ||
429 | free_write_pipe(*filp); | ||
430 | return PTR_ERR(f); | ||
431 | } | ||
432 | sub_info->stdin = f; | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | EXPORT_SYMBOL(call_usermodehelper_stdinpipe); | ||
437 | |||
438 | /** | ||
439 | * call_usermodehelper_exec - start a usermode application | ||
440 | * @sub_info: information about the subprocess | ||
260 | * @wait: wait for the application to finish and return status. | 441 | * @wait: wait for the application to finish and return status. |
261 | * when -1 don't wait at all, but you get no useful error back when | 442 | * when -1 don't wait at all, but you get no useful error back when |
262 | * the program couldn't be exec'ed. This makes it safe to call | 443 | * the program couldn't be exec'ed. This makes it safe to call |
@@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work) | |||
265 | * Runs a user-space application. The application is started | 446 | * Runs a user-space application. The application is started |
266 | * asynchronously if wait is not set, and runs as a child of keventd. | 447 | * asynchronously if wait is not set, and runs as a child of keventd. |
267 | * (ie. it runs with full root capabilities). | 448 | * (ie. it runs with full root capabilities). |
268 | * | ||
269 | * Must be called from process context. Returns a negative error code | ||
270 | * if program was not execed successfully, or 0. | ||
271 | */ | 449 | */ |
272 | int call_usermodehelper_keys(char *path, char **argv, char **envp, | 450 | int call_usermodehelper_exec(struct subprocess_info *sub_info, |
273 | struct key *session_keyring, int wait) | 451 | enum umh_wait wait) |
274 | { | 452 | { |
275 | DECLARE_COMPLETION_ONSTACK(done); | 453 | DECLARE_COMPLETION_ONSTACK(done); |
276 | struct subprocess_info *sub_info; | ||
277 | int retval; | 454 | int retval; |
278 | 455 | ||
279 | if (!khelper_wq) | 456 | helper_lock(); |
280 | return -EBUSY; | 457 | if (sub_info->path[0] == '\0') { |
281 | 458 | retval = 0; | |
282 | if (path[0] == '\0') | 459 | goto out; |
283 | return 0; | 460 | } |
284 | 461 | ||
285 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | 462 | if (!khelper_wq || usermodehelper_disabled) { |
286 | if (!sub_info) | 463 | retval = -EBUSY; |
287 | return -ENOMEM; | 464 | goto out; |
465 | } | ||
288 | 466 | ||
289 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
290 | sub_info->complete = &done; | 467 | sub_info->complete = &done; |
291 | sub_info->path = path; | ||
292 | sub_info->argv = argv; | ||
293 | sub_info->envp = envp; | ||
294 | sub_info->ring = session_keyring; | ||
295 | sub_info->wait = wait; | 468 | sub_info->wait = wait; |
296 | 469 | ||
297 | queue_work(khelper_wq, &sub_info->work); | 470 | queue_work(khelper_wq, &sub_info->work); |
298 | if (wait < 0) /* task has freed sub_info */ | 471 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
299 | return 0; | 472 | return 0; |
300 | wait_for_completion(&done); | 473 | wait_for_completion(&done); |
301 | retval = sub_info->retval; | 474 | retval = sub_info->retval; |
302 | kfree(sub_info); | 475 | |
476 | out: | ||
477 | call_usermodehelper_freeinfo(sub_info); | ||
478 | helper_unlock(); | ||
303 | return retval; | 479 | return retval; |
304 | } | 480 | } |
305 | EXPORT_SYMBOL(call_usermodehelper_keys); | 481 | EXPORT_SYMBOL(call_usermodehelper_exec); |
306 | 482 | ||
483 | /** | ||
484 | * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin | ||
485 | * @path: path to usermode executable | ||
486 | * @argv: arg vector for process | ||
487 | * @envp: environment for process | ||
488 | * @filp: set to the write-end of a pipe | ||
489 | * | ||
490 | * This is a simple wrapper which executes a usermode-helper function | ||
491 | * with a pipe as stdin. It is implemented entirely in terms of | ||
492 | * lower-level call_usermodehelper_* functions. | ||
493 | */ | ||
307 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, | 494 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, |
308 | struct file **filp) | 495 | struct file **filp) |
309 | { | 496 | { |
310 | DECLARE_COMPLETION(done); | 497 | struct subprocess_info *sub_info; |
311 | struct subprocess_info sub_info = { | 498 | int ret; |
312 | .work = __WORK_INITIALIZER(sub_info.work, | ||
313 | __call_usermodehelper), | ||
314 | .complete = &done, | ||
315 | .path = path, | ||
316 | .argv = argv, | ||
317 | .envp = envp, | ||
318 | .retval = 0, | ||
319 | }; | ||
320 | struct file *f; | ||
321 | |||
322 | if (!khelper_wq) | ||
323 | return -EBUSY; | ||
324 | 499 | ||
325 | if (path[0] == '\0') | 500 | sub_info = call_usermodehelper_setup(path, argv, envp); |
326 | return 0; | 501 | if (sub_info == NULL) |
502 | return -ENOMEM; | ||
327 | 503 | ||
328 | f = create_write_pipe(); | 504 | ret = call_usermodehelper_stdinpipe(sub_info, filp); |
329 | if (IS_ERR(f)) | 505 | if (ret < 0) |
330 | return PTR_ERR(f); | 506 | goto out; |
331 | *filp = f; | ||
332 | 507 | ||
333 | f = create_read_pipe(f); | 508 | return call_usermodehelper_exec(sub_info, 1); |
334 | if (IS_ERR(f)) { | ||
335 | free_write_pipe(*filp); | ||
336 | return PTR_ERR(f); | ||
337 | } | ||
338 | sub_info.stdin = f; | ||
339 | 509 | ||
340 | queue_work(khelper_wq, &sub_info.work); | 510 | out: |
341 | wait_for_completion(&done); | 511 | call_usermodehelper_freeinfo(sub_info); |
342 | return sub_info.retval; | 512 | return ret; |
343 | } | 513 | } |
344 | EXPORT_SYMBOL(call_usermodehelper_pipe); | 514 | EXPORT_SYMBOL(call_usermodehelper_pipe); |
345 | 515 | ||
@@ -347,4 +517,5 @@ void __init usermodehelper_init(void) | |||
347 | { | 517 | { |
348 | khelper_wq = create_singlethread_workqueue("khelper"); | 518 | khelper_wq = create_singlethread_workqueue("khelper"); |
349 | BUG_ON(!khelper_wq); | 519 | BUG_ON(!khelper_wq); |
520 | register_pm_notifier_callback(); | ||
350 | } | 521 | } |
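The kmod.c changes replace the single call_usermodehelper_keys() entry point with a setup/configure/exec sequence plus an optional cleanup callback. A hedged sketch of how a caller might string the new primitives together; the helper path, arguments and environment below are made up for illustration:

/* Illustrative caller of the split usermode-helper API shown above;
 * "/sbin/example-helper" and its argv/envp are hypothetical. */
static int run_example_helper(void)
{
	struct subprocess_info *info;
	char *argv[] = { "/sbin/example-helper", "--ping", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (!info)
		return -ENOMEM;

	/* optional extras before exec:
	 *   call_usermodehelper_setkeys(info, session_keyring);
	 *   call_usermodehelper_stdinpipe(info, &filp);
	 *   call_usermodehelper_setcleanup(info, my_cleanup);
	 */

	/* Start the helper and wait for it to exit; on return the
	 * subprocess_info has been freed by call_usermodehelper_exec(). */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}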
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47d8c493..3e9f513a72 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = { | |||
675 | .priority = 0x7fffffff /* we need to be notified first */ | 675 | .priority = 0x7fffffff /* we need to be notified first */ |
676 | }; | 676 | }; |
677 | 677 | ||
678 | unsigned long __weak arch_deref_entry_point(void *entry) | ||
679 | { | ||
680 | return (unsigned long)entry; | ||
681 | } | ||
678 | 682 | ||
679 | int __kprobes register_jprobe(struct jprobe *jp) | 683 | int __kprobes register_jprobe(struct jprobe *jp) |
680 | { | 684 | { |
685 | unsigned long addr = arch_deref_entry_point(jp->entry); | ||
686 | |||
687 | if (!kernel_text_address(addr)) | ||
688 | return -EINVAL; | ||
689 | |||
681 | /* Todo: Verify probepoint is a function entry point */ | 690 | /* Todo: Verify probepoint is a function entry point */ |
682 | jp->kp.pre_handler = setjmp_pre_handler; | 691 | jp->kp.pre_handler = setjmp_pre_handler; |
683 | jp->kp.break_handler = longjmp_break_handler; | 692 | jp->kp.break_handler = longjmp_break_handler; |
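register_jprobe() now sanity-checks that jp->entry really points into kernel text, going through the new __weak arch_deref_entry_point() hook first so that architectures whose C function pointers are descriptors rather than code addresses can supply their own dereference. A hedged sketch of such an override; the descriptor layout and struct name are hypothetical, not any particular architecture's ABI:

/* Hypothetical arch override: function pointers point at a descriptor
 * whose first word is the actual code address. */
struct func_desc {
	unsigned long entry;	/* address of the first instruction */
	unsigned long toc;	/* e.g. a TOC/GOT pointer */
};

unsigned long arch_deref_entry_point(void *entry)
{
	return ((struct func_desc *)entry)->entry;
}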
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 559deca5ed..d0e5c48e18 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) | |||
62 | KERNEL_ATTR_RO(kexec_crash_loaded); | 62 | KERNEL_ATTR_RO(kexec_crash_loaded); |
63 | #endif /* CONFIG_KEXEC */ | 63 | #endif /* CONFIG_KEXEC */ |
64 | 64 | ||
65 | /* | ||
66 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | ||
67 | */ | ||
68 | extern const void __start_notes __attribute__((weak)); | ||
69 | extern const void __stop_notes __attribute__((weak)); | ||
70 | #define notes_size (&__stop_notes - &__start_notes) | ||
71 | |||
72 | static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, | ||
73 | char *buf, loff_t off, size_t count) | ||
74 | { | ||
75 | memcpy(buf, &__start_notes + off, count); | ||
76 | return count; | ||
77 | } | ||
78 | |||
79 | static struct bin_attribute notes_attr = { | ||
80 | .attr = { | ||
81 | .name = "notes", | ||
82 | .mode = S_IRUGO, | ||
83 | }, | ||
84 | .read = ¬es_read, | ||
85 | }; | ||
86 | |||
65 | decl_subsys(kernel, NULL, NULL); | 87 | decl_subsys(kernel, NULL, NULL); |
66 | EXPORT_SYMBOL_GPL(kernel_subsys); | 88 | EXPORT_SYMBOL_GPL(kernel_subsys); |
67 | 89 | ||
@@ -88,6 +110,12 @@ static int __init ksysfs_init(void) | |||
88 | error = sysfs_create_group(&kernel_subsys.kobj, | 110 | error = sysfs_create_group(&kernel_subsys.kobj, |
89 | &kernel_attr_group); | 111 | &kernel_attr_group); |
90 | 112 | ||
113 | if (!error && notes_size > 0) { | ||
114 | notes_attr.size = notes_size; | ||
115 | error = sysfs_create_bin_file(&kernel_subsys.kobj, | ||
116 | ¬es_attr); | ||
117 | } | ||
118 | |||
91 | return error; | 119 | return error; |
92 | } | 120 | } |
93 | 121 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index bbd51b81a3..a404f7ee73 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k) | |||
215 | EXPORT_SYMBOL(kthread_stop); | 215 | EXPORT_SYMBOL(kthread_stop); |
216 | 216 | ||
217 | 217 | ||
218 | static __init void kthreadd_setup(void) | 218 | static noinline __init_refok void kthreadd_setup(void) |
219 | { | 219 | { |
220 | struct task_struct *tsk = current; | 220 | struct task_struct *tsk = current; |
221 | 221 | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1a5ff2211d..734da579ad 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -5,7 +5,8 @@ | |||
5 | * | 5 | * |
6 | * Started by Ingo Molnar: | 6 | * Started by Ingo Molnar: |
7 | * | 7 | * |
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 8 | * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
9 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
9 | * | 10 | * |
10 | * this code maps all the lock dependencies as they occur in a live kernel | 11 | * this code maps all the lock dependencies as they occur in a live kernel |
11 | * and will warn about the following classes of locking bugs: | 12 | * and will warn about the following classes of locking bugs: |
@@ -37,11 +38,26 @@ | |||
37 | #include <linux/debug_locks.h> | 38 | #include <linux/debug_locks.h> |
38 | #include <linux/irqflags.h> | 39 | #include <linux/irqflags.h> |
39 | #include <linux/utsname.h> | 40 | #include <linux/utsname.h> |
41 | #include <linux/hash.h> | ||
40 | 42 | ||
41 | #include <asm/sections.h> | 43 | #include <asm/sections.h> |
42 | 44 | ||
43 | #include "lockdep_internals.h" | 45 | #include "lockdep_internals.h" |
44 | 46 | ||
47 | #ifdef CONFIG_PROVE_LOCKING | ||
48 | int prove_locking = 1; | ||
49 | module_param(prove_locking, int, 0644); | ||
50 | #else | ||
51 | #define prove_locking 0 | ||
52 | #endif | ||
53 | |||
54 | #ifdef CONFIG_LOCK_STAT | ||
55 | int lock_stat = 1; | ||
56 | module_param(lock_stat, int, 0644); | ||
57 | #else | ||
58 | #define lock_stat 0 | ||
59 | #endif | ||
60 | |||
45 | /* | 61 | /* |
46 | * lockdep_lock: protects the lockdep graph, the hashes and the | 62 | * lockdep_lock: protects the lockdep graph, the hashes and the |
47 | * class/list/hash allocators. | 63 | * class/list/hash allocators. |
@@ -96,23 +112,6 @@ unsigned long nr_list_entries; | |||
96 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | 112 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; |
97 | 113 | ||
98 | /* | 114 | /* |
99 | * Allocate a lockdep entry. (assumes the graph_lock held, returns | ||
100 | * with NULL on failure) | ||
101 | */ | ||
102 | static struct lock_list *alloc_list_entry(void) | ||
103 | { | ||
104 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
105 | if (!debug_locks_off_graph_unlock()) | ||
106 | return NULL; | ||
107 | |||
108 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
109 | printk("turning off the locking correctness validator.\n"); | ||
110 | return NULL; | ||
111 | } | ||
112 | return list_entries + nr_list_entries++; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * All data structures here are protected by the global debug_lock. | 115 | * All data structures here are protected by the global debug_lock. |
117 | * | 116 | * |
118 | * Mutex key structs only get allocated, once during bootup, and never | 117 | * Mutex key structs only get allocated, once during bootup, and never |
@@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void) | |||
121 | unsigned long nr_lock_classes; | 120 | unsigned long nr_lock_classes; |
122 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | 121 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; |
123 | 122 | ||
123 | #ifdef CONFIG_LOCK_STAT | ||
124 | static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); | ||
125 | |||
126 | static int lock_contention_point(struct lock_class *class, unsigned long ip) | ||
127 | { | ||
128 | int i; | ||
129 | |||
130 | for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { | ||
131 | if (class->contention_point[i] == 0) { | ||
132 | class->contention_point[i] = ip; | ||
133 | break; | ||
134 | } | ||
135 | if (class->contention_point[i] == ip) | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | return i; | ||
140 | } | ||
141 | |||
142 | static void lock_time_inc(struct lock_time *lt, s64 time) | ||
143 | { | ||
144 | if (time > lt->max) | ||
145 | lt->max = time; | ||
146 | |||
147 | if (time < lt->min || !lt->min) | ||
148 | lt->min = time; | ||
149 | |||
150 | lt->total += time; | ||
151 | lt->nr++; | ||
152 | } | ||
153 | |||
154 | static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) | ||
155 | { | ||
156 | dst->min += src->min; | ||
157 | dst->max += src->max; | ||
158 | dst->total += src->total; | ||
159 | dst->nr += src->nr; | ||
160 | } | ||
161 | |||
162 | struct lock_class_stats lock_stats(struct lock_class *class) | ||
163 | { | ||
164 | struct lock_class_stats stats; | ||
165 | int cpu, i; | ||
166 | |||
167 | memset(&stats, 0, sizeof(struct lock_class_stats)); | ||
168 | for_each_possible_cpu(cpu) { | ||
169 | struct lock_class_stats *pcs = | ||
170 | &per_cpu(lock_stats, cpu)[class - lock_classes]; | ||
171 | |||
172 | for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) | ||
173 | stats.contention_point[i] += pcs->contention_point[i]; | ||
174 | |||
175 | lock_time_add(&pcs->read_waittime, &stats.read_waittime); | ||
176 | lock_time_add(&pcs->write_waittime, &stats.write_waittime); | ||
177 | |||
178 | lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); | ||
179 | lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); | ||
180 | |||
181 | for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) | ||
182 | stats.bounces[i] += pcs->bounces[i]; | ||
183 | } | ||
184 | |||
185 | return stats; | ||
186 | } | ||
187 | |||
188 | void clear_lock_stats(struct lock_class *class) | ||
189 | { | ||
190 | int cpu; | ||
191 | |||
192 | for_each_possible_cpu(cpu) { | ||
193 | struct lock_class_stats *cpu_stats = | ||
194 | &per_cpu(lock_stats, cpu)[class - lock_classes]; | ||
195 | |||
196 | memset(cpu_stats, 0, sizeof(struct lock_class_stats)); | ||
197 | } | ||
198 | memset(class->contention_point, 0, sizeof(class->contention_point)); | ||
199 | } | ||
200 | |||
201 | static struct lock_class_stats *get_lock_stats(struct lock_class *class) | ||
202 | { | ||
203 | return &get_cpu_var(lock_stats)[class - lock_classes]; | ||
204 | } | ||
205 | |||
206 | static void put_lock_stats(struct lock_class_stats *stats) | ||
207 | { | ||
208 | put_cpu_var(lock_stats); | ||
209 | } | ||
210 | |||
211 | static void lock_release_holdtime(struct held_lock *hlock) | ||
212 | { | ||
213 | struct lock_class_stats *stats; | ||
214 | s64 holdtime; | ||
215 | |||
216 | if (!lock_stat) | ||
217 | return; | ||
218 | |||
219 | holdtime = sched_clock() - hlock->holdtime_stamp; | ||
220 | |||
221 | stats = get_lock_stats(hlock->class); | ||
222 | if (hlock->read) | ||
223 | lock_time_inc(&stats->read_holdtime, holdtime); | ||
224 | else | ||
225 | lock_time_inc(&stats->write_holdtime, holdtime); | ||
226 | put_lock_stats(stats); | ||
227 | } | ||
228 | #else | ||
229 | static inline void lock_release_holdtime(struct held_lock *hlock) | ||
230 | { | ||
231 | } | ||
232 | #endif | ||
233 | |||
124 | /* | 234 | /* |
125 | * We keep a global list of all lock classes. The list only grows, | 235 | * We keep a global list of all lock classes. The list only grows, |
126 | * never shrinks. The list is only accessed with the lockdep | 236 | * never shrinks. The list is only accessed with the lockdep |
@@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes); | |||
133 | */ | 243 | */ |
134 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) | 244 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) |
135 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) | 245 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) |
136 | #define CLASSHASH_MASK (CLASSHASH_SIZE - 1) | 246 | #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) |
137 | #define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) | ||
138 | #define classhashentry(key) (classhash_table + __classhashfn((key))) | 247 | #define classhashentry(key) (classhash_table + __classhashfn((key))) |
139 | 248 | ||
140 | static struct list_head classhash_table[CLASSHASH_SIZE]; | 249 | static struct list_head classhash_table[CLASSHASH_SIZE]; |
141 | 250 | ||
142 | unsigned long nr_lock_chains; | ||
143 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; | ||
144 | |||
145 | /* | 251 | /* |
146 | * We put the lock dependency chains into a hash-table as well, to cache | 252 | * We put the lock dependency chains into a hash-table as well, to cache |
147 | * their existence: | 253 | * their existence: |
148 | */ | 254 | */ |
149 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) | 255 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) |
150 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) | 256 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) |
151 | #define CHAINHASH_MASK (CHAINHASH_SIZE - 1) | 257 | #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) |
152 | #define __chainhashfn(chain) \ | ||
153 | (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) | ||
154 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) | 258 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) |
155 | 259 | ||
156 | static struct list_head chainhash_table[CHAINHASH_SIZE]; | 260 | static struct list_head chainhash_table[CHAINHASH_SIZE]; |
@@ -223,26 +327,6 @@ static int verbose(struct lock_class *class) | |||
223 | return 0; | 327 | return 0; |
224 | } | 328 | } |
225 | 329 | ||
226 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
227 | |||
228 | static int hardirq_verbose(struct lock_class *class) | ||
229 | { | ||
230 | #if HARDIRQ_VERBOSE | ||
231 | return class_filter(class); | ||
232 | #endif | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static int softirq_verbose(struct lock_class *class) | ||
237 | { | ||
238 | #if SOFTIRQ_VERBOSE | ||
239 | return class_filter(class); | ||
240 | #endif | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | #endif | ||
245 | |||
246 | /* | 330 | /* |
247 | * Stack-trace: tightly packed array of stack backtrace | 331 | * Stack-trace: tightly packed array of stack backtrace |
248 | * addresses. Protected by the graph_lock. | 332 | * addresses. Protected by the graph_lock. |
@@ -291,6 +375,11 @@ unsigned int max_recursion_depth; | |||
291 | * about it later on, in lockdep_info(). | 375 | * about it later on, in lockdep_info(). |
292 | */ | 376 | */ |
293 | static int lockdep_init_error; | 377 | static int lockdep_init_error; |
378 | static unsigned long lockdep_init_trace_data[20]; | ||
379 | static struct stack_trace lockdep_init_trace = { | ||
380 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), | ||
381 | .entries = lockdep_init_trace_data, | ||
382 | }; | ||
294 | 383 | ||
295 | /* | 384 | /* |
296 | * Various lockdep statistics: | 385 | * Various lockdep statistics: |
@@ -379,7 +468,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 | |||
379 | 468 | ||
380 | static void print_lock_name(struct lock_class *class) | 469 | static void print_lock_name(struct lock_class *class) |
381 | { | 470 | { |
382 | char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; | 471 | char str[KSYM_NAME_LEN], c1, c2, c3, c4; |
383 | const char *name; | 472 | const char *name; |
384 | 473 | ||
385 | get_usage_chars(class, &c1, &c2, &c3, &c4); | 474 | get_usage_chars(class, &c1, &c2, &c3, &c4); |
@@ -401,7 +490,7 @@ static void print_lock_name(struct lock_class *class) | |||
401 | static void print_lockdep_cache(struct lockdep_map *lock) | 490 | static void print_lockdep_cache(struct lockdep_map *lock) |
402 | { | 491 | { |
403 | const char *name; | 492 | const char *name; |
404 | char str[KSYM_NAME_LEN + 1]; | 493 | char str[KSYM_NAME_LEN]; |
405 | 494 | ||
406 | name = lock->name; | 495 | name = lock->name; |
407 | if (!name) | 496 | if (!name) |
@@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
482 | } | 571 | } |
483 | } | 572 | } |
484 | 573 | ||
574 | static void print_kernel_version(void) | ||
575 | { | ||
576 | printk("%s %.*s\n", init_utsname()->release, | ||
577 | (int)strcspn(init_utsname()->version, " "), | ||
578 | init_utsname()->version); | ||
579 | } | ||
580 | |||
581 | static int very_verbose(struct lock_class *class) | ||
582 | { | ||
583 | #if VERY_VERBOSE | ||
584 | return class_filter(class); | ||
585 | #endif | ||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Is this the address of a static object: | ||
591 | */ | ||
592 | static int static_obj(void *obj) | ||
593 | { | ||
594 | unsigned long start = (unsigned long) &_stext, | ||
595 | end = (unsigned long) &_end, | ||
596 | addr = (unsigned long) obj; | ||
597 | #ifdef CONFIG_SMP | ||
598 | int i; | ||
599 | #endif | ||
600 | |||
601 | /* | ||
602 | * static variable? | ||
603 | */ | ||
604 | if ((addr >= start) && (addr < end)) | ||
605 | return 1; | ||
606 | |||
607 | #ifdef CONFIG_SMP | ||
608 | /* | ||
609 | * percpu var? | ||
610 | */ | ||
611 | for_each_possible_cpu(i) { | ||
612 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
613 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM | ||
614 | + per_cpu_offset(i); | ||
615 | |||
616 | if ((addr >= start) && (addr < end)) | ||
617 | return 1; | ||
618 | } | ||
619 | #endif | ||
620 | |||
621 | /* | ||
622 | * module var? | ||
623 | */ | ||
624 | return is_module_address(addr); | ||
625 | } | ||
626 | |||
627 | /* | ||
628 | * To make lock name printouts unique, we calculate a unique | ||
629 | * class->name_version generation counter: | ||
630 | */ | ||
631 | static int count_matching_names(struct lock_class *new_class) | ||
632 | { | ||
633 | struct lock_class *class; | ||
634 | int count = 0; | ||
635 | |||
636 | if (!new_class->name) | ||
637 | return 0; | ||
638 | |||
639 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
640 | if (new_class->key - new_class->subclass == class->key) | ||
641 | return class->name_version; | ||
642 | if (class->name && !strcmp(class->name, new_class->name)) | ||
643 | count = max(count, class->name_version); | ||
644 | } | ||
645 | |||
646 | return count + 1; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * Register a lock's class in the hash-table, if the class is not present | ||
651 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
652 | * itself, so actual lookup of the hash should be once per lock object. | ||
653 | */ | ||
654 | static inline struct lock_class * | ||
655 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
656 | { | ||
657 | struct lockdep_subclass_key *key; | ||
658 | struct list_head *hash_head; | ||
659 | struct lock_class *class; | ||
660 | |||
661 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
662 | /* | ||
663 | * If the architecture calls into lockdep before initializing | ||
664 | * the hashes then we'll warn about it later. (we cannot printk | ||
665 | * right now) | ||
666 | */ | ||
667 | if (unlikely(!lockdep_initialized)) { | ||
668 | lockdep_init(); | ||
669 | lockdep_init_error = 1; | ||
670 | save_stack_trace(&lockdep_init_trace); | ||
671 | } | ||
672 | #endif | ||
673 | |||
674 | /* | ||
675 | * Static locks do not have their class-keys yet - for them the key | ||
676 | * is the lock object itself: | ||
677 | */ | ||
678 | if (unlikely(!lock->key)) | ||
679 | lock->key = (void *)lock; | ||
680 | |||
681 | /* | ||
682 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
683 | * lock_class_key variable is passed in through the mutex_init() | ||
684 | * (or spin_lock_init()) call - which acts as the key. For static | ||
685 | * locks we use the lock object itself as the key. | ||
686 | */ | ||
687 | BUILD_BUG_ON(sizeof(struct lock_class_key) > | ||
688 | sizeof(struct lockdep_map)); | ||
689 | |||
690 | key = lock->key->subkeys + subclass; | ||
691 | |||
692 | hash_head = classhashentry(key); | ||
693 | |||
694 | /* | ||
695 | * We can walk the hash lockfree, because the hash only | ||
696 | * grows, and we are careful when adding entries to the end: | ||
697 | */ | ||
698 | list_for_each_entry(class, hash_head, hash_entry) { | ||
699 | if (class->key == key) { | ||
700 | WARN_ON_ONCE(class->name != lock->name); | ||
701 | return class; | ||
702 | } | ||
703 | } | ||
704 | |||
705 | return NULL; | ||
706 | } | ||
707 | |||
708 | /* | ||
709 | * Register a lock's class in the hash-table, if the class is not present | ||
710 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
711 | * itself, so actual lookup of the hash should be once per lock object. | ||
712 | */ | ||
713 | static inline struct lock_class * | ||
714 | register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | ||
715 | { | ||
716 | struct lockdep_subclass_key *key; | ||
717 | struct list_head *hash_head; | ||
718 | struct lock_class *class; | ||
719 | unsigned long flags; | ||
720 | |||
721 | class = look_up_lock_class(lock, subclass); | ||
722 | if (likely(class)) | ||
723 | return class; | ||
724 | |||
725 | /* | ||
726 | * Debug-check: all keys must be persistent! | ||
727 | */ | ||
728 | if (!static_obj(lock->key)) { | ||
729 | debug_locks_off(); | ||
730 | printk("INFO: trying to register non-static key.\n"); | ||
731 | printk("the code is fine but needs lockdep annotation.\n"); | ||
732 | printk("turning off the locking correctness validator.\n"); | ||
733 | dump_stack(); | ||
734 | |||
735 | return NULL; | ||
736 | } | ||
737 | |||
738 | key = lock->key->subkeys + subclass; | ||
739 | hash_head = classhashentry(key); | ||
740 | |||
741 | raw_local_irq_save(flags); | ||
742 | if (!graph_lock()) { | ||
743 | raw_local_irq_restore(flags); | ||
744 | return NULL; | ||
745 | } | ||
746 | /* | ||
747 | * We have to do the hash-walk again, to avoid races | ||
748 | * with another CPU: | ||
749 | */ | ||
750 | list_for_each_entry(class, hash_head, hash_entry) | ||
751 | if (class->key == key) | ||
752 | goto out_unlock_set; | ||
753 | /* | ||
754 | * Allocate a new key from the static array, and add it to | ||
755 | * the hash: | ||
756 | */ | ||
757 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
758 | if (!debug_locks_off_graph_unlock()) { | ||
759 | raw_local_irq_restore(flags); | ||
760 | return NULL; | ||
761 | } | ||
762 | raw_local_irq_restore(flags); | ||
763 | |||
764 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
765 | printk("turning off the locking correctness validator.\n"); | ||
766 | return NULL; | ||
767 | } | ||
768 | class = lock_classes + nr_lock_classes++; | ||
769 | debug_atomic_inc(&nr_unused_locks); | ||
770 | class->key = key; | ||
771 | class->name = lock->name; | ||
772 | class->subclass = subclass; | ||
773 | INIT_LIST_HEAD(&class->lock_entry); | ||
774 | INIT_LIST_HEAD(&class->locks_before); | ||
775 | INIT_LIST_HEAD(&class->locks_after); | ||
776 | class->name_version = count_matching_names(class); | ||
777 | /* | ||
778 | * We use RCU's safe list-add method to make | ||
779 | * parallel walking of the hash-list safe: | ||
780 | */ | ||
781 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
782 | |||
783 | if (verbose(class)) { | ||
784 | graph_unlock(); | ||
785 | raw_local_irq_restore(flags); | ||
786 | |||
787 | printk("\nnew class %p: %s", class->key, class->name); | ||
788 | if (class->name_version > 1) | ||
789 | printk("#%d", class->name_version); | ||
790 | printk("\n"); | ||
791 | dump_stack(); | ||
792 | |||
793 | raw_local_irq_save(flags); | ||
794 | if (!graph_lock()) { | ||
795 | raw_local_irq_restore(flags); | ||
796 | return NULL; | ||
797 | } | ||
798 | } | ||
799 | out_unlock_set: | ||
800 | graph_unlock(); | ||
801 | raw_local_irq_restore(flags); | ||
802 | |||
803 | if (!subclass || force) | ||
804 | lock->class_cache = class; | ||
805 | |||
806 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | ||
807 | return NULL; | ||
808 | |||
809 | return class; | ||
810 | } | ||
811 | |||
812 | #ifdef CONFIG_PROVE_LOCKING | ||
813 | /* | ||
814 | * Allocate a lockdep entry. (assumes the graph_lock held, returns | ||
815 | * with NULL on failure) | ||
816 | */ | ||
817 | static struct lock_list *alloc_list_entry(void) | ||
818 | { | ||
819 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
820 | if (!debug_locks_off_graph_unlock()) | ||
821 | return NULL; | ||
822 | |||
823 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
824 | printk("turning off the locking correctness validator.\n"); | ||
825 | return NULL; | ||
826 | } | ||
827 | return list_entries + nr_list_entries++; | ||
828 | } | ||
829 | |||
485 | /* | 830 | /* |
486 | * Add a new dependency to the head of the list: | 831 | * Add a new dependency to the head of the list: |
487 | */ | 832 | */ |
@@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) | |||
542 | return 0; | 887 | return 0; |
543 | } | 888 | } |
544 | 889 | ||
545 | static void print_kernel_version(void) | ||
546 | { | ||
547 | printk("%s %.*s\n", init_utsname()->release, | ||
548 | (int)strcspn(init_utsname()->version, " "), | ||
549 | init_utsname()->version); | ||
550 | } | ||
551 | |||
552 | /* | 890 | /* |
553 | * When a circular dependency is detected, print the | 891 | * When a circular dependency is detected, print the |
554 | * header first: | 892 | * header first: |
@@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) | |||
640 | return 1; | 978 | return 1; |
641 | } | 979 | } |
642 | 980 | ||
643 | static int very_verbose(struct lock_class *class) | ||
644 | { | ||
645 | #if VERY_VERBOSE | ||
646 | return class_filter(class); | ||
647 | #endif | ||
648 | return 0; | ||
649 | } | ||
650 | #ifdef CONFIG_TRACE_IRQFLAGS | 981 | #ifdef CONFIG_TRACE_IRQFLAGS |
651 | |||
652 | /* | 982 | /* |
653 | * Forwards and backwards subgraph searching, for the purposes of | 983 | * Forwards and backwards subgraph searching, for the purposes of |
654 | * proving that two subgraphs can be connected by a new dependency | 984 | * proving that two subgraphs can be connected by a new dependency |
@@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev, | |||
821 | bit_backwards, bit_forwards, irqclass); | 1151 | bit_backwards, bit_forwards, irqclass); |
822 | } | 1152 | } |
823 | 1153 | ||
1154 | static int | ||
1155 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | ||
1156 | struct held_lock *next) | ||
1157 | { | ||
1158 | /* | ||
1159 | * Prove that the new dependency does not connect a hardirq-safe | ||
1160 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
1161 | * the backwards-subgraph starting at <prev>, and the | ||
1162 | * forwards-subgraph starting at <next>: | ||
1163 | */ | ||
1164 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
1165 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
1166 | return 0; | ||
1167 | |||
1168 | /* | ||
1169 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
1170 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
1171 | * the backwards-subgraph starting at <prev>, and the | ||
1172 | * forwards-subgraph starting at <next>: | ||
1173 | */ | ||
1174 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
1175 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
1176 | return 0; | ||
1177 | |||
1178 | /* | ||
1179 | * Prove that the new dependency does not connect a softirq-safe | ||
1180 | * lock with a softirq-unsafe lock - to achieve this we search | ||
1181 | * the backwards-subgraph starting at <prev>, and the | ||
1182 | * forwards-subgraph starting at <next>: | ||
1183 | */ | ||
1184 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
1185 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1186 | return 0; | ||
1187 | /* | ||
1188 | * Prove that the new dependency does not connect a softirq-safe-read | ||
1189 | * lock with a softirq-unsafe lock - to achieve this we search | ||
1190 | * the backwards-subgraph starting at <prev>, and the | ||
1191 | * forwards-subgraph starting at <next>: | ||
1192 | */ | ||
1193 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
1194 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1195 | return 0; | ||
1196 | |||
1197 | return 1; | ||
1198 | } | ||
1199 | |||
1200 | static void inc_chains(void) | ||
1201 | { | ||
1202 | if (current->hardirq_context) | ||
1203 | nr_hardirq_chains++; | ||
1204 | else { | ||
1205 | if (current->softirq_context) | ||
1206 | nr_softirq_chains++; | ||
1207 | else | ||
1208 | nr_process_chains++; | ||
1209 | } | ||
1210 | } | ||
1211 | |||
1212 | #else | ||
1213 | |||
1214 | static inline int | ||
1215 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | ||
1216 | struct held_lock *next) | ||
1217 | { | ||
1218 | return 1; | ||
1219 | } | ||
1220 | |||
1221 | static inline void inc_chains(void) | ||
1222 | { | ||
1223 | nr_process_chains++; | ||
1224 | } | ||
1225 | |||
824 | #endif | 1226 | #endif |
825 | 1227 | ||
826 | static int | 1228 | static int |
@@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
922 | if (!(check_noncircular(next->class, 0))) | 1324 | if (!(check_noncircular(next->class, 0))) |
923 | return print_circular_bug_tail(); | 1325 | return print_circular_bug_tail(); |
924 | 1326 | ||
925 | #ifdef CONFIG_TRACE_IRQFLAGS | 1327 | if (!check_prev_add_irq(curr, prev, next)) |
926 | /* | ||
927 | * Prove that the new dependency does not connect a hardirq-safe | ||
928 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
929 | * the backwards-subgraph starting at <prev>, and the | ||
930 | * forwards-subgraph starting at <next>: | ||
931 | */ | ||
932 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
933 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
934 | return 0; | 1328 | return 0; |
935 | 1329 | ||
936 | /* | 1330 | /* |
937 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
938 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
939 | * the backwards-subgraph starting at <prev>, and the | ||
940 | * forwards-subgraph starting at <next>: | ||
941 | */ | ||
942 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
943 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
944 | return 0; | ||
945 | |||
946 | /* | ||
947 | * Prove that the new dependency does not connect a softirq-safe | ||
948 | * lock with a softirq-unsafe lock - to achieve this we search | ||
949 | * the backwards-subgraph starting at <prev>, and the | ||
950 | * forwards-subgraph starting at <next>: | ||
951 | */ | ||
952 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
953 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
954 | return 0; | ||
955 | /* | ||
956 | * Prove that the new dependency does not connect a softirq-safe-read | ||
957 | * lock with a softirq-unsafe lock - to achieve this we search | ||
958 | * the backwards-subgraph starting at <prev>, and the | ||
959 | * forwards-subgraph starting at <next>: | ||
960 | */ | ||
961 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
962 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
963 | return 0; | ||
964 | #endif | ||
965 | /* | ||
966 | * For recursive read-locks we do all the dependency checks, | 1331 | * For recursive read-locks we do all the dependency checks, |
967 | * but we dont store read-triggered dependencies (only | 1332 | * but we dont store read-triggered dependencies (only |
968 | * write-triggered dependencies). This ensures that only the | 1333 | * write-triggered dependencies). This ensures that only the |
@@ -1088,224 +1453,8 @@ out_bug: | |||
1088 | return 0; | 1453 | return 0; |
1089 | } | 1454 | } |
1090 | 1455 | ||
1091 | 1456 | unsigned long nr_lock_chains; | |
1092 | /* | 1457 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; |
1093 | * Is this the address of a static object: | ||
1094 | */ | ||
1095 | static int static_obj(void *obj) | ||
1096 | { | ||
1097 | unsigned long start = (unsigned long) &_stext, | ||
1098 | end = (unsigned long) &_end, | ||
1099 | addr = (unsigned long) obj; | ||
1100 | #ifdef CONFIG_SMP | ||
1101 | int i; | ||
1102 | #endif | ||
1103 | |||
1104 | /* | ||
1105 | * static variable? | ||
1106 | */ | ||
1107 | if ((addr >= start) && (addr < end)) | ||
1108 | return 1; | ||
1109 | |||
1110 | #ifdef CONFIG_SMP | ||
1111 | /* | ||
1112 | * percpu var? | ||
1113 | */ | ||
1114 | for_each_possible_cpu(i) { | ||
1115 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
1116 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM | ||
1117 | + per_cpu_offset(i); | ||
1118 | |||
1119 | if ((addr >= start) && (addr < end)) | ||
1120 | return 1; | ||
1121 | } | ||
1122 | #endif | ||
1123 | |||
1124 | /* | ||
1125 | * module var? | ||
1126 | */ | ||
1127 | return is_module_address(addr); | ||
1128 | } | ||
1129 | |||
1130 | /* | ||
1131 | * To make lock name printouts unique, we calculate a unique | ||
1132 | * class->name_version generation counter: | ||
1133 | */ | ||
1134 | static int count_matching_names(struct lock_class *new_class) | ||
1135 | { | ||
1136 | struct lock_class *class; | ||
1137 | int count = 0; | ||
1138 | |||
1139 | if (!new_class->name) | ||
1140 | return 0; | ||
1141 | |||
1142 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
1143 | if (new_class->key - new_class->subclass == class->key) | ||
1144 | return class->name_version; | ||
1145 | if (class->name && !strcmp(class->name, new_class->name)) | ||
1146 | count = max(count, class->name_version); | ||
1147 | } | ||
1148 | |||
1149 | return count + 1; | ||
1150 | } | ||
1151 | |||
1152 | /* | ||
1153 | * Register a lock's class in the hash-table, if the class is not present | ||
1154 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
1155 | * itself, so actual lookup of the hash should be once per lock object. | ||
1156 | */ | ||
1157 | static inline struct lock_class * | ||
1158 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
1159 | { | ||
1160 | struct lockdep_subclass_key *key; | ||
1161 | struct list_head *hash_head; | ||
1162 | struct lock_class *class; | ||
1163 | |||
1164 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
1165 | /* | ||
1166 | * If the architecture calls into lockdep before initializing | ||
1167 | * the hashes then we'll warn about it later. (we cannot printk | ||
1168 | * right now) | ||
1169 | */ | ||
1170 | if (unlikely(!lockdep_initialized)) { | ||
1171 | lockdep_init(); | ||
1172 | lockdep_init_error = 1; | ||
1173 | } | ||
1174 | #endif | ||
1175 | |||
1176 | /* | ||
1177 | * Static locks do not have their class-keys yet - for them the key | ||
1178 | * is the lock object itself: | ||
1179 | */ | ||
1180 | if (unlikely(!lock->key)) | ||
1181 | lock->key = (void *)lock; | ||
1182 | |||
1183 | /* | ||
1184 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
1185 | * lock_class_key variable is passed in through the mutex_init() | ||
1186 | * (or spin_lock_init()) call - which acts as the key. For static | ||
1187 | * locks we use the lock object itself as the key. | ||
1188 | */ | ||
1189 | BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); | ||
1190 | |||
1191 | key = lock->key->subkeys + subclass; | ||
1192 | |||
1193 | hash_head = classhashentry(key); | ||
1194 | |||
1195 | /* | ||
1196 | * We can walk the hash lockfree, because the hash only | ||
1197 | * grows, and we are careful when adding entries to the end: | ||
1198 | */ | ||
1199 | list_for_each_entry(class, hash_head, hash_entry) | ||
1200 | if (class->key == key) | ||
1201 | return class; | ||
1202 | |||
1203 | return NULL; | ||
1204 | } | ||
1205 | |||
1206 | /* | ||
1207 | * Register a lock's class in the hash-table, if the class is not present | ||
1208 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
1209 | * itself, so actual lookup of the hash should be once per lock object. | ||
1210 | */ | ||
1211 | static inline struct lock_class * | ||
1212 | register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | ||
1213 | { | ||
1214 | struct lockdep_subclass_key *key; | ||
1215 | struct list_head *hash_head; | ||
1216 | struct lock_class *class; | ||
1217 | unsigned long flags; | ||
1218 | |||
1219 | class = look_up_lock_class(lock, subclass); | ||
1220 | if (likely(class)) | ||
1221 | return class; | ||
1222 | |||
1223 | /* | ||
1224 | * Debug-check: all keys must be persistent! | ||
1225 | */ | ||
1226 | if (!static_obj(lock->key)) { | ||
1227 | debug_locks_off(); | ||
1228 | printk("INFO: trying to register non-static key.\n"); | ||
1229 | printk("the code is fine but needs lockdep annotation.\n"); | ||
1230 | printk("turning off the locking correctness validator.\n"); | ||
1231 | dump_stack(); | ||
1232 | |||
1233 | return NULL; | ||
1234 | } | ||
1235 | |||
1236 | key = lock->key->subkeys + subclass; | ||
1237 | hash_head = classhashentry(key); | ||
1238 | |||
1239 | raw_local_irq_save(flags); | ||
1240 | if (!graph_lock()) { | ||
1241 | raw_local_irq_restore(flags); | ||
1242 | return NULL; | ||
1243 | } | ||
1244 | /* | ||
1245 | * We have to do the hash-walk again, to avoid races | ||
1246 | * with another CPU: | ||
1247 | */ | ||
1248 | list_for_each_entry(class, hash_head, hash_entry) | ||
1249 | if (class->key == key) | ||
1250 | goto out_unlock_set; | ||
1251 | /* | ||
1252 | * Allocate a new key from the static array, and add it to | ||
1253 | * the hash: | ||
1254 | */ | ||
1255 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
1256 | if (!debug_locks_off_graph_unlock()) { | ||
1257 | raw_local_irq_restore(flags); | ||
1258 | return NULL; | ||
1259 | } | ||
1260 | raw_local_irq_restore(flags); | ||
1261 | |||
1262 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
1263 | printk("turning off the locking correctness validator.\n"); | ||
1264 | return NULL; | ||
1265 | } | ||
1266 | class = lock_classes + nr_lock_classes++; | ||
1267 | debug_atomic_inc(&nr_unused_locks); | ||
1268 | class->key = key; | ||
1269 | class->name = lock->name; | ||
1270 | class->subclass = subclass; | ||
1271 | INIT_LIST_HEAD(&class->lock_entry); | ||
1272 | INIT_LIST_HEAD(&class->locks_before); | ||
1273 | INIT_LIST_HEAD(&class->locks_after); | ||
1274 | class->name_version = count_matching_names(class); | ||
1275 | /* | ||
1276 | * We use RCU's safe list-add method to make | ||
1277 | * parallel walking of the hash-list safe: | ||
1278 | */ | ||
1279 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
1280 | |||
1281 | if (verbose(class)) { | ||
1282 | graph_unlock(); | ||
1283 | raw_local_irq_restore(flags); | ||
1284 | |||
1285 | printk("\nnew class %p: %s", class->key, class->name); | ||
1286 | if (class->name_version > 1) | ||
1287 | printk("#%d", class->name_version); | ||
1288 | printk("\n"); | ||
1289 | dump_stack(); | ||
1290 | |||
1291 | raw_local_irq_save(flags); | ||
1292 | if (!graph_lock()) { | ||
1293 | raw_local_irq_restore(flags); | ||
1294 | return NULL; | ||
1295 | } | ||
1296 | } | ||
1297 | out_unlock_set: | ||
1298 | graph_unlock(); | ||
1299 | raw_local_irq_restore(flags); | ||
1300 | |||
1301 | if (!subclass || force) | ||
1302 | lock->class_cache = class; | ||
1303 | |||
1304 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | ||
1305 | return NULL; | ||
1306 | |||
1307 | return class; | ||
1308 | } | ||
1309 | 1458 | ||
1310 | /* | 1459 | /* |
1311 | * Look up a dependency chain. If the key is not present yet then | 1460 | * Look up a dependency chain. If the key is not present yet then |
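The static_obj() helper removed here (it moves earlier in the file) is what backs the "trying to register non-static key" report quoted above: a lock_class_key must live in the kernel image, in per-cpu data, or in a module, never on the stack or in kmalloc()ed memory. The same idea can be mimicked in userspace with the linker-provided etext/end symbols; this is a rough analogue for a typical Linux build, and per-cpu and module ranges have no equivalent here.

#include <stdio.h>
#include <stdlib.h>

extern char etext, end;         /* linker symbols bounding the program image */

/* Rough userspace analogue of static_obj(): is 'obj' in static storage? */
static int is_static_obj(const void *obj)
{
        unsigned long addr = (unsigned long)obj;

        return addr >= (unsigned long)&etext && addr < (unsigned long)&end;
}

static int file_scope_key;      /* lives in .bss, i.e. below 'end' */

int main(void)
{
        int stack_key;
        int *heap_key = malloc(sizeof(*heap_key));

        printf("static %d, stack %d, heap %d\n",
               is_static_obj(&file_scope_key), is_static_obj(&stack_key),
               is_static_obj(heap_key));
        free(heap_key);
        return 0;
}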
@@ -1366,21 +1515,72 @@ cache_hit: | |||
1366 | chain->chain_key = chain_key; | 1515 | chain->chain_key = chain_key; |
1367 | list_add_tail_rcu(&chain->entry, hash_head); | 1516 | list_add_tail_rcu(&chain->entry, hash_head); |
1368 | debug_atomic_inc(&chain_lookup_misses); | 1517 | debug_atomic_inc(&chain_lookup_misses); |
1369 | #ifdef CONFIG_TRACE_IRQFLAGS | 1518 | inc_chains(); |
1370 | if (current->hardirq_context) | 1519 | |
1371 | nr_hardirq_chains++; | 1520 | return 1; |
1372 | else { | 1521 | } |
1373 | if (current->softirq_context) | 1522 | |
1374 | nr_softirq_chains++; | 1523 | static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, |
1375 | else | 1524 | struct held_lock *hlock, int chain_head) |
1376 | nr_process_chains++; | 1525 | { |
1377 | } | 1526 | /* |
1378 | #else | 1527 | * Trylock needs to maintain the stack of held locks, but it |
1379 | nr_process_chains++; | 1528 | * does not add new dependencies, because trylock can be done |
1380 | #endif | 1529 | * in any order. |
1530 | * | ||
1531 | * We look up the chain_key and do the O(N^2) check and update of | ||
1532 | * the dependencies only if this is a new dependency chain. | ||
1533 | * (If lookup_chain_cache() returns with 1 it acquires | ||
1534 | * graph_lock for us) | ||
1535 | */ | ||
1536 | if (!hlock->trylock && (hlock->check == 2) && | ||
1537 | lookup_chain_cache(curr->curr_chain_key, hlock->class)) { | ||
1538 | /* | ||
1539 | * Check whether last held lock: | ||
1540 | * | ||
1541 | * - is irq-safe, if this lock is irq-unsafe | ||
1542 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
1543 | * | ||
1544 | * And check whether the new lock's dependency graph | ||
1545 | * could lead back to the previous lock. | ||
1546 | * | ||
1547 | * any of these scenarios could lead to a deadlock. If | ||
1548 | * all validations pass, we add the new dependency below. | ||
1549 | */ | ||
1550 | int ret = check_deadlock(curr, hlock, lock, hlock->read); | ||
1551 | |||
1552 | if (!ret) | ||
1553 | return 0; | ||
1554 | /* | ||
1555 | * Mark recursive read, as we jump over it when | ||
1556 | * building dependencies (just like we jump over | ||
1557 | * trylock entries): | ||
1558 | */ | ||
1559 | if (ret == 2) | ||
1560 | hlock->read = 2; | ||
1561 | /* | ||
1562 | * Add dependency only if this lock is not the head | ||
1563 | * of the chain, and if it's not a secondary read-lock: | ||
1564 | */ | ||
1565 | if (!chain_head && ret != 2) | ||
1566 | if (!check_prevs_add(curr, hlock)) | ||
1567 | return 0; | ||
1568 | graph_unlock(); | ||
1569 | } else | ||
1570 | /* after lookup_chain_cache(): */ | ||
1571 | if (unlikely(!debug_locks)) | ||
1572 | return 0; | ||
1381 | 1573 | ||
1382 | return 1; | 1574 | return 1; |
1383 | } | 1575 | } |
1576 | #else | ||
1577 | static inline int validate_chain(struct task_struct *curr, | ||
1578 | struct lockdep_map *lock, struct held_lock *hlock, | ||
1579 | int chain_head) | ||
1580 | { | ||
1581 | return 1; | ||
1582 | } | ||
1583 | #endif | ||
1384 | 1584 | ||
1385 | /* | 1585 | /* |
1386 | * We are building curr_chain_key incrementally, so double-check | 1586 | * We are building curr_chain_key incrementally, so double-check |
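lookup_chain_cache() is what lets the new validate_chain() skip the O(N^2) dependency checks for lock stacks it has already seen: the held-lock classes are folded into a single 64-bit chain key, and the expensive work runs only on a cache miss. A userspace sketch of that memoisation follows; the mixing function, the table size and the names are placeholders, not the kernel's hash.

#include <stdio.h>
#include <stdint.h>

#define CACHE_SIZE 1024

static uint64_t chain_cache[CACHE_SIZE];
static int chain_used[CACHE_SIZE];

/* Fold one lock-class id into the running chain key. */
static uint64_t iterate_key(uint64_t key, uint64_t class_id)
{
        key ^= class_id;
        key = (key << 13) | (key >> 51);        /* cheap 64-bit rotate */
        return key * 0x9E3779B97F4A7C15ULL;
}

/* Returns 1 if this chain is new and must be validated, 0 if cached. */
static int chain_cache_lookup(uint64_t chain_key)
{
        unsigned int i = (unsigned int)(chain_key % CACHE_SIZE);

        while (chain_used[i]) {
                if (chain_cache[i] == chain_key)
                        return 0;               /* hit: skip the checks */
                i = (i + 1) % CACHE_SIZE;       /* linear probing */
        }
        chain_cache[i] = chain_key;
        chain_used[i] = 1;
        return 1;                               /* miss: validate */
}

int main(void)
{
        uint64_t key = 0;

        key = iterate_key(key, 1);      /* took class 1 */
        key = iterate_key(key, 7);      /* then class 7 */

        printf("first time:  %s\n", chain_cache_lookup(key) ? "validate" : "cached");
        printf("second time: %s\n", chain_cache_lookup(key) ? "validate" : "cached");
        return 0;
}

The same key also changes when the irq context changes, which is why separate_irq_context() resets it: a chain validated in process context is not trusted again in hardirq context.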
@@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr) | |||
1425 | #endif | 1625 | #endif |
1426 | } | 1626 | } |
1427 | 1627 | ||
1628 | static int | ||
1629 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
1630 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
1631 | { | ||
1632 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | ||
1633 | return 0; | ||
1634 | |||
1635 | printk("\n=================================\n"); | ||
1636 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
1637 | print_kernel_version(); | ||
1638 | printk( "---------------------------------\n"); | ||
1639 | |||
1640 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
1641 | usage_str[prev_bit], usage_str[new_bit]); | ||
1642 | |||
1643 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
1644 | curr->comm, curr->pid, | ||
1645 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
1646 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
1647 | trace_hardirqs_enabled(curr), | ||
1648 | trace_softirqs_enabled(curr)); | ||
1649 | print_lock(this); | ||
1650 | |||
1651 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
1652 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
1653 | |||
1654 | print_irqtrace_events(curr); | ||
1655 | printk("\nother info that might help us debug this:\n"); | ||
1656 | lockdep_print_held_locks(curr); | ||
1657 | |||
1658 | printk("\nstack backtrace:\n"); | ||
1659 | dump_stack(); | ||
1660 | |||
1661 | return 0; | ||
1662 | } | ||
1663 | |||
1664 | /* | ||
1665 | * Print out an error if an invalid bit is set: | ||
1666 | */ | ||
1667 | static inline int | ||
1668 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
1669 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
1670 | { | ||
1671 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | ||
1672 | return print_usage_bug(curr, this, bad_bit, new_bit); | ||
1673 | return 1; | ||
1674 | } | ||
1675 | |||
1676 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
1677 | enum lock_usage_bit new_bit); | ||
1678 | |||
1428 | #ifdef CONFIG_TRACE_IRQFLAGS | 1679 | #ifdef CONFIG_TRACE_IRQFLAGS |
1429 | 1680 | ||
1430 | /* | 1681 | /* |
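valid_state(), now hoisted above the irq-tracing code, is a pure bitmask test: marking a class with a new usage bit is refused when the mutually exclusive bit is already recorded, and that refusal is what produces the "inconsistent {%s} -> {%s} usage" report printed by print_usage_bug(). A toy version with two invented bits (nothing here is the kernel's enum or struct):

#include <stdio.h>

/* Toy stand-ins, not the kernel's definitions. */
enum usage_bit {
        USED_IN_HARDIRQ,        /* lock was taken from a hardirq handler */
        ENABLED_HARDIRQS,       /* lock was taken with hardirqs enabled */
};

struct toy_class {
        unsigned int usage_mask;
};

static int valid_state(struct toy_class *class,
                       enum usage_bit new_bit, enum usage_bit bad_bit)
{
        if (class->usage_mask & (1U << bad_bit)) {
                printf("inconsistent usage: bit %d already set while marking %d\n",
                       bad_bit, new_bit);
                return 0;
        }
        return 1;
}

int main(void)
{
        struct toy_class c = { .usage_mask = 1U << ENABLED_HARDIRQS };

        /* The same lock class is now seen inside a hardirq handler: */
        if (!valid_state(&c, USED_IN_HARDIRQ, ENABLED_HARDIRQS))
                printf("-> this is where the 'inconsistent lock state' splat fires\n");
        return 0;
}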
@@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr) | |||
1518 | print_ip_sym(curr->softirq_disable_ip); | 1769 | print_ip_sym(curr->softirq_disable_ip); |
1519 | } | 1770 | } |
1520 | 1771 | ||
1521 | #endif | 1772 | static int hardirq_verbose(struct lock_class *class) |
1522 | |||
1523 | static int | ||
1524 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
1525 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
1526 | { | 1773 | { |
1527 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1774 | #if HARDIRQ_VERBOSE |
1528 | return 0; | 1775 | return class_filter(class); |
1529 | 1776 | #endif | |
1530 | printk("\n=================================\n"); | ||
1531 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
1532 | print_kernel_version(); | ||
1533 | printk( "---------------------------------\n"); | ||
1534 | |||
1535 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
1536 | usage_str[prev_bit], usage_str[new_bit]); | ||
1537 | |||
1538 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
1539 | curr->comm, curr->pid, | ||
1540 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
1541 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
1542 | trace_hardirqs_enabled(curr), | ||
1543 | trace_softirqs_enabled(curr)); | ||
1544 | print_lock(this); | ||
1545 | |||
1546 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
1547 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
1548 | |||
1549 | print_irqtrace_events(curr); | ||
1550 | printk("\nother info that might help us debug this:\n"); | ||
1551 | lockdep_print_held_locks(curr); | ||
1552 | |||
1553 | printk("\nstack backtrace:\n"); | ||
1554 | dump_stack(); | ||
1555 | |||
1556 | return 0; | 1777 | return 0; |
1557 | } | 1778 | } |
1558 | 1779 | ||
1559 | /* | 1780 | static int softirq_verbose(struct lock_class *class) |
1560 | * Print out an error if an invalid bit is set: | ||
1561 | */ | ||
1562 | static inline int | ||
1563 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
1564 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
1565 | { | 1781 | { |
1566 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | 1782 | #if SOFTIRQ_VERBOSE |
1567 | return print_usage_bug(curr, this, bad_bit, new_bit); | 1783 | return class_filter(class); |
1568 | return 1; | 1784 | #endif |
1785 | return 0; | ||
1569 | } | 1786 | } |
1570 | 1787 | ||
1571 | #define STRICT_READ_CHECKS 1 | 1788 | #define STRICT_READ_CHECKS 1 |
1572 | 1789 | ||
1573 | /* | 1790 | static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, |
1574 | * Mark a lock with a usage bit, and validate the state transition: | 1791 | enum lock_usage_bit new_bit) |
1575 | */ | ||
1576 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
1577 | enum lock_usage_bit new_bit) | ||
1578 | { | 1792 | { |
1579 | unsigned int new_mask = 1 << new_bit, ret = 1; | 1793 | int ret = 1; |
1580 | |||
1581 | /* | ||
1582 | * If already set then do not dirty the cacheline, | ||
1583 | * nor do any checks: | ||
1584 | */ | ||
1585 | if (likely(this->class->usage_mask & new_mask)) | ||
1586 | return 1; | ||
1587 | |||
1588 | if (!graph_lock()) | ||
1589 | return 0; | ||
1590 | /* | ||
1591 | * Make sure we didnt race: | ||
1592 | */ | ||
1593 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
1594 | graph_unlock(); | ||
1595 | return 1; | ||
1596 | } | ||
1597 | |||
1598 | this->class->usage_mask |= new_mask; | ||
1599 | 1794 | ||
1600 | if (!save_trace(this->class->usage_traces + new_bit)) | 1795 | switch(new_bit) { |
1601 | return 0; | ||
1602 | |||
1603 | switch (new_bit) { | ||
1604 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1605 | case LOCK_USED_IN_HARDIRQ: | 1796 | case LOCK_USED_IN_HARDIRQ: |
1606 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | 1797 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) |
1607 | return 0; | 1798 | return 0; |
@@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
1760 | if (softirq_verbose(this->class)) | 1951 | if (softirq_verbose(this->class)) |
1761 | ret = 2; | 1952 | ret = 2; |
1762 | break; | 1953 | break; |
1763 | #endif | ||
1764 | case LOCK_USED: | ||
1765 | /* | ||
1766 | * Add it to the global list of classes: | ||
1767 | */ | ||
1768 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
1769 | debug_atomic_dec(&nr_unused_locks); | ||
1770 | break; | ||
1771 | default: | 1954 | default: |
1772 | if (!debug_locks_off_graph_unlock()) | ||
1773 | return 0; | ||
1774 | WARN_ON(1); | 1955 | WARN_ON(1); |
1775 | return 0; | 1956 | break; |
1776 | } | ||
1777 | |||
1778 | graph_unlock(); | ||
1779 | |||
1780 | /* | ||
1781 | * We must printk outside of the graph_lock: | ||
1782 | */ | ||
1783 | if (ret == 2) { | ||
1784 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
1785 | print_lock(this); | ||
1786 | print_irqtrace_events(curr); | ||
1787 | dump_stack(); | ||
1788 | } | 1957 | } |
1789 | 1958 | ||
1790 | return ret; | 1959 | return ret; |
1791 | } | 1960 | } |
1792 | 1961 | ||
1793 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1794 | /* | 1962 | /* |
1795 | * Mark all held locks with a usage bit: | 1963 | * Mark all held locks with a usage bit: |
1796 | */ | 1964 | */ |
@@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip) | |||
1973 | debug_atomic_inc(&redundant_softirqs_off); | 2141 | debug_atomic_inc(&redundant_softirqs_off); |
1974 | } | 2142 | } |
1975 | 2143 | ||
2144 | static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) | ||
2145 | { | ||
2146 | /* | ||
2147 | * If non-trylock use in a hardirq or softirq context, then | ||
2148 | * mark the lock as used in these contexts: | ||
2149 | */ | ||
2150 | if (!hlock->trylock) { | ||
2151 | if (hlock->read) { | ||
2152 | if (curr->hardirq_context) | ||
2153 | if (!mark_lock(curr, hlock, | ||
2154 | LOCK_USED_IN_HARDIRQ_READ)) | ||
2155 | return 0; | ||
2156 | if (curr->softirq_context) | ||
2157 | if (!mark_lock(curr, hlock, | ||
2158 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
2159 | return 0; | ||
2160 | } else { | ||
2161 | if (curr->hardirq_context) | ||
2162 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) | ||
2163 | return 0; | ||
2164 | if (curr->softirq_context) | ||
2165 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) | ||
2166 | return 0; | ||
2167 | } | ||
2168 | } | ||
2169 | if (!hlock->hardirqs_off) { | ||
2170 | if (hlock->read) { | ||
2171 | if (!mark_lock(curr, hlock, | ||
2172 | LOCK_ENABLED_HARDIRQS_READ)) | ||
2173 | return 0; | ||
2174 | if (curr->softirqs_enabled) | ||
2175 | if (!mark_lock(curr, hlock, | ||
2176 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
2177 | return 0; | ||
2178 | } else { | ||
2179 | if (!mark_lock(curr, hlock, | ||
2180 | LOCK_ENABLED_HARDIRQS)) | ||
2181 | return 0; | ||
2182 | if (curr->softirqs_enabled) | ||
2183 | if (!mark_lock(curr, hlock, | ||
2184 | LOCK_ENABLED_SOFTIRQS)) | ||
2185 | return 0; | ||
2186 | } | ||
2187 | } | ||
2188 | |||
2189 | return 1; | ||
2190 | } | ||
2191 | |||
2192 | static int separate_irq_context(struct task_struct *curr, | ||
2193 | struct held_lock *hlock) | ||
2194 | { | ||
2195 | unsigned int depth = curr->lockdep_depth; | ||
2196 | |||
2197 | /* | ||
2198 | * Keep track of points where we cross into an interrupt context: | ||
2199 | */ | ||
2200 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
2201 | curr->softirq_context; | ||
2202 | if (depth) { | ||
2203 | struct held_lock *prev_hlock; | ||
2204 | |||
2205 | prev_hlock = curr->held_locks + depth-1; | ||
2206 | /* | ||
2207 | * If we cross into another context, reset the | ||
2208 | * hash key (this also prevents the checking and the | ||
2209 | * adding of the dependency to 'prev'): | ||
2210 | */ | ||
2211 | if (prev_hlock->irq_context != hlock->irq_context) | ||
2212 | return 1; | ||
2213 | } | ||
2214 | return 0; | ||
2215 | } | ||
2216 | |||
2217 | #else | ||
2218 | |||
2219 | static inline | ||
2220 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | ||
2221 | enum lock_usage_bit new_bit) | ||
2222 | { | ||
2223 | WARN_ON(1); | ||
2224 | return 1; | ||
2225 | } | ||
2226 | |||
2227 | static inline int mark_irqflags(struct task_struct *curr, | ||
2228 | struct held_lock *hlock) | ||
2229 | { | ||
2230 | return 1; | ||
2231 | } | ||
2232 | |||
2233 | static inline int separate_irq_context(struct task_struct *curr, | ||
2234 | struct held_lock *hlock) | ||
2235 | { | ||
2236 | return 0; | ||
2237 | } | ||
2238 | |||
1976 | #endif | 2239 | #endif |
1977 | 2240 | ||
1978 | /* | 2241 | /* |
2242 | * Mark a lock with a usage bit, and validate the state transition: | ||
2243 | */ | ||
2244 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
2245 | enum lock_usage_bit new_bit) | ||
2246 | { | ||
2247 | unsigned int new_mask = 1 << new_bit, ret = 1; | ||
2248 | |||
2249 | /* | ||
2250 | * If already set then do not dirty the cacheline, | ||
2251 | * nor do any checks: | ||
2252 | */ | ||
2253 | if (likely(this->class->usage_mask & new_mask)) | ||
2254 | return 1; | ||
2255 | |||
2256 | if (!graph_lock()) | ||
2257 | return 0; | ||
2258 | /* | ||
2259 | * Make sure we didn't race: | ||
2260 | */ | ||
2261 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
2262 | graph_unlock(); | ||
2263 | return 1; | ||
2264 | } | ||
2265 | |||
2266 | this->class->usage_mask |= new_mask; | ||
2267 | |||
2268 | if (!save_trace(this->class->usage_traces + new_bit)) | ||
2269 | return 0; | ||
2270 | |||
2271 | switch (new_bit) { | ||
2272 | case LOCK_USED_IN_HARDIRQ: | ||
2273 | case LOCK_USED_IN_SOFTIRQ: | ||
2274 | case LOCK_USED_IN_HARDIRQ_READ: | ||
2275 | case LOCK_USED_IN_SOFTIRQ_READ: | ||
2276 | case LOCK_ENABLED_HARDIRQS: | ||
2277 | case LOCK_ENABLED_SOFTIRQS: | ||
2278 | case LOCK_ENABLED_HARDIRQS_READ: | ||
2279 | case LOCK_ENABLED_SOFTIRQS_READ: | ||
2280 | ret = mark_lock_irq(curr, this, new_bit); | ||
2281 | if (!ret) | ||
2282 | return 0; | ||
2283 | break; | ||
2284 | case LOCK_USED: | ||
2285 | /* | ||
2286 | * Add it to the global list of classes: | ||
2287 | */ | ||
2288 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
2289 | debug_atomic_dec(&nr_unused_locks); | ||
2290 | break; | ||
2291 | default: | ||
2292 | if (!debug_locks_off_graph_unlock()) | ||
2293 | return 0; | ||
2294 | WARN_ON(1); | ||
2295 | return 0; | ||
2296 | } | ||
2297 | |||
2298 | graph_unlock(); | ||
2299 | |||
2300 | /* | ||
2301 | * We must printk outside of the graph_lock: | ||
2302 | */ | ||
2303 | if (ret == 2) { | ||
2304 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
2305 | print_lock(this); | ||
2306 | print_irqtrace_events(curr); | ||
2307 | dump_stack(); | ||
2308 | } | ||
2309 | |||
2310 | return ret; | ||
2311 | } | ||
2312 | |||
2313 | /* | ||
1979 | * Initialize a lock instance's lock-class mapping info: | 2314 | * Initialize a lock instance's lock-class mapping info: |
1980 | */ | 2315 | */ |
1981 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2316 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
@@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
1999 | lock->name = name; | 2334 | lock->name = name; |
2000 | lock->key = key; | 2335 | lock->key = key; |
2001 | lock->class_cache = NULL; | 2336 | lock->class_cache = NULL; |
2337 | #ifdef CONFIG_LOCK_STAT | ||
2338 | lock->cpu = raw_smp_processor_id(); | ||
2339 | #endif | ||
2002 | if (subclass) | 2340 | if (subclass) |
2003 | register_lock_class(lock, subclass, 1); | 2341 | register_lock_class(lock, subclass, 1); |
2004 | } | 2342 | } |
@@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2020 | int chain_head = 0; | 2358 | int chain_head = 0; |
2021 | u64 chain_key; | 2359 | u64 chain_key; |
2022 | 2360 | ||
2361 | if (!prove_locking) | ||
2362 | check = 1; | ||
2363 | |||
2023 | if (unlikely(!debug_locks)) | 2364 | if (unlikely(!debug_locks)) |
2024 | return 0; | 2365 | return 0; |
2025 | 2366 | ||
@@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2070 | hlock->read = read; | 2411 | hlock->read = read; |
2071 | hlock->check = check; | 2412 | hlock->check = check; |
2072 | hlock->hardirqs_off = hardirqs_off; | 2413 | hlock->hardirqs_off = hardirqs_off; |
2073 | 2414 | #ifdef CONFIG_LOCK_STAT | |
2074 | if (check != 2) | 2415 | hlock->waittime_stamp = 0; |
2075 | goto out_calc_hash; | 2416 | hlock->holdtime_stamp = sched_clock(); |
2076 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
2077 | /* | ||
2078 | * If non-trylock use in a hardirq or softirq context, then | ||
2079 | * mark the lock as used in these contexts: | ||
2080 | */ | ||
2081 | if (!trylock) { | ||
2082 | if (read) { | ||
2083 | if (curr->hardirq_context) | ||
2084 | if (!mark_lock(curr, hlock, | ||
2085 | LOCK_USED_IN_HARDIRQ_READ)) | ||
2086 | return 0; | ||
2087 | if (curr->softirq_context) | ||
2088 | if (!mark_lock(curr, hlock, | ||
2089 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
2090 | return 0; | ||
2091 | } else { | ||
2092 | if (curr->hardirq_context) | ||
2093 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) | ||
2094 | return 0; | ||
2095 | if (curr->softirq_context) | ||
2096 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) | ||
2097 | return 0; | ||
2098 | } | ||
2099 | } | ||
2100 | if (!hardirqs_off) { | ||
2101 | if (read) { | ||
2102 | if (!mark_lock(curr, hlock, | ||
2103 | LOCK_ENABLED_HARDIRQS_READ)) | ||
2104 | return 0; | ||
2105 | if (curr->softirqs_enabled) | ||
2106 | if (!mark_lock(curr, hlock, | ||
2107 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
2108 | return 0; | ||
2109 | } else { | ||
2110 | if (!mark_lock(curr, hlock, | ||
2111 | LOCK_ENABLED_HARDIRQS)) | ||
2112 | return 0; | ||
2113 | if (curr->softirqs_enabled) | ||
2114 | if (!mark_lock(curr, hlock, | ||
2115 | LOCK_ENABLED_SOFTIRQS)) | ||
2116 | return 0; | ||
2117 | } | ||
2118 | } | ||
2119 | #endif | 2417 | #endif |
2418 | |||
2419 | if (check == 2 && !mark_irqflags(curr, hlock)) | ||
2420 | return 0; | ||
2421 | |||
2120 | /* mark it as used: */ | 2422 | /* mark it as used: */ |
2121 | if (!mark_lock(curr, hlock, LOCK_USED)) | 2423 | if (!mark_lock(curr, hlock, LOCK_USED)) |
2122 | return 0; | 2424 | return 0; |
2123 | out_calc_hash: | 2425 | |
2124 | /* | 2426 | /* |
2125 | * Calculate the chain hash: it's the combined has of all the | 2427 | * Calculate the chain hash: it's the combined has of all the |
2126 | * lock keys along the dependency chain. We save the hash value | 2428 | * lock keys along the dependency chain. We save the hash value |
@@ -2143,77 +2445,15 @@ out_calc_hash: | |||
2143 | } | 2445 | } |
2144 | 2446 | ||
2145 | hlock->prev_chain_key = chain_key; | 2447 | hlock->prev_chain_key = chain_key; |
2146 | 2448 | if (separate_irq_context(curr, hlock)) { | |
2147 | #ifdef CONFIG_TRACE_IRQFLAGS | 2449 | chain_key = 0; |
2148 | /* | 2450 | chain_head = 1; |
2149 | * Keep track of points where we cross into an interrupt context: | ||
2150 | */ | ||
2151 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
2152 | curr->softirq_context; | ||
2153 | if (depth) { | ||
2154 | struct held_lock *prev_hlock; | ||
2155 | |||
2156 | prev_hlock = curr->held_locks + depth-1; | ||
2157 | /* | ||
2158 | * If we cross into another context, reset the | ||
2159 | * hash key (this also prevents the checking and the | ||
2160 | * adding of the dependency to 'prev'): | ||
2161 | */ | ||
2162 | if (prev_hlock->irq_context != hlock->irq_context) { | ||
2163 | chain_key = 0; | ||
2164 | chain_head = 1; | ||
2165 | } | ||
2166 | } | 2451 | } |
2167 | #endif | ||
2168 | chain_key = iterate_chain_key(chain_key, id); | 2452 | chain_key = iterate_chain_key(chain_key, id); |
2169 | curr->curr_chain_key = chain_key; | 2453 | curr->curr_chain_key = chain_key; |
2170 | 2454 | ||
2171 | /* | 2455 | if (!validate_chain(curr, lock, hlock, chain_head)) |
2172 | * Trylock needs to maintain the stack of held locks, but it | 2456 | return 0; |
2173 | * does not add new dependencies, because trylock can be done | ||
2174 | * in any order. | ||
2175 | * | ||
2176 | * We look up the chain_key and do the O(N^2) check and update of | ||
2177 | * the dependencies only if this is a new dependency chain. | ||
2178 | * (If lookup_chain_cache() returns with 1 it acquires | ||
2179 | * graph_lock for us) | ||
2180 | */ | ||
2181 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { | ||
2182 | /* | ||
2183 | * Check whether last held lock: | ||
2184 | * | ||
2185 | * - is irq-safe, if this lock is irq-unsafe | ||
2186 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
2187 | * | ||
2188 | * And check whether the new lock's dependency graph | ||
2189 | * could lead back to the previous lock. | ||
2190 | * | ||
2191 | * any of these scenarios could lead to a deadlock. If | ||
2192 | * All validations | ||
2193 | */ | ||
2194 | int ret = check_deadlock(curr, hlock, lock, read); | ||
2195 | |||
2196 | if (!ret) | ||
2197 | return 0; | ||
2198 | /* | ||
2199 | * Mark recursive read, as we jump over it when | ||
2200 | * building dependencies (just like we jump over | ||
2201 | * trylock entries): | ||
2202 | */ | ||
2203 | if (ret == 2) | ||
2204 | hlock->read = 2; | ||
2205 | /* | ||
2206 | * Add dependency only if this lock is not the head | ||
2207 | * of the chain, and if it's not a secondary read-lock: | ||
2208 | */ | ||
2209 | if (!chain_head && ret != 2) | ||
2210 | if (!check_prevs_add(curr, hlock)) | ||
2211 | return 0; | ||
2212 | graph_unlock(); | ||
2213 | } else | ||
2214 | /* after lookup_chain_cache(): */ | ||
2215 | if (unlikely(!debug_locks)) | ||
2216 | return 0; | ||
2217 | 2457 | ||
2218 | curr->lockdep_depth++; | 2458 | curr->lockdep_depth++; |
2219 | check_chain_key(curr); | 2459 | check_chain_key(curr); |
@@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr, | |||
2315 | return print_unlock_inbalance_bug(curr, lock, ip); | 2555 | return print_unlock_inbalance_bug(curr, lock, ip); |
2316 | 2556 | ||
2317 | found_it: | 2557 | found_it: |
2558 | lock_release_holdtime(hlock); | ||
2559 | |||
2318 | /* | 2560 | /* |
2319 | * We have the right lock to unlock, 'hlock' points to it. | 2561 | * We have the right lock to unlock, 'hlock' points to it. |
2320 | * Now we remove it from the stack, and add back the other | 2562 | * Now we remove it from the stack, and add back the other |
@@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr, | |||
2367 | 2609 | ||
2368 | curr->curr_chain_key = hlock->prev_chain_key; | 2610 | curr->curr_chain_key = hlock->prev_chain_key; |
2369 | 2611 | ||
2612 | lock_release_holdtime(hlock); | ||
2613 | |||
2370 | #ifdef CONFIG_DEBUG_LOCKDEP | 2614 | #ifdef CONFIG_DEBUG_LOCKDEP |
2371 | hlock->prev_chain_key = 0; | 2615 | hlock->prev_chain_key = 0; |
2372 | hlock->class = NULL; | 2616 | hlock->class = NULL; |
@@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2441 | { | 2685 | { |
2442 | unsigned long flags; | 2686 | unsigned long flags; |
2443 | 2687 | ||
2688 | if (unlikely(!lock_stat && !prove_locking)) | ||
2689 | return; | ||
2690 | |||
2444 | if (unlikely(current->lockdep_recursion)) | 2691 | if (unlikely(current->lockdep_recursion)) |
2445 | return; | 2692 | return; |
2446 | 2693 | ||
@@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
2460 | { | 2707 | { |
2461 | unsigned long flags; | 2708 | unsigned long flags; |
2462 | 2709 | ||
2710 | if (unlikely(!lock_stat && !prove_locking)) | ||
2711 | return; | ||
2712 | |||
2463 | if (unlikely(current->lockdep_recursion)) | 2713 | if (unlikely(current->lockdep_recursion)) |
2464 | return; | 2714 | return; |
2465 | 2715 | ||
@@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
2473 | 2723 | ||
2474 | EXPORT_SYMBOL_GPL(lock_release); | 2724 | EXPORT_SYMBOL_GPL(lock_release); |
2475 | 2725 | ||
2726 | #ifdef CONFIG_LOCK_STAT | ||
2727 | static int | ||
2728 | print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | ||
2729 | unsigned long ip) | ||
2730 | { | ||
2731 | if (!debug_locks_off()) | ||
2732 | return 0; | ||
2733 | if (debug_locks_silent) | ||
2734 | return 0; | ||
2735 | |||
2736 | printk("\n=================================\n"); | ||
2737 | printk( "[ BUG: bad contention detected! ]\n"); | ||
2738 | printk( "---------------------------------\n"); | ||
2739 | printk("%s/%d is trying to contend lock (", | ||
2740 | curr->comm, curr->pid); | ||
2741 | print_lockdep_cache(lock); | ||
2742 | printk(") at:\n"); | ||
2743 | print_ip_sym(ip); | ||
2744 | printk("but there are no locks held!\n"); | ||
2745 | printk("\nother info that might help us debug this:\n"); | ||
2746 | lockdep_print_held_locks(curr); | ||
2747 | |||
2748 | printk("\nstack backtrace:\n"); | ||
2749 | dump_stack(); | ||
2750 | |||
2751 | return 0; | ||
2752 | } | ||
2753 | |||
2754 | static void | ||
2755 | __lock_contended(struct lockdep_map *lock, unsigned long ip) | ||
2756 | { | ||
2757 | struct task_struct *curr = current; | ||
2758 | struct held_lock *hlock, *prev_hlock; | ||
2759 | struct lock_class_stats *stats; | ||
2760 | unsigned int depth; | ||
2761 | int i, point; | ||
2762 | |||
2763 | depth = curr->lockdep_depth; | ||
2764 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
2765 | return; | ||
2766 | |||
2767 | prev_hlock = NULL; | ||
2768 | for (i = depth-1; i >= 0; i--) { | ||
2769 | hlock = curr->held_locks + i; | ||
2770 | /* | ||
2771 | * We must not cross into another context: | ||
2772 | */ | ||
2773 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
2774 | break; | ||
2775 | if (hlock->instance == lock) | ||
2776 | goto found_it; | ||
2777 | prev_hlock = hlock; | ||
2778 | } | ||
2779 | print_lock_contention_bug(curr, lock, ip); | ||
2780 | return; | ||
2781 | |||
2782 | found_it: | ||
2783 | hlock->waittime_stamp = sched_clock(); | ||
2784 | |||
2785 | point = lock_contention_point(hlock->class, ip); | ||
2786 | |||
2787 | stats = get_lock_stats(hlock->class); | ||
2788 | if (point < ARRAY_SIZE(stats->contention_point)) | ||
2789 | stats->contention_point[point]++; | ||
2790 | if (lock->cpu != smp_processor_id()) | ||
2791 | stats->bounces[bounce_contended + !!hlock->read]++; | ||
2792 | put_lock_stats(stats); | ||
2793 | } | ||
2794 | |||
2795 | static void | ||
2796 | __lock_acquired(struct lockdep_map *lock) | ||
2797 | { | ||
2798 | struct task_struct *curr = current; | ||
2799 | struct held_lock *hlock, *prev_hlock; | ||
2800 | struct lock_class_stats *stats; | ||
2801 | unsigned int depth; | ||
2802 | u64 now; | ||
2803 | s64 waittime = 0; | ||
2804 | int i, cpu; | ||
2805 | |||
2806 | depth = curr->lockdep_depth; | ||
2807 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
2808 | return; | ||
2809 | |||
2810 | prev_hlock = NULL; | ||
2811 | for (i = depth-1; i >= 0; i--) { | ||
2812 | hlock = curr->held_locks + i; | ||
2813 | /* | ||
2814 | * We must not cross into another context: | ||
2815 | */ | ||
2816 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
2817 | break; | ||
2818 | if (hlock->instance == lock) | ||
2819 | goto found_it; | ||
2820 | prev_hlock = hlock; | ||
2821 | } | ||
2822 | print_lock_contention_bug(curr, lock, _RET_IP_); | ||
2823 | return; | ||
2824 | |||
2825 | found_it: | ||
2826 | cpu = smp_processor_id(); | ||
2827 | if (hlock->waittime_stamp) { | ||
2828 | now = sched_clock(); | ||
2829 | waittime = now - hlock->waittime_stamp; | ||
2830 | hlock->holdtime_stamp = now; | ||
2831 | } | ||
2832 | |||
2833 | stats = get_lock_stats(hlock->class); | ||
2834 | if (waittime) { | ||
2835 | if (hlock->read) | ||
2836 | lock_time_inc(&stats->read_waittime, waittime); | ||
2837 | else | ||
2838 | lock_time_inc(&stats->write_waittime, waittime); | ||
2839 | } | ||
2840 | if (lock->cpu != cpu) | ||
2841 | stats->bounces[bounce_acquired + !!hlock->read]++; | ||
2842 | put_lock_stats(stats); | ||
2843 | |||
2844 | lock->cpu = cpu; | ||
2845 | } | ||
2846 | |||
2847 | void lock_contended(struct lockdep_map *lock, unsigned long ip) | ||
2848 | { | ||
2849 | unsigned long flags; | ||
2850 | |||
2851 | if (unlikely(!lock_stat)) | ||
2852 | return; | ||
2853 | |||
2854 | if (unlikely(current->lockdep_recursion)) | ||
2855 | return; | ||
2856 | |||
2857 | raw_local_irq_save(flags); | ||
2858 | check_flags(flags); | ||
2859 | current->lockdep_recursion = 1; | ||
2860 | __lock_contended(lock, ip); | ||
2861 | current->lockdep_recursion = 0; | ||
2862 | raw_local_irq_restore(flags); | ||
2863 | } | ||
2864 | EXPORT_SYMBOL_GPL(lock_contended); | ||
2865 | |||
2866 | void lock_acquired(struct lockdep_map *lock) | ||
2867 | { | ||
2868 | unsigned long flags; | ||
2869 | |||
2870 | if (unlikely(!lock_stat)) | ||
2871 | return; | ||
2872 | |||
2873 | if (unlikely(current->lockdep_recursion)) | ||
2874 | return; | ||
2875 | |||
2876 | raw_local_irq_save(flags); | ||
2877 | check_flags(flags); | ||
2878 | current->lockdep_recursion = 1; | ||
2879 | __lock_acquired(lock); | ||
2880 | current->lockdep_recursion = 0; | ||
2881 | raw_local_irq_restore(flags); | ||
2882 | } | ||
2883 | EXPORT_SYMBOL_GPL(lock_acquired); | ||
2884 | #endif | ||
2885 | |||
2476 | /* | 2886 | /* |
2477 | * Used by the testsuite, sanitize the validator state | 2887 | * Used by the testsuite, sanitize the validator state |
2478 | * after a simulated failure: | 2888 | * after a simulated failure: |
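The two new hooks are meant to bracket the time a task spends waiting: lock_contended() is called from a lock's contended slowpath and stamps waittime_stamp, and lock_acquired() runs once the lock is finally obtained and turns the stamp into a waittime sample. The same bookkeeping pattern, transplanted onto a pthread mutex purely for illustration (struct mutex_stats and stat_lock() are invented names; build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct mutex_stats {
        unsigned long contentions;
        long long total_wait_ns;
};

static long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Mirror of the lock_contended()/lock_acquired() pattern. */
static void stat_lock(pthread_mutex_t *m, struct mutex_stats *stats)
{
        long long wait_start;

        if (pthread_mutex_trylock(m) == 0)
                return;                         /* fast path, no contention */

        wait_start = now_ns();                  /* ~ lock_contended() */
        pthread_mutex_lock(m);
        stats->contentions++;                   /* ~ lock_acquired() */
        stats->total_wait_ns += now_ns() - wait_start;
}

int main(void)
{
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
        struct mutex_stats stats = { 0, 0 };

        stat_lock(&m, &stats);  /* uncontended: trylock succeeds */
        pthread_mutex_unlock(&m);

        printf("contentions=%lu total_wait=%lldns\n",
               stats.contentions, stats.total_wait_ns);
        return 0;
}

In the kernel version the waittime is charged to the lock class rather than to the caller, and lock->cpu lets it additionally count cacheline bounces when a lock is acquired on a different CPU than it was last held on.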
@@ -2636,8 +3046,11 @@ void __init lockdep_info(void) | |||
2636 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); | 3046 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); |
2637 | 3047 | ||
2638 | #ifdef CONFIG_DEBUG_LOCKDEP | 3048 | #ifdef CONFIG_DEBUG_LOCKDEP |
2639 | if (lockdep_init_error) | 3049 | if (lockdep_init_error) { |
2640 | printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); | 3050 | printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); |
3051 | printk("Call stack leading to lockdep invocation was:\n"); | ||
3052 | print_stack_trace(&lockdep_init_trace, 0); | ||
3053 | } | ||
2641 | #endif | 3054 | #endif |
2642 | } | 3055 | } |
2643 | 3056 | ||
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 58f35e586e..9f17af4a24 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -5,7 +5,8 @@ | |||
5 | * | 5 | * |
6 | * Started by Ingo Molnar: | 6 | * Started by Ingo Molnar: |
7 | * | 7 | * |
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 8 | * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
9 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
9 | * | 10 | * |
10 | * Code for /proc/lockdep and /proc/lockdep_stats: | 11 | * Code for /proc/lockdep and /proc/lockdep_stats: |
11 | * | 12 | * |
@@ -15,6 +16,10 @@ | |||
15 | #include <linux/seq_file.h> | 16 | #include <linux/seq_file.h> |
16 | #include <linux/kallsyms.h> | 17 | #include <linux/kallsyms.h> |
17 | #include <linux/debug_locks.h> | 18 | #include <linux/debug_locks.h> |
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/sort.h> | ||
21 | #include <asm/uaccess.h> | ||
22 | #include <asm/div64.h> | ||
18 | 23 | ||
19 | #include "lockdep_internals.h" | 24 | #include "lockdep_internals.h" |
20 | 25 | ||
@@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
271 | if (nr_list_entries) | 276 | if (nr_list_entries) |
272 | factor = sum_forward_deps / nr_list_entries; | 277 | factor = sum_forward_deps / nr_list_entries; |
273 | 278 | ||
279 | #ifdef CONFIG_PROVE_LOCKING | ||
274 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | 280 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", |
275 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | 281 | nr_lock_chains, MAX_LOCKDEP_CHAINS); |
282 | #endif | ||
276 | 283 | ||
277 | #ifdef CONFIG_TRACE_IRQFLAGS | 284 | #ifdef CONFIG_TRACE_IRQFLAGS |
278 | seq_printf(m, " in-hardirq chains: %11u\n", | 285 | seq_printf(m, " in-hardirq chains: %11u\n", |
@@ -342,6 +349,292 @@ static const struct file_operations proc_lockdep_stats_operations = { | |||
342 | .release = seq_release, | 349 | .release = seq_release, |
343 | }; | 350 | }; |
344 | 351 | ||
352 | #ifdef CONFIG_LOCK_STAT | ||
353 | |||
354 | struct lock_stat_data { | ||
355 | struct lock_class *class; | ||
356 | struct lock_class_stats stats; | ||
357 | }; | ||
358 | |||
359 | struct lock_stat_seq { | ||
360 | struct lock_stat_data *iter; | ||
361 | struct lock_stat_data *iter_end; | ||
362 | struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; | ||
363 | }; | ||
364 | |||
365 | /* | ||
366 | * sort on absolute number of contentions | ||
367 | */ | ||
368 | static int lock_stat_cmp(const void *l, const void *r) | ||
369 | { | ||
370 | const struct lock_stat_data *dl = l, *dr = r; | ||
371 | unsigned long nl, nr; | ||
372 | |||
373 | nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; | ||
374 | nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; | ||
375 | |||
376 | return nr - nl; | ||
377 | } | ||
378 | |||
379 | static void seq_line(struct seq_file *m, char c, int offset, int length) | ||
380 | { | ||
381 | int i; | ||
382 | |||
383 | for (i = 0; i < offset; i++) | ||
384 | seq_puts(m, " "); | ||
385 | for (i = 0; i < length; i++) | ||
386 | seq_printf(m, "%c", c); | ||
387 | seq_puts(m, "\n"); | ||
388 | } | ||
389 | |||
390 | static void snprint_time(char *buf, size_t bufsiz, s64 nr) | ||
391 | { | ||
392 | unsigned long rem; | ||
393 | |||
394 | rem = do_div(nr, 1000); /* XXX: do_div_signed */ | ||
395 | snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); | ||
396 | } | ||
397 | |||
398 | static void seq_time(struct seq_file *m, s64 time) | ||
399 | { | ||
400 | char num[15]; | ||
401 | |||
402 | snprint_time(num, sizeof(num), time); | ||
403 | seq_printf(m, " %14s", num); | ||
404 | } | ||
405 | |||
406 | static void seq_lock_time(struct seq_file *m, struct lock_time *lt) | ||
407 | { | ||
408 | seq_printf(m, "%14lu", lt->nr); | ||
409 | seq_time(m, lt->min); | ||
410 | seq_time(m, lt->max); | ||
411 | seq_time(m, lt->total); | ||
412 | } | ||
413 | |||
414 | static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | ||
415 | { | ||
416 | char name[39]; | ||
417 | struct lock_class *class; | ||
418 | struct lock_class_stats *stats; | ||
419 | int i, namelen; | ||
420 | |||
421 | class = data->class; | ||
422 | stats = &data->stats; | ||
423 | |||
424 | namelen = 38; | ||
425 | if (class->name_version > 1) | ||
426 | namelen -= 2; /* XXX truncates versions > 9 */ | ||
427 | if (class->subclass) | ||
428 | namelen -= 2; | ||
429 | |||
430 | if (!class->name) { | ||
431 | char str[KSYM_NAME_LEN]; | ||
432 | const char *key_name; | ||
433 | |||
434 | key_name = __get_key_name(class->key, str); | ||
435 | snprintf(name, namelen, "%s", key_name); | ||
436 | } else { | ||
437 | snprintf(name, namelen, "%s", class->name); | ||
438 | } | ||
439 | namelen = strlen(name); | ||
440 | if (class->name_version > 1) { | ||
441 | snprintf(name+namelen, 3, "#%d", class->name_version); | ||
442 | namelen += 2; | ||
443 | } | ||
444 | if (class->subclass) { | ||
445 | snprintf(name+namelen, 3, "/%d", class->subclass); | ||
446 | namelen += 2; | ||
447 | } | ||
448 | |||
449 | if (stats->write_holdtime.nr) { | ||
450 | if (stats->read_holdtime.nr) | ||
451 | seq_printf(m, "%38s-W:", name); | ||
452 | else | ||
453 | seq_printf(m, "%40s:", name); | ||
454 | |||
455 | seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); | ||
456 | seq_lock_time(m, &stats->write_waittime); | ||
457 | seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); | ||
458 | seq_lock_time(m, &stats->write_holdtime); | ||
459 | seq_puts(m, "\n"); | ||
460 | } | ||
461 | |||
462 | if (stats->read_holdtime.nr) { | ||
463 | seq_printf(m, "%38s-R:", name); | ||
464 | seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); | ||
465 | seq_lock_time(m, &stats->read_waittime); | ||
466 | seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); | ||
467 | seq_lock_time(m, &stats->read_holdtime); | ||
468 | seq_puts(m, "\n"); | ||
469 | } | ||
470 | |||
471 | if (stats->read_waittime.nr + stats->write_waittime.nr == 0) | ||
472 | return; | ||
473 | |||
474 | if (stats->read_holdtime.nr) | ||
475 | namelen += 2; | ||
476 | |||
477 | for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { | ||
478 | char sym[KSYM_SYMBOL_LEN]; | ||
479 | char ip[32]; | ||
480 | |||
481 | if (class->contention_point[i] == 0) | ||
482 | break; | ||
483 | |||
484 | if (!i) | ||
485 | seq_line(m, '-', 40-namelen, namelen); | ||
486 | |||
487 | sprint_symbol(sym, class->contention_point[i]); | ||
488 | snprintf(ip, sizeof(ip), "[<%p>]", | ||
489 | (void *)class->contention_point[i]); | ||
490 | seq_printf(m, "%40s %14lu %29s %s\n", name, | ||
491 | stats->contention_point[i], | ||
492 | ip, sym); | ||
493 | } | ||
494 | if (i) { | ||
495 | seq_puts(m, "\n"); | ||
496 | seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); | ||
497 | seq_puts(m, "\n"); | ||
498 | } | ||
499 | } | ||
500 | |||
501 | static void seq_header(struct seq_file *m) | ||
502 | { | ||
503 | seq_printf(m, "lock_stat version 0.2\n"); | ||
504 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | ||
505 | seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " | ||
506 | "%14s %14s\n", | ||
507 | "class name", | ||
508 | "con-bounces", | ||
509 | "contentions", | ||
510 | "waittime-min", | ||
511 | "waittime-max", | ||
512 | "waittime-total", | ||
513 | "acq-bounces", | ||
514 | "acquisitions", | ||
515 | "holdtime-min", | ||
516 | "holdtime-max", | ||
517 | "holdtime-total"); | ||
518 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | ||
519 | seq_printf(m, "\n"); | ||
520 | } | ||
521 | |||
522 | static void *ls_start(struct seq_file *m, loff_t *pos) | ||
523 | { | ||
524 | struct lock_stat_seq *data = m->private; | ||
525 | |||
526 | if (data->iter == data->stats) | ||
527 | seq_header(m); | ||
528 | |||
529 | if (data->iter == data->iter_end) | ||
530 | data->iter = NULL; | ||
531 | |||
532 | return data->iter; | ||
533 | } | ||
534 | |||
535 | static void *ls_next(struct seq_file *m, void *v, loff_t *pos) | ||
536 | { | ||
537 | struct lock_stat_seq *data = m->private; | ||
538 | |||
539 | (*pos)++; | ||
540 | |||
541 | data->iter = v; | ||
542 | data->iter++; | ||
543 | if (data->iter == data->iter_end) | ||
544 | data->iter = NULL; | ||
545 | |||
546 | return data->iter; | ||
547 | } | ||
548 | |||
549 | static void ls_stop(struct seq_file *m, void *v) | ||
550 | { | ||
551 | } | ||
552 | |||
553 | static int ls_show(struct seq_file *m, void *v) | ||
554 | { | ||
555 | struct lock_stat_seq *data = m->private; | ||
556 | |||
557 | seq_stats(m, data->iter); | ||
558 | return 0; | ||
559 | } | ||
560 | |||
561 | static struct seq_operations lockstat_ops = { | ||
562 | .start = ls_start, | ||
563 | .next = ls_next, | ||
564 | .stop = ls_stop, | ||
565 | .show = ls_show, | ||
566 | }; | ||
567 | |||
568 | static int lock_stat_open(struct inode *inode, struct file *file) | ||
569 | { | ||
570 | int res; | ||
571 | struct lock_class *class; | ||
572 | struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); | ||
573 | |||
574 | if (!data) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | res = seq_open(file, &lockstat_ops); | ||
578 | if (!res) { | ||
579 | struct lock_stat_data *iter = data->stats; | ||
580 | struct seq_file *m = file->private_data; | ||
581 | |||
582 | data->iter = iter; | ||
583 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
584 | iter->class = class; | ||
585 | iter->stats = lock_stats(class); | ||
586 | iter++; | ||
587 | } | ||
588 | data->iter_end = iter; | ||
589 | |||
590 | sort(data->stats, data->iter_end - data->iter, | ||
591 | sizeof(struct lock_stat_data), | ||
592 | lock_stat_cmp, NULL); | ||
593 | |||
594 | m->private = data; | ||
595 | } else | ||
596 | vfree(data); | ||
597 | |||
598 | return res; | ||
599 | } | ||
600 | |||
601 | static ssize_t lock_stat_write(struct file *file, const char __user *buf, | ||
602 | size_t count, loff_t *ppos) | ||
603 | { | ||
604 | struct lock_class *class; | ||
605 | char c; | ||
606 | |||
607 | if (count) { | ||
608 | if (get_user(c, buf)) | ||
609 | return -EFAULT; | ||
610 | |||
611 | if (c != '0') | ||
612 | return count; | ||
613 | |||
614 | list_for_each_entry(class, &all_lock_classes, lock_entry) | ||
615 | clear_lock_stats(class); | ||
616 | } | ||
617 | return count; | ||
618 | } | ||
619 | |||
620 | static int lock_stat_release(struct inode *inode, struct file *file) | ||
621 | { | ||
622 | struct seq_file *seq = file->private_data; | ||
623 | |||
624 | vfree(seq->private); | ||
625 | seq->private = NULL; | ||
626 | return seq_release(inode, file); | ||
627 | } | ||
628 | |||
629 | static const struct file_operations proc_lock_stat_operations = { | ||
630 | .open = lock_stat_open, | ||
631 | .write = lock_stat_write, | ||
632 | .read = seq_read, | ||
633 | .llseek = seq_lseek, | ||
634 | .release = lock_stat_release, | ||
635 | }; | ||
636 | #endif /* CONFIG_LOCK_STAT */ | ||
637 | |||
345 | static int __init lockdep_proc_init(void) | 638 | static int __init lockdep_proc_init(void) |
346 | { | 639 | { |
347 | struct proc_dir_entry *entry; | 640 | struct proc_dir_entry *entry; |
@@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void) | |||
354 | if (entry) | 647 | if (entry) |
355 | entry->proc_fops = &proc_lockdep_stats_operations; | 648 | entry->proc_fops = &proc_lockdep_stats_operations; |
356 | 649 | ||
650 | #ifdef CONFIG_LOCK_STAT | ||
651 | entry = create_proc_entry("lock_stat", S_IRUSR, NULL); | ||
652 | if (entry) | ||
653 | entry->proc_fops = &proc_lock_stat_operations; | ||
654 | #endif | ||
655 | |||
357 | return 0; | 656 | return 0; |
358 | } | 657 | } |
359 | 658 | ||
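With the lock_stat entry registered just above, a CONFIG_LOCK_STAT=y kernel exposes a readable /proc/lock_stat table, and lock_stat_write() clears the accumulated counters when the character '0' is written to the file (anything else is ignored). A small reader/resetter sketch; the file exists only on kernels built with CONFIG_LOCK_STAT=y, and clearing requires sufficient privileges:

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/lock_stat", "r");

        if (!f) {
                perror("/proc/lock_stat");      /* kernel without CONFIG_LOCK_STAT? */
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);

        /* Writing "0" resets the statistics for every lock class (needs root). */
        f = fopen("/proc/lock_stat", "w");
        if (f) {
                fputs("0", f);
                fclose(f);
        }
        return 0;
}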
diff --git a/kernel/module.c b/kernel/module.c index 9bd93de01f..33c04ad511 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -61,10 +61,8 @@ extern int module_sysfs_initialized; | |||
61 | /* If this is set, the section belongs in the init part of the module */ | 61 | /* If this is set, the section belongs in the init part of the module */ |
62 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | 62 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) |
63 | 63 | ||
64 | /* Protects module list */ | 64 | /* List of modules, protected by module_mutex or preempt_disable |
65 | static DEFINE_SPINLOCK(modlist_lock); | 65 | * (add/delete uses stop_machine). */ |
66 | |||
67 | /* List of modules, protected by module_mutex AND modlist_lock */ | ||
68 | static DEFINE_MUTEX(module_mutex); | 66 | static DEFINE_MUTEX(module_mutex); |
69 | static LIST_HEAD(modules); | 67 | static LIST_HEAD(modules); |
70 | 68 | ||
@@ -488,8 +486,7 @@ static void free_modinfo_##field(struct module *mod) \ | |||
488 | mod->field = NULL; \ | 486 | mod->field = NULL; \ |
489 | } \ | 487 | } \ |
490 | static struct module_attribute modinfo_##field = { \ | 488 | static struct module_attribute modinfo_##field = { \ |
491 | .attr = { .name = __stringify(field), .mode = 0444, \ | 489 | .attr = { .name = __stringify(field), .mode = 0444 }, \ |
492 | .owner = THIS_MODULE }, \ | ||
493 | .show = show_modinfo_##field, \ | 490 | .show = show_modinfo_##field, \ |
494 | .setup = setup_modinfo_##field, \ | 491 | .setup = setup_modinfo_##field, \ |
495 | .test = modinfo_##field##_exists, \ | 492 | .test = modinfo_##field##_exists, \ |
@@ -761,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod) | |||
761 | void __symbol_put(const char *symbol) | 758 | void __symbol_put(const char *symbol) |
762 | { | 759 | { |
763 | struct module *owner; | 760 | struct module *owner; |
764 | unsigned long flags; | ||
765 | const unsigned long *crc; | 761 | const unsigned long *crc; |
766 | 762 | ||
767 | spin_lock_irqsave(&modlist_lock, flags); | 763 | preempt_disable(); |
768 | if (!__find_symbol(symbol, &owner, &crc, 1)) | 764 | if (!__find_symbol(symbol, &owner, &crc, 1)) |
769 | BUG(); | 765 | BUG(); |
770 | module_put(owner); | 766 | module_put(owner); |
771 | spin_unlock_irqrestore(&modlist_lock, flags); | 767 | preempt_enable(); |
772 | } | 768 | } |
773 | EXPORT_SYMBOL(__symbol_put); | 769 | EXPORT_SYMBOL(__symbol_put); |
774 | 770 | ||
@@ -793,7 +789,7 @@ static ssize_t show_refcnt(struct module_attribute *mattr, | |||
793 | } | 789 | } |
794 | 790 | ||
795 | static struct module_attribute refcnt = { | 791 | static struct module_attribute refcnt = { |
796 | .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, | 792 | .attr = { .name = "refcnt", .mode = 0444 }, |
797 | .show = show_refcnt, | 793 | .show = show_refcnt, |
798 | }; | 794 | }; |
799 | 795 | ||
@@ -851,7 +847,7 @@ static ssize_t show_initstate(struct module_attribute *mattr, | |||
851 | } | 847 | } |
852 | 848 | ||
853 | static struct module_attribute initstate = { | 849 | static struct module_attribute initstate = { |
854 | .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, | 850 | .attr = { .name = "initstate", .mode = 0444 }, |
855 | .show = show_initstate, | 851 | .show = show_initstate, |
856 | }; | 852 | }; |
857 | 853 | ||
@@ -1032,7 +1028,6 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1032 | sattr->mattr.show = module_sect_show; | 1028 | sattr->mattr.show = module_sect_show; |
1033 | sattr->mattr.store = NULL; | 1029 | sattr->mattr.store = NULL; |
1034 | sattr->mattr.attr.name = sattr->name; | 1030 | sattr->mattr.attr.name = sattr->name; |
1035 | sattr->mattr.attr.owner = mod; | ||
1036 | sattr->mattr.attr.mode = S_IRUGO; | 1031 | sattr->mattr.attr.mode = S_IRUGO; |
1037 | *(gattr++) = &(sattr++)->mattr.attr; | 1032 | *(gattr++) = &(sattr++)->mattr.attr; |
1038 | } | 1033 | } |
@@ -1090,7 +1085,6 @@ int module_add_modinfo_attrs(struct module *mod) | |||
1090 | if (!attr->test || | 1085 | if (!attr->test || |
1091 | (attr->test && attr->test(mod))) { | 1086 | (attr->test && attr->test(mod))) { |
1092 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1087 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
1093 | temp_attr->attr.owner = mod; | ||
1094 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1088 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); |
1095 | ++temp_attr; | 1089 | ++temp_attr; |
1096 | } | 1090 | } |
@@ -1231,14 +1225,14 @@ static void free_module(struct module *mod) | |||
1231 | void *__symbol_get(const char *symbol) | 1225 | void *__symbol_get(const char *symbol) |
1232 | { | 1226 | { |
1233 | struct module *owner; | 1227 | struct module *owner; |
1234 | unsigned long value, flags; | 1228 | unsigned long value; |
1235 | const unsigned long *crc; | 1229 | const unsigned long *crc; |
1236 | 1230 | ||
1237 | spin_lock_irqsave(&modlist_lock, flags); | 1231 | preempt_disable(); |
1238 | value = __find_symbol(symbol, &owner, &crc, 1); | 1232 | value = __find_symbol(symbol, &owner, &crc, 1); |
1239 | if (value && !strong_try_module_get(owner)) | 1233 | if (value && !strong_try_module_get(owner)) |
1240 | value = 0; | 1234 | value = 0; |
1241 | spin_unlock_irqrestore(&modlist_lock, flags); | 1235 | preempt_enable(); |
1242 | 1236 | ||
1243 | return (void *)value; | 1237 | return (void *)value; |
1244 | } | 1238 | } |
@@ -2139,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
2139 | sym = get_ksymbol(mod, addr, NULL, NULL); | 2133 | sym = get_ksymbol(mod, addr, NULL, NULL); |
2140 | if (!sym) | 2134 | if (!sym) |
2141 | goto out; | 2135 | goto out; |
2142 | strlcpy(symname, sym, KSYM_NAME_LEN + 1); | 2136 | strlcpy(symname, sym, KSYM_NAME_LEN); |
2143 | mutex_unlock(&module_mutex); | 2137 | mutex_unlock(&module_mutex); |
2144 | return 0; | 2138 | return 0; |
2145 | } | 2139 | } |
@@ -2164,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2164 | if (!sym) | 2158 | if (!sym) |
2165 | goto out; | 2159 | goto out; |
2166 | if (modname) | 2160 | if (modname) |
2167 | strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); | 2161 | strlcpy(modname, mod->name, MODULE_NAME_LEN); |
2168 | if (name) | 2162 | if (name) |
2169 | strlcpy(name, sym, KSYM_NAME_LEN + 1); | 2163 | strlcpy(name, sym, KSYM_NAME_LEN); |
2170 | mutex_unlock(&module_mutex); | 2164 | mutex_unlock(&module_mutex); |
2171 | return 0; | 2165 | return 0; |
2172 | } | 2166 | } |
@@ -2187,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2187 | *value = mod->symtab[symnum].st_value; | 2181 | *value = mod->symtab[symnum].st_value; |
2188 | *type = mod->symtab[symnum].st_info; | 2182 | *type = mod->symtab[symnum].st_info; |
2189 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, | 2183 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, |
2190 | KSYM_NAME_LEN + 1); | 2184 | KSYM_NAME_LEN); |
2191 | strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); | 2185 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); |
2192 | *exported = is_exported(name, mod); | 2186 | *exported = is_exported(name, mod); |
2193 | mutex_unlock(&module_mutex); | 2187 | mutex_unlock(&module_mutex); |
2194 | return 0; | 2188 | return 0; |
@@ -2235,26 +2229,13 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
2235 | /* Called by the /proc file system to return a list of modules. */ | 2229 | /* Called by the /proc file system to return a list of modules. */ |
2236 | static void *m_start(struct seq_file *m, loff_t *pos) | 2230 | static void *m_start(struct seq_file *m, loff_t *pos) |
2237 | { | 2231 | { |
2238 | struct list_head *i; | ||
2239 | loff_t n = 0; | ||
2240 | |||
2241 | mutex_lock(&module_mutex); | 2232 | mutex_lock(&module_mutex); |
2242 | list_for_each(i, &modules) { | 2233 | return seq_list_start(&modules, *pos); |
2243 | if (n++ == *pos) | ||
2244 | break; | ||
2245 | } | ||
2246 | if (i == &modules) | ||
2247 | return NULL; | ||
2248 | return i; | ||
2249 | } | 2234 | } |
2250 | 2235 | ||
2251 | static void *m_next(struct seq_file *m, void *p, loff_t *pos) | 2236 | static void *m_next(struct seq_file *m, void *p, loff_t *pos) |
2252 | { | 2237 | { |
2253 | struct list_head *i = p; | 2238 | return seq_list_next(p, &modules, pos); |
2254 | (*pos)++; | ||
2255 | if (i->next == &modules) | ||
2256 | return NULL; | ||
2257 | return i->next; | ||
2258 | } | 2239 | } |
2259 | 2240 | ||
2260 | static void m_stop(struct seq_file *m, void *p) | 2241 | static void m_stop(struct seq_file *m, void *p) |
@@ -2324,11 +2305,10 @@ const struct seq_operations modules_op = { | |||
2324 | /* Given an address, look for it in the module exception tables. */ | 2305 | /* Given an address, look for it in the module exception tables. */ |
2325 | const struct exception_table_entry *search_module_extables(unsigned long addr) | 2306 | const struct exception_table_entry *search_module_extables(unsigned long addr) |
2326 | { | 2307 | { |
2327 | unsigned long flags; | ||
2328 | const struct exception_table_entry *e = NULL; | 2308 | const struct exception_table_entry *e = NULL; |
2329 | struct module *mod; | 2309 | struct module *mod; |
2330 | 2310 | ||
2331 | spin_lock_irqsave(&modlist_lock, flags); | 2311 | preempt_disable(); |
2332 | list_for_each_entry(mod, &modules, list) { | 2312 | list_for_each_entry(mod, &modules, list) { |
2333 | if (mod->num_exentries == 0) | 2313 | if (mod->num_exentries == 0) |
2334 | continue; | 2314 | continue; |
@@ -2339,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
2339 | if (e) | 2319 | if (e) |
2340 | break; | 2320 | break; |
2341 | } | 2321 | } |
2342 | spin_unlock_irqrestore(&modlist_lock, flags); | 2322 | preempt_enable(); |
2343 | 2323 | ||
2344 | /* Now, if we found one, we are running inside it now, hence | 2324 | /* Now, if we found one, we are running inside it now, hence |
2345 | we cannot unload the module, hence no refcnt needed. */ | 2325 | we cannot unload the module, hence no refcnt needed. */ |
@@ -2351,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
2351 | */ | 2331 | */ |
2352 | int is_module_address(unsigned long addr) | 2332 | int is_module_address(unsigned long addr) |
2353 | { | 2333 | { |
2354 | unsigned long flags; | ||
2355 | struct module *mod; | 2334 | struct module *mod; |
2356 | 2335 | ||
2357 | spin_lock_irqsave(&modlist_lock, flags); | 2336 | preempt_disable(); |
2358 | 2337 | ||
2359 | list_for_each_entry(mod, &modules, list) { | 2338 | list_for_each_entry(mod, &modules, list) { |
2360 | if (within(addr, mod->module_core, mod->core_size)) { | 2339 | if (within(addr, mod->module_core, mod->core_size)) { |
2361 | spin_unlock_irqrestore(&modlist_lock, flags); | 2340 | preempt_enable(); |
2362 | return 1; | 2341 | return 1; |
2363 | } | 2342 | } |
2364 | } | 2343 | } |
2365 | 2344 | ||
2366 | spin_unlock_irqrestore(&modlist_lock, flags); | 2345 | preempt_enable(); |
2367 | 2346 | ||
2368 | return 0; | 2347 | return 0; |
2369 | } | 2348 | } |
2370 | 2349 | ||
2371 | 2350 | ||
2372 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ | 2351 | /* Is this a valid kernel address? */ |
2373 | struct module *__module_text_address(unsigned long addr) | 2352 | struct module *__module_text_address(unsigned long addr) |
2374 | { | 2353 | { |
2375 | struct module *mod; | 2354 | struct module *mod; |
@@ -2384,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr) | |||
2384 | struct module *module_text_address(unsigned long addr) | 2363 | struct module *module_text_address(unsigned long addr) |
2385 | { | 2364 | { |
2386 | struct module *mod; | 2365 | struct module *mod; |
2387 | unsigned long flags; | ||
2388 | 2366 | ||
2389 | spin_lock_irqsave(&modlist_lock, flags); | 2367 | preempt_disable(); |
2390 | mod = __module_text_address(addr); | 2368 | mod = __module_text_address(addr); |
2391 | spin_unlock_irqrestore(&modlist_lock, flags); | 2369 | preempt_enable(); |
2392 | 2370 | ||
2393 | return mod; | 2371 | return mod; |
2394 | } | 2372 | } |
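The module.c hunks above drop modlist_lock and rely on preempt_disable()/preempt_enable() for read-only walks of the module list, which is safe because list add/delete goes through stop_machine(); the /proc/modules iterator is also reduced to the seq_list_start()/seq_list_next() helpers. A sketch of the same read-side pattern follows; the helper name is invented, and since the modules list is static it would only make sense inside kernel/module.c itself.

#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/string.h>

/* Sketch of the read-side pattern the patch adopts: no spinlock, just
 * disabled preemption while walking the list, because insert/remove go
 * through stop_machine().  my_module_is_loaded() is a hypothetical
 * helper, not something this patch adds. */
static int my_module_is_loaded(const char *name)
{
	struct module *mod;
	int found = 0;

	preempt_disable();
	list_for_each_entry(mod, &modules, list) {
		if (strcmp(mod->name, name) == 0) {
			found = 1;
			break;
		}
	}
	preempt_enable();

	return found;
}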
diff --git a/kernel/mutex.c b/kernel/mutex.c index 303eab1848..691b86564d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
139 | list_add_tail(&waiter.list, &lock->wait_list); | 139 | list_add_tail(&waiter.list, &lock->wait_list); |
140 | waiter.task = task; | 140 | waiter.task = task; |
141 | 141 | ||
142 | old_val = atomic_xchg(&lock->count, -1); | ||
143 | if (old_val == 1) | ||
144 | goto done; | ||
145 | |||
146 | lock_contended(&lock->dep_map, _RET_IP_); | ||
147 | |||
142 | for (;;) { | 148 | for (;;) { |
143 | /* | 149 | /* |
144 | * Lets try to take the lock again - this is needed even if | 150 | * Lets try to take the lock again - this is needed even if |
@@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
174 | spin_lock_mutex(&lock->wait_lock, flags); | 180 | spin_lock_mutex(&lock->wait_lock, flags); |
175 | } | 181 | } |
176 | 182 | ||
183 | done: | ||
184 | lock_acquired(&lock->dep_map); | ||
177 | /* got the lock - rejoice! */ | 185 | /* got the lock - rejoice! */ |
178 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 186 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); |
179 | debug_mutex_set_owner(lock, task_thread_info(task)); | 187 | debug_mutex_set_owner(lock, task_thread_info(task)); |
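The mutex change adds one more atomic_xchg() attempt before a waiter records contention, and brackets the slow path with lock_contended()/lock_acquired() so /proc/lock_stat can attribute wait time. Below is a toy sketch of the xchg test itself (names are made up); in the mutex fast path, count 1 means "unlocked" and -1 means "locked with waiters", so seeing the old value 1 means the caller just took the lock and can skip the contention accounting.

#include <asm/atomic.h>

/* Toy illustration of the fast-path check added to
 * __mutex_lock_common(): an atomic_xchg() to -1 that returns 1 means
 * the lock was free, so no lock_contended() event is recorded.
 * toy_count/toy_trylock exist only for this sketch. */
static atomic_t toy_count = ATOMIC_INIT(1);	/* 1 == unlocked */

static int toy_trylock(void)
{
	return atomic_xchg(&toy_count, -1) == 1;
}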
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9e83b589f7..a4fb7d4697 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -21,6 +21,8 @@ | |||
21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | 23 | ||
24 | static struct kmem_cache *nsproxy_cachep; | ||
25 | |||
24 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 26 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); |
25 | 27 | ||
26 | static inline void get_nsproxy(struct nsproxy *ns) | 28 | static inline void get_nsproxy(struct nsproxy *ns) |
@@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) | |||
43 | { | 45 | { |
44 | struct nsproxy *ns; | 46 | struct nsproxy *ns; |
45 | 47 | ||
46 | ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); | 48 | ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); |
47 | if (ns) | 49 | if (ns) { |
50 | memcpy(ns, orig, sizeof(struct nsproxy)); | ||
48 | atomic_set(&ns->count, 1); | 51 | atomic_set(&ns->count, 1); |
52 | } | ||
49 | return ns; | 53 | return ns; |
50 | } | 54 | } |
51 | 55 | ||
@@ -54,33 +58,51 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) | |||
54 | * Return the newly created nsproxy. Do not attach this to the task, | 58 | * Return the newly created nsproxy. Do not attach this to the task, |
55 | * leave it to the caller to do proper locking and attach it to task. | 59 | * leave it to the caller to do proper locking and attach it to task. |
56 | */ | 60 | */ |
57 | static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, | 61 | static struct nsproxy *create_new_namespaces(unsigned long flags, |
58 | struct fs_struct *new_fs) | 62 | struct task_struct *tsk, struct fs_struct *new_fs) |
59 | { | 63 | { |
60 | struct nsproxy *new_nsp; | 64 | struct nsproxy *new_nsp; |
65 | int err; | ||
61 | 66 | ||
62 | new_nsp = clone_nsproxy(tsk->nsproxy); | 67 | new_nsp = clone_nsproxy(tsk->nsproxy); |
63 | if (!new_nsp) | 68 | if (!new_nsp) |
64 | return ERR_PTR(-ENOMEM); | 69 | return ERR_PTR(-ENOMEM); |
65 | 70 | ||
66 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); | 71 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); |
67 | if (IS_ERR(new_nsp->mnt_ns)) | 72 | if (IS_ERR(new_nsp->mnt_ns)) { |
73 | err = PTR_ERR(new_nsp->mnt_ns); | ||
68 | goto out_ns; | 74 | goto out_ns; |
75 | } | ||
69 | 76 | ||
70 | new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); | 77 | new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); |
71 | if (IS_ERR(new_nsp->uts_ns)) | 78 | if (IS_ERR(new_nsp->uts_ns)) { |
79 | err = PTR_ERR(new_nsp->uts_ns); | ||
72 | goto out_uts; | 80 | goto out_uts; |
81 | } | ||
73 | 82 | ||
74 | new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); | 83 | new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); |
75 | if (IS_ERR(new_nsp->ipc_ns)) | 84 | if (IS_ERR(new_nsp->ipc_ns)) { |
85 | err = PTR_ERR(new_nsp->ipc_ns); | ||
76 | goto out_ipc; | 86 | goto out_ipc; |
87 | } | ||
77 | 88 | ||
78 | new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); | 89 | new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); |
79 | if (IS_ERR(new_nsp->pid_ns)) | 90 | if (IS_ERR(new_nsp->pid_ns)) { |
91 | err = PTR_ERR(new_nsp->pid_ns); | ||
80 | goto out_pid; | 92 | goto out_pid; |
93 | } | ||
94 | |||
95 | new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); | ||
96 | if (IS_ERR(new_nsp->user_ns)) { | ||
97 | err = PTR_ERR(new_nsp->user_ns); | ||
98 | goto out_user; | ||
99 | } | ||
81 | 100 | ||
82 | return new_nsp; | 101 | return new_nsp; |
83 | 102 | ||
103 | out_user: | ||
104 | if (new_nsp->pid_ns) | ||
105 | put_pid_ns(new_nsp->pid_ns); | ||
84 | out_pid: | 106 | out_pid: |
85 | if (new_nsp->ipc_ns) | 107 | if (new_nsp->ipc_ns) |
86 | put_ipc_ns(new_nsp->ipc_ns); | 108 | put_ipc_ns(new_nsp->ipc_ns); |
@@ -91,15 +113,15 @@ out_uts: | |||
91 | if (new_nsp->mnt_ns) | 113 | if (new_nsp->mnt_ns) |
92 | put_mnt_ns(new_nsp->mnt_ns); | 114 | put_mnt_ns(new_nsp->mnt_ns); |
93 | out_ns: | 115 | out_ns: |
94 | kfree(new_nsp); | 116 | kmem_cache_free(nsproxy_cachep, new_nsp); |
95 | return ERR_PTR(-ENOMEM); | 117 | return ERR_PTR(err); |
96 | } | 118 | } |
97 | 119 | ||
98 | /* | 120 | /* |
99 | * called from clone. This now handles copy for nsproxy and all | 121 | * called from clone. This now handles copy for nsproxy and all |
100 | * namespaces therein. | 122 | * namespaces therein. |
101 | */ | 123 | */ |
102 | int copy_namespaces(int flags, struct task_struct *tsk) | 124 | int copy_namespaces(unsigned long flags, struct task_struct *tsk) |
103 | { | 125 | { |
104 | struct nsproxy *old_ns = tsk->nsproxy; | 126 | struct nsproxy *old_ns = tsk->nsproxy; |
105 | struct nsproxy *new_ns; | 127 | struct nsproxy *new_ns; |
@@ -110,7 +132,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
110 | 132 | ||
111 | get_nsproxy(old_ns); | 133 | get_nsproxy(old_ns); |
112 | 134 | ||
113 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) | 135 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) |
114 | return 0; | 136 | return 0; |
115 | 137 | ||
116 | if (!capable(CAP_SYS_ADMIN)) { | 138 | if (!capable(CAP_SYS_ADMIN)) { |
@@ -140,7 +162,9 @@ void free_nsproxy(struct nsproxy *ns) | |||
140 | put_ipc_ns(ns->ipc_ns); | 162 | put_ipc_ns(ns->ipc_ns); |
141 | if (ns->pid_ns) | 163 | if (ns->pid_ns) |
142 | put_pid_ns(ns->pid_ns); | 164 | put_pid_ns(ns->pid_ns); |
143 | kfree(ns); | 165 | if (ns->user_ns) |
166 | put_user_ns(ns->user_ns); | ||
167 | kmem_cache_free(nsproxy_cachep, ns); | ||
144 | } | 168 | } |
145 | 169 | ||
146 | /* | 170 | /* |
@@ -152,19 +176,10 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
152 | { | 176 | { |
153 | int err = 0; | 177 | int err = 0; |
154 | 178 | ||
155 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) | 179 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
180 | CLONE_NEWUSER))) | ||
156 | return 0; | 181 | return 0; |
157 | 182 | ||
158 | #ifndef CONFIG_IPC_NS | ||
159 | if (unshare_flags & CLONE_NEWIPC) | ||
160 | return -EINVAL; | ||
161 | #endif | ||
162 | |||
163 | #ifndef CONFIG_UTS_NS | ||
164 | if (unshare_flags & CLONE_NEWUTS) | ||
165 | return -EINVAL; | ||
166 | #endif | ||
167 | |||
168 | if (!capable(CAP_SYS_ADMIN)) | 183 | if (!capable(CAP_SYS_ADMIN)) |
169 | return -EPERM; | 184 | return -EPERM; |
170 | 185 | ||
@@ -174,3 +189,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
174 | err = PTR_ERR(*new_nsp); | 189 | err = PTR_ERR(*new_nsp); |
175 | return err; | 190 | return err; |
176 | } | 191 | } |
192 | |||
193 | static int __init nsproxy_cache_init(void) | ||
194 | { | ||
195 | nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), | ||
196 | 0, SLAB_PANIC, NULL); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | module_init(nsproxy_cache_init); | ||
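Besides switching nsproxy allocation to a dedicated slab cache, the hunk above stops collapsing every failure into -ENOMEM and instead forwards the error returned by each copy_*() helper. A minimal sketch of that ERR_PTR/PTR_ERR convention follows; struct part, copy_part() and make_widget() are placeholders used only for illustration.

#include <linux/err.h>
#include <linux/slab.h>

struct part;
extern struct part *copy_part(struct part *orig);	/* placeholder */

struct widget {
	struct part *part;
};

/* Sketch of the error-propagation style the patch adopts: keep the
 * error encoded in the returned pointer instead of assuming -ENOMEM. */
static struct widget *make_widget(struct part *orig)
{
	struct widget *w;
	int err;

	w = kmalloc(sizeof(*w), GFP_KERNEL);
	if (!w)
		return ERR_PTR(-ENOMEM);

	w->part = copy_part(orig);
	if (IS_ERR(w->part)) {
		err = PTR_ERR(w->part);		/* preserve the real error */
		kfree(w);
		return ERR_PTR(err);
	}
	return w;
}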
diff --git a/kernel/panic.c b/kernel/panic.c index 623d182825..f64f4c1ac1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -159,14 +159,15 @@ const char *print_tainted(void) | |||
159 | { | 159 | { |
160 | static char buf[20]; | 160 | static char buf[20]; |
161 | if (tainted) { | 161 | if (tainted) { |
162 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", | 162 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", |
163 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', | 163 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', |
164 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', | 164 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', |
165 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', | 165 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', |
166 | tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', | 166 | tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', |
167 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', | 167 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', |
168 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', | 168 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', |
169 | tainted & TAINT_USER ? 'U' : ' '); | 169 | tainted & TAINT_USER ? 'U' : ' ', |
170 | tainted & TAINT_DIE ? 'D' : ' '); | ||
170 | } | 171 | } |
171 | else | 172 | else |
172 | snprintf(buf, sizeof(buf), "Not tainted"); | 173 | snprintf(buf, sizeof(buf), "Not tainted"); |
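The panic.c hunk widens the tainted string by one flag, TAINT_DIE, printed as 'D'. Below is a standalone userspace illustration of the same flag-to-letter mapping, written table-driven rather than as the ternary chain in print_tainted(); the bit positions follow the TAINT_* values of this era and are shown only for illustration.

#include <stdio.h>

static const struct { unsigned bit; char set, clear; } taints[] = {
	{ 1 << 0, 'P', 'G' },	/* TAINT_PROPRIETARY_MODULE */
	{ 1 << 1, 'F', ' ' },	/* TAINT_FORCED_MODULE */
	{ 1 << 2, 'S', ' ' },	/* TAINT_UNSAFE_SMP */
	{ 1 << 3, 'R', ' ' },	/* TAINT_FORCED_RMMOD */
	{ 1 << 4, 'M', ' ' },	/* TAINT_MACHINE_CHECK */
	{ 1 << 5, 'B', ' ' },	/* TAINT_BAD_PAGE */
	{ 1 << 6, 'U', ' ' },	/* TAINT_USER */
	{ 1 << 7, 'D', ' ' },	/* TAINT_DIE, added by this patch */
};

/* Build the 8-character taint string from a mask. */
static void format_tainted(unsigned mask, char *buf)
{
	size_t i;

	for (i = 0; i < sizeof(taints) / sizeof(taints[0]); i++)
		buf[i] = (mask & taints[i].bit) ? taints[i].set : taints[i].clear;
	buf[i] = '\0';
}

int main(void)
{
	char buf[9];

	format_tainted((1 << 0) | (1 << 7), buf);
	printf("Tainted: %s\n", buf);	/* "P", six blanks, "D" */
	return 0;
}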
diff --git a/kernel/params.c b/kernel/params.c index e61c46c97c..effbaaedd7 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -491,7 +491,6 @@ param_sysfs_setup(struct module_kobject *mk, | |||
491 | pattr->mattr.show = param_attr_show; | 491 | pattr->mattr.show = param_attr_show; |
492 | pattr->mattr.store = param_attr_store; | 492 | pattr->mattr.store = param_attr_store; |
493 | pattr->mattr.attr.name = (char *)&kp->name[name_skip]; | 493 | pattr->mattr.attr.name = (char *)&kp->name[name_skip]; |
494 | pattr->mattr.attr.owner = mk->mod; | ||
495 | pattr->mattr.attr.mode = kp->perm; | 494 | pattr->mattr.attr.mode = kp->perm; |
496 | *(gattr++) = &(pattr++)->mattr.attr; | 495 | *(gattr++) = &(pattr++)->mattr.attr; |
497 | } | 496 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index eb66bd2953..c6e3f9ffff 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr) | |||
365 | } | 365 | } |
366 | EXPORT_SYMBOL_GPL(find_get_pid); | 366 | EXPORT_SYMBOL_GPL(find_get_pid); |
367 | 367 | ||
368 | struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) | 368 | struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) |
369 | { | 369 | { |
370 | BUG_ON(!old_ns); | 370 | BUG_ON(!old_ns); |
371 | get_pid_ns(old_ns); | 371 | get_pid_ns(old_ns); |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 1de710e183..b53c8fcd9d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p) | |||
161 | } | 161 | } |
162 | static inline unsigned long long sched_ns(struct task_struct *p) | 162 | static inline unsigned long long sched_ns(struct task_struct *p) |
163 | { | 163 | { |
164 | return (p == current) ? current_sched_time(p) : p->sched_time; | 164 | return task_sched_runtime(p); |
165 | } | 165 | } |
166 | 166 | ||
167 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | 167 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) |
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, | |||
246 | } while (t != p); | 246 | } while (t != p); |
247 | break; | 247 | break; |
248 | case CPUCLOCK_SCHED: | 248 | case CPUCLOCK_SCHED: |
249 | cpu->sched = p->signal->sched_time; | 249 | cpu->sched = p->signal->sum_sched_runtime; |
250 | /* Add in each other live thread. */ | 250 | /* Add in each other live thread. */ |
251 | while ((t = next_thread(t)) != p) { | 251 | while ((t = next_thread(t)) != p) { |
252 | cpu->sched += t->sched_time; | 252 | cpu->sched += t->se.sum_exec_runtime; |
253 | } | 253 | } |
254 | cpu->sched += sched_ns(p); | 254 | cpu->sched += sched_ns(p); |
255 | break; | 255 | break; |
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) | |||
422 | */ | 422 | */ |
423 | static void cleanup_timers(struct list_head *head, | 423 | static void cleanup_timers(struct list_head *head, |
424 | cputime_t utime, cputime_t stime, | 424 | cputime_t utime, cputime_t stime, |
425 | unsigned long long sched_time) | 425 | unsigned long long sum_exec_runtime) |
426 | { | 426 | { |
427 | struct cpu_timer_list *timer, *next; | 427 | struct cpu_timer_list *timer, *next; |
428 | cputime_t ptime = cputime_add(utime, stime); | 428 | cputime_t ptime = cputime_add(utime, stime); |
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head, | |||
451 | ++head; | 451 | ++head; |
452 | list_for_each_entry_safe(timer, next, head, entry) { | 452 | list_for_each_entry_safe(timer, next, head, entry) { |
453 | list_del_init(&timer->entry); | 453 | list_del_init(&timer->entry); |
454 | if (timer->expires.sched < sched_time) { | 454 | if (timer->expires.sched < sum_exec_runtime) { |
455 | timer->expires.sched = 0; | 455 | timer->expires.sched = 0; |
456 | } else { | 456 | } else { |
457 | timer->expires.sched -= sched_time; | 457 | timer->expires.sched -= sum_exec_runtime; |
458 | } | 458 | } |
459 | } | 459 | } |
460 | } | 460 | } |
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head, | |||
467 | void posix_cpu_timers_exit(struct task_struct *tsk) | 467 | void posix_cpu_timers_exit(struct task_struct *tsk) |
468 | { | 468 | { |
469 | cleanup_timers(tsk->cpu_timers, | 469 | cleanup_timers(tsk->cpu_timers, |
470 | tsk->utime, tsk->stime, tsk->sched_time); | 470 | tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); |
471 | 471 | ||
472 | } | 472 | } |
473 | void posix_cpu_timers_exit_group(struct task_struct *tsk) | 473 | void posix_cpu_timers_exit_group(struct task_struct *tsk) |
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
475 | cleanup_timers(tsk->signal->cpu_timers, | 475 | cleanup_timers(tsk->signal->cpu_timers, |
476 | cputime_add(tsk->utime, tsk->signal->utime), | 476 | cputime_add(tsk->utime, tsk->signal->utime), |
477 | cputime_add(tsk->stime, tsk->signal->stime), | 477 | cputime_add(tsk->stime, tsk->signal->stime), |
478 | tsk->sched_time + tsk->signal->sched_time); | 478 | tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); |
479 | } | 479 | } |
480 | 480 | ||
481 | 481 | ||
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
536 | nsleft = max_t(unsigned long long, nsleft, 1); | 536 | nsleft = max_t(unsigned long long, nsleft, 1); |
537 | do { | 537 | do { |
538 | if (likely(!(t->flags & PF_EXITING))) { | 538 | if (likely(!(t->flags & PF_EXITING))) { |
539 | ns = t->sched_time + nsleft; | 539 | ns = t->se.sum_exec_runtime + nsleft; |
540 | if (t->it_sched_expires == 0 || | 540 | if (t->it_sched_expires == 0 || |
541 | t->it_sched_expires > ns) { | 541 | t->it_sched_expires > ns) { |
542 | t->it_sched_expires = ns; | 542 | t->it_sched_expires = ns; |
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1004 | struct cpu_timer_list *t = list_first_entry(timers, | 1004 | struct cpu_timer_list *t = list_first_entry(timers, |
1005 | struct cpu_timer_list, | 1005 | struct cpu_timer_list, |
1006 | entry); | 1006 | entry); |
1007 | if (!--maxfire || tsk->sched_time < t->expires.sched) { | 1007 | if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { |
1008 | tsk->it_sched_expires = t->expires.sched; | 1008 | tsk->it_sched_expires = t->expires.sched; |
1009 | break; | 1009 | break; |
1010 | } | 1010 | } |
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1024 | int maxfire; | 1024 | int maxfire; |
1025 | struct signal_struct *const sig = tsk->signal; | 1025 | struct signal_struct *const sig = tsk->signal; |
1026 | cputime_t utime, stime, ptime, virt_expires, prof_expires; | 1026 | cputime_t utime, stime, ptime, virt_expires, prof_expires; |
1027 | unsigned long long sched_time, sched_expires; | 1027 | unsigned long long sum_sched_runtime, sched_expires; |
1028 | struct task_struct *t; | 1028 | struct task_struct *t; |
1029 | struct list_head *timers = sig->cpu_timers; | 1029 | struct list_head *timers = sig->cpu_timers; |
1030 | 1030 | ||
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk, | |||
1044 | */ | 1044 | */ |
1045 | utime = sig->utime; | 1045 | utime = sig->utime; |
1046 | stime = sig->stime; | 1046 | stime = sig->stime; |
1047 | sched_time = sig->sched_time; | 1047 | sum_sched_runtime = sig->sum_sched_runtime; |
1048 | t = tsk; | 1048 | t = tsk; |
1049 | do { | 1049 | do { |
1050 | utime = cputime_add(utime, t->utime); | 1050 | utime = cputime_add(utime, t->utime); |
1051 | stime = cputime_add(stime, t->stime); | 1051 | stime = cputime_add(stime, t->stime); |
1052 | sched_time += t->sched_time; | 1052 | sum_sched_runtime += t->se.sum_exec_runtime; |
1053 | t = next_thread(t); | 1053 | t = next_thread(t); |
1054 | } while (t != tsk); | 1054 | } while (t != tsk); |
1055 | ptime = cputime_add(utime, stime); | 1055 | ptime = cputime_add(utime, stime); |
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1090 | struct cpu_timer_list *t = list_first_entry(timers, | 1090 | struct cpu_timer_list *t = list_first_entry(timers, |
1091 | struct cpu_timer_list, | 1091 | struct cpu_timer_list, |
1092 | entry); | 1092 | entry); |
1093 | if (!--maxfire || sched_time < t->expires.sched) { | 1093 | if (!--maxfire || sum_sched_runtime < t->expires.sched) { |
1094 | sched_expires = t->expires.sched; | 1094 | sched_expires = t->expires.sched; |
1095 | break; | 1095 | break; |
1096 | } | 1096 | } |
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1182 | virt_left = cputime_sub(virt_expires, utime); | 1182 | virt_left = cputime_sub(virt_expires, utime); |
1183 | virt_left = cputime_div_non_zero(virt_left, nthreads); | 1183 | virt_left = cputime_div_non_zero(virt_left, nthreads); |
1184 | if (sched_expires) { | 1184 | if (sched_expires) { |
1185 | sched_left = sched_expires - sched_time; | 1185 | sched_left = sched_expires - sum_sched_runtime; |
1186 | do_div(sched_left, nthreads); | 1186 | do_div(sched_left, nthreads); |
1187 | sched_left = max_t(unsigned long long, sched_left, 1); | 1187 | sched_left = max_t(unsigned long long, sched_left, 1); |
1188 | } else { | 1188 | } else { |
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1208 | t->it_virt_expires = ticks; | 1208 | t->it_virt_expires = ticks; |
1209 | } | 1209 | } |
1210 | 1210 | ||
1211 | sched = t->sched_time + sched_left; | 1211 | sched = t->se.sum_exec_runtime + sched_left; |
1212 | if (sched_expires && (t->it_sched_expires == 0 || | 1212 | if (sched_expires && (t->it_sched_expires == 0 || |
1213 | t->it_sched_expires > sched)) { | 1213 | t->it_sched_expires > sched)) { |
1214 | t->it_sched_expires = sched; | 1214 | t->it_sched_expires = sched; |
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1300 | 1300 | ||
1301 | if (UNEXPIRED(prof) && UNEXPIRED(virt) && | 1301 | if (UNEXPIRED(prof) && UNEXPIRED(virt) && |
1302 | (tsk->it_sched_expires == 0 || | 1302 | (tsk->it_sched_expires == 0 || |
1303 | tsk->sched_time < tsk->it_sched_expires)) | 1303 | tsk->se.sum_exec_runtime < tsk->it_sched_expires)) |
1304 | return; | 1304 | return; |
1305 | 1305 | ||
1306 | #undef UNEXPIRED | 1306 | #undef UNEXPIRED |
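These hunks track the CFS rename: the per-task sched_time field becomes se.sum_exec_runtime, the signal-struct accumulator becomes sum_sched_runtime, and task_sched_runtime() supplies an up-to-date value for the sampled task. The group total used for CPUCLOCK_SCHED is built the same way throughout the file; a condensed sketch is below, with locking elided and the helper name invented.

#include <linux/sched.h>

/* Sketch only: total scheduled time of a thread group after the CFS
 * rename -- the runtime banked in the signal struct for exited threads
 * plus each live thread's se.sum_exec_runtime.  Callers must hold the
 * appropriate locks; group_sched_runtime() is not a real kernel
 * function. */
static unsigned long long group_sched_runtime(struct task_struct *p)
{
	unsigned long long ns = p->signal->sum_sched_runtime;
	struct task_struct *t = p;

	do {
		ns += t->se.sum_exec_runtime;
		t = next_thread(t);
	} while (t != p);

	return ns;
}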
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 329ce01720..55b3761eda 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void) | |||
241 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | 241 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); |
242 | 242 | ||
243 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 243 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
244 | sizeof (struct k_itimer), 0, 0, NULL, NULL); | 244 | sizeof (struct k_itimer), 0, 0, NULL); |
245 | idr_init(&posix_timers_id); | 245 | idr_init(&posix_timers_id); |
246 | return 0; | 246 | return 0; |
247 | } | 247 | } |
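The one-line change here follows the slab API losing its destructor argument, so kmem_cache_create() now takes five parameters: name, object size, alignment, flags and an optional constructor. A sketch of the updated call pattern together with alloc/free is below; the my_obj names are illustrative only.

#include <linux/init.h>
#include <linux/slab.h>

struct my_obj {
	int val;
};

static struct kmem_cache *my_cache;

/* Five-argument form: name, size, align, flags, constructor. */
static int __init my_cache_init(void)
{
	my_cache = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
				     0, SLAB_PANIC, NULL);
	return 0;
}

static struct my_obj *my_obj_alloc(void)
{
	return kmem_cache_alloc(my_cache, GFP_KERNEL);
}

static void my_obj_free(struct my_obj *o)
{
	kmem_cache_free(my_cache, o);
}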
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 495b7d4dd3..c1a106d87d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -33,13 +33,20 @@ config PM_DEBUG | |||
33 | bool "Power Management Debug Support" | 33 | bool "Power Management Debug Support" |
34 | depends on PM | 34 | depends on PM |
35 | ---help--- | 35 | ---help--- |
36 | This option enables verbose debugging support in the Power Management | 36 | This option enables various debugging support in the Power Management |
37 | code. This is helpful when debugging and reporting various PM bugs, | 37 | code. This is helpful when debugging and reporting PM bugs, like |
38 | like suspend support. | 38 | suspend support. |
39 | |||
40 | config PM_VERBOSE | ||
41 | bool "Verbose Power Management debugging" | ||
42 | depends on PM_DEBUG | ||
43 | default n | ||
44 | ---help--- | ||
45 | This option enables verbose messages from the Power Management code. | ||
39 | 46 | ||
40 | config DISABLE_CONSOLE_SUSPEND | 47 | config DISABLE_CONSOLE_SUSPEND |
41 | bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" | 48 | bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" |
42 | depends on PM && PM_DEBUG | 49 | depends on PM_DEBUG |
43 | default n | 50 | default n |
44 | ---help--- | 51 | ---help--- |
45 | This option turns off the console suspend mechanism that prevents | 52 | This option turns off the console suspend mechanism that prevents |
@@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND | |||
50 | 57 | ||
51 | config PM_TRACE | 58 | config PM_TRACE |
52 | bool "Suspend/resume event tracing" | 59 | bool "Suspend/resume event tracing" |
53 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL | 60 | depends on PM_DEBUG && X86 && EXPERIMENTAL |
54 | default n | 61 | default n |
55 | ---help--- | 62 | ---help--- |
56 | This enables some cheesy code to save the last PM event point in the | 63 | This enables some cheesy code to save the last PM event point in the |
@@ -65,18 +72,6 @@ config PM_TRACE | |||
65 | CAUTION: this option will cause your machine's real-time clock to be | 72 | CAUTION: this option will cause your machine's real-time clock to be |
66 | set to an invalid time after a resume. | 73 | set to an invalid time after a resume. |
67 | 74 | ||
68 | config PM_SYSFS_DEPRECATED | ||
69 | bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" | ||
70 | depends on PM && SYSFS | ||
71 | default n | ||
72 | help | ||
73 | The driver model started out with a sysfs file intended to provide | ||
74 | a userspace hook for device power management. This feature has never | ||
75 | worked very well, except for limited testing purposes, and so it will | ||
76 | be removed. It's not clear that a generic mechanism could really | ||
77 | handle the wide variability of device power states; any replacements | ||
78 | are likely to be bus or driver specific. | ||
79 | |||
80 | config SOFTWARE_SUSPEND | 75 | config SOFTWARE_SUSPEND |
81 | bool "Software Suspend (Hibernation)" | 76 | bool "Software Suspend (Hibernation)" |
82 | depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) | 77 | depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) |
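The Kconfig hunk moves the noisier messages behind a new PM_VERBOSE option, lets DISABLE_CONSOLE_SUSPEND and PM_TRACE depend on PM_DEBUG alone (PM_TRACE now covering any X86), and drops the deprecated PM_SYSFS_DEPRECATED entry. The consumers of PM_VERBOSE are not part of this hunk; the snippet below is only a generic sketch of how such a bool option typically gates messages at compile time, and the macro name is made up.

#include <linux/kernel.h>

/* Generic pattern for a Kconfig bool, not the PM core's actual macro:
 * compile the message in only when CONFIG_PM_VERBOSE is set. */
#ifdef CONFIG_PM_VERBOSE
#define my_pm_pr(fmt, args...)	printk(KERN_DEBUG "PM: " fmt, ##args)
#else
#define my_pm_pr(fmt, args...)	do { } while (0)
#endif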
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f445b9cd60..324ac0188c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -45,7 +45,7 @@ enum { | |||
45 | 45 | ||
46 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 46 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
47 | 47 | ||
48 | struct hibernation_ops *hibernation_ops; | 48 | static struct hibernation_ops *hibernation_ops; |
49 | 49 | ||
50 | /** | 50 | /** |
51 | * hibernation_set_ops - set the global hibernate operations | 51 | * hibernation_set_ops - set the global hibernate operations |
@@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops; | |||
54 | 54 | ||
55 | void hibernation_set_ops(struct hibernation_ops *ops) | 55 | void hibernation_set_ops(struct hibernation_ops *ops) |
56 | { | 56 | { |
57 | if (ops && !(ops->prepare && ops->enter && ops->finish)) { | 57 | if (ops && !(ops->prepare && ops->enter && ops->finish |
58 | && ops->pre_restore && ops->restore_cleanup)) { | ||
58 | WARN_ON(1); | 59 | WARN_ON(1); |
59 | return; | 60 | return; |
60 | } | 61 | } |
@@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops) | |||
74 | * platform driver if so configured and return an error code if it fails | 75 | * platform driver if so configured and return an error code if it fails |
75 | */ | 76 | */ |
76 | 77 | ||
77 | static int platform_prepare(void) | 78 | static int platform_prepare(int platform_mode) |
78 | { | 79 | { |
79 | return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? | 80 | return (platform_mode && hibernation_ops) ? |
80 | hibernation_ops->prepare() : 0; | 81 | hibernation_ops->prepare() : 0; |
81 | } | 82 | } |
82 | 83 | ||
@@ -85,13 +86,145 @@ static int platform_prepare(void) | |||
85 | * using the platform driver (must be called after platform_prepare()) | 86 | * using the platform driver (must be called after platform_prepare()) |
86 | */ | 87 | */ |
87 | 88 | ||
88 | static void platform_finish(void) | 89 | static void platform_finish(int platform_mode) |
89 | { | 90 | { |
90 | if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) | 91 | if (platform_mode && hibernation_ops) |
91 | hibernation_ops->finish(); | 92 | hibernation_ops->finish(); |
92 | } | 93 | } |
93 | 94 | ||
94 | /** | 95 | /** |
96 | * platform_pre_restore - prepare the platform for the restoration from a | ||
97 | * hibernation image. If the restore fails after this function has been | ||
98 | * called, platform_restore_cleanup() must be called. | ||
99 | */ | ||
100 | |||
101 | static int platform_pre_restore(int platform_mode) | ||
102 | { | ||
103 | return (platform_mode && hibernation_ops) ? | ||
104 | hibernation_ops->pre_restore() : 0; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * platform_restore_cleanup - switch the platform to the normal mode of | ||
109 | * operation after a failing restore. If platform_pre_restore() has been | ||
110 | * called before the failing restore, this function must be called too, | ||
111 | * regardless of the result of platform_pre_restore(). | ||
112 | */ | ||
113 | |||
114 | static void platform_restore_cleanup(int platform_mode) | ||
115 | { | ||
116 | if (platform_mode && hibernation_ops) | ||
117 | hibernation_ops->restore_cleanup(); | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * hibernation_snapshot - quiesce devices and create the hibernation | ||
122 | * snapshot image. | ||
123 | * @platform_mode - if set, use the platform driver, if available, to | ||
124 | * prepare the platform firmware for the power transition. | ||

125 | * | ||
126 | * Must be called with pm_mutex held | ||
127 | */ | ||
128 | |||
129 | int hibernation_snapshot(int platform_mode) | ||
130 | { | ||
131 | int error; | ||
132 | |||
133 | /* Free memory before shutting down devices. */ | ||
134 | error = swsusp_shrink_memory(); | ||
135 | if (error) | ||
136 | return error; | ||
137 | |||
138 | suspend_console(); | ||
139 | error = device_suspend(PMSG_FREEZE); | ||
140 | if (error) | ||
141 | goto Resume_console; | ||
142 | |||
143 | error = platform_prepare(platform_mode); | ||
144 | if (error) | ||
145 | goto Resume_devices; | ||
146 | |||
147 | error = disable_nonboot_cpus(); | ||
148 | if (!error) { | ||
149 | if (hibernation_mode != HIBERNATION_TEST) { | ||
150 | in_suspend = 1; | ||
151 | error = swsusp_suspend(); | ||
152 | /* Control returns here after successful restore */ | ||
153 | } else { | ||
154 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
155 | mdelay(5000); | ||
156 | } | ||
157 | } | ||
158 | enable_nonboot_cpus(); | ||
159 | Resume_devices: | ||
160 | platform_finish(platform_mode); | ||
161 | device_resume(); | ||
162 | Resume_console: | ||
163 | resume_console(); | ||
164 | return error; | ||
165 | } | ||
166 | |||
167 | /** | ||
168 | * hibernation_restore - quiesce devices and restore the hibernation | ||
169 | * snapshot image. If successful, control returns in hibernation_snapshot() | ||
170 | * @platform_mode - if set, use the platform driver, if available, to | ||
171 | * prepare the platform firmware for the transition. | ||
172 | * | ||
173 | * Must be called with pm_mutex held | ||
174 | */ | ||
175 | |||
176 | int hibernation_restore(int platform_mode) | ||
177 | { | ||
178 | int error; | ||
179 | |||
180 | pm_prepare_console(); | ||
181 | suspend_console(); | ||
182 | error = device_suspend(PMSG_PRETHAW); | ||
183 | if (error) | ||
184 | goto Finish; | ||
185 | |||
186 | error = platform_pre_restore(platform_mode); | ||
187 | if (!error) { | ||
188 | error = disable_nonboot_cpus(); | ||
189 | if (!error) | ||
190 | error = swsusp_resume(); | ||
191 | enable_nonboot_cpus(); | ||
192 | } | ||
193 | platform_restore_cleanup(platform_mode); | ||
194 | device_resume(); | ||
195 | Finish: | ||
196 | resume_console(); | ||
197 | pm_restore_console(); | ||
198 | return error; | ||
199 | } | ||
200 | |||
201 | /** | ||
202 | * hibernation_platform_enter - enter the hibernation state using the | ||
203 | * platform driver (if available) | ||
204 | */ | ||
205 | |||
206 | int hibernation_platform_enter(void) | ||
207 | { | ||
208 | int error; | ||
209 | |||
210 | if (hibernation_ops) { | ||
211 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
212 | /* | ||
213 | * We have cancelled the power transition by running | ||
214 | * hibernation_ops->finish() before saving the image, so we | ||
215 | * should let the firmware know that we're going to enter the | ||
216 | * sleep state after all | ||
217 | */ | ||
218 | error = hibernation_ops->prepare(); | ||
219 | if (!error) | ||
220 | error = hibernation_ops->enter(); | ||
221 | } else { | ||
222 | error = -ENOSYS; | ||
223 | } | ||
224 | return error; | ||
225 | } | ||
226 | |||
227 | /** | ||
95 | * power_down - Shut the machine down for hibernation. | 228 | * power_down - Shut the machine down for hibernation. |
96 | * | 229 | * |
97 | * Use the platform driver, if configured so; otherwise try | 230 | * Use the platform driver, if configured so; otherwise try |
@@ -111,11 +244,7 @@ static void power_down(void) | |||
111 | kernel_restart(NULL); | 244 | kernel_restart(NULL); |
112 | break; | 245 | break; |
113 | case HIBERNATION_PLATFORM: | 246 | case HIBERNATION_PLATFORM: |
114 | if (hibernation_ops) { | 247 | hibernation_platform_enter(); |
115 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
116 | hibernation_ops->enter(); | ||
117 | break; | ||
118 | } | ||
119 | } | 248 | } |
120 | kernel_halt(); | 249 | kernel_halt(); |
121 | /* | 250 | /* |
@@ -152,9 +281,16 @@ int hibernate(void) | |||
152 | { | 281 | { |
153 | int error; | 282 | int error; |
154 | 283 | ||
284 | mutex_lock(&pm_mutex); | ||
155 | /* The snapshot device should not be opened while we're running */ | 285 | /* The snapshot device should not be opened while we're running */ |
156 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) | 286 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
157 | return -EBUSY; | 287 | error = -EBUSY; |
288 | goto Unlock; | ||
289 | } | ||
290 | |||
291 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | ||
292 | if (error) | ||
293 | goto Exit; | ||
158 | 294 | ||
159 | /* Allocate memory management structures */ | 295 | /* Allocate memory management structures */ |
160 | error = create_basic_memory_bitmaps(); | 296 | error = create_basic_memory_bitmaps(); |
@@ -165,75 +301,35 @@ int hibernate(void) | |||
165 | if (error) | 301 | if (error) |
166 | goto Finish; | 302 | goto Finish; |
167 | 303 | ||
168 | mutex_lock(&pm_mutex); | ||
169 | if (hibernation_mode == HIBERNATION_TESTPROC) { | 304 | if (hibernation_mode == HIBERNATION_TESTPROC) { |
170 | printk("swsusp debug: Waiting for 5 seconds.\n"); | 305 | printk("swsusp debug: Waiting for 5 seconds.\n"); |
171 | mdelay(5000); | 306 | mdelay(5000); |
172 | goto Thaw; | 307 | goto Thaw; |
173 | } | 308 | } |
309 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | ||
310 | if (in_suspend && !error) { | ||
311 | unsigned int flags = 0; | ||
174 | 312 | ||
175 | /* Free memory before shutting down devices. */ | 313 | if (hibernation_mode == HIBERNATION_PLATFORM) |
176 | error = swsusp_shrink_memory(); | 314 | flags |= SF_PLATFORM_MODE; |
177 | if (error) | ||
178 | goto Thaw; | ||
179 | |||
180 | error = platform_prepare(); | ||
181 | if (error) | ||
182 | goto Thaw; | ||
183 | |||
184 | suspend_console(); | ||
185 | error = device_suspend(PMSG_FREEZE); | ||
186 | if (error) { | ||
187 | printk(KERN_ERR "PM: Some devices failed to suspend\n"); | ||
188 | goto Resume_devices; | ||
189 | } | ||
190 | error = disable_nonboot_cpus(); | ||
191 | if (error) | ||
192 | goto Enable_cpus; | ||
193 | |||
194 | if (hibernation_mode == HIBERNATION_TEST) { | ||
195 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
196 | mdelay(5000); | ||
197 | goto Enable_cpus; | ||
198 | } | ||
199 | |||
200 | pr_debug("PM: snapshotting memory.\n"); | ||
201 | in_suspend = 1; | ||
202 | error = swsusp_suspend(); | ||
203 | if (error) | ||
204 | goto Enable_cpus; | ||
205 | |||
206 | if (in_suspend) { | ||
207 | enable_nonboot_cpus(); | ||
208 | platform_finish(); | ||
209 | device_resume(); | ||
210 | resume_console(); | ||
211 | pr_debug("PM: writing image.\n"); | 315 | pr_debug("PM: writing image.\n"); |
212 | error = swsusp_write(); | 316 | error = swsusp_write(flags); |
317 | swsusp_free(); | ||
213 | if (!error) | 318 | if (!error) |
214 | power_down(); | 319 | power_down(); |
215 | else { | ||
216 | swsusp_free(); | ||
217 | goto Thaw; | ||
218 | } | ||
219 | } else { | 320 | } else { |
220 | pr_debug("PM: Image restored successfully.\n"); | 321 | pr_debug("PM: Image restored successfully.\n"); |
322 | swsusp_free(); | ||
221 | } | 323 | } |
222 | |||
223 | swsusp_free(); | ||
224 | Enable_cpus: | ||
225 | enable_nonboot_cpus(); | ||
226 | Resume_devices: | ||
227 | platform_finish(); | ||
228 | device_resume(); | ||
229 | resume_console(); | ||
230 | Thaw: | 324 | Thaw: |
231 | mutex_unlock(&pm_mutex); | ||
232 | unprepare_processes(); | 325 | unprepare_processes(); |
233 | Finish: | 326 | Finish: |
234 | free_basic_memory_bitmaps(); | 327 | free_basic_memory_bitmaps(); |
235 | Exit: | 328 | Exit: |
329 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
236 | atomic_inc(&snapshot_device_available); | 330 | atomic_inc(&snapshot_device_available); |
331 | Unlock: | ||
332 | mutex_unlock(&pm_mutex); | ||
237 | return error; | 333 | return error; |
238 | } | 334 | } |
239 | 335 | ||
@@ -253,6 +349,7 @@ int hibernate(void) | |||
253 | static int software_resume(void) | 349 | static int software_resume(void) |
254 | { | 350 | { |
255 | int error; | 351 | int error; |
352 | unsigned int flags; | ||
256 | 353 | ||
257 | mutex_lock(&pm_mutex); | 354 | mutex_lock(&pm_mutex); |
258 | if (!swsusp_resume_device) { | 355 | if (!swsusp_resume_device) { |
@@ -300,30 +397,12 @@ static int software_resume(void) | |||
300 | 397 | ||
301 | pr_debug("PM: Reading swsusp image.\n"); | 398 | pr_debug("PM: Reading swsusp image.\n"); |
302 | 399 | ||
303 | error = swsusp_read(); | 400 | error = swsusp_read(&flags); |
304 | if (error) { | ||
305 | swsusp_free(); | ||
306 | goto Thaw; | ||
307 | } | ||
308 | |||
309 | pr_debug("PM: Preparing devices for restore.\n"); | ||
310 | |||
311 | suspend_console(); | ||
312 | error = device_suspend(PMSG_PRETHAW); | ||
313 | if (error) | ||
314 | goto Free; | ||
315 | |||
316 | error = disable_nonboot_cpus(); | ||
317 | if (!error) | 401 | if (!error) |
318 | swsusp_resume(); | 402 | hibernation_restore(flags & SF_PLATFORM_MODE); |
319 | 403 | ||
320 | enable_nonboot_cpus(); | ||
321 | Free: | ||
322 | swsusp_free(); | ||
323 | device_resume(); | ||
324 | resume_console(); | ||
325 | Thaw: | ||
326 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 404 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); |
405 | swsusp_free(); | ||
327 | unprepare_processes(); | 406 | unprepare_processes(); |
328 | Done: | 407 | Done: |
329 | free_basic_memory_bitmaps(); | 408 | free_basic_memory_bitmaps(); |
@@ -333,7 +412,7 @@ static int software_resume(void) | |||
333 | Unlock: | 412 | Unlock: |
334 | mutex_unlock(&pm_mutex); | 413 | mutex_unlock(&pm_mutex); |
335 | pr_debug("PM: Resume from disk failed.\n"); | 414 | pr_debug("PM: Resume from disk failed.\n"); |
336 | return 0; | 415 | return error; |
337 | } | 416 | } |
338 | 417 | ||
339 | late_initcall(software_resume); | 418 | late_initcall(software_resume); |
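With this refactoring the hibernation core expects a complete set of platform callbacks: hibernation_set_ops() now refuses ops that lack pre_restore or restore_cleanup, and the new hibernation_snapshot()/hibernation_restore()/hibernation_platform_enter() helpers drive them. A sketch of what a platform driver registration looks like after the change is below, assuming the hibernation_ops declaration from <linux/suspend.h>; all my_* callbacks are empty placeholders.

#include <linux/init.h>
#include <linux/suspend.h>

static int my_prepare(void)		{ return 0; }
static int my_enter(void)		{ return 0; }
static void my_finish(void)		{ }
static int my_pre_restore(void)		{ return 0; }
static void my_restore_cleanup(void)	{ }

/* All five callbacks are required; otherwise hibernation_set_ops()
 * WARNs and ignores the registration. */
static struct hibernation_ops my_hibernation_ops = {
	.prepare		= my_prepare,
	.enter			= my_enter,
	.finish			= my_finish,
	.pre_restore		= my_pre_restore,
	.restore_cleanup	= my_restore_cleanup,
};

static int __init my_platform_pm_init(void)
{
	hibernation_set_ops(&my_hibernation_ops);
	return 0;
}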
diff --git a/kernel/power/main.c b/kernel/power/main.c index fc45ed2262..32147b57c3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -23,6 +23,8 @@ | |||
23 | 23 | ||
24 | #include "power.h" | 24 | #include "power.h" |
25 | 25 | ||
26 | BLOCKING_NOTIFIER_HEAD(pm_chain_head); | ||
27 | |||
26 | /*This is just an arbitrary number */ | 28 | /*This is just an arbitrary number */ |
27 | #define FREE_PAGE_NUMBER (100) | 29 | #define FREE_PAGE_NUMBER (100) |
28 | 30 | ||
@@ -63,14 +65,11 @@ static inline void pm_finish(suspend_state_t state) | |||
63 | 65 | ||
64 | /** | 66 | /** |
65 | * suspend_prepare - Do prep work before entering low-power state. | 67 | * suspend_prepare - Do prep work before entering low-power state. |
66 | * @state: State we're entering. | ||
67 | * | 68 | * |
68 | * This is common code that is called for each state that we're | 69 | * This is common code that is called for each state that we're entering. |
69 | * entering. Allocate a console, stop all processes, then make sure | 70 | * Run suspend notifiers, allocate a console and stop all processes. |
70 | * the platform can enter the requested state. | ||
71 | */ | 71 | */ |
72 | 72 | static int suspend_prepare(void) | |
73 | static int suspend_prepare(suspend_state_t state) | ||
74 | { | 73 | { |
75 | int error; | 74 | int error; |
76 | unsigned int free_pages; | 75 | unsigned int free_pages; |
@@ -78,6 +77,10 @@ static int suspend_prepare(suspend_state_t state) | |||
78 | if (!pm_ops || !pm_ops->enter) | 77 | if (!pm_ops || !pm_ops->enter) |
79 | return -EPERM; | 78 | return -EPERM; |
80 | 79 | ||
80 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); | ||
81 | if (error) | ||
82 | goto Finish; | ||
83 | |||
81 | pm_prepare_console(); | 84 | pm_prepare_console(); |
82 | 85 | ||
83 | if (freeze_processes()) { | 86 | if (freeze_processes()) { |
@@ -85,46 +88,23 @@ static int suspend_prepare(suspend_state_t state) | |||
85 | goto Thaw; | 88 | goto Thaw; |
86 | } | 89 | } |
87 | 90 | ||
88 | if ((free_pages = global_page_state(NR_FREE_PAGES)) | 91 | free_pages = global_page_state(NR_FREE_PAGES); |
89 | < FREE_PAGE_NUMBER) { | 92 | if (free_pages < FREE_PAGE_NUMBER) { |
90 | pr_debug("PM: free some memory\n"); | 93 | pr_debug("PM: free some memory\n"); |
91 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); | 94 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); |
92 | if (nr_free_pages() < FREE_PAGE_NUMBER) { | 95 | if (nr_free_pages() < FREE_PAGE_NUMBER) { |
93 | error = -ENOMEM; | 96 | error = -ENOMEM; |
94 | printk(KERN_ERR "PM: No enough memory\n"); | 97 | printk(KERN_ERR "PM: No enough memory\n"); |
95 | goto Thaw; | ||
96 | } | 98 | } |
97 | } | 99 | } |
98 | |||
99 | if (pm_ops->set_target) { | ||
100 | error = pm_ops->set_target(state); | ||
101 | if (error) | ||
102 | goto Thaw; | ||
103 | } | ||
104 | suspend_console(); | ||
105 | error = device_suspend(PMSG_SUSPEND); | ||
106 | if (error) { | ||
107 | printk(KERN_ERR "Some devices failed to suspend\n"); | ||
108 | goto Resume_console; | ||
109 | } | ||
110 | if (pm_ops->prepare) { | ||
111 | if ((error = pm_ops->prepare(state))) | ||
112 | goto Resume_devices; | ||
113 | } | ||
114 | |||
115 | error = disable_nonboot_cpus(); | ||
116 | if (!error) | 100 | if (!error) |
117 | return 0; | 101 | return 0; |
118 | 102 | ||
119 | enable_nonboot_cpus(); | ||
120 | pm_finish(state); | ||
121 | Resume_devices: | ||
122 | device_resume(); | ||
123 | Resume_console: | ||
124 | resume_console(); | ||
125 | Thaw: | 103 | Thaw: |
126 | thaw_processes(); | 104 | thaw_processes(); |
127 | pm_restore_console(); | 105 | pm_restore_console(); |
106 | Finish: | ||
107 | pm_notifier_call_chain(PM_POST_SUSPEND); | ||
128 | return error; | 108 | return error; |
129 | } | 109 | } |
130 | 110 | ||
@@ -140,6 +120,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) | |||
140 | local_irq_enable(); | 120 | local_irq_enable(); |
141 | } | 121 | } |
142 | 122 | ||
123 | /** | ||
124 | * suspend_enter - enter the desired system sleep state. | ||
125 | * @state: state to enter | ||
126 | * | ||
127 | * This function should be called after devices have been suspended. | ||
128 | */ | ||
143 | int suspend_enter(suspend_state_t state) | 129 | int suspend_enter(suspend_state_t state) |
144 | { | 130 | { |
145 | int error = 0; | 131 | int error = 0; |
@@ -159,23 +145,58 @@ int suspend_enter(suspend_state_t state) | |||
159 | return error; | 145 | return error; |
160 | } | 146 | } |
161 | 147 | ||
148 | /** | ||
149 | * suspend_devices_and_enter - suspend devices and enter the desired system sleep | ||
150 | * state. | ||
151 | * @state: state to enter | ||
152 | */ | ||
153 | int suspend_devices_and_enter(suspend_state_t state) | ||
154 | { | ||
155 | int error; | ||
156 | |||
157 | if (!pm_ops) | ||
158 | return -ENOSYS; | ||
159 | |||
160 | if (pm_ops->set_target) { | ||
161 | error = pm_ops->set_target(state); | ||
162 | if (error) | ||
163 | return error; | ||
164 | } | ||
165 | suspend_console(); | ||
166 | error = device_suspend(PMSG_SUSPEND); | ||
167 | if (error) { | ||
168 | printk(KERN_ERR "Some devices failed to suspend\n"); | ||
169 | goto Resume_console; | ||
170 | } | ||
171 | if (pm_ops->prepare) { | ||
172 | error = pm_ops->prepare(state); | ||
173 | if (error) | ||
174 | goto Resume_devices; | ||
175 | } | ||
176 | error = disable_nonboot_cpus(); | ||
177 | if (!error) | ||
178 | suspend_enter(state); | ||
179 | |||
180 | enable_nonboot_cpus(); | ||
181 | pm_finish(state); | ||
182 | Resume_devices: | ||
183 | device_resume(); | ||
184 | Resume_console: | ||
185 | resume_console(); | ||
186 | return error; | ||
187 | } | ||
162 | 188 | ||
163 | /** | 189 | /** |
164 | * suspend_finish - Do final work before exiting suspend sequence. | 190 | * suspend_finish - Do final work before exiting suspend sequence. |
165 | * @state: State we're coming out of. | ||
166 | * | 191 | * |
167 | * Call platform code to clean up, restart processes, and free the | 192 | * Call platform code to clean up, restart processes, and free the |
168 | * console that we've allocated. This is not called for suspend-to-disk. | 193 | * console that we've allocated. This is not called for suspend-to-disk. |
169 | */ | 194 | */ |
170 | 195 | static void suspend_finish(void) | |
171 | static void suspend_finish(suspend_state_t state) | ||
172 | { | 196 | { |
173 | enable_nonboot_cpus(); | ||
174 | pm_finish(state); | ||
175 | device_resume(); | ||
176 | resume_console(); | ||
177 | thaw_processes(); | 197 | thaw_processes(); |
178 | pm_restore_console(); | 198 | pm_restore_console(); |
199 | pm_notifier_call_chain(PM_POST_SUSPEND); | ||
179 | } | 200 | } |
180 | 201 | ||
181 | 202 | ||
@@ -207,7 +228,6 @@ static inline int valid_state(suspend_state_t state) | |||
207 | * Then, do the setup for suspend, enter the state, and cleaup (after | 228 | * Then, do the setup for suspend, enter the state, and cleaup (after |
208 | * we've woken up). | 229 | * we've woken up). |
209 | */ | 230 | */ |
210 | |||
211 | static int enter_state(suspend_state_t state) | 231 | static int enter_state(suspend_state_t state) |
212 | { | 232 | { |
213 | int error; | 233 | int error; |
@@ -218,14 +238,14 @@ static int enter_state(suspend_state_t state) | |||
218 | return -EBUSY; | 238 | return -EBUSY; |
219 | 239 | ||
220 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 240 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
221 | if ((error = suspend_prepare(state))) | 241 | if ((error = suspend_prepare())) |
222 | goto Unlock; | 242 | goto Unlock; |
223 | 243 | ||
224 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 244 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
225 | error = suspend_enter(state); | 245 | error = suspend_devices_and_enter(state); |
226 | 246 | ||
227 | pr_debug("PM: Finishing wakeup.\n"); | 247 | pr_debug("PM: Finishing wakeup.\n"); |
228 | suspend_finish(state); | 248 | suspend_finish(); |
229 | Unlock: | 249 | Unlock: |
230 | mutex_unlock(&pm_mutex); | 250 | mutex_unlock(&pm_mutex); |
231 | return error; | 251 | return error; |
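main.c now splits device suspend out of suspend_prepare() into suspend_devices_and_enter() and brackets the whole sequence with PM_SUSPEND_PREPARE / PM_POST_SUSPEND events on the new pm_chain_head notifier chain, where a NOTIFY_BAD answer aborts the transition with -EINVAL. A sketch of the callback shape a subscriber would provide follows; the chain itself is internal to kernel/power, so registration goes through whatever helper the PM core exports, and the my_pm_* names are invented.

#include <linux/notifier.h>
#include <linux/suspend.h>

/* Sketch of a suspend notifier callback: returning NOTIFY_BAD from
 * PM_SUSPEND_PREPARE makes pm_notifier_call_chain() fail with -EINVAL
 * and aborts the suspend before processes are frozen. */
static int my_pm_event(struct notifier_block *nb, unsigned long event,
		       void *unused)
{
	switch (event) {
	case PM_SUSPEND_PREPARE:
		/* get ready for the transition, or veto it */
		return NOTIFY_OK;
	case PM_POST_SUSPEND:
		/* undo whatever PM_SUSPEND_PREPARE set up */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block my_pm_nb = {
	.notifier_call = my_pm_event,
};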
diff --git a/kernel/power/power.h b/kernel/power/power.h index 5138148710..5f24c786f8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -25,7 +25,10 @@ struct swsusp_info { | |||
25 | */ | 25 | */ |
26 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) | 26 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) |
27 | 27 | ||
28 | extern struct hibernation_ops *hibernation_ops; | 28 | /* kernel/power/disk.c */ |
29 | extern int hibernation_snapshot(int platform_mode); | ||
30 | extern int hibernation_restore(int platform_mode); | ||
31 | extern int hibernation_platform_enter(void); | ||
29 | #endif | 32 | #endif |
30 | 33 | ||
31 | extern int pfn_is_nosave(unsigned long); | 34 | extern int pfn_is_nosave(unsigned long); |
@@ -152,16 +155,34 @@ extern sector_t alloc_swapdev_block(int swap); | |||
152 | extern void free_all_swap_pages(int swap); | 155 | extern void free_all_swap_pages(int swap); |
153 | extern int swsusp_swap_in_use(void); | 156 | extern int swsusp_swap_in_use(void); |
154 | 157 | ||
158 | /* | ||
159 | * Flags that can be passed from the hibernating kernel to the "boot" kernel in | ||
160 | * the image header. | ||
161 | */ | ||
162 | #define SF_PLATFORM_MODE 1 | ||
163 | |||
164 | /* kernel/power/disk.c */ | ||
155 | extern int swsusp_check(void); | 165 | extern int swsusp_check(void); |
156 | extern int swsusp_shrink_memory(void); | 166 | extern int swsusp_shrink_memory(void); |
157 | extern void swsusp_free(void); | 167 | extern void swsusp_free(void); |
158 | extern int swsusp_suspend(void); | 168 | extern int swsusp_suspend(void); |
159 | extern int swsusp_resume(void); | 169 | extern int swsusp_resume(void); |
160 | extern int swsusp_read(void); | 170 | extern int swsusp_read(unsigned int *flags_p); |
161 | extern int swsusp_write(void); | 171 | extern int swsusp_write(unsigned int flags); |
162 | extern void swsusp_close(void); | 172 | extern void swsusp_close(void); |
163 | extern int suspend_enter(suspend_state_t state); | ||
164 | 173 | ||
165 | struct timeval; | 174 | struct timeval; |
175 | /* kernel/power/swsusp.c */ | ||
166 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | 176 | extern void swsusp_show_speed(struct timeval *, struct timeval *, |
167 | unsigned int, char *); | 177 | unsigned int, char *); |
178 | |||
179 | /* kernel/power/main.c */ | ||
180 | extern int suspend_enter(suspend_state_t state); | ||
181 | extern int suspend_devices_and_enter(suspend_state_t state); | ||
182 | extern struct blocking_notifier_head pm_chain_head; | ||
183 | |||
184 | static inline int pm_notifier_call_chain(unsigned long val) | ||
185 | { | ||
186 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | ||
187 | == NOTIFY_BAD) ? -EINVAL : 0; | ||
188 | } | ||
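The pm_chain_head / pm_notifier_call_chain() pair introduced above gives in-kernel code a hook around suspend and hibernation transitions. Below is a minimal sketch of a subscriber; it assumes registration through the stock blocking-notifier API (blocking_notifier_chain_register) and that the caller can see the pm_chain_head declaration from kernel/power/power.h. The demo_* names are illustrative and not part of the patch, and the header placement of the PM_* event constants is not shown in this excerpt.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/suspend.h>

/* Declared in kernel/power/power.h by the hunk above; repeated here so the
 * sketch is self-contained. */
extern struct blocking_notifier_head pm_chain_head;

static int demo_pm_notify(struct notifier_block *nb, unsigned long event,
			  void *unused)
{
	switch (event) {
	case PM_HIBERNATION_PREPARE:
		/* quiesce driver state before tasks are frozen */
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		/* undo whatever the PREPARE step did */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_pm_nb = {
	.notifier_call = demo_pm_notify,
};

static int __init demo_pm_init(void)
{
	return blocking_notifier_chain_register(&pm_chain_head, &demo_pm_nb);
}

static void __exit demo_pm_exit(void)
{
	blocking_notifier_chain_unregister(&pm_chain_head, &demo_pm_nb);
}

module_init(demo_pm_init);
module_exit(demo_pm_exit);
MODULE_LICENSE("GPL");

Note that pm_notifier_call_chain() maps a NOTIFY_BAD reply to -EINVAL, so a subscriber can veto the transition by returning NOTIFY_BAD from the PREPARE event.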
diff --git a/kernel/power/process.c b/kernel/power/process.c index e0233d8422..3434940a3d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -40,7 +40,7 @@ static inline void frozen_process(void) | |||
40 | current->flags |= PF_FROZEN; | 40 | current->flags |= PF_FROZEN; |
41 | wmb(); | 41 | wmb(); |
42 | } | 42 | } |
43 | clear_tsk_thread_flag(current, TIF_FREEZE); | 43 | clear_freeze_flag(current); |
44 | } | 44 | } |
45 | 45 | ||
46 | /* Refrigerator is a place where frozen processes are stored :-). */ | 46 | /* Refrigerator is a place where frozen processes are stored :-). */ |
@@ -72,20 +72,19 @@ void refrigerator(void) | |||
72 | schedule(); | 72 | schedule(); |
73 | } | 73 | } |
74 | pr_debug("%s left refrigerator\n", current->comm); | 74 | pr_debug("%s left refrigerator\n", current->comm); |
75 | current->state = save; | 75 | __set_current_state(save); |
76 | } | 76 | } |
77 | 77 | ||
78 | static inline void freeze_process(struct task_struct *p) | 78 | static void freeze_task(struct task_struct *p) |
79 | { | 79 | { |
80 | unsigned long flags; | 80 | unsigned long flags; |
81 | 81 | ||
82 | if (!freezing(p)) { | 82 | if (!freezing(p)) { |
83 | rmb(); | 83 | rmb(); |
84 | if (!frozen(p)) { | 84 | if (!frozen(p)) { |
85 | set_freeze_flag(p); | ||
85 | if (p->state == TASK_STOPPED) | 86 | if (p->state == TASK_STOPPED) |
86 | force_sig_specific(SIGSTOP, p); | 87 | force_sig_specific(SIGSTOP, p); |
87 | |||
88 | freeze(p); | ||
89 | spin_lock_irqsave(&p->sighand->siglock, flags); | 88 | spin_lock_irqsave(&p->sighand->siglock, flags); |
90 | signal_wake_up(p, p->state == TASK_STOPPED); | 89 | signal_wake_up(p, p->state == TASK_STOPPED); |
91 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 90 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
@@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p) | |||
99 | 98 | ||
100 | if (freezing(p)) { | 99 | if (freezing(p)) { |
101 | pr_debug(" clean up: %s\n", p->comm); | 100 | pr_debug(" clean up: %s\n", p->comm); |
102 | do_not_freeze(p); | 101 | clear_freeze_flag(p); |
103 | spin_lock_irqsave(&p->sighand->siglock, flags); | 102 | spin_lock_irqsave(&p->sighand->siglock, flags); |
104 | recalc_sigpending_and_wake(p); | 103 | recalc_sigpending_and_wake(p); |
105 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 104 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
106 | } | 105 | } |
107 | } | 106 | } |
108 | 107 | ||
109 | static inline int is_user_space(struct task_struct *p) | 108 | static int try_to_freeze_tasks(int freeze_user_space) |
110 | { | ||
111 | return p->mm && !(p->flags & PF_BORROWED_MM); | ||
112 | } | ||
113 | |||
114 | static unsigned int try_to_freeze_tasks(int freeze_user_space) | ||
115 | { | 109 | { |
116 | struct task_struct *g, *p; | 110 | struct task_struct *g, *p; |
117 | unsigned long end_time; | 111 | unsigned long end_time; |
@@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) | |||
122 | todo = 0; | 116 | todo = 0; |
123 | read_lock(&tasklist_lock); | 117 | read_lock(&tasklist_lock); |
124 | do_each_thread(g, p) { | 118 | do_each_thread(g, p) { |
125 | if (!freezeable(p)) | 119 | if (frozen(p) || !freezeable(p)) |
126 | continue; | 120 | continue; |
127 | 121 | ||
128 | if (frozen(p)) | 122 | if (freeze_user_space) { |
129 | continue; | 123 | if (p->state == TASK_TRACED && |
130 | 124 | frozen(p->parent)) { | |
131 | if (p->state == TASK_TRACED && frozen(p->parent)) { | 125 | cancel_freezing(p); |
132 | cancel_freezing(p); | 126 | continue; |
133 | continue; | 127 | } |
128 | /* | ||
129 | * Kernel threads should not have TIF_FREEZE set | ||
130 | * at this point, so we must ensure that either | ||
131 | * p->mm is not NULL *and* PF_BORROWED_MM is | ||
132 | * unset, or TIF_FRREZE is left unset. | ||
133 | * The task_lock() is necessary to prevent races | ||
134 | * with exit_mm() or use_mm()/unuse_mm() from | ||
135 | * occurring. | ||
136 | */ | ||
137 | task_lock(p); | ||
138 | if (!p->mm || (p->flags & PF_BORROWED_MM)) { | ||
139 | task_unlock(p); | ||
140 | continue; | ||
141 | } | ||
142 | freeze_task(p); | ||
143 | task_unlock(p); | ||
144 | } else { | ||
145 | freeze_task(p); | ||
134 | } | 146 | } |
135 | if (freeze_user_space && !is_user_space(p)) | ||
136 | continue; | ||
137 | |||
138 | freeze_process(p); | ||
139 | if (!freezer_should_skip(p)) | 147 | if (!freezer_should_skip(p)) |
140 | todo++; | 148 | todo++; |
141 | } while_each_thread(g, p); | 149 | } while_each_thread(g, p); |
142 | read_unlock(&tasklist_lock); | 150 | read_unlock(&tasklist_lock); |
143 | yield(); /* Yield is okay here */ | 151 | yield(); /* Yield is okay here */ |
144 | if (todo && time_after(jiffies, end_time)) | 152 | if (time_after(jiffies, end_time)) |
145 | break; | 153 | break; |
146 | } while (todo); | 154 | } while (todo); |
147 | 155 | ||
@@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) | |||
152 | * but it cleans up leftover PF_FREEZE requests. | 160 | * but it cleans up leftover PF_FREEZE requests. |
153 | */ | 161 | */ |
154 | printk("\n"); | 162 | printk("\n"); |
155 | printk(KERN_ERR "Stopping %s timed out after %d seconds " | 163 | printk(KERN_ERR "Freezing of %s timed out after %d seconds " |
156 | "(%d tasks refusing to freeze):\n", | 164 | "(%d tasks refusing to freeze):\n", |
157 | freeze_user_space ? "user space processes" : | 165 | freeze_user_space ? "user space " : "tasks ", |
158 | "kernel threads", | ||
159 | TIMEOUT / HZ, todo); | 166 | TIMEOUT / HZ, todo); |
167 | show_state(); | ||
160 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
161 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
162 | if (freeze_user_space && !is_user_space(p)) | ||
163 | continue; | ||
164 | |||
165 | task_lock(p); | 170 | task_lock(p); |
166 | if (freezeable(p) && !frozen(p) && | 171 | if (freezing(p) && !freezer_should_skip(p)) |
167 | !freezer_should_skip(p)) | ||
168 | printk(KERN_ERR " %s\n", p->comm); | 172 | printk(KERN_ERR " %s\n", p->comm); |
169 | |||
170 | cancel_freezing(p); | 173 | cancel_freezing(p); |
171 | task_unlock(p); | 174 | task_unlock(p); |
172 | } while_each_thread(g, p); | 175 | } while_each_thread(g, p); |
173 | read_unlock(&tasklist_lock); | 176 | read_unlock(&tasklist_lock); |
174 | } | 177 | } |
175 | 178 | ||
176 | return todo; | 179 | return todo ? -EBUSY : 0; |
177 | } | 180 | } |
178 | 181 | ||
179 | /** | 182 | /** |
180 | * freeze_processes - tell processes to enter the refrigerator | 183 | * freeze_processes - tell processes to enter the refrigerator |
181 | * | ||
182 | * Returns 0 on success, or the number of processes that didn't freeze, | ||
183 | * although they were told to. | ||
184 | */ | 184 | */ |
185 | int freeze_processes(void) | 185 | int freeze_processes(void) |
186 | { | 186 | { |
187 | unsigned int nr_unfrozen; | 187 | int error; |
188 | 188 | ||
189 | printk("Stopping tasks ... "); | 189 | printk("Stopping tasks ... "); |
190 | nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); | 190 | error = try_to_freeze_tasks(FREEZER_USER_SPACE); |
191 | if (nr_unfrozen) | 191 | if (error) |
192 | return nr_unfrozen; | 192 | return error; |
193 | 193 | ||
194 | sys_sync(); | 194 | sys_sync(); |
195 | nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); | 195 | error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); |
196 | if (nr_unfrozen) | 196 | if (error) |
197 | return nr_unfrozen; | 197 | return error; |
198 | 198 | ||
199 | printk("done.\n"); | 199 | printk("done.\n"); |
200 | BUG_ON(in_atomic()); | 200 | BUG_ON(in_atomic()); |
@@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space) | |||
210 | if (!freezeable(p)) | 210 | if (!freezeable(p)) |
211 | continue; | 211 | continue; |
212 | 212 | ||
213 | if (is_user_space(p) == !thaw_user_space) | 213 | if (!p->mm == thaw_user_space) |
214 | continue; | 214 | continue; |
215 | 215 | ||
216 | thaw_process(p); | 216 | thaw_process(p); |
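With freeze_task() above now freezing kernel threads only when they opt in, a kthread that wants to cooperate with try_to_freeze_tasks() has to mark itself freezable, as the kauditd and rtmutex-tester hunks in this series do. A small self-contained sketch follows, using the standard kthread and freezer helpers; the thread body and module wrapper are illustrative.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread(void *unused)
{
	set_freezable();	/* kernel threads are not freezable by default */

	while (!kthread_should_stop()) {
		try_to_freeze();		/* park in the refrigerator if a freeze is in progress */
		msleep_interruptible(1000);	/* placeholder for real work */
	}
	return 0;
}

static int __init demo_init(void)
{
	demo_task = kthread_run(demo_thread, NULL, "demo_freezable");
	return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");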
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8b1a1b8371..917aba1005 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -33,8 +33,9 @@ extern char resume_file[]; | |||
33 | #define SWSUSP_SIG "S1SUSPEND" | 33 | #define SWSUSP_SIG "S1SUSPEND" |
34 | 34 | ||
35 | struct swsusp_header { | 35 | struct swsusp_header { |
36 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; | 36 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; |
37 | sector_t image; | 37 | sector_t image; |
38 | unsigned int flags; /* Flags to pass to the "boot" kernel */ | ||
38 | char orig_sig[10]; | 39 | char orig_sig[10]; |
39 | char sig[10]; | 40 | char sig[10]; |
40 | } __attribute__((packed)); | 41 | } __attribute__((packed)); |
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain) | |||
138 | * Saving part | 139 | * Saving part |
139 | */ | 140 | */ |
140 | 141 | ||
141 | static int mark_swapfiles(sector_t start) | 142 | static int mark_swapfiles(sector_t start, unsigned int flags) |
142 | { | 143 | { |
143 | int error; | 144 | int error; |
144 | 145 | ||
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start) | |||
148 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 149 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
149 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); | 150 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); |
150 | swsusp_header->image = start; | 151 | swsusp_header->image = start; |
152 | swsusp_header->flags = flags; | ||
151 | error = bio_write_page(swsusp_resume_block, | 153 | error = bio_write_page(swsusp_resume_block, |
152 | swsusp_header, NULL); | 154 | swsusp_header, NULL); |
153 | } else { | 155 | } else { |
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages) | |||
369 | 371 | ||
370 | /** | 372 | /** |
371 | * swsusp_write - Write entire image and metadata. | 373 | * swsusp_write - Write entire image and metadata. |
374 | * @flags: flags to pass to the "boot" kernel in the image header | ||
372 | * | 375 | * |
373 | * It is important _NOT_ to umount filesystems at this point. We want | 376 | * It is important _NOT_ to umount filesystems at this point. We want |
374 | * them synced (in case something goes wrong) but we DO not want to mark | 377 | * them synced (in case something goes wrong) but we DO not want to mark |
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages) | |||
376 | * correctly, we'll mark system clean, anyway.) | 379 | * correctly, we'll mark system clean, anyway.) |
377 | */ | 380 | */ |
378 | 381 | ||
379 | int swsusp_write(void) | 382 | int swsusp_write(unsigned int flags) |
380 | { | 383 | { |
381 | struct swap_map_handle handle; | 384 | struct swap_map_handle handle; |
382 | struct snapshot_handle snapshot; | 385 | struct snapshot_handle snapshot; |
@@ -415,7 +418,7 @@ int swsusp_write(void) | |||
415 | if (!error) { | 418 | if (!error) { |
416 | flush_swap_writer(&handle); | 419 | flush_swap_writer(&handle); |
417 | printk("S"); | 420 | printk("S"); |
418 | error = mark_swapfiles(start); | 421 | error = mark_swapfiles(start, flags); |
419 | printk("|\n"); | 422 | printk("|\n"); |
420 | } | 423 | } |
421 | } | 424 | } |
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle, | |||
540 | return error; | 543 | return error; |
541 | } | 544 | } |
542 | 545 | ||
543 | int swsusp_read(void) | 546 | /** |
547 | * swsusp_read - read the hibernation image. | ||
548 | * @flags_p: flags passed by the "frozen" kernel in the image header should | ||
549 | * be written into this memory location | ||
550 | */ | ||
551 | |||
552 | int swsusp_read(unsigned int *flags_p) | ||
544 | { | 553 | { |
545 | int error; | 554 | int error; |
546 | struct swap_map_handle handle; | 555 | struct swap_map_handle handle; |
547 | struct snapshot_handle snapshot; | 556 | struct snapshot_handle snapshot; |
548 | struct swsusp_info *header; | 557 | struct swsusp_info *header; |
549 | 558 | ||
559 | *flags_p = swsusp_header->flags; | ||
550 | if (IS_ERR(resume_bdev)) { | 560 | if (IS_ERR(resume_bdev)) { |
551 | pr_debug("swsusp: block device not initialised\n"); | 561 | pr_debug("swsusp: block device not initialised\n"); |
552 | return PTR_ERR(resume_bdev); | 562 | return PTR_ERR(resume_bdev); |
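The new flags word in struct swsusp_header is how the image writer tells the "boot" kernel whether platform hibernation was used. Here is a sketch of the round trip using only the interfaces declared above (swsusp_write(), swsusp_read(), SF_PLATFORM_MODE); the demo_* wrappers are illustrative stand-ins for the real callers in kernel/power/disk.c, which are outside this excerpt.

#include "power.h"	/* SF_PLATFORM_MODE, swsusp_write(), swsusp_read() */

/* Image-writing side: record whether platform mode was in effect. */
static int demo_save_image(int platform_mode)
{
	unsigned int flags = 0;

	if (platform_mode)
		flags |= SF_PLATFORM_MODE;
	return swsusp_write(flags);
}

/* Resume ("boot" kernel) side: recover the flag from the image header. */
static int demo_load_image(int *platform_mode)
{
	unsigned int flags;
	int error = swsusp_read(&flags);

	if (!error)
		*platform_mode = !!(flags & SF_PLATFORM_MODE);
	return error;
}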
diff --git a/kernel/power/user.c b/kernel/power/user.c index d65305b515..bd0723a7df 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
128 | return res; | 128 | return res; |
129 | } | 129 | } |
130 | 130 | ||
131 | static inline int platform_prepare(void) | ||
132 | { | ||
133 | int error = 0; | ||
134 | |||
135 | if (hibernation_ops) | ||
136 | error = hibernation_ops->prepare(); | ||
137 | |||
138 | return error; | ||
139 | } | ||
140 | |||
141 | static inline void platform_finish(void) | ||
142 | { | ||
143 | if (hibernation_ops) | ||
144 | hibernation_ops->finish(); | ||
145 | } | ||
146 | |||
147 | static inline int snapshot_suspend(int platform_suspend) | ||
148 | { | ||
149 | int error; | ||
150 | |||
151 | mutex_lock(&pm_mutex); | ||
152 | /* Free memory before shutting down devices. */ | ||
153 | error = swsusp_shrink_memory(); | ||
154 | if (error) | ||
155 | goto Finish; | ||
156 | |||
157 | if (platform_suspend) { | ||
158 | error = platform_prepare(); | ||
159 | if (error) | ||
160 | goto Finish; | ||
161 | } | ||
162 | suspend_console(); | ||
163 | error = device_suspend(PMSG_FREEZE); | ||
164 | if (error) | ||
165 | goto Resume_devices; | ||
166 | |||
167 | error = disable_nonboot_cpus(); | ||
168 | if (!error) { | ||
169 | in_suspend = 1; | ||
170 | error = swsusp_suspend(); | ||
171 | } | ||
172 | enable_nonboot_cpus(); | ||
173 | Resume_devices: | ||
174 | if (platform_suspend) | ||
175 | platform_finish(); | ||
176 | |||
177 | device_resume(); | ||
178 | resume_console(); | ||
179 | Finish: | ||
180 | mutex_unlock(&pm_mutex); | ||
181 | return error; | ||
182 | } | ||
183 | |||
184 | static inline int snapshot_restore(int platform_suspend) | ||
185 | { | ||
186 | int error; | ||
187 | |||
188 | mutex_lock(&pm_mutex); | ||
189 | pm_prepare_console(); | ||
190 | if (platform_suspend) { | ||
191 | error = platform_prepare(); | ||
192 | if (error) | ||
193 | goto Finish; | ||
194 | } | ||
195 | suspend_console(); | ||
196 | error = device_suspend(PMSG_PRETHAW); | ||
197 | if (error) | ||
198 | goto Resume_devices; | ||
199 | |||
200 | error = disable_nonboot_cpus(); | ||
201 | if (!error) | ||
202 | error = swsusp_resume(); | ||
203 | |||
204 | enable_nonboot_cpus(); | ||
205 | Resume_devices: | ||
206 | if (platform_suspend) | ||
207 | platform_finish(); | ||
208 | |||
209 | device_resume(); | ||
210 | resume_console(); | ||
211 | Finish: | ||
212 | pm_restore_console(); | ||
213 | mutex_unlock(&pm_mutex); | ||
214 | return error; | ||
215 | } | ||
216 | |||
217 | static int snapshot_ioctl(struct inode *inode, struct file *filp, | 131 | static int snapshot_ioctl(struct inode *inode, struct file *filp, |
218 | unsigned int cmd, unsigned long arg) | 132 | unsigned int cmd, unsigned long arg) |
219 | { | 133 | { |
@@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
237 | if (data->frozen) | 151 | if (data->frozen) |
238 | break; | 152 | break; |
239 | mutex_lock(&pm_mutex); | 153 | mutex_lock(&pm_mutex); |
240 | if (freeze_processes()) { | 154 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); |
241 | thaw_processes(); | 155 | if (!error) { |
242 | error = -EBUSY; | 156 | error = freeze_processes(); |
157 | if (error) | ||
158 | thaw_processes(); | ||
243 | } | 159 | } |
160 | if (error) | ||
161 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
244 | mutex_unlock(&pm_mutex); | 162 | mutex_unlock(&pm_mutex); |
245 | if (!error) | 163 | if (!error) |
246 | data->frozen = 1; | 164 | data->frozen = 1; |
@@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
251 | break; | 169 | break; |
252 | mutex_lock(&pm_mutex); | 170 | mutex_lock(&pm_mutex); |
253 | thaw_processes(); | 171 | thaw_processes(); |
172 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
254 | mutex_unlock(&pm_mutex); | 173 | mutex_unlock(&pm_mutex); |
255 | data->frozen = 0; | 174 | data->frozen = 0; |
256 | break; | 175 | break; |
@@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
260 | error = -EPERM; | 179 | error = -EPERM; |
261 | break; | 180 | break; |
262 | } | 181 | } |
263 | error = snapshot_suspend(data->platform_suspend); | 182 | error = hibernation_snapshot(data->platform_suspend); |
264 | if (!error) | 183 | if (!error) |
265 | error = put_user(in_suspend, (unsigned int __user *)arg); | 184 | error = put_user(in_suspend, (unsigned int __user *)arg); |
266 | if (!error) | 185 | if (!error) |
@@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
274 | error = -EPERM; | 193 | error = -EPERM; |
275 | break; | 194 | break; |
276 | } | 195 | } |
277 | error = snapshot_restore(data->platform_suspend); | 196 | error = hibernation_restore(data->platform_suspend); |
278 | break; | 197 | break; |
279 | 198 | ||
280 | case SNAPSHOT_FREE: | 199 | case SNAPSHOT_FREE: |
@@ -336,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
336 | break; | 255 | break; |
337 | 256 | ||
338 | case SNAPSHOT_S2RAM: | 257 | case SNAPSHOT_S2RAM: |
339 | if (!pm_ops) { | ||
340 | error = -ENOSYS; | ||
341 | break; | ||
342 | } | ||
343 | |||
344 | if (!data->frozen) { | 258 | if (!data->frozen) { |
345 | error = -EPERM; | 259 | error = -EPERM; |
346 | break; | 260 | break; |
347 | } | 261 | } |
348 | |||
349 | if (!mutex_trylock(&pm_mutex)) { | 262 | if (!mutex_trylock(&pm_mutex)) { |
350 | error = -EBUSY; | 263 | error = -EBUSY; |
351 | break; | 264 | break; |
352 | } | 265 | } |
353 | 266 | /* | |
354 | if (pm_ops->prepare) { | 267 | * Tasks are frozen and the notifiers have been called with |
355 | error = pm_ops->prepare(PM_SUSPEND_MEM); | 268 | * PM_HIBERNATION_PREPARE |
356 | if (error) | 269 | */ |
357 | goto OutS3; | 270 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
358 | } | ||
359 | |||
360 | /* Put devices to sleep */ | ||
361 | suspend_console(); | ||
362 | error = device_suspend(PMSG_SUSPEND); | ||
363 | if (error) { | ||
364 | printk(KERN_ERR "Failed to suspend some devices.\n"); | ||
365 | } else { | ||
366 | error = disable_nonboot_cpus(); | ||
367 | if (!error) { | ||
368 | /* Enter S3, system is already frozen */ | ||
369 | suspend_enter(PM_SUSPEND_MEM); | ||
370 | enable_nonboot_cpus(); | ||
371 | } | ||
372 | /* Wake up devices */ | ||
373 | device_resume(); | ||
374 | } | ||
375 | resume_console(); | ||
376 | if (pm_ops->finish) | ||
377 | pm_ops->finish(PM_SUSPEND_MEM); | ||
378 | |||
379 | OutS3: | ||
380 | mutex_unlock(&pm_mutex); | 271 | mutex_unlock(&pm_mutex); |
381 | break; | 272 | break; |
382 | 273 | ||
@@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
386 | switch (arg) { | 277 | switch (arg) { |
387 | 278 | ||
388 | case PMOPS_PREPARE: | 279 | case PMOPS_PREPARE: |
389 | if (hibernation_ops) { | 280 | data->platform_suspend = 1; |
390 | data->platform_suspend = 1; | 281 | error = 0; |
391 | error = 0; | ||
392 | } else { | ||
393 | error = -ENOSYS; | ||
394 | } | ||
395 | break; | 282 | break; |
396 | 283 | ||
397 | case PMOPS_ENTER: | 284 | case PMOPS_ENTER: |
398 | if (data->platform_suspend) { | 285 | if (data->platform_suspend) |
399 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 286 | error = hibernation_platform_enter(); |
400 | error = hibernation_ops->enter(); | 287 | |
401 | } | ||
402 | break; | 288 | break; |
403 | 289 | ||
404 | case PMOPS_FINISH: | 290 | case PMOPS_FINISH: |
diff --git a/kernel/printk.c b/kernel/printk.c index 0bbdeac281..051d27e36a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -449,13 +449,16 @@ static int printk_time = 1; | |||
449 | #else | 449 | #else |
450 | static int printk_time = 0; | 450 | static int printk_time = 0; |
451 | #endif | 451 | #endif |
452 | module_param(printk_time, int, S_IRUGO | S_IWUSR); | 452 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
453 | 453 | ||
454 | static int __init printk_time_setup(char *str) | 454 | static int __init printk_time_setup(char *str) |
455 | { | 455 | { |
456 | if (*str) | 456 | if (*str) |
457 | return 0; | 457 | return 0; |
458 | printk_time = 1; | 458 | printk_time = 1; |
459 | printk(KERN_NOTICE "The 'time' option is deprecated and " | ||
460 | "is scheduled for removal in early 2008\n"); | ||
461 | printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n"); | ||
459 | return 1; | 462 | return 1; |
460 | } | 463 | } |
461 | 464 | ||
@@ -483,6 +486,9 @@ static int have_callable_console(void) | |||
483 | * @fmt: format string | 486 | * @fmt: format string |
484 | * | 487 | * |
485 | * This is printk(). It can be called from any context. We want it to work. | 488 | * This is printk(). It can be called from any context. We want it to work. |
489 | * Be aware of the fact that if oops_in_progress is not set, we might try to | ||
490 | * wake klogd up which could deadlock on runqueue lock if printk() is called | ||
491 | * from scheduler code. | ||
486 | * | 492 | * |
487 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 493 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
488 | * call the console drivers. If we fail to get the semaphore we place the output | 494 | * call the console drivers. If we fail to get the semaphore we place the output |
@@ -654,7 +660,7 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
654 | */ | 660 | */ |
655 | static int __init console_setup(char *str) | 661 | static int __init console_setup(char *str) |
656 | { | 662 | { |
657 | char name[sizeof(console_cmdline[0].name)]; | 663 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ |
658 | char *s, *options; | 664 | char *s, *options; |
659 | int idx; | 665 | int idx; |
660 | 666 | ||
@@ -662,27 +668,27 @@ static int __init console_setup(char *str) | |||
662 | * Decode str into name, index, options. | 668 | * Decode str into name, index, options. |
663 | */ | 669 | */ |
664 | if (str[0] >= '0' && str[0] <= '9') { | 670 | if (str[0] >= '0' && str[0] <= '9') { |
665 | strcpy(name, "ttyS"); | 671 | strcpy(buf, "ttyS"); |
666 | strncpy(name + 4, str, sizeof(name) - 5); | 672 | strncpy(buf + 4, str, sizeof(buf) - 5); |
667 | } else { | 673 | } else { |
668 | strncpy(name, str, sizeof(name) - 1); | 674 | strncpy(buf, str, sizeof(buf) - 1); |
669 | } | 675 | } |
670 | name[sizeof(name) - 1] = 0; | 676 | buf[sizeof(buf) - 1] = 0; |
671 | if ((options = strchr(str, ',')) != NULL) | 677 | if ((options = strchr(str, ',')) != NULL) |
672 | *(options++) = 0; | 678 | *(options++) = 0; |
673 | #ifdef __sparc__ | 679 | #ifdef __sparc__ |
674 | if (!strcmp(str, "ttya")) | 680 | if (!strcmp(str, "ttya")) |
675 | strcpy(name, "ttyS0"); | 681 | strcpy(buf, "ttyS0"); |
676 | if (!strcmp(str, "ttyb")) | 682 | if (!strcmp(str, "ttyb")) |
677 | strcpy(name, "ttyS1"); | 683 | strcpy(buf, "ttyS1"); |
678 | #endif | 684 | #endif |
679 | for (s = name; *s; s++) | 685 | for (s = buf; *s; s++) |
680 | if ((*s >= '0' && *s <= '9') || *s == ',') | 686 | if ((*s >= '0' && *s <= '9') || *s == ',') |
681 | break; | 687 | break; |
682 | idx = simple_strtoul(s, NULL, 10); | 688 | idx = simple_strtoul(s, NULL, 10); |
683 | *s = 0; | 689 | *s = 0; |
684 | 690 | ||
685 | add_preferred_console(name, idx, options); | 691 | add_preferred_console(buf, idx, options); |
686 | return 1; | 692 | return 1; |
687 | } | 693 | } |
688 | __setup("console=", console_setup); | 694 | __setup("console=", console_setup); |
@@ -709,7 +715,7 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
709 | * See if this tty is not yet registered, and | 715 | * See if this tty is not yet registered, and |
710 | * if we have a slot free. | 716 | * if we have a slot free. |
711 | */ | 717 | */ |
712 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | 718 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) |
713 | if (strcmp(console_cmdline[i].name, name) == 0 && | 719 | if (strcmp(console_cmdline[i].name, name) == 0 && |
714 | console_cmdline[i].index == idx) { | 720 | console_cmdline[i].index == idx) { |
715 | selected_console = i; | 721 | selected_console = i; |
@@ -726,6 +732,25 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
726 | return 0; | 732 | return 0; |
727 | } | 733 | } |
728 | 734 | ||
735 | int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) | ||
736 | { | ||
737 | struct console_cmdline *c; | ||
738 | int i; | ||
739 | |||
740 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | ||
741 | if (strcmp(console_cmdline[i].name, name) == 0 && | ||
742 | console_cmdline[i].index == idx) { | ||
743 | c = &console_cmdline[i]; | ||
744 | memcpy(c->name, name_new, sizeof(c->name)); | ||
745 | c->name[sizeof(c->name) - 1] = 0; | ||
746 | c->options = options; | ||
747 | c->index = idx_new; | ||
748 | return i; | ||
749 | } | ||
750 | /* not found */ | ||
751 | return -1; | ||
752 | } | ||
753 | |||
729 | #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND | 754 | #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND |
730 | /** | 755 | /** |
731 | * suspend_console - suspend the console subsystem | 756 | * suspend_console - suspend the console subsystem |
@@ -942,6 +967,9 @@ void register_console(struct console *console) | |||
942 | if (preferred_console < 0 || bootconsole || !console_drivers) | 967 | if (preferred_console < 0 || bootconsole || !console_drivers) |
943 | preferred_console = selected_console; | 968 | preferred_console = selected_console; |
944 | 969 | ||
970 | if (console->early_setup) | ||
971 | console->early_setup(); | ||
972 | |||
945 | /* | 973 | /* |
946 | * See if we want to use this console driver. If we | 974 | * See if we want to use this console driver. If we |
947 | * didn't select a console we take the first one | 975 | * didn't select a console we take the first one |
@@ -985,12 +1013,15 @@ void register_console(struct console *console) | |||
985 | if (!(console->flags & CON_ENABLED)) | 1013 | if (!(console->flags & CON_ENABLED)) |
986 | return; | 1014 | return; |
987 | 1015 | ||
988 | if (bootconsole) { | 1016 | if (bootconsole && (console->flags & CON_CONSDEV)) { |
989 | printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", | 1017 | printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", |
990 | bootconsole->name, bootconsole->index, | 1018 | bootconsole->name, bootconsole->index, |
991 | console->name, console->index); | 1019 | console->name, console->index); |
992 | unregister_console(bootconsole); | 1020 | unregister_console(bootconsole); |
993 | console->flags &= ~CON_PRINTBUFFER; | 1021 | console->flags &= ~CON_PRINTBUFFER; |
1022 | } else { | ||
1023 | printk(KERN_INFO "console [%s%d] enabled\n", | ||
1024 | console->name, console->index); | ||
994 | } | 1025 | } |
995 | 1026 | ||
996 | /* | 1027 | /* |
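Two user-visible effects of the printk changes above: the time-stamp switch is now a proper module parameter, reachable as printk.time=<value> on the command line or via /sys/module/printk/parameters/time, and platform code can rewrite a saved console= entry in place. For the latter, a hedged sketch of a boot-time fixup follows; the demo_* wrapper and the "ttya" alias are illustrative, and update_console_cmdline() is assumed to be visible to the caller with the prototype added above.

#include <linux/console.h>
#include <linux/init.h>

/* Prototype as defined in kernel/printk.c above; where it is declared for
 * other files is an assumption of this sketch. */
extern int update_console_cmdline(char *name, int idx, char *name_new,
				  int idx_new, char *options);

/* Remap a firmware console alias given on the command line ("console=ttya")
 * to the name the real serial driver registers under ("ttyS0"). */
static int __init demo_fixup_console(void)
{
	/* Returns the matched slot index or -1; either way nothing else to do. */
	update_console_cmdline("ttya", 0, "ttyS", 0, NULL);
	return 0;
}
console_initcall(demo_fixup_console);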
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ad7949a589..82a558b655 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task) | |||
142 | return -EPERM; | 142 | return -EPERM; |
143 | smp_rmb(); | 143 | smp_rmb(); |
144 | if (task->mm) | 144 | if (task->mm) |
145 | dumpable = task->mm->dumpable; | 145 | dumpable = get_dumpable(task->mm); |
146 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | 146 | if (!dumpable && !capable(CAP_SYS_PTRACE)) |
147 | return -EPERM; | 147 | return -EPERM; |
148 | 148 | ||
@@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task) | |||
161 | int ptrace_attach(struct task_struct *task) | 161 | int ptrace_attach(struct task_struct *task) |
162 | { | 162 | { |
163 | int retval; | 163 | int retval; |
164 | unsigned long flags; | ||
164 | 165 | ||
165 | audit_ptrace(task); | 166 | audit_ptrace(task); |
166 | 167 | ||
@@ -181,9 +182,7 @@ repeat: | |||
181 | * cpu's that may have task_lock). | 182 | * cpu's that may have task_lock). |
182 | */ | 183 | */ |
183 | task_lock(task); | 184 | task_lock(task); |
184 | local_irq_disable(); | 185 | if (!write_trylock_irqsave(&tasklist_lock, flags)) { |
185 | if (!write_trylock(&tasklist_lock)) { | ||
186 | local_irq_enable(); | ||
187 | task_unlock(task); | 186 | task_unlock(task); |
188 | do { | 187 | do { |
189 | cpu_relax(); | 188 | cpu_relax(); |
@@ -211,7 +210,7 @@ repeat: | |||
211 | force_sig_specific(SIGSTOP, task); | 210 | force_sig_specific(SIGSTOP, task); |
212 | 211 | ||
213 | bad: | 212 | bad: |
214 | write_unlock_irq(&tasklist_lock); | 213 | write_unlock_irqrestore(&tasklist_lock, flags); |
215 | task_unlock(task); | 214 | task_unlock(task); |
216 | out: | 215 | out: |
217 | return retval; | 216 | return retval; |
@@ -491,3 +490,22 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
491 | return ret; | 490 | return ret; |
492 | } | 491 | } |
493 | #endif /* __ARCH_SYS_PTRACE */ | 492 | #endif /* __ARCH_SYS_PTRACE */ |
493 | |||
494 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | ||
495 | { | ||
496 | unsigned long tmp; | ||
497 | int copied; | ||
498 | |||
499 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); | ||
500 | if (copied != sizeof(tmp)) | ||
501 | return -EIO; | ||
502 | return put_user(tmp, (unsigned long __user *)data); | ||
503 | } | ||
504 | |||
505 | int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | ||
506 | { | ||
507 | int copied; | ||
508 | |||
509 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | ||
510 | return (copied == sizeof(data)) ? 0 : -EIO; | ||
511 | } | ||
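generic_ptrace_peekdata() and generic_ptrace_pokedata() exist so that architecture ptrace code can stop open-coding the access_process_vm() plus put_user() dance. A sketch of how an arch_ptrace()-style switch might use them; the surrounding dispatcher and its other requests are hypothetical, and the prototypes are assumed to be exported through linux/ptrace.h.

#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/errno.h>

/* Illustrative fragment of an arch_ptrace()-style dispatcher. */
static long demo_arch_ptrace(struct task_struct *child, long request,
			     long addr, long data)
{
	switch (request) {
	case PTRACE_PEEKTEXT:	/* read a word from the tracee ... */
	case PTRACE_PEEKDATA:	/* ... and put_user() it at *data */
		return generic_ptrace_peekdata(child, addr, data);
	case PTRACE_POKETEXT:	/* write the word in data into the tracee */
	case PTRACE_POKEDATA:
		return generic_ptrace_pokedata(child, addr, data);
	default:
		return -EIO;
	}
}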
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 55ba82a85a..ddff332477 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/moduleparam.h> | 40 | #include <linux/moduleparam.h> |
41 | #include <linux/percpu.h> | 41 | #include <linux/percpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/freezer.h> | ||
43 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
44 | #include <linux/random.h> | 45 | #include <linux/random.h> |
45 | #include <linux/delay.h> | 46 | #include <linux/delay.h> |
@@ -518,7 +519,6 @@ rcu_torture_writer(void *arg) | |||
518 | 519 | ||
519 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | 520 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); |
520 | set_user_nice(current, 19); | 521 | set_user_nice(current, 19); |
521 | current->flags |= PF_NOFREEZE; | ||
522 | 522 | ||
523 | do { | 523 | do { |
524 | schedule_timeout_uninterruptible(1); | 524 | schedule_timeout_uninterruptible(1); |
@@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg) | |||
558 | 558 | ||
559 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); | 559 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); |
560 | set_user_nice(current, 19); | 560 | set_user_nice(current, 19); |
561 | current->flags |= PF_NOFREEZE; | ||
562 | 561 | ||
563 | do { | 562 | do { |
564 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 563 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
@@ -589,7 +588,6 @@ rcu_torture_reader(void *arg) | |||
589 | 588 | ||
590 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 589 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
591 | set_user_nice(current, 19); | 590 | set_user_nice(current, 19); |
592 | current->flags |= PF_NOFREEZE; | ||
593 | 591 | ||
594 | do { | 592 | do { |
595 | idx = cur_ops->readlock(); | 593 | idx = cur_ops->readlock(); |
diff --git a/kernel/relay.c b/kernel/relay.c index 95db8c79fe..510fbbd7b5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
24 | #include <linux/splice.h> | ||
24 | 25 | ||
25 | /* list of open channels, for cpu hotplug */ | 26 | /* list of open channels, for cpu hotplug */ |
26 | static DEFINE_MUTEX(relay_channels_mutex); | 27 | static DEFINE_MUTEX(relay_channels_mutex); |
@@ -79,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = { | |||
79 | * | 80 | * |
80 | * Caller should already have grabbed mmap_sem. | 81 | * Caller should already have grabbed mmap_sem. |
81 | */ | 82 | */ |
82 | int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) | 83 | static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) |
83 | { | 84 | { |
84 | unsigned long length = vma->vm_end - vma->vm_start; | 85 | unsigned long length = vma->vm_end - vma->vm_start; |
85 | struct file *filp = vma->vm_file; | 86 | struct file *filp = vma->vm_file; |
@@ -121,6 +122,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | |||
121 | buf->page_array[i] = alloc_page(GFP_KERNEL); | 122 | buf->page_array[i] = alloc_page(GFP_KERNEL); |
122 | if (unlikely(!buf->page_array[i])) | 123 | if (unlikely(!buf->page_array[i])) |
123 | goto depopulate; | 124 | goto depopulate; |
125 | set_page_private(buf->page_array[i], (unsigned long)buf); | ||
124 | } | 126 | } |
125 | mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); | 127 | mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); |
126 | if (!mem) | 128 | if (!mem) |
@@ -143,7 +145,7 @@ depopulate: | |||
143 | * | 145 | * |
144 | * Returns channel buffer if successful, %NULL otherwise. | 146 | * Returns channel buffer if successful, %NULL otherwise. |
145 | */ | 147 | */ |
146 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 148 | static struct rchan_buf *relay_create_buf(struct rchan *chan) |
147 | { | 149 | { |
148 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); | 150 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); |
149 | if (!buf) | 151 | if (!buf) |
@@ -173,7 +175,7 @@ free_buf: | |||
173 | * | 175 | * |
174 | * Should only be called from kref_put(). | 176 | * Should only be called from kref_put(). |
175 | */ | 177 | */ |
176 | void relay_destroy_channel(struct kref *kref) | 178 | static void relay_destroy_channel(struct kref *kref) |
177 | { | 179 | { |
178 | struct rchan *chan = container_of(kref, struct rchan, kref); | 180 | struct rchan *chan = container_of(kref, struct rchan, kref); |
179 | kfree(chan); | 181 | kfree(chan); |
@@ -183,7 +185,7 @@ void relay_destroy_channel(struct kref *kref) | |||
183 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer | 185 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer |
184 | * @buf: the buffer struct | 186 | * @buf: the buffer struct |
185 | */ | 187 | */ |
186 | void relay_destroy_buf(struct rchan_buf *buf) | 188 | static void relay_destroy_buf(struct rchan_buf *buf) |
187 | { | 189 | { |
188 | struct rchan *chan = buf->chan; | 190 | struct rchan *chan = buf->chan; |
189 | unsigned int i; | 191 | unsigned int i; |
@@ -208,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf) | |||
208 | * rchan_buf_struct and the channel buffer. Should only be called from | 210 | * rchan_buf_struct and the channel buffer. Should only be called from |
209 | * kref_put(). | 211 | * kref_put(). |
210 | */ | 212 | */ |
211 | void relay_remove_buf(struct kref *kref) | 213 | static void relay_remove_buf(struct kref *kref) |
212 | { | 214 | { |
213 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); | 215 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); |
214 | buf->chan->cb->remove_buf_file(buf->dentry); | 216 | buf->chan->cb->remove_buf_file(buf->dentry); |
@@ -221,11 +223,10 @@ void relay_remove_buf(struct kref *kref) | |||
221 | * | 223 | * |
222 | * Returns 1 if the buffer is empty, 0 otherwise. | 224 | * Returns 1 if the buffer is empty, 0 otherwise. |
223 | */ | 225 | */ |
224 | int relay_buf_empty(struct rchan_buf *buf) | 226 | static int relay_buf_empty(struct rchan_buf *buf) |
225 | { | 227 | { |
226 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; | 228 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; |
227 | } | 229 | } |
228 | EXPORT_SYMBOL_GPL(relay_buf_empty); | ||
229 | 230 | ||
230 | /** | 231 | /** |
231 | * relay_buf_full - boolean, is the channel buffer full? | 232 | * relay_buf_full - boolean, is the channel buffer full? |
@@ -970,43 +971,6 @@ static int subbuf_read_actor(size_t read_start, | |||
970 | return ret; | 971 | return ret; |
971 | } | 972 | } |
972 | 973 | ||
973 | /* | ||
974 | * subbuf_send_actor - send up to one subbuf's worth of data | ||
975 | */ | ||
976 | static int subbuf_send_actor(size_t read_start, | ||
977 | struct rchan_buf *buf, | ||
978 | size_t avail, | ||
979 | read_descriptor_t *desc, | ||
980 | read_actor_t actor) | ||
981 | { | ||
982 | unsigned long pidx, poff; | ||
983 | unsigned int subbuf_pages; | ||
984 | int ret = 0; | ||
985 | |||
986 | subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT; | ||
987 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | ||
988 | poff = read_start & ~PAGE_MASK; | ||
989 | while (avail) { | ||
990 | struct page *p = buf->page_array[pidx]; | ||
991 | unsigned int len; | ||
992 | |||
993 | len = PAGE_SIZE - poff; | ||
994 | if (len > avail) | ||
995 | len = avail; | ||
996 | |||
997 | len = actor(desc, p, poff, len); | ||
998 | if (desc->error) | ||
999 | break; | ||
1000 | |||
1001 | avail -= len; | ||
1002 | ret += len; | ||
1003 | poff = 0; | ||
1004 | pidx = (pidx + 1) % subbuf_pages; | ||
1005 | } | ||
1006 | |||
1007 | return ret; | ||
1008 | } | ||
1009 | |||
1010 | typedef int (*subbuf_actor_t) (size_t read_start, | 974 | typedef int (*subbuf_actor_t) (size_t read_start, |
1011 | struct rchan_buf *buf, | 975 | struct rchan_buf *buf, |
1012 | size_t avail, | 976 | size_t avail, |
@@ -1067,19 +1031,161 @@ static ssize_t relay_file_read(struct file *filp, | |||
1067 | NULL, &desc); | 1031 | NULL, &desc); |
1068 | } | 1032 | } |
1069 | 1033 | ||
1070 | static ssize_t relay_file_sendfile(struct file *filp, | 1034 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
1071 | loff_t *ppos, | ||
1072 | size_t count, | ||
1073 | read_actor_t actor, | ||
1074 | void *target) | ||
1075 | { | 1035 | { |
1076 | read_descriptor_t desc; | 1036 | rbuf->bytes_consumed += bytes_consumed; |
1077 | desc.written = 0; | 1037 | |
1078 | desc.count = count; | 1038 | if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { |
1079 | desc.arg.data = target; | 1039 | relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); |
1080 | desc.error = 0; | 1040 | rbuf->bytes_consumed %= rbuf->chan->subbuf_size; |
1081 | return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, | 1041 | } |
1082 | actor, &desc); | 1042 | } |
1043 | |||
1044 | static void relay_pipe_buf_release(struct pipe_inode_info *pipe, | ||
1045 | struct pipe_buffer *buf) | ||
1046 | { | ||
1047 | struct rchan_buf *rbuf; | ||
1048 | |||
1049 | rbuf = (struct rchan_buf *)page_private(buf->page); | ||
1050 | relay_consume_bytes(rbuf, buf->private); | ||
1051 | } | ||
1052 | |||
1053 | static struct pipe_buf_operations relay_pipe_buf_ops = { | ||
1054 | .can_merge = 0, | ||
1055 | .map = generic_pipe_buf_map, | ||
1056 | .unmap = generic_pipe_buf_unmap, | ||
1057 | .confirm = generic_pipe_buf_confirm, | ||
1058 | .release = relay_pipe_buf_release, | ||
1059 | .steal = generic_pipe_buf_steal, | ||
1060 | .get = generic_pipe_buf_get, | ||
1061 | }; | ||
1062 | |||
1063 | /* | ||
1064 | * subbuf_splice_actor - splice up to one subbuf's worth of data | ||
1065 | */ | ||
1066 | static int subbuf_splice_actor(struct file *in, | ||
1067 | loff_t *ppos, | ||
1068 | struct pipe_inode_info *pipe, | ||
1069 | size_t len, | ||
1070 | unsigned int flags, | ||
1071 | int *nonpad_ret) | ||
1072 | { | ||
1073 | unsigned int pidx, poff, total_len, subbuf_pages, ret; | ||
1074 | struct rchan_buf *rbuf = in->private_data; | ||
1075 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | ||
1076 | uint64_t pos = (uint64_t) *ppos; | ||
1077 | uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; | ||
1078 | size_t read_start = (size_t) do_div(pos, alloc_size); | ||
1079 | size_t read_subbuf = read_start / subbuf_size; | ||
1080 | size_t padding = rbuf->padding[read_subbuf]; | ||
1081 | size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; | ||
1082 | struct page *pages[PIPE_BUFFERS]; | ||
1083 | struct partial_page partial[PIPE_BUFFERS]; | ||
1084 | struct splice_pipe_desc spd = { | ||
1085 | .pages = pages, | ||
1086 | .nr_pages = 0, | ||
1087 | .partial = partial, | ||
1088 | .flags = flags, | ||
1089 | .ops = &relay_pipe_buf_ops, | ||
1090 | }; | ||
1091 | |||
1092 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | ||
1093 | return 0; | ||
1094 | |||
1095 | /* | ||
1096 | * Adjust read len, if longer than what is available | ||
1097 | */ | ||
1098 | if (len > (subbuf_size - read_start % subbuf_size)) | ||
1099 | len = subbuf_size - read_start % subbuf_size; | ||
1100 | |||
1101 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; | ||
1102 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | ||
1103 | poff = read_start & ~PAGE_MASK; | ||
1104 | |||
1105 | for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { | ||
1106 | unsigned int this_len, this_end, private; | ||
1107 | unsigned int cur_pos = read_start + total_len; | ||
1108 | |||
1109 | if (!len) | ||
1110 | break; | ||
1111 | |||
1112 | this_len = min_t(unsigned long, len, PAGE_SIZE - poff); | ||
1113 | private = this_len; | ||
1114 | |||
1115 | spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; | ||
1116 | spd.partial[spd.nr_pages].offset = poff; | ||
1117 | |||
1118 | this_end = cur_pos + this_len; | ||
1119 | if (this_end >= nonpad_end) { | ||
1120 | this_len = nonpad_end - cur_pos; | ||
1121 | private = this_len + padding; | ||
1122 | } | ||
1123 | spd.partial[spd.nr_pages].len = this_len; | ||
1124 | spd.partial[spd.nr_pages].private = private; | ||
1125 | |||
1126 | len -= this_len; | ||
1127 | total_len += this_len; | ||
1128 | poff = 0; | ||
1129 | pidx = (pidx + 1) % subbuf_pages; | ||
1130 | |||
1131 | if (this_end >= nonpad_end) { | ||
1132 | spd.nr_pages++; | ||
1133 | break; | ||
1134 | } | ||
1135 | } | ||
1136 | |||
1137 | if (!spd.nr_pages) | ||
1138 | return 0; | ||
1139 | |||
1140 | ret = *nonpad_ret = splice_to_pipe(pipe, &spd); | ||
1141 | if (ret < 0 || ret < total_len) | ||
1142 | return ret; | ||
1143 | |||
1144 | if (read_start + ret == nonpad_end) | ||
1145 | ret += padding; | ||
1146 | |||
1147 | return ret; | ||
1148 | } | ||
1149 | |||
1150 | static ssize_t relay_file_splice_read(struct file *in, | ||
1151 | loff_t *ppos, | ||
1152 | struct pipe_inode_info *pipe, | ||
1153 | size_t len, | ||
1154 | unsigned int flags) | ||
1155 | { | ||
1156 | ssize_t spliced; | ||
1157 | int ret; | ||
1158 | int nonpad_ret = 0; | ||
1159 | |||
1160 | ret = 0; | ||
1161 | spliced = 0; | ||
1162 | |||
1163 | while (len) { | ||
1164 | ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); | ||
1165 | if (ret < 0) | ||
1166 | break; | ||
1167 | else if (!ret) { | ||
1168 | if (spliced) | ||
1169 | break; | ||
1170 | if (flags & SPLICE_F_NONBLOCK) { | ||
1171 | ret = -EAGAIN; | ||
1172 | break; | ||
1173 | } | ||
1174 | } | ||
1175 | |||
1176 | *ppos += ret; | ||
1177 | if (ret > len) | ||
1178 | len = 0; | ||
1179 | else | ||
1180 | len -= ret; | ||
1181 | spliced += nonpad_ret; | ||
1182 | nonpad_ret = 0; | ||
1183 | } | ||
1184 | |||
1185 | if (spliced) | ||
1186 | return spliced; | ||
1187 | |||
1188 | return ret; | ||
1083 | } | 1189 | } |
1084 | 1190 | ||
1085 | const struct file_operations relay_file_operations = { | 1191 | const struct file_operations relay_file_operations = { |
@@ -1089,7 +1195,7 @@ const struct file_operations relay_file_operations = { | |||
1089 | .read = relay_file_read, | 1195 | .read = relay_file_read, |
1090 | .llseek = no_llseek, | 1196 | .llseek = no_llseek, |
1091 | .release = relay_file_release, | 1197 | .release = relay_file_release, |
1092 | .sendfile = relay_file_sendfile, | 1198 | .splice_read = relay_file_splice_read, |
1093 | }; | 1199 | }; |
1094 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1200 | EXPORT_SYMBOL_GPL(relay_file_operations); |
1095 | 1201 | ||
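With .sendfile replaced by .splice_read, a consumer drains a relay buffer by splicing it into a pipe instead of calling sendfile(). A userspace sketch follows; the debugfs path is made up, and the rest uses only the standard splice(2) system call.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/demo/cpu0", O_RDONLY);	/* per-cpu relay file, path illustrative */
	int pfd[2];
	ssize_t n;

	if (fd < 0 || pipe(pfd) < 0) {
		perror("setup");
		return 1;
	}

	/* relay buffer -> pipe -> stdout, without copying through user memory */
	while ((n = splice(fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE)) > 0) {
		ssize_t left = n;

		while (left > 0) {
			ssize_t out = splice(pfd[0], NULL, STDOUT_FILENO, NULL,
					     left, SPLICE_F_MOVE);
			if (out <= 0)
				return 1;
			left -= out;
		}
	}
	return n < 0;
}

The relay_pipe_buf_release() hook added above is what advances the consumed count as the spliced pages are finally released downstream.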
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index da8d6bf464..5aedbee014 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -29,12 +29,6 @@ | |||
29 | 29 | ||
30 | #include "rtmutex_common.h" | 30 | #include "rtmutex_common.h" |
31 | 31 | ||
32 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
33 | # include "rtmutex-debug.h" | ||
34 | #else | ||
35 | # include "rtmutex.h" | ||
36 | #endif | ||
37 | |||
38 | # define TRACE_WARN_ON(x) WARN_ON(x) | 32 | # define TRACE_WARN_ON(x) WARN_ON(x) |
39 | # define TRACE_BUG_ON(x) BUG_ON(x) | 33 | # define TRACE_BUG_ON(x) BUG_ON(x) |
40 | 34 | ||
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 015fc633c9..e3055ba691 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -260,6 +260,7 @@ static int test_func(void *data) | |||
260 | int ret; | 260 | int ret; |
261 | 261 | ||
262 | current->flags |= PF_MUTEX_TESTER; | 262 | current->flags |= PF_MUTEX_TESTER; |
263 | set_freezable(); | ||
263 | allow_signal(SIGHUP); | 264 | allow_signal(SIGHUP); |
264 | 265 | ||
265 | for(;;) { | 266 | for(;;) { |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 17d28ce203..8cd9bd2cdb 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -17,12 +17,6 @@ | |||
17 | 17 | ||
18 | #include "rtmutex_common.h" | 18 | #include "rtmutex_common.h" |
19 | 19 | ||
20 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
21 | # include "rtmutex-debug.h" | ||
22 | #else | ||
23 | # include "rtmutex.h" | ||
24 | #endif | ||
25 | |||
26 | /* | 20 | /* |
27 | * lock->owner state tracking: | 21 | * lock->owner state tracking: |
28 | * | 22 | * |
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856e79..2d3b83593c 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
@@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | |||
103 | 103 | ||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | 104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) |
105 | { | 105 | { |
106 | return (struct task_struct *) | 106 | return (struct task_struct *) |
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | 107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); |
108 | } | 108 | } |
109 | 109 | ||
@@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
120 | struct task_struct *proxy_owner); | 120 | struct task_struct *proxy_owner); |
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | 121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, |
122 | struct task_struct *proxy_owner); | 122 | struct task_struct *proxy_owner); |
123 | |||
124 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
125 | # include "rtmutex-debug.h" | ||
126 | #else | ||
127 | # include "rtmutex.h" | ||
128 | #endif | ||
129 | |||
123 | #endif | 130 | #endif |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9a87886b02..1ec620c030 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem) | |||
20 | might_sleep(); | 20 | might_sleep(); |
21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
22 | 22 | ||
23 | __down_read(sem); | 23 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
24 | } | 24 | } |
25 | 25 | ||
26 | EXPORT_SYMBOL(down_read); | 26 | EXPORT_SYMBOL(down_read); |
@@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem) | |||
47 | might_sleep(); | 47 | might_sleep(); |
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
49 | 49 | ||
50 | __down_write(sem); | 50 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
51 | } | 51 | } |
52 | 52 | ||
53 | EXPORT_SYMBOL(down_write); | 53 | EXPORT_SYMBOL(down_write); |
@@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
111 | might_sleep(); | 111 | might_sleep(); |
112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
113 | 113 | ||
114 | __down_read(sem); | 114 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
115 | } | 115 | } |
116 | 116 | ||
117 | EXPORT_SYMBOL(down_read_nested); | 117 | EXPORT_SYMBOL(down_read_nested); |
@@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
130 | might_sleep(); | 130 | might_sleep(); |
131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
132 | 132 | ||
133 | __down_write_nested(sem, subclass); | 133 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
134 | } | 134 | } |
135 | 135 | ||
136 | EXPORT_SYMBOL(down_write_nested); | 136 | EXPORT_SYMBOL(down_write_nested); |
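LOCK_CONTENDED() comes from the lock-statistics infrastructure: it attempts the architecture trylock fast path first, so contention can be attributed to the call site before falling back to the blocking slow path. The following is a conceptual equivalent for the down_read() case, not the exact macro expansion in this tree.

#include <linux/rwsem.h>

/* Conceptual sketch only; the real LOCK_CONTENDED() macro also invokes the
 * lockdep/lockstat hooks (lock_contended()/lock_acquired()) around this. */
static inline void demo_down_read(struct rw_semaphore *sem)
{
	if (__down_read_trylock(sem))
		return;		/* uncontended fast path */
	__down_read(sem);	/* contended: block in the slow path */
}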
diff --git a/kernel/sched.c b/kernel/sched.c index 50e1a31226..93cf241cfb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -16,13 +16,19 @@ | |||
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | ||
20 | * fair scheduling design by Con Kolivas. | ||
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | ||
22 | * by Peter Williams | ||
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | ||
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | ||
19 | */ | 25 | */ |
20 | 26 | ||
21 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
22 | #include <linux/module.h> | 28 | #include <linux/module.h> |
23 | #include <linux/nmi.h> | 29 | #include <linux/nmi.h> |
24 | #include <linux/init.h> | 30 | #include <linux/init.h> |
25 | #include <asm/uaccess.h> | 31 | #include <linux/uaccess.h> |
26 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
27 | #include <linux/smp_lock.h> | 33 | #include <linux/smp_lock.h> |
28 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
@@ -53,9 +59,9 @@ | |||
53 | #include <linux/kprobes.h> | 59 | #include <linux/kprobes.h> |
54 | #include <linux/delayacct.h> | 60 | #include <linux/delayacct.h> |
55 | #include <linux/reciprocal_div.h> | 61 | #include <linux/reciprocal_div.h> |
62 | #include <linux/unistd.h> | ||
56 | 63 | ||
57 | #include <asm/tlb.h> | 64 | #include <asm/tlb.h> |
58 | #include <asm/unistd.h> | ||
59 | 65 | ||
60 | /* | 66 | /* |
61 | * Scheduler clock - returns current time in nanosec units. | 67 | * Scheduler clock - returns current time in nanosec units. |
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
91 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 97 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
92 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 98 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
93 | 99 | ||
100 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
101 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
102 | |||
94 | /* | 103 | /* |
95 | * These are the 'tuning knobs' of the scheduler: | 104 | * These are the 'tuning knobs' of the scheduler: |
96 | * | 105 | * |
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
100 | */ | 109 | */ |
101 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | 110 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
102 | #define DEF_TIMESLICE (100 * HZ / 1000) | 111 | #define DEF_TIMESLICE (100 * HZ / 1000) |
103 | #define ON_RUNQUEUE_WEIGHT 30 | ||
104 | #define CHILD_PENALTY 95 | ||
105 | #define PARENT_PENALTY 100 | ||
106 | #define EXIT_WEIGHT 3 | ||
107 | #define PRIO_BONUS_RATIO 25 | ||
108 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | ||
109 | #define INTERACTIVE_DELTA 2 | ||
110 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | ||
111 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) | ||
112 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | ||
113 | |||
114 | /* | ||
115 | * If a task is 'interactive' then we reinsert it in the active | ||
116 | * array after it has expired its current timeslice. (it will not | ||
117 | * continue to run immediately, it will still roundrobin with | ||
118 | * other interactive tasks.) | ||
119 | * | ||
120 | * This part scales the interactivity limit depending on niceness. | ||
121 | * | ||
122 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | ||
123 | * Here are a few examples of different nice levels: | ||
124 | * | ||
125 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | ||
126 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | ||
127 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | ||
128 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | ||
129 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | ||
130 | * | ||
131 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic | ||
132 | * priority range a task can explore, a value of '1' means the | ||
133 | * task is rated interactive.) | ||
134 | * | ||
135 | * Ie. nice +19 tasks can never get 'interactive' enough to be | ||
136 | * reinserted into the active array. And only heavily CPU-hog nice -20 | ||
137 | * tasks will be expired. Default nice 0 tasks are somewhere between, | ||
138 | * it takes some effort for them to get interactive, but it's not | ||
139 | * too hard. | ||
140 | */ | ||
141 | |||
142 | #define CURRENT_BONUS(p) \ | ||
143 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | ||
144 | MAX_SLEEP_AVG) | ||
145 | |||
146 | #define GRANULARITY (10 * HZ / 1000 ? : 1) | ||
147 | |||
148 | #ifdef CONFIG_SMP | ||
149 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
150 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | ||
151 | num_online_cpus()) | ||
152 | #else | ||
153 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
154 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | ||
155 | #endif | ||
156 | |||
157 | #define SCALE(v1,v1_max,v2_max) \ | ||
158 | (v1) * (v2_max) / (v1_max) | ||
159 | |||
160 | #define DELTA(p) \ | ||
161 | (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ | ||
162 | INTERACTIVE_DELTA) | ||
163 | |||
164 | #define TASK_INTERACTIVE(p) \ | ||
165 | ((p)->prio <= (p)->static_prio - DELTA(p)) | ||
166 | |||
167 | #define INTERACTIVE_SLEEP(p) \ | ||
168 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | ||
169 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | ||
170 | |||
171 | #define TASK_PREEMPTS_CURR(p, rq) \ | ||
172 | ((p)->prio < (rq)->curr->prio) | ||
173 | |||
174 | #define SCALE_PRIO(x, prio) \ | ||
175 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
176 | |||
177 | static unsigned int static_prio_timeslice(int static_prio) | ||
178 | { | ||
179 | if (static_prio < NICE_TO_PRIO(0)) | ||
180 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
181 | else | ||
182 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
183 | } | ||
184 | 112 | ||
185 | #ifdef CONFIG_SMP | 113 | #ifdef CONFIG_SMP |
186 | /* | 114 | /* |
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
203 | } | 131 | } |
204 | #endif | 132 | #endif |
205 | 133 | ||
134 | #define SCALE_PRIO(x, prio) \ | ||
135 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
136 | |||
206 | /* | 137 | /* |
207 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | 138 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
208 | * to time slice values: [800ms ... 100ms ... 5ms] | 139 | * to time slice values: [800ms ... 100ms ... 5ms] |
209 | * | ||
210 | * The higher a thread's priority, the bigger timeslices | ||
211 | * it gets during one round of execution. But even the lowest | ||
212 | * priority thread gets MIN_TIMESLICE worth of execution time. | ||
213 | */ | 140 | */ |
141 | static unsigned int static_prio_timeslice(int static_prio) | ||
142 | { | ||
143 | if (static_prio == NICE_TO_PRIO(19)) | ||
144 | return 1; | ||
145 | |||
146 | if (static_prio < NICE_TO_PRIO(0)) | ||
147 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
148 | else | ||
149 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
150 | } | ||
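For reference, the nice-to-slice mapping above works out to roughly [800ms ... 100ms ... 1 jiffy]. A stand-alone user-space sketch; the constants are assumptions matching the usual kernel values (HZ=1000, MAX_PRIO=140, MAX_USER_PRIO=40), not taken from this diff:

#include <stdio.h>

#define HZ              1000
#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   40
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define MIN_TIMESLICE   (5 * HZ / 1000 > 1 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE   (100 * HZ / 1000)
#define SCALE_PRIO(x, prio) \
        ((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
         (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

/* same shape as the function in the hunk above */
static unsigned int static_prio_timeslice(int static_prio)
{
        if (static_prio == NICE_TO_PRIO(19))
                return 1;
        if (static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
        return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
        int nices[] = { -20, 0, 19 };
        unsigned int i;

        for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
                printf("nice %3d -> %u jiffies\n", nices[i],
                       static_prio_timeslice(NICE_TO_PRIO(nices[i])));
        return 0;
}

With HZ=1000 a jiffy is 1ms, so nice -20 gets 800 jiffies, nice 0 gets 100, and nice +19 now gets a single jiffy (the 5ms figure in the comment above predates the explicit nice-19 special case).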
151 | |||
152 | static inline int rt_policy(int policy) | ||
153 | { | ||
154 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | ||
155 | return 1; | ||
156 | return 0; | ||
157 | } | ||
214 | 158 | ||
215 | static inline unsigned int task_timeslice(struct task_struct *p) | 159 | static inline int task_has_rt_policy(struct task_struct *p) |
216 | { | 160 | { |
217 | return static_prio_timeslice(p->static_prio); | 161 | return rt_policy(p->policy); |
218 | } | 162 | } |
219 | 163 | ||
220 | /* | 164 | /* |
221 | * These are the runqueue data structures: | 165 | * This is the priority-queue data structure of the RT scheduling class: |
222 | */ | 166 | */ |
167 | struct rt_prio_array { | ||
168 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
169 | struct list_head queue[MAX_RT_PRIO]; | ||
170 | }; | ||
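A toy illustration of the bitmap-plus-list idea behind struct rt_prio_array: picking the next RT task is "find the lowest set bit, then take the head of that priority's queue". The sketch below uses a plain unsigned long and a counter array instead of DECLARE_BITMAP and struct list_head, purely for brevity:

#include <stdio.h>

int main(void)
{
        unsigned long bitmap = 0;
        int nr_queued[64] = { 0 };
        int prio;

        /* enqueue one task at RT priority 9 and two at priority 3 */
        nr_queued[9] += 1; bitmap |= 1UL << 9;
        nr_queued[3] += 2; bitmap |= 1UL << 3;

        /* the "find first set bit" step; __builtin_ctzl is a GCC builtin */
        prio = __builtin_ctzl(bitmap);
        printf("next RT task comes from queue %d (%d task(s) queued there)\n",
               prio, nr_queued[prio]);
        return 0;
}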
171 | |||
172 | struct load_stat { | ||
173 | struct load_weight load; | ||
174 | u64 load_update_start, load_update_last; | ||
175 | unsigned long delta_fair, delta_exec, delta_stat; | ||
176 | }; | ||
177 | |||
178 | /* CFS-related fields in a runqueue */ | ||
179 | struct cfs_rq { | ||
180 | struct load_weight load; | ||
181 | unsigned long nr_running; | ||
182 | |||
183 | s64 fair_clock; | ||
184 | u64 exec_clock; | ||
185 | s64 wait_runtime; | ||
186 | u64 sleeper_bonus; | ||
187 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | ||
188 | |||
189 | struct rb_root tasks_timeline; | ||
190 | struct rb_node *rb_leftmost; | ||
191 | struct rb_node *rb_load_balance_curr; | ||
192 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
193 | /* 'curr' points to currently running entity on this cfs_rq. | ||
194 | * It is set to NULL otherwise (i.e when none are currently running). | ||
195 | */ | ||
196 | struct sched_entity *curr; | ||
197 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
198 | |||
199 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
200 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
201 | * (like users, containers etc.) | ||
202 | * | ||
203 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
204 | * list is used during load balance. | ||
205 | */ | ||
206 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | ||
207 | #endif | ||
208 | }; | ||
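tasks_timeline and rb_leftmost above keep runnable entities sorted by a fairness key, with the leftmost (most under-serviced) entity being the next one to run. A toy user-space illustration of that ordering, using a sorted singly-linked list with a made-up key rather than the kernel rbtree:

#include <stdio.h>

struct toy_se {
        long key;                       /* e.g. derived from wait_runtime */
        struct toy_se *next;
};

static struct toy_se *timeline;         /* sorted, smallest key first */

static void toy_enqueue(struct toy_se *se)
{
        struct toy_se **link = &timeline;

        /* walk until we find the first entity with a larger key */
        while (*link && (*link)->key <= se->key)
                link = &(*link)->next;
        se->next = *link;
        *link = se;
}

static struct toy_se *toy_pick_next(void)
{
        return timeline;                /* the "leftmost" entity */
}

int main(void)
{
        struct toy_se a = { 30, NULL }, b = { -10, NULL }, c = { 5, NULL };

        toy_enqueue(&a);
        toy_enqueue(&b);
        toy_enqueue(&c);
        printf("next to run has key %ld\n", toy_pick_next()->key);
        return 0;
}

The point of caching rb_leftmost in the real structure is that picking the next entity stays cheap while insertion remains O(log n) in the rbtree.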
223 | 209 | ||
224 | struct prio_array { | 210 | /* Real-Time classes' related field in a runqueue: */ |
225 | unsigned int nr_active; | 211 | struct rt_rq { |
226 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ | 212 | struct rt_prio_array active; |
227 | struct list_head queue[MAX_PRIO]; | 213 | int rt_load_balance_idx; |
214 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | ||
228 | }; | 215 | }; |
229 | 216 | ||
230 | /* | 217 | /* |
@@ -235,22 +222,28 @@ struct prio_array { | |||
235 | * acquire operations must be ordered by ascending &runqueue. | 222 | * acquire operations must be ordered by ascending &runqueue. |
236 | */ | 223 | */ |
237 | struct rq { | 224 | struct rq { |
238 | spinlock_t lock; | 225 | spinlock_t lock; /* runqueue lock */ |
239 | 226 | ||
240 | /* | 227 | /* |
241 | * nr_running and cpu_load should be in the same cacheline because | 228 | * nr_running and cpu_load should be in the same cacheline because |
242 | * remote CPUs use both these fields when doing load calculation. | 229 | * remote CPUs use both these fields when doing load calculation. |
243 | */ | 230 | */ |
244 | unsigned long nr_running; | 231 | unsigned long nr_running; |
245 | unsigned long raw_weighted_load; | 232 | #define CPU_LOAD_IDX_MAX 5 |
246 | #ifdef CONFIG_SMP | 233 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
247 | unsigned long cpu_load[3]; | ||
248 | unsigned char idle_at_tick; | 234 | unsigned char idle_at_tick; |
249 | #ifdef CONFIG_NO_HZ | 235 | #ifdef CONFIG_NO_HZ |
250 | unsigned char in_nohz_recently; | 236 | unsigned char in_nohz_recently; |
251 | #endif | 237 | #endif |
238 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | ||
239 | unsigned long nr_load_updates; | ||
240 | u64 nr_switches; | ||
241 | |||
242 | struct cfs_rq cfs; | ||
243 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
244 | struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ | ||
252 | #endif | 245 | #endif |
253 | unsigned long long nr_switches; | 246 | struct rt_rq rt; |
254 | 247 | ||
255 | /* | 248 | /* |
256 | * This is part of a global counter where only the total sum | 249 | * This is part of a global counter where only the total sum |
@@ -260,14 +253,18 @@ struct rq { | |||
260 | */ | 253 | */ |
261 | unsigned long nr_uninterruptible; | 254 | unsigned long nr_uninterruptible; |
262 | 255 | ||
263 | unsigned long expired_timestamp; | ||
264 | /* Cached timestamp set by update_cpu_clock() */ | ||
265 | unsigned long long most_recent_timestamp; | ||
266 | struct task_struct *curr, *idle; | 256 | struct task_struct *curr, *idle; |
267 | unsigned long next_balance; | 257 | unsigned long next_balance; |
268 | struct mm_struct *prev_mm; | 258 | struct mm_struct *prev_mm; |
269 | struct prio_array *active, *expired, arrays[2]; | 259 | |
270 | int best_expired_prio; | 260 | u64 clock, prev_clock_raw; |
261 | s64 clock_max_delta; | ||
262 | |||
263 | unsigned int clock_warps, clock_overflows; | ||
264 | unsigned int clock_unstable_events; | ||
265 | |||
266 | struct sched_class *load_balance_class; | ||
267 | |||
271 | atomic_t nr_iowait; | 268 | atomic_t nr_iowait; |
272 | 269 | ||
273 | #ifdef CONFIG_SMP | 270 | #ifdef CONFIG_SMP |
@@ -304,9 +301,14 @@ struct rq { | |||
304 | struct lock_class_key rq_lock_key; | 301 | struct lock_class_key rq_lock_key; |
305 | }; | 302 | }; |
306 | 303 | ||
307 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; | 304 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
308 | static DEFINE_MUTEX(sched_hotcpu_mutex); | 305 | static DEFINE_MUTEX(sched_hotcpu_mutex); |
309 | 306 | ||
307 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | ||
308 | { | ||
309 | rq->curr->sched_class->check_preempt_curr(rq, p); | ||
310 | } | ||
311 | |||
310 | static inline int cpu_of(struct rq *rq) | 312 | static inline int cpu_of(struct rq *rq) |
311 | { | 313 | { |
312 | #ifdef CONFIG_SMP | 314 | #ifdef CONFIG_SMP |
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq) | |||
317 | } | 319 | } |
318 | 320 | ||
319 | /* | 321 | /* |
322 | * Per-runqueue clock, as finegrained as the platform can give us: | ||
323 | */ | ||
324 | static unsigned long long __rq_clock(struct rq *rq) | ||
325 | { | ||
326 | u64 prev_raw = rq->prev_clock_raw; | ||
327 | u64 now = sched_clock(); | ||
328 | s64 delta = now - prev_raw; | ||
329 | u64 clock = rq->clock; | ||
330 | |||
331 | /* | ||
332 | * Protect against sched_clock() occasionally going backwards: | ||
333 | */ | ||
334 | if (unlikely(delta < 0)) { | ||
335 | clock++; | ||
336 | rq->clock_warps++; | ||
337 | } else { | ||
338 | /* | ||
339 | * Catch too large forward jumps too: | ||
340 | */ | ||
341 | if (unlikely(delta > 2*TICK_NSEC)) { | ||
342 | clock++; | ||
343 | rq->clock_overflows++; | ||
344 | } else { | ||
345 | if (unlikely(delta > rq->clock_max_delta)) | ||
346 | rq->clock_max_delta = delta; | ||
347 | clock += delta; | ||
348 | } | ||
349 | } | ||
350 | |||
351 | rq->prev_clock_raw = now; | ||
352 | rq->clock = clock; | ||
353 | |||
354 | return clock; | ||
355 | } | ||
356 | |||
357 | static inline unsigned long long rq_clock(struct rq *rq) | ||
358 | { | ||
359 | int this_cpu = smp_processor_id(); | ||
360 | |||
361 | if (this_cpu == cpu_of(rq)) | ||
362 | return __rq_clock(rq); | ||
363 | |||
364 | return rq->clock; | ||
365 | } | ||
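A stand-alone sketch of the filtering __rq_clock() performs above: a raw timestamp that goes backwards advances the filtered clock by just one count, and a forward jump of more than two ticks is clamped the same way. TICK_NSEC and the sample values below are assumptions for illustration:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL            /* assumes HZ=1000, i.e. 1ms ticks */

struct toy_rq {
        uint64_t clock, prev_clock_raw;
};

static uint64_t toy_rq_clock(struct toy_rq *rq, uint64_t raw)
{
        int64_t delta = (int64_t)(raw - rq->prev_clock_raw);

        if (delta < 0)
                rq->clock++;                    /* raw clock warped backwards */
        else if (delta > 2 * (int64_t)TICK_NSEC)
                rq->clock++;                    /* suspiciously large jump */
        else
                rq->clock += delta;             /* normal monotonic advance */

        rq->prev_clock_raw = raw;
        return rq->clock;
}

int main(void)
{
        struct toy_rq rq = { 0, 0 };
        uint64_t raw[] = { 100, 500, 400 /* backwards */, 10000000 /* jump */ };
        unsigned int i;

        for (i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
                printf("raw %8llu -> rq->clock %llu\n",
                       (unsigned long long)raw[i],
                       (unsigned long long)toy_rq_clock(&rq, raw[i]));
        return 0;
}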
366 | |||
367 | /* | ||
320 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 368 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
321 | * See detach_destroy_domains: synchronize_sched for details. | 369 | * See detach_destroy_domains: synchronize_sched for details. |
322 | * | 370 | * |
@@ -331,6 +379,35 @@ static inline int cpu_of(struct rq *rq) | |||
331 | #define task_rq(p) cpu_rq(task_cpu(p)) | 379 | #define task_rq(p) cpu_rq(task_cpu(p)) |
332 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 380 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
333 | 381 | ||
382 | /* | ||
383 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
384 | * clock constructed from sched_clock(): | ||
385 | */ | ||
386 | unsigned long long cpu_clock(int cpu) | ||
387 | { | ||
388 | struct rq *rq = cpu_rq(cpu); | ||
389 | unsigned long long now; | ||
390 | unsigned long flags; | ||
391 | |||
392 | spin_lock_irqsave(&rq->lock, flags); | ||
393 | now = rq_clock(rq); | ||
394 | spin_unlock_irqrestore(&rq->lock, flags); | ||
395 | |||
396 | return now; | ||
397 | } | ||
398 | |||
399 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
400 | /* Change a task's ->cfs_rq if it moves across CPUs */ | ||
401 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
402 | { | ||
403 | p->se.cfs_rq = &task_rq(p)->cfs; | ||
404 | } | ||
405 | #else | ||
406 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
407 | { | ||
408 | } | ||
409 | #endif | ||
410 | |||
334 | #ifndef prepare_arch_switch | 411 | #ifndef prepare_arch_switch |
335 | # define prepare_arch_switch(next) do { } while (0) | 412 | # define prepare_arch_switch(next) do { } while (0) |
336 | #endif | 413 | #endif |
@@ -460,134 +537,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
460 | spin_unlock_irqrestore(&rq->lock, *flags); | 537 | spin_unlock_irqrestore(&rq->lock, *flags); |
461 | } | 538 | } |
462 | 539 | ||
463 | #ifdef CONFIG_SCHEDSTATS | ||
464 | /* | ||
465 | * bump this up when changing the output format or the meaning of an existing | ||
466 | * format, so that tools can adapt (or abort) | ||
467 | */ | ||
468 | #define SCHEDSTAT_VERSION 14 | ||
469 | |||
470 | static int show_schedstat(struct seq_file *seq, void *v) | ||
471 | { | ||
472 | int cpu; | ||
473 | |||
474 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
475 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
476 | for_each_online_cpu(cpu) { | ||
477 | struct rq *rq = cpu_rq(cpu); | ||
478 | #ifdef CONFIG_SMP | ||
479 | struct sched_domain *sd; | ||
480 | int dcnt = 0; | ||
481 | #endif | ||
482 | |||
483 | /* runqueue-specific stats */ | ||
484 | seq_printf(seq, | ||
485 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", | ||
486 | cpu, rq->yld_both_empty, | ||
487 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | ||
488 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | ||
489 | rq->ttwu_cnt, rq->ttwu_local, | ||
490 | rq->rq_sched_info.cpu_time, | ||
491 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | ||
492 | |||
493 | seq_printf(seq, "\n"); | ||
494 | |||
495 | #ifdef CONFIG_SMP | ||
496 | /* domain-specific stats */ | ||
497 | preempt_disable(); | ||
498 | for_each_domain(cpu, sd) { | ||
499 | enum idle_type itype; | ||
500 | char mask_str[NR_CPUS]; | ||
501 | |||
502 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | ||
503 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | ||
504 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | ||
505 | itype++) { | ||
506 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | ||
507 | "%lu", | ||
508 | sd->lb_cnt[itype], | ||
509 | sd->lb_balanced[itype], | ||
510 | sd->lb_failed[itype], | ||
511 | sd->lb_imbalance[itype], | ||
512 | sd->lb_gained[itype], | ||
513 | sd->lb_hot_gained[itype], | ||
514 | sd->lb_nobusyq[itype], | ||
515 | sd->lb_nobusyg[itype]); | ||
516 | } | ||
517 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | ||
518 | " %lu %lu %lu\n", | ||
519 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | ||
520 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | ||
521 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | ||
522 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
523 | sd->ttwu_move_balance); | ||
524 | } | ||
525 | preempt_enable(); | ||
526 | #endif | ||
527 | } | ||
528 | return 0; | ||
529 | } | ||
530 | |||
531 | static int schedstat_open(struct inode *inode, struct file *file) | ||
532 | { | ||
533 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
534 | char *buf = kmalloc(size, GFP_KERNEL); | ||
535 | struct seq_file *m; | ||
536 | int res; | ||
537 | |||
538 | if (!buf) | ||
539 | return -ENOMEM; | ||
540 | res = single_open(file, show_schedstat, NULL); | ||
541 | if (!res) { | ||
542 | m = file->private_data; | ||
543 | m->buf = buf; | ||
544 | m->size = size; | ||
545 | } else | ||
546 | kfree(buf); | ||
547 | return res; | ||
548 | } | ||
549 | |||
550 | const struct file_operations proc_schedstat_operations = { | ||
551 | .open = schedstat_open, | ||
552 | .read = seq_read, | ||
553 | .llseek = seq_lseek, | ||
554 | .release = single_release, | ||
555 | }; | ||
556 | |||
557 | /* | ||
558 | * Expects runqueue lock to be held for atomicity of update | ||
559 | */ | ||
560 | static inline void | ||
561 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
562 | { | ||
563 | if (rq) { | ||
564 | rq->rq_sched_info.run_delay += delta_jiffies; | ||
565 | rq->rq_sched_info.pcnt++; | ||
566 | } | ||
567 | } | ||
568 | |||
569 | /* | ||
570 | * Expects runqueue lock to be held for atomicity of update | ||
571 | */ | ||
572 | static inline void | ||
573 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
574 | { | ||
575 | if (rq) | ||
576 | rq->rq_sched_info.cpu_time += delta_jiffies; | ||
577 | } | ||
578 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | ||
579 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | ||
580 | #else /* !CONFIG_SCHEDSTATS */ | ||
581 | static inline void | ||
582 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
583 | {} | ||
584 | static inline void | ||
585 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
586 | {} | ||
587 | # define schedstat_inc(rq, field) do { } while (0) | ||
588 | # define schedstat_add(rq, field, amt) do { } while (0) | ||
589 | #endif | ||
590 | |||
591 | /* | 540 | /* |
592 | * this_rq_lock - lock this runqueue and disable interrupts. | 541 | * this_rq_lock - lock this runqueue and disable interrupts. |
593 | */ | 542 | */ |
@@ -603,177 +552,172 @@ static inline struct rq *this_rq_lock(void) | |||
603 | return rq; | 552 | return rq; |
604 | } | 553 | } |
605 | 554 | ||
606 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | ||
607 | /* | 555 | /* |
608 | * Called when a process is dequeued from the active array and given | 556 | * CPU frequency is/was unstable - start new by setting prev_clock_raw: |
609 | * the cpu. We should note that with the exception of interactive | ||
610 | * tasks, the expired queue will become the active queue after the active | ||
611 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
612 | * expired queue. (Interactive tasks may be requeued directly to the | ||
613 | * active queue, thus delaying tasks in the expired queue from running; | ||
614 | * see scheduler_tick()). | ||
615 | * | ||
616 | * This function is only called from sched_info_arrive(), rather than | ||
617 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
618 | * times as it is shuffled about, we're really interested in knowing how | ||
619 | * long it was from the *first* time it was queued to the time that it | ||
620 | * finally hit a cpu. | ||
621 | */ | 557 | */ |
622 | static inline void sched_info_dequeued(struct task_struct *t) | 558 | void sched_clock_unstable_event(void) |
623 | { | 559 | { |
624 | t->sched_info.last_queued = 0; | 560 | unsigned long flags; |
561 | struct rq *rq; | ||
562 | |||
563 | rq = task_rq_lock(current, &flags); | ||
564 | rq->prev_clock_raw = sched_clock(); | ||
565 | rq->clock_unstable_events++; | ||
566 | task_rq_unlock(rq, &flags); | ||
625 | } | 567 | } |
626 | 568 | ||
627 | /* | 569 | /* |
628 | * Called when a task finally hits the cpu. We can now calculate how | 570 | * resched_task - mark a task 'to be rescheduled now'. |
629 | * long it was waiting to run. We also note when it began so that we | 571 | * |
630 | * can keep stats on how long its timeslice is. | 572 | * On UP this means the setting of the need_resched flag, on SMP it |
573 | * might also involve a cross-CPU call to trigger the scheduler on | ||
574 | * the target CPU. | ||
631 | */ | 575 | */ |
632 | static void sched_info_arrive(struct task_struct *t) | 576 | #ifdef CONFIG_SMP |
577 | |||
578 | #ifndef tsk_is_polling | ||
579 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
580 | #endif | ||
581 | |||
582 | static void resched_task(struct task_struct *p) | ||
633 | { | 583 | { |
634 | unsigned long now = jiffies, delta_jiffies = 0; | 584 | int cpu; |
585 | |||
586 | assert_spin_locked(&task_rq(p)->lock); | ||
587 | |||
588 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | ||
589 | return; | ||
590 | |||
591 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | ||
635 | 592 | ||
636 | if (t->sched_info.last_queued) | 593 | cpu = task_cpu(p); |
637 | delta_jiffies = now - t->sched_info.last_queued; | 594 | if (cpu == smp_processor_id()) |
638 | sched_info_dequeued(t); | 595 | return; |
639 | t->sched_info.run_delay += delta_jiffies; | ||
640 | t->sched_info.last_arrival = now; | ||
641 | t->sched_info.pcnt++; | ||
642 | 596 | ||
643 | rq_sched_info_arrive(task_rq(t), delta_jiffies); | 597 | /* NEED_RESCHED must be visible before we test polling */ |
598 | smp_mb(); | ||
599 | if (!tsk_is_polling(p)) | ||
600 | smp_send_reschedule(cpu); | ||
644 | } | 601 | } |
645 | 602 | ||
646 | /* | 603 | static void resched_cpu(int cpu) |
647 | * Called when a process is queued into either the active or expired | 604 | { |
648 | * array. The time is noted and later used to determine how long we | 605 | struct rq *rq = cpu_rq(cpu); |
649 | * had to wait for us to reach the cpu. Since the expired queue will | 606 | unsigned long flags; |
650 | * become the active queue after active queue is empty, without dequeuing | 607 | |
651 | * and requeuing any tasks, we are interested in queuing to either. It | 608 | if (!spin_trylock_irqsave(&rq->lock, flags)) |
652 | * is unusual but not impossible for tasks to be dequeued and immediately | 609 | return; |
653 | * requeued in the same or another array: this can happen in sched_yield(), | 610 | resched_task(cpu_curr(cpu)); |
654 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | 611 | spin_unlock_irqrestore(&rq->lock, flags); |
655 | * to runqueue. | 612 | } |
656 | * | 613 | #else |
657 | * This function is only called from enqueue_task(), but also only updates | 614 | static inline void resched_task(struct task_struct *p) |
658 | * the timestamp if it is already not set. It's assumed that | ||
659 | * sched_info_dequeued() will clear that stamp when appropriate. | ||
660 | */ | ||
661 | static inline void sched_info_queued(struct task_struct *t) | ||
662 | { | 615 | { |
663 | if (unlikely(sched_info_on())) | 616 | assert_spin_locked(&task_rq(p)->lock); |
664 | if (!t->sched_info.last_queued) | 617 | set_tsk_need_resched(p); |
665 | t->sched_info.last_queued = jiffies; | ||
666 | } | 618 | } |
619 | #endif | ||
667 | 620 | ||
668 | /* | 621 | static u64 div64_likely32(u64 divident, unsigned long divisor) |
669 | * Called when a process ceases being the active-running process, either | ||
670 | * voluntarily or involuntarily. Now we can calculate how long we ran. | ||
671 | */ | ||
672 | static inline void sched_info_depart(struct task_struct *t) | ||
673 | { | 622 | { |
674 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; | 623 | #if BITS_PER_LONG == 32 |
624 | if (likely(divident <= 0xffffffffULL)) | ||
625 | return (u32)divident / divisor; | ||
626 | do_div(divident, divisor); | ||
675 | 627 | ||
676 | t->sched_info.cpu_time += delta_jiffies; | 628 | return divident; |
677 | rq_sched_info_depart(task_rq(t), delta_jiffies); | 629 | #else |
630 | return divident / divisor; | ||
631 | #endif | ||
678 | } | 632 | } |
679 | 633 | ||
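div64_likely32() exists because a full 64-bit division is expensive on 32-bit machines, while the dividends seen on this path usually fit in 32 bits. A stand-alone equivalent of that fast path, with made-up inputs (on a 64-bit host both branches are of course plain divides):

#include <stdio.h>
#include <stdint.h>

static uint64_t div64_likely32_sketch(uint64_t dividend, unsigned long divisor)
{
        if (dividend <= 0xffffffffULL)
                return (uint32_t)dividend / divisor;    /* cheap 32-bit divide */
        return dividend / divisor;                      /* full 64-bit divide */
}

int main(void)
{
        printf("%llu\n", (unsigned long long)div64_likely32_sketch(3000000000ULL, 7));
        printf("%llu\n", (unsigned long long)div64_likely32_sketch(1ULL << 40, 7));
        return 0;
}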
680 | /* | 634 | #if BITS_PER_LONG == 32 |
681 | * Called when tasks are switched involuntarily due, typically, to expiring | 635 | # define WMULT_CONST (~0UL) |
682 | * their time slice. (This may also be called when switching to or from | 636 | #else |
683 | * the idle task.) We are only called when prev != next. | 637 | # define WMULT_CONST (1UL << 32) |
684 | */ | 638 | #endif |
685 | static inline void | 639 | |
686 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 640 | #define WMULT_SHIFT 32 |
641 | |||
642 | static inline unsigned long | ||
643 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
644 | struct load_weight *lw) | ||
687 | { | 645 | { |
688 | struct rq *rq = task_rq(prev); | 646 | u64 tmp; |
689 | 647 | ||
648 | if (unlikely(!lw->inv_weight)) | ||
649 | lw->inv_weight = WMULT_CONST / lw->weight; | ||
650 | |||
651 | tmp = (u64)delta_exec * weight; | ||
690 | /* | 652 | /* |
691 | * prev now departs the cpu. It's not interesting to record | 653 | * Check whether we'd overflow the 64-bit multiplication: |
692 | * stats about how efficient we were at scheduling the idle | ||
693 | * process, however. | ||
694 | */ | 654 | */ |
695 | if (prev != rq->idle) | 655 | if (unlikely(tmp > WMULT_CONST)) { |
696 | sched_info_depart(prev); | 656 | tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) |
657 | >> (WMULT_SHIFT/2); | ||
658 | } else { | ||
659 | tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; | ||
660 | } | ||
697 | 661 | ||
698 | if (next != rq->idle) | 662 | return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); |
699 | sched_info_arrive(next); | ||
700 | } | ||
701 | static inline void | ||
702 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
703 | { | ||
704 | if (unlikely(sched_info_on())) | ||
705 | __sched_info_switch(prev, next); | ||
706 | } | 663 | } |
707 | #else | ||
708 | #define sched_info_queued(t) do { } while (0) | ||
709 | #define sched_info_switch(t, next) do { } while (0) | ||
710 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | ||
711 | 664 | ||
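calc_delta_mine() replaces the division delta_exec * weight / lw->weight with a multiplication by the cached inverse weight (about 2^32 / lw->weight) followed by a 32-bit shift. A stand-alone comparison with assumed values; it ignores the overflow branch and the sysctl_sched_runtime_limit clamp shown above:

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32

int main(void)
{
        uint64_t delta_exec = 4000000;          /* 4ms of execution, in ns */
        uint64_t weight     = 1024;             /* NICE_0_LOAD */
        uint64_t lw_weight  = 1024 + 819;       /* assumed: nice-0 + nice-1 runnable */
        uint64_t inv_weight = (1ULL << 32) / lw_weight;
        uint64_t fast  = (delta_exec * weight * inv_weight) >> WMULT_SHIFT;
        uint64_t exact = delta_exec * weight / lw_weight;

        printf("exact divide %llu, via inv_weight %llu\n",
               (unsigned long long)exact, (unsigned long long)fast);
        return 0;
}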
712 | /* | 665 | static inline unsigned long |
713 | * Adding/removing a task to/from a priority array: | 666 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) |
714 | */ | ||
715 | static void dequeue_task(struct task_struct *p, struct prio_array *array) | ||
716 | { | 667 | { |
717 | array->nr_active--; | 668 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
718 | list_del(&p->run_list); | ||
719 | if (list_empty(array->queue + p->prio)) | ||
720 | __clear_bit(p->prio, array->bitmap); | ||
721 | } | 669 | } |
722 | 670 | ||
723 | static void enqueue_task(struct task_struct *p, struct prio_array *array) | 671 | static void update_load_add(struct load_weight *lw, unsigned long inc) |
724 | { | 672 | { |
725 | sched_info_queued(p); | 673 | lw->weight += inc; |
726 | list_add_tail(&p->run_list, array->queue + p->prio); | 674 | lw->inv_weight = 0; |
727 | __set_bit(p->prio, array->bitmap); | ||
728 | array->nr_active++; | ||
729 | p->array = array; | ||
730 | } | 675 | } |
731 | 676 | ||
732 | /* | 677 | static void update_load_sub(struct load_weight *lw, unsigned long dec) |
733 | * Put task to the end of the run list without the overhead of dequeue | ||
734 | * followed by enqueue. | ||
735 | */ | ||
736 | static void requeue_task(struct task_struct *p, struct prio_array *array) | ||
737 | { | 678 | { |
738 | list_move_tail(&p->run_list, array->queue + p->prio); | 679 | lw->weight -= dec; |
680 | lw->inv_weight = 0; | ||
739 | } | 681 | } |
740 | 682 | ||
741 | static inline void | 683 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) |
742 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
743 | { | 684 | { |
744 | list_add(&p->run_list, array->queue + p->prio); | 685 | if (rq->curr != rq->idle && ls->load.weight) { |
745 | __set_bit(p->prio, array->bitmap); | 686 | ls->delta_exec += ls->delta_stat; |
746 | array->nr_active++; | 687 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); |
747 | p->array = array; | 688 | ls->delta_stat = 0; |
689 | } | ||
748 | } | 690 | } |
749 | 691 | ||
750 | /* | 692 | /* |
751 | * __normal_prio - return the priority that is based on the static | 693 | * Update delta_exec, delta_fair fields for rq. |
752 | * priority but is modified by bonuses/penalties. | ||
753 | * | 694 | * |
754 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 695 | * delta_fair clock advances at a rate inversely proportional to |
755 | * into the -5 ... 0 ... +5 bonus/penalty range. | 696 | * total load (rq->ls.load.weight) on the runqueue, while |
697 | * delta_exec advances at the same rate as wall-clock (provided | ||
698 | * cpu is not idle). | ||
756 | * | 699 | * |
757 | * We use 25% of the full 0...39 priority range so that: | 700 | * delta_exec / delta_fair is a measure of the (smoothened) load on this |
701 | * runqueue over any given interval. This (smoothened) load is used | ||
702 | * during load balance. | ||
758 | * | 703 | * |
759 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | 704 | * This function is called /before/ updating rq->ls.load |
760 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | 705 | * and when switching tasks. |
761 | * | ||
762 | * Both properties are important to certain workloads. | ||
763 | */ | 706 | */ |
764 | 707 | static void update_curr_load(struct rq *rq, u64 now) | |
765 | static inline int __normal_prio(struct task_struct *p) | ||
766 | { | 708 | { |
767 | int bonus, prio; | 709 | struct load_stat *ls = &rq->ls; |
768 | 710 | u64 start; | |
769 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | ||
770 | 711 | ||
771 | prio = p->static_prio - bonus; | 712 | start = ls->load_update_start; |
772 | if (prio < MAX_RT_PRIO) | 713 | ls->load_update_start = now; |
773 | prio = MAX_RT_PRIO; | 714 | ls->delta_stat += now - start; |
774 | if (prio > MAX_PRIO-1) | 715 | /* |
775 | prio = MAX_PRIO-1; | 716 | * Stagger updates to ls->delta_fair. Very frequent updates |
776 | return prio; | 717 | * can be expensive. |
718 | */ | ||
719 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
720 | __update_curr_load(rq, ls); | ||
777 | } | 721 | } |
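To make the comment above concrete: delta_exec tracks wall-clock execution time, while delta_fair grows more slowly as the runqueue weight rises. A stand-alone sketch, assuming a load made up purely of nice-0 tasks at weight 1024 each:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

static uint64_t calc_delta_fair_sketch(uint64_t delta_exec, uint64_t load_weight)
{
        return delta_exec * NICE_0_LOAD / load_weight;
}

int main(void)
{
        uint64_t delta_exec = 10000000;         /* 10ms of wall clock, in ns */
        int nr_tasks;

        /* fair clock advances at 1/1, 1/2, 1/4 of wall-clock rate */
        for (nr_tasks = 1; nr_tasks <= 4; nr_tasks *= 2)
                printf("%d nice-0 tasks: delta_exec %llu ns -> delta_fair %llu ns\n",
                       nr_tasks, (unsigned long long)delta_exec,
                       (unsigned long long)calc_delta_fair_sketch(delta_exec,
                                                nr_tasks * NICE_0_LOAD));
        return 0;
}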
778 | 722 | ||
779 | /* | 723 | /* |
@@ -791,53 +735,155 @@ static inline int __normal_prio(struct task_struct *p) | |||
791 | * this code will need modification | 735 | * this code will need modification |
792 | */ | 736 | */ |
793 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | 737 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
794 | #define LOAD_WEIGHT(lp) \ | 738 | #define load_weight(lp) \ |
795 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | 739 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
796 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | 740 | #define PRIO_TO_LOAD_WEIGHT(prio) \ |
797 | LOAD_WEIGHT(static_prio_timeslice(prio)) | 741 | load_weight(static_prio_timeslice(prio)) |
798 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | 742 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
799 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | 743 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) |
800 | 744 | ||
801 | static void set_load_weight(struct task_struct *p) | 745 | #define WEIGHT_IDLEPRIO 2 |
802 | { | 746 | #define WMULT_IDLEPRIO (1 << 31) |
803 | if (has_rt_policy(p)) { | 747 | |
804 | #ifdef CONFIG_SMP | 748 | /* |
805 | if (p == task_rq(p)->migration_thread) | 749 | * Nice levels are multiplicative, with a gentle 10% change for every |
806 | /* | 750 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to |
807 | * The migration thread does the actual balancing. | 751 | * nice 1, it will get ~10% less CPU time than another CPU-bound task |
808 | * Giving its load any weight will skew balancing | 752 | * that remained on nice 0. |
809 | * adversely. | 753 | * |
810 | */ | 754 | * The "10% effect" is relative and cumulative: from _any_ nice level, |
811 | p->load_weight = 0; | 755 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level |
812 | else | 756 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. |
813 | #endif | 757 | * If a task goes up by ~10% and another task goes down by ~10% then |
814 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | 758 | * the relative distance between them is ~25%.) |
815 | } else | 759 | */ |
816 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | 760 | static const int prio_to_weight[40] = { |
817 | } | 761 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, |
762 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, | ||
763 | /* 0 */ NICE_0_LOAD /* 1024 */, | ||
764 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, | ||
765 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, | ||
766 | }; | ||
767 | |||
768 | /* | ||
769 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
770 | * | ||
771 | * In cases where the weight does not change often, we can use the | ||
772 | * precalculated inverse to speed up arithmetics by turning divisions | ||
773 | * into multiplications: | ||
774 | */ | ||
775 | static const u32 prio_to_wmult[40] = { | ||
776 | /* -20 */ 48356, 60446, 75558, 94446, 118058, | ||
777 | /* -15 */ 147573, 184467, 230589, 288233, 360285, | ||
778 | /* -10 */ 450347, 562979, 703746, 879575, 1099582, | ||
779 | /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, | ||
780 | /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, | ||
781 | /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, | ||
782 | /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, | ||
783 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
784 | }; | ||
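The two tables encode the "10% effect" described above: each nice level step scales the weight by roughly 1.25, and prio_to_wmult[] holds approximately 2^32 divided by the corresponding weight. A stand-alone spot check:

#include <stdio.h>
#include <stdint.h>

static const int weight[40] = {
        88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
         9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
         1024,   819,   655,   524,   419,   336,   268,   215,   172,   137,
          110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

int main(void)
{
        int i;

        /* each nice level step is a ~1.25x weight ratio */
        for (i = 18; i <= 22; i++)
                printf("nice %3d -> %3d: ratio %.3f\n", i - 20, i - 19,
                       (double)weight[i] / weight[i + 1]);

        /* the inverse table entry for nice 0: 2^32 / 1024 = 4194304 */
        printf("2^32 / weight[nice 0] = %llu\n",
               (unsigned long long)((1ULL << 32) / weight[20]));
        return 0;
}

With these weights, two CPU hogs at nice 0 and nice 5 end up sharing the CPU roughly 75%/25% (1024 vs 336).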
818 | 785 | ||
819 | static inline void | 786 | static inline void |
820 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 787 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) |
821 | { | 788 | { |
822 | rq->raw_weighted_load += p->load_weight; | 789 | update_curr_load(rq, now); |
790 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
823 | } | 791 | } |
824 | 792 | ||
825 | static inline void | 793 | static inline void |
826 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 794 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) |
827 | { | 795 | { |
828 | rq->raw_weighted_load -= p->load_weight; | 796 | update_curr_load(rq, now); |
797 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
829 | } | 798 | } |
830 | 799 | ||
831 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | 800 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
832 | { | 801 | { |
833 | rq->nr_running++; | 802 | rq->nr_running++; |
834 | inc_raw_weighted_load(rq, p); | 803 | inc_load(rq, p, now); |
835 | } | 804 | } |
836 | 805 | ||
837 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | 806 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
838 | { | 807 | { |
839 | rq->nr_running--; | 808 | rq->nr_running--; |
840 | dec_raw_weighted_load(rq, p); | 809 | dec_load(rq, p, now); |
810 | } | ||
811 | |||
812 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
813 | |||
814 | /* | ||
815 | * runqueue iterator, to support SMP load-balancing between different | ||
816 | * scheduling classes, without having to expose their internal data | ||
817 | * structures to the load-balancing proper: | ||
818 | */ | ||
819 | struct rq_iterator { | ||
820 | void *arg; | ||
821 | struct task_struct *(*start)(void *); | ||
822 | struct task_struct *(*next)(void *); | ||
823 | }; | ||
824 | |||
825 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
826 | unsigned long max_nr_move, unsigned long max_load_move, | ||
827 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
828 | int *all_pinned, unsigned long *load_moved, | ||
829 | int this_best_prio, int best_prio, int best_prio_seen, | ||
830 | struct rq_iterator *iterator); | ||
831 | |||
832 | #include "sched_stats.h" | ||
833 | #include "sched_rt.c" | ||
834 | #include "sched_fair.c" | ||
835 | #include "sched_idletask.c" | ||
836 | #ifdef CONFIG_SCHED_DEBUG | ||
837 | # include "sched_debug.c" | ||
838 | #endif | ||
839 | |||
840 | #define sched_class_highest (&rt_sched_class) | ||
841 | |||
842 | static void set_load_weight(struct task_struct *p) | ||
843 | { | ||
844 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; | ||
845 | p->se.wait_runtime = 0; | ||
846 | |||
847 | if (task_has_rt_policy(p)) { | ||
848 | p->se.load.weight = prio_to_weight[0] * 2; | ||
849 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | ||
850 | return; | ||
851 | } | ||
852 | |||
853 | /* | ||
854 | * SCHED_IDLE tasks get minimal weight: | ||
855 | */ | ||
856 | if (p->policy == SCHED_IDLE) { | ||
857 | p->se.load.weight = WEIGHT_IDLEPRIO; | ||
858 | p->se.load.inv_weight = WMULT_IDLEPRIO; | ||
859 | return; | ||
860 | } | ||
861 | |||
862 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | ||
863 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | ||
864 | } | ||
865 | |||
866 | static void | ||
867 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
868 | { | ||
869 | sched_info_queued(p); | ||
870 | p->sched_class->enqueue_task(rq, p, wakeup, now); | ||
871 | p->se.on_rq = 1; | ||
872 | } | ||
873 | |||
874 | static void | ||
875 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
876 | { | ||
877 | p->sched_class->dequeue_task(rq, p, sleep, now); | ||
878 | p->se.on_rq = 0; | ||
879 | } | ||
880 | |||
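enqueue_task() and dequeue_task() above no longer manipulate priority arrays directly; they dispatch through the task's scheduling class. A toy sketch of that indirection; the names below are invented for illustration and are not the kernel's struct sched_class:

#include <stdio.h>

struct toy_task;

struct toy_sched_class {
        const char *name;
        void (*enqueue_task)(struct toy_task *p);
};

struct toy_task {
        const struct toy_sched_class *sched_class;
        const char *comm;
};

static void fair_enqueue(struct toy_task *p)
{
        printf("%s enqueued on the fair (CFS) runqueue\n", p->comm);
}

static void rt_enqueue(struct toy_task *p)
{
        printf("%s enqueued on the RT runqueue\n", p->comm);
}

static const struct toy_sched_class toy_fair = { "fair", fair_enqueue };
static const struct toy_sched_class toy_rt   = { "rt",   rt_enqueue };

int main(void)
{
        struct toy_task a = { &toy_fair, "editor" };
        struct toy_task b = { &toy_rt,   "irq-thread" };

        /* the core only ever calls through the class pointer */
        a.sched_class->enqueue_task(&a);
        b.sched_class->enqueue_task(&b);
        return 0;
}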
881 | /* | ||
882 | * __normal_prio - return the priority that is based on the static prio | ||
883 | */ | ||
884 | static inline int __normal_prio(struct task_struct *p) | ||
885 | { | ||
886 | return p->static_prio; | ||
841 | } | 887 | } |
842 | 888 | ||
843 | /* | 889 | /* |
@@ -851,7 +897,7 @@ static inline int normal_prio(struct task_struct *p) | |||
851 | { | 897 | { |
852 | int prio; | 898 | int prio; |
853 | 899 | ||
854 | if (has_rt_policy(p)) | 900 | if (task_has_rt_policy(p)) |
855 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 901 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
856 | else | 902 | else |
857 | prio = __normal_prio(p); | 903 | prio = __normal_prio(p); |
@@ -879,222 +925,47 @@ static int effective_prio(struct task_struct *p) | |||
879 | } | 925 | } |
880 | 926 | ||
881 | /* | 927 | /* |
882 | * __activate_task - move a task to the runqueue. | 928 | * activate_task - move a task to the runqueue. |
883 | */ | 929 | */ |
884 | static void __activate_task(struct task_struct *p, struct rq *rq) | 930 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
885 | { | 931 | { |
886 | struct prio_array *target = rq->active; | 932 | u64 now = rq_clock(rq); |
887 | 933 | ||
888 | if (batch_task(p)) | 934 | if (p->state == TASK_UNINTERRUPTIBLE) |
889 | target = rq->expired; | 935 | rq->nr_uninterruptible--; |
890 | enqueue_task(p, target); | ||
891 | inc_nr_running(p, rq); | ||
892 | } | ||
893 | |||
894 | /* | ||
895 | * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
896 | */ | ||
897 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | ||
898 | { | ||
899 | enqueue_task_head(p, rq->active); | ||
900 | inc_nr_running(p, rq); | ||
901 | } | ||
902 | |||
903 | /* | ||
904 | * Recalculate p->normal_prio and p->prio after having slept, | ||
905 | * updating the sleep-average too: | ||
906 | */ | ||
907 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
908 | { | ||
909 | /* Caller must always ensure 'now >= p->timestamp' */ | ||
910 | unsigned long sleep_time = now - p->timestamp; | ||
911 | |||
912 | if (batch_task(p)) | ||
913 | sleep_time = 0; | ||
914 | |||
915 | if (likely(sleep_time > 0)) { | ||
916 | /* | ||
917 | * This ceiling is set to the lowest priority that would allow | ||
918 | * a task to be reinserted into the active array on timeslice | ||
919 | * completion. | ||
920 | */ | ||
921 | unsigned long ceiling = INTERACTIVE_SLEEP(p); | ||
922 | |||
923 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { | ||
924 | /* | ||
925 | * Prevents user tasks from achieving best priority | ||
926 | * with one single large enough sleep. | ||
927 | */ | ||
928 | p->sleep_avg = ceiling; | ||
929 | /* | ||
930 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
931 | * nice(0) task 1ms sleep away from promotion, and | ||
932 | * gives it 700ms to round-robin with no chance of | ||
933 | * being demoted. This is more than generous, so | ||
934 | * mark this sleep as non-interactive to prevent the | ||
935 | * on-runqueue bonus logic from intervening should | ||
936 | * this task not receive cpu immediately. | ||
937 | */ | ||
938 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
939 | } else { | ||
940 | /* | ||
941 | * Tasks waking from uninterruptible sleep are | ||
942 | * limited in their sleep_avg rise as they | ||
943 | * are likely to be waiting on I/O | ||
944 | */ | ||
945 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | ||
946 | if (p->sleep_avg >= ceiling) | ||
947 | sleep_time = 0; | ||
948 | else if (p->sleep_avg + sleep_time >= | ||
949 | ceiling) { | ||
950 | p->sleep_avg = ceiling; | ||
951 | sleep_time = 0; | ||
952 | } | ||
953 | } | ||
954 | |||
955 | /* | ||
956 | * This code gives a bonus to interactive tasks. | ||
957 | * | ||
958 | * The boost works by updating the 'average sleep time' | ||
959 | * value here, based on ->timestamp. The more time a | ||
960 | * task spends sleeping, the higher the average gets - | ||
961 | * and the higher the priority boost gets as well. | ||
962 | */ | ||
963 | p->sleep_avg += sleep_time; | ||
964 | |||
965 | } | ||
966 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
967 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
968 | } | ||
969 | 936 | ||
970 | return effective_prio(p); | 937 | enqueue_task(rq, p, wakeup, now); |
938 | inc_nr_running(p, rq, now); | ||
971 | } | 939 | } |
972 | 940 | ||
973 | /* | 941 | /* |
974 | * activate_task - move a task to the runqueue and do priority recalculation | 942 | * activate_idle_task - move idle task to the _front_ of runqueue. |
975 | * | ||
976 | * Update all the scheduling statistics stuff. (sleep average | ||
977 | * calculation, priority modifiers, etc.) | ||
978 | */ | 943 | */ |
979 | static void activate_task(struct task_struct *p, struct rq *rq, int local) | 944 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) |
980 | { | 945 | { |
981 | unsigned long long now; | 946 | u64 now = rq_clock(rq); |
982 | |||
983 | if (rt_task(p)) | ||
984 | goto out; | ||
985 | 947 | ||
986 | now = sched_clock(); | 948 | if (p->state == TASK_UNINTERRUPTIBLE) |
987 | #ifdef CONFIG_SMP | 949 | rq->nr_uninterruptible--; |
988 | if (!local) { | ||
989 | /* Compensate for drifting sched_clock */ | ||
990 | struct rq *this_rq = this_rq(); | ||
991 | now = (now - this_rq->most_recent_timestamp) | ||
992 | + rq->most_recent_timestamp; | ||
993 | } | ||
994 | #endif | ||
995 | |||
996 | /* | ||
997 | * Sleep time is in units of nanosecs, so shift by 20 to get a | ||
998 | * milliseconds-range estimation of the amount of time that the task | ||
999 | * spent sleeping: | ||
1000 | */ | ||
1001 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
1002 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
1003 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
1004 | (now - p->timestamp) >> 20); | ||
1005 | } | ||
1006 | |||
1007 | p->prio = recalc_task_prio(p, now); | ||
1008 | 950 | ||
1009 | /* | 951 | enqueue_task(rq, p, 0, now); |
1010 | * This checks to make sure it's not an uninterruptible task | 952 | inc_nr_running(p, rq, now); |
1011 | * that is now waking up. | ||
1012 | */ | ||
1013 | if (p->sleep_type == SLEEP_NORMAL) { | ||
1014 | /* | ||
1015 | * Tasks which were woken up by interrupts (ie. hw events) | ||
1016 | * are most likely of interactive nature. So we give them | ||
1017 | * the credit of extending their sleep time to the period | ||
1018 | * of time they spend on the runqueue, waiting for execution | ||
1019 | * on a CPU, first time around: | ||
1020 | */ | ||
1021 | if (in_interrupt()) | ||
1022 | p->sleep_type = SLEEP_INTERRUPTED; | ||
1023 | else { | ||
1024 | /* | ||
1025 | * Normal first-time wakeups get a credit too for | ||
1026 | * on-runqueue time, but it will be weighted down: | ||
1027 | */ | ||
1028 | p->sleep_type = SLEEP_INTERACTIVE; | ||
1029 | } | ||
1030 | } | ||
1031 | p->timestamp = now; | ||
1032 | out: | ||
1033 | __activate_task(p, rq); | ||
1034 | } | 953 | } |
1035 | 954 | ||
1036 | /* | 955 | /* |
1037 | * deactivate_task - remove a task from the runqueue. | 956 | * deactivate_task - remove a task from the runqueue. |
1038 | */ | 957 | */ |
1039 | static void deactivate_task(struct task_struct *p, struct rq *rq) | 958 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
1040 | { | 959 | { |
1041 | dec_nr_running(p, rq); | 960 | u64 now = rq_clock(rq); |
1042 | dequeue_task(p, p->array); | ||
1043 | p->array = NULL; | ||
1044 | } | ||
1045 | 961 | ||
1046 | /* | 962 | if (p->state == TASK_UNINTERRUPTIBLE) |
1047 | * resched_task - mark a task 'to be rescheduled now'. | 963 | rq->nr_uninterruptible++; |
1048 | * | ||
1049 | * On UP this means the setting of the need_resched flag, on SMP it | ||
1050 | * might also involve a cross-CPU call to trigger the scheduler on | ||
1051 | * the target CPU. | ||
1052 | */ | ||
1053 | #ifdef CONFIG_SMP | ||
1054 | 964 | ||
1055 | #ifndef tsk_is_polling | 965 | dequeue_task(rq, p, sleep, now); |
1056 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 966 | dec_nr_running(p, rq, now); |
1057 | #endif | ||
1058 | |||
1059 | static void resched_task(struct task_struct *p) | ||
1060 | { | ||
1061 | int cpu; | ||
1062 | |||
1063 | assert_spin_locked(&task_rq(p)->lock); | ||
1064 | |||
1065 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | ||
1066 | return; | ||
1067 | |||
1068 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | ||
1069 | |||
1070 | cpu = task_cpu(p); | ||
1071 | if (cpu == smp_processor_id()) | ||
1072 | return; | ||
1073 | |||
1074 | /* NEED_RESCHED must be visible before we test polling */ | ||
1075 | smp_mb(); | ||
1076 | if (!tsk_is_polling(p)) | ||
1077 | smp_send_reschedule(cpu); | ||
1078 | } | 967 | } |
1079 | 968 | ||
1080 | static void resched_cpu(int cpu) | ||
1081 | { | ||
1082 | struct rq *rq = cpu_rq(cpu); | ||
1083 | unsigned long flags; | ||
1084 | |||
1085 | if (!spin_trylock_irqsave(&rq->lock, flags)) | ||
1086 | return; | ||
1087 | resched_task(cpu_curr(cpu)); | ||
1088 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1089 | } | ||
1090 | #else | ||
1091 | static inline void resched_task(struct task_struct *p) | ||
1092 | { | ||
1093 | assert_spin_locked(&task_rq(p)->lock); | ||
1094 | set_tsk_need_resched(p); | ||
1095 | } | ||
1096 | #endif | ||
1097 | |||
1098 | /** | 969 | /** |
1099 | * task_curr - is this task currently executing on a CPU? | 970 | * task_curr - is this task currently executing on a CPU? |
1100 | * @p: the task in question. | 971 | * @p: the task in question. |
@@ -1107,10 +978,42 @@ inline int task_curr(const struct task_struct *p) | |||
1107 | /* Used instead of source_load when we know the type == 0 */ | 978 | /* Used instead of source_load when we know the type == 0 */ |
1108 | unsigned long weighted_cpuload(const int cpu) | 979 | unsigned long weighted_cpuload(const int cpu) |
1109 | { | 980 | { |
1110 | return cpu_rq(cpu)->raw_weighted_load; | 981 | return cpu_rq(cpu)->ls.load.weight; |
1111 | } | 982 | } |
1112 | 983 | ||
984 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
985 | { | ||
1113 | #ifdef CONFIG_SMP | 986 | #ifdef CONFIG_SMP |
987 | task_thread_info(p)->cpu = cpu; | ||
988 | set_task_cfs_rq(p); | ||
989 | #endif | ||
990 | } | ||
991 | |||
992 | #ifdef CONFIG_SMP | ||
993 | |||
994 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | ||
995 | { | ||
996 | int old_cpu = task_cpu(p); | ||
997 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | ||
998 | u64 clock_offset, fair_clock_offset; | ||
999 | |||
1000 | clock_offset = old_rq->clock - new_rq->clock; | ||
1001 | fair_clock_offset = old_rq->cfs.fair_clock - | ||
1002 | new_rq->cfs.fair_clock; | ||
1003 | if (p->se.wait_start) | ||
1004 | p->se.wait_start -= clock_offset; | ||
1005 | if (p->se.wait_start_fair) | ||
1006 | p->se.wait_start_fair -= fair_clock_offset; | ||
1007 | if (p->se.sleep_start) | ||
1008 | p->se.sleep_start -= clock_offset; | ||
1009 | if (p->se.block_start) | ||
1010 | p->se.block_start -= clock_offset; | ||
1011 | if (p->se.sleep_start_fair) | ||
1012 | p->se.sleep_start_fair -= fair_clock_offset; | ||
1013 | |||
1014 | __set_task_cpu(p, new_cpu); | ||
1015 | } | ||
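set_task_cpu() above shifts every per-entity timestamp by the offset between the old and new runqueue clocks, so an interval that started on one CPU still measures correctly on the other. A stand-alone sketch with made-up clock values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t old_rq_clock = 5000000, new_rq_clock = 3000000;
        uint64_t wait_start   = 4000000;        /* stamped on the old rq */
        int64_t clock_offset  = (int64_t)(old_rq_clock - new_rq_clock);

        /* waited 1ms so far, measured against the old runqueue clock */
        printf("wait so far (old rq): %llu ns\n",
               (unsigned long long)(old_rq_clock - wait_start));

        wait_start -= clock_offset;             /* the adjustment made above */
        printf("wait so far (new rq): %llu ns\n",
               (unsigned long long)(new_rq_clock - wait_start));
        return 0;
}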
1016 | |||
1114 | struct migration_req { | 1017 | struct migration_req { |
1115 | struct list_head list; | 1018 | struct list_head list; |
1116 | 1019 | ||
@@ -1133,7 +1036,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
1133 | * If the task is not on a runqueue (and not running), then | 1036 | * If the task is not on a runqueue (and not running), then |
1134 | * it is sufficient to simply update the task's cpu field. | 1037 | * it is sufficient to simply update the task's cpu field. |
1135 | */ | 1038 | */ |
1136 | if (!p->array && !task_running(rq, p)) { | 1039 | if (!p->se.on_rq && !task_running(rq, p)) { |
1137 | set_task_cpu(p, dest_cpu); | 1040 | set_task_cpu(p, dest_cpu); |
1138 | return 0; | 1041 | return 0; |
1139 | } | 1042 | } |
@@ -1158,9 +1061,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
1158 | void wait_task_inactive(struct task_struct *p) | 1061 | void wait_task_inactive(struct task_struct *p) |
1159 | { | 1062 | { |
1160 | unsigned long flags; | 1063 | unsigned long flags; |
1064 | int running, on_rq; | ||
1161 | struct rq *rq; | 1065 | struct rq *rq; |
1162 | struct prio_array *array; | ||
1163 | int running; | ||
1164 | 1066 | ||
1165 | repeat: | 1067 | repeat: |
1166 | /* | 1068 | /* |
@@ -1192,7 +1094,7 @@ repeat: | |||
1192 | */ | 1094 | */ |
1193 | rq = task_rq_lock(p, &flags); | 1095 | rq = task_rq_lock(p, &flags); |
1194 | running = task_running(rq, p); | 1096 | running = task_running(rq, p); |
1195 | array = p->array; | 1097 | on_rq = p->se.on_rq; |
1196 | task_rq_unlock(rq, &flags); | 1098 | task_rq_unlock(rq, &flags); |
1197 | 1099 | ||
1198 | /* | 1100 | /* |
@@ -1215,7 +1117,7 @@ repeat: | |||
1215 | * running right now), it's preempted, and we should | 1117 | * running right now), it's preempted, and we should |
1216 | * yield - it could be a while. | 1118 | * yield - it could be a while. |
1217 | */ | 1119 | */ |
1218 | if (unlikely(array)) { | 1120 | if (unlikely(on_rq)) { |
1219 | yield(); | 1121 | yield(); |
1220 | goto repeat; | 1122 | goto repeat; |
1221 | } | 1123 | } |
@@ -1261,11 +1163,12 @@ void kick_process(struct task_struct *p) | |||
1261 | static inline unsigned long source_load(int cpu, int type) | 1163 | static inline unsigned long source_load(int cpu, int type) |
1262 | { | 1164 | { |
1263 | struct rq *rq = cpu_rq(cpu); | 1165 | struct rq *rq = cpu_rq(cpu); |
1166 | unsigned long total = weighted_cpuload(cpu); | ||
1264 | 1167 | ||
1265 | if (type == 0) | 1168 | if (type == 0) |
1266 | return rq->raw_weighted_load; | 1169 | return total; |
1267 | 1170 | ||
1268 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); | 1171 | return min(rq->cpu_load[type-1], total); |
1269 | } | 1172 | } |
1270 | 1173 | ||
1271 | /* | 1174 | /* |
@@ -1275,11 +1178,12 @@ static inline unsigned long source_load(int cpu, int type) | |||
1275 | static inline unsigned long target_load(int cpu, int type) | 1178 | static inline unsigned long target_load(int cpu, int type) |
1276 | { | 1179 | { |
1277 | struct rq *rq = cpu_rq(cpu); | 1180 | struct rq *rq = cpu_rq(cpu); |
1181 | unsigned long total = weighted_cpuload(cpu); | ||
1278 | 1182 | ||
1279 | if (type == 0) | 1183 | if (type == 0) |
1280 | return rq->raw_weighted_load; | 1184 | return total; |
1281 | 1185 | ||
1282 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | 1186 | return max(rq->cpu_load[type-1], total); |
1283 | } | 1187 | } |
1284 | 1188 | ||
1285 | /* | 1189 | /* |
@@ -1288,9 +1192,10 @@ static inline unsigned long target_load(int cpu, int type) | |||
1288 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1192 | static inline unsigned long cpu_avg_load_per_task(int cpu) |
1289 | { | 1193 | { |
1290 | struct rq *rq = cpu_rq(cpu); | 1194 | struct rq *rq = cpu_rq(cpu); |
1195 | unsigned long total = weighted_cpuload(cpu); | ||
1291 | unsigned long n = rq->nr_running; | 1196 | unsigned long n = rq->nr_running; |
1292 | 1197 | ||
1293 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | 1198 | return n ? total / n : SCHED_LOAD_SCALE; |
1294 | } | 1199 | } |
1295 | 1200 | ||
1296 | /* | 1201 | /* |
@@ -1392,9 +1297,9 @@ static int sched_balance_self(int cpu, int flag) | |||
1392 | struct sched_domain *tmp, *sd = NULL; | 1297 | struct sched_domain *tmp, *sd = NULL; |
1393 | 1298 | ||
1394 | for_each_domain(cpu, tmp) { | 1299 | for_each_domain(cpu, tmp) { |
1395 | /* | 1300 | /* |
1396 | * If power savings logic is enabled for a domain, stop there. | 1301 | * If power savings logic is enabled for a domain, stop there. |
1397 | */ | 1302 | */ |
1398 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1303 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1399 | break; | 1304 | break; |
1400 | if (tmp->flags & flag) | 1305 | if (tmp->flags & flag) |
@@ -1477,9 +1382,9 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1477 | if (idle_cpu(i)) | 1382 | if (idle_cpu(i)) |
1478 | return i; | 1383 | return i; |
1479 | } | 1384 | } |
1480 | } | 1385 | } else { |
1481 | else | ||
1482 | break; | 1386 | break; |
1387 | } | ||
1483 | } | 1388 | } |
1484 | return cpu; | 1389 | return cpu; |
1485 | } | 1390 | } |
@@ -1521,7 +1426,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1521 | if (!(old_state & state)) | 1426 | if (!(old_state & state)) |
1522 | goto out; | 1427 | goto out; |
1523 | 1428 | ||
1524 | if (p->array) | 1429 | if (p->se.on_rq) |
1525 | goto out_running; | 1430 | goto out_running; |
1526 | 1431 | ||
1527 | cpu = task_cpu(p); | 1432 | cpu = task_cpu(p); |
@@ -1576,11 +1481,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1576 | * of the current CPU: | 1481 | * of the current CPU: |
1577 | */ | 1482 | */ |
1578 | if (sync) | 1483 | if (sync) |
1579 | tl -= current->load_weight; | 1484 | tl -= current->se.load.weight; |
1580 | 1485 | ||
1581 | if ((tl <= load && | 1486 | if ((tl <= load && |
1582 | tl + target_load(cpu, idx) <= tl_per_task) || | 1487 | tl + target_load(cpu, idx) <= tl_per_task) || |
1583 | 100*(tl + p->load_weight) <= imbalance*load) { | 1488 | 100*(tl + p->se.load.weight) <= imbalance*load) { |
1584 | /* | 1489 | /* |
1585 | * This domain has SD_WAKE_AFFINE and | 1490 | * This domain has SD_WAKE_AFFINE and |
1586 | * p is cache cold in this domain, and | 1491 | * p is cache cold in this domain, and |
@@ -1614,7 +1519,7 @@ out_set_cpu: | |||
1614 | old_state = p->state; | 1519 | old_state = p->state; |
1615 | if (!(old_state & state)) | 1520 | if (!(old_state & state)) |
1616 | goto out; | 1521 | goto out; |
1617 | if (p->array) | 1522 | if (p->se.on_rq) |
1618 | goto out_running; | 1523 | goto out_running; |
1619 | 1524 | ||
1620 | this_cpu = smp_processor_id(); | 1525 | this_cpu = smp_processor_id(); |
@@ -1623,25 +1528,7 @@ out_set_cpu: | |||
1623 | 1528 | ||
1624 | out_activate: | 1529 | out_activate: |
1625 | #endif /* CONFIG_SMP */ | 1530 | #endif /* CONFIG_SMP */ |
1626 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1531 | activate_task(rq, p, 1); |
1627 | rq->nr_uninterruptible--; | ||
1628 | /* | ||
1629 | * Tasks on involuntary sleep don't earn | ||
1630 | * sleep_avg beyond just interactive state. | ||
1631 | */ | ||
1632 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
1633 | } else | ||
1634 | |||
1635 | /* | ||
1636 | * Tasks that have marked their sleep as noninteractive get | ||
1637 | * woken up with their sleep average not weighted in an | ||
1638 | * interactive way. | ||
1639 | */ | ||
1640 | if (old_state & TASK_NONINTERACTIVE) | ||
1641 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
1642 | |||
1643 | |||
1644 | activate_task(p, rq, cpu == this_cpu); | ||
1645 | /* | 1532 | /* |
1646 | * Sync wakeups (i.e. those types of wakeups where the waker | 1533 | * Sync wakeups (i.e. those types of wakeups where the waker |
1647 | * has indicated that it will leave the CPU in short order) | 1534 | * has indicated that it will leave the CPU in short order) |
@@ -1650,10 +1537,8 @@ out_activate: | |||
1650 | * the waker guarantees that the freshly woken up task is going | 1537 | * the waker guarantees that the freshly woken up task is going |
1651 | * to be considered on this CPU.) | 1538 | * to be considered on this CPU.) |
1652 | */ | 1539 | */ |
1653 | if (!sync || cpu != this_cpu) { | 1540 | if (!sync || cpu != this_cpu) |
1654 | if (TASK_PREEMPTS_CURR(p, rq)) | 1541 | check_preempt_curr(rq, p); |
1655 | resched_task(rq->curr); | ||
1656 | } | ||
1657 | success = 1; | 1542 | success = 1; |
1658 | 1543 | ||
1659 | out_running: | 1544 | out_running: |
@@ -1676,19 +1561,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
1676 | return try_to_wake_up(p, state, 0); | 1561 | return try_to_wake_up(p, state, 0); |
1677 | } | 1562 | } |
1678 | 1563 | ||
1679 | static void task_running_tick(struct rq *rq, struct task_struct *p); | ||
1680 | /* | 1564 | /* |
1681 | * Perform scheduler related setup for a newly forked process p. | 1565 | * Perform scheduler related setup for a newly forked process p. |
1682 | * p is forked by current. | 1566 | * p is forked by current. |
1683 | */ | 1567 | * |
1684 | void fastcall sched_fork(struct task_struct *p, int clone_flags) | 1568 | * __sched_fork() is basic setup used by init_idle() too: |
1685 | { | 1569 | */ |
1686 | int cpu = get_cpu(); | 1570 | static void __sched_fork(struct task_struct *p) |
1571 | { | ||
1572 | p->se.wait_start_fair = 0; | ||
1573 | p->se.wait_start = 0; | ||
1574 | p->se.exec_start = 0; | ||
1575 | p->se.sum_exec_runtime = 0; | ||
1576 | p->se.delta_exec = 0; | ||
1577 | p->se.delta_fair_run = 0; | ||
1578 | p->se.delta_fair_sleep = 0; | ||
1579 | p->se.wait_runtime = 0; | ||
1580 | p->se.sum_wait_runtime = 0; | ||
1581 | p->se.sum_sleep_runtime = 0; | ||
1582 | p->se.sleep_start = 0; | ||
1583 | p->se.sleep_start_fair = 0; | ||
1584 | p->se.block_start = 0; | ||
1585 | p->se.sleep_max = 0; | ||
1586 | p->se.block_max = 0; | ||
1587 | p->se.exec_max = 0; | ||
1588 | p->se.wait_max = 0; | ||
1589 | p->se.wait_runtime_overruns = 0; | ||
1590 | p->se.wait_runtime_underruns = 0; | ||
1687 | 1591 | ||
1688 | #ifdef CONFIG_SMP | 1592 | INIT_LIST_HEAD(&p->run_list); |
1689 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1593 | p->se.on_rq = 0; |
1690 | #endif | ||
1691 | set_task_cpu(p, cpu); | ||
1692 | 1594 | ||
1693 | /* | 1595 | /* |
1694 | * We mark the process as running here, but have not actually | 1596 | * We mark the process as running here, but have not actually |
@@ -1697,16 +1599,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1697 | * event cannot wake it up and insert it on the runqueue either. | 1599 | * event cannot wake it up and insert it on the runqueue either. |
1698 | */ | 1600 | */ |
1699 | p->state = TASK_RUNNING; | 1601 | p->state = TASK_RUNNING; |
1602 | } | ||
1603 | |||
1604 | /* | ||
1605 | * fork()/clone()-time setup: | ||
1606 | */ | ||
1607 | void sched_fork(struct task_struct *p, int clone_flags) | ||
1608 | { | ||
1609 | int cpu = get_cpu(); | ||
1610 | |||
1611 | __sched_fork(p); | ||
1612 | |||
1613 | #ifdef CONFIG_SMP | ||
1614 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
1615 | #endif | ||
1616 | __set_task_cpu(p, cpu); | ||
1700 | 1617 | ||
1701 | /* | 1618 | /* |
1702 | * Make sure we do not leak PI boosting priority to the child: | 1619 | * Make sure we do not leak PI boosting priority to the child: |
1703 | */ | 1620 | */ |
1704 | p->prio = current->normal_prio; | 1621 | p->prio = current->normal_prio; |
1705 | 1622 | ||
1706 | INIT_LIST_HEAD(&p->run_list); | ||
1707 | p->array = NULL; | ||
1708 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1623 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1709 | if (unlikely(sched_info_on())) | 1624 | if (likely(sched_info_on())) |
1710 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1625 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1711 | #endif | 1626 | #endif |
1712 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1627 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
@@ -1716,34 +1631,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1716 | /* Want to start with kernel preemption disabled. */ | 1631 | /* Want to start with kernel preemption disabled. */ |
1717 | task_thread_info(p)->preempt_count = 1; | 1632 | task_thread_info(p)->preempt_count = 1; |
1718 | #endif | 1633 | #endif |
1719 | /* | ||
1720 | * Share the timeslice between parent and child, thus the | ||
1721 | * total amount of pending timeslices in the system doesn't change, | ||
1722 | * resulting in more scheduling fairness. | ||
1723 | */ | ||
1724 | local_irq_disable(); | ||
1725 | p->time_slice = (current->time_slice + 1) >> 1; | ||
1726 | /* | ||
1727 | * The remainder of the first timeslice might be recovered by | ||
1728 | * the parent if the child exits early enough. | ||
1729 | */ | ||
1730 | p->first_time_slice = 1; | ||
1731 | current->time_slice >>= 1; | ||
1732 | p->timestamp = sched_clock(); | ||
1733 | if (unlikely(!current->time_slice)) { | ||
1734 | /* | ||
1735 | * This case is rare, it happens when the parent has only | ||
1736 | * a single jiffy left from its timeslice. Taking the | ||
1737 | * runqueue lock is not a problem. | ||
1738 | */ | ||
1739 | current->time_slice = 1; | ||
1740 | task_running_tick(cpu_rq(cpu), current); | ||
1741 | } | ||
1742 | local_irq_enable(); | ||
1743 | put_cpu(); | 1634 | put_cpu(); |
1744 | } | 1635 | } |
1745 | 1636 | ||
1746 | /* | 1637 | /* |
1638 | * After fork, child runs first. (default) If set to 0 then | ||
1639 | * parent will (try to) run first. | ||
1640 | */ | ||
1641 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
1642 | |||
1643 | /* | ||
1747 | * wake_up_new_task - wake up a newly created task for the first time. | 1644 | * wake_up_new_task - wake up a newly created task for the first time. |
1748 | * | 1645 | * |
1749 | * This function will do some initial scheduler statistics housekeeping | 1646 | * This function will do some initial scheduler statistics housekeeping |
@@ -1752,107 +1649,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1752 | */ | 1649 | */ |
1753 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1650 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1754 | { | 1651 | { |
1755 | struct rq *rq, *this_rq; | ||
1756 | unsigned long flags; | 1652 | unsigned long flags; |
1757 | int this_cpu, cpu; | 1653 | struct rq *rq; |
1654 | int this_cpu; | ||
1758 | 1655 | ||
1759 | rq = task_rq_lock(p, &flags); | 1656 | rq = task_rq_lock(p, &flags); |
1760 | BUG_ON(p->state != TASK_RUNNING); | 1657 | BUG_ON(p->state != TASK_RUNNING); |
1761 | this_cpu = smp_processor_id(); | 1658 | this_cpu = smp_processor_id(); /* parent's CPU */ |
1762 | cpu = task_cpu(p); | ||
1763 | |||
1764 | /* | ||
1765 | * We decrease the sleep average of forking parents | ||
1766 | * and children as well, to keep max-interactive tasks | ||
1767 | * from forking tasks that are max-interactive. The parent | ||
1768 | * (current) is done further down, under its lock. | ||
1769 | */ | ||
1770 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
1771 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1772 | 1659 | ||
1773 | p->prio = effective_prio(p); | 1660 | p->prio = effective_prio(p); |
1774 | 1661 | ||
1775 | if (likely(cpu == this_cpu)) { | 1662 | if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || |
1776 | if (!(clone_flags & CLONE_VM)) { | 1663 | task_cpu(p) != this_cpu || !current->se.on_rq) { |
1777 | /* | 1664 | activate_task(rq, p, 0); |
1778 | * The VM isn't cloned, so we're in a good position to | ||
1779 | * do child-runs-first in anticipation of an exec. This | ||
1780 | * usually avoids a lot of COW overhead. | ||
1781 | */ | ||
1782 | if (unlikely(!current->array)) | ||
1783 | __activate_task(p, rq); | ||
1784 | else { | ||
1785 | p->prio = current->prio; | ||
1786 | p->normal_prio = current->normal_prio; | ||
1787 | list_add_tail(&p->run_list, ¤t->run_list); | ||
1788 | p->array = current->array; | ||
1789 | p->array->nr_active++; | ||
1790 | inc_nr_running(p, rq); | ||
1791 | } | ||
1792 | set_need_resched(); | ||
1793 | } else | ||
1794 | /* Run child last */ | ||
1795 | __activate_task(p, rq); | ||
1796 | /* | ||
1797 | * We skip the following code due to cpu == this_cpu | ||
1798 | * | ||
1799 | * task_rq_unlock(rq, &flags); | ||
1800 | * this_rq = task_rq_lock(current, &flags); | ||
1801 | */ | ||
1802 | this_rq = rq; | ||
1803 | } else { | 1665 | } else { |
1804 | this_rq = cpu_rq(this_cpu); | ||
1805 | |||
1806 | /* | ||
1807 | * Not the local CPU - must adjust timestamp. This should | ||
1808 | * get optimised away in the !CONFIG_SMP case. | ||
1809 | */ | ||
1810 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) | ||
1811 | + rq->most_recent_timestamp; | ||
1812 | __activate_task(p, rq); | ||
1813 | if (TASK_PREEMPTS_CURR(p, rq)) | ||
1814 | resched_task(rq->curr); | ||
1815 | |||
1816 | /* | 1666 | /* |
1817 | * Parent and child are on different CPUs, now get the | 1667 | * Let the scheduling class do new task startup |
1818 | * parent runqueue to update the parent's ->sleep_avg: | 1668 | * management (if any): |
1819 | */ | 1669 | */ |
1820 | task_rq_unlock(rq, &flags); | 1670 | p->sched_class->task_new(rq, p); |
1821 | this_rq = task_rq_lock(current, &flags); | ||
1822 | } | ||
1823 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | ||
1824 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1825 | task_rq_unlock(this_rq, &flags); | ||
1826 | } | ||
1827 | |||
1828 | /* | ||
1829 | * Potentially available exiting-child timeslices are | ||
1830 | * retrieved here - this way the parent does not get | ||
1831 | * penalized for creating too many threads. | ||
1832 | * | ||
1833 | * (this cannot be used to 'generate' timeslices | ||
1834 | * artificially, because any timeslice recovered here | ||
1835 | * was given away by the parent in the first place.) | ||
1836 | */ | ||
1837 | void fastcall sched_exit(struct task_struct *p) | ||
1838 | { | ||
1839 | unsigned long flags; | ||
1840 | struct rq *rq; | ||
1841 | |||
1842 | /* | ||
1843 | * If the child was a (relative-) CPU hog then decrease | ||
1844 | * the sleep_avg of the parent as well. | ||
1845 | */ | ||
1846 | rq = task_rq_lock(p->parent, &flags); | ||
1847 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | ||
1848 | p->parent->time_slice += p->time_slice; | ||
1849 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | ||
1850 | p->parent->time_slice = task_timeslice(p); | ||
1851 | } | 1671 | } |
1852 | if (p->sleep_avg < p->parent->sleep_avg) | 1672 | check_preempt_curr(rq, p); |
1853 | p->parent->sleep_avg = p->parent->sleep_avg / | ||
1854 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | ||
1855 | (EXIT_WEIGHT + 1); | ||
1856 | task_rq_unlock(rq, &flags); | 1673 | task_rq_unlock(rq, &flags); |
1857 | } | 1674 | } |
1858 | 1675 | ||
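The rewritten wake_up_new_task() above boils the old child-runs-first heuristics down to a single condition: the child is handed to its class's task_new() hook (which may queue it ahead of the parent) only when sysctl_sched_child_runs_first is set, the child does not share the parent's VM, it ended up on the parent's CPU, and the parent itself is on the runqueue; otherwise it is activated normally. A small standalone predicate capturing that decision; the flag value and names below are illustrative stand-ins, not the kernel definitions:

#include <stdio.h>

#define CLONE_VM_FLAG 0x100 /* stand-in for the real CLONE_VM bit */

/*
 * Mirror of the condition in wake_up_new_task(): return 1 when the
 * child should get the special "child runs first" treatment.
 */
static int child_runs_first(int sysctl_child_first, unsigned long clone_flags,
                            int child_cpu, int parent_cpu, int parent_on_rq)
{
        if (!sysctl_child_first)
                return 0;               /* policy disabled */
        if (clone_flags & CLONE_VM_FLAG)
                return 0;               /* VM shared: no exec/COW win expected */
        if (child_cpu != parent_cpu)
                return 0;               /* child was balanced elsewhere */
        if (!parent_on_rq)
                return 0;               /* parent not queued, nothing to beat */
        return 1;
}

int main(void)
{
        printf("%d\n", child_runs_first(1, 0, 2, 2, 1));               /* 1 */
        printf("%d\n", child_runs_first(1, CLONE_VM_FLAG, 2, 2, 1));   /* 0 */
        return 0;
}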
@@ -1917,7 +1734,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1917 | /* | 1734 | /* |
1918 | * Remove function-return probe instances associated with this | 1735 | * Remove function-return probe instances associated with this |
1919 | * task and put them back on the free list. | 1736 | * task and put them back on the free list. |
1920 | */ | 1737 | */ |
1921 | kprobe_flush_task(prev); | 1738 | kprobe_flush_task(prev); |
1922 | put_task_struct(prev); | 1739 | put_task_struct(prev); |
1923 | } | 1740 | } |
@@ -1945,13 +1762,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
1945 | * context_switch - switch to the new MM and the new | 1762 | * context_switch - switch to the new MM and the new |
1946 | * thread's register state. | 1763 | * thread's register state. |
1947 | */ | 1764 | */ |
1948 | static inline struct task_struct * | 1765 | static inline void |
1949 | context_switch(struct rq *rq, struct task_struct *prev, | 1766 | context_switch(struct rq *rq, struct task_struct *prev, |
1950 | struct task_struct *next) | 1767 | struct task_struct *next) |
1951 | { | 1768 | { |
1952 | struct mm_struct *mm = next->mm; | 1769 | struct mm_struct *mm, *oldmm; |
1953 | struct mm_struct *oldmm = prev->active_mm; | ||
1954 | 1770 | ||
1771 | prepare_task_switch(rq, next); | ||
1772 | mm = next->mm; | ||
1773 | oldmm = prev->active_mm; | ||
1955 | /* | 1774 | /* |
1956 | * For paravirt, this is coupled with an exit in switch_to to | 1775 | * For paravirt, this is coupled with an exit in switch_to to |
1957 | * combine the page table reload and the switch backend into | 1776 | * combine the page table reload and the switch backend into |
@@ -1959,16 +1778,15 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1959 | */ | 1778 | */ |
1960 | arch_enter_lazy_cpu_mode(); | 1779 | arch_enter_lazy_cpu_mode(); |
1961 | 1780 | ||
1962 | if (!mm) { | 1781 | if (unlikely(!mm)) { |
1963 | next->active_mm = oldmm; | 1782 | next->active_mm = oldmm; |
1964 | atomic_inc(&oldmm->mm_count); | 1783 | atomic_inc(&oldmm->mm_count); |
1965 | enter_lazy_tlb(oldmm, next); | 1784 | enter_lazy_tlb(oldmm, next); |
1966 | } else | 1785 | } else |
1967 | switch_mm(oldmm, mm, next); | 1786 | switch_mm(oldmm, mm, next); |
1968 | 1787 | ||
1969 | if (!prev->mm) { | 1788 | if (unlikely(!prev->mm)) { |
1970 | prev->active_mm = NULL; | 1789 | prev->active_mm = NULL; |
1971 | WARN_ON(rq->prev_mm); | ||
1972 | rq->prev_mm = oldmm; | 1790 | rq->prev_mm = oldmm; |
1973 | } | 1791 | } |
1974 | /* | 1792 | /* |
@@ -1984,7 +1802,13 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1984 | /* Here we just switch the register state and the stack. */ | 1802 | /* Here we just switch the register state and the stack. */ |
1985 | switch_to(prev, next, prev); | 1803 | switch_to(prev, next, prev); |
1986 | 1804 | ||
1987 | return prev; | 1805 | barrier(); |
1806 | /* | ||
1807 | * this_rq must be evaluated again because prev may have moved | ||
1808 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
1809 | * frame will be invalid. | ||
1810 | */ | ||
1811 | finish_task_switch(this_rq(), prev); | ||
1988 | } | 1812 | } |
1989 | 1813 | ||
1990 | /* | 1814 | /* |
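context_switch() now calls prepare_task_switch() itself and ends with finish_task_switch() on the re-evaluated runqueue, but the mm handling keeps its shape: a kernel thread (next->mm == NULL) borrows the previous task's active_mm and takes a reference, while a task with its own mm gets a real switch_mm(), and a borrowing prev has its reference dropped later via rq->prev_mm. A userspace sketch of that borrow-or-switch bookkeeping, with a plain counter in place of mm_count and the deferred drop folded in directly; all names here are illustrative:

#include <stdio.h>
#include <stddef.h>

struct toy_mm { int refcount; };

struct toy_task {
        struct toy_mm *mm;          /* NULL for "kernel threads" */
        struct toy_mm *active_mm;   /* mm actually in use while running */
};

/* Model of the mm handover performed in context_switch(). */
static void switch_mm_model(struct toy_task *prev, struct toy_task *next)
{
        struct toy_mm *oldmm = prev->active_mm;

        if (!next->mm) {
                /* kernel thread: borrow the old address space */
                next->active_mm = oldmm;
                oldmm->refcount++;
        } else {
                /* real task: its own mm becomes the active one */
                next->active_mm = next->mm;
        }

        if (!prev->mm) {
                /* prev was a borrower: drop the borrowed reference
                 * (stands in for the deferred mmdrop via rq->prev_mm) */
                prev->active_mm = NULL;
                oldmm->refcount--;
        }
}

int main(void)
{
        struct toy_mm user_mm = { .refcount = 1 };
        struct toy_task user = { .mm = &user_mm, .active_mm = &user_mm };
        struct toy_task kthread = { .mm = NULL, .active_mm = NULL };

        switch_mm_model(&user, &kthread);   /* kthread borrows user_mm */
        switch_mm_model(&kthread, &user);   /* borrowed reference dropped */
        printf("refcount = %d\n", user_mm.refcount); /* 1 */
        return 0;
}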
@@ -2057,17 +1881,65 @@ unsigned long nr_active(void) | |||
2057 | return running + uninterruptible; | 1881 | return running + uninterruptible; |
2058 | } | 1882 | } |
2059 | 1883 | ||
2060 | #ifdef CONFIG_SMP | ||
2061 | |||
2062 | /* | 1884 | /* |
2063 | * Is this task likely cache-hot: | 1885 | * Update rq->cpu_load[] statistics. This function is usually called every |
1886 | * scheduler tick (TICK_NSEC). | ||
2064 | */ | 1887 | */ |
2065 | static inline int | 1888 | static void update_cpu_load(struct rq *this_rq) |
2066 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
2067 | { | 1889 | { |
2068 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | 1890 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; |
1891 | unsigned long total_load = this_rq->ls.load.weight; | ||
1892 | unsigned long this_load = total_load; | ||
1893 | struct load_stat *ls = &this_rq->ls; | ||
1894 | u64 now = __rq_clock(this_rq); | ||
1895 | int i, scale; | ||
1896 | |||
1897 | this_rq->nr_load_updates++; | ||
1898 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
1899 | goto do_avg; | ||
1900 | |||
1901 | /* Update delta_fair/delta_exec fields first */ | ||
1902 | update_curr_load(this_rq, now); | ||
1903 | |||
1904 | fair_delta64 = ls->delta_fair + 1; | ||
1905 | ls->delta_fair = 0; | ||
1906 | |||
1907 | exec_delta64 = ls->delta_exec + 1; | ||
1908 | ls->delta_exec = 0; | ||
1909 | |||
1910 | sample_interval64 = now - ls->load_update_last; | ||
1911 | ls->load_update_last = now; | ||
1912 | |||
1913 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
1914 | sample_interval64 = TICK_NSEC; | ||
1915 | |||
1916 | if (exec_delta64 > sample_interval64) | ||
1917 | exec_delta64 = sample_interval64; | ||
1918 | |||
1919 | idle_delta64 = sample_interval64 - exec_delta64; | ||
1920 | |||
1921 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
1922 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
1923 | |||
1924 | this_load = (unsigned long)tmp64; | ||
1925 | |||
1926 | do_avg: | ||
1927 | |||
1928 | /* Update our load: */ | ||
1929 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
1930 | unsigned long old_load, new_load; | ||
1931 | |||
1932 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
1933 | |||
1934 | old_load = this_rq->cpu_load[i]; | ||
1935 | new_load = this_load; | ||
1936 | |||
1937 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
1938 | } | ||
2069 | } | 1939 | } |
2070 | 1940 | ||
1941 | #ifdef CONFIG_SMP | ||
1942 | |||
2071 | /* | 1943 | /* |
2072 | * double_rq_lock - safely lock two runqueues | 1944 | * double_rq_lock - safely lock two runqueues |
2073 | * | 1945 | * |
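The per-index update at the bottom of update_cpu_load() is a simple exponentially weighted average: with scale == 1 << i, cpu_load[i] = (old*(scale-1) + new) >> i, so higher indices give a new sample a smaller weight (1/1, 1/2, 1/4, ...) and react more slowly. A standalone demonstration of how the indices converge after a load step, assuming CPU_LOAD_IDX_MAX is 5 as in the mainline version of this patch; everything else below is just a toy driver:

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* One tick worth of the cpu_load[] update from update_cpu_load(). */
static void update_cpu_load_model(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
                                  unsigned long this_load)
{
        unsigned long scale;
        int i;

        for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                /* scale is 1 << i, so >> i divides by scale */
                cpu_load[i] = (cpu_load[i] * (scale - 1) + this_load) >> i;
        }
}

int main(void)
{
        unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
        int tick;

        /* load jumps from 0 to 1024 (roughly one nice-0 task) and stays there */
        for (tick = 1; tick <= 8; tick++) {
                update_cpu_load_model(cpu_load, 1024);
                printf("tick %d: %4lu %4lu %4lu %4lu %4lu\n", tick,
                       cpu_load[0], cpu_load[1], cpu_load[2],
                       cpu_load[3], cpu_load[4]);
        }
        return 0;
}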
@@ -2184,23 +2056,17 @@ void sched_exec(void) | |||
2184 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2056 | * pull_task - move a task from a remote runqueue to the local runqueue. |
2185 | * Both runqueues must be locked. | 2057 | * Both runqueues must be locked. |
2186 | */ | 2058 | */ |
2187 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, | 2059 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
2188 | struct task_struct *p, struct rq *this_rq, | 2060 | struct rq *this_rq, int this_cpu) |
2189 | struct prio_array *this_array, int this_cpu) | ||
2190 | { | 2061 | { |
2191 | dequeue_task(p, src_array); | 2062 | deactivate_task(src_rq, p, 0); |
2192 | dec_nr_running(p, src_rq); | ||
2193 | set_task_cpu(p, this_cpu); | 2063 | set_task_cpu(p, this_cpu); |
2194 | inc_nr_running(p, this_rq); | 2064 | activate_task(this_rq, p, 0); |
2195 | enqueue_task(p, this_array); | ||
2196 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) | ||
2197 | + this_rq->most_recent_timestamp; | ||
2198 | /* | 2065 | /* |
2199 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2066 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2200 | * to be always true for them. | 2067 | * to be always true for them. |
2201 | */ | 2068 | */ |
2202 | if (TASK_PREEMPTS_CURR(p, this_rq)) | 2069 | check_preempt_curr(this_rq, p); |
2203 | resched_task(this_rq->curr); | ||
2204 | } | 2070 | } |
2205 | 2071 | ||
2206 | /* | 2072 | /* |
@@ -2208,7 +2074,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, | |||
2208 | */ | 2074 | */ |
2209 | static | 2075 | static |
2210 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 2076 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2211 | struct sched_domain *sd, enum idle_type idle, | 2077 | struct sched_domain *sd, enum cpu_idle_type idle, |
2212 | int *all_pinned) | 2078 | int *all_pinned) |
2213 | { | 2079 | { |
2214 | /* | 2080 | /* |
@@ -2225,132 +2091,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2225 | return 0; | 2091 | return 0; |
2226 | 2092 | ||
2227 | /* | 2093 | /* |
2228 | * Aggressive migration if: | 2094 | * Aggressive migration if too many balance attempts have failed: |
2229 | * 1) task is cache cold, or | ||
2230 | * 2) too many balance attempts have failed. | ||
2231 | */ | 2095 | */ |
2232 | 2096 | if (sd->nr_balance_failed > sd->cache_nice_tries) | |
2233 | if (sd->nr_balance_failed > sd->cache_nice_tries) { | ||
2234 | #ifdef CONFIG_SCHEDSTATS | ||
2235 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2236 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2237 | #endif | ||
2238 | return 1; | 2097 | return 1; |
2239 | } | ||
2240 | 2098 | ||
2241 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2242 | return 0; | ||
2243 | return 1; | 2099 | return 1; |
2244 | } | 2100 | } |
2245 | 2101 | ||
2246 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | 2102 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2247 | |||
2248 | /* | ||
2249 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
2250 | * load from busiest to this_rq, as part of a balancing operation within | ||
2251 | * "domain". Returns the number of tasks moved. | ||
2252 | * | ||
2253 | * Called with both runqueues locked. | ||
2254 | */ | ||
2255 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2256 | unsigned long max_nr_move, unsigned long max_load_move, | 2103 | unsigned long max_nr_move, unsigned long max_load_move, |
2257 | struct sched_domain *sd, enum idle_type idle, | 2104 | struct sched_domain *sd, enum cpu_idle_type idle, |
2258 | int *all_pinned) | 2105 | int *all_pinned, unsigned long *load_moved, |
2106 | int this_best_prio, int best_prio, int best_prio_seen, | ||
2107 | struct rq_iterator *iterator) | ||
2259 | { | 2108 | { |
2260 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, | 2109 | int pulled = 0, pinned = 0, skip_for_load; |
2261 | best_prio_seen, skip_for_load; | 2110 | struct task_struct *p; |
2262 | struct prio_array *array, *dst_array; | 2111 | long rem_load_move = max_load_move; |
2263 | struct list_head *head, *curr; | ||
2264 | struct task_struct *tmp; | ||
2265 | long rem_load_move; | ||
2266 | 2112 | ||
2267 | if (max_nr_move == 0 || max_load_move == 0) | 2113 | if (max_nr_move == 0 || max_load_move == 0) |
2268 | goto out; | 2114 | goto out; |
2269 | 2115 | ||
2270 | rem_load_move = max_load_move; | ||
2271 | pinned = 1; | 2116 | pinned = 1; |
2272 | this_best_prio = rq_best_prio(this_rq); | ||
2273 | best_prio = rq_best_prio(busiest); | ||
2274 | /* | ||
2275 | * Enable handling of the case where there is more than one task | ||
2276 | * with the best priority. If the current running task is one | ||
2277 | * of those with prio==best_prio we know it won't be moved | ||
2278 | * and therefore it's safe to override the skip (based on load) of | ||
2279 | * any task we find with that prio. | ||
2280 | */ | ||
2281 | best_prio_seen = best_prio == busiest->curr->prio; | ||
2282 | 2117 | ||
2283 | /* | 2118 | /* |
2284 | * We first consider expired tasks. Those will likely not be | 2119 | * Start the load-balancing iterator: |
2285 | * executed in the near future, and they are most likely to | ||
2286 | * be cache-cold, thus switching CPUs has the least effect | ||
2287 | * on them. | ||
2288 | */ | 2120 | */ |
2289 | if (busiest->expired->nr_active) { | 2121 | p = iterator->start(iterator->arg); |
2290 | array = busiest->expired; | 2122 | next: |
2291 | dst_array = this_rq->expired; | 2123 | if (!p) |
2292 | } else { | ||
2293 | array = busiest->active; | ||
2294 | dst_array = this_rq->active; | ||
2295 | } | ||
2296 | |||
2297 | new_array: | ||
2298 | /* Start searching at priority 0: */ | ||
2299 | idx = 0; | ||
2300 | skip_bitmap: | ||
2301 | if (!idx) | ||
2302 | idx = sched_find_first_bit(array->bitmap); | ||
2303 | else | ||
2304 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
2305 | if (idx >= MAX_PRIO) { | ||
2306 | if (array == busiest->expired && busiest->active->nr_active) { | ||
2307 | array = busiest->active; | ||
2308 | dst_array = this_rq->active; | ||
2309 | goto new_array; | ||
2310 | } | ||
2311 | goto out; | 2124 | goto out; |
2312 | } | ||
2313 | |||
2314 | head = array->queue + idx; | ||
2315 | curr = head->prev; | ||
2316 | skip_queue: | ||
2317 | tmp = list_entry(curr, struct task_struct, run_list); | ||
2318 | |||
2319 | curr = curr->prev; | ||
2320 | |||
2321 | /* | 2125 | /* |
2322 | * To help distribute high priority tasks across CPUs we don't | 2126 | * To help distribute high priority tasks across CPUs we don't |
2323 | * skip a task if it will be the highest priority task (i.e. smallest | 2127 | * skip a task if it will be the highest priority task (i.e. smallest |
2324 | * prio value) on its new queue regardless of its load weight | 2128 | * prio value) on its new queue regardless of its load weight |
2325 | */ | 2129 | */ |
2326 | skip_for_load = tmp->load_weight > rem_load_move; | 2130 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
2327 | if (skip_for_load && idx < this_best_prio) | 2131 | SCHED_LOAD_SCALE_FUZZ; |
2328 | skip_for_load = !best_prio_seen && idx == best_prio; | 2132 | if (skip_for_load && p->prio < this_best_prio) |
2133 | skip_for_load = !best_prio_seen && p->prio == best_prio; | ||
2329 | if (skip_for_load || | 2134 | if (skip_for_load || |
2330 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2135 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2331 | 2136 | ||
2332 | best_prio_seen |= idx == best_prio; | 2137 | best_prio_seen |= p->prio == best_prio; |
2333 | if (curr != head) | 2138 | p = iterator->next(iterator->arg); |
2334 | goto skip_queue; | 2139 | goto next; |
2335 | idx++; | ||
2336 | goto skip_bitmap; | ||
2337 | } | 2140 | } |
2338 | 2141 | ||
2339 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2142 | pull_task(busiest, p, this_rq, this_cpu); |
2340 | pulled++; | 2143 | pulled++; |
2341 | rem_load_move -= tmp->load_weight; | 2144 | rem_load_move -= p->se.load.weight; |
2342 | 2145 | ||
2343 | /* | 2146 | /* |
2344 | * We only want to steal up to the prescribed number of tasks | 2147 | * We only want to steal up to the prescribed number of tasks |
2345 | * and the prescribed amount of weighted load. | 2148 | * and the prescribed amount of weighted load. |
2346 | */ | 2149 | */ |
2347 | if (pulled < max_nr_move && rem_load_move > 0) { | 2150 | if (pulled < max_nr_move && rem_load_move > 0) { |
2348 | if (idx < this_best_prio) | 2151 | if (p->prio < this_best_prio) |
2349 | this_best_prio = idx; | 2152 | this_best_prio = p->prio; |
2350 | if (curr != head) | 2153 | p = iterator->next(iterator->arg); |
2351 | goto skip_queue; | 2154 | goto next; |
2352 | idx++; | ||
2353 | goto skip_bitmap; | ||
2354 | } | 2155 | } |
2355 | out: | 2156 | out: |
2356 | /* | 2157 | /* |
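balance_tasks() above replaces the O(1) scheduler's bitmap-and-array walk with a class-supplied iterator: it pulls tasks one by one, skips a candidate that is too heavy for the remaining load budget (the patch compares half the task's weight against the budget plus SCHED_LOAD_SCALE_FUZZ, unless the priority overrides force the move), and stops once either the task count or the weighted-load budget is spent. A compact userspace model of that loop over a plain array, using the simpler "weight greater than remaining" form of the skip test and leaving the priority overrides out; all names are illustrative:

#include <stdio.h>

struct toy_task { const char *name; unsigned long weight; };

/*
 * Pull tasks until either max_nr_move tasks have been taken or the
 * weighted-load budget max_load_move is exhausted, skipping tasks
 * heavier than what is still needed.
 */
static int balance_tasks_model(struct toy_task *tasks, int nr_tasks,
                               unsigned long max_nr_move, long max_load_move,
                               unsigned long *load_moved)
{
        long rem = max_load_move;
        int i, pulled = 0;

        for (i = 0; i < nr_tasks && pulled < (int)max_nr_move && rem > 0; i++) {
                if ((long)tasks[i].weight > rem)
                        continue;       /* too heavy for the remaining budget */
                printf("pulling %s (weight %lu)\n",
                       tasks[i].name, tasks[i].weight);
                pulled++;
                rem -= tasks[i].weight;
        }
        *load_moved = max_load_move - rem;
        return pulled;
}

int main(void)
{
        struct toy_task tasks[] = {
                { "heavy", 2048 }, { "mid", 1024 }, { "light", 512 },
        };
        unsigned long moved;
        int n = balance_tasks_model(tasks, 3, 16, 1536, &moved);

        printf("pulled %d tasks, moved %lu load\n", n, moved);
        return 0;
}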
@@ -2362,18 +2163,48 @@ out: | |||
2362 | 2163 | ||
2363 | if (all_pinned) | 2164 | if (all_pinned) |
2364 | *all_pinned = pinned; | 2165 | *all_pinned = pinned; |
2166 | *load_moved = max_load_move - rem_load_move; | ||
2365 | return pulled; | 2167 | return pulled; |
2366 | } | 2168 | } |
2367 | 2169 | ||
2368 | /* | 2170 | /* |
2171 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
2172 | * load from busiest to this_rq, as part of a balancing operation within | ||
2173 | * "domain". Returns the number of tasks moved. | ||
2174 | * | ||
2175 | * Called with both runqueues locked. | ||
2176 | */ | ||
2177 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2178 | unsigned long max_nr_move, unsigned long max_load_move, | ||
2179 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2180 | int *all_pinned) | ||
2181 | { | ||
2182 | struct sched_class *class = sched_class_highest; | ||
2183 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | ||
2184 | long rem_load_move = max_load_move; | ||
2185 | |||
2186 | do { | ||
2187 | nr_moved = class->load_balance(this_rq, this_cpu, busiest, | ||
2188 | max_nr_move, (unsigned long)rem_load_move, | ||
2189 | sd, idle, all_pinned, &load_moved); | ||
2190 | total_nr_moved += nr_moved; | ||
2191 | max_nr_move -= nr_moved; | ||
2192 | rem_load_move -= load_moved; | ||
2193 | class = class->next; | ||
2194 | } while (class && max_nr_move && rem_load_move > 0); | ||
2195 | |||
2196 | return total_nr_moved; | ||
2197 | } | ||
2198 | |||
2199 | /* | ||
2369 | * find_busiest_group finds and returns the busiest CPU group within the | 2200 | * find_busiest_group finds and returns the busiest CPU group within the |
2370 | * domain. It calculates and returns the amount of weighted load which | 2201 | * domain. It calculates and returns the amount of weighted load which |
2371 | * should be moved to restore balance via the imbalance parameter. | 2202 | * should be moved to restore balance via the imbalance parameter. |
2372 | */ | 2203 | */ |
2373 | static struct sched_group * | 2204 | static struct sched_group * |
2374 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2205 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2375 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2206 | unsigned long *imbalance, enum cpu_idle_type idle, |
2376 | cpumask_t *cpus, int *balance) | 2207 | int *sd_idle, cpumask_t *cpus, int *balance) |
2377 | { | 2208 | { |
2378 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2209 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2379 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2210 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
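move_tasks() itself is now just a driver: it walks the scheduling classes from sched_class_highest via each class's next pointer and lets every class's load_balance() hook move what it can, subtracting from the shared task-count and load budgets as it goes. The shape of that loop as a standalone sketch, with a trivial stand-in for the per-class callback; the struct layout and callback behaviour here are invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct toy_class {
        const char *name;
        struct toy_class *next;         /* classes form a singly linked list */
        /* returns tasks moved, reports weighted load moved via *load_moved */
        int (*load_balance)(unsigned long max_nr_move, long max_load_move,
                            unsigned long *load_moved);
};

static int move_tasks_model(struct toy_class *highest,
                            unsigned long max_nr_move, long max_load_move)
{
        struct toy_class *class = highest;
        unsigned long load_moved;
        int total = 0, nr;

        do {
                nr = class->load_balance(max_nr_move, max_load_move,
                                         &load_moved);
                total += nr;
                max_nr_move -= nr;
                max_load_move -= load_moved;
                class = class->next;
        } while (class && max_nr_move && max_load_move > 0);

        return total;
}

/* A fake class that always "moves" one task worth 1024 of load. */
static int fake_balance(unsigned long max_nr_move, long max_load_move,
                        unsigned long *load_moved)
{
        (void)max_nr_move; (void)max_load_move;
        *load_moved = 1024;
        return 1;
}

int main(void)
{
        struct toy_class fair = { "fair", NULL, fake_balance };
        struct toy_class rt   = { "rt", &fair, fake_balance };

        printf("moved %d tasks\n", move_tasks_model(&rt, 4, 4096)); /* 2 */
        return 0;
}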
@@ -2391,9 +2222,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2391 | max_load = this_load = total_load = total_pwr = 0; | 2222 | max_load = this_load = total_load = total_pwr = 0; |
2392 | busiest_load_per_task = busiest_nr_running = 0; | 2223 | busiest_load_per_task = busiest_nr_running = 0; |
2393 | this_load_per_task = this_nr_running = 0; | 2224 | this_load_per_task = this_nr_running = 0; |
2394 | if (idle == NOT_IDLE) | 2225 | if (idle == CPU_NOT_IDLE) |
2395 | load_idx = sd->busy_idx; | 2226 | load_idx = sd->busy_idx; |
2396 | else if (idle == NEWLY_IDLE) | 2227 | else if (idle == CPU_NEWLY_IDLE) |
2397 | load_idx = sd->newidle_idx; | 2228 | load_idx = sd->newidle_idx; |
2398 | else | 2229 | else |
2399 | load_idx = sd->idle_idx; | 2230 | load_idx = sd->idle_idx; |
@@ -2421,7 +2252,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2421 | 2252 | ||
2422 | rq = cpu_rq(i); | 2253 | rq = cpu_rq(i); |
2423 | 2254 | ||
2424 | if (*sd_idle && !idle_cpu(i)) | 2255 | if (*sd_idle && rq->nr_running) |
2425 | *sd_idle = 0; | 2256 | *sd_idle = 0; |
2426 | 2257 | ||
2427 | /* Bias balancing toward cpus of our domain */ | 2258 | /* Bias balancing toward cpus of our domain */ |
@@ -2437,15 +2268,17 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2437 | 2268 | ||
2438 | avg_load += load; | 2269 | avg_load += load; |
2439 | sum_nr_running += rq->nr_running; | 2270 | sum_nr_running += rq->nr_running; |
2440 | sum_weighted_load += rq->raw_weighted_load; | 2271 | sum_weighted_load += weighted_cpuload(i); |
2441 | } | 2272 | } |
2442 | 2273 | ||
2443 | /* | 2274 | /* |
2444 | * First idle cpu or the first cpu(busiest) in this sched group | 2275 | * First idle cpu or the first cpu(busiest) in this sched group |
2445 | * is eligible for doing load balancing at this and above | 2276 | * is eligible for doing load balancing at this and above |
2446 | * domains. | 2277 | * domains. In the newly idle case, we will allow all the cpus |
2278 | * to do the newly idle load balance. | ||
2447 | */ | 2279 | */ |
2448 | if (local_group && balance_cpu != this_cpu && balance) { | 2280 | if (idle != CPU_NEWLY_IDLE && local_group && |
2281 | balance_cpu != this_cpu && balance) { | ||
2449 | *balance = 0; | 2282 | *balance = 0; |
2450 | goto ret; | 2283 | goto ret; |
2451 | } | 2284 | } |
@@ -2477,8 +2310,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2477 | * Busy processors will not participate in power savings | 2310 | * Busy processors will not participate in power savings |
2478 | * balance. | 2311 | * balance. |
2479 | */ | 2312 | */ |
2480 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2313 | if (idle == CPU_NOT_IDLE || |
2481 | goto group_next; | 2314 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2315 | goto group_next; | ||
2482 | 2316 | ||
2483 | /* | 2317 | /* |
2484 | * If the local group is idle or completely loaded | 2318 | * If the local group is idle or completely loaded |
@@ -2488,42 +2322,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2488 | !this_nr_running)) | 2322 | !this_nr_running)) |
2489 | power_savings_balance = 0; | 2323 | power_savings_balance = 0; |
2490 | 2324 | ||
2491 | /* | 2325 | /* |
2492 | * If a group is already running at full capacity or idle, | 2326 | * If a group is already running at full capacity or idle, |
2493 | * don't include that group in power savings calculations | 2327 | * don't include that group in power savings calculations |
2494 | */ | 2328 | */ |
2495 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2329 | if (!power_savings_balance || sum_nr_running >= group_capacity |
2496 | || !sum_nr_running) | 2330 | || !sum_nr_running) |
2497 | goto group_next; | 2331 | goto group_next; |
2498 | 2332 | ||
2499 | /* | 2333 | /* |
2500 | * Calculate the group which has the least non-idle load. | 2334 | * Calculate the group which has the least non-idle load. |
2501 | * This is the group from where we need to pick up the load | 2335 | * This is the group from where we need to pick up the load |
2502 | * for saving power | 2336 | * for saving power |
2503 | */ | 2337 | */ |
2504 | if ((sum_nr_running < min_nr_running) || | 2338 | if ((sum_nr_running < min_nr_running) || |
2505 | (sum_nr_running == min_nr_running && | 2339 | (sum_nr_running == min_nr_running && |
2506 | first_cpu(group->cpumask) < | 2340 | first_cpu(group->cpumask) < |
2507 | first_cpu(group_min->cpumask))) { | 2341 | first_cpu(group_min->cpumask))) { |
2508 | group_min = group; | 2342 | group_min = group; |
2509 | min_nr_running = sum_nr_running; | 2343 | min_nr_running = sum_nr_running; |
2510 | min_load_per_task = sum_weighted_load / | 2344 | min_load_per_task = sum_weighted_load / |
2511 | sum_nr_running; | 2345 | sum_nr_running; |
2512 | } | 2346 | } |
2513 | 2347 | ||
2514 | /* | 2348 | /* |
2515 | * Calculate the group which is almost near its | 2349 | * Calculate the group which is almost near its |
2516 | * capacity but still has some space to pick up some load | 2350 | * capacity but still has some space to pick up some load |
2517 | * from other group and save more power | 2351 | * from other group and save more power |
2518 | */ | 2352 | */ |
2519 | if (sum_nr_running <= group_capacity - 1) { | 2353 | if (sum_nr_running <= group_capacity - 1) { |
2520 | if (sum_nr_running > leader_nr_running || | 2354 | if (sum_nr_running > leader_nr_running || |
2521 | (sum_nr_running == leader_nr_running && | 2355 | (sum_nr_running == leader_nr_running && |
2522 | first_cpu(group->cpumask) > | 2356 | first_cpu(group->cpumask) > |
2523 | first_cpu(group_leader->cpumask))) { | 2357 | first_cpu(group_leader->cpumask))) { |
2524 | group_leader = group; | 2358 | group_leader = group; |
2525 | leader_nr_running = sum_nr_running; | 2359 | leader_nr_running = sum_nr_running; |
2526 | } | 2360 | } |
2527 | } | 2361 | } |
2528 | group_next: | 2362 | group_next: |
2529 | #endif | 2363 | #endif |
@@ -2578,7 +2412,7 @@ group_next: | |||
2578 | * a think about bumping its value to force at least one task to be | 2412 | * a think about bumping its value to force at least one task to be |
2579 | * moved | 2413 | * moved |
2580 | */ | 2414 | */ |
2581 | if (*imbalance < busiest_load_per_task) { | 2415 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { |
2582 | unsigned long tmp, pwr_now, pwr_move; | 2416 | unsigned long tmp, pwr_now, pwr_move; |
2583 | unsigned int imbn; | 2417 | unsigned int imbn; |
2584 | 2418 | ||
@@ -2592,7 +2426,8 @@ small_imbalance: | |||
2592 | } else | 2426 | } else |
2593 | this_load_per_task = SCHED_LOAD_SCALE; | 2427 | this_load_per_task = SCHED_LOAD_SCALE; |
2594 | 2428 | ||
2595 | if (max_load - this_load >= busiest_load_per_task * imbn) { | 2429 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= |
2430 | busiest_load_per_task * imbn) { | ||
2596 | *imbalance = busiest_load_per_task; | 2431 | *imbalance = busiest_load_per_task; |
2597 | return busiest; | 2432 | return busiest; |
2598 | } | 2433 | } |
@@ -2639,7 +2474,7 @@ small_imbalance: | |||
2639 | 2474 | ||
2640 | out_balanced: | 2475 | out_balanced: |
2641 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2476 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2642 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2477 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2643 | goto ret; | 2478 | goto ret; |
2644 | 2479 | ||
2645 | if (this == group_leader && group_leader != group_min) { | 2480 | if (this == group_leader && group_leader != group_min) { |
@@ -2656,7 +2491,7 @@ ret: | |||
2656 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2491 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2657 | */ | 2492 | */ |
2658 | static struct rq * | 2493 | static struct rq * |
2659 | find_busiest_queue(struct sched_group *group, enum idle_type idle, | 2494 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
2660 | unsigned long imbalance, cpumask_t *cpus) | 2495 | unsigned long imbalance, cpumask_t *cpus) |
2661 | { | 2496 | { |
2662 | struct rq *busiest = NULL, *rq; | 2497 | struct rq *busiest = NULL, *rq; |
@@ -2664,17 +2499,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle, | |||
2664 | int i; | 2499 | int i; |
2665 | 2500 | ||
2666 | for_each_cpu_mask(i, group->cpumask) { | 2501 | for_each_cpu_mask(i, group->cpumask) { |
2502 | unsigned long wl; | ||
2667 | 2503 | ||
2668 | if (!cpu_isset(i, *cpus)) | 2504 | if (!cpu_isset(i, *cpus)) |
2669 | continue; | 2505 | continue; |
2670 | 2506 | ||
2671 | rq = cpu_rq(i); | 2507 | rq = cpu_rq(i); |
2508 | wl = weighted_cpuload(i); | ||
2672 | 2509 | ||
2673 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | 2510 | if (rq->nr_running == 1 && wl > imbalance) |
2674 | continue; | 2511 | continue; |
2675 | 2512 | ||
2676 | if (rq->raw_weighted_load > max_load) { | 2513 | if (wl > max_load) { |
2677 | max_load = rq->raw_weighted_load; | 2514 | max_load = wl; |
2678 | busiest = rq; | 2515 | busiest = rq; |
2679 | } | 2516 | } |
2680 | } | 2517 | } |
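find_busiest_queue() now asks weighted_cpuload() for each CPU instead of reading rq->raw_weighted_load directly, but the selection rule is unchanged: skip CPUs whose single runnable task is already heavier than the imbalance we want to move (it could not be migrated usefully), and pick the remaining CPU with the highest weighted load. The same rule as a plain function over arrays; a sketch, not the kernel interface:

#include <stdio.h>

/*
 * Return the index of the "busiest" CPU, or -1 if none qualifies.
 * wl[i] is the weighted load of CPU i, nr[i] its number of tasks.
 */
static int find_busiest_model(const unsigned long *wl, const int *nr,
                              int ncpus, unsigned long imbalance)
{
        unsigned long max_load = 0;
        int i, busiest = -1;

        for (i = 0; i < ncpus; i++) {
                /* one big task we could not usefully move: skip */
                if (nr[i] == 1 && wl[i] > imbalance)
                        continue;
                if (wl[i] > max_load) {
                        max_load = wl[i];
                        busiest = i;
                }
        }
        return busiest;
}

int main(void)
{
        unsigned long wl[] = { 1024, 4096, 2048 };
        int nr[] = { 1, 1, 2 };

        /* CPU 1 is heaviest but has a single unmovable task: CPU 2 wins */
        printf("busiest = %d\n", find_busiest_model(wl, nr, 3, 1536));
        return 0;
}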
@@ -2698,7 +2535,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n) | |||
2698 | * tasks if there is an imbalance. | 2535 | * tasks if there is an imbalance. |
2699 | */ | 2536 | */ |
2700 | static int load_balance(int this_cpu, struct rq *this_rq, | 2537 | static int load_balance(int this_cpu, struct rq *this_rq, |
2701 | struct sched_domain *sd, enum idle_type idle, | 2538 | struct sched_domain *sd, enum cpu_idle_type idle, |
2702 | int *balance) | 2539 | int *balance) |
2703 | { | 2540 | { |
2704 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2541 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
@@ -2711,10 +2548,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2711 | /* | 2548 | /* |
2712 | * When power savings policy is enabled for the parent domain, idle | 2549 | * When power savings policy is enabled for the parent domain, idle |
2713 | * sibling can pick up load irrespective of busy siblings. In this case, | 2550 | * sibling can pick up load irrespective of busy siblings. In this case, |
2714 | * let the state of idle sibling percolate up as IDLE, instead of | 2551 | * let the state of idle sibling percolate up as CPU_IDLE, instead of |
2715 | * portraying it as NOT_IDLE. | 2552 | * portraying it as CPU_NOT_IDLE. |
2716 | */ | 2553 | */ |
2717 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 2554 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2718 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2555 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2719 | sd_idle = 1; | 2556 | sd_idle = 1; |
2720 | 2557 | ||
@@ -2848,7 +2685,7 @@ out_one_pinned: | |||
2848 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2685 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2849 | * tasks if there is an imbalance. | 2686 | * tasks if there is an imbalance. |
2850 | * | 2687 | * |
2851 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2688 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). |
2852 | * this_rq is locked. | 2689 | * this_rq is locked. |
2853 | */ | 2690 | */ |
2854 | static int | 2691 | static int |
@@ -2859,37 +2696,38 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2859 | unsigned long imbalance; | 2696 | unsigned long imbalance; |
2860 | int nr_moved = 0; | 2697 | int nr_moved = 0; |
2861 | int sd_idle = 0; | 2698 | int sd_idle = 0; |
2699 | int all_pinned = 0; | ||
2862 | cpumask_t cpus = CPU_MASK_ALL; | 2700 | cpumask_t cpus = CPU_MASK_ALL; |
2863 | 2701 | ||
2864 | /* | 2702 | /* |
2865 | * When power savings policy is enabled for the parent domain, idle | 2703 | * When power savings policy is enabled for the parent domain, idle |
2866 | * sibling can pick up load irrespective of busy siblings. In this case, | 2704 | * sibling can pick up load irrespective of busy siblings. In this case, |
2867 | * let the state of idle sibling percolate up as IDLE, instead of | 2705 | * let the state of idle sibling percolate up as IDLE, instead of |
2868 | * portraying it as NOT_IDLE. | 2706 | * portraying it as CPU_NOT_IDLE. |
2869 | */ | 2707 | */ |
2870 | if (sd->flags & SD_SHARE_CPUPOWER && | 2708 | if (sd->flags & SD_SHARE_CPUPOWER && |
2871 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2709 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2872 | sd_idle = 1; | 2710 | sd_idle = 1; |
2873 | 2711 | ||
2874 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2712 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); |
2875 | redo: | 2713 | redo: |
2876 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2714 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
2877 | &sd_idle, &cpus, NULL); | 2715 | &sd_idle, &cpus, NULL); |
2878 | if (!group) { | 2716 | if (!group) { |
2879 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2717 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
2880 | goto out_balanced; | 2718 | goto out_balanced; |
2881 | } | 2719 | } |
2882 | 2720 | ||
2883 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, | 2721 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, |
2884 | &cpus); | 2722 | &cpus); |
2885 | if (!busiest) { | 2723 | if (!busiest) { |
2886 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2724 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
2887 | goto out_balanced; | 2725 | goto out_balanced; |
2888 | } | 2726 | } |
2889 | 2727 | ||
2890 | BUG_ON(busiest == this_rq); | 2728 | BUG_ON(busiest == this_rq); |
2891 | 2729 | ||
2892 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2730 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); |
2893 | 2731 | ||
2894 | nr_moved = 0; | 2732 | nr_moved = 0; |
2895 | if (busiest->nr_running > 1) { | 2733 | if (busiest->nr_running > 1) { |
@@ -2897,10 +2735,11 @@ redo: | |||
2897 | double_lock_balance(this_rq, busiest); | 2735 | double_lock_balance(this_rq, busiest); |
2898 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2736 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2899 | minus_1_or_zero(busiest->nr_running), | 2737 | minus_1_or_zero(busiest->nr_running), |
2900 | imbalance, sd, NEWLY_IDLE, NULL); | 2738 | imbalance, sd, CPU_NEWLY_IDLE, |
2739 | &all_pinned); | ||
2901 | spin_unlock(&busiest->lock); | 2740 | spin_unlock(&busiest->lock); |
2902 | 2741 | ||
2903 | if (!nr_moved) { | 2742 | if (unlikely(all_pinned)) { |
2904 | cpu_clear(cpu_of(busiest), cpus); | 2743 | cpu_clear(cpu_of(busiest), cpus); |
2905 | if (!cpus_empty(cpus)) | 2744 | if (!cpus_empty(cpus)) |
2906 | goto redo; | 2745 | goto redo; |
@@ -2908,7 +2747,7 @@ redo: | |||
2908 | } | 2747 | } |
2909 | 2748 | ||
2910 | if (!nr_moved) { | 2749 | if (!nr_moved) { |
2911 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2750 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); |
2912 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2751 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2913 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2752 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2914 | return -1; | 2753 | return -1; |
@@ -2918,7 +2757,7 @@ redo: | |||
2918 | return nr_moved; | 2757 | return nr_moved; |
2919 | 2758 | ||
2920 | out_balanced: | 2759 | out_balanced: |
2921 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2760 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); |
2922 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2761 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2923 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2762 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2924 | return -1; | 2763 | return -1; |
@@ -2934,8 +2773,8 @@ out_balanced: | |||
2934 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2773 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2935 | { | 2774 | { |
2936 | struct sched_domain *sd; | 2775 | struct sched_domain *sd; |
2937 | int pulled_task = 0; | 2776 | int pulled_task = -1; |
2938 | unsigned long next_balance = jiffies + 60 * HZ; | 2777 | unsigned long next_balance = jiffies + HZ; |
2939 | 2778 | ||
2940 | for_each_domain(this_cpu, sd) { | 2779 | for_each_domain(this_cpu, sd) { |
2941 | unsigned long interval; | 2780 | unsigned long interval; |
@@ -2954,12 +2793,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
2954 | if (pulled_task) | 2793 | if (pulled_task) |
2955 | break; | 2794 | break; |
2956 | } | 2795 | } |
2957 | if (!pulled_task) | 2796 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
2958 | /* | 2797 | /* |
2959 | * We are going idle. next_balance may be set based on | 2798 | * We are going idle. next_balance may be set based on |
2960 | * a busy processor. So reset next_balance. | 2799 | * a busy processor. So reset next_balance. |
2961 | */ | 2800 | */ |
2962 | this_rq->next_balance = next_balance; | 2801 | this_rq->next_balance = next_balance; |
2802 | } | ||
2963 | } | 2803 | } |
2964 | 2804 | ||
2965 | /* | 2805 | /* |
@@ -3003,7 +2843,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3003 | schedstat_inc(sd, alb_cnt); | 2843 | schedstat_inc(sd, alb_cnt); |
3004 | 2844 | ||
3005 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, | 2845 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
3006 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, | 2846 | RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, |
3007 | NULL)) | 2847 | NULL)) |
3008 | schedstat_inc(sd, alb_pushed); | 2848 | schedstat_inc(sd, alb_pushed); |
3009 | else | 2849 | else |
@@ -3012,32 +2852,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3012 | spin_unlock(&target_rq->lock); | 2852 | spin_unlock(&target_rq->lock); |
3013 | } | 2853 | } |
3014 | 2854 | ||
3015 | static void update_load(struct rq *this_rq) | ||
3016 | { | ||
3017 | unsigned long this_load; | ||
3018 | unsigned int i, scale; | ||
3019 | |||
3020 | this_load = this_rq->raw_weighted_load; | ||
3021 | |||
3022 | /* Update our load: */ | ||
3023 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { | ||
3024 | unsigned long old_load, new_load; | ||
3025 | |||
3026 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
3027 | |||
3028 | old_load = this_rq->cpu_load[i]; | ||
3029 | new_load = this_load; | ||
3030 | /* | ||
3031 | * Round up the averaging division if load is increasing. This | ||
3032 | * prevents us from getting stuck on 9 if the load is 10, for | ||
3033 | * example. | ||
3034 | */ | ||
3035 | if (new_load > old_load) | ||
3036 | new_load += scale-1; | ||
3037 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
3038 | } | ||
3039 | } | ||
3040 | |||
3041 | #ifdef CONFIG_NO_HZ | 2855 | #ifdef CONFIG_NO_HZ |
3042 | static struct { | 2856 | static struct { |
3043 | atomic_t load_balancer; | 2857 | atomic_t load_balancer; |
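The old update_load() removed above carried a comment worth noting: because the average uses an integer right shift, it rounded the division up when load was rising, "to prevent us getting stuck on 9 if the load is 10". The new update_cpu_load() drops that round-up. A two-line demonstration of the truncation effect the comment was guarding against:

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, new_load = 10, scale = 2; /* index i = 1 */
        int i = 1;

        /* plain average: (9*1 + 10) >> 1 == 9, stuck below the real load */
        printf("truncated:  %lu\n", (old_load * (scale - 1) + new_load) >> i);

        /* the removed update_load() rounded up when load was increasing */
        if (new_load > old_load)
                new_load += scale - 1;
        printf("rounded up: %lu\n", (old_load * (scale - 1) + new_load) >> i);

        return 0;
}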
@@ -3120,7 +2934,7 @@ static DEFINE_SPINLOCK(balancing); | |||
3120 | * | 2934 | * |
3121 | * Balancing parameters are set up in arch_init_sched_domains. | 2935 | * Balancing parameters are set up in arch_init_sched_domains. |
3122 | */ | 2936 | */ |
3123 | static inline void rebalance_domains(int cpu, enum idle_type idle) | 2937 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) |
3124 | { | 2938 | { |
3125 | int balance = 1; | 2939 | int balance = 1; |
3126 | struct rq *rq = cpu_rq(cpu); | 2940 | struct rq *rq = cpu_rq(cpu); |
@@ -3134,13 +2948,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle) | |||
3134 | continue; | 2948 | continue; |
3135 | 2949 | ||
3136 | interval = sd->balance_interval; | 2950 | interval = sd->balance_interval; |
3137 | if (idle != SCHED_IDLE) | 2951 | if (idle != CPU_IDLE) |
3138 | interval *= sd->busy_factor; | 2952 | interval *= sd->busy_factor; |
3139 | 2953 | ||
3140 | /* scale ms to jiffies */ | 2954 | /* scale ms to jiffies */ |
3141 | interval = msecs_to_jiffies(interval); | 2955 | interval = msecs_to_jiffies(interval); |
3142 | if (unlikely(!interval)) | 2956 | if (unlikely(!interval)) |
3143 | interval = 1; | 2957 | interval = 1; |
2958 | if (interval > HZ*NR_CPUS/10) | ||
2959 | interval = HZ*NR_CPUS/10; | ||
2960 | |||
3144 | 2961 | ||
3145 | if (sd->flags & SD_SERIALIZE) { | 2962 | if (sd->flags & SD_SERIALIZE) { |
3146 | if (!spin_trylock(&balancing)) | 2963 | if (!spin_trylock(&balancing)) |
@@ -3154,7 +2971,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle) | |||
3154 | * longer idle, or one of our SMT siblings is | 2971 | * longer idle, or one of our SMT siblings is |
3155 | * not idle. | 2972 | * not idle. |
3156 | */ | 2973 | */ |
3157 | idle = NOT_IDLE; | 2974 | idle = CPU_NOT_IDLE; |
3158 | } | 2975 | } |
3159 | sd->last_balance = jiffies; | 2976 | sd->last_balance = jiffies; |
3160 | } | 2977 | } |
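The interval handling in rebalance_domains() gains an upper clamp here: the per-domain balance_interval (in milliseconds) is stretched by busy_factor when the CPU is not idle, converted to jiffies, forced to at least one jiffy, and now also capped at HZ*NR_CPUS/10 so a large busy factor cannot push the next balance arbitrarily far out. A small model of that computation; msecs-to-jiffies is simplified to ms*HZ/1000, and HZ, NR_CPUS and the sample numbers are illustrative:

#include <stdio.h>

#define HZ      250     /* illustrative tick rate */
#define NR_CPUS 8       /* illustrative CPU count */

static unsigned long msecs_to_jiffies_model(unsigned long ms)
{
        return ms * HZ / 1000;
}

/* Effective balance interval in jiffies, as computed in rebalance_domains(). */
static unsigned long balance_interval_model(unsigned long interval_ms,
                                            unsigned int busy_factor,
                                            int cpu_idle)
{
        unsigned long interval = interval_ms;

        if (!cpu_idle)
                interval *= busy_factor;        /* back off while busy */

        interval = msecs_to_jiffies_model(interval);
        if (!interval)
                interval = 1;                   /* at least one jiffy */
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;   /* new upper clamp */

        return interval;
}

int main(void)
{
        /* 64ms base interval, busy factor 64: clamped to 200 jiffies here */
        printf("%lu\n", balance_interval_model(64, 64, 0));
        /* idle CPU: no busy factor, 64ms -> 16 jiffies */
        printf("%lu\n", balance_interval_model(64, 64, 1));
        return 0;
}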
@@ -3182,11 +2999,12 @@ out: | |||
3182 | */ | 2999 | */ |
3183 | static void run_rebalance_domains(struct softirq_action *h) | 3000 | static void run_rebalance_domains(struct softirq_action *h) |
3184 | { | 3001 | { |
3185 | int local_cpu = smp_processor_id(); | 3002 | int this_cpu = smp_processor_id(); |
3186 | struct rq *local_rq = cpu_rq(local_cpu); | 3003 | struct rq *this_rq = cpu_rq(this_cpu); |
3187 | enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; | 3004 | enum cpu_idle_type idle = this_rq->idle_at_tick ? |
3005 | CPU_IDLE : CPU_NOT_IDLE; | ||
3188 | 3006 | ||
3189 | rebalance_domains(local_cpu, idle); | 3007 | rebalance_domains(this_cpu, idle); |
3190 | 3008 | ||
3191 | #ifdef CONFIG_NO_HZ | 3009 | #ifdef CONFIG_NO_HZ |
3192 | /* | 3010 | /* |
@@ -3194,13 +3012,13 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3194 | * balancing on behalf of the other idle cpus whose ticks are | 3012 | * balancing on behalf of the other idle cpus whose ticks are |
3195 | * stopped. | 3013 | * stopped. |
3196 | */ | 3014 | */ |
3197 | if (local_rq->idle_at_tick && | 3015 | if (this_rq->idle_at_tick && |
3198 | atomic_read(&nohz.load_balancer) == local_cpu) { | 3016 | atomic_read(&nohz.load_balancer) == this_cpu) { |
3199 | cpumask_t cpus = nohz.cpu_mask; | 3017 | cpumask_t cpus = nohz.cpu_mask; |
3200 | struct rq *rq; | 3018 | struct rq *rq; |
3201 | int balance_cpu; | 3019 | int balance_cpu; |
3202 | 3020 | ||
3203 | cpu_clear(local_cpu, cpus); | 3021 | cpu_clear(this_cpu, cpus); |
3204 | for_each_cpu_mask(balance_cpu, cpus) { | 3022 | for_each_cpu_mask(balance_cpu, cpus) { |
3205 | /* | 3023 | /* |
3206 | * If this cpu gets work to do, stop the load balancing | 3024 | * If this cpu gets work to do, stop the load balancing |
@@ -3213,8 +3031,8 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3213 | rebalance_domains(balance_cpu, SCHED_IDLE); | 3031 | rebalance_domains(balance_cpu, SCHED_IDLE); |
3214 | 3032 | ||
3215 | rq = cpu_rq(balance_cpu); | 3033 | rq = cpu_rq(balance_cpu); |
3216 | if (time_after(local_rq->next_balance, rq->next_balance)) | 3034 | if (time_after(this_rq->next_balance, rq->next_balance)) |
3217 | local_rq->next_balance = rq->next_balance; | 3035 | this_rq->next_balance = rq->next_balance; |
3218 | } | 3036 | } |
3219 | } | 3037 | } |
3220 | #endif | 3038 | #endif |
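With NO_HZ, the softirq handler above also balances on behalf of the tick-stopped CPUs: if this CPU was idle at the tick and is the registered nohz load balancer, it walks nohz.cpu_mask minus itself, rebalances each sleeping CPU, and pulls this_rq->next_balance forward to the earliest deadline it saw. A sketch of that "balance for the others" walk over a plain bitmask; the real cpumask API, need_resched() bail-out and rebalance_domains() call are replaced with toy stand-ins:

#include <stdio.h>

#define NCPUS 8

/* Stand-in for rebalancing one tick-stopped CPU. */
static void rebalance_one(int cpu, unsigned long *next_balance,
                          const unsigned long *cpu_deadline)
{
        printf("balancing on behalf of cpu %d\n", cpu);
        /* keep the earliest next_balance of all the CPUs we served */
        if (cpu_deadline[cpu] < *next_balance)
                *next_balance = cpu_deadline[cpu];
}

static void nohz_balance_model(int this_cpu, unsigned int nohz_mask,
                               unsigned long *next_balance,
                               const unsigned long *cpu_deadline)
{
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                if (cpu == this_cpu || !(nohz_mask & (1u << cpu)))
                        continue;
                rebalance_one(cpu, next_balance, cpu_deadline);
        }
}

int main(void)
{
        unsigned long deadlines[NCPUS] = { 100, 90, 120, 80, 100, 100, 100, 100 };
        unsigned long next_balance = 100;

        /* CPUs 1 and 3 have their ticks stopped; CPU 0 balances for them */
        nohz_balance_model(0, (1u << 0) | (1u << 1) | (1u << 3),
                           &next_balance, deadlines);
        printf("next_balance = %lu\n", next_balance); /* 80 */
        return 0;
}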
@@ -3227,9 +3045,8 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3227 | * idle load balancing owner or decide to stop the periodic load balancing, | 3045 | * idle load balancing owner or decide to stop the periodic load balancing, |
3228 | * if the whole system is idle. | 3046 | * if the whole system is idle. |
3229 | */ | 3047 | */ |
3230 | static inline void trigger_load_balance(int cpu) | 3048 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3231 | { | 3049 | { |
3232 | struct rq *rq = cpu_rq(cpu); | ||
3233 | #ifdef CONFIG_NO_HZ | 3050 | #ifdef CONFIG_NO_HZ |
3234 | /* | 3051 | /* |
3235 | * If we were in the nohz mode recently and busy at the current | 3052 | * If we were in the nohz mode recently and busy at the current |
@@ -3281,13 +3098,29 @@ static inline void trigger_load_balance(int cpu) | |||
3281 | if (time_after_eq(jiffies, rq->next_balance)) | 3098 | if (time_after_eq(jiffies, rq->next_balance)) |
3282 | raise_softirq(SCHED_SOFTIRQ); | 3099 | raise_softirq(SCHED_SOFTIRQ); |
3283 | } | 3100 | } |
3284 | #else | 3101 | |
3102 | #else /* CONFIG_SMP */ | ||
3103 | |||
3285 | /* | 3104 | /* |
3286 | * on UP we do not need to balance between CPUs: | 3105 | * on UP we do not need to balance between CPUs: |
3287 | */ | 3106 | */ |
3288 | static inline void idle_balance(int cpu, struct rq *rq) | 3107 | static inline void idle_balance(int cpu, struct rq *rq) |
3289 | { | 3108 | { |
3290 | } | 3109 | } |
3110 | |||
3111 | /* Avoid "used but not defined" warning on UP */ | ||
3112 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3113 | unsigned long max_nr_move, unsigned long max_load_move, | ||
3114 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3115 | int *all_pinned, unsigned long *load_moved, | ||
3116 | int this_best_prio, int best_prio, int best_prio_seen, | ||
3117 | struct rq_iterator *iterator) | ||
3118 | { | ||
3119 | *load_moved = 0; | ||
3120 | |||
3121 | return 0; | ||
3122 | } | ||
3123 | |||
3291 | #endif | 3124 | #endif |
3292 | 3125 | ||
3293 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3126 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -3295,54 +3128,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); | |||
3295 | EXPORT_PER_CPU_SYMBOL(kstat); | 3128 | EXPORT_PER_CPU_SYMBOL(kstat); |
3296 | 3129 | ||
3297 | /* | 3130 | /* |
3298 | * This is called on clock ticks and on context switches. | 3131 | * Return p->sum_exec_runtime plus any more ns on the sched_clock |
3299 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 3132 | * that have not yet been banked in case the task is currently running. |
3300 | */ | ||
3301 | static inline void | ||
3302 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | ||
3303 | { | ||
3304 | p->sched_time += now - p->last_ran; | ||
3305 | p->last_ran = rq->most_recent_timestamp = now; | ||
3306 | } | ||
3307 | |||
3308 | /* | ||
3309 | * Return current->sched_time plus any more ns on the sched_clock | ||
3310 | * that have not yet been banked. | ||
3311 | */ | 3133 | */ |
3312 | unsigned long long current_sched_time(const struct task_struct *p) | 3134 | unsigned long long task_sched_runtime(struct task_struct *p) |
3313 | { | 3135 | { |
3314 | unsigned long long ns; | ||
3315 | unsigned long flags; | 3136 | unsigned long flags; |
3137 | u64 ns, delta_exec; | ||
3138 | struct rq *rq; | ||
3316 | 3139 | ||
3317 | local_irq_save(flags); | 3140 | rq = task_rq_lock(p, &flags); |
3318 | ns = p->sched_time + sched_clock() - p->last_ran; | 3141 | ns = p->se.sum_exec_runtime; |
3319 | local_irq_restore(flags); | 3142 | if (rq->curr == p) { |
3143 | delta_exec = rq_clock(rq) - p->se.exec_start; | ||
3144 | if ((s64)delta_exec > 0) | ||
3145 | ns += delta_exec; | ||
3146 | } | ||
3147 | task_rq_unlock(rq, &flags); | ||
3320 | 3148 | ||
3321 | return ns; | 3149 | return ns; |
3322 | } | 3150 | } |
3323 | 3151 | ||
3324 | /* | 3152 | /* |
3325 | * We place interactive tasks back into the active array, if possible. | ||
3326 | * | ||
3327 | * To guarantee that this does not starve expired tasks we ignore the | ||
3328 | * interactivity of a task if the first expired task had to wait more | ||
3329 | * than a 'reasonable' amount of time. This deadline timeout is | ||
3330 | * load-dependent, as the frequency of array switches decreases with | ||
3331 | * increasing number of running tasks. We also ignore the interactivity | ||
3332 | * if a better static_prio task has expired: | ||
3333 | */ | ||
3334 | static inline int expired_starving(struct rq *rq) | ||
3335 | { | ||
3336 | if (rq->curr->static_prio > rq->best_expired_prio) | ||
3337 | return 1; | ||
3338 | if (!STARVATION_LIMIT || !rq->expired_timestamp) | ||
3339 | return 0; | ||
3340 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
3341 | return 1; | ||
3342 | return 0; | ||
3343 | } | ||
3344 | |||
3345 | /* | ||
3346 | * Account user cpu time to a process. | 3153 | * Account user cpu time to a process. |
3347 | * @p: the process that the cpu time gets accounted to | 3154 | * @p: the process that the cpu time gets accounted to |
3348 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3155 | * @hardirq_offset: the offset to subtract from hardirq_count() |
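task_sched_runtime() replaces the old sched_time bookkeeping: the per-task total now lives in se.sum_exec_runtime, and if the task happens to be running at the moment of the query, the not-yet-banked slice since se.exec_start is added on top, all under the task's runqueue lock. The arithmetic reduces to the following; locking is omitted and the clock is passed in as a parameter instead of coming from rq_clock():

#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

/* Nanoseconds of CPU time consumed, including the slice still in flight. */
static u64 task_sched_runtime_model(u64 sum_exec_runtime, int currently_running,
                                    u64 exec_start, u64 now)
{
        u64 ns = sum_exec_runtime;

        if (currently_running) {
                s64 delta_exec = (s64)(now - exec_start);
                if (delta_exec > 0)     /* guard against clock warps */
                        ns += (u64)delta_exec;
        }
        return ns;
}

int main(void)
{
        /* 5ms banked, running since t=2ms, asked at t=3.5ms -> 6.5ms */
        printf("%llu ns\n", task_sched_runtime_model(5000000ULL, 1,
                                                     2000000ULL, 3500000ULL));
        return 0;
}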
@@ -3415,81 +3222,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3415 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3222 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3416 | } | 3223 | } |
3417 | 3224 | ||
3418 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
3419 | { | ||
3420 | if (p->array != rq->active) { | ||
3421 | /* Task has expired but was not scheduled yet */ | ||
3422 | set_tsk_need_resched(p); | ||
3423 | return; | ||
3424 | } | ||
3425 | spin_lock(&rq->lock); | ||
3426 | /* | ||
3427 | * The task was running during this tick - update the | ||
3428 | * time slice counter. Note: we do not update a thread's | ||
3429 | * priority until it either goes to sleep or uses up its | ||
3430 | * timeslice. This makes it possible for interactive tasks | ||
3431 | * to use up their timeslices at their highest priority levels. | ||
3432 | */ | ||
3433 | if (rt_task(p)) { | ||
3434 | /* | ||
3435 | * RR tasks need a special form of timeslice management. | ||
3436 | * FIFO tasks have no timeslices. | ||
3437 | */ | ||
3438 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
3439 | p->time_slice = task_timeslice(p); | ||
3440 | p->first_time_slice = 0; | ||
3441 | set_tsk_need_resched(p); | ||
3442 | |||
3443 | /* put it at the end of the queue: */ | ||
3444 | requeue_task(p, rq->active); | ||
3445 | } | ||
3446 | goto out_unlock; | ||
3447 | } | ||
3448 | if (!--p->time_slice) { | ||
3449 | dequeue_task(p, rq->active); | ||
3450 | set_tsk_need_resched(p); | ||
3451 | p->prio = effective_prio(p); | ||
3452 | p->time_slice = task_timeslice(p); | ||
3453 | p->first_time_slice = 0; | ||
3454 | |||
3455 | if (!rq->expired_timestamp) | ||
3456 | rq->expired_timestamp = jiffies; | ||
3457 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { | ||
3458 | enqueue_task(p, rq->expired); | ||
3459 | if (p->static_prio < rq->best_expired_prio) | ||
3460 | rq->best_expired_prio = p->static_prio; | ||
3461 | } else | ||
3462 | enqueue_task(p, rq->active); | ||
3463 | } else { | ||
3464 | /* | ||
3465 | * Prevent a too long timeslice allowing a task to monopolize | ||
3466 | * the CPU. We do this by splitting up the timeslice into | ||
3467 | * smaller pieces. | ||
3468 | * | ||
3469 | * Note: this does not mean the task's timeslices expire or | ||
3470 | * get lost in any way, they just might be preempted by | ||
3471 | * another task of equal priority. (one with higher | ||
3472 | * priority would have preempted this task already.) We | ||
3473 | * requeue this task to the end of the list on this priority | ||
3474 | * level, which is in essence a round-robin of tasks with | ||
3475 | * equal priority. | ||
3476 | * | ||
3477 | * This only applies to tasks in the interactive | ||
3478 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
3479 | */ | ||
3480 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
3481 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
3482 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
3483 | (p->array == rq->active)) { | ||
3484 | |||
3485 | requeue_task(p, rq->active); | ||
3486 | set_tsk_need_resched(p); | ||
3487 | } | ||
3488 | } | ||
3489 | out_unlock: | ||
3490 | spin_unlock(&rq->lock); | ||
3491 | } | ||
3492 | |||
3493 | /* | 3225 | /* |
3494 | * This function gets called by the timer code, with HZ frequency. | 3226 | * This function gets called by the timer code, with HZ frequency. |
3495 | * We call it with interrupts disabled. | 3227 | * We call it with interrupts disabled. |
@@ -3499,20 +3231,19 @@ out_unlock: | |||
3499 | */ | 3231 | */ |
3500 | void scheduler_tick(void) | 3232 | void scheduler_tick(void) |
3501 | { | 3233 | { |
3502 | unsigned long long now = sched_clock(); | ||
3503 | struct task_struct *p = current; | ||
3504 | int cpu = smp_processor_id(); | 3234 | int cpu = smp_processor_id(); |
3505 | int idle_at_tick = idle_cpu(cpu); | ||
3506 | struct rq *rq = cpu_rq(cpu); | 3235 | struct rq *rq = cpu_rq(cpu); |
3236 | struct task_struct *curr = rq->curr; | ||
3507 | 3237 | ||
3508 | update_cpu_clock(p, rq, now); | 3238 | spin_lock(&rq->lock); |
3239 | if (curr != rq->idle) /* FIXME: needed? */ | ||
3240 | curr->sched_class->task_tick(rq, curr); | ||
3241 | update_cpu_load(rq); | ||
3242 | spin_unlock(&rq->lock); | ||
3509 | 3243 | ||
3510 | if (!idle_at_tick) | ||
3511 | task_running_tick(rq, p); | ||
3512 | #ifdef CONFIG_SMP | 3244 | #ifdef CONFIG_SMP |
3513 | update_load(rq); | 3245 | rq->idle_at_tick = idle_cpu(cpu); |
3514 | rq->idle_at_tick = idle_at_tick; | 3246 | trigger_load_balance(rq, cpu); |
3515 | trigger_load_balance(cpu); | ||
3516 | #endif | 3247 | #endif |
3517 | } | 3248 | } |
3518 | 3249 | ||
@@ -3554,170 +3285,129 @@ EXPORT_SYMBOL(sub_preempt_count); | |||
3554 | 3285 | ||
3555 | #endif | 3286 | #endif |
3556 | 3287 | ||
3557 | static inline int interactive_sleep(enum sleep_type sleep_type) | 3288 | /* |
3289 | * Print scheduling while atomic bug: | ||
3290 | */ | ||
3291 | static noinline void __schedule_bug(struct task_struct *prev) | ||
3558 | { | 3292 | { |
3559 | return (sleep_type == SLEEP_INTERACTIVE || | 3293 | printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", |
3560 | sleep_type == SLEEP_INTERRUPTED); | 3294 | prev->comm, preempt_count(), prev->pid); |
3295 | debug_show_held_locks(prev); | ||
3296 | if (irqs_disabled()) | ||
3297 | print_irqtrace_events(prev); | ||
3298 | dump_stack(); | ||
3561 | } | 3299 | } |
3562 | 3300 | ||
3563 | /* | 3301 | /* |
3564 | * schedule() is the main scheduler function. | 3302 | * Various schedule()-time debugging checks and statistics: |
3565 | */ | 3303 | */ |
3566 | asmlinkage void __sched schedule(void) | 3304 | static inline void schedule_debug(struct task_struct *prev) |
3567 | { | 3305 | { |
3568 | struct task_struct *prev, *next; | ||
3569 | struct prio_array *array; | ||
3570 | struct list_head *queue; | ||
3571 | unsigned long long now; | ||
3572 | unsigned long run_time; | ||
3573 | int cpu, idx, new_prio; | ||
3574 | long *switch_count; | ||
3575 | struct rq *rq; | ||
3576 | |||
3577 | /* | 3306 | /* |
3578 | * Test if we are atomic. Since do_exit() needs to call into | 3307 | * Test if we are atomic. Since do_exit() needs to call into |
3579 | * schedule() atomically, we ignore that path for now. | 3308 | * schedule() atomically, we ignore that path for now. |
3580 | * Otherwise, whine if we are scheduling when we should not be. | 3309 | * Otherwise, whine if we are scheduling when we should not be. |
3581 | */ | 3310 | */ |
3582 | if (unlikely(in_atomic() && !current->exit_state)) { | 3311 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) |
3583 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3312 | __schedule_bug(prev); |
3584 | "%s/0x%08x/%d\n", | ||
3585 | current->comm, preempt_count(), current->pid); | ||
3586 | debug_show_held_locks(current); | ||
3587 | if (irqs_disabled()) | ||
3588 | print_irqtrace_events(current); | ||
3589 | dump_stack(); | ||
3590 | } | ||
3591 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | ||
3592 | 3313 | ||
3593 | need_resched: | 3314 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3594 | preempt_disable(); | ||
3595 | prev = current; | ||
3596 | release_kernel_lock(prev); | ||
3597 | need_resched_nonpreemptible: | ||
3598 | rq = this_rq(); | ||
3599 | 3315 | ||
3600 | /* | 3316 | schedstat_inc(this_rq(), sched_cnt); |
3601 | * The idle thread is not allowed to schedule! | 3317 | } |
3602 | * Remove this check after it has been exercised a bit. | ||
3603 | */ | ||
3604 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | ||
3605 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
3606 | dump_stack(); | ||
3607 | } | ||
3608 | 3318 | ||
3609 | schedstat_inc(rq, sched_cnt); | 3319 | /* |
3610 | now = sched_clock(); | 3320 | * Pick up the highest-prio task: |
3611 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | 3321 | */ |
3612 | run_time = now - prev->timestamp; | 3322 | static inline struct task_struct * |
3613 | if (unlikely((long long)(now - prev->timestamp) < 0)) | 3323 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) |
3614 | run_time = 0; | 3324 | { |
3615 | } else | 3325 | struct sched_class *class; |
3616 | run_time = NS_MAX_SLEEP_AVG; | 3326 | struct task_struct *p; |
3617 | 3327 | ||
3618 | /* | 3328 | /* |
3619 | * Tasks charged proportionately less run_time at high sleep_avg to | 3329 | * Optimization: we know that if all tasks are in |
3620 | * delay them losing their interactive status | 3330 | * the fair class we can call that function directly: |
3621 | */ | 3331 | */ |
3622 | run_time /= (CURRENT_BONUS(prev) ? : 1); | 3332 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
3623 | 3333 | p = fair_sched_class.pick_next_task(rq, now); | |
3624 | spin_lock_irq(&rq->lock); | 3334 | if (likely(p)) |
3625 | 3335 | return p; | |
3626 | switch_count = &prev->nivcsw; | ||
3627 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | ||
3628 | switch_count = &prev->nvcsw; | ||
3629 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | ||
3630 | unlikely(signal_pending(prev)))) | ||
3631 | prev->state = TASK_RUNNING; | ||
3632 | else { | ||
3633 | if (prev->state == TASK_UNINTERRUPTIBLE) | ||
3634 | rq->nr_uninterruptible++; | ||
3635 | deactivate_task(prev, rq); | ||
3636 | } | ||
3637 | } | 3336 | } |
3638 | 3337 | ||
3639 | cpu = smp_processor_id(); | 3338 | class = sched_class_highest; |
3640 | if (unlikely(!rq->nr_running)) { | 3339 | for ( ; ; ) { |
3641 | idle_balance(cpu, rq); | 3340 | p = class->pick_next_task(rq, now); |
3642 | if (!rq->nr_running) { | 3341 | if (p) |
3643 | next = rq->idle; | 3342 | return p; |
3644 | rq->expired_timestamp = 0; | ||
3645 | goto switch_tasks; | ||
3646 | } | ||
3647 | } | ||
3648 | |||
3649 | array = rq->active; | ||
3650 | if (unlikely(!array->nr_active)) { | ||
3651 | /* | 3343 | /* |
3652 | * Switch the active and expired arrays. | 3344 | * Will never be NULL as the idle class always |
3345 | * returns a non-NULL p: | ||
3653 | */ | 3346 | */ |
3654 | schedstat_inc(rq, sched_switch); | 3347 | class = class->next; |
3655 | rq->active = rq->expired; | ||
3656 | rq->expired = array; | ||
3657 | array = rq->active; | ||
3658 | rq->expired_timestamp = 0; | ||
3659 | rq->best_expired_prio = MAX_PRIO; | ||
3660 | } | 3348 | } |
3349 | } | ||
3661 | 3350 | ||
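pick_next_task() above walks the scheduling classes in priority order and returns the first task any class offers; the fast path calls the fair class directly when every runnable task belongs to CFS, and the idle class at the tail guarantees termination. A toy user-space sketch of that dispatch pattern (illustrative types and names, not the kernel's sched_class API):

#include <stdio.h>

/* Toy model of the class-iteration pattern in pick_next_task():
 * classes are linked highest-priority first and each either offers
 * a runnable "task" or returns NULL. Names here are illustrative. */
struct toy_task { const char *name; };

struct toy_class {
    const char *name;
    const struct toy_class *next;         /* next lower-priority class */
    struct toy_task *(*pick_next)(void);  /* NULL if nothing runnable  */
};

static struct toy_task the_idle_task = { "idle-task" };

static struct toy_task *rt_pick(void)   { return NULL; }  /* no RT work   */
static struct toy_task *fair_pick(void) { return NULL; }  /* CFS is empty */
static struct toy_task *idle_pick(void) { return &the_idle_task; }

static const struct toy_class idle_class = { "idle", NULL,        idle_pick };
static const struct toy_class fair_class = { "fair", &idle_class, fair_pick };
static const struct toy_class rt_class   = { "rt",   &fair_class, rt_pick   };

static struct toy_task *pick_next(const struct toy_class *class)
{
    /* Walk the classes in priority order; the idle class at the
     * tail always returns a task, so the loop terminates. */
    for (; class; class = class->next) {
        struct toy_task *p = class->pick_next();

        if (p) {
            printf("picked %s from the %s class\n", p->name, class->name);
            return p;
        }
    }
    return NULL;    /* unreachable while an idle class is present */
}

int main(void)
{
    pick_next(&rt_class);
    return 0;
}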
3662 | idx = sched_find_first_bit(array->bitmap); | 3351 | /* |
3663 | queue = array->queue + idx; | 3352 | * schedule() is the main scheduler function. |
3664 | next = list_entry(queue->next, struct task_struct, run_list); | 3353 | */ |
3354 | asmlinkage void __sched schedule(void) | ||
3355 | { | ||
3356 | struct task_struct *prev, *next; | ||
3357 | long *switch_count; | ||
3358 | struct rq *rq; | ||
3359 | u64 now; | ||
3360 | int cpu; | ||
3665 | 3361 | ||
3666 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3362 | need_resched: |
3667 | unsigned long long delta = now - next->timestamp; | 3363 | preempt_disable(); |
3668 | if (unlikely((long long)(now - next->timestamp) < 0)) | 3364 | cpu = smp_processor_id(); |
3669 | delta = 0; | 3365 | rq = cpu_rq(cpu); |
3366 | rcu_qsctr_inc(cpu); | ||
3367 | prev = rq->curr; | ||
3368 | switch_count = &prev->nivcsw; | ||
3670 | 3369 | ||
3671 | if (next->sleep_type == SLEEP_INTERACTIVE) | 3370 | release_kernel_lock(prev); |
3672 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 3371 | need_resched_nonpreemptible: |
3673 | 3372 | ||
3674 | array = next->array; | 3373 | schedule_debug(prev); |
3675 | new_prio = recalc_task_prio(next, next->timestamp + delta); | ||
3676 | 3374 | ||
3677 | if (unlikely(next->prio != new_prio)) { | 3375 | spin_lock_irq(&rq->lock); |
3678 | dequeue_task(next, array); | 3376 | clear_tsk_need_resched(prev); |
3679 | next->prio = new_prio; | 3377 | |
3680 | enqueue_task(next, array); | 3378 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3379 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | ||
3380 | unlikely(signal_pending(prev)))) { | ||
3381 | prev->state = TASK_RUNNING; | ||
3382 | } else { | ||
3383 | deactivate_task(rq, prev, 1); | ||
3681 | } | 3384 | } |
3385 | switch_count = &prev->nvcsw; | ||
3682 | } | 3386 | } |
3683 | next->sleep_type = SLEEP_NORMAL; | ||
3684 | switch_tasks: | ||
3685 | if (next == rq->idle) | ||
3686 | schedstat_inc(rq, sched_goidle); | ||
3687 | prefetch(next); | ||
3688 | prefetch_stack(next); | ||
3689 | clear_tsk_need_resched(prev); | ||
3690 | rcu_qsctr_inc(task_cpu(prev)); | ||
3691 | 3387 | ||
3692 | update_cpu_clock(prev, rq, now); | 3388 | if (unlikely(!rq->nr_running)) |
3389 | idle_balance(cpu, rq); | ||
3693 | 3390 | ||
3694 | prev->sleep_avg -= run_time; | 3391 | now = __rq_clock(rq); |
3695 | if ((long)prev->sleep_avg <= 0) | 3392 | prev->sched_class->put_prev_task(rq, prev, now); |
3696 | prev->sleep_avg = 0; | 3393 | next = pick_next_task(rq, prev, now); |
3697 | prev->timestamp = prev->last_ran = now; | ||
3698 | 3394 | ||
3699 | sched_info_switch(prev, next); | 3395 | sched_info_switch(prev, next); |
3396 | |||
3700 | if (likely(prev != next)) { | 3397 | if (likely(prev != next)) { |
3701 | next->timestamp = next->last_ran = now; | ||
3702 | rq->nr_switches++; | 3398 | rq->nr_switches++; |
3703 | rq->curr = next; | 3399 | rq->curr = next; |
3704 | ++*switch_count; | 3400 | ++*switch_count; |
3705 | 3401 | ||
3706 | prepare_task_switch(rq, next); | 3402 | context_switch(rq, prev, next); /* unlocks the rq */ |
3707 | prev = context_switch(rq, prev, next); | ||
3708 | barrier(); | ||
3709 | /* | ||
3710 | * this_rq must be evaluated again because prev may have moved | ||
3711 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
3712 | * frame will be invalid. | ||
3713 | */ | ||
3714 | finish_task_switch(this_rq(), prev); | ||
3715 | } else | 3403 | } else |
3716 | spin_unlock_irq(&rq->lock); | 3404 | spin_unlock_irq(&rq->lock); |
3717 | 3405 | ||
3718 | prev = current; | 3406 | if (unlikely(reacquire_kernel_lock(current) < 0)) { |
3719 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | 3407 | cpu = smp_processor_id(); |
3408 | rq = cpu_rq(cpu); | ||
3720 | goto need_resched_nonpreemptible; | 3409 | goto need_resched_nonpreemptible; |
3410 | } | ||
3721 | preempt_enable_no_resched(); | 3411 | preempt_enable_no_resched(); |
3722 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3412 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3723 | goto need_resched; | 3413 | goto need_resched; |
@@ -4045,74 +3735,85 @@ out: | |||
4045 | } | 3735 | } |
4046 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3736 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
4047 | 3737 | ||
4048 | 3738 | static inline void | |
4049 | #define SLEEP_ON_VAR \ | 3739 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) |
4050 | unsigned long flags; \ | 3740 | { |
4051 | wait_queue_t wait; \ | 3741 | spin_lock_irqsave(&q->lock, *flags); |
4052 | init_waitqueue_entry(&wait, current); | 3742 | __add_wait_queue(q, wait); |
4053 | |||
4054 | #define SLEEP_ON_HEAD \ | ||
4055 | spin_lock_irqsave(&q->lock,flags); \ | ||
4056 | __add_wait_queue(q, &wait); \ | ||
4057 | spin_unlock(&q->lock); | 3743 | spin_unlock(&q->lock); |
3744 | } | ||
4058 | 3745 | ||
4059 | #define SLEEP_ON_TAIL \ | 3746 | static inline void |
4060 | spin_lock_irq(&q->lock); \ | 3747 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) |
4061 | __remove_wait_queue(q, &wait); \ | 3748 | { |
4062 | spin_unlock_irqrestore(&q->lock, flags); | 3749 | spin_lock_irq(&q->lock); |
3750 | __remove_wait_queue(q, wait); | ||
3751 | spin_unlock_irqrestore(&q->lock, *flags); | ||
3752 | } | ||
4063 | 3753 | ||
4064 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | 3754 | void __sched interruptible_sleep_on(wait_queue_head_t *q) |
4065 | { | 3755 | { |
4066 | SLEEP_ON_VAR | 3756 | unsigned long flags; |
3757 | wait_queue_t wait; | ||
3758 | |||
3759 | init_waitqueue_entry(&wait, current); | ||
4067 | 3760 | ||
4068 | current->state = TASK_INTERRUPTIBLE; | 3761 | current->state = TASK_INTERRUPTIBLE; |
4069 | 3762 | ||
4070 | SLEEP_ON_HEAD | 3763 | sleep_on_head(q, &wait, &flags); |
4071 | schedule(); | 3764 | schedule(); |
4072 | SLEEP_ON_TAIL | 3765 | sleep_on_tail(q, &wait, &flags); |
4073 | } | 3766 | } |
4074 | EXPORT_SYMBOL(interruptible_sleep_on); | 3767 | EXPORT_SYMBOL(interruptible_sleep_on); |
4075 | 3768 | ||
4076 | long fastcall __sched | 3769 | long __sched |
4077 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3770 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4078 | { | 3771 | { |
4079 | SLEEP_ON_VAR | 3772 | unsigned long flags; |
3773 | wait_queue_t wait; | ||
3774 | |||
3775 | init_waitqueue_entry(&wait, current); | ||
4080 | 3776 | ||
4081 | current->state = TASK_INTERRUPTIBLE; | 3777 | current->state = TASK_INTERRUPTIBLE; |
4082 | 3778 | ||
4083 | SLEEP_ON_HEAD | 3779 | sleep_on_head(q, &wait, &flags); |
4084 | timeout = schedule_timeout(timeout); | 3780 | timeout = schedule_timeout(timeout); |
4085 | SLEEP_ON_TAIL | 3781 | sleep_on_tail(q, &wait, &flags); |
4086 | 3782 | ||
4087 | return timeout; | 3783 | return timeout; |
4088 | } | 3784 | } |
4089 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3785 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
4090 | 3786 | ||
4091 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3787 | void __sched sleep_on(wait_queue_head_t *q) |
4092 | { | 3788 | { |
4093 | SLEEP_ON_VAR | 3789 | unsigned long flags; |
3790 | wait_queue_t wait; | ||
3791 | |||
3792 | init_waitqueue_entry(&wait, current); | ||
4094 | 3793 | ||
4095 | current->state = TASK_UNINTERRUPTIBLE; | 3794 | current->state = TASK_UNINTERRUPTIBLE; |
4096 | 3795 | ||
4097 | SLEEP_ON_HEAD | 3796 | sleep_on_head(q, &wait, &flags); |
4098 | schedule(); | 3797 | schedule(); |
4099 | SLEEP_ON_TAIL | 3798 | sleep_on_tail(q, &wait, &flags); |
4100 | } | 3799 | } |
4101 | EXPORT_SYMBOL(sleep_on); | 3800 | EXPORT_SYMBOL(sleep_on); |
4102 | 3801 | ||
4103 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3802 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4104 | { | 3803 | { |
4105 | SLEEP_ON_VAR | 3804 | unsigned long flags; |
3805 | wait_queue_t wait; | ||
3806 | |||
3807 | init_waitqueue_entry(&wait, current); | ||
4106 | 3808 | ||
4107 | current->state = TASK_UNINTERRUPTIBLE; | 3809 | current->state = TASK_UNINTERRUPTIBLE; |
4108 | 3810 | ||
4109 | SLEEP_ON_HEAD | 3811 | sleep_on_head(q, &wait, &flags); |
4110 | timeout = schedule_timeout(timeout); | 3812 | timeout = schedule_timeout(timeout); |
4111 | SLEEP_ON_TAIL | 3813 | sleep_on_tail(q, &wait, &flags); |
4112 | 3814 | ||
4113 | return timeout; | 3815 | return timeout; |
4114 | } | 3816 | } |
4115 | |||
4116 | EXPORT_SYMBOL(sleep_on_timeout); | 3817 | EXPORT_SYMBOL(sleep_on_timeout); |
4117 | 3818 | ||
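The SLEEP_ON_* macros above become ordinary helpers, but the protocol is unchanged: register on the wait queue, drop the lock, sleep, re-acquire the lock, unregister. As a rough user-space analogy (POSIX threads, not the kernel wait-queue API), pthread_cond_wait() performs the same enqueue/drop-lock/sleep/re-acquire dance internally:

#include <pthread.h>
#include <stdio.h>

/* User-space analogy for the sleep_on_head()/sleep_on_tail() pattern:
 * pthread_cond_wait() enqueues the waiter, drops the lock, sleeps and
 * re-acquires the lock, which the kernel helpers spell out explicitly
 * around schedule(). Toy example only. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int event_ready;

static void *waiter(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!event_ready)                 /* recheck after every wakeup */
        pthread_cond_wait(&cond, &lock);
    printf("waiter: woken up\n");
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, waiter, NULL);

    pthread_mutex_lock(&lock);
    event_ready = 1;                     /* produce the event */
    pthread_cond_signal(&cond);          /* wake one waiter   */
    pthread_mutex_unlock(&lock);

    pthread_join(tid, NULL);
    return 0;
}

Unlike the predicate recheck in this loop, sleep_on() tests no condition after waking, which is one reason these interfaces are generally avoided in new kernel code.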
4118 | #ifdef CONFIG_RT_MUTEXES | 3819 | #ifdef CONFIG_RT_MUTEXES |
@@ -4129,29 +3830,30 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4129 | */ | 3830 | */ |
4130 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3831 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4131 | { | 3832 | { |
4132 | struct prio_array *array; | ||
4133 | unsigned long flags; | 3833 | unsigned long flags; |
3834 | int oldprio, on_rq; | ||
4134 | struct rq *rq; | 3835 | struct rq *rq; |
4135 | int oldprio; | 3836 | u64 now; |
4136 | 3837 | ||
4137 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3838 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4138 | 3839 | ||
4139 | rq = task_rq_lock(p, &flags); | 3840 | rq = task_rq_lock(p, &flags); |
3841 | now = rq_clock(rq); | ||
4140 | 3842 | ||
4141 | oldprio = p->prio; | 3843 | oldprio = p->prio; |
4142 | array = p->array; | 3844 | on_rq = p->se.on_rq; |
4143 | if (array) | 3845 | if (on_rq) |
4144 | dequeue_task(p, array); | 3846 | dequeue_task(rq, p, 0, now); |
3847 | |||
3848 | if (rt_prio(prio)) | ||
3849 | p->sched_class = &rt_sched_class; | ||
3850 | else | ||
3851 | p->sched_class = &fair_sched_class; | ||
3852 | |||
4145 | p->prio = prio; | 3853 | p->prio = prio; |
4146 | 3854 | ||
4147 | if (array) { | 3855 | if (on_rq) { |
4148 | /* | 3856 | enqueue_task(rq, p, 0, now); |
4149 | * If changing to an RT priority then queue it | ||
4150 | * in the active array! | ||
4151 | */ | ||
4152 | if (rt_task(p)) | ||
4153 | array = rq->active; | ||
4154 | enqueue_task(p, array); | ||
4155 | /* | 3857 | /* |
4156 | * Reschedule if we are currently running on this runqueue and | 3858 | * Reschedule if we are currently running on this runqueue and |
4157 | * our priority decreased, or if we are not currently running on | 3859 | * our priority decreased, or if we are not currently running on |
@@ -4160,8 +3862,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4160 | if (task_running(rq, p)) { | 3862 | if (task_running(rq, p)) { |
4161 | if (p->prio > oldprio) | 3863 | if (p->prio > oldprio) |
4162 | resched_task(rq->curr); | 3864 | resched_task(rq->curr); |
4163 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 3865 | } else { |
4164 | resched_task(rq->curr); | 3866 | check_preempt_curr(rq, p); |
3867 | } | ||
4165 | } | 3868 | } |
4166 | task_rq_unlock(rq, &flags); | 3869 | task_rq_unlock(rq, &flags); |
4167 | } | 3870 | } |
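rt_mutex_setprio() now follows the pattern that recurs throughout this patch: if the task is queued, dequeue it, change the attribute (here the priority and possibly the scheduling class), re-enqueue it, then check whether a reschedule is due. A minimal sketch of that dequeue/modify/enqueue discipline on a toy sorted list (illustrative only, not the kernel's queue code):

#include <stdio.h>

/* Toy illustration of the dequeue/modify/enqueue discipline used by
 * rt_mutex_setprio() and set_user_nice(): a queued element must be
 * removed before its ordering key changes, then re-inserted so the
 * queue invariant still holds. Lower number = higher priority. */
struct node {
    int prio;
    const char *name;
    struct node *next;
};

static void enqueue(struct node **head, struct node *n)
{
    while (*head && (*head)->prio <= n->prio)
        head = &(*head)->next;
    n->next = *head;
    *head = n;
}

static void dequeue(struct node **head, struct node *n)
{
    while (*head && *head != n)
        head = &(*head)->next;
    if (*head)
        *head = n->next;
}

static void set_prio(struct node **head, struct node *n, int prio)
{
    dequeue(head, n);  /* must come off the queue first            */
    n->prio = prio;    /* now it is safe to change the key         */
    enqueue(head, n);  /* re-insert at the position the key demands */
}

int main(void)
{
    struct node a = { 120, "a", NULL }, b = { 110, "b", NULL };
    struct node *head = NULL, *it;

    enqueue(&head, &a);
    enqueue(&head, &b);
    set_prio(&head, &a, 100);  /* "boost" a above b */

    for (it = head; it; it = it->next)
        printf("%s (prio %d)\n", it->name, it->prio);
    return 0;
}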
@@ -4170,10 +3873,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4170 | 3873 | ||
4171 | void set_user_nice(struct task_struct *p, long nice) | 3874 | void set_user_nice(struct task_struct *p, long nice) |
4172 | { | 3875 | { |
4173 | struct prio_array *array; | 3876 | int old_prio, delta, on_rq; |
4174 | int old_prio, delta; | ||
4175 | unsigned long flags; | 3877 | unsigned long flags; |
4176 | struct rq *rq; | 3878 | struct rq *rq; |
3879 | u64 now; | ||
4177 | 3880 | ||
4178 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3881 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
4179 | return; | 3882 | return; |
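set_user_nice() rejects out-of-range values and then maps the nice level to a static priority with NICE_TO_PRIO(). A worked example of that mapping, assuming the conventional MAX_RT_PRIO of 100 (so static priorities run 100..139, with 120 for nice 0); check the headers of the tree you are reading:

#include <stdio.h>

/* Worked example of the nice -> static_prio mapping used by
 * set_user_nice(). MAX_RT_PRIO = 100 is the usual value but is an
 * assumption here, not taken from this diff. */
#define MAX_RT_PRIO             100
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
    int nice;

    for (nice = -20; nice <= 19; nice += 13)
        printf("nice %3d -> static_prio %d\n", nice, NICE_TO_PRIO(nice));

    /* Default task: nice 0 corresponds to static_prio 120. */
    printf("prio 120 back to nice: %d\n", PRIO_TO_NICE(120));
    return 0;
}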
@@ -4182,20 +3885,21 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4182 | * the task might be in the middle of scheduling on another CPU. | 3885 | * the task might be in the middle of scheduling on another CPU. |
4183 | */ | 3886 | */ |
4184 | rq = task_rq_lock(p, &flags); | 3887 | rq = task_rq_lock(p, &flags); |
3888 | now = rq_clock(rq); | ||
4185 | /* | 3889 | /* |
4186 | * The RT priorities are set via sched_setscheduler(), but we still | 3890 | * The RT priorities are set via sched_setscheduler(), but we still |
4187 | * allow the 'normal' nice value to be set - but as expected | 3891 | * allow the 'normal' nice value to be set - but as expected |
4188 | * it won't have any effect on scheduling until the task is | 3892 | * it won't have any effect on scheduling until the task is |
4189 | * not SCHED_NORMAL/SCHED_BATCH: | 3893 | * SCHED_FIFO/SCHED_RR: |
4190 | */ | 3894 | */ |
4191 | if (has_rt_policy(p)) { | 3895 | if (task_has_rt_policy(p)) { |
4192 | p->static_prio = NICE_TO_PRIO(nice); | 3896 | p->static_prio = NICE_TO_PRIO(nice); |
4193 | goto out_unlock; | 3897 | goto out_unlock; |
4194 | } | 3898 | } |
4195 | array = p->array; | 3899 | on_rq = p->se.on_rq; |
4196 | if (array) { | 3900 | if (on_rq) { |
4197 | dequeue_task(p, array); | 3901 | dequeue_task(rq, p, 0, now); |
4198 | dec_raw_weighted_load(rq, p); | 3902 | dec_load(rq, p, now); |
4199 | } | 3903 | } |
4200 | 3904 | ||
4201 | p->static_prio = NICE_TO_PRIO(nice); | 3905 | p->static_prio = NICE_TO_PRIO(nice); |
@@ -4204,9 +3908,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4204 | p->prio = effective_prio(p); | 3908 | p->prio = effective_prio(p); |
4205 | delta = p->prio - old_prio; | 3909 | delta = p->prio - old_prio; |
4206 | 3910 | ||
4207 | if (array) { | 3911 | if (on_rq) { |
4208 | enqueue_task(p, array); | 3912 | enqueue_task(rq, p, 0, now); |
4209 | inc_raw_weighted_load(rq, p); | 3913 | inc_load(rq, p, now); |
4210 | /* | 3914 | /* |
4211 | * If the task increased its priority or is running and | 3915 | * If the task increased its priority or is running and |
4212 | * lowered its priority, then reschedule its CPU: | 3916 | * lowered its priority, then reschedule its CPU: |
@@ -4326,20 +4030,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) | |||
4326 | } | 4030 | } |
4327 | 4031 | ||
4328 | /* Actually do priority change: must hold rq lock. */ | 4032 | /* Actually do priority change: must hold rq lock. */ |
4329 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4033 | static void |
4034 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | ||
4330 | { | 4035 | { |
4331 | BUG_ON(p->array); | 4036 | BUG_ON(p->se.on_rq); |
4332 | 4037 | ||
4333 | p->policy = policy; | 4038 | p->policy = policy; |
4039 | switch (p->policy) { | ||
4040 | case SCHED_NORMAL: | ||
4041 | case SCHED_BATCH: | ||
4042 | case SCHED_IDLE: | ||
4043 | p->sched_class = &fair_sched_class; | ||
4044 | break; | ||
4045 | case SCHED_FIFO: | ||
4046 | case SCHED_RR: | ||
4047 | p->sched_class = &rt_sched_class; | ||
4048 | break; | ||
4049 | } | ||
4050 | |||
4334 | p->rt_priority = prio; | 4051 | p->rt_priority = prio; |
4335 | p->normal_prio = normal_prio(p); | 4052 | p->normal_prio = normal_prio(p); |
4336 | /* we are holding p->pi_lock already */ | 4053 | /* we are holding p->pi_lock already */ |
4337 | p->prio = rt_mutex_getprio(p); | 4054 | p->prio = rt_mutex_getprio(p); |
4338 | /* | ||
4339 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | ||
4340 | */ | ||
4341 | if (policy == SCHED_BATCH) | ||
4342 | p->sleep_avg = 0; | ||
4343 | set_load_weight(p); | 4055 | set_load_weight(p); |
4344 | } | 4056 | } |
4345 | 4057 | ||
@@ -4354,8 +4066,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
4354 | int sched_setscheduler(struct task_struct *p, int policy, | 4066 | int sched_setscheduler(struct task_struct *p, int policy, |
4355 | struct sched_param *param) | 4067 | struct sched_param *param) |
4356 | { | 4068 | { |
4357 | int retval, oldprio, oldpolicy = -1; | 4069 | int retval, oldprio, oldpolicy = -1, on_rq; |
4358 | struct prio_array *array; | ||
4359 | unsigned long flags; | 4070 | unsigned long flags; |
4360 | struct rq *rq; | 4071 | struct rq *rq; |
4361 | 4072 | ||
@@ -4366,27 +4077,27 @@ recheck: | |||
4366 | if (policy < 0) | 4077 | if (policy < 0) |
4367 | policy = oldpolicy = p->policy; | 4078 | policy = oldpolicy = p->policy; |
4368 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4079 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
4369 | policy != SCHED_NORMAL && policy != SCHED_BATCH) | 4080 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
4081 | policy != SCHED_IDLE) | ||
4370 | return -EINVAL; | 4082 | return -EINVAL; |
4371 | /* | 4083 | /* |
4372 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4084 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
4373 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and | 4085 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
4374 | * SCHED_BATCH is 0. | 4086 | * SCHED_BATCH and SCHED_IDLE is 0. |
4375 | */ | 4087 | */ |
4376 | if (param->sched_priority < 0 || | 4088 | if (param->sched_priority < 0 || |
4377 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4089 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
4378 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4090 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
4379 | return -EINVAL; | 4091 | return -EINVAL; |
4380 | if (is_rt_policy(policy) != (param->sched_priority != 0)) | 4092 | if (rt_policy(policy) != (param->sched_priority != 0)) |
4381 | return -EINVAL; | 4093 | return -EINVAL; |
4382 | 4094 | ||
4383 | /* | 4095 | /* |
4384 | * Allow unprivileged RT tasks to decrease priority: | 4096 | * Allow unprivileged RT tasks to decrease priority: |
4385 | */ | 4097 | */ |
4386 | if (!capable(CAP_SYS_NICE)) { | 4098 | if (!capable(CAP_SYS_NICE)) { |
4387 | if (is_rt_policy(policy)) { | 4099 | if (rt_policy(policy)) { |
4388 | unsigned long rlim_rtprio; | 4100 | unsigned long rlim_rtprio; |
4389 | unsigned long flags; | ||
4390 | 4101 | ||
4391 | if (!lock_task_sighand(p, &flags)) | 4102 | if (!lock_task_sighand(p, &flags)) |
4392 | return -ESRCH; | 4103 | return -ESRCH; |
@@ -4402,6 +4113,12 @@ recheck: | |||
4402 | param->sched_priority > rlim_rtprio) | 4113 | param->sched_priority > rlim_rtprio) |
4403 | return -EPERM; | 4114 | return -EPERM; |
4404 | } | 4115 | } |
4116 | /* | ||
4117 | * Like positive nice levels, don't allow tasks to | ||
4118 | * move out of SCHED_IDLE either: | ||
4119 | */ | ||
4120 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | ||
4121 | return -EPERM; | ||
4405 | 4122 | ||
4406 | /* can't change other user's priorities */ | 4123 | /* can't change other user's priorities */ |
4407 | if ((current->euid != p->euid) && | 4124 | if ((current->euid != p->euid) && |
@@ -4429,13 +4146,13 @@ recheck: | |||
4429 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4146 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4430 | goto recheck; | 4147 | goto recheck; |
4431 | } | 4148 | } |
4432 | array = p->array; | 4149 | on_rq = p->se.on_rq; |
4433 | if (array) | 4150 | if (on_rq) |
4434 | deactivate_task(p, rq); | 4151 | deactivate_task(rq, p, 0); |
4435 | oldprio = p->prio; | 4152 | oldprio = p->prio; |
4436 | __setscheduler(p, policy, param->sched_priority); | 4153 | __setscheduler(rq, p, policy, param->sched_priority); |
4437 | if (array) { | 4154 | if (on_rq) { |
4438 | __activate_task(p, rq); | 4155 | activate_task(rq, p, 0); |
4439 | /* | 4156 | /* |
4440 | * Reschedule if we are currently running on this runqueue and | 4157 | * Reschedule if we are currently running on this runqueue and |
4441 | * our priority decreased, or if we are not currently running on | 4158 | * our priority decreased, or if we are not currently running on |
@@ -4444,8 +4161,9 @@ recheck: | |||
4444 | if (task_running(rq, p)) { | 4161 | if (task_running(rq, p)) { |
4445 | if (p->prio > oldprio) | 4162 | if (p->prio > oldprio) |
4446 | resched_task(rq->curr); | 4163 | resched_task(rq->curr); |
4447 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4164 | } else { |
4448 | resched_task(rq->curr); | 4165 | check_preempt_curr(rq, p); |
4166 | } | ||
4449 | } | 4167 | } |
4450 | __task_rq_unlock(rq); | 4168 | __task_rq_unlock(rq); |
4451 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4169 | spin_unlock_irqrestore(&p->pi_lock, flags); |
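The validation earlier in sched_setscheduler() encodes the user-visible contract: SCHED_FIFO/SCHED_RR need a priority in 1..MAX_USER_RT_PRIO-1 (plus privilege or a matching RLIMIT_RTPRIO), SCHED_NORMAL, SCHED_BATCH and the new SCHED_IDLE must pass 0, and an unprivileged task cannot leave SCHED_IDLE again. A small user-space probe of those rules through the glibc wrappers (SCHED_BATCH/SCHED_IDLE need _GNU_SOURCE, and SCHED_IDLE is only known to kernels with this patch series or later):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Probe the policy/priority contract enforced by sched_setscheduler().
 * The RT policies need CAP_SYS_NICE (e.g. run as root); the non-RT
 * policies must be requested with priority 0. */
static void try_policy(const char *name, int policy, int prio)
{
    struct sched_param sp = { .sched_priority = prio };

    if (sched_setscheduler(0, policy, &sp) == 0)
        printf("%-12s prio %2d: ok (valid range %d..%d)\n", name, prio,
               sched_get_priority_min(policy),
               sched_get_priority_max(policy));
    else
        printf("%-12s prio %2d: %s\n", name, prio, strerror(errno));
}

int main(void)
{
    try_policy("SCHED_FIFO",  SCHED_FIFO,  10); /* RT: needs 1..99 + privilege */
    try_policy("SCHED_OTHER", SCHED_OTHER,  0); /* non-RT: priority must be 0  */
    try_policy("SCHED_BATCH", SCHED_BATCH,  0);
#ifdef SCHED_IDLE
    try_policy("SCHED_IDLE",  SCHED_IDLE,   0); /* new with this patch series  */
#endif
    return 0;
}

Run unprivileged, the SCHED_FIFO request should fail with EPERM while the others succeed; run as root, all four should succeed.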
@@ -4717,41 +4435,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
4717 | /** | 4435 | /** |
4718 | * sys_sched_yield - yield the current processor to other threads. | 4436 | * sys_sched_yield - yield the current processor to other threads. |
4719 | * | 4437 | * |
4720 | * This function yields the current CPU by moving the calling thread | 4438 | * This function yields the current CPU to other tasks. If there are no |
4721 | * to the expired array. If there are no other threads running on this | 4439 | * other threads running on this CPU then this function will return. |
4722 | * CPU then this function will return. | ||
4723 | */ | 4440 | */ |
4724 | asmlinkage long sys_sched_yield(void) | 4441 | asmlinkage long sys_sched_yield(void) |
4725 | { | 4442 | { |
4726 | struct rq *rq = this_rq_lock(); | 4443 | struct rq *rq = this_rq_lock(); |
4727 | struct prio_array *array = current->array, *target = rq->expired; | ||
4728 | 4444 | ||
4729 | schedstat_inc(rq, yld_cnt); | 4445 | schedstat_inc(rq, yld_cnt); |
4730 | /* | 4446 | if (unlikely(rq->nr_running == 1)) |
4731 | * We implement yielding by moving the task into the expired | ||
4732 | * queue. | ||
4733 | * | ||
4734 | * (special rule: RT tasks will just roundrobin in the active | ||
4735 | * array.) | ||
4736 | */ | ||
4737 | if (rt_task(current)) | ||
4738 | target = rq->active; | ||
4739 | |||
4740 | if (array->nr_active == 1) { | ||
4741 | schedstat_inc(rq, yld_act_empty); | 4447 | schedstat_inc(rq, yld_act_empty); |
4742 | if (!rq->expired->nr_active) | 4448 | else |
4743 | schedstat_inc(rq, yld_both_empty); | 4449 | current->sched_class->yield_task(rq, current); |
4744 | } else if (!rq->expired->nr_active) | ||
4745 | schedstat_inc(rq, yld_exp_empty); | ||
4746 | |||
4747 | if (array != target) { | ||
4748 | dequeue_task(current, array); | ||
4749 | enqueue_task(current, target); | ||
4750 | } else | ||
4751 | /* | ||
4752 | * requeue_task is cheaper so perform that if possible. | ||
4753 | */ | ||
4754 | requeue_task(current, array); | ||
4755 | 4450 | ||
4756 | /* | 4451 | /* |
4757 | * Since we are going to call schedule() anyway, there's | 4452 | * Since we are going to call schedule() anyway, there's |
@@ -4902,6 +4597,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) | |||
4902 | break; | 4597 | break; |
4903 | case SCHED_NORMAL: | 4598 | case SCHED_NORMAL: |
4904 | case SCHED_BATCH: | 4599 | case SCHED_BATCH: |
4600 | case SCHED_IDLE: | ||
4905 | ret = 0; | 4601 | ret = 0; |
4906 | break; | 4602 | break; |
4907 | } | 4603 | } |
@@ -4926,6 +4622,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
4926 | break; | 4622 | break; |
4927 | case SCHED_NORMAL: | 4623 | case SCHED_NORMAL: |
4928 | case SCHED_BATCH: | 4624 | case SCHED_BATCH: |
4625 | case SCHED_IDLE: | ||
4929 | ret = 0; | 4626 | ret = 0; |
4930 | } | 4627 | } |
4931 | return ret; | 4628 | return ret; |
@@ -4960,7 +4657,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4960 | goto out_unlock; | 4657 | goto out_unlock; |
4961 | 4658 | ||
4962 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4659 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4963 | 0 : task_timeslice(p), &t); | 4660 | 0 : static_prio_timeslice(p->static_prio), &t); |
4964 | read_unlock(&tasklist_lock); | 4661 | read_unlock(&tasklist_lock); |
4965 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4662 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4966 | out_nounlock: | 4663 | out_nounlock: |
@@ -4980,14 +4677,14 @@ static void show_task(struct task_struct *p) | |||
4980 | state = p->state ? __ffs(p->state) + 1 : 0; | 4677 | state = p->state ? __ffs(p->state) + 1 : 0; |
4981 | printk("%-13.13s %c", p->comm, | 4678 | printk("%-13.13s %c", p->comm, |
4982 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 4679 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
4983 | #if (BITS_PER_LONG == 32) | 4680 | #if BITS_PER_LONG == 32 |
4984 | if (state == TASK_RUNNING) | 4681 | if (state == TASK_RUNNING) |
4985 | printk(" running "); | 4682 | printk(" running "); |
4986 | else | 4683 | else |
4987 | printk(" %08lX ", thread_saved_pc(p)); | 4684 | printk(" %08lx ", thread_saved_pc(p)); |
4988 | #else | 4685 | #else |
4989 | if (state == TASK_RUNNING) | 4686 | if (state == TASK_RUNNING) |
4990 | printk(" running task "); | 4687 | printk(" running task "); |
4991 | else | 4688 | else |
4992 | printk(" %016lx ", thread_saved_pc(p)); | 4689 | printk(" %016lx ", thread_saved_pc(p)); |
4993 | #endif | 4690 | #endif |
@@ -4999,11 +4696,7 @@ static void show_task(struct task_struct *p) | |||
4999 | free = (unsigned long)n - (unsigned long)end_of_stack(p); | 4696 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
5000 | } | 4697 | } |
5001 | #endif | 4698 | #endif |
5002 | printk("%5lu %5d %6d", free, p->pid, p->parent->pid); | 4699 | printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); |
5003 | if (!p->mm) | ||
5004 | printk(" (L-TLB)\n"); | ||
5005 | else | ||
5006 | printk(" (NOTLB)\n"); | ||
5007 | 4700 | ||
5008 | if (state != TASK_RUNNING) | 4701 | if (state != TASK_RUNNING) |
5009 | show_stack(p, NULL); | 4702 | show_stack(p, NULL); |
@@ -5013,14 +4706,12 @@ void show_state_filter(unsigned long state_filter) | |||
5013 | { | 4706 | { |
5014 | struct task_struct *g, *p; | 4707 | struct task_struct *g, *p; |
5015 | 4708 | ||
5016 | #if (BITS_PER_LONG == 32) | 4709 | #if BITS_PER_LONG == 32 |
5017 | printk("\n" | 4710 | printk(KERN_INFO |
5018 | " free sibling\n"); | 4711 | " task PC stack pid father\n"); |
5019 | printk(" task PC stack pid father child younger older\n"); | ||
5020 | #else | 4712 | #else |
5021 | printk("\n" | 4713 | printk(KERN_INFO |
5022 | " free sibling\n"); | 4714 | " task PC stack pid father\n"); |
5023 | printk(" task PC stack pid father child younger older\n"); | ||
5024 | #endif | 4715 | #endif |
5025 | read_lock(&tasklist_lock); | 4716 | read_lock(&tasklist_lock); |
5026 | do_each_thread(g, p) { | 4717 | do_each_thread(g, p) { |
@@ -5035,6 +4726,9 @@ void show_state_filter(unsigned long state_filter) | |||
5035 | 4726 | ||
5036 | touch_all_softlockup_watchdogs(); | 4727 | touch_all_softlockup_watchdogs(); |
5037 | 4728 | ||
4729 | #ifdef CONFIG_SCHED_DEBUG | ||
4730 | sysrq_sched_debug_show(); | ||
4731 | #endif | ||
5038 | read_unlock(&tasklist_lock); | 4732 | read_unlock(&tasklist_lock); |
5039 | /* | 4733 | /* |
5040 | * Only show locks if all tasks are dumped: | 4734 | * Only show locks if all tasks are dumped: |
@@ -5043,6 +4737,11 @@ void show_state_filter(unsigned long state_filter) | |||
5043 | debug_show_all_locks(); | 4737 | debug_show_all_locks(); |
5044 | } | 4738 | } |
5045 | 4739 | ||
4740 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | ||
4741 | { | ||
4742 | idle->sched_class = &idle_sched_class; | ||
4743 | } | ||
4744 | |||
5046 | /** | 4745 | /** |
5047 | * init_idle - set up an idle thread for a given CPU | 4746 | * init_idle - set up an idle thread for a given CPU |
5048 | * @idle: task in question | 4747 | * @idle: task in question |
@@ -5056,13 +4755,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5056 | struct rq *rq = cpu_rq(cpu); | 4755 | struct rq *rq = cpu_rq(cpu); |
5057 | unsigned long flags; | 4756 | unsigned long flags; |
5058 | 4757 | ||
5059 | idle->timestamp = sched_clock(); | 4758 | __sched_fork(idle); |
5060 | idle->sleep_avg = 0; | 4759 | idle->se.exec_start = sched_clock(); |
5061 | idle->array = NULL; | 4760 | |
5062 | idle->prio = idle->normal_prio = MAX_PRIO; | 4761 | idle->prio = idle->normal_prio = MAX_PRIO; |
5063 | idle->state = TASK_RUNNING; | ||
5064 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4762 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
5065 | set_task_cpu(idle, cpu); | 4763 | __set_task_cpu(idle, cpu); |
5066 | 4764 | ||
5067 | spin_lock_irqsave(&rq->lock, flags); | 4765 | spin_lock_irqsave(&rq->lock, flags); |
5068 | rq->curr = rq->idle = idle; | 4766 | rq->curr = rq->idle = idle; |
@@ -5077,6 +4775,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5077 | #else | 4775 | #else |
5078 | task_thread_info(idle)->preempt_count = 0; | 4776 | task_thread_info(idle)->preempt_count = 0; |
5079 | #endif | 4777 | #endif |
4778 | /* | ||
4779 | * The idle tasks have their own, simple scheduling class: | ||
4780 | */ | ||
4781 | idle->sched_class = &idle_sched_class; | ||
5080 | } | 4782 | } |
5081 | 4783 | ||
5082 | /* | 4784 | /* |
@@ -5088,6 +4790,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5088 | */ | 4790 | */ |
5089 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4791 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
5090 | 4792 | ||
4793 | /* | ||
4794 | * Increase the granularity value when there are more CPUs, | ||
4795 | * because with more CPUs the 'effective latency' as visible | ||
4796 | * to users decreases. But the relationship is not linear, | ||
4797 | * so pick a second-best guess by going with the log2 of the | ||
4798 | * number of CPUs. | ||
4799 | * | ||
4800 | * This idea comes from the SD scheduler of Con Kolivas: | ||
4801 | */ | ||
4802 | static inline void sched_init_granularity(void) | ||
4803 | { | ||
4804 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
4805 | const unsigned long gran_limit = 100000000; | ||
4806 | |||
4807 | sysctl_sched_granularity *= factor; | ||
4808 | if (sysctl_sched_granularity > gran_limit) | ||
4809 | sysctl_sched_granularity = gran_limit; | ||
4810 | |||
4811 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; | ||
4812 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | ||
4813 | } | ||
4814 | |||
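sched_init_granularity() scales the base granularity by a factor of 1 + ilog2(num_online_cpus()), clamps the result at 100 ms, and derives the runtime limit (4x) and wakeup granularity (x/2) from it. A worked example of that arithmetic; the 10 ms base below is only an assumed illustration, the real default comes from the sysctl in this tree:

#include <stdio.h>

/* Worked example of the sched_init_granularity() arithmetic.
 * BASE_GRANULARITY_NS is an assumed illustrative default, not
 * necessarily the value used by this kernel tree. */
#define BASE_GRANULARITY_NS     10000000UL      /* assumed 10 ms base */
#define GRAN_LIMIT_NS           100000000UL     /* 100 ms clamp       */

/* floor(log2(n)) for n >= 1, matching what ilog2() returns here. */
static unsigned int ilog2_u(unsigned long n)
{
    unsigned int log = 0;

    while (n >>= 1)
        log++;
    return log;
}

int main(void)
{
    unsigned long cpus[] = { 1, 2, 4, 8, 64 };
    unsigned int i;

    for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
        unsigned int factor = 1 + ilog2_u(cpus[i]);
        unsigned long gran = BASE_GRANULARITY_NS * factor;

        if (gran > GRAN_LIMIT_NS)
            gran = GRAN_LIMIT_NS;

        printf("%3lu CPUs: factor %u, granularity %lu ns, "
               "runtime_limit %lu ns, wakeup_granularity %lu ns\n",
               cpus[i], factor, gran, gran * 4, gran / 2);
    }
    return 0;
}

For example, 8 online CPUs give a factor of 4, i.e. 40 ms granularity, a 160 ms runtime limit and a 20 ms wakeup granularity under the assumed base.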
5091 | #ifdef CONFIG_SMP | 4815 | #ifdef CONFIG_SMP |
5092 | /* | 4816 | /* |
5093 | * This is how migration works: | 4817 | * This is how migration works: |
@@ -5161,7 +4885,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
5161 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4885 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
5162 | { | 4886 | { |
5163 | struct rq *rq_dest, *rq_src; | 4887 | struct rq *rq_dest, *rq_src; |
5164 | int ret = 0; | 4888 | int ret = 0, on_rq; |
5165 | 4889 | ||
5166 | if (unlikely(cpu_is_offline(dest_cpu))) | 4890 | if (unlikely(cpu_is_offline(dest_cpu))) |
5167 | return ret; | 4891 | return ret; |
@@ -5177,20 +4901,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5177 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4901 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5178 | goto out; | 4902 | goto out; |
5179 | 4903 | ||
4904 | on_rq = p->se.on_rq; | ||
4905 | if (on_rq) | ||
4906 | deactivate_task(rq_src, p, 0); | ||
5180 | set_task_cpu(p, dest_cpu); | 4907 | set_task_cpu(p, dest_cpu); |
5181 | if (p->array) { | 4908 | if (on_rq) { |
5182 | /* | 4909 | activate_task(rq_dest, p, 0); |
5183 | * Sync timestamp with rq_dest's before activating. | 4910 | check_preempt_curr(rq_dest, p); |
5184 | * The same thing could be achieved by doing this step | ||
5185 | * afterwards, and pretending it was a local activate. | ||
5186 | * This way is cleaner and logically correct. | ||
5187 | */ | ||
5188 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp | ||
5189 | + rq_dest->most_recent_timestamp; | ||
5190 | deactivate_task(p, rq_src); | ||
5191 | __activate_task(p, rq_dest); | ||
5192 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
5193 | resched_task(rq_dest->curr); | ||
5194 | } | 4911 | } |
5195 | ret = 1; | 4912 | ret = 1; |
5196 | out: | 4913 | out: |
@@ -5216,8 +4933,6 @@ static int migration_thread(void *data) | |||
5216 | struct migration_req *req; | 4933 | struct migration_req *req; |
5217 | struct list_head *head; | 4934 | struct list_head *head; |
5218 | 4935 | ||
5219 | try_to_freeze(); | ||
5220 | |||
5221 | spin_lock_irq(&rq->lock); | 4936 | spin_lock_irq(&rq->lock); |
5222 | 4937 | ||
5223 | if (cpu_is_offline(cpu)) { | 4938 | if (cpu_is_offline(cpu)) { |
@@ -5342,7 +5057,8 @@ static void migrate_live_tasks(int src_cpu) | |||
5342 | write_unlock_irq(&tasklist_lock); | 5057 | write_unlock_irq(&tasklist_lock); |
5343 | } | 5058 | } |
5344 | 5059 | ||
5345 | /* Schedules idle task to be the next runnable task on current CPU. | 5060 | /* |
5061 | * Schedules idle task to be the next runnable task on current CPU. | ||
5346 | * It does so by boosting its priority to highest possible and adding it to | 5062 | * It does so by boosting its priority to highest possible and adding it to |
5347 | * the _front_ of the runqueue. Used by CPU offline code. | 5063 | * the _front_ of the runqueue. Used by CPU offline code. |
5348 | */ | 5064 | */ |
@@ -5362,10 +5078,10 @@ void sched_idle_next(void) | |||
5362 | */ | 5078 | */ |
5363 | spin_lock_irqsave(&rq->lock, flags); | 5079 | spin_lock_irqsave(&rq->lock, flags); |
5364 | 5080 | ||
5365 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5081 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5366 | 5082 | ||
5367 | /* Add idle task to the _front_ of its priority queue: */ | 5083 | /* Add idle task to the _front_ of its priority queue: */ |
5368 | __activate_idle_task(p, rq); | 5084 | activate_idle_task(p, rq); |
5369 | 5085 | ||
5370 | spin_unlock_irqrestore(&rq->lock, flags); | 5086 | spin_unlock_irqrestore(&rq->lock, flags); |
5371 | } | 5087 | } |
@@ -5415,16 +5131,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
5415 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5131 | static void migrate_dead_tasks(unsigned int dead_cpu) |
5416 | { | 5132 | { |
5417 | struct rq *rq = cpu_rq(dead_cpu); | 5133 | struct rq *rq = cpu_rq(dead_cpu); |
5418 | unsigned int arr, i; | 5134 | struct task_struct *next; |
5419 | 5135 | ||
5420 | for (arr = 0; arr < 2; arr++) { | 5136 | for ( ; ; ) { |
5421 | for (i = 0; i < MAX_PRIO; i++) { | 5137 | if (!rq->nr_running) |
5422 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5138 | break; |
5423 | 5139 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); | |
5424 | while (!list_empty(list)) | 5140 | if (!next) |
5425 | migrate_dead(dead_cpu, list_entry(list->next, | 5141 | break; |
5426 | struct task_struct, run_list)); | 5142 | migrate_dead(dead_cpu, next); |
5427 | } | ||
5428 | } | 5143 | } |
5429 | } | 5144 | } |
5430 | #endif /* CONFIG_HOTPLUG_CPU */ | 5145 | #endif /* CONFIG_HOTPLUG_CPU */ |
@@ -5448,14 +5163,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5448 | 5163 | ||
5449 | case CPU_UP_PREPARE: | 5164 | case CPU_UP_PREPARE: |
5450 | case CPU_UP_PREPARE_FROZEN: | 5165 | case CPU_UP_PREPARE_FROZEN: |
5451 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | 5166 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); |
5452 | if (IS_ERR(p)) | 5167 | if (IS_ERR(p)) |
5453 | return NOTIFY_BAD; | 5168 | return NOTIFY_BAD; |
5454 | p->flags |= PF_NOFREEZE; | ||
5455 | kthread_bind(p, cpu); | 5169 | kthread_bind(p, cpu); |
5456 | /* Must be high prio: stop_machine expects to yield to it. */ | 5170 | /* Must be high prio: stop_machine expects to yield to it. */ |
5457 | rq = task_rq_lock(p, &flags); | 5171 | rq = task_rq_lock(p, &flags); |
5458 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5172 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5459 | task_rq_unlock(rq, &flags); | 5173 | task_rq_unlock(rq, &flags); |
5460 | cpu_rq(cpu)->migration_thread = p; | 5174 | cpu_rq(cpu)->migration_thread = p; |
5461 | break; | 5175 | break; |
@@ -5486,9 +5200,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5486 | rq->migration_thread = NULL; | 5200 | rq->migration_thread = NULL; |
5487 | /* Idle task back to normal (off runqueue, low prio) */ | 5201 | /* Idle task back to normal (off runqueue, low prio) */ |
5488 | rq = task_rq_lock(rq->idle, &flags); | 5202 | rq = task_rq_lock(rq->idle, &flags); |
5489 | deactivate_task(rq->idle, rq); | 5203 | deactivate_task(rq, rq->idle, 0); |
5490 | rq->idle->static_prio = MAX_PRIO; | 5204 | rq->idle->static_prio = MAX_PRIO; |
5491 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | 5205 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
5206 | rq->idle->sched_class = &idle_sched_class; | ||
5492 | migrate_dead_tasks(cpu); | 5207 | migrate_dead_tasks(cpu); |
5493 | task_rq_unlock(rq, &flags); | 5208 | task_rq_unlock(rq, &flags); |
5494 | migrate_nr_uninterruptible(rq); | 5209 | migrate_nr_uninterruptible(rq); |
@@ -5797,483 +5512,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
5797 | 5512 | ||
5798 | #define SD_NODES_PER_DOMAIN 16 | 5513 | #define SD_NODES_PER_DOMAIN 16 |
5799 | 5514 | ||
5800 | /* | ||
5801 | * Self-tuning task migration cost measurement between source and target CPUs. | ||
5802 | * | ||
5803 | * This is done by measuring the cost of manipulating buffers of varying | ||
5804 | * sizes. For a given buffer-size here are the steps that are taken: | ||
5805 | * | ||
5806 | * 1) the source CPU reads+dirties a shared buffer | ||
5807 | * 2) the target CPU reads+dirties the same shared buffer | ||
5808 | * | ||
5809 | * We measure how long they take, in the following 4 scenarios: | ||
5810 | * | ||
5811 | * - source: CPU1, target: CPU2 | cost1 | ||
5812 | * - source: CPU2, target: CPU1 | cost2 | ||
5813 | * - source: CPU1, target: CPU1 | cost3 | ||
5814 | * - source: CPU2, target: CPU2 | cost4 | ||
5815 | * | ||
5816 | * We then calculate the cost3+cost4-cost1-cost2 difference - this is | ||
5817 | * the cost of migration. | ||
5818 | * | ||
5819 | * We then start off from a small buffer-size and iterate up to larger | ||
5820 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | ||
5821 | * doing a maximum search for the cost. (The maximum cost for a migration | ||
5822 | * normally occurs when the working set size is around the effective cache | ||
5823 | * size.) | ||
5824 | */ | ||
5825 | #define SEARCH_SCOPE 2 | ||
5826 | #define MIN_CACHE_SIZE (64*1024U) | ||
5827 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | ||
5828 | #define ITERATIONS 1 | ||
5829 | #define SIZE_THRESH 130 | ||
5830 | #define COST_THRESH 130 | ||
5831 | |||
5832 | /* | ||
5833 | * The migration cost is a function of 'domain distance'. Domain | ||
5834 | * distance is the number of steps a CPU has to iterate down its | ||
5835 | * domain tree to share a domain with the other CPU. The farther | ||
5836 | * two CPUs are from each other, the larger the distance gets. | ||
5837 | * | ||
5838 | * Note that we use the distance only to cache measurement results, | ||
5839 | * the distance value is not used numerically otherwise. When two | ||
5840 | * CPUs have the same distance it is assumed that the migration | ||
5841 | * cost is the same. (this is a simplification but quite practical) | ||
5842 | */ | ||
5843 | #define MAX_DOMAIN_DISTANCE 32 | ||
5844 | |||
5845 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5846 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = | ||
5847 | /* | ||
5848 | * Architectures may override the migration cost and thus avoid | ||
5849 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | ||
5850 | * virtualized hardware: | ||
5851 | */ | ||
5852 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | ||
5853 | CONFIG_DEFAULT_MIGRATION_COST | ||
5854 | #else | ||
5855 | -1LL | ||
5856 | #endif | ||
5857 | }; | ||
5858 | |||
5859 | /* | ||
5860 | * Allow override of migration cost - in units of microseconds. | ||
5861 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5862 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
5863 | */ | ||
5864 | static int __init migration_cost_setup(char *str) | ||
5865 | { | ||
5866 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5867 | |||
5868 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5869 | |||
5870 | printk("#ints: %d\n", ints[0]); | ||
5871 | for (i = 1; i <= ints[0]; i++) { | ||
5872 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5873 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5874 | } | ||
5875 | return 1; | ||
5876 | } | ||
5877 | |||
5878 | __setup ("migration_cost=", migration_cost_setup); | ||
5879 | |||
5880 | /* | ||
5881 | * Global multiplier (divisor) for migration-cutoff values, | ||
5882 | * in percentiles. E.g. use a value of 150 to get 1.5 times | ||
5883 | * longer cache-hot cutoff times. | ||
5884 | * | ||
5885 | * (We scale it from 100 to 128 to make long long handling easier.) | ||
5886 | */ | ||
5887 | |||
5888 | #define MIGRATION_FACTOR_SCALE 128 | ||
5889 | |||
5890 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5891 | |||
5892 | static int __init setup_migration_factor(char *str) | ||
5893 | { | ||
5894 | get_option(&str, &migration_factor); | ||
5895 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5896 | return 1; | ||
5897 | } | ||
5898 | |||
5899 | __setup("migration_factor=", setup_migration_factor); | ||
5900 | |||
5901 | /* | ||
5902 | * Estimated distance of two CPUs, measured via the number of domains | ||
5903 | * we have to pass for the two CPUs to be in the same span: | ||
5904 | */ | ||
5905 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5906 | { | ||
5907 | unsigned long distance = 0; | ||
5908 | struct sched_domain *sd; | ||
5909 | |||
5910 | for_each_domain(cpu1, sd) { | ||
5911 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5912 | if (cpu_isset(cpu2, sd->span)) | ||
5913 | return distance; | ||
5914 | distance++; | ||
5915 | } | ||
5916 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5917 | WARN_ON(1); | ||
5918 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5919 | } | ||
5920 | |||
5921 | return distance; | ||
5922 | } | ||
5923 | |||
5924 | static unsigned int migration_debug; | ||
5925 | |||
5926 | static int __init setup_migration_debug(char *str) | ||
5927 | { | ||
5928 | get_option(&str, &migration_debug); | ||
5929 | return 1; | ||
5930 | } | ||
5931 | |||
5932 | __setup("migration_debug=", setup_migration_debug); | ||
5933 | |||
5934 | /* | ||
5935 | * Maximum cache-size that the scheduler should try to measure. | ||
5936 | * Architectures with larger caches should tune this up during | ||
5937 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5938 | * bootup). | ||
5939 | */ | ||
5940 | unsigned int max_cache_size; | ||
5941 | |||
5942 | static int __init setup_max_cache_size(char *str) | ||
5943 | { | ||
5944 | get_option(&str, &max_cache_size); | ||
5945 | return 1; | ||
5946 | } | ||
5947 | |||
5948 | __setup("max_cache_size=", setup_max_cache_size); | ||
5949 | |||
5950 | /* | ||
5951 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5952 | * is the operation that is timed, so we try to generate unpredictable | ||
5953 | * cachemisses that still end up filling the L2 cache: | ||
5954 | */ | ||
5955 | static void touch_cache(void *__cache, unsigned long __size) | ||
5956 | { | ||
5957 | unsigned long size = __size / sizeof(long); | ||
5958 | unsigned long chunk1 = size / 3; | ||
5959 | unsigned long chunk2 = 2 * size / 3; | ||
5960 | unsigned long *cache = __cache; | ||
5961 | int i; | ||
5962 | |||
5963 | for (i = 0; i < size/6; i += 8) { | ||
5964 | switch (i % 6) { | ||
5965 | case 0: cache[i]++; | ||
5966 | case 1: cache[size-1-i]++; | ||
5967 | case 2: cache[chunk1-i]++; | ||
5968 | case 3: cache[chunk1+i]++; | ||
5969 | case 4: cache[chunk2-i]++; | ||
5970 | case 5: cache[chunk2+i]++; | ||
5971 | } | ||
5972 | } | ||
5973 | } | ||
5974 | |||
5975 | /* | ||
5976 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5977 | */ | ||
5978 | static unsigned long long | ||
5979 | measure_one(void *cache, unsigned long size, int source, int target) | ||
5980 | { | ||
5981 | cpumask_t mask, saved_mask; | ||
5982 | unsigned long long t0, t1, t2, t3, cost; | ||
5983 | |||
5984 | saved_mask = current->cpus_allowed; | ||
5985 | |||
5986 | /* | ||
5987 | * Flush source caches to RAM and invalidate them: | ||
5988 | */ | ||
5989 | sched_cacheflush(); | ||
5990 | |||
5991 | /* | ||
5992 | * Migrate to the source CPU: | ||
5993 | */ | ||
5994 | mask = cpumask_of_cpu(source); | ||
5995 | set_cpus_allowed(current, mask); | ||
5996 | WARN_ON(smp_processor_id() != source); | ||
5997 | |||
5998 | /* | ||
5999 | * Dirty the working set: | ||
6000 | */ | ||
6001 | t0 = sched_clock(); | ||
6002 | touch_cache(cache, size); | ||
6003 | t1 = sched_clock(); | ||
6004 | |||
6005 | /* | ||
6006 | * Migrate to the target CPU, dirty the L2 cache and access | ||
6007 | * the shared buffer. (which represents the working set | ||
6008 | * of a migrated task.) | ||
6009 | */ | ||
6010 | mask = cpumask_of_cpu(target); | ||
6011 | set_cpus_allowed(current, mask); | ||
6012 | WARN_ON(smp_processor_id() != target); | ||
6013 | |||
6014 | t2 = sched_clock(); | ||
6015 | touch_cache(cache, size); | ||
6016 | t3 = sched_clock(); | ||
6017 | |||
6018 | cost = t1-t0 + t3-t2; | ||
6019 | |||
6020 | if (migration_debug >= 2) | ||
6021 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
6022 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
6023 | /* | ||
6024 | * Flush target caches to RAM and invalidate them: | ||
6025 | */ | ||
6026 | sched_cacheflush(); | ||
6027 | |||
6028 | set_cpus_allowed(current, saved_mask); | ||
6029 | |||
6030 | return cost; | ||
6031 | } | ||
6032 | |||
6033 | /* | ||
6034 | * Measure a series of task migrations and return the average | ||
6035 | * result. Since this code runs early during bootup the system | ||
6036 | * is 'undisturbed' and the average latency makes sense. | ||
6037 | * | ||
6038 | * The algorithm in essence auto-detects the relevant cache-size, | ||
6039 | * so it will properly detect different cachesizes for different | ||
6040 | * cache-hierarchies, depending on how the CPUs are connected. | ||
6041 | * | ||
6042 | * Architectures can prime the upper limit of the search range via | ||
6043 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | ||
6044 | */ | ||
6045 | static unsigned long long | ||
6046 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
6047 | { | ||
6048 | unsigned long long cost1, cost2; | ||
6049 | int i; | ||
6050 | |||
6051 | /* | ||
6052 | * Measure the migration cost of 'size' bytes, over an | ||
6053 | * average of 10 runs: | ||
6054 | * | ||
6055 | * (We perturb the cache size by a small (0..4k) | ||
6056 | * value to compensate size/alignment related artifacts. | ||
6057 | * We also subtract the cost of the operation done on | ||
6058 | * the same CPU.) | ||
6059 | */ | ||
6060 | cost1 = 0; | ||
6061 | |||
6062 | /* | ||
6063 | * dry run, to make sure we start off cache-cold on cpu1, | ||
6064 | * and to get any vmalloc pagefaults in advance: | ||
6065 | */ | ||
6066 | measure_one(cache, size, cpu1, cpu2); | ||
6067 | for (i = 0; i < ITERATIONS; i++) | ||
6068 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); | ||
6069 | |||
6070 | measure_one(cache, size, cpu2, cpu1); | ||
6071 | for (i = 0; i < ITERATIONS; i++) | ||
6072 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); | ||
6073 | |||
6074 | /* | ||
6075 | * (We measure the non-migrating [cached] cost on both | ||
6076 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
6077 | */ | ||
6078 | cost2 = 0; | ||
6079 | |||
6080 | measure_one(cache, size, cpu1, cpu1); | ||
6081 | for (i = 0; i < ITERATIONS; i++) | ||
6082 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); | ||
6083 | |||
6084 | measure_one(cache, size, cpu2, cpu2); | ||
6085 | for (i = 0; i < ITERATIONS; i++) | ||
6086 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); | ||
6087 | |||
6088 | /* | ||
6089 | * Get the per-iteration migration cost: | ||
6090 | */ | ||
6091 | do_div(cost1, 2 * ITERATIONS); | ||
6092 | do_div(cost2, 2 * ITERATIONS); | ||
6093 | |||
6094 | return cost1 - cost2; | ||
6095 | } | ||
6096 | |||
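Concretely, if ITERATIONS is 10 (the "average of 10 runs" in the comment above), cost1 accumulates 20 cross-CPU runs (10 in each direction, each preceded by a discarded warm-up run) and cost2 accumulates 20 same-CPU runs; after the two do_div() calls both are per-run averages, so the returned value is roughly avg(migrated run) - avg(warm-cache run), i.e. the extra nanoseconds a single migration adds for this working-set size.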
6097 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
6098 | { | ||
6099 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
6100 | unsigned int max_size, size, size_found = 0; | ||
6101 | long long cost = 0, prev_cost; | ||
6102 | void *cache; | ||
6103 | |||
6104 | /* | ||
6105 | * Search from max_cache_size*5 down to 64K - the real relevant | ||
6106 | * cachesize has to lie somewhere in between. | ||
6107 | */ | ||
6108 | if (max_cache_size) { | ||
6109 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6110 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
6111 | } else { | ||
6112 | /* | ||
6113 | * Since we have no estimate of the relevant | ||
6114 | * search range, fall back to the defaults: | ||
6115 | */ | ||
6116 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
6117 | size = MIN_CACHE_SIZE; | ||
6118 | } | ||
6119 | |||
6120 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
6121 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
6122 | return 0; | ||
6123 | } | ||
6124 | |||
6125 | /* | ||
6126 | * Allocate the working set: | ||
6127 | */ | ||
6128 | cache = vmalloc(max_size); | ||
6129 | if (!cache) { | ||
6130 | printk("could not vmalloc %d bytes for cache!\n", max_size); | ||
6131 | return 1000000; /* return 1 msec on very small boxen */ | ||
6132 | } | ||
6133 | |||
6134 | while (size <= max_size) { | ||
6135 | prev_cost = cost; | ||
6136 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
6137 | |||
6138 | /* | ||
6139 | * Update the max: | ||
6140 | */ | ||
6141 | if (cost > 0) { | ||
6142 | if (max_cost < cost) { | ||
6143 | max_cost = cost; | ||
6144 | size_found = size; | ||
6145 | } | ||
6146 | } | ||
6147 | /* | ||
6148 | * Calculate average fluctuation, we use this to prevent | ||
6149 | * noise from triggering an early break out of the loop: | ||
6150 | */ | ||
6151 | fluct = abs(cost - prev_cost); | ||
6152 | avg_fluct = (avg_fluct + fluct)/2; | ||
6153 | |||
6154 | if (migration_debug) | ||
6155 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " | ||
6156 | "(%8Ld %8Ld)\n", | ||
6157 | cpu1, cpu2, size, | ||
6158 | (long)cost / 1000000, | ||
6159 | ((long)cost / 100000) % 10, | ||
6160 | (long)max_cost / 1000000, | ||
6161 | ((long)max_cost / 100000) % 10, | ||
6162 | domain_distance(cpu1, cpu2), | ||
6163 | cost, avg_fluct); | ||
6164 | |||
6165 | /* | ||
6166 | * If we iterated at least 20% past the previous maximum, | ||
6167 | * and the cost has dropped by more than 20% already, | ||
6168 | * (taking fluctuations into account) then we assume to | ||
6169 | * have found the maximum and break out of the loop early: | ||
6170 | */ | ||
6171 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
6172 | if (cost+avg_fluct <= 0 || | ||
6173 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
6174 | |||
6175 | if (migration_debug) | ||
6176 | printk("-> found max.\n"); | ||
6177 | break; | ||
6178 | } | ||
6179 | /* | ||
6180 | * Increase the cachesize in 10% steps: | ||
6181 | */ | ||
6182 | size = size * 10 / 9; | ||
6183 | } | ||
6184 | |||
6185 | if (migration_debug) | ||
6186 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
6187 | cpu1, cpu2, size_found, max_cost); | ||
6188 | |||
6189 | vfree(cache); | ||
6190 | |||
6191 | /* | ||
6192 | * A task is considered 'cache cold' if at least 2 times | ||
6193 | * the worst-case cost of migration has passed. | ||
6194 | * | ||
6195 | * (this limit is only listened to if the load-balancing | ||
6196 | * situation is 'nice' - if there is a large imbalance we | ||
6197 | * ignore it for the sake of CPU utilization and | ||
6198 | * processing fairness.) | ||
6199 | */ | ||
6200 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
6201 | } | ||
6202 | |||
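For scale: growing the probe size by 10% per step, the walk from 64K up to the default 20MB upper bound takes on the order of log(20480/64)/log(10/9), roughly 55 sizes, and each size costs about 40 measure_one() calls, so the early break-out (20% past the detected maximum, with the cost down by 20%) is what keeps this calibration from dominating boot time on larger machines.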
6203 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
6204 | { | ||
6205 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
6206 | unsigned long j0, j1, distance, max_distance = 0; | ||
6207 | struct sched_domain *sd; | ||
6208 | |||
6209 | j0 = jiffies; | ||
6210 | |||
6211 | /* | ||
6212 | * First pass - calculate the cacheflush times: | ||
6213 | */ | ||
6214 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
6215 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
6216 | if (cpu1 == cpu2) | ||
6217 | continue; | ||
6218 | distance = domain_distance(cpu1, cpu2); | ||
6219 | max_distance = max(max_distance, distance); | ||
6220 | /* | ||
6221 | * No result cached yet? | ||
6222 | */ | ||
6223 | if (migration_cost[distance] == -1LL) | ||
6224 | migration_cost[distance] = | ||
6225 | measure_migration_cost(cpu1, cpu2); | ||
6226 | } | ||
6227 | } | ||
6228 | /* | ||
6229 | * Second pass - update the sched domain hierarchy with | ||
6230 | * the new cache-hot-time estimations: | ||
6231 | */ | ||
6232 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6233 | distance = 0; | ||
6234 | for_each_domain(cpu, sd) { | ||
6235 | sd->cache_hot_time = migration_cost[distance]; | ||
6236 | distance++; | ||
6237 | } | ||
6238 | } | ||
6239 | /* | ||
6240 | * Print the matrix: | ||
6241 | */ | ||
6242 | if (migration_debug) | ||
6243 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
6244 | max_cache_size, | ||
6245 | #ifdef CONFIG_X86 | ||
6246 | cpu_khz/1000 | ||
6247 | #else | ||
6248 | -1 | ||
6249 | #endif | ||
6250 | ); | ||
6251 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { | ||
6252 | printk("migration_cost="); | ||
6253 | for (distance = 0; distance <= max_distance; distance++) { | ||
6254 | if (distance) | ||
6255 | printk(","); | ||
6256 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
6257 | } | ||
6258 | printk("\n"); | ||
6259 | } | ||
6260 | j1 = jiffies; | ||
6261 | if (migration_debug) | ||
6262 | printk("migration: %ld seconds\n", (j1-j0) / HZ); | ||
6263 | |||
6264 | /* | ||
6265 | * Move back to the original CPU. NUMA-Q gets confused | ||
6266 | * if we migrate to another quad during bootup. | ||
6267 | */ | ||
6268 | if (raw_smp_processor_id() != orig_cpu) { | ||
6269 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
6270 | saved_mask = current->cpus_allowed; | ||
6271 | |||
6272 | set_cpus_allowed(current, mask); | ||
6273 | set_cpus_allowed(current, saved_mask); | ||
6274 | } | ||
6275 | } | ||
6276 | |||
6277 | #ifdef CONFIG_NUMA | 5515 | #ifdef CONFIG_NUMA |
6278 | 5516 | ||
6279 | /** | 5517 | /** |
@@ -6574,7 +5812,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6574 | static int build_sched_domains(const cpumask_t *cpu_map) | 5812 | static int build_sched_domains(const cpumask_t *cpu_map) |
6575 | { | 5813 | { |
6576 | int i; | 5814 | int i; |
6577 | struct sched_domain *sd; | ||
6578 | #ifdef CONFIG_NUMA | 5815 | #ifdef CONFIG_NUMA |
6579 | struct sched_group **sched_group_nodes = NULL; | 5816 | struct sched_group **sched_group_nodes = NULL; |
6580 | int sd_allnodes = 0; | 5817 | int sd_allnodes = 0; |
@@ -6582,7 +5819,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6582 | /* | 5819 | /* |
6583 | * Allocate the per-node list of sched groups | 5820 | * Allocate the per-node list of sched groups |
6584 | */ | 5821 | */ |
6585 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 5822 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, |
6586 | GFP_KERNEL); | 5823 | GFP_KERNEL); |
6587 | if (!sched_group_nodes) { | 5824 | if (!sched_group_nodes) { |
6588 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 5825 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -6601,8 +5838,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6601 | cpus_and(nodemask, nodemask, *cpu_map); | 5838 | cpus_and(nodemask, nodemask, *cpu_map); |
6602 | 5839 | ||
6603 | #ifdef CONFIG_NUMA | 5840 | #ifdef CONFIG_NUMA |
6604 | if (cpus_weight(*cpu_map) | 5841 | if (cpus_weight(*cpu_map) > |
6605 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 5842 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6606 | sd = &per_cpu(allnodes_domains, i); | 5843 | sd = &per_cpu(allnodes_domains, i); |
6607 | *sd = SD_ALLNODES_INIT; | 5844 | *sd = SD_ALLNODES_INIT; |
6608 | sd->span = *cpu_map; | 5845 | sd->span = *cpu_map; |
@@ -6661,7 +5898,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6661 | if (i != first_cpu(this_sibling_map)) | 5898 | if (i != first_cpu(this_sibling_map)) |
6662 | continue; | 5899 | continue; |
6663 | 5900 | ||
6664 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); | 5901 | init_sched_build_groups(this_sibling_map, cpu_map, |
5902 | &cpu_to_cpu_group); | ||
6665 | } | 5903 | } |
6666 | #endif | 5904 | #endif |
6667 | 5905 | ||
@@ -6672,11 +5910,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6672 | cpus_and(this_core_map, this_core_map, *cpu_map); | 5910 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6673 | if (i != first_cpu(this_core_map)) | 5911 | if (i != first_cpu(this_core_map)) |
6674 | continue; | 5912 | continue; |
6675 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); | 5913 | init_sched_build_groups(this_core_map, cpu_map, |
5914 | &cpu_to_core_group); | ||
6676 | } | 5915 | } |
6677 | #endif | 5916 | #endif |
6678 | 5917 | ||
6679 | |||
6680 | /* Set up physical groups */ | 5918 | /* Set up physical groups */ |
6681 | for (i = 0; i < MAX_NUMNODES; i++) { | 5919 | for (i = 0; i < MAX_NUMNODES; i++) { |
6682 | cpumask_t nodemask = node_to_cpumask(i); | 5920 | cpumask_t nodemask = node_to_cpumask(i); |
@@ -6691,7 +5929,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6691 | #ifdef CONFIG_NUMA | 5929 | #ifdef CONFIG_NUMA |
6692 | /* Set up node groups */ | 5930 | /* Set up node groups */ |
6693 | if (sd_allnodes) | 5931 | if (sd_allnodes) |
6694 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); | 5932 | init_sched_build_groups(*cpu_map, cpu_map, |
5933 | &cpu_to_allnodes_group); | ||
6695 | 5934 | ||
6696 | for (i = 0; i < MAX_NUMNODES; i++) { | 5935 | for (i = 0; i < MAX_NUMNODES; i++) { |
6697 | /* Set up node groups */ | 5936 | /* Set up node groups */ |
@@ -6719,6 +5958,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6719 | sched_group_nodes[i] = sg; | 5958 | sched_group_nodes[i] = sg; |
6720 | for_each_cpu_mask(j, nodemask) { | 5959 | for_each_cpu_mask(j, nodemask) { |
6721 | struct sched_domain *sd; | 5960 | struct sched_domain *sd; |
5961 | |||
6722 | sd = &per_cpu(node_domains, j); | 5962 | sd = &per_cpu(node_domains, j); |
6723 | sd->groups = sg; | 5963 | sd->groups = sg; |
6724 | } | 5964 | } |
@@ -6763,19 +6003,22 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6763 | /* Calculate CPU power for physical packages and nodes */ | 6003 | /* Calculate CPU power for physical packages and nodes */ |
6764 | #ifdef CONFIG_SCHED_SMT | 6004 | #ifdef CONFIG_SCHED_SMT |
6765 | for_each_cpu_mask(i, *cpu_map) { | 6005 | for_each_cpu_mask(i, *cpu_map) { |
6766 | sd = &per_cpu(cpu_domains, i); | 6006 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
6007 | |||
6767 | init_sched_groups_power(i, sd); | 6008 | init_sched_groups_power(i, sd); |
6768 | } | 6009 | } |
6769 | #endif | 6010 | #endif |
6770 | #ifdef CONFIG_SCHED_MC | 6011 | #ifdef CONFIG_SCHED_MC |
6771 | for_each_cpu_mask(i, *cpu_map) { | 6012 | for_each_cpu_mask(i, *cpu_map) { |
6772 | sd = &per_cpu(core_domains, i); | 6013 | struct sched_domain *sd = &per_cpu(core_domains, i); |
6014 | |||
6773 | init_sched_groups_power(i, sd); | 6015 | init_sched_groups_power(i, sd); |
6774 | } | 6016 | } |
6775 | #endif | 6017 | #endif |
6776 | 6018 | ||
6777 | for_each_cpu_mask(i, *cpu_map) { | 6019 | for_each_cpu_mask(i, *cpu_map) { |
6778 | sd = &per_cpu(phys_domains, i); | 6020 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
6021 | |||
6779 | init_sched_groups_power(i, sd); | 6022 | init_sched_groups_power(i, sd); |
6780 | } | 6023 | } |
6781 | 6024 | ||
@@ -6803,10 +6046,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6803 | #endif | 6046 | #endif |
6804 | cpu_attach_domain(sd, i); | 6047 | cpu_attach_domain(sd, i); |
6805 | } | 6048 | } |
6806 | /* | ||
6807 | * Tune cache-hot values: | ||
6808 | */ | ||
6809 | calibrate_migration_costs(cpu_map); | ||
6810 | 6049 | ||
6811 | return 0; | 6050 | return 0; |
6812 | 6051 | ||
@@ -7013,10 +6252,12 @@ void __init sched_init_smp(void) | |||
7013 | /* Move init over to a non-isolated CPU */ | 6252 | /* Move init over to a non-isolated CPU */ |
7014 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6253 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
7015 | BUG(); | 6254 | BUG(); |
6255 | sched_init_granularity(); | ||
7016 | } | 6256 | } |
7017 | #else | 6257 | #else |
7018 | void __init sched_init_smp(void) | 6258 | void __init sched_init_smp(void) |
7019 | { | 6259 | { |
6260 | sched_init_granularity(); | ||
7020 | } | 6261 | } |
7021 | #endif /* CONFIG_SMP */ | 6262 | #endif /* CONFIG_SMP */ |
7022 | 6263 | ||
@@ -7030,28 +6271,51 @@ int in_sched_functions(unsigned long addr) | |||
7030 | && addr < (unsigned long)__sched_text_end); | 6271 | && addr < (unsigned long)__sched_text_end); |
7031 | } | 6272 | } |
7032 | 6273 | ||
6274 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | ||
6275 | { | ||
6276 | cfs_rq->tasks_timeline = RB_ROOT; | ||
6277 | cfs_rq->fair_clock = 1; | ||
6278 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6279 | cfs_rq->rq = rq; | ||
6280 | #endif | ||
6281 | } | ||
6282 | |||
7033 | void __init sched_init(void) | 6283 | void __init sched_init(void) |
7034 | { | 6284 | { |
7035 | int i, j, k; | 6285 | u64 now = sched_clock(); |
7036 | int highest_cpu = 0; | 6286 | int highest_cpu = 0; |
6287 | int i, j; | ||
6288 | |||
6289 | /* | ||
6290 | * Link up the scheduling class hierarchy: | ||
6291 | */ | ||
6292 | rt_sched_class.next = &fair_sched_class; | ||
6293 | fair_sched_class.next = &idle_sched_class; | ||
6294 | idle_sched_class.next = NULL; | ||
7037 | 6295 | ||
7038 | for_each_possible_cpu(i) { | 6296 | for_each_possible_cpu(i) { |
7039 | struct prio_array *array; | 6297 | struct rt_prio_array *array; |
7040 | struct rq *rq; | 6298 | struct rq *rq; |
7041 | 6299 | ||
7042 | rq = cpu_rq(i); | 6300 | rq = cpu_rq(i); |
7043 | spin_lock_init(&rq->lock); | 6301 | spin_lock_init(&rq->lock); |
7044 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 6302 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
7045 | rq->nr_running = 0; | 6303 | rq->nr_running = 0; |
7046 | rq->active = rq->arrays; | 6304 | rq->clock = 1; |
7047 | rq->expired = rq->arrays + 1; | 6305 | init_cfs_rq(&rq->cfs, rq); |
7048 | rq->best_expired_prio = MAX_PRIO; | 6306 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6307 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6308 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
6309 | #endif | ||
6310 | rq->ls.load_update_last = now; | ||
6311 | rq->ls.load_update_start = now; | ||
7049 | 6312 | ||
6313 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | ||
6314 | rq->cpu_load[j] = 0; | ||
7050 | #ifdef CONFIG_SMP | 6315 | #ifdef CONFIG_SMP |
7051 | rq->sd = NULL; | 6316 | rq->sd = NULL; |
7052 | for (j = 1; j < 3; j++) | ||
7053 | rq->cpu_load[j] = 0; | ||
7054 | rq->active_balance = 0; | 6317 | rq->active_balance = 0; |
6318 | rq->next_balance = jiffies; | ||
7055 | rq->push_cpu = 0; | 6319 | rq->push_cpu = 0; |
7056 | rq->cpu = i; | 6320 | rq->cpu = i; |
7057 | rq->migration_thread = NULL; | 6321 | rq->migration_thread = NULL; |
@@ -7059,16 +6323,14 @@ void __init sched_init(void) | |||
7059 | #endif | 6323 | #endif |
7060 | atomic_set(&rq->nr_iowait, 0); | 6324 | atomic_set(&rq->nr_iowait, 0); |
7061 | 6325 | ||
7062 | for (j = 0; j < 2; j++) { | 6326 | array = &rq->rt.active; |
7063 | array = rq->arrays + j; | 6327 | for (j = 0; j < MAX_RT_PRIO; j++) { |
7064 | for (k = 0; k < MAX_PRIO; k++) { | 6328 | INIT_LIST_HEAD(array->queue + j); |
7065 | INIT_LIST_HEAD(array->queue + k); | 6329 | __clear_bit(j, array->bitmap); |
7066 | __clear_bit(k, array->bitmap); | ||
7067 | } | ||
7068 | // delimiter for bitsearch | ||
7069 | __set_bit(MAX_PRIO, array->bitmap); | ||
7070 | } | 6330 | } |
7071 | highest_cpu = i; | 6331 | highest_cpu = i; |
6332 | /* delimiter for bitsearch: */ | ||
6333 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7072 | } | 6334 | } |
7073 | 6335 | ||
7074 | set_load_weight(&init_task); | 6336 | set_load_weight(&init_task); |
@@ -7095,6 +6357,10 @@ void __init sched_init(void) | |||
7095 | * when this runqueue becomes "idle". | 6357 | * when this runqueue becomes "idle". |
7096 | */ | 6358 | */ |
7097 | init_idle(current, smp_processor_id()); | 6359 | init_idle(current, smp_processor_id()); |
6360 | /* | ||
6361 | * During early bootup we pretend to be a normal task: | ||
6362 | */ | ||
6363 | current->sched_class = &fair_sched_class; | ||
7098 | } | 6364 | } |
7099 | 6365 | ||
7100 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6366 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
@@ -7125,29 +6391,55 @@ EXPORT_SYMBOL(__might_sleep); | |||
7125 | #ifdef CONFIG_MAGIC_SYSRQ | 6391 | #ifdef CONFIG_MAGIC_SYSRQ |
7126 | void normalize_rt_tasks(void) | 6392 | void normalize_rt_tasks(void) |
7127 | { | 6393 | { |
7128 | struct prio_array *array; | ||
7129 | struct task_struct *g, *p; | 6394 | struct task_struct *g, *p; |
7130 | unsigned long flags; | 6395 | unsigned long flags; |
7131 | struct rq *rq; | 6396 | struct rq *rq; |
6397 | int on_rq; | ||
7132 | 6398 | ||
7133 | read_lock_irq(&tasklist_lock); | 6399 | read_lock_irq(&tasklist_lock); |
7134 | |||
7135 | do_each_thread(g, p) { | 6400 | do_each_thread(g, p) { |
7136 | if (!rt_task(p)) | 6401 | p->se.fair_key = 0; |
6402 | p->se.wait_runtime = 0; | ||
6403 | p->se.wait_start_fair = 0; | ||
6404 | p->se.wait_start = 0; | ||
6405 | p->se.exec_start = 0; | ||
6406 | p->se.sleep_start = 0; | ||
6407 | p->se.sleep_start_fair = 0; | ||
6408 | p->se.block_start = 0; | ||
6409 | task_rq(p)->cfs.fair_clock = 0; | ||
6410 | task_rq(p)->clock = 0; | ||
6411 | |||
6412 | if (!rt_task(p)) { | ||
6413 | /* | ||
6414 | * Renice negative nice level userspace | ||
6415 | * tasks back to 0: | ||
6416 | */ | ||
6417 | if (TASK_NICE(p) < 0 && p->mm) | ||
6418 | set_user_nice(p, 0); | ||
7137 | continue; | 6419 | continue; |
6420 | } | ||
7138 | 6421 | ||
7139 | spin_lock_irqsave(&p->pi_lock, flags); | 6422 | spin_lock_irqsave(&p->pi_lock, flags); |
7140 | rq = __task_rq_lock(p); | 6423 | rq = __task_rq_lock(p); |
6424 | #ifdef CONFIG_SMP | ||
6425 | /* | ||
6426 | * Do not touch the migration thread: | ||
6427 | */ | ||
6428 | if (p == rq->migration_thread) | ||
6429 | goto out_unlock; | ||
6430 | #endif | ||
7141 | 6431 | ||
7142 | array = p->array; | 6432 | on_rq = p->se.on_rq; |
7143 | if (array) | 6433 | if (on_rq) |
7144 | deactivate_task(p, task_rq(p)); | 6434 | deactivate_task(task_rq(p), p, 0); |
7145 | __setscheduler(p, SCHED_NORMAL, 0); | 6435 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
7146 | if (array) { | 6436 | if (on_rq) { |
7147 | __activate_task(p, task_rq(p)); | 6437 | activate_task(task_rq(p), p, 0); |
7148 | resched_task(rq->curr); | 6438 | resched_task(rq->curr); |
7149 | } | 6439 | } |
7150 | 6440 | #ifdef CONFIG_SMP | |
6441 | out_unlock: | ||
6442 | #endif | ||
7151 | __task_rq_unlock(rq); | 6443 | __task_rq_unlock(rq); |
7152 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6444 | spin_unlock_irqrestore(&p->pi_lock, flags); |
7153 | } while_each_thread(g, p); | 6445 | } while_each_thread(g, p); |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c new file mode 100644 index 0000000000..29f2c21e7d --- /dev/null +++ b/kernel/sched_debug.c | |||
@@ -0,0 +1,275 @@ | |||
1 | /* | ||
2 | * kernel/sched_debug.c | ||
3 | * | ||
4 | * Print the CFS rbtree | ||
5 | * | ||
6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | */ | ||
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/seq_file.h> | ||
16 | #include <linux/kallsyms.h> | ||
17 | #include <linux/utsname.h> | ||
18 | |||
19 | /* | ||
20 | * This allows printing both to /proc/sched_debug and | ||
21 | * to the console | ||
22 | */ | ||
23 | #define SEQ_printf(m, x...) \ | ||
24 | do { \ | ||
25 | if (m) \ | ||
26 | seq_printf(m, x); \ | ||
27 | else \ | ||
28 | printk(x); \ | ||
29 | } while (0) | ||
30 | |||
31 | static void | ||
32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) | ||
33 | { | ||
34 | if (rq->curr == p) | ||
35 | SEQ_printf(m, "R"); | ||
36 | else | ||
37 | SEQ_printf(m, " "); | ||
38 | |||
39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " | ||
40 | "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | ||
41 | p->comm, p->pid, | ||
42 | (long long)p->se.fair_key, | ||
43 | (long long)(p->se.fair_key - rq->cfs.fair_clock), | ||
44 | (long long)p->se.wait_runtime, | ||
45 | (long long)(p->nvcsw + p->nivcsw), | ||
46 | p->prio, | ||
47 | (long long)p->se.sum_exec_runtime, | ||
48 | (long long)p->se.sum_wait_runtime, | ||
49 | (long long)p->se.sum_sleep_runtime, | ||
50 | (long long)p->se.wait_runtime_overruns, | ||
51 | (long long)p->se.wait_runtime_underruns); | ||
52 | } | ||
53 | |||
54 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) | ||
55 | { | ||
56 | struct task_struct *g, *p; | ||
57 | |||
58 | SEQ_printf(m, | ||
59 | "\nrunnable tasks:\n" | ||
60 | " task PID tree-key delta waiting" | ||
61 | " switches prio" | ||
62 | " sum-exec sum-wait sum-sleep" | ||
63 | " wait-overrun wait-underrun\n" | ||
64 | "------------------------------------------------------------------" | ||
65 | "----------------" | ||
66 | "------------------------------------------------" | ||
67 | "--------------------------------\n"); | ||
68 | |||
69 | read_lock_irq(&tasklist_lock); | ||
70 | |||
71 | do_each_thread(g, p) { | ||
72 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | ||
73 | continue; | ||
74 | |||
75 | print_task(m, rq, p, now); | ||
76 | } while_each_thread(g, p); | ||
77 | |||
78 | read_unlock_irq(&tasklist_lock); | ||
79 | } | ||
80 | |||
81 | static void | ||
82 | print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | ||
83 | { | ||
84 | s64 wait_runtime_rq_sum = 0; | ||
85 | struct task_struct *p; | ||
86 | struct rb_node *curr; | ||
87 | unsigned long flags; | ||
88 | struct rq *rq = &per_cpu(runqueues, cpu); | ||
89 | |||
90 | spin_lock_irqsave(&rq->lock, flags); | ||
91 | curr = first_fair(cfs_rq); | ||
92 | while (curr) { | ||
93 | p = rb_entry(curr, struct task_struct, se.run_node); | ||
94 | wait_runtime_rq_sum += p->se.wait_runtime; | ||
95 | |||
96 | curr = rb_next(curr); | ||
97 | } | ||
98 | spin_unlock_irqrestore(&rq->lock, flags); | ||
99 | |||
100 | SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", | ||
101 | (long long)wait_runtime_rq_sum); | ||
102 | } | ||
103 | |||
104 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) | ||
105 | { | ||
106 | SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); | ||
107 | |||
108 | #define P(x) \ | ||
109 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) | ||
110 | |||
111 | P(fair_clock); | ||
112 | P(exec_clock); | ||
113 | P(wait_runtime); | ||
114 | P(wait_runtime_overruns); | ||
115 | P(wait_runtime_underruns); | ||
116 | P(sleeper_bonus); | ||
117 | #undef P | ||
118 | |||
119 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); | ||
120 | } | ||
121 | |||
122 | static void print_cpu(struct seq_file *m, int cpu, u64 now) | ||
123 | { | ||
124 | struct rq *rq = &per_cpu(runqueues, cpu); | ||
125 | |||
126 | #ifdef CONFIG_X86 | ||
127 | { | ||
128 | unsigned int freq = cpu_khz ? : 1; | ||
129 | |||
130 | SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", | ||
131 | cpu, freq / 1000, (freq % 1000)); | ||
132 | } | ||
133 | #else | ||
134 | SEQ_printf(m, "\ncpu#%d\n", cpu); | ||
135 | #endif | ||
136 | |||
137 | #define P(x) \ | ||
138 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | ||
139 | |||
140 | P(nr_running); | ||
141 | SEQ_printf(m, " .%-30s: %lu\n", "load", | ||
142 | rq->ls.load.weight); | ||
143 | P(ls.delta_fair); | ||
144 | P(ls.delta_exec); | ||
145 | P(nr_switches); | ||
146 | P(nr_load_updates); | ||
147 | P(nr_uninterruptible); | ||
148 | SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); | ||
149 | P(next_balance); | ||
150 | P(curr->pid); | ||
151 | P(clock); | ||
152 | P(prev_clock_raw); | ||
153 | P(clock_warps); | ||
154 | P(clock_overflows); | ||
155 | P(clock_unstable_events); | ||
156 | P(clock_max_delta); | ||
157 | P(cpu_load[0]); | ||
158 | P(cpu_load[1]); | ||
159 | P(cpu_load[2]); | ||
160 | P(cpu_load[3]); | ||
161 | P(cpu_load[4]); | ||
162 | #undef P | ||
163 | |||
164 | print_cfs_stats(m, cpu, now); | ||
165 | |||
166 | print_rq(m, rq, cpu, now); | ||
167 | } | ||
168 | |||
169 | static int sched_debug_show(struct seq_file *m, void *v) | ||
170 | { | ||
171 | u64 now = ktime_to_ns(ktime_get()); | ||
172 | int cpu; | ||
173 | |||
174 | SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", | ||
175 | init_utsname()->release, | ||
176 | (int)strcspn(init_utsname()->version, " "), | ||
177 | init_utsname()->version); | ||
178 | |||
179 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); | ||
180 | |||
181 | for_each_online_cpu(cpu) | ||
182 | print_cpu(m, cpu, now); | ||
183 | |||
184 | SEQ_printf(m, "\n"); | ||
185 | |||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | void sysrq_sched_debug_show(void) | ||
190 | { | ||
191 | sched_debug_show(NULL, NULL); | ||
192 | } | ||
193 | |||
194 | static int sched_debug_open(struct inode *inode, struct file *filp) | ||
195 | { | ||
196 | return single_open(filp, sched_debug_show, NULL); | ||
197 | } | ||
198 | |||
199 | static struct file_operations sched_debug_fops = { | ||
200 | .open = sched_debug_open, | ||
201 | .read = seq_read, | ||
202 | .llseek = seq_lseek, | ||
203 | .release = seq_release, | ||
204 | }; | ||
205 | |||
206 | static int __init init_sched_debug_procfs(void) | ||
207 | { | ||
208 | struct proc_dir_entry *pe; | ||
209 | |||
210 | pe = create_proc_entry("sched_debug", 0644, NULL); | ||
211 | if (!pe) | ||
212 | return -ENOMEM; | ||
213 | |||
214 | pe->proc_fops = &sched_debug_fops; | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | __initcall(init_sched_debug_procfs); | ||
220 | |||
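Once the entry above is registered, the output can be read like any other proc file (a plain "cat /proc/sched_debug" works); a minimal C reader, shown here only as a usage sketch, would be:

	#include <stdio.h>

	int main(void)
	{
		char line[512];
		FILE *f = fopen("/proc/sched_debug", "r");

		if (!f) {
			perror("/proc/sched_debug");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* same text sysrq_sched_debug_show() prints to the console */
		fclose(f);
		return 0;
	}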
221 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | ||
222 | { | ||
223 | unsigned long flags; | ||
224 | int num_threads = 1; | ||
225 | |||
226 | rcu_read_lock(); | ||
227 | if (lock_task_sighand(p, &flags)) { | ||
228 | num_threads = atomic_read(&p->signal->count); | ||
229 | unlock_task_sighand(p, &flags); | ||
230 | } | ||
231 | rcu_read_unlock(); | ||
232 | |||
233 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); | ||
234 | SEQ_printf(m, "----------------------------------------------\n"); | ||
235 | #define P(F) \ | ||
236 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) | ||
237 | |||
238 | P(se.wait_start); | ||
239 | P(se.wait_start_fair); | ||
240 | P(se.exec_start); | ||
241 | P(se.sleep_start); | ||
242 | P(se.sleep_start_fair); | ||
243 | P(se.block_start); | ||
244 | P(se.sleep_max); | ||
245 | P(se.block_max); | ||
246 | P(se.exec_max); | ||
247 | P(se.wait_max); | ||
248 | P(se.wait_runtime); | ||
249 | P(se.wait_runtime_overruns); | ||
250 | P(se.wait_runtime_underruns); | ||
251 | P(se.sum_wait_runtime); | ||
252 | P(se.sum_exec_runtime); | ||
253 | SEQ_printf(m, "%-25s:%20Ld\n", | ||
254 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); | ||
255 | P(se.load.weight); | ||
256 | P(policy); | ||
257 | P(prio); | ||
258 | #undef P | ||
259 | |||
260 | { | ||
261 | u64 t0, t1; | ||
262 | |||
263 | t0 = sched_clock(); | ||
264 | t1 = sched_clock(); | ||
265 | SEQ_printf(m, "%-25s:%20Ld\n", | ||
266 | "clock-delta", (long long)(t1-t0)); | ||
267 | } | ||
268 | } | ||
269 | |||
270 | void proc_sched_set_task(struct task_struct *p) | ||
271 | { | ||
272 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; | ||
273 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; | ||
274 | p->se.sum_exec_runtime = 0; | ||
275 | } | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c new file mode 100644 index 0000000000..6971db0a71 --- /dev/null +++ b/kernel/sched_fair.c | |||
@@ -0,0 +1,1131 @@ | |||
1 | /* | ||
2 | * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) | ||
3 | * | ||
4 | * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Interactivity improvements by Mike Galbraith | ||
7 | * (C) 2007 Mike Galbraith <efault@gmx.de> | ||
8 | * | ||
9 | * Various enhancements by Dmitry Adamushko. | ||
10 | * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> | ||
11 | * | ||
12 | * Group scheduling enhancements by Srivatsa Vaddagiri | ||
13 | * Copyright IBM Corporation, 2007 | ||
14 | * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> | ||
15 | * | ||
16 | * Scaled math optimizations by Thomas Gleixner | ||
17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * Preemption granularity: | ||
22 | * (default: 2 msec, units: nanoseconds) | ||
23 | * | ||
24 | * NOTE: this granularity value is not the same as the concept of | ||
25 | * 'timeslice length' - timeslices in CFS will typically be somewhat | ||
26 | * larger than this value. (to see the precise effective timeslice | ||
27 | * length of your workload, run vmstat and monitor the context-switches | ||
28 | * field) | ||
29 | * | ||
30 | * On SMP systems the value of this is multiplied by 1 + the log2 of the | ||
31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | ||
32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | ||
33 | */ | ||
34 | unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; | ||
35 | |||
36 | /* | ||
37 | * SCHED_BATCH wake-up granularity. | ||
38 | * (default: 10 msec, units: nanoseconds) | ||
39 | * | ||
40 | * This option delays the preemption effects of decoupled workloads | ||
41 | * and reduces their over-scheduling. Synchronous workloads will still | ||
42 | * have immediate wakeup/sleep latencies. | ||
43 | */ | ||
44 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = | ||
45 | 10000000000ULL/HZ; | ||
46 | |||
47 | /* | ||
48 | * SCHED_OTHER wake-up granularity. | ||
49 | * (default: 1 msec, units: nanoseconds) | ||
50 | * | ||
51 | * This option delays the preemption effects of decoupled workloads | ||
52 | * and reduces their over-scheduling. Synchronous workloads will still | ||
53 | * have immediate wakeup/sleep latencies. | ||
54 | */ | ||
55 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; | ||
56 | |||
57 | unsigned int sysctl_sched_stat_granularity __read_mostly; | ||
58 | |||
59 | /* | ||
60 | * Initialized in sched_init_granularity(): | ||
61 | */ | ||
62 | unsigned int sysctl_sched_runtime_limit __read_mostly; | ||
63 | |||
64 | /* | ||
65 | * Debugging: various feature bits | ||
66 | */ | ||
67 | enum { | ||
68 | SCHED_FEAT_FAIR_SLEEPERS = 1, | ||
69 | SCHED_FEAT_SLEEPER_AVG = 2, | ||
70 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | ||
71 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | ||
72 | SCHED_FEAT_START_DEBIT = 16, | ||
73 | SCHED_FEAT_SKIP_INITIAL = 32, | ||
74 | }; | ||
75 | |||
76 | unsigned int sysctl_sched_features __read_mostly = | ||
77 | SCHED_FEAT_FAIR_SLEEPERS *1 | | ||
78 | SCHED_FEAT_SLEEPER_AVG *1 | | ||
79 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | ||
80 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | ||
81 | SCHED_FEAT_START_DEBIT *1 | | ||
82 | SCHED_FEAT_SKIP_INITIAL *0; | ||
83 | |||
84 | extern struct sched_class fair_sched_class; | ||
85 | |||
86 | /************************************************************** | ||
87 | * CFS operations on generic schedulable entities: | ||
88 | */ | ||
89 | |||
90 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
91 | |||
92 | /* cpu runqueue to which this cfs_rq is attached */ | ||
93 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
94 | { | ||
95 | return cfs_rq->rq; | ||
96 | } | ||
97 | |||
98 | /* currently running entity (if any) on this cfs_rq */ | ||
99 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
100 | { | ||
101 | return cfs_rq->curr; | ||
102 | } | ||
103 | |||
104 | /* An entity is a task if it doesn't "own" a runqueue */ | ||
105 | #define entity_is_task(se) (!se->my_q) | ||
106 | |||
107 | static inline void | ||
108 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
109 | { | ||
110 | cfs_rq->curr = se; | ||
111 | } | ||
112 | |||
113 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
114 | |||
115 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
116 | { | ||
117 | return container_of(cfs_rq, struct rq, cfs); | ||
118 | } | ||
119 | |||
120 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
121 | { | ||
122 | struct rq *rq = rq_of(cfs_rq); | ||
123 | |||
124 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | ||
125 | return NULL; | ||
126 | |||
127 | return &rq->curr->se; | ||
128 | } | ||
129 | |||
130 | #define entity_is_task(se) 1 | ||
131 | |||
132 | static inline void | ||
133 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
134 | |||
135 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
136 | |||
137 | static inline struct task_struct *task_of(struct sched_entity *se) | ||
138 | { | ||
139 | return container_of(se, struct task_struct, se); | ||
140 | } | ||
141 | |||
142 | |||
143 | /************************************************************** | ||
144 | * Scheduling class tree data structure manipulation methods: | ||
145 | */ | ||
146 | |||
147 | /* | ||
148 | * Enqueue an entity into the rb-tree: | ||
149 | */ | ||
150 | static inline void | ||
151 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
152 | { | ||
153 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | ||
154 | struct rb_node *parent = NULL; | ||
155 | struct sched_entity *entry; | ||
156 | s64 key = se->fair_key; | ||
157 | int leftmost = 1; | ||
158 | |||
159 | /* | ||
160 | * Find the right place in the rbtree: | ||
161 | */ | ||
162 | while (*link) { | ||
163 | parent = *link; | ||
164 | entry = rb_entry(parent, struct sched_entity, run_node); | ||
165 | /* | ||
166 | * We don't care about collisions. Nodes with | ||
167 | * the same key stay together. | ||
168 | */ | ||
169 | if (key - entry->fair_key < 0) { | ||
170 | link = &parent->rb_left; | ||
171 | } else { | ||
172 | link = &parent->rb_right; | ||
173 | leftmost = 0; | ||
174 | } | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Maintain a cache of leftmost tree entries (it is frequently | ||
179 | * used): | ||
180 | */ | ||
181 | if (leftmost) | ||
182 | cfs_rq->rb_leftmost = &se->run_node; | ||
183 | |||
184 | rb_link_node(&se->run_node, parent, link); | ||
185 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
186 | update_load_add(&cfs_rq->load, se->load.weight); | ||
187 | cfs_rq->nr_running++; | ||
188 | se->on_rq = 1; | ||
189 | } | ||
190 | |||
191 | static inline void | ||
192 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
193 | { | ||
194 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
195 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
196 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
197 | update_load_sub(&cfs_rq->load, se->load.weight); | ||
198 | cfs_rq->nr_running--; | ||
199 | se->on_rq = 0; | ||
200 | } | ||
201 | |||
202 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | ||
203 | { | ||
204 | return cfs_rq->rb_leftmost; | ||
205 | } | ||
206 | |||
207 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | ||
208 | { | ||
209 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | ||
210 | } | ||
211 | |||
212 | /************************************************************** | ||
213 | * Scheduling class statistics methods: | ||
214 | */ | ||
215 | |||
216 | /* | ||
217 | * We rescale the rescheduling granularity of tasks according to their | ||
218 | * nice level, but only linearly, not exponentially: | ||
219 | */ | ||
220 | static long | ||
221 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | ||
222 | { | ||
223 | u64 tmp; | ||
224 | |||
225 | /* | ||
226 | * Negative nice levels get the same granularity as nice-0: | ||
227 | */ | ||
228 | if (likely(curr->load.weight >= NICE_0_LOAD)) | ||
229 | return granularity; | ||
230 | /* | ||
231 | * Positive nice level tasks get linearly finer | ||
232 | * granularity: | ||
233 | */ | ||
234 | tmp = curr->load.weight * (u64)granularity; | ||
235 | |||
236 | /* | ||
237 | * It will always fit into 'long': | ||
238 | */ | ||
239 | return (long) (tmp >> NICE_0_SHIFT); | ||
240 | } | ||
241 | |||
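A worked example, assuming NICE_0_SHIFT is 10 so NICE_0_LOAD is 1024 (as elsewhere in this patch): with the default 2 ms granularity, a positive-nice task whose load.weight is 512 gets (512 * 2000000) >> 10, about 1 ms, i.e. half the nice-0 preemption window, while any task at or above nice-0 weight takes the early return and keeps the full 2 ms.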
242 | static inline void | ||
243 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
244 | { | ||
245 | long limit = sysctl_sched_runtime_limit; | ||
246 | |||
247 | /* | ||
248 | * Niced tasks have the same history dynamic range as | ||
249 | * non-niced tasks: | ||
250 | */ | ||
251 | if (unlikely(se->wait_runtime > limit)) { | ||
252 | se->wait_runtime = limit; | ||
253 | schedstat_inc(se, wait_runtime_overruns); | ||
254 | schedstat_inc(cfs_rq, wait_runtime_overruns); | ||
255 | } | ||
256 | if (unlikely(se->wait_runtime < -limit)) { | ||
257 | se->wait_runtime = -limit; | ||
258 | schedstat_inc(se, wait_runtime_underruns); | ||
259 | schedstat_inc(cfs_rq, wait_runtime_underruns); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | static inline void | ||
264 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
265 | { | ||
266 | se->wait_runtime += delta; | ||
267 | schedstat_add(se, sum_wait_runtime, delta); | ||
268 | limit_wait_runtime(cfs_rq, se); | ||
269 | } | ||
270 | |||
271 | static void | ||
272 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
273 | { | ||
274 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | ||
275 | __add_wait_runtime(cfs_rq, se, delta); | ||
276 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * Update the current task's runtime statistics. Skip current tasks that | ||
281 | * are not in our scheduling class. | ||
282 | */ | ||
283 | static inline void | ||
284 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) | ||
285 | { | ||
286 | unsigned long delta, delta_exec, delta_fair; | ||
287 | long delta_mine; | ||
288 | struct load_weight *lw = &cfs_rq->load; | ||
289 | unsigned long load = lw->weight; | ||
290 | |||
291 | if (unlikely(!load)) | ||
292 | return; | ||
293 | |||
294 | delta_exec = curr->delta_exec; | ||
295 | #ifdef CONFIG_SCHEDSTATS | ||
296 | if (unlikely(delta_exec > curr->exec_max)) | ||
297 | curr->exec_max = delta_exec; | ||
298 | #endif | ||
299 | |||
300 | curr->sum_exec_runtime += delta_exec; | ||
301 | cfs_rq->exec_clock += delta_exec; | ||
302 | |||
303 | delta_fair = calc_delta_fair(delta_exec, lw); | ||
304 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | ||
305 | |||
306 | if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { | ||
307 | delta = calc_delta_mine(cfs_rq->sleeper_bonus, | ||
308 | curr->load.weight, lw); | ||
309 | if (unlikely(delta > cfs_rq->sleeper_bonus)) | ||
310 | delta = cfs_rq->sleeper_bonus; | ||
311 | |||
312 | cfs_rq->sleeper_bonus -= delta; | ||
313 | delta_mine -= delta; | ||
314 | } | ||
315 | |||
316 | cfs_rq->fair_clock += delta_fair; | ||
317 | /* | ||
318 | * We executed delta_exec amount of time on the CPU, | ||
319 | * but we were only entitled to delta_mine amount of | ||
320 | * time during that period (if nr_running == 1 then | ||
321 | * the two values are equal) | ||
322 | * [Note: delta_mine - delta_exec is negative]: | ||
323 | */ | ||
324 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | ||
325 | } | ||
326 | |||
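A worked example of the bookkeeping above, ignoring the sleeper_bonus adjustment: with two runnable nice-0 tasks the queue weight is 2 * NICE_0_LOAD, so a task that runs for delta_exec = 10 ms is only entitled to delta_mine of about 5 ms over that window; add_wait_runtime() therefore charges it 5 - 10 = -5 ms of wait_runtime, and the task that sat on the queue earns the mirror-image credit when its wait ends. That signed balance is what the fair clock and the rbtree key are built from.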
327 | static void update_curr(struct cfs_rq *cfs_rq, u64 now) | ||
328 | { | ||
329 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | ||
330 | unsigned long delta_exec; | ||
331 | |||
332 | if (unlikely(!curr)) | ||
333 | return; | ||
334 | |||
335 | /* | ||
336 | * Get the amount of time the current task was running | ||
337 | * since the last time we changed load (this cannot | ||
338 | * overflow on 32 bits): | ||
339 | */ | ||
340 | delta_exec = (unsigned long)(now - curr->exec_start); | ||
341 | |||
342 | curr->delta_exec += delta_exec; | ||
343 | |||
344 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | ||
345 | __update_curr(cfs_rq, curr, now); | ||
346 | curr->delta_exec = 0; | ||
347 | } | ||
348 | curr->exec_start = now; | ||
349 | } | ||
350 | |||
351 | static inline void | ||
352 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
353 | { | ||
354 | se->wait_start_fair = cfs_rq->fair_clock; | ||
355 | se->wait_start = now; | ||
356 | } | ||
357 | |||
358 | /* | ||
359 | * We calculate fair deltas here, so protect against the random effects | ||
360 | * of a multiplication overflow by capping it to the runtime limit: | ||
361 | */ | ||
362 | #if BITS_PER_LONG == 32 | ||
363 | static inline unsigned long | ||
364 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
365 | { | ||
366 | u64 tmp = (u64)delta * weight >> shift; | ||
367 | |||
368 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | ||
369 | return sysctl_sched_runtime_limit*2; | ||
370 | return tmp; | ||
371 | } | ||
372 | #else | ||
373 | static inline unsigned long | ||
374 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
375 | { | ||
376 | return delta * weight >> shift; | ||
377 | } | ||
378 | #endif | ||
379 | |||
380 | /* | ||
381 | * Task is being enqueued - update stats: | ||
382 | */ | ||
383 | static void | ||
384 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
385 | { | ||
386 | s64 key; | ||
387 | |||
388 | /* | ||
389 | * Are we enqueueing a waiting task? (for current tasks | ||
390 | * a dequeue/enqueue event is a NOP) | ||
391 | */ | ||
392 | if (se != cfs_rq_curr(cfs_rq)) | ||
393 | update_stats_wait_start(cfs_rq, se, now); | ||
394 | /* | ||
395 | * Update the key: | ||
396 | */ | ||
397 | key = cfs_rq->fair_clock; | ||
398 | |||
399 | /* | ||
400 | * Optimize the common nice 0 case: | ||
401 | */ | ||
402 | if (likely(se->load.weight == NICE_0_LOAD)) { | ||
403 | key -= se->wait_runtime; | ||
404 | } else { | ||
405 | u64 tmp; | ||
406 | |||
407 | if (se->wait_runtime < 0) { | ||
408 | tmp = -se->wait_runtime; | ||
409 | key += (tmp * se->load.inv_weight) >> | ||
410 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
411 | } else { | ||
412 | tmp = se->wait_runtime; | ||
413 | key -= (tmp * se->load.weight) >> NICE_0_SHIFT; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | se->fair_key = key; | ||
418 | } | ||
419 | |||
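For the common nice-0 case this reduces to fair_key = fair_clock - wait_runtime: a task that is owed 2 ms of CPU (wait_runtime = +2 ms) is keyed 2 ms to the left of the current fair clock and therefore sorts ahead of tasks that are even or in debt. For other weights the same offset is scaled first (by the weight for a positive balance, by the inverse weight for a negative one), so a low-weight task's credit moves it left by proportionally less and its debt pushes it right by proportionally more.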
420 | /* | ||
421 | * Note: must be called with a freshly updated rq->fair_clock. | ||
422 | */ | ||
423 | static inline void | ||
424 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
425 | { | ||
426 | unsigned long delta_fair = se->delta_fair_run; | ||
427 | |||
428 | #ifdef CONFIG_SCHEDSTATS | ||
429 | { | ||
430 | s64 delta_wait = now - se->wait_start; | ||
431 | if (unlikely(delta_wait > se->wait_max)) | ||
432 | se->wait_max = delta_wait; | ||
433 | } | ||
434 | #endif | ||
435 | |||
436 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
437 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
438 | NICE_0_SHIFT); | ||
439 | |||
440 | add_wait_runtime(cfs_rq, se, delta_fair); | ||
441 | } | ||
442 | |||
443 | static void | ||
444 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
445 | { | ||
446 | unsigned long delta_fair; | ||
447 | |||
448 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
449 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | ||
450 | |||
451 | se->delta_fair_run += delta_fair; | ||
452 | if (unlikely(abs(se->delta_fair_run) >= | ||
453 | sysctl_sched_stat_granularity)) { | ||
454 | __update_stats_wait_end(cfs_rq, se, now); | ||
455 | se->delta_fair_run = 0; | ||
456 | } | ||
457 | |||
458 | se->wait_start_fair = 0; | ||
459 | se->wait_start = 0; | ||
460 | } | ||
461 | |||
462 | static inline void | ||
463 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
464 | { | ||
465 | update_curr(cfs_rq, now); | ||
466 | /* | ||
467 | * Mark the end of the wait period if dequeueing a | ||
468 | * waiting task: | ||
469 | */ | ||
470 | if (se != cfs_rq_curr(cfs_rq)) | ||
471 | update_stats_wait_end(cfs_rq, se, now); | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * We are picking a new current task - update its stats: | ||
476 | */ | ||
477 | static inline void | ||
478 | update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
479 | { | ||
480 | /* | ||
481 | * We are starting a new run period: | ||
482 | */ | ||
483 | se->exec_start = now; | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * We are descheduling a task - update its stats: | ||
488 | */ | ||
489 | static inline void | ||
490 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
491 | { | ||
492 | se->exec_start = 0; | ||
493 | } | ||
494 | |||
495 | /************************************************** | ||
496 | * Scheduling class queueing methods: | ||
497 | */ | ||
498 | |||
499 | static void | ||
500 | __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
501 | { | ||
502 | unsigned long load = cfs_rq->load.weight, delta_fair; | ||
503 | long prev_runtime; | ||
504 | |||
505 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | ||
506 | load = rq_of(cfs_rq)->cpu_load[2]; | ||
507 | |||
508 | delta_fair = se->delta_fair_sleep; | ||
509 | |||
510 | /* | ||
511 | * Fix up delta_fair with the effect of us running | ||
512 | * during the whole sleep period: | ||
513 | */ | ||
514 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | ||
515 | delta_fair = div64_likely32((u64)delta_fair * load, | ||
516 | load + se->load.weight); | ||
517 | |||
518 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
519 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
520 | NICE_0_SHIFT); | ||
521 | |||
522 | prev_runtime = se->wait_runtime; | ||
523 | __add_wait_runtime(cfs_rq, se, delta_fair); | ||
524 | delta_fair = se->wait_runtime - prev_runtime; | ||
525 | |||
526 | /* | ||
527 | * Track the amount of bonus we've given to sleepers: | ||
528 | */ | ||
529 | cfs_rq->sleeper_bonus += delta_fair; | ||
530 | |||
531 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
532 | } | ||
533 | |||
534 | static void | ||
535 | enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
536 | { | ||
537 | struct task_struct *tsk = task_of(se); | ||
538 | unsigned long delta_fair; | ||
539 | |||
540 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | ||
541 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | ||
542 | return; | ||
543 | |||
544 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
545 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | ||
546 | |||
547 | se->delta_fair_sleep += delta_fair; | ||
548 | if (unlikely(abs(se->delta_fair_sleep) >= | ||
549 | sysctl_sched_stat_granularity)) { | ||
550 | __enqueue_sleeper(cfs_rq, se, now); | ||
551 | se->delta_fair_sleep = 0; | ||
552 | } | ||
553 | |||
554 | se->sleep_start_fair = 0; | ||
555 | |||
556 | #ifdef CONFIG_SCHEDSTATS | ||
557 | if (se->sleep_start) { | ||
558 | u64 delta = now - se->sleep_start; | ||
559 | |||
560 | if ((s64)delta < 0) | ||
561 | delta = 0; | ||
562 | |||
563 | if (unlikely(delta > se->sleep_max)) | ||
564 | se->sleep_max = delta; | ||
565 | |||
566 | se->sleep_start = 0; | ||
567 | se->sum_sleep_runtime += delta; | ||
568 | } | ||
569 | if (se->block_start) { | ||
570 | u64 delta = now - se->block_start; | ||
571 | |||
572 | if ((s64)delta < 0) | ||
573 | delta = 0; | ||
574 | |||
575 | if (unlikely(delta > se->block_max)) | ||
576 | se->block_max = delta; | ||
577 | |||
578 | se->block_start = 0; | ||
579 | se->sum_sleep_runtime += delta; | ||
580 | } | ||
581 | #endif | ||
582 | } | ||
583 | |||
584 | static void | ||
585 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
586 | int wakeup, u64 now) | ||
587 | { | ||
588 | /* | ||
589 | * Update the fair clock. | ||
590 | */ | ||
591 | update_curr(cfs_rq, now); | ||
592 | |||
593 | if (wakeup) | ||
594 | enqueue_sleeper(cfs_rq, se, now); | ||
595 | |||
596 | update_stats_enqueue(cfs_rq, se, now); | ||
597 | __enqueue_entity(cfs_rq, se); | ||
598 | } | ||
599 | |||
600 | static void | ||
601 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
602 | int sleep, u64 now) | ||
603 | { | ||
604 | update_stats_dequeue(cfs_rq, se, now); | ||
605 | if (sleep) { | ||
606 | se->sleep_start_fair = cfs_rq->fair_clock; | ||
607 | #ifdef CONFIG_SCHEDSTATS | ||
608 | if (entity_is_task(se)) { | ||
609 | struct task_struct *tsk = task_of(se); | ||
610 | |||
611 | if (tsk->state & TASK_INTERRUPTIBLE) | ||
612 | se->sleep_start = now; | ||
613 | if (tsk->state & TASK_UNINTERRUPTIBLE) | ||
614 | se->block_start = now; | ||
615 | } | ||
616 | cfs_rq->wait_runtime -= se->wait_runtime; | ||
617 | #endif | ||
618 | } | ||
619 | __dequeue_entity(cfs_rq, se); | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Preempt the current task with a newly woken task if needed: | ||
624 | */ | ||
625 | static void | ||
626 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
627 | struct sched_entity *curr, unsigned long granularity) | ||
628 | { | ||
629 | s64 __delta = curr->fair_key - se->fair_key; | ||
630 | |||
631 | /* | ||
632 | * Take scheduling granularity into account - do not | ||
633 | * preempt the current task unless the best task has | ||
634 | * a larger than sched_granularity fairness advantage: | ||
635 | */ | ||
636 | if (__delta > niced_granularity(curr, granularity)) | ||
637 | resched_task(rq_of(cfs_rq)->curr); | ||
638 | } | ||
639 | |||
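Concretely, at the default 2 ms granularity the running task keeps the CPU until the best queued entity's fair_key is more than 2 ms smaller than its own; niced_granularity() shrinks that window linearly for positive-nice current tasks, so a heavily niced task yields to a fairer contender sooner, and the wakeup paths later in this file pass the smaller wakeup granularities instead of sysctl_sched_granularity.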
640 | static inline void | ||
641 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
642 | { | ||
643 | /* | ||
644 | * Any task has to be enqueued before it get to execute on | ||
645 | * a CPU. So account for the time it spent waiting on the | ||
646 | * runqueue. (note, here we rely on pick_next_task() having | ||
647 | * done a put_prev_task_fair() shortly before this, which | ||
648 | * updated rq->fair_clock - used by update_stats_wait_end()) | ||
649 | */ | ||
650 | update_stats_wait_end(cfs_rq, se, now); | ||
651 | update_stats_curr_start(cfs_rq, se, now); | ||
652 | set_cfs_rq_curr(cfs_rq, se); | ||
653 | } | ||
654 | |||
655 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) | ||
656 | { | ||
657 | struct sched_entity *se = __pick_next_entity(cfs_rq); | ||
658 | |||
659 | set_next_entity(cfs_rq, se, now); | ||
660 | |||
661 | return se; | ||
662 | } | ||
663 | |||
664 | static void | ||
665 | put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) | ||
666 | { | ||
667 | /* | ||
668 | * If still on the runqueue then deactivate_task() | ||
669 | * was not called and update_curr() has to be done: | ||
670 | */ | ||
671 | if (prev->on_rq) | ||
672 | update_curr(cfs_rq, now); | ||
673 | |||
674 | update_stats_curr_end(cfs_rq, prev, now); | ||
675 | |||
676 | if (prev->on_rq) | ||
677 | update_stats_wait_start(cfs_rq, prev, now); | ||
678 | set_cfs_rq_curr(cfs_rq, NULL); | ||
679 | } | ||
680 | |||
681 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | ||
682 | { | ||
683 | struct rq *rq = rq_of(cfs_rq); | ||
684 | struct sched_entity *next; | ||
685 | u64 now = __rq_clock(rq); | ||
686 | |||
687 | /* | ||
688 | * Dequeue and enqueue the task to update its | ||
689 | * position within the tree: | ||
690 | */ | ||
691 | dequeue_entity(cfs_rq, curr, 0, now); | ||
692 | enqueue_entity(cfs_rq, curr, 0, now); | ||
693 | |||
694 | /* | ||
695 | * Reschedule if another task tops the current one. | ||
696 | */ | ||
697 | next = __pick_next_entity(cfs_rq); | ||
698 | if (next == curr) | ||
699 | return; | ||
700 | |||
701 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); | ||
702 | } | ||
703 | |||
704 | /************************************************** | ||
705 | * CFS operations on tasks: | ||
706 | */ | ||
707 | |||
708 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
709 | |||
710 | /* Walk up scheduling entities hierarchy */ | ||
711 | #define for_each_sched_entity(se) \ | ||
712 | for (; se; se = se->parent) | ||
713 | |||
714 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
715 | { | ||
716 | return p->se.cfs_rq; | ||
717 | } | ||
718 | |||
719 | /* runqueue on which this entity is (to be) queued */ | ||
720 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
721 | { | ||
722 | return se->cfs_rq; | ||
723 | } | ||
724 | |||
725 | /* runqueue "owned" by this group */ | ||
726 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
727 | { | ||
728 | return grp->my_q; | ||
729 | } | ||
730 | |||
731 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
732 | * another cpu ('this_cpu') | ||
733 | */ | ||
734 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
735 | { | ||
736 | /* A later patch will take group into account */ | ||
737 | return &cpu_rq(this_cpu)->cfs; | ||
738 | } | ||
739 | |||
740 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
741 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
742 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | ||
743 | |||
745 | /* Do the two (enqueued) tasks belong to the same group? */ | ||
745 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | ||
746 | { | ||
747 | if (curr->se.cfs_rq == p->se.cfs_rq) | ||
748 | return 1; | ||
749 | |||
750 | return 0; | ||
751 | } | ||
752 | |||
753 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
754 | |||
755 | #define for_each_sched_entity(se) \ | ||
756 | for (; se; se = NULL) | ||
757 | |||
758 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | ||
759 | { | ||
760 | return &task_rq(p)->cfs; | ||
761 | } | ||
762 | |||
763 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | ||
764 | { | ||
765 | struct task_struct *p = task_of(se); | ||
766 | struct rq *rq = task_rq(p); | ||
767 | |||
768 | return &rq->cfs; | ||
769 | } | ||
770 | |||
771 | /* runqueue "owned" by this group */ | ||
772 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | ||
773 | { | ||
774 | return NULL; | ||
775 | } | ||
776 | |||
777 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
778 | { | ||
779 | return &cpu_rq(this_cpu)->cfs; | ||
780 | } | ||
781 | |||
782 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | ||
783 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | ||
784 | |||
785 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | ||
786 | { | ||
787 | return 1; | ||
788 | } | ||
789 | |||
790 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
791 | |||
792 | /* | ||
793 | * The enqueue_task method is called before nr_running is | ||
794 | * increased. Here we update the fair scheduling stats and | ||
795 | * then put the task into the rbtree: | ||
796 | */ | ||
797 | static void | ||
798 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
799 | { | ||
800 | struct cfs_rq *cfs_rq; | ||
801 | struct sched_entity *se = &p->se; | ||
802 | |||
803 | for_each_sched_entity(se) { | ||
804 | if (se->on_rq) | ||
805 | break; | ||
806 | cfs_rq = cfs_rq_of(se); | ||
807 | enqueue_entity(cfs_rq, se, wakeup, now); | ||
808 | } | ||
809 | } | ||
810 | |||
811 | /* | ||
812 | * The dequeue_task method is called before nr_running is | ||
813 | * decreased. We remove the task from the rbtree and | ||
814 | * update the fair scheduling stats: | ||
815 | */ | ||
816 | static void | ||
817 | dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
818 | { | ||
819 | struct cfs_rq *cfs_rq; | ||
820 | struct sched_entity *se = &p->se; | ||
821 | |||
822 | for_each_sched_entity(se) { | ||
823 | cfs_rq = cfs_rq_of(se); | ||
824 | dequeue_entity(cfs_rq, se, sleep, now); | ||
825 | /* Don't dequeue parent if it has other entities besides us */ | ||
826 | if (cfs_rq->load.weight) | ||
827 | break; | ||
828 | } | ||
829 | } | ||
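
enqueue_task_fair() and dequeue_task_fair() walk up the entity hierarchy built by for_each_sched_entity(): the enqueue walk stops as soon as an ancestor is already on a runqueue, and the dequeue walk stops while an ancestor still carries other load. A minimal user-space sketch of that walk over a toy two-level task/group hierarchy (the struct and names are illustrative, not kernel APIs):

#include <stdio.h>

/* Toy two-level hierarchy: a task entity whose parent is a group entity.
 * Field names mirror the ones used above, but this is not kernel code. */
struct entity {
	const char    *name;
	int            on_rq;
	struct entity *parent;
};

/* Walk up the hierarchy, as for_each_sched_entity() does, and stop as
 * soon as an ancestor is already enqueued - enqueueing it again would
 * double-account its weight on the parent runqueue. */
static void enqueue_hierarchy(struct entity *se)
{
	for (; se; se = se->parent) {
		if (se->on_rq)
			break;
		se->on_rq = 1;
		printf("enqueued %s\n", se->name);
	}
}

int main(void)
{
	struct entity group = { "group A", 0, NULL };
	struct entity task1 = { "task 1", 0, &group };
	struct entity task2 = { "task 2", 0, &group };

	enqueue_hierarchy(&task1);	/* enqueues task 1, then group A */
	enqueue_hierarchy(&task2);	/* enqueues task 2 only: group already on_rq */
	return 0;
}

With CONFIG_FAIR_GROUP_SCHED disabled, for_each_sched_entity() visits only the task itself, so the walk degenerates to a single enqueue.
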
830 | |||
831 | /* | ||
832 | * sched_yield() support is very simple - we dequeue and enqueue | ||
833 | */ | ||
834 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | ||
835 | { | ||
836 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | ||
837 | u64 now = __rq_clock(rq); | ||
838 | |||
839 | /* | ||
840 | * Dequeue and enqueue the task to update its | ||
841 | * position within the tree: | ||
842 | */ | ||
843 | dequeue_entity(cfs_rq, &p->se, 0, now); | ||
844 | enqueue_entity(cfs_rq, &p->se, 0, now); | ||
845 | } | ||
846 | |||
847 | /* | ||
848 | * Preempt the current task with a newly woken task if needed: | ||
849 | */ | ||
850 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | ||
851 | { | ||
852 | struct task_struct *curr = rq->curr; | ||
853 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
854 | unsigned long gran; | ||
855 | |||
856 | if (unlikely(rt_prio(p->prio))) { | ||
857 | update_curr(cfs_rq, rq_clock(rq)); | ||
858 | resched_task(curr); | ||
859 | return; | ||
860 | } | ||
861 | |||
862 | gran = sysctl_sched_wakeup_granularity; | ||
863 | /* | ||
864 | * Batch tasks prefer throughput over latency: | ||
865 | */ | ||
866 | if (unlikely(p->policy == SCHED_BATCH)) | ||
867 | gran = sysctl_sched_batch_wakeup_granularity; | ||
868 | |||
869 | if (is_same_group(curr, p)) | ||
870 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | ||
871 | } | ||
872 | |||
873 | static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) | ||
874 | { | ||
875 | struct cfs_rq *cfs_rq = &rq->cfs; | ||
876 | struct sched_entity *se; | ||
877 | |||
878 | if (unlikely(!cfs_rq->nr_running)) | ||
879 | return NULL; | ||
880 | |||
881 | do { | ||
882 | se = pick_next_entity(cfs_rq, now); | ||
883 | cfs_rq = group_cfs_rq(se); | ||
884 | } while (cfs_rq); | ||
885 | |||
886 | return task_of(se); | ||
887 | } | ||
888 | |||
889 | /* | ||
890 | * Account for a descheduled task: | ||
891 | */ | ||
892 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) | ||
893 | { | ||
894 | struct sched_entity *se = &prev->se; | ||
895 | struct cfs_rq *cfs_rq; | ||
896 | |||
897 | for_each_sched_entity(se) { | ||
898 | cfs_rq = cfs_rq_of(se); | ||
899 | put_prev_entity(cfs_rq, se, now); | ||
900 | } | ||
901 | } | ||
902 | |||
903 | /************************************************** | ||
904 | * Fair scheduling class load-balancing methods: | ||
905 | */ | ||
906 | |||
907 | /* | ||
908 | * Load-balancing iterator. Note: while the runqueue stays locked | ||
909 | * during the whole iteration, the current task might be | ||
910 | * dequeued so the iterator has to be dequeue-safe. Here we | ||
911 | * achieve that by always pre-iterating before returning | ||
912 | * the current task: | ||
913 | */ | ||
914 | static inline struct task_struct * | ||
915 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | ||
916 | { | ||
917 | struct task_struct *p; | ||
918 | |||
919 | if (!curr) | ||
920 | return NULL; | ||
921 | |||
922 | p = rb_entry(curr, struct task_struct, se.run_node); | ||
923 | cfs_rq->rb_load_balance_curr = rb_next(curr); | ||
924 | |||
925 | return p; | ||
926 | } | ||
927 | |||
928 | static struct task_struct *load_balance_start_fair(void *arg) | ||
929 | { | ||
930 | struct cfs_rq *cfs_rq = arg; | ||
931 | |||
932 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | ||
933 | } | ||
934 | |||
935 | static struct task_struct *load_balance_next_fair(void *arg) | ||
936 | { | ||
937 | struct cfs_rq *cfs_rq = arg; | ||
938 | |||
939 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | ||
940 | } | ||
941 | |||
942 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
943 | { | ||
944 | struct sched_entity *curr; | ||
945 | struct task_struct *p; | ||
946 | |||
947 | if (!cfs_rq->nr_running) | ||
948 | return MAX_PRIO; | ||
949 | |||
950 | curr = __pick_next_entity(cfs_rq); | ||
951 | p = task_of(curr); | ||
952 | |||
953 | return p->prio; | ||
954 | } | ||
955 | |||
956 | static int | ||
957 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
958 | unsigned long max_nr_move, unsigned long max_load_move, | ||
959 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
960 | int *all_pinned, unsigned long *total_load_moved) | ||
961 | { | ||
962 | struct cfs_rq *busy_cfs_rq; | ||
963 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | ||
964 | long rem_load_move = max_load_move; | ||
965 | struct rq_iterator cfs_rq_iterator; | ||
966 | |||
967 | cfs_rq_iterator.start = load_balance_start_fair; | ||
968 | cfs_rq_iterator.next = load_balance_next_fair; | ||
969 | |||
970 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | ||
971 | struct cfs_rq *this_cfs_rq; | ||
972 | long imbalance; | ||
973 | unsigned long maxload; | ||
974 | int this_best_prio, best_prio, best_prio_seen = 0; | ||
975 | |||
976 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | ||
977 | |||
978 | imbalance = busy_cfs_rq->load.weight - | ||
979 | this_cfs_rq->load.weight; | ||
980 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
981 | if (imbalance <= 0) | ||
982 | continue; | ||
983 | |||
984 | /* Don't pull more than imbalance/2 */ | ||
985 | imbalance /= 2; | ||
986 | maxload = min(rem_load_move, imbalance); | ||
987 | |||
988 | this_best_prio = cfs_rq_best_prio(this_cfs_rq); | ||
989 | best_prio = cfs_rq_best_prio(busy_cfs_rq); | ||
990 | |||
991 | /* | ||
992 | * Enable handling of the case where there is more than one task | ||
993 | * with the best priority. If the current running task is one | ||
994 | * of those with prio==best_prio we know it won't be moved | ||
995 | * and therefore it's safe to override the skip (based on load) | ||
996 | * of any task we find with that prio. | ||
997 | */ | ||
998 | if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) | ||
999 | best_prio_seen = 1; | ||
1000 | |||
1001 | /* pass busy_cfs_rq argument into | ||
1002 | * load_balance_[start|next]_fair iterators | ||
1003 | */ | ||
1004 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
1005 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, | ||
1006 | max_nr_move, maxload, sd, idle, all_pinned, | ||
1007 | &load_moved, this_best_prio, best_prio, | ||
1008 | best_prio_seen, &cfs_rq_iterator); | ||
1009 | |||
1010 | total_nr_moved += nr_moved; | ||
1011 | max_nr_move -= nr_moved; | ||
1012 | rem_load_move -= load_moved; | ||
1013 | |||
1014 | if (max_nr_move <= 0 || rem_load_move <= 0) | ||
1015 | break; | ||
1016 | } | ||
1017 | |||
1018 | *total_load_moved = max_load_move - rem_load_move; | ||
1019 | |||
1020 | return total_nr_moved; | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * scheduler tick hitting a task of our scheduling class: | ||
1025 | */ | ||
1026 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | ||
1027 | { | ||
1028 | struct cfs_rq *cfs_rq; | ||
1029 | struct sched_entity *se = &curr->se; | ||
1030 | |||
1031 | for_each_sched_entity(se) { | ||
1032 | cfs_rq = cfs_rq_of(se); | ||
1033 | entity_tick(cfs_rq, se); | ||
1034 | } | ||
1035 | } | ||
1036 | |||
1037 | /* | ||
1038 | * Share the fairness runtime between parent and child, so that the | ||
1039 | * total amount of CPU pressure stays the same - new tasks | ||
1040 | * get a chance to run, but frequent forkers are not allowed to | ||
1041 | * monopolize the CPU. Note: the parent runqueue is locked, | ||
1042 | * the child is not running yet. | ||
1043 | */ | ||
1044 | static void task_new_fair(struct rq *rq, struct task_struct *p) | ||
1045 | { | ||
1046 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | ||
1047 | struct sched_entity *se = &p->se; | ||
1048 | u64 now = rq_clock(rq); | ||
1049 | |||
1050 | sched_info_queued(p); | ||
1051 | |||
1052 | update_stats_enqueue(cfs_rq, se, now); | ||
1053 | /* | ||
1054 | * Child runs first: we let it run before the parent | ||
1055 | * until it reschedules once. We set up the key so that | ||
1056 | * it will preempt the parent: | ||
1057 | */ | ||
1058 | p->se.fair_key = current->se.fair_key - | ||
1059 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; | ||
1060 | /* | ||
1061 | * The first wait is dominated by the child-runs-first logic, | ||
1062 | * so do not credit it with that waiting time yet: | ||
1063 | */ | ||
1064 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | ||
1065 | p->se.wait_start_fair = 0; | ||
1066 | |||
1067 | /* | ||
1068 | * The statistical average of wait_runtime is about | ||
1069 | * -granularity/2, so initialize the task with that: | ||
1070 | */ | ||
1071 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | ||
1072 | p->se.wait_runtime = -(sysctl_sched_granularity / 2); | ||
1073 | |||
1074 | __enqueue_entity(cfs_rq, se); | ||
1075 | inc_nr_running(p, rq, now); | ||
1076 | } | ||
1077 | |||
1078 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1079 | /* Account for a task changing its policy or group. | ||
1080 | * | ||
1081 | * This routine is mostly called to set cfs_rq->curr field when a task | ||
1082 | * migrates between groups/classes. | ||
1083 | */ | ||
1084 | static void set_curr_task_fair(struct rq *rq) | ||
1085 | { | ||
1086 | struct task_struct *curr = rq->curr; | ||
1087 | struct sched_entity *se = &curr->se; | ||
1088 | u64 now = rq_clock(rq); | ||
1089 | struct cfs_rq *cfs_rq; | ||
1090 | |||
1091 | for_each_sched_entity(se) { | ||
1092 | cfs_rq = cfs_rq_of(se); | ||
1093 | set_next_entity(cfs_rq, se, now); | ||
1094 | } | ||
1095 | } | ||
1096 | #else | ||
1097 | static void set_curr_task_fair(struct rq *rq) | ||
1098 | { | ||
1099 | } | ||
1100 | #endif | ||
1101 | |||
1102 | /* | ||
1103 | * All the scheduling class methods: | ||
1104 | */ | ||
1105 | struct sched_class fair_sched_class __read_mostly = { | ||
1106 | .enqueue_task = enqueue_task_fair, | ||
1107 | .dequeue_task = dequeue_task_fair, | ||
1108 | .yield_task = yield_task_fair, | ||
1109 | |||
1110 | .check_preempt_curr = check_preempt_curr_fair, | ||
1111 | |||
1112 | .pick_next_task = pick_next_task_fair, | ||
1113 | .put_prev_task = put_prev_task_fair, | ||
1114 | |||
1115 | .load_balance = load_balance_fair, | ||
1116 | |||
1117 | .set_curr_task = set_curr_task_fair, | ||
1118 | .task_tick = task_tick_fair, | ||
1119 | .task_new = task_new_fair, | ||
1120 | }; | ||
1121 | |||
1122 | #ifdef CONFIG_SCHED_DEBUG | ||
1123 | void print_cfs_stats(struct seq_file *m, int cpu, u64 now) | ||
1124 | { | ||
1125 | struct rq *rq = cpu_rq(cpu); | ||
1126 | struct cfs_rq *cfs_rq; | ||
1127 | |||
1128 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
1129 | print_cfs_rq(m, cpu, cfs_rq, now); | ||
1130 | } | ||
1131 | #endif | ||
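
The load-balancing iterators above are dequeue-safe because __load_balance_iterator() saves rb_next(curr) before handing the current task back; if balance_tasks() then migrates that task, the saved cursor is untouched. A user-space sketch of the same pre-advance pattern over a plain linked list (illustrative types, not kernel code):

#include <stdio.h>
#include <stddef.h>

struct node {
	int          value;
	struct node *next;
};

struct iter {
	struct node *curr;	/* next node to hand out */
};

/* Advance first, then return: the caller may unlink or reuse the
 * returned node without invalidating the iterator's saved cursor. */
static struct node *iter_next(struct iter *it)
{
	struct node *n = it->curr;

	if (n)
		it->curr = n->next;
	return n;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct iter it = { &a };
	struct node *n;

	while ((n = iter_next(&it))) {
		n->next = NULL;	/* "dequeue" the node; iteration still works */
		printf("%d\n", n->value);
	}
	return 0;
}
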
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c new file mode 100644 index 0000000000..41841e741c --- /dev/null +++ b/kernel/sched_idletask.c | |||
@@ -0,0 +1,71 @@ | |||
1 | /* | ||
2 | * idle-task scheduling class. | ||
3 | * | ||
4 | * (NOTE: these are not related to SCHED_IDLE tasks which are | ||
5 | * handled in sched_fair.c) | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | * Idle tasks are unconditionally rescheduled: | ||
10 | */ | ||
11 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) | ||
12 | { | ||
13 | resched_task(rq->idle); | ||
14 | } | ||
15 | |||
16 | static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) | ||
17 | { | ||
18 | schedstat_inc(rq, sched_goidle); | ||
19 | |||
20 | return rq->idle; | ||
21 | } | ||
22 | |||
23 | /* | ||
24 | * It is not legal to sleep in the idle task - print a warning | ||
25 | * message if some code attempts to do it: | ||
26 | */ | ||
27 | static void | ||
28 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
29 | { | ||
30 | spin_unlock_irq(&rq->lock); | ||
31 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
32 | dump_stack(); | ||
33 | spin_lock_irq(&rq->lock); | ||
34 | } | ||
35 | |||
36 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) | ||
37 | { | ||
38 | } | ||
39 | |||
40 | static int | ||
41 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
42 | unsigned long max_nr_move, unsigned long max_load_move, | ||
43 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
44 | int *all_pinned, unsigned long *total_load_moved) | ||
45 | { | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | ||
50 | { | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
55 | */ | ||
56 | static struct sched_class idle_sched_class __read_mostly = { | ||
57 | /* no enqueue/yield_task for idle tasks */ | ||
58 | |||
59 | /* dequeue is not valid, we print a debug message there: */ | ||
60 | .dequeue_task = dequeue_task_idle, | ||
61 | |||
62 | .check_preempt_curr = check_preempt_curr_idle, | ||
63 | |||
64 | .pick_next_task = pick_next_task_idle, | ||
65 | .put_prev_task = put_prev_task_idle, | ||
66 | |||
67 | .load_balance = load_balance_idle, | ||
68 | |||
69 | .task_tick = task_tick_idle, | ||
70 | /* no .task_new for idle tasks */ | ||
71 | }; | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c new file mode 100644 index 0000000000..1192a2741b --- /dev/null +++ b/kernel/sched_rt.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | ||
3 | * policies) | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * Update the current task's runtime statistics. Skip current tasks that | ||
8 | * are not in our scheduling class. | ||
9 | */ | ||
10 | static inline void update_curr_rt(struct rq *rq, u64 now) | ||
11 | { | ||
12 | struct task_struct *curr = rq->curr; | ||
13 | u64 delta_exec; | ||
14 | |||
15 | if (!task_has_rt_policy(curr)) | ||
16 | return; | ||
17 | |||
18 | delta_exec = now - curr->se.exec_start; | ||
19 | if (unlikely((s64)delta_exec < 0)) | ||
20 | delta_exec = 0; | ||
21 | if (unlikely(delta_exec > curr->se.exec_max)) | ||
22 | curr->se.exec_max = delta_exec; | ||
23 | |||
24 | curr->se.sum_exec_runtime += delta_exec; | ||
25 | curr->se.exec_start = now; | ||
26 | } | ||
27 | |||
28 | static void | ||
29 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
30 | { | ||
31 | struct rt_prio_array *array = &rq->rt.active; | ||
32 | |||
33 | list_add_tail(&p->run_list, array->queue + p->prio); | ||
34 | __set_bit(p->prio, array->bitmap); | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * Adding/removing a task to/from a priority array: | ||
39 | */ | ||
40 | static void | ||
41 | dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
42 | { | ||
43 | struct rt_prio_array *array = &rq->rt.active; | ||
44 | |||
45 | update_curr_rt(rq, now); | ||
46 | |||
47 | list_del(&p->run_list); | ||
48 | if (list_empty(array->queue + p->prio)) | ||
49 | __clear_bit(p->prio, array->bitmap); | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * Put the task at the end of the run list without the overhead of a | ||
54 | * dequeue followed by an enqueue. | ||
55 | */ | ||
56 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | ||
57 | { | ||
58 | struct rt_prio_array *array = &rq->rt.active; | ||
59 | |||
60 | list_move_tail(&p->run_list, array->queue + p->prio); | ||
61 | } | ||
62 | |||
63 | static void | ||
64 | yield_task_rt(struct rq *rq, struct task_struct *p) | ||
65 | { | ||
66 | requeue_task_rt(rq, p); | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * Preempt the current task with a newly woken task if needed: | ||
71 | */ | ||
72 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | ||
73 | { | ||
74 | if (p->prio < rq->curr->prio) | ||
75 | resched_task(rq->curr); | ||
76 | } | ||
77 | |||
78 | static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) | ||
79 | { | ||
80 | struct rt_prio_array *array = &rq->rt.active; | ||
81 | struct task_struct *next; | ||
82 | struct list_head *queue; | ||
83 | int idx; | ||
84 | |||
85 | idx = sched_find_first_bit(array->bitmap); | ||
86 | if (idx >= MAX_RT_PRIO) | ||
87 | return NULL; | ||
88 | |||
89 | queue = array->queue + idx; | ||
90 | next = list_entry(queue->next, struct task_struct, run_list); | ||
91 | |||
92 | next->se.exec_start = now; | ||
93 | |||
94 | return next; | ||
95 | } | ||
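
pick_next_task_rt() is O(1): sched_find_first_bit() locates the lowest set bit in the per-priority bitmap, and the head of that priority's FIFO queue is the next task. A compact user-space sketch of the same bitmap-plus-queues idea, with illustrative sizes and GCC's __builtin_ctzl() standing in for sched_find_first_bit():

#include <stdio.h>

#define NR_PRIO 64			/* illustrative, not MAX_RT_PRIO */

struct prio_array {
	unsigned long bitmap;		/* bit p set => queue[p] non-empty */
	int queue[NR_PRIO];		/* stand-in for the per-prio FIFO lists */
};

static void enqueue(struct prio_array *a, int prio, int task)
{
	a->queue[prio] = task;
	a->bitmap |= 1UL << prio;
}

static int pick_next(const struct prio_array *a)
{
	if (!a->bitmap)
		return -1;		/* nothing runnable */
	/* Lowest set bit == numerically lowest prio == highest priority. */
	int prio = __builtin_ctzl(a->bitmap);
	return a->queue[prio];
}

int main(void)
{
	struct prio_array a = { 0 };

	enqueue(&a, 40, 1001);
	enqueue(&a, 10, 1002);		/* lower number = higher priority */
	printf("next task: %d\n", pick_next(&a));	/* prints 1002 */
	return 0;
}
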
96 | |||
97 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) | ||
98 | { | ||
99 | update_curr_rt(rq, now); | ||
100 | p->se.exec_start = 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Load-balancing iterator. Note: while the runqueue stays locked | ||
105 | * during the whole iteration, the current task might be | ||
106 | * dequeued so the iterator has to be dequeue-safe. Here we | ||
107 | * achieve that by always pre-iterating before returning | ||
108 | * the current task: | ||
109 | */ | ||
110 | static struct task_struct *load_balance_start_rt(void *arg) | ||
111 | { | ||
112 | struct rq *rq = arg; | ||
113 | struct rt_prio_array *array = &rq->rt.active; | ||
114 | struct list_head *head, *curr; | ||
115 | struct task_struct *p; | ||
116 | int idx; | ||
117 | |||
118 | idx = sched_find_first_bit(array->bitmap); | ||
119 | if (idx >= MAX_RT_PRIO) | ||
120 | return NULL; | ||
121 | |||
122 | head = array->queue + idx; | ||
123 | curr = head->prev; | ||
124 | |||
125 | p = list_entry(curr, struct task_struct, run_list); | ||
126 | |||
127 | curr = curr->prev; | ||
128 | |||
129 | rq->rt.rt_load_balance_idx = idx; | ||
130 | rq->rt.rt_load_balance_head = head; | ||
131 | rq->rt.rt_load_balance_curr = curr; | ||
132 | |||
133 | return p; | ||
134 | } | ||
135 | |||
136 | static struct task_struct *load_balance_next_rt(void *arg) | ||
137 | { | ||
138 | struct rq *rq = arg; | ||
139 | struct rt_prio_array *array = &rq->rt.active; | ||
140 | struct list_head *head, *curr; | ||
141 | struct task_struct *p; | ||
142 | int idx; | ||
143 | |||
144 | idx = rq->rt.rt_load_balance_idx; | ||
145 | head = rq->rt.rt_load_balance_head; | ||
146 | curr = rq->rt.rt_load_balance_curr; | ||
147 | |||
148 | /* | ||
149 | * If we arrived back at the head again then | ||
150 | * iterate to the next queue (if any): | ||
151 | */ | ||
152 | if (unlikely(head == curr)) { | ||
153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
154 | |||
155 | if (next_idx >= MAX_RT_PRIO) | ||
156 | return NULL; | ||
157 | |||
158 | idx = next_idx; | ||
159 | head = array->queue + idx; | ||
160 | curr = head->prev; | ||
161 | |||
162 | rq->rt.rt_load_balance_idx = idx; | ||
163 | rq->rt.rt_load_balance_head = head; | ||
164 | } | ||
165 | |||
166 | p = list_entry(curr, struct task_struct, run_list); | ||
167 | |||
168 | curr = curr->prev; | ||
169 | |||
170 | rq->rt.rt_load_balance_curr = curr; | ||
171 | |||
172 | return p; | ||
173 | } | ||
174 | |||
175 | static int | ||
176 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
177 | unsigned long max_nr_move, unsigned long max_load_move, | ||
178 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
179 | int *all_pinned, unsigned long *load_moved) | ||
180 | { | ||
181 | int this_best_prio, best_prio, best_prio_seen = 0; | ||
182 | int nr_moved; | ||
183 | struct rq_iterator rt_rq_iterator; | ||
184 | |||
185 | best_prio = sched_find_first_bit(busiest->rt.active.bitmap); | ||
186 | this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); | ||
187 | |||
188 | /* | ||
189 | * Enable handling of the case where there is more than one task | ||
190 | * with the best priority. If the current running task is one | ||
191 | * of those with prio==best_prio we know it won't be moved | ||
192 | * and therefore it's safe to override the skip (based on load) | ||
193 | * of any task we find with that prio. | ||
194 | */ | ||
195 | if (busiest->curr->prio == best_prio) | ||
196 | best_prio_seen = 1; | ||
197 | |||
198 | rt_rq_iterator.start = load_balance_start_rt; | ||
199 | rt_rq_iterator.next = load_balance_next_rt; | ||
200 | /* pass 'busiest' rq argument into | ||
201 | * load_balance_[start|next]_rt iterators | ||
202 | */ | ||
203 | rt_rq_iterator.arg = busiest; | ||
204 | |||
205 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, | ||
206 | max_load_move, sd, idle, all_pinned, load_moved, | ||
207 | this_best_prio, best_prio, best_prio_seen, | ||
208 | &rt_rq_iterator); | ||
209 | |||
210 | return nr_moved; | ||
211 | } | ||
212 | |||
213 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | ||
214 | { | ||
215 | /* | ||
216 | * RR tasks need a special form of timeslice management. | ||
217 | * FIFO tasks have no timeslices. | ||
218 | */ | ||
219 | if (p->policy != SCHED_RR) | ||
220 | return; | ||
221 | |||
222 | if (--p->time_slice) | ||
223 | return; | ||
224 | |||
225 | p->time_slice = static_prio_timeslice(p->static_prio); | ||
226 | set_tsk_need_resched(p); | ||
227 | |||
228 | /* put it at the end of the queue: */ | ||
229 | requeue_task_rt(rq, p); | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * No parent/child timeslice management necessary for RT tasks, | ||
234 | * just activate them: | ||
235 | */ | ||
236 | static void task_new_rt(struct rq *rq, struct task_struct *p) | ||
237 | { | ||
238 | activate_task(rq, p, 1); | ||
239 | } | ||
240 | |||
241 | static struct sched_class rt_sched_class __read_mostly = { | ||
242 | .enqueue_task = enqueue_task_rt, | ||
243 | .dequeue_task = dequeue_task_rt, | ||
244 | .yield_task = yield_task_rt, | ||
245 | |||
246 | .check_preempt_curr = check_preempt_curr_rt, | ||
247 | |||
248 | .pick_next_task = pick_next_task_rt, | ||
249 | .put_prev_task = put_prev_task_rt, | ||
250 | |||
251 | .load_balance = load_balance_rt, | ||
252 | |||
253 | .task_tick = task_tick_rt, | ||
254 | .task_new = task_new_rt, | ||
255 | }; | ||
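
task_tick_rt() only does timeslice accounting for SCHED_RR: when the slice is used up it is refilled from static_prio_timeslice() and the task is requeued behind its priority peers, while SCHED_FIFO tasks run until they block or yield. From user space the round-robin interval can be read back with the standard POSIX calls; a small sketch (switching to a real-time policy normally needs root or CAP_SYS_NICE):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct timespec ts;

	if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
		perror("sched_setscheduler");	/* usually needs CAP_SYS_NICE */
		return 1;
	}
	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("SCHED_RR timeslice: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
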
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h new file mode 100644 index 0000000000..c63c38f6fa --- /dev/null +++ b/kernel/sched_stats.h | |||
@@ -0,0 +1,235 @@ | |||
1 | |||
2 | #ifdef CONFIG_SCHEDSTATS | ||
3 | /* | ||
4 | * bump this up when changing the output format or the meaning of an existing | ||
5 | * format, so that tools can adapt (or abort) | ||
6 | */ | ||
7 | #define SCHEDSTAT_VERSION 14 | ||
8 | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | ||
10 | { | ||
11 | int cpu; | ||
12 | |||
13 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
14 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
15 | for_each_online_cpu(cpu) { | ||
16 | struct rq *rq = cpu_rq(cpu); | ||
17 | #ifdef CONFIG_SMP | ||
18 | struct sched_domain *sd; | ||
19 | int dcnt = 0; | ||
20 | #endif | ||
21 | |||
22 | /* runqueue-specific stats */ | ||
23 | seq_printf(seq, | ||
24 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", | ||
25 | cpu, rq->yld_both_empty, | ||
26 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | ||
27 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | ||
28 | rq->ttwu_cnt, rq->ttwu_local, | ||
29 | rq->rq_sched_info.cpu_time, | ||
30 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | ||
31 | |||
32 | seq_printf(seq, "\n"); | ||
33 | |||
34 | #ifdef CONFIG_SMP | ||
35 | /* domain-specific stats */ | ||
36 | preempt_disable(); | ||
37 | for_each_domain(cpu, sd) { | ||
38 | enum cpu_idle_type itype; | ||
39 | char mask_str[NR_CPUS]; | ||
40 | |||
41 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | ||
42 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | ||
43 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
44 | itype++) { | ||
45 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | ||
46 | "%lu", | ||
47 | sd->lb_cnt[itype], | ||
48 | sd->lb_balanced[itype], | ||
49 | sd->lb_failed[itype], | ||
50 | sd->lb_imbalance[itype], | ||
51 | sd->lb_gained[itype], | ||
52 | sd->lb_hot_gained[itype], | ||
53 | sd->lb_nobusyq[itype], | ||
54 | sd->lb_nobusyg[itype]); | ||
55 | } | ||
56 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | ||
57 | " %lu %lu %lu\n", | ||
58 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | ||
59 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | ||
60 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | ||
61 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
62 | sd->ttwu_move_balance); | ||
63 | } | ||
64 | preempt_enable(); | ||
65 | #endif | ||
66 | } | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | static int schedstat_open(struct inode *inode, struct file *file) | ||
71 | { | ||
72 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
73 | char *buf = kmalloc(size, GFP_KERNEL); | ||
74 | struct seq_file *m; | ||
75 | int res; | ||
76 | |||
77 | if (!buf) | ||
78 | return -ENOMEM; | ||
79 | res = single_open(file, show_schedstat, NULL); | ||
80 | if (!res) { | ||
81 | m = file->private_data; | ||
82 | m->buf = buf; | ||
83 | m->size = size; | ||
84 | } else | ||
85 | kfree(buf); | ||
86 | return res; | ||
87 | } | ||
88 | |||
89 | const struct file_operations proc_schedstat_operations = { | ||
90 | .open = schedstat_open, | ||
91 | .read = seq_read, | ||
92 | .llseek = seq_lseek, | ||
93 | .release = single_release, | ||
94 | }; | ||
95 | |||
96 | /* | ||
97 | * Expects runqueue lock to be held for atomicity of update | ||
98 | */ | ||
99 | static inline void | ||
100 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | ||
101 | { | ||
102 | if (rq) { | ||
103 | rq->rq_sched_info.run_delay += delta; | ||
104 | rq->rq_sched_info.pcnt++; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Expects runqueue lock to be held for atomicity of update | ||
110 | */ | ||
111 | static inline void | ||
112 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | ||
113 | { | ||
114 | if (rq) | ||
115 | rq->rq_sched_info.cpu_time += delta; | ||
116 | } | ||
117 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | ||
118 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | ||
119 | #else /* !CONFIG_SCHEDSTATS */ | ||
120 | static inline void | ||
121 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | ||
122 | {} | ||
123 | static inline void | ||
124 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | ||
125 | {} | ||
126 | # define schedstat_inc(rq, field) do { } while (0) | ||
127 | # define schedstat_add(rq, field, amt) do { } while (0) | ||
128 | #endif | ||
129 | |||
130 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | ||
131 | /* | ||
132 | * Called when a process is dequeued from the active array and given | ||
133 | * the cpu. We should note that with the exception of interactive | ||
134 | * tasks, the expired queue will become the active queue after the active | ||
135 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
136 | * expired queue. (Interactive tasks may be requeued directly to the | ||
137 | * active queue, thus delaying tasks in the expired queue from running; | ||
138 | * see scheduler_tick()). | ||
139 | * | ||
140 | * This function is only called from sched_info_arrive(), rather than | ||
141 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
142 | * times as it is shuffled about, we're really interested in knowing how | ||
143 | * long it was from the *first* time it was queued to the time that it | ||
144 | * finally hit a cpu. | ||
145 | */ | ||
146 | static inline void sched_info_dequeued(struct task_struct *t) | ||
147 | { | ||
148 | t->sched_info.last_queued = 0; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * Called when a task finally hits the cpu. We can now calculate how | ||
153 | * long it was waiting to run. We also note when it began so that we | ||
154 | * can keep stats on how long its timeslice is. | ||
155 | */ | ||
156 | static void sched_info_arrive(struct task_struct *t) | ||
157 | { | ||
158 | unsigned long long now = sched_clock(), delta = 0; | ||
159 | |||
160 | if (t->sched_info.last_queued) | ||
161 | delta = now - t->sched_info.last_queued; | ||
162 | sched_info_dequeued(t); | ||
163 | t->sched_info.run_delay += delta; | ||
164 | t->sched_info.last_arrival = now; | ||
165 | t->sched_info.pcnt++; | ||
166 | |||
167 | rq_sched_info_arrive(task_rq(t), delta); | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Called when a process is queued into either the active or expired | ||
172 | * array. The time is noted and later used to determine how long the | ||
173 | * task had to wait to reach the cpu. Since the expired queue will | ||
174 | * become the active queue after active queue is empty, without dequeuing | ||
175 | * and requeuing any tasks, we are interested in queuing to either. It | ||
176 | * is unusual but not impossible for tasks to be dequeued and immediately | ||
177 | * requeued in the same or another array: this can happen in sched_yield(), | ||
178 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
179 | * to runqueue. | ||
180 | * | ||
181 | * This function is only called from enqueue_task(), but also only updates | ||
182 | * the timestamp if it is already not set. It's assumed that | ||
183 | * sched_info_dequeued() will clear that stamp when appropriate. | ||
184 | */ | ||
185 | static inline void sched_info_queued(struct task_struct *t) | ||
186 | { | ||
187 | if (unlikely(sched_info_on())) | ||
188 | if (!t->sched_info.last_queued) | ||
189 | t->sched_info.last_queued = sched_clock(); | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * Called when a process ceases being the active-running process, either | ||
194 | * voluntarily or involuntarily. Now we can calculate how long we ran. | ||
195 | */ | ||
196 | static inline void sched_info_depart(struct task_struct *t) | ||
197 | { | ||
198 | unsigned long long delta = sched_clock() - t->sched_info.last_arrival; | ||
199 | |||
200 | t->sched_info.cpu_time += delta; | ||
201 | rq_sched_info_depart(task_rq(t), delta); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * Called when tasks are switched involuntarily due, typically, to expiring | ||
206 | * their time slice. (This may also be called when switching to or from | ||
207 | * the idle task.) We are only called when prev != next. | ||
208 | */ | ||
209 | static inline void | ||
210 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
211 | { | ||
212 | struct rq *rq = task_rq(prev); | ||
213 | |||
214 | /* | ||
215 | * prev now departs the cpu. It's not interesting to record | ||
216 | * stats about how efficient we were at scheduling the idle | ||
217 | * process, however. | ||
218 | */ | ||
219 | if (prev != rq->idle) | ||
220 | sched_info_depart(prev); | ||
221 | |||
222 | if (next != rq->idle) | ||
223 | sched_info_arrive(next); | ||
224 | } | ||
225 | static inline void | ||
226 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
227 | { | ||
228 | if (unlikely(sched_info_on())) | ||
229 | __sched_info_switch(prev, next); | ||
230 | } | ||
231 | #else | ||
232 | #define sched_info_queued(t) do { } while (0) | ||
233 | #define sched_info_switch(t, next) do { } while (0) | ||
234 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | ||
235 | |||
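
show_schedstat() is what backs /proc/schedstat when CONFIG_SCHEDSTATS is enabled: a version line (14 after this change), a timestamp in jiffies, then one cpuN line per online CPU and, on SMP, one domainN line per scheduling domain. A minimal reader, assuming the file is exposed at its usual /proc/schedstat path:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[1024];
	int cpus = 0;
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");	/* CONFIG_SCHEDSTATS disabled? */
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "version", 7) || !strncmp(line, "timestamp", 9))
			fputs(line, stdout);	/* header lines from show_schedstat() */
		else if (!strncmp(line, "cpu", 3))
			cpus++;			/* one line per online cpu */
	}
	printf("%d cpu line(s)\n", cpus);
	fclose(f);
	return 0;
}
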
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c3391b6020..ad64fcb731 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | 11 | ||
12 | /* #define SECCOMP_DEBUG 1 */ | 12 | /* #define SECCOMP_DEBUG 1 */ |
13 | #define NR_SECCOMP_MODES 1 | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 16 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall) | |||
54 | #endif | 55 | #endif |
55 | do_exit(SIGKILL); | 56 | do_exit(SIGKILL); |
56 | } | 57 | } |
58 | |||
59 | long prctl_get_seccomp(void) | ||
60 | { | ||
61 | return current->seccomp.mode; | ||
62 | } | ||
63 | |||
64 | long prctl_set_seccomp(unsigned long seccomp_mode) | ||
65 | { | ||
66 | long ret; | ||
67 | |||
68 | /* can set it only once to be even more secure */ | ||
69 | ret = -EPERM; | ||
70 | if (unlikely(current->seccomp.mode)) | ||
71 | goto out; | ||
72 | |||
73 | ret = -EINVAL; | ||
74 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | ||
75 | current->seccomp.mode = seccomp_mode; | ||
76 | set_thread_flag(TIF_SECCOMP); | ||
77 | #ifdef TIF_NOTSC | ||
78 | disable_TSC(); | ||
79 | #endif | ||
80 | ret = 0; | ||
81 | } | ||
82 | |||
83 | out: | ||
84 | return ret; | ||
85 | } | ||
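
The new PR_GET_SECCOMP/PR_SET_SECCOMP prctl pair exposes the existing strict mode (mode 1) without going through /proc. A user-space sketch; the PR_* values below match include/linux/prctl.h and are defined locally in case older libc headers lack them, and SYS_exit is invoked directly because a libc _exit() that maps to exit_group() would be killed in strict mode:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>

#ifndef PR_SET_SECCOMP		/* older userspace headers may lack these */
#define PR_GET_SECCOMP	21
#define PR_SET_SECCOMP	22
#endif

int main(void)
{
	if (prctl(PR_SET_SECCOMP, 1) != 0) {	/* mode 1: strict seccomp */
		perror("prctl(PR_SET_SECCOMP)");
		return 1;
	}
	/* Only read(), write(), exit() and sigreturn() are allowed now;
	 * anything else terminates the task with SIGKILL. */
	write(1, "sandboxed\n", 10);
	syscall(SYS_exit, 0);
	return 0;		/* not reached */
}
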
diff --git a/kernel/signal.c b/kernel/signal.c index f940560977..39d122753b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -718,6 +718,37 @@ out_set: | |||
718 | #define LEGACY_QUEUE(sigptr, sig) \ | 718 | #define LEGACY_QUEUE(sigptr, sig) \ |
719 | (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) | 719 | (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) |
720 | 720 | ||
721 | int print_fatal_signals; | ||
722 | |||
723 | static void print_fatal_signal(struct pt_regs *regs, int signr) | ||
724 | { | ||
725 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | ||
726 | current->comm, current->pid, signr); | ||
727 | |||
728 | #ifdef __i386__ | ||
729 | printk("code at %08lx: ", regs->eip); | ||
730 | { | ||
731 | int i; | ||
732 | for (i = 0; i < 16; i++) { | ||
733 | unsigned char insn; | ||
734 | |||
735 | __get_user(insn, (unsigned char *)(regs->eip + i)); | ||
736 | printk("%02x ", insn); | ||
737 | } | ||
738 | } | ||
739 | #endif | ||
740 | printk("\n"); | ||
741 | show_regs(regs); | ||
742 | } | ||
743 | |||
744 | static int __init setup_print_fatal_signals(char *str) | ||
745 | { | ||
746 | get_option (&str, &print_fatal_signals); | ||
747 | |||
748 | return 1; | ||
749 | } | ||
750 | |||
751 | __setup("print-fatal-signals=", setup_print_fatal_signals); | ||
721 | 752 | ||
722 | static int | 753 | static int |
723 | specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 754 | specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) |
@@ -1855,6 +1886,8 @@ relock: | |||
1855 | * Anything else is fatal, maybe with a core dump. | 1886 | * Anything else is fatal, maybe with a core dump. |
1856 | */ | 1887 | */ |
1857 | current->flags |= PF_SIGNALED; | 1888 | current->flags |= PF_SIGNALED; |
1889 | if ((signr != SIGKILL) && print_fatal_signals) | ||
1890 | print_fatal_signal(regs, signr); | ||
1858 | if (sig_kernel_coredump(signr)) { | 1891 | if (sig_kernel_coredump(signr)) { |
1859 | /* | 1892 | /* |
1860 | * If it was able to dump core, this kills all | 1893 | * If it was able to dump core, this kills all |
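
With the kernel booted with print-fatal-signals=1 (parsed by the __setup() handler above), any signal about to kill a task - other than SIGKILL - is logged, together with the code bytes at the faulting EIP on i386. A trivial way to exercise the path is a program that takes an unhandled SIGSEGV:

#include <stddef.h>

int main(void)
{
	volatile int *p = NULL;

	*p = 42;	/* unhandled SIGSEGV: fatal but not SIGKILL,
			 * so print_fatal_signal() reports it */
	return 0;
}
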
diff --git a/kernel/softirq.c b/kernel/softirq.c index 0b9886a00e..0f546ddea4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/notifier.h> | 14 | #include <linux/notifier.h> |
15 | #include <linux/percpu.h> | 15 | #include <linux/percpu.h> |
16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
17 | #include <linux/freezer.h> | ||
17 | #include <linux/kthread.h> | 18 | #include <linux/kthread.h> |
18 | #include <linux/rcupdate.h> | 19 | #include <linux/rcupdate.h> |
19 | #include <linux/smp.h> | 20 | #include <linux/smp.h> |
@@ -488,9 +489,6 @@ void __init softirq_init(void) | |||
488 | 489 | ||
489 | static int ksoftirqd(void * __bind_cpu) | 490 | static int ksoftirqd(void * __bind_cpu) |
490 | { | 491 | { |
491 | set_user_nice(current, 19); | ||
492 | current->flags |= PF_NOFREEZE; | ||
493 | |||
494 | set_current_state(TASK_INTERRUPTIBLE); | 492 | set_current_state(TASK_INTERRUPTIBLE); |
495 | 493 | ||
496 | while (!kthread_should_stop()) { | 494 | while (!kthread_should_stop()) { |
@@ -615,12 +613,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
615 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | 613 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
616 | any_online_cpu(cpu_online_map)); | 614 | any_online_cpu(cpu_online_map)); |
617 | case CPU_DEAD: | 615 | case CPU_DEAD: |
618 | case CPU_DEAD_FROZEN: | 616 | case CPU_DEAD_FROZEN: { |
617 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
618 | |||
619 | p = per_cpu(ksoftirqd, hotcpu); | 619 | p = per_cpu(ksoftirqd, hotcpu); |
620 | per_cpu(ksoftirqd, hotcpu) = NULL; | 620 | per_cpu(ksoftirqd, hotcpu) = NULL; |
621 | sched_setscheduler(p, SCHED_FIFO, ¶m); | ||
621 | kthread_stop(p); | 622 | kthread_stop(p); |
622 | takeover_tasklets(hotcpu); | 623 | takeover_tasklets(hotcpu); |
623 | break; | 624 | break; |
625 | } | ||
624 | #endif /* CONFIG_HOTPLUG_CPU */ | 626 | #endif /* CONFIG_HOTPLUG_CPU */ |
625 | } | 627 | } |
626 | return NOTIFY_OK; | 628 | return NOTIFY_OK; |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0131e296ff..708d4882c0 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/freezer.h> | ||
13 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
14 | #include <linux/notifier.h> | 15 | #include <linux/notifier.h> |
15 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu) | |||
116 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 117 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
117 | 118 | ||
118 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 119 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
119 | current->flags |= PF_NOFREEZE; | ||
120 | 120 | ||
121 | /* initialize timestamp */ | 121 | /* initialize timestamp */ |
122 | touch_softlockup_watchdog(); | 122 | touch_softlockup_watchdog(); |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c6c2bf855..cd72424c26 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock) | |||
72 | { | 72 | { |
73 | preempt_disable(); | 73 | preempt_disable(); |
74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
75 | _raw_read_lock(lock); | 75 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
76 | } | 76 | } |
77 | EXPORT_SYMBOL(_read_lock); | 77 | EXPORT_SYMBOL(_read_lock); |
78 | 78 | ||
@@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
88 | * _raw_spin_lock_flags() code, because lockdep assumes | 88 | * _raw_spin_lock_flags() code, because lockdep assumes |
89 | * that interrupts are not re-enabled during lock-acquire: | 89 | * that interrupts are not re-enabled during lock-acquire: |
90 | */ | 90 | */ |
91 | #ifdef CONFIG_PROVE_LOCKING | 91 | #ifdef CONFIG_LOCKDEP |
92 | _raw_spin_lock(lock); | 92 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
93 | #else | 93 | #else |
94 | _raw_spin_lock_flags(lock, &flags); | 94 | _raw_spin_lock_flags(lock, &flags); |
95 | #endif | 95 | #endif |
@@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) | |||
102 | local_irq_disable(); | 102 | local_irq_disable(); |
103 | preempt_disable(); | 103 | preempt_disable(); |
104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
105 | _raw_spin_lock(lock); | 105 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
106 | } | 106 | } |
107 | EXPORT_SYMBOL(_spin_lock_irq); | 107 | EXPORT_SYMBOL(_spin_lock_irq); |
108 | 108 | ||
@@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) | |||
111 | local_bh_disable(); | 111 | local_bh_disable(); |
112 | preempt_disable(); | 112 | preempt_disable(); |
113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
114 | _raw_spin_lock(lock); | 114 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
115 | } | 115 | } |
116 | EXPORT_SYMBOL(_spin_lock_bh); | 116 | EXPORT_SYMBOL(_spin_lock_bh); |
117 | 117 | ||
@@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | |||
122 | local_irq_save(flags); | 122 | local_irq_save(flags); |
123 | preempt_disable(); | 123 | preempt_disable(); |
124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
125 | _raw_read_lock(lock); | 125 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
126 | return flags; | 126 | return flags; |
127 | } | 127 | } |
128 | EXPORT_SYMBOL(_read_lock_irqsave); | 128 | EXPORT_SYMBOL(_read_lock_irqsave); |
@@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) | |||
132 | local_irq_disable(); | 132 | local_irq_disable(); |
133 | preempt_disable(); | 133 | preempt_disable(); |
134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
135 | _raw_read_lock(lock); | 135 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
136 | } | 136 | } |
137 | EXPORT_SYMBOL(_read_lock_irq); | 137 | EXPORT_SYMBOL(_read_lock_irq); |
138 | 138 | ||
@@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) | |||
141 | local_bh_disable(); | 141 | local_bh_disable(); |
142 | preempt_disable(); | 142 | preempt_disable(); |
143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
144 | _raw_read_lock(lock); | 144 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
145 | } | 145 | } |
146 | EXPORT_SYMBOL(_read_lock_bh); | 146 | EXPORT_SYMBOL(_read_lock_bh); |
147 | 147 | ||
@@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | |||
152 | local_irq_save(flags); | 152 | local_irq_save(flags); |
153 | preempt_disable(); | 153 | preempt_disable(); |
154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
155 | _raw_write_lock(lock); | 155 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
156 | return flags; | 156 | return flags; |
157 | } | 157 | } |
158 | EXPORT_SYMBOL(_write_lock_irqsave); | 158 | EXPORT_SYMBOL(_write_lock_irqsave); |
@@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) | |||
162 | local_irq_disable(); | 162 | local_irq_disable(); |
163 | preempt_disable(); | 163 | preempt_disable(); |
164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
165 | _raw_write_lock(lock); | 165 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
166 | } | 166 | } |
167 | EXPORT_SYMBOL(_write_lock_irq); | 167 | EXPORT_SYMBOL(_write_lock_irq); |
168 | 168 | ||
@@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) | |||
171 | local_bh_disable(); | 171 | local_bh_disable(); |
172 | preempt_disable(); | 172 | preempt_disable(); |
173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
174 | _raw_write_lock(lock); | 174 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
175 | } | 175 | } |
176 | EXPORT_SYMBOL(_write_lock_bh); | 176 | EXPORT_SYMBOL(_write_lock_bh); |
177 | 177 | ||
@@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock) | |||
179 | { | 179 | { |
180 | preempt_disable(); | 180 | preempt_disable(); |
181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
182 | _raw_spin_lock(lock); | 182 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
183 | } | 183 | } |
184 | 184 | ||
185 | EXPORT_SYMBOL(_spin_lock); | 185 | EXPORT_SYMBOL(_spin_lock); |
@@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock) | |||
188 | { | 188 | { |
189 | preempt_disable(); | 189 | preempt_disable(); |
190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
191 | _raw_write_lock(lock); | 191 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
192 | } | 192 | } |
193 | 193 | ||
194 | EXPORT_SYMBOL(_write_lock); | 194 | EXPORT_SYMBOL(_write_lock); |
@@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | |||
289 | { | 289 | { |
290 | preempt_disable(); | 290 | preempt_disable(); |
291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | 291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); |
292 | _raw_spin_lock(lock); | 292 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
293 | } | 293 | } |
294 | 294 | ||
295 | EXPORT_SYMBOL(_spin_lock_nested); | 295 | EXPORT_SYMBOL(_spin_lock_nested); |
@@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas | |||
305 | * _raw_spin_lock_flags() code, because lockdep assumes | 305 | * _raw_spin_lock_flags() code, because lockdep assumes |
306 | * that interrupts are not re-enabled during lock-acquire: | 306 | * that interrupts are not re-enabled during lock-acquire: |
307 | */ | 307 | */ |
308 | #ifdef CONFIG_PROVE_SPIN_LOCKING | 308 | #ifdef CONFIG_LOCKDEP |
309 | _raw_spin_lock(lock); | 309 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
310 | #else | 310 | #else |
311 | _raw_spin_lock_flags(lock, &flags); | 311 | _raw_spin_lock_flags(lock, &flags); |
312 | #endif | 312 | #endif |
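
Each _spin_lock/_read_lock/_write_lock variant above now goes through LOCK_CONTENDED(), which tries the trylock fast path first and only records the lock as contended before falling back to the blocking acquire, feeding the lock-statistics code. A user-space sketch of the same try-then-block shape around a pthread mutex (the counter is purely illustrative, not a lockdep or lockstat API; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static unsigned long contended_count;	/* illustrative counter only */

static void lock_contended_style(pthread_mutex_t *m)
{
	/* Fast path: uncontended acquire, nothing to account. */
	if (pthread_mutex_trylock(m) == 0)
		return;

	/* Slow path: note the contention, then block for the lock -
	 * the same shape as LOCK_CONTENDED(lock, try, lock) above. */
	contended_count++;
	pthread_mutex_lock(m);
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	lock_contended_style(&m);	/* single-threaded: fast path wins */
	pthread_mutex_unlock(&m);
	printf("contended %lu time(s)\n", contended_count);
	return 0;
}
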
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fcee2a8e6d..319821ef78 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state) | |||
93 | static int stop_machine(void) | 93 | static int stop_machine(void) |
94 | { | 94 | { |
95 | int i, ret = 0; | 95 | int i, ret = 0; |
96 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
97 | |||
98 | /* One high-prio thread per cpu. We'll do this one. */ | ||
99 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
100 | 96 | ||
101 | atomic_set(&stopmachine_thread_ack, 0); | 97 | atomic_set(&stopmachine_thread_ack, 0); |
102 | stopmachine_num_threads = 0; | 98 | stopmachine_num_threads = 0; |
@@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, | |||
189 | 185 | ||
190 | p = kthread_create(do_stop, &smdata, "kstopmachine"); | 186 | p = kthread_create(do_stop, &smdata, "kstopmachine"); |
191 | if (!IS_ERR(p)) { | 187 | if (!IS_ERR(p)) { |
188 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
189 | |||
190 | /* One high-prio thread per cpu. We'll do this one. */ | ||
191 | sched_setscheduler(p, SCHED_FIFO, ¶m); | ||
192 | kthread_bind(p, cpu); | 192 | kthread_bind(p, cpu); |
193 | wake_up_process(p); | 193 | wake_up_process(p); |
194 | wait_for_completion(&smdata.done); | 194 | wait_for_completion(&smdata.done); |
diff --git a/kernel/sys.c b/kernel/sys.c index 872271ccc3..08562f4197 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -31,10 +31,12 @@ | |||
31 | #include <linux/cn_proc.h> | 31 | #include <linux/cn_proc.h> |
32 | #include <linux/getcpu.h> | 32 | #include <linux/getcpu.h> |
33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
34 | #include <linux/seccomp.h> | ||
34 | 35 | ||
35 | #include <linux/compat.h> | 36 | #include <linux/compat.h> |
36 | #include <linux/syscalls.h> | 37 | #include <linux/syscalls.h> |
37 | #include <linux/kprobes.h> | 38 | #include <linux/kprobes.h> |
39 | #include <linux/user_namespace.h> | ||
38 | 40 | ||
39 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
40 | #include <asm/io.h> | 42 | #include <asm/io.h> |
@@ -98,6 +100,13 @@ struct pid *cad_pid; | |||
98 | EXPORT_SYMBOL(cad_pid); | 100 | EXPORT_SYMBOL(cad_pid); |
99 | 101 | ||
100 | /* | 102 | /* |
103 | * If set, this is used for preparing the system to power off. | ||
104 | */ | ||
105 | |||
106 | void (*pm_power_off_prepare)(void); | ||
107 | EXPORT_SYMBOL(pm_power_off_prepare); | ||
108 | |||
109 | /* | ||
101 | * Notifier list for kernel code which wants to be called | 110 | * Notifier list for kernel code which wants to be called |
102 | * at shutdown. This is used to stop any idling DMA operations | 111 | * at shutdown. This is used to stop any idling DMA operations |
103 | * and the like. | 112 | * and the like. |
@@ -865,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt); | |||
865 | void kernel_power_off(void) | 874 | void kernel_power_off(void) |
866 | { | 875 | { |
867 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | 876 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); |
877 | if (pm_power_off_prepare) | ||
878 | pm_power_off_prepare(); | ||
868 | printk(KERN_EMERG "Power down.\n"); | 879 | printk(KERN_EMERG "Power down.\n"); |
869 | machine_power_off(); | 880 | machine_power_off(); |
870 | } | 881 | } |
@@ -1025,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
1025 | return -EPERM; | 1036 | return -EPERM; |
1026 | } | 1037 | } |
1027 | if (new_egid != old_egid) { | 1038 | if (new_egid != old_egid) { |
1028 | current->mm->dumpable = suid_dumpable; | 1039 | set_dumpable(current->mm, suid_dumpable); |
1029 | smp_wmb(); | 1040 | smp_wmb(); |
1030 | } | 1041 | } |
1031 | if (rgid != (gid_t) -1 || | 1042 | if (rgid != (gid_t) -1 || |
@@ -1055,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid) | |||
1055 | 1066 | ||
1056 | if (capable(CAP_SETGID)) { | 1067 | if (capable(CAP_SETGID)) { |
1057 | if (old_egid != gid) { | 1068 | if (old_egid != gid) { |
1058 | current->mm->dumpable = suid_dumpable; | 1069 | set_dumpable(current->mm, suid_dumpable); |
1059 | smp_wmb(); | 1070 | smp_wmb(); |
1060 | } | 1071 | } |
1061 | current->gid = current->egid = current->sgid = current->fsgid = gid; | 1072 | current->gid = current->egid = current->sgid = current->fsgid = gid; |
1062 | } else if ((gid == current->gid) || (gid == current->sgid)) { | 1073 | } else if ((gid == current->gid) || (gid == current->sgid)) { |
1063 | if (old_egid != gid) { | 1074 | if (old_egid != gid) { |
1064 | current->mm->dumpable = suid_dumpable; | 1075 | set_dumpable(current->mm, suid_dumpable); |
1065 | smp_wmb(); | 1076 | smp_wmb(); |
1066 | } | 1077 | } |
1067 | current->egid = current->fsgid = gid; | 1078 | current->egid = current->fsgid = gid; |
@@ -1078,13 +1089,13 @@ static int set_user(uid_t new_ruid, int dumpclear) | |||
1078 | { | 1089 | { |
1079 | struct user_struct *new_user; | 1090 | struct user_struct *new_user; |
1080 | 1091 | ||
1081 | new_user = alloc_uid(new_ruid); | 1092 | new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); |
1082 | if (!new_user) | 1093 | if (!new_user) |
1083 | return -EAGAIN; | 1094 | return -EAGAIN; |
1084 | 1095 | ||
1085 | if (atomic_read(&new_user->processes) >= | 1096 | if (atomic_read(&new_user->processes) >= |
1086 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | 1097 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && |
1087 | new_user != &root_user) { | 1098 | new_user != current->nsproxy->user_ns->root_user) { |
1088 | free_uid(new_user); | 1099 | free_uid(new_user); |
1089 | return -EAGAIN; | 1100 | return -EAGAIN; |
1090 | } | 1101 | } |
@@ -1092,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear) | |||
1092 | switch_uid(new_user); | 1103 | switch_uid(new_user); |
1093 | 1104 | ||
1094 | if (dumpclear) { | 1105 | if (dumpclear) { |
1095 | current->mm->dumpable = suid_dumpable; | 1106 | set_dumpable(current->mm, suid_dumpable); |
1096 | smp_wmb(); | 1107 | smp_wmb(); |
1097 | } | 1108 | } |
1098 | current->uid = new_ruid; | 1109 | current->uid = new_ruid; |
@@ -1148,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
1148 | return -EAGAIN; | 1159 | return -EAGAIN; |
1149 | 1160 | ||
1150 | if (new_euid != old_euid) { | 1161 | if (new_euid != old_euid) { |
1151 | current->mm->dumpable = suid_dumpable; | 1162 | set_dumpable(current->mm, suid_dumpable); |
1152 | smp_wmb(); | 1163 | smp_wmb(); |
1153 | } | 1164 | } |
1154 | current->fsuid = current->euid = new_euid; | 1165 | current->fsuid = current->euid = new_euid; |
@@ -1198,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
1198 | return -EPERM; | 1209 | return -EPERM; |
1199 | 1210 | ||
1200 | if (old_euid != uid) { | 1211 | if (old_euid != uid) { |
1201 | current->mm->dumpable = suid_dumpable; | 1212 | set_dumpable(current->mm, suid_dumpable); |
1202 | smp_wmb(); | 1213 | smp_wmb(); |
1203 | } | 1214 | } |
1204 | current->fsuid = current->euid = uid; | 1215 | current->fsuid = current->euid = uid; |
@@ -1243,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
1243 | } | 1254 | } |
1244 | if (euid != (uid_t) -1) { | 1255 | if (euid != (uid_t) -1) { |
1245 | if (euid != current->euid) { | 1256 | if (euid != current->euid) { |
1246 | current->mm->dumpable = suid_dumpable; | 1257 | set_dumpable(current->mm, suid_dumpable); |
1247 | smp_wmb(); | 1258 | smp_wmb(); |
1248 | } | 1259 | } |
1249 | current->euid = euid; | 1260 | current->euid = euid; |
@@ -1293,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
1293 | } | 1304 | } |
1294 | if (egid != (gid_t) -1) { | 1305 | if (egid != (gid_t) -1) { |
1295 | if (egid != current->egid) { | 1306 | if (egid != current->egid) { |
1296 | current->mm->dumpable = suid_dumpable; | 1307 | set_dumpable(current->mm, suid_dumpable); |
1297 | smp_wmb(); | 1308 | smp_wmb(); |
1298 | } | 1309 | } |
1299 | current->egid = egid; | 1310 | current->egid = egid; |
@@ -1339,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
1339 | uid == current->suid || uid == current->fsuid || | 1350 | uid == current->suid || uid == current->fsuid || |
1340 | capable(CAP_SETUID)) { | 1351 | capable(CAP_SETUID)) { |
1341 | if (uid != old_fsuid) { | 1352 | if (uid != old_fsuid) { |
1342 | current->mm->dumpable = suid_dumpable; | 1353 | set_dumpable(current->mm, suid_dumpable); |
1343 | smp_wmb(); | 1354 | smp_wmb(); |
1344 | } | 1355 | } |
1345 | current->fsuid = uid; | 1356 | current->fsuid = uid; |
@@ -1368,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
1368 | gid == current->sgid || gid == current->fsgid || | 1379 | gid == current->sgid || gid == current->fsgid || |
1369 | capable(CAP_SETGID)) { | 1380 | capable(CAP_SETGID)) { |
1370 | if (gid != old_fsgid) { | 1381 | if (gid != old_fsgid) { |
1371 | current->mm->dumpable = suid_dumpable; | 1382 | set_dumpable(current->mm, suid_dumpable); |
1372 | smp_wmb(); | 1383 | smp_wmb(); |
1373 | } | 1384 | } |
1374 | current->fsgid = gid; | 1385 | current->fsgid = gid; |
@@ -2165,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
2165 | error = put_user(current->pdeath_signal, (int __user *)arg2); | 2176 | error = put_user(current->pdeath_signal, (int __user *)arg2); |
2166 | break; | 2177 | break; |
2167 | case PR_GET_DUMPABLE: | 2178 | case PR_GET_DUMPABLE: |
2168 | error = current->mm->dumpable; | 2179 | error = get_dumpable(current->mm); |
2169 | break; | 2180 | break; |
2170 | case PR_SET_DUMPABLE: | 2181 | case PR_SET_DUMPABLE: |
2171 | if (arg2 < 0 || arg2 > 1) { | 2182 | if (arg2 < 0 || arg2 > 1) { |
2172 | error = -EINVAL; | 2183 | error = -EINVAL; |
2173 | break; | 2184 | break; |
2174 | } | 2185 | } |
2175 | current->mm->dumpable = arg2; | 2186 | set_dumpable(current->mm, arg2); |
2176 | break; | 2187 | break; |
2177 | 2188 | ||
2178 | case PR_SET_UNALIGN: | 2189 | case PR_SET_UNALIGN: |
@@ -2241,6 +2252,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
2241 | error = SET_ENDIAN(current, arg2); | 2252 | error = SET_ENDIAN(current, arg2); |
2242 | break; | 2253 | break; |
2243 | 2254 | ||
2255 | case PR_GET_SECCOMP: | ||
2256 | error = prctl_get_seccomp(); | ||
2257 | break; | ||
2258 | case PR_SET_SECCOMP: | ||
2259 | error = prctl_set_seccomp(arg2); | ||
2260 | break; | ||
2261 | |||
2244 | default: | 2262 | default: |
2245 | error = -EINVAL; | 2263 | error = -EINVAL; |
2246 | break; | 2264 | break; |
@@ -2277,3 +2295,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | |||
2277 | } | 2295 | } |
2278 | return err ? -EFAULT : 0; | 2296 | return err ? -EFAULT : 0; |
2279 | } | 2297 | } |
2298 | |||
2299 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | ||
2300 | |||
2301 | static void argv_cleanup(char **argv, char **envp) | ||
2302 | { | ||
2303 | argv_free(argv); | ||
2304 | } | ||
2305 | |||
2306 | /** | ||
2307 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2308 | * @force: force poweroff if command execution fails | ||
2309 | * | ||
2310 | * This may be called from any context to trigger a system shutdown. | ||
2311 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2312 | */ | ||
2313 | int orderly_poweroff(bool force) | ||
2314 | { | ||
2315 | int argc; | ||
2316 | char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | ||
2317 | static char *envp[] = { | ||
2318 | "HOME=/", | ||
2319 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | ||
2320 | NULL | ||
2321 | }; | ||
2322 | int ret = -ENOMEM; | ||
2323 | struct subprocess_info *info; | ||
2324 | |||
2325 | if (argv == NULL) { | ||
2326 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | ||
2327 | __func__, poweroff_cmd); | ||
2328 | goto out; | ||
2329 | } | ||
2330 | |||
2331 | info = call_usermodehelper_setup(argv[0], argv, envp); | ||
2332 | if (info == NULL) { | ||
2333 | argv_free(argv); | ||
2334 | goto out; | ||
2335 | } | ||
2336 | |||
2337 | call_usermodehelper_setcleanup(info, argv_cleanup); | ||
2338 | |||
2339 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | ||
2340 | |||
2341 | out: | ||
2342 | if (ret && force) { | ||
2343 | printk(KERN_WARNING "Failed to start orderly shutdown: " | ||
2344 | "forcing the issue\n"); | ||
2345 | |||
2346 | /* I guess this should try to kick off some daemon to | ||
2347 | sync and poweroff asap. Or not even bother syncing | ||
2348 | if we're doing an emergency shutdown? */ | ||
2349 | emergency_sync(); | ||
2350 | kernel_power_off(); | ||
2351 | } | ||
2352 | |||
2353 | return ret; | ||
2354 | } | ||
2355 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
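
The kernel/sys.c hunks above replace direct writes to mm->dumpable with the set_dumpable()/get_dumpable() helpers, wire PR_GET_SECCOMP/PR_SET_SECCOMP into sys_prctl(), and add the kernel-internal orderly_poweroff() helper. A minimal user-space sketch of the prctl() side follows; it assumes a kernel with these changes and defines the PR_* constants itself in case the installed headers predate them. It is an illustration, not part of the patch.

#include <stdio.h>
#include <sys/prctl.h>

/* Values from include/linux/prctl.h; defined here only in case older
 * user-space headers do not know about them yet. */
#ifndef PR_GET_DUMPABLE
#define PR_GET_DUMPABLE 3
#endif
#ifndef PR_SET_DUMPABLE
#define PR_SET_DUMPABLE 4
#endif
#ifndef PR_GET_SECCOMP
#define PR_GET_SECCOMP 21
#endif
#ifndef PR_SET_SECCOMP
#define PR_SET_SECCOMP 22
#endif

int main(void)
{
    /* PR_GET_DUMPABLE now goes through get_dumpable(current->mm). */
    printf("dumpable: %d\n", prctl(PR_GET_DUMPABLE, 0, 0, 0, 0));

    /* Clearing the flag goes through set_dumpable(); this version of
     * sys_prctl() accepts only 0 and 1. */
    if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0)
        perror("PR_SET_DUMPABLE");

    /* 0 means seccomp is off for this task; on kernels without
     * CONFIG_SECCOMP the call simply fails. */
    printf("seccomp mode: %d\n", prctl(PR_GET_SECCOMP, 0, 0, 0, 0));
    return 0;
}
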
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7e11e2c98b..b0ec498a18 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void) | |||
14 | 14 | ||
15 | cond_syscall(sys_nfsservctl); | 15 | cond_syscall(sys_nfsservctl); |
16 | cond_syscall(sys_quotactl); | 16 | cond_syscall(sys_quotactl); |
17 | cond_syscall(sys32_quotactl); | ||
17 | cond_syscall(sys_acct); | 18 | cond_syscall(sys_acct); |
18 | cond_syscall(sys_lookup_dcookie); | 19 | cond_syscall(sys_lookup_dcookie); |
19 | cond_syscall(sys_swapon); | 20 | cond_syscall(sys_swapon); |
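
For context, cond_syscall() marks a syscall as optional: if nothing in the build provides a strong definition, the symbol resolves to sys_ni_syscall(), which just returns -ENOSYS. The kernel does this with an asm-level weak alias; the stand-alone sketch below mimics the idea with GCC attributes purely for illustration, so adding cond_syscall(sys32_quotactl) above is enough to keep links working when 32-bit quota support is not built.

#include <stdio.h>
#include <errno.h>

/* Fallback used when the optional syscall is not built in. */
long sys_ni_syscall(void)
{
    return -ENOSYS;
}

/* Weak alias: a real sys32_quotactl() elsewhere would override this,
 * which is the effect of cond_syscall(sys32_quotactl) in sys_ni.c. */
long sys32_quotactl(void) __attribute__((weak, alias("sys_ni_syscall")));

int main(void)
{
    printf("sys32_quotactl() -> %ld (-ENOSYS is %d)\n",
           sys32_quotactl(), -ENOSYS);
    return 0;
}
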
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 30ee462ee7..222299844a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/utsname.h> | 29 | #include <linux/utsname.h> |
30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
31 | #include <linux/smp_lock.h> | 31 | #include <linux/smp_lock.h> |
32 | #include <linux/fs.h> | ||
32 | #include <linux/init.h> | 33 | #include <linux/init.h> |
33 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
34 | #include <linux/kobject.h> | 35 | #include <linux/kobject.h> |
@@ -45,13 +46,11 @@ | |||
45 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
46 | #include <linux/nfs_fs.h> | 47 | #include <linux/nfs_fs.h> |
47 | #include <linux/acpi.h> | 48 | #include <linux/acpi.h> |
49 | #include <linux/reboot.h> | ||
48 | 50 | ||
49 | #include <asm/uaccess.h> | 51 | #include <asm/uaccess.h> |
50 | #include <asm/processor.h> | 52 | #include <asm/processor.h> |
51 | 53 | ||
52 | extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | ||
53 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
54 | |||
55 | #ifdef CONFIG_X86 | 54 | #ifdef CONFIG_X86 |
56 | #include <asm/nmi.h> | 55 | #include <asm/nmi.h> |
57 | #include <asm/stacktrace.h> | 56 | #include <asm/stacktrace.h> |
@@ -61,6 +60,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | |||
61 | 60 | ||
62 | /* External variables not in a header file. */ | 61 | /* External variables not in a header file. */ |
63 | extern int C_A_D; | 62 | extern int C_A_D; |
63 | extern int print_fatal_signals; | ||
64 | extern int sysctl_overcommit_memory; | 64 | extern int sysctl_overcommit_memory; |
65 | extern int sysctl_overcommit_ratio; | 65 | extern int sysctl_overcommit_ratio; |
66 | extern int sysctl_panic_on_oom; | 66 | extern int sysctl_panic_on_oom; |
@@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction; | |||
78 | extern int compat_log; | 78 | extern int compat_log; |
79 | extern int maps_protect; | 79 | extern int maps_protect; |
80 | extern int sysctl_stat_interval; | 80 | extern int sysctl_stat_interval; |
81 | extern int audit_argv_kb; | ||
81 | 82 | ||
82 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 83 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
83 | static int maxolduid = 65535; | 84 | static int maxolduid = 65535; |
@@ -160,6 +161,8 @@ extern ctl_table inotify_table[]; | |||
160 | int sysctl_legacy_va_layout; | 161 | int sysctl_legacy_va_layout; |
161 | #endif | 162 | #endif |
162 | 163 | ||
164 | extern int prove_locking; | ||
165 | extern int lock_stat; | ||
163 | 166 | ||
164 | /* The default sysctl tables: */ | 167 | /* The default sysctl tables: */ |
165 | 168 | ||
@@ -202,11 +205,114 @@ static ctl_table root_table[] = { | |||
202 | .mode = 0555, | 205 | .mode = 0555, |
203 | .child = dev_table, | 206 | .child = dev_table, |
204 | }, | 207 | }, |
205 | 208 | /* | |
209 | * NOTE: do not add new entries to this table unless you have read | ||
210 | * Documentation/sysctl/ctl_unnumbered.txt | ||
211 | */ | ||
206 | { .ctl_name = 0 } | 212 | { .ctl_name = 0 } |
207 | }; | 213 | }; |
208 | 214 | ||
215 | #ifdef CONFIG_SCHED_DEBUG | ||
216 | static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ | ||
217 | static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ | ||
218 | static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ | ||
219 | static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ | ||
220 | #endif | ||
221 | |||
209 | static ctl_table kern_table[] = { | 222 | static ctl_table kern_table[] = { |
223 | #ifdef CONFIG_SCHED_DEBUG | ||
224 | { | ||
225 | .ctl_name = CTL_UNNUMBERED, | ||
226 | .procname = "sched_granularity_ns", | ||
227 | .data = &sysctl_sched_granularity, | ||
228 | .maxlen = sizeof(unsigned int), | ||
229 | .mode = 0644, | ||
230 | .proc_handler = &proc_dointvec_minmax, | ||
231 | .strategy = &sysctl_intvec, | ||
232 | .extra1 = &min_sched_granularity_ns, | ||
233 | .extra2 = &max_sched_granularity_ns, | ||
234 | }, | ||
235 | { | ||
236 | .ctl_name = CTL_UNNUMBERED, | ||
237 | .procname = "sched_wakeup_granularity_ns", | ||
238 | .data = &sysctl_sched_wakeup_granularity, | ||
239 | .maxlen = sizeof(unsigned int), | ||
240 | .mode = 0644, | ||
241 | .proc_handler = &proc_dointvec_minmax, | ||
242 | .strategy = &sysctl_intvec, | ||
243 | .extra1 = &min_wakeup_granularity_ns, | ||
244 | .extra2 = &max_wakeup_granularity_ns, | ||
245 | }, | ||
246 | { | ||
247 | .ctl_name = CTL_UNNUMBERED, | ||
248 | .procname = "sched_batch_wakeup_granularity_ns", | ||
249 | .data = &sysctl_sched_batch_wakeup_granularity, | ||
250 | .maxlen = sizeof(unsigned int), | ||
251 | .mode = 0644, | ||
252 | .proc_handler = &proc_dointvec_minmax, | ||
253 | .strategy = &sysctl_intvec, | ||
254 | .extra1 = &min_wakeup_granularity_ns, | ||
255 | .extra2 = &max_wakeup_granularity_ns, | ||
256 | }, | ||
257 | { | ||
258 | .ctl_name = CTL_UNNUMBERED, | ||
259 | .procname = "sched_stat_granularity_ns", | ||
260 | .data = &sysctl_sched_stat_granularity, | ||
261 | .maxlen = sizeof(unsigned int), | ||
262 | .mode = 0644, | ||
263 | .proc_handler = &proc_dointvec_minmax, | ||
264 | .strategy = &sysctl_intvec, | ||
265 | .extra1 = &min_wakeup_granularity_ns, | ||
266 | .extra2 = &max_wakeup_granularity_ns, | ||
267 | }, | ||
268 | { | ||
269 | .ctl_name = CTL_UNNUMBERED, | ||
270 | .procname = "sched_runtime_limit_ns", | ||
271 | .data = &sysctl_sched_runtime_limit, | ||
272 | .maxlen = sizeof(unsigned int), | ||
273 | .mode = 0644, | ||
274 | .proc_handler = &proc_dointvec_minmax, | ||
275 | .strategy = &sysctl_intvec, | ||
276 | .extra1 = &min_sched_granularity_ns, | ||
277 | .extra2 = &max_sched_granularity_ns, | ||
278 | }, | ||
279 | { | ||
280 | .ctl_name = CTL_UNNUMBERED, | ||
281 | .procname = "sched_child_runs_first", | ||
282 | .data = &sysctl_sched_child_runs_first, | ||
283 | .maxlen = sizeof(unsigned int), | ||
284 | .mode = 0644, | ||
285 | .proc_handler = &proc_dointvec, | ||
286 | }, | ||
287 | #ifdef CONFIG_PROVE_LOCKING | ||
288 | { | ||
289 | .ctl_name = CTL_UNNUMBERED, | ||
290 | .procname = "prove_locking", | ||
291 | .data = &prove_locking, | ||
292 | .maxlen = sizeof(int), | ||
293 | .mode = 0644, | ||
294 | .proc_handler = &proc_dointvec, | ||
295 | }, | ||
296 | #endif | ||
297 | #ifdef CONFIG_LOCK_STAT | ||
298 | { | ||
299 | .ctl_name = CTL_UNNUMBERED, | ||
300 | .procname = "lock_stat", | ||
301 | .data = &lock_stat, | ||
302 | .maxlen = sizeof(int), | ||
303 | .mode = 0644, | ||
304 | .proc_handler = &proc_dointvec, | ||
305 | }, | ||
306 | #endif | ||
307 | { | ||
308 | .ctl_name = CTL_UNNUMBERED, | ||
309 | .procname = "sched_features", | ||
310 | .data = &sysctl_sched_features, | ||
311 | .maxlen = sizeof(unsigned int), | ||
312 | .mode = 0644, | ||
313 | .proc_handler = &proc_dointvec, | ||
314 | }, | ||
315 | #endif | ||
210 | { | 316 | { |
211 | .ctl_name = KERN_PANIC, | 317 | .ctl_name = KERN_PANIC, |
212 | .procname = "panic", | 318 | .procname = "panic", |
@@ -223,6 +329,16 @@ static ctl_table kern_table[] = { | |||
223 | .mode = 0644, | 329 | .mode = 0644, |
224 | .proc_handler = &proc_dointvec, | 330 | .proc_handler = &proc_dointvec, |
225 | }, | 331 | }, |
332 | #ifdef CONFIG_AUDITSYSCALL | ||
333 | { | ||
334 | .ctl_name = CTL_UNNUMBERED, | ||
335 | .procname = "audit_argv_kb", | ||
336 | .data = &audit_argv_kb, | ||
337 | .maxlen = sizeof(int), | ||
338 | .mode = 0644, | ||
339 | .proc_handler = &proc_dointvec, | ||
340 | }, | ||
341 | #endif | ||
226 | { | 342 | { |
227 | .ctl_name = KERN_CORE_PATTERN, | 343 | .ctl_name = KERN_CORE_PATTERN, |
228 | .procname = "core_pattern", | 344 | .procname = "core_pattern", |
@@ -260,6 +376,14 @@ static ctl_table kern_table[] = { | |||
260 | .proc_handler = &proc_dointvec, | 376 | .proc_handler = &proc_dointvec, |
261 | }, | 377 | }, |
262 | #endif | 378 | #endif |
379 | { | ||
380 | .ctl_name = CTL_UNNUMBERED, | ||
381 | .procname = "print-fatal-signals", | ||
382 | .data = &print_fatal_signals, | ||
383 | .maxlen = sizeof(int), | ||
384 | .mode = 0644, | ||
385 | .proc_handler = &proc_dointvec, | ||
386 | }, | ||
263 | #ifdef __sparc__ | 387 | #ifdef __sparc__ |
264 | { | 388 | { |
265 | .ctl_name = KERN_SPARC_REBOOT, | 389 | .ctl_name = KERN_SPARC_REBOOT, |
@@ -569,7 +693,7 @@ static ctl_table kern_table[] = { | |||
569 | { | 693 | { |
570 | .ctl_name = KERN_ACPI_VIDEO_FLAGS, | 694 | .ctl_name = KERN_ACPI_VIDEO_FLAGS, |
571 | .procname = "acpi_video_flags", | 695 | .procname = "acpi_video_flags", |
572 | .data = &acpi_video_flags, | 696 | .data = &acpi_realmode_flags, |
573 | .maxlen = sizeof (unsigned long), | 697 | .maxlen = sizeof (unsigned long), |
574 | .mode = 0644, | 698 | .mode = 0644, |
575 | .proc_handler = &proc_doulongvec_minmax, | 699 | .proc_handler = &proc_doulongvec_minmax, |
@@ -615,13 +739,26 @@ static ctl_table kern_table[] = { | |||
615 | .proc_handler = &proc_dointvec, | 739 | .proc_handler = &proc_dointvec, |
616 | }, | 740 | }, |
617 | #endif | 741 | #endif |
618 | 742 | { | |
743 | .ctl_name = CTL_UNNUMBERED, | ||
744 | .procname = "poweroff_cmd", | ||
745 | .data = &poweroff_cmd, | ||
746 | .maxlen = POWEROFF_CMD_PATH_LEN, | ||
747 | .mode = 0644, | ||
748 | .proc_handler = &proc_dostring, | ||
749 | .strategy = &sysctl_string, | ||
750 | }, | ||
751 | /* | ||
752 | * NOTE: do not add new entries to this table unless you have read | ||
753 | * Documentation/sysctl/ctl_unnumbered.txt | ||
754 | */ | ||
619 | { .ctl_name = 0 } | 755 | { .ctl_name = 0 } |
620 | }; | 756 | }; |
621 | 757 | ||
622 | /* Constants for minimum and maximum testing in vm_table. | 758 | /* Constants for minimum and maximum testing in vm_table. |
623 | We use these as one-element integer vectors. */ | 759 | We use these as one-element integer vectors. */ |
624 | static int zero; | 760 | static int zero; |
761 | static int two = 2; | ||
625 | static int one_hundred = 100; | 762 | static int one_hundred = 100; |
626 | 763 | ||
627 | 764 | ||
@@ -734,6 +871,14 @@ static ctl_table vm_table[] = { | |||
734 | .mode = 0644, | 871 | .mode = 0644, |
735 | .proc_handler = &proc_dointvec, | 872 | .proc_handler = &proc_dointvec, |
736 | }, | 873 | }, |
874 | { | ||
875 | .ctl_name = CTL_UNNUMBERED, | ||
876 | .procname = "hugepages_treat_as_movable", | ||
877 | .data = &hugepages_treat_as_movable, | ||
878 | .maxlen = sizeof(int), | ||
879 | .mode = 0644, | ||
880 | .proc_handler = &hugetlb_treat_movable_handler, | ||
881 | }, | ||
737 | #endif | 882 | #endif |
738 | { | 883 | { |
739 | .ctl_name = VM_LOWMEM_RESERVE_RATIO, | 884 | .ctl_name = VM_LOWMEM_RESERVE_RATIO, |
@@ -869,6 +1014,27 @@ static ctl_table vm_table[] = { | |||
869 | .strategy = &sysctl_jiffies, | 1014 | .strategy = &sysctl_jiffies, |
870 | }, | 1015 | }, |
871 | #endif | 1016 | #endif |
1017 | #ifdef CONFIG_SECURITY | ||
1018 | { | ||
1019 | .ctl_name = CTL_UNNUMBERED, | ||
1020 | .procname = "mmap_min_addr", | ||
1021 | .data = &mmap_min_addr, | ||
1022 | .maxlen = sizeof(unsigned long), | ||
1023 | .mode = 0644, | ||
1024 | .proc_handler = &proc_doulongvec_minmax, | ||
1025 | }, | ||
1026 | #ifdef CONFIG_NUMA | ||
1027 | { | ||
1028 | .ctl_name = CTL_UNNUMBERED, | ||
1029 | .procname = "numa_zonelist_order", | ||
1030 | .data = &numa_zonelist_order, | ||
1031 | .maxlen = NUMA_ZONELIST_ORDER_LEN, | ||
1032 | .mode = 0644, | ||
1033 | .proc_handler = &numa_zonelist_order_handler, | ||
1034 | .strategy = &sysctl_string, | ||
1035 | }, | ||
1036 | #endif | ||
1037 | #endif | ||
872 | #if defined(CONFIG_X86_32) || \ | 1038 | #if defined(CONFIG_X86_32) || \ |
873 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1039 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
874 | { | 1040 | { |
@@ -882,6 +1048,10 @@ static ctl_table vm_table[] = { | |||
882 | .extra1 = &zero, | 1048 | .extra1 = &zero, |
883 | }, | 1049 | }, |
884 | #endif | 1050 | #endif |
1051 | /* | ||
1052 | * NOTE: do not add new entries to this table unless you have read | ||
1053 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1054 | */ | ||
885 | { .ctl_name = 0 } | 1055 | { .ctl_name = 0 } |
886 | }; | 1056 | }; |
887 | 1057 | ||
@@ -979,7 +1149,10 @@ static ctl_table fs_table[] = { | |||
979 | .data = &lease_break_time, | 1149 | .data = &lease_break_time, |
980 | .maxlen = sizeof(int), | 1150 | .maxlen = sizeof(int), |
981 | .mode = 0644, | 1151 | .mode = 0644, |
982 | .proc_handler = &proc_dointvec, | 1152 | .proc_handler = &proc_dointvec_minmax, |
1153 | .strategy = &sysctl_intvec, | ||
1154 | .extra1 = &zero, | ||
1155 | .extra2 = &two, | ||
983 | }, | 1156 | }, |
984 | { | 1157 | { |
985 | .ctl_name = FS_AIO_NR, | 1158 | .ctl_name = FS_AIO_NR, |
@@ -1022,6 +1195,10 @@ static ctl_table fs_table[] = { | |||
1022 | .child = binfmt_misc_table, | 1195 | .child = binfmt_misc_table, |
1023 | }, | 1196 | }, |
1024 | #endif | 1197 | #endif |
1198 | /* | ||
1199 | * NOTE: do not add new entries to this table unless you have read | ||
1200 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1201 | */ | ||
1025 | { .ctl_name = 0 } | 1202 | { .ctl_name = 0 } |
1026 | }; | 1203 | }; |
1027 | 1204 | ||
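
The sysctl changes above add a batch of CTL_UNNUMBERED entries (scheduler knobs, audit_argv_kb, print-fatal-signals, poweroff_cmd, mmap_min_addr, numa_zonelist_order, hugepages_treat_as_movable) and repeat the note that new entries must follow Documentation/sysctl/ctl_unnumbered.txt. Unnumbered entries have no binary sysctl(2) number, so from user space they are reachable only through /proc/sys; a small sketch that reads two of them is below. The paths assume the matching config options are enabled.

#include <stdio.h>

/* Read a /proc/sys knob as text; CTL_UNNUMBERED entries such as
 * kernel.poweroff_cmd are visible only through this interface. */
static int read_sysctl(const char *path, char *buf, size_t len)
{
    FILE *f = fopen(path, "r");
    if (!f)
        return -1;
    if (!fgets(buf, (int)len, f))
        buf[0] = '\0';
    fclose(f);
    return 0;
}

int main(void)
{
    char val[128];

    if (read_sysctl("/proc/sys/kernel/poweroff_cmd", val, sizeof(val)) == 0)
        printf("poweroff_cmd: %s", val);

    if (read_sysctl("/proc/sys/kernel/print-fatal-signals", val, sizeof(val)) == 0)
        printf("print-fatal-signals: %s", val);

    return 0;
}
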
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 906cae7715..059431ed67 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
196 | 196 | ||
197 | /* fill in basic acct fields */ | 197 | /* fill in basic acct fields */ |
198 | stats->version = TASKSTATS_VERSION; | 198 | stats->version = TASKSTATS_VERSION; |
199 | stats->nvcsw = tsk->nvcsw; | ||
200 | stats->nivcsw = tsk->nivcsw; | ||
199 | bacct_add_tsk(stats, tsk); | 201 | bacct_add_tsk(stats, tsk); |
200 | 202 | ||
201 | /* fill in extended acct fields */ | 203 | /* fill in extended acct fields */ |
@@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
242 | */ | 244 | */ |
243 | delayacct_add_tsk(stats, tsk); | 245 | delayacct_add_tsk(stats, tsk); |
244 | 246 | ||
247 | stats->nvcsw += tsk->nvcsw; | ||
248 | stats->nivcsw += tsk->nivcsw; | ||
245 | } while_each_thread(first, tsk); | 249 | } while_each_thread(first, tsk); |
246 | 250 | ||
247 | unlock_task_sighand(first, &flags); | 251 | unlock_task_sighand(first, &flags); |
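
The taskstats hunks export the per-task voluntary/involuntary context-switch counts (tsk->nvcsw / tsk->nivcsw) and accumulate them across a thread group. Querying taskstats proper needs a generic-netlink client, which is too long to sketch here; the same counters are also printed in /proc/<pid>/status on kernels that expose them, and the hedged sketch below reads them there as a stand-in.

#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/self/status", "r");
    char line[256];

    if (!f) {
        perror("fopen");
        return 1;
    }
    /* Print the nvcsw/nivcsw counters that taskstats now also reports,
     * if this kernel exposes them in the status file. */
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "voluntary_ctxt_switches", 23) == 0 ||
            strncmp(line, "nonvoluntary_ctxt_switches", 26) == 0)
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}
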
diff --git a/kernel/time.c b/kernel/time.c index f04791f694..5b81da08bb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -58,9 +58,9 @@ EXPORT_SYMBOL(sys_tz); | |||
58 | asmlinkage long sys_time(time_t __user * tloc) | 58 | asmlinkage long sys_time(time_t __user * tloc) |
59 | { | 59 | { |
60 | time_t i; | 60 | time_t i; |
61 | struct timeval tv; | 61 | struct timespec tv; |
62 | 62 | ||
63 | do_gettimeofday(&tv); | 63 | getnstimeofday(&tv); |
64 | i = tv.tv_sec; | 64 | i = tv.tv_sec; |
65 | 65 | ||
66 | if (tloc) { | 66 | if (tloc) { |
@@ -133,7 +133,6 @@ static inline void warp_clock(void) | |||
133 | write_seqlock_irq(&xtime_lock); | 133 | write_seqlock_irq(&xtime_lock); |
134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; | 134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; |
135 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; | 135 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; |
136 | time_interpolator_reset(); | ||
137 | write_sequnlock_irq(&xtime_lock); | 136 | write_sequnlock_irq(&xtime_lock); |
138 | clock_was_set(); | 137 | clock_was_set(); |
139 | } | 138 | } |
@@ -306,79 +305,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
306 | } | 305 | } |
307 | EXPORT_SYMBOL(timespec_trunc); | 306 | EXPORT_SYMBOL(timespec_trunc); |
308 | 307 | ||
309 | #ifdef CONFIG_TIME_INTERPOLATION | ||
310 | void getnstimeofday (struct timespec *tv) | ||
311 | { | ||
312 | unsigned long seq,sec,nsec; | ||
313 | |||
314 | do { | ||
315 | seq = read_seqbegin(&xtime_lock); | ||
316 | sec = xtime.tv_sec; | ||
317 | nsec = xtime.tv_nsec+time_interpolator_get_offset(); | ||
318 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
319 | |||
320 | while (unlikely(nsec >= NSEC_PER_SEC)) { | ||
321 | nsec -= NSEC_PER_SEC; | ||
322 | ++sec; | ||
323 | } | ||
324 | tv->tv_sec = sec; | ||
325 | tv->tv_nsec = nsec; | ||
326 | } | ||
327 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
328 | |||
329 | int do_settimeofday (struct timespec *tv) | ||
330 | { | ||
331 | time_t wtm_sec, sec = tv->tv_sec; | ||
332 | long wtm_nsec, nsec = tv->tv_nsec; | ||
333 | |||
334 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
335 | return -EINVAL; | ||
336 | |||
337 | write_seqlock_irq(&xtime_lock); | ||
338 | { | ||
339 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
340 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
341 | |||
342 | set_normalized_timespec(&xtime, sec, nsec); | ||
343 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
344 | |||
345 | time_adjust = 0; /* stop active adjtime() */ | ||
346 | time_status |= STA_UNSYNC; | ||
347 | time_maxerror = NTP_PHASE_LIMIT; | ||
348 | time_esterror = NTP_PHASE_LIMIT; | ||
349 | time_interpolator_reset(); | ||
350 | } | ||
351 | write_sequnlock_irq(&xtime_lock); | ||
352 | clock_was_set(); | ||
353 | return 0; | ||
354 | } | ||
355 | EXPORT_SYMBOL(do_settimeofday); | ||
356 | |||
357 | void do_gettimeofday (struct timeval *tv) | ||
358 | { | ||
359 | unsigned long seq, nsec, usec, sec, offset; | ||
360 | do { | ||
361 | seq = read_seqbegin(&xtime_lock); | ||
362 | offset = time_interpolator_get_offset(); | ||
363 | sec = xtime.tv_sec; | ||
364 | nsec = xtime.tv_nsec; | ||
365 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
366 | |||
367 | usec = (nsec + offset) / 1000; | ||
368 | |||
369 | while (unlikely(usec >= USEC_PER_SEC)) { | ||
370 | usec -= USEC_PER_SEC; | ||
371 | ++sec; | ||
372 | } | ||
373 | |||
374 | tv->tv_sec = sec; | ||
375 | tv->tv_usec = usec; | ||
376 | } | ||
377 | |||
378 | EXPORT_SYMBOL(do_gettimeofday); | ||
379 | |||
380 | |||
381 | #else | ||
382 | #ifndef CONFIG_GENERIC_TIME | 308 | #ifndef CONFIG_GENERIC_TIME |
383 | /* | 309 | /* |
384 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 310 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
@@ -394,7 +320,6 @@ void getnstimeofday(struct timespec *tv) | |||
394 | } | 320 | } |
395 | EXPORT_SYMBOL_GPL(getnstimeofday); | 321 | EXPORT_SYMBOL_GPL(getnstimeofday); |
396 | #endif | 322 | #endif |
397 | #endif | ||
398 | 323 | ||
399 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 324 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
400 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 325 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
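
In kernel/time.c, sys_time() now takes its result from getnstimeofday() on a timespec rather than do_gettimeofday() on a timeval, and the CONFIG_TIME_INTERPOLATION copies of getnstimeofday()/do_settimeofday()/do_gettimeofday() are removed. A throwaway user-space check of the visible effect is sketched below: time() and CLOCK_REALTIME should agree to the second, give or take a rollover race (link with -lrt on older glibc).

#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;
    time_t t;

    /* Both values now come from the same nanosecond timekeeping path
     * in the kernel, so the seconds should match almost always. */
    t = time(NULL);
    if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
        perror("clock_gettime");
        return 1;
    }
    printf("time()=%ld CLOCK_REALTIME=%ld.%09ld\n",
           (long)t, (long)ts.tv_sec, ts.tv_nsec);
    return 0;
}
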
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 76212b2a99..2ad1c37b8d 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
205 | } | 205 | } |
206 | 206 | ||
207 | /** | 207 | /** |
208 | * clockevents_request_device | ||
209 | */ | ||
210 | struct clock_event_device *clockevents_request_device(unsigned int features, | ||
211 | cpumask_t cpumask) | ||
212 | { | ||
213 | struct clock_event_device *cur, *dev = NULL; | ||
214 | struct list_head *tmp; | ||
215 | |||
216 | spin_lock(&clockevents_lock); | ||
217 | |||
218 | list_for_each(tmp, &clockevent_devices) { | ||
219 | cur = list_entry(tmp, struct clock_event_device, list); | ||
220 | |||
221 | if ((cur->features & features) == features && | ||
222 | cpus_equal(cpumask, cur->cpumask)) { | ||
223 | if (!dev || dev->rating < cur->rating) | ||
224 | dev = cur; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | clockevents_exchange_device(NULL, dev); | ||
229 | |||
230 | spin_unlock(&clockevents_lock); | ||
231 | |||
232 | return dev; | ||
233 | } | ||
234 | |||
235 | /** | ||
236 | * clockevents_release_device | ||
237 | */ | ||
238 | void clockevents_release_device(struct clock_event_device *dev) | ||
239 | { | ||
240 | spin_lock(&clockevents_lock); | ||
241 | |||
242 | clockevents_exchange_device(dev, NULL); | ||
243 | clockevents_notify_released(); | ||
244 | |||
245 | spin_unlock(&clockevents_lock); | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * clockevents_notify - notification about relevant events | 208 | * clockevents_notify - notification about relevant events |
250 | */ | 209 | */ |
251 | void clockevents_notify(unsigned long reason, void *arg) | 210 | void clockevents_notify(unsigned long reason, void *arg) |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87aa5ff931..cd91237dbf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -10,10 +10,11 @@ | |||
10 | 10 | ||
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/time.h> | 12 | #include <linux/time.h> |
13 | #include <linux/timer.h> | ||
13 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
14 | #include <linux/jiffies.h> | 15 | #include <linux/jiffies.h> |
15 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
16 | 17 | #include <linux/capability.h> | |
17 | #include <asm/div64.h> | 18 | #include <asm/div64.h> |
18 | #include <asm/timex.h> | 19 | #include <asm/timex.h> |
19 | 20 | ||
@@ -116,13 +117,7 @@ void second_overflow(void) | |||
116 | if (xtime.tv_sec % 86400 == 0) { | 117 | if (xtime.tv_sec % 86400 == 0) { |
117 | xtime.tv_sec--; | 118 | xtime.tv_sec--; |
118 | wall_to_monotonic.tv_sec++; | 119 | wall_to_monotonic.tv_sec++; |
119 | /* | ||
120 | * The timer interpolator will make time change | ||
121 | * gradually instead of an immediate jump by one second | ||
122 | */ | ||
123 | time_interpolator_update(-NSEC_PER_SEC); | ||
124 | time_state = TIME_OOP; | 120 | time_state = TIME_OOP; |
125 | clock_was_set(); | ||
126 | printk(KERN_NOTICE "Clock: inserting leap second " | 121 | printk(KERN_NOTICE "Clock: inserting leap second " |
127 | "23:59:60 UTC\n"); | 122 | "23:59:60 UTC\n"); |
128 | } | 123 | } |
@@ -131,13 +126,7 @@ void second_overflow(void) | |||
131 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 126 | if ((xtime.tv_sec + 1) % 86400 == 0) { |
132 | xtime.tv_sec++; | 127 | xtime.tv_sec++; |
133 | wall_to_monotonic.tv_sec--; | 128 | wall_to_monotonic.tv_sec--; |
134 | /* | ||
135 | * Use of time interpolator for a gradual change of | ||
136 | * time | ||
137 | */ | ||
138 | time_interpolator_update(NSEC_PER_SEC); | ||
139 | time_state = TIME_WAIT; | 129 | time_state = TIME_WAIT; |
140 | clock_was_set(); | ||
141 | printk(KERN_NOTICE "Clock: deleting leap second " | 130 | printk(KERN_NOTICE "Clock: deleting leap second " |
142 | "23:59:59 UTC\n"); | 131 | "23:59:59 UTC\n"); |
143 | } | 132 | } |
@@ -187,12 +176,64 @@ u64 current_tick_length(void) | |||
187 | return tick_length; | 176 | return tick_length; |
188 | } | 177 | } |
189 | 178 | ||
179 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
180 | |||
181 | /* Disable the cmos update - used by virtualization and embedded */ | ||
182 | int no_sync_cmos_clock __read_mostly; | ||
183 | |||
184 | static void sync_cmos_clock(unsigned long dummy); | ||
185 | |||
186 | static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); | ||
190 | 187 | ||
191 | void __attribute__ ((weak)) notify_arch_cmos_timer(void) | 188 | static void sync_cmos_clock(unsigned long dummy) |
192 | { | 189 | { |
193 | return; | 190 | struct timespec now, next; |
191 | int fail = 1; | ||
192 | |||
193 | /* | ||
194 | * If we have an externally synchronized Linux clock, then update | ||
195 | * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be | ||
196 | * called as close as possible to 500 ms before the new second starts. | ||
197 | * This code is run on a timer. If the clock is set, that timer | ||
198 | * may not expire at the correct time. Thus, we adjust... | ||
199 | */ | ||
200 | if (!ntp_synced()) | ||
201 | /* | ||
202 | * Not synced, exit, do not restart a timer (if one is | ||
203 | * running, let it run out). | ||
204 | */ | ||
205 | return; | ||
206 | |||
207 | getnstimeofday(&now); | ||
208 | if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) | ||
209 | fail = update_persistent_clock(now); | ||
210 | |||
211 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; | ||
212 | if (next.tv_nsec <= 0) | ||
213 | next.tv_nsec += NSEC_PER_SEC; | ||
214 | |||
215 | if (!fail) | ||
216 | next.tv_sec = 659; | ||
217 | else | ||
218 | next.tv_sec = 0; | ||
219 | |||
220 | if (next.tv_nsec >= NSEC_PER_SEC) { | ||
221 | next.tv_sec++; | ||
222 | next.tv_nsec -= NSEC_PER_SEC; | ||
223 | } | ||
224 | mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); | ||
194 | } | 225 | } |
195 | 226 | ||
227 | static void notify_cmos_timer(void) | ||
228 | { | ||
229 | if (no_sync_cmos_clock) | ||
230 | mod_timer(&sync_cmos_timer, jiffies + 1); | ||
231 | } | ||
232 | |||
233 | #else | ||
234 | static inline void notify_cmos_timer(void) { } | ||
235 | #endif | ||
236 | |||
196 | /* adjtimex mainly allows reading (and writing, if superuser) of | 237 | /* adjtimex mainly allows reading (and writing, if superuser) of |
197 | * kernel time-keeping variables. used by xntpd. | 238 | * kernel time-keeping variables. used by xntpd. |
198 | */ | 239 | */ |
@@ -357,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
357 | txc->stbcnt = 0; | 398 | txc->stbcnt = 0; |
358 | write_sequnlock_irq(&xtime_lock); | 399 | write_sequnlock_irq(&xtime_lock); |
359 | do_gettimeofday(&txc->time); | 400 | do_gettimeofday(&txc->time); |
360 | notify_arch_cmos_timer(); | 401 | notify_cmos_timer(); |
361 | return(result); | 402 | return(result); |
362 | } | 403 | } |
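
The new sync_cmos_clock() path arms a kernel timer that aims for roughly 500 ms before a second boundary and, once update_persistent_clock() has succeeded, backs off to about 11 minutes (659 s); otherwise it retries on the next second. The rescheduling arithmetic is easy to get wrong, so a stand-alone sketch of just that calculation, using the same constants, is below.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

/* Mirror of the rescheduling math in sync_cmos_clock(): target the
 * point ~500 ms before the next second boundary, every 659 s after a
 * successful CMOS write, otherwise retry on the next second. */
static void next_cmos_sync(long now_nsec, int write_ok,
                           long *next_sec, long *next_nsec)
{
    long nsec = (NSEC_PER_SEC / 2) - now_nsec;
    long sec;

    if (nsec <= 0)
        nsec += NSEC_PER_SEC;

    sec = write_ok ? 659 : 0;

    if (nsec >= NSEC_PER_SEC) {
        sec++;
        nsec -= NSEC_PER_SEC;
    }
    *next_sec = sec;
    *next_nsec = nsec;
}

int main(void)
{
    long sec, nsec;

    next_cmos_sync(100000000L, 1, &sec, &nsec);   /* 0.1 s into a second */
    printf("delay %ld.%09ld s\n", sec, nsec);

    next_cmos_sync(900000000L, 0, &sec, &nsec);   /* 0.9 s, write failed */
    printf("delay %ld.%09ld s\n", sec, nsec);
    return 0;
}
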
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8001d37071..db8e0f3d40 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device; | |||
31 | static cpumask_t tick_broadcast_mask; | 31 | static cpumask_t tick_broadcast_mask; |
32 | static DEFINE_SPINLOCK(tick_broadcast_lock); | 32 | static DEFINE_SPINLOCK(tick_broadcast_lock); |
33 | 33 | ||
34 | #ifdef CONFIG_TICK_ONESHOT | ||
35 | static void tick_broadcast_clear_oneshot(int cpu); | ||
36 | #else | ||
37 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | ||
38 | #endif | ||
39 | |||
34 | /* | 40 | /* |
35 | * Debugging: see timer_list.c | 41 | * Debugging: see timer_list.c |
36 | */ | 42 | */ |
@@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void) | |||
49 | */ | 55 | */ |
50 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) | 56 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) |
51 | { | 57 | { |
52 | if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) | 58 | if (bc) |
53 | tick_setup_periodic(bc, 1); | 59 | tick_setup_periodic(bc, 1); |
54 | } | 60 | } |
55 | 61 | ||
@@ -99,8 +105,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
99 | cpu_set(cpu, tick_broadcast_mask); | 105 | cpu_set(cpu, tick_broadcast_mask); |
100 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | 106 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); |
101 | ret = 1; | 107 | ret = 1; |
102 | } | 108 | } else { |
109 | /* | ||
110 | * When the new device is not affected by the stop | ||
111 | * feature and the cpu is marked in the broadcast mask | ||
112 | * then clear the broadcast bit. | ||
113 | */ | ||
114 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { | ||
115 | int cpu = smp_processor_id(); | ||
103 | 116 | ||
117 | cpu_clear(cpu, tick_broadcast_mask); | ||
118 | tick_broadcast_clear_oneshot(cpu); | ||
119 | } | ||
120 | } | ||
104 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 121 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
105 | return ret; | 122 | return ret; |
106 | } | 123 | } |
@@ -299,7 +316,7 @@ void tick_suspend_broadcast(void) | |||
299 | spin_lock_irqsave(&tick_broadcast_lock, flags); | 316 | spin_lock_irqsave(&tick_broadcast_lock, flags); |
300 | 317 | ||
301 | bc = tick_broadcast_device.evtdev; | 318 | bc = tick_broadcast_device.evtdev; |
302 | if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 319 | if (bc) |
303 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | 320 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); |
304 | 321 | ||
305 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 322 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
@@ -316,6 +333,8 @@ int tick_resume_broadcast(void) | |||
316 | bc = tick_broadcast_device.evtdev; | 333 | bc = tick_broadcast_device.evtdev; |
317 | 334 | ||
318 | if (bc) { | 335 | if (bc) { |
336 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | ||
337 | |||
319 | switch (tick_broadcast_device.mode) { | 338 | switch (tick_broadcast_device.mode) { |
320 | case TICKDEV_MODE_PERIODIC: | 339 | case TICKDEV_MODE_PERIODIC: |
321 | if(!cpus_empty(tick_broadcast_mask)) | 340 | if(!cpus_empty(tick_broadcast_mask)) |
@@ -485,6 +504,16 @@ out: | |||
485 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 504 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
486 | } | 505 | } |
487 | 506 | ||
507 | /* | ||
508 | * Reset the one shot broadcast for a cpu | ||
509 | * | ||
510 | * Called with tick_broadcast_lock held | ||
511 | */ | ||
512 | static void tick_broadcast_clear_oneshot(int cpu) | ||
513 | { | ||
514 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
515 | } | ||
516 | |||
488 | /** | 517 | /** |
489 | * tick_broadcast_setup_highres - setup the broadcast device for highres | 518 | * tick_broadcast_setup_highres - setup the broadcast device for highres |
490 | */ | 519 | */ |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a96ec9ab34..77a21abc87 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -318,12 +318,17 @@ static void tick_resume(void) | |||
318 | { | 318 | { |
319 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 319 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); |
320 | unsigned long flags; | 320 | unsigned long flags; |
321 | int broadcast = tick_resume_broadcast(); | ||
321 | 322 | ||
322 | spin_lock_irqsave(&tick_device_lock, flags); | 323 | spin_lock_irqsave(&tick_device_lock, flags); |
323 | if (td->mode == TICKDEV_MODE_PERIODIC) | 324 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); |
324 | tick_setup_periodic(td->evtdev, 0); | 325 | |
325 | else | 326 | if (!broadcast) { |
326 | tick_resume_oneshot(); | 327 | if (td->mode == TICKDEV_MODE_PERIODIC) |
328 | tick_setup_periodic(td->evtdev, 0); | ||
329 | else | ||
330 | tick_resume_oneshot(); | ||
331 | } | ||
327 | spin_unlock_irqrestore(&tick_device_lock, flags); | 332 | spin_unlock_irqrestore(&tick_device_lock, flags); |
328 | } | 333 | } |
329 | 334 | ||
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, | |||
360 | break; | 365 | break; |
361 | 366 | ||
362 | case CLOCK_EVT_NOTIFY_RESUME: | 367 | case CLOCK_EVT_NOTIFY_RESUME: |
363 | if (!tick_resume_broadcast()) | 368 | tick_resume(); |
364 | tick_resume(); | ||
365 | break; | 369 | break; |
366 | 370 | ||
367 | default: | 371 | default: |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f6997ab0c3..0258d3115d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
73 | struct clock_event_device *dev = td->evtdev; | 73 | struct clock_event_device *dev = td->evtdev; |
74 | 74 | ||
75 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | 75 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || |
76 | !tick_device_is_functional(dev)) | 76 | !tick_device_is_functional(dev)) { |
77 | |||
78 | printk(KERN_INFO "Clockevents: " | ||
79 | "could not switch to one-shot mode:"); | ||
80 | if (!dev) { | ||
81 | printk(" no tick device\n"); | ||
82 | } else { | ||
83 | if (!tick_device_is_functional(dev)) | ||
84 | printk(" %s is not functional.\n", dev->name); | ||
85 | else | ||
86 | printk(" %s does not support one-shot mode.\n", | ||
87 | dev->name); | ||
88 | } | ||
77 | return -EINVAL; | 89 | return -EINVAL; |
90 | } | ||
78 | 91 | ||
79 | td->mode = TICKDEV_MODE_ONESHOT; | 92 | td->mode = TICKDEV_MODE_ONESHOT; |
80 | dev->event_handler = handler; | 93 | dev->event_handler = handler; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52db9e3c52..b416995b97 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -546,6 +546,7 @@ void tick_setup_sched_timer(void) | |||
546 | { | 546 | { |
547 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 547 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
548 | ktime_t now = ktime_get(); | 548 | ktime_t now = ktime_get(); |
549 | u64 offset; | ||
549 | 550 | ||
550 | /* | 551 | /* |
551 | * Emulate tick processing via per-CPU hrtimers: | 552 | * Emulate tick processing via per-CPU hrtimers: |
@@ -554,8 +555,12 @@ void tick_setup_sched_timer(void) | |||
554 | ts->sched_timer.function = tick_sched_timer; | 555 | ts->sched_timer.function = tick_sched_timer; |
555 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 556 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
556 | 557 | ||
557 | /* Get the next period */ | 558 | /* Get the next period (per cpu) */ |
558 | ts->sched_timer.expires = tick_init_jiffy_update(); | 559 | ts->sched_timer.expires = tick_init_jiffy_update(); |
560 | offset = ktime_to_ns(tick_period) >> 1; | ||
561 | do_div(offset, NR_CPUS); | ||
562 | offset *= smp_processor_id(); | ||
563 | ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); | ||
559 | 564 | ||
560 | for (;;) { | 565 | for (;;) { |
561 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 566 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
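
The tick-sched change staggers each CPU's sched-tick hrtimer by a per-CPU fraction of half a tick period, so the per-CPU timers no longer all expire at the same instant. The offset computation (half a tick, divided by NR_CPUS, scaled by the CPU id) is reproduced stand-alone below with illustrative HZ and CPU-count values.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Mirror of the stagger math in tick_setup_sched_timer(): each CPU's
 * first expiry is pushed out by cpu * (tick_period / 2 / NR_CPUS). */
static unsigned long long tick_offset(unsigned long long tick_period_ns,
                                      unsigned int nr_cpus, unsigned int cpu)
{
    unsigned long long offset = tick_period_ns >> 1;

    offset /= nr_cpus;          /* do_div(offset, NR_CPUS) in the kernel */
    return offset * cpu;
}

int main(void)
{
    const unsigned int hz = 250, nr_cpus = 4;   /* illustrative values */
    unsigned long long period = NSEC_PER_SEC / hz;
    unsigned int cpu;

    for (cpu = 0; cpu < nr_cpus; cpu++)
        printf("cpu%u: +%llu ns\n", cpu, tick_offset(period, nr_cpus, cpu));
    return 0;
}
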
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d1042f82a..88c81026e0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock); | |||
36 | * at zero at system boot time, so wall_to_monotonic will be negative, | 36 | * at zero at system boot time, so wall_to_monotonic will be negative, |
37 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | 37 | * however, we will ALWAYS keep the tv_nsec part positive so we can use |
38 | * the usual normalization. | 38 | * the usual normalization. |
39 | * | ||
40 | * wall_to_monotonic is moved after resume from suspend for the monotonic | ||
41 | * time not to jump. We need to add total_sleep_time to wall_to_monotonic | ||
42 | * to get the real boot based time offset. | ||
43 | * | ||
44 | * - wall_to_monotonic is no longer the boot time, getboottime must be | ||
45 | * used instead. | ||
39 | */ | 46 | */ |
40 | struct timespec xtime __attribute__ ((aligned (16))); | 47 | struct timespec xtime __attribute__ ((aligned (16))); |
41 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 48 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
49 | static unsigned long total_sleep_time; /* seconds */ | ||
42 | 50 | ||
43 | EXPORT_SYMBOL(xtime); | 51 | EXPORT_SYMBOL(xtime); |
44 | 52 | ||
@@ -251,6 +259,7 @@ void __init timekeeping_init(void) | |||
251 | xtime.tv_nsec = 0; | 259 | xtime.tv_nsec = 0; |
252 | set_normalized_timespec(&wall_to_monotonic, | 260 | set_normalized_timespec(&wall_to_monotonic, |
253 | -xtime.tv_sec, -xtime.tv_nsec); | 261 | -xtime.tv_sec, -xtime.tv_nsec); |
262 | total_sleep_time = 0; | ||
254 | 263 | ||
255 | write_sequnlock_irqrestore(&xtime_lock, flags); | 264 | write_sequnlock_irqrestore(&xtime_lock, flags); |
256 | } | 265 | } |
@@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
282 | 291 | ||
283 | xtime.tv_sec += sleep_length; | 292 | xtime.tv_sec += sleep_length; |
284 | wall_to_monotonic.tv_sec -= sleep_length; | 293 | wall_to_monotonic.tv_sec -= sleep_length; |
294 | total_sleep_time += sleep_length; | ||
285 | } | 295 | } |
286 | /* re-base the last cycle value */ | 296 | /* re-base the last cycle value */ |
287 | clock->cycle_last = clocksource_read(clock); | 297 | clock->cycle_last = clocksource_read(clock); |
@@ -391,7 +401,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | |||
391 | * this is optimized for the most common adjustments of -1,0,1, | 401 | * this is optimized for the most common adjustments of -1,0,1, |
392 | * for other values we can do a bit more work. | 402 | * for other values we can do a bit more work. |
393 | */ | 403 | */ |
394 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | 404 | static void clocksource_adjust(s64 offset) |
395 | { | 405 | { |
396 | s64 error, interval = clock->cycle_interval; | 406 | s64 error, interval = clock->cycle_interval; |
397 | int adj; | 407 | int adj; |
@@ -456,17 +466,13 @@ void update_wall_time(void) | |||
456 | second_overflow(); | 466 | second_overflow(); |
457 | } | 467 | } |
458 | 468 | ||
459 | /* interpolator bits */ | ||
460 | time_interpolator_update(clock->xtime_interval | ||
461 | >> clock->shift); | ||
462 | |||
463 | /* accumulate error between NTP and clock interval */ | 469 | /* accumulate error between NTP and clock interval */ |
464 | clock->error += current_tick_length(); | 470 | clock->error += current_tick_length(); |
465 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | 471 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); |
466 | } | 472 | } |
467 | 473 | ||
468 | /* correct the clock when NTP error is too big */ | 474 | /* correct the clock when NTP error is too big */ |
469 | clocksource_adjust(clock, offset); | 475 | clocksource_adjust(offset); |
470 | 476 | ||
471 | /* store full nanoseconds into xtime */ | 477 | /* store full nanoseconds into xtime */ |
472 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | 478 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; |
@@ -476,3 +482,30 @@ void update_wall_time(void) | |||
476 | change_clocksource(); | 482 | change_clocksource(); |
477 | update_vsyscall(&xtime, clock); | 483 | update_vsyscall(&xtime, clock); |
478 | } | 484 | } |
485 | |||
486 | /** | ||
487 | * getboottime - Return the real time of system boot. | ||
488 | * @ts: pointer to the timespec to be set | ||
489 | * | ||
490 | * Returns the time of day in a timespec. | ||
491 | * | ||
492 | * This is based on the wall_to_monotonic offset and the total suspend | ||
493 | * time. Calls to settimeofday will affect the value returned (which | ||
494 | * basically means that however wrong your real time clock is at boot time, | ||
495 | * you get the right time here). | ||
496 | */ | ||
497 | void getboottime(struct timespec *ts) | ||
498 | { | ||
499 | set_normalized_timespec(ts, | ||
500 | - (wall_to_monotonic.tv_sec + total_sleep_time), | ||
501 | - wall_to_monotonic.tv_nsec); | ||
502 | } | ||
503 | |||
504 | /** | ||
505 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | ||
506 | * @ts: pointer to the timespec to be converted | ||
507 | */ | ||
508 | void monotonic_to_bootbased(struct timespec *ts) | ||
509 | { | ||
510 | ts->tv_sec += total_sleep_time; | ||
511 | } | ||
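
With the timekeeping change, wall_to_monotonic no longer encodes the boot time on its own: suspend time accumulates in total_sleep_time, and getboottime()/monotonic_to_bootbased() fold it back in. A quick numeric sketch of that identity, reusing the kernel's normalization rule (tv_nsec kept in [0, NSEC_PER_SEC)), is below with made-up values.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long sec; long nsec; };

/* Keep nsec in [0, NSEC_PER_SEC), like set_normalized_timespec(). */
static struct ts normalize(long sec, long nsec)
{
    struct ts t;

    while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
    while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
    t.sec = sec;
    t.nsec = nsec;
    return t;
}

int main(void)
{
    /* Made-up state: the wall clock read 1000000000.5 s at boot. */
    struct ts wall_to_monotonic = normalize(-1000000000L, -500000000L);
    long total_sleep_time = 0;

    /* Simulate a 120 s suspend, as timekeeping_resume() does. */
    wall_to_monotonic.sec -= 120;
    total_sleep_time += 120;

    /* getboottime(): -(wall_to_monotonic.sec + total_sleep_time), -nsec */
    struct ts boot = normalize(-(wall_to_monotonic.sec + total_sleep_time),
                               -wall_to_monotonic.nsec);

    printf("boot time: %ld.%09ld (expected 1000000000.500000000)\n",
           boot.sec, boot.nsec);
    return 0;
}
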
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 8bbcfb77f7..e5edc3a22a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | |||
38 | 38 | ||
39 | static void print_name_offset(struct seq_file *m, void *sym) | 39 | static void print_name_offset(struct seq_file *m, void *sym) |
40 | { | 40 | { |
41 | char symname[KSYM_NAME_LEN+1]; | 41 | char symname[KSYM_NAME_LEN]; |
42 | 42 | ||
43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
44 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%p>", sym); |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 321693724a..8ed62fda16 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -68,6 +68,7 @@ struct entry { | |||
68 | * Number of timeout events: | 68 | * Number of timeout events: |
69 | */ | 69 | */ |
70 | unsigned long count; | 70 | unsigned long count; |
71 | unsigned int timer_flag; | ||
71 | 72 | ||
72 | /* | 73 | /* |
73 | * We save the command-line string to preserve | 74 | * We save the command-line string to preserve |
@@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
231 | * incremented. Otherwise the timer is registered in a free slot. | 232 | * incremented. Otherwise the timer is registered in a free slot. |
232 | */ | 233 | */ |
233 | void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | 234 | void timer_stats_update_stats(void *timer, pid_t pid, void *startf, |
234 | void *timerf, char * comm) | 235 | void *timerf, char *comm, |
236 | unsigned int timer_flag) | ||
235 | { | 237 | { |
236 | /* | 238 | /* |
237 | * It doesnt matter which lock we take: | 239 | * It doesnt matter which lock we take: |
@@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
249 | input.start_func = startf; | 251 | input.start_func = startf; |
250 | input.expire_func = timerf; | 252 | input.expire_func = timerf; |
251 | input.pid = pid; | 253 | input.pid = pid; |
254 | input.timer_flag = timer_flag; | ||
252 | 255 | ||
253 | spin_lock_irqsave(lock, flags); | 256 | spin_lock_irqsave(lock, flags); |
254 | if (!active) | 257 | if (!active) |
@@ -266,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
266 | 269 | ||
267 | static void print_name_offset(struct seq_file *m, unsigned long addr) | 270 | static void print_name_offset(struct seq_file *m, unsigned long addr) |
268 | { | 271 | { |
269 | char symname[KSYM_NAME_LEN+1]; | 272 | char symname[KSYM_NAME_LEN]; |
270 | 273 | ||
271 | if (lookup_symbol_name(addr, symname) < 0) | 274 | if (lookup_symbol_name(addr, symname) < 0) |
272 | seq_printf(m, "<%p>", (void *)addr); | 275 | seq_printf(m, "<%p>", (void *)addr); |
@@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v) | |||
295 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec(time); |
296 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
297 | 300 | ||
298 | seq_puts(m, "Timer Stats Version: v0.1\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.2\n"); |
299 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); |
300 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
301 | seq_printf(m, "Overflow: %d entries\n", | 304 | seq_printf(m, "Overflow: %d entries\n", |
@@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v) | |||
303 | 306 | ||
304 | for (i = 0; i < nr_entries; i++) { | 307 | for (i = 0; i < nr_entries; i++) { |
305 | entry = entries + i; | 308 | entry = entries + i; |
306 | seq_printf(m, "%4lu, %5d %-16s ", | 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { |
310 | seq_printf(m, "%4luD, %5d %-16s ", | ||
307 | entry->count, entry->pid, entry->comm); | 311 | entry->count, entry->pid, entry->comm); |
312 | } else { | ||
313 | seq_printf(m, " %4lu, %5d %-16s ", | ||
314 | entry->count, entry->pid, entry->comm); | ||
315 | } | ||
308 | 316 | ||
309 | print_name_offset(m, (unsigned long)entry->start_func); | 317 | print_name_offset(m, (unsigned long)entry->start_func); |
310 | seq_puts(m, " ("); | 318 | seq_puts(m, " ("); |
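
timer_stats v0.2 tags deferrable timers by appending a 'D' to the event count ("%4luD, ..."), while ordinary timers keep the old layout with two extra leading spaces. A hedged user-space sketch that tallies the two kinds from /proc/timer_stats is below; the file exists only with CONFIG_TIMER_STATS, and collection has to be switched on first by writing 1 to it.

#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/timer_stats", "r");
    char line[256];
    unsigned long timers = 0, deferrable = 0;

    if (!f) {
        perror("/proc/timer_stats");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        char *comma = strchr(line, ',');

        /* Entry lines look like "   42, <pid> <comm> ..." or, for
         * deferrable timers in v0.2, " 42D, <pid> <comm> ...". Skip
         * header lines and the trailing "total events" summary. */
        if (!comma || comma == line || strstr(line, "total events"))
            continue;
        timers++;
        if (comma[-1] == 'D')
            deferrable++;
    }
    fclose(f);
    printf("%lu timers, %lu deferrable\n", timers, deferrable);
    return 0;
}
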
diff --git a/kernel/timer.c b/kernel/timer.c index 1a69705c2f..6ce1952eea 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | |||
103 | static inline void timer_set_deferrable(struct timer_list *timer) | 103 | static inline void timer_set_deferrable(struct timer_list *timer) |
104 | { | 104 | { |
105 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | 105 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | |
106 | TBASE_DEFERRABLE_FLAG)); | 106 | TBASE_DEFERRABLE_FLAG)); |
107 | } | 107 | } |
108 | 108 | ||
109 | static inline void | 109 | static inline void |
110 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | 110 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) |
111 | { | 111 | { |
112 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | 112 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | |
113 | tbase_get_deferrable(timer->base)); | 113 | tbase_get_deferrable(timer->base)); |
114 | } | 114 | } |
115 | 115 | ||
116 | /** | 116 | /** |
@@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | |||
305 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); | 305 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); |
306 | timer->start_pid = current->pid; | 306 | timer->start_pid = current->pid; |
307 | } | 307 | } |
308 | |||
309 | static void timer_stats_account_timer(struct timer_list *timer) | ||
310 | { | ||
311 | unsigned int flag = 0; | ||
312 | |||
313 | if (unlikely(tbase_get_deferrable(timer->base))) | ||
314 | flag |= TIMER_STATS_FLAG_DEFERRABLE; | ||
315 | |||
316 | timer_stats_update_stats(timer, timer->start_pid, timer->start_site, | ||
317 | timer->function, timer->start_comm, flag); | ||
318 | } | ||
319 | |||
320 | #else | ||
321 | static void timer_stats_account_timer(struct timer_list *timer) {} | ||
308 | #endif | 322 | #endif |
309 | 323 | ||
310 | /** | 324 | /** |
@@ -431,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer); | |||
431 | void add_timer_on(struct timer_list *timer, int cpu) | 445 | void add_timer_on(struct timer_list *timer, int cpu) |
432 | { | 446 | { |
433 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 447 | tvec_base_t *base = per_cpu(tvec_bases, cpu); |
434 | unsigned long flags; | 448 | unsigned long flags; |
435 | 449 | ||
436 | timer_stats_timer_set_start_info(timer); | 450 | timer_stats_timer_set_start_info(timer); |
437 | BUG_ON(timer_pending(timer) || !timer->function); | 451 | BUG_ON(timer_pending(timer) || !timer->function); |
438 | spin_lock_irqsave(&base->lock, flags); | 452 | spin_lock_irqsave(&base->lock, flags); |
439 | timer_set_base(timer, base); | 453 | timer_set_base(timer, base); |
440 | internal_add_timer(base, timer); | 454 | internal_add_timer(base, timer); |
@@ -613,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
613 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 627 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
614 | struct list_head work_list; | 628 | struct list_head work_list; |
615 | struct list_head *head = &work_list; | 629 | struct list_head *head = &work_list; |
616 | int index = base->timer_jiffies & TVR_MASK; | 630 | int index = base->timer_jiffies & TVR_MASK; |
617 | 631 | ||
618 | /* | 632 | /* |
619 | * Cascade timers: | 633 | * Cascade timers: |
@@ -630,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
630 | unsigned long data; | 644 | unsigned long data; |
631 | 645 | ||
632 | timer = list_first_entry(head, struct timer_list,entry); | 646 | timer = list_first_entry(head, struct timer_list,entry); |
633 | fn = timer->function; | 647 | fn = timer->function; |
634 | data = timer->data; | 648 | data = timer->data; |
635 | 649 | ||
636 | timer_stats_account_timer(timer); | 650 | timer_stats_account_timer(timer); |
637 | 651 | ||
@@ -675,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) | |||
675 | index = slot = timer_jiffies & TVR_MASK; | 689 | index = slot = timer_jiffies & TVR_MASK; |
676 | do { | 690 | do { |
677 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { | 691 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { |
678 | if (tbase_get_deferrable(nte->base)) | 692 | if (tbase_get_deferrable(nte->base)) |
679 | continue; | 693 | continue; |
680 | 694 | ||
681 | found = 1; | 695 | found = 1; |
682 | expires = nte->expires; | 696 | expires = nte->expires; |
@@ -820,7 +834,7 @@ void update_process_times(int user_tick) | |||
820 | if (rcu_pending(cpu)) | 834 | if (rcu_pending(cpu)) |
821 | rcu_check_callbacks(cpu, user_tick); | 835 | rcu_check_callbacks(cpu, user_tick); |
822 | scheduler_tick(); | 836 | scheduler_tick(); |
823 | run_posix_cpu_timers(p); | 837 | run_posix_cpu_timers(p); |
824 | } | 838 | } |
825 | 839 | ||
826 | /* | 840 | /* |
@@ -895,7 +909,7 @@ static inline void update_times(unsigned long ticks) | |||
895 | update_wall_time(); | 909 | update_wall_time(); |
896 | calc_load(ticks); | 910 | calc_load(ticks); |
897 | } | 911 | } |
898 | 912 | ||
899 | /* | 913 | /* |
900 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 914 | * The 64-bit jiffies value is not atomic - you MUST NOT read it |
901 | * without sampling the sequence number in xtime_lock. | 915 | * without sampling the sequence number in xtime_lock. |
@@ -1091,7 +1105,7 @@ asmlinkage long sys_gettid(void) | |||
1091 | /** | 1105 | /** |
1092 | * do_sysinfo - fill in sysinfo struct | 1106 | * do_sysinfo - fill in sysinfo struct |
1093 | * @info: pointer to buffer to fill | 1107 | * @info: pointer to buffer to fill |
1094 | */ | 1108 | */ |
1095 | int do_sysinfo(struct sysinfo *info) | 1109 | int do_sysinfo(struct sysinfo *info) |
1096 | { | 1110 | { |
1097 | unsigned long mem_total, sav_total; | 1111 | unsigned long mem_total, sav_total; |
@@ -1114,6 +1128,7 @@ int do_sysinfo(struct sysinfo *info) | |||
1114 | getnstimeofday(&tp); | 1128 | getnstimeofday(&tp); |
1115 | tp.tv_sec += wall_to_monotonic.tv_sec; | 1129 | tp.tv_sec += wall_to_monotonic.tv_sec; |
1116 | tp.tv_nsec += wall_to_monotonic.tv_nsec; | 1130 | tp.tv_nsec += wall_to_monotonic.tv_nsec; |
1131 | monotonic_to_bootbased(&tp); | ||
1117 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { | 1132 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { |
1118 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | 1133 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; |
1119 | tp.tv_sec++; | 1134 | tp.tv_sec++; |
@@ -1206,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu) | |||
1206 | /* | 1221 | /* |
1207 | * The APs use this path later in boot | 1222 | * The APs use this path later in boot |
1208 | */ | 1223 | */ |
1209 | base = kmalloc_node(sizeof(*base), GFP_KERNEL, | 1224 | base = kmalloc_node(sizeof(*base), |
1225 | GFP_KERNEL | __GFP_ZERO, | ||
1210 | cpu_to_node(cpu)); | 1226 | cpu_to_node(cpu)); |
1211 | if (!base) | 1227 | if (!base) |
1212 | return -ENOMEM; | 1228 | return -ENOMEM; |
@@ -1217,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu) | |||
1217 | kfree(base); | 1233 | kfree(base); |
1218 | return -ENOMEM; | 1234 | return -ENOMEM; |
1219 | } | 1235 | } |
1220 | memset(base, 0, sizeof(*base)); | ||
1221 | per_cpu(tvec_bases, cpu) = base; | 1236 | per_cpu(tvec_bases, cpu) = base; |
1222 | } else { | 1237 | } else { |
1223 | /* | 1238 | /* |
@@ -1334,194 +1349,6 @@ void __init init_timers(void) | |||
1334 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1349 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
1335 | } | 1350 | } |
1336 | 1351 | ||
1337 | #ifdef CONFIG_TIME_INTERPOLATION | ||
1338 | |||
1339 | struct time_interpolator *time_interpolator __read_mostly; | ||
1340 | static struct time_interpolator *time_interpolator_list __read_mostly; | ||
1341 | static DEFINE_SPINLOCK(time_interpolator_lock); | ||
1342 | |||
1343 | static inline cycles_t time_interpolator_get_cycles(unsigned int src) | ||
1344 | { | ||
1345 | unsigned long (*x)(void); | ||
1346 | |||
1347 | switch (src) | ||
1348 | { | ||
1349 | case TIME_SOURCE_FUNCTION: | ||
1350 | x = time_interpolator->addr; | ||
1351 | return x(); | ||
1352 | |||
1353 | case TIME_SOURCE_MMIO64 : | ||
1354 | return readq_relaxed((void __iomem *)time_interpolator->addr); | ||
1355 | |||
1356 | case TIME_SOURCE_MMIO32 : | ||
1357 | return readl_relaxed((void __iomem *)time_interpolator->addr); | ||
1358 | |||
1359 | default: return get_cycles(); | ||
1360 | } | ||
1361 | } | ||
1362 | |||
1363 | static inline u64 time_interpolator_get_counter(int writelock) | ||
1364 | { | ||
1365 | unsigned int src = time_interpolator->source; | ||
1366 | |||
1367 | if (time_interpolator->jitter) | ||
1368 | { | ||
1369 | cycles_t lcycle; | ||
1370 | cycles_t now; | ||
1371 | |||
1372 | do { | ||
1373 | lcycle = time_interpolator->last_cycle; | ||
1374 | now = time_interpolator_get_cycles(src); | ||
1375 | if (lcycle && time_after(lcycle, now)) | ||
1376 | return lcycle; | ||
1377 | |||
1378 | /* When holding the xtime write lock, there's no need | ||
1379 | * to add the overhead of the cmpxchg. Readers are | ||
1380 | 	 * forced to retry until the write lock is released. | ||
1381 | */ | ||
1382 | if (writelock) { | ||
1383 | time_interpolator->last_cycle = now; | ||
1384 | return now; | ||
1385 | } | ||
1386 | /* Keep track of the last timer value returned. The use of cmpxchg here | ||
1387 | * will cause contention in an SMP environment. | ||
1388 | */ | ||
1389 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); | ||
1390 | return now; | ||
1391 | } | ||
1392 | else | ||
1393 | return time_interpolator_get_cycles(src); | ||
1394 | } | ||
1395 | |||
1396 | void time_interpolator_reset(void) | ||
1397 | { | ||
1398 | time_interpolator->offset = 0; | ||
1399 | time_interpolator->last_counter = time_interpolator_get_counter(1); | ||
1400 | } | ||
1401 | |||
1402 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | ||
1403 | |||
1404 | unsigned long time_interpolator_get_offset(void) | ||
1405 | { | ||
1406 | /* If we do not have a time interpolator set up then just return zero */ | ||
1407 | if (!time_interpolator) | ||
1408 | return 0; | ||
1409 | |||
1410 | return time_interpolator->offset + | ||
1411 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); | ||
1412 | } | ||
1413 | |||
1414 | #define INTERPOLATOR_ADJUST 65536 | ||
1415 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST | ||
1416 | |||
1417 | void time_interpolator_update(long delta_nsec) | ||
1418 | { | ||
1419 | u64 counter; | ||
1420 | unsigned long offset; | ||
1421 | |||
1422 | /* If there is no time interpolator set up then do nothing */ | ||
1423 | if (!time_interpolator) | ||
1424 | return; | ||
1425 | |||
1426 | /* | ||
1427 | * The interpolator compensates for late ticks by accumulating the late | ||
1428 | * time in time_interpolator->offset. A tick earlier than expected will | ||
1429 | * lead to a reset of the offset and a corresponding jump of the clock | ||
1430 | * forward. Again this only works if the interpolator clock is running | ||
1431 | 	 * slightly slower than the regular clock and the tuning logic ensures | ||
1432 | * that. | ||
1433 | */ | ||
1434 | |||
1435 | counter = time_interpolator_get_counter(1); | ||
1436 | offset = time_interpolator->offset + | ||
1437 | GET_TI_NSECS(counter, time_interpolator); | ||
1438 | |||
1439 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | ||
1440 | time_interpolator->offset = offset - delta_nsec; | ||
1441 | else { | ||
1442 | time_interpolator->skips++; | ||
1443 | time_interpolator->ns_skipped += delta_nsec - offset; | ||
1444 | time_interpolator->offset = 0; | ||
1445 | } | ||
1446 | time_interpolator->last_counter = counter; | ||
1447 | |||
1448 | /* Tuning logic for time interpolator invoked every minute or so. | ||
1449 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. | ||
1450 | * Increase interpolator clock speed if we skip too much time. | ||
1451 | */ | ||
1452 | if (jiffies % INTERPOLATOR_ADJUST == 0) | ||
1453 | { | ||
1454 | if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) | ||
1455 | time_interpolator->nsec_per_cyc--; | ||
1456 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | ||
1457 | time_interpolator->nsec_per_cyc++; | ||
1458 | time_interpolator->skips = 0; | ||
1459 | time_interpolator->ns_skipped = 0; | ||
1460 | } | ||
1461 | } | ||
1462 | |||
1463 | static inline int | ||
1464 | is_better_time_interpolator(struct time_interpolator *new) | ||
1465 | { | ||
1466 | if (!time_interpolator) | ||
1467 | return 1; | ||
1468 | return new->frequency > 2*time_interpolator->frequency || | ||
1469 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; | ||
1470 | } | ||
1471 | |||
1472 | void | ||
1473 | register_time_interpolator(struct time_interpolator *ti) | ||
1474 | { | ||
1475 | unsigned long flags; | ||
1476 | |||
1477 | /* Sanity check */ | ||
1478 | BUG_ON(ti->frequency == 0 || ti->mask == 0); | ||
1479 | |||
1480 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | ||
1481 | spin_lock(&time_interpolator_lock); | ||
1482 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1483 | if (is_better_time_interpolator(ti)) { | ||
1484 | time_interpolator = ti; | ||
1485 | time_interpolator_reset(); | ||
1486 | } | ||
1487 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1488 | |||
1489 | ti->next = time_interpolator_list; | ||
1490 | time_interpolator_list = ti; | ||
1491 | spin_unlock(&time_interpolator_lock); | ||
1492 | } | ||
1493 | |||
1494 | void | ||
1495 | unregister_time_interpolator(struct time_interpolator *ti) | ||
1496 | { | ||
1497 | struct time_interpolator *curr, **prev; | ||
1498 | unsigned long flags; | ||
1499 | |||
1500 | spin_lock(&time_interpolator_lock); | ||
1501 | prev = &time_interpolator_list; | ||
1502 | for (curr = *prev; curr; curr = curr->next) { | ||
1503 | if (curr == ti) { | ||
1504 | *prev = curr->next; | ||
1505 | break; | ||
1506 | } | ||
1507 | prev = &curr->next; | ||
1508 | } | ||
1509 | |||
1510 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1511 | if (ti == time_interpolator) { | ||
1512 | /* we lost the best time-interpolator: */ | ||
1513 | time_interpolator = NULL; | ||
1514 | /* find the next-best interpolator */ | ||
1515 | for (curr = time_interpolator_list; curr; curr = curr->next) | ||
1516 | if (is_better_time_interpolator(curr)) | ||
1517 | time_interpolator = curr; | ||
1518 | time_interpolator_reset(); | ||
1519 | } | ||
1520 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1521 | spin_unlock(&time_interpolator_lock); | ||
1522 | } | ||
1523 | #endif /* CONFIG_TIME_INTERPOLATION */ | ||
1524 | |||
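Within the removed block, GET_TI_NSECS() turned a raw counter delta into nanoseconds with a fixed-point multiply and shift: register_time_interpolator() precomputes nsec_per_cyc = (NSEC_PER_SEC << shift) / frequency, so (delta * nsec_per_cyc) >> shift gives elapsed nanoseconds without a division on the fast path. A standalone sketch of that arithmetic (frequency, shift and counter values are chosen for illustration only):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned int shift = 16;
	uint64_t frequency = 19200000ULL;		/* assumed 19.2 MHz counter */
	uint64_t nsec_per_cyc = (NSEC_PER_SEC << shift) / frequency;
	uint64_t mask = ~0ULL;
	uint64_t last_counter = 1000, now = 1960;	/* 960 cycles elapsed */

	/* Same shape as GET_TI_NSECS(count, i) in the removed code. */
	uint64_t ns = (((now - last_counter) & mask) * nsec_per_cyc) >> shift;

	printf("%llu ns\n", (unsigned long long)ns);	/* ~50 us (49999 after truncation) */
	return 0;
}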
1525 | /** | 1352 | /** |
1526 | * msleep - sleep safely even with waitqueue interruptions | 1353 | * msleep - sleep safely even with waitqueue interruptions |
1527 | * @msecs: Time in milliseconds to sleep for | 1354 | * @msecs: Time in milliseconds to sleep for |
diff --git a/kernel/user.c b/kernel/user.c index 4869563080..e7d11cef69 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -14,20 +14,19 @@ | |||
14 | #include <linux/bitops.h> | 14 | #include <linux/bitops.h> |
15 | #include <linux/key.h> | 15 | #include <linux/key.h> |
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/module.h> | ||
18 | #include <linux/user_namespace.h> | ||
17 | 19 | ||
18 | /* | 20 | /* |
19 | * UID task count cache, to get fast user lookup in "alloc_uid" | 21 | * UID task count cache, to get fast user lookup in "alloc_uid" |
20 | * when changing user ID's (ie setuid() and friends). | 22 | * when changing user ID's (ie setuid() and friends). |
21 | */ | 23 | */ |
22 | 24 | ||
23 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) | ||
24 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
25 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | 25 | #define UIDHASH_MASK (UIDHASH_SZ - 1) |
26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
27 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) | 27 | #define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) |
28 | 28 | ||
29 | static struct kmem_cache *uid_cachep; | 29 | static struct kmem_cache *uid_cachep; |
30 | static struct list_head uidhash_table[UIDHASH_SZ]; | ||
31 | 30 | ||
32 | /* | 31 | /* |
33 | * The uidhash_lock is mostly taken from process context, but it is | 32 | * The uidhash_lock is mostly taken from process context, but it is |
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid) | |||
94 | { | 93 | { |
95 | struct user_struct *ret; | 94 | struct user_struct *ret; |
96 | unsigned long flags; | 95 | unsigned long flags; |
96 | struct user_namespace *ns = current->nsproxy->user_ns; | ||
97 | 97 | ||
98 | spin_lock_irqsave(&uidhash_lock, flags); | 98 | spin_lock_irqsave(&uidhash_lock, flags); |
99 | ret = uid_hash_find(uid, uidhashentry(uid)); | 99 | ret = uid_hash_find(uid, uidhashentry(ns, uid)); |
100 | spin_unlock_irqrestore(&uidhash_lock, flags); | 100 | spin_unlock_irqrestore(&uidhash_lock, flags); |
101 | return ret; | 101 | return ret; |
102 | } | 102 | } |
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up) | |||
120 | } | 120 | } |
121 | } | 121 | } |
122 | 122 | ||
123 | struct user_struct * alloc_uid(uid_t uid) | 123 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
124 | { | 124 | { |
125 | struct list_head *hashent = uidhashentry(uid); | 125 | struct list_head *hashent = uidhashentry(ns, uid); |
126 | struct user_struct *up; | 126 | struct user_struct *up; |
127 | 127 | ||
128 | spin_lock_irq(&uidhash_lock); | 128 | spin_lock_irq(&uidhash_lock); |
@@ -208,14 +208,14 @@ static int __init uid_cache_init(void) | |||
208 | int n; | 208 | int n; |
209 | 209 | ||
210 | uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), | 210 | uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), |
211 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 211 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
212 | 212 | ||
213 | for(n = 0; n < UIDHASH_SZ; ++n) | 213 | for(n = 0; n < UIDHASH_SZ; ++n) |
214 | INIT_LIST_HEAD(uidhash_table + n); | 214 | INIT_LIST_HEAD(init_user_ns.uidhash_table + n); |
215 | 215 | ||
216 | /* Insert the root user immediately (init already runs as root) */ | 216 | /* Insert the root user immediately (init already runs as root) */ |
217 | spin_lock_irq(&uidhash_lock); | 217 | spin_lock_irq(&uidhash_lock); |
218 | uid_hash_insert(&root_user, uidhashentry(0)); | 218 | uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); |
219 | spin_unlock_irq(&uidhash_lock); | 219 | spin_unlock_irq(&uidhash_lock); |
220 | 220 | ||
221 | return 0; | 221 | return 0; |
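With uidhashentry() now taking a namespace argument, the uid hash buckets live inside the user_namespace, so the same numeric uid can resolve to different user_struct objects in different namespaces. A compact userspace sketch of that layout, reusing the hash function's shape but with illustrative types:

#include <stdio.h>
#include <stdlib.h>

#define UIDHASH_BITS	3
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define uidhashfn(uid)	((((uid) >> UIDHASH_BITS) + (uid)) & UIDHASH_MASK)

struct user {
	unsigned int uid;
	struct user *next;
};

struct user_ns {
	struct user *buckets[UIDHASH_SZ];	/* hash table is per namespace */
};

static struct user *find_user(struct user_ns *ns, unsigned int uid)
{
	struct user *u;

	for (u = ns->buckets[uidhashfn(uid)]; u; u = u->next)
		if (u->uid == uid)
			return u;
	return NULL;
}

static struct user *add_user(struct user_ns *ns, unsigned int uid)
{
	struct user *u = calloc(1, sizeof(*u));

	if (!u)
		return NULL;
	u->uid = uid;
	u->next = ns->buckets[uidhashfn(uid)];
	ns->buckets[uidhashfn(uid)] = u;
	return u;
}

int main(void)
{
	struct user_ns a = { { NULL } }, b = { { NULL } };

	add_user(&a, 0);
	printf("uid 0 in a: %s, in b: %s\n",
	       find_user(&a, 0) ? "yes" : "no",
	       find_user(&b, 0) ? "yes" : "no");	/* yes, no */
	return 0;
}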
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c new file mode 100644 index 0000000000..d055d98785 --- /dev/null +++ b/kernel/user_namespace.c | |||
@@ -0,0 +1,87 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or | ||
3 | * modify it under the terms of the GNU General Public License as | ||
4 | * published by the Free Software Foundation, version 2 of the | ||
5 | * License. | ||
6 | */ | ||
7 | |||
8 | #include <linux/module.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <linux/nsproxy.h> | ||
11 | #include <linux/user_namespace.h> | ||
12 | |||
13 | struct user_namespace init_user_ns = { | ||
14 | .kref = { | ||
15 | .refcount = ATOMIC_INIT(2), | ||
16 | }, | ||
17 | .root_user = &root_user, | ||
18 | }; | ||
19 | |||
20 | EXPORT_SYMBOL_GPL(init_user_ns); | ||
21 | |||
22 | #ifdef CONFIG_USER_NS | ||
23 | |||
24 | /* | ||
25 | * Clone a new ns copying an original user ns, setting refcount to 1 | ||
26 | * @old_ns: namespace to clone | ||
27 | * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise | ||
28 | */ | ||
29 | static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) | ||
30 | { | ||
31 | struct user_namespace *ns; | ||
32 | struct user_struct *new_user; | ||
33 | int n; | ||
34 | |||
35 | ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); | ||
36 | if (!ns) | ||
37 | return ERR_PTR(-ENOMEM); | ||
38 | |||
39 | kref_init(&ns->kref); | ||
40 | |||
41 | for (n = 0; n < UIDHASH_SZ; ++n) | ||
42 | INIT_LIST_HEAD(ns->uidhash_table + n); | ||
43 | |||
44 | /* Insert new root user. */ | ||
45 | ns->root_user = alloc_uid(ns, 0); | ||
46 | if (!ns->root_user) { | ||
47 | kfree(ns); | ||
48 | return ERR_PTR(-ENOMEM); | ||
49 | } | ||
50 | |||
51 | 	/* Replace current->user with a new one */ | ||
52 | new_user = alloc_uid(ns, current->uid); | ||
53 | if (!new_user) { | ||
54 | free_uid(ns->root_user); | ||
55 | kfree(ns); | ||
56 | return ERR_PTR(-ENOMEM); | ||
57 | } | ||
58 | |||
59 | switch_uid(new_user); | ||
60 | return ns; | ||
61 | } | ||
62 | |||
63 | struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) | ||
64 | { | ||
65 | struct user_namespace *new_ns; | ||
66 | |||
67 | BUG_ON(!old_ns); | ||
68 | get_user_ns(old_ns); | ||
69 | |||
70 | if (!(flags & CLONE_NEWUSER)) | ||
71 | return old_ns; | ||
72 | |||
73 | new_ns = clone_user_ns(old_ns); | ||
74 | |||
75 | put_user_ns(old_ns); | ||
76 | return new_ns; | ||
77 | } | ||
78 | |||
79 | void free_user_ns(struct kref *kref) | ||
80 | { | ||
81 | struct user_namespace *ns; | ||
82 | |||
83 | ns = container_of(kref, struct user_namespace, kref); | ||
84 | kfree(ns); | ||
85 | } | ||
86 | |||
87 | #endif /* CONFIG_USER_NS */ | ||
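copy_user_ns() follows the usual namespace refcounting pattern: take a reference on the old namespace, hand it back unchanged unless CLONE_NEWUSER is set, and otherwise build a fresh namespace and drop the extra reference on the old one. A userspace sketch of that lifecycle, with C11 atomics standing in for kref and an illustrative flag value:

#include <stdlib.h>
#include <stdatomic.h>

#define FLAG_NEWNS 0x1			/* stand-in for CLONE_NEWUSER */

struct ns {
	atomic_int refcount;
};

static struct ns *get_ns(struct ns *ns)
{
	atomic_fetch_add(&ns->refcount, 1);
	return ns;
}

static void put_ns(struct ns *ns)
{
	/* Last reference dropped: release, like free_user_ns(). */
	if (atomic_fetch_sub(&ns->refcount, 1) == 1)
		free(ns);
}

static struct ns *clone_ns(void)
{
	struct ns *ns = malloc(sizeof(*ns));

	if (ns)
		atomic_init(&ns->refcount, 1);
	return ns;
}

static struct ns *copy_ns(unsigned long flags, struct ns *old_ns)
{
	struct ns *new_ns;

	get_ns(old_ns);
	if (!(flags & FLAG_NEWNS))
		return old_ns;		/* share: caller keeps the extra reference */

	new_ns = clone_ns();		/* may fail; the kernel returns ERR_PTR here */
	put_ns(old_ns);
	return new_ns;
}

int main(void)
{
	struct ns *root = clone_ns();

	if (!root)
		return 1;

	struct ns *shared = copy_ns(0, root);		/* same object, refcount 2 */
	struct ns *fresh = copy_ns(FLAG_NEWNS, root);	/* independent object */

	put_ns(shared);
	if (fresh)
		put_ns(fresh);
	put_ns(root);
	return 0;
}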
diff --git a/kernel/utsname.c b/kernel/utsname.c index 160c8c5136..9d8180a0f0 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/uts.h> | 13 | #include <linux/uts.h> |
14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
15 | #include <linux/version.h> | 15 | #include <linux/version.h> |
16 | #include <linux/err.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * Clone a new ns copying an original utsname, setting refcount to 1 | 19 | * Clone a new ns copying an original utsname, setting refcount to 1 |
@@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
24 | struct uts_namespace *ns; | 25 | struct uts_namespace *ns; |
25 | 26 | ||
26 | ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); | 27 | ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); |
27 | if (ns) { | 28 | if (!ns) |
28 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 29 | return ERR_PTR(-ENOMEM); |
29 | kref_init(&ns->kref); | 30 | |
30 | } | 31 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
32 | kref_init(&ns->kref); | ||
31 | return ns; | 33 | return ns; |
32 | } | 34 | } |
33 | 35 | ||
@@ -37,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
37 | * utsname of this process won't be seen by parent, and vice | 39 | * utsname of this process won't be seen by parent, and vice |
38 | * versa. | 40 | * versa. |
39 | */ | 41 | */ |
40 | struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) | 42 | struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) |
41 | { | 43 | { |
42 | struct uts_namespace *new_ns; | 44 | struct uts_namespace *new_ns; |
43 | 45 | ||
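clone_uts_ns() (like clone_user_ns() above) now reports allocation failure through the ERR_PTR convention rather than a bare NULL: a small negative errno is encoded in the pointer value, so a single return carries either a valid object or an error code. A userspace re-implementation of the idea, for illustration only:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define MAX_ERRNO 4095

static void *err_ptr(long error)
{
	return (void *)(intptr_t)error;	/* errno lives at the top of the address space */
}

static long ptr_err(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static int is_err(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

static void *clone_thing(void)
{
	void *p = malloc(64);

	if (!p)
		return err_ptr(-ENOMEM);	/* as clone_uts_ns() now does */
	return p;
}

int main(void)
{
	void *p = clone_thing();

	if (is_err(p)) {
		printf("error %ld\n", ptr_err(p));
		return 1;
	}
	free(p);
	return 0;
}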
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index f22b9dbd2a..c76c06466b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -18,10 +18,7 @@ | |||
18 | static void *get_uts(ctl_table *table, int write) | 18 | static void *get_uts(ctl_table *table, int write) |
19 | { | 19 | { |
20 | char *which = table->data; | 20 | char *which = table->data; |
21 | #ifdef CONFIG_UTS_NS | 21 | |
22 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
23 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
24 | #endif | ||
25 | if (!write) | 22 | if (!write) |
26 | down_read(&uts_sem); | 23 | down_read(&uts_sem); |
27 | else | 24 | else |
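The lines removed from get_uts() performed a pointer-offset translation: given a pointer into init_uts_ns, they computed its byte offset and applied it to the current task's uts_namespace to reach the corresponding field. A sketch of that offset trick with illustrative userspace types:

#include <stdio.h>

struct name_set {
	char sysname[16];
	char nodename[16];
};

/* Map a pointer to a field of `from` onto the same field of `to`. */
static void *translate(void *field, void *from, void *to)
{
	return (char *)to + ((char *)field - (char *)from);
}

int main(void)
{
	struct name_set init_ns = { "Linux", "init-node" };
	struct name_set other_ns = { "Linux", "container-node" };
	char *which = init_ns.nodename;

	which = translate(which, &init_ns, &other_ns);
	printf("%s\n", which);		/* container-node */
	return 0;
}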
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bebf73be9..58e5c152a6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -282,8 +282,8 @@ static int worker_thread(void *__cwq) | |||
282 | struct cpu_workqueue_struct *cwq = __cwq; | 282 | struct cpu_workqueue_struct *cwq = __cwq; |
283 | DEFINE_WAIT(wait); | 283 | DEFINE_WAIT(wait); |
284 | 284 | ||
285 | if (!cwq->wq->freezeable) | 285 | if (cwq->wq->freezeable) |
286 | current->flags |= PF_NOFREEZE; | 286 | set_freezable(); |
287 | 287 | ||
288 | set_user_nice(current, -5); | 288 | set_user_nice(current, -5); |
289 | 289 | ||
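The worker_thread() change inverts the freezer opt-out into an opt-in: instead of every non-freezeable worker setting PF_NOFREEZE, threads now default to non-freezable and a freezeable workqueue calls set_freezable(). A toy sketch of the default-off, opt-in flag pattern (plain flags and illustrative names, not task_struct):

#include <stdio.h>

#define TASK_FREEZABLE 0x1

struct task { unsigned int flags; };		/* default: not freezable */

static void set_freezable(struct task *t)
{
	t->flags |= TASK_FREEZABLE;
}

static void worker_init(struct task *t, int wq_freezeable)
{
	/* was: if (!wq_freezeable) t->flags |= PF_NOFREEZE; */
	if (wq_freezeable)
		set_freezable(t);
}

int main(void)
{
	struct task t = { 0 };

	worker_init(&t, 1);
	printf("freezable: %u\n", t.flags & TASK_FREEZABLE);	/* 1 */
	return 0;
}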
@@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
382 | EXPORT_SYMBOL_GPL(flush_workqueue); | 382 | EXPORT_SYMBOL_GPL(flush_workqueue); |
383 | 383 | ||
384 | /* | 384 | /* |
385 | * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, | 385 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, |
386 | * so this work can't be re-armed in any way. | 386 | * so this work can't be re-armed in any way. |
387 | */ | 387 | */ |
388 | static int try_to_grab_pending(struct work_struct *work) | 388 | static int try_to_grab_pending(struct work_struct *work) |
389 | { | 389 | { |
390 | struct cpu_workqueue_struct *cwq; | 390 | struct cpu_workqueue_struct *cwq; |
391 | int ret = 0; | 391 | int ret = -1; |
392 | 392 | ||
393 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) | 393 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) |
394 | return 1; | 394 | return 0; |
395 | 395 | ||
396 | /* | 396 | /* |
397 | * The queueing is in progress, or it is already queued. Try to | 397 | * The queueing is in progress, or it is already queued. Try to |
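try_to_grab_pending() switches to a tri-state result: a negative value means the outcome could not be decided and the caller must retry, while 0 or 1 means the caller now owns the pending bit and learns whether the work was actually pending. A minimal sketch of a caller retrying on the negative case (the grab function below is a stand-in, not the kernel's):

#include <stdio.h>

static int attempts;

static int try_to_grab(void)
{
	if (++attempts < 3)
		return -1;		/* contended: ask the caller to retry */
	return 1;			/* >= 0: grabbed, and it was pending */
}

int main(void)
{
	int ret;

	do {
		ret = try_to_grab();
	} while (ret < 0);		/* the same loop shape __cancel_work_timer() uses */

	printf("grabbed after %d attempts, was pending: %d\n", attempts, ret);
	return 0;
}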
@@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work) | |||
457 | wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 457 | wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
458 | } | 458 | } |
459 | 459 | ||
460 | static int __cancel_work_timer(struct work_struct *work, | ||
461 | struct timer_list* timer) | ||
462 | { | ||
463 | int ret; | ||
464 | |||
465 | do { | ||
466 | ret = (timer && likely(del_timer(timer))); | ||
467 | if (!ret) | ||
468 | ret = try_to_grab_pending(work); | ||
469 | wait_on_work(work); | ||
470 | } while (unlikely(ret < 0)); | ||
471 | |||
472 | work_clear_pending(work); | ||
473 | return ret; | ||
474 | } | ||
475 | |||
460 | /** | 476 | /** |
461 | * cancel_work_sync - block until a work_struct's callback has terminated | 477 | * cancel_work_sync - block until a work_struct's callback has terminated |
462 | * @work: the work which is to be flushed | 478 | * @work: the work which is to be flushed |
463 | * | 479 | * |
480 | * Returns true if @work was pending. | ||
481 | * | ||
464 | * cancel_work_sync() will cancel the work if it is queued. If the work's | 482 | * cancel_work_sync() will cancel the work if it is queued. If the work's |
465 | * callback appears to be running, cancel_work_sync() will block until it | 483 | * callback appears to be running, cancel_work_sync() will block until it |
466 | * has completed. | 484 | * has completed. |
@@ -476,31 +494,26 @@ static void wait_on_work(struct work_struct *work) | |||
476 | * The caller must ensure that workqueue_struct on which this work was last | 494 | * The caller must ensure that workqueue_struct on which this work was last |
477 | * queued can't be destroyed before this function returns. | 495 | * queued can't be destroyed before this function returns. |
478 | */ | 496 | */ |
479 | void cancel_work_sync(struct work_struct *work) | 497 | int cancel_work_sync(struct work_struct *work) |
480 | { | 498 | { |
481 | while (!try_to_grab_pending(work)) | 499 | return __cancel_work_timer(work, NULL); |
482 | cpu_relax(); | ||
483 | wait_on_work(work); | ||
484 | work_clear_pending(work); | ||
485 | } | 500 | } |
486 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 501 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
487 | 502 | ||
488 | /** | 503 | /** |
489 | * cancel_rearming_delayed_work - reliably kill off a delayed work. | 504 | * cancel_delayed_work_sync - reliably kill off a delayed work. |
490 | * @dwork: the delayed work struct | 505 | * @dwork: the delayed work struct |
491 | * | 506 | * |
507 | * Returns true if @dwork was pending. | ||
508 | * | ||
492 | * It is possible to use this function if @dwork rearms itself via queue_work() | 509 | * It is possible to use this function if @dwork rearms itself via queue_work() |
493 | * or queue_delayed_work(). See also the comment for cancel_work_sync(). | 510 | * or queue_delayed_work(). See also the comment for cancel_work_sync(). |
494 | */ | 511 | */ |
495 | void cancel_rearming_delayed_work(struct delayed_work *dwork) | 512 | int cancel_delayed_work_sync(struct delayed_work *dwork) |
496 | { | 513 | { |
497 | while (!del_timer(&dwork->timer) && | 514 | return __cancel_work_timer(&dwork->work, &dwork->timer); |
498 | !try_to_grab_pending(&dwork->work)) | ||
499 | cpu_relax(); | ||
500 | wait_on_work(&dwork->work); | ||
501 | work_clear_pending(&dwork->work); | ||
502 | } | 515 | } |
503 | EXPORT_SYMBOL(cancel_rearming_delayed_work); | 516 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
504 | 517 | ||
505 | static struct workqueue_struct *keventd_wq __read_mostly; | 518 | static struct workqueue_struct *keventd_wq __read_mostly; |
506 | 519 | ||
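Both cancellation paths are now thin wrappers around __cancel_work_timer(): the helper first tries to delete the timer (if one exists), falls back to grabbing the pending bit, waits for a running callback, and retries while the grab is undecided; its final value tells the caller whether anything was pending. A userspace sketch of that shared-helper shape with stand-in types (wait_on_work() is elided):

#include <stdbool.h>
#include <stdio.h>

struct fake_timer { bool armed; };
struct fake_work { bool pending; };

static bool del_timer(struct fake_timer *t)
{
	bool was_armed = t->armed;

	t->armed = false;
	return was_armed;
}

static int try_to_grab_pending(struct fake_work *w)
{
	int was_pending = w->pending;	/* no contention in this toy model */

	w->pending = false;
	return was_pending;
}

static int cancel_common(struct fake_work *w, struct fake_timer *t)
{
	int ret;

	do {
		ret = (t && del_timer(t));
		if (!ret)
			ret = try_to_grab_pending(w);
		/* wait_on_work(w) would run here before re-checking */
	} while (ret < 0);

	return ret;			/* true if the work or its timer was pending */
}

static int cancel_work_sync(struct fake_work *w)
{
	return cancel_common(w, NULL);
}

static int cancel_delayed_work_sync(struct fake_work *w, struct fake_timer *t)
{
	return cancel_common(w, t);
}

int main(void)
{
	struct fake_work w = { .pending = true };
	struct fake_timer t = { .armed = false };

	printf("was pending: %d\n", cancel_delayed_work_sync(&w, &t));	/* 1 */
	printf("was pending: %d\n", cancel_work_sync(&w));		/* 0 */
	return 0;
}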
@@ -739,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
739 | if (cwq->thread == NULL) | 752 | if (cwq->thread == NULL) |
740 | return; | 753 | return; |
741 | 754 | ||
755 | flush_cpu_workqueue(cwq); | ||
742 | /* | 756 | /* |
743 | * If the caller is CPU_DEAD the single flush_cpu_workqueue() | 757 | * If the caller is CPU_DEAD and cwq->worklist was not empty, |
744 | * is not enough, a concurrent flush_workqueue() can insert a | 758 | * a concurrent flush_workqueue() can insert a barrier after us. |
745 | * barrier after us. | 759 | * However, in that case run_workqueue() won't return and check |
760 | 	 * kthread_should_stop() until it flushes all work_structs. | ||
746 | * When ->worklist becomes empty it is safe to exit because no | 761 | * When ->worklist becomes empty it is safe to exit because no |
747 | * more work_structs can be queued on this cwq: flush_workqueue | 762 | * more work_structs can be queued on this cwq: flush_workqueue |
748 | * checks list_empty(), and a "normal" queue_work() can't use | 763 | * checks list_empty(), and a "normal" queue_work() can't use |
749 | * a dead CPU. | 764 | * a dead CPU. |
750 | */ | 765 | */ |
751 | while (flush_cpu_workqueue(cwq)) | ||
752 | ; | ||
753 | |||
754 | kthread_stop(cwq->thread); | 766 | kthread_stop(cwq->thread); |
755 | cwq->thread = NULL; | 767 | cwq->thread = NULL; |
756 | } | 768 | } |
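cleanup_workqueue_thread() now flushes the CPU workqueue once and then stops the thread; as the rewritten comment notes, run_workqueue() will not return to re-check kthread_should_stop() until the worklist is empty, so a single flush is enough. A userspace sketch of that shutdown ordering, with pthreads and a counter standing in for the kthread and its worklist:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int pending;			/* queued "work items" */
static int should_stop;

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (pending > 0)
			pending--;	/* "run" one work item */
		if (should_stop)	/* only exit once the list is drained */
			break;
		pthread_cond_wait(&cond, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pending = 3;			/* work queued before shutdown */
	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	should_stop = 1;		/* ask the worker to stop after draining */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);		/* like kthread_stop(): wait for the worker to exit */
	printf("pending after shutdown: %d\n", pending);	/* 0 */
	return 0;
}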