author     Dmitry Torokhov <dmitry.torokhov@gmail.com>	2007-10-12 21:27:47 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>	2007-10-12 21:27:47 -0400
commit     b981d8b3f5e008ff10d993be633ad00564fc22cd (patch)
tree       e292dc07b22308912cf6a58354a608b9e5e8e1fd /kernel
parent     b11d2127c4893a7315d1e16273bc8560049fa3ca (diff)
parent     2b9e0aae1d50e880c58d46788e5e3ebd89d75d62 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
drivers/macintosh/adbhid.c
Diffstat (limited to 'kernel')
71 files changed, 3643 insertions, 2175 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..6b066632e40c 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -63,3 +63,6 @@ config PREEMPT_BKL | |||
63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you are building a kernel for a desktop system. |
64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
65 | 65 | ||
66 | config PREEMPT_NOTIFIERS | ||
67 | bool | ||
68 | |||
diff --git a/kernel/acct.c b/kernel/acct.c
index 70d0d88e5554..24f0f8b2ba72 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -468,7 +468,7 @@ static void do_acct_process(struct file *file) | |||
468 | } | 468 | } |
469 | #endif | 469 | #endif |
470 | do_div(elapsed, AHZ); | 470 | do_div(elapsed, AHZ); |
471 | ac.ac_btime = xtime.tv_sec - elapsed; | 471 | ac.ac_btime = get_seconds() - elapsed; |
472 | /* we really need to bite the bullet and change layout */ | 472 | /* we really need to bite the bullet and change layout */ |
473 | ac.ac_uid = current->uid; | 473 | ac.ac_uid = current->uid; |
474 | ac.ac_gid = current->gid; | 474 | ac.ac_gid = current->gid; |
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..2924251a6547 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb) | |||
847 | } | 847 | } |
848 | 848 | ||
849 | /* Receive messages from netlink socket. */ | 849 | /* Receive messages from netlink socket. */ |
850 | static void audit_receive(struct sock *sk, int length) | 850 | static void audit_receive(struct sk_buff *skb) |
851 | { | 851 | { |
852 | struct sk_buff *skb; | ||
853 | unsigned int qlen; | ||
854 | |||
855 | mutex_lock(&audit_cmd_mutex); | 852 | mutex_lock(&audit_cmd_mutex); |
856 | 853 | audit_receive_skb(skb); | |
857 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | ||
858 | skb = skb_dequeue(&sk->sk_receive_queue); | ||
859 | audit_receive_skb(skb); | ||
860 | kfree_skb(skb); | ||
861 | } | ||
862 | mutex_unlock(&audit_cmd_mutex); | 854 | mutex_unlock(&audit_cmd_mutex); |
863 | } | 855 | } |
864 | 856 | ||
@@ -876,8 +868,8 @@ static int __init audit_init(void) | |||
876 | 868 | ||
877 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 869 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
878 | audit_default ? "enabled" : "disabled"); | 870 | audit_default ? "enabled" : "disabled"); |
879 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, | 871 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, |
880 | NULL, THIS_MODULE); | 872 | audit_receive, NULL, THIS_MODULE); |
881 | if (!audit_sock) | 873 | if (!audit_sock) |
882 | audit_panic("cannot initialize netlink socket"); | 874 | audit_panic("cannot initialize netlink socket"); |
883 | else | 875 | else |
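The audit.c hunks above follow an upstream netlink API change: the input callback now receives one sk_buff per call, and netlink_kernel_create() takes the network namespace as its first argument. A minimal sketch of that post-merge calling convention, using hypothetical demo_* names (only netlink_kernel_create(), NETLINK_AUDIT, init_net and THIS_MODULE come from the hunks themselves):

#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/netlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>

static DEFINE_MUTEX(demo_cmd_mutex);
static struct sock *demo_nlsk;

/* Post-merge style: the receive callback gets a single skb. */
static void demo_receive(struct sk_buff *skb)
{
	mutex_lock(&demo_cmd_mutex);
	/* parse the netlink message(s) carried by skb here */
	mutex_unlock(&demo_cmd_mutex);
}

static int __init demo_init(void)
{
	/* &init_net is the namespace argument added by this merge */
	demo_nlsk = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
					  demo_receive, NULL, THIS_MODULE);
	return demo_nlsk ? 0 : -ENOMEM;
}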
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 1bf093dcffe0..359645cff5b2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list) | |||
304 | 304 | ||
305 | int audit_match_class(int class, unsigned syscall) | 305 | int audit_match_class(int class, unsigned syscall) |
306 | { | 306 | { |
307 | if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) | 307 | if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) |
308 | return 0; | 308 | return 0; |
309 | if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) | 309 | if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) |
310 | return 0; | 310 | return 0; |
@@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
456 | case AUDIT_DEVMINOR: | 456 | case AUDIT_DEVMINOR: |
457 | case AUDIT_EXIT: | 457 | case AUDIT_EXIT: |
458 | case AUDIT_SUCCESS: | 458 | case AUDIT_SUCCESS: |
459 | /* bit ops are only useful on syscall args */ | ||
460 | if (f->op == AUDIT_BIT_MASK || | ||
461 | f->op == AUDIT_BIT_TEST) { | ||
462 | err = -EINVAL; | ||
463 | goto exit_free; | ||
464 | } | ||
465 | break; | ||
459 | case AUDIT_ARG0: | 466 | case AUDIT_ARG0: |
460 | case AUDIT_ARG1: | 467 | case AUDIT_ARG1: |
461 | case AUDIT_ARG2: | 468 | case AUDIT_ARG2: |
@@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) | |||
1566 | return (left > right); | 1573 | return (left > right); |
1567 | case AUDIT_GREATER_THAN_OR_EQUAL: | 1574 | case AUDIT_GREATER_THAN_OR_EQUAL: |
1568 | return (left >= right); | 1575 | return (left >= right); |
1576 | case AUDIT_BIT_MASK: | ||
1577 | return (left & right); | ||
1578 | case AUDIT_BIT_TEST: | ||
1579 | return ((left & right) == right); | ||
1569 | } | 1580 | } |
1570 | BUG(); | 1581 | BUG(); |
1571 | return 0; | 1582 | return 0; |
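The two comparators added above have "any of the bits" (AUDIT_BIT_MASK) versus "all of the bits" (AUDIT_BIT_TEST) semantics. A standalone sketch of the same logic in plain userspace C, with made-up values, just to make the distinction concrete:

#include <assert.h>
#include <stdint.h>

/* Mirrors the AUDIT_BIT_MASK case: true if any requested bit is set. */
static int bit_mask(uint32_t left, uint32_t right)
{
	return (left & right) != 0;
}

/* Mirrors the AUDIT_BIT_TEST case: true only if every requested bit is set. */
static int bit_test(uint32_t left, uint32_t right)
{
	return (left & right) == right;
}

int main(void)
{
	assert(bit_mask(0x6, 0x2));	/* bit 1 is present in 0x6 */
	assert(!bit_test(0x6, 0x3));	/* bit 0 is missing from 0x6 */
	return 0;
}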
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b7640a5f382a..04f3ffb8d9d4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -153,7 +153,7 @@ struct audit_aux_data_execve { | |||
153 | struct audit_aux_data d; | 153 | struct audit_aux_data d; |
154 | int argc; | 154 | int argc; |
155 | int envc; | 155 | int envc; |
156 | char mem[0]; | 156 | struct mm_struct *mm; |
157 | }; | 157 | }; |
158 | 158 | ||
159 | struct audit_aux_data_socketcall { | 159 | struct audit_aux_data_socketcall { |
@@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair { | |||
173 | int fd[2]; | 173 | int fd[2]; |
174 | }; | 174 | }; |
175 | 175 | ||
176 | struct audit_aux_data_path { | ||
177 | struct audit_aux_data d; | ||
178 | struct dentry *dentry; | ||
179 | struct vfsmount *mnt; | ||
180 | }; | ||
181 | |||
182 | struct audit_aux_data_pids { | 176 | struct audit_aux_data_pids { |
183 | struct audit_aux_data d; | 177 | struct audit_aux_data d; |
184 | pid_t target_pid[AUDIT_AUX_PIDS]; | 178 | pid_t target_pid[AUDIT_AUX_PIDS]; |
@@ -654,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context) | |||
654 | struct audit_aux_data *aux; | 648 | struct audit_aux_data *aux; |
655 | 649 | ||
656 | while ((aux = context->aux)) { | 650 | while ((aux = context->aux)) { |
657 | if (aux->type == AUDIT_AVC_PATH) { | ||
658 | struct audit_aux_data_path *axi = (void *)aux; | ||
659 | dput(axi->dentry); | ||
660 | mntput(axi->mnt); | ||
661 | } | ||
662 | |||
663 | context->aux = aux->next; | 651 | context->aux = aux->next; |
664 | kfree(aux); | 652 | kfree(aux); |
665 | } | 653 | } |
@@ -831,6 +819,57 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
831 | return rc; | 819 | return rc; |
832 | } | 820 | } |
833 | 821 | ||
822 | static void audit_log_execve_info(struct audit_buffer *ab, | ||
823 | struct audit_aux_data_execve *axi) | ||
824 | { | ||
825 | int i; | ||
826 | long len, ret; | ||
827 | const char __user *p; | ||
828 | char *buf; | ||
829 | |||
830 | if (axi->mm != current->mm) | ||
831 | return; /* execve failed, no additional info */ | ||
832 | |||
833 | p = (const char __user *)axi->mm->arg_start; | ||
834 | |||
835 | for (i = 0; i < axi->argc; i++, p += len) { | ||
836 | len = strnlen_user(p, MAX_ARG_STRLEN); | ||
837 | /* | ||
838 | * We just created this mm, if we can't find the strings | ||
839 | * we just copied into it something is _very_ wrong. Similar | ||
840 | * for strings that are too long, we should not have created | ||
841 | * any. | ||
842 | */ | ||
843 | if (!len || len > MAX_ARG_STRLEN) { | ||
844 | WARN_ON(1); | ||
845 | send_sig(SIGKILL, current, 0); | ||
846 | } | ||
847 | |||
848 | buf = kmalloc(len, GFP_KERNEL); | ||
849 | if (!buf) { | ||
850 | audit_panic("out of memory for argv string\n"); | ||
851 | break; | ||
852 | } | ||
853 | |||
854 | ret = copy_from_user(buf, p, len); | ||
855 | /* | ||
856 | * There is no reason for this copy to be short. We just | ||
857 | * copied them here, and the mm hasn't been exposed to user- | ||
858 | * space yet. | ||
859 | */ | ||
860 | if (ret) { | ||
861 | WARN_ON(1); | ||
862 | send_sig(SIGKILL, current, 0); | ||
863 | } | ||
864 | |||
865 | audit_log_format(ab, "a%d=", i); | ||
866 | audit_log_untrustedstring(ab, buf); | ||
867 | audit_log_format(ab, "\n"); | ||
868 | |||
869 | kfree(buf); | ||
870 | } | ||
871 | } | ||
872 | |||
834 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 873 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
835 | { | 874 | { |
836 | int i, call_panic = 0; | 875 | int i, call_panic = 0; |
@@ -946,7 +985,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
946 | case AUDIT_IPC: { | 985 | case AUDIT_IPC: { |
947 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 986 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
948 | audit_log_format(ab, | 987 | audit_log_format(ab, |
949 | "ouid=%u ogid=%u mode=%x", | 988 | "ouid=%u ogid=%u mode=%#o", |
950 | axi->uid, axi->gid, axi->mode); | 989 | axi->uid, axi->gid, axi->mode); |
951 | if (axi->osid != 0) { | 990 | if (axi->osid != 0) { |
952 | char *ctx = NULL; | 991 | char *ctx = NULL; |
@@ -965,19 +1004,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
965 | case AUDIT_IPC_SET_PERM: { | 1004 | case AUDIT_IPC_SET_PERM: { |
966 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 1005 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
967 | audit_log_format(ab, | 1006 | audit_log_format(ab, |
968 | "qbytes=%lx ouid=%u ogid=%u mode=%x", | 1007 | "qbytes=%lx ouid=%u ogid=%u mode=%#o", |
969 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 1008 | axi->qbytes, axi->uid, axi->gid, axi->mode); |
970 | break; } | 1009 | break; } |
971 | 1010 | ||
972 | case AUDIT_EXECVE: { | 1011 | case AUDIT_EXECVE: { |
973 | struct audit_aux_data_execve *axi = (void *)aux; | 1012 | struct audit_aux_data_execve *axi = (void *)aux; |
974 | int i; | 1013 | audit_log_execve_info(ab, axi); |
975 | const char *p; | ||
976 | for (i = 0, p = axi->mem; i < axi->argc; i++) { | ||
977 | audit_log_format(ab, "a%d=", i); | ||
978 | p = audit_log_untrustedstring(ab, p); | ||
979 | audit_log_format(ab, "\n"); | ||
980 | } | ||
981 | break; } | 1014 | break; } |
982 | 1015 | ||
983 | case AUDIT_SOCKETCALL: { | 1016 | case AUDIT_SOCKETCALL: { |
@@ -995,11 +1028,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
995 | audit_log_hex(ab, axs->a, axs->len); | 1028 | audit_log_hex(ab, axs->a, axs->len); |
996 | break; } | 1029 | break; } |
997 | 1030 | ||
998 | case AUDIT_AVC_PATH: { | ||
999 | struct audit_aux_data_path *axi = (void *)aux; | ||
1000 | audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); | ||
1001 | break; } | ||
1002 | |||
1003 | case AUDIT_FD_PAIR: { | 1031 | case AUDIT_FD_PAIR: { |
1004 | struct audit_aux_data_fd_pair *axs = (void *)aux; | 1032 | struct audit_aux_data_fd_pair *axs = (void *)aux; |
1005 | audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); | 1033 | audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); |
@@ -1821,32 +1849,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode | |||
1821 | return 0; | 1849 | return 0; |
1822 | } | 1850 | } |
1823 | 1851 | ||
1852 | int audit_argv_kb = 32; | ||
1853 | |||
1824 | int audit_bprm(struct linux_binprm *bprm) | 1854 | int audit_bprm(struct linux_binprm *bprm) |
1825 | { | 1855 | { |
1826 | struct audit_aux_data_execve *ax; | 1856 | struct audit_aux_data_execve *ax; |
1827 | struct audit_context *context = current->audit_context; | 1857 | struct audit_context *context = current->audit_context; |
1828 | unsigned long p, next; | ||
1829 | void *to; | ||
1830 | 1858 | ||
1831 | if (likely(!audit_enabled || !context || context->dummy)) | 1859 | if (likely(!audit_enabled || !context || context->dummy)) |
1832 | return 0; | 1860 | return 0; |
1833 | 1861 | ||
1834 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, | 1862 | /* |
1835 | GFP_KERNEL); | 1863 | * Even though the stack code doesn't limit the arg+env size any more, |
1864 | * the audit code requires that _all_ arguments be logged in a single | ||
1865 | * netlink skb. Hence cap it :-( | ||
1866 | */ | ||
1867 | if (bprm->argv_len > (audit_argv_kb << 10)) | ||
1868 | return -E2BIG; | ||
1869 | |||
1870 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | ||
1836 | if (!ax) | 1871 | if (!ax) |
1837 | return -ENOMEM; | 1872 | return -ENOMEM; |
1838 | 1873 | ||
1839 | ax->argc = bprm->argc; | 1874 | ax->argc = bprm->argc; |
1840 | ax->envc = bprm->envc; | 1875 | ax->envc = bprm->envc; |
1841 | for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { | 1876 | ax->mm = bprm->mm; |
1842 | struct page *page = bprm->page[p / PAGE_SIZE]; | ||
1843 | void *kaddr = kmap(page); | ||
1844 | next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); | ||
1845 | memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); | ||
1846 | to += next - p; | ||
1847 | kunmap(page); | ||
1848 | } | ||
1849 | |||
1850 | ax->d.type = AUDIT_EXECVE; | 1877 | ax->d.type = AUDIT_EXECVE; |
1851 | ax->d.next = context->aux; | 1878 | ax->d.next = context->aux; |
1852 | context->aux = (void *)ax; | 1879 | context->aux = (void *)ax; |
@@ -1949,36 +1976,6 @@ void __audit_ptrace(struct task_struct *t) | |||
1949 | } | 1976 | } |
1950 | 1977 | ||
1951 | /** | 1978 | /** |
1952 | * audit_avc_path - record the granting or denial of permissions | ||
1953 | * @dentry: dentry to record | ||
1954 | * @mnt: mnt to record | ||
1955 | * | ||
1956 | * Returns 0 for success or NULL context or < 0 on error. | ||
1957 | * | ||
1958 | * Called from security/selinux/avc.c::avc_audit() | ||
1959 | */ | ||
1960 | int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | ||
1961 | { | ||
1962 | struct audit_aux_data_path *ax; | ||
1963 | struct audit_context *context = current->audit_context; | ||
1964 | |||
1965 | if (likely(!context)) | ||
1966 | return 0; | ||
1967 | |||
1968 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1969 | if (!ax) | ||
1970 | return -ENOMEM; | ||
1971 | |||
1972 | ax->dentry = dget(dentry); | ||
1973 | ax->mnt = mntget(mnt); | ||
1974 | |||
1975 | ax->d.type = AUDIT_AVC_PATH; | ||
1976 | ax->d.next = context->aux; | ||
1977 | context->aux = (void *)ax; | ||
1978 | return 0; | ||
1979 | } | ||
1980 | |||
1981 | /** | ||
1982 | * audit_signal_info - record signal info for shutting down audit subsystem | 1979 | * audit_signal_info - record signal info for shutting down audit subsystem |
1983 | * @sig: signal value | 1980 | * @sig: signal value |
1984 | * @t: task being signaled | 1981 | * @t: task being signaled |
@@ -1995,19 +1992,19 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
1995 | extern uid_t audit_sig_uid; | 1992 | extern uid_t audit_sig_uid; |
1996 | extern u32 audit_sig_sid; | 1993 | extern u32 audit_sig_sid; |
1997 | 1994 | ||
1998 | if (audit_pid && t->tgid == audit_pid && | 1995 | if (audit_pid && t->tgid == audit_pid) { |
1999 | (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) { | 1996 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { |
2000 | audit_sig_pid = tsk->pid; | 1997 | audit_sig_pid = tsk->pid; |
2001 | if (ctx) | 1998 | if (ctx) |
2002 | audit_sig_uid = ctx->loginuid; | 1999 | audit_sig_uid = ctx->loginuid; |
2003 | else | 2000 | else |
2004 | audit_sig_uid = tsk->uid; | 2001 | audit_sig_uid = tsk->uid; |
2005 | selinux_get_task_sid(tsk, &audit_sig_sid); | 2002 | selinux_get_task_sid(tsk, &audit_sig_sid); |
2003 | } | ||
2004 | if (!audit_signals || audit_dummy_context()) | ||
2005 | return 0; | ||
2006 | } | 2006 | } |
2007 | 2007 | ||
2008 | if (!audit_signals) /* audit_context checked in wrapper */ | ||
2009 | return 0; | ||
2010 | |||
2011 | /* optimize the common case by putting first signal recipient directly | 2008 | /* optimize the common case by putting first signal recipient directly |
2012 | * in audit_context */ | 2009 | * in audit_context */ |
2013 | if (!ctx->target_pid) { | 2010 | if (!ctx->target_pid) { |
@@ -2026,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2026 | axp->d.next = ctx->aux_pids; | 2023 | axp->d.next = ctx->aux_pids; |
2027 | ctx->aux_pids = (void *)axp; | 2024 | ctx->aux_pids = (void *)axp; |
2028 | } | 2025 | } |
2029 | BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); | 2026 | BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); |
2030 | 2027 | ||
2031 | axp->target_pid[axp->pid_count] = t->tgid; | 2028 | axp->target_pid[axp->pid_count] = t->tgid; |
2032 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); | 2029 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 181ae7086029..38033db8d8ec 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
273 | return err; | 273 | return err; |
274 | } | 274 | } |
275 | 275 | ||
276 | #ifdef CONFIG_SUSPEND_SMP | 276 | #ifdef CONFIG_PM_SLEEP_SMP |
277 | static cpumask_t frozen_cpus; | 277 | static cpumask_t frozen_cpus; |
278 | 278 | ||
279 | int disable_nonboot_cpus(void) | 279 | int disable_nonboot_cpus(void) |
@@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) | |||
334 | out: | 334 | out: |
335 | mutex_unlock(&cpu_add_remove_lock); | 335 | mutex_unlock(&cpu_add_remove_lock); |
336 | } | 336 | } |
337 | #endif | 337 | #endif /* CONFIG_PM_SLEEP_SMP */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b4796d850140..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) | |||
516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 516 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
517 | envp[i] = NULL; | 517 | envp[i] = NULL; |
518 | 518 | ||
519 | call_usermodehelper(argv[0], argv, envp, 0); | 519 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
520 | kfree(pathbuf); | 520 | kfree(pathbuf); |
521 | } | 521 | } |
522 | 522 | ||
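The cpuset hunk above switches to the symbolic wait mode introduced by the kmod.c changes later in this diff. A hedged sketch of a caller using the same UMH_WAIT_EXEC mode; the demo_* name, helper path handling and environment values are illustrative, not part of the patch:

#include <linux/kmod.h>

/* Run a helper and wait only until exec() has happened, as
 * cpuset_release_agent() now does with UMH_WAIT_EXEC. */
static int demo_run_agent(char *agent_path)
{
	char *argv[] = { agent_path, NULL };
	char *envp[] = { "HOME=/",
			 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}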
diff --git a/kernel/exit.c b/kernel/exit.c
index e8af8d0c2483..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/pid_namespace.h> | 24 | #include <linux/pid_namespace.h> |
25 | #include <linux/ptrace.h> | 25 | #include <linux/ptrace.h> |
26 | #include <linux/profile.h> | 26 | #include <linux/profile.h> |
27 | #include <linux/signalfd.h> | ||
28 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
29 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
30 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
@@ -45,6 +44,7 @@ | |||
45 | #include <linux/resource.h> | 44 | #include <linux/resource.h> |
46 | #include <linux/blkdev.h> | 45 | #include <linux/blkdev.h> |
47 | #include <linux/task_io_accounting_ops.h> | 46 | #include <linux/task_io_accounting_ops.h> |
47 | #include <linux/freezer.h> | ||
48 | 48 | ||
49 | #include <asm/uaccess.h> | 49 | #include <asm/uaccess.h> |
50 | #include <asm/unistd.h> | 50 | #include <asm/unistd.h> |
@@ -85,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk) | |||
85 | sighand = rcu_dereference(tsk->sighand); | 85 | sighand = rcu_dereference(tsk->sighand); |
86 | spin_lock(&sighand->siglock); | 86 | spin_lock(&sighand->siglock); |
87 | 87 | ||
88 | /* | ||
89 | * Notify that this sighand has been detached. This must | ||
90 | * be called with the tsk->sighand lock held. Also, this | ||
91 | * access tsk->sighand internally, so it must be called | ||
92 | * before tsk->sighand is reset. | ||
93 | */ | ||
94 | signalfd_detach_locked(tsk); | ||
95 | |||
96 | posix_cpu_timers_exit(tsk); | 88 | posix_cpu_timers_exit(tsk); |
97 | if (atomic_dec_and_test(&sig->count)) | 89 | if (atomic_dec_and_test(&sig->count)) |
98 | posix_cpu_timers_exit_group(tsk); | 90 | posix_cpu_timers_exit_group(tsk); |
@@ -594,6 +586,8 @@ static void exit_mm(struct task_struct * tsk) | |||
594 | tsk->mm = NULL; | 586 | tsk->mm = NULL; |
595 | up_read(&mm->mmap_sem); | 587 | up_read(&mm->mmap_sem); |
596 | enter_lazy_tlb(mm, current); | 588 | enter_lazy_tlb(mm, current); |
589 | /* We don't want this task to be frozen prematurely */ | ||
590 | clear_freeze_flag(tsk); | ||
597 | task_unlock(tsk); | 591 | task_unlock(tsk); |
598 | mmput(mm); | 592 | mmput(mm); |
599 | } | 593 | } |
@@ -810,7 +804,7 @@ static void exit_notify(struct task_struct *tsk) | |||
810 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 804 | __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
811 | } | 805 | } |
812 | 806 | ||
813 | /* Let father know we died | 807 | /* Let father know we died |
814 | * | 808 | * |
815 | * Thread signals are configurable, but you aren't going to use | 809 | * Thread signals are configurable, but you aren't going to use |
816 | * that to send signals to arbitary processes. | 810 | * that to send signals to arbitary processes. |
@@ -823,9 +817,7 @@ static void exit_notify(struct task_struct *tsk) | |||
823 | * If our self_exec id doesn't match our parent_exec_id then | 817 | * If our self_exec id doesn't match our parent_exec_id then |
824 | * we have changed execution domain as these two values started | 818 | * we have changed execution domain as these two values started |
825 | * the same after a fork. | 819 | * the same after a fork. |
826 | * | ||
827 | */ | 820 | */ |
828 | |||
829 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && | 821 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && |
830 | ( tsk->parent_exec_id != t->self_exec_id || | 822 | ( tsk->parent_exec_id != t->self_exec_id || |
831 | tsk->self_exec_id != tsk->parent_exec_id) | 823 | tsk->self_exec_id != tsk->parent_exec_id) |
@@ -845,9 +837,7 @@ static void exit_notify(struct task_struct *tsk) | |||
845 | } | 837 | } |
846 | 838 | ||
847 | state = EXIT_ZOMBIE; | 839 | state = EXIT_ZOMBIE; |
848 | if (tsk->exit_signal == -1 && | 840 | if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) |
849 | (likely(tsk->ptrace == 0) || | ||
850 | unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) | ||
851 | state = EXIT_DEAD; | 841 | state = EXIT_DEAD; |
852 | tsk->exit_state = state; | 842 | tsk->exit_state = state; |
853 | 843 | ||
@@ -976,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code) | |||
976 | if (unlikely(tsk->audit_context)) | 966 | if (unlikely(tsk->audit_context)) |
977 | audit_free(tsk); | 967 | audit_free(tsk); |
978 | 968 | ||
969 | tsk->exit_code = code; | ||
979 | taskstats_exit(tsk, group_dead); | 970 | taskstats_exit(tsk, group_dead); |
980 | 971 | ||
981 | exit_mm(tsk); | 972 | exit_mm(tsk); |
@@ -997,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code) | |||
997 | if (tsk->binfmt) | 988 | if (tsk->binfmt) |
998 | module_put(tsk->binfmt->module); | 989 | module_put(tsk->binfmt->module); |
999 | 990 | ||
1000 | tsk->exit_code = code; | ||
1001 | proc_exit_connector(tsk); | 991 | proc_exit_connector(tsk); |
1002 | exit_task_namespaces(tsk); | 992 | exit_task_namespaces(tsk); |
1003 | exit_notify(tsk); | 993 | exit_notify(tsk); |
diff --git a/kernel/fork.c b/kernel/fork.c
index ba39bdb2a7b8..5e67f90a1694 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages) | |||
137 | /* create a slab on which task_structs can be allocated */ | 137 | /* create a slab on which task_structs can be allocated */ |
138 | task_struct_cachep = | 138 | task_struct_cachep = |
139 | kmem_cache_create("task_struct", sizeof(struct task_struct), | 139 | kmem_cache_create("task_struct", sizeof(struct task_struct), |
140 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); | 140 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); |
141 | #endif | 141 | #endif |
142 | 142 | ||
143 | /* | 143 | /* |
@@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
334 | atomic_set(&mm->mm_count, 1); | 334 | atomic_set(&mm->mm_count, 1); |
335 | init_rwsem(&mm->mmap_sem); | 335 | init_rwsem(&mm->mmap_sem); |
336 | INIT_LIST_HEAD(&mm->mmlist); | 336 | INIT_LIST_HEAD(&mm->mmlist); |
337 | mm->flags = (current->mm) ? current->mm->flags | ||
338 | : MMF_DUMP_FILTER_DEFAULT; | ||
337 | mm->core_waiters = 0; | 339 | mm->core_waiters = 0; |
338 | mm->nr_ptes = 0; | 340 | mm->nr_ptes = 0; |
339 | set_mm_counter(mm, file_rss, 0); | 341 | set_mm_counter(mm, file_rss, 0); |
@@ -1436,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, | |||
1436 | struct sighand_struct *sighand = data; | 1438 | struct sighand_struct *sighand = data; |
1437 | 1439 | ||
1438 | spin_lock_init(&sighand->siglock); | 1440 | spin_lock_init(&sighand->siglock); |
1439 | INIT_LIST_HEAD(&sighand->signalfd_list); | 1441 | init_waitqueue_head(&sighand->signalfd_wqh); |
1440 | } | 1442 | } |
1441 | 1443 | ||
1442 | void __init proc_caches_init(void) | 1444 | void __init proc_caches_init(void) |
@@ -1444,22 +1446,22 @@ void __init proc_caches_init(void) | |||
1444 | sighand_cachep = kmem_cache_create("sighand_cache", | 1446 | sighand_cachep = kmem_cache_create("sighand_cache", |
1445 | sizeof(struct sighand_struct), 0, | 1447 | sizeof(struct sighand_struct), 0, |
1446 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, | 1448 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, |
1447 | sighand_ctor, NULL); | 1449 | sighand_ctor); |
1448 | signal_cachep = kmem_cache_create("signal_cache", | 1450 | signal_cachep = kmem_cache_create("signal_cache", |
1449 | sizeof(struct signal_struct), 0, | 1451 | sizeof(struct signal_struct), 0, |
1450 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1452 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1451 | files_cachep = kmem_cache_create("files_cache", | 1453 | files_cachep = kmem_cache_create("files_cache", |
1452 | sizeof(struct files_struct), 0, | 1454 | sizeof(struct files_struct), 0, |
1453 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1455 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1454 | fs_cachep = kmem_cache_create("fs_cache", | 1456 | fs_cachep = kmem_cache_create("fs_cache", |
1455 | sizeof(struct fs_struct), 0, | 1457 | sizeof(struct fs_struct), 0, |
1456 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1458 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1457 | vm_area_cachep = kmem_cache_create("vm_area_struct", | 1459 | vm_area_cachep = kmem_cache_create("vm_area_struct", |
1458 | sizeof(struct vm_area_struct), 0, | 1460 | sizeof(struct vm_area_struct), 0, |
1459 | SLAB_PANIC, NULL, NULL); | 1461 | SLAB_PANIC, NULL); |
1460 | mm_cachep = kmem_cache_create("mm_struct", | 1462 | mm_cachep = kmem_cache_create("mm_struct", |
1461 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1463 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1462 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1464 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
1463 | } | 1465 | } |
1464 | 1466 | ||
1465 | /* | 1467 | /* |
@@ -1606,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1606 | err = -EINVAL; | 1608 | err = -EINVAL; |
1607 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | 1609 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
1608 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | 1610 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
1609 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) | 1611 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| |
1612 | CLONE_NEWNET)) | ||
1610 | goto bad_unshare_out; | 1613 | goto bad_unshare_out; |
1611 | 1614 | ||
1612 | if ((err = unshare_thread(unshare_flags))) | 1615 | if ((err = unshare_thread(unshare_flags))) |
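The fork_init() and proc_caches_init() hunks above all drop the trailing destructor argument that kmem_cache_create() no longer takes. A minimal sketch of the post-merge five-argument form; the cache name and demo_record structure are hypothetical:

#include <linux/slab.h>

struct demo_record {
	unsigned long id;
	char name[32];
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	/* Post-merge signature: name, size, align, flags, ctor (no dtor). */
	demo_cachep = kmem_cache_create("demo_record",
					sizeof(struct demo_record), 0,
					SLAB_HWCACHE_ALIGN, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}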
diff --git a/kernel/futex.c b/kernel/futex.c
index 5c3f45d07c53..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address, | |||
346 | vma = find_vma(mm, address); | 346 | vma = find_vma(mm, address); |
347 | if (vma && address >= vma->vm_start && | 347 | if (vma && address >= vma->vm_start && |
348 | (vma->vm_flags & VM_WRITE)) { | 348 | (vma->vm_flags & VM_WRITE)) { |
349 | switch (handle_mm_fault(mm, vma, address, 1)) { | 349 | int fault; |
350 | case VM_FAULT_MINOR: | 350 | fault = handle_mm_fault(mm, vma, address, 1); |
351 | ret = 0; | 351 | if (unlikely((fault & VM_FAULT_ERROR))) { |
352 | current->min_flt++; | 352 | #if 0 |
353 | break; | 353 | /* XXX: let's do this when we verify it is OK */ |
354 | case VM_FAULT_MAJOR: | 354 | if (ret & VM_FAULT_OOM) |
355 | ret = -ENOMEM; | ||
356 | #endif | ||
357 | } else { | ||
355 | ret = 0; | 358 | ret = 0; |
356 | current->maj_flt++; | 359 | if (fault & VM_FAULT_MAJOR) |
357 | break; | 360 | current->maj_flt++; |
361 | else | ||
362 | current->min_flt++; | ||
358 | } | 363 | } |
359 | } | 364 | } |
360 | if (!fshared) | 365 | if (!fshared) |
@@ -1665,6 +1670,7 @@ pi_faulted: | |||
1665 | attempt); | 1670 | attempt); |
1666 | if (ret) | 1671 | if (ret) |
1667 | goto out; | 1672 | goto out; |
1673 | uval = 0; | ||
1668 | goto retry_unlocked; | 1674 | goto retry_unlocked; |
1669 | } | 1675 | } |
1670 | 1676 | ||
@@ -1937,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, | |||
1937 | void exit_robust_list(struct task_struct *curr) | 1943 | void exit_robust_list(struct task_struct *curr) |
1938 | { | 1944 | { |
1939 | struct robust_list_head __user *head = curr->robust_list; | 1945 | struct robust_list_head __user *head = curr->robust_list; |
1940 | struct robust_list __user *entry, *pending; | 1946 | struct robust_list __user *entry, *next_entry, *pending; |
1941 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; | 1947 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; |
1942 | unsigned long futex_offset; | 1948 | unsigned long futex_offset; |
1949 | int rc; | ||
1943 | 1950 | ||
1944 | /* | 1951 | /* |
1945 | * Fetch the list head (which was registered earlier, via | 1952 | * Fetch the list head (which was registered earlier, via |
@@ -1959,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr) | |||
1959 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) | 1966 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) |
1960 | return; | 1967 | return; |
1961 | 1968 | ||
1962 | if (pending) | 1969 | next_entry = NULL; /* avoid warning with gcc */ |
1963 | handle_futex_death((void __user *)pending + futex_offset, | ||
1964 | curr, pip); | ||
1965 | |||
1966 | while (entry != &head->list) { | 1970 | while (entry != &head->list) { |
1967 | /* | 1971 | /* |
1972 | * Fetch the next entry in the list before calling | ||
1973 | * handle_futex_death: | ||
1974 | */ | ||
1975 | rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); | ||
1976 | /* | ||
1968 | * A pending lock might already be on the list, so | 1977 | * A pending lock might already be on the list, so |
1969 | * don't process it twice: | 1978 | * don't process it twice: |
1970 | */ | 1979 | */ |
@@ -1972,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr) | |||
1972 | if (handle_futex_death((void __user *)entry + futex_offset, | 1981 | if (handle_futex_death((void __user *)entry + futex_offset, |
1973 | curr, pi)) | 1982 | curr, pi)) |
1974 | return; | 1983 | return; |
1975 | /* | 1984 | if (rc) |
1976 | * Fetch the next entry in the list: | ||
1977 | */ | ||
1978 | if (fetch_robust_entry(&entry, &entry->next, &pi)) | ||
1979 | return; | 1985 | return; |
1986 | entry = next_entry; | ||
1987 | pi = next_pi; | ||
1980 | /* | 1988 | /* |
1981 | * Avoid excessively long or circular lists: | 1989 | * Avoid excessively long or circular lists: |
1982 | */ | 1990 | */ |
@@ -1985,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr) | |||
1985 | 1993 | ||
1986 | cond_resched(); | 1994 | cond_resched(); |
1987 | } | 1995 | } |
1996 | |||
1997 | if (pending) | ||
1998 | handle_futex_death((void __user *)pending + futex_offset, | ||
1999 | curr, pip); | ||
1988 | } | 2000 | } |
1989 | 2001 | ||
1990 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2002 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
@@ -2055,8 +2067,10 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, | |||
2055 | } | 2067 | } |
2056 | /* | 2068 | /* |
2057 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. | 2069 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. |
2070 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | ||
2058 | */ | 2071 | */ |
2059 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) | 2072 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
2073 | cmd == FUTEX_WAKE_OP) | ||
2060 | val2 = (u32) (unsigned long) utime; | 2074 | val2 = (u32) (unsigned long) utime; |
2061 | 2075 | ||
2062 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); | 2076 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); |
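The exit_robust_list() rework above fetches the next list entry before calling handle_futex_death() on the current one, and only handles the pending entry after the walk, because handling a death can unlink or recycle the entry being processed. A generic userspace sketch of that fetch-next-first pattern (not the futex code itself):

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int value;
};

/* Grab 'next' before the step that may invalidate the current node. */
static void walk_and_release(struct node *head)
{
	struct node *entry = head, *next_entry;

	while (entry) {
		next_entry = entry->next;	/* fetch first */
		printf("value=%d\n", entry->value);
		free(entry);			/* invalidates entry->next */
		entry = next_entry;
	}
}

int main(void)
{
	struct node *a = calloc(1, sizeof(*a));
	struct node *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->value = 1;
	a->next = b;
	b->value = 2;
	walk_and_release(a);
	return 0;
}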
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efad..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | |||
38 | void compat_exit_robust_list(struct task_struct *curr) | 38 | void compat_exit_robust_list(struct task_struct *curr) |
39 | { | 39 | { |
40 | struct compat_robust_list_head __user *head = curr->compat_robust_list; | 40 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
41 | struct robust_list __user *entry, *pending; | 41 | struct robust_list __user *entry, *next_entry, *pending; |
42 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; | 42 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; |
43 | compat_uptr_t uentry, upending; | 43 | compat_uptr_t uentry, next_uentry, upending; |
44 | compat_long_t futex_offset; | 44 | compat_long_t futex_offset; |
45 | int rc; | ||
45 | 46 | ||
46 | /* | 47 | /* |
47 | * Fetch the list head (which was registered earlier, via | 48 | * Fetch the list head (which was registered earlier, via |
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
61 | if (fetch_robust_entry(&upending, &pending, | 62 | if (fetch_robust_entry(&upending, &pending, |
62 | &head->list_op_pending, &pip)) | 63 | &head->list_op_pending, &pip)) |
63 | return; | 64 | return; |
64 | if (upending) | ||
65 | handle_futex_death((void __user *)pending + futex_offset, curr, pip); | ||
66 | 65 | ||
67 | while (compat_ptr(uentry) != &head->list) { | 66 | next_entry = NULL; /* avoid warning with gcc */ |
67 | while (entry != (struct robust_list __user *) &head->list) { | ||
68 | /* | ||
69 | * Fetch the next entry in the list before calling | ||
70 | * handle_futex_death: | ||
71 | */ | ||
72 | rc = fetch_robust_entry(&next_uentry, &next_entry, | ||
73 | (compat_uptr_t __user *)&entry->next, &next_pi); | ||
68 | /* | 74 | /* |
69 | * A pending lock might already be on the list, so | 75 | * A pending lock might already be on the list, so |
70 | * dont process it twice: | 76 | * dont process it twice: |
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
74 | curr, pi)) | 80 | curr, pi)) |
75 | return; | 81 | return; |
76 | 82 | ||
77 | /* | 83 | if (rc) |
78 | * Fetch the next entry in the list: | ||
79 | */ | ||
80 | if (fetch_robust_entry(&uentry, &entry, | ||
81 | (compat_uptr_t __user *)&entry->next, &pi)) | ||
82 | return; | 84 | return; |
85 | uentry = next_uentry; | ||
86 | entry = next_entry; | ||
87 | pi = next_pi; | ||
83 | /* | 88 | /* |
84 | * Avoid excessively long or circular lists: | 89 | * Avoid excessively long or circular lists: |
85 | */ | 90 | */ |
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
88 | 93 | ||
89 | cond_resched(); | 94 | cond_resched(); |
90 | } | 95 | } |
96 | if (pending) | ||
97 | handle_futex_death((void __user *)pending + futex_offset, | ||
98 | curr, pip); | ||
91 | } | 99 | } |
92 | 100 | ||
93 | asmlinkage long | 101 | asmlinkage long |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72d034258ba1..dc8a4451d79b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -141,11 +141,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
141 | 141 | ||
142 | do { | 142 | do { |
143 | seq = read_seqbegin(&xtime_lock); | 143 | seq = read_seqbegin(&xtime_lock); |
144 | #ifdef CONFIG_NO_HZ | 144 | xts = current_kernel_time(); |
145 | getnstimeofday(&xts); | ||
146 | #else | ||
147 | xts = xtime; | ||
148 | #endif | ||
149 | tom = wall_to_monotonic; | 145 | tom = wall_to_monotonic; |
150 | } while (read_seqretry(&xtime_lock, seq)); | 146 | } while (read_seqretry(&xtime_lock, seq)); |
151 | 147 | ||
@@ -281,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | |||
281 | } | 277 | } |
282 | 278 | ||
283 | EXPORT_SYMBOL_GPL(ktime_add_ns); | 279 | EXPORT_SYMBOL_GPL(ktime_add_ns); |
280 | |||
281 | /** | ||
282 | * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable | ||
283 | * @kt: minuend | ||
284 | * @nsec: the scalar nsec value to subtract | ||
285 | * | ||
286 | * Returns the subtraction of @nsec from @kt in ktime_t format | ||
287 | */ | ||
288 | ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) | ||
289 | { | ||
290 | ktime_t tmp; | ||
291 | |||
292 | if (likely(nsec < NSEC_PER_SEC)) { | ||
293 | tmp.tv64 = nsec; | ||
294 | } else { | ||
295 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
296 | |||
297 | tmp = ktime_set((long)nsec, rem); | ||
298 | } | ||
299 | |||
300 | return ktime_sub(kt, tmp); | ||
301 | } | ||
302 | |||
303 | EXPORT_SYMBOL_GPL(ktime_sub_ns); | ||
284 | # endif /* !CONFIG_KTIME_SCALAR */ | 304 | # endif /* !CONFIG_KTIME_SCALAR */ |
285 | 305 | ||
286 | /* | 306 | /* |
@@ -558,7 +578,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
558 | */ | 578 | */ |
559 | static int hrtimer_switch_to_hres(void) | 579 | static int hrtimer_switch_to_hres(void) |
560 | { | 580 | { |
561 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 581 | int cpu = smp_processor_id(); |
582 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); | ||
562 | unsigned long flags; | 583 | unsigned long flags; |
563 | 584 | ||
564 | if (base->hres_active) | 585 | if (base->hres_active) |
@@ -568,6 +589,8 @@ static int hrtimer_switch_to_hres(void) | |||
568 | 589 | ||
569 | if (tick_init_highres()) { | 590 | if (tick_init_highres()) { |
570 | local_irq_restore(flags); | 591 | local_irq_restore(flags); |
592 | printk(KERN_WARNING "Could not switch to high resolution " | ||
593 | "mode on CPU %d\n", cpu); | ||
571 | return 0; | 594 | return 0; |
572 | } | 595 | } |
573 | base->hres_active = 1; | 596 | base->hres_active = 1; |
@@ -683,6 +706,7 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
683 | struct rb_node **link = &base->active.rb_node; | 706 | struct rb_node **link = &base->active.rb_node; |
684 | struct rb_node *parent = NULL; | 707 | struct rb_node *parent = NULL; |
685 | struct hrtimer *entry; | 708 | struct hrtimer *entry; |
709 | int leftmost = 1; | ||
686 | 710 | ||
687 | /* | 711 | /* |
688 | * Find the right place in the rbtree: | 712 | * Find the right place in the rbtree: |
@@ -694,18 +718,19 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
694 | * We dont care about collisions. Nodes with | 718 | * We dont care about collisions. Nodes with |
695 | * the same expiry time stay together. | 719 | * the same expiry time stay together. |
696 | */ | 720 | */ |
697 | if (timer->expires.tv64 < entry->expires.tv64) | 721 | if (timer->expires.tv64 < entry->expires.tv64) { |
698 | link = &(*link)->rb_left; | 722 | link = &(*link)->rb_left; |
699 | else | 723 | } else { |
700 | link = &(*link)->rb_right; | 724 | link = &(*link)->rb_right; |
725 | leftmost = 0; | ||
726 | } | ||
701 | } | 727 | } |
702 | 728 | ||
703 | /* | 729 | /* |
704 | * Insert the timer to the rbtree and check whether it | 730 | * Insert the timer to the rbtree and check whether it |
705 | * replaces the first pending timer | 731 | * replaces the first pending timer |
706 | */ | 732 | */ |
707 | if (!base->first || timer->expires.tv64 < | 733 | if (leftmost) { |
708 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) { | ||
709 | /* | 734 | /* |
710 | * Reprogram the clock event device. When the timer is already | 735 | * Reprogram the clock event device. When the timer is already |
711 | * expired hrtimer_enqueue_reprogram has either called the | 736 | * expired hrtimer_enqueue_reprogram has either called the |
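The new ktime_sub_ns() added above is the subtraction counterpart of the existing ktime_add_ns(). A trivial hedged usage sketch; the demo_* name and the notion of a slack value are illustrative only:

#include <linux/ktime.h>

/* Pull an expiry time earlier by a scalar nanosecond delta. */
static ktime_t demo_apply_slack(ktime_t expires, u64 slack_ns)
{
	return ktime_sub_ns(expires, slack_ns);
}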
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 615ce97c6cfd..f1a73f0b54e7 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -352,13 +352,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
352 | * keep it masked and get out of here | 352 | * keep it masked and get out of here |
353 | */ | 353 | */ |
354 | action = desc->action; | 354 | action = desc->action; |
355 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 355 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) |
356 | desc->status |= IRQ_PENDING; | ||
357 | goto out_unlock; | 356 | goto out_unlock; |
358 | } | ||
359 | 357 | ||
360 | desc->status |= IRQ_INPROGRESS; | 358 | desc->status |= IRQ_INPROGRESS; |
361 | desc->status &= ~IRQ_PENDING; | ||
362 | spin_unlock(&desc->lock); | 359 | spin_unlock(&desc->lock); |
363 | 360 | ||
364 | action_ret = handle_IRQ_event(irq, action); | 361 | action_ret = handle_IRQ_event(irq, action); |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d8ee241115f5..6d9204f3a370 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,5 +1,6 @@ | |||
1 | #include <linux/module.h> | 1 | #include <linux/module.h> |
2 | #include <linux/interrupt.h> | 2 | #include <linux/interrupt.h> |
3 | #include <linux/device.h> | ||
3 | 4 | ||
4 | /* | 5 | /* |
5 | * Device resource management aware IRQ request/free implementation. | 6 | * Device resource management aware IRQ request/free implementation. |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 203a518b6f14..7230d914eaa2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
462 | * We do this after actually deregistering it, to make sure that | 462 | * We do this after actually deregistering it, to make sure that |
463 | * a 'real' IRQ doesn't run in parallel with our fake | 463 | * a 'real' IRQ doesn't run in parallel with our fake |
464 | */ | 464 | */ |
465 | local_irq_save(flags); | ||
465 | handler(irq, dev_id); | 466 | handler(irq, dev_id); |
467 | local_irq_restore(flags); | ||
466 | } | 468 | } |
467 | #endif | 469 | #endif |
468 | } | 470 | } |
@@ -545,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, | |||
545 | * We do this before actually registering it, to make sure that | 547 | * We do this before actually registering it, to make sure that |
546 | * a 'real' IRQ doesn't run in parallel with our fake | 548 | * a 'real' IRQ doesn't run in parallel with our fake |
547 | */ | 549 | */ |
548 | if (irqflags & IRQF_DISABLED) { | 550 | unsigned long flags; |
549 | unsigned long flags; | ||
550 | 551 | ||
551 | local_irq_save(flags); | 552 | local_irq_save(flags); |
552 | handler(irq, dev_id); | 553 | handler(irq, dev_id); |
553 | local_irq_restore(flags); | 554 | local_irq_restore(flags); |
554 | } else | ||
555 | handler(irq, dev_id); | ||
556 | } | 555 | } |
557 | #endif | 556 | #endif |
558 | 557 | ||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index b4f1674fca79..50b81b98046a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir; | |||
19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
20 | int count, int *eof, void *data) | 20 | int count, int *eof, void *data) |
21 | { | 21 | { |
22 | int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); | 22 | struct irq_desc *desc = irq_desc + (long)data; |
23 | cpumask_t *mask = &desc->affinity; | ||
24 | int len; | ||
25 | |||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
27 | if (desc->status & IRQ_MOVE_PENDING) | ||
28 | mask = &desc->pending_mask; | ||
29 | #endif | ||
30 | len = cpumask_scnprintf(page, count, *mask); | ||
23 | 31 | ||
24 | if (count - len < 2) | 32 | if (count - len < 2) |
25 | return -EINVAL; | 33 | return -EINVAL; |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 5bfeaed7e487..a8046791ba2d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -62,7 +62,12 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
62 | */ | 62 | */ |
63 | desc->chip->enable(irq); | 63 | desc->chip->enable(irq); |
64 | 64 | ||
65 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 65 | /* |
66 | * We do not resend level type interrupts. Level type | ||
67 | * interrupts are resent by hardware when they are still | ||
68 | * active. | ||
69 | */ | ||
70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | ||
66 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; |
67 | 72 | ||
68 | if (!desc->chip || !desc->chip->retrigger || | 73 | if (!desc->chip || !desc->chip->retrigger || |
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..c6a4f8aebeba 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -33,6 +33,8 @@ | |||
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/init.h> | 34 | #include <linux/init.h> |
35 | #include <linux/resource.h> | 35 | #include <linux/resource.h> |
36 | #include <linux/notifier.h> | ||
37 | #include <linux/suspend.h> | ||
36 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
37 | 39 | ||
38 | extern int max_threads; | 40 | extern int max_threads; |
@@ -119,9 +121,10 @@ struct subprocess_info { | |||
119 | char **argv; | 121 | char **argv; |
120 | char **envp; | 122 | char **envp; |
121 | struct key *ring; | 123 | struct key *ring; |
122 | int wait; | 124 | enum umh_wait wait; |
123 | int retval; | 125 | int retval; |
124 | struct file *stdin; | 126 | struct file *stdin; |
127 | void (*cleanup)(char **argv, char **envp); | ||
125 | }; | 128 | }; |
126 | 129 | ||
127 | /* | 130 | /* |
@@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data) | |||
180 | do_exit(0); | 183 | do_exit(0); |
181 | } | 184 | } |
182 | 185 | ||
186 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
187 | { | ||
188 | if (info->cleanup) | ||
189 | (*info->cleanup)(info->argv, info->envp); | ||
190 | kfree(info); | ||
191 | } | ||
192 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
193 | |||
183 | /* Keventd can't block, but this (a child) can. */ | 194 | /* Keventd can't block, but this (a child) can. */ |
184 | static int wait_for_helper(void *data) | 195 | static int wait_for_helper(void *data) |
185 | { | 196 | { |
@@ -216,8 +227,8 @@ static int wait_for_helper(void *data) | |||
216 | sub_info->retval = ret; | 227 | sub_info->retval = ret; |
217 | } | 228 | } |
218 | 229 | ||
219 | if (sub_info->wait < 0) | 230 | if (sub_info->wait == UMH_NO_WAIT) |
220 | kfree(sub_info); | 231 | call_usermodehelper_freeinfo(sub_info); |
221 | else | 232 | else |
222 | complete(sub_info->complete); | 233 | complete(sub_info->complete); |
223 | return 0; | 234 | return 0; |
@@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work) | |||
229 | struct subprocess_info *sub_info = | 240 | struct subprocess_info *sub_info = |
230 | container_of(work, struct subprocess_info, work); | 241 | container_of(work, struct subprocess_info, work); |
231 | pid_t pid; | 242 | pid_t pid; |
232 | int wait = sub_info->wait; | 243 | enum umh_wait wait = sub_info->wait; |
233 | 244 | ||
234 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 245 | /* CLONE_VFORK: wait until the usermode helper has execve'd |
235 | * successfully We need the data structures to stay around | 246 | * successfully We need the data structures to stay around |
236 | * until that is done. */ | 247 | * until that is done. */ |
237 | if (wait) | 248 | if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) |
238 | pid = kernel_thread(wait_for_helper, sub_info, | 249 | pid = kernel_thread(wait_for_helper, sub_info, |
239 | CLONE_FS | CLONE_FILES | SIGCHLD); | 250 | CLONE_FS | CLONE_FILES | SIGCHLD); |
240 | else | 251 | else |
241 | pid = kernel_thread(____call_usermodehelper, sub_info, | 252 | pid = kernel_thread(____call_usermodehelper, sub_info, |
242 | CLONE_VFORK | SIGCHLD); | 253 | CLONE_VFORK | SIGCHLD); |
243 | 254 | ||
244 | if (wait < 0) | 255 | switch (wait) { |
245 | return; | 256 | case UMH_NO_WAIT: |
257 | break; | ||
246 | 258 | ||
247 | if (pid < 0) { | 259 | case UMH_WAIT_PROC: |
260 | if (pid > 0) | ||
261 | break; | ||
248 | sub_info->retval = pid; | 262 | sub_info->retval = pid; |
263 | /* FALLTHROUGH */ | ||
264 | |||
265 | case UMH_WAIT_EXEC: | ||
249 | complete(sub_info->complete); | 266 | complete(sub_info->complete); |
250 | } else if (!wait) | 267 | } |
251 | complete(sub_info->complete); | 268 | } |
269 | |||
270 | #ifdef CONFIG_PM | ||
271 | /* | ||
272 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | ||
273 | * (used for preventing user land processes from being created after the user | ||
274 | * land has been frozen during a system-wide hibernation or suspend operation). | ||
275 | */ | ||
276 | static int usermodehelper_disabled; | ||
277 | |||
278 | /* Number of helpers running */ | ||
279 | static atomic_t running_helpers = ATOMIC_INIT(0); | ||
280 | |||
281 | /* | ||
282 | * Wait queue head used by usermodehelper_pm_callback() to wait for all running | ||
283 | * helpers to finish. | ||
284 | */ | ||
285 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); | ||
286 | |||
287 | /* | ||
288 | * Time to wait for running_helpers to become zero before the setting of | ||
289 | * usermodehelper_disabled in usermodehelper_pm_callback() fails | ||
290 | */ | ||
291 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) | ||
292 | |||
293 | static int usermodehelper_pm_callback(struct notifier_block *nfb, | ||
294 | unsigned long action, | ||
295 | void *ignored) | ||
296 | { | ||
297 | long retval; | ||
298 | |||
299 | switch (action) { | ||
300 | case PM_HIBERNATION_PREPARE: | ||
301 | case PM_SUSPEND_PREPARE: | ||
302 | usermodehelper_disabled = 1; | ||
303 | smp_mb(); | ||
304 | /* | ||
305 | * From now on call_usermodehelper_exec() won't start any new | ||
306 | * helpers, so it is sufficient if running_helpers turns out to | ||
307 | * be zero at one point (it may be increased later, but that | ||
308 | * doesn't matter). | ||
309 | */ | ||
310 | retval = wait_event_timeout(running_helpers_waitq, | ||
311 | atomic_read(&running_helpers) == 0, | ||
312 | RUNNING_HELPERS_TIMEOUT); | ||
313 | if (retval) { | ||
314 | return NOTIFY_OK; | ||
315 | } else { | ||
316 | usermodehelper_disabled = 0; | ||
317 | return NOTIFY_BAD; | ||
318 | } | ||
319 | case PM_POST_HIBERNATION: | ||
320 | case PM_POST_SUSPEND: | ||
321 | usermodehelper_disabled = 0; | ||
322 | return NOTIFY_OK; | ||
323 | } | ||
324 | |||
325 | return NOTIFY_DONE; | ||
326 | } | ||
327 | |||
328 | static void helper_lock(void) | ||
329 | { | ||
330 | atomic_inc(&running_helpers); | ||
331 | smp_mb__after_atomic_inc(); | ||
332 | } | ||
333 | |||
334 | static void helper_unlock(void) | ||
335 | { | ||
336 | if (atomic_dec_and_test(&running_helpers)) | ||
337 | wake_up(&running_helpers_waitq); | ||
338 | } | ||
339 | |||
340 | static void register_pm_notifier_callback(void) | ||
341 | { | ||
342 | pm_notifier(usermodehelper_pm_callback, 0); | ||
252 | } | 343 | } |
344 | #else /* CONFIG_PM */ | ||
345 | #define usermodehelper_disabled 0 | ||
346 | |||
347 | static inline void helper_lock(void) {} | ||
348 | static inline void helper_unlock(void) {} | ||
349 | static inline void register_pm_notifier_callback(void) {} | ||
350 | #endif /* CONFIG_PM */ | ||
253 | 351 | ||
254 | /** | 352 | /** |
255 | * call_usermodehelper_keys - start a usermode application | 353 | * call_usermodehelper_setup - prepare to call a usermode helper |
256 | * @path: pathname for the application | 354 | * @path: path to usermode executable |
257 | * @argv: null-terminated argument list | 355 | * @argv: arg vector for process |
258 | * @envp: null-terminated environment list | 356 | * @envp: environment for process |
259 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 357 | * |
358 | * Returns either %NULL on allocation failure, or a subprocess_info | ||
359 | * structure. This should be passed to call_usermodehelper_exec to | ||
360 | * exec the process and free the structure. | ||
361 | */ | ||
362 | struct subprocess_info *call_usermodehelper_setup(char *path, | ||
363 | char **argv, char **envp) | ||
364 | { | ||
365 | struct subprocess_info *sub_info; | ||
366 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | ||
367 | if (!sub_info) | ||
368 | goto out; | ||
369 | |||
370 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
371 | sub_info->path = path; | ||
372 | sub_info->argv = argv; | ||
373 | sub_info->envp = envp; | ||
374 | |||
375 | out: | ||
376 | return sub_info; | ||
377 | } | ||
378 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
379 | |||
380 | /** | ||
381 | * call_usermodehelper_setkeys - set the session keys for usermode helper | ||
382 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
383 | * @session_keyring: the session keyring for the process | ||
384 | */ | ||
385 | void call_usermodehelper_setkeys(struct subprocess_info *info, | ||
386 | struct key *session_keyring) | ||
387 | { | ||
388 | info->ring = session_keyring; | ||
389 | } | ||
390 | EXPORT_SYMBOL(call_usermodehelper_setkeys); | ||
391 | |||
392 | /** | ||
393 | * call_usermodehelper_setcleanup - set a cleanup function | ||
394 | * @info: a subprocess_info returned by call_usermodehelper_setup | ||
395 | * @cleanup: a cleanup function | ||
396 | * | ||
397 | * The cleanup function is just befor ethe subprocess_info is about to | ||
398 | * be freed. This can be used for freeing the argv and envp. The | ||
399 | * Function must be runnable in either a process context or the | ||
400 | * context in which call_usermodehelper_exec is called. | ||
401 | */ | ||
402 | void call_usermodehelper_setcleanup(struct subprocess_info *info, | ||
403 | void (*cleanup)(char **argv, char **envp)) | ||
404 | { | ||
405 | info->cleanup = cleanup; | ||
406 | } | ||
407 | EXPORT_SYMBOL(call_usermodehelper_setcleanup); | ||
408 | |||
409 | /** | ||
410 | * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin | ||
411 | * @sub_info: a subprocess_info returned by call_usermodehelper_setup | ||
412 | * @filp: set to the write-end of a pipe | ||
413 | * | ||
414 | * This constructs a pipe, and sets the read end to be the stdin of the | ||
415 | * subprocess, and returns the write-end in *@filp. | ||
416 | */ | ||
417 | int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, | ||
418 | struct file **filp) | ||
419 | { | ||
420 | struct file *f; | ||
421 | |||
422 | f = create_write_pipe(); | ||
423 | if (IS_ERR(f)) | ||
424 | return PTR_ERR(f); | ||
425 | *filp = f; | ||
426 | |||
427 | f = create_read_pipe(f); | ||
428 | if (IS_ERR(f)) { | ||
429 | free_write_pipe(*filp); | ||
430 | return PTR_ERR(f); | ||
431 | } | ||
432 | sub_info->stdin = f; | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | EXPORT_SYMBOL(call_usermodehelper_stdinpipe); | ||
437 | |||
438 | /** | ||
439 | * call_usermodehelper_exec - start a usermode application | ||
440 | * @sub_info: information about the subprocessa | ||
260 | * @wait: wait for the application to finish and return status. | 441 | * @wait: wait for the application to finish and return status. |
261 | * when -1 don't wait at all, but you get no useful error back when | 442 | * when -1 don't wait at all, but you get no useful error back when |
262 | * the program couldn't be exec'ed. This makes it safe to call | 443 | * the program couldn't be exec'ed. This makes it safe to call |
@@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work) | |||
265 | * Runs a user-space application. The application is started | 446 | * Runs a user-space application. The application is started |
266 | * asynchronously if wait is not set, and runs as a child of keventd. | 447 | * asynchronously if wait is not set, and runs as a child of keventd. |
267 | * (ie. it runs with full root capabilities). | 448 | * (ie. it runs with full root capabilities). |
268 | * | ||
269 | * Must be called from process context. Returns a negative error code | ||
270 | * if program was not execed successfully, or 0. | ||
271 | */ | 449 | */ |
272 | int call_usermodehelper_keys(char *path, char **argv, char **envp, | 450 | int call_usermodehelper_exec(struct subprocess_info *sub_info, |
273 | struct key *session_keyring, int wait) | 451 | enum umh_wait wait) |
274 | { | 452 | { |
275 | DECLARE_COMPLETION_ONSTACK(done); | 453 | DECLARE_COMPLETION_ONSTACK(done); |
276 | struct subprocess_info *sub_info; | ||
277 | int retval; | 454 | int retval; |
278 | 455 | ||
279 | if (!khelper_wq) | 456 | helper_lock(); |
280 | return -EBUSY; | 457 | if (sub_info->path[0] == '\0') { |
281 | 458 | retval = 0; | |
282 | if (path[0] == '\0') | 459 | goto out; |
283 | return 0; | 460 | } |
284 | 461 | ||
285 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); | 462 | if (!khelper_wq || usermodehelper_disabled) { |
286 | if (!sub_info) | 463 | retval = -EBUSY; |
287 | return -ENOMEM; | 464 | goto out; |
465 | } | ||
288 | 466 | ||
289 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
290 | sub_info->complete = &done; | 467 | sub_info->complete = &done; |
291 | sub_info->path = path; | ||
292 | sub_info->argv = argv; | ||
293 | sub_info->envp = envp; | ||
294 | sub_info->ring = session_keyring; | ||
295 | sub_info->wait = wait; | 468 | sub_info->wait = wait; |
296 | 469 | ||
297 | queue_work(khelper_wq, &sub_info->work); | 470 | queue_work(khelper_wq, &sub_info->work); |
298 | if (wait < 0) /* task has freed sub_info */ | 471 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
299 | return 0; | 472 | return 0; |
300 | wait_for_completion(&done); | 473 | wait_for_completion(&done); |
301 | retval = sub_info->retval; | 474 | retval = sub_info->retval; |
302 | kfree(sub_info); | 475 | |
476 | out: | ||
477 | call_usermodehelper_freeinfo(sub_info); | ||
478 | helper_unlock(); | ||
303 | return retval; | 479 | return retval; |
304 | } | 480 | } |
305 | EXPORT_SYMBOL(call_usermodehelper_keys); | 481 | EXPORT_SYMBOL(call_usermodehelper_exec); |
306 | 482 | ||
483 | /** | ||
484 | * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin | ||
485 | * @path: path to usermode executable | ||
486 | * @argv: arg vector for process | ||
487 | * @envp: environment for process | ||
488 | * @filp: set to the write-end of a pipe | ||
489 | * | ||
490 | * This is a simple wrapper which executes a usermode-helper function | ||
491 | * with a pipe as stdin. It is implemented entirely in terms of | ||
492 | * lower-level call_usermodehelper_* functions. | ||
493 | */ | ||
307 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, | 494 | int call_usermodehelper_pipe(char *path, char **argv, char **envp, |
308 | struct file **filp) | 495 | struct file **filp) |
309 | { | 496 | { |
310 | DECLARE_COMPLETION(done); | 497 | struct subprocess_info *sub_info; |
311 | struct subprocess_info sub_info = { | 498 | int ret; |
312 | .work = __WORK_INITIALIZER(sub_info.work, | ||
313 | __call_usermodehelper), | ||
314 | .complete = &done, | ||
315 | .path = path, | ||
316 | .argv = argv, | ||
317 | .envp = envp, | ||
318 | .retval = 0, | ||
319 | }; | ||
320 | struct file *f; | ||
321 | |||
322 | if (!khelper_wq) | ||
323 | return -EBUSY; | ||
324 | 499 | ||
325 | if (path[0] == '\0') | 500 | sub_info = call_usermodehelper_setup(path, argv, envp); |
326 | return 0; | 501 | if (sub_info == NULL) |
502 | return -ENOMEM; | ||
327 | 503 | ||
328 | f = create_write_pipe(); | 504 | ret = call_usermodehelper_stdinpipe(sub_info, filp); |
329 | if (IS_ERR(f)) | 505 | if (ret < 0) |
330 | return PTR_ERR(f); | 506 | goto out; |
331 | *filp = f; | ||
332 | 507 | ||
333 | f = create_read_pipe(f); | 508 | return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); |
334 | if (IS_ERR(f)) { | ||
335 | free_write_pipe(*filp); | ||
336 | return PTR_ERR(f); | ||
337 | } | ||
338 | sub_info.stdin = f; | ||
339 | 509 | ||
340 | queue_work(khelper_wq, &sub_info.work); | 510 | out: |
341 | wait_for_completion(&done); | 511 | call_usermodehelper_freeinfo(sub_info); |
342 | return sub_info.retval; | 512 | return ret; |
343 | } | 513 | } |
344 | EXPORT_SYMBOL(call_usermodehelper_pipe); | 514 | EXPORT_SYMBOL(call_usermodehelper_pipe); |
345 | 515 | ||
@@ -347,4 +517,5 @@ void __init usermodehelper_init(void) | |||
347 | { | 517 | { |
348 | khelper_wq = create_singlethread_workqueue("khelper"); | 518 | khelper_wq = create_singlethread_workqueue("khelper"); |
349 | BUG_ON(!khelper_wq); | 519 | BUG_ON(!khelper_wq); |
520 | register_pm_notifier_callback(); | ||
350 | } | 521 | } |
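The kmod.c hunks above replace the monolithic call_usermodehelper_keys() with a setup/exec pair. A minimal sketch of how a caller might drive the new interface follows; the helper path, the argv/envp values and the UMH_WAIT_PROC wait mode are illustrative assumptions, not part of this diff.

/* Hedged sketch: exercising the split call_usermodehelper_setup()/_exec()
 * API introduced above.  Path and arguments are made up for illustration. */
static int run_example_helper(void)
{
	struct subprocess_info *sub_info;
	char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	sub_info = call_usermodehelper_setup(argv[0], argv, envp);
	if (!sub_info)
		return -ENOMEM;

	/* Optional steps before exec, as defined above:
	 * call_usermodehelper_setkeys(), call_usermodehelper_setcleanup(),
	 * call_usermodehelper_stdinpipe(). */

	/* The exec call always ends up freeing sub_info for the caller,
	 * either via call_usermodehelper_freeinfo() or in the helper thread. */
	return call_usermodehelper_exec(sub_info, UMH_WAIT_PROC);
}

The rewritten call_usermodehelper_pipe() above is exactly this pattern with a call_usermodehelper_stdinpipe() step and UMH_WAIT_EXEC.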
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47d8c493f3..4b8a4493c541 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = { | |||
675 | .priority = 0x7fffffff /* we need to be notified first */ | 675 | .priority = 0x7fffffff /* we need to be notified first */ |
676 | }; | 676 | }; |
677 | 677 | ||
678 | unsigned long __weak arch_deref_entry_point(void *entry) | ||
679 | { | ||
680 | return (unsigned long)entry; | ||
681 | } | ||
678 | 682 | ||
679 | int __kprobes register_jprobe(struct jprobe *jp) | 683 | int __kprobes register_jprobe(struct jprobe *jp) |
680 | { | 684 | { |
685 | unsigned long addr = arch_deref_entry_point(jp->entry); | ||
686 | |||
687 | if (!kernel_text_address(addr)) | ||
688 | return -EINVAL; | ||
689 | |||
681 | /* Todo: Verify probepoint is a function entry point */ | 690 | /* Todo: Verify probepoint is a function entry point */ |
682 | jp->kp.pre_handler = setjmp_pre_handler; | 691 | jp->kp.pre_handler = setjmp_pre_handler; |
683 | jp->kp.break_handler = longjmp_break_handler; | 692 | jp->kp.break_handler = longjmp_break_handler; |
@@ -1054,6 +1063,11 @@ EXPORT_SYMBOL_GPL(register_kprobe); | |||
1054 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 1063 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
1055 | EXPORT_SYMBOL_GPL(register_jprobe); | 1064 | EXPORT_SYMBOL_GPL(register_jprobe); |
1056 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 1065 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
1066 | #ifdef CONFIG_KPROBES | ||
1057 | EXPORT_SYMBOL_GPL(jprobe_return); | 1067 | EXPORT_SYMBOL_GPL(jprobe_return); |
1068 | #endif | ||
1069 | |||
1070 | #ifdef CONFIG_KPROBES | ||
1058 | EXPORT_SYMBOL_GPL(register_kretprobe); | 1071 | EXPORT_SYMBOL_GPL(register_kretprobe); |
1059 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 1072 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
1073 | #endif | ||
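The register_jprobe() change above rejects entry points that do not resolve, via the new arch_deref_entry_point() hook, to a kernel text address. For context, a hedged sketch of a jprobe registration in the style of Documentation/kprobes.txt of this era; the probed symbol (do_fork) and the handler are illustrative, not part of this diff.

#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <linux/kprobes.h>

/* Hedged sketch: the handler mirrors do_fork()'s signature and must end
 * with jprobe_return(). */
static long jexample_do_fork(unsigned long clone_flags,
			     unsigned long stack_start,
			     struct pt_regs *regs,
			     unsigned long stack_size,
			     int __user *parent_tidptr,
			     int __user *child_tidptr)
{
	printk(KERN_INFO "do_fork: clone_flags=0x%lx\n", clone_flags);
	jprobe_return();
	return 0;		/* never reached */
}

static struct jprobe example_jprobe = {
	.entry		= JPROBE_ENTRY(jexample_do_fork),
	.kp.symbol_name	= "do_fork",
};

/* register_jprobe(&example_jprobe) now returns -EINVAL if .entry does not
 * lie in kernel text after arch_deref_entry_point() translation. */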
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 559deca5ed15..d0e5c48e18c7 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) | |||
62 | KERNEL_ATTR_RO(kexec_crash_loaded); | 62 | KERNEL_ATTR_RO(kexec_crash_loaded); |
63 | #endif /* CONFIG_KEXEC */ | 63 | #endif /* CONFIG_KEXEC */ |
64 | 64 | ||
65 | /* | ||
66 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | ||
67 | */ | ||
68 | extern const void __start_notes __attribute__((weak)); | ||
69 | extern const void __stop_notes __attribute__((weak)); | ||
70 | #define notes_size (&__stop_notes - &__start_notes) | ||
71 | |||
72 | static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, | ||
73 | char *buf, loff_t off, size_t count) | ||
74 | { | ||
75 | memcpy(buf, &__start_notes + off, count); | ||
76 | return count; | ||
77 | } | ||
78 | |||
79 | static struct bin_attribute notes_attr = { | ||
80 | .attr = { | ||
81 | .name = "notes", | ||
82 | .mode = S_IRUGO, | ||
83 | }, | ||
84 | .read = ¬es_read, | ||
85 | }; | ||
86 | |||
65 | decl_subsys(kernel, NULL, NULL); | 87 | decl_subsys(kernel, NULL, NULL); |
66 | EXPORT_SYMBOL_GPL(kernel_subsys); | 88 | EXPORT_SYMBOL_GPL(kernel_subsys); |
67 | 89 | ||
@@ -88,6 +110,12 @@ static int __init ksysfs_init(void) | |||
88 | error = sysfs_create_group(&kernel_subsys.kobj, | 110 | error = sysfs_create_group(&kernel_subsys.kobj, |
89 | &kernel_attr_group); | 111 | &kernel_attr_group); |
90 | 112 | ||
113 | if (!error && notes_size > 0) { | ||
114 | notes_attr.size = notes_size; | ||
115 | error = sysfs_create_bin_file(&kernel_subsys.kobj, | ||
116 | ¬es_attr); | ||
117 | } | ||
118 | |||
91 | return error; | 119 | return error; |
92 | } | 120 | } |
93 | 121 | ||
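The ksysfs.c hunks export the raw contents of the kernel's .notes ELF section as /sys/kernel/notes (world-readable, per S_IRUGO). A small, hypothetical userspace reader that hex-dumps it:

/* Hedged userspace sketch: dump the data exported by the notes_attr
 * bin_attribute added above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/notes", "rb");
	unsigned char buf[4096];
	size_t n, i;

	if (!f) {
		perror("/sys/kernel/notes");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		for (i = 0; i < n; i++)
			printf("%02x%c", buf[i], (i % 16 == 15) ? '\n' : ' ');
	putchar('\n');
	fclose(f);
	return 0;
}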
diff --git a/kernel/kthread.c b/kernel/kthread.c index a404f7ee7395..dcfe724300eb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -214,23 +214,15 @@ int kthread_stop(struct task_struct *k) | |||
214 | } | 214 | } |
215 | EXPORT_SYMBOL(kthread_stop); | 215 | EXPORT_SYMBOL(kthread_stop); |
216 | 216 | ||
217 | 217 | int kthreadd(void *unused) | |
218 | static noinline __init_refok void kthreadd_setup(void) | ||
219 | { | 218 | { |
220 | struct task_struct *tsk = current; | 219 | struct task_struct *tsk = current; |
221 | 220 | ||
221 | /* Setup a clean context for our children to inherit. */ | ||
222 | set_task_comm(tsk, "kthreadd"); | 222 | set_task_comm(tsk, "kthreadd"); |
223 | |||
224 | ignore_signals(tsk); | 223 | ignore_signals(tsk); |
225 | |||
226 | set_user_nice(tsk, -5); | 224 | set_user_nice(tsk, -5); |
227 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 225 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
228 | } | ||
229 | |||
230 | int kthreadd(void *unused) | ||
231 | { | ||
232 | /* Setup a clean context for our children to inherit. */ | ||
233 | kthreadd_setup(); | ||
234 | 226 | ||
235 | current->flags |= PF_NOFREEZE; | 227 | current->flags |= PF_NOFREEZE; |
236 | 228 | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index edba2ffb43de..734da579ad13 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -5,7 +5,8 @@ | |||
5 | * | 5 | * |
6 | * Started by Ingo Molnar: | 6 | * Started by Ingo Molnar: |
7 | * | 7 | * |
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 8 | * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
9 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
9 | * | 10 | * |
10 | * this code maps all the lock dependencies as they occur in a live kernel | 11 | * this code maps all the lock dependencies as they occur in a live kernel |
11 | * and will warn about the following classes of locking bugs: | 12 | * and will warn about the following classes of locking bugs: |
@@ -37,11 +38,26 @@ | |||
37 | #include <linux/debug_locks.h> | 38 | #include <linux/debug_locks.h> |
38 | #include <linux/irqflags.h> | 39 | #include <linux/irqflags.h> |
39 | #include <linux/utsname.h> | 40 | #include <linux/utsname.h> |
41 | #include <linux/hash.h> | ||
40 | 42 | ||
41 | #include <asm/sections.h> | 43 | #include <asm/sections.h> |
42 | 44 | ||
43 | #include "lockdep_internals.h" | 45 | #include "lockdep_internals.h" |
44 | 46 | ||
47 | #ifdef CONFIG_PROVE_LOCKING | ||
48 | int prove_locking = 1; | ||
49 | module_param(prove_locking, int, 0644); | ||
50 | #else | ||
51 | #define prove_locking 0 | ||
52 | #endif | ||
53 | |||
54 | #ifdef CONFIG_LOCK_STAT | ||
55 | int lock_stat = 1; | ||
56 | module_param(lock_stat, int, 0644); | ||
57 | #else | ||
58 | #define lock_stat 0 | ||
59 | #endif | ||
60 | |||
45 | /* | 61 | /* |
46 | * lockdep_lock: protects the lockdep graph, the hashes and the | 62 | * lockdep_lock: protects the lockdep graph, the hashes and the |
47 | * class/list/hash allocators. | 63 | * class/list/hash allocators. |
@@ -96,23 +112,6 @@ unsigned long nr_list_entries; | |||
96 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | 112 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; |
97 | 113 | ||
98 | /* | 114 | /* |
99 | * Allocate a lockdep entry. (assumes the graph_lock held, returns | ||
100 | * with NULL on failure) | ||
101 | */ | ||
102 | static struct lock_list *alloc_list_entry(void) | ||
103 | { | ||
104 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
105 | if (!debug_locks_off_graph_unlock()) | ||
106 | return NULL; | ||
107 | |||
108 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
109 | printk("turning off the locking correctness validator.\n"); | ||
110 | return NULL; | ||
111 | } | ||
112 | return list_entries + nr_list_entries++; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * All data structures here are protected by the global debug_lock. | 115 | * All data structures here are protected by the global debug_lock. |
117 | * | 116 | * |
118 | * Mutex key structs only get allocated, once during bootup, and never | 117 | * Mutex key structs only get allocated, once during bootup, and never |
@@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void) | |||
121 | unsigned long nr_lock_classes; | 120 | unsigned long nr_lock_classes; |
122 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | 121 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; |
123 | 122 | ||
123 | #ifdef CONFIG_LOCK_STAT | ||
124 | static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); | ||
125 | |||
126 | static int lock_contention_point(struct lock_class *class, unsigned long ip) | ||
127 | { | ||
128 | int i; | ||
129 | |||
130 | for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { | ||
131 | if (class->contention_point[i] == 0) { | ||
132 | class->contention_point[i] = ip; | ||
133 | break; | ||
134 | } | ||
135 | if (class->contention_point[i] == ip) | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | return i; | ||
140 | } | ||
141 | |||
142 | static void lock_time_inc(struct lock_time *lt, s64 time) | ||
143 | { | ||
144 | if (time > lt->max) | ||
145 | lt->max = time; | ||
146 | |||
147 | if (time < lt->min || !lt->min) | ||
148 | lt->min = time; | ||
149 | |||
150 | lt->total += time; | ||
151 | lt->nr++; | ||
152 | } | ||
153 | |||
154 | static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) | ||
155 | { | ||
156 | dst->min += src->min; | ||
157 | dst->max += src->max; | ||
158 | dst->total += src->total; | ||
159 | dst->nr += src->nr; | ||
160 | } | ||
161 | |||
162 | struct lock_class_stats lock_stats(struct lock_class *class) | ||
163 | { | ||
164 | struct lock_class_stats stats; | ||
165 | int cpu, i; | ||
166 | |||
167 | memset(&stats, 0, sizeof(struct lock_class_stats)); | ||
168 | for_each_possible_cpu(cpu) { | ||
169 | struct lock_class_stats *pcs = | ||
170 | &per_cpu(lock_stats, cpu)[class - lock_classes]; | ||
171 | |||
172 | for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) | ||
173 | stats.contention_point[i] += pcs->contention_point[i]; | ||
174 | |||
175 | lock_time_add(&pcs->read_waittime, &stats.read_waittime); | ||
176 | lock_time_add(&pcs->write_waittime, &stats.write_waittime); | ||
177 | |||
178 | lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); | ||
179 | lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); | ||
180 | |||
181 | for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) | ||
182 | stats.bounces[i] += pcs->bounces[i]; | ||
183 | } | ||
184 | |||
185 | return stats; | ||
186 | } | ||
187 | |||
188 | void clear_lock_stats(struct lock_class *class) | ||
189 | { | ||
190 | int cpu; | ||
191 | |||
192 | for_each_possible_cpu(cpu) { | ||
193 | struct lock_class_stats *cpu_stats = | ||
194 | &per_cpu(lock_stats, cpu)[class - lock_classes]; | ||
195 | |||
196 | memset(cpu_stats, 0, sizeof(struct lock_class_stats)); | ||
197 | } | ||
198 | memset(class->contention_point, 0, sizeof(class->contention_point)); | ||
199 | } | ||
200 | |||
201 | static struct lock_class_stats *get_lock_stats(struct lock_class *class) | ||
202 | { | ||
203 | return &get_cpu_var(lock_stats)[class - lock_classes]; | ||
204 | } | ||
205 | |||
206 | static void put_lock_stats(struct lock_class_stats *stats) | ||
207 | { | ||
208 | put_cpu_var(lock_stats); | ||
209 | } | ||
210 | |||
211 | static void lock_release_holdtime(struct held_lock *hlock) | ||
212 | { | ||
213 | struct lock_class_stats *stats; | ||
214 | s64 holdtime; | ||
215 | |||
216 | if (!lock_stat) | ||
217 | return; | ||
218 | |||
219 | holdtime = sched_clock() - hlock->holdtime_stamp; | ||
220 | |||
221 | stats = get_lock_stats(hlock->class); | ||
222 | if (hlock->read) | ||
223 | lock_time_inc(&stats->read_holdtime, holdtime); | ||
224 | else | ||
225 | lock_time_inc(&stats->write_holdtime, holdtime); | ||
226 | put_lock_stats(stats); | ||
227 | } | ||
228 | #else | ||
229 | static inline void lock_release_holdtime(struct held_lock *hlock) | ||
230 | { | ||
231 | } | ||
232 | #endif | ||
233 | |||
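The CONFIG_LOCK_STAT block above uses a common per-CPU statistics pattern: writers only touch the local CPU's counters under get_cpu_var()/put_cpu_var(), and a reader folds all CPUs together on demand, as lock_stats() does. A generic, hypothetical sketch of that pattern (all example_* names are made up):

#include <linux/percpu.h>
#include <linux/cpumask.h>

/* Hedged sketch of the lockless per-CPU accounting used by lock_stats(). */
struct example_counter {
	unsigned long		nr;
	unsigned long long	total;
};

static DEFINE_PER_CPU(struct example_counter, example_stats);

static void example_account(unsigned long long delta)
{
	/* get_cpu_var() disables preemption, like get_lock_stats() above. */
	struct example_counter *c = &get_cpu_var(example_stats);

	c->nr++;
	c->total += delta;
	put_cpu_var(example_stats);
}

static unsigned long long example_sum(void)
{
	unsigned long long sum = 0;
	int cpu;

	/* Readers walk every possible CPU, as lock_stats() above does. */
	for_each_possible_cpu(cpu)
		sum += per_cpu(example_stats, cpu).total;

	return sum;
}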
124 | /* | 234 | /* |
125 | * We keep a global list of all lock classes. The list only grows, | 235 | * We keep a global list of all lock classes. The list only grows, |
126 | * never shrinks. The list is only accessed with the lockdep | 236 | * never shrinks. The list is only accessed with the lockdep |
@@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes); | |||
133 | */ | 243 | */ |
134 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) | 244 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) |
135 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) | 245 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) |
136 | #define CLASSHASH_MASK (CLASSHASH_SIZE - 1) | 246 | #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) |
137 | #define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) | ||
138 | #define classhashentry(key) (classhash_table + __classhashfn((key))) | 247 | #define classhashentry(key) (classhash_table + __classhashfn((key))) |
139 | 248 | ||
140 | static struct list_head classhash_table[CLASSHASH_SIZE]; | 249 | static struct list_head classhash_table[CLASSHASH_SIZE]; |
141 | 250 | ||
142 | unsigned long nr_lock_chains; | ||
143 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; | ||
144 | |||
145 | /* | 251 | /* |
146 | * We put the lock dependency chains into a hash-table as well, to cache | 252 | * We put the lock dependency chains into a hash-table as well, to cache |
147 | * their existence: | 253 | * their existence: |
148 | */ | 254 | */ |
149 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) | 255 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) |
150 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) | 256 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) |
151 | #define CHAINHASH_MASK (CHAINHASH_SIZE - 1) | 257 | #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) |
152 | #define __chainhashfn(chain) \ | ||
153 | (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) | ||
154 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) | 258 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) |
155 | 259 | ||
156 | static struct list_head chainhash_table[CHAINHASH_SIZE]; | 260 | static struct list_head chainhash_table[CHAINHASH_SIZE]; |
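Both hash tables above now take their bucket index from hash_long() in <linux/hash.h> instead of the removed open-coded fold-and-mask macros. A tiny sketch of what the new __classhashfn() amounts to:

#include <linux/hash.h>

/* Hedged sketch: hash_long() multiplies the key by a golden-ratio constant
 * and keeps the top CLASSHASH_BITS bits, which spreads clustered key
 * addresses better than "(key >> bits) + key, then mask". */
static inline struct list_head *example_class_bucket(void *key)
{
	return classhash_table + hash_long((unsigned long)key, CLASSHASH_BITS);
}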
@@ -223,26 +327,6 @@ static int verbose(struct lock_class *class) | |||
223 | return 0; | 327 | return 0; |
224 | } | 328 | } |
225 | 329 | ||
226 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
227 | |||
228 | static int hardirq_verbose(struct lock_class *class) | ||
229 | { | ||
230 | #if HARDIRQ_VERBOSE | ||
231 | return class_filter(class); | ||
232 | #endif | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static int softirq_verbose(struct lock_class *class) | ||
237 | { | ||
238 | #if SOFTIRQ_VERBOSE | ||
239 | return class_filter(class); | ||
240 | #endif | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | #endif | ||
245 | |||
246 | /* | 330 | /* |
247 | * Stack-trace: tightly packed array of stack backtrace | 331 | * Stack-trace: tightly packed array of stack backtrace |
248 | * addresses. Protected by the graph_lock. | 332 | * addresses. Protected by the graph_lock. |
@@ -291,6 +375,11 @@ unsigned int max_recursion_depth; | |||
291 | * about it later on, in lockdep_info(). | 375 | * about it later on, in lockdep_info(). |
292 | */ | 376 | */ |
293 | static int lockdep_init_error; | 377 | static int lockdep_init_error; |
378 | static unsigned long lockdep_init_trace_data[20]; | ||
379 | static struct stack_trace lockdep_init_trace = { | ||
380 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), | ||
381 | .entries = lockdep_init_trace_data, | ||
382 | }; | ||
294 | 383 | ||
295 | /* | 384 | /* |
296 | * Various lockdep statistics: | 385 | * Various lockdep statistics: |
@@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
482 | } | 571 | } |
483 | } | 572 | } |
484 | 573 | ||
574 | static void print_kernel_version(void) | ||
575 | { | ||
576 | printk("%s %.*s\n", init_utsname()->release, | ||
577 | (int)strcspn(init_utsname()->version, " "), | ||
578 | init_utsname()->version); | ||
579 | } | ||
580 | |||
581 | static int very_verbose(struct lock_class *class) | ||
582 | { | ||
583 | #if VERY_VERBOSE | ||
584 | return class_filter(class); | ||
585 | #endif | ||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Is this the address of a static object: | ||
591 | */ | ||
592 | static int static_obj(void *obj) | ||
593 | { | ||
594 | unsigned long start = (unsigned long) &_stext, | ||
595 | end = (unsigned long) &_end, | ||
596 | addr = (unsigned long) obj; | ||
597 | #ifdef CONFIG_SMP | ||
598 | int i; | ||
599 | #endif | ||
600 | |||
601 | /* | ||
602 | * static variable? | ||
603 | */ | ||
604 | if ((addr >= start) && (addr < end)) | ||
605 | return 1; | ||
606 | |||
607 | #ifdef CONFIG_SMP | ||
608 | /* | ||
609 | * percpu var? | ||
610 | */ | ||
611 | for_each_possible_cpu(i) { | ||
612 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
613 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM | ||
614 | + per_cpu_offset(i); | ||
615 | |||
616 | if ((addr >= start) && (addr < end)) | ||
617 | return 1; | ||
618 | } | ||
619 | #endif | ||
620 | |||
621 | /* | ||
622 | * module var? | ||
623 | */ | ||
624 | return is_module_address(addr); | ||
625 | } | ||
626 | |||
627 | /* | ||
628 | * To make lock name printouts unique, we calculate a unique | ||
629 | * class->name_version generation counter: | ||
630 | */ | ||
631 | static int count_matching_names(struct lock_class *new_class) | ||
632 | { | ||
633 | struct lock_class *class; | ||
634 | int count = 0; | ||
635 | |||
636 | if (!new_class->name) | ||
637 | return 0; | ||
638 | |||
639 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
640 | if (new_class->key - new_class->subclass == class->key) | ||
641 | return class->name_version; | ||
642 | if (class->name && !strcmp(class->name, new_class->name)) | ||
643 | count = max(count, class->name_version); | ||
644 | } | ||
645 | |||
646 | return count + 1; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * Register a lock's class in the hash-table, if the class is not present | ||
651 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
652 | * itself, so actual lookup of the hash should be once per lock object. | ||
653 | */ | ||
654 | static inline struct lock_class * | ||
655 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
656 | { | ||
657 | struct lockdep_subclass_key *key; | ||
658 | struct list_head *hash_head; | ||
659 | struct lock_class *class; | ||
660 | |||
661 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
662 | /* | ||
663 | * If the architecture calls into lockdep before initializing | ||
664 | * the hashes then we'll warn about it later. (we cannot printk | ||
665 | * right now) | ||
666 | */ | ||
667 | if (unlikely(!lockdep_initialized)) { | ||
668 | lockdep_init(); | ||
669 | lockdep_init_error = 1; | ||
670 | save_stack_trace(&lockdep_init_trace); | ||
671 | } | ||
672 | #endif | ||
673 | |||
674 | /* | ||
675 | * Static locks do not have their class-keys yet - for them the key | ||
676 | * is the lock object itself: | ||
677 | */ | ||
678 | if (unlikely(!lock->key)) | ||
679 | lock->key = (void *)lock; | ||
680 | |||
681 | /* | ||
682 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
683 | * lock_class_key variable is passed in through the mutex_init() | ||
684 | * (or spin_lock_init()) call - which acts as the key. For static | ||
685 | * locks we use the lock object itself as the key. | ||
686 | */ | ||
687 | BUILD_BUG_ON(sizeof(struct lock_class_key) > | ||
688 | sizeof(struct lockdep_map)); | ||
689 | |||
690 | key = lock->key->subkeys + subclass; | ||
691 | |||
692 | hash_head = classhashentry(key); | ||
693 | |||
694 | /* | ||
695 | * We can walk the hash lockfree, because the hash only | ||
696 | * grows, and we are careful when adding entries to the end: | ||
697 | */ | ||
698 | list_for_each_entry(class, hash_head, hash_entry) { | ||
699 | if (class->key == key) { | ||
700 | WARN_ON_ONCE(class->name != lock->name); | ||
701 | return class; | ||
702 | } | ||
703 | } | ||
704 | |||
705 | return NULL; | ||
706 | } | ||
707 | |||
708 | /* | ||
709 | * Register a lock's class in the hash-table, if the class is not present | ||
710 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
711 | * itself, so actual lookup of the hash should be once per lock object. | ||
712 | */ | ||
713 | static inline struct lock_class * | ||
714 | register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | ||
715 | { | ||
716 | struct lockdep_subclass_key *key; | ||
717 | struct list_head *hash_head; | ||
718 | struct lock_class *class; | ||
719 | unsigned long flags; | ||
720 | |||
721 | class = look_up_lock_class(lock, subclass); | ||
722 | if (likely(class)) | ||
723 | return class; | ||
724 | |||
725 | /* | ||
726 | * Debug-check: all keys must be persistent! | ||
727 | */ | ||
728 | if (!static_obj(lock->key)) { | ||
729 | debug_locks_off(); | ||
730 | printk("INFO: trying to register non-static key.\n"); | ||
731 | printk("the code is fine but needs lockdep annotation.\n"); | ||
732 | printk("turning off the locking correctness validator.\n"); | ||
733 | dump_stack(); | ||
734 | |||
735 | return NULL; | ||
736 | } | ||
737 | |||
738 | key = lock->key->subkeys + subclass; | ||
739 | hash_head = classhashentry(key); | ||
740 | |||
741 | raw_local_irq_save(flags); | ||
742 | if (!graph_lock()) { | ||
743 | raw_local_irq_restore(flags); | ||
744 | return NULL; | ||
745 | } | ||
746 | /* | ||
747 | * We have to do the hash-walk again, to avoid races | ||
748 | * with another CPU: | ||
749 | */ | ||
750 | list_for_each_entry(class, hash_head, hash_entry) | ||
751 | if (class->key == key) | ||
752 | goto out_unlock_set; | ||
753 | /* | ||
754 | * Allocate a new key from the static array, and add it to | ||
755 | * the hash: | ||
756 | */ | ||
757 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
758 | if (!debug_locks_off_graph_unlock()) { | ||
759 | raw_local_irq_restore(flags); | ||
760 | return NULL; | ||
761 | } | ||
762 | raw_local_irq_restore(flags); | ||
763 | |||
764 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
765 | printk("turning off the locking correctness validator.\n"); | ||
766 | return NULL; | ||
767 | } | ||
768 | class = lock_classes + nr_lock_classes++; | ||
769 | debug_atomic_inc(&nr_unused_locks); | ||
770 | class->key = key; | ||
771 | class->name = lock->name; | ||
772 | class->subclass = subclass; | ||
773 | INIT_LIST_HEAD(&class->lock_entry); | ||
774 | INIT_LIST_HEAD(&class->locks_before); | ||
775 | INIT_LIST_HEAD(&class->locks_after); | ||
776 | class->name_version = count_matching_names(class); | ||
777 | /* | ||
778 | * We use RCU's safe list-add method to make | ||
779 | * parallel walking of the hash-list safe: | ||
780 | */ | ||
781 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
782 | |||
783 | if (verbose(class)) { | ||
784 | graph_unlock(); | ||
785 | raw_local_irq_restore(flags); | ||
786 | |||
787 | printk("\nnew class %p: %s", class->key, class->name); | ||
788 | if (class->name_version > 1) | ||
789 | printk("#%d", class->name_version); | ||
790 | printk("\n"); | ||
791 | dump_stack(); | ||
792 | |||
793 | raw_local_irq_save(flags); | ||
794 | if (!graph_lock()) { | ||
795 | raw_local_irq_restore(flags); | ||
796 | return NULL; | ||
797 | } | ||
798 | } | ||
799 | out_unlock_set: | ||
800 | graph_unlock(); | ||
801 | raw_local_irq_restore(flags); | ||
802 | |||
803 | if (!subclass || force) | ||
804 | lock->class_cache = class; | ||
805 | |||
806 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | ||
807 | return NULL; | ||
808 | |||
809 | return class; | ||
810 | } | ||
811 | |||
812 | #ifdef CONFIG_PROVE_LOCKING | ||
813 | /* | ||
814 | * Allocate a lockdep entry. (assumes the graph_lock held, returns | ||
815 | * with NULL on failure) | ||
816 | */ | ||
817 | static struct lock_list *alloc_list_entry(void) | ||
818 | { | ||
819 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
820 | if (!debug_locks_off_graph_unlock()) | ||
821 | return NULL; | ||
822 | |||
823 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
824 | printk("turning off the locking correctness validator.\n"); | ||
825 | return NULL; | ||
826 | } | ||
827 | return list_entries + nr_list_entries++; | ||
828 | } | ||
829 | |||
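alloc_list_entry(), now moved under CONFIG_PROVE_LOCKING above, shows lockdep's allocation policy: objects are bump-allocated from a fixed static array, never from the slab allocators lockdep itself instruments, and exhausting the pool switches the validator off. A generic, hypothetical sketch of that pattern:

/* Hedged sketch of the fixed-pool allocator pattern; the names and the
 * pool size are illustrative, not taken from this diff. */
#define EXAMPLE_MAX_ENTRIES 128

struct example_entry {
	unsigned long	data;
};

static struct example_entry example_pool[EXAMPLE_MAX_ENTRIES];
static unsigned long example_nr_entries;

/* Assumes the caller serializes access (lockdep holds the graph lock);
 * on exhaustion the caller disables the facility instead of falling back
 * to kmalloc(). */
static struct example_entry *example_alloc(void)
{
	if (example_nr_entries >= EXAMPLE_MAX_ENTRIES)
		return NULL;

	return example_pool + example_nr_entries++;
}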
485 | /* | 830 | /* |
486 | * Add a new dependency to the head of the list: | 831 | * Add a new dependency to the head of the list: |
487 | */ | 832 | */ |
@@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) | |||
542 | return 0; | 887 | return 0; |
543 | } | 888 | } |
544 | 889 | ||
545 | static void print_kernel_version(void) | ||
546 | { | ||
547 | printk("%s %.*s\n", init_utsname()->release, | ||
548 | (int)strcspn(init_utsname()->version, " "), | ||
549 | init_utsname()->version); | ||
550 | } | ||
551 | |||
552 | /* | 890 | /* |
553 | * When a circular dependency is detected, print the | 891 | * When a circular dependency is detected, print the |
554 | * header first: | 892 | * header first: |
@@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) | |||
640 | return 1; | 978 | return 1; |
641 | } | 979 | } |
642 | 980 | ||
643 | static int very_verbose(struct lock_class *class) | ||
644 | { | ||
645 | #if VERY_VERBOSE | ||
646 | return class_filter(class); | ||
647 | #endif | ||
648 | return 0; | ||
649 | } | ||
650 | #ifdef CONFIG_TRACE_IRQFLAGS | 981 | #ifdef CONFIG_TRACE_IRQFLAGS |
651 | |||
652 | /* | 982 | /* |
653 | * Forwards and backwards subgraph searching, for the purposes of | 983 | * Forwards and backwards subgraph searching, for the purposes of |
654 | * proving that two subgraphs can be connected by a new dependency | 984 | * proving that two subgraphs can be connected by a new dependency |
@@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev, | |||
821 | bit_backwards, bit_forwards, irqclass); | 1151 | bit_backwards, bit_forwards, irqclass); |
822 | } | 1152 | } |
823 | 1153 | ||
1154 | static int | ||
1155 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | ||
1156 | struct held_lock *next) | ||
1157 | { | ||
1158 | /* | ||
1159 | * Prove that the new dependency does not connect a hardirq-safe | ||
1160 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
1161 | * the backwards-subgraph starting at <prev>, and the | ||
1162 | * forwards-subgraph starting at <next>: | ||
1163 | */ | ||
1164 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
1165 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
1166 | return 0; | ||
1167 | |||
1168 | /* | ||
1169 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
1170 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
1171 | * the backwards-subgraph starting at <prev>, and the | ||
1172 | * forwards-subgraph starting at <next>: | ||
1173 | */ | ||
1174 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
1175 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
1176 | return 0; | ||
1177 | |||
1178 | /* | ||
1179 | * Prove that the new dependency does not connect a softirq-safe | ||
1180 | * lock with a softirq-unsafe lock - to achieve this we search | ||
1181 | * the backwards-subgraph starting at <prev>, and the | ||
1182 | * forwards-subgraph starting at <next>: | ||
1183 | */ | ||
1184 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
1185 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1186 | return 0; | ||
1187 | /* | ||
1188 | * Prove that the new dependency does not connect a softirq-safe-read | ||
1189 | * lock with a softirq-unsafe lock - to achieve this we search | ||
1190 | * the backwards-subgraph starting at <prev>, and the | ||
1191 | * forwards-subgraph starting at <next>: | ||
1192 | */ | ||
1193 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
1194 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1195 | return 0; | ||
1196 | |||
1197 | return 1; | ||
1198 | } | ||
1199 | |||
1200 | static void inc_chains(void) | ||
1201 | { | ||
1202 | if (current->hardirq_context) | ||
1203 | nr_hardirq_chains++; | ||
1204 | else { | ||
1205 | if (current->softirq_context) | ||
1206 | nr_softirq_chains++; | ||
1207 | else | ||
1208 | nr_process_chains++; | ||
1209 | } | ||
1210 | } | ||
1211 | |||
1212 | #else | ||
1213 | |||
1214 | static inline int | ||
1215 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | ||
1216 | struct held_lock *next) | ||
1217 | { | ||
1218 | return 1; | ||
1219 | } | ||
1220 | |||
1221 | static inline void inc_chains(void) | ||
1222 | { | ||
1223 | nr_process_chains++; | ||
1224 | } | ||
1225 | |||
824 | #endif | 1226 | #endif |
825 | 1227 | ||
826 | static int | 1228 | static int |
@@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
922 | if (!(check_noncircular(next->class, 0))) | 1324 | if (!(check_noncircular(next->class, 0))) |
923 | return print_circular_bug_tail(); | 1325 | return print_circular_bug_tail(); |
924 | 1326 | ||
925 | #ifdef CONFIG_TRACE_IRQFLAGS | 1327 | if (!check_prev_add_irq(curr, prev, next)) |
926 | /* | ||
927 | * Prove that the new dependency does not connect a hardirq-safe | ||
928 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
929 | * the backwards-subgraph starting at <prev>, and the | ||
930 | * forwards-subgraph starting at <next>: | ||
931 | */ | ||
932 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
933 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
934 | return 0; | 1328 | return 0; |
935 | 1329 | ||
936 | /* | 1330 | /* |
937 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
938 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
939 | * the backwards-subgraph starting at <prev>, and the | ||
940 | * forwards-subgraph starting at <next>: | ||
941 | */ | ||
942 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
943 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
944 | return 0; | ||
945 | |||
946 | /* | ||
947 | * Prove that the new dependency does not connect a softirq-safe | ||
948 | * lock with a softirq-unsafe lock - to achieve this we search | ||
949 | * the backwards-subgraph starting at <prev>, and the | ||
950 | * forwards-subgraph starting at <next>: | ||
951 | */ | ||
952 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
953 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
954 | return 0; | ||
955 | /* | ||
956 | * Prove that the new dependency does not connect a softirq-safe-read | ||
957 | * lock with a softirq-unsafe lock - to achieve this we search | ||
958 | * the backwards-subgraph starting at <prev>, and the | ||
959 | * forwards-subgraph starting at <next>: | ||
960 | */ | ||
961 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
962 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
963 | return 0; | ||
964 | #endif | ||
965 | /* | ||
966 | * For recursive read-locks we do all the dependency checks, | 1331 | * For recursive read-locks we do all the dependency checks, |
967 | * but we don't store read-triggered dependencies (only | 1332 |
968 | * write-triggered dependencies). This ensures that only the | 1333 | * write-triggered dependencies). This ensures that only the |
@@ -1088,224 +1453,8 @@ out_bug: | |||
1088 | return 0; | 1453 | return 0; |
1089 | } | 1454 | } |
1090 | 1455 | ||
1091 | 1456 | unsigned long nr_lock_chains; | |
1092 | /* | 1457 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; |
1093 | * Is this the address of a static object: | ||
1094 | */ | ||
1095 | static int static_obj(void *obj) | ||
1096 | { | ||
1097 | unsigned long start = (unsigned long) &_stext, | ||
1098 | end = (unsigned long) &_end, | ||
1099 | addr = (unsigned long) obj; | ||
1100 | #ifdef CONFIG_SMP | ||
1101 | int i; | ||
1102 | #endif | ||
1103 | |||
1104 | /* | ||
1105 | * static variable? | ||
1106 | */ | ||
1107 | if ((addr >= start) && (addr < end)) | ||
1108 | return 1; | ||
1109 | |||
1110 | #ifdef CONFIG_SMP | ||
1111 | /* | ||
1112 | * percpu var? | ||
1113 | */ | ||
1114 | for_each_possible_cpu(i) { | ||
1115 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
1116 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM | ||
1117 | + per_cpu_offset(i); | ||
1118 | |||
1119 | if ((addr >= start) && (addr < end)) | ||
1120 | return 1; | ||
1121 | } | ||
1122 | #endif | ||
1123 | |||
1124 | /* | ||
1125 | * module var? | ||
1126 | */ | ||
1127 | return is_module_address(addr); | ||
1128 | } | ||
1129 | |||
1130 | /* | ||
1131 | * To make lock name printouts unique, we calculate a unique | ||
1132 | * class->name_version generation counter: | ||
1133 | */ | ||
1134 | static int count_matching_names(struct lock_class *new_class) | ||
1135 | { | ||
1136 | struct lock_class *class; | ||
1137 | int count = 0; | ||
1138 | |||
1139 | if (!new_class->name) | ||
1140 | return 0; | ||
1141 | |||
1142 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
1143 | if (new_class->key - new_class->subclass == class->key) | ||
1144 | return class->name_version; | ||
1145 | if (class->name && !strcmp(class->name, new_class->name)) | ||
1146 | count = max(count, class->name_version); | ||
1147 | } | ||
1148 | |||
1149 | return count + 1; | ||
1150 | } | ||
1151 | |||
1152 | /* | ||
1153 | * Register a lock's class in the hash-table, if the class is not present | ||
1154 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
1155 | * itself, so actual lookup of the hash should be once per lock object. | ||
1156 | */ | ||
1157 | static inline struct lock_class * | ||
1158 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
1159 | { | ||
1160 | struct lockdep_subclass_key *key; | ||
1161 | struct list_head *hash_head; | ||
1162 | struct lock_class *class; | ||
1163 | |||
1164 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
1165 | /* | ||
1166 | * If the architecture calls into lockdep before initializing | ||
1167 | * the hashes then we'll warn about it later. (we cannot printk | ||
1168 | * right now) | ||
1169 | */ | ||
1170 | if (unlikely(!lockdep_initialized)) { | ||
1171 | lockdep_init(); | ||
1172 | lockdep_init_error = 1; | ||
1173 | } | ||
1174 | #endif | ||
1175 | |||
1176 | /* | ||
1177 | * Static locks do not have their class-keys yet - for them the key | ||
1178 | * is the lock object itself: | ||
1179 | */ | ||
1180 | if (unlikely(!lock->key)) | ||
1181 | lock->key = (void *)lock; | ||
1182 | |||
1183 | /* | ||
1184 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
1185 | * lock_class_key variable is passed in through the mutex_init() | ||
1186 | * (or spin_lock_init()) call - which acts as the key. For static | ||
1187 | * locks we use the lock object itself as the key. | ||
1188 | */ | ||
1189 | BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); | ||
1190 | |||
1191 | key = lock->key->subkeys + subclass; | ||
1192 | |||
1193 | hash_head = classhashentry(key); | ||
1194 | |||
1195 | /* | ||
1196 | * We can walk the hash lockfree, because the hash only | ||
1197 | * grows, and we are careful when adding entries to the end: | ||
1198 | */ | ||
1199 | list_for_each_entry(class, hash_head, hash_entry) | ||
1200 | if (class->key == key) | ||
1201 | return class; | ||
1202 | |||
1203 | return NULL; | ||
1204 | } | ||
1205 | |||
1206 | /* | ||
1207 | * Register a lock's class in the hash-table, if the class is not present | ||
1208 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
1209 | * itself, so actual lookup of the hash should be once per lock object. | ||
1210 | */ | ||
1211 | static inline struct lock_class * | ||
1212 | register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | ||
1213 | { | ||
1214 | struct lockdep_subclass_key *key; | ||
1215 | struct list_head *hash_head; | ||
1216 | struct lock_class *class; | ||
1217 | unsigned long flags; | ||
1218 | |||
1219 | class = look_up_lock_class(lock, subclass); | ||
1220 | if (likely(class)) | ||
1221 | return class; | ||
1222 | |||
1223 | /* | ||
1224 | * Debug-check: all keys must be persistent! | ||
1225 | */ | ||
1226 | if (!static_obj(lock->key)) { | ||
1227 | debug_locks_off(); | ||
1228 | printk("INFO: trying to register non-static key.\n"); | ||
1229 | printk("the code is fine but needs lockdep annotation.\n"); | ||
1230 | printk("turning off the locking correctness validator.\n"); | ||
1231 | dump_stack(); | ||
1232 | |||
1233 | return NULL; | ||
1234 | } | ||
1235 | |||
1236 | key = lock->key->subkeys + subclass; | ||
1237 | hash_head = classhashentry(key); | ||
1238 | |||
1239 | raw_local_irq_save(flags); | ||
1240 | if (!graph_lock()) { | ||
1241 | raw_local_irq_restore(flags); | ||
1242 | return NULL; | ||
1243 | } | ||
1244 | /* | ||
1245 | * We have to do the hash-walk again, to avoid races | ||
1246 | * with another CPU: | ||
1247 | */ | ||
1248 | list_for_each_entry(class, hash_head, hash_entry) | ||
1249 | if (class->key == key) | ||
1250 | goto out_unlock_set; | ||
1251 | /* | ||
1252 | * Allocate a new key from the static array, and add it to | ||
1253 | * the hash: | ||
1254 | */ | ||
1255 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
1256 | if (!debug_locks_off_graph_unlock()) { | ||
1257 | raw_local_irq_restore(flags); | ||
1258 | return NULL; | ||
1259 | } | ||
1260 | raw_local_irq_restore(flags); | ||
1261 | |||
1262 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
1263 | printk("turning off the locking correctness validator.\n"); | ||
1264 | return NULL; | ||
1265 | } | ||
1266 | class = lock_classes + nr_lock_classes++; | ||
1267 | debug_atomic_inc(&nr_unused_locks); | ||
1268 | class->key = key; | ||
1269 | class->name = lock->name; | ||
1270 | class->subclass = subclass; | ||
1271 | INIT_LIST_HEAD(&class->lock_entry); | ||
1272 | INIT_LIST_HEAD(&class->locks_before); | ||
1273 | INIT_LIST_HEAD(&class->locks_after); | ||
1274 | class->name_version = count_matching_names(class); | ||
1275 | /* | ||
1276 | * We use RCU's safe list-add method to make | ||
1277 | * parallel walking of the hash-list safe: | ||
1278 | */ | ||
1279 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
1280 | |||
1281 | if (verbose(class)) { | ||
1282 | graph_unlock(); | ||
1283 | raw_local_irq_restore(flags); | ||
1284 | |||
1285 | printk("\nnew class %p: %s", class->key, class->name); | ||
1286 | if (class->name_version > 1) | ||
1287 | printk("#%d", class->name_version); | ||
1288 | printk("\n"); | ||
1289 | dump_stack(); | ||
1290 | |||
1291 | raw_local_irq_save(flags); | ||
1292 | if (!graph_lock()) { | ||
1293 | raw_local_irq_restore(flags); | ||
1294 | return NULL; | ||
1295 | } | ||
1296 | } | ||
1297 | out_unlock_set: | ||
1298 | graph_unlock(); | ||
1299 | raw_local_irq_restore(flags); | ||
1300 | |||
1301 | if (!subclass || force) | ||
1302 | lock->class_cache = class; | ||
1303 | |||
1304 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | ||
1305 | return NULL; | ||
1306 | |||
1307 | return class; | ||
1308 | } | ||
1309 | 1458 | ||
1310 | /* | 1459 | /* |
1311 | * Look up a dependency chain. If the key is not present yet then | 1460 | * Look up a dependency chain. If the key is not present yet then |
@@ -1366,21 +1515,72 @@ cache_hit: | |||
1366 | chain->chain_key = chain_key; | 1515 | chain->chain_key = chain_key; |
1367 | list_add_tail_rcu(&chain->entry, hash_head); | 1516 | list_add_tail_rcu(&chain->entry, hash_head); |
1368 | debug_atomic_inc(&chain_lookup_misses); | 1517 | debug_atomic_inc(&chain_lookup_misses); |
1369 | #ifdef CONFIG_TRACE_IRQFLAGS | 1518 | inc_chains(); |
1370 | if (current->hardirq_context) | 1519 | |
1371 | nr_hardirq_chains++; | 1520 | return 1; |
1372 | else { | 1521 | } |
1373 | if (current->softirq_context) | 1522 | |
1374 | nr_softirq_chains++; | 1523 | static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, |
1375 | else | 1524 | struct held_lock *hlock, int chain_head) |
1376 | nr_process_chains++; | 1525 | { |
1377 | } | 1526 | /* |
1378 | #else | 1527 | * Trylock needs to maintain the stack of held locks, but it |
1379 | nr_process_chains++; | 1528 | * does not add new dependencies, because trylock can be done |
1380 | #endif | 1529 | * in any order. |
1530 | * | ||
1531 | * We look up the chain_key and do the O(N^2) check and update of | ||
1532 | * the dependencies only if this is a new dependency chain. | ||
1533 | * (If lookup_chain_cache() returns with 1 it acquires | ||
1534 | * graph_lock for us) | ||
1535 | */ | ||
1536 | if (!hlock->trylock && (hlock->check == 2) && | ||
1537 | lookup_chain_cache(curr->curr_chain_key, hlock->class)) { | ||
1538 | /* | ||
1539 | * Check whether last held lock: | ||
1540 | * | ||
1541 | * - is irq-safe, if this lock is irq-unsafe | ||
1542 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
1543 | * | ||
1544 | * And check whether the new lock's dependency graph | ||
1545 | * could lead back to the previous lock. | ||
1546 | * | ||
1547 | * Any of these scenarios could lead to a deadlock. If all | ||
1548 | * the validations pass, the new dependency is recorded. | ||
1549 | */ | ||
1550 | int ret = check_deadlock(curr, hlock, lock, hlock->read); | ||
1551 | |||
1552 | if (!ret) | ||
1553 | return 0; | ||
1554 | /* | ||
1555 | * Mark recursive read, as we jump over it when | ||
1556 | * building dependencies (just like we jump over | ||
1557 | * trylock entries): | ||
1558 | */ | ||
1559 | if (ret == 2) | ||
1560 | hlock->read = 2; | ||
1561 | /* | ||
1562 | * Add dependency only if this lock is not the head | ||
1563 | * of the chain, and if it's not a secondary read-lock: | ||
1564 | */ | ||
1565 | if (!chain_head && ret != 2) | ||
1566 | if (!check_prevs_add(curr, hlock)) | ||
1567 | return 0; | ||
1568 | graph_unlock(); | ||
1569 | } else | ||
1570 | /* after lookup_chain_cache(): */ | ||
1571 | if (unlikely(!debug_locks)) | ||
1572 | return 0; | ||
1381 | 1573 | ||
1382 | return 1; | 1574 | return 1; |
1383 | } | 1575 | } |
1576 | #else | ||
1577 | static inline int validate_chain(struct task_struct *curr, | ||
1578 | struct lockdep_map *lock, struct held_lock *hlock, | ||
1579 | int chain_head) | ||
1580 | { | ||
1581 | return 1; | ||
1582 | } | ||
1583 | #endif | ||
1384 | 1584 | ||
1385 | /* | 1585 | /* |
1386 | * We are building curr_chain_key incrementally, so double-check | 1586 | * We are building curr_chain_key incrementally, so double-check |
@@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr) | |||
1425 | #endif | 1625 | #endif |
1426 | } | 1626 | } |
1427 | 1627 | ||
1628 | static int | ||
1629 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
1630 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
1631 | { | ||
1632 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | ||
1633 | return 0; | ||
1634 | |||
1635 | printk("\n=================================\n"); | ||
1636 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
1637 | print_kernel_version(); | ||
1638 | printk( "---------------------------------\n"); | ||
1639 | |||
1640 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
1641 | usage_str[prev_bit], usage_str[new_bit]); | ||
1642 | |||
1643 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
1644 | curr->comm, curr->pid, | ||
1645 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
1646 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
1647 | trace_hardirqs_enabled(curr), | ||
1648 | trace_softirqs_enabled(curr)); | ||
1649 | print_lock(this); | ||
1650 | |||
1651 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
1652 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
1653 | |||
1654 | print_irqtrace_events(curr); | ||
1655 | printk("\nother info that might help us debug this:\n"); | ||
1656 | lockdep_print_held_locks(curr); | ||
1657 | |||
1658 | printk("\nstack backtrace:\n"); | ||
1659 | dump_stack(); | ||
1660 | |||
1661 | return 0; | ||
1662 | } | ||
1663 | |||
1664 | /* | ||
1665 | * Print out an error if an invalid bit is set: | ||
1666 | */ | ||
1667 | static inline int | ||
1668 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
1669 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
1670 | { | ||
1671 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | ||
1672 | return print_usage_bug(curr, this, bad_bit, new_bit); | ||
1673 | return 1; | ||
1674 | } | ||
1675 | |||
1676 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
1677 | enum lock_usage_bit new_bit); | ||
1678 | |||
1428 | #ifdef CONFIG_TRACE_IRQFLAGS | 1679 | #ifdef CONFIG_TRACE_IRQFLAGS |
1429 | 1680 | ||
1430 | /* | 1681 | /* |
@@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr) | |||
1518 | print_ip_sym(curr->softirq_disable_ip); | 1769 | print_ip_sym(curr->softirq_disable_ip); |
1519 | } | 1770 | } |
1520 | 1771 | ||
1521 | #endif | 1772 | static int hardirq_verbose(struct lock_class *class) |
1522 | |||
1523 | static int | ||
1524 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
1525 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
1526 | { | 1773 | { |
1527 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1774 | #if HARDIRQ_VERBOSE |
1528 | return 0; | 1775 | return class_filter(class); |
1529 | 1776 | #endif | |
1530 | printk("\n=================================\n"); | ||
1531 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
1532 | print_kernel_version(); | ||
1533 | printk( "---------------------------------\n"); | ||
1534 | |||
1535 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
1536 | usage_str[prev_bit], usage_str[new_bit]); | ||
1537 | |||
1538 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
1539 | curr->comm, curr->pid, | ||
1540 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
1541 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
1542 | trace_hardirqs_enabled(curr), | ||
1543 | trace_softirqs_enabled(curr)); | ||
1544 | print_lock(this); | ||
1545 | |||
1546 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
1547 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
1548 | |||
1549 | print_irqtrace_events(curr); | ||
1550 | printk("\nother info that might help us debug this:\n"); | ||
1551 | lockdep_print_held_locks(curr); | ||
1552 | |||
1553 | printk("\nstack backtrace:\n"); | ||
1554 | dump_stack(); | ||
1555 | |||
1556 | return 0; | 1777 | return 0; |
1557 | } | 1778 | } |
1558 | 1779 | ||
1559 | /* | 1780 | static int softirq_verbose(struct lock_class *class) |
1560 | * Print out an error if an invalid bit is set: | ||
1561 | */ | ||
1562 | static inline int | ||
1563 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
1564 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
1565 | { | 1781 | { |
1566 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | 1782 | #if SOFTIRQ_VERBOSE |
1567 | return print_usage_bug(curr, this, bad_bit, new_bit); | 1783 | return class_filter(class); |
1568 | return 1; | 1784 | #endif |
1785 | return 0; | ||
1569 | } | 1786 | } |
1570 | 1787 | ||
1571 | #define STRICT_READ_CHECKS 1 | 1788 | #define STRICT_READ_CHECKS 1 |
1572 | 1789 | ||
1573 | /* | 1790 | static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, |
1574 | * Mark a lock with a usage bit, and validate the state transition: | 1791 | enum lock_usage_bit new_bit) |
1575 | */ | ||
1576 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
1577 | enum lock_usage_bit new_bit) | ||
1578 | { | 1792 | { |
1579 | unsigned int new_mask = 1 << new_bit, ret = 1; | 1793 | int ret = 1; |
1580 | |||
1581 | /* | ||
1582 | * If already set then do not dirty the cacheline, | ||
1583 | * nor do any checks: | ||
1584 | */ | ||
1585 | if (likely(this->class->usage_mask & new_mask)) | ||
1586 | return 1; | ||
1587 | |||
1588 | if (!graph_lock()) | ||
1589 | return 0; | ||
1590 | /* | ||
1591 | * Make sure we didnt race: | ||
1592 | */ | ||
1593 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
1594 | graph_unlock(); | ||
1595 | return 1; | ||
1596 | } | ||
1597 | |||
1598 | this->class->usage_mask |= new_mask; | ||
1599 | 1794 | ||
1600 | if (!save_trace(this->class->usage_traces + new_bit)) | 1795 | switch(new_bit) { |
1601 | return 0; | ||
1602 | |||
1603 | switch (new_bit) { | ||
1604 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1605 | case LOCK_USED_IN_HARDIRQ: | 1796 | case LOCK_USED_IN_HARDIRQ: |
1606 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | 1797 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) |
1607 | return 0; | 1798 | return 0; |
@@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
1760 | if (softirq_verbose(this->class)) | 1951 | if (softirq_verbose(this->class)) |
1761 | ret = 2; | 1952 | ret = 2; |
1762 | break; | 1953 | break; |
1763 | #endif | ||
1764 | case LOCK_USED: | ||
1765 | /* | ||
1766 | * Add it to the global list of classes: | ||
1767 | */ | ||
1768 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
1769 | debug_atomic_dec(&nr_unused_locks); | ||
1770 | break; | ||
1771 | default: | 1954 | default: |
1772 | if (!debug_locks_off_graph_unlock()) | ||
1773 | return 0; | ||
1774 | WARN_ON(1); | 1955 | WARN_ON(1); |
1775 | return 0; | 1956 | break; |
1776 | } | ||
1777 | |||
1778 | graph_unlock(); | ||
1779 | |||
1780 | /* | ||
1781 | * We must printk outside of the graph_lock: | ||
1782 | */ | ||
1783 | if (ret == 2) { | ||
1784 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
1785 | print_lock(this); | ||
1786 | print_irqtrace_events(curr); | ||
1787 | dump_stack(); | ||
1788 | } | 1957 | } |
1789 | 1958 | ||
1790 | return ret; | 1959 | return ret; |
1791 | } | 1960 | } |
1792 | 1961 | ||
1793 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1794 | /* | 1962 | /* |
1795 | * Mark all held locks with a usage bit: | 1963 | * Mark all held locks with a usage bit: |
1796 | */ | 1964 | */ |
@@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip) | |||
1973 | debug_atomic_inc(&redundant_softirqs_off); | 2141 | debug_atomic_inc(&redundant_softirqs_off); |
1974 | } | 2142 | } |
1975 | 2143 | ||
2144 | static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) | ||
2145 | { | ||
2146 | /* | ||
2147 | * If non-trylock use in a hardirq or softirq context, then | ||
2148 | * mark the lock as used in these contexts: | ||
2149 | */ | ||
2150 | if (!hlock->trylock) { | ||
2151 | if (hlock->read) { | ||
2152 | if (curr->hardirq_context) | ||
2153 | if (!mark_lock(curr, hlock, | ||
2154 | LOCK_USED_IN_HARDIRQ_READ)) | ||
2155 | return 0; | ||
2156 | if (curr->softirq_context) | ||
2157 | if (!mark_lock(curr, hlock, | ||
2158 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
2159 | return 0; | ||
2160 | } else { | ||
2161 | if (curr->hardirq_context) | ||
2162 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) | ||
2163 | return 0; | ||
2164 | if (curr->softirq_context) | ||
2165 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) | ||
2166 | return 0; | ||
2167 | } | ||
2168 | } | ||
2169 | if (!hlock->hardirqs_off) { | ||
2170 | if (hlock->read) { | ||
2171 | if (!mark_lock(curr, hlock, | ||
2172 | LOCK_ENABLED_HARDIRQS_READ)) | ||
2173 | return 0; | ||
2174 | if (curr->softirqs_enabled) | ||
2175 | if (!mark_lock(curr, hlock, | ||
2176 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
2177 | return 0; | ||
2178 | } else { | ||
2179 | if (!mark_lock(curr, hlock, | ||
2180 | LOCK_ENABLED_HARDIRQS)) | ||
2181 | return 0; | ||
2182 | if (curr->softirqs_enabled) | ||
2183 | if (!mark_lock(curr, hlock, | ||
2184 | LOCK_ENABLED_SOFTIRQS)) | ||
2185 | return 0; | ||
2186 | } | ||
2187 | } | ||
2188 | |||
2189 | return 1; | ||
2190 | } | ||
2191 | |||
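The mark_irqflags() helper added above is a compact decision table: a non-trylock acquisition picks up a "used in hardirq/softirq" bit for whichever interrupt context it is running in, and any acquisition made with hardirqs (and possibly softirqs) still enabled picks up the matching "enabled" bits, with separate _READ variants for read locks. A standalone sketch of the same table, using illustrative names rather than the kernel's LOCK_* constants:

enum usage_bit {
        USED_IN_HARDIRQ       = 1 << 0,
        USED_IN_SOFTIRQ       = 1 << 1,
        USED_IN_HARDIRQ_READ  = 1 << 2,
        USED_IN_SOFTIRQ_READ  = 1 << 3,
        ENABLED_HARDIRQS      = 1 << 4,
        ENABLED_SOFTIRQS      = 1 << 5,
        ENABLED_HARDIRQS_READ = 1 << 6,
        ENABLED_SOFTIRQS_READ = 1 << 7,
};

/* Which usage bits a new acquisition should be marked with. */
static unsigned int pick_usage_bits(int trylock, int read,
                                    int in_hardirq, int in_softirq,
                                    int hardirqs_off, int softirqs_enabled)
{
        unsigned int mask = 0;

        if (!trylock) {                 /* trylocks record no used-in-irq state */
                if (in_hardirq)
                        mask |= read ? USED_IN_HARDIRQ_READ : USED_IN_HARDIRQ;
                if (in_softirq)
                        mask |= read ? USED_IN_SOFTIRQ_READ : USED_IN_SOFTIRQ;
        }
        if (!hardirqs_off) {            /* acquired with hardirqs enabled */
                mask |= read ? ENABLED_HARDIRQS_READ : ENABLED_HARDIRQS;
                if (softirqs_enabled)
                        mask |= read ? ENABLED_SOFTIRQS_READ : ENABLED_SOFTIRQS;
        }
        return mask;
}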
2192 | static int separate_irq_context(struct task_struct *curr, | ||
2193 | struct held_lock *hlock) | ||
2194 | { | ||
2195 | unsigned int depth = curr->lockdep_depth; | ||
2196 | |||
2197 | /* | ||
2198 | * Keep track of points where we cross into an interrupt context: | ||
2199 | */ | ||
2200 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
2201 | curr->softirq_context; | ||
2202 | if (depth) { | ||
2203 | struct held_lock *prev_hlock; | ||
2204 | |||
2205 | prev_hlock = curr->held_locks + depth-1; | ||
2206 | /* | ||
2207 | * If we cross into another context, reset the | ||
2208 | * hash key (this also prevents the checking and the | ||
2209 | * adding of the dependency to 'prev'): | ||
2210 | */ | ||
2211 | if (prev_hlock->irq_context != hlock->irq_context) | ||
2212 | return 1; | ||
2213 | } | ||
2214 | return 0; | ||
2215 | } | ||
2216 | |||
2217 | #else | ||
2218 | |||
2219 | static inline | ||
2220 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | ||
2221 | enum lock_usage_bit new_bit) | ||
2222 | { | ||
2223 | WARN_ON(1); | ||
2224 | return 1; | ||
2225 | } | ||
2226 | |||
2227 | static inline int mark_irqflags(struct task_struct *curr, | ||
2228 | struct held_lock *hlock) | ||
2229 | { | ||
2230 | return 1; | ||
2231 | } | ||
2232 | |||
2233 | static inline int separate_irq_context(struct task_struct *curr, | ||
2234 | struct held_lock *hlock) | ||
2235 | { | ||
2236 | return 0; | ||
2237 | } | ||
2238 | |||
1976 | #endif | 2239 | #endif |
1977 | 2240 | ||
1978 | /* | 2241 | /* |
2242 | * Mark a lock with a usage bit, and validate the state transition: | ||
2243 | */ | ||
2244 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
2245 | enum lock_usage_bit new_bit) | ||
2246 | { | ||
2247 | unsigned int new_mask = 1 << new_bit, ret = 1; | ||
2248 | |||
2249 | /* | ||
2250 | * If already set then do not dirty the cacheline, | ||
2251 | * nor do any checks: | ||
2252 | */ | ||
2253 | if (likely(this->class->usage_mask & new_mask)) | ||
2254 | return 1; | ||
2255 | |||
2256 | if (!graph_lock()) | ||
2257 | return 0; | ||
2258 | /* | ||
2259 | * Make sure we didnt race: | ||
2260 | */ | ||
2261 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
2262 | graph_unlock(); | ||
2263 | return 1; | ||
2264 | } | ||
2265 | |||
2266 | this->class->usage_mask |= new_mask; | ||
2267 | |||
2268 | if (!save_trace(this->class->usage_traces + new_bit)) | ||
2269 | return 0; | ||
2270 | |||
2271 | switch (new_bit) { | ||
2272 | case LOCK_USED_IN_HARDIRQ: | ||
2273 | case LOCK_USED_IN_SOFTIRQ: | ||
2274 | case LOCK_USED_IN_HARDIRQ_READ: | ||
2275 | case LOCK_USED_IN_SOFTIRQ_READ: | ||
2276 | case LOCK_ENABLED_HARDIRQS: | ||
2277 | case LOCK_ENABLED_SOFTIRQS: | ||
2278 | case LOCK_ENABLED_HARDIRQS_READ: | ||
2279 | case LOCK_ENABLED_SOFTIRQS_READ: | ||
2280 | ret = mark_lock_irq(curr, this, new_bit); | ||
2281 | if (!ret) | ||
2282 | return 0; | ||
2283 | break; | ||
2284 | case LOCK_USED: | ||
2285 | /* | ||
2286 | * Add it to the global list of classes: | ||
2287 | */ | ||
2288 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
2289 | debug_atomic_dec(&nr_unused_locks); | ||
2290 | break; | ||
2291 | default: | ||
2292 | if (!debug_locks_off_graph_unlock()) | ||
2293 | return 0; | ||
2294 | WARN_ON(1); | ||
2295 | return 0; | ||
2296 | } | ||
2297 | |||
2298 | graph_unlock(); | ||
2299 | |||
2300 | /* | ||
2301 | * We must printk outside of the graph_lock: | ||
2302 | */ | ||
2303 | if (ret == 2) { | ||
2304 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
2305 | print_lock(this); | ||
2306 | print_irqtrace_events(curr); | ||
2307 | dump_stack(); | ||
2308 | } | ||
2309 | |||
2310 | return ret; | ||
2311 | } | ||
2312 | |||
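The rewritten mark_lock() keeps its test / lock / re-test shape: an unlocked fast path bails out when the usage bit is already set, and only the first setter takes graph_lock, re-checks for a race, records the bit, saves a stack trace and dispatches the per-bit validation (now centralised in mark_lock_irq() for the irq-related bits). A minimal userspace sketch of that shape, assuming C11 atomics and a pthread mutex standing in for graph_lock:

#include <pthread.h>
#include <stdatomic.h>

/* graph_lock_sketch and set_usage_bit() are stand-ins, not kernel symbols. */
static pthread_mutex_t graph_lock_sketch = PTHREAD_MUTEX_INITIALIZER;

static int set_usage_bit(atomic_uint *usage_mask, unsigned int new_mask)
{
        /* Fast path: bit already set, do not dirty the cacheline or lock. */
        if (atomic_load_explicit(usage_mask, memory_order_relaxed) & new_mask)
                return 1;

        pthread_mutex_lock(&graph_lock_sketch);
        /* Re-check under the lock: another CPU may have won the race. */
        if (atomic_load_explicit(usage_mask, memory_order_relaxed) & new_mask) {
                pthread_mutex_unlock(&graph_lock_sketch);
                return 1;
        }
        atomic_fetch_or_explicit(usage_mask, new_mask, memory_order_relaxed);
        /*
         * This is the point where mark_lock() saves a stack trace and runs
         * the per-bit validation before dropping graph_lock.
         */
        pthread_mutex_unlock(&graph_lock_sketch);
        return 1;
}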
2313 | /* | ||
1979 | * Initialize a lock instance's lock-class mapping info: | 2314 | * Initialize a lock instance's lock-class mapping info: |
1980 | */ | 2315 | */ |
1981 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2316 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
@@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
1999 | lock->name = name; | 2334 | lock->name = name; |
2000 | lock->key = key; | 2335 | lock->key = key; |
2001 | lock->class_cache = NULL; | 2336 | lock->class_cache = NULL; |
2337 | #ifdef CONFIG_LOCK_STAT | ||
2338 | lock->cpu = raw_smp_processor_id(); | ||
2339 | #endif | ||
2002 | if (subclass) | 2340 | if (subclass) |
2003 | register_lock_class(lock, subclass, 1); | 2341 | register_lock_class(lock, subclass, 1); |
2004 | } | 2342 | } |
@@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2020 | int chain_head = 0; | 2358 | int chain_head = 0; |
2021 | u64 chain_key; | 2359 | u64 chain_key; |
2022 | 2360 | ||
2361 | if (!prove_locking) | ||
2362 | check = 1; | ||
2363 | |||
2023 | if (unlikely(!debug_locks)) | 2364 | if (unlikely(!debug_locks)) |
2024 | return 0; | 2365 | return 0; |
2025 | 2366 | ||
@@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2070 | hlock->read = read; | 2411 | hlock->read = read; |
2071 | hlock->check = check; | 2412 | hlock->check = check; |
2072 | hlock->hardirqs_off = hardirqs_off; | 2413 | hlock->hardirqs_off = hardirqs_off; |
2073 | 2414 | #ifdef CONFIG_LOCK_STAT | |
2074 | if (check != 2) | 2415 | hlock->waittime_stamp = 0; |
2075 | goto out_calc_hash; | 2416 | hlock->holdtime_stamp = sched_clock(); |
2076 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
2077 | /* | ||
2078 | * If non-trylock use in a hardirq or softirq context, then | ||
2079 | * mark the lock as used in these contexts: | ||
2080 | */ | ||
2081 | if (!trylock) { | ||
2082 | if (read) { | ||
2083 | if (curr->hardirq_context) | ||
2084 | if (!mark_lock(curr, hlock, | ||
2085 | LOCK_USED_IN_HARDIRQ_READ)) | ||
2086 | return 0; | ||
2087 | if (curr->softirq_context) | ||
2088 | if (!mark_lock(curr, hlock, | ||
2089 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
2090 | return 0; | ||
2091 | } else { | ||
2092 | if (curr->hardirq_context) | ||
2093 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) | ||
2094 | return 0; | ||
2095 | if (curr->softirq_context) | ||
2096 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) | ||
2097 | return 0; | ||
2098 | } | ||
2099 | } | ||
2100 | if (!hardirqs_off) { | ||
2101 | if (read) { | ||
2102 | if (!mark_lock(curr, hlock, | ||
2103 | LOCK_ENABLED_HARDIRQS_READ)) | ||
2104 | return 0; | ||
2105 | if (curr->softirqs_enabled) | ||
2106 | if (!mark_lock(curr, hlock, | ||
2107 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
2108 | return 0; | ||
2109 | } else { | ||
2110 | if (!mark_lock(curr, hlock, | ||
2111 | LOCK_ENABLED_HARDIRQS)) | ||
2112 | return 0; | ||
2113 | if (curr->softirqs_enabled) | ||
2114 | if (!mark_lock(curr, hlock, | ||
2115 | LOCK_ENABLED_SOFTIRQS)) | ||
2116 | return 0; | ||
2117 | } | ||
2118 | } | ||
2119 | #endif | 2417 | #endif |
2418 | |||
2419 | if (check == 2 && !mark_irqflags(curr, hlock)) | ||
2420 | return 0; | ||
2421 | |||
2120 | /* mark it as used: */ | 2422 | /* mark it as used: */ |
2121 | if (!mark_lock(curr, hlock, LOCK_USED)) | 2423 | if (!mark_lock(curr, hlock, LOCK_USED)) |
2122 | return 0; | 2424 | return 0; |
2123 | out_calc_hash: | 2425 | |
2124 | /* | 2426 | /* |
2125 | * Calculate the chain hash: it's the combined hash of all the | 2427 |
2126 | * lock keys along the dependency chain. We save the hash value | 2428 | * lock keys along the dependency chain. We save the hash value |
@@ -2143,77 +2445,15 @@ out_calc_hash: | |||
2143 | } | 2445 | } |
2144 | 2446 | ||
2145 | hlock->prev_chain_key = chain_key; | 2447 | hlock->prev_chain_key = chain_key; |
2146 | 2448 | if (separate_irq_context(curr, hlock)) { | |
2147 | #ifdef CONFIG_TRACE_IRQFLAGS | 2449 | chain_key = 0; |
2148 | /* | 2450 | chain_head = 1; |
2149 | * Keep track of points where we cross into an interrupt context: | ||
2150 | */ | ||
2151 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
2152 | curr->softirq_context; | ||
2153 | if (depth) { | ||
2154 | struct held_lock *prev_hlock; | ||
2155 | |||
2156 | prev_hlock = curr->held_locks + depth-1; | ||
2157 | /* | ||
2158 | * If we cross into another context, reset the | ||
2159 | * hash key (this also prevents the checking and the | ||
2160 | * adding of the dependency to 'prev'): | ||
2161 | */ | ||
2162 | if (prev_hlock->irq_context != hlock->irq_context) { | ||
2163 | chain_key = 0; | ||
2164 | chain_head = 1; | ||
2165 | } | ||
2166 | } | 2451 | } |
2167 | #endif | ||
2168 | chain_key = iterate_chain_key(chain_key, id); | 2452 | chain_key = iterate_chain_key(chain_key, id); |
2169 | curr->curr_chain_key = chain_key; | 2453 | curr->curr_chain_key = chain_key; |
2170 | 2454 | ||
2171 | /* | 2455 | if (!validate_chain(curr, lock, hlock, chain_head)) |
2172 | * Trylock needs to maintain the stack of held locks, but it | 2456 | return 0; |
2173 | * does not add new dependencies, because trylock can be done | ||
2174 | * in any order. | ||
2175 | * | ||
2176 | * We look up the chain_key and do the O(N^2) check and update of | ||
2177 | * the dependencies only if this is a new dependency chain. | ||
2178 | * (If lookup_chain_cache() returns with 1 it acquires | ||
2179 | * graph_lock for us) | ||
2180 | */ | ||
2181 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { | ||
2182 | /* | ||
2183 | * Check whether last held lock: | ||
2184 | * | ||
2185 | * - is irq-safe, if this lock is irq-unsafe | ||
2186 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
2187 | * | ||
2188 | * And check whether the new lock's dependency graph | ||
2189 | * could lead back to the previous lock. | ||
2190 | * | ||
2191 | * any of these scenarios could lead to a deadlock, and | ||
2192 | * all of these validations must pass. | ||
2193 | */ | ||
2194 | int ret = check_deadlock(curr, hlock, lock, read); | ||
2195 | |||
2196 | if (!ret) | ||
2197 | return 0; | ||
2198 | /* | ||
2199 | * Mark recursive read, as we jump over it when | ||
2200 | * building dependencies (just like we jump over | ||
2201 | * trylock entries): | ||
2202 | */ | ||
2203 | if (ret == 2) | ||
2204 | hlock->read = 2; | ||
2205 | /* | ||
2206 | * Add dependency only if this lock is not the head | ||
2207 | * of the chain, and if it's not a secondary read-lock: | ||
2208 | */ | ||
2209 | if (!chain_head && ret != 2) | ||
2210 | if (!check_prevs_add(curr, hlock)) | ||
2211 | return 0; | ||
2212 | graph_unlock(); | ||
2213 | } else | ||
2214 | /* after lookup_chain_cache(): */ | ||
2215 | if (unlikely(!debug_locks)) | ||
2216 | return 0; | ||
2217 | 2457 | ||
2218 | curr->lockdep_depth++; | 2458 | curr->lockdep_depth++; |
2219 | check_chain_key(curr); | 2459 | check_chain_key(curr); |
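In the reworked __lock_acquire() the dependency-chain cache is keyed by a 64-bit value built by folding each lock class id into the previous key, and separate_irq_context() resets that key whenever the acquisition crosses into a different hardirq/softirq context, so dependencies are never linked across contexts. The sketch below shows the shape of that computation; the mixing function is a stand-in, not the kernel's iterate_chain_key():

#include <stdint.h>

/* fold_chain_key() is a stand-in mixer, not the kernel's iterate_chain_key(). */
static uint64_t fold_chain_key(uint64_t key, uint64_t class_id)
{
        key ^= class_id;
        key *= 0x9e3779b97f4a7c15ULL;   /* any reasonable 64-bit mixer */
        return key;
}

/* Recompute the chain key for a stack of held locks, resetting it whenever
 * the irq context changes, as separate_irq_context() arranges above. */
static uint64_t chain_key_for(const uint64_t *class_ids,
                              const int *irq_context, int depth)
{
        uint64_t key = 0;
        int i;

        for (i = 0; i < depth; i++) {
                if (i && irq_context[i] != irq_context[i - 1])
                        key = 0;        /* new context: start a fresh chain */
                key = fold_chain_key(key, class_ids[i]);
        }
        return key;
}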
@@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr, | |||
2315 | return print_unlock_inbalance_bug(curr, lock, ip); | 2555 | return print_unlock_inbalance_bug(curr, lock, ip); |
2316 | 2556 | ||
2317 | found_it: | 2557 | found_it: |
2558 | lock_release_holdtime(hlock); | ||
2559 | |||
2318 | /* | 2560 | /* |
2319 | * We have the right lock to unlock, 'hlock' points to it. | 2561 | * We have the right lock to unlock, 'hlock' points to it. |
2320 | * Now we remove it from the stack, and add back the other | 2562 | * Now we remove it from the stack, and add back the other |
@@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr, | |||
2367 | 2609 | ||
2368 | curr->curr_chain_key = hlock->prev_chain_key; | 2610 | curr->curr_chain_key = hlock->prev_chain_key; |
2369 | 2611 | ||
2612 | lock_release_holdtime(hlock); | ||
2613 | |||
2370 | #ifdef CONFIG_DEBUG_LOCKDEP | 2614 | #ifdef CONFIG_DEBUG_LOCKDEP |
2371 | hlock->prev_chain_key = 0; | 2615 | hlock->prev_chain_key = 0; |
2372 | hlock->class = NULL; | 2616 | hlock->class = NULL; |
@@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2441 | { | 2685 | { |
2442 | unsigned long flags; | 2686 | unsigned long flags; |
2443 | 2687 | ||
2688 | if (unlikely(!lock_stat && !prove_locking)) | ||
2689 | return; | ||
2690 | |||
2444 | if (unlikely(current->lockdep_recursion)) | 2691 | if (unlikely(current->lockdep_recursion)) |
2445 | return; | 2692 | return; |
2446 | 2693 | ||
@@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
2460 | { | 2707 | { |
2461 | unsigned long flags; | 2708 | unsigned long flags; |
2462 | 2709 | ||
2710 | if (unlikely(!lock_stat && !prove_locking)) | ||
2711 | return; | ||
2712 | |||
2463 | if (unlikely(current->lockdep_recursion)) | 2713 | if (unlikely(current->lockdep_recursion)) |
2464 | return; | 2714 | return; |
2465 | 2715 | ||
@@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
2473 | 2723 | ||
2474 | EXPORT_SYMBOL_GPL(lock_release); | 2724 | EXPORT_SYMBOL_GPL(lock_release); |
2475 | 2725 | ||
2726 | #ifdef CONFIG_LOCK_STAT | ||
2727 | static int | ||
2728 | print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | ||
2729 | unsigned long ip) | ||
2730 | { | ||
2731 | if (!debug_locks_off()) | ||
2732 | return 0; | ||
2733 | if (debug_locks_silent) | ||
2734 | return 0; | ||
2735 | |||
2736 | printk("\n=================================\n"); | ||
2737 | printk( "[ BUG: bad contention detected! ]\n"); | ||
2738 | printk( "---------------------------------\n"); | ||
2739 | printk("%s/%d is trying to contend lock (", | ||
2740 | curr->comm, curr->pid); | ||
2741 | print_lockdep_cache(lock); | ||
2742 | printk(") at:\n"); | ||
2743 | print_ip_sym(ip); | ||
2744 | printk("but there are no locks held!\n"); | ||
2745 | printk("\nother info that might help us debug this:\n"); | ||
2746 | lockdep_print_held_locks(curr); | ||
2747 | |||
2748 | printk("\nstack backtrace:\n"); | ||
2749 | dump_stack(); | ||
2750 | |||
2751 | return 0; | ||
2752 | } | ||
2753 | |||
2754 | static void | ||
2755 | __lock_contended(struct lockdep_map *lock, unsigned long ip) | ||
2756 | { | ||
2757 | struct task_struct *curr = current; | ||
2758 | struct held_lock *hlock, *prev_hlock; | ||
2759 | struct lock_class_stats *stats; | ||
2760 | unsigned int depth; | ||
2761 | int i, point; | ||
2762 | |||
2763 | depth = curr->lockdep_depth; | ||
2764 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
2765 | return; | ||
2766 | |||
2767 | prev_hlock = NULL; | ||
2768 | for (i = depth-1; i >= 0; i--) { | ||
2769 | hlock = curr->held_locks + i; | ||
2770 | /* | ||
2771 | * We must not cross into another context: | ||
2772 | */ | ||
2773 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
2774 | break; | ||
2775 | if (hlock->instance == lock) | ||
2776 | goto found_it; | ||
2777 | prev_hlock = hlock; | ||
2778 | } | ||
2779 | print_lock_contention_bug(curr, lock, ip); | ||
2780 | return; | ||
2781 | |||
2782 | found_it: | ||
2783 | hlock->waittime_stamp = sched_clock(); | ||
2784 | |||
2785 | point = lock_contention_point(hlock->class, ip); | ||
2786 | |||
2787 | stats = get_lock_stats(hlock->class); | ||
2788 | if (point < ARRAY_SIZE(stats->contention_point)) | ||
2789 | stats->contention_point[point]++; | ||
2790 | if (lock->cpu != smp_processor_id()) | ||
2791 | stats->bounces[bounce_contended + !!hlock->read]++; | ||
2792 | put_lock_stats(stats); | ||
2793 | } | ||
2794 | |||
2795 | static void | ||
2796 | __lock_acquired(struct lockdep_map *lock) | ||
2797 | { | ||
2798 | struct task_struct *curr = current; | ||
2799 | struct held_lock *hlock, *prev_hlock; | ||
2800 | struct lock_class_stats *stats; | ||
2801 | unsigned int depth; | ||
2802 | u64 now; | ||
2803 | s64 waittime = 0; | ||
2804 | int i, cpu; | ||
2805 | |||
2806 | depth = curr->lockdep_depth; | ||
2807 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
2808 | return; | ||
2809 | |||
2810 | prev_hlock = NULL; | ||
2811 | for (i = depth-1; i >= 0; i--) { | ||
2812 | hlock = curr->held_locks + i; | ||
2813 | /* | ||
2814 | * We must not cross into another context: | ||
2815 | */ | ||
2816 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
2817 | break; | ||
2818 | if (hlock->instance == lock) | ||
2819 | goto found_it; | ||
2820 | prev_hlock = hlock; | ||
2821 | } | ||
2822 | print_lock_contention_bug(curr, lock, _RET_IP_); | ||
2823 | return; | ||
2824 | |||
2825 | found_it: | ||
2826 | cpu = smp_processor_id(); | ||
2827 | if (hlock->waittime_stamp) { | ||
2828 | now = sched_clock(); | ||
2829 | waittime = now - hlock->waittime_stamp; | ||
2830 | hlock->holdtime_stamp = now; | ||
2831 | } | ||
2832 | |||
2833 | stats = get_lock_stats(hlock->class); | ||
2834 | if (waittime) { | ||
2835 | if (hlock->read) | ||
2836 | lock_time_inc(&stats->read_waittime, waittime); | ||
2837 | else | ||
2838 | lock_time_inc(&stats->write_waittime, waittime); | ||
2839 | } | ||
2840 | if (lock->cpu != cpu) | ||
2841 | stats->bounces[bounce_acquired + !!hlock->read]++; | ||
2842 | put_lock_stats(stats); | ||
2843 | |||
2844 | lock->cpu = cpu; | ||
2845 | } | ||
2846 | |||
2847 | void lock_contended(struct lockdep_map *lock, unsigned long ip) | ||
2848 | { | ||
2849 | unsigned long flags; | ||
2850 | |||
2851 | if (unlikely(!lock_stat)) | ||
2852 | return; | ||
2853 | |||
2854 | if (unlikely(current->lockdep_recursion)) | ||
2855 | return; | ||
2856 | |||
2857 | raw_local_irq_save(flags); | ||
2858 | check_flags(flags); | ||
2859 | current->lockdep_recursion = 1; | ||
2860 | __lock_contended(lock, ip); | ||
2861 | current->lockdep_recursion = 0; | ||
2862 | raw_local_irq_restore(flags); | ||
2863 | } | ||
2864 | EXPORT_SYMBOL_GPL(lock_contended); | ||
2865 | |||
2866 | void lock_acquired(struct lockdep_map *lock) | ||
2867 | { | ||
2868 | unsigned long flags; | ||
2869 | |||
2870 | if (unlikely(!lock_stat)) | ||
2871 | return; | ||
2872 | |||
2873 | if (unlikely(current->lockdep_recursion)) | ||
2874 | return; | ||
2875 | |||
2876 | raw_local_irq_save(flags); | ||
2877 | check_flags(flags); | ||
2878 | current->lockdep_recursion = 1; | ||
2879 | __lock_acquired(lock); | ||
2880 | current->lockdep_recursion = 0; | ||
2881 | raw_local_irq_restore(flags); | ||
2882 | } | ||
2883 | EXPORT_SYMBOL_GPL(lock_acquired); | ||
2884 | #endif | ||
2885 | |||
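The CONFIG_LOCK_STAT core above measures wait time from the moment __lock_contended() stamps hlock->waittime_stamp until __lock_acquired() runs, restarts the hold-time clock at that point, and bumps a bounce counter whenever the acquiring CPU differs from the CPU last recorded in the lockdep_map. A condensed, standalone sketch of that bookkeeping (the real code keeps separate read/write times and per-callsite contention points; the types below are illustrative):

#include <stdint.h>

/* lock_time and lock_stat_sketch are illustrative, not kernel structures. */
struct lock_time {
        int64_t  min, max, total;
        uint64_t nr;
};

static void lock_time_add(struct lock_time *lt, int64_t t)
{
        if (!lt->nr || t < lt->min)
                lt->min = t;
        if (t > lt->max)
                lt->max = t;
        lt->total += t;
        lt->nr++;
}

struct lock_stat_sketch {
        struct lock_time waittime;
        unsigned long    bounces;
        int              last_cpu;
};

/* Called when the lock is finally obtained; wait_start is the timestamp
 * taken when the task first contended (0 if it never waited). */
static void account_acquired(struct lock_stat_sketch *s, int cpu,
                             uint64_t wait_start, uint64_t now)
{
        if (wait_start)
                lock_time_add(&s->waittime, (int64_t)(now - wait_start));
        if (s->last_cpu != cpu)         /* the lock bounced between CPUs */
                s->bounces++;
        s->last_cpu = cpu;
}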
2476 | /* | 2886 | /* |
2477 | * Used by the testsuite, sanitize the validator state | 2887 | * Used by the testsuite, sanitize the validator state |
2478 | * after a simulated failure: | 2888 | * after a simulated failure: |
@@ -2636,8 +3046,11 @@ void __init lockdep_info(void) | |||
2636 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); | 3046 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); |
2637 | 3047 | ||
2638 | #ifdef CONFIG_DEBUG_LOCKDEP | 3048 | #ifdef CONFIG_DEBUG_LOCKDEP |
2639 | if (lockdep_init_error) | 3049 | if (lockdep_init_error) { |
2640 | printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); | 3050 | printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); |
3051 | printk("Call stack leading to lockdep invocation was:\n"); | ||
3052 | print_stack_trace(&lockdep_init_trace, 0); | ||
3053 | } | ||
2641 | #endif | 3054 | #endif |
2642 | } | 3055 | } |
2643 | 3056 | ||
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 58f35e586ee3..c851b2dcc685 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -5,7 +5,8 @@ | |||
5 | * | 5 | * |
6 | * Started by Ingo Molnar: | 6 | * Started by Ingo Molnar: |
7 | * | 7 | * |
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 8 | * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
9 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
9 | * | 10 | * |
10 | * Code for /proc/lockdep and /proc/lockdep_stats: | 11 | * Code for /proc/lockdep and /proc/lockdep_stats: |
11 | * | 12 | * |
@@ -15,6 +16,10 @@ | |||
15 | #include <linux/seq_file.h> | 16 | #include <linux/seq_file.h> |
16 | #include <linux/kallsyms.h> | 17 | #include <linux/kallsyms.h> |
17 | #include <linux/debug_locks.h> | 18 | #include <linux/debug_locks.h> |
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/sort.h> | ||
21 | #include <asm/uaccess.h> | ||
22 | #include <asm/div64.h> | ||
18 | 23 | ||
19 | #include "lockdep_internals.h" | 24 | #include "lockdep_internals.h" |
20 | 25 | ||
@@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
271 | if (nr_list_entries) | 276 | if (nr_list_entries) |
272 | factor = sum_forward_deps / nr_list_entries; | 277 | factor = sum_forward_deps / nr_list_entries; |
273 | 278 | ||
279 | #ifdef CONFIG_PROVE_LOCKING | ||
274 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | 280 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", |
275 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | 281 | nr_lock_chains, MAX_LOCKDEP_CHAINS); |
282 | #endif | ||
276 | 283 | ||
277 | #ifdef CONFIG_TRACE_IRQFLAGS | 284 | #ifdef CONFIG_TRACE_IRQFLAGS |
278 | seq_printf(m, " in-hardirq chains: %11u\n", | 285 | seq_printf(m, " in-hardirq chains: %11u\n", |
@@ -339,9 +346,295 @@ static const struct file_operations proc_lockdep_stats_operations = { | |||
339 | .open = lockdep_stats_open, | 346 | .open = lockdep_stats_open, |
340 | .read = seq_read, | 347 | .read = seq_read, |
341 | .llseek = seq_lseek, | 348 | .llseek = seq_lseek, |
342 | .release = seq_release, | 349 | .release = single_release, |
350 | }; | ||
351 | |||
352 | #ifdef CONFIG_LOCK_STAT | ||
353 | |||
354 | struct lock_stat_data { | ||
355 | struct lock_class *class; | ||
356 | struct lock_class_stats stats; | ||
357 | }; | ||
358 | |||
359 | struct lock_stat_seq { | ||
360 | struct lock_stat_data *iter; | ||
361 | struct lock_stat_data *iter_end; | ||
362 | struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; | ||
343 | }; | 363 | }; |
344 | 364 | ||
365 | /* | ||
366 | * sort on absolute number of contentions | ||
367 | */ | ||
368 | static int lock_stat_cmp(const void *l, const void *r) | ||
369 | { | ||
370 | const struct lock_stat_data *dl = l, *dr = r; | ||
371 | unsigned long nl, nr; | ||
372 | |||
373 | nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; | ||
374 | nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; | ||
375 | |||
376 | return nr - nl; | ||
377 | } | ||
378 | |||
379 | static void seq_line(struct seq_file *m, char c, int offset, int length) | ||
380 | { | ||
381 | int i; | ||
382 | |||
383 | for (i = 0; i < offset; i++) | ||
384 | seq_puts(m, " "); | ||
385 | for (i = 0; i < length; i++) | ||
386 | seq_printf(m, "%c", c); | ||
387 | seq_puts(m, "\n"); | ||
388 | } | ||
389 | |||
390 | static void snprint_time(char *buf, size_t bufsiz, s64 nr) | ||
391 | { | ||
392 | unsigned long rem; | ||
393 | |||
394 | rem = do_div(nr, 1000); /* XXX: do_div_signed */ | ||
395 | snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); | ||
396 | } | ||
397 | |||
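snprint_time() turns a nanosecond count into a "microseconds.hundredths" string: do_div() leaves the quotient (whole microseconds) in nr and returns the sub-microsecond remainder, which is then rounded to hundredths. A plain-C rendering of the same arithmetic; for example, 1234567 ns comes out as "1234.57":

#include <stdio.h>

/* Plain division stands in for do_div(); 1234567 ns prints as "1234.57". */
static void snprint_time_sketch(char *buf, size_t bufsiz, long long ns)
{
        long long us  = ns / 1000;          /* whole microseconds */
        int       rem = (int)(ns % 1000);   /* leftover nanoseconds */

        snprintf(buf, bufsiz, "%lld.%02d", us, (rem + 5) / 10);
}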
398 | static void seq_time(struct seq_file *m, s64 time) | ||
399 | { | ||
400 | char num[15]; | ||
401 | |||
402 | snprint_time(num, sizeof(num), time); | ||
403 | seq_printf(m, " %14s", num); | ||
404 | } | ||
405 | |||
406 | static void seq_lock_time(struct seq_file *m, struct lock_time *lt) | ||
407 | { | ||
408 | seq_printf(m, "%14lu", lt->nr); | ||
409 | seq_time(m, lt->min); | ||
410 | seq_time(m, lt->max); | ||
411 | seq_time(m, lt->total); | ||
412 | } | ||
413 | |||
414 | static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | ||
415 | { | ||
416 | char name[39]; | ||
417 | struct lock_class *class; | ||
418 | struct lock_class_stats *stats; | ||
419 | int i, namelen; | ||
420 | |||
421 | class = data->class; | ||
422 | stats = &data->stats; | ||
423 | |||
424 | namelen = 38; | ||
425 | if (class->name_version > 1) | ||
426 | namelen -= 2; /* XXX truncates versions > 9 */ | ||
427 | if (class->subclass) | ||
428 | namelen -= 2; | ||
429 | |||
430 | if (!class->name) { | ||
431 | char str[KSYM_NAME_LEN]; | ||
432 | const char *key_name; | ||
433 | |||
434 | key_name = __get_key_name(class->key, str); | ||
435 | snprintf(name, namelen, "%s", key_name); | ||
436 | } else { | ||
437 | snprintf(name, namelen, "%s", class->name); | ||
438 | } | ||
439 | namelen = strlen(name); | ||
440 | if (class->name_version > 1) { | ||
441 | snprintf(name+namelen, 3, "#%d", class->name_version); | ||
442 | namelen += 2; | ||
443 | } | ||
444 | if (class->subclass) { | ||
445 | snprintf(name+namelen, 3, "/%d", class->subclass); | ||
446 | namelen += 2; | ||
447 | } | ||
448 | |||
449 | if (stats->write_holdtime.nr) { | ||
450 | if (stats->read_holdtime.nr) | ||
451 | seq_printf(m, "%38s-W:", name); | ||
452 | else | ||
453 | seq_printf(m, "%40s:", name); | ||
454 | |||
455 | seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); | ||
456 | seq_lock_time(m, &stats->write_waittime); | ||
457 | seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); | ||
458 | seq_lock_time(m, &stats->write_holdtime); | ||
459 | seq_puts(m, "\n"); | ||
460 | } | ||
461 | |||
462 | if (stats->read_holdtime.nr) { | ||
463 | seq_printf(m, "%38s-R:", name); | ||
464 | seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); | ||
465 | seq_lock_time(m, &stats->read_waittime); | ||
466 | seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); | ||
467 | seq_lock_time(m, &stats->read_holdtime); | ||
468 | seq_puts(m, "\n"); | ||
469 | } | ||
470 | |||
471 | if (stats->read_waittime.nr + stats->write_waittime.nr == 0) | ||
472 | return; | ||
473 | |||
474 | if (stats->read_holdtime.nr) | ||
475 | namelen += 2; | ||
476 | |||
477 | for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { | ||
478 | char sym[KSYM_SYMBOL_LEN]; | ||
479 | char ip[32]; | ||
480 | |||
481 | if (class->contention_point[i] == 0) | ||
482 | break; | ||
483 | |||
484 | if (!i) | ||
485 | seq_line(m, '-', 40-namelen, namelen); | ||
486 | |||
487 | sprint_symbol(sym, class->contention_point[i]); | ||
488 | snprintf(ip, sizeof(ip), "[<%p>]", | ||
489 | (void *)class->contention_point[i]); | ||
490 | seq_printf(m, "%40s %14lu %29s %s\n", name, | ||
491 | stats->contention_point[i], | ||
492 | ip, sym); | ||
493 | } | ||
494 | if (i) { | ||
495 | seq_puts(m, "\n"); | ||
496 | seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); | ||
497 | seq_puts(m, "\n"); | ||
498 | } | ||
499 | } | ||
500 | |||
501 | static void seq_header(struct seq_file *m) | ||
502 | { | ||
503 | seq_printf(m, "lock_stat version 0.2\n"); | ||
504 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | ||
505 | seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " | ||
506 | "%14s %14s\n", | ||
507 | "class name", | ||
508 | "con-bounces", | ||
509 | "contentions", | ||
510 | "waittime-min", | ||
511 | "waittime-max", | ||
512 | "waittime-total", | ||
513 | "acq-bounces", | ||
514 | "acquisitions", | ||
515 | "holdtime-min", | ||
516 | "holdtime-max", | ||
517 | "holdtime-total"); | ||
518 | seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); | ||
519 | seq_printf(m, "\n"); | ||
520 | } | ||
521 | |||
522 | static void *ls_start(struct seq_file *m, loff_t *pos) | ||
523 | { | ||
524 | struct lock_stat_seq *data = m->private; | ||
525 | |||
526 | if (data->iter == data->stats) | ||
527 | seq_header(m); | ||
528 | |||
529 | if (data->iter == data->iter_end) | ||
530 | data->iter = NULL; | ||
531 | |||
532 | return data->iter; | ||
533 | } | ||
534 | |||
535 | static void *ls_next(struct seq_file *m, void *v, loff_t *pos) | ||
536 | { | ||
537 | struct lock_stat_seq *data = m->private; | ||
538 | |||
539 | (*pos)++; | ||
540 | |||
541 | data->iter = v; | ||
542 | data->iter++; | ||
543 | if (data->iter == data->iter_end) | ||
544 | data->iter = NULL; | ||
545 | |||
546 | return data->iter; | ||
547 | } | ||
548 | |||
549 | static void ls_stop(struct seq_file *m, void *v) | ||
550 | { | ||
551 | } | ||
552 | |||
553 | static int ls_show(struct seq_file *m, void *v) | ||
554 | { | ||
555 | struct lock_stat_seq *data = m->private; | ||
556 | |||
557 | seq_stats(m, data->iter); | ||
558 | return 0; | ||
559 | } | ||
560 | |||
561 | static struct seq_operations lockstat_ops = { | ||
562 | .start = ls_start, | ||
563 | .next = ls_next, | ||
564 | .stop = ls_stop, | ||
565 | .show = ls_show, | ||
566 | }; | ||
567 | |||
568 | static int lock_stat_open(struct inode *inode, struct file *file) | ||
569 | { | ||
570 | int res; | ||
571 | struct lock_class *class; | ||
572 | struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); | ||
573 | |||
574 | if (!data) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | res = seq_open(file, &lockstat_ops); | ||
578 | if (!res) { | ||
579 | struct lock_stat_data *iter = data->stats; | ||
580 | struct seq_file *m = file->private_data; | ||
581 | |||
582 | data->iter = iter; | ||
583 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
584 | iter->class = class; | ||
585 | iter->stats = lock_stats(class); | ||
586 | iter++; | ||
587 | } | ||
588 | data->iter_end = iter; | ||
589 | |||
590 | sort(data->stats, data->iter_end - data->iter, | ||
591 | sizeof(struct lock_stat_data), | ||
592 | lock_stat_cmp, NULL); | ||
593 | |||
594 | m->private = data; | ||
595 | } else | ||
596 | vfree(data); | ||
597 | |||
598 | return res; | ||
599 | } | ||
600 | |||
601 | static ssize_t lock_stat_write(struct file *file, const char __user *buf, | ||
602 | size_t count, loff_t *ppos) | ||
603 | { | ||
604 | struct lock_class *class; | ||
605 | char c; | ||
606 | |||
607 | if (count) { | ||
608 | if (get_user(c, buf)) | ||
609 | return -EFAULT; | ||
610 | |||
611 | if (c != '0') | ||
612 | return count; | ||
613 | |||
614 | list_for_each_entry(class, &all_lock_classes, lock_entry) | ||
615 | clear_lock_stats(class); | ||
616 | } | ||
617 | return count; | ||
618 | } | ||
619 | |||
620 | static int lock_stat_release(struct inode *inode, struct file *file) | ||
621 | { | ||
622 | struct seq_file *seq = file->private_data; | ||
623 | |||
624 | vfree(seq->private); | ||
625 | seq->private = NULL; | ||
626 | return seq_release(inode, file); | ||
627 | } | ||
628 | |||
629 | static const struct file_operations proc_lock_stat_operations = { | ||
630 | .open = lock_stat_open, | ||
631 | .write = lock_stat_write, | ||
632 | .read = seq_read, | ||
633 | .llseek = seq_lseek, | ||
634 | .release = lock_stat_release, | ||
635 | }; | ||
636 | #endif /* CONFIG_LOCK_STAT */ | ||
637 | |||
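With the file operations above, /proc/lock_stat can be read to dump the per-class statistics sorted by contention count, and writing the character '0' to it clears them via lock_stat_write(). A small userspace sketch of the clear operation, assuming sufficient privileges to open the file for writing:

#include <stdio.h>

/* Clear the lock_stat counters, mirroring the '0' check in lock_stat_write(). */
int main(void)
{
        FILE *f = fopen("/proc/lock_stat", "w");

        if (!f) {
                perror("/proc/lock_stat");
                return 1;
        }
        fputc('0', f);
        fclose(f);
        return 0;
}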
345 | static int __init lockdep_proc_init(void) | 638 | static int __init lockdep_proc_init(void) |
346 | { | 639 | { |
347 | struct proc_dir_entry *entry; | 640 | struct proc_dir_entry *entry; |
@@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void) | |||
354 | if (entry) | 647 | if (entry) |
355 | entry->proc_fops = &proc_lockdep_stats_operations; | 648 | entry->proc_fops = &proc_lockdep_stats_operations; |
356 | 649 | ||
650 | #ifdef CONFIG_LOCK_STAT | ||
651 | entry = create_proc_entry("lock_stat", S_IRUSR, NULL); | ||
652 | if (entry) | ||
653 | entry->proc_fops = &proc_lock_stat_operations; | ||
654 | #endif | ||
655 | |||
357 | return 0; | 656 | return 0; |
358 | } | 657 | } |
359 | 658 | ||
diff --git a/kernel/module.c b/kernel/module.c index 33c04ad51175..db0ead0363e2 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); | |||
784 | static ssize_t show_refcnt(struct module_attribute *mattr, | 784 | static ssize_t show_refcnt(struct module_attribute *mattr, |
785 | struct module *mod, char *buffer) | 785 | struct module *mod, char *buffer) |
786 | { | 786 | { |
787 | /* sysfs holds a reference */ | 787 | return sprintf(buffer, "%u\n", module_refcount(mod)); |
788 | return sprintf(buffer, "%u\n", module_refcount(mod)-1); | ||
789 | } | 788 | } |
790 | 789 | ||
791 | static struct module_attribute refcnt = { | 790 | static struct module_attribute refcnt = { |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 303eab18484b..691b86564dd9 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
139 | list_add_tail(&waiter.list, &lock->wait_list); | 139 | list_add_tail(&waiter.list, &lock->wait_list); |
140 | waiter.task = task; | 140 | waiter.task = task; |
141 | 141 | ||
142 | old_val = atomic_xchg(&lock->count, -1); | ||
143 | if (old_val == 1) | ||
144 | goto done; | ||
145 | |||
146 | lock_contended(&lock->dep_map, _RET_IP_); | ||
147 | |||
142 | for (;;) { | 148 | for (;;) { |
143 | /* | 149 | /* |
144 | * Lets try to take the lock again - this is needed even if | 150 | * Lets try to take the lock again - this is needed even if |
@@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) | |||
174 | spin_lock_mutex(&lock->wait_lock, flags); | 180 | spin_lock_mutex(&lock->wait_lock, flags); |
175 | } | 181 | } |
176 | 182 | ||
183 | done: | ||
184 | lock_acquired(&lock->dep_map); | ||
177 | /* got the lock - rejoice! */ | 185 | /* got the lock - rejoice! */ |
178 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 186 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); |
179 | debug_mutex_set_owner(lock, task_thread_info(task)); | 187 | debug_mutex_set_owner(lock, task_thread_info(task)); |
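The mutex change places the two statistics hooks in the sleeping slow path: lock_contended() runs once, just before the task starts waiting (it is skipped entirely when the new atomic_xchg() grabs the lock and jumps to done:), and lock_acquired() runs when the mutex is finally obtained, closing the wait-time window. A standalone sketch of that hook placement; every name below is a placeholder rather than a kernel symbol:

struct sketch_lock { int count; };

extern int  try_grab(struct sketch_lock *l);        /* ~ the atomic_xchg() attempt */
extern void wait_for_release(struct sketch_lock *l);
extern void hook_contended(struct sketch_lock *l);  /* ~ lock_contended() */
extern void hook_acquired(struct sketch_lock *l);   /* ~ lock_acquired()  */

static void slow_lock(struct sketch_lock *l)
{
        if (try_grab(l))                /* mirrors the new early exit to done: */
                goto done;

        hook_contended(l);              /* about to sleep: wait time starts */
        do {
                wait_for_release(l);
        } while (!try_grab(l));
done:
        hook_acquired(l);               /* lock obtained: wait time ends */
}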
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 10f0bbba382b..f1decd21a534 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/mnt_namespace.h> | 20 | #include <linux/mnt_namespace.h> |
21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <net/net_namespace.h> | ||
23 | 24 | ||
24 | static struct kmem_cache *nsproxy_cachep; | 25 | static struct kmem_cache *nsproxy_cachep; |
25 | 26 | ||
@@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
98 | goto out_user; | 99 | goto out_user; |
99 | } | 100 | } |
100 | 101 | ||
102 | new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); | ||
103 | if (IS_ERR(new_nsp->net_ns)) { | ||
104 | err = PTR_ERR(new_nsp->net_ns); | ||
105 | goto out_net; | ||
106 | } | ||
107 | |||
101 | return new_nsp; | 108 | return new_nsp; |
102 | 109 | ||
110 | out_net: | ||
111 | if (new_nsp->user_ns) | ||
112 | put_user_ns(new_nsp->user_ns); | ||
103 | out_user: | 113 | out_user: |
104 | if (new_nsp->pid_ns) | 114 | if (new_nsp->pid_ns) |
105 | put_pid_ns(new_nsp->pid_ns); | 115 | put_pid_ns(new_nsp->pid_ns); |
@@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
132 | 142 | ||
133 | get_nsproxy(old_ns); | 143 | get_nsproxy(old_ns); |
134 | 144 | ||
135 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) | 145 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) |
136 | return 0; | 146 | return 0; |
137 | 147 | ||
138 | if (!capable(CAP_SYS_ADMIN)) { | 148 | if (!capable(CAP_SYS_ADMIN)) { |
@@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns) | |||
164 | put_pid_ns(ns->pid_ns); | 174 | put_pid_ns(ns->pid_ns); |
165 | if (ns->user_ns) | 175 | if (ns->user_ns) |
166 | put_user_ns(ns->user_ns); | 176 | put_user_ns(ns->user_ns); |
177 | put_net(ns->net_ns); | ||
167 | kmem_cache_free(nsproxy_cachep, ns); | 178 | kmem_cache_free(nsproxy_cachep, ns); |
168 | } | 179 | } |
169 | 180 | ||
@@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
177 | int err = 0; | 188 | int err = 0; |
178 | 189 | ||
179 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | 190 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
180 | CLONE_NEWUSER))) | 191 | CLONE_NEWUSER | CLONE_NEWNET))) |
181 | return 0; | 192 | return 0; |
182 | 193 | ||
183 | if (!capable(CAP_SYS_ADMIN)) | 194 | if (!capable(CAP_SYS_ADMIN)) |
@@ -193,7 +204,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
193 | static int __init nsproxy_cache_init(void) | 204 | static int __init nsproxy_cache_init(void) |
194 | { | 205 | { |
195 | nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), | 206 | nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), |
196 | 0, SLAB_PANIC, NULL, NULL); | 207 | 0, SLAB_PANIC, NULL); |
197 | return 0; | 208 | return 0; |
198 | } | 209 | } |
199 | 210 | ||
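The nsproxy changes wire CLONE_NEWNET into copy_namespaces() and unshare_nsproxy_namespaces(), so a sufficiently privileged task can ask for a private network namespace. A minimal userspace illustration, assuming libc headers that define CLONE_NEWNET and a kernel built with the network-namespace support being merged here:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        if (unshare(CLONE_NEWNET) != 0) {       /* needs CAP_SYS_ADMIN */
                perror("unshare(CLONE_NEWNET)");
                return 1;
        }
        puts("now running in a private network namespace");
        return 0;
}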
diff --git a/kernel/params.c b/kernel/params.c index effbaaedd7f3..4e57732fcfb4 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -567,7 +567,12 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
567 | kobject_set_name(&mk->kobj, name); | 567 | kobject_set_name(&mk->kobj, name); |
568 | kobject_init(&mk->kobj); | 568 | kobject_init(&mk->kobj); |
569 | ret = kobject_add(&mk->kobj); | 569 | ret = kobject_add(&mk->kobj); |
570 | BUG_ON(ret < 0); | 570 | if (ret) { |
571 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " | ||
572 | "error number %d\n", name, ret); | ||
573 | printk(KERN_ERR "The system will be unstable now.\n"); | ||
574 | return; | ||
575 | } | ||
571 | param_sysfs_setup(mk, kparam, num_params, name_skip); | 576 | param_sysfs_setup(mk, kparam, num_params, name_skip); |
572 | kobject_uevent(&mk->kobj, KOBJ_ADD); | 577 | kobject_uevent(&mk->kobj, KOBJ_ADD); |
573 | } | 578 | } |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 329ce0172074..7a15afb73ed0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void) | |||
241 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | 241 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); |
242 | 242 | ||
243 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 243 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
244 | sizeof (struct k_itimer), 0, 0, NULL, NULL); | 244 | sizeof (struct k_itimer), 0, 0, NULL); |
245 | idr_init(&posix_timers_id); | 245 | idr_init(&posix_timers_id); |
246 | return 0; | 246 | return 0; |
247 | } | 247 | } |
@@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock, | |||
547 | new_timer->it_process = process; | 547 | new_timer->it_process = process; |
548 | list_add(&new_timer->list, | 548 | list_add(&new_timer->list, |
549 | &process->signal->posix_timers); | 549 | &process->signal->posix_timers); |
550 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | ||
551 | if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) | 550 | if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) |
552 | get_task_struct(process); | 551 | get_task_struct(process); |
552 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | ||
553 | } else { | 553 | } else { |
554 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | 554 | spin_unlock_irqrestore(&process->sighand->siglock, flags); |
555 | process = NULL; | 555 | process = NULL; |
@@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) | |||
605 | timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); | 605 | timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); |
606 | if (timr) { | 606 | if (timr) { |
607 | spin_lock(&timr->it_lock); | 607 | spin_lock(&timr->it_lock); |
608 | spin_unlock(&idr_lock); | ||
609 | 608 | ||
610 | if ((timr->it_id != timer_id) || !(timr->it_process) || | 609 | if ((timr->it_id != timer_id) || !(timr->it_process) || |
611 | timr->it_process->tgid != current->tgid) { | 610 | timr->it_process->tgid != current->tgid) { |
612 | unlock_timer(timr, *flags); | 611 | spin_unlock(&timr->it_lock); |
612 | spin_unlock_irqrestore(&idr_lock, *flags); | ||
613 | timr = NULL; | 613 | timr = NULL; |
614 | } | 614 | } else |
615 | spin_unlock(&idr_lock); | ||
615 | } else | 616 | } else |
616 | spin_unlock_irqrestore(&idr_lock, *flags); | 617 | spin_unlock_irqrestore(&idr_lock, *flags); |
617 | 618 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 495b7d4dd330..14b0e10dc95c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -33,13 +33,20 @@ config PM_DEBUG | |||
33 | bool "Power Management Debug Support" | 33 | bool "Power Management Debug Support" |
34 | depends on PM | 34 | depends on PM |
35 | ---help--- | 35 | ---help--- |
36 | This option enables verbose debugging support in the Power Management | 36 | This option enables various debugging support in the Power Management |
37 | code. This is helpful when debugging and reporting various PM bugs, | 37 | code. This is helpful when debugging and reporting PM bugs, like |
38 | like suspend support. | 38 | suspend support. |
39 | |||
40 | config PM_VERBOSE | ||
41 | bool "Verbose Power Management debugging" | ||
42 | depends on PM_DEBUG | ||
43 | default n | ||
44 | ---help--- | ||
45 | This option enables verbose messages from the Power Management code. | ||
39 | 46 | ||
40 | config DISABLE_CONSOLE_SUSPEND | 47 | config DISABLE_CONSOLE_SUSPEND |
41 | bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" | 48 | bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" |
42 | depends on PM && PM_DEBUG | 49 | depends on PM_DEBUG && PM_SLEEP |
43 | default n | 50 | default n |
44 | ---help--- | 51 | ---help--- |
45 | This option turns off the console suspend mechanism that prevents | 52 | This option turns off the console suspend mechanism that prevents |
@@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND | |||
50 | 57 | ||
51 | config PM_TRACE | 58 | config PM_TRACE |
52 | bool "Suspend/resume event tracing" | 59 | bool "Suspend/resume event tracing" |
53 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL | 60 | depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL |
54 | default n | 61 | default n |
55 | ---help--- | 62 | ---help--- |
56 | This enables some cheesy code to save the last PM event point in the | 63 | This enables some cheesy code to save the last PM event point in the |
@@ -65,21 +72,58 @@ config PM_TRACE | |||
65 | CAUTION: this option will cause your machine's real-time clock to be | 72 | CAUTION: this option will cause your machine's real-time clock to be |
66 | set to an invalid time after a resume. | 73 | set to an invalid time after a resume. |
67 | 74 | ||
68 | config PM_SYSFS_DEPRECATED | 75 | config PM_SLEEP_SMP |
69 | bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" | 76 | bool |
70 | depends on PM && SYSFS | 77 | depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE |
71 | default n | 78 | depends on PM_SLEEP |
72 | help | 79 | select HOTPLUG_CPU |
73 | The driver model started out with a sysfs file intended to provide | 80 | default y |
74 | a userspace hook for device power management. This feature has never | 81 | |
75 | worked very well, except for limited testing purposes, and so it will | 82 | config PM_SLEEP |
76 | be removed. It's not clear that a generic mechanism could really | 83 | bool |
77 | handle the wide variability of device power states; any replacements | 84 | depends on SUSPEND || HIBERNATION |
78 | are likely to be bus or driver specific. | 85 | default y |
79 | 86 | ||
80 | config SOFTWARE_SUSPEND | 87 | config SUSPEND_UP_POSSIBLE |
81 | bool "Software Suspend (Hibernation)" | 88 | bool |
82 | depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) | 89 | depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ |
90 | || SUPERH || FRV | ||
91 | depends on !SMP | ||
92 | default y | ||
93 | |||
94 | config SUSPEND_SMP_POSSIBLE | ||
95 | bool | ||
96 | depends on (X86 && !X86_VOYAGER) \ | ||
97 | || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM | ||
98 | depends on SMP | ||
99 | default y | ||
100 | |||
101 | config SUSPEND | ||
102 | bool "Suspend to RAM and standby" | ||
103 | depends on PM | ||
104 | depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE | ||
105 | default y | ||
106 | ---help--- | ||
107 | Allow the system to enter sleep states in which main memory is | ||
108 | powered and thus its contents are preserved, such as the | ||
109 | suspend-to-RAM state (i.e. the ACPI S3 state). | ||
110 | |||
111 | config HIBERNATION_UP_POSSIBLE | ||
112 | bool | ||
113 | depends on X86 || PPC64_SWSUSP || PPC32 | ||
114 | depends on !SMP | ||
115 | default y | ||
116 | |||
117 | config HIBERNATION_SMP_POSSIBLE | ||
118 | bool | ||
119 | depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP | ||
120 | depends on SMP | ||
121 | default y | ||
122 | |||
123 | config HIBERNATION | ||
124 | bool "Hibernation (aka 'suspend to disk')" | ||
125 | depends on PM && SWAP | ||
126 | depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE | ||
83 | ---help--- | 127 | ---help--- |
84 | Enable the suspend to disk (STD) functionality, which is usually | 128 | Enable the suspend to disk (STD) functionality, which is usually |
85 | called "hibernation" in user interfaces. STD checkpoints the | 129 | called "hibernation" in user interfaces. STD checkpoints the |
@@ -117,7 +161,7 @@ config SOFTWARE_SUSPEND | |||
117 | 161 | ||
118 | config PM_STD_PARTITION | 162 | config PM_STD_PARTITION |
119 | string "Default resume partition" | 163 | string "Default resume partition" |
120 | depends on SOFTWARE_SUSPEND | 164 | depends on HIBERNATION |
121 | default "" | 165 | default "" |
122 | ---help--- | 166 | ---help--- |
123 | The default resume partition is the partition that the suspend- | 167 | The default resume partition is the partition that the suspend- |
@@ -137,11 +181,6 @@ config PM_STD_PARTITION | |||
137 | suspended image to. It will simply pick the first available swap | 181 | suspended image to. It will simply pick the first available swap |
138 | device. | 182 | device. |
139 | 183 | ||
140 | config SUSPEND_SMP | ||
141 | bool | ||
142 | depends on HOTPLUG_CPU && (X86 || PPC64) && PM | ||
143 | default y | ||
144 | |||
145 | config APM_EMULATION | 184 | config APM_EMULATION |
146 | tristate "Advanced Power Management Emulation" | 185 | tristate "Advanced Power Management Emulation" |
147 | depends on PM && SYS_SUPPORTS_APM_EMULATION | 186 | depends on PM && SYS_SUPPORTS_APM_EMULATION |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 38725f526afc..f7dfff28ecdb 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) | |||
3 | EXTRA_CFLAGS += -DDEBUG | 3 | EXTRA_CFLAGS += -DDEBUG |
4 | endif | 4 | endif |
5 | 5 | ||
6 | obj-y := main.o process.o console.o | 6 | obj-y := main.o |
7 | obj-$(CONFIG_PM_LEGACY) += pm.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o | 8 | obj-$(CONFIG_PM_SLEEP) += process.o console.o |
9 | obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o | ||
9 | 10 | ||
10 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f445b9cd60fb..eb72255b5c86 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -45,7 +45,7 @@ enum { | |||
45 | 45 | ||
46 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 46 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
47 | 47 | ||
48 | struct hibernation_ops *hibernation_ops; | 48 | static struct hibernation_ops *hibernation_ops; |
49 | 49 | ||
50 | /** | 50 | /** |
51 | * hibernation_set_ops - set the global hibernate operations | 51 | * hibernation_set_ops - set the global hibernate operations |
@@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops; | |||
54 | 54 | ||
55 | void hibernation_set_ops(struct hibernation_ops *ops) | 55 | void hibernation_set_ops(struct hibernation_ops *ops) |
56 | { | 56 | { |
57 | if (ops && !(ops->prepare && ops->enter && ops->finish)) { | 57 | if (ops && !(ops->prepare && ops->enter && ops->finish |
58 | && ops->pre_restore && ops->restore_cleanup)) { | ||
58 | WARN_ON(1); | 59 | WARN_ON(1); |
59 | return; | 60 | return; |
60 | } | 61 | } |
@@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops) | |||
74 | * platform driver if so configured and return an error code if it fails | 75 | * platform driver if so configured and return an error code if it fails |
75 | */ | 76 | */ |
76 | 77 | ||
77 | static int platform_prepare(void) | 78 | static int platform_prepare(int platform_mode) |
78 | { | 79 | { |
79 | return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? | 80 | return (platform_mode && hibernation_ops) ? |
80 | hibernation_ops->prepare() : 0; | 81 | hibernation_ops->prepare() : 0; |
81 | } | 82 | } |
82 | 83 | ||
@@ -85,13 +86,146 @@ static int platform_prepare(void) | |||
85 | * using the platform driver (must be called after platform_prepare()) | 86 | * using the platform driver (must be called after platform_prepare()) |
86 | */ | 87 | */ |
87 | 88 | ||
88 | static void platform_finish(void) | 89 | static void platform_finish(int platform_mode) |
89 | { | 90 | { |
90 | if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) | 91 | if (platform_mode && hibernation_ops) |
91 | hibernation_ops->finish(); | 92 | hibernation_ops->finish(); |
92 | } | 93 | } |
93 | 94 | ||
94 | /** | 95 | /** |
96 | * platform_pre_restore - prepare the platform for the restoration from a | ||
97 | * hibernation image. If the restore fails after this function has been | ||
98 | * called, platform_restore_cleanup() must be called. | ||
99 | */ | ||
100 | |||
101 | static int platform_pre_restore(int platform_mode) | ||
102 | { | ||
103 | return (platform_mode && hibernation_ops) ? | ||
104 | hibernation_ops->pre_restore() : 0; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * platform_restore_cleanup - switch the platform to the normal mode of | ||
109 | * operation after a failing restore. If platform_pre_restore() has been | ||
110 | * called before the failing restore, this function must be called too, | ||
111 | * regardless of the result of platform_pre_restore(). | ||
112 | */ | ||
113 | |||
114 | static void platform_restore_cleanup(int platform_mode) | ||
115 | { | ||
116 | if (platform_mode && hibernation_ops) | ||
117 | hibernation_ops->restore_cleanup(); | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * hibernation_snapshot - quiesce devices and create the hibernation | ||
122 | * snapshot image. | ||
123 | * @platform_mode - if set, use the platform driver, if available, to | ||
124 | * prepare the platform frimware for the power transition. | ||
125 | * | ||
126 | * Must be called with pm_mutex held | ||
127 | */ | ||
128 | |||
129 | int hibernation_snapshot(int platform_mode) | ||
130 | { | ||
131 | int error; | ||
132 | |||
133 | /* Free memory before shutting down devices. */ | ||
134 | error = swsusp_shrink_memory(); | ||
135 | if (error) | ||
136 | return error; | ||
137 | |||
138 | suspend_console(); | ||
139 | error = device_suspend(PMSG_FREEZE); | ||
140 | if (error) | ||
141 | goto Resume_console; | ||
142 | |||
143 | error = platform_prepare(platform_mode); | ||
144 | if (error) | ||
145 | goto Resume_devices; | ||
146 | |||
147 | error = disable_nonboot_cpus(); | ||
148 | if (!error) { | ||
149 | if (hibernation_mode != HIBERNATION_TEST) { | ||
150 | in_suspend = 1; | ||
151 | error = swsusp_suspend(); | ||
152 | /* Control returns here after successful restore */ | ||
153 | } else { | ||
154 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
155 | mdelay(5000); | ||
156 | } | ||
157 | } | ||
158 | enable_nonboot_cpus(); | ||
159 | Resume_devices: | ||
160 | platform_finish(platform_mode); | ||
161 | device_resume(); | ||
162 | Resume_console: | ||
163 | resume_console(); | ||
164 | return error; | ||
165 | } | ||
166 | |||
167 | /** | ||
168 | * hibernation_restore - quiesce devices and restore the hibernation | ||
169 | * snapshot image. If successful, control returns in hibernation_snapshot() | ||
170 | * @platform_mode - if set, use the platform driver, if available, to | ||
171 | * prepare the platform firmware for the transition. | ||
172 | * | ||
173 | * Must be called with pm_mutex held | ||
174 | */ | ||
175 | |||
176 | int hibernation_restore(int platform_mode) | ||
177 | { | ||
178 | int error; | ||
179 | |||
180 | pm_prepare_console(); | ||
181 | suspend_console(); | ||
182 | error = device_suspend(PMSG_PRETHAW); | ||
183 | if (error) | ||
184 | goto Finish; | ||
185 | |||
186 | error = platform_pre_restore(platform_mode); | ||
187 | if (!error) { | ||
188 | error = disable_nonboot_cpus(); | ||
189 | if (!error) | ||
190 | error = swsusp_resume(); | ||
191 | enable_nonboot_cpus(); | ||
192 | } | ||
193 | platform_restore_cleanup(platform_mode); | ||
194 | device_resume(); | ||
195 | Finish: | ||
196 | resume_console(); | ||
197 | pm_restore_console(); | ||
198 | return error; | ||
199 | } | ||
200 | |||
201 | /** | ||
202 | * hibernation_platform_enter - enter the hibernation state using the | ||
203 | * platform driver (if available) | ||
204 | */ | ||
205 | |||
206 | int hibernation_platform_enter(void) | ||
207 | { | ||
208 | int error; | ||
209 | |||
210 | if (hibernation_ops) { | ||
211 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
212 | /* | ||
213 | * We have cancelled the power transition by running | ||
214 | * hibernation_ops->finish() before saving the image, so we | ||
215 | * should let the firmware know that we're going to enter the | ||
216 | * sleep state after all | ||
217 | */ | ||
218 | error = hibernation_ops->prepare(); | ||
219 | sysdev_shutdown(); | ||
220 | if (!error) | ||
221 | error = hibernation_ops->enter(); | ||
222 | } else { | ||
223 | error = -ENOSYS; | ||
224 | } | ||
225 | return error; | ||
226 | } | ||
227 | |||
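hibernation_snapshot() and hibernation_restore() above use the kernel's usual label-based unwinding: each setup step that fails jumps to the label that undoes only the steps which already succeeded, and the success path falls through the same labels. A generic sketch of that structure with placeholder step names (the disable/enable_nonboot_cpus() bracketing is omitted for brevity):

/* Every step function below is a placeholder for the real calls above. */
extern void suspend_console_step(void);
extern int  suspend_devices_step(void);
extern int  platform_prepare_step(void);
extern int  take_snapshot_step(void);
extern void platform_finish_step(void);
extern void resume_devices_step(void);
extern void resume_console_step(void);

static int snapshot_sketch(void)
{
        int error;

        suspend_console_step();

        error = suspend_devices_step();
        if (error)
                goto resume_console;

        error = platform_prepare_step();
        if (error)
                goto resume_devices;

        error = take_snapshot_step();
        /* success and failure both fall through the unwind labels */

resume_devices:
        platform_finish_step();
        resume_devices_step();
resume_console:
        resume_console_step();
        return error;
}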
228 | /** | ||
95 | * power_down - Shut the machine down for hibernation. | 229 | * power_down - Shut the machine down for hibernation. |
96 | * | 230 | * |
97 | * Use the platform driver, if configured so; otherwise try | 231 | * Use the platform driver, if configured so; otherwise try |
@@ -111,11 +245,7 @@ static void power_down(void) | |||
111 | kernel_restart(NULL); | 245 | kernel_restart(NULL); |
112 | break; | 246 | break; |
113 | case HIBERNATION_PLATFORM: | 247 | case HIBERNATION_PLATFORM: |
114 | if (hibernation_ops) { | 248 | hibernation_platform_enter(); |
115 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
116 | hibernation_ops->enter(); | ||
117 | break; | ||
118 | } | ||
119 | } | 249 | } |
120 | kernel_halt(); | 250 | kernel_halt(); |
121 | /* | 251 | /* |
@@ -152,9 +282,16 @@ int hibernate(void) | |||
152 | { | 282 | { |
153 | int error; | 283 | int error; |
154 | 284 | ||
285 | mutex_lock(&pm_mutex); | ||
155 | /* The snapshot device should not be opened while we're running */ | 286 | /* The snapshot device should not be opened while we're running */ |
156 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) | 287 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
157 | return -EBUSY; | 288 | error = -EBUSY; |
289 | goto Unlock; | ||
290 | } | ||
291 | |||
292 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | ||
293 | if (error) | ||
294 | goto Exit; | ||
158 | 295 | ||
159 | /* Allocate memory management structures */ | 296 | /* Allocate memory management structures */ |
160 | error = create_basic_memory_bitmaps(); | 297 | error = create_basic_memory_bitmaps(); |
@@ -165,75 +302,35 @@ int hibernate(void) | |||
165 | if (error) | 302 | if (error) |
166 | goto Finish; | 303 | goto Finish; |
167 | 304 | ||
168 | mutex_lock(&pm_mutex); | ||
169 | if (hibernation_mode == HIBERNATION_TESTPROC) { | 305 | if (hibernation_mode == HIBERNATION_TESTPROC) { |
170 | printk("swsusp debug: Waiting for 5 seconds.\n"); | 306 | printk("swsusp debug: Waiting for 5 seconds.\n"); |
171 | mdelay(5000); | 307 | mdelay(5000); |
172 | goto Thaw; | 308 | goto Thaw; |
173 | } | 309 | } |
310 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | ||
311 | if (in_suspend && !error) { | ||
312 | unsigned int flags = 0; | ||
174 | 313 | ||
175 | /* Free memory before shutting down devices. */ | 314 | if (hibernation_mode == HIBERNATION_PLATFORM) |
176 | error = swsusp_shrink_memory(); | 315 | flags |= SF_PLATFORM_MODE; |
177 | if (error) | ||
178 | goto Thaw; | ||
179 | |||
180 | error = platform_prepare(); | ||
181 | if (error) | ||
182 | goto Thaw; | ||
183 | |||
184 | suspend_console(); | ||
185 | error = device_suspend(PMSG_FREEZE); | ||
186 | if (error) { | ||
187 | printk(KERN_ERR "PM: Some devices failed to suspend\n"); | ||
188 | goto Resume_devices; | ||
189 | } | ||
190 | error = disable_nonboot_cpus(); | ||
191 | if (error) | ||
192 | goto Enable_cpus; | ||
193 | |||
194 | if (hibernation_mode == HIBERNATION_TEST) { | ||
195 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
196 | mdelay(5000); | ||
197 | goto Enable_cpus; | ||
198 | } | ||
199 | |||
200 | pr_debug("PM: snapshotting memory.\n"); | ||
201 | in_suspend = 1; | ||
202 | error = swsusp_suspend(); | ||
203 | if (error) | ||
204 | goto Enable_cpus; | ||
205 | |||
206 | if (in_suspend) { | ||
207 | enable_nonboot_cpus(); | ||
208 | platform_finish(); | ||
209 | device_resume(); | ||
210 | resume_console(); | ||
211 | pr_debug("PM: writing image.\n"); | 316 | pr_debug("PM: writing image.\n"); |
212 | error = swsusp_write(); | 317 | error = swsusp_write(flags); |
318 | swsusp_free(); | ||
213 | if (!error) | 319 | if (!error) |
214 | power_down(); | 320 | power_down(); |
215 | else { | ||
216 | swsusp_free(); | ||
217 | goto Thaw; | ||
218 | } | ||
219 | } else { | 321 | } else { |
220 | pr_debug("PM: Image restored successfully.\n"); | 322 | pr_debug("PM: Image restored successfully.\n"); |
323 | swsusp_free(); | ||
221 | } | 324 | } |
222 | |||
223 | swsusp_free(); | ||
224 | Enable_cpus: | ||
225 | enable_nonboot_cpus(); | ||
226 | Resume_devices: | ||
227 | platform_finish(); | ||
228 | device_resume(); | ||
229 | resume_console(); | ||
230 | Thaw: | 325 | Thaw: |
231 | mutex_unlock(&pm_mutex); | ||
232 | unprepare_processes(); | 326 | unprepare_processes(); |
233 | Finish: | 327 | Finish: |
234 | free_basic_memory_bitmaps(); | 328 | free_basic_memory_bitmaps(); |
235 | Exit: | 329 | Exit: |
330 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
236 | atomic_inc(&snapshot_device_available); | 331 | atomic_inc(&snapshot_device_available); |
332 | Unlock: | ||
333 | mutex_unlock(&pm_mutex); | ||
237 | return error; | 334 | return error; |
238 | } | 335 | } |
239 | 336 | ||
@@ -253,6 +350,7 @@ int hibernate(void) | |||
253 | static int software_resume(void) | 350 | static int software_resume(void) |
254 | { | 351 | { |
255 | int error; | 352 | int error; |
353 | unsigned int flags; | ||
256 | 354 | ||
257 | mutex_lock(&pm_mutex); | 355 | mutex_lock(&pm_mutex); |
258 | if (!swsusp_resume_device) { | 356 | if (!swsusp_resume_device) { |
@@ -300,30 +398,12 @@ static int software_resume(void) | |||
300 | 398 | ||
301 | pr_debug("PM: Reading swsusp image.\n"); | 399 | pr_debug("PM: Reading swsusp image.\n"); |
302 | 400 | ||
303 | error = swsusp_read(); | 401 | error = swsusp_read(&flags); |
304 | if (error) { | ||
305 | swsusp_free(); | ||
306 | goto Thaw; | ||
307 | } | ||
308 | |||
309 | pr_debug("PM: Preparing devices for restore.\n"); | ||
310 | |||
311 | suspend_console(); | ||
312 | error = device_suspend(PMSG_PRETHAW); | ||
313 | if (error) | ||
314 | goto Free; | ||
315 | |||
316 | error = disable_nonboot_cpus(); | ||
317 | if (!error) | 402 | if (!error) |
318 | swsusp_resume(); | 403 | hibernation_restore(flags & SF_PLATFORM_MODE); |
319 | 404 | ||
320 | enable_nonboot_cpus(); | ||
321 | Free: | ||
322 | swsusp_free(); | ||
323 | device_resume(); | ||
324 | resume_console(); | ||
325 | Thaw: | ||
326 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 405 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); |
406 | swsusp_free(); | ||
327 | unprepare_processes(); | 407 | unprepare_processes(); |
328 | Done: | 408 | Done: |
329 | free_basic_memory_bitmaps(); | 409 | free_basic_memory_bitmaps(); |
@@ -333,7 +413,7 @@ static int software_resume(void) | |||
333 | Unlock: | 413 | Unlock: |
334 | mutex_unlock(&pm_mutex); | 414 | mutex_unlock(&pm_mutex); |
335 | pr_debug("PM: Resume from disk failed.\n"); | 415 | pr_debug("PM: Resume from disk failed.\n"); |
336 | return 0; | 416 | return error; |
337 | } | 417 | } |
338 | 418 | ||
339 | late_initcall(software_resume); | 419 | late_initcall(software_resume); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index fc45ed22620f..350b485b3b60 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -23,11 +23,15 @@ | |||
23 | 23 | ||
24 | #include "power.h" | 24 | #include "power.h" |
25 | 25 | ||
26 | /*This is just an arbitrary number */ | 26 | BLOCKING_NOTIFIER_HEAD(pm_chain_head); |
27 | #define FREE_PAGE_NUMBER (100) | ||
28 | 27 | ||
29 | DEFINE_MUTEX(pm_mutex); | 28 | DEFINE_MUTEX(pm_mutex); |
30 | 29 | ||
30 | #ifdef CONFIG_SUSPEND | ||
31 | |||
32 | /* This is just an arbitrary number */ | ||
33 | #define FREE_PAGE_NUMBER (100) | ||
34 | |||
31 | struct pm_ops *pm_ops; | 35 | struct pm_ops *pm_ops; |
32 | 36 | ||
33 | /** | 37 | /** |
@@ -63,14 +67,11 @@ static inline void pm_finish(suspend_state_t state) | |||
63 | 67 | ||
64 | /** | 68 | /** |
65 | * suspend_prepare - Do prep work before entering low-power state. | 69 | * suspend_prepare - Do prep work before entering low-power state. |
66 | * @state: State we're entering. | ||
67 | * | 70 | * |
68 | * This is common code that is called for each state that we're | 71 | * This is common code that is called for each state that we're entering. |
69 | * entering. Allocate a console, stop all processes, then make sure | 72 | * Run suspend notifiers, allocate a console and stop all processes. |
70 | * the platform can enter the requested state. | ||
71 | */ | 73 | */ |
72 | 74 | static int suspend_prepare(void) | |
73 | static int suspend_prepare(suspend_state_t state) | ||
74 | { | 75 | { |
75 | int error; | 76 | int error; |
76 | unsigned int free_pages; | 77 | unsigned int free_pages; |
@@ -78,6 +79,10 @@ static int suspend_prepare(suspend_state_t state) | |||
78 | if (!pm_ops || !pm_ops->enter) | 79 | if (!pm_ops || !pm_ops->enter) |
79 | return -EPERM; | 80 | return -EPERM; |
80 | 81 | ||
82 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); | ||
83 | if (error) | ||
84 | goto Finish; | ||
85 | |||
81 | pm_prepare_console(); | 86 | pm_prepare_console(); |
82 | 87 | ||
83 | if (freeze_processes()) { | 88 | if (freeze_processes()) { |
@@ -85,46 +90,23 @@ static int suspend_prepare(suspend_state_t state) | |||
85 | goto Thaw; | 90 | goto Thaw; |
86 | } | 91 | } |
87 | 92 | ||
88 | if ((free_pages = global_page_state(NR_FREE_PAGES)) | 93 | free_pages = global_page_state(NR_FREE_PAGES); |
89 | < FREE_PAGE_NUMBER) { | 94 | if (free_pages < FREE_PAGE_NUMBER) { |
90 | pr_debug("PM: free some memory\n"); | 95 | pr_debug("PM: free some memory\n"); |
91 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); | 96 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); |
92 | if (nr_free_pages() < FREE_PAGE_NUMBER) { | 97 | if (nr_free_pages() < FREE_PAGE_NUMBER) { |
93 | error = -ENOMEM; | 98 | error = -ENOMEM; |
94 | printk(KERN_ERR "PM: Not enough memory\n"); | 99 | printk(KERN_ERR "PM: Not enough memory\n"); |
95 | goto Thaw; | ||
96 | } | 100 | } |
97 | } | 101 | } |
98 | |||
99 | if (pm_ops->set_target) { | ||
100 | error = pm_ops->set_target(state); | ||
101 | if (error) | ||
102 | goto Thaw; | ||
103 | } | ||
104 | suspend_console(); | ||
105 | error = device_suspend(PMSG_SUSPEND); | ||
106 | if (error) { | ||
107 | printk(KERN_ERR "Some devices failed to suspend\n"); | ||
108 | goto Resume_console; | ||
109 | } | ||
110 | if (pm_ops->prepare) { | ||
111 | if ((error = pm_ops->prepare(state))) | ||
112 | goto Resume_devices; | ||
113 | } | ||
114 | |||
115 | error = disable_nonboot_cpus(); | ||
116 | if (!error) | 102 | if (!error) |
117 | return 0; | 103 | return 0; |
118 | 104 | ||
119 | enable_nonboot_cpus(); | ||
120 | pm_finish(state); | ||
121 | Resume_devices: | ||
122 | device_resume(); | ||
123 | Resume_console: | ||
124 | resume_console(); | ||
125 | Thaw: | 105 | Thaw: |
126 | thaw_processes(); | 106 | thaw_processes(); |
127 | pm_restore_console(); | 107 | pm_restore_console(); |
108 | Finish: | ||
109 | pm_notifier_call_chain(PM_POST_SUSPEND); | ||
128 | return error; | 110 | return error; |
129 | } | 111 | } |
130 | 112 | ||
@@ -140,6 +122,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) | |||
140 | local_irq_enable(); | 122 | local_irq_enable(); |
141 | } | 123 | } |
142 | 124 | ||
125 | /** | ||
126 | * suspend_enter - enter the desired system sleep state. | ||
127 | * @state: state to enter | ||
128 | * | ||
129 | * This function should be called after devices have been suspended. | ||
130 | */ | ||
143 | int suspend_enter(suspend_state_t state) | 131 | int suspend_enter(suspend_state_t state) |
144 | { | 132 | { |
145 | int error = 0; | 133 | int error = 0; |
@@ -159,23 +147,58 @@ int suspend_enter(suspend_state_t state) | |||
159 | return error; | 147 | return error; |
160 | } | 148 | } |
161 | 149 | ||
150 | /** | ||
151 | * suspend_devices_and_enter - suspend devices and enter the desired system sleep | ||
152 | * state. | ||
153 | * @state: state to enter | ||
154 | */ | ||
155 | int suspend_devices_and_enter(suspend_state_t state) | ||
156 | { | ||
157 | int error; | ||
158 | |||
159 | if (!pm_ops) | ||
160 | return -ENOSYS; | ||
161 | |||
162 | if (pm_ops->set_target) { | ||
163 | error = pm_ops->set_target(state); | ||
164 | if (error) | ||
165 | return error; | ||
166 | } | ||
167 | suspend_console(); | ||
168 | error = device_suspend(PMSG_SUSPEND); | ||
169 | if (error) { | ||
170 | printk(KERN_ERR "Some devices failed to suspend\n"); | ||
171 | goto Resume_console; | ||
172 | } | ||
173 | if (pm_ops->prepare) { | ||
174 | error = pm_ops->prepare(state); | ||
175 | if (error) | ||
176 | goto Resume_devices; | ||
177 | } | ||
178 | error = disable_nonboot_cpus(); | ||
179 | if (!error) | ||
180 | suspend_enter(state); | ||
181 | |||
182 | enable_nonboot_cpus(); | ||
183 | pm_finish(state); | ||
184 | Resume_devices: | ||
185 | device_resume(); | ||
186 | Resume_console: | ||
187 | resume_console(); | ||
188 | return error; | ||
189 | } | ||
162 | 190 | ||
163 | /** | 191 | /** |
164 | * suspend_finish - Do final work before exiting suspend sequence. | 192 | * suspend_finish - Do final work before exiting suspend sequence. |
165 | * @state: State we're coming out of. | ||
166 | * | 193 | * |
167 | * Call platform code to clean up, restart processes, and free the | 194 | * Call platform code to clean up, restart processes, and free the |
168 | * console that we've allocated. This is not called for suspend-to-disk. | 195 | * console that we've allocated. This is not called for suspend-to-disk. |
169 | */ | 196 | */ |
170 | 197 | static void suspend_finish(void) | |
171 | static void suspend_finish(suspend_state_t state) | ||
172 | { | 198 | { |
173 | enable_nonboot_cpus(); | ||
174 | pm_finish(state); | ||
175 | device_resume(); | ||
176 | resume_console(); | ||
177 | thaw_processes(); | 199 | thaw_processes(); |
178 | pm_restore_console(); | 200 | pm_restore_console(); |
201 | pm_notifier_call_chain(PM_POST_SUSPEND); | ||
179 | } | 202 | } |
180 | 203 | ||
181 | 204 | ||
@@ -207,7 +230,6 @@ static inline int valid_state(suspend_state_t state) | |||
207 | * Then, do the setup for suspend, enter the state, and cleanup (after | 230 | * Then, do the setup for suspend, enter the state, and cleanup (after |
208 | * we've woken up). | 231 | * we've woken up). |
209 | */ | 232 | */ |
210 | |||
211 | static int enter_state(suspend_state_t state) | 233 | static int enter_state(suspend_state_t state) |
212 | { | 234 | { |
213 | int error; | 235 | int error; |
@@ -218,14 +240,14 @@ static int enter_state(suspend_state_t state) | |||
218 | return -EBUSY; | 240 | return -EBUSY; |
219 | 241 | ||
220 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 242 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
221 | if ((error = suspend_prepare(state))) | 243 | if ((error = suspend_prepare())) |
222 | goto Unlock; | 244 | goto Unlock; |
223 | 245 | ||
224 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 246 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
225 | error = suspend_enter(state); | 247 | error = suspend_devices_and_enter(state); |
226 | 248 | ||
227 | pr_debug("PM: Finishing wakeup.\n"); | 249 | pr_debug("PM: Finishing wakeup.\n"); |
228 | suspend_finish(state); | 250 | suspend_finish(); |
229 | Unlock: | 251 | Unlock: |
230 | mutex_unlock(&pm_mutex); | 252 | mutex_unlock(&pm_mutex); |
231 | return error; | 253 | return error; |
@@ -249,6 +271,8 @@ int pm_suspend(suspend_state_t state) | |||
249 | 271 | ||
250 | EXPORT_SYMBOL(pm_suspend); | 272 | EXPORT_SYMBOL(pm_suspend); |
251 | 273 | ||
274 | #endif /* CONFIG_SUSPEND */ | ||
275 | |||
252 | decl_subsys(power,NULL,NULL); | 276 | decl_subsys(power,NULL,NULL); |
253 | 277 | ||
254 | 278 | ||
@@ -265,14 +289,16 @@ decl_subsys(power,NULL,NULL); | |||
265 | 289 | ||
266 | static ssize_t state_show(struct kset *kset, char *buf) | 290 | static ssize_t state_show(struct kset *kset, char *buf) |
267 | { | 291 | { |
292 | char *s = buf; | ||
293 | #ifdef CONFIG_SUSPEND | ||
268 | int i; | 294 | int i; |
269 | char * s = buf; | ||
270 | 295 | ||
271 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 296 | for (i = 0; i < PM_SUSPEND_MAX; i++) { |
272 | if (pm_states[i] && valid_state(i)) | 297 | if (pm_states[i] && valid_state(i)) |
273 | s += sprintf(s,"%s ", pm_states[i]); | 298 | s += sprintf(s,"%s ", pm_states[i]); |
274 | } | 299 | } |
275 | #ifdef CONFIG_SOFTWARE_SUSPEND | 300 | #endif |
301 | #ifdef CONFIG_HIBERNATION | ||
276 | s += sprintf(s, "%s\n", "disk"); | 302 | s += sprintf(s, "%s\n", "disk"); |
277 | #else | 303 | #else |
278 | if (s != buf) | 304 | if (s != buf) |
@@ -284,11 +310,13 @@ static ssize_t state_show(struct kset *kset, char *buf) | |||
284 | 310 | ||
285 | static ssize_t state_store(struct kset *kset, const char *buf, size_t n) | 311 | static ssize_t state_store(struct kset *kset, const char *buf, size_t n) |
286 | { | 312 | { |
313 | #ifdef CONFIG_SUSPEND | ||
287 | suspend_state_t state = PM_SUSPEND_STANDBY; | 314 | suspend_state_t state = PM_SUSPEND_STANDBY; |
288 | const char * const *s; | 315 | const char * const *s; |
316 | #endif | ||
289 | char *p; | 317 | char *p; |
290 | int error; | ||
291 | int len; | 318 | int len; |
319 | int error = -EINVAL; | ||
292 | 320 | ||
293 | p = memchr(buf, '\n', n); | 321 | p = memchr(buf, '\n', n); |
294 | len = p ? p - buf : n; | 322 | len = p ? p - buf : n; |
@@ -296,17 +324,19 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) | |||
296 | /* First, check if we are requested to hibernate */ | 324 | /* First, check if we are requested to hibernate */ |
297 | if (len == 4 && !strncmp(buf, "disk", len)) { | 325 | if (len == 4 && !strncmp(buf, "disk", len)) { |
298 | error = hibernate(); | 326 | error = hibernate(); |
299 | return error ? error : n; | 327 | goto Exit; |
300 | } | 328 | } |
301 | 329 | ||
330 | #ifdef CONFIG_SUSPEND | ||
302 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 331 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { |
303 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) | 332 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
304 | break; | 333 | break; |
305 | } | 334 | } |
306 | if (state < PM_SUSPEND_MAX && *s) | 335 | if (state < PM_SUSPEND_MAX && *s) |
307 | error = enter_state(state); | 336 | error = enter_state(state); |
308 | else | 337 | #endif |
309 | error = -EINVAL; | 338 | |
339 | Exit: | ||
310 | return error ? error : n; | 340 | return error ? error : n; |
311 | } | 341 | } |
312 | 342 | ||
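state_store() above accepts the strings from pm_states[] plus "disk", so the whole suspend and hibernation machinery is driven by a single sysfs write. A hypothetical user-space sketch (only the /sys/power/state path comes from this diff, everything else is illustrative):

    /* Minimal user-space trigger; error handling trimmed. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/power/state", O_WRONLY);

            if (fd < 0) {
                    perror("open /sys/power/state");
                    return 1;
            }
            /* "mem" or "standby" would go through enter_state() instead */
            if (write(fd, "disk\n", 5) != 5)
                    perror("write");
            close(fd);
            return 0;
    }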
diff --git a/kernel/power/power.h b/kernel/power/power.h index 51381487103f..95fbf2dd3fe3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -13,7 +13,7 @@ struct swsusp_info { | |||
13 | 13 | ||
14 | 14 | ||
15 | 15 | ||
16 | #ifdef CONFIG_SOFTWARE_SUSPEND | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* | 17 | /* |
18 | * Keep some memory free so that I/O operations can succeed without paging | 18 | * Keep some memory free so that I/O operations can succeed without paging |
19 | * [Might this be more than 4 MB?] | 19 | * [Might this be more than 4 MB?] |
@@ -25,7 +25,10 @@ struct swsusp_info { | |||
25 | */ | 25 | */ |
26 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) | 26 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) |
27 | 27 | ||
28 | extern struct hibernation_ops *hibernation_ops; | 28 | /* kernel/power/disk.c */ |
29 | extern int hibernation_snapshot(int platform_mode); | ||
30 | extern int hibernation_restore(int platform_mode); | ||
31 | extern int hibernation_platform_enter(void); | ||
29 | #endif | 32 | #endif |
30 | 33 | ||
31 | extern int pfn_is_nosave(unsigned long); | 34 | extern int pfn_is_nosave(unsigned long); |
@@ -152,16 +155,42 @@ extern sector_t alloc_swapdev_block(int swap); | |||
152 | extern void free_all_swap_pages(int swap); | 155 | extern void free_all_swap_pages(int swap); |
153 | extern int swsusp_swap_in_use(void); | 156 | extern int swsusp_swap_in_use(void); |
154 | 157 | ||
158 | /* | ||
159 | * Flags that can be passed from the hibernatig hernel to the "boot" kernel in | ||
160 | * the image header. | ||
161 | */ | ||
162 | #define SF_PLATFORM_MODE 1 | ||
163 | |||
164 | /* kernel/power/disk.c */ | ||
155 | extern int swsusp_check(void); | 165 | extern int swsusp_check(void); |
156 | extern int swsusp_shrink_memory(void); | 166 | extern int swsusp_shrink_memory(void); |
157 | extern void swsusp_free(void); | 167 | extern void swsusp_free(void); |
158 | extern int swsusp_suspend(void); | 168 | extern int swsusp_suspend(void); |
159 | extern int swsusp_resume(void); | 169 | extern int swsusp_resume(void); |
160 | extern int swsusp_read(void); | 170 | extern int swsusp_read(unsigned int *flags_p); |
161 | extern int swsusp_write(void); | 171 | extern int swsusp_write(unsigned int flags); |
162 | extern void swsusp_close(void); | 172 | extern void swsusp_close(void); |
163 | extern int suspend_enter(suspend_state_t state); | ||
164 | 173 | ||
165 | struct timeval; | 174 | struct timeval; |
175 | /* kernel/power/swsusp.c */ | ||
166 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | 176 | extern void swsusp_show_speed(struct timeval *, struct timeval *, |
167 | unsigned int, char *); | 177 | unsigned int, char *); |
178 | |||
179 | #ifdef CONFIG_SUSPEND | ||
180 | /* kernel/power/main.c */ | ||
181 | extern int suspend_devices_and_enter(suspend_state_t state); | ||
182 | #else /* !CONFIG_SUSPEND */ | ||
183 | static inline int suspend_devices_and_enter(suspend_state_t state) | ||
184 | { | ||
185 | return -ENOSYS; | ||
186 | } | ||
187 | #endif /* !CONFIG_SUSPEND */ | ||
188 | |||
189 | /* kernel/power/common.c */ | ||
190 | extern struct blocking_notifier_head pm_chain_head; | ||
191 | |||
192 | static inline int pm_notifier_call_chain(unsigned long val) | ||
193 | { | ||
194 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | ||
195 | == NOTIFY_BAD) ? -EINVAL : 0; | ||
196 | } | ||
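pm_notifier_call_chain() above is a thin wrapper that turns NOTIFY_BAD into -EINVAL, so any notifier on pm_chain_head can veto a suspend or hibernation transition. A hedged sketch of a callback hooked onto that chain (registration is done directly on pm_chain_head because only the chain head is visible in this diff; the header location of the PM_* event constants is an assumption):

    /* Sketch assuming it is built next to kernel/power/power.h. */
    #include <linux/notifier.h>
    #include "power.h"

    static int example_pm_callback(struct notifier_block *nb,
                                   unsigned long event, void *unused)
    {
            switch (event) {
            case PM_HIBERNATION_PREPARE:
                    /* quiesce anything that must not run while the image is written */
                    break;
            case PM_POST_HIBERNATION:
                    /* undo the above after resume, or after a failed attempt */
                    break;
            }
            return NOTIFY_DONE;     /* NOTIFY_BAD would abort the transition */
    }

    static struct notifier_block example_pm_nb = {
            .notifier_call = example_pm_callback,
    };

    /* blocking_notifier_chain_register(&pm_chain_head, &example_pm_nb); */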
diff --git a/kernel/power/process.c b/kernel/power/process.c index e0233d8422b9..3434940a3df1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -40,7 +40,7 @@ static inline void frozen_process(void) | |||
40 | current->flags |= PF_FROZEN; | 40 | current->flags |= PF_FROZEN; |
41 | wmb(); | 41 | wmb(); |
42 | } | 42 | } |
43 | clear_tsk_thread_flag(current, TIF_FREEZE); | 43 | clear_freeze_flag(current); |
44 | } | 44 | } |
45 | 45 | ||
46 | /* Refrigerator is a place where frozen processes are stored :-). */ | 46 | /* Refrigerator is a place where frozen processes are stored :-). */ |
@@ -72,20 +72,19 @@ void refrigerator(void) | |||
72 | schedule(); | 72 | schedule(); |
73 | } | 73 | } |
74 | pr_debug("%s left refrigerator\n", current->comm); | 74 | pr_debug("%s left refrigerator\n", current->comm); |
75 | current->state = save; | 75 | __set_current_state(save); |
76 | } | 76 | } |
77 | 77 | ||
78 | static inline void freeze_process(struct task_struct *p) | 78 | static void freeze_task(struct task_struct *p) |
79 | { | 79 | { |
80 | unsigned long flags; | 80 | unsigned long flags; |
81 | 81 | ||
82 | if (!freezing(p)) { | 82 | if (!freezing(p)) { |
83 | rmb(); | 83 | rmb(); |
84 | if (!frozen(p)) { | 84 | if (!frozen(p)) { |
85 | set_freeze_flag(p); | ||
85 | if (p->state == TASK_STOPPED) | 86 | if (p->state == TASK_STOPPED) |
86 | force_sig_specific(SIGSTOP, p); | 87 | force_sig_specific(SIGSTOP, p); |
87 | |||
88 | freeze(p); | ||
89 | spin_lock_irqsave(&p->sighand->siglock, flags); | 88 | spin_lock_irqsave(&p->sighand->siglock, flags); |
90 | signal_wake_up(p, p->state == TASK_STOPPED); | 89 | signal_wake_up(p, p->state == TASK_STOPPED); |
91 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 90 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
@@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p) | |||
99 | 98 | ||
100 | if (freezing(p)) { | 99 | if (freezing(p)) { |
101 | pr_debug(" clean up: %s\n", p->comm); | 100 | pr_debug(" clean up: %s\n", p->comm); |
102 | do_not_freeze(p); | 101 | clear_freeze_flag(p); |
103 | spin_lock_irqsave(&p->sighand->siglock, flags); | 102 | spin_lock_irqsave(&p->sighand->siglock, flags); |
104 | recalc_sigpending_and_wake(p); | 103 | recalc_sigpending_and_wake(p); |
105 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 104 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
106 | } | 105 | } |
107 | } | 106 | } |
108 | 107 | ||
109 | static inline int is_user_space(struct task_struct *p) | 108 | static int try_to_freeze_tasks(int freeze_user_space) |
110 | { | ||
111 | return p->mm && !(p->flags & PF_BORROWED_MM); | ||
112 | } | ||
113 | |||
114 | static unsigned int try_to_freeze_tasks(int freeze_user_space) | ||
115 | { | 109 | { |
116 | struct task_struct *g, *p; | 110 | struct task_struct *g, *p; |
117 | unsigned long end_time; | 111 | unsigned long end_time; |
@@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) | |||
122 | todo = 0; | 116 | todo = 0; |
123 | read_lock(&tasklist_lock); | 117 | read_lock(&tasklist_lock); |
124 | do_each_thread(g, p) { | 118 | do_each_thread(g, p) { |
125 | if (!freezeable(p)) | 119 | if (frozen(p) || !freezeable(p)) |
126 | continue; | 120 | continue; |
127 | 121 | ||
128 | if (frozen(p)) | 122 | if (freeze_user_space) { |
129 | continue; | 123 | if (p->state == TASK_TRACED && |
130 | 124 | frozen(p->parent)) { | |
131 | if (p->state == TASK_TRACED && frozen(p->parent)) { | 125 | cancel_freezing(p); |
132 | cancel_freezing(p); | 126 | continue; |
133 | continue; | 127 | } |
128 | /* | ||
129 | * Kernel threads should not have TIF_FREEZE set | ||
130 | * at this point, so we must ensure that either | ||
131 | * p->mm is not NULL *and* PF_BORROWED_MM is | ||
131 | * unset, or TIF_FREEZE is left unset. | ||
133 | * The task_lock() is necessary to prevent races | ||
134 | * with exit_mm() or use_mm()/unuse_mm() from | ||
135 | * occurring. | ||
136 | */ | ||
137 | task_lock(p); | ||
138 | if (!p->mm || (p->flags & PF_BORROWED_MM)) { | ||
139 | task_unlock(p); | ||
140 | continue; | ||
141 | } | ||
142 | freeze_task(p); | ||
143 | task_unlock(p); | ||
144 | } else { | ||
145 | freeze_task(p); | ||
134 | } | 146 | } |
135 | if (freeze_user_space && !is_user_space(p)) | ||
136 | continue; | ||
137 | |||
138 | freeze_process(p); | ||
139 | if (!freezer_should_skip(p)) | 147 | if (!freezer_should_skip(p)) |
140 | todo++; | 148 | todo++; |
141 | } while_each_thread(g, p); | 149 | } while_each_thread(g, p); |
142 | read_unlock(&tasklist_lock); | 150 | read_unlock(&tasklist_lock); |
143 | yield(); /* Yield is okay here */ | 151 | yield(); /* Yield is okay here */ |
144 | if (todo && time_after(jiffies, end_time)) | 152 | if (time_after(jiffies, end_time)) |
145 | break; | 153 | break; |
146 | } while (todo); | 154 | } while (todo); |
147 | 155 | ||
@@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) | |||
152 | * but it cleans up leftover PF_FREEZE requests. | 160 | * but it cleans up leftover PF_FREEZE requests. |
153 | */ | 161 | */ |
154 | printk("\n"); | 162 | printk("\n"); |
155 | printk(KERN_ERR "Stopping %s timed out after %d seconds " | 163 | printk(KERN_ERR "Freezing of %s timed out after %d seconds " |
156 | "(%d tasks refusing to freeze):\n", | 164 | "(%d tasks refusing to freeze):\n", |
157 | freeze_user_space ? "user space processes" : | 165 | freeze_user_space ? "user space " : "tasks ", |
158 | "kernel threads", | ||
159 | TIMEOUT / HZ, todo); | 166 | TIMEOUT / HZ, todo); |
167 | show_state(); | ||
160 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
161 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
162 | if (freeze_user_space && !is_user_space(p)) | ||
163 | continue; | ||
164 | |||
165 | task_lock(p); | 170 | task_lock(p); |
166 | if (freezeable(p) && !frozen(p) && | 171 | if (freezing(p) && !freezer_should_skip(p)) |
167 | !freezer_should_skip(p)) | ||
168 | printk(KERN_ERR " %s\n", p->comm); | 172 | printk(KERN_ERR " %s\n", p->comm); |
169 | |||
170 | cancel_freezing(p); | 173 | cancel_freezing(p); |
171 | task_unlock(p); | 174 | task_unlock(p); |
172 | } while_each_thread(g, p); | 175 | } while_each_thread(g, p); |
173 | read_unlock(&tasklist_lock); | 176 | read_unlock(&tasklist_lock); |
174 | } | 177 | } |
175 | 178 | ||
176 | return todo; | 179 | return todo ? -EBUSY : 0; |
177 | } | 180 | } |
178 | 181 | ||
179 | /** | 182 | /** |
180 | * freeze_processes - tell processes to enter the refrigerator | 183 | * freeze_processes - tell processes to enter the refrigerator |
181 | * | ||
182 | * Returns 0 on success, or the number of processes that didn't freeze, | ||
183 | * although they were told to. | ||
184 | */ | 184 | */ |
185 | int freeze_processes(void) | 185 | int freeze_processes(void) |
186 | { | 186 | { |
187 | unsigned int nr_unfrozen; | 187 | int error; |
188 | 188 | ||
189 | printk("Stopping tasks ... "); | 189 | printk("Stopping tasks ... "); |
190 | nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); | 190 | error = try_to_freeze_tasks(FREEZER_USER_SPACE); |
191 | if (nr_unfrozen) | 191 | if (error) |
192 | return nr_unfrozen; | 192 | return error; |
193 | 193 | ||
194 | sys_sync(); | 194 | sys_sync(); |
195 | nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); | 195 | error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); |
196 | if (nr_unfrozen) | 196 | if (error) |
197 | return nr_unfrozen; | 197 | return error; |
198 | 198 | ||
199 | printk("done.\n"); | 199 | printk("done.\n"); |
200 | BUG_ON(in_atomic()); | 200 | BUG_ON(in_atomic()); |
@@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space) | |||
210 | if (!freezeable(p)) | 210 | if (!freezeable(p)) |
211 | continue; | 211 | continue; |
212 | 212 | ||
213 | if (is_user_space(p) == !thaw_user_space) | 213 | if (!p->mm == thaw_user_space) |
214 | continue; | 214 | continue; |
215 | 215 | ||
216 | thaw_process(p); | 216 | thaw_process(p); |
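try_to_freeze_tasks() only sets TIF_FREEZE and wakes the target; every freezable task still has to notice the flag and park itself in refrigerator(). The usual pattern in a kernel thread looks roughly like this (try_to_freeze() is the standard helper; the loop body itself is made up):

    #include <linux/freezer.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int example_thread(void *unused)
    {
            while (!kthread_should_stop()) {
                    /* enters refrigerator() whenever the freezer has flagged us */
                    try_to_freeze();

                    /* ... one unit of work, then sleep until there is more ... */
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);
            }
            return 0;
    }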
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a3b7854b8f7c..a686590d88c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -709,7 +709,8 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
709 | region->end_pfn << PAGE_SHIFT); | 709 | region->end_pfn << PAGE_SHIFT); |
710 | 710 | ||
711 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) | 711 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) |
712 | memory_bm_set_bit(bm, pfn); | 712 | if (pfn_valid(pfn)) |
713 | memory_bm_set_bit(bm, pfn); | ||
713 | } | 714 | } |
714 | } | 715 | } |
715 | 716 | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8b1a1b837145..917aba100575 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -33,8 +33,9 @@ extern char resume_file[]; | |||
33 | #define SWSUSP_SIG "S1SUSPEND" | 33 | #define SWSUSP_SIG "S1SUSPEND" |
34 | 34 | ||
35 | struct swsusp_header { | 35 | struct swsusp_header { |
36 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; | 36 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; |
37 | sector_t image; | 37 | sector_t image; |
38 | unsigned int flags; /* Flags to pass to the "boot" kernel */ | ||
38 | char orig_sig[10]; | 39 | char orig_sig[10]; |
39 | char sig[10]; | 40 | char sig[10]; |
40 | } __attribute__((packed)); | 41 | } __attribute__((packed)); |
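The reserved[] padding shrinks by sizeof(int) so that the new flags field keeps struct swsusp_header at exactly one page. The patch relies on that arithmetic implicitly; a compile-time assertion of the invariant (not part of the change, and it would have to live in kernel/power/swap.c where the struct is defined) could look like this:

    #include <linux/kernel.h>       /* BUILD_BUG_ON() */

    static inline void swsusp_header_size_check(void)
    {
            /* reserved[] + image + flags + orig_sig[10] + sig[10] == PAGE_SIZE */
            BUILD_BUG_ON(sizeof(struct swsusp_header) != PAGE_SIZE);
    }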
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain) | |||
138 | * Saving part | 139 | * Saving part |
139 | */ | 140 | */ |
140 | 141 | ||
141 | static int mark_swapfiles(sector_t start) | 142 | static int mark_swapfiles(sector_t start, unsigned int flags) |
142 | { | 143 | { |
143 | int error; | 144 | int error; |
144 | 145 | ||
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start) | |||
148 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 149 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
149 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); | 150 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); |
150 | swsusp_header->image = start; | 151 | swsusp_header->image = start; |
152 | swsusp_header->flags = flags; | ||
151 | error = bio_write_page(swsusp_resume_block, | 153 | error = bio_write_page(swsusp_resume_block, |
152 | swsusp_header, NULL); | 154 | swsusp_header, NULL); |
153 | } else { | 155 | } else { |
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages) | |||
369 | 371 | ||
370 | /** | 372 | /** |
371 | * swsusp_write - Write entire image and metadata. | 373 | * swsusp_write - Write entire image and metadata. |
374 | * @flags: flags to pass to the "boot" kernel in the image header | ||
372 | * | 375 | * |
373 | * It is important _NOT_ to umount filesystems at this point. We want | 376 | * It is important _NOT_ to umount filesystems at this point. We want |
374 | * them synced (in case something goes wrong) but we DO not want to mark | 377 | * them synced (in case something goes wrong) but we DO not want to mark |
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages) | |||
376 | * correctly, we'll mark system clean, anyway.) | 379 | * correctly, we'll mark system clean, anyway.) |
377 | */ | 380 | */ |
378 | 381 | ||
379 | int swsusp_write(void) | 382 | int swsusp_write(unsigned int flags) |
380 | { | 383 | { |
381 | struct swap_map_handle handle; | 384 | struct swap_map_handle handle; |
382 | struct snapshot_handle snapshot; | 385 | struct snapshot_handle snapshot; |
@@ -415,7 +418,7 @@ int swsusp_write(void) | |||
415 | if (!error) { | 418 | if (!error) { |
416 | flush_swap_writer(&handle); | 419 | flush_swap_writer(&handle); |
417 | printk("S"); | 420 | printk("S"); |
418 | error = mark_swapfiles(start); | 421 | error = mark_swapfiles(start, flags); |
419 | printk("|\n"); | 422 | printk("|\n"); |
420 | } | 423 | } |
421 | } | 424 | } |
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle, | |||
540 | return error; | 543 | return error; |
541 | } | 544 | } |
542 | 545 | ||
543 | int swsusp_read(void) | 546 | /** |
547 | * swsusp_read - read the hibernation image. | ||
548 | * @flags_p: flags passed by the "frozen" kernel in the image header should | ||
549 | * be written into this memeory location | ||
550 | */ | ||
551 | |||
552 | int swsusp_read(unsigned int *flags_p) | ||
544 | { | 553 | { |
545 | int error; | 554 | int error; |
546 | struct swap_map_handle handle; | 555 | struct swap_map_handle handle; |
547 | struct snapshot_handle snapshot; | 556 | struct snapshot_handle snapshot; |
548 | struct swsusp_info *header; | 557 | struct swsusp_info *header; |
549 | 558 | ||
559 | *flags_p = swsusp_header->flags; | ||
550 | if (IS_ERR(resume_bdev)) { | 560 | if (IS_ERR(resume_bdev)) { |
551 | pr_debug("swsusp: block device not initialised\n"); | 561 | pr_debug("swsusp: block device not initialised\n"); |
552 | return PTR_ERR(resume_bdev); | 562 | return PTR_ERR(resume_bdev); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index d65305b515b1..bd0723a7df3f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
128 | return res; | 128 | return res; |
129 | } | 129 | } |
130 | 130 | ||
131 | static inline int platform_prepare(void) | ||
132 | { | ||
133 | int error = 0; | ||
134 | |||
135 | if (hibernation_ops) | ||
136 | error = hibernation_ops->prepare(); | ||
137 | |||
138 | return error; | ||
139 | } | ||
140 | |||
141 | static inline void platform_finish(void) | ||
142 | { | ||
143 | if (hibernation_ops) | ||
144 | hibernation_ops->finish(); | ||
145 | } | ||
146 | |||
147 | static inline int snapshot_suspend(int platform_suspend) | ||
148 | { | ||
149 | int error; | ||
150 | |||
151 | mutex_lock(&pm_mutex); | ||
152 | /* Free memory before shutting down devices. */ | ||
153 | error = swsusp_shrink_memory(); | ||
154 | if (error) | ||
155 | goto Finish; | ||
156 | |||
157 | if (platform_suspend) { | ||
158 | error = platform_prepare(); | ||
159 | if (error) | ||
160 | goto Finish; | ||
161 | } | ||
162 | suspend_console(); | ||
163 | error = device_suspend(PMSG_FREEZE); | ||
164 | if (error) | ||
165 | goto Resume_devices; | ||
166 | |||
167 | error = disable_nonboot_cpus(); | ||
168 | if (!error) { | ||
169 | in_suspend = 1; | ||
170 | error = swsusp_suspend(); | ||
171 | } | ||
172 | enable_nonboot_cpus(); | ||
173 | Resume_devices: | ||
174 | if (platform_suspend) | ||
175 | platform_finish(); | ||
176 | |||
177 | device_resume(); | ||
178 | resume_console(); | ||
179 | Finish: | ||
180 | mutex_unlock(&pm_mutex); | ||
181 | return error; | ||
182 | } | ||
183 | |||
184 | static inline int snapshot_restore(int platform_suspend) | ||
185 | { | ||
186 | int error; | ||
187 | |||
188 | mutex_lock(&pm_mutex); | ||
189 | pm_prepare_console(); | ||
190 | if (platform_suspend) { | ||
191 | error = platform_prepare(); | ||
192 | if (error) | ||
193 | goto Finish; | ||
194 | } | ||
195 | suspend_console(); | ||
196 | error = device_suspend(PMSG_PRETHAW); | ||
197 | if (error) | ||
198 | goto Resume_devices; | ||
199 | |||
200 | error = disable_nonboot_cpus(); | ||
201 | if (!error) | ||
202 | error = swsusp_resume(); | ||
203 | |||
204 | enable_nonboot_cpus(); | ||
205 | Resume_devices: | ||
206 | if (platform_suspend) | ||
207 | platform_finish(); | ||
208 | |||
209 | device_resume(); | ||
210 | resume_console(); | ||
211 | Finish: | ||
212 | pm_restore_console(); | ||
213 | mutex_unlock(&pm_mutex); | ||
214 | return error; | ||
215 | } | ||
216 | |||
217 | static int snapshot_ioctl(struct inode *inode, struct file *filp, | 131 | static int snapshot_ioctl(struct inode *inode, struct file *filp, |
218 | unsigned int cmd, unsigned long arg) | 132 | unsigned int cmd, unsigned long arg) |
219 | { | 133 | { |
@@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
237 | if (data->frozen) | 151 | if (data->frozen) |
238 | break; | 152 | break; |
239 | mutex_lock(&pm_mutex); | 153 | mutex_lock(&pm_mutex); |
240 | if (freeze_processes()) { | 154 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); |
241 | thaw_processes(); | 155 | if (!error) { |
242 | error = -EBUSY; | 156 | error = freeze_processes(); |
157 | if (error) | ||
158 | thaw_processes(); | ||
243 | } | 159 | } |
160 | if (error) | ||
161 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
244 | mutex_unlock(&pm_mutex); | 162 | mutex_unlock(&pm_mutex); |
245 | if (!error) | 163 | if (!error) |
246 | data->frozen = 1; | 164 | data->frozen = 1; |
@@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
251 | break; | 169 | break; |
252 | mutex_lock(&pm_mutex); | 170 | mutex_lock(&pm_mutex); |
253 | thaw_processes(); | 171 | thaw_processes(); |
172 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
254 | mutex_unlock(&pm_mutex); | 173 | mutex_unlock(&pm_mutex); |
255 | data->frozen = 0; | 174 | data->frozen = 0; |
256 | break; | 175 | break; |
@@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
260 | error = -EPERM; | 179 | error = -EPERM; |
261 | break; | 180 | break; |
262 | } | 181 | } |
263 | error = snapshot_suspend(data->platform_suspend); | 182 | error = hibernation_snapshot(data->platform_suspend); |
264 | if (!error) | 183 | if (!error) |
265 | error = put_user(in_suspend, (unsigned int __user *)arg); | 184 | error = put_user(in_suspend, (unsigned int __user *)arg); |
266 | if (!error) | 185 | if (!error) |
@@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
274 | error = -EPERM; | 193 | error = -EPERM; |
275 | break; | 194 | break; |
276 | } | 195 | } |
277 | error = snapshot_restore(data->platform_suspend); | 196 | error = hibernation_restore(data->platform_suspend); |
278 | break; | 197 | break; |
279 | 198 | ||
280 | case SNAPSHOT_FREE: | 199 | case SNAPSHOT_FREE: |
@@ -336,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
336 | break; | 255 | break; |
337 | 256 | ||
338 | case SNAPSHOT_S2RAM: | 257 | case SNAPSHOT_S2RAM: |
339 | if (!pm_ops) { | ||
340 | error = -ENOSYS; | ||
341 | break; | ||
342 | } | ||
343 | |||
344 | if (!data->frozen) { | 258 | if (!data->frozen) { |
345 | error = -EPERM; | 259 | error = -EPERM; |
346 | break; | 260 | break; |
347 | } | 261 | } |
348 | |||
349 | if (!mutex_trylock(&pm_mutex)) { | 262 | if (!mutex_trylock(&pm_mutex)) { |
350 | error = -EBUSY; | 263 | error = -EBUSY; |
351 | break; | 264 | break; |
352 | } | 265 | } |
353 | 266 | /* | |
354 | if (pm_ops->prepare) { | 267 | * Tasks are frozen and the notifiers have been called with |
355 | error = pm_ops->prepare(PM_SUSPEND_MEM); | 268 | * PM_HIBERNATION_PREPARE |
356 | if (error) | 269 | */ |
357 | goto OutS3; | 270 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
358 | } | ||
359 | |||
360 | /* Put devices to sleep */ | ||
361 | suspend_console(); | ||
362 | error = device_suspend(PMSG_SUSPEND); | ||
363 | if (error) { | ||
364 | printk(KERN_ERR "Failed to suspend some devices.\n"); | ||
365 | } else { | ||
366 | error = disable_nonboot_cpus(); | ||
367 | if (!error) { | ||
368 | /* Enter S3, system is already frozen */ | ||
369 | suspend_enter(PM_SUSPEND_MEM); | ||
370 | enable_nonboot_cpus(); | ||
371 | } | ||
372 | /* Wake up devices */ | ||
373 | device_resume(); | ||
374 | } | ||
375 | resume_console(); | ||
376 | if (pm_ops->finish) | ||
377 | pm_ops->finish(PM_SUSPEND_MEM); | ||
378 | |||
379 | OutS3: | ||
380 | mutex_unlock(&pm_mutex); | 271 | mutex_unlock(&pm_mutex); |
381 | break; | 272 | break; |
382 | 273 | ||
@@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
386 | switch (arg) { | 277 | switch (arg) { |
387 | 278 | ||
388 | case PMOPS_PREPARE: | 279 | case PMOPS_PREPARE: |
389 | if (hibernation_ops) { | 280 | data->platform_suspend = 1; |
390 | data->platform_suspend = 1; | 281 | error = 0; |
391 | error = 0; | ||
392 | } else { | ||
393 | error = -ENOSYS; | ||
394 | } | ||
395 | break; | 282 | break; |
396 | 283 | ||
397 | case PMOPS_ENTER: | 284 | case PMOPS_ENTER: |
398 | if (data->platform_suspend) { | 285 | if (data->platform_suspend) |
399 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 286 | error = hibernation_platform_enter(); |
400 | error = hibernation_ops->enter(); | 287 | |
401 | } | ||
402 | break; | 288 | break; |
403 | 289 | ||
404 | case PMOPS_FINISH: | 290 | case PMOPS_FINISH: |
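After this change the SNAPSHOT_S2RAM path reuses suspend_devices_and_enter() instead of open-coding the device suspend sequence. Purely for orientation, a hypothetical user-space caller would look roughly like the sketch below; the SNAPSHOT_* ioctl macros are not defined in this diff and would have to be copied from the kernel's private header, as the userland suspend tools do.

    /* Hypothetical sketch; assumes the SNAPSHOT_* ioctl definitions are available. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/snapshot", O_RDONLY);

            if (fd < 0) {
                    perror("open /dev/snapshot");
                    return 1;
            }
            if (ioctl(fd, SNAPSHOT_FREEZE, 0))        /* tasks must be frozen first */
                    perror("SNAPSHOT_FREEZE");
            else if (ioctl(fd, SNAPSHOT_S2RAM, 0))    /* suspend_devices_and_enter() */
                    perror("SNAPSHOT_S2RAM");
            if (ioctl(fd, SNAPSHOT_UNFREEZE, 0))
                    perror("SNAPSHOT_UNFREEZE");
            close(fd);
            return 0;
    }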
diff --git a/kernel/printk.c b/kernel/printk.c index 051d27e36a6c..8451dfc31d25 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -732,7 +732,7 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
732 | return 0; | 732 | return 0; |
733 | } | 733 | } |
734 | 734 | ||
735 | int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) | 735 | int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) |
736 | { | 736 | { |
737 | struct console_cmdline *c; | 737 | struct console_cmdline *c; |
738 | int i; | 738 | int i; |
@@ -1083,6 +1083,19 @@ int unregister_console(struct console *console) | |||
1083 | } | 1083 | } |
1084 | EXPORT_SYMBOL(unregister_console); | 1084 | EXPORT_SYMBOL(unregister_console); |
1085 | 1085 | ||
1086 | static int __init disable_boot_consoles(void) | ||
1087 | { | ||
1088 | if (console_drivers != NULL) { | ||
1089 | if (console_drivers->flags & CON_BOOT) { | ||
1090 | printk(KERN_INFO "turn off boot console %s%d\n", | ||
1091 | console_drivers->name, console_drivers->index); | ||
1092 | return unregister_console(console_drivers); | ||
1093 | } | ||
1094 | } | ||
1095 | return 0; | ||
1096 | } | ||
1097 | late_initcall(disable_boot_consoles); | ||
1098 | |||
1086 | /** | 1099 | /** |
1087 | * tty_write_message - write a message to a certain tty, not just the console. | 1100 | * tty_write_message - write a message to a certain tty, not just the console. |
1088 | * @tty: the destination tty_struct | 1101 | * @tty: the destination tty_struct |
diff --git a/kernel/profile.c b/kernel/profile.c index 5b20fe977bed..cb1e37d2dac3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -199,11 +199,11 @@ EXPORT_SYMBOL_GPL(register_timer_hook); | |||
199 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 199 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
200 | EXPORT_SYMBOL_GPL(task_handoff_register); | 200 | EXPORT_SYMBOL_GPL(task_handoff_register); |
201 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | 201 | EXPORT_SYMBOL_GPL(task_handoff_unregister); |
202 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
203 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
202 | 204 | ||
203 | #endif /* CONFIG_PROFILING */ | 205 | #endif /* CONFIG_PROFILING */ |
204 | 206 | ||
205 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
206 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
207 | 207 | ||
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
209 | /* | 209 | /* |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4a1745f1dadf..3eca7a55f2ee 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task) | |||
142 | return -EPERM; | 142 | return -EPERM; |
143 | smp_rmb(); | 143 | smp_rmb(); |
144 | if (task->mm) | 144 | if (task->mm) |
145 | dumpable = task->mm->dumpable; | 145 | dumpable = get_dumpable(task->mm); |
146 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | 146 | if (!dumpable && !capable(CAP_SYS_PTRACE)) |
147 | return -EPERM; | 147 | return -EPERM; |
148 | 148 | ||
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
233 | 233 | ||
234 | /* Architecture-specific hardware disable .. */ | 234 | /* Architecture-specific hardware disable .. */ |
235 | ptrace_disable(child); | 235 | ptrace_disable(child); |
236 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
236 | 237 | ||
237 | write_lock_irq(&tasklist_lock); | 238 | write_lock_irq(&tasklist_lock); |
238 | /* protect against de_thread()->release_task() */ | 239 | /* protect against de_thread()->release_task() */ |
diff --git a/kernel/relay.c b/kernel/relay.c index a615a8f513fc..ad855017bc59 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Public API and common code for kernel->userspace relay file support. | 2 | * Public API and common code for kernel->userspace relay file support. |
3 | * | 3 | * |
4 | * See Documentation/filesystems/relayfs.txt for an overview of relayfs. | 4 | * See Documentation/filesystems/relay.txt for an overview. |
5 | * | 5 | * |
6 | * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp | 6 | * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp |
7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) | 7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) |
@@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = { | |||
80 | * | 80 | * |
81 | * Caller should already have grabbed mmap_sem. | 81 | * Caller should already have grabbed mmap_sem. |
82 | */ | 82 | */ |
83 | int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) | 83 | static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) |
84 | { | 84 | { |
85 | unsigned long length = vma->vm_end - vma->vm_start; | 85 | unsigned long length = vma->vm_end - vma->vm_start; |
86 | struct file *filp = vma->vm_file; | 86 | struct file *filp = vma->vm_file; |
@@ -145,7 +145,7 @@ depopulate: | |||
145 | * | 145 | * |
146 | * Returns channel buffer if successful, %NULL otherwise. | 146 | * Returns channel buffer if successful, %NULL otherwise. |
147 | */ | 147 | */ |
148 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 148 | static struct rchan_buf *relay_create_buf(struct rchan *chan) |
149 | { | 149 | { |
150 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); | 150 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); |
151 | if (!buf) | 151 | if (!buf) |
@@ -175,7 +175,7 @@ free_buf: | |||
175 | * | 175 | * |
176 | * Should only be called from kref_put(). | 176 | * Should only be called from kref_put(). |
177 | */ | 177 | */ |
178 | void relay_destroy_channel(struct kref *kref) | 178 | static void relay_destroy_channel(struct kref *kref) |
179 | { | 179 | { |
180 | struct rchan *chan = container_of(kref, struct rchan, kref); | 180 | struct rchan *chan = container_of(kref, struct rchan, kref); |
181 | kfree(chan); | 181 | kfree(chan); |
@@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref) | |||
185 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer | 185 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer |
186 | * @buf: the buffer struct | 186 | * @buf: the buffer struct |
187 | */ | 187 | */ |
188 | void relay_destroy_buf(struct rchan_buf *buf) | 188 | static void relay_destroy_buf(struct rchan_buf *buf) |
189 | { | 189 | { |
190 | struct rchan *chan = buf->chan; | 190 | struct rchan *chan = buf->chan; |
191 | unsigned int i; | 191 | unsigned int i; |
@@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf) | |||
210 | * rchan_buf_struct and the channel buffer. Should only be called from | 210 | * rchan_buf_struct and the channel buffer. Should only be called from |
211 | * kref_put(). | 211 | * kref_put(). |
212 | */ | 212 | */ |
213 | void relay_remove_buf(struct kref *kref) | 213 | static void relay_remove_buf(struct kref *kref) |
214 | { | 214 | { |
215 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); | 215 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); |
216 | buf->chan->cb->remove_buf_file(buf->dentry); | 216 | buf->chan->cb->remove_buf_file(buf->dentry); |
@@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref) | |||
223 | * | 223 | * |
224 | * Returns 1 if the buffer is empty, 0 otherwise. | 224 | * Returns 1 if the buffer is empty, 0 otherwise. |
225 | */ | 225 | */ |
226 | int relay_buf_empty(struct rchan_buf *buf) | 226 | static int relay_buf_empty(struct rchan_buf *buf) |
227 | { | 227 | { |
228 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; | 228 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; |
229 | } | 229 | } |
230 | EXPORT_SYMBOL_GPL(relay_buf_empty); | ||
231 | 230 | ||
232 | /** | 231 | /** |
233 | * relay_buf_full - boolean, is the channel buffer full? | 232 | * relay_buf_full - boolean, is the channel buffer full? |
@@ -427,6 +426,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
427 | 426 | ||
428 | free_buf: | 427 | free_buf: |
429 | relay_destroy_buf(buf); | 428 | relay_destroy_buf(buf); |
429 | buf = NULL; | ||
430 | free_name: | 430 | free_name: |
431 | kfree(tmpname); | 431 | kfree(tmpname); |
432 | end: | 432 | end: |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9a87886b022e..1ec620c03064 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem) | |||
20 | might_sleep(); | 20 | might_sleep(); |
21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
22 | 22 | ||
23 | __down_read(sem); | 23 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
24 | } | 24 | } |
25 | 25 | ||
26 | EXPORT_SYMBOL(down_read); | 26 | EXPORT_SYMBOL(down_read); |
@@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem) | |||
47 | might_sleep(); | 47 | might_sleep(); |
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
49 | 49 | ||
50 | __down_write(sem); | 50 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
51 | } | 51 | } |
52 | 52 | ||
53 | EXPORT_SYMBOL(down_write); | 53 | EXPORT_SYMBOL(down_write); |
@@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
111 | might_sleep(); | 111 | might_sleep(); |
112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
113 | 113 | ||
114 | __down_read(sem); | 114 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
115 | } | 115 | } |
116 | 116 | ||
117 | EXPORT_SYMBOL(down_read_nested); | 117 | EXPORT_SYMBOL(down_read_nested); |
@@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
130 | might_sleep(); | 130 | might_sleep(); |
131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
132 | 132 | ||
133 | __down_write_nested(sem, subclass); | 133 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
134 | } | 134 | } |
135 | 135 | ||
136 | EXPORT_SYMBOL(down_write_nested); | 136 | EXPORT_SYMBOL(down_write_nested); |
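LOCK_CONTENDED() comes from the lock-statistics infrastructure: with CONFIG_LOCK_STAT enabled it first attempts the trylock variant so contention can be recorded before falling back to the sleeping path, and without it the macro reduces to the plain lock call. A hand-expanded approximation for the down_read() case (the argument lists of the bookkeeping hooks are approximate, not taken from this diff):

    #include <linux/lockdep.h>
    #include <linux/rwsem.h>

    /* Roughly what LOCK_CONTENDED(sem, __down_read_trylock, __down_read)
     * does when lock statistics are enabled. */
    static void example_down_read_expanded(struct rw_semaphore *sem)
    {
            if (!__down_read_trylock(sem)) {
                    lock_contended(&sem->dep_map, _RET_IP_);   /* record contention */
                    __down_read(sem);
            }
            lock_acquired(&sem->dep_map);
    }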
diff --git a/kernel/sched.c b/kernel/sched.c index cb31fb4a1379..6c10fa796ca0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/percpu.h> | 53 | #include <linux/percpu.h> |
54 | #include <linux/kthread.h> | 54 | #include <linux/kthread.h> |
55 | #include <linux/seq_file.h> | 55 | #include <linux/seq_file.h> |
56 | #include <linux/sysctl.h> | ||
56 | #include <linux/syscalls.h> | 57 | #include <linux/syscalls.h> |
57 | #include <linux/times.h> | 58 | #include <linux/times.h> |
58 | #include <linux/tsacct_kern.h> | 59 | #include <linux/tsacct_kern.h> |
@@ -60,6 +61,7 @@ | |||
60 | #include <linux/delayacct.h> | 61 | #include <linux/delayacct.h> |
61 | #include <linux/reciprocal_div.h> | 62 | #include <linux/reciprocal_div.h> |
62 | #include <linux/unistd.h> | 63 | #include <linux/unistd.h> |
64 | #include <linux/pagemap.h> | ||
63 | 65 | ||
64 | #include <asm/tlb.h> | 66 | #include <asm/tlb.h> |
65 | 67 | ||
@@ -261,9 +263,9 @@ struct rq { | |||
261 | s64 clock_max_delta; | 263 | s64 clock_max_delta; |
262 | 264 | ||
263 | unsigned int clock_warps, clock_overflows; | 265 | unsigned int clock_warps, clock_overflows; |
264 | unsigned int clock_unstable_events; | 266 | u64 idle_clock; |
265 | 267 | unsigned int clock_deep_idle_events; | |
266 | struct sched_class *load_balance_class; | 268 | u64 tick_timestamp; |
267 | 269 | ||
268 | atomic_t nr_iowait; | 270 | atomic_t nr_iowait; |
269 | 271 | ||
@@ -301,7 +303,7 @@ struct rq { | |||
301 | struct lock_class_key rq_lock_key; | 303 | struct lock_class_key rq_lock_key; |
302 | }; | 304 | }; |
303 | 305 | ||
304 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; | 306 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
305 | static DEFINE_MUTEX(sched_hotcpu_mutex); | 307 | static DEFINE_MUTEX(sched_hotcpu_mutex); |
306 | 308 | ||
307 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 309 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
@@ -319,15 +321,19 @@ static inline int cpu_of(struct rq *rq) | |||
319 | } | 321 | } |
320 | 322 | ||
321 | /* | 323 | /* |
322 | * Per-runqueue clock, as finegrained as the platform can give us: | 324 | * Update the per-runqueue clock, as finegrained as the platform can give |
325 | * us, but without assuming monotonicity, etc.: | ||
323 | */ | 326 | */ |
324 | static unsigned long long __rq_clock(struct rq *rq) | 327 | static void __update_rq_clock(struct rq *rq) |
325 | { | 328 | { |
326 | u64 prev_raw = rq->prev_clock_raw; | 329 | u64 prev_raw = rq->prev_clock_raw; |
327 | u64 now = sched_clock(); | 330 | u64 now = sched_clock(); |
328 | s64 delta = now - prev_raw; | 331 | s64 delta = now - prev_raw; |
329 | u64 clock = rq->clock; | 332 | u64 clock = rq->clock; |
330 | 333 | ||
334 | #ifdef CONFIG_SCHED_DEBUG | ||
335 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
336 | #endif | ||
331 | /* | 337 | /* |
332 | * Protect against sched_clock() occasionally going backwards: | 338 | * Protect against sched_clock() occasionally going backwards: |
333 | */ | 339 | */ |
@@ -338,8 +344,11 @@ static unsigned long long __rq_clock(struct rq *rq) | |||
338 | /* | 344 | /* |
339 | * Catch too large forward jumps too: | 345 | * Catch too large forward jumps too: |
340 | */ | 346 | */ |
341 | if (unlikely(delta > 2*TICK_NSEC)) { | 347 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { |
342 | clock++; | 348 | if (clock < rq->tick_timestamp + TICK_NSEC) |
349 | clock = rq->tick_timestamp + TICK_NSEC; | ||
350 | else | ||
351 | clock++; | ||
343 | rq->clock_overflows++; | 352 | rq->clock_overflows++; |
344 | } else { | 353 | } else { |
345 | if (unlikely(delta > rq->clock_max_delta)) | 354 | if (unlikely(delta > rq->clock_max_delta)) |
@@ -350,18 +359,12 @@ static unsigned long long __rq_clock(struct rq *rq) | |||
350 | 359 | ||
351 | rq->prev_clock_raw = now; | 360 | rq->prev_clock_raw = now; |
352 | rq->clock = clock; | 361 | rq->clock = clock; |
353 | |||
354 | return clock; | ||
355 | } | 362 | } |
356 | 363 | ||
357 | static inline unsigned long long rq_clock(struct rq *rq) | 364 | static void update_rq_clock(struct rq *rq) |
358 | { | 365 | { |
359 | int this_cpu = smp_processor_id(); | 366 | if (likely(smp_processor_id() == cpu_of(rq))) |
360 | 367 | __update_rq_clock(rq); | |
361 | if (this_cpu == cpu_of(rq)) | ||
362 | return __rq_clock(rq); | ||
363 | |||
364 | return rq->clock; | ||
365 | } | 368 | } |
366 | 369 | ||
367 | /* | 370 | /* |
@@ -379,6 +382,25 @@ static inline unsigned long long rq_clock(struct rq *rq) | |||
379 | #define task_rq(p) cpu_rq(task_cpu(p)) | 382 | #define task_rq(p) cpu_rq(task_cpu(p)) |
380 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 383 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
381 | 384 | ||
385 | /* | ||
386 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
387 | * clock constructed from sched_clock(): | ||
388 | */ | ||
389 | unsigned long long cpu_clock(int cpu) | ||
390 | { | ||
391 | unsigned long long now; | ||
392 | unsigned long flags; | ||
393 | struct rq *rq; | ||
394 | |||
395 | local_irq_save(flags); | ||
396 | rq = cpu_rq(cpu); | ||
397 | update_rq_clock(rq); | ||
398 | now = rq->clock; | ||
399 | local_irq_restore(flags); | ||
400 | |||
401 | return now; | ||
402 | } | ||
403 | |||
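cpu_clock() gives callers a nanosecond timestamp that update_rq_clock() keeps locally monotonic, without the caller having to touch rq->lock itself. A minimal (hypothetical) timing sketch:

    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/smp.h>

    static void example_time_section(void)
    {
            int cpu = get_cpu();                    /* stay on one CPU for the sample */
            unsigned long long t0 = cpu_clock(cpu);

            /* ... the section being timed ... */

            printk(KERN_DEBUG "section took %llu ns\n", cpu_clock(cpu) - t0);
            put_cpu();
    }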
382 | #ifdef CONFIG_FAIR_GROUP_SCHED | 404 | #ifdef CONFIG_FAIR_GROUP_SCHED |
383 | /* Change a task's ->cfs_rq if it moves across CPUs */ | 405 | /* Change a task's ->cfs_rq if it moves across CPUs */ |
384 | static inline void set_task_cfs_rq(struct task_struct *p) | 406 | static inline void set_task_cfs_rq(struct task_struct *p) |
@@ -536,18 +558,40 @@ static inline struct rq *this_rq_lock(void) | |||
536 | } | 558 | } |
537 | 559 | ||
538 | /* | 560 | /* |
539 | * CPU frequency is/was unstable - start new by setting prev_clock_raw: | 561 | * We are going deep-idle (irqs are disabled): |
540 | */ | 562 | */ |
541 | void sched_clock_unstable_event(void) | 563 | void sched_clock_idle_sleep_event(void) |
542 | { | 564 | { |
543 | unsigned long flags; | 565 | struct rq *rq = cpu_rq(smp_processor_id()); |
544 | struct rq *rq; | ||
545 | 566 | ||
546 | rq = task_rq_lock(current, &flags); | 567 | spin_lock(&rq->lock); |
547 | rq->prev_clock_raw = sched_clock(); | 568 | __update_rq_clock(rq); |
548 | rq->clock_unstable_events++; | 569 | spin_unlock(&rq->lock); |
549 | task_rq_unlock(rq, &flags); | 570 | rq->clock_deep_idle_events++; |
550 | } | 571 | } |
572 | EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); | ||
573 | |||
574 | /* | ||
575 | * We just idled delta nanoseconds (called with irqs disabled): | ||
576 | */ | ||
577 | void sched_clock_idle_wakeup_event(u64 delta_ns) | ||
578 | { | ||
579 | struct rq *rq = cpu_rq(smp_processor_id()); | ||
580 | u64 now = sched_clock(); | ||
581 | |||
582 | rq->idle_clock += delta_ns; | ||
583 | /* | ||
584 | * Override the previous timestamp and ignore all | ||
585 | * sched_clock() deltas that occured while we idled, | ||
585 | * sched_clock() deltas that occurred while we idled, | ||
586 | * and use the PM-provided delta_ns to advance the | ||
587 | * rq clock: | ||
588 | */ | ||
589 | spin_lock(&rq->lock); | ||
590 | rq->prev_clock_raw = now; | ||
591 | rq->clock += delta_ns; | ||
592 | spin_unlock(&rq->lock); | ||
593 | } | ||
594 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | ||
551 | 595 | ||
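Note: the intended caller of this pair is platform idle code: announce deep-idle entry with irqs off, measure the time actually spent idle with a PM-provided clock, then feed that delta back so rq->clock skips the period in which sched_clock() may have been unreliable. A hedged sketch of such a caller (the ktime-based measurement and the wrapper are assumptions for illustration, not the real ACPI idle path):

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/* Hypothetical deep-idle wrapper: irqs are assumed to be disabled already. */
static void deep_idle_enter(void (*low_power_wait)(void))
{
        ktime_t t0, t1;

        sched_clock_idle_sleep_event();         /* about to go deep-idle */

        t0 = ktime_get();
        low_power_wait();                       /* platform-specific wait */
        t1 = ktime_get();

        /* advance rq->clock by the externally measured idle time */
        sched_clock_idle_wakeup_event(ktime_to_ns(ktime_sub(t1, t0)));
}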
552 | /* | 596 | /* |
553 | * resched_task - mark a task 'to be rescheduled now'. | 597 | * resched_task - mark a task 'to be rescheduled now'. |
@@ -622,27 +666,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) | |||
622 | 666 | ||
623 | #define WMULT_SHIFT 32 | 667 | #define WMULT_SHIFT 32 |
624 | 668 | ||
625 | static inline unsigned long | 669 | /* |
670 | * Shift right and round: | ||
671 | */ | ||
672 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
673 | |||
674 | static unsigned long | ||
626 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 675 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
627 | struct load_weight *lw) | 676 | struct load_weight *lw) |
628 | { | 677 | { |
629 | u64 tmp; | 678 | u64 tmp; |
630 | 679 | ||
631 | if (unlikely(!lw->inv_weight)) | 680 | if (unlikely(!lw->inv_weight)) |
632 | lw->inv_weight = WMULT_CONST / lw->weight; | 681 | lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; |
633 | 682 | ||
634 | tmp = (u64)delta_exec * weight; | 683 | tmp = (u64)delta_exec * weight; |
635 | /* | 684 | /* |
636 | * Check whether we'd overflow the 64-bit multiplication: | 685 | * Check whether we'd overflow the 64-bit multiplication: |
637 | */ | 686 | */ |
638 | if (unlikely(tmp > WMULT_CONST)) { | 687 | if (unlikely(tmp > WMULT_CONST)) |
639 | tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) | 688 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, |
640 | >> (WMULT_SHIFT/2); | 689 | WMULT_SHIFT/2); |
641 | } else { | 690 | else |
642 | tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; | 691 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); |
643 | } | ||
644 | 692 | ||
645 | return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); | 693 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
646 | } | 694 | } |
647 | 695 | ||
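Note: the rounding change is easiest to see in isolation: inv_weight is now a rounded reciprocal of the queue weight (about 2^32/weight), and SRR() rounds rather than truncates when shifting the product back down. A standalone sketch of that arithmetic (WMULT_CONST below is an assumed stand-in of roughly 2^32; the real constant is defined outside this hunk):

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT     32
#define WMULT_CONST     (1ULL << WMULT_SHIFT)          /* assumed stand-in */
#define SRR(x, y)       (((x) + (1ULL << ((y) - 1))) >> (y))

/* delta_exec scaled by weight / queue_weight, as in calc_delta_mine() */
static uint64_t scale_delta(uint64_t delta_exec, uint64_t weight,
                            uint64_t queue_weight)
{
        uint64_t inv = (WMULT_CONST - queue_weight / 2) / queue_weight + 1;
        uint64_t tmp = delta_exec * weight;

        if (tmp > WMULT_CONST)
                return SRR(SRR(tmp, WMULT_SHIFT / 2) * inv, WMULT_SHIFT / 2);
        return SRR(tmp * inv, WMULT_SHIFT);
}

int main(void)
{
        /* a nice-0 task (weight 1024) on a queue with total weight 2048:
         * 3ms of wall time scales to ~1.5ms of weighted time */
        printf("%llu\n",
               (unsigned long long)scale_delta(3000000, 1024, 2048));
        return 0;
}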
648 | static inline unsigned long | 696 | static inline unsigned long |
@@ -663,46 +711,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
663 | lw->inv_weight = 0; | 711 | lw->inv_weight = 0; |
664 | } | 712 | } |
665 | 713 | ||
666 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
667 | { | ||
668 | if (rq->curr != rq->idle && ls->load.weight) { | ||
669 | ls->delta_exec += ls->delta_stat; | ||
670 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
671 | ls->delta_stat = 0; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | /* | ||
676 | * Update delta_exec, delta_fair fields for rq. | ||
677 | * | ||
678 | * delta_fair clock advances at a rate inversely proportional to | ||
679 | * total load (rq->ls.load.weight) on the runqueue, while | ||
680 | * delta_exec advances at the same rate as wall-clock (provided | ||
681 | * cpu is not idle). | ||
682 | * | ||
683 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
684 | * runqueue over any given interval. This (smoothened) load is used | ||
685 | * during load balance. | ||
686 | * | ||
687 | * This function is called /before/ updating rq->ls.load | ||
688 | * and when switching tasks. | ||
689 | */ | ||
690 | static void update_curr_load(struct rq *rq, u64 now) | ||
691 | { | ||
692 | struct load_stat *ls = &rq->ls; | ||
693 | u64 start; | ||
694 | |||
695 | start = ls->load_update_start; | ||
696 | ls->load_update_start = now; | ||
697 | ls->delta_stat += now - start; | ||
698 | /* | ||
699 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
700 | * can be expensive. | ||
701 | */ | ||
702 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
703 | __update_curr_load(rq, ls); | ||
704 | } | ||
705 | |||
706 | /* | 714 | /* |
707 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 715 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
708 | * of tasks with abnormal "nice" values across CPUs the contribution that | 716 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -712,19 +720,6 @@ static void update_curr_load(struct rq *rq, u64 now) | |||
712 | * slice expiry etc. | 720 | * slice expiry etc. |
713 | */ | 721 | */ |
714 | 722 | ||
715 | /* | ||
716 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
717 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
718 | * this code will need modification | ||
719 | */ | ||
720 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
721 | #define load_weight(lp) \ | ||
722 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
723 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
724 | load_weight(static_prio_timeslice(prio)) | ||
725 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
726 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) | ||
727 | |||
728 | #define WEIGHT_IDLEPRIO 2 | 723 | #define WEIGHT_IDLEPRIO 2 |
729 | #define WMULT_IDLEPRIO (1 << 31) | 724 | #define WMULT_IDLEPRIO (1 << 31) |
730 | 725 | ||
@@ -741,11 +736,14 @@ static void update_curr_load(struct rq *rq, u64 now) | |||
741 | * the relative distance between them is ~25%.) | 736 | * the relative distance between them is ~25%.) |
742 | */ | 737 | */ |
743 | static const int prio_to_weight[40] = { | 738 | static const int prio_to_weight[40] = { |
744 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, | 739 | /* -20 */ 88761, 71755, 56483, 46273, 36291, |
745 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, | 740 | /* -15 */ 29154, 23254, 18705, 14949, 11916, |
746 | /* 0 */ NICE_0_LOAD /* 1024 */, | 741 | /* -10 */ 9548, 7620, 6100, 4904, 3906, |
747 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, | 742 | /* -5 */ 3121, 2501, 1991, 1586, 1277, |
748 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, | 743 | /* 0 */ 1024, 820, 655, 526, 423, |
744 | /* 5 */ 335, 272, 215, 172, 137, | ||
745 | /* 10 */ 110, 87, 70, 56, 45, | ||
746 | /* 15 */ 36, 29, 23, 18, 15, | ||
749 | }; | 747 | }; |
750 | 748 | ||
751 | /* | 749 | /* |
@@ -756,42 +754,16 @@ static const int prio_to_weight[40] = { | |||
756 | * into multiplications: | 754 | * into multiplications: |
757 | */ | 755 | */ |
758 | static const u32 prio_to_wmult[40] = { | 756 | static const u32 prio_to_wmult[40] = { |
759 | /* -20 */ 48356, 60446, 75558, 94446, 118058, | 757 | /* -20 */ 48388, 59856, 76040, 92818, 118348, |
760 | /* -15 */ 147573, 184467, 230589, 288233, 360285, | 758 | /* -15 */ 147320, 184698, 229616, 287308, 360437, |
761 | /* -10 */ 450347, 562979, 703746, 879575, 1099582, | 759 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, |
762 | /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, | 760 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, |
763 | /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, | 761 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, |
764 | /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, | 762 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, |
765 | /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, | 763 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
766 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 764 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
767 | }; | 765 | }; |
768 | 766 | ||
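Note: the two tables are tied together: prio_to_wmult[i] is approximately 2^32 / prio_to_weight[i], and each nice level changes the weight by roughly 1.25x. A quick standalone check using the nice 0..4 entries visible in the hunk above:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* nice 0..4 entries copied from prio_to_weight[] above */
        const unsigned int weight[] = { 1024, 820, 655, 526, 423 };

        for (int i = 0; i < 5; i++) {
                unsigned long long wmult = (1ULL << 32) / weight[i];

                printf("nice %d: weight %4u  ~2^32/weight %8llu  step x%.3f\n",
                       i, weight[i], wmult,
                       i ? (double)weight[i - 1] / weight[i] : 1.0);
        }
        return 0;
}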
769 | static inline void | ||
770 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) | ||
771 | { | ||
772 | update_curr_load(rq, now); | ||
773 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
774 | } | ||
775 | |||
776 | static inline void | ||
777 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) | ||
778 | { | ||
779 | update_curr_load(rq, now); | ||
780 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
781 | } | ||
782 | |||
783 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) | ||
784 | { | ||
785 | rq->nr_running++; | ||
786 | inc_load(rq, p, now); | ||
787 | } | ||
788 | |||
789 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) | ||
790 | { | ||
791 | rq->nr_running--; | ||
792 | dec_load(rq, p, now); | ||
793 | } | ||
794 | |||
795 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | 767 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); |
796 | 768 | ||
797 | /* | 769 | /* |
@@ -809,8 +781,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
809 | unsigned long max_nr_move, unsigned long max_load_move, | 781 | unsigned long max_nr_move, unsigned long max_load_move, |
810 | struct sched_domain *sd, enum cpu_idle_type idle, | 782 | struct sched_domain *sd, enum cpu_idle_type idle, |
811 | int *all_pinned, unsigned long *load_moved, | 783 | int *all_pinned, unsigned long *load_moved, |
812 | int this_best_prio, int best_prio, int best_prio_seen, | 784 | int *this_best_prio, struct rq_iterator *iterator); |
813 | struct rq_iterator *iterator); | ||
814 | 785 | ||
815 | #include "sched_stats.h" | 786 | #include "sched_stats.h" |
816 | #include "sched_rt.c" | 787 | #include "sched_rt.c" |
@@ -822,9 +793,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
822 | 793 | ||
823 | #define sched_class_highest (&rt_sched_class) | 794 | #define sched_class_highest (&rt_sched_class) |
824 | 795 | ||
796 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
797 | { | ||
798 | if (rq->curr != rq->idle && ls->load.weight) { | ||
799 | ls->delta_exec += ls->delta_stat; | ||
800 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
801 | ls->delta_stat = 0; | ||
802 | } | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Update delta_exec, delta_fair fields for rq. | ||
807 | * | ||
808 | * delta_fair clock advances at a rate inversely proportional to | ||
809 | * total load (rq->ls.load.weight) on the runqueue, while | ||
810 | * delta_exec advances at the same rate as wall-clock (provided | ||
811 | * cpu is not idle). | ||
812 | * | ||
813 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
814 | * runqueue over any given interval. This (smoothened) load is used | ||
815 | * during load balance. | ||
816 | * | ||
817 | * This function is called /before/ updating rq->ls.load | ||
818 | * and when switching tasks. | ||
819 | */ | ||
820 | static void update_curr_load(struct rq *rq) | ||
821 | { | ||
822 | struct load_stat *ls = &rq->ls; | ||
823 | u64 start; | ||
824 | |||
825 | start = ls->load_update_start; | ||
826 | ls->load_update_start = rq->clock; | ||
827 | ls->delta_stat += rq->clock - start; | ||
828 | /* | ||
829 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
830 | * can be expensive. | ||
831 | */ | ||
832 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
833 | __update_curr_load(rq, ls); | ||
834 | } | ||
835 | |||
836 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
837 | { | ||
838 | update_curr_load(rq); | ||
839 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
840 | } | ||
841 | |||
842 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
843 | { | ||
844 | update_curr_load(rq); | ||
845 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
846 | } | ||
847 | |||
848 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
849 | { | ||
850 | rq->nr_running++; | ||
851 | inc_load(rq, p); | ||
852 | } | ||
853 | |||
854 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | ||
855 | { | ||
856 | rq->nr_running--; | ||
857 | dec_load(rq, p); | ||
858 | } | ||
859 | |||
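Note: to make the delta_exec/delta_fair comment above concrete: with two runnable nice-0 tasks the queue weight is twice NICE_0_LOAD, so the fair clock advances at half the wall rate and delta_exec/delta_fair comes out as ~2, i.e. the load in NICE_0_LOAD units. A tiny standalone illustration of that ratio (NICE_0_LOAD assumed to be 1024, matching the nice-0 table entry):

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL     /* assumed, matching the nice-0 weight */

int main(void)
{
        uint64_t delta_exec = 10000000;          /* 10ms of wall time */
        uint64_t queue_weight = 2 * NICE_0_LOAD; /* two nice-0 tasks */

        /* the fair clock advances inversely proportional to queue weight */
        uint64_t delta_fair = delta_exec * NICE_0_LOAD / queue_weight;

        printf("smoothed load ~= %llu\n",
               (unsigned long long)(delta_exec / delta_fair));   /* 2 */
        return 0;
}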
825 | static void set_load_weight(struct task_struct *p) | 860 | static void set_load_weight(struct task_struct *p) |
826 | { | 861 | { |
827 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; | ||
828 | p->se.wait_runtime = 0; | 862 | p->se.wait_runtime = 0; |
829 | 863 | ||
830 | if (task_has_rt_policy(p)) { | 864 | if (task_has_rt_policy(p)) { |
@@ -846,18 +880,16 @@ static void set_load_weight(struct task_struct *p) | |||
846 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 880 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
847 | } | 881 | } |
848 | 882 | ||
849 | static void | 883 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
850 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
851 | { | 884 | { |
852 | sched_info_queued(p); | 885 | sched_info_queued(p); |
853 | p->sched_class->enqueue_task(rq, p, wakeup, now); | 886 | p->sched_class->enqueue_task(rq, p, wakeup); |
854 | p->se.on_rq = 1; | 887 | p->se.on_rq = 1; |
855 | } | 888 | } |
856 | 889 | ||
857 | static void | 890 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
858 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
859 | { | 891 | { |
860 | p->sched_class->dequeue_task(rq, p, sleep, now); | 892 | p->sched_class->dequeue_task(rq, p, sleep); |
861 | p->se.on_rq = 0; | 893 | p->se.on_rq = 0; |
862 | } | 894 | } |
863 | 895 | ||
@@ -912,13 +944,11 @@ static int effective_prio(struct task_struct *p) | |||
912 | */ | 944 | */ |
913 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | 945 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
914 | { | 946 | { |
915 | u64 now = rq_clock(rq); | ||
916 | |||
917 | if (p->state == TASK_UNINTERRUPTIBLE) | 947 | if (p->state == TASK_UNINTERRUPTIBLE) |
918 | rq->nr_uninterruptible--; | 948 | rq->nr_uninterruptible--; |
919 | 949 | ||
920 | enqueue_task(rq, p, wakeup, now); | 950 | enqueue_task(rq, p, wakeup); |
921 | inc_nr_running(p, rq, now); | 951 | inc_nr_running(p, rq); |
922 | } | 952 | } |
923 | 953 | ||
924 | /* | 954 | /* |
@@ -926,13 +956,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
926 | */ | 956 | */ |
927 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | 957 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) |
928 | { | 958 | { |
929 | u64 now = rq_clock(rq); | 959 | update_rq_clock(rq); |
930 | 960 | ||
931 | if (p->state == TASK_UNINTERRUPTIBLE) | 961 | if (p->state == TASK_UNINTERRUPTIBLE) |
932 | rq->nr_uninterruptible--; | 962 | rq->nr_uninterruptible--; |
933 | 963 | ||
934 | enqueue_task(rq, p, 0, now); | 964 | enqueue_task(rq, p, 0); |
935 | inc_nr_running(p, rq, now); | 965 | inc_nr_running(p, rq); |
936 | } | 966 | } |
937 | 967 | ||
938 | /* | 968 | /* |
@@ -940,13 +970,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | |||
940 | */ | 970 | */ |
941 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
942 | { | 972 | { |
943 | u64 now = rq_clock(rq); | ||
944 | |||
945 | if (p->state == TASK_UNINTERRUPTIBLE) | 973 | if (p->state == TASK_UNINTERRUPTIBLE) |
946 | rq->nr_uninterruptible++; | 974 | rq->nr_uninterruptible++; |
947 | 975 | ||
948 | dequeue_task(rq, p, sleep, now); | 976 | dequeue_task(rq, p, sleep); |
949 | dec_nr_running(p, rq, now); | 977 | dec_nr_running(p, rq); |
950 | } | 978 | } |
951 | 979 | ||
952 | /** | 980 | /** |
@@ -981,18 +1009,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
981 | u64 clock_offset, fair_clock_offset; | 1009 | u64 clock_offset, fair_clock_offset; |
982 | 1010 | ||
983 | clock_offset = old_rq->clock - new_rq->clock; | 1011 | clock_offset = old_rq->clock - new_rq->clock; |
984 | fair_clock_offset = old_rq->cfs.fair_clock - | 1012 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; |
985 | new_rq->cfs.fair_clock; | 1013 | |
986 | if (p->se.wait_start) | ||
987 | p->se.wait_start -= clock_offset; | ||
988 | if (p->se.wait_start_fair) | 1014 | if (p->se.wait_start_fair) |
989 | p->se.wait_start_fair -= fair_clock_offset; | 1015 | p->se.wait_start_fair -= fair_clock_offset; |
1016 | if (p->se.sleep_start_fair) | ||
1017 | p->se.sleep_start_fair -= fair_clock_offset; | ||
1018 | |||
1019 | #ifdef CONFIG_SCHEDSTATS | ||
1020 | if (p->se.wait_start) | ||
1021 | p->se.wait_start -= clock_offset; | ||
990 | if (p->se.sleep_start) | 1022 | if (p->se.sleep_start) |
991 | p->se.sleep_start -= clock_offset; | 1023 | p->se.sleep_start -= clock_offset; |
992 | if (p->se.block_start) | 1024 | if (p->se.block_start) |
993 | p->se.block_start -= clock_offset; | 1025 | p->se.block_start -= clock_offset; |
994 | if (p->se.sleep_start_fair) | 1026 | #endif |
995 | p->se.sleep_start_fair -= fair_clock_offset; | ||
996 | 1027 | ||
997 | __set_task_cpu(p, new_cpu); | 1028 | __set_task_cpu(p, new_cpu); |
998 | } | 1029 | } |
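Note: the offset adjustments above keep per-task timestamps meaningful across CPUs whose clocks have drifted apart: each absolute timestamp taken on the old runqueue is rebased by the difference between the two runqueue clocks, so intervals measured against the new clock stay the same. A minimal standalone sketch of the rebasing rule:

#include <stdio.h>
#include <stdint.h>

/* Rebase a timestamp taken against old_clock so that (new_clock - stamp)
 * measures the same interval it did on the old runqueue. */
static uint64_t rebase(uint64_t stamp, uint64_t old_clock, uint64_t new_clock)
{
        return stamp - (old_clock - new_clock);
}

int main(void)
{
        uint64_t old_clock = 900, new_clock = 500, wait_start = 870;

        /* the task had been waiting for 30 ns on the old CPU ... */
        printf("%llu\n", (unsigned long long)(old_clock - wait_start));
        /* ... and still shows 30 ns of waiting against the new clock */
        printf("%llu\n", (unsigned long long)
               (new_clock - rebase(wait_start, old_clock, new_clock)));
        return 0;
}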
@@ -1511,6 +1542,7 @@ out_set_cpu: | |||
1511 | 1542 | ||
1512 | out_activate: | 1543 | out_activate: |
1513 | #endif /* CONFIG_SMP */ | 1544 | #endif /* CONFIG_SMP */ |
1545 | update_rq_clock(rq); | ||
1514 | activate_task(rq, p, 1); | 1546 | activate_task(rq, p, 1); |
1515 | /* | 1547 | /* |
1516 | * Sync wakeups (i.e. those types of wakeups where the waker | 1548 | * Sync wakeups (i.e. those types of wakeups where the waker |
@@ -1553,17 +1585,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
1553 | static void __sched_fork(struct task_struct *p) | 1585 | static void __sched_fork(struct task_struct *p) |
1554 | { | 1586 | { |
1555 | p->se.wait_start_fair = 0; | 1587 | p->se.wait_start_fair = 0; |
1556 | p->se.wait_start = 0; | ||
1557 | p->se.exec_start = 0; | 1588 | p->se.exec_start = 0; |
1558 | p->se.sum_exec_runtime = 0; | 1589 | p->se.sum_exec_runtime = 0; |
1590 | p->se.prev_sum_exec_runtime = 0; | ||
1559 | p->se.delta_exec = 0; | 1591 | p->se.delta_exec = 0; |
1560 | p->se.delta_fair_run = 0; | 1592 | p->se.delta_fair_run = 0; |
1561 | p->se.delta_fair_sleep = 0; | 1593 | p->se.delta_fair_sleep = 0; |
1562 | p->se.wait_runtime = 0; | 1594 | p->se.wait_runtime = 0; |
1595 | p->se.sleep_start_fair = 0; | ||
1596 | |||
1597 | #ifdef CONFIG_SCHEDSTATS | ||
1598 | p->se.wait_start = 0; | ||
1563 | p->se.sum_wait_runtime = 0; | 1599 | p->se.sum_wait_runtime = 0; |
1564 | p->se.sum_sleep_runtime = 0; | 1600 | p->se.sum_sleep_runtime = 0; |
1565 | p->se.sleep_start = 0; | 1601 | p->se.sleep_start = 0; |
1566 | p->se.sleep_start_fair = 0; | ||
1567 | p->se.block_start = 0; | 1602 | p->se.block_start = 0; |
1568 | p->se.sleep_max = 0; | 1603 | p->se.sleep_max = 0; |
1569 | p->se.block_max = 0; | 1604 | p->se.block_max = 0; |
@@ -1571,10 +1606,15 @@ static void __sched_fork(struct task_struct *p) | |||
1571 | p->se.wait_max = 0; | 1606 | p->se.wait_max = 0; |
1572 | p->se.wait_runtime_overruns = 0; | 1607 | p->se.wait_runtime_overruns = 0; |
1573 | p->se.wait_runtime_underruns = 0; | 1608 | p->se.wait_runtime_underruns = 0; |
1609 | #endif | ||
1574 | 1610 | ||
1575 | INIT_LIST_HEAD(&p->run_list); | 1611 | INIT_LIST_HEAD(&p->run_list); |
1576 | p->se.on_rq = 0; | 1612 | p->se.on_rq = 0; |
1577 | 1613 | ||
1614 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
1615 | INIT_HLIST_HEAD(&p->preempt_notifiers); | ||
1616 | #endif | ||
1617 | |||
1578 | /* | 1618 | /* |
1579 | * We mark the process as running here, but have not actually | 1619 | * We mark the process as running here, but have not actually |
1580 | * inserted it onto the runqueue yet. This guarantees that | 1620 | * inserted it onto the runqueue yet. This guarantees that |
@@ -1639,11 +1679,19 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1639 | rq = task_rq_lock(p, &flags); | 1679 | rq = task_rq_lock(p, &flags); |
1640 | BUG_ON(p->state != TASK_RUNNING); | 1680 | BUG_ON(p->state != TASK_RUNNING); |
1641 | this_cpu = smp_processor_id(); /* parent's CPU */ | 1681 | this_cpu = smp_processor_id(); /* parent's CPU */ |
1682 | update_rq_clock(rq); | ||
1642 | 1683 | ||
1643 | p->prio = effective_prio(p); | 1684 | p->prio = effective_prio(p); |
1644 | 1685 | ||
1645 | if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || | 1686 | if (rt_prio(p->prio)) |
1646 | task_cpu(p) != this_cpu || !current->se.on_rq) { | 1687 | p->sched_class = &rt_sched_class; |
1688 | else | ||
1689 | p->sched_class = &fair_sched_class; | ||
1690 | |||
1691 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | ||
1692 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | ||
1693 | !current->se.on_rq) { | ||
1694 | |||
1647 | activate_task(rq, p, 0); | 1695 | activate_task(rq, p, 0); |
1648 | } else { | 1696 | } else { |
1649 | /* | 1697 | /* |
@@ -1651,14 +1699,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1651 | * management (if any): | 1699 | * management (if any): |
1652 | */ | 1700 | */ |
1653 | p->sched_class->task_new(rq, p); | 1701 | p->sched_class->task_new(rq, p); |
1702 | inc_nr_running(p, rq); | ||
1654 | } | 1703 | } |
1655 | check_preempt_curr(rq, p); | 1704 | check_preempt_curr(rq, p); |
1656 | task_rq_unlock(rq, &flags); | 1705 | task_rq_unlock(rq, &flags); |
1657 | } | 1706 | } |
1658 | 1707 | ||
1708 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
1709 | |||
1710 | /** | ||
1711 | * preempt_notifier_register - tell me when current is being being preempted & rescheduled | ||
1711 | * preempt_notifier_register - tell me when current is being preempted & rescheduled | ||
1712 | * @notifier: notifier struct to register | ||
1713 | */ | ||
1714 | void preempt_notifier_register(struct preempt_notifier *notifier) | ||
1715 | { | ||
1716 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); | ||
1717 | } | ||
1718 | EXPORT_SYMBOL_GPL(preempt_notifier_register); | ||
1719 | |||
1720 | /** | ||
1721 | * preempt_notifier_unregister - no longer interested in preemption notifications | ||
1722 | * @notifier: notifier struct to unregister | ||
1723 | * | ||
1724 | * This is safe to call from within a preemption notifier. | ||
1725 | */ | ||
1726 | void preempt_notifier_unregister(struct preempt_notifier *notifier) | ||
1727 | { | ||
1728 | hlist_del(¬ifier->link); | ||
1729 | } | ||
1730 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | ||
1731 | |||
1732 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | ||
1733 | { | ||
1734 | struct preempt_notifier *notifier; | ||
1735 | struct hlist_node *node; | ||
1736 | |||
1737 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | ||
1738 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | ||
1739 | } | ||
1740 | |||
1741 | static void | ||
1742 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | ||
1743 | struct task_struct *next) | ||
1744 | { | ||
1745 | struct preempt_notifier *notifier; | ||
1746 | struct hlist_node *node; | ||
1747 | |||
1748 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | ||
1749 | notifier->ops->sched_out(notifier, next); | ||
1750 | } | ||
1751 | |||
1752 | #else | ||
1753 | |||
1754 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | ||
1755 | { | ||
1756 | } | ||
1757 | |||
1758 | static void | ||
1759 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | ||
1760 | struct task_struct *next) | ||
1761 | { | ||
1762 | } | ||
1763 | |||
1764 | #endif | ||
1765 | |||
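Note: the notifier pair gives a subsystem callbacks around context switches of the task that registered them (CONFIG_PREEMPT_NOTIFIERS only). A hedged sketch of a client (the struct names and messages are invented for illustration; preempt_notifier_init() is assumed from the matching <linux/preempt.h> change, and only the register/unregister calls come from the code above):

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/sched.h>

/* Hypothetical per-task context that wants to know when it loses the CPU. */
struct demo_ctx {
        struct preempt_notifier notifier;
};

static void demo_sched_in(struct preempt_notifier *pn, int cpu)
{
        printk(KERN_DEBUG "demo: scheduled back in on cpu%d\n", cpu);
}

static void demo_sched_out(struct preempt_notifier *pn,
                           struct task_struct *next)
{
        printk(KERN_DEBUG "demo: preempted by %s\n", next->comm);
}

static struct preempt_ops demo_preempt_ops = {
        .sched_in  = demo_sched_in,
        .sched_out = demo_sched_out,
};

/* Called from the task that wants notifications about itself. */
static void demo_start_watching(struct demo_ctx *ctx)
{
        preempt_notifier_init(&ctx->notifier, &demo_preempt_ops);
        preempt_notifier_register(&ctx->notifier);
}

static void demo_stop_watching(struct demo_ctx *ctx)
{
        preempt_notifier_unregister(&ctx->notifier);
}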
1659 | /** | 1766 | /** |
1660 | * prepare_task_switch - prepare to switch tasks | 1767 | * prepare_task_switch - prepare to switch tasks |
1661 | * @rq: the runqueue preparing to switch | 1768 | * @rq: the runqueue preparing to switch |
1769 | * @prev: the current task that is being switched out | ||
1662 | * @next: the task we are going to switch to. | 1770 | * @next: the task we are going to switch to. |
1663 | * | 1771 | * |
1664 | * This is called with the rq lock held and interrupts off. It must | 1772 | * This is called with the rq lock held and interrupts off. It must |
@@ -1668,8 +1776,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1668 | * prepare_task_switch sets up locking and calls architecture specific | 1776 | * prepare_task_switch sets up locking and calls architecture specific |
1669 | * hooks. | 1777 | * hooks. |
1670 | */ | 1778 | */ |
1671 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) | 1779 | static inline void |
1780 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | ||
1781 | struct task_struct *next) | ||
1672 | { | 1782 | { |
1783 | fire_sched_out_preempt_notifiers(prev, next); | ||
1673 | prepare_lock_switch(rq, next); | 1784 | prepare_lock_switch(rq, next); |
1674 | prepare_arch_switch(next); | 1785 | prepare_arch_switch(next); |
1675 | } | 1786 | } |
@@ -1711,6 +1822,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1711 | prev_state = prev->state; | 1822 | prev_state = prev->state; |
1712 | finish_arch_switch(prev); | 1823 | finish_arch_switch(prev); |
1713 | finish_lock_switch(rq, prev); | 1824 | finish_lock_switch(rq, prev); |
1825 | fire_sched_in_preempt_notifiers(current); | ||
1714 | if (mm) | 1826 | if (mm) |
1715 | mmdrop(mm); | 1827 | mmdrop(mm); |
1716 | if (unlikely(prev_state == TASK_DEAD)) { | 1828 | if (unlikely(prev_state == TASK_DEAD)) { |
@@ -1751,7 +1863,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1751 | { | 1863 | { |
1752 | struct mm_struct *mm, *oldmm; | 1864 | struct mm_struct *mm, *oldmm; |
1753 | 1865 | ||
1754 | prepare_task_switch(rq, next); | 1866 | prepare_task_switch(rq, prev, next); |
1755 | mm = next->mm; | 1867 | mm = next->mm; |
1756 | oldmm = prev->active_mm; | 1868 | oldmm = prev->active_mm; |
1757 | /* | 1869 | /* |
@@ -1874,7 +1986,6 @@ static void update_cpu_load(struct rq *this_rq) | |||
1874 | unsigned long total_load = this_rq->ls.load.weight; | 1986 | unsigned long total_load = this_rq->ls.load.weight; |
1875 | unsigned long this_load = total_load; | 1987 | unsigned long this_load = total_load; |
1876 | struct load_stat *ls = &this_rq->ls; | 1988 | struct load_stat *ls = &this_rq->ls; |
1877 | u64 now = __rq_clock(this_rq); | ||
1878 | int i, scale; | 1989 | int i, scale; |
1879 | 1990 | ||
1880 | this_rq->nr_load_updates++; | 1991 | this_rq->nr_load_updates++; |
@@ -1882,7 +1993,7 @@ static void update_cpu_load(struct rq *this_rq) | |||
1882 | goto do_avg; | 1993 | goto do_avg; |
1883 | 1994 | ||
1884 | /* Update delta_fair/delta_exec fields first */ | 1995 | /* Update delta_fair/delta_exec fields first */ |
1885 | update_curr_load(this_rq, now); | 1996 | update_curr_load(this_rq); |
1886 | 1997 | ||
1887 | fair_delta64 = ls->delta_fair + 1; | 1998 | fair_delta64 = ls->delta_fair + 1; |
1888 | ls->delta_fair = 0; | 1999 | ls->delta_fair = 0; |
@@ -1890,8 +2001,8 @@ static void update_cpu_load(struct rq *this_rq) | |||
1890 | exec_delta64 = ls->delta_exec + 1; | 2001 | exec_delta64 = ls->delta_exec + 1; |
1891 | ls->delta_exec = 0; | 2002 | ls->delta_exec = 0; |
1892 | 2003 | ||
1893 | sample_interval64 = now - ls->load_update_last; | 2004 | sample_interval64 = this_rq->clock - ls->load_update_last; |
1894 | ls->load_update_last = now; | 2005 | ls->load_update_last = this_rq->clock; |
1895 | 2006 | ||
1896 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | 2007 | if ((s64)sample_interval64 < (s64)TICK_NSEC) |
1897 | sample_interval64 = TICK_NSEC; | 2008 | sample_interval64 = TICK_NSEC; |
@@ -1946,6 +2057,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
1946 | spin_lock(&rq1->lock); | 2057 | spin_lock(&rq1->lock); |
1947 | } | 2058 | } |
1948 | } | 2059 | } |
2060 | update_rq_clock(rq1); | ||
2061 | update_rq_clock(rq2); | ||
1949 | } | 2062 | } |
1950 | 2063 | ||
1951 | /* | 2064 | /* |
@@ -2073,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2073 | if (task_running(rq, p)) | 2186 | if (task_running(rq, p)) |
2074 | return 0; | 2187 | return 0; |
2075 | 2188 | ||
2076 | /* | ||
2077 | * Aggressive migration if too many balance attempts have failed: | ||
2078 | */ | ||
2079 | if (sd->nr_balance_failed > sd->cache_nice_tries) | ||
2080 | return 1; | ||
2081 | |||
2082 | return 1; | 2189 | return 1; |
2083 | } | 2190 | } |
2084 | 2191 | ||
@@ -2086,8 +2193,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2086 | unsigned long max_nr_move, unsigned long max_load_move, | 2193 | unsigned long max_nr_move, unsigned long max_load_move, |
2087 | struct sched_domain *sd, enum cpu_idle_type idle, | 2194 | struct sched_domain *sd, enum cpu_idle_type idle, |
2088 | int *all_pinned, unsigned long *load_moved, | 2195 | int *all_pinned, unsigned long *load_moved, |
2089 | int this_best_prio, int best_prio, int best_prio_seen, | 2196 | int *this_best_prio, struct rq_iterator *iterator) |
2090 | struct rq_iterator *iterator) | ||
2091 | { | 2197 | { |
2092 | int pulled = 0, pinned = 0, skip_for_load; | 2198 | int pulled = 0, pinned = 0, skip_for_load; |
2093 | struct task_struct *p; | 2199 | struct task_struct *p; |
@@ -2112,12 +2218,8 @@ next: | |||
2112 | */ | 2218 | */ |
2113 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | 2219 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
2114 | SCHED_LOAD_SCALE_FUZZ; | 2220 | SCHED_LOAD_SCALE_FUZZ; |
2115 | if (skip_for_load && p->prio < this_best_prio) | 2221 | if ((skip_for_load && p->prio >= *this_best_prio) || |
2116 | skip_for_load = !best_prio_seen && p->prio == best_prio; | ||
2117 | if (skip_for_load || | ||
2118 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2222 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2119 | |||
2120 | best_prio_seen |= p->prio == best_prio; | ||
2121 | p = iterator->next(iterator->arg); | 2223 | p = iterator->next(iterator->arg); |
2122 | goto next; | 2224 | goto next; |
2123 | } | 2225 | } |
@@ -2131,8 +2233,8 @@ next: | |||
2131 | * and the prescribed amount of weighted load. | 2233 | * and the prescribed amount of weighted load. |
2132 | */ | 2234 | */ |
2133 | if (pulled < max_nr_move && rem_load_move > 0) { | 2235 | if (pulled < max_nr_move && rem_load_move > 0) { |
2134 | if (p->prio < this_best_prio) | 2236 | if (p->prio < *this_best_prio) |
2135 | this_best_prio = p->prio; | 2237 | *this_best_prio = p->prio; |
2136 | p = iterator->next(iterator->arg); | 2238 | p = iterator->next(iterator->arg); |
2137 | goto next; | 2239 | goto next; |
2138 | } | 2240 | } |
@@ -2151,32 +2253,52 @@ out: | |||
2151 | } | 2253 | } |
2152 | 2254 | ||
2153 | /* | 2255 | /* |
2154 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | 2256 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
2155 | * load from busiest to this_rq, as part of a balancing operation within | 2257 | * this_rq, as part of a balancing operation within domain "sd". |
2156 | * "domain". Returns the number of tasks moved. | 2258 | * Returns 1 if successful and 0 otherwise. |
2157 | * | 2259 | * |
2158 | * Called with both runqueues locked. | 2260 | * Called with both runqueues locked. |
2159 | */ | 2261 | */ |
2160 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2262 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2161 | unsigned long max_nr_move, unsigned long max_load_move, | 2263 | unsigned long max_load_move, |
2162 | struct sched_domain *sd, enum cpu_idle_type idle, | 2264 | struct sched_domain *sd, enum cpu_idle_type idle, |
2163 | int *all_pinned) | 2265 | int *all_pinned) |
2164 | { | 2266 | { |
2165 | struct sched_class *class = sched_class_highest; | 2267 | struct sched_class *class = sched_class_highest; |
2166 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | 2268 | unsigned long total_load_moved = 0; |
2167 | long rem_load_move = max_load_move; | 2269 | int this_best_prio = this_rq->curr->prio; |
2168 | 2270 | ||
2169 | do { | 2271 | do { |
2170 | nr_moved = class->load_balance(this_rq, this_cpu, busiest, | 2272 | total_load_moved += |
2171 | max_nr_move, (unsigned long)rem_load_move, | 2273 | class->load_balance(this_rq, this_cpu, busiest, |
2172 | sd, idle, all_pinned, &load_moved); | 2274 | ULONG_MAX, max_load_move - total_load_moved, |
2173 | total_nr_moved += nr_moved; | 2275 | sd, idle, all_pinned, &this_best_prio); |
2174 | max_nr_move -= nr_moved; | ||
2175 | rem_load_move -= load_moved; | ||
2176 | class = class->next; | 2276 | class = class->next; |
2177 | } while (class && max_nr_move && rem_load_move > 0); | 2277 | } while (class && max_load_move > total_load_moved); |
2278 | |||
2279 | return total_load_moved > 0; | ||
2280 | } | ||
2281 | |||
2282 | /* | ||
2283 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
2284 | * part of active balancing operations within "domain". | ||
2285 | * Returns 1 if successful and 0 otherwise. | ||
2286 | * | ||
2287 | * Called with both runqueues locked. | ||
2288 | */ | ||
2289 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2290 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
2291 | { | ||
2292 | struct sched_class *class; | ||
2293 | int this_best_prio = MAX_PRIO; | ||
2294 | |||
2295 | for (class = sched_class_highest; class; class = class->next) | ||
2296 | if (class->load_balance(this_rq, this_cpu, busiest, | ||
2297 | 1, ULONG_MAX, sd, idle, NULL, | ||
2298 | &this_best_prio)) | ||
2299 | return 1; | ||
2178 | 2300 | ||
2179 | return total_nr_moved; | 2301 | return 0; |
2180 | } | 2302 | } |
2181 | 2303 | ||
2182 | /* | 2304 | /* |
@@ -2235,7 +2357,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2235 | 2357 | ||
2236 | rq = cpu_rq(i); | 2358 | rq = cpu_rq(i); |
2237 | 2359 | ||
2238 | if (*sd_idle && !idle_cpu(i)) | 2360 | if (*sd_idle && rq->nr_running) |
2239 | *sd_idle = 0; | 2361 | *sd_idle = 0; |
2240 | 2362 | ||
2241 | /* Bias balancing toward cpus of our domain */ | 2363 | /* Bias balancing toward cpus of our domain */ |
@@ -2257,9 +2379,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2257 | /* | 2379 | /* |
2258 | * First idle cpu or the first cpu(busiest) in this sched group | 2380 | * First idle cpu or the first cpu(busiest) in this sched group |
2259 | * is eligible for doing load balancing at this and above | 2381 | * is eligible for doing load balancing at this and above |
2260 | * domains. | 2382 | * domains. In the newly idle case, we will allow all the cpus
2383 | * to do the newly idle load balance. | ||
2261 | */ | 2384 | */ |
2262 | if (local_group && balance_cpu != this_cpu && balance) { | 2385 | if (idle != CPU_NEWLY_IDLE && local_group && |
2386 | balance_cpu != this_cpu && balance) { | ||
2263 | *balance = 0; | 2387 | *balance = 0; |
2264 | goto ret; | 2388 | goto ret; |
2265 | } | 2389 | } |
@@ -2393,7 +2517,7 @@ group_next: | |||
2393 | * a think about bumping its value to force at least one task to be | 2517 | * a think about bumping its value to force at least one task to be |
2394 | * moved | 2518 | * moved |
2395 | */ | 2519 | */ |
2396 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { | 2520 | if (*imbalance < busiest_load_per_task) { |
2397 | unsigned long tmp, pwr_now, pwr_move; | 2521 | unsigned long tmp, pwr_now, pwr_move; |
2398 | unsigned int imbn; | 2522 | unsigned int imbn; |
2399 | 2523 | ||
@@ -2445,10 +2569,8 @@ small_imbalance: | |||
2445 | pwr_move /= SCHED_LOAD_SCALE; | 2569 | pwr_move /= SCHED_LOAD_SCALE; |
2446 | 2570 | ||
2447 | /* Move if we gain throughput */ | 2571 | /* Move if we gain throughput */ |
2448 | if (pwr_move <= pwr_now) | 2572 | if (pwr_move > pwr_now) |
2449 | goto out_balanced; | 2573 | *imbalance = busiest_load_per_task; |
2450 | |||
2451 | *imbalance = busiest_load_per_task; | ||
2452 | } | 2574 | } |
2453 | 2575 | ||
2454 | return busiest; | 2576 | return busiest; |
@@ -2506,11 +2628,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2506 | */ | 2628 | */ |
2507 | #define MAX_PINNED_INTERVAL 512 | 2629 | #define MAX_PINNED_INTERVAL 512 |
2508 | 2630 | ||
2509 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
2510 | { | ||
2511 | return n > 0 ? n - 1 : 0; | ||
2512 | } | ||
2513 | |||
2514 | /* | 2631 | /* |
2515 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2632 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2516 | * tasks if there is an imbalance. | 2633 | * tasks if there is an imbalance. |
@@ -2519,7 +2636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2519 | struct sched_domain *sd, enum cpu_idle_type idle, | 2636 | struct sched_domain *sd, enum cpu_idle_type idle, |
2520 | int *balance) | 2637 | int *balance) |
2521 | { | 2638 | { |
2522 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2639 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2523 | struct sched_group *group; | 2640 | struct sched_group *group; |
2524 | unsigned long imbalance; | 2641 | unsigned long imbalance; |
2525 | struct rq *busiest; | 2642 | struct rq *busiest; |
@@ -2560,18 +2677,17 @@ redo: | |||
2560 | 2677 | ||
2561 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2678 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
2562 | 2679 | ||
2563 | nr_moved = 0; | 2680 | ld_moved = 0; |
2564 | if (busiest->nr_running > 1) { | 2681 | if (busiest->nr_running > 1) { |
2565 | /* | 2682 | /* |
2566 | * Attempt to move tasks. If find_busiest_group has found | 2683 | * Attempt to move tasks. If find_busiest_group has found |
2567 | * an imbalance but busiest->nr_running <= 1, the group is | 2684 | * an imbalance but busiest->nr_running <= 1, the group is |
2568 | * still unbalanced. nr_moved simply stays zero, so it is | 2685 | * still unbalanced. ld_moved simply stays zero, so it is |
2569 | * correctly treated as an imbalance. | 2686 | * correctly treated as an imbalance. |
2570 | */ | 2687 | */ |
2571 | local_irq_save(flags); | 2688 | local_irq_save(flags); |
2572 | double_rq_lock(this_rq, busiest); | 2689 | double_rq_lock(this_rq, busiest); |
2573 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2690 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
2574 | minus_1_or_zero(busiest->nr_running), | ||
2575 | imbalance, sd, idle, &all_pinned); | 2691 | imbalance, sd, idle, &all_pinned); |
2576 | double_rq_unlock(this_rq, busiest); | 2692 | double_rq_unlock(this_rq, busiest); |
2577 | local_irq_restore(flags); | 2693 | local_irq_restore(flags); |
@@ -2579,7 +2695,7 @@ redo: | |||
2579 | /* | 2695 | /* |
2580 | * some other cpu did the load balance for us. | 2696 | * some other cpu did the load balance for us. |
2581 | */ | 2697 | */ |
2582 | if (nr_moved && this_cpu != smp_processor_id()) | 2698 | if (ld_moved && this_cpu != smp_processor_id()) |
2583 | resched_cpu(this_cpu); | 2699 | resched_cpu(this_cpu); |
2584 | 2700 | ||
2585 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2701 | /* All tasks on this runqueue were pinned by CPU affinity */ |
@@ -2591,7 +2707,7 @@ redo: | |||
2591 | } | 2707 | } |
2592 | } | 2708 | } |
2593 | 2709 | ||
2594 | if (!nr_moved) { | 2710 | if (!ld_moved) { |
2595 | schedstat_inc(sd, lb_failed[idle]); | 2711 | schedstat_inc(sd, lb_failed[idle]); |
2596 | sd->nr_balance_failed++; | 2712 | sd->nr_balance_failed++; |
2597 | 2713 | ||
@@ -2640,10 +2756,10 @@ redo: | |||
2640 | sd->balance_interval *= 2; | 2756 | sd->balance_interval *= 2; |
2641 | } | 2757 | } |
2642 | 2758 | ||
2643 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2759 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2644 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2760 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2645 | return -1; | 2761 | return -1; |
2646 | return nr_moved; | 2762 | return ld_moved; |
2647 | 2763 | ||
2648 | out_balanced: | 2764 | out_balanced: |
2649 | schedstat_inc(sd, lb_balanced[idle]); | 2765 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -2675,8 +2791,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2675 | struct sched_group *group; | 2791 | struct sched_group *group; |
2676 | struct rq *busiest = NULL; | 2792 | struct rq *busiest = NULL; |
2677 | unsigned long imbalance; | 2793 | unsigned long imbalance; |
2678 | int nr_moved = 0; | 2794 | int ld_moved = 0; |
2679 | int sd_idle = 0; | 2795 | int sd_idle = 0; |
2796 | int all_pinned = 0; | ||
2680 | cpumask_t cpus = CPU_MASK_ALL; | 2797 | cpumask_t cpus = CPU_MASK_ALL; |
2681 | 2798 | ||
2682 | /* | 2799 | /* |
@@ -2709,23 +2826,25 @@ redo: | |||
2709 | 2826 | ||
2710 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | 2827 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); |
2711 | 2828 | ||
2712 | nr_moved = 0; | 2829 | ld_moved = 0; |
2713 | if (busiest->nr_running > 1) { | 2830 | if (busiest->nr_running > 1) { |
2714 | /* Attempt to move tasks */ | 2831 | /* Attempt to move tasks */ |
2715 | double_lock_balance(this_rq, busiest); | 2832 | double_lock_balance(this_rq, busiest); |
2716 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2833 | /* this_rq->clock is already updated */ |
2717 | minus_1_or_zero(busiest->nr_running), | 2834 | update_rq_clock(busiest); |
2718 | imbalance, sd, CPU_NEWLY_IDLE, NULL); | 2835 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
2836 | imbalance, sd, CPU_NEWLY_IDLE, | ||
2837 | &all_pinned); | ||
2719 | spin_unlock(&busiest->lock); | 2838 | spin_unlock(&busiest->lock); |
2720 | 2839 | ||
2721 | if (!nr_moved) { | 2840 | if (unlikely(all_pinned)) { |
2722 | cpu_clear(cpu_of(busiest), cpus); | 2841 | cpu_clear(cpu_of(busiest), cpus); |
2723 | if (!cpus_empty(cpus)) | 2842 | if (!cpus_empty(cpus)) |
2724 | goto redo; | 2843 | goto redo; |
2725 | } | 2844 | } |
2726 | } | 2845 | } |
2727 | 2846 | ||
2728 | if (!nr_moved) { | 2847 | if (!ld_moved) { |
2729 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | 2848 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); |
2730 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2849 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2731 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2850 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
@@ -2733,7 +2852,7 @@ redo: | |||
2733 | } else | 2852 | } else |
2734 | sd->nr_balance_failed = 0; | 2853 | sd->nr_balance_failed = 0; |
2735 | 2854 | ||
2736 | return nr_moved; | 2855 | return ld_moved; |
2737 | 2856 | ||
2738 | out_balanced: | 2857 | out_balanced: |
2739 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | 2858 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); |
@@ -2810,6 +2929,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2810 | 2929 | ||
2811 | /* move a task from busiest_rq to target_rq */ | 2930 | /* move a task from busiest_rq to target_rq */ |
2812 | double_lock_balance(busiest_rq, target_rq); | 2931 | double_lock_balance(busiest_rq, target_rq); |
2932 | update_rq_clock(busiest_rq); | ||
2933 | update_rq_clock(target_rq); | ||
2813 | 2934 | ||
2814 | /* Search for an sd spanning us and the target CPU. */ | 2935 | /* Search for an sd spanning us and the target CPU. */ |
2815 | for_each_domain(target_cpu, sd) { | 2936 | for_each_domain(target_cpu, sd) { |
@@ -2821,9 +2942,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2821 | if (likely(sd)) { | 2942 | if (likely(sd)) { |
2822 | schedstat_inc(sd, alb_cnt); | 2943 | schedstat_inc(sd, alb_cnt); |
2823 | 2944 | ||
2824 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, | 2945 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
2825 | RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, | 2946 | sd, CPU_IDLE)) |
2826 | NULL)) | ||
2827 | schedstat_inc(sd, alb_pushed); | 2947 | schedstat_inc(sd, alb_pushed); |
2828 | else | 2948 | else |
2829 | schedstat_inc(sd, alb_failed); | 2949 | schedstat_inc(sd, alb_failed); |
@@ -2921,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
2921 | struct sched_domain *sd; | 3041 | struct sched_domain *sd; |
2922 | /* Earliest time when we have to do rebalance again */ | 3042 | /* Earliest time when we have to do rebalance again */ |
2923 | unsigned long next_balance = jiffies + 60*HZ; | 3043 | unsigned long next_balance = jiffies + 60*HZ; |
3044 | int update_next_balance = 0; | ||
2924 | 3045 | ||
2925 | for_each_domain(cpu, sd) { | 3046 | for_each_domain(cpu, sd) { |
2926 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3047 | if (!(sd->flags & SD_LOAD_BALANCE)) |
@@ -2957,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
2957 | if (sd->flags & SD_SERIALIZE) | 3078 | if (sd->flags & SD_SERIALIZE) |
2958 | spin_unlock(&balancing); | 3079 | spin_unlock(&balancing); |
2959 | out: | 3080 | out: |
2960 | if (time_after(next_balance, sd->last_balance + interval)) | 3081 | if (time_after(next_balance, sd->last_balance + interval)) { |
2961 | next_balance = sd->last_balance + interval; | 3082 | next_balance = sd->last_balance + interval; |
3083 | update_next_balance = 1; | ||
3084 | } | ||
2962 | 3085 | ||
2963 | /* | 3086 | /* |
2964 | * Stop the load balance at this level. There is another | 3087 | * Stop the load balance at this level. There is another |
@@ -2968,7 +3091,14 @@ out: | |||
2968 | if (!balance) | 3091 | if (!balance) |
2969 | break; | 3092 | break; |
2970 | } | 3093 | } |
2971 | rq->next_balance = next_balance; | 3094 | |
3095 | /* | ||
3096 | * next_balance will be updated only when there is a need. | ||
3097 | * When the cpu is attached to null domain for ex, it will not be | ||
3098 | * updated. | ||
3099 | */ | ||
3100 | if (likely(update_next_balance)) | ||
3101 | rq->next_balance = next_balance; | ||
2972 | } | 3102 | } |
2973 | 3103 | ||
2974 | /* | 3104 | /* |
@@ -3007,7 +3137,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3007 | if (need_resched()) | 3137 | if (need_resched()) |
3008 | break; | 3138 | break; |
3009 | 3139 | ||
3010 | rebalance_domains(balance_cpu, SCHED_IDLE); | 3140 | rebalance_domains(balance_cpu, CPU_IDLE); |
3011 | 3141 | ||
3012 | rq = cpu_rq(balance_cpu); | 3142 | rq = cpu_rq(balance_cpu); |
3013 | if (time_after(this_rq->next_balance, rq->next_balance)) | 3143 | if (time_after(this_rq->next_balance, rq->next_balance)) |
@@ -3092,8 +3222,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3092 | unsigned long max_nr_move, unsigned long max_load_move, | 3222 | unsigned long max_nr_move, unsigned long max_load_move, |
3093 | struct sched_domain *sd, enum cpu_idle_type idle, | 3223 | struct sched_domain *sd, enum cpu_idle_type idle, |
3094 | int *all_pinned, unsigned long *load_moved, | 3224 | int *all_pinned, unsigned long *load_moved, |
3095 | int this_best_prio, int best_prio, int best_prio_seen, | 3225 | int *this_best_prio, struct rq_iterator *iterator) |
3096 | struct rq_iterator *iterator) | ||
3097 | { | 3226 | { |
3098 | *load_moved = 0; | 3227 | *load_moved = 0; |
3099 | 3228 | ||
@@ -3119,7 +3248,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3119 | rq = task_rq_lock(p, &flags); | 3248 | rq = task_rq_lock(p, &flags); |
3120 | ns = p->se.sum_exec_runtime; | 3249 | ns = p->se.sum_exec_runtime; |
3121 | if (rq->curr == p) { | 3250 | if (rq->curr == p) { |
3122 | delta_exec = rq_clock(rq) - p->se.exec_start; | 3251 | update_rq_clock(rq); |
3252 | delta_exec = rq->clock - p->se.exec_start; | ||
3123 | if ((s64)delta_exec > 0) | 3253 | if ((s64)delta_exec > 0) |
3124 | ns += delta_exec; | 3254 | ns += delta_exec; |
3125 | } | 3255 | } |
@@ -3213,11 +3343,19 @@ void scheduler_tick(void) | |||
3213 | int cpu = smp_processor_id(); | 3343 | int cpu = smp_processor_id(); |
3214 | struct rq *rq = cpu_rq(cpu); | 3344 | struct rq *rq = cpu_rq(cpu); |
3215 | struct task_struct *curr = rq->curr; | 3345 | struct task_struct *curr = rq->curr; |
3346 | u64 next_tick = rq->tick_timestamp + TICK_NSEC; | ||
3216 | 3347 | ||
3217 | spin_lock(&rq->lock); | 3348 | spin_lock(&rq->lock); |
3349 | __update_rq_clock(rq); | ||
3350 | /* | ||
3351 | * Let rq->clock advance by at least TICK_NSEC: | ||
3352 | */ | ||
3353 | if (unlikely(rq->clock < next_tick)) | ||
3354 | rq->clock = next_tick; | ||
3355 | rq->tick_timestamp = rq->clock; | ||
3356 | update_cpu_load(rq); | ||
3218 | if (curr != rq->idle) /* FIXME: needed? */ | 3357 | if (curr != rq->idle) /* FIXME: needed? */ |
3219 | curr->sched_class->task_tick(rq, curr); | 3358 | curr->sched_class->task_tick(rq, curr); |
3220 | update_cpu_load(rq); | ||
3221 | spin_unlock(&rq->lock); | 3359 | spin_unlock(&rq->lock); |
3222 | 3360 | ||
3223 | #ifdef CONFIG_SMP | 3361 | #ifdef CONFIG_SMP |
@@ -3299,7 +3437,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3299 | * Pick up the highest-prio task: | 3437 | * Pick up the highest-prio task: |
3300 | */ | 3438 | */ |
3301 | static inline struct task_struct * | 3439 | static inline struct task_struct * |
3302 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) | 3440 | pick_next_task(struct rq *rq, struct task_struct *prev) |
3303 | { | 3441 | { |
3304 | struct sched_class *class; | 3442 | struct sched_class *class; |
3305 | struct task_struct *p; | 3443 | struct task_struct *p; |
@@ -3309,14 +3447,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) | |||
3309 | * the fair class we can call that function directly: | 3447 | * the fair class we can call that function directly: |
3310 | */ | 3448 | */ |
3311 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 3449 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
3312 | p = fair_sched_class.pick_next_task(rq, now); | 3450 | p = fair_sched_class.pick_next_task(rq); |
3313 | if (likely(p)) | 3451 | if (likely(p)) |
3314 | return p; | 3452 | return p; |
3315 | } | 3453 | } |
3316 | 3454 | ||
3317 | class = sched_class_highest; | 3455 | class = sched_class_highest; |
3318 | for ( ; ; ) { | 3456 | for ( ; ; ) { |
3319 | p = class->pick_next_task(rq, now); | 3457 | p = class->pick_next_task(rq); |
3320 | if (p) | 3458 | if (p) |
3321 | return p; | 3459 | return p; |
3322 | /* | 3460 | /* |
@@ -3335,7 +3473,6 @@ asmlinkage void __sched schedule(void) | |||
3335 | struct task_struct *prev, *next; | 3473 | struct task_struct *prev, *next; |
3336 | long *switch_count; | 3474 | long *switch_count; |
3337 | struct rq *rq; | 3475 | struct rq *rq; |
3338 | u64 now; | ||
3339 | int cpu; | 3476 | int cpu; |
3340 | 3477 | ||
3341 | need_resched: | 3478 | need_resched: |
@@ -3353,6 +3490,7 @@ need_resched_nonpreemptible: | |||
3353 | 3490 | ||
3354 | spin_lock_irq(&rq->lock); | 3491 | spin_lock_irq(&rq->lock); |
3355 | clear_tsk_need_resched(prev); | 3492 | clear_tsk_need_resched(prev); |
3493 | __update_rq_clock(rq); | ||
3356 | 3494 | ||
3357 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3495 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3358 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3496 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
@@ -3367,9 +3505,8 @@ need_resched_nonpreemptible: | |||
3367 | if (unlikely(!rq->nr_running)) | 3505 | if (unlikely(!rq->nr_running)) |
3368 | idle_balance(cpu, rq); | 3506 | idle_balance(cpu, rq); |
3369 | 3507 | ||
3370 | now = __rq_clock(rq); | 3508 | prev->sched_class->put_prev_task(rq, prev); |
3371 | prev->sched_class->put_prev_task(rq, prev, now); | 3509 | next = pick_next_task(rq, prev); |
3372 | next = pick_next_task(rq, prev, now); | ||
3373 | 3510 | ||
3374 | sched_info_switch(prev, next); | 3511 | sched_info_switch(prev, next); |
3375 | 3512 | ||
@@ -3812,17 +3949,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3812 | unsigned long flags; | 3949 | unsigned long flags; |
3813 | int oldprio, on_rq; | 3950 | int oldprio, on_rq; |
3814 | struct rq *rq; | 3951 | struct rq *rq; |
3815 | u64 now; | ||
3816 | 3952 | ||
3817 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3953 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
3818 | 3954 | ||
3819 | rq = task_rq_lock(p, &flags); | 3955 | rq = task_rq_lock(p, &flags); |
3820 | now = rq_clock(rq); | 3956 | update_rq_clock(rq); |
3821 | 3957 | ||
3822 | oldprio = p->prio; | 3958 | oldprio = p->prio; |
3823 | on_rq = p->se.on_rq; | 3959 | on_rq = p->se.on_rq; |
3824 | if (on_rq) | 3960 | if (on_rq) |
3825 | dequeue_task(rq, p, 0, now); | 3961 | dequeue_task(rq, p, 0); |
3826 | 3962 | ||
3827 | if (rt_prio(prio)) | 3963 | if (rt_prio(prio)) |
3828 | p->sched_class = &rt_sched_class; | 3964 | p->sched_class = &rt_sched_class; |
@@ -3832,7 +3968,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3832 | p->prio = prio; | 3968 | p->prio = prio; |
3833 | 3969 | ||
3834 | if (on_rq) { | 3970 | if (on_rq) { |
3835 | enqueue_task(rq, p, 0, now); | 3971 | enqueue_task(rq, p, 0); |
3836 | /* | 3972 | /* |
3837 | * Reschedule if we are currently running on this runqueue and | 3973 | * Reschedule if we are currently running on this runqueue and |
3838 | * our priority decreased, or if we are not currently running on | 3974 | * our priority decreased, or if we are not currently running on |
@@ -3855,7 +3991,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3855 | int old_prio, delta, on_rq; | 3991 | int old_prio, delta, on_rq; |
3856 | unsigned long flags; | 3992 | unsigned long flags; |
3857 | struct rq *rq; | 3993 | struct rq *rq; |
3858 | u64 now; | ||
3859 | 3994 | ||
3860 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3995 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3861 | return; | 3996 | return; |
@@ -3864,7 +3999,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3864 | * the task might be in the middle of scheduling on another CPU. | 3999 | * the task might be in the middle of scheduling on another CPU. |
3865 | */ | 4000 | */ |
3866 | rq = task_rq_lock(p, &flags); | 4001 | rq = task_rq_lock(p, &flags); |
3867 | now = rq_clock(rq); | 4002 | update_rq_clock(rq); |
3868 | /* | 4003 | /* |
3869 | * The RT priorities are set via sched_setscheduler(), but we still | 4004 | * The RT priorities are set via sched_setscheduler(), but we still |
3870 | * allow the 'normal' nice value to be set - but as expected | 4005 | * allow the 'normal' nice value to be set - but as expected |
@@ -3877,8 +4012,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3877 | } | 4012 | } |
3878 | on_rq = p->se.on_rq; | 4013 | on_rq = p->se.on_rq; |
3879 | if (on_rq) { | 4014 | if (on_rq) { |
3880 | dequeue_task(rq, p, 0, now); | 4015 | dequeue_task(rq, p, 0); |
3881 | dec_load(rq, p, now); | 4016 | dec_load(rq, p); |
3882 | } | 4017 | } |
3883 | 4018 | ||
3884 | p->static_prio = NICE_TO_PRIO(nice); | 4019 | p->static_prio = NICE_TO_PRIO(nice); |
@@ -3888,8 +4023,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3888 | delta = p->prio - old_prio; | 4023 | delta = p->prio - old_prio; |
3889 | 4024 | ||
3890 | if (on_rq) { | 4025 | if (on_rq) { |
3891 | enqueue_task(rq, p, 0, now); | 4026 | enqueue_task(rq, p, 0); |
3892 | inc_load(rq, p, now); | 4027 | inc_load(rq, p); |
3893 | /* | 4028 | /* |
3894 | * If the task increased its priority or is running and | 4029 | * If the task increased its priority or is running and |
3895 | * lowered its priority, then reschedule its CPU: | 4030 | * lowered its priority, then reschedule its CPU: |
@@ -4125,6 +4260,7 @@ recheck: | |||
4125 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4260 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4126 | goto recheck; | 4261 | goto recheck; |
4127 | } | 4262 | } |
4263 | update_rq_clock(rq); | ||
4128 | on_rq = p->se.on_rq; | 4264 | on_rq = p->se.on_rq; |
4129 | if (on_rq) | 4265 | if (on_rq) |
4130 | deactivate_task(rq, p, 0); | 4266 | deactivate_task(rq, p, 0); |
@@ -4380,10 +4516,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4380 | out_unlock: | 4516 | out_unlock: |
4381 | read_unlock(&tasklist_lock); | 4517 | read_unlock(&tasklist_lock); |
4382 | mutex_unlock(&sched_hotcpu_mutex); | 4518 | mutex_unlock(&sched_hotcpu_mutex); |
4383 | if (retval) | ||
4384 | return retval; | ||
4385 | 4519 | ||
4386 | return 0; | 4520 | return retval; |
4387 | } | 4521 | } |
4388 | 4522 | ||
4389 | /** | 4523 | /** |
@@ -4422,10 +4556,7 @@ asmlinkage long sys_sched_yield(void) | |||
4422 | struct rq *rq = this_rq_lock(); | 4556 | struct rq *rq = this_rq_lock(); |
4423 | 4557 | ||
4424 | schedstat_inc(rq, yld_cnt); | 4558 | schedstat_inc(rq, yld_cnt); |
4425 | if (unlikely(rq->nr_running == 1)) | 4559 | current->sched_class->yield_task(rq, current); |
4426 | schedstat_inc(rq, yld_act_empty); | ||
4427 | else | ||
4428 | current->sched_class->yield_task(rq, current); | ||
4429 | 4560 | ||
4430 | /* | 4561 | /* |
4431 | * Since we are going to call schedule() anyway, there's | 4562 | * Since we are going to call schedule() anyway, there's |
@@ -4781,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4781 | static inline void sched_init_granularity(void) | 4912 | static inline void sched_init_granularity(void) |
4782 | { | 4913 | { |
4783 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 4914 | unsigned int factor = 1 + ilog2(num_online_cpus()); |
4784 | const unsigned long gran_limit = 100000000; | 4915 | const unsigned long limit = 100000000; |
4916 | |||
4917 | sysctl_sched_min_granularity *= factor; | ||
4918 | if (sysctl_sched_min_granularity > limit) | ||
4919 | sysctl_sched_min_granularity = limit; | ||
4785 | 4920 | ||
4786 | sysctl_sched_granularity *= factor; | 4921 | sysctl_sched_latency *= factor; |
4787 | if (sysctl_sched_granularity > gran_limit) | 4922 | if (sysctl_sched_latency > limit) |
4788 | sysctl_sched_granularity = gran_limit; | 4923 | sysctl_sched_latency = limit; |
4789 | 4924 | ||
4790 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; | 4925 | sysctl_sched_runtime_limit = sysctl_sched_latency; |
4791 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | 4926 | sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; |
4792 | } | 4927 | } |
4793 | 4928 | ||
4794 | #ifdef CONFIG_SMP | 4929 | #ifdef CONFIG_SMP |
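For reference, the boot-time scaling that sched_init_granularity() now performs can be recomputed in plain userspace C with the defaults visible in this diff (20 ms latency, 2 ms minimum granularity, 100 ms cap); ilog2_u() below is just a stand-in for the kernel's ilog2():

    #include <stdio.h>

    static unsigned int ilog2_u(unsigned int n)
    {
            unsigned int r = 0;

            while (n >>= 1)
                    r++;
            return r;
    }

    int main(void)
    {
            const unsigned long limit = 100000000UL;        /* 100 ms cap, in ns */
            unsigned int cpus;

            for (cpus = 1; cpus <= 16; cpus *= 2) {
                    unsigned int factor = 1 + ilog2_u(cpus);
                    unsigned long min_gran = 2000000UL * factor;    /* 2 ms default */
                    unsigned long latency  = 20000000UL * factor;   /* 20 ms default */

                    if (min_gran > limit)
                            min_gran = limit;
                    if (latency > limit)
                            latency = limit;

                    printf("%2u CPUs: factor=%u latency=%lu ms min_gran=%lu ms wakeup_gran=%lu ms\n",
                           cpus, factor, latency / 1000000, min_gran / 1000000,
                           (min_gran / 2) / 1000000);
            }
            return 0;
    }

With these defaults an 8-way system ends up targeting 80 ms of scheduling latency with an 8 ms minimum granularity and a 4 ms wakeup granularity.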
@@ -4883,6 +5018,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4883 | on_rq = p->se.on_rq; | 5018 | on_rq = p->se.on_rq; |
4884 | if (on_rq) | 5019 | if (on_rq) |
4885 | deactivate_task(rq_src, p, 0); | 5020 | deactivate_task(rq_src, p, 0); |
5021 | |||
4886 | set_task_cpu(p, dest_cpu); | 5022 | set_task_cpu(p, dest_cpu); |
4887 | if (on_rq) { | 5023 | if (on_rq) { |
4888 | activate_task(rq_dest, p, 0); | 5024 | activate_task(rq_dest, p, 0); |
@@ -5115,14 +5251,137 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
5115 | for ( ; ; ) { | 5251 | for ( ; ; ) { |
5116 | if (!rq->nr_running) | 5252 | if (!rq->nr_running) |
5117 | break; | 5253 | break; |
5118 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); | 5254 | update_rq_clock(rq); |
5255 | next = pick_next_task(rq, rq->curr); | ||
5119 | if (!next) | 5256 | if (!next) |
5120 | break; | 5257 | break; |
5121 | migrate_dead(dead_cpu, next); | 5258 | migrate_dead(dead_cpu, next); |
5259 | |||
5122 | } | 5260 | } |
5123 | } | 5261 | } |
5124 | #endif /* CONFIG_HOTPLUG_CPU */ | 5262 | #endif /* CONFIG_HOTPLUG_CPU */ |
5125 | 5263 | ||
5264 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | ||
5265 | |||
5266 | static struct ctl_table sd_ctl_dir[] = { | ||
5267 | { | ||
5268 | .procname = "sched_domain", | ||
5269 | .mode = 0555, | ||
5270 | }, | ||
5271 | {0,}, | ||
5272 | }; | ||
5273 | |||
5274 | static struct ctl_table sd_ctl_root[] = { | ||
5275 | { | ||
5276 | .ctl_name = CTL_KERN, | ||
5277 | .procname = "kernel", | ||
5278 | .mode = 0555, | ||
5279 | .child = sd_ctl_dir, | ||
5280 | }, | ||
5281 | {0,}, | ||
5282 | }; | ||
5283 | |||
5284 | static struct ctl_table *sd_alloc_ctl_entry(int n) | ||
5285 | { | ||
5286 | struct ctl_table *entry = | ||
5287 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | ||
5288 | |||
5289 | BUG_ON(!entry); | ||
5290 | memset(entry, 0, n * sizeof(struct ctl_table)); | ||
5291 | |||
5292 | return entry; | ||
5293 | } | ||
5294 | |||
5295 | static void | ||
5296 | set_table_entry(struct ctl_table *entry, | ||
5297 | const char *procname, void *data, int maxlen, | ||
5298 | mode_t mode, proc_handler *proc_handler) | ||
5299 | { | ||
5300 | entry->procname = procname; | ||
5301 | entry->data = data; | ||
5302 | entry->maxlen = maxlen; | ||
5303 | entry->mode = mode; | ||
5304 | entry->proc_handler = proc_handler; | ||
5305 | } | ||
5306 | |||
5307 | static struct ctl_table * | ||
5308 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | ||
5309 | { | ||
5310 | struct ctl_table *table = sd_alloc_ctl_entry(14); | ||
5311 | |||
5312 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | ||
5313 | sizeof(long), 0644, proc_doulongvec_minmax); | ||
5314 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | ||
5315 | sizeof(long), 0644, proc_doulongvec_minmax); | ||
5316 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | ||
5317 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5318 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | ||
5319 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5320 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | ||
5321 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5322 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | ||
5323 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5324 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | ||
5325 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5326 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
5327 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5328 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
5329 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5330 | set_table_entry(&table[10], "cache_nice_tries", | ||
5331 | &sd->cache_nice_tries, | ||
5332 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5333 | set_table_entry(&table[12], "flags", &sd->flags, | ||
5334 | sizeof(int), 0644, proc_dointvec_minmax); | ||
5335 | |||
5336 | return table; | ||
5337 | } | ||
5338 | |||
5339 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | ||
5340 | { | ||
5341 | struct ctl_table *entry, *table; | ||
5342 | struct sched_domain *sd; | ||
5343 | int domain_num = 0, i; | ||
5344 | char buf[32]; | ||
5345 | |||
5346 | for_each_domain(cpu, sd) | ||
5347 | domain_num++; | ||
5348 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | ||
5349 | |||
5350 | i = 0; | ||
5351 | for_each_domain(cpu, sd) { | ||
5352 | snprintf(buf, 32, "domain%d", i); | ||
5353 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
5354 | entry->mode = 0555; | ||
5355 | entry->child = sd_alloc_ctl_domain_table(sd); | ||
5356 | entry++; | ||
5357 | i++; | ||
5358 | } | ||
5359 | return table; | ||
5360 | } | ||
5361 | |||
5362 | static struct ctl_table_header *sd_sysctl_header; | ||
5363 | static void init_sched_domain_sysctl(void) | ||
5364 | { | ||
5365 | int i, cpu_num = num_online_cpus(); | ||
5366 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | ||
5367 | char buf[32]; | ||
5368 | |||
5369 | sd_ctl_dir[0].child = entry; | ||
5370 | |||
5371 | for (i = 0; i < cpu_num; i++, entry++) { | ||
5372 | snprintf(buf, 32, "cpu%d", i); | ||
5373 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
5374 | entry->mode = 0555; | ||
5375 | entry->child = sd_alloc_ctl_cpu_table(i); | ||
5376 | } | ||
5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | ||
5378 | } | ||
5379 | #else | ||
5380 | static void init_sched_domain_sysctl(void) | ||
5381 | { | ||
5382 | } | ||
5383 | #endif | ||
5384 | |||
5126 | /* | 5385 | /* |
5127 | * migration_call - callback that gets triggered when a CPU is added. | 5386 | * migration_call - callback that gets triggered when a CPU is added. |
5128 | * Here we can start up the necessary migration thread for the new CPU. | 5387 | * Here we can start up the necessary migration thread for the new CPU. |
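Taken together, the sd_alloc_ctl_*() helpers above register one directory per online CPU and one per scheduling domain under /proc/sys/kernel/, filled with the entries named in the set_table_entry() calls. The resulting layout looks roughly like this (illustrative; it only exists when both CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are set, and a second domain directory appears only on topologies with nested domains):

    /proc/sys/kernel/sched_domain/
        cpu0/
            domain0/
                min_interval  max_interval  busy_idx  idle_idx  newidle_idx
                wake_idx  forkexec_idx  busy_factor  imbalance_pct
                cache_nice_tries  flags
            domain1/
                ... (same entries)
        cpu1/
            ...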
@@ -5179,6 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5179 | rq->migration_thread = NULL; | 5438 | rq->migration_thread = NULL; |
5180 | /* Idle task back to normal (off runqueue, low prio) */ | 5439 | /* Idle task back to normal (off runqueue, low prio) */ |
5181 | rq = task_rq_lock(rq->idle, &flags); | 5440 | rq = task_rq_lock(rq->idle, &flags); |
5441 | update_rq_clock(rq); | ||
5182 | deactivate_task(rq, rq->idle, 0); | 5442 | deactivate_task(rq, rq->idle, 0); |
5183 | rq->idle->static_prio = MAX_PRIO; | 5443 | rq->idle->static_prio = MAX_PRIO; |
5184 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | 5444 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
@@ -6101,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6101 | } | 6361 | } |
6102 | 6362 | ||
6103 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6363 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6104 | int arch_reinit_sched_domains(void) | 6364 | static int arch_reinit_sched_domains(void) |
6105 | { | 6365 | { |
6106 | int err; | 6366 | int err; |
6107 | 6367 | ||
@@ -6130,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
6130 | return ret ? ret : count; | 6390 | return ret ? ret : count; |
6131 | } | 6391 | } |
6132 | 6392 | ||
6133 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6134 | { | ||
6135 | int err = 0; | ||
6136 | |||
6137 | #ifdef CONFIG_SCHED_SMT | ||
6138 | if (smt_capable()) | ||
6139 | err = sysfs_create_file(&cls->kset.kobj, | ||
6140 | &attr_sched_smt_power_savings.attr); | ||
6141 | #endif | ||
6142 | #ifdef CONFIG_SCHED_MC | ||
6143 | if (!err && mc_capable()) | ||
6144 | err = sysfs_create_file(&cls->kset.kobj, | ||
6145 | &attr_sched_mc_power_savings.attr); | ||
6146 | #endif | ||
6147 | return err; | ||
6148 | } | ||
6149 | #endif | ||
6150 | |||
6151 | #ifdef CONFIG_SCHED_MC | 6393 | #ifdef CONFIG_SCHED_MC |
6152 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | 6394 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) |
6153 | { | 6395 | { |
@@ -6158,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | |||
6158 | { | 6400 | { |
6159 | return sched_power_savings_store(buf, count, 0); | 6401 | return sched_power_savings_store(buf, count, 0); |
6160 | } | 6402 | } |
6161 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 6403 | static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, |
6162 | sched_mc_power_savings_store); | 6404 | sched_mc_power_savings_store); |
6163 | #endif | 6405 | #endif |
6164 | 6406 | ||
6165 | #ifdef CONFIG_SCHED_SMT | 6407 | #ifdef CONFIG_SCHED_SMT |
@@ -6172,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | |||
6172 | { | 6414 | { |
6173 | return sched_power_savings_store(buf, count, 1); | 6415 | return sched_power_savings_store(buf, count, 1); |
6174 | } | 6416 | } |
6175 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 6417 | static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, |
6176 | sched_smt_power_savings_store); | 6418 | sched_smt_power_savings_store); |
6419 | #endif | ||
6420 | |||
6421 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6422 | { | ||
6423 | int err = 0; | ||
6424 | |||
6425 | #ifdef CONFIG_SCHED_SMT | ||
6426 | if (smt_capable()) | ||
6427 | err = sysfs_create_file(&cls->kset.kobj, | ||
6428 | &attr_sched_smt_power_savings.attr); | ||
6429 | #endif | ||
6430 | #ifdef CONFIG_SCHED_MC | ||
6431 | if (!err && mc_capable()) | ||
6432 | err = sysfs_create_file(&cls->kset.kobj, | ||
6433 | &attr_sched_mc_power_savings.attr); | ||
6434 | #endif | ||
6435 | return err; | ||
6436 | } | ||
6177 | #endif | 6437 | #endif |
6178 | 6438 | ||
6179 | /* | 6439 | /* |
@@ -6228,6 +6488,8 @@ void __init sched_init_smp(void) | |||
6228 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6488 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6229 | hotcpu_notifier(update_sched_domains, 0); | 6489 | hotcpu_notifier(update_sched_domains, 0); |
6230 | 6490 | ||
6491 | init_sched_domain_sysctl(); | ||
6492 | |||
6231 | /* Move init over to a non-isolated CPU */ | 6493 | /* Move init over to a non-isolated CPU */ |
6232 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6494 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6233 | BUG(); | 6495 | BUG(); |
@@ -6314,6 +6576,10 @@ void __init sched_init(void) | |||
6314 | 6576 | ||
6315 | set_load_weight(&init_task); | 6577 | set_load_weight(&init_task); |
6316 | 6578 | ||
6579 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
6580 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | ||
6581 | #endif | ||
6582 | |||
6317 | #ifdef CONFIG_SMP | 6583 | #ifdef CONFIG_SMP |
6318 | nr_cpu_ids = highest_cpu + 1; | 6584 | nr_cpu_ids = highest_cpu + 1; |
6319 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 6585 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
@@ -6379,12 +6645,14 @@ void normalize_rt_tasks(void) | |||
6379 | do_each_thread(g, p) { | 6645 | do_each_thread(g, p) { |
6380 | p->se.fair_key = 0; | 6646 | p->se.fair_key = 0; |
6381 | p->se.wait_runtime = 0; | 6647 | p->se.wait_runtime = 0; |
6648 | p->se.exec_start = 0; | ||
6382 | p->se.wait_start_fair = 0; | 6649 | p->se.wait_start_fair = 0; |
6650 | p->se.sleep_start_fair = 0; | ||
6651 | #ifdef CONFIG_SCHEDSTATS | ||
6383 | p->se.wait_start = 0; | 6652 | p->se.wait_start = 0; |
6384 | p->se.exec_start = 0; | ||
6385 | p->se.sleep_start = 0; | 6653 | p->se.sleep_start = 0; |
6386 | p->se.sleep_start_fair = 0; | ||
6387 | p->se.block_start = 0; | 6654 | p->se.block_start = 0; |
6655 | #endif | ||
6388 | task_rq(p)->cfs.fair_clock = 0; | 6656 | task_rq(p)->cfs.fair_clock = 0; |
6389 | task_rq(p)->clock = 0; | 6657 | task_rq(p)->clock = 0; |
6390 | 6658 | ||
@@ -6408,12 +6676,13 @@ void normalize_rt_tasks(void) | |||
6408 | goto out_unlock; | 6676 | goto out_unlock; |
6409 | #endif | 6677 | #endif |
6410 | 6678 | ||
6679 | update_rq_clock(rq); | ||
6411 | on_rq = p->se.on_rq; | 6680 | on_rq = p->se.on_rq; |
6412 | if (on_rq) | 6681 | if (on_rq) |
6413 | deactivate_task(task_rq(p), p, 0); | 6682 | deactivate_task(rq, p, 0); |
6414 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6683 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
6415 | if (on_rq) { | 6684 | if (on_rq) { |
6416 | activate_task(task_rq(p), p, 0); | 6685 | activate_task(rq, p, 0); |
6417 | resched_task(rq->curr); | 6686 | resched_task(rq->curr); |
6418 | } | 6687 | } |
6419 | #ifdef CONFIG_SMP | 6688 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 29f2c21e7da2..c3ee38bd3426 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -29,29 +29,34 @@ | |||
29 | } while (0) | 29 | } while (0) |
30 | 30 | ||
31 | static void | 31 | static void |
32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) | 32 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
33 | { | 33 | { |
34 | if (rq->curr == p) | 34 | if (rq->curr == p) |
35 | SEQ_printf(m, "R"); | 35 | SEQ_printf(m, "R"); |
36 | else | 36 | else |
37 | SEQ_printf(m, " "); | 37 | SEQ_printf(m, " "); |
38 | 38 | ||
39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " | 39 | SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", |
40 | "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | ||
41 | p->comm, p->pid, | 40 | p->comm, p->pid, |
42 | (long long)p->se.fair_key, | 41 | (long long)p->se.fair_key, |
43 | (long long)(p->se.fair_key - rq->cfs.fair_clock), | 42 | (long long)(p->se.fair_key - rq->cfs.fair_clock), |
44 | (long long)p->se.wait_runtime, | 43 | (long long)p->se.wait_runtime, |
45 | (long long)(p->nvcsw + p->nivcsw), | 44 | (long long)(p->nvcsw + p->nivcsw), |
46 | p->prio, | 45 | p->prio); |
46 | #ifdef CONFIG_SCHEDSTATS | ||
47 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | ||
47 | (long long)p->se.sum_exec_runtime, | 48 | (long long)p->se.sum_exec_runtime, |
48 | (long long)p->se.sum_wait_runtime, | 49 | (long long)p->se.sum_wait_runtime, |
49 | (long long)p->se.sum_sleep_runtime, | 50 | (long long)p->se.sum_sleep_runtime, |
50 | (long long)p->se.wait_runtime_overruns, | 51 | (long long)p->se.wait_runtime_overruns, |
51 | (long long)p->se.wait_runtime_underruns); | 52 | (long long)p->se.wait_runtime_underruns); |
53 | #else | ||
54 | SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", | ||
55 | 0LL, 0LL, 0LL, 0LL, 0LL); | ||
56 | #endif | ||
52 | } | 57 | } |
53 | 58 | ||
54 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) | 59 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
55 | { | 60 | { |
56 | struct task_struct *g, *p; | 61 | struct task_struct *g, *p; |
57 | 62 | ||
@@ -72,7 +77,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) | |||
72 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 77 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) |
73 | continue; | 78 | continue; |
74 | 79 | ||
75 | print_task(m, rq, p, now); | 80 | print_task(m, rq, p); |
76 | } while_each_thread(g, p); | 81 | } while_each_thread(g, p); |
77 | 82 | ||
78 | read_unlock_irq(&tasklist_lock); | 83 | read_unlock_irq(&tasklist_lock); |
@@ -101,9 +106,9 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
101 | (long long)wait_runtime_rq_sum); | 106 | (long long)wait_runtime_rq_sum); |
102 | } | 107 | } |
103 | 108 | ||
104 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) | 109 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
105 | { | 110 | { |
106 | SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); | 111 | SEQ_printf(m, "\ncfs_rq\n"); |
107 | 112 | ||
108 | #define P(x) \ | 113 | #define P(x) \ |
109 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) | 114 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) |
@@ -119,7 +124,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) | |||
119 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); | 124 | print_cfs_rq_runtime_sum(m, cpu, cfs_rq); |
120 | } | 125 | } |
121 | 126 | ||
122 | static void print_cpu(struct seq_file *m, int cpu, u64 now) | 127 | static void print_cpu(struct seq_file *m, int cpu) |
123 | { | 128 | { |
124 | struct rq *rq = &per_cpu(runqueues, cpu); | 129 | struct rq *rq = &per_cpu(runqueues, cpu); |
125 | 130 | ||
@@ -149,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
149 | P(next_balance); | 154 | P(next_balance); |
150 | P(curr->pid); | 155 | P(curr->pid); |
151 | P(clock); | 156 | P(clock); |
157 | P(idle_clock); | ||
152 | P(prev_clock_raw); | 158 | P(prev_clock_raw); |
153 | P(clock_warps); | 159 | P(clock_warps); |
154 | P(clock_overflows); | 160 | P(clock_overflows); |
155 | P(clock_unstable_events); | 161 | P(clock_deep_idle_events); |
156 | P(clock_max_delta); | 162 | P(clock_max_delta); |
157 | P(cpu_load[0]); | 163 | P(cpu_load[0]); |
158 | P(cpu_load[1]); | 164 | P(cpu_load[1]); |
@@ -161,9 +167,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
161 | P(cpu_load[4]); | 167 | P(cpu_load[4]); |
162 | #undef P | 168 | #undef P |
163 | 169 | ||
164 | print_cfs_stats(m, cpu, now); | 170 | print_cfs_stats(m, cpu); |
165 | 171 | ||
166 | print_rq(m, rq, cpu, now); | 172 | print_rq(m, rq, cpu); |
167 | } | 173 | } |
168 | 174 | ||
169 | static int sched_debug_show(struct seq_file *m, void *v) | 175 | static int sched_debug_show(struct seq_file *m, void *v) |
@@ -171,7 +177,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
171 | u64 now = ktime_to_ns(ktime_get()); | 177 | u64 now = ktime_to_ns(ktime_get()); |
172 | int cpu; | 178 | int cpu; |
173 | 179 | ||
174 | SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", | 180 | SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", |
175 | init_utsname()->release, | 181 | init_utsname()->release, |
176 | (int)strcspn(init_utsname()->version, " "), | 182 | (int)strcspn(init_utsname()->version, " "), |
177 | init_utsname()->version); | 183 | init_utsname()->version); |
@@ -179,14 +185,14 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
179 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); | 185 | SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); |
180 | 186 | ||
181 | for_each_online_cpu(cpu) | 187 | for_each_online_cpu(cpu) |
182 | print_cpu(m, cpu, now); | 188 | print_cpu(m, cpu); |
183 | 189 | ||
184 | SEQ_printf(m, "\n"); | 190 | SEQ_printf(m, "\n"); |
185 | 191 | ||
186 | return 0; | 192 | return 0; |
187 | } | 193 | } |
188 | 194 | ||
189 | void sysrq_sched_debug_show(void) | 195 | static void sysrq_sched_debug_show(void) |
190 | { | 196 | { |
191 | sched_debug_show(NULL, NULL); | 197 | sched_debug_show(NULL, NULL); |
192 | } | 198 | } |
@@ -200,7 +206,7 @@ static struct file_operations sched_debug_fops = { | |||
200 | .open = sched_debug_open, | 206 | .open = sched_debug_open, |
201 | .read = seq_read, | 207 | .read = seq_read, |
202 | .llseek = seq_lseek, | 208 | .llseek = seq_lseek, |
203 | .release = seq_release, | 209 | .release = single_release, |
204 | }; | 210 | }; |
205 | 211 | ||
206 | static int __init init_sched_debug_procfs(void) | 212 | static int __init init_sched_debug_procfs(void) |
@@ -235,21 +241,24 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
235 | #define P(F) \ | 241 | #define P(F) \ |
236 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) | 242 | SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) |
237 | 243 | ||
238 | P(se.wait_start); | 244 | P(se.wait_runtime); |
239 | P(se.wait_start_fair); | 245 | P(se.wait_start_fair); |
240 | P(se.exec_start); | 246 | P(se.exec_start); |
241 | P(se.sleep_start); | ||
242 | P(se.sleep_start_fair); | 247 | P(se.sleep_start_fair); |
248 | P(se.sum_exec_runtime); | ||
249 | |||
250 | #ifdef CONFIG_SCHEDSTATS | ||
251 | P(se.wait_start); | ||
252 | P(se.sleep_start); | ||
243 | P(se.block_start); | 253 | P(se.block_start); |
244 | P(se.sleep_max); | 254 | P(se.sleep_max); |
245 | P(se.block_max); | 255 | P(se.block_max); |
246 | P(se.exec_max); | 256 | P(se.exec_max); |
247 | P(se.wait_max); | 257 | P(se.wait_max); |
248 | P(se.wait_runtime); | ||
249 | P(se.wait_runtime_overruns); | 258 | P(se.wait_runtime_overruns); |
250 | P(se.wait_runtime_underruns); | 259 | P(se.wait_runtime_underruns); |
251 | P(se.sum_wait_runtime); | 260 | P(se.sum_wait_runtime); |
252 | P(se.sum_exec_runtime); | 261 | #endif |
253 | SEQ_printf(m, "%-25s:%20Ld\n", | 262 | SEQ_printf(m, "%-25s:%20Ld\n", |
254 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); | 263 | "nr_switches", (long long)(p->nvcsw + p->nivcsw)); |
255 | P(se.load.weight); | 264 | P(se.load.weight); |
@@ -269,7 +278,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
269 | 278 | ||
270 | void proc_sched_set_task(struct task_struct *p) | 279 | void proc_sched_set_task(struct task_struct *p) |
271 | { | 280 | { |
281 | #ifdef CONFIG_SCHEDSTATS | ||
272 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; | 282 | p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; |
273 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; | 283 | p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; |
284 | #endif | ||
274 | p->se.sum_exec_runtime = 0; | 285 | p->se.sum_exec_runtime = 0; |
286 | p->se.prev_sum_exec_runtime = 0; | ||
275 | } | 287 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6971db0a7160..67c67a87146e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -15,34 +15,50 @@ | |||
15 | * | 15 | * |
16 | * Scaled math optimizations by Thomas Gleixner | 16 | * Scaled math optimizations by Thomas Gleixner |
17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | 17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> |
18 | * | ||
19 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | ||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
18 | */ | 21 | */ |
19 | 22 | ||
20 | /* | 23 | /* |
21 | * Preemption granularity: | 24 | * Targeted preemption latency for CPU-bound tasks: |
22 | * (default: 2 msec, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
23 | * | 26 | * |
24 | * NOTE: this granularity value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
25 | * 'timeslice length' - timeslices in CFS will typically be somewhat | 28 | * 'timeslice length' - timeslices in CFS are of variable length. |
26 | * larger than this value. (to see the precise effective timeslice | 29 | * (to see the precise effective timeslice length of your workload, |
27 | * length of your workload, run vmstat and monitor the context-switches | 30 | * run vmstat and monitor the context-switches field) |
28 | * field) | ||
29 | * | 31 | * |
30 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * On SMP systems the value of this is multiplied by the log2 of the |
31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way |
32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | 34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) |
35 | * Targeted preemption latency for CPU-bound tasks: | ||
36 | */ | ||
37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; | ||
38 | |||
39 | /* | ||
40 | * Minimal preemption granularity for CPU-bound tasks: | ||
41 | * (default: 2 msec, units: nanoseconds) | ||
33 | */ | 42 | */ |
34 | unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; | 43 | unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; |
44 | |||
45 | /* | ||
46 | * sys_sched_yield() compat mode | ||
47 | * | ||
48 | * This option switches the aggressive yield implementation of the | ||
49 | * old scheduler back on. | ||
50 | */ | ||
51 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
35 | 52 | ||
36 | /* | 53 | /* |
37 | * SCHED_BATCH wake-up granularity. | 54 | * SCHED_BATCH wake-up granularity. |
38 | * (default: 10 msec, units: nanoseconds) | 55 | * (default: 25 msec, units: nanoseconds) |
39 | * | 56 | * |
40 | * This option delays the preemption effects of decoupled workloads | 57 | * This option delays the preemption effects of decoupled workloads |
41 | * and reduces their over-scheduling. Synchronous workloads will still | 58 | * and reduces their over-scheduling. Synchronous workloads will still |
42 | * have immediate wakeup/sleep latencies. | 59 | * have immediate wakeup/sleep latencies. |
43 | */ | 60 | */ |
44 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = | 61 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; |
45 | 10000000000ULL/HZ; | ||
46 | 62 | ||
47 | /* | 63 | /* |
48 | * SCHED_OTHER wake-up granularity. | 64 | * SCHED_OTHER wake-up granularity. |
@@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = | |||
52 | * and reduces their over-scheduling. Synchronous workloads will still | 68 | * and reduces their over-scheduling. Synchronous workloads will still |
53 | * have immediate wakeup/sleep latencies. | 69 | * have immediate wakeup/sleep latencies. |
54 | */ | 70 | */ |
55 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; | 71 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; |
56 | 72 | ||
57 | unsigned int sysctl_sched_stat_granularity __read_mostly; | 73 | unsigned int sysctl_sched_stat_granularity __read_mostly; |
58 | 74 | ||
59 | /* | 75 | /* |
60 | * Initialized in sched_init_granularity(): | 76 | * Initialized in sched_init_granularity() [to 5 times the base granularity]: |
61 | */ | 77 | */ |
62 | unsigned int sysctl_sched_runtime_limit __read_mostly; | 78 | unsigned int sysctl_sched_runtime_limit __read_mostly; |
63 | 79 | ||
@@ -75,7 +91,7 @@ enum { | |||
75 | 91 | ||
76 | unsigned int sysctl_sched_features __read_mostly = | 92 | unsigned int sysctl_sched_features __read_mostly = |
77 | SCHED_FEAT_FAIR_SLEEPERS *1 | | 93 | SCHED_FEAT_FAIR_SLEEPERS *1 | |
78 | SCHED_FEAT_SLEEPER_AVG *1 | | 94 | SCHED_FEAT_SLEEPER_AVG *0 | |
79 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | 95 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | |
80 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | 96 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | |
81 | SCHED_FEAT_START_DEBIT *1 | | 97 | SCHED_FEAT_START_DEBIT *1 | |
@@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
186 | update_load_add(&cfs_rq->load, se->load.weight); | 202 | update_load_add(&cfs_rq->load, se->load.weight); |
187 | cfs_rq->nr_running++; | 203 | cfs_rq->nr_running++; |
188 | se->on_rq = 1; | 204 | se->on_rq = 1; |
205 | |||
206 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
189 | } | 207 | } |
190 | 208 | ||
191 | static inline void | 209 | static inline void |
@@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
197 | update_load_sub(&cfs_rq->load, se->load.weight); | 215 | update_load_sub(&cfs_rq->load, se->load.weight); |
198 | cfs_rq->nr_running--; | 216 | cfs_rq->nr_running--; |
199 | se->on_rq = 0; | 217 | se->on_rq = 0; |
218 | |||
219 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | ||
200 | } | 220 | } |
201 | 221 | ||
202 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 222 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
@@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
214 | */ | 234 | */ |
215 | 235 | ||
216 | /* | 236 | /* |
237 | * Calculate the preemption granularity needed to schedule every | ||
238 | * runnable task once per sysctl_sched_latency amount of time. | ||
239 | * (down to a sensible low limit on granularity) | ||
240 | * | ||
241 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
242 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
243 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
244 | * for each task. We do finer and finer scheduling up until we | ||
245 | * reach the minimum granularity value. | ||
246 | * | ||
247 | * To achieve this we use the following dynamic-granularity rule: | ||
248 | * | ||
249 | * gran = lat/nr - lat/nr/nr | ||
250 | * | ||
251 | * This comes out of the following equations: | ||
252 | * | ||
253 | * kA1 + gran = kB1 | ||
254 | * kB2 + gran = kA2 | ||
255 | * kA2 = kA1 | ||
256 | * kB2 = kB1 - d + d/nr | ||
257 | * lat = d * nr | ||
258 | * | ||
259 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
260 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
261 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
262 | * running, 'lat' is the the period of each task. ('lat' is the | ||
263 | * sched_latency that we aim for.) | ||
264 | */ | ||
265 | static long | ||
266 | sched_granularity(struct cfs_rq *cfs_rq) | ||
267 | { | ||
268 | unsigned int gran = sysctl_sched_latency; | ||
269 | unsigned int nr = cfs_rq->nr_running; | ||
270 | |||
271 | if (nr > 1) { | ||
272 | gran = gran/nr - gran/nr/nr; | ||
273 | gran = max(gran, sysctl_sched_min_granularity); | ||
274 | } | ||
275 | |||
276 | return gran; | ||
277 | } | ||
278 | |||
279 | /* | ||
217 | * We rescale the rescheduling granularity of tasks according to their | 280 | * We rescale the rescheduling granularity of tasks according to their |
218 | * nice level, but only linearly, not exponentially: | 281 | * nice level, but only linearly, not exponentially: |
219 | */ | 282 | */ |
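To make the dynamic-granularity rule above concrete, the stand-alone arithmetic below plugs in the new defaults (20 ms sysctl_sched_latency, 2 ms sysctl_sched_min_granularity); it is an illustration, not kernel code:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int lat = 20000000;      /* sysctl_sched_latency default, ns */
            const unsigned int min_gran = 2000000;  /* sysctl_sched_min_granularity, ns */
            unsigned int nr;

            for (nr = 1; nr <= 10; nr++) {
                    unsigned int gran = lat;

                    if (nr > 1) {
                            gran = lat / nr - lat / nr / nr;
                            if (gran < min_gran)
                                    gran = min_gran;
                    }
                    printf("nr_running=%2u -> granularity %u.%02u ms\n",
                           nr, gran / 1000000, (gran % 1000000) / 10000);
            }
            return 0;
    }

Two runnable tasks switch every 5 ms; with nine or more runnable tasks the formula falls below the 2 ms floor and is clamped there.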
@@ -222,21 +285,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) | |||
222 | { | 285 | { |
223 | u64 tmp; | 286 | u64 tmp; |
224 | 287 | ||
288 | if (likely(curr->load.weight == NICE_0_LOAD)) | ||
289 | return granularity; | ||
225 | /* | 290 | /* |
226 | * Negative nice levels get the same granularity as nice-0: | 291 | * Positive nice levels get the same granularity as nice-0: |
227 | */ | 292 | */ |
228 | if (likely(curr->load.weight >= NICE_0_LOAD)) | 293 | if (likely(curr->load.weight < NICE_0_LOAD)) { |
229 | return granularity; | 294 | tmp = curr->load.weight * (u64)granularity; |
295 | return (long) (tmp >> NICE_0_SHIFT); | ||
296 | } | ||
230 | /* | 297 | /* |
231 | * Positive nice level tasks get linearly finer | 298 | * Negative nice level tasks get linearly finer |
232 | * granularity: | 299 | * granularity: |
233 | */ | 300 | */ |
234 | tmp = curr->load.weight * (u64)granularity; | 301 | tmp = curr->load.inv_weight * (u64)granularity; |
235 | 302 | ||
236 | /* | 303 | /* |
237 | * It will always fit into 'long': | 304 | * It will always fit into 'long': |
238 | */ | 305 | */ |
239 | return (long) (tmp >> NICE_0_SHIFT); | 306 | return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); |
240 | } | 307 | } |
241 | 308 | ||
242 | static inline void | 309 | static inline void |
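The reworked niced_granularity() scales the granularity linearly with the task's load weight on both sides of nice-0. The stand-alone sketch below mirrors that math, assuming NICE_0_SHIFT = 10 (so NICE_0_LOAD = 1024) and WMULT_SHIFT = 32 as used elsewhere in this scheduler, and approximating inv_weight as 2^32 / weight; the example weights 512 and 2048 are hypothetical, not entries taken from the real weight table:

    #include <stdio.h>
    #include <stdint.h>

    static long niced_gran(unsigned long weight, unsigned long granularity)
    {
            const unsigned long nice_0_load = 1024;         /* 1 << NICE_0_SHIFT */
            uint64_t tmp;

            if (weight == nice_0_load)
                    return granularity;
            if (weight < nice_0_load) {                     /* lighter: positive nice */
                    tmp = (uint64_t)weight * granularity;
                    return (long)(tmp >> 10);               /* >> NICE_0_SHIFT */
            }
            /* heavier (negative nice): scale by the inverse weight */
            tmp = ((1ULL << 32) / weight) * granularity;
            return (long)(tmp >> (32 - 10));                /* >> (WMULT_SHIFT - NICE_0_SHIFT) */
    }

    int main(void)
    {
            printf("weight  512: %ld ns\n", niced_gran(512, 2000000));   /* 1 ms */
            printf("weight 1024: %ld ns\n", niced_gran(1024, 2000000));  /* 2 ms */
            printf("weight 2048: %ld ns\n", niced_gran(2048, 2000000));  /* 1 ms */
            return 0;
    }

With these inputs both non-default weights come out at half the nice-0 granularity.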
@@ -281,34 +348,28 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | |||
281 | * are not in our scheduling class. | 348 | * are not in our scheduling class. |
282 | */ | 349 | */ |
283 | static inline void | 350 | static inline void |
284 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) | 351 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
285 | { | 352 | { |
286 | unsigned long delta, delta_exec, delta_fair; | 353 | unsigned long delta, delta_exec, delta_fair, delta_mine; |
287 | long delta_mine; | ||
288 | struct load_weight *lw = &cfs_rq->load; | 354 | struct load_weight *lw = &cfs_rq->load; |
289 | unsigned long load = lw->weight; | 355 | unsigned long load = lw->weight; |
290 | 356 | ||
291 | if (unlikely(!load)) | ||
292 | return; | ||
293 | |||
294 | delta_exec = curr->delta_exec; | 357 | delta_exec = curr->delta_exec; |
295 | #ifdef CONFIG_SCHEDSTATS | 358 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
296 | if (unlikely(delta_exec > curr->exec_max)) | ||
297 | curr->exec_max = delta_exec; | ||
298 | #endif | ||
299 | 359 | ||
300 | curr->sum_exec_runtime += delta_exec; | 360 | curr->sum_exec_runtime += delta_exec; |
301 | cfs_rq->exec_clock += delta_exec; | 361 | cfs_rq->exec_clock += delta_exec; |
302 | 362 | ||
363 | if (unlikely(!load)) | ||
364 | return; | ||
365 | |||
303 | delta_fair = calc_delta_fair(delta_exec, lw); | 366 | delta_fair = calc_delta_fair(delta_exec, lw); |
304 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | 367 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); |
305 | 368 | ||
306 | if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { | 369 | if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { |
307 | delta = calc_delta_mine(cfs_rq->sleeper_bonus, | 370 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); |
308 | curr->load.weight, lw); | 371 | delta = min(delta, (unsigned long)( |
309 | if (unlikely(delta > cfs_rq->sleeper_bonus)) | 372 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); |
310 | delta = cfs_rq->sleeper_bonus; | ||
311 | |||
312 | cfs_rq->sleeper_bonus -= delta; | 373 | cfs_rq->sleeper_bonus -= delta; |
313 | delta_mine -= delta; | 374 | delta_mine -= delta; |
314 | } | 375 | } |
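Several hunks here and in sched.c replace open-coded #ifdef CONFIG_SCHEDSTATS blocks with schedstat_set()/schedstat_add() calls, as in __update_curr() above. The macros presumably have the shape sketched below (kernel/sched_stats.h itself is not part of this diff), which is what lets the call sites stay #ifdef-free:

    #ifdef CONFIG_SCHEDSTATS
    # define schedstat_set(var, val)        do { var = (val); } while (0)
    # define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
    #else
    # define schedstat_set(var, val)        do { } while (0)
    # define schedstat_add(rq, field, amt)  do { } while (0)
    #endif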
@@ -324,7 +385,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) | |||
324 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 385 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); |
325 | } | 386 | } |
326 | 387 | ||
327 | static void update_curr(struct cfs_rq *cfs_rq, u64 now) | 388 | static void update_curr(struct cfs_rq *cfs_rq) |
328 | { | 389 | { |
329 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 390 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); |
330 | unsigned long delta_exec; | 391 | unsigned long delta_exec; |
@@ -337,22 +398,22 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now) | |||
337 | * since the last time we changed load (this cannot | 398 | * since the last time we changed load (this cannot |
338 | * overflow on 32 bits): | 399 | * overflow on 32 bits): |
339 | */ | 400 | */ |
340 | delta_exec = (unsigned long)(now - curr->exec_start); | 401 | delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); |
341 | 402 | ||
342 | curr->delta_exec += delta_exec; | 403 | curr->delta_exec += delta_exec; |
343 | 404 | ||
344 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | 405 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { |
345 | __update_curr(cfs_rq, curr, now); | 406 | __update_curr(cfs_rq, curr); |
346 | curr->delta_exec = 0; | 407 | curr->delta_exec = 0; |
347 | } | 408 | } |
348 | curr->exec_start = now; | 409 | curr->exec_start = rq_of(cfs_rq)->clock; |
349 | } | 410 | } |
350 | 411 | ||
351 | static inline void | 412 | static inline void |
352 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 413 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
353 | { | 414 | { |
354 | se->wait_start_fair = cfs_rq->fair_clock; | 415 | se->wait_start_fair = cfs_rq->fair_clock; |
355 | se->wait_start = now; | 416 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); |
356 | } | 417 | } |
357 | 418 | ||
358 | /* | 419 | /* |
@@ -380,8 +441,7 @@ calc_weighted(unsigned long delta, unsigned long weight, int shift) | |||
380 | /* | 441 | /* |
381 | * Task is being enqueued - update stats: | 442 | * Task is being enqueued - update stats: |
382 | */ | 443 | */ |
383 | static void | 444 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
384 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
385 | { | 445 | { |
386 | s64 key; | 446 | s64 key; |
387 | 447 | ||
@@ -390,7 +450,7 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
390 | * a dequeue/enqueue event is a NOP) | 450 | * a dequeue/enqueue event is a NOP) |
391 | */ | 451 | */ |
392 | if (se != cfs_rq_curr(cfs_rq)) | 452 | if (se != cfs_rq_curr(cfs_rq)) |
393 | update_stats_wait_start(cfs_rq, se, now); | 453 | update_stats_wait_start(cfs_rq, se); |
394 | /* | 454 | /* |
395 | * Update the key: | 455 | * Update the key: |
396 | */ | 456 | */ |
@@ -410,7 +470,8 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
410 | (WMULT_SHIFT - NICE_0_SHIFT); | 470 | (WMULT_SHIFT - NICE_0_SHIFT); |
411 | } else { | 471 | } else { |
412 | tmp = se->wait_runtime; | 472 | tmp = se->wait_runtime; |
413 | key -= (tmp * se->load.weight) >> NICE_0_SHIFT; | 473 | key -= (tmp * se->load.inv_weight) >> |
474 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
414 | } | 475 | } |
415 | } | 476 | } |
416 | 477 | ||
@@ -421,17 +482,12 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
421 | * Note: must be called with a freshly updated rq->fair_clock. | 482 | * Note: must be called with a freshly updated rq->fair_clock. |
422 | */ | 483 | */ |
423 | static inline void | 484 | static inline void |
424 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 485 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
425 | { | 486 | { |
426 | unsigned long delta_fair = se->delta_fair_run; | 487 | unsigned long delta_fair = se->delta_fair_run; |
427 | 488 | ||
428 | #ifdef CONFIG_SCHEDSTATS | 489 | schedstat_set(se->wait_max, max(se->wait_max, |
429 | { | 490 | rq_of(cfs_rq)->clock - se->wait_start)); |
430 | s64 delta_wait = now - se->wait_start; | ||
431 | if (unlikely(delta_wait > se->wait_max)) | ||
432 | se->wait_max = delta_wait; | ||
433 | } | ||
434 | #endif | ||
435 | 491 | ||
436 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 492 | if (unlikely(se->load.weight != NICE_0_LOAD)) |
437 | delta_fair = calc_weighted(delta_fair, se->load.weight, | 493 | delta_fair = calc_weighted(delta_fair, se->load.weight, |
@@ -441,53 +497,56 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
441 | } | 497 | } |
442 | 498 | ||
443 | static void | 499 | static void |
444 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 500 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
445 | { | 501 | { |
446 | unsigned long delta_fair; | 502 | unsigned long delta_fair; |
447 | 503 | ||
504 | if (unlikely(!se->wait_start_fair)) | ||
505 | return; | ||
506 | |||
448 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | 507 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), |
449 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | 508 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); |
450 | 509 | ||
451 | se->delta_fair_run += delta_fair; | 510 | se->delta_fair_run += delta_fair; |
452 | if (unlikely(abs(se->delta_fair_run) >= | 511 | if (unlikely(abs(se->delta_fair_run) >= |
453 | sysctl_sched_stat_granularity)) { | 512 | sysctl_sched_stat_granularity)) { |
454 | __update_stats_wait_end(cfs_rq, se, now); | 513 | __update_stats_wait_end(cfs_rq, se); |
455 | se->delta_fair_run = 0; | 514 | se->delta_fair_run = 0; |
456 | } | 515 | } |
457 | 516 | ||
458 | se->wait_start_fair = 0; | 517 | se->wait_start_fair = 0; |
459 | se->wait_start = 0; | 518 | schedstat_set(se->wait_start, 0); |
460 | } | 519 | } |
461 | 520 | ||
462 | static inline void | 521 | static inline void |
463 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 522 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
464 | { | 523 | { |
465 | update_curr(cfs_rq, now); | 524 | update_curr(cfs_rq); |
466 | /* | 525 | /* |
467 | * Mark the end of the wait period if dequeueing a | 526 | * Mark the end of the wait period if dequeueing a |
468 | * waiting task: | 527 | * waiting task: |
469 | */ | 528 | */ |
470 | if (se != cfs_rq_curr(cfs_rq)) | 529 | if (se != cfs_rq_curr(cfs_rq)) |
471 | update_stats_wait_end(cfs_rq, se, now); | 530 | update_stats_wait_end(cfs_rq, se); |
472 | } | 531 | } |
473 | 532 | ||
474 | /* | 533 | /* |
475 | * We are picking a new current task - update its stats: | 534 | * We are picking a new current task - update its stats: |
476 | */ | 535 | */ |
477 | static inline void | 536 | static inline void |
478 | update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 537 | update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
479 | { | 538 | { |
480 | /* | 539 | /* |
481 | * We are starting a new run period: | 540 | * We are starting a new run period: |
482 | */ | 541 | */ |
483 | se->exec_start = now; | 542 | se->exec_start = rq_of(cfs_rq)->clock; |
484 | } | 543 | } |
485 | 544 | ||
486 | /* | 545 | /* |
487 | * We are descheduling a task - update its stats: | 546 | * We are descheduling a task - update its stats: |
488 | */ | 547 | */ |
489 | static inline void | 548 | static inline void |
490 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 549 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
491 | { | 550 | { |
492 | se->exec_start = 0; | 551 | se->exec_start = 0; |
493 | } | 552 | } |
@@ -496,12 +555,18 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
496 | * Scheduling class queueing methods: | 555 | * Scheduling class queueing methods: |
497 | */ | 556 | */ |
498 | 557 | ||
499 | static void | 558 | static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
500 | __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
501 | { | 559 | { |
502 | unsigned long load = cfs_rq->load.weight, delta_fair; | 560 | unsigned long load = cfs_rq->load.weight, delta_fair; |
503 | long prev_runtime; | 561 | long prev_runtime; |
504 | 562 | ||
563 | /* | ||
564 | * Do not boost sleepers if there's too much bonus 'in flight' | ||
565 | * already: | ||
566 | */ | ||
567 | if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) | ||
568 | return; | ||
569 | |||
505 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | 570 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) |
506 | load = rq_of(cfs_rq)->cpu_load[2]; | 571 | load = rq_of(cfs_rq)->cpu_load[2]; |
507 | 572 | ||
@@ -527,12 +592,9 @@ __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
527 | * Track the amount of bonus we've given to sleepers: | 592 | * Track the amount of bonus we've given to sleepers: |
528 | */ | 593 | */ |
529 | cfs_rq->sleeper_bonus += delta_fair; | 594 | cfs_rq->sleeper_bonus += delta_fair; |
530 | |||
531 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
532 | } | 595 | } |
533 | 596 | ||
534 | static void | 597 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
535 | enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | ||
536 | { | 598 | { |
537 | struct task_struct *tsk = task_of(se); | 599 | struct task_struct *tsk = task_of(se); |
538 | unsigned long delta_fair; | 600 | unsigned long delta_fair; |
@@ -547,7 +609,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
547 | se->delta_fair_sleep += delta_fair; | 609 | se->delta_fair_sleep += delta_fair; |
548 | if (unlikely(abs(se->delta_fair_sleep) >= | 610 | if (unlikely(abs(se->delta_fair_sleep) >= |
549 | sysctl_sched_stat_granularity)) { | 611 | sysctl_sched_stat_granularity)) { |
550 | __enqueue_sleeper(cfs_rq, se, now); | 612 | __enqueue_sleeper(cfs_rq, se); |
551 | se->delta_fair_sleep = 0; | 613 | se->delta_fair_sleep = 0; |
552 | } | 614 | } |
553 | 615 | ||
@@ -555,7 +617,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
555 | 617 | ||
556 | #ifdef CONFIG_SCHEDSTATS | 618 | #ifdef CONFIG_SCHEDSTATS |
557 | if (se->sleep_start) { | 619 | if (se->sleep_start) { |
558 | u64 delta = now - se->sleep_start; | 620 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
559 | 621 | ||
560 | if ((s64)delta < 0) | 622 | if ((s64)delta < 0) |
561 | delta = 0; | 623 | delta = 0; |
@@ -567,7 +629,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
567 | se->sum_sleep_runtime += delta; | 629 | se->sum_sleep_runtime += delta; |
568 | } | 630 | } |
569 | if (se->block_start) { | 631 | if (se->block_start) { |
570 | u64 delta = now - se->block_start; | 632 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
571 | 633 | ||
572 | if ((s64)delta < 0) | 634 | if ((s64)delta < 0) |
573 | delta = 0; | 635 | delta = 0; |
@@ -577,31 +639,39 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
577 | 639 | ||
578 | se->block_start = 0; | 640 | se->block_start = 0; |
579 | se->sum_sleep_runtime += delta; | 641 | se->sum_sleep_runtime += delta; |
642 | |||
643 | /* | ||
644 | * Blocking time is in units of nanosecs, so shift by 20 to | ||
645 | * get a milliseconds-range estimation of the amount of | ||
646 | * time that the task spent sleeping: | ||
647 | */ | ||
648 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
649 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | ||
650 | delta >> 20); | ||
651 | } | ||
580 | } | 652 | } |
581 | #endif | 653 | #endif |
582 | } | 654 | } |
583 | 655 | ||
584 | static void | 656 | static void |
585 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 657 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) |
586 | int wakeup, u64 now) | ||
587 | { | 658 | { |
588 | /* | 659 | /* |
589 | * Update the fair clock. | 660 | * Update the fair clock. |
590 | */ | 661 | */ |
591 | update_curr(cfs_rq, now); | 662 | update_curr(cfs_rq); |
592 | 663 | ||
593 | if (wakeup) | 664 | if (wakeup) |
594 | enqueue_sleeper(cfs_rq, se, now); | 665 | enqueue_sleeper(cfs_rq, se); |
595 | 666 | ||
596 | update_stats_enqueue(cfs_rq, se, now); | 667 | update_stats_enqueue(cfs_rq, se); |
597 | __enqueue_entity(cfs_rq, se); | 668 | __enqueue_entity(cfs_rq, se); |
598 | } | 669 | } |
599 | 670 | ||
600 | static void | 671 | static void |
601 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 672 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
602 | int sleep, u64 now) | ||
603 | { | 673 | { |
604 | update_stats_dequeue(cfs_rq, se, now); | 674 | update_stats_dequeue(cfs_rq, se); |
605 | if (sleep) { | 675 | if (sleep) { |
606 | se->sleep_start_fair = cfs_rq->fair_clock; | 676 | se->sleep_start_fair = cfs_rq->fair_clock; |
607 | #ifdef CONFIG_SCHEDSTATS | 677 | #ifdef CONFIG_SCHEDSTATS |
@@ -609,11 +679,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
609 | struct task_struct *tsk = task_of(se); | 679 | struct task_struct *tsk = task_of(se); |
610 | 680 | ||
611 | if (tsk->state & TASK_INTERRUPTIBLE) | 681 | if (tsk->state & TASK_INTERRUPTIBLE) |
612 | se->sleep_start = now; | 682 | se->sleep_start = rq_of(cfs_rq)->clock; |
613 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 683 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
614 | se->block_start = now; | 684 | se->block_start = rq_of(cfs_rq)->clock; |
615 | } | 685 | } |
616 | cfs_rq->wait_runtime -= se->wait_runtime; | ||
617 | #endif | 686 | #endif |
618 | } | 687 | } |
619 | __dequeue_entity(cfs_rq, se); | 688 | __dequeue_entity(cfs_rq, se); |
@@ -627,18 +696,38 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
627 | struct sched_entity *curr, unsigned long granularity) | 696 | struct sched_entity *curr, unsigned long granularity) |
628 | { | 697 | { |
629 | s64 __delta = curr->fair_key - se->fair_key; | 698 | s64 __delta = curr->fair_key - se->fair_key; |
699 | unsigned long ideal_runtime, delta_exec; | ||
700 | |||
701 | /* | ||
702 | * ideal_runtime is compared against sum_exec_runtime, which is | ||
703 | * walltime, hence do not scale. | ||
704 | */ | ||
705 | ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, | ||
706 | (unsigned long)sysctl_sched_min_granularity); | ||
707 | |||
708 | /* | ||
709 | * If we executed more than what the latency constraint suggests, | ||
710 | * reduce the rescheduling granularity. This way the total latency | ||
711 | * of how much a task is not scheduled converges to | ||
712 | * sysctl_sched_latency: | ||
713 | */ | ||
714 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | ||
715 | if (delta_exec > ideal_runtime) | ||
716 | granularity = 0; | ||
630 | 717 | ||
631 | /* | 718 | /* |
632 | * Take scheduling granularity into account - do not | 719 | * Take scheduling granularity into account - do not |
633 | * preempt the current task unless the best task has | 720 | * preempt the current task unless the best task has |
634 | * a larger than sched_granularity fairness advantage: | 721 | * a larger than sched_granularity fairness advantage: |
722 | * | ||
723 | * scale granularity as key space is in fair_clock. | ||
635 | */ | 724 | */ |
636 | if (__delta > niced_granularity(curr, granularity)) | 725 | if (__delta > niced_granularity(curr, granularity)) |
637 | resched_task(rq_of(cfs_rq)->curr); | 726 | resched_task(rq_of(cfs_rq)->curr); |
638 | } | 727 | } |
639 | 728 | ||
640 | static inline void | 729 | static inline void |
641 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 730 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
642 | { | 731 | { |
643 | /* | 732 | /* |
644 | * Any task has to be enqueued before it gets to execute on | 733 |
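The wall-time check added to __check_preempt_curr_fair() above bounds how long the current task may keep running before any positive key delta is enough to preempt it. A small stand-alone calculation with the default tunables (20 ms latency, 2 ms minimum granularity):

    #include <stdio.h>

    int main(void)
    {
            const unsigned long latency = 20000000;         /* ns */
            const unsigned long min_gran = 2000000;         /* ns */
            unsigned long nr_running = 4;
            unsigned long delta_exec = 6000000;             /* ran 6 ms since last picked */
            unsigned long ideal_runtime;

            ideal_runtime = latency / nr_running;
            if (ideal_runtime < min_gran)
                    ideal_runtime = min_gran;

            printf("ideal_runtime = %lu ms\n", ideal_runtime / 1000000);
            if (delta_exec > ideal_runtime)
                    printf("over budget: granularity forced to 0, preempt on any fairness advantage\n");
            return 0;
    }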
@@ -647,49 +736,47 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | |||
647 | * done a put_prev_task_fair() shortly before this, which | 736 | * done a put_prev_task_fair() shortly before this, which |
648 | * updated rq->fair_clock - used by update_stats_wait_end()) | 737 | * updated rq->fair_clock - used by update_stats_wait_end()) |
649 | */ | 738 | */ |
650 | update_stats_wait_end(cfs_rq, se, now); | 739 | update_stats_wait_end(cfs_rq, se); |
651 | update_stats_curr_start(cfs_rq, se, now); | 740 | update_stats_curr_start(cfs_rq, se); |
652 | set_cfs_rq_curr(cfs_rq, se); | 741 | set_cfs_rq_curr(cfs_rq, se); |
742 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | ||
653 | } | 743 | } |
654 | 744 | ||
655 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) | 745 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
656 | { | 746 | { |
657 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 747 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
658 | 748 | ||
659 | set_next_entity(cfs_rq, se, now); | 749 | set_next_entity(cfs_rq, se); |
660 | 750 | ||
661 | return se; | 751 | return se; |
662 | } | 752 | } |
663 | 753 | ||
664 | static void | 754 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
665 | put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) | ||
666 | { | 755 | { |
667 | /* | 756 | /* |
668 | * If still on the runqueue then deactivate_task() | 757 | * If still on the runqueue then deactivate_task() |
669 | * was not called and update_curr() has to be done: | 758 | * was not called and update_curr() has to be done: |
670 | */ | 759 | */ |
671 | if (prev->on_rq) | 760 | if (prev->on_rq) |
672 | update_curr(cfs_rq, now); | 761 | update_curr(cfs_rq); |
673 | 762 | ||
674 | update_stats_curr_end(cfs_rq, prev, now); | 763 | update_stats_curr_end(cfs_rq, prev); |
675 | 764 | ||
676 | if (prev->on_rq) | 765 | if (prev->on_rq) |
677 | update_stats_wait_start(cfs_rq, prev, now); | 766 | update_stats_wait_start(cfs_rq, prev); |
678 | set_cfs_rq_curr(cfs_rq, NULL); | 767 | set_cfs_rq_curr(cfs_rq, NULL); |
679 | } | 768 | } |
680 | 769 | ||
681 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 770 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
682 | { | 771 | { |
683 | struct rq *rq = rq_of(cfs_rq); | ||
684 | struct sched_entity *next; | 772 | struct sched_entity *next; |
685 | u64 now = __rq_clock(rq); | ||
686 | 773 | ||
687 | /* | 774 | /* |
688 | * Dequeue and enqueue the task to update its | 775 | * Dequeue and enqueue the task to update its |
689 | * position within the tree: | 776 | * position within the tree: |
690 | */ | 777 | */ |
691 | dequeue_entity(cfs_rq, curr, 0, now); | 778 | dequeue_entity(cfs_rq, curr, 0); |
692 | enqueue_entity(cfs_rq, curr, 0, now); | 779 | enqueue_entity(cfs_rq, curr, 0); |
693 | 780 | ||
694 | /* | 781 | /* |
695 | * Reschedule if another task tops the current one. | 782 | * Reschedule if another task tops the current one. |
@@ -698,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
698 | if (next == curr) | 785 | if (next == curr) |
699 | return; | 786 | return; |
700 | 787 | ||
701 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); | 788 | __check_preempt_curr_fair(cfs_rq, next, curr, |
789 | sched_granularity(cfs_rq)); | ||
702 | } | 790 | } |
703 | 791 | ||
704 | /************************************************** | 792 | /************************************************** |
@@ -794,8 +882,7 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | |||
794 | * increased. Here we update the fair scheduling stats and | 882 | * increased. Here we update the fair scheduling stats and |
795 | * then put the task into the rbtree: | 883 | * then put the task into the rbtree: |
796 | */ | 884 | */ |
797 | static void | 885 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
798 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
799 | { | 886 | { |
800 | struct cfs_rq *cfs_rq; | 887 | struct cfs_rq *cfs_rq; |
801 | struct sched_entity *se = &p->se; | 888 | struct sched_entity *se = &p->se; |
@@ -804,7 +891,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | |||
804 | if (se->on_rq) | 891 | if (se->on_rq) |
805 | break; | 892 | break; |
806 | cfs_rq = cfs_rq_of(se); | 893 | cfs_rq = cfs_rq_of(se); |
807 | enqueue_entity(cfs_rq, se, wakeup, now); | 894 | enqueue_entity(cfs_rq, se, wakeup); |
808 | } | 895 | } |
809 | } | 896 | } |
810 | 897 | ||
@@ -813,15 +900,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | |||
813 | * decreased. We remove the task from the rbtree and | 900 | * decreased. We remove the task from the rbtree and |
814 | * update the fair scheduling stats: | 901 | * update the fair scheduling stats: |
815 | */ | 902 | */ |
816 | static void | 903 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
817 | dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
818 | { | 904 | { |
819 | struct cfs_rq *cfs_rq; | 905 | struct cfs_rq *cfs_rq; |
820 | struct sched_entity *se = &p->se; | 906 | struct sched_entity *se = &p->se; |
821 | 907 | ||
822 | for_each_sched_entity(se) { | 908 | for_each_sched_entity(se) { |
823 | cfs_rq = cfs_rq_of(se); | 909 | cfs_rq = cfs_rq_of(se); |
824 | dequeue_entity(cfs_rq, se, sleep, now); | 910 | dequeue_entity(cfs_rq, se, sleep); |
825 | /* Don't dequeue parent if it has other entities besides us */ | 911 | /* Don't dequeue parent if it has other entities besides us */ |
826 | if (cfs_rq->load.weight) | 912 | if (cfs_rq->load.weight) |
827 | break; | 913 | break; |
@@ -829,19 +915,62 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) | |||
829 | } | 915 | } |
830 | 916 | ||
831 | /* | 917 | /* |
832 | * sched_yield() support is very simple - we dequeue and enqueue | 918 | * sched_yield() support is very simple - we dequeue and enqueue. |
919 | * | ||
920 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
833 | */ | 921 | */ |
834 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 922 | static void yield_task_fair(struct rq *rq, struct task_struct *p) |
835 | { | 923 | { |
836 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 924 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
837 | u64 now = __rq_clock(rq); | 925 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
926 | struct sched_entity *rightmost, *se = &p->se; | ||
927 | struct rb_node *parent; | ||
838 | 928 | ||
839 | /* | 929 | /* |
840 | * Dequeue and enqueue the task to update its | 930 | * Are we the only task in the tree? |
841 | * position within the tree: | 931 | */ |
932 | if (unlikely(cfs_rq->nr_running == 1)) | ||
933 | return; | ||
934 | |||
935 | if (likely(!sysctl_sched_compat_yield)) { | ||
936 | __update_rq_clock(rq); | ||
937 | /* | ||
938 | * Dequeue and enqueue the task to update its | ||
939 | * position within the tree: | ||
940 | */ | ||
941 | dequeue_entity(cfs_rq, &p->se, 0); | ||
942 | enqueue_entity(cfs_rq, &p->se, 0); | ||
943 | |||
944 | return; | ||
945 | } | ||
946 | /* | ||
947 | * Find the rightmost entry in the rbtree: | ||
948 | */ | ||
949 | do { | ||
950 | parent = *link; | ||
951 | link = &parent->rb_right; | ||
952 | } while (*link); | ||
953 | |||
954 | rightmost = rb_entry(parent, struct sched_entity, run_node); | ||
955 | /* | ||
956 | * Already in the rightmost position? | ||
842 | */ | 957 | */ |
843 | dequeue_entity(cfs_rq, &p->se, 0, now); | 958 | if (unlikely(rightmost == se)) |
844 | enqueue_entity(cfs_rq, &p->se, 0, now); | 959 | return; |
960 | |||
961 | /* | ||
962 | * Minimally necessary key value to be last in the tree: | ||
963 | */ | ||
964 | se->fair_key = rightmost->fair_key + 1; | ||
965 | |||
966 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
967 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
968 | /* | ||
969 | * Relink the task to the rightmost position: | ||
970 | */ | ||
971 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
972 | rb_link_node(&se->run_node, parent, link); | ||
973 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
845 | } | 974 | } |
846 | 975 | ||
847 | /* | 976 | /* |
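
The yield_task_fair() rework above adds an optional compat mode that walks the CFS rbtree to its rightmost node and re-links the yielding entity there with a key one past the current maximum. Below is a plain user-space sketch of that "follow the right child until it runs out" idiom on a toy binary search tree; the struct layout and helper names are illustrative stand-ins, not the kernel's rb_node API.

    #include <stdio.h>

    /* Toy BST node; stands in for the kernel's rb_node + sched_entity pair. */
    struct node {
        long key;
        struct node *left, *right;
    };

    static struct node *insert(struct node *root, struct node *n)
    {
        if (!root)
            return n;
        if (n->key < root->key)
            root->left = insert(root->left, n);
        else
            root->right = insert(root->right, n);
        return root;
    }

    /* Mirror of the "keep taking rb_right until NULL" loop in yield_task_fair(). */
    static struct node *rightmost(struct node *root)
    {
        while (root && root->right)
            root = root->right;
        return root;
    }

    int main(void)
    {
        struct node a = { 10, NULL, NULL }, b = { 20, NULL, NULL }, c = { 30, NULL, NULL };
        struct node *root = NULL;

        root = insert(root, &b);
        root = insert(root, &a);
        root = insert(root, &c);

        /* "Yield": give node a a key just past the current maximum, so a
         * re-insert would place it last, as the compat path does. */
        a.key = rightmost(root)->key + 1;
        printf("new key for yielding node: %ld\n", a.key);
        return 0;
    }
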
@@ -854,7 +983,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
854 | unsigned long gran; | 983 | unsigned long gran; |
855 | 984 | ||
856 | if (unlikely(rt_prio(p->prio))) { | 985 | if (unlikely(rt_prio(p->prio))) { |
857 | update_curr(cfs_rq, rq_clock(rq)); | 986 | update_rq_clock(rq); |
987 | update_curr(cfs_rq); | ||
858 | resched_task(curr); | 988 | resched_task(curr); |
859 | return; | 989 | return; |
860 | } | 990 | } |
@@ -870,7 +1000,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
870 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 1000 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); |
871 | } | 1001 | } |
872 | 1002 | ||
873 | static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) | 1003 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
874 | { | 1004 | { |
875 | struct cfs_rq *cfs_rq = &rq->cfs; | 1005 | struct cfs_rq *cfs_rq = &rq->cfs; |
876 | struct sched_entity *se; | 1006 | struct sched_entity *se; |
@@ -879,7 +1009,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) | |||
879 | return NULL; | 1009 | return NULL; |
880 | 1010 | ||
881 | do { | 1011 | do { |
882 | se = pick_next_entity(cfs_rq, now); | 1012 | se = pick_next_entity(cfs_rq); |
883 | cfs_rq = group_cfs_rq(se); | 1013 | cfs_rq = group_cfs_rq(se); |
884 | } while (cfs_rq); | 1014 | } while (cfs_rq); |
885 | 1015 | ||
@@ -889,14 +1019,14 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) | |||
889 | /* | 1019 | /* |
890 | * Account for a descheduled task: | 1020 | * Account for a descheduled task: |
891 | */ | 1021 | */ |
892 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) | 1022 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) |
893 | { | 1023 | { |
894 | struct sched_entity *se = &prev->se; | 1024 | struct sched_entity *se = &prev->se; |
895 | struct cfs_rq *cfs_rq; | 1025 | struct cfs_rq *cfs_rq; |
896 | 1026 | ||
897 | for_each_sched_entity(se) { | 1027 | for_each_sched_entity(se) { |
898 | cfs_rq = cfs_rq_of(se); | 1028 | cfs_rq = cfs_rq_of(se); |
899 | put_prev_entity(cfs_rq, se, now); | 1029 | put_prev_entity(cfs_rq, se); |
900 | } | 1030 | } |
901 | } | 1031 | } |
902 | 1032 | ||
@@ -939,6 +1069,7 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
939 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1069 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
940 | } | 1070 | } |
941 | 1071 | ||
1072 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
942 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1073 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) |
943 | { | 1074 | { |
944 | struct sched_entity *curr; | 1075 | struct sched_entity *curr; |
@@ -952,12 +1083,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | |||
952 | 1083 | ||
953 | return p->prio; | 1084 | return p->prio; |
954 | } | 1085 | } |
1086 | #endif | ||
955 | 1087 | ||
956 | static int | 1088 | static unsigned long |
957 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1089 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
958 | unsigned long max_nr_move, unsigned long max_load_move, | 1090 | unsigned long max_nr_move, unsigned long max_load_move, |
959 | struct sched_domain *sd, enum cpu_idle_type idle, | 1091 | struct sched_domain *sd, enum cpu_idle_type idle, |
960 | int *all_pinned, unsigned long *total_load_moved) | 1092 | int *all_pinned, int *this_best_prio) |
961 | { | 1093 | { |
962 | struct cfs_rq *busy_cfs_rq; | 1094 | struct cfs_rq *busy_cfs_rq; |
963 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | 1095 | unsigned long load_moved, total_nr_moved = 0, nr_moved; |
@@ -968,15 +1100,14 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
968 | cfs_rq_iterator.next = load_balance_next_fair; | 1100 | cfs_rq_iterator.next = load_balance_next_fair; |
969 | 1101 | ||
970 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1102 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
1103 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
971 | struct cfs_rq *this_cfs_rq; | 1104 | struct cfs_rq *this_cfs_rq; |
972 | long imbalance; | 1105 | long imbalance; |
973 | unsigned long maxload; | 1106 | unsigned long maxload; |
974 | int this_best_prio, best_prio, best_prio_seen = 0; | ||
975 | 1107 | ||
976 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1108 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); |
977 | 1109 | ||
978 | imbalance = busy_cfs_rq->load.weight - | 1110 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; |
979 | this_cfs_rq->load.weight; | ||
980 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1111 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ |
981 | if (imbalance <= 0) | 1112 | if (imbalance <= 0) |
982 | continue; | 1113 | continue; |
@@ -985,27 +1116,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
985 | imbalance /= 2; | 1116 | imbalance /= 2; |
986 | maxload = min(rem_load_move, imbalance); | 1117 | maxload = min(rem_load_move, imbalance); |
987 | 1118 | ||
988 | this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1119 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); |
989 | best_prio = cfs_rq_best_prio(busy_cfs_rq); | 1120 | #else |
990 | 1121 | # define maxload rem_load_move | |
991 | /* | 1122 | #endif |
992 | * Enable handling of the case where there is more than one task | ||
993 | * with the best priority. If the current running task is one | ||
994 | * of those with prio==best_prio we know it won't be moved | ||
995 | * and therefore it's safe to override the skip (based on load) | ||
996 | * of any task we find with that prio. | ||
997 | */ | ||
998 | if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) | ||
999 | best_prio_seen = 1; | ||
1000 | |||
1001 | /* pass busy_cfs_rq argument into | 1123 | /* pass busy_cfs_rq argument into |
1002 | * load_balance_[start|next]_fair iterators | 1124 | * load_balance_[start|next]_fair iterators |
1003 | */ | 1125 | */ |
1004 | cfs_rq_iterator.arg = busy_cfs_rq; | 1126 | cfs_rq_iterator.arg = busy_cfs_rq; |
1005 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, | 1127 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, |
1006 | max_nr_move, maxload, sd, idle, all_pinned, | 1128 | max_nr_move, maxload, sd, idle, all_pinned, |
1007 | &load_moved, this_best_prio, best_prio, | 1129 | &load_moved, this_best_prio, &cfs_rq_iterator); |
1008 | best_prio_seen, &cfs_rq_iterator); | ||
1009 | 1130 | ||
1010 | total_nr_moved += nr_moved; | 1131 | total_nr_moved += nr_moved; |
1011 | max_nr_move -= nr_moved; | 1132 | max_nr_move -= nr_moved; |
@@ -1015,9 +1136,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1015 | break; | 1136 | break; |
1016 | } | 1137 | } |
1017 | 1138 | ||
1018 | *total_load_moved = max_load_move - rem_load_move; | 1139 | return max_load_move - rem_load_move; |
1019 | |||
1020 | return total_nr_moved; | ||
1021 | } | 1140 | } |
1022 | 1141 | ||
1023 | /* | 1142 | /* |
@@ -1044,35 +1163,34 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1044 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1163 | static void task_new_fair(struct rq *rq, struct task_struct *p) |
1045 | { | 1164 | { |
1046 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1165 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1047 | struct sched_entity *se = &p->se; | 1166 | struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); |
1048 | u64 now = rq_clock(rq); | ||
1049 | 1167 | ||
1050 | sched_info_queued(p); | 1168 | sched_info_queued(p); |
1051 | 1169 | ||
1052 | update_stats_enqueue(cfs_rq, se, now); | 1170 | update_curr(cfs_rq); |
1171 | update_stats_enqueue(cfs_rq, se); | ||
1053 | /* | 1172 | /* |
1054 | * Child runs first: we let it run before the parent | 1173 | * Child runs first: we let it run before the parent |
1055 | * until it reschedules once. We set up the key so that | 1174 | * until it reschedules once. We set up the key so that |
1056 | * it will preempt the parent: | 1175 | * it will preempt the parent: |
1057 | */ | 1176 | */ |
1058 | p->se.fair_key = current->se.fair_key - | 1177 | se->fair_key = curr->fair_key - |
1059 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; | 1178 | niced_granularity(curr, sched_granularity(cfs_rq)) - 1; |
1060 | /* | 1179 | /* |
1061 | * The first wait is dominated by the child-runs-first logic, | 1180 | * The first wait is dominated by the child-runs-first logic, |
1062 | * so do not credit it with that waiting time yet: | 1181 | * so do not credit it with that waiting time yet: |
1063 | */ | 1182 | */ |
1064 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | 1183 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) |
1065 | p->se.wait_start_fair = 0; | 1184 | se->wait_start_fair = 0; |
1066 | 1185 | ||
1067 | /* | 1186 | /* |
1068 | * The statistical average of wait_runtime is about | 1187 | * The statistical average of wait_runtime is about |
1069 | * -granularity/2, so initialize the task with that: | 1188 | * -granularity/2, so initialize the task with that: |
1070 | */ | 1189 | */ |
1071 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1190 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) |
1072 | p->se.wait_runtime = -(sysctl_sched_granularity / 2); | 1191 | se->wait_runtime = -(sched_granularity(cfs_rq) / 2); |
1073 | 1192 | ||
1074 | __enqueue_entity(cfs_rq, se); | 1193 | __enqueue_entity(cfs_rq, se); |
1075 | inc_nr_running(p, rq, now); | ||
1076 | } | 1194 | } |
1077 | 1195 | ||
1078 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1196 | #ifdef CONFIG_FAIR_GROUP_SCHED |
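
task_new_fair() now keys the child just ahead of its parent (parent key minus one granularity minus one) and, with SCHED_FEAT_START_DEBIT, starts it half a granularity in debt. A small worked example of that arithmetic, with purely illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative values only: a parent key and a 10 ms granularity in ns. */
        long long parent_key = 1000000000LL;
        long long granularity = 10000000LL;

        /* Child is keyed just ahead of the parent so it runs first ... */
        long long child_key = parent_key - granularity - 1;

        /* ... and, with START_DEBIT, begins half a granularity "in debt",
         * matching the stated statistical average of wait_runtime. */
        long long child_wait_runtime = -(granularity / 2);

        printf("child key %lld, initial wait_runtime %lld\n",
               child_key, child_wait_runtime);
        return 0;
    }
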
@@ -1083,15 +1201,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1083 | */ | 1201 | */ |
1084 | static void set_curr_task_fair(struct rq *rq) | 1202 | static void set_curr_task_fair(struct rq *rq) |
1085 | { | 1203 | { |
1086 | struct task_struct *curr = rq->curr; | 1204 | struct sched_entity *se = &rq->curr->se; |
1087 | struct sched_entity *se = &curr->se; | ||
1088 | u64 now = rq_clock(rq); | ||
1089 | struct cfs_rq *cfs_rq; | ||
1090 | 1205 | ||
1091 | for_each_sched_entity(se) { | 1206 | for_each_sched_entity(se) |
1092 | cfs_rq = cfs_rq_of(se); | 1207 | set_next_entity(cfs_rq_of(se), se); |
1093 | set_next_entity(cfs_rq, se, now); | ||
1094 | } | ||
1095 | } | 1208 | } |
1096 | #else | 1209 | #else |
1097 | static void set_curr_task_fair(struct rq *rq) | 1210 | static void set_curr_task_fair(struct rq *rq) |
@@ -1120,12 +1233,11 @@ struct sched_class fair_sched_class __read_mostly = { | |||
1120 | }; | 1233 | }; |
1121 | 1234 | ||
1122 | #ifdef CONFIG_SCHED_DEBUG | 1235 | #ifdef CONFIG_SCHED_DEBUG |
1123 | void print_cfs_stats(struct seq_file *m, int cpu, u64 now) | 1236 | static void print_cfs_stats(struct seq_file *m, int cpu) |
1124 | { | 1237 | { |
1125 | struct rq *rq = cpu_rq(cpu); | ||
1126 | struct cfs_rq *cfs_rq; | 1238 | struct cfs_rq *cfs_rq; |
1127 | 1239 | ||
1128 | for_each_leaf_cfs_rq(rq, cfs_rq) | 1240 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1129 | print_cfs_rq(m, cpu, cfs_rq, now); | 1241 | print_cfs_rq(m, cpu, cfs_rq); |
1130 | } | 1242 | } |
1131 | #endif | 1243 | #endif |
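
A pattern running through the whole sched_fair.c diff: the explicit u64 now argument disappears and callees read rq->clock, which the caller refreshes via update_rq_clock()/__update_rq_clock() once per event. The sketch below models that shape in ordinary user-space C; the struct and function names are invented for illustration, and only the idea (refresh once, read everywhere) mirrors the patch.

    #include <stdio.h>
    #include <time.h>

    /* User-space stand-in for struct rq with an embedded clock. */
    struct runqueue {
        unsigned long long clock;   /* nanoseconds, refreshed by the caller */
    };

    static void update_clock(struct runqueue *rq)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        rq->clock = (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    /* After the refactor, the callee reads rq->clock instead of taking "u64 now". */
    static void account(struct runqueue *rq, unsigned long long *exec_start,
                        unsigned long long *sum)
    {
        unsigned long long delta = rq->clock - *exec_start;
        *sum += delta;
        *exec_start = rq->clock;
    }

    int main(void)
    {
        struct runqueue rq;
        unsigned long long start, sum = 0;

        update_clock(&rq);
        start = rq.clock;

        update_clock(&rq);          /* caller refreshes the clock once ...   */
        account(&rq, &start, &sum); /* ... every callee reuses the same read */

        printf("accumulated %llu ns\n", sum);
        return 0;
    }
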
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41841e741c4a..3503fb2d9f96 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -13,7 +13,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) | |||
13 | resched_task(rq->idle); | 13 | resched_task(rq->idle); |
14 | } | 14 | } |
15 | 15 | ||
16 | static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) | 16 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
17 | { | 17 | { |
18 | schedstat_inc(rq, sched_goidle); | 18 | schedstat_inc(rq, sched_goidle); |
19 | 19 | ||
@@ -25,7 +25,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) | |||
25 | * message if some code attempts to do it: | 25 | * message if some code attempts to do it: |
26 | */ | 26 | */ |
27 | static void | 27 | static void |
28 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) | 28 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) |
29 | { | 29 | { |
30 | spin_unlock_irq(&rq->lock); | 30 | spin_unlock_irq(&rq->lock); |
31 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 31 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); |
@@ -33,15 +33,15 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) | |||
33 | spin_lock_irq(&rq->lock); | 33 | spin_lock_irq(&rq->lock); |
34 | } | 34 | } |
35 | 35 | ||
36 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) | 36 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) |
37 | { | 37 | { |
38 | } | 38 | } |
39 | 39 | ||
40 | static int | 40 | static unsigned long |
41 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | 41 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, |
42 | unsigned long max_nr_move, unsigned long max_load_move, | 42 | unsigned long max_nr_move, unsigned long max_load_move, |
43 | struct sched_domain *sd, enum cpu_idle_type idle, | 43 | struct sched_domain *sd, enum cpu_idle_type idle, |
44 | int *all_pinned, unsigned long *total_load_moved) | 44 | int *all_pinned, int *this_best_prio) |
45 | { | 45 | { |
46 | return 0; | 46 | return 0; |
47 | } | 47 | } |
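
Here and in sched_fair.c/sched_rt.c the class ->load_balance() hook changes shape: it now returns the amount of load moved as an unsigned long and takes an int *this_best_prio cursor, instead of returning a task count and writing the moved load through a separate out-parameter. A toy model of the new calling convention, with made-up types standing in for struct rq and the scheduling-domain arguments:

    #include <stdio.h>

    struct toy_rq { unsigned long load; };

    /* Same overall shape as the reworked hook: return load moved, update the
     * best-priority cursor through a pointer. Names here are illustrative. */
    typedef unsigned long (*load_balance_fn)(struct toy_rq *this_rq,
                                             struct toy_rq *busiest,
                                             unsigned long max_load_move,
                                             int *this_best_prio);

    static unsigned long toy_balance(struct toy_rq *this_rq, struct toy_rq *busiest,
                                     unsigned long max_load_move, int *this_best_prio)
    {
        unsigned long moved = busiest->load / 2;

        if (moved > max_load_move)
            moved = max_load_move;
        busiest->load -= moved;
        this_rq->load += moved;
        *this_best_prio = 100;      /* pretend priority of what was pulled */
        return moved;               /* load moved, not a task count */
    }

    int main(void)
    {
        struct toy_rq a = { 0 }, b = { 1024 };
        int best_prio = 140;
        load_balance_fn fn = toy_balance;
        unsigned long moved = fn(&a, &b, 300, &best_prio);

        printf("moved %lu, new best prio %d\n", moved, best_prio);
        return 0;
    }
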
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1192a2741b99..4b87476a02d0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Update the current task's runtime statistics. Skip current tasks that | 7 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 8 | * are not in our scheduling class. |
9 | */ | 9 | */ |
10 | static inline void update_curr_rt(struct rq *rq, u64 now) | 10 | static inline void update_curr_rt(struct rq *rq) |
11 | { | 11 | { |
12 | struct task_struct *curr = rq->curr; | 12 | struct task_struct *curr = rq->curr; |
13 | u64 delta_exec; | 13 | u64 delta_exec; |
@@ -15,18 +15,17 @@ static inline void update_curr_rt(struct rq *rq, u64 now) | |||
15 | if (!task_has_rt_policy(curr)) | 15 | if (!task_has_rt_policy(curr)) |
16 | return; | 16 | return; |
17 | 17 | ||
18 | delta_exec = now - curr->se.exec_start; | 18 | delta_exec = rq->clock - curr->se.exec_start; |
19 | if (unlikely((s64)delta_exec < 0)) | 19 | if (unlikely((s64)delta_exec < 0)) |
20 | delta_exec = 0; | 20 | delta_exec = 0; |
21 | if (unlikely(delta_exec > curr->se.exec_max)) | 21 | |
22 | curr->se.exec_max = delta_exec; | 22 | schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); |
23 | 23 | ||
24 | curr->se.sum_exec_runtime += delta_exec; | 24 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = now; | 25 | curr->se.exec_start = rq->clock; |
26 | } | 26 | } |
27 | 27 | ||
28 | static void | 28 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) |
29 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | ||
30 | { | 29 | { |
31 | struct rt_prio_array *array = &rq->rt.active; | 30 | struct rt_prio_array *array = &rq->rt.active; |
32 | 31 | ||
@@ -37,12 +36,11 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | |||
37 | /* | 36 | /* |
38 | * Adding/removing a task to/from a priority array: | 37 | * Adding/removing a task to/from a priority array: |
39 | */ | 38 | */ |
40 | static void | 39 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
41 | dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) | ||
42 | { | 40 | { |
43 | struct rt_prio_array *array = &rq->rt.active; | 41 | struct rt_prio_array *array = &rq->rt.active; |
44 | 42 | ||
45 | update_curr_rt(rq, now); | 43 | update_curr_rt(rq); |
46 | 44 | ||
47 | list_del(&p->run_list); | 45 | list_del(&p->run_list); |
48 | if (list_empty(array->queue + p->prio)) | 46 | if (list_empty(array->queue + p->prio)) |
@@ -75,7 +73,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
75 | resched_task(rq->curr); | 73 | resched_task(rq->curr); |
76 | } | 74 | } |
77 | 75 | ||
78 | static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) | 76 | static struct task_struct *pick_next_task_rt(struct rq *rq) |
79 | { | 77 | { |
80 | struct rt_prio_array *array = &rq->rt.active; | 78 | struct rt_prio_array *array = &rq->rt.active; |
81 | struct task_struct *next; | 79 | struct task_struct *next; |
@@ -89,14 +87,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) | |||
89 | queue = array->queue + idx; | 87 | queue = array->queue + idx; |
90 | next = list_entry(queue->next, struct task_struct, run_list); | 88 | next = list_entry(queue->next, struct task_struct, run_list); |
91 | 89 | ||
92 | next->se.exec_start = now; | 90 | next->se.exec_start = rq->clock; |
93 | 91 | ||
94 | return next; | 92 | return next; |
95 | } | 93 | } |
96 | 94 | ||
97 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) | 95 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
98 | { | 96 | { |
99 | update_curr_rt(rq, now); | 97 | update_curr_rt(rq); |
100 | p->se.exec_start = 0; | 98 | p->se.exec_start = 0; |
101 | } | 99 | } |
102 | 100 | ||
@@ -172,28 +170,15 @@ static struct task_struct *load_balance_next_rt(void *arg) | |||
172 | return p; | 170 | return p; |
173 | } | 171 | } |
174 | 172 | ||
175 | static int | 173 | static unsigned long |
176 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 174 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
177 | unsigned long max_nr_move, unsigned long max_load_move, | 175 | unsigned long max_nr_move, unsigned long max_load_move, |
178 | struct sched_domain *sd, enum cpu_idle_type idle, | 176 | struct sched_domain *sd, enum cpu_idle_type idle, |
179 | int *all_pinned, unsigned long *load_moved) | 177 | int *all_pinned, int *this_best_prio) |
180 | { | 178 | { |
181 | int this_best_prio, best_prio, best_prio_seen = 0; | ||
182 | int nr_moved; | 179 | int nr_moved; |
183 | struct rq_iterator rt_rq_iterator; | 180 | struct rq_iterator rt_rq_iterator; |
184 | 181 | unsigned long load_moved; | |
185 | best_prio = sched_find_first_bit(busiest->rt.active.bitmap); | ||
186 | this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); | ||
187 | |||
188 | /* | ||
189 | * Enable handling of the case where there is more than one task | ||
190 | * with the best priority. If the current running task is one | ||
191 | * of those with prio==best_prio we know it won't be moved | ||
192 | * and therefore it's safe to override the skip (based on load) | ||
193 | * of any task we find with that prio. | ||
194 | */ | ||
195 | if (busiest->curr->prio == best_prio) | ||
196 | best_prio_seen = 1; | ||
197 | 182 | ||
198 | rt_rq_iterator.start = load_balance_start_rt; | 183 | rt_rq_iterator.start = load_balance_start_rt; |
199 | rt_rq_iterator.next = load_balance_next_rt; | 184 | rt_rq_iterator.next = load_balance_next_rt; |
@@ -203,11 +188,10 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
203 | rt_rq_iterator.arg = busiest; | 188 | rt_rq_iterator.arg = busiest; |
204 | 189 | ||
205 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, | 190 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, |
206 | max_load_move, sd, idle, all_pinned, load_moved, | 191 | max_load_move, sd, idle, all_pinned, &load_moved, |
207 | this_best_prio, best_prio, best_prio_seen, | 192 | this_best_prio, &rt_rq_iterator); |
208 | &rt_rq_iterator); | ||
209 | 193 | ||
210 | return nr_moved; | 194 | return load_moved; |
211 | } | 195 | } |
212 | 196 | ||
213 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 197 | static void task_tick_rt(struct rq *rq, struct task_struct *p) |
@@ -223,19 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
223 | return; | 207 | return; |
224 | 208 | ||
225 | p->time_slice = static_prio_timeslice(p->static_prio); | 209 | p->time_slice = static_prio_timeslice(p->static_prio); |
226 | set_tsk_need_resched(p); | ||
227 | |||
228 | /* put it at the end of the queue: */ | ||
229 | requeue_task_rt(rq, p); | ||
230 | } | ||
231 | 210 | ||
232 | /* | 211 | /* |
233 | * No parent/child timeslice management necessary for RT tasks, | 212 | * Requeue to the end of queue if we are not the only element |
234 | * just activate them: | 213 | * on the queue: |
235 | */ | 214 | */ |
236 | static void task_new_rt(struct rq *rq, struct task_struct *p) | 215 | if (p->run_list.prev != p->run_list.next) { |
237 | { | 216 | requeue_task_rt(rq, p); |
238 | activate_task(rq, p, 1); | 217 | set_tsk_need_resched(p); |
218 | } | ||
239 | } | 219 | } |
240 | 220 | ||
241 | static struct sched_class rt_sched_class __read_mostly = { | 221 | static struct sched_class rt_sched_class __read_mostly = { |
@@ -251,5 +231,4 @@ static struct sched_class rt_sched_class __read_mostly = { | |||
251 | .load_balance = load_balance_rt, | 231 | .load_balance = load_balance_rt, |
252 | 232 | ||
253 | .task_tick = task_tick_rt, | 233 | .task_tick = task_tick_rt, |
254 | .task_new = task_new_rt, | ||
255 | }; | 234 | }; |
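
The reworked task_tick_rt() only requeues and reschedules when the task is not alone on its priority list, using p->run_list.prev != p->run_list.next: on a circular list_head, prev equals next exactly when the element is the only one queued. A minimal user-space model of that test, assuming nothing beyond a hand-rolled circular list:

    #include <stdio.h>

    /* Minimal circular doubly-linked list, in the style of the kernel's list_head. */
    struct list_head { struct list_head *prev, *next; };

    static void list_init(struct list_head *h) { h->prev = h->next = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
    }

    /* For an element linked into a list, prev == next exactly when it is the
     * only element (both point at the list head) - the RT tick's cheap test. */
    static int alone_on_queue(struct list_head *elem) { return elem->prev == elem->next; }

    int main(void)
    {
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        printf("a alone? %d\n", alone_on_queue(&a));    /* 1 */

        list_add_tail(&b, &queue);
        printf("a alone? %d\n", alone_on_queue(&a));    /* 0 */
        return 0;
    }
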
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c63c38f6fa6e..c20a94dda61e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
116 | } | 116 | } |
117 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 117 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
118 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 118 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
119 | # define schedstat_set(var, val) do { var = (val); } while (0) | ||
119 | #else /* !CONFIG_SCHEDSTATS */ | 120 | #else /* !CONFIG_SCHEDSTATS */ |
120 | static inline void | 121 | static inline void |
121 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 122 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
@@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
125 | {} | 126 | {} |
126 | # define schedstat_inc(rq, field) do { } while (0) | 127 | # define schedstat_inc(rq, field) do { } while (0) |
127 | # define schedstat_add(rq, field, amt) do { } while (0) | 128 | # define schedstat_add(rq, field, amt) do { } while (0) |
129 | # define schedstat_set(var, val) do { } while (0) | ||
128 | #endif | 130 | #endif |
129 | 131 | ||
130 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 132 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
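
The new schedstat_set() compiles to an assignment when CONFIG_SCHEDSTATS is enabled and to an empty statement otherwise, which is what lets update_curr_rt() above replace its open-coded exec_max comparison with a single line. A user-space sketch of the same compile-out pattern (the macro names below are illustrative, not the kernel's):

    #include <stdio.h>

    /* Same shape as schedstat_set(): an assignment when stats are compiled in,
     * an empty statement otherwise. Build with -DTOY_STATS to enable it. */
    #ifdef TOY_STATS
    # define stat_set(var, val)     do { (var) = (val); } while (0)
    #else
    # define stat_set(var, val)     do { } while (0)
    #endif

    #define max(a, b)   ((a) > (b) ? (a) : (b))

    int main(void)
    {
        unsigned long long exec_max = 0, delta_exec = 1234;

        /* Mirrors: schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); */
        stat_set(exec_max, max(exec_max, delta_exec));

        printf("exec_max = %llu\n", exec_max);  /* 1234 with -DTOY_STATS, 0 without */
        return 0;
    }
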
diff --git a/kernel/signal.c b/kernel/signal.c index 39d122753bac..792952381092 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default) | |||
255 | } | 255 | } |
256 | } | 256 | } |
257 | 257 | ||
258 | int unhandled_signal(struct task_struct *tsk, int sig) | ||
259 | { | ||
260 | if (is_init(tsk)) | ||
261 | return 1; | ||
262 | if (tsk->ptrace & PT_PTRACED) | ||
263 | return 0; | ||
264 | return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || | ||
265 | (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); | ||
266 | } | ||
267 | |||
258 | 268 | ||
259 | /* Notify the system that a driver wants to block all signals for this | 269 | /* Notify the system that a driver wants to block all signals for this |
260 | * process, and wants to be notified if any signals at all were to be | 270 | * process, and wants to be notified if any signals at all were to be |
@@ -368,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
368 | /* We only dequeue private signals from ourselves, we don't let | 378 | /* We only dequeue private signals from ourselves, we don't let |
369 | * signalfd steal them | 379 | * signalfd steal them |
370 | */ | 380 | */ |
371 | if (tsk == current) | 381 | signr = __dequeue_signal(&tsk->pending, mask, info); |
372 | signr = __dequeue_signal(&tsk->pending, mask, info); | ||
373 | if (!signr) { | 382 | if (!signr) { |
374 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 383 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
375 | mask, info); | 384 | mask, info); |
@@ -397,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
397 | } | 406 | } |
398 | } | 407 | } |
399 | } | 408 | } |
400 | if (likely(tsk == current)) | 409 | recalc_sigpending(); |
401 | recalc_sigpending(); | ||
402 | if (signr && unlikely(sig_kernel_stop(signr))) { | 410 | if (signr && unlikely(sig_kernel_stop(signr))) { |
403 | /* | 411 | /* |
404 | * Set a marker that we have dequeued a stop signal. Our | 412 | * Set a marker that we have dequeued a stop signal. Our |
@@ -415,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
415 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) | 423 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
416 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 424 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; |
417 | } | 425 | } |
418 | if ( signr && | 426 | if (signr && |
419 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 427 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
420 | info->si_sys_private){ | 428 | info->si_sys_private){ |
421 | /* | 429 | /* |
@@ -523,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
523 | if (!valid_signal(sig)) | 531 | if (!valid_signal(sig)) |
524 | return error; | 532 | return error; |
525 | 533 | ||
526 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ | 534 | if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { |
527 | if (error) | 535 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ |
528 | return error; | 536 | if (error) |
529 | 537 | return error; | |
530 | error = -EPERM; | 538 | error = -EPERM; |
531 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 539 | if (((sig != SIGCONT) || |
532 | && ((sig != SIGCONT) || | 540 | (process_session(current) != process_session(t))) |
533 | (process_session(current) != process_session(t))) | 541 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
534 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 542 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
535 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 543 | && !capable(CAP_KILL)) |
536 | && !capable(CAP_KILL)) | ||
537 | return error; | 544 | return error; |
545 | } | ||
538 | 546 | ||
539 | return security_task_kill(t, info, sig, 0); | 547 | return security_task_kill(t, info, sig, 0); |
540 | } | 548 | } |
@@ -1290,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void) | |||
1290 | void sigqueue_free(struct sigqueue *q) | 1298 | void sigqueue_free(struct sigqueue *q) |
1291 | { | 1299 | { |
1292 | unsigned long flags; | 1300 | unsigned long flags; |
1301 | spinlock_t *lock = ¤t->sighand->siglock; | ||
1302 | |||
1293 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1303 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1294 | /* | 1304 | /* |
1295 | * If the signal is still pending remove it from the | 1305 | * If the signal is still pending remove it from the |
1296 | * pending queue. | 1306 | * pending queue. We must hold ->siglock while testing |
1307 | * q->list to serialize with collect_signal(). | ||
1297 | */ | 1308 | */ |
1298 | if (unlikely(!list_empty(&q->list))) { | 1309 | spin_lock_irqsave(lock, flags); |
1299 | spinlock_t *lock = ¤t->sighand->siglock; | 1310 | if (!list_empty(&q->list)) |
1300 | read_lock(&tasklist_lock); | 1311 | list_del_init(&q->list); |
1301 | spin_lock_irqsave(lock, flags); | 1312 | spin_unlock_irqrestore(lock, flags); |
1302 | if (!list_empty(&q->list)) | 1313 | |
1303 | list_del_init(&q->list); | ||
1304 | spin_unlock_irqrestore(lock, flags); | ||
1305 | read_unlock(&tasklist_lock); | ||
1306 | } | ||
1307 | q->flags &= ~SIGQUEUE_PREALLOC; | 1314 | q->flags &= ~SIGQUEUE_PREALLOC; |
1308 | __sigqueue_free(q); | 1315 | __sigqueue_free(q); |
1309 | } | 1316 | } |
@@ -1551,10 +1558,6 @@ static inline int may_ptrace_stop(void) | |||
1551 | (current->ptrace & PT_ATTACHED))) | 1558 | (current->ptrace & PT_ATTACHED))) |
1552 | return 0; | 1559 | return 0; |
1553 | 1560 | ||
1554 | if (unlikely(current->signal == current->parent->signal) && | ||
1555 | unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) | ||
1556 | return 0; | ||
1557 | |||
1558 | /* | 1561 | /* |
1559 | * Are we in the middle of do_coredump? | 1562 | * Are we in the middle of do_coredump? |
1560 | * If so and our tracer is also part of the coredump stopping | 1563 | * If so and our tracer is also part of the coredump stopping |
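
sigqueue_free() now unconditionally takes current->sighand->siglock, tests q->list under that lock, and unlinks if needed, dropping the old unlocked list_empty() pre-check and the tasklist_lock read lock; the comment added in the hunk says holding ->siglock is what serializes against collect_signal(). A rough pthread analogue of the "always lock, then test and unlink" shape - purely a model, not kernel code:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int queued = 1;          /* stands in for !list_empty(&q->list) */

    /* No unlocked pre-check: take the lock first, then test and unlink,
     * because another thread may be unlinking the entry concurrently. */
    static void toy_sigqueue_free(void)
    {
        pthread_mutex_lock(&lock);
        if (queued)
            queued = 0;             /* list_del_init() equivalent */
        pthread_mutex_unlock(&lock);
    }

    static void *collector(void *arg)
    {
        pthread_mutex_lock(&lock);
        queued = 0;                 /* a collect_signal()-like racer */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, collector, NULL);
        toy_sigqueue_free();
        pthread_join(t, NULL);
        printf("queued = %d\n", queued);
        return 0;
    }
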
diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f546ddea43d..bd89bc4eb0b9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void) | |||
271 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
272 | } | 272 | } |
273 | 273 | ||
274 | EXPORT_SYMBOL(do_softirq); | ||
275 | |||
276 | #endif | 274 | #endif |
277 | 275 | ||
278 | /* | 276 | /* |
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) | |||
332 | wakeup_softirqd(); | 330 | wakeup_softirqd(); |
333 | } | 331 | } |
334 | 332 | ||
335 | EXPORT_SYMBOL(raise_softirq_irqoff); | ||
336 | |||
337 | void fastcall raise_softirq(unsigned int nr) | 333 | void fastcall raise_softirq(unsigned int nr) |
338 | { | 334 | { |
339 | unsigned long flags; | 335 | unsigned long flags; |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c6c2bf85514..cd72424c2662 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock) | |||
72 | { | 72 | { |
73 | preempt_disable(); | 73 | preempt_disable(); |
74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
75 | _raw_read_lock(lock); | 75 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
76 | } | 76 | } |
77 | EXPORT_SYMBOL(_read_lock); | 77 | EXPORT_SYMBOL(_read_lock); |
78 | 78 | ||
@@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
88 | * _raw_spin_lock_flags() code, because lockdep assumes | 88 | * _raw_spin_lock_flags() code, because lockdep assumes |
89 | * that interrupts are not re-enabled during lock-acquire: | 89 | * that interrupts are not re-enabled during lock-acquire: |
90 | */ | 90 | */ |
91 | #ifdef CONFIG_PROVE_LOCKING | 91 | #ifdef CONFIG_LOCKDEP |
92 | _raw_spin_lock(lock); | 92 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
93 | #else | 93 | #else |
94 | _raw_spin_lock_flags(lock, &flags); | 94 | _raw_spin_lock_flags(lock, &flags); |
95 | #endif | 95 | #endif |
@@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) | |||
102 | local_irq_disable(); | 102 | local_irq_disable(); |
103 | preempt_disable(); | 103 | preempt_disable(); |
104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
105 | _raw_spin_lock(lock); | 105 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
106 | } | 106 | } |
107 | EXPORT_SYMBOL(_spin_lock_irq); | 107 | EXPORT_SYMBOL(_spin_lock_irq); |
108 | 108 | ||
@@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) | |||
111 | local_bh_disable(); | 111 | local_bh_disable(); |
112 | preempt_disable(); | 112 | preempt_disable(); |
113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
114 | _raw_spin_lock(lock); | 114 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
115 | } | 115 | } |
116 | EXPORT_SYMBOL(_spin_lock_bh); | 116 | EXPORT_SYMBOL(_spin_lock_bh); |
117 | 117 | ||
@@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | |||
122 | local_irq_save(flags); | 122 | local_irq_save(flags); |
123 | preempt_disable(); | 123 | preempt_disable(); |
124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
125 | _raw_read_lock(lock); | 125 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
126 | return flags; | 126 | return flags; |
127 | } | 127 | } |
128 | EXPORT_SYMBOL(_read_lock_irqsave); | 128 | EXPORT_SYMBOL(_read_lock_irqsave); |
@@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) | |||
132 | local_irq_disable(); | 132 | local_irq_disable(); |
133 | preempt_disable(); | 133 | preempt_disable(); |
134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
135 | _raw_read_lock(lock); | 135 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
136 | } | 136 | } |
137 | EXPORT_SYMBOL(_read_lock_irq); | 137 | EXPORT_SYMBOL(_read_lock_irq); |
138 | 138 | ||
@@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) | |||
141 | local_bh_disable(); | 141 | local_bh_disable(); |
142 | preempt_disable(); | 142 | preempt_disable(); |
143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
144 | _raw_read_lock(lock); | 144 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
145 | } | 145 | } |
146 | EXPORT_SYMBOL(_read_lock_bh); | 146 | EXPORT_SYMBOL(_read_lock_bh); |
147 | 147 | ||
@@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | |||
152 | local_irq_save(flags); | 152 | local_irq_save(flags); |
153 | preempt_disable(); | 153 | preempt_disable(); |
154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
155 | _raw_write_lock(lock); | 155 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
156 | return flags; | 156 | return flags; |
157 | } | 157 | } |
158 | EXPORT_SYMBOL(_write_lock_irqsave); | 158 | EXPORT_SYMBOL(_write_lock_irqsave); |
@@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) | |||
162 | local_irq_disable(); | 162 | local_irq_disable(); |
163 | preempt_disable(); | 163 | preempt_disable(); |
164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
165 | _raw_write_lock(lock); | 165 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
166 | } | 166 | } |
167 | EXPORT_SYMBOL(_write_lock_irq); | 167 | EXPORT_SYMBOL(_write_lock_irq); |
168 | 168 | ||
@@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) | |||
171 | local_bh_disable(); | 171 | local_bh_disable(); |
172 | preempt_disable(); | 172 | preempt_disable(); |
173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
174 | _raw_write_lock(lock); | 174 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
175 | } | 175 | } |
176 | EXPORT_SYMBOL(_write_lock_bh); | 176 | EXPORT_SYMBOL(_write_lock_bh); |
177 | 177 | ||
@@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock) | |||
179 | { | 179 | { |
180 | preempt_disable(); | 180 | preempt_disable(); |
181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
182 | _raw_spin_lock(lock); | 182 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
183 | } | 183 | } |
184 | 184 | ||
185 | EXPORT_SYMBOL(_spin_lock); | 185 | EXPORT_SYMBOL(_spin_lock); |
@@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock) | |||
188 | { | 188 | { |
189 | preempt_disable(); | 189 | preempt_disable(); |
190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
191 | _raw_write_lock(lock); | 191 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
192 | } | 192 | } |
193 | 193 | ||
194 | EXPORT_SYMBOL(_write_lock); | 194 | EXPORT_SYMBOL(_write_lock); |
@@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | |||
289 | { | 289 | { |
290 | preempt_disable(); | 290 | preempt_disable(); |
291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | 291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); |
292 | _raw_spin_lock(lock); | 292 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
293 | } | 293 | } |
294 | 294 | ||
295 | EXPORT_SYMBOL(_spin_lock_nested); | 295 | EXPORT_SYMBOL(_spin_lock_nested); |
@@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas | |||
305 | * _raw_spin_lock_flags() code, because lockdep assumes | 305 | * _raw_spin_lock_flags() code, because lockdep assumes |
306 | * that interrupts are not re-enabled during lock-acquire: | 306 | * that interrupts are not re-enabled during lock-acquire: |
307 | */ | 307 | */ |
308 | #ifdef CONFIG_PROVE_SPIN_LOCKING | 308 | #ifdef CONFIG_LOCKDEP |
309 | _raw_spin_lock(lock); | 309 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
310 | #else | 310 | #else |
311 | _raw_spin_lock_flags(lock, &flags); | 311 | _raw_spin_lock_flags(lock, &flags); |
312 | #endif | 312 | #endif |
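
Every blocking acquire in spinlock.c is now wrapped as LOCK_CONTENDED(lock, trylock, lock), so a failed trylock can be reported as contention before falling back to the blocking path when lock statistics are enabled. The real macro lives in the lockdep headers; below is only a user-space approximation of the try-first-then-count-then-block shape, using pthread mutexes (note that pthread trylock returns 0 on success, the opposite sense of the kernel helpers):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long contended_events;

    /* Model of the LOCK_CONTENDED() idea: try the lock, and only if that fails
     * record a contention event before blocking. The kernel macro feeds
     * lockdep/lockstat rather than a plain counter. */
    #define TOY_LOCK_CONTENDED(lockp, trylock, lock)    \
    do {                                                \
        if (trylock(lockp) != 0) {                      \
            contended_events++;                         \
            lock(lockp);                                \
        }                                               \
    } while (0)

    int main(void)
    {
        TOY_LOCK_CONTENDED(&m, pthread_mutex_trylock, pthread_mutex_lock);
        printf("contended %lu time(s)\n", contended_events);   /* 0: uncontended */
        pthread_mutex_unlock(&m);
        return 0;
    }
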
diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e802..8ae2e636eb1b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/getcpu.h> | 32 | #include <linux/getcpu.h> |
33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
34 | #include <linux/seccomp.h> | 34 | #include <linux/seccomp.h> |
35 | #include <linux/cpu.h> | ||
35 | 36 | ||
36 | #include <linux/compat.h> | 37 | #include <linux/compat.h> |
37 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
@@ -100,6 +101,13 @@ struct pid *cad_pid; | |||
100 | EXPORT_SYMBOL(cad_pid); | 101 | EXPORT_SYMBOL(cad_pid); |
101 | 102 | ||
102 | /* | 103 | /* |
104 | * If set, this is used for preparing the system to power off. | ||
105 | */ | ||
106 | |||
107 | void (*pm_power_off_prepare)(void); | ||
108 | EXPORT_SYMBOL(pm_power_off_prepare); | ||
109 | |||
110 | /* | ||
103 | * Notifier list for kernel code which wants to be called | 111 | * Notifier list for kernel code which wants to be called |
104 | * at shutdown. This is used to stop any idling DMA operations | 112 | * at shutdown. This is used to stop any idling DMA operations |
105 | * and the like. | 113 | * and the like. |
@@ -797,6 +805,7 @@ static void kernel_restart_prepare(char *cmd) | |||
797 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 805 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
798 | system_state = SYSTEM_RESTART; | 806 | system_state = SYSTEM_RESTART; |
799 | device_shutdown(); | 807 | device_shutdown(); |
808 | sysdev_shutdown(); | ||
800 | } | 809 | } |
801 | 810 | ||
802 | /** | 811 | /** |
@@ -853,6 +862,7 @@ void kernel_shutdown_prepare(enum system_states state) | |||
853 | void kernel_halt(void) | 862 | void kernel_halt(void) |
854 | { | 863 | { |
855 | kernel_shutdown_prepare(SYSTEM_HALT); | 864 | kernel_shutdown_prepare(SYSTEM_HALT); |
865 | sysdev_shutdown(); | ||
856 | printk(KERN_EMERG "System halted.\n"); | 866 | printk(KERN_EMERG "System halted.\n"); |
857 | machine_halt(); | 867 | machine_halt(); |
858 | } | 868 | } |
@@ -867,6 +877,10 @@ EXPORT_SYMBOL_GPL(kernel_halt); | |||
867 | void kernel_power_off(void) | 877 | void kernel_power_off(void) |
868 | { | 878 | { |
869 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | 879 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); |
880 | if (pm_power_off_prepare) | ||
881 | pm_power_off_prepare(); | ||
882 | disable_nonboot_cpus(); | ||
883 | sysdev_shutdown(); | ||
870 | printk(KERN_EMERG "Power down.\n"); | 884 | printk(KERN_EMERG "Power down.\n"); |
871 | machine_power_off(); | 885 | machine_power_off(); |
872 | } | 886 | } |
@@ -942,7 +956,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
942 | unlock_kernel(); | 956 | unlock_kernel(); |
943 | return -EINVAL; | 957 | return -EINVAL; |
944 | 958 | ||
945 | #ifdef CONFIG_SOFTWARE_SUSPEND | 959 | #ifdef CONFIG_HIBERNATION |
946 | case LINUX_REBOOT_CMD_SW_SUSPEND: | 960 | case LINUX_REBOOT_CMD_SW_SUSPEND: |
947 | { | 961 | { |
948 | int ret = hibernate(); | 962 | int ret = hibernate(); |
@@ -1027,7 +1041,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
1027 | return -EPERM; | 1041 | return -EPERM; |
1028 | } | 1042 | } |
1029 | if (new_egid != old_egid) { | 1043 | if (new_egid != old_egid) { |
1030 | current->mm->dumpable = suid_dumpable; | 1044 | set_dumpable(current->mm, suid_dumpable); |
1031 | smp_wmb(); | 1045 | smp_wmb(); |
1032 | } | 1046 | } |
1033 | if (rgid != (gid_t) -1 || | 1047 | if (rgid != (gid_t) -1 || |
@@ -1057,13 +1071,13 @@ asmlinkage long sys_setgid(gid_t gid) | |||
1057 | 1071 | ||
1058 | if (capable(CAP_SETGID)) { | 1072 | if (capable(CAP_SETGID)) { |
1059 | if (old_egid != gid) { | 1073 | if (old_egid != gid) { |
1060 | current->mm->dumpable = suid_dumpable; | 1074 | set_dumpable(current->mm, suid_dumpable); |
1061 | smp_wmb(); | 1075 | smp_wmb(); |
1062 | } | 1076 | } |
1063 | current->gid = current->egid = current->sgid = current->fsgid = gid; | 1077 | current->gid = current->egid = current->sgid = current->fsgid = gid; |
1064 | } else if ((gid == current->gid) || (gid == current->sgid)) { | 1078 | } else if ((gid == current->gid) || (gid == current->sgid)) { |
1065 | if (old_egid != gid) { | 1079 | if (old_egid != gid) { |
1066 | current->mm->dumpable = suid_dumpable; | 1080 | set_dumpable(current->mm, suid_dumpable); |
1067 | smp_wmb(); | 1081 | smp_wmb(); |
1068 | } | 1082 | } |
1069 | current->egid = current->fsgid = gid; | 1083 | current->egid = current->fsgid = gid; |
@@ -1094,7 +1108,7 @@ static int set_user(uid_t new_ruid, int dumpclear) | |||
1094 | switch_uid(new_user); | 1108 | switch_uid(new_user); |
1095 | 1109 | ||
1096 | if (dumpclear) { | 1110 | if (dumpclear) { |
1097 | current->mm->dumpable = suid_dumpable; | 1111 | set_dumpable(current->mm, suid_dumpable); |
1098 | smp_wmb(); | 1112 | smp_wmb(); |
1099 | } | 1113 | } |
1100 | current->uid = new_ruid; | 1114 | current->uid = new_ruid; |
@@ -1150,7 +1164,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
1150 | return -EAGAIN; | 1164 | return -EAGAIN; |
1151 | 1165 | ||
1152 | if (new_euid != old_euid) { | 1166 | if (new_euid != old_euid) { |
1153 | current->mm->dumpable = suid_dumpable; | 1167 | set_dumpable(current->mm, suid_dumpable); |
1154 | smp_wmb(); | 1168 | smp_wmb(); |
1155 | } | 1169 | } |
1156 | current->fsuid = current->euid = new_euid; | 1170 | current->fsuid = current->euid = new_euid; |
@@ -1200,7 +1214,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
1200 | return -EPERM; | 1214 | return -EPERM; |
1201 | 1215 | ||
1202 | if (old_euid != uid) { | 1216 | if (old_euid != uid) { |
1203 | current->mm->dumpable = suid_dumpable; | 1217 | set_dumpable(current->mm, suid_dumpable); |
1204 | smp_wmb(); | 1218 | smp_wmb(); |
1205 | } | 1219 | } |
1206 | current->fsuid = current->euid = uid; | 1220 | current->fsuid = current->euid = uid; |
@@ -1245,7 +1259,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
1245 | } | 1259 | } |
1246 | if (euid != (uid_t) -1) { | 1260 | if (euid != (uid_t) -1) { |
1247 | if (euid != current->euid) { | 1261 | if (euid != current->euid) { |
1248 | current->mm->dumpable = suid_dumpable; | 1262 | set_dumpable(current->mm, suid_dumpable); |
1249 | smp_wmb(); | 1263 | smp_wmb(); |
1250 | } | 1264 | } |
1251 | current->euid = euid; | 1265 | current->euid = euid; |
@@ -1295,7 +1309,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
1295 | } | 1309 | } |
1296 | if (egid != (gid_t) -1) { | 1310 | if (egid != (gid_t) -1) { |
1297 | if (egid != current->egid) { | 1311 | if (egid != current->egid) { |
1298 | current->mm->dumpable = suid_dumpable; | 1312 | set_dumpable(current->mm, suid_dumpable); |
1299 | smp_wmb(); | 1313 | smp_wmb(); |
1300 | } | 1314 | } |
1301 | current->egid = egid; | 1315 | current->egid = egid; |
@@ -1341,7 +1355,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
1341 | uid == current->suid || uid == current->fsuid || | 1355 | uid == current->suid || uid == current->fsuid || |
1342 | capable(CAP_SETUID)) { | 1356 | capable(CAP_SETUID)) { |
1343 | if (uid != old_fsuid) { | 1357 | if (uid != old_fsuid) { |
1344 | current->mm->dumpable = suid_dumpable; | 1358 | set_dumpable(current->mm, suid_dumpable); |
1345 | smp_wmb(); | 1359 | smp_wmb(); |
1346 | } | 1360 | } |
1347 | current->fsuid = uid; | 1361 | current->fsuid = uid; |
@@ -1370,7 +1384,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
1370 | gid == current->sgid || gid == current->fsgid || | 1384 | gid == current->sgid || gid == current->fsgid || |
1371 | capable(CAP_SETGID)) { | 1385 | capable(CAP_SETGID)) { |
1372 | if (gid != old_fsgid) { | 1386 | if (gid != old_fsgid) { |
1373 | current->mm->dumpable = suid_dumpable; | 1387 | set_dumpable(current->mm, suid_dumpable); |
1374 | smp_wmb(); | 1388 | smp_wmb(); |
1375 | } | 1389 | } |
1376 | current->fsgid = gid; | 1390 | current->fsgid = gid; |
@@ -1430,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
1430 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. | 1444 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. |
1431 | * LBT 04.03.94 | 1445 | * LBT 04.03.94 |
1432 | */ | 1446 | */ |
1433 | |||
1434 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | 1447 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) |
1435 | { | 1448 | { |
1436 | struct task_struct *p; | 1449 | struct task_struct *p; |
@@ -1458,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1458 | if (!thread_group_leader(p)) | 1471 | if (!thread_group_leader(p)) |
1459 | goto out; | 1472 | goto out; |
1460 | 1473 | ||
1461 | if (p->real_parent == group_leader) { | 1474 | if (p->real_parent->tgid == group_leader->tgid) { |
1462 | err = -EPERM; | 1475 | err = -EPERM; |
1463 | if (task_session(p) != task_session(group_leader)) | 1476 | if (task_session(p) != task_session(group_leader)) |
1464 | goto out; | 1477 | goto out; |
@@ -2167,14 +2180,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
2167 | error = put_user(current->pdeath_signal, (int __user *)arg2); | 2180 | error = put_user(current->pdeath_signal, (int __user *)arg2); |
2168 | break; | 2181 | break; |
2169 | case PR_GET_DUMPABLE: | 2182 | case PR_GET_DUMPABLE: |
2170 | error = current->mm->dumpable; | 2183 | error = get_dumpable(current->mm); |
2171 | break; | 2184 | break; |
2172 | case PR_SET_DUMPABLE: | 2185 | case PR_SET_DUMPABLE: |
2173 | if (arg2 < 0 || arg2 > 1) { | 2186 | if (arg2 < 0 || arg2 > 1) { |
2174 | error = -EINVAL; | 2187 | error = -EINVAL; |
2175 | break; | 2188 | break; |
2176 | } | 2189 | } |
2177 | current->mm->dumpable = arg2; | 2190 | set_dumpable(current->mm, arg2); |
2178 | break; | 2191 | break; |
2179 | 2192 | ||
2180 | case PR_SET_UNALIGN: | 2193 | case PR_SET_UNALIGN: |
@@ -2286,3 +2299,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | |||
2286 | } | 2299 | } |
2287 | return err ? -EFAULT : 0; | 2300 | return err ? -EFAULT : 0; |
2288 | } | 2301 | } |
2302 | |||
2303 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | ||
2304 | |||
2305 | static void argv_cleanup(char **argv, char **envp) | ||
2306 | { | ||
2307 | argv_free(argv); | ||
2308 | } | ||
2309 | |||
2310 | /** | ||
2311 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2312 | * @force: force poweroff if command execution fails | ||
2313 | * | ||
2314 | * This may be called from any context to trigger a system shutdown. | ||
2315 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2316 | */ | ||
2317 | int orderly_poweroff(bool force) | ||
2318 | { | ||
2319 | int argc; | ||
2320 | char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); | ||
2321 | static char *envp[] = { | ||
2322 | "HOME=/", | ||
2323 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | ||
2324 | NULL | ||
2325 | }; | ||
2326 | int ret = -ENOMEM; | ||
2327 | struct subprocess_info *info; | ||
2328 | |||
2329 | if (argv == NULL) { | ||
2330 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | ||
2331 | __func__, poweroff_cmd); | ||
2332 | goto out; | ||
2333 | } | ||
2334 | |||
2335 | info = call_usermodehelper_setup(argv[0], argv, envp); | ||
2336 | if (info == NULL) { | ||
2337 | argv_free(argv); | ||
2338 | goto out; | ||
2339 | } | ||
2340 | |||
2341 | call_usermodehelper_setcleanup(info, argv_cleanup); | ||
2342 | |||
2343 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | ||
2344 | |||
2345 | out: | ||
2346 | if (ret && force) { | ||
2347 | printk(KERN_WARNING "Failed to start orderly shutdown: " | ||
2348 | "forcing the issue\n"); | ||
2349 | |||
2350 | /* I guess this should try to kick off some daemon to | ||
2351 | sync and poweroff asap. Or not even bother syncing | ||
2352 | if we're doing an emergency shutdown? */ | ||
2353 | emergency_sync(); | ||
2354 | kernel_power_off(); | ||
2355 | } | ||
2356 | |||
2357 | return ret; | ||
2358 | } | ||
2359 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
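
orderly_poweroff() splits the configurable poweroff_cmd, hands it to the usermode-helper machinery with UMH_NO_WAIT, and only when the helper could not be started and force is set does it fall back to emergency_sync() plus kernel_power_off(). The user-space analogue below mimics just that control flow; system("true") stands in for the helper and nothing is actually powered off.

    #include <stdio.h>
    #include <stdlib.h>

    /* Try to launch a helper command; if it cannot even be started and the
     * caller asked to force, fall back to a drastic action (here, a message). */
    static int toy_orderly_poweroff(int force)
    {
        int ret = system("true");   /* stand-in for the usermode helper */

        if (ret == -1 && force) {
            fprintf(stderr, "helper could not be started: forcing the issue\n");
            /* the kernel would emergency_sync() and kernel_power_off() here */
        }
        return ret;
    }

    int main(void)
    {
        printf("toy_orderly_poweroff returned %d\n", toy_orderly_poweroff(1));
        return 0;
    }
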
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db05..c7314f952647 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -27,7 +27,6 @@ | |||
27 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
29 | #include <linux/utsname.h> | 29 | #include <linux/utsname.h> |
30 | #include <linux/capability.h> | ||
31 | #include <linux/smp_lock.h> | 30 | #include <linux/smp_lock.h> |
32 | #include <linux/fs.h> | 31 | #include <linux/fs.h> |
33 | #include <linux/init.h> | 32 | #include <linux/init.h> |
@@ -46,6 +45,7 @@ | |||
46 | #include <linux/syscalls.h> | 45 | #include <linux/syscalls.h> |
47 | #include <linux/nfs_fs.h> | 46 | #include <linux/nfs_fs.h> |
48 | #include <linux/acpi.h> | 47 | #include <linux/acpi.h> |
48 | #include <linux/reboot.h> | ||
49 | 49 | ||
50 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
51 | #include <asm/processor.h> | 51 | #include <asm/processor.h> |
@@ -77,6 +77,7 @@ extern int percpu_pagelist_fraction; | |||
77 | extern int compat_log; | 77 | extern int compat_log; |
78 | extern int maps_protect; | 78 | extern int maps_protect; |
79 | extern int sysctl_stat_interval; | 79 | extern int sysctl_stat_interval; |
80 | extern int audit_argv_kb; | ||
80 | 81 | ||
81 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 82 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
82 | static int maxolduid = 65535; | 83 | static int maxolduid = 65535; |
@@ -159,6 +160,8 @@ extern ctl_table inotify_table[]; | |||
159 | int sysctl_legacy_va_layout; | 160 | int sysctl_legacy_va_layout; |
160 | #endif | 161 | #endif |
161 | 162 | ||
163 | extern int prove_locking; | ||
164 | extern int lock_stat; | ||
162 | 165 | ||
163 | /* The default sysctl tables: */ | 166 | /* The default sysctl tables: */ |
164 | 167 | ||
@@ -219,8 +222,19 @@ static ctl_table kern_table[] = { | |||
219 | #ifdef CONFIG_SCHED_DEBUG | 222 | #ifdef CONFIG_SCHED_DEBUG |
220 | { | 223 | { |
221 | .ctl_name = CTL_UNNUMBERED, | 224 | .ctl_name = CTL_UNNUMBERED, |
222 | .procname = "sched_granularity_ns", | 225 | .procname = "sched_min_granularity_ns", |
223 | .data = &sysctl_sched_granularity, | 226 | .data = &sysctl_sched_min_granularity, |
227 | .maxlen = sizeof(unsigned int), | ||
228 | .mode = 0644, | ||
229 | .proc_handler = &proc_dointvec_minmax, | ||
230 | .strategy = &sysctl_intvec, | ||
231 | .extra1 = &min_sched_granularity_ns, | ||
232 | .extra2 = &max_sched_granularity_ns, | ||
233 | }, | ||
234 | { | ||
235 | .ctl_name = CTL_UNNUMBERED, | ||
236 | .procname = "sched_latency_ns", | ||
237 | .data = &sysctl_sched_latency, | ||
224 | .maxlen = sizeof(unsigned int), | 238 | .maxlen = sizeof(unsigned int), |
225 | .mode = 0644, | 239 | .mode = 0644, |
226 | .proc_handler = &proc_dointvec_minmax, | 240 | .proc_handler = &proc_dointvec_minmax, |
@@ -290,6 +304,34 @@ static ctl_table kern_table[] = { | |||
290 | }, | 304 | }, |
291 | #endif | 305 | #endif |
292 | { | 306 | { |
307 | .ctl_name = CTL_UNNUMBERED, | ||
308 | .procname = "sched_compat_yield", | ||
309 | .data = &sysctl_sched_compat_yield, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = &proc_dointvec, | ||
313 | }, | ||
314 | #ifdef CONFIG_PROVE_LOCKING | ||
315 | { | ||
316 | .ctl_name = CTL_UNNUMBERED, | ||
317 | .procname = "prove_locking", | ||
318 | .data = &prove_locking, | ||
319 | .maxlen = sizeof(int), | ||
320 | .mode = 0644, | ||
321 | .proc_handler = &proc_dointvec, | ||
322 | }, | ||
323 | #endif | ||
324 | #ifdef CONFIG_LOCK_STAT | ||
325 | { | ||
326 | .ctl_name = CTL_UNNUMBERED, | ||
327 | .procname = "lock_stat", | ||
328 | .data = &lock_stat, | ||
329 | .maxlen = sizeof(int), | ||
330 | .mode = 0644, | ||
331 | .proc_handler = &proc_dointvec, | ||
332 | }, | ||
333 | #endif | ||
334 | { | ||
293 | .ctl_name = KERN_PANIC, | 335 | .ctl_name = KERN_PANIC, |
294 | .procname = "panic", | 336 | .procname = "panic", |
295 | .data = &panic_timeout, | 337 | .data = &panic_timeout, |
@@ -305,6 +347,16 @@ static ctl_table kern_table[] = { | |||
305 | .mode = 0644, | 347 | .mode = 0644, |
306 | .proc_handler = &proc_dointvec, | 348 | .proc_handler = &proc_dointvec, |
307 | }, | 349 | }, |
350 | #ifdef CONFIG_AUDITSYSCALL | ||
351 | { | ||
352 | .ctl_name = CTL_UNNUMBERED, | ||
353 | .procname = "audit_argv_kb", | ||
354 | .data = &audit_argv_kb, | ||
355 | .maxlen = sizeof(int), | ||
356 | .mode = 0644, | ||
357 | .proc_handler = &proc_dointvec, | ||
358 | }, | ||
359 | #endif | ||
308 | { | 360 | { |
309 | .ctl_name = KERN_CORE_PATTERN, | 361 | .ctl_name = KERN_CORE_PATTERN, |
310 | .procname = "core_pattern", | 362 | .procname = "core_pattern", |
@@ -655,11 +707,11 @@ static ctl_table kern_table[] = { | |||
655 | .proc_handler = &proc_dointvec, | 707 | .proc_handler = &proc_dointvec, |
656 | }, | 708 | }, |
657 | #endif | 709 | #endif |
658 | #ifdef CONFIG_ACPI_SLEEP | 710 | #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) |
659 | { | 711 | { |
660 | .ctl_name = KERN_ACPI_VIDEO_FLAGS, | 712 | .ctl_name = KERN_ACPI_VIDEO_FLAGS, |
661 | .procname = "acpi_video_flags", | 713 | .procname = "acpi_video_flags", |
662 | .data = &acpi_video_flags, | 714 | .data = &acpi_realmode_flags, |
663 | .maxlen = sizeof (unsigned long), | 715 | .maxlen = sizeof (unsigned long), |
664 | .mode = 0644, | 716 | .mode = 0644, |
665 | .proc_handler = &proc_doulongvec_minmax, | 717 | .proc_handler = &proc_doulongvec_minmax, |
@@ -705,13 +757,26 @@ static ctl_table kern_table[] = { | |||
705 | .proc_handler = &proc_dointvec, | 757 | .proc_handler = &proc_dointvec, |
706 | }, | 758 | }, |
707 | #endif | 759 | #endif |
708 | 760 | { | |
761 | .ctl_name = CTL_UNNUMBERED, | ||
762 | .procname = "poweroff_cmd", | ||
763 | .data = &poweroff_cmd, | ||
764 | .maxlen = POWEROFF_CMD_PATH_LEN, | ||
765 | .mode = 0644, | ||
766 | .proc_handler = &proc_dostring, | ||
767 | .strategy = &sysctl_string, | ||
768 | }, | ||
769 | /* | ||
770 | * NOTE: do not add new entries to this table unless you have read | ||
771 | * Documentation/sysctl/ctl_unnumbered.txt | ||
772 | */ | ||
709 | { .ctl_name = 0 } | 773 | { .ctl_name = 0 } |
710 | }; | 774 | }; |
711 | 775 | ||
712 | /* Constants for minimum and maximum testing in vm_table. | 776 | /* Constants for minimum and maximum testing in vm_table. |
713 | We use these as one-element integer vectors. */ | 777 | We use these as one-element integer vectors. */ |
714 | static int zero; | 778 | static int zero; |
779 | static int two = 2; | ||
715 | static int one_hundred = 100; | 780 | static int one_hundred = 100; |
716 | 781 | ||
717 | 782 | ||
@@ -976,6 +1041,7 @@ static ctl_table vm_table[] = { | |||
976 | .mode = 0644, | 1041 | .mode = 0644, |
977 | .proc_handler = &proc_doulongvec_minmax, | 1042 | .proc_handler = &proc_doulongvec_minmax, |
978 | }, | 1043 | }, |
1044 | #endif | ||
979 | #ifdef CONFIG_NUMA | 1045 | #ifdef CONFIG_NUMA |
980 | { | 1046 | { |
981 | .ctl_name = CTL_UNNUMBERED, | 1047 | .ctl_name = CTL_UNNUMBERED, |
@@ -987,7 +1053,6 @@ static ctl_table vm_table[] = { | |||
987 | .strategy = &sysctl_string, | 1053 | .strategy = &sysctl_string, |
988 | }, | 1054 | }, |
989 | #endif | 1055 | #endif |
990 | #endif | ||
991 | #if defined(CONFIG_X86_32) || \ | 1056 | #if defined(CONFIG_X86_32) || \ |
992 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1057 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
993 | { | 1058 | { |
@@ -1102,7 +1167,10 @@ static ctl_table fs_table[] = { | |||
1102 | .data = &lease_break_time, | 1167 | .data = &lease_break_time, |
1103 | .maxlen = sizeof(int), | 1168 | .maxlen = sizeof(int), |
1104 | .mode = 0644, | 1169 | .mode = 0644, |
1105 | .proc_handler = &proc_dointvec, | 1170 | .proc_handler = &proc_dointvec_minmax, |
1171 | .strategy = &sysctl_intvec, | ||
1172 | .extra1 = &zero, | ||
1173 | .extra2 = &two, | ||
1106 | }, | 1174 | }, |
1107 | { | 1175 | { |
1108 | .ctl_name = FS_AIO_NR, | 1176 | .ctl_name = FS_AIO_NR, |
@@ -1153,6 +1221,16 @@ static ctl_table fs_table[] = { | |||
1153 | }; | 1221 | }; |
1154 | 1222 | ||
1155 | static ctl_table debug_table[] = { | 1223 | static ctl_table debug_table[] = { |
1224 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | ||
1225 | { | ||
1226 | .ctl_name = CTL_UNNUMBERED, | ||
1227 | .procname = "exception-trace", | ||
1228 | .data = &show_unhandled_signals, | ||
1229 | .maxlen = sizeof(int), | ||
1230 | .mode = 0644, | ||
1231 | .proc_handler = proc_dointvec | ||
1232 | }, | ||
1233 | #endif | ||
1156 | { .ctl_name = 0 } | 1234 | { .ctl_name = 0 } |
1157 | }; | 1235 | }; |
1158 | 1236 | ||
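The kernel/sysctl.c hunks above add several CTL_UNNUMBERED entries (sched_min_granularity_ns, sched_latency_ns, sched_compat_yield, prove_locking, lock_stat, audit_argv_kb, poweroff_cmd, exception-trace) and note that further additions must follow Documentation/sysctl/ctl_unnumbered.txt. A sketch of that entry pattern for a hypothetical integer knob (the variable and procname are invented):

/*
 * Hypothetical /proc/sys/kernel/example_knob entry following the
 * CTL_UNNUMBERED pattern used by the new entries above: no binary
 * sysctl number is allocated, only the procfs name.
 */
#include <linux/sysctl.h>

static int example_knob;

static ctl_table example_kern_entries[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};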
diff --git a/kernel/time.c b/kernel/time.c index ffe19149d770..2289a8d68314 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -57,17 +57,14 @@ EXPORT_SYMBOL(sys_tz); | |||
57 | */ | 57 | */ |
58 | asmlinkage long sys_time(time_t __user * tloc) | 58 | asmlinkage long sys_time(time_t __user * tloc) |
59 | { | 59 | { |
60 | /* | 60 | time_t i; |
61 | * We read xtime.tv_sec atomically - it's updated | 61 | struct timespec tv; |
62 | * atomically by update_wall_time(), so no need to | ||
63 | * even read-lock the xtime seqlock: | ||
64 | */ | ||
65 | time_t i = xtime.tv_sec; | ||
66 | 62 | ||
67 | smp_rmb(); /* sys_time() results are coherent */ | 63 | getnstimeofday(&tv); |
64 | i = tv.tv_sec; | ||
68 | 65 | ||
69 | if (tloc) { | 66 | if (tloc) { |
70 | if (put_user(i, tloc)) | 67 | if (put_user(i,tloc)) |
71 | i = -EFAULT; | 68 | i = -EFAULT; |
72 | } | 69 | } |
73 | return i; | 70 | return i; |
@@ -136,7 +133,6 @@ static inline void warp_clock(void) | |||
136 | write_seqlock_irq(&xtime_lock); | 133 | write_seqlock_irq(&xtime_lock); |
137 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; | 134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; |
138 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; | 135 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; |
139 | time_interpolator_reset(); | ||
140 | write_sequnlock_irq(&xtime_lock); | 136 | write_sequnlock_irq(&xtime_lock); |
141 | clock_was_set(); | 137 | clock_was_set(); |
142 | } | 138 | } |
@@ -219,22 +215,6 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p) | |||
219 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; | 215 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; |
220 | } | 216 | } |
221 | 217 | ||
222 | inline struct timespec current_kernel_time(void) | ||
223 | { | ||
224 | struct timespec now; | ||
225 | unsigned long seq; | ||
226 | |||
227 | do { | ||
228 | seq = read_seqbegin(&xtime_lock); | ||
229 | |||
230 | now = xtime; | ||
231 | } while (read_seqretry(&xtime_lock, seq)); | ||
232 | |||
233 | return now; | ||
234 | } | ||
235 | |||
236 | EXPORT_SYMBOL(current_kernel_time); | ||
237 | |||
238 | /** | 218 | /** |
239 | * current_fs_time - Return FS time | 219 | * current_fs_time - Return FS time |
240 | * @sb: Superblock. | 220 | * @sb: Superblock. |
@@ -309,92 +289,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
309 | } | 289 | } |
310 | EXPORT_SYMBOL(timespec_trunc); | 290 | EXPORT_SYMBOL(timespec_trunc); |
311 | 291 | ||
312 | #ifdef CONFIG_TIME_INTERPOLATION | ||
313 | void getnstimeofday (struct timespec *tv) | ||
314 | { | ||
315 | unsigned long seq,sec,nsec; | ||
316 | |||
317 | do { | ||
318 | seq = read_seqbegin(&xtime_lock); | ||
319 | sec = xtime.tv_sec; | ||
320 | nsec = xtime.tv_nsec+time_interpolator_get_offset(); | ||
321 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
322 | |||
323 | while (unlikely(nsec >= NSEC_PER_SEC)) { | ||
324 | nsec -= NSEC_PER_SEC; | ||
325 | ++sec; | ||
326 | } | ||
327 | tv->tv_sec = sec; | ||
328 | tv->tv_nsec = nsec; | ||
329 | } | ||
330 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
331 | |||
332 | int do_settimeofday (struct timespec *tv) | ||
333 | { | ||
334 | time_t wtm_sec, sec = tv->tv_sec; | ||
335 | long wtm_nsec, nsec = tv->tv_nsec; | ||
336 | |||
337 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
338 | return -EINVAL; | ||
339 | |||
340 | write_seqlock_irq(&xtime_lock); | ||
341 | { | ||
342 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
343 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
344 | |||
345 | set_normalized_timespec(&xtime, sec, nsec); | ||
346 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
347 | |||
348 | time_adjust = 0; /* stop active adjtime() */ | ||
349 | time_status |= STA_UNSYNC; | ||
350 | time_maxerror = NTP_PHASE_LIMIT; | ||
351 | time_esterror = NTP_PHASE_LIMIT; | ||
352 | time_interpolator_reset(); | ||
353 | } | ||
354 | write_sequnlock_irq(&xtime_lock); | ||
355 | clock_was_set(); | ||
356 | return 0; | ||
357 | } | ||
358 | EXPORT_SYMBOL(do_settimeofday); | ||
359 | |||
360 | void do_gettimeofday (struct timeval *tv) | ||
361 | { | ||
362 | unsigned long seq, nsec, usec, sec, offset; | ||
363 | do { | ||
364 | seq = read_seqbegin(&xtime_lock); | ||
365 | offset = time_interpolator_get_offset(); | ||
366 | sec = xtime.tv_sec; | ||
367 | nsec = xtime.tv_nsec; | ||
368 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
369 | |||
370 | usec = (nsec + offset) / 1000; | ||
371 | |||
372 | while (unlikely(usec >= USEC_PER_SEC)) { | ||
373 | usec -= USEC_PER_SEC; | ||
374 | ++sec; | ||
375 | } | ||
376 | |||
377 | tv->tv_sec = sec; | ||
378 | tv->tv_usec = usec; | ||
379 | |||
380 | /* | ||
381 | * Make sure xtime.tv_sec [returned by sys_time()] always | ||
382 | * follows the gettimeofday() result precisely. This | ||
383 | * condition is extremely unlikely, it can hit at most | ||
384 | * once per second: | ||
385 | */ | ||
386 | if (unlikely(xtime.tv_sec != tv->tv_sec)) { | ||
387 | unsigned long flags; | ||
388 | |||
389 | write_seqlock_irqsave(&xtime_lock, flags); | ||
390 | update_wall_time(); | ||
391 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
392 | } | ||
393 | } | ||
394 | EXPORT_SYMBOL(do_gettimeofday); | ||
395 | |||
396 | #else /* CONFIG_TIME_INTERPOLATION */ | ||
397 | |||
398 | #ifndef CONFIG_GENERIC_TIME | 292 | #ifndef CONFIG_GENERIC_TIME |
399 | /* | 293 | /* |
400 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 294 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
@@ -410,7 +304,6 @@ void getnstimeofday(struct timespec *tv) | |||
410 | } | 304 | } |
411 | EXPORT_SYMBOL_GPL(getnstimeofday); | 305 | EXPORT_SYMBOL_GPL(getnstimeofday); |
412 | #endif | 306 | #endif |
413 | #endif /* CONFIG_TIME_INTERPOLATION */ | ||
414 | 307 | ||
415 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 308 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
416 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 309 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
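The kernel/time.c hunks above drop the CONFIG_TIME_INTERPOLATION variants and rewrite sys_time() to read wall time through getnstimeofday() instead of peeking at xtime. A minimal sketch of that access pattern, assuming a hypothetical helper name:

/*
 * Illustrative helper mirroring the rewritten sys_time(): take a
 * seqlock-consistent snapshot via getnstimeofday() and return only
 * the seconds part.
 */
#include <linux/time.h>

static time_t example_wall_seconds(void)
{
	struct timespec ts;

	getnstimeofday(&ts);
	return ts.tv_sec;
}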
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f66351126544..8d53106a0a92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS | |||
23 | hardware is not capable then this option only increases | 23 | hardware is not capable then this option only increases |
24 | the size of the kernel image. | 24 | the size of the kernel image. |
25 | 25 | ||
26 | config GENERIC_CLOCKEVENTS_BUILD | ||
27 | bool | ||
28 | default y | ||
29 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | ||
30 | |||
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 99b6034fc86b..905b0b50792d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | 2 | ||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o | 3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o |
6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2ad1c37b8dfe..822beebe664a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -113,16 +113,6 @@ int clockevents_register_notifier(struct notifier_block *nb) | |||
113 | return ret; | 113 | return ret; |
114 | } | 114 | } |
115 | 115 | ||
116 | /** | ||
117 | * clockevents_unregister_notifier - unregister a clock events change listener | ||
118 | */ | ||
119 | void clockevents_unregister_notifier(struct notifier_block *nb) | ||
120 | { | ||
121 | spin_lock(&clockevents_lock); | ||
122 | raw_notifier_chain_unregister(&clockevents_chain, nb); | ||
123 | spin_unlock(&clockevents_lock); | ||
124 | } | ||
125 | |||
126 | /* | 116 | /* |
127 | * Notify about a clock event change. Called with clockevents_lock | 117 | * Notify about a clock event change. Called with clockevents_lock |
128 | * held. | 118 | * held. |
@@ -204,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
204 | local_irq_restore(flags); | 194 | local_irq_restore(flags); |
205 | } | 195 | } |
206 | 196 | ||
197 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
207 | /** | 198 | /** |
208 | * clockevents_notify - notification about relevant events | 199 | * clockevents_notify - notification about relevant events |
209 | */ | 200 | */ |
@@ -232,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg) | |||
232 | spin_unlock(&clockevents_lock); | 223 | spin_unlock(&clockevents_lock); |
233 | } | 224 | } |
234 | EXPORT_SYMBOL_GPL(clockevents_notify); | 225 | EXPORT_SYMBOL_GPL(clockevents_notify); |
235 | 226 | #endif | |
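With clockevents_notify() now compiled only under CONFIG_GENERIC_CLOCKEVENTS, its callers are the tick/broadcast paths and idle code whose per-CPU timer stops in deep C-states. A sketch of such a caller, assuming a hypothetical idle routine around the notifications:

/*
 * Illustrative idle path: announce that this CPU's local timer will
 * stop so the broadcast device covers it, then undo that on exit.
 * The surrounding idle logic is omitted/invented.
 */
#include <linux/clockchips.h>
#include <linux/smp.h>

static void example_enter_deep_idle(void)
{
	int cpu = smp_processor_id();

	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
	/* ... architecture-specific deep-idle entry would go here ... */
	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
}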
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 438c6b723ee2..de6a2d6b3ebb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -10,6 +10,7 @@ | |||
10 | 10 | ||
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/time.h> | 12 | #include <linux/time.h> |
13 | #include <linux/timer.h> | ||
13 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
14 | #include <linux/jiffies.h> | 15 | #include <linux/jiffies.h> |
15 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
@@ -116,11 +117,6 @@ void second_overflow(void) | |||
116 | if (xtime.tv_sec % 86400 == 0) { | 117 | if (xtime.tv_sec % 86400 == 0) { |
117 | xtime.tv_sec--; | 118 | xtime.tv_sec--; |
118 | wall_to_monotonic.tv_sec++; | 119 | wall_to_monotonic.tv_sec++; |
119 | /* | ||
120 | * The timer interpolator will make time change | ||
121 | * gradually instead of an immediate jump by one second | ||
122 | */ | ||
123 | time_interpolator_update(-NSEC_PER_SEC); | ||
124 | time_state = TIME_OOP; | 120 | time_state = TIME_OOP; |
125 | printk(KERN_NOTICE "Clock: inserting leap second " | 121 | printk(KERN_NOTICE "Clock: inserting leap second " |
126 | "23:59:60 UTC\n"); | 122 | "23:59:60 UTC\n"); |
@@ -130,11 +126,6 @@ void second_overflow(void) | |||
130 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 126 | if ((xtime.tv_sec + 1) % 86400 == 0) { |
131 | xtime.tv_sec++; | 127 | xtime.tv_sec++; |
132 | wall_to_monotonic.tv_sec--; | 128 | wall_to_monotonic.tv_sec--; |
133 | /* | ||
134 | * Use of time interpolator for a gradual change of | ||
135 | * time | ||
136 | */ | ||
137 | time_interpolator_update(NSEC_PER_SEC); | ||
138 | time_state = TIME_WAIT; | 129 | time_state = TIME_WAIT; |
139 | printk(KERN_NOTICE "Clock: deleting leap second " | 130 | printk(KERN_NOTICE "Clock: deleting leap second " |
140 | "23:59:59 UTC\n"); | 131 | "23:59:59 UTC\n"); |
@@ -185,12 +176,64 @@ u64 current_tick_length(void) | |||
185 | return tick_length; | 176 | return tick_length; |
186 | } | 177 | } |
187 | 178 | ||
179 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
188 | 180 | ||
189 | void __attribute__ ((weak)) notify_arch_cmos_timer(void) | 181 | /* Disable the cmos update - used by virtualization and embedded */ |
182 | int no_sync_cmos_clock __read_mostly; | ||
183 | |||
184 | static void sync_cmos_clock(unsigned long dummy); | ||
185 | |||
186 | static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); | ||
187 | |||
188 | static void sync_cmos_clock(unsigned long dummy) | ||
189 | { | ||
190 | struct timespec now, next; | ||
191 | int fail = 1; | ||
192 | |||
193 | /* | ||
194 | * If we have an externally synchronized Linux clock, then update | ||
195 | * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be | ||
196 | * called as close as possible to 500 ms before the new second starts. | ||
197 | * This code is run on a timer. If the clock is set, that timer | ||
198 | * may not expire at the correct time. Thus, we adjust... | ||
199 | */ | ||
200 | if (!ntp_synced()) | ||
201 | /* | ||
202 | * Not synced, exit, do not restart a timer (if one is | ||
203 | * running, let it run out). | ||
204 | */ | ||
205 | return; | ||
206 | |||
207 | getnstimeofday(&now); | ||
208 | if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) | ||
209 | fail = update_persistent_clock(now); | ||
210 | |||
211 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; | ||
212 | if (next.tv_nsec <= 0) | ||
213 | next.tv_nsec += NSEC_PER_SEC; | ||
214 | |||
215 | if (!fail) | ||
216 | next.tv_sec = 659; | ||
217 | else | ||
218 | next.tv_sec = 0; | ||
219 | |||
220 | if (next.tv_nsec >= NSEC_PER_SEC) { | ||
221 | next.tv_sec++; | ||
222 | next.tv_nsec -= NSEC_PER_SEC; | ||
223 | } | ||
224 | mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); | ||
225 | } | ||
226 | |||
227 | static void notify_cmos_timer(void) | ||
190 | { | 228 | { |
191 | return; | 229 | if (!no_sync_cmos_clock) |
230 | mod_timer(&sync_cmos_timer, jiffies + 1); | ||
192 | } | 231 | } |
193 | 232 | ||
233 | #else | ||
234 | static inline void notify_cmos_timer(void) { } | ||
235 | #endif | ||
236 | |||
194 | /* adjtimex mainly allows reading (and writing, if superuser) of | 237 | /* adjtimex mainly allows reading (and writing, if superuser) of |
195 | * kernel time-keeping variables. used by xntpd. | 238 | * kernel time-keeping variables. used by xntpd. |
196 | */ | 239 | */ |
@@ -355,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
355 | txc->stbcnt = 0; | 398 | txc->stbcnt = 0; |
356 | write_sequnlock_irq(&xtime_lock); | 399 | write_sequnlock_irq(&xtime_lock); |
357 | do_gettimeofday(&txc->time); | 400 | do_gettimeofday(&txc->time); |
358 | notify_arch_cmos_timer(); | 401 | notify_cmos_timer(); |
359 | return(result); | 402 | return(result); |
360 | } | 403 | } |
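The new sync_cmos_clock() above replaces the weak notify_arch_cmos_timer() hook: when NTP is synchronized it attempts update_persistent_clock() near the half-second mark and, on success, reschedules itself roughly every 11 minutes (659 s). A standalone sketch of the expiry arithmetic, with an invented helper name:

/*
 * Sketch of the rescheduling math in sync_cmos_clock(): aim at the
 * next half-second boundary, and add 659 s only after a successful
 * RTC write.
 */
#include <linux/time.h>
#include <linux/jiffies.h>

static unsigned long example_next_cmos_sync(const struct timespec *now,
					    int updated_ok)
{
	struct timespec next;

	next.tv_nsec = (NSEC_PER_SEC / 2) - now->tv_nsec;
	if (next.tv_nsec <= 0)
		next.tv_nsec += NSEC_PER_SEC;

	next.tv_sec = updated_ok ? 659 : 0;
	if (next.tv_nsec >= NSEC_PER_SEC) {
		next.tv_sec++;
		next.tv_nsec -= NSEC_PER_SEC;
	}
	return jiffies + timespec_to_jiffies(&next);
}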
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8001d37071f5..298bc7c6f09f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device; | |||
31 | static cpumask_t tick_broadcast_mask; | 31 | static cpumask_t tick_broadcast_mask; |
32 | static DEFINE_SPINLOCK(tick_broadcast_lock); | 32 | static DEFINE_SPINLOCK(tick_broadcast_lock); |
33 | 33 | ||
34 | #ifdef CONFIG_TICK_ONESHOT | ||
35 | static void tick_broadcast_clear_oneshot(int cpu); | ||
36 | #else | ||
37 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | ||
38 | #endif | ||
39 | |||
34 | /* | 40 | /* |
35 | * Debugging: see timer_list.c | 41 | * Debugging: see timer_list.c |
36 | */ | 42 | */ |
@@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void) | |||
49 | */ | 55 | */ |
50 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) | 56 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) |
51 | { | 57 | { |
52 | if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) | 58 | if (bc) |
53 | tick_setup_periodic(bc, 1); | 59 | tick_setup_periodic(bc, 1); |
54 | } | 60 | } |
55 | 61 | ||
@@ -58,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) | |||
58 | */ | 64 | */ |
59 | int tick_check_broadcast_device(struct clock_event_device *dev) | 65 | int tick_check_broadcast_device(struct clock_event_device *dev) |
60 | { | 66 | { |
61 | if (tick_broadcast_device.evtdev || | 67 | if ((tick_broadcast_device.evtdev && |
62 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 68 | tick_broadcast_device.evtdev->rating >= dev->rating) || |
69 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
63 | return 0; | 70 | return 0; |
64 | 71 | ||
65 | clockevents_exchange_device(NULL, dev); | 72 | clockevents_exchange_device(NULL, dev); |
@@ -99,8 +106,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
99 | cpu_set(cpu, tick_broadcast_mask); | 106 | cpu_set(cpu, tick_broadcast_mask); |
100 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | 107 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); |
101 | ret = 1; | 108 | ret = 1; |
102 | } | 109 | } else { |
110 | /* | ||
111 | * When the new device is not affected by the stop | ||
112 | * feature and the cpu is marked in the broadcast mask | ||
113 | * then clear the broadcast bit. | ||
114 | */ | ||
115 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { | ||
116 | int cpu = smp_processor_id(); | ||
103 | 117 | ||
118 | cpu_clear(cpu, tick_broadcast_mask); | ||
119 | tick_broadcast_clear_oneshot(cpu); | ||
120 | } | ||
121 | } | ||
104 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 122 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
105 | return ret; | 123 | return ret; |
106 | } | 124 | } |
@@ -159,8 +177,6 @@ static void tick_do_periodic_broadcast(void) | |||
159 | */ | 177 | */ |
160 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | 178 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) |
161 | { | 179 | { |
162 | dev->next_event.tv64 = KTIME_MAX; | ||
163 | |||
164 | tick_do_periodic_broadcast(); | 180 | tick_do_periodic_broadcast(); |
165 | 181 | ||
166 | /* | 182 | /* |
@@ -299,7 +315,7 @@ void tick_suspend_broadcast(void) | |||
299 | spin_lock_irqsave(&tick_broadcast_lock, flags); | 315 | spin_lock_irqsave(&tick_broadcast_lock, flags); |
300 | 316 | ||
301 | bc = tick_broadcast_device.evtdev; | 317 | bc = tick_broadcast_device.evtdev; |
302 | if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 318 | if (bc) |
303 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | 319 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); |
304 | 320 | ||
305 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 321 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
@@ -316,6 +332,8 @@ int tick_resume_broadcast(void) | |||
316 | bc = tick_broadcast_device.evtdev; | 332 | bc = tick_broadcast_device.evtdev; |
317 | 333 | ||
318 | if (bc) { | 334 | if (bc) { |
335 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | ||
336 | |||
319 | switch (tick_broadcast_device.mode) { | 337 | switch (tick_broadcast_device.mode) { |
320 | case TICKDEV_MODE_PERIODIC: | 338 | case TICKDEV_MODE_PERIODIC: |
321 | if(!cpus_empty(tick_broadcast_mask)) | 339 | if(!cpus_empty(tick_broadcast_mask)) |
@@ -364,11 +382,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
364 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 382 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
365 | { | 383 | { |
366 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 384 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
367 | 385 | return 0; | |
368 | if(!cpus_empty(tick_broadcast_oneshot_mask)) | ||
369 | tick_broadcast_set_event(ktime_get(), 1); | ||
370 | |||
371 | return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); | ||
372 | } | 386 | } |
373 | 387 | ||
374 | /* | 388 | /* |
@@ -485,16 +499,24 @@ out: | |||
485 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 499 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
486 | } | 500 | } |
487 | 501 | ||
502 | /* | ||
503 | * Reset the one shot broadcast for a cpu | ||
504 | * | ||
505 | * Called with tick_broadcast_lock held | ||
506 | */ | ||
507 | static void tick_broadcast_clear_oneshot(int cpu) | ||
508 | { | ||
509 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
510 | } | ||
511 | |||
488 | /** | 512 | /** |
489 | * tick_broadcast_setup_highres - setup the broadcast device for highres | 513 | * tick_broadcast_setup_highres - setup the broadcast device for highres |
490 | */ | 514 | */ |
491 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 515 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
492 | { | 516 | { |
493 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { | 517 | bc->event_handler = tick_handle_oneshot_broadcast; |
494 | bc->event_handler = tick_handle_oneshot_broadcast; | 518 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
495 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 519 | bc->next_event.tv64 = KTIME_MAX; |
496 | bc->next_event.tv64 = KTIME_MAX; | ||
497 | } | ||
498 | } | 520 | } |
499 | 521 | ||
500 | /* | 522 | /* |
@@ -520,20 +542,17 @@ void tick_broadcast_switch_to_oneshot(void) | |||
520 | */ | 542 | */ |
521 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 543 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) |
522 | { | 544 | { |
523 | struct clock_event_device *bc; | ||
524 | unsigned long flags; | 545 | unsigned long flags; |
525 | unsigned int cpu = *cpup; | 546 | unsigned int cpu = *cpup; |
526 | 547 | ||
527 | spin_lock_irqsave(&tick_broadcast_lock, flags); | 548 | spin_lock_irqsave(&tick_broadcast_lock, flags); |
528 | 549 | ||
529 | bc = tick_broadcast_device.evtdev; | 550 | /* |
551 | * Clear the broadcast mask flag for the dead cpu, but do not | ||
552 | * stop the broadcast device! | ||
553 | */ | ||
530 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | 554 | cpu_clear(cpu, tick_broadcast_oneshot_mask); |
531 | 555 | ||
532 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { | ||
533 | if (bc && cpus_empty(tick_broadcast_oneshot_mask)) | ||
534 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
535 | } | ||
536 | |||
537 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 556 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
538 | } | 557 | } |
539 | 558 | ||
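tick_check_broadcast_device() above now lets a higher-rated device replace the current broadcast device instead of keeping whichever registered first, while still rejecting anything flagged CLOCK_EVT_FEAT_C3STOP. A sketch of the fields that decision inspects, with an invented name and rating:

/*
 * Hypothetical candidate device: no C3STOP flag, so it is eligible,
 * and it wins only if .rating exceeds the current broadcast device's.
 * A real driver would also fill in set_mode/set_next_event.
 */
#include <linux/clockchips.h>

static struct clock_event_device example_broadcast_candidate = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 250,
};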
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a96ec9ab3454..3f3ae3907830 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
200 | 200 | ||
201 | cpu = smp_processor_id(); | 201 | cpu = smp_processor_id(); |
202 | if (!cpu_isset(cpu, newdev->cpumask)) | 202 | if (!cpu_isset(cpu, newdev->cpumask)) |
203 | goto out; | 203 | goto out_bc; |
204 | 204 | ||
205 | td = &per_cpu(tick_cpu_device, cpu); | 205 | td = &per_cpu(tick_cpu_device, cpu); |
206 | curdev = td->evtdev; | 206 | curdev = td->evtdev; |
@@ -265,7 +265,7 @@ out_bc: | |||
265 | */ | 265 | */ |
266 | if (tick_check_broadcast_device(newdev)) | 266 | if (tick_check_broadcast_device(newdev)) |
267 | ret = NOTIFY_STOP; | 267 | ret = NOTIFY_STOP; |
268 | out: | 268 | |
269 | spin_unlock_irqrestore(&tick_device_lock, flags); | 269 | spin_unlock_irqrestore(&tick_device_lock, flags); |
270 | 270 | ||
271 | return ret; | 271 | return ret; |
@@ -318,12 +318,17 @@ static void tick_resume(void) | |||
318 | { | 318 | { |
319 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 319 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); |
320 | unsigned long flags; | 320 | unsigned long flags; |
321 | int broadcast = tick_resume_broadcast(); | ||
321 | 322 | ||
322 | spin_lock_irqsave(&tick_device_lock, flags); | 323 | spin_lock_irqsave(&tick_device_lock, flags); |
323 | if (td->mode == TICKDEV_MODE_PERIODIC) | 324 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); |
324 | tick_setup_periodic(td->evtdev, 0); | 325 | |
325 | else | 326 | if (!broadcast) { |
326 | tick_resume_oneshot(); | 327 | if (td->mode == TICKDEV_MODE_PERIODIC) |
328 | tick_setup_periodic(td->evtdev, 0); | ||
329 | else | ||
330 | tick_resume_oneshot(); | ||
331 | } | ||
327 | spin_unlock_irqrestore(&tick_device_lock, flags); | 332 | spin_unlock_irqrestore(&tick_device_lock, flags); |
328 | } | 333 | } |
329 | 334 | ||
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, | |||
360 | break; | 365 | break; |
361 | 366 | ||
362 | case CLOCK_EVT_NOTIFY_RESUME: | 367 | case CLOCK_EVT_NOTIFY_RESUME: |
363 | if (!tick_resume_broadcast()) | 368 | tick_resume(); |
364 | tick_resume(); | ||
365 | break; | 369 | break; |
366 | 370 | ||
367 | default: | 371 | default: |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f6997ab0c3c9..0258d3115d54 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
73 | struct clock_event_device *dev = td->evtdev; | 73 | struct clock_event_device *dev = td->evtdev; |
74 | 74 | ||
75 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | 75 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || |
76 | !tick_device_is_functional(dev)) | 76 | !tick_device_is_functional(dev)) { |
77 | |||
78 | printk(KERN_INFO "Clockevents: " | ||
79 | "could not switch to one-shot mode:"); | ||
80 | if (!dev) { | ||
81 | printk(" no tick device\n"); | ||
82 | } else { | ||
83 | if (!tick_device_is_functional(dev)) | ||
84 | printk(" %s is not functional.\n", dev->name); | ||
85 | else | ||
86 | printk(" %s does not support one-shot mode.\n", | ||
87 | dev->name); | ||
88 | } | ||
77 | return -EINVAL; | 89 | return -EINVAL; |
90 | } | ||
78 | 91 | ||
79 | td->mode = TICKDEV_MODE_ONESHOT; | 92 | td->mode = TICKDEV_MODE_ONESHOT; |
80 | dev->event_handler = handler; | 93 | dev->event_handler = handler; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52db9e3c526e..8c3fef1db09c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) | |||
160 | cpu = smp_processor_id(); | 160 | cpu = smp_processor_id(); |
161 | ts = &per_cpu(tick_cpu_sched, cpu); | 161 | ts = &per_cpu(tick_cpu_sched, cpu); |
162 | 162 | ||
163 | /* | ||
164 | * If this cpu is offline and it is the one which updates | ||
165 | * jiffies, then give up the assignment and let it be taken by | ||
166 | * the cpu which runs the tick timer next. If we don't drop | ||
167 | * this here the jiffies might be stale and do_timer() never | ||
168 | * invoked. | ||
169 | */ | ||
170 | if (unlikely(!cpu_online(cpu))) { | ||
171 | if (cpu == tick_do_timer_cpu) | ||
172 | tick_do_timer_cpu = -1; | ||
173 | } | ||
174 | |||
163 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 175 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
164 | goto end; | 176 | goto end; |
165 | 177 | ||
@@ -546,6 +558,7 @@ void tick_setup_sched_timer(void) | |||
546 | { | 558 | { |
547 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 559 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
548 | ktime_t now = ktime_get(); | 560 | ktime_t now = ktime_get(); |
561 | u64 offset; | ||
549 | 562 | ||
550 | /* | 563 | /* |
551 | * Emulate tick processing via per-CPU hrtimers: | 564 | * Emulate tick processing via per-CPU hrtimers: |
@@ -554,8 +567,12 @@ void tick_setup_sched_timer(void) | |||
554 | ts->sched_timer.function = tick_sched_timer; | 567 | ts->sched_timer.function = tick_sched_timer; |
555 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 568 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
556 | 569 | ||
557 | /* Get the next period */ | 570 | /* Get the next period (per cpu) */ |
558 | ts->sched_timer.expires = tick_init_jiffy_update(); | 571 | ts->sched_timer.expires = tick_init_jiffy_update(); |
572 | offset = ktime_to_ns(tick_period) >> 1; | ||
573 | do_div(offset, NR_CPUS); | ||
574 | offset *= smp_processor_id(); | ||
575 | ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); | ||
559 | 576 | ||
560 | for (;;) { | 577 | for (;;) { |
561 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 578 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
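tick_setup_sched_timer() above now staggers the per-CPU sched timers: each CPU's first expiry is pushed out by (tick_period / 2) / NR_CPUS times its CPU number, so the timers no longer fire in lockstep. A sketch of that arithmetic as a standalone helper (name invented):

/*
 * Illustrative per-CPU stagger: spread the CPUs over half a tick
 * period. With a 4 ms tick and 4 CPUs, CPU 3 starts 1.5 ms late.
 */
#include <linux/ktime.h>
#include <asm/div64.h>

static ktime_t example_staggered_expiry(ktime_t base, ktime_t tick_period,
					int cpu, int nr_cpus)
{
	u64 offset = ktime_to_ns(tick_period) >> 1;

	do_div(offset, nr_cpus);
	offset *= cpu;
	return ktime_add_ns(base, offset);
}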
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedfd3cbd..4ad79f6bdec6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -47,10 +47,22 @@ EXPORT_SYMBOL(xtime_lock); | |||
47 | struct timespec xtime __attribute__ ((aligned (16))); | 47 | struct timespec xtime __attribute__ ((aligned (16))); |
48 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 48 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
49 | static unsigned long total_sleep_time; /* seconds */ | 49 | static unsigned long total_sleep_time; /* seconds */ |
50 | |||
51 | EXPORT_SYMBOL(xtime); | 50 | EXPORT_SYMBOL(xtime); |
52 | 51 | ||
53 | 52 | ||
53 | #ifdef CONFIG_NO_HZ | ||
54 | static struct timespec xtime_cache __attribute__ ((aligned (16))); | ||
55 | static inline void update_xtime_cache(u64 nsec) | ||
56 | { | ||
57 | xtime_cache = xtime; | ||
58 | timespec_add_ns(&xtime_cache, nsec); | ||
59 | } | ||
60 | #else | ||
61 | #define xtime_cache xtime | ||
62 | /* We do *not* want to evaluate the argument for this case */ | ||
63 | #define update_xtime_cache(n) do { } while (0) | ||
64 | #endif | ||
65 | |||
54 | static struct clocksource *clock; /* pointer to current clocksource */ | 66 | static struct clocksource *clock; /* pointer to current clocksource */ |
55 | 67 | ||
56 | 68 | ||
@@ -205,6 +217,7 @@ static void change_clocksource(void) | |||
205 | } | 217 | } |
206 | #else | 218 | #else |
207 | static inline void change_clocksource(void) { } | 219 | static inline void change_clocksource(void) { } |
220 | static inline s64 __get_nsec_offset(void) { return 0; } | ||
208 | #endif | 221 | #endif |
209 | 222 | ||
210 | /** | 223 | /** |
@@ -268,6 +281,8 @@ void __init timekeeping_init(void) | |||
268 | static int timekeeping_suspended; | 281 | static int timekeeping_suspended; |
269 | /* time in seconds when suspend began */ | 282 | /* time in seconds when suspend began */ |
270 | static unsigned long timekeeping_suspend_time; | 283 | static unsigned long timekeeping_suspend_time; |
284 | /* xtime offset when we went into suspend */ | ||
285 | static s64 timekeeping_suspend_nsecs; | ||
271 | 286 | ||
272 | /** | 287 | /** |
273 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 288 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
@@ -293,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) | |||
293 | wall_to_monotonic.tv_sec -= sleep_length; | 308 | wall_to_monotonic.tv_sec -= sleep_length; |
294 | total_sleep_time += sleep_length; | 309 | total_sleep_time += sleep_length; |
295 | } | 310 | } |
311 | /* Make sure that we have the correct xtime reference */ | ||
312 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | ||
296 | /* re-base the last cycle value */ | 313 | /* re-base the last cycle value */ |
297 | clock->cycle_last = clocksource_read(clock); | 314 | clock->cycle_last = clocksource_read(clock); |
298 | clock->error = 0; | 315 | clock->error = 0; |
@@ -313,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
313 | { | 330 | { |
314 | unsigned long flags; | 331 | unsigned long flags; |
315 | 332 | ||
333 | timekeeping_suspend_time = read_persistent_clock(); | ||
334 | |||
316 | write_seqlock_irqsave(&xtime_lock, flags); | 335 | write_seqlock_irqsave(&xtime_lock, flags); |
336 | /* Get the current xtime offset */ | ||
337 | timekeeping_suspend_nsecs = __get_nsec_offset(); | ||
317 | timekeeping_suspended = 1; | 338 | timekeeping_suspended = 1; |
318 | timekeeping_suspend_time = read_persistent_clock(); | ||
319 | write_sequnlock_irqrestore(&xtime_lock, flags); | 339 | write_sequnlock_irqrestore(&xtime_lock, flags); |
320 | 340 | ||
321 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 341 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
@@ -401,7 +421,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | |||
401 | * this is optimized for the most common adjustments of -1,0,1, | 421 | * this is optimized for the most common adjustments of -1,0,1, |
402 | * for other values we can do a bit more work. | 422 | * for other values we can do a bit more work. |
403 | */ | 423 | */ |
404 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | 424 | static void clocksource_adjust(s64 offset) |
405 | { | 425 | { |
406 | s64 error, interval = clock->cycle_interval; | 426 | s64 error, interval = clock->cycle_interval; |
407 | int adj; | 427 | int adj; |
@@ -466,22 +486,20 @@ void update_wall_time(void) | |||
466 | second_overflow(); | 486 | second_overflow(); |
467 | } | 487 | } |
468 | 488 | ||
469 | /* interpolator bits */ | ||
470 | time_interpolator_update(clock->xtime_interval | ||
471 | >> clock->shift); | ||
472 | |||
473 | /* accumulate error between NTP and clock interval */ | 489 | /* accumulate error between NTP and clock interval */ |
474 | clock->error += current_tick_length(); | 490 | clock->error += current_tick_length(); |
475 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | 491 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); |
476 | } | 492 | } |
477 | 493 | ||
478 | /* correct the clock when NTP error is too big */ | 494 | /* correct the clock when NTP error is too big */ |
479 | clocksource_adjust(clock, offset); | 495 | clocksource_adjust(offset); |
480 | 496 | ||
481 | /* store full nanoseconds into xtime */ | 497 | /* store full nanoseconds into xtime */ |
482 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | 498 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; |
483 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | 499 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; |
484 | 500 | ||
501 | update_xtime_cache(cyc2ns(clock, offset)); | ||
502 | |||
485 | /* check to see if there is a new clocksource to use */ | 503 | /* check to see if there is a new clocksource to use */ |
486 | change_clocksource(); | 504 | change_clocksource(); |
487 | update_vsyscall(&xtime, clock); | 505 | update_vsyscall(&xtime, clock); |
@@ -513,3 +531,25 @@ void monotonic_to_bootbased(struct timespec *ts) | |||
513 | { | 531 | { |
514 | ts->tv_sec += total_sleep_time; | 532 | ts->tv_sec += total_sleep_time; |
515 | } | 533 | } |
534 | |||
535 | unsigned long get_seconds(void) | ||
536 | { | ||
537 | return xtime_cache.tv_sec; | ||
538 | } | ||
539 | EXPORT_SYMBOL(get_seconds); | ||
540 | |||
541 | |||
542 | struct timespec current_kernel_time(void) | ||
543 | { | ||
544 | struct timespec now; | ||
545 | unsigned long seq; | ||
546 | |||
547 | do { | ||
548 | seq = read_seqbegin(&xtime_lock); | ||
549 | |||
550 | now = xtime_cache; | ||
551 | } while (read_seqretry(&xtime_lock, seq)); | ||
552 | |||
553 | return now; | ||
554 | } | ||
555 | EXPORT_SYMBOL(current_kernel_time); | ||
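The timekeeping.c hunks above introduce xtime_cache and move get_seconds() and current_kernel_time() here, so callers get a seconds-only or full-timespec reading without touching the clocksource. A sketch of both accessors in use, with an invented record structure:

/*
 * Illustrative consumers of the accessors added above: a cheap
 * seconds stamp via get_seconds(), and a full wall-clock timespec
 * via current_kernel_time(), which retries under the xtime seqlock.
 */
#include <linux/time.h>

struct example_record {
	unsigned long	btime;
	struct timespec	stamp;
};

static void example_stamp(struct example_record *rec)
{
	rec->btime = get_seconds();
	rec->stamp = current_kernel_time();
}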
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e5edc3a22a08..fdb2e03d4fe0 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -267,7 +267,7 @@ static struct file_operations timer_list_fops = { | |||
267 | .open = timer_list_open, | 267 | .open = timer_list_open, |
268 | .read = seq_read, | 268 | .read = seq_read, |
269 | .llseek = seq_lseek, | 269 | .llseek = seq_lseek, |
270 | .release = seq_release, | 270 | .release = single_release, |
271 | }; | 271 | }; |
272 | 272 | ||
273 | static int __init init_timer_list_procfs(void) | 273 | static int __init init_timer_list_procfs(void) |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 8ed62fda16c6..c36bb7ed0301 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) | |||
327 | ms = 1; | 327 | ms = 1; |
328 | 328 | ||
329 | if (events && period.tv_sec) | 329 | if (events && period.tv_sec) |
330 | seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, | 330 | seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", |
331 | events / period.tv_sec, events * 1000 / ms); | 331 | events, events * 1000 / ms, |
332 | (events * 1000000 / ms) % 1000); | ||
332 | else | 333 | else |
333 | seq_printf(m, "%ld total events\n", events); | 334 | seq_printf(m, "%ld total events\n", events); |
334 | 335 | ||
@@ -399,7 +400,7 @@ static struct file_operations tstats_fops = { | |||
399 | .read = seq_read, | 400 | .read = seq_read, |
400 | .write = tstats_write, | 401 | .write = tstats_write, |
401 | .llseek = seq_lseek, | 402 | .llseek = seq_lseek, |
402 | .release = seq_release, | 403 | .release = single_release, |
403 | }; | 404 | }; |
404 | 405 | ||
405 | void __init init_timer_stats(void) | 406 | void __init init_timer_stats(void) |
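The timer_stats.c hunk above corrects the events/sec figure: it now prints events * 1000 / ms with three fractional digits taken from (events * 1000000 / ms) % 1000, instead of the previous mismatched operands. A standalone sketch of that fixed-point split (helper name invented; ms is assumed non-zero, as the code above forces ms to at least 1):

/*
 * Fixed-point split used by the corrected seq_printf(): for 1234
 * events over 5000 ms this yields 246 and 800, printed as
 * "246.800 events/sec".
 */
static void example_events_per_sec(long events, long ms,
				   long *whole, long *frac)
{
	*whole = events * 1000 / ms;
	*frac  = (events * 1000000 / ms) % 1000;
}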
diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb03387..6ce1952eea7d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | |||
103 | static inline void timer_set_deferrable(struct timer_list *timer) | 103 | static inline void timer_set_deferrable(struct timer_list *timer) |
104 | { | 104 | { |
105 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | 105 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | |
106 | TBASE_DEFERRABLE_FLAG)); | 106 | TBASE_DEFERRABLE_FLAG)); |
107 | } | 107 | } |
108 | 108 | ||
109 | static inline void | 109 | static inline void |
110 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | 110 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) |
111 | { | 111 | { |
112 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | 112 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | |
113 | tbase_get_deferrable(timer->base)); | 113 | tbase_get_deferrable(timer->base)); |
114 | } | 114 | } |
115 | 115 | ||
116 | /** | 116 | /** |
@@ -445,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer); | |||
445 | void add_timer_on(struct timer_list *timer, int cpu) | 445 | void add_timer_on(struct timer_list *timer, int cpu) |
446 | { | 446 | { |
447 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 447 | tvec_base_t *base = per_cpu(tvec_bases, cpu); |
448 | unsigned long flags; | 448 | unsigned long flags; |
449 | 449 | ||
450 | timer_stats_timer_set_start_info(timer); | 450 | timer_stats_timer_set_start_info(timer); |
451 | BUG_ON(timer_pending(timer) || !timer->function); | 451 | BUG_ON(timer_pending(timer) || !timer->function); |
452 | spin_lock_irqsave(&base->lock, flags); | 452 | spin_lock_irqsave(&base->lock, flags); |
453 | timer_set_base(timer, base); | 453 | timer_set_base(timer, base); |
454 | internal_add_timer(base, timer); | 454 | internal_add_timer(base, timer); |
@@ -627,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
627 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 627 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
628 | struct list_head work_list; | 628 | struct list_head work_list; |
629 | struct list_head *head = &work_list; | 629 | struct list_head *head = &work_list; |
630 | int index = base->timer_jiffies & TVR_MASK; | 630 | int index = base->timer_jiffies & TVR_MASK; |
631 | 631 | ||
632 | /* | 632 | /* |
633 | * Cascade timers: | 633 | * Cascade timers: |
@@ -644,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
644 | unsigned long data; | 644 | unsigned long data; |
645 | 645 | ||
646 | timer = list_first_entry(head, struct timer_list,entry); | 646 | timer = list_first_entry(head, struct timer_list,entry); |
647 | fn = timer->function; | 647 | fn = timer->function; |
648 | data = timer->data; | 648 | data = timer->data; |
649 | 649 | ||
650 | timer_stats_account_timer(timer); | 650 | timer_stats_account_timer(timer); |
651 | 651 | ||
@@ -689,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) | |||
689 | index = slot = timer_jiffies & TVR_MASK; | 689 | index = slot = timer_jiffies & TVR_MASK; |
690 | do { | 690 | do { |
691 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { | 691 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { |
692 | if (tbase_get_deferrable(nte->base)) | 692 | if (tbase_get_deferrable(nte->base)) |
693 | continue; | 693 | continue; |
694 | 694 | ||
695 | found = 1; | 695 | found = 1; |
696 | expires = nte->expires; | 696 | expires = nte->expires; |
@@ -834,7 +834,7 @@ void update_process_times(int user_tick) | |||
834 | if (rcu_pending(cpu)) | 834 | if (rcu_pending(cpu)) |
835 | rcu_check_callbacks(cpu, user_tick); | 835 | rcu_check_callbacks(cpu, user_tick); |
836 | scheduler_tick(); | 836 | scheduler_tick(); |
837 | run_posix_cpu_timers(p); | 837 | run_posix_cpu_timers(p); |
838 | } | 838 | } |
839 | 839 | ||
840 | /* | 840 | /* |
@@ -909,7 +909,7 @@ static inline void update_times(unsigned long ticks) | |||
909 | update_wall_time(); | 909 | update_wall_time(); |
910 | calc_load(ticks); | 910 | calc_load(ticks); |
911 | } | 911 | } |
912 | 912 | ||
913 | /* | 913 | /* |
914 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 914 | * The 64-bit jiffies value is not atomic - you MUST NOT read it |
915 | * without sampling the sequence number in xtime_lock. | 915 | * without sampling the sequence number in xtime_lock. |
@@ -1105,7 +1105,7 @@ asmlinkage long sys_gettid(void) | |||
1105 | /** | 1105 | /** |
1106 | * do_sysinfo - fill in sysinfo struct | 1106 | * do_sysinfo - fill in sysinfo struct |
1107 | * @info: pointer to buffer to fill | 1107 | * @info: pointer to buffer to fill |
1108 | */ | 1108 | */ |
1109 | int do_sysinfo(struct sysinfo *info) | 1109 | int do_sysinfo(struct sysinfo *info) |
1110 | { | 1110 | { |
1111 | unsigned long mem_total, sav_total; | 1111 | unsigned long mem_total, sav_total; |
@@ -1349,194 +1349,6 @@ void __init init_timers(void) | |||
1349 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1349 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
1350 | } | 1350 | } |
1351 | 1351 | ||
1352 | #ifdef CONFIG_TIME_INTERPOLATION | ||
1353 | |||
1354 | struct time_interpolator *time_interpolator __read_mostly; | ||
1355 | static struct time_interpolator *time_interpolator_list __read_mostly; | ||
1356 | static DEFINE_SPINLOCK(time_interpolator_lock); | ||
1357 | |||
1358 | static inline cycles_t time_interpolator_get_cycles(unsigned int src) | ||
1359 | { | ||
1360 | unsigned long (*x)(void); | ||
1361 | |||
1362 | switch (src) | ||
1363 | { | ||
1364 | case TIME_SOURCE_FUNCTION: | ||
1365 | x = time_interpolator->addr; | ||
1366 | return x(); | ||
1367 | |||
1368 | case TIME_SOURCE_MMIO64 : | ||
1369 | return readq_relaxed((void __iomem *)time_interpolator->addr); | ||
1370 | |||
1371 | case TIME_SOURCE_MMIO32 : | ||
1372 | return readl_relaxed((void __iomem *)time_interpolator->addr); | ||
1373 | |||
1374 | default: return get_cycles(); | ||
1375 | } | ||
1376 | } | ||
1377 | |||
1378 | static inline u64 time_interpolator_get_counter(int writelock) | ||
1379 | { | ||
1380 | unsigned int src = time_interpolator->source; | ||
1381 | |||
1382 | if (time_interpolator->jitter) | ||
1383 | { | ||
1384 | cycles_t lcycle; | ||
1385 | cycles_t now; | ||
1386 | |||
1387 | do { | ||
1388 | lcycle = time_interpolator->last_cycle; | ||
1389 | now = time_interpolator_get_cycles(src); | ||
1390 | if (lcycle && time_after(lcycle, now)) | ||
1391 | return lcycle; | ||
1392 | |||
1393 | /* When holding the xtime write lock, there's no need | ||
1394 | * to add the overhead of the cmpxchg. Readers are | ||
1395 | * force to retry until the write lock is released. | ||
1396 | */ | ||
1397 | if (writelock) { | ||
1398 | time_interpolator->last_cycle = now; | ||
1399 | return now; | ||
1400 | } | ||
1401 | /* Keep track of the last timer value returned. The use of cmpxchg here | ||
1402 | * will cause contention in an SMP environment. | ||
1403 | */ | ||
1404 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); | ||
1405 | return now; | ||
1406 | } | ||
1407 | else | ||
1408 | return time_interpolator_get_cycles(src); | ||
1409 | } | ||
1410 | |||
1411 | void time_interpolator_reset(void) | ||
1412 | { | ||
1413 | time_interpolator->offset = 0; | ||
1414 | time_interpolator->last_counter = time_interpolator_get_counter(1); | ||
1415 | } | ||
1416 | |||
1417 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | ||
1418 | |||
1419 | unsigned long time_interpolator_get_offset(void) | ||
1420 | { | ||
1421 | /* If we do not have a time interpolator set up then just return zero */ | ||
1422 | if (!time_interpolator) | ||
1423 | return 0; | ||
1424 | |||
1425 | return time_interpolator->offset + | ||
1426 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); | ||
1427 | } | ||
1428 | |||
1429 | #define INTERPOLATOR_ADJUST 65536 | ||
1430 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST | ||
1431 | |||
1432 | void time_interpolator_update(long delta_nsec) | ||
1433 | { | ||
1434 | u64 counter; | ||
1435 | unsigned long offset; | ||
1436 | |||
1437 | /* If there is no time interpolator set up then do nothing */ | ||
1438 | if (!time_interpolator) | ||
1439 | return; | ||
1440 | |||
1441 | /* | ||
1442 | * The interpolator compensates for late ticks by accumulating the late | ||
1443 | * time in time_interpolator->offset. A tick earlier than expected will | ||
1444 | * lead to a reset of the offset and a corresponding jump of the clock | ||
1445 | * forward. Again this only works if the interpolator clock is running | ||
1446 | * slightly slower than the regular clock and the tuning logic insures | ||
1447 | * that. | ||
1448 | */ | ||
1449 | |||
1450 | counter = time_interpolator_get_counter(1); | ||
1451 | offset = time_interpolator->offset + | ||
1452 | GET_TI_NSECS(counter, time_interpolator); | ||
1453 | |||
1454 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | ||
1455 | time_interpolator->offset = offset - delta_nsec; | ||
1456 | else { | ||
1457 | time_interpolator->skips++; | ||
1458 | time_interpolator->ns_skipped += delta_nsec - offset; | ||
1459 | time_interpolator->offset = 0; | ||
1460 | } | ||
1461 | time_interpolator->last_counter = counter; | ||
1462 | |||
1463 | /* Tuning logic for time interpolator invoked every minute or so. | ||
1464 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. | ||
1465 | * Increase interpolator clock speed if we skip too much time. | ||
1466 | */ | ||
1467 | if (jiffies % INTERPOLATOR_ADJUST == 0) | ||
1468 | { | ||
1469 | if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) | ||
1470 | time_interpolator->nsec_per_cyc--; | ||
1471 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | ||
1472 | time_interpolator->nsec_per_cyc++; | ||
1473 | time_interpolator->skips = 0; | ||
1474 | time_interpolator->ns_skipped = 0; | ||
1475 | } | ||
1476 | } | ||
1477 | |||
1478 | static inline int | ||
1479 | is_better_time_interpolator(struct time_interpolator *new) | ||
1480 | { | ||
1481 | if (!time_interpolator) | ||
1482 | return 1; | ||
1483 | return new->frequency > 2*time_interpolator->frequency || | ||
1484 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; | ||
1485 | } | ||
1486 | |||
1487 | void | ||
1488 | register_time_interpolator(struct time_interpolator *ti) | ||
1489 | { | ||
1490 | unsigned long flags; | ||
1491 | |||
1492 | /* Sanity check */ | ||
1493 | BUG_ON(ti->frequency == 0 || ti->mask == 0); | ||
1494 | |||
1495 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | ||
1496 | spin_lock(&time_interpolator_lock); | ||
1497 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1498 | if (is_better_time_interpolator(ti)) { | ||
1499 | time_interpolator = ti; | ||
1500 | time_interpolator_reset(); | ||
1501 | } | ||
1502 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1503 | |||
1504 | ti->next = time_interpolator_list; | ||
1505 | time_interpolator_list = ti; | ||
1506 | spin_unlock(&time_interpolator_lock); | ||
1507 | } | ||
1508 | |||
1509 | void | ||
1510 | unregister_time_interpolator(struct time_interpolator *ti) | ||
1511 | { | ||
1512 | struct time_interpolator *curr, **prev; | ||
1513 | unsigned long flags; | ||
1514 | |||
1515 | spin_lock(&time_interpolator_lock); | ||
1516 | prev = &time_interpolator_list; | ||
1517 | for (curr = *prev; curr; curr = curr->next) { | ||
1518 | if (curr == ti) { | ||
1519 | *prev = curr->next; | ||
1520 | break; | ||
1521 | } | ||
1522 | prev = &curr->next; | ||
1523 | } | ||
1524 | |||
1525 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1526 | if (ti == time_interpolator) { | ||
1527 | /* we lost the best time-interpolator: */ | ||
1528 | time_interpolator = NULL; | ||
1529 | /* find the next-best interpolator */ | ||
1530 | for (curr = time_interpolator_list; curr; curr = curr->next) | ||
1531 | if (is_better_time_interpolator(curr)) | ||
1532 | time_interpolator = curr; | ||
1533 | time_interpolator_reset(); | ||
1534 | } | ||
1535 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1536 | spin_unlock(&time_interpolator_lock); | ||
1537 | } | ||
1538 | #endif /* CONFIG_TIME_INTERPOLATION */ | ||
1539 | |||
1540 | /** | 1352 | /** |
1541 | * msleep - sleep safely even with waitqueue interruptions | 1353 | * msleep - sleep safely even with waitqueue interruptions |
1542 | * @msecs: Time in milliseconds to sleep for | 1354 | * @msecs: Time in milliseconds to sleep for |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 658f638c402c..c122131a122f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -39,7 +39,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
39 | ac_etime = timespec_to_ns(&ts); | 39 | ac_etime = timespec_to_ns(&ts); |
40 | do_div(ac_etime, NSEC_PER_USEC); | 40 | do_div(ac_etime, NSEC_PER_USEC); |
41 | stats->ac_etime = ac_etime; | 41 | stats->ac_etime = ac_etime; |
42 | stats->ac_btime = xtime.tv_sec - ts.tv_sec; | 42 | stats->ac_btime = get_seconds() - ts.tv_sec; |
43 | if (thread_group_leader(tsk)) { | 43 | if (thread_group_leader(tsk)) { |
44 | stats->ac_exitcode = tsk->exit_code; | 44 | stats->ac_exitcode = tsk->exit_code; |
45 | if (tsk->flags & PF_FORKNOEXEC) | 45 | if (tsk->flags & PF_FORKNOEXEC) |
diff --git a/kernel/user.c b/kernel/user.c index 98b82507797a..9ca2848fc356 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -55,25 +55,22 @@ struct user_struct root_user = { | |||
55 | /* | 55 | /* |
56 | * These routines must be called with the uidhash spinlock held! | 56 | * These routines must be called with the uidhash spinlock held! |
57 | */ | 57 | */ |
58 | static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) | 58 | static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) |
59 | { | 59 | { |
60 | list_add(&up->uidhash_list, hashent); | 60 | hlist_add_head(&up->uidhash_node, hashent); |
61 | } | 61 | } |
62 | 62 | ||
63 | static inline void uid_hash_remove(struct user_struct *up) | 63 | static inline void uid_hash_remove(struct user_struct *up) |
64 | { | 64 | { |
65 | list_del(&up->uidhash_list); | 65 | hlist_del_init(&up->uidhash_node); |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) | 68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) |
69 | { | 69 | { |
70 | struct list_head *up; | 70 | struct user_struct *user; |
71 | 71 | struct hlist_node *h; | |
72 | list_for_each(up, hashent) { | ||
73 | struct user_struct *user; | ||
74 | |||
75 | user = list_entry(up, struct user_struct, uidhash_list); | ||
76 | 72 | ||
73 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | ||
77 | if(user->uid == uid) { | 74 | if(user->uid == uid) { |
78 | atomic_inc(&user->__count); | 75 | atomic_inc(&user->__count); |
79 | return user; | 76 | return user; |
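Editorial note: this hunk converts the uid hash buckets from struct list_head to the smaller struct hlist_head and rewrites the lookup with hlist_for_each_entry(). A self-contained sketch of that lookup pattern as it looked in this kernel generation, when the iterator still required an explicit struct hlist_node cursor (struct item and lookup() are illustrative names, not kernel symbols):

```c
#include <linux/list.h>

struct item {
	unsigned int key;
	struct hlist_node node;
};

/* Walk one hash bucket and return the matching entry, or NULL. */
static struct item *lookup(struct hlist_head *head, unsigned int key)
{
	struct item *it;
	struct hlist_node *pos;	/* cursor required by this era's macro */

	hlist_for_each_entry(it, pos, head, node) {
		if (it->key == key)
			return it;
	}
	return NULL;
}
```

An hlist_head is a single pointer rather than the two in a list_head, which is the usual motivation for this conversion in hash tables.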
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up) | |||
122 | 119 | ||
123 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 120 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
124 | { | 121 | { |
125 | struct list_head *hashent = uidhashentry(ns, uid); | 122 | struct hlist_head *hashent = uidhashentry(ns, uid); |
126 | struct user_struct *up; | 123 | struct user_struct *up; |
127 | 124 | ||
128 | spin_lock_irq(&uidhash_lock); | 125 | spin_lock_irq(&uidhash_lock); |
@@ -202,16 +199,40 @@ void switch_uid(struct user_struct *new_user) | |||
202 | suid_keys(current); | 199 | suid_keys(current); |
203 | } | 200 | } |
204 | 201 | ||
202 | void release_uids(struct user_namespace *ns) | ||
203 | { | ||
204 | int i; | ||
205 | unsigned long flags; | ||
206 | struct hlist_head *head; | ||
207 | struct hlist_node *nd; | ||
208 | |||
209 | spin_lock_irqsave(&uidhash_lock, flags); | ||
210 | /* | ||
211 | * collapse the chains so that the user_struct-s will | ||
212 | * be still alive, but not in hashes. subsequent free_uid() | ||
213 | * will free them. | ||
214 | */ | ||
215 | for (i = 0; i < UIDHASH_SZ; i++) { | ||
216 | head = ns->uidhash_table + i; | ||
217 | while (!hlist_empty(head)) { | ||
218 | nd = head->first; | ||
219 | hlist_del_init(nd); | ||
220 | } | ||
221 | } | ||
222 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
223 | |||
224 | free_uid(ns->root_user); | ||
225 | } | ||
205 | 226 | ||
206 | static int __init uid_cache_init(void) | 227 | static int __init uid_cache_init(void) |
207 | { | 228 | { |
208 | int n; | 229 | int n; |
209 | 230 | ||
210 | uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), | 231 | uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), |
211 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 232 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
212 | 233 | ||
213 | for(n = 0; n < UIDHASH_SZ; ++n) | 234 | for(n = 0; n < UIDHASH_SZ; ++n) |
214 | INIT_LIST_HEAD(init_user_ns.uidhash_table + n); | 235 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); |
215 | 236 | ||
216 | /* Insert the root user immediately (init already runs as root) */ | 237 | /* Insert the root user immediately (init already runs as root) */ |
217 | spin_lock_irq(&uidhash_lock); | 238 | spin_lock_irq(&uidhash_lock); |
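Editorial note: the remaining user.c hunks add release_uids(), which empties every bucket of a namespace's uid hash under uidhash_lock without freeing the entries (their own refcounts, dropped by later free_uid() calls, decide when they die) and then drops the namespace's root_user reference. The kmem_cache_create() call also loses its trailing NULL, since the destructor argument was dropped from that API around this release. A generic sketch of the "collapse the chains" loop, with illustrative names (table, TABLE_SZ, unhash_all are not kernel symbols):

```c
#include <linux/list.h>
#include <linux/spinlock.h>

#define TABLE_SZ 128

static DEFINE_SPINLOCK(table_lock);
static struct hlist_head table[TABLE_SZ];

/*
 * Unlink every node from every bucket under the lock, but do not free
 * anything here: the objects stay alive, merely unhashed, until their
 * reference counts drop elsewhere.
 */
static void unhash_all(void)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&table_lock, flags);
	for (i = 0; i < TABLE_SZ; i++) {
		while (!hlist_empty(&table[i]))
			hlist_del_init(table[i].first);
	}
	spin_unlock_irqrestore(&table_lock, flags);
}
```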
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850c..7af90fc4f0fd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) | |||
39 | kref_init(&ns->kref); | 39 | kref_init(&ns->kref); |
40 | 40 | ||
41 | for (n = 0; n < UIDHASH_SZ; ++n) | 41 | for (n = 0; n < UIDHASH_SZ; ++n) |
42 | INIT_LIST_HEAD(ns->uidhash_table + n); | 42 | INIT_HLIST_HEAD(ns->uidhash_table + n); |
43 | 43 | ||
44 | /* Insert new root user. */ | 44 | /* Insert new root user. */ |
45 | ns->root_user = alloc_uid(ns, 0); | 45 | ns->root_user = alloc_uid(ns, 0); |
@@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) | |||
81 | struct user_namespace *ns; | 81 | struct user_namespace *ns; |
82 | 82 | ||
83 | ns = container_of(kref, struct user_namespace, kref); | 83 | ns = container_of(kref, struct user_namespace, kref); |
84 | release_uids(ns); | ||
84 | kfree(ns); | 85 | kfree(ns); |
85 | } | 86 | } |
86 | 87 | ||
diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d8..816d7b24fa03 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
28 | if (!ns) | 28 | if (!ns) |
29 | return ERR_PTR(-ENOMEM); | 29 | return ERR_PTR(-ENOMEM); |
30 | 30 | ||
31 | down_read(&uts_sem); | ||
31 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 32 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
33 | up_read(&uts_sem); | ||
32 | kref_init(&ns->kref); | 34 | kref_init(&ns->kref); |
33 | return ns; | 35 | return ns; |
34 | } | 36 | } |
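Editorial note: the utsname hunk wraps the copy of the parent namespace's name in down_read(&uts_sem)/up_read(&uts_sem), so a concurrent writer updating the UTS name cannot be observed half-copied. A generic sketch of that snapshot-under-a-reader-lock pattern, with illustrative names (shared_lock, shared_data, snapshot_shared are not kernel symbols):

```c
#include <linux/rwsem.h>
#include <linux/string.h>

static DECLARE_RWSEM(shared_lock);
static char shared_data[64];

/*
 * Copy shared data under the reader side of an rwsem; writers take the
 * semaphore for writing, so the snapshot is never seen mid-update.
 */
static void snapshot_shared(char *dst, size_t len)
{
	down_read(&shared_lock);
	memcpy(dst, shared_data, len);
	up_read(&shared_lock);
}
```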
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6bb..e080d1d744cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -635,7 +635,7 @@ int keventd_up(void) | |||
635 | int current_is_keventd(void) | 635 | int current_is_keventd(void) |
636 | { | 636 | { |
637 | struct cpu_workqueue_struct *cwq; | 637 | struct cpu_workqueue_struct *cwq; |
638 | int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | 638 | int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ |
639 | int ret = 0; | 639 | int ret = 0; |
640 | 640 | ||
641 | BUG_ON(!keventd_wq); | 641 | BUG_ON(!keventd_wq); |
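Editorial note: the workqueue hunk replaces smp_processor_id() with raw_smp_processor_id() in current_is_keventd(). With CONFIG_DEBUG_PREEMPT, the former warns when called from preemptible context because the returned CPU number may be stale by the time it is used; the raw variant skips that check for callers whose answer stays meaningful even across migration, which the in-line comment argues is the case here since keventd is per-cpu. A hedged sketch contrasting the two (function names are illustrative):

```c
#include <linux/smp.h>
#include <linux/preempt.h>

/* Checked variant: pin to the current CPU so the debug check is satisfied. */
static int sample_cpu_checked(void)
{
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	preempt_enable();
	return cpu;
}

/* Unchecked variant: caller guarantees the result is valid on any CPU. */
static int sample_cpu_unchecked(void)
{
	return raw_smp_processor_id();
}
```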