diff options
author | Dave Jones <davej@redhat.com> | 2006-12-12 17:41:41 -0500 |
---|---|---|
committer | Dave Jones <davej@redhat.com> | 2006-12-12 17:41:41 -0500 |
commit | c4366889dda8110247be59ca41fddb82951a8c26 (patch) | |
tree | 705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /kernel | |
parent | db2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff) | |
parent | e1036502e5263851259d147771226161e5ccc85a (diff) |
Merge ../linus
Conflicts:
drivers/cpufreq/cpufreq.c
Diffstat (limited to 'kernel')
67 files changed, 3483 insertions, 1699 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 248e1c396f8b..4af15802ccd4 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz | |||
@@ -7,7 +7,7 @@ choice | |||
7 | default HZ_250 | 7 | default HZ_250 |
8 | help | 8 | help |
9 | Allows the configuration of the timer frequency. It is customary | 9 | Allows the configuration of the timer frequency. It is customary |
10 | to have the timer interrupt run at 1000 HZ but 100 HZ may be more | 10 | to have the timer interrupt run at 1000 Hz but 100 Hz may be more |
11 | beneficial for servers and NUMA systems that do not need to have | 11 | beneficial for servers and NUMA systems that do not need to have |
12 | a fast response for user interaction and that may experience bus | 12 | a fast response for user interaction and that may experience bus |
13 | contention and cacheline bounces as a result of timer interrupts. | 13 | contention and cacheline bounces as a result of timer interrupts. |
@@ -19,21 +19,30 @@ choice | |||
19 | config HZ_100 | 19 | config HZ_100 |
20 | bool "100 HZ" | 20 | bool "100 HZ" |
21 | help | 21 | help |
22 | 100 HZ is a typical choice for servers, SMP and NUMA systems | 22 | 100 Hz is a typical choice for servers, SMP and NUMA systems |
23 | with lots of processors that may show reduced performance if | 23 | with lots of processors that may show reduced performance if |
24 | too many timer interrupts are occurring. | 24 | too many timer interrupts are occurring. |
25 | 25 | ||
26 | config HZ_250 | 26 | config HZ_250 |
27 | bool "250 HZ" | 27 | bool "250 HZ" |
28 | help | 28 | help |
29 | 250 HZ is a good compromise choice allowing server performance | 29 | 250 Hz is a good compromise choice allowing server performance |
30 | while also showing good interactive responsiveness even | 30 | while also showing good interactive responsiveness even |
31 | on SMP and NUMA systems. | 31 | on SMP and NUMA systems. If you are going to be using NTSC video |
32 | or multimedia, selected 300Hz instead. | ||
33 | |||
34 | config HZ_300 | ||
35 | bool "300 HZ" | ||
36 | help | ||
37 | 300 Hz is a good compromise choice allowing server performance | ||
38 | while also showing good interactive responsiveness even | ||
39 | on SMP and NUMA systems and exactly dividing by both PAL and | ||
40 | NTSC frame rates for video and multimedia work. | ||
32 | 41 | ||
33 | config HZ_1000 | 42 | config HZ_1000 |
34 | bool "1000 HZ" | 43 | bool "1000 HZ" |
35 | help | 44 | help |
36 | 1000 HZ is the preferred choice for desktop systems and other | 45 | 1000 Hz is the preferred choice for desktop systems and other |
37 | systems requiring fast interactive responses to events. | 46 | systems requiring fast interactive responses to events. |
38 | 47 | ||
39 | endchoice | 48 | endchoice |
@@ -42,5 +51,6 @@ config HZ | |||
42 | int | 51 | int |
43 | default 100 if HZ_100 | 52 | default 100 if HZ_100 |
44 | default 250 if HZ_250 | 53 | default 250 if HZ_250 |
54 | default 300 if HZ_300 | ||
45 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
46 | 56 | ||
diff --git a/kernel/acct.c b/kernel/acct.c index 0aad5ca36a81..70d0d88e5554 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -89,7 +89,8 @@ struct acct_glbs { | |||
89 | struct timer_list timer; | 89 | struct timer_list timer; |
90 | }; | 90 | }; |
91 | 91 | ||
92 | static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; | 92 | static struct acct_glbs acct_globals __cacheline_aligned = |
93 | {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; | ||
93 | 94 | ||
94 | /* | 95 | /* |
95 | * Called whenever the timer says to check the free space. | 96 | * Called whenever the timer says to check the free space. |
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file) | |||
117 | spin_unlock(&acct_globals.lock); | 118 | spin_unlock(&acct_globals.lock); |
118 | 119 | ||
119 | /* May block */ | 120 | /* May block */ |
120 | if (vfs_statfs(file->f_dentry, &sbuf)) | 121 | if (vfs_statfs(file->f_path.dentry, &sbuf)) |
121 | return res; | 122 | return res; |
122 | suspend = sbuf.f_blocks * SUSPEND; | 123 | suspend = sbuf.f_blocks * SUSPEND; |
123 | resume = sbuf.f_blocks * RESUME; | 124 | resume = sbuf.f_blocks * RESUME; |
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file) | |||
193 | add_timer(&acct_globals.timer); | 194 | add_timer(&acct_globals.timer); |
194 | } | 195 | } |
195 | if (old_acct) { | 196 | if (old_acct) { |
196 | mnt_unpin(old_acct->f_vfsmnt); | 197 | mnt_unpin(old_acct->f_path.mnt); |
197 | spin_unlock(&acct_globals.lock); | 198 | spin_unlock(&acct_globals.lock); |
198 | do_acct_process(old_acct); | 199 | do_acct_process(old_acct); |
199 | filp_close(old_acct, NULL); | 200 | filp_close(old_acct, NULL); |
@@ -211,7 +212,7 @@ static int acct_on(char *name) | |||
211 | if (IS_ERR(file)) | 212 | if (IS_ERR(file)) |
212 | return PTR_ERR(file); | 213 | return PTR_ERR(file); |
213 | 214 | ||
214 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | 215 | if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { |
215 | filp_close(file, NULL); | 216 | filp_close(file, NULL); |
216 | return -EACCES; | 217 | return -EACCES; |
217 | } | 218 | } |
@@ -228,11 +229,11 @@ static int acct_on(char *name) | |||
228 | } | 229 | } |
229 | 230 | ||
230 | spin_lock(&acct_globals.lock); | 231 | spin_lock(&acct_globals.lock); |
231 | mnt_pin(file->f_vfsmnt); | 232 | mnt_pin(file->f_path.mnt); |
232 | acct_file_reopen(file); | 233 | acct_file_reopen(file); |
233 | spin_unlock(&acct_globals.lock); | 234 | spin_unlock(&acct_globals.lock); |
234 | 235 | ||
235 | mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ | 236 | mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ |
236 | 237 | ||
237 | return 0; | 238 | return 0; |
238 | } | 239 | } |
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) | |||
282 | void acct_auto_close_mnt(struct vfsmount *m) | 283 | void acct_auto_close_mnt(struct vfsmount *m) |
283 | { | 284 | { |
284 | spin_lock(&acct_globals.lock); | 285 | spin_lock(&acct_globals.lock); |
285 | if (acct_globals.file && acct_globals.file->f_vfsmnt == m) | 286 | if (acct_globals.file && acct_globals.file->f_path.mnt == m) |
286 | acct_file_reopen(NULL); | 287 | acct_file_reopen(NULL); |
287 | spin_unlock(&acct_globals.lock); | 288 | spin_unlock(&acct_globals.lock); |
288 | } | 289 | } |
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb) | |||
298 | { | 299 | { |
299 | spin_lock(&acct_globals.lock); | 300 | spin_lock(&acct_globals.lock); |
300 | if (acct_globals.file && | 301 | if (acct_globals.file && |
301 | acct_globals.file->f_vfsmnt->mnt_sb == sb) { | 302 | acct_globals.file->f_path.mnt->mnt_sb == sb) { |
302 | acct_file_reopen(NULL); | 303 | acct_file_reopen(NULL); |
303 | } | 304 | } |
304 | spin_unlock(&acct_globals.lock); | 305 | spin_unlock(&acct_globals.lock); |
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file) | |||
427 | u64 elapsed; | 428 | u64 elapsed; |
428 | u64 run_time; | 429 | u64 run_time; |
429 | struct timespec uptime; | 430 | struct timespec uptime; |
431 | struct tty_struct *tty; | ||
430 | 432 | ||
431 | /* | 433 | /* |
432 | * First check to see if there is enough free_space to continue | 434 | * First check to see if there is enough free_space to continue |
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file) | |||
483 | ac.ac_ppid = current->parent->tgid; | 485 | ac.ac_ppid = current->parent->tgid; |
484 | #endif | 486 | #endif |
485 | 487 | ||
486 | mutex_lock(&tty_mutex); | ||
487 | /* FIXME: Whoever is responsible for current->signal locking needs | ||
488 | to use the same locking all over the kernel and document it */ | ||
489 | read_lock(&tasklist_lock); | ||
490 | ac.ac_tty = current->signal->tty ? | ||
491 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | ||
492 | read_unlock(&tasklist_lock); | ||
493 | mutex_unlock(&tty_mutex); | ||
494 | |||
495 | spin_lock_irq(&current->sighand->siglock); | 488 | spin_lock_irq(&current->sighand->siglock); |
489 | tty = current->signal->tty; | ||
490 | ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | ||
496 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | 491 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
497 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | 492 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
498 | ac.ac_flag = pacct->ac_flag; | 493 | ac.ac_flag = pacct->ac_flag; |
diff --git a/kernel/audit.c b/kernel/audit.c index 98106f6078b0..d9b690ac684b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
58 | #include <linux/selinux.h> | 58 | #include <linux/selinux.h> |
59 | #include <linux/inotify.h> | 59 | #include <linux/inotify.h> |
60 | #include <linux/freezer.h> | ||
60 | 61 | ||
61 | #include "audit.h" | 62 | #include "audit.h" |
62 | 63 | ||
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4f40d923af8e..2e896f8ae29e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) | |||
636 | struct audit_rule *rule; | 636 | struct audit_rule *rule; |
637 | int i; | 637 | int i; |
638 | 638 | ||
639 | rule = kmalloc(sizeof(*rule), GFP_KERNEL); | 639 | rule = kzalloc(sizeof(*rule), GFP_KERNEL); |
640 | if (unlikely(!rule)) | 640 | if (unlikely(!rule)) |
641 | return NULL; | 641 | return NULL; |
642 | memset(rule, 0, sizeof(*rule)); | ||
643 | 642 | ||
644 | rule->flags = krule->flags | krule->listnr; | 643 | rule->flags = krule->flags | krule->listnr; |
645 | rule->action = krule->action; | 644 | rule->action = krule->action; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 42f2f1179711..298897559ca4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -64,6 +64,7 @@ | |||
64 | #include <linux/tty.h> | 64 | #include <linux/tty.h> |
65 | #include <linux/selinux.h> | 65 | #include <linux/selinux.h> |
66 | #include <linux/binfmts.h> | 66 | #include <linux/binfmts.h> |
67 | #include <linux/highmem.h> | ||
67 | #include <linux/syscalls.h> | 68 | #include <linux/syscalls.h> |
68 | 69 | ||
69 | #include "audit.h" | 70 | #include "audit.h" |
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context) | |||
730 | printk(KERN_ERR "audit: freed %d contexts\n", count); | 731 | printk(KERN_ERR "audit: freed %d contexts\n", count); |
731 | } | 732 | } |
732 | 733 | ||
733 | static void audit_log_task_context(struct audit_buffer *ab) | 734 | void audit_log_task_context(struct audit_buffer *ab) |
734 | { | 735 | { |
735 | char *ctx = NULL; | 736 | char *ctx = NULL; |
736 | ssize_t len = 0; | 737 | ssize_t len = 0; |
@@ -759,6 +760,8 @@ error_path: | |||
759 | return; | 760 | return; |
760 | } | 761 | } |
761 | 762 | ||
763 | EXPORT_SYMBOL(audit_log_task_context); | ||
764 | |||
762 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 765 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
763 | { | 766 | { |
764 | char name[sizeof(tsk->comm)]; | 767 | char name[sizeof(tsk->comm)]; |
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
778 | if ((vma->vm_flags & VM_EXECUTABLE) && | 781 | if ((vma->vm_flags & VM_EXECUTABLE) && |
779 | vma->vm_file) { | 782 | vma->vm_file) { |
780 | audit_log_d_path(ab, "exe=", | 783 | audit_log_d_path(ab, "exe=", |
781 | vma->vm_file->f_dentry, | 784 | vma->vm_file->f_path.dentry, |
782 | vma->vm_file->f_vfsmnt); | 785 | vma->vm_file->f_path.mnt); |
783 | break; | 786 | break; |
784 | } | 787 | } |
785 | vma = vma->vm_next; | 788 | vma = vma->vm_next; |
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
823 | context->return_code); | 826 | context->return_code); |
824 | 827 | ||
825 | mutex_lock(&tty_mutex); | 828 | mutex_lock(&tty_mutex); |
829 | read_lock(&tasklist_lock); | ||
826 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 830 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) |
827 | tty = tsk->signal->tty->name; | 831 | tty = tsk->signal->tty->name; |
828 | else | 832 | else |
829 | tty = "(none)"; | 833 | tty = "(none)"; |
834 | read_unlock(&tasklist_lock); | ||
830 | audit_log_format(ab, | 835 | audit_log_format(ab, |
831 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 836 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
832 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 837 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx) | |||
1487 | return ctx ? ctx->loginuid : -1; | 1492 | return ctx ? ctx->loginuid : -1; |
1488 | } | 1493 | } |
1489 | 1494 | ||
1495 | EXPORT_SYMBOL(audit_get_loginuid); | ||
1496 | |||
1490 | /** | 1497 | /** |
1491 | * __audit_mq_open - record audit data for a POSIX MQ open | 1498 | * __audit_mq_open - record audit data for a POSIX MQ open |
1492 | * @oflag: open flag | 1499 | * @oflag: open flag |
diff --git a/kernel/compat.c b/kernel/compat.c index 75573e5d27b0..6952dd057300 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event, | |||
678 | ? -EFAULT : 0; | 678 | ? -EFAULT : 0; |
679 | } | 679 | } |
680 | 680 | ||
681 | long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, | 681 | long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, |
682 | unsigned long bitmap_size) | 682 | unsigned long bitmap_size) |
683 | { | 683 | { |
684 | int i, j; | 684 | int i, j; |
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
982 | } | 982 | } |
983 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); | 983 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); |
984 | } | 984 | } |
985 | |||
986 | asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, | ||
987 | compat_ulong_t maxnode, | ||
988 | const compat_ulong_t __user *old_nodes, | ||
989 | const compat_ulong_t __user *new_nodes) | ||
990 | { | ||
991 | unsigned long __user *old = NULL; | ||
992 | unsigned long __user *new = NULL; | ||
993 | nodemask_t tmp_mask; | ||
994 | unsigned long nr_bits; | ||
995 | unsigned long size; | ||
996 | |||
997 | nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); | ||
998 | size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
999 | if (old_nodes) { | ||
1000 | if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) | ||
1001 | return -EFAULT; | ||
1002 | old = compat_alloc_user_space(new_nodes ? size * 2 : size); | ||
1003 | if (new_nodes) | ||
1004 | new = old + size / sizeof(unsigned long); | ||
1005 | if (copy_to_user(old, nodes_addr(tmp_mask), size)) | ||
1006 | return -EFAULT; | ||
1007 | } | ||
1008 | if (new_nodes) { | ||
1009 | if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) | ||
1010 | return -EFAULT; | ||
1011 | if (new == NULL) | ||
1012 | new = compat_alloc_user_space(size); | ||
1013 | if (copy_to_user(new, nodes_addr(tmp_mask), size)) | ||
1014 | return -EFAULT; | ||
1015 | } | ||
1016 | return sys_migrate_pages(pid, nr_bits + 1, old, new); | ||
1017 | } | ||
985 | #endif | 1018 | #endif |
diff --git a/kernel/configs.c b/kernel/configs.c index f9e31974f4ad..8fa1fb28f8a7 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf, | |||
75 | return count; | 75 | return count; |
76 | } | 76 | } |
77 | 77 | ||
78 | static struct file_operations ikconfig_file_ops = { | 78 | static const struct file_operations ikconfig_file_ops = { |
79 | .owner = THIS_MODULE, | 79 | .owner = THIS_MODULE, |
80 | .read = ikconfig_read_current, | 80 | .read = ikconfig_read_current, |
81 | }; | 81 | }; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 32c96628463e..9124669f4586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -19,7 +19,7 @@ | |||
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | static DEFINE_MUTEX(cpu_bitmask_lock); | 20 | static DEFINE_MUTEX(cpu_bitmask_lock); |
21 | 21 | ||
22 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); | 22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
23 | 23 | ||
24 | /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. | 24 | /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. |
25 | * Should always be manipulated under cpu_add_remove_lock | 25 | * Should always be manipulated under cpu_add_remove_lock |
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void) | |||
58 | recursive_depth--; | 58 | recursive_depth--; |
59 | return; | 59 | return; |
60 | } | 60 | } |
61 | mutex_unlock(&cpu_bitmask_lock); | ||
62 | recursive = NULL; | 61 | recursive = NULL; |
62 | mutex_unlock(&cpu_bitmask_lock); | ||
63 | } | 63 | } |
64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); |
65 | 65 | ||
@@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | |||
68 | /* Need to know about CPUs going up/down? */ | 68 | /* Need to know about CPUs going up/down? */ |
69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
70 | { | 70 | { |
71 | return blocking_notifier_chain_register(&cpu_chain, nb); | 71 | int ret; |
72 | mutex_lock(&cpu_add_remove_lock); | ||
73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | ||
74 | mutex_unlock(&cpu_add_remove_lock); | ||
75 | return ret; | ||
72 | } | 76 | } |
73 | 77 | ||
74 | #ifdef CONFIG_HOTPLUG_CPU | 78 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
77 | 81 | ||
78 | void unregister_cpu_notifier(struct notifier_block *nb) | 82 | void unregister_cpu_notifier(struct notifier_block *nb) |
79 | { | 83 | { |
80 | blocking_notifier_chain_unregister(&cpu_chain, nb); | 84 | mutex_lock(&cpu_add_remove_lock); |
85 | raw_notifier_chain_unregister(&cpu_chain, nb); | ||
86 | mutex_unlock(&cpu_add_remove_lock); | ||
81 | } | 87 | } |
82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 88 | EXPORT_SYMBOL(unregister_cpu_notifier); |
83 | 89 | ||
@@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu) | |||
126 | if (!cpu_online(cpu)) | 132 | if (!cpu_online(cpu)) |
127 | return -EINVAL; | 133 | return -EINVAL; |
128 | 134 | ||
129 | err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, | 135 | err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, |
130 | (void *)(long)cpu); | 136 | (void *)(long)cpu); |
131 | if (err == NOTIFY_BAD) { | 137 | if (err == NOTIFY_BAD) { |
132 | printk("%s: attempt to take down CPU %u failed\n", | 138 | printk("%s: attempt to take down CPU %u failed\n", |
@@ -144,18 +150,18 @@ static int _cpu_down(unsigned int cpu) | |||
144 | p = __stop_machine_run(take_cpu_down, NULL, cpu); | 150 | p = __stop_machine_run(take_cpu_down, NULL, cpu); |
145 | mutex_unlock(&cpu_bitmask_lock); | 151 | mutex_unlock(&cpu_bitmask_lock); |
146 | 152 | ||
147 | if (IS_ERR(p)) { | 153 | if (IS_ERR(p) || cpu_online(cpu)) { |
148 | /* CPU didn't die: tell everyone. Can't complain. */ | 154 | /* CPU didn't die: tell everyone. Can't complain. */ |
149 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, | 155 | if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, |
150 | (void *)(long)cpu) == NOTIFY_BAD) | 156 | (void *)(long)cpu) == NOTIFY_BAD) |
151 | BUG(); | 157 | BUG(); |
152 | 158 | ||
153 | err = PTR_ERR(p); | 159 | if (IS_ERR(p)) { |
154 | goto out_allowed; | 160 | err = PTR_ERR(p); |
155 | } | 161 | goto out_allowed; |
156 | 162 | } | |
157 | if (cpu_online(cpu)) | ||
158 | goto out_thread; | 163 | goto out_thread; |
164 | } | ||
159 | 165 | ||
160 | /* Wait for it to sleep (leaving idle task). */ | 166 | /* Wait for it to sleep (leaving idle task). */ |
161 | while (!idle_cpu(cpu)) | 167 | while (!idle_cpu(cpu)) |
@@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu) | |||
169 | put_cpu(); | 175 | put_cpu(); |
170 | 176 | ||
171 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 177 | /* CPU is completely dead: tell everyone. Too late to complain. */ |
172 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, | 178 | if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, |
173 | (void *)(long)cpu) == NOTIFY_BAD) | 179 | (void *)(long)cpu) == NOTIFY_BAD) |
174 | BUG(); | 180 | BUG(); |
175 | 181 | ||
@@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu) | |||
206 | if (cpu_online(cpu) || !cpu_present(cpu)) | 212 | if (cpu_online(cpu) || !cpu_present(cpu)) |
207 | return -EINVAL; | 213 | return -EINVAL; |
208 | 214 | ||
209 | ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | 215 | ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); |
210 | if (ret == NOTIFY_BAD) { | 216 | if (ret == NOTIFY_BAD) { |
211 | printk("%s: attempt to bring up CPU %u failed\n", | 217 | printk("%s: attempt to bring up CPU %u failed\n", |
212 | __FUNCTION__, cpu); | 218 | __FUNCTION__, cpu); |
@@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu) | |||
223 | BUG_ON(!cpu_online(cpu)); | 229 | BUG_ON(!cpu_online(cpu)); |
224 | 230 | ||
225 | /* Now call notifier in preparation. */ | 231 | /* Now call notifier in preparation. */ |
226 | blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); | 232 | raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); |
227 | 233 | ||
228 | out_notify: | 234 | out_notify: |
229 | if (ret != 0) | 235 | if (ret != 0) |
230 | blocking_notifier_call_chain(&cpu_chain, | 236 | raw_notifier_call_chain(&cpu_chain, |
231 | CPU_UP_CANCELED, hcpu); | 237 | CPU_UP_CANCELED, hcpu); |
232 | 238 | ||
233 | return ret; | 239 | return ret; |
@@ -264,11 +270,7 @@ int disable_nonboot_cpus(void) | |||
264 | goto out; | 270 | goto out; |
265 | } | 271 | } |
266 | } | 272 | } |
267 | error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); | 273 | |
268 | if (error) { | ||
269 | printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); | ||
270 | goto out; | ||
271 | } | ||
272 | /* We take down all of the non-boot CPUs in one shot to avoid races | 274 | /* We take down all of the non-boot CPUs in one shot to avoid races |
273 | * with the userspace trying to use the CPU hotplug at the same time | 275 | * with the userspace trying to use the CPU hotplug at the same time |
274 | */ | 276 | */ |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6313c38c930e..2c3b4431472b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { | |||
413 | * | 413 | * |
414 | * | 414 | * |
415 | * When reading/writing to a file: | 415 | * When reading/writing to a file: |
416 | * - the cpuset to use in file->f_dentry->d_parent->d_fsdata | 416 | * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata |
417 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | 417 | * - the 'cftype' of the file is file->f_path.dentry->d_fsdata |
418 | */ | 418 | */ |
419 | 419 | ||
420 | struct cftype { | 420 | struct cftype { |
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
729 | } | 729 | } |
730 | 730 | ||
731 | /* Remaining checks don't apply to root cpuset */ | 731 | /* Remaining checks don't apply to root cpuset */ |
732 | if ((par = cur->parent) == NULL) | 732 | if (cur == &top_cpuset) |
733 | return 0; | 733 | return 0; |
734 | 734 | ||
735 | par = cur->parent; | ||
736 | |||
735 | /* We must be a subset of our parent cpuset */ | 737 | /* We must be a subset of our parent cpuset */ |
736 | if (!is_cpuset_subset(trial, par)) | 738 | if (!is_cpuset_subset(trial, par)) |
737 | return -EACCES; | 739 | return -EACCES; |
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
1060 | cpu_exclusive_changed = | 1062 | cpu_exclusive_changed = |
1061 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 1063 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
1062 | mutex_lock(&callback_mutex); | 1064 | mutex_lock(&callback_mutex); |
1063 | if (turning_on) | 1065 | cs->flags = trialcs.flags; |
1064 | set_bit(bit, &cs->flags); | ||
1065 | else | ||
1066 | clear_bit(bit, &cs->flags); | ||
1067 | mutex_unlock(&callback_mutex); | 1066 | mutex_unlock(&callback_mutex); |
1068 | 1067 | ||
1069 | if (cpu_exclusive_changed) | 1068 | if (cpu_exclusive_changed) |
@@ -1281,18 +1280,19 @@ typedef enum { | |||
1281 | FILE_TASKLIST, | 1280 | FILE_TASKLIST, |
1282 | } cpuset_filetype_t; | 1281 | } cpuset_filetype_t; |
1283 | 1282 | ||
1284 | static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, | 1283 | static ssize_t cpuset_common_file_write(struct file *file, |
1284 | const char __user *userbuf, | ||
1285 | size_t nbytes, loff_t *unused_ppos) | 1285 | size_t nbytes, loff_t *unused_ppos) |
1286 | { | 1286 | { |
1287 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1287 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
1288 | struct cftype *cft = __d_cft(file->f_dentry); | 1288 | struct cftype *cft = __d_cft(file->f_path.dentry); |
1289 | cpuset_filetype_t type = cft->private; | 1289 | cpuset_filetype_t type = cft->private; |
1290 | char *buffer; | 1290 | char *buffer; |
1291 | char *pathbuf = NULL; | 1291 | char *pathbuf = NULL; |
1292 | int retval = 0; | 1292 | int retval = 0; |
1293 | 1293 | ||
1294 | /* Crude upper limit on largest legitimate cpulist user might write. */ | 1294 | /* Crude upper limit on largest legitimate cpulist user might write. */ |
1295 | if (nbytes > 100 + 6 * NR_CPUS) | 1295 | if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) |
1296 | return -E2BIG; | 1296 | return -E2BIG; |
1297 | 1297 | ||
1298 | /* +1 for nul-terminator */ | 1298 | /* +1 for nul-terminator */ |
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, | |||
1367 | size_t nbytes, loff_t *ppos) | 1367 | size_t nbytes, loff_t *ppos) |
1368 | { | 1368 | { |
1369 | ssize_t retval = 0; | 1369 | ssize_t retval = 0; |
1370 | struct cftype *cft = __d_cft(file->f_dentry); | 1370 | struct cftype *cft = __d_cft(file->f_path.dentry); |
1371 | if (!cft) | 1371 | if (!cft) |
1372 | return -ENODEV; | 1372 | return -ENODEV; |
1373 | 1373 | ||
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | 1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, |
1418 | size_t nbytes, loff_t *ppos) | 1418 | size_t nbytes, loff_t *ppos) |
1419 | { | 1419 | { |
1420 | struct cftype *cft = __d_cft(file->f_dentry); | 1420 | struct cftype *cft = __d_cft(file->f_path.dentry); |
1421 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1421 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
1422 | cpuset_filetype_t type = cft->private; | 1422 | cpuset_filetype_t type = cft->private; |
1423 | char *page; | 1423 | char *page; |
1424 | ssize_t retval = 0; | 1424 | ssize_t retval = 0; |
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt | |||
1476 | loff_t *ppos) | 1476 | loff_t *ppos) |
1477 | { | 1477 | { |
1478 | ssize_t retval = 0; | 1478 | ssize_t retval = 0; |
1479 | struct cftype *cft = __d_cft(file->f_dentry); | 1479 | struct cftype *cft = __d_cft(file->f_path.dentry); |
1480 | if (!cft) | 1480 | if (!cft) |
1481 | return -ENODEV; | 1481 | return -ENODEV; |
1482 | 1482 | ||
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
1498 | if (err) | 1498 | if (err) |
1499 | return err; | 1499 | return err; |
1500 | 1500 | ||
1501 | cft = __d_cft(file->f_dentry); | 1501 | cft = __d_cft(file->f_path.dentry); |
1502 | if (!cft) | 1502 | if (!cft) |
1503 | return -ENODEV; | 1503 | return -ENODEV; |
1504 | if (cft->open) | 1504 | if (cft->open) |
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
1511 | 1511 | ||
1512 | static int cpuset_file_release(struct inode *inode, struct file *file) | 1512 | static int cpuset_file_release(struct inode *inode, struct file *file) |
1513 | { | 1513 | { |
1514 | struct cftype *cft = __d_cft(file->f_dentry); | 1514 | struct cftype *cft = __d_cft(file->f_path.dentry); |
1515 | if (cft->release) | 1515 | if (cft->release) |
1516 | return cft->release(inode, file); | 1516 | return cft->release(inode, file); |
1517 | return 0; | 1517 | return 0; |
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
1532 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 1532 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); |
1533 | } | 1533 | } |
1534 | 1534 | ||
1535 | static struct file_operations cpuset_file_operations = { | 1535 | static const struct file_operations cpuset_file_operations = { |
1536 | .read = cpuset_file_read, | 1536 | .read = cpuset_file_read, |
1537 | .write = cpuset_file_write, | 1537 | .write = cpuset_file_write, |
1538 | .llseek = generic_file_llseek, | 1538 | .llseek = generic_file_llseek, |
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1700 | */ | 1700 | */ |
1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1702 | { | 1702 | { |
1703 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1703 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
1704 | struct ctr_struct *ctr; | 1704 | struct ctr_struct *ctr; |
1705 | pid_t *pidarray; | 1705 | pid_t *pidarray; |
1706 | int npids; | 1706 | int npids; |
@@ -2045,7 +2045,6 @@ out: | |||
2045 | return err; | 2045 | return err; |
2046 | } | 2046 | } |
2047 | 2047 | ||
2048 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) | ||
2049 | /* | 2048 | /* |
2050 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 2049 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs |
2051 | * or memory nodes, we need to walk over the cpuset hierarchy, | 2050 | * or memory nodes, we need to walk over the cpuset hierarchy, |
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
2109 | mutex_unlock(&callback_mutex); | 2108 | mutex_unlock(&callback_mutex); |
2110 | mutex_unlock(&manage_mutex); | 2109 | mutex_unlock(&manage_mutex); |
2111 | } | 2110 | } |
2112 | #endif | ||
2113 | 2111 | ||
2114 | #ifdef CONFIG_HOTPLUG_CPU | ||
2115 | /* | 2112 | /* |
2116 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 2113 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
2117 | * period. This is necessary in order to make cpusets transparent | 2114 | * period. This is necessary in order to make cpusets transparent |
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, | |||
2128 | common_cpu_mem_hotplug_unplug(); | 2125 | common_cpu_mem_hotplug_unplug(); |
2129 | return 0; | 2126 | return 0; |
2130 | } | 2127 | } |
2131 | #endif | ||
2132 | 2128 | ||
2133 | #ifdef CONFIG_MEMORY_HOTPLUG | 2129 | #ifdef CONFIG_MEMORY_HOTPLUG |
2134 | /* | 2130 | /* |
@@ -2610,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file) | |||
2610 | return single_open(file, proc_cpuset_show, pid); | 2606 | return single_open(file, proc_cpuset_show, pid); |
2611 | } | 2607 | } |
2612 | 2608 | ||
2613 | struct file_operations proc_cpuset_operations = { | 2609 | const struct file_operations proc_cpuset_operations = { |
2614 | .open = cpuset_open, | 2610 | .open = cpuset_open, |
2615 | .read = seq_read, | 2611 | .read = seq_read, |
2616 | .llseek = seq_lseek, | 2612 | .llseek = seq_lseek, |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 36752f124c6a..766d5912b26a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/delayacct.h> | 20 | #include <linux/delayacct.h> |
21 | 21 | ||
22 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | 22 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ |
23 | kmem_cache_t *delayacct_cache; | 23 | struct kmem_cache *delayacct_cache; |
24 | 24 | ||
25 | static int __init delayacct_setup_disable(char *str) | 25 | static int __init delayacct_setup_disable(char *str) |
26 | { | 26 | { |
@@ -41,7 +41,7 @@ void delayacct_init(void) | |||
41 | 41 | ||
42 | void __delayacct_tsk_init(struct task_struct *tsk) | 42 | void __delayacct_tsk_init(struct task_struct *tsk) |
43 | { | 43 | { |
44 | tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); | 44 | tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); |
45 | if (tsk->delays) | 45 | if (tsk->delays) |
46 | spin_lock_init(&tsk->delays->lock); | 46 | spin_lock_init(&tsk->delays->lock); |
47 | } | 47 | } |
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, | |||
66 | { | 66 | { |
67 | struct timespec ts; | 67 | struct timespec ts; |
68 | s64 ns; | 68 | s64 ns; |
69 | unsigned long flags; | ||
69 | 70 | ||
70 | do_posix_clock_monotonic_gettime(end); | 71 | do_posix_clock_monotonic_gettime(end); |
71 | ts = timespec_sub(*end, *start); | 72 | ts = timespec_sub(*end, *start); |
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end, | |||
73 | if (ns < 0) | 74 | if (ns < 0) |
74 | return; | 75 | return; |
75 | 76 | ||
76 | spin_lock(¤t->delays->lock); | 77 | spin_lock_irqsave(¤t->delays->lock, flags); |
77 | *total += ns; | 78 | *total += ns; |
78 | (*count)++; | 79 | (*count)++; |
79 | spin_unlock(¤t->delays->lock); | 80 | spin_unlock_irqrestore(¤t->delays->lock, flags); |
80 | } | 81 | } |
81 | 82 | ||
82 | void __delayacct_blkio_start(void) | 83 | void __delayacct_blkio_start(void) |
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
104 | s64 tmp; | 105 | s64 tmp; |
105 | struct timespec ts; | 106 | struct timespec ts; |
106 | unsigned long t1,t2,t3; | 107 | unsigned long t1,t2,t3; |
108 | unsigned long flags; | ||
107 | 109 | ||
108 | /* Though tsk->delays accessed later, early exit avoids | 110 | /* Though tsk->delays accessed later, early exit avoids |
109 | * unnecessary returning of other data | 111 | * unnecessary returning of other data |
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | |||
136 | 138 | ||
137 | /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ | 139 | /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ |
138 | 140 | ||
139 | spin_lock(&tsk->delays->lock); | 141 | spin_lock_irqsave(&tsk->delays->lock, flags); |
140 | tmp = d->blkio_delay_total + tsk->delays->blkio_delay; | 142 | tmp = d->blkio_delay_total + tsk->delays->blkio_delay; |
141 | d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; | 143 | d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; |
142 | tmp = d->swapin_delay_total + tsk->delays->swapin_delay; | 144 | tmp = d->swapin_delay_total + tsk->delays->swapin_delay; |
143 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; | 145 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; |
144 | d->blkio_count += tsk->delays->blkio_count; | 146 | d->blkio_count += tsk->delays->blkio_count; |
145 | d->swapin_count += tsk->delays->swapin_count; | 147 | d->swapin_count += tsk->delays->swapin_count; |
146 | spin_unlock(&tsk->delays->lock); | 148 | spin_unlock_irqrestore(&tsk->delays->lock, flags); |
147 | 149 | ||
148 | done: | 150 | done: |
149 | return 0; | 151 | return 0; |
@@ -152,11 +154,12 @@ done: | |||
152 | __u64 __delayacct_blkio_ticks(struct task_struct *tsk) | 154 | __u64 __delayacct_blkio_ticks(struct task_struct *tsk) |
153 | { | 155 | { |
154 | __u64 ret; | 156 | __u64 ret; |
157 | unsigned long flags; | ||
155 | 158 | ||
156 | spin_lock(&tsk->delays->lock); | 159 | spin_lock_irqsave(&tsk->delays->lock, flags); |
157 | ret = nsec_to_clock_t(tsk->delays->blkio_delay + | 160 | ret = nsec_to_clock_t(tsk->delays->blkio_delay + |
158 | tsk->delays->swapin_delay); | 161 | tsk->delays->swapin_delay); |
159 | spin_unlock(&tsk->delays->lock); | 162 | spin_unlock_irqrestore(&tsk->delays->lock, flags); |
160 | return ret; | 163 | return ret; |
161 | } | 164 | } |
162 | 165 | ||
diff --git a/kernel/dma.c b/kernel/dma.c index 2020644c938a..937b13ca33ba 100644 --- a/kernel/dma.c +++ b/kernel/dma.c | |||
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file) | |||
140 | return single_open(file, proc_dma_show, NULL); | 140 | return single_open(file, proc_dma_show, NULL); |
141 | } | 141 | } |
142 | 142 | ||
143 | static struct file_operations proc_dma_operations = { | 143 | static const struct file_operations proc_dma_operations = { |
144 | .open = proc_dma_open, | 144 | .open = proc_dma_open, |
145 | .read = seq_read, | 145 | .read = seq_read, |
146 | .llseek = seq_lseek, | 146 | .llseek = seq_lseek, |
diff --git a/kernel/exit.c b/kernel/exit.c index f250a5e3e281..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
14 | #include <linux/personality.h> | 14 | #include <linux/personality.h> |
15 | #include <linux/tty.h> | 15 | #include <linux/tty.h> |
16 | #include <linux/namespace.h> | 16 | #include <linux/mnt_namespace.h> |
17 | #include <linux/key.h> | 17 | #include <linux/key.h> |
18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/file.h> | 22 | #include <linux/file.h> |
23 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
24 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
25 | #include <linux/pid_namespace.h> | ||
25 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
26 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
27 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
@@ -48,7 +49,6 @@ | |||
48 | #include <asm/mmu_context.h> | 49 | #include <asm/mmu_context.h> |
49 | 50 | ||
50 | extern void sem_exit (void); | 51 | extern void sem_exit (void); |
51 | extern struct task_struct *child_reaper; | ||
52 | 52 | ||
53 | static void exit_mm(struct task_struct * tsk); | 53 | static void exit_mm(struct task_struct * tsk); |
54 | 54 | ||
@@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
128 | flush_sigqueue(&tsk->pending); | 128 | flush_sigqueue(&tsk->pending); |
129 | if (sig) { | 129 | if (sig) { |
130 | flush_sigqueue(&sig->shared_pending); | 130 | flush_sigqueue(&sig->shared_pending); |
131 | taskstats_tgid_free(sig); | ||
131 | __cleanup_signal(sig); | 132 | __cleanup_signal(sig); |
132 | } | 133 | } |
133 | } | 134 | } |
@@ -188,21 +189,18 @@ repeat: | |||
188 | int session_of_pgrp(int pgrp) | 189 | int session_of_pgrp(int pgrp) |
189 | { | 190 | { |
190 | struct task_struct *p; | 191 | struct task_struct *p; |
191 | int sid = -1; | 192 | int sid = 0; |
192 | 193 | ||
193 | read_lock(&tasklist_lock); | 194 | read_lock(&tasklist_lock); |
194 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | 195 | |
195 | if (p->signal->session > 0) { | 196 | p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); |
196 | sid = p->signal->session; | 197 | if (p == NULL) |
197 | goto out; | 198 | p = find_task_by_pid(pgrp); |
198 | } | 199 | if (p != NULL) |
199 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | 200 | sid = process_session(p); |
200 | p = find_task_by_pid(pgrp); | 201 | |
201 | if (p) | ||
202 | sid = p->signal->session; | ||
203 | out: | ||
204 | read_unlock(&tasklist_lock); | 202 | read_unlock(&tasklist_lock); |
205 | 203 | ||
206 | return sid; | 204 | return sid; |
207 | } | 205 | } |
208 | 206 | ||
@@ -224,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) | |||
224 | || p->exit_state | 222 | || p->exit_state |
225 | || is_init(p->real_parent)) | 223 | || is_init(p->real_parent)) |
226 | continue; | 224 | continue; |
227 | if (process_group(p->real_parent) != pgrp | 225 | if (process_group(p->real_parent) != pgrp && |
228 | && p->real_parent->signal->session == p->signal->session) { | 226 | process_session(p->real_parent) == process_session(p)) { |
229 | ret = 0; | 227 | ret = 0; |
230 | break; | 228 | break; |
231 | } | 229 | } |
@@ -259,7 +257,8 @@ static int has_stopped_jobs(int pgrp) | |||
259 | } | 257 | } |
260 | 258 | ||
261 | /** | 259 | /** |
262 | * reparent_to_init - Reparent the calling kernel thread to the init task. | 260 | * reparent_to_init - Reparent the calling kernel thread to the init task |
261 | * of the pid space that the thread belongs to. | ||
263 | * | 262 | * |
264 | * If a kernel thread is launched as a result of a system call, or if | 263 | * If a kernel thread is launched as a result of a system call, or if |
265 | * it ever exits, it should generally reparent itself to init so that | 264 | * it ever exits, it should generally reparent itself to init so that |
@@ -277,8 +276,8 @@ static void reparent_to_init(void) | |||
277 | ptrace_unlink(current); | 276 | ptrace_unlink(current); |
278 | /* Reparent to init */ | 277 | /* Reparent to init */ |
279 | remove_parent(current); | 278 | remove_parent(current); |
280 | current->parent = child_reaper; | 279 | current->parent = child_reaper(current); |
281 | current->real_parent = child_reaper; | 280 | current->real_parent = child_reaper(current); |
282 | add_parent(current); | 281 | add_parent(current); |
283 | 282 | ||
284 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 283 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
@@ -301,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
301 | { | 300 | { |
302 | struct task_struct *curr = current->group_leader; | 301 | struct task_struct *curr = current->group_leader; |
303 | 302 | ||
304 | if (curr->signal->session != session) { | 303 | if (process_session(curr) != session) { |
305 | detach_pid(curr, PIDTYPE_SID); | 304 | detach_pid(curr, PIDTYPE_SID); |
306 | curr->signal->session = session; | 305 | set_signal_session(curr->signal, session); |
307 | attach_pid(curr, PIDTYPE_SID, session); | 306 | attach_pid(curr, PIDTYPE_SID, session); |
308 | } | 307 | } |
309 | if (process_group(curr) != pgrp) { | 308 | if (process_group(curr) != pgrp) { |
@@ -313,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
313 | } | 312 | } |
314 | } | 313 | } |
315 | 314 | ||
316 | void set_special_pids(pid_t session, pid_t pgrp) | 315 | static void set_special_pids(pid_t session, pid_t pgrp) |
317 | { | 316 | { |
318 | write_lock_irq(&tasklist_lock); | 317 | write_lock_irq(&tasklist_lock); |
319 | __set_special_pids(session, pgrp); | 318 | __set_special_pids(session, pgrp); |
@@ -383,9 +382,7 @@ void daemonize(const char *name, ...) | |||
383 | exit_mm(current); | 382 | exit_mm(current); |
384 | 383 | ||
385 | set_special_pids(1, 1); | 384 | set_special_pids(1, 1); |
386 | mutex_lock(&tty_mutex); | 385 | proc_clear_tty(current); |
387 | current->signal->tty = NULL; | ||
388 | mutex_unlock(&tty_mutex); | ||
389 | 386 | ||
390 | /* Block and flush all signals */ | 387 | /* Block and flush all signals */ |
391 | sigfillset(&blocked); | 388 | sigfillset(&blocked); |
@@ -428,7 +425,7 @@ static void close_files(struct files_struct * files) | |||
428 | for (;;) { | 425 | for (;;) { |
429 | unsigned long set; | 426 | unsigned long set; |
430 | i = j * __NFDBITS; | 427 | i = j * __NFDBITS; |
431 | if (i >= fdt->max_fdset || i >= fdt->max_fds) | 428 | if (i >= fdt->max_fds) |
432 | break; | 429 | break; |
433 | set = fdt->open_fds->fds_bits[j++]; | 430 | set = fdt->open_fds->fds_bits[j++]; |
434 | while (set) { | 431 | while (set) { |
@@ -469,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files) | |||
469 | * you can free files immediately. | 466 | * you can free files immediately. |
470 | */ | 467 | */ |
471 | fdt = files_fdtable(files); | 468 | fdt = files_fdtable(files); |
472 | if (fdt == &files->fdtab) | 469 | if (fdt != &files->fdtab) |
473 | fdt->free_files = files; | ||
474 | else | ||
475 | kmem_cache_free(files_cachep, files); | 470 | kmem_cache_free(files_cachep, files); |
476 | free_fdtable(fdt); | 471 | call_rcu(&fdt->rcu, free_fdtable_rcu); |
477 | } | 472 | } |
478 | } | 473 | } |
479 | 474 | ||
@@ -648,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
648 | * outside, so the child pgrp is now orphaned. | 643 | * outside, so the child pgrp is now orphaned. |
649 | */ | 644 | */ |
650 | if ((process_group(p) != process_group(father)) && | 645 | if ((process_group(p) != process_group(father)) && |
651 | (p->signal->session == father->signal->session)) { | 646 | (process_session(p) == process_session(father))) { |
652 | int pgrp = process_group(p); | 647 | int pgrp = process_group(p); |
653 | 648 | ||
654 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 649 | if (will_become_orphaned_pgrp(pgrp, NULL) && |
650 | has_stopped_jobs(pgrp)) { | ||
655 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); | 651 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
656 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 652 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
657 | } | 653 | } |
@@ -662,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
662 | * When we die, we re-parent all our children. | 658 | * When we die, we re-parent all our children. |
663 | * Try to give them to another thread in our thread | 659 | * Try to give them to another thread in our thread |
664 | * group, and if no such member exists, give it to | 660 | * group, and if no such member exists, give it to |
665 | * the global child reaper process (ie "init") | 661 | * the child reaper process (ie "init") in our pid |
662 | * space. | ||
666 | */ | 663 | */ |
667 | static void | 664 | static void |
668 | forget_original_parent(struct task_struct *father, struct list_head *to_release) | 665 | forget_original_parent(struct task_struct *father, struct list_head *to_release) |
@@ -673,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) | |||
673 | do { | 670 | do { |
674 | reaper = next_thread(reaper); | 671 | reaper = next_thread(reaper); |
675 | if (reaper == father) { | 672 | if (reaper == father) { |
676 | reaper = child_reaper; | 673 | reaper = child_reaper(father); |
677 | break; | 674 | break; |
678 | } | 675 | } |
679 | } while (reaper->exit_state); | 676 | } while (reaper->exit_state); |
@@ -785,7 +782,7 @@ static void exit_notify(struct task_struct *tsk) | |||
785 | t = tsk->real_parent; | 782 | t = tsk->real_parent; |
786 | 783 | ||
787 | if ((process_group(t) != process_group(tsk)) && | 784 | if ((process_group(t) != process_group(tsk)) && |
788 | (t->signal->session == tsk->signal->session) && | 785 | (process_session(t) == process_session(tsk)) && |
789 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 786 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
790 | has_stopped_jobs(process_group(tsk))) { | 787 | has_stopped_jobs(process_group(tsk))) { |
791 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); | 788 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
@@ -849,9 +846,7 @@ static void exit_notify(struct task_struct *tsk) | |||
849 | fastcall NORET_TYPE void do_exit(long code) | 846 | fastcall NORET_TYPE void do_exit(long code) |
850 | { | 847 | { |
851 | struct task_struct *tsk = current; | 848 | struct task_struct *tsk = current; |
852 | struct taskstats *tidstats; | ||
853 | int group_dead; | 849 | int group_dead; |
854 | unsigned int mycpu; | ||
855 | 850 | ||
856 | profile_task_exit(tsk); | 851 | profile_task_exit(tsk); |
857 | 852 | ||
@@ -861,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code) | |||
861 | panic("Aiee, killing interrupt handler!"); | 856 | panic("Aiee, killing interrupt handler!"); |
862 | if (unlikely(!tsk->pid)) | 857 | if (unlikely(!tsk->pid)) |
863 | panic("Attempted to kill the idle task!"); | 858 | panic("Attempted to kill the idle task!"); |
864 | if (unlikely(tsk == child_reaper)) | 859 | if (unlikely(tsk == child_reaper(tsk))) { |
865 | panic("Attempted to kill init!"); | 860 | if (tsk->nsproxy->pid_ns != &init_pid_ns) |
861 | tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
862 | else | ||
863 | panic("Attempted to kill init!"); | ||
864 | } | ||
865 | |||
866 | 866 | ||
867 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { | 867 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { |
868 | current->ptrace_message = code; | 868 | current->ptrace_message = code; |
@@ -889,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code) | |||
889 | current->comm, current->pid, | 889 | current->comm, current->pid, |
890 | preempt_count()); | 890 | preempt_count()); |
891 | 891 | ||
892 | taskstats_exit_alloc(&tidstats, &mycpu); | ||
893 | |||
894 | acct_update_integrals(tsk); | 892 | acct_update_integrals(tsk); |
895 | if (tsk->mm) { | 893 | if (tsk->mm) { |
896 | update_hiwater_rss(tsk->mm); | 894 | update_hiwater_rss(tsk->mm); |
@@ -910,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
910 | #endif | 908 | #endif |
911 | if (unlikely(tsk->audit_context)) | 909 | if (unlikely(tsk->audit_context)) |
912 | audit_free(tsk); | 910 | audit_free(tsk); |
913 | taskstats_exit_send(tsk, tidstats, group_dead, mycpu); | 911 | |
914 | taskstats_exit_free(tidstats); | 912 | taskstats_exit(tsk, group_dead); |
915 | 913 | ||
916 | exit_mm(tsk); | 914 | exit_mm(tsk); |
917 | 915 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 7dc6140baac6..d16c566eb645 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
20 | #include <linux/completion.h> | 20 | #include <linux/completion.h> |
21 | #include <linux/namespace.h> | 21 | #include <linux/mnt_namespace.h> |
22 | #include <linux/personality.h> | 22 | #include <linux/personality.h> |
23 | #include <linux/mempolicy.h> | 23 | #include <linux/mempolicy.h> |
24 | #include <linux/sem.h> | 24 | #include <linux/sem.h> |
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
37 | #include <linux/jiffies.h> | 37 | #include <linux/jiffies.h> |
38 | #include <linux/futex.h> | 38 | #include <linux/futex.h> |
39 | #include <linux/task_io_accounting_ops.h> | ||
39 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
40 | #include <linux/ptrace.h> | 41 | #include <linux/ptrace.h> |
41 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
@@ -82,26 +83,26 @@ int nr_processes(void) | |||
82 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 83 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
83 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | 84 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) |
84 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | 85 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) |
85 | static kmem_cache_t *task_struct_cachep; | 86 | static struct kmem_cache *task_struct_cachep; |
86 | #endif | 87 | #endif |
87 | 88 | ||
88 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 89 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
89 | static kmem_cache_t *signal_cachep; | 90 | static struct kmem_cache *signal_cachep; |
90 | 91 | ||
91 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ | 92 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ |
92 | kmem_cache_t *sighand_cachep; | 93 | struct kmem_cache *sighand_cachep; |
93 | 94 | ||
94 | /* SLAB cache for files_struct structures (tsk->files) */ | 95 | /* SLAB cache for files_struct structures (tsk->files) */ |
95 | kmem_cache_t *files_cachep; | 96 | struct kmem_cache *files_cachep; |
96 | 97 | ||
97 | /* SLAB cache for fs_struct structures (tsk->fs) */ | 98 | /* SLAB cache for fs_struct structures (tsk->fs) */ |
98 | kmem_cache_t *fs_cachep; | 99 | struct kmem_cache *fs_cachep; |
99 | 100 | ||
100 | /* SLAB cache for vm_area_struct structures */ | 101 | /* SLAB cache for vm_area_struct structures */ |
101 | kmem_cache_t *vm_area_cachep; | 102 | struct kmem_cache *vm_area_cachep; |
102 | 103 | ||
103 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 104 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
104 | static kmem_cache_t *mm_cachep; | 105 | static struct kmem_cache *mm_cachep; |
105 | 106 | ||
106 | void free_task(struct task_struct *tsk) | 107 | void free_task(struct task_struct *tsk) |
107 | { | 108 | { |
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
237 | goto fail_nomem; | 238 | goto fail_nomem; |
238 | charge = len; | 239 | charge = len; |
239 | } | 240 | } |
240 | tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 241 | tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
241 | if (!tmp) | 242 | if (!tmp) |
242 | goto fail_nomem; | 243 | goto fail_nomem; |
243 | *tmp = *mpnt; | 244 | *tmp = *mpnt; |
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
252 | anon_vma_link(tmp); | 253 | anon_vma_link(tmp); |
253 | file = tmp->vm_file; | 254 | file = tmp->vm_file; |
254 | if (file) { | 255 | if (file) { |
255 | struct inode *inode = file->f_dentry->d_inode; | 256 | struct inode *inode = file->f_path.dentry->d_inode; |
256 | get_file(file); | 257 | get_file(file); |
257 | if (tmp->vm_flags & VM_DENYWRITE) | 258 | if (tmp->vm_flags & VM_DENYWRITE) |
258 | atomic_dec(&inode->i_writecount); | 259 | atomic_dec(&inode->i_writecount); |
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) | |||
319 | 320 | ||
320 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | 321 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); |
321 | 322 | ||
322 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) | 323 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) |
323 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | 324 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) |
324 | 325 | ||
325 | #include <linux/init_task.h> | 326 | #include <linux/init_task.h> |
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
448 | tsk->vfork_done = NULL; | 449 | tsk->vfork_done = NULL; |
449 | complete(vfork_done); | 450 | complete(vfork_done); |
450 | } | 451 | } |
451 | if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { | 452 | |
453 | /* | ||
454 | * If we're exiting normally, clear a user-space tid field if | ||
455 | * requested. We leave this alone when dying by signal, to leave | ||
456 | * the value intact in a core dump, and to save the unnecessary | ||
457 | * trouble otherwise. Userland only wants this done for a sys_exit. | ||
458 | */ | ||
459 | if (tsk->clear_child_tid | ||
460 | && !(tsk->flags & PF_SIGNALED) | ||
461 | && atomic_read(&mm->mm_users) > 1) { | ||
452 | u32 __user * tidptr = tsk->clear_child_tid; | 462 | u32 __user * tidptr = tsk->clear_child_tid; |
453 | tsk->clear_child_tid = NULL; | 463 | tsk->clear_child_tid = NULL; |
454 | 464 | ||
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
479 | 489 | ||
480 | memcpy(mm, oldmm, sizeof(*mm)); | 490 | memcpy(mm, oldmm, sizeof(*mm)); |
481 | 491 | ||
492 | /* Initializing for Swap token stuff */ | ||
493 | mm->token_priority = 0; | ||
494 | mm->last_interval = 0; | ||
495 | |||
482 | if (!mm_init(mm)) | 496 | if (!mm_init(mm)) |
483 | goto fail_nomem; | 497 | goto fail_nomem; |
484 | 498 | ||
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
542 | goto fail_nomem; | 556 | goto fail_nomem; |
543 | 557 | ||
544 | good_mm: | 558 | good_mm: |
559 | /* Initializing for Swap token stuff */ | ||
560 | mm->token_priority = 0; | ||
561 | mm->last_interval = 0; | ||
562 | |||
545 | tsk->mm = mm; | 563 | tsk->mm = mm; |
546 | tsk->active_mm = mm; | 564 | tsk->active_mm = mm; |
547 | return 0; | 565 | return 0; |
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |||
596 | 614 | ||
597 | static int count_open_files(struct fdtable *fdt) | 615 | static int count_open_files(struct fdtable *fdt) |
598 | { | 616 | { |
599 | int size = fdt->max_fdset; | 617 | int size = fdt->max_fds; |
600 | int i; | 618 | int i; |
601 | 619 | ||
602 | /* Find the last open fd */ | 620 | /* Find the last open fd */ |
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void) | |||
613 | struct files_struct *newf; | 631 | struct files_struct *newf; |
614 | struct fdtable *fdt; | 632 | struct fdtable *fdt; |
615 | 633 | ||
616 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | 634 | newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); |
617 | if (!newf) | 635 | if (!newf) |
618 | goto out; | 636 | goto out; |
619 | 637 | ||
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void) | |||
623 | newf->next_fd = 0; | 641 | newf->next_fd = 0; |
624 | fdt = &newf->fdtab; | 642 | fdt = &newf->fdtab; |
625 | fdt->max_fds = NR_OPEN_DEFAULT; | 643 | fdt->max_fds = NR_OPEN_DEFAULT; |
626 | fdt->max_fdset = EMBEDDED_FD_SET_SIZE; | ||
627 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; | 644 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; |
628 | fdt->open_fds = (fd_set *)&newf->open_fds_init; | 645 | fdt->open_fds = (fd_set *)&newf->open_fds_init; |
629 | fdt->fd = &newf->fd_array[0]; | 646 | fdt->fd = &newf->fd_array[0]; |
630 | INIT_RCU_HEAD(&fdt->rcu); | 647 | INIT_RCU_HEAD(&fdt->rcu); |
631 | fdt->free_files = NULL; | ||
632 | fdt->next = NULL; | 648 | fdt->next = NULL; |
633 | rcu_assign_pointer(newf->fdt, fdt); | 649 | rcu_assign_pointer(newf->fdt, fdt); |
634 | out: | 650 | out: |
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
644 | { | 660 | { |
645 | struct files_struct *newf; | 661 | struct files_struct *newf; |
646 | struct file **old_fds, **new_fds; | 662 | struct file **old_fds, **new_fds; |
647 | int open_files, size, i, expand; | 663 | int open_files, size, i; |
648 | struct fdtable *old_fdt, *new_fdt; | 664 | struct fdtable *old_fdt, *new_fdt; |
649 | 665 | ||
650 | *errorp = -ENOMEM; | 666 | *errorp = -ENOMEM; |
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
655 | spin_lock(&oldf->file_lock); | 671 | spin_lock(&oldf->file_lock); |
656 | old_fdt = files_fdtable(oldf); | 672 | old_fdt = files_fdtable(oldf); |
657 | new_fdt = files_fdtable(newf); | 673 | new_fdt = files_fdtable(newf); |
658 | size = old_fdt->max_fdset; | ||
659 | open_files = count_open_files(old_fdt); | 674 | open_files = count_open_files(old_fdt); |
660 | expand = 0; | ||
661 | 675 | ||
662 | /* | 676 | /* |
663 | * Check whether we need to allocate a larger fd array or fd set. | 677 | * Check whether we need to allocate a larger fd array and fd set. |
664 | * Note: we're not a clone task, so the open count won't change. | 678 | * Note: we're not a clone task, so the open count won't change. |
665 | */ | 679 | */ |
666 | if (open_files > new_fdt->max_fdset) { | ||
667 | new_fdt->max_fdset = 0; | ||
668 | expand = 1; | ||
669 | } | ||
670 | if (open_files > new_fdt->max_fds) { | 680 | if (open_files > new_fdt->max_fds) { |
671 | new_fdt->max_fds = 0; | 681 | new_fdt->max_fds = 0; |
672 | expand = 1; | ||
673 | } | ||
674 | |||
675 | /* if the old fdset gets grown now, we'll only copy up to "size" fds */ | ||
676 | if (expand) { | ||
677 | spin_unlock(&oldf->file_lock); | 682 | spin_unlock(&oldf->file_lock); |
678 | spin_lock(&newf->file_lock); | 683 | spin_lock(&newf->file_lock); |
679 | *errorp = expand_files(newf, open_files-1); | 684 | *errorp = expand_files(newf, open_files-1); |
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
693 | old_fds = old_fdt->fd; | 698 | old_fds = old_fdt->fd; |
694 | new_fds = new_fdt->fd; | 699 | new_fds = new_fdt->fd; |
695 | 700 | ||
696 | memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); | 701 | memcpy(new_fdt->open_fds->fds_bits, |
697 | memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); | 702 | old_fdt->open_fds->fds_bits, open_files/8); |
703 | memcpy(new_fdt->close_on_exec->fds_bits, | ||
704 | old_fdt->close_on_exec->fds_bits, open_files/8); | ||
698 | 705 | ||
699 | for (i = open_files; i != 0; i--) { | 706 | for (i = open_files; i != 0; i--) { |
700 | struct file *f = *old_fds++; | 707 | struct file *f = *old_fds++; |
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
719 | /* This is long word aligned thus could use a optimized version */ | 726 | /* This is long word aligned thus could use a optimized version */ |
720 | memset(new_fds, 0, size); | 727 | memset(new_fds, 0, size); |
721 | 728 | ||
722 | if (new_fdt->max_fdset > open_files) { | 729 | if (new_fdt->max_fds > open_files) { |
723 | int left = (new_fdt->max_fdset-open_files)/8; | 730 | int left = (new_fdt->max_fds-open_files)/8; |
724 | int start = open_files / (8 * sizeof(unsigned long)); | 731 | int start = open_files / (8 * sizeof(unsigned long)); |
725 | 732 | ||
726 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); | 733 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); |
727 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); | 734 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); |
728 | } | 735 | } |
729 | 736 | ||
730 | out: | ||
731 | return newf; | 737 | return newf; |
732 | 738 | ||
733 | out_release: | 739 | out_release: |
734 | free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); | ||
735 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); | ||
736 | free_fd_array(new_fdt->fd, new_fdt->max_fds); | ||
737 | kmem_cache_free(files_cachep, newf); | 740 | kmem_cache_free(files_cachep, newf); |
741 | out: | ||
738 | return NULL; | 742 | return NULL; |
739 | } | 743 | } |
740 | 744 | ||
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
830 | if (clone_flags & CLONE_THREAD) { | 834 | if (clone_flags & CLONE_THREAD) { |
831 | atomic_inc(&current->signal->count); | 835 | atomic_inc(&current->signal->count); |
832 | atomic_inc(&current->signal->live); | 836 | atomic_inc(&current->signal->live); |
833 | taskstats_tgid_alloc(current->signal); | ||
834 | return 0; | 837 | return 0; |
835 | } | 838 | } |
836 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 839 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); |
@@ -897,7 +900,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
897 | void __cleanup_signal(struct signal_struct *sig) | 900 | void __cleanup_signal(struct signal_struct *sig) |
898 | { | 901 | { |
899 | exit_thread_group_keys(sig); | 902 | exit_thread_group_keys(sig); |
900 | taskstats_tgid_free(sig); | ||
901 | kmem_cache_free(signal_cachep, sig); | 903 | kmem_cache_free(signal_cachep, sig); |
902 | } | 904 | } |
903 | 905 | ||
@@ -984,6 +986,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
984 | if (!p) | 986 | if (!p) |
985 | goto fork_out; | 987 | goto fork_out; |
986 | 988 | ||
989 | rt_mutex_init_task(p); | ||
990 | |||
987 | #ifdef CONFIG_TRACE_IRQFLAGS | 991 | #ifdef CONFIG_TRACE_IRQFLAGS |
988 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); | 992 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
989 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); | 993 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); |
@@ -1038,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1038 | p->wchar = 0; /* I/O counter: bytes written */ | 1042 | p->wchar = 0; /* I/O counter: bytes written */ |
1039 | p->syscr = 0; /* I/O counter: read syscalls */ | 1043 | p->syscr = 0; /* I/O counter: read syscalls */ |
1040 | p->syscw = 0; /* I/O counter: write syscalls */ | 1044 | p->syscw = 0; /* I/O counter: write syscalls */ |
1045 | task_io_accounting_init(p); | ||
1041 | acct_clear_integrals(p); | 1046 | acct_clear_integrals(p); |
1042 | 1047 | ||
1043 | p->it_virt_expires = cputime_zero; | 1048 | p->it_virt_expires = cputime_zero; |
@@ -1088,8 +1093,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1088 | p->lockdep_recursion = 0; | 1093 | p->lockdep_recursion = 0; |
1089 | #endif | 1094 | #endif |
1090 | 1095 | ||
1091 | rt_mutex_init_task(p); | ||
1092 | |||
1093 | #ifdef CONFIG_DEBUG_MUTEXES | 1096 | #ifdef CONFIG_DEBUG_MUTEXES |
1094 | p->blocked_on = NULL; /* not blocked yet */ | 1097 | p->blocked_on = NULL; /* not blocked yet */ |
1095 | #endif | 1098 | #endif |
@@ -1244,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1244 | if (thread_group_leader(p)) { | 1247 | if (thread_group_leader(p)) { |
1245 | p->signal->tty = current->signal->tty; | 1248 | p->signal->tty = current->signal->tty; |
1246 | p->signal->pgrp = process_group(current); | 1249 | p->signal->pgrp = process_group(current); |
1247 | p->signal->session = current->signal->session; | 1250 | set_signal_session(p->signal, process_session(current)); |
1248 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1251 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
1249 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1252 | attach_pid(p, PIDTYPE_SID, process_session(p)); |
1250 | 1253 | ||
1251 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1254 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1252 | __get_cpu_var(process_counts)++; | 1255 | __get_cpu_var(process_counts)++; |
@@ -1304,7 +1307,7 @@ fork_out: | |||
1304 | return ERR_PTR(retval); | 1307 | return ERR_PTR(retval); |
1305 | } | 1308 | } |
1306 | 1309 | ||
1307 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1310 | noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
1308 | { | 1311 | { |
1309 | memset(regs, 0, sizeof(struct pt_regs)); | 1312 | memset(regs, 0, sizeof(struct pt_regs)); |
1310 | return regs; | 1313 | return regs; |
@@ -1316,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu) | |||
1316 | struct pt_regs regs; | 1319 | struct pt_regs regs; |
1317 | 1320 | ||
1318 | task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); | 1321 | task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); |
1319 | if (!task) | 1322 | if (!IS_ERR(task)) |
1320 | return ERR_PTR(-ENOMEM); | 1323 | init_idle(task, cpu); |
1321 | init_idle(task, cpu); | ||
1322 | 1324 | ||
1323 | return task; | 1325 | return task; |
1324 | } | 1326 | } |
@@ -1415,7 +1417,7 @@ long do_fork(unsigned long clone_flags, | |||
1415 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1417 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
1416 | #endif | 1418 | #endif |
1417 | 1419 | ||
1418 | static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | 1420 | static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) |
1419 | { | 1421 | { |
1420 | struct sighand_struct *sighand = data; | 1422 | struct sighand_struct *sighand = data; |
1421 | 1423 | ||
@@ -1511,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
1511 | } | 1513 | } |
1512 | 1514 | ||
1513 | /* | 1515 | /* |
1514 | * Unshare the namespace structure if it is being shared | 1516 | * Unshare the mnt_namespace structure if it is being shared |
1515 | */ | 1517 | */ |
1516 | static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) | 1518 | static int unshare_mnt_namespace(unsigned long unshare_flags, |
1519 | struct mnt_namespace **new_nsp, struct fs_struct *new_fs) | ||
1517 | { | 1520 | { |
1518 | struct namespace *ns = current->nsproxy->namespace; | 1521 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; |
1519 | 1522 | ||
1520 | if ((unshare_flags & CLONE_NEWNS) && ns) { | 1523 | if ((unshare_flags & CLONE_NEWNS) && ns) { |
1521 | if (!capable(CAP_SYS_ADMIN)) | 1524 | if (!capable(CAP_SYS_ADMIN)) |
1522 | return -EPERM; | 1525 | return -EPERM; |
1523 | 1526 | ||
1524 | *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); | 1527 | *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); |
1525 | if (!*new_nsp) | 1528 | if (!*new_nsp) |
1526 | return -ENOMEM; | 1529 | return -ENOMEM; |
1527 | } | 1530 | } |
@@ -1530,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new | |||
1530 | } | 1533 | } |
1531 | 1534 | ||
1532 | /* | 1535 | /* |
1533 | * Unsharing of sighand for tasks created with CLONE_SIGHAND is not | 1536 | * Unsharing of sighand is not supported yet |
1534 | * supported yet | ||
1535 | */ | 1537 | */ |
1536 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | 1538 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) |
1537 | { | 1539 | { |
1538 | struct sighand_struct *sigh = current->sighand; | 1540 | struct sighand_struct *sigh = current->sighand; |
1539 | 1541 | ||
1540 | if ((unshare_flags & CLONE_SIGHAND) && | 1542 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) |
1541 | (sigh && atomic_read(&sigh->count) > 1)) | ||
1542 | return -EINVAL; | 1543 | return -EINVAL; |
1543 | else | 1544 | else |
1544 | return 0; | 1545 | return 0; |
@@ -1611,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1611 | { | 1612 | { |
1612 | int err = 0; | 1613 | int err = 0; |
1613 | struct fs_struct *fs, *new_fs = NULL; | 1614 | struct fs_struct *fs, *new_fs = NULL; |
1614 | struct namespace *ns, *new_ns = NULL; | 1615 | struct mnt_namespace *ns, *new_ns = NULL; |
1615 | struct sighand_struct *sigh, *new_sigh = NULL; | 1616 | struct sighand_struct *new_sigh = NULL; |
1616 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1617 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
1617 | struct files_struct *fd, *new_fd = NULL; | 1618 | struct files_struct *fd, *new_fd = NULL; |
1618 | struct sem_undo_list *new_ulist = NULL; | 1619 | struct sem_undo_list *new_ulist = NULL; |
@@ -1633,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1633 | goto bad_unshare_out; | 1634 | goto bad_unshare_out; |
1634 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1635 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
1635 | goto bad_unshare_cleanup_thread; | 1636 | goto bad_unshare_cleanup_thread; |
1636 | if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) | 1637 | if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) |
1637 | goto bad_unshare_cleanup_fs; | 1638 | goto bad_unshare_cleanup_fs; |
1638 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | 1639 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) |
1639 | goto bad_unshare_cleanup_ns; | 1640 | goto bad_unshare_cleanup_ns; |
@@ -1657,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1657 | } | 1658 | } |
1658 | } | 1659 | } |
1659 | 1660 | ||
1660 | if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || | 1661 | if (new_fs || new_ns || new_mm || new_fd || new_ulist || |
1661 | new_uts || new_ipc) { | 1662 | new_uts || new_ipc) { |
1662 | 1663 | ||
1663 | task_lock(current); | 1664 | task_lock(current); |
@@ -1674,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1674 | } | 1675 | } |
1675 | 1676 | ||
1676 | if (new_ns) { | 1677 | if (new_ns) { |
1677 | ns = current->nsproxy->namespace; | 1678 | ns = current->nsproxy->mnt_ns; |
1678 | current->nsproxy->namespace = new_ns; | 1679 | current->nsproxy->mnt_ns = new_ns; |
1679 | new_ns = ns; | 1680 | new_ns = ns; |
1680 | } | 1681 | } |
1681 | 1682 | ||
1682 | if (new_sigh) { | ||
1683 | sigh = current->sighand; | ||
1684 | rcu_assign_pointer(current->sighand, new_sigh); | ||
1685 | new_sigh = sigh; | ||
1686 | } | ||
1687 | |||
1688 | if (new_mm) { | 1683 | if (new_mm) { |
1689 | mm = current->mm; | 1684 | mm = current->mm; |
1690 | active_mm = current->active_mm; | 1685 | active_mm = current->active_mm; |
@@ -1742,7 +1737,7 @@ bad_unshare_cleanup_sigh: | |||
1742 | 1737 | ||
1743 | bad_unshare_cleanup_ns: | 1738 | bad_unshare_cleanup_ns: |
1744 | if (new_ns) | 1739 | if (new_ns) |
1745 | put_namespace(new_ns); | 1740 | put_mnt_ns(new_ns); |
1746 | 1741 | ||
1747 | bad_unshare_cleanup_fs: | 1742 | bad_unshare_cleanup_fs: |
1748 | if (new_fs) | 1743 | if (new_fs) |
diff --git a/kernel/futex.c b/kernel/futex.c index b364e0026191..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
166 | /* | 166 | /* |
167 | * Get parameters which are the keys for a futex. | 167 | * Get parameters which are the keys for a futex. |
168 | * | 168 | * |
169 | * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, | 169 | * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, |
170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
171 | * We can usually work out the index without swapping in the page. | 171 | * We can usually work out the index without swapping in the page. |
172 | * | 172 | * |
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) | |||
223 | /* | 223 | /* |
224 | * Linear file mappings are also simple. | 224 | * Linear file mappings are also simple. |
225 | */ | 225 | */ |
226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_path.dentry->d_inode; |
227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) | |||
282 | { | 282 | { |
283 | int ret; | 283 | int ret; |
284 | 284 | ||
285 | inc_preempt_count(); | 285 | pagefault_disable(); |
286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
287 | dec_preempt_count(); | 287 | pagefault_enable(); |
288 | 288 | ||
289 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
290 | } | 290 | } |
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void) | |||
324 | if (likely(current->pi_state_cache)) | 324 | if (likely(current->pi_state_cache)) |
325 | return 0; | 325 | return 0; |
326 | 326 | ||
327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | 327 | pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); |
328 | 328 | ||
329 | if (!pi_state) | 329 | if (!pi_state) |
330 | return -ENOMEM; | 330 | return -ENOMEM; |
331 | 331 | ||
332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
333 | INIT_LIST_HEAD(&pi_state->list); | 332 | INIT_LIST_HEAD(&pi_state->list); |
334 | /* pi_mutex gets initialized later */ | 333 | /* pi_mutex gets initialized later */ |
335 | pi_state->owner = NULL; | 334 | pi_state->owner = NULL; |
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q) | |||
553 | * at the end of wake_up_all() does not prevent this store from | 552 | * at the end of wake_up_all() does not prevent this store from |
554 | * moving. | 553 | * moving. |
555 | */ | 554 | */ |
556 | wmb(); | 555 | smp_wmb(); |
557 | q->lock_ptr = NULL; | 556 | q->lock_ptr = NULL; |
558 | } | 557 | } |
559 | 558 | ||
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
585 | if (!(uval & FUTEX_OWNER_DIED)) { | 584 | if (!(uval & FUTEX_OWNER_DIED)) { |
586 | newval = FUTEX_WAITERS | new_owner->pid; | 585 | newval = FUTEX_WAITERS | new_owner->pid; |
587 | 586 | ||
588 | inc_preempt_count(); | 587 | pagefault_disable(); |
589 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 588 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
590 | dec_preempt_count(); | 589 | pagefault_enable(); |
591 | if (curval == -EFAULT) | 590 | if (curval == -EFAULT) |
592 | return -EFAULT; | 591 | return -EFAULT; |
593 | if (curval != uval) | 592 | if (curval != uval) |
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
618 | * There is no waiter, so we unlock the futex. The owner died | 617 | * There is no waiter, so we unlock the futex. The owner died |
619 | * bit has not to be preserved here. We are the owner: | 618 | * bit has not to be preserved here. We are the owner: |
620 | */ | 619 | */ |
621 | inc_preempt_count(); | 620 | pagefault_disable(); |
622 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | 621 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); |
623 | dec_preempt_count(); | 622 | pagefault_enable(); |
624 | 623 | ||
625 | if (oldval == -EFAULT) | 624 | if (oldval == -EFAULT) |
626 | return oldval; | 625 | return oldval; |
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
1158 | */ | 1157 | */ |
1159 | newval = current->pid; | 1158 | newval = current->pid; |
1160 | 1159 | ||
1161 | inc_preempt_count(); | 1160 | pagefault_disable(); |
1162 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | 1161 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); |
1163 | dec_preempt_count(); | 1162 | pagefault_enable(); |
1164 | 1163 | ||
1165 | if (unlikely(curval == -EFAULT)) | 1164 | if (unlikely(curval == -EFAULT)) |
1166 | goto uaddr_faulted; | 1165 | goto uaddr_faulted; |
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
1183 | uval = curval; | 1182 | uval = curval; |
1184 | newval = uval | FUTEX_WAITERS; | 1183 | newval = uval | FUTEX_WAITERS; |
1185 | 1184 | ||
1186 | inc_preempt_count(); | 1185 | pagefault_disable(); |
1187 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 1186 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
1188 | dec_preempt_count(); | 1187 | pagefault_enable(); |
1189 | 1188 | ||
1190 | if (unlikely(curval == -EFAULT)) | 1189 | if (unlikely(curval == -EFAULT)) |
1191 | goto uaddr_faulted; | 1190 | goto uaddr_faulted; |
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
1215 | newval = current->pid | | 1214 | newval = current->pid | |
1216 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | 1215 | FUTEX_OWNER_DIED | FUTEX_WAITERS; |
1217 | 1216 | ||
1218 | inc_preempt_count(); | 1217 | pagefault_disable(); |
1219 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | 1218 | curval = futex_atomic_cmpxchg_inatomic(uaddr, |
1220 | uval, newval); | 1219 | uval, newval); |
1221 | dec_preempt_count(); | 1220 | pagefault_enable(); |
1222 | 1221 | ||
1223 | if (unlikely(curval == -EFAULT)) | 1222 | if (unlikely(curval == -EFAULT)) |
1224 | goto uaddr_faulted; | 1223 | goto uaddr_faulted; |
@@ -1390,9 +1389,9 @@ retry_locked: | |||
1390 | * anyone else up: | 1389 | * anyone else up: |
1391 | */ | 1390 | */ |
1392 | if (!(uval & FUTEX_OWNER_DIED)) { | 1391 | if (!(uval & FUTEX_OWNER_DIED)) { |
1393 | inc_preempt_count(); | 1392 | pagefault_disable(); |
1394 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | 1393 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); |
1395 | dec_preempt_count(); | 1394 | pagefault_enable(); |
1396 | } | 1395 | } |
1397 | 1396 | ||
1398 | if (unlikely(uval == -EFAULT)) | 1397 | if (unlikely(uval == -EFAULT)) |
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp, | |||
1493 | return ret; | 1492 | return ret; |
1494 | } | 1493 | } |
1495 | 1494 | ||
1496 | static struct file_operations futex_fops = { | 1495 | static const struct file_operations futex_fops = { |
1497 | .release = futex_close, | 1496 | .release = futex_close, |
1498 | .poll = futex_poll, | 1497 | .poll = futex_poll, |
1499 | }; | 1498 | }; |
@@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal) | |||
1507 | struct futex_q *q; | 1506 | struct futex_q *q; |
1508 | struct file *filp; | 1507 | struct file *filp; |
1509 | int ret, err; | 1508 | int ret, err; |
1509 | static unsigned long printk_interval; | ||
1510 | |||
1511 | if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { | ||
1512 | printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " | ||
1513 | "will be removed from the kernel in June 2007\n", | ||
1514 | current->comm); | ||
1515 | } | ||
1510 | 1516 | ||
1511 | ret = -EINVAL; | 1517 | ret = -EINVAL; |
1512 | if (!valid_signal(signal)) | 1518 | if (!valid_signal(signal)) |
@@ -1522,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) | |||
1522 | goto out; | 1528 | goto out; |
1523 | } | 1529 | } |
1524 | filp->f_op = &futex_fops; | 1530 | filp->f_op = &futex_fops; |
1525 | filp->f_vfsmnt = mntget(futex_mnt); | 1531 | filp->f_path.mnt = mntget(futex_mnt); |
1526 | filp->f_dentry = dget(futex_mnt->mnt_root); | 1532 | filp->f_path.dentry = dget(futex_mnt->mnt_root); |
1527 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | 1533 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; |
1528 | 1534 | ||
1529 | if (signal) { | 1535 | if (signal) { |
1530 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); | 1536 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); |
@@ -1851,10 +1857,16 @@ static struct file_system_type futex_fs_type = { | |||
1851 | 1857 | ||
1852 | static int __init init(void) | 1858 | static int __init init(void) |
1853 | { | 1859 | { |
1854 | unsigned int i; | 1860 | int i = register_filesystem(&futex_fs_type); |
1861 | |||
1862 | if (i) | ||
1863 | return i; | ||
1855 | 1864 | ||
1856 | register_filesystem(&futex_fs_type); | ||
1857 | futex_mnt = kern_mount(&futex_fs_type); | 1865 | futex_mnt = kern_mount(&futex_fs_type); |
1866 | if (IS_ERR(futex_mnt)) { | ||
1867 | unregister_filesystem(&futex_fs_type); | ||
1868 | return PTR_ERR(futex_mnt); | ||
1869 | } | ||
1858 | 1870 | ||
1859 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 1871 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
1860 | INIT_LIST_HEAD(&futex_queues[i].chain); | 1872 | INIT_LIST_HEAD(&futex_queues[i].chain); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 11c99697acfe..ebfd24a41858 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) | |||
233 | chip->shutdown = chip->disable; | 233 | chip->shutdown = chip->disable; |
234 | if (!chip->name) | 234 | if (!chip->name) |
235 | chip->name = chip->typename; | 235 | chip->name = chip->typename; |
236 | if (!chip->end) | ||
237 | chip->end = dummy_irq_chip.end; | ||
236 | } | 238 | } |
237 | 239 | ||
238 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | 240 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) |
@@ -499,7 +501,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
499 | #endif /* CONFIG_SMP */ | 501 | #endif /* CONFIG_SMP */ |
500 | 502 | ||
501 | void | 503 | void |
502 | __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) | 504 | __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, |
505 | const char *name) | ||
503 | { | 506 | { |
504 | struct irq_desc *desc; | 507 | struct irq_desc *desc; |
505 | unsigned long flags; | 508 | unsigned long flags; |
@@ -540,6 +543,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) | |||
540 | desc->depth = 1; | 543 | desc->depth = 1; |
541 | } | 544 | } |
542 | desc->handle_irq = handle; | 545 | desc->handle_irq = handle; |
546 | desc->name = name; | ||
543 | 547 | ||
544 | if (handle != handle_bad_irq && is_chained) { | 548 | if (handle != handle_bad_irq && is_chained) { |
545 | desc->status &= ~IRQ_DISABLED; | 549 | desc->status &= ~IRQ_DISABLED; |
@@ -555,30 +559,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | |||
555 | irq_flow_handler_t handle) | 559 | irq_flow_handler_t handle) |
556 | { | 560 | { |
557 | set_irq_chip(irq, chip); | 561 | set_irq_chip(irq, chip); |
558 | __set_irq_handler(irq, handle, 0); | 562 | __set_irq_handler(irq, handle, 0, NULL); |
559 | } | 563 | } |
560 | 564 | ||
561 | /* | 565 | void |
562 | * Get a descriptive string for the highlevel handler, for | 566 | set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, |
563 | * /proc/interrupts output: | 567 | irq_flow_handler_t handle, const char *name) |
564 | */ | ||
565 | const char * | ||
566 | handle_irq_name(irq_flow_handler_t handle) | ||
567 | { | 568 | { |
568 | if (handle == handle_level_irq) | 569 | set_irq_chip(irq, chip); |
569 | return "level "; | 570 | __set_irq_handler(irq, handle, 0, name); |
570 | if (handle == handle_fasteoi_irq) | ||
571 | return "fasteoi"; | ||
572 | if (handle == handle_edge_irq) | ||
573 | return "edge "; | ||
574 | if (handle == handle_simple_irq) | ||
575 | return "simple "; | ||
576 | #ifdef CONFIG_SMP | ||
577 | if (handle == handle_percpu_irq) | ||
578 | return "percpu "; | ||
579 | #endif | ||
580 | if (handle == handle_bad_irq) | ||
581 | return "bad "; | ||
582 | |||
583 | return NULL; | ||
584 | } | 571 | } |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 42aa6f1a3f0f..aff1f0fabb0d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { | |||
54 | .chip = &no_irq_chip, | 54 | .chip = &no_irq_chip, |
55 | .handle_irq = handle_bad_irq, | 55 | .handle_irq = handle_bad_irq, |
56 | .depth = 1, | 56 | .depth = 1, |
57 | .lock = SPIN_LOCK_UNLOCKED, | 57 | .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), |
58 | #ifdef CONFIG_SMP | 58 | #ifdef CONFIG_SMP |
59 | .affinity = CPU_MASK_ALL | 59 | .affinity = CPU_MASK_ALL |
60 | #endif | 60 | #endif |
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq) | |||
231 | spin_unlock(&desc->lock); | 231 | spin_unlock(&desc->lock); |
232 | 232 | ||
233 | action_ret = handle_IRQ_event(irq, action); | 233 | action_ret = handle_IRQ_event(irq, action); |
234 | |||
235 | spin_lock(&desc->lock); | ||
236 | if (!noirqdebug) | 234 | if (!noirqdebug) |
237 | note_interrupt(irq, desc, action_ret); | 235 | note_interrupt(irq, desc, action_ret); |
236 | |||
237 | spin_lock(&desc->lock); | ||
238 | if (likely(!(desc->status & IRQ_PENDING))) | 238 | if (likely(!(desc->status & IRQ_PENDING))) |
239 | break; | 239 | break; |
240 | desc->status &= ~IRQ_PENDING; | 240 | desc->status &= ~IRQ_PENDING; |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6879202afe9a..b385878c6e80 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
216 | { | 216 | { |
217 | struct irq_desc *desc = irq_desc + irq; | 217 | struct irq_desc *desc = irq_desc + irq; |
218 | struct irqaction *old, **p; | 218 | struct irqaction *old, **p; |
219 | const char *old_name = NULL; | ||
219 | unsigned long flags; | 220 | unsigned long flags; |
220 | int shared = 0; | 221 | int shared = 0; |
221 | 222 | ||
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
255 | * set the trigger type must match. | 256 | * set the trigger type must match. |
256 | */ | 257 | */ |
257 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 258 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
258 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) | 259 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { |
260 | old_name = old->name; | ||
259 | goto mismatch; | 261 | goto mismatch; |
262 | } | ||
260 | 263 | ||
261 | #if defined(CONFIG_IRQ_PER_CPU) | 264 | #if defined(CONFIG_IRQ_PER_CPU) |
262 | /* All handlers must agree on per-cpuness */ | 265 | /* All handlers must agree on per-cpuness */ |
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
322 | return 0; | 325 | return 0; |
323 | 326 | ||
324 | mismatch: | 327 | mismatch: |
325 | spin_unlock_irqrestore(&desc->lock, flags); | ||
326 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 328 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
327 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 329 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); |
330 | if (old_name) | ||
331 | printk(KERN_ERR "current handler: %s\n", old_name); | ||
328 | dump_stack(); | 332 | dump_stack(); |
329 | } | 333 | } |
334 | spin_unlock_irqrestore(&desc->lock, flags); | ||
330 | return -EBUSY; | 335 | return -EBUSY; |
331 | } | 336 | } |
332 | 337 | ||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a352667007c..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
54 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
55 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
56 | 56 | ||
57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
58 | CHECK_IRQ_PER_CPU(irq_desc[irq].status)) | ||
58 | return -EIO; | 59 | return -EIO; |
59 | 60 | ||
60 | err = cpumask_parse_user(buffer, count, new_value); | 61 | err = cpumask_parse_user(buffer, count, new_value); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index eeac3e313b2b..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/sched.h> /* for cond_resched */ | 21 | #include <linux/sched.h> /* for cond_resched */ |
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/ctype.h> | ||
23 | 24 | ||
24 | #include <asm/sections.h> | 25 | #include <asm/sections.h> |
25 | 26 | ||
@@ -30,14 +31,14 @@ | |||
30 | #endif | 31 | #endif |
31 | 32 | ||
32 | /* These will be re-linked against their real values during the second link stage */ | 33 | /* These will be re-linked against their real values during the second link stage */ |
33 | extern unsigned long kallsyms_addresses[] __attribute__((weak)); | 34 | extern const unsigned long kallsyms_addresses[] __attribute__((weak)); |
34 | extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); | 35 | extern const unsigned long kallsyms_num_syms __attribute__((weak)); |
35 | extern u8 kallsyms_names[] __attribute__((weak)); | 36 | extern const u8 kallsyms_names[] __attribute__((weak)); |
36 | 37 | ||
37 | extern u8 kallsyms_token_table[] __attribute__((weak)); | 38 | extern const u8 kallsyms_token_table[] __attribute__((weak)); |
38 | extern u16 kallsyms_token_index[] __attribute__((weak)); | 39 | extern const u16 kallsyms_token_index[] __attribute__((weak)); |
39 | 40 | ||
40 | extern unsigned long kallsyms_markers[] __attribute__((weak)); | 41 | extern const unsigned long kallsyms_markers[] __attribute__((weak)); |
41 | 42 | ||
42 | static inline int is_kernel_inittext(unsigned long addr) | 43 | static inline int is_kernel_inittext(unsigned long addr) |
43 | { | 44 | { |
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) | |||
83 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | 84 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) |
84 | { | 85 | { |
85 | int len, skipped_first = 0; | 86 | int len, skipped_first = 0; |
86 | u8 *tptr, *data; | 87 | const u8 *tptr, *data; |
87 | 88 | ||
88 | /* get the compressed symbol length from the first symbol byte */ | 89 | /* get the compressed symbol length from the first symbol byte */ |
89 | data = &kallsyms_names[off]; | 90 | data = &kallsyms_names[off]; |
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) | |||
131 | * kallsyms array */ | 132 | * kallsyms array */ |
132 | static unsigned int get_symbol_offset(unsigned long pos) | 133 | static unsigned int get_symbol_offset(unsigned long pos) |
133 | { | 134 | { |
134 | u8 *name; | 135 | const u8 *name; |
135 | int i; | 136 | int i; |
136 | 137 | ||
137 | /* use the closest marker we have. We have markers every 256 positions, | 138 | /* use the closest marker we have. We have markers every 256 positions, |
@@ -301,13 +302,6 @@ struct kallsym_iter | |||
301 | char name[KSYM_NAME_LEN+1]; | 302 | char name[KSYM_NAME_LEN+1]; |
302 | }; | 303 | }; |
303 | 304 | ||
304 | /* Only label it "global" if it is exported. */ | ||
305 | static void upcase_if_global(struct kallsym_iter *iter) | ||
306 | { | ||
307 | if (is_exported(iter->name, iter->owner)) | ||
308 | iter->type += 'A' - 'a'; | ||
309 | } | ||
310 | |||
311 | static int get_ksymbol_mod(struct kallsym_iter *iter) | 305 | static int get_ksymbol_mod(struct kallsym_iter *iter) |
312 | { | 306 | { |
313 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | 307 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, |
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) | |||
316 | if (iter->owner == NULL) | 310 | if (iter->owner == NULL) |
317 | return 0; | 311 | return 0; |
318 | 312 | ||
319 | upcase_if_global(iter); | 313 | /* Label it "global" if it is exported, "local" if not exported. */ |
314 | iter->type = is_exported(iter->name, iter->owner) | ||
315 | ? toupper(iter->type) : tolower(iter->type); | ||
316 | |||
320 | return 1; | 317 | return 1; |
321 | } | 318 | } |
322 | 319 | ||
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p) | |||
401 | return 0; | 398 | return 0; |
402 | } | 399 | } |
403 | 400 | ||
404 | static struct seq_operations kallsyms_op = { | 401 | static const struct seq_operations kallsyms_op = { |
405 | .start = s_start, | 402 | .start = s_start, |
406 | .next = s_next, | 403 | .next = s_next, |
407 | .stop = s_stop, | 404 | .stop = s_stop, |
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file) | |||
436 | return seq_release(inode, file); | 433 | return seq_release(inode, file); |
437 | } | 434 | } |
438 | 435 | ||
439 | static struct file_operations kallsyms_operations = { | 436 | static const struct file_operations kallsyms_operations = { |
440 | .open = kallsyms_open, | 437 | .open = kallsyms_open, |
441 | .read = seq_read, | 438 | .read = seq_read, |
442 | .llseek = seq_lseek, | 439 | .llseek = seq_lseek, |
diff --git a/kernel/kexec.c b/kernel/kexec.c index fcdd5d2bc3f4..2a59c8a01ae0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/ioport.h> | 21 | #include <linux/ioport.h> |
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/elf.h> | ||
24 | #include <linux/elfcore.h> | ||
23 | 25 | ||
24 | #include <asm/page.h> | 26 | #include <asm/page.h> |
25 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
108 | 110 | ||
109 | /* Allocate a controlling structure */ | 111 | /* Allocate a controlling structure */ |
110 | result = -ENOMEM; | 112 | result = -ENOMEM; |
111 | image = kmalloc(sizeof(*image), GFP_KERNEL); | 113 | image = kzalloc(sizeof(*image), GFP_KERNEL); |
112 | if (!image) | 114 | if (!image) |
113 | goto out; | 115 | goto out; |
114 | 116 | ||
115 | memset(image, 0, sizeof(*image)); | ||
116 | image->head = 0; | 117 | image->head = 0; |
117 | image->entry = &image->head; | 118 | image->entry = &image->head; |
118 | image->last_entry = &image->head; | 119 | image->last_entry = &image->head; |
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
851 | memset(ptr + uchunk, 0, mchunk - uchunk); | 852 | memset(ptr + uchunk, 0, mchunk - uchunk); |
852 | } | 853 | } |
853 | result = copy_from_user(ptr, buf, uchunk); | 854 | result = copy_from_user(ptr, buf, uchunk); |
855 | kexec_flush_icache_page(page); | ||
854 | kunmap(page); | 856 | kunmap(page); |
855 | if (result) { | 857 | if (result) { |
856 | result = (result < 0) ? result : -EIO; | 858 | result = (result < 0) ? result : -EIO; |
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs) | |||
1067 | } | 1069 | } |
1068 | } | 1070 | } |
1069 | 1071 | ||
1072 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
1073 | size_t data_len) | ||
1074 | { | ||
1075 | struct elf_note note; | ||
1076 | |||
1077 | note.n_namesz = strlen(name) + 1; | ||
1078 | note.n_descsz = data_len; | ||
1079 | note.n_type = type; | ||
1080 | memcpy(buf, &note, sizeof(note)); | ||
1081 | buf += (sizeof(note) + 3)/4; | ||
1082 | memcpy(buf, name, note.n_namesz); | ||
1083 | buf += (note.n_namesz + 3)/4; | ||
1084 | memcpy(buf, data, note.n_descsz); | ||
1085 | buf += (note.n_descsz + 3)/4; | ||
1086 | |||
1087 | return buf; | ||
1088 | } | ||
1089 | |||
1090 | static void final_note(u32 *buf) | ||
1091 | { | ||
1092 | struct elf_note note; | ||
1093 | |||
1094 | note.n_namesz = 0; | ||
1095 | note.n_descsz = 0; | ||
1096 | note.n_type = 0; | ||
1097 | memcpy(buf, &note, sizeof(note)); | ||
1098 | } | ||
1099 | |||
1100 | void crash_save_cpu(struct pt_regs *regs, int cpu) | ||
1101 | { | ||
1102 | struct elf_prstatus prstatus; | ||
1103 | u32 *buf; | ||
1104 | |||
1105 | if ((cpu < 0) || (cpu >= NR_CPUS)) | ||
1106 | return; | ||
1107 | |||
1108 | /* Using ELF notes here is opportunistic. | ||
1109 | * I need a well defined structure format | ||
1110 | * for the data I pass, and I need tags | ||
1111 | * on the data to indicate what information I have | ||
1112 | * squirrelled away. ELF notes happen to provide | ||
1113 | * all of that, so there is no need to invent something new. | ||
1114 | */ | ||
1115 | buf = (u32*)per_cpu_ptr(crash_notes, cpu); | ||
1116 | if (!buf) | ||
1117 | return; | ||
1118 | memset(&prstatus, 0, sizeof(prstatus)); | ||
1119 | prstatus.pr_pid = current->pid; | ||
1120 | elf_core_copy_regs(&prstatus.pr_reg, regs); | ||
1121 | buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, | ||
1122 | sizeof(prstatus)); | ||
1123 | final_note(buf); | ||
1124 | } | ||
1125 | |||
1070 | static int __init crash_notes_memory_init(void) | 1126 | static int __init crash_notes_memory_init(void) |
1071 | { | 1127 | { |
1072 | /* Allocate memory for saving cpu registers. */ | 1128 | /* Allocate memory for saving cpu registers. */ |
diff --git a/kernel/kmod.c b/kernel/kmod.c index bb4e29d924e4..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
26 | #include <linux/smp_lock.h> | 26 | #include <linux/smp_lock.h> |
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/namespace.h> | 28 | #include <linux/mnt_namespace.h> |
29 | #include <linux/completion.h> | 29 | #include <linux/completion.h> |
30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
31 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module); | |||
114 | #endif /* CONFIG_KMOD */ | 114 | #endif /* CONFIG_KMOD */ |
115 | 115 | ||
116 | struct subprocess_info { | 116 | struct subprocess_info { |
117 | struct work_struct work; | ||
117 | struct completion *complete; | 118 | struct completion *complete; |
118 | char *path; | 119 | char *path; |
119 | char **argv; | 120 | char **argv; |
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data) | |||
221 | } | 222 | } |
222 | 223 | ||
223 | /* This is run by khelper thread */ | 224 | /* This is run by khelper thread */ |
224 | static void __call_usermodehelper(void *data) | 225 | static void __call_usermodehelper(struct work_struct *work) |
225 | { | 226 | { |
226 | struct subprocess_info *sub_info = data; | 227 | struct subprocess_info *sub_info = |
228 | container_of(work, struct subprocess_info, work); | ||
227 | pid_t pid; | 229 | pid_t pid; |
228 | int wait = sub_info->wait; | 230 | int wait = sub_info->wait; |
229 | 231 | ||
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
264 | { | 266 | { |
265 | DECLARE_COMPLETION_ONSTACK(done); | 267 | DECLARE_COMPLETION_ONSTACK(done); |
266 | struct subprocess_info sub_info = { | 268 | struct subprocess_info sub_info = { |
269 | .work = __WORK_INITIALIZER(sub_info.work, | ||
270 | __call_usermodehelper), | ||
267 | .complete = &done, | 271 | .complete = &done, |
268 | .path = path, | 272 | .path = path, |
269 | .argv = argv, | 273 | .argv = argv, |
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
272 | .wait = wait, | 276 | .wait = wait, |
273 | .retval = 0, | 277 | .retval = 0, |
274 | }; | 278 | }; |
275 | DECLARE_WORK(work, __call_usermodehelper, &sub_info); | ||
276 | 279 | ||
277 | if (!khelper_wq) | 280 | if (!khelper_wq) |
278 | return -EBUSY; | 281 | return -EBUSY; |
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
280 | if (path[0] == '\0') | 283 | if (path[0] == '\0') |
281 | return 0; | 284 | return 0; |
282 | 285 | ||
283 | queue_work(khelper_wq, &work); | 286 | queue_work(khelper_wq, &sub_info.work); |
284 | wait_for_completion(&done); | 287 | wait_for_completion(&done); |
285 | return sub_info.retval; | 288 | return sub_info.retval; |
286 | } | 289 | } |
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
291 | { | 294 | { |
292 | DECLARE_COMPLETION(done); | 295 | DECLARE_COMPLETION(done); |
293 | struct subprocess_info sub_info = { | 296 | struct subprocess_info sub_info = { |
297 | .work = __WORK_INITIALIZER(sub_info.work, | ||
298 | __call_usermodehelper), | ||
294 | .complete = &done, | 299 | .complete = &done, |
295 | .path = path, | 300 | .path = path, |
296 | .argv = argv, | 301 | .argv = argv, |
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
298 | .retval = 0, | 303 | .retval = 0, |
299 | }; | 304 | }; |
300 | struct file *f; | 305 | struct file *f; |
301 | DECLARE_WORK(work, __call_usermodehelper, &sub_info); | ||
302 | 306 | ||
303 | if (!khelper_wq) | 307 | if (!khelper_wq) |
304 | return -EBUSY; | 308 | return -EBUSY; |
@@ -307,18 +311,18 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
307 | return 0; | 311 | return 0; |
308 | 312 | ||
309 | f = create_write_pipe(); | 313 | f = create_write_pipe(); |
310 | if (!f) | 314 | if (IS_ERR(f)) |
311 | return -ENOMEM; | 315 | return PTR_ERR(f); |
312 | *filp = f; | 316 | *filp = f; |
313 | 317 | ||
314 | f = create_read_pipe(f); | 318 | f = create_read_pipe(f); |
315 | if (!f) { | 319 | if (IS_ERR(f)) { |
316 | free_write_pipe(*filp); | 320 | free_write_pipe(*filp); |
317 | return -ENOMEM; | 321 | return PTR_ERR(f); |
318 | } | 322 | } |
319 | sub_info.stdin = f; | 323 | sub_info.stdin = f; |
320 | 324 | ||
321 | queue_work(khelper_wq, &work); | 325 | queue_work(khelper_wq, &sub_info.work); |
322 | wait_for_completion(&done); | 326 | wait_for_completion(&done); |
323 | return sub_info.retval; | 327 | return sub_info.retval; |
324 | } | 328 | } |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 610c837ad9e0..17ec4afb0994 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <linux/kallsyms.h> | 40 | #include <linux/kallsyms.h> |
41 | #include <linux/freezer.h> | ||
41 | #include <asm-generic/sections.h> | 42 | #include <asm-generic/sections.h> |
42 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
43 | #include <asm/errno.h> | 44 | #include <asm/errno.h> |
@@ -83,9 +84,36 @@ struct kprobe_insn_page { | |||
83 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 84 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
84 | char slot_used[INSNS_PER_PAGE]; | 85 | char slot_used[INSNS_PER_PAGE]; |
85 | int nused; | 86 | int nused; |
87 | int ngarbage; | ||
86 | }; | 88 | }; |
87 | 89 | ||
88 | static struct hlist_head kprobe_insn_pages; | 90 | static struct hlist_head kprobe_insn_pages; |
91 | static int kprobe_garbage_slots; | ||
92 | static int collect_garbage_slots(void); | ||
93 | |||
94 | static int __kprobes check_safety(void) | ||
95 | { | ||
96 | int ret = 0; | ||
97 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) | ||
98 | ret = freeze_processes(); | ||
99 | if (ret == 0) { | ||
100 | struct task_struct *p, *q; | ||
101 | do_each_thread(p, q) { | ||
102 | if (p != current && p->state == TASK_RUNNING && | ||
103 | p->pid != 0) { | ||
104 | printk("Check failed: %s is running\n",p->comm); | ||
105 | ret = -1; | ||
106 | goto loop_end; | ||
107 | } | ||
108 | } while_each_thread(p, q); | ||
109 | } | ||
110 | loop_end: | ||
111 | thaw_processes(); | ||
112 | #else | ||
113 | synchronize_sched(); | ||
114 | #endif | ||
115 | return ret; | ||
116 | } | ||
89 | 117 | ||
90 | /** | 118 | /** |
91 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 119 | * get_insn_slot() - Find a slot on an executable page for an instruction. |
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
96 | struct kprobe_insn_page *kip; | 124 | struct kprobe_insn_page *kip; |
97 | struct hlist_node *pos; | 125 | struct hlist_node *pos; |
98 | 126 | ||
127 | retry: | ||
99 | hlist_for_each(pos, &kprobe_insn_pages) { | 128 | hlist_for_each(pos, &kprobe_insn_pages) { |
100 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | 129 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); |
101 | if (kip->nused < INSNS_PER_PAGE) { | 130 | if (kip->nused < INSNS_PER_PAGE) { |
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
112 | } | 141 | } |
113 | } | 142 | } |
114 | 143 | ||
115 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | 144 | /* If there are any garbage slots, collect it and try again. */ |
145 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | ||
146 | goto retry; | ||
147 | } | ||
148 | /* All out of space. Need to allocate a new page. Use slot 0. */ | ||
116 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 149 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); |
117 | if (!kip) { | 150 | if (!kip) { |
118 | return NULL; | 151 | return NULL; |
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
133 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | 166 | memset(kip->slot_used, 0, INSNS_PER_PAGE); |
134 | kip->slot_used[0] = 1; | 167 | kip->slot_used[0] = 1; |
135 | kip->nused = 1; | 168 | kip->nused = 1; |
169 | kip->ngarbage = 0; | ||
136 | return kip->insns; | 170 | return kip->insns; |
137 | } | 171 | } |
138 | 172 | ||
139 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) | 173 | /* Return 1 if all garbages are collected, otherwise 0. */ |
174 | static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | ||
175 | { | ||
176 | kip->slot_used[idx] = 0; | ||
177 | kip->nused--; | ||
178 | if (kip->nused == 0) { | ||
179 | /* | ||
180 | * Page is no longer in use. Free it unless | ||
181 | * it's the last one. We keep the last one | ||
182 | * so as not to have to set it up again the | ||
183 | * next time somebody inserts a probe. | ||
184 | */ | ||
185 | hlist_del(&kip->hlist); | ||
186 | if (hlist_empty(&kprobe_insn_pages)) { | ||
187 | INIT_HLIST_NODE(&kip->hlist); | ||
188 | hlist_add_head(&kip->hlist, | ||
189 | &kprobe_insn_pages); | ||
190 | } else { | ||
191 | module_free(NULL, kip->insns); | ||
192 | kfree(kip); | ||
193 | } | ||
194 | return 1; | ||
195 | } | ||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | static int __kprobes collect_garbage_slots(void) | ||
200 | { | ||
201 | struct kprobe_insn_page *kip; | ||
202 | struct hlist_node *pos, *next; | ||
203 | |||
204 | /* Ensure no-one is preepmted on the garbages */ | ||
205 | if (check_safety() != 0) | ||
206 | return -EAGAIN; | ||
207 | |||
208 | hlist_for_each_safe(pos, next, &kprobe_insn_pages) { | ||
209 | int i; | ||
210 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
211 | if (kip->ngarbage == 0) | ||
212 | continue; | ||
213 | kip->ngarbage = 0; /* we will collect all garbages */ | ||
214 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
215 | if (kip->slot_used[i] == -1 && | ||
216 | collect_one_slot(kip, i)) | ||
217 | break; | ||
218 | } | ||
219 | } | ||
220 | kprobe_garbage_slots = 0; | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | ||
140 | { | 225 | { |
141 | struct kprobe_insn_page *kip; | 226 | struct kprobe_insn_page *kip; |
142 | struct hlist_node *pos; | 227 | struct hlist_node *pos; |
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
146 | if (kip->insns <= slot && | 231 | if (kip->insns <= slot && |
147 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 232 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { |
148 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | 233 | int i = (slot - kip->insns) / MAX_INSN_SIZE; |
149 | kip->slot_used[i] = 0; | 234 | if (dirty) { |
150 | kip->nused--; | 235 | kip->slot_used[i] = -1; |
151 | if (kip->nused == 0) { | 236 | kip->ngarbage++; |
152 | /* | 237 | } else { |
153 | * Page is no longer in use. Free it unless | 238 | collect_one_slot(kip, i); |
154 | * it's the last one. We keep the last one | ||
155 | * so as not to have to set it up again the | ||
156 | * next time somebody inserts a probe. | ||
157 | */ | ||
158 | hlist_del(&kip->hlist); | ||
159 | if (hlist_empty(&kprobe_insn_pages)) { | ||
160 | INIT_HLIST_NODE(&kip->hlist); | ||
161 | hlist_add_head(&kip->hlist, | ||
162 | &kprobe_insn_pages); | ||
163 | } else { | ||
164 | module_free(NULL, kip->insns); | ||
165 | kfree(kip); | ||
166 | } | ||
167 | } | 239 | } |
168 | return; | 240 | break; |
169 | } | 241 | } |
170 | } | 242 | } |
243 | if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { | ||
244 | collect_garbage_slots(); | ||
245 | } | ||
171 | } | 246 | } |
172 | #endif | 247 | #endif |
173 | 248 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index 4f9c60ef95e8..1db8c72d0d38 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -31,6 +31,8 @@ struct kthread_create_info | |||
31 | /* Result passed back to kthread_create() from keventd. */ | 31 | /* Result passed back to kthread_create() from keventd. */ |
32 | struct task_struct *result; | 32 | struct task_struct *result; |
33 | struct completion done; | 33 | struct completion done; |
34 | |||
35 | struct work_struct work; | ||
34 | }; | 36 | }; |
35 | 37 | ||
36 | struct kthread_stop_info | 38 | struct kthread_stop_info |
@@ -111,9 +113,10 @@ static int kthread(void *_create) | |||
111 | } | 113 | } |
112 | 114 | ||
113 | /* We are keventd: create a thread. */ | 115 | /* We are keventd: create a thread. */ |
114 | static void keventd_create_kthread(void *_create) | 116 | static void keventd_create_kthread(struct work_struct *work) |
115 | { | 117 | { |
116 | struct kthread_create_info *create = _create; | 118 | struct kthread_create_info *create = |
119 | container_of(work, struct kthread_create_info, work); | ||
117 | int pid; | 120 | int pid; |
118 | 121 | ||
119 | /* We want our own signal handler (we take no signals by default). */ | 122 | /* We want our own signal handler (we take no signals by default). */ |
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
154 | ...) | 157 | ...) |
155 | { | 158 | { |
156 | struct kthread_create_info create; | 159 | struct kthread_create_info create; |
157 | DECLARE_WORK(work, keventd_create_kthread, &create); | ||
158 | 160 | ||
159 | create.threadfn = threadfn; | 161 | create.threadfn = threadfn; |
160 | create.data = data; | 162 | create.data = data; |
161 | init_completion(&create.started); | 163 | init_completion(&create.started); |
162 | init_completion(&create.done); | 164 | init_completion(&create.done); |
165 | INIT_WORK(&create.work, keventd_create_kthread); | ||
163 | 166 | ||
164 | /* | 167 | /* |
165 | * The workqueue needs to start up first: | 168 | * The workqueue needs to start up first: |
166 | */ | 169 | */ |
167 | if (!helper_wq) | 170 | if (!helper_wq) |
168 | work.func(work.data); | 171 | create.work.func(&create.work); |
169 | else { | 172 | else { |
170 | queue_work(helper_wq, &work); | 173 | queue_work(helper_wq, &create.work); |
171 | wait_for_completion(&create.done); | 174 | wait_for_completion(&create.done); |
172 | } | 175 | } |
173 | if (!IS_ERR(create.result)) { | 176 | if (!IS_ERR(create.result)) { |
diff --git a/kernel/latency.c b/kernel/latency.c index 258f2555abbc..e63fcacb61a7 100644 --- a/kernel/latency.c +++ b/kernel/latency.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/notifier.h> | 38 | #include <linux/notifier.h> |
39 | #include <linux/jiffies.h> | ||
39 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
40 | 41 | ||
41 | struct latency_info { | 42 | struct latency_info { |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 805a322a5655..b02032476dc2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -140,13 +140,6 @@ void lockdep_on(void) | |||
140 | 140 | ||
141 | EXPORT_SYMBOL(lockdep_on); | 141 | EXPORT_SYMBOL(lockdep_on); |
142 | 142 | ||
143 | int lockdep_internal(void) | ||
144 | { | ||
145 | return current->lockdep_recursion != 0; | ||
146 | } | ||
147 | |||
148 | EXPORT_SYMBOL(lockdep_internal); | ||
149 | |||
150 | /* | 143 | /* |
151 | * Debugging switches: | 144 | * Debugging switches: |
152 | */ | 145 | */ |
@@ -228,17 +221,15 @@ static int save_trace(struct stack_trace *trace) | |||
228 | trace->skip = 3; | 221 | trace->skip = 3; |
229 | trace->all_contexts = 0; | 222 | trace->all_contexts = 0; |
230 | 223 | ||
231 | /* Make sure to not recurse in case the the unwinder needs to tak | ||
232 | e locks. */ | ||
233 | lockdep_off(); | ||
234 | save_stack_trace(trace, NULL); | 224 | save_stack_trace(trace, NULL); |
235 | lockdep_on(); | ||
236 | 225 | ||
237 | trace->max_entries = trace->nr_entries; | 226 | trace->max_entries = trace->nr_entries; |
238 | 227 | ||
239 | nr_stack_trace_entries += trace->nr_entries; | 228 | nr_stack_trace_entries += trace->nr_entries; |
240 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) | 229 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) { |
230 | __raw_spin_unlock(&hash_lock); | ||
241 | return 0; | 231 | return 0; |
232 | } | ||
242 | 233 | ||
243 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { | 234 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { |
244 | __raw_spin_unlock(&hash_lock); | 235 | __raw_spin_unlock(&hash_lock); |
@@ -357,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 | |||
357 | 348 | ||
358 | static void print_lock_name(struct lock_class *class) | 349 | static void print_lock_name(struct lock_class *class) |
359 | { | 350 | { |
360 | char str[128], c1, c2, c3, c4; | 351 | char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; |
361 | const char *name; | 352 | const char *name; |
362 | 353 | ||
363 | get_usage_chars(class, &c1, &c2, &c3, &c4); | 354 | get_usage_chars(class, &c1, &c2, &c3, &c4); |
@@ -379,7 +370,7 @@ static void print_lock_name(struct lock_class *class) | |||
379 | static void print_lockdep_cache(struct lockdep_map *lock) | 370 | static void print_lockdep_cache(struct lockdep_map *lock) |
380 | { | 371 | { |
381 | const char *name; | 372 | const char *name; |
382 | char str[128]; | 373 | char str[KSYM_NAME_LEN + 1]; |
383 | 374 | ||
384 | name = lock->name; | 375 | name = lock->name; |
385 | if (!name) | 376 | if (!name) |
@@ -449,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
449 | print_lock_class_header(class, depth); | 440 | print_lock_class_header(class, depth); |
450 | 441 | ||
451 | list_for_each_entry(entry, &class->locks_after, entry) { | 442 | list_for_each_entry(entry, &class->locks_after, entry) { |
452 | DEBUG_LOCKS_WARN_ON(!entry->class); | 443 | if (DEBUG_LOCKS_WARN_ON(!entry->class)) |
444 | return; | ||
445 | |||
453 | print_lock_dependencies(entry->class, depth + 1); | 446 | print_lock_dependencies(entry->class, depth + 1); |
454 | 447 | ||
455 | printk("%*s ... acquired at:\n",depth,""); | 448 | printk("%*s ... acquired at:\n",depth,""); |
@@ -474,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
474 | return 0; | 467 | return 0; |
475 | 468 | ||
476 | entry->class = this; | 469 | entry->class = this; |
477 | save_trace(&entry->trace); | 470 | if (!save_trace(&entry->trace)) |
471 | return 0; | ||
478 | 472 | ||
479 | /* | 473 | /* |
480 | * Since we never remove from the dependency list, the list can | 474 | * Since we never remove from the dependency list, the list can |
@@ -562,8 +556,12 @@ static noinline int print_circular_bug_tail(void) | |||
562 | if (debug_locks_silent) | 556 | if (debug_locks_silent) |
563 | return 0; | 557 | return 0; |
564 | 558 | ||
559 | /* hash_lock unlocked by the header */ | ||
560 | __raw_spin_lock(&hash_lock); | ||
565 | this.class = check_source->class; | 561 | this.class = check_source->class; |
566 | save_trace(&this.trace); | 562 | if (!save_trace(&this.trace)) |
563 | return 0; | ||
564 | __raw_spin_unlock(&hash_lock); | ||
567 | print_circular_bug_entry(&this, 0); | 565 | print_circular_bug_entry(&this, 0); |
568 | 566 | ||
569 | printk("\nother info that might help us debug this:\n\n"); | 567 | printk("\nother info that might help us debug this:\n\n"); |
@@ -575,6 +573,8 @@ static noinline int print_circular_bug_tail(void) | |||
575 | return 0; | 573 | return 0; |
576 | } | 574 | } |
577 | 575 | ||
576 | #define RECURSION_LIMIT 40 | ||
577 | |||
578 | static int noinline print_infinite_recursion_bug(void) | 578 | static int noinline print_infinite_recursion_bug(void) |
579 | { | 579 | { |
580 | __raw_spin_unlock(&hash_lock); | 580 | __raw_spin_unlock(&hash_lock); |
@@ -595,7 +595,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) | |||
595 | debug_atomic_inc(&nr_cyclic_check_recursions); | 595 | debug_atomic_inc(&nr_cyclic_check_recursions); |
596 | if (depth > max_recursion_depth) | 596 | if (depth > max_recursion_depth) |
597 | max_recursion_depth = depth; | 597 | max_recursion_depth = depth; |
598 | if (depth >= 20) | 598 | if (depth >= RECURSION_LIMIT) |
599 | return print_infinite_recursion_bug(); | 599 | return print_infinite_recursion_bug(); |
600 | /* | 600 | /* |
601 | * Check this lock's dependency list: | 601 | * Check this lock's dependency list: |
@@ -645,7 +645,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) | |||
645 | 645 | ||
646 | if (depth > max_recursion_depth) | 646 | if (depth > max_recursion_depth) |
647 | max_recursion_depth = depth; | 647 | max_recursion_depth = depth; |
648 | if (depth >= 20) | 648 | if (depth >= RECURSION_LIMIT) |
649 | return print_infinite_recursion_bug(); | 649 | return print_infinite_recursion_bug(); |
650 | 650 | ||
651 | debug_atomic_inc(&nr_find_usage_forwards_checks); | 651 | debug_atomic_inc(&nr_find_usage_forwards_checks); |
@@ -684,7 +684,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) | |||
684 | 684 | ||
685 | if (depth > max_recursion_depth) | 685 | if (depth > max_recursion_depth) |
686 | max_recursion_depth = depth; | 686 | max_recursion_depth = depth; |
687 | if (depth >= 20) | 687 | if (depth >= RECURSION_LIMIT) |
688 | return print_infinite_recursion_bug(); | 688 | return print_infinite_recursion_bug(); |
689 | 689 | ||
690 | debug_atomic_inc(&nr_find_usage_backwards_checks); | 690 | debug_atomic_inc(&nr_find_usage_backwards_checks); |
@@ -964,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
964 | &prev->class->locks_after, next->acquire_ip); | 964 | &prev->class->locks_after, next->acquire_ip); |
965 | if (!ret) | 965 | if (!ret) |
966 | return 0; | 966 | return 0; |
967 | /* | 967 | |
968 | * Return value of 2 signals 'dependency already added', | ||
969 | * in that case we dont have to add the backlink either. | ||
970 | */ | ||
971 | if (ret == 2) | ||
972 | return 2; | ||
973 | ret = add_lock_to_list(next->class, prev->class, | 968 | ret = add_lock_to_list(next->class, prev->class, |
974 | &next->class->locks_before, next->acquire_ip); | 969 | &next->class->locks_before, next->acquire_ip); |
970 | if (!ret) | ||
971 | return 0; | ||
975 | 972 | ||
976 | /* | 973 | /* |
977 | * Debugging printouts: | 974 | * Debugging printouts: |
@@ -1023,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
1023 | * added: | 1020 | * added: |
1024 | */ | 1021 | */ |
1025 | if (hlock->read != 2) { | 1022 | if (hlock->read != 2) { |
1026 | check_prev_add(curr, hlock, next); | 1023 | if (!check_prev_add(curr, hlock, next)) |
1024 | return 0; | ||
1027 | /* | 1025 | /* |
1028 | * Stop after the first non-trylock entry, | 1026 | * Stop after the first non-trylock entry, |
1029 | * as non-trylock entries have added their | 1027 | * as non-trylock entries have added their |
@@ -1079,7 +1077,8 @@ static int static_obj(void *obj) | |||
1079 | */ | 1077 | */ |
1080 | for_each_possible_cpu(i) { | 1078 | for_each_possible_cpu(i) { |
1081 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | 1079 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); |
1082 | end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); | 1080 | end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM |
1081 | + per_cpu_offset(i); | ||
1083 | 1082 | ||
1084 | if ((addr >= start) && (addr < end)) | 1083 | if ((addr >= start) && (addr < end)) |
1085 | return 1; | 1084 | return 1; |
@@ -1174,11 +1173,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
1174 | * itself, so actual lookup of the hash should be once per lock object. | 1173 | * itself, so actual lookup of the hash should be once per lock object. |
1175 | */ | 1174 | */ |
1176 | static inline struct lock_class * | 1175 | static inline struct lock_class * |
1177 | register_lock_class(struct lockdep_map *lock, unsigned int subclass) | 1176 | register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) |
1178 | { | 1177 | { |
1179 | struct lockdep_subclass_key *key; | 1178 | struct lockdep_subclass_key *key; |
1180 | struct list_head *hash_head; | 1179 | struct list_head *hash_head; |
1181 | struct lock_class *class; | 1180 | struct lock_class *class; |
1181 | unsigned long flags; | ||
1182 | 1182 | ||
1183 | class = look_up_lock_class(lock, subclass); | 1183 | class = look_up_lock_class(lock, subclass); |
1184 | if (likely(class)) | 1184 | if (likely(class)) |
@@ -1200,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
1200 | key = lock->key->subkeys + subclass; | 1200 | key = lock->key->subkeys + subclass; |
1201 | hash_head = classhashentry(key); | 1201 | hash_head = classhashentry(key); |
1202 | 1202 | ||
1203 | raw_local_irq_save(flags); | ||
1203 | __raw_spin_lock(&hash_lock); | 1204 | __raw_spin_lock(&hash_lock); |
1204 | /* | 1205 | /* |
1205 | * We have to do the hash-walk again, to avoid races | 1206 | * We have to do the hash-walk again, to avoid races |
@@ -1214,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
1214 | */ | 1215 | */ |
1215 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 1216 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { |
1216 | __raw_spin_unlock(&hash_lock); | 1217 | __raw_spin_unlock(&hash_lock); |
1218 | raw_local_irq_restore(flags); | ||
1217 | debug_locks_off(); | 1219 | debug_locks_off(); |
1218 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | 1220 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); |
1219 | printk("turning off the locking correctness validator.\n"); | 1221 | printk("turning off the locking correctness validator.\n"); |
@@ -1236,17 +1238,20 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
1236 | 1238 | ||
1237 | if (verbose(class)) { | 1239 | if (verbose(class)) { |
1238 | __raw_spin_unlock(&hash_lock); | 1240 | __raw_spin_unlock(&hash_lock); |
1241 | raw_local_irq_restore(flags); | ||
1239 | printk("\nnew class %p: %s", class->key, class->name); | 1242 | printk("\nnew class %p: %s", class->key, class->name); |
1240 | if (class->name_version > 1) | 1243 | if (class->name_version > 1) |
1241 | printk("#%d", class->name_version); | 1244 | printk("#%d", class->name_version); |
1242 | printk("\n"); | 1245 | printk("\n"); |
1243 | dump_stack(); | 1246 | dump_stack(); |
1247 | raw_local_irq_save(flags); | ||
1244 | __raw_spin_lock(&hash_lock); | 1248 | __raw_spin_lock(&hash_lock); |
1245 | } | 1249 | } |
1246 | out_unlock_set: | 1250 | out_unlock_set: |
1247 | __raw_spin_unlock(&hash_lock); | 1251 | __raw_spin_unlock(&hash_lock); |
1252 | raw_local_irq_restore(flags); | ||
1248 | 1253 | ||
1249 | if (!subclass) | 1254 | if (!subclass || force) |
1250 | lock->class_cache = class; | 1255 | lock->class_cache = class; |
1251 | 1256 | ||
1252 | DEBUG_LOCKS_WARN_ON(class->subclass != subclass); | 1257 | DEBUG_LOCKS_WARN_ON(class->subclass != subclass); |
@@ -1725,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
1725 | debug_atomic_dec(&nr_unused_locks); | 1730 | debug_atomic_dec(&nr_unused_locks); |
1726 | break; | 1731 | break; |
1727 | default: | 1732 | default: |
1733 | __raw_spin_unlock(&hash_lock); | ||
1728 | debug_locks_off(); | 1734 | debug_locks_off(); |
1729 | WARN_ON(1); | 1735 | WARN_ON(1); |
1730 | return 0; | 1736 | return 0; |
@@ -1934,7 +1940,7 @@ void trace_softirqs_off(unsigned long ip) | |||
1934 | * Initialize a lock instance's lock-class mapping info: | 1940 | * Initialize a lock instance's lock-class mapping info: |
1935 | */ | 1941 | */ |
1936 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 1942 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
1937 | struct lock_class_key *key) | 1943 | struct lock_class_key *key, int subclass) |
1938 | { | 1944 | { |
1939 | if (unlikely(!debug_locks)) | 1945 | if (unlikely(!debug_locks)) |
1940 | return; | 1946 | return; |
@@ -1954,6 +1960,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
1954 | lock->name = name; | 1960 | lock->name = name; |
1955 | lock->key = key; | 1961 | lock->key = key; |
1956 | lock->class_cache = NULL; | 1962 | lock->class_cache = NULL; |
1963 | if (subclass) | ||
1964 | register_lock_class(lock, subclass, 1); | ||
1957 | } | 1965 | } |
1958 | 1966 | ||
1959 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 1967 | EXPORT_SYMBOL_GPL(lockdep_init_map); |
@@ -1992,7 +2000,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
1992 | * Not cached yet or subclass? | 2000 | * Not cached yet or subclass? |
1993 | */ | 2001 | */ |
1994 | if (unlikely(!class)) { | 2002 | if (unlikely(!class)) { |
1995 | class = register_lock_class(lock, subclass); | 2003 | class = register_lock_class(lock, subclass, 0); |
1996 | if (!class) | 2004 | if (!class) |
1997 | return 0; | 2005 | return 0; |
1998 | } | 2006 | } |
@@ -2640,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
2640 | } | 2648 | } |
2641 | local_irq_restore(flags); | 2649 | local_irq_restore(flags); |
2642 | } | 2650 | } |
2651 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | ||
2643 | 2652 | ||
2644 | static void print_held_locks_bug(struct task_struct *curr) | 2653 | static void print_held_locks_bug(struct task_struct *curr) |
2645 | { | 2654 | { |
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index eab043c83bb2..8ce09bc4613d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h | |||
@@ -20,7 +20,7 @@ | |||
20 | #define MAX_LOCKDEP_KEYS_BITS 11 | 20 | #define MAX_LOCKDEP_KEYS_BITS 11 |
21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) | 21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) |
22 | 22 | ||
23 | #define MAX_LOCKDEP_CHAINS_BITS 13 | 23 | #define MAX_LOCKDEP_CHAINS_BITS 14 |
24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | 24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) |
25 | 25 | ||
26 | /* | 26 | /* |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index f6e72eaab3fa..b554b40a4aa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v) | |||
113 | return 0; | 113 | return 0; |
114 | } | 114 | } |
115 | 115 | ||
116 | static struct seq_operations lockdep_ops = { | 116 | static const struct seq_operations lockdep_ops = { |
117 | .start = l_start, | 117 | .start = l_start, |
118 | .next = l_next, | 118 | .next = l_next, |
119 | .stop = l_stop, | 119 | .stop = l_stop, |
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file) | |||
135 | return res; | 135 | return res; |
136 | } | 136 | } |
137 | 137 | ||
138 | static struct file_operations proc_lockdep_operations = { | 138 | static const struct file_operations proc_lockdep_operations = { |
139 | .open = lockdep_open, | 139 | .open = lockdep_open, |
140 | .read = seq_read, | 140 | .read = seq_read, |
141 | .llseek = seq_lseek, | 141 | .llseek = seq_lseek, |
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file) | |||
319 | return single_open(file, lockdep_stats_show, NULL); | 319 | return single_open(file, lockdep_stats_show, NULL); |
320 | } | 320 | } |
321 | 321 | ||
322 | static struct file_operations proc_lockdep_stats_operations = { | 322 | static const struct file_operations proc_lockdep_stats_operations = { |
323 | .open = lockdep_stats_open, | 323 | .open = lockdep_stats_open, |
324 | .read = seq_read, | 324 | .read = seq_read, |
325 | .llseek = seq_lseek, | 325 | .llseek = seq_lseek, |
diff --git a/kernel/module.c b/kernel/module.c index 67009bd56c52..d9eae45d0145 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -34,10 +34,10 @@ | |||
34 | #include <linux/err.h> | 34 | #include <linux/err.h> |
35 | #include <linux/vermagic.h> | 35 | #include <linux/vermagic.h> |
36 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
37 | #include <linux/sched.h> | ||
37 | #include <linux/stop_machine.h> | 38 | #include <linux/stop_machine.h> |
38 | #include <linux/device.h> | 39 | #include <linux/device.h> |
39 | #include <linux/string.h> | 40 | #include <linux/string.h> |
40 | #include <linux/sched.h> | ||
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/unwind.h> | 42 | #include <linux/unwind.h> |
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = { | |||
790 | .show = show_refcnt, | 790 | .show = show_refcnt, |
791 | }; | 791 | }; |
792 | 792 | ||
793 | void module_put(struct module *module) | ||
794 | { | ||
795 | if (module) { | ||
796 | unsigned int cpu = get_cpu(); | ||
797 | local_dec(&module->ref[cpu].count); | ||
798 | /* Maybe they're waiting for us to drop reference? */ | ||
799 | if (unlikely(!module_is_live(module))) | ||
800 | wake_up_process(module->waiter); | ||
801 | put_cpu(); | ||
802 | } | ||
803 | } | ||
804 | EXPORT_SYMBOL(module_put); | ||
805 | |||
793 | #else /* !CONFIG_MODULE_UNLOAD */ | 806 | #else /* !CONFIG_MODULE_UNLOAD */ |
794 | static void print_unload_info(struct seq_file *m, struct module *mod) | 807 | static void print_unload_info(struct seq_file *m, struct module *mod) |
795 | { | 808 | { |
@@ -1086,22 +1099,35 @@ static int mod_sysfs_setup(struct module *mod, | |||
1086 | goto out; | 1099 | goto out; |
1087 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1100 | kobj_set_kset_s(&mod->mkobj, module_subsys); |
1088 | mod->mkobj.mod = mod; | 1101 | mod->mkobj.mod = mod; |
1089 | err = kobject_register(&mod->mkobj.kobj); | 1102 | |
1103 | /* delay uevent until full sysfs population */ | ||
1104 | kobject_init(&mod->mkobj.kobj); | ||
1105 | err = kobject_add(&mod->mkobj.kobj); | ||
1090 | if (err) | 1106 | if (err) |
1091 | goto out; | 1107 | goto out; |
1092 | 1108 | ||
1109 | mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); | ||
1110 | if (!mod->drivers_dir) | ||
1111 | goto out_unreg; | ||
1112 | |||
1093 | err = module_param_sysfs_setup(mod, kparam, num_params); | 1113 | err = module_param_sysfs_setup(mod, kparam, num_params); |
1094 | if (err) | 1114 | if (err) |
1095 | goto out_unreg; | 1115 | goto out_unreg_drivers; |
1096 | 1116 | ||
1097 | err = module_add_modinfo_attrs(mod); | 1117 | err = module_add_modinfo_attrs(mod); |
1098 | if (err) | 1118 | if (err) |
1099 | goto out_unreg; | 1119 | goto out_unreg_param; |
1100 | 1120 | ||
1121 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | ||
1101 | return 0; | 1122 | return 0; |
1102 | 1123 | ||
1124 | out_unreg_drivers: | ||
1125 | kobject_unregister(mod->drivers_dir); | ||
1126 | out_unreg_param: | ||
1127 | module_param_sysfs_remove(mod); | ||
1103 | out_unreg: | 1128 | out_unreg: |
1104 | kobject_unregister(&mod->mkobj.kobj); | 1129 | kobject_del(&mod->mkobj.kobj); |
1130 | kobject_put(&mod->mkobj.kobj); | ||
1105 | out: | 1131 | out: |
1106 | return err; | 1132 | return err; |
1107 | } | 1133 | } |
@@ -1110,6 +1136,7 @@ static void mod_kobject_remove(struct module *mod) | |||
1110 | { | 1136 | { |
1111 | module_remove_modinfo_attrs(mod); | 1137 | module_remove_modinfo_attrs(mod); |
1112 | module_param_sysfs_remove(mod); | 1138 | module_param_sysfs_remove(mod); |
1139 | kobject_unregister(mod->drivers_dir); | ||
1113 | 1140 | ||
1114 | kobject_unregister(&mod->mkobj.kobj); | 1141 | kobject_unregister(&mod->mkobj.kobj); |
1115 | } | 1142 | } |
@@ -1342,7 +1369,7 @@ static void set_license(struct module *mod, const char *license) | |||
1342 | 1369 | ||
1343 | if (!license_is_gpl_compatible(license)) { | 1370 | if (!license_is_gpl_compatible(license)) { |
1344 | if (!(tainted & TAINT_PROPRIETARY_MODULE)) | 1371 | if (!(tainted & TAINT_PROPRIETARY_MODULE)) |
1345 | printk(KERN_WARNING "%s: module license '%s' taints" | 1372 | printk(KERN_WARNING "%s: module license '%s' taints " |
1346 | "kernel.\n", mod->name, license); | 1373 | "kernel.\n", mod->name, license); |
1347 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1374 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1348 | } | 1375 | } |
@@ -1718,7 +1745,7 @@ static struct module *load_module(void __user *umod, | |||
1718 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1745 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1719 | 1746 | ||
1720 | if (strcmp(mod->name, "ndiswrapper") == 0) | 1747 | if (strcmp(mod->name, "ndiswrapper") == 0) |
1721 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1748 | add_taint(TAINT_PROPRIETARY_MODULE); |
1722 | if (strcmp(mod->name, "driverloader") == 0) | 1749 | if (strcmp(mod->name, "driverloader") == 0) |
1723 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1750 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1724 | 1751 | ||
@@ -2182,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p) | |||
2182 | Where refcount is a number or -, and deps is a comma-separated list | 2209 | Where refcount is a number or -, and deps is a comma-separated list |
2183 | of depends or -. | 2210 | of depends or -. |
2184 | */ | 2211 | */ |
2185 | struct seq_operations modules_op = { | 2212 | const struct seq_operations modules_op = { |
2186 | .start = m_start, | 2213 | .start = m_start, |
2187 | .next = m_next, | 2214 | .next = m_next, |
2188 | .stop = m_stop, | 2215 | .stop = m_stop, |
@@ -2275,11 +2302,14 @@ void print_modules(void) | |||
2275 | 2302 | ||
2276 | void module_add_driver(struct module *mod, struct device_driver *drv) | 2303 | void module_add_driver(struct module *mod, struct device_driver *drv) |
2277 | { | 2304 | { |
2305 | int no_warn; | ||
2306 | |||
2278 | if (!mod || !drv) | 2307 | if (!mod || !drv) |
2279 | return; | 2308 | return; |
2280 | 2309 | ||
2281 | /* Don't check return code; this call is idempotent */ | 2310 | /* Don't check return codes; these calls are idempotent */ |
2282 | sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); | 2311 | no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); |
2312 | no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name); | ||
2283 | } | 2313 | } |
2284 | EXPORT_SYMBOL(module_add_driver); | 2314 | EXPORT_SYMBOL(module_add_driver); |
2285 | 2315 | ||
@@ -2288,6 +2318,8 @@ void module_remove_driver(struct device_driver *drv) | |||
2288 | if (!drv) | 2318 | if (!drv) |
2289 | return; | 2319 | return; |
2290 | sysfs_remove_link(&drv->kobj, "module"); | 2320 | sysfs_remove_link(&drv->kobj, "module"); |
2321 | if (drv->owner && drv->owner->drivers_dir) | ||
2322 | sysfs_remove_link(drv->owner->drivers_dir, drv->name); | ||
2291 | } | 2323 | } |
2292 | EXPORT_SYMBOL(module_remove_driver); | 2324 | EXPORT_SYMBOL(module_remove_driver); |
2293 | 2325 | ||
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index e3203c654dda..841539d72c55 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
77 | 77 | ||
78 | void debug_mutex_unlock(struct mutex *lock) | 78 | void debug_mutex_unlock(struct mutex *lock) |
79 | { | 79 | { |
80 | if (unlikely(!debug_locks)) | ||
81 | return; | ||
82 | |||
80 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 83 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
81 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 84 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 85 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
@@ -91,7 +94,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, | |||
91 | * Make sure we are not reinitializing a held lock: | 94 | * Make sure we are not reinitializing a held lock: |
92 | */ | 95 | */ |
93 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | 96 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); |
94 | lockdep_init_map(&lock->dep_map, name, key); | 97 | lockdep_init_map(&lock->dep_map, name, key, 0); |
95 | #endif | 98 | #endif |
96 | lock->owner = NULL; | 99 | lock->owner = NULL; |
97 | lock->magic = lock; | 100 | lock->magic = lock; |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
206 | } | 206 | } |
207 | 207 | ||
208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
209 | |||
210 | int __sched | ||
211 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | ||
212 | { | ||
213 | might_sleep(); | ||
214 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); | ||
215 | } | ||
216 | |||
217 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | ||
209 | #endif | 218 | #endif |
210 | 219 | ||
211 | /* | 220 | /* |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ebdb82a0ce4..e2ce748e96af 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -17,8 +17,9 @@ | |||
17 | #include <linux/version.h> | 17 | #include <linux/version.h> |
18 | #include <linux/nsproxy.h> | 18 | #include <linux/nsproxy.h> |
19 | #include <linux/init_task.h> | 19 | #include <linux/init_task.h> |
20 | #include <linux/namespace.h> | 20 | #include <linux/mnt_namespace.h> |
21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
22 | #include <linux/pid_namespace.h> | ||
22 | 23 | ||
23 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 24 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); |
24 | 25 | ||
@@ -44,10 +45,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) | |||
44 | { | 45 | { |
45 | struct nsproxy *ns; | 46 | struct nsproxy *ns; |
46 | 47 | ||
47 | ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); | 48 | ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); |
48 | if (ns) { | 49 | if (ns) { |
49 | memcpy(ns, orig, sizeof(struct nsproxy)); | ||
50 | atomic_set(&ns->count, 1); | 50 | atomic_set(&ns->count, 1); |
51 | ns->id = -1; | ||
51 | } | 52 | } |
52 | return ns; | 53 | return ns; |
53 | } | 54 | } |
@@ -62,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) | |||
62 | struct nsproxy *ns = clone_namespaces(orig); | 63 | struct nsproxy *ns = clone_namespaces(orig); |
63 | 64 | ||
64 | if (ns) { | 65 | if (ns) { |
65 | if (ns->namespace) | 66 | if (ns->mnt_ns) |
66 | get_namespace(ns->namespace); | 67 | get_mnt_ns(ns->mnt_ns); |
67 | if (ns->uts_ns) | 68 | if (ns->uts_ns) |
68 | get_uts_ns(ns->uts_ns); | 69 | get_uts_ns(ns->uts_ns); |
69 | if (ns->ipc_ns) | 70 | if (ns->ipc_ns) |
70 | get_ipc_ns(ns->ipc_ns); | 71 | get_ipc_ns(ns->ipc_ns); |
72 | if (ns->pid_ns) | ||
73 | get_pid_ns(ns->pid_ns); | ||
71 | } | 74 | } |
72 | 75 | ||
73 | return ns; | 76 | return ns; |
@@ -99,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
99 | 102 | ||
100 | tsk->nsproxy = new_ns; | 103 | tsk->nsproxy = new_ns; |
101 | 104 | ||
102 | err = copy_namespace(flags, tsk); | 105 | err = copy_mnt_ns(flags, tsk); |
103 | if (err) | 106 | if (err) |
104 | goto out_ns; | 107 | goto out_ns; |
105 | 108 | ||
@@ -111,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
111 | if (err) | 114 | if (err) |
112 | goto out_ipc; | 115 | goto out_ipc; |
113 | 116 | ||
117 | err = copy_pid_ns(flags, tsk); | ||
118 | if (err) | ||
119 | goto out_pid; | ||
120 | |||
114 | out: | 121 | out: |
115 | put_nsproxy(old_ns); | 122 | put_nsproxy(old_ns); |
116 | return err; | 123 | return err; |
117 | 124 | ||
125 | out_pid: | ||
126 | if (new_ns->ipc_ns) | ||
127 | put_ipc_ns(new_ns->ipc_ns); | ||
118 | out_ipc: | 128 | out_ipc: |
119 | if (new_ns->uts_ns) | 129 | if (new_ns->uts_ns) |
120 | put_uts_ns(new_ns->uts_ns); | 130 | put_uts_ns(new_ns->uts_ns); |
121 | out_uts: | 131 | out_uts: |
122 | if (new_ns->namespace) | 132 | if (new_ns->mnt_ns) |
123 | put_namespace(new_ns->namespace); | 133 | put_mnt_ns(new_ns->mnt_ns); |
124 | out_ns: | 134 | out_ns: |
125 | tsk->nsproxy = old_ns; | 135 | tsk->nsproxy = old_ns; |
126 | kfree(new_ns); | 136 | kfree(new_ns); |
@@ -129,11 +139,13 @@ out_ns: | |||
129 | 139 | ||
130 | void free_nsproxy(struct nsproxy *ns) | 140 | void free_nsproxy(struct nsproxy *ns) |
131 | { | 141 | { |
132 | if (ns->namespace) | 142 | if (ns->mnt_ns) |
133 | put_namespace(ns->namespace); | 143 | put_mnt_ns(ns->mnt_ns); |
134 | if (ns->uts_ns) | 144 | if (ns->uts_ns) |
135 | put_uts_ns(ns->uts_ns); | 145 | put_uts_ns(ns->uts_ns); |
136 | if (ns->ipc_ns) | 146 | if (ns->ipc_ns) |
137 | put_ipc_ns(ns->ipc_ns); | 147 | put_ipc_ns(ns->ipc_ns); |
138 | kfree(ns); | 148 | if (ns->pid_ns) |
149 | put_pid_ns(ns->pid_ns); | ||
150 | kfree(ns); | ||
139 | } | 151 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index b914392085f9..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -26,12 +26,12 @@ | |||
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
28 | #include <linux/hash.h> | 28 | #include <linux/hash.h> |
29 | #include <linux/pspace.h> | 29 | #include <linux/pid_namespace.h> |
30 | 30 | ||
31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) |
32 | static struct hlist_head *pid_hash; | 32 | static struct hlist_head *pid_hash; |
33 | static int pidhash_shift; | 33 | static int pidhash_shift; |
34 | static kmem_cache_t *pid_cachep; | 34 | static struct kmem_cache *pid_cachep; |
35 | 35 | ||
36 | int pid_max = PID_MAX_DEFAULT; | 36 | int pid_max = PID_MAX_DEFAULT; |
37 | 37 | ||
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; | |||
43 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) |
45 | 45 | ||
46 | static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | 46 | static inline int mk_pid(struct pid_namespace *pid_ns, |
47 | struct pidmap *map, int off) | ||
47 | { | 48 | { |
48 | return (map - pspace->pidmap)*BITS_PER_PAGE + off; | 49 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; |
49 | } | 50 | } |
50 | 51 | ||
51 | #define find_next_offset(map, off) \ | 52 | #define find_next_offset(map, off) \ |
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | |||
57 | * value does not cause lots of bitmaps to be allocated, but | 58 | * value does not cause lots of bitmaps to be allocated, but |
58 | * the scheme scales to up to 4 million PIDs, runtime. | 59 | * the scheme scales to up to 4 million PIDs, runtime. |
59 | */ | 60 | */ |
60 | struct pspace init_pspace = { | 61 | struct pid_namespace init_pid_ns = { |
62 | .kref = { | ||
63 | .refcount = ATOMIC_INIT(2), | ||
64 | }, | ||
61 | .pidmap = { | 65 | .pidmap = { |
62 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 66 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
63 | }, | 67 | }, |
64 | .last_pid = 0 | 68 | .last_pid = 0, |
69 | .child_reaper = &init_task | ||
65 | }; | 70 | }; |
66 | 71 | ||
67 | /* | 72 | /* |
@@ -80,25 +85,25 @@ struct pspace init_pspace = { | |||
80 | 85 | ||
81 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 86 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
82 | 87 | ||
83 | static fastcall void free_pidmap(struct pspace *pspace, int pid) | 88 | static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) |
84 | { | 89 | { |
85 | struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; | 90 | struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; |
86 | int offset = pid & BITS_PER_PAGE_MASK; | 91 | int offset = pid & BITS_PER_PAGE_MASK; |
87 | 92 | ||
88 | clear_bit(offset, map->page); | 93 | clear_bit(offset, map->page); |
89 | atomic_inc(&map->nr_free); | 94 | atomic_inc(&map->nr_free); |
90 | } | 95 | } |
91 | 96 | ||
92 | static int alloc_pidmap(struct pspace *pspace) | 97 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
93 | { | 98 | { |
94 | int i, offset, max_scan, pid, last = pspace->last_pid; | 99 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
95 | struct pidmap *map; | 100 | struct pidmap *map; |
96 | 101 | ||
97 | pid = last + 1; | 102 | pid = last + 1; |
98 | if (pid >= pid_max) | 103 | if (pid >= pid_max) |
99 | pid = RESERVED_PIDS; | 104 | pid = RESERVED_PIDS; |
100 | offset = pid & BITS_PER_PAGE_MASK; | 105 | offset = pid & BITS_PER_PAGE_MASK; |
101 | map = &pspace->pidmap[pid/BITS_PER_PAGE]; | 106 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
102 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 107 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; |
103 | for (i = 0; i <= max_scan; ++i) { | 108 | for (i = 0; i <= max_scan; ++i) { |
104 | if (unlikely(!map->page)) { | 109 | if (unlikely(!map->page)) { |
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) | |||
120 | do { | 125 | do { |
121 | if (!test_and_set_bit(offset, map->page)) { | 126 | if (!test_and_set_bit(offset, map->page)) { |
122 | atomic_dec(&map->nr_free); | 127 | atomic_dec(&map->nr_free); |
123 | pspace->last_pid = pid; | 128 | pid_ns->last_pid = pid; |
124 | return pid; | 129 | return pid; |
125 | } | 130 | } |
126 | offset = find_next_offset(map, offset); | 131 | offset = find_next_offset(map, offset); |
127 | pid = mk_pid(pspace, map, offset); | 132 | pid = mk_pid(pid_ns, map, offset); |
128 | /* | 133 | /* |
129 | * find_next_offset() found a bit, the pid from it | 134 | * find_next_offset() found a bit, the pid from it |
130 | * is in-bounds, and if we fell back to the last | 135 | * is in-bounds, and if we fell back to the last |
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) | |||
135 | (i != max_scan || pid < last || | 140 | (i != max_scan || pid < last || |
136 | !((last+1) & BITS_PER_PAGE_MASK))); | 141 | !((last+1) & BITS_PER_PAGE_MASK))); |
137 | } | 142 | } |
138 | if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 143 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
139 | ++map; | 144 | ++map; |
140 | offset = 0; | 145 | offset = 0; |
141 | } else { | 146 | } else { |
142 | map = &pspace->pidmap[0]; | 147 | map = &pid_ns->pidmap[0]; |
143 | offset = RESERVED_PIDS; | 148 | offset = RESERVED_PIDS; |
144 | if (unlikely(last == offset)) | 149 | if (unlikely(last == offset)) |
145 | break; | 150 | break; |
146 | } | 151 | } |
147 | pid = mk_pid(pspace, map, offset); | 152 | pid = mk_pid(pid_ns, map, offset); |
148 | } | 153 | } |
149 | return -1; | 154 | return -1; |
150 | } | 155 | } |
151 | 156 | ||
152 | static int next_pidmap(struct pspace *pspace, int last) | 157 | static int next_pidmap(struct pid_namespace *pid_ns, int last) |
153 | { | 158 | { |
154 | int offset; | 159 | int offset; |
155 | struct pidmap *map, *end; | 160 | struct pidmap *map, *end; |
156 | 161 | ||
157 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 162 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
158 | map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; | 163 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
159 | end = &pspace->pidmap[PIDMAP_ENTRIES]; | 164 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
160 | for (; map < end; map++, offset = 0) { | 165 | for (; map < end; map++, offset = 0) { |
161 | if (unlikely(!map->page)) | 166 | if (unlikely(!map->page)) |
162 | continue; | 167 | continue; |
163 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); | 168 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); |
164 | if (offset < BITS_PER_PAGE) | 169 | if (offset < BITS_PER_PAGE) |
165 | return mk_pid(pspace, map, offset); | 170 | return mk_pid(pid_ns, map, offset); |
166 | } | 171 | } |
167 | return -1; | 172 | return -1; |
168 | } | 173 | } |
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) | |||
192 | hlist_del_rcu(&pid->pid_chain); | 197 | hlist_del_rcu(&pid->pid_chain); |
193 | spin_unlock_irqrestore(&pidmap_lock, flags); | 198 | spin_unlock_irqrestore(&pidmap_lock, flags); |
194 | 199 | ||
195 | free_pidmap(&init_pspace, pid->nr); | 200 | free_pidmap(current->nsproxy->pid_ns, pid->nr); |
196 | call_rcu(&pid->rcu, delayed_put_pid); | 201 | call_rcu(&pid->rcu, delayed_put_pid); |
197 | } | 202 | } |
198 | 203 | ||
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void) | |||
206 | if (!pid) | 211 | if (!pid) |
207 | goto out; | 212 | goto out; |
208 | 213 | ||
209 | nr = alloc_pidmap(&init_pspace); | 214 | nr = alloc_pidmap(current->nsproxy->pid_ns); |
210 | if (nr < 0) | 215 | if (nr < 0) |
211 | goto out_free; | 216 | goto out_free; |
212 | 217 | ||
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) | |||
348 | pid = find_pid(nr); | 353 | pid = find_pid(nr); |
349 | if (pid) | 354 | if (pid) |
350 | break; | 355 | break; |
351 | nr = next_pidmap(&init_pspace, nr); | 356 | nr = next_pidmap(current->nsproxy->pid_ns, nr); |
352 | } while (nr > 0); | 357 | } while (nr > 0); |
353 | 358 | ||
354 | return pid; | 359 | return pid; |
355 | } | 360 | } |
356 | EXPORT_SYMBOL_GPL(find_get_pid); | 361 | EXPORT_SYMBOL_GPL(find_get_pid); |
357 | 362 | ||
363 | int copy_pid_ns(int flags, struct task_struct *tsk) | ||
364 | { | ||
365 | struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; | ||
366 | int err = 0; | ||
367 | |||
368 | if (!old_ns) | ||
369 | return 0; | ||
370 | |||
371 | get_pid_ns(old_ns); | ||
372 | return err; | ||
373 | } | ||
374 | |||
375 | void free_pid_ns(struct kref *kref) | ||
376 | { | ||
377 | struct pid_namespace *ns; | ||
378 | |||
379 | ns = container_of(kref, struct pid_namespace, kref); | ||
380 | kfree(ns); | ||
381 | } | ||
382 | |||
358 | /* | 383 | /* |
359 | * The pid hash table is scaled according to the amount of memory in the | 384 | * The pid hash table is scaled according to the amount of memory in the |
360 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | 385 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or |
@@ -382,10 +407,10 @@ void __init pidhash_init(void) | |||
382 | 407 | ||
383 | void __init pidmap_init(void) | 408 | void __init pidmap_init(void) |
384 | { | 409 | { |
385 | init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 410 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
386 | /* Reserve PID 0. We never call free_pidmap(0) */ | 411 | /* Reserve PID 0. We never call free_pidmap(0) */ |
387 | set_bit(0, init_pspace.pidmap[0].page); | 412 | set_bit(0, init_pid_ns.pidmap[0].page); |
388 | atomic_dec(&init_pspace.pidmap[0].nr_free); | 413 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
389 | 414 | ||
390 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), | 415 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), |
391 | __alignof__(struct pid), | 416 | __alignof__(struct pid), |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 479b16b44f79..7c3e1e6dfb5b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | |||
88 | } | 88 | } |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Divide and limit the result to res >= 1 | ||
92 | * | ||
93 | * This is necessary to prevent signal delivery starvation, when the result of | ||
94 | * the division would be rounded down to 0. | ||
95 | */ | ||
96 | static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) | ||
97 | { | ||
98 | cputime_t res = cputime_div(time, div); | ||
99 | |||
100 | return max_t(cputime_t, res, 1); | ||
101 | } | ||
102 | |||
103 | /* | ||
91 | * Update expiry time from increment, and increase overrun count, | 104 | * Update expiry time from increment, and increase overrun count, |
92 | * given the current clock sample. | 105 | * given the current clock sample. |
93 | */ | 106 | */ |
@@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p, | |||
483 | BUG(); | 496 | BUG(); |
484 | break; | 497 | break; |
485 | case CPUCLOCK_PROF: | 498 | case CPUCLOCK_PROF: |
486 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 499 | left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), |
487 | nthreads); | 500 | nthreads); |
488 | do { | 501 | do { |
489 | if (likely(!(t->flags & PF_EXITING))) { | 502 | if (likely(!(t->flags & PF_EXITING))) { |
490 | ticks = cputime_add(prof_ticks(t), left); | 503 | ticks = cputime_add(prof_ticks(t), left); |
@@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p, | |||
498 | } while (t != p); | 511 | } while (t != p); |
499 | break; | 512 | break; |
500 | case CPUCLOCK_VIRT: | 513 | case CPUCLOCK_VIRT: |
501 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 514 | left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), |
502 | nthreads); | 515 | nthreads); |
503 | do { | 516 | do { |
504 | if (likely(!(t->flags & PF_EXITING))) { | 517 | if (likely(!(t->flags & PF_EXITING))) { |
505 | ticks = cputime_add(virt_ticks(t), left); | 518 | ticks = cputime_add(virt_ticks(t), left); |
@@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
515 | case CPUCLOCK_SCHED: | 528 | case CPUCLOCK_SCHED: |
516 | nsleft = expires.sched - val.sched; | 529 | nsleft = expires.sched - val.sched; |
517 | do_div(nsleft, nthreads); | 530 | do_div(nsleft, nthreads); |
531 | nsleft = max_t(unsigned long long, nsleft, 1); | ||
518 | do { | 532 | do { |
519 | if (likely(!(t->flags & PF_EXITING))) { | 533 | if (likely(!(t->flags & PF_EXITING))) { |
520 | ns = t->sched_time + nsleft; | 534 | ns = t->sched_time + nsleft; |
@@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1159 | 1173 | ||
1160 | prof_left = cputime_sub(prof_expires, utime); | 1174 | prof_left = cputime_sub(prof_expires, utime); |
1161 | prof_left = cputime_sub(prof_left, stime); | 1175 | prof_left = cputime_sub(prof_left, stime); |
1162 | prof_left = cputime_div(prof_left, nthreads); | 1176 | prof_left = cputime_div_non_zero(prof_left, nthreads); |
1163 | virt_left = cputime_sub(virt_expires, utime); | 1177 | virt_left = cputime_sub(virt_expires, utime); |
1164 | virt_left = cputime_div(virt_left, nthreads); | 1178 | virt_left = cputime_div_non_zero(virt_left, nthreads); |
1165 | if (sched_expires) { | 1179 | if (sched_expires) { |
1166 | sched_left = sched_expires - sched_time; | 1180 | sched_left = sched_expires - sched_time; |
1167 | do_div(sched_left, nthreads); | 1181 | do_div(sched_left, nthreads); |
1182 | sched_left = max_t(unsigned long long, sched_left, 1); | ||
1168 | } else { | 1183 | } else { |
1169 | sched_left = 0; | 1184 | sched_left = 0; |
1170 | } | 1185 | } |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9cbb5d1be06f..5fe87de10ff0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -70,7 +70,7 @@ | |||
70 | /* | 70 | /* |
71 | * Lets keep our timers in a slab cache :-) | 71 | * Lets keep our timers in a slab cache :-) |
72 | */ | 72 | */ |
73 | static kmem_cache_t *posix_timers_cache; | 73 | static struct kmem_cache *posix_timers_cache; |
74 | static struct idr posix_timers_id; | 74 | static struct idr posix_timers_id; |
75 | static DEFINE_SPINLOCK(idr_lock); | 75 | static DEFINE_SPINLOCK(idr_lock); |
76 | 76 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 825068ca3479..710ed084e7c5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED | |||
78 | 78 | ||
79 | config SOFTWARE_SUSPEND | 79 | config SOFTWARE_SUSPEND |
80 | bool "Software Suspend" | 80 | bool "Software Suspend" |
81 | depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) | 81 | depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) |
82 | ---help--- | 82 | ---help--- |
83 | Enable the possibility of suspending the machine. | 83 | Enable the possibility of suspending the machine. |
84 | It doesn't need ACPI or APM. | 84 | It doesn't need ACPI or APM. |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d3a158a60312..0b00f56c2ad0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/pm.h> | 20 | #include <linux/pm.h> |
21 | #include <linux/console.h> | 21 | #include <linux/console.h> |
22 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
23 | #include <linux/freezer.h> | ||
23 | 24 | ||
24 | #include "power.h" | 25 | #include "power.h" |
25 | 26 | ||
@@ -27,6 +28,23 @@ | |||
27 | static int noresume = 0; | 28 | static int noresume = 0; |
28 | char resume_file[256] = CONFIG_PM_STD_PARTITION; | 29 | char resume_file[256] = CONFIG_PM_STD_PARTITION; |
29 | dev_t swsusp_resume_device; | 30 | dev_t swsusp_resume_device; |
31 | sector_t swsusp_resume_block; | ||
32 | |||
33 | /** | ||
34 | * platform_prepare - prepare the machine for hibernation using the | ||
35 | * platform driver if so configured and return an error code if it fails | ||
36 | */ | ||
37 | |||
38 | static inline int platform_prepare(void) | ||
39 | { | ||
40 | int error = 0; | ||
41 | |||
42 | if (pm_disk_mode == PM_DISK_PLATFORM) { | ||
43 | if (pm_ops && pm_ops->prepare) | ||
44 | error = pm_ops->prepare(PM_SUSPEND_DISK); | ||
45 | } | ||
46 | return error; | ||
47 | } | ||
30 | 48 | ||
31 | /** | 49 | /** |
32 | * power_down - Shut machine down for hibernate. | 50 | * power_down - Shut machine down for hibernate. |
@@ -40,12 +58,10 @@ dev_t swsusp_resume_device; | |||
40 | 58 | ||
41 | static void power_down(suspend_disk_method_t mode) | 59 | static void power_down(suspend_disk_method_t mode) |
42 | { | 60 | { |
43 | int error = 0; | ||
44 | |||
45 | switch(mode) { | 61 | switch(mode) { |
46 | case PM_DISK_PLATFORM: | 62 | case PM_DISK_PLATFORM: |
47 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 63 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); |
48 | error = pm_ops->enter(PM_SUSPEND_DISK); | 64 | pm_ops->enter(PM_SUSPEND_DISK); |
49 | break; | 65 | break; |
50 | case PM_DISK_SHUTDOWN: | 66 | case PM_DISK_SHUTDOWN: |
51 | kernel_power_off(); | 67 | kernel_power_off(); |
@@ -71,7 +87,7 @@ static inline void platform_finish(void) | |||
71 | 87 | ||
72 | static int prepare_processes(void) | 88 | static int prepare_processes(void) |
73 | { | 89 | { |
74 | int error; | 90 | int error = 0; |
75 | 91 | ||
76 | pm_prepare_console(); | 92 | pm_prepare_console(); |
77 | 93 | ||
@@ -84,12 +100,24 @@ static int prepare_processes(void) | |||
84 | goto thaw; | 100 | goto thaw; |
85 | } | 101 | } |
86 | 102 | ||
103 | if (pm_disk_mode == PM_DISK_TESTPROC) { | ||
104 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
105 | mdelay(5000); | ||
106 | goto thaw; | ||
107 | } | ||
108 | |||
109 | error = platform_prepare(); | ||
110 | if (error) | ||
111 | goto thaw; | ||
112 | |||
87 | /* Free memory before shutting down devices. */ | 113 | /* Free memory before shutting down devices. */ |
88 | if (!(error = swsusp_shrink_memory())) | 114 | if (!(error = swsusp_shrink_memory())) |
89 | return 0; | 115 | return 0; |
90 | thaw: | 116 | |
117 | platform_finish(); | ||
118 | thaw: | ||
91 | thaw_processes(); | 119 | thaw_processes(); |
92 | enable_cpus: | 120 | enable_cpus: |
93 | enable_nonboot_cpus(); | 121 | enable_nonboot_cpus(); |
94 | pm_restore_console(); | 122 | pm_restore_console(); |
95 | return error; | 123 | return error; |
@@ -120,13 +148,21 @@ int pm_suspend_disk(void) | |||
120 | if (error) | 148 | if (error) |
121 | return error; | 149 | return error; |
122 | 150 | ||
151 | if (pm_disk_mode == PM_DISK_TESTPROC) | ||
152 | return 0; | ||
153 | |||
123 | suspend_console(); | 154 | suspend_console(); |
124 | error = device_suspend(PMSG_FREEZE); | 155 | error = device_suspend(PMSG_FREEZE); |
125 | if (error) { | 156 | if (error) { |
126 | resume_console(); | 157 | resume_console(); |
127 | printk("Some devices failed to suspend\n"); | 158 | printk("Some devices failed to suspend\n"); |
128 | unprepare_processes(); | 159 | goto Thaw; |
129 | return error; | 160 | } |
161 | |||
162 | if (pm_disk_mode == PM_DISK_TEST) { | ||
163 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
164 | mdelay(5000); | ||
165 | goto Done; | ||
130 | } | 166 | } |
131 | 167 | ||
132 | pr_debug("PM: snapshotting memory.\n"); | 168 | pr_debug("PM: snapshotting memory.\n"); |
@@ -143,16 +179,17 @@ int pm_suspend_disk(void) | |||
143 | power_down(pm_disk_mode); | 179 | power_down(pm_disk_mode); |
144 | else { | 180 | else { |
145 | swsusp_free(); | 181 | swsusp_free(); |
146 | unprepare_processes(); | 182 | goto Thaw; |
147 | return error; | ||
148 | } | 183 | } |
149 | } else | 184 | } else { |
150 | pr_debug("PM: Image restored successfully.\n"); | 185 | pr_debug("PM: Image restored successfully.\n"); |
186 | } | ||
151 | 187 | ||
152 | swsusp_free(); | 188 | swsusp_free(); |
153 | Done: | 189 | Done: |
154 | device_resume(); | 190 | device_resume(); |
155 | resume_console(); | 191 | resume_console(); |
192 | Thaw: | ||
156 | unprepare_processes(); | 193 | unprepare_processes(); |
157 | return error; | 194 | return error; |
158 | } | 195 | } |
@@ -174,10 +211,10 @@ static int software_resume(void) | |||
174 | { | 211 | { |
175 | int error; | 212 | int error; |
176 | 213 | ||
177 | down(&pm_sem); | 214 | mutex_lock(&pm_mutex); |
178 | if (!swsusp_resume_device) { | 215 | if (!swsusp_resume_device) { |
179 | if (!strlen(resume_file)) { | 216 | if (!strlen(resume_file)) { |
180 | up(&pm_sem); | 217 | mutex_unlock(&pm_mutex); |
181 | return -ENOENT; | 218 | return -ENOENT; |
182 | } | 219 | } |
183 | swsusp_resume_device = name_to_dev_t(resume_file); | 220 | swsusp_resume_device = name_to_dev_t(resume_file); |
@@ -192,7 +229,7 @@ static int software_resume(void) | |||
192 | * FIXME: If noresume is specified, we need to find the partition | 229 | * FIXME: If noresume is specified, we need to find the partition |
193 | * and reset it back to normal swap space. | 230 | * and reset it back to normal swap space. |
194 | */ | 231 | */ |
195 | up(&pm_sem); | 232 | mutex_unlock(&pm_mutex); |
196 | return 0; | 233 | return 0; |
197 | } | 234 | } |
198 | 235 | ||
@@ -236,7 +273,7 @@ static int software_resume(void) | |||
236 | unprepare_processes(); | 273 | unprepare_processes(); |
237 | Done: | 274 | Done: |
238 | /* For success case, the suspend path will release the lock */ | 275 | /* For success case, the suspend path will release the lock */ |
239 | up(&pm_sem); | 276 | mutex_unlock(&pm_mutex); |
240 | pr_debug("PM: Resume from disk failed.\n"); | 277 | pr_debug("PM: Resume from disk failed.\n"); |
241 | return 0; | 278 | return 0; |
242 | } | 279 | } |
@@ -249,6 +286,8 @@ static const char * const pm_disk_modes[] = { | |||
249 | [PM_DISK_PLATFORM] = "platform", | 286 | [PM_DISK_PLATFORM] = "platform", |
250 | [PM_DISK_SHUTDOWN] = "shutdown", | 287 | [PM_DISK_SHUTDOWN] = "shutdown", |
251 | [PM_DISK_REBOOT] = "reboot", | 288 | [PM_DISK_REBOOT] = "reboot", |
289 | [PM_DISK_TEST] = "test", | ||
290 | [PM_DISK_TESTPROC] = "testproc", | ||
252 | }; | 291 | }; |
253 | 292 | ||
254 | /** | 293 | /** |
@@ -295,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) | |||
295 | p = memchr(buf, '\n', n); | 334 | p = memchr(buf, '\n', n); |
296 | len = p ? p - buf : n; | 335 | len = p ? p - buf : n; |
297 | 336 | ||
298 | down(&pm_sem); | 337 | mutex_lock(&pm_mutex); |
299 | for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { | 338 | for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { |
300 | if (!strncmp(buf, pm_disk_modes[i], len)) { | 339 | if (!strncmp(buf, pm_disk_modes[i], len)) { |
301 | mode = i; | 340 | mode = i; |
@@ -303,21 +342,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) | |||
303 | } | 342 | } |
304 | } | 343 | } |
305 | if (mode) { | 344 | if (mode) { |
306 | if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) | 345 | if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || |
346 | mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { | ||
307 | pm_disk_mode = mode; | 347 | pm_disk_mode = mode; |
308 | else { | 348 | } else { |
309 | if (pm_ops && pm_ops->enter && | 349 | if (pm_ops && pm_ops->enter && |
310 | (mode == pm_ops->pm_disk_mode)) | 350 | (mode == pm_ops->pm_disk_mode)) |
311 | pm_disk_mode = mode; | 351 | pm_disk_mode = mode; |
312 | else | 352 | else |
313 | error = -EINVAL; | 353 | error = -EINVAL; |
314 | } | 354 | } |
315 | } else | 355 | } else { |
316 | error = -EINVAL; | 356 | error = -EINVAL; |
357 | } | ||
317 | 358 | ||
318 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", | 359 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", |
319 | pm_disk_modes[mode]); | 360 | pm_disk_modes[mode]); |
320 | up(&pm_sem); | 361 | mutex_unlock(&pm_mutex); |
321 | return error ? error : n; | 362 | return error ? error : n; |
322 | } | 363 | } |
323 | 364 | ||
@@ -342,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) | |||
342 | if (maj != MAJOR(res) || min != MINOR(res)) | 383 | if (maj != MAJOR(res) || min != MINOR(res)) |
343 | goto out; | 384 | goto out; |
344 | 385 | ||
345 | down(&pm_sem); | 386 | mutex_lock(&pm_mutex); |
346 | swsusp_resume_device = res; | 387 | swsusp_resume_device = res; |
347 | up(&pm_sem); | 388 | mutex_unlock(&pm_mutex); |
348 | printk("Attempting manual resume\n"); | 389 | printk("Attempting manual resume\n"); |
349 | noresume = 0; | 390 | noresume = 0; |
350 | software_resume(); | 391 | software_resume(); |
351 | ret = n; | 392 | ret = n; |
352 | out: | 393 | out: |
353 | return ret; | 394 | return ret; |
354 | } | 395 | } |
355 | 396 | ||
@@ -404,6 +445,19 @@ static int __init resume_setup(char *str) | |||
404 | return 1; | 445 | return 1; |
405 | } | 446 | } |
406 | 447 | ||
448 | static int __init resume_offset_setup(char *str) | ||
449 | { | ||
450 | unsigned long long offset; | ||
451 | |||
452 | if (noresume) | ||
453 | return 1; | ||
454 | |||
455 | if (sscanf(str, "%llu", &offset) == 1) | ||
456 | swsusp_resume_block = offset; | ||
457 | |||
458 | return 1; | ||
459 | } | ||
460 | |||
407 | static int __init noresume_setup(char *str) | 461 | static int __init noresume_setup(char *str) |
408 | { | 462 | { |
409 | noresume = 1; | 463 | noresume = 1; |
@@ -411,4 +465,5 @@ static int __init noresume_setup(char *str) | |||
411 | } | 465 | } |
412 | 466 | ||
413 | __setup("noresume", noresume_setup); | 467 | __setup("noresume", noresume_setup); |
468 | __setup("resume_offset=", resume_offset_setup); | ||
414 | __setup("resume=", resume_setup); | 469 | __setup("resume=", resume_setup); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71dab..500eb87f643d 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -8,6 +8,7 @@ | |||
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/module.h> | ||
11 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
12 | #include <linux/kobject.h> | 13 | #include <linux/kobject.h> |
13 | #include <linux/string.h> | 14 | #include <linux/string.h> |
@@ -18,13 +19,14 @@ | |||
18 | #include <linux/console.h> | 19 | #include <linux/console.h> |
19 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
20 | #include <linux/resume-trace.h> | 21 | #include <linux/resume-trace.h> |
22 | #include <linux/freezer.h> | ||
21 | 23 | ||
22 | #include "power.h" | 24 | #include "power.h" |
23 | 25 | ||
24 | /*This is just an arbitrary number */ | 26 | /*This is just an arbitrary number */ |
25 | #define FREE_PAGE_NUMBER (100) | 27 | #define FREE_PAGE_NUMBER (100) |
26 | 28 | ||
27 | DECLARE_MUTEX(pm_sem); | 29 | DEFINE_MUTEX(pm_mutex); |
28 | 30 | ||
29 | struct pm_ops *pm_ops; | 31 | struct pm_ops *pm_ops; |
30 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | 32 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; |
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | |||
36 | 38 | ||
37 | void pm_set_ops(struct pm_ops * ops) | 39 | void pm_set_ops(struct pm_ops * ops) |
38 | { | 40 | { |
39 | down(&pm_sem); | 41 | mutex_lock(&pm_mutex); |
40 | pm_ops = ops; | 42 | pm_ops = ops; |
41 | up(&pm_sem); | 43 | mutex_unlock(&pm_mutex); |
42 | } | 44 | } |
43 | 45 | ||
44 | 46 | ||
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state) | |||
182 | 184 | ||
183 | if (!valid_state(state)) | 185 | if (!valid_state(state)) |
184 | return -ENODEV; | 186 | return -ENODEV; |
185 | if (down_trylock(&pm_sem)) | 187 | if (!mutex_trylock(&pm_mutex)) |
186 | return -EBUSY; | 188 | return -EBUSY; |
187 | 189 | ||
188 | if (state == PM_SUSPEND_DISK) { | 190 | if (state == PM_SUSPEND_DISK) { |
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state) | |||
200 | pr_debug("PM: Finishing wakeup.\n"); | 202 | pr_debug("PM: Finishing wakeup.\n"); |
201 | suspend_finish(state); | 203 | suspend_finish(state); |
202 | Unlock: | 204 | Unlock: |
203 | up(&pm_sem); | 205 | mutex_unlock(&pm_mutex); |
204 | return error; | 206 | return error; |
205 | } | 207 | } |
206 | 208 | ||
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state) | |||
229 | return -EINVAL; | 231 | return -EINVAL; |
230 | } | 232 | } |
231 | 233 | ||
232 | 234 | EXPORT_SYMBOL(pm_suspend); | |
233 | 235 | ||
234 | decl_subsys(power,NULL,NULL); | 236 | decl_subsys(power,NULL,NULL); |
235 | 237 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index bfe999f7b272..eb461b816bf4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void) | |||
22 | return -EPERM; | 22 | return -EPERM; |
23 | } | 23 | } |
24 | #endif | 24 | #endif |
25 | extern struct semaphore pm_sem; | 25 | |
26 | extern struct mutex pm_mutex; | ||
27 | |||
26 | #define power_attr(_name) \ | 28 | #define power_attr(_name) \ |
27 | static struct subsys_attribute _name##_attr = { \ | 29 | static struct subsys_attribute _name##_attr = { \ |
28 | .attr = { \ | 30 | .attr = { \ |
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end; | |||
42 | extern unsigned long image_size; | 44 | extern unsigned long image_size; |
43 | extern int in_suspend; | 45 | extern int in_suspend; |
44 | extern dev_t swsusp_resume_device; | 46 | extern dev_t swsusp_resume_device; |
47 | extern sector_t swsusp_resume_block; | ||
45 | 48 | ||
46 | extern asmlinkage int swsusp_arch_suspend(void); | 49 | extern asmlinkage int swsusp_arch_suspend(void); |
47 | extern asmlinkage int swsusp_arch_resume(void); | 50 | extern asmlinkage int swsusp_arch_resume(void); |
@@ -102,8 +105,18 @@ struct snapshot_handle { | |||
102 | extern unsigned int snapshot_additional_pages(struct zone *zone); | 105 | extern unsigned int snapshot_additional_pages(struct zone *zone); |
103 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | 106 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); |
104 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | 107 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); |
108 | extern void snapshot_write_finalize(struct snapshot_handle *handle); | ||
105 | extern int snapshot_image_loaded(struct snapshot_handle *handle); | 109 | extern int snapshot_image_loaded(struct snapshot_handle *handle); |
106 | extern void snapshot_free_unused_memory(struct snapshot_handle *handle); | 110 | |
111 | /* | ||
112 | * This structure is used to pass the values needed for the identification | ||
113 | * of the resume swap area from a user space to the kernel via the | ||
114 | * SNAPSHOT_SET_SWAP_AREA ioctl | ||
115 | */ | ||
116 | struct resume_swap_area { | ||
117 | loff_t offset; | ||
118 | u_int32_t dev; | ||
119 | } __attribute__((packed)); | ||
107 | 120 | ||
108 | #define SNAPSHOT_IOC_MAGIC '3' | 121 | #define SNAPSHOT_IOC_MAGIC '3' |
109 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | 122 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) |
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle); | |||
117 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) | 130 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) |
118 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | 131 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) |
119 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) | 132 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) |
120 | #define SNAPSHOT_IOC_MAXNR 11 | 133 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) |
134 | #define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ | ||
135 | struct resume_swap_area) | ||
136 | #define SNAPSHOT_IOC_MAXNR 13 | ||
137 | |||
138 | #define PMOPS_PREPARE 1 | ||
139 | #define PMOPS_ENTER 2 | ||
140 | #define PMOPS_FINISH 3 | ||
121 | 141 | ||
122 | /** | 142 | /** |
123 | * The bitmap is used for tracing allocated swap pages | 143 | * The bitmap is used for tracing allocated swap pages |
@@ -141,7 +161,7 @@ struct bitmap_page { | |||
141 | 161 | ||
142 | extern void free_bitmap(struct bitmap_page *bitmap); | 162 | extern void free_bitmap(struct bitmap_page *bitmap); |
143 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); | 163 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); |
144 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); | 164 | extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); |
145 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); | 165 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); |
146 | 166 | ||
147 | extern int swsusp_check(void); | 167 | extern int swsusp_check(void); |
@@ -153,3 +173,7 @@ extern int swsusp_read(void); | |||
153 | extern int swsusp_write(void); | 173 | extern int swsusp_write(void); |
154 | extern void swsusp_close(void); | 174 | extern void swsusp_close(void); |
155 | extern int suspend_enter(suspend_state_t state); | 175 | extern int suspend_enter(suspend_state_t state); |
176 | |||
177 | struct timeval; | ||
178 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | ||
179 | unsigned int, char *); | ||
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index f1f900ac3164..678ec736076b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -16,12 +16,12 @@ | |||
16 | * callback we use. | 16 | * callback we use. |
17 | */ | 17 | */ |
18 | 18 | ||
19 | static void do_poweroff(void *dummy) | 19 | static void do_poweroff(struct work_struct *dummy) |
20 | { | 20 | { |
21 | kernel_power_off(); | 21 | kernel_power_off(); |
22 | } | 22 | } |
23 | 23 | ||
24 | static DECLARE_WORK(poweroff_work, do_poweroff, NULL); | 24 | static DECLARE_WORK(poweroff_work, do_poweroff); |
25 | 25 | ||
26 | static void handle_poweroff(int key, struct tty_struct *tty) | 26 | static void handle_poweroff(int key, struct tty_struct *tty) |
27 | { | 27 | { |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e6..99eeb119b06d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -13,12 +13,15 @@ | |||
13 | #include <linux/suspend.h> | 13 | #include <linux/suspend.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/freezer.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * Timeout for stopping processes | 19 | * Timeout for stopping processes |
19 | */ | 20 | */ |
20 | #define TIMEOUT (20 * HZ) | 21 | #define TIMEOUT (20 * HZ) |
21 | 22 | ||
23 | #define FREEZER_KERNEL_THREADS 0 | ||
24 | #define FREEZER_USER_SPACE 1 | ||
22 | 25 | ||
23 | static inline int freezeable(struct task_struct * p) | 26 | static inline int freezeable(struct task_struct * p) |
24 | { | 27 | { |
@@ -39,7 +42,6 @@ void refrigerator(void) | |||
39 | long save; | 42 | long save; |
40 | save = current->state; | 43 | save = current->state; |
41 | pr_debug("%s entered refrigerator\n", current->comm); | 44 | pr_debug("%s entered refrigerator\n", current->comm); |
42 | printk("="); | ||
43 | 45 | ||
44 | frozen_process(current); | 46 | frozen_process(current); |
45 | spin_lock_irq(&current->sighand->siglock); | 47 | spin_lock_irq(&current->sighand->siglock); |
@@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p) | |||
79 | } | 81 | } |
80 | } | 82 | } |
81 | 83 | ||
82 | /* 0 = success, else # of processes that we failed to stop */ | 84 | static inline int is_user_space(struct task_struct *p) |
83 | int freeze_processes(void) | 85 | { |
86 | return p->mm && !(p->flags & PF_BORROWED_MM); | ||
87 | } | ||
88 | |||
89 | static unsigned int try_to_freeze_tasks(int freeze_user_space) | ||
84 | { | 90 | { |
85 | int todo, nr_user, user_frozen; | ||
86 | unsigned long start_time; | ||
87 | struct task_struct *g, *p; | 91 | struct task_struct *g, *p; |
92 | unsigned long end_time; | ||
93 | unsigned int todo; | ||
88 | 94 | ||
89 | printk( "Stopping tasks: " ); | 95 | end_time = jiffies + TIMEOUT; |
90 | start_time = jiffies; | ||
91 | user_frozen = 0; | ||
92 | do { | 96 | do { |
93 | nr_user = todo = 0; | 97 | todo = 0; |
94 | read_lock(&tasklist_lock); | 98 | read_lock(&tasklist_lock); |
95 | do_each_thread(g, p) { | 99 | do_each_thread(g, p) { |
96 | if (!freezeable(p)) | 100 | if (!freezeable(p)) |
97 | continue; | 101 | continue; |
102 | |||
98 | if (frozen(p)) | 103 | if (frozen(p)) |
99 | continue; | 104 | continue; |
100 | if (p->state == TASK_TRACED && frozen(p->parent)) { | 105 | |
106 | if (p->state == TASK_TRACED && | ||
107 | (frozen(p->parent) || | ||
108 | p->parent->state == TASK_STOPPED)) { | ||
101 | cancel_freezing(p); | 109 | cancel_freezing(p); |
102 | continue; | 110 | continue; |
103 | } | 111 | } |
104 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { | 112 | if (is_user_space(p)) { |
105 | /* The task is a user-space one. | 113 | if (!freeze_user_space) |
106 | * Freeze it unless there's a vfork completion | 114 | continue; |
107 | * pending | 115 | |
116 | /* Freeze the task unless there is a vfork | ||
117 | * completion pending | ||
108 | */ | 118 | */ |
109 | if (!p->vfork_done) | 119 | if (!p->vfork_done) |
110 | freeze_process(p); | 120 | freeze_process(p); |
111 | nr_user++; | ||
112 | } else { | 121 | } else { |
113 | /* Freeze only if the user space is frozen */ | 122 | if (freeze_user_space) |
114 | if (user_frozen) | 123 | continue; |
115 | freeze_process(p); | 124 | |
116 | todo++; | 125 | freeze_process(p); |
117 | } | 126 | } |
127 | todo++; | ||
118 | } while_each_thread(g, p); | 128 | } while_each_thread(g, p); |
119 | read_unlock(&tasklist_lock); | 129 | read_unlock(&tasklist_lock); |
120 | todo += nr_user; | ||
121 | if (!user_frozen && !nr_user) { | ||
122 | sys_sync(); | ||
123 | start_time = jiffies; | ||
124 | } | ||
125 | user_frozen = !nr_user; | ||
126 | yield(); /* Yield is okay here */ | 130 | yield(); /* Yield is okay here */ |
127 | if (todo && time_after(jiffies, start_time + TIMEOUT)) | 131 | if (todo && time_after(jiffies, end_time)) |
128 | break; | 132 | break; |
129 | } while(todo); | 133 | } while (todo); |
130 | 134 | ||
131 | /* This does not unfreeze processes that are already frozen | ||
132 | * (we have slightly ugly calling convention in that respect, | ||
133 | * and caller must call thaw_processes() if something fails), | ||
134 | * but it cleans up leftover PF_FREEZE requests. | ||
135 | */ | ||
136 | if (todo) { | 135 | if (todo) { |
137 | printk( "\n" ); | 136 | /* This does not unfreeze processes that are already frozen |
138 | printk(KERN_ERR " stopping tasks timed out " | 137 | * (we have slightly ugly calling convention in that respect, |
139 | "after %d seconds (%d tasks remaining):\n", | 138 | * and caller must call thaw_processes() if something fails), |
140 | TIMEOUT / HZ, todo); | 139 | * but it cleans up leftover PF_FREEZE requests. |
140 | */ | ||
141 | printk("\n"); | ||
142 | printk(KERN_ERR "Stopping %s timed out after %d seconds " | ||
143 | "(%d tasks refusing to freeze):\n", | ||
144 | freeze_user_space ? "user space processes" : | ||
145 | "kernel threads", | ||
146 | TIMEOUT / HZ, todo); | ||
141 | read_lock(&tasklist_lock); | 147 | read_lock(&tasklist_lock); |
142 | do_each_thread(g, p) { | 148 | do_each_thread(g, p) { |
149 | if (is_user_space(p) == !freeze_user_space) | ||
150 | continue; | ||
151 | |||
143 | if (freezeable(p) && !frozen(p)) | 152 | if (freezeable(p) && !frozen(p)) |
144 | printk(KERN_ERR " %s\n", p->comm); | 153 | printk(KERN_ERR " %s\n", p->comm); |
154 | |||
145 | cancel_freezing(p); | 155 | cancel_freezing(p); |
146 | } while_each_thread(g, p); | 156 | } while_each_thread(g, p); |
147 | read_unlock(&tasklist_lock); | 157 | read_unlock(&tasklist_lock); |
148 | return todo; | ||
149 | } | 158 | } |
150 | 159 | ||
151 | printk( "|\n" ); | 160 | return todo; |
161 | } | ||
162 | |||
163 | /** | ||
164 | * freeze_processes - tell processes to enter the refrigerator | ||
165 | * | ||
166 | * Returns 0 on success, or the number of processes that didn't freeze, | ||
167 | * although they were told to. | ||
168 | */ | ||
169 | int freeze_processes(void) | ||
170 | { | ||
171 | unsigned int nr_unfrozen; | ||
172 | |||
173 | printk("Stopping tasks ... "); | ||
174 | nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); | ||
175 | if (nr_unfrozen) | ||
176 | return nr_unfrozen; | ||
177 | |||
178 | sys_sync(); | ||
179 | nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); | ||
180 | if (nr_unfrozen) | ||
181 | return nr_unfrozen; | ||
182 | |||
183 | printk("done.\n"); | ||
152 | BUG_ON(in_atomic()); | 184 | BUG_ON(in_atomic()); |
153 | return 0; | 185 | return 0; |
154 | } | 186 | } |
155 | 187 | ||
156 | void thaw_processes(void) | 188 | static void thaw_tasks(int thaw_user_space) |
157 | { | 189 | { |
158 | struct task_struct *g, *p; | 190 | struct task_struct *g, *p; |
159 | 191 | ||
160 | printk( "Restarting tasks..." ); | ||
161 | read_lock(&tasklist_lock); | 192 | read_lock(&tasklist_lock); |
162 | do_each_thread(g, p) { | 193 | do_each_thread(g, p) { |
163 | if (!freezeable(p)) | 194 | if (!freezeable(p)) |
164 | continue; | 195 | continue; |
196 | |||
197 | if (is_user_space(p) == !thaw_user_space) | ||
198 | continue; | ||
199 | |||
165 | if (!thaw_process(p)) | 200 | if (!thaw_process(p)) |
166 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); | 201 | printk(KERN_WARNING " Strange, %s not stopped\n", |
202 | p->comm ); | ||
167 | } while_each_thread(g, p); | 203 | } while_each_thread(g, p); |
168 | |||
169 | read_unlock(&tasklist_lock); | 204 | read_unlock(&tasklist_lock); |
205 | } | ||
206 | |||
207 | void thaw_processes(void) | ||
208 | { | ||
209 | printk("Restarting tasks ... "); | ||
210 | thaw_tasks(FREEZER_KERNEL_THREADS); | ||
211 | thaw_tasks(FREEZER_USER_SPACE); | ||
170 | schedule(); | 212 | schedule(); |
171 | printk( " done\n" ); | 213 | printk("done.\n"); |
172 | } | 214 | } |
173 | 215 | ||
174 | EXPORT_SYMBOL(refrigerator); | 216 | EXPORT_SYMBOL(refrigerator); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 99f9b7d177d6..c024606221c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1,15 +1,15 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/power/snapshot.c | 2 | * linux/kernel/power/snapshot.c |
3 | * | 3 | * |
4 | * This file provide system snapshot/restore functionality. | 4 | * This file provides system snapshot/restore functionality for swsusp. |
5 | * | 5 | * |
6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> |
7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | 8 | * |
8 | * This file is released under the GPLv2, and is based on swsusp.c. | 9 | * This file is released under the GPLv2. |
9 | * | 10 | * |
10 | */ | 11 | */ |
11 | 12 | ||
12 | |||
13 | #include <linux/version.h> | 13 | #include <linux/version.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
@@ -34,137 +34,24 @@ | |||
34 | 34 | ||
35 | #include "power.h" | 35 | #include "power.h" |
36 | 36 | ||
37 | /* List of PBEs used for creating and restoring the suspend image */ | 37 | /* List of PBEs needed for restoring the pages that were allocated before |
38 | * the suspend and included in the suspend image, but have also been | ||
39 | * allocated by the "resume" kernel, so their contents cannot be written | ||
40 | * directly to their "original" page frames. | ||
41 | */ | ||
38 | struct pbe *restore_pblist; | 42 | struct pbe *restore_pblist; |
39 | 43 | ||
40 | static unsigned int nr_copy_pages; | 44 | /* Pointer to an auxiliary buffer (1 page) */ |
41 | static unsigned int nr_meta_pages; | ||
42 | static void *buffer; | 45 | static void *buffer; |
43 | 46 | ||
44 | #ifdef CONFIG_HIGHMEM | ||
45 | unsigned int count_highmem_pages(void) | ||
46 | { | ||
47 | struct zone *zone; | ||
48 | unsigned long zone_pfn; | ||
49 | unsigned int n = 0; | ||
50 | |||
51 | for_each_zone (zone) | ||
52 | if (is_highmem(zone)) { | ||
53 | mark_free_pages(zone); | ||
54 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { | ||
55 | struct page *page; | ||
56 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
57 | if (!pfn_valid(pfn)) | ||
58 | continue; | ||
59 | page = pfn_to_page(pfn); | ||
60 | if (PageReserved(page)) | ||
61 | continue; | ||
62 | if (PageNosaveFree(page)) | ||
63 | continue; | ||
64 | n++; | ||
65 | } | ||
66 | } | ||
67 | return n; | ||
68 | } | ||
69 | |||
70 | struct highmem_page { | ||
71 | char *data; | ||
72 | struct page *page; | ||
73 | struct highmem_page *next; | ||
74 | }; | ||
75 | |||
76 | static struct highmem_page *highmem_copy; | ||
77 | |||
78 | static int save_highmem_zone(struct zone *zone) | ||
79 | { | ||
80 | unsigned long zone_pfn; | ||
81 | mark_free_pages(zone); | ||
82 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
83 | struct page *page; | ||
84 | struct highmem_page *save; | ||
85 | void *kaddr; | ||
86 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
87 | |||
88 | if (!(pfn%10000)) | ||
89 | printk("."); | ||
90 | if (!pfn_valid(pfn)) | ||
91 | continue; | ||
92 | page = pfn_to_page(pfn); | ||
93 | /* | ||
94 | * This condition results from rvmalloc() sans vmalloc_32() | ||
95 | * and architectural memory reservations. This should be | ||
96 | * corrected eventually when the cases giving rise to this | ||
97 | * are better understood. | ||
98 | */ | ||
99 | if (PageReserved(page)) | ||
100 | continue; | ||
101 | BUG_ON(PageNosave(page)); | ||
102 | if (PageNosaveFree(page)) | ||
103 | continue; | ||
104 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
105 | if (!save) | ||
106 | return -ENOMEM; | ||
107 | save->next = highmem_copy; | ||
108 | save->page = page; | ||
109 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
110 | if (!save->data) { | ||
111 | kfree(save); | ||
112 | return -ENOMEM; | ||
113 | } | ||
114 | kaddr = kmap_atomic(page, KM_USER0); | ||
115 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
116 | kunmap_atomic(kaddr, KM_USER0); | ||
117 | highmem_copy = save; | ||
118 | } | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | int save_highmem(void) | ||
123 | { | ||
124 | struct zone *zone; | ||
125 | int res = 0; | ||
126 | |||
127 | pr_debug("swsusp: Saving Highmem"); | ||
128 | drain_local_pages(); | ||
129 | for_each_zone (zone) { | ||
130 | if (is_highmem(zone)) | ||
131 | res = save_highmem_zone(zone); | ||
132 | if (res) | ||
133 | return res; | ||
134 | } | ||
135 | printk("\n"); | ||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | int restore_highmem(void) | ||
140 | { | ||
141 | printk("swsusp: Restoring Highmem\n"); | ||
142 | while (highmem_copy) { | ||
143 | struct highmem_page *save = highmem_copy; | ||
144 | void *kaddr; | ||
145 | highmem_copy = save->next; | ||
146 | |||
147 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
148 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
149 | kunmap_atomic(kaddr, KM_USER0); | ||
150 | free_page((long) save->data); | ||
151 | kfree(save); | ||
152 | } | ||
153 | return 0; | ||
154 | } | ||
155 | #else | ||
156 | static inline unsigned int count_highmem_pages(void) {return 0;} | ||
157 | static inline int save_highmem(void) {return 0;} | ||
158 | static inline int restore_highmem(void) {return 0;} | ||
159 | #endif | ||
160 | |||
161 | /** | 47 | /** |
162 | * @safe_needed - on resume, for storing the PBE list and the image, | 48 | * @safe_needed - on resume, for storing the PBE list and the image, |
163 | * we can only use memory pages that do not conflict with the pages | 49 | * we can only use memory pages that do not conflict with the pages |
164 | * used before suspend. | 50 | * used before suspend. The unsafe pages have PageNosaveFree set |
51 | * and we count them using unsafe_pages. | ||
165 | * | 52 | * |
166 | * The unsafe pages are marked with the PG_nosave_free flag | 53 | * Each allocated image page is marked as PageNosave and PageNosaveFree |
167 | * and we count them using unsafe_pages | 54 | * so that swsusp_free() can release it. |
168 | */ | 55 | */ |
169 | 56 | ||
170 | #define PG_ANY 0 | 57 | #define PG_ANY 0 |
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;} | |||
174 | 61 | ||
175 | static unsigned int allocated_unsafe_pages; | 62 | static unsigned int allocated_unsafe_pages; |
176 | 63 | ||
177 | static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | 64 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) |
178 | { | 65 | { |
179 | void *res; | 66 | void *res; |
180 | 67 | ||
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | |||
195 | 82 | ||
196 | unsigned long get_safe_page(gfp_t gfp_mask) | 83 | unsigned long get_safe_page(gfp_t gfp_mask) |
197 | { | 84 | { |
198 | return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); | 85 | return (unsigned long)get_image_page(gfp_mask, PG_SAFE); |
86 | } | ||
87 | |||
88 | static struct page *alloc_image_page(gfp_t gfp_mask) | ||
89 | { | ||
90 | struct page *page; | ||
91 | |||
92 | page = alloc_page(gfp_mask); | ||
93 | if (page) { | ||
94 | SetPageNosave(page); | ||
95 | SetPageNosaveFree(page); | ||
96 | } | ||
97 | return page; | ||
199 | } | 98 | } |
200 | 99 | ||
201 | /** | 100 | /** |
202 | * free_image_page - free page represented by @addr, allocated with | 101 | * free_image_page - free page represented by @addr, allocated with |
203 | * alloc_image_page (page flags set by it must be cleared) | 102 | * get_image_page (page flags set by it must be cleared) |
204 | */ | 103 | */ |
205 | 104 | ||
206 | static inline void free_image_page(void *addr, int clear_nosave_free) | 105 | static inline void free_image_page(void *addr, int clear_nosave_free) |
207 | { | 106 | { |
208 | ClearPageNosave(virt_to_page(addr)); | 107 | struct page *page; |
108 | |||
109 | BUG_ON(!virt_addr_valid(addr)); | ||
110 | |||
111 | page = virt_to_page(addr); | ||
112 | |||
113 | ClearPageNosave(page); | ||
209 | if (clear_nosave_free) | 114 | if (clear_nosave_free) |
210 | ClearPageNosaveFree(virt_to_page(addr)); | 115 | ClearPageNosaveFree(page); |
211 | free_page((unsigned long)addr); | 116 | |
117 | __free_page(page); | ||
212 | } | 118 | } |
213 | 119 | ||
214 | /* struct linked_page is used to build chains of pages */ | 120 | /* struct linked_page is used to build chains of pages */ |
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
269 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { | 175 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { |
270 | struct linked_page *lp; | 176 | struct linked_page *lp; |
271 | 177 | ||
272 | lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); | 178 | lp = get_image_page(ca->gfp_mask, ca->safe_needed); |
273 | if (!lp) | 179 | if (!lp) |
274 | return NULL; | 180 | return NULL; |
275 | 181 | ||
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
446 | 352 | ||
447 | /* Compute the number of zones */ | 353 | /* Compute the number of zones */ |
448 | nr = 0; | 354 | nr = 0; |
449 | for_each_zone (zone) | 355 | for_each_zone(zone) |
450 | if (populated_zone(zone) && !is_highmem(zone)) | 356 | if (populated_zone(zone)) |
451 | nr++; | 357 | nr++; |
452 | 358 | ||
453 | /* Allocate the list of zones bitmap objects */ | 359 | /* Allocate the list of zones bitmap objects */ |
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
459 | } | 365 | } |
460 | 366 | ||
461 | /* Initialize the zone bitmap objects */ | 367 | /* Initialize the zone bitmap objects */ |
462 | for_each_zone (zone) { | 368 | for_each_zone(zone) { |
463 | unsigned long pfn; | 369 | unsigned long pfn; |
464 | 370 | ||
465 | if (!populated_zone(zone) || is_highmem(zone)) | 371 | if (!populated_zone(zone)) |
466 | continue; | 372 | continue; |
467 | 373 | ||
468 | zone_bm->start_pfn = zone->zone_start_pfn; | 374 | zone_bm->start_pfn = zone->zone_start_pfn; |
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
481 | while (bb) { | 387 | while (bb) { |
482 | unsigned long *ptr; | 388 | unsigned long *ptr; |
483 | 389 | ||
484 | ptr = alloc_image_page(gfp_mask, safe_needed); | 390 | ptr = get_image_page(gfp_mask, safe_needed); |
485 | bb->data = ptr; | 391 | bb->data = ptr; |
486 | if (!ptr) | 392 | if (!ptr) |
487 | goto Free; | 393 | goto Free; |
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
505 | memory_bm_position_reset(bm); | 411 | memory_bm_position_reset(bm); |
506 | return 0; | 412 | return 0; |
507 | 413 | ||
508 | Free: | 414 | Free: |
509 | bm->p_list = ca.chain; | 415 | bm->p_list = ca.chain; |
510 | memory_bm_free(bm, PG_UNSAFE_CLEAR); | 416 | memory_bm_free(bm, PG_UNSAFE_CLEAR); |
511 | return -ENOMEM; | 417 | return -ENOMEM; |
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | |||
651 | memory_bm_position_reset(bm); | 557 | memory_bm_position_reset(bm); |
652 | return BM_END_OF_MAP; | 558 | return BM_END_OF_MAP; |
653 | 559 | ||
654 | Return_pfn: | 560 | Return_pfn: |
655 | bm->cur.chunk = chunk; | 561 | bm->cur.chunk = chunk; |
656 | bm->cur.bit = bit; | 562 | bm->cur.bit = bit; |
657 | return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; | 563 | return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; |
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone) | |||
669 | 575 | ||
670 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | 576 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); |
671 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); | 577 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); |
672 | return res; | 578 | return 2 * res; |
579 | } | ||
580 | |||
581 | #ifdef CONFIG_HIGHMEM | ||
582 | /** | ||
583 | * count_free_highmem_pages - compute the total number of free highmem | ||
584 | * pages, system-wide. | ||
585 | */ | ||
586 | |||
587 | static unsigned int count_free_highmem_pages(void) | ||
588 | { | ||
589 | struct zone *zone; | ||
590 | unsigned int cnt = 0; | ||
591 | |||
592 | for_each_zone(zone) | ||
593 | if (populated_zone(zone) && is_highmem(zone)) | ||
594 | cnt += zone->free_pages; | ||
595 | |||
596 | return cnt; | ||
597 | } | ||
598 | |||
599 | /** | ||
600 | * saveable_highmem_page - Determine whether a highmem page should be | ||
601 | * included in the suspend image. | ||
602 | * | ||
603 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | ||
604 | * and it isn't a part of a free chunk of pages. | ||
605 | */ | ||
606 | |||
607 | static struct page *saveable_highmem_page(unsigned long pfn) | ||
608 | { | ||
609 | struct page *page; | ||
610 | |||
611 | if (!pfn_valid(pfn)) | ||
612 | return NULL; | ||
613 | |||
614 | page = pfn_to_page(pfn); | ||
615 | |||
616 | BUG_ON(!PageHighMem(page)); | ||
617 | |||
618 | if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) | ||
619 | return NULL; | ||
620 | |||
621 | return page; | ||
673 | } | 622 | } |
674 | 623 | ||
675 | /** | 624 | /** |
625 | * count_highmem_pages - compute the total number of saveable highmem | ||
626 | * pages. | ||
627 | */ | ||
628 | |||
629 | unsigned int count_highmem_pages(void) | ||
630 | { | ||
631 | struct zone *zone; | ||
632 | unsigned int n = 0; | ||
633 | |||
634 | for_each_zone(zone) { | ||
635 | unsigned long pfn, max_zone_pfn; | ||
636 | |||
637 | if (!is_highmem(zone)) | ||
638 | continue; | ||
639 | |||
640 | mark_free_pages(zone); | ||
641 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
642 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | ||
643 | if (saveable_highmem_page(pfn)) | ||
644 | n++; | ||
645 | } | ||
646 | return n; | ||
647 | } | ||
648 | #else | ||
649 | static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } | ||
650 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
651 | #endif /* CONFIG_HIGHMEM */ | ||
652 | |||
653 | /** | ||
676 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | 654 | * pfn_is_nosave - check if given pfn is in the 'nosave' section |
677 | */ | 655 | */ |
678 | 656 | ||
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn) | |||
684 | } | 662 | } |
685 | 663 | ||
686 | /** | 664 | /** |
687 | * saveable - Determine whether a page should be cloned or not. | 665 | * saveable - Determine whether a non-highmem page should be included in |
688 | * @pfn: The page | 666 | * the suspend image. |
689 | * | 667 | * |
690 | * We save a page if it isn't Nosave, and is not in the range of pages | 668 | * We should save the page if it isn't Nosave, and is not in the range |
691 | * statically defined as 'unsaveable', and it | 669 | * of pages statically defined as 'unsaveable', and it isn't a part of |
692 | * isn't a part of a free chunk of pages. | 670 | * a free chunk of pages. |
693 | */ | 671 | */ |
694 | 672 | ||
695 | static struct page *saveable_page(unsigned long pfn) | 673 | static struct page *saveable_page(unsigned long pfn) |
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn) | |||
701 | 679 | ||
702 | page = pfn_to_page(pfn); | 680 | page = pfn_to_page(pfn); |
703 | 681 | ||
704 | if (PageNosave(page)) | 682 | BUG_ON(PageHighMem(page)); |
683 | |||
684 | if (PageNosave(page) || PageNosaveFree(page)) | ||
705 | return NULL; | 685 | return NULL; |
686 | |||
706 | if (PageReserved(page) && pfn_is_nosave(pfn)) | 687 | if (PageReserved(page) && pfn_is_nosave(pfn)) |
707 | return NULL; | 688 | return NULL; |
708 | if (PageNosaveFree(page)) | ||
709 | return NULL; | ||
710 | 689 | ||
711 | return page; | 690 | return page; |
712 | } | 691 | } |
713 | 692 | ||
693 | /** | ||
694 | * count_data_pages - compute the total number of saveable non-highmem | ||
695 | * pages. | ||
696 | */ | ||
697 | |||
714 | unsigned int count_data_pages(void) | 698 | unsigned int count_data_pages(void) |
715 | { | 699 | { |
716 | struct zone *zone; | 700 | struct zone *zone; |
717 | unsigned long pfn, max_zone_pfn; | 701 | unsigned long pfn, max_zone_pfn; |
718 | unsigned int n = 0; | 702 | unsigned int n = 0; |
719 | 703 | ||
720 | for_each_zone (zone) { | 704 | for_each_zone(zone) { |
721 | if (is_highmem(zone)) | 705 | if (is_highmem(zone)) |
722 | continue; | 706 | continue; |
707 | |||
723 | mark_free_pages(zone); | 708 | mark_free_pages(zone); |
724 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 709 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
725 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 710 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
726 | n += !!saveable_page(pfn); | 711 | if(saveable_page(pfn)) |
712 | n++; | ||
727 | } | 713 | } |
728 | return n; | 714 | return n; |
729 | } | 715 | } |
730 | 716 | ||
731 | static inline void copy_data_page(long *dst, long *src) | 717 | /* This is needed, because copy_page and memcpy are not usable for copying |
718 | * task structs. | ||
719 | */ | ||
720 | static inline void do_copy_page(long *dst, long *src) | ||
732 | { | 721 | { |
733 | int n; | 722 | int n; |
734 | 723 | ||
735 | /* copy_page and memcpy are not usable for copying task structs. */ | ||
736 | for (n = PAGE_SIZE / sizeof(long); n; n--) | 724 | for (n = PAGE_SIZE / sizeof(long); n; n--) |
737 | *dst++ = *src++; | 725 | *dst++ = *src++; |
738 | } | 726 | } |
739 | 727 | ||
728 | #ifdef CONFIG_HIGHMEM | ||
729 | static inline struct page * | ||
730 | page_is_saveable(struct zone *zone, unsigned long pfn) | ||
731 | { | ||
732 | return is_highmem(zone) ? | ||
733 | saveable_highmem_page(pfn) : saveable_page(pfn); | ||
734 | } | ||
735 | |||
736 | static inline void | ||
737 | copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | ||
738 | { | ||
739 | struct page *s_page, *d_page; | ||
740 | void *src, *dst; | ||
741 | |||
742 | s_page = pfn_to_page(src_pfn); | ||
743 | d_page = pfn_to_page(dst_pfn); | ||
744 | if (PageHighMem(s_page)) { | ||
745 | src = kmap_atomic(s_page, KM_USER0); | ||
746 | dst = kmap_atomic(d_page, KM_USER1); | ||
747 | do_copy_page(dst, src); | ||
748 | kunmap_atomic(src, KM_USER0); | ||
749 | kunmap_atomic(dst, KM_USER1); | ||
750 | } else { | ||
751 | src = page_address(s_page); | ||
752 | if (PageHighMem(d_page)) { | ||
753 | /* Page pointed to by src may contain some kernel | ||
754 | * data modified by kmap_atomic() | ||
755 | */ | ||
756 | do_copy_page(buffer, src); | ||
757 | dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); | ||
758 | memcpy(dst, buffer, PAGE_SIZE); | ||
759 | kunmap_atomic(dst, KM_USER0); | ||
760 | } else { | ||
761 | dst = page_address(d_page); | ||
762 | do_copy_page(dst, src); | ||
763 | } | ||
764 | } | ||
765 | } | ||
766 | #else | ||
767 | #define page_is_saveable(zone, pfn) saveable_page(pfn) | ||
768 | |||
769 | static inline void | ||
770 | copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | ||
771 | { | ||
772 | do_copy_page(page_address(pfn_to_page(dst_pfn)), | ||
773 | page_address(pfn_to_page(src_pfn))); | ||
774 | } | ||
775 | #endif /* CONFIG_HIGHMEM */ | ||
776 | |||
740 | static void | 777 | static void |
741 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | 778 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) |
742 | { | 779 | { |
743 | struct zone *zone; | 780 | struct zone *zone; |
744 | unsigned long pfn; | 781 | unsigned long pfn; |
745 | 782 | ||
746 | for_each_zone (zone) { | 783 | for_each_zone(zone) { |
747 | unsigned long max_zone_pfn; | 784 | unsigned long max_zone_pfn; |
748 | 785 | ||
749 | if (is_highmem(zone)) | ||
750 | continue; | ||
751 | |||
752 | mark_free_pages(zone); | 786 | mark_free_pages(zone); |
753 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 787 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
754 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 788 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
755 | if (saveable_page(pfn)) | 789 | if (page_is_saveable(zone, pfn)) |
756 | memory_bm_set_bit(orig_bm, pfn); | 790 | memory_bm_set_bit(orig_bm, pfn); |
757 | } | 791 | } |
758 | memory_bm_position_reset(orig_bm); | 792 | memory_bm_position_reset(orig_bm); |
759 | memory_bm_position_reset(copy_bm); | 793 | memory_bm_position_reset(copy_bm); |
760 | do { | 794 | do { |
761 | pfn = memory_bm_next_pfn(orig_bm); | 795 | pfn = memory_bm_next_pfn(orig_bm); |
762 | if (likely(pfn != BM_END_OF_MAP)) { | 796 | if (likely(pfn != BM_END_OF_MAP)) |
763 | struct page *page; | 797 | copy_data_page(memory_bm_next_pfn(copy_bm), pfn); |
764 | void *src; | ||
765 | |||
766 | page = pfn_to_page(pfn); | ||
767 | src = page_address(page); | ||
768 | page = pfn_to_page(memory_bm_next_pfn(copy_bm)); | ||
769 | copy_data_page(page_address(page), src); | ||
770 | } | ||
771 | } while (pfn != BM_END_OF_MAP); | 798 | } while (pfn != BM_END_OF_MAP); |
772 | } | 799 | } |
773 | 800 | ||
801 | /* Total number of image pages */ | ||
802 | static unsigned int nr_copy_pages; | ||
803 | /* Number of pages needed for saving the original pfns of the image pages */ | ||
804 | static unsigned int nr_meta_pages; | ||
805 | |||
774 | /** | 806 | /** |
775 | * swsusp_free - free pages allocated for the suspend. | 807 | * swsusp_free - free pages allocated for the suspend. |
776 | * | 808 | * |
@@ -792,7 +824,7 @@ void swsusp_free(void) | |||
792 | if (PageNosave(page) && PageNosaveFree(page)) { | 824 | if (PageNosave(page) && PageNosaveFree(page)) { |
793 | ClearPageNosave(page); | 825 | ClearPageNosave(page); |
794 | ClearPageNosaveFree(page); | 826 | ClearPageNosaveFree(page); |
795 | free_page((long) page_address(page)); | 827 | __free_page(page); |
796 | } | 828 | } |
797 | } | 829 | } |
798 | } | 830 | } |
@@ -802,34 +834,108 @@ void swsusp_free(void) | |||
802 | buffer = NULL; | 834 | buffer = NULL; |
803 | } | 835 | } |
804 | 836 | ||
837 | #ifdef CONFIG_HIGHMEM | ||
838 | /** | ||
839 | * count_pages_for_highmem - compute the number of non-highmem pages | ||
840 | * that will be necessary for creating copies of highmem pages. | ||
841 | */ | ||
842 | |||
843 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | ||
844 | { | ||
845 | unsigned int free_highmem = count_free_highmem_pages(); | ||
846 | |||
847 | if (free_highmem >= nr_highmem) | ||
848 | nr_highmem = 0; | ||
849 | else | ||
850 | nr_highmem -= free_highmem; | ||
851 | |||
852 | return nr_highmem; | ||
853 | } | ||
854 | #else | ||
855 | static unsigned int | ||
856 | count_pages_for_highmem(unsigned int nr_highmem) { return 0; } | ||
857 | #endif /* CONFIG_HIGHMEM */ | ||
805 | 858 | ||
806 | /** | 859 | /** |
807 | * enough_free_mem - Make sure we enough free memory to snapshot. | 860 | * enough_free_mem - Make sure we have enough free memory for the |
808 | * | 861 | * snapshot image. |
809 | * Returns TRUE or FALSE after checking the number of available | ||
810 | * free pages. | ||
811 | */ | 862 | */ |
812 | 863 | ||
813 | static int enough_free_mem(unsigned int nr_pages) | 864 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) |
814 | { | 865 | { |
815 | struct zone *zone; | 866 | struct zone *zone; |
816 | unsigned int free = 0, meta = 0; | 867 | unsigned int free = 0, meta = 0; |
817 | 868 | ||
818 | for_each_zone (zone) | 869 | for_each_zone(zone) { |
819 | if (!is_highmem(zone)) { | 870 | meta += snapshot_additional_pages(zone); |
871 | if (!is_highmem(zone)) | ||
820 | free += zone->free_pages; | 872 | free += zone->free_pages; |
821 | meta += snapshot_additional_pages(zone); | 873 | } |
822 | } | ||
823 | 874 | ||
824 | pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", | 875 | nr_pages += count_pages_for_highmem(nr_highmem); |
876 | pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", | ||
825 | nr_pages, PAGES_FOR_IO, meta, free); | 877 | nr_pages, PAGES_FOR_IO, meta, free); |
826 | 878 | ||
827 | return free > nr_pages + PAGES_FOR_IO + meta; | 879 | return free > nr_pages + PAGES_FOR_IO + meta; |
828 | } | 880 | } |
829 | 881 | ||
882 | #ifdef CONFIG_HIGHMEM | ||
883 | /** | ||
884 | * get_highmem_buffer - if there are some highmem pages in the suspend | ||
885 | * image, we may need the buffer to copy them and/or load their data. | ||
886 | */ | ||
887 | |||
888 | static inline int get_highmem_buffer(int safe_needed) | ||
889 | { | ||
890 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); | ||
891 | return buffer ? 0 : -ENOMEM; | ||
892 | } | ||
893 | |||
894 | /** | ||
895 | * alloc_highmem_image_pages - allocate some highmem pages for the image. | ||
896 | * Try to allocate as many pages as needed, but if the number of free | ||
897 | * highmem pages is lesser than that, allocate them all. | ||
898 | */ | ||
899 | |||
900 | static inline unsigned int | ||
901 | alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | ||
902 | { | ||
903 | unsigned int to_alloc = count_free_highmem_pages(); | ||
904 | |||
905 | if (to_alloc > nr_highmem) | ||
906 | to_alloc = nr_highmem; | ||
907 | |||
908 | nr_highmem -= to_alloc; | ||
909 | while (to_alloc-- > 0) { | ||
910 | struct page *page; | ||
911 | |||
912 | page = alloc_image_page(__GFP_HIGHMEM); | ||
913 | memory_bm_set_bit(bm, page_to_pfn(page)); | ||
914 | } | ||
915 | return nr_highmem; | ||
916 | } | ||
917 | #else | ||
918 | static inline int get_highmem_buffer(int safe_needed) { return 0; } | ||
919 | |||
920 | static inline unsigned int | ||
921 | alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } | ||
922 | #endif /* CONFIG_HIGHMEM */ | ||
923 | |||
924 | /** | ||
925 | * swsusp_alloc - allocate memory for the suspend image | ||
926 | * | ||
927 | * We first try to allocate as many highmem pages as there are | ||
928 | * saveable highmem pages in the system. If that fails, we allocate | ||
929 | * non-highmem pages for the copies of the remaining highmem ones. | ||
930 | * | ||
931 | * In this approach it is likely that the copies of highmem pages will | ||
932 | * also be located in the high memory, because of the way in which | ||
933 | * copy_data_pages() works. | ||
934 | */ | ||
935 | |||
830 | static int | 936 | static int |
831 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 937 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
832 | unsigned int nr_pages) | 938 | unsigned int nr_pages, unsigned int nr_highmem) |
833 | { | 939 | { |
834 | int error; | 940 | int error; |
835 | 941 | ||
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
841 | if (error) | 947 | if (error) |
842 | goto Free; | 948 | goto Free; |
843 | 949 | ||
950 | if (nr_highmem > 0) { | ||
951 | error = get_highmem_buffer(PG_ANY); | ||
952 | if (error) | ||
953 | goto Free; | ||
954 | |||
955 | nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); | ||
956 | } | ||
844 | while (nr_pages-- > 0) { | 957 | while (nr_pages-- > 0) { |
845 | struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); | 958 | struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); |
959 | |||
846 | if (!page) | 960 | if (!page) |
847 | goto Free; | 961 | goto Free; |
848 | 962 | ||
849 | SetPageNosave(page); | ||
850 | SetPageNosaveFree(page); | ||
851 | memory_bm_set_bit(copy_bm, page_to_pfn(page)); | 963 | memory_bm_set_bit(copy_bm, page_to_pfn(page)); |
852 | } | 964 | } |
853 | return 0; | 965 | return 0; |
854 | 966 | ||
855 | Free: | 967 | Free: |
856 | swsusp_free(); | 968 | swsusp_free(); |
857 | return -ENOMEM; | 969 | return -ENOMEM; |
858 | } | 970 | } |
859 | 971 | ||
860 | /* Memory bitmap used for marking saveable pages */ | 972 | /* Memory bitmap used for marking saveable pages (during suspend) or the |
973 | * suspend image pages (during resume) | ||
974 | */ | ||
861 | static struct memory_bitmap orig_bm; | 975 | static struct memory_bitmap orig_bm; |
862 | /* Memory bitmap used for marking allocated pages that will contain the copies | 976 | /* Memory bitmap used on suspend for marking allocated pages that will contain |
863 | * of saveable pages | 977 | * the copies of saveable pages. During resume it is initially used for |
978 | * marking the suspend image pages, but then its set bits are duplicated in | ||
979 | * @orig_bm and it is released. Next, on systems with high memory, it may be | ||
980 | * used for marking "safe" highmem pages, but it has to be reinitialized for | ||
981 | * this purpose. | ||
864 | */ | 982 | */ |
865 | static struct memory_bitmap copy_bm; | 983 | static struct memory_bitmap copy_bm; |
866 | 984 | ||
867 | asmlinkage int swsusp_save(void) | 985 | asmlinkage int swsusp_save(void) |
868 | { | 986 | { |
869 | unsigned int nr_pages; | 987 | unsigned int nr_pages, nr_highmem; |
870 | 988 | ||
871 | pr_debug("swsusp: critical section: \n"); | 989 | printk("swsusp: critical section: \n"); |
872 | 990 | ||
873 | drain_local_pages(); | 991 | drain_local_pages(); |
874 | nr_pages = count_data_pages(); | 992 | nr_pages = count_data_pages(); |
875 | printk("swsusp: Need to copy %u pages\n", nr_pages); | 993 | nr_highmem = count_highmem_pages(); |
994 | printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); | ||
876 | 995 | ||
877 | if (!enough_free_mem(nr_pages)) { | 996 | if (!enough_free_mem(nr_pages, nr_highmem)) { |
878 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 997 | printk(KERN_ERR "swsusp: Not enough free memory\n"); |
879 | return -ENOMEM; | 998 | return -ENOMEM; |
880 | } | 999 | } |
881 | 1000 | ||
882 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages)) | 1001 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { |
1002 | printk(KERN_ERR "swsusp: Memory allocation failed\n"); | ||
883 | return -ENOMEM; | 1003 | return -ENOMEM; |
1004 | } | ||
884 | 1005 | ||
885 | /* During allocating of suspend pagedir, new cold pages may appear. | 1006 | /* During allocating of suspend pagedir, new cold pages may appear. |
886 | * Kill them. | 1007 | * Kill them. |
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void) | |||
894 | * touch swap space! Except we must write out our image of course. | 1015 | * touch swap space! Except we must write out our image of course. |
895 | */ | 1016 | */ |
896 | 1017 | ||
1018 | nr_pages += nr_highmem; | ||
897 | nr_copy_pages = nr_pages; | 1019 | nr_copy_pages = nr_pages; |
898 | nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1020 | nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); |
899 | 1021 | ||
900 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | 1022 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); |
1023 | |||
901 | return 0; | 1024 | return 0; |
902 | } | 1025 | } |
903 | 1026 | ||
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) | |||
960 | 1083 | ||
961 | if (!buffer) { | 1084 | if (!buffer) { |
962 | /* This makes the buffer be freed by swsusp_free() */ | 1085 | /* This makes the buffer be freed by swsusp_free() */ |
963 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); | 1086 | buffer = get_image_page(GFP_ATOMIC, PG_ANY); |
964 | if (!buffer) | 1087 | if (!buffer) |
965 | return -ENOMEM; | 1088 | return -ENOMEM; |
966 | } | 1089 | } |
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) | |||
975 | memset(buffer, 0, PAGE_SIZE); | 1098 | memset(buffer, 0, PAGE_SIZE); |
976 | pack_pfns(buffer, &orig_bm); | 1099 | pack_pfns(buffer, &orig_bm); |
977 | } else { | 1100 | } else { |
978 | unsigned long pfn = memory_bm_next_pfn(©_bm); | 1101 | struct page *page; |
979 | 1102 | ||
980 | handle->buffer = page_address(pfn_to_page(pfn)); | 1103 | page = pfn_to_page(memory_bm_next_pfn(©_bm)); |
1104 | if (PageHighMem(page)) { | ||
1105 | /* Highmem pages are copied to the buffer, | ||
1106 | * because we can't return with a kmapped | ||
1107 | * highmem page (we may not be called again). | ||
1108 | */ | ||
1109 | void *kaddr; | ||
1110 | |||
1111 | kaddr = kmap_atomic(page, KM_USER0); | ||
1112 | memcpy(buffer, kaddr, PAGE_SIZE); | ||
1113 | kunmap_atomic(kaddr, KM_USER0); | ||
1114 | handle->buffer = buffer; | ||
1115 | } else { | ||
1116 | handle->buffer = page_address(page); | ||
1117 | } | ||
981 | } | 1118 | } |
982 | handle->prev = handle->cur; | 1119 | handle->prev = handle->cur; |
983 | } | 1120 | } |
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
1005 | unsigned long pfn, max_zone_pfn; | 1142 | unsigned long pfn, max_zone_pfn; |
1006 | 1143 | ||
1007 | /* Clear page flags */ | 1144 | /* Clear page flags */ |
1008 | for_each_zone (zone) { | 1145 | for_each_zone(zone) { |
1009 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1146 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
1010 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1147 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1011 | if (pfn_valid(pfn)) | 1148 | if (pfn_valid(pfn)) |
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1101 | } | 1238 | } |
1102 | } | 1239 | } |
1103 | 1240 | ||
1241 | /* List of "safe" pages that may be used to store data loaded from the suspend | ||
1242 | * image | ||
1243 | */ | ||
1244 | static struct linked_page *safe_pages_list; | ||
1245 | |||
1246 | #ifdef CONFIG_HIGHMEM | ||
1247 | /* struct highmem_pbe is used for creating the list of highmem pages that | ||
1248 | * should be restored atomically during the resume from disk, because the page | ||
1249 | * frames they have occupied before the suspend are in use. | ||
1250 | */ | ||
1251 | struct highmem_pbe { | ||
1252 | struct page *copy_page; /* data is here now */ | ||
1253 | struct page *orig_page; /* data was here before the suspend */ | ||
1254 | struct highmem_pbe *next; | ||
1255 | }; | ||
1256 | |||
1257 | /* List of highmem PBEs needed for restoring the highmem pages that were | ||
1258 | * allocated before the suspend and included in the suspend image, but have | ||
1259 | * also been allocated by the "resume" kernel, so their contents cannot be | ||
1260 | * written directly to their "original" page frames. | ||
1261 | */ | ||
1262 | static struct highmem_pbe *highmem_pblist; | ||
1263 | |||
1264 | /** | ||
1265 | * count_highmem_image_pages - compute the number of highmem pages in the | ||
1266 | * suspend image. The bits in the memory bitmap @bm that correspond to the | ||
1267 | * image pages are assumed to be set. | ||
1268 | */ | ||
1269 | |||
1270 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | ||
1271 | { | ||
1272 | unsigned long pfn; | ||
1273 | unsigned int cnt = 0; | ||
1274 | |||
1275 | memory_bm_position_reset(bm); | ||
1276 | pfn = memory_bm_next_pfn(bm); | ||
1277 | while (pfn != BM_END_OF_MAP) { | ||
1278 | if (PageHighMem(pfn_to_page(pfn))) | ||
1279 | cnt++; | ||
1280 | |||
1281 | pfn = memory_bm_next_pfn(bm); | ||
1282 | } | ||
1283 | return cnt; | ||
1284 | } | ||
1285 | |||
1286 | /** | ||
1287 | * prepare_highmem_image - try to allocate as many highmem pages as | ||
1288 | * there are highmem image pages (@nr_highmem_p points to the variable | ||
1289 | * containing the number of highmem image pages). The pages that are | ||
1290 | * "safe" (ie. will not be overwritten when the suspend image is | ||
1291 | * restored) have the corresponding bits set in @bm (it must be | ||
1292 | * uninitialized). | ||
1293 | * | ||
1294 | * NOTE: This function should not be called if there are no highmem | ||
1295 | * image pages. | ||
1296 | */ | ||
1297 | |||
1298 | static unsigned int safe_highmem_pages; | ||
1299 | |||
1300 | static struct memory_bitmap *safe_highmem_bm; | ||
1301 | |||
1302 | static int | ||
1303 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | ||
1304 | { | ||
1305 | unsigned int to_alloc; | ||
1306 | |||
1307 | if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) | ||
1308 | return -ENOMEM; | ||
1309 | |||
1310 | if (get_highmem_buffer(PG_SAFE)) | ||
1311 | return -ENOMEM; | ||
1312 | |||
1313 | to_alloc = count_free_highmem_pages(); | ||
1314 | if (to_alloc > *nr_highmem_p) | ||
1315 | to_alloc = *nr_highmem_p; | ||
1316 | else | ||
1317 | *nr_highmem_p = to_alloc; | ||
1318 | |||
1319 | safe_highmem_pages = 0; | ||
1320 | while (to_alloc-- > 0) { | ||
1321 | struct page *page; | ||
1322 | |||
1323 | page = alloc_page(__GFP_HIGHMEM); | ||
1324 | if (!PageNosaveFree(page)) { | ||
1325 | /* The page is "safe", set its bit the bitmap */ | ||
1326 | memory_bm_set_bit(bm, page_to_pfn(page)); | ||
1327 | safe_highmem_pages++; | ||
1328 | } | ||
1329 | /* Mark the page as allocated */ | ||
1330 | SetPageNosave(page); | ||
1331 | SetPageNosaveFree(page); | ||
1332 | } | ||
1333 | memory_bm_position_reset(bm); | ||
1334 | safe_highmem_bm = bm; | ||
1335 | return 0; | ||
1336 | } | ||
1337 | |||
1338 | /** | ||
1339 | * get_highmem_page_buffer - for given highmem image page find the buffer | ||
1340 | * that snapshot_write_next() should set for its caller to write to. | ||
1341 | * | ||
1342 | * If the page is to be saved to its "original" page frame or a copy of | ||
1343 | * the page is to be made in the highmem, @buffer is returned. Otherwise, | ||
1344 | * the copy of the page is to be made in normal memory, so the address of | ||
1345 | * the copy is returned. | ||
1346 | * | ||
1347 | * If @buffer is returned, the caller of snapshot_write_next() will write | ||
1348 | * the page's contents to @buffer, so they will have to be copied to the | ||
1349 | * right location on the next call to snapshot_write_next() and it is done | ||
1350 | * with the help of copy_last_highmem_page(). For this purpose, if | ||
1351 | * @buffer is returned, @last_highmem_page is set to the page to which | ||
1352 | * the data will have to be copied from @buffer. | ||
1353 | */ | ||
1354 | |||
1355 | static struct page *last_highmem_page; | ||
1356 | |||
1357 | static void * | ||
1358 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
1359 | { | ||
1360 | struct highmem_pbe *pbe; | ||
1361 | void *kaddr; | ||
1362 | |||
1363 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
1364 | /* We have allocated the "original" page frame and we can | ||
1365 | * use it directly to store the loaded page. | ||
1366 | */ | ||
1367 | last_highmem_page = page; | ||
1368 | return buffer; | ||
1369 | } | ||
1370 | /* The "original" page frame has not been allocated and we have to | ||
1371 | * use a "safe" page frame to store the loaded page. | ||
1372 | */ | ||
1373 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); | ||
1374 | if (!pbe) { | ||
1375 | swsusp_free(); | ||
1376 | return NULL; | ||
1377 | } | ||
1378 | pbe->orig_page = page; | ||
1379 | if (safe_highmem_pages > 0) { | ||
1380 | struct page *tmp; | ||
1381 | |||
1382 | /* Copy of the page will be stored in high memory */ | ||
1383 | kaddr = buffer; | ||
1384 | tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); | ||
1385 | safe_highmem_pages--; | ||
1386 | last_highmem_page = tmp; | ||
1387 | pbe->copy_page = tmp; | ||
1388 | } else { | ||
1389 | /* Copy of the page will be stored in normal memory */ | ||
1390 | kaddr = safe_pages_list; | ||
1391 | safe_pages_list = safe_pages_list->next; | ||
1392 | pbe->copy_page = virt_to_page(kaddr); | ||
1393 | } | ||
1394 | pbe->next = highmem_pblist; | ||
1395 | highmem_pblist = pbe; | ||
1396 | return kaddr; | ||
1397 | } | ||
1398 | |||
1399 | /** | ||
1400 | * copy_last_highmem_page - copy the contents of a highmem image from | ||
1401 | * @buffer, where the caller of snapshot_write_next() has placed them, | ||
1402 | * to the right location represented by @last_highmem_page. | ||
1403 | */ | ||
1404 | |||
1405 | static void copy_last_highmem_page(void) | ||
1406 | { | ||
1407 | if (last_highmem_page) { | ||
1408 | void *dst; | ||
1409 | |||
1410 | dst = kmap_atomic(last_highmem_page, KM_USER0); | ||
1411 | memcpy(dst, buffer, PAGE_SIZE); | ||
1412 | kunmap_atomic(dst, KM_USER0); | ||
1413 | last_highmem_page = NULL; | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1417 | static inline int last_highmem_page_copied(void) | ||
1418 | { | ||
1419 | return !last_highmem_page; | ||
1420 | } | ||
1421 | |||
1422 | static inline void free_highmem_data(void) | ||
1423 | { | ||
1424 | if (safe_highmem_bm) | ||
1425 | memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); | ||
1426 | |||
1427 | if (buffer) | ||
1428 | free_image_page(buffer, PG_UNSAFE_CLEAR); | ||
1429 | } | ||
1430 | #else | ||
1431 | static inline int get_safe_write_buffer(void) { return 0; } | ||
1432 | |||
1433 | static unsigned int | ||
1434 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | ||
1435 | |||
1436 | static inline int | ||
1437 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | ||
1438 | { | ||
1439 | return 0; | ||
1440 | } | ||
1441 | |||
1442 | static inline void * | ||
1443 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
1444 | { | ||
1445 | return NULL; | ||
1446 | } | ||
1447 | |||
1448 | static inline void copy_last_highmem_page(void) {} | ||
1449 | static inline int last_highmem_page_copied(void) { return 1; } | ||
1450 | static inline void free_highmem_data(void) {} | ||
1451 | #endif /* CONFIG_HIGHMEM */ | ||
1452 | |||
1104 | /** | 1453 | /** |
1105 | * prepare_image - use the memory bitmap @bm to mark the pages that will | 1454 | * prepare_image - use the memory bitmap @bm to mark the pages that will |
1106 | * be overwritten in the process of restoring the system memory state | 1455 | * be overwritten in the process of restoring the system memory state |
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1110 | * The idea is to allocate a new memory bitmap first and then allocate | 1459 | * The idea is to allocate a new memory bitmap first and then allocate |
1111 | * as many pages as needed for the image data, but not to assign these | 1460 | * as many pages as needed for the image data, but not to assign these |
1112 | * pages to specific tasks initially. Instead, we just mark them as | 1461 | * pages to specific tasks initially. Instead, we just mark them as |
1113 | * allocated and create a list of "safe" pages that will be used later. | 1462 | * allocated and create a list of "safe" pages that will be used |
1463 | * later. On systems with high memory a list of "safe" highmem pages is | ||
1464 | * also created. | ||
1114 | */ | 1465 | */ |
1115 | 1466 | ||
1116 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | 1467 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) |
1117 | 1468 | ||
1118 | static struct linked_page *safe_pages_list; | ||
1119 | |||
1120 | static int | 1469 | static int |
1121 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | 1470 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) |
1122 | { | 1471 | { |
1123 | unsigned int nr_pages; | 1472 | unsigned int nr_pages, nr_highmem; |
1124 | struct linked_page *sp_list, *lp; | 1473 | struct linked_page *sp_list, *lp; |
1125 | int error; | 1474 | int error; |
1126 | 1475 | ||
1476 | /* If there is no highmem, the buffer will not be necessary */ | ||
1477 | free_image_page(buffer, PG_UNSAFE_CLEAR); | ||
1478 | buffer = NULL; | ||
1479 | |||
1480 | nr_highmem = count_highmem_image_pages(bm); | ||
1127 | error = mark_unsafe_pages(bm); | 1481 | error = mark_unsafe_pages(bm); |
1128 | if (error) | 1482 | if (error) |
1129 | goto Free; | 1483 | goto Free; |
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
1134 | 1488 | ||
1135 | duplicate_memory_bitmap(new_bm, bm); | 1489 | duplicate_memory_bitmap(new_bm, bm); |
1136 | memory_bm_free(bm, PG_UNSAFE_KEEP); | 1490 | memory_bm_free(bm, PG_UNSAFE_KEEP); |
1491 | if (nr_highmem > 0) { | ||
1492 | error = prepare_highmem_image(bm, &nr_highmem); | ||
1493 | if (error) | ||
1494 | goto Free; | ||
1495 | } | ||
1137 | /* Reserve some safe pages for potential later use. | 1496 | /* Reserve some safe pages for potential later use. |
1138 | * | 1497 | * |
1139 | * NOTE: This way we make sure there will be enough safe pages for the | 1498 | * NOTE: This way we make sure there will be enough safe pages for the |
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
1142 | */ | 1501 | */ |
1143 | sp_list = NULL; | 1502 | sp_list = NULL; |
1144 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ | 1503 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ |
1145 | nr_pages = nr_copy_pages - allocated_unsafe_pages; | 1504 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
1146 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); | 1505 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); |
1147 | while (nr_pages > 0) { | 1506 | while (nr_pages > 0) { |
1148 | lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); | 1507 | lp = get_image_page(GFP_ATOMIC, PG_SAFE); |
1149 | if (!lp) { | 1508 | if (!lp) { |
1150 | error = -ENOMEM; | 1509 | error = -ENOMEM; |
1151 | goto Free; | 1510 | goto Free; |
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
1156 | } | 1515 | } |
1157 | /* Preallocate memory for the image */ | 1516 | /* Preallocate memory for the image */ |
1158 | safe_pages_list = NULL; | 1517 | safe_pages_list = NULL; |
1159 | nr_pages = nr_copy_pages - allocated_unsafe_pages; | 1518 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
1160 | while (nr_pages > 0) { | 1519 | while (nr_pages > 0) { |
1161 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); | 1520 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); |
1162 | if (!lp) { | 1521 | if (!lp) { |
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
1181 | } | 1540 | } |
1182 | return 0; | 1541 | return 0; |
1183 | 1542 | ||
1184 | Free: | 1543 | Free: |
1185 | swsusp_free(); | 1544 | swsusp_free(); |
1186 | return error; | 1545 | return error; |
1187 | } | 1546 | } |
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
1196 | struct pbe *pbe; | 1555 | struct pbe *pbe; |
1197 | struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); | 1556 | struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); |
1198 | 1557 | ||
1558 | if (PageHighMem(page)) | ||
1559 | return get_highmem_page_buffer(page, ca); | ||
1560 | |||
1199 | if (PageNosave(page) && PageNosaveFree(page)) | 1561 | if (PageNosave(page) && PageNosaveFree(page)) |
1200 | /* We have allocated the "original" page frame and we can | 1562 | /* We have allocated the "original" page frame and we can |
1201 | * use it directly to store the loaded page. | 1563 | * use it directly to store the loaded page. |
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
1210 | swsusp_free(); | 1572 | swsusp_free(); |
1211 | return NULL; | 1573 | return NULL; |
1212 | } | 1574 | } |
1213 | pbe->orig_address = (unsigned long)page_address(page); | 1575 | pbe->orig_address = page_address(page); |
1214 | pbe->address = (unsigned long)safe_pages_list; | 1576 | pbe->address = safe_pages_list; |
1215 | safe_pages_list = safe_pages_list->next; | 1577 | safe_pages_list = safe_pages_list->next; |
1216 | pbe->next = restore_pblist; | 1578 | pbe->next = restore_pblist; |
1217 | restore_pblist = pbe; | 1579 | restore_pblist = pbe; |
1218 | return (void *)pbe->address; | 1580 | return pbe->address; |
1219 | } | 1581 | } |
1220 | 1582 | ||
1221 | /** | 1583 | /** |
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
1249 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | 1611 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) |
1250 | return 0; | 1612 | return 0; |
1251 | 1613 | ||
1252 | if (!buffer) { | 1614 | if (handle->offset == 0) { |
1253 | /* This makes the buffer be freed by swsusp_free() */ | 1615 | if (!buffer) |
1254 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); | 1616 | /* This makes the buffer be freed by swsusp_free() */ |
1617 | buffer = get_image_page(GFP_ATOMIC, PG_ANY); | ||
1618 | |||
1255 | if (!buffer) | 1619 | if (!buffer) |
1256 | return -ENOMEM; | 1620 | return -ENOMEM; |
1257 | } | 1621 | |
1258 | if (!handle->offset) | ||
1259 | handle->buffer = buffer; | 1622 | handle->buffer = buffer; |
1623 | } | ||
1260 | handle->sync_read = 1; | 1624 | handle->sync_read = 1; |
1261 | if (handle->prev < handle->cur) { | 1625 | if (handle->prev < handle->cur) { |
1262 | if (handle->prev == 0) { | 1626 | if (handle->prev == 0) { |
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
1284 | return -ENOMEM; | 1648 | return -ENOMEM; |
1285 | } | 1649 | } |
1286 | } else { | 1650 | } else { |
1651 | copy_last_highmem_page(); | ||
1287 | handle->buffer = get_buffer(&orig_bm, &ca); | 1652 | handle->buffer = get_buffer(&orig_bm, &ca); |
1288 | handle->sync_read = 0; | 1653 | if (handle->buffer != buffer) |
1654 | handle->sync_read = 0; | ||
1289 | } | 1655 | } |
1290 | handle->prev = handle->cur; | 1656 | handle->prev = handle->cur; |
1291 | } | 1657 | } |
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
1301 | return count; | 1667 | return count; |
1302 | } | 1668 | } |
1303 | 1669 | ||
1670 | /** | ||
1671 | * snapshot_write_finalize - must be called after the last call to | ||
1672 | * snapshot_write_next() in case the last page in the image happens | ||
1673 | * to be a highmem page and its contents should be stored in the | ||
1674 | * highmem. Additionally, it releases the memory that will not be | ||
1675 | * used any more. | ||
1676 | */ | ||
1677 | |||
1678 | void snapshot_write_finalize(struct snapshot_handle *handle) | ||
1679 | { | ||
1680 | copy_last_highmem_page(); | ||
1681 | /* Free only if we have loaded the image entirely */ | ||
1682 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { | ||
1683 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | ||
1684 | free_highmem_data(); | ||
1685 | } | ||
1686 | } | ||
1687 | |||
1304 | int snapshot_image_loaded(struct snapshot_handle *handle) | 1688 | int snapshot_image_loaded(struct snapshot_handle *handle) |
1305 | { | 1689 | { |
1306 | return !(!nr_copy_pages || | 1690 | return !(!nr_copy_pages || !last_highmem_page_copied() || |
1307 | handle->cur <= nr_meta_pages + nr_copy_pages); | 1691 | handle->cur <= nr_meta_pages + nr_copy_pages); |
1308 | } | 1692 | } |
1309 | 1693 | ||
1310 | void snapshot_free_unused_memory(struct snapshot_handle *handle) | 1694 | #ifdef CONFIG_HIGHMEM |
1695 | /* Assumes that @buf is ready and points to a "safe" page */ | ||
1696 | static inline void | ||
1697 | swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | ||
1311 | { | 1698 | { |
1312 | /* Free only if we have loaded the image entirely */ | 1699 | void *kaddr1, *kaddr2; |
1313 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | 1700 | |
1314 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 1701 | kaddr1 = kmap_atomic(p1, KM_USER0); |
1702 | kaddr2 = kmap_atomic(p2, KM_USER1); | ||
1703 | memcpy(buf, kaddr1, PAGE_SIZE); | ||
1704 | memcpy(kaddr1, kaddr2, PAGE_SIZE); | ||
1705 | memcpy(kaddr2, buf, PAGE_SIZE); | ||
1706 | kunmap_atomic(kaddr1, KM_USER0); | ||
1707 | kunmap_atomic(kaddr2, KM_USER1); | ||
1708 | } | ||
1709 | |||
1710 | /** | ||
1711 | * restore_highmem - for each highmem page that was allocated before | ||
1712 | * the suspend and included in the suspend image, and also has been | ||
1713 | * allocated by the "resume" kernel, swap its current (ie. "before | |
1714 | * resume") contents with the previous (ie. "before suspend") one. | ||
1715 | * | ||
1716 | * If the resume eventually fails, we can call this function once | ||
1717 | * again and restore the "before resume" highmem state. | ||
1718 | */ | ||
1719 | |||
1720 | int restore_highmem(void) | ||
1721 | { | ||
1722 | struct highmem_pbe *pbe = highmem_pblist; | ||
1723 | void *buf; | ||
1724 | |||
1725 | if (!pbe) | ||
1726 | return 0; | ||
1727 | |||
1728 | buf = get_image_page(GFP_ATOMIC, PG_SAFE); | ||
1729 | if (!buf) | ||
1730 | return -ENOMEM; | ||
1731 | |||
1732 | while (pbe) { | ||
1733 | swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); | ||
1734 | pbe = pbe->next; | ||
1735 | } | ||
1736 | free_image_page(buf, PG_UNSAFE_CLEAR); | ||
1737 | return 0; | ||
1315 | } | 1738 | } |
1739 | #endif /* CONFIG_HIGHMEM */ | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344dee..f133d4a6d817 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -34,34 +34,123 @@ extern char resume_file[]; | |||
34 | #define SWSUSP_SIG "S1SUSPEND" | 34 | #define SWSUSP_SIG "S1SUSPEND" |
35 | 35 | ||
36 | static struct swsusp_header { | 36 | static struct swsusp_header { |
37 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | 37 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; |
38 | swp_entry_t image; | 38 | sector_t image; |
39 | char orig_sig[10]; | 39 | char orig_sig[10]; |
40 | char sig[10]; | 40 | char sig[10]; |
41 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | 41 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Saving part... | 44 | * General things |
45 | */ | 45 | */ |
46 | 46 | ||
47 | static unsigned short root_swap = 0xffff; | 47 | static unsigned short root_swap = 0xffff; |
48 | static struct block_device *resume_bdev; | ||
49 | |||
50 | /** | ||
51 | * submit - submit BIO request. | ||
52 | * @rw: READ or WRITE. | ||
53 | * @page_off: physical offset of page. | |
54 | * @page: page we're reading or writing. | ||
55 | * @bio_chain: list of pending bios (for async reading) | |
56 | * | ||
57 | * Straight from the textbook - allocate and initialize the bio. | ||
58 | * If we're reading, make sure the page is marked as dirty. | ||
59 | * Then submit it and, if @bio_chain == NULL, wait. | ||
60 | */ | ||
61 | static int submit(int rw, pgoff_t page_off, struct page *page, | ||
62 | struct bio **bio_chain) | ||
63 | { | ||
64 | struct bio *bio; | ||
65 | |||
66 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | ||
67 | if (!bio) | ||
68 | return -ENOMEM; | ||
69 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
70 | bio->bi_bdev = resume_bdev; | ||
71 | bio->bi_end_io = end_swap_bio_read; | ||
72 | |||
73 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | ||
74 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); | ||
75 | bio_put(bio); | ||
76 | return -EFAULT; | ||
77 | } | ||
78 | |||
79 | lock_page(page); | ||
80 | bio_get(bio); | ||
48 | 81 | ||
49 | static int mark_swapfiles(swp_entry_t start) | 82 | if (bio_chain == NULL) { |
83 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
84 | wait_on_page_locked(page); | ||
85 | if (rw == READ) | ||
86 | bio_set_pages_dirty(bio); | ||
87 | bio_put(bio); | ||
88 | } else { | ||
89 | if (rw == READ) | ||
90 | get_page(page); /* These pages are freed later */ | ||
91 | bio->bi_private = *bio_chain; | ||
92 | *bio_chain = bio; | ||
93 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
94 | } | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
99 | { | ||
100 | return submit(READ, page_off, virt_to_page(addr), bio_chain); | ||
101 | } | ||
102 | |||
103 | static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
104 | { | ||
105 | return submit(WRITE, page_off, virt_to_page(addr), bio_chain); | ||
106 | } | ||
107 | |||
108 | static int wait_on_bio_chain(struct bio **bio_chain) | ||
109 | { | ||
110 | struct bio *bio; | ||
111 | struct bio *next_bio; | ||
112 | int ret = 0; | ||
113 | |||
114 | if (bio_chain == NULL) | ||
115 | return 0; | ||
116 | |||
117 | bio = *bio_chain; | ||
118 | if (bio == NULL) | ||
119 | return 0; | ||
120 | while (bio) { | ||
121 | struct page *page; | ||
122 | |||
123 | next_bio = bio->bi_private; | ||
124 | page = bio->bi_io_vec[0].bv_page; | ||
125 | wait_on_page_locked(page); | ||
126 | if (!PageUptodate(page) || PageError(page)) | ||
127 | ret = -EIO; | ||
128 | put_page(page); | ||
129 | bio_put(bio); | ||
130 | bio = next_bio; | ||
131 | } | ||
132 | *bio_chain = NULL; | ||
133 | return ret; | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Saving part | ||
138 | */ | ||
139 | |||
140 | static int mark_swapfiles(sector_t start) | ||
50 | { | 141 | { |
51 | int error; | 142 | int error; |
52 | 143 | ||
53 | rw_swap_page_sync(READ, swp_entry(root_swap, 0), | 144 | bio_read_page(swsusp_resume_block, &swsusp_header, NULL); |
54 | virt_to_page((unsigned long)&swsusp_header), NULL); | ||
55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | 145 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || |
56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 146 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 147 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 148 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
59 | swsusp_header.image = start; | 149 | swsusp_header.image = start; |
60 | error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), | 150 | error = bio_write_page(swsusp_resume_block, |
61 | virt_to_page((unsigned long)&swsusp_header), | 151 | &swsusp_header, NULL); |
62 | NULL); | ||
63 | } else { | 152 | } else { |
64 | pr_debug("swsusp: Partition is not swap space.\n"); | 153 | printk(KERN_ERR "swsusp: Swap header not found!\n"); |
65 | error = -ENODEV; | 154 | error = -ENODEV; |
66 | } | 155 | } |
67 | return error; | 156 | return error; |
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start) | |||
74 | 163 | ||
75 | static int swsusp_swap_check(void) /* This is called before saving image */ | 164 | static int swsusp_swap_check(void) /* This is called before saving image */ |
76 | { | 165 | { |
77 | int res = swap_type_of(swsusp_resume_device); | 166 | int res; |
167 | |||
168 | res = swap_type_of(swsusp_resume_device, swsusp_resume_block); | ||
169 | if (res < 0) | ||
170 | return res; | ||
171 | |||
172 | root_swap = res; | ||
173 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE); | ||
174 | if (IS_ERR(resume_bdev)) | ||
175 | return PTR_ERR(resume_bdev); | ||
176 | |||
177 | res = set_blocksize(resume_bdev, PAGE_SIZE); | ||
178 | if (res < 0) | ||
179 | blkdev_put(resume_bdev); | ||
78 | 180 | ||
79 | if (res >= 0) { | ||
80 | root_swap = res; | ||
81 | return 0; | ||
82 | } | ||
83 | return res; | 181 | return res; |
84 | } | 182 | } |
85 | 183 | ||
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
90 | * @bio_chain: Link the next write BIO here | 188 | * @bio_chain: Link the next write BIO here |
91 | */ | 189 | */ |
92 | 190 | ||
93 | static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) | 191 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) |
94 | { | 192 | { |
95 | swp_entry_t entry; | 193 | void *src; |
96 | int error = -ENOSPC; | 194 | |
97 | 195 | if (!offset) | |
98 | if (offset) { | 196 | return -ENOSPC; |
99 | struct page *page = virt_to_page(buf); | 197 | |
100 | 198 | if (bio_chain) { | |
101 | if (bio_chain) { | 199 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
102 | /* | 200 | if (src) { |
103 | * Whether or not we successfully allocated a copy page, | 201 | memcpy(src, buf, PAGE_SIZE); |
104 | * we take a ref on the page here. It gets undone in | 202 | } else { |
105 | * wait_on_bio_chain(). | 203 | WARN_ON_ONCE(1); |
106 | */ | 204 | bio_chain = NULL; /* Go synchronous */ |
107 | struct page *page_copy; | 205 | src = buf; |
108 | page_copy = alloc_page(GFP_ATOMIC); | ||
109 | if (page_copy == NULL) { | ||
110 | WARN_ON_ONCE(1); | ||
111 | bio_chain = NULL; /* Go synchronous */ | ||
112 | get_page(page); | ||
113 | } else { | ||
114 | memcpy(page_address(page_copy), | ||
115 | page_address(page), PAGE_SIZE); | ||
116 | page = page_copy; | ||
117 | } | ||
118 | } | 206 | } |
119 | entry = swp_entry(root_swap, offset); | 207 | } else { |
120 | error = rw_swap_page_sync(WRITE, entry, page, bio_chain); | 208 | src = buf; |
121 | } | 209 | } |
122 | return error; | 210 | return bio_write_page(offset, src, bio_chain); |
123 | } | 211 | } |
124 | 212 | ||
125 | /* | 213 | /* |
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) | |||
137 | * at a time. | 225 | * at a time. |
138 | */ | 226 | */ |
139 | 227 | ||
140 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) | 228 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
141 | 229 | ||
142 | struct swap_map_page { | 230 | struct swap_map_page { |
143 | unsigned long entries[MAP_PAGE_ENTRIES]; | 231 | sector_t entries[MAP_PAGE_ENTRIES]; |
144 | unsigned long next_swap; | 232 | sector_t next_swap; |
145 | }; | 233 | }; |
146 | 234 | ||
147 | /** | 235 | /** |
@@ -151,7 +239,7 @@ struct swap_map_page { | |||
151 | 239 | ||
152 | struct swap_map_handle { | 240 | struct swap_map_handle { |
153 | struct swap_map_page *cur; | 241 | struct swap_map_page *cur; |
154 | unsigned long cur_swap; | 242 | sector_t cur_swap; |
155 | struct bitmap_page *bitmap; | 243 | struct bitmap_page *bitmap; |
156 | unsigned int k; | 244 | unsigned int k; |
157 | }; | 245 | }; |
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle) | |||
166 | handle->bitmap = NULL; | 254 | handle->bitmap = NULL; |
167 | } | 255 | } |
168 | 256 | ||
169 | static void show_speed(struct timeval *start, struct timeval *stop, | ||
170 | unsigned nr_pages, char *msg) | ||
171 | { | ||
172 | s64 elapsed_centisecs64; | ||
173 | int centisecs; | ||
174 | int k; | ||
175 | int kps; | ||
176 | |||
177 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | ||
178 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
179 | centisecs = elapsed_centisecs64; | ||
180 | if (centisecs == 0) | ||
181 | centisecs = 1; /* avoid div-by-zero */ | ||
182 | k = nr_pages * (PAGE_SIZE / 1024); | ||
183 | kps = (k * 100) / centisecs; | ||
184 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | ||
185 | centisecs / 100, centisecs % 100, | ||
186 | kps / 1000, (kps % 1000) / 10); | ||
187 | } | ||
188 | |||
189 | static int get_swap_writer(struct swap_map_handle *handle) | 257 | static int get_swap_writer(struct swap_map_handle *handle) |
190 | { | 258 | { |
191 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); | 259 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); |
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
196 | release_swap_writer(handle); | 264 | release_swap_writer(handle); |
197 | return -ENOMEM; | 265 | return -ENOMEM; |
198 | } | 266 | } |
199 | handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); | 267 | handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); |
200 | if (!handle->cur_swap) { | 268 | if (!handle->cur_swap) { |
201 | release_swap_writer(handle); | 269 | release_swap_writer(handle); |
202 | return -ENOSPC; | 270 | return -ENOSPC; |
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
205 | return 0; | 273 | return 0; |
206 | } | 274 | } |
207 | 275 | ||
208 | static int wait_on_bio_chain(struct bio **bio_chain) | ||
209 | { | ||
210 | struct bio *bio; | ||
211 | struct bio *next_bio; | ||
212 | int ret = 0; | ||
213 | |||
214 | if (bio_chain == NULL) | ||
215 | return 0; | ||
216 | |||
217 | bio = *bio_chain; | ||
218 | if (bio == NULL) | ||
219 | return 0; | ||
220 | while (bio) { | ||
221 | struct page *page; | ||
222 | |||
223 | next_bio = bio->bi_private; | ||
224 | page = bio->bi_io_vec[0].bv_page; | ||
225 | wait_on_page_locked(page); | ||
226 | if (!PageUptodate(page) || PageError(page)) | ||
227 | ret = -EIO; | ||
228 | put_page(page); | ||
229 | bio_put(bio); | ||
230 | bio = next_bio; | ||
231 | } | ||
232 | *bio_chain = NULL; | ||
233 | return ret; | ||
234 | } | ||
235 | |||
236 | static int swap_write_page(struct swap_map_handle *handle, void *buf, | 276 | static int swap_write_page(struct swap_map_handle *handle, void *buf, |
237 | struct bio **bio_chain) | 277 | struct bio **bio_chain) |
238 | { | 278 | { |
239 | int error = 0; | 279 | int error = 0; |
240 | unsigned long offset; | 280 | sector_t offset; |
241 | 281 | ||
242 | if (!handle->cur) | 282 | if (!handle->cur) |
243 | return -EINVAL; | 283 | return -EINVAL; |
244 | offset = alloc_swap_page(root_swap, handle->bitmap); | 284 | offset = alloc_swapdev_block(root_swap, handle->bitmap); |
245 | error = write_page(buf, offset, bio_chain); | 285 | error = write_page(buf, offset, bio_chain); |
246 | if (error) | 286 | if (error) |
247 | return error; | 287 | return error; |
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
250 | error = wait_on_bio_chain(bio_chain); | 290 | error = wait_on_bio_chain(bio_chain); |
251 | if (error) | 291 | if (error) |
252 | goto out; | 292 | goto out; |
253 | offset = alloc_swap_page(root_swap, handle->bitmap); | 293 | offset = alloc_swapdev_block(root_swap, handle->bitmap); |
254 | if (!offset) | 294 | if (!offset) |
255 | return -ENOSPC; | 295 | return -ENOSPC; |
256 | handle->cur->next_swap = offset; | 296 | handle->cur->next_swap = offset; |
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
261 | handle->cur_swap = offset; | 301 | handle->cur_swap = offset; |
262 | handle->k = 0; | 302 | handle->k = 0; |
263 | } | 303 | } |
264 | out: | 304 | out: |
265 | return error; | 305 | return error; |
266 | } | 306 | } |
267 | 307 | ||
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle, | |||
315 | error = err2; | 355 | error = err2; |
316 | if (!error) | 356 | if (!error) |
317 | printk("\b\b\b\bdone\n"); | 357 | printk("\b\b\b\bdone\n"); |
318 | show_speed(&start, &stop, nr_to_write, "Wrote"); | 358 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
319 | return error; | 359 | return error; |
320 | } | 360 | } |
321 | 361 | ||
@@ -350,99 +390,50 @@ int swsusp_write(void) | |||
350 | struct swsusp_info *header; | 390 | struct swsusp_info *header; |
351 | int error; | 391 | int error; |
352 | 392 | ||
353 | if ((error = swsusp_swap_check())) { | 393 | error = swsusp_swap_check(); |
394 | if (error) { | ||
354 | printk(KERN_ERR "swsusp: Cannot find swap device, try " | 395 | printk(KERN_ERR "swsusp: Cannot find swap device, try " |
355 | "swapon -a.\n"); | 396 | "swapon -a.\n"); |
356 | return error; | 397 | return error; |
357 | } | 398 | } |
358 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 399 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
359 | error = snapshot_read_next(&snapshot, PAGE_SIZE); | 400 | error = snapshot_read_next(&snapshot, PAGE_SIZE); |
360 | if (error < PAGE_SIZE) | 401 | if (error < PAGE_SIZE) { |
361 | return error < 0 ? error : -EFAULT; | 402 | if (error >= 0) |
403 | error = -EFAULT; | ||
404 | |||
405 | goto out; | ||
406 | } | ||
362 | header = (struct swsusp_info *)data_of(snapshot); | 407 | header = (struct swsusp_info *)data_of(snapshot); |
363 | if (!enough_swap(header->pages)) { | 408 | if (!enough_swap(header->pages)) { |
364 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | 409 | printk(KERN_ERR "swsusp: Not enough free swap\n"); |
365 | return -ENOSPC; | 410 | error = -ENOSPC; |
411 | goto out; | ||
366 | } | 412 | } |
367 | error = get_swap_writer(&handle); | 413 | error = get_swap_writer(&handle); |
368 | if (!error) { | 414 | if (!error) { |
369 | unsigned long start = handle.cur_swap; | 415 | sector_t start = handle.cur_swap; |
416 | |||
370 | error = swap_write_page(&handle, header, NULL); | 417 | error = swap_write_page(&handle, header, NULL); |
371 | if (!error) | 418 | if (!error) |
372 | error = save_image(&handle, &snapshot, | 419 | error = save_image(&handle, &snapshot, |
373 | header->pages - 1); | 420 | header->pages - 1); |
421 | |||
374 | if (!error) { | 422 | if (!error) { |
375 | flush_swap_writer(&handle); | 423 | flush_swap_writer(&handle); |
376 | printk("S"); | 424 | printk("S"); |
377 | error = mark_swapfiles(swp_entry(root_swap, start)); | 425 | error = mark_swapfiles(start); |
378 | printk("|\n"); | 426 | printk("|\n"); |
379 | } | 427 | } |
380 | } | 428 | } |
381 | if (error) | 429 | if (error) |
382 | free_all_swap_pages(root_swap, handle.bitmap); | 430 | free_all_swap_pages(root_swap, handle.bitmap); |
383 | release_swap_writer(&handle); | 431 | release_swap_writer(&handle); |
432 | out: | ||
433 | swsusp_close(); | ||
384 | return error; | 434 | return error; |
385 | } | 435 | } |
386 | 436 | ||
387 | static struct block_device *resume_bdev; | ||
388 | |||
389 | /** | ||
390 | * submit - submit BIO request. | ||
391 | * @rw: READ or WRITE. | ||
392 | * @off physical offset of page. | ||
393 | * @page: page we're reading or writing. | ||
394 | * @bio_chain: list of pending biod (for async reading) | ||
395 | * | ||
396 | * Straight from the textbook - allocate and initialize the bio. | ||
397 | * If we're reading, make sure the page is marked as dirty. | ||
398 | * Then submit it and, if @bio_chain == NULL, wait. | ||
399 | */ | ||
400 | static int submit(int rw, pgoff_t page_off, struct page *page, | ||
401 | struct bio **bio_chain) | ||
402 | { | ||
403 | struct bio *bio; | ||
404 | |||
405 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
406 | if (!bio) | ||
407 | return -ENOMEM; | ||
408 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
409 | bio->bi_bdev = resume_bdev; | ||
410 | bio->bi_end_io = end_swap_bio_read; | ||
411 | |||
412 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | ||
413 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); | ||
414 | bio_put(bio); | ||
415 | return -EFAULT; | ||
416 | } | ||
417 | |||
418 | lock_page(page); | ||
419 | bio_get(bio); | ||
420 | |||
421 | if (bio_chain == NULL) { | ||
422 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
423 | wait_on_page_locked(page); | ||
424 | if (rw == READ) | ||
425 | bio_set_pages_dirty(bio); | ||
426 | bio_put(bio); | ||
427 | } else { | ||
428 | get_page(page); | ||
429 | bio->bi_private = *bio_chain; | ||
430 | *bio_chain = bio; | ||
431 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
432 | } | ||
433 | return 0; | ||
434 | } | ||
435 | |||
436 | static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
437 | { | ||
438 | return submit(READ, page_off, virt_to_page(addr), bio_chain); | ||
439 | } | ||
440 | |||
441 | static int bio_write_page(pgoff_t page_off, void *addr) | ||
442 | { | ||
443 | return submit(WRITE, page_off, virt_to_page(addr), NULL); | ||
444 | } | ||
445 | |||
446 | /** | 437 | /** |
447 | * The following functions allow us to read data using a swap map | 438 | * The following functions allow us to read data using a swap map |
448 | * in a file-alike way | 439 | * in a file-alike way |
@@ -455,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle) | |||
455 | handle->cur = NULL; | 446 | handle->cur = NULL; |
456 | } | 447 | } |
457 | 448 | ||
458 | static int get_swap_reader(struct swap_map_handle *handle, | 449 | static int get_swap_reader(struct swap_map_handle *handle, sector_t start) |
459 | swp_entry_t start) | ||
460 | { | 450 | { |
461 | int error; | 451 | int error; |
462 | 452 | ||
463 | if (!swp_offset(start)) | 453 | if (!start) |
464 | return -EINVAL; | 454 | return -EINVAL; |
465 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 455 | |
456 | handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); | ||
466 | if (!handle->cur) | 457 | if (!handle->cur) |
467 | return -ENOMEM; | 458 | return -ENOMEM; |
468 | error = bio_read_page(swp_offset(start), handle->cur, NULL); | 459 | |
460 | error = bio_read_page(start, handle->cur, NULL); | ||
469 | if (error) { | 461 | if (error) { |
470 | release_swap_reader(handle); | 462 | release_swap_reader(handle); |
471 | return error; | 463 | return error; |
@@ -477,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
477 | static int swap_read_page(struct swap_map_handle *handle, void *buf, | 469 | static int swap_read_page(struct swap_map_handle *handle, void *buf, |
478 | struct bio **bio_chain) | 470 | struct bio **bio_chain) |
479 | { | 471 | { |
480 | unsigned long offset; | 472 | sector_t offset; |
481 | int error; | 473 | int error; |
482 | 474 | ||
483 | if (!handle->cur) | 475 | if (!handle->cur) |
@@ -546,11 +538,11 @@ static int load_image(struct swap_map_handle *handle, | |||
546 | error = err2; | 538 | error = err2; |
547 | if (!error) { | 539 | if (!error) { |
548 | printk("\b\b\b\bdone\n"); | 540 | printk("\b\b\b\bdone\n"); |
549 | snapshot_free_unused_memory(snapshot); | 541 | snapshot_write_finalize(snapshot); |
550 | if (!snapshot_image_loaded(snapshot)) | 542 | if (!snapshot_image_loaded(snapshot)) |
551 | error = -ENODATA; | 543 | error = -ENODATA; |
552 | } | 544 | } |
553 | show_speed(&start, &stop, nr_to_read, "Read"); | 545 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
554 | return error; | 546 | return error; |
555 | } | 547 | } |
556 | 548 | ||
@@ -599,12 +591,16 @@ int swsusp_check(void) | |||
599 | if (!IS_ERR(resume_bdev)) { | 591 | if (!IS_ERR(resume_bdev)) { |
600 | set_blocksize(resume_bdev, PAGE_SIZE); | 592 | set_blocksize(resume_bdev, PAGE_SIZE); |
601 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | 593 | memset(&swsusp_header, 0, sizeof(swsusp_header)); |
602 | if ((error = bio_read_page(0, &swsusp_header, NULL))) | 594 | error = bio_read_page(swsusp_resume_block, |
595 | &swsusp_header, NULL); | ||
596 | if (error) | ||
603 | return error; | 597 | return error; |
598 | |||
604 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 599 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { |
605 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | 600 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); |
606 | /* Reset swap signature now */ | 601 | /* Reset swap signature now */ |
607 | error = bio_write_page(0, &swsusp_header); | 602 | error = bio_write_page(swsusp_resume_block, |
603 | &swsusp_header, NULL); | ||
608 | } else { | 604 | } else { |
609 | return -EINVAL; | 605 | return -EINVAL; |
610 | } | 606 | } |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0b66659dc516..31aa0390c777 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/bootmem.h> | 49 | #include <linux/bootmem.h> |
50 | #include <linux/syscalls.h> | 50 | #include <linux/syscalls.h> |
51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
52 | #include <linux/time.h> | ||
52 | 53 | ||
53 | #include "power.h" | 54 | #include "power.h" |
54 | 55 | ||
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0; | |||
64 | 65 | ||
65 | #ifdef CONFIG_HIGHMEM | 66 | #ifdef CONFIG_HIGHMEM |
66 | unsigned int count_highmem_pages(void); | 67 | unsigned int count_highmem_pages(void); |
67 | int save_highmem(void); | ||
68 | int restore_highmem(void); | 68 | int restore_highmem(void); |
69 | #else | 69 | #else |
70 | static inline int save_highmem(void) { return 0; } | ||
71 | static inline int restore_highmem(void) { return 0; } | 70 | static inline int restore_highmem(void) { return 0; } |
72 | static inline unsigned int count_highmem_pages(void) { return 0; } | 71 | static inline unsigned int count_highmem_pages(void) { return 0; } |
73 | #endif | 72 | #endif |
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) | |||
134 | return 0; | 133 | return 0; |
135 | } | 134 | } |
136 | 135 | ||
137 | unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) | 136 | sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) |
138 | { | 137 | { |
139 | unsigned long offset; | 138 | unsigned long offset; |
140 | 139 | ||
141 | offset = swp_offset(get_swap_page_of_type(swap)); | 140 | offset = swp_offset(get_swap_page_of_type(swap)); |
142 | if (offset) { | 141 | if (offset) { |
143 | if (bitmap_set(bitmap, offset)) { | 142 | if (bitmap_set(bitmap, offset)) |
144 | swap_free(swp_entry(swap, offset)); | 143 | swap_free(swp_entry(swap, offset)); |
145 | offset = 0; | 144 | else |
146 | } | 145 | return swapdev_block(swap, offset); |
147 | } | 146 | } |
148 | return offset; | 147 | return 0; |
149 | } | 148 | } |
150 | 149 | ||
151 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | 150 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) |
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | |||
166 | } | 165 | } |
167 | 166 | ||
168 | /** | 167 | /** |
168 | * swsusp_show_speed - print the time elapsed between two events represented by | ||
169 | * @start and @stop | ||
170 | * | ||
171 | * @nr_pages - number of pages processed between @start and @stop | ||
172 | * @msg - introductory message to print | ||
173 | */ | ||
174 | |||
175 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | ||
176 | unsigned nr_pages, char *msg) | ||
177 | { | ||
178 | s64 elapsed_centisecs64; | ||
179 | int centisecs; | ||
180 | int k; | ||
181 | int kps; | ||
182 | |||
183 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | ||
184 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
185 | centisecs = elapsed_centisecs64; | ||
186 | if (centisecs == 0) | ||
187 | centisecs = 1; /* avoid div-by-zero */ | ||
188 | k = nr_pages * (PAGE_SIZE / 1024); | ||
189 | kps = (k * 100) / centisecs; | ||
190 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | ||
191 | centisecs / 100, centisecs % 100, | ||
192 | kps / 1000, (kps % 1000) / 10); | ||
193 | } | ||
194 | |||
195 | /** | ||
169 | * swsusp_shrink_memory - Try to free as much memory as needed | 196 | * swsusp_shrink_memory - Try to free as much memory as needed |
170 | * | 197 | * |
171 | * ... but do not OOM-kill anyone | 198 | * ... but do not OOM-kill anyone |
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp) | |||
184 | 211 | ||
185 | int swsusp_shrink_memory(void) | 212 | int swsusp_shrink_memory(void) |
186 | { | 213 | { |
187 | long size, tmp; | 214 | long tmp; |
188 | struct zone *zone; | 215 | struct zone *zone; |
189 | unsigned long pages = 0; | 216 | unsigned long pages = 0; |
190 | unsigned int i = 0; | 217 | unsigned int i = 0; |
191 | char *p = "-\\|/"; | 218 | char *p = "-\\|/"; |
219 | struct timeval start, stop; | ||
192 | 220 | ||
193 | printk("Shrinking memory... "); | 221 | printk("Shrinking memory... "); |
222 | do_gettimeofday(&start); | ||
194 | do { | 223 | do { |
195 | size = 2 * count_highmem_pages(); | 224 | long size, highmem_size; |
196 | size += size / 50 + count_data_pages() + PAGES_FOR_IO; | 225 | |
226 | highmem_size = count_highmem_pages(); | ||
227 | size = count_data_pages() + PAGES_FOR_IO; | ||
197 | tmp = size; | 228 | tmp = size; |
229 | size += highmem_size; | ||
198 | for_each_zone (zone) | 230 | for_each_zone (zone) |
199 | if (!is_highmem(zone) && populated_zone(zone)) { | 231 | if (populated_zone(zone)) { |
200 | tmp -= zone->free_pages; | 232 | if (is_highmem(zone)) { |
201 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | 233 | highmem_size -= zone->free_pages; |
202 | tmp += snapshot_additional_pages(zone); | 234 | } else { |
235 | tmp -= zone->free_pages; | ||
236 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | ||
237 | tmp += snapshot_additional_pages(zone); | ||
238 | } | ||
203 | } | 239 | } |
240 | |||
241 | if (highmem_size < 0) | ||
242 | highmem_size = 0; | ||
243 | |||
244 | tmp += highmem_size; | ||
204 | if (tmp > 0) { | 245 | if (tmp > 0) { |
205 | tmp = __shrink_memory(tmp); | 246 | tmp = __shrink_memory(tmp); |
206 | if (!tmp) | 247 | if (!tmp) |
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void) | |||
212 | } | 253 | } |
213 | printk("\b%c", p[i++%4]); | 254 | printk("\b%c", p[i++%4]); |
214 | } while (tmp > 0); | 255 | } while (tmp > 0); |
256 | do_gettimeofday(&stop); | ||
215 | printk("\bdone (%lu pages freed)\n", pages); | 257 | printk("\bdone (%lu pages freed)\n", pages); |
258 | swsusp_show_speed(&start, &stop, pages, "Freed"); | ||
216 | 259 | ||
217 | return 0; | 260 | return 0; |
218 | } | 261 | } |
@@ -223,6 +266,7 @@ int swsusp_suspend(void) | |||
223 | 266 | ||
224 | if ((error = arch_prepare_suspend())) | 267 | if ((error = arch_prepare_suspend())) |
225 | return error; | 268 | return error; |
269 | |||
226 | local_irq_disable(); | 270 | local_irq_disable(); |
227 | /* At this point, device_suspend() has been called, but *not* | 271 | /* At this point, device_suspend() has been called, but *not* |
228 | * device_power_down(). We *must* device_power_down() now. | 272 | * device_power_down(). We *must* device_power_down() now. |
@@ -235,23 +279,16 @@ int swsusp_suspend(void) | |||
235 | goto Enable_irqs; | 279 | goto Enable_irqs; |
236 | } | 280 | } |
237 | 281 | ||
238 | if ((error = save_highmem())) { | ||
239 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); | ||
240 | goto Restore_highmem; | ||
241 | } | ||
242 | |||
243 | save_processor_state(); | 282 | save_processor_state(); |
244 | if ((error = swsusp_arch_suspend())) | 283 | if ((error = swsusp_arch_suspend())) |
245 | printk(KERN_ERR "Error %d suspending\n", error); | 284 | printk(KERN_ERR "Error %d suspending\n", error); |
246 | /* Restore control flow magically appears here */ | 285 | /* Restore control flow magically appears here */ |
247 | restore_processor_state(); | 286 | restore_processor_state(); |
248 | Restore_highmem: | ||
249 | restore_highmem(); | ||
250 | /* NOTE: device_power_up() is just a resume() for devices | 287 | /* NOTE: device_power_up() is just a resume() for devices |
251 | * that suspended with irqs off ... no overall powerup. | 288 | * that suspended with irqs off ... no overall powerup. |
252 | */ | 289 | */ |
253 | device_power_up(); | 290 | device_power_up(); |
254 | Enable_irqs: | 291 | Enable_irqs: |
255 | local_irq_enable(); | 292 | local_irq_enable(); |
256 | return error; | 293 | return error; |
257 | } | 294 | } |
@@ -268,18 +305,23 @@ int swsusp_resume(void) | |||
268 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); | 305 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); |
269 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | 306 | /* We'll ignore saved state, but this gets preempt count (etc) right */ |
270 | save_processor_state(); | 307 | save_processor_state(); |
271 | error = swsusp_arch_resume(); | 308 | error = restore_highmem(); |
272 | /* Code below is only ever reached in case of failure. Otherwise | 309 | if (!error) { |
273 | * execution continues at place where swsusp_arch_suspend was called | 310 | error = swsusp_arch_resume(); |
274 | */ | 311 | /* The code below is only ever reached in case of a failure. |
275 | BUG_ON(!error); | 312 | * Otherwise execution continues at place where |
313 | * swsusp_arch_suspend() was called | ||
314 | */ | ||
315 | BUG_ON(!error); | ||
317 | /* This call to restore_highmem() undoes the previous one */ | ||
317 | restore_highmem(); | ||
318 | } | ||
276 | /* The only reason why swsusp_arch_resume() can fail is memory being | 319 | /* The only reason why swsusp_arch_resume() can fail is memory being |
277 | * very tight, so we have to free it as soon as we can to avoid | 320 | * very tight, so we have to free it as soon as we can to avoid |
278 | * subsequent failures | 321 | * subsequent failures |
279 | */ | 322 | */ |
280 | swsusp_free(); | 323 | swsusp_free(); |
281 | restore_processor_state(); | 324 | restore_processor_state(); |
282 | restore_highmem(); | ||
283 | touch_softlockup_watchdog(); | 325 | touch_softlockup_watchdog(); |
284 | device_power_up(); | 326 | device_power_up(); |
285 | local_irq_enable(); | 327 | local_irq_enable(); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index d991d3b0e5a4..89443b85163b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -11,6 +11,7 @@ | |||
11 | 11 | ||
12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
14 | #include <linux/reboot.h> | ||
14 | #include <linux/string.h> | 15 | #include <linux/string.h> |
15 | #include <linux/device.h> | 16 | #include <linux/device.h> |
16 | #include <linux/miscdevice.h> | 17 | #include <linux/miscdevice.h> |
@@ -21,6 +22,7 @@ | |||
21 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
22 | #include <linux/console.h> | 23 | #include <linux/console.h> |
23 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
25 | #include <linux/freezer.h> | ||
24 | 26 | ||
25 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
26 | 28 | ||
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
54 | filp->private_data = data; | 56 | filp->private_data = data; |
55 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | 57 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); |
56 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { | 58 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { |
57 | data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; | 59 | data->swap = swsusp_resume_device ? |
60 | swap_type_of(swsusp_resume_device, 0) : -1; | ||
58 | data->mode = O_RDONLY; | 61 | data->mode = O_RDONLY; |
59 | } else { | 62 | } else { |
60 | data->swap = -1; | 63 | data->swap = -1; |
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
76 | free_all_swap_pages(data->swap, data->bitmap); | 79 | free_all_swap_pages(data->swap, data->bitmap); |
77 | free_bitmap(data->bitmap); | 80 | free_bitmap(data->bitmap); |
78 | if (data->frozen) { | 81 | if (data->frozen) { |
79 | down(&pm_sem); | 82 | mutex_lock(&pm_mutex); |
80 | thaw_processes(); | 83 | thaw_processes(); |
81 | enable_nonboot_cpus(); | 84 | enable_nonboot_cpus(); |
82 | up(&pm_sem); | 85 | mutex_unlock(&pm_mutex); |
83 | } | 86 | } |
84 | atomic_inc(&device_available); | 87 | atomic_inc(&device_available); |
85 | return 0; | 88 | return 0; |
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
124 | { | 127 | { |
125 | int error = 0; | 128 | int error = 0; |
126 | struct snapshot_data *data; | 129 | struct snapshot_data *data; |
127 | loff_t offset, avail; | 130 | loff_t avail; |
131 | sector_t offset; | ||
128 | 132 | ||
129 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) | 133 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) |
130 | return -ENOTTY; | 134 | return -ENOTTY; |
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
140 | case SNAPSHOT_FREEZE: | 144 | case SNAPSHOT_FREEZE: |
141 | if (data->frozen) | 145 | if (data->frozen) |
142 | break; | 146 | break; |
143 | down(&pm_sem); | 147 | mutex_lock(&pm_mutex); |
144 | error = disable_nonboot_cpus(); | 148 | error = disable_nonboot_cpus(); |
145 | if (!error) { | 149 | if (!error) { |
146 | error = freeze_processes(); | 150 | error = freeze_processes(); |
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
150 | error = -EBUSY; | 154 | error = -EBUSY; |
151 | } | 155 | } |
152 | } | 156 | } |
153 | up(&pm_sem); | 157 | mutex_unlock(&pm_mutex); |
154 | if (!error) | 158 | if (!error) |
155 | data->frozen = 1; | 159 | data->frozen = 1; |
156 | break; | 160 | break; |
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
158 | case SNAPSHOT_UNFREEZE: | 162 | case SNAPSHOT_UNFREEZE: |
159 | if (!data->frozen) | 163 | if (!data->frozen) |
160 | break; | 164 | break; |
161 | down(&pm_sem); | 165 | mutex_lock(&pm_mutex); |
162 | thaw_processes(); | 166 | thaw_processes(); |
163 | enable_nonboot_cpus(); | 167 | enable_nonboot_cpus(); |
164 | up(&pm_sem); | 168 | mutex_unlock(&pm_mutex); |
165 | data->frozen = 0; | 169 | data->frozen = 0; |
166 | break; | 170 | break; |
167 | 171 | ||
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
170 | error = -EPERM; | 174 | error = -EPERM; |
171 | break; | 175 | break; |
172 | } | 176 | } |
173 | down(&pm_sem); | 177 | mutex_lock(&pm_mutex); |
174 | /* Free memory before shutting down devices. */ | 178 | /* Free memory before shutting down devices. */ |
175 | error = swsusp_shrink_memory(); | 179 | error = swsusp_shrink_memory(); |
176 | if (!error) { | 180 | if (!error) { |
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
183 | } | 187 | } |
184 | resume_console(); | 188 | resume_console(); |
185 | } | 189 | } |
186 | up(&pm_sem); | 190 | mutex_unlock(&pm_mutex); |
187 | if (!error) | 191 | if (!error) |
188 | error = put_user(in_suspend, (unsigned int __user *)arg); | 192 | error = put_user(in_suspend, (unsigned int __user *)arg); |
189 | if (!error) | 193 | if (!error) |
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
191 | break; | 195 | break; |
192 | 196 | ||
193 | case SNAPSHOT_ATOMIC_RESTORE: | 197 | case SNAPSHOT_ATOMIC_RESTORE: |
198 | snapshot_write_finalize(&data->handle); | ||
194 | if (data->mode != O_WRONLY || !data->frozen || | 199 | if (data->mode != O_WRONLY || !data->frozen || |
195 | !snapshot_image_loaded(&data->handle)) { | 200 | !snapshot_image_loaded(&data->handle)) { |
196 | error = -EPERM; | 201 | error = -EPERM; |
197 | break; | 202 | break; |
198 | } | 203 | } |
199 | snapshot_free_unused_memory(&data->handle); | 204 | mutex_lock(&pm_mutex); |
200 | down(&pm_sem); | ||
201 | pm_prepare_console(); | 205 | pm_prepare_console(); |
202 | suspend_console(); | 206 | suspend_console(); |
203 | error = device_suspend(PMSG_PRETHAW); | 207 | error = device_suspend(PMSG_PRETHAW); |
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
207 | } | 211 | } |
208 | resume_console(); | 212 | resume_console(); |
209 | pm_restore_console(); | 213 | pm_restore_console(); |
210 | up(&pm_sem); | 214 | mutex_unlock(&pm_mutex); |
211 | break; | 215 | break; |
212 | 216 | ||
213 | case SNAPSHOT_FREE: | 217 | case SNAPSHOT_FREE: |
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
238 | break; | 242 | break; |
239 | } | 243 | } |
240 | } | 244 | } |
241 | offset = alloc_swap_page(data->swap, data->bitmap); | 245 | offset = alloc_swapdev_block(data->swap, data->bitmap); |
242 | if (offset) { | 246 | if (offset) { |
243 | offset <<= PAGE_SHIFT; | 247 | offset <<= PAGE_SHIFT; |
244 | error = put_user(offset, (loff_t __user *)arg); | 248 | error = put_user(offset, (sector_t __user *)arg); |
245 | } else { | 249 | } else { |
246 | error = -ENOSPC; | 250 | error = -ENOSPC; |
247 | } | 251 | } |
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
264 | * so we need to recode them | 268 | * so we need to recode them |
265 | */ | 269 | */ |
266 | if (old_decode_dev(arg)) { | 270 | if (old_decode_dev(arg)) { |
267 | data->swap = swap_type_of(old_decode_dev(arg)); | 271 | data->swap = swap_type_of(old_decode_dev(arg), 0); |
268 | if (data->swap < 0) | 272 | if (data->swap < 0) |
269 | error = -ENODEV; | 273 | error = -ENODEV; |
270 | } else { | 274 | } else { |
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
282 | break; | 286 | break; |
283 | } | 287 | } |
284 | 288 | ||
285 | if (down_trylock(&pm_sem)) { | 289 | if (!mutex_trylock(&pm_mutex)) { |
286 | error = -EBUSY; | 290 | error = -EBUSY; |
287 | break; | 291 | break; |
288 | } | 292 | } |
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
309 | if (pm_ops->finish) | 313 | if (pm_ops->finish) |
310 | pm_ops->finish(PM_SUSPEND_MEM); | 314 | pm_ops->finish(PM_SUSPEND_MEM); |
311 | 315 | ||
312 | OutS3: | 316 | OutS3: |
313 | up(&pm_sem); | 317 | mutex_unlock(&pm_mutex); |
318 | break; | ||
319 | |||
320 | case SNAPSHOT_PMOPS: | ||
321 | switch (arg) { | ||
322 | |||
323 | case PMOPS_PREPARE: | ||
324 | if (pm_ops->prepare) { | ||
325 | error = pm_ops->prepare(PM_SUSPEND_DISK); | ||
326 | } | ||
327 | break; | ||
328 | |||
329 | case PMOPS_ENTER: | ||
330 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
331 | error = pm_ops->enter(PM_SUSPEND_DISK); | ||
332 | break; | ||
333 | |||
334 | case PMOPS_FINISH: | ||
335 | if (pm_ops && pm_ops->finish) { | ||
336 | pm_ops->finish(PM_SUSPEND_DISK); | ||
337 | } | ||
338 | break; | ||
339 | |||
340 | default: | ||
341 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); | ||
342 | error = -EINVAL; | ||
343 | |||
344 | } | ||
345 | break; | ||
346 | |||
347 | case SNAPSHOT_SET_SWAP_AREA: | ||
348 | if (data->bitmap) { | ||
349 | error = -EPERM; | ||
350 | } else { | ||
351 | struct resume_swap_area swap_area; | ||
352 | dev_t swdev; | ||
353 | |||
354 | error = copy_from_user(&swap_area, (void __user *)arg, | ||
355 | sizeof(struct resume_swap_area)); | ||
356 | if (error) { | ||
357 | error = -EFAULT; | ||
358 | break; | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * User space encodes device types as two-byte values, | ||
363 | * so we need to recode them | ||
364 | */ | ||
365 | swdev = old_decode_dev(swap_area.dev); | ||
366 | if (swdev) { | ||
367 | offset = swap_area.offset; | ||
368 | data->swap = swap_type_of(swdev, offset); | ||
369 | if (data->swap < 0) | ||
370 | error = -ENODEV; | ||
371 | } else { | ||
372 | data->swap = -1; | ||
373 | error = -EINVAL; | ||
374 | } | ||
375 | } | ||
314 | break; | 376 | break; |
315 | 377 | ||
316 | default: | 378 | default: |
@@ -321,7 +383,7 @@ OutS3: | |||
321 | return error; | 383 | return error; |
322 | } | 384 | } |
323 | 385 | ||
324 | static struct file_operations snapshot_fops = { | 386 | static const struct file_operations snapshot_fops = { |
325 | .open = snapshot_open, | 387 | .open = snapshot_open, |
326 | .release = snapshot_release, | 388 | .release = snapshot_release, |
327 | .read = snapshot_read, | 389 | .read = snapshot_read, |
diff --git a/kernel/printk.c b/kernel/printk.c index f7d427ef5038..185bb45eacf7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/bootmem.h> | 32 | #include <linux/bootmem.h> |
33 | #include <linux/syscalls.h> | 33 | #include <linux/syscalls.h> |
34 | #include <linux/jiffies.h> | ||
34 | 35 | ||
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | 37 | ||
@@ -52,8 +53,6 @@ int console_printk[4] = { | |||
52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 53 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
53 | }; | 54 | }; |
54 | 55 | ||
55 | EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ | ||
56 | |||
57 | /* | 56 | /* |
58 | * Low level drivers may need that to know if they can schedule in | 57 | * Low level drivers may need that to know if they can schedule in |
59 | * their unblank() callback or not. So let's export it. | 58 | * their unblank() callback or not. So let's export it. |
@@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end) | |||
334 | } | 333 | } |
335 | } | 334 | } |
336 | 335 | ||
336 | static int __read_mostly ignore_loglevel; | ||
337 | |||
338 | int __init ignore_loglevel_setup(char *str) | ||
339 | { | ||
340 | ignore_loglevel = 1; | ||
341 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
342 | |||
343 | return 1; | ||
344 | } | ||
345 | |||
346 | __setup("ignore_loglevel", ignore_loglevel_setup); | ||
347 | |||
337 | /* | 348 | /* |
338 | * Write out chars from start to end - 1 inclusive | 349 | * Write out chars from start to end - 1 inclusive |
339 | */ | 350 | */ |
340 | static void _call_console_drivers(unsigned long start, | 351 | static void _call_console_drivers(unsigned long start, |
341 | unsigned long end, int msg_log_level) | 352 | unsigned long end, int msg_log_level) |
342 | { | 353 | { |
343 | if (msg_log_level < console_loglevel && | 354 | if ((msg_log_level < console_loglevel || ignore_loglevel) && |
344 | console_drivers && start != end) { | 355 | console_drivers && start != end) { |
345 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | 356 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { |
346 | /* wrapped write */ | 357 | /* wrapped write */ |
@@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk); | |||
630 | 641 | ||
631 | asmlinkage long sys_syslog(int type, char __user *buf, int len) | 642 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
632 | { | 643 | { |
633 | return 0; | 644 | return -ENOSYS; |
634 | } | ||
635 | |||
636 | int do_syslog(int type, char __user *buf, int len) | ||
637 | { | ||
638 | return 0; | ||
639 | } | 645 | } |
640 | 646 | ||
641 | static void call_console_drivers(unsigned long start, unsigned long end) | 647 | static void call_console_drivers(unsigned long start, unsigned long end) |
@@ -776,7 +782,6 @@ int is_console_locked(void) | |||
776 | { | 782 | { |
777 | return console_locked; | 783 | return console_locked; |
778 | } | 784 | } |
779 | EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ | ||
780 | 785 | ||
781 | /** | 786 | /** |
782 | * release_console_sem - unlock the console system | 787 | * release_console_sem - unlock the console system |
@@ -1101,3 +1106,23 @@ int printk_ratelimit(void) | |||
1101 | printk_ratelimit_burst); | 1106 | printk_ratelimit_burst); |
1102 | } | 1107 | } |
1103 | EXPORT_SYMBOL(printk_ratelimit); | 1108 | EXPORT_SYMBOL(printk_ratelimit); |
1109 | |||
1110 | /** | ||
1111 | * printk_timed_ratelimit - caller-controlled printk ratelimiting | ||
1112 | * @caller_jiffies: pointer to caller's state | ||
1113 | * @interval_msecs: minimum interval between prints | ||
1114 | * | ||
1115 | * printk_timed_ratelimit() returns true if more than @interval_msecs | ||
1116 | * milliseconds have elapsed since the last time printk_timed_ratelimit() | ||
1117 | * returned true. | ||
1118 | */ | ||
1119 | bool printk_timed_ratelimit(unsigned long *caller_jiffies, | ||
1120 | unsigned int interval_msecs) | ||
1121 | { | ||
1122 | if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { | ||
1123 | *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); | ||
1124 | return true; | ||
1125 | } | ||
1126 | return false; | ||
1127 | } | ||
1128 | EXPORT_SYMBOL(printk_timed_ratelimit); | ||
diff --git a/kernel/profile.c b/kernel/profile.c index f940b462eec9..fb5e03d57e9d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; | |||
40 | 40 | ||
41 | static atomic_t *prof_buffer; | 41 | static atomic_t *prof_buffer; |
42 | static unsigned long prof_len, prof_shift; | 42 | static unsigned long prof_len, prof_shift; |
43 | static int prof_on __read_mostly; | 43 | int prof_on __read_mostly; |
44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | 44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; |
45 | #ifdef CONFIG_SMP | 45 | #ifdef CONFIG_SMP |
46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
51 | static int __init profile_setup(char * str) | 51 | static int __init profile_setup(char * str) |
52 | { | 52 | { |
53 | static char __initdata schedstr[] = "schedule"; | 53 | static char __initdata schedstr[] = "schedule"; |
54 | static char __initdata sleepstr[] = "sleep"; | ||
54 | int par; | 55 | int par; |
55 | 56 | ||
56 | if (!strncmp(str, schedstr, strlen(schedstr))) { | 57 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
58 | prof_on = SLEEP_PROFILING; | ||
59 | if (str[strlen(sleepstr)] == ',') | ||
60 | str += strlen(sleepstr) + 1; | ||
61 | if (get_option(&str, &par)) | ||
62 | prof_shift = par; | ||
63 | printk(KERN_INFO | ||
64 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
65 | prof_shift); | ||
66 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { | ||
57 | prof_on = SCHED_PROFILING; | 67 | prof_on = SCHED_PROFILING; |
58 | if (str[strlen(schedstr)] == ',') | 68 | if (str[strlen(schedstr)] == ',') |
59 | str += strlen(schedstr) + 1; | 69 | str += strlen(schedstr) + 1; |
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); | |||
204 | * positions to which hits are accounted during short intervals (e.g. | 214 | * positions to which hits are accounted during short intervals (e.g. |
205 | * several seconds) is usually very small. Exclusion from buffer | 215 | * several seconds) is usually very small. Exclusion from buffer |
206 | * flipping is provided by interrupt disablement (note that for | 216 | * flipping is provided by interrupt disablement (note that for |
207 | * SCHED_PROFILING profile_hit() may be called from process context). | 217 | * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from |
218 | * process context). | ||
208 | * The hash function is meant to be lightweight as opposed to strong, | 219 | * The hash function is meant to be lightweight as opposed to strong, |
209 | * and was vaguely inspired by ppc64 firmware-supported inverted | 220 | * and was vaguely inspired by ppc64 firmware-supported inverted |
210 | * pagetable hash functions, but uses a full hashtable full of finite | 221 | * pagetable hash functions, but uses a full hashtable full of finite |
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void) | |||
257 | mutex_unlock(&profile_flip_mutex); | 268 | mutex_unlock(&profile_flip_mutex); |
258 | } | 269 | } |
259 | 270 | ||
260 | void profile_hit(int type, void *__pc) | 271 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
261 | { | 272 | { |
262 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 273 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
263 | int i, j, cpu; | 274 | int i, j, cpu; |
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc) | |||
274 | put_cpu(); | 285 | put_cpu(); |
275 | return; | 286 | return; |
276 | } | 287 | } |
288 | /* | ||
289 | * We buffer the global profiler buffer into a per-CPU | ||
290 | * queue and thus reduce the number of global (and possibly | ||
291 | * NUMA-alien) accesses. The write-queue is self-coalescing: | ||
292 | */ | ||
277 | local_irq_save(flags); | 293 | local_irq_save(flags); |
278 | do { | 294 | do { |
279 | for (j = 0; j < PROFILE_GRPSZ; ++j) { | 295 | for (j = 0; j < PROFILE_GRPSZ; ++j) { |
280 | if (hits[i + j].pc == pc) { | 296 | if (hits[i + j].pc == pc) { |
281 | hits[i + j].hits++; | 297 | hits[i + j].hits += nr_hits; |
282 | goto out; | 298 | goto out; |
283 | } else if (!hits[i + j].hits) { | 299 | } else if (!hits[i + j].hits) { |
284 | hits[i + j].pc = pc; | 300 | hits[i + j].pc = pc; |
285 | hits[i + j].hits = 1; | 301 | hits[i + j].hits = nr_hits; |
286 | goto out; | 302 | goto out; |
287 | } | 303 | } |
288 | } | 304 | } |
289 | i = (i + secondary) & (NR_PROFILE_HIT - 1); | 305 | i = (i + secondary) & (NR_PROFILE_HIT - 1); |
290 | } while (i != primary); | 306 | } while (i != primary); |
291 | atomic_inc(&prof_buffer[pc]); | 307 | |
308 | /* | ||
309 | * Add the current hit(s) and flush the write-queue out | ||
310 | * to the global buffer: | ||
311 | */ | ||
312 | atomic_add(nr_hits, &prof_buffer[pc]); | ||
292 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | 313 | for (i = 0; i < NR_PROFILE_HIT; ++i) { |
293 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | 314 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); |
294 | hits[i].pc = hits[i].hits = 0; | 315 | hits[i].pc = hits[i].hits = 0; |
@@ -298,7 +319,6 @@ out: | |||
298 | put_cpu(); | 319 | put_cpu(); |
299 | } | 320 | } |
300 | 321 | ||
301 | #ifdef CONFIG_HOTPLUG_CPU | ||
302 | static int __devinit profile_cpu_callback(struct notifier_block *info, | 322 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
303 | unsigned long action, void *__cpu) | 323 | unsigned long action, void *__cpu) |
304 | { | 324 | { |
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
351 | } | 371 | } |
352 | return NOTIFY_OK; | 372 | return NOTIFY_OK; |
353 | } | 373 | } |
354 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
355 | #else /* !CONFIG_SMP */ | 374 | #else /* !CONFIG_SMP */ |
356 | #define profile_flip_buffers() do { } while (0) | 375 | #define profile_flip_buffers() do { } while (0) |
357 | #define profile_discard_flip_buffers() do { } while (0) | 376 | #define profile_discard_flip_buffers() do { } while (0) |
377 | #define profile_cpu_callback NULL | ||
358 | 378 | ||
359 | void profile_hit(int type, void *__pc) | 379 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
360 | { | 380 | { |
361 | unsigned long pc; | 381 | unsigned long pc; |
362 | 382 | ||
363 | if (prof_on != type || !prof_buffer) | 383 | if (prof_on != type || !prof_buffer) |
364 | return; | 384 | return; |
365 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 385 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
366 | atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); | 386 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
367 | } | 387 | } |
368 | #endif /* !CONFIG_SMP */ | 388 | #endif /* !CONFIG_SMP */ |
369 | 389 | ||
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
442 | read = 0; | 462 | read = 0; |
443 | 463 | ||
444 | while (p < sizeof(unsigned int) && count > 0) { | 464 | while (p < sizeof(unsigned int) && count > 0) { |
445 | put_user(*((char *)(&sample_step)+p),buf); | 465 | if (put_user(*((char *)(&sample_step)+p),buf)) |
466 | return -EFAULT; | ||
446 | buf++; p++; count--; read++; | 467 | buf++; p++; count--; read++; |
447 | } | 468 | } |
448 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 469 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
480 | return count; | 501 | return count; |
481 | } | 502 | } |
482 | 503 | ||
483 | static struct file_operations proc_profile_operations = { | 504 | static const struct file_operations proc_profile_operations = { |
484 | .read = read_profile, | 505 | .read = read_profile, |
485 | .write = write_profile, | 506 | .write = write_profile, |
486 | }; | 507 | }; |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 26bb5ffe1ef1..3554b76da84c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
235 | 235 | ||
236 | list = rdp->donelist; | 236 | list = rdp->donelist; |
237 | while (list) { | 237 | while (list) { |
238 | next = rdp->donelist = list->next; | 238 | next = list->next; |
239 | prefetch(next); | ||
239 | list->func(list); | 240 | list->func(list); |
240 | list = next; | 241 | list = next; |
241 | if (++count >= rdp->blimit) | 242 | if (++count >= rdp->blimit) |
242 | break; | 243 | break; |
243 | } | 244 | } |
245 | rdp->donelist = list; | ||
244 | 246 | ||
245 | local_irq_disable(); | 247 | local_irq_disable(); |
246 | rdp->qlen -= count; | 248 | rdp->qlen -= count; |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e2bda18f6f42..c52f981ea008 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void) | |||
401 | cleanup_srcu_struct(&srcu_ctl); | 401 | cleanup_srcu_struct(&srcu_ctl); |
402 | } | 402 | } |
403 | 403 | ||
404 | static int srcu_torture_read_lock(void) | 404 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
405 | { | 405 | { |
406 | return srcu_read_lock(&srcu_ctl); | 406 | return srcu_read_lock(&srcu_ctl); |
407 | } | 407 | } |
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
419 | schedule_timeout_interruptible(longdelay); | 419 | schedule_timeout_interruptible(longdelay); |
420 | } | 420 | } |
421 | 421 | ||
422 | static void srcu_torture_read_unlock(int idx) | 422 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
423 | { | 423 | { |
424 | srcu_read_unlock(&srcu_ctl, idx); | 424 | srcu_read_unlock(&srcu_ctl, idx); |
425 | } | 425 | } |
diff --git a/kernel/relay.c b/kernel/relay.c index f04bbdb56ac2..818e514729cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = { | |||
308 | * reason waking is deferred is that calling directly from write | 308 | * reason waking is deferred is that calling directly from write |
309 | * causes problems if you're writing from say the scheduler. | 309 | * causes problems if you're writing from say the scheduler. |
310 | */ | 310 | */ |
311 | static void wakeup_readers(void *private) | 311 | static void wakeup_readers(struct work_struct *work) |
312 | { | 312 | { |
313 | struct rchan_buf *buf = private; | 313 | struct rchan_buf *buf = |
314 | container_of(work, struct rchan_buf, wake_readers.work); | ||
314 | wake_up_interruptible(&buf->read_wait); | 315 | wake_up_interruptible(&buf->read_wait); |
315 | } | 316 | } |
316 | 317 | ||
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
328 | if (init) { | 329 | if (init) { |
329 | init_waitqueue_head(&buf->read_wait); | 330 | init_waitqueue_head(&buf->read_wait); |
330 | kref_init(&buf->kref); | 331 | kref_init(&buf->kref); |
331 | INIT_WORK(&buf->wake_readers, NULL, NULL); | 332 | INIT_DELAYED_WORK(&buf->wake_readers, NULL); |
332 | } else { | 333 | } else { |
333 | cancel_delayed_work(&buf->wake_readers); | 334 | cancel_delayed_work(&buf->wake_readers); |
334 | flush_scheduled_work(); | 335 | flush_scheduled_work(); |
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | |||
549 | buf->padding[old_subbuf]; | 550 | buf->padding[old_subbuf]; |
550 | smp_mb(); | 551 | smp_mb(); |
551 | if (waitqueue_active(&buf->read_wait)) { | 552 | if (waitqueue_active(&buf->read_wait)) { |
552 | PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); | 553 | PREPARE_DELAYED_WORK(&buf->wake_readers, |
554 | wakeup_readers); | ||
553 | schedule_delayed_work(&buf->wake_readers, 1); | 555 | schedule_delayed_work(&buf->wake_readers, 1); |
554 | } | 556 | } |
555 | } | 557 | } |
@@ -957,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
957 | if (!desc->count) | 959 | if (!desc->count) |
958 | return 0; | 960 | return 0; |
959 | 961 | ||
960 | mutex_lock(&filp->f_dentry->d_inode->i_mutex); | 962 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); |
961 | do { | 963 | do { |
962 | if (!relay_file_read_avail(buf, *ppos)) | 964 | if (!relay_file_read_avail(buf, *ppos)) |
963 | break; | 965 | break; |
@@ -977,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
977 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 979 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
978 | } | 980 | } |
979 | } while (desc->count && ret); | 981 | } while (desc->count && ret); |
980 | mutex_unlock(&filp->f_dentry->d_inode->i_mutex); | 982 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); |
981 | 983 | ||
982 | return desc->written; | 984 | return desc->written; |
983 | } | 985 | } |
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp, | |||
1011 | actor, &desc); | 1013 | actor, &desc); |
1012 | } | 1014 | } |
1013 | 1015 | ||
1014 | struct file_operations relay_file_operations = { | 1016 | const struct file_operations relay_file_operations = { |
1015 | .open = relay_file_open, | 1017 | .open = relay_file_open, |
1016 | .poll = relay_file_poll, | 1018 | .poll = relay_file_poll, |
1017 | .mmap = relay_file_mmap, | 1019 | .mmap = relay_file_mmap, |
diff --git a/kernel/resource.c b/kernel/resource.c index 6de60c12143e..7b9a497419d9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v) | |||
88 | return 0; | 88 | return 0; |
89 | } | 89 | } |
90 | 90 | ||
91 | static struct seq_operations resource_op = { | 91 | static const struct seq_operations resource_op = { |
92 | .start = r_start, | 92 | .start = r_start, |
93 | .next = r_next, | 93 | .next = r_next, |
94 | .stop = r_stop, | 94 | .stop = r_stop, |
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file) | |||
115 | return res; | 115 | return res; |
116 | } | 116 | } |
117 | 117 | ||
118 | static struct file_operations proc_ioports_operations = { | 118 | static const struct file_operations proc_ioports_operations = { |
119 | .open = ioports_open, | 119 | .open = ioports_open, |
120 | .read = seq_read, | 120 | .read = seq_read, |
121 | .llseek = seq_lseek, | 121 | .llseek = seq_lseek, |
122 | .release = seq_release, | 122 | .release = seq_release, |
123 | }; | 123 | }; |
124 | 124 | ||
125 | static struct file_operations proc_iomem_operations = { | 125 | static const struct file_operations proc_iomem_operations = { |
126 | .open = iomem_open, | 126 | .open = iomem_open, |
127 | .read = seq_read, | 127 | .read = seq_read, |
128 | .llseek = seq_lseek, | 128 | .llseek = seq_lseek, |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 6dcea9dd8c94..015fc633c96c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/sysdev.h> | 14 | #include <linux/sysdev.h> |
15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
16 | #include <linux/freezer.h> | ||
16 | 17 | ||
17 | #include "rtmutex.h" | 18 | #include "rtmutex.h" |
18 | 19 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 094b5687eef6..8a0afb97af71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -34,7 +34,7 @@ | |||
34 | #include <linux/security.h> | 34 | #include <linux/security.h> |
35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
36 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
37 | #include <linux/suspend.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/blkdev.h> | 39 | #include <linux/blkdev.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
@@ -160,15 +160,6 @@ | |||
160 | #define TASK_PREEMPTS_CURR(p, rq) \ | 160 | #define TASK_PREEMPTS_CURR(p, rq) \ |
161 | ((p)->prio < (rq)->curr->prio) | 161 | ((p)->prio < (rq)->curr->prio) |
162 | 162 | ||
163 | /* | ||
164 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
165 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
166 | * | ||
167 | * The higher a thread's priority, the bigger timeslices | ||
168 | * it gets during one round of execution. But even the lowest | ||
169 | * priority thread gets MIN_TIMESLICE worth of execution time. | ||
170 | */ | ||
171 | |||
172 | #define SCALE_PRIO(x, prio) \ | 163 | #define SCALE_PRIO(x, prio) \ |
173 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | 164 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
174 | 165 | ||
@@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio) | |||
180 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | 171 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
181 | } | 172 | } |
182 | 173 | ||
174 | /* | ||
175 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
176 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
177 | * | ||
178 | * The higher a thread's priority, the bigger timeslices | ||
179 | * it gets during one round of execution. But even the lowest | ||
180 | * priority thread gets MIN_TIMESLICE worth of execution time. | ||
181 | */ | ||
182 | |||
183 | static inline unsigned int task_timeslice(struct task_struct *p) | 183 | static inline unsigned int task_timeslice(struct task_struct *p) |
184 | { | 184 | { |
185 | return static_prio_timeslice(p->static_prio); | 185 | return static_prio_timeslice(p->static_prio); |
@@ -225,8 +225,10 @@ struct rq { | |||
225 | unsigned long nr_uninterruptible; | 225 | unsigned long nr_uninterruptible; |
226 | 226 | ||
227 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
228 | unsigned long long timestamp_last_tick; | 228 | /* Cached timestamp set by update_cpu_clock() */ |
229 | unsigned long long most_recent_timestamp; | ||
229 | struct task_struct *curr, *idle; | 230 | struct task_struct *curr, *idle; |
231 | unsigned long next_balance; | ||
230 | struct mm_struct *prev_mm; | 232 | struct mm_struct *prev_mm; |
231 | struct prio_array *active, *expired, arrays[2]; | 233 | struct prio_array *active, *expired, arrays[2]; |
232 | int best_expired_prio; | 234 | int best_expired_prio; |
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
426 | * bump this up when changing the output format or the meaning of an existing | 428 | * bump this up when changing the output format or the meaning of an existing |
427 | * format, so that tools can adapt (or abort) | 429 | * format, so that tools can adapt (or abort) |
428 | */ | 430 | */ |
429 | #define SCHEDSTAT_VERSION 12 | 431 | #define SCHEDSTAT_VERSION 14 |
430 | 432 | ||
431 | static int show_schedstat(struct seq_file *seq, void *v) | 433 | static int show_schedstat(struct seq_file *seq, void *v) |
432 | { | 434 | { |
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
464 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 466 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
465 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 467 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
466 | itype++) { | 468 | itype++) { |
467 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | 469 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
470 | "%lu", | ||
468 | sd->lb_cnt[itype], | 471 | sd->lb_cnt[itype], |
469 | sd->lb_balanced[itype], | 472 | sd->lb_balanced[itype], |
470 | sd->lb_failed[itype], | 473 | sd->lb_failed[itype], |
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
474 | sd->lb_nobusyq[itype], | 477 | sd->lb_nobusyq[itype], |
475 | sd->lb_nobusyg[itype]); | 478 | sd->lb_nobusyg[itype]); |
476 | } | 479 | } |
477 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", | 480 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
481 | " %lu %lu %lu\n", | ||
478 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 482 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
479 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 483 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
480 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 484 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
481 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 485 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
486 | sd->ttwu_move_balance); | ||
482 | } | 487 | } |
483 | preempt_enable(); | 488 | preempt_enable(); |
484 | #endif | 489 | #endif |
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file) | |||
505 | return res; | 510 | return res; |
506 | } | 511 | } |
507 | 512 | ||
508 | struct file_operations proc_schedstat_operations = { | 513 | const struct file_operations proc_schedstat_operations = { |
509 | .open = schedstat_open, | 514 | .open = schedstat_open, |
510 | .read = seq_read, | 515 | .read = seq_read, |
511 | .llseek = seq_lseek, | 516 | .llseek = seq_lseek, |
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | |||
547 | #endif | 552 | #endif |
548 | 553 | ||
549 | /* | 554 | /* |
550 | * rq_lock - lock a given runqueue and disable interrupts. | 555 | * this_rq_lock - lock this runqueue and disable interrupts. |
551 | */ | 556 | */ |
552 | static inline struct rq *this_rq_lock(void) | 557 | static inline struct rq *this_rq_lock(void) |
553 | __acquires(rq->lock) | 558 | __acquires(rq->lock) |
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
938 | { | 943 | { |
939 | unsigned long long now; | 944 | unsigned long long now; |
940 | 945 | ||
946 | if (rt_task(p)) | ||
947 | goto out; | ||
948 | |||
941 | now = sched_clock(); | 949 | now = sched_clock(); |
942 | #ifdef CONFIG_SMP | 950 | #ifdef CONFIG_SMP |
943 | if (!local) { | 951 | if (!local) { |
944 | /* Compensate for drifting sched_clock */ | 952 | /* Compensate for drifting sched_clock */ |
945 | struct rq *this_rq = this_rq(); | 953 | struct rq *this_rq = this_rq(); |
946 | now = (now - this_rq->timestamp_last_tick) | 954 | now = (now - this_rq->most_recent_timestamp) |
947 | + rq->timestamp_last_tick; | 955 | + rq->most_recent_timestamp; |
948 | } | 956 | } |
949 | #endif | 957 | #endif |
950 | 958 | ||
951 | if (!rt_task(p)) | 959 | /* |
952 | p->prio = recalc_task_prio(p, now); | 960 | * Sleep time is in units of nanosecs, so shift by 20 to get a |
961 | * milliseconds-range estimation of the amount of time that the task | ||
962 | * spent sleeping: | ||
963 | */ | ||
964 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
965 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
966 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
967 | (now - p->timestamp) >> 20); | ||
968 | } | ||
969 | |||
970 | p->prio = recalc_task_prio(p, now); | ||
953 | 971 | ||
954 | /* | 972 | /* |
955 | * This checks to make sure it's not an uninterruptible task | 973 | * This checks to make sure it's not an uninterruptible task |
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
974 | } | 992 | } |
975 | } | 993 | } |
976 | p->timestamp = now; | 994 | p->timestamp = now; |
977 | 995 | out: | |
978 | __activate_task(p, rq); | 996 | __activate_task(p, rq); |
979 | } | 997 | } |
980 | 998 | ||
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1439 | 1457 | ||
1440 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1458 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1441 | unsigned long tl = this_load; | 1459 | unsigned long tl = this_load; |
1442 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | 1460 | unsigned long tl_per_task; |
1461 | |||
1462 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1443 | 1463 | ||
1444 | /* | 1464 | /* |
1445 | * If sync wakeup then subtract the (maximum possible) | 1465 | * If sync wakeup then subtract the (maximum possible) |
@@ -1677,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1677 | * Not the local CPU - must adjust timestamp. This should | 1697 | * Not the local CPU - must adjust timestamp. This should |
1678 | * get optimised away in the !CONFIG_SMP case. | 1698 | * get optimised away in the !CONFIG_SMP case. |
1679 | */ | 1699 | */ |
1680 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | 1700 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
1681 | + rq->timestamp_last_tick; | 1701 | + rq->most_recent_timestamp; |
1682 | __activate_task(p, rq); | 1702 | __activate_task(p, rq); |
1683 | if (TASK_PREEMPTS_CURR(p, rq)) | 1703 | if (TASK_PREEMPTS_CURR(p, rq)) |
1684 | resched_task(rq->curr); | 1704 | resched_task(rq->curr); |
@@ -1941,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
1941 | __acquires(rq1->lock) | 1961 | __acquires(rq1->lock) |
1942 | __acquires(rq2->lock) | 1962 | __acquires(rq2->lock) |
1943 | { | 1963 | { |
1964 | BUG_ON(!irqs_disabled()); | ||
1944 | if (rq1 == rq2) { | 1965 | if (rq1 == rq2) { |
1945 | spin_lock(&rq1->lock); | 1966 | spin_lock(&rq1->lock); |
1946 | __acquire(rq2->lock); /* Fake it out ;) */ | 1967 | __acquire(rq2->lock); /* Fake it out ;) */ |
@@ -1980,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1980 | __acquires(busiest->lock) | 2001 | __acquires(busiest->lock) |
1981 | __acquires(this_rq->lock) | 2002 | __acquires(this_rq->lock) |
1982 | { | 2003 | { |
2004 | if (unlikely(!irqs_disabled())) { | ||
2005 | /* printk() doesn't work good under rq->lock */ | ||
2006 | spin_unlock(&this_rq->lock); | ||
2007 | BUG_ON(1); | ||
2008 | } | ||
1983 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2009 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1984 | if (busiest < this_rq) { | 2010 | if (busiest < this_rq) { |
1985 | spin_unlock(&this_rq->lock); | 2011 | spin_unlock(&this_rq->lock); |
@@ -2050,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, | |||
2050 | set_task_cpu(p, this_cpu); | 2076 | set_task_cpu(p, this_cpu); |
2051 | inc_nr_running(p, this_rq); | 2077 | inc_nr_running(p, this_rq); |
2052 | enqueue_task(p, this_array); | 2078 | enqueue_task(p, this_array); |
2053 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2079 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
2054 | + this_rq->timestamp_last_tick; | 2080 | + this_rq->most_recent_timestamp; |
2055 | /* | 2081 | /* |
2056 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2082 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2057 | * to be always true for them. | 2083 | * to be always true for them. |
@@ -2087,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2087 | * 2) too many balance attempts have failed. | 2113 | * 2) too many balance attempts have failed. |
2088 | */ | 2114 | */ |
2089 | 2115 | ||
2090 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 2116 | if (sd->nr_balance_failed > sd->cache_nice_tries) { |
2117 | #ifdef CONFIG_SCHEDSTATS | ||
2118 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2119 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2120 | #endif | ||
2091 | return 1; | 2121 | return 1; |
2122 | } | ||
2092 | 2123 | ||
2093 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 2124 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
2094 | return 0; | 2125 | return 0; |
2095 | return 1; | 2126 | return 1; |
2096 | } | 2127 | } |
@@ -2188,11 +2219,6 @@ skip_queue: | |||
2188 | goto skip_bitmap; | 2219 | goto skip_bitmap; |
2189 | } | 2220 | } |
2190 | 2221 | ||
2191 | #ifdef CONFIG_SCHEDSTATS | ||
2192 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | ||
2193 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2194 | #endif | ||
2195 | |||
2196 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2222 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
2197 | pulled++; | 2223 | pulled++; |
2198 | rem_load_move -= tmp->load_weight; | 2224 | rem_load_move -= tmp->load_weight; |
@@ -2230,7 +2256,7 @@ out: | |||
2230 | static struct sched_group * | 2256 | static struct sched_group * |
2231 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2257 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2232 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2258 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
2233 | cpumask_t *cpus) | 2259 | cpumask_t *cpus, int *balance) |
2234 | { | 2260 | { |
2235 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2261 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2236 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2262 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
@@ -2259,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2259 | unsigned long load, group_capacity; | 2285 | unsigned long load, group_capacity; |
2260 | int local_group; | 2286 | int local_group; |
2261 | int i; | 2287 | int i; |
2288 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
2262 | unsigned long sum_nr_running, sum_weighted_load; | 2289 | unsigned long sum_nr_running, sum_weighted_load; |
2263 | 2290 | ||
2264 | local_group = cpu_isset(this_cpu, group->cpumask); | 2291 | local_group = cpu_isset(this_cpu, group->cpumask); |
2265 | 2292 | ||
2293 | if (local_group) | ||
2294 | balance_cpu = first_cpu(group->cpumask); | ||
2295 | |||
2266 | /* Tally up the load of all CPUs in the group */ | 2296 | /* Tally up the load of all CPUs in the group */ |
2267 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2297 | sum_weighted_load = sum_nr_running = avg_load = 0; |
2268 | 2298 | ||
@@ -2278,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2278 | *sd_idle = 0; | 2308 | *sd_idle = 0; |
2279 | 2309 | ||
2280 | /* Bias balancing toward cpus of our domain */ | 2310 | /* Bias balancing toward cpus of our domain */ |
2281 | if (local_group) | 2311 | if (local_group) { |
2312 | if (idle_cpu(i) && !first_idle_cpu) { | ||
2313 | first_idle_cpu = 1; | ||
2314 | balance_cpu = i; | ||
2315 | } | ||
2316 | |||
2282 | load = target_load(i, load_idx); | 2317 | load = target_load(i, load_idx); |
2283 | else | 2318 | } else |
2284 | load = source_load(i, load_idx); | 2319 | load = source_load(i, load_idx); |
2285 | 2320 | ||
2286 | avg_load += load; | 2321 | avg_load += load; |
@@ -2288,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2288 | sum_weighted_load += rq->raw_weighted_load; | 2323 | sum_weighted_load += rq->raw_weighted_load; |
2289 | } | 2324 | } |
2290 | 2325 | ||
2326 | /* | ||
2327 | * First idle cpu or the first cpu(busiest) in this sched group | ||
2328 | * is eligible for doing load balancing at this and above | ||
2329 | * domains. | ||
2330 | */ | ||
2331 | if (local_group && balance_cpu != this_cpu && balance) { | ||
2332 | *balance = 0; | ||
2333 | goto ret; | ||
2334 | } | ||
2335 | |||
2291 | total_load += avg_load; | 2336 | total_load += avg_load; |
2292 | total_pwr += group->cpu_power; | 2337 | total_pwr += group->cpu_power; |
2293 | 2338 | ||
@@ -2447,18 +2492,21 @@ small_imbalance: | |||
2447 | pwr_now /= SCHED_LOAD_SCALE; | 2492 | pwr_now /= SCHED_LOAD_SCALE; |
2448 | 2493 | ||
2449 | /* Amount of load we'd subtract */ | 2494 | /* Amount of load we'd subtract */ |
2450 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; | 2495 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2496 | busiest->cpu_power; | ||
2451 | if (max_load > tmp) | 2497 | if (max_load > tmp) |
2452 | pwr_move += busiest->cpu_power * | 2498 | pwr_move += busiest->cpu_power * |
2453 | min(busiest_load_per_task, max_load - tmp); | 2499 | min(busiest_load_per_task, max_load - tmp); |
2454 | 2500 | ||
2455 | /* Amount of load we'd add */ | 2501 | /* Amount of load we'd add */ |
2456 | if (max_load*busiest->cpu_power < | 2502 | if (max_load * busiest->cpu_power < |
2457 | busiest_load_per_task*SCHED_LOAD_SCALE) | 2503 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2458 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2504 | tmp = max_load * busiest->cpu_power / this->cpu_power; |
2459 | else | 2505 | else |
2460 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; | 2506 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2461 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); | 2507 | this->cpu_power; |
2508 | pwr_move += this->cpu_power * | ||
2509 | min(this_load_per_task, this_load + tmp); | ||
2462 | pwr_move /= SCHED_LOAD_SCALE; | 2510 | pwr_move /= SCHED_LOAD_SCALE; |
2463 | 2511 | ||
2464 | /* Move if we gain throughput */ | 2512 | /* Move if we gain throughput */ |
@@ -2479,8 +2527,8 @@ out_balanced: | |||
2479 | *imbalance = min_load_per_task; | 2527 | *imbalance = min_load_per_task; |
2480 | return group_min; | 2528 | return group_min; |
2481 | } | 2529 | } |
2482 | ret: | ||
2483 | #endif | 2530 | #endif |
2531 | ret: | ||
2484 | *imbalance = 0; | 2532 | *imbalance = 0; |
2485 | return NULL; | 2533 | return NULL; |
2486 | } | 2534 | } |
@@ -2529,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) | |||
2529 | /* | 2577 | /* |
2530 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2578 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2531 | * tasks if there is an imbalance. | 2579 | * tasks if there is an imbalance. |
2532 | * | ||
2533 | * Called with this_rq unlocked. | ||
2534 | */ | 2580 | */ |
2535 | static int load_balance(int this_cpu, struct rq *this_rq, | 2581 | static int load_balance(int this_cpu, struct rq *this_rq, |
2536 | struct sched_domain *sd, enum idle_type idle) | 2582 | struct sched_domain *sd, enum idle_type idle, |
2583 | int *balance) | ||
2537 | { | 2584 | { |
2538 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2585 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2539 | struct sched_group *group; | 2586 | struct sched_group *group; |
2540 | unsigned long imbalance; | 2587 | unsigned long imbalance; |
2541 | struct rq *busiest; | 2588 | struct rq *busiest; |
2542 | cpumask_t cpus = CPU_MASK_ALL; | 2589 | cpumask_t cpus = CPU_MASK_ALL; |
2590 | unsigned long flags; | ||
2543 | 2591 | ||
2544 | /* | 2592 | /* |
2545 | * When power savings policy is enabled for the parent domain, idle | 2593 | * When power savings policy is enabled for the parent domain, idle |
@@ -2555,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2555 | 2603 | ||
2556 | redo: | 2604 | redo: |
2557 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2605 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
2558 | &cpus); | 2606 | &cpus, balance); |
2607 | |||
2608 | if (*balance == 0) | ||
2609 | goto out_balanced; | ||
2610 | |||
2559 | if (!group) { | 2611 | if (!group) { |
2560 | schedstat_inc(sd, lb_nobusyg[idle]); | 2612 | schedstat_inc(sd, lb_nobusyg[idle]); |
2561 | goto out_balanced; | 2613 | goto out_balanced; |
@@ -2579,11 +2631,13 @@ redo: | |||
2579 | * still unbalanced. nr_moved simply stays zero, so it is | 2631 | * still unbalanced. nr_moved simply stays zero, so it is |
2580 | * correctly treated as an imbalance. | 2632 | * correctly treated as an imbalance. |
2581 | */ | 2633 | */ |
2634 | local_irq_save(flags); | ||
2582 | double_rq_lock(this_rq, busiest); | 2635 | double_rq_lock(this_rq, busiest); |
2583 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2636 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2584 | minus_1_or_zero(busiest->nr_running), | 2637 | minus_1_or_zero(busiest->nr_running), |
2585 | imbalance, sd, idle, &all_pinned); | 2638 | imbalance, sd, idle, &all_pinned); |
2586 | double_rq_unlock(this_rq, busiest); | 2639 | double_rq_unlock(this_rq, busiest); |
2640 | local_irq_restore(flags); | ||
2587 | 2641 | ||
2588 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2642 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2589 | if (unlikely(all_pinned)) { | 2643 | if (unlikely(all_pinned)) { |
@@ -2600,13 +2654,13 @@ redo: | |||
2600 | 2654 | ||
2601 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2655 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2602 | 2656 | ||
2603 | spin_lock(&busiest->lock); | 2657 | spin_lock_irqsave(&busiest->lock, flags); |
2604 | 2658 | ||
2605 | /* don't kick the migration_thread, if the curr | 2659 | /* don't kick the migration_thread, if the curr |
2606 | * task on busiest cpu can't be moved to this_cpu | 2660 | * task on busiest cpu can't be moved to this_cpu |
2607 | */ | 2661 | */ |
2608 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2662 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
2609 | spin_unlock(&busiest->lock); | 2663 | spin_unlock_irqrestore(&busiest->lock, flags); |
2610 | all_pinned = 1; | 2664 | all_pinned = 1; |
2611 | goto out_one_pinned; | 2665 | goto out_one_pinned; |
2612 | } | 2666 | } |
@@ -2616,7 +2670,7 @@ redo: | |||
2616 | busiest->push_cpu = this_cpu; | 2670 | busiest->push_cpu = this_cpu; |
2617 | active_balance = 1; | 2671 | active_balance = 1; |
2618 | } | 2672 | } |
2619 | spin_unlock(&busiest->lock); | 2673 | spin_unlock_irqrestore(&busiest->lock, flags); |
2620 | if (active_balance) | 2674 | if (active_balance) |
2621 | wake_up_process(busiest->migration_thread); | 2675 | wake_up_process(busiest->migration_thread); |
2622 | 2676 | ||
@@ -2695,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2695 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2749 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2696 | redo: | 2750 | redo: |
2697 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2751 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, |
2698 | &sd_idle, &cpus); | 2752 | &sd_idle, &cpus, NULL); |
2699 | if (!group) { | 2753 | if (!group) { |
2700 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2754 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2701 | goto out_balanced; | 2755 | goto out_balanced; |
@@ -2755,14 +2809,28 @@ out_balanced: | |||
2755 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2809 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2756 | { | 2810 | { |
2757 | struct sched_domain *sd; | 2811 | struct sched_domain *sd; |
2812 | int pulled_task = 0; | ||
2813 | unsigned long next_balance = jiffies + 60 * HZ; | ||
2758 | 2814 | ||
2759 | for_each_domain(this_cpu, sd) { | 2815 | for_each_domain(this_cpu, sd) { |
2760 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2816 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2761 | /* If we've pulled tasks over stop searching: */ | 2817 | /* If we've pulled tasks over stop searching: */ |
2762 | if (load_balance_newidle(this_cpu, this_rq, sd)) | 2818 | pulled_task = load_balance_newidle(this_cpu, |
2819 | this_rq, sd); | ||
2820 | if (time_after(next_balance, | ||
2821 | sd->last_balance + sd->balance_interval)) | ||
2822 | next_balance = sd->last_balance | ||
2823 | + sd->balance_interval; | ||
2824 | if (pulled_task) | ||
2763 | break; | 2825 | break; |
2764 | } | 2826 | } |
2765 | } | 2827 | } |
2828 | if (!pulled_task) | ||
2829 | /* | ||
2830 | * We are going idle. next_balance may be set based on | ||
2831 | * a busy processor. So reset next_balance. | ||
2832 | */ | ||
2833 | this_rq->next_balance = next_balance; | ||
2766 | } | 2834 | } |
2767 | 2835 | ||
2768 | /* | 2836 | /* |
@@ -2815,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2815 | spin_unlock(&target_rq->lock); | 2883 | spin_unlock(&target_rq->lock); |
2816 | } | 2884 | } |
2817 | 2885 | ||
2818 | /* | 2886 | static void update_load(struct rq *this_rq) |
2819 | * rebalance_tick will get called every timer tick, on every CPU. | ||
2820 | * | ||
2821 | * It checks each scheduling domain to see if it is due to be balanced, | ||
2822 | * and initiates a balancing operation if so. | ||
2823 | * | ||
2824 | * Balancing parameters are set up in arch_init_sched_domains. | ||
2825 | */ | ||
2826 | |||
2827 | /* Don't have all balancing operations going off at once: */ | ||
2828 | static inline unsigned long cpu_offset(int cpu) | ||
2829 | { | ||
2830 | return jiffies + cpu * HZ / NR_CPUS; | ||
2831 | } | ||
2832 | |||
2833 | static void | ||
2834 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | ||
2835 | { | 2887 | { |
2836 | unsigned long this_load, interval, j = cpu_offset(this_cpu); | 2888 | unsigned long this_load; |
2837 | struct sched_domain *sd; | ||
2838 | int i, scale; | 2889 | int i, scale; |
2839 | 2890 | ||
2840 | this_load = this_rq->raw_weighted_load; | 2891 | this_load = this_rq->raw_weighted_load; |
@@ -2854,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2854 | new_load += scale-1; | 2905 | new_load += scale-1; |
2855 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2906 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; |
2856 | } | 2907 | } |
2908 | } | ||
2909 | |||
2910 | /* | ||
2911 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
2912 | * | ||
2913 | * It checks each scheduling domain to see if it is due to be balanced, | ||
2914 | * and initiates a balancing operation if so. | ||
2915 | * | ||
2916 | * Balancing parameters are set up in arch_init_sched_domains. | ||
2917 | */ | ||
2918 | static DEFINE_SPINLOCK(balancing); | ||
2919 | |||
2920 | static void run_rebalance_domains(struct softirq_action *h) | ||
2921 | { | ||
2922 | int this_cpu = smp_processor_id(), balance = 1; | ||
2923 | struct rq *this_rq = cpu_rq(this_cpu); | ||
2924 | unsigned long interval; | ||
2925 | struct sched_domain *sd; | ||
2926 | /* | ||
2927 | * We are idle if there are no processes running. This | ||
2928 | * is valid even if we are the idle process (SMT). | ||
2929 | */ | ||
2930 | enum idle_type idle = !this_rq->nr_running ? | ||
2931 | SCHED_IDLE : NOT_IDLE; | ||
2932 | /* Earliest time when we have to call run_rebalance_domains again */ | ||
2933 | unsigned long next_balance = jiffies + 60*HZ; | ||
2857 | 2934 | ||
2858 | for_each_domain(this_cpu, sd) { | 2935 | for_each_domain(this_cpu, sd) { |
2859 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2936 | if (!(sd->flags & SD_LOAD_BALANCE)) |
@@ -2868,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2868 | if (unlikely(!interval)) | 2945 | if (unlikely(!interval)) |
2869 | interval = 1; | 2946 | interval = 1; |
2870 | 2947 | ||
2871 | if (j - sd->last_balance >= interval) { | 2948 | if (sd->flags & SD_SERIALIZE) { |
2872 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2949 | if (!spin_trylock(&balancing)) |
2950 | goto out; | ||
2951 | } | ||
2952 | |||
2953 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
2954 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | ||
2873 | /* | 2955 | /* |
2874 | * We've pulled tasks over so either we're no | 2956 | * We've pulled tasks over so either we're no |
2875 | * longer idle, or one of our SMT siblings is | 2957 | * longer idle, or one of our SMT siblings is |
@@ -2877,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2877 | */ | 2959 | */ |
2878 | idle = NOT_IDLE; | 2960 | idle = NOT_IDLE; |
2879 | } | 2961 | } |
2880 | sd->last_balance += interval; | 2962 | sd->last_balance = jiffies; |
2881 | } | 2963 | } |
2964 | if (sd->flags & SD_SERIALIZE) | ||
2965 | spin_unlock(&balancing); | ||
2966 | out: | ||
2967 | if (time_after(next_balance, sd->last_balance + interval)) | ||
2968 | next_balance = sd->last_balance + interval; | ||
2969 | |||
2970 | /* | ||
2971 | * Stop the load balance at this level. There is another | ||
2972 | * CPU in our sched group which is doing load balancing more | ||
2973 | * actively. | ||
2974 | */ | ||
2975 | if (!balance) | ||
2976 | break; | ||
2882 | } | 2977 | } |
2978 | this_rq->next_balance = next_balance; | ||
2883 | } | 2979 | } |
2884 | #else | 2980 | #else |
2885 | /* | 2981 | /* |
2886 | * on UP we do not need to balance between CPUs: | 2982 | * on UP we do not need to balance between CPUs: |
2887 | */ | 2983 | */ |
2888 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) | ||
2889 | { | ||
2890 | } | ||
2891 | static inline void idle_balance(int cpu, struct rq *rq) | 2984 | static inline void idle_balance(int cpu, struct rq *rq) |
2892 | { | 2985 | { |
2893 | } | 2986 | } |
2894 | #endif | 2987 | #endif |
2895 | 2988 | ||
2896 | static inline int wake_priority_sleeper(struct rq *rq) | 2989 | static inline void wake_priority_sleeper(struct rq *rq) |
2897 | { | 2990 | { |
2898 | int ret = 0; | ||
2899 | |||
2900 | #ifdef CONFIG_SCHED_SMT | 2991 | #ifdef CONFIG_SCHED_SMT |
2992 | if (!rq->nr_running) | ||
2993 | return; | ||
2994 | |||
2901 | spin_lock(&rq->lock); | 2995 | spin_lock(&rq->lock); |
2902 | /* | 2996 | /* |
2903 | * If an SMT sibling task has been put to sleep for priority | 2997 | * If an SMT sibling task has been put to sleep for priority |
2904 | * reasons reschedule the idle task to see if it can now run. | 2998 | * reasons reschedule the idle task to see if it can now run. |
2905 | */ | 2999 | */ |
2906 | if (rq->nr_running) { | 3000 | if (rq->nr_running) |
2907 | resched_task(rq->idle); | 3001 | resched_task(rq->idle); |
2908 | ret = 1; | ||
2909 | } | ||
2910 | spin_unlock(&rq->lock); | 3002 | spin_unlock(&rq->lock); |
2911 | #endif | 3003 | #endif |
2912 | return ret; | ||
2913 | } | 3004 | } |
2914 | 3005 | ||
2915 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3006 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -2923,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
2923 | static inline void | 3014 | static inline void |
2924 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | 3015 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
2925 | { | 3016 | { |
2926 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); | 3017 | p->sched_time += now - p->last_ran; |
3018 | p->last_ran = rq->most_recent_timestamp = now; | ||
2927 | } | 3019 | } |
2928 | 3020 | ||
2929 | /* | 3021 | /* |
@@ -2936,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p) | |||
2936 | unsigned long flags; | 3028 | unsigned long flags; |
2937 | 3029 | ||
2938 | local_irq_save(flags); | 3030 | local_irq_save(flags); |
2939 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); | 3031 | ns = p->sched_time + sched_clock() - p->last_ran; |
2940 | ns = p->sched_time + sched_clock() - ns; | ||
2941 | local_irq_restore(flags); | 3032 | local_irq_restore(flags); |
2942 | 3033 | ||
2943 | return ns; | 3034 | return ns; |
@@ -3037,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3037 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3128 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3038 | } | 3129 | } |
3039 | 3130 | ||
3040 | /* | 3131 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
3041 | * This function gets called by the timer code, with HZ frequency. | ||
3042 | * We call it with interrupts disabled. | ||
3043 | * | ||
3044 | * It also gets called by the fork code, when changing the parent's | ||
3045 | * timeslices. | ||
3046 | */ | ||
3047 | void scheduler_tick(void) | ||
3048 | { | 3132 | { |
3049 | unsigned long long now = sched_clock(); | ||
3050 | struct task_struct *p = current; | ||
3051 | int cpu = smp_processor_id(); | ||
3052 | struct rq *rq = cpu_rq(cpu); | ||
3053 | |||
3054 | update_cpu_clock(p, rq, now); | ||
3055 | |||
3056 | rq->timestamp_last_tick = now; | ||
3057 | |||
3058 | if (p == rq->idle) { | ||
3059 | if (wake_priority_sleeper(rq)) | ||
3060 | goto out; | ||
3061 | rebalance_tick(cpu, rq, SCHED_IDLE); | ||
3062 | return; | ||
3063 | } | ||
3064 | |||
3065 | /* Task might have expired already, but not scheduled off yet */ | ||
3066 | if (p->array != rq->active) { | 3133 | if (p->array != rq->active) { |
3134 | /* Task has expired but was not scheduled yet */ | ||
3067 | set_tsk_need_resched(p); | 3135 | set_tsk_need_resched(p); |
3068 | goto out; | 3136 | return; |
3069 | } | 3137 | } |
3070 | spin_lock(&rq->lock); | 3138 | spin_lock(&rq->lock); |
3071 | /* | 3139 | /* |
@@ -3133,8 +3201,34 @@ void scheduler_tick(void) | |||
3133 | } | 3201 | } |
3134 | out_unlock: | 3202 | out_unlock: |
3135 | spin_unlock(&rq->lock); | 3203 | spin_unlock(&rq->lock); |
3136 | out: | 3204 | } |
3137 | rebalance_tick(cpu, rq, NOT_IDLE); | 3205 | |
3206 | /* | ||
3207 | * This function gets called by the timer code, with HZ frequency. | ||
3208 | * We call it with interrupts disabled. | ||
3209 | * | ||
3210 | * It also gets called by the fork code, when changing the parent's | ||
3211 | * timeslices. | ||
3212 | */ | ||
3213 | void scheduler_tick(void) | ||
3214 | { | ||
3215 | unsigned long long now = sched_clock(); | ||
3216 | struct task_struct *p = current; | ||
3217 | int cpu = smp_processor_id(); | ||
3218 | struct rq *rq = cpu_rq(cpu); | ||
3219 | |||
3220 | update_cpu_clock(p, rq, now); | ||
3221 | |||
3222 | if (p == rq->idle) | ||
3223 | /* Task on the idle queue */ | ||
3224 | wake_priority_sleeper(rq); | ||
3225 | else | ||
3226 | task_running_tick(rq, p); | ||
3227 | #ifdef CONFIG_SMP | ||
3228 | update_load(rq); | ||
3229 | if (time_after_eq(jiffies, rq->next_balance)) | ||
3230 | raise_softirq(SCHED_SOFTIRQ); | ||
3231 | #endif | ||
3138 | } | 3232 | } |
3139 | 3233 | ||
3140 | #ifdef CONFIG_SCHED_SMT | 3234 | #ifdef CONFIG_SCHED_SMT |
@@ -3280,7 +3374,8 @@ void fastcall add_preempt_count(int val) | |||
3280 | /* | 3374 | /* |
3281 | * Spinlock count overflowing soon? | 3375 | * Spinlock count overflowing soon? |
3282 | */ | 3376 | */ |
3283 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3377 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
3378 | PREEMPT_MASK - 10); | ||
3284 | } | 3379 | } |
3285 | EXPORT_SYMBOL(add_preempt_count); | 3380 | EXPORT_SYMBOL(add_preempt_count); |
3286 | 3381 | ||
@@ -3333,6 +3428,7 @@ asmlinkage void __sched schedule(void) | |||
3333 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3428 | printk(KERN_ERR "BUG: scheduling while atomic: " |
3334 | "%s/0x%08x/%d\n", | 3429 | "%s/0x%08x/%d\n", |
3335 | current->comm, preempt_count(), current->pid); | 3430 | current->comm, preempt_count(), current->pid); |
3431 | debug_show_held_locks(current); | ||
3336 | dump_stack(); | 3432 | dump_stack(); |
3337 | } | 3433 | } |
3338 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3434 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
@@ -4804,18 +4900,18 @@ static void show_task(struct task_struct *p) | |||
4804 | show_stack(p, NULL); | 4900 | show_stack(p, NULL); |
4805 | } | 4901 | } |
4806 | 4902 | ||
4807 | void show_state(void) | 4903 | void show_state_filter(unsigned long state_filter) |
4808 | { | 4904 | { |
4809 | struct task_struct *g, *p; | 4905 | struct task_struct *g, *p; |
4810 | 4906 | ||
4811 | #if (BITS_PER_LONG == 32) | 4907 | #if (BITS_PER_LONG == 32) |
4812 | printk("\n" | 4908 | printk("\n" |
4813 | " sibling\n"); | 4909 | " free sibling\n"); |
4814 | printk(" task PC pid father child younger older\n"); | 4910 | printk(" task PC stack pid father child younger older\n"); |
4815 | #else | 4911 | #else |
4816 | printk("\n" | 4912 | printk("\n" |
4817 | " sibling\n"); | 4913 | " free sibling\n"); |
4818 | printk(" task PC pid father child younger older\n"); | 4914 | printk(" task PC stack pid father child younger older\n"); |
4819 | #endif | 4915 | #endif |
4820 | read_lock(&tasklist_lock); | 4916 | read_lock(&tasklist_lock); |
4821 | do_each_thread(g, p) { | 4917 | do_each_thread(g, p) { |
@@ -4824,11 +4920,16 @@ void show_state(void) | |||
4824 | * console might take alot of time: | 4920 | * console might take alot of time: |
4825 | */ | 4921 | */ |
4826 | touch_nmi_watchdog(); | 4922 | touch_nmi_watchdog(); |
4827 | show_task(p); | 4923 | if (p->state & state_filter) |
4924 | show_task(p); | ||
4828 | } while_each_thread(g, p); | 4925 | } while_each_thread(g, p); |
4829 | 4926 | ||
4830 | read_unlock(&tasklist_lock); | 4927 | read_unlock(&tasklist_lock); |
4831 | debug_show_all_locks(); | 4928 | /* |
4929 | * Only show locks if all tasks are dumped: | ||
4930 | */ | ||
4931 | if (state_filter == -1) | ||
4932 | debug_show_all_locks(); | ||
4832 | } | 4933 | } |
4833 | 4934 | ||
4834 | /** | 4935 | /** |
@@ -4973,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4973 | * afterwards, and pretending it was a local activate. | 5074 | * afterwards, and pretending it was a local activate. |
4974 | * This way is cleaner and logically correct. | 5075 | * This way is cleaner and logically correct. |
4975 | */ | 5076 | */ |
4976 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 5077 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp |
4977 | + rq_dest->timestamp_last_tick; | 5078 | + rq_dest->most_recent_timestamp; |
4978 | deactivate_task(p, rq_src); | 5079 | deactivate_task(p, rq_src); |
4979 | __activate_task(p, rq_dest); | 5080 | __activate_task(p, rq_dest); |
4980 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 5081 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
@@ -5050,7 +5151,10 @@ wait_to_die: | |||
5050 | } | 5151 | } |
5051 | 5152 | ||
5052 | #ifdef CONFIG_HOTPLUG_CPU | 5153 | #ifdef CONFIG_HOTPLUG_CPU |
5053 | /* Figure out where task on dead CPU should go, use force if neccessary. */ | 5154 | /* |
5155 | * Figure out where task on dead CPU should go, use force if neccessary. | ||
5156 | * NOTE: interrupts should be disabled by the caller | ||
5157 | */ | ||
5054 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5158 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
5055 | { | 5159 | { |
5056 | unsigned long flags; | 5160 | unsigned long flags; |
@@ -5170,6 +5274,7 @@ void idle_task_exit(void) | |||
5170 | mmdrop(mm); | 5274 | mmdrop(mm); |
5171 | } | 5275 | } |
5172 | 5276 | ||
5277 | /* called under rq->lock with disabled interrupts */ | ||
5173 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5278 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
5174 | { | 5279 | { |
5175 | struct rq *rq = cpu_rq(dead_cpu); | 5280 | struct rq *rq = cpu_rq(dead_cpu); |
@@ -5186,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
5186 | * Drop lock around migration; if someone else moves it, | 5291 | * Drop lock around migration; if someone else moves it, |
5187 | * that's OK. No task can be added to this CPU, so iteration is | 5292 | * that's OK. No task can be added to this CPU, so iteration is |
5188 | * fine. | 5293 | * fine. |
5294 | * NOTE: interrupts should be left disabled --dev@ | ||
5189 | */ | 5295 | */ |
5190 | spin_unlock_irq(&rq->lock); | 5296 | spin_unlock(&rq->lock); |
5191 | move_task_off_dead_cpu(dead_cpu, p); | 5297 | move_task_off_dead_cpu(dead_cpu, p); |
5192 | spin_lock_irq(&rq->lock); | 5298 | spin_lock(&rq->lock); |
5193 | 5299 | ||
5194 | put_task_struct(p); | 5300 | put_task_struct(p); |
5195 | } | 5301 | } |
@@ -5342,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5342 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5448 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
5343 | printk("does not load-balance\n"); | 5449 | printk("does not load-balance\n"); |
5344 | if (sd->parent) | 5450 | if (sd->parent) |
5345 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | 5451 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
5452 | " has parent"); | ||
5346 | break; | 5453 | break; |
5347 | } | 5454 | } |
5348 | 5455 | ||
5349 | printk("span %s\n", str); | 5456 | printk("span %s\n", str); |
5350 | 5457 | ||
5351 | if (!cpu_isset(cpu, sd->span)) | 5458 | if (!cpu_isset(cpu, sd->span)) |
5352 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | 5459 | printk(KERN_ERR "ERROR: domain->span does not contain " |
5460 | "CPU%d\n", cpu); | ||
5353 | if (!cpu_isset(cpu, group->cpumask)) | 5461 | if (!cpu_isset(cpu, group->cpumask)) |
5354 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | 5462 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
5463 | " CPU%d\n", cpu); | ||
5355 | 5464 | ||
5356 | printk(KERN_DEBUG); | 5465 | printk(KERN_DEBUG); |
5357 | for (i = 0; i < level + 2; i++) | 5466 | for (i = 0; i < level + 2; i++) |
@@ -5366,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5366 | 5475 | ||
5367 | if (!group->cpu_power) { | 5476 | if (!group->cpu_power) { |
5368 | printk("\n"); | 5477 | printk("\n"); |
5369 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | 5478 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5479 | "set\n"); | ||
5370 | } | 5480 | } |
5371 | 5481 | ||
5372 | if (!cpus_weight(group->cpumask)) { | 5482 | if (!cpus_weight(group->cpumask)) { |
@@ -5389,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5389 | printk("\n"); | 5499 | printk("\n"); |
5390 | 5500 | ||
5391 | if (!cpus_equal(sd->span, groupmask)) | 5501 | if (!cpus_equal(sd->span, groupmask)) |
5392 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 5502 | printk(KERN_ERR "ERROR: groups don't span " |
5503 | "domain->span\n"); | ||
5393 | 5504 | ||
5394 | level++; | 5505 | level++; |
5395 | sd = sd->parent; | 5506 | sd = sd->parent; |
5507 | if (!sd) | ||
5508 | continue; | ||
5396 | 5509 | ||
5397 | if (sd) { | 5510 | if (!cpus_subset(groupmask, sd->span)) |
5398 | if (!cpus_subset(groupmask, sd->span)) | 5511 | printk(KERN_ERR "ERROR: parent span is not a superset " |
5399 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | 5512 | "of domain->span\n"); |
5400 | } | ||
5401 | 5513 | ||
5402 | } while (sd); | 5514 | } while (sd); |
5403 | } | 5515 | } |
@@ -5511,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str) | |||
5511 | __setup ("isolcpus=", isolated_cpu_setup); | 5623 | __setup ("isolcpus=", isolated_cpu_setup); |
5512 | 5624 | ||
5513 | /* | 5625 | /* |
5514 | * init_sched_build_groups takes an array of groups, the cpumask we wish | 5626 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
5515 | * to span, and a pointer to a function which identifies what group a CPU | 5627 | * to a function which identifies what group(along with sched group) a CPU |
5516 | * belongs to. The return value of group_fn must be a valid index into the | 5628 | * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS |
5517 | * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we | 5629 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
5518 | * keep track of groups covered with a cpumask_t). | ||
5519 | * | 5630 | * |
5520 | * init_sched_build_groups will build a circular linked list of the groups | 5631 | * init_sched_build_groups will build a circular linked list of the groups |
5521 | * covered by the given span, and will set each group's ->cpumask correctly, | 5632 | * covered by the given span, and will set each group's ->cpumask correctly, |
5522 | * and ->cpu_power to 0. | 5633 | * and ->cpu_power to 0. |
5523 | */ | 5634 | */ |
5524 | static void | 5635 | static void |
5525 | init_sched_build_groups(struct sched_group groups[], cpumask_t span, | 5636 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
5526 | const cpumask_t *cpu_map, | 5637 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
5527 | int (*group_fn)(int cpu, const cpumask_t *cpu_map)) | 5638 | struct sched_group **sg)) |
5528 | { | 5639 | { |
5529 | struct sched_group *first = NULL, *last = NULL; | 5640 | struct sched_group *first = NULL, *last = NULL; |
5530 | cpumask_t covered = CPU_MASK_NONE; | 5641 | cpumask_t covered = CPU_MASK_NONE; |
5531 | int i; | 5642 | int i; |
5532 | 5643 | ||
5533 | for_each_cpu_mask(i, span) { | 5644 | for_each_cpu_mask(i, span) { |
5534 | int group = group_fn(i, cpu_map); | 5645 | struct sched_group *sg; |
5535 | struct sched_group *sg = &groups[group]; | 5646 | int group = group_fn(i, cpu_map, &sg); |
5536 | int j; | 5647 | int j; |
5537 | 5648 | ||
5538 | if (cpu_isset(i, covered)) | 5649 | if (cpu_isset(i, covered)) |
@@ -5542,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5542 | sg->cpu_power = 0; | 5653 | sg->cpu_power = 0; |
5543 | 5654 | ||
5544 | for_each_cpu_mask(j, span) { | 5655 | for_each_cpu_mask(j, span) { |
5545 | if (group_fn(j, cpu_map) != group) | 5656 | if (group_fn(j, cpu_map, NULL) != group) |
5546 | continue; | 5657 | continue; |
5547 | 5658 | ||
5548 | cpu_set(j, covered); | 5659 | cpu_set(j, covered); |
@@ -5716,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size); | |||
5716 | */ | 5827 | */ |
5717 | static void touch_cache(void *__cache, unsigned long __size) | 5828 | static void touch_cache(void *__cache, unsigned long __size) |
5718 | { | 5829 | { |
5719 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | 5830 | unsigned long size = __size / sizeof(long); |
5720 | chunk2 = 2*size/3; | 5831 | unsigned long chunk1 = size / 3; |
5832 | unsigned long chunk2 = 2 * size / 3; | ||
5721 | unsigned long *cache = __cache; | 5833 | unsigned long *cache = __cache; |
5722 | int i; | 5834 | int i; |
5723 | 5835 | ||
@@ -5826,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
5826 | */ | 5938 | */ |
5827 | measure_one(cache, size, cpu1, cpu2); | 5939 | measure_one(cache, size, cpu1, cpu2); |
5828 | for (i = 0; i < ITERATIONS; i++) | 5940 | for (i = 0; i < ITERATIONS; i++) |
5829 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | 5941 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); |
5830 | 5942 | ||
5831 | measure_one(cache, size, cpu2, cpu1); | 5943 | measure_one(cache, size, cpu2, cpu1); |
5832 | for (i = 0; i < ITERATIONS; i++) | 5944 | for (i = 0; i < ITERATIONS; i++) |
5833 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | 5945 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); |
5834 | 5946 | ||
5835 | /* | 5947 | /* |
5836 | * (We measure the non-migrating [cached] cost on both | 5948 | * (We measure the non-migrating [cached] cost on both |
@@ -5840,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
5840 | 5952 | ||
5841 | measure_one(cache, size, cpu1, cpu1); | 5953 | measure_one(cache, size, cpu1, cpu1); |
5842 | for (i = 0; i < ITERATIONS; i++) | 5954 | for (i = 0; i < ITERATIONS; i++) |
5843 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | 5955 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); |
5844 | 5956 | ||
5845 | measure_one(cache, size, cpu2, cpu2); | 5957 | measure_one(cache, size, cpu2, cpu2); |
5846 | for (i = 0; i < ITERATIONS; i++) | 5958 | for (i = 0; i < ITERATIONS; i++) |
5847 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | 5959 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); |
5848 | 5960 | ||
5849 | /* | 5961 | /* |
5850 | * Get the per-iteration migration cost: | 5962 | * Get the per-iteration migration cost: |
5851 | */ | 5963 | */ |
5852 | do_div(cost1, 2*ITERATIONS); | 5964 | do_div(cost1, 2 * ITERATIONS); |
5853 | do_div(cost2, 2*ITERATIONS); | 5965 | do_div(cost2, 2 * ITERATIONS); |
5854 | 5966 | ||
5855 | return cost1 - cost2; | 5967 | return cost1 - cost2; |
5856 | } | 5968 | } |
@@ -5888,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5888 | */ | 6000 | */ |
5889 | cache = vmalloc(max_size); | 6001 | cache = vmalloc(max_size); |
5890 | if (!cache) { | 6002 | if (!cache) { |
5891 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 6003 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); |
5892 | return 1000000; /* return 1 msec on very small boxen */ | 6004 | return 1000000; /* return 1 msec on very small boxen */ |
5893 | } | 6005 | } |
5894 | 6006 | ||
@@ -5913,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5913 | avg_fluct = (avg_fluct + fluct)/2; | 6025 | avg_fluct = (avg_fluct + fluct)/2; |
5914 | 6026 | ||
5915 | if (migration_debug) | 6027 | if (migration_debug) |
5916 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | 6028 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " |
6029 | "(%8Ld %8Ld)\n", | ||
5917 | cpu1, cpu2, size, | 6030 | cpu1, cpu2, size, |
5918 | (long)cost / 1000000, | 6031 | (long)cost / 1000000, |
5919 | ((long)cost / 100000) % 10, | 6032 | ((long)cost / 100000) % 10, |
@@ -6008,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) | |||
6008 | -1 | 6121 | -1 |
6009 | #endif | 6122 | #endif |
6010 | ); | 6123 | ); |
6011 | if (system_state == SYSTEM_BOOTING) { | 6124 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { |
6012 | if (num_online_cpus() > 1) { | 6125 | printk("migration_cost="); |
6013 | printk("migration_cost="); | 6126 | for (distance = 0; distance <= max_distance; distance++) { |
6014 | for (distance = 0; distance <= max_distance; distance++) { | 6127 | if (distance) |
6015 | if (distance) | 6128 | printk(","); |
6016 | printk(","); | 6129 | printk("%ld", (long)migration_cost[distance] / 1000); |
6017 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
6018 | } | ||
6019 | printk("\n"); | ||
6020 | } | 6130 | } |
6131 | printk("\n"); | ||
6021 | } | 6132 | } |
6022 | j1 = jiffies; | 6133 | j1 = jiffies; |
6023 | if (migration_debug) | 6134 | if (migration_debug) |
6024 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | 6135 | printk("migration: %ld seconds\n", (j1-j0) / HZ); |
6025 | 6136 | ||
6026 | /* | 6137 | /* |
6027 | * Move back to the original CPU. NUMA-Q gets confused | 6138 | * Move back to the original CPU. NUMA-Q gets confused |
@@ -6118,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
6118 | */ | 6229 | */ |
6119 | #ifdef CONFIG_SCHED_SMT | 6230 | #ifdef CONFIG_SCHED_SMT |
6120 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6231 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
6121 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6232 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6122 | 6233 | ||
6123 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | 6234 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
6235 | struct sched_group **sg) | ||
6124 | { | 6236 | { |
6237 | if (sg) | ||
6238 | *sg = &per_cpu(sched_group_cpus, cpu); | ||
6125 | return cpu; | 6239 | return cpu; |
6126 | } | 6240 | } |
6127 | #endif | 6241 | #endif |
@@ -6131,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | |||
6131 | */ | 6245 | */ |
6132 | #ifdef CONFIG_SCHED_MC | 6246 | #ifdef CONFIG_SCHED_MC |
6133 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6247 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6134 | static struct sched_group sched_group_core[NR_CPUS]; | 6248 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6135 | #endif | 6249 | #endif |
6136 | 6250 | ||
6137 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6251 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6138 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6252 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6253 | struct sched_group **sg) | ||
6139 | { | 6254 | { |
6255 | int group; | ||
6140 | cpumask_t mask = cpu_sibling_map[cpu]; | 6256 | cpumask_t mask = cpu_sibling_map[cpu]; |
6141 | cpus_and(mask, mask, *cpu_map); | 6257 | cpus_and(mask, mask, *cpu_map); |
6142 | return first_cpu(mask); | 6258 | group = first_cpu(mask); |
6259 | if (sg) | ||
6260 | *sg = &per_cpu(sched_group_core, group); | ||
6261 | return group; | ||
6143 | } | 6262 | } |
6144 | #elif defined(CONFIG_SCHED_MC) | 6263 | #elif defined(CONFIG_SCHED_MC) |
6145 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6264 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6265 | struct sched_group **sg) | ||
6146 | { | 6266 | { |
6267 | if (sg) | ||
6268 | *sg = &per_cpu(sched_group_core, cpu); | ||
6147 | return cpu; | 6269 | return cpu; |
6148 | } | 6270 | } |
6149 | #endif | 6271 | #endif |
6150 | 6272 | ||
6151 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6273 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
6152 | static struct sched_group sched_group_phys[NR_CPUS]; | 6274 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6153 | 6275 | ||
6154 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) | 6276 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
6277 | struct sched_group **sg) | ||
6155 | { | 6278 | { |
6279 | int group; | ||
6156 | #ifdef CONFIG_SCHED_MC | 6280 | #ifdef CONFIG_SCHED_MC |
6157 | cpumask_t mask = cpu_coregroup_map(cpu); | 6281 | cpumask_t mask = cpu_coregroup_map(cpu); |
6158 | cpus_and(mask, mask, *cpu_map); | 6282 | cpus_and(mask, mask, *cpu_map); |
6159 | return first_cpu(mask); | 6283 | group = first_cpu(mask); |
6160 | #elif defined(CONFIG_SCHED_SMT) | 6284 | #elif defined(CONFIG_SCHED_SMT) |
6161 | cpumask_t mask = cpu_sibling_map[cpu]; | 6285 | cpumask_t mask = cpu_sibling_map[cpu]; |
6162 | cpus_and(mask, mask, *cpu_map); | 6286 | cpus_and(mask, mask, *cpu_map); |
6163 | return first_cpu(mask); | 6287 | group = first_cpu(mask); |
6164 | #else | 6288 | #else |
6165 | return cpu; | 6289 | group = cpu; |
6166 | #endif | 6290 | #endif |
6291 | if (sg) | ||
6292 | *sg = &per_cpu(sched_group_phys, group); | ||
6293 | return group; | ||
6167 | } | 6294 | } |
6168 | 6295 | ||
6169 | #ifdef CONFIG_NUMA | 6296 | #ifdef CONFIG_NUMA |
@@ -6176,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); | |||
6176 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6303 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
6177 | 6304 | ||
6178 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6305 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6179 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; | 6306 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6180 | 6307 | ||
6181 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) | 6308 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6309 | struct sched_group **sg) | ||
6182 | { | 6310 | { |
6183 | return cpu_to_node(cpu); | 6311 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6312 | int group; | ||
6313 | |||
6314 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6315 | group = first_cpu(nodemask); | ||
6316 | |||
6317 | if (sg) | ||
6318 | *sg = &per_cpu(sched_group_allnodes, group); | ||
6319 | return group; | ||
6184 | } | 6320 | } |
6321 | |||
6185 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6322 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
6186 | { | 6323 | { |
6187 | struct sched_group *sg = group_head; | 6324 | struct sched_group *sg = group_head; |
@@ -6217,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
6217 | int cpu, i; | 6354 | int cpu, i; |
6218 | 6355 | ||
6219 | for_each_cpu_mask(cpu, *cpu_map) { | 6356 | for_each_cpu_mask(cpu, *cpu_map) { |
6220 | struct sched_group *sched_group_allnodes | ||
6221 | = sched_group_allnodes_bycpu[cpu]; | ||
6222 | struct sched_group **sched_group_nodes | 6357 | struct sched_group **sched_group_nodes |
6223 | = sched_group_nodes_bycpu[cpu]; | 6358 | = sched_group_nodes_bycpu[cpu]; |
6224 | 6359 | ||
6225 | if (sched_group_allnodes) { | ||
6226 | kfree(sched_group_allnodes); | ||
6227 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6228 | } | ||
6229 | |||
6230 | if (!sched_group_nodes) | 6360 | if (!sched_group_nodes) |
6231 | continue; | 6361 | continue; |
6232 | 6362 | ||
@@ -6320,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6320 | struct sched_domain *sd; | 6450 | struct sched_domain *sd; |
6321 | #ifdef CONFIG_NUMA | 6451 | #ifdef CONFIG_NUMA |
6322 | struct sched_group **sched_group_nodes = NULL; | 6452 | struct sched_group **sched_group_nodes = NULL; |
6323 | struct sched_group *sched_group_allnodes = NULL; | 6453 | int sd_allnodes = 0; |
6324 | 6454 | ||
6325 | /* | 6455 | /* |
6326 | * Allocate the per-node list of sched groups | 6456 | * Allocate the per-node list of sched groups |
@@ -6338,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6338 | * Set up domains for cpus specified by the cpu_map. | 6468 | * Set up domains for cpus specified by the cpu_map. |
6339 | */ | 6469 | */ |
6340 | for_each_cpu_mask(i, *cpu_map) { | 6470 | for_each_cpu_mask(i, *cpu_map) { |
6341 | int group; | ||
6342 | struct sched_domain *sd = NULL, *p; | 6471 | struct sched_domain *sd = NULL, *p; |
6343 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6472 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
6344 | 6473 | ||
@@ -6347,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6347 | #ifdef CONFIG_NUMA | 6476 | #ifdef CONFIG_NUMA |
6348 | if (cpus_weight(*cpu_map) | 6477 | if (cpus_weight(*cpu_map) |
6349 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6478 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6350 | if (!sched_group_allnodes) { | ||
6351 | sched_group_allnodes | ||
6352 | = kmalloc_node(sizeof(struct sched_group) | ||
6353 | * MAX_NUMNODES, | ||
6354 | GFP_KERNEL, | ||
6355 | cpu_to_node(i)); | ||
6356 | if (!sched_group_allnodes) { | ||
6357 | printk(KERN_WARNING | ||
6358 | "Can not alloc allnodes sched group\n"); | ||
6359 | goto error; | ||
6360 | } | ||
6361 | sched_group_allnodes_bycpu[i] | ||
6362 | = sched_group_allnodes; | ||
6363 | } | ||
6364 | sd = &per_cpu(allnodes_domains, i); | 6479 | sd = &per_cpu(allnodes_domains, i); |
6365 | *sd = SD_ALLNODES_INIT; | 6480 | *sd = SD_ALLNODES_INIT; |
6366 | sd->span = *cpu_map; | 6481 | sd->span = *cpu_map; |
6367 | group = cpu_to_allnodes_group(i, cpu_map); | 6482 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
6368 | sd->groups = &sched_group_allnodes[group]; | ||
6369 | p = sd; | 6483 | p = sd; |
6484 | sd_allnodes = 1; | ||
6370 | } else | 6485 | } else |
6371 | p = NULL; | 6486 | p = NULL; |
6372 | 6487 | ||
@@ -6381,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6381 | 6496 | ||
6382 | p = sd; | 6497 | p = sd; |
6383 | sd = &per_cpu(phys_domains, i); | 6498 | sd = &per_cpu(phys_domains, i); |
6384 | group = cpu_to_phys_group(i, cpu_map); | ||
6385 | *sd = SD_CPU_INIT; | 6499 | *sd = SD_CPU_INIT; |
6386 | sd->span = nodemask; | 6500 | sd->span = nodemask; |
6387 | sd->parent = p; | 6501 | sd->parent = p; |
6388 | if (p) | 6502 | if (p) |
6389 | p->child = sd; | 6503 | p->child = sd; |
6390 | sd->groups = &sched_group_phys[group]; | 6504 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
6391 | 6505 | ||
6392 | #ifdef CONFIG_SCHED_MC | 6506 | #ifdef CONFIG_SCHED_MC |
6393 | p = sd; | 6507 | p = sd; |
6394 | sd = &per_cpu(core_domains, i); | 6508 | sd = &per_cpu(core_domains, i); |
6395 | group = cpu_to_core_group(i, cpu_map); | ||
6396 | *sd = SD_MC_INIT; | 6509 | *sd = SD_MC_INIT; |
6397 | sd->span = cpu_coregroup_map(i); | 6510 | sd->span = cpu_coregroup_map(i); |
6398 | cpus_and(sd->span, sd->span, *cpu_map); | 6511 | cpus_and(sd->span, sd->span, *cpu_map); |
6399 | sd->parent = p; | 6512 | sd->parent = p; |
6400 | p->child = sd; | 6513 | p->child = sd; |
6401 | sd->groups = &sched_group_core[group]; | 6514 | cpu_to_core_group(i, cpu_map, &sd->groups); |
6402 | #endif | 6515 | #endif |
6403 | 6516 | ||
6404 | #ifdef CONFIG_SCHED_SMT | 6517 | #ifdef CONFIG_SCHED_SMT |
6405 | p = sd; | 6518 | p = sd; |
6406 | sd = &per_cpu(cpu_domains, i); | 6519 | sd = &per_cpu(cpu_domains, i); |
6407 | group = cpu_to_cpu_group(i, cpu_map); | ||
6408 | *sd = SD_SIBLING_INIT; | 6520 | *sd = SD_SIBLING_INIT; |
6409 | sd->span = cpu_sibling_map[i]; | 6521 | sd->span = cpu_sibling_map[i]; |
6410 | cpus_and(sd->span, sd->span, *cpu_map); | 6522 | cpus_and(sd->span, sd->span, *cpu_map); |
6411 | sd->parent = p; | 6523 | sd->parent = p; |
6412 | p->child = sd; | 6524 | p->child = sd; |
6413 | sd->groups = &sched_group_cpus[group]; | 6525 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
6414 | #endif | 6526 | #endif |
6415 | } | 6527 | } |
6416 | 6528 | ||
@@ -6422,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6422 | if (i != first_cpu(this_sibling_map)) | 6534 | if (i != first_cpu(this_sibling_map)) |
6423 | continue; | 6535 | continue; |
6424 | 6536 | ||
6425 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | 6537 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); |
6426 | cpu_map, &cpu_to_cpu_group); | ||
6427 | } | 6538 | } |
6428 | #endif | 6539 | #endif |
6429 | 6540 | ||
@@ -6434,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6434 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6545 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6435 | if (i != first_cpu(this_core_map)) | 6546 | if (i != first_cpu(this_core_map)) |
6436 | continue; | 6547 | continue; |
6437 | init_sched_build_groups(sched_group_core, this_core_map, | 6548 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); |
6438 | cpu_map, &cpu_to_core_group); | ||
6439 | } | 6549 | } |
6440 | #endif | 6550 | #endif |
6441 | 6551 | ||
@@ -6448,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6448 | if (cpus_empty(nodemask)) | 6558 | if (cpus_empty(nodemask)) |
6449 | continue; | 6559 | continue; |
6450 | 6560 | ||
6451 | init_sched_build_groups(sched_group_phys, nodemask, | 6561 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
6452 | cpu_map, &cpu_to_phys_group); | ||
6453 | } | 6562 | } |
6454 | 6563 | ||
6455 | #ifdef CONFIG_NUMA | 6564 | #ifdef CONFIG_NUMA |
6456 | /* Set up node groups */ | 6565 | /* Set up node groups */ |
6457 | if (sched_group_allnodes) | 6566 | if (sd_allnodes) |
6458 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 6567 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); |
6459 | cpu_map, &cpu_to_allnodes_group); | ||
6460 | 6568 | ||
6461 | for (i = 0; i < MAX_NUMNODES; i++) { | 6569 | for (i = 0; i < MAX_NUMNODES; i++) { |
6462 | /* Set up node groups */ | 6570 | /* Set up node groups */ |
@@ -6548,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6548 | for (i = 0; i < MAX_NUMNODES; i++) | 6656 | for (i = 0; i < MAX_NUMNODES; i++) |
6549 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6657 | init_numa_sched_groups_power(sched_group_nodes[i]); |
6550 | 6658 | ||
6551 | if (sched_group_allnodes) { | 6659 | if (sd_allnodes) { |
6552 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); | 6660 | struct sched_group *sg; |
6553 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
6554 | 6661 | ||
6662 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | ||
6555 | init_numa_sched_groups_power(sg); | 6663 | init_numa_sched_groups_power(sg); |
6556 | } | 6664 | } |
6557 | #endif | 6665 | #endif |
@@ -6723,8 +6831,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | |||
6723 | sched_smt_power_savings_store); | 6831 | sched_smt_power_savings_store); |
6724 | #endif | 6832 | #endif |
6725 | 6833 | ||
6726 | |||
6727 | #ifdef CONFIG_HOTPLUG_CPU | ||
6728 | /* | 6834 | /* |
6729 | * Force a reinitialization of the sched domains hierarchy. The domains | 6835 | * Force a reinitialization of the sched domains hierarchy. The domains |
6730 | * and groups cannot be updated in place without racing with the balancing | 6836 | * and groups cannot be updated in place without racing with the balancing |
@@ -6757,7 +6863,6 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
6757 | 6863 | ||
6758 | return NOTIFY_OK; | 6864 | return NOTIFY_OK; |
6759 | } | 6865 | } |
6760 | #endif | ||
6761 | 6866 | ||
6762 | void __init sched_init_smp(void) | 6867 | void __init sched_init_smp(void) |
6763 | { | 6868 | { |
@@ -6833,6 +6938,10 @@ void __init sched_init(void) | |||
6833 | 6938 | ||
6834 | set_load_weight(&init_task); | 6939 | set_load_weight(&init_task); |
6835 | 6940 | ||
6941 | #ifdef CONFIG_SMP | ||
6942 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | ||
6943 | #endif | ||
6944 | |||
6836 | #ifdef CONFIG_RT_MUTEXES | 6945 | #ifdef CONFIG_RT_MUTEXES |
6837 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6946 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
6838 | #endif | 6947 | #endif |
@@ -6867,6 +6976,7 @@ void __might_sleep(char *file, int line) | |||
6867 | " context at %s:%d\n", file, line); | 6976 | " context at %s:%d\n", file, line); |
6868 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6977 | printk("in_atomic():%d, irqs_disabled():%d\n", |
6869 | in_atomic(), irqs_disabled()); | 6978 | in_atomic(), irqs_disabled()); |
6979 | debug_show_held_locks(current); | ||
6870 | dump_stack(); | 6980 | dump_stack(); |
6871 | } | 6981 | } |
6872 | #endif | 6982 | #endif |
diff --git a/kernel/signal.c b/kernel/signal.c index 7ed8d5304bec..1921ffdc5e77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -23,6 +23,10 @@ | |||
23 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/freezer.h> | ||
27 | #include <linux/pid_namespace.h> | ||
28 | #include <linux/nsproxy.h> | ||
29 | |||
26 | #include <asm/param.h> | 30 | #include <asm/param.h> |
27 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
28 | #include <asm/unistd.h> | 32 | #include <asm/unistd.h> |
@@ -33,7 +37,7 @@ | |||
33 | * SLAB caches for signal bits. | 37 | * SLAB caches for signal bits. |
34 | */ | 38 | */ |
35 | 39 | ||
36 | static kmem_cache_t *sigqueue_cachep; | 40 | static struct kmem_cache *sigqueue_cachep; |
37 | 41 | ||
38 | /* | 42 | /* |
39 | * In POSIX a signal is sent either to a specific thread (Linux task) | 43 | * In POSIX a signal is sent either to a specific thread (Linux task) |
@@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, | |||
267 | int override_rlimit) | 271 | int override_rlimit) |
268 | { | 272 | { |
269 | struct sigqueue *q = NULL; | 273 | struct sigqueue *q = NULL; |
274 | struct user_struct *user; | ||
270 | 275 | ||
271 | atomic_inc(&t->user->sigpending); | 276 | /* |
277 | * In order to avoid problems with "switch_user()", we want to make | ||
278 | * sure that the compiler doesn't re-load "t->user" | ||
279 | */ | ||
280 | user = t->user; | ||
281 | barrier(); | ||
282 | atomic_inc(&user->sigpending); | ||
272 | if (override_rlimit || | 283 | if (override_rlimit || |
273 | atomic_read(&t->user->sigpending) <= | 284 | atomic_read(&user->sigpending) <= |
274 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) | 285 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) |
275 | q = kmem_cache_alloc(sigqueue_cachep, flags); | 286 | q = kmem_cache_alloc(sigqueue_cachep, flags); |
276 | if (unlikely(q == NULL)) { | 287 | if (unlikely(q == NULL)) { |
277 | atomic_dec(&t->user->sigpending); | 288 | atomic_dec(&user->sigpending); |
278 | } else { | 289 | } else { |
279 | INIT_LIST_HEAD(&q->list); | 290 | INIT_LIST_HEAD(&q->list); |
280 | q->flags = 0; | 291 | q->flags = 0; |
281 | q->user = get_uid(t->user); | 292 | q->user = get_uid(user); |
282 | } | 293 | } |
283 | return(q); | 294 | return(q); |
284 | } | 295 | } |
@@ -575,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
575 | error = -EPERM; | 586 | error = -EPERM; |
576 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 587 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
577 | && ((sig != SIGCONT) || | 588 | && ((sig != SIGCONT) || |
578 | (current->signal->session != t->signal->session)) | 589 | (process_session(current) != process_session(t))) |
579 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 590 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
580 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 591 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
581 | && !capable(CAP_KILL)) | 592 | && !capable(CAP_KILL)) |
@@ -1126,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
1126 | return error; | 1137 | return error; |
1127 | } | 1138 | } |
1128 | 1139 | ||
1129 | int | 1140 | static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
1130 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | ||
1131 | { | 1141 | { |
1132 | int error; | 1142 | int error; |
1133 | rcu_read_lock(); | 1143 | rcu_read_lock(); |
@@ -1870,8 +1880,12 @@ relock: | |||
1870 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ | 1880 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ |
1871 | continue; | 1881 | continue; |
1872 | 1882 | ||
1873 | /* Init gets no signals it doesn't want. */ | 1883 | /* |
1874 | if (current == child_reaper) | 1884 | * Init of a pid space gets no signals it doesn't want from |
1885 | * within that pid space. It can of course get signals from | ||
1886 | * its parent pid space. | ||
1887 | */ | ||
1888 | if (current == child_reaper(current)) | ||
1875 | continue; | 1889 | continue; |
1876 | 1890 | ||
1877 | if (sig_kernel_stop(signr)) { | 1891 | if (sig_kernel_stop(signr)) { |
diff --git a/kernel/softirq.c b/kernel/softirq.c index bf25015dce16..918e52df090e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
574 | 574 | ||
575 | switch (action) { | 575 | switch (action) { |
576 | case CPU_UP_PREPARE: | 576 | case CPU_UP_PREPARE: |
577 | BUG_ON(per_cpu(tasklet_vec, hotcpu).list); | ||
578 | BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); | ||
579 | p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); | 577 | p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); |
580 | if (IS_ERR(p)) { | 578 | if (IS_ERR(p)) { |
581 | printk("ksoftirqd for %i failed\n", hotcpu); | 579 | printk("ksoftirqd for %i failed\n", hotcpu); |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 476c3741511b..2c6c2bf85514 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | |||
293 | } | 293 | } |
294 | 294 | ||
295 | EXPORT_SYMBOL(_spin_lock_nested); | 295 | EXPORT_SYMBOL(_spin_lock_nested); |
296 | unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) | ||
297 | { | ||
298 | unsigned long flags; | ||
299 | |||
300 | local_irq_save(flags); | ||
301 | preempt_disable(); | ||
302 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | ||
303 | /* | ||
304 | * On lockdep we dont want the hand-coded irq-enable of | ||
305 | * _raw_spin_lock_flags() code, because lockdep assumes | ||
306 | * that interrupts are not re-enabled during lock-acquire: | ||
307 | */ | ||
308 | #ifdef CONFIG_PROVE_SPIN_LOCKING | ||
309 | _raw_spin_lock(lock); | ||
310 | #else | ||
311 | _raw_spin_lock_flags(lock, &flags); | ||
312 | #endif | ||
313 | return flags; | ||
314 | } | ||
315 | |||
316 | EXPORT_SYMBOL(_spin_lock_irqsave_nested); | ||
296 | 317 | ||
297 | #endif | 318 | #endif |
298 | 319 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index 98489d82801b..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
880 | return 0; | 880 | return 0; |
881 | } | 881 | } |
882 | 882 | ||
883 | static void deferred_cad(void *dummy) | 883 | static void deferred_cad(struct work_struct *dummy) |
884 | { | 884 | { |
885 | kernel_restart(NULL); | 885 | kernel_restart(NULL); |
886 | } | 886 | } |
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy) | |||
892 | */ | 892 | */ |
893 | void ctrl_alt_del(void) | 893 | void ctrl_alt_del(void) |
894 | { | 894 | { |
895 | static DECLARE_WORK(cad_work, deferred_cad, NULL); | 895 | static DECLARE_WORK(cad_work, deferred_cad); |
896 | 896 | ||
897 | if (C_A_D) | 897 | if (C_A_D) |
898 | schedule_work(&cad_work); | 898 | schedule_work(&cad_work); |
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
1102 | asmlinkage long sys_setuid(uid_t uid) | 1102 | asmlinkage long sys_setuid(uid_t uid) |
1103 | { | 1103 | { |
1104 | int old_euid = current->euid; | 1104 | int old_euid = current->euid; |
1105 | int old_ruid, old_suid, new_ruid, new_suid; | 1105 | int old_ruid, old_suid, new_suid; |
1106 | int retval; | 1106 | int retval; |
1107 | 1107 | ||
1108 | retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); | 1108 | retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); |
1109 | if (retval) | 1109 | if (retval) |
1110 | return retval; | 1110 | return retval; |
1111 | 1111 | ||
1112 | old_ruid = new_ruid = current->uid; | 1112 | old_ruid = current->uid; |
1113 | old_suid = current->suid; | 1113 | old_suid = current->suid; |
1114 | new_suid = old_suid; | 1114 | new_suid = old_suid; |
1115 | 1115 | ||
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1381 | 1381 | ||
1382 | if (p->real_parent == group_leader) { | 1382 | if (p->real_parent == group_leader) { |
1383 | err = -EPERM; | 1383 | err = -EPERM; |
1384 | if (p->signal->session != group_leader->signal->session) | 1384 | if (process_session(p) != process_session(group_leader)) |
1385 | goto out; | 1385 | goto out; |
1386 | err = -EACCES; | 1386 | err = -EACCES; |
1387 | if (p->did_exec) | 1387 | if (p->did_exec) |
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1397 | goto out; | 1397 | goto out; |
1398 | 1398 | ||
1399 | if (pgid != pid) { | 1399 | if (pgid != pid) { |
1400 | struct task_struct *p; | 1400 | struct task_struct *g = |
1401 | find_task_by_pid_type(PIDTYPE_PGID, pgid); | ||
1401 | 1402 | ||
1402 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1403 | if (!g || process_session(g) != process_session(group_leader)) |
1403 | if (p->signal->session == group_leader->signal->session) | 1404 | goto out; |
1404 | goto ok_pgid; | ||
1405 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | ||
1406 | goto out; | ||
1407 | } | 1405 | } |
1408 | 1406 | ||
1409 | ok_pgid: | ||
1410 | err = security_task_setpgid(p, pgid); | 1407 | err = security_task_setpgid(p, pgid); |
1411 | if (err) | 1408 | if (err) |
1412 | goto out; | 1409 | goto out; |
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void) | |||
1459 | asmlinkage long sys_getsid(pid_t pid) | 1456 | asmlinkage long sys_getsid(pid_t pid) |
1460 | { | 1457 | { |
1461 | if (!pid) | 1458 | if (!pid) |
1462 | return current->signal->session; | 1459 | return process_session(current); |
1463 | else { | 1460 | else { |
1464 | int retval; | 1461 | int retval; |
1465 | struct task_struct *p; | 1462 | struct task_struct *p; |
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) | |||
1471 | if (p) { | 1468 | if (p) { |
1472 | retval = security_task_getsid(p); | 1469 | retval = security_task_getsid(p); |
1473 | if (!retval) | 1470 | if (!retval) |
1474 | retval = p->signal->session; | 1471 | retval = process_session(p); |
1475 | } | 1472 | } |
1476 | read_unlock(&tasklist_lock); | 1473 | read_unlock(&tasklist_lock); |
1477 | return retval; | 1474 | return retval; |
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void) | |||
1484 | pid_t session; | 1481 | pid_t session; |
1485 | int err = -EPERM; | 1482 | int err = -EPERM; |
1486 | 1483 | ||
1487 | mutex_lock(&tty_mutex); | ||
1488 | write_lock_irq(&tasklist_lock); | 1484 | write_lock_irq(&tasklist_lock); |
1489 | 1485 | ||
1490 | /* Fail if I am already a session leader */ | 1486 | /* Fail if I am already a session leader */ |
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void) | |||
1504 | 1500 | ||
1505 | group_leader->signal->leader = 1; | 1501 | group_leader->signal->leader = 1; |
1506 | __set_special_pids(session, session); | 1502 | __set_special_pids(session, session); |
1503 | |||
1504 | spin_lock(&group_leader->sighand->siglock); | ||
1507 | group_leader->signal->tty = NULL; | 1505 | group_leader->signal->tty = NULL; |
1508 | group_leader->signal->tty_old_pgrp = 0; | 1506 | group_leader->signal->tty_old_pgrp = 0; |
1507 | spin_unlock(&group_leader->sighand->siglock); | ||
1508 | |||
1509 | err = process_group(group_leader); | 1509 | err = process_group(group_leader); |
1510 | out: | 1510 | out: |
1511 | write_unlock_irq(&tasklist_lock); | 1511 | write_unlock_irq(&tasklist_lock); |
1512 | mutex_unlock(&tty_mutex); | ||
1513 | return err; | 1512 | return err; |
1514 | } | 1513 | } |
1515 | 1514 | ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a3b2e75f040..d7306d0f3dfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list); | |||
49 | cond_syscall(sys_epoll_create); | 49 | cond_syscall(sys_epoll_create); |
50 | cond_syscall(sys_epoll_ctl); | 50 | cond_syscall(sys_epoll_ctl); |
51 | cond_syscall(sys_epoll_wait); | 51 | cond_syscall(sys_epoll_wait); |
52 | cond_syscall(sys_epoll_pwait); | ||
52 | cond_syscall(sys_semget); | 53 | cond_syscall(sys_semget); |
53 | cond_syscall(sys_semop); | 54 | cond_syscall(sys_semop); |
54 | cond_syscall(sys_semtimedop); | 55 | cond_syscall(sys_semtimedop); |
@@ -134,6 +135,7 @@ cond_syscall(sys_madvise); | |||
134 | cond_syscall(sys_mremap); | 135 | cond_syscall(sys_mremap); |
135 | cond_syscall(sys_remap_file_pages); | 136 | cond_syscall(sys_remap_file_pages); |
136 | cond_syscall(compat_sys_move_pages); | 137 | cond_syscall(compat_sys_move_pages); |
138 | cond_syscall(compat_sys_migrate_pages); | ||
137 | 139 | ||
138 | /* block-layer dependent */ | 140 | /* block-layer dependent */ |
139 | cond_syscall(sys_bdflush); | 141 | cond_syscall(sys_bdflush); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8020fb273c4f..130c5ec9ee0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | |||
54 | 54 | ||
55 | #ifdef CONFIG_X86 | 55 | #ifdef CONFIG_X86 |
56 | #include <asm/nmi.h> | 56 | #include <asm/nmi.h> |
57 | #include <asm/stacktrace.h> | ||
57 | #endif | 58 | #endif |
58 | 59 | ||
59 | #if defined(CONFIG_SYSCTL) | 60 | #if defined(CONFIG_SYSCTL) |
@@ -91,7 +92,9 @@ extern char modprobe_path[]; | |||
91 | extern int sg_big_buff; | 92 | extern int sg_big_buff; |
92 | #endif | 93 | #endif |
93 | #ifdef CONFIG_SYSVIPC | 94 | #ifdef CONFIG_SYSVIPC |
94 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 95 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
96 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
97 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
95 | void __user *buffer, size_t *lenp, loff_t *ppos); | 98 | void __user *buffer, size_t *lenp, loff_t *ppos); |
96 | #endif | 99 | #endif |
97 | 100 | ||
@@ -130,14 +133,26 @@ extern int max_lock_depth; | |||
130 | 133 | ||
131 | #ifdef CONFIG_SYSCTL_SYSCALL | 134 | #ifdef CONFIG_SYSCTL_SYSCALL |
132 | static int parse_table(int __user *, int, void __user *, size_t __user *, | 135 | static int parse_table(int __user *, int, void __user *, size_t __user *, |
133 | void __user *, size_t, ctl_table *, void **); | 136 | void __user *, size_t, ctl_table *); |
134 | #endif | 137 | #endif |
135 | 138 | ||
136 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 139 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
137 | void __user *buffer, size_t *lenp, loff_t *ppos); | 140 | void __user *buffer, size_t *lenp, loff_t *ppos); |
138 | 141 | ||
142 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
143 | void __user *oldval, size_t __user *oldlenp, | ||
144 | void __user *newval, size_t newlen); | ||
145 | |||
146 | #ifdef CONFIG_SYSVIPC | ||
147 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
148 | void __user *oldval, size_t __user *oldlenp, | ||
149 | void __user *newval, size_t newlen); | ||
150 | #endif | ||
151 | |||
152 | #ifdef CONFIG_PROC_SYSCTL | ||
139 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 153 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
140 | void __user *buffer, size_t *lenp, loff_t *ppos); | 154 | void __user *buffer, size_t *lenp, loff_t *ppos); |
155 | #endif | ||
141 | 156 | ||
142 | static ctl_table root_table[]; | 157 | static ctl_table root_table[]; |
143 | static struct ctl_table_header root_table_header = | 158 | static struct ctl_table_header root_table_header = |
@@ -160,6 +175,40 @@ extern ctl_table inotify_table[]; | |||
160 | int sysctl_legacy_va_layout; | 175 | int sysctl_legacy_va_layout; |
161 | #endif | 176 | #endif |
162 | 177 | ||
178 | static void *get_uts(ctl_table *table, int write) | ||
179 | { | ||
180 | char *which = table->data; | ||
181 | #ifdef CONFIG_UTS_NS | ||
182 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
183 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
184 | #endif | ||
185 | if (!write) | ||
186 | down_read(&uts_sem); | ||
187 | else | ||
188 | down_write(&uts_sem); | ||
189 | return which; | ||
190 | } | ||
191 | |||
192 | static void put_uts(ctl_table *table, int write, void *which) | ||
193 | { | ||
194 | if (!write) | ||
195 | up_read(&uts_sem); | ||
196 | else | ||
197 | up_write(&uts_sem); | ||
198 | } | ||
199 | |||
200 | #ifdef CONFIG_SYSVIPC | ||
201 | static void *get_ipc(ctl_table *table, int write) | ||
202 | { | ||
203 | char *which = table->data; | ||
204 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
205 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
206 | return which; | ||
207 | } | ||
208 | #else | ||
209 | #define get_ipc(T,W) ((T)->data) | ||
210 | #endif | ||
211 | |||
163 | /* /proc declarations: */ | 212 | /* /proc declarations: */ |
164 | 213 | ||
165 | #ifdef CONFIG_PROC_SYSCTL | 214 | #ifdef CONFIG_PROC_SYSCTL |
@@ -168,7 +217,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | |||
168 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | 217 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); |
169 | static int proc_opensys(struct inode *, struct file *); | 218 | static int proc_opensys(struct inode *, struct file *); |
170 | 219 | ||
171 | struct file_operations proc_sys_file_operations = { | 220 | const struct file_operations proc_sys_file_operations = { |
172 | .open = proc_opensys, | 221 | .open = proc_opensys, |
173 | .read = proc_readsys, | 222 | .read = proc_readsys, |
174 | .write = proc_writesys, | 223 | .write = proc_writesys, |
@@ -226,7 +275,6 @@ static ctl_table root_table[] = { | |||
226 | }; | 275 | }; |
227 | 276 | ||
228 | static ctl_table kern_table[] = { | 277 | static ctl_table kern_table[] = { |
229 | #ifndef CONFIG_UTS_NS | ||
230 | { | 278 | { |
231 | .ctl_name = KERN_OSTYPE, | 279 | .ctl_name = KERN_OSTYPE, |
232 | .procname = "ostype", | 280 | .procname = "ostype", |
@@ -234,7 +282,7 @@ static ctl_table kern_table[] = { | |||
234 | .maxlen = sizeof(init_uts_ns.name.sysname), | 282 | .maxlen = sizeof(init_uts_ns.name.sysname), |
235 | .mode = 0444, | 283 | .mode = 0444, |
236 | .proc_handler = &proc_do_uts_string, | 284 | .proc_handler = &proc_do_uts_string, |
237 | .strategy = &sysctl_string, | 285 | .strategy = &sysctl_uts_string, |
238 | }, | 286 | }, |
239 | { | 287 | { |
240 | .ctl_name = KERN_OSRELEASE, | 288 | .ctl_name = KERN_OSRELEASE, |
@@ -243,7 +291,7 @@ static ctl_table kern_table[] = { | |||
243 | .maxlen = sizeof(init_uts_ns.name.release), | 291 | .maxlen = sizeof(init_uts_ns.name.release), |
244 | .mode = 0444, | 292 | .mode = 0444, |
245 | .proc_handler = &proc_do_uts_string, | 293 | .proc_handler = &proc_do_uts_string, |
246 | .strategy = &sysctl_string, | 294 | .strategy = &sysctl_uts_string, |
247 | }, | 295 | }, |
248 | { | 296 | { |
249 | .ctl_name = KERN_VERSION, | 297 | .ctl_name = KERN_VERSION, |
@@ -252,7 +300,7 @@ static ctl_table kern_table[] = { | |||
252 | .maxlen = sizeof(init_uts_ns.name.version), | 300 | .maxlen = sizeof(init_uts_ns.name.version), |
253 | .mode = 0444, | 301 | .mode = 0444, |
254 | .proc_handler = &proc_do_uts_string, | 302 | .proc_handler = &proc_do_uts_string, |
255 | .strategy = &sysctl_string, | 303 | .strategy = &sysctl_uts_string, |
256 | }, | 304 | }, |
257 | { | 305 | { |
258 | .ctl_name = KERN_NODENAME, | 306 | .ctl_name = KERN_NODENAME, |
@@ -261,7 +309,7 @@ static ctl_table kern_table[] = { | |||
261 | .maxlen = sizeof(init_uts_ns.name.nodename), | 309 | .maxlen = sizeof(init_uts_ns.name.nodename), |
262 | .mode = 0644, | 310 | .mode = 0644, |
263 | .proc_handler = &proc_do_uts_string, | 311 | .proc_handler = &proc_do_uts_string, |
264 | .strategy = &sysctl_string, | 312 | .strategy = &sysctl_uts_string, |
265 | }, | 313 | }, |
266 | { | 314 | { |
267 | .ctl_name = KERN_DOMAINNAME, | 315 | .ctl_name = KERN_DOMAINNAME, |
@@ -270,56 +318,8 @@ static ctl_table kern_table[] = { | |||
270 | .maxlen = sizeof(init_uts_ns.name.domainname), | 318 | .maxlen = sizeof(init_uts_ns.name.domainname), |
271 | .mode = 0644, | 319 | .mode = 0644, |
272 | .proc_handler = &proc_do_uts_string, | 320 | .proc_handler = &proc_do_uts_string, |
273 | .strategy = &sysctl_string, | 321 | .strategy = &sysctl_uts_string, |
274 | }, | ||
275 | #else /* !CONFIG_UTS_NS */ | ||
276 | { | ||
277 | .ctl_name = KERN_OSTYPE, | ||
278 | .procname = "ostype", | ||
279 | .data = NULL, | ||
280 | /* could maybe use __NEW_UTS_LEN here? */ | ||
281 | .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), | ||
282 | .mode = 0444, | ||
283 | .proc_handler = &proc_do_uts_string, | ||
284 | .strategy = &sysctl_string, | ||
285 | }, | ||
286 | { | ||
287 | .ctl_name = KERN_OSRELEASE, | ||
288 | .procname = "osrelease", | ||
289 | .data = NULL, | ||
290 | .maxlen = FIELD_SIZEOF(struct new_utsname, release), | ||
291 | .mode = 0444, | ||
292 | .proc_handler = &proc_do_uts_string, | ||
293 | .strategy = &sysctl_string, | ||
294 | }, | ||
295 | { | ||
296 | .ctl_name = KERN_VERSION, | ||
297 | .procname = "version", | ||
298 | .data = NULL, | ||
299 | .maxlen = FIELD_SIZEOF(struct new_utsname, version), | ||
300 | .mode = 0444, | ||
301 | .proc_handler = &proc_do_uts_string, | ||
302 | .strategy = &sysctl_string, | ||
303 | }, | ||
304 | { | ||
305 | .ctl_name = KERN_NODENAME, | ||
306 | .procname = "hostname", | ||
307 | .data = NULL, | ||
308 | .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), | ||
309 | .mode = 0644, | ||
310 | .proc_handler = &proc_do_uts_string, | ||
311 | .strategy = &sysctl_string, | ||
312 | }, | ||
313 | { | ||
314 | .ctl_name = KERN_DOMAINNAME, | ||
315 | .procname = "domainname", | ||
316 | .data = NULL, | ||
317 | .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), | ||
318 | .mode = 0644, | ||
319 | .proc_handler = &proc_do_uts_string, | ||
320 | .strategy = &sysctl_string, | ||
321 | }, | 322 | }, |
322 | #endif /* !CONFIG_UTS_NS */ | ||
323 | { | 323 | { |
324 | .ctl_name = KERN_PANIC, | 324 | .ctl_name = KERN_PANIC, |
325 | .procname = "panic", | 325 | .procname = "panic", |
@@ -478,58 +478,65 @@ static ctl_table kern_table[] = { | |||
478 | { | 478 | { |
479 | .ctl_name = KERN_SHMMAX, | 479 | .ctl_name = KERN_SHMMAX, |
480 | .procname = "shmmax", | 480 | .procname = "shmmax", |
481 | .data = NULL, | 481 | .data = &init_ipc_ns.shm_ctlmax, |
482 | .maxlen = sizeof (size_t), | 482 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), |
483 | .mode = 0644, | 483 | .mode = 0644, |
484 | .proc_handler = &proc_do_ipc_string, | 484 | .proc_handler = &proc_ipc_doulongvec_minmax, |
485 | .strategy = sysctl_ipc_data, | ||
485 | }, | 486 | }, |
486 | { | 487 | { |
487 | .ctl_name = KERN_SHMALL, | 488 | .ctl_name = KERN_SHMALL, |
488 | .procname = "shmall", | 489 | .procname = "shmall", |
489 | .data = NULL, | 490 | .data = &init_ipc_ns.shm_ctlall, |
490 | .maxlen = sizeof (size_t), | 491 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), |
491 | .mode = 0644, | 492 | .mode = 0644, |
492 | .proc_handler = &proc_do_ipc_string, | 493 | .proc_handler = &proc_ipc_doulongvec_minmax, |
494 | .strategy = sysctl_ipc_data, | ||
493 | }, | 495 | }, |
494 | { | 496 | { |
495 | .ctl_name = KERN_SHMMNI, | 497 | .ctl_name = KERN_SHMMNI, |
496 | .procname = "shmmni", | 498 | .procname = "shmmni", |
497 | .data = NULL, | 499 | .data = &init_ipc_ns.shm_ctlmni, |
498 | .maxlen = sizeof (int), | 500 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), |
499 | .mode = 0644, | 501 | .mode = 0644, |
500 | .proc_handler = &proc_do_ipc_string, | 502 | .proc_handler = &proc_ipc_dointvec, |
503 | .strategy = sysctl_ipc_data, | ||
501 | }, | 504 | }, |
502 | { | 505 | { |
503 | .ctl_name = KERN_MSGMAX, | 506 | .ctl_name = KERN_MSGMAX, |
504 | .procname = "msgmax", | 507 | .procname = "msgmax", |
505 | .data = NULL, | 508 | .data = &init_ipc_ns.msg_ctlmax, |
506 | .maxlen = sizeof (int), | 509 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), |
507 | .mode = 0644, | 510 | .mode = 0644, |
508 | .proc_handler = &proc_do_ipc_string, | 511 | .proc_handler = &proc_ipc_dointvec, |
512 | .strategy = sysctl_ipc_data, | ||
509 | }, | 513 | }, |
510 | { | 514 | { |
511 | .ctl_name = KERN_MSGMNI, | 515 | .ctl_name = KERN_MSGMNI, |
512 | .procname = "msgmni", | 516 | .procname = "msgmni", |
513 | .data = NULL, | 517 | .data = &init_ipc_ns.msg_ctlmni, |
514 | .maxlen = sizeof (int), | 518 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), |
515 | .mode = 0644, | 519 | .mode = 0644, |
516 | .proc_handler = &proc_do_ipc_string, | 520 | .proc_handler = &proc_ipc_dointvec, |
521 | .strategy = sysctl_ipc_data, | ||
517 | }, | 522 | }, |
518 | { | 523 | { |
519 | .ctl_name = KERN_MSGMNB, | 524 | .ctl_name = KERN_MSGMNB, |
520 | .procname = "msgmnb", | 525 | .procname = "msgmnb", |
521 | .data = NULL, | 526 | .data = &init_ipc_ns.msg_ctlmnb, |
522 | .maxlen = sizeof (int), | 527 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), |
523 | .mode = 0644, | 528 | .mode = 0644, |
524 | .proc_handler = &proc_do_ipc_string, | 529 | .proc_handler = &proc_ipc_dointvec, |
530 | .strategy = sysctl_ipc_data, | ||
525 | }, | 531 | }, |
526 | { | 532 | { |
527 | .ctl_name = KERN_SEM, | 533 | .ctl_name = KERN_SEM, |
528 | .procname = "sem", | 534 | .procname = "sem", |
529 | .data = NULL, | 535 | .data = &init_ipc_ns.sem_ctls, |
530 | .maxlen = 4*sizeof (int), | 536 | .maxlen = 4*sizeof (int), |
531 | .mode = 0644, | 537 | .mode = 0644, |
532 | .proc_handler = &proc_do_ipc_string, | 538 | .proc_handler = &proc_ipc_dointvec, |
539 | .strategy = sysctl_ipc_data, | ||
533 | }, | 540 | }, |
534 | #endif | 541 | #endif |
535 | #ifdef CONFIG_MAGIC_SYSRQ | 542 | #ifdef CONFIG_MAGIC_SYSRQ |
@@ -542,6 +549,7 @@ static ctl_table kern_table[] = { | |||
542 | .proc_handler = &proc_dointvec, | 549 | .proc_handler = &proc_dointvec, |
543 | }, | 550 | }, |
544 | #endif | 551 | #endif |
552 | #ifdef CONFIG_PROC_SYSCTL | ||
545 | { | 553 | { |
546 | .ctl_name = KERN_CADPID, | 554 | .ctl_name = KERN_CADPID, |
547 | .procname = "cad_pid", | 555 | .procname = "cad_pid", |
@@ -550,6 +558,7 @@ static ctl_table kern_table[] = { | |||
550 | .mode = 0600, | 558 | .mode = 0600, |
551 | .proc_handler = &proc_do_cad_pid, | 559 | .proc_handler = &proc_do_cad_pid, |
552 | }, | 560 | }, |
561 | #endif | ||
553 | { | 562 | { |
554 | .ctl_name = KERN_MAX_THREADS, | 563 | .ctl_name = KERN_MAX_THREADS, |
555 | .procname = "threads-max", | 564 | .procname = "threads-max", |
@@ -703,6 +712,14 @@ static ctl_table kern_table[] = { | |||
703 | .mode = 0444, | 712 | .mode = 0444, |
704 | .proc_handler = &proc_dointvec, | 713 | .proc_handler = &proc_dointvec, |
705 | }, | 714 | }, |
715 | { | ||
716 | .ctl_name = CTL_UNNUMBERED, | ||
717 | .procname = "kstack_depth_to_print", | ||
718 | .data = &kstack_depth_to_print, | ||
719 | .maxlen = sizeof(int), | ||
720 | .mode = 0644, | ||
721 | .proc_handler = &proc_dointvec, | ||
722 | }, | ||
706 | #endif | 723 | #endif |
707 | #if defined(CONFIG_MMU) | 724 | #if defined(CONFIG_MMU) |
708 | { | 725 | { |
@@ -973,17 +990,6 @@ static ctl_table vm_table[] = { | |||
973 | .extra1 = &zero, | 990 | .extra1 = &zero, |
974 | }, | 991 | }, |
975 | #endif | 992 | #endif |
976 | #ifdef CONFIG_SWAP | ||
977 | { | ||
978 | .ctl_name = VM_SWAP_TOKEN_TIMEOUT, | ||
979 | .procname = "swap_token_timeout", | ||
980 | .data = &swap_token_default_timeout, | ||
981 | .maxlen = sizeof(swap_token_default_timeout), | ||
982 | .mode = 0644, | ||
983 | .proc_handler = &proc_dointvec_jiffies, | ||
984 | .strategy = &sysctl_jiffies, | ||
985 | }, | ||
986 | #endif | ||
987 | #ifdef CONFIG_NUMA | 993 | #ifdef CONFIG_NUMA |
988 | { | 994 | { |
989 | .ctl_name = VM_ZONE_RECLAIM_MODE, | 995 | .ctl_name = VM_ZONE_RECLAIM_MODE, |
@@ -1237,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1237 | do { | 1243 | do { |
1238 | struct ctl_table_header *head = | 1244 | struct ctl_table_header *head = |
1239 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1245 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
1240 | void *context = NULL; | ||
1241 | 1246 | ||
1242 | if (!use_table(head)) | 1247 | if (!use_table(head)) |
1243 | continue; | 1248 | continue; |
@@ -1245,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1245 | spin_unlock(&sysctl_lock); | 1250 | spin_unlock(&sysctl_lock); |
1246 | 1251 | ||
1247 | error = parse_table(name, nlen, oldval, oldlenp, | 1252 | error = parse_table(name, nlen, oldval, oldlenp, |
1248 | newval, newlen, head->ctl_table, | 1253 | newval, newlen, head->ctl_table); |
1249 | &context); | ||
1250 | kfree(context); | ||
1251 | 1254 | ||
1252 | spin_lock(&sysctl_lock); | 1255 | spin_lock(&sysctl_lock); |
1253 | unuse_table(head); | 1256 | unuse_table(head); |
@@ -1303,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op) | |||
1303 | static int parse_table(int __user *name, int nlen, | 1306 | static int parse_table(int __user *name, int nlen, |
1304 | void __user *oldval, size_t __user *oldlenp, | 1307 | void __user *oldval, size_t __user *oldlenp, |
1305 | void __user *newval, size_t newlen, | 1308 | void __user *newval, size_t newlen, |
1306 | ctl_table *table, void **context) | 1309 | ctl_table *table) |
1307 | { | 1310 | { |
1308 | int n; | 1311 | int n; |
1309 | repeat: | 1312 | repeat: |
@@ -1311,7 +1314,9 @@ repeat: | |||
1311 | return -ENOTDIR; | 1314 | return -ENOTDIR; |
1312 | if (get_user(n, name)) | 1315 | if (get_user(n, name)) |
1313 | return -EFAULT; | 1316 | return -EFAULT; |
1314 | for ( ; table->ctl_name; table++) { | 1317 | for ( ; table->ctl_name || table->procname; table++) { |
1318 | if (!table->ctl_name) | ||
1319 | continue; | ||
1315 | if (n == table->ctl_name || table->ctl_name == CTL_ANY) { | 1320 | if (n == table->ctl_name || table->ctl_name == CTL_ANY) { |
1316 | int error; | 1321 | int error; |
1317 | if (table->child) { | 1322 | if (table->child) { |
@@ -1321,7 +1326,7 @@ repeat: | |||
1321 | error = table->strategy( | 1326 | error = table->strategy( |
1322 | table, name, nlen, | 1327 | table, name, nlen, |
1323 | oldval, oldlenp, | 1328 | oldval, oldlenp, |
1324 | newval, newlen, context); | 1329 | newval, newlen); |
1325 | if (error) | 1330 | if (error) |
1326 | return error; | 1331 | return error; |
1327 | } | 1332 | } |
@@ -1332,7 +1337,7 @@ repeat: | |||
1332 | } | 1337 | } |
1333 | error = do_sysctl_strategy(table, name, nlen, | 1338 | error = do_sysctl_strategy(table, name, nlen, |
1334 | oldval, oldlenp, | 1339 | oldval, oldlenp, |
1335 | newval, newlen, context); | 1340 | newval, newlen); |
1336 | return error; | 1341 | return error; |
1337 | } | 1342 | } |
1338 | } | 1343 | } |
@@ -1343,7 +1348,7 @@ repeat: | |||
1343 | int do_sysctl_strategy (ctl_table *table, | 1348 | int do_sysctl_strategy (ctl_table *table, |
1344 | int __user *name, int nlen, | 1349 | int __user *name, int nlen, |
1345 | void __user *oldval, size_t __user *oldlenp, | 1350 | void __user *oldval, size_t __user *oldlenp, |
1346 | void __user *newval, size_t newlen, void **context) | 1351 | void __user *newval, size_t newlen) |
1347 | { | 1352 | { |
1348 | int op = 0, rc; | 1353 | int op = 0, rc; |
1349 | size_t len; | 1354 | size_t len; |
@@ -1357,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1357 | 1362 | ||
1358 | if (table->strategy) { | 1363 | if (table->strategy) { |
1359 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | 1364 | rc = table->strategy(table, name, nlen, oldval, oldlenp, |
1360 | newval, newlen, context); | 1365 | newval, newlen); |
1361 | if (rc < 0) | 1366 | if (rc < 0) |
1362 | return rc; | 1367 | return rc; |
1363 | if (rc > 0) | 1368 | if (rc > 0) |
@@ -1528,7 +1533,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, | |||
1528 | int len; | 1533 | int len; |
1529 | mode_t mode; | 1534 | mode_t mode; |
1530 | 1535 | ||
1531 | for (; table->ctl_name; table++) { | 1536 | for (; table->ctl_name || table->procname; table++) { |
1532 | /* Can't do anything without a proc name. */ | 1537 | /* Can't do anything without a proc name. */ |
1533 | if (!table->procname) | 1538 | if (!table->procname) |
1534 | continue; | 1539 | continue; |
@@ -1575,7 +1580,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, | |||
1575 | static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) | 1580 | static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) |
1576 | { | 1581 | { |
1577 | struct proc_dir_entry *de; | 1582 | struct proc_dir_entry *de; |
1578 | for (; table->ctl_name; table++) { | 1583 | for (; table->ctl_name || table->procname; table++) { |
1579 | if (!(de = table->de)) | 1584 | if (!(de = table->de)) |
1580 | continue; | 1585 | continue; |
1581 | if (de->mode & S_IFDIR) { | 1586 | if (de->mode & S_IFDIR) { |
@@ -1610,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
1610 | size_t count, loff_t *ppos) | 1615 | size_t count, loff_t *ppos) |
1611 | { | 1616 | { |
1612 | int op; | 1617 | int op; |
1613 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); | 1618 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); |
1614 | struct ctl_table *table; | 1619 | struct ctl_table *table; |
1615 | size_t res; | 1620 | size_t res; |
1616 | ssize_t error = -ENOTDIR; | 1621 | ssize_t error = -ENOTDIR; |
@@ -1749,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
1749 | * Special case of dostring for the UTS structure. This has locks | 1754 | * Special case of dostring for the UTS structure. This has locks |
1750 | * to observe. Should this be in kernel/sys.c ???? | 1755 | * to observe. Should this be in kernel/sys.c ???? |
1751 | */ | 1756 | */ |
1752 | |||
1753 | #ifndef CONFIG_UTS_NS | ||
1754 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
1755 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1756 | { | ||
1757 | int r; | ||
1758 | 1757 | ||
1759 | if (!write) { | ||
1760 | down_read(&uts_sem); | ||
1761 | r=proc_dostring(table,0,filp,buffer,lenp, ppos); | ||
1762 | up_read(&uts_sem); | ||
1763 | } else { | ||
1764 | down_write(&uts_sem); | ||
1765 | r=proc_dostring(table,1,filp,buffer,lenp, ppos); | ||
1766 | up_write(&uts_sem); | ||
1767 | } | ||
1768 | return r; | ||
1769 | } | ||
1770 | #else /* !CONFIG_UTS_NS */ | ||
1771 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 1758 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
1772 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1759 | void __user *buffer, size_t *lenp, loff_t *ppos) |
1773 | { | 1760 | { |
1774 | int r; | 1761 | int r; |
1775 | struct uts_namespace* uts_ns = current->nsproxy->uts_ns; | 1762 | void *which; |
1776 | char* which; | 1763 | which = get_uts(table, write); |
1777 | 1764 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | |
1778 | switch (table->ctl_name) { | 1765 | put_uts(table, write, which); |
1779 | case KERN_OSTYPE: | ||
1780 | which = uts_ns->name.sysname; | ||
1781 | break; | ||
1782 | case KERN_NODENAME: | ||
1783 | which = uts_ns->name.nodename; | ||
1784 | break; | ||
1785 | case KERN_OSRELEASE: | ||
1786 | which = uts_ns->name.release; | ||
1787 | break; | ||
1788 | case KERN_VERSION: | ||
1789 | which = uts_ns->name.version; | ||
1790 | break; | ||
1791 | case KERN_DOMAINNAME: | ||
1792 | which = uts_ns->name.domainname; | ||
1793 | break; | ||
1794 | default: | ||
1795 | r = -EINVAL; | ||
1796 | goto out; | ||
1797 | } | ||
1798 | |||
1799 | if (!write) { | ||
1800 | down_read(&uts_sem); | ||
1801 | r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); | ||
1802 | up_read(&uts_sem); | ||
1803 | } else { | ||
1804 | down_write(&uts_sem); | ||
1805 | r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); | ||
1806 | up_write(&uts_sem); | ||
1807 | } | ||
1808 | out: | ||
1809 | return r; | 1766 | return r; |
1810 | } | 1767 | } |
1811 | #endif /* !CONFIG_UTS_NS */ | ||
1812 | 1768 | ||
1813 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1769 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
1814 | int *valp, | 1770 | int *valp, |
@@ -1880,7 +1836,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table, | |||
1880 | p = buf; | 1836 | p = buf; |
1881 | if (*p == '-' && left > 1) { | 1837 | if (*p == '-' && left > 1) { |
1882 | neg = 1; | 1838 | neg = 1; |
1883 | left--, p++; | 1839 | p++; |
1884 | } | 1840 | } |
1885 | if (*p < '0' || *p > '9') | 1841 | if (*p < '0' || *p > '9') |
1886 | break; | 1842 | break; |
@@ -1972,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, | |||
1972 | 1928 | ||
1973 | #define OP_SET 0 | 1929 | #define OP_SET 0 |
1974 | #define OP_AND 1 | 1930 | #define OP_AND 1 |
1975 | #define OP_OR 2 | ||
1976 | #define OP_MAX 3 | ||
1977 | #define OP_MIN 4 | ||
1978 | 1931 | ||
1979 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | 1932 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, |
1980 | int *valp, | 1933 | int *valp, |
@@ -1986,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | |||
1986 | switch(op) { | 1939 | switch(op) { |
1987 | case OP_SET: *valp = val; break; | 1940 | case OP_SET: *valp = val; break; |
1988 | case OP_AND: *valp &= val; break; | 1941 | case OP_AND: *valp &= val; break; |
1989 | case OP_OR: *valp |= val; break; | ||
1990 | case OP_MAX: if(*valp < val) | ||
1991 | *valp = val; | ||
1992 | break; | ||
1993 | case OP_MIN: if(*valp > val) | ||
1994 | *valp = val; | ||
1995 | break; | ||
1996 | } | 1942 | } |
1997 | } else { | 1943 | } else { |
1998 | int val = *valp; | 1944 | int val = *valp; |
@@ -2131,7 +2077,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, | |||
2131 | p = buf; | 2077 | p = buf; |
2132 | if (*p == '-' && left > 1) { | 2078 | if (*p == '-' && left > 1) { |
2133 | neg = 1; | 2079 | neg = 1; |
2134 | left--, p++; | 2080 | p++; |
2135 | } | 2081 | } |
2136 | if (*p < '0' || *p > '9') | 2082 | if (*p < '0' || *p > '9') |
2137 | break; | 2083 | break; |
@@ -2387,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
2387 | } | 2333 | } |
2388 | 2334 | ||
2389 | #ifdef CONFIG_SYSVIPC | 2335 | #ifdef CONFIG_SYSVIPC |
2390 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 2336 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
2391 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2337 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2392 | { | 2338 | { |
2393 | void *data; | 2339 | void *which; |
2394 | struct ipc_namespace *ns; | 2340 | which = get_ipc(table, write); |
2395 | 2341 | return __do_proc_dointvec(which, table, write, filp, buffer, | |
2396 | ns = current->nsproxy->ipc_ns; | ||
2397 | |||
2398 | switch (table->ctl_name) { | ||
2399 | case KERN_SHMMAX: | ||
2400 | data = &ns->shm_ctlmax; | ||
2401 | goto proc_minmax; | ||
2402 | case KERN_SHMALL: | ||
2403 | data = &ns->shm_ctlall; | ||
2404 | goto proc_minmax; | ||
2405 | case KERN_SHMMNI: | ||
2406 | data = &ns->shm_ctlmni; | ||
2407 | break; | ||
2408 | case KERN_MSGMAX: | ||
2409 | data = &ns->msg_ctlmax; | ||
2410 | break; | ||
2411 | case KERN_MSGMNI: | ||
2412 | data = &ns->msg_ctlmni; | ||
2413 | break; | ||
2414 | case KERN_MSGMNB: | ||
2415 | data = &ns->msg_ctlmnb; | ||
2416 | break; | ||
2417 | case KERN_SEM: | ||
2418 | data = &ns->sem_ctls; | ||
2419 | break; | ||
2420 | default: | ||
2421 | return -EINVAL; | ||
2422 | } | ||
2423 | |||
2424 | return __do_proc_dointvec(data, table, write, filp, buffer, | ||
2425 | lenp, ppos, NULL, NULL); | 2342 | lenp, ppos, NULL, NULL); |
2426 | proc_minmax: | 2343 | } |
2427 | return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, | 2344 | |
2345 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2346 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2347 | { | ||
2348 | void *which; | ||
2349 | which = get_ipc(table, write); | ||
2350 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
2428 | lenp, ppos, 1l, 1l); | 2351 | lenp, ppos, 1l, 1l); |
2429 | } | 2352 | } |
2353 | |||
2430 | #endif | 2354 | #endif |
2431 | 2355 | ||
2432 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2356 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
@@ -2471,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | |||
2471 | { | 2395 | { |
2472 | return -ENOSYS; | 2396 | return -ENOSYS; |
2473 | } | 2397 | } |
2398 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2399 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2400 | { | ||
2401 | return -ENOSYS; | ||
2402 | } | ||
2403 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2404 | struct file *filp, void __user *buffer, | ||
2405 | size_t *lenp, loff_t *ppos) | ||
2406 | { | ||
2407 | return -ENOSYS; | ||
2408 | } | ||
2474 | #endif | 2409 | #endif |
2475 | 2410 | ||
2476 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2411 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
@@ -2535,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | |||
2535 | /* The generic string strategy routine: */ | 2470 | /* The generic string strategy routine: */ |
2536 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2471 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
2537 | void __user *oldval, size_t __user *oldlenp, | 2472 | void __user *oldval, size_t __user *oldlenp, |
2538 | void __user *newval, size_t newlen, void **context) | 2473 | void __user *newval, size_t newlen) |
2539 | { | 2474 | { |
2540 | if (!table->data || !table->maxlen) | 2475 | if (!table->data || !table->maxlen) |
2541 | return -ENOTDIR; | 2476 | return -ENOTDIR; |
@@ -2581,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
2581 | */ | 2516 | */ |
2582 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2517 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
2583 | void __user *oldval, size_t __user *oldlenp, | 2518 | void __user *oldval, size_t __user *oldlenp, |
2584 | void __user *newval, size_t newlen, void **context) | 2519 | void __user *newval, size_t newlen) |
2585 | { | 2520 | { |
2586 | 2521 | ||
2587 | if (newval && newlen) { | 2522 | if (newval && newlen) { |
@@ -2617,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | |||
2617 | /* Strategy function to convert jiffies to seconds */ | 2552 | /* Strategy function to convert jiffies to seconds */ |
2618 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2553 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
2619 | void __user *oldval, size_t __user *oldlenp, | 2554 | void __user *oldval, size_t __user *oldlenp, |
2620 | void __user *newval, size_t newlen, void **context) | 2555 | void __user *newval, size_t newlen) |
2621 | { | 2556 | { |
2622 | if (oldval) { | 2557 | if (oldval) { |
2623 | size_t olen; | 2558 | size_t olen; |
@@ -2645,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2645 | /* Strategy function to convert jiffies to seconds */ | 2580 | /* Strategy function to convert jiffies to seconds */ |
2646 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2581 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
2647 | void __user *oldval, size_t __user *oldlenp, | 2582 | void __user *oldval, size_t __user *oldlenp, |
2648 | void __user *newval, size_t newlen, void **context) | 2583 | void __user *newval, size_t newlen) |
2649 | { | 2584 | { |
2650 | if (oldval) { | 2585 | if (oldval) { |
2651 | size_t olen; | 2586 | size_t olen; |
@@ -2670,50 +2605,140 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2670 | return 1; | 2605 | return 1; |
2671 | } | 2606 | } |
2672 | 2607 | ||
2608 | |||
2609 | /* The generic string strategy routine: */ | ||
2610 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2611 | void __user *oldval, size_t __user *oldlenp, | ||
2612 | void __user *newval, size_t newlen) | ||
2613 | { | ||
2614 | struct ctl_table uts_table; | ||
2615 | int r, write; | ||
2616 | write = newval && newlen; | ||
2617 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
2618 | uts_table.data = get_uts(table, write); | ||
2619 | r = sysctl_string(&uts_table, name, nlen, | ||
2620 | oldval, oldlenp, newval, newlen); | ||
2621 | put_uts(table, write, uts_table.data); | ||
2622 | return r; | ||
2623 | } | ||
2624 | |||
2625 | #ifdef CONFIG_SYSVIPC | ||
2626 | /* The generic sysctl ipc data routine. */ | ||
2627 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2628 | void __user *oldval, size_t __user *oldlenp, | ||
2629 | void __user *newval, size_t newlen) | ||
2630 | { | ||
2631 | size_t len; | ||
2632 | void *data; | ||
2633 | |||
2634 | /* Get out of I don't have a variable */ | ||
2635 | if (!table->data || !table->maxlen) | ||
2636 | return -ENOTDIR; | ||
2637 | |||
2638 | data = get_ipc(table, 1); | ||
2639 | if (!data) | ||
2640 | return -ENOTDIR; | ||
2641 | |||
2642 | if (oldval && oldlenp) { | ||
2643 | if (get_user(len, oldlenp)) | ||
2644 | return -EFAULT; | ||
2645 | if (len) { | ||
2646 | if (len > table->maxlen) | ||
2647 | len = table->maxlen; | ||
2648 | if (copy_to_user(oldval, data, len)) | ||
2649 | return -EFAULT; | ||
2650 | if (put_user(len, oldlenp)) | ||
2651 | return -EFAULT; | ||
2652 | } | ||
2653 | } | ||
2654 | |||
2655 | if (newval && newlen) { | ||
2656 | if (newlen > table->maxlen) | ||
2657 | newlen = table->maxlen; | ||
2658 | |||
2659 | if (copy_from_user(data, newval, newlen)) | ||
2660 | return -EFAULT; | ||
2661 | } | ||
2662 | return 1; | ||
2663 | } | ||
2664 | #endif | ||
2665 | |||
2673 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2666 | #else /* CONFIG_SYSCTL_SYSCALL */ |
2674 | 2667 | ||
2675 | 2668 | ||
2676 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | 2669 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) |
2677 | { | 2670 | { |
2678 | static int msg_count; | 2671 | static int msg_count; |
2672 | struct __sysctl_args tmp; | ||
2673 | int name[CTL_MAXNAME]; | ||
2674 | int i; | ||
2675 | |||
2676 | /* Read in the sysctl name for better debug message logging */ | ||
2677 | if (copy_from_user(&tmp, args, sizeof(tmp))) | ||
2678 | return -EFAULT; | ||
2679 | if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME) | ||
2680 | return -ENOTDIR; | ||
2681 | for (i = 0; i < tmp.nlen; i++) | ||
2682 | if (get_user(name[i], tmp.name + i)) | ||
2683 | return -EFAULT; | ||
2684 | |||
2685 | /* Ignore accesses to kernel.version */ | ||
2686 | if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) | ||
2687 | goto out; | ||
2679 | 2688 | ||
2680 | if (msg_count < 5) { | 2689 | if (msg_count < 5) { |
2681 | msg_count++; | 2690 | msg_count++; |
2682 | printk(KERN_INFO | 2691 | printk(KERN_INFO |
2683 | "warning: process `%s' used the removed sysctl " | 2692 | "warning: process `%s' used the removed sysctl " |
2684 | "system call\n", current->comm); | 2693 | "system call with ", current->comm); |
2694 | for (i = 0; i < tmp.nlen; i++) | ||
2695 | printk("%d.", name[i]); | ||
2696 | printk("\n"); | ||
2685 | } | 2697 | } |
2698 | out: | ||
2686 | return -ENOSYS; | 2699 | return -ENOSYS; |
2687 | } | 2700 | } |
2688 | 2701 | ||
2689 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2702 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
2690 | void __user *oldval, size_t __user *oldlenp, | 2703 | void __user *oldval, size_t __user *oldlenp, |
2691 | void __user *newval, size_t newlen, void **context) | 2704 | void __user *newval, size_t newlen) |
2692 | { | 2705 | { |
2693 | return -ENOSYS; | 2706 | return -ENOSYS; |
2694 | } | 2707 | } |
2695 | 2708 | ||
2696 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2709 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
2697 | void __user *oldval, size_t __user *oldlenp, | 2710 | void __user *oldval, size_t __user *oldlenp, |
2698 | void __user *newval, size_t newlen, void **context) | 2711 | void __user *newval, size_t newlen) |
2699 | { | 2712 | { |
2700 | return -ENOSYS; | 2713 | return -ENOSYS; |
2701 | } | 2714 | } |
2702 | 2715 | ||
2703 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2716 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
2704 | void __user *oldval, size_t __user *oldlenp, | 2717 | void __user *oldval, size_t __user *oldlenp, |
2705 | void __user *newval, size_t newlen, void **context) | 2718 | void __user *newval, size_t newlen) |
2706 | { | 2719 | { |
2707 | return -ENOSYS; | 2720 | return -ENOSYS; |
2708 | } | 2721 | } |
2709 | 2722 | ||
2710 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2723 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
2711 | void __user *oldval, size_t __user *oldlenp, | 2724 | void __user *oldval, size_t __user *oldlenp, |
2712 | void __user *newval, size_t newlen, void **context) | 2725 | void __user *newval, size_t newlen) |
2713 | { | 2726 | { |
2714 | return -ENOSYS; | 2727 | return -ENOSYS; |
2715 | } | 2728 | } |
2716 | 2729 | ||
2730 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2731 | void __user *oldval, size_t __user *oldlenp, | ||
2732 | void __user *newval, size_t newlen) | ||
2733 | { | ||
2734 | return -ENOSYS; | ||
2735 | } | ||
2736 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2737 | void __user *oldval, size_t __user *oldlenp, | ||
2738 | void __user *newval, size_t newlen) | ||
2739 | { | ||
2740 | return -ENOSYS; | ||
2741 | } | ||
2717 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2742 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
2718 | 2743 | ||
2719 | /* | 2744 | /* |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5d6a8c54ee85..4c3476fa058d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -34,7 +34,7 @@ | |||
34 | 34 | ||
35 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | 35 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; |
36 | static int family_registered; | 36 | static int family_registered; |
37 | kmem_cache_t *taskstats_cache; | 37 | struct kmem_cache *taskstats_cache; |
38 | 38 | ||
39 | static struct genl_family family = { | 39 | static struct genl_family family = { |
40 | .id = GENL_ID_GENERATE, | 40 | .id = GENL_ID_GENERATE, |
@@ -69,7 +69,7 @@ enum actions { | |||
69 | }; | 69 | }; |
70 | 70 | ||
71 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | 71 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, |
72 | void **replyp, size_t size) | 72 | size_t size) |
73 | { | 73 | { |
74 | struct sk_buff *skb; | 74 | struct sk_buff *skb; |
75 | void *reply; | 75 | void *reply; |
@@ -77,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
77 | /* | 77 | /* |
78 | * If new attributes are added, please revisit this allocation | 78 | * If new attributes are added, please revisit this allocation |
79 | */ | 79 | */ |
80 | skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); | 80 | skb = genlmsg_new(size, GFP_KERNEL); |
81 | if (!skb) | 81 | if (!skb) |
82 | return -ENOMEM; | 82 | return -ENOMEM; |
83 | 83 | ||
@@ -85,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
85 | int seq = get_cpu_var(taskstats_seqnum)++; | 85 | int seq = get_cpu_var(taskstats_seqnum)++; |
86 | put_cpu_var(taskstats_seqnum); | 86 | put_cpu_var(taskstats_seqnum); |
87 | 87 | ||
88 | reply = genlmsg_put(skb, 0, seq, | 88 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
89 | family.id, 0, 0, | ||
90 | cmd, family.version); | ||
91 | } else | 89 | } else |
92 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | 90 | reply = genlmsg_put_reply(skb, info, &family, 0, cmd); |
93 | family.id, 0, 0, | ||
94 | cmd, family.version); | ||
95 | if (reply == NULL) { | 91 | if (reply == NULL) { |
96 | nlmsg_free(skb); | 92 | nlmsg_free(skb); |
97 | return -EINVAL; | 93 | return -EINVAL; |
98 | } | 94 | } |
99 | 95 | ||
100 | *skbp = skb; | 96 | *skbp = skb; |
101 | *replyp = reply; | ||
102 | return 0; | 97 | return 0; |
103 | } | 98 | } |
104 | 99 | ||
@@ -123,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid) | |||
123 | /* | 118 | /* |
124 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | 119 | * Send taskstats data in @skb to listeners registered for @cpu's exit data |
125 | */ | 120 | */ |
126 | static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | 121 | static void send_cpu_listeners(struct sk_buff *skb, |
122 | struct listener_list *listeners) | ||
127 | { | 123 | { |
128 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | 124 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); |
129 | struct listener_list *listeners; | ||
130 | struct listener *s, *tmp; | 125 | struct listener *s, *tmp; |
131 | struct sk_buff *skb_next, *skb_cur = skb; | 126 | struct sk_buff *skb_next, *skb_cur = skb; |
132 | void *reply = genlmsg_data(genlhdr); | 127 | void *reply = genlmsg_data(genlhdr); |
@@ -139,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | |||
139 | } | 134 | } |
140 | 135 | ||
141 | rc = 0; | 136 | rc = 0; |
142 | listeners = &per_cpu(listener_array, cpu); | ||
143 | down_read(&listeners->sem); | 137 | down_read(&listeners->sem); |
144 | list_for_each_entry(s, &listeners->list, list) { | 138 | list_for_each_entry(s, &listeners->list, list) { |
145 | skb_next = NULL; | 139 | skb_next = NULL; |
@@ -174,24 +168,23 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | |||
174 | up_write(&listeners->sem); | 168 | up_write(&listeners->sem); |
175 | } | 169 | } |
176 | 170 | ||
177 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, | 171 | static int fill_pid(pid_t pid, struct task_struct *tsk, |
178 | struct taskstats *stats) | 172 | struct taskstats *stats) |
179 | { | 173 | { |
180 | int rc = 0; | 174 | int rc = 0; |
181 | struct task_struct *tsk = pidtsk; | ||
182 | 175 | ||
183 | if (!pidtsk) { | 176 | if (!tsk) { |
184 | read_lock(&tasklist_lock); | 177 | rcu_read_lock(); |
185 | tsk = find_task_by_pid(pid); | 178 | tsk = find_task_by_pid(pid); |
186 | if (!tsk) { | 179 | if (tsk) |
187 | read_unlock(&tasklist_lock); | 180 | get_task_struct(tsk); |
181 | rcu_read_unlock(); | ||
182 | if (!tsk) | ||
188 | return -ESRCH; | 183 | return -ESRCH; |
189 | } | ||
190 | get_task_struct(tsk); | ||
191 | read_unlock(&tasklist_lock); | ||
192 | } else | 184 | } else |
193 | get_task_struct(tsk); | 185 | get_task_struct(tsk); |
194 | 186 | ||
187 | memset(stats, 0, sizeof(*stats)); | ||
195 | /* | 188 | /* |
196 | * Each accounting subsystem adds calls to its functions to | 189 | * Each accounting subsystem adds calls to its functions to |
197 | * fill in relevant parts of struct taskstsats as follows | 190 | * fill in relevant parts of struct taskstsats as follows |
@@ -214,39 +207,32 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, | |||
214 | 207 | ||
215 | } | 208 | } |
216 | 209 | ||
217 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | 210 | static int fill_tgid(pid_t tgid, struct task_struct *first, |
218 | struct taskstats *stats) | 211 | struct taskstats *stats) |
219 | { | 212 | { |
220 | struct task_struct *tsk, *first; | 213 | struct task_struct *tsk; |
221 | unsigned long flags; | 214 | unsigned long flags; |
215 | int rc = -ESRCH; | ||
222 | 216 | ||
223 | /* | 217 | /* |
224 | * Add additional stats from live tasks except zombie thread group | 218 | * Add additional stats from live tasks except zombie thread group |
225 | * leaders who are already counted with the dead tasks | 219 | * leaders who are already counted with the dead tasks |
226 | */ | 220 | */ |
227 | first = tgidtsk; | 221 | rcu_read_lock(); |
228 | if (!first) { | 222 | if (!first) |
229 | read_lock(&tasklist_lock); | ||
230 | first = find_task_by_pid(tgid); | 223 | first = find_task_by_pid(tgid); |
231 | if (!first) { | ||
232 | read_unlock(&tasklist_lock); | ||
233 | return -ESRCH; | ||
234 | } | ||
235 | get_task_struct(first); | ||
236 | read_unlock(&tasklist_lock); | ||
237 | } else | ||
238 | get_task_struct(first); | ||
239 | 224 | ||
240 | /* Start with stats from dead tasks */ | 225 | if (!first || !lock_task_sighand(first, &flags)) |
241 | spin_lock_irqsave(&first->signal->stats_lock, flags); | 226 | goto out; |
227 | |||
242 | if (first->signal->stats) | 228 | if (first->signal->stats) |
243 | memcpy(stats, first->signal->stats, sizeof(*stats)); | 229 | memcpy(stats, first->signal->stats, sizeof(*stats)); |
244 | spin_unlock_irqrestore(&first->signal->stats_lock, flags); | 230 | else |
231 | memset(stats, 0, sizeof(*stats)); | ||
245 | 232 | ||
246 | tsk = first; | 233 | tsk = first; |
247 | read_lock(&tasklist_lock); | ||
248 | do { | 234 | do { |
249 | if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) | 235 | if (tsk->exit_state) |
250 | continue; | 236 | continue; |
251 | /* | 237 | /* |
252 | * Accounting subsystem can call its functions here to | 238 | * Accounting subsystem can call its functions here to |
@@ -257,15 +243,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | |||
257 | delayacct_add_tsk(stats, tsk); | 243 | delayacct_add_tsk(stats, tsk); |
258 | 244 | ||
259 | } while_each_thread(first, tsk); | 245 | } while_each_thread(first, tsk); |
260 | read_unlock(&tasklist_lock); | ||
261 | stats->version = TASKSTATS_VERSION; | ||
262 | 246 | ||
247 | unlock_task_sighand(first, &flags); | ||
248 | rc = 0; | ||
249 | out: | ||
250 | rcu_read_unlock(); | ||
251 | |||
252 | stats->version = TASKSTATS_VERSION; | ||
263 | /* | 253 | /* |
264 | * Accounting subsytems can also add calls here to modify | 254 | * Accounting subsytems can also add calls here to modify |
265 | * fields of taskstats. | 255 | * fields of taskstats. |
266 | */ | 256 | */ |
267 | 257 | return rc; | |
268 | return 0; | ||
269 | } | 258 | } |
270 | 259 | ||
271 | 260 | ||
@@ -273,7 +262,7 @@ static void fill_tgid_exit(struct task_struct *tsk) | |||
273 | { | 262 | { |
274 | unsigned long flags; | 263 | unsigned long flags; |
275 | 264 | ||
276 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | 265 | spin_lock_irqsave(&tsk->sighand->siglock, flags); |
277 | if (!tsk->signal->stats) | 266 | if (!tsk->signal->stats) |
278 | goto ret; | 267 | goto ret; |
279 | 268 | ||
@@ -285,7 +274,7 @@ static void fill_tgid_exit(struct task_struct *tsk) | |||
285 | */ | 274 | */ |
286 | delayacct_add_tsk(tsk->signal->stats, tsk); | 275 | delayacct_add_tsk(tsk->signal->stats, tsk); |
287 | ret: | 276 | ret: |
288 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | 277 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); |
289 | return; | 278 | return; |
290 | } | 279 | } |
291 | 280 | ||
@@ -356,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask) | |||
356 | return ret; | 345 | return ret; |
357 | } | 346 | } |
358 | 347 | ||
348 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | ||
349 | { | ||
350 | struct nlattr *na, *ret; | ||
351 | int aggr; | ||
352 | |||
353 | aggr = (type == TASKSTATS_TYPE_PID) | ||
354 | ? TASKSTATS_TYPE_AGGR_PID | ||
355 | : TASKSTATS_TYPE_AGGR_TGID; | ||
356 | |||
357 | na = nla_nest_start(skb, aggr); | ||
358 | if (!na) | ||
359 | goto err; | ||
360 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | ||
361 | goto err; | ||
362 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | ||
363 | if (!ret) | ||
364 | goto err; | ||
365 | nla_nest_end(skb, na); | ||
366 | |||
367 | return nla_data(ret); | ||
368 | err: | ||
369 | return NULL; | ||
370 | } | ||
371 | |||
359 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | 372 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) |
360 | { | 373 | { |
361 | int rc = 0; | 374 | int rc = 0; |
362 | struct sk_buff *rep_skb; | 375 | struct sk_buff *rep_skb; |
363 | struct taskstats stats; | 376 | struct taskstats *stats; |
364 | void *reply; | ||
365 | size_t size; | 377 | size_t size; |
366 | struct nlattr *na; | ||
367 | cpumask_t mask; | 378 | cpumask_t mask; |
368 | 379 | ||
369 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | 380 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); |
@@ -384,146 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
384 | size = nla_total_size(sizeof(u32)) + | 395 | size = nla_total_size(sizeof(u32)) + |
385 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 396 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
386 | 397 | ||
387 | memset(&stats, 0, sizeof(stats)); | 398 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
388 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
389 | if (rc < 0) | 399 | if (rc < 0) |
390 | return rc; | 400 | return rc; |
391 | 401 | ||
402 | rc = -EINVAL; | ||
392 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | 403 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { |
393 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | 404 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); |
394 | rc = fill_pid(pid, NULL, &stats); | 405 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); |
395 | if (rc < 0) | 406 | if (!stats) |
396 | goto err; | 407 | goto err; |
397 | 408 | ||
398 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | 409 | rc = fill_pid(pid, NULL, stats); |
399 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | 410 | if (rc < 0) |
400 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 411 | goto err; |
401 | stats); | ||
402 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | 412 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { |
403 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | 413 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); |
404 | rc = fill_tgid(tgid, NULL, &stats); | 414 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); |
405 | if (rc < 0) | 415 | if (!stats) |
406 | goto err; | 416 | goto err; |
407 | 417 | ||
408 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | 418 | rc = fill_tgid(tgid, NULL, stats); |
409 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | 419 | if (rc < 0) |
410 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 420 | goto err; |
411 | stats); | 421 | } else |
412 | } else { | ||
413 | rc = -EINVAL; | ||
414 | goto err; | 422 | goto err; |
415 | } | ||
416 | |||
417 | nla_nest_end(rep_skb, na); | ||
418 | 423 | ||
419 | return send_reply(rep_skb, info->snd_pid); | 424 | return send_reply(rep_skb, info->snd_pid); |
420 | |||
421 | nla_put_failure: | ||
422 | return genlmsg_cancel(rep_skb, reply); | ||
423 | err: | 425 | err: |
424 | nlmsg_free(rep_skb); | 426 | nlmsg_free(rep_skb); |
425 | return rc; | 427 | return rc; |
426 | } | 428 | } |
427 | 429 | ||
428 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) | 430 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) |
429 | { | 431 | { |
430 | struct listener_list *listeners; | 432 | struct signal_struct *sig = tsk->signal; |
431 | struct taskstats *tmp; | 433 | struct taskstats *stats; |
432 | /* | ||
433 | * This is the cpu on which the task is exiting currently and will | ||
434 | * be the one for which the exit event is sent, even if the cpu | ||
435 | * on which this function is running changes later. | ||
436 | */ | ||
437 | *mycpu = raw_smp_processor_id(); | ||
438 | 434 | ||
439 | *ptidstats = NULL; | 435 | if (sig->stats || thread_group_empty(tsk)) |
440 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | 436 | goto ret; |
441 | if (!tmp) | ||
442 | return; | ||
443 | 437 | ||
444 | listeners = &per_cpu(listener_array, *mycpu); | 438 | /* No problem if kmem_cache_zalloc() fails */ |
445 | down_read(&listeners->sem); | 439 | stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); |
446 | if (!list_empty(&listeners->list)) { | 440 | |
447 | *ptidstats = tmp; | 441 | spin_lock_irq(&tsk->sighand->siglock); |
448 | tmp = NULL; | 442 | if (!sig->stats) { |
443 | sig->stats = stats; | ||
444 | stats = NULL; | ||
449 | } | 445 | } |
450 | up_read(&listeners->sem); | 446 | spin_unlock_irq(&tsk->sighand->siglock); |
451 | kfree(tmp); | 447 | |
448 | if (stats) | ||
449 | kmem_cache_free(taskstats_cache, stats); | ||
450 | ret: | ||
451 | return sig->stats; | ||
452 | } | 452 | } |
453 | 453 | ||
454 | /* Send pid data out on exit */ | 454 | /* Send pid data out on exit */ |
455 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | 455 | void taskstats_exit(struct task_struct *tsk, int group_dead) |
456 | int group_dead, unsigned int mycpu) | ||
457 | { | 456 | { |
458 | int rc; | 457 | int rc; |
458 | struct listener_list *listeners; | ||
459 | struct taskstats *stats; | ||
459 | struct sk_buff *rep_skb; | 460 | struct sk_buff *rep_skb; |
460 | void *reply; | ||
461 | size_t size; | 461 | size_t size; |
462 | int is_thread_group; | 462 | int is_thread_group; |
463 | struct nlattr *na; | ||
464 | unsigned long flags; | ||
465 | 463 | ||
466 | if (!family_registered || !tidstats) | 464 | if (!family_registered) |
467 | return; | 465 | return; |
468 | 466 | ||
469 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
470 | is_thread_group = tsk->signal->stats ? 1 : 0; | ||
471 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
472 | |||
473 | rc = 0; | ||
474 | /* | 467 | /* |
475 | * Size includes space for nested attributes | 468 | * Size includes space for nested attributes |
476 | */ | 469 | */ |
477 | size = nla_total_size(sizeof(u32)) + | 470 | size = nla_total_size(sizeof(u32)) + |
478 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 471 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
479 | 472 | ||
480 | if (is_thread_group) | 473 | is_thread_group = !!taskstats_tgid_alloc(tsk); |
481 | size = 2 * size; /* PID + STATS + TGID + STATS */ | 474 | if (is_thread_group) { |
475 | /* PID + STATS + TGID + STATS */ | ||
476 | size = 2 * size; | ||
477 | /* fill the tsk->signal->stats structure */ | ||
478 | fill_tgid_exit(tsk); | ||
479 | } | ||
482 | 480 | ||
483 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | 481 | listeners = &__raw_get_cpu_var(listener_array); |
484 | if (rc < 0) | 482 | if (list_empty(&listeners->list)) |
485 | goto ret; | 483 | return; |
486 | 484 | ||
487 | rc = fill_pid(tsk->pid, tsk, tidstats); | 485 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); |
488 | if (rc < 0) | 486 | if (rc < 0) |
489 | goto err_skb; | 487 | return; |
490 | 488 | ||
491 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | 489 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); |
492 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | 490 | if (!stats) |
493 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 491 | goto err; |
494 | *tidstats); | ||
495 | nla_nest_end(rep_skb, na); | ||
496 | 492 | ||
497 | if (!is_thread_group) | 493 | rc = fill_pid(tsk->pid, tsk, stats); |
498 | goto send; | 494 | if (rc < 0) |
495 | goto err; | ||
499 | 496 | ||
500 | /* | 497 | /* |
501 | * tsk has/had a thread group so fill the tsk->signal->stats structure | ||
502 | * Doesn't matter if tsk is the leader or the last group member leaving | 498 | * Doesn't matter if tsk is the leader or the last group member leaving |
503 | */ | 499 | */ |
504 | 500 | if (!is_thread_group || !group_dead) | |
505 | fill_tgid_exit(tsk); | ||
506 | if (!group_dead) | ||
507 | goto send; | 501 | goto send; |
508 | 502 | ||
509 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | 503 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); |
510 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | 504 | if (!stats) |
511 | /* No locking needed for tsk->signal->stats since group is dead */ | 505 | goto err; |
512 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 506 | |
513 | *tsk->signal->stats); | 507 | memcpy(stats, tsk->signal->stats, sizeof(*stats)); |
514 | nla_nest_end(rep_skb, na); | ||
515 | 508 | ||
516 | send: | 509 | send: |
517 | send_cpu_listeners(rep_skb, mycpu); | 510 | send_cpu_listeners(rep_skb, listeners); |
518 | return; | 511 | return; |
519 | 512 | err: | |
520 | nla_put_failure: | ||
521 | genlmsg_cancel(rep_skb, reply); | ||
522 | goto ret; | ||
523 | err_skb: | ||
524 | nlmsg_free(rep_skb); | 513 | nlmsg_free(rep_skb); |
525 | ret: | ||
526 | return; | ||
527 | } | 514 | } |
528 | 515 | ||
529 | static struct genl_ops taskstats_ops = { | 516 | static struct genl_ops taskstats_ops = { |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) | |||
156 | /* check if clocksource is already registered */ | 156 | /* check if clocksource is already registered */ |
157 | if (is_registered_source(c)) { | 157 | if (is_registered_source(c)) { |
158 | printk("register_clocksource: Cannot register %s. " | 158 | printk("register_clocksource: Cannot register %s. " |
159 | "Already registered!", c->name); | 159 | "Already registered!", c->name); |
160 | ret = -EBUSY; | 160 | ret = -EBUSY; |
161 | } else { | 161 | } else { |
162 | /* register it */ | 162 | /* register it */ |
@@ -186,6 +186,7 @@ void clocksource_reselect(void) | |||
186 | } | 186 | } |
187 | EXPORT_SYMBOL(clocksource_reselect); | 187 | EXPORT_SYMBOL(clocksource_reselect); |
188 | 188 | ||
189 | #ifdef CONFIG_SYSFS | ||
189 | /** | 190 | /** |
190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 191 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
191 | * @dev: unused | 192 | * @dev: unused |
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | |||
275 | * Sysfs setup bits: | 276 | * Sysfs setup bits: |
276 | */ | 277 | */ |
277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | 278 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, |
278 | sysfs_override_clocksource); | 279 | sysfs_override_clocksource); |
279 | 280 | ||
280 | static SYSDEV_ATTR(available_clocksource, 0600, | 281 | static SYSDEV_ATTR(available_clocksource, 0600, |
281 | sysfs_show_available_clocksources, NULL); | 282 | sysfs_show_available_clocksources, NULL); |
282 | 283 | ||
283 | static struct sysdev_class clocksource_sysclass = { | 284 | static struct sysdev_class clocksource_sysclass = { |
284 | set_kset_name("clocksource"), | 285 | set_kset_name("clocksource"), |
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) | |||
307 | } | 308 | } |
308 | 309 | ||
309 | device_initcall(init_clocksource_sysfs); | 310 | device_initcall(init_clocksource_sysfs); |
311 | #endif /* CONFIG_SYSFS */ | ||
310 | 312 | ||
311 | /** | 313 | /** |
312 | * boot_override_clocksource - boot clock override | 314 | * boot_override_clocksource - boot clock override |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 126bb30c4afe..a99b2a6e6a07 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -57,7 +57,7 @@ static cycle_t jiffies_read(void) | |||
57 | 57 | ||
58 | struct clocksource clocksource_jiffies = { | 58 | struct clocksource clocksource_jiffies = { |
59 | .name = "jiffies", | 59 | .name = "jiffies", |
60 | .rating = 0, /* lowest rating*/ | 60 | .rating = 1, /* lowest valid rating*/ |
61 | .read = jiffies_read, | 61 | .read = jiffies_read, |
62 | .mask = 0xffffffff, /*32bits*/ | 62 | .mask = 0xffffffff, /*32bits*/ |
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 47195fa0ec4f..3afeaa3a73f9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -161,9 +161,9 @@ void second_overflow(void) | |||
161 | time_adjust += MAX_TICKADJ; | 161 | time_adjust += MAX_TICKADJ; |
162 | tick_length -= MAX_TICKADJ_SCALED; | 162 | tick_length -= MAX_TICKADJ_SCALED; |
163 | } else { | 163 | } else { |
164 | time_adjust = 0; | ||
165 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / | 164 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / |
166 | HZ) << TICK_LENGTH_SHIFT; | 165 | HZ) << TICK_LENGTH_SHIFT; |
166 | time_adjust = 0; | ||
167 | } | 167 | } |
168 | } | 168 | } |
169 | } | 169 | } |
diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffec1..0256ab443d8a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; | |||
80 | EXPORT_SYMBOL(boot_tvec_bases); | 80 | EXPORT_SYMBOL(boot_tvec_bases); |
81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
82 | 82 | ||
83 | /** | ||
84 | * __round_jiffies - function to round jiffies to a full second | ||
85 | * @j: the time in (absolute) jiffies that should be rounded | ||
86 | * @cpu: the processor number on which the timeout will happen | ||
87 | * | ||
88 | * __round_jiffies rounds an absolute time in the future (in jiffies) | ||
89 | * up or down to (approximately) full seconds. This is useful for timers | ||
90 | * for which the exact time they fire does not matter too much, as long as | ||
91 | * they fire approximately every X seconds. | ||
92 | * | ||
93 | * By rounding these timers to whole seconds, all such timers will fire | ||
94 | * at the same time, rather than at various times spread out. The goal | ||
95 | * of this is to have the CPU wake up less, which saves power. | ||
96 | * | ||
97 | * The exact rounding is skewed for each processor to avoid all | ||
98 | * processors firing at the exact same time, which could lead | ||
99 | * to lock contention or spurious cache line bouncing. | ||
100 | * | ||
101 | * The return value is the rounded version of the "j" parameter. | ||
102 | */ | ||
103 | unsigned long __round_jiffies(unsigned long j, int cpu) | ||
104 | { | ||
105 | int rem; | ||
106 | unsigned long original = j; | ||
107 | |||
108 | /* | ||
109 | * We don't want all cpus firing their timers at once hitting the | ||
110 | * same lock or cachelines, so we skew each extra cpu with an extra | ||
111 | * 3 jiffies. This 3 jiffies came originally from the mm/ code which | ||
112 | * already did this. | ||
113 | * The skew is done by adding 3*cpunr, then round, then subtract this | ||
114 | * extra offset again. | ||
115 | */ | ||
116 | j += cpu * 3; | ||
117 | |||
118 | rem = j % HZ; | ||
119 | |||
120 | /* | ||
121 | * If the target jiffie is just after a whole second (which can happen | ||
122 | * due to delays of the timer irq, long irq off times etc etc) then | ||
123 | * we should round down to the whole second, not up. Use 1/4th second | ||
124 | * as cutoff for this rounding as an extreme upper bound for this. | ||
125 | */ | ||
126 | if (rem < HZ/4) /* round down */ | ||
127 | j = j - rem; | ||
128 | else /* round up */ | ||
129 | j = j - rem + HZ; | ||
130 | |||
131 | /* now that we have rounded, subtract the extra skew again */ | ||
132 | j -= cpu * 3; | ||
133 | |||
134 | if (j <= jiffies) /* rounding ate our timeout entirely; */ | ||
135 | return original; | ||
136 | return j; | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(__round_jiffies); | ||
139 | |||
140 | /** | ||
141 | * __round_jiffies_relative - function to round jiffies to a full second | ||
142 | * @j: the time in (relative) jiffies that should be rounded | ||
143 | * @cpu: the processor number on which the timeout will happen | ||
144 | * | ||
145 | * __round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
146 | * up or down to (approximately) full seconds. This is useful for timers | ||
147 | * for which the exact time they fire does not matter too much, as long as | ||
148 | * they fire approximately every X seconds. | ||
149 | * | ||
150 | * By rounding these timers to whole seconds, all such timers will fire | ||
151 | * at the same time, rather than at various times spread out. The goal | ||
152 | * of this is to have the CPU wake up less, which saves power. | ||
153 | * | ||
154 | * The exact rounding is skewed for each processor to avoid all | ||
155 | * processors firing at the exact same time, which could lead | ||
156 | * to lock contention or spurious cache line bouncing. | ||
157 | * | ||
158 | * The return value is the rounded version of the "j" parameter. | ||
159 | */ | ||
160 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) | ||
161 | { | ||
162 | /* | ||
163 | * In theory the following code can skip a jiffy in case jiffies | ||
164 | * increments right between the addition and the later subtraction. | ||
165 | * However since the entire point of this function is to use approximate | ||
166 | * timeouts, it's entirely ok to not handle that. | ||
167 | */ | ||
168 | return __round_jiffies(j + jiffies, cpu) - jiffies; | ||
169 | } | ||
170 | EXPORT_SYMBOL_GPL(__round_jiffies_relative); | ||
171 | |||
172 | /** | ||
173 | * round_jiffies - function to round jiffies to a full second | ||
174 | * @j: the time in (absolute) jiffies that should be rounded | ||
175 | * | ||
176 | * round_jiffies rounds an absolute time in the future (in jiffies) | ||
177 | * up or down to (approximately) full seconds. This is useful for timers | ||
178 | * for which the exact time they fire does not matter too much, as long as | ||
179 | * they fire approximately every X seconds. | ||
180 | * | ||
181 | * By rounding these timers to whole seconds, all such timers will fire | ||
182 | * at the same time, rather than at various times spread out. The goal | ||
183 | * of this is to have the CPU wake up less, which saves power. | ||
184 | * | ||
185 | * The return value is the rounded version of the "j" parameter. | ||
186 | */ | ||
187 | unsigned long round_jiffies(unsigned long j) | ||
188 | { | ||
189 | return __round_jiffies(j, raw_smp_processor_id()); | ||
190 | } | ||
191 | EXPORT_SYMBOL_GPL(round_jiffies); | ||
192 | |||
193 | /** | ||
194 | * round_jiffies_relative - function to round jiffies to a full second | ||
195 | * @j: the time in (relative) jiffies that should be rounded | ||
196 | * | ||
197 | * round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
198 | * up or down to (approximately) full seconds. This is useful for timers | ||
199 | * for which the exact time they fire does not matter too much, as long as | ||
200 | * they fire approximately every X seconds. | ||
201 | * | ||
202 | * By rounding these timers to whole seconds, all such timers will fire | ||
203 | * at the same time, rather than at various times spread out. The goal | ||
204 | * of this is to have the CPU wake up less, which saves power. | ||
205 | * | ||
206 | * The return value is the rounded version of the "j" parameter. | ||
207 | */ | ||
208 | unsigned long round_jiffies_relative(unsigned long j) | ||
209 | { | ||
210 | return __round_jiffies_relative(j, raw_smp_processor_id()); | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | ||
213 | |||
214 | |||
83 | static inline void set_running_timer(tvec_base_t *base, | 215 | static inline void set_running_timer(tvec_base_t *base, |
84 | struct timer_list *timer) | 216 | struct timer_list *timer) |
85 | { | 217 | { |
@@ -714,7 +846,7 @@ static int change_clocksource(void) | |||
714 | clock = new; | 846 | clock = new; |
715 | clock->cycle_last = now; | 847 | clock->cycle_last = now; |
716 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", |
717 | clock->name); | 849 | clock->name); |
718 | return 1; | 850 | return 1; |
719 | } else if (clock->update_callback) { | 851 | } else if (clock->update_callback) { |
720 | return clock->update_callback(); | 852 | return clock->update_callback(); |
@@ -722,7 +854,10 @@ static int change_clocksource(void) | |||
722 | return 0; | 854 | return 0; |
723 | } | 855 | } |
724 | #else | 856 | #else |
725 | #define change_clocksource() (0) | 857 | static inline int change_clocksource(void) |
858 | { | ||
859 | return 0; | ||
860 | } | ||
726 | #endif | 861 | #endif |
727 | 862 | ||
728 | /** | 863 | /** |
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); | |||
820 | * If the error is already larger, we look ahead even further | 955 | * If the error is already larger, we look ahead even further |
821 | * to compensate for late or lost adjustments. | 956 | * to compensate for late or lost adjustments. |
822 | */ | 957 | */ |
823 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | 958 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, |
959 | s64 *offset) | ||
824 | { | 960 | { |
825 | s64 tick_error, i; | 961 | s64 tick_error, i; |
826 | u32 look_ahead, adj; | 962 | u32 look_ahead, adj; |
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * | |||
844 | * Now calculate the error in (1 << look_ahead) ticks, but first | 980 | * Now calculate the error in (1 << look_ahead) ticks, but first |
845 | * remove the single look ahead already included in the error. | 981 | * remove the single look ahead already included in the error. |
846 | */ | 982 | */ |
847 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | 983 | tick_error = current_tick_length() >> |
984 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
848 | tick_error -= clock->xtime_interval >> 1; | 985 | tick_error -= clock->xtime_interval >> 1; |
849 | error = ((error - tick_error) >> look_ahead) + tick_error; | 986 | error = ((error - tick_error) >> look_ahead) + tick_error; |
850 | 987 | ||
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) | |||
896 | clock->mult += adj; | 1033 | clock->mult += adj; |
897 | clock->xtime_interval += interval; | 1034 | clock->xtime_interval += interval; |
898 | clock->xtime_nsec -= offset; | 1035 | clock->xtime_nsec -= offset; |
899 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | 1036 | clock->error -= (interval - offset) << |
1037 | (TICK_LENGTH_SHIFT - clock->shift); | ||
900 | } | 1038 | } |
901 | 1039 | ||
902 | /** | 1040 | /** |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index db443221ba5b..baacc3691415 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
36 | 36 | ||
37 | /* calculate task elapsed time in timespec */ | 37 | /* calculate task elapsed time in timespec */ |
38 | do_posix_clock_monotonic_gettime(&uptime); | 38 | do_posix_clock_monotonic_gettime(&uptime); |
39 | ts = timespec_sub(uptime, current->group_leader->start_time); | 39 | ts = timespec_sub(uptime, tsk->start_time); |
40 | /* rebase elapsed time to usec */ | 40 | /* rebase elapsed time to usec */ |
41 | ac_etime = timespec_to_ns(&ts); | 41 | ac_etime = timespec_to_ns(&ts); |
42 | do_div(ac_etime, NSEC_PER_USEC); | 42 | do_div(ac_etime, NSEC_PER_USEC); |
@@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
58 | stats->ac_uid = tsk->uid; | 58 | stats->ac_uid = tsk->uid; |
59 | stats->ac_gid = tsk->gid; | 59 | stats->ac_gid = tsk->gid; |
60 | stats->ac_pid = tsk->pid; | 60 | stats->ac_pid = tsk->pid; |
61 | stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; | 61 | rcu_read_lock(); |
62 | stats->ac_ppid = pid_alive(tsk) ? | ||
63 | rcu_dereference(tsk->real_parent)->tgid : 0; | ||
64 | rcu_read_unlock(); | ||
62 | stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; | 65 | stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; |
63 | stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; | 66 | stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; |
64 | stats->ac_minflt = tsk->min_flt; | 67 | stats->ac_minflt = tsk->min_flt; |
@@ -77,18 +80,31 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
77 | */ | 80 | */ |
78 | void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | 81 | void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) |
79 | { | 82 | { |
83 | struct mm_struct *mm; | ||
84 | |||
80 | /* convert pages-jiffies to Mbyte-usec */ | 85 | /* convert pages-jiffies to Mbyte-usec */ |
81 | stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; | 86 | stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; |
82 | stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; | 87 | stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; |
83 | if (p->mm) { | 88 | mm = get_task_mm(p); |
89 | if (mm) { | ||
84 | /* adjust to KB unit */ | 90 | /* adjust to KB unit */ |
85 | stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; | 91 | stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; |
86 | stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; | 92 | stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; |
93 | mmput(mm); | ||
87 | } | 94 | } |
88 | stats->read_char = p->rchar; | 95 | stats->read_char = p->rchar; |
89 | stats->write_char = p->wchar; | 96 | stats->write_char = p->wchar; |
90 | stats->read_syscalls = p->syscr; | 97 | stats->read_syscalls = p->syscr; |
91 | stats->write_syscalls = p->syscw; | 98 | stats->write_syscalls = p->syscw; |
99 | #ifdef CONFIG_TASK_IO_ACCOUNTING | ||
100 | stats->read_bytes = p->ioac.read_bytes; | ||
101 | stats->write_bytes = p->ioac.write_bytes; | ||
102 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | ||
103 | #else | ||
104 | stats->read_bytes = 0; | ||
105 | stats->write_bytes = 0; | ||
106 | stats->cancelled_write_bytes = 0; | ||
107 | #endif | ||
92 | } | 108 | } |
93 | #undef KB | 109 | #undef KB |
94 | #undef MB | 110 | #undef MB |
diff --git a/kernel/unwind.c b/kernel/unwind.c index 2e2368607aab..09c261329249 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c | |||
@@ -11,13 +11,16 @@ | |||
11 | 11 | ||
12 | #include <linux/unwind.h> | 12 | #include <linux/unwind.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/delay.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/sort.h> | ||
15 | #include <linux/stop_machine.h> | 16 | #include <linux/stop_machine.h> |
17 | #include <linux/uaccess.h> | ||
16 | #include <asm/sections.h> | 18 | #include <asm/sections.h> |
17 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
18 | #include <asm/unaligned.h> | 20 | #include <asm/unaligned.h> |
19 | 21 | ||
20 | extern char __start_unwind[], __end_unwind[]; | 22 | extern const char __start_unwind[], __end_unwind[]; |
23 | extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; | ||
21 | 24 | ||
22 | #define MAX_STACK_DEPTH 8 | 25 | #define MAX_STACK_DEPTH 8 |
23 | 26 | ||
@@ -92,6 +95,7 @@ static const struct { | |||
92 | 95 | ||
93 | typedef unsigned long uleb128_t; | 96 | typedef unsigned long uleb128_t; |
94 | typedef signed long sleb128_t; | 97 | typedef signed long sleb128_t; |
98 | #define sleb128abs __builtin_labs | ||
95 | 99 | ||
96 | static struct unwind_table { | 100 | static struct unwind_table { |
97 | struct { | 101 | struct { |
@@ -100,6 +104,8 @@ static struct unwind_table { | |||
100 | } core, init; | 104 | } core, init; |
101 | const void *address; | 105 | const void *address; |
102 | unsigned long size; | 106 | unsigned long size; |
107 | const unsigned char *header; | ||
108 | unsigned long hdrsz; | ||
103 | struct unwind_table *link; | 109 | struct unwind_table *link; |
104 | const char *name; | 110 | const char *name; |
105 | } root_table; | 111 | } root_table; |
@@ -131,6 +137,17 @@ struct unwind_state { | |||
131 | 137 | ||
132 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | 138 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; |
133 | 139 | ||
140 | static unsigned unwind_debug; | ||
141 | static int __init unwind_debug_setup(char *s) | ||
142 | { | ||
143 | unwind_debug = simple_strtoul(s, NULL, 0); | ||
144 | return 1; | ||
145 | } | ||
146 | __setup("unwind_debug=", unwind_debug_setup); | ||
147 | #define dprintk(lvl, fmt, args...) \ | ||
148 | ((void)(lvl > unwind_debug \ | ||
149 | || printk(KERN_DEBUG "unwind: " fmt "\n", ##args))) | ||
150 | |||
134 | static struct unwind_table *find_table(unsigned long pc) | 151 | static struct unwind_table *find_table(unsigned long pc) |
135 | { | 152 | { |
136 | struct unwind_table *table; | 153 | struct unwind_table *table; |
@@ -145,6 +162,12 @@ static struct unwind_table *find_table(unsigned long pc) | |||
145 | return table; | 162 | return table; |
146 | } | 163 | } |
147 | 164 | ||
165 | static unsigned long read_pointer(const u8 **pLoc, | ||
166 | const void *end, | ||
167 | signed ptrType, | ||
168 | unsigned long text_base, | ||
169 | unsigned long data_base); | ||
170 | |||
148 | static void init_unwind_table(struct unwind_table *table, | 171 | static void init_unwind_table(struct unwind_table *table, |
149 | const char *name, | 172 | const char *name, |
150 | const void *core_start, | 173 | const void *core_start, |
@@ -152,14 +175,33 @@ static void init_unwind_table(struct unwind_table *table, | |||
152 | const void *init_start, | 175 | const void *init_start, |
153 | unsigned long init_size, | 176 | unsigned long init_size, |
154 | const void *table_start, | 177 | const void *table_start, |
155 | unsigned long table_size) | 178 | unsigned long table_size, |
179 | const u8 *header_start, | ||
180 | unsigned long header_size) | ||
156 | { | 181 | { |
182 | const u8 *ptr = header_start + 4; | ||
183 | const u8 *end = header_start + header_size; | ||
184 | |||
157 | table->core.pc = (unsigned long)core_start; | 185 | table->core.pc = (unsigned long)core_start; |
158 | table->core.range = core_size; | 186 | table->core.range = core_size; |
159 | table->init.pc = (unsigned long)init_start; | 187 | table->init.pc = (unsigned long)init_start; |
160 | table->init.range = init_size; | 188 | table->init.range = init_size; |
161 | table->address = table_start; | 189 | table->address = table_start; |
162 | table->size = table_size; | 190 | table->size = table_size; |
191 | /* See if the linker provided table looks valid. */ | ||
192 | if (header_size <= 4 | ||
193 | || header_start[0] != 1 | ||
194 | || (void *)read_pointer(&ptr, end, header_start[1], 0, 0) | ||
195 | != table_start | ||
196 | || !read_pointer(&ptr, end, header_start[2], 0, 0) | ||
197 | || !read_pointer(&ptr, end, header_start[3], 0, | ||
198 | (unsigned long)header_start) | ||
199 | || !read_pointer(&ptr, end, header_start[3], 0, | ||
200 | (unsigned long)header_start)) | ||
201 | header_start = NULL; | ||
202 | table->hdrsz = header_size; | ||
203 | smp_wmb(); | ||
204 | table->header = header_start; | ||
163 | table->link = NULL; | 205 | table->link = NULL; |
164 | table->name = name; | 206 | table->name = name; |
165 | } | 207 | } |
@@ -169,7 +211,144 @@ void __init unwind_init(void) | |||
169 | init_unwind_table(&root_table, "kernel", | 211 | init_unwind_table(&root_table, "kernel", |
170 | _text, _end - _text, | 212 | _text, _end - _text, |
171 | NULL, 0, | 213 | NULL, 0, |
172 | __start_unwind, __end_unwind - __start_unwind); | 214 | __start_unwind, __end_unwind - __start_unwind, |
215 | __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); | ||
216 | } | ||
217 | |||
218 | static const u32 bad_cie, not_fde; | ||
219 | static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); | ||
220 | static signed fde_pointer_type(const u32 *cie); | ||
221 | |||
222 | struct eh_frame_hdr_table_entry { | ||
223 | unsigned long start, fde; | ||
224 | }; | ||
225 | |||
226 | static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) | ||
227 | { | ||
228 | const struct eh_frame_hdr_table_entry *e1 = p1; | ||
229 | const struct eh_frame_hdr_table_entry *e2 = p2; | ||
230 | |||
231 | return (e1->start > e2->start) - (e1->start < e2->start); | ||
232 | } | ||
233 | |||
234 | static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) | ||
235 | { | ||
236 | struct eh_frame_hdr_table_entry *e1 = p1; | ||
237 | struct eh_frame_hdr_table_entry *e2 = p2; | ||
238 | unsigned long v; | ||
239 | |||
240 | v = e1->start; | ||
241 | e1->start = e2->start; | ||
242 | e2->start = v; | ||
243 | v = e1->fde; | ||
244 | e1->fde = e2->fde; | ||
245 | e2->fde = v; | ||
246 | } | ||
247 | |||
248 | static void __init setup_unwind_table(struct unwind_table *table, | ||
249 | void *(*alloc)(unsigned long)) | ||
250 | { | ||
251 | const u8 *ptr; | ||
252 | unsigned long tableSize = table->size, hdrSize; | ||
253 | unsigned n; | ||
254 | const u32 *fde; | ||
255 | struct { | ||
256 | u8 version; | ||
257 | u8 eh_frame_ptr_enc; | ||
258 | u8 fde_count_enc; | ||
259 | u8 table_enc; | ||
260 | unsigned long eh_frame_ptr; | ||
261 | unsigned int fde_count; | ||
262 | struct eh_frame_hdr_table_entry table[]; | ||
263 | } __attribute__((__packed__)) *header; | ||
264 | |||
265 | if (table->header) | ||
266 | return; | ||
267 | |||
268 | if (table->hdrsz) | ||
269 | printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", | ||
270 | table->name); | ||
271 | |||
272 | if (tableSize & (sizeof(*fde) - 1)) | ||
273 | return; | ||
274 | |||
275 | for (fde = table->address, n = 0; | ||
276 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
277 | tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { | ||
278 | const u32 *cie = cie_for_fde(fde, table); | ||
279 | signed ptrType; | ||
280 | |||
281 | if (cie == &not_fde) | ||
282 | continue; | ||
283 | if (cie == NULL | ||
284 | || cie == &bad_cie | ||
285 | || (ptrType = fde_pointer_type(cie)) < 0) | ||
286 | return; | ||
287 | ptr = (const u8 *)(fde + 2); | ||
288 | if (!read_pointer(&ptr, | ||
289 | (const u8 *)(fde + 1) + *fde, | ||
290 | ptrType, 0, 0)) | ||
291 | return; | ||
292 | ++n; | ||
293 | } | ||
294 | |||
295 | if (tableSize || !n) | ||
296 | return; | ||
297 | |||
298 | hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) | ||
299 | + 2 * n * sizeof(unsigned long); | ||
300 | dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize); | ||
301 | header = alloc(hdrSize); | ||
302 | if (!header) | ||
303 | return; | ||
304 | header->version = 1; | ||
305 | header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; | ||
306 | header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; | ||
307 | header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; | ||
308 | put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); | ||
309 | BUILD_BUG_ON(offsetof(typeof(*header), fde_count) | ||
310 | % __alignof(typeof(header->fde_count))); | ||
311 | header->fde_count = n; | ||
312 | |||
313 | BUILD_BUG_ON(offsetof(typeof(*header), table) | ||
314 | % __alignof(typeof(*header->table))); | ||
315 | for (fde = table->address, tableSize = table->size, n = 0; | ||
316 | tableSize; | ||
317 | tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { | ||
318 | const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); | ||
319 | |||
320 | if (!fde[1]) | ||
321 | continue; /* this is a CIE */ | ||
322 | ptr = (const u8 *)(fde + 2); | ||
323 | header->table[n].start = read_pointer(&ptr, | ||
324 | (const u8 *)(fde + 1) + *fde, | ||
325 | fde_pointer_type(cie), 0, 0); | ||
326 | header->table[n].fde = (unsigned long)fde; | ||
327 | ++n; | ||
328 | } | ||
329 | WARN_ON(n != header->fde_count); | ||
330 | |||
331 | sort(header->table, | ||
332 | n, | ||
333 | sizeof(*header->table), | ||
334 | cmp_eh_frame_hdr_table_entries, | ||
335 | swap_eh_frame_hdr_table_entries); | ||
336 | |||
337 | table->hdrsz = hdrSize; | ||
338 | smp_wmb(); | ||
339 | table->header = (const void *)header; | ||
340 | } | ||
341 | |||
342 | static void *__init balloc(unsigned long sz) | ||
343 | { | ||
344 | return __alloc_bootmem_nopanic(sz, | ||
345 | sizeof(unsigned int), | ||
346 | __pa(MAX_DMA_ADDRESS)); | ||
347 | } | ||
348 | |||
349 | void __init unwind_setup(void) | ||
350 | { | ||
351 | setup_unwind_table(&root_table, balloc); | ||
173 | } | 352 | } |
174 | 353 | ||
175 | #ifdef CONFIG_MODULES | 354 | #ifdef CONFIG_MODULES |
@@ -193,7 +372,8 @@ void *unwind_add_table(struct module *module, | |||
193 | init_unwind_table(table, module->name, | 372 | init_unwind_table(table, module->name, |
194 | module->module_core, module->core_size, | 373 | module->module_core, module->core_size, |
195 | module->module_init, module->init_size, | 374 | module->module_init, module->init_size, |
196 | table_start, table_size); | 375 | table_start, table_size, |
376 | NULL, 0); | ||
197 | 377 | ||
198 | if (last_table) | 378 | if (last_table) |
199 | last_table->link = table; | 379 | last_table->link = table; |
@@ -303,9 +483,31 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | |||
303 | return value; | 483 | return value; |
304 | } | 484 | } |
305 | 485 | ||
486 | static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) | ||
487 | { | ||
488 | const u32 *cie; | ||
489 | |||
490 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
491 | return &bad_cie; | ||
492 | if (!fde[1]) | ||
493 | return &not_fde; /* this is a CIE */ | ||
494 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
495 | || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) | ||
496 | return NULL; /* this is not a valid FDE */ | ||
497 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
498 | if (*cie <= sizeof(*cie) + 4 | ||
499 | || *cie >= fde[1] - sizeof(*fde) | ||
500 | || (*cie & (sizeof(*cie) - 1)) | ||
501 | || cie[1]) | ||
502 | return NULL; /* this is not a (valid) CIE */ | ||
503 | return cie; | ||
504 | } | ||
505 | |||
306 | static unsigned long read_pointer(const u8 **pLoc, | 506 | static unsigned long read_pointer(const u8 **pLoc, |
307 | const void *end, | 507 | const void *end, |
308 | signed ptrType) | 508 | signed ptrType, |
509 | unsigned long text_base, | ||
510 | unsigned long data_base) | ||
309 | { | 511 | { |
310 | unsigned long value = 0; | 512 | unsigned long value = 0; |
311 | union { | 513 | union { |
@@ -317,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc, | |||
317 | const unsigned long *pul; | 519 | const unsigned long *pul; |
318 | } ptr; | 520 | } ptr; |
319 | 521 | ||
320 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | 522 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) { |
523 | dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end); | ||
321 | return 0; | 524 | return 0; |
525 | } | ||
322 | ptr.p8 = *pLoc; | 526 | ptr.p8 = *pLoc; |
323 | switch(ptrType & DW_EH_PE_FORM) { | 527 | switch(ptrType & DW_EH_PE_FORM) { |
324 | case DW_EH_PE_data2: | 528 | case DW_EH_PE_data2: |
325 | if (end < (const void *)(ptr.p16u + 1)) | 529 | if (end < (const void *)(ptr.p16u + 1)) { |
530 | dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end); | ||
326 | return 0; | 531 | return 0; |
532 | } | ||
327 | if(ptrType & DW_EH_PE_signed) | 533 | if(ptrType & DW_EH_PE_signed) |
328 | value = get_unaligned(ptr.p16s++); | 534 | value = get_unaligned(ptr.p16s++); |
329 | else | 535 | else |
@@ -331,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc, | |||
331 | break; | 537 | break; |
332 | case DW_EH_PE_data4: | 538 | case DW_EH_PE_data4: |
333 | #ifdef CONFIG_64BIT | 539 | #ifdef CONFIG_64BIT |
334 | if (end < (const void *)(ptr.p32u + 1)) | 540 | if (end < (const void *)(ptr.p32u + 1)) { |
541 | dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end); | ||
335 | return 0; | 542 | return 0; |
543 | } | ||
336 | if(ptrType & DW_EH_PE_signed) | 544 | if(ptrType & DW_EH_PE_signed) |
337 | value = get_unaligned(ptr.p32s++); | 545 | value = get_unaligned(ptr.p32s++); |
338 | else | 546 | else |
@@ -344,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc, | |||
344 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | 552 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); |
345 | #endif | 553 | #endif |
346 | case DW_EH_PE_native: | 554 | case DW_EH_PE_native: |
347 | if (end < (const void *)(ptr.pul + 1)) | 555 | if (end < (const void *)(ptr.pul + 1)) { |
556 | dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end); | ||
348 | return 0; | 557 | return 0; |
558 | } | ||
349 | value = get_unaligned(ptr.pul++); | 559 | value = get_unaligned(ptr.pul++); |
350 | break; | 560 | break; |
351 | case DW_EH_PE_leb128: | 561 | case DW_EH_PE_leb128: |
@@ -353,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc, | |||
353 | value = ptrType & DW_EH_PE_signed | 563 | value = ptrType & DW_EH_PE_signed |
354 | ? get_sleb128(&ptr.p8, end) | 564 | ? get_sleb128(&ptr.p8, end) |
355 | : get_uleb128(&ptr.p8, end); | 565 | : get_uleb128(&ptr.p8, end); |
356 | if ((const void *)ptr.p8 > end) | 566 | if ((const void *)ptr.p8 > end) { |
567 | dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end); | ||
357 | return 0; | 568 | return 0; |
569 | } | ||
358 | break; | 570 | break; |
359 | default: | 571 | default: |
572 | dprintk(2, "Cannot decode pointer type %02X (%p,%p).", | ||
573 | ptrType, ptr.p8, end); | ||
360 | return 0; | 574 | return 0; |
361 | } | 575 | } |
362 | switch(ptrType & DW_EH_PE_ADJUST) { | 576 | switch(ptrType & DW_EH_PE_ADJUST) { |
@@ -365,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc, | |||
365 | case DW_EH_PE_pcrel: | 579 | case DW_EH_PE_pcrel: |
366 | value += (unsigned long)*pLoc; | 580 | value += (unsigned long)*pLoc; |
367 | break; | 581 | break; |
582 | case DW_EH_PE_textrel: | ||
583 | if (likely(text_base)) { | ||
584 | value += text_base; | ||
585 | break; | ||
586 | } | ||
587 | dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.", | ||
588 | ptrType, *pLoc, end); | ||
589 | return 0; | ||
590 | case DW_EH_PE_datarel: | ||
591 | if (likely(data_base)) { | ||
592 | value += data_base; | ||
593 | break; | ||
594 | } | ||
595 | dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.", | ||
596 | ptrType, *pLoc, end); | ||
597 | return 0; | ||
368 | default: | 598 | default: |
599 | dprintk(2, "Cannot adjust pointer type %02X (%p,%p).", | ||
600 | ptrType, *pLoc, end); | ||
369 | return 0; | 601 | return 0; |
370 | } | 602 | } |
371 | if ((ptrType & DW_EH_PE_indirect) | 603 | if ((ptrType & DW_EH_PE_indirect) |
372 | && __get_user(value, (unsigned long *)value)) | 604 | && probe_kernel_address((unsigned long *)value, value)) { |
605 | dprintk(1, "Cannot read indirect value %lx (%p,%p).", | ||
606 | value, *pLoc, end); | ||
373 | return 0; | 607 | return 0; |
608 | } | ||
374 | *pLoc = ptr.p8; | 609 | *pLoc = ptr.p8; |
375 | 610 | ||
376 | return value; | 611 | return value; |
@@ -413,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie) | |||
413 | case 'P': { | 648 | case 'P': { |
414 | signed ptrType = *ptr++; | 649 | signed ptrType = *ptr++; |
415 | 650 | ||
416 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | 651 | if (!read_pointer(&ptr, end, ptrType, 0, 0) |
652 | || ptr > end) | ||
417 | return -1; | 653 | return -1; |
418 | } | 654 | } |
419 | break; | 655 | break; |
@@ -473,7 +709,8 @@ static int processCFI(const u8 *start, | |||
473 | case DW_CFA_nop: | 709 | case DW_CFA_nop: |
474 | break; | 710 | break; |
475 | case DW_CFA_set_loc: | 711 | case DW_CFA_set_loc: |
476 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | 712 | state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0); |
713 | if (state->loc == 0) | ||
477 | result = 0; | 714 | result = 0; |
478 | break; | 715 | break; |
479 | case DW_CFA_advance_loc1: | 716 | case DW_CFA_advance_loc1: |
@@ -519,8 +756,10 @@ static int processCFI(const u8 *start, | |||
519 | state->label = NULL; | 756 | state->label = NULL; |
520 | return 1; | 757 | return 1; |
521 | } | 758 | } |
522 | if (state->stackDepth >= MAX_STACK_DEPTH) | 759 | if (state->stackDepth >= MAX_STACK_DEPTH) { |
760 | dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end); | ||
523 | return 0; | 761 | return 0; |
762 | } | ||
524 | state->stack[state->stackDepth++] = ptr.p8; | 763 | state->stack[state->stackDepth++] = ptr.p8; |
525 | break; | 764 | break; |
526 | case DW_CFA_restore_state: | 765 | case DW_CFA_restore_state: |
@@ -535,8 +774,10 @@ static int processCFI(const u8 *start, | |||
535 | result = processCFI(start, end, 0, ptrType, state); | 774 | result = processCFI(start, end, 0, ptrType, state); |
536 | state->loc = loc; | 775 | state->loc = loc; |
537 | state->label = label; | 776 | state->label = label; |
538 | } else | 777 | } else { |
778 | dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end); | ||
539 | return 0; | 779 | return 0; |
780 | } | ||
540 | break; | 781 | break; |
541 | case DW_CFA_def_cfa: | 782 | case DW_CFA_def_cfa: |
542 | state->cfa.reg = get_uleb128(&ptr.p8, end); | 783 | state->cfa.reg = get_uleb128(&ptr.p8, end); |
@@ -568,6 +809,7 @@ static int processCFI(const u8 *start, | |||
568 | break; | 809 | break; |
569 | case DW_CFA_GNU_window_save: | 810 | case DW_CFA_GNU_window_save: |
570 | default: | 811 | default: |
812 | dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end); | ||
571 | result = 0; | 813 | result = 0; |
572 | break; | 814 | break; |
573 | } | 815 | } |
@@ -583,12 +825,17 @@ static int processCFI(const u8 *start, | |||
583 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | 825 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); |
584 | break; | 826 | break; |
585 | } | 827 | } |
586 | if (ptr.p8 > end) | 828 | if (ptr.p8 > end) { |
829 | dprintk(1, "Data overrun (%p,%p).", ptr.p8, end); | ||
587 | result = 0; | 830 | result = 0; |
831 | } | ||
588 | if (result && targetLoc != 0 && targetLoc < state->loc) | 832 | if (result && targetLoc != 0 && targetLoc < state->loc) |
589 | return 1; | 833 | return 1; |
590 | } | 834 | } |
591 | 835 | ||
836 | if (result && ptr.p8 < end) | ||
837 | dprintk(1, "Data underrun (%p,%p).", ptr.p8, end); | ||
838 | |||
592 | return result | 839 | return result |
593 | && ptr.p8 == end | 840 | && ptr.p8 == end |
594 | && (targetLoc == 0 | 841 | && (targetLoc == 0 |
@@ -605,54 +852,122 @@ int unwind(struct unwind_frame_info *frame) | |||
605 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | 852 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) |
606 | const u32 *fde = NULL, *cie = NULL; | 853 | const u32 *fde = NULL, *cie = NULL; |
607 | const u8 *ptr = NULL, *end = NULL; | 854 | const u8 *ptr = NULL, *end = NULL; |
608 | unsigned long pc = UNW_PC(frame) - frame->call_frame; | 855 | unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; |
609 | unsigned long startLoc = 0, endLoc = 0, cfa; | 856 | unsigned long startLoc = 0, endLoc = 0, cfa; |
610 | unsigned i; | 857 | unsigned i; |
611 | signed ptrType = -1; | 858 | signed ptrType = -1; |
612 | uleb128_t retAddrReg = 0; | 859 | uleb128_t retAddrReg = 0; |
613 | struct unwind_table *table; | 860 | const struct unwind_table *table; |
614 | struct unwind_state state; | 861 | struct unwind_state state; |
615 | 862 | ||
616 | if (UNW_PC(frame) == 0) | 863 | if (UNW_PC(frame) == 0) |
617 | return -EINVAL; | 864 | return -EINVAL; |
618 | if ((table = find_table(pc)) != NULL | 865 | if ((table = find_table(pc)) != NULL |
619 | && !(table->size & (sizeof(*fde) - 1))) { | 866 | && !(table->size & (sizeof(*fde) - 1))) { |
620 | unsigned long tableSize = table->size; | 867 | const u8 *hdr = table->header; |
621 | 868 | unsigned long tableSize; | |
622 | for (fde = table->address; | 869 | |
623 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | 870 | smp_rmb(); |
624 | tableSize -= sizeof(*fde) + *fde, | 871 | if (hdr && hdr[0] == 1) { |
625 | fde += 1 + *fde / sizeof(*fde)) { | 872 | switch(hdr[3] & DW_EH_PE_FORM) { |
626 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | 873 | case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; |
627 | break; | 874 | case DW_EH_PE_data2: tableSize = 2; break; |
628 | if (!fde[1]) | 875 | case DW_EH_PE_data4: tableSize = 4; break; |
629 | continue; /* this is a CIE */ | 876 | case DW_EH_PE_data8: tableSize = 8; break; |
630 | if ((fde[1] & (sizeof(*fde) - 1)) | 877 | default: tableSize = 0; break; |
631 | || fde[1] > (unsigned long)(fde + 1) | 878 | } |
632 | - (unsigned long)table->address) | 879 | ptr = hdr + 4; |
633 | continue; /* this is not a valid FDE */ | 880 | end = hdr + table->hdrsz; |
634 | cie = fde + 1 - fde[1] / sizeof(*fde); | 881 | if (tableSize |
635 | if (*cie <= sizeof(*cie) + 4 | 882 | && read_pointer(&ptr, end, hdr[1], 0, 0) |
636 | || *cie >= fde[1] - sizeof(*fde) | 883 | == (unsigned long)table->address |
637 | || (*cie & (sizeof(*cie) - 1)) | 884 | && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0 |
638 | || cie[1] | 885 | && i == (end - ptr) / (2 * tableSize) |
639 | || (ptrType = fde_pointer_type(cie)) < 0) { | 886 | && !((end - ptr) % (2 * tableSize))) { |
640 | cie = NULL; /* this is not a (valid) CIE */ | 887 | do { |
641 | continue; | 888 | const u8 *cur = ptr + (i / 2) * (2 * tableSize); |
889 | |||
890 | startLoc = read_pointer(&cur, | ||
891 | cur + tableSize, | ||
892 | hdr[3], 0, | ||
893 | (unsigned long)hdr); | ||
894 | if (pc < startLoc) | ||
895 | i /= 2; | ||
896 | else { | ||
897 | ptr = cur - tableSize; | ||
898 | i = (i + 1) / 2; | ||
899 | } | ||
900 | } while (startLoc && i > 1); | ||
901 | if (i == 1 | ||
902 | && (startLoc = read_pointer(&ptr, | ||
903 | ptr + tableSize, | ||
904 | hdr[3], 0, | ||
905 | (unsigned long)hdr)) != 0 | ||
906 | && pc >= startLoc) | ||
907 | fde = (void *)read_pointer(&ptr, | ||
908 | ptr + tableSize, | ||
909 | hdr[3], 0, | ||
910 | (unsigned long)hdr); | ||
642 | } | 911 | } |
912 | } | ||
913 | if(hdr && !fde) | ||
914 | dprintk(3, "Binary lookup for %lx failed.", pc); | ||
915 | |||
916 | if (fde != NULL) { | ||
917 | cie = cie_for_fde(fde, table); | ||
643 | ptr = (const u8 *)(fde + 2); | 918 | ptr = (const u8 *)(fde + 2); |
644 | startLoc = read_pointer(&ptr, | 919 | if(cie != NULL |
645 | (const u8 *)(fde + 1) + *fde, | 920 | && cie != &bad_cie |
646 | ptrType); | 921 | && cie != ¬_fde |
647 | endLoc = startLoc | 922 | && (ptrType = fde_pointer_type(cie)) >= 0 |
648 | + read_pointer(&ptr, | 923 | && read_pointer(&ptr, |
649 | (const u8 *)(fde + 1) + *fde, | 924 | (const u8 *)(fde + 1) + *fde, |
650 | ptrType & DW_EH_PE_indirect | 925 | ptrType, 0, 0) == startLoc) { |
651 | ? ptrType | 926 | if (!(ptrType & DW_EH_PE_indirect)) |
652 | : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); | 927 | ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; |
653 | if (pc >= startLoc && pc < endLoc) | 928 | endLoc = startLoc |
654 | break; | 929 | + read_pointer(&ptr, |
655 | cie = NULL; | 930 | (const u8 *)(fde + 1) + *fde, |
931 | ptrType, 0, 0); | ||
932 | if(pc >= endLoc) | ||
933 | fde = NULL; | ||
934 | } else | ||
935 | fde = NULL; | ||
936 | if(!fde) | ||
937 | dprintk(1, "Binary lookup result for %lx discarded.", pc); | ||
938 | } | ||
939 | if (fde == NULL) { | ||
940 | for (fde = table->address, tableSize = table->size; | ||
941 | cie = NULL, tableSize > sizeof(*fde) | ||
942 | && tableSize - sizeof(*fde) >= *fde; | ||
943 | tableSize -= sizeof(*fde) + *fde, | ||
944 | fde += 1 + *fde / sizeof(*fde)) { | ||
945 | cie = cie_for_fde(fde, table); | ||
946 | if (cie == &bad_cie) { | ||
947 | cie = NULL; | ||
948 | break; | ||
949 | } | ||
950 | if (cie == NULL | ||
951 | || cie == &not_fde | ||
952 | || (ptrType = fde_pointer_type(cie)) < 0) | ||
953 | continue; | ||
954 | ptr = (const u8 *)(fde + 2); | ||
955 | startLoc = read_pointer(&ptr, | ||
956 | (const u8 *)(fde + 1) + *fde, | ||
957 | ptrType, 0, 0); | ||
958 | if (!startLoc) | ||
959 | continue; | ||
960 | if (!(ptrType & DW_EH_PE_indirect)) | ||
961 | ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; | ||
962 | endLoc = startLoc | ||
963 | + read_pointer(&ptr, | ||
964 | (const u8 *)(fde + 1) + *fde, | ||
965 | ptrType, 0, 0); | ||
966 | if (pc >= startLoc && pc < endLoc) | ||
967 | break; | ||
968 | } | ||
969 | if(!fde) | ||
970 | dprintk(3, "Linear lookup for %lx failed.", pc); | ||
656 | } | 971 | } |
657 | } | 972 | } |
658 | if (cie != NULL) { | 973 | if (cie != NULL) { |
@@ -686,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame) | |||
686 | if (ptr >= end || *ptr) | 1001 | if (ptr >= end || *ptr) |
687 | cie = NULL; | 1002 | cie = NULL; |
688 | } | 1003 | } |
1004 | if(!cie) | ||
1005 | dprintk(1, "CIE unusable (%p,%p).", ptr, end); | ||
689 | ++ptr; | 1006 | ++ptr; |
690 | } | 1007 | } |
691 | if (cie != NULL) { | 1008 | if (cie != NULL) { |
@@ -695,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame) | |||
695 | state.dataAlign = get_sleb128(&ptr, end); | 1012 | state.dataAlign = get_sleb128(&ptr, end); |
696 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | 1013 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) |
697 | cie = NULL; | 1014 | cie = NULL; |
698 | else { | 1015 | else if (UNW_PC(frame) % state.codeAlign |
1016 | || UNW_SP(frame) % sleb128abs(state.dataAlign)) { | ||
1017 | dprintk(1, "Input pointer(s) misaligned (%lx,%lx).", | ||
1018 | UNW_PC(frame), UNW_SP(frame)); | ||
1019 | return -EPERM; | ||
1020 | } else { | ||
699 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | 1021 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); |
700 | /* skip augmentation */ | 1022 | /* skip augmentation */ |
701 | if (((const char *)(cie + 2))[1] == 'z') | 1023 | if (((const char *)(cie + 2))[1] == 'z') { |
702 | ptr += get_uleb128(&ptr, end); | 1024 | uleb128_t augSize = get_uleb128(&ptr, end); |
1025 | |||
1026 | ptr += augSize; | ||
1027 | } | ||
703 | if (ptr > end | 1028 | if (ptr > end |
704 | || retAddrReg >= ARRAY_SIZE(reg_info) | 1029 | || retAddrReg >= ARRAY_SIZE(reg_info) |
705 | || REG_INVALID(retAddrReg) | 1030 | || REG_INVALID(retAddrReg) |
706 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | 1031 | || reg_info[retAddrReg].width != sizeof(unsigned long)) |
707 | cie = NULL; | 1032 | cie = NULL; |
708 | } | 1033 | } |
1034 | if(!cie) | ||
1035 | dprintk(1, "CIE validation failed (%p,%p).", ptr, end); | ||
709 | } | 1036 | } |
710 | if (cie != NULL) { | 1037 | if (cie != NULL) { |
711 | state.cieStart = ptr; | 1038 | state.cieStart = ptr; |
@@ -719,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame) | |||
719 | if ((ptr += augSize) > end) | 1046 | if ((ptr += augSize) > end) |
720 | fde = NULL; | 1047 | fde = NULL; |
721 | } | 1048 | } |
1049 | if(!fde) | ||
1050 | dprintk(1, "FDE validation failed (%p,%p).", ptr, end); | ||
722 | } | 1051 | } |
723 | if (cie == NULL || fde == NULL) { | 1052 | if (cie == NULL || fde == NULL) { |
724 | #ifdef CONFIG_FRAME_POINTER | 1053 | #ifdef CONFIG_FRAME_POINTER |
725 | unsigned long top, bottom; | 1054 | unsigned long top, bottom; |
726 | #endif | ||
727 | 1055 | ||
728 | #ifdef CONFIG_FRAME_POINTER | 1056 | if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) |
1057 | return -EPERM; | ||
729 | top = STACK_TOP(frame->task); | 1058 | top = STACK_TOP(frame->task); |
730 | bottom = STACK_BOTTOM(frame->task); | 1059 | bottom = STACK_BOTTOM(frame->task); |
731 | # if FRAME_RETADDR_OFFSET < 0 | 1060 | # if FRAME_RETADDR_OFFSET < 0 |
@@ -741,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame) | |||
741 | & (sizeof(unsigned long) - 1))) { | 1070 | & (sizeof(unsigned long) - 1))) { |
742 | unsigned long link; | 1071 | unsigned long link; |
743 | 1072 | ||
744 | if (!__get_user(link, | 1073 | if (!probe_kernel_address( |
745 | (unsigned long *)(UNW_FP(frame) | 1074 | (unsigned long *)(UNW_FP(frame) |
746 | + FRAME_LINK_OFFSET)) | 1075 | + FRAME_LINK_OFFSET), |
1076 | link) | ||
747 | # if FRAME_RETADDR_OFFSET < 0 | 1077 | # if FRAME_RETADDR_OFFSET < 0 |
748 | && link > bottom && link < UNW_FP(frame) | 1078 | && link > bottom && link < UNW_FP(frame) |
749 | # else | 1079 | # else |
750 | && link > UNW_FP(frame) && link < bottom | 1080 | && link > UNW_FP(frame) && link < bottom |
751 | # endif | 1081 | # endif |
752 | && !(link & (sizeof(link) - 1)) | 1082 | && !(link & (sizeof(link) - 1)) |
753 | && !__get_user(UNW_PC(frame), | 1083 | && !probe_kernel_address( |
754 | (unsigned long *)(UNW_FP(frame) | 1084 | (unsigned long *)(UNW_FP(frame) |
755 | + FRAME_RETADDR_OFFSET))) { | 1085 | + FRAME_RETADDR_OFFSET), UNW_PC(frame))) { |
756 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | 1086 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET |
757 | # if FRAME_RETADDR_OFFSET < 0 | 1087 | # if FRAME_RETADDR_OFFSET < 0 |
758 | - | 1088 | - |
@@ -775,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame) | |||
775 | || state.regs[retAddrReg].where == Nowhere | 1105 | || state.regs[retAddrReg].where == Nowhere |
776 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | 1106 | || state.cfa.reg >= ARRAY_SIZE(reg_info) |
777 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | 1107 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) |
778 | || state.cfa.offs % sizeof(unsigned long)) | 1108 | || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) |
1109 | || state.cfa.offs % sizeof(unsigned long)) { | ||
1110 | dprintk(1, "Unusable unwind info (%p,%p).", ptr, end); | ||
779 | return -EIO; | 1111 | return -EIO; |
1112 | } | ||
780 | /* update frame */ | 1113 | /* update frame */ |
781 | #ifndef CONFIG_AS_CFI_SIGNAL_FRAME | 1114 | #ifndef CONFIG_AS_CFI_SIGNAL_FRAME |
782 | if(frame->call_frame | 1115 | if(frame->call_frame |
@@ -795,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame) | |||
795 | #else | 1128 | #else |
796 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | 1129 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) |
797 | #endif | 1130 | #endif |
1131 | pc = UNW_PC(frame); | ||
1132 | sp = UNW_SP(frame); | ||
798 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | 1133 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { |
799 | if (REG_INVALID(i)) { | 1134 | if (REG_INVALID(i)) { |
800 | if (state.regs[i].where == Nowhere) | 1135 | if (state.regs[i].where == Nowhere) |
801 | continue; | 1136 | continue; |
1137 | dprintk(1, "Cannot restore register %u (%d).", | ||
1138 | i, state.regs[i].where); | ||
802 | return -EIO; | 1139 | return -EIO; |
803 | } | 1140 | } |
804 | switch(state.regs[i].where) { | 1141 | switch(state.regs[i].where) { |
@@ -807,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame) | |||
807 | case Register: | 1144 | case Register: |
808 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | 1145 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) |
809 | || REG_INVALID(state.regs[i].value) | 1146 | || REG_INVALID(state.regs[i].value) |
810 | || reg_info[i].width > reg_info[state.regs[i].value].width) | 1147 | || reg_info[i].width > reg_info[state.regs[i].value].width) { |
1148 | dprintk(1, "Cannot restore register %u from register %lu.", | ||
1149 | i, state.regs[i].value); | ||
811 | return -EIO; | 1150 | return -EIO; |
1151 | } | ||
812 | switch(reg_info[state.regs[i].value].width) { | 1152 | switch(reg_info[state.regs[i].value].width) { |
813 | #define CASE(n) \ | 1153 | #define CASE(n) \ |
814 | case sizeof(u##n): \ | 1154 | case sizeof(u##n): \ |
@@ -818,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame) | |||
818 | CASES; | 1158 | CASES; |
819 | #undef CASE | 1159 | #undef CASE |
820 | default: | 1160 | default: |
1161 | dprintk(1, "Unsupported register size %u (%lu).", | ||
1162 | reg_info[state.regs[i].value].width, | ||
1163 | state.regs[i].value); | ||
821 | return -EIO; | 1164 | return -EIO; |
822 | } | 1165 | } |
823 | break; | 1166 | break; |
@@ -842,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame) | |||
842 | CASES; | 1185 | CASES; |
843 | #undef CASE | 1186 | #undef CASE |
844 | default: | 1187 | default: |
1188 | dprintk(1, "Unsupported register size %u (%u).", | ||
1189 | reg_info[i].width, i); | ||
845 | return -EIO; | 1190 | return -EIO; |
846 | } | 1191 | } |
847 | break; | 1192 | break; |
848 | case Value: | 1193 | case Value: |
849 | if (reg_info[i].width != sizeof(unsigned long)) | 1194 | if (reg_info[i].width != sizeof(unsigned long)) { |
1195 | dprintk(1, "Unsupported value size %u (%u).", | ||
1196 | reg_info[i].width, i); | ||
850 | return -EIO; | 1197 | return -EIO; |
1198 | } | ||
851 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | 1199 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value |
852 | * state.dataAlign; | 1200 | * state.dataAlign; |
853 | break; | 1201 | break; |
@@ -859,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame) | |||
859 | % sizeof(unsigned long) | 1207 | % sizeof(unsigned long) |
860 | || addr < startLoc | 1208 | || addr < startLoc |
861 | || addr + sizeof(unsigned long) < addr | 1209 | || addr + sizeof(unsigned long) < addr |
862 | || addr + sizeof(unsigned long) > endLoc) | 1210 | || addr + sizeof(unsigned long) > endLoc) { |
1211 | dprintk(1, "Bad memory location %lx (%lx).", | ||
1212 | addr, state.regs[i].value); | ||
863 | return -EIO; | 1213 | return -EIO; |
1214 | } | ||
864 | switch(reg_info[i].width) { | 1215 | switch(reg_info[i].width) { |
865 | #define CASE(n) case sizeof(u##n): \ | 1216 | #define CASE(n) case sizeof(u##n): \ |
866 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | 1217 | probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \ |
867 | break | 1218 | break |
868 | CASES; | 1219 | CASES; |
869 | #undef CASE | 1220 | #undef CASE |
870 | default: | 1221 | default: |
1222 | dprintk(1, "Unsupported memory size %u (%u).", | ||
1223 | reg_info[i].width, i); | ||
871 | return -EIO; | 1224 | return -EIO; |
872 | } | 1225 | } |
873 | } | 1226 | } |
@@ -875,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame) | |||
875 | } | 1228 | } |
876 | } | 1229 | } |
877 | 1230 | ||
1231 | if (UNW_PC(frame) % state.codeAlign | ||
1232 | || UNW_SP(frame) % sleb128abs(state.dataAlign)) { | ||
1233 | dprintk(1, "Output pointer(s) misaligned (%lx,%lx).", | ||
1234 | UNW_PC(frame), UNW_SP(frame)); | ||
1235 | return -EIO; | ||
1236 | } | ||
1237 | if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) { | ||
1238 | dprintk(1, "No progress (%lx,%lx).", pc, sp); | ||
1239 | return -EIO; | ||
1240 | } | ||
1241 | |||
878 | return 0; | 1242 | return 0; |
879 | #undef CASES | 1243 | #undef CASES |
880 | #undef FRAME_REG | 1244 | #undef FRAME_REG |
diff --git a/kernel/user.c b/kernel/user.c index 6408c0424291..4869563080e9 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
27 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) | 27 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) |
28 | 28 | ||
29 | static kmem_cache_t *uid_cachep; | 29 | static struct kmem_cache *uid_cachep; |
30 | static struct list_head uidhash_table[UIDHASH_SZ]; | 30 | static struct list_head uidhash_table[UIDHASH_SZ]; |
31 | 31 | ||
32 | /* | 32 | /* |
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
132 | if (!up) { | 132 | if (!up) { |
133 | struct user_struct *new; | 133 | struct user_struct *new; |
134 | 134 | ||
135 | new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); | 135 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
136 | if (!new) | 136 | if (!new) |
137 | return NULL; | 137 | return NULL; |
138 | new->uid = uid; | 138 | new->uid = uid; |
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user) | |||
187 | atomic_dec(&old_user->processes); | 187 | atomic_dec(&old_user->processes); |
188 | switch_uid_keyring(new_user); | 188 | switch_uid_keyring(new_user); |
189 | current->user = new_user; | 189 | current->user = new_user; |
190 | |||
191 | /* | ||
192 | * We need to synchronize with __sigqueue_alloc() | ||
193 | * doing a get_uid(p->user).. If that saw the old | ||
194 | * user value, we need to wait until it has exited | ||
195 | * its critical region before we can free the old | ||
196 | * structure. | ||
197 | */ | ||
198 | smp_mb(); | ||
199 | spin_unlock_wait(&current->sighand->siglock); | ||
200 | |||
190 | free_uid(old_user); | 201 | free_uid(old_user); |
191 | suid_keys(current); | 202 | suid_keys(current); |
192 | } | 203 | } |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3df9bfc7ff78..db49886bfae1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -29,6 +29,9 @@ | |||
29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
30 | #include <linux/hardirq.h> | 30 | #include <linux/hardirq.h> |
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/freezer.h> | ||
33 | #include <linux/kallsyms.h> | ||
34 | #include <linux/debug_locks.h> | ||
32 | 35 | ||
33 | /* | 36 | /* |
34 | * The per-CPU workqueue (if single thread, we always use the first | 37 | * The per-CPU workqueue (if single thread, we always use the first |
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct { | |||
55 | struct task_struct *thread; | 58 | struct task_struct *thread; |
56 | 59 | ||
57 | int run_depth; /* Detect run_workqueue() recursion depth */ | 60 | int run_depth; /* Detect run_workqueue() recursion depth */ |
61 | |||
62 | int freezeable; /* Freeze the thread during suspend */ | ||
58 | } ____cacheline_aligned; | 63 | } ____cacheline_aligned; |
59 | 64 | ||
60 | /* | 65 | /* |
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq) | |||
80 | return list_empty(&wq->list); | 85 | return list_empty(&wq->list); |
81 | } | 86 | } |
82 | 87 | ||
88 | /* | ||
89 | * Set the workqueue on which a work item is to be run | ||
90 | * - Must *only* be called if the pending flag is set | ||
91 | */ | ||
92 | static inline void set_wq_data(struct work_struct *work, void *wq) | ||
93 | { | ||
94 | unsigned long new; | ||
95 | |||
96 | BUG_ON(!work_pending(work)); | ||
97 | |||
98 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); | ||
99 | new |= work->management & WORK_STRUCT_FLAG_MASK; | ||
100 | work->management = new; | ||
101 | } | ||
102 | |||
103 | static inline void *get_wq_data(struct work_struct *work) | ||
104 | { | ||
105 | return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK); | ||
106 | } | ||
107 | |||
108 | static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) | ||
109 | { | ||
110 | int ret = 0; | ||
111 | unsigned long flags; | ||
112 | |||
113 | spin_lock_irqsave(&cwq->lock, flags); | ||
114 | /* | ||
115 | * We need to re-validate the work info after we've gotten | ||
116 | * the cpu_workqueue lock. We can run the work now iff: | ||
117 | * | ||
118 | * - the wq_data still matches the cpu_workqueue_struct | ||
119 | * - AND the work is still marked pending | ||
120 | * - AND the work is still on a list (which will be this | ||
121 | * workqueue_struct list) | ||
122 | * | ||
123 | * All these conditions are important, because we | ||
124 | * need to protect against the work being run right | ||
125 | * now on another CPU (all but the last one might be | ||
126 | * true if it's currently running and has not been | ||
127 | * released yet, for example). | ||
128 | */ | ||
129 | if (get_wq_data(work) == cwq | ||
130 | && work_pending(work) | ||
131 | && !list_empty(&work->entry)) { | ||
132 | work_func_t f = work->func; | ||
133 | list_del_init(&work->entry); | ||
134 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
135 | |||
136 | if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) | ||
137 | work_release(work); | ||
138 | f(work); | ||
139 | |||
140 | spin_lock_irqsave(&cwq->lock, flags); | ||
141 | cwq->remove_sequence++; | ||
142 | wake_up(&cwq->work_done); | ||
143 | ret = 1; | ||
144 | } | ||
145 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
146 | return ret; | ||
147 | } | ||
148 | |||
149 | /** | ||
150 | * run_scheduled_work - run scheduled work synchronously | ||
151 | * @work: work to run | ||
152 | * | ||
153 | * This checks if the work was pending, and runs it | ||
154 | * synchronously if so. It returns a boolean to indicate | ||
155 | * whether it had any scheduled work to run or not. | ||
156 | * | ||
157 | * NOTE! This _only_ works for normal work_structs. You | ||
158 | * CANNOT use this for delayed work, because the wq data | ||
159 | * for delayed work will not point properly to the per- | ||
160 | * CPU workqueue struct, but will change! | ||
161 | */ | ||
162 | int fastcall run_scheduled_work(struct work_struct *work) | ||
163 | { | ||
164 | for (;;) { | ||
165 | struct cpu_workqueue_struct *cwq; | ||
166 | |||
167 | if (!work_pending(work)) | ||
168 | return 0; | ||
169 | if (list_empty(&work->entry)) | ||
170 | return 0; | ||
171 | /* NOTE! This depends intimately on __queue_work! */ | ||
172 | cwq = get_wq_data(work); | ||
173 | if (!cwq) | ||
174 | return 0; | ||
175 | if (__run_work(cwq, work)) | ||
176 | return 1; | ||
177 | } | ||
178 | } | ||
179 | EXPORT_SYMBOL(run_scheduled_work); | ||
180 | |||
83 | /* Preempt must be disabled. */ | 181 | /* Preempt must be disabled. */ |
84 | static void __queue_work(struct cpu_workqueue_struct *cwq, | 182 | static void __queue_work(struct cpu_workqueue_struct *cwq, |
85 | struct work_struct *work) | 183 | struct work_struct *work) |
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, | |||
87 | unsigned long flags; | 185 | unsigned long flags; |
88 | 186 | ||
89 | spin_lock_irqsave(&cwq->lock, flags); | 187 | spin_lock_irqsave(&cwq->lock, flags); |
90 | work->wq_data = cwq; | 188 | set_wq_data(work, cwq); |
91 | list_add_tail(&work->entry, &cwq->worklist); | 189 | list_add_tail(&work->entry, &cwq->worklist); |
92 | cwq->insert_sequence++; | 190 | cwq->insert_sequence++; |
93 | wake_up(&cwq->more_work); | 191 | wake_up(&cwq->more_work); |
@@ -99,7 +197,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, | |||
99 | * @wq: workqueue to use | 197 | * @wq: workqueue to use |
100 | * @work: work to queue | 198 | * @work: work to queue |
101 | * | 199 | * |
102 | * Returns non-zero if it was successfully added. | 200 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
103 | * | 201 | * |
104 | * We queue the work to the CPU it was submitted, but there is no | 202 | * We queue the work to the CPU it was submitted, but there is no |
105 | * guarantee that it will be processed by that CPU. | 203 | * guarantee that it will be processed by that CPU. |
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
108 | { | 206 | { |
109 | int ret = 0, cpu = get_cpu(); | 207 | int ret = 0, cpu = get_cpu(); |
110 | 208 | ||
111 | if (!test_and_set_bit(0, &work->pending)) { | 209 | if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { |
112 | if (unlikely(is_single_threaded(wq))) | 210 | if (unlikely(is_single_threaded(wq))) |
113 | cpu = singlethread_cpu; | 211 | cpu = singlethread_cpu; |
114 | BUG_ON(!list_empty(&work->entry)); | 212 | BUG_ON(!list_empty(&work->entry)); |
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work); | |||
122 | 220 | ||
123 | static void delayed_work_timer_fn(unsigned long __data) | 221 | static void delayed_work_timer_fn(unsigned long __data) |
124 | { | 222 | { |
125 | struct work_struct *work = (struct work_struct *)__data; | 223 | struct delayed_work *dwork = (struct delayed_work *)__data; |
126 | struct workqueue_struct *wq = work->wq_data; | 224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); |
127 | int cpu = smp_processor_id(); | 225 | int cpu = smp_processor_id(); |
128 | 226 | ||
129 | if (unlikely(is_single_threaded(wq))) | 227 | if (unlikely(is_single_threaded(wq))) |
130 | cpu = singlethread_cpu; | 228 | cpu = singlethread_cpu; |
131 | 229 | ||
132 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 230 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); |
133 | } | 231 | } |
134 | 232 | ||
135 | /** | 233 | /** |
136 | * queue_delayed_work - queue work on a workqueue after delay | 234 | * queue_delayed_work - queue work on a workqueue after delay |
137 | * @wq: workqueue to use | 235 | * @wq: workqueue to use |
138 | * @work: work to queue | 236 | * @work: delayable work to queue |
139 | * @delay: number of jiffies to wait before queueing | 237 | * @delay: number of jiffies to wait before queueing |
140 | * | 238 | * |
141 | * Returns non-zero if it was successfully added. | 239 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
142 | */ | 240 | */ |
143 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 241 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
144 | struct work_struct *work, unsigned long delay) | 242 | struct delayed_work *dwork, unsigned long delay) |
145 | { | 243 | { |
146 | int ret = 0; | 244 | int ret = 0; |
147 | struct timer_list *timer = &work->timer; | 245 | struct timer_list *timer = &dwork->timer; |
246 | struct work_struct *work = &dwork->work; | ||
247 | |||
248 | if (delay == 0) | ||
249 | return queue_work(wq, work); | ||
148 | 250 | ||
149 | if (!test_and_set_bit(0, &work->pending)) { | 251 | if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { |
150 | BUG_ON(timer_pending(timer)); | 252 | BUG_ON(timer_pending(timer)); |
151 | BUG_ON(!list_empty(&work->entry)); | 253 | BUG_ON(!list_empty(&work->entry)); |
152 | 254 | ||
153 | /* This stores wq for the moment, for the timer_fn */ | 255 | /* This stores wq for the moment, for the timer_fn */ |
154 | work->wq_data = wq; | 256 | set_wq_data(work, wq); |
155 | timer->expires = jiffies + delay; | 257 | timer->expires = jiffies + delay; |
156 | timer->data = (unsigned long)work; | 258 | timer->data = (unsigned long)dwork; |
157 | timer->function = delayed_work_timer_fn; | 259 | timer->function = delayed_work_timer_fn; |
158 | add_timer(timer); | 260 | add_timer(timer); |
159 | ret = 1; | 261 | ret = 1; |
@@ -169,22 +271,23 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); | |||
169 | * @work: work to queue | 271 | * @work: work to queue |
170 | * @delay: number of jiffies to wait before queueing | 272 | * @delay: number of jiffies to wait before queueing |
171 | * | 273 | * |
172 | * Returns non-zero if it was successfully added. | 274 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
173 | */ | 275 | */ |
174 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | 276 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
175 | struct work_struct *work, unsigned long delay) | 277 | struct delayed_work *dwork, unsigned long delay) |
176 | { | 278 | { |
177 | int ret = 0; | 279 | int ret = 0; |
178 | struct timer_list *timer = &work->timer; | 280 | struct timer_list *timer = &dwork->timer; |
281 | struct work_struct *work = &dwork->work; | ||
179 | 282 | ||
180 | if (!test_and_set_bit(0, &work->pending)) { | 283 | if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { |
181 | BUG_ON(timer_pending(timer)); | 284 | BUG_ON(timer_pending(timer)); |
182 | BUG_ON(!list_empty(&work->entry)); | 285 | BUG_ON(!list_empty(&work->entry)); |
183 | 286 | ||
184 | /* This stores wq for the moment, for the timer_fn */ | 287 | /* This stores wq for the moment, for the timer_fn */ |
185 | work->wq_data = wq; | 288 | set_wq_data(work, wq); |
186 | timer->expires = jiffies + delay; | 289 | timer->expires = jiffies + delay; |
187 | timer->data = (unsigned long)work; | 290 | timer->data = (unsigned long)dwork; |
188 | timer->function = delayed_work_timer_fn; | 291 | timer->function = delayed_work_timer_fn; |
189 | add_timer_on(timer, cpu); | 292 | add_timer_on(timer, cpu); |
190 | ret = 1; | 293 | ret = 1; |
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) | |||
212 | while (!list_empty(&cwq->worklist)) { | 315 | while (!list_empty(&cwq->worklist)) { |
213 | struct work_struct *work = list_entry(cwq->worklist.next, | 316 | struct work_struct *work = list_entry(cwq->worklist.next, |
214 | struct work_struct, entry); | 317 | struct work_struct, entry); |
215 | void (*f) (void *) = work->func; | 318 | work_func_t f = work->func; |
216 | void *data = work->data; | ||
217 | 319 | ||
218 | list_del_init(cwq->worklist.next); | 320 | list_del_init(cwq->worklist.next); |
219 | spin_unlock_irqrestore(&cwq->lock, flags); | 321 | spin_unlock_irqrestore(&cwq->lock, flags); |
220 | 322 | ||
221 | BUG_ON(work->wq_data != cwq); | 323 | BUG_ON(get_wq_data(work) != cwq); |
222 | clear_bit(0, &work->pending); | 324 | if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) |
223 | f(data); | 325 | work_release(work); |
326 | f(work); | ||
327 | |||
328 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | ||
329 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | ||
330 | "%s/0x%08x/%d\n", | ||
331 | current->comm, preempt_count(), | ||
332 | current->pid); | ||
333 | printk(KERN_ERR " last function: "); | ||
334 | print_symbol("%s\n", (unsigned long)f); | ||
335 | debug_show_held_locks(current); | ||
336 | dump_stack(); | ||
337 | } | ||
224 | 338 | ||
225 | spin_lock_irqsave(&cwq->lock, flags); | 339 | spin_lock_irqsave(&cwq->lock, flags); |
226 | cwq->remove_sequence++; | 340 | cwq->remove_sequence++; |
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq) | |||
237 | struct k_sigaction sa; | 351 | struct k_sigaction sa; |
238 | sigset_t blocked; | 352 | sigset_t blocked; |
239 | 353 | ||
240 | current->flags |= PF_NOFREEZE; | 354 | if (!cwq->freezeable) |
355 | current->flags |= PF_NOFREEZE; | ||
241 | 356 | ||
242 | set_user_nice(current, -5); | 357 | set_user_nice(current, -5); |
243 | 358 | ||
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq) | |||
260 | 375 | ||
261 | set_current_state(TASK_INTERRUPTIBLE); | 376 | set_current_state(TASK_INTERRUPTIBLE); |
262 | while (!kthread_should_stop()) { | 377 | while (!kthread_should_stop()) { |
378 | if (cwq->freezeable) | ||
379 | try_to_freeze(); | ||
380 | |||
263 | add_wait_queue(&cwq->more_work, &wait); | 381 | add_wait_queue(&cwq->more_work, &wait); |
264 | if (list_empty(&cwq->worklist)) | 382 | if (list_empty(&cwq->worklist)) |
265 | schedule(); | 383 | schedule(); |
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
336 | EXPORT_SYMBOL_GPL(flush_workqueue); | 454 | EXPORT_SYMBOL_GPL(flush_workqueue); |
337 | 455 | ||
338 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 456 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
339 | int cpu) | 457 | int cpu, int freezeable) |
340 | { | 458 | { |
341 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 459 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
342 | struct task_struct *p; | 460 | struct task_struct *p; |
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | |||
346 | cwq->thread = NULL; | 464 | cwq->thread = NULL; |
347 | cwq->insert_sequence = 0; | 465 | cwq->insert_sequence = 0; |
348 | cwq->remove_sequence = 0; | 466 | cwq->remove_sequence = 0; |
467 | cwq->freezeable = freezeable; | ||
349 | INIT_LIST_HEAD(&cwq->worklist); | 468 | INIT_LIST_HEAD(&cwq->worklist); |
350 | init_waitqueue_head(&cwq->more_work); | 469 | init_waitqueue_head(&cwq->more_work); |
351 | init_waitqueue_head(&cwq->work_done); | 470 | init_waitqueue_head(&cwq->work_done); |
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | |||
361 | } | 480 | } |
362 | 481 | ||
363 | struct workqueue_struct *__create_workqueue(const char *name, | 482 | struct workqueue_struct *__create_workqueue(const char *name, |
364 | int singlethread) | 483 | int singlethread, int freezeable) |
365 | { | 484 | { |
366 | int cpu, destroy = 0; | 485 | int cpu, destroy = 0; |
367 | struct workqueue_struct *wq; | 486 | struct workqueue_struct *wq; |
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
381 | mutex_lock(&workqueue_mutex); | 500 | mutex_lock(&workqueue_mutex); |
382 | if (singlethread) { | 501 | if (singlethread) { |
383 | INIT_LIST_HEAD(&wq->list); | 502 | INIT_LIST_HEAD(&wq->list); |
384 | p = create_workqueue_thread(wq, singlethread_cpu); | 503 | p = create_workqueue_thread(wq, singlethread_cpu, freezeable); |
385 | if (!p) | 504 | if (!p) |
386 | destroy = 1; | 505 | destroy = 1; |
387 | else | 506 | else |
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
389 | } else { | 508 | } else { |
390 | list_add(&wq->list, &workqueues); | 509 | list_add(&wq->list, &workqueues); |
391 | for_each_online_cpu(cpu) { | 510 | for_each_online_cpu(cpu) { |
392 | p = create_workqueue_thread(wq, cpu); | 511 | p = create_workqueue_thread(wq, cpu, freezeable); |
393 | if (p) { | 512 | if (p) { |
394 | kthread_bind(p, cpu); | 513 | kthread_bind(p, cpu); |
395 | wake_up_process(p); | 514 | wake_up_process(p); |
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work); | |||
468 | 587 | ||
469 | /** | 588 | /** |
470 | * schedule_delayed_work - put work task in global workqueue after delay | 589 | * schedule_delayed_work - put work task in global workqueue after delay |
471 | * @work: job to be done | 590 | * @dwork: job to be done |
472 | * @delay: number of jiffies to wait | 591 | * @delay: number of jiffies to wait or 0 for immediate execution |
473 | * | 592 | * |
474 | * After waiting for a given time this puts a job in the kernel-global | 593 | * After waiting for a given time this puts a job in the kernel-global |
475 | * workqueue. | 594 | * workqueue. |
476 | */ | 595 | */ |
477 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) | 596 | int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) |
478 | { | 597 | { |
479 | return queue_delayed_work(keventd_wq, work, delay); | 598 | return queue_delayed_work(keventd_wq, dwork, delay); |
480 | } | 599 | } |
481 | EXPORT_SYMBOL(schedule_delayed_work); | 600 | EXPORT_SYMBOL(schedule_delayed_work); |
482 | 601 | ||
483 | /** | 602 | /** |
484 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 603 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
485 | * @cpu: cpu to use | 604 | * @cpu: cpu to use |
486 | * @work: job to be done | 605 | * @dwork: job to be done |
487 | * @delay: number of jiffies to wait | 606 | * @delay: number of jiffies to wait |
488 | * | 607 | * |
489 | * After waiting for a given time this puts a job in the kernel-global | 608 | * After waiting for a given time this puts a job in the kernel-global |
490 | * workqueue on the specified CPU. | 609 | * workqueue on the specified CPU. |
491 | */ | 610 | */ |
492 | int schedule_delayed_work_on(int cpu, | 611 | int schedule_delayed_work_on(int cpu, |
493 | struct work_struct *work, unsigned long delay) | 612 | struct delayed_work *dwork, unsigned long delay) |
494 | { | 613 | { |
495 | return queue_delayed_work_on(cpu, keventd_wq, work, delay); | 614 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); |
496 | } | 615 | } |
497 | EXPORT_SYMBOL(schedule_delayed_work_on); | 616 | EXPORT_SYMBOL(schedule_delayed_work_on); |
498 | 617 | ||
499 | /** | 618 | /** |
500 | * schedule_on_each_cpu - call a function on each online CPU from keventd | 619 | * schedule_on_each_cpu - call a function on each online CPU from keventd |
501 | * @func: the function to call | 620 | * @func: the function to call |
502 | * @info: a pointer to pass to func() | ||
503 | * | 621 | * |
504 | * Returns zero on success. | 622 | * Returns zero on success. |
505 | * Returns -ve errno on failure. | 623 | * Returns -ve errno on failure. |
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
508 | * | 626 | * |
509 | * schedule_on_each_cpu() is very slow. | 627 | * schedule_on_each_cpu() is very slow. |
510 | */ | 628 | */ |
511 | int schedule_on_each_cpu(void (*func)(void *info), void *info) | 629 | int schedule_on_each_cpu(work_func_t func) |
512 | { | 630 | { |
513 | int cpu; | 631 | int cpu; |
514 | struct work_struct *works; | 632 | struct work_struct *works; |
@@ -519,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info) | |||
519 | 637 | ||
520 | mutex_lock(&workqueue_mutex); | 638 | mutex_lock(&workqueue_mutex); |
521 | for_each_online_cpu(cpu) { | 639 | for_each_online_cpu(cpu) { |
522 | INIT_WORK(per_cpu_ptr(works, cpu), func, info); | 640 | INIT_WORK(per_cpu_ptr(works, cpu), func); |
523 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | 641 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), |
524 | per_cpu_ptr(works, cpu)); | 642 | per_cpu_ptr(works, cpu)); |
525 | } | 643 | } |
@@ -539,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work); | |||
539 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed | 657 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed |
540 | * work whose handler rearms the delayed work. | 658 | * work whose handler rearms the delayed work. |
541 | * @wq: the controlling workqueue structure | 659 | * @wq: the controlling workqueue structure |
542 | * @work: the delayed work struct | 660 | * @dwork: the delayed work struct |
543 | */ | 661 | */ |
544 | void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, | 662 | void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, |
545 | struct work_struct *work) | 663 | struct delayed_work *dwork) |
546 | { | 664 | { |
547 | while (!cancel_delayed_work(work)) | 665 | while (!cancel_delayed_work(dwork)) |
548 | flush_workqueue(wq); | 666 | flush_workqueue(wq); |
549 | } | 667 | } |
550 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); | 668 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); |
@@ -552,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); | |||
552 | /** | 670 | /** |
553 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd | 671 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd |
554 | * work whose handler rearms the delayed work. | 672 | * work whose handler rearms the delayed work. |
555 | * @work: the delayed work struct | 673 | * @dwork: the delayed work struct |
556 | */ | 674 | */ |
557 | void cancel_rearming_delayed_work(struct work_struct *work) | 675 | void cancel_rearming_delayed_work(struct delayed_work *dwork) |
558 | { | 676 | { |
559 | cancel_rearming_delayed_workqueue(keventd_wq, work); | 677 | cancel_rearming_delayed_workqueue(keventd_wq, dwork); |
560 | } | 678 | } |
561 | EXPORT_SYMBOL(cancel_rearming_delayed_work); | 679 | EXPORT_SYMBOL(cancel_rearming_delayed_work); |
562 | 680 | ||
563 | /** | 681 | /** |
564 | * execute_in_process_context - reliably execute the routine with user context | 682 | * execute_in_process_context - reliably execute the routine with user context |
565 | * @fn: the function to execute | 683 | * @fn: the function to execute |
566 | * @data: data to pass to the function | ||
567 | * @ew: guaranteed storage for the execute work structure (must | 684 | * @ew: guaranteed storage for the execute work structure (must |
568 | * be available when the work executes) | 685 | * be available when the work executes) |
569 | * | 686 | * |
@@ -573,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); | |||
573 | * Returns: 0 - function was executed | 690 | * Returns: 0 - function was executed |
574 | * 1 - function was scheduled for execution | 691 | * 1 - function was scheduled for execution |
575 | */ | 692 | */ |
576 | int execute_in_process_context(void (*fn)(void *data), void *data, | 693 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) |
577 | struct execute_work *ew) | ||
578 | { | 694 | { |
579 | if (!in_interrupt()) { | 695 | if (!in_interrupt()) { |
580 | fn(data); | 696 | fn(&ew->work); |
581 | return 0; | 697 | return 0; |
582 | } | 698 | } |
583 | 699 | ||
584 | INIT_WORK(&ew->work, fn, data); | 700 | INIT_WORK(&ew->work, fn); |
585 | schedule_work(&ew->work); | 701 | schedule_work(&ew->work); |
586 | 702 | ||
587 | return 1; | 703 | return 1; |
@@ -609,7 +725,6 @@ int current_is_keventd(void) | |||
609 | 725 | ||
610 | } | 726 | } |
611 | 727 | ||
612 | #ifdef CONFIG_HOTPLUG_CPU | ||
613 | /* Take the work from this (downed) CPU. */ | 728 | /* Take the work from this (downed) CPU. */ |
614 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 729 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
615 | { | 730 | { |
@@ -642,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
642 | mutex_lock(&workqueue_mutex); | 757 | mutex_lock(&workqueue_mutex); |
643 | /* Create a new workqueue thread for it. */ | 758 | /* Create a new workqueue thread for it. */ |
644 | list_for_each_entry(wq, &workqueues, list) { | 759 | list_for_each_entry(wq, &workqueues, list) { |
645 | if (!create_workqueue_thread(wq, hotcpu)) { | 760 | if (!create_workqueue_thread(wq, hotcpu, 0)) { |
646 | printk("workqueue for %i failed\n", hotcpu); | 761 | printk("workqueue for %i failed\n", hotcpu); |
647 | return NOTIFY_BAD; | 762 | return NOTIFY_BAD; |
648 | } | 763 | } |
@@ -692,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
692 | 807 | ||
693 | return NOTIFY_OK; | 808 | return NOTIFY_OK; |
694 | } | 809 | } |
695 | #endif | ||
696 | 810 | ||
697 | void init_workqueues(void) | 811 | void init_workqueues(void) |
698 | { | 812 | { |